author    | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400
commit    | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree      | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/sched
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'net/sched')
39 files changed, 22039 insertions, 0 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
new file mode 100644
index 000000000000..3d1d902dd1a1
--- /dev/null
+++ b/net/sched/Kconfig
@@ -0,0 +1,508 @@ | |||
1 | # | ||
2 | # Traffic control configuration. | ||
3 | # | ||
4 | choice | ||
5 | prompt "Packet scheduler clock source" | ||
6 | depends on NET_SCHED | ||
7 | default NET_SCH_CLK_JIFFIES | ||
8 | help | ||
9 | Packet schedulers need a monotonic clock that increments at a static | ||
10 | rate. The kernel provides several suitable interfaces, each with | ||
11 | different properties: | ||
12 | |||
13 | - high resolution (us or better) | ||
14 | - fast to read (minimal locking, no i/o access) | ||
15 | - synchronized on all processors | ||
16 | - handles cpu clock frequency changes | ||
17 | |||
18 | but nothing provides all of the above. | ||
19 | |||
20 | config NET_SCH_CLK_JIFFIES | ||
21 | bool "Timer interrupt" | ||
22 | help | ||
23 | Say Y here if you want to use the timer interrupt (jiffies) as clock | ||
24 | source. This clock source is fast, synchronized on all processors and | ||
25 | handles cpu clock frequency changes, but its resolution is too low | ||
26 | for accurate shaping except at very low speed. | ||
27 | |||
28 | config NET_SCH_CLK_GETTIMEOFDAY | ||
29 | bool "gettimeofday" | ||
30 | help | ||
31 | Say Y here if you want to use gettimeofday as clock source. This clock | ||
32 | source has high resolution, is synchronized on all processors and | ||
33 | handles cpu clock frequency changes, but it is slow. | ||
34 | |||
35 | Choose this if you need a high resolution clock source but can't use | ||
36 | the CPU's cycle counter. | ||
37 | |||
38 | config NET_SCH_CLK_CPU | ||
39 | bool "CPU cycle counter" | ||
40 | depends on X86_TSC || X86_64 || ALPHA || SPARC64 || PPC64 || IA64 | ||
41 | help | ||
42 | Say Y here if you want to use the CPU's cycle counter as clock source. | ||
43 | This is a cheap and high resolution clock source, but on some | ||
44 | architectures it is not synchronized on all processors and doesn't | ||
45 | handle cpu clock frequency changes. | ||
46 | |||
47 | The useable cycle counters are: | ||
48 | |||
49 | x86/x86_64 - Timestamp Counter | ||
50 | alpha - Cycle Counter | ||
51 | sparc64 - %ticks register | ||
52 | ppc64 - Time base | ||
53 | ia64 - Interval Time Counter | ||
54 | |||
55 | Choose this if your CPU's cycle counter is working properly. | ||
56 | |||
57 | endchoice | ||
58 | |||
59 | config NET_SCH_CBQ | ||
60 | tristate "CBQ packet scheduler" | ||
61 | depends on NET_SCHED | ||
62 | ---help--- | ||
63 | Say Y here if you want to use the Class-Based Queueing (CBQ) packet | ||
64 | scheduling algorithm for some of your network devices. This | ||
65 | algorithm classifies the waiting packets into a tree-like hierarchy | ||
66 | of classes; the leaves of this tree are in turn scheduled by | ||
67 | separate algorithms (called "disciplines" in this context). | ||
68 | |||
69 | See the top of <file:net/sched/sch_cbq.c> for references about the | ||
70 | CBQ algorithm. | ||
71 | |||
72 | CBQ is a commonly used scheduler, so if you're unsure, you should | ||
73 | say Y here. Then say Y to all the queueing algorithms below that you | ||
74 | want to use as CBQ disciplines. Then say Y to "Packet classifier | ||
75 | API" and say Y to all the classifiers you want to use; a classifier | ||
76 | is a routine that allows you to sort your outgoing traffic into | ||
77 | classes based on a certain criterion. | ||
78 | |||
79 | To compile this code as a module, choose M here: the | ||
80 | module will be called sch_cbq. | ||
81 | |||
82 | config NET_SCH_HTB | ||
83 | tristate "HTB packet scheduler" | ||
84 | depends on NET_SCHED | ||
85 | ---help--- | ||
86 | Say Y here if you want to use the Hierarchical Token Buckets (HTB) | ||
87 | packet scheduling algorithm for some of your network devices. See | ||
88 | <http://luxik.cdi.cz/~devik/qos/htb/> for complete manual and | ||
89 | in-depth articles. | ||
90 | |||
91 | HTB is very similar to CBQ in its goals, but it has | ||
92 | different properties and uses a different algorithm. | ||
93 | |||
94 | To compile this code as a module, choose M here: the | ||
95 | module will be called sch_htb. | ||
96 | |||
97 | config NET_SCH_HFSC | ||
98 | tristate "HFSC packet scheduler" | ||
99 | depends on NET_SCHED | ||
100 | ---help--- | ||
101 | Say Y here if you want to use the Hierarchical Fair Service Curve | ||
102 | (HFSC) packet scheduling algorithm for some of your network devices. | ||
103 | |||
104 | To compile this code as a module, choose M here: the | ||
105 | module will be called sch_hfsc. | ||
106 | |||
107 | #tristate ' H-PFQ packet scheduler' CONFIG_NET_SCH_HPFQ | ||
108 | config NET_SCH_ATM | ||
109 | tristate "ATM pseudo-scheduler" | ||
110 | depends on NET_SCHED && ATM | ||
111 | ---help--- | ||
112 | Say Y here if you want to use the ATM pseudo-scheduler. This | ||
113 | provides a framework for invoking classifiers (aka "filters"), which | ||
114 | in turn select classes of this queuing discipline. Each class maps | ||
115 | the flow(s) it is handling to a given virtual circuit (see the top of | ||
116 | <file:net/sched/sch_atm.c>). | ||
117 | |||
118 | To compile this code as a module, choose M here: the | ||
119 | module will be called sch_atm. | ||
120 | |||
121 | config NET_SCH_PRIO | ||
122 | tristate "The simplest PRIO pseudoscheduler" | ||
123 | depends on NET_SCHED | ||
124 | help | ||
125 | Say Y here if you want to use an n-band priority queue packet | ||
126 | "scheduler" for some of your network devices or as a leaf discipline | ||
127 | for the CBQ scheduling algorithm. If unsure, say Y. | ||
128 | |||
129 | To compile this code as a module, choose M here: the | ||
130 | module will be called sch_prio. | ||
131 | |||
132 | config NET_SCH_RED | ||
133 | tristate "RED queue" | ||
134 | depends on NET_SCHED | ||
135 | help | ||
136 | Say Y here if you want to use the Random Early Detection (RED) | ||
137 | packet scheduling algorithm for some of your network devices (see | ||
138 | the top of <file:net/sched/sch_red.c> for details and references | ||
139 | about the algorithm). | ||
140 | |||
141 | To compile this code as a module, choose M here: the | ||
142 | module will be called sch_red. | ||
143 | |||
144 | config NET_SCH_SFQ | ||
145 | tristate "SFQ queue" | ||
146 | depends on NET_SCHED | ||
147 | ---help--- | ||
148 | Say Y here if you want to use the Stochastic Fairness Queueing (SFQ) | ||
149 | packet scheduling algorithm for some of your network devices or as a | ||
150 | leaf discipline for the CBQ scheduling algorithm (see the top of | ||
151 | <file:net/sched/sch_sfq.c> for details and references about the SFQ | ||
152 | algorithm). | ||
153 | |||
154 | To compile this code as a module, choose M here: the | ||
155 | module will be called sch_sfq. | ||
156 | |||
157 | config NET_SCH_TEQL | ||
158 | tristate "TEQL queue" | ||
159 | depends on NET_SCHED | ||
160 | ---help--- | ||
161 | Say Y here if you want to use the True Link Equalizer (TEQL) packet | ||
162 | scheduling algorithm for some of your network devices or as a leaf | ||
163 | discipline for the CBQ scheduling algorithm. This queueing | ||
164 | discipline allows the combination of several physical devices into | ||
165 | one virtual device. (see the top of <file:net/sched/sch_teql.c> for | ||
166 | details). | ||
167 | |||
168 | To compile this code as a module, choose M here: the | ||
169 | module will be called sch_teql. | ||
170 | |||
171 | config NET_SCH_TBF | ||
172 | tristate "TBF queue" | ||
173 | depends on NET_SCHED | ||
174 | help | ||
175 | Say Y here if you want to use the Simple Token Bucket Filter (TBF) | ||
176 | packet scheduling algorithm for some of your network devices or as a | ||
177 | leaf discipline for the CBQ scheduling algorithm (see the top of | ||
178 | <file:net/sched/sch_tbf.c> for a description of the TBF algorithm). | ||
179 | |||
180 | To compile this code as a module, choose M here: the | ||
181 | module will be called sch_tbf. | ||
182 | |||
183 | config NET_SCH_GRED | ||
184 | tristate "GRED queue" | ||
185 | depends on NET_SCHED | ||
186 | help | ||
187 | Say Y here if you want to use the Generic Random Early Detection | ||
188 | (GRED) packet scheduling algorithm for some of your network devices | ||
189 | (see the top of <file:net/sched/sch_red.c> for details and | ||
190 | references about the algorithm). | ||
191 | |||
192 | To compile this code as a module, choose M here: the | ||
193 | module will be called sch_gred. | ||
194 | |||
195 | config NET_SCH_DSMARK | ||
196 | tristate "Diffserv field marker" | ||
197 | depends on NET_SCHED | ||
198 | help | ||
199 | Say Y if you want to schedule packets according to the | ||
200 | Differentiated Services architecture proposed in RFC 2475. | ||
201 | Technical information on this method, with pointers to associated | ||
202 | RFCs, is available at <http://www.gta.ufrj.br/diffserv/>. | ||
203 | |||
204 | To compile this code as a module, choose M here: the | ||
205 | module will be called sch_dsmark. | ||
206 | |||
207 | config NET_SCH_NETEM | ||
208 | tristate "Network emulator" | ||
209 | depends on NET_SCHED | ||
210 | help | ||
211 | Say Y if you want to emulate network delay, loss, and packet | ||
212 | re-ordering. This is often useful to simulate networks when | ||
213 | testing applications or protocols. | ||
214 | |||
215 | To compile this driver as a module, choose M here: the module | ||
216 | will be called sch_netem. | ||
217 | |||
218 | If unsure, say N. | ||
219 | |||
220 | config NET_SCH_INGRESS | ||
221 | tristate "Ingress Qdisc" | ||
222 | depends on NET_SCHED | ||
223 | help | ||
224 | If you say Y here, you will be able to police incoming bandwidth | ||
225 | and drop packets when this bandwidth exceeds your desired rate. | ||
226 | If unsure, say Y. | ||
227 | |||
228 | To compile this code as a module, choose M here: the | ||
229 | module will be called sch_ingress. | ||
230 | |||
231 | config NET_QOS | ||
232 | bool "QoS support" | ||
233 | depends on NET_SCHED | ||
234 | ---help--- | ||
235 | Say Y here if you want to include Quality Of Service scheduling | ||
236 | features, which means that you will be able to request certain | ||
237 | rate-of-flow limits for your network devices. | ||
238 | |||
239 | This Quality of Service (QoS) support will enable you to use | ||
240 | Differentiated Services (diffserv) and Resource Reservation Protocol | ||
241 | (RSVP) on your Linux router if you also say Y to "Packet classifier | ||
242 | API" and to some classifiers below. Documentation and software is at | ||
243 | <http://diffserv.sourceforge.net/>. | ||
244 | |||
245 | Note that the answer to this question won't directly affect the | ||
246 | kernel: saying N will just cause the configurator to skip all | ||
247 | the questions about QoS support. | ||
248 | |||
249 | config NET_ESTIMATOR | ||
250 | bool "Rate estimator" | ||
251 | depends on NET_QOS | ||
252 | help | ||
253 | In order for Quality of Service scheduling to work, the current | ||
254 | rate-of-flow for a network device has to be estimated; if you say Y | ||
255 | here, the kernel will do just that. | ||
256 | |||
257 | config NET_CLS | ||
258 | bool "Packet classifier API" | ||
259 | depends on NET_SCHED | ||
260 | ---help--- | ||
261 | The CBQ scheduling algorithm requires that network packets which are | ||
262 | scheduled to be sent out over a network device be classified | ||
263 | according to some criterion. If you say Y here, you will get a | ||
264 | choice of several different packet classifiers with the following | ||
265 | questions. | ||
266 | |||
267 | This will enable you to use Differentiated Services (diffserv) and | ||
268 | Resource Reservation Protocol (RSVP) on your Linux router. | ||
269 | Documentation and software is at | ||
270 | <http://diffserv.sourceforge.net/>. | ||
271 | |||
272 | config NET_CLS_BASIC | ||
273 | tristate "Basic classifier" | ||
274 | depends on NET_CLS | ||
275 | ---help--- | ||
276 | Say Y here if you want to be able to classify packets using | ||
277 | only extended matches and actions. | ||
278 | |||
279 | To compile this code as a module, choose M here: the | ||
280 | module will be called cls_basic. | ||
281 | |||
282 | config NET_CLS_TCINDEX | ||
283 | tristate "TC index classifier" | ||
284 | depends on NET_CLS | ||
285 | help | ||
286 | If you say Y here, you will be able to classify outgoing packets | ||
287 | according to the tc_index field of the skb. You will want this | ||
288 | feature if you want to implement Differentiated Services using | ||
289 | sch_dsmark. If unsure, say Y. | ||
290 | |||
291 | To compile this code as a module, choose M here: the | ||
292 | module will be called cls_tcindex. | ||
293 | |||
294 | config NET_CLS_ROUTE4 | ||
295 | tristate "Routing table based classifier" | ||
296 | depends on NET_CLS | ||
297 | select NET_CLS_ROUTE | ||
298 | help | ||
299 | If you say Y here, you will be able to classify outgoing packets | ||
300 | according to the route table entry they matched. If unsure, say Y. | ||
301 | |||
302 | To compile this code as a module, choose M here: the | ||
303 | module will be called cls_route. | ||
304 | |||
305 | config NET_CLS_ROUTE | ||
306 | bool | ||
307 | default n | ||
308 | |||
309 | config NET_CLS_FW | ||
310 | tristate "Firewall based classifier" | ||
311 | depends on NET_CLS | ||
312 | help | ||
313 | If you say Y here, you will be able to classify outgoing packets | ||
314 | according to firewall criteria you specified. | ||
315 | |||
316 | To compile this code as a module, choose M here: the | ||
317 | module will be called cls_fw. | ||
318 | |||
319 | config NET_CLS_U32 | ||
320 | tristate "U32 classifier" | ||
321 | depends on NET_CLS | ||
322 | help | ||
323 | If you say Y here, you will be able to classify outgoing packets | ||
324 | according to their destination address. If unsure, say Y. | ||
325 | |||
326 | To compile this code as a module, choose M here: the | ||
327 | module will be called cls_u32. | ||
328 | |||
329 | config CLS_U32_PERF | ||
330 | bool "U32 classifier performance counters" | ||
331 | depends on NET_CLS_U32 | ||
332 | help | ||
333 | Say Y here to gather statistics that can be used to tune u32 | ||
334 | classifier performance. Requires a recent iproute2. | ||
335 | You MUST NOT turn this on if you don't have an updated iproute2. | ||
336 | |||
337 | config NET_CLS_IND | ||
338 | bool "classify input device (slows things u32/fw) " | ||
339 | depends on NET_CLS_U32 || NET_CLS_FW | ||
340 | help | ||
341 | This option will eventually be removed when a metadata action | ||
342 | becomes available, because it slows things down a little. | ||
343 | Available only for the u32 and fw classifiers. | ||
344 | Requires a recent iproute2. | ||
345 | You MUST NOT turn this on if you don't have an updated iproute2. | ||
346 | |||
347 | config CLS_U32_MARK | ||
348 | bool "Use nfmark as a key in U32 classifier" | ||
349 | depends on NET_CLS_U32 && NETFILTER | ||
350 | help | ||
351 | This allows you to match the netfilter mark (nfmark) in a u32 filter. | ||
352 | Example: | ||
353 | tc filter add dev eth0 protocol ip parent 1:0 prio 5 u32 \ | ||
354 | match mark 0x0090 0xffff \ | ||
355 | match ip dst 4.4.4.4 \ | ||
356 | flowid 1:90 | ||
357 | You must use a recent iproute2 to use this feature. | ||
358 | |||
359 | config NET_CLS_RSVP | ||
360 | tristate "Special RSVP classifier" | ||
361 | depends on NET_CLS && NET_QOS | ||
362 | ---help--- | ||
363 | The Resource Reservation Protocol (RSVP) permits end systems to | ||
364 | request a minimum and maximum data flow rate for a connection; this | ||
365 | is important for real time data such as streaming sound or video. | ||
366 | |||
367 | Say Y here if you want to be able to classify outgoing packets based | ||
368 | on their RSVP requests. | ||
369 | |||
370 | To compile this code as a module, choose M here: the | ||
371 | module will be called cls_rsvp. | ||
372 | |||
373 | config NET_CLS_RSVP6 | ||
374 | tristate "Special RSVP classifier for IPv6" | ||
375 | depends on NET_CLS && NET_QOS | ||
376 | ---help--- | ||
377 | The Resource Reservation Protocol (RSVP) permits end systems to | ||
378 | request a minimum and maximum data flow rate for a connection; this | ||
379 | is important for real time data such as streaming sound or video. | ||
380 | |||
381 | Say Y here if you want to be able to classify outgoing packets based | ||
382 | on their RSVP requests and you are using the new Internet Protocol | ||
383 | IPv6 as opposed to the older and more common IPv4. | ||
384 | |||
385 | To compile this code as a module, choose M here: the | ||
386 | module will be called cls_rsvp6. | ||
387 | |||
388 | config NET_EMATCH | ||
389 | bool "Extended Matches" | ||
390 | depends on NET_CLS | ||
391 | ---help--- | ||
392 | Say Y here if you want to use extended matches on top of classifiers | ||
393 | and select the extended matches below. | ||
394 | |||
395 | Extended matches are small classification helpers not worth writing | ||
396 | a separate classifier for. | ||
397 | |||
398 | You must have a recent version of the iproute2 tools in order to use | ||
399 | extended matches. | ||
400 | |||
401 | config NET_EMATCH_STACK | ||
402 | int "Stack size" | ||
403 | depends on NET_EMATCH | ||
404 | default "32" | ||
405 | ---help--- | ||
406 | Size of the local stack variable used while evaluating the tree of | ||
407 | ematches. Limits the depth of the tree, i.e. the number of | ||
408 | encapsulated precedences. Every level requires 4 bytes of additional | ||
409 | stack space. | ||
410 | |||
411 | config NET_EMATCH_CMP | ||
412 | tristate "Simple packet data comparison" | ||
413 | depends on NET_EMATCH | ||
414 | ---help--- | ||
415 | Say Y here if you want to be able to classify packets based on | ||
416 | simple packet data comparisons for 8, 16, and 32bit values. | ||
417 | |||
418 | To compile this code as a module, choose M here: the | ||
419 | module will be called em_cmp. | ||
420 | |||
421 | config NET_EMATCH_NBYTE | ||
422 | tristate "Multi byte comparison" | ||
423 | depends on NET_EMATCH | ||
424 | ---help--- | ||
425 | Say Y here if you want to be able to classify packets based on | ||
426 | multiple byte comparisons, mainly useful for IPv6 address comparisons. | ||
427 | |||
428 | To compile this code as a module, choose M here: the | ||
429 | module will be called em_nbyte. | ||
430 | |||
431 | config NET_EMATCH_U32 | ||
432 | tristate "U32 hashing key" | ||
433 | depends on NET_EMATCH | ||
434 | ---help--- | ||
435 | Say Y here if you want to be able to classify packets using | ||
436 | the famous u32 key in combination with logic relations. | ||
437 | |||
438 | To compile this code as a module, choose M here: the | ||
439 | module will be called em_u32. | ||
440 | |||
441 | config NET_EMATCH_META | ||
442 | tristate "Metadata" | ||
443 | depends on NET_EMATCH | ||
444 | ---help--- | ||
445 | Say Y here if you want to be able to classify packets based on | ||
446 | metadata such as load average, netfilter attributes, socket | ||
447 | attributes and routing decisions. | ||
448 | |||
449 | To compile this code as a module, choose M here: the | ||
450 | module will be called em_meta. | ||
451 | |||
452 | config NET_CLS_ACT | ||
453 | bool "Packet ACTION" | ||
454 | depends on EXPERIMENTAL && NET_CLS && NET_QOS | ||
455 | ---help--- | ||
456 | This option requires a recent iproute2. It enables | ||
457 | tc extensions that can be used with tc classifiers. | ||
458 | You MUST NOT turn this on if you don't have an updated iproute2. | ||
459 | |||
460 | config NET_ACT_POLICE | ||
461 | tristate "Policing Actions" | ||
462 | depends on NET_CLS_ACT | ||
463 | ---help--- | ||
464 | If you are using a recent iproute2, select this one; otherwise use | ||
465 | the option below to select a policer. | ||
466 | You MUST NOT turn this on if you don't have an updated iproute2. | ||
467 | |||
468 | config NET_ACT_GACT | ||
469 | tristate "generic Actions" | ||
470 | depends on NET_CLS_ACT | ||
471 | ---help--- | ||
472 | You must have a recent iproute2 to use this feature. | ||
473 | This adds simple filtering actions like drop, accept, etc. | ||
474 | |||
475 | config GACT_PROB | ||
476 | bool "generic Actions probability" | ||
477 | depends on NET_ACT_GACT | ||
478 | ---help--- | ||
479 | Allows generic actions to be randomly or deterministically used. | ||
480 | |||
481 | config NET_ACT_MIRRED | ||
482 | tristate "Packet In/Egress redirecton/mirror Actions" | ||
483 | depends on NET_CLS_ACT | ||
484 | ---help--- | ||
485 | Requires a recent iproute2. | ||
486 | This allows packets to be mirrored or redirected to other netdevices. | ||
487 | |||
488 | config NET_ACT_IPT | ||
489 | tristate "iptables Actions" | ||
490 | depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES | ||
491 | ---help--- | ||
492 | Requires a recent iproute2. | ||
493 | This allows iptables targets to be used by tc filters. | ||
494 | |||
495 | config NET_ACT_PEDIT | ||
496 | tristate "Generic Packet Editor Actions" | ||
497 | depends on NET_CLS_ACT | ||
498 | ---help--- | ||
499 | Requires a recent iproute2. | ||
500 | This allows packets to be generically edited. | ||
501 | |||
502 | config NET_CLS_POLICE | ||
503 | bool "Traffic policing (needed for in/egress)" | ||
504 | depends on NET_CLS && NET_QOS && NET_CLS_ACT!=y | ||
505 | help | ||
506 | Say Y to support traffic policing (bandwidth limits). Needed for | ||
507 | ingress and egress rate limiting. | ||
508 | |||
diff --git a/net/sched/Makefile b/net/sched/Makefile
new file mode 100644
index 000000000000..431e55786efd
--- /dev/null
+++ b/net/sched/Makefile
@@ -0,0 +1,41 @@ | |||
1 | # | ||
2 | # Makefile for the Linux Traffic Control Unit. | ||
3 | # | ||
4 | |||
5 | obj-y := sch_generic.o | ||
6 | |||
7 | obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o | ||
8 | obj-$(CONFIG_NET_CLS) += cls_api.o | ||
9 | obj-$(CONFIG_NET_CLS_ACT) += act_api.o | ||
10 | obj-$(CONFIG_NET_ACT_POLICE) += police.o | ||
11 | obj-$(CONFIG_NET_CLS_POLICE) += police.o | ||
12 | obj-$(CONFIG_NET_ACT_GACT) += gact.o | ||
13 | obj-$(CONFIG_NET_ACT_MIRRED) += mirred.o | ||
14 | obj-$(CONFIG_NET_ACT_IPT) += ipt.o | ||
15 | obj-$(CONFIG_NET_ACT_PEDIT) += pedit.o | ||
16 | obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o | ||
17 | obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o | ||
18 | obj-$(CONFIG_NET_SCH_HPFQ) += sch_hpfq.o | ||
19 | obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o | ||
20 | obj-$(CONFIG_NET_SCH_RED) += sch_red.o | ||
21 | obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o | ||
22 | obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o | ||
23 | obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o | ||
24 | obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o | ||
25 | obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o | ||
26 | obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o | ||
27 | obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o | ||
28 | obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o | ||
29 | obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o | ||
30 | obj-$(CONFIG_NET_CLS_U32) += cls_u32.o | ||
31 | obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o | ||
32 | obj-$(CONFIG_NET_CLS_FW) += cls_fw.o | ||
33 | obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o | ||
34 | obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o | ||
35 | obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o | ||
36 | obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o | ||
37 | obj-$(CONFIG_NET_EMATCH) += ematch.o | ||
38 | obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o | ||
39 | obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o | ||
40 | obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o | ||
41 | obj-$(CONFIG_NET_EMATCH_META) += em_meta.o | ||
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
new file mode 100644
index 000000000000..5e6cc371b39e
--- /dev/null
+++ b/net/sched/act_api.c
@@ -0,0 +1,894 @@ | |||
1 | /* | ||
2 | * net/sched/act_api.c Packet action API. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Author: Jamal Hadi Salim | ||
10 | * | ||
11 | * | ||
12 | */ | ||
13 | |||
14 | #include <asm/uaccess.h> | ||
15 | #include <asm/system.h> | ||
16 | #include <linux/bitops.h> | ||
17 | #include <linux/config.h> | ||
18 | #include <linux/types.h> | ||
19 | #include <linux/kernel.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/string.h> | ||
22 | #include <linux/mm.h> | ||
23 | #include <linux/socket.h> | ||
24 | #include <linux/sockios.h> | ||
25 | #include <linux/in.h> | ||
26 | #include <linux/errno.h> | ||
27 | #include <linux/interrupt.h> | ||
28 | #include <linux/netdevice.h> | ||
29 | #include <linux/skbuff.h> | ||
30 | #include <linux/rtnetlink.h> | ||
31 | #include <linux/init.h> | ||
32 | #include <linux/kmod.h> | ||
33 | #include <net/sock.h> | ||
34 | #include <net/sch_generic.h> | ||
35 | #include <net/act_api.h> | ||
36 | |||
37 | #if 1 /* control */ | ||
38 | #define DPRINTK(format, args...) printk(KERN_DEBUG format, ##args) | ||
39 | #else | ||
40 | #define DPRINTK(format, args...) | ||
41 | #endif | ||
42 | #if 0 /* data */ | ||
43 | #define D2PRINTK(format, args...) printk(KERN_DEBUG format, ##args) | ||
44 | #else | ||
45 | #define D2PRINTK(format, args...) | ||
46 | #endif | ||
47 | |||
48 | static struct tc_action_ops *act_base = NULL; | ||
49 | static DEFINE_RWLOCK(act_mod_lock); | ||
50 | |||
51 | int tcf_register_action(struct tc_action_ops *act) | ||
52 | { | ||
53 | struct tc_action_ops *a, **ap; | ||
54 | |||
55 | write_lock(&act_mod_lock); | ||
56 | for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) { | ||
57 | if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) { | ||
58 | write_unlock(&act_mod_lock); | ||
59 | return -EEXIST; | ||
60 | } | ||
61 | } | ||
62 | act->next = NULL; | ||
63 | *ap = act; | ||
64 | write_unlock(&act_mod_lock); | ||
65 | return 0; | ||
66 | } | ||
67 | |||
68 | int tcf_unregister_action(struct tc_action_ops *act) | ||
69 | { | ||
70 | struct tc_action_ops *a, **ap; | ||
71 | int err = -ENOENT; | ||
72 | |||
73 | write_lock(&act_mod_lock); | ||
74 | for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) | ||
75 | if (a == act) | ||
76 | break; | ||
77 | if (a) { | ||
78 | *ap = a->next; | ||
79 | a->next = NULL; | ||
80 | err = 0; | ||
81 | } | ||
82 | write_unlock(&act_mod_lock); | ||
83 | return err; | ||
84 | } | ||
85 | |||
86 | /* lookup by name */ | ||
87 | static struct tc_action_ops *tc_lookup_action_n(char *kind) | ||
88 | { | ||
89 | struct tc_action_ops *a = NULL; | ||
90 | |||
91 | if (kind) { | ||
92 | read_lock(&act_mod_lock); | ||
93 | for (a = act_base; a; a = a->next) { | ||
94 | if (strcmp(kind, a->kind) == 0) { | ||
95 | if (!try_module_get(a->owner)) { | ||
96 | read_unlock(&act_mod_lock); | ||
97 | return NULL; | ||
98 | } | ||
99 | break; | ||
100 | } | ||
101 | } | ||
102 | read_unlock(&act_mod_lock); | ||
103 | } | ||
104 | return a; | ||
105 | } | ||
106 | |||
107 | /* lookup by rtattr */ | ||
108 | static struct tc_action_ops *tc_lookup_action(struct rtattr *kind) | ||
109 | { | ||
110 | struct tc_action_ops *a = NULL; | ||
111 | |||
112 | if (kind) { | ||
113 | read_lock(&act_mod_lock); | ||
114 | for (a = act_base; a; a = a->next) { | ||
115 | if (rtattr_strcmp(kind, a->kind) == 0) { | ||
116 | if (!try_module_get(a->owner)) { | ||
117 | read_unlock(&act_mod_lock); | ||
118 | return NULL; | ||
119 | } | ||
120 | break; | ||
121 | } | ||
122 | } | ||
123 | read_unlock(&act_mod_lock); | ||
124 | } | ||
125 | return a; | ||
126 | } | ||
127 | |||
128 | #if 0 | ||
129 | /* lookup by id */ | ||
130 | static struct tc_action_ops *tc_lookup_action_id(u32 type) | ||
131 | { | ||
132 | struct tc_action_ops *a = NULL; | ||
133 | |||
134 | if (type) { | ||
135 | read_lock(&act_mod_lock); | ||
136 | for (a = act_base; a; a = a->next) { | ||
137 | if (a->type == type) { | ||
138 | if (!try_module_get(a->owner)) { | ||
139 | read_unlock(&act_mod_lock); | ||
140 | return NULL; | ||
141 | } | ||
142 | break; | ||
143 | } | ||
144 | } | ||
145 | read_unlock(&act_mod_lock); | ||
146 | } | ||
147 | return a; | ||
148 | } | ||
149 | #endif | ||
150 | |||
151 | int tcf_action_exec(struct sk_buff *skb, struct tc_action *act, | ||
152 | struct tcf_result *res) | ||
153 | { | ||
154 | struct tc_action *a; | ||
155 | int ret = -1; | ||
156 | |||
157 | if (skb->tc_verd & TC_NCLS) { | ||
158 | skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); | ||
159 | D2PRINTK("(%p)tcf_action_exec: cleared TC_NCLS in %s out %s\n", | ||
160 | skb, skb->input_dev ? skb->input_dev->name : "xxx", | ||
161 | skb->dev->name); | ||
162 | ret = TC_ACT_OK; | ||
163 | goto exec_done; | ||
164 | } | ||
165 | while ((a = act) != NULL) { | ||
166 | repeat: | ||
167 | if (a->ops && a->ops->act) { | ||
168 | ret = a->ops->act(&skb, a); | ||
169 | if (TC_MUNGED & skb->tc_verd) { | ||
170 | /* copied already, allow trampling */ | ||
171 | skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); | ||
172 | skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); | ||
173 | } | ||
174 | if (ret != TC_ACT_PIPE) | ||
175 | goto exec_done; | ||
176 | if (ret == TC_ACT_REPEAT) | ||
177 | goto repeat; /* we need a ttl - JHS */ | ||
178 | } | ||
179 | act = a->next; | ||
180 | } | ||
181 | exec_done: | ||
182 | if (skb->tc_classid > 0) { | ||
183 | res->classid = skb->tc_classid; | ||
184 | res->class = 0; | ||
185 | skb->tc_classid = 0; | ||
186 | } | ||
187 | return ret; | ||
188 | } | ||
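/*
 * Editorial sketch (not part of the original commit): how a caller might
 * use tcf_action_exec() above.  A classifier built with NET_CLS_ACT would
 * typically run its matched action chain from its classify path and map
 * the verdict onto its own return value.  "my_actions" is a hypothetical
 * field name used only for illustration:
 *
 *	struct tcf_result res;
 *	int verdict = tcf_action_exec(skb, head->my_actions, &res);
 *
 *	if (verdict == TC_ACT_SHOT)
 *		return TC_ACT_SHOT;	(drop the packet)
 *	return verdict;			(res.classid, if set, selects the class)
 */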
189 | |||
190 | void tcf_action_destroy(struct tc_action *act, int bind) | ||
191 | { | ||
192 | struct tc_action *a; | ||
193 | |||
194 | for (a = act; a; a = act) { | ||
195 | if (a->ops && a->ops->cleanup) { | ||
196 | DPRINTK("tcf_action_destroy destroying %p next %p\n", | ||
197 | a, a->next); | ||
198 | if (a->ops->cleanup(a, bind) == ACT_P_DELETED) | ||
199 | module_put(a->ops->owner); | ||
200 | act = act->next; | ||
201 | kfree(a); | ||
202 | } else { /*FIXME: Remove later - catch insertion bugs*/ | ||
203 | printk("tcf_action_destroy: BUG? destroying NULL ops\n"); | ||
204 | act = act->next; | ||
205 | kfree(a); | ||
206 | } | ||
207 | } | ||
208 | } | ||
209 | |||
210 | int | ||
211 | tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) | ||
212 | { | ||
213 | int err = -EINVAL; | ||
214 | |||
215 | if (a->ops == NULL || a->ops->dump == NULL) | ||
216 | return err; | ||
217 | return a->ops->dump(skb, a, bind, ref); | ||
218 | } | ||
219 | |||
220 | int | ||
221 | tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) | ||
222 | { | ||
223 | int err = -EINVAL; | ||
224 | unsigned char *b = skb->tail; | ||
225 | struct rtattr *r; | ||
226 | |||
227 | if (a->ops == NULL || a->ops->dump == NULL) | ||
228 | return err; | ||
229 | |||
230 | RTA_PUT(skb, TCA_KIND, IFNAMSIZ, a->ops->kind); | ||
231 | if (tcf_action_copy_stats(skb, a, 0)) | ||
232 | goto rtattr_failure; | ||
233 | r = (struct rtattr*) skb->tail; | ||
234 | RTA_PUT(skb, TCA_OPTIONS, 0, NULL); | ||
235 | if ((err = tcf_action_dump_old(skb, a, bind, ref)) > 0) { | ||
236 | r->rta_len = skb->tail - (u8*)r; | ||
237 | return err; | ||
238 | } | ||
239 | |||
240 | rtattr_failure: | ||
241 | skb_trim(skb, b - skb->data); | ||
242 | return -1; | ||
243 | } | ||
244 | |||
245 | int | ||
246 | tcf_action_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) | ||
247 | { | ||
248 | struct tc_action *a; | ||
249 | int err = -EINVAL; | ||
250 | unsigned char *b = skb->tail; | ||
251 | struct rtattr *r ; | ||
252 | |||
253 | while ((a = act) != NULL) { | ||
254 | r = (struct rtattr*) skb->tail; | ||
255 | act = a->next; | ||
256 | RTA_PUT(skb, a->order, 0, NULL); | ||
257 | err = tcf_action_dump_1(skb, a, bind, ref); | ||
258 | if (err < 0) | ||
259 | goto rtattr_failure; | ||
260 | r->rta_len = skb->tail - (u8*)r; | ||
261 | } | ||
262 | |||
263 | return 0; | ||
264 | |||
265 | rtattr_failure: | ||
266 | skb_trim(skb, b - skb->data); | ||
267 | return -err; | ||
268 | } | ||
269 | |||
270 | struct tc_action *tcf_action_init_1(struct rtattr *rta, struct rtattr *est, | ||
271 | char *name, int ovr, int bind, int *err) | ||
272 | { | ||
273 | struct tc_action *a; | ||
274 | struct tc_action_ops *a_o; | ||
275 | char act_name[IFNAMSIZ]; | ||
276 | struct rtattr *tb[TCA_ACT_MAX+1]; | ||
277 | struct rtattr *kind; | ||
278 | |||
279 | *err = -EINVAL; | ||
280 | |||
281 | if (name == NULL) { | ||
282 | if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) | ||
283 | goto err_out; | ||
284 | kind = tb[TCA_ACT_KIND-1]; | ||
285 | if (kind == NULL) | ||
286 | goto err_out; | ||
287 | if (rtattr_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) | ||
288 | goto err_out; | ||
289 | } else { | ||
290 | if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) | ||
291 | goto err_out; | ||
292 | } | ||
293 | |||
294 | a_o = tc_lookup_action_n(act_name); | ||
295 | if (a_o == NULL) { | ||
296 | #ifdef CONFIG_KMOD | ||
297 | rtnl_unlock(); | ||
298 | request_module(act_name); | ||
299 | rtnl_lock(); | ||
300 | |||
301 | a_o = tc_lookup_action_n(act_name); | ||
302 | |||
303 | /* We dropped the RTNL semaphore in order to | ||
304 | * perform the module load. So, even if we | ||
305 | * succeeded in loading the module we have to | ||
306 | * tell the caller to replay the request. We | ||
307 | * indicate this using -EAGAIN. | ||
308 | */ | ||
309 | if (a_o != NULL) { | ||
310 | *err = -EAGAIN; | ||
311 | goto err_mod; | ||
312 | } | ||
313 | #endif | ||
314 | goto err_out; | ||
315 | } | ||
316 | |||
317 | *err = -ENOMEM; | ||
318 | a = kmalloc(sizeof(*a), GFP_KERNEL); | ||
319 | if (a == NULL) | ||
320 | goto err_mod; | ||
321 | memset(a, 0, sizeof(*a)); | ||
322 | |||
323 | /* backward compatibility for policer */ | ||
324 | if (name == NULL) | ||
325 | *err = a_o->init(tb[TCA_ACT_OPTIONS-1], est, a, ovr, bind); | ||
326 | else | ||
327 | *err = a_o->init(rta, est, a, ovr, bind); | ||
328 | if (*err < 0) | ||
329 | goto err_free; | ||
330 | |||
331 | /* The module count goes up only when a brand new policy is created; | ||
332 | if it already exists and is only bound to in a_o->init(), then | ||
333 | ACT_P_CREATED is not returned (zero is). | ||
334 | */ | ||
335 | if (*err != ACT_P_CREATED) | ||
336 | module_put(a_o->owner); | ||
337 | a->ops = a_o; | ||
338 | DPRINTK("tcf_action_init_1: successfull %s\n", act_name); | ||
339 | |||
340 | *err = 0; | ||
341 | return a; | ||
342 | |||
343 | err_free: | ||
344 | kfree(a); | ||
345 | err_mod: | ||
346 | module_put(a_o->owner); | ||
347 | err_out: | ||
348 | return NULL; | ||
349 | } | ||
350 | |||
351 | struct tc_action *tcf_action_init(struct rtattr *rta, struct rtattr *est, | ||
352 | char *name, int ovr, int bind, int *err) | ||
353 | { | ||
354 | struct rtattr *tb[TCA_ACT_MAX_PRIO+1]; | ||
355 | struct tc_action *head = NULL, *act, *act_prev = NULL; | ||
356 | int i; | ||
357 | |||
358 | if (rtattr_parse_nested(tb, TCA_ACT_MAX_PRIO, rta) < 0) { | ||
359 | *err = -EINVAL; | ||
360 | return head; | ||
361 | } | ||
362 | |||
363 | for (i=0; i < TCA_ACT_MAX_PRIO && tb[i]; i++) { | ||
364 | act = tcf_action_init_1(tb[i], est, name, ovr, bind, err); | ||
365 | if (act == NULL) | ||
366 | goto err; | ||
367 | act->order = i+1; | ||
368 | |||
369 | if (head == NULL) | ||
370 | head = act; | ||
371 | else | ||
372 | act_prev->next = act; | ||
373 | act_prev = act; | ||
374 | } | ||
375 | return head; | ||
376 | |||
377 | err: | ||
378 | if (head != NULL) | ||
379 | tcf_action_destroy(head, bind); | ||
380 | return NULL; | ||
381 | } | ||
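/*
 * Editorial note (not part of the original commit): the attribute layout
 * expected by tcf_action_init() above, as implied by the parsing code.
 * The outer attribute (TCA_ACT_TAB in the callers) nests up to
 * TCA_ACT_MAX_PRIO entries, one per action in priority order; each entry
 * in turn nests TCA_ACT_KIND (the name used to look up the tc_action_ops)
 * and TCA_ACT_OPTIONS (handed to that action's init() callback).
 */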
382 | |||
383 | int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, | ||
384 | int compat_mode) | ||
385 | { | ||
386 | int err = 0; | ||
387 | struct gnet_dump d; | ||
388 | struct tcf_act_hdr *h = a->priv; | ||
389 | |||
390 | if (h == NULL) | ||
391 | goto errout; | ||
392 | |||
393 | /* compat_mode being true specifies a call that is supposed | ||
394 | * to add additional backward compatibility statistic TLVs. | ||
395 | */ | ||
396 | if (compat_mode) { | ||
397 | if (a->type == TCA_OLD_COMPAT) | ||
398 | err = gnet_stats_start_copy_compat(skb, 0, | ||
399 | TCA_STATS, TCA_XSTATS, h->stats_lock, &d); | ||
400 | else | ||
401 | return 0; | ||
402 | } else | ||
403 | err = gnet_stats_start_copy(skb, TCA_ACT_STATS, | ||
404 | h->stats_lock, &d); | ||
405 | |||
406 | if (err < 0) | ||
407 | goto errout; | ||
408 | |||
409 | if (a->ops != NULL && a->ops->get_stats != NULL) | ||
410 | if (a->ops->get_stats(skb, a) < 0) | ||
411 | goto errout; | ||
412 | |||
413 | if (gnet_stats_copy_basic(&d, &h->bstats) < 0 || | ||
414 | #ifdef CONFIG_NET_ESTIMATOR | ||
415 | gnet_stats_copy_rate_est(&d, &h->rate_est) < 0 || | ||
416 | #endif | ||
417 | gnet_stats_copy_queue(&d, &h->qstats) < 0) | ||
418 | goto errout; | ||
419 | |||
420 | if (gnet_stats_finish_copy(&d) < 0) | ||
421 | goto errout; | ||
422 | |||
423 | return 0; | ||
424 | |||
425 | errout: | ||
426 | return -1; | ||
427 | } | ||
428 | |||
429 | static int | ||
430 | tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq, | ||
431 | unsigned flags, int event, int bind, int ref) | ||
432 | { | ||
433 | struct tcamsg *t; | ||
434 | struct nlmsghdr *nlh; | ||
435 | unsigned char *b = skb->tail; | ||
436 | struct rtattr *x; | ||
437 | |||
438 | nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*t)); | ||
439 | nlh->nlmsg_flags = flags; | ||
440 | t = NLMSG_DATA(nlh); | ||
441 | t->tca_family = AF_UNSPEC; | ||
442 | |||
443 | x = (struct rtattr*) skb->tail; | ||
444 | RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); | ||
445 | |||
446 | if (tcf_action_dump(skb, a, bind, ref) < 0) | ||
447 | goto rtattr_failure; | ||
448 | |||
449 | x->rta_len = skb->tail - (u8*)x; | ||
450 | |||
451 | nlh->nlmsg_len = skb->tail - b; | ||
452 | return skb->len; | ||
453 | |||
454 | rtattr_failure: | ||
455 | nlmsg_failure: | ||
456 | skb_trim(skb, b - skb->data); | ||
457 | return -1; | ||
458 | } | ||
459 | |||
460 | static int | ||
461 | act_get_notify(u32 pid, struct nlmsghdr *n, struct tc_action *a, int event) | ||
462 | { | ||
463 | struct sk_buff *skb; | ||
464 | int err = 0; | ||
465 | |||
466 | skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); | ||
467 | if (!skb) | ||
468 | return -ENOBUFS; | ||
469 | if (tca_get_fill(skb, a, pid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { | ||
470 | kfree_skb(skb); | ||
471 | return -EINVAL; | ||
472 | } | ||
473 | err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); | ||
474 | if (err > 0) | ||
475 | err = 0; | ||
476 | return err; | ||
477 | } | ||
478 | |||
479 | static struct tc_action * | ||
480 | tcf_action_get_1(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int *err) | ||
481 | { | ||
482 | struct rtattr *tb[TCA_ACT_MAX+1]; | ||
483 | struct tc_action *a; | ||
484 | int index; | ||
485 | |||
486 | *err = -EINVAL; | ||
487 | if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) | ||
488 | return NULL; | ||
489 | |||
490 | if (tb[TCA_ACT_INDEX - 1] == NULL || | ||
491 | RTA_PAYLOAD(tb[TCA_ACT_INDEX - 1]) < sizeof(index)) | ||
492 | return NULL; | ||
493 | index = *(int *)RTA_DATA(tb[TCA_ACT_INDEX - 1]); | ||
494 | |||
495 | *err = -ENOMEM; | ||
496 | a = kmalloc(sizeof(struct tc_action), GFP_KERNEL); | ||
497 | if (a == NULL) | ||
498 | return NULL; | ||
499 | memset(a, 0, sizeof(struct tc_action)); | ||
500 | |||
501 | *err = -EINVAL; | ||
502 | a->ops = tc_lookup_action(tb[TCA_ACT_KIND - 1]); | ||
503 | if (a->ops == NULL) | ||
504 | goto err_free; | ||
505 | if (a->ops->lookup == NULL) | ||
506 | goto err_mod; | ||
507 | *err = -ENOENT; | ||
508 | if (a->ops->lookup(a, index) == 0) | ||
509 | goto err_mod; | ||
510 | |||
511 | module_put(a->ops->owner); | ||
512 | *err = 0; | ||
513 | return a; | ||
514 | err_mod: | ||
515 | module_put(a->ops->owner); | ||
516 | err_free: | ||
517 | kfree(a); | ||
518 | return NULL; | ||
519 | } | ||
520 | |||
521 | static void cleanup_a(struct tc_action *act) | ||
522 | { | ||
523 | struct tc_action *a; | ||
524 | |||
525 | for (a = act; a; a = act) { | ||
526 | act = a->next; | ||
527 | kfree(a); | ||
528 | } | ||
529 | } | ||
530 | |||
531 | static struct tc_action *create_a(int i) | ||
532 | { | ||
533 | struct tc_action *act; | ||
534 | |||
535 | act = kmalloc(sizeof(*act), GFP_KERNEL); | ||
536 | if (act == NULL) { | ||
537 | printk("create_a: failed to alloc!\n"); | ||
538 | return NULL; | ||
539 | } | ||
540 | memset(act, 0, sizeof(*act)); | ||
541 | act->order = i; | ||
542 | return act; | ||
543 | } | ||
544 | |||
545 | static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid) | ||
546 | { | ||
547 | struct sk_buff *skb; | ||
548 | unsigned char *b; | ||
549 | struct nlmsghdr *nlh; | ||
550 | struct tcamsg *t; | ||
551 | struct netlink_callback dcb; | ||
552 | struct rtattr *x; | ||
553 | struct rtattr *tb[TCA_ACT_MAX+1]; | ||
554 | struct rtattr *kind; | ||
555 | struct tc_action *a = create_a(0); | ||
556 | int err = -EINVAL; | ||
557 | |||
558 | if (a == NULL) { | ||
559 | printk("tca_action_flush: couldnt create tc_action\n"); | ||
560 | return err; | ||
561 | } | ||
562 | |||
563 | skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); | ||
564 | if (!skb) { | ||
565 | printk("tca_action_flush: failed skb alloc\n"); | ||
566 | kfree(a); | ||
567 | return -ENOBUFS; | ||
568 | } | ||
569 | |||
570 | b = (unsigned char *)skb->tail; | ||
571 | |||
572 | if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) | ||
573 | goto err_out; | ||
574 | |||
575 | kind = tb[TCA_ACT_KIND-1]; | ||
576 | a->ops = tc_lookup_action(kind); | ||
577 | if (a->ops == NULL) | ||
578 | goto err_out; | ||
579 | |||
580 | nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t)); | ||
581 | t = NLMSG_DATA(nlh); | ||
582 | t->tca_family = AF_UNSPEC; | ||
583 | |||
584 | x = (struct rtattr *) skb->tail; | ||
585 | RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); | ||
586 | |||
587 | err = a->ops->walk(skb, &dcb, RTM_DELACTION, a); | ||
588 | if (err < 0) | ||
589 | goto rtattr_failure; | ||
590 | |||
591 | x->rta_len = skb->tail - (u8 *) x; | ||
592 | |||
593 | nlh->nlmsg_len = skb->tail - b; | ||
594 | nlh->nlmsg_flags |= NLM_F_ROOT; | ||
595 | module_put(a->ops->owner); | ||
596 | kfree(a); | ||
597 | err = rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); | ||
598 | if (err > 0) | ||
599 | return 0; | ||
600 | |||
601 | return err; | ||
602 | |||
603 | rtattr_failure: | ||
604 | module_put(a->ops->owner); | ||
605 | nlmsg_failure: | ||
606 | err_out: | ||
607 | kfree_skb(skb); | ||
608 | kfree(a); | ||
609 | return err; | ||
610 | } | ||
611 | |||
612 | static int | ||
613 | tca_action_gd(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int event) | ||
614 | { | ||
615 | int i, ret = 0; | ||
616 | struct rtattr *tb[TCA_ACT_MAX_PRIO+1]; | ||
617 | struct tc_action *head = NULL, *act, *act_prev = NULL; | ||
618 | |||
619 | if (rtattr_parse_nested(tb, TCA_ACT_MAX_PRIO, rta) < 0) | ||
620 | return -EINVAL; | ||
621 | |||
622 | if (event == RTM_DELACTION && n->nlmsg_flags&NLM_F_ROOT) { | ||
623 | if (tb[0] != NULL && tb[1] == NULL) | ||
624 | return tca_action_flush(tb[0], n, pid); | ||
625 | } | ||
626 | |||
627 | for (i=0; i < TCA_ACT_MAX_PRIO && tb[i]; i++) { | ||
628 | act = tcf_action_get_1(tb[i], n, pid, &ret); | ||
629 | if (act == NULL) | ||
630 | goto err; | ||
631 | act->order = i+1; | ||
632 | |||
633 | if (head == NULL) | ||
634 | head = act; | ||
635 | else | ||
636 | act_prev->next = act; | ||
637 | act_prev = act; | ||
638 | } | ||
639 | |||
640 | if (event == RTM_GETACTION) | ||
641 | ret = act_get_notify(pid, n, head, event); | ||
642 | else { /* delete */ | ||
643 | struct sk_buff *skb; | ||
644 | |||
645 | skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); | ||
646 | if (!skb) { | ||
647 | ret = -ENOBUFS; | ||
648 | goto err; | ||
649 | } | ||
650 | |||
651 | if (tca_get_fill(skb, head, pid, n->nlmsg_seq, 0, event, | ||
652 | 0, 1) <= 0) { | ||
653 | kfree_skb(skb); | ||
654 | ret = -EINVAL; | ||
655 | goto err; | ||
656 | } | ||
657 | |||
658 | /* now do the delete */ | ||
659 | tcf_action_destroy(head, 0); | ||
660 | ret = rtnetlink_send(skb, pid, RTMGRP_TC, | ||
661 | n->nlmsg_flags&NLM_F_ECHO); | ||
662 | if (ret > 0) | ||
663 | return 0; | ||
664 | return ret; | ||
665 | } | ||
666 | err: | ||
667 | cleanup_a(head); | ||
668 | return ret; | ||
669 | } | ||
670 | |||
671 | static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event, | ||
672 | unsigned flags) | ||
673 | { | ||
674 | struct tcamsg *t; | ||
675 | struct nlmsghdr *nlh; | ||
676 | struct sk_buff *skb; | ||
677 | struct rtattr *x; | ||
678 | unsigned char *b; | ||
679 | int err = 0; | ||
680 | |||
681 | skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); | ||
682 | if (!skb) | ||
683 | return -ENOBUFS; | ||
684 | |||
685 | b = (unsigned char *)skb->tail; | ||
686 | |||
687 | nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*t)); | ||
688 | nlh->nlmsg_flags = flags; | ||
689 | t = NLMSG_DATA(nlh); | ||
690 | t->tca_family = AF_UNSPEC; | ||
691 | |||
692 | x = (struct rtattr*) skb->tail; | ||
693 | RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); | ||
694 | |||
695 | if (tcf_action_dump(skb, a, 0, 0) < 0) | ||
696 | goto rtattr_failure; | ||
697 | |||
698 | x->rta_len = skb->tail - (u8*)x; | ||
699 | |||
700 | nlh->nlmsg_len = skb->tail - b; | ||
701 | NETLINK_CB(skb).dst_groups = RTMGRP_TC; | ||
702 | |||
703 | err = rtnetlink_send(skb, pid, RTMGRP_TC, flags&NLM_F_ECHO); | ||
704 | if (err > 0) | ||
705 | err = 0; | ||
706 | return err; | ||
707 | |||
708 | rtattr_failure: | ||
709 | nlmsg_failure: | ||
710 | skb_trim(skb, b - skb->data); | ||
711 | return -1; | ||
712 | } | ||
713 | |||
714 | |||
715 | static int | ||
716 | tcf_action_add(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int ovr) | ||
717 | { | ||
718 | int ret = 0; | ||
719 | struct tc_action *act; | ||
720 | struct tc_action *a; | ||
721 | u32 seq = n->nlmsg_seq; | ||
722 | |||
723 | act = tcf_action_init(rta, NULL, NULL, ovr, 0, &ret); | ||
724 | if (act == NULL) | ||
725 | goto done; | ||
726 | |||
727 | /* dump then free all the actions after update; inserted policy | ||
728 | * stays intact | ||
729 | * */ | ||
730 | ret = tcf_add_notify(act, pid, seq, RTM_NEWACTION, n->nlmsg_flags); | ||
731 | for (a = act; a; a = act) { | ||
732 | act = a->next; | ||
733 | kfree(a); | ||
734 | } | ||
735 | done: | ||
736 | return ret; | ||
737 | } | ||
738 | |||
739 | static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg) | ||
740 | { | ||
741 | struct rtattr **tca = arg; | ||
742 | u32 pid = skb ? NETLINK_CB(skb).pid : 0; | ||
743 | int ret = 0, ovr = 0; | ||
744 | |||
745 | if (tca[TCA_ACT_TAB-1] == NULL) { | ||
746 | printk("tc_ctl_action: received NO action attribs\n"); | ||
747 | return -EINVAL; | ||
748 | } | ||
749 | |||
750 | /* n->nlmsg_flags&NLM_F_CREATE | ||
751 | * */ | ||
752 | switch (n->nlmsg_type) { | ||
753 | case RTM_NEWACTION: | ||
754 | /* We are going to assume all other flags | ||
755 | * imply create only if it doesn't exist. | ||
756 | * Note that CREATE | EXCL implies that, | ||
757 | * but since we want to avoid ambiguity (e.g. when flags | ||
758 | * are zero) we just set this. | ||
759 | */ | ||
760 | if (n->nlmsg_flags&NLM_F_REPLACE) | ||
761 | ovr = 1; | ||
762 | replay: | ||
763 | ret = tcf_action_add(tca[TCA_ACT_TAB-1], n, pid, ovr); | ||
764 | if (ret == -EAGAIN) | ||
765 | goto replay; | ||
766 | break; | ||
767 | case RTM_DELACTION: | ||
768 | ret = tca_action_gd(tca[TCA_ACT_TAB-1], n, pid, RTM_DELACTION); | ||
769 | break; | ||
770 | case RTM_GETACTION: | ||
771 | ret = tca_action_gd(tca[TCA_ACT_TAB-1], n, pid, RTM_GETACTION); | ||
772 | break; | ||
773 | default: | ||
774 | BUG(); | ||
775 | } | ||
776 | |||
777 | return ret; | ||
778 | } | ||
779 | |||
780 | static char * | ||
781 | find_dump_kind(struct nlmsghdr *n) | ||
782 | { | ||
783 | struct rtattr *tb1, *tb2[TCA_ACT_MAX+1]; | ||
784 | struct rtattr *tb[TCA_ACT_MAX_PRIO + 1]; | ||
785 | struct rtattr *rta[TCAA_MAX + 1]; | ||
786 | struct rtattr *kind; | ||
787 | int min_len = NLMSG_LENGTH(sizeof(struct tcamsg)); | ||
788 | int attrlen = n->nlmsg_len - NLMSG_ALIGN(min_len); | ||
789 | struct rtattr *attr = (void *) n + NLMSG_ALIGN(min_len); | ||
790 | |||
791 | if (rtattr_parse(rta, TCAA_MAX, attr, attrlen) < 0) | ||
792 | return NULL; | ||
793 | tb1 = rta[TCA_ACT_TAB - 1]; | ||
794 | if (tb1 == NULL) | ||
795 | return NULL; | ||
796 | |||
797 | if (rtattr_parse(tb, TCA_ACT_MAX_PRIO, RTA_DATA(tb1), | ||
798 | NLMSG_ALIGN(RTA_PAYLOAD(tb1))) < 0) | ||
799 | return NULL; | ||
800 | if (tb[0] == NULL) | ||
801 | return NULL; | ||
802 | |||
803 | if (rtattr_parse(tb2, TCA_ACT_MAX, RTA_DATA(tb[0]), | ||
804 | RTA_PAYLOAD(tb[0])) < 0) | ||
805 | return NULL; | ||
806 | kind = tb2[TCA_ACT_KIND-1]; | ||
807 | |||
808 | return (char *) RTA_DATA(kind); | ||
809 | } | ||
810 | |||
811 | static int | ||
812 | tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) | ||
813 | { | ||
814 | struct nlmsghdr *nlh; | ||
815 | unsigned char *b = skb->tail; | ||
816 | struct rtattr *x; | ||
817 | struct tc_action_ops *a_o; | ||
818 | struct tc_action a; | ||
819 | int ret = 0; | ||
820 | struct tcamsg *t = (struct tcamsg *) NLMSG_DATA(cb->nlh); | ||
821 | char *kind = find_dump_kind(cb->nlh); | ||
822 | |||
823 | if (kind == NULL) { | ||
824 | printk("tc_dump_action: action bad kind\n"); | ||
825 | return 0; | ||
826 | } | ||
827 | |||
828 | a_o = tc_lookup_action_n(kind); | ||
829 | if (a_o == NULL) { | ||
830 | printk("failed to find %s\n", kind); | ||
831 | return 0; | ||
832 | } | ||
833 | |||
834 | memset(&a, 0, sizeof(struct tc_action)); | ||
835 | a.ops = a_o; | ||
836 | |||
837 | if (a_o->walk == NULL) { | ||
838 | printk("tc_dump_action: %s !capable of dumping table\n", kind); | ||
839 | goto rtattr_failure; | ||
840 | } | ||
841 | |||
842 | nlh = NLMSG_PUT(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, | ||
843 | cb->nlh->nlmsg_type, sizeof(*t)); | ||
844 | t = NLMSG_DATA(nlh); | ||
845 | t->tca_family = AF_UNSPEC; | ||
846 | |||
847 | x = (struct rtattr *) skb->tail; | ||
848 | RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); | ||
849 | |||
850 | ret = a_o->walk(skb, cb, RTM_GETACTION, &a); | ||
851 | if (ret < 0) | ||
852 | goto rtattr_failure; | ||
853 | |||
854 | if (ret > 0) { | ||
855 | x->rta_len = skb->tail - (u8 *) x; | ||
856 | ret = skb->len; | ||
857 | } else | ||
858 | skb_trim(skb, (u8*)x - skb->data); | ||
859 | |||
860 | nlh->nlmsg_len = skb->tail - b; | ||
861 | if (NETLINK_CB(cb->skb).pid && ret) | ||
862 | nlh->nlmsg_flags |= NLM_F_MULTI; | ||
863 | module_put(a_o->owner); | ||
864 | return skb->len; | ||
865 | |||
866 | rtattr_failure: | ||
867 | nlmsg_failure: | ||
868 | module_put(a_o->owner); | ||
869 | skb_trim(skb, b - skb->data); | ||
870 | return skb->len; | ||
871 | } | ||
872 | |||
873 | static int __init tc_action_init(void) | ||
874 | { | ||
875 | struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC]; | ||
876 | |||
877 | if (link_p) { | ||
878 | link_p[RTM_NEWACTION-RTM_BASE].doit = tc_ctl_action; | ||
879 | link_p[RTM_DELACTION-RTM_BASE].doit = tc_ctl_action; | ||
880 | link_p[RTM_GETACTION-RTM_BASE].doit = tc_ctl_action; | ||
881 | link_p[RTM_GETACTION-RTM_BASE].dumpit = tc_dump_action; | ||
882 | } | ||
883 | |||
884 | printk("TC classifier action (bugs to netdev@oss.sgi.com cc " | ||
885 | "hadi@cyberus.ca)\n"); | ||
886 | return 0; | ||
887 | } | ||
888 | |||
889 | subsys_initcall(tc_action_init); | ||
890 | |||
891 | EXPORT_SYMBOL(tcf_register_action); | ||
892 | EXPORT_SYMBOL(tcf_unregister_action); | ||
893 | EXPORT_SYMBOL(tcf_action_exec); | ||
894 | EXPORT_SYMBOL(tcf_action_dump_1); | ||
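The file above only provides the action registry and the rtnetlink plumbing; concrete actions (police, gact, mirred, ipt, pedit) plug into it through struct tc_action_ops. The following sketch is not part of the commit: it shows roughly how a hypothetical "foo" action module would hook in, with callback signatures inferred from the calls in act_api.c. Everything named foo, the set of included headers, and the omission of the dump/lookup/walk callbacks are assumptions for illustration only.

#include <linux/module.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <linux/pkt_cls.h>
#include <net/act_api.h>

/* Accept every packet; signature follows a->ops->act(&skb, a) above. */
static int tcf_foo(struct sk_buff **pskb, struct tc_action *a)
{
	return TC_ACT_OK;
}

/* Parse TCA_ACT_OPTIONS; signature follows a_o->init(rta, est, a, ovr, bind). */
static int tcf_foo_init(struct rtattr *rta, struct rtattr *est,
			struct tc_action *a, int ovr, int bind)
{
	return ACT_P_CREATED;	/* report that a brand new policy was created */
}

/* A real action frees its private state here and returns ACT_P_DELETED
 * once the last reference is gone; this sketch keeps no state. */
static int tcf_foo_cleanup(struct tc_action *a, int bind)
{
	return 0;
}

static struct tc_action_ops act_foo_ops = {
	.kind		= "foo",
	.owner		= THIS_MODULE,
	.act		= tcf_foo,
	.init		= tcf_foo_init,
	.cleanup	= tcf_foo_cleanup,
};

static int __init foo_init_module(void)
{
	return tcf_register_action(&act_foo_ops);
}

static void __exit foo_cleanup_module(void)
{
	tcf_unregister_action(&act_foo_ops);
}

module_init(foo_init_module);
module_exit(foo_cleanup_module);
MODULE_LICENSE("GPL");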
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
new file mode 100644
index 000000000000..56e66c3fe0fa
--- /dev/null
+++ b/net/sched/cls_api.c
@@ -0,0 +1,642 @@ | |||
1 | /* | ||
2 | * net/sched/cls_api.c Packet classifier API. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
10 | * | ||
11 | * Changes: | ||
12 | * | ||
13 | * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | #include <asm/uaccess.h> | ||
18 | #include <asm/system.h> | ||
19 | #include <linux/bitops.h> | ||
20 | #include <linux/config.h> | ||
21 | #include <linux/module.h> | ||
22 | #include <linux/types.h> | ||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/sched.h> | ||
25 | #include <linux/string.h> | ||
26 | #include <linux/mm.h> | ||
27 | #include <linux/socket.h> | ||
28 | #include <linux/sockios.h> | ||
29 | #include <linux/in.h> | ||
30 | #include <linux/errno.h> | ||
31 | #include <linux/interrupt.h> | ||
32 | #include <linux/netdevice.h> | ||
33 | #include <linux/skbuff.h> | ||
34 | #include <linux/rtnetlink.h> | ||
35 | #include <linux/init.h> | ||
36 | #include <linux/kmod.h> | ||
37 | #include <net/sock.h> | ||
38 | #include <net/pkt_sched.h> | ||
39 | #include <net/pkt_cls.h> | ||
40 | |||
41 | #if 0 /* control */ | ||
42 | #define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) | ||
43 | #else | ||
44 | #define DPRINTK(format,args...) | ||
45 | #endif | ||
46 | |||
47 | /* The list of all installed classifier types */ | ||
48 | |||
49 | static struct tcf_proto_ops *tcf_proto_base; | ||
50 | |||
51 | /* Protects list of registered TC modules. It is pure SMP lock. */ | ||
52 | static DEFINE_RWLOCK(cls_mod_lock); | ||
53 | |||
54 | /* Find classifier type by string name */ | ||
55 | |||
56 | static struct tcf_proto_ops * tcf_proto_lookup_ops(struct rtattr *kind) | ||
57 | { | ||
58 | struct tcf_proto_ops *t = NULL; | ||
59 | |||
60 | if (kind) { | ||
61 | read_lock(&cls_mod_lock); | ||
62 | for (t = tcf_proto_base; t; t = t->next) { | ||
63 | if (rtattr_strcmp(kind, t->kind) == 0) { | ||
64 | if (!try_module_get(t->owner)) | ||
65 | t = NULL; | ||
66 | break; | ||
67 | } | ||
68 | } | ||
69 | read_unlock(&cls_mod_lock); | ||
70 | } | ||
71 | return t; | ||
72 | } | ||
73 | |||
74 | /* Register(unregister) new classifier type */ | ||
75 | |||
76 | int register_tcf_proto_ops(struct tcf_proto_ops *ops) | ||
77 | { | ||
78 | struct tcf_proto_ops *t, **tp; | ||
79 | int rc = -EEXIST; | ||
80 | |||
81 | write_lock(&cls_mod_lock); | ||
82 | for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next) | ||
83 | if (!strcmp(ops->kind, t->kind)) | ||
84 | goto out; | ||
85 | |||
86 | ops->next = NULL; | ||
87 | *tp = ops; | ||
88 | rc = 0; | ||
89 | out: | ||
90 | write_unlock(&cls_mod_lock); | ||
91 | return rc; | ||
92 | } | ||
93 | |||
94 | int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) | ||
95 | { | ||
96 | struct tcf_proto_ops *t, **tp; | ||
97 | int rc = -ENOENT; | ||
98 | |||
99 | write_lock(&cls_mod_lock); | ||
100 | for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) | ||
101 | if (t == ops) | ||
102 | break; | ||
103 | |||
104 | if (!t) | ||
105 | goto out; | ||
106 | *tp = t->next; | ||
107 | rc = 0; | ||
108 | out: | ||
109 | write_unlock(&cls_mod_lock); | ||
110 | return rc; | ||
111 | } | ||
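/*
 * Editorial sketch (not part of the original commit): classifier modules
 * attach to the registry managed by the two functions above, mirroring
 * the way action modules use tcf_register_action().  Names containing
 * "foo" are hypothetical and for illustration only:
 *
 *	static struct tcf_proto_ops cls_foo_ops = {
 *		.kind	= "foo",
 *		.owner	= THIS_MODULE,
 *		(classify/init/destroy/... callbacks omitted)
 *	};
 *
 *	static int __init init_foo(void)
 *	{
 *		return register_tcf_proto_ops(&cls_foo_ops);
 *	}
 *
 *	static void __exit exit_foo(void)
 *	{
 *		unregister_tcf_proto_ops(&cls_foo_ops);
 *	}
 */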
112 | |||
113 | static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, | ||
114 | struct tcf_proto *tp, unsigned long fh, int event); | ||
115 | |||
116 | |||
117 | /* Select new prio value from the range, managed by kernel. */ | ||
118 | |||
119 | static __inline__ u32 tcf_auto_prio(struct tcf_proto *tp) | ||
120 | { | ||
121 | u32 first = TC_H_MAKE(0xC0000000U,0U); | ||
122 | |||
123 | if (tp) | ||
124 | first = tp->prio-1; | ||
125 | |||
126 | return first; | ||
127 | } | ||
128 | |||
129 | /* Add/change/delete/get a filter node */ | ||
130 | |||
131 | static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) | ||
132 | { | ||
133 | struct rtattr **tca; | ||
134 | struct tcmsg *t; | ||
135 | u32 protocol; | ||
136 | u32 prio; | ||
137 | u32 nprio; | ||
138 | u32 parent; | ||
139 | struct net_device *dev; | ||
140 | struct Qdisc *q; | ||
141 | struct tcf_proto **back, **chain; | ||
142 | struct tcf_proto *tp; | ||
143 | struct tcf_proto_ops *tp_ops; | ||
144 | struct Qdisc_class_ops *cops; | ||
145 | unsigned long cl; | ||
146 | unsigned long fh; | ||
147 | int err; | ||
148 | |||
149 | replay: | ||
150 | tca = arg; | ||
151 | t = NLMSG_DATA(n); | ||
152 | protocol = TC_H_MIN(t->tcm_info); | ||
153 | prio = TC_H_MAJ(t->tcm_info); | ||
154 | nprio = prio; | ||
155 | parent = t->tcm_parent; | ||
156 | cl = 0; | ||
157 | |||
158 | if (prio == 0) { | ||
159 | /* If no priority is given, the user wants us to allocate it. */ | ||
160 | if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) | ||
161 | return -ENOENT; | ||
162 | prio = TC_H_MAKE(0x80000000U,0U); | ||
163 | } | ||
164 | |||
165 | /* Find head of filter chain. */ | ||
166 | |||
167 | /* Find link */ | ||
168 | if ((dev = __dev_get_by_index(t->tcm_ifindex)) == NULL) | ||
169 | return -ENODEV; | ||
170 | |||
171 | /* Find qdisc */ | ||
172 | if (!parent) { | ||
173 | q = dev->qdisc_sleeping; | ||
174 | parent = q->handle; | ||
175 | } else if ((q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent))) == NULL) | ||
176 | return -EINVAL; | ||
177 | |||
178 | /* Is it classful? */ | ||
179 | if ((cops = q->ops->cl_ops) == NULL) | ||
180 | return -EINVAL; | ||
181 | |||
182 | /* Are we searching for a filter attached to a class? */ | ||
183 | if (TC_H_MIN(parent)) { | ||
184 | cl = cops->get(q, parent); | ||
185 | if (cl == 0) | ||
186 | return -ENOENT; | ||
187 | } | ||
188 | |||
189 | /* And the final step: locate the filter chain */ | ||
190 | chain = cops->tcf_chain(q, cl); | ||
191 | err = -EINVAL; | ||
192 | if (chain == NULL) | ||
193 | goto errout; | ||
194 | |||
195 | /* Check the chain for existence of proto-tcf with this priority */ | ||
196 | for (back = chain; (tp=*back) != NULL; back = &tp->next) { | ||
197 | if (tp->prio >= prio) { | ||
198 | if (tp->prio == prio) { | ||
199 | if (!nprio || (tp->protocol != protocol && protocol)) | ||
200 | goto errout; | ||
201 | } else | ||
202 | tp = NULL; | ||
203 | break; | ||
204 | } | ||
205 | } | ||
206 | |||
207 | if (tp == NULL) { | ||
208 | /* Proto-tcf does not exist, create new one */ | ||
209 | |||
210 | if (tca[TCA_KIND-1] == NULL || !protocol) | ||
211 | goto errout; | ||
212 | |||
213 | err = -ENOENT; | ||
214 | if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) | ||
215 | goto errout; | ||
216 | |||
217 | |||
218 | /* Create new proto tcf */ | ||
219 | |||
220 | err = -ENOBUFS; | ||
221 | if ((tp = kmalloc(sizeof(*tp), GFP_KERNEL)) == NULL) | ||
222 | goto errout; | ||
223 | err = -EINVAL; | ||
224 | tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND-1]); | ||
225 | if (tp_ops == NULL) { | ||
226 | #ifdef CONFIG_KMOD | ||
227 | struct rtattr *kind = tca[TCA_KIND-1]; | ||
228 | char name[IFNAMSIZ]; | ||
229 | |||
230 | if (kind != NULL && | ||
231 | rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { | ||
232 | rtnl_unlock(); | ||
233 | request_module("cls_%s", name); | ||
234 | rtnl_lock(); | ||
235 | tp_ops = tcf_proto_lookup_ops(kind); | ||
236 | /* We dropped the RTNL semaphore in order to | ||
237 | * perform the module load. So, even if we | ||
238 | * succeeded in loading the module we have to | ||
239 | * replay the request. We indicate this using | ||
240 | * -EAGAIN. | ||
241 | */ | ||
242 | if (tp_ops != NULL) { | ||
243 | module_put(tp_ops->owner); | ||
244 | err = -EAGAIN; | ||
245 | } | ||
246 | } | ||
247 | #endif | ||
248 | kfree(tp); | ||
249 | goto errout; | ||
250 | } | ||
251 | memset(tp, 0, sizeof(*tp)); | ||
252 | tp->ops = tp_ops; | ||
253 | tp->protocol = protocol; | ||
254 | tp->prio = nprio ? : tcf_auto_prio(*back); | ||
255 | tp->q = q; | ||
256 | tp->classify = tp_ops->classify; | ||
257 | tp->classid = parent; | ||
258 | if ((err = tp_ops->init(tp)) != 0) { | ||
259 | module_put(tp_ops->owner); | ||
260 | kfree(tp); | ||
261 | goto errout; | ||
262 | } | ||
263 | |||
264 | qdisc_lock_tree(dev); | ||
265 | tp->next = *back; | ||
266 | *back = tp; | ||
267 | qdisc_unlock_tree(dev); | ||
268 | |||
269 | } else if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], tp->ops->kind)) | ||
270 | goto errout; | ||
271 | |||
272 | fh = tp->ops->get(tp, t->tcm_handle); | ||
273 | |||
274 | if (fh == 0) { | ||
275 | if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { | ||
276 | qdisc_lock_tree(dev); | ||
277 | *back = tp->next; | ||
278 | qdisc_unlock_tree(dev); | ||
279 | |||
280 | tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER); | ||
281 | tcf_destroy(tp); | ||
282 | err = 0; | ||
283 | goto errout; | ||
284 | } | ||
285 | |||
286 | err = -ENOENT; | ||
287 | if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) | ||
288 | goto errout; | ||
289 | } else { | ||
290 | switch (n->nlmsg_type) { | ||
291 | case RTM_NEWTFILTER: | ||
292 | err = -EEXIST; | ||
293 | if (n->nlmsg_flags&NLM_F_EXCL) | ||
294 | goto errout; | ||
295 | break; | ||
296 | case RTM_DELTFILTER: | ||
297 | err = tp->ops->delete(tp, fh); | ||
298 | if (err == 0) | ||
299 | tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER); | ||
300 | goto errout; | ||
301 | case RTM_GETTFILTER: | ||
302 | err = tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); | ||
303 | goto errout; | ||
304 | default: | ||
305 | err = -EINVAL; | ||
306 | goto errout; | ||
307 | } | ||
308 | } | ||
309 | |||
310 | err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh); | ||
311 | if (err == 0) | ||
312 | tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); | ||
313 | |||
314 | errout: | ||
315 | if (cl) | ||
316 | cops->put(q, cl); | ||
317 | if (err == -EAGAIN) | ||
318 | /* Replay the request. */ | ||
319 | goto replay; | ||
320 | return err; | ||
321 | } | ||
322 | |||
323 | static int | ||
324 | tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh, | ||
325 | u32 pid, u32 seq, unsigned flags, int event) | ||
326 | { | ||
327 | struct tcmsg *tcm; | ||
328 | struct nlmsghdr *nlh; | ||
329 | unsigned char *b = skb->tail; | ||
330 | |||
331 | nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); | ||
332 | nlh->nlmsg_flags = flags; | ||
333 | tcm = NLMSG_DATA(nlh); | ||
334 | tcm->tcm_family = AF_UNSPEC; | ||
335 | tcm->tcm_ifindex = tp->q->dev->ifindex; | ||
336 | tcm->tcm_parent = tp->classid; | ||
337 | tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); | ||
338 | RTA_PUT(skb, TCA_KIND, IFNAMSIZ, tp->ops->kind); | ||
339 | tcm->tcm_handle = fh; | ||
340 | if (RTM_DELTFILTER != event) { | ||
341 | tcm->tcm_handle = 0; | ||
342 | if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0) | ||
343 | goto rtattr_failure; | ||
344 | } | ||
345 | nlh->nlmsg_len = skb->tail - b; | ||
346 | return skb->len; | ||
347 | |||
348 | nlmsg_failure: | ||
349 | rtattr_failure: | ||
350 | skb_trim(skb, b - skb->data); | ||
351 | return -1; | ||
352 | } | ||
353 | |||
354 | static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, | ||
355 | struct tcf_proto *tp, unsigned long fh, int event) | ||
356 | { | ||
357 | struct sk_buff *skb; | ||
358 | u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; | ||
359 | |||
360 | skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); | ||
361 | if (!skb) | ||
362 | return -ENOBUFS; | ||
363 | |||
364 | if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) { | ||
365 | kfree_skb(skb); | ||
366 | return -EINVAL; | ||
367 | } | ||
368 | |||
369 | return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); | ||
370 | } | ||
371 | |||
372 | struct tcf_dump_args | ||
373 | { | ||
374 | struct tcf_walker w; | ||
375 | struct sk_buff *skb; | ||
376 | struct netlink_callback *cb; | ||
377 | }; | ||
378 | |||
379 | static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct tcf_walker *arg) | ||
380 | { | ||
381 | struct tcf_dump_args *a = (void*)arg; | ||
382 | |||
383 | return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid, | ||
384 | a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); | ||
385 | } | ||
386 | |||
387 | static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) | ||
388 | { | ||
389 | int t; | ||
390 | int s_t; | ||
391 | struct net_device *dev; | ||
392 | struct Qdisc *q; | ||
393 | struct tcf_proto *tp, **chain; | ||
394 | struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); | ||
395 | unsigned long cl = 0; | ||
396 | struct Qdisc_class_ops *cops; | ||
397 | struct tcf_dump_args arg; | ||
398 | |||
399 | if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) | ||
400 | return skb->len; | ||
401 | if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) | ||
402 | return skb->len; | ||
403 | |||
404 | read_lock_bh(&qdisc_tree_lock); | ||
405 | if (!tcm->tcm_parent) | ||
406 | q = dev->qdisc_sleeping; | ||
407 | else | ||
408 | q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); | ||
409 | if (!q) | ||
410 | goto out; | ||
411 | if ((cops = q->ops->cl_ops) == NULL) | ||
412 | goto errout; | ||
413 | if (TC_H_MIN(tcm->tcm_parent)) { | ||
414 | cl = cops->get(q, tcm->tcm_parent); | ||
415 | if (cl == 0) | ||
416 | goto errout; | ||
417 | } | ||
418 | chain = cops->tcf_chain(q, cl); | ||
419 | if (chain == NULL) | ||
420 | goto errout; | ||
421 | |||
422 | s_t = cb->args[0]; | ||
423 | |||
424 | for (tp=*chain, t=0; tp; tp = tp->next, t++) { | ||
425 | if (t < s_t) continue; | ||
426 | if (TC_H_MAJ(tcm->tcm_info) && | ||
427 | TC_H_MAJ(tcm->tcm_info) != tp->prio) | ||
428 | continue; | ||
429 | if (TC_H_MIN(tcm->tcm_info) && | ||
430 | TC_H_MIN(tcm->tcm_info) != tp->protocol) | ||
431 | continue; | ||
432 | if (t > s_t) | ||
433 | memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); | ||
434 | if (cb->args[1] == 0) { | ||
435 | if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid, | ||
436 | cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) { | ||
437 | break; | ||
438 | } | ||
439 | cb->args[1] = 1; | ||
440 | } | ||
441 | if (tp->ops->walk == NULL) | ||
442 | continue; | ||
443 | arg.w.fn = tcf_node_dump; | ||
444 | arg.skb = skb; | ||
445 | arg.cb = cb; | ||
446 | arg.w.stop = 0; | ||
447 | arg.w.skip = cb->args[1]-1; | ||
448 | arg.w.count = 0; | ||
449 | tp->ops->walk(tp, &arg.w); | ||
450 | cb->args[1] = arg.w.count+1; | ||
451 | if (arg.w.stop) | ||
452 | break; | ||
453 | } | ||
454 | |||
455 | cb->args[0] = t; | ||
456 | |||
457 | errout: | ||
458 | if (cl) | ||
459 | cops->put(q, cl); | ||
460 | out: | ||
461 | read_unlock_bh(&qdisc_tree_lock); | ||
462 | dev_put(dev); | ||
463 | return skb->len; | ||
464 | } | ||
465 | |||
466 | void | ||
467 | tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) | ||
468 | { | ||
469 | #ifdef CONFIG_NET_CLS_ACT | ||
470 | if (exts->action) { | ||
471 | tcf_action_destroy(exts->action, TCA_ACT_UNBIND); | ||
472 | exts->action = NULL; | ||
473 | } | ||
474 | #elif defined CONFIG_NET_CLS_POLICE | ||
475 | if (exts->police) { | ||
476 | tcf_police_release(exts->police, TCA_ACT_UNBIND); | ||
477 | exts->police = NULL; | ||
478 | } | ||
479 | #endif | ||
480 | } | ||
481 | |||
482 | |||
483 | int | ||
484 | tcf_exts_validate(struct tcf_proto *tp, struct rtattr **tb, | ||
485 | struct rtattr *rate_tlv, struct tcf_exts *exts, | ||
486 | struct tcf_ext_map *map) | ||
487 | { | ||
488 | memset(exts, 0, sizeof(*exts)); | ||
489 | |||
490 | #ifdef CONFIG_NET_CLS_ACT | ||
491 | { | ||
492 | int err; | ||
493 | struct tc_action *act; | ||
494 | |||
495 | if (map->police && tb[map->police-1]) { | ||
496 | act = tcf_action_init_1(tb[map->police-1], rate_tlv, "police", | ||
497 | TCA_ACT_NOREPLACE, TCA_ACT_BIND, &err); | ||
498 | if (act == NULL) | ||
499 | return err; | ||
500 | |||
501 | act->type = TCA_OLD_COMPAT; | ||
502 | exts->action = act; | ||
503 | } else if (map->action && tb[map->action-1]) { | ||
504 | act = tcf_action_init(tb[map->action-1], rate_tlv, NULL, | ||
505 | TCA_ACT_NOREPLACE, TCA_ACT_BIND, &err); | ||
506 | if (act == NULL) | ||
507 | return err; | ||
508 | |||
509 | exts->action = act; | ||
510 | } | ||
511 | } | ||
512 | #elif defined CONFIG_NET_CLS_POLICE | ||
513 | if (map->police && tb[map->police-1]) { | ||
514 | struct tcf_police *p; | ||
515 | |||
516 | p = tcf_police_locate(tb[map->police-1], rate_tlv); | ||
517 | if (p == NULL) | ||
518 | return -EINVAL; | ||
519 | |||
520 | exts->police = p; | ||
521 | } else if (map->action && tb[map->action-1]) | ||
522 | return -EOPNOTSUPP; | ||
523 | #else | ||
524 | if ((map->action && tb[map->action-1]) || | ||
525 | (map->police && tb[map->police-1])) | ||
526 | return -EOPNOTSUPP; | ||
527 | #endif | ||
528 | |||
529 | return 0; | ||
530 | } | ||
531 | |||
532 | void | ||
533 | tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, | ||
534 | struct tcf_exts *src) | ||
535 | { | ||
536 | #ifdef CONFIG_NET_CLS_ACT | ||
537 | if (src->action) { | ||
538 | struct tc_action *act; | ||
539 | tcf_tree_lock(tp); | ||
540 | act = xchg(&dst->action, src->action); | ||
541 | tcf_tree_unlock(tp); | ||
542 | if (act) | ||
543 | tcf_action_destroy(act, TCA_ACT_UNBIND); | ||
544 | } | ||
545 | #elif defined CONFIG_NET_CLS_POLICE | ||
546 | if (src->police) { | ||
547 | struct tcf_police *p; | ||
548 | tcf_tree_lock(tp); | ||
549 | p = xchg(&dst->police, src->police); | ||
550 | tcf_tree_unlock(tp); | ||
551 | if (p) | ||
552 | tcf_police_release(p, TCA_ACT_UNBIND); | ||
553 | } | ||
554 | #endif | ||
555 | } | ||
556 | |||
557 | int | ||
558 | tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts, | ||
559 | struct tcf_ext_map *map) | ||
560 | { | ||
561 | #ifdef CONFIG_NET_CLS_ACT | ||
562 | if (map->action && exts->action) { | ||
563 | /* | ||
564 | * again, for backward-compatible mode - we want | ||
565 | * to work with both old and new modes of entering | ||
566 | * tc data even if iproute2 is newer - jhs | ||
567 | */ | ||
568 | struct rtattr * p_rta = (struct rtattr*) skb->tail; | ||
569 | |||
570 | if (exts->action->type != TCA_OLD_COMPAT) { | ||
571 | RTA_PUT(skb, map->action, 0, NULL); | ||
572 | if (tcf_action_dump(skb, exts->action, 0, 0) < 0) | ||
573 | goto rtattr_failure; | ||
574 | p_rta->rta_len = skb->tail - (u8*)p_rta; | ||
575 | } else if (map->police) { | ||
576 | RTA_PUT(skb, map->police, 0, NULL); | ||
577 | if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0) | ||
578 | goto rtattr_failure; | ||
579 | p_rta->rta_len = skb->tail - (u8*)p_rta; | ||
580 | } | ||
581 | } | ||
582 | #elif defined CONFIG_NET_CLS_POLICE | ||
583 | if (map->police && exts->police) { | ||
584 | struct rtattr * p_rta = (struct rtattr*) skb->tail; | ||
585 | |||
586 | RTA_PUT(skb, map->police, 0, NULL); | ||
587 | |||
588 | if (tcf_police_dump(skb, exts->police) < 0) | ||
589 | goto rtattr_failure; | ||
590 | |||
591 | p_rta->rta_len = skb->tail - (u8*)p_rta; | ||
592 | } | ||
593 | #endif | ||
594 | return 0; | ||
595 | rtattr_failure: __attribute__ ((unused)) | ||
596 | return -1; | ||
597 | } | ||
598 | |||
599 | int | ||
600 | tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts, | ||
601 | struct tcf_ext_map *map) | ||
602 | { | ||
603 | #ifdef CONFIG_NET_CLS_ACT | ||
604 | if (exts->action) | ||
605 | if (tcf_action_copy_stats(skb, exts->action, 1) < 0) | ||
606 | goto rtattr_failure; | ||
607 | #elif defined CONFIG_NET_CLS_POLICE | ||
608 | if (exts->police) | ||
609 | if (tcf_police_dump_stats(skb, exts->police) < 0) | ||
610 | goto rtattr_failure; | ||
611 | #endif | ||
612 | return 0; | ||
613 | rtattr_failure: __attribute__ ((unused)) | ||
614 | return -1; | ||
615 | } | ||
616 | |||
617 | static int __init tc_filter_init(void) | ||
618 | { | ||
619 | struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC]; | ||
620 | |||
621 | /* Set up rtnetlink links. This is done here to avoid | ||
622 | exporting a large number of public symbols. | ||
623 | */ | ||
624 | |||
625 | if (link_p) { | ||
626 | link_p[RTM_NEWTFILTER-RTM_BASE].doit = tc_ctl_tfilter; | ||
627 | link_p[RTM_DELTFILTER-RTM_BASE].doit = tc_ctl_tfilter; | ||
628 | link_p[RTM_GETTFILTER-RTM_BASE].doit = tc_ctl_tfilter; | ||
629 | link_p[RTM_GETTFILTER-RTM_BASE].dumpit = tc_dump_tfilter; | ||
630 | } | ||
631 | return 0; | ||
632 | } | ||
633 | |||
634 | subsys_initcall(tc_filter_init); | ||
635 | |||
636 | EXPORT_SYMBOL(register_tcf_proto_ops); | ||
637 | EXPORT_SYMBOL(unregister_tcf_proto_ops); | ||
638 | EXPORT_SYMBOL(tcf_exts_validate); | ||
639 | EXPORT_SYMBOL(tcf_exts_destroy); | ||
640 | EXPORT_SYMBOL(tcf_exts_change); | ||
641 | EXPORT_SYMBOL(tcf_exts_dump); | ||
642 | EXPORT_SYMBOL(tcf_exts_dump_stats); | ||
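The registration interface exported above (register_tcf_proto_ops / unregister_tcf_proto_ops) is the single entry point by which every classifier in this directory plugs into the filter core. As a rough editorial sketch of that pattern (all cls_example_* / example_* names are hypothetical, and only the registration step is shown; a usable classifier must also supply the get/put/change/delete/destroy/walk/dump callbacks that tc_ctl_tfilter() invokes, exactly as cls_basic.c below does):

#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/pkt_cls.h>

/* Stub classify callback: never matches. A real classifier would
 * inspect the skb and fill in *res on a match. */
static int example_classify(struct sk_buff *skb, struct tcf_proto *tp,
			    struct tcf_result *res)
{
	return -1;
}

static int example_init(struct tcf_proto *tp)
{
	return 0;
}

static struct tcf_proto_ops cls_example_ops = {
	.kind		= "example",
	.classify	= example_classify,
	.init		= example_init,
	.owner		= THIS_MODULE,
};

static int __init init_example(void)
{
	/* Fails with -EEXIST if a classifier of the same kind is already registered. */
	return register_tcf_proto_ops(&cls_example_ops);
}

static void __exit exit_example(void)
{
	unregister_tcf_proto_ops(&cls_example_ops);
}

module_init(init_example)
module_exit(exit_example)
MODULE_LICENSE("GPL");

Because the kind string is what tcf_proto_lookup_ops() matches against, loading such a module (or letting the CONFIG_KMOD path's request_module("cls_%s", ...) load it on demand) makes that kind selectable from the netlink filter requests handled above.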
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c new file mode 100644 index 000000000000..0d2d4415f334 --- /dev/null +++ b/net/sched/cls_basic.c | |||
@@ -0,0 +1,303 @@ | |||
1 | /* | ||
2 | * net/sched/cls_basic.c Basic Packet Classifier. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Thomas Graf <tgraf@suug.ch> | ||
10 | */ | ||
11 | |||
12 | #include <linux/config.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/sched.h> | ||
17 | #include <linux/string.h> | ||
18 | #include <linux/mm.h> | ||
19 | #include <linux/errno.h> | ||
20 | #include <linux/rtnetlink.h> | ||
21 | #include <linux/skbuff.h> | ||
22 | #include <net/act_api.h> | ||
23 | #include <net/pkt_cls.h> | ||
24 | |||
25 | struct basic_head | ||
26 | { | ||
27 | u32 hgenerator; | ||
28 | struct list_head flist; | ||
29 | }; | ||
30 | |||
31 | struct basic_filter | ||
32 | { | ||
33 | u32 handle; | ||
34 | struct tcf_exts exts; | ||
35 | struct tcf_ematch_tree ematches; | ||
36 | struct tcf_result res; | ||
37 | struct list_head link; | ||
38 | }; | ||
39 | |||
40 | static struct tcf_ext_map basic_ext_map = { | ||
41 | .action = TCA_BASIC_ACT, | ||
42 | .police = TCA_BASIC_POLICE | ||
43 | }; | ||
44 | |||
45 | static int basic_classify(struct sk_buff *skb, struct tcf_proto *tp, | ||
46 | struct tcf_result *res) | ||
47 | { | ||
48 | int r; | ||
49 | struct basic_head *head = (struct basic_head *) tp->root; | ||
50 | struct basic_filter *f; | ||
51 | |||
52 | list_for_each_entry(f, &head->flist, link) { | ||
53 | if (!tcf_em_tree_match(skb, &f->ematches, NULL)) | ||
54 | continue; | ||
55 | *res = f->res; | ||
56 | r = tcf_exts_exec(skb, &f->exts, res); | ||
57 | if (r < 0) | ||
58 | continue; | ||
59 | return r; | ||
60 | } | ||
61 | return -1; | ||
62 | } | ||
63 | |||
64 | static unsigned long basic_get(struct tcf_proto *tp, u32 handle) | ||
65 | { | ||
66 | unsigned long l = 0UL; | ||
67 | struct basic_head *head = (struct basic_head *) tp->root; | ||
68 | struct basic_filter *f; | ||
69 | |||
70 | if (head == NULL) | ||
71 | return 0UL; | ||
72 | |||
73 | list_for_each_entry(f, &head->flist, link) | ||
74 | if (f->handle == handle) | ||
75 | l = (unsigned long) f; | ||
76 | |||
77 | return l; | ||
78 | } | ||
79 | |||
80 | static void basic_put(struct tcf_proto *tp, unsigned long f) | ||
81 | { | ||
82 | } | ||
83 | |||
84 | static int basic_init(struct tcf_proto *tp) | ||
85 | { | ||
86 | return 0; | ||
87 | } | ||
88 | |||
89 | static inline void basic_delete_filter(struct tcf_proto *tp, | ||
90 | struct basic_filter *f) | ||
91 | { | ||
92 | tcf_unbind_filter(tp, &f->res); | ||
93 | tcf_exts_destroy(tp, &f->exts); | ||
94 | tcf_em_tree_destroy(tp, &f->ematches); | ||
95 | kfree(f); | ||
96 | } | ||
97 | |||
98 | static void basic_destroy(struct tcf_proto *tp) | ||
99 | { | ||
100 | struct basic_head *head = (struct basic_head *) xchg(&tp->root, NULL); | ||
101 | struct basic_filter *f, *n; | ||
102 | |||
103 | list_for_each_entry_safe(f, n, &head->flist, link) { | ||
104 | list_del(&f->link); | ||
105 | basic_delete_filter(tp, f); | ||
106 | } | ||
107 | } | ||
108 | |||
109 | static int basic_delete(struct tcf_proto *tp, unsigned long arg) | ||
110 | { | ||
111 | struct basic_head *head = (struct basic_head *) tp->root; | ||
112 | struct basic_filter *t, *f = (struct basic_filter *) arg; | ||
113 | |||
114 | list_for_each_entry(t, &head->flist, link) | ||
115 | if (t == f) { | ||
116 | tcf_tree_lock(tp); | ||
117 | list_del(&t->link); | ||
118 | tcf_tree_unlock(tp); | ||
119 | basic_delete_filter(tp, t); | ||
120 | return 0; | ||
121 | } | ||
122 | |||
123 | return -ENOENT; | ||
124 | } | ||
125 | |||
126 | static inline int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f, | ||
127 | unsigned long base, struct rtattr **tb, | ||
128 | struct rtattr *est) | ||
129 | { | ||
130 | int err = -EINVAL; | ||
131 | struct tcf_exts e; | ||
132 | struct tcf_ematch_tree t; | ||
133 | |||
134 | if (tb[TCA_BASIC_CLASSID-1]) | ||
135 | if (RTA_PAYLOAD(tb[TCA_BASIC_CLASSID-1]) < sizeof(u32)) | ||
136 | return err; | ||
137 | |||
138 | err = tcf_exts_validate(tp, tb, est, &e, &basic_ext_map); | ||
139 | if (err < 0) | ||
140 | return err; | ||
141 | |||
142 | err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES-1], &t); | ||
143 | if (err < 0) | ||
144 | goto errout; | ||
145 | |||
146 | if (tb[TCA_BASIC_CLASSID-1]) { | ||
147 | f->res.classid = *(u32*)RTA_DATA(tb[TCA_BASIC_CLASSID-1]); | ||
148 | tcf_bind_filter(tp, &f->res, base); | ||
149 | } | ||
150 | |||
151 | tcf_exts_change(tp, &f->exts, &e); | ||
152 | tcf_em_tree_change(tp, &f->ematches, &t); | ||
153 | |||
154 | return 0; | ||
155 | errout: | ||
156 | tcf_exts_destroy(tp, &e); | ||
157 | return err; | ||
158 | } | ||
159 | |||
160 | static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle, | ||
161 | struct rtattr **tca, unsigned long *arg) | ||
162 | { | ||
163 | int err = -EINVAL; | ||
164 | struct basic_head *head = (struct basic_head *) tp->root; | ||
165 | struct rtattr *tb[TCA_BASIC_MAX]; | ||
166 | struct basic_filter *f = (struct basic_filter *) *arg; | ||
167 | |||
168 | if (tca[TCA_OPTIONS-1] == NULL) | ||
169 | return -EINVAL; | ||
170 | |||
171 | if (rtattr_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS-1]) < 0) | ||
172 | return -EINVAL; | ||
173 | |||
174 | if (f != NULL) { | ||
175 | if (handle && f->handle != handle) | ||
176 | return -EINVAL; | ||
177 | return basic_set_parms(tp, f, base, tb, tca[TCA_RATE-1]); | ||
178 | } | ||
179 | |||
180 | err = -ENOBUFS; | ||
181 | if (head == NULL) { | ||
182 | head = kmalloc(sizeof(*head), GFP_KERNEL); | ||
183 | if (head == NULL) | ||
184 | goto errout; | ||
185 | |||
186 | memset(head, 0, sizeof(*head)); | ||
187 | INIT_LIST_HEAD(&head->flist); | ||
188 | tp->root = head; | ||
189 | } | ||
190 | |||
191 | f = kmalloc(sizeof(*f), GFP_KERNEL); | ||
192 | if (f == NULL) | ||
193 | goto errout; | ||
194 | memset(f, 0, sizeof(*f)); | ||
195 | |||
196 | err = -EINVAL; | ||
197 | if (handle) | ||
198 | f->handle = handle; | ||
199 | else { | ||
200 | int i = 0x80000000; | ||
201 | do { | ||
202 | if (++head->hgenerator == 0x7FFFFFFF) | ||
203 | head->hgenerator = 1; | ||
204 | } while (--i > 0 && basic_get(tp, head->hgenerator)); | ||
205 | |||
206 | if (i <= 0) { | ||
207 | printk(KERN_ERR "Insufficient number of handles\n"); | ||
208 | goto errout; | ||
209 | } | ||
210 | |||
211 | f->handle = head->hgenerator; | ||
212 | } | ||
213 | |||
214 | err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE-1]); | ||
215 | if (err < 0) | ||
216 | goto errout; | ||
217 | |||
218 | tcf_tree_lock(tp); | ||
219 | list_add(&f->link, &head->flist); | ||
220 | tcf_tree_unlock(tp); | ||
221 | *arg = (unsigned long) f; | ||
222 | |||
223 | return 0; | ||
224 | errout: | ||
225 | if (*arg == 0UL && f) | ||
226 | kfree(f); | ||
227 | |||
228 | return err; | ||
229 | } | ||
230 | |||
231 | static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg) | ||
232 | { | ||
233 | struct basic_head *head = (struct basic_head *) tp->root; | ||
234 | struct basic_filter *f; | ||
235 | |||
236 | list_for_each_entry(f, &head->flist, link) { | ||
237 | if (arg->count < arg->skip) | ||
238 | goto skip; | ||
239 | |||
240 | if (arg->fn(tp, (unsigned long) f, arg) < 0) { | ||
241 | arg->stop = 1; | ||
242 | break; | ||
243 | } | ||
244 | skip: | ||
245 | arg->count++; | ||
246 | } | ||
247 | } | ||
248 | |||
249 | static int basic_dump(struct tcf_proto *tp, unsigned long fh, | ||
250 | struct sk_buff *skb, struct tcmsg *t) | ||
251 | { | ||
252 | struct basic_filter *f = (struct basic_filter *) fh; | ||
253 | unsigned char *b = skb->tail; | ||
254 | struct rtattr *rta; | ||
255 | |||
256 | if (f == NULL) | ||
257 | return skb->len; | ||
258 | |||
259 | t->tcm_handle = f->handle; | ||
260 | |||
261 | rta = (struct rtattr *) b; | ||
262 | RTA_PUT(skb, TCA_OPTIONS, 0, NULL); | ||
263 | |||
264 | if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 || | ||
265 | tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0) | ||
266 | goto rtattr_failure; | ||
267 | |||
268 | rta->rta_len = (skb->tail - b); | ||
269 | return skb->len; | ||
270 | |||
271 | rtattr_failure: | ||
272 | skb_trim(skb, b - skb->data); | ||
273 | return -1; | ||
274 | } | ||
275 | |||
276 | static struct tcf_proto_ops cls_basic_ops = { | ||
277 | .kind = "basic", | ||
278 | .classify = basic_classify, | ||
279 | .init = basic_init, | ||
280 | .destroy = basic_destroy, | ||
281 | .get = basic_get, | ||
282 | .put = basic_put, | ||
283 | .change = basic_change, | ||
284 | .delete = basic_delete, | ||
285 | .walk = basic_walk, | ||
286 | .dump = basic_dump, | ||
287 | .owner = THIS_MODULE, | ||
288 | }; | ||
289 | |||
290 | static int __init init_basic(void) | ||
291 | { | ||
292 | return register_tcf_proto_ops(&cls_basic_ops); | ||
293 | } | ||
294 | |||
295 | static void __exit exit_basic(void) | ||
296 | { | ||
297 | unregister_tcf_proto_ops(&cls_basic_ops); | ||
298 | } | ||
299 | |||
300 | module_init(init_basic) | ||
301 | module_exit(exit_basic) | ||
302 | MODULE_LICENSE("GPL"); | ||
303 | |||
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c new file mode 100644 index 000000000000..fdfc83af3d1f --- /dev/null +++ b/net/sched/cls_fw.c | |||
@@ -0,0 +1,378 @@ | |||
1 | /* | ||
2 | * net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
10 | * | ||
11 | * Changes: | ||
12 | * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one | ||
13 | * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filters (and the kernel). | ||
14 | * Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension | ||
15 | * | ||
16 | * JHS: We should remove the CONFIG_NET_CLS_IND from here | ||
17 | * eventually when the meta match extension is made available | ||
18 | * | ||
19 | */ | ||
20 | |||
21 | #include <linux/config.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <asm/uaccess.h> | ||
24 | #include <asm/system.h> | ||
25 | #include <linux/bitops.h> | ||
26 | #include <linux/types.h> | ||
27 | #include <linux/kernel.h> | ||
28 | #include <linux/sched.h> | ||
29 | #include <linux/string.h> | ||
30 | #include <linux/mm.h> | ||
31 | #include <linux/socket.h> | ||
32 | #include <linux/sockios.h> | ||
33 | #include <linux/in.h> | ||
34 | #include <linux/errno.h> | ||
35 | #include <linux/interrupt.h> | ||
36 | #include <linux/if_ether.h> | ||
37 | #include <linux/inet.h> | ||
38 | #include <linux/netdevice.h> | ||
39 | #include <linux/etherdevice.h> | ||
40 | #include <linux/notifier.h> | ||
41 | #include <linux/netfilter.h> | ||
42 | #include <net/ip.h> | ||
43 | #include <net/route.h> | ||
44 | #include <linux/skbuff.h> | ||
45 | #include <net/sock.h> | ||
46 | #include <net/act_api.h> | ||
47 | #include <net/pkt_cls.h> | ||
48 | |||
49 | struct fw_head | ||
50 | { | ||
51 | struct fw_filter *ht[256]; | ||
52 | }; | ||
53 | |||
54 | struct fw_filter | ||
55 | { | ||
56 | struct fw_filter *next; | ||
57 | u32 id; | ||
58 | struct tcf_result res; | ||
59 | #ifdef CONFIG_NET_CLS_IND | ||
60 | char indev[IFNAMSIZ]; | ||
61 | #endif /* CONFIG_NET_CLS_IND */ | ||
62 | struct tcf_exts exts; | ||
63 | }; | ||
64 | |||
65 | static struct tcf_ext_map fw_ext_map = { | ||
66 | .action = TCA_FW_ACT, | ||
67 | .police = TCA_FW_POLICE | ||
68 | }; | ||
69 | |||
70 | static __inline__ int fw_hash(u32 handle) | ||
71 | { | ||
72 | return handle&0xFF; | ||
73 | } | ||
74 | |||
75 | static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, | ||
76 | struct tcf_result *res) | ||
77 | { | ||
78 | struct fw_head *head = (struct fw_head*)tp->root; | ||
79 | struct fw_filter *f; | ||
80 | int r; | ||
81 | #ifdef CONFIG_NETFILTER | ||
82 | u32 id = skb->nfmark; | ||
83 | #else | ||
84 | u32 id = 0; | ||
85 | #endif | ||
86 | |||
87 | if (head != NULL) { | ||
88 | for (f=head->ht[fw_hash(id)]; f; f=f->next) { | ||
89 | if (f->id == id) { | ||
90 | *res = f->res; | ||
91 | #ifdef CONFIG_NET_CLS_IND | ||
92 | if (!tcf_match_indev(skb, f->indev)) | ||
93 | continue; | ||
94 | #endif /* CONFIG_NET_CLS_IND */ | ||
95 | r = tcf_exts_exec(skb, &f->exts, res); | ||
96 | if (r < 0) | ||
97 | continue; | ||
98 | |||
99 | return r; | ||
100 | } | ||
101 | } | ||
102 | } else { | ||
103 | /* old method */ | ||
104 | if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id^tp->q->handle)))) { | ||
105 | res->classid = id; | ||
106 | res->class = 0; | ||
107 | return 0; | ||
108 | } | ||
109 | } | ||
110 | |||
111 | return -1; | ||
112 | } | ||
113 | |||
114 | static unsigned long fw_get(struct tcf_proto *tp, u32 handle) | ||
115 | { | ||
116 | struct fw_head *head = (struct fw_head*)tp->root; | ||
117 | struct fw_filter *f; | ||
118 | |||
119 | if (head == NULL) | ||
120 | return 0; | ||
121 | |||
122 | for (f=head->ht[fw_hash(handle)]; f; f=f->next) { | ||
123 | if (f->id == handle) | ||
124 | return (unsigned long)f; | ||
125 | } | ||
126 | return 0; | ||
127 | } | ||
128 | |||
129 | static void fw_put(struct tcf_proto *tp, unsigned long f) | ||
130 | { | ||
131 | } | ||
132 | |||
133 | static int fw_init(struct tcf_proto *tp) | ||
134 | { | ||
135 | return 0; | ||
136 | } | ||
137 | |||
138 | static inline void | ||
139 | fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f) | ||
140 | { | ||
141 | tcf_unbind_filter(tp, &f->res); | ||
142 | tcf_exts_destroy(tp, &f->exts); | ||
143 | kfree(f); | ||
144 | } | ||
145 | |||
146 | static void fw_destroy(struct tcf_proto *tp) | ||
147 | { | ||
148 | struct fw_head *head = (struct fw_head*)xchg(&tp->root, NULL); | ||
149 | struct fw_filter *f; | ||
150 | int h; | ||
151 | |||
152 | if (head == NULL) | ||
153 | return; | ||
154 | |||
155 | for (h=0; h<256; h++) { | ||
156 | while ((f=head->ht[h]) != NULL) { | ||
157 | head->ht[h] = f->next; | ||
158 | fw_delete_filter(tp, f); | ||
159 | } | ||
160 | } | ||
161 | kfree(head); | ||
162 | } | ||
163 | |||
164 | static int fw_delete(struct tcf_proto *tp, unsigned long arg) | ||
165 | { | ||
166 | struct fw_head *head = (struct fw_head*)tp->root; | ||
167 | struct fw_filter *f = (struct fw_filter*)arg; | ||
168 | struct fw_filter **fp; | ||
169 | |||
170 | if (head == NULL || f == NULL) | ||
171 | goto out; | ||
172 | |||
173 | for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) { | ||
174 | if (*fp == f) { | ||
175 | tcf_tree_lock(tp); | ||
176 | *fp = f->next; | ||
177 | tcf_tree_unlock(tp); | ||
178 | fw_delete_filter(tp, f); | ||
179 | return 0; | ||
180 | } | ||
181 | } | ||
182 | out: | ||
183 | return -EINVAL; | ||
184 | } | ||
185 | |||
186 | static int | ||
187 | fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f, | ||
188 | struct rtattr **tb, struct rtattr **tca, unsigned long base) | ||
189 | { | ||
190 | struct tcf_exts e; | ||
191 | int err; | ||
192 | |||
193 | err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &fw_ext_map); | ||
194 | if (err < 0) | ||
195 | return err; | ||
196 | |||
197 | err = -EINVAL; | ||
198 | if (tb[TCA_FW_CLASSID-1]) { | ||
199 | if (RTA_PAYLOAD(tb[TCA_FW_CLASSID-1]) != sizeof(u32)) | ||
200 | goto errout; | ||
201 | f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]); | ||
202 | tcf_bind_filter(tp, &f->res, base); | ||
203 | } | ||
204 | |||
205 | #ifdef CONFIG_NET_CLS_IND | ||
206 | if (tb[TCA_FW_INDEV-1]) { | ||
207 | err = tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV-1]); | ||
208 | if (err < 0) | ||
209 | goto errout; | ||
210 | } | ||
211 | #endif /* CONFIG_NET_CLS_IND */ | ||
212 | |||
213 | tcf_exts_change(tp, &f->exts, &e); | ||
214 | |||
215 | return 0; | ||
216 | errout: | ||
217 | tcf_exts_destroy(tp, &e); | ||
218 | return err; | ||
219 | } | ||
220 | |||
221 | static int fw_change(struct tcf_proto *tp, unsigned long base, | ||
222 | u32 handle, | ||
223 | struct rtattr **tca, | ||
224 | unsigned long *arg) | ||
225 | { | ||
226 | struct fw_head *head = (struct fw_head*)tp->root; | ||
227 | struct fw_filter *f = (struct fw_filter *) *arg; | ||
228 | struct rtattr *opt = tca[TCA_OPTIONS-1]; | ||
229 | struct rtattr *tb[TCA_FW_MAX]; | ||
230 | int err; | ||
231 | |||
232 | if (!opt) | ||
233 | return handle ? -EINVAL : 0; | ||
234 | |||
235 | if (rtattr_parse_nested(tb, TCA_FW_MAX, opt) < 0) | ||
236 | return -EINVAL; | ||
237 | |||
238 | if (f != NULL) { | ||
239 | if (f->id != handle && handle) | ||
240 | return -EINVAL; | ||
241 | return fw_change_attrs(tp, f, tb, tca, base); | ||
242 | } | ||
243 | |||
244 | if (!handle) | ||
245 | return -EINVAL; | ||
246 | |||
247 | if (head == NULL) { | ||
248 | head = kmalloc(sizeof(struct fw_head), GFP_KERNEL); | ||
249 | if (head == NULL) | ||
250 | return -ENOBUFS; | ||
251 | memset(head, 0, sizeof(*head)); | ||
252 | |||
253 | tcf_tree_lock(tp); | ||
254 | tp->root = head; | ||
255 | tcf_tree_unlock(tp); | ||
256 | } | ||
257 | |||
258 | f = kmalloc(sizeof(struct fw_filter), GFP_KERNEL); | ||
259 | if (f == NULL) | ||
260 | return -ENOBUFS; | ||
261 | memset(f, 0, sizeof(*f)); | ||
262 | |||
263 | f->id = handle; | ||
264 | |||
265 | err = fw_change_attrs(tp, f, tb, tca, base); | ||
266 | if (err < 0) | ||
267 | goto errout; | ||
268 | |||
269 | f->next = head->ht[fw_hash(handle)]; | ||
270 | tcf_tree_lock(tp); | ||
271 | head->ht[fw_hash(handle)] = f; | ||
272 | tcf_tree_unlock(tp); | ||
273 | |||
274 | *arg = (unsigned long)f; | ||
275 | return 0; | ||
276 | |||
277 | errout: | ||
278 | if (f) | ||
279 | kfree(f); | ||
280 | return err; | ||
281 | } | ||
282 | |||
283 | static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) | ||
284 | { | ||
285 | struct fw_head *head = (struct fw_head*)tp->root; | ||
286 | int h; | ||
287 | |||
288 | if (head == NULL) | ||
289 | arg->stop = 1; | ||
290 | |||
291 | if (arg->stop) | ||
292 | return; | ||
293 | |||
294 | for (h = 0; h < 256; h++) { | ||
295 | struct fw_filter *f; | ||
296 | |||
297 | for (f = head->ht[h]; f; f = f->next) { | ||
298 | if (arg->count < arg->skip) { | ||
299 | arg->count++; | ||
300 | continue; | ||
301 | } | ||
302 | if (arg->fn(tp, (unsigned long)f, arg) < 0) { | ||
303 | arg->stop = 1; | ||
304 | return; | ||
305 | } | ||
306 | arg->count++; | ||
307 | } | ||
308 | } | ||
309 | } | ||
310 | |||
311 | static int fw_dump(struct tcf_proto *tp, unsigned long fh, | ||
312 | struct sk_buff *skb, struct tcmsg *t) | ||
313 | { | ||
314 | struct fw_filter *f = (struct fw_filter*)fh; | ||
315 | unsigned char *b = skb->tail; | ||
316 | struct rtattr *rta; | ||
317 | |||
318 | if (f == NULL) | ||
319 | return skb->len; | ||
320 | |||
321 | t->tcm_handle = f->id; | ||
322 | |||
323 | if (!f->res.classid && !tcf_exts_is_available(&f->exts)) | ||
324 | return skb->len; | ||
325 | |||
326 | rta = (struct rtattr*)b; | ||
327 | RTA_PUT(skb, TCA_OPTIONS, 0, NULL); | ||
328 | |||
329 | if (f->res.classid) | ||
330 | RTA_PUT(skb, TCA_FW_CLASSID, 4, &f->res.classid); | ||
331 | #ifdef CONFIG_NET_CLS_IND | ||
332 | if (strlen(f->indev)) | ||
333 | RTA_PUT(skb, TCA_FW_INDEV, IFNAMSIZ, f->indev); | ||
334 | #endif /* CONFIG_NET_CLS_IND */ | ||
335 | |||
336 | if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0) | ||
337 | goto rtattr_failure; | ||
338 | |||
339 | rta->rta_len = skb->tail - b; | ||
340 | |||
341 | if (tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0) | ||
342 | goto rtattr_failure; | ||
343 | |||
344 | return skb->len; | ||
345 | |||
346 | rtattr_failure: | ||
347 | skb_trim(skb, b - skb->data); | ||
348 | return -1; | ||
349 | } | ||
350 | |||
351 | static struct tcf_proto_ops cls_fw_ops = { | ||
352 | .next = NULL, | ||
353 | .kind = "fw", | ||
354 | .classify = fw_classify, | ||
355 | .init = fw_init, | ||
356 | .destroy = fw_destroy, | ||
357 | .get = fw_get, | ||
358 | .put = fw_put, | ||
359 | .change = fw_change, | ||
360 | .delete = fw_delete, | ||
361 | .walk = fw_walk, | ||
362 | .dump = fw_dump, | ||
363 | .owner = THIS_MODULE, | ||
364 | }; | ||
365 | |||
366 | static int __init init_fw(void) | ||
367 | { | ||
368 | return register_tcf_proto_ops(&cls_fw_ops); | ||
369 | } | ||
370 | |||
371 | static void __exit exit_fw(void) | ||
372 | { | ||
373 | unregister_tcf_proto_ops(&cls_fw_ops); | ||
374 | } | ||
375 | |||
376 | module_init(init_fw) | ||
377 | module_exit(exit_fw) | ||
378 | MODULE_LICENSE("GPL"); | ||
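A small stand-alone illustration of the hashing scheme used by cls_fw.c above (an editorial sketch, not part of the module): fw_hash() only picks one of the 256 buckets from the low byte of the firewall mark, while fw_classify() still compares the full 32-bit mark against f->id, so marks that share a low byte simply end up on the same chain.

/* Illustrative userspace sketch mirroring fw_hash() from cls_fw.c. */
#include <stdio.h>

static unsigned int fw_hash(unsigned int handle)
{
	return handle & 0xFF;	/* 256 buckets, keyed by the low byte of the mark */
}

int main(void)
{
	unsigned int marks[] = { 0x0012, 0x1012, 0x00ff };
	unsigned int i;

	for (i = 0; i < 3; i++)
		printf("mark 0x%04x -> bucket %u\n", marks[i], fw_hash(marks[i]));

	/* 0x0012 and 0x1012 land in the same bucket (18) but remain distinct
	 * filters, because the lookup matches the full mark against f->id. */
	return 0;
}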
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c new file mode 100644 index 000000000000..02996ac05c75 --- /dev/null +++ b/net/sched/cls_route.c | |||
@@ -0,0 +1,639 @@ | |||
1 | /* | ||
2 | * net/sched/cls_route.c ROUTE4 classifier. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
10 | */ | ||
11 | |||
12 | #include <linux/module.h> | ||
13 | #include <linux/config.h> | ||
14 | #include <asm/uaccess.h> | ||
15 | #include <asm/system.h> | ||
16 | #include <linux/bitops.h> | ||
17 | #include <linux/types.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/sched.h> | ||
20 | #include <linux/string.h> | ||
21 | #include <linux/mm.h> | ||
22 | #include <linux/socket.h> | ||
23 | #include <linux/sockios.h> | ||
24 | #include <linux/in.h> | ||
25 | #include <linux/errno.h> | ||
26 | #include <linux/interrupt.h> | ||
27 | #include <linux/if_ether.h> | ||
28 | #include <linux/inet.h> | ||
29 | #include <linux/netdevice.h> | ||
30 | #include <linux/etherdevice.h> | ||
31 | #include <linux/notifier.h> | ||
32 | #include <net/ip.h> | ||
33 | #include <net/route.h> | ||
34 | #include <linux/skbuff.h> | ||
35 | #include <net/sock.h> | ||
36 | #include <net/act_api.h> | ||
37 | #include <net/pkt_cls.h> | ||
38 | |||
39 | /* | ||
40 | 1. For now we assume that route tags < 256. | ||
41 | This allows us to use direct table lookups instead of hash tables. | ||
42 | 2. For now we assume that "from TAG" and "fromdev DEV" statements | ||
43 | are mutually exclusive. | ||
44 | 3. "to TAG from ANY" has higher priority than "to ANY from XXX" | ||
45 | */ | ||
46 | |||
47 | struct route4_fastmap | ||
48 | { | ||
49 | struct route4_filter *filter; | ||
50 | u32 id; | ||
51 | int iif; | ||
52 | }; | ||
53 | |||
54 | struct route4_head | ||
55 | { | ||
56 | struct route4_fastmap fastmap[16]; | ||
57 | struct route4_bucket *table[256+1]; | ||
58 | }; | ||
59 | |||
60 | struct route4_bucket | ||
61 | { | ||
62 | /* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */ | ||
63 | struct route4_filter *ht[16+16+1]; | ||
64 | }; | ||
65 | |||
66 | struct route4_filter | ||
67 | { | ||
68 | struct route4_filter *next; | ||
69 | u32 id; | ||
70 | int iif; | ||
71 | |||
72 | struct tcf_result res; | ||
73 | struct tcf_exts exts; | ||
74 | u32 handle; | ||
75 | struct route4_bucket *bkt; | ||
76 | }; | ||
77 | |||
78 | #define ROUTE4_FAILURE ((struct route4_filter*)(-1L)) | ||
79 | |||
80 | static struct tcf_ext_map route_ext_map = { | ||
81 | .police = TCA_ROUTE4_POLICE, | ||
82 | .action = TCA_ROUTE4_ACT | ||
83 | }; | ||
84 | |||
85 | static __inline__ int route4_fastmap_hash(u32 id, int iif) | ||
86 | { | ||
87 | return id&0xF; | ||
88 | } | ||
89 | |||
90 | static inline | ||
91 | void route4_reset_fastmap(struct net_device *dev, struct route4_head *head, u32 id) | ||
92 | { | ||
93 | spin_lock_bh(&dev->queue_lock); | ||
94 | memset(head->fastmap, 0, sizeof(head->fastmap)); | ||
95 | spin_unlock_bh(&dev->queue_lock); | ||
96 | } | ||
97 | |||
98 | static void __inline__ | ||
99 | route4_set_fastmap(struct route4_head *head, u32 id, int iif, | ||
100 | struct route4_filter *f) | ||
101 | { | ||
102 | int h = route4_fastmap_hash(id, iif); | ||
103 | head->fastmap[h].id = id; | ||
104 | head->fastmap[h].iif = iif; | ||
105 | head->fastmap[h].filter = f; | ||
106 | } | ||
107 | |||
108 | static __inline__ int route4_hash_to(u32 id) | ||
109 | { | ||
110 | return id&0xFF; | ||
111 | } | ||
112 | |||
113 | static __inline__ int route4_hash_from(u32 id) | ||
114 | { | ||
115 | return (id>>16)&0xF; | ||
116 | } | ||
117 | |||
118 | static __inline__ int route4_hash_iif(int iif) | ||
119 | { | ||
120 | return 16 + ((iif>>16)&0xF); | ||
121 | } | ||
122 | |||
123 | static __inline__ int route4_hash_wild(void) | ||
124 | { | ||
125 | return 32; | ||
126 | } | ||
127 | |||
128 | #define ROUTE4_APPLY_RESULT() \ | ||
129 | { \ | ||
130 | *res = f->res; \ | ||
131 | if (tcf_exts_is_available(&f->exts)) { \ | ||
132 | int r = tcf_exts_exec(skb, &f->exts, res); \ | ||
133 | if (r < 0) { \ | ||
134 | dont_cache = 1; \ | ||
135 | continue; \ | ||
136 | } \ | ||
137 | return r; \ | ||
138 | } else if (!dont_cache) \ | ||
139 | route4_set_fastmap(head, id, iif, f); \ | ||
140 | return 0; \ | ||
141 | } | ||
142 | |||
143 | static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp, | ||
144 | struct tcf_result *res) | ||
145 | { | ||
146 | struct route4_head *head = (struct route4_head*)tp->root; | ||
147 | struct dst_entry *dst; | ||
148 | struct route4_bucket *b; | ||
149 | struct route4_filter *f; | ||
150 | u32 id, h; | ||
151 | int iif, dont_cache = 0; | ||
152 | |||
153 | if ((dst = skb->dst) == NULL) | ||
154 | goto failure; | ||
155 | |||
156 | id = dst->tclassid; | ||
157 | if (head == NULL) | ||
158 | goto old_method; | ||
159 | |||
160 | iif = ((struct rtable*)dst)->fl.iif; | ||
161 | |||
162 | h = route4_fastmap_hash(id, iif); | ||
163 | if (id == head->fastmap[h].id && | ||
164 | iif == head->fastmap[h].iif && | ||
165 | (f = head->fastmap[h].filter) != NULL) { | ||
166 | if (f == ROUTE4_FAILURE) | ||
167 | goto failure; | ||
168 | |||
169 | *res = f->res; | ||
170 | return 0; | ||
171 | } | ||
172 | |||
173 | h = route4_hash_to(id); | ||
174 | |||
175 | restart: | ||
176 | if ((b = head->table[h]) != NULL) { | ||
177 | for (f = b->ht[route4_hash_from(id)]; f; f = f->next) | ||
178 | if (f->id == id) | ||
179 | ROUTE4_APPLY_RESULT(); | ||
180 | |||
181 | for (f = b->ht[route4_hash_iif(iif)]; f; f = f->next) | ||
182 | if (f->iif == iif) | ||
183 | ROUTE4_APPLY_RESULT(); | ||
184 | |||
185 | for (f = b->ht[route4_hash_wild()]; f; f = f->next) | ||
186 | ROUTE4_APPLY_RESULT(); | ||
187 | |||
188 | } | ||
189 | if (h < 256) { | ||
190 | h = 256; | ||
191 | id &= ~0xFFFF; | ||
192 | goto restart; | ||
193 | } | ||
194 | |||
195 | if (!dont_cache) | ||
196 | route4_set_fastmap(head, id, iif, ROUTE4_FAILURE); | ||
197 | failure: | ||
198 | return -1; | ||
199 | |||
200 | old_method: | ||
201 | if (id && (TC_H_MAJ(id) == 0 || | ||
202 | !(TC_H_MAJ(id^tp->q->handle)))) { | ||
203 | res->classid = id; | ||
204 | res->class = 0; | ||
205 | return 0; | ||
206 | } | ||
207 | return -1; | ||
208 | } | ||
209 | |||
210 | static inline u32 to_hash(u32 id) | ||
211 | { | ||
212 | u32 h = id&0xFF; | ||
213 | if (id&0x8000) | ||
214 | h += 256; | ||
215 | return h; | ||
216 | } | ||
217 | |||
218 | static inline u32 from_hash(u32 id) | ||
219 | { | ||
220 | id &= 0xFFFF; | ||
221 | if (id == 0xFFFF) | ||
222 | return 32; | ||
223 | if (!(id & 0x8000)) { | ||
224 | if (id > 255) | ||
225 | return 256; | ||
226 | return id&0xF; | ||
227 | } | ||
228 | return 16 + (id&0xF); | ||
229 | } | ||
230 | |||
231 | static unsigned long route4_get(struct tcf_proto *tp, u32 handle) | ||
232 | { | ||
233 | struct route4_head *head = (struct route4_head*)tp->root; | ||
234 | struct route4_bucket *b; | ||
235 | struct route4_filter *f; | ||
236 | unsigned h1, h2; | ||
237 | |||
238 | if (!head) | ||
239 | return 0; | ||
240 | |||
241 | h1 = to_hash(handle); | ||
242 | if (h1 > 256) | ||
243 | return 0; | ||
244 | |||
245 | h2 = from_hash(handle>>16); | ||
246 | if (h2 > 32) | ||
247 | return 0; | ||
248 | |||
249 | if ((b = head->table[h1]) != NULL) { | ||
250 | for (f = b->ht[h2]; f; f = f->next) | ||
251 | if (f->handle == handle) | ||
252 | return (unsigned long)f; | ||
253 | } | ||
254 | return 0; | ||
255 | } | ||
256 | |||
257 | static void route4_put(struct tcf_proto *tp, unsigned long f) | ||
258 | { | ||
259 | } | ||
260 | |||
261 | static int route4_init(struct tcf_proto *tp) | ||
262 | { | ||
263 | return 0; | ||
264 | } | ||
265 | |||
266 | static inline void | ||
267 | route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f) | ||
268 | { | ||
269 | tcf_unbind_filter(tp, &f->res); | ||
270 | tcf_exts_destroy(tp, &f->exts); | ||
271 | kfree(f); | ||
272 | } | ||
273 | |||
274 | static void route4_destroy(struct tcf_proto *tp) | ||
275 | { | ||
276 | struct route4_head *head = xchg(&tp->root, NULL); | ||
277 | int h1, h2; | ||
278 | |||
279 | if (head == NULL) | ||
280 | return; | ||
281 | |||
282 | for (h1=0; h1<=256; h1++) { | ||
283 | struct route4_bucket *b; | ||
284 | |||
285 | if ((b = head->table[h1]) != NULL) { | ||
286 | for (h2=0; h2<=32; h2++) { | ||
287 | struct route4_filter *f; | ||
288 | |||
289 | while ((f = b->ht[h2]) != NULL) { | ||
290 | b->ht[h2] = f->next; | ||
291 | route4_delete_filter(tp, f); | ||
292 | } | ||
293 | } | ||
294 | kfree(b); | ||
295 | } | ||
296 | } | ||
297 | kfree(head); | ||
298 | } | ||
299 | |||
300 | static int route4_delete(struct tcf_proto *tp, unsigned long arg) | ||
301 | { | ||
302 | struct route4_head *head = (struct route4_head*)tp->root; | ||
303 | struct route4_filter **fp, *f = (struct route4_filter*)arg; | ||
304 | unsigned h = 0; | ||
305 | struct route4_bucket *b; | ||
306 | int i; | ||
307 | |||
308 | if (!head || !f) | ||
309 | return -EINVAL; | ||
310 | |||
311 | h = f->handle; | ||
312 | b = f->bkt; | ||
313 | |||
314 | for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) { | ||
315 | if (*fp == f) { | ||
316 | tcf_tree_lock(tp); | ||
317 | *fp = f->next; | ||
318 | tcf_tree_unlock(tp); | ||
319 | |||
320 | route4_reset_fastmap(tp->q->dev, head, f->id); | ||
321 | route4_delete_filter(tp, f); | ||
322 | |||
323 | /* Strip tree */ | ||
324 | |||
325 | for (i=0; i<=32; i++) | ||
326 | if (b->ht[i]) | ||
327 | return 0; | ||
328 | |||
329 | /* OK, session has no flows */ | ||
330 | tcf_tree_lock(tp); | ||
331 | head->table[to_hash(h)] = NULL; | ||
332 | tcf_tree_unlock(tp); | ||
333 | |||
334 | kfree(b); | ||
335 | return 0; | ||
336 | } | ||
337 | } | ||
338 | return 0; | ||
339 | } | ||
340 | |||
341 | static int route4_set_parms(struct tcf_proto *tp, unsigned long base, | ||
342 | struct route4_filter *f, u32 handle, struct route4_head *head, | ||
343 | struct rtattr **tb, struct rtattr *est, int new) | ||
344 | { | ||
345 | int err; | ||
346 | u32 id = 0, to = 0, nhandle = 0x8000; | ||
347 | struct route4_filter *fp; | ||
348 | unsigned int h1; | ||
349 | struct route4_bucket *b; | ||
350 | struct tcf_exts e; | ||
351 | |||
352 | err = tcf_exts_validate(tp, tb, est, &e, &route_ext_map); | ||
353 | if (err < 0) | ||
354 | return err; | ||
355 | |||
356 | err = -EINVAL; | ||
357 | if (tb[TCA_ROUTE4_CLASSID-1]) | ||
358 | if (RTA_PAYLOAD(tb[TCA_ROUTE4_CLASSID-1]) < sizeof(u32)) | ||
359 | goto errout; | ||
360 | |||
361 | if (tb[TCA_ROUTE4_TO-1]) { | ||
362 | if (new && handle & 0x8000) | ||
363 | goto errout; | ||
364 | if (RTA_PAYLOAD(tb[TCA_ROUTE4_TO-1]) < sizeof(u32)) | ||
365 | goto errout; | ||
366 | to = *(u32*)RTA_DATA(tb[TCA_ROUTE4_TO-1]); | ||
367 | if (to > 0xFF) | ||
368 | goto errout; | ||
369 | nhandle = to; | ||
370 | } | ||
371 | |||
372 | if (tb[TCA_ROUTE4_FROM-1]) { | ||
373 | if (tb[TCA_ROUTE4_IIF-1]) | ||
374 | goto errout; | ||
375 | if (RTA_PAYLOAD(tb[TCA_ROUTE4_FROM-1]) < sizeof(u32)) | ||
376 | goto errout; | ||
377 | id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_FROM-1]); | ||
378 | if (id > 0xFF) | ||
379 | goto errout; | ||
380 | nhandle |= id << 16; | ||
381 | } else if (tb[TCA_ROUTE4_IIF-1]) { | ||
382 | if (RTA_PAYLOAD(tb[TCA_ROUTE4_IIF-1]) < sizeof(u32)) | ||
383 | goto errout; | ||
384 | id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_IIF-1]); | ||
385 | if (id > 0x7FFF) | ||
386 | goto errout; | ||
387 | nhandle |= (id | 0x8000) << 16; | ||
388 | } else | ||
389 | nhandle |= 0xFFFF << 16; | ||
390 | |||
391 | if (handle && new) { | ||
392 | nhandle |= handle & 0x7F00; | ||
393 | if (nhandle != handle) | ||
394 | goto errout; | ||
395 | } | ||
396 | |||
397 | h1 = to_hash(nhandle); | ||
398 | if ((b = head->table[h1]) == NULL) { | ||
399 | err = -ENOBUFS; | ||
400 | b = kmalloc(sizeof(struct route4_bucket), GFP_KERNEL); | ||
401 | if (b == NULL) | ||
402 | goto errout; | ||
403 | memset(b, 0, sizeof(*b)); | ||
404 | |||
405 | tcf_tree_lock(tp); | ||
406 | head->table[h1] = b; | ||
407 | tcf_tree_unlock(tp); | ||
408 | } else { | ||
409 | unsigned int h2 = from_hash(nhandle >> 16); | ||
410 | err = -EEXIST; | ||
411 | for (fp = b->ht[h2]; fp; fp = fp->next) | ||
412 | if (fp->handle == f->handle) | ||
413 | goto errout; | ||
414 | } | ||
415 | |||
416 | tcf_tree_lock(tp); | ||
417 | if (tb[TCA_ROUTE4_TO-1]) | ||
418 | f->id = to; | ||
419 | |||
420 | if (tb[TCA_ROUTE4_FROM-1]) | ||
421 | f->id = to | id<<16; | ||
422 | else if (tb[TCA_ROUTE4_IIF-1]) | ||
423 | f->iif = id; | ||
424 | |||
425 | f->handle = nhandle; | ||
426 | f->bkt = b; | ||
427 | tcf_tree_unlock(tp); | ||
428 | |||
429 | if (tb[TCA_ROUTE4_CLASSID-1]) { | ||
430 | f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]); | ||
431 | tcf_bind_filter(tp, &f->res, base); | ||
432 | } | ||
433 | |||
434 | tcf_exts_change(tp, &f->exts, &e); | ||
435 | |||
436 | return 0; | ||
437 | errout: | ||
438 | tcf_exts_destroy(tp, &e); | ||
439 | return err; | ||
440 | } | ||
441 | |||
442 | static int route4_change(struct tcf_proto *tp, unsigned long base, | ||
443 | u32 handle, | ||
444 | struct rtattr **tca, | ||
445 | unsigned long *arg) | ||
446 | { | ||
447 | struct route4_head *head = tp->root; | ||
448 | struct route4_filter *f, *f1, **fp; | ||
449 | struct route4_bucket *b; | ||
450 | struct rtattr *opt = tca[TCA_OPTIONS-1]; | ||
451 | struct rtattr *tb[TCA_ROUTE4_MAX]; | ||
452 | unsigned int h, th; | ||
453 | u32 old_handle = 0; | ||
454 | int err; | ||
455 | |||
456 | if (opt == NULL) | ||
457 | return handle ? -EINVAL : 0; | ||
458 | |||
459 | if (rtattr_parse_nested(tb, TCA_ROUTE4_MAX, opt) < 0) | ||
460 | return -EINVAL; | ||
461 | |||
462 | if ((f = (struct route4_filter*)*arg) != NULL) { | ||
463 | if (f->handle != handle && handle) | ||
464 | return -EINVAL; | ||
465 | |||
466 | if (f->bkt) | ||
467 | old_handle = f->handle; | ||
468 | |||
469 | err = route4_set_parms(tp, base, f, handle, head, tb, | ||
470 | tca[TCA_RATE-1], 0); | ||
471 | if (err < 0) | ||
472 | return err; | ||
473 | |||
474 | goto reinsert; | ||
475 | } | ||
476 | |||
477 | err = -ENOBUFS; | ||
478 | if (head == NULL) { | ||
479 | head = kmalloc(sizeof(struct route4_head), GFP_KERNEL); | ||
480 | if (head == NULL) | ||
481 | goto errout; | ||
482 | memset(head, 0, sizeof(struct route4_head)); | ||
483 | |||
484 | tcf_tree_lock(tp); | ||
485 | tp->root = head; | ||
486 | tcf_tree_unlock(tp); | ||
487 | } | ||
488 | |||
489 | f = kmalloc(sizeof(struct route4_filter), GFP_KERNEL); | ||
490 | if (f == NULL) | ||
491 | goto errout; | ||
492 | memset(f, 0, sizeof(*f)); | ||
493 | |||
494 | err = route4_set_parms(tp, base, f, handle, head, tb, | ||
495 | tca[TCA_RATE-1], 1); | ||
496 | if (err < 0) | ||
497 | goto errout; | ||
498 | |||
499 | reinsert: | ||
500 | h = from_hash(f->handle >> 16); | ||
501 | for (fp = &f->bkt->ht[h]; (f1=*fp) != NULL; fp = &f1->next) | ||
502 | if (f->handle < f1->handle) | ||
503 | break; | ||
504 | |||
505 | f->next = f1; | ||
506 | tcf_tree_lock(tp); | ||
507 | *fp = f; | ||
508 | |||
509 | if (old_handle && f->handle != old_handle) { | ||
510 | th = to_hash(old_handle); | ||
511 | h = from_hash(old_handle >> 16); | ||
512 | if ((b = head->table[th]) != NULL) { | ||
513 | for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) { | ||
514 | if (*fp == f) { | ||
515 | *fp = f->next; | ||
516 | break; | ||
517 | } | ||
518 | } | ||
519 | } | ||
520 | } | ||
521 | tcf_tree_unlock(tp); | ||
522 | |||
523 | route4_reset_fastmap(tp->q->dev, head, f->id); | ||
524 | *arg = (unsigned long)f; | ||
525 | return 0; | ||
526 | |||
527 | errout: | ||
528 | if (f) | ||
529 | kfree(f); | ||
530 | return err; | ||
531 | } | ||
532 | |||
533 | static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) | ||
534 | { | ||
535 | struct route4_head *head = tp->root; | ||
536 | unsigned h, h1; | ||
537 | |||
538 | if (head == NULL) | ||
539 | arg->stop = 1; | ||
540 | |||
541 | if (arg->stop) | ||
542 | return; | ||
543 | |||
544 | for (h = 0; h <= 256; h++) { | ||
545 | struct route4_bucket *b = head->table[h]; | ||
546 | |||
547 | if (b) { | ||
548 | for (h1 = 0; h1 <= 32; h1++) { | ||
549 | struct route4_filter *f; | ||
550 | |||
551 | for (f = b->ht[h1]; f; f = f->next) { | ||
552 | if (arg->count < arg->skip) { | ||
553 | arg->count++; | ||
554 | continue; | ||
555 | } | ||
556 | if (arg->fn(tp, (unsigned long)f, arg) < 0) { | ||
557 | arg->stop = 1; | ||
558 | return; | ||
559 | } | ||
560 | arg->count++; | ||
561 | } | ||
562 | } | ||
563 | } | ||
564 | } | ||
565 | } | ||
566 | |||
567 | static int route4_dump(struct tcf_proto *tp, unsigned long fh, | ||
568 | struct sk_buff *skb, struct tcmsg *t) | ||
569 | { | ||
570 | struct route4_filter *f = (struct route4_filter*)fh; | ||
571 | unsigned char *b = skb->tail; | ||
572 | struct rtattr *rta; | ||
573 | u32 id; | ||
574 | |||
575 | if (f == NULL) | ||
576 | return skb->len; | ||
577 | |||
578 | t->tcm_handle = f->handle; | ||
579 | |||
580 | rta = (struct rtattr*)b; | ||
581 | RTA_PUT(skb, TCA_OPTIONS, 0, NULL); | ||
582 | |||
583 | if (!(f->handle&0x8000)) { | ||
584 | id = f->id&0xFF; | ||
585 | RTA_PUT(skb, TCA_ROUTE4_TO, sizeof(id), &id); | ||
586 | } | ||
587 | if (f->handle&0x80000000) { | ||
588 | if ((f->handle>>16) != 0xFFFF) | ||
589 | RTA_PUT(skb, TCA_ROUTE4_IIF, sizeof(f->iif), &f->iif); | ||
590 | } else { | ||
591 | id = f->id>>16; | ||
592 | RTA_PUT(skb, TCA_ROUTE4_FROM, sizeof(id), &id); | ||
593 | } | ||
594 | if (f->res.classid) | ||
595 | RTA_PUT(skb, TCA_ROUTE4_CLASSID, 4, &f->res.classid); | ||
596 | |||
597 | if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0) | ||
598 | goto rtattr_failure; | ||
599 | |||
600 | rta->rta_len = skb->tail - b; | ||
601 | |||
602 | if (tcf_exts_dump_stats(skb, &f->exts, &route_ext_map) < 0) | ||
603 | goto rtattr_failure; | ||
604 | |||
605 | return skb->len; | ||
606 | |||
607 | rtattr_failure: | ||
608 | skb_trim(skb, b - skb->data); | ||
609 | return -1; | ||
610 | } | ||
611 | |||
612 | static struct tcf_proto_ops cls_route4_ops = { | ||
613 | .next = NULL, | ||
614 | .kind = "route", | ||
615 | .classify = route4_classify, | ||
616 | .init = route4_init, | ||
617 | .destroy = route4_destroy, | ||
618 | .get = route4_get, | ||
619 | .put = route4_put, | ||
620 | .change = route4_change, | ||
621 | .delete = route4_delete, | ||
622 | .walk = route4_walk, | ||
623 | .dump = route4_dump, | ||
624 | .owner = THIS_MODULE, | ||
625 | }; | ||
626 | |||
627 | static int __init init_route4(void) | ||
628 | { | ||
629 | return register_tcf_proto_ops(&cls_route4_ops); | ||
630 | } | ||
631 | |||
632 | static void __exit exit_route4(void) | ||
633 | { | ||
634 | unregister_tcf_proto_ops(&cls_route4_ops); | ||
635 | } | ||
636 | |||
637 | module_init(init_route4) | ||
638 | module_exit(exit_route4) | ||
639 | MODULE_LICENSE("GPL"); | ||
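To make the route4 handle layout above concrete, here is a stand-alone sketch (editorial, not part of the module) that applies the same to_hash()/from_hash() logic to a few sample handles. The low 16 bits carry the "to" realm (bit 15 selects the wildcard bucket 256), and the upper 16 bits carry either a "from" realm, an interface id with bit 15 of that half set, or 0xFFFF for the wildcard source slot 32.

/* Illustrative userspace sketch mirroring to_hash()/from_hash() from cls_route.c. */
#include <stdio.h>

static unsigned int to_hash(unsigned int id)
{
	unsigned int h = id & 0xFF;

	if (id & 0x8000)
		h += 256;		/* wildcard "to" bucket */
	return h;
}

static unsigned int from_hash(unsigned int id)
{
	id &= 0xFFFF;
	if (id == 0xFFFF)
		return 32;		/* wildcard source slot */
	if (!(id & 0x8000)) {
		if (id > 255)
			return 256;	/* out of range (rejected by route4_get) */
		return id & 0xF;	/* "from" realm buckets 0..15 */
	}
	return 16 + (id & 0xF);		/* iif buckets 16..31 */
}

int main(void)
{
	/* 0x00030002: "to 2 from 3"; 0x80050002: "to 2" from iif 5;
	 * 0xFFFF8000: wildcard source, wildcard "to". */
	unsigned int handles[] = { 0x00030002, 0x80050002, 0xFFFF8000 };
	unsigned int i;

	for (i = 0; i < 3; i++)
		printf("handle 0x%08x -> table[%u], ht[%u]\n", handles[i],
		       to_hash(handles[i] & 0xFFFF), from_hash(handles[i] >> 16));
	return 0;
}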
diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c new file mode 100644 index 000000000000..ad2613790d85 --- /dev/null +++ b/net/sched/cls_rsvp.c | |||
@@ -0,0 +1,43 @@ | |||
1 | /* | ||
2 | * net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
10 | */ | ||
11 | |||
12 | #include <linux/module.h> | ||
13 | #include <asm/uaccess.h> | ||
14 | #include <asm/system.h> | ||
15 | #include <linux/bitops.h> | ||
16 | #include <linux/types.h> | ||
17 | #include <linux/kernel.h> | ||
18 | #include <linux/sched.h> | ||
19 | #include <linux/string.h> | ||
20 | #include <linux/mm.h> | ||
21 | #include <linux/socket.h> | ||
22 | #include <linux/sockios.h> | ||
23 | #include <linux/in.h> | ||
24 | #include <linux/errno.h> | ||
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/if_ether.h> | ||
27 | #include <linux/inet.h> | ||
28 | #include <linux/netdevice.h> | ||
29 | #include <linux/etherdevice.h> | ||
30 | #include <linux/notifier.h> | ||
31 | #include <net/ip.h> | ||
32 | #include <net/route.h> | ||
33 | #include <linux/skbuff.h> | ||
34 | #include <net/sock.h> | ||
35 | #include <net/act_api.h> | ||
36 | #include <net/pkt_cls.h> | ||
37 | |||
38 | #define RSVP_DST_LEN 1 | ||
39 | #define RSVP_ID "rsvp" | ||
40 | #define RSVP_OPS cls_rsvp_ops | ||
41 | |||
42 | #include "cls_rsvp.h" | ||
43 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h new file mode 100644 index 000000000000..232fb9196810 --- /dev/null +++ b/net/sched/cls_rsvp.h | |||
@@ -0,0 +1,667 @@ | |||
1 | /* | ||
2 | * net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
10 | */ | ||
11 | |||
12 | /* | ||
13 | Compared to the general packet classification problem, | ||
14 | RSVP needs only several relatively simple rules: | ||
15 | |||
16 | * (dst, protocol) are always specified, | ||
17 | so that we are able to hash them. | ||
18 | * src may be exact, or may be wildcard, so that | ||
19 | we can keep a hash table plus one wildcard entry. | ||
20 | * source port (or flow label) is important only if src is given. | ||
21 | |||
22 | IMPLEMENTATION. | ||
23 | |||
24 | We use a two-level hash table: the top level is keyed by | ||
25 | destination address and protocol ID; every bucket contains a list | ||
26 | of "rsvp sessions", identified by destination address, protocol and | ||
27 | DPI(="Destination Port ID"): triple (key, mask, offset). | ||
28 | |||
29 | Every session then has a smaller hash table keyed by source address | ||
30 | (cf. RSVP flowspec) and one wildcard entry for wildcard reservations. | ||
31 | Every bucket is again a list of "RSVP flows", selected by | ||
32 | source address and SPI(="Source Port ID" here rather than | ||
33 | "security parameter index"): triple (key, mask, offset). | ||
34 | |||
35 | |||
36 | NOTE 1. All the packets with IPv6 extension headers (but AH and ESP) | ||
37 | and all fragmented packets go to the best-effort traffic class. | ||
38 | |||
39 | |||
40 | NOTE 2. Two "port id"s seem to be redundant; RFC 2207 requires | ||
41 | only one "Generalized Port Identifier". So for classic | ||
42 | AH, ESP (and UDP, TCP) both *pi should coincide, or one of them | ||
43 | should be wildcard. | ||
44 | |||
45 | At first sight, this redundancy is just a waste of CPU | ||
46 | resources. But DPI and SPI make it possible to assign different | ||
47 | priorities to GPIs. See also note 4 about tunnels below. | ||
48 | |||
49 | |||
50 | NOTE 3. One complication is the case of tunneled packets. | ||
51 | We implement it as follows: if the first lookup | ||
52 | matches a special session with a non-zero "tunnelhdr" value, | ||
53 | the flowid doesn't contain the true flow ID, but the tunnel ID (1...255). | ||
54 | In this case, we pull tunnelhdr bytes and restart the lookup | ||
55 | with the tunnel ID added to the list of keys. Simple and stupid 8)8) | ||
56 | It's enough for PIMREG and IPIP. | ||
57 | |||
58 | |||
59 | NOTE 4. Two GPIs make it possible to parse even GRE packets. | ||
60 | E.g. DPI can select ETH_P_IP (and the necessary flags to make | ||
61 | tunnelhdr correct) in the GRE protocol field, and SPI matches the | ||
62 | GRE key. Is it not nice? 8)8) | ||
63 | |||
64 | |||
65 | Well, as a result, despite its simplicity, we get a pretty | ||
66 | powerful classification engine. */ | ||
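The DPI/SPI "generalized port" described above is matched with a single masked 32-bit compare against a word of the transport header, the same !(mask & (*(u32*)(xprt+offset) ^ key)) test that rsvp_classify() performs below. A minimal stand-alone sketch of that test (plain userspace C; struct and function names here are illustrative, not kernel identifiers):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct gpi {
	uint32_t key;    /* expected value, in on-the-wire byte order */
	uint32_t mask;   /* which bits of the 32-bit word must match */
	int      offset; /* byte offset from the transport header */
};

/* Same test as the classifier's: !(mask & (word ^ key)). */
static int gpi_match(const struct gpi *g, const uint8_t *xprt)
{
	uint32_t word;

	memcpy(&word, xprt + g->offset, sizeof(word)); /* avoid unaligned loads */
	return !(g->mask & (word ^ g->key));
}

int main(void)
{
	uint8_t xprt[4] = { 0x12, 0x34, 0xab, 0xcd }; /* pretend transport header */
	struct gpi g = { .mask = 0xffffffff, .offset = 0 };

	memcpy(&g.key, xprt, sizeof(g.key)); /* exact match on the first word */
	printf("match: %d\n", gpi_match(&g, xprt)); /* prints 1 */
	return 0;
}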
67 | |||
68 | #include <linux/config.h> | ||
69 | |||
70 | struct rsvp_head | ||
71 | { | ||
72 | u32 tmap[256/32]; | ||
73 | u32 hgenerator; | ||
74 | u8 tgenerator; | ||
75 | struct rsvp_session *ht[256]; | ||
76 | }; | ||
77 | |||
78 | struct rsvp_session | ||
79 | { | ||
80 | struct rsvp_session *next; | ||
81 | u32 dst[RSVP_DST_LEN]; | ||
82 | struct tc_rsvp_gpi dpi; | ||
83 | u8 protocol; | ||
84 | u8 tunnelid; | ||
85 | /* 16 (src,sport) hash slots, and one wildcard source slot */ | ||
86 | struct rsvp_filter *ht[16+1]; | ||
87 | }; | ||
88 | |||
89 | |||
90 | struct rsvp_filter | ||
91 | { | ||
92 | struct rsvp_filter *next; | ||
93 | u32 src[RSVP_DST_LEN]; | ||
94 | struct tc_rsvp_gpi spi; | ||
95 | u8 tunnelhdr; | ||
96 | |||
97 | struct tcf_result res; | ||
98 | struct tcf_exts exts; | ||
99 | |||
100 | u32 handle; | ||
101 | struct rsvp_session *sess; | ||
102 | }; | ||
103 | |||
104 | static __inline__ unsigned hash_dst(u32 *dst, u8 protocol, u8 tunnelid) | ||
105 | { | ||
106 | unsigned h = dst[RSVP_DST_LEN-1]; | ||
107 | h ^= h>>16; | ||
108 | h ^= h>>8; | ||
109 | return (h ^ protocol ^ tunnelid) & 0xFF; | ||
110 | } | ||
111 | |||
112 | static __inline__ unsigned hash_src(u32 *src) | ||
113 | { | ||
114 | unsigned h = src[RSVP_DST_LEN-1]; | ||
115 | h ^= h>>16; | ||
116 | h ^= h>>8; | ||
117 | h ^= h>>4; | ||
118 | return h & 0xF; | ||
119 | } | ||
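hash_dst() and hash_src() fold the last 32-bit word of the address down to a bucket index by repeated XOR-shifts, so every part of the word contributes to the 8-bit session bucket or the 4-bit source slot (hash_dst() additionally XORs in the protocol and tunnel ID). A small stand-alone sketch of the same folding, with an illustrative address value:

#include <stdint.h>
#include <stdio.h>

/* Same folding as hash_dst()/hash_src(): XOR the halves of the word
 * together until only 8 (or 4) significant bits remain. */
static unsigned fold8(uint32_t w)
{
	w ^= w >> 16;
	w ^= w >> 8;
	return w & 0xFF;
}

static unsigned fold4(uint32_t w)
{
	w ^= w >> 16;
	w ^= w >> 8;
	w ^= w >> 4;
	return w & 0xF;
}

int main(void)
{
	uint32_t addr = 0xC0A80001; /* 192.168.0.1, host-order example value */

	printf("session bucket %u, source slot %u\n", fold8(addr), fold4(addr));
	return 0;
}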
120 | |||
121 | static struct tcf_ext_map rsvp_ext_map = { | ||
122 | .police = TCA_RSVP_POLICE, | ||
123 | .action = TCA_RSVP_ACT | ||
124 | }; | ||
125 | |||
126 | #define RSVP_APPLY_RESULT() \ | ||
127 | { \ | ||
128 | int r = tcf_exts_exec(skb, &f->exts, res); \ | ||
129 | if (r < 0) \ | ||
130 | continue; \ | ||
131 | else if (r > 0) \ | ||
132 | return r; \ | ||
133 | } | ||
134 | |||
135 | static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp, | ||
136 | struct tcf_result *res) | ||
137 | { | ||
138 | struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; | ||
139 | struct rsvp_session *s; | ||
140 | struct rsvp_filter *f; | ||
141 | unsigned h1, h2; | ||
142 | u32 *dst, *src; | ||
143 | u8 protocol; | ||
144 | u8 tunnelid = 0; | ||
145 | u8 *xprt; | ||
146 | #if RSVP_DST_LEN == 4 | ||
147 | struct ipv6hdr *nhptr = skb->nh.ipv6h; | ||
148 | #else | ||
149 | struct iphdr *nhptr = skb->nh.iph; | ||
150 | #endif | ||
151 | |||
152 | restart: | ||
153 | |||
154 | #if RSVP_DST_LEN == 4 | ||
155 | src = &nhptr->saddr.s6_addr32[0]; | ||
156 | dst = &nhptr->daddr.s6_addr32[0]; | ||
157 | protocol = nhptr->nexthdr; | ||
158 | xprt = ((u8*)nhptr) + sizeof(struct ipv6hdr); | ||
159 | #else | ||
160 | src = &nhptr->saddr; | ||
161 | dst = &nhptr->daddr; | ||
162 | protocol = nhptr->protocol; | ||
163 | xprt = ((u8*)nhptr) + (nhptr->ihl<<2); | ||
164 | if (nhptr->frag_off&__constant_htons(IP_MF|IP_OFFSET)) | ||
165 | return -1; | ||
166 | #endif | ||
167 | |||
168 | h1 = hash_dst(dst, protocol, tunnelid); | ||
169 | h2 = hash_src(src); | ||
170 | |||
171 | for (s = sht[h1]; s; s = s->next) { | ||
172 | if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && | ||
173 | protocol == s->protocol && | ||
174 | !(s->dpi.mask & (*(u32*)(xprt+s->dpi.offset)^s->dpi.key)) | ||
175 | #if RSVP_DST_LEN == 4 | ||
176 | && dst[0] == s->dst[0] | ||
177 | && dst[1] == s->dst[1] | ||
178 | && dst[2] == s->dst[2] | ||
179 | #endif | ||
180 | && tunnelid == s->tunnelid) { | ||
181 | |||
182 | for (f = s->ht[h2]; f; f = f->next) { | ||
183 | if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] && | ||
184 | !(f->spi.mask & (*(u32*)(xprt+f->spi.offset)^f->spi.key)) | ||
185 | #if RSVP_DST_LEN == 4 | ||
186 | && src[0] == f->src[0] | ||
187 | && src[1] == f->src[1] | ||
188 | && src[2] == f->src[2] | ||
189 | #endif | ||
190 | ) { | ||
191 | *res = f->res; | ||
192 | RSVP_APPLY_RESULT(); | ||
193 | |||
194 | matched: | ||
195 | if (f->tunnelhdr == 0) | ||
196 | return 0; | ||
197 | |||
198 | tunnelid = f->res.classid; | ||
199 | nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr)); | ||
200 | goto restart; | ||
201 | } | ||
202 | } | ||
203 | |||
204 | /* And wildcard bucket... */ | ||
205 | for (f = s->ht[16]; f; f = f->next) { | ||
206 | *res = f->res; | ||
207 | RSVP_APPLY_RESULT(); | ||
208 | goto matched; | ||
209 | } | ||
210 | return -1; | ||
211 | } | ||
212 | } | ||
213 | return -1; | ||
214 | } | ||
215 | |||
216 | static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) | ||
217 | { | ||
218 | struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; | ||
219 | struct rsvp_session *s; | ||
220 | struct rsvp_filter *f; | ||
221 | unsigned h1 = handle&0xFF; | ||
222 | unsigned h2 = (handle>>8)&0xFF; | ||
223 | |||
224 | if (h2 > 16) | ||
225 | return 0; | ||
226 | |||
227 | for (s = sht[h1]; s; s = s->next) { | ||
228 | for (f = s->ht[h2]; f; f = f->next) { | ||
229 | if (f->handle == handle) | ||
230 | return (unsigned long)f; | ||
231 | } | ||
232 | } | ||
233 | return 0; | ||
234 | } | ||
235 | |||
236 | static void rsvp_put(struct tcf_proto *tp, unsigned long f) | ||
237 | { | ||
238 | } | ||
239 | |||
240 | static int rsvp_init(struct tcf_proto *tp) | ||
241 | { | ||
242 | struct rsvp_head *data; | ||
243 | |||
244 | data = kmalloc(sizeof(struct rsvp_head), GFP_KERNEL); | ||
245 | if (data) { | ||
246 | memset(data, 0, sizeof(struct rsvp_head)); | ||
247 | tp->root = data; | ||
248 | return 0; | ||
249 | } | ||
250 | return -ENOBUFS; | ||
251 | } | ||
252 | |||
253 | static inline void | ||
254 | rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) | ||
255 | { | ||
256 | tcf_unbind_filter(tp, &f->res); | ||
257 | tcf_exts_destroy(tp, &f->exts); | ||
258 | kfree(f); | ||
259 | } | ||
260 | |||
261 | static void rsvp_destroy(struct tcf_proto *tp) | ||
262 | { | ||
263 | struct rsvp_head *data = xchg(&tp->root, NULL); | ||
264 | struct rsvp_session **sht; | ||
265 | int h1, h2; | ||
266 | |||
267 | if (data == NULL) | ||
268 | return; | ||
269 | |||
270 | sht = data->ht; | ||
271 | |||
272 | for (h1=0; h1<256; h1++) { | ||
273 | struct rsvp_session *s; | ||
274 | |||
275 | while ((s = sht[h1]) != NULL) { | ||
276 | sht[h1] = s->next; | ||
277 | |||
278 | for (h2=0; h2<=16; h2++) { | ||
279 | struct rsvp_filter *f; | ||
280 | |||
281 | while ((f = s->ht[h2]) != NULL) { | ||
282 | s->ht[h2] = f->next; | ||
283 | rsvp_delete_filter(tp, f); | ||
284 | } | ||
285 | } | ||
286 | kfree(s); | ||
287 | } | ||
288 | } | ||
289 | kfree(data); | ||
290 | } | ||
291 | |||
292 | static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) | ||
293 | { | ||
294 | struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg; | ||
295 | unsigned h = f->handle; | ||
296 | struct rsvp_session **sp; | ||
297 | struct rsvp_session *s = f->sess; | ||
298 | int i; | ||
299 | |||
300 | for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) { | ||
301 | if (*fp == f) { | ||
302 | tcf_tree_lock(tp); | ||
303 | *fp = f->next; | ||
304 | tcf_tree_unlock(tp); | ||
305 | rsvp_delete_filter(tp, f); | ||
306 | |||
307 | /* Strip tree */ | ||
308 | |||
309 | for (i=0; i<=16; i++) | ||
310 | if (s->ht[i]) | ||
311 | return 0; | ||
312 | |||
313 | /* OK, session has no flows */ | ||
314 | for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF]; | ||
315 | *sp; sp = &(*sp)->next) { | ||
316 | if (*sp == s) { | ||
317 | tcf_tree_lock(tp); | ||
318 | *sp = s->next; | ||
319 | tcf_tree_unlock(tp); | ||
320 | |||
321 | kfree(s); | ||
322 | return 0; | ||
323 | } | ||
324 | } | ||
325 | |||
326 | return 0; | ||
327 | } | ||
328 | } | ||
329 | return 0; | ||
330 | } | ||
331 | |||
332 | static unsigned gen_handle(struct tcf_proto *tp, unsigned salt) | ||
333 | { | ||
334 | struct rsvp_head *data = tp->root; | ||
335 | int i = 0xFFFF; | ||
336 | |||
337 | while (i-- > 0) { | ||
338 | u32 h; | ||
339 | if ((data->hgenerator += 0x10000) == 0) | ||
340 | data->hgenerator = 0x10000; | ||
341 | h = data->hgenerator|salt; | ||
342 | if (rsvp_get(tp, h) == 0) | ||
343 | return h; | ||
344 | } | ||
345 | return 0; | ||
346 | } | ||
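gen_handle() above composes a 32-bit filter handle from the session hash bucket (low byte), the source hash slot (next byte, 0..16, with 16 meaning the wildcard slot), and a per-head generator kept pre-shifted in steps of 0x10000 (upper 16 bits); rsvp_get() reverses the split. A hedged stand-alone illustration of that layout (the helper name is mine, not a kernel identifier):

/* bits  0..7   session hash bucket (h1)
 * bits  8..15  source hash slot    (h2, 0..16)
 * bits 16..31  per-head generator                          */
#include <stdint.h>
#include <stdio.h>

static uint32_t make_handle(unsigned gen, unsigned h2, unsigned h1)
{
	return (gen << 16) | (h2 << 8) | h1;
}

int main(void)
{
	uint32_t h = make_handle(3, 16, 0xA7); /* wildcard slot 16, bucket 0xA7 */

	printf("h1=%u h2=%u gen=%u\n",
	       h & 0xFF, (h >> 8) & 0xFF, h >> 16); /* prints h1=167 h2=16 gen=3 */
	return 0;
}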
347 | |||
348 | static int tunnel_bts(struct rsvp_head *data) | ||
349 | { | ||
350 | int n = data->tgenerator>>5; | ||
351 | u32 b = 1<<(data->tgenerator&0x1F); | ||
352 | |||
353 | if (data->tmap[n]&b) | ||
354 | return 0; | ||
355 | data->tmap[n] |= b; | ||
356 | return 1; | ||
357 | } | ||
358 | |||
359 | static void tunnel_recycle(struct rsvp_head *data) | ||
360 | { | ||
361 | struct rsvp_session **sht = data->ht; | ||
362 | u32 tmap[256/32]; | ||
363 | int h1, h2; | ||
364 | |||
365 | memset(tmap, 0, sizeof(tmap)); | ||
366 | |||
367 | for (h1=0; h1<256; h1++) { | ||
368 | struct rsvp_session *s; | ||
369 | for (s = sht[h1]; s; s = s->next) { | ||
370 | for (h2=0; h2<=16; h2++) { | ||
371 | struct rsvp_filter *f; | ||
372 | |||
373 | for (f = s->ht[h2]; f; f = f->next) { | ||
374 | if (f->tunnelhdr == 0) | ||
375 | continue; | ||
376 | data->tgenerator = f->res.classid; | ||
377 | tunnel_bts(data); | ||
378 | } | ||
379 | } | ||
380 | } | ||
381 | } | ||
382 | |||
383 | memcpy(data->tmap, tmap, sizeof(tmap)); | ||
384 | } | ||
385 | |||
386 | static u32 gen_tunnel(struct rsvp_head *data) | ||
387 | { | ||
388 | int i, k; | ||
389 | |||
390 | for (k=0; k<2; k++) { | ||
391 | for (i=255; i>0; i--) { | ||
392 | if (++data->tgenerator == 0) | ||
393 | data->tgenerator = 1; | ||
394 | if (tunnel_bts(data)) | ||
395 | return data->tgenerator; | ||
396 | } | ||
397 | tunnel_recycle(data); | ||
398 | } | ||
399 | return 0; | ||
400 | } | ||
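Tunnel IDs (1..255) are handed out from the 256-bit bitmap tmap[]: tunnel_bts() is a test-and-set of one bit, and gen_tunnel() scans for a free ID, rebuilding the map via tunnel_recycle() if the first pass fails. A minimal userspace version of the same test-and-set step (names are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

static uint32_t tmap[256 / 32];            /* one bit per possible tunnel ID */

/* Returns 1 if the bit was free and is now taken, 0 if already in use. */
static int bitmap_test_and_set(uint8_t id)
{
	int      word = id >> 5;               /* id / 32 */
	uint32_t bit  = 1u << (id & 0x1F);     /* id % 32 */

	if (tmap[word] & bit)
		return 0;
	tmap[word] |= bit;
	return 1;
}

int main(void)
{
	printf("%d\n", bitmap_test_and_set(42)); /* 1: allocated */
	printf("%d\n", bitmap_test_and_set(42)); /* 0: already taken */
	return 0;
}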
401 | |||
402 | static int rsvp_change(struct tcf_proto *tp, unsigned long base, | ||
403 | u32 handle, | ||
404 | struct rtattr **tca, | ||
405 | unsigned long *arg) | ||
406 | { | ||
407 | struct rsvp_head *data = tp->root; | ||
408 | struct rsvp_filter *f, **fp; | ||
409 | struct rsvp_session *s, **sp; | ||
410 | struct tc_rsvp_pinfo *pinfo = NULL; | ||
411 | struct rtattr *opt = tca[TCA_OPTIONS-1]; | ||
412 | struct rtattr *tb[TCA_RSVP_MAX]; | ||
413 | struct tcf_exts e; | ||
414 | unsigned h1, h2; | ||
415 | u32 *dst; | ||
416 | int err; | ||
417 | |||
418 | if (opt == NULL) | ||
419 | return handle ? -EINVAL : 0; | ||
420 | |||
421 | if (rtattr_parse_nested(tb, TCA_RSVP_MAX, opt) < 0) | ||
422 | return -EINVAL; | ||
423 | |||
424 | err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &rsvp_ext_map); | ||
425 | if (err < 0) | ||
426 | return err; | ||
427 | |||
428 | if ((f = (struct rsvp_filter*)*arg) != NULL) { | ||
429 | /* Node exists: adjust only classid */ | ||
430 | |||
431 | if (f->handle != handle && handle) | ||
432 | goto errout2; | ||
433 | if (tb[TCA_RSVP_CLASSID-1]) { | ||
434 | f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); | ||
435 | tcf_bind_filter(tp, &f->res, base); | ||
436 | } | ||
437 | |||
438 | tcf_exts_change(tp, &f->exts, &e); | ||
439 | return 0; | ||
440 | } | ||
441 | |||
442 | /* Now more serious part... */ | ||
443 | err = -EINVAL; | ||
444 | if (handle) | ||
445 | goto errout2; | ||
446 | if (tb[TCA_RSVP_DST-1] == NULL) | ||
447 | goto errout2; | ||
448 | |||
449 | err = -ENOBUFS; | ||
450 | f = kmalloc(sizeof(struct rsvp_filter), GFP_KERNEL); | ||
451 | if (f == NULL) | ||
452 | goto errout2; | ||
453 | |||
454 | memset(f, 0, sizeof(*f)); | ||
455 | h2 = 16; | ||
456 | if (tb[TCA_RSVP_SRC-1]) { | ||
457 | err = -EINVAL; | ||
458 | if (RTA_PAYLOAD(tb[TCA_RSVP_SRC-1]) != sizeof(f->src)) | ||
459 | goto errout; | ||
460 | memcpy(f->src, RTA_DATA(tb[TCA_RSVP_SRC-1]), sizeof(f->src)); | ||
461 | h2 = hash_src(f->src); | ||
462 | } | ||
463 | if (tb[TCA_RSVP_PINFO-1]) { | ||
464 | err = -EINVAL; | ||
465 | if (RTA_PAYLOAD(tb[TCA_RSVP_PINFO-1]) < sizeof(struct tc_rsvp_pinfo)) | ||
466 | goto errout; | ||
467 | pinfo = RTA_DATA(tb[TCA_RSVP_PINFO-1]); | ||
468 | f->spi = pinfo->spi; | ||
469 | f->tunnelhdr = pinfo->tunnelhdr; | ||
470 | } | ||
471 | if (tb[TCA_RSVP_CLASSID-1]) { | ||
472 | err = -EINVAL; | ||
473 | if (RTA_PAYLOAD(tb[TCA_RSVP_CLASSID-1]) != 4) | ||
474 | goto errout; | ||
475 | f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); | ||
476 | } | ||
477 | |||
478 | err = -EINVAL; | ||
479 | if (RTA_PAYLOAD(tb[TCA_RSVP_DST-1]) != sizeof(f->src)) | ||
480 | goto errout; | ||
481 | dst = RTA_DATA(tb[TCA_RSVP_DST-1]); | ||
482 | h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0); | ||
483 | |||
484 | err = -ENOMEM; | ||
485 | if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0) | ||
486 | goto errout; | ||
487 | |||
488 | if (f->tunnelhdr) { | ||
489 | err = -EINVAL; | ||
490 | if (f->res.classid > 255) | ||
491 | goto errout; | ||
492 | |||
493 | err = -ENOMEM; | ||
494 | if (f->res.classid == 0 && | ||
495 | (f->res.classid = gen_tunnel(data)) == 0) | ||
496 | goto errout; | ||
497 | } | ||
498 | |||
499 | for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) { | ||
500 | if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && | ||
501 | pinfo && pinfo->protocol == s->protocol && | ||
502 | memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 | ||
503 | #if RSVP_DST_LEN == 4 | ||
504 | && dst[0] == s->dst[0] | ||
505 | && dst[1] == s->dst[1] | ||
506 | && dst[2] == s->dst[2] | ||
507 | #endif | ||
508 | && pinfo->tunnelid == s->tunnelid) { | ||
509 | |||
510 | insert: | ||
511 | /* OK, we found an appropriate session */ | ||
512 | |||
513 | fp = &s->ht[h2]; | ||
514 | |||
515 | f->sess = s; | ||
516 | if (f->tunnelhdr == 0) | ||
517 | tcf_bind_filter(tp, &f->res, base); | ||
518 | |||
519 | tcf_exts_change(tp, &f->exts, &e); | ||
520 | |||
521 | for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next) | ||
522 | if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask) | ||
523 | break; | ||
524 | f->next = *fp; | ||
525 | wmb(); | ||
526 | *fp = f; | ||
527 | |||
528 | *arg = (unsigned long)f; | ||
529 | return 0; | ||
530 | } | ||
531 | } | ||
532 | |||
533 | /* No session found. Create a new one. */ | ||
534 | |||
535 | err = -ENOBUFS; | ||
536 | s = kmalloc(sizeof(struct rsvp_session), GFP_KERNEL); | ||
537 | if (s == NULL) | ||
538 | goto errout; | ||
539 | memset(s, 0, sizeof(*s)); | ||
540 | memcpy(s->dst, dst, sizeof(s->dst)); | ||
541 | |||
542 | if (pinfo) { | ||
543 | s->dpi = pinfo->dpi; | ||
544 | s->protocol = pinfo->protocol; | ||
545 | s->tunnelid = pinfo->tunnelid; | ||
546 | } | ||
547 | for (sp = &data->ht[h1]; *sp; sp = &(*sp)->next) { | ||
548 | if (((*sp)->dpi.mask&s->dpi.mask) != s->dpi.mask) | ||
549 | break; | ||
550 | } | ||
551 | s->next = *sp; | ||
552 | wmb(); | ||
553 | *sp = s; | ||
554 | |||
555 | goto insert; | ||
556 | |||
557 | errout: | ||
558 | if (f) | ||
559 | kfree(f); | ||
560 | errout2: | ||
561 | tcf_exts_destroy(tp, &e); | ||
562 | return err; | ||
563 | } | ||
564 | |||
565 | static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) | ||
566 | { | ||
567 | struct rsvp_head *head = tp->root; | ||
568 | unsigned h, h1; | ||
569 | |||
570 | if (arg->stop) | ||
571 | return; | ||
572 | |||
573 | for (h = 0; h < 256; h++) { | ||
574 | struct rsvp_session *s; | ||
575 | |||
576 | for (s = head->ht[h]; s; s = s->next) { | ||
577 | for (h1 = 0; h1 <= 16; h1++) { | ||
578 | struct rsvp_filter *f; | ||
579 | |||
580 | for (f = s->ht[h1]; f; f = f->next) { | ||
581 | if (arg->count < arg->skip) { | ||
582 | arg->count++; | ||
583 | continue; | ||
584 | } | ||
585 | if (arg->fn(tp, (unsigned long)f, arg) < 0) { | ||
586 | arg->stop = 1; | ||
587 | return; | ||
588 | } | ||
589 | arg->count++; | ||
590 | } | ||
591 | } | ||
592 | } | ||
593 | } | ||
594 | } | ||
595 | |||
596 | static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, | ||
597 | struct sk_buff *skb, struct tcmsg *t) | ||
598 | { | ||
599 | struct rsvp_filter *f = (struct rsvp_filter*)fh; | ||
600 | struct rsvp_session *s; | ||
601 | unsigned char *b = skb->tail; | ||
602 | struct rtattr *rta; | ||
603 | struct tc_rsvp_pinfo pinfo; | ||
604 | |||
605 | if (f == NULL) | ||
606 | return skb->len; | ||
607 | s = f->sess; | ||
608 | |||
609 | t->tcm_handle = f->handle; | ||
610 | |||
611 | |||
612 | rta = (struct rtattr*)b; | ||
613 | RTA_PUT(skb, TCA_OPTIONS, 0, NULL); | ||
614 | |||
615 | RTA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst); | ||
616 | pinfo.dpi = s->dpi; | ||
617 | pinfo.spi = f->spi; | ||
618 | pinfo.protocol = s->protocol; | ||
619 | pinfo.tunnelid = s->tunnelid; | ||
620 | pinfo.tunnelhdr = f->tunnelhdr; | ||
621 | RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); | ||
622 | if (f->res.classid) | ||
623 | RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid); | ||
624 | if (((f->handle>>8)&0xFF) != 16) | ||
625 | RTA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src); | ||
626 | |||
627 | if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0) | ||
628 | goto rtattr_failure; | ||
629 | |||
630 | rta->rta_len = skb->tail - b; | ||
631 | |||
632 | if (tcf_exts_dump_stats(skb, &f->exts, &rsvp_ext_map) < 0) | ||
633 | goto rtattr_failure; | ||
634 | return skb->len; | ||
635 | |||
636 | rtattr_failure: | ||
637 | skb_trim(skb, b - skb->data); | ||
638 | return -1; | ||
639 | } | ||
640 | |||
641 | static struct tcf_proto_ops RSVP_OPS = { | ||
642 | .next = NULL, | ||
643 | .kind = RSVP_ID, | ||
644 | .classify = rsvp_classify, | ||
645 | .init = rsvp_init, | ||
646 | .destroy = rsvp_destroy, | ||
647 | .get = rsvp_get, | ||
648 | .put = rsvp_put, | ||
649 | .change = rsvp_change, | ||
650 | .delete = rsvp_delete, | ||
651 | .walk = rsvp_walk, | ||
652 | .dump = rsvp_dump, | ||
653 | .owner = THIS_MODULE, | ||
654 | }; | ||
655 | |||
656 | static int __init init_rsvp(void) | ||
657 | { | ||
658 | return register_tcf_proto_ops(&RSVP_OPS); | ||
659 | } | ||
660 | |||
661 | static void __exit exit_rsvp(void) | ||
662 | { | ||
663 | unregister_tcf_proto_ops(&RSVP_OPS); | ||
664 | } | ||
665 | |||
666 | module_init(init_rsvp) | ||
667 | module_exit(exit_rsvp) | ||
diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c new file mode 100644 index 000000000000..fde51f7848eb --- /dev/null +++ b/net/sched/cls_rsvp6.c | |||
@@ -0,0 +1,44 @@ | |||
1 | /* | ||
2 | * net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
10 | */ | ||
11 | |||
12 | #include <linux/module.h> | ||
13 | #include <asm/uaccess.h> | ||
14 | #include <asm/system.h> | ||
15 | #include <linux/bitops.h> | ||
16 | #include <linux/types.h> | ||
17 | #include <linux/kernel.h> | ||
18 | #include <linux/sched.h> | ||
19 | #include <linux/string.h> | ||
20 | #include <linux/mm.h> | ||
21 | #include <linux/socket.h> | ||
22 | #include <linux/sockios.h> | ||
23 | #include <linux/in.h> | ||
24 | #include <linux/errno.h> | ||
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/if_ether.h> | ||
27 | #include <linux/inet.h> | ||
28 | #include <linux/netdevice.h> | ||
29 | #include <linux/etherdevice.h> | ||
30 | #include <linux/notifier.h> | ||
31 | #include <net/ip.h> | ||
32 | #include <linux/ipv6.h> | ||
33 | #include <net/route.h> | ||
34 | #include <linux/skbuff.h> | ||
35 | #include <net/sock.h> | ||
36 | #include <net/act_api.h> | ||
37 | #include <net/pkt_cls.h> | ||
38 | |||
39 | #define RSVP_DST_LEN 4 | ||
40 | #define RSVP_ID "rsvp6" | ||
41 | #define RSVP_OPS cls_rsvp6_ops | ||
42 | |||
43 | #include "cls_rsvp.h" | ||
44 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c new file mode 100644 index 000000000000..404d9d83a7fa --- /dev/null +++ b/net/sched/cls_tcindex.c | |||
@@ -0,0 +1,537 @@ | |||
1 | /* | ||
2 | * net/sched/cls_tcindex.c Packet classifier for skb->tc_index | ||
3 | * | ||
4 | * Written 1998,1999 by Werner Almesberger, EPFL ICA | ||
5 | */ | ||
6 | |||
7 | #include <linux/config.h> | ||
8 | #include <linux/module.h> | ||
9 | #include <linux/types.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/skbuff.h> | ||
12 | #include <linux/errno.h> | ||
13 | #include <linux/netdevice.h> | ||
14 | #include <net/ip.h> | ||
15 | #include <net/act_api.h> | ||
16 | #include <net/pkt_cls.h> | ||
17 | #include <net/route.h> | ||
18 | |||
19 | |||
20 | /* | ||
21 | * Not quite sure if we need all the xchgs Alexey uses when accessing things. | ||
22 | * Can always add them later ... :) | ||
23 | */ | ||
24 | |||
25 | /* | ||
26 | * Passing parameters to the root seems to be done more awkwardly than really | ||
27 | * necessary. At least, u32 doesn't seem to use such dirty hacks. To be | ||
28 | * verified. FIXME. | ||
29 | */ | ||
30 | |||
31 | #define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */ | ||
32 | #define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ | ||
33 | |||
34 | |||
35 | #if 1 /* control */ | ||
36 | #define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) | ||
37 | #else | ||
38 | #define DPRINTK(format,args...) | ||
39 | #endif | ||
40 | |||
41 | #if 0 /* data */ | ||
42 | #define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) | ||
43 | #else | ||
44 | #define D2PRINTK(format,args...) | ||
45 | #endif | ||
46 | |||
47 | |||
48 | #define PRIV(tp) ((struct tcindex_data *) (tp)->root) | ||
49 | |||
50 | |||
51 | struct tcindex_filter_result { | ||
52 | struct tcf_exts exts; | ||
53 | struct tcf_result res; | ||
54 | }; | ||
55 | |||
56 | struct tcindex_filter { | ||
57 | u16 key; | ||
58 | struct tcindex_filter_result result; | ||
59 | struct tcindex_filter *next; | ||
60 | }; | ||
61 | |||
62 | |||
63 | struct tcindex_data { | ||
64 | struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */ | ||
65 | struct tcindex_filter **h; /* imperfect hash; only used if !perfect; | ||
66 | NULL if unused */ | ||
67 | u16 mask; /* AND key with mask */ | ||
68 | int shift; /* shift ANDed key to the right */ | ||
69 | int hash; /* hash table size; 0 if undefined */ | ||
70 | int alloc_hash; /* allocated size */ | ||
71 | int fall_through; /* 0: only classify if explicit match */ | ||
72 | }; | ||
73 | |||
74 | static struct tcf_ext_map tcindex_ext_map = { | ||
75 | .police = TCA_TCINDEX_POLICE, | ||
76 | .action = TCA_TCINDEX_ACT | ||
77 | }; | ||
78 | |||
79 | static inline int | ||
80 | tcindex_filter_is_set(struct tcindex_filter_result *r) | ||
81 | { | ||
82 | return tcf_exts_is_predicative(&r->exts) || r->res.classid; | ||
83 | } | ||
84 | |||
85 | static struct tcindex_filter_result * | ||
86 | tcindex_lookup(struct tcindex_data *p, u16 key) | ||
87 | { | ||
88 | struct tcindex_filter *f; | ||
89 | |||
90 | if (p->perfect) | ||
91 | return tcindex_filter_is_set(p->perfect + key) ? | ||
92 | p->perfect + key : NULL; | ||
93 | else if (p->h) { | ||
94 | for (f = p->h[key % p->hash]; f; f = f->next) | ||
95 | if (f->key == key) | ||
96 | return &f->result; | ||
97 | } | ||
98 | |||
99 | return NULL; | ||
100 | } | ||
101 | |||
102 | |||
103 | static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp, | ||
104 | struct tcf_result *res) | ||
105 | { | ||
106 | struct tcindex_data *p = PRIV(tp); | ||
107 | struct tcindex_filter_result *f; | ||
108 | int key = (skb->tc_index & p->mask) >> p->shift; | ||
109 | |||
110 | D2PRINTK("tcindex_classify(skb %p,tp %p,res %p),p %p\n",skb,tp,res,p); | ||
111 | |||
112 | f = tcindex_lookup(p, key); | ||
113 | if (!f) { | ||
114 | if (!p->fall_through) | ||
115 | return -1; | ||
116 | res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), key); | ||
117 | res->class = 0; | ||
118 | D2PRINTK("alg 0x%x\n",res->classid); | ||
119 | return 0; | ||
120 | } | ||
121 | *res = f->res; | ||
122 | D2PRINTK("map 0x%x\n",res->classid); | ||
123 | |||
124 | return tcf_exts_exec(skb, &f->exts, res); | ||
125 | } | ||
126 | |||
127 | |||
128 | static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) | ||
129 | { | ||
130 | struct tcindex_data *p = PRIV(tp); | ||
131 | struct tcindex_filter_result *r; | ||
132 | |||
133 | DPRINTK("tcindex_get(tp %p,handle 0x%08x)\n",tp,handle); | ||
134 | if (p->perfect && handle >= p->alloc_hash) | ||
135 | return 0; | ||
136 | r = tcindex_lookup(p, handle); | ||
137 | return r && tcindex_filter_is_set(r) ? (unsigned long) r : 0UL; | ||
138 | } | ||
139 | |||
140 | |||
141 | static void tcindex_put(struct tcf_proto *tp, unsigned long f) | ||
142 | { | ||
143 | DPRINTK("tcindex_put(tp %p,f 0x%lx)\n",tp,f); | ||
144 | } | ||
145 | |||
146 | |||
147 | static int tcindex_init(struct tcf_proto *tp) | ||
148 | { | ||
149 | struct tcindex_data *p; | ||
150 | |||
151 | DPRINTK("tcindex_init(tp %p)\n",tp); | ||
152 | p = kmalloc(sizeof(struct tcindex_data),GFP_KERNEL); | ||
153 | if (!p) | ||
154 | return -ENOMEM; | ||
155 | |||
156 | memset(p, 0, sizeof(*p)); | ||
157 | p->mask = 0xffff; | ||
158 | p->hash = DEFAULT_HASH_SIZE; | ||
159 | p->fall_through = 1; | ||
160 | |||
161 | tp->root = p; | ||
162 | return 0; | ||
163 | } | ||
164 | |||
165 | |||
166 | static int | ||
167 | __tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock) | ||
168 | { | ||
169 | struct tcindex_data *p = PRIV(tp); | ||
170 | struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; | ||
171 | struct tcindex_filter *f = NULL; | ||
172 | |||
173 | DPRINTK("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n",tp,arg,p,f); | ||
174 | if (p->perfect) { | ||
175 | if (!r->res.class) | ||
176 | return -ENOENT; | ||
177 | } else { | ||
178 | int i; | ||
179 | struct tcindex_filter **walk = NULL; | ||
180 | |||
181 | for (i = 0; i < p->hash; i++) | ||
182 | for (walk = p->h+i; *walk; walk = &(*walk)->next) | ||
183 | if (&(*walk)->result == r) | ||
184 | goto found; | ||
185 | return -ENOENT; | ||
186 | |||
187 | found: | ||
188 | f = *walk; | ||
189 | if (lock) | ||
190 | tcf_tree_lock(tp); | ||
191 | *walk = f->next; | ||
192 | if (lock) | ||
193 | tcf_tree_unlock(tp); | ||
194 | } | ||
195 | tcf_unbind_filter(tp, &r->res); | ||
196 | tcf_exts_destroy(tp, &r->exts); | ||
197 | if (f) | ||
198 | kfree(f); | ||
199 | return 0; | ||
200 | } | ||
201 | |||
202 | static int tcindex_delete(struct tcf_proto *tp, unsigned long arg) | ||
203 | { | ||
204 | return __tcindex_delete(tp, arg, 1); | ||
205 | } | ||
206 | |||
207 | static inline int | ||
208 | valid_perfect_hash(struct tcindex_data *p) | ||
209 | { | ||
210 | return p->hash > (p->mask >> p->shift); | ||
211 | } | ||
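valid_perfect_hash() simply checks that the table has a slot for every value the masked-and-shifted key can take, since tcindex_classify() computes key = (skb->tc_index & mask) >> shift. A small stand-alone sketch of that arithmetic and of the perfect-versus-chained decision (the threshold constant is copied from above; everything else is illustrative):

#include <stdint.h>
#include <stdio.h>

#define PERFECT_HASH_THRESHOLD 64

static unsigned tcindex_key(uint16_t tc_index, uint16_t mask, int shift)
{
	return (tc_index & mask) >> shift;  /* same formula as tcindex_classify() */
}

int main(void)
{
	uint16_t mask = 0x3F;
	int      shift = 2;
	unsigned max_key = mask >> shift;   /* largest possible key: 15 */

	/* Perfect hash: one slot per possible key, used when the range is small. */
	if (max_key < PERFECT_HASH_THRESHOLD)
		printf("perfect hash with %u entries\n", max_key + 1);
	else
		printf("imperfect (chained) hash\n");

	printf("tc_index 0x2C -> key %u\n", tcindex_key(0x2C, mask, shift)); /* 11 */
	return 0;
}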
212 | |||
213 | static int | ||
214 | tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, | ||
215 | struct tcindex_data *p, struct tcindex_filter_result *r, | ||
216 | struct rtattr **tb, struct rtattr *est) | ||
217 | { | ||
218 | int err, balloc = 0; | ||
219 | struct tcindex_filter_result new_filter_result, *old_r = r; | ||
220 | struct tcindex_filter_result cr; | ||
221 | struct tcindex_data cp; | ||
222 | struct tcindex_filter *f = NULL; /* make gcc behave */ | ||
223 | struct tcf_exts e; | ||
224 | |||
225 | err = tcf_exts_validate(tp, tb, est, &e, &tcindex_ext_map); | ||
226 | if (err < 0) | ||
227 | return err; | ||
228 | |||
229 | memcpy(&cp, p, sizeof(cp)); | ||
230 | memset(&new_filter_result, 0, sizeof(new_filter_result)); | ||
231 | |||
232 | if (old_r) | ||
233 | memcpy(&cr, r, sizeof(cr)); | ||
234 | else | ||
235 | memset(&cr, 0, sizeof(cr)); | ||
236 | |||
237 | err = -EINVAL; | ||
238 | if (tb[TCA_TCINDEX_HASH-1]) { | ||
239 | if (RTA_PAYLOAD(tb[TCA_TCINDEX_HASH-1]) < sizeof(u32)) | ||
240 | goto errout; | ||
241 | cp.hash = *(u32 *) RTA_DATA(tb[TCA_TCINDEX_HASH-1]); | ||
242 | } | ||
243 | |||
244 | if (tb[TCA_TCINDEX_MASK-1]) { | ||
245 | if (RTA_PAYLOAD(tb[TCA_TCINDEX_MASK-1]) < sizeof(u16)) | ||
246 | goto errout; | ||
247 | cp.mask = *(u16 *) RTA_DATA(tb[TCA_TCINDEX_MASK-1]); | ||
248 | } | ||
249 | |||
250 | if (tb[TCA_TCINDEX_SHIFT-1]) { | ||
251 | if (RTA_PAYLOAD(tb[TCA_TCINDEX_SHIFT-1]) < sizeof(u16)) | ||
252 | goto errout; | ||
253 | cp.shift = *(u16 *) RTA_DATA(tb[TCA_TCINDEX_SHIFT-1]); | ||
254 | } | ||
255 | |||
256 | err = -EBUSY; | ||
257 | /* Hash already allocated, make sure that we still meet the | ||
258 | * requirements for the allocated hash. | ||
259 | */ | ||
260 | if (cp.perfect) { | ||
261 | if (!valid_perfect_hash(&cp) || | ||
262 | cp.hash > cp.alloc_hash) | ||
263 | goto errout; | ||
264 | } else if (cp.h && cp.hash != cp.alloc_hash) | ||
265 | goto errout; | ||
266 | |||
267 | err = -EINVAL; | ||
268 | if (tb[TCA_TCINDEX_FALL_THROUGH-1]) { | ||
269 | if (RTA_PAYLOAD(tb[TCA_TCINDEX_FALL_THROUGH-1]) < sizeof(u32)) | ||
270 | goto errout; | ||
271 | cp.fall_through = | ||
272 | *(u32 *) RTA_DATA(tb[TCA_TCINDEX_FALL_THROUGH-1]); | ||
273 | } | ||
274 | |||
275 | if (!cp.hash) { | ||
276 | /* Hash not specified, use perfect hash if the upper limit | ||
277 | * of the hashing index is below the threshold. | ||
278 | */ | ||
279 | if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD) | ||
280 | cp.hash = (cp.mask >> cp.shift)+1; | ||
281 | else | ||
282 | cp.hash = DEFAULT_HASH_SIZE; | ||
283 | } | ||
284 | |||
285 | if (!cp.perfect && !cp.h) | ||
286 | cp.alloc_hash = cp.hash; | ||
287 | |||
288 | /* Note: this could be as restrictive as if (handle & ~(mask >> shift)) | ||
289 | * but then, we'd fail handles that may become valid after some future | ||
290 | * mask change. While this is extremely unlikely to ever matter, | ||
291 | * the check below is safer (and also more backwards-compatible). | ||
292 | */ | ||
293 | if (cp.perfect || valid_perfect_hash(&cp)) | ||
294 | if (handle >= cp.alloc_hash) | ||
295 | goto errout; | ||
296 | |||
297 | |||
298 | err = -ENOMEM; | ||
299 | if (!cp.perfect && !cp.h) { | ||
300 | if (valid_perfect_hash(&cp)) { | ||
301 | cp.perfect = kmalloc(cp.hash * sizeof(*r), GFP_KERNEL); | ||
302 | if (!cp.perfect) | ||
303 | goto errout; | ||
304 | memset(cp.perfect, 0, cp.hash * sizeof(*r)); | ||
305 | balloc = 1; | ||
306 | } else { | ||
307 | cp.h = kmalloc(cp.hash * sizeof(f), GFP_KERNEL); | ||
308 | if (!cp.h) | ||
309 | goto errout; | ||
310 | memset(cp.h, 0, cp.hash * sizeof(f)); | ||
311 | balloc = 2; | ||
312 | } | ||
313 | } | ||
314 | |||
315 | if (cp.perfect) | ||
316 | r = cp.perfect + handle; | ||
317 | else | ||
318 | r = tcindex_lookup(&cp, handle) ? : &new_filter_result; | ||
319 | |||
320 | if (r == &new_filter_result) { | ||
321 | f = kmalloc(sizeof(*f), GFP_KERNEL); | ||
322 | if (!f) | ||
323 | goto errout_alloc; | ||
324 | memset(f, 0, sizeof(*f)); | ||
325 | } | ||
326 | |||
327 | if (tb[TCA_TCINDEX_CLASSID-1]) { | ||
328 | cr.res.classid = *(u32 *) RTA_DATA(tb[TCA_TCINDEX_CLASSID-1]); | ||
329 | tcf_bind_filter(tp, &cr.res, base); | ||
330 | } | ||
331 | |||
332 | tcf_exts_change(tp, &cr.exts, &e); | ||
333 | |||
334 | tcf_tree_lock(tp); | ||
335 | if (old_r && old_r != r) | ||
336 | memset(old_r, 0, sizeof(*old_r)); | ||
337 | |||
338 | memcpy(p, &cp, sizeof(cp)); | ||
339 | memcpy(r, &cr, sizeof(cr)); | ||
340 | |||
341 | if (r == &new_filter_result) { | ||
342 | struct tcindex_filter **fp; | ||
343 | |||
344 | f->key = handle; | ||
345 | f->result = new_filter_result; | ||
346 | f->next = NULL; | ||
347 | for (fp = p->h+(handle % p->hash); *fp; fp = &(*fp)->next) | ||
348 | /* nothing */; | ||
349 | *fp = f; | ||
350 | } | ||
351 | tcf_tree_unlock(tp); | ||
352 | |||
353 | return 0; | ||
354 | |||
355 | errout_alloc: | ||
356 | if (balloc == 1) | ||
357 | kfree(cp.perfect); | ||
358 | else if (balloc == 2) | ||
359 | kfree(cp.h); | ||
360 | errout: | ||
361 | tcf_exts_destroy(tp, &e); | ||
362 | return err; | ||
363 | } | ||
364 | |||
365 | static int | ||
366 | tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle, | ||
367 | struct rtattr **tca, unsigned long *arg) | ||
368 | { | ||
369 | struct rtattr *opt = tca[TCA_OPTIONS-1]; | ||
370 | struct rtattr *tb[TCA_TCINDEX_MAX]; | ||
371 | struct tcindex_data *p = PRIV(tp); | ||
372 | struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg; | ||
373 | |||
374 | DPRINTK("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," | ||
375 | "p %p,r %p,*arg 0x%lx\n", | ||
376 | tp, handle, tca, arg, opt, p, r, arg ? *arg : 0L); | ||
377 | |||
378 | if (!opt) | ||
379 | return 0; | ||
380 | |||
381 | if (rtattr_parse_nested(tb, TCA_TCINDEX_MAX, opt) < 0) | ||
382 | return -EINVAL; | ||
383 | |||
384 | return tcindex_set_parms(tp, base, handle, p, r, tb, tca[TCA_RATE-1]); | ||
385 | } | ||
386 | |||
387 | |||
388 | static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) | ||
389 | { | ||
390 | struct tcindex_data *p = PRIV(tp); | ||
391 | struct tcindex_filter *f,*next; | ||
392 | int i; | ||
393 | |||
394 | DPRINTK("tcindex_walk(tp %p,walker %p),p %p\n",tp,walker,p); | ||
395 | if (p->perfect) { | ||
396 | for (i = 0; i < p->hash; i++) { | ||
397 | if (!p->perfect[i].res.class) | ||
398 | continue; | ||
399 | if (walker->count >= walker->skip) { | ||
400 | if (walker->fn(tp, | ||
401 | (unsigned long) (p->perfect+i), walker) | ||
402 | < 0) { | ||
403 | walker->stop = 1; | ||
404 | return; | ||
405 | } | ||
406 | } | ||
407 | walker->count++; | ||
408 | } | ||
409 | } | ||
410 | if (!p->h) | ||
411 | return; | ||
412 | for (i = 0; i < p->hash; i++) { | ||
413 | for (f = p->h[i]; f; f = next) { | ||
414 | next = f->next; | ||
415 | if (walker->count >= walker->skip) { | ||
416 | if (walker->fn(tp,(unsigned long) &f->result, | ||
417 | walker) < 0) { | ||
418 | walker->stop = 1; | ||
419 | return; | ||
420 | } | ||
421 | } | ||
422 | walker->count++; | ||
423 | } | ||
424 | } | ||
425 | } | ||
426 | |||
427 | |||
428 | static int tcindex_destroy_element(struct tcf_proto *tp, | ||
429 | unsigned long arg, struct tcf_walker *walker) | ||
430 | { | ||
431 | return __tcindex_delete(tp, arg, 0); | ||
432 | } | ||
433 | |||
434 | |||
435 | static void tcindex_destroy(struct tcf_proto *tp) | ||
436 | { | ||
437 | struct tcindex_data *p = PRIV(tp); | ||
438 | struct tcf_walker walker; | ||
439 | |||
440 | DPRINTK("tcindex_destroy(tp %p),p %p\n",tp,p); | ||
441 | walker.count = 0; | ||
442 | walker.skip = 0; | ||
443 | walker.fn = &tcindex_destroy_element; | ||
444 | tcindex_walk(tp,&walker); | ||
445 | if (p->perfect) | ||
446 | kfree(p->perfect); | ||
447 | if (p->h) | ||
448 | kfree(p->h); | ||
449 | kfree(p); | ||
450 | tp->root = NULL; | ||
451 | } | ||
452 | |||
453 | |||
454 | static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, | ||
455 | struct sk_buff *skb, struct tcmsg *t) | ||
456 | { | ||
457 | struct tcindex_data *p = PRIV(tp); | ||
458 | struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; | ||
459 | unsigned char *b = skb->tail; | ||
460 | struct rtattr *rta; | ||
461 | |||
462 | DPRINTK("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n", | ||
463 | tp,fh,skb,t,p,r,b); | ||
464 | DPRINTK("p->perfect %p p->h %p\n",p->perfect,p->h); | ||
465 | rta = (struct rtattr *) b; | ||
466 | RTA_PUT(skb,TCA_OPTIONS,0,NULL); | ||
467 | if (!fh) { | ||
468 | t->tcm_handle = ~0; /* whatever ... */ | ||
469 | RTA_PUT(skb,TCA_TCINDEX_HASH,sizeof(p->hash),&p->hash); | ||
470 | RTA_PUT(skb,TCA_TCINDEX_MASK,sizeof(p->mask),&p->mask); | ||
471 | RTA_PUT(skb,TCA_TCINDEX_SHIFT,sizeof(p->shift),&p->shift); | ||
472 | RTA_PUT(skb,TCA_TCINDEX_FALL_THROUGH,sizeof(p->fall_through), | ||
473 | &p->fall_through); | ||
474 | rta->rta_len = skb->tail-b; | ||
475 | } else { | ||
476 | if (p->perfect) { | ||
477 | t->tcm_handle = r-p->perfect; | ||
478 | } else { | ||
479 | struct tcindex_filter *f; | ||
480 | int i; | ||
481 | |||
482 | t->tcm_handle = 0; | ||
483 | for (i = 0; !t->tcm_handle && i < p->hash; i++) { | ||
484 | for (f = p->h[i]; !t->tcm_handle && f; | ||
485 | f = f->next) { | ||
486 | if (&f->result == r) | ||
487 | t->tcm_handle = f->key; | ||
488 | } | ||
489 | } | ||
490 | } | ||
491 | DPRINTK("handle = %d\n",t->tcm_handle); | ||
492 | if (r->res.class) | ||
493 | RTA_PUT(skb, TCA_TCINDEX_CLASSID, 4, &r->res.classid); | ||
494 | |||
495 | if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0) | ||
496 | goto rtattr_failure; | ||
497 | rta->rta_len = skb->tail-b; | ||
498 | |||
499 | if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0) | ||
500 | goto rtattr_failure; | ||
501 | } | ||
502 | |||
503 | return skb->len; | ||
504 | |||
505 | rtattr_failure: | ||
506 | skb_trim(skb, b - skb->data); | ||
507 | return -1; | ||
508 | } | ||
509 | |||
510 | static struct tcf_proto_ops cls_tcindex_ops = { | ||
511 | .next = NULL, | ||
512 | .kind = "tcindex", | ||
513 | .classify = tcindex_classify, | ||
514 | .init = tcindex_init, | ||
515 | .destroy = tcindex_destroy, | ||
516 | .get = tcindex_get, | ||
517 | .put = tcindex_put, | ||
518 | .change = tcindex_change, | ||
519 | .delete = tcindex_delete, | ||
520 | .walk = tcindex_walk, | ||
521 | .dump = tcindex_dump, | ||
522 | .owner = THIS_MODULE, | ||
523 | }; | ||
524 | |||
525 | static int __init init_tcindex(void) | ||
526 | { | ||
527 | return register_tcf_proto_ops(&cls_tcindex_ops); | ||
528 | } | ||
529 | |||
530 | static void __exit exit_tcindex(void) | ||
531 | { | ||
532 | unregister_tcf_proto_ops(&cls_tcindex_ops); | ||
533 | } | ||
534 | |||
535 | module_init(init_tcindex) | ||
536 | module_exit(exit_tcindex) | ||
537 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c new file mode 100644 index 000000000000..364b87d86455 --- /dev/null +++ b/net/sched/cls_u32.c | |||
@@ -0,0 +1,828 @@ | |||
1 | /* | ||
2 | * net/sched/cls_u32.c Ugly (or Universal) 32bit key Packet Classifier. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
10 | * | ||
11 | * The filters are packed to hash tables of key nodes | ||
12 | * with a set of 32bit key/mask pairs at every node. | ||
13 | * Nodes reference next level hash tables etc. | ||
14 | * | ||
15 | * This scheme is the best universal classifier I managed to | ||
16 | * invent; it is not super-fast, but it is not slow (provided you | ||
17 | * program it correctly), and general enough. And its relative | ||
18 | * speed grows as the number of rules becomes larger. | ||
19 | * | ||
20 | * It seems to represent the best middle ground between | ||
21 | * speed and manageability, both by humans and by machines. | ||
22 | * | ||
23 | * It is especially useful for link sharing combined with QoS; | ||
24 | * pure RSVP doesn't need such a general approach and can use | ||
25 | * much simpler (and faster) schemes, such as cls_rsvp.c. | ||
26 | * | ||
27 | * JHS: We should remove the CONFIG_NET_CLS_IND from here | ||
28 | * eventually when the meta match extension is made available | ||
29 | * | ||
30 | * nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro> | ||
31 | */ | ||
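The per-node test this comment describes is a list of 32-bit value/mask pairs applied at fixed byte offsets; a node matches only if every pair matches, and it then either terminates with a class or points at a next-level hash table (see u32_classify() below). A minimal stand-alone version of the per-key compare (struct layout and packet values are illustrative, not the kernel's):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct u32_key {
	uint32_t val;   /* expected bits, on-the-wire byte order */
	uint32_t mask;  /* which bits to compare */
	int      off;   /* byte offset into the packet */
};

/* A node matches when ((word ^ val) & mask) == 0 for every key. */
static int keys_match(const struct u32_key *k, int nkeys, const uint8_t *pkt)
{
	for (; nkeys > 0; nkeys--, k++) {
		uint32_t word;

		memcpy(&word, pkt + k->off, sizeof(word));
		if ((word ^ k->val) & k->mask)
			return 0;
	}
	return 1;
}

int main(void)
{
	uint8_t pkt[4]   = { 0x45, 0x00, 0x00, 0x54 }; /* start of an IPv4 header */
	uint8_t val0[4]  = { 0x45, 0x00, 0x00, 0x00 }; /* version 4, header len 20 */
	uint8_t mask0[4] = { 0xff, 0x00, 0x00, 0x00 }; /* compare only the first byte */
	struct u32_key k = { .off = 0 };

	memcpy(&k.val,  val0,  sizeof(k.val));
	memcpy(&k.mask, mask0, sizeof(k.mask));
	printf("match: %d\n", keys_match(&k, 1, pkt)); /* prints 1 */
	return 0;
}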
32 | |||
33 | #include <asm/uaccess.h> | ||
34 | #include <asm/system.h> | ||
35 | #include <linux/bitops.h> | ||
36 | #include <linux/config.h> | ||
37 | #include <linux/module.h> | ||
38 | #include <linux/types.h> | ||
39 | #include <linux/kernel.h> | ||
40 | #include <linux/sched.h> | ||
41 | #include <linux/string.h> | ||
42 | #include <linux/mm.h> | ||
43 | #include <linux/socket.h> | ||
44 | #include <linux/sockios.h> | ||
45 | #include <linux/in.h> | ||
46 | #include <linux/errno.h> | ||
47 | #include <linux/interrupt.h> | ||
48 | #include <linux/if_ether.h> | ||
49 | #include <linux/inet.h> | ||
50 | #include <linux/netdevice.h> | ||
51 | #include <linux/etherdevice.h> | ||
52 | #include <linux/notifier.h> | ||
53 | #include <linux/rtnetlink.h> | ||
54 | #include <net/ip.h> | ||
55 | #include <net/route.h> | ||
56 | #include <linux/skbuff.h> | ||
57 | #include <net/sock.h> | ||
58 | #include <net/act_api.h> | ||
59 | #include <net/pkt_cls.h> | ||
60 | |||
61 | struct tc_u_knode | ||
62 | { | ||
63 | struct tc_u_knode *next; | ||
64 | u32 handle; | ||
65 | struct tc_u_hnode *ht_up; | ||
66 | struct tcf_exts exts; | ||
67 | #ifdef CONFIG_NET_CLS_IND | ||
68 | char indev[IFNAMSIZ]; | ||
69 | #endif | ||
70 | u8 fshift; | ||
71 | struct tcf_result res; | ||
72 | struct tc_u_hnode *ht_down; | ||
73 | #ifdef CONFIG_CLS_U32_PERF | ||
74 | struct tc_u32_pcnt *pf; | ||
75 | #endif | ||
76 | #ifdef CONFIG_CLS_U32_MARK | ||
77 | struct tc_u32_mark mark; | ||
78 | #endif | ||
79 | struct tc_u32_sel sel; | ||
80 | }; | ||
81 | |||
82 | struct tc_u_hnode | ||
83 | { | ||
84 | struct tc_u_hnode *next; | ||
85 | u32 handle; | ||
86 | u32 prio; | ||
87 | struct tc_u_common *tp_c; | ||
88 | int refcnt; | ||
89 | unsigned divisor; | ||
90 | struct tc_u_knode *ht[1]; | ||
91 | }; | ||
92 | |||
93 | struct tc_u_common | ||
94 | { | ||
95 | struct tc_u_common *next; | ||
96 | struct tc_u_hnode *hlist; | ||
97 | struct Qdisc *q; | ||
98 | int refcnt; | ||
99 | u32 hgenerator; | ||
100 | }; | ||
101 | |||
102 | static struct tcf_ext_map u32_ext_map = { | ||
103 | .action = TCA_U32_ACT, | ||
104 | .police = TCA_U32_POLICE | ||
105 | }; | ||
106 | |||
107 | static struct tc_u_common *u32_list; | ||
108 | |||
109 | static __inline__ unsigned u32_hash_fold(u32 key, struct tc_u32_sel *sel, u8 fshift) | ||
110 | { | ||
111 | unsigned h = (key & sel->hmask)>>fshift; | ||
112 | |||
113 | return h; | ||
114 | } | ||
115 | |||
116 | static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) | ||
117 | { | ||
118 | struct { | ||
119 | struct tc_u_knode *knode; | ||
120 | u8 *ptr; | ||
121 | } stack[TC_U32_MAXDEPTH]; | ||
122 | |||
123 | struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root; | ||
124 | u8 *ptr = skb->nh.raw; | ||
125 | struct tc_u_knode *n; | ||
126 | int sdepth = 0; | ||
127 | int off2 = 0; | ||
128 | int sel = 0; | ||
129 | #ifdef CONFIG_CLS_U32_PERF | ||
130 | int j; | ||
131 | #endif | ||
132 | int i, r; | ||
133 | |||
134 | next_ht: | ||
135 | n = ht->ht[sel]; | ||
136 | |||
137 | next_knode: | ||
138 | if (n) { | ||
139 | struct tc_u32_key *key = n->sel.keys; | ||
140 | |||
141 | #ifdef CONFIG_CLS_U32_PERF | ||
142 | n->pf->rcnt +=1; | ||
143 | j = 0; | ||
144 | #endif | ||
145 | |||
146 | #ifdef CONFIG_CLS_U32_MARK | ||
147 | if ((skb->nfmark & n->mark.mask) != n->mark.val) { | ||
148 | n = n->next; | ||
149 | goto next_knode; | ||
150 | } else { | ||
151 | n->mark.success++; | ||
152 | } | ||
153 | #endif | ||
154 | |||
155 | for (i = n->sel.nkeys; i>0; i--, key++) { | ||
156 | |||
157 | if ((*(u32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) { | ||
158 | n = n->next; | ||
159 | goto next_knode; | ||
160 | } | ||
161 | #ifdef CONFIG_CLS_U32_PERF | ||
162 | n->pf->kcnts[j] +=1; | ||
163 | j++; | ||
164 | #endif | ||
165 | } | ||
166 | if (n->ht_down == NULL) { | ||
167 | check_terminal: | ||
168 | if (n->sel.flags&TC_U32_TERMINAL) { | ||
169 | |||
170 | *res = n->res; | ||
171 | #ifdef CONFIG_NET_CLS_IND | ||
172 | if (!tcf_match_indev(skb, n->indev)) { | ||
173 | n = n->next; | ||
174 | goto next_knode; | ||
175 | } | ||
176 | #endif | ||
177 | #ifdef CONFIG_CLS_U32_PERF | ||
178 | n->pf->rhit +=1; | ||
179 | #endif | ||
180 | r = tcf_exts_exec(skb, &n->exts, res); | ||
181 | if (r < 0) { | ||
182 | n = n->next; | ||
183 | goto next_knode; | ||
184 | } | ||
185 | |||
186 | return r; | ||
187 | } | ||
188 | n = n->next; | ||
189 | goto next_knode; | ||
190 | } | ||
191 | |||
192 | /* PUSH */ | ||
193 | if (sdepth >= TC_U32_MAXDEPTH) | ||
194 | goto deadloop; | ||
195 | stack[sdepth].knode = n; | ||
196 | stack[sdepth].ptr = ptr; | ||
197 | sdepth++; | ||
198 | |||
199 | ht = n->ht_down; | ||
200 | sel = 0; | ||
201 | if (ht->divisor) | ||
202 | sel = ht->divisor&u32_hash_fold(*(u32*)(ptr+n->sel.hoff), &n->sel,n->fshift); | ||
203 | |||
204 | if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT))) | ||
205 | goto next_ht; | ||
206 | |||
207 | if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) { | ||
208 | off2 = n->sel.off + 3; | ||
209 | if (n->sel.flags&TC_U32_VAROFFSET) | ||
210 | off2 += ntohs(n->sel.offmask & *(u16*)(ptr+n->sel.offoff)) >>n->sel.offshift; | ||
211 | off2 &= ~3; | ||
212 | } | ||
213 | if (n->sel.flags&TC_U32_EAT) { | ||
214 | ptr += off2; | ||
215 | off2 = 0; | ||
216 | } | ||
217 | |||
218 | if (ptr < skb->tail) | ||
219 | goto next_ht; | ||
220 | } | ||
221 | |||
222 | /* POP */ | ||
223 | if (sdepth--) { | ||
224 | n = stack[sdepth].knode; | ||
225 | ht = n->ht_up; | ||
226 | ptr = stack[sdepth].ptr; | ||
227 | goto check_terminal; | ||
228 | } | ||
229 | return -1; | ||
230 | |||
231 | deadloop: | ||
232 | if (net_ratelimit()) | ||
233 | printk("cls_u32: dead loop\n"); | ||
234 | return -1; | ||
235 | } | ||
236 | |||
237 | static __inline__ struct tc_u_hnode * | ||
238 | u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) | ||
239 | { | ||
240 | struct tc_u_hnode *ht; | ||
241 | |||
242 | for (ht = tp_c->hlist; ht; ht = ht->next) | ||
243 | if (ht->handle == handle) | ||
244 | break; | ||
245 | |||
246 | return ht; | ||
247 | } | ||
248 | |||
249 | static __inline__ struct tc_u_knode * | ||
250 | u32_lookup_key(struct tc_u_hnode *ht, u32 handle) | ||
251 | { | ||
252 | unsigned sel; | ||
253 | struct tc_u_knode *n = NULL; | ||
254 | |||
255 | sel = TC_U32_HASH(handle); | ||
256 | if (sel > ht->divisor) | ||
257 | goto out; | ||
258 | |||
259 | for (n = ht->ht[sel]; n; n = n->next) | ||
260 | if (n->handle == handle) | ||
261 | break; | ||
262 | out: | ||
263 | return n; | ||
264 | } | ||
265 | |||
266 | |||
267 | static unsigned long u32_get(struct tcf_proto *tp, u32 handle) | ||
268 | { | ||
269 | struct tc_u_hnode *ht; | ||
270 | struct tc_u_common *tp_c = tp->data; | ||
271 | |||
272 | if (TC_U32_HTID(handle) == TC_U32_ROOT) | ||
273 | ht = tp->root; | ||
274 | else | ||
275 | ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle)); | ||
276 | |||
277 | if (!ht) | ||
278 | return 0; | ||
279 | |||
280 | if (TC_U32_KEY(handle) == 0) | ||
281 | return (unsigned long)ht; | ||
282 | |||
283 | return (unsigned long)u32_lookup_key(ht, handle); | ||
284 | } | ||
285 | |||
286 | static void u32_put(struct tcf_proto *tp, unsigned long f) | ||
287 | { | ||
288 | } | ||
289 | |||
290 | static u32 gen_new_htid(struct tc_u_common *tp_c) | ||
291 | { | ||
292 | int i = 0x800; | ||
293 | |||
294 | do { | ||
295 | if (++tp_c->hgenerator == 0x7FF) | ||
296 | tp_c->hgenerator = 1; | ||
297 | } while (--i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); | ||
298 | |||
299 | return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0; | ||
300 | } | ||
301 | |||
302 | static int u32_init(struct tcf_proto *tp) | ||
303 | { | ||
304 | struct tc_u_hnode *root_ht; | ||
305 | struct tc_u_common *tp_c; | ||
306 | |||
307 | for (tp_c = u32_list; tp_c; tp_c = tp_c->next) | ||
308 | if (tp_c->q == tp->q) | ||
309 | break; | ||
310 | |||
311 | root_ht = kmalloc(sizeof(*root_ht), GFP_KERNEL); | ||
312 | if (root_ht == NULL) | ||
313 | return -ENOBUFS; | ||
314 | |||
315 | memset(root_ht, 0, sizeof(*root_ht)); | ||
316 | root_ht->divisor = 0; | ||
317 | root_ht->refcnt++; | ||
318 | root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000; | ||
319 | root_ht->prio = tp->prio; | ||
320 | |||
321 | if (tp_c == NULL) { | ||
322 | tp_c = kmalloc(sizeof(*tp_c), GFP_KERNEL); | ||
323 | if (tp_c == NULL) { | ||
324 | kfree(root_ht); | ||
325 | return -ENOBUFS; | ||
326 | } | ||
327 | memset(tp_c, 0, sizeof(*tp_c)); | ||
328 | tp_c->q = tp->q; | ||
329 | tp_c->next = u32_list; | ||
330 | u32_list = tp_c; | ||
331 | } | ||
332 | |||
333 | tp_c->refcnt++; | ||
334 | root_ht->next = tp_c->hlist; | ||
335 | tp_c->hlist = root_ht; | ||
336 | root_ht->tp_c = tp_c; | ||
337 | |||
338 | tp->root = root_ht; | ||
339 | tp->data = tp_c; | ||
340 | return 0; | ||
341 | } | ||
342 | |||
343 | static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) | ||
344 | { | ||
345 | tcf_unbind_filter(tp, &n->res); | ||
346 | tcf_exts_destroy(tp, &n->exts); | ||
347 | if (n->ht_down) | ||
348 | n->ht_down->refcnt--; | ||
349 | #ifdef CONFIG_CLS_U32_PERF | ||
350 | if (n && (NULL != n->pf)) | ||
351 | kfree(n->pf); | ||
352 | #endif | ||
353 | kfree(n); | ||
354 | return 0; | ||
355 | } | ||
356 | |||
357 | static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key) | ||
358 | { | ||
359 | struct tc_u_knode **kp; | ||
360 | struct tc_u_hnode *ht = key->ht_up; | ||
361 | |||
362 | if (ht) { | ||
363 | for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) { | ||
364 | if (*kp == key) { | ||
365 | tcf_tree_lock(tp); | ||
366 | *kp = key->next; | ||
367 | tcf_tree_unlock(tp); | ||
368 | |||
369 | u32_destroy_key(tp, key); | ||
370 | return 0; | ||
371 | } | ||
372 | } | ||
373 | } | ||
374 | BUG_TRAP(0); | ||
375 | return 0; | ||
376 | } | ||
377 | |||
378 | static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) | ||
379 | { | ||
380 | struct tc_u_knode *n; | ||
381 | unsigned h; | ||
382 | |||
383 | for (h=0; h<=ht->divisor; h++) { | ||
384 | while ((n = ht->ht[h]) != NULL) { | ||
385 | ht->ht[h] = n->next; | ||
386 | |||
387 | u32_destroy_key(tp, n); | ||
388 | } | ||
389 | } | ||
390 | } | ||
391 | |||
392 | static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) | ||
393 | { | ||
394 | struct tc_u_common *tp_c = tp->data; | ||
395 | struct tc_u_hnode **hn; | ||
396 | |||
397 | BUG_TRAP(!ht->refcnt); | ||
398 | |||
399 | u32_clear_hnode(tp, ht); | ||
400 | |||
401 | for (hn = &tp_c->hlist; *hn; hn = &(*hn)->next) { | ||
402 | if (*hn == ht) { | ||
403 | *hn = ht->next; | ||
404 | kfree(ht); | ||
405 | return 0; | ||
406 | } | ||
407 | } | ||
408 | |||
409 | BUG_TRAP(0); | ||
410 | return -ENOENT; | ||
411 | } | ||
412 | |||
413 | static void u32_destroy(struct tcf_proto *tp) | ||
414 | { | ||
415 | struct tc_u_common *tp_c = tp->data; | ||
416 | struct tc_u_hnode *root_ht = xchg(&tp->root, NULL); | ||
417 | |||
418 | BUG_TRAP(root_ht != NULL); | ||
419 | |||
420 | if (root_ht && --root_ht->refcnt == 0) | ||
421 | u32_destroy_hnode(tp, root_ht); | ||
422 | |||
423 | if (--tp_c->refcnt == 0) { | ||
424 | struct tc_u_hnode *ht; | ||
425 | struct tc_u_common **tp_cp; | ||
426 | |||
427 | for (tp_cp = &u32_list; *tp_cp; tp_cp = &(*tp_cp)->next) { | ||
428 | if (*tp_cp == tp_c) { | ||
429 | *tp_cp = tp_c->next; | ||
430 | break; | ||
431 | } | ||
432 | } | ||
433 | |||
434 | for (ht=tp_c->hlist; ht; ht = ht->next) | ||
435 | u32_clear_hnode(tp, ht); | ||
436 | |||
437 | while ((ht = tp_c->hlist) != NULL) { | ||
438 | tp_c->hlist = ht->next; | ||
439 | |||
440 | BUG_TRAP(ht->refcnt == 0); | ||
441 | |||
442 | kfree(ht); | ||
443 | }; | ||
444 | |||
445 | kfree(tp_c); | ||
446 | } | ||
447 | |||
448 | tp->data = NULL; | ||
449 | } | ||
450 | |||
451 | static int u32_delete(struct tcf_proto *tp, unsigned long arg) | ||
452 | { | ||
453 | struct tc_u_hnode *ht = (struct tc_u_hnode*)arg; | ||
454 | |||
455 | if (ht == NULL) | ||
456 | return 0; | ||
457 | |||
458 | if (TC_U32_KEY(ht->handle)) | ||
459 | return u32_delete_key(tp, (struct tc_u_knode*)ht); | ||
460 | |||
461 | if (tp->root == ht) | ||
462 | return -EINVAL; | ||
463 | |||
464 | if (--ht->refcnt == 0) | ||
465 | u32_destroy_hnode(tp, ht); | ||
466 | |||
467 | return 0; | ||
468 | } | ||
469 | |||
470 | static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) | ||
471 | { | ||
472 | struct tc_u_knode *n; | ||
473 | unsigned i = 0x7FF; | ||
474 | |||
475 | for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next) | ||
476 | if (i < TC_U32_NODE(n->handle)) | ||
477 | i = TC_U32_NODE(n->handle); | ||
478 | i++; | ||
479 | |||
480 | return handle|(i>0xFFF ? 0xFFF : i); | ||
481 | } | ||
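gen_new_htid() and gen_new_kid() above imply the 32-bit u32 handle layout: the hash-table ID sits in the top 12 bits (which is why the generator is OR'ed with 0x800 and shifted left by 20), the bucket selected by TC_U32_HASH() sits in the next 8 bits, and the node number fills the low 12 bits. The exact field macros live in linux/pkt_cls.h; the split below is my reading of them, with illustrative helper names:

#include <stdint.h>
#include <stdio.h>

/* htid: bits 20..31, hash bucket: bits 12..19, node: bits 0..11 */
static uint32_t make_u32_handle(unsigned htid, unsigned bucket, unsigned node)
{
	return (htid << 20) | (bucket << 12) | node;
}

int main(void)
{
	uint32_t h = make_u32_handle(0x801, 0x05, 0x003); /* roughly tc's "801:5:3" */

	printf("htid=%#x hash=%#x node=%#x\n",
	       h >> 20, (h >> 12) & 0xFF, h & 0xFFF);
	return 0;
}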
482 | |||
483 | static int u32_set_parms(struct tcf_proto *tp, unsigned long base, | ||
484 | struct tc_u_hnode *ht, | ||
485 | struct tc_u_knode *n, struct rtattr **tb, | ||
486 | struct rtattr *est) | ||
487 | { | ||
488 | int err; | ||
489 | struct tcf_exts e; | ||
490 | |||
491 | err = tcf_exts_validate(tp, tb, est, &e, &u32_ext_map); | ||
492 | if (err < 0) | ||
493 | return err; | ||
494 | |||
495 | err = -EINVAL; | ||
496 | if (tb[TCA_U32_LINK-1]) { | ||
497 | u32 handle = *(u32*)RTA_DATA(tb[TCA_U32_LINK-1]); | ||
498 | struct tc_u_hnode *ht_down = NULL; | ||
499 | |||
500 | if (TC_U32_KEY(handle)) | ||
501 | goto errout; | ||
502 | |||
503 | if (handle) { | ||
504 | ht_down = u32_lookup_ht(ht->tp_c, handle); | ||
505 | |||
506 | if (ht_down == NULL) | ||
507 | goto errout; | ||
508 | ht_down->refcnt++; | ||
509 | } | ||
510 | |||
511 | tcf_tree_lock(tp); | ||
512 | ht_down = xchg(&n->ht_down, ht_down); | ||
513 | tcf_tree_unlock(tp); | ||
514 | |||
515 | if (ht_down) | ||
516 | ht_down->refcnt--; | ||
517 | } | ||
518 | if (tb[TCA_U32_CLASSID-1]) { | ||
519 | n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]); | ||
520 | tcf_bind_filter(tp, &n->res, base); | ||
521 | } | ||
522 | |||
523 | #ifdef CONFIG_NET_CLS_IND | ||
524 | if (tb[TCA_U32_INDEV-1]) { | ||
525 | int err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV-1]); | ||
526 | if (err < 0) | ||
527 | goto errout; | ||
528 | } | ||
529 | #endif | ||
530 | tcf_exts_change(tp, &n->exts, &e); | ||
531 | |||
532 | return 0; | ||
533 | errout: | ||
534 | tcf_exts_destroy(tp, &e); | ||
535 | return err; | ||
536 | } | ||
537 | |||
538 | static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, | ||
539 | struct rtattr **tca, | ||
540 | unsigned long *arg) | ||
541 | { | ||
542 | struct tc_u_common *tp_c = tp->data; | ||
543 | struct tc_u_hnode *ht; | ||
544 | struct tc_u_knode *n; | ||
545 | struct tc_u32_sel *s; | ||
546 | struct rtattr *opt = tca[TCA_OPTIONS-1]; | ||
547 | struct rtattr *tb[TCA_U32_MAX]; | ||
548 | u32 htid; | ||
549 | int err; | ||
550 | |||
551 | if (opt == NULL) | ||
552 | return handle ? -EINVAL : 0; | ||
553 | |||
554 | if (rtattr_parse_nested(tb, TCA_U32_MAX, opt) < 0) | ||
555 | return -EINVAL; | ||
556 | |||
557 | if ((n = (struct tc_u_knode*)*arg) != NULL) { | ||
558 | if (TC_U32_KEY(n->handle) == 0) | ||
559 | return -EINVAL; | ||
560 | |||
561 | return u32_set_parms(tp, base, n->ht_up, n, tb, tca[TCA_RATE-1]); | ||
562 | } | ||
563 | |||
564 | if (tb[TCA_U32_DIVISOR-1]) { | ||
565 | unsigned divisor = *(unsigned*)RTA_DATA(tb[TCA_U32_DIVISOR-1]); | ||
566 | |||
567 | if (--divisor > 0x100) | ||
568 | return -EINVAL; | ||
569 | if (TC_U32_KEY(handle)) | ||
570 | return -EINVAL; | ||
571 | if (handle == 0) { | ||
572 | handle = gen_new_htid(tp->data); | ||
573 | if (handle == 0) | ||
574 | return -ENOMEM; | ||
575 | } | ||
576 | ht = kmalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL); | ||
577 | if (ht == NULL) | ||
578 | return -ENOBUFS; | ||
579 | memset(ht, 0, sizeof(*ht) + divisor*sizeof(void*)); | ||
580 | ht->tp_c = tp_c; | ||
581 | ht->refcnt = 0; | ||
582 | ht->divisor = divisor; | ||
583 | ht->handle = handle; | ||
584 | ht->prio = tp->prio; | ||
585 | ht->next = tp_c->hlist; | ||
586 | tp_c->hlist = ht; | ||
587 | *arg = (unsigned long)ht; | ||
588 | return 0; | ||
589 | } | ||
590 | |||
591 | if (tb[TCA_U32_HASH-1]) { | ||
592 | htid = *(unsigned*)RTA_DATA(tb[TCA_U32_HASH-1]); | ||
593 | if (TC_U32_HTID(htid) == TC_U32_ROOT) { | ||
594 | ht = tp->root; | ||
595 | htid = ht->handle; | ||
596 | } else { | ||
597 | ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid)); | ||
598 | if (ht == NULL) | ||
599 | return -EINVAL; | ||
600 | } | ||
601 | } else { | ||
602 | ht = tp->root; | ||
603 | htid = ht->handle; | ||
604 | } | ||
605 | |||
606 | if (ht->divisor < TC_U32_HASH(htid)) | ||
607 | return -EINVAL; | ||
608 | |||
609 | if (handle) { | ||
610 | if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid)) | ||
611 | return -EINVAL; | ||
612 | handle = htid | TC_U32_NODE(handle); | ||
613 | } else | ||
614 | handle = gen_new_kid(ht, htid); | ||
615 | |||
616 | if (tb[TCA_U32_SEL-1] == 0 || | ||
617 | RTA_PAYLOAD(tb[TCA_U32_SEL-1]) < sizeof(struct tc_u32_sel)) | ||
618 | return -EINVAL; | ||
619 | |||
620 | s = RTA_DATA(tb[TCA_U32_SEL-1]); | ||
621 | |||
622 | n = kmalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); | ||
623 | if (n == NULL) | ||
624 | return -ENOBUFS; | ||
625 | |||
626 | memset(n, 0, sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key)); | ||
627 | #ifdef CONFIG_CLS_U32_PERF | ||
628 | n->pf = kmalloc(sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64), GFP_KERNEL); | ||
629 | if (n->pf == NULL) { | ||
630 | kfree(n); | ||
631 | return -ENOBUFS; | ||
632 | } | ||
633 | memset(n->pf, 0, sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64)); | ||
634 | #endif | ||
635 | |||
636 | memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); | ||
637 | n->ht_up = ht; | ||
638 | n->handle = handle; | ||
639 | { | ||
640 | u8 i = 0; | ||
641 | u32 mask = s->hmask; | ||
642 | if (mask) { | ||
643 | while (!(mask & 1)) { | ||
644 | i++; | ||
645 | mask>>=1; | ||
646 | } | ||
647 | } | ||
648 | n->fshift = i; | ||
649 | } | ||
650 | |||
651 | #ifdef CONFIG_CLS_U32_MARK | ||
652 | if (tb[TCA_U32_MARK-1]) { | ||
653 | struct tc_u32_mark *mark; | ||
654 | |||
655 | if (RTA_PAYLOAD(tb[TCA_U32_MARK-1]) < sizeof(struct tc_u32_mark)) { | ||
656 | #ifdef CONFIG_CLS_U32_PERF | ||
657 | kfree(n->pf); | ||
658 | #endif | ||
659 | kfree(n); | ||
660 | return -EINVAL; | ||
661 | } | ||
662 | mark = RTA_DATA(tb[TCA_U32_MARK-1]); | ||
663 | memcpy(&n->mark, mark, sizeof(struct tc_u32_mark)); | ||
664 | n->mark.success = 0; | ||
665 | } | ||
666 | #endif | ||
667 | |||
668 | err = u32_set_parms(tp, base, ht, n, tb, tca[TCA_RATE-1]); | ||
669 | if (err == 0) { | ||
670 | struct tc_u_knode **ins; | ||
671 | for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) | ||
672 | if (TC_U32_NODE(handle) < TC_U32_NODE((*ins)->handle)) | ||
673 | break; | ||
674 | |||
675 | n->next = *ins; | ||
676 | wmb(); | ||
677 | *ins = n; | ||
678 | |||
679 | *arg = (unsigned long)n; | ||
680 | return 0; | ||
681 | } | ||
682 | #ifdef CONFIG_CLS_U32_PERF | ||
683 | if (n && (NULL != n->pf)) | ||
684 | kfree(n->pf); | ||
685 | #endif | ||
686 | kfree(n); | ||
687 | return err; | ||
688 | } | ||
689 | |||
690 | static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) | ||
691 | { | ||
692 | struct tc_u_common *tp_c = tp->data; | ||
693 | struct tc_u_hnode *ht; | ||
694 | struct tc_u_knode *n; | ||
695 | unsigned h; | ||
696 | |||
697 | if (arg->stop) | ||
698 | return; | ||
699 | |||
700 | for (ht = tp_c->hlist; ht; ht = ht->next) { | ||
701 | if (ht->prio != tp->prio) | ||
702 | continue; | ||
703 | if (arg->count >= arg->skip) { | ||
704 | if (arg->fn(tp, (unsigned long)ht, arg) < 0) { | ||
705 | arg->stop = 1; | ||
706 | return; | ||
707 | } | ||
708 | } | ||
709 | arg->count++; | ||
710 | for (h = 0; h <= ht->divisor; h++) { | ||
711 | for (n = ht->ht[h]; n; n = n->next) { | ||
712 | if (arg->count < arg->skip) { | ||
713 | arg->count++; | ||
714 | continue; | ||
715 | } | ||
716 | if (arg->fn(tp, (unsigned long)n, arg) < 0) { | ||
717 | arg->stop = 1; | ||
718 | return; | ||
719 | } | ||
720 | arg->count++; | ||
721 | } | ||
722 | } | ||
723 | } | ||
724 | } | ||
725 | |||
726 | static int u32_dump(struct tcf_proto *tp, unsigned long fh, | ||
727 | struct sk_buff *skb, struct tcmsg *t) | ||
728 | { | ||
729 | struct tc_u_knode *n = (struct tc_u_knode*)fh; | ||
730 | unsigned char *b = skb->tail; | ||
731 | struct rtattr *rta; | ||
732 | |||
733 | if (n == NULL) | ||
734 | return skb->len; | ||
735 | |||
736 | t->tcm_handle = n->handle; | ||
737 | |||
738 | rta = (struct rtattr*)b; | ||
739 | RTA_PUT(skb, TCA_OPTIONS, 0, NULL); | ||
740 | |||
741 | if (TC_U32_KEY(n->handle) == 0) { | ||
742 | struct tc_u_hnode *ht = (struct tc_u_hnode*)fh; | ||
743 | u32 divisor = ht->divisor+1; | ||
744 | RTA_PUT(skb, TCA_U32_DIVISOR, 4, &divisor); | ||
745 | } else { | ||
746 | RTA_PUT(skb, TCA_U32_SEL, | ||
747 | sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), | ||
748 | &n->sel); | ||
749 | if (n->ht_up) { | ||
750 | u32 htid = n->handle & 0xFFFFF000; | ||
751 | RTA_PUT(skb, TCA_U32_HASH, 4, &htid); | ||
752 | } | ||
753 | if (n->res.classid) | ||
754 | RTA_PUT(skb, TCA_U32_CLASSID, 4, &n->res.classid); | ||
755 | if (n->ht_down) | ||
756 | RTA_PUT(skb, TCA_U32_LINK, 4, &n->ht_down->handle); | ||
757 | |||
758 | #ifdef CONFIG_CLS_U32_MARK | ||
759 | if (n->mark.val || n->mark.mask) | ||
760 | RTA_PUT(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark); | ||
761 | #endif | ||
762 | |||
763 | if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0) | ||
764 | goto rtattr_failure; | ||
765 | |||
766 | #ifdef CONFIG_NET_CLS_IND | ||
767 | if(strlen(n->indev)) | ||
768 | RTA_PUT(skb, TCA_U32_INDEV, IFNAMSIZ, n->indev); | ||
769 | #endif | ||
770 | #ifdef CONFIG_CLS_U32_PERF | ||
771 | RTA_PUT(skb, TCA_U32_PCNT, | ||
772 | sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), | ||
773 | n->pf); | ||
774 | #endif | ||
775 | } | ||
776 | |||
777 | rta->rta_len = skb->tail - b; | ||
778 | if (TC_U32_KEY(n->handle)) | ||
779 | if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0) | ||
780 | goto rtattr_failure; | ||
781 | return skb->len; | ||
782 | |||
783 | rtattr_failure: | ||
784 | skb_trim(skb, b - skb->data); | ||
785 | return -1; | ||
786 | } | ||
787 | |||
788 | static struct tcf_proto_ops cls_u32_ops = { | ||
789 | .next = NULL, | ||
790 | .kind = "u32", | ||
791 | .classify = u32_classify, | ||
792 | .init = u32_init, | ||
793 | .destroy = u32_destroy, | ||
794 | .get = u32_get, | ||
795 | .put = u32_put, | ||
796 | .change = u32_change, | ||
797 | .delete = u32_delete, | ||
798 | .walk = u32_walk, | ||
799 | .dump = u32_dump, | ||
800 | .owner = THIS_MODULE, | ||
801 | }; | ||
802 | |||
803 | static int __init init_u32(void) | ||
804 | { | ||
805 | printk("u32 classifier\n"); | ||
806 | #ifdef CONFIG_CLS_U32_PERF | ||
807 | printk(" Perfomance counters on\n"); | ||
808 | #endif | ||
809 | #ifdef CONFIG_NET_CLS_POLICE | ||
810 | printk(" OLD policer on \n"); | ||
811 | #endif | ||
812 | #ifdef CONFIG_NET_CLS_IND | ||
813 | printk(" input device check on \n"); | ||
814 | #endif | ||
815 | #ifdef CONFIG_NET_CLS_ACT | ||
816 | printk(" Actions configured \n"); | ||
817 | #endif | ||
818 | return register_tcf_proto_ops(&cls_u32_ops); | ||
819 | } | ||
820 | |||
821 | static void __exit exit_u32(void) | ||
822 | { | ||
823 | unregister_tcf_proto_ops(&cls_u32_ops); | ||
824 | } | ||
825 | |||
826 | module_init(init_u32) | ||
827 | module_exit(exit_u32) | ||
828 | MODULE_LICENSE("GPL"); | ||
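[Editor's note — not part of the diff above] The handle arithmetic in u32_change() and gen_new_kid() packs a hash table id, a bucket index and a key node id into a single 32-bit handle. The standalone sketch below illustrates that layout; the U32_* macros are local stand-ins written for this note (assumed to mirror the kernel's TC_U32_HTID/TC_U32_HASH/TC_U32_NODE/TC_U32_KEY definitions), so treat the exact bit positions as an assumption rather than a normative statement.

/* Standalone sketch of the assumed u32 handle layout; local macros only,
 * written for illustration of the code above. */
#include <stdio.h>

#define U32_HTID(h) ((h) & 0xFFF00000)   /* hash table id, top 12 bits */
#define U32_HASH(h) (((h) >> 12) & 0xFF) /* bucket within the table    */
#define U32_NODE(h) ((h) & 0xFFF)        /* key node within the bucket */
#define U32_KEY(h)  ((h) & 0xFFFFF)      /* non-zero for key nodes     */

int main(void)
{
    unsigned int handle = 0x80000000 | (0x12 << 12) | 0x7FF;

    printf("htid 0x%x hash 0x%x node 0x%x key? %s\n",
           U32_HTID(handle), U32_HASH(handle), U32_NODE(handle),
           U32_KEY(handle) ? "yes" : "no");
    return 0;
}

Read this way, the checks in u32_change() follow naturally: a hash table handle must have no key bits set, and a user-supplied key handle may only carry the htid of the table it is being inserted into.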
diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c new file mode 100644 index 000000000000..bf1f00f8b1bf --- /dev/null +++ b/net/sched/em_cmp.c | |||
@@ -0,0 +1,101 @@ | |||
1 | /* | ||
2 | * net/sched/em_cmp.c Simple packet data comparison ematch | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Thomas Graf <tgraf@suug.ch> | ||
10 | */ | ||
11 | |||
12 | #include <linux/config.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/skbuff.h> | ||
17 | #include <linux/tc_ematch/tc_em_cmp.h> | ||
18 | #include <net/pkt_cls.h> | ||
19 | |||
20 | static inline int cmp_needs_transformation(struct tcf_em_cmp *cmp) | ||
21 | { | ||
22 | return unlikely(cmp->flags & TCF_EM_CMP_TRANS); | ||
23 | } | ||
24 | |||
25 | static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em, | ||
26 | struct tcf_pkt_info *info) | ||
27 | { | ||
28 | struct tcf_em_cmp *cmp = (struct tcf_em_cmp *) em->data; | ||
29 | unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer) + cmp->off; | ||
30 | u32 val = 0; | ||
31 | |||
32 | if (!tcf_valid_offset(skb, ptr, cmp->align)) | ||
33 | return 0; | ||
34 | |||
35 | switch (cmp->align) { | ||
36 | case TCF_EM_ALIGN_U8: | ||
37 | val = *ptr; | ||
38 | break; | ||
39 | |||
40 | case TCF_EM_ALIGN_U16: | ||
41 | val = *ptr << 8; | ||
42 | val |= *(ptr+1); | ||
43 | |||
44 | if (cmp_needs_transformation(cmp)) | ||
45 | val = be16_to_cpu(val); | ||
46 | break; | ||
47 | |||
48 | case TCF_EM_ALIGN_U32: | ||
49 | /* Worth checking boundaries? The branching seems | ||
50 | * to get worse. Visit again. */ | ||
51 | val = *ptr << 24; | ||
52 | val |= *(ptr+1) << 16; | ||
53 | val |= *(ptr+2) << 8; | ||
54 | val |= *(ptr+3); | ||
55 | |||
56 | if (cmp_needs_transformation(cmp)) | ||
57 | val = be32_to_cpu(val); | ||
58 | break; | ||
59 | |||
60 | default: | ||
61 | return 0; | ||
62 | } | ||
63 | |||
64 | if (cmp->mask) | ||
65 | val &= cmp->mask; | ||
66 | |||
67 | switch (cmp->opnd) { | ||
68 | case TCF_EM_OPND_EQ: | ||
69 | return val == cmp->val; | ||
70 | case TCF_EM_OPND_LT: | ||
71 | return val < cmp->val; | ||
72 | case TCF_EM_OPND_GT: | ||
73 | return val > cmp->val; | ||
74 | } | ||
75 | |||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | static struct tcf_ematch_ops em_cmp_ops = { | ||
80 | .kind = TCF_EM_CMP, | ||
81 | .datalen = sizeof(struct tcf_em_cmp), | ||
82 | .match = em_cmp_match, | ||
83 | .owner = THIS_MODULE, | ||
84 | .link = LIST_HEAD_INIT(em_cmp_ops.link) | ||
85 | }; | ||
86 | |||
87 | static int __init init_em_cmp(void) | ||
88 | { | ||
89 | return tcf_em_register(&em_cmp_ops); | ||
90 | } | ||
91 | |||
92 | static void __exit exit_em_cmp(void) | ||
93 | { | ||
94 | tcf_em_unregister(&em_cmp_ops); | ||
95 | } | ||
96 | |||
97 | MODULE_LICENSE("GPL"); | ||
98 | |||
99 | module_init(init_em_cmp); | ||
100 | module_exit(exit_em_cmp); | ||
101 | |||
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c new file mode 100644 index 000000000000..f1eeaf65cee5 --- /dev/null +++ b/net/sched/em_meta.c | |||
@@ -0,0 +1,661 @@ | |||
1 | /* | ||
2 | * net/sched/em_meta.c Metadata ematch | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Thomas Graf <tgraf@suug.ch> | ||
10 | * | ||
11 | * ========================================================================== | ||
12 | * | ||
13 | * The metadata ematch compares two meta objects where each object | ||
14 | * represents either a meta value stored in the kernel or a static | ||
15 | * value provided by userspace. The objects are not provided by | ||
16 | * userspace itself but rather by a definition providing the information | ||
17 | * to build them. Every object is of a certain type which must be | ||
18 | * equal to the type of the object it is being compared to. | ||
19 | * | ||
20 | * The definition of an object consists of the type (meta type), an | ||
21 | * identifier (meta id) and additional type specific information. | ||
22 | * The meta id is either TCF_META_ID_VALUE for values provided by | ||
23 | * userspace or an index into the meta operations table consisting of | ||
24 | * function pointers to type specific meta data collectors returning | ||
25 | * the value of the requested meta value. | ||
26 | * | ||
27 | * lvalue rvalue | ||
28 | * +-----------+ +-----------+ | ||
29 | * | type: INT | | type: INT | | ||
30 | * def | id: INDEV | | id: VALUE | | ||
31 | * | data: | | data: 3 | | ||
32 | * +-----------+ +-----------+ | ||
33 | * | | | ||
34 | * ---> meta_ops[INT][INDEV](...) | | ||
35 | * | | | ||
36 | * ----------- | | ||
37 | * V V | ||
38 | * +-----------+ +-----------+ | ||
39 | * | type: INT | | type: INT | | ||
40 | * obj | id: INDEV | | id: VALUE | | ||
41 | * | data: 2 |<--data got filled out | data: 3 | | ||
42 | * +-----------+ +-----------+ | ||
43 | * | | | ||
44 | * --------------> 2 equals 3 <-------------- | ||
45 | * | ||
46 | * This is a simplified schema, the complexity varies depending | ||
47 | * on the meta type. Obviously, the length of the data must also | ||
48 | * be provided for non-numeric types. | ||
49 | * | ||
50 | * Additionally, type dependent modifiers such as shift operators | ||
51 | * or masks may be applied to extend the functionality. As of now, | ||
52 | * the variable length type supports shifting the byte string to | ||
53 | * the right, eating up any number of octets and thus supporting | ||
54 | * wildcard interface name comparisons such as "ppp%" matching | ||
55 | * ppp0..9. | ||
56 | * | ||
57 | * NOTE: Certain meta values depend on other subsystems and are | ||
58 | * only available if that subsystem is enabled in the kernel. | ||
59 | */ | ||
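[Editor's note — not part of the diff above] A minimal userspace sketch of the lvalue/rvalue comparison described in the comment block, using hypothetical stand-in types rather than the kernel's meta_value/meta_obj structures. The lvalue plays the role of a value collected in the kernel (for example the incoming interface index) and the rvalue the constant supplied by userspace; the three-way compare mirrors the TCF_META_TYPE_INT operations further down in this file.

#include <stdio.h>

/* Hypothetical stand-in for the kernel's meta_obj, for illustration only. */
struct obj { unsigned long value; };

static int compare_int(const struct obj *a, const struct obj *b)
{
    /* three-way compare of the collected values */
    if (a->value == b->value)
        return 0;
    return a->value < b->value ? -1 : 1;
}

int main(void)
{
    struct obj lvalue = { .value = 2 };  /* pretend: collected INDEV index */
    struct obj rvalue = { .value = 3 };  /* constant from userspace        */

    printf("eq=%d lt=%d gt=%d\n",
           compare_int(&lvalue, &rvalue) == 0,
           compare_int(&lvalue, &rvalue) < 0,
           compare_int(&lvalue, &rvalue) > 0);
    return 0;
}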
60 | |||
61 | #include <linux/config.h> | ||
62 | #include <linux/module.h> | ||
63 | #include <linux/types.h> | ||
64 | #include <linux/kernel.h> | ||
65 | #include <linux/sched.h> | ||
66 | #include <linux/string.h> | ||
67 | #include <linux/skbuff.h> | ||
68 | #include <linux/random.h> | ||
69 | #include <linux/tc_ematch/tc_em_meta.h> | ||
70 | #include <net/dst.h> | ||
71 | #include <net/route.h> | ||
72 | #include <net/pkt_cls.h> | ||
73 | |||
74 | struct meta_obj | ||
75 | { | ||
76 | unsigned long value; | ||
77 | unsigned int len; | ||
78 | }; | ||
79 | |||
80 | struct meta_value | ||
81 | { | ||
82 | struct tcf_meta_val hdr; | ||
83 | unsigned long val; | ||
84 | unsigned int len; | ||
85 | }; | ||
86 | |||
87 | struct meta_match | ||
88 | { | ||
89 | struct meta_value lvalue; | ||
90 | struct meta_value rvalue; | ||
91 | }; | ||
92 | |||
93 | static inline int meta_id(struct meta_value *v) | ||
94 | { | ||
95 | return TCF_META_ID(v->hdr.kind); | ||
96 | } | ||
97 | |||
98 | static inline int meta_type(struct meta_value *v) | ||
99 | { | ||
100 | return TCF_META_TYPE(v->hdr.kind); | ||
101 | } | ||
102 | |||
103 | #define META_COLLECTOR(FUNC) static void meta_##FUNC(struct sk_buff *skb, \ | ||
104 | struct tcf_pkt_info *info, struct meta_value *v, \ | ||
105 | struct meta_obj *dst, int *err) | ||
106 | |||
107 | /************************************************************************** | ||
108 | * System status & misc | ||
109 | **************************************************************************/ | ||
110 | |||
111 | META_COLLECTOR(int_random) | ||
112 | { | ||
113 | get_random_bytes(&dst->value, sizeof(dst->value)); | ||
114 | } | ||
115 | |||
116 | static inline unsigned long fixed_loadavg(int load) | ||
117 | { | ||
118 | int rnd_load = load + (FIXED_1/200); | ||
119 | int rnd_frac = ((rnd_load & (FIXED_1-1)) * 100) >> FSHIFT; | ||
120 | |||
121 | return ((rnd_load >> FSHIFT) * 100) + rnd_frac; | ||
122 | } | ||
123 | |||
124 | META_COLLECTOR(int_loadavg_0) | ||
125 | { | ||
126 | dst->value = fixed_loadavg(avenrun[0]); | ||
127 | } | ||
128 | |||
129 | META_COLLECTOR(int_loadavg_1) | ||
130 | { | ||
131 | dst->value = fixed_loadavg(avenrun[1]); | ||
132 | } | ||
133 | |||
134 | META_COLLECTOR(int_loadavg_2) | ||
135 | { | ||
136 | dst->value = fixed_loadavg(avenrun[2]); | ||
137 | } | ||
138 | |||
139 | /************************************************************************** | ||
140 | * Device names & indices | ||
141 | **************************************************************************/ | ||
142 | |||
143 | static inline int int_dev(struct net_device *dev, struct meta_obj *dst) | ||
144 | { | ||
145 | if (unlikely(dev == NULL)) | ||
146 | return -1; | ||
147 | |||
148 | dst->value = dev->ifindex; | ||
149 | return 0; | ||
150 | } | ||
151 | |||
152 | static inline int var_dev(struct net_device *dev, struct meta_obj *dst) | ||
153 | { | ||
154 | if (unlikely(dev == NULL)) | ||
155 | return -1; | ||
156 | |||
157 | dst->value = (unsigned long) dev->name; | ||
158 | dst->len = strlen(dev->name); | ||
159 | return 0; | ||
160 | } | ||
161 | |||
162 | META_COLLECTOR(int_dev) | ||
163 | { | ||
164 | *err = int_dev(skb->dev, dst); | ||
165 | } | ||
166 | |||
167 | META_COLLECTOR(var_dev) | ||
168 | { | ||
169 | *err = var_dev(skb->dev, dst); | ||
170 | } | ||
171 | |||
172 | META_COLLECTOR(int_indev) | ||
173 | { | ||
174 | *err = int_dev(skb->input_dev, dst); | ||
175 | } | ||
176 | |||
177 | META_COLLECTOR(var_indev) | ||
178 | { | ||
179 | *err = var_dev(skb->input_dev, dst); | ||
180 | } | ||
181 | |||
182 | META_COLLECTOR(int_realdev) | ||
183 | { | ||
184 | *err = int_dev(skb->real_dev, dst); | ||
185 | } | ||
186 | |||
187 | META_COLLECTOR(var_realdev) | ||
188 | { | ||
189 | *err = var_dev(skb->real_dev, dst); | ||
190 | } | ||
191 | |||
192 | /************************************************************************** | ||
193 | * skb attributes | ||
194 | **************************************************************************/ | ||
195 | |||
196 | META_COLLECTOR(int_priority) | ||
197 | { | ||
198 | dst->value = skb->priority; | ||
199 | } | ||
200 | |||
201 | META_COLLECTOR(int_protocol) | ||
202 | { | ||
203 | /* Let userspace take care of the byte ordering */ | ||
204 | dst->value = skb->protocol; | ||
205 | } | ||
206 | |||
207 | META_COLLECTOR(int_security) | ||
208 | { | ||
209 | dst->value = skb->security; | ||
210 | } | ||
211 | |||
212 | META_COLLECTOR(int_pkttype) | ||
213 | { | ||
214 | dst->value = skb->pkt_type; | ||
215 | } | ||
216 | |||
217 | META_COLLECTOR(int_pktlen) | ||
218 | { | ||
219 | dst->value = skb->len; | ||
220 | } | ||
221 | |||
222 | META_COLLECTOR(int_datalen) | ||
223 | { | ||
224 | dst->value = skb->data_len; | ||
225 | } | ||
226 | |||
227 | META_COLLECTOR(int_maclen) | ||
228 | { | ||
229 | dst->value = skb->mac_len; | ||
230 | } | ||
231 | |||
232 | /************************************************************************** | ||
233 | * Netfilter | ||
234 | **************************************************************************/ | ||
235 | |||
236 | #ifdef CONFIG_NETFILTER | ||
237 | META_COLLECTOR(int_nfmark) | ||
238 | { | ||
239 | dst->value = skb->nfmark; | ||
240 | } | ||
241 | #endif | ||
242 | |||
243 | /************************************************************************** | ||
244 | * Traffic Control | ||
245 | **************************************************************************/ | ||
246 | |||
247 | META_COLLECTOR(int_tcindex) | ||
248 | { | ||
249 | dst->value = skb->tc_index; | ||
250 | } | ||
251 | |||
252 | #ifdef CONFIG_NET_CLS_ACT | ||
253 | META_COLLECTOR(int_tcverd) | ||
254 | { | ||
255 | dst->value = skb->tc_verd; | ||
256 | } | ||
257 | |||
258 | META_COLLECTOR(int_tcclassid) | ||
259 | { | ||
260 | dst->value = skb->tc_classid; | ||
261 | } | ||
262 | #endif | ||
263 | |||
264 | /************************************************************************** | ||
265 | * Routing | ||
266 | **************************************************************************/ | ||
267 | |||
268 | #ifdef CONFIG_NET_CLS_ROUTE | ||
269 | META_COLLECTOR(int_rtclassid) | ||
270 | { | ||
271 | if (unlikely(skb->dst == NULL)) | ||
272 | *err = -1; | ||
273 | else | ||
274 | dst->value = skb->dst->tclassid; | ||
275 | } | ||
276 | #endif | ||
277 | |||
278 | META_COLLECTOR(int_rtiif) | ||
279 | { | ||
280 | if (unlikely(skb->dst == NULL)) | ||
281 | *err = -1; | ||
282 | else | ||
283 | dst->value = ((struct rtable*) skb->dst)->fl.iif; | ||
284 | } | ||
285 | |||
286 | /************************************************************************** | ||
287 | * Meta value collectors assignment table | ||
288 | **************************************************************************/ | ||
289 | |||
290 | struct meta_ops | ||
291 | { | ||
292 | void (*get)(struct sk_buff *, struct tcf_pkt_info *, | ||
293 | struct meta_value *, struct meta_obj *, int *); | ||
294 | }; | ||
295 | |||
296 | /* Meta value operations table listing all meta value collectors and | ||
297 | * assigning them to a type and meta id. */ | ||
298 | static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { | ||
299 | [TCF_META_TYPE_VAR] = { | ||
300 | [TCF_META_ID_DEV] = { .get = meta_var_dev }, | ||
301 | [TCF_META_ID_INDEV] = { .get = meta_var_indev }, | ||
302 | [TCF_META_ID_REALDEV] = { .get = meta_var_realdev } | ||
303 | }, | ||
304 | [TCF_META_TYPE_INT] = { | ||
305 | [TCF_META_ID_RANDOM] = { .get = meta_int_random }, | ||
306 | [TCF_META_ID_LOADAVG_0] = { .get = meta_int_loadavg_0 }, | ||
307 | [TCF_META_ID_LOADAVG_1] = { .get = meta_int_loadavg_1 }, | ||
308 | [TCF_META_ID_LOADAVG_2] = { .get = meta_int_loadavg_2 }, | ||
309 | [TCF_META_ID_DEV] = { .get = meta_int_dev }, | ||
310 | [TCF_META_ID_INDEV] = { .get = meta_int_indev }, | ||
311 | [TCF_META_ID_REALDEV] = { .get = meta_int_realdev }, | ||
312 | [TCF_META_ID_PRIORITY] = { .get = meta_int_priority }, | ||
313 | [TCF_META_ID_PROTOCOL] = { .get = meta_int_protocol }, | ||
314 | [TCF_META_ID_SECURITY] = { .get = meta_int_security }, | ||
315 | [TCF_META_ID_PKTTYPE] = { .get = meta_int_pkttype }, | ||
316 | [TCF_META_ID_PKTLEN] = { .get = meta_int_pktlen }, | ||
317 | [TCF_META_ID_DATALEN] = { .get = meta_int_datalen }, | ||
318 | [TCF_META_ID_MACLEN] = { .get = meta_int_maclen }, | ||
319 | #ifdef CONFIG_NETFILTER | ||
320 | [TCF_META_ID_NFMARK] = { .get = meta_int_nfmark }, | ||
321 | #endif | ||
322 | [TCF_META_ID_TCINDEX] = { .get = meta_int_tcindex }, | ||
323 | #ifdef CONFIG_NET_CLS_ACT | ||
324 | [TCF_META_ID_TCVERDICT] = { .get = meta_int_tcverd }, | ||
325 | [TCF_META_ID_TCCLASSID] = { .get = meta_int_tcclassid }, | ||
326 | #endif | ||
327 | #ifdef CONFIG_NET_CLS_ROUTE | ||
328 | [TCF_META_ID_RTCLASSID] = { .get = meta_int_rtclassid }, | ||
329 | #endif | ||
330 | [TCF_META_ID_RTIIF] = { .get = meta_int_rtiif } | ||
331 | } | ||
332 | }; | ||
333 | |||
334 | static inline struct meta_ops * meta_ops(struct meta_value *val) | ||
335 | { | ||
336 | return &__meta_ops[meta_type(val)][meta_id(val)]; | ||
337 | } | ||
338 | |||
339 | /************************************************************************** | ||
340 | * Type specific operations for TCF_META_TYPE_VAR | ||
341 | **************************************************************************/ | ||
342 | |||
343 | static int meta_var_compare(struct meta_obj *a, struct meta_obj *b) | ||
344 | { | ||
345 | int r = a->len - b->len; | ||
346 | |||
347 | if (r == 0) | ||
348 | r = memcmp((void *) a->value, (void *) b->value, a->len); | ||
349 | |||
350 | return r; | ||
351 | } | ||
352 | |||
353 | static int meta_var_change(struct meta_value *dst, struct rtattr *rta) | ||
354 | { | ||
355 | int len = RTA_PAYLOAD(rta); | ||
356 | |||
357 | dst->val = (unsigned long) kmalloc(len, GFP_KERNEL); | ||
358 | if (dst->val == 0UL) | ||
359 | return -ENOMEM; | ||
360 | memcpy((void *) dst->val, RTA_DATA(rta), len); | ||
361 | dst->len = len; | ||
362 | return 0; | ||
363 | } | ||
364 | |||
365 | static void meta_var_destroy(struct meta_value *v) | ||
366 | { | ||
367 | if (v->val) | ||
368 | kfree((void *) v->val); | ||
369 | } | ||
370 | |||
371 | static void meta_var_apply_extras(struct meta_value *v, | ||
372 | struct meta_obj *dst) | ||
373 | { | ||
374 | int shift = v->hdr.shift; | ||
375 | |||
376 | if (shift && shift < dst->len) | ||
377 | dst->len -= shift; | ||
378 | } | ||
379 | |||
380 | static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv) | ||
381 | { | ||
382 | if (v->val && v->len) | ||
383 | RTA_PUT(skb, tlv, v->len, (void *) v->val); | ||
384 | return 0; | ||
385 | |||
386 | rtattr_failure: | ||
387 | return -1; | ||
388 | } | ||
389 | |||
390 | /************************************************************************** | ||
391 | * Type specific operations for TCF_META_TYPE_INT | ||
392 | **************************************************************************/ | ||
393 | |||
394 | static int meta_int_compare(struct meta_obj *a, struct meta_obj *b) | ||
395 | { | ||
396 | /* Let gcc optimize it, the unlikely is not really based on | ||
397 | * some numbers but jump free code for mismatches seems | ||
398 | * more logical. */ | ||
399 | if (unlikely(a->value == b->value)) | ||
400 | return 0; | ||
401 | else if (a->value < b->value) | ||
402 | return -1; | ||
403 | else | ||
404 | return 1; | ||
405 | } | ||
406 | |||
407 | static int meta_int_change(struct meta_value *dst, struct rtattr *rta) | ||
408 | { | ||
409 | if (RTA_PAYLOAD(rta) >= sizeof(unsigned long)) { | ||
410 | dst->val = *(unsigned long *) RTA_DATA(rta); | ||
411 | dst->len = sizeof(unsigned long); | ||
412 | } else if (RTA_PAYLOAD(rta) == sizeof(u32)) { | ||
413 | dst->val = *(u32 *) RTA_DATA(rta); | ||
414 | dst->len = sizeof(u32); | ||
415 | } else | ||
416 | return -EINVAL; | ||
417 | |||
418 | return 0; | ||
419 | } | ||
420 | |||
421 | static void meta_int_apply_extras(struct meta_value *v, | ||
422 | struct meta_obj *dst) | ||
423 | { | ||
424 | if (v->hdr.shift) | ||
425 | dst->value >>= v->hdr.shift; | ||
426 | |||
427 | if (v->val) | ||
428 | dst->value &= v->val; | ||
429 | } | ||
430 | |||
431 | static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv) | ||
432 | { | ||
433 | if (v->len == sizeof(unsigned long)) | ||
434 | RTA_PUT(skb, tlv, sizeof(unsigned long), &v->val); | ||
435 | else if (v->len == sizeof(u32)) { | ||
436 | u32 d = v->val; | ||
437 | RTA_PUT(skb, tlv, sizeof(d), &d); | ||
438 | } | ||
439 | |||
440 | return 0; | ||
441 | |||
442 | rtattr_failure: | ||
443 | return -1; | ||
444 | } | ||
445 | |||
446 | /************************************************************************** | ||
447 | * Type specific operations table | ||
448 | **************************************************************************/ | ||
449 | |||
450 | struct meta_type_ops | ||
451 | { | ||
452 | void (*destroy)(struct meta_value *); | ||
453 | int (*compare)(struct meta_obj *, struct meta_obj *); | ||
454 | int (*change)(struct meta_value *, struct rtattr *); | ||
455 | void (*apply_extras)(struct meta_value *, struct meta_obj *); | ||
456 | int (*dump)(struct sk_buff *, struct meta_value *, int); | ||
457 | }; | ||
458 | |||
459 | static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = { | ||
460 | [TCF_META_TYPE_VAR] = { | ||
461 | .destroy = meta_var_destroy, | ||
462 | .compare = meta_var_compare, | ||
463 | .change = meta_var_change, | ||
464 | .apply_extras = meta_var_apply_extras, | ||
465 | .dump = meta_var_dump | ||
466 | }, | ||
467 | [TCF_META_TYPE_INT] = { | ||
468 | .compare = meta_int_compare, | ||
469 | .change = meta_int_change, | ||
470 | .apply_extras = meta_int_apply_extras, | ||
471 | .dump = meta_int_dump | ||
472 | } | ||
473 | }; | ||
474 | |||
475 | static inline struct meta_type_ops * meta_type_ops(struct meta_value *v) | ||
476 | { | ||
477 | return &__meta_type_ops[meta_type(v)]; | ||
478 | } | ||
479 | |||
480 | /************************************************************************** | ||
481 | * Core | ||
482 | **************************************************************************/ | ||
483 | |||
484 | static inline int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info, | ||
485 | struct meta_value *v, struct meta_obj *dst) | ||
486 | { | ||
487 | int err = 0; | ||
488 | |||
489 | if (meta_id(v) == TCF_META_ID_VALUE) { | ||
490 | dst->value = v->val; | ||
491 | dst->len = v->len; | ||
492 | return 0; | ||
493 | } | ||
494 | |||
495 | meta_ops(v)->get(skb, info, v, dst, &err); | ||
496 | if (err < 0) | ||
497 | return err; | ||
498 | |||
499 | if (meta_type_ops(v)->apply_extras) | ||
500 | meta_type_ops(v)->apply_extras(v, dst); | ||
501 | |||
502 | return 0; | ||
503 | } | ||
504 | |||
505 | static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m, | ||
506 | struct tcf_pkt_info *info) | ||
507 | { | ||
508 | int r; | ||
509 | struct meta_match *meta = (struct meta_match *) m->data; | ||
510 | struct meta_obj l_value, r_value; | ||
511 | |||
512 | if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 || | ||
513 | meta_get(skb, info, &meta->rvalue, &r_value) < 0) | ||
514 | return 0; | ||
515 | |||
516 | r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value); | ||
517 | |||
518 | switch (meta->lvalue.hdr.op) { | ||
519 | case TCF_EM_OPND_EQ: | ||
520 | return !r; | ||
521 | case TCF_EM_OPND_LT: | ||
522 | return r < 0; | ||
523 | case TCF_EM_OPND_GT: | ||
524 | return r > 0; | ||
525 | } | ||
526 | |||
527 | return 0; | ||
528 | } | ||
529 | |||
530 | static inline void meta_delete(struct meta_match *meta) | ||
531 | { | ||
532 | struct meta_type_ops *ops = meta_type_ops(&meta->lvalue); | ||
533 | |||
534 | if (ops && ops->destroy) { | ||
535 | ops->destroy(&meta->lvalue); | ||
536 | ops->destroy(&meta->rvalue); | ||
537 | } | ||
538 | |||
539 | kfree(meta); | ||
540 | } | ||
541 | |||
542 | static inline int meta_change_data(struct meta_value *dst, struct rtattr *rta) | ||
543 | { | ||
544 | if (rta) { | ||
545 | if (RTA_PAYLOAD(rta) == 0) | ||
546 | return -EINVAL; | ||
547 | |||
548 | return meta_type_ops(dst)->change(dst, rta); | ||
549 | } | ||
550 | |||
551 | return 0; | ||
552 | } | ||
553 | |||
554 | static inline int meta_is_supported(struct meta_value *val) | ||
555 | { | ||
556 | return (!meta_id(val) || meta_ops(val)->get); | ||
557 | } | ||
558 | |||
559 | static int em_meta_change(struct tcf_proto *tp, void *data, int len, | ||
560 | struct tcf_ematch *m) | ||
561 | { | ||
562 | int err = -EINVAL; | ||
563 | struct rtattr *tb[TCA_EM_META_MAX]; | ||
564 | struct tcf_meta_hdr *hdr; | ||
565 | struct meta_match *meta = NULL; | ||
566 | |||
567 | if (rtattr_parse(tb, TCA_EM_META_MAX, data, len) < 0) | ||
568 | goto errout; | ||
569 | |||
570 | if (tb[TCA_EM_META_HDR-1] == NULL || | ||
571 | RTA_PAYLOAD(tb[TCA_EM_META_HDR-1]) < sizeof(*hdr)) | ||
572 | goto errout; | ||
573 | hdr = RTA_DATA(tb[TCA_EM_META_HDR-1]); | ||
574 | |||
575 | if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) || | ||
576 | TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX || | ||
577 | TCF_META_ID(hdr->left.kind) > TCF_META_ID_MAX || | ||
578 | TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX) | ||
579 | goto errout; | ||
580 | |||
581 | meta = kmalloc(sizeof(*meta), GFP_KERNEL); | ||
582 | if (meta == NULL) | ||
583 | goto errout; | ||
584 | memset(meta, 0, sizeof(*meta)); | ||
585 | |||
586 | memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left)); | ||
587 | memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right)); | ||
588 | |||
589 | if (!meta_is_supported(&meta->lvalue) || | ||
590 | !meta_is_supported(&meta->rvalue)) { | ||
591 | err = -EOPNOTSUPP; | ||
592 | goto errout; | ||
593 | } | ||
594 | |||
595 | if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE-1]) < 0 || | ||
596 | meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE-1]) < 0) | ||
597 | goto errout; | ||
598 | |||
599 | m->datalen = sizeof(*meta); | ||
600 | m->data = (unsigned long) meta; | ||
601 | |||
602 | err = 0; | ||
603 | errout: | ||
604 | if (err && meta) | ||
605 | meta_delete(meta); | ||
606 | return err; | ||
607 | } | ||
608 | |||
609 | static void em_meta_destroy(struct tcf_proto *tp, struct tcf_ematch *m) | ||
610 | { | ||
611 | if (m) | ||
612 | meta_delete((struct meta_match *) m->data); | ||
613 | } | ||
614 | |||
615 | static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em) | ||
616 | { | ||
617 | struct meta_match *meta = (struct meta_match *) em->data; | ||
618 | struct tcf_meta_hdr hdr; | ||
619 | struct meta_type_ops *ops; | ||
620 | |||
621 | memset(&hdr, 0, sizeof(hdr)); | ||
622 | memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left)); | ||
623 | memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right)); | ||
624 | |||
625 | RTA_PUT(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr); | ||
626 | |||
627 | ops = meta_type_ops(&meta->lvalue); | ||
628 | if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 || | ||
629 | ops->dump(skb, &meta->rvalue, TCA_EM_META_RVALUE) < 0) | ||
630 | goto rtattr_failure; | ||
631 | |||
632 | return 0; | ||
633 | |||
634 | rtattr_failure: | ||
635 | return -1; | ||
636 | } | ||
637 | |||
638 | static struct tcf_ematch_ops em_meta_ops = { | ||
639 | .kind = TCF_EM_META, | ||
640 | .change = em_meta_change, | ||
641 | .match = em_meta_match, | ||
642 | .destroy = em_meta_destroy, | ||
643 | .dump = em_meta_dump, | ||
644 | .owner = THIS_MODULE, | ||
645 | .link = LIST_HEAD_INIT(em_meta_ops.link) | ||
646 | }; | ||
647 | |||
648 | static int __init init_em_meta(void) | ||
649 | { | ||
650 | return tcf_em_register(&em_meta_ops); | ||
651 | } | ||
652 | |||
653 | static void __exit exit_em_meta(void) | ||
654 | { | ||
655 | tcf_em_unregister(&em_meta_ops); | ||
656 | } | ||
657 | |||
658 | MODULE_LICENSE("GPL"); | ||
659 | |||
660 | module_init(init_em_meta); | ||
661 | module_exit(exit_em_meta); | ||
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c new file mode 100644 index 000000000000..71ea926a9f09 --- /dev/null +++ b/net/sched/em_nbyte.c | |||
@@ -0,0 +1,82 @@ | |||
1 | /* | ||
2 | * net/sched/em_nbyte.c N-Byte ematch | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Thomas Graf <tgraf@suug.ch> | ||
10 | */ | ||
11 | |||
12 | #include <linux/config.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/sched.h> | ||
17 | #include <linux/string.h> | ||
18 | #include <linux/skbuff.h> | ||
19 | #include <linux/tc_ematch/tc_em_nbyte.h> | ||
20 | #include <net/pkt_cls.h> | ||
21 | |||
22 | struct nbyte_data | ||
23 | { | ||
24 | struct tcf_em_nbyte hdr; | ||
25 | char pattern[0]; | ||
26 | }; | ||
27 | |||
28 | static int em_nbyte_change(struct tcf_proto *tp, void *data, int data_len, | ||
29 | struct tcf_ematch *em) | ||
30 | { | ||
31 | struct tcf_em_nbyte *nbyte = data; | ||
32 | |||
33 | if (data_len < sizeof(*nbyte) || | ||
34 | data_len < (sizeof(*nbyte) + nbyte->len)) | ||
35 | return -EINVAL; | ||
36 | |||
37 | em->datalen = sizeof(*nbyte) + nbyte->len; | ||
38 | em->data = (unsigned long) kmalloc(em->datalen, GFP_KERNEL); | ||
39 | if (em->data == 0UL) | ||
40 | return -ENOBUFS; | ||
41 | |||
42 | memcpy((void *) em->data, data, em->datalen); | ||
43 | |||
44 | return 0; | ||
45 | } | ||
46 | |||
47 | static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em, | ||
48 | struct tcf_pkt_info *info) | ||
49 | { | ||
50 | struct nbyte_data *nbyte = (struct nbyte_data *) em->data; | ||
51 | unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer); | ||
52 | |||
53 | ptr += nbyte->hdr.off; | ||
54 | |||
55 | if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len)) | ||
56 | return 0; | ||
57 | |||
58 | return !memcmp(ptr, nbyte->pattern, nbyte->hdr.len); /* ptr already includes hdr.off */ | ||
59 | } | ||
60 | |||
61 | static struct tcf_ematch_ops em_nbyte_ops = { | ||
62 | .kind = TCF_EM_NBYTE, | ||
63 | .change = em_nbyte_change, | ||
64 | .match = em_nbyte_match, | ||
65 | .owner = THIS_MODULE, | ||
66 | .link = LIST_HEAD_INIT(em_nbyte_ops.link) | ||
67 | }; | ||
68 | |||
69 | static int __init init_em_nbyte(void) | ||
70 | { | ||
71 | return tcf_em_register(&em_nbyte_ops); | ||
72 | } | ||
73 | |||
74 | static void __exit exit_em_nbyte(void) | ||
75 | { | ||
76 | tcf_em_unregister(&em_nbyte_ops); | ||
77 | } | ||
78 | |||
79 | MODULE_LICENSE("GPL"); | ||
80 | |||
81 | module_init(init_em_nbyte); | ||
82 | module_exit(exit_em_nbyte); | ||
diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c new file mode 100644 index 000000000000..34e7e51e601e --- /dev/null +++ b/net/sched/em_u32.c | |||
@@ -0,0 +1,63 @@ | |||
1 | /* | ||
2 | * net/sched/em_u32.c U32 Ematch | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Thomas Graf <tgraf@suug.ch> | ||
10 | * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
11 | * | ||
12 | * Based on net/sched/cls_u32.c | ||
13 | */ | ||
14 | |||
15 | #include <linux/config.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/types.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/skbuff.h> | ||
20 | #include <net/pkt_cls.h> | ||
21 | |||
22 | static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em, | ||
23 | struct tcf_pkt_info *info) | ||
24 | { | ||
25 | struct tc_u32_key *key = (struct tc_u32_key *) em->data; | ||
26 | unsigned char *ptr = skb->nh.raw; | ||
27 | |||
28 | if (info) { | ||
29 | if (info->ptr) | ||
30 | ptr = info->ptr; | ||
31 | ptr += (info->nexthdr & key->offmask); | ||
32 | } | ||
33 | |||
34 | ptr += key->off; | ||
35 | |||
36 | if (!tcf_valid_offset(skb, ptr, sizeof(u32))) | ||
37 | return 0; | ||
38 | |||
39 | return !(((*(u32*) ptr) ^ key->val) & key->mask); | ||
40 | } | ||
41 | |||
42 | static struct tcf_ematch_ops em_u32_ops = { | ||
43 | .kind = TCF_EM_U32, | ||
44 | .datalen = sizeof(struct tc_u32_key), | ||
45 | .match = em_u32_match, | ||
46 | .owner = THIS_MODULE, | ||
47 | .link = LIST_HEAD_INIT(em_u32_ops.link) | ||
48 | }; | ||
49 | |||
50 | static int __init init_em_u32(void) | ||
51 | { | ||
52 | return tcf_em_register(&em_u32_ops); | ||
53 | } | ||
54 | |||
55 | static void __exit exit_em_u32(void) | ||
56 | { | ||
57 | tcf_em_unregister(&em_u32_ops); | ||
58 | } | ||
59 | |||
60 | MODULE_LICENSE("GPL"); | ||
61 | |||
62 | module_init(init_em_u32); | ||
63 | module_exit(exit_em_u32); | ||
diff --git a/net/sched/ematch.c b/net/sched/ematch.c new file mode 100644 index 000000000000..ebfe2e7d21bd --- /dev/null +++ b/net/sched/ematch.c | |||
@@ -0,0 +1,524 @@ | |||
1 | /* | ||
2 | * net/sched/ematch.c Extended Match API | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Thomas Graf <tgraf@suug.ch> | ||
10 | * | ||
11 | * ========================================================================== | ||
12 | * | ||
13 | * An extended match (ematch) is a small classification tool not worth | ||
14 | * writing a full classifier for. Ematches can be interconnected to form | ||
15 | * a logic expression and get attached to classifiers to extend their | ||
16 | * functionality. | ||
17 | * | ||
18 | * The userspace part transforms the logic expressions into an array | ||
19 | * consisting of multiple sequences of interconnected ematches separated | ||
20 | * by markers. Precedence is implemented by a special ematch kind | ||
21 | * referencing a sequence beyond the marker of the current sequence | ||
22 | * causing the current position in the sequence to be pushed onto a stack | ||
23 | * to allow the current position to be overwritten by the position referenced | ||
24 | * in the special ematch. Matching continues in the new sequence until a | ||
25 | * marker is reached causing the position to be restored from the stack. | ||
26 | * | ||
27 | * Example: | ||
28 | * A AND (B1 OR B2) AND C AND D | ||
29 | * | ||
30 | * ------->-PUSH------- | ||
31 | * -->-- / -->-- \ -->-- | ||
32 | * / \ / / \ \ / \ | ||
33 | * +-------+-------+-------+-------+-------+--------+ | ||
34 | * | A AND | B AND | C AND | D END | B1 OR | B2 END | | ||
35 | * +-------+-------+-------+-------+-------+--------+ | ||
36 | * \ / | ||
37 | * --------<-POP--------- | ||
38 | * | ||
39 | * where B is a virtual ematch referencing the sequence starting with B1. | ||
40 | * | ||
41 | * ========================================================================== | ||
42 | * | ||
43 | * How to write an ematch in 60 seconds | ||
44 | * ------------------------------------ | ||
45 | * | ||
46 | * 1) Provide a matcher function: | ||
47 | * static int my_match(struct sk_buff *skb, struct tcf_ematch *m, | ||
48 | * struct tcf_pkt_info *info) | ||
49 | * { | ||
50 | * struct mydata *d = (struct mydata *) m->data; | ||
51 | * | ||
52 | * if (...matching goes here...) | ||
53 | * return 1; | ||
54 | * else | ||
55 | * return 0; | ||
56 | * } | ||
57 | * | ||
58 | * 2) Fill out a struct tcf_ematch_ops: | ||
59 | * static struct tcf_ematch_ops my_ops = { | ||
60 | * .kind = unique id, | ||
61 | * .datalen = sizeof(struct mydata), | ||
62 | * .match = my_match, | ||
63 | * .owner = THIS_MODULE, | ||
64 | * }; | ||
65 | * | ||
66 | * 3) Register/Unregister your ematch: | ||
67 | * static int __init init_my_ematch(void) | ||
68 | * { | ||
69 | * return tcf_em_register(&my_ops); | ||
70 | * } | ||
71 | * | ||
72 | * static void __exit exit_my_ematch(void) | ||
73 | * { | ||
74 | * return tcf_em_unregister(&my_ops); | ||
75 | * } | ||
76 | * | ||
77 | * module_init(init_my_ematch); | ||
78 | * module_exit(exit_my_ematch); | ||
79 | * | ||
80 | * 4) By now you should have two more seconds left, barely enough to | ||
81 | * open up a beer to watch the compilation going. | ||
82 | */ | ||
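[Editor's note — not part of the diff above] The PUSH/POP walk sketched in the diagram is easiest to see with a toy evaluator. The sketch below is a deliberately simplified userspace model: the entry layout, relation values and short-circuit rules are assumptions made for illustration only; the kernel's real rules live in tcf_em_early_end() and __tcf_em_tree_match() later in this file.

#include <stdio.h>

enum rel { REL_END, REL_AND, REL_OR };

struct entry {
    int container;  /* != 0: jump to index 'ref', come back on END */
    int ref;        /* referenced sub-sequence start (containers)  */
    int leaf_res;   /* pre-baked result of a leaf match            */
    enum rel rel;   /* relation to the following entry             */
};

static int evaluate(const struct entry *seq, int n)
{
    int stack[8], sp = 0, idx = 0, res = 0;

    while (idx < n) {
        const struct entry *e = &seq[idx];

        if (e->container) {             /* PUSH and descend */
            if (sp == 8)                /* no room; treat as mismatch */
                return 0;
            stack[sp++] = idx;
            idx = e->ref;
            continue;
        }

        res = e->leaf_res;

        /* short-circuit: AND with a miss, OR with a hit, or an END
         * marker finishes the current sub-sequence */
        while ((e->rel == REL_AND && !res) ||
               (e->rel == REL_OR && res) ||
               e->rel == REL_END) {
            if (sp == 0)
                return res;             /* top level finished */
            idx = stack[--sp];          /* POP back to the container */
            e = &seq[idx];              /* container applies its own relation */
        }
        idx++;
    }
    return res;
}

int main(void)
{
    /* A AND (B1 OR B2) AND C, with the B container at index 1 */
    struct entry seq[] = {
        { 0, 0, 1, REL_AND },           /* A  -> matches    */
        { 1, 3, 0, REL_AND },           /* B  -> container  */
        { 0, 0, 1, REL_END },           /* C  -> matches    */
        { 0, 0, 0, REL_OR  },           /* B1 -> misses     */
        { 0, 0, 1, REL_END },           /* B2 -> matches    */
    };

    printf("tree matches: %d\n", evaluate(seq, 5));
    return 0;
}

With the pre-baked results in main(), "A AND (B1 OR B2) AND C" evaluates to a match: B2 rescues the OR sub-sequence even though B1 misses, and the result is popped back into the enclosing AND chain.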
83 | |||
84 | #include <linux/config.h> | ||
85 | #include <linux/module.h> | ||
86 | #include <linux/types.h> | ||
87 | #include <linux/kernel.h> | ||
88 | #include <linux/sched.h> | ||
89 | #include <linux/mm.h> | ||
90 | #include <linux/errno.h> | ||
91 | #include <linux/interrupt.h> | ||
92 | #include <linux/rtnetlink.h> | ||
93 | #include <linux/skbuff.h> | ||
94 | #include <net/pkt_cls.h> | ||
95 | #include <config/net/ematch/stack.h> | ||
96 | |||
97 | static LIST_HEAD(ematch_ops); | ||
98 | static DEFINE_RWLOCK(ematch_mod_lock); | ||
99 | |||
100 | static inline struct tcf_ematch_ops * tcf_em_lookup(u16 kind) | ||
101 | { | ||
102 | struct tcf_ematch_ops *e = NULL; | ||
103 | |||
104 | read_lock(&ematch_mod_lock); | ||
105 | list_for_each_entry(e, &ematch_ops, link) { | ||
106 | if (kind == e->kind) { | ||
107 | if (!try_module_get(e->owner)) | ||
108 | e = NULL; | ||
109 | read_unlock(&ematch_mod_lock); | ||
110 | return e; | ||
111 | } | ||
112 | } | ||
113 | read_unlock(&ematch_mod_lock); | ||
114 | |||
115 | return NULL; | ||
116 | } | ||
117 | |||
118 | /** | ||
119 | * tcf_em_register - register an extended match | ||
120 | * | ||
121 | * @ops: ematch operations lookup table | ||
122 | * | ||
123 | * This function must be called by ematches to announce their presence. | ||
124 | * The given @ops must have kind set to a unique identifier and the | ||
125 | * callback match() must be implemented. All other callbacks are optional | ||
126 | * and a fallback implementation is used instead. | ||
127 | * | ||
128 | * Returns -EEXIST if an ematch of the same kind has already been registered. | ||
129 | */ | ||
130 | int tcf_em_register(struct tcf_ematch_ops *ops) | ||
131 | { | ||
132 | int err = -EEXIST; | ||
133 | struct tcf_ematch_ops *e; | ||
134 | |||
135 | if (ops->match == NULL) | ||
136 | return -EINVAL; | ||
137 | |||
138 | write_lock(&ematch_mod_lock); | ||
139 | list_for_each_entry(e, &ematch_ops, link) | ||
140 | if (ops->kind == e->kind) | ||
141 | goto errout; | ||
142 | |||
143 | list_add_tail(&ops->link, &ematch_ops); | ||
144 | err = 0; | ||
145 | errout: | ||
146 | write_unlock(&ematch_mod_lock); | ||
147 | return err; | ||
148 | } | ||
149 | |||
150 | /** | ||
151 | * tcf_em_unregister - unregister an extended match | ||
152 | * | ||
153 | * @ops: ematch operations lookup table | ||
154 | * | ||
155 | * This function must be called by ematches to announce their disappearance | ||
156 | * for example when the module gets unloaded. The @ops parameter must be | ||
157 | * the same as the one used for registration. | ||
158 | * | ||
159 | * Returns -ENOENT if no matching ematch was found. | ||
160 | */ | ||
161 | int tcf_em_unregister(struct tcf_ematch_ops *ops) | ||
162 | { | ||
163 | int err = 0; | ||
164 | struct tcf_ematch_ops *e; | ||
165 | |||
166 | write_lock(&ematch_mod_lock); | ||
167 | list_for_each_entry(e, &ematch_ops, link) { | ||
168 | if (e == ops) { | ||
169 | list_del(&e->link); | ||
170 | goto out; | ||
171 | } | ||
172 | } | ||
173 | |||
174 | err = -ENOENT; | ||
175 | out: | ||
176 | write_unlock(&ematch_mod_lock); | ||
177 | return err; | ||
178 | } | ||
179 | |||
180 | static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree, | ||
181 | int index) | ||
182 | { | ||
183 | return &tree->matches[index]; | ||
184 | } | ||
185 | |||
186 | |||
187 | static int tcf_em_validate(struct tcf_proto *tp, | ||
188 | struct tcf_ematch_tree_hdr *tree_hdr, | ||
189 | struct tcf_ematch *em, struct rtattr *rta, int idx) | ||
190 | { | ||
191 | int err = -EINVAL; | ||
192 | struct tcf_ematch_hdr *em_hdr = RTA_DATA(rta); | ||
193 | int data_len = RTA_PAYLOAD(rta) - sizeof(*em_hdr); | ||
194 | void *data = (void *) em_hdr + sizeof(*em_hdr); | ||
195 | |||
196 | if (!TCF_EM_REL_VALID(em_hdr->flags)) | ||
197 | goto errout; | ||
198 | |||
199 | if (em_hdr->kind == TCF_EM_CONTAINER) { | ||
200 | /* Special ematch called "container", carries an index | ||
201 | * referencing an external ematch sequence. */ | ||
202 | u32 ref; | ||
203 | |||
204 | if (data_len < sizeof(ref)) | ||
205 | goto errout; | ||
206 | ref = *(u32 *) data; | ||
207 | |||
208 | if (ref >= tree_hdr->nmatches) | ||
209 | goto errout; | ||
210 | |||
211 | /* We do not allow backward jumps to avoid loops, and jumps | ||
212 | * to our own position are of course illegal. */ | ||
213 | if (ref <= idx) | ||
214 | goto errout; | ||
215 | |||
216 | |||
217 | em->data = ref; | ||
218 | } else { | ||
219 | /* Note: This lookup will increase the module refcnt | ||
220 | * of the ematch module referenced. In case of a failure, | ||
221 | * a destroy function is called by the underlying layer | ||
222 | * which automatically releases the reference again, therefore | ||
223 | * the module MUST not be given back under any circumstances | ||
224 | * here. Be aware that the destroy function assumes that the | ||
225 | * module is held if the ops field is non-zero. */ | ||
226 | em->ops = tcf_em_lookup(em_hdr->kind); | ||
227 | |||
228 | if (em->ops == NULL) { | ||
229 | err = -ENOENT; | ||
230 | goto errout; | ||
231 | } | ||
232 | |||
233 | /* ematch module provides expected length of data, so we | ||
234 | * can do a basic sanity check. */ | ||
235 | if (em->ops->datalen && data_len < em->ops->datalen) | ||
236 | goto errout; | ||
237 | |||
238 | if (em->ops->change) { | ||
239 | err = em->ops->change(tp, data, data_len, em); | ||
240 | if (err < 0) | ||
241 | goto errout; | ||
242 | } else if (data_len > 0) { | ||
243 | /* The ematch module doesn't provide its own change | ||
244 | * procedure and expects us to allocate and copy | ||
245 | * the ematch data. | ||
246 | * | ||
247 | * TCF_EM_SIMPLE may be specified stating that the | ||
248 | * data only consists of a u32 integer and the module | ||
249 | * does not expect a memory reference but rather | ||
250 | * the value carried. */ | ||
251 | if (em_hdr->flags & TCF_EM_SIMPLE) { | ||
252 | if (data_len < sizeof(u32)) | ||
253 | goto errout; | ||
254 | em->data = *(u32 *) data; | ||
255 | } else { | ||
256 | void *v = kmalloc(data_len, GFP_KERNEL); | ||
257 | if (v == NULL) { | ||
258 | err = -ENOBUFS; | ||
259 | goto errout; | ||
260 | } | ||
261 | memcpy(v, data, data_len); | ||
262 | em->data = (unsigned long) v; | ||
263 | } | ||
264 | } | ||
265 | } | ||
266 | |||
267 | em->matchid = em_hdr->matchid; | ||
268 | em->flags = em_hdr->flags; | ||
269 | em->datalen = data_len; | ||
270 | |||
271 | err = 0; | ||
272 | errout: | ||
273 | return err; | ||
274 | } | ||
275 | |||
276 | /** | ||
277 | * tcf_em_tree_validate - validate ematch config TLV and build ematch tree | ||
278 | * | ||
279 | * @tp: classifier kind handle | ||
280 | * @rta: ematch tree configuration TLV | ||
281 | * @tree: destination ematch tree variable to store the resulting | ||
282 | * ematch tree. | ||
283 | * | ||
284 | * This function validates the given configuration TLV @rta and builds an | ||
285 | * ematch tree in @tree. The resulting tree must later be copied into | ||
286 | * the private classifier data using tcf_em_tree_change(). You MUST NOT | ||
287 | * provide the ematch tree variable of the private classifier data directly, | ||
288 | * as the changes would not be locked properly. | ||
289 | * | ||
290 | * Returns a negative error code if the configuration TLV contains errors. | ||
291 | */ | ||
292 | int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta, | ||
293 | struct tcf_ematch_tree *tree) | ||
294 | { | ||
295 | int idx, list_len, matches_len, err = -EINVAL; | ||
296 | struct rtattr *tb[TCA_EMATCH_TREE_MAX]; | ||
297 | struct rtattr *rt_match, *rt_hdr, *rt_list; | ||
298 | struct tcf_ematch_tree_hdr *tree_hdr; | ||
299 | struct tcf_ematch *em; | ||
300 | |||
301 | if (rtattr_parse_nested(tb, TCA_EMATCH_TREE_MAX, rta) < 0) | ||
302 | goto errout; | ||
303 | |||
304 | rt_hdr = tb[TCA_EMATCH_TREE_HDR-1]; | ||
305 | rt_list = tb[TCA_EMATCH_TREE_LIST-1]; | ||
306 | |||
307 | if (rt_hdr == NULL || rt_list == NULL) | ||
308 | goto errout; | ||
309 | |||
310 | if (RTA_PAYLOAD(rt_hdr) < sizeof(*tree_hdr) || | ||
311 | RTA_PAYLOAD(rt_list) < sizeof(*rt_match)) | ||
312 | goto errout; | ||
313 | |||
314 | tree_hdr = RTA_DATA(rt_hdr); | ||
315 | memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr)); | ||
316 | |||
317 | rt_match = RTA_DATA(rt_list); | ||
318 | list_len = RTA_PAYLOAD(rt_list); | ||
319 | matches_len = tree_hdr->nmatches * sizeof(*em); | ||
320 | |||
321 | tree->matches = kmalloc(matches_len, GFP_KERNEL); | ||
322 | if (tree->matches == NULL) | ||
323 | goto errout; | ||
324 | memset(tree->matches, 0, matches_len); | ||
325 | |||
326 | /* We do not use rtattr_parse_nested here because the maximum | ||
327 | * number of attributes is unknown. This saves us the allocation | ||
328 | * for a tb buffer which would serve no purpose at all. | ||
329 | * | ||
330 | * The array of rt attributes is parsed in the order in which they are | ||
331 | * provided; their types must run consecutively from 1 to n. Even | ||
332 | * if it does not serve any real purpose, a failure to stick | ||
333 | * to this policy will result in a parsing failure. */ | ||
334 | for (idx = 0; RTA_OK(rt_match, list_len); idx++) { | ||
335 | err = -EINVAL; | ||
336 | |||
337 | if (rt_match->rta_type != (idx + 1)) | ||
338 | goto errout_abort; | ||
339 | |||
340 | if (idx >= tree_hdr->nmatches) | ||
341 | goto errout_abort; | ||
342 | |||
343 | if (RTA_PAYLOAD(rt_match) < sizeof(struct tcf_ematch_hdr)) | ||
344 | goto errout_abort; | ||
345 | |||
346 | em = tcf_em_get_match(tree, idx); | ||
347 | |||
348 | err = tcf_em_validate(tp, tree_hdr, em, rt_match, idx); | ||
349 | if (err < 0) | ||
350 | goto errout_abort; | ||
351 | |||
352 | rt_match = RTA_NEXT(rt_match, list_len); | ||
353 | } | ||
354 | |||
355 | /* Check if the number of matches provided by userspace actually | ||
356 | * complies with the array of matches. The number was used for | ||
357 | * the validation of references and a mismatch could lead to | ||
358 | * undefined references during the matching process. */ | ||
359 | if (idx != tree_hdr->nmatches) { | ||
360 | err = -EINVAL; | ||
361 | goto errout_abort; | ||
362 | } | ||
363 | |||
364 | err = 0; | ||
365 | errout: | ||
366 | return err; | ||
367 | |||
368 | errout_abort: | ||
369 | tcf_em_tree_destroy(tp, tree); | ||
370 | return err; | ||
371 | } | ||
372 | |||
373 | /** | ||
374 | * tcf_em_tree_destroy - destroy an ematch tree | ||
375 | * | ||
376 | * @tp: classifier kind handle | ||
377 | * @tree: ematch tree to be deleted | ||
378 | * | ||
379 | * This function destroys an ematch tree previously created by | ||
380 | * tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that | ||
381 | * the ematch tree is not in use before calling this function. | ||
382 | */ | ||
383 | void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree) | ||
384 | { | ||
385 | int i; | ||
386 | |||
387 | if (tree->matches == NULL) | ||
388 | return; | ||
389 | |||
390 | for (i = 0; i < tree->hdr.nmatches; i++) { | ||
391 | struct tcf_ematch *em = tcf_em_get_match(tree, i); | ||
392 | |||
393 | if (em->ops) { | ||
394 | if (em->ops->destroy) | ||
395 | em->ops->destroy(tp, em); | ||
396 | else if (!tcf_em_is_simple(em) && em->data) | ||
397 | kfree((void *) em->data); | ||
398 | module_put(em->ops->owner); | ||
399 | } | ||
400 | } | ||
401 | |||
402 | tree->hdr.nmatches = 0; | ||
403 | kfree(tree->matches); | ||
404 | } | ||
405 | |||
406 | /** | ||
407 | * tcf_em_tree_dump - dump ematch tree into a rtnl message | ||
408 | * | ||
409 | * @skb: skb holding the rtnl message | ||
410 | * @t: ematch tree to be dumped | ||
411 | * @tlv: TLV type to be used to encapsulate the tree | ||
412 | * | ||
413 | * This function dumps an ematch tree into an rtnl message. It is valid to | ||
414 | * call this function while the ematch tree is in use. | ||
415 | * | ||
416 | * Returns -1 if the skb tailroom is insufficient. | ||
417 | */ | ||
418 | int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) | ||
419 | { | ||
420 | int i; | ||
421 | struct rtattr * top_start = (struct rtattr*) skb->tail; | ||
422 | struct rtattr * list_start; | ||
423 | |||
424 | RTA_PUT(skb, tlv, 0, NULL); | ||
425 | RTA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr); | ||
426 | |||
427 | list_start = (struct rtattr *) skb->tail; | ||
428 | RTA_PUT(skb, TCA_EMATCH_TREE_LIST, 0, NULL); | ||
429 | |||
430 | for (i = 0; i < tree->hdr.nmatches; i++) { | ||
431 | struct rtattr *match_start = (struct rtattr*) skb->tail; | ||
432 | struct tcf_ematch *em = tcf_em_get_match(tree, i); | ||
433 | struct tcf_ematch_hdr em_hdr = { | ||
434 | .kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER, | ||
435 | .matchid = em->matchid, | ||
436 | .flags = em->flags | ||
437 | }; | ||
438 | |||
439 | RTA_PUT(skb, i+1, sizeof(em_hdr), &em_hdr); | ||
440 | |||
441 | if (em->ops && em->ops->dump) { | ||
442 | if (em->ops->dump(skb, em) < 0) | ||
443 | goto rtattr_failure; | ||
444 | } else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) { | ||
445 | u32 u = em->data; | ||
446 | RTA_PUT_NOHDR(skb, sizeof(u), &u); | ||
447 | } else if (em->datalen > 0) | ||
448 | RTA_PUT_NOHDR(skb, em->datalen, (void *) em->data); | ||
449 | |||
450 | match_start->rta_len = skb->tail - (u8*) match_start; | ||
451 | } | ||
452 | |||
453 | list_start->rta_len = skb->tail - (u8 *) list_start; | ||
454 | top_start->rta_len = skb->tail - (u8 *) top_start; | ||
455 | |||
456 | return 0; | ||
457 | |||
458 | rtattr_failure: | ||
459 | return -1; | ||
460 | } | ||
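The dump above produces a nested TLV that mirrors exactly what tcf_em_tree_validate() expects to parse back: the caller-supplied tlv wraps a TCA_EMATCH_TREE_HDR attribute carrying the tree header, followed by a TCA_EMATCH_TREE_LIST attribute whose payload is the per-match attributes numbered 1..nmatches. A rough picture of the layout (attribute lengths and padding omitted):

  [tlv]
    [TCA_EMATCH_TREE_HDR]   struct tcf_ematch_tree_hdr (nmatches, ...)
    [TCA_EMATCH_TREE_LIST]
      [1]          struct tcf_ematch_hdr + simple u32 or module-specific payload
      [2]          struct tcf_ematch_hdr + payload
      ...
      [nmatches]   struct tcf_ematch_hdr + payload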
461 | |||
462 | static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em, | ||
463 | struct tcf_pkt_info *info) | ||
464 | { | ||
465 | int r = em->ops->match(skb, em, info); | ||
466 | return tcf_em_is_inverted(em) ? !r : r; | ||
467 | } | ||
468 | |||
469 | /* Do not use this function directly, use tcf_em_tree_match instead */ | ||
470 | int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree, | ||
471 | struct tcf_pkt_info *info) | ||
472 | { | ||
473 | int stackp = 0, match_idx = 0, res = 0; | ||
474 | struct tcf_ematch *cur_match; | ||
475 | int stack[CONFIG_NET_EMATCH_STACK]; | ||
476 | |||
477 | proceed: | ||
478 | while (match_idx < tree->hdr.nmatches) { | ||
479 | cur_match = tcf_em_get_match(tree, match_idx); | ||
480 | |||
481 | if (tcf_em_is_container(cur_match)) { | ||
482 | if (unlikely(stackp >= CONFIG_NET_EMATCH_STACK)) | ||
483 | goto stack_overflow; | ||
484 | |||
485 | stack[stackp++] = match_idx; | ||
486 | match_idx = cur_match->data; | ||
487 | goto proceed; | ||
488 | } | ||
489 | |||
490 | res = tcf_em_match(skb, cur_match, info); | ||
491 | |||
492 | if (tcf_em_early_end(cur_match, res)) | ||
493 | break; | ||
494 | |||
495 | match_idx++; | ||
496 | } | ||
497 | |||
498 | pop_stack: | ||
499 | if (stackp > 0) { | ||
500 | match_idx = stack[--stackp]; | ||
501 | cur_match = tcf_em_get_match(tree, match_idx); | ||
502 | |||
503 | if (tcf_em_early_end(cur_match, res)) | ||
504 | goto pop_stack; | ||
505 | else { | ||
506 | match_idx++; | ||
507 | goto proceed; | ||
508 | } | ||
509 | } | ||
510 | |||
511 | return res; | ||
512 | |||
513 | stack_overflow: | ||
514 | if (net_ratelimit()) | ||
515 | printk("Local stack overflow, increase NET_EMATCH_STACK\n"); | ||
516 | return -1; | ||
517 | } | ||
518 | |||
519 | EXPORT_SYMBOL(tcf_em_register); | ||
520 | EXPORT_SYMBOL(tcf_em_unregister); | ||
521 | EXPORT_SYMBOL(tcf_em_tree_validate); | ||
522 | EXPORT_SYMBOL(tcf_em_tree_destroy); | ||
523 | EXPORT_SYMBOL(tcf_em_tree_dump); | ||
524 | EXPORT_SYMBOL(__tcf_em_tree_match); | ||
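__tcf_em_tree_match() above evaluates the flattened match array iteratively rather than recursively: a container match pushes its own index onto a small local stack and jumps to the index stored in its data field, and whenever tcf_em_early_end() decides a result short-circuits the rest of the current list, the walk pops back to the enclosing container, whose own relation flag then decides whether the short-circuit propagates further up. A minimal userspace sketch of that control flow (types, field names and the early-end rule are simplified stand-ins, not the kernel structures):

#include <stdio.h>

#define STACK_DEPTH 32                 /* stands in for CONFIG_NET_EMATCH_STACK */

struct match {
	int is_container;              /* container: descend to index 'data' */
	int data;                      /* container: index of its first child */
	int result;                    /* leaf: precomputed match result (0 or 1) */
	int early_end_on;              /* short-circuit when result equals this (-1: never) */
};

static int tree_match(const struct match *m, int nmatches)
{
	int stack[STACK_DEPTH];
	int stackp = 0, idx = 0, res = 0;

proceed:
	while (idx < nmatches) {
		if (m[idx].is_container) {
			if (stackp >= STACK_DEPTH)
				return -1;              /* local stack overflow */
			stack[stackp++] = idx;
			idx = m[idx].data;              /* descend into the children */
			goto proceed;
		}
		res = m[idx].result;
		if (m[idx].early_end_on == res)
			break;                          /* short-circuit this level */
		idx++;
	}
pop_stack:
	if (stackp > 0) {
		idx = stack[--stackp];
		if (m[idx].early_end_on == res)
			goto pop_stack;                 /* propagate the short-circuit upward */
		idx++;
		goto proceed;
	}
	return res;
}

int main(void)
{
	/* index 0 is a container whose children start at index 1 */
	struct match m[] = {
		{ .is_container = 1, .data = 1, .early_end_on = 0 },
		{ .is_container = 0, .result = 1, .early_end_on = 0 },
		{ .is_container = 0, .result = 0, .early_end_on = 0 },
	};

	printf("tree result: %d\n", tree_match(m, 3));
	return 0;
}

Run as-is, the walk descends into the container, short-circuits on the second child and reports 0 for the whole tree.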
diff --git a/net/sched/estimator.c b/net/sched/estimator.c new file mode 100644 index 000000000000..5d3ae03e22a7 --- /dev/null +++ b/net/sched/estimator.c | |||
@@ -0,0 +1,197 @@ | |||
1 | /* | ||
2 | * net/sched/estimator.c Simple rate estimator. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
10 | */ | ||
11 | |||
12 | #include <asm/uaccess.h> | ||
13 | #include <asm/system.h> | ||
14 | #include <linux/bitops.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/types.h> | ||
17 | #include <linux/kernel.h> | ||
18 | #include <linux/jiffies.h> | ||
19 | #include <linux/string.h> | ||
20 | #include <linux/mm.h> | ||
21 | #include <linux/socket.h> | ||
22 | #include <linux/sockios.h> | ||
23 | #include <linux/in.h> | ||
24 | #include <linux/errno.h> | ||
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/netdevice.h> | ||
27 | #include <linux/skbuff.h> | ||
28 | #include <linux/rtnetlink.h> | ||
29 | #include <linux/init.h> | ||
30 | #include <net/sock.h> | ||
31 | #include <net/pkt_sched.h> | ||
32 | |||
33 | /* | ||
34 | This code is NOT intended to be used for statistics collection, | ||
35 | its purpose is to provide a base for statistical multiplexing | ||
36 | for controlled load service. | ||
37 | If you need only statistics, run a user level daemon which | ||
38 | periodically reads byte counters. | ||
39 | |||
40 | Unfortunately, rate estimation is not a very easy task. | ||
41 | F.e. I did not find a simple way to estimate the current peak rate | ||
42 | and even failed to formulate the problem 8)8) | ||
43 | |||
44 | So I preferred not to build an estimator into the scheduler, | ||
45 | but run this task separately. | ||
46 | Ideally, it should be kernel thread(s), but for now it runs | ||
47 | from timers, which puts an apparent upper bound on the number of rated | ||
48 | flows, has minimal overhead when that number is small, but is enough | ||
49 | to handle controlled load service and sets of aggregates. | ||
50 | |||
51 | We measure rate over A=(1<<interval) seconds and evaluate EWMA: | ||
52 | |||
53 | avrate = avrate*(1-W) + rate*W | ||
54 | |||
55 | where W is chosen as negative power of 2: W = 2^(-ewma_log) | ||
56 | |||
57 | The resulting time constant is: | ||
58 | |||
59 | T = A/(-ln(1-W)) | ||
60 | |||
61 | |||
62 | NOTES. | ||
63 | |||
64 | * The stored value for avbps is scaled by 2^5, so that maximal | ||
65 | rate is ~1Gbit, avpps is scaled by 2^10. | ||
66 | |||
67 | * Minimal interval is HZ/4=250msec (it is the greatest common divisor | ||
68 | for HZ=100 and HZ=1024 8)), maximal interval | ||
69 | is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals | ||
70 | are too expensive, longer ones can be implemented | ||
71 | at user level painlessly. | ||
72 | */ | ||
73 | |||
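As a concrete reading of the formulas above (numbers chosen here for illustration, they do not appear in the source): with interval = 0 the measurement period is A = 1 s, and with ewma_log = 3 the weight is W = 2^-3 = 0.125, which gives a time constant T = 1/(-ln(1 - 0.125)) ~= 7.5 s; doubling the interval, or increasing ewma_log by one, roughly doubles T.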
74 | #define EST_MAX_INTERVAL 5 | ||
75 | |||
76 | struct qdisc_estimator | ||
77 | { | ||
78 | struct qdisc_estimator *next; | ||
79 | struct tc_stats *stats; | ||
80 | spinlock_t *stats_lock; | ||
81 | unsigned interval; | ||
82 | int ewma_log; | ||
83 | u64 last_bytes; | ||
84 | u32 last_packets; | ||
85 | u32 avpps; | ||
86 | u32 avbps; | ||
87 | }; | ||
88 | |||
89 | struct qdisc_estimator_head | ||
90 | { | ||
91 | struct timer_list timer; | ||
92 | struct qdisc_estimator *list; | ||
93 | }; | ||
94 | |||
95 | static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1]; | ||
96 | |||
97 | /* Estimator array lock */ | ||
98 | static DEFINE_RWLOCK(est_lock); | ||
99 | |||
100 | static void est_timer(unsigned long arg) | ||
101 | { | ||
102 | int idx = (int)arg; | ||
103 | struct qdisc_estimator *e; | ||
104 | |||
105 | read_lock(&est_lock); | ||
106 | for (e = elist[idx].list; e; e = e->next) { | ||
107 | struct tc_stats *st = e->stats; | ||
108 | u64 nbytes; | ||
109 | u32 npackets; | ||
110 | u32 rate; | ||
111 | |||
112 | spin_lock(e->stats_lock); | ||
113 | nbytes = st->bytes; | ||
114 | npackets = st->packets; | ||
115 | rate = (nbytes - e->last_bytes)<<(7 - idx); | ||
116 | e->last_bytes = nbytes; | ||
117 | e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log; | ||
118 | st->bps = (e->avbps+0xF)>>5; | ||
119 | |||
120 | rate = (npackets - e->last_packets)<<(12 - idx); | ||
121 | e->last_packets = npackets; | ||
122 | e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log; | ||
123 | e->stats->pps = (e->avpps+0x1FF)>>10; | ||
124 | spin_unlock(e->stats_lock); | ||
125 | } | ||
126 | |||
127 | mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4)); | ||
128 | read_unlock(&est_lock); | ||
129 | } | ||
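est_timer() keeps everything in integer fixed point: avbps holds the byte rate scaled by 2^5, avpps the packet rate scaled by 2^10, and the EWMA weight W = 2^-ewma_log is applied with a plain right shift. A self-contained userspace model of one byte-rate update step (variable and field names are invented for the sketch; only the arithmetic mirrors the loop body above):

#include <stdio.h>
#include <stdint.h>

/* One EWMA step as in est_timer(): idx is the interval slot
 * (timer period = 2^(idx-2) seconds), ewma_log encodes W = 2^-ewma_log. */
static void est_step(uint64_t bytes_now, uint64_t *last_bytes,
		     uint32_t *avbps, int idx, int ewma_log,
		     uint32_t *bps_out)
{
	/* bytes per second, pre-scaled by 2^5: (delta / 2^(idx-2)) * 32 */
	uint32_t rate = (uint32_t)((bytes_now - *last_bytes) << (7 - idx));

	*last_bytes = bytes_now;
	/* arithmetic right shift applies W = 2^-ewma_log, as the kernel assumes */
	*avbps += ((int32_t)rate - (int32_t)*avbps) >> ewma_log;
	*bps_out = (*avbps + 0xF) >> 5;        /* round, drop the 2^5 scaling */
}

int main(void)
{
	uint64_t last = 0, total = 0;
	uint32_t avbps = 0, bps = 0;
	int i;

	/* idx = 2: one-second period, ewma_log = 3: W = 1/8 */
	for (i = 0; i < 20; i++) {
		total += 125000;               /* steady 125 kB/s offered rate */
		est_step(total, &last, &avbps, 2, 3, &bps);
	}
	printf("estimated rate: %u bytes/sec\n", (unsigned int)bps);
	return 0;
}

With a steady 125 kB/s input the printed estimate climbs toward 125000, reaching it only asymptotically because of the 1/8 weight.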
130 | |||
131 | int qdisc_new_estimator(struct tc_stats *stats, spinlock_t *stats_lock, struct rtattr *opt) | ||
132 | { | ||
133 | struct qdisc_estimator *est; | ||
134 | struct tc_estimator *parm = RTA_DATA(opt); | ||
135 | |||
136 | if (RTA_PAYLOAD(opt) < sizeof(*parm)) | ||
137 | return -EINVAL; | ||
138 | |||
139 | if (parm->interval < -2 || parm->interval > 3) | ||
140 | return -EINVAL; | ||
141 | |||
142 | est = kmalloc(sizeof(*est), GFP_KERNEL); | ||
143 | if (est == NULL) | ||
144 | return -ENOBUFS; | ||
145 | |||
146 | memset(est, 0, sizeof(*est)); | ||
147 | est->interval = parm->interval + 2; | ||
148 | est->stats = stats; | ||
149 | est->stats_lock = stats_lock; | ||
150 | est->ewma_log = parm->ewma_log; | ||
151 | est->last_bytes = stats->bytes; | ||
152 | est->avbps = stats->bps<<5; | ||
153 | est->last_packets = stats->packets; | ||
154 | est->avpps = stats->pps<<10; | ||
155 | |||
156 | est->next = elist[est->interval].list; | ||
157 | if (est->next == NULL) { | ||
158 | init_timer(&elist[est->interval].timer); | ||
159 | elist[est->interval].timer.data = est->interval; | ||
160 | elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4); | ||
161 | elist[est->interval].timer.function = est_timer; | ||
162 | add_timer(&elist[est->interval].timer); | ||
163 | } | ||
164 | write_lock_bh(&est_lock); | ||
165 | elist[est->interval].list = est; | ||
166 | write_unlock_bh(&est_lock); | ||
167 | return 0; | ||
168 | } | ||
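Reading the interval handling above with concrete numbers: parm->interval = -2 maps to slot 0 and a timer period of HZ/4 jiffies (250 ms), parm->interval = 0 maps to slot 2 and a one-second period, and parm->interval = 3 maps to slot 5 and the 8 s maximum mentioned in the header comment.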
169 | |||
170 | void qdisc_kill_estimator(struct tc_stats *stats) | ||
171 | { | ||
172 | int idx; | ||
173 | struct qdisc_estimator *est, **pest; | ||
174 | |||
175 | for (idx=0; idx <= EST_MAX_INTERVAL; idx++) { | ||
176 | int killed = 0; | ||
177 | pest = &elist[idx].list; | ||
178 | while ((est=*pest) != NULL) { | ||
179 | if (est->stats != stats) { | ||
180 | pest = &est->next; | ||
181 | continue; | ||
182 | } | ||
183 | |||
184 | write_lock_bh(&est_lock); | ||
185 | *pest = est->next; | ||
186 | write_unlock_bh(&est_lock); | ||
187 | |||
188 | kfree(est); | ||
189 | killed++; | ||
190 | } | ||
191 | if (killed && elist[idx].list == NULL) | ||
192 | del_timer(&elist[idx].timer); | ||
193 | } | ||
194 | } | ||
195 | |||
196 | EXPORT_SYMBOL(qdisc_kill_estimator); | ||
197 | EXPORT_SYMBOL(qdisc_new_estimator); | ||
diff --git a/net/sched/gact.c b/net/sched/gact.c new file mode 100644 index 000000000000..a811c89fef7f --- /dev/null +++ b/net/sched/gact.c | |||
@@ -0,0 +1,231 @@ | |||
1 | /* | ||
2 | * net/sched/gact.c Generic actions | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * copyright Jamal Hadi Salim (2002-4) | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <asm/uaccess.h> | ||
14 | #include <asm/system.h> | ||
15 | #include <linux/bitops.h> | ||
16 | #include <linux/config.h> | ||
17 | #include <linux/types.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/sched.h> | ||
20 | #include <linux/string.h> | ||
21 | #include <linux/mm.h> | ||
22 | #include <linux/socket.h> | ||
23 | #include <linux/sockios.h> | ||
24 | #include <linux/in.h> | ||
25 | #include <linux/errno.h> | ||
26 | #include <linux/interrupt.h> | ||
27 | #include <linux/netdevice.h> | ||
28 | #include <linux/skbuff.h> | ||
29 | #include <linux/rtnetlink.h> | ||
30 | #include <linux/module.h> | ||
31 | #include <linux/init.h> | ||
32 | #include <linux/proc_fs.h> | ||
33 | #include <net/sock.h> | ||
34 | #include <net/pkt_sched.h> | ||
35 | #include <linux/tc_act/tc_gact.h> | ||
36 | #include <net/tc_act/tc_gact.h> | ||
37 | |||
38 | /* use generic hash table */ | ||
39 | #define MY_TAB_SIZE 16 | ||
40 | #define MY_TAB_MASK 15 | ||
41 | |||
42 | static u32 idx_gen; | ||
43 | static struct tcf_gact *tcf_gact_ht[MY_TAB_SIZE]; | ||
44 | static DEFINE_RWLOCK(gact_lock); | ||
45 | |||
46 | /* override the defaults */ | ||
47 | #define tcf_st tcf_gact | ||
48 | #define tc_st tc_gact | ||
49 | #define tcf_t_lock gact_lock | ||
50 | #define tcf_ht tcf_gact_ht | ||
51 | |||
52 | #define CONFIG_NET_ACT_INIT 1 | ||
53 | #include <net/pkt_act.h> | ||
54 | |||
55 | #ifdef CONFIG_GACT_PROB | ||
56 | static int gact_net_rand(struct tcf_gact *p) | ||
57 | { | ||
58 | if (net_random()%p->pval) | ||
59 | return p->action; | ||
60 | return p->paction; | ||
61 | } | ||
62 | |||
63 | static int gact_determ(struct tcf_gact *p) | ||
64 | { | ||
65 | if (p->bstats.packets%p->pval) | ||
66 | return p->action; | ||
67 | return p->paction; | ||
68 | } | ||
69 | |||
70 | typedef int (*g_rand)(struct tcf_gact *p); | ||
71 | static g_rand gact_rand[MAX_RAND]= { NULL, gact_net_rand, gact_determ }; | ||
72 | #endif | ||
73 | |||
74 | static int tcf_gact_init(struct rtattr *rta, struct rtattr *est, | ||
75 | struct tc_action *a, int ovr, int bind) | ||
76 | { | ||
77 | struct rtattr *tb[TCA_GACT_MAX]; | ||
78 | struct tc_gact *parm; | ||
79 | struct tcf_gact *p; | ||
80 | int ret = 0; | ||
81 | |||
82 | if (rta == NULL || rtattr_parse_nested(tb, TCA_GACT_MAX, rta) < 0) | ||
83 | return -EINVAL; | ||
84 | |||
85 | if (tb[TCA_GACT_PARMS - 1] == NULL || | ||
86 | RTA_PAYLOAD(tb[TCA_GACT_PARMS - 1]) < sizeof(*parm)) | ||
87 | return -EINVAL; | ||
88 | parm = RTA_DATA(tb[TCA_GACT_PARMS - 1]); | ||
89 | |||
90 | if (tb[TCA_GACT_PROB-1] != NULL) | ||
91 | #ifdef CONFIG_GACT_PROB | ||
92 | if (RTA_PAYLOAD(tb[TCA_GACT_PROB-1]) < sizeof(struct tc_gact_p)) | ||
93 | return -EINVAL; | ||
94 | #else | ||
95 | return -EOPNOTSUPP; | ||
96 | #endif | ||
97 | |||
98 | p = tcf_hash_check(parm->index, a, ovr, bind); | ||
99 | if (p == NULL) { | ||
100 | p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind); | ||
101 | if (p == NULL) | ||
102 | return -ENOMEM; | ||
103 | ret = ACT_P_CREATED; | ||
104 | } else { | ||
105 | if (!ovr) { | ||
106 | tcf_hash_release(p, bind); | ||
107 | return -EEXIST; | ||
108 | } | ||
109 | } | ||
110 | |||
111 | spin_lock_bh(&p->lock); | ||
112 | p->action = parm->action; | ||
113 | #ifdef CONFIG_GACT_PROB | ||
114 | if (tb[TCA_GACT_PROB-1] != NULL) { | ||
115 | struct tc_gact_p *p_parm = RTA_DATA(tb[TCA_GACT_PROB-1]); | ||
116 | p->paction = p_parm->paction; | ||
117 | p->pval = p_parm->pval; | ||
118 | p->ptype = p_parm->ptype; | ||
119 | } | ||
120 | #endif | ||
121 | spin_unlock_bh(&p->lock); | ||
122 | if (ret == ACT_P_CREATED) | ||
123 | tcf_hash_insert(p); | ||
124 | return ret; | ||
125 | } | ||
126 | |||
127 | static int | ||
128 | tcf_gact_cleanup(struct tc_action *a, int bind) | ||
129 | { | ||
130 | struct tcf_gact *p = PRIV(a, gact); | ||
131 | |||
132 | if (p != NULL) | ||
133 | return tcf_hash_release(p, bind); | ||
134 | return 0; | ||
135 | } | ||
136 | |||
137 | static int | ||
138 | tcf_gact(struct sk_buff **pskb, struct tc_action *a) | ||
139 | { | ||
140 | struct tcf_gact *p = PRIV(a, gact); | ||
141 | struct sk_buff *skb = *pskb; | ||
142 | int action = TC_ACT_SHOT; | ||
143 | |||
144 | spin_lock(&p->lock); | ||
145 | #ifdef CONFIG_GACT_PROB | ||
146 | if (p->ptype && gact_rand[p->ptype] != NULL) | ||
147 | action = gact_rand[p->ptype](p); | ||
148 | else | ||
149 | action = p->action; | ||
150 | #else | ||
151 | action = p->action; | ||
152 | #endif | ||
153 | p->bstats.bytes += skb->len; | ||
154 | p->bstats.packets++; | ||
155 | if (action == TC_ACT_SHOT) | ||
156 | p->qstats.drops++; | ||
157 | p->tm.lastuse = jiffies; | ||
158 | spin_unlock(&p->lock); | ||
159 | |||
160 | return action; | ||
161 | } | ||
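Under CONFIG_GACT_PROB, tcf_gact() chooses between two configured verdicts: gact_net_rand() returns p->paction with probability 1/pval, while gact_determ() returns it exactly whenever the running packet counter is a multiple of pval (the counter is incremented after the decision, so the very first packet already takes the alternate verdict). A small userspace model of the deterministic variant (structure and verdict values are invented for the sketch):

#include <stdio.h>

#define ACT_OK   0                     /* illustrative verdicts, not the kernel's values */
#define ACT_DROP 1

struct gact {
	int action;                    /* normal verdict */
	int paction;                   /* verdict taken on every pval-th packet */
	unsigned int pval;
	unsigned long packets;         /* stands in for bstats.packets */
};

static int gact_determ(const struct gact *g)
{
	return (g->packets % g->pval) ? g->action : g->paction;
}

int main(void)
{
	struct gact g = { .action = ACT_OK, .paction = ACT_DROP, .pval = 4 };
	int i;

	for (i = 0; i < 8; i++) {
		int verdict = gact_determ(&g);

		g.packets++;           /* as in tcf_gact(): counted after deciding */
		printf("packet %d -> %s\n", i, verdict == ACT_DROP ? "drop" : "ok");
	}
	return 0;                      /* drops packets 0 and 4, i.e. 1 in pval */
}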
162 | |||
163 | static int | ||
164 | tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) | ||
165 | { | ||
166 | unsigned char *b = skb->tail; | ||
167 | struct tc_gact opt; | ||
168 | struct tcf_gact *p = PRIV(a, gact); | ||
169 | struct tcf_t t; | ||
170 | |||
171 | opt.index = p->index; | ||
172 | opt.refcnt = p->refcnt - ref; | ||
173 | opt.bindcnt = p->bindcnt - bind; | ||
174 | opt.action = p->action; | ||
175 | RTA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt); | ||
176 | #ifdef CONFIG_GACT_PROB | ||
177 | if (p->ptype) { | ||
178 | struct tc_gact_p p_opt; | ||
179 | p_opt.paction = p->paction; | ||
180 | p_opt.pval = p->pval; | ||
181 | p_opt.ptype = p->ptype; | ||
182 | RTA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt); | ||
183 | } | ||
184 | #endif | ||
185 | t.install = jiffies_to_clock_t(jiffies - p->tm.install); | ||
186 | t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); | ||
187 | t.expires = jiffies_to_clock_t(p->tm.expires); | ||
188 | RTA_PUT(skb, TCA_GACT_TM, sizeof(t), &t); | ||
189 | return skb->len; | ||
190 | |||
191 | rtattr_failure: | ||
192 | skb_trim(skb, b - skb->data); | ||
193 | return -1; | ||
194 | } | ||
195 | |||
196 | static struct tc_action_ops act_gact_ops = { | ||
197 | .kind = "gact", | ||
198 | .type = TCA_ACT_GACT, | ||
199 | .capab = TCA_CAP_NONE, | ||
200 | .owner = THIS_MODULE, | ||
201 | .act = tcf_gact, | ||
202 | .dump = tcf_gact_dump, | ||
203 | .cleanup = tcf_gact_cleanup, | ||
204 | .lookup = tcf_hash_search, | ||
205 | .init = tcf_gact_init, | ||
206 | .walk = tcf_generic_walker | ||
207 | }; | ||
208 | |||
209 | MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); | ||
210 | MODULE_DESCRIPTION("Generic Classifier actions"); | ||
211 | MODULE_LICENSE("GPL"); | ||
212 | |||
213 | static int __init | ||
214 | gact_init_module(void) | ||
215 | { | ||
216 | #ifdef CONFIG_GACT_PROB | ||
217 | printk("GACT probability on\n"); | ||
218 | #else | ||
219 | printk("GACT probability NOT on\n"); | ||
220 | #endif | ||
221 | return tcf_register_action(&act_gact_ops); | ||
222 | } | ||
223 | |||
224 | static void __exit | ||
225 | gact_cleanup_module(void) | ||
226 | { | ||
227 | tcf_unregister_action(&act_gact_ops); | ||
228 | } | ||
229 | |||
230 | module_init(gact_init_module); | ||
231 | module_exit(gact_cleanup_module); | ||
diff --git a/net/sched/ipt.c b/net/sched/ipt.c new file mode 100644 index 000000000000..b114d994d523 --- /dev/null +++ b/net/sched/ipt.c | |||
@@ -0,0 +1,326 @@ | |||
1 | /* | ||
2 | * net/sched/ipt.c iptables target interface | ||
3 | * | ||
4 | * TODO: Add other tables. For now we only support the ipv4 table targets | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Copyright: Jamal Hadi Salim (2002-4) | ||
12 | */ | ||
13 | |||
14 | #include <asm/uaccess.h> | ||
15 | #include <asm/system.h> | ||
16 | #include <asm/bitops.h> | ||
17 | #include <linux/config.h> | ||
18 | #include <linux/types.h> | ||
19 | #include <linux/kernel.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/string.h> | ||
22 | #include <linux/mm.h> | ||
23 | #include <linux/socket.h> | ||
24 | #include <linux/sockios.h> | ||
25 | #include <linux/in.h> | ||
26 | #include <linux/errno.h> | ||
27 | #include <linux/interrupt.h> | ||
28 | #include <linux/netdevice.h> | ||
29 | #include <linux/skbuff.h> | ||
30 | #include <linux/rtnetlink.h> | ||
31 | #include <linux/module.h> | ||
32 | #include <linux/init.h> | ||
33 | #include <linux/proc_fs.h> | ||
34 | #include <linux/kmod.h> | ||
35 | #include <net/sock.h> | ||
36 | #include <net/pkt_sched.h> | ||
37 | #include <linux/tc_act/tc_ipt.h> | ||
38 | #include <net/tc_act/tc_ipt.h> | ||
39 | |||
40 | #include <linux/netfilter_ipv4/ip_tables.h> | ||
41 | |||
42 | /* use generic hash table */ | ||
43 | #define MY_TAB_SIZE 16 | ||
44 | #define MY_TAB_MASK 15 | ||
45 | |||
46 | static u32 idx_gen; | ||
47 | static struct tcf_ipt *tcf_ipt_ht[MY_TAB_SIZE]; | ||
48 | /* ipt hash table lock */ | ||
49 | static DEFINE_RWLOCK(ipt_lock); | ||
50 | |||
51 | /* override the defaults */ | ||
52 | #define tcf_st tcf_ipt | ||
53 | #define tcf_t_lock ipt_lock | ||
54 | #define tcf_ht tcf_ipt_ht | ||
55 | |||
56 | #define CONFIG_NET_ACT_INIT | ||
57 | #include <net/pkt_act.h> | ||
58 | |||
59 | static int | ||
60 | ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook) | ||
61 | { | ||
62 | struct ipt_target *target; | ||
63 | int ret = 0; | ||
64 | |||
65 | target = ipt_find_target(t->u.user.name, t->u.user.revision); | ||
66 | if (!target) | ||
67 | return -ENOENT; | ||
68 | |||
69 | DPRINTK("ipt_init_target: found %s\n", target->name); | ||
70 | t->u.kernel.target = target; | ||
71 | |||
72 | if (t->u.kernel.target->checkentry | ||
73 | && !t->u.kernel.target->checkentry(table, NULL, t->data, | ||
74 | t->u.target_size - sizeof(*t), | ||
75 | hook)) { | ||
76 | DPRINTK("ipt_init_target: check failed for `%s'.\n", | ||
77 | t->u.kernel.target->name); | ||
78 | module_put(t->u.kernel.target->me); | ||
79 | ret = -EINVAL; | ||
80 | } | ||
81 | |||
82 | return ret; | ||
83 | } | ||
84 | |||
85 | static void | ||
86 | ipt_destroy_target(struct ipt_entry_target *t) | ||
87 | { | ||
88 | if (t->u.kernel.target->destroy) | ||
89 | t->u.kernel.target->destroy(t->data, | ||
90 | t->u.target_size - sizeof(*t)); | ||
91 | module_put(t->u.kernel.target->me); | ||
92 | } | ||
93 | |||
94 | static int | ||
95 | tcf_ipt_release(struct tcf_ipt *p, int bind) | ||
96 | { | ||
97 | int ret = 0; | ||
98 | if (p) { | ||
99 | if (bind) | ||
100 | p->bindcnt--; | ||
101 | p->refcnt--; | ||
102 | if (p->bindcnt <= 0 && p->refcnt <= 0) { | ||
103 | ipt_destroy_target(p->t); | ||
104 | kfree(p->tname); | ||
105 | kfree(p->t); | ||
106 | tcf_hash_destroy(p); | ||
107 | ret = ACT_P_DELETED; | ||
108 | } | ||
109 | } | ||
110 | return ret; | ||
111 | } | ||
112 | |||
113 | static int | ||
114 | tcf_ipt_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a, | ||
115 | int ovr, int bind) | ||
116 | { | ||
117 | struct rtattr *tb[TCA_IPT_MAX]; | ||
118 | struct tcf_ipt *p; | ||
119 | struct ipt_entry_target *td, *t; | ||
120 | char *tname; | ||
121 | int ret = 0, err; | ||
122 | u32 hook = 0; | ||
123 | u32 index = 0; | ||
124 | |||
125 | if (rta == NULL || rtattr_parse_nested(tb, TCA_IPT_MAX, rta) < 0) | ||
126 | return -EINVAL; | ||
127 | |||
128 | if (tb[TCA_IPT_HOOK-1] == NULL || | ||
129 | RTA_PAYLOAD(tb[TCA_IPT_HOOK-1]) < sizeof(u32)) | ||
130 | return -EINVAL; | ||
131 | if (tb[TCA_IPT_TARG-1] == NULL || | ||
132 | RTA_PAYLOAD(tb[TCA_IPT_TARG-1]) < sizeof(*t)) | ||
133 | return -EINVAL; | ||
134 | td = (struct ipt_entry_target *)RTA_DATA(tb[TCA_IPT_TARG-1]); | ||
135 | if (RTA_PAYLOAD(tb[TCA_IPT_TARG-1]) < td->u.target_size) | ||
136 | return -EINVAL; | ||
137 | |||
138 | if (tb[TCA_IPT_INDEX-1] != NULL && | ||
139 | RTA_PAYLOAD(tb[TCA_IPT_INDEX-1]) >= sizeof(u32)) | ||
140 | index = *(u32 *)RTA_DATA(tb[TCA_IPT_INDEX-1]); | ||
141 | |||
142 | p = tcf_hash_check(index, a, ovr, bind); | ||
143 | if (p == NULL) { | ||
144 | p = tcf_hash_create(index, est, a, sizeof(*p), ovr, bind); | ||
145 | if (p == NULL) | ||
146 | return -ENOMEM; | ||
147 | ret = ACT_P_CREATED; | ||
148 | } else { | ||
149 | if (!ovr) { | ||
150 | tcf_ipt_release(p, bind); | ||
151 | return -EEXIST; | ||
152 | } | ||
153 | } | ||
154 | |||
155 | hook = *(u32 *)RTA_DATA(tb[TCA_IPT_HOOK-1]); | ||
156 | |||
157 | err = -ENOMEM; | ||
158 | tname = kmalloc(IFNAMSIZ, GFP_KERNEL); | ||
159 | if (tname == NULL) | ||
160 | goto err1; | ||
161 | if (tb[TCA_IPT_TABLE - 1] == NULL || | ||
162 | rtattr_strlcpy(tname, tb[TCA_IPT_TABLE-1], IFNAMSIZ) >= IFNAMSIZ) | ||
163 | strcpy(tname, "mangle"); | ||
164 | |||
165 | t = kmalloc(td->u.target_size, GFP_KERNEL); | ||
166 | if (t == NULL) | ||
167 | goto err2; | ||
168 | memcpy(t, td, td->u.target_size); | ||
169 | |||
170 | if ((err = ipt_init_target(t, tname, hook)) < 0) | ||
171 | goto err3; | ||
172 | |||
173 | spin_lock_bh(&p->lock); | ||
174 | if (ret != ACT_P_CREATED) { | ||
175 | ipt_destroy_target(p->t); | ||
176 | kfree(p->tname); | ||
177 | kfree(p->t); | ||
178 | } | ||
179 | p->tname = tname; | ||
180 | p->t = t; | ||
181 | p->hook = hook; | ||
182 | spin_unlock_bh(&p->lock); | ||
183 | if (ret == ACT_P_CREATED) | ||
184 | tcf_hash_insert(p); | ||
185 | return ret; | ||
186 | |||
187 | err3: | ||
188 | kfree(t); | ||
189 | err2: | ||
190 | kfree(tname); | ||
191 | err1: | ||
192 | kfree(p); | ||
193 | return err; | ||
194 | } | ||
195 | |||
196 | static int | ||
197 | tcf_ipt_cleanup(struct tc_action *a, int bind) | ||
198 | { | ||
199 | struct tcf_ipt *p = PRIV(a, ipt); | ||
200 | return tcf_ipt_release(p, bind); | ||
201 | } | ||
202 | |||
203 | static int | ||
204 | tcf_ipt(struct sk_buff **pskb, struct tc_action *a) | ||
205 | { | ||
206 | int ret = 0, result = 0; | ||
207 | struct tcf_ipt *p = PRIV(a, ipt); | ||
208 | struct sk_buff *skb = *pskb; | ||
209 | |||
210 | if (skb_cloned(skb)) { | ||
211 | if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) | ||
212 | return TC_ACT_UNSPEC; | ||
213 | } | ||
214 | |||
215 | spin_lock(&p->lock); | ||
216 | |||
217 | p->tm.lastuse = jiffies; | ||
218 | p->bstats.bytes += skb->len; | ||
219 | p->bstats.packets++; | ||
220 | |||
221 | /* yes, we have to worry about both in and out dev | ||
222 | worry later - danger - this API seems to have changed | ||
223 | from earlier kernels */ | ||
224 | |||
225 | ret = p->t->u.kernel.target->target(&skb, skb->dev, NULL, | ||
226 | p->hook, p->t->data, NULL); | ||
227 | switch (ret) { | ||
228 | case NF_ACCEPT: | ||
229 | result = TC_ACT_OK; | ||
230 | break; | ||
231 | case NF_DROP: | ||
232 | result = TC_ACT_SHOT; | ||
233 | p->qstats.drops++; | ||
234 | break; | ||
235 | case IPT_CONTINUE: | ||
236 | result = TC_ACT_PIPE; | ||
237 | break; | ||
238 | default: | ||
239 | if (net_ratelimit()) | ||
240 | printk("Bogus netfilter code %d assume ACCEPT\n", ret); | ||
241 | result = TC_POLICE_OK; | ||
242 | break; | ||
243 | } | ||
244 | spin_unlock(&p->lock); | ||
245 | return result; | ||
246 | |||
247 | } | ||
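The switch above is the whole translation layer between netfilter verdicts and traffic-control ones: NF_ACCEPT becomes TC_ACT_OK, NF_DROP becomes TC_ACT_SHOT (and bumps qstats.drops), IPT_CONTINUE becomes TC_ACT_PIPE so that later actions in the chain still run, and any unknown verdict is treated as an accept (TC_POLICE_OK) after a rate-limited warning.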
248 | |||
249 | static int | ||
250 | tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) | ||
251 | { | ||
252 | struct ipt_entry_target *t; | ||
253 | struct tcf_t tm; | ||
254 | struct tc_cnt c; | ||
255 | unsigned char *b = skb->tail; | ||
256 | struct tcf_ipt *p = PRIV(a, ipt); | ||
257 | |||
258 | /* for simple targets kernel size == user size | ||
259 | ** user name = target name | ||
260 | ** to be foolproof you should not assume this | ||
261 | */ | ||
262 | |||
263 | t = kmalloc(p->t->u.user.target_size, GFP_ATOMIC); | ||
264 | if (t == NULL) | ||
265 | goto rtattr_failure; | ||
266 | |||
267 | c.bindcnt = p->bindcnt - bind; | ||
268 | c.refcnt = p->refcnt - ref; | ||
269 | memcpy(t, p->t, p->t->u.user.target_size); | ||
270 | strcpy(t->u.user.name, p->t->u.kernel.target->name); | ||
271 | |||
272 | DPRINTK("\ttcf_ipt_dump tablename %s length %d\n", p->tname, | ||
273 | strlen(p->tname)); | ||
274 | DPRINTK("\tdump target name %s size %d size user %d " | ||
275 | "data[0] %x data[1] %x\n", p->t->u.kernel.target->name, | ||
276 | p->t->u.target_size, p->t->u.user.target_size, | ||
277 | p->t->data[0], p->t->data[1]); | ||
278 | RTA_PUT(skb, TCA_IPT_TARG, p->t->u.user.target_size, t); | ||
279 | RTA_PUT(skb, TCA_IPT_INDEX, 4, &p->index); | ||
280 | RTA_PUT(skb, TCA_IPT_HOOK, 4, &p->hook); | ||
281 | RTA_PUT(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c); | ||
282 | RTA_PUT(skb, TCA_IPT_TABLE, IFNAMSIZ, p->tname); | ||
283 | tm.install = jiffies_to_clock_t(jiffies - p->tm.install); | ||
284 | tm.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); | ||
285 | tm.expires = jiffies_to_clock_t(p->tm.expires); | ||
286 | RTA_PUT(skb, TCA_IPT_TM, sizeof (tm), &tm); | ||
287 | kfree(t); | ||
288 | return skb->len; | ||
289 | |||
290 | rtattr_failure: | ||
291 | skb_trim(skb, b - skb->data); | ||
292 | kfree(t); | ||
293 | return -1; | ||
294 | } | ||
295 | |||
296 | static struct tc_action_ops act_ipt_ops = { | ||
297 | .kind = "ipt", | ||
298 | .type = TCA_ACT_IPT, | ||
299 | .capab = TCA_CAP_NONE, | ||
300 | .owner = THIS_MODULE, | ||
301 | .act = tcf_ipt, | ||
302 | .dump = tcf_ipt_dump, | ||
303 | .cleanup = tcf_ipt_cleanup, | ||
304 | .lookup = tcf_hash_search, | ||
305 | .init = tcf_ipt_init, | ||
306 | .walk = tcf_generic_walker | ||
307 | }; | ||
308 | |||
309 | MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); | ||
310 | MODULE_DESCRIPTION("Iptables target actions"); | ||
311 | MODULE_LICENSE("GPL"); | ||
312 | |||
313 | static int __init | ||
314 | ipt_init_module(void) | ||
315 | { | ||
316 | return tcf_register_action(&act_ipt_ops); | ||
317 | } | ||
318 | |||
319 | static void __exit | ||
320 | ipt_cleanup_module(void) | ||
321 | { | ||
322 | tcf_unregister_action(&act_ipt_ops); | ||
323 | } | ||
324 | |||
325 | module_init(ipt_init_module); | ||
326 | module_exit(ipt_cleanup_module); | ||
diff --git a/net/sched/mirred.c b/net/sched/mirred.c new file mode 100644 index 000000000000..f309ce336803 --- /dev/null +++ b/net/sched/mirred.c | |||
@@ -0,0 +1,276 @@ | |||
1 | /* | ||
2 | * net/sched/mirred.c packet mirroring and redirect actions | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Jamal Hadi Salim (2002-4) | ||
10 | * | ||
11 | * TODO: Add ingress support (and socket redirect support) | ||
12 | * | ||
13 | */ | ||
14 | |||
15 | #include <asm/uaccess.h> | ||
16 | #include <asm/system.h> | ||
17 | #include <asm/bitops.h> | ||
18 | #include <linux/config.h> | ||
19 | #include <linux/types.h> | ||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/sched.h> | ||
22 | #include <linux/string.h> | ||
23 | #include <linux/mm.h> | ||
24 | #include <linux/socket.h> | ||
25 | #include <linux/sockios.h> | ||
26 | #include <linux/in.h> | ||
27 | #include <linux/errno.h> | ||
28 | #include <linux/interrupt.h> | ||
29 | #include <linux/netdevice.h> | ||
30 | #include <linux/skbuff.h> | ||
31 | #include <linux/rtnetlink.h> | ||
32 | #include <linux/module.h> | ||
33 | #include <linux/init.h> | ||
34 | #include <linux/proc_fs.h> | ||
35 | #include <net/sock.h> | ||
36 | #include <net/pkt_sched.h> | ||
37 | #include <linux/tc_act/tc_mirred.h> | ||
38 | #include <net/tc_act/tc_mirred.h> | ||
39 | |||
40 | #include <linux/etherdevice.h> | ||
41 | #include <linux/if_arp.h> | ||
42 | |||
43 | |||
44 | /* use generic hash table */ | ||
45 | #define MY_TAB_SIZE 8 | ||
46 | #define MY_TAB_MASK (MY_TAB_SIZE - 1) | ||
47 | static u32 idx_gen; | ||
48 | static struct tcf_mirred *tcf_mirred_ht[MY_TAB_SIZE]; | ||
49 | static DEFINE_RWLOCK(mirred_lock); | ||
50 | |||
51 | /* override the defaults */ | ||
52 | #define tcf_st tcf_mirred | ||
53 | #define tc_st tc_mirred | ||
54 | #define tcf_t_lock mirred_lock | ||
55 | #define tcf_ht tcf_mirred_ht | ||
56 | |||
57 | #define CONFIG_NET_ACT_INIT 1 | ||
58 | #include <net/pkt_act.h> | ||
59 | |||
60 | static inline int | ||
61 | tcf_mirred_release(struct tcf_mirred *p, int bind) | ||
62 | { | ||
63 | if (p) { | ||
64 | if (bind) | ||
65 | p->bindcnt--; | ||
66 | p->refcnt--; | ||
67 | if(!p->bindcnt && p->refcnt <= 0) { | ||
68 | dev_put(p->dev); | ||
69 | tcf_hash_destroy(p); | ||
70 | return 1; | ||
71 | } | ||
72 | } | ||
73 | return 0; | ||
74 | } | ||
75 | |||
76 | static int | ||
77 | tcf_mirred_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a, | ||
78 | int ovr, int bind) | ||
79 | { | ||
80 | struct rtattr *tb[TCA_MIRRED_MAX]; | ||
81 | struct tc_mirred *parm; | ||
82 | struct tcf_mirred *p; | ||
83 | struct net_device *dev = NULL; | ||
84 | int ret = 0; | ||
85 | int ok_push = 0; | ||
86 | |||
87 | if (rta == NULL || rtattr_parse_nested(tb, TCA_MIRRED_MAX, rta) < 0) | ||
88 | return -EINVAL; | ||
89 | |||
90 | if (tb[TCA_MIRRED_PARMS-1] == NULL || | ||
91 | RTA_PAYLOAD(tb[TCA_MIRRED_PARMS-1]) < sizeof(*parm)) | ||
92 | return -EINVAL; | ||
93 | parm = RTA_DATA(tb[TCA_MIRRED_PARMS-1]); | ||
94 | |||
95 | if (parm->ifindex) { | ||
96 | dev = __dev_get_by_index(parm->ifindex); | ||
97 | if (dev == NULL) | ||
98 | return -ENODEV; | ||
99 | switch (dev->type) { | ||
100 | case ARPHRD_TUNNEL: | ||
101 | case ARPHRD_TUNNEL6: | ||
102 | case ARPHRD_SIT: | ||
103 | case ARPHRD_IPGRE: | ||
104 | case ARPHRD_VOID: | ||
105 | case ARPHRD_NONE: | ||
106 | ok_push = 0; | ||
107 | break; | ||
108 | default: | ||
109 | ok_push = 1; | ||
110 | break; | ||
111 | } | ||
112 | } | ||
113 | |||
114 | p = tcf_hash_check(parm->index, a, ovr, bind); | ||
115 | if (p == NULL) { | ||
116 | if (!parm->ifindex) | ||
117 | return -EINVAL; | ||
118 | p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind); | ||
119 | if (p == NULL) | ||
120 | return -ENOMEM; | ||
121 | ret = ACT_P_CREATED; | ||
122 | } else { | ||
123 | if (!ovr) { | ||
124 | tcf_mirred_release(p, bind); | ||
125 | return -EEXIST; | ||
126 | } | ||
127 | } | ||
128 | |||
129 | spin_lock_bh(&p->lock); | ||
130 | p->action = parm->action; | ||
131 | p->eaction = parm->eaction; | ||
132 | if (parm->ifindex) { | ||
133 | p->ifindex = parm->ifindex; | ||
134 | if (ret != ACT_P_CREATED) | ||
135 | dev_put(p->dev); | ||
136 | p->dev = dev; | ||
137 | dev_hold(dev); | ||
138 | p->ok_push = ok_push; | ||
139 | } | ||
140 | spin_unlock_bh(&p->lock); | ||
141 | if (ret == ACT_P_CREATED) | ||
142 | tcf_hash_insert(p); | ||
143 | |||
144 | DPRINTK("tcf_mirred_init index %d action %d eaction %d device %s " | ||
145 | "ifindex %d\n", parm->index, parm->action, parm->eaction, | ||
146 | dev->name, parm->ifindex); | ||
147 | return ret; | ||
148 | } | ||
149 | |||
150 | static int | ||
151 | tcf_mirred_cleanup(struct tc_action *a, int bind) | ||
152 | { | ||
153 | struct tcf_mirred *p = PRIV(a, mirred); | ||
154 | |||
155 | if (p != NULL) | ||
156 | return tcf_mirred_release(p, bind); | ||
157 | return 0; | ||
158 | } | ||
159 | |||
160 | static int | ||
161 | tcf_mirred(struct sk_buff **pskb, struct tc_action *a) | ||
162 | { | ||
163 | struct tcf_mirred *p = PRIV(a, mirred); | ||
164 | struct net_device *dev; | ||
165 | struct sk_buff *skb2 = NULL; | ||
166 | struct sk_buff *skb = *pskb; | ||
167 | u32 at = G_TC_AT(skb->tc_verd); | ||
168 | |||
169 | spin_lock(&p->lock); | ||
170 | |||
171 | dev = p->dev; | ||
172 | p->tm.lastuse = jiffies; | ||
173 | |||
174 | if (!(dev->flags&IFF_UP) ) { | ||
175 | if (net_ratelimit()) | ||
176 | printk("mirred to Houston: device %s is gone!\n", | ||
177 | dev->name); | ||
178 | bad_mirred: | ||
179 | if (skb2 != NULL) | ||
180 | kfree_skb(skb2); | ||
181 | p->qstats.overlimits++; | ||
182 | p->bstats.bytes += skb->len; | ||
183 | p->bstats.packets++; | ||
184 | spin_unlock(&p->lock); | ||
185 | /* should we be asking for packet to be dropped? | ||
186 | * may make sense for redirect case only | ||
187 | */ | ||
188 | return TC_ACT_SHOT; | ||
189 | } | ||
190 | |||
191 | skb2 = skb_clone(skb, GFP_ATOMIC); | ||
192 | if (skb2 == NULL) | ||
193 | goto bad_mirred; | ||
194 | if (p->eaction != TCA_EGRESS_MIRROR && p->eaction != TCA_EGRESS_REDIR) { | ||
195 | if (net_ratelimit()) | ||
196 | printk("tcf_mirred unknown action %d\n", p->eaction); | ||
197 | goto bad_mirred; | ||
198 | } | ||
199 | |||
200 | p->bstats.bytes += skb2->len; | ||
201 | p->bstats.packets++; | ||
202 | if (!(at & AT_EGRESS)) | ||
203 | if (p->ok_push) | ||
204 | skb_push(skb2, skb2->dev->hard_header_len); | ||
205 | |||
206 | /* mirror is always swallowed */ | ||
207 | if (p->eaction != TCA_EGRESS_MIRROR) | ||
208 | skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at); | ||
209 | |||
210 | skb2->dev = dev; | ||
211 | skb2->input_dev = skb->dev; | ||
212 | dev_queue_xmit(skb2); | ||
213 | spin_unlock(&p->lock); | ||
214 | return p->action; | ||
215 | } | ||
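A note on the fast path above: in this version both the mirror and the redirect case transmit a clone on the target device and return the configured p->action for the original packet; the differences are in the tc_verd marking (only the redirect records the original direction via SET_TC_FROM) and in the optional skb_push(), which restores the link-layer header when the action runs somewhere other than egress and the target device type expects one (ok_push).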
216 | |||
217 | static int | ||
218 | tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) | ||
219 | { | ||
220 | unsigned char *b = skb->tail; | ||
221 | struct tc_mirred opt; | ||
222 | struct tcf_mirred *p = PRIV(a, mirred); | ||
223 | struct tcf_t t; | ||
224 | |||
225 | opt.index = p->index; | ||
226 | opt.action = p->action; | ||
227 | opt.refcnt = p->refcnt - ref; | ||
228 | opt.bindcnt = p->bindcnt - bind; | ||
229 | opt.eaction = p->eaction; | ||
230 | opt.ifindex = p->ifindex; | ||
231 | DPRINTK("tcf_mirred_dump index %d action %d eaction %d ifindex %d\n", | ||
232 | p->index, p->action, p->eaction, p->ifindex); | ||
233 | RTA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt); | ||
234 | t.install = jiffies_to_clock_t(jiffies - p->tm.install); | ||
235 | t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); | ||
236 | t.expires = jiffies_to_clock_t(p->tm.expires); | ||
237 | RTA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t); | ||
238 | return skb->len; | ||
239 | |||
240 | rtattr_failure: | ||
241 | skb_trim(skb, b - skb->data); | ||
242 | return -1; | ||
243 | } | ||
244 | |||
245 | static struct tc_action_ops act_mirred_ops = { | ||
246 | .kind = "mirred", | ||
247 | .type = TCA_ACT_MIRRED, | ||
248 | .capab = TCA_CAP_NONE, | ||
249 | .owner = THIS_MODULE, | ||
250 | .act = tcf_mirred, | ||
251 | .dump = tcf_mirred_dump, | ||
252 | .cleanup = tcf_mirred_cleanup, | ||
253 | .lookup = tcf_hash_search, | ||
254 | .init = tcf_mirred_init, | ||
255 | .walk = tcf_generic_walker | ||
256 | }; | ||
257 | |||
258 | MODULE_AUTHOR("Jamal Hadi Salim(2002)"); | ||
259 | MODULE_DESCRIPTION("Device Mirror/redirect actions"); | ||
260 | MODULE_LICENSE("GPL"); | ||
261 | |||
262 | static int __init | ||
263 | mirred_init_module(void) | ||
264 | { | ||
265 | printk("Mirror/redirect action on\n"); | ||
266 | return tcf_register_action(&act_mirred_ops); | ||
267 | } | ||
268 | |||
269 | static void __exit | ||
270 | mirred_cleanup_module(void) | ||
271 | { | ||
272 | tcf_unregister_action(&act_mirred_ops); | ||
273 | } | ||
274 | |||
275 | module_init(mirred_init_module); | ||
276 | module_exit(mirred_cleanup_module); | ||
diff --git a/net/sched/pedit.c b/net/sched/pedit.c new file mode 100644 index 000000000000..678be6a645fb --- /dev/null +++ b/net/sched/pedit.c | |||
@@ -0,0 +1,288 @@ | |||
1 | /* | ||
2 | * net/sched/pedit.c Generic packet editor | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Jamal Hadi Salim (2002-4) | ||
10 | */ | ||
11 | |||
12 | #include <asm/uaccess.h> | ||
13 | #include <asm/system.h> | ||
14 | #include <asm/bitops.h> | ||
15 | #include <linux/config.h> | ||
16 | #include <linux/types.h> | ||
17 | #include <linux/kernel.h> | ||
18 | #include <linux/sched.h> | ||
19 | #include <linux/string.h> | ||
20 | #include <linux/mm.h> | ||
21 | #include <linux/socket.h> | ||
22 | #include <linux/sockios.h> | ||
23 | #include <linux/in.h> | ||
24 | #include <linux/errno.h> | ||
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/netdevice.h> | ||
27 | #include <linux/skbuff.h> | ||
28 | #include <linux/rtnetlink.h> | ||
29 | #include <linux/module.h> | ||
30 | #include <linux/init.h> | ||
31 | #include <linux/proc_fs.h> | ||
32 | #include <net/sock.h> | ||
33 | #include <net/pkt_sched.h> | ||
34 | #include <linux/tc_act/tc_pedit.h> | ||
35 | #include <net/tc_act/tc_pedit.h> | ||
36 | |||
37 | |||
38 | #define PEDIT_DEB 1 | ||
39 | |||
40 | /* use generic hash table */ | ||
41 | #define MY_TAB_SIZE 16 | ||
42 | #define MY_TAB_MASK 15 | ||
43 | static u32 idx_gen; | ||
44 | static struct tcf_pedit *tcf_pedit_ht[MY_TAB_SIZE]; | ||
45 | static DEFINE_RWLOCK(pedit_lock); | ||
46 | |||
47 | #define tcf_st tcf_pedit | ||
48 | #define tc_st tc_pedit | ||
49 | #define tcf_t_lock pedit_lock | ||
50 | #define tcf_ht tcf_pedit_ht | ||
51 | |||
52 | #define CONFIG_NET_ACT_INIT 1 | ||
53 | #include <net/pkt_act.h> | ||
54 | |||
55 | static int | ||
56 | tcf_pedit_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a, | ||
57 | int ovr, int bind) | ||
58 | { | ||
59 | struct rtattr *tb[TCA_PEDIT_MAX]; | ||
60 | struct tc_pedit *parm; | ||
61 | int ret = 0; | ||
62 | struct tcf_pedit *p; | ||
63 | struct tc_pedit_key *keys = NULL; | ||
64 | int ksize; | ||
65 | |||
66 | if (rta == NULL || rtattr_parse_nested(tb, TCA_PEDIT_MAX, rta) < 0) | ||
67 | return -EINVAL; | ||
68 | |||
69 | if (tb[TCA_PEDIT_PARMS - 1] == NULL || | ||
70 | RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm)) | ||
71 | return -EINVAL; | ||
72 | parm = RTA_DATA(tb[TCA_PEDIT_PARMS-1]); | ||
73 | ksize = parm->nkeys * sizeof(struct tc_pedit_key); | ||
74 | if (RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm) + ksize) | ||
75 | return -EINVAL; | ||
76 | |||
77 | p = tcf_hash_check(parm->index, a, ovr, bind); | ||
78 | if (p == NULL) { | ||
79 | if (!parm->nkeys) | ||
80 | return -EINVAL; | ||
81 | p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind); | ||
82 | if (p == NULL) | ||
83 | return -ENOMEM; | ||
84 | keys = kmalloc(ksize, GFP_KERNEL); | ||
85 | if (keys == NULL) { | ||
86 | kfree(p); | ||
87 | return -ENOMEM; | ||
88 | } | ||
89 | ret = ACT_P_CREATED; | ||
90 | } else { | ||
91 | if (!ovr) { | ||
92 | tcf_hash_release(p, bind); | ||
93 | return -EEXIST; | ||
94 | } | ||
95 | if (p->nkeys && p->nkeys != parm->nkeys) { | ||
96 | keys = kmalloc(ksize, GFP_KERNEL); | ||
97 | if (keys == NULL) | ||
98 | return -ENOMEM; | ||
99 | } | ||
100 | } | ||
101 | |||
102 | spin_lock_bh(&p->lock); | ||
103 | p->flags = parm->flags; | ||
104 | p->action = parm->action; | ||
105 | if (keys) { | ||
106 | kfree(p->keys); | ||
107 | p->keys = keys; | ||
108 | p->nkeys = parm->nkeys; | ||
109 | } | ||
110 | memcpy(p->keys, parm->keys, ksize); | ||
111 | spin_unlock_bh(&p->lock); | ||
112 | if (ret == ACT_P_CREATED) | ||
113 | tcf_hash_insert(p); | ||
114 | return ret; | ||
115 | } | ||
116 | |||
117 | static int | ||
118 | tcf_pedit_cleanup(struct tc_action *a, int bind) | ||
119 | { | ||
120 | struct tcf_pedit *p = PRIV(a, pedit); | ||
121 | |||
122 | if (p != NULL) { | ||
123 | struct tc_pedit_key *keys = p->keys; | ||
124 | if (tcf_hash_release(p, bind)) { | ||
125 | kfree(keys); | ||
126 | return 1; | ||
127 | } | ||
128 | } | ||
129 | return 0; | ||
130 | } | ||
131 | |||
132 | static int | ||
133 | tcf_pedit(struct sk_buff **pskb, struct tc_action *a) | ||
134 | { | ||
135 | struct tcf_pedit *p = PRIV(a, pedit); | ||
136 | struct sk_buff *skb = *pskb; | ||
137 | int i, munged = 0; | ||
138 | u8 *pptr; | ||
139 | |||
140 | if (!(skb->tc_verd & TC_OK2MUNGE)) { | ||
141 | /* should we set skb->cloned? */ | ||
142 | if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { | ||
143 | return p->action; | ||
144 | } | ||
145 | } | ||
146 | |||
147 | pptr = skb->nh.raw; | ||
148 | |||
149 | spin_lock(&p->lock); | ||
150 | |||
151 | p->tm.lastuse = jiffies; | ||
152 | |||
153 | if (p->nkeys > 0) { | ||
154 | struct tc_pedit_key *tkey = p->keys; | ||
155 | |||
156 | for (i = p->nkeys; i > 0; i--, tkey++) { | ||
157 | u32 *ptr; | ||
158 | int offset = tkey->off; | ||
159 | |||
160 | if (tkey->offmask) { | ||
161 | if (skb->len > tkey->at) { | ||
162 | char *j = pptr + tkey->at; | ||
163 | offset += ((*j & tkey->offmask) >> | ||
164 | tkey->shift); | ||
165 | } else { | ||
166 | goto bad; | ||
167 | } | ||
168 | } | ||
169 | |||
170 | if (offset % 4) { | ||
171 | printk("offset must be on 32 bit boundaries\n"); | ||
172 | goto bad; | ||
173 | } | ||
174 | if (skb->len < 0 || (offset > 0 && offset > skb->len)) { | ||
175 | printk("offset %d cant exceed pkt length %d\n", | ||
176 | offset, skb->len); | ||
177 | goto bad; | ||
178 | } | ||
179 | |||
180 | ptr = (u32 *)(pptr+offset); | ||
181 | /* just do it, baby */ | ||
182 | *ptr = ((*ptr & tkey->mask) ^ tkey->val); | ||
183 | munged++; | ||
184 | } | ||
185 | |||
186 | if (munged) | ||
187 | skb->tc_verd = SET_TC_MUNGED(skb->tc_verd); | ||
188 | goto done; | ||
189 | } else { | ||
190 | printk("pedit BUG: index %d\n",p->index); | ||
191 | } | ||
192 | |||
193 | bad: | ||
194 | p->qstats.overlimits++; | ||
195 | done: | ||
196 | p->bstats.bytes += skb->len; | ||
197 | p->bstats.packets++; | ||
198 | spin_unlock(&p->lock); | ||
199 | return p->action; | ||
200 | } | ||
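Each tc_pedit_key describes one 32-bit word to rewrite: off locates the word relative to the network header, the new word is computed as (*ptr & mask) ^ val, and the optional at/offmask/shift triple lets the offset be adjusted by a byte read from the packet itself (useful, for example, behind a variable-length header). A stand-alone userspace sketch of the word rewrite (buffer layout and names are invented for the sketch):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* One pedit-style edit: keep the bits selected by mask, then XOR in val. */
static void pedit_word(uint8_t *pkt, int offset, uint32_t mask, uint32_t val)
{
	uint32_t word;

	memcpy(&word, pkt + offset, sizeof(word));   /* sidestep alignment issues */
	word = (word & mask) ^ val;
	memcpy(pkt + offset, &word, sizeof(word));
}

int main(void)
{
	uint8_t pkt[12] = { 0 };                     /* 12 bytes of pretend header */
	uint32_t out;

	/* mask = 0 keeps none of the old bits, so val supplies the whole word */
	pedit_word(pkt, 4, 0x00000000, 0xdeadbeefu);

	memcpy(&out, pkt + 4, sizeof(out));
	printf("word at offset 4: 0x%08x\n", (unsigned int)out);
	return 0;
}

Setting mask to all ones and val to a single bit would instead flip just that bit, which is the other common use of the mask/val pair.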
201 | |||
202 | static int | ||
203 | tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,int bind, int ref) | ||
204 | { | ||
205 | unsigned char *b = skb->tail; | ||
206 | struct tc_pedit *opt; | ||
207 | struct tcf_pedit *p = PRIV(a, pedit); | ||
208 | struct tcf_t t; | ||
209 | int s; | ||
210 | |||
211 | s = sizeof(*opt) + p->nkeys * sizeof(struct tc_pedit_key); | ||
212 | |||
213 | /* netlink spinlocks held above us - must use ATOMIC */ | ||
214 | opt = kmalloc(s, GFP_ATOMIC); | ||
215 | if (opt == NULL) | ||
216 | return -ENOBUFS; | ||
217 | memset(opt, 0, s); | ||
218 | |||
219 | memcpy(opt->keys, p->keys, p->nkeys * sizeof(struct tc_pedit_key)); | ||
220 | opt->index = p->index; | ||
221 | opt->nkeys = p->nkeys; | ||
222 | opt->flags = p->flags; | ||
223 | opt->action = p->action; | ||
224 | opt->refcnt = p->refcnt - ref; | ||
225 | opt->bindcnt = p->bindcnt - bind; | ||
226 | |||
227 | |||
228 | #ifdef PEDIT_DEB | ||
229 | { | ||
230 | /* Debug - get rid of later */ | ||
231 | int i; | ||
232 | struct tc_pedit_key *key = opt->keys; | ||
233 | |||
234 | for (i=0; i<opt->nkeys; i++, key++) { | ||
235 | printk( "\n key #%d",i); | ||
236 | printk( " at %d: val %08x mask %08x", | ||
237 | (unsigned int)key->off, | ||
238 | (unsigned int)key->val, | ||
239 | (unsigned int)key->mask); | ||
240 | } | ||
241 | } | ||
242 | #endif | ||
243 | |||
244 | RTA_PUT(skb, TCA_PEDIT_PARMS, s, opt); | ||
245 | t.install = jiffies_to_clock_t(jiffies - p->tm.install); | ||
246 | t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); | ||
247 | t.expires = jiffies_to_clock_t(p->tm.expires); | ||
248 | RTA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t); | ||
249 | return skb->len; | ||
250 | |||
251 | rtattr_failure: | ||
252 | skb_trim(skb, b - skb->data); | ||
253 | return -1; | ||
254 | } | ||
255 | |||
256 | static | ||
257 | struct tc_action_ops act_pedit_ops = { | ||
258 | .kind = "pedit", | ||
259 | .type = TCA_ACT_PEDIT, | ||
260 | .capab = TCA_CAP_NONE, | ||
261 | .owner = THIS_MODULE, | ||
262 | .act = tcf_pedit, | ||
263 | .dump = tcf_pedit_dump, | ||
264 | .cleanup = tcf_pedit_cleanup, | ||
265 | .lookup = tcf_hash_search, | ||
266 | .init = tcf_pedit_init, | ||
267 | .walk = tcf_generic_walker | ||
268 | }; | ||
269 | |||
270 | MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); | ||
271 | MODULE_DESCRIPTION("Generic Packet Editor actions"); | ||
272 | MODULE_LICENSE("GPL"); | ||
273 | |||
274 | static int __init | ||
275 | pedit_init_module(void) | ||
276 | { | ||
277 | return tcf_register_action(&act_pedit_ops); | ||
278 | } | ||
279 | |||
280 | static void __exit | ||
281 | pedit_cleanup_module(void) | ||
282 | { | ||
283 | tcf_unregister_action(&act_pedit_ops); | ||
284 | } | ||
285 | |||
286 | module_init(pedit_init_module); | ||
287 | module_exit(pedit_cleanup_module); | ||
288 | |||
diff --git a/net/sched/police.c b/net/sched/police.c new file mode 100644 index 000000000000..c03545faf523 --- /dev/null +++ b/net/sched/police.c | |||
@@ -0,0 +1,612 @@ | |||
1 | /* | ||
2 | * net/sched/police.c Input police filter. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
10 | * J Hadi Salim (action changes) | ||
11 | */ | ||
12 | |||
13 | #include <asm/uaccess.h> | ||
14 | #include <asm/system.h> | ||
15 | #include <linux/bitops.h> | ||
16 | #include <linux/config.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/types.h> | ||
19 | #include <linux/kernel.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/string.h> | ||
22 | #include <linux/mm.h> | ||
23 | #include <linux/socket.h> | ||
24 | #include <linux/sockios.h> | ||
25 | #include <linux/in.h> | ||
26 | #include <linux/errno.h> | ||
27 | #include <linux/interrupt.h> | ||
28 | #include <linux/netdevice.h> | ||
29 | #include <linux/skbuff.h> | ||
30 | #include <linux/module.h> | ||
31 | #include <linux/rtnetlink.h> | ||
32 | #include <linux/init.h> | ||
33 | #include <net/sock.h> | ||
34 | #include <net/act_api.h> | ||
35 | |||
36 | #define L2T(p,L) ((p)->R_tab->data[(L)>>(p)->R_tab->rate.cell_log]) | ||
37 | #define L2T_P(p,L) ((p)->P_tab->data[(L)>>(p)->P_tab->rate.cell_log]) | ||
38 | #define PRIV(a) ((struct tcf_police *) (a)->priv) | ||
39 | |||
40 | /* use generic hash table */ | ||
41 | #define MY_TAB_SIZE 16 | ||
42 | #define MY_TAB_MASK 15 | ||
43 | static u32 idx_gen; | ||
44 | static struct tcf_police *tcf_police_ht[MY_TAB_SIZE]; | ||
45 | /* Policer hash table lock */ | ||
46 | static DEFINE_RWLOCK(police_lock); | ||
47 | |||
48 | /* Each policer is serialized by its individual spinlock */ | ||
49 | |||
50 | static __inline__ unsigned tcf_police_hash(u32 index) | ||
51 | { | ||
52 | return index&0xF; | ||
53 | } | ||
54 | |||
55 | static __inline__ struct tcf_police * tcf_police_lookup(u32 index) | ||
56 | { | ||
57 | struct tcf_police *p; | ||
58 | |||
59 | read_lock(&police_lock); | ||
60 | for (p = tcf_police_ht[tcf_police_hash(index)]; p; p = p->next) { | ||
61 | if (p->index == index) | ||
62 | break; | ||
63 | } | ||
64 | read_unlock(&police_lock); | ||
65 | return p; | ||
66 | } | ||
67 | |||
68 | #ifdef CONFIG_NET_CLS_ACT | ||
69 | static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, | ||
70 | int type, struct tc_action *a) | ||
71 | { | ||
72 | struct tcf_police *p; | ||
73 | int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; | ||
74 | struct rtattr *r; | ||
75 | |||
76 | read_lock(&police_lock); | ||
77 | |||
78 | s_i = cb->args[0]; | ||
79 | |||
80 | for (i = 0; i < MY_TAB_SIZE; i++) { | ||
81 | p = tcf_police_ht[tcf_police_hash(i)]; | ||
82 | |||
83 | for (; p; p = p->next) { | ||
84 | index++; | ||
85 | if (index < s_i) | ||
86 | continue; | ||
87 | a->priv = p; | ||
88 | a->order = index; | ||
89 | r = (struct rtattr*) skb->tail; | ||
90 | RTA_PUT(skb, a->order, 0, NULL); | ||
91 | if (type == RTM_DELACTION) | ||
92 | err = tcf_action_dump_1(skb, a, 0, 1); | ||
93 | else | ||
94 | err = tcf_action_dump_1(skb, a, 0, 0); | ||
95 | if (err < 0) { | ||
96 | index--; | ||
97 | skb_trim(skb, (u8*)r - skb->data); | ||
98 | goto done; | ||
99 | } | ||
100 | r->rta_len = skb->tail - (u8*)r; | ||
101 | n_i++; | ||
102 | } | ||
103 | } | ||
104 | done: | ||
105 | read_unlock(&police_lock); | ||
106 | if (n_i) | ||
107 | cb->args[0] += n_i; | ||
108 | return n_i; | ||
109 | |||
110 | rtattr_failure: | ||
111 | skb_trim(skb, (u8*)r - skb->data); | ||
112 | goto done; | ||
113 | } | ||
114 | |||
115 | static inline int | ||
116 | tcf_hash_search(struct tc_action *a, u32 index) | ||
117 | { | ||
118 | struct tcf_police *p = tcf_police_lookup(index); | ||
119 | |||
120 | if (p != NULL) { | ||
121 | a->priv = p; | ||
122 | return 1; | ||
123 | } else { | ||
124 | return 0; | ||
125 | } | ||
126 | } | ||
127 | #endif | ||
128 | |||
129 | static inline u32 tcf_police_new_index(void) | ||
130 | { | ||
131 | do { | ||
132 | if (++idx_gen == 0) | ||
133 | idx_gen = 1; | ||
134 | } while (tcf_police_lookup(idx_gen)); | ||
135 | |||
136 | return idx_gen; | ||
137 | } | ||
138 | |||
139 | void tcf_police_destroy(struct tcf_police *p) | ||
140 | { | ||
141 | unsigned h = tcf_police_hash(p->index); | ||
142 | struct tcf_police **p1p; | ||
143 | |||
144 | for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->next) { | ||
145 | if (*p1p == p) { | ||
146 | write_lock_bh(&police_lock); | ||
147 | *p1p = p->next; | ||
148 | write_unlock_bh(&police_lock); | ||
149 | #ifdef CONFIG_NET_ESTIMATOR | ||
150 | gen_kill_estimator(&p->bstats, &p->rate_est); | ||
151 | #endif | ||
152 | if (p->R_tab) | ||
153 | qdisc_put_rtab(p->R_tab); | ||
154 | if (p->P_tab) | ||
155 | qdisc_put_rtab(p->P_tab); | ||
156 | kfree(p); | ||
157 | return; | ||
158 | } | ||
159 | } | ||
160 | BUG_TRAP(0); | ||
161 | } | ||
162 | |||
163 | #ifdef CONFIG_NET_CLS_ACT | ||
164 | static int tcf_act_police_locate(struct rtattr *rta, struct rtattr *est, | ||
165 | struct tc_action *a, int ovr, int bind) | ||
166 | { | ||
167 | unsigned h; | ||
168 | int ret = 0, err; | ||
169 | struct rtattr *tb[TCA_POLICE_MAX]; | ||
170 | struct tc_police *parm; | ||
171 | struct tcf_police *p; | ||
172 | struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; | ||
173 | |||
174 | if (rta == NULL || rtattr_parse_nested(tb, TCA_POLICE_MAX, rta) < 0) | ||
175 | return -EINVAL; | ||
176 | |||
177 | if (tb[TCA_POLICE_TBF-1] == NULL || | ||
178 | RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]) != sizeof(*parm)) | ||
179 | return -EINVAL; | ||
180 | parm = RTA_DATA(tb[TCA_POLICE_TBF-1]); | ||
181 | |||
182 | if (tb[TCA_POLICE_RESULT-1] != NULL && | ||
183 | RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32)) | ||
184 | return -EINVAL; | ||
185 | if (tb[TCA_POLICE_RESULT-1] != NULL && | ||
186 | RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32)) | ||
187 | return -EINVAL; | ||
188 | |||
189 | if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) { | ||
190 | a->priv = p; | ||
191 | if (bind) { | ||
192 | p->bindcnt += 1; | ||
193 | p->refcnt += 1; | ||
194 | } | ||
195 | if (ovr) | ||
196 | goto override; | ||
197 | return ret; | ||
198 | } | ||
199 | |||
200 | p = kmalloc(sizeof(*p), GFP_KERNEL); | ||
201 | if (p == NULL) | ||
202 | return -ENOMEM; | ||
203 | memset(p, 0, sizeof(*p)); | ||
204 | |||
205 | ret = ACT_P_CREATED; | ||
206 | p->refcnt = 1; | ||
207 | spin_lock_init(&p->lock); | ||
208 | p->stats_lock = &p->lock; | ||
209 | if (bind) | ||
210 | p->bindcnt = 1; | ||
211 | override: | ||
212 | if (parm->rate.rate) { | ||
213 | err = -ENOMEM; | ||
214 | R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1]); | ||
215 | if (R_tab == NULL) | ||
216 | goto failure; | ||
217 | if (parm->peakrate.rate) { | ||
218 | P_tab = qdisc_get_rtab(&parm->peakrate, | ||
219 | tb[TCA_POLICE_PEAKRATE-1]); | ||
220 | if (P_tab == NULL) { | ||
221 | qdisc_put_rtab(R_tab); | ||
222 | goto failure; | ||
223 | } | ||
224 | } | ||
225 | } | ||
226 | /* No failure allowed after this point */ | ||
227 | spin_lock_bh(&p->lock); | ||
228 | if (R_tab != NULL) { | ||
229 | qdisc_put_rtab(p->R_tab); | ||
230 | p->R_tab = R_tab; | ||
231 | } | ||
232 | if (P_tab != NULL) { | ||
233 | qdisc_put_rtab(p->P_tab); | ||
234 | p->P_tab = P_tab; | ||
235 | } | ||
236 | |||
237 | if (tb[TCA_POLICE_RESULT-1]) | ||
238 | p->result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]); | ||
239 | p->toks = p->burst = parm->burst; | ||
240 | p->mtu = parm->mtu; | ||
241 | if (p->mtu == 0) { | ||
242 | p->mtu = ~0; | ||
243 | if (p->R_tab) | ||
244 | p->mtu = 255<<p->R_tab->rate.cell_log; | ||
245 | } | ||
246 | if (p->P_tab) | ||
247 | p->ptoks = L2T_P(p, p->mtu); | ||
248 | p->action = parm->action; | ||
249 | |||
250 | #ifdef CONFIG_NET_ESTIMATOR | ||
251 | if (tb[TCA_POLICE_AVRATE-1]) | ||
252 | p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]); | ||
253 | if (est) | ||
254 | gen_replace_estimator(&p->bstats, &p->rate_est, p->stats_lock, est); | ||
255 | #endif | ||
256 | |||
257 | spin_unlock_bh(&p->lock); | ||
258 | if (ret != ACT_P_CREATED) | ||
259 | return ret; | ||
260 | |||
261 | PSCHED_GET_TIME(p->t_c); | ||
262 | p->index = parm->index ? : tcf_police_new_index(); | ||
263 | h = tcf_police_hash(p->index); | ||
264 | write_lock_bh(&police_lock); | ||
265 | p->next = tcf_police_ht[h]; | ||
266 | tcf_police_ht[h] = p; | ||
267 | write_unlock_bh(&police_lock); | ||
268 | |||
269 | a->priv = p; | ||
270 | return ret; | ||
271 | |||
272 | failure: | ||
273 | if (ret == ACT_P_CREATED) | ||
274 | kfree(p); | ||
275 | return err; | ||
276 | } | ||
277 | |||
278 | static int tcf_act_police_cleanup(struct tc_action *a, int bind) | ||
279 | { | ||
280 | struct tcf_police *p = PRIV(a); | ||
281 | |||
282 | if (p != NULL) | ||
283 | return tcf_police_release(p, bind); | ||
284 | return 0; | ||
285 | } | ||
286 | |||
287 | static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a) | ||
288 | { | ||
289 | psched_time_t now; | ||
290 | struct sk_buff *skb = *pskb; | ||
291 | struct tcf_police *p = PRIV(a); | ||
292 | long toks; | ||
293 | long ptoks = 0; | ||
294 | |||
295 | spin_lock(&p->lock); | ||
296 | |||
297 | p->bstats.bytes += skb->len; | ||
298 | p->bstats.packets++; | ||
299 | |||
300 | #ifdef CONFIG_NET_ESTIMATOR | ||
301 | if (p->ewma_rate && p->rate_est.bps >= p->ewma_rate) { | ||
302 | p->qstats.overlimits++; | ||
303 | spin_unlock(&p->lock); | ||
304 | return p->action; | ||
305 | } | ||
306 | #endif | ||
307 | |||
308 | if (skb->len <= p->mtu) { | ||
309 | if (p->R_tab == NULL) { | ||
310 | spin_unlock(&p->lock); | ||
311 | return p->result; | ||
312 | } | ||
313 | |||
314 | PSCHED_GET_TIME(now); | ||
315 | |||
316 | toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst); | ||
317 | |||
318 | if (p->P_tab) { | ||
319 | ptoks = toks + p->ptoks; | ||
320 | if (ptoks > (long)L2T_P(p, p->mtu)) | ||
321 | ptoks = (long)L2T_P(p, p->mtu); | ||
322 | ptoks -= L2T_P(p, skb->len); | ||
323 | } | ||
324 | toks += p->toks; | ||
325 | if (toks > (long)p->burst) | ||
326 | toks = p->burst; | ||
327 | toks -= L2T(p, skb->len); | ||
328 | |||
329 | if ((toks|ptoks) >= 0) { | ||
330 | p->t_c = now; | ||
331 | p->toks = toks; | ||
332 | p->ptoks = ptoks; | ||
333 | spin_unlock(&p->lock); | ||
334 | return p->result; | ||
335 | } | ||
336 | } | ||
337 | |||
338 | p->qstats.overlimits++; | ||
339 | spin_unlock(&p->lock); | ||
340 | return p->action; | ||
341 | } | ||
342 | |||
343 | static int | ||
344 | tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) | ||
345 | { | ||
346 | unsigned char *b = skb->tail; | ||
347 | struct tc_police opt; | ||
348 | struct tcf_police *p = PRIV(a); | ||
349 | |||
350 | opt.index = p->index; | ||
351 | opt.action = p->action; | ||
352 | opt.mtu = p->mtu; | ||
353 | opt.burst = p->burst; | ||
354 | opt.refcnt = p->refcnt - ref; | ||
355 | opt.bindcnt = p->bindcnt - bind; | ||
356 | if (p->R_tab) | ||
357 | opt.rate = p->R_tab->rate; | ||
358 | else | ||
359 | memset(&opt.rate, 0, sizeof(opt.rate)); | ||
360 | if (p->P_tab) | ||
361 | opt.peakrate = p->P_tab->rate; | ||
362 | else | ||
363 | memset(&opt.peakrate, 0, sizeof(opt.peakrate)); | ||
364 | RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); | ||
365 | if (p->result) | ||
366 | RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result); | ||
367 | #ifdef CONFIG_NET_ESTIMATOR | ||
368 | if (p->ewma_rate) | ||
369 | RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate); | ||
370 | #endif | ||
371 | return skb->len; | ||
372 | |||
373 | rtattr_failure: | ||
374 | skb_trim(skb, b - skb->data); | ||
375 | return -1; | ||
376 | } | ||
377 | |||
378 | MODULE_AUTHOR("Alexey Kuznetsov"); | ||
379 | MODULE_DESCRIPTION("Policing actions"); | ||
380 | MODULE_LICENSE("GPL"); | ||
381 | |||
382 | static struct tc_action_ops act_police_ops = { | ||
383 | .kind = "police", | ||
384 | .type = TCA_ID_POLICE, | ||
385 | .capab = TCA_CAP_NONE, | ||
386 | .owner = THIS_MODULE, | ||
387 | .act = tcf_act_police, | ||
388 | .dump = tcf_act_police_dump, | ||
389 | .cleanup = tcf_act_police_cleanup, | ||
390 | .lookup = tcf_hash_search, | ||
391 | .init = tcf_act_police_locate, | ||
392 | .walk = tcf_generic_walker | ||
393 | }; | ||
394 | |||
395 | static int __init | ||
396 | police_init_module(void) | ||
397 | { | ||
398 | return tcf_register_action(&act_police_ops); | ||
399 | } | ||
400 | |||
401 | static void __exit | ||
402 | police_cleanup_module(void) | ||
403 | { | ||
404 | tcf_unregister_action(&act_police_ops); | ||
405 | } | ||
406 | |||
407 | module_init(police_init_module); | ||
408 | module_exit(police_cleanup_module); | ||
409 | |||
410 | #endif | ||
411 | |||
412 | struct tcf_police * tcf_police_locate(struct rtattr *rta, struct rtattr *est) | ||
413 | { | ||
414 | unsigned h; | ||
415 | struct tcf_police *p; | ||
416 | struct rtattr *tb[TCA_POLICE_MAX]; | ||
417 | struct tc_police *parm; | ||
418 | |||
419 | if (rtattr_parse_nested(tb, TCA_POLICE_MAX, rta) < 0) | ||
420 | return NULL; | ||
421 | |||
422 | if (tb[TCA_POLICE_TBF-1] == NULL || | ||
423 | RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]) != sizeof(*parm)) | ||
424 | return NULL; | ||
425 | |||
426 | parm = RTA_DATA(tb[TCA_POLICE_TBF-1]); | ||
427 | |||
428 | if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) { | ||
429 | p->refcnt++; | ||
430 | return p; | ||
431 | } | ||
432 | |||
433 | p = kmalloc(sizeof(*p), GFP_KERNEL); | ||
434 | if (p == NULL) | ||
435 | return NULL; | ||
436 | |||
437 | memset(p, 0, sizeof(*p)); | ||
438 | p->refcnt = 1; | ||
439 | spin_lock_init(&p->lock); | ||
440 | p->stats_lock = &p->lock; | ||
441 | if (parm->rate.rate) { | ||
442 | p->R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1]); | ||
443 | if (p->R_tab == NULL) | ||
444 | goto failure; | ||
445 | if (parm->peakrate.rate) { | ||
446 | p->P_tab = qdisc_get_rtab(&parm->peakrate, | ||
447 | tb[TCA_POLICE_PEAKRATE-1]); | ||
448 | if (p->P_tab == NULL) | ||
449 | goto failure; | ||
450 | } | ||
451 | } | ||
452 | if (tb[TCA_POLICE_RESULT-1]) { | ||
453 | if (RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32)) | ||
454 | goto failure; | ||
455 | p->result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]); | ||
456 | } | ||
457 | #ifdef CONFIG_NET_ESTIMATOR | ||
458 | if (tb[TCA_POLICE_AVRATE-1]) { | ||
459 | if (RTA_PAYLOAD(tb[TCA_POLICE_AVRATE-1]) != sizeof(u32)) | ||
460 | goto failure; | ||
461 | p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]); | ||
462 | } | ||
463 | #endif | ||
464 | p->toks = p->burst = parm->burst; | ||
465 | p->mtu = parm->mtu; | ||
466 | if (p->mtu == 0) { | ||
467 | p->mtu = ~0; | ||
468 | if (p->R_tab) | ||
469 | p->mtu = 255<<p->R_tab->rate.cell_log; | ||
470 | } | ||
471 | if (p->P_tab) | ||
472 | p->ptoks = L2T_P(p, p->mtu); | ||
473 | PSCHED_GET_TIME(p->t_c); | ||
474 | p->index = parm->index ? : tcf_police_new_index(); | ||
475 | p->action = parm->action; | ||
476 | #ifdef CONFIG_NET_ESTIMATOR | ||
477 | if (est) | ||
478 | gen_new_estimator(&p->bstats, &p->rate_est, p->stats_lock, est); | ||
479 | #endif | ||
480 | h = tcf_police_hash(p->index); | ||
481 | write_lock_bh(&police_lock); | ||
482 | p->next = tcf_police_ht[h]; | ||
483 | tcf_police_ht[h] = p; | ||
484 | write_unlock_bh(&police_lock); | ||
485 | return p; | ||
486 | |||
487 | failure: | ||
488 | if (p->R_tab) | ||
489 | qdisc_put_rtab(p->R_tab); | ||
490 | kfree(p); | ||
491 | return NULL; | ||
492 | } | ||
493 | |||
494 | int tcf_police(struct sk_buff *skb, struct tcf_police *p) | ||
495 | { | ||
496 | psched_time_t now; | ||
497 | long toks; | ||
498 | long ptoks = 0; | ||
499 | |||
500 | spin_lock(&p->lock); | ||
501 | |||
502 | p->bstats.bytes += skb->len; | ||
503 | p->bstats.packets++; | ||
504 | |||
505 | #ifdef CONFIG_NET_ESTIMATOR | ||
506 | if (p->ewma_rate && p->rate_est.bps >= p->ewma_rate) { | ||
507 | p->qstats.overlimits++; | ||
508 | spin_unlock(&p->lock); | ||
509 | return p->action; | ||
510 | } | ||
511 | #endif | ||
512 | |||
513 | if (skb->len <= p->mtu) { | ||
514 | if (p->R_tab == NULL) { | ||
515 | spin_unlock(&p->lock); | ||
516 | return p->result; | ||
517 | } | ||
518 | |||
519 | PSCHED_GET_TIME(now); | ||
520 | |||
521 | toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst); | ||
522 | |||
523 | if (p->P_tab) { | ||
524 | ptoks = toks + p->ptoks; | ||
525 | if (ptoks > (long)L2T_P(p, p->mtu)) | ||
526 | ptoks = (long)L2T_P(p, p->mtu); | ||
527 | ptoks -= L2T_P(p, skb->len); | ||
528 | } | ||
529 | toks += p->toks; | ||
530 | if (toks > (long)p->burst) | ||
531 | toks = p->burst; | ||
532 | toks -= L2T(p, skb->len); | ||
533 | |||
534 | if ((toks|ptoks) >= 0) { | ||
535 | p->t_c = now; | ||
536 | p->toks = toks; | ||
537 | p->ptoks = ptoks; | ||
538 | spin_unlock(&p->lock); | ||
539 | return p->result; | ||
540 | } | ||
541 | } | ||
542 | |||
543 | p->qstats.overlimits++; | ||
544 | spin_unlock(&p->lock); | ||
545 | return p->action; | ||
546 | } | ||
547 | |||
548 | int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p) | ||
549 | { | ||
550 | unsigned char *b = skb->tail; | ||
551 | struct tc_police opt; | ||
552 | |||
553 | opt.index = p->index; | ||
554 | opt.action = p->action; | ||
555 | opt.mtu = p->mtu; | ||
556 | opt.burst = p->burst; | ||
557 | if (p->R_tab) | ||
558 | opt.rate = p->R_tab->rate; | ||
559 | else | ||
560 | memset(&opt.rate, 0, sizeof(opt.rate)); | ||
561 | if (p->P_tab) | ||
562 | opt.peakrate = p->P_tab->rate; | ||
563 | else | ||
564 | memset(&opt.peakrate, 0, sizeof(opt.peakrate)); | ||
565 | RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); | ||
566 | if (p->result) | ||
567 | RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result); | ||
568 | #ifdef CONFIG_NET_ESTIMATOR | ||
569 | if (p->ewma_rate) | ||
570 | RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate); | ||
571 | #endif | ||
572 | return skb->len; | ||
573 | |||
574 | rtattr_failure: | ||
575 | skb_trim(skb, b - skb->data); | ||
576 | return -1; | ||
577 | } | ||
578 | |||
579 | int tcf_police_dump_stats(struct sk_buff *skb, struct tcf_police *p) | ||
580 | { | ||
581 | struct gnet_dump d; | ||
582 | |||
583 | if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, | ||
584 | TCA_XSTATS, p->stats_lock, &d) < 0) | ||
585 | goto errout; | ||
586 | |||
587 | if (gnet_stats_copy_basic(&d, &p->bstats) < 0 || | ||
588 | #ifdef CONFIG_NET_ESTIMATOR | ||
589 | gnet_stats_copy_rate_est(&d, &p->rate_est) < 0 || | ||
590 | #endif | ||
591 | gnet_stats_copy_queue(&d, &p->qstats) < 0) | ||
592 | goto errout; | ||
593 | |||
594 | if (gnet_stats_finish_copy(&d) < 0) | ||
595 | goto errout; | ||
596 | |||
597 | return 0; | ||
598 | |||
599 | errout: | ||
600 | return -1; | ||
601 | } | ||
602 | |||
603 | |||
604 | EXPORT_SYMBOL(tcf_police); | ||
605 | EXPORT_SYMBOL(tcf_police_destroy); | ||
606 | EXPORT_SYMBOL(tcf_police_dump); | ||
607 | EXPORT_SYMBOL(tcf_police_dump_stats); | ||
608 | EXPORT_SYMBOL(tcf_police_hash); | ||
609 | EXPORT_SYMBOL(tcf_police_ht); | ||
610 | EXPORT_SYMBOL(tcf_police_locate); | ||
611 | EXPORT_SYMBOL(tcf_police_lookup); | ||
612 | EXPORT_SYMBOL(tcf_police_new_index); | ||
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c new file mode 100644 index 000000000000..4323a74eea30 --- /dev/null +++ b/net/sched/sch_api.c | |||
@@ -0,0 +1,1296 @@ | |||
1 | /* | ||
2 | * net/sched/sch_api.c Packet scheduler API. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
10 | * | ||
11 | * Fixes: | ||
12 | * | ||
13 | * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired. | ||
14 | * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support | ||
15 | * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support | ||
16 | */ | ||
17 | |||
18 | #include <linux/config.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/types.h> | ||
21 | #include <linux/kernel.h> | ||
22 | #include <linux/sched.h> | ||
23 | #include <linux/string.h> | ||
24 | #include <linux/mm.h> | ||
25 | #include <linux/socket.h> | ||
26 | #include <linux/sockios.h> | ||
27 | #include <linux/in.h> | ||
28 | #include <linux/errno.h> | ||
29 | #include <linux/interrupt.h> | ||
30 | #include <linux/netdevice.h> | ||
31 | #include <linux/skbuff.h> | ||
32 | #include <linux/rtnetlink.h> | ||
33 | #include <linux/init.h> | ||
34 | #include <linux/proc_fs.h> | ||
35 | #include <linux/seq_file.h> | ||
36 | #include <linux/kmod.h> | ||
37 | #include <linux/list.h> | ||
38 | #include <linux/bitops.h> | ||
39 | |||
40 | #include <net/sock.h> | ||
41 | #include <net/pkt_sched.h> | ||
42 | |||
43 | #include <asm/processor.h> | ||
44 | #include <asm/uaccess.h> | ||
45 | #include <asm/system.h> | ||
46 | |||
47 | static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, | ||
48 | struct Qdisc *old, struct Qdisc *new); | ||
49 | static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, | ||
50 | struct Qdisc *q, unsigned long cl, int event); | ||
51 | |||
52 | /* | ||
53 | |||
54 | Short review. | ||
55 | ------------- | ||
56 | |||
57 | This file consists of two interrelated parts: | ||
58 | |||
59 | 1. queueing disciplines manager frontend. | ||
60 | 2. traffic classes manager frontend. | ||
61 | |||
62 | Generally, a queueing discipline ("qdisc") is a black box | ||
63 | that can enqueue packets and dequeue them (when the device | ||
64 | is ready to send something) in an order and at times | ||
65 | determined by the algorithm hidden inside it. | ||
66 | |||
67 | qdiscs fall into two categories: | ||
68 | - "queues", which have no internal structure visible from outside. | ||
69 | - "schedulers", which split the packets into "traffic classes" | ||
70 | using "packet classifiers" (see cls_api.c). | ||
71 | |||
72 | In turn, classes may have child qdiscs (as a rule, queues) | ||
73 | attached to them, and so on. | ||
74 | |||
75 | The goal of the routines in this file is to translate | ||
76 | the information supplied by the user in the form of handles | ||
77 | into a form more intelligible to the kernel, to perform some | ||
78 | sanity checks and the work that is common to all qdiscs, | ||
79 | and to provide rtnetlink notifications. | ||
80 | |||
81 | All the real intelligent work is done inside the qdisc modules. | ||
82 | |||
83 | |||
84 | |||
85 | Every discipline has two major routines: enqueue and dequeue. | ||
86 | |||
87 | ---dequeue | ||
88 | |||
89 | dequeue usually returns an skb to send. It is allowed to return NULL, | ||
90 | but that does not mean the queue is empty; it just means the | ||
91 | discipline does not want to send anything this time. | ||
92 | The queue is really empty only if q->q.qlen == 0. | ||
93 | For complicated disciplines with multiple queues, q->q is not the | ||
94 | real packet queue, but q->q.qlen must nevertheless be valid. | ||
95 | |||
96 | ---enqueue | ||
97 | |||
98 | enqueue returns 0 if the packet was enqueued successfully. | ||
99 | If a packet (this one or another one) was dropped, it returns | ||
100 | a non-zero error code: | ||
101 | NET_XMIT_DROP - this packet was dropped. | ||
102 | Expected action: do not back off, but wait until the queue clears. | ||
103 | NET_XMIT_CN - this packet was probably enqueued, but another one was dropped. | ||
104 | Expected action: back off or ignore. | ||
105 | NET_XMIT_POLICED - dropped by the policer. | ||
106 | Expected action: back off or report an error to real-time apps. | ||
107 | |||
108 | Auxiliary routines: | ||
109 | |||
110 | ---requeue | ||
111 | |||
112 | requeues a packet that has already been dequeued. It is used for | ||
113 | non-standard or just buggy devices, which can defer output even if dev->tbusy=0. | ||
114 | |||
115 | ---reset | ||
116 | |||
117 | returns the qdisc to its initial state: purges all buffers, clears all | ||
118 | timers and counters (except for statistics), etc. | ||
119 | |||
120 | ---init | ||
121 | |||
122 | initializes newly created qdisc. | ||
123 | |||
124 | ---destroy | ||
125 | |||
126 | destroys resources allocated by init and during the lifetime of the qdisc. | ||
127 | |||
128 | ---change | ||
129 | |||
130 | changes qdisc parameters. | ||
131 | */ | ||
132 | |||
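As an illustration of the enqueue contract described above, here is a minimal hedged sketch of a hypothetical qdisc's enqueue routine; the routine name and the fixed limit of 128 packets are invented for this example and do not appear in this file:

	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
	{
		/* Hypothetical hard limit; real qdiscs keep limits in their private data. */
		if (skb_queue_len(&sch->q) < 128) {
			__skb_queue_tail(&sch->q, skb);
			sch->bstats.bytes += skb->len;
			sch->bstats.packets++;
			return 0;		/* enqueued successfully */
		}
		sch->qstats.drops++;
		kfree_skb(skb);
		return NET_XMIT_DROP;		/* this packet was dropped */
	}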
133 | /* Protects list of registered TC modules. It is pure SMP lock. */ | ||
134 | static DEFINE_RWLOCK(qdisc_mod_lock); | ||
135 | |||
136 | |||
137 | /************************************************ | ||
138 | * Queueing disciplines manipulation. * | ||
139 | ************************************************/ | ||
140 | |||
141 | |||
142 | /* The list of all installed queueing disciplines. */ | ||
143 | |||
144 | static struct Qdisc_ops *qdisc_base; | ||
145 | |||
146 | /* Register/unregister queueing discipline */ | ||
147 | |||
148 | int register_qdisc(struct Qdisc_ops *qops) | ||
149 | { | ||
150 | struct Qdisc_ops *q, **qp; | ||
151 | int rc = -EEXIST; | ||
152 | |||
153 | write_lock(&qdisc_mod_lock); | ||
154 | for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) | ||
155 | if (!strcmp(qops->id, q->id)) | ||
156 | goto out; | ||
157 | |||
158 | if (qops->enqueue == NULL) | ||
159 | qops->enqueue = noop_qdisc_ops.enqueue; | ||
160 | if (qops->requeue == NULL) | ||
161 | qops->requeue = noop_qdisc_ops.requeue; | ||
162 | if (qops->dequeue == NULL) | ||
163 | qops->dequeue = noop_qdisc_ops.dequeue; | ||
164 | |||
165 | qops->next = NULL; | ||
166 | *qp = qops; | ||
167 | rc = 0; | ||
168 | out: | ||
169 | write_unlock(&qdisc_mod_lock); | ||
170 | return rc; | ||
171 | } | ||
172 | |||
173 | int unregister_qdisc(struct Qdisc_ops *qops) | ||
174 | { | ||
175 | struct Qdisc_ops *q, **qp; | ||
176 | int err = -ENOENT; | ||
177 | |||
178 | write_lock(&qdisc_mod_lock); | ||
179 | for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) | ||
180 | if (q == qops) | ||
181 | break; | ||
182 | if (q) { | ||
183 | *qp = q->next; | ||
184 | q->next = NULL; | ||
185 | err = 0; | ||
186 | } | ||
187 | write_unlock(&qdisc_mod_lock); | ||
188 | return err; | ||
189 | } | ||
190 | |||
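For context, a hedged usage sketch of the registration interface above; the ops structure and function names are hypothetical (the real in-tree examples are the pfifo/bfifo registrations in pktsched_init() further down):

	static struct Qdisc_ops example_qdisc_ops;	/* hypothetical scheduler ops */

	static int __init example_sched_init(void)
	{
		return register_qdisc(&example_qdisc_ops);
	}

	static void __exit example_sched_exit(void)
	{
		unregister_qdisc(&example_qdisc_ops);
	}

	module_init(example_sched_init);
	module_exit(example_sched_exit);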
191 | /* We know the handle. Find the qdisc among all qdiscs attached to the device | ||
192 | (root qdisc, all its children, children of children, etc.) | ||
193 | */ | ||
194 | |||
195 | struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) | ||
196 | { | ||
197 | struct Qdisc *q; | ||
198 | |||
199 | read_lock_bh(&qdisc_tree_lock); | ||
200 | list_for_each_entry(q, &dev->qdisc_list, list) { | ||
201 | if (q->handle == handle) { | ||
202 | read_unlock_bh(&qdisc_tree_lock); | ||
203 | return q; | ||
204 | } | ||
205 | } | ||
206 | read_unlock_bh(&qdisc_tree_lock); | ||
207 | return NULL; | ||
208 | } | ||
209 | |||
210 | static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) | ||
211 | { | ||
212 | unsigned long cl; | ||
213 | struct Qdisc *leaf; | ||
214 | struct Qdisc_class_ops *cops = p->ops->cl_ops; | ||
215 | |||
216 | if (cops == NULL) | ||
217 | return NULL; | ||
218 | cl = cops->get(p, classid); | ||
219 | |||
220 | if (cl == 0) | ||
221 | return NULL; | ||
222 | leaf = cops->leaf(p, cl); | ||
223 | cops->put(p, cl); | ||
224 | return leaf; | ||
225 | } | ||
226 | |||
227 | /* Find queueing discipline by name */ | ||
228 | |||
229 | static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind) | ||
230 | { | ||
231 | struct Qdisc_ops *q = NULL; | ||
232 | |||
233 | if (kind) { | ||
234 | read_lock(&qdisc_mod_lock); | ||
235 | for (q = qdisc_base; q; q = q->next) { | ||
236 | if (rtattr_strcmp(kind, q->id) == 0) { | ||
237 | if (!try_module_get(q->owner)) | ||
238 | q = NULL; | ||
239 | break; | ||
240 | } | ||
241 | } | ||
242 | read_unlock(&qdisc_mod_lock); | ||
243 | } | ||
244 | return q; | ||
245 | } | ||
246 | |||
247 | static struct qdisc_rate_table *qdisc_rtab_list; | ||
248 | |||
249 | struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab) | ||
250 | { | ||
251 | struct qdisc_rate_table *rtab; | ||
252 | |||
253 | for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { | ||
254 | if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) { | ||
255 | rtab->refcnt++; | ||
256 | return rtab; | ||
257 | } | ||
258 | } | ||
259 | |||
260 | if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024) | ||
261 | return NULL; | ||
262 | |||
263 | rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); | ||
264 | if (rtab) { | ||
265 | rtab->rate = *r; | ||
266 | rtab->refcnt = 1; | ||
267 | memcpy(rtab->data, RTA_DATA(tab), 1024); | ||
268 | rtab->next = qdisc_rtab_list; | ||
269 | qdisc_rtab_list = rtab; | ||
270 | } | ||
271 | return rtab; | ||
272 | } | ||
273 | |||
274 | void qdisc_put_rtab(struct qdisc_rate_table *tab) | ||
275 | { | ||
276 | struct qdisc_rate_table *rtab, **rtabp; | ||
277 | |||
278 | if (!tab || --tab->refcnt) | ||
279 | return; | ||
280 | |||
281 | for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) { | ||
282 | if (rtab == tab) { | ||
283 | *rtabp = rtab->next; | ||
284 | kfree(rtab); | ||
285 | return; | ||
286 | } | ||
287 | } | ||
288 | } | ||
289 | |||
290 | |||
291 | /* Allocate a unique handle from the space managed by the kernel */ | ||
292 | |||
293 | static u32 qdisc_alloc_handle(struct net_device *dev) | ||
294 | { | ||
295 | int i = 0x10000; | ||
296 | static u32 autohandle = TC_H_MAKE(0x80000000U, 0); | ||
297 | |||
298 | do { | ||
299 | autohandle += TC_H_MAKE(0x10000U, 0); | ||
300 | if (autohandle == TC_H_MAKE(TC_H_ROOT, 0)) | ||
301 | autohandle = TC_H_MAKE(0x80000000U, 0); | ||
302 | } while (qdisc_lookup(dev, autohandle) && --i > 0); | ||
303 | |||
304 | return i>0 ? autohandle : 0; | ||
305 | } | ||
306 | |||
307 | /* Attach a top-level qdisc to device dev */ | ||
308 | |||
309 | static struct Qdisc * | ||
310 | dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc) | ||
311 | { | ||
312 | struct Qdisc *oqdisc; | ||
313 | |||
314 | if (dev->flags & IFF_UP) | ||
315 | dev_deactivate(dev); | ||
316 | |||
317 | qdisc_lock_tree(dev); | ||
318 | if (qdisc && qdisc->flags&TCQ_F_INGRESS) { | ||
319 | oqdisc = dev->qdisc_ingress; | ||
320 | /* Prune old scheduler */ | ||
321 | if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) { | ||
322 | /* delete */ | ||
323 | qdisc_reset(oqdisc); | ||
324 | dev->qdisc_ingress = NULL; | ||
325 | } else { /* new */ | ||
326 | dev->qdisc_ingress = qdisc; | ||
327 | } | ||
328 | |||
329 | } else { | ||
330 | |||
331 | oqdisc = dev->qdisc_sleeping; | ||
332 | |||
333 | /* Prune old scheduler */ | ||
334 | if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) | ||
335 | qdisc_reset(oqdisc); | ||
336 | |||
337 | /* ... and graft new one */ | ||
338 | if (qdisc == NULL) | ||
339 | qdisc = &noop_qdisc; | ||
340 | dev->qdisc_sleeping = qdisc; | ||
341 | dev->qdisc = &noop_qdisc; | ||
342 | } | ||
343 | |||
344 | qdisc_unlock_tree(dev); | ||
345 | |||
346 | if (dev->flags & IFF_UP) | ||
347 | dev_activate(dev); | ||
348 | |||
349 | return oqdisc; | ||
350 | } | ||
351 | |||
352 | |||
353 | /* Graft qdisc "new" to class "classid" of qdisc "parent" or | ||
354 | to device "dev". | ||
355 | |||
356 | The old qdisc is not destroyed but is returned in *old. | ||
357 | */ | ||
358 | |||
359 | static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, | ||
360 | u32 classid, | ||
361 | struct Qdisc *new, struct Qdisc **old) | ||
362 | { | ||
363 | int err = 0; | ||
364 | struct Qdisc *q = *old; | ||
365 | |||
366 | |||
367 | if (parent == NULL) { | ||
368 | if (q && q->flags&TCQ_F_INGRESS) { | ||
369 | *old = dev_graft_qdisc(dev, q); | ||
370 | } else { | ||
371 | *old = dev_graft_qdisc(dev, new); | ||
372 | } | ||
373 | } else { | ||
374 | struct Qdisc_class_ops *cops = parent->ops->cl_ops; | ||
375 | |||
376 | err = -EINVAL; | ||
377 | |||
378 | if (cops) { | ||
379 | unsigned long cl = cops->get(parent, classid); | ||
380 | if (cl) { | ||
381 | err = cops->graft(parent, cl, new, old); | ||
382 | if (new) | ||
383 | new->parent = classid; | ||
384 | cops->put(parent, cl); | ||
385 | } | ||
386 | } | ||
387 | } | ||
388 | return err; | ||
389 | } | ||
390 | |||
391 | /* | ||
392 | Allocate and initialize a new qdisc. | ||
393 | |||
394 | Parameters are passed via opt. | ||
395 | */ | ||
396 | |||
397 | static struct Qdisc * | ||
398 | qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) | ||
399 | { | ||
400 | int err; | ||
401 | struct rtattr *kind = tca[TCA_KIND-1]; | ||
402 | void *p = NULL; | ||
403 | struct Qdisc *sch; | ||
404 | struct Qdisc_ops *ops; | ||
405 | int size; | ||
406 | |||
407 | ops = qdisc_lookup_ops(kind); | ||
408 | #ifdef CONFIG_KMOD | ||
409 | if (ops == NULL && kind != NULL) { | ||
410 | char name[IFNAMSIZ]; | ||
411 | if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { | ||
412 | /* We dropped the RTNL semaphore in order to | ||
413 | * perform the module load. So, even if we | ||
414 | * succeeded in loading the module we have to | ||
415 | * tell the caller to replay the request. We | ||
416 | * indicate this using -EAGAIN. | ||
417 | * We replay the request because the device may | ||
418 | * go away in the meantime. | ||
419 | */ | ||
420 | rtnl_unlock(); | ||
421 | request_module("sch_%s", name); | ||
422 | rtnl_lock(); | ||
423 | ops = qdisc_lookup_ops(kind); | ||
424 | if (ops != NULL) { | ||
425 | /* We will try again qdisc_lookup_ops, | ||
426 | * so don't keep a reference. | ||
427 | */ | ||
428 | module_put(ops->owner); | ||
429 | err = -EAGAIN; | ||
430 | goto err_out; | ||
431 | } | ||
432 | } | ||
433 | } | ||
434 | #endif | ||
435 | |||
436 | err = -EINVAL; | ||
437 | if (ops == NULL) | ||
438 | goto err_out; | ||
439 | |||
440 | /* ensure that the Qdisc and the private data are 32-byte aligned */ | ||
441 | size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); | ||
442 | size += ops->priv_size + QDISC_ALIGN_CONST; | ||
443 | |||
444 | p = kmalloc(size, GFP_KERNEL); | ||
445 | err = -ENOBUFS; | ||
446 | if (!p) | ||
447 | goto err_out2; | ||
448 | memset(p, 0, size); | ||
449 | sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) | ||
450 | & ~QDISC_ALIGN_CONST); | ||
451 | sch->padded = (char *)sch - (char *)p; | ||
452 | |||
453 | INIT_LIST_HEAD(&sch->list); | ||
454 | skb_queue_head_init(&sch->q); | ||
455 | |||
456 | if (handle == TC_H_INGRESS) | ||
457 | sch->flags |= TCQ_F_INGRESS; | ||
458 | |||
459 | sch->ops = ops; | ||
460 | sch->enqueue = ops->enqueue; | ||
461 | sch->dequeue = ops->dequeue; | ||
462 | sch->dev = dev; | ||
463 | dev_hold(dev); | ||
464 | atomic_set(&sch->refcnt, 1); | ||
465 | sch->stats_lock = &dev->queue_lock; | ||
466 | if (handle == 0) { | ||
467 | handle = qdisc_alloc_handle(dev); | ||
468 | err = -ENOMEM; | ||
469 | if (handle == 0) | ||
470 | goto err_out3; | ||
471 | } | ||
472 | |||
473 | if (handle == TC_H_INGRESS) | ||
474 | sch->handle =TC_H_MAKE(TC_H_INGRESS, 0); | ||
475 | else | ||
476 | sch->handle = handle; | ||
477 | |||
478 | if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { | ||
479 | qdisc_lock_tree(dev); | ||
480 | list_add_tail(&sch->list, &dev->qdisc_list); | ||
481 | qdisc_unlock_tree(dev); | ||
482 | |||
483 | #ifdef CONFIG_NET_ESTIMATOR | ||
484 | if (tca[TCA_RATE-1]) | ||
485 | gen_new_estimator(&sch->bstats, &sch->rate_est, | ||
486 | sch->stats_lock, tca[TCA_RATE-1]); | ||
487 | #endif | ||
488 | return sch; | ||
489 | } | ||
490 | err_out3: | ||
491 | dev_put(dev); | ||
492 | err_out2: | ||
493 | module_put(ops->owner); | ||
494 | err_out: | ||
495 | *errp = err; | ||
496 | if (p) | ||
497 | kfree(p); | ||
498 | return NULL; | ||
499 | } | ||
500 | |||
501 | static int qdisc_change(struct Qdisc *sch, struct rtattr **tca) | ||
502 | { | ||
503 | if (tca[TCA_OPTIONS-1]) { | ||
504 | int err; | ||
505 | |||
506 | if (sch->ops->change == NULL) | ||
507 | return -EINVAL; | ||
508 | err = sch->ops->change(sch, tca[TCA_OPTIONS-1]); | ||
509 | if (err) | ||
510 | return err; | ||
511 | } | ||
512 | #ifdef CONFIG_NET_ESTIMATOR | ||
513 | if (tca[TCA_RATE-1]) | ||
514 | gen_replace_estimator(&sch->bstats, &sch->rate_est, | ||
515 | sch->stats_lock, tca[TCA_RATE-1]); | ||
516 | #endif | ||
517 | return 0; | ||
518 | } | ||
519 | |||
520 | struct check_loop_arg | ||
521 | { | ||
522 | struct qdisc_walker w; | ||
523 | struct Qdisc *p; | ||
524 | int depth; | ||
525 | }; | ||
526 | |||
527 | static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w); | ||
528 | |||
529 | static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth) | ||
530 | { | ||
531 | struct check_loop_arg arg; | ||
532 | |||
533 | if (q->ops->cl_ops == NULL) | ||
534 | return 0; | ||
535 | |||
536 | arg.w.stop = arg.w.skip = arg.w.count = 0; | ||
537 | arg.w.fn = check_loop_fn; | ||
538 | arg.depth = depth; | ||
539 | arg.p = p; | ||
540 | q->ops->cl_ops->walk(q, &arg.w); | ||
541 | return arg.w.stop ? -ELOOP : 0; | ||
542 | } | ||
543 | |||
544 | static int | ||
545 | check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) | ||
546 | { | ||
547 | struct Qdisc *leaf; | ||
548 | struct Qdisc_class_ops *cops = q->ops->cl_ops; | ||
549 | struct check_loop_arg *arg = (struct check_loop_arg *)w; | ||
550 | |||
551 | leaf = cops->leaf(q, cl); | ||
552 | if (leaf) { | ||
553 | if (leaf == arg->p || arg->depth > 7) | ||
554 | return -ELOOP; | ||
555 | return check_loop(leaf, arg->p, arg->depth + 1); | ||
556 | } | ||
557 | return 0; | ||
558 | } | ||
559 | |||
560 | /* | ||
561 | * Delete/get qdisc. | ||
562 | */ | ||
563 | |||
564 | static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) | ||
565 | { | ||
566 | struct tcmsg *tcm = NLMSG_DATA(n); | ||
567 | struct rtattr **tca = arg; | ||
568 | struct net_device *dev; | ||
569 | u32 clid = tcm->tcm_parent; | ||
570 | struct Qdisc *q = NULL; | ||
571 | struct Qdisc *p = NULL; | ||
572 | int err; | ||
573 | |||
574 | if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) | ||
575 | return -ENODEV; | ||
576 | |||
577 | if (clid) { | ||
578 | if (clid != TC_H_ROOT) { | ||
579 | if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { | ||
580 | if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) | ||
581 | return -ENOENT; | ||
582 | q = qdisc_leaf(p, clid); | ||
583 | } else { /* ingress */ | ||
584 | q = dev->qdisc_ingress; | ||
585 | } | ||
586 | } else { | ||
587 | q = dev->qdisc_sleeping; | ||
588 | } | ||
589 | if (!q) | ||
590 | return -ENOENT; | ||
591 | |||
592 | if (tcm->tcm_handle && q->handle != tcm->tcm_handle) | ||
593 | return -EINVAL; | ||
594 | } else { | ||
595 | if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) | ||
596 | return -ENOENT; | ||
597 | } | ||
598 | |||
599 | if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) | ||
600 | return -EINVAL; | ||
601 | |||
602 | if (n->nlmsg_type == RTM_DELQDISC) { | ||
603 | if (!clid) | ||
604 | return -EINVAL; | ||
605 | if (q->handle == 0) | ||
606 | return -ENOENT; | ||
607 | if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0) | ||
608 | return err; | ||
609 | if (q) { | ||
610 | qdisc_notify(skb, n, clid, q, NULL); | ||
611 | spin_lock_bh(&dev->queue_lock); | ||
612 | qdisc_destroy(q); | ||
613 | spin_unlock_bh(&dev->queue_lock); | ||
614 | } | ||
615 | } else { | ||
616 | qdisc_notify(skb, n, clid, NULL, q); | ||
617 | } | ||
618 | return 0; | ||
619 | } | ||
620 | |||
621 | /* | ||
622 | Create/change qdisc. | ||
623 | */ | ||
624 | |||
625 | static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) | ||
626 | { | ||
627 | struct tcmsg *tcm; | ||
628 | struct rtattr **tca; | ||
629 | struct net_device *dev; | ||
630 | u32 clid; | ||
631 | struct Qdisc *q, *p; | ||
632 | int err; | ||
633 | |||
634 | replay: | ||
635 | /* Reinit, just in case something touches this. */ | ||
636 | tcm = NLMSG_DATA(n); | ||
637 | tca = arg; | ||
638 | clid = tcm->tcm_parent; | ||
639 | q = p = NULL; | ||
640 | |||
641 | if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) | ||
642 | return -ENODEV; | ||
643 | |||
644 | if (clid) { | ||
645 | if (clid != TC_H_ROOT) { | ||
646 | if (clid != TC_H_INGRESS) { | ||
647 | if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) | ||
648 | return -ENOENT; | ||
649 | q = qdisc_leaf(p, clid); | ||
650 | } else { /*ingress */ | ||
651 | q = dev->qdisc_ingress; | ||
652 | } | ||
653 | } else { | ||
654 | q = dev->qdisc_sleeping; | ||
655 | } | ||
656 | |||
657 | /* It may be the default qdisc; ignore it */ | ||
658 | if (q && q->handle == 0) | ||
659 | q = NULL; | ||
660 | |||
661 | if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) { | ||
662 | if (tcm->tcm_handle) { | ||
663 | if (q && !(n->nlmsg_flags&NLM_F_REPLACE)) | ||
664 | return -EEXIST; | ||
665 | if (TC_H_MIN(tcm->tcm_handle)) | ||
666 | return -EINVAL; | ||
667 | if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) | ||
668 | goto create_n_graft; | ||
669 | if (n->nlmsg_flags&NLM_F_EXCL) | ||
670 | return -EEXIST; | ||
671 | if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) | ||
672 | return -EINVAL; | ||
673 | if (q == p || | ||
674 | (p && check_loop(q, p, 0))) | ||
675 | return -ELOOP; | ||
676 | atomic_inc(&q->refcnt); | ||
677 | goto graft; | ||
678 | } else { | ||
679 | if (q == NULL) | ||
680 | goto create_n_graft; | ||
681 | |||
682 | /* This magic test requires explanation. | ||
683 | * | ||
684 | * We know that some child q is already | ||
685 | * attached to this parent and we have a choice: | ||
686 | * either to change it or to create/graft a new one. | ||
687 | * | ||
688 | * 1. We are allowed to create/graft only | ||
689 | * if both the CREATE and REPLACE flags are set. | ||
690 | * | ||
691 | * 2. If EXCL is set, the requestor meant that | ||
692 | * the qdisc tcm_handle is not expected | ||
693 | * to exist, so we choose create/graft too. | ||
694 | * | ||
695 | * 3. The last case is when no flags are set. | ||
696 | * Alas, it is sort of a hole in the API; we | ||
697 | * cannot decide what to do unambiguously. | ||
698 | * For now we select create/graft if the | ||
699 | * user gave a KIND that does not match the existing one. | ||
700 | */ | ||
701 | if ((n->nlmsg_flags&NLM_F_CREATE) && | ||
702 | (n->nlmsg_flags&NLM_F_REPLACE) && | ||
703 | ((n->nlmsg_flags&NLM_F_EXCL) || | ||
704 | (tca[TCA_KIND-1] && | ||
705 | rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)))) | ||
706 | goto create_n_graft; | ||
707 | } | ||
708 | } | ||
709 | } else { | ||
710 | if (!tcm->tcm_handle) | ||
711 | return -EINVAL; | ||
712 | q = qdisc_lookup(dev, tcm->tcm_handle); | ||
713 | } | ||
714 | |||
715 | /* Change qdisc parameters */ | ||
716 | if (q == NULL) | ||
717 | return -ENOENT; | ||
718 | if (n->nlmsg_flags&NLM_F_EXCL) | ||
719 | return -EEXIST; | ||
720 | if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) | ||
721 | return -EINVAL; | ||
722 | err = qdisc_change(q, tca); | ||
723 | if (err == 0) | ||
724 | qdisc_notify(skb, n, clid, NULL, q); | ||
725 | return err; | ||
726 | |||
727 | create_n_graft: | ||
728 | if (!(n->nlmsg_flags&NLM_F_CREATE)) | ||
729 | return -ENOENT; | ||
730 | if (clid == TC_H_INGRESS) | ||
731 | q = qdisc_create(dev, tcm->tcm_parent, tca, &err); | ||
732 | else | ||
733 | q = qdisc_create(dev, tcm->tcm_handle, tca, &err); | ||
734 | if (q == NULL) { | ||
735 | if (err == -EAGAIN) | ||
736 | goto replay; | ||
737 | return err; | ||
738 | } | ||
739 | |||
740 | graft: | ||
741 | if (1) { | ||
742 | struct Qdisc *old_q = NULL; | ||
743 | err = qdisc_graft(dev, p, clid, q, &old_q); | ||
744 | if (err) { | ||
745 | if (q) { | ||
746 | spin_lock_bh(&dev->queue_lock); | ||
747 | qdisc_destroy(q); | ||
748 | spin_unlock_bh(&dev->queue_lock); | ||
749 | } | ||
750 | return err; | ||
751 | } | ||
752 | qdisc_notify(skb, n, clid, old_q, q); | ||
753 | if (old_q) { | ||
754 | spin_lock_bh(&dev->queue_lock); | ||
755 | qdisc_destroy(old_q); | ||
756 | spin_unlock_bh(&dev->queue_lock); | ||
757 | } | ||
758 | } | ||
759 | return 0; | ||
760 | } | ||
761 | |||
762 | static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, | ||
763 | u32 pid, u32 seq, unsigned flags, int event) | ||
764 | { | ||
765 | struct tcmsg *tcm; | ||
766 | struct nlmsghdr *nlh; | ||
767 | unsigned char *b = skb->tail; | ||
768 | struct gnet_dump d; | ||
769 | |||
770 | nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); | ||
771 | nlh->nlmsg_flags = flags; | ||
772 | tcm = NLMSG_DATA(nlh); | ||
773 | tcm->tcm_family = AF_UNSPEC; | ||
774 | tcm->tcm_ifindex = q->dev->ifindex; | ||
775 | tcm->tcm_parent = clid; | ||
776 | tcm->tcm_handle = q->handle; | ||
777 | tcm->tcm_info = atomic_read(&q->refcnt); | ||
778 | RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); | ||
779 | if (q->ops->dump && q->ops->dump(q, skb) < 0) | ||
780 | goto rtattr_failure; | ||
781 | q->qstats.qlen = q->q.qlen; | ||
782 | |||
783 | if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, | ||
784 | TCA_XSTATS, q->stats_lock, &d) < 0) | ||
785 | goto rtattr_failure; | ||
786 | |||
787 | if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) | ||
788 | goto rtattr_failure; | ||
789 | |||
790 | if (gnet_stats_copy_basic(&d, &q->bstats) < 0 || | ||
791 | #ifdef CONFIG_NET_ESTIMATOR | ||
792 | gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || | ||
793 | #endif | ||
794 | gnet_stats_copy_queue(&d, &q->qstats) < 0) | ||
795 | goto rtattr_failure; | ||
796 | |||
797 | if (gnet_stats_finish_copy(&d) < 0) | ||
798 | goto rtattr_failure; | ||
799 | |||
800 | nlh->nlmsg_len = skb->tail - b; | ||
801 | return skb->len; | ||
802 | |||
803 | nlmsg_failure: | ||
804 | rtattr_failure: | ||
805 | skb_trim(skb, b - skb->data); | ||
806 | return -1; | ||
807 | } | ||
808 | |||
809 | static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, | ||
810 | u32 clid, struct Qdisc *old, struct Qdisc *new) | ||
811 | { | ||
812 | struct sk_buff *skb; | ||
813 | u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; | ||
814 | |||
815 | skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); | ||
816 | if (!skb) | ||
817 | return -ENOBUFS; | ||
818 | |||
819 | if (old && old->handle) { | ||
820 | if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0) | ||
821 | goto err_out; | ||
822 | } | ||
823 | if (new) { | ||
824 | if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) | ||
825 | goto err_out; | ||
826 | } | ||
827 | |||
828 | if (skb->len) | ||
829 | return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); | ||
830 | |||
831 | err_out: | ||
832 | kfree_skb(skb); | ||
833 | return -EINVAL; | ||
834 | } | ||
835 | |||
836 | static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) | ||
837 | { | ||
838 | int idx, q_idx; | ||
839 | int s_idx, s_q_idx; | ||
840 | struct net_device *dev; | ||
841 | struct Qdisc *q; | ||
842 | |||
843 | s_idx = cb->args[0]; | ||
844 | s_q_idx = q_idx = cb->args[1]; | ||
845 | read_lock(&dev_base_lock); | ||
846 | for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { | ||
847 | if (idx < s_idx) | ||
848 | continue; | ||
849 | if (idx > s_idx) | ||
850 | s_q_idx = 0; | ||
851 | read_lock_bh(&qdisc_tree_lock); | ||
852 | q_idx = 0; | ||
853 | list_for_each_entry(q, &dev->qdisc_list, list) { | ||
854 | if (q_idx < s_q_idx) { | ||
855 | q_idx++; | ||
856 | continue; | ||
857 | } | ||
858 | if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, | ||
859 | cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) { | ||
860 | read_unlock_bh(&qdisc_tree_lock); | ||
861 | goto done; | ||
862 | } | ||
863 | q_idx++; | ||
864 | } | ||
865 | read_unlock_bh(&qdisc_tree_lock); | ||
866 | } | ||
867 | |||
868 | done: | ||
869 | read_unlock(&dev_base_lock); | ||
870 | |||
871 | cb->args[0] = idx; | ||
872 | cb->args[1] = q_idx; | ||
873 | |||
874 | return skb->len; | ||
875 | } | ||
876 | |||
877 | |||
878 | |||
879 | /************************************************ | ||
880 | * Traffic classes manipulation. * | ||
881 | ************************************************/ | ||
882 | |||
883 | |||
884 | |||
885 | static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) | ||
886 | { | ||
887 | struct tcmsg *tcm = NLMSG_DATA(n); | ||
888 | struct rtattr **tca = arg; | ||
889 | struct net_device *dev; | ||
890 | struct Qdisc *q = NULL; | ||
891 | struct Qdisc_class_ops *cops; | ||
892 | unsigned long cl = 0; | ||
893 | unsigned long new_cl; | ||
894 | u32 pid = tcm->tcm_parent; | ||
895 | u32 clid = tcm->tcm_handle; | ||
896 | u32 qid = TC_H_MAJ(clid); | ||
897 | int err; | ||
898 | |||
899 | if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) | ||
900 | return -ENODEV; | ||
901 | |||
902 | /* | ||
903 | parent == TC_H_UNSPEC - unspecified parent. | ||
904 | parent == TC_H_ROOT - class is root, which has no parent. | ||
905 | parent == X:0 - parent is root class. | ||
906 | parent == X:Y - parent is a node in hierarchy. | ||
907 | parent == 0:Y - parent is X:Y, where X:0 is qdisc. | ||
908 | |||
909 | handle == 0:0 - generate handle from kernel pool. | ||
910 | handle == 0:Y - class is X:Y, where X:0 is qdisc. | ||
911 | handle == X:Y - unambiguous: class is X:Y. | ||
912 | handle == X:0 - root class. | ||
913 | */ | ||
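	/* Illustrative example (not from the original source): a request to
	 * create class 1:10 under parent 1:1 arrives with parent == 1:1 and
	 * handle == 1:10; both majors agree, so qid resolves to 1:0 and the
	 * qdisc 1:0 is looked up below. */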
914 | |||
915 | /* Step 1. Determine qdisc handle X:0 */ | ||
916 | |||
917 | if (pid != TC_H_ROOT) { | ||
918 | u32 qid1 = TC_H_MAJ(pid); | ||
919 | |||
920 | if (qid && qid1) { | ||
921 | /* If both majors are known, they must be identical. */ | ||
922 | if (qid != qid1) | ||
923 | return -EINVAL; | ||
924 | } else if (qid1) { | ||
925 | qid = qid1; | ||
926 | } else if (qid == 0) | ||
927 | qid = dev->qdisc_sleeping->handle; | ||
928 | |||
929 | /* Now qid is a genuine qdisc handle, consistent | ||
930 | with both the parent and the child. | ||
931 | |||
932 | TC_H_MAJ(pid) may still be unspecified; complete it now. | ||
933 | */ | ||
934 | if (pid) | ||
935 | pid = TC_H_MAKE(qid, pid); | ||
936 | } else { | ||
937 | if (qid == 0) | ||
938 | qid = dev->qdisc_sleeping->handle; | ||
939 | } | ||
940 | |||
941 | /* OK. Locate qdisc */ | ||
942 | if ((q = qdisc_lookup(dev, qid)) == NULL) | ||
943 | return -ENOENT; | ||
944 | |||
945 | /* And check that it supports classes */ | ||
946 | cops = q->ops->cl_ops; | ||
947 | if (cops == NULL) | ||
948 | return -EINVAL; | ||
949 | |||
950 | /* Now try to get class */ | ||
951 | if (clid == 0) { | ||
952 | if (pid == TC_H_ROOT) | ||
953 | clid = qid; | ||
954 | } else | ||
955 | clid = TC_H_MAKE(qid, clid); | ||
956 | |||
957 | if (clid) | ||
958 | cl = cops->get(q, clid); | ||
959 | |||
960 | if (cl == 0) { | ||
961 | err = -ENOENT; | ||
962 | if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE)) | ||
963 | goto out; | ||
964 | } else { | ||
965 | switch (n->nlmsg_type) { | ||
966 | case RTM_NEWTCLASS: | ||
967 | err = -EEXIST; | ||
968 | if (n->nlmsg_flags&NLM_F_EXCL) | ||
969 | goto out; | ||
970 | break; | ||
971 | case RTM_DELTCLASS: | ||
972 | err = cops->delete(q, cl); | ||
973 | if (err == 0) | ||
974 | tclass_notify(skb, n, q, cl, RTM_DELTCLASS); | ||
975 | goto out; | ||
976 | case RTM_GETTCLASS: | ||
977 | err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS); | ||
978 | goto out; | ||
979 | default: | ||
980 | err = -EINVAL; | ||
981 | goto out; | ||
982 | } | ||
983 | } | ||
984 | |||
985 | new_cl = cl; | ||
986 | err = cops->change(q, clid, pid, tca, &new_cl); | ||
987 | if (err == 0) | ||
988 | tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS); | ||
989 | |||
990 | out: | ||
991 | if (cl) | ||
992 | cops->put(q, cl); | ||
993 | |||
994 | return err; | ||
995 | } | ||
996 | |||
997 | |||
998 | static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, | ||
999 | unsigned long cl, | ||
1000 | u32 pid, u32 seq, unsigned flags, int event) | ||
1001 | { | ||
1002 | struct tcmsg *tcm; | ||
1003 | struct nlmsghdr *nlh; | ||
1004 | unsigned char *b = skb->tail; | ||
1005 | struct gnet_dump d; | ||
1006 | struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; | ||
1007 | |||
1008 | nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); | ||
1009 | nlh->nlmsg_flags = flags; | ||
1010 | tcm = NLMSG_DATA(nlh); | ||
1011 | tcm->tcm_family = AF_UNSPEC; | ||
1012 | tcm->tcm_ifindex = q->dev->ifindex; | ||
1013 | tcm->tcm_parent = q->handle; | ||
1014 | tcm->tcm_handle = q->handle; | ||
1015 | tcm->tcm_info = 0; | ||
1016 | RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); | ||
1017 | if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0) | ||
1018 | goto rtattr_failure; | ||
1019 | |||
1020 | if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, | ||
1021 | TCA_XSTATS, q->stats_lock, &d) < 0) | ||
1022 | goto rtattr_failure; | ||
1023 | |||
1024 | if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0) | ||
1025 | goto rtattr_failure; | ||
1026 | |||
1027 | if (gnet_stats_finish_copy(&d) < 0) | ||
1028 | goto rtattr_failure; | ||
1029 | |||
1030 | nlh->nlmsg_len = skb->tail - b; | ||
1031 | return skb->len; | ||
1032 | |||
1033 | nlmsg_failure: | ||
1034 | rtattr_failure: | ||
1035 | skb_trim(skb, b - skb->data); | ||
1036 | return -1; | ||
1037 | } | ||
1038 | |||
1039 | static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, | ||
1040 | struct Qdisc *q, unsigned long cl, int event) | ||
1041 | { | ||
1042 | struct sk_buff *skb; | ||
1043 | u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; | ||
1044 | |||
1045 | skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); | ||
1046 | if (!skb) | ||
1047 | return -ENOBUFS; | ||
1048 | |||
1049 | if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) { | ||
1050 | kfree_skb(skb); | ||
1051 | return -EINVAL; | ||
1052 | } | ||
1053 | |||
1054 | return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); | ||
1055 | } | ||
1056 | |||
1057 | struct qdisc_dump_args | ||
1058 | { | ||
1059 | struct qdisc_walker w; | ||
1060 | struct sk_buff *skb; | ||
1061 | struct netlink_callback *cb; | ||
1062 | }; | ||
1063 | |||
1064 | static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) | ||
1065 | { | ||
1066 | struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; | ||
1067 | |||
1068 | return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid, | ||
1069 | a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS); | ||
1070 | } | ||
1071 | |||
1072 | static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) | ||
1073 | { | ||
1074 | int t; | ||
1075 | int s_t; | ||
1076 | struct net_device *dev; | ||
1077 | struct Qdisc *q; | ||
1078 | struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); | ||
1079 | struct qdisc_dump_args arg; | ||
1080 | |||
1081 | if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) | ||
1082 | return 0; | ||
1083 | if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) | ||
1084 | return 0; | ||
1085 | |||
1086 | s_t = cb->args[0]; | ||
1087 | t = 0; | ||
1088 | |||
1089 | read_lock_bh(&qdisc_tree_lock); | ||
1090 | list_for_each_entry(q, &dev->qdisc_list, list) { | ||
1091 | if (t < s_t || !q->ops->cl_ops || | ||
1092 | (tcm->tcm_parent && | ||
1093 | TC_H_MAJ(tcm->tcm_parent) != q->handle)) { | ||
1094 | t++; | ||
1095 | continue; | ||
1096 | } | ||
1097 | if (t > s_t) | ||
1098 | memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); | ||
1099 | arg.w.fn = qdisc_class_dump; | ||
1100 | arg.skb = skb; | ||
1101 | arg.cb = cb; | ||
1102 | arg.w.stop = 0; | ||
1103 | arg.w.skip = cb->args[1]; | ||
1104 | arg.w.count = 0; | ||
1105 | q->ops->cl_ops->walk(q, &arg.w); | ||
1106 | cb->args[1] = arg.w.count; | ||
1107 | if (arg.w.stop) | ||
1108 | break; | ||
1109 | t++; | ||
1110 | } | ||
1111 | read_unlock_bh(&qdisc_tree_lock); | ||
1112 | |||
1113 | cb->args[0] = t; | ||
1114 | |||
1115 | dev_put(dev); | ||
1116 | return skb->len; | ||
1117 | } | ||
1118 | |||
1119 | /* Main classifier routine: scans the classifier chain attached | ||
1120 | to this qdisc, (optionally) tests for the protocol, and asks the | ||
1121 | specific classifiers. | ||
1122 | */ | ||
1123 | int tc_classify(struct sk_buff *skb, struct tcf_proto *tp, | ||
1124 | struct tcf_result *res) | ||
1125 | { | ||
1126 | int err = 0; | ||
1127 | u32 protocol = skb->protocol; | ||
1128 | #ifdef CONFIG_NET_CLS_ACT | ||
1129 | struct tcf_proto *otp = tp; | ||
1130 | reclassify: | ||
1131 | #endif | ||
1132 | protocol = skb->protocol; | ||
1133 | |||
1134 | for ( ; tp; tp = tp->next) { | ||
1135 | if ((tp->protocol == protocol || | ||
1136 | tp->protocol == __constant_htons(ETH_P_ALL)) && | ||
1137 | (err = tp->classify(skb, tp, res)) >= 0) { | ||
1138 | #ifdef CONFIG_NET_CLS_ACT | ||
1139 | if ( TC_ACT_RECLASSIFY == err) { | ||
1140 | __u32 verd = (__u32) G_TC_VERD(skb->tc_verd); | ||
1141 | tp = otp; | ||
1142 | |||
1143 | if (MAX_REC_LOOP < verd++) { | ||
1144 | printk("rule prio %d protocol %02x reclassify loop is buggy, packet dropped\n", | ||
1145 | tp->prio&0xffff, ntohs(tp->protocol)); | ||
1146 | return TC_ACT_SHOT; | ||
1147 | } | ||
1148 | skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd); | ||
1149 | goto reclassify; | ||
1150 | } else { | ||
1151 | if (skb->tc_verd) | ||
1152 | skb->tc_verd = SET_TC_VERD(skb->tc_verd,0); | ||
1153 | return err; | ||
1154 | } | ||
1155 | #else | ||
1156 | |||
1157 | return err; | ||
1158 | #endif | ||
1159 | } | ||
1160 | |||
1161 | } | ||
1162 | return -1; | ||
1163 | } | ||
1164 | |||
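For illustration, a hedged sketch of how a classful qdisc's enqueue path might call tc_classify() to map a packet to a class id; the helper name is invented and nothing here is taken from an in-tree caller:

	static u32 example_pick_classid(struct sk_buff *skb, struct tcf_proto *filter_list)
	{
		struct tcf_result res;

		if (tc_classify(skb, filter_list, &res) >= 0)
			return res.classid;	/* a filter matched and selected a class */
		return 0;			/* no match: caller falls back to its default class */
	}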
1165 | static int psched_us_per_tick = 1; | ||
1166 | static int psched_tick_per_us = 1; | ||
1167 | |||
1168 | #ifdef CONFIG_PROC_FS | ||
1169 | static int psched_show(struct seq_file *seq, void *v) | ||
1170 | { | ||
1171 | seq_printf(seq, "%08x %08x %08x %08x\n", | ||
1172 | psched_tick_per_us, psched_us_per_tick, | ||
1173 | 1000000, HZ); | ||
1174 | |||
1175 | return 0; | ||
1176 | } | ||
1177 | |||
1178 | static int psched_open(struct inode *inode, struct file *file) | ||
1179 | { | ||
1180 | return single_open(file, psched_show, PDE(inode)->data); | ||
1181 | } | ||
1182 | |||
1183 | static struct file_operations psched_fops = { | ||
1184 | .owner = THIS_MODULE, | ||
1185 | .open = psched_open, | ||
1186 | .read = seq_read, | ||
1187 | .llseek = seq_lseek, | ||
1188 | .release = single_release, | ||
1189 | }; | ||
1190 | #endif | ||
1191 | |||
1192 | #ifdef CONFIG_NET_SCH_CLK_CPU | ||
1193 | psched_tdiff_t psched_clock_per_hz; | ||
1194 | int psched_clock_scale; | ||
1195 | EXPORT_SYMBOL(psched_clock_per_hz); | ||
1196 | EXPORT_SYMBOL(psched_clock_scale); | ||
1197 | |||
1198 | psched_time_t psched_time_base; | ||
1199 | cycles_t psched_time_mark; | ||
1200 | EXPORT_SYMBOL(psched_time_mark); | ||
1201 | EXPORT_SYMBOL(psched_time_base); | ||
1202 | |||
1203 | /* | ||
1204 | * Periodically adjust psched_time_base to avoid overflow with 32-bit | ||
1205 | * get_cycles(). Safe up to a 4GHz CPU: 2^32 cycles wrap in just over | ||
1206 | * one second at 4GHz, and the one-second timer below refreshes the base in time. */ | ||
1207 | static void psched_tick(unsigned long); | ||
1208 | static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0); | ||
1209 | |||
1210 | static void psched_tick(unsigned long dummy) | ||
1211 | { | ||
1212 | if (sizeof(cycles_t) == sizeof(u32)) { | ||
1213 | psched_time_t dummy_stamp; | ||
1214 | PSCHED_GET_TIME(dummy_stamp); | ||
1215 | psched_timer.expires = jiffies + 1*HZ; | ||
1216 | add_timer(&psched_timer); | ||
1217 | } | ||
1218 | } | ||
1219 | |||
1220 | int __init psched_calibrate_clock(void) | ||
1221 | { | ||
1222 | psched_time_t stamp, stamp1; | ||
1223 | struct timeval tv, tv1; | ||
1224 | psched_tdiff_t delay; | ||
1225 | long rdelay; | ||
1226 | unsigned long stop; | ||
1227 | |||
1228 | psched_tick(0); | ||
1229 | stop = jiffies + HZ/10; | ||
1230 | PSCHED_GET_TIME(stamp); | ||
1231 | do_gettimeofday(&tv); | ||
1232 | while (time_before(jiffies, stop)) { | ||
1233 | barrier(); | ||
1234 | cpu_relax(); | ||
1235 | } | ||
1236 | PSCHED_GET_TIME(stamp1); | ||
1237 | do_gettimeofday(&tv1); | ||
1238 | |||
1239 | delay = PSCHED_TDIFF(stamp1, stamp); | ||
1240 | rdelay = tv1.tv_usec - tv.tv_usec; | ||
1241 | rdelay += (tv1.tv_sec - tv.tv_sec)*1000000; | ||
1242 | if (rdelay > delay) | ||
1243 | return -1; | ||
1244 | delay /= rdelay; | ||
1245 | psched_tick_per_us = delay; | ||
1246 | while ((delay>>=1) != 0) | ||
1247 | psched_clock_scale++; | ||
1248 | psched_us_per_tick = 1<<psched_clock_scale; | ||
1249 | psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale; | ||
1250 | return 0; | ||
1251 | } | ||
1252 | #endif | ||
1253 | |||
1254 | static int __init pktsched_init(void) | ||
1255 | { | ||
1256 | struct rtnetlink_link *link_p; | ||
1257 | |||
1258 | #ifdef CONFIG_NET_SCH_CLK_CPU | ||
1259 | if (psched_calibrate_clock() < 0) | ||
1260 | return -1; | ||
1261 | #elif defined(CONFIG_NET_SCH_CLK_JIFFIES) | ||
1262 | psched_tick_per_us = HZ<<PSCHED_JSCALE; | ||
1263 | psched_us_per_tick = 1000000; | ||
1264 | #endif | ||
1265 | |||
1266 | link_p = rtnetlink_links[PF_UNSPEC]; | ||
1267 | |||
1268 | /* Set up the rtnetlink links. This is done here to avoid | ||
1269 | exporting a large number of public symbols. | ||
1270 | */ | ||
1271 | |||
1272 | if (link_p) { | ||
1273 | link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc; | ||
1274 | link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc; | ||
1275 | link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc; | ||
1276 | link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc; | ||
1277 | link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass; | ||
1278 | link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass; | ||
1279 | link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass; | ||
1280 | link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass; | ||
1281 | } | ||
1282 | |||
1283 | register_qdisc(&pfifo_qdisc_ops); | ||
1284 | register_qdisc(&bfifo_qdisc_ops); | ||
1285 | proc_net_fops_create("psched", 0, &psched_fops); | ||
1286 | |||
1287 | return 0; | ||
1288 | } | ||
1289 | |||
1290 | subsys_initcall(pktsched_init); | ||
1291 | |||
1292 | EXPORT_SYMBOL(qdisc_get_rtab); | ||
1293 | EXPORT_SYMBOL(qdisc_put_rtab); | ||
1294 | EXPORT_SYMBOL(register_qdisc); | ||
1295 | EXPORT_SYMBOL(unregister_qdisc); | ||
1296 | EXPORT_SYMBOL(tc_classify); | ||
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c new file mode 100644 index 000000000000..93ebce40acac --- /dev/null +++ b/net/sched/sch_atm.c | |||
@@ -0,0 +1,735 @@ | |||
1 | /* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */ | ||
2 | |||
3 | /* Written 1998-2000 by Werner Almesberger, EPFL ICA */ | ||
4 | |||
5 | |||
6 | #include <linux/config.h> | ||
7 | #include <linux/module.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/string.h> | ||
10 | #include <linux/errno.h> | ||
11 | #include <linux/skbuff.h> | ||
12 | #include <linux/interrupt.h> | ||
13 | #include <linux/atmdev.h> | ||
14 | #include <linux/atmclip.h> | ||
15 | #include <linux/netdevice.h> | ||
16 | #include <linux/rtnetlink.h> | ||
17 | #include <linux/file.h> /* for fput */ | ||
18 | #include <net/pkt_sched.h> | ||
19 | #include <net/sock.h> | ||
20 | |||
21 | |||
22 | extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */ | ||
23 | |||
24 | #if 0 /* control */ | ||
25 | #define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) | ||
26 | #else | ||
27 | #define DPRINTK(format,args...) | ||
28 | #endif | ||
29 | |||
30 | #if 0 /* data */ | ||
31 | #define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) | ||
32 | #else | ||
33 | #define D2PRINTK(format,args...) | ||
34 | #endif | ||
35 | |||
36 | |||
37 | /* | ||
38 | * The ATM queuing discipline provides a framework for invoking classifiers | ||
39 | * (aka "filters"), which in turn select classes of this queuing discipline. | ||
40 | * Each class maps the flow(s) it is handling to a given VC. Multiple classes | ||
41 | * may share the same VC. | ||
42 | * | ||
43 | * When creating a class, VCs are specified by passing the number of the open | ||
44 | * socket descriptor by which the calling process references the VC. The kernel | ||
45 | * keeps the VC open at least until all classes using it are removed. | ||
46 | * | ||
47 | * In this file, most functions are named atm_tc_* to avoid confusion with all | ||
48 | * the atm_* in net/atm. This naming convention differs from what's used in the | ||
49 | * rest of net/sched. | ||
50 | * | ||
51 | * Known bugs: | ||
52 | * - sometimes messes up the IP stack | ||
53 | * - any manipulations besides the few operations described in the README are | ||
54 | * untested and likely to crash the system | ||
55 | * - should lock the flow while there is data in the queue (?) | ||
56 | */ | ||
57 | |||
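As a hedged illustration of the descriptor-to-VC binding described above (this is not the actual atm_tc_change(), which is not shown in this hunk), a class could resolve and hold the socket behind a user-supplied descriptor roughly as follows; error handling and the VC wiring itself are omitted:

	/* Hedged sketch; "example_bind_socket" is invented for illustration. */
	static int example_bind_socket(struct atm_flow_data *flow, int fd)
	{
		int err;
		struct socket *sock = sockfd_lookup(fd, &err);	/* takes a reference */

		if (sock == NULL)
			return err;
		flow->sock = sock;	/* kept until the class is removed (fput() in the destroy path) */
		return 0;
	}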
58 | |||
59 | #define PRIV(sch) qdisc_priv(sch) | ||
60 | #define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back)) | ||
61 | |||
62 | |||
63 | struct atm_flow_data { | ||
64 | struct Qdisc *q; /* FIFO, TBF, etc. */ | ||
65 | struct tcf_proto *filter_list; | ||
66 | struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ | ||
67 | void (*old_pop)(struct atm_vcc *vcc,struct sk_buff *skb); /* chaining */ | ||
68 | struct atm_qdisc_data *parent; /* parent qdisc */ | ||
69 | struct socket *sock; /* for closing */ | ||
70 | u32 classid; /* x:y type ID */ | ||
71 | int ref; /* reference count */ | ||
72 | struct gnet_stats_basic bstats; | ||
73 | struct gnet_stats_queue qstats; | ||
74 | spinlock_t *stats_lock; | ||
75 | struct atm_flow_data *next; | ||
76 | struct atm_flow_data *excess; /* flow for excess traffic; | ||
77 | NULL to set CLP instead */ | ||
78 | int hdr_len; | ||
79 | unsigned char hdr[0]; /* header data; MUST BE LAST */ | ||
80 | }; | ||
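The zero-length hdr[0] member is the usual C idiom for a trailing variable-length buffer: the header bytes live in the same allocation, directly after the struct. A minimal sketch of the allocation pattern atm_tc_change uses further down (hypothetical helper name, error paths trimmed):

/* Sketch only: one kmalloc() covers the struct plus hdr_len trailing
 * bytes; the memcpy() into flow->hdr fills those trailing bytes. */
static struct atm_flow_data *alloc_flow_sketch(const void *hdr, int hdr_len)
{
	struct atm_flow_data *flow;

	flow = kmalloc(sizeof(*flow) + hdr_len, GFP_KERNEL);
	if (!flow)
		return NULL;
	memset(flow, 0, sizeof(*flow));
	flow->hdr_len = hdr_len;
	memcpy(flow->hdr, hdr, hdr_len);
	return flow;
}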
81 | |||
82 | struct atm_qdisc_data { | ||
83 | struct atm_flow_data link; /* unclassified skbs go here */ | ||
84 | struct atm_flow_data *flows; /* NB: "link" is also on this | ||
85 | list */ | ||
86 | struct tasklet_struct task; /* requeue tasklet */ | ||
87 | }; | ||
88 | |||
89 | |||
90 | /* ------------------------- Class/flow operations ------------------------- */ | ||
91 | |||
92 | |||
93 | static int find_flow(struct atm_qdisc_data *qdisc,struct atm_flow_data *flow) | ||
94 | { | ||
95 | struct atm_flow_data *walk; | ||
96 | |||
97 | DPRINTK("find_flow(qdisc %p,flow %p)\n",qdisc,flow); | ||
98 | for (walk = qdisc->flows; walk; walk = walk->next) | ||
99 | if (walk == flow) return 1; | ||
100 | DPRINTK("find_flow: not found\n"); | ||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | |||
105 | static __inline__ struct atm_flow_data *lookup_flow(struct Qdisc *sch, | ||
106 | u32 classid) | ||
107 | { | ||
108 | struct atm_qdisc_data *p = PRIV(sch); | ||
109 | struct atm_flow_data *flow; | ||
110 | |||
111 | for (flow = p->flows; flow; flow = flow->next) | ||
112 | if (flow->classid == classid) break; | ||
113 | return flow; | ||
114 | } | ||
115 | |||
116 | |||
117 | static int atm_tc_graft(struct Qdisc *sch,unsigned long arg, | ||
118 | struct Qdisc *new,struct Qdisc **old) | ||
119 | { | ||
120 | struct atm_qdisc_data *p = PRIV(sch); | ||
121 | struct atm_flow_data *flow = (struct atm_flow_data *) arg; | ||
122 | |||
123 | DPRINTK("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",sch, | ||
124 | p,flow,new,old); | ||
125 | if (!find_flow(p,flow)) return -EINVAL; | ||
126 | if (!new) new = &noop_qdisc; | ||
127 | *old = xchg(&flow->q,new); | ||
128 | if (*old) qdisc_reset(*old); | ||
129 | return 0; | ||
130 | } | ||
131 | |||
132 | |||
133 | static struct Qdisc *atm_tc_leaf(struct Qdisc *sch,unsigned long cl) | ||
134 | { | ||
135 | struct atm_flow_data *flow = (struct atm_flow_data *) cl; | ||
136 | |||
137 | DPRINTK("atm_tc_leaf(sch %p,flow %p)\n",sch,flow); | ||
138 | return flow ? flow->q : NULL; | ||
139 | } | ||
140 | |||
141 | |||
142 | static unsigned long atm_tc_get(struct Qdisc *sch,u32 classid) | ||
143 | { | ||
144 | struct atm_qdisc_data *p __attribute__((unused)) = PRIV(sch); | ||
145 | struct atm_flow_data *flow; | ||
146 | |||
147 | DPRINTK("atm_tc_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid); | ||
148 | flow = lookup_flow(sch,classid); | ||
149 | if (flow) flow->ref++; | ||
150 | DPRINTK("atm_tc_get: flow %p\n",flow); | ||
151 | return (unsigned long) flow; | ||
152 | } | ||
153 | |||
154 | |||
155 | static unsigned long atm_tc_bind_filter(struct Qdisc *sch, | ||
156 | unsigned long parent, u32 classid) | ||
157 | { | ||
158 | return atm_tc_get(sch,classid); | ||
159 | } | ||
160 | |||
161 | |||
162 | static void destroy_filters(struct atm_flow_data *flow) | ||
163 | { | ||
164 | struct tcf_proto *filter; | ||
165 | |||
166 | while ((filter = flow->filter_list)) { | ||
167 | DPRINTK("destroy_filters: destroying filter %p\n",filter); | ||
168 | flow->filter_list = filter->next; | ||
169 | tcf_destroy(filter); | ||
170 | } | ||
171 | } | ||
172 | |||
173 | |||
174 | /* | ||
175 | * atm_tc_put handles all destructions, including the ones that are explicitly | ||
176 | * requested (atm_tc_destroy, etc.). The assumption here is that we never drop | ||
177 | * anything that still seems to be in use. | ||
178 | */ | ||
179 | |||
180 | static void atm_tc_put(struct Qdisc *sch, unsigned long cl) | ||
181 | { | ||
182 | struct atm_qdisc_data *p = PRIV(sch); | ||
183 | struct atm_flow_data *flow = (struct atm_flow_data *) cl; | ||
184 | struct atm_flow_data **prev; | ||
185 | |||
186 | DPRINTK("atm_tc_put(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); | ||
187 | if (--flow->ref) return; | ||
188 | DPRINTK("atm_tc_put: destroying\n"); | ||
189 | for (prev = &p->flows; *prev; prev = &(*prev)->next) | ||
190 | if (*prev == flow) break; | ||
191 | if (!*prev) { | ||
192 | printk(KERN_CRIT "atm_tc_put: class %p not found\n",flow); | ||
193 | return; | ||
194 | } | ||
195 | *prev = flow->next; | ||
196 | DPRINTK("atm_tc_put: qdisc %p\n",flow->q); | ||
197 | qdisc_destroy(flow->q); | ||
198 | destroy_filters(flow); | ||
199 | if (flow->sock) { | ||
200 | DPRINTK("atm_tc_put: f_count %d\n", | ||
201 | file_count(flow->sock->file)); | ||
202 | flow->vcc->pop = flow->old_pop; | ||
203 | sockfd_put(flow->sock); | ||
204 | } | ||
205 | if (flow->excess) atm_tc_put(sch,(unsigned long) flow->excess); | ||
206 | if (flow != &p->link) kfree(flow); | ||
207 | /* | ||
208 | * If flow == &p->link, the qdisc no longer works at this point and | ||
209 | * needs to be removed. (By the caller of atm_tc_put.) | ||
210 | */ | ||
211 | } | ||
212 | |||
213 | |||
214 | static void sch_atm_pop(struct atm_vcc *vcc,struct sk_buff *skb) | ||
215 | { | ||
216 | struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent; | ||
217 | |||
218 | D2PRINTK("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n",vcc,skb,p); | ||
219 | VCC2FLOW(vcc)->old_pop(vcc,skb); | ||
220 | tasklet_schedule(&p->task); | ||
221 | } | ||
222 | |||
223 | static const u8 llc_oui_ip[] = { | ||
224 | 0xaa, /* DSAP: non-ISO */ | ||
225 | 0xaa, /* SSAP: non-ISO */ | ||
226 | 0x03, /* Ctrl: Unnumbered Information Command PDU */ | ||
227 | 0x00, /* OUI: EtherType */ | ||
228 | 0x00, 0x00, | ||
229 | 0x08, 0x00 }; /* Ethertype IP (0800) */ | ||
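With this default header, every packet queued on the VC is prefixed by the eight bytes above before transmission (the copy happens via skb_push() in sch_atm_dequeue()); this is the RFC 1483/2684 LLC/SNAP encapsulation of routed IPv4:

/* Resulting AAL5 payload layout with the default header (illustration):
 *   0xAA 0xAA 0x03   0x00 0x00 0x00   0x08 0x00 | IPv4 datagram ...
 *   LLC              OUI               EtherType
 */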
230 | |||
231 | static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, | ||
232 | struct rtattr **tca, unsigned long *arg) | ||
233 | { | ||
234 | struct atm_qdisc_data *p = PRIV(sch); | ||
235 | struct atm_flow_data *flow = (struct atm_flow_data *) *arg; | ||
236 | struct atm_flow_data *excess = NULL; | ||
237 | struct rtattr *opt = tca[TCA_OPTIONS-1]; | ||
238 | struct rtattr *tb[TCA_ATM_MAX]; | ||
239 | struct socket *sock; | ||
240 | int fd,error,hdr_len; | ||
241 | void *hdr; | ||
242 | |||
243 | DPRINTK("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x," | ||
244 | "flow %p,opt %p)\n",sch,p,classid,parent,flow,opt); | ||
245 | /* | ||
246 | * The concept of parents doesn't apply for this qdisc. | ||
247 | */ | ||
248 | if (parent && parent != TC_H_ROOT && parent != sch->handle) | ||
249 | return -EINVAL; | ||
250 | /* | ||
251 | * ATM classes cannot be changed. In order to change properties of the | ||
252 | * ATM connection, that socket needs to be modified directly (via the | ||
253 | * native ATM API). In order to send a flow to a different VC, the old | ||
254 | * class needs to be removed and a new one added. (This may be changed | ||
255 | * later.) | ||
256 | */ | ||
257 | if (flow) return -EBUSY; | ||
258 | if (opt == NULL || rtattr_parse_nested(tb, TCA_ATM_MAX, opt)) | ||
259 | return -EINVAL; | ||
260 | if (!tb[TCA_ATM_FD-1] || RTA_PAYLOAD(tb[TCA_ATM_FD-1]) < sizeof(fd)) | ||
261 | return -EINVAL; | ||
262 | fd = *(int *) RTA_DATA(tb[TCA_ATM_FD-1]); | ||
263 | DPRINTK("atm_tc_change: fd %d\n",fd); | ||
264 | if (tb[TCA_ATM_HDR-1]) { | ||
265 | hdr_len = RTA_PAYLOAD(tb[TCA_ATM_HDR-1]); | ||
266 | hdr = RTA_DATA(tb[TCA_ATM_HDR-1]); | ||
267 | } | ||
268 | else { | ||
269 | hdr_len = RFC1483LLC_LEN; | ||
270 | hdr = NULL; /* default LLC/SNAP for IP */ | ||
271 | } | ||
272 | if (!tb[TCA_ATM_EXCESS-1]) excess = NULL; | ||
273 | else { | ||
274 | if (RTA_PAYLOAD(tb[TCA_ATM_EXCESS-1]) != sizeof(u32)) | ||
275 | return -EINVAL; | ||
276 | excess = (struct atm_flow_data *) atm_tc_get(sch, | ||
277 | *(u32 *) RTA_DATA(tb[TCA_ATM_EXCESS-1])); | ||
278 | if (!excess) return -ENOENT; | ||
279 | } | ||
280 | DPRINTK("atm_tc_change: type %d, payload %d, hdr_len %d\n", | ||
281 | opt->rta_type,RTA_PAYLOAD(opt),hdr_len); | ||
282 | if (!(sock = sockfd_lookup(fd,&error))) return error; /* f_count++ */ | ||
283 | DPRINTK("atm_tc_change: f_count %d\n",file_count(sock->file)); | ||
284 | if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) { | ||
285 | error = -EPROTOTYPE; | ||
286 | goto err_out; | ||
287 | } | ||
288 | /* @@@ should check if the socket is really operational or we'll crash | ||
289 | on vcc->send */ | ||
290 | if (classid) { | ||
291 | if (TC_H_MAJ(classid ^ sch->handle)) { | ||
292 | DPRINTK("atm_tc_change: classid mismatch\n"); | ||
293 | error = -EINVAL; | ||
294 | goto err_out; | ||
295 | } | ||
296 | if (find_flow(p,flow)) { | ||
297 | error = -EEXIST; | ||
298 | goto err_out; | ||
299 | } | ||
300 | } | ||
301 | else { | ||
302 | int i; | ||
303 | unsigned long cl; | ||
304 | |||
305 | for (i = 1; i < 0x8000; i++) { | ||
306 | classid = TC_H_MAKE(sch->handle,0x8000 | i); | ||
307 | if (!(cl = atm_tc_get(sch,classid))) break; | ||
308 | atm_tc_put(sch,cl); | ||
309 | } | ||
310 | } | ||
311 | DPRINTK("atm_tc_change: new id %x\n",classid); | ||
312 | flow = kmalloc(sizeof(struct atm_flow_data)+hdr_len,GFP_KERNEL); | ||
313 | DPRINTK("atm_tc_change: flow %p\n",flow); | ||
314 | if (!flow) { | ||
315 | error = -ENOBUFS; | ||
316 | goto err_out; | ||
317 | } | ||
318 | memset(flow,0,sizeof(*flow)); | ||
319 | flow->filter_list = NULL; | ||
320 | if (!(flow->q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops))) | ||
321 | flow->q = &noop_qdisc; | ||
322 | DPRINTK("atm_tc_change: qdisc %p\n",flow->q); | ||
323 | flow->sock = sock; | ||
324 | flow->vcc = ATM_SD(sock); /* speedup */ | ||
325 | flow->vcc->user_back = flow; | ||
326 | DPRINTK("atm_tc_change: vcc %p\n",flow->vcc); | ||
327 | flow->old_pop = flow->vcc->pop; | ||
328 | flow->parent = p; | ||
329 | flow->vcc->pop = sch_atm_pop; | ||
330 | flow->classid = classid; | ||
331 | flow->ref = 1; | ||
332 | flow->excess = excess; | ||
333 | flow->next = p->link.next; | ||
334 | p->link.next = flow; | ||
335 | flow->hdr_len = hdr_len; | ||
336 | if (hdr) | ||
337 | memcpy(flow->hdr,hdr,hdr_len); | ||
338 | else | ||
339 | memcpy(flow->hdr,llc_oui_ip,sizeof(llc_oui_ip)); | ||
340 | *arg = (unsigned long) flow; | ||
341 | return 0; | ||
342 | err_out: | ||
343 | if (excess) atm_tc_put(sch,(unsigned long) excess); | ||
344 | sockfd_put(sock); | ||
345 | return error; | ||
346 | } | ||
347 | |||
348 | |||
349 | static int atm_tc_delete(struct Qdisc *sch,unsigned long arg) | ||
350 | { | ||
351 | struct atm_qdisc_data *p = PRIV(sch); | ||
352 | struct atm_flow_data *flow = (struct atm_flow_data *) arg; | ||
353 | |||
354 | DPRINTK("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); | ||
355 | if (!find_flow(PRIV(sch),flow)) return -EINVAL; | ||
356 | if (flow->filter_list || flow == &p->link) return -EBUSY; | ||
357 | /* | ||
358 | * Reference count must be 2: one for "keepalive" (set at class | ||
359 | * creation), and one for the reference held when calling delete. | ||
360 | */ | ||
361 | if (flow->ref < 2) { | ||
362 | printk(KERN_ERR "atm_tc_delete: flow->ref == %d\n",flow->ref); | ||
363 | return -EINVAL; | ||
364 | } | ||
365 | if (flow->ref > 2) return -EBUSY; /* catch references via excess, etc.*/ | ||
366 | atm_tc_put(sch,arg); | ||
367 | return 0; | ||
368 | } | ||
369 | |||
370 | |||
371 | static void atm_tc_walk(struct Qdisc *sch,struct qdisc_walker *walker) | ||
372 | { | ||
373 | struct atm_qdisc_data *p = PRIV(sch); | ||
374 | struct atm_flow_data *flow; | ||
375 | |||
376 | DPRINTK("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker); | ||
377 | if (walker->stop) return; | ||
378 | for (flow = p->flows; flow; flow = flow->next) { | ||
379 | if (walker->count >= walker->skip) | ||
380 | if (walker->fn(sch,(unsigned long) flow,walker) < 0) { | ||
381 | walker->stop = 1; | ||
382 | break; | ||
383 | } | ||
384 | walker->count++; | ||
385 | } | ||
386 | } | ||
387 | |||
388 | |||
389 | static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch,unsigned long cl) | ||
390 | { | ||
391 | struct atm_qdisc_data *p = PRIV(sch); | ||
392 | struct atm_flow_data *flow = (struct atm_flow_data *) cl; | ||
393 | |||
394 | DPRINTK("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); | ||
395 | return flow ? &flow->filter_list : &p->link.filter_list; | ||
396 | } | ||
397 | |||
398 | |||
399 | /* --------------------------- Qdisc operations ---------------------------- */ | ||
400 | |||
401 | |||
402 | static int atm_tc_enqueue(struct sk_buff *skb,struct Qdisc *sch) | ||
403 | { | ||
404 | struct atm_qdisc_data *p = PRIV(sch); | ||
405 | struct atm_flow_data *flow = NULL ; /* @@@ */ | ||
406 | struct tcf_result res; | ||
407 | int result; | ||
408 | int ret = NET_XMIT_POLICED; | ||
409 | |||
410 | D2PRINTK("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); | ||
411 | result = TC_POLICE_OK; /* be nice to gcc */ | ||
412 | if (TC_H_MAJ(skb->priority) != sch->handle || | ||
413 | !(flow = (struct atm_flow_data *) atm_tc_get(sch,skb->priority))) | ||
414 | for (flow = p->flows; flow; flow = flow->next) | ||
415 | if (flow->filter_list) { | ||
416 | result = tc_classify(skb,flow->filter_list, | ||
417 | &res); | ||
418 | if (result < 0) continue; | ||
419 | flow = (struct atm_flow_data *) res.class; | ||
420 | if (!flow) flow = lookup_flow(sch,res.classid); | ||
421 | break; | ||
422 | } | ||
423 | if (!flow) flow = &p->link; | ||
424 | else { | ||
425 | if (flow->vcc) | ||
426 | ATM_SKB(skb)->atm_options = flow->vcc->atm_options; | ||
427 | /*@@@ looks good ... but it's not supposed to work :-)*/ | ||
428 | #ifdef CONFIG_NET_CLS_POLICE | ||
429 | switch (result) { | ||
430 | case TC_POLICE_SHOT: | ||
431 | kfree_skb(skb); | ||
432 | break; | ||
433 | case TC_POLICE_RECLASSIFY: | ||
434 | if (flow->excess) flow = flow->excess; | ||
435 | else { | ||
436 | ATM_SKB(skb)->atm_options |= | ||
437 | ATM_ATMOPT_CLP; | ||
438 | break; | ||
439 | } | ||
440 | /* fall through */ | ||
441 | case TC_POLICE_OK: | ||
442 | /* fall through */ | ||
443 | default: | ||
444 | break; | ||
445 | } | ||
446 | #endif | ||
447 | } | ||
448 | if ( | ||
449 | #ifdef CONFIG_NET_CLS_POLICE | ||
450 | result == TC_POLICE_SHOT || | ||
451 | #endif | ||
452 | (ret = flow->q->enqueue(skb,flow->q)) != 0) { | ||
453 | sch->qstats.drops++; | ||
454 | if (flow) flow->qstats.drops++; | ||
455 | return ret; | ||
456 | } | ||
457 | sch->bstats.bytes += skb->len; | ||
458 | sch->bstats.packets++; | ||
459 | flow->bstats.bytes += skb->len; | ||
460 | flow->bstats.packets++; | ||
461 | /* | ||
462 | * Okay, this may seem weird. We pretend we've dropped the packet if | ||
463 | * it goes via ATM. The reason for this is that the outer qdisc | ||
464 | * expects to be able to q->dequeue the packet later on if we return | ||
465 | * success at this place. Also, sch->q.qlen needs to reflect whether | ||
466 | * there is a packet eligible for dequeuing or not. Note that the | ||
467 | * statistics of the outer qdisc are necessarily wrong because of all | ||
468 | * this. There's currently no correct solution for this. | ||
469 | */ | ||
470 | if (flow == &p->link) { | ||
471 | sch->q.qlen++; | ||
472 | return 0; | ||
473 | } | ||
474 | tasklet_schedule(&p->task); | ||
475 | return NET_XMIT_BYPASS; | ||
476 | } | ||
477 | |||
478 | |||
479 | /* | ||
480 | * Dequeue packets and send them over ATM. Note that we quite deliberately | ||
481 | * avoid checking net_device's flow control here, simply because sch_atm | ||
483 | * uses its own channels, which have nothing to do with any CLIP, LANE, or | ||
483 | * non-ATM interfaces. | ||
484 | */ | ||
485 | |||
486 | |||
487 | static void sch_atm_dequeue(unsigned long data) | ||
488 | { | ||
489 | struct Qdisc *sch = (struct Qdisc *) data; | ||
490 | struct atm_qdisc_data *p = PRIV(sch); | ||
491 | struct atm_flow_data *flow; | ||
492 | struct sk_buff *skb; | ||
493 | |||
494 | D2PRINTK("sch_atm_dequeue(sch %p,[qdisc %p])\n",sch,p); | ||
495 | for (flow = p->link.next; flow; flow = flow->next) | ||
496 | /* | ||
497 | * If traffic is properly shaped, this won't generate nasty | ||
498 | * little bursts. Otherwise, it may ... (but that's okay) | ||
499 | */ | ||
500 | while ((skb = flow->q->dequeue(flow->q))) { | ||
501 | if (!atm_may_send(flow->vcc,skb->truesize)) { | ||
502 | (void) flow->q->ops->requeue(skb,flow->q); | ||
503 | break; | ||
504 | } | ||
505 | D2PRINTK("atm_tc_dequeue: sending on class %p\n",flow); | ||
506 | /* remove any LL header somebody else has attached */ | ||
507 | skb_pull(skb,(char *) skb->nh.iph-(char *) skb->data); | ||
508 | if (skb_headroom(skb) < flow->hdr_len) { | ||
509 | struct sk_buff *new; | ||
510 | |||
511 | new = skb_realloc_headroom(skb,flow->hdr_len); | ||
512 | dev_kfree_skb(skb); | ||
513 | if (!new) continue; | ||
514 | skb = new; | ||
515 | } | ||
516 | D2PRINTK("sch_atm_dequeue: ip %p, data %p\n", | ||
517 | skb->nh.iph,skb->data); | ||
518 | ATM_SKB(skb)->vcc = flow->vcc; | ||
519 | memcpy(skb_push(skb,flow->hdr_len),flow->hdr, | ||
520 | flow->hdr_len); | ||
521 | atomic_add(skb->truesize, | ||
522 | &sk_atm(flow->vcc)->sk_wmem_alloc); | ||
523 | /* atm.atm_options are already set by atm_tc_enqueue */ | ||
524 | (void) flow->vcc->send(flow->vcc,skb); | ||
525 | } | ||
526 | } | ||
527 | |||
528 | |||
529 | static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch) | ||
530 | { | ||
531 | struct atm_qdisc_data *p = PRIV(sch); | ||
532 | struct sk_buff *skb; | ||
533 | |||
534 | D2PRINTK("atm_tc_dequeue(sch %p,[qdisc %p])\n",sch,p); | ||
535 | tasklet_schedule(&p->task); | ||
536 | skb = p->link.q->dequeue(p->link.q); | ||
537 | if (skb) sch->q.qlen--; | ||
538 | return skb; | ||
539 | } | ||
540 | |||
541 | |||
542 | static int atm_tc_requeue(struct sk_buff *skb,struct Qdisc *sch) | ||
543 | { | ||
544 | struct atm_qdisc_data *p = PRIV(sch); | ||
545 | int ret; | ||
546 | |||
547 | D2PRINTK("atm_tc_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); | ||
548 | ret = p->link.q->ops->requeue(skb,p->link.q); | ||
549 | if (!ret) { | ||
550 | sch->q.qlen++; | ||
551 | sch->qstats.requeues++; | ||
552 | } else { | ||
553 | sch->qstats.drops++; | ||
554 | p->link.qstats.drops++; | ||
555 | } | ||
556 | return ret; | ||
557 | } | ||
558 | |||
559 | |||
560 | static unsigned int atm_tc_drop(struct Qdisc *sch) | ||
561 | { | ||
562 | struct atm_qdisc_data *p = PRIV(sch); | ||
563 | struct atm_flow_data *flow; | ||
564 | unsigned int len; | ||
565 | |||
566 | DPRINTK("atm_tc_drop(sch %p,[qdisc %p])\n",sch,p); | ||
567 | for (flow = p->flows; flow; flow = flow->next) | ||
568 | if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q))) | ||
569 | return len; | ||
570 | return 0; | ||
571 | } | ||
572 | |||
573 | |||
574 | static int atm_tc_init(struct Qdisc *sch,struct rtattr *opt) | ||
575 | { | ||
576 | struct atm_qdisc_data *p = PRIV(sch); | ||
577 | |||
578 | DPRINTK("atm_tc_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); | ||
579 | p->flows = &p->link; | ||
580 | if(!(p->link.q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops))) | ||
581 | p->link.q = &noop_qdisc; | ||
582 | DPRINTK("atm_tc_init: link (%p) qdisc %p\n",&p->link,p->link.q); | ||
583 | p->link.filter_list = NULL; | ||
584 | p->link.vcc = NULL; | ||
585 | p->link.sock = NULL; | ||
586 | p->link.classid = sch->handle; | ||
587 | p->link.ref = 1; | ||
588 | p->link.next = NULL; | ||
589 | tasklet_init(&p->task,sch_atm_dequeue,(unsigned long) sch); | ||
590 | return 0; | ||
591 | } | ||
592 | |||
593 | |||
594 | static void atm_tc_reset(struct Qdisc *sch) | ||
595 | { | ||
596 | struct atm_qdisc_data *p = PRIV(sch); | ||
597 | struct atm_flow_data *flow; | ||
598 | |||
599 | DPRINTK("atm_tc_reset(sch %p,[qdisc %p])\n",sch,p); | ||
600 | for (flow = p->flows; flow; flow = flow->next) qdisc_reset(flow->q); | ||
601 | sch->q.qlen = 0; | ||
602 | } | ||
603 | |||
604 | |||
605 | static void atm_tc_destroy(struct Qdisc *sch) | ||
606 | { | ||
607 | struct atm_qdisc_data *p = PRIV(sch); | ||
608 | struct atm_flow_data *flow; | ||
609 | |||
610 | DPRINTK("atm_tc_destroy(sch %p,[qdisc %p])\n",sch,p); | ||
611 | /* races ? */ | ||
612 | while ((flow = p->flows)) { | ||
613 | destroy_filters(flow); | ||
614 | if (flow->ref > 1) | ||
615 | printk(KERN_ERR "atm_destroy: %p->ref = %d\n",flow, | ||
616 | flow->ref); | ||
617 | atm_tc_put(sch,(unsigned long) flow); | ||
618 | if (p->flows == flow) { | ||
619 | printk(KERN_ERR "atm_destroy: putting flow %p didn't " | ||
620 | "kill it\n",flow); | ||
621 | p->flows = flow->next; /* brute force */ | ||
622 | break; | ||
623 | } | ||
624 | } | ||
625 | tasklet_kill(&p->task); | ||
626 | } | ||
627 | |||
628 | |||
629 | static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, | ||
630 | struct sk_buff *skb, struct tcmsg *tcm) | ||
631 | { | ||
632 | struct atm_qdisc_data *p = PRIV(sch); | ||
633 | struct atm_flow_data *flow = (struct atm_flow_data *) cl; | ||
634 | unsigned char *b = skb->tail; | ||
635 | struct rtattr *rta; | ||
636 | |||
637 | DPRINTK("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n", | ||
638 | sch,p,flow,skb,tcm); | ||
639 | if (!find_flow(p,flow)) return -EINVAL; | ||
640 | tcm->tcm_handle = flow->classid; | ||
641 | rta = (struct rtattr *) b; | ||
642 | RTA_PUT(skb,TCA_OPTIONS,0,NULL); | ||
643 | RTA_PUT(skb,TCA_ATM_HDR,flow->hdr_len,flow->hdr); | ||
644 | if (flow->vcc) { | ||
645 | struct sockaddr_atmpvc pvc; | ||
646 | int state; | ||
647 | |||
648 | pvc.sap_family = AF_ATMPVC; | ||
649 | pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1; | ||
650 | pvc.sap_addr.vpi = flow->vcc->vpi; | ||
651 | pvc.sap_addr.vci = flow->vcc->vci; | ||
652 | RTA_PUT(skb,TCA_ATM_ADDR,sizeof(pvc),&pvc); | ||
653 | state = ATM_VF2VS(flow->vcc->flags); | ||
654 | RTA_PUT(skb,TCA_ATM_STATE,sizeof(state),&state); | ||
655 | } | ||
656 | if (flow->excess) | ||
657 | RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(u32),&flow->classid); | ||
658 | else { | ||
659 | static u32 zero; | ||
660 | |||
661 | RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(zero),&zero); | ||
662 | } | ||
663 | rta->rta_len = skb->tail-b; | ||
664 | return skb->len; | ||
665 | |||
666 | rtattr_failure: | ||
667 | skb_trim(skb,b-skb->data); | ||
668 | return -1; | ||
669 | } | ||
670 | static int | ||
671 | atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, | ||
672 | struct gnet_dump *d) | ||
673 | { | ||
674 | struct atm_flow_data *flow = (struct atm_flow_data *) arg; | ||
675 | |||
676 | flow->qstats.qlen = flow->q->q.qlen; | ||
677 | |||
678 | if (gnet_stats_copy_basic(d, &flow->bstats) < 0 || | ||
679 | gnet_stats_copy_queue(d, &flow->qstats) < 0) | ||
680 | return -1; | ||
681 | |||
682 | return 0; | ||
683 | } | ||
684 | |||
685 | static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb) | ||
686 | { | ||
687 | return 0; | ||
688 | } | ||
689 | |||
690 | static struct Qdisc_class_ops atm_class_ops = { | ||
691 | .graft = atm_tc_graft, | ||
692 | .leaf = atm_tc_leaf, | ||
693 | .get = atm_tc_get, | ||
694 | .put = atm_tc_put, | ||
695 | .change = atm_tc_change, | ||
696 | .delete = atm_tc_delete, | ||
697 | .walk = atm_tc_walk, | ||
698 | .tcf_chain = atm_tc_find_tcf, | ||
699 | .bind_tcf = atm_tc_bind_filter, | ||
700 | .unbind_tcf = atm_tc_put, | ||
701 | .dump = atm_tc_dump_class, | ||
702 | .dump_stats = atm_tc_dump_class_stats, | ||
703 | }; | ||
704 | |||
705 | static struct Qdisc_ops atm_qdisc_ops = { | ||
706 | .next = NULL, | ||
707 | .cl_ops = &atm_class_ops, | ||
708 | .id = "atm", | ||
709 | .priv_size = sizeof(struct atm_qdisc_data), | ||
710 | .enqueue = atm_tc_enqueue, | ||
711 | .dequeue = atm_tc_dequeue, | ||
712 | .requeue = atm_tc_requeue, | ||
713 | .drop = atm_tc_drop, | ||
714 | .init = atm_tc_init, | ||
715 | .reset = atm_tc_reset, | ||
716 | .destroy = atm_tc_destroy, | ||
717 | .change = NULL, | ||
718 | .dump = atm_tc_dump, | ||
719 | .owner = THIS_MODULE, | ||
720 | }; | ||
721 | |||
722 | |||
723 | static int __init atm_init(void) | ||
724 | { | ||
725 | return register_qdisc(&atm_qdisc_ops); | ||
726 | } | ||
727 | |||
728 | static void __exit atm_exit(void) | ||
729 | { | ||
730 | unregister_qdisc(&atm_qdisc_ops); | ||
731 | } | ||
732 | |||
733 | module_init(atm_init) | ||
734 | module_exit(atm_exit) | ||
735 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c new file mode 100644 index 000000000000..d43e3b8cbf6a --- /dev/null +++ b/net/sched/sch_cbq.c | |||
@@ -0,0 +1,2124 @@ | |||
1 | /* | ||
2 | * net/sched/sch_cbq.c Class-Based Queueing discipline. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/config.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <asm/uaccess.h> | ||
16 | #include <asm/system.h> | ||
17 | #include <linux/bitops.h> | ||
18 | #include <linux/types.h> | ||
19 | #include <linux/kernel.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/string.h> | ||
22 | #include <linux/mm.h> | ||
23 | #include <linux/socket.h> | ||
24 | #include <linux/sockios.h> | ||
25 | #include <linux/in.h> | ||
26 | #include <linux/errno.h> | ||
27 | #include <linux/interrupt.h> | ||
28 | #include <linux/if_ether.h> | ||
29 | #include <linux/inet.h> | ||
30 | #include <linux/netdevice.h> | ||
31 | #include <linux/etherdevice.h> | ||
32 | #include <linux/notifier.h> | ||
33 | #include <net/ip.h> | ||
34 | #include <net/route.h> | ||
35 | #include <linux/skbuff.h> | ||
36 | #include <net/sock.h> | ||
37 | #include <net/pkt_sched.h> | ||
38 | |||
39 | |||
40 | /* Class-Based Queueing (CBQ) algorithm. | ||
41 | ======================================= | ||
42 | |||
43 | Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource | ||
44 | Management Models for Packet Networks", | ||
45 | IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 | ||
46 | |||
47 | [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995 | ||
48 | |||
49 | [3] Sally Floyd, "Notes on Class-Based Queueing: Setting | ||
50 | Parameters", 1996 | ||
51 | |||
52 | [4] Sally Floyd and Michael Speer, "Experimental Results | ||
53 | for Class-Based Queueing", 1998, not published. | ||
54 | |||
55 | ----------------------------------------------------------------------- | ||
56 | |||
57 | Algorithm skeleton was taken from NS simulator cbq.cc. | ||
58 | If someone wants to check this code against the LBL version, | ||
59 | he should take into account that ONLY the skeleton was borrowed, | ||
60 | the implementation is different. Particularly: | ||
61 | |||
62 | --- The WRR algorithm is different. Our version looks more | ||
63 | reasonable (I hope) and works when quanta are allowed to be | ||
64 | less than MTU, which is always the case when real time classes | ||
65 | have small rates. Note that the statement in [3] is | ||
66 | incomplete; delay may actually be estimated even if the class | ||
67 | per-round allotment is less than MTU. Namely, if the per-round | ||
68 | allotment is W*r_i, and r_1+...+r_k = r < 1 | ||
69 | |||
70 | delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B | ||
71 | |||
72 | In the worst case we have IntServ estimate with D = W*r+k*MTU | ||
73 | and C = MTU*r. The proof (if correct at all) is trivial. | ||
74 | |||
75 | |||
76 | --- It seems that cbq-2.0 is not very accurate. At least, I cannot | ||
77 | interpret some places, which look like wrong translations | ||
78 | from NS. Anyone is advised to find these differences | ||
79 | and explain to me why I am wrong 8). | ||
80 | |||
81 | --- Linux has no EOI event, so that we cannot estimate true class | ||
82 | idle time. The workaround is to consider the next dequeue event | ||
83 | as a sign that the previous packet is finished. This is wrong because of | ||
84 | internal device queueing, but on a permanently loaded link it is true. | ||
85 | Moreover, combined with clock integrator, this scheme looks | ||
86 | very close to an ideal solution. */ | ||
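For readability, the worst-case delay bound quoted above in LaTeX, assuming the square bracket denotes the ceiling, $W$ the WRR round scale, $r_i$ the class share with $r_1+\dots+r_k=r<1$, and $B$ the link bandwidth:

\[
\mathrm{delay}_i \;\le\; \frac{\left\lceil \frac{MTU}{W r_i} \right\rceil W r \;+\; W r \;+\; k\,MTU}{B},
\qquad\text{i.e. an IntServ-style bound with } D = W r + k\,MTU,\; C = MTU \cdot r .
\]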
87 | |||
88 | struct cbq_sched_data; | ||
89 | |||
90 | |||
91 | struct cbq_class | ||
92 | { | ||
93 | struct cbq_class *next; /* hash table link */ | ||
94 | struct cbq_class *next_alive; /* next class with backlog in this priority band */ | ||
95 | |||
96 | /* Parameters */ | ||
97 | u32 classid; | ||
98 | unsigned char priority; /* class priority */ | ||
99 | unsigned char priority2; /* priority to be used after overlimit */ | ||
100 | unsigned char ewma_log; /* time constant for idle time calculation */ | ||
101 | unsigned char ovl_strategy; | ||
102 | #ifdef CONFIG_NET_CLS_POLICE | ||
103 | unsigned char police; | ||
104 | #endif | ||
105 | |||
106 | u32 defmap; | ||
107 | |||
108 | /* Link-sharing scheduler parameters */ | ||
109 | long maxidle; /* Class parameters: see below. */ | ||
110 | long offtime; | ||
111 | long minidle; | ||
112 | u32 avpkt; | ||
113 | struct qdisc_rate_table *R_tab; | ||
114 | |||
115 | /* Overlimit strategy parameters */ | ||
116 | void (*overlimit)(struct cbq_class *cl); | ||
117 | long penalty; | ||
118 | |||
119 | /* General scheduler (WRR) parameters */ | ||
120 | long allot; | ||
121 | long quantum; /* Allotment per WRR round */ | ||
122 | long weight; /* Relative allotment: see below */ | ||
123 | |||
124 | struct Qdisc *qdisc; /* Ptr to CBQ discipline */ | ||
125 | struct cbq_class *split; /* Ptr to split node */ | ||
126 | struct cbq_class *share; /* Ptr to LS parent in the class tree */ | ||
127 | struct cbq_class *tparent; /* Ptr to tree parent in the class tree */ | ||
128 | struct cbq_class *borrow; /* NULL if class is bandwidth limited; | ||
129 | parent otherwise */ | ||
130 | struct cbq_class *sibling; /* Sibling chain */ | ||
131 | struct cbq_class *children; /* Pointer to children chain */ | ||
132 | |||
133 | struct Qdisc *q; /* Elementary queueing discipline */ | ||
134 | |||
135 | |||
136 | /* Variables */ | ||
137 | unsigned char cpriority; /* Effective priority */ | ||
138 | unsigned char delayed; | ||
139 | unsigned char level; /* level of the class in hierarchy: | ||
140 | 0 for leaf classes, and maximal | ||
141 | level of children + 1 for nodes. | ||
142 | */ | ||
143 | |||
144 | psched_time_t last; /* Last end of service */ | ||
145 | psched_time_t undertime; | ||
146 | long avgidle; | ||
147 | long deficit; /* Saved deficit for WRR */ | ||
148 | unsigned long penalized; | ||
149 | struct gnet_stats_basic bstats; | ||
150 | struct gnet_stats_queue qstats; | ||
151 | struct gnet_stats_rate_est rate_est; | ||
152 | spinlock_t *stats_lock; | ||
153 | struct tc_cbq_xstats xstats; | ||
154 | |||
155 | struct tcf_proto *filter_list; | ||
156 | |||
157 | int refcnt; | ||
158 | int filters; | ||
159 | |||
160 | struct cbq_class *defaults[TC_PRIO_MAX+1]; | ||
161 | }; | ||
162 | |||
163 | struct cbq_sched_data | ||
164 | { | ||
165 | struct cbq_class *classes[16]; /* Hash table of all classes */ | ||
166 | int nclasses[TC_CBQ_MAXPRIO+1]; | ||
167 | unsigned quanta[TC_CBQ_MAXPRIO+1]; | ||
168 | |||
169 | struct cbq_class link; | ||
170 | |||
171 | unsigned activemask; | ||
172 | struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes | ||
173 | with backlog */ | ||
174 | |||
175 | #ifdef CONFIG_NET_CLS_POLICE | ||
176 | struct cbq_class *rx_class; | ||
177 | #endif | ||
178 | struct cbq_class *tx_class; | ||
179 | struct cbq_class *tx_borrowed; | ||
180 | int tx_len; | ||
181 | psched_time_t now; /* Cached timestamp */ | ||
182 | psched_time_t now_rt; /* Cached real time */ | ||
183 | unsigned pmask; | ||
184 | |||
185 | struct timer_list delay_timer; | ||
186 | struct timer_list wd_timer; /* Watchdog timer, | ||
187 | started when CBQ has | ||
188 | backlog, but cannot | ||
189 | transmit just now */ | ||
190 | long wd_expires; | ||
191 | int toplevel; | ||
192 | u32 hgenerator; | ||
193 | }; | ||
194 | |||
195 | |||
196 | #define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log]) | ||
197 | |||
198 | |||
199 | static __inline__ unsigned cbq_hash(u32 h) | ||
200 | { | ||
201 | h ^= h>>8; | ||
202 | h ^= h>>4; | ||
203 | return h&0xF; | ||
204 | } | ||
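A quick illustration of the hash (hypothetical classid, not taken from the source): the two xor folds and the final mask spread a 32-bit class handle over the 16 buckets of cbq_sched_data.classes[]:

/* Example: classid = 0x00010002
 *   h ^= h >> 8;   ->  0x00010102
 *   h ^= h >> 4;   ->  0x00011112
 *   h & 0xF        ->  bucket 2
 */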
205 | |||
206 | static __inline__ struct cbq_class * | ||
207 | cbq_class_lookup(struct cbq_sched_data *q, u32 classid) | ||
208 | { | ||
209 | struct cbq_class *cl; | ||
210 | |||
211 | for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next) | ||
212 | if (cl->classid == classid) | ||
213 | return cl; | ||
214 | return NULL; | ||
215 | } | ||
216 | |||
217 | #ifdef CONFIG_NET_CLS_POLICE | ||
218 | |||
219 | static struct cbq_class * | ||
220 | cbq_reclassify(struct sk_buff *skb, struct cbq_class *this) | ||
221 | { | ||
222 | struct cbq_class *cl, *new; | ||
223 | |||
224 | for (cl = this->tparent; cl; cl = cl->tparent) | ||
225 | if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this) | ||
226 | return new; | ||
227 | |||
228 | return NULL; | ||
229 | } | ||
230 | |||
231 | #endif | ||
232 | |||
233 | /* Classify packet. The procedure is pretty complicated, but | ||
234 | it allows us to combine link sharing and priority scheduling | ||
235 | transparently. | ||
236 | |||
237 | Namely, you can put link sharing rules (f.e. route based) at root of CBQ, | ||
238 | so that it resolves to split nodes. Then packets are classified | ||
239 | by logical priority, or a more specific classifier may be attached | ||
240 | to the split node. | ||
241 | */ | ||
242 | |||
243 | static struct cbq_class * | ||
244 | cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) | ||
245 | { | ||
246 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
247 | struct cbq_class *head = &q->link; | ||
248 | struct cbq_class **defmap; | ||
249 | struct cbq_class *cl = NULL; | ||
250 | u32 prio = skb->priority; | ||
251 | struct tcf_result res; | ||
252 | |||
253 | /* | ||
254 | * Step 1. If skb->priority points to one of our classes, use it. | ||
255 | */ | ||
256 | if (TC_H_MAJ(prio^sch->handle) == 0 && | ||
257 | (cl = cbq_class_lookup(q, prio)) != NULL) | ||
258 | return cl; | ||
259 | |||
260 | *qerr = NET_XMIT_DROP; | ||
261 | for (;;) { | ||
262 | int result = 0; | ||
263 | defmap = head->defaults; | ||
264 | |||
265 | /* | ||
266 | * Step 2+n. Apply classifier. | ||
267 | */ | ||
268 | if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0) | ||
269 | goto fallback; | ||
270 | |||
271 | if ((cl = (void*)res.class) == NULL) { | ||
272 | if (TC_H_MAJ(res.classid)) | ||
273 | cl = cbq_class_lookup(q, res.classid); | ||
274 | else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL) | ||
275 | cl = defmap[TC_PRIO_BESTEFFORT]; | ||
276 | |||
277 | if (cl == NULL || cl->level >= head->level) | ||
278 | goto fallback; | ||
279 | } | ||
280 | |||
281 | #ifdef CONFIG_NET_CLS_ACT | ||
282 | switch (result) { | ||
283 | case TC_ACT_QUEUED: | ||
284 | case TC_ACT_STOLEN: | ||
285 | *qerr = NET_XMIT_SUCCESS; | ||
286 | case TC_ACT_SHOT: | ||
287 | return NULL; | ||
288 | } | ||
289 | #elif defined(CONFIG_NET_CLS_POLICE) | ||
290 | switch (result) { | ||
291 | case TC_POLICE_RECLASSIFY: | ||
292 | return cbq_reclassify(skb, cl); | ||
293 | case TC_POLICE_SHOT: | ||
294 | return NULL; | ||
295 | default: | ||
296 | break; | ||
297 | } | ||
298 | #endif | ||
299 | if (cl->level == 0) | ||
300 | return cl; | ||
301 | |||
302 | /* | ||
303 | * Step 3+n. If classifier selected a link sharing class, | ||
304 | * apply agency specific classifier. | ||
305 | * Repeat this procedure until we hit a leaf node. | ||
306 | */ | ||
307 | head = cl; | ||
308 | } | ||
309 | |||
310 | fallback: | ||
311 | cl = head; | ||
312 | |||
313 | /* | ||
314 | * Step 4. No success... | ||
315 | */ | ||
316 | if (TC_H_MAJ(prio) == 0 && | ||
317 | !(cl = head->defaults[prio&TC_PRIO_MAX]) && | ||
318 | !(cl = head->defaults[TC_PRIO_BESTEFFORT])) | ||
319 | return head; | ||
320 | |||
321 | return cl; | ||
322 | } | ||
323 | |||
324 | /* | ||
325 | A packet has just been enqueued on the empty class. | ||
326 | cbq_activate_class adds it to the tail of active class list | ||
327 | of its priority band. | ||
328 | */ | ||
329 | |||
330 | static __inline__ void cbq_activate_class(struct cbq_class *cl) | ||
331 | { | ||
332 | struct cbq_sched_data *q = qdisc_priv(cl->qdisc); | ||
333 | int prio = cl->cpriority; | ||
334 | struct cbq_class *cl_tail; | ||
335 | |||
336 | cl_tail = q->active[prio]; | ||
337 | q->active[prio] = cl; | ||
338 | |||
339 | if (cl_tail != NULL) { | ||
340 | cl->next_alive = cl_tail->next_alive; | ||
341 | cl_tail->next_alive = cl; | ||
342 | } else { | ||
343 | cl->next_alive = cl; | ||
344 | q->activemask |= (1<<prio); | ||
345 | } | ||
346 | } | ||
347 | |||
348 | /* | ||
349 | Unlink class from active chain. | ||
350 | Note that this same procedure is done directly in cbq_dequeue* | ||
351 | during round-robin procedure. | ||
352 | */ | ||
353 | |||
354 | static void cbq_deactivate_class(struct cbq_class *this) | ||
355 | { | ||
356 | struct cbq_sched_data *q = qdisc_priv(this->qdisc); | ||
357 | int prio = this->cpriority; | ||
358 | struct cbq_class *cl; | ||
359 | struct cbq_class *cl_prev = q->active[prio]; | ||
360 | |||
361 | do { | ||
362 | cl = cl_prev->next_alive; | ||
363 | if (cl == this) { | ||
364 | cl_prev->next_alive = cl->next_alive; | ||
365 | cl->next_alive = NULL; | ||
366 | |||
367 | if (cl == q->active[prio]) { | ||
368 | q->active[prio] = cl_prev; | ||
369 | if (cl == q->active[prio]) { | ||
370 | q->active[prio] = NULL; | ||
371 | q->activemask &= ~(1<<prio); | ||
372 | return; | ||
373 | } | ||
374 | } | ||
375 | |||
376 | cl = cl_prev->next_alive; | ||
377 | return; | ||
378 | } | ||
379 | } while ((cl_prev = cl) != q->active[prio]); | ||
380 | } | ||
381 | |||
382 | static void | ||
383 | cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) | ||
384 | { | ||
385 | int toplevel = q->toplevel; | ||
386 | |||
387 | if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) { | ||
388 | psched_time_t now; | ||
389 | psched_tdiff_t incr; | ||
390 | |||
391 | PSCHED_GET_TIME(now); | ||
392 | incr = PSCHED_TDIFF(now, q->now_rt); | ||
393 | PSCHED_TADD2(q->now, incr, now); | ||
394 | |||
395 | do { | ||
396 | if (PSCHED_TLESS(cl->undertime, now)) { | ||
397 | q->toplevel = cl->level; | ||
398 | return; | ||
399 | } | ||
400 | } while ((cl=cl->borrow) != NULL && toplevel > cl->level); | ||
401 | } | ||
402 | } | ||
403 | |||
404 | static int | ||
405 | cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) | ||
406 | { | ||
407 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
408 | int len = skb->len; | ||
409 | int ret; | ||
410 | struct cbq_class *cl = cbq_classify(skb, sch, &ret); | ||
411 | |||
412 | #ifdef CONFIG_NET_CLS_POLICE | ||
413 | q->rx_class = cl; | ||
414 | #endif | ||
415 | if (cl == NULL) { | ||
416 | if (ret == NET_XMIT_DROP) | ||
417 | sch->qstats.drops++; | ||
418 | kfree_skb(skb); | ||
419 | return ret; | ||
420 | } | ||
421 | |||
422 | #ifdef CONFIG_NET_CLS_POLICE | ||
423 | cl->q->__parent = sch; | ||
424 | #endif | ||
425 | if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) { | ||
426 | sch->q.qlen++; | ||
427 | sch->bstats.packets++; | ||
428 | sch->bstats.bytes+=len; | ||
429 | cbq_mark_toplevel(q, cl); | ||
430 | if (!cl->next_alive) | ||
431 | cbq_activate_class(cl); | ||
432 | return ret; | ||
433 | } | ||
434 | |||
435 | sch->qstats.drops++; | ||
436 | cbq_mark_toplevel(q, cl); | ||
437 | cl->qstats.drops++; | ||
438 | return ret; | ||
439 | } | ||
440 | |||
441 | static int | ||
442 | cbq_requeue(struct sk_buff *skb, struct Qdisc *sch) | ||
443 | { | ||
444 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
445 | struct cbq_class *cl; | ||
446 | int ret; | ||
447 | |||
448 | if ((cl = q->tx_class) == NULL) { | ||
449 | kfree_skb(skb); | ||
450 | sch->qstats.drops++; | ||
451 | return NET_XMIT_CN; | ||
452 | } | ||
453 | q->tx_class = NULL; | ||
454 | |||
455 | cbq_mark_toplevel(q, cl); | ||
456 | |||
457 | #ifdef CONFIG_NET_CLS_POLICE | ||
458 | q->rx_class = cl; | ||
459 | cl->q->__parent = sch; | ||
460 | #endif | ||
461 | if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) { | ||
462 | sch->q.qlen++; | ||
463 | sch->qstats.requeues++; | ||
464 | if (!cl->next_alive) | ||
465 | cbq_activate_class(cl); | ||
466 | return 0; | ||
467 | } | ||
468 | sch->qstats.drops++; | ||
469 | cl->qstats.drops++; | ||
470 | return ret; | ||
471 | } | ||
472 | |||
473 | /* Overlimit actions */ | ||
474 | |||
475 | /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */ | ||
476 | |||
477 | static void cbq_ovl_classic(struct cbq_class *cl) | ||
478 | { | ||
479 | struct cbq_sched_data *q = qdisc_priv(cl->qdisc); | ||
480 | psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now); | ||
481 | |||
482 | if (!cl->delayed) { | ||
483 | delay += cl->offtime; | ||
484 | |||
485 | /* | ||
486 | Class goes to sleep, so that it will have no | ||
487 | chance to work avgidle. Let's forgive it 8) | ||
488 | |||
489 | BTW cbq-2.0 has a bug in this | ||
490 | place; apparently they forgot to shift it by cl->ewma_log. | ||
491 | */ | ||
492 | if (cl->avgidle < 0) | ||
493 | delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); | ||
494 | if (cl->avgidle < cl->minidle) | ||
495 | cl->avgidle = cl->minidle; | ||
496 | if (delay <= 0) | ||
497 | delay = 1; | ||
498 | PSCHED_TADD2(q->now, delay, cl->undertime); | ||
499 | |||
500 | cl->xstats.overactions++; | ||
501 | cl->delayed = 1; | ||
502 | } | ||
503 | if (q->wd_expires == 0 || q->wd_expires > delay) | ||
504 | q->wd_expires = delay; | ||
505 | |||
506 | /* Dirty work! We must schedule wakeups based on | ||
507 | real available rate, rather than leaf rate, | ||
508 | which may be tiny (even zero). | ||
509 | */ | ||
510 | if (q->toplevel == TC_CBQ_MAXLEVEL) { | ||
511 | struct cbq_class *b; | ||
512 | psched_tdiff_t base_delay = q->wd_expires; | ||
513 | |||
514 | for (b = cl->borrow; b; b = b->borrow) { | ||
515 | delay = PSCHED_TDIFF(b->undertime, q->now); | ||
516 | if (delay < base_delay) { | ||
517 | if (delay <= 0) | ||
518 | delay = 1; | ||
519 | base_delay = delay; | ||
520 | } | ||
521 | } | ||
522 | |||
523 | q->wd_expires = base_delay; | ||
524 | } | ||
525 | } | ||
526 | |||
527 | /* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when | ||
528 | they go overlimit | ||
529 | */ | ||
530 | |||
531 | static void cbq_ovl_rclassic(struct cbq_class *cl) | ||
532 | { | ||
533 | struct cbq_sched_data *q = qdisc_priv(cl->qdisc); | ||
534 | struct cbq_class *this = cl; | ||
535 | |||
536 | do { | ||
537 | if (cl->level > q->toplevel) { | ||
538 | cl = NULL; | ||
539 | break; | ||
540 | } | ||
541 | } while ((cl = cl->borrow) != NULL); | ||
542 | |||
543 | if (cl == NULL) | ||
544 | cl = this; | ||
545 | cbq_ovl_classic(cl); | ||
546 | } | ||
547 | |||
548 | /* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */ | ||
549 | |||
550 | static void cbq_ovl_delay(struct cbq_class *cl) | ||
551 | { | ||
552 | struct cbq_sched_data *q = qdisc_priv(cl->qdisc); | ||
553 | psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now); | ||
554 | |||
555 | if (!cl->delayed) { | ||
556 | unsigned long sched = jiffies; | ||
557 | |||
558 | delay += cl->offtime; | ||
559 | if (cl->avgidle < 0) | ||
560 | delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); | ||
561 | if (cl->avgidle < cl->minidle) | ||
562 | cl->avgidle = cl->minidle; | ||
563 | PSCHED_TADD2(q->now, delay, cl->undertime); | ||
564 | |||
565 | if (delay > 0) { | ||
566 | sched += PSCHED_US2JIFFIE(delay) + cl->penalty; | ||
567 | cl->penalized = sched; | ||
568 | cl->cpriority = TC_CBQ_MAXPRIO; | ||
569 | q->pmask |= (1<<TC_CBQ_MAXPRIO); | ||
570 | if (del_timer(&q->delay_timer) && | ||
571 | (long)(q->delay_timer.expires - sched) > 0) | ||
572 | q->delay_timer.expires = sched; | ||
573 | add_timer(&q->delay_timer); | ||
574 | cl->delayed = 1; | ||
575 | cl->xstats.overactions++; | ||
576 | return; | ||
577 | } | ||
578 | delay = 1; | ||
579 | } | ||
580 | if (q->wd_expires == 0 || q->wd_expires > delay) | ||
581 | q->wd_expires = delay; | ||
582 | } | ||
583 | |||
584 | /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */ | ||
585 | |||
586 | static void cbq_ovl_lowprio(struct cbq_class *cl) | ||
587 | { | ||
588 | struct cbq_sched_data *q = qdisc_priv(cl->qdisc); | ||
589 | |||
590 | cl->penalized = jiffies + cl->penalty; | ||
591 | |||
592 | if (cl->cpriority != cl->priority2) { | ||
593 | cl->cpriority = cl->priority2; | ||
594 | q->pmask |= (1<<cl->cpriority); | ||
595 | cl->xstats.overactions++; | ||
596 | } | ||
597 | cbq_ovl_classic(cl); | ||
598 | } | ||
599 | |||
600 | /* TC_CBQ_OVL_DROP: penalize class by dropping */ | ||
601 | |||
602 | static void cbq_ovl_drop(struct cbq_class *cl) | ||
603 | { | ||
604 | if (cl->q->ops->drop) | ||
605 | if (cl->q->ops->drop(cl->q)) | ||
606 | cl->qdisc->q.qlen--; | ||
607 | cl->xstats.overactions++; | ||
608 | cbq_ovl_classic(cl); | ||
609 | } | ||
610 | |||
611 | static void cbq_watchdog(unsigned long arg) | ||
612 | { | ||
613 | struct Qdisc *sch = (struct Qdisc*)arg; | ||
614 | |||
615 | sch->flags &= ~TCQ_F_THROTTLED; | ||
616 | netif_schedule(sch->dev); | ||
617 | } | ||
618 | |||
619 | static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio) | ||
620 | { | ||
621 | struct cbq_class *cl; | ||
622 | struct cbq_class *cl_prev = q->active[prio]; | ||
623 | unsigned long now = jiffies; | ||
624 | unsigned long sched = now; | ||
625 | |||
626 | if (cl_prev == NULL) | ||
627 | return now; | ||
628 | |||
629 | do { | ||
630 | cl = cl_prev->next_alive; | ||
631 | if ((long)(now - cl->penalized) > 0) { | ||
632 | cl_prev->next_alive = cl->next_alive; | ||
633 | cl->next_alive = NULL; | ||
634 | cl->cpriority = cl->priority; | ||
635 | cl->delayed = 0; | ||
636 | cbq_activate_class(cl); | ||
637 | |||
638 | if (cl == q->active[prio]) { | ||
639 | q->active[prio] = cl_prev; | ||
640 | if (cl == q->active[prio]) { | ||
641 | q->active[prio] = NULL; | ||
642 | return 0; | ||
643 | } | ||
644 | } | ||
645 | |||
646 | cl = cl_prev->next_alive; | ||
647 | } else if ((long)(sched - cl->penalized) > 0) | ||
648 | sched = cl->penalized; | ||
649 | } while ((cl_prev = cl) != q->active[prio]); | ||
650 | |||
651 | return (long)(sched - now); | ||
652 | } | ||
653 | |||
654 | static void cbq_undelay(unsigned long arg) | ||
655 | { | ||
656 | struct Qdisc *sch = (struct Qdisc*)arg; | ||
657 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
658 | long delay = 0; | ||
659 | unsigned pmask; | ||
660 | |||
661 | pmask = q->pmask; | ||
662 | q->pmask = 0; | ||
663 | |||
664 | while (pmask) { | ||
665 | int prio = ffz(~pmask); | ||
666 | long tmp; | ||
667 | |||
668 | pmask &= ~(1<<prio); | ||
669 | |||
670 | tmp = cbq_undelay_prio(q, prio); | ||
671 | if (tmp > 0) { | ||
672 | q->pmask |= 1<<prio; | ||
673 | if (tmp < delay || delay == 0) | ||
674 | delay = tmp; | ||
675 | } | ||
676 | } | ||
677 | |||
678 | if (delay) { | ||
679 | q->delay_timer.expires = jiffies + delay; | ||
680 | add_timer(&q->delay_timer); | ||
681 | } | ||
682 | |||
683 | sch->flags &= ~TCQ_F_THROTTLED; | ||
684 | netif_schedule(sch->dev); | ||
685 | } | ||
686 | |||
687 | |||
688 | #ifdef CONFIG_NET_CLS_POLICE | ||
689 | |||
690 | static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) | ||
691 | { | ||
692 | int len = skb->len; | ||
693 | struct Qdisc *sch = child->__parent; | ||
694 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
695 | struct cbq_class *cl = q->rx_class; | ||
696 | |||
697 | q->rx_class = NULL; | ||
698 | |||
699 | if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) { | ||
700 | |||
701 | cbq_mark_toplevel(q, cl); | ||
702 | |||
703 | q->rx_class = cl; | ||
704 | cl->q->__parent = sch; | ||
705 | |||
706 | if (cl->q->enqueue(skb, cl->q) == 0) { | ||
707 | sch->q.qlen++; | ||
708 | sch->bstats.packets++; | ||
709 | sch->bstats.bytes+=len; | ||
710 | if (!cl->next_alive) | ||
711 | cbq_activate_class(cl); | ||
712 | return 0; | ||
713 | } | ||
714 | sch->qstats.drops++; | ||
715 | return 0; | ||
716 | } | ||
717 | |||
718 | sch->qstats.drops++; | ||
719 | return -1; | ||
720 | } | ||
721 | #endif | ||
722 | |||
723 | /* | ||
724 | This is a mission-critical procedure. | ||
725 | |||
726 | We "regenerate" the toplevel cutoff if the transmitting class | ||
727 | has backlog and is not regulated. This is not part of the | ||
728 | original CBQ description, but it looks more reasonable. | ||
729 | It is probably wrong; this question needs further investigation. | ||
730 | */ | ||
731 | |||
732 | static __inline__ void | ||
733 | cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl, | ||
734 | struct cbq_class *borrowed) | ||
735 | { | ||
736 | if (cl && q->toplevel >= borrowed->level) { | ||
737 | if (cl->q->q.qlen > 1) { | ||
738 | do { | ||
739 | if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) { | ||
740 | q->toplevel = borrowed->level; | ||
741 | return; | ||
742 | } | ||
743 | } while ((borrowed=borrowed->borrow) != NULL); | ||
744 | } | ||
745 | #if 0 | ||
746 | /* It is not necessary now. Uncommenting it | ||
747 | will save CPU cycles, but decrease fairness. | ||
748 | */ | ||
749 | q->toplevel = TC_CBQ_MAXLEVEL; | ||
750 | #endif | ||
751 | } | ||
752 | } | ||
753 | |||
754 | static void | ||
755 | cbq_update(struct cbq_sched_data *q) | ||
756 | { | ||
757 | struct cbq_class *this = q->tx_class; | ||
758 | struct cbq_class *cl = this; | ||
759 | int len = q->tx_len; | ||
760 | |||
761 | q->tx_class = NULL; | ||
762 | |||
763 | for ( ; cl; cl = cl->share) { | ||
764 | long avgidle = cl->avgidle; | ||
765 | long idle; | ||
766 | |||
767 | cl->bstats.packets++; | ||
768 | cl->bstats.bytes += len; | ||
769 | |||
770 | /* | ||
771 | (now - last) is total time between packet right edges. | ||
772 | (last_pktlen/rate) is "virtual" busy time, so that | ||
773 | |||
774 | idle = (now - last) - last_pktlen/rate | ||
775 | */ | ||
776 | |||
777 | idle = PSCHED_TDIFF(q->now, cl->last); | ||
778 | if ((unsigned long)idle > 128*1024*1024) { | ||
779 | avgidle = cl->maxidle; | ||
780 | } else { | ||
781 | idle -= L2T(cl, len); | ||
782 | |||
783 | /* true_avgidle := (1-W)*true_avgidle + W*idle, | ||
784 | where W=2^{-ewma_log}. But cl->avgidle is scaled: | ||
785 | cl->avgidle == true_avgidle/W, | ||
786 | hence: | ||
787 | */ | ||
788 | avgidle += idle - (avgidle>>cl->ewma_log); | ||
789 | } | ||
790 | |||
791 | if (avgidle <= 0) { | ||
792 | /* Overlimit or at-limit */ | ||
793 | |||
794 | if (avgidle < cl->minidle) | ||
795 | avgidle = cl->minidle; | ||
796 | |||
797 | cl->avgidle = avgidle; | ||
798 | |||
799 | /* Calculate expected time, when this class | ||
800 | will be allowed to send. | ||
801 | It will occur, when: | ||
802 | (1-W)*true_avgidle + W*delay = 0, i.e. | ||
803 | idle = (1/W - 1)*(-true_avgidle) | ||
804 | or | ||
805 | idle = (1 - W)*(-cl->avgidle); | ||
806 | */ | ||
807 | idle = (-avgidle) - ((-avgidle) >> cl->ewma_log); | ||
808 | |||
809 | /* | ||
810 | That is not all. | ||
811 | To maintain the rate allocated to the class, | ||
812 | we add to undertime virtual clock, | ||
813 | necessary to complete transmitted packet. | ||
814 | (len/phys_bandwidth has been already passed | ||
815 | to the moment of cbq_update) | ||
816 | */ | ||
817 | |||
818 | idle -= L2T(&q->link, len); | ||
819 | idle += L2T(cl, len); | ||
820 | |||
821 | PSCHED_AUDIT_TDIFF(idle); | ||
822 | |||
823 | PSCHED_TADD2(q->now, idle, cl->undertime); | ||
824 | } else { | ||
825 | /* Underlimit */ | ||
826 | |||
827 | PSCHED_SET_PASTPERFECT(cl->undertime); | ||
828 | if (avgidle > cl->maxidle) | ||
829 | cl->avgidle = cl->maxidle; | ||
830 | else | ||
831 | cl->avgidle = avgidle; | ||
832 | } | ||
833 | cl->last = q->now; | ||
834 | } | ||
835 | |||
836 | cbq_update_toplevel(q, this, q->tx_borrowed); | ||
837 | } | ||
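A short sanity check of the scaled EWMA update in cbq_update, using the notation of the in-code comment ($W = 2^{-\mathrm{ewma\_log}}$, $\mathrm{avgidle} = \mathrm{true\_avgidle}/W$):

\[
\mathrm{true}' = (1-W)\,\mathrm{true} + W\,\mathrm{idle}
\;\Longrightarrow\;
\mathrm{avgidle}' = \frac{\mathrm{true}'}{W}
= \mathrm{avgidle} + \mathrm{idle} - W\,\mathrm{avgidle},
\]

and $W\,\mathrm{avgidle}$ is exactly the right shift by ewma_log in the code (up to integer truncation).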
838 | |||
839 | static __inline__ struct cbq_class * | ||
840 | cbq_under_limit(struct cbq_class *cl) | ||
841 | { | ||
842 | struct cbq_sched_data *q = qdisc_priv(cl->qdisc); | ||
843 | struct cbq_class *this_cl = cl; | ||
844 | |||
845 | if (cl->tparent == NULL) | ||
846 | return cl; | ||
847 | |||
848 | if (PSCHED_IS_PASTPERFECT(cl->undertime) || | ||
849 | !PSCHED_TLESS(q->now, cl->undertime)) { | ||
850 | cl->delayed = 0; | ||
851 | return cl; | ||
852 | } | ||
853 | |||
854 | do { | ||
855 | /* This is a very suspicious place. Currently the overlimit | ||
856 | action is generated for non-bounded classes | ||
857 | only if the link is completely congested. | ||
858 | Though this agrees with the ancestor-only paradigm, | ||
859 | it looks very stupid. Particularly, | ||
860 | it means that this chunk of code will either | ||
861 | never be called or result in strong amplification | ||
862 | of burstiness. Dangerous and silly, but | ||
863 | no other solution exists. | ||
864 | */ | ||
865 | if ((cl = cl->borrow) == NULL) { | ||
866 | this_cl->qstats.overlimits++; | ||
867 | this_cl->overlimit(this_cl); | ||
868 | return NULL; | ||
869 | } | ||
870 | if (cl->level > q->toplevel) | ||
871 | return NULL; | ||
872 | } while (!PSCHED_IS_PASTPERFECT(cl->undertime) && | ||
873 | PSCHED_TLESS(q->now, cl->undertime)); | ||
874 | |||
875 | cl->delayed = 0; | ||
876 | return cl; | ||
877 | } | ||
878 | |||
879 | static __inline__ struct sk_buff * | ||
880 | cbq_dequeue_prio(struct Qdisc *sch, int prio) | ||
881 | { | ||
882 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
883 | struct cbq_class *cl_tail, *cl_prev, *cl; | ||
884 | struct sk_buff *skb; | ||
885 | int deficit; | ||
886 | |||
887 | cl_tail = cl_prev = q->active[prio]; | ||
888 | cl = cl_prev->next_alive; | ||
889 | |||
890 | do { | ||
891 | deficit = 0; | ||
892 | |||
893 | /* Start round */ | ||
894 | do { | ||
895 | struct cbq_class *borrow = cl; | ||
896 | |||
897 | if (cl->q->q.qlen && | ||
898 | (borrow = cbq_under_limit(cl)) == NULL) | ||
899 | goto skip_class; | ||
900 | |||
901 | if (cl->deficit <= 0) { | ||
902 | /* Class exhausted its allotment per | ||
903 | this round. Switch to the next one. | ||
904 | */ | ||
905 | deficit = 1; | ||
906 | cl->deficit += cl->quantum; | ||
907 | goto next_class; | ||
908 | } | ||
909 | |||
910 | skb = cl->q->dequeue(cl->q); | ||
911 | |||
912 | /* Class did not give us any skb :-( | ||
913 | It could occur even if cl->q->q.qlen != 0 | ||
914 | f.e. if cl->q == "tbf" | ||
915 | */ | ||
916 | if (skb == NULL) | ||
917 | goto skip_class; | ||
918 | |||
919 | cl->deficit -= skb->len; | ||
920 | q->tx_class = cl; | ||
921 | q->tx_borrowed = borrow; | ||
922 | if (borrow != cl) { | ||
923 | #ifndef CBQ_XSTATS_BORROWS_BYTES | ||
924 | borrow->xstats.borrows++; | ||
925 | cl->xstats.borrows++; | ||
926 | #else | ||
927 | borrow->xstats.borrows += skb->len; | ||
928 | cl->xstats.borrows += skb->len; | ||
929 | #endif | ||
930 | } | ||
931 | q->tx_len = skb->len; | ||
932 | |||
933 | if (cl->deficit <= 0) { | ||
934 | q->active[prio] = cl; | ||
935 | cl = cl->next_alive; | ||
936 | cl->deficit += cl->quantum; | ||
937 | } | ||
938 | return skb; | ||
939 | |||
940 | skip_class: | ||
941 | if (cl->q->q.qlen == 0 || prio != cl->cpriority) { | ||
942 | /* Class is empty or penalized. | ||
943 | Unlink it from active chain. | ||
944 | */ | ||
945 | cl_prev->next_alive = cl->next_alive; | ||
946 | cl->next_alive = NULL; | ||
947 | |||
948 | /* Did cl_tail point to it? */ | ||
949 | if (cl == cl_tail) { | ||
950 | /* Repair it! */ | ||
951 | cl_tail = cl_prev; | ||
952 | |||
953 | /* Was it the last class in this band? */ | ||
954 | if (cl == cl_tail) { | ||
955 | /* Kill the band! */ | ||
956 | q->active[prio] = NULL; | ||
957 | q->activemask &= ~(1<<prio); | ||
958 | if (cl->q->q.qlen) | ||
959 | cbq_activate_class(cl); | ||
960 | return NULL; | ||
961 | } | ||
962 | |||
963 | q->active[prio] = cl_tail; | ||
964 | } | ||
965 | if (cl->q->q.qlen) | ||
966 | cbq_activate_class(cl); | ||
967 | |||
968 | cl = cl_prev; | ||
969 | } | ||
970 | |||
971 | next_class: | ||
972 | cl_prev = cl; | ||
973 | cl = cl->next_alive; | ||
974 | } while (cl_prev != cl_tail); | ||
975 | } while (deficit); | ||
976 | |||
977 | q->active[prio] = cl_prev; | ||
978 | |||
979 | return NULL; | ||
980 | } | ||
981 | |||
982 | static __inline__ struct sk_buff * | ||
983 | cbq_dequeue_1(struct Qdisc *sch) | ||
984 | { | ||
985 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
986 | struct sk_buff *skb; | ||
987 | unsigned activemask; | ||
988 | |||
989 | activemask = q->activemask&0xFF; | ||
990 | while (activemask) { | ||
991 | int prio = ffz(~activemask); | ||
992 | activemask &= ~(1<<prio); | ||
993 | skb = cbq_dequeue_prio(sch, prio); | ||
994 | if (skb) | ||
995 | return skb; | ||
996 | } | ||
997 | return NULL; | ||
998 | } | ||
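The band scan relies on ffz(~activemask) returning the lowest set bit of activemask, i.e. the highest-priority band that still has backlog. A small worked example with a hypothetical mask:

/* Example: activemask = 0x14 (bands 2 and 4 have backlog)
 *   ffz(~0x14) = 2          -> serve priority band 2 first
 *   activemask &= ~(1<<2)   -> 0x10; the next iteration picks band 4
 */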
999 | |||
1000 | static struct sk_buff * | ||
1001 | cbq_dequeue(struct Qdisc *sch) | ||
1002 | { | ||
1003 | struct sk_buff *skb; | ||
1004 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
1005 | psched_time_t now; | ||
1006 | psched_tdiff_t incr; | ||
1007 | |||
1008 | PSCHED_GET_TIME(now); | ||
1009 | incr = PSCHED_TDIFF(now, q->now_rt); | ||
1010 | |||
1011 | if (q->tx_class) { | ||
1012 | psched_tdiff_t incr2; | ||
1013 | /* Time integrator. We calculate EOS time | ||
1014 | by adding the expected packet transmission time. | ||
1015 | If real time is greater, we warp the artificial clock, | ||
1016 | so that: | ||
1017 | |||
1018 | cbq_time = max(real_time, work); | ||
1019 | */ | ||
1020 | incr2 = L2T(&q->link, q->tx_len); | ||
1021 | PSCHED_TADD(q->now, incr2); | ||
1022 | cbq_update(q); | ||
1023 | if ((incr -= incr2) < 0) | ||
1024 | incr = 0; | ||
1025 | } | ||
1026 | PSCHED_TADD(q->now, incr); | ||
1027 | q->now_rt = now; | ||
1028 | |||
1029 | for (;;) { | ||
1030 | q->wd_expires = 0; | ||
1031 | |||
1032 | skb = cbq_dequeue_1(sch); | ||
1033 | if (skb) { | ||
1034 | sch->q.qlen--; | ||
1035 | sch->flags &= ~TCQ_F_THROTTLED; | ||
1036 | return skb; | ||
1037 | } | ||
1038 | |||
1039 | /* All the classes are overlimit. | ||
1040 | |||
1041 | This can happen if: | ||
1042 | |||
1043 | 1. The scheduler is empty. | ||
1044 | 2. The top-level cutoff inhibited borrowing. | ||
1045 | 3. The root class is overlimit. | ||
1046 | |||
1047 | Reset conditions 2 and 3 and retry. | ||
1048 | |||
1049 | Note that NS and cbq-2.0 are buggy here: peeking at | ||
1050 | an arbitrary class is appropriate for ancestor-only | ||
1051 | sharing, but not for the top-level algorithm. | ||
1052 | |||
1053 | Our version is better but slower, because it requires | ||
1054 | two passes; that is unavoidable with top-level sharing. | ||
1055 | */ | ||
1056 | |||
1057 | if (q->toplevel == TC_CBQ_MAXLEVEL && | ||
1058 | PSCHED_IS_PASTPERFECT(q->link.undertime)) | ||
1059 | break; | ||
1060 | |||
1061 | q->toplevel = TC_CBQ_MAXLEVEL; | ||
1062 | PSCHED_SET_PASTPERFECT(q->link.undertime); | ||
1063 | } | ||
1064 | |||
1065 | /* No packets in the scheduler, or nobody wants to give them to us :-( | ||
1066 | Sigh... start the watchdog timer in the latter case. */ | ||
1067 | |||
1068 | if (sch->q.qlen) { | ||
1069 | sch->qstats.overlimits++; | ||
1070 | if (q->wd_expires) { | ||
1071 | long delay = PSCHED_US2JIFFIE(q->wd_expires); | ||
1072 | if (delay <= 0) | ||
1073 | delay = 1; | ||
1074 | mod_timer(&q->wd_timer, jiffies + delay); | ||
1075 | sch->flags |= TCQ_F_THROTTLED; | ||
1076 | } | ||
1077 | } | ||
1078 | return NULL; | ||
1079 | } | ||
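The time-integrator comment in cbq_dequeue() amounts to advancing the scheduler's artificial clock by whichever is larger: the real time that elapsed, or the time the previous packet should have taken at the configured rate (the "work"), i.e. cbq_time = max(real_time, work). A small sketch of that update in microseconds, with an invented link rate and packet size:

    #include <stdio.h>

    int main(void)
    {
        long long now_us = 0;              /* scheduler's artificial clock */
        long long rate_bps = 1000000;      /* 1 Mbit/s link, made-up figure */
        long long real_elapsed_us = 500;   /* real time since last dequeue */
        long long pkt_bits = 1500 * 8;     /* previous packet */

        /* expected transmission time of the last packet (the "work") */
        long long work_us = pkt_bits * 1000000 / rate_bps;   /* 12000 us */

        /* advance the clock by max(real time, work), as the comment says */
        long long incr = real_elapsed_us > work_us ? real_elapsed_us : work_us;
        now_us += incr;

        printf("clock advanced by %lld us (now %lld us)\n", incr, now_us);
        return 0;
    }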
1080 | |||
1081 | /* CBQ class maintenance routines */ | ||
1082 | |||
1083 | static void cbq_adjust_levels(struct cbq_class *this) | ||
1084 | { | ||
1085 | if (this == NULL) | ||
1086 | return; | ||
1087 | |||
1088 | do { | ||
1089 | int level = 0; | ||
1090 | struct cbq_class *cl; | ||
1091 | |||
1092 | if ((cl = this->children) != NULL) { | ||
1093 | do { | ||
1094 | if (cl->level > level) | ||
1095 | level = cl->level; | ||
1096 | } while ((cl = cl->sibling) != this->children); | ||
1097 | } | ||
1098 | this->level = level+1; | ||
1099 | } while ((this = this->tparent) != NULL); | ||
1100 | } | ||
1101 | |||
1102 | static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) | ||
1103 | { | ||
1104 | struct cbq_class *cl; | ||
1105 | unsigned h; | ||
1106 | |||
1107 | if (q->quanta[prio] == 0) | ||
1108 | return; | ||
1109 | |||
1110 | for (h=0; h<16; h++) { | ||
1111 | for (cl = q->classes[h]; cl; cl = cl->next) { | ||
1112 | /* BUGGGG... Beware! This expression suffers from | ||
1113 | arithmetic overflows! | ||
1114 | */ | ||
1115 | if (cl->priority == prio) { | ||
1116 | cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ | ||
1117 | q->quanta[prio]; | ||
1118 | } | ||
1119 | if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) { | ||
1120 | printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum); | ||
1121 | cl->quantum = cl->qdisc->dev->mtu/2 + 1; | ||
1122 | } | ||
1123 | } | ||
1124 | } | ||
1125 | } | ||
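cbq_normalize_quanta() derives each class's byte quantum as weight * allot * nclasses / total_weight for its priority, and the in-code warning is fair: the product is formed before the division and can overflow a 32-bit long. A worked example with invented numbers, including the repair the kernel applies when the result is out of range:

    #include <stdio.h>

    int main(void)
    {
        /* two classes at one priority; all figures are illustrative only */
        long weight[2] = { 10, 30 };
        long allot    = 1514;                   /* per-class allotment, bytes */
        long nclasses = 2;
        long total_w  = weight[0] + weight[1];  /* corresponds to q->quanta[prio] */
        long mtu      = 1500;

        for (int i = 0; i < 2; i++) {
            long quantum = weight[i] * allot * nclasses / total_w;

            /* same sanity check as the kernel: repair absurd quanta */
            if (quantum <= 0 || quantum > 32 * mtu)
                quantum = mtu / 2 + 1;
            printf("class %d: quantum = %ld bytes\n", i, quantum);
        }
        return 0;
    }

With these numbers the two classes get 757 and 2271 bytes per round, i.e. quanta in proportion to their weights.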
1126 | |||
1127 | static void cbq_sync_defmap(struct cbq_class *cl) | ||
1128 | { | ||
1129 | struct cbq_sched_data *q = qdisc_priv(cl->qdisc); | ||
1130 | struct cbq_class *split = cl->split; | ||
1131 | unsigned h; | ||
1132 | int i; | ||
1133 | |||
1134 | if (split == NULL) | ||
1135 | return; | ||
1136 | |||
1137 | for (i=0; i<=TC_PRIO_MAX; i++) { | ||
1138 | if (split->defaults[i] == cl && !(cl->defmap&(1<<i))) | ||
1139 | split->defaults[i] = NULL; | ||
1140 | } | ||
1141 | |||
1142 | for (i=0; i<=TC_PRIO_MAX; i++) { | ||
1143 | int level = split->level; | ||
1144 | |||
1145 | if (split->defaults[i]) | ||
1146 | continue; | ||
1147 | |||
1148 | for (h=0; h<16; h++) { | ||
1149 | struct cbq_class *c; | ||
1150 | |||
1151 | for (c = q->classes[h]; c; c = c->next) { | ||
1152 | if (c->split == split && c->level < level && | ||
1153 | c->defmap&(1<<i)) { | ||
1154 | split->defaults[i] = c; | ||
1155 | level = c->level; | ||
1156 | } | ||
1157 | } | ||
1158 | } | ||
1159 | } | ||
1160 | } | ||
1161 | |||
1162 | static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask) | ||
1163 | { | ||
1164 | struct cbq_class *split = NULL; | ||
1165 | |||
1166 | if (splitid == 0) { | ||
1167 | if ((split = cl->split) == NULL) | ||
1168 | return; | ||
1169 | splitid = split->classid; | ||
1170 | } | ||
1171 | |||
1172 | if (split == NULL || split->classid != splitid) { | ||
1173 | for (split = cl->tparent; split; split = split->tparent) | ||
1174 | if (split->classid == splitid) | ||
1175 | break; | ||
1176 | } | ||
1177 | |||
1178 | if (split == NULL) | ||
1179 | return; | ||
1180 | |||
1181 | if (cl->split != split) { | ||
1182 | cl->defmap = 0; | ||
1183 | cbq_sync_defmap(cl); | ||
1184 | cl->split = split; | ||
1185 | cl->defmap = def&mask; | ||
1186 | } else | ||
1187 | cl->defmap = (cl->defmap&~mask)|(def&mask); | ||
1188 | |||
1189 | cbq_sync_defmap(cl); | ||
1190 | } | ||
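The defmap update in cbq_change_defmap() is a standard masked merge: bits selected by mask are taken from def, all other bits keep their previous value. A two-line sketch with arbitrary bit patterns:

    #include <stdio.h>

    int main(void)
    {
        unsigned int defmap = 0x0f;   /* old map: priorities 0-3 selected */
        unsigned int def    = 0x30;   /* requested new bits */
        unsigned int mask   = 0xf0;   /* only the high nibble may change */

        defmap = (defmap & ~mask) | (def & mask);   /* as in cbq_change_defmap() */
        printf("new defmap = 0x%02x\n", defmap);    /* prints 0x3f */
        return 0;
    }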
1191 | |||
1192 | static void cbq_unlink_class(struct cbq_class *this) | ||
1193 | { | ||
1194 | struct cbq_class *cl, **clp; | ||
1195 | struct cbq_sched_data *q = qdisc_priv(this->qdisc); | ||
1196 | |||
1197 | for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) { | ||
1198 | if (cl == this) { | ||
1199 | *clp = cl->next; | ||
1200 | cl->next = NULL; | ||
1201 | break; | ||
1202 | } | ||
1203 | } | ||
1204 | |||
1205 | if (this->tparent) { | ||
1206 | clp=&this->sibling; | ||
1207 | cl = *clp; | ||
1208 | do { | ||
1209 | if (cl == this) { | ||
1210 | *clp = cl->sibling; | ||
1211 | break; | ||
1212 | } | ||
1213 | clp = &cl->sibling; | ||
1214 | } while ((cl = *clp) != this->sibling); | ||
1215 | |||
1216 | if (this->tparent->children == this) { | ||
1217 | this->tparent->children = this->sibling; | ||
1218 | if (this->sibling == this) | ||
1219 | this->tparent->children = NULL; | ||
1220 | } | ||
1221 | } else { | ||
1222 | BUG_TRAP(this->sibling == this); | ||
1223 | } | ||
1224 | } | ||
1225 | |||
1226 | static void cbq_link_class(struct cbq_class *this) | ||
1227 | { | ||
1228 | struct cbq_sched_data *q = qdisc_priv(this->qdisc); | ||
1229 | unsigned h = cbq_hash(this->classid); | ||
1230 | struct cbq_class *parent = this->tparent; | ||
1231 | |||
1232 | this->sibling = this; | ||
1233 | this->next = q->classes[h]; | ||
1234 | q->classes[h] = this; | ||
1235 | |||
1236 | if (parent == NULL) | ||
1237 | return; | ||
1238 | |||
1239 | if (parent->children == NULL) { | ||
1240 | parent->children = this; | ||
1241 | } else { | ||
1242 | this->sibling = parent->children->sibling; | ||
1243 | parent->children->sibling = this; | ||
1244 | } | ||
1245 | } | ||
1246 | |||
1247 | static unsigned int cbq_drop(struct Qdisc* sch) | ||
1248 | { | ||
1249 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
1250 | struct cbq_class *cl, *cl_head; | ||
1251 | int prio; | ||
1252 | unsigned int len; | ||
1253 | |||
1254 | for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { | ||
1255 | if ((cl_head = q->active[prio]) == NULL) | ||
1256 | continue; | ||
1257 | |||
1258 | cl = cl_head; | ||
1259 | do { | ||
1260 | if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) { | ||
1261 | sch->q.qlen--; | ||
1262 | return len; | ||
1263 | } | ||
1264 | } while ((cl = cl->next_alive) != cl_head); | ||
1265 | } | ||
1266 | return 0; | ||
1267 | } | ||
1268 | |||
1269 | static void | ||
1270 | cbq_reset(struct Qdisc* sch) | ||
1271 | { | ||
1272 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
1273 | struct cbq_class *cl; | ||
1274 | int prio; | ||
1275 | unsigned h; | ||
1276 | |||
1277 | q->activemask = 0; | ||
1278 | q->pmask = 0; | ||
1279 | q->tx_class = NULL; | ||
1280 | q->tx_borrowed = NULL; | ||
1281 | del_timer(&q->wd_timer); | ||
1282 | del_timer(&q->delay_timer); | ||
1283 | q->toplevel = TC_CBQ_MAXLEVEL; | ||
1284 | PSCHED_GET_TIME(q->now); | ||
1285 | q->now_rt = q->now; | ||
1286 | |||
1287 | for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) | ||
1288 | q->active[prio] = NULL; | ||
1289 | |||
1290 | for (h = 0; h < 16; h++) { | ||
1291 | for (cl = q->classes[h]; cl; cl = cl->next) { | ||
1292 | qdisc_reset(cl->q); | ||
1293 | |||
1294 | cl->next_alive = NULL; | ||
1295 | PSCHED_SET_PASTPERFECT(cl->undertime); | ||
1296 | cl->avgidle = cl->maxidle; | ||
1297 | cl->deficit = cl->quantum; | ||
1298 | cl->cpriority = cl->priority; | ||
1299 | } | ||
1300 | } | ||
1301 | sch->q.qlen = 0; | ||
1302 | } | ||
1303 | |||
1304 | |||
1305 | static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) | ||
1306 | { | ||
1307 | if (lss->change&TCF_CBQ_LSS_FLAGS) { | ||
1308 | cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; | ||
1309 | cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent; | ||
1310 | } | ||
1311 | if (lss->change&TCF_CBQ_LSS_EWMA) | ||
1312 | cl->ewma_log = lss->ewma_log; | ||
1313 | if (lss->change&TCF_CBQ_LSS_AVPKT) | ||
1314 | cl->avpkt = lss->avpkt; | ||
1315 | if (lss->change&TCF_CBQ_LSS_MINIDLE) | ||
1316 | cl->minidle = -(long)lss->minidle; | ||
1317 | if (lss->change&TCF_CBQ_LSS_MAXIDLE) { | ||
1318 | cl->maxidle = lss->maxidle; | ||
1319 | cl->avgidle = lss->maxidle; | ||
1320 | } | ||
1321 | if (lss->change&TCF_CBQ_LSS_OFFTIME) | ||
1322 | cl->offtime = lss->offtime; | ||
1323 | return 0; | ||
1324 | } | ||
1325 | |||
1326 | static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl) | ||
1327 | { | ||
1328 | q->nclasses[cl->priority]--; | ||
1329 | q->quanta[cl->priority] -= cl->weight; | ||
1330 | cbq_normalize_quanta(q, cl->priority); | ||
1331 | } | ||
1332 | |||
1333 | static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl) | ||
1334 | { | ||
1335 | q->nclasses[cl->priority]++; | ||
1336 | q->quanta[cl->priority] += cl->weight; | ||
1337 | cbq_normalize_quanta(q, cl->priority); | ||
1338 | } | ||
1339 | |||
1340 | static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) | ||
1341 | { | ||
1342 | struct cbq_sched_data *q = qdisc_priv(cl->qdisc); | ||
1343 | |||
1344 | if (wrr->allot) | ||
1345 | cl->allot = wrr->allot; | ||
1346 | if (wrr->weight) | ||
1347 | cl->weight = wrr->weight; | ||
1348 | if (wrr->priority) { | ||
1349 | cl->priority = wrr->priority-1; | ||
1350 | cl->cpriority = cl->priority; | ||
1351 | if (cl->priority >= cl->priority2) | ||
1352 | cl->priority2 = TC_CBQ_MAXPRIO-1; | ||
1353 | } | ||
1354 | |||
1355 | cbq_addprio(q, cl); | ||
1356 | return 0; | ||
1357 | } | ||
1358 | |||
1359 | static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) | ||
1360 | { | ||
1361 | switch (ovl->strategy) { | ||
1362 | case TC_CBQ_OVL_CLASSIC: | ||
1363 | cl->overlimit = cbq_ovl_classic; | ||
1364 | break; | ||
1365 | case TC_CBQ_OVL_DELAY: | ||
1366 | cl->overlimit = cbq_ovl_delay; | ||
1367 | break; | ||
1368 | case TC_CBQ_OVL_LOWPRIO: | ||
1369 | if (ovl->priority2-1 >= TC_CBQ_MAXPRIO || | ||
1370 | ovl->priority2-1 <= cl->priority) | ||
1371 | return -EINVAL; | ||
1372 | cl->priority2 = ovl->priority2-1; | ||
1373 | cl->overlimit = cbq_ovl_lowprio; | ||
1374 | break; | ||
1375 | case TC_CBQ_OVL_DROP: | ||
1376 | cl->overlimit = cbq_ovl_drop; | ||
1377 | break; | ||
1378 | case TC_CBQ_OVL_RCLASSIC: | ||
1379 | cl->overlimit = cbq_ovl_rclassic; | ||
1380 | break; | ||
1381 | default: | ||
1382 | return -EINVAL; | ||
1383 | } | ||
1384 | cl->penalty = (ovl->penalty*HZ)/1000; | ||
1385 | return 0; | ||
1386 | } | ||
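cbq_set_overlimit() converts the user-supplied penalty from milliseconds to jiffies with penalty * HZ / 1000. For instance, assuming HZ = 250 (one possible tick rate; the real value depends on the kernel configuration), a 100 ms penalty becomes 25 ticks:

    #include <stdio.h>

    #define HZ 250   /* assumed tick rate, for illustration only */

    int main(void)
    {
        unsigned int penalty_ms = 100;
        unsigned long penalty_jiffies = (unsigned long)penalty_ms * HZ / 1000;

        printf("%u ms -> %lu jiffies at HZ=%d\n",
               penalty_ms, penalty_jiffies, HZ);
        return 0;
    }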
1387 | |||
1388 | #ifdef CONFIG_NET_CLS_POLICE | ||
1389 | static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p) | ||
1390 | { | ||
1391 | cl->police = p->police; | ||
1392 | |||
1393 | if (cl->q->handle) { | ||
1394 | if (p->police == TC_POLICE_RECLASSIFY) | ||
1395 | cl->q->reshape_fail = cbq_reshape_fail; | ||
1396 | else | ||
1397 | cl->q->reshape_fail = NULL; | ||
1398 | } | ||
1399 | return 0; | ||
1400 | } | ||
1401 | #endif | ||
1402 | |||
1403 | static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) | ||
1404 | { | ||
1405 | cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange); | ||
1406 | return 0; | ||
1407 | } | ||
1408 | |||
1409 | static int cbq_init(struct Qdisc *sch, struct rtattr *opt) | ||
1410 | { | ||
1411 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
1412 | struct rtattr *tb[TCA_CBQ_MAX]; | ||
1413 | struct tc_ratespec *r; | ||
1414 | |||
1415 | if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 || | ||
1416 | tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || | ||
1417 | RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) | ||
1418 | return -EINVAL; | ||
1419 | |||
1420 | if (tb[TCA_CBQ_LSSOPT-1] && | ||
1421 | RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) | ||
1422 | return -EINVAL; | ||
1423 | |||
1424 | r = RTA_DATA(tb[TCA_CBQ_RATE-1]); | ||
1425 | |||
1426 | if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL) | ||
1427 | return -EINVAL; | ||
1428 | |||
1429 | q->link.refcnt = 1; | ||
1430 | q->link.sibling = &q->link; | ||
1431 | q->link.classid = sch->handle; | ||
1432 | q->link.qdisc = sch; | ||
1433 | if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) | ||
1434 | q->link.q = &noop_qdisc; | ||
1435 | |||
1436 | q->link.priority = TC_CBQ_MAXPRIO-1; | ||
1437 | q->link.priority2 = TC_CBQ_MAXPRIO-1; | ||
1438 | q->link.cpriority = TC_CBQ_MAXPRIO-1; | ||
1439 | q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC; | ||
1440 | q->link.overlimit = cbq_ovl_classic; | ||
1441 | q->link.allot = psched_mtu(sch->dev); | ||
1442 | q->link.quantum = q->link.allot; | ||
1443 | q->link.weight = q->link.R_tab->rate.rate; | ||
1444 | |||
1445 | q->link.ewma_log = TC_CBQ_DEF_EWMA; | ||
1446 | q->link.avpkt = q->link.allot/2; | ||
1447 | q->link.minidle = -0x7FFFFFFF; | ||
1448 | q->link.stats_lock = &sch->dev->queue_lock; | ||
1449 | |||
1450 | init_timer(&q->wd_timer); | ||
1451 | q->wd_timer.data = (unsigned long)sch; | ||
1452 | q->wd_timer.function = cbq_watchdog; | ||
1453 | init_timer(&q->delay_timer); | ||
1454 | q->delay_timer.data = (unsigned long)sch; | ||
1455 | q->delay_timer.function = cbq_undelay; | ||
1456 | q->toplevel = TC_CBQ_MAXLEVEL; | ||
1457 | PSCHED_GET_TIME(q->now); | ||
1458 | q->now_rt = q->now; | ||
1459 | |||
1460 | cbq_link_class(&q->link); | ||
1461 | |||
1462 | if (tb[TCA_CBQ_LSSOPT-1]) | ||
1463 | cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); | ||
1464 | |||
1465 | cbq_addprio(q, &q->link); | ||
1466 | return 0; | ||
1467 | } | ||
1468 | |||
1469 | static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) | ||
1470 | { | ||
1471 | unsigned char *b = skb->tail; | ||
1472 | |||
1473 | RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); | ||
1474 | return skb->len; | ||
1475 | |||
1476 | rtattr_failure: | ||
1477 | skb_trim(skb, b - skb->data); | ||
1478 | return -1; | ||
1479 | } | ||
1480 | |||
1481 | static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) | ||
1482 | { | ||
1483 | unsigned char *b = skb->tail; | ||
1484 | struct tc_cbq_lssopt opt; | ||
1485 | |||
1486 | opt.flags = 0; | ||
1487 | if (cl->borrow == NULL) | ||
1488 | opt.flags |= TCF_CBQ_LSS_BOUNDED; | ||
1489 | if (cl->share == NULL) | ||
1490 | opt.flags |= TCF_CBQ_LSS_ISOLATED; | ||
1491 | opt.ewma_log = cl->ewma_log; | ||
1492 | opt.level = cl->level; | ||
1493 | opt.avpkt = cl->avpkt; | ||
1494 | opt.maxidle = cl->maxidle; | ||
1495 | opt.minidle = (u32)(-cl->minidle); | ||
1496 | opt.offtime = cl->offtime; | ||
1497 | opt.change = ~0; | ||
1498 | RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); | ||
1499 | return skb->len; | ||
1500 | |||
1501 | rtattr_failure: | ||
1502 | skb_trim(skb, b - skb->data); | ||
1503 | return -1; | ||
1504 | } | ||
1505 | |||
1506 | static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) | ||
1507 | { | ||
1508 | unsigned char *b = skb->tail; | ||
1509 | struct tc_cbq_wrropt opt; | ||
1510 | |||
1511 | opt.flags = 0; | ||
1512 | opt.allot = cl->allot; | ||
1513 | opt.priority = cl->priority+1; | ||
1514 | opt.cpriority = cl->cpriority+1; | ||
1515 | opt.weight = cl->weight; | ||
1516 | RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); | ||
1517 | return skb->len; | ||
1518 | |||
1519 | rtattr_failure: | ||
1520 | skb_trim(skb, b - skb->data); | ||
1521 | return -1; | ||
1522 | } | ||
1523 | |||
1524 | static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) | ||
1525 | { | ||
1526 | unsigned char *b = skb->tail; | ||
1527 | struct tc_cbq_ovl opt; | ||
1528 | |||
1529 | opt.strategy = cl->ovl_strategy; | ||
1530 | opt.priority2 = cl->priority2+1; | ||
1531 | opt.penalty = (cl->penalty*1000)/HZ; | ||
1532 | RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); | ||
1533 | return skb->len; | ||
1534 | |||
1535 | rtattr_failure: | ||
1536 | skb_trim(skb, b - skb->data); | ||
1537 | return -1; | ||
1538 | } | ||
1539 | |||
1540 | static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) | ||
1541 | { | ||
1542 | unsigned char *b = skb->tail; | ||
1543 | struct tc_cbq_fopt opt; | ||
1544 | |||
1545 | if (cl->split || cl->defmap) { | ||
1546 | opt.split = cl->split ? cl->split->classid : 0; | ||
1547 | opt.defmap = cl->defmap; | ||
1548 | opt.defchange = ~0; | ||
1549 | RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); | ||
1550 | } | ||
1551 | return skb->len; | ||
1552 | |||
1553 | rtattr_failure: | ||
1554 | skb_trim(skb, b - skb->data); | ||
1555 | return -1; | ||
1556 | } | ||
1557 | |||
1558 | #ifdef CONFIG_NET_CLS_POLICE | ||
1559 | static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) | ||
1560 | { | ||
1561 | unsigned char *b = skb->tail; | ||
1562 | struct tc_cbq_police opt; | ||
1563 | |||
1564 | if (cl->police) { | ||
1565 | opt.police = cl->police; | ||
1566 | RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); | ||
1567 | } | ||
1568 | return skb->len; | ||
1569 | |||
1570 | rtattr_failure: | ||
1571 | skb_trim(skb, b - skb->data); | ||
1572 | return -1; | ||
1573 | } | ||
1574 | #endif | ||
1575 | |||
1576 | static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) | ||
1577 | { | ||
1578 | if (cbq_dump_lss(skb, cl) < 0 || | ||
1579 | cbq_dump_rate(skb, cl) < 0 || | ||
1580 | cbq_dump_wrr(skb, cl) < 0 || | ||
1581 | cbq_dump_ovl(skb, cl) < 0 || | ||
1582 | #ifdef CONFIG_NET_CLS_POLICE | ||
1583 | cbq_dump_police(skb, cl) < 0 || | ||
1584 | #endif | ||
1585 | cbq_dump_fopt(skb, cl) < 0) | ||
1586 | return -1; | ||
1587 | return 0; | ||
1588 | } | ||
1589 | |||
1590 | static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) | ||
1591 | { | ||
1592 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
1593 | unsigned char *b = skb->tail; | ||
1594 | struct rtattr *rta; | ||
1595 | |||
1596 | rta = (struct rtattr*)b; | ||
1597 | RTA_PUT(skb, TCA_OPTIONS, 0, NULL); | ||
1598 | if (cbq_dump_attr(skb, &q->link) < 0) | ||
1599 | goto rtattr_failure; | ||
1600 | rta->rta_len = skb->tail - b; | ||
1601 | return skb->len; | ||
1602 | |||
1603 | rtattr_failure: | ||
1604 | skb_trim(skb, b - skb->data); | ||
1605 | return -1; | ||
1606 | } | ||
1607 | |||
1608 | static int | ||
1609 | cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) | ||
1610 | { | ||
1611 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
1612 | |||
1613 | q->link.xstats.avgidle = q->link.avgidle; | ||
1614 | return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats)); | ||
1615 | } | ||
1616 | |||
1617 | static int | ||
1618 | cbq_dump_class(struct Qdisc *sch, unsigned long arg, | ||
1619 | struct sk_buff *skb, struct tcmsg *tcm) | ||
1620 | { | ||
1621 | struct cbq_class *cl = (struct cbq_class*)arg; | ||
1622 | unsigned char *b = skb->tail; | ||
1623 | struct rtattr *rta; | ||
1624 | |||
1625 | if (cl->tparent) | ||
1626 | tcm->tcm_parent = cl->tparent->classid; | ||
1627 | else | ||
1628 | tcm->tcm_parent = TC_H_ROOT; | ||
1629 | tcm->tcm_handle = cl->classid; | ||
1630 | tcm->tcm_info = cl->q->handle; | ||
1631 | |||
1632 | rta = (struct rtattr*)b; | ||
1633 | RTA_PUT(skb, TCA_OPTIONS, 0, NULL); | ||
1634 | if (cbq_dump_attr(skb, cl) < 0) | ||
1635 | goto rtattr_failure; | ||
1636 | rta->rta_len = skb->tail - b; | ||
1637 | return skb->len; | ||
1638 | |||
1639 | rtattr_failure: | ||
1640 | skb_trim(skb, b - skb->data); | ||
1641 | return -1; | ||
1642 | } | ||
1643 | |||
1644 | static int | ||
1645 | cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, | ||
1646 | struct gnet_dump *d) | ||
1647 | { | ||
1648 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
1649 | struct cbq_class *cl = (struct cbq_class*)arg; | ||
1650 | |||
1651 | cl->qstats.qlen = cl->q->q.qlen; | ||
1652 | cl->xstats.avgidle = cl->avgidle; | ||
1653 | cl->xstats.undertime = 0; | ||
1654 | |||
1655 | if (!PSCHED_IS_PASTPERFECT(cl->undertime)) | ||
1656 | cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now); | ||
1657 | |||
1658 | if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || | ||
1659 | #ifdef CONFIG_NET_ESTIMATOR | ||
1660 | gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || | ||
1661 | #endif | ||
1662 | gnet_stats_copy_queue(d, &cl->qstats) < 0) | ||
1663 | return -1; | ||
1664 | |||
1665 | return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); | ||
1666 | } | ||
1667 | |||
1668 | static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, | ||
1669 | struct Qdisc **old) | ||
1670 | { | ||
1671 | struct cbq_class *cl = (struct cbq_class*)arg; | ||
1672 | |||
1673 | if (cl) { | ||
1674 | if (new == NULL) { | ||
1675 | if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)) == NULL) | ||
1676 | return -ENOBUFS; | ||
1677 | } else { | ||
1678 | #ifdef CONFIG_NET_CLS_POLICE | ||
1679 | if (cl->police == TC_POLICE_RECLASSIFY) | ||
1680 | new->reshape_fail = cbq_reshape_fail; | ||
1681 | #endif | ||
1682 | } | ||
1683 | sch_tree_lock(sch); | ||
1684 | *old = cl->q; | ||
1685 | cl->q = new; | ||
1686 | sch->q.qlen -= (*old)->q.qlen; | ||
1687 | qdisc_reset(*old); | ||
1688 | sch_tree_unlock(sch); | ||
1689 | |||
1690 | return 0; | ||
1691 | } | ||
1692 | return -ENOENT; | ||
1693 | } | ||
1694 | |||
1695 | static struct Qdisc * | ||
1696 | cbq_leaf(struct Qdisc *sch, unsigned long arg) | ||
1697 | { | ||
1698 | struct cbq_class *cl = (struct cbq_class*)arg; | ||
1699 | |||
1700 | return cl ? cl->q : NULL; | ||
1701 | } | ||
1702 | |||
1703 | static unsigned long cbq_get(struct Qdisc *sch, u32 classid) | ||
1704 | { | ||
1705 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
1706 | struct cbq_class *cl = cbq_class_lookup(q, classid); | ||
1707 | |||
1708 | if (cl) { | ||
1709 | cl->refcnt++; | ||
1710 | return (unsigned long)cl; | ||
1711 | } | ||
1712 | return 0; | ||
1713 | } | ||
1714 | |||
1715 | static void cbq_destroy_filters(struct cbq_class *cl) | ||
1716 | { | ||
1717 | struct tcf_proto *tp; | ||
1718 | |||
1719 | while ((tp = cl->filter_list) != NULL) { | ||
1720 | cl->filter_list = tp->next; | ||
1721 | tcf_destroy(tp); | ||
1722 | } | ||
1723 | } | ||
1724 | |||
1725 | static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) | ||
1726 | { | ||
1727 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
1728 | |||
1729 | BUG_TRAP(!cl->filters); | ||
1730 | |||
1731 | cbq_destroy_filters(cl); | ||
1732 | qdisc_destroy(cl->q); | ||
1733 | qdisc_put_rtab(cl->R_tab); | ||
1734 | #ifdef CONFIG_NET_ESTIMATOR | ||
1735 | gen_kill_estimator(&cl->bstats, &cl->rate_est); | ||
1736 | #endif | ||
1737 | if (cl != &q->link) | ||
1738 | kfree(cl); | ||
1739 | } | ||
1740 | |||
1741 | static void | ||
1742 | cbq_destroy(struct Qdisc* sch) | ||
1743 | { | ||
1744 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
1745 | struct cbq_class *cl; | ||
1746 | unsigned h; | ||
1747 | |||
1748 | #ifdef CONFIG_NET_CLS_POLICE | ||
1749 | q->rx_class = NULL; | ||
1750 | #endif | ||
1751 | /* | ||
1752 | * Filters must be destroyed first because we don't destroy the | ||
1753 | * classes from root to leaves, which means that filters can still | ||
1754 | * be bound to classes which have been destroyed already. --TGR '04 | ||
1755 | */ | ||
1756 | for (h = 0; h < 16; h++) | ||
1757 | for (cl = q->classes[h]; cl; cl = cl->next) | ||
1758 | cbq_destroy_filters(cl); | ||
1759 | |||
1760 | for (h = 0; h < 16; h++) { | ||
1761 | struct cbq_class *next; | ||
1762 | |||
1763 | for (cl = q->classes[h]; cl; cl = next) { | ||
1764 | next = cl->next; | ||
1765 | cbq_destroy_class(sch, cl); | ||
1766 | } | ||
1767 | } | ||
1768 | } | ||
1769 | |||
1770 | static void cbq_put(struct Qdisc *sch, unsigned long arg) | ||
1771 | { | ||
1772 | struct cbq_class *cl = (struct cbq_class*)arg; | ||
1773 | |||
1774 | if (--cl->refcnt == 0) { | ||
1775 | #ifdef CONFIG_NET_CLS_POLICE | ||
1776 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
1777 | |||
1778 | spin_lock_bh(&sch->dev->queue_lock); | ||
1779 | if (q->rx_class == cl) | ||
1780 | q->rx_class = NULL; | ||
1781 | spin_unlock_bh(&sch->dev->queue_lock); | ||
1782 | #endif | ||
1783 | |||
1784 | cbq_destroy_class(sch, cl); | ||
1785 | } | ||
1786 | } | ||
1787 | |||
1788 | static int | ||
1789 | cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca, | ||
1790 | unsigned long *arg) | ||
1791 | { | ||
1792 | int err; | ||
1793 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
1794 | struct cbq_class *cl = (struct cbq_class*)*arg; | ||
1795 | struct rtattr *opt = tca[TCA_OPTIONS-1]; | ||
1796 | struct rtattr *tb[TCA_CBQ_MAX]; | ||
1797 | struct cbq_class *parent; | ||
1798 | struct qdisc_rate_table *rtab = NULL; | ||
1799 | |||
1800 | if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt)) | ||
1801 | return -EINVAL; | ||
1802 | |||
1803 | if (tb[TCA_CBQ_OVL_STRATEGY-1] && | ||
1804 | RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl)) | ||
1805 | return -EINVAL; | ||
1806 | |||
1807 | if (tb[TCA_CBQ_FOPT-1] && | ||
1808 | RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt)) | ||
1809 | return -EINVAL; | ||
1810 | |||
1811 | if (tb[TCA_CBQ_RATE-1] && | ||
1812 | RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) | ||
1813 | return -EINVAL; | ||
1814 | |||
1815 | if (tb[TCA_CBQ_LSSOPT-1] && | ||
1816 | RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) | ||
1817 | return -EINVAL; | ||
1818 | |||
1819 | if (tb[TCA_CBQ_WRROPT-1] && | ||
1820 | RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt)) | ||
1821 | return -EINVAL; | ||
1822 | |||
1823 | #ifdef CONFIG_NET_CLS_POLICE | ||
1824 | if (tb[TCA_CBQ_POLICE-1] && | ||
1825 | RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police)) | ||
1826 | return -EINVAL; | ||
1827 | #endif | ||
1828 | |||
1829 | if (cl) { | ||
1830 | /* Check parent */ | ||
1831 | if (parentid) { | ||
1832 | if (cl->tparent && cl->tparent->classid != parentid) | ||
1833 | return -EINVAL; | ||
1834 | if (!cl->tparent && parentid != TC_H_ROOT) | ||
1835 | return -EINVAL; | ||
1836 | } | ||
1837 | |||
1838 | if (tb[TCA_CBQ_RATE-1]) { | ||
1839 | rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); | ||
1840 | if (rtab == NULL) | ||
1841 | return -EINVAL; | ||
1842 | } | ||
1843 | |||
1844 | /* Change class parameters */ | ||
1845 | sch_tree_lock(sch); | ||
1846 | |||
1847 | if (cl->next_alive != NULL) | ||
1848 | cbq_deactivate_class(cl); | ||
1849 | |||
1850 | if (rtab) { | ||
1851 | rtab = xchg(&cl->R_tab, rtab); | ||
1852 | qdisc_put_rtab(rtab); | ||
1853 | } | ||
1854 | |||
1855 | if (tb[TCA_CBQ_LSSOPT-1]) | ||
1856 | cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); | ||
1857 | |||
1858 | if (tb[TCA_CBQ_WRROPT-1]) { | ||
1859 | cbq_rmprio(q, cl); | ||
1860 | cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); | ||
1861 | } | ||
1862 | |||
1863 | if (tb[TCA_CBQ_OVL_STRATEGY-1]) | ||
1864 | cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); | ||
1865 | |||
1866 | #ifdef CONFIG_NET_CLS_POLICE | ||
1867 | if (tb[TCA_CBQ_POLICE-1]) | ||
1868 | cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); | ||
1869 | #endif | ||
1870 | |||
1871 | if (tb[TCA_CBQ_FOPT-1]) | ||
1872 | cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); | ||
1873 | |||
1874 | if (cl->q->q.qlen) | ||
1875 | cbq_activate_class(cl); | ||
1876 | |||
1877 | sch_tree_unlock(sch); | ||
1878 | |||
1879 | #ifdef CONFIG_NET_ESTIMATOR | ||
1880 | if (tca[TCA_RATE-1]) | ||
1881 | gen_replace_estimator(&cl->bstats, &cl->rate_est, | ||
1882 | cl->stats_lock, tca[TCA_RATE-1]); | ||
1883 | #endif | ||
1884 | return 0; | ||
1885 | } | ||
1886 | |||
1887 | if (parentid == TC_H_ROOT) | ||
1888 | return -EINVAL; | ||
1889 | |||
1890 | if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || | ||
1891 | tb[TCA_CBQ_LSSOPT-1] == NULL) | ||
1892 | return -EINVAL; | ||
1893 | |||
1894 | rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); | ||
1895 | if (rtab == NULL) | ||
1896 | return -EINVAL; | ||
1897 | |||
1898 | if (classid) { | ||
1899 | err = -EINVAL; | ||
1900 | if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid)) | ||
1901 | goto failure; | ||
1902 | } else { | ||
1903 | int i; | ||
1904 | classid = TC_H_MAKE(sch->handle,0x8000); | ||
1905 | |||
1906 | for (i=0; i<0x8000; i++) { | ||
1907 | if (++q->hgenerator >= 0x8000) | ||
1908 | q->hgenerator = 1; | ||
1909 | if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) | ||
1910 | break; | ||
1911 | } | ||
1912 | err = -ENOSR; | ||
1913 | if (i >= 0x8000) | ||
1914 | goto failure; | ||
1915 | classid = classid|q->hgenerator; | ||
1916 | } | ||
1917 | |||
1918 | parent = &q->link; | ||
1919 | if (parentid) { | ||
1920 | parent = cbq_class_lookup(q, parentid); | ||
1921 | err = -EINVAL; | ||
1922 | if (parent == NULL) | ||
1923 | goto failure; | ||
1924 | } | ||
1925 | |||
1926 | err = -ENOBUFS; | ||
1927 | cl = kmalloc(sizeof(*cl), GFP_KERNEL); | ||
1928 | if (cl == NULL) | ||
1929 | goto failure; | ||
1930 | memset(cl, 0, sizeof(*cl)); | ||
1931 | cl->R_tab = rtab; | ||
1932 | rtab = NULL; | ||
1933 | cl->refcnt = 1; | ||
1934 | if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) | ||
1935 | cl->q = &noop_qdisc; | ||
1936 | cl->classid = classid; | ||
1937 | cl->tparent = parent; | ||
1938 | cl->qdisc = sch; | ||
1939 | cl->allot = parent->allot; | ||
1940 | cl->quantum = cl->allot; | ||
1941 | cl->weight = cl->R_tab->rate.rate; | ||
1942 | cl->stats_lock = &sch->dev->queue_lock; | ||
1943 | |||
1944 | sch_tree_lock(sch); | ||
1945 | cbq_link_class(cl); | ||
1946 | cl->borrow = cl->tparent; | ||
1947 | if (cl->tparent != &q->link) | ||
1948 | cl->share = cl->tparent; | ||
1949 | cbq_adjust_levels(parent); | ||
1950 | cl->minidle = -0x7FFFFFFF; | ||
1951 | cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); | ||
1952 | cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); | ||
1953 | if (cl->ewma_log==0) | ||
1954 | cl->ewma_log = q->link.ewma_log; | ||
1955 | if (cl->maxidle==0) | ||
1956 | cl->maxidle = q->link.maxidle; | ||
1957 | if (cl->avpkt==0) | ||
1958 | cl->avpkt = q->link.avpkt; | ||
1959 | cl->overlimit = cbq_ovl_classic; | ||
1960 | if (tb[TCA_CBQ_OVL_STRATEGY-1]) | ||
1961 | cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); | ||
1962 | #ifdef CONFIG_NET_CLS_POLICE | ||
1963 | if (tb[TCA_CBQ_POLICE-1]) | ||
1964 | cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); | ||
1965 | #endif | ||
1966 | if (tb[TCA_CBQ_FOPT-1]) | ||
1967 | cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); | ||
1968 | sch_tree_unlock(sch); | ||
1969 | |||
1970 | #ifdef CONFIG_NET_ESTIMATOR | ||
1971 | if (tca[TCA_RATE-1]) | ||
1972 | gen_new_estimator(&cl->bstats, &cl->rate_est, | ||
1973 | cl->stats_lock, tca[TCA_RATE-1]); | ||
1974 | #endif | ||
1975 | |||
1976 | *arg = (unsigned long)cl; | ||
1977 | return 0; | ||
1978 | |||
1979 | failure: | ||
1980 | qdisc_put_rtab(rtab); | ||
1981 | return err; | ||
1982 | } | ||
1983 | |||
1984 | static int cbq_delete(struct Qdisc *sch, unsigned long arg) | ||
1985 | { | ||
1986 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
1987 | struct cbq_class *cl = (struct cbq_class*)arg; | ||
1988 | |||
1989 | if (cl->filters || cl->children || cl == &q->link) | ||
1990 | return -EBUSY; | ||
1991 | |||
1992 | sch_tree_lock(sch); | ||
1993 | |||
1994 | if (cl->next_alive) | ||
1995 | cbq_deactivate_class(cl); | ||
1996 | |||
1997 | if (q->tx_borrowed == cl) | ||
1998 | q->tx_borrowed = q->tx_class; | ||
1999 | if (q->tx_class == cl) { | ||
2000 | q->tx_class = NULL; | ||
2001 | q->tx_borrowed = NULL; | ||
2002 | } | ||
2003 | #ifdef CONFIG_NET_CLS_POLICE | ||
2004 | if (q->rx_class == cl) | ||
2005 | q->rx_class = NULL; | ||
2006 | #endif | ||
2007 | |||
2008 | cbq_unlink_class(cl); | ||
2009 | cbq_adjust_levels(cl->tparent); | ||
2010 | cl->defmap = 0; | ||
2011 | cbq_sync_defmap(cl); | ||
2012 | |||
2013 | cbq_rmprio(q, cl); | ||
2014 | sch_tree_unlock(sch); | ||
2015 | |||
2016 | if (--cl->refcnt == 0) | ||
2017 | cbq_destroy_class(sch, cl); | ||
2018 | |||
2019 | return 0; | ||
2020 | } | ||
2021 | |||
2022 | static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg) | ||
2023 | { | ||
2024 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
2025 | struct cbq_class *cl = (struct cbq_class *)arg; | ||
2026 | |||
2027 | if (cl == NULL) | ||
2028 | cl = &q->link; | ||
2029 | |||
2030 | return &cl->filter_list; | ||
2031 | } | ||
2032 | |||
2033 | static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, | ||
2034 | u32 classid) | ||
2035 | { | ||
2036 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
2037 | struct cbq_class *p = (struct cbq_class*)parent; | ||
2038 | struct cbq_class *cl = cbq_class_lookup(q, classid); | ||
2039 | |||
2040 | if (cl) { | ||
2041 | if (p && p->level <= cl->level) | ||
2042 | return 0; | ||
2043 | cl->filters++; | ||
2044 | return (unsigned long)cl; | ||
2045 | } | ||
2046 | return 0; | ||
2047 | } | ||
2048 | |||
2049 | static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) | ||
2050 | { | ||
2051 | struct cbq_class *cl = (struct cbq_class*)arg; | ||
2052 | |||
2053 | cl->filters--; | ||
2054 | } | ||
2055 | |||
2056 | static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) | ||
2057 | { | ||
2058 | struct cbq_sched_data *q = qdisc_priv(sch); | ||
2059 | unsigned h; | ||
2060 | |||
2061 | if (arg->stop) | ||
2062 | return; | ||
2063 | |||
2064 | for (h = 0; h < 16; h++) { | ||
2065 | struct cbq_class *cl; | ||
2066 | |||
2067 | for (cl = q->classes[h]; cl; cl = cl->next) { | ||
2068 | if (arg->count < arg->skip) { | ||
2069 | arg->count++; | ||
2070 | continue; | ||
2071 | } | ||
2072 | if (arg->fn(sch, (unsigned long)cl, arg) < 0) { | ||
2073 | arg->stop = 1; | ||
2074 | return; | ||
2075 | } | ||
2076 | arg->count++; | ||
2077 | } | ||
2078 | } | ||
2079 | } | ||
2080 | |||
2081 | static struct Qdisc_class_ops cbq_class_ops = { | ||
2082 | .graft = cbq_graft, | ||
2083 | .leaf = cbq_leaf, | ||
2084 | .get = cbq_get, | ||
2085 | .put = cbq_put, | ||
2086 | .change = cbq_change_class, | ||
2087 | .delete = cbq_delete, | ||
2088 | .walk = cbq_walk, | ||
2089 | .tcf_chain = cbq_find_tcf, | ||
2090 | .bind_tcf = cbq_bind_filter, | ||
2091 | .unbind_tcf = cbq_unbind_filter, | ||
2092 | .dump = cbq_dump_class, | ||
2093 | .dump_stats = cbq_dump_class_stats, | ||
2094 | }; | ||
2095 | |||
2096 | static struct Qdisc_ops cbq_qdisc_ops = { | ||
2097 | .next = NULL, | ||
2098 | .cl_ops = &cbq_class_ops, | ||
2099 | .id = "cbq", | ||
2100 | .priv_size = sizeof(struct cbq_sched_data), | ||
2101 | .enqueue = cbq_enqueue, | ||
2102 | .dequeue = cbq_dequeue, | ||
2103 | .requeue = cbq_requeue, | ||
2104 | .drop = cbq_drop, | ||
2105 | .init = cbq_init, | ||
2106 | .reset = cbq_reset, | ||
2107 | .destroy = cbq_destroy, | ||
2108 | .change = NULL, | ||
2109 | .dump = cbq_dump, | ||
2110 | .dump_stats = cbq_dump_stats, | ||
2111 | .owner = THIS_MODULE, | ||
2112 | }; | ||
2113 | |||
2114 | static int __init cbq_module_init(void) | ||
2115 | { | ||
2116 | return register_qdisc(&cbq_qdisc_ops); | ||
2117 | } | ||
2118 | static void __exit cbq_module_exit(void) | ||
2119 | { | ||
2120 | unregister_qdisc(&cbq_qdisc_ops); | ||
2121 | } | ||
2122 | module_init(cbq_module_init) | ||
2123 | module_exit(cbq_module_exit) | ||
2124 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c new file mode 100644 index 000000000000..8a3db9d95bab --- /dev/null +++ b/net/sched/sch_dsmark.c | |||
@@ -0,0 +1,479 @@ | |||
1 | /* net/sched/sch_dsmark.c - Differentiated Services field marker */ | ||
2 | |||
3 | /* Written 1998-2000 by Werner Almesberger, EPFL ICA */ | ||
4 | |||
5 | |||
6 | #include <linux/config.h> | ||
7 | #include <linux/module.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/types.h> | ||
10 | #include <linux/string.h> | ||
11 | #include <linux/errno.h> | ||
12 | #include <linux/skbuff.h> | ||
13 | #include <linux/netdevice.h> /* for pkt_sched */ | ||
14 | #include <linux/rtnetlink.h> | ||
15 | #include <net/pkt_sched.h> | ||
16 | #include <net/dsfield.h> | ||
17 | #include <net/inet_ecn.h> | ||
18 | #include <asm/byteorder.h> | ||
19 | |||
20 | |||
21 | #if 1 /* control */ | ||
22 | #define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) | ||
23 | #else | ||
24 | #define DPRINTK(format,args...) | ||
25 | #endif | ||
26 | |||
27 | #if 0 /* data */ | ||
28 | #define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) | ||
29 | #else | ||
30 | #define D2PRINTK(format,args...) | ||
31 | #endif | ||
32 | |||
33 | |||
34 | #define PRIV(sch) qdisc_priv(sch) | ||
35 | |||
36 | |||
37 | /* | ||
38 | * classid        class      marking | ||
39 | * -------        -----      ------- | ||
40 | * n/a            0          n/a | ||
41 | * x:0            1          use entry [0] | ||
42 | * ...            ...        ... | ||
43 | * x:y, y>0       y+1        use entry [y] | ||
44 | * ...            ...        ... | ||
45 | * x:indices-1    indices    use entry [indices-1] | ||
46 | * ...            ...        ... | ||
47 | * x:y            y+1        use entry [y & (indices-1)] | ||
48 | * ...            ...        ... | ||
49 | * 0xffff         0x10000    use entry [indices-1] | ||
50 | */ | ||
51 | |||
52 | |||
53 | #define NO_DEFAULT_INDEX (1 << 16) | ||
54 | |||
55 | struct dsmark_qdisc_data { | ||
56 | struct Qdisc *q; | ||
57 | struct tcf_proto *filter_list; | ||
58 | __u8 *mask; /* "owns" the array */ | ||
59 | __u8 *value; | ||
60 | __u16 indices; | ||
61 | __u32 default_index; /* index range is 0...0xffff */ | ||
62 | int set_tc_index; | ||
63 | }; | ||
64 | |||
65 | |||
66 | /* ------------------------- Class/flow operations ------------------------- */ | ||
67 | |||
68 | |||
69 | static int dsmark_graft(struct Qdisc *sch,unsigned long arg, | ||
70 | struct Qdisc *new,struct Qdisc **old) | ||
71 | { | ||
72 | struct dsmark_qdisc_data *p = PRIV(sch); | ||
73 | |||
74 | DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",sch,p,new, | ||
75 | old); | ||
76 | if (!new) | ||
77 | new = &noop_qdisc; | ||
78 | sch_tree_lock(sch); | ||
79 | *old = xchg(&p->q,new); | ||
80 | if (*old) | ||
81 | qdisc_reset(*old); | ||
82 | sch->q.qlen = 0; | ||
83 | sch_tree_unlock(sch); /* @@@ move up ? */ | ||
84 | return 0; | ||
85 | } | ||
86 | |||
87 | |||
88 | static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg) | ||
89 | { | ||
90 | struct dsmark_qdisc_data *p = PRIV(sch); | ||
91 | |||
92 | return p->q; | ||
93 | } | ||
94 | |||
95 | |||
96 | static unsigned long dsmark_get(struct Qdisc *sch,u32 classid) | ||
97 | { | ||
98 | struct dsmark_qdisc_data *p __attribute__((unused)) = PRIV(sch); | ||
99 | |||
100 | DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid); | ||
101 | return TC_H_MIN(classid)+1; | ||
102 | } | ||
103 | |||
104 | |||
105 | static unsigned long dsmark_bind_filter(struct Qdisc *sch, | ||
106 | unsigned long parent, u32 classid) | ||
107 | { | ||
108 | return dsmark_get(sch,classid); | ||
109 | } | ||
110 | |||
111 | |||
112 | static void dsmark_put(struct Qdisc *sch, unsigned long cl) | ||
113 | { | ||
114 | } | ||
115 | |||
116 | |||
117 | static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, | ||
118 | struct rtattr **tca, unsigned long *arg) | ||
119 | { | ||
120 | struct dsmark_qdisc_data *p = PRIV(sch); | ||
121 | struct rtattr *opt = tca[TCA_OPTIONS-1]; | ||
122 | struct rtattr *tb[TCA_DSMARK_MAX]; | ||
123 | |||
124 | DPRINTK("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x)," | ||
125 | "arg 0x%lx\n",sch,p,classid,parent,*arg); | ||
126 | if (*arg > p->indices) | ||
127 | return -ENOENT; | ||
128 | if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt)) | ||
129 | return -EINVAL; | ||
130 | if (tb[TCA_DSMARK_MASK-1]) { | ||
131 | if (!RTA_PAYLOAD(tb[TCA_DSMARK_MASK-1])) | ||
132 | return -EINVAL; | ||
133 | p->mask[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_MASK-1]); | ||
134 | } | ||
135 | if (tb[TCA_DSMARK_VALUE-1]) { | ||
136 | if (!RTA_PAYLOAD(tb[TCA_DSMARK_VALUE-1])) | ||
137 | return -EINVAL; | ||
138 | p->value[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_VALUE-1]); | ||
139 | } | ||
140 | return 0; | ||
141 | } | ||
142 | |||
143 | |||
144 | static int dsmark_delete(struct Qdisc *sch,unsigned long arg) | ||
145 | { | ||
146 | struct dsmark_qdisc_data *p = PRIV(sch); | ||
147 | |||
148 | if (!arg || arg > p->indices) | ||
149 | return -EINVAL; | ||
150 | p->mask[arg-1] = 0xff; | ||
151 | p->value[arg-1] = 0; | ||
152 | return 0; | ||
153 | } | ||
154 | |||
155 | |||
156 | static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker) | ||
157 | { | ||
158 | struct dsmark_qdisc_data *p = PRIV(sch); | ||
159 | int i; | ||
160 | |||
161 | DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker); | ||
162 | if (walker->stop) | ||
163 | return; | ||
164 | for (i = 0; i < p->indices; i++) { | ||
165 | if (p->mask[i] == 0xff && !p->value[i]) | ||
166 | continue; | ||
167 | if (walker->count >= walker->skip) { | ||
168 | if (walker->fn(sch, i+1, walker) < 0) { | ||
169 | walker->stop = 1; | ||
170 | break; | ||
171 | } | ||
172 | } | ||
173 | walker->count++; | ||
174 | } | ||
175 | } | ||
176 | |||
177 | |||
178 | static struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,unsigned long cl) | ||
179 | { | ||
180 | struct dsmark_qdisc_data *p = PRIV(sch); | ||
181 | |||
182 | return &p->filter_list; | ||
183 | } | ||
184 | |||
185 | |||
186 | /* --------------------------- Qdisc operations ---------------------------- */ | ||
187 | |||
188 | |||
189 | static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch) | ||
190 | { | ||
191 | struct dsmark_qdisc_data *p = PRIV(sch); | ||
192 | struct tcf_result res; | ||
193 | int result; | ||
194 | int ret = NET_XMIT_POLICED; | ||
195 | |||
196 | D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); | ||
197 | if (p->set_tc_index) { | ||
198 | /* FIXME: Safe with non-linear skbs? --RR */ | ||
199 | switch (skb->protocol) { | ||
200 | case __constant_htons(ETH_P_IP): | ||
201 | skb->tc_index = ipv4_get_dsfield(skb->nh.iph) | ||
202 | & ~INET_ECN_MASK; | ||
203 | break; | ||
204 | case __constant_htons(ETH_P_IPV6): | ||
205 | skb->tc_index = ipv6_get_dsfield(skb->nh.ipv6h) | ||
206 | & ~INET_ECN_MASK; | ||
207 | break; | ||
208 | default: | ||
209 | skb->tc_index = 0; | ||
210 | break; | ||
211 | }; | ||
212 | } | ||
213 | result = TC_POLICE_OK; /* be nice to gcc */ | ||
214 | if (TC_H_MAJ(skb->priority) == sch->handle) { | ||
215 | skb->tc_index = TC_H_MIN(skb->priority); | ||
216 | } else { | ||
217 | result = tc_classify(skb,p->filter_list,&res); | ||
218 | D2PRINTK("result %d class 0x%04x\n",result,res.classid); | ||
219 | switch (result) { | ||
220 | #ifdef CONFIG_NET_CLS_POLICE | ||
221 | case TC_POLICE_SHOT: | ||
222 | kfree_skb(skb); | ||
223 | break; | ||
224 | #if 0 | ||
225 | case TC_POLICE_RECLASSIFY: | ||
226 | /* FIXME: what to do here ??? */ | ||
227 | #endif | ||
228 | #endif | ||
229 | case TC_POLICE_OK: | ||
230 | skb->tc_index = TC_H_MIN(res.classid); | ||
231 | break; | ||
232 | case TC_POLICE_UNSPEC: | ||
233 | /* fall through */ | ||
234 | default: | ||
235 | if (p->default_index != NO_DEFAULT_INDEX) | ||
236 | skb->tc_index = p->default_index; | ||
237 | break; | ||
238 | }; | ||
239 | } | ||
240 | if ( | ||
241 | #ifdef CONFIG_NET_CLS_POLICE | ||
242 | result == TC_POLICE_SHOT || | ||
243 | #endif | ||
244 | |||
245 | ((ret = p->q->enqueue(skb,p->q)) != 0)) { | ||
246 | sch->qstats.drops++; | ||
247 | return ret; | ||
248 | } | ||
249 | sch->bstats.bytes += skb->len; | ||
250 | sch->bstats.packets++; | ||
251 | sch->q.qlen++; | ||
252 | return ret; | ||
253 | } | ||
254 | |||
255 | |||
256 | static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) | ||
257 | { | ||
258 | struct dsmark_qdisc_data *p = PRIV(sch); | ||
259 | struct sk_buff *skb; | ||
260 | int index; | ||
261 | |||
262 | D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n",sch,p); | ||
263 | skb = p->q->ops->dequeue(p->q); | ||
264 | if (!skb) | ||
265 | return NULL; | ||
266 | sch->q.qlen--; | ||
267 | index = skb->tc_index & (p->indices-1); | ||
268 | D2PRINTK("index %d->%d\n",skb->tc_index,index); | ||
269 | switch (skb->protocol) { | ||
270 | case __constant_htons(ETH_P_IP): | ||
271 | ipv4_change_dsfield(skb->nh.iph, | ||
272 | p->mask[index],p->value[index]); | ||
273 | break; | ||
274 | case __constant_htons(ETH_P_IPV6): | ||
275 | ipv6_change_dsfield(skb->nh.ipv6h, | ||
276 | p->mask[index],p->value[index]); | ||
277 | break; | ||
278 | default: | ||
279 | /* | ||
280 | * Only complain if a change was actually attempted. | ||
281 | * This way, we can send non-IP traffic through dsmark | ||
282 | * and don't need yet another qdisc as a bypass. | ||
283 | */ | ||
284 | if (p->mask[index] != 0xff || p->value[index]) | ||
285 | printk(KERN_WARNING "dsmark_dequeue: " | ||
286 | "unsupported protocol %d\n", | ||
287 | htons(skb->protocol)); | ||
288 | break; | ||
289 | }; | ||
290 | return skb; | ||
291 | } | ||
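On dequeue, dsmark selects table entry index = tc_index & (indices - 1) and rewrites the DS byte as (old & mask) | value, which is what ipv4_change_dsfield() and ipv6_change_dsfield() perform on the IP header. A user-space sketch with an invented mask/value table (the 0x03 mask preserves the ECN bits while replacing the DSCP):

    #include <stdio.h>

    int main(void)
    {
        unsigned char mask[4]  = { 0xff, 0x03, 0xff, 0xff };  /* illustrative table */
        unsigned char value[4] = { 0x00, 0xb8, 0x00, 0x28 };
        unsigned int indices = 4;           /* must be a power of two */

        unsigned int tc_index = 5;          /* as set by the classifier */
        unsigned int idx = tc_index & (indices - 1);   /* -> entry 1 */

        unsigned char ds_old = 0x2e;        /* current DS/ECN byte, example value */
        unsigned char ds_new = (ds_old & mask[idx]) | value[idx];

        printf("entry %u: DS field 0x%02x -> 0x%02x\n", idx, ds_old, ds_new);
        return 0;
    }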
292 | |||
293 | |||
294 | static int dsmark_requeue(struct sk_buff *skb,struct Qdisc *sch) | ||
295 | { | ||
296 | int ret; | ||
297 | struct dsmark_qdisc_data *p = PRIV(sch); | ||
298 | |||
299 | D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); | ||
300 | if ((ret = p->q->ops->requeue(skb, p->q)) == 0) { | ||
301 | sch->q.qlen++; | ||
302 | sch->qstats.requeues++; | ||
303 | return 0; | ||
304 | } | ||
305 | sch->qstats.drops++; | ||
306 | return ret; | ||
307 | } | ||
308 | |||
309 | |||
310 | static unsigned int dsmark_drop(struct Qdisc *sch) | ||
311 | { | ||
312 | struct dsmark_qdisc_data *p = PRIV(sch); | ||
313 | unsigned int len; | ||
314 | |||
315 | DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p); | ||
316 | if (!p->q->ops->drop) | ||
317 | return 0; | ||
318 | if (!(len = p->q->ops->drop(p->q))) | ||
319 | return 0; | ||
320 | sch->q.qlen--; | ||
321 | return len; | ||
322 | } | ||
323 | |||
324 | |||
325 | static int dsmark_init(struct Qdisc *sch,struct rtattr *opt) | ||
326 | { | ||
327 | struct dsmark_qdisc_data *p = PRIV(sch); | ||
328 | struct rtattr *tb[TCA_DSMARK_MAX]; | ||
329 | __u16 tmp; | ||
330 | |||
331 | DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); | ||
332 | if (!opt || | ||
333 | rtattr_parse(tb,TCA_DSMARK_MAX,RTA_DATA(opt),RTA_PAYLOAD(opt)) < 0 || | ||
334 | !tb[TCA_DSMARK_INDICES-1] || | ||
335 | RTA_PAYLOAD(tb[TCA_DSMARK_INDICES-1]) < sizeof(__u16)) | ||
336 | return -EINVAL; | ||
337 | p->indices = *(__u16 *) RTA_DATA(tb[TCA_DSMARK_INDICES-1]); | ||
338 | if (!p->indices) | ||
339 | return -EINVAL; | ||
340 | for (tmp = p->indices; tmp != 1; tmp >>= 1) { | ||
341 | if (tmp & 1) | ||
342 | return -EINVAL; | ||
343 | } | ||
344 | p->default_index = NO_DEFAULT_INDEX; | ||
345 | if (tb[TCA_DSMARK_DEFAULT_INDEX-1]) { | ||
346 | if (RTA_PAYLOAD(tb[TCA_DSMARK_DEFAULT_INDEX-1]) < sizeof(__u16)) | ||
347 | return -EINVAL; | ||
348 | p->default_index = | ||
349 | *(__u16 *) RTA_DATA(tb[TCA_DSMARK_DEFAULT_INDEX-1]); | ||
350 | } | ||
351 | p->set_tc_index = !!tb[TCA_DSMARK_SET_TC_INDEX-1]; | ||
352 | p->mask = kmalloc(p->indices*2,GFP_KERNEL); | ||
353 | if (!p->mask) | ||
354 | return -ENOMEM; | ||
355 | p->value = p->mask+p->indices; | ||
356 | memset(p->mask,0xff,p->indices); | ||
357 | memset(p->value,0,p->indices); | ||
358 | if (!(p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) | ||
359 | p->q = &noop_qdisc; | ||
360 | DPRINTK("dsmark_init: qdisc %p\n",&p->q); | ||
361 | return 0; | ||
362 | } | ||
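dsmark_init() rejects any indices value that is not a power of two — the shift loop fails as soon as a low-order bit is set before reaching 1 — because entries are later picked with tc_index & (indices - 1). The same test can be written as a single expression; a sketch:

    #include <stdio.h>

    /* equivalent to the shift loop in dsmark_init() */
    static int is_power_of_two(unsigned int x)
    {
        return x != 0 && (x & (x - 1)) == 0;
    }

    int main(void)
    {
        unsigned int tests[] = { 1, 8, 64, 12, 0 };

        for (int i = 0; i < 5; i++)
            printf("%u -> %s\n", tests[i],
                   is_power_of_two(tests[i]) ? "ok" : "rejected");
        return 0;
    }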
363 | |||
364 | |||
365 | static void dsmark_reset(struct Qdisc *sch) | ||
366 | { | ||
367 | struct dsmark_qdisc_data *p = PRIV(sch); | ||
368 | |||
369 | DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p); | ||
370 | qdisc_reset(p->q); | ||
371 | sch->q.qlen = 0; | ||
372 | } | ||
373 | |||
374 | |||
375 | static void dsmark_destroy(struct Qdisc *sch) | ||
376 | { | ||
377 | struct dsmark_qdisc_data *p = PRIV(sch); | ||
378 | struct tcf_proto *tp; | ||
379 | |||
380 | DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n",sch,p); | ||
381 | while (p->filter_list) { | ||
382 | tp = p->filter_list; | ||
383 | p->filter_list = tp->next; | ||
384 | tcf_destroy(tp); | ||
385 | } | ||
386 | qdisc_destroy(p->q); | ||
387 | kfree(p->mask); | ||
388 | } | ||
389 | |||
390 | |||
391 | static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, | ||
392 | struct sk_buff *skb, struct tcmsg *tcm) | ||
393 | { | ||
394 | struct dsmark_qdisc_data *p = PRIV(sch); | ||
395 | unsigned char *b = skb->tail; | ||
396 | struct rtattr *rta; | ||
397 | |||
398 | DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n",sch,p,cl); | ||
399 | if (!cl || cl > p->indices) | ||
400 | return -EINVAL; | ||
401 | tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle),cl-1); | ||
402 | rta = (struct rtattr *) b; | ||
403 | RTA_PUT(skb,TCA_OPTIONS,0,NULL); | ||
404 | RTA_PUT(skb,TCA_DSMARK_MASK,1,&p->mask[cl-1]); | ||
405 | RTA_PUT(skb,TCA_DSMARK_VALUE,1,&p->value[cl-1]); | ||
406 | rta->rta_len = skb->tail-b; | ||
407 | return skb->len; | ||
408 | |||
409 | rtattr_failure: | ||
410 | skb_trim(skb,b-skb->data); | ||
411 | return -1; | ||
412 | } | ||
413 | |||
414 | static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) | ||
415 | { | ||
416 | struct dsmark_qdisc_data *p = PRIV(sch); | ||
417 | unsigned char *b = skb->tail; | ||
418 | struct rtattr *rta; | ||
419 | |||
420 | rta = (struct rtattr *) b; | ||
421 | RTA_PUT(skb,TCA_OPTIONS,0,NULL); | ||
422 | RTA_PUT(skb,TCA_DSMARK_INDICES,sizeof(__u16),&p->indices); | ||
423 | if (p->default_index != NO_DEFAULT_INDEX) { | ||
424 | __u16 tmp = p->default_index; | ||
425 | |||
426 | RTA_PUT(skb,TCA_DSMARK_DEFAULT_INDEX, sizeof(__u16), &tmp); | ||
427 | } | ||
428 | if (p->set_tc_index) | ||
429 | RTA_PUT(skb, TCA_DSMARK_SET_TC_INDEX, 0, NULL); | ||
430 | rta->rta_len = skb->tail-b; | ||
431 | return skb->len; | ||
432 | |||
433 | rtattr_failure: | ||
434 | skb_trim(skb,b-skb->data); | ||
435 | return -1; | ||
436 | } | ||
437 | |||
438 | static struct Qdisc_class_ops dsmark_class_ops = { | ||
439 | .graft = dsmark_graft, | ||
440 | .leaf = dsmark_leaf, | ||
441 | .get = dsmark_get, | ||
442 | .put = dsmark_put, | ||
443 | .change = dsmark_change, | ||
444 | .delete = dsmark_delete, | ||
445 | .walk = dsmark_walk, | ||
446 | .tcf_chain = dsmark_find_tcf, | ||
447 | .bind_tcf = dsmark_bind_filter, | ||
448 | .unbind_tcf = dsmark_put, | ||
449 | .dump = dsmark_dump_class, | ||
450 | }; | ||
451 | |||
452 | static struct Qdisc_ops dsmark_qdisc_ops = { | ||
453 | .next = NULL, | ||
454 | .cl_ops = &dsmark_class_ops, | ||
455 | .id = "dsmark", | ||
456 | .priv_size = sizeof(struct dsmark_qdisc_data), | ||
457 | .enqueue = dsmark_enqueue, | ||
458 | .dequeue = dsmark_dequeue, | ||
459 | .requeue = dsmark_requeue, | ||
460 | .drop = dsmark_drop, | ||
461 | .init = dsmark_init, | ||
462 | .reset = dsmark_reset, | ||
463 | .destroy = dsmark_destroy, | ||
464 | .change = NULL, | ||
465 | .dump = dsmark_dump, | ||
466 | .owner = THIS_MODULE, | ||
467 | }; | ||
468 | |||
469 | static int __init dsmark_module_init(void) | ||
470 | { | ||
471 | return register_qdisc(&dsmark_qdisc_ops); | ||
472 | } | ||
473 | static void __exit dsmark_module_exit(void) | ||
474 | { | ||
475 | unregister_qdisc(&dsmark_qdisc_ops); | ||
476 | } | ||
477 | module_init(dsmark_module_init) | ||
478 | module_exit(dsmark_module_exit) | ||
479 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c new file mode 100644 index 000000000000..4888305c96da --- /dev/null +++ b/net/sched/sch_fifo.c | |||
@@ -0,0 +1,212 @@ | |||
1 | /* | ||
2 | * net/sched/sch_fifo.c The simplest FIFO queue. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
10 | */ | ||
11 | |||
12 | #include <linux/config.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <asm/uaccess.h> | ||
15 | #include <asm/system.h> | ||
16 | #include <linux/bitops.h> | ||
17 | #include <linux/types.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/sched.h> | ||
20 | #include <linux/string.h> | ||
21 | #include <linux/mm.h> | ||
22 | #include <linux/socket.h> | ||
23 | #include <linux/sockios.h> | ||
24 | #include <linux/in.h> | ||
25 | #include <linux/errno.h> | ||
26 | #include <linux/interrupt.h> | ||
27 | #include <linux/if_ether.h> | ||
28 | #include <linux/inet.h> | ||
29 | #include <linux/netdevice.h> | ||
30 | #include <linux/etherdevice.h> | ||
31 | #include <linux/notifier.h> | ||
32 | #include <net/ip.h> | ||
33 | #include <net/route.h> | ||
34 | #include <linux/skbuff.h> | ||
35 | #include <net/sock.h> | ||
36 | #include <net/pkt_sched.h> | ||
37 | |||
38 | /* 1 band FIFO pseudo-"scheduler" */ | ||
39 | |||
40 | struct fifo_sched_data | ||
41 | { | ||
42 | unsigned limit; | ||
43 | }; | ||
44 | |||
45 | static int | ||
46 | bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) | ||
47 | { | ||
48 | struct fifo_sched_data *q = qdisc_priv(sch); | ||
49 | |||
50 | if (sch->qstats.backlog + skb->len <= q->limit) { | ||
51 | __skb_queue_tail(&sch->q, skb); | ||
52 | sch->qstats.backlog += skb->len; | ||
53 | sch->bstats.bytes += skb->len; | ||
54 | sch->bstats.packets++; | ||
55 | return 0; | ||
56 | } | ||
57 | sch->qstats.drops++; | ||
58 | #ifdef CONFIG_NET_CLS_POLICE | ||
59 | if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch)) | ||
60 | #endif | ||
61 | kfree_skb(skb); | ||
62 | return NET_XMIT_DROP; | ||
63 | } | ||
64 | |||
65 | static int | ||
66 | bfifo_requeue(struct sk_buff *skb, struct Qdisc* sch) | ||
67 | { | ||
68 | __skb_queue_head(&sch->q, skb); | ||
69 | sch->qstats.backlog += skb->len; | ||
70 | sch->qstats.requeues++; | ||
71 | return 0; | ||
72 | } | ||
73 | |||
74 | static struct sk_buff * | ||
75 | bfifo_dequeue(struct Qdisc* sch) | ||
76 | { | ||
77 | struct sk_buff *skb; | ||
78 | |||
79 | skb = __skb_dequeue(&sch->q); | ||
80 | if (skb) | ||
81 | sch->qstats.backlog -= skb->len; | ||
82 | return skb; | ||
83 | } | ||
84 | |||
85 | static unsigned int | ||
86 | fifo_drop(struct Qdisc* sch) | ||
87 | { | ||
88 | struct sk_buff *skb; | ||
89 | |||
90 | skb = __skb_dequeue_tail(&sch->q); | ||
91 | if (skb) { | ||
92 | unsigned int len = skb->len; | ||
93 | sch->qstats.backlog -= len; | ||
94 | kfree_skb(skb); | ||
95 | return len; | ||
96 | } | ||
97 | return 0; | ||
98 | } | ||
99 | |||
100 | static void | ||
101 | fifo_reset(struct Qdisc* sch) | ||
102 | { | ||
103 | skb_queue_purge(&sch->q); | ||
104 | sch->qstats.backlog = 0; | ||
105 | } | ||
106 | |||
107 | static int | ||
108 | pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) | ||
109 | { | ||
110 | struct fifo_sched_data *q = qdisc_priv(sch); | ||
111 | |||
112 | if (sch->q.qlen < q->limit) { | ||
113 | __skb_queue_tail(&sch->q, skb); | ||
114 | sch->bstats.bytes += skb->len; | ||
115 | sch->bstats.packets++; | ||
116 | return 0; | ||
117 | } | ||
118 | sch->qstats.drops++; | ||
119 | #ifdef CONFIG_NET_CLS_POLICE | ||
120 | if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch)) | ||
121 | #endif | ||
122 | kfree_skb(skb); | ||
123 | return NET_XMIT_DROP; | ||
124 | } | ||
125 | |||
126 | static int | ||
127 | pfifo_requeue(struct sk_buff *skb, struct Qdisc* sch) | ||
128 | { | ||
129 | __skb_queue_head(&sch->q, skb); | ||
130 | sch->qstats.requeues++; | ||
131 | return 0; | ||
132 | } | ||
133 | |||
134 | |||
135 | static struct sk_buff * | ||
136 | pfifo_dequeue(struct Qdisc* sch) | ||
137 | { | ||
138 | return __skb_dequeue(&sch->q); | ||
139 | } | ||
140 | |||
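/*
 * Descriptive note (not part of the original file): fifo_init() serves both
 * pfifo and bfifo.  When no option is supplied it derives a default limit
 * from the device; with illustrative numbers, a device with tx_queue_len
 * 1000 and mtu 1500 gets a bfifo limit of 1,500,000 bytes, while the same
 * device gets a pfifo limit of 1000 packets.
 */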
141 | static int fifo_init(struct Qdisc *sch, struct rtattr *opt) | ||
142 | { | ||
143 | struct fifo_sched_data *q = qdisc_priv(sch); | ||
144 | |||
145 | if (opt == NULL) { | ||
146 | unsigned int limit = sch->dev->tx_queue_len ? : 1; | ||
147 | |||
148 | if (sch->ops == &bfifo_qdisc_ops) | ||
149 | q->limit = limit*sch->dev->mtu; | ||
150 | else | ||
151 | q->limit = limit; | ||
152 | } else { | ||
153 | struct tc_fifo_qopt *ctl = RTA_DATA(opt); | ||
154 | if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) | ||
155 | return -EINVAL; | ||
156 | q->limit = ctl->limit; | ||
157 | } | ||
158 | return 0; | ||
159 | } | ||
160 | |||
161 | static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) | ||
162 | { | ||
163 | struct fifo_sched_data *q = qdisc_priv(sch); | ||
164 | unsigned char *b = skb->tail; | ||
165 | struct tc_fifo_qopt opt; | ||
166 | |||
167 | opt.limit = q->limit; | ||
168 | RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); | ||
169 | |||
170 | return skb->len; | ||
171 | |||
172 | rtattr_failure: | ||
173 | skb_trim(skb, b - skb->data); | ||
174 | return -1; | ||
175 | } | ||
176 | |||
177 | struct Qdisc_ops pfifo_qdisc_ops = { | ||
178 | .next = NULL, | ||
179 | .cl_ops = NULL, | ||
180 | .id = "pfifo", | ||
181 | .priv_size = sizeof(struct fifo_sched_data), | ||
182 | .enqueue = pfifo_enqueue, | ||
183 | .dequeue = pfifo_dequeue, | ||
184 | .requeue = pfifo_requeue, | ||
185 | .drop = fifo_drop, | ||
186 | .init = fifo_init, | ||
187 | .reset = fifo_reset, | ||
188 | .destroy = NULL, | ||
189 | .change = fifo_init, | ||
190 | .dump = fifo_dump, | ||
191 | .owner = THIS_MODULE, | ||
192 | }; | ||
193 | |||
194 | struct Qdisc_ops bfifo_qdisc_ops = { | ||
195 | .next = NULL, | ||
196 | .cl_ops = NULL, | ||
197 | .id = "bfifo", | ||
198 | .priv_size = sizeof(struct fifo_sched_data), | ||
199 | .enqueue = bfifo_enqueue, | ||
200 | .dequeue = bfifo_dequeue, | ||
201 | .requeue = bfifo_requeue, | ||
202 | .drop = fifo_drop, | ||
203 | .init = fifo_init, | ||
204 | .reset = fifo_reset, | ||
205 | .destroy = NULL, | ||
206 | .change = fifo_init, | ||
207 | .dump = fifo_dump, | ||
208 | .owner = THIS_MODULE, | ||
209 | }; | ||
210 | |||
211 | EXPORT_SYMBOL(bfifo_qdisc_ops); | ||
212 | EXPORT_SYMBOL(pfifo_qdisc_ops); | ||
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c new file mode 100644 index 000000000000..8c01e023f02e --- /dev/null +++ b/net/sched/sch_generic.c | |||
@@ -0,0 +1,609 @@ | |||
1 | /* | ||
2 | * net/sched/sch_generic.c Generic packet scheduler routines. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
10 | * Jamal Hadi Salim, <hadi@cyberus.ca> 990601 | ||
11 | * - Ingress support | ||
12 | */ | ||
13 | |||
14 | #include <asm/uaccess.h> | ||
15 | #include <asm/system.h> | ||
16 | #include <linux/bitops.h> | ||
17 | #include <linux/config.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/types.h> | ||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/sched.h> | ||
22 | #include <linux/string.h> | ||
23 | #include <linux/mm.h> | ||
24 | #include <linux/socket.h> | ||
25 | #include <linux/sockios.h> | ||
26 | #include <linux/in.h> | ||
27 | #include <linux/errno.h> | ||
28 | #include <linux/interrupt.h> | ||
29 | #include <linux/netdevice.h> | ||
30 | #include <linux/skbuff.h> | ||
31 | #include <linux/rtnetlink.h> | ||
32 | #include <linux/init.h> | ||
33 | #include <linux/rcupdate.h> | ||
34 | #include <linux/list.h> | ||
35 | #include <net/sock.h> | ||
36 | #include <net/pkt_sched.h> | ||
37 | |||
38 | /* Main transmission queue. */ | ||
39 | |||
40 | /* Main qdisc structure lock. | ||
41 | |||
42 | However, modifications | ||
43 | to data participating in scheduling must additionally be | ||
44 | protected with the dev->queue_lock spinlock. | ||
45 | |||
46 | The idea is the following: | ||
47 | - enqueue, dequeue are serialized via top level device | ||
48 | spinlock dev->queue_lock. | ||
49 | - tree walking is protected by read_lock_bh(qdisc_tree_lock) | ||
50 | and this lock is used only in process context. | ||
51 | - updates to tree are made under rtnl semaphore or | ||
52 | from softirq context (__qdisc_destroy rcu-callback) | ||
53 | hence this lock needs local bh disabling. | ||
54 | |||
55 | qdisc_tree_lock must be grabbed BEFORE dev->queue_lock! | ||
56 | */ | ||
57 | DEFINE_RWLOCK(qdisc_tree_lock); | ||
58 | |||
59 | void qdisc_lock_tree(struct net_device *dev) | ||
60 | { | ||
61 | write_lock_bh(&qdisc_tree_lock); | ||
62 | spin_lock_bh(&dev->queue_lock); | ||
63 | } | ||
64 | |||
65 | void qdisc_unlock_tree(struct net_device *dev) | ||
66 | { | ||
67 | spin_unlock_bh(&dev->queue_lock); | ||
68 | write_unlock_bh(&qdisc_tree_lock); | ||
69 | } | ||
70 | |||
71 | /* | ||
72 | dev->queue_lock serializes queue accesses for this device | ||
73 | AND dev->qdisc pointer itself. | ||
74 | |||
75 | dev->xmit_lock serializes accesses to device driver. | ||
76 | |||
77 | dev->queue_lock and dev->xmit_lock are mutually exclusive, | ||
78 | if one is grabbed, another must be free. | ||
79 | */ | ||
80 | |||
81 | |||
82 | /* Kick device. | ||
83 | Note that this procedure can be called by a watchdog timer, so | ||
84 | we do not check the dev->tbusy flag here. | ||
85 | |||
86 | Returns: 0 - queue is empty. | ||
87 | >0 - queue is not empty, but throttled. | ||
88 | <0 - queue is not empty. Device is throttled, if dev->tbusy != 0. | ||
89 | |||
90 | NOTE: Called under dev->queue_lock with locally disabled BH. | ||
91 | */ | ||
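/*
 * Illustrative sketch (not part of the original file): a typical caller
 * keeps kicking the device while qdisc_restart() reports a successful
 * transmission, roughly:
 *
 *	while (!netif_queue_stopped(dev) && qdisc_restart(dev) < 0)
 *		;
 */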
92 | |||
93 | int qdisc_restart(struct net_device *dev) | ||
94 | { | ||
95 | struct Qdisc *q = dev->qdisc; | ||
96 | struct sk_buff *skb; | ||
97 | |||
98 | /* Dequeue packet */ | ||
99 | if ((skb = q->dequeue(q)) != NULL) { | ||
100 | unsigned nolock = (dev->features & NETIF_F_LLTX); | ||
101 | /* | ||
102 | * When the driver has LLTX set it does its own locking | ||
103 | * in start_xmit. No need to add additional overhead by | ||
104 | * locking again. These checks are worth it because | ||
105 | * even uncongested locks can be quite expensive. | ||
106 | * The driver can do trylock like here too, in case | ||
107 | * of lock congestion it should return -1 and the packet | ||
108 | * will be requeued. | ||
109 | */ | ||
110 | if (!nolock) { | ||
111 | if (!spin_trylock(&dev->xmit_lock)) { | ||
112 | collision: | ||
113 | /* So, someone grabbed the driver. */ | ||
114 | |||
115 | /* It may be a transient configuration error | ||
116 | when hard_start_xmit() recurses. We detect | ||
117 | it by checking the xmit owner and drop the | ||
118 | packet when a dead loop is detected. | ||
119 | */ | ||
120 | if (dev->xmit_lock_owner == smp_processor_id()) { | ||
121 | kfree_skb(skb); | ||
122 | if (net_ratelimit()) | ||
123 | printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name); | ||
124 | return -1; | ||
125 | } | ||
126 | __get_cpu_var(netdev_rx_stat).cpu_collision++; | ||
127 | goto requeue; | ||
128 | } | ||
129 | /* Remember that the driver is grabbed by us. */ | ||
130 | dev->xmit_lock_owner = smp_processor_id(); | ||
131 | } | ||
132 | |||
133 | { | ||
134 | /* And release queue */ | ||
135 | spin_unlock(&dev->queue_lock); | ||
136 | |||
137 | if (!netif_queue_stopped(dev)) { | ||
138 | int ret; | ||
139 | if (netdev_nit) | ||
140 | dev_queue_xmit_nit(skb, dev); | ||
141 | |||
142 | ret = dev->hard_start_xmit(skb, dev); | ||
143 | if (ret == NETDEV_TX_OK) { | ||
144 | if (!nolock) { | ||
145 | dev->xmit_lock_owner = -1; | ||
146 | spin_unlock(&dev->xmit_lock); | ||
147 | } | ||
148 | spin_lock(&dev->queue_lock); | ||
149 | return -1; | ||
150 | } | ||
151 | if (ret == NETDEV_TX_LOCKED && nolock) { | ||
152 | spin_lock(&dev->queue_lock); | ||
153 | goto collision; | ||
154 | } | ||
155 | } | ||
156 | |||
157 | /* NETDEV_TX_BUSY - we need to requeue */ | ||
158 | /* Release the driver */ | ||
159 | if (!nolock) { | ||
160 | dev->xmit_lock_owner = -1; | ||
161 | spin_unlock(&dev->xmit_lock); | ||
162 | } | ||
163 | spin_lock(&dev->queue_lock); | ||
164 | q = dev->qdisc; | ||
165 | } | ||
166 | |||
167 | /* Device kicked us out :( | ||
168 | This is possible in the following cases: | ||
169 | |||
170 | 0. driver is locked | ||
171 | 1. fastroute is enabled | ||
172 | 2. device cannot determine busy state | ||
173 | before start of transmission (f.e. dialout) | ||
174 | 3. device is buggy (ppp) | ||
175 | */ | ||
176 | |||
177 | requeue: | ||
178 | q->ops->requeue(skb, q); | ||
179 | netif_schedule(dev); | ||
180 | return 1; | ||
181 | } | ||
182 | return q->q.qlen; | ||
183 | } | ||
184 | |||
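/*
 * Descriptive note (not part of the original file): dev_watchdog() runs
 * from a timer under dev->xmit_lock.  While a real qdisc is attached and
 * the device is present, running and has carrier, it re-arms itself every
 * watchdog_timeo ticks and, if the transmit queue has been stopped for
 * longer than watchdog_timeo since the last transmission, invokes the
 * driver's tx_timeout() handler.  The dev_hold()/dev_put() calls keep a
 * reference on the device for as long as the timer stays armed.
 */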
185 | static void dev_watchdog(unsigned long arg) | ||
186 | { | ||
187 | struct net_device *dev = (struct net_device *)arg; | ||
188 | |||
189 | spin_lock(&dev->xmit_lock); | ||
190 | if (dev->qdisc != &noop_qdisc) { | ||
191 | if (netif_device_present(dev) && | ||
192 | netif_running(dev) && | ||
193 | netif_carrier_ok(dev)) { | ||
194 | if (netif_queue_stopped(dev) && | ||
195 | (jiffies - dev->trans_start) > dev->watchdog_timeo) { | ||
196 | printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n", dev->name); | ||
197 | dev->tx_timeout(dev); | ||
198 | } | ||
199 | if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo)) | ||
200 | dev_hold(dev); | ||
201 | } | ||
202 | } | ||
203 | spin_unlock(&dev->xmit_lock); | ||
204 | |||
205 | dev_put(dev); | ||
206 | } | ||
207 | |||
208 | static void dev_watchdog_init(struct net_device *dev) | ||
209 | { | ||
210 | init_timer(&dev->watchdog_timer); | ||
211 | dev->watchdog_timer.data = (unsigned long)dev; | ||
212 | dev->watchdog_timer.function = dev_watchdog; | ||
213 | } | ||
214 | |||
215 | void __netdev_watchdog_up(struct net_device *dev) | ||
216 | { | ||
217 | if (dev->tx_timeout) { | ||
218 | if (dev->watchdog_timeo <= 0) | ||
219 | dev->watchdog_timeo = 5*HZ; | ||
220 | if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo)) | ||
221 | dev_hold(dev); | ||
222 | } | ||
223 | } | ||
224 | |||
225 | static void dev_watchdog_up(struct net_device *dev) | ||
226 | { | ||
227 | spin_lock_bh(&dev->xmit_lock); | ||
228 | __netdev_watchdog_up(dev); | ||
229 | spin_unlock_bh(&dev->xmit_lock); | ||
230 | } | ||
231 | |||
232 | static void dev_watchdog_down(struct net_device *dev) | ||
233 | { | ||
234 | spin_lock_bh(&dev->xmit_lock); | ||
235 | if (del_timer(&dev->watchdog_timer)) | ||
236 | __dev_put(dev); | ||
237 | spin_unlock_bh(&dev->xmit_lock); | ||
238 | } | ||
239 | |||
240 | /* "NOOP" scheduler: the best scheduler, recommended for all interfaces | ||
241 | under all circumstances. It is difficult to invent anything faster or | ||
242 | cheaper. | ||
243 | */ | ||
244 | |||
245 | static int | ||
246 | noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc) | ||
247 | { | ||
248 | kfree_skb(skb); | ||
249 | return NET_XMIT_CN; | ||
250 | } | ||
251 | |||
252 | static struct sk_buff * | ||
253 | noop_dequeue(struct Qdisc * qdisc) | ||
254 | { | ||
255 | return NULL; | ||
256 | } | ||
257 | |||
258 | static int | ||
259 | noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc) | ||
260 | { | ||
261 | if (net_ratelimit()) | ||
262 | printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name); | ||
263 | kfree_skb(skb); | ||
264 | return NET_XMIT_CN; | ||
265 | } | ||
266 | |||
267 | struct Qdisc_ops noop_qdisc_ops = { | ||
268 | .next = NULL, | ||
269 | .cl_ops = NULL, | ||
270 | .id = "noop", | ||
271 | .priv_size = 0, | ||
272 | .enqueue = noop_enqueue, | ||
273 | .dequeue = noop_dequeue, | ||
274 | .requeue = noop_requeue, | ||
275 | .owner = THIS_MODULE, | ||
276 | }; | ||
277 | |||
278 | struct Qdisc noop_qdisc = { | ||
279 | .enqueue = noop_enqueue, | ||
280 | .dequeue = noop_dequeue, | ||
281 | .flags = TCQ_F_BUILTIN, | ||
282 | .ops = &noop_qdisc_ops, | ||
283 | .list = LIST_HEAD_INIT(noop_qdisc.list), | ||
284 | }; | ||
285 | |||
286 | static struct Qdisc_ops noqueue_qdisc_ops = { | ||
287 | .next = NULL, | ||
288 | .cl_ops = NULL, | ||
289 | .id = "noqueue", | ||
290 | .priv_size = 0, | ||
291 | .enqueue = noop_enqueue, | ||
292 | .dequeue = noop_dequeue, | ||
293 | .requeue = noop_requeue, | ||
294 | .owner = THIS_MODULE, | ||
295 | }; | ||
296 | |||
297 | static struct Qdisc noqueue_qdisc = { | ||
298 | .enqueue = NULL, | ||
299 | .dequeue = noop_dequeue, | ||
300 | .flags = TCQ_F_BUILTIN, | ||
301 | .ops = &noqueue_qdisc_ops, | ||
302 | .list = LIST_HEAD_INIT(noqueue_qdisc.list), | ||
303 | }; | ||
304 | |||
305 | |||
306 | static const u8 prio2band[TC_PRIO_MAX+1] = | ||
307 | { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; | ||
308 | |||
309 | /* 3-band FIFO queue: old style, but should be a bit faster than | ||
310 | generic prio+fifo combination. | ||
311 | */ | ||
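/*
 * Worked example (not part of the original file), assuming the usual
 * TC_PRIO_* values from linux/pkt_sched.h: TC_PRIO_CONTROL (7) and
 * TC_PRIO_INTERACTIVE (6) map to band 0 (served first),
 * TC_PRIO_BESTEFFORT (0) maps to band 1, and TC_PRIO_BULK (2) maps to
 * band 2 (served last).
 */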
312 | |||
313 | static int | ||
314 | pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) | ||
315 | { | ||
316 | struct sk_buff_head *list = qdisc_priv(qdisc); | ||
317 | |||
318 | list += prio2band[skb->priority&TC_PRIO_MAX]; | ||
319 | |||
320 | if (list->qlen < qdisc->dev->tx_queue_len) { | ||
321 | __skb_queue_tail(list, skb); | ||
322 | qdisc->q.qlen++; | ||
323 | qdisc->bstats.bytes += skb->len; | ||
324 | qdisc->bstats.packets++; | ||
325 | return 0; | ||
326 | } | ||
327 | qdisc->qstats.drops++; | ||
328 | kfree_skb(skb); | ||
329 | return NET_XMIT_DROP; | ||
330 | } | ||
331 | |||
332 | static struct sk_buff * | ||
333 | pfifo_fast_dequeue(struct Qdisc* qdisc) | ||
334 | { | ||
335 | int prio; | ||
336 | struct sk_buff_head *list = qdisc_priv(qdisc); | ||
337 | struct sk_buff *skb; | ||
338 | |||
339 | for (prio = 0; prio < 3; prio++, list++) { | ||
340 | skb = __skb_dequeue(list); | ||
341 | if (skb) { | ||
342 | qdisc->q.qlen--; | ||
343 | return skb; | ||
344 | } | ||
345 | } | ||
346 | return NULL; | ||
347 | } | ||
348 | |||
349 | static int | ||
350 | pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc) | ||
351 | { | ||
352 | struct sk_buff_head *list = qdisc_priv(qdisc); | ||
353 | |||
354 | list += prio2band[skb->priority&TC_PRIO_MAX]; | ||
355 | |||
356 | __skb_queue_head(list, skb); | ||
357 | qdisc->q.qlen++; | ||
358 | qdisc->qstats.requeues++; | ||
359 | return 0; | ||
360 | } | ||
361 | |||
362 | static void | ||
363 | pfifo_fast_reset(struct Qdisc* qdisc) | ||
364 | { | ||
365 | int prio; | ||
366 | struct sk_buff_head *list = qdisc_priv(qdisc); | ||
367 | |||
368 | for (prio=0; prio < 3; prio++) | ||
369 | skb_queue_purge(list+prio); | ||
370 | qdisc->q.qlen = 0; | ||
371 | } | ||
372 | |||
373 | static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) | ||
374 | { | ||
375 | unsigned char *b = skb->tail; | ||
376 | struct tc_prio_qopt opt; | ||
377 | |||
378 | opt.bands = 3; | ||
379 | memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1); | ||
380 | RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); | ||
381 | return skb->len; | ||
382 | |||
383 | rtattr_failure: | ||
384 | skb_trim(skb, b - skb->data); | ||
385 | return -1; | ||
386 | } | ||
387 | |||
388 | static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt) | ||
389 | { | ||
390 | int i; | ||
391 | struct sk_buff_head *list = qdisc_priv(qdisc); | ||
392 | |||
393 | for (i=0; i<3; i++) | ||
394 | skb_queue_head_init(list+i); | ||
395 | |||
396 | return 0; | ||
397 | } | ||
398 | |||
399 | static struct Qdisc_ops pfifo_fast_ops = { | ||
400 | .next = NULL, | ||
401 | .cl_ops = NULL, | ||
402 | .id = "pfifo_fast", | ||
403 | .priv_size = 3 * sizeof(struct sk_buff_head), | ||
404 | .enqueue = pfifo_fast_enqueue, | ||
405 | .dequeue = pfifo_fast_dequeue, | ||
406 | .requeue = pfifo_fast_requeue, | ||
407 | .init = pfifo_fast_init, | ||
408 | .reset = pfifo_fast_reset, | ||
409 | .dump = pfifo_fast_dump, | ||
410 | .owner = THIS_MODULE, | ||
411 | }; | ||
412 | |||
413 | struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) | ||
414 | { | ||
415 | void *p; | ||
416 | struct Qdisc *sch; | ||
417 | int size; | ||
418 | |||
419 | /* ensure that the Qdisc and the private data are 32-byte aligned */ | ||
420 | size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); | ||
421 | size += ops->priv_size + QDISC_ALIGN_CONST; | ||
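	/*
	 * Illustrative note (not part of the original file, and assuming
	 * QDISC_ALIGN_CONST is the alignment minus one, e.g. 31 for the
	 * 32-byte alignment mentioned above): the rounding above pads
	 * sizeof(*sch) up to the next aligned boundary, and the extra
	 * QDISC_ALIGN_CONST bytes allow the kmalloc'ed pointer itself to
	 * be realigned below.
	 */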
422 | |||
423 | p = kmalloc(size, GFP_KERNEL); | ||
424 | if (!p) | ||
425 | return NULL; | ||
426 | memset(p, 0, size); | ||
427 | |||
428 | sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) | ||
429 | & ~QDISC_ALIGN_CONST); | ||
430 | sch->padded = (char *)sch - (char *)p; | ||
431 | |||
432 | INIT_LIST_HEAD(&sch->list); | ||
433 | skb_queue_head_init(&sch->q); | ||
434 | sch->ops = ops; | ||
435 | sch->enqueue = ops->enqueue; | ||
436 | sch->dequeue = ops->dequeue; | ||
437 | sch->dev = dev; | ||
438 | dev_hold(dev); | ||
439 | sch->stats_lock = &dev->queue_lock; | ||
440 | atomic_set(&sch->refcnt, 1); | ||
441 | if (!ops->init || ops->init(sch, NULL) == 0) | ||
442 | return sch; | ||
443 | |||
444 | dev_put(dev); | ||
445 | kfree(p); | ||
446 | return NULL; | ||
447 | } | ||
448 | |||
449 | /* Under dev->queue_lock and BH! */ | ||
450 | |||
451 | void qdisc_reset(struct Qdisc *qdisc) | ||
452 | { | ||
453 | struct Qdisc_ops *ops = qdisc->ops; | ||
454 | |||
455 | if (ops->reset) | ||
456 | ops->reset(qdisc); | ||
457 | } | ||
458 | |||
459 | /* this is the rcu callback function to clean up a qdisc when there | ||
460 | * are no further references to it */ | ||
461 | |||
462 | static void __qdisc_destroy(struct rcu_head *head) | ||
463 | { | ||
464 | struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu); | ||
465 | struct Qdisc_ops *ops = qdisc->ops; | ||
466 | |||
467 | #ifdef CONFIG_NET_ESTIMATOR | ||
468 | gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); | ||
469 | #endif | ||
470 | write_lock(&qdisc_tree_lock); | ||
471 | if (ops->reset) | ||
472 | ops->reset(qdisc); | ||
473 | if (ops->destroy) | ||
474 | ops->destroy(qdisc); | ||
475 | write_unlock(&qdisc_tree_lock); | ||
476 | module_put(ops->owner); | ||
477 | |||
478 | dev_put(qdisc->dev); | ||
479 | kfree((char *) qdisc - qdisc->padded); | ||
480 | } | ||
481 | |||
482 | /* Under dev->queue_lock and BH! */ | ||
483 | |||
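/*
 * Descriptive note (not part of the original file): qdisc_destroy() drops a
 * reference; only when the last reference goes away does it unlink the
 * qdisc - and, for classful qdiscs, any inner qdiscs hanging below it -
 * from dev->qdisc_list, deferring the actual teardown and freeing to
 * __qdisc_destroy() via RCU.
 */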
484 | void qdisc_destroy(struct Qdisc *qdisc) | ||
485 | { | ||
486 | struct list_head cql = LIST_HEAD_INIT(cql); | ||
487 | struct Qdisc *cq, *q, *n; | ||
488 | |||
489 | if (qdisc->flags & TCQ_F_BUILTIN || | ||
490 | !atomic_dec_and_test(&qdisc->refcnt)) | ||
491 | return; | ||
492 | |||
493 | if (!list_empty(&qdisc->list)) { | ||
494 | if (qdisc->ops->cl_ops == NULL) | ||
495 | list_del(&qdisc->list); | ||
496 | else | ||
497 | list_move(&qdisc->list, &cql); | ||
498 | } | ||
499 | |||
500 | /* unlink inner qdiscs from dev->qdisc_list immediately */ | ||
501 | list_for_each_entry(cq, &cql, list) | ||
502 | list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list) | ||
503 | if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) { | ||
504 | if (q->ops->cl_ops == NULL) | ||
505 | list_del_init(&q->list); | ||
506 | else | ||
507 | list_move_tail(&q->list, &cql); | ||
508 | } | ||
509 | list_for_each_entry_safe(cq, n, &cql, list) | ||
510 | list_del_init(&cq->list); | ||
511 | |||
512 | call_rcu(&qdisc->q_rcu, __qdisc_destroy); | ||
513 | } | ||
514 | |||
515 | void dev_activate(struct net_device *dev) | ||
516 | { | ||
517 | /* No queueing discipline is attached to the device; | ||
518 | create a default one, i.e. pfifo_fast for devices | ||
519 | that need queueing and noqueue_qdisc for | ||
520 | virtual interfaces. | ||
521 | */ | ||
522 | |||
523 | if (dev->qdisc_sleeping == &noop_qdisc) { | ||
524 | struct Qdisc *qdisc; | ||
525 | if (dev->tx_queue_len) { | ||
526 | qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops); | ||
527 | if (qdisc == NULL) { | ||
528 | printk(KERN_INFO "%s: activation failed\n", dev->name); | ||
529 | return; | ||
530 | } | ||
531 | write_lock_bh(&qdisc_tree_lock); | ||
532 | list_add_tail(&qdisc->list, &dev->qdisc_list); | ||
533 | write_unlock_bh(&qdisc_tree_lock); | ||
534 | } else { | ||
535 | qdisc = &noqueue_qdisc; | ||
536 | } | ||
537 | write_lock_bh(&qdisc_tree_lock); | ||
538 | dev->qdisc_sleeping = qdisc; | ||
539 | write_unlock_bh(&qdisc_tree_lock); | ||
540 | } | ||
541 | |||
542 | spin_lock_bh(&dev->queue_lock); | ||
543 | rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping); | ||
544 | if (dev->qdisc != &noqueue_qdisc) { | ||
545 | dev->trans_start = jiffies; | ||
546 | dev_watchdog_up(dev); | ||
547 | } | ||
548 | spin_unlock_bh(&dev->queue_lock); | ||
549 | } | ||
550 | |||
551 | void dev_deactivate(struct net_device *dev) | ||
552 | { | ||
553 | struct Qdisc *qdisc; | ||
554 | |||
555 | spin_lock_bh(&dev->queue_lock); | ||
556 | qdisc = dev->qdisc; | ||
557 | dev->qdisc = &noop_qdisc; | ||
558 | |||
559 | qdisc_reset(qdisc); | ||
560 | |||
561 | spin_unlock_bh(&dev->queue_lock); | ||
562 | |||
563 | dev_watchdog_down(dev); | ||
564 | |||
565 | while (test_bit(__LINK_STATE_SCHED, &dev->state)) | ||
566 | yield(); | ||
567 | |||
568 | spin_unlock_wait(&dev->xmit_lock); | ||
569 | } | ||
570 | |||
571 | void dev_init_scheduler(struct net_device *dev) | ||
572 | { | ||
573 | qdisc_lock_tree(dev); | ||
574 | dev->qdisc = &noop_qdisc; | ||
575 | dev->qdisc_sleeping = &noop_qdisc; | ||
576 | INIT_LIST_HEAD(&dev->qdisc_list); | ||
577 | qdisc_unlock_tree(dev); | ||
578 | |||
579 | dev_watchdog_init(dev); | ||
580 | } | ||
581 | |||
582 | void dev_shutdown(struct net_device *dev) | ||
583 | { | ||
584 | struct Qdisc *qdisc; | ||
585 | |||
586 | qdisc_lock_tree(dev); | ||
587 | qdisc = dev->qdisc_sleeping; | ||
588 | dev->qdisc = &noop_qdisc; | ||
589 | dev->qdisc_sleeping = &noop_qdisc; | ||
590 | qdisc_destroy(qdisc); | ||
591 | #if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE) | ||
592 | if ((qdisc = dev->qdisc_ingress) != NULL) { | ||
593 | dev->qdisc_ingress = NULL; | ||
594 | qdisc_destroy(qdisc); | ||
595 | } | ||
596 | #endif | ||
597 | BUG_TRAP(!timer_pending(&dev->watchdog_timer)); | ||
598 | qdisc_unlock_tree(dev); | ||
599 | } | ||
600 | |||
601 | EXPORT_SYMBOL(__netdev_watchdog_up); | ||
602 | EXPORT_SYMBOL(noop_qdisc); | ||
603 | EXPORT_SYMBOL(noop_qdisc_ops); | ||
604 | EXPORT_SYMBOL(qdisc_create_dflt); | ||
605 | EXPORT_SYMBOL(qdisc_destroy); | ||
606 | EXPORT_SYMBOL(qdisc_reset); | ||
607 | EXPORT_SYMBOL(qdisc_restart); | ||
608 | EXPORT_SYMBOL(qdisc_lock_tree); | ||
609 | EXPORT_SYMBOL(qdisc_unlock_tree); | ||
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c new file mode 100644 index 000000000000..25c171c32715 --- /dev/null +++ b/net/sched/sch_gred.c | |||
@@ -0,0 +1,630 @@ | |||
1 | /* | ||
2 | * net/sched/sch_gred.c Generic Random Early Detection queue. | ||
3 | * | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License | ||
7 | * as published by the Free Software Foundation; either version | ||
8 | * 2 of the License, or (at your option) any later version. | ||
9 | * | ||
10 | * Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002 | ||
11 | * | ||
12 | * 991129: - Bug fix with grio mode | ||
13 | - a better single AvgQ mode with Grio (WRED) | ||
14 | - A finer grained VQ dequeue based on a suggestion | ||
15 | * from Ren Liu | ||
16 | * - More error checks | ||
17 | * | ||
18 | * | ||
19 | * | ||
20 | * For all the glorious comments look at Alexey's sch_red.c | ||
21 | */ | ||
22 | |||
23 | #include <linux/config.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <asm/uaccess.h> | ||
26 | #include <asm/system.h> | ||
27 | #include <linux/bitops.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <linux/kernel.h> | ||
30 | #include <linux/sched.h> | ||
31 | #include <linux/string.h> | ||
32 | #include <linux/mm.h> | ||
33 | #include <linux/socket.h> | ||
34 | #include <linux/sockios.h> | ||
35 | #include <linux/in.h> | ||
36 | #include <linux/errno.h> | ||
37 | #include <linux/interrupt.h> | ||
38 | #include <linux/if_ether.h> | ||
39 | #include <linux/inet.h> | ||
40 | #include <linux/netdevice.h> | ||
41 | #include <linux/etherdevice.h> | ||
42 | #include <linux/notifier.h> | ||
43 | #include <net/ip.h> | ||
44 | #include <net/route.h> | ||
45 | #include <linux/skbuff.h> | ||
46 | #include <net/sock.h> | ||
47 | #include <net/pkt_sched.h> | ||
48 | |||
49 | #if 1 /* control */ | ||
50 | #define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) | ||
51 | #else | ||
52 | #define DPRINTK(format,args...) | ||
53 | #endif | ||
54 | |||
55 | #if 0 /* data */ | ||
56 | #define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) | ||
57 | #else | ||
58 | #define D2PRINTK(format,args...) | ||
59 | #endif | ||
60 | |||
61 | struct gred_sched_data; | ||
62 | struct gred_sched; | ||
63 | |||
64 | struct gred_sched_data | ||
65 | { | ||
66 | /* Parameters */ | ||
67 | u32 limit; /* HARD maximal queue length */ | ||
68 | u32 qth_min; /* Min average length threshold: A scaled */ | ||
69 | u32 qth_max; /* Max average length threshold: A scaled */ | ||
70 | u32 DP; /* the drop parameters */ | ||
71 | char Wlog; /* log(W) */ | ||
72 | char Plog; /* random number bits */ | ||
73 | u32 Scell_max; | ||
74 | u32 Rmask; | ||
75 | u32 bytesin; /* bytes seen on virtualQ so far */ | ||
76 | u32 packetsin; /* packets seen on virtualQ so far */ | ||
77 | u32 backlog; /* bytes on the virtualQ */ | ||
78 | u32 forced; /* packets dropped for exceeding limits */ | ||
79 | u32 early; /* packets dropped as a warning */ | ||
80 | u32 other; /* packets dropped by invoking drop() */ | ||
81 | u32 pdrop; /* packets dropped because we exceeded physical queue limits */ | ||
82 | char Scell_log; | ||
83 | u8 Stab[256]; | ||
84 | u8 prio; /* the prio of this vq */ | ||
85 | |||
86 | /* Variables */ | ||
87 | unsigned long qave; /* Average queue length: A scaled */ | ||
88 | int qcount; /* Packets since last random number generation */ | ||
89 | u32 qR; /* Cached random number */ | ||
90 | |||
91 | psched_time_t qidlestart; /* Start of idle period */ | ||
92 | }; | ||
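/*
 * Worked note (not part of the original file): qave holds the RED average
 * queue length in fixed point, scaled by 2^Wlog.  The update performed in
 * gred_enqueue(),
 *
 *	qave += backlog - (qave >> Wlog);
 *
 * therefore corresponds to avg = avg + (backlog - avg) / 2^Wlog, the usual
 * exponentially weighted moving average.  qth_min and qth_max are stored
 * pre-shifted by Wlog so that they compare directly against qave.
 */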
93 | |||
94 | struct gred_sched | ||
95 | { | ||
96 | struct gred_sched_data *tab[MAX_DPs]; | ||
97 | u32 DPs; | ||
98 | u32 def; | ||
99 | u8 initd; | ||
100 | u8 grio; | ||
101 | u8 eqp; | ||
102 | }; | ||
103 | |||
104 | static int | ||
105 | gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) | ||
106 | { | ||
107 | psched_time_t now; | ||
108 | struct gred_sched_data *q=NULL; | ||
109 | struct gred_sched *t= qdisc_priv(sch); | ||
110 | unsigned long qave=0; | ||
111 | int i=0; | ||
112 | |||
113 | if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) { | ||
114 | D2PRINTK("NO GRED Queues setup yet! Enqueued anyway\n"); | ||
115 | goto do_enqueue; | ||
116 | } | ||
117 | |||
118 | |||
119 | if ( ((skb->tc_index&0xf) > (t->DPs -1)) || !(q=t->tab[skb->tc_index&0xf])) { | ||
120 | printk("GRED: setting to default (%d)\n ",t->def); | ||
121 | if (!(q=t->tab[t->def])) { | ||
122 | DPRINTK("GRED: setting to default FAILED! dropping!! " | ||
123 | "(%d)\n ", t->def); | ||
124 | goto drop; | ||
125 | } | ||
126 | /* fix tc_index? --could be controversial but needed for | ||
127 | requeueing */ | ||
128 | skb->tc_index=(skb->tc_index&0xfffffff0) | t->def; | ||
129 | } | ||
130 | |||
131 | D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d " | ||
132 | "general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog, | ||
133 | sch->qstats.backlog); | ||
134 | /* sum up all the qaves of prios <= ours to get the new qave */ | ||
135 | if (!t->eqp && t->grio) { | ||
136 | for (i=0;i<t->DPs;i++) { | ||
137 | if ((!t->tab[i]) || (i==q->DP)) | ||
138 | continue; | ||
139 | |||
140 | if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart))) | ||
141 | qave +=t->tab[i]->qave; | ||
142 | } | ||
143 | |||
144 | } | ||
145 | |||
146 | q->packetsin++; | ||
147 | q->bytesin+=skb->len; | ||
148 | |||
149 | if (t->eqp && t->grio) { | ||
150 | qave=0; | ||
151 | q->qave=t->tab[t->def]->qave; | ||
152 | q->qidlestart=t->tab[t->def]->qidlestart; | ||
153 | } | ||
154 | |||
155 | if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { | ||
156 | long us_idle; | ||
157 | PSCHED_GET_TIME(now); | ||
158 | us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); | ||
159 | PSCHED_SET_PASTPERFECT(q->qidlestart); | ||
160 | |||
161 | q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF]; | ||
162 | } else { | ||
163 | if (t->eqp) { | ||
164 | q->qave += sch->qstats.backlog - (q->qave >> q->Wlog); | ||
165 | } else { | ||
166 | q->qave += q->backlog - (q->qave >> q->Wlog); | ||
167 | } | ||
168 | |||
169 | } | ||
170 | |||
171 | |||
172 | if (t->eqp && t->grio) | ||
173 | t->tab[t->def]->qave=q->qave; | ||
174 | |||
175 | if ((q->qave+qave) < q->qth_min) { | ||
176 | q->qcount = -1; | ||
177 | enqueue: | ||
178 | if (q->backlog + skb->len <= q->limit) { | ||
179 | q->backlog += skb->len; | ||
180 | do_enqueue: | ||
181 | __skb_queue_tail(&sch->q, skb); | ||
182 | sch->qstats.backlog += skb->len; | ||
183 | sch->bstats.bytes += skb->len; | ||
184 | sch->bstats.packets++; | ||
185 | return 0; | ||
186 | } else { | ||
187 | q->pdrop++; | ||
188 | } | ||
189 | |||
190 | drop: | ||
191 | kfree_skb(skb); | ||
192 | sch->qstats.drops++; | ||
193 | return NET_XMIT_DROP; | ||
194 | } | ||
195 | if ((q->qave+qave) >= q->qth_max) { | ||
196 | q->qcount = -1; | ||
197 | sch->qstats.overlimits++; | ||
198 | q->forced++; | ||
199 | goto drop; | ||
200 | } | ||
201 | if (++q->qcount) { | ||
202 | if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR) | ||
203 | goto enqueue; | ||
204 | q->qcount = 0; | ||
205 | q->qR = net_random()&q->Rmask; | ||
206 | sch->qstats.overlimits++; | ||
207 | q->early++; | ||
208 | goto drop; | ||
209 | } | ||
210 | q->qR = net_random()&q->Rmask; | ||
211 | goto enqueue; | ||
212 | } | ||
213 | |||
214 | static int | ||
215 | gred_requeue(struct sk_buff *skb, struct Qdisc* sch) | ||
216 | { | ||
217 | struct gred_sched_data *q; | ||
218 | struct gred_sched *t= qdisc_priv(sch); | ||
219 | q= t->tab[(skb->tc_index&0xf)]; | ||
220 | /* error checking here -- probably unnecessary */ | ||
221 | PSCHED_SET_PASTPERFECT(q->qidlestart); | ||
222 | |||
223 | __skb_queue_head(&sch->q, skb); | ||
224 | sch->qstats.backlog += skb->len; | ||
225 | sch->qstats.requeues++; | ||
226 | q->backlog += skb->len; | ||
227 | return 0; | ||
228 | } | ||
229 | |||
230 | static struct sk_buff * | ||
231 | gred_dequeue(struct Qdisc* sch) | ||
232 | { | ||
233 | struct sk_buff *skb; | ||
234 | struct gred_sched_data *q; | ||
235 | struct gred_sched *t= qdisc_priv(sch); | ||
236 | |||
237 | skb = __skb_dequeue(&sch->q); | ||
238 | if (skb) { | ||
239 | sch->qstats.backlog -= skb->len; | ||
240 | q= t->tab[(skb->tc_index&0xf)]; | ||
241 | if (q) { | ||
242 | q->backlog -= skb->len; | ||
243 | if (!q->backlog && !t->eqp) | ||
244 | PSCHED_GET_TIME(q->qidlestart); | ||
245 | } else { | ||
246 | D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); | ||
247 | } | ||
248 | return skb; | ||
249 | } | ||
250 | |||
251 | if (t->eqp) { | ||
252 | q= t->tab[t->def]; | ||
253 | if (!q) | ||
254 | D2PRINTK("no default VQ set: Results will be " | ||
255 | "screwed up\n"); | ||
256 | else | ||
257 | PSCHED_GET_TIME(q->qidlestart); | ||
258 | } | ||
259 | |||
260 | return NULL; | ||
261 | } | ||
262 | |||
263 | static unsigned int gred_drop(struct Qdisc* sch) | ||
264 | { | ||
265 | struct sk_buff *skb; | ||
266 | |||
267 | struct gred_sched_data *q; | ||
268 | struct gred_sched *t= qdisc_priv(sch); | ||
269 | |||
270 | skb = __skb_dequeue_tail(&sch->q); | ||
271 | if (skb) { | ||
272 | unsigned int len = skb->len; | ||
273 | sch->qstats.backlog -= len; | ||
274 | sch->qstats.drops++; | ||
275 | q= t->tab[(skb->tc_index&0xf)]; | ||
276 | if (q) { | ||
277 | q->backlog -= len; | ||
278 | q->other++; | ||
279 | if (!q->backlog && !t->eqp) | ||
280 | PSCHED_GET_TIME(q->qidlestart); | ||
281 | } else { | ||
282 | D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); | ||
283 | } | ||
284 | |||
285 | kfree_skb(skb); | ||
286 | return len; | ||
287 | } | ||
288 | |||
289 | q=t->tab[t->def]; | ||
290 | if (!q) { | ||
291 | D2PRINTK("no default VQ set: Results might be screwed up\n"); | ||
292 | return 0; | ||
293 | } | ||
294 | |||
295 | PSCHED_GET_TIME(q->qidlestart); | ||
296 | return 0; | ||
297 | |||
298 | } | ||
299 | |||
300 | static void gred_reset(struct Qdisc* sch) | ||
301 | { | ||
302 | int i; | ||
303 | struct gred_sched_data *q; | ||
304 | struct gred_sched *t= qdisc_priv(sch); | ||
305 | |||
306 | __skb_queue_purge(&sch->q); | ||
307 | |||
308 | sch->qstats.backlog = 0; | ||
309 | |||
310 | for (i=0;i<t->DPs;i++) { | ||
311 | q= t->tab[i]; | ||
312 | if (!q) | ||
313 | continue; | ||
314 | PSCHED_SET_PASTPERFECT(q->qidlestart); | ||
315 | q->qave = 0; | ||
316 | q->qcount = -1; | ||
317 | q->backlog = 0; | ||
318 | q->other=0; | ||
319 | q->forced=0; | ||
320 | q->pdrop=0; | ||
321 | q->early=0; | ||
322 | } | ||
323 | } | ||
324 | |||
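/*
 * Descriptive note (not part of the original file): gred_change() is used
 * in two ways.  With only a TCA_GRED_DPS attribute it (re)configures the
 * table itself (number of virtual queues, default DP, grio mode); with
 * TCA_GRED_PARMS plus a 256-byte TCA_GRED_STAB it configures a single
 * virtual queue, allocating its gred_sched_data on first use and seeding
 * the default DP from the first entry configured.
 */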
325 | static int gred_change(struct Qdisc *sch, struct rtattr *opt) | ||
326 | { | ||
327 | struct gred_sched *table = qdisc_priv(sch); | ||
328 | struct gred_sched_data *q; | ||
329 | struct tc_gred_qopt *ctl; | ||
330 | struct tc_gred_sopt *sopt; | ||
331 | struct rtattr *tb[TCA_GRED_STAB]; | ||
332 | struct rtattr *tb2[TCA_GRED_DPS]; | ||
333 | int i; | ||
334 | |||
335 | if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt)) | ||
336 | return -EINVAL; | ||
337 | |||
338 | if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) { | ||
339 | rtattr_parse_nested(tb2, TCA_GRED_DPS, opt); | ||
340 | |||
341 | if (tb2[TCA_GRED_DPS-1] == 0) | ||
342 | return -EINVAL; | ||
343 | |||
344 | sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); | ||
345 | table->DPs=sopt->DPs; | ||
346 | table->def=sopt->def_DP; | ||
347 | table->grio=sopt->grio; | ||
348 | table->initd=0; | ||
349 | /* probably need to clear all the table DP entries as well */ | ||
350 | return 0; | ||
351 | } | ||
352 | |||
353 | |||
354 | if (!table->DPs || tb[TCA_GRED_PARMS-1] == 0 || tb[TCA_GRED_STAB-1] == 0 || | ||
355 | RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) || | ||
356 | RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256) | ||
357 | return -EINVAL; | ||
358 | |||
359 | ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]); | ||
360 | if (ctl->DP > MAX_DPs-1 ) { | ||
361 | /* misbehaving is punished! Put in the default drop probability */ | ||
362 | DPRINTK("\nGRED: DP %u not in the proper range fixed. New DP " | ||
363 | "set to default at %d\n",ctl->DP,table->def); | ||
364 | ctl->DP=table->def; | ||
365 | } | ||
366 | |||
367 | if (table->tab[ctl->DP] == NULL) { | ||
368 | table->tab[ctl->DP]=kmalloc(sizeof(struct gred_sched_data), | ||
369 | GFP_KERNEL); | ||
370 | if (NULL == table->tab[ctl->DP]) | ||
371 | return -ENOMEM; | ||
372 | memset(table->tab[ctl->DP], 0, (sizeof(struct gred_sched_data))); | ||
373 | } | ||
374 | q= table->tab[ctl->DP]; | ||
375 | |||
376 | if (table->grio) { | ||
377 | if (ctl->prio <=0) { | ||
378 | if (table->def && table->tab[table->def]) { | ||
379 | DPRINTK("\nGRED: DP %u does not have a prio" | ||
380 | "setting default to %d\n",ctl->DP, | ||
381 | table->tab[table->def]->prio); | ||
382 | q->prio=table->tab[table->def]->prio; | ||
383 | } else { | ||
384 | DPRINTK("\nGRED: DP %u does not have a prio" | ||
385 | " setting default to 8\n",ctl->DP); | ||
386 | q->prio=8; | ||
387 | } | ||
388 | } else { | ||
389 | q->prio=ctl->prio; | ||
390 | } | ||
391 | } else { | ||
392 | q->prio=8; | ||
393 | } | ||
394 | |||
395 | |||
396 | q->DP=ctl->DP; | ||
397 | q->Wlog = ctl->Wlog; | ||
398 | q->Plog = ctl->Plog; | ||
399 | q->limit = ctl->limit; | ||
400 | q->Scell_log = ctl->Scell_log; | ||
401 | q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; | ||
402 | q->Scell_max = (255<<q->Scell_log); | ||
403 | q->qth_min = ctl->qth_min<<ctl->Wlog; | ||
404 | q->qth_max = ctl->qth_max<<ctl->Wlog; | ||
405 | q->qave=0; | ||
406 | q->backlog=0; | ||
407 | q->qcount = -1; | ||
408 | q->other=0; | ||
409 | q->forced=0; | ||
410 | q->pdrop=0; | ||
411 | q->early=0; | ||
412 | |||
413 | PSCHED_SET_PASTPERFECT(q->qidlestart); | ||
414 | memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256); | ||
415 | |||
416 | if ( table->initd && table->grio) { | ||
417 | /* this looks ugly but it's not in the fast path */ | ||
418 | for (i=0;i<table->DPs;i++) { | ||
419 | if ((!table->tab[i]) || (i==q->DP) ) | ||
420 | continue; | ||
421 | if (table->tab[i]->prio == q->prio ){ | ||
422 | /* WRED mode detected */ | ||
423 | table->eqp=1; | ||
424 | break; | ||
425 | } | ||
426 | } | ||
427 | } | ||
428 | |||
429 | if (!table->initd) { | ||
430 | table->initd=1; | ||
431 | /* | ||
432 | the first entry also goes into the default until | ||
433 | over-written | ||
434 | */ | ||
435 | |||
436 | if (table->tab[table->def] == NULL) { | ||
437 | table->tab[table->def]= | ||
438 | kmalloc(sizeof(struct gred_sched_data), GFP_KERNEL); | ||
439 | if (NULL == table->tab[table->def]) | ||
440 | return -ENOMEM; | ||
441 | |||
442 | memset(table->tab[table->def], 0, | ||
443 | (sizeof(struct gred_sched_data))); | ||
444 | } | ||
445 | q= table->tab[table->def]; | ||
446 | q->DP=table->def; | ||
447 | q->Wlog = ctl->Wlog; | ||
448 | q->Plog = ctl->Plog; | ||
449 | q->limit = ctl->limit; | ||
450 | q->Scell_log = ctl->Scell_log; | ||
451 | q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; | ||
452 | q->Scell_max = (255<<q->Scell_log); | ||
453 | q->qth_min = ctl->qth_min<<ctl->Wlog; | ||
454 | q->qth_max = ctl->qth_max<<ctl->Wlog; | ||
455 | |||
456 | if (table->grio) | ||
457 | q->prio=table->tab[ctl->DP]->prio; | ||
458 | else | ||
459 | q->prio=8; | ||
460 | |||
461 | q->qcount = -1; | ||
462 | PSCHED_SET_PASTPERFECT(q->qidlestart); | ||
463 | memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256); | ||
464 | } | ||
465 | return 0; | ||
466 | |||
467 | } | ||
468 | |||
469 | static int gred_init(struct Qdisc *sch, struct rtattr *opt) | ||
470 | { | ||
471 | struct gred_sched *table = qdisc_priv(sch); | ||
472 | struct tc_gred_sopt *sopt; | ||
473 | struct rtattr *tb[TCA_GRED_STAB]; | ||
474 | struct rtattr *tb2[TCA_GRED_DPS]; | ||
475 | |||
476 | if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt)) | ||
477 | return -EINVAL; | ||
478 | |||
479 | if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) { | ||
480 | rtattr_parse_nested(tb2, TCA_GRED_DPS, opt); | ||
481 | |||
482 | if (tb2[TCA_GRED_DPS-1] == 0) | ||
483 | return -EINVAL; | ||
484 | |||
485 | sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); | ||
486 | table->DPs=sopt->DPs; | ||
487 | table->def=sopt->def_DP; | ||
488 | table->grio=sopt->grio; | ||
489 | table->initd=0; | ||
490 | return 0; | ||
491 | } | ||
492 | |||
493 | DPRINTK("\n GRED_INIT error!\n"); | ||
494 | return -EINVAL; | ||
495 | } | ||
496 | |||
497 | static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) | ||
498 | { | ||
499 | unsigned long qave; | ||
500 | struct rtattr *rta; | ||
501 | struct tc_gred_qopt *opt = NULL ; | ||
502 | struct tc_gred_qopt *dst; | ||
503 | struct gred_sched *table = qdisc_priv(sch); | ||
504 | struct gred_sched_data *q; | ||
505 | int i; | ||
506 | unsigned char *b = skb->tail; | ||
507 | |||
508 | rta = (struct rtattr*)b; | ||
509 | RTA_PUT(skb, TCA_OPTIONS, 0, NULL); | ||
510 | |||
511 | opt=kmalloc(sizeof(struct tc_gred_qopt)*MAX_DPs, GFP_KERNEL); | ||
512 | |||
513 | if (opt == NULL) { | ||
514 | DPRINTK("gred_dump:failed to malloc for %Zd\n", | ||
515 | sizeof(struct tc_gred_qopt)*MAX_DPs); | ||
516 | goto rtattr_failure; | ||
517 | } | ||
518 | |||
519 | memset(opt, 0, (sizeof(struct tc_gred_qopt))*table->DPs); | ||
520 | |||
521 | if (!table->initd) { | ||
522 | DPRINTK("NO GRED Queues setup!\n"); | ||
523 | } | ||
524 | |||
525 | for (i=0;i<MAX_DPs;i++) { | ||
526 | dst= &opt[i]; | ||
527 | q= table->tab[i]; | ||
528 | |||
529 | if (!q) { | ||
530 | /* hack -- fix at some point with a proper message. | ||
531 | This is how we indicate to tc that there is no VQ | ||
532 | at this DP */ | ||
533 | |||
534 | dst->DP=MAX_DPs+i; | ||
535 | continue; | ||
536 | } | ||
537 | |||
538 | dst->limit=q->limit; | ||
539 | dst->qth_min=q->qth_min>>q->Wlog; | ||
540 | dst->qth_max=q->qth_max>>q->Wlog; | ||
541 | dst->DP=q->DP; | ||
542 | dst->backlog=q->backlog; | ||
543 | if (q->qave) { | ||
544 | if (table->eqp && table->grio) { | ||
545 | q->qidlestart=table->tab[table->def]->qidlestart; | ||
546 | q->qave=table->tab[table->def]->qave; | ||
547 | } | ||
548 | if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { | ||
549 | long idle; | ||
550 | psched_time_t now; | ||
551 | PSCHED_GET_TIME(now); | ||
552 | idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); | ||
553 | qave = q->qave >> q->Stab[(idle>>q->Scell_log)&0xFF]; | ||
554 | dst->qave = qave >> q->Wlog; | ||
555 | |||
556 | } else { | ||
557 | dst->qave = q->qave >> q->Wlog; | ||
558 | } | ||
559 | } else { | ||
560 | dst->qave = 0; | ||
561 | } | ||
562 | |||
563 | |||
564 | dst->Wlog = q->Wlog; | ||
565 | dst->Plog = q->Plog; | ||
566 | dst->Scell_log = q->Scell_log; | ||
567 | dst->other = q->other; | ||
568 | dst->forced = q->forced; | ||
569 | dst->early = q->early; | ||
570 | dst->pdrop = q->pdrop; | ||
571 | dst->prio = q->prio; | ||
572 | dst->packets=q->packetsin; | ||
573 | dst->bytesin=q->bytesin; | ||
574 | } | ||
575 | |||
576 | RTA_PUT(skb, TCA_GRED_PARMS, sizeof(struct tc_gred_qopt)*MAX_DPs, opt); | ||
577 | rta->rta_len = skb->tail - b; | ||
578 | |||
579 | kfree(opt); | ||
580 | return skb->len; | ||
581 | |||
582 | rtattr_failure: | ||
583 | if (opt) | ||
584 | kfree(opt); | ||
585 | DPRINTK("gred_dump: FAILURE!!!!\n"); | ||
586 | |||
587 | /* also free the opt struct here */ | ||
588 | skb_trim(skb, b - skb->data); | ||
589 | return -1; | ||
590 | } | ||
591 | |||
592 | static void gred_destroy(struct Qdisc *sch) | ||
593 | { | ||
594 | struct gred_sched *table = qdisc_priv(sch); | ||
595 | int i; | ||
596 | |||
597 | for (i = 0;i < table->DPs; i++) { | ||
598 | if (table->tab[i]) | ||
599 | kfree(table->tab[i]); | ||
600 | } | ||
601 | } | ||
602 | |||
603 | static struct Qdisc_ops gred_qdisc_ops = { | ||
604 | .next = NULL, | ||
605 | .cl_ops = NULL, | ||
606 | .id = "gred", | ||
607 | .priv_size = sizeof(struct gred_sched), | ||
608 | .enqueue = gred_enqueue, | ||
609 | .dequeue = gred_dequeue, | ||
610 | .requeue = gred_requeue, | ||
611 | .drop = gred_drop, | ||
612 | .init = gred_init, | ||
613 | .reset = gred_reset, | ||
614 | .destroy = gred_destroy, | ||
615 | .change = gred_change, | ||
616 | .dump = gred_dump, | ||
617 | .owner = THIS_MODULE, | ||
618 | }; | ||
619 | |||
620 | static int __init gred_module_init(void) | ||
621 | { | ||
622 | return register_qdisc(&gred_qdisc_ops); | ||
623 | } | ||
624 | static void __exit gred_module_exit(void) | ||
625 | { | ||
626 | unregister_qdisc(&gred_qdisc_ops); | ||
627 | } | ||
628 | module_init(gred_module_init) | ||
629 | module_exit(gred_module_exit) | ||
630 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c new file mode 100644 index 000000000000..c26764bc4103 --- /dev/null +++ b/net/sched/sch_hfsc.c | |||
@@ -0,0 +1,1822 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2003 Patrick McHardy, <kaber@trash.net> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version 2 | ||
7 | * of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * 2003-10-17 - Ported from altq | ||
10 | */ | ||
11 | /* | ||
12 | * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. | ||
13 | * | ||
14 | * Permission to use, copy, modify, and distribute this software and | ||
15 | * its documentation is hereby granted (including for commercial or | ||
16 | * for-profit use), provided that both the copyright notice and this | ||
17 | * permission notice appear in all copies of the software, derivative | ||
18 | * works, or modified versions, and any portions thereof. | ||
19 | * | ||
20 | * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF | ||
21 | * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS | ||
22 | * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED | ||
23 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES | ||
24 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||
25 | * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE | ||
26 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
27 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT | ||
28 | * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR | ||
29 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
30 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
31 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||
32 | * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH | ||
33 | * DAMAGE. | ||
34 | * | ||
35 | * Carnegie Mellon encourages (but does not require) users of this | ||
36 | * software to return any improvements or extensions that they make, | ||
37 | * and to grant Carnegie Mellon the rights to redistribute these | ||
38 | * changes without encumbrance. | ||
39 | */ | ||
40 | /* | ||
41 | * H-FSC is described in Proceedings of SIGCOMM'97, | ||
42 | * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing, | ||
43 | * Real-Time and Priority Service" | ||
44 | * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng. | ||
45 | * | ||
46 | * Oleg Cherevko <olwi@aq.ml.com.ua> added the upperlimit for link-sharing. | ||
47 | * When a class has an upperlimit, the fit-time is computed from the | ||
48 | * upperlimit service curve. The link-sharing scheduler does not schedule | ||
49 | * a class whose fit-time exceeds the current time. | ||
50 | */ | ||
51 | |||
52 | #include <linux/kernel.h> | ||
53 | #include <linux/config.h> | ||
54 | #include <linux/module.h> | ||
55 | #include <linux/types.h> | ||
56 | #include <linux/errno.h> | ||
57 | #include <linux/jiffies.h> | ||
58 | #include <linux/compiler.h> | ||
59 | #include <linux/spinlock.h> | ||
60 | #include <linux/skbuff.h> | ||
61 | #include <linux/string.h> | ||
62 | #include <linux/slab.h> | ||
63 | #include <linux/timer.h> | ||
64 | #include <linux/list.h> | ||
65 | #include <linux/rbtree.h> | ||
66 | #include <linux/init.h> | ||
67 | #include <linux/netdevice.h> | ||
68 | #include <linux/rtnetlink.h> | ||
69 | #include <linux/pkt_sched.h> | ||
70 | #include <net/pkt_sched.h> | ||
71 | #include <net/pkt_cls.h> | ||
72 | #include <asm/system.h> | ||
73 | #include <asm/div64.h> | ||
74 | |||
75 | #define HFSC_DEBUG 1 | ||
76 | |||
77 | /* | ||
78 | * kernel internal service curve representation: | ||
79 | * coordinates are given by 64 bit unsigned integers. | ||
80 | * x-axis: unit is clock count. | ||
81 | * y-axis: unit is byte. | ||
82 | * | ||
83 | * The service curve parameters are converted to the internal | ||
84 | * representation. The slope values are scaled to avoid overflow. | ||
85 | * The inverse slope values as well as the y-projection of the 1st | ||
86 | * segment are kept in order to avoid 64-bit divide operations | ||
87 | * that are expensive on 32-bit architectures. | ||
88 | */ | ||
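/*
 * Worked note (not part of the original file): a service curve is given by
 * (m1, d, m2), i.e. slope m1 for the first d microseconds and slope m2
 * afterwards, so the service guaranteed by time t is
 *
 *	y(t) = m1 * t                   for t <= d
 *	y(t) = m1 * d + m2 * (t - d)    for t >  d
 *
 * sm1/sm2 are these slopes converted to bytes per psched time unit (scaled
 * by SM_SHIFT), and ism1/ism2 are the precomputed inverse slopes used to
 * map bytes back to time without 64-bit divisions.
 */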
89 | |||
90 | struct internal_sc | ||
91 | { | ||
92 | u64 sm1; /* scaled slope of the 1st segment */ | ||
93 | u64 ism1; /* scaled inverse-slope of the 1st segment */ | ||
94 | u64 dx; /* the x-projection of the 1st segment */ | ||
95 | u64 dy; /* the y-projection of the 1st segment */ | ||
96 | u64 sm2; /* scaled slope of the 2nd segment */ | ||
97 | u64 ism2; /* scaled inverse-slope of the 2nd segment */ | ||
98 | }; | ||
99 | |||
100 | /* runtime service curve */ | ||
101 | struct runtime_sc | ||
102 | { | ||
103 | u64 x; /* current starting position on x-axis */ | ||
104 | u64 y; /* current starting position on y-axis */ | ||
105 | u64 sm1; /* scaled slope of the 1st segment */ | ||
106 | u64 ism1; /* scaled inverse-slope of the 1st segment */ | ||
107 | u64 dx; /* the x-projection of the 1st segment */ | ||
108 | u64 dy; /* the y-projection of the 1st segment */ | ||
109 | u64 sm2; /* scaled slope of the 2nd segment */ | ||
110 | u64 ism2; /* scaled inverse-slope of the 2nd segment */ | ||
111 | }; | ||
112 | |||
113 | enum hfsc_class_flags | ||
114 | { | ||
115 | HFSC_RSC = 0x1, | ||
116 | HFSC_FSC = 0x2, | ||
117 | HFSC_USC = 0x4 | ||
118 | }; | ||
119 | |||
120 | struct hfsc_class | ||
121 | { | ||
122 | u32 classid; /* class id */ | ||
123 | unsigned int refcnt; /* usage count */ | ||
124 | |||
125 | struct gnet_stats_basic bstats; | ||
126 | struct gnet_stats_queue qstats; | ||
127 | struct gnet_stats_rate_est rate_est; | ||
128 | spinlock_t *stats_lock; | ||
129 | unsigned int level; /* class level in hierarchy */ | ||
130 | struct tcf_proto *filter_list; /* filter list */ | ||
131 | unsigned int filter_cnt; /* filter count */ | ||
132 | |||
133 | struct hfsc_sched *sched; /* scheduler data */ | ||
134 | struct hfsc_class *cl_parent; /* parent class */ | ||
135 | struct list_head siblings; /* sibling classes */ | ||
136 | struct list_head children; /* child classes */ | ||
137 | struct Qdisc *qdisc; /* leaf qdisc */ | ||
138 | |||
139 | struct rb_node el_node; /* qdisc's eligible tree member */ | ||
140 | struct rb_root vt_tree; /* active children sorted by cl_vt */ | ||
141 | struct rb_node vt_node; /* parent's vt_tree member */ | ||
142 | struct rb_root cf_tree; /* active children sorted by cl_f */ | ||
143 | struct rb_node cf_node; /* parent's cf_heap member */ | ||
144 | struct list_head hlist; /* hash list member */ | ||
145 | struct list_head dlist; /* drop list member */ | ||
146 | |||
147 | u64 cl_total; /* total work in bytes */ | ||
148 | u64 cl_cumul; /* cumulative work in bytes done by | ||
149 | real-time criteria */ | ||
150 | |||
151 | u64 cl_d; /* deadline*/ | ||
152 | u64 cl_e; /* eligible time */ | ||
153 | u64 cl_vt; /* virtual time */ | ||
154 | u64 cl_f; /* time when this class will fit for | ||
155 | link-sharing, max(myf, cfmin) */ | ||
156 | u64 cl_myf; /* my fit-time (calculated from this | ||
157 | class's own upperlimit curve) */ | ||
158 | u64 cl_myfadj; /* my fit-time adjustment (to cancel | ||
159 | history dependence) */ | ||
160 | u64 cl_cfmin; /* earliest children's fit-time (used | ||
161 | with cl_myf to obtain cl_f) */ | ||
162 | u64 cl_cvtmin; /* minimal virtual time among the | ||
163 | children fit for link-sharing | ||
164 | (monotonic within a period) */ | ||
165 | u64 cl_vtadj; /* intra-period cumulative vt | ||
166 | adjustment */ | ||
167 | u64 cl_vtoff; /* inter-period cumulative vt offset */ | ||
168 | u64 cl_cvtmax; /* max child's vt in the last period */ | ||
169 | u64 cl_cvtoff; /* cumulative cvtmax of all periods */ | ||
170 | u64 cl_pcvtoff; /* parent's cvtoff at initialization | ||
171 | time */ | ||
172 | |||
173 | struct internal_sc cl_rsc; /* internal real-time service curve */ | ||
174 | struct internal_sc cl_fsc; /* internal fair service curve */ | ||
175 | struct internal_sc cl_usc; /* internal upperlimit service curve */ | ||
176 | struct runtime_sc cl_deadline; /* deadline curve */ | ||
177 | struct runtime_sc cl_eligible; /* eligible curve */ | ||
178 | struct runtime_sc cl_virtual; /* virtual curve */ | ||
179 | struct runtime_sc cl_ulimit; /* upperlimit curve */ | ||
180 | |||
181 | unsigned long cl_flags; /* which curves are valid */ | ||
182 | unsigned long cl_vtperiod; /* vt period sequence number */ | ||
183 | unsigned long cl_parentperiod;/* parent's vt period sequence number*/ | ||
184 | unsigned long cl_nactive; /* number of active children */ | ||
185 | }; | ||
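/*
 * Descriptive note (not part of the original file): each class carries
 * state for the three HFSC criteria - cl_e/cl_d (eligible and deadline
 * times) drive the real-time criterion, cl_vt (virtual time) drives
 * link-sharing among siblings, and cl_myf/cl_f (fit times) enforce the
 * optional upperlimit on link-sharing service.
 */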
186 | |||
187 | #define HFSC_HSIZE 16 | ||
188 | |||
189 | struct hfsc_sched | ||
190 | { | ||
191 | u16 defcls; /* default class id */ | ||
192 | struct hfsc_class root; /* root class */ | ||
193 | struct list_head clhash[HFSC_HSIZE]; /* class hash */ | ||
194 | struct rb_root eligible; /* eligible tree */ | ||
195 | struct list_head droplist; /* active leaf class list (for | ||
196 | dropping) */ | ||
197 | struct sk_buff_head requeue; /* requeued packet */ | ||
198 | struct timer_list wd_timer; /* watchdog timer */ | ||
199 | }; | ||
200 | |||
201 | /* | ||
202 | * macros | ||
203 | */ | ||
204 | #ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY | ||
205 | #include <linux/time.h> | ||
206 | #undef PSCHED_GET_TIME | ||
207 | #define PSCHED_GET_TIME(stamp) \ | ||
208 | do { \ | ||
209 | struct timeval tv; \ | ||
210 | do_gettimeofday(&tv); \ | ||
211 | (stamp) = 1000000ULL * tv.tv_sec + tv.tv_usec; \ | ||
212 | } while (0) | ||
213 | #endif | ||
214 | |||
215 | #if HFSC_DEBUG | ||
216 | #define ASSERT(cond) \ | ||
217 | do { \ | ||
218 | if (unlikely(!(cond))) \ | ||
219 | printk("assertion %s failed at %s:%i (%s)\n", \ | ||
220 | #cond, __FILE__, __LINE__, __FUNCTION__); \ | ||
221 | } while (0) | ||
222 | #else | ||
223 | #define ASSERT(cond) | ||
224 | #endif /* HFSC_DEBUG */ | ||
225 | |||
226 | #define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */ | ||
227 | |||
228 | |||
229 | /* | ||
230 | * eligible tree holds backlogged classes sorted by their eligible times. | ||
231 | * There is one eligible tree per hfsc instance. | ||
232 | */ | ||
233 | |||
234 | static void | ||
235 | eltree_insert(struct hfsc_class *cl) | ||
236 | { | ||
237 | struct rb_node **p = &cl->sched->eligible.rb_node; | ||
238 | struct rb_node *parent = NULL; | ||
239 | struct hfsc_class *cl1; | ||
240 | |||
241 | while (*p != NULL) { | ||
242 | parent = *p; | ||
243 | cl1 = rb_entry(parent, struct hfsc_class, el_node); | ||
244 | if (cl->cl_e >= cl1->cl_e) | ||
245 | p = &parent->rb_right; | ||
246 | else | ||
247 | p = &parent->rb_left; | ||
248 | } | ||
249 | rb_link_node(&cl->el_node, parent, p); | ||
250 | rb_insert_color(&cl->el_node, &cl->sched->eligible); | ||
251 | } | ||
252 | |||
253 | static inline void | ||
254 | eltree_remove(struct hfsc_class *cl) | ||
255 | { | ||
256 | rb_erase(&cl->el_node, &cl->sched->eligible); | ||
257 | } | ||
258 | |||
259 | static inline void | ||
260 | eltree_update(struct hfsc_class *cl) | ||
261 | { | ||
262 | eltree_remove(cl); | ||
263 | eltree_insert(cl); | ||
264 | } | ||
265 | |||
266 | /* find the class with the minimum deadline among the eligible classes */ | ||
267 | static inline struct hfsc_class * | ||
268 | eltree_get_mindl(struct hfsc_sched *q, u64 cur_time) | ||
269 | { | ||
270 | struct hfsc_class *p, *cl = NULL; | ||
271 | struct rb_node *n; | ||
272 | |||
273 | for (n = rb_first(&q->eligible); n != NULL; n = rb_next(n)) { | ||
274 | p = rb_entry(n, struct hfsc_class, el_node); | ||
275 | if (p->cl_e > cur_time) | ||
276 | break; | ||
277 | if (cl == NULL || p->cl_d < cl->cl_d) | ||
278 | cl = p; | ||
279 | } | ||
280 | return cl; | ||
281 | } | ||
282 | |||
283 | /* find the class with minimum eligible time among the eligible classes */ | ||
284 | static inline struct hfsc_class * | ||
285 | eltree_get_minel(struct hfsc_sched *q) | ||
286 | { | ||
287 | struct rb_node *n; | ||
288 | |||
289 | n = rb_first(&q->eligible); | ||
290 | if (n == NULL) | ||
291 | return NULL; | ||
292 | return rb_entry(n, struct hfsc_class, el_node); | ||
293 | } | ||
294 | |||
295 | /* | ||
296 | * vttree holds backlogged child classes sorted by their virtual | ||
297 | * time. Each intermediate class has one vttree. | ||
298 | */ | ||
299 | static void | ||
300 | vttree_insert(struct hfsc_class *cl) | ||
301 | { | ||
302 | struct rb_node **p = &cl->cl_parent->vt_tree.rb_node; | ||
303 | struct rb_node *parent = NULL; | ||
304 | struct hfsc_class *cl1; | ||
305 | |||
306 | while (*p != NULL) { | ||
307 | parent = *p; | ||
308 | cl1 = rb_entry(parent, struct hfsc_class, vt_node); | ||
309 | if (cl->cl_vt >= cl1->cl_vt) | ||
310 | p = &parent->rb_right; | ||
311 | else | ||
312 | p = &parent->rb_left; | ||
313 | } | ||
314 | rb_link_node(&cl->vt_node, parent, p); | ||
315 | rb_insert_color(&cl->vt_node, &cl->cl_parent->vt_tree); | ||
316 | } | ||
317 | |||
318 | static inline void | ||
319 | vttree_remove(struct hfsc_class *cl) | ||
320 | { | ||
321 | rb_erase(&cl->vt_node, &cl->cl_parent->vt_tree); | ||
322 | } | ||
323 | |||
324 | static inline void | ||
325 | vttree_update(struct hfsc_class *cl) | ||
326 | { | ||
327 | vttree_remove(cl); | ||
328 | vttree_insert(cl); | ||
329 | } | ||
330 | |||
331 | static inline struct hfsc_class * | ||
332 | vttree_firstfit(struct hfsc_class *cl, u64 cur_time) | ||
333 | { | ||
334 | struct hfsc_class *p; | ||
335 | struct rb_node *n; | ||
336 | |||
337 | for (n = rb_first(&cl->vt_tree); n != NULL; n = rb_next(n)) { | ||
338 | p = rb_entry(n, struct hfsc_class, vt_node); | ||
339 | if (p->cl_f <= cur_time) | ||
340 | return p; | ||
341 | } | ||
342 | return NULL; | ||
343 | } | ||
344 | |||
345 | /* | ||
346 | * get the leaf class with the minimum vt in the hierarchy | ||
347 | */ | ||
348 | static struct hfsc_class * | ||
349 | vttree_get_minvt(struct hfsc_class *cl, u64 cur_time) | ||
350 | { | ||
351 | /* if the root class's cfmin is bigger than cur_time, there is nothing to do */ | ||
352 | if (cl->cl_cfmin > cur_time) | ||
353 | return NULL; | ||
354 | |||
355 | while (cl->level > 0) { | ||
356 | cl = vttree_firstfit(cl, cur_time); | ||
357 | if (cl == NULL) | ||
358 | return NULL; | ||
359 | /* | ||
360 | * update parent's cl_cvtmin. | ||
361 | */ | ||
362 | if (cl->cl_parent->cl_cvtmin < cl->cl_vt) | ||
363 | cl->cl_parent->cl_cvtmin = cl->cl_vt; | ||
364 | } | ||
365 | return cl; | ||
366 | } | ||
367 | |||
368 | static void | ||
369 | cftree_insert(struct hfsc_class *cl) | ||
370 | { | ||
371 | struct rb_node **p = &cl->cl_parent->cf_tree.rb_node; | ||
372 | struct rb_node *parent = NULL; | ||
373 | struct hfsc_class *cl1; | ||
374 | |||
375 | while (*p != NULL) { | ||
376 | parent = *p; | ||
377 | cl1 = rb_entry(parent, struct hfsc_class, cf_node); | ||
378 | if (cl->cl_f >= cl1->cl_f) | ||
379 | p = &parent->rb_right; | ||
380 | else | ||
381 | p = &parent->rb_left; | ||
382 | } | ||
383 | rb_link_node(&cl->cf_node, parent, p); | ||
384 | rb_insert_color(&cl->cf_node, &cl->cl_parent->cf_tree); | ||
385 | } | ||
386 | |||
387 | static inline void | ||
388 | cftree_remove(struct hfsc_class *cl) | ||
389 | { | ||
390 | rb_erase(&cl->cf_node, &cl->cl_parent->cf_tree); | ||
391 | } | ||
392 | |||
393 | static inline void | ||
394 | cftree_update(struct hfsc_class *cl) | ||
395 | { | ||
396 | cftree_remove(cl); | ||
397 | cftree_insert(cl); | ||
398 | } | ||
399 | |||
400 | /* | ||
401 | * service curve support functions | ||
402 | * | ||
403 | * external service curve parameters | ||
404 | * m: bps | ||
405 | * d: us | ||
406 | * internal service curve parameters | ||
407 | * sm: (bytes/psched_us) << SM_SHIFT | ||
408 | * ism: (psched_us/byte) << ISM_SHIFT | ||
409 | * dx: psched_us | ||
410 | * | ||
411 | * Clock source resolution (CONFIG_NET_SCH_CLK_*) | ||
412 | * JIFFIES: for 48<=HZ<=1534 resolution is between 0.63us and 1.27us. | ||
413 | * CPU: resolution is between 0.5us and 1us. | ||
414 | * GETTIMEOFDAY: resolution is exactly 1us. | ||
415 | * | ||
416 | * sm and ism are scaled in order to keep effective digits. | ||
417 | * SM_SHIFT and ISM_SHIFT are selected to keep at least 4 effective | ||
418 | * decimal digits, as the following table shows. | ||
419 | * | ||
420 | * Note: We can afford the additional accuracy (altq hfsc keeps at most | ||
421 | * 3 effective digits) because the Linux clock resolution is bounded | ||
422 | * much more tightly. | ||
423 | * | ||
424 | * bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps | ||
425 | * ------------+------------------------------------------------------- | ||
426 | * bytes/0.5us 6.25e-3 62.5e-3 625e-3 6250e-3 62500e-3 | ||
427 | * bytes/us 12.5e-3 125e-3 1250e-3 12500e-3 125000e-3 | ||
428 | * bytes/1.27us 15.875e-3 158.75e-3 1587.5e-3 15875e-3 158750e-3 | ||
429 | * | ||
430 | * 0.5us/byte 160 16 1.6 0.16 0.016 | ||
431 | * us/byte 80 8 0.8 0.08 0.008 | ||
432 | * 1.27us/byte 63 6.3 0.63 0.063 0.0063 | ||
433 | */ | ||
434 | #define SM_SHIFT 20 | ||
435 | #define ISM_SHIFT 18 | ||
436 | |||
437 | #define SM_MASK ((1ULL << SM_SHIFT) - 1) | ||
438 | #define ISM_MASK ((1ULL << ISM_SHIFT) - 1) | ||
439 | |||
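/*
 * Editor's illustrative note (not part of the original source): a worked
 * example of the scaling above, reading the table's units (m in bits/sec,
 * one psched_us taken as 1us).  For m = 1Mbps the external rate is
 * 125000 bytes/sec = 0.125 bytes/us (the 125e-3 table entry), so
 *
 *	sm  = 0.125 bytes/us << SM_SHIFT  = 0.125 * 2^20 = 131072
 *	ism = 8 us/byte << ISM_SHIFT      = 8 * 2^18     = 2097152
 *
 * i.e. even the smallest table entries keep at least 4 effective digits
 * after scaling.
 */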
440 | static inline u64 | ||
441 | seg_x2y(u64 x, u64 sm) | ||
442 | { | ||
443 | u64 y; | ||
444 | |||
445 | /* | ||
446 | * compute | ||
447 | * y = x * sm >> SM_SHIFT | ||
448 | * but split x into its upper and lower bits to avoid overflow | ||
449 | */ | ||
450 | y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT); | ||
451 | return y; | ||
452 | } | ||
453 | |||
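/*
 * Editor's note (identity behind the split above, not part of the original
 * source): writing x = (x >> SM_SHIFT) * 2^SM_SHIFT + (x & SM_MASK),
 *
 *	x * sm >> SM_SHIFT
 *	  = (x >> SM_SHIFT) * sm  +  ((x & SM_MASK) * sm >> SM_SHIFT)
 *
 * The truncated result is the same as for the single product, but each
 * intermediate multiplication now uses a much smaller left operand, so it
 * fits in 64 bits for any realistic sm.
 */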
454 | static inline u64 | ||
455 | seg_y2x(u64 y, u64 ism) | ||
456 | { | ||
457 | u64 x; | ||
458 | |||
459 | if (y == 0) | ||
460 | x = 0; | ||
461 | else if (ism == HT_INFINITY) | ||
462 | x = HT_INFINITY; | ||
463 | else { | ||
464 | x = (y >> ISM_SHIFT) * ism | ||
465 | + (((y & ISM_MASK) * ism) >> ISM_SHIFT); | ||
466 | } | ||
467 | return x; | ||
468 | } | ||
469 | |||
470 | /* Convert m (bps) into sm (bytes/psched us) */ | ||
471 | static u64 | ||
472 | m2sm(u32 m) | ||
473 | { | ||
474 | u64 sm; | ||
475 | |||
476 | sm = ((u64)m << SM_SHIFT); | ||
477 | sm += PSCHED_JIFFIE2US(HZ) - 1; | ||
478 | do_div(sm, PSCHED_JIFFIE2US(HZ)); | ||
479 | return sm; | ||
480 | } | ||
481 | |||
482 | /* convert m (bps) into ism (psched us/byte) */ | ||
483 | static u64 | ||
484 | m2ism(u32 m) | ||
485 | { | ||
486 | u64 ism; | ||
487 | |||
488 | if (m == 0) | ||
489 | ism = HT_INFINITY; | ||
490 | else { | ||
491 | ism = ((u64)PSCHED_JIFFIE2US(HZ) << ISM_SHIFT); | ||
492 | ism += m - 1; | ||
493 | do_div(ism, m); | ||
494 | } | ||
495 | return ism; | ||
496 | } | ||
497 | |||
498 | /* convert d (us) into dx (psched us) */ | ||
499 | static u64 | ||
500 | d2dx(u32 d) | ||
501 | { | ||
502 | u64 dx; | ||
503 | |||
504 | dx = ((u64)d * PSCHED_JIFFIE2US(HZ)); | ||
505 | dx += 1000000 - 1; | ||
506 | do_div(dx, 1000000); | ||
507 | return dx; | ||
508 | } | ||
509 | |||
510 | /* convert sm (bytes/psched us) into m (bps) */ | ||
511 | static u32 | ||
512 | sm2m(u64 sm) | ||
513 | { | ||
514 | u64 m; | ||
515 | |||
516 | m = (sm * PSCHED_JIFFIE2US(HZ)) >> SM_SHIFT; | ||
517 | return (u32)m; | ||
518 | } | ||
519 | |||
520 | /* convert dx (psched us) into d (us) */ | ||
521 | static u32 | ||
522 | dx2d(u64 dx) | ||
523 | { | ||
524 | u64 d; | ||
525 | |||
526 | d = dx * 1000000; | ||
527 | do_div(d, PSCHED_JIFFIE2US(HZ)); | ||
528 | return (u32)d; | ||
529 | } | ||
530 | |||
531 | static void | ||
532 | sc2isc(struct tc_service_curve *sc, struct internal_sc *isc) | ||
533 | { | ||
534 | isc->sm1 = m2sm(sc->m1); | ||
535 | isc->ism1 = m2ism(sc->m1); | ||
536 | isc->dx = d2dx(sc->d); | ||
537 | isc->dy = seg_x2y(isc->dx, isc->sm1); | ||
538 | isc->sm2 = m2sm(sc->m2); | ||
539 | isc->ism2 = m2ism(sc->m2); | ||
540 | } | ||
541 | |||
542 | /* | ||
543 | * initialize the runtime service curve with the given internal | ||
544 | * service curve starting at (x, y). | ||
545 | */ | ||
546 | static void | ||
547 | rtsc_init(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y) | ||
548 | { | ||
549 | rtsc->x = x; | ||
550 | rtsc->y = y; | ||
551 | rtsc->sm1 = isc->sm1; | ||
552 | rtsc->ism1 = isc->ism1; | ||
553 | rtsc->dx = isc->dx; | ||
554 | rtsc->dy = isc->dy; | ||
555 | rtsc->sm2 = isc->sm2; | ||
556 | rtsc->ism2 = isc->ism2; | ||
557 | } | ||
558 | |||
559 | /* | ||
560 | * calculate the x-projection of the runtime service curve for the | ||
561 | * given y-projection value | ||
562 | */ | ||
563 | static u64 | ||
564 | rtsc_y2x(struct runtime_sc *rtsc, u64 y) | ||
565 | { | ||
566 | u64 x; | ||
567 | |||
568 | if (y < rtsc->y) | ||
569 | x = rtsc->x; | ||
570 | else if (y <= rtsc->y + rtsc->dy) { | ||
571 | /* x belongs to the 1st segment */ | ||
572 | if (rtsc->dy == 0) | ||
573 | x = rtsc->x + rtsc->dx; | ||
574 | else | ||
575 | x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1); | ||
576 | } else { | ||
577 | /* x belongs to the 2nd segment */ | ||
578 | x = rtsc->x + rtsc->dx | ||
579 | + seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2); | ||
580 | } | ||
581 | return x; | ||
582 | } | ||
583 | |||
584 | static u64 | ||
585 | rtsc_x2y(struct runtime_sc *rtsc, u64 x) | ||
586 | { | ||
587 | u64 y; | ||
588 | |||
589 | if (x <= rtsc->x) | ||
590 | y = rtsc->y; | ||
591 | else if (x <= rtsc->x + rtsc->dx) | ||
592 | /* y belongs to the 1st segment */ | ||
593 | y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1); | ||
594 | else | ||
595 | /* y belongs to the 2nd segment */ | ||
596 | y = rtsc->y + rtsc->dy | ||
597 | + seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2); | ||
598 | return y; | ||
599 | } | ||
600 | |||
601 | /* | ||
602 | * update the runtime service curve by taking the minimum of the current | ||
603 | * runtime service curve and the service curve starting at (x, y). | ||
604 | */ | ||
605 | static void | ||
606 | rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y) | ||
607 | { | ||
608 | u64 y1, y2, dx, dy; | ||
609 | u32 dsm; | ||
610 | |||
611 | if (isc->sm1 <= isc->sm2) { | ||
612 | /* service curve is convex */ | ||
613 | y1 = rtsc_x2y(rtsc, x); | ||
614 | if (y1 < y) | ||
615 | /* the current rtsc is smaller */ | ||
616 | return; | ||
617 | rtsc->x = x; | ||
618 | rtsc->y = y; | ||
619 | return; | ||
620 | } | ||
621 | |||
622 | /* | ||
623 | * service curve is concave | ||
624 | * compute the two y values of the current rtsc | ||
625 | * y1: at x | ||
626 | * y2: at (x + dx) | ||
627 | */ | ||
628 | y1 = rtsc_x2y(rtsc, x); | ||
629 | if (y1 <= y) { | ||
630 | /* rtsc is below isc, no change to rtsc */ | ||
631 | return; | ||
632 | } | ||
633 | |||
634 | y2 = rtsc_x2y(rtsc, x + isc->dx); | ||
635 | if (y2 >= y + isc->dy) { | ||
636 | /* rtsc is above isc, replace rtsc by isc */ | ||
637 | rtsc->x = x; | ||
638 | rtsc->y = y; | ||
639 | rtsc->dx = isc->dx; | ||
640 | rtsc->dy = isc->dy; | ||
641 | return; | ||
642 | } | ||
643 | |||
644 | /* | ||
645 | * the two curves intersect | ||
646 | * compute the offsets (dx, dy) using the reverse | ||
647 | * function of seg_x2y() | ||
648 | * seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y) | ||
649 | */ | ||
650 | dx = (y1 - y) << SM_SHIFT; | ||
651 | dsm = isc->sm1 - isc->sm2; | ||
652 | do_div(dx, dsm); | ||
653 | /* | ||
654 | * check if (x, y1) belongs to the 1st segment of rtsc. | ||
655 | * if so, add the offset. | ||
656 | */ | ||
657 | if (rtsc->x + rtsc->dx > x) | ||
658 | dx += rtsc->x + rtsc->dx - x; | ||
659 | dy = seg_x2y(dx, isc->sm1); | ||
660 | |||
661 | rtsc->x = x; | ||
662 | rtsc->y = y; | ||
663 | rtsc->dx = dx; | ||
664 | rtsc->dy = dy; | ||
665 | return; | ||
666 | } | ||
667 | |||
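/*
 * Editor's derivation sketch (not part of the original source): in the
 * crossing case of rtsc_min() above, and ignoring fixed-point truncation,
 * the existing rtsc runs with slope sm2 (the correction for the case where
 * (x, y1) is still on rtsc's first segment is applied afterwards in the
 * code) and is y1 - y ahead at x, while the new curve rises with slope
 * sm1 > sm2.  They meet after an offset dx where
 *
 *	sm1 * dx = sm2 * dx + (y1 - y)
 *	dx = (y1 - y) / (sm1 - sm2)
 *
 * which, in the SM_SHIFT fixed-point representation, is exactly
 * dx = ((y1 - y) << SM_SHIFT) / dsm as computed in the code.
 */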
668 | static void | ||
669 | init_ed(struct hfsc_class *cl, unsigned int next_len) | ||
670 | { | ||
671 | u64 cur_time; | ||
672 | |||
673 | PSCHED_GET_TIME(cur_time); | ||
674 | |||
675 | /* update the deadline curve */ | ||
676 | rtsc_min(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); | ||
677 | |||
678 | /* | ||
679 | * update the eligible curve. | ||
680 | * for concave, it is equal to the deadline curve. | ||
681 | * for convex, it is a linear curve with slope m2. | ||
682 | */ | ||
683 | cl->cl_eligible = cl->cl_deadline; | ||
684 | if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) { | ||
685 | cl->cl_eligible.dx = 0; | ||
686 | cl->cl_eligible.dy = 0; | ||
687 | } | ||
688 | |||
689 | /* compute e and d */ | ||
690 | cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); | ||
691 | cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); | ||
692 | |||
693 | eltree_insert(cl); | ||
694 | } | ||
695 | |||
696 | static void | ||
697 | update_ed(struct hfsc_class *cl, unsigned int next_len) | ||
698 | { | ||
699 | cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); | ||
700 | cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); | ||
701 | |||
702 | eltree_update(cl); | ||
703 | } | ||
704 | |||
705 | static inline void | ||
706 | update_d(struct hfsc_class *cl, unsigned int next_len) | ||
707 | { | ||
708 | cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); | ||
709 | } | ||
710 | |||
711 | static inline void | ||
712 | update_cfmin(struct hfsc_class *cl) | ||
713 | { | ||
714 | struct rb_node *n = rb_first(&cl->cf_tree); | ||
715 | struct hfsc_class *p; | ||
716 | |||
717 | if (n == NULL) { | ||
718 | cl->cl_cfmin = 0; | ||
719 | return; | ||
720 | } | ||
721 | p = rb_entry(n, struct hfsc_class, cf_node); | ||
722 | cl->cl_cfmin = p->cl_f; | ||
723 | } | ||
724 | |||
725 | static void | ||
726 | init_vf(struct hfsc_class *cl, unsigned int len) | ||
727 | { | ||
728 | struct hfsc_class *max_cl; | ||
729 | struct rb_node *n; | ||
730 | u64 vt, f, cur_time; | ||
731 | int go_active; | ||
732 | |||
733 | cur_time = 0; | ||
734 | go_active = 1; | ||
735 | for (; cl->cl_parent != NULL; cl = cl->cl_parent) { | ||
736 | if (go_active && cl->cl_nactive++ == 0) | ||
737 | go_active = 1; | ||
738 | else | ||
739 | go_active = 0; | ||
740 | |||
741 | if (go_active) { | ||
742 | n = rb_last(&cl->cl_parent->vt_tree); | ||
743 | if (n != NULL) { | ||
744 | max_cl = rb_entry(n, struct hfsc_class,vt_node); | ||
745 | /* | ||
746 | * set vt to the average of the min and max | ||
747 | * classes. if the parent's period didn't | ||
748 | * change, don't decrease vt of the class. | ||
749 | */ | ||
750 | vt = max_cl->cl_vt; | ||
751 | if (cl->cl_parent->cl_cvtmin != 0) | ||
752 | vt = (cl->cl_parent->cl_cvtmin + vt)/2; | ||
753 | |||
754 | if (cl->cl_parent->cl_vtperiod != | ||
755 | cl->cl_parentperiod || vt > cl->cl_vt) | ||
756 | cl->cl_vt = vt; | ||
757 | } else { | ||
758 | /* | ||
759 | * first child for a new parent backlog period. | ||
760 | * add parent's cvtmax to cvtoff to make a new | ||
761 | * vt (vtoff + vt) larger than the vt in the | ||
762 | * last period for all children. | ||
763 | */ | ||
764 | vt = cl->cl_parent->cl_cvtmax; | ||
765 | cl->cl_parent->cl_cvtoff += vt; | ||
766 | cl->cl_parent->cl_cvtmax = 0; | ||
767 | cl->cl_parent->cl_cvtmin = 0; | ||
768 | cl->cl_vt = 0; | ||
769 | } | ||
770 | |||
771 | cl->cl_vtoff = cl->cl_parent->cl_cvtoff - | ||
772 | cl->cl_pcvtoff; | ||
773 | |||
774 | /* update the virtual curve */ | ||
775 | vt = cl->cl_vt + cl->cl_vtoff; | ||
776 | rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt, | ||
777 | cl->cl_total); | ||
778 | if (cl->cl_virtual.x == vt) { | ||
779 | cl->cl_virtual.x -= cl->cl_vtoff; | ||
780 | cl->cl_vtoff = 0; | ||
781 | } | ||
782 | cl->cl_vtadj = 0; | ||
783 | |||
784 | cl->cl_vtperiod++; /* increment vt period */ | ||
785 | cl->cl_parentperiod = cl->cl_parent->cl_vtperiod; | ||
786 | if (cl->cl_parent->cl_nactive == 0) | ||
787 | cl->cl_parentperiod++; | ||
788 | cl->cl_f = 0; | ||
789 | |||
790 | vttree_insert(cl); | ||
791 | cftree_insert(cl); | ||
792 | |||
793 | if (cl->cl_flags & HFSC_USC) { | ||
794 | /* class has upper limit curve */ | ||
795 | if (cur_time == 0) | ||
796 | PSCHED_GET_TIME(cur_time); | ||
797 | |||
798 | /* update the ulimit curve */ | ||
799 | rtsc_min(&cl->cl_ulimit, &cl->cl_usc, cur_time, | ||
800 | cl->cl_total); | ||
801 | /* compute myf */ | ||
802 | cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, | ||
803 | cl->cl_total); | ||
804 | cl->cl_myfadj = 0; | ||
805 | } | ||
806 | } | ||
807 | |||
808 | f = max(cl->cl_myf, cl->cl_cfmin); | ||
809 | if (f != cl->cl_f) { | ||
810 | cl->cl_f = f; | ||
811 | cftree_update(cl); | ||
812 | update_cfmin(cl->cl_parent); | ||
813 | } | ||
814 | } | ||
815 | } | ||
816 | |||
817 | static void | ||
818 | update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) | ||
819 | { | ||
820 | u64 f; /* , myf_bound, delta; */ | ||
821 | int go_passive = 0; | ||
822 | |||
823 | if (cl->qdisc->q.qlen == 0 && cl->cl_flags & HFSC_FSC) | ||
824 | go_passive = 1; | ||
825 | |||
826 | for (; cl->cl_parent != NULL; cl = cl->cl_parent) { | ||
827 | cl->cl_total += len; | ||
828 | |||
829 | if (!(cl->cl_flags & HFSC_FSC) || cl->cl_nactive == 0) | ||
830 | continue; | ||
831 | |||
832 | if (go_passive && --cl->cl_nactive == 0) | ||
833 | go_passive = 1; | ||
834 | else | ||
835 | go_passive = 0; | ||
836 | |||
837 | if (go_passive) { | ||
838 | /* no more active child, going passive */ | ||
839 | |||
840 | /* update cvtmax of the parent class */ | ||
841 | if (cl->cl_vt > cl->cl_parent->cl_cvtmax) | ||
842 | cl->cl_parent->cl_cvtmax = cl->cl_vt; | ||
843 | |||
844 | /* remove this class from the vt tree */ | ||
845 | vttree_remove(cl); | ||
846 | |||
847 | cftree_remove(cl); | ||
848 | update_cfmin(cl->cl_parent); | ||
849 | |||
850 | continue; | ||
851 | } | ||
852 | |||
853 | /* | ||
854 | * update vt and f | ||
855 | */ | ||
856 | cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) | ||
857 | - cl->cl_vtoff + cl->cl_vtadj; | ||
858 | |||
859 | /* | ||
860 | * if vt of the class is smaller than cvtmin, | ||
861 | * the class was skipped in the past due to non-fit. | ||
862 | * if so, we need to adjust vtadj. | ||
863 | */ | ||
864 | if (cl->cl_vt < cl->cl_parent->cl_cvtmin) { | ||
865 | cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt; | ||
866 | cl->cl_vt = cl->cl_parent->cl_cvtmin; | ||
867 | } | ||
868 | |||
869 | /* update the vt tree */ | ||
870 | vttree_update(cl); | ||
871 | |||
872 | if (cl->cl_flags & HFSC_USC) { | ||
873 | cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit, | ||
874 | cl->cl_total); | ||
875 | #if 0 | ||
876 | /* | ||
877 | * This code causes classes to stay way under their | ||
878 | * limit when multiple classes are used at gigabit | ||
879 | * speed. needs investigation. -kaber | ||
880 | */ | ||
881 | /* | ||
882 | * if myf lags behind by more than one clock tick | ||
883 | * from the current time, adjust myfadj to prevent | ||
884 | * a rate-limited class from going greedy. | ||
885 | * in a steady state under rate-limiting, myf | ||
886 | * fluctuates within one clock tick. | ||
887 | */ | ||
888 | myf_bound = cur_time - PSCHED_JIFFIE2US(1); | ||
889 | if (cl->cl_myf < myf_bound) { | ||
890 | delta = cur_time - cl->cl_myf; | ||
891 | cl->cl_myfadj += delta; | ||
892 | cl->cl_myf += delta; | ||
893 | } | ||
894 | #endif | ||
895 | } | ||
896 | |||
897 | f = max(cl->cl_myf, cl->cl_cfmin); | ||
898 | if (f != cl->cl_f) { | ||
899 | cl->cl_f = f; | ||
900 | cftree_update(cl); | ||
901 | update_cfmin(cl->cl_parent); | ||
902 | } | ||
903 | } | ||
904 | } | ||
905 | |||
906 | static void | ||
907 | set_active(struct hfsc_class *cl, unsigned int len) | ||
908 | { | ||
909 | if (cl->cl_flags & HFSC_RSC) | ||
910 | init_ed(cl, len); | ||
911 | if (cl->cl_flags & HFSC_FSC) | ||
912 | init_vf(cl, len); | ||
913 | |||
914 | list_add_tail(&cl->dlist, &cl->sched->droplist); | ||
915 | } | ||
916 | |||
917 | static void | ||
918 | set_passive(struct hfsc_class *cl) | ||
919 | { | ||
920 | if (cl->cl_flags & HFSC_RSC) | ||
921 | eltree_remove(cl); | ||
922 | |||
923 | list_del(&cl->dlist); | ||
924 | |||
925 | /* | ||
926 | * vttree is now handled in update_vf() so that update_vf(cl, 0, 0) | ||
927 | * needs to be called explicitly to remove a class from vttree. | ||
928 | */ | ||
929 | } | ||
930 | |||
931 | /* | ||
932 | * hack to get length of first packet in queue. | ||
933 | */ | ||
934 | static unsigned int | ||
935 | qdisc_peek_len(struct Qdisc *sch) | ||
936 | { | ||
937 | struct sk_buff *skb; | ||
938 | unsigned int len; | ||
939 | |||
940 | skb = sch->dequeue(sch); | ||
941 | if (skb == NULL) { | ||
942 | if (net_ratelimit()) | ||
943 | printk("qdisc_peek_len: non work-conserving qdisc ?\n"); | ||
944 | return 0; | ||
945 | } | ||
946 | len = skb->len; | ||
947 | if (unlikely(sch->ops->requeue(skb, sch) != NET_XMIT_SUCCESS)) { | ||
948 | if (net_ratelimit()) | ||
949 | printk("qdisc_peek_len: failed to requeue\n"); | ||
950 | return 0; | ||
951 | } | ||
952 | return len; | ||
953 | } | ||
954 | |||
955 | static void | ||
956 | hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl) | ||
957 | { | ||
958 | unsigned int len = cl->qdisc->q.qlen; | ||
959 | |||
960 | qdisc_reset(cl->qdisc); | ||
961 | if (len > 0) { | ||
962 | update_vf(cl, 0, 0); | ||
963 | set_passive(cl); | ||
964 | sch->q.qlen -= len; | ||
965 | } | ||
966 | } | ||
967 | |||
968 | static void | ||
969 | hfsc_adjust_levels(struct hfsc_class *cl) | ||
970 | { | ||
971 | struct hfsc_class *p; | ||
972 | unsigned int level; | ||
973 | |||
974 | do { | ||
975 | level = 0; | ||
976 | list_for_each_entry(p, &cl->children, siblings) { | ||
977 | if (p->level > level) | ||
978 | level = p->level; | ||
979 | } | ||
980 | cl->level = level + 1; | ||
981 | } while ((cl = cl->cl_parent) != NULL); | ||
982 | } | ||
983 | |||
984 | static inline unsigned int | ||
985 | hfsc_hash(u32 h) | ||
986 | { | ||
987 | h ^= h >> 8; | ||
988 | h ^= h >> 4; | ||
989 | |||
990 | return h & (HFSC_HSIZE - 1); | ||
991 | } | ||
992 | |||
993 | static inline struct hfsc_class * | ||
994 | hfsc_find_class(u32 classid, struct Qdisc *sch) | ||
995 | { | ||
996 | struct hfsc_sched *q = qdisc_priv(sch); | ||
997 | struct hfsc_class *cl; | ||
998 | |||
999 | list_for_each_entry(cl, &q->clhash[hfsc_hash(classid)], hlist) { | ||
1000 | if (cl->classid == classid) | ||
1001 | return cl; | ||
1002 | } | ||
1003 | return NULL; | ||
1004 | } | ||
1005 | |||
1006 | static void | ||
1007 | hfsc_change_rsc(struct hfsc_class *cl, struct tc_service_curve *rsc, | ||
1008 | u64 cur_time) | ||
1009 | { | ||
1010 | sc2isc(rsc, &cl->cl_rsc); | ||
1011 | rtsc_init(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); | ||
1012 | cl->cl_eligible = cl->cl_deadline; | ||
1013 | if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) { | ||
1014 | cl->cl_eligible.dx = 0; | ||
1015 | cl->cl_eligible.dy = 0; | ||
1016 | } | ||
1017 | cl->cl_flags |= HFSC_RSC; | ||
1018 | } | ||
1019 | |||
1020 | static void | ||
1021 | hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc) | ||
1022 | { | ||
1023 | sc2isc(fsc, &cl->cl_fsc); | ||
1024 | rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total); | ||
1025 | cl->cl_flags |= HFSC_FSC; | ||
1026 | } | ||
1027 | |||
1028 | static void | ||
1029 | hfsc_change_usc(struct hfsc_class *cl, struct tc_service_curve *usc, | ||
1030 | u64 cur_time) | ||
1031 | { | ||
1032 | sc2isc(usc, &cl->cl_usc); | ||
1033 | rtsc_init(&cl->cl_ulimit, &cl->cl_usc, cur_time, cl->cl_total); | ||
1034 | cl->cl_flags |= HFSC_USC; | ||
1035 | } | ||
1036 | |||
1037 | static int | ||
1038 | hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, | ||
1039 | struct rtattr **tca, unsigned long *arg) | ||
1040 | { | ||
1041 | struct hfsc_sched *q = qdisc_priv(sch); | ||
1042 | struct hfsc_class *cl = (struct hfsc_class *)*arg; | ||
1043 | struct hfsc_class *parent = NULL; | ||
1044 | struct rtattr *opt = tca[TCA_OPTIONS-1]; | ||
1045 | struct rtattr *tb[TCA_HFSC_MAX]; | ||
1046 | struct tc_service_curve *rsc = NULL, *fsc = NULL, *usc = NULL; | ||
1047 | u64 cur_time; | ||
1048 | |||
1049 | if (opt == NULL || rtattr_parse_nested(tb, TCA_HFSC_MAX, opt)) | ||
1050 | return -EINVAL; | ||
1051 | |||
1052 | if (tb[TCA_HFSC_RSC-1]) { | ||
1053 | if (RTA_PAYLOAD(tb[TCA_HFSC_RSC-1]) < sizeof(*rsc)) | ||
1054 | return -EINVAL; | ||
1055 | rsc = RTA_DATA(tb[TCA_HFSC_RSC-1]); | ||
1056 | if (rsc->m1 == 0 && rsc->m2 == 0) | ||
1057 | rsc = NULL; | ||
1058 | } | ||
1059 | |||
1060 | if (tb[TCA_HFSC_FSC-1]) { | ||
1061 | if (RTA_PAYLOAD(tb[TCA_HFSC_FSC-1]) < sizeof(*fsc)) | ||
1062 | return -EINVAL; | ||
1063 | fsc = RTA_DATA(tb[TCA_HFSC_FSC-1]); | ||
1064 | if (fsc->m1 == 0 && fsc->m2 == 0) | ||
1065 | fsc = NULL; | ||
1066 | } | ||
1067 | |||
1068 | if (tb[TCA_HFSC_USC-1]) { | ||
1069 | if (RTA_PAYLOAD(tb[TCA_HFSC_USC-1]) < sizeof(*usc)) | ||
1070 | return -EINVAL; | ||
1071 | usc = RTA_DATA(tb[TCA_HFSC_USC-1]); | ||
1072 | if (usc->m1 == 0 && usc->m2 == 0) | ||
1073 | usc = NULL; | ||
1074 | } | ||
1075 | |||
1076 | if (cl != NULL) { | ||
1077 | if (parentid) { | ||
1078 | if (cl->cl_parent && cl->cl_parent->classid != parentid) | ||
1079 | return -EINVAL; | ||
1080 | if (cl->cl_parent == NULL && parentid != TC_H_ROOT) | ||
1081 | return -EINVAL; | ||
1082 | } | ||
1083 | PSCHED_GET_TIME(cur_time); | ||
1084 | |||
1085 | sch_tree_lock(sch); | ||
1086 | if (rsc != NULL) | ||
1087 | hfsc_change_rsc(cl, rsc, cur_time); | ||
1088 | if (fsc != NULL) | ||
1089 | hfsc_change_fsc(cl, fsc); | ||
1090 | if (usc != NULL) | ||
1091 | hfsc_change_usc(cl, usc, cur_time); | ||
1092 | |||
1093 | if (cl->qdisc->q.qlen != 0) { | ||
1094 | if (cl->cl_flags & HFSC_RSC) | ||
1095 | update_ed(cl, qdisc_peek_len(cl->qdisc)); | ||
1096 | if (cl->cl_flags & HFSC_FSC) | ||
1097 | update_vf(cl, 0, cur_time); | ||
1098 | } | ||
1099 | sch_tree_unlock(sch); | ||
1100 | |||
1101 | #ifdef CONFIG_NET_ESTIMATOR | ||
1102 | if (tca[TCA_RATE-1]) | ||
1103 | gen_replace_estimator(&cl->bstats, &cl->rate_est, | ||
1104 | cl->stats_lock, tca[TCA_RATE-1]); | ||
1105 | #endif | ||
1106 | return 0; | ||
1107 | } | ||
1108 | |||
1109 | if (parentid == TC_H_ROOT) | ||
1110 | return -EEXIST; | ||
1111 | |||
1112 | parent = &q->root; | ||
1113 | if (parentid) { | ||
1114 | parent = hfsc_find_class(parentid, sch); | ||
1115 | if (parent == NULL) | ||
1116 | return -ENOENT; | ||
1117 | } | ||
1118 | |||
1119 | if (classid == 0 || TC_H_MAJ(classid ^ sch->handle) != 0) | ||
1120 | return -EINVAL; | ||
1121 | if (hfsc_find_class(classid, sch)) | ||
1122 | return -EEXIST; | ||
1123 | |||
1124 | if (rsc == NULL && fsc == NULL) | ||
1125 | return -EINVAL; | ||
1126 | |||
1127 | cl = kmalloc(sizeof(struct hfsc_class), GFP_KERNEL); | ||
1128 | if (cl == NULL) | ||
1129 | return -ENOBUFS; | ||
1130 | memset(cl, 0, sizeof(struct hfsc_class)); | ||
1131 | |||
1132 | if (rsc != NULL) | ||
1133 | hfsc_change_rsc(cl, rsc, 0); | ||
1134 | if (fsc != NULL) | ||
1135 | hfsc_change_fsc(cl, fsc); | ||
1136 | if (usc != NULL) | ||
1137 | hfsc_change_usc(cl, usc, 0); | ||
1138 | |||
1139 | cl->refcnt = 1; | ||
1140 | cl->classid = classid; | ||
1141 | cl->sched = q; | ||
1142 | cl->cl_parent = parent; | ||
1143 | cl->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); | ||
1144 | if (cl->qdisc == NULL) | ||
1145 | cl->qdisc = &noop_qdisc; | ||
1146 | cl->stats_lock = &sch->dev->queue_lock; | ||
1147 | INIT_LIST_HEAD(&cl->children); | ||
1148 | cl->vt_tree = RB_ROOT; | ||
1149 | cl->cf_tree = RB_ROOT; | ||
1150 | |||
1151 | sch_tree_lock(sch); | ||
1152 | list_add_tail(&cl->hlist, &q->clhash[hfsc_hash(classid)]); | ||
1153 | list_add_tail(&cl->siblings, &parent->children); | ||
1154 | if (parent->level == 0) | ||
1155 | hfsc_purge_queue(sch, parent); | ||
1156 | hfsc_adjust_levels(parent); | ||
1157 | cl->cl_pcvtoff = parent->cl_cvtoff; | ||
1158 | sch_tree_unlock(sch); | ||
1159 | |||
1160 | #ifdef CONFIG_NET_ESTIMATOR | ||
1161 | if (tca[TCA_RATE-1]) | ||
1162 | gen_new_estimator(&cl->bstats, &cl->rate_est, | ||
1163 | cl->stats_lock, tca[TCA_RATE-1]); | ||
1164 | #endif | ||
1165 | *arg = (unsigned long)cl; | ||
1166 | return 0; | ||
1167 | } | ||
1168 | |||
1169 | static void | ||
1170 | hfsc_destroy_filters(struct tcf_proto **fl) | ||
1171 | { | ||
1172 | struct tcf_proto *tp; | ||
1173 | |||
1174 | while ((tp = *fl) != NULL) { | ||
1175 | *fl = tp->next; | ||
1176 | tcf_destroy(tp); | ||
1177 | } | ||
1178 | } | ||
1179 | |||
1180 | static void | ||
1181 | hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl) | ||
1182 | { | ||
1183 | struct hfsc_sched *q = qdisc_priv(sch); | ||
1184 | |||
1185 | hfsc_destroy_filters(&cl->filter_list); | ||
1186 | qdisc_destroy(cl->qdisc); | ||
1187 | #ifdef CONFIG_NET_ESTIMATOR | ||
1188 | gen_kill_estimator(&cl->bstats, &cl->rate_est); | ||
1189 | #endif | ||
1190 | if (cl != &q->root) | ||
1191 | kfree(cl); | ||
1192 | } | ||
1193 | |||
1194 | static int | ||
1195 | hfsc_delete_class(struct Qdisc *sch, unsigned long arg) | ||
1196 | { | ||
1197 | struct hfsc_sched *q = qdisc_priv(sch); | ||
1198 | struct hfsc_class *cl = (struct hfsc_class *)arg; | ||
1199 | |||
1200 | if (cl->level > 0 || cl->filter_cnt > 0 || cl == &q->root) | ||
1201 | return -EBUSY; | ||
1202 | |||
1203 | sch_tree_lock(sch); | ||
1204 | |||
1205 | list_del(&cl->hlist); | ||
1206 | list_del(&cl->siblings); | ||
1207 | hfsc_adjust_levels(cl->cl_parent); | ||
1208 | hfsc_purge_queue(sch, cl); | ||
1209 | if (--cl->refcnt == 0) | ||
1210 | hfsc_destroy_class(sch, cl); | ||
1211 | |||
1212 | sch_tree_unlock(sch); | ||
1213 | return 0; | ||
1214 | } | ||
1215 | |||
1216 | static struct hfsc_class * | ||
1217 | hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) | ||
1218 | { | ||
1219 | struct hfsc_sched *q = qdisc_priv(sch); | ||
1220 | struct hfsc_class *cl; | ||
1221 | struct tcf_result res; | ||
1222 | struct tcf_proto *tcf; | ||
1223 | int result; | ||
1224 | |||
1225 | if (TC_H_MAJ(skb->priority ^ sch->handle) == 0 && | ||
1226 | (cl = hfsc_find_class(skb->priority, sch)) != NULL) | ||
1227 | if (cl->level == 0) | ||
1228 | return cl; | ||
1229 | |||
1230 | *qerr = NET_XMIT_DROP; | ||
1231 | tcf = q->root.filter_list; | ||
1232 | while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { | ||
1233 | #ifdef CONFIG_NET_CLS_ACT | ||
1234 | switch (result) { | ||
1235 | case TC_ACT_QUEUED: | ||
1236 | case TC_ACT_STOLEN: | ||
1237 | *qerr = NET_XMIT_SUCCESS; | ||
1238 | case TC_ACT_SHOT: | ||
1239 | return NULL; | ||
1240 | } | ||
1241 | #elif defined(CONFIG_NET_CLS_POLICE) | ||
1242 | if (result == TC_POLICE_SHOT) | ||
1243 | return NULL; | ||
1244 | #endif | ||
1245 | if ((cl = (struct hfsc_class *)res.class) == NULL) { | ||
1246 | if ((cl = hfsc_find_class(res.classid, sch)) == NULL) | ||
1247 | break; /* filter selected invalid classid */ | ||
1248 | } | ||
1249 | |||
1250 | if (cl->level == 0) | ||
1251 | return cl; /* hit leaf class */ | ||
1252 | |||
1253 | /* apply inner filter chain */ | ||
1254 | tcf = cl->filter_list; | ||
1255 | } | ||
1256 | |||
1257 | /* classification failed, try default class */ | ||
1258 | cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch); | ||
1259 | if (cl == NULL || cl->level > 0) | ||
1260 | return NULL; | ||
1261 | |||
1262 | return cl; | ||
1263 | } | ||
1264 | |||
1265 | static int | ||
1266 | hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, | ||
1267 | struct Qdisc **old) | ||
1268 | { | ||
1269 | struct hfsc_class *cl = (struct hfsc_class *)arg; | ||
1270 | |||
1271 | if (cl == NULL) | ||
1272 | return -ENOENT; | ||
1273 | if (cl->level > 0) | ||
1274 | return -EINVAL; | ||
1275 | if (new == NULL) { | ||
1276 | new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); | ||
1277 | if (new == NULL) | ||
1278 | new = &noop_qdisc; | ||
1279 | } | ||
1280 | |||
1281 | sch_tree_lock(sch); | ||
1282 | hfsc_purge_queue(sch, cl); | ||
1283 | *old = xchg(&cl->qdisc, new); | ||
1284 | sch_tree_unlock(sch); | ||
1285 | return 0; | ||
1286 | } | ||
1287 | |||
1288 | static struct Qdisc * | ||
1289 | hfsc_class_leaf(struct Qdisc *sch, unsigned long arg) | ||
1290 | { | ||
1291 | struct hfsc_class *cl = (struct hfsc_class *)arg; | ||
1292 | |||
1293 | if (cl != NULL && cl->level == 0) | ||
1294 | return cl->qdisc; | ||
1295 | |||
1296 | return NULL; | ||
1297 | } | ||
1298 | |||
1299 | static unsigned long | ||
1300 | hfsc_get_class(struct Qdisc *sch, u32 classid) | ||
1301 | { | ||
1302 | struct hfsc_class *cl = hfsc_find_class(classid, sch); | ||
1303 | |||
1304 | if (cl != NULL) | ||
1305 | cl->refcnt++; | ||
1306 | |||
1307 | return (unsigned long)cl; | ||
1308 | } | ||
1309 | |||
1310 | static void | ||
1311 | hfsc_put_class(struct Qdisc *sch, unsigned long arg) | ||
1312 | { | ||
1313 | struct hfsc_class *cl = (struct hfsc_class *)arg; | ||
1314 | |||
1315 | if (--cl->refcnt == 0) | ||
1316 | hfsc_destroy_class(sch, cl); | ||
1317 | } | ||
1318 | |||
1319 | static unsigned long | ||
1320 | hfsc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid) | ||
1321 | { | ||
1322 | struct hfsc_class *p = (struct hfsc_class *)parent; | ||
1323 | struct hfsc_class *cl = hfsc_find_class(classid, sch); | ||
1324 | |||
1325 | if (cl != NULL) { | ||
1326 | if (p != NULL && p->level <= cl->level) | ||
1327 | return 0; | ||
1328 | cl->filter_cnt++; | ||
1329 | } | ||
1330 | |||
1331 | return (unsigned long)cl; | ||
1332 | } | ||
1333 | |||
1334 | static void | ||
1335 | hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg) | ||
1336 | { | ||
1337 | struct hfsc_class *cl = (struct hfsc_class *)arg; | ||
1338 | |||
1339 | cl->filter_cnt--; | ||
1340 | } | ||
1341 | |||
1342 | static struct tcf_proto ** | ||
1343 | hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg) | ||
1344 | { | ||
1345 | struct hfsc_sched *q = qdisc_priv(sch); | ||
1346 | struct hfsc_class *cl = (struct hfsc_class *)arg; | ||
1347 | |||
1348 | if (cl == NULL) | ||
1349 | cl = &q->root; | ||
1350 | |||
1351 | return &cl->filter_list; | ||
1352 | } | ||
1353 | |||
1354 | static int | ||
1355 | hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc) | ||
1356 | { | ||
1357 | struct tc_service_curve tsc; | ||
1358 | |||
1359 | tsc.m1 = sm2m(sc->sm1); | ||
1360 | tsc.d = dx2d(sc->dx); | ||
1361 | tsc.m2 = sm2m(sc->sm2); | ||
1362 | RTA_PUT(skb, attr, sizeof(tsc), &tsc); | ||
1363 | |||
1364 | return skb->len; | ||
1365 | |||
1366 | rtattr_failure: | ||
1367 | return -1; | ||
1368 | } | ||
1369 | |||
1370 | static inline int | ||
1371 | hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl) | ||
1372 | { | ||
1373 | if ((cl->cl_flags & HFSC_RSC) && | ||
1374 | (hfsc_dump_sc(skb, TCA_HFSC_RSC, &cl->cl_rsc) < 0)) | ||
1375 | goto rtattr_failure; | ||
1376 | |||
1377 | if ((cl->cl_flags & HFSC_FSC) && | ||
1378 | (hfsc_dump_sc(skb, TCA_HFSC_FSC, &cl->cl_fsc) < 0)) | ||
1379 | goto rtattr_failure; | ||
1380 | |||
1381 | if ((cl->cl_flags & HFSC_USC) && | ||
1382 | (hfsc_dump_sc(skb, TCA_HFSC_USC, &cl->cl_usc) < 0)) | ||
1383 | goto rtattr_failure; | ||
1384 | |||
1385 | return skb->len; | ||
1386 | |||
1387 | rtattr_failure: | ||
1388 | return -1; | ||
1389 | } | ||
1390 | |||
1391 | static int | ||
1392 | hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, | ||
1393 | struct tcmsg *tcm) | ||
1394 | { | ||
1395 | struct hfsc_class *cl = (struct hfsc_class *)arg; | ||
1396 | unsigned char *b = skb->tail; | ||
1397 | struct rtattr *rta = (struct rtattr *)b; | ||
1398 | |||
1399 | tcm->tcm_parent = cl->cl_parent ? cl->cl_parent->classid : TC_H_ROOT; | ||
1400 | tcm->tcm_handle = cl->classid; | ||
1401 | if (cl->level == 0) | ||
1402 | tcm->tcm_info = cl->qdisc->handle; | ||
1403 | |||
1404 | RTA_PUT(skb, TCA_OPTIONS, 0, NULL); | ||
1405 | if (hfsc_dump_curves(skb, cl) < 0) | ||
1406 | goto rtattr_failure; | ||
1407 | rta->rta_len = skb->tail - b; | ||
1408 | return skb->len; | ||
1409 | |||
1410 | rtattr_failure: | ||
1411 | skb_trim(skb, b - skb->data); | ||
1412 | return -1; | ||
1413 | } | ||
1414 | |||
1415 | static int | ||
1416 | hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, | ||
1417 | struct gnet_dump *d) | ||
1418 | { | ||
1419 | struct hfsc_class *cl = (struct hfsc_class *)arg; | ||
1420 | struct tc_hfsc_stats xstats; | ||
1421 | |||
1422 | cl->qstats.qlen = cl->qdisc->q.qlen; | ||
1423 | xstats.level = cl->level; | ||
1424 | xstats.period = cl->cl_vtperiod; | ||
1425 | xstats.work = cl->cl_total; | ||
1426 | xstats.rtwork = cl->cl_cumul; | ||
1427 | |||
1428 | if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || | ||
1429 | #ifdef CONFIG_NET_ESTIMATOR | ||
1430 | gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || | ||
1431 | #endif | ||
1432 | gnet_stats_copy_queue(d, &cl->qstats) < 0) | ||
1433 | return -1; | ||
1434 | |||
1435 | return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); | ||
1436 | } | ||
1437 | |||
1438 | |||
1439 | |||
1440 | static void | ||
1441 | hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) | ||
1442 | { | ||
1443 | struct hfsc_sched *q = qdisc_priv(sch); | ||
1444 | struct hfsc_class *cl; | ||
1445 | unsigned int i; | ||
1446 | |||
1447 | if (arg->stop) | ||
1448 | return; | ||
1449 | |||
1450 | for (i = 0; i < HFSC_HSIZE; i++) { | ||
1451 | list_for_each_entry(cl, &q->clhash[i], hlist) { | ||
1452 | if (arg->count < arg->skip) { | ||
1453 | arg->count++; | ||
1454 | continue; | ||
1455 | } | ||
1456 | if (arg->fn(sch, (unsigned long)cl, arg) < 0) { | ||
1457 | arg->stop = 1; | ||
1458 | return; | ||
1459 | } | ||
1460 | arg->count++; | ||
1461 | } | ||
1462 | } | ||
1463 | } | ||
1464 | |||
1465 | static void | ||
1466 | hfsc_watchdog(unsigned long arg) | ||
1467 | { | ||
1468 | struct Qdisc *sch = (struct Qdisc *)arg; | ||
1469 | |||
1470 | sch->flags &= ~TCQ_F_THROTTLED; | ||
1471 | netif_schedule(sch->dev); | ||
1472 | } | ||
1473 | |||
1474 | static void | ||
1475 | hfsc_schedule_watchdog(struct Qdisc *sch, u64 cur_time) | ||
1476 | { | ||
1477 | struct hfsc_sched *q = qdisc_priv(sch); | ||
1478 | struct hfsc_class *cl; | ||
1479 | u64 next_time = 0; | ||
1480 | long delay; | ||
1481 | |||
1482 | if ((cl = eltree_get_minel(q)) != NULL) | ||
1483 | next_time = cl->cl_e; | ||
1484 | if (q->root.cl_cfmin != 0) { | ||
1485 | if (next_time == 0 || next_time > q->root.cl_cfmin) | ||
1486 | next_time = q->root.cl_cfmin; | ||
1487 | } | ||
1488 | ASSERT(next_time != 0); | ||
1489 | delay = next_time - cur_time; | ||
1490 | delay = PSCHED_US2JIFFIE(delay); | ||
1491 | |||
1492 | sch->flags |= TCQ_F_THROTTLED; | ||
1493 | mod_timer(&q->wd_timer, jiffies + delay); | ||
1494 | } | ||
1495 | |||
1496 | static int | ||
1497 | hfsc_init_qdisc(struct Qdisc *sch, struct rtattr *opt) | ||
1498 | { | ||
1499 | struct hfsc_sched *q = qdisc_priv(sch); | ||
1500 | struct tc_hfsc_qopt *qopt; | ||
1501 | unsigned int i; | ||
1502 | |||
1503 | if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) | ||
1504 | return -EINVAL; | ||
1505 | qopt = RTA_DATA(opt); | ||
1506 | |||
1507 | sch->stats_lock = &sch->dev->queue_lock; | ||
1508 | |||
1509 | q->defcls = qopt->defcls; | ||
1510 | for (i = 0; i < HFSC_HSIZE; i++) | ||
1511 | INIT_LIST_HEAD(&q->clhash[i]); | ||
1512 | q->eligible = RB_ROOT; | ||
1513 | INIT_LIST_HEAD(&q->droplist); | ||
1514 | skb_queue_head_init(&q->requeue); | ||
1515 | |||
1516 | q->root.refcnt = 1; | ||
1517 | q->root.classid = sch->handle; | ||
1518 | q->root.sched = q; | ||
1519 | q->root.qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); | ||
1520 | if (q->root.qdisc == NULL) | ||
1521 | q->root.qdisc = &noop_qdisc; | ||
1522 | q->root.stats_lock = &sch->dev->queue_lock; | ||
1523 | INIT_LIST_HEAD(&q->root.children); | ||
1524 | q->root.vt_tree = RB_ROOT; | ||
1525 | q->root.cf_tree = RB_ROOT; | ||
1526 | |||
1527 | list_add(&q->root.hlist, &q->clhash[hfsc_hash(q->root.classid)]); | ||
1528 | |||
1529 | init_timer(&q->wd_timer); | ||
1530 | q->wd_timer.function = hfsc_watchdog; | ||
1531 | q->wd_timer.data = (unsigned long)sch; | ||
1532 | |||
1533 | return 0; | ||
1534 | } | ||
1535 | |||
1536 | static int | ||
1537 | hfsc_change_qdisc(struct Qdisc *sch, struct rtattr *opt) | ||
1538 | { | ||
1539 | struct hfsc_sched *q = qdisc_priv(sch); | ||
1540 | struct tc_hfsc_qopt *qopt; | ||
1541 | |||
1542 | if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) | ||
1543 | return -EINVAL; | ||
1544 | qopt = RTA_DATA(opt); | ||
1545 | |||
1546 | sch_tree_lock(sch); | ||
1547 | q->defcls = qopt->defcls; | ||
1548 | sch_tree_unlock(sch); | ||
1549 | |||
1550 | return 0; | ||
1551 | } | ||
1552 | |||
1553 | static void | ||
1554 | hfsc_reset_class(struct hfsc_class *cl) | ||
1555 | { | ||
1556 | cl->cl_total = 0; | ||
1557 | cl->cl_cumul = 0; | ||
1558 | cl->cl_d = 0; | ||
1559 | cl->cl_e = 0; | ||
1560 | cl->cl_vt = 0; | ||
1561 | cl->cl_vtadj = 0; | ||
1562 | cl->cl_vtoff = 0; | ||
1563 | cl->cl_cvtmin = 0; | ||
1564 | cl->cl_cvtmax = 0; | ||
1565 | cl->cl_cvtoff = 0; | ||
1566 | cl->cl_pcvtoff = 0; | ||
1567 | cl->cl_vtperiod = 0; | ||
1568 | cl->cl_parentperiod = 0; | ||
1569 | cl->cl_f = 0; | ||
1570 | cl->cl_myf = 0; | ||
1571 | cl->cl_myfadj = 0; | ||
1572 | cl->cl_cfmin = 0; | ||
1573 | cl->cl_nactive = 0; | ||
1574 | |||
1575 | cl->vt_tree = RB_ROOT; | ||
1576 | cl->cf_tree = RB_ROOT; | ||
1577 | qdisc_reset(cl->qdisc); | ||
1578 | |||
1579 | if (cl->cl_flags & HFSC_RSC) | ||
1580 | rtsc_init(&cl->cl_deadline, &cl->cl_rsc, 0, 0); | ||
1581 | if (cl->cl_flags & HFSC_FSC) | ||
1582 | rtsc_init(&cl->cl_virtual, &cl->cl_fsc, 0, 0); | ||
1583 | if (cl->cl_flags & HFSC_USC) | ||
1584 | rtsc_init(&cl->cl_ulimit, &cl->cl_usc, 0, 0); | ||
1585 | } | ||
1586 | |||
1587 | static void | ||
1588 | hfsc_reset_qdisc(struct Qdisc *sch) | ||
1589 | { | ||
1590 | struct hfsc_sched *q = qdisc_priv(sch); | ||
1591 | struct hfsc_class *cl; | ||
1592 | unsigned int i; | ||
1593 | |||
1594 | for (i = 0; i < HFSC_HSIZE; i++) { | ||
1595 | list_for_each_entry(cl, &q->clhash[i], hlist) | ||
1596 | hfsc_reset_class(cl); | ||
1597 | } | ||
1598 | __skb_queue_purge(&q->requeue); | ||
1599 | q->eligible = RB_ROOT; | ||
1600 | INIT_LIST_HEAD(&q->droplist); | ||
1601 | del_timer(&q->wd_timer); | ||
1602 | sch->flags &= ~TCQ_F_THROTTLED; | ||
1603 | sch->q.qlen = 0; | ||
1604 | } | ||
1605 | |||
1606 | static void | ||
1607 | hfsc_destroy_qdisc(struct Qdisc *sch) | ||
1608 | { | ||
1609 | struct hfsc_sched *q = qdisc_priv(sch); | ||
1610 | struct hfsc_class *cl, *next; | ||
1611 | unsigned int i; | ||
1612 | |||
1613 | for (i = 0; i < HFSC_HSIZE; i++) { | ||
1614 | list_for_each_entry_safe(cl, next, &q->clhash[i], hlist) | ||
1615 | hfsc_destroy_class(sch, cl); | ||
1616 | } | ||
1617 | __skb_queue_purge(&q->requeue); | ||
1618 | del_timer(&q->wd_timer); | ||
1619 | } | ||
1620 | |||
1621 | static int | ||
1622 | hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) | ||
1623 | { | ||
1624 | struct hfsc_sched *q = qdisc_priv(sch); | ||
1625 | unsigned char *b = skb->tail; | ||
1626 | struct tc_hfsc_qopt qopt; | ||
1627 | |||
1628 | qopt.defcls = q->defcls; | ||
1629 | RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); | ||
1630 | return skb->len; | ||
1631 | |||
1632 | rtattr_failure: | ||
1633 | skb_trim(skb, b - skb->data); | ||
1634 | return -1; | ||
1635 | } | ||
1636 | |||
1637 | static int | ||
1638 | hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) | ||
1639 | { | ||
1640 | struct hfsc_class *cl; | ||
1641 | unsigned int len; | ||
1642 | int err; | ||
1643 | |||
1644 | cl = hfsc_classify(skb, sch, &err); | ||
1645 | if (cl == NULL) { | ||
1646 | if (err == NET_XMIT_DROP) | ||
1647 | sch->qstats.drops++; | ||
1648 | kfree_skb(skb); | ||
1649 | return err; | ||
1650 | } | ||
1651 | |||
1652 | len = skb->len; | ||
1653 | err = cl->qdisc->enqueue(skb, cl->qdisc); | ||
1654 | if (unlikely(err != NET_XMIT_SUCCESS)) { | ||
1655 | cl->qstats.drops++; | ||
1656 | sch->qstats.drops++; | ||
1657 | return err; | ||
1658 | } | ||
1659 | |||
1660 | if (cl->qdisc->q.qlen == 1) | ||
1661 | set_active(cl, len); | ||
1662 | |||
1663 | cl->bstats.packets++; | ||
1664 | cl->bstats.bytes += len; | ||
1665 | sch->bstats.packets++; | ||
1666 | sch->bstats.bytes += len; | ||
1667 | sch->q.qlen++; | ||
1668 | |||
1669 | return NET_XMIT_SUCCESS; | ||
1670 | } | ||
1671 | |||
1672 | static struct sk_buff * | ||
1673 | hfsc_dequeue(struct Qdisc *sch) | ||
1674 | { | ||
1675 | struct hfsc_sched *q = qdisc_priv(sch); | ||
1676 | struct hfsc_class *cl; | ||
1677 | struct sk_buff *skb; | ||
1678 | u64 cur_time; | ||
1679 | unsigned int next_len; | ||
1680 | int realtime = 0; | ||
1681 | |||
1682 | if (sch->q.qlen == 0) | ||
1683 | return NULL; | ||
1684 | if ((skb = __skb_dequeue(&q->requeue))) | ||
1685 | goto out; | ||
1686 | |||
1687 | PSCHED_GET_TIME(cur_time); | ||
1688 | |||
1689 | /* | ||
1690 | * if there are eligible classes, use real-time criteria. | ||
1691 | * find the class with the minimum deadline among | ||
1692 | * the eligible classes. | ||
1693 | */ | ||
1694 | if ((cl = eltree_get_mindl(q, cur_time)) != NULL) { | ||
1695 | realtime = 1; | ||
1696 | } else { | ||
1697 | /* | ||
1698 | * use link-sharing criteria | ||
1699 | * get the class with the minimum vt in the hierarchy | ||
1700 | */ | ||
1701 | cl = vttree_get_minvt(&q->root, cur_time); | ||
1702 | if (cl == NULL) { | ||
1703 | sch->qstats.overlimits++; | ||
1704 | hfsc_schedule_watchdog(sch, cur_time); | ||
1705 | return NULL; | ||
1706 | } | ||
1707 | } | ||
1708 | |||
1709 | skb = cl->qdisc->dequeue(cl->qdisc); | ||
1710 | if (skb == NULL) { | ||
1711 | if (net_ratelimit()) | ||
1712 | printk("HFSC: Non-work-conserving qdisc ?\n"); | ||
1713 | return NULL; | ||
1714 | } | ||
1715 | |||
1716 | update_vf(cl, skb->len, cur_time); | ||
1717 | if (realtime) | ||
1718 | cl->cl_cumul += skb->len; | ||
1719 | |||
1720 | if (cl->qdisc->q.qlen != 0) { | ||
1721 | if (cl->cl_flags & HFSC_RSC) { | ||
1722 | /* update ed */ | ||
1723 | next_len = qdisc_peek_len(cl->qdisc); | ||
1724 | if (realtime) | ||
1725 | update_ed(cl, next_len); | ||
1726 | else | ||
1727 | update_d(cl, next_len); | ||
1728 | } | ||
1729 | } else { | ||
1730 | /* the class becomes passive */ | ||
1731 | set_passive(cl); | ||
1732 | } | ||
1733 | |||
1734 | out: | ||
1735 | sch->flags &= ~TCQ_F_THROTTLED; | ||
1736 | sch->q.qlen--; | ||
1737 | |||
1738 | return skb; | ||
1739 | } | ||
1740 | |||
1741 | static int | ||
1742 | hfsc_requeue(struct sk_buff *skb, struct Qdisc *sch) | ||
1743 | { | ||
1744 | struct hfsc_sched *q = qdisc_priv(sch); | ||
1745 | |||
1746 | __skb_queue_head(&q->requeue, skb); | ||
1747 | sch->q.qlen++; | ||
1748 | sch->qstats.requeues++; | ||
1749 | return NET_XMIT_SUCCESS; | ||
1750 | } | ||
1751 | |||
1752 | static unsigned int | ||
1753 | hfsc_drop(struct Qdisc *sch) | ||
1754 | { | ||
1755 | struct hfsc_sched *q = qdisc_priv(sch); | ||
1756 | struct hfsc_class *cl; | ||
1757 | unsigned int len; | ||
1758 | |||
1759 | list_for_each_entry(cl, &q->droplist, dlist) { | ||
1760 | if (cl->qdisc->ops->drop != NULL && | ||
1761 | (len = cl->qdisc->ops->drop(cl->qdisc)) > 0) { | ||
1762 | if (cl->qdisc->q.qlen == 0) { | ||
1763 | update_vf(cl, 0, 0); | ||
1764 | set_passive(cl); | ||
1765 | } else { | ||
1766 | list_move_tail(&cl->dlist, &q->droplist); | ||
1767 | } | ||
1768 | cl->qstats.drops++; | ||
1769 | sch->qstats.drops++; | ||
1770 | sch->q.qlen--; | ||
1771 | return len; | ||
1772 | } | ||
1773 | } | ||
1774 | return 0; | ||
1775 | } | ||
1776 | |||
1777 | static struct Qdisc_class_ops hfsc_class_ops = { | ||
1778 | .change = hfsc_change_class, | ||
1779 | .delete = hfsc_delete_class, | ||
1780 | .graft = hfsc_graft_class, | ||
1781 | .leaf = hfsc_class_leaf, | ||
1782 | .get = hfsc_get_class, | ||
1783 | .put = hfsc_put_class, | ||
1784 | .bind_tcf = hfsc_bind_tcf, | ||
1785 | .unbind_tcf = hfsc_unbind_tcf, | ||
1786 | .tcf_chain = hfsc_tcf_chain, | ||
1787 | .dump = hfsc_dump_class, | ||
1788 | .dump_stats = hfsc_dump_class_stats, | ||
1789 | .walk = hfsc_walk | ||
1790 | }; | ||
1791 | |||
1792 | static struct Qdisc_ops hfsc_qdisc_ops = { | ||
1793 | .id = "hfsc", | ||
1794 | .init = hfsc_init_qdisc, | ||
1795 | .change = hfsc_change_qdisc, | ||
1796 | .reset = hfsc_reset_qdisc, | ||
1797 | .destroy = hfsc_destroy_qdisc, | ||
1798 | .dump = hfsc_dump_qdisc, | ||
1799 | .enqueue = hfsc_enqueue, | ||
1800 | .dequeue = hfsc_dequeue, | ||
1801 | .requeue = hfsc_requeue, | ||
1802 | .drop = hfsc_drop, | ||
1803 | .cl_ops = &hfsc_class_ops, | ||
1804 | .priv_size = sizeof(struct hfsc_sched), | ||
1805 | .owner = THIS_MODULE | ||
1806 | }; | ||
1807 | |||
1808 | static int __init | ||
1809 | hfsc_init(void) | ||
1810 | { | ||
1811 | return register_qdisc(&hfsc_qdisc_ops); | ||
1812 | } | ||
1813 | |||
1814 | static void __exit | ||
1815 | hfsc_cleanup(void) | ||
1816 | { | ||
1817 | unregister_qdisc(&hfsc_qdisc_ops); | ||
1818 | } | ||
1819 | |||
1820 | MODULE_LICENSE("GPL"); | ||
1821 | module_init(hfsc_init); | ||
1822 | module_exit(hfsc_cleanup); | ||
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c new file mode 100644 index 000000000000..a85935e7d53d --- /dev/null +++ b/net/sched/sch_htb.c | |||
@@ -0,0 +1,1759 @@ | |||
1 | /* vim: ts=8 sw=8 | ||
2 | * net/sched/sch_htb.c Hierarchical token bucket, feed tree version | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Martin Devera, <devik@cdi.cz> | ||
10 | * | ||
11 | * Credits (in time order) for older HTB versions: | ||
12 | * Stef Coene <stef.coene@docum.org> | ||
13 | * HTB support at LARTC mailing list | ||
14 | * Ondrej Kraus, <krauso@barr.cz> | ||
15 | * found missing INIT_QDISC(htb) | ||
16 | * Vladimir Smelhaus, Aamer Akhter, Bert Hubert | ||
17 | * helped a lot to locate nasty class stall bug | ||
18 | * Andi Kleen, Jamal Hadi, Bert Hubert | ||
19 | * code review and helpful comments on shaping | ||
20 | * Tomasz Wrona, <tw@eter.tym.pl> | ||
21 | * created test case so that I was able to fix nasty bug | ||
22 | * Wilfried Weissmann | ||
23 | * spotted bug in dequeue code and helped with fix | ||
24 | * Jiri Fojtasek | ||
25 | * fixed requeue routine | ||
26 | * and many others. thanks. | ||
27 | * | ||
28 | * $Id: sch_htb.c,v 1.25 2003/12/07 11:08:25 devik Exp devik $ | ||
29 | */ | ||
30 | #include <linux/config.h> | ||
31 | #include <linux/module.h> | ||
32 | #include <asm/uaccess.h> | ||
33 | #include <asm/system.h> | ||
34 | #include <linux/bitops.h> | ||
35 | #include <linux/types.h> | ||
36 | #include <linux/kernel.h> | ||
37 | #include <linux/sched.h> | ||
38 | #include <linux/string.h> | ||
39 | #include <linux/mm.h> | ||
40 | #include <linux/socket.h> | ||
41 | #include <linux/sockios.h> | ||
42 | #include <linux/in.h> | ||
43 | #include <linux/errno.h> | ||
44 | #include <linux/interrupt.h> | ||
45 | #include <linux/if_ether.h> | ||
46 | #include <linux/inet.h> | ||
47 | #include <linux/netdevice.h> | ||
48 | #include <linux/etherdevice.h> | ||
49 | #include <linux/notifier.h> | ||
50 | #include <net/ip.h> | ||
51 | #include <net/route.h> | ||
52 | #include <linux/skbuff.h> | ||
53 | #include <linux/list.h> | ||
54 | #include <linux/compiler.h> | ||
55 | #include <net/sock.h> | ||
56 | #include <net/pkt_sched.h> | ||
57 | #include <linux/rbtree.h> | ||
58 | |||
59 | /* HTB algorithm. | ||
60 | Author: devik@cdi.cz | ||
61 | ======================================================================== | ||
62 | HTB is like TBF with multiple classes. It is also similar to CBQ because | ||
63 | it allows assigning a priority to each class in the hierarchy. | ||
64 | In fact it is another implementation of Floyd's formal sharing. | ||
65 | |||
66 | Levels: | ||
67 | Each class is assigned a level. Leaves ALWAYS have level 0 and root | ||
68 | classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level | ||
69 | one less than their parent. | ||
70 | */ | ||
71 | |||
72 | #define HTB_HSIZE 16 /* classid hash size */ | ||
73 | #define HTB_EWMAC 2 /* rate average over HTB_EWMAC*HTB_HSIZE sec */ | ||
74 | #undef HTB_DEBUG /* compile debugging support (activated by tc tool) */ | ||
75 | #define HTB_RATECM 1 /* whether to use rate computer */ | ||
76 | #define HTB_HYSTERESIS 1/* whether to use mode hysteresis for speedup */ | ||
77 | #define HTB_QLOCK(S) spin_lock_bh(&(S)->dev->queue_lock) | ||
78 | #define HTB_QUNLOCK(S) spin_unlock_bh(&(S)->dev->queue_lock) | ||
79 | #define HTB_VER 0x30011 /* major must be matched with number supplied by TC as version */ | ||
80 | |||
81 | #if HTB_VER >> 16 != TC_HTB_PROTOVER | ||
82 | #error "Mismatched sch_htb.c and pkt_sch.h" | ||
83 | #endif | ||
84 | |||
85 | /* debugging support; S is subsystem, these are defined: | ||
86 | 0 - netlink messages | ||
87 | 1 - enqueue | ||
88 | 2 - drop & requeue | ||
89 | 3 - dequeue main | ||
90 | 4 - dequeue one prio DRR part | ||
91 | 5 - dequeue class accounting | ||
92 | 6 - class overlimit status computation | ||
93 | 7 - hint tree | ||
94 | 8 - event queue | ||
95 | 10 - rate estimator | ||
96 | 11 - classifier | ||
97 | 12 - fast dequeue cache | ||
98 | |||
99 | L is level; 0 = none, 1 = basic info, 2 = detailed, 3 = full | ||
100 | the q->debug uint32 contains 16 2-bit fields, one per subsystem, starting | ||
101 | from the LSB | ||
102 | */ | ||
103 | #ifdef HTB_DEBUG | ||
104 | #define HTB_DBG_COND(S,L) (((q->debug>>(2*S))&3) >= L) | ||
105 | #define HTB_DBG(S,L,FMT,ARG...) if (HTB_DBG_COND(S,L)) \ | ||
106 | printk(KERN_DEBUG FMT,##ARG) | ||
107 | #define HTB_CHCL(cl) BUG_TRAP((cl)->magic == HTB_CMAGIC) | ||
108 | #define HTB_PASSQ q, | ||
109 | #define HTB_ARGQ struct htb_sched *q, | ||
110 | #define static | ||
111 | #undef __inline__ | ||
112 | #define __inline__ | ||
113 | #undef inline | ||
114 | #define inline | ||
115 | #define HTB_CMAGIC 0xFEFAFEF1 | ||
116 | #define htb_safe_rb_erase(N,R) do { BUG_TRAP((N)->rb_color != -1); \ | ||
117 | if ((N)->rb_color == -1) break; \ | ||
118 | rb_erase(N,R); \ | ||
119 | (N)->rb_color = -1; } while (0) | ||
120 | #else | ||
121 | #define HTB_DBG_COND(S,L) (0) | ||
122 | #define HTB_DBG(S,L,FMT,ARG...) | ||
123 | #define HTB_PASSQ | ||
124 | #define HTB_ARGQ | ||
125 | #define HTB_CHCL(cl) | ||
126 | #define htb_safe_rb_erase(N,R) rb_erase(N,R) | ||
127 | #endif | ||
128 | |||
129 | |||
130 | /* used internally to keep the status of a single class */ | ||
131 | enum htb_cmode { | ||
132 | HTB_CANT_SEND, /* class can't send and can't borrow */ | ||
133 | HTB_MAY_BORROW, /* class can't send but may borrow */ | ||
134 | HTB_CAN_SEND /* class can send */ | ||
135 | }; | ||
136 | |||
137 | /* interior & leaf nodes; props specific to leaves are marked L: */ | ||
138 | struct htb_class | ||
139 | { | ||
140 | #ifdef HTB_DEBUG | ||
141 | unsigned magic; | ||
142 | #endif | ||
143 | /* general class parameters */ | ||
144 | u32 classid; | ||
145 | struct gnet_stats_basic bstats; | ||
146 | struct gnet_stats_queue qstats; | ||
147 | struct gnet_stats_rate_est rate_est; | ||
148 | struct tc_htb_xstats xstats;/* our special stats */ | ||
149 | int refcnt; /* usage count of this class */ | ||
150 | |||
151 | #ifdef HTB_RATECM | ||
152 | /* rate measurement counters */ | ||
153 | unsigned long rate_bytes,sum_bytes; | ||
154 | unsigned long rate_packets,sum_packets; | ||
155 | #endif | ||
156 | |||
157 | /* topology */ | ||
158 | int level; /* our level (see above) */ | ||
159 | struct htb_class *parent; /* parent class */ | ||
160 | struct list_head hlist; /* classid hash list item */ | ||
161 | struct list_head sibling; /* sibling list item */ | ||
162 | struct list_head children; /* children list */ | ||
163 | |||
164 | union { | ||
165 | struct htb_class_leaf { | ||
166 | struct Qdisc *q; | ||
167 | int prio; | ||
168 | int aprio; | ||
169 | int quantum; | ||
170 | int deficit[TC_HTB_MAXDEPTH]; | ||
171 | struct list_head drop_list; | ||
172 | } leaf; | ||
173 | struct htb_class_inner { | ||
174 | struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */ | ||
175 | struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */ | ||
176 | /* When a class changes from state 1->2 and disconnects from | ||
177 | its parent's feed, we lose the ptr value and start from the | ||
178 | first child again. Here we store the classid of the | ||
179 | last valid ptr (used when ptr is NULL). */ | ||
180 | u32 last_ptr_id[TC_HTB_NUMPRIO]; | ||
181 | } inner; | ||
182 | } un; | ||
183 | struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ | ||
184 | struct rb_node pq_node; /* node for event queue */ | ||
185 | unsigned long pq_key; /* the same type as jiffies global */ | ||
186 | |||
187 | int prio_activity; /* for which prios are we active */ | ||
188 | enum htb_cmode cmode; /* current mode of the class */ | ||
189 | |||
190 | /* class attached filters */ | ||
191 | struct tcf_proto *filter_list; | ||
192 | int filter_cnt; | ||
193 | |||
194 | int warned; /* only one warning about non work conserving .. */ | ||
195 | |||
196 | /* token bucket parameters */ | ||
197 | struct qdisc_rate_table *rate; /* rate table of the class itself */ | ||
198 | struct qdisc_rate_table *ceil; /* ceiling rate (limits borrows too) */ | ||
199 | long buffer,cbuffer; /* token bucket depth/rate */ | ||
200 | long mbuffer; /* max wait time */ | ||
201 | long tokens,ctokens; /* current number of tokens */ | ||
202 | psched_time_t t_c; /* checkpoint time */ | ||
203 | }; | ||
204 | |||
205 | /* TODO: maybe compute rate when size is too large .. or drop ? */ | ||
206 | static __inline__ long L2T(struct htb_class *cl,struct qdisc_rate_table *rate, | ||
207 | int size) | ||
208 | { | ||
209 | int slot = size >> rate->rate.cell_log; | ||
210 | if (slot > 255) { | ||
211 | cl->xstats.giants++; | ||
212 | slot = 255; | ||
213 | } | ||
214 | return rate->data[slot]; | ||
215 | } | ||
216 | |||
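The L2T() helper above turns a packet length into transmission time by indexing a 256-slot rate table (supplied by userspace and fetched via qdisc_get_rtab() further down); lengths are bucketed by cell_log and oversized packets are clamped to the last slot and counted as giants. Below is a minimal user-space sketch of the same lookup; build_rate_table(), len_to_ticks() and the simple time-per-byte fill are illustrative stand-ins, not kernel API.

    #include <stdio.h>

    /* Hypothetical stand-in for the kernel's 256-entry rate table:
     * data[slot] holds the time needed to send a packet of roughly
     * (slot << cell_log) bytes at the configured rate. */
    struct toy_rate_table {
            unsigned int cell_log;      /* log2 of bytes per slot */
            unsigned long data[256];    /* time units per slot */
    };

    static void build_rate_table(struct toy_rate_table *t, unsigned int cell_log,
                                 unsigned long time_per_byte)
    {
            int slot;

            t->cell_log = cell_log;
            for (slot = 0; slot < 256; slot++)
                    t->data[slot] = (unsigned long)(slot << cell_log) * time_per_byte;
    }

    /* Same shape as L2T(): shift the size down, clamp giants to slot 255. */
    static unsigned long len_to_ticks(const struct toy_rate_table *t, int size)
    {
            int slot = size >> t->cell_log;

            if (slot > 255)
                    slot = 255;     /* oversized packet: the kernel counts a "giant" here */
            return t->data[slot];
    }

    int main(void)
    {
            static struct toy_rate_table t;

            build_rate_table(&t, 3, 8);     /* 8-byte cells, 8 time units per byte */
            printf("1500-byte packet -> %lu time units\n", len_to_ticks(&t, 1500));
            return 0;
    }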
217 | struct htb_sched | ||
218 | { | ||
219 | struct list_head root; /* root classes list */ | ||
220 | struct list_head hash[HTB_HSIZE]; /* hashed by classid */ | ||
221 | struct list_head drops[TC_HTB_NUMPRIO]; /* active leaves (for drops) */ | ||
222 | |||
223 | /* self list - roots of self generating tree */ | ||
224 | struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; | ||
225 | int row_mask[TC_HTB_MAXDEPTH]; | ||
226 | struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; | ||
227 | u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; | ||
228 | |||
229 | /* self wait list - roots of wait PQs per row */ | ||
230 | struct rb_root wait_pq[TC_HTB_MAXDEPTH]; | ||
231 | |||
232 | /* time of nearest event per level (row) */ | ||
233 | unsigned long near_ev_cache[TC_HTB_MAXDEPTH]; | ||
234 | |||
235 | /* cached value of jiffies in dequeue */ | ||
236 | unsigned long jiffies; | ||
237 | |||
238 | /* whether we hit non-work conserving class during this dequeue; we use */ | ||
239 | int nwc_hit; /* this to disable mindelay complaint in dequeue */ | ||
240 | |||
241 | int defcls; /* class where unclassified flows go to */ | ||
242 | u32 debug; /* subsystem debug levels */ | ||
243 | |||
244 | /* filters for qdisc itself */ | ||
245 | struct tcf_proto *filter_list; | ||
246 | int filter_cnt; | ||
247 | |||
248 | int rate2quantum; /* quant = rate / rate2quantum */ | ||
249 | psched_time_t now; /* cached dequeue time */ | ||
250 | struct timer_list timer; /* send delay timer */ | ||
251 | #ifdef HTB_RATECM | ||
252 | struct timer_list rttim; /* rate computer timer */ | ||
253 | int recmp_bucket; /* which hash bucket to recompute next */ | ||
254 | #endif | ||
255 | |||
256 | /* non shaped skbs; let them go directly thru */ | ||
257 | struct sk_buff_head direct_queue; | ||
258 | int direct_qlen; /* max qlen of above */ | ||
259 | |||
260 | long direct_pkts; | ||
261 | }; | ||
262 | |||
263 | /* compute hash of size HTB_HSIZE for given handle */ | ||
264 | static __inline__ int htb_hash(u32 h) | ||
265 | { | ||
266 | #if HTB_HSIZE != 16 | ||
267 | #error "Declare new hash for your HTB_HSIZE" | ||
268 | #endif | ||
269 | h ^= h>>8; /* stolen from cbq_hash */ | ||
270 | h ^= h>>4; | ||
271 | return h & 0xf; | ||
272 | } | ||
273 | |||
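htb_hash() folds a 32-bit handle down to 4 bits by xor-ing it with shifted copies of itself, which is why the #error trips if HTB_HSIZE is ever changed from 16. A quick user-space check of the fold; toy_htb_hash() and the sample handles are illustrative only.

    #include <stdio.h>
    #include <stdint.h>

    /* Same fold as htb_hash(); only valid for a 16-bucket table. */
    static int toy_htb_hash(uint32_t h)
    {
            h ^= h >> 8;
            h ^= h >> 4;
            return h & 0xf;
    }

    int main(void)
    {
            /* a few example 32-bit handles (major in the high word, minor low) */
            uint32_t ids[3] = { 0x00010001, 0x00010010, 0x00010020 };
            int i;

            for (i = 0; i < 3; i++)
                    printf("%08x -> bucket %d\n", (unsigned)ids[i], toy_htb_hash(ids[i]));
            return 0;
    }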
274 | /* find class in global hash table using given handle */ | ||
275 | static __inline__ struct htb_class *htb_find(u32 handle, struct Qdisc *sch) | ||
276 | { | ||
277 | struct htb_sched *q = qdisc_priv(sch); | ||
278 | struct list_head *p; | ||
279 | if (TC_H_MAJ(handle) != sch->handle) | ||
280 | return NULL; | ||
281 | |||
282 | list_for_each (p,q->hash+htb_hash(handle)) { | ||
283 | struct htb_class *cl = list_entry(p,struct htb_class,hlist); | ||
284 | if (cl->classid == handle) | ||
285 | return cl; | ||
286 | } | ||
287 | return NULL; | ||
288 | } | ||
289 | |||
290 | /** | ||
291 | * htb_classify - classify a packet into class | ||
292 | * | ||
293 | * It returns NULL if the packet should be dropped or -1 if the packet | ||
294 | * should be passed directly thru. In all other cases a leaf class is returned. | ||
295 | * We allow direct class selection by classid in priority. Then we examine | ||
296 | * filters in qdisc and in inner nodes (if higher filter points to the inner | ||
297 | * node). If we end up with classid MAJOR:0 we enqueue the skb into special | ||
298 | * internal fifo (direct). These packets then go directly thru. If we still | ||
299 | * have no valid leaf we try to use the MAJOR:default leaf. If that is still | ||
300 | * unsuccessful we finish and return the direct queue. | ||
301 | */ | ||
302 | #define HTB_DIRECT (struct htb_class*)-1 | ||
303 | static inline u32 htb_classid(struct htb_class *cl) | ||
304 | { | ||
305 | return (cl && cl != HTB_DIRECT) ? cl->classid : TC_H_UNSPEC; | ||
306 | } | ||
307 | |||
308 | static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) | ||
309 | { | ||
310 | struct htb_sched *q = qdisc_priv(sch); | ||
311 | struct htb_class *cl; | ||
312 | struct tcf_result res; | ||
313 | struct tcf_proto *tcf; | ||
314 | int result; | ||
315 | |||
316 | /* allow selecting the class by setting skb->priority to a valid classid; | ||
317 | note that nfmark can be used too by attaching the fw filter with no | ||
318 | rules in it */ | ||
319 | if (skb->priority == sch->handle) | ||
320 | return HTB_DIRECT; /* X:0 (direct flow) selected */ | ||
321 | if ((cl = htb_find(skb->priority,sch)) != NULL && cl->level == 0) | ||
322 | return cl; | ||
323 | |||
324 | *qerr = NET_XMIT_DROP; | ||
325 | tcf = q->filter_list; | ||
326 | while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { | ||
327 | #ifdef CONFIG_NET_CLS_ACT | ||
328 | switch (result) { | ||
329 | case TC_ACT_QUEUED: | ||
330 | case TC_ACT_STOLEN: | ||
331 | *qerr = NET_XMIT_SUCCESS; | ||
332 | case TC_ACT_SHOT: | ||
333 | return NULL; | ||
334 | } | ||
335 | #elif defined(CONFIG_NET_CLS_POLICE) | ||
336 | if (result == TC_POLICE_SHOT) | ||
337 | return HTB_DIRECT; | ||
338 | #endif | ||
339 | if ((cl = (void*)res.class) == NULL) { | ||
340 | if (res.classid == sch->handle) | ||
341 | return HTB_DIRECT; /* X:0 (direct flow) */ | ||
342 | if ((cl = htb_find(res.classid,sch)) == NULL) | ||
343 | break; /* filter selected invalid classid */ | ||
344 | } | ||
345 | if (!cl->level) | ||
346 | return cl; /* we hit leaf; return it */ | ||
347 | |||
348 | /* we have got inner class; apply inner filter chain */ | ||
349 | tcf = cl->filter_list; | ||
350 | } | ||
351 | /* classification failed; try to use default class */ | ||
352 | cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle),q->defcls),sch); | ||
353 | if (!cl || cl->level) | ||
354 | return HTB_DIRECT; /* bad default .. this is safe bet */ | ||
355 | return cl; | ||
356 | } | ||
357 | |||
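Putting the steps of htb_classify() in one place: an skb->priority equal to the qdisc handle goes to the direct queue, a priority naming a leaf class wins outright, otherwise the filter chains are walked (qdisc filters first, then the filters of any inner class they select), and finally the MAJOR:defcls class is tried before falling back to the direct queue. The sketch below mirrors only that decision order; the toy_* names and the flat class table are hypothetical and the real filter walk is elided.

    #include <stddef.h>
    #include <stdio.h>

    /* Toy model: a flat table of classes keyed by classid; is_leaf mirrors
     * cl->level == 0 in the kernel. All names here are illustrative. */
    struct toy_class { unsigned int classid; int is_leaf; };

    static struct toy_class classes[] = {
            { 0x00010001, 0 },      /* inner class            */
            { 0x00010010, 1 },      /* leaf                   */
            { 0x00010020, 1 },      /* leaf used as default   */
    };

    #define TOY_DIRECT ((struct toy_class *)-1)

    static struct toy_class *toy_find(unsigned int classid)
    {
            size_t i;

            for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++)
                    if (classes[i].classid == classid)
                            return &classes[i];
            return NULL;
    }

    /* Decision order of htb_classify(), minus the real filter walk. */
    static struct toy_class *toy_classify(unsigned int skb_priority,
                                          unsigned int qdisc_handle,
                                          unsigned int default_classid)
    {
            struct toy_class *cl;

            if (skb_priority == qdisc_handle)
                    return TOY_DIRECT;              /* X:0 selected explicitly */
            cl = toy_find(skb_priority);
            if (cl && cl->is_leaf)
                    return cl;                      /* priority named a leaf */
            /* ... the real code walks q->filter_list / cl->filter_list here ... */
            cl = toy_find(default_classid);
            if (cl && cl->is_leaf)
                    return cl;                      /* MAJOR:defcls fallback */
            return TOY_DIRECT;                      /* bad default: safe bet */
    }

    int main(void)
    {
            struct toy_class *cl = toy_classify(0x00010010, 0x00010000, 0x00010020);

            printf("selected %s\n", cl == TOY_DIRECT ? "direct" : "leaf");
            return 0;
    }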
358 | #ifdef HTB_DEBUG | ||
359 | static void htb_next_rb_node(struct rb_node **n); | ||
360 | #define HTB_DUMTREE(root,memb) if(root) { \ | ||
361 | struct rb_node *n = (root)->rb_node; \ | ||
362 | while (n->rb_left) n = n->rb_left; \ | ||
363 | while (n) { \ | ||
364 | struct htb_class *cl = rb_entry(n, struct htb_class, memb); \ | ||
365 | printk(" %x",cl->classid); htb_next_rb_node (&n); \ | ||
366 | } } | ||
367 | |||
368 | static void htb_debug_dump (struct htb_sched *q) | ||
369 | { | ||
370 | int i,p; | ||
371 | printk(KERN_DEBUG "htb*g j=%lu lj=%lu\n",jiffies,q->jiffies); | ||
372 | /* rows */ | ||
373 | for (i=TC_HTB_MAXDEPTH-1;i>=0;i--) { | ||
374 | printk(KERN_DEBUG "htb*r%d m=%x",i,q->row_mask[i]); | ||
375 | for (p=0;p<TC_HTB_NUMPRIO;p++) { | ||
376 | if (!q->row[i][p].rb_node) continue; | ||
377 | printk(" p%d:",p); | ||
378 | HTB_DUMTREE(q->row[i]+p,node[p]); | ||
379 | } | ||
380 | printk("\n"); | ||
381 | } | ||
382 | /* classes */ | ||
383 | for (i = 0; i < HTB_HSIZE; i++) { | ||
384 | struct list_head *l; | ||
385 | list_for_each (l,q->hash+i) { | ||
386 | struct htb_class *cl = list_entry(l,struct htb_class,hlist); | ||
387 | long diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer); | ||
388 | printk(KERN_DEBUG "htb*c%x m=%d t=%ld c=%ld pq=%lu df=%ld ql=%d " | ||
389 | "pa=%x f:", | ||
390 | cl->classid,cl->cmode,cl->tokens,cl->ctokens, | ||
391 | cl->pq_node.rb_color==-1?0:cl->pq_key,diff, | ||
392 | cl->level?0:cl->un.leaf.q->q.qlen,cl->prio_activity); | ||
393 | if (cl->level) | ||
394 | for (p=0;p<TC_HTB_NUMPRIO;p++) { | ||
395 | if (!cl->un.inner.feed[p].rb_node) continue; | ||
396 | printk(" p%d a=%x:",p,cl->un.inner.ptr[p]?rb_entry(cl->un.inner.ptr[p], struct htb_class,node[p])->classid:0); | ||
397 | HTB_DUMTREE(cl->un.inner.feed+p,node[p]); | ||
398 | } | ||
399 | printk("\n"); | ||
400 | } | ||
401 | } | ||
402 | } | ||
403 | #endif | ||
404 | /** | ||
405 | * htb_add_to_id_tree - adds class to the round robin list | ||
406 | * | ||
407 | * The routine adds the class to the list (actually a tree) sorted by classid. | ||
408 | * Make sure that the class is not already on such a list for the given prio. | ||
409 | */ | ||
410 | static void htb_add_to_id_tree (HTB_ARGQ struct rb_root *root, | ||
411 | struct htb_class *cl,int prio) | ||
412 | { | ||
413 | struct rb_node **p = &root->rb_node, *parent = NULL; | ||
414 | HTB_DBG(7,3,"htb_add_id_tree cl=%X prio=%d\n",cl->classid,prio); | ||
415 | #ifdef HTB_DEBUG | ||
416 | if (cl->node[prio].rb_color != -1) { BUG_TRAP(0); return; } | ||
417 | HTB_CHCL(cl); | ||
418 | if (*p) { | ||
419 | struct htb_class *x = rb_entry(*p,struct htb_class,node[prio]); | ||
420 | HTB_CHCL(x); | ||
421 | } | ||
422 | #endif | ||
423 | while (*p) { | ||
424 | struct htb_class *c; parent = *p; | ||
425 | c = rb_entry(parent, struct htb_class, node[prio]); | ||
426 | HTB_CHCL(c); | ||
427 | if (cl->classid > c->classid) | ||
428 | p = &parent->rb_right; | ||
429 | else | ||
430 | p = &parent->rb_left; | ||
431 | } | ||
432 | rb_link_node(&cl->node[prio], parent, p); | ||
433 | rb_insert_color(&cl->node[prio], root); | ||
434 | } | ||
435 | |||
436 | /** | ||
437 | * htb_add_to_wait_tree - adds class to the event queue with delay | ||
438 | * | ||
439 | * The class is added to the priority event queue to indicate that the class | ||
440 | * will change its mode when time cl->pq_key (in jiffies) is reached. Make | ||
441 | * sure that the class is not already in the queue. | ||
442 | */ | ||
443 | static void htb_add_to_wait_tree (struct htb_sched *q, | ||
444 | struct htb_class *cl,long delay,int debug_hint) | ||
445 | { | ||
446 | struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL; | ||
447 | HTB_DBG(7,3,"htb_add_wt cl=%X key=%lu\n",cl->classid,cl->pq_key); | ||
448 | #ifdef HTB_DEBUG | ||
449 | if (cl->pq_node.rb_color != -1) { BUG_TRAP(0); return; } | ||
450 | HTB_CHCL(cl); | ||
451 | if ((delay <= 0 || delay > cl->mbuffer) && net_ratelimit()) | ||
452 | printk(KERN_ERR "HTB: suspicious delay in wait_tree d=%ld cl=%X h=%d\n",delay,cl->classid,debug_hint); | ||
453 | #endif | ||
454 | cl->pq_key = q->jiffies + PSCHED_US2JIFFIE(delay); | ||
455 | if (cl->pq_key == q->jiffies) | ||
456 | cl->pq_key++; | ||
457 | |||
458 | /* update the nearest event cache */ | ||
459 | if (time_after(q->near_ev_cache[cl->level], cl->pq_key)) | ||
460 | q->near_ev_cache[cl->level] = cl->pq_key; | ||
461 | |||
462 | while (*p) { | ||
463 | struct htb_class *c; parent = *p; | ||
464 | c = rb_entry(parent, struct htb_class, pq_node); | ||
465 | if (time_after_eq(cl->pq_key, c->pq_key)) | ||
466 | p = &parent->rb_right; | ||
467 | else | ||
468 | p = &parent->rb_left; | ||
469 | } | ||
470 | rb_link_node(&cl->pq_node, parent, p); | ||
471 | rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]); | ||
472 | } | ||
473 | |||
474 | /** | ||
475 | * htb_next_rb_node - finds next node in binary tree | ||
476 | * | ||
477 | * When we are past last key we return NULL. | ||
478 | * Average complexity is 2 steps per call. | ||
479 | */ | ||
480 | static void htb_next_rb_node(struct rb_node **n) | ||
481 | { | ||
482 | *n = rb_next(*n); | ||
483 | } | ||
484 | |||
485 | /** | ||
486 | * htb_add_class_to_row - add class to its row | ||
487 | * | ||
488 | * The class is added to row at priorities marked in mask. | ||
489 | * It does nothing if mask == 0. | ||
490 | */ | ||
491 | static inline void htb_add_class_to_row(struct htb_sched *q, | ||
492 | struct htb_class *cl,int mask) | ||
493 | { | ||
494 | HTB_DBG(7,2,"htb_addrow cl=%X mask=%X rmask=%X\n", | ||
495 | cl->classid,mask,q->row_mask[cl->level]); | ||
496 | HTB_CHCL(cl); | ||
497 | q->row_mask[cl->level] |= mask; | ||
498 | while (mask) { | ||
499 | int prio = ffz(~mask); | ||
500 | mask &= ~(1 << prio); | ||
501 | htb_add_to_id_tree(HTB_PASSQ q->row[cl->level]+prio,cl,prio); | ||
502 | } | ||
503 | } | ||
504 | |||
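This routine and several below iterate a priority bitmask with the idiom prio = ffz(~mask); mask &= ~(1 << prio), i.e. they repeatedly peel off the lowest set bit. A user-space equivalent, assuming the GCC/Clang __builtin_ctz() as a stand-in for the kernel's ffz(~m):

    #include <stdio.h>

    int main(void)
    {
            unsigned int mask = 0x2c;       /* priorities 2, 3 and 5 active */

            while (mask) {
                    int prio = __builtin_ctz(mask); /* ffz(~mask): lowest set bit */

                    mask &= ~(1u << prio);
                    printf("handle prio %d\n", prio);       /* prints 2, 3, 5 in order */
            }
            return 0;
    }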
505 | /** | ||
506 | * htb_remove_class_from_row - removes class from its row | ||
507 | * | ||
508 | * The class is removed from row at priorities marked in mask. | ||
509 | * It does nothing if mask == 0. | ||
510 | */ | ||
511 | static __inline__ void htb_remove_class_from_row(struct htb_sched *q, | ||
512 | struct htb_class *cl,int mask) | ||
513 | { | ||
514 | int m = 0; | ||
515 | HTB_CHCL(cl); | ||
516 | while (mask) { | ||
517 | int prio = ffz(~mask); | ||
518 | mask &= ~(1 << prio); | ||
519 | if (q->ptr[cl->level][prio] == cl->node+prio) | ||
520 | htb_next_rb_node(q->ptr[cl->level]+prio); | ||
521 | htb_safe_rb_erase(cl->node + prio,q->row[cl->level]+prio); | ||
522 | if (!q->row[cl->level][prio].rb_node) | ||
523 | m |= 1 << prio; | ||
524 | } | ||
525 | HTB_DBG(7,2,"htb_delrow cl=%X mask=%X rmask=%X maskdel=%X\n", | ||
526 | cl->classid,mask,q->row_mask[cl->level],m); | ||
527 | q->row_mask[cl->level] &= ~m; | ||
528 | } | ||
529 | |||
530 | /** | ||
531 | * htb_activate_prios - creates the active class's feed chain | ||
532 | * | ||
533 | * The class is connected to its ancestors and/or the appropriate rows | ||
534 | * for the priorities it participates in. cl->cmode must be the new | ||
535 | * (activated) mode. It does nothing if cl->prio_activity == 0. | ||
536 | */ | ||
537 | static void htb_activate_prios(struct htb_sched *q,struct htb_class *cl) | ||
538 | { | ||
539 | struct htb_class *p = cl->parent; | ||
540 | long m,mask = cl->prio_activity; | ||
541 | HTB_DBG(7,2,"htb_act_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode); | ||
542 | HTB_CHCL(cl); | ||
543 | |||
544 | while (cl->cmode == HTB_MAY_BORROW && p && mask) { | ||
545 | HTB_CHCL(p); | ||
546 | m = mask; while (m) { | ||
547 | int prio = ffz(~m); | ||
548 | m &= ~(1 << prio); | ||
549 | |||
550 | if (p->un.inner.feed[prio].rb_node) | ||
551 | /* parent already has its feed in use, so | ||
552 | reset the bit in mask as parent is already ok */ | ||
553 | mask &= ~(1 << prio); | ||
554 | |||
555 | htb_add_to_id_tree(HTB_PASSQ p->un.inner.feed+prio,cl,prio); | ||
556 | } | ||
557 | HTB_DBG(7,3,"htb_act_pr_aft p=%X pact=%X mask=%lX pmode=%d\n", | ||
558 | p->classid,p->prio_activity,mask,p->cmode); | ||
559 | p->prio_activity |= mask; | ||
560 | cl = p; p = cl->parent; | ||
561 | HTB_CHCL(cl); | ||
562 | } | ||
563 | if (cl->cmode == HTB_CAN_SEND && mask) | ||
564 | htb_add_class_to_row(q,cl,mask); | ||
565 | } | ||
566 | |||
567 | /** | ||
568 | * htb_deactivate_prios - removes the class from its feed chain | ||
569 | * | ||
570 | * cl->cmode must represent the old mode (before deactivation). It does | ||
571 | * nothing if cl->prio_activity == 0. The class is removed from all feed | ||
572 | * chains and rows. | ||
573 | */ | ||
574 | static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) | ||
575 | { | ||
576 | struct htb_class *p = cl->parent; | ||
577 | long m,mask = cl->prio_activity; | ||
578 | HTB_DBG(7,2,"htb_deact_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode); | ||
579 | HTB_CHCL(cl); | ||
580 | |||
581 | while (cl->cmode == HTB_MAY_BORROW && p && mask) { | ||
582 | m = mask; mask = 0; | ||
583 | while (m) { | ||
584 | int prio = ffz(~m); | ||
585 | m &= ~(1 << prio); | ||
586 | |||
587 | if (p->un.inner.ptr[prio] == cl->node+prio) { | ||
588 | /* we are removing a child which is pointed to from | ||
589 | the parent's feed - forget the pointer but remember | ||
590 | the classid */ | ||
591 | p->un.inner.last_ptr_id[prio] = cl->classid; | ||
592 | p->un.inner.ptr[prio] = NULL; | ||
593 | } | ||
594 | |||
595 | htb_safe_rb_erase(cl->node + prio,p->un.inner.feed + prio); | ||
596 | |||
597 | if (!p->un.inner.feed[prio].rb_node) | ||
598 | mask |= 1 << prio; | ||
599 | } | ||
600 | HTB_DBG(7,3,"htb_deact_pr_aft p=%X pact=%X mask=%lX pmode=%d\n", | ||
601 | p->classid,p->prio_activity,mask,p->cmode); | ||
602 | p->prio_activity &= ~mask; | ||
603 | cl = p; p = cl->parent; | ||
604 | HTB_CHCL(cl); | ||
605 | } | ||
606 | if (cl->cmode == HTB_CAN_SEND && mask) | ||
607 | htb_remove_class_from_row(q,cl,mask); | ||
608 | } | ||
609 | |||
610 | /** | ||
611 | * htb_class_mode - computes and returns current class mode | ||
612 | * | ||
613 | * It computes cl's mode at time cl->t_c+diff and returns it. If the mode | ||
614 | * is not HTB_CAN_SEND then *diff is set to the time difference from now | ||
615 | * to the time when cl will change its state (used to set cl->pq_key). | ||
616 | * It is also worth noting that the class mode doesn't change simply | ||
617 | * at cl->{c,}tokens == 0 but there can rather be hysteresis in the | ||
618 | * 0 .. -cl->{c,}buffer range. It is meant to limit the number of | ||
619 | * mode transitions per time unit. The speed gain is about 1/6. | ||
620 | */ | ||
621 | static __inline__ enum htb_cmode | ||
622 | htb_class_mode(struct htb_class *cl,long *diff) | ||
623 | { | ||
624 | long toks; | ||
625 | |||
626 | if ((toks = (cl->ctokens + *diff)) < ( | ||
627 | #if HTB_HYSTERESIS | ||
628 | cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : | ||
629 | #endif | ||
630 | 0)) { | ||
631 | *diff = -toks; | ||
632 | return HTB_CANT_SEND; | ||
633 | } | ||
634 | if ((toks = (cl->tokens + *diff)) >= ( | ||
635 | #if HTB_HYSTERESIS | ||
636 | cl->cmode == HTB_CAN_SEND ? -cl->buffer : | ||
637 | #endif | ||
638 | 0)) | ||
639 | return HTB_CAN_SEND; | ||
640 | |||
641 | *diff = -toks; | ||
642 | return HTB_MAY_BORROW; | ||
643 | } | ||
644 | |||
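In words: if even the ceil bucket would be empty the class cannot send at all, if its own rate bucket still has tokens it can send, and otherwise it may only borrow; with HTB_HYSTERESIS the thresholds are relaxed by a bufferful depending on the current mode, so the class does not flip modes on every packet. A small numeric sketch of the non-hysteresis case (toy_mode() and the token values are illustrative):

    #include <stdio.h>

    enum toy_mode { CANT_SEND, MAY_BORROW, CAN_SEND };

    /* Non-hysteresis variant of htb_class_mode(): diff is the time elapsed
     * since the last checkpoint, already scaled to token units. */
    static enum toy_mode toy_mode(long tokens, long ctokens, long diff)
    {
            if (ctokens + diff < 0)
                    return CANT_SEND;       /* even the ceil bucket is exhausted */
            if (tokens + diff >= 0)
                    return CAN_SEND;        /* own rate still has tokens */
            return MAY_BORROW;              /* over rate, under ceil: borrow */
    }

    int main(void)
    {
            printf("%d\n", toy_mode(-300,  500, 100));      /* MAY_BORROW (1) */
            printf("%d\n", toy_mode(-300, -500, 100));      /* CANT_SEND  (0) */
            printf("%d\n", toy_mode( -50,  500, 100));      /* CAN_SEND   (2) */
            return 0;
    }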
645 | /** | ||
646 | * htb_change_class_mode - changes the class's mode | ||
647 | * | ||
648 | * This should be the only way to change a class's mode under normal | ||
649 | * circumstances. The routine will update the feed list linkage, change the | ||
650 | * mode and add the class to the wait event queue if appropriate. The new | ||
651 | * mode should be different from the old one and cl->pq_key has to be valid | ||
652 | * if changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree). | ||
653 | */ | ||
654 | static void | ||
655 | htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff) | ||
656 | { | ||
657 | enum htb_cmode new_mode = htb_class_mode(cl,diff); | ||
658 | |||
659 | HTB_CHCL(cl); | ||
660 | HTB_DBG(7,1,"htb_chging_clmode %d->%d cl=%X\n",cl->cmode,new_mode,cl->classid); | ||
661 | |||
662 | if (new_mode == cl->cmode) | ||
663 | return; | ||
664 | |||
665 | if (cl->prio_activity) { /* not necessary: speed optimization */ | ||
666 | if (cl->cmode != HTB_CANT_SEND) | ||
667 | htb_deactivate_prios(q,cl); | ||
668 | cl->cmode = new_mode; | ||
669 | if (new_mode != HTB_CANT_SEND) | ||
670 | htb_activate_prios(q,cl); | ||
671 | } else | ||
672 | cl->cmode = new_mode; | ||
673 | } | ||
674 | |||
675 | /** | ||
676 | * htb_activate - inserts leaf cl into appropriate active feeds | ||
677 | * | ||
678 | * The routine learns the (new) priority of the leaf and activates the feed | ||
679 | * chain for that prio. It can safely be called on an already active leaf. | ||
680 | * It also adds the leaf to the drop list. | ||
681 | */ | ||
682 | static __inline__ void htb_activate(struct htb_sched *q,struct htb_class *cl) | ||
683 | { | ||
684 | BUG_TRAP(!cl->level && cl->un.leaf.q && cl->un.leaf.q->q.qlen); | ||
685 | HTB_CHCL(cl); | ||
686 | if (!cl->prio_activity) { | ||
687 | cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio); | ||
688 | htb_activate_prios(q,cl); | ||
689 | list_add_tail(&cl->un.leaf.drop_list,q->drops+cl->un.leaf.aprio); | ||
690 | } | ||
691 | } | ||
692 | |||
693 | /** | ||
694 | * htb_deactivate - remove leaf cl from active feeds | ||
695 | * | ||
696 | * Make sure that the leaf is active. In other words it can't be called | ||
697 | * with a non-active leaf. It also removes the class from the drop list. | ||
698 | */ | ||
699 | static __inline__ void | ||
700 | htb_deactivate(struct htb_sched *q,struct htb_class *cl) | ||
701 | { | ||
702 | BUG_TRAP(cl->prio_activity); | ||
703 | HTB_CHCL(cl); | ||
704 | htb_deactivate_prios(q,cl); | ||
705 | cl->prio_activity = 0; | ||
706 | list_del_init(&cl->un.leaf.drop_list); | ||
707 | } | ||
708 | |||
709 | static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) | ||
710 | { | ||
711 | int ret; | ||
712 | struct htb_sched *q = qdisc_priv(sch); | ||
713 | struct htb_class *cl = htb_classify(skb,sch,&ret); | ||
714 | |||
715 | if (cl == HTB_DIRECT) { | ||
716 | /* enqueue to helper queue */ | ||
717 | if (q->direct_queue.qlen < q->direct_qlen) { | ||
718 | __skb_queue_tail(&q->direct_queue, skb); | ||
719 | q->direct_pkts++; | ||
720 | } | ||
721 | #ifdef CONFIG_NET_CLS_ACT | ||
722 | } else if (!cl) { | ||
723 | if (ret == NET_XMIT_DROP) | ||
724 | sch->qstats.drops++; | ||
725 | kfree_skb (skb); | ||
726 | return ret; | ||
727 | #endif | ||
728 | } else if (cl->un.leaf.q->enqueue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) { | ||
729 | sch->qstats.drops++; | ||
730 | cl->qstats.drops++; | ||
731 | return NET_XMIT_DROP; | ||
732 | } else { | ||
733 | cl->bstats.packets++; cl->bstats.bytes += skb->len; | ||
734 | htb_activate (q,cl); | ||
735 | } | ||
736 | |||
737 | sch->q.qlen++; | ||
738 | sch->bstats.packets++; sch->bstats.bytes += skb->len; | ||
739 | HTB_DBG(1,1,"htb_enq_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb); | ||
740 | return NET_XMIT_SUCCESS; | ||
741 | } | ||
742 | |||
743 | /* TODO: requeuing packet charges it to policers again !! */ | ||
744 | static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch) | ||
745 | { | ||
746 | struct htb_sched *q = qdisc_priv(sch); | ||
747 | int ret = NET_XMIT_SUCCESS; | ||
748 | struct htb_class *cl = htb_classify(skb,sch, &ret); | ||
749 | struct sk_buff *tskb; | ||
750 | |||
751 | if (cl == HTB_DIRECT || !cl) { | ||
752 | /* enqueue to helper queue */ | ||
753 | if (q->direct_queue.qlen < q->direct_qlen && cl) { | ||
754 | __skb_queue_head(&q->direct_queue, skb); | ||
755 | } else { | ||
756 | __skb_queue_head(&q->direct_queue, skb); | ||
757 | tskb = __skb_dequeue_tail(&q->direct_queue); | ||
758 | kfree_skb (tskb); | ||
759 | sch->qstats.drops++; | ||
760 | return NET_XMIT_CN; | ||
761 | } | ||
762 | } else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) { | ||
763 | sch->qstats.drops++; | ||
764 | cl->qstats.drops++; | ||
765 | return NET_XMIT_DROP; | ||
766 | } else | ||
767 | htb_activate (q,cl); | ||
768 | |||
769 | sch->q.qlen++; | ||
770 | sch->qstats.requeues++; | ||
771 | HTB_DBG(1,1,"htb_req_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb); | ||
772 | return NET_XMIT_SUCCESS; | ||
773 | } | ||
774 | |||
775 | static void htb_timer(unsigned long arg) | ||
776 | { | ||
777 | struct Qdisc *sch = (struct Qdisc*)arg; | ||
778 | sch->flags &= ~TCQ_F_THROTTLED; | ||
779 | wmb(); | ||
780 | netif_schedule(sch->dev); | ||
781 | } | ||
782 | |||
783 | #ifdef HTB_RATECM | ||
784 | #define RT_GEN(D,R) R+=D-(R/HTB_EWMAC);D=0 | ||
785 | static void htb_rate_timer(unsigned long arg) | ||
786 | { | ||
787 | struct Qdisc *sch = (struct Qdisc*)arg; | ||
788 | struct htb_sched *q = qdisc_priv(sch); | ||
789 | struct list_head *p; | ||
790 | |||
791 | /* lock queue so that we can muck with it */ | ||
792 | HTB_QLOCK(sch); | ||
793 | HTB_DBG(10,1,"htb_rttmr j=%ld\n",jiffies); | ||
794 | |||
795 | q->rttim.expires = jiffies + HZ; | ||
796 | add_timer(&q->rttim); | ||
797 | |||
798 | /* scan and recompute one bucket at a time */ | ||
799 | if (++q->recmp_bucket >= HTB_HSIZE) | ||
800 | q->recmp_bucket = 0; | ||
801 | list_for_each (p,q->hash+q->recmp_bucket) { | ||
802 | struct htb_class *cl = list_entry(p,struct htb_class,hlist); | ||
803 | HTB_DBG(10,2,"htb_rttmr_cl cl=%X sbyte=%lu spkt=%lu\n", | ||
804 | cl->classid,cl->sum_bytes,cl->sum_packets); | ||
805 | RT_GEN (cl->sum_bytes,cl->rate_bytes); | ||
806 | RT_GEN (cl->sum_packets,cl->rate_packets); | ||
807 | } | ||
808 | HTB_QUNLOCK(sch); | ||
809 | } | ||
810 | #endif | ||
811 | |||
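RT_GEN() is a one-line exponentially weighted moving average: each time a class's hash bucket comes up in the rate timer, the accumulated delta is folded into the running counter as R += D - R/HTB_EWMAC and the delta is cleared. The sketch below replays a few rounds of that update; TOY_EWMAC = 2 is chosen only for illustration, the real HTB_EWMAC is defined earlier in this file.

    #include <stdio.h>

    #define TOY_EWMAC 2     /* illustrative only; see HTB_EWMAC above */

    int main(void)
    {
            unsigned long rate = 0;         /* smoothed counter, like cl->rate_bytes */
            unsigned long deltas[5] = { 1000, 1000, 1000, 4000, 0 };
            int i;

            for (i = 0; i < 5; i++) {
                    unsigned long d = deltas[i];

                    rate += d - rate / TOY_EWMAC;   /* RT_GEN(d, rate) */
                    printf("round %d: delta=%lu smoothed=%lu\n", i, d, rate);
            }
            return 0;
    }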
812 | /** | ||
813 | * htb_charge_class - charges amount "bytes" to leaf and ancestors | ||
814 | * | ||
815 | * The routine assumes that a packet "bytes" long was dequeued from leaf cl | ||
816 | * borrowing from "level". It accounts the bytes to the ceil leaky bucket for | ||
817 | * the leaf and all ancestors and to the rate bucket for ancestors at levels | ||
818 | * "level" and higher. It also handles a possible change of mode resulting | ||
819 | * from the update. Note that the mode can also increase here (MAY_BORROW to | ||
820 | * CAN_SEND) because we can use a more precise clock than the event queue | ||
821 | * here. In such a case we remove the class from the event queue first. | ||
822 | */ | ||
823 | static void htb_charge_class(struct htb_sched *q,struct htb_class *cl, | ||
824 | int level,int bytes) | ||
825 | { | ||
826 | long toks,diff; | ||
827 | enum htb_cmode old_mode; | ||
828 | HTB_DBG(5,1,"htb_chrg_cl cl=%X lev=%d len=%d\n",cl->classid,level,bytes); | ||
829 | |||
830 | #define HTB_ACCNT(T,B,R) toks = diff + cl->T; \ | ||
831 | if (toks > cl->B) toks = cl->B; \ | ||
832 | toks -= L2T(cl, cl->R, bytes); \ | ||
833 | if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \ | ||
834 | cl->T = toks | ||
835 | |||
836 | while (cl) { | ||
837 | HTB_CHCL(cl); | ||
838 | diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer); | ||
839 | #ifdef HTB_DEBUG | ||
840 | if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) { | ||
841 | if (net_ratelimit()) | ||
842 | printk(KERN_ERR "HTB: bad diff in charge, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n", | ||
843 | cl->classid, diff, | ||
844 | #ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY | ||
845 | q->now.tv_sec * 1000000ULL + q->now.tv_usec, | ||
846 | cl->t_c.tv_sec * 1000000ULL + cl->t_c.tv_usec, | ||
847 | #else | ||
848 | (unsigned long long) q->now, | ||
849 | (unsigned long long) cl->t_c, | ||
850 | #endif | ||
851 | q->jiffies); | ||
852 | diff = 1000; | ||
853 | } | ||
854 | #endif | ||
855 | if (cl->level >= level) { | ||
856 | if (cl->level == level) cl->xstats.lends++; | ||
857 | HTB_ACCNT (tokens,buffer,rate); | ||
858 | } else { | ||
859 | cl->xstats.borrows++; | ||
860 | cl->tokens += diff; /* we moved t_c; update tokens */ | ||
861 | } | ||
862 | HTB_ACCNT (ctokens,cbuffer,ceil); | ||
863 | cl->t_c = q->now; | ||
864 | HTB_DBG(5,2,"htb_chrg_clp cl=%X diff=%ld tok=%ld ctok=%ld\n",cl->classid,diff,cl->tokens,cl->ctokens); | ||
865 | |||
866 | old_mode = cl->cmode; diff = 0; | ||
867 | htb_change_class_mode(q,cl,&diff); | ||
868 | if (old_mode != cl->cmode) { | ||
869 | if (old_mode != HTB_CAN_SEND) | ||
870 | htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level); | ||
871 | if (cl->cmode != HTB_CAN_SEND) | ||
872 | htb_add_to_wait_tree (q,cl,diff,1); | ||
873 | } | ||
874 | |||
875 | #ifdef HTB_RATECM | ||
876 | /* update rate counters */ | ||
877 | cl->sum_bytes += bytes; cl->sum_packets++; | ||
878 | #endif | ||
879 | |||
880 | /* update byte stats except for leaves which are already updated */ | ||
881 | if (cl->level) { | ||
882 | cl->bstats.bytes += bytes; | ||
883 | cl->bstats.packets++; | ||
884 | } | ||
885 | cl = cl->parent; | ||
886 | } | ||
887 | } | ||
888 | |||
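HTB_ACCNT() is one leaky-bucket step: refill the bucket by the elapsed time diff, cap it at the configured burst, subtract the transmission time of the packet (what L2T() returns), and clamp the debt near -mbuffer. A user-space rendering of that step with made-up numbers; charge() and the constants are illustrative, and the bucket starts full, as a freshly created class does.

    #include <stdio.h>

    /* One token-bucket charge, mirroring the HTB_ACCNT() macro:
     * toks  - current token count (cl->tokens or cl->ctokens)
     * burst - bucket depth (cl->buffer or cl->cbuffer)
     * diff  - time since the last checkpoint, in token units
     * cost  - transmission time of the packet (what L2T() returns)
     * floor - lower clamp (cl->mbuffer)                              */
    static long charge(long toks, long burst, long diff, long cost, long floor)
    {
            toks += diff;
            if (toks > burst)
                    toks = burst;           /* cannot store more than one burst */
            toks -= cost;
            if (toks <= -floor)
                    toks = 1 - floor;       /* keep the debt bounded */
            return toks;
    }

    int main(void)
    {
            long toks = 1000;       /* start with a full burst of 1000 */
            int i;

            /* three back-to-back packets costing 400 each, arriving 100 time
             * units apart: the bucket drains because packets cost more than
             * the refill between them */
            for (i = 0; i < 3; i++) {
                    toks = charge(toks, 1000, 100, 400, 60000000);
                    printf("after packet %d: %ld tokens\n", i + 1, toks);
            }
            return 0;
    }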
889 | /** | ||
890 | * htb_do_events - make mode changes to classes at the level | ||
891 | * | ||
892 | * Scans the event queue for pending events and applies them. Returns the | ||
893 | * number of jiffies to the next pending event (0 for no event in pq). | ||
894 | * Note: applied are events that have cl->pq_key <= q->jiffies. | ||
895 | */ | ||
896 | static long htb_do_events(struct htb_sched *q,int level) | ||
897 | { | ||
898 | int i; | ||
899 | HTB_DBG(8,1,"htb_do_events l=%d root=%p rmask=%X\n", | ||
900 | level,q->wait_pq[level].rb_node,q->row_mask[level]); | ||
901 | for (i = 0; i < 500; i++) { | ||
902 | struct htb_class *cl; | ||
903 | long diff; | ||
904 | struct rb_node *p = q->wait_pq[level].rb_node; | ||
905 | if (!p) return 0; | ||
906 | while (p->rb_left) p = p->rb_left; | ||
907 | |||
908 | cl = rb_entry(p, struct htb_class, pq_node); | ||
909 | if (time_after(cl->pq_key, q->jiffies)) { | ||
910 | HTB_DBG(8,3,"htb_do_ev_ret delay=%ld\n",cl->pq_key - q->jiffies); | ||
911 | return cl->pq_key - q->jiffies; | ||
912 | } | ||
913 | htb_safe_rb_erase(p,q->wait_pq+level); | ||
914 | diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer); | ||
915 | #ifdef HTB_DEBUG | ||
916 | if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) { | ||
917 | if (net_ratelimit()) | ||
918 | printk(KERN_ERR "HTB: bad diff in events, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n", | ||
919 | cl->classid, diff, | ||
920 | #ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY | ||
921 | q->now.tv_sec * 1000000ULL + q->now.tv_usec, | ||
922 | cl->t_c.tv_sec * 1000000ULL + cl->t_c.tv_usec, | ||
923 | #else | ||
924 | (unsigned long long) q->now, | ||
925 | (unsigned long long) cl->t_c, | ||
926 | #endif | ||
927 | q->jiffies); | ||
928 | diff = 1000; | ||
929 | } | ||
930 | #endif | ||
931 | htb_change_class_mode(q,cl,&diff); | ||
932 | if (cl->cmode != HTB_CAN_SEND) | ||
933 | htb_add_to_wait_tree (q,cl,diff,2); | ||
934 | } | ||
935 | if (net_ratelimit()) | ||
936 | printk(KERN_WARNING "htb: too many events !\n"); | ||
937 | return HZ/10; | ||
938 | } | ||
939 | |||
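htb_do_events() walks the leftmost entries of the per-level wait tree: every class whose pq_key is already due has its mode recomputed (and is re-queued if it still cannot send), and the function returns how many jiffies remain until the next pending event, or 0 when the queue is empty. A minimal model of that "process what is due, report the rest" loop over an already sorted array; drain_due() and the deadlines are illustrative.

    #include <stdio.h>

    /* Toy event queue: deadlines sorted ascending, as the rbtree keeps them.
     * Returns 0 if nothing is left, otherwise the delay to the next event. */
    static long drain_due(const long *deadline, int n, int *next, long now)
    {
            while (*next < n) {
                    if (deadline[*next] > now)
                            return deadline[*next] - now;   /* next event is in the future */
                    /* the event is due: the real code re-evaluates the class mode here */
                    printf("event at %ld fired (now=%ld)\n", deadline[*next], now);
                    (*next)++;
            }
            return 0;       /* no pending events left */
    }

    int main(void)
    {
            long deadlines[3] = { 100, 180, 250 };
            int next = 0;

            printf("delay to next: %ld\n", drain_due(deadlines, 3, &next, 200)); /* fires 100,180 -> 50 */
            printf("delay to next: %ld\n", drain_due(deadlines, 3, &next, 300)); /* fires 250 -> 0 */
            return 0;
    }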
940 | /* Returns class->node+prio from the id-tree where the class's id is >= id. | ||
941 | NULL if no such one exists. */ | ||
942 | static struct rb_node * | ||
943 | htb_id_find_next_upper(int prio,struct rb_node *n,u32 id) | ||
944 | { | ||
945 | struct rb_node *r = NULL; | ||
946 | while (n) { | ||
947 | struct htb_class *cl = rb_entry(n,struct htb_class,node[prio]); | ||
948 | if (id == cl->classid) return n; | ||
949 | |||
950 | if (id > cl->classid) { | ||
951 | n = n->rb_right; | ||
952 | } else { | ||
953 | r = n; | ||
954 | n = n->rb_left; | ||
955 | } | ||
956 | } | ||
957 | return r; | ||
958 | } | ||
959 | |||
960 | /** | ||
961 | * htb_lookup_leaf - returns next leaf class in DRR order | ||
962 | * | ||
963 | * Find the leaf that the current feed pointer points to. | ||
964 | */ | ||
965 | static struct htb_class * | ||
966 | htb_lookup_leaf(HTB_ARGQ struct rb_root *tree,int prio,struct rb_node **pptr,u32 *pid) | ||
967 | { | ||
968 | int i; | ||
969 | struct { | ||
970 | struct rb_node *root; | ||
971 | struct rb_node **pptr; | ||
972 | u32 *pid; | ||
973 | } stk[TC_HTB_MAXDEPTH],*sp = stk; | ||
974 | |||
975 | BUG_TRAP(tree->rb_node); | ||
976 | sp->root = tree->rb_node; | ||
977 | sp->pptr = pptr; | ||
978 | sp->pid = pid; | ||
979 | |||
980 | for (i = 0; i < 65535; i++) { | ||
981 | HTB_DBG(4,2,"htb_lleaf ptr=%p pid=%X\n",*sp->pptr,*sp->pid); | ||
982 | |||
983 | if (!*sp->pptr && *sp->pid) { | ||
984 | /* ptr was invalidated but id is valid - try to recover | ||
985 | the original or next ptr */ | ||
986 | *sp->pptr = htb_id_find_next_upper(prio,sp->root,*sp->pid); | ||
987 | } | ||
988 | *sp->pid = 0; /* ptr is valid now, so remove this hint as it | ||
989 | can become out of date quickly */ | ||
990 | if (!*sp->pptr) { /* we are at right end; rewind & go up */ | ||
991 | *sp->pptr = sp->root; | ||
992 | while ((*sp->pptr)->rb_left) | ||
993 | *sp->pptr = (*sp->pptr)->rb_left; | ||
994 | if (sp > stk) { | ||
995 | sp--; | ||
996 | BUG_TRAP(*sp->pptr); if(!*sp->pptr) return NULL; | ||
997 | htb_next_rb_node (sp->pptr); | ||
998 | } | ||
999 | } else { | ||
1000 | struct htb_class *cl; | ||
1001 | cl = rb_entry(*sp->pptr,struct htb_class,node[prio]); | ||
1002 | HTB_CHCL(cl); | ||
1003 | if (!cl->level) | ||
1004 | return cl; | ||
1005 | (++sp)->root = cl->un.inner.feed[prio].rb_node; | ||
1006 | sp->pptr = cl->un.inner.ptr+prio; | ||
1007 | sp->pid = cl->un.inner.last_ptr_id+prio; | ||
1008 | } | ||
1009 | } | ||
1010 | BUG_TRAP(0); | ||
1011 | return NULL; | ||
1012 | } | ||
1013 | |||
1014 | /* dequeues a packet at the given priority and level; call only if | ||
1015 | you are sure that there is an active class at prio/level */ | ||
1016 | static struct sk_buff * | ||
1017 | htb_dequeue_tree(struct htb_sched *q,int prio,int level) | ||
1018 | { | ||
1019 | struct sk_buff *skb = NULL; | ||
1020 | struct htb_class *cl,*start; | ||
1021 | /* look initial class up in the row */ | ||
1022 | start = cl = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio,prio, | ||
1023 | q->ptr[level]+prio,q->last_ptr_id[level]+prio); | ||
1024 | |||
1025 | do { | ||
1026 | next: | ||
1027 | BUG_TRAP(cl); | ||
1028 | if (!cl) return NULL; | ||
1029 | HTB_DBG(4,1,"htb_deq_tr prio=%d lev=%d cl=%X defic=%d\n", | ||
1030 | prio,level,cl->classid,cl->un.leaf.deficit[level]); | ||
1031 | |||
1032 | /* the class can be empty - it is unlikely but can happen if the leaf | ||
1033 | qdisc drops packets in its enqueue routine or if someone used the | ||
1034 | graft operation on the leaf since the last dequeue; | ||
1035 | simply deactivate and skip such a class */ | ||
1036 | if (unlikely(cl->un.leaf.q->q.qlen == 0)) { | ||
1037 | struct htb_class *next; | ||
1038 | htb_deactivate(q,cl); | ||
1039 | |||
1040 | /* row/level might become empty */ | ||
1041 | if ((q->row_mask[level] & (1 << prio)) == 0) | ||
1042 | return NULL; | ||
1043 | |||
1044 | next = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio, | ||
1045 | prio,q->ptr[level]+prio,q->last_ptr_id[level]+prio); | ||
1046 | |||
1047 | if (cl == start) /* fix start if we just deleted it */ | ||
1048 | start = next; | ||
1049 | cl = next; | ||
1050 | goto next; | ||
1051 | } | ||
1052 | |||
1053 | if (likely((skb = cl->un.leaf.q->dequeue(cl->un.leaf.q)) != NULL)) | ||
1054 | break; | ||
1055 | if (!cl->warned) { | ||
1056 | printk(KERN_WARNING "htb: class %X isn't work conserving ?!\n",cl->classid); | ||
1057 | cl->warned = 1; | ||
1058 | } | ||
1059 | q->nwc_hit++; | ||
1060 | htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio); | ||
1061 | cl = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio,prio,q->ptr[level]+prio, | ||
1062 | q->last_ptr_id[level]+prio); | ||
1063 | |||
1064 | } while (cl != start); | ||
1065 | |||
1066 | if (likely(skb != NULL)) { | ||
1067 | if ((cl->un.leaf.deficit[level] -= skb->len) < 0) { | ||
1068 | HTB_DBG(4,2,"htb_next_cl oldptr=%p quant_add=%d\n", | ||
1069 | level?cl->parent->un.inner.ptr[prio]:q->ptr[0][prio],cl->un.leaf.quantum); | ||
1070 | cl->un.leaf.deficit[level] += cl->un.leaf.quantum; | ||
1071 | htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio); | ||
1072 | } | ||
1073 | /* this used to be after charge_class but this constellation | ||
1074 | gives us slightly better performance */ | ||
1075 | if (!cl->un.leaf.q->q.qlen) | ||
1076 | htb_deactivate (q,cl); | ||
1077 | htb_charge_class (q,cl,level,skb->len); | ||
1078 | } | ||
1079 | return skb; | ||
1080 | } | ||
1081 | |||
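The deficit handling above is deficit round robin: a leaf always gets to send at least one packet per visit, its per-level deficit is decreased by the packet length, and only when the deficit goes negative is one quantum added and the round-robin pointer advanced. A compact sketch of that accounting for two hypothetical leaves:

    #include <stdio.h>

    int main(void)
    {
            /* Two leaves on one priority, DRR-style as in htb_dequeue_tree():
             * a leaf keeps sending until its deficit goes negative, then it is
             * recharged by one quantum and the round-robin pointer advances. */
            int quantum = 1500;
            int deficit[2] = { 0, 0 };
            int cur = 0;
            int i;

            for (i = 0; i < 6; i++) {
                    int len = 600;          /* every packet is 600 bytes */

                    deficit[cur] -= len;
                    printf("leaf %d sent %d bytes, deficit now %d\n",
                           cur, len, deficit[cur]);
                    if (deficit[cur] < 0) {
                            deficit[cur] += quantum;        /* recharge ...             */
                            cur = (cur + 1) % 2;            /* ... and move to the next */
                    }
            }
            return 0;
    }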
1082 | static void htb_delay_by(struct Qdisc *sch,long delay) | ||
1083 | { | ||
1084 | struct htb_sched *q = qdisc_priv(sch); | ||
1085 | if (delay <= 0) delay = 1; | ||
1086 | if (unlikely(delay > 5*HZ)) { | ||
1087 | if (net_ratelimit()) | ||
1088 | printk(KERN_INFO "HTB delay %ld > 5sec\n", delay); | ||
1089 | delay = 5*HZ; | ||
1090 | } | ||
1091 | /* why not use jiffies here? because expires can be in the past */ | ||
1092 | mod_timer(&q->timer, q->jiffies + delay); | ||
1093 | sch->flags |= TCQ_F_THROTTLED; | ||
1094 | sch->qstats.overlimits++; | ||
1095 | HTB_DBG(3,1,"htb_deq t_delay=%ld\n",delay); | ||
1096 | } | ||
1097 | |||
1098 | static struct sk_buff *htb_dequeue(struct Qdisc *sch) | ||
1099 | { | ||
1100 | struct sk_buff *skb = NULL; | ||
1101 | struct htb_sched *q = qdisc_priv(sch); | ||
1102 | int level; | ||
1103 | long min_delay; | ||
1104 | #ifdef HTB_DEBUG | ||
1105 | int evs_used = 0; | ||
1106 | #endif | ||
1107 | |||
1108 | q->jiffies = jiffies; | ||
1109 | HTB_DBG(3,1,"htb_deq dircnt=%d qlen=%d\n",skb_queue_len(&q->direct_queue), | ||
1110 | sch->q.qlen); | ||
1111 | |||
1112 | /* try to dequeue direct packets as high prio (!) to minimize cpu work */ | ||
1113 | if ((skb = __skb_dequeue(&q->direct_queue)) != NULL) { | ||
1114 | sch->flags &= ~TCQ_F_THROTTLED; | ||
1115 | sch->q.qlen--; | ||
1116 | return skb; | ||
1117 | } | ||
1118 | |||
1119 | if (!sch->q.qlen) goto fin; | ||
1120 | PSCHED_GET_TIME(q->now); | ||
1121 | |||
1122 | min_delay = LONG_MAX; | ||
1123 | q->nwc_hit = 0; | ||
1124 | for (level = 0; level < TC_HTB_MAXDEPTH; level++) { | ||
1125 | /* common case optimization - skip event handler quickly */ | ||
1126 | int m; | ||
1127 | long delay; | ||
1128 | if (time_after_eq(q->jiffies, q->near_ev_cache[level])) { | ||
1129 | delay = htb_do_events(q,level); | ||
1130 | q->near_ev_cache[level] = q->jiffies + (delay ? delay : HZ); | ||
1131 | #ifdef HTB_DEBUG | ||
1132 | evs_used++; | ||
1133 | #endif | ||
1134 | } else | ||
1135 | delay = q->near_ev_cache[level] - q->jiffies; | ||
1136 | |||
1137 | if (delay && min_delay > delay) | ||
1138 | min_delay = delay; | ||
1139 | m = ~q->row_mask[level]; | ||
1140 | while (m != (int)(-1)) { | ||
1141 | int prio = ffz (m); | ||
1142 | m |= 1 << prio; | ||
1143 | skb = htb_dequeue_tree(q,prio,level); | ||
1144 | if (likely(skb != NULL)) { | ||
1145 | sch->q.qlen--; | ||
1146 | sch->flags &= ~TCQ_F_THROTTLED; | ||
1147 | goto fin; | ||
1148 | } | ||
1149 | } | ||
1150 | } | ||
1151 | #ifdef HTB_DEBUG | ||
1152 | if (!q->nwc_hit && min_delay >= 10*HZ && net_ratelimit()) { | ||
1153 | if (min_delay == LONG_MAX) { | ||
1154 | printk(KERN_ERR "HTB: dequeue bug (%d,%lu,%lu), report it please !\n", | ||
1155 | evs_used,q->jiffies,jiffies); | ||
1156 | htb_debug_dump(q); | ||
1157 | } else | ||
1158 | printk(KERN_WARNING "HTB: mindelay=%ld, some class has " | ||
1159 | "too small rate\n",min_delay); | ||
1160 | } | ||
1161 | #endif | ||
1162 | htb_delay_by (sch,min_delay > 5*HZ ? 5*HZ : min_delay); | ||
1163 | fin: | ||
1164 | HTB_DBG(3,1,"htb_deq_end %s j=%lu skb=%p\n",sch->dev->name,q->jiffies,skb); | ||
1165 | return skb; | ||
1166 | } | ||
1167 | |||
1168 | /* try to drop from each class (by prio) until one succeeds */ | ||
1169 | static unsigned int htb_drop(struct Qdisc* sch) | ||
1170 | { | ||
1171 | struct htb_sched *q = qdisc_priv(sch); | ||
1172 | int prio; | ||
1173 | |||
1174 | for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) { | ||
1175 | struct list_head *p; | ||
1176 | list_for_each (p,q->drops+prio) { | ||
1177 | struct htb_class *cl = list_entry(p, struct htb_class, | ||
1178 | un.leaf.drop_list); | ||
1179 | unsigned int len; | ||
1180 | if (cl->un.leaf.q->ops->drop && | ||
1181 | (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) { | ||
1182 | sch->q.qlen--; | ||
1183 | if (!cl->un.leaf.q->q.qlen) | ||
1184 | htb_deactivate (q,cl); | ||
1185 | return len; | ||
1186 | } | ||
1187 | } | ||
1188 | } | ||
1189 | return 0; | ||
1190 | } | ||
1191 | |||
1192 | /* reset all classes */ | ||
1193 | /* always called under BH & queue lock */ | ||
1194 | static void htb_reset(struct Qdisc* sch) | ||
1195 | { | ||
1196 | struct htb_sched *q = qdisc_priv(sch); | ||
1197 | int i; | ||
1198 | HTB_DBG(0,1,"htb_reset sch=%p, handle=%X\n",sch,sch->handle); | ||
1199 | |||
1200 | for (i = 0; i < HTB_HSIZE; i++) { | ||
1201 | struct list_head *p; | ||
1202 | list_for_each (p,q->hash+i) { | ||
1203 | struct htb_class *cl = list_entry(p,struct htb_class,hlist); | ||
1204 | if (cl->level) | ||
1205 | memset(&cl->un.inner,0,sizeof(cl->un.inner)); | ||
1206 | else { | ||
1207 | if (cl->un.leaf.q) | ||
1208 | qdisc_reset(cl->un.leaf.q); | ||
1209 | INIT_LIST_HEAD(&cl->un.leaf.drop_list); | ||
1210 | } | ||
1211 | cl->prio_activity = 0; | ||
1212 | cl->cmode = HTB_CAN_SEND; | ||
1213 | #ifdef HTB_DEBUG | ||
1214 | cl->pq_node.rb_color = -1; | ||
1215 | memset(cl->node,255,sizeof(cl->node)); | ||
1216 | #endif | ||
1217 | |||
1218 | } | ||
1219 | } | ||
1220 | sch->flags &= ~TCQ_F_THROTTLED; | ||
1221 | del_timer(&q->timer); | ||
1222 | __skb_queue_purge(&q->direct_queue); | ||
1223 | sch->q.qlen = 0; | ||
1224 | memset(q->row,0,sizeof(q->row)); | ||
1225 | memset(q->row_mask,0,sizeof(q->row_mask)); | ||
1226 | memset(q->wait_pq,0,sizeof(q->wait_pq)); | ||
1227 | memset(q->ptr,0,sizeof(q->ptr)); | ||
1228 | for (i = 0; i < TC_HTB_NUMPRIO; i++) | ||
1229 | INIT_LIST_HEAD(q->drops+i); | ||
1230 | } | ||
1231 | |||
1232 | static int htb_init(struct Qdisc *sch, struct rtattr *opt) | ||
1233 | { | ||
1234 | struct htb_sched *q = qdisc_priv(sch); | ||
1235 | struct rtattr *tb[TCA_HTB_INIT]; | ||
1236 | struct tc_htb_glob *gopt; | ||
1237 | int i; | ||
1238 | #ifdef HTB_DEBUG | ||
1239 | printk(KERN_INFO "HTB init, kernel part version %d.%d\n", | ||
1240 | HTB_VER >> 16,HTB_VER & 0xffff); | ||
1241 | #endif | ||
1242 | if (!opt || rtattr_parse_nested(tb, TCA_HTB_INIT, opt) || | ||
1243 | tb[TCA_HTB_INIT-1] == NULL || | ||
1244 | RTA_PAYLOAD(tb[TCA_HTB_INIT-1]) < sizeof(*gopt)) { | ||
1245 | printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n"); | ||
1246 | return -EINVAL; | ||
1247 | } | ||
1248 | gopt = RTA_DATA(tb[TCA_HTB_INIT-1]); | ||
1249 | if (gopt->version != HTB_VER >> 16) { | ||
1250 | printk(KERN_ERR "HTB: need tc/htb version %d (minor is %d), you have %d\n", | ||
1251 | HTB_VER >> 16,HTB_VER & 0xffff,gopt->version); | ||
1252 | return -EINVAL; | ||
1253 | } | ||
1254 | q->debug = gopt->debug; | ||
1255 | HTB_DBG(0,1,"htb_init sch=%p handle=%X r2q=%d\n",sch,sch->handle,gopt->rate2quantum); | ||
1256 | |||
1257 | INIT_LIST_HEAD(&q->root); | ||
1258 | for (i = 0; i < HTB_HSIZE; i++) | ||
1259 | INIT_LIST_HEAD(q->hash+i); | ||
1260 | for (i = 0; i < TC_HTB_NUMPRIO; i++) | ||
1261 | INIT_LIST_HEAD(q->drops+i); | ||
1262 | |||
1263 | init_timer(&q->timer); | ||
1264 | skb_queue_head_init(&q->direct_queue); | ||
1265 | |||
1266 | q->direct_qlen = sch->dev->tx_queue_len; | ||
1267 | if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */ | ||
1268 | q->direct_qlen = 2; | ||
1269 | q->timer.function = htb_timer; | ||
1270 | q->timer.data = (unsigned long)sch; | ||
1271 | |||
1272 | #ifdef HTB_RATECM | ||
1273 | init_timer(&q->rttim); | ||
1274 | q->rttim.function = htb_rate_timer; | ||
1275 | q->rttim.data = (unsigned long)sch; | ||
1276 | q->rttim.expires = jiffies + HZ; | ||
1277 | add_timer(&q->rttim); | ||
1278 | #endif | ||
1279 | if ((q->rate2quantum = gopt->rate2quantum) < 1) | ||
1280 | q->rate2quantum = 1; | ||
1281 | q->defcls = gopt->defcls; | ||
1282 | |||
1283 | return 0; | ||
1284 | } | ||
1285 | |||
1286 | static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) | ||
1287 | { | ||
1288 | struct htb_sched *q = qdisc_priv(sch); | ||
1289 | unsigned char *b = skb->tail; | ||
1290 | struct rtattr *rta; | ||
1291 | struct tc_htb_glob gopt; | ||
1292 | HTB_DBG(0,1,"htb_dump sch=%p, handle=%X\n",sch,sch->handle); | ||
1293 | HTB_QLOCK(sch); | ||
1294 | gopt.direct_pkts = q->direct_pkts; | ||
1295 | |||
1296 | #ifdef HTB_DEBUG | ||
1297 | if (HTB_DBG_COND(0,2)) | ||
1298 | htb_debug_dump(q); | ||
1299 | #endif | ||
1300 | gopt.version = HTB_VER; | ||
1301 | gopt.rate2quantum = q->rate2quantum; | ||
1302 | gopt.defcls = q->defcls; | ||
1303 | gopt.debug = q->debug; | ||
1304 | rta = (struct rtattr*)b; | ||
1305 | RTA_PUT(skb, TCA_OPTIONS, 0, NULL); | ||
1306 | RTA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt); | ||
1307 | rta->rta_len = skb->tail - b; | ||
1308 | HTB_QUNLOCK(sch); | ||
1309 | return skb->len; | ||
1310 | rtattr_failure: | ||
1311 | HTB_QUNLOCK(sch); | ||
1312 | skb_trim(skb, skb->tail - skb->data); | ||
1313 | return -1; | ||
1314 | } | ||
1315 | |||
1316 | static int htb_dump_class(struct Qdisc *sch, unsigned long arg, | ||
1317 | struct sk_buff *skb, struct tcmsg *tcm) | ||
1318 | { | ||
1319 | #ifdef HTB_DEBUG | ||
1320 | struct htb_sched *q = qdisc_priv(sch); | ||
1321 | #endif | ||
1322 | struct htb_class *cl = (struct htb_class*)arg; | ||
1323 | unsigned char *b = skb->tail; | ||
1324 | struct rtattr *rta; | ||
1325 | struct tc_htb_opt opt; | ||
1326 | |||
1327 | HTB_DBG(0,1,"htb_dump_class handle=%X clid=%X\n",sch->handle,cl->classid); | ||
1328 | |||
1329 | HTB_QLOCK(sch); | ||
1330 | tcm->tcm_parent = cl->parent ? cl->parent->classid : TC_H_ROOT; | ||
1331 | tcm->tcm_handle = cl->classid; | ||
1332 | if (!cl->level && cl->un.leaf.q) | ||
1333 | tcm->tcm_info = cl->un.leaf.q->handle; | ||
1334 | |||
1335 | rta = (struct rtattr*)b; | ||
1336 | RTA_PUT(skb, TCA_OPTIONS, 0, NULL); | ||
1337 | |||
1338 | memset (&opt,0,sizeof(opt)); | ||
1339 | |||
1340 | opt.rate = cl->rate->rate; opt.buffer = cl->buffer; | ||
1341 | opt.ceil = cl->ceil->rate; opt.cbuffer = cl->cbuffer; | ||
1342 | opt.quantum = cl->un.leaf.quantum; opt.prio = cl->un.leaf.prio; | ||
1343 | opt.level = cl->level; | ||
1344 | RTA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt); | ||
1345 | rta->rta_len = skb->tail - b; | ||
1346 | HTB_QUNLOCK(sch); | ||
1347 | return skb->len; | ||
1348 | rtattr_failure: | ||
1349 | HTB_QUNLOCK(sch); | ||
1350 | skb_trim(skb, b - skb->data); | ||
1351 | return -1; | ||
1352 | } | ||
1353 | |||
1354 | static int | ||
1355 | htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, | ||
1356 | struct gnet_dump *d) | ||
1357 | { | ||
1358 | struct htb_class *cl = (struct htb_class*)arg; | ||
1359 | |||
1360 | #ifdef HTB_RATECM | ||
1361 | cl->rate_est.bps = cl->rate_bytes/(HTB_EWMAC*HTB_HSIZE); | ||
1362 | cl->rate_est.pps = cl->rate_packets/(HTB_EWMAC*HTB_HSIZE); | ||
1363 | #endif | ||
1364 | |||
1365 | if (!cl->level && cl->un.leaf.q) | ||
1366 | cl->qstats.qlen = cl->un.leaf.q->q.qlen; | ||
1367 | cl->xstats.tokens = cl->tokens; | ||
1368 | cl->xstats.ctokens = cl->ctokens; | ||
1369 | |||
1370 | if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || | ||
1371 | gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || | ||
1372 | gnet_stats_copy_queue(d, &cl->qstats) < 0) | ||
1373 | return -1; | ||
1374 | |||
1375 | return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); | ||
1376 | } | ||
1377 | |||
1378 | static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, | ||
1379 | struct Qdisc **old) | ||
1380 | { | ||
1381 | struct htb_class *cl = (struct htb_class*)arg; | ||
1382 | |||
1383 | if (cl && !cl->level) { | ||
1384 | if (new == NULL && (new = qdisc_create_dflt(sch->dev, | ||
1385 | &pfifo_qdisc_ops)) == NULL) | ||
1386 | return -ENOBUFS; | ||
1387 | sch_tree_lock(sch); | ||
1388 | if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) { | ||
1389 | if (cl->prio_activity) | ||
1390 | htb_deactivate (qdisc_priv(sch),cl); | ||
1391 | |||
1392 | /* TODO: is it correct ? Why CBQ doesn't do it ? */ | ||
1393 | sch->q.qlen -= (*old)->q.qlen; | ||
1394 | qdisc_reset(*old); | ||
1395 | } | ||
1396 | sch_tree_unlock(sch); | ||
1397 | return 0; | ||
1398 | } | ||
1399 | return -ENOENT; | ||
1400 | } | ||
1401 | |||
1402 | static struct Qdisc * htb_leaf(struct Qdisc *sch, unsigned long arg) | ||
1403 | { | ||
1404 | struct htb_class *cl = (struct htb_class*)arg; | ||
1405 | return (cl && !cl->level) ? cl->un.leaf.q : NULL; | ||
1406 | } | ||
1407 | |||
1408 | static unsigned long htb_get(struct Qdisc *sch, u32 classid) | ||
1409 | { | ||
1410 | #ifdef HTB_DEBUG | ||
1411 | struct htb_sched *q = qdisc_priv(sch); | ||
1412 | #endif | ||
1413 | struct htb_class *cl = htb_find(classid,sch); | ||
1414 | HTB_DBG(0,1,"htb_get clid=%X q=%p cl=%p ref=%d\n",classid,q,cl,cl?cl->refcnt:0); | ||
1415 | if (cl) | ||
1416 | cl->refcnt++; | ||
1417 | return (unsigned long)cl; | ||
1418 | } | ||
1419 | |||
1420 | static void htb_destroy_filters(struct tcf_proto **fl) | ||
1421 | { | ||
1422 | struct tcf_proto *tp; | ||
1423 | |||
1424 | while ((tp = *fl) != NULL) { | ||
1425 | *fl = tp->next; | ||
1426 | tcf_destroy(tp); | ||
1427 | } | ||
1428 | } | ||
1429 | |||
1430 | static void htb_destroy_class(struct Qdisc* sch,struct htb_class *cl) | ||
1431 | { | ||
1432 | struct htb_sched *q = qdisc_priv(sch); | ||
1433 | HTB_DBG(0,1,"htb_destrycls clid=%X ref=%d\n", cl?cl->classid:0,cl?cl->refcnt:0); | ||
1434 | if (!cl->level) { | ||
1435 | BUG_TRAP(cl->un.leaf.q); | ||
1436 | sch->q.qlen -= cl->un.leaf.q->q.qlen; | ||
1437 | qdisc_destroy(cl->un.leaf.q); | ||
1438 | } | ||
1439 | qdisc_put_rtab(cl->rate); | ||
1440 | qdisc_put_rtab(cl->ceil); | ||
1441 | |||
1442 | htb_destroy_filters (&cl->filter_list); | ||
1443 | |||
1444 | while (!list_empty(&cl->children)) | ||
1445 | htb_destroy_class (sch,list_entry(cl->children.next, | ||
1446 | struct htb_class,sibling)); | ||
1447 | |||
1448 | /* note: this delete may happen twice (see htb_delete) */ | ||
1449 | list_del(&cl->hlist); | ||
1450 | list_del(&cl->sibling); | ||
1451 | |||
1452 | if (cl->prio_activity) | ||
1453 | htb_deactivate (q,cl); | ||
1454 | |||
1455 | if (cl->cmode != HTB_CAN_SEND) | ||
1456 | htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level); | ||
1457 | |||
1458 | kfree(cl); | ||
1459 | } | ||
1460 | |||
1461 | /* always called under BH & queue lock */ | ||
1462 | static void htb_destroy(struct Qdisc* sch) | ||
1463 | { | ||
1464 | struct htb_sched *q = qdisc_priv(sch); | ||
1465 | HTB_DBG(0,1,"htb_destroy q=%p\n",q); | ||
1466 | |||
1467 | del_timer_sync (&q->timer); | ||
1468 | #ifdef HTB_RATECM | ||
1469 | del_timer_sync (&q->rttim); | ||
1470 | #endif | ||
1471 | /* This line used to be after htb_destroy_class call below | ||
1472 | and surprisingly it worked in 2.4. But it must precede it | ||
1473 | because filters need their target class alive to be able to call | ||
1474 | unbind_filter on it (without an Oops). */ | ||
1475 | htb_destroy_filters(&q->filter_list); | ||
1476 | |||
1477 | while (!list_empty(&q->root)) | ||
1478 | htb_destroy_class (sch,list_entry(q->root.next, | ||
1479 | struct htb_class,sibling)); | ||
1480 | |||
1481 | __skb_queue_purge(&q->direct_queue); | ||
1482 | } | ||
1483 | |||
1484 | static int htb_delete(struct Qdisc *sch, unsigned long arg) | ||
1485 | { | ||
1486 | struct htb_sched *q = qdisc_priv(sch); | ||
1487 | struct htb_class *cl = (struct htb_class*)arg; | ||
1488 | HTB_DBG(0,1,"htb_delete q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0); | ||
1489 | |||
1490 | // TODO: why not allow deleting a subtree ? references ? does the | ||
1491 | // tc subsys guarantee us that in htb_destroy it holds no class | ||
1492 | // refs so that we can remove children safely there ? | ||
1493 | if (!list_empty(&cl->children) || cl->filter_cnt) | ||
1494 | return -EBUSY; | ||
1495 | |||
1496 | sch_tree_lock(sch); | ||
1497 | |||
1498 | /* delete from hash and active; remainder in destroy_class */ | ||
1499 | list_del_init(&cl->hlist); | ||
1500 | if (cl->prio_activity) | ||
1501 | htb_deactivate (q,cl); | ||
1502 | |||
1503 | if (--cl->refcnt == 0) | ||
1504 | htb_destroy_class(sch,cl); | ||
1505 | |||
1506 | sch_tree_unlock(sch); | ||
1507 | return 0; | ||
1508 | } | ||
1509 | |||
1510 | static void htb_put(struct Qdisc *sch, unsigned long arg) | ||
1511 | { | ||
1512 | #ifdef HTB_DEBUG | ||
1513 | struct htb_sched *q = qdisc_priv(sch); | ||
1514 | #endif | ||
1515 | struct htb_class *cl = (struct htb_class*)arg; | ||
1516 | HTB_DBG(0,1,"htb_put q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0); | ||
1517 | |||
1518 | if (--cl->refcnt == 0) | ||
1519 | htb_destroy_class(sch,cl); | ||
1520 | } | ||
1521 | |||
1522 | static int htb_change_class(struct Qdisc *sch, u32 classid, | ||
1523 | u32 parentid, struct rtattr **tca, unsigned long *arg) | ||
1524 | { | ||
1525 | int err = -EINVAL; | ||
1526 | struct htb_sched *q = qdisc_priv(sch); | ||
1527 | struct htb_class *cl = (struct htb_class*)*arg,*parent; | ||
1528 | struct rtattr *opt = tca[TCA_OPTIONS-1]; | ||
1529 | struct qdisc_rate_table *rtab = NULL, *ctab = NULL; | ||
1530 | struct rtattr *tb[TCA_HTB_RTAB]; | ||
1531 | struct tc_htb_opt *hopt; | ||
1532 | |||
1533 | /* extract all subattrs from opt attr */ | ||
1534 | if (!opt || rtattr_parse_nested(tb, TCA_HTB_RTAB, opt) || | ||
1535 | tb[TCA_HTB_PARMS-1] == NULL || | ||
1536 | RTA_PAYLOAD(tb[TCA_HTB_PARMS-1]) < sizeof(*hopt)) | ||
1537 | goto failure; | ||
1538 | |||
1539 | parent = parentid == TC_H_ROOT ? NULL : htb_find (parentid,sch); | ||
1540 | |||
1541 | hopt = RTA_DATA(tb[TCA_HTB_PARMS-1]); | ||
1542 | HTB_DBG(0,1,"htb_chg cl=%p(%X), clid=%X, parid=%X, opt/prio=%d, rate=%u, buff=%d, quant=%d\n", cl,cl?cl->classid:0,classid,parentid,(int)hopt->prio,hopt->rate.rate,hopt->buffer,hopt->quantum); | ||
1543 | rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB-1]); | ||
1544 | ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB-1]); | ||
1545 | if (!rtab || !ctab) goto failure; | ||
1546 | |||
1547 | if (!cl) { /* new class */ | ||
1548 | struct Qdisc *new_q; | ||
1549 | /* check for valid classid */ | ||
1550 | if (!classid || TC_H_MAJ(classid^sch->handle) || htb_find(classid,sch)) | ||
1551 | goto failure; | ||
1552 | |||
1553 | /* check maximal depth */ | ||
1554 | if (parent && parent->parent && parent->parent->level < 2) { | ||
1555 | printk(KERN_ERR "htb: tree is too deep\n"); | ||
1556 | goto failure; | ||
1557 | } | ||
1558 | err = -ENOBUFS; | ||
1559 | if ((cl = kmalloc(sizeof(*cl), GFP_KERNEL)) == NULL) | ||
1560 | goto failure; | ||
1561 | |||
1562 | memset(cl, 0, sizeof(*cl)); | ||
1563 | cl->refcnt = 1; | ||
1564 | INIT_LIST_HEAD(&cl->sibling); | ||
1565 | INIT_LIST_HEAD(&cl->hlist); | ||
1566 | INIT_LIST_HEAD(&cl->children); | ||
1567 | INIT_LIST_HEAD(&cl->un.leaf.drop_list); | ||
1568 | #ifdef HTB_DEBUG | ||
1569 | cl->magic = HTB_CMAGIC; | ||
1570 | #endif | ||
1571 | |||
1572 | /* create the leaf qdisc early because it uses kmalloc(GFP_KERNEL) | ||
1573 | so it can't be used inside of sch_tree_lock | ||
1574 | -- thanks to Karlis Peisenieks */ | ||
1575 | new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); | ||
1576 | sch_tree_lock(sch); | ||
1577 | if (parent && !parent->level) { | ||
1578 | /* turn parent into inner node */ | ||
1579 | sch->q.qlen -= parent->un.leaf.q->q.qlen; | ||
1580 | qdisc_destroy (parent->un.leaf.q); | ||
1581 | if (parent->prio_activity) | ||
1582 | htb_deactivate (q,parent); | ||
1583 | |||
1584 | /* remove from evt list because of level change */ | ||
1585 | if (parent->cmode != HTB_CAN_SEND) { | ||
1586 | htb_safe_rb_erase(&parent->pq_node,q->wait_pq /*+0*/); | ||
1587 | parent->cmode = HTB_CAN_SEND; | ||
1588 | } | ||
1589 | parent->level = (parent->parent ? parent->parent->level | ||
1590 | : TC_HTB_MAXDEPTH) - 1; | ||
1591 | memset (&parent->un.inner,0,sizeof(parent->un.inner)); | ||
1592 | } | ||
1593 | /* leaf (we) needs elementary qdisc */ | ||
1594 | cl->un.leaf.q = new_q ? new_q : &noop_qdisc; | ||
1595 | |||
1596 | cl->classid = classid; cl->parent = parent; | ||
1597 | |||
1598 | /* set class to be in HTB_CAN_SEND state */ | ||
1599 | cl->tokens = hopt->buffer; | ||
1600 | cl->ctokens = hopt->cbuffer; | ||
1601 | cl->mbuffer = 60000000; /* 1min */ | ||
1602 | PSCHED_GET_TIME(cl->t_c); | ||
1603 | cl->cmode = HTB_CAN_SEND; | ||
1604 | |||
1605 | /* attach to the hash list and parent's family */ | ||
1606 | list_add_tail(&cl->hlist, q->hash+htb_hash(classid)); | ||
1607 | list_add_tail(&cl->sibling, parent ? &parent->children : &q->root); | ||
1608 | #ifdef HTB_DEBUG | ||
1609 | { | ||
1610 | int i; | ||
1611 | for (i = 0; i < TC_HTB_NUMPRIO; i++) cl->node[i].rb_color = -1; | ||
1612 | cl->pq_node.rb_color = -1; | ||
1613 | } | ||
1614 | #endif | ||
1615 | } else sch_tree_lock(sch); | ||
1616 | |||
1617 | /* there used to be a nasty bug here: we have to check that the node | ||
1618 | is really a leaf before changing cl->un.leaf ! */ | ||
1619 | if (!cl->level) { | ||
1620 | cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum; | ||
1621 | if (!hopt->quantum && cl->un.leaf.quantum < 1000) { | ||
1622 | printk(KERN_WARNING "HTB: quantum of class %X is small. Consider r2q change.\n", cl->classid); | ||
1623 | cl->un.leaf.quantum = 1000; | ||
1624 | } | ||
1625 | if (!hopt->quantum && cl->un.leaf.quantum > 200000) { | ||
1626 | printk(KERN_WARNING "HTB: quantum of class %X is big. Consider r2q change.\n", cl->classid); | ||
1627 | cl->un.leaf.quantum = 200000; | ||
1628 | } | ||
1629 | if (hopt->quantum) | ||
1630 | cl->un.leaf.quantum = hopt->quantum; | ||
1631 | if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO) | ||
1632 | cl->un.leaf.prio = TC_HTB_NUMPRIO - 1; | ||
1633 | } | ||
1634 | |||
1635 | cl->buffer = hopt->buffer; | ||
1636 | cl->cbuffer = hopt->cbuffer; | ||
1637 | if (cl->rate) qdisc_put_rtab(cl->rate); cl->rate = rtab; | ||
1638 | if (cl->ceil) qdisc_put_rtab(cl->ceil); cl->ceil = ctab; | ||
1639 | sch_tree_unlock(sch); | ||
1640 | |||
1641 | *arg = (unsigned long)cl; | ||
1642 | return 0; | ||
1643 | |||
1644 | failure: | ||
1645 | if (rtab) qdisc_put_rtab(rtab); | ||
1646 | if (ctab) qdisc_put_rtab(ctab); | ||
1647 | return err; | ||
1648 | } | ||
1649 | |||
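The quantum logic a few lines up: unless the user supplied an explicit quantum, it defaults to rate / rate2quantum (the r2q value given at qdisc creation) and is clamped to the 1000..200000 range, with a warning suggesting an r2q change. A quick arithmetic check of that rule; pick_quantum() is an illustrative mirror and the sample values assume the rate is expressed in bytes per second.

    #include <stdio.h>

    /* Mirror of the quantum selection in htb_change_class():
     * rate_Bps is the class rate in bytes/sec, r2q the qdisc's rate2quantum. */
    static int pick_quantum(unsigned int rate_Bps, int r2q, int user_quantum)
    {
            int quantum = rate_Bps / r2q;

            if (!user_quantum && quantum < 1000)
                    quantum = 1000;         /* "quantum ... is small" warning case */
            if (!user_quantum && quantum > 200000)
                    quantum = 200000;       /* "quantum ... is big" warning case */
            if (user_quantum)
                    quantum = user_quantum; /* explicit quantum always wins */
            return quantum;
    }

    int main(void)
    {
            printf("%d\n", pick_quantum(125000, 10, 0));    /* 125000/10 = 12500      */
            printf("%d\n", pick_quantum(8000, 10, 0));      /* 800 -> clamped to 1000 */
            printf("%d\n", pick_quantum(125000, 10, 3000)); /* explicit 3000 kept     */
            return 0;
    }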
1650 | static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg) | ||
1651 | { | ||
1652 | struct htb_sched *q = qdisc_priv(sch); | ||
1653 | struct htb_class *cl = (struct htb_class *)arg; | ||
1654 | struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list; | ||
1655 | HTB_DBG(0,2,"htb_tcf q=%p clid=%X fref=%d fl=%p\n",q,cl?cl->classid:0,cl?cl->filter_cnt:q->filter_cnt,*fl); | ||
1656 | return fl; | ||
1657 | } | ||
1658 | |||
1659 | static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent, | ||
1660 | u32 classid) | ||
1661 | { | ||
1662 | struct htb_sched *q = qdisc_priv(sch); | ||
1663 | struct htb_class *cl = htb_find (classid,sch); | ||
1664 | HTB_DBG(0,2,"htb_bind q=%p clid=%X cl=%p fref=%d\n",q,classid,cl,cl?cl->filter_cnt:q->filter_cnt); | ||
1665 | /*if (cl && !cl->level) return 0; | ||
1666 | The line above used to be there to prevent attaching filters to | ||
1667 | leaves. But at least the tc_index filter uses this just to look up | ||
1668 | the class for other reasons, so we have to allow it. | ||
1669 | ---- | ||
1670 | 19.6.2002 As Werner explained it is ok - binding a filter is just | ||
1671 | another way to "lock" the class - unlike "get", this lock can | ||
1672 | be broken by the class during destroy, IIUC. | ||
1673 | */ | ||
1674 | if (cl) | ||
1675 | cl->filter_cnt++; | ||
1676 | else | ||
1677 | q->filter_cnt++; | ||
1678 | return (unsigned long)cl; | ||
1679 | } | ||
1680 | |||
1681 | static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg) | ||
1682 | { | ||
1683 | struct htb_sched *q = qdisc_priv(sch); | ||
1684 | struct htb_class *cl = (struct htb_class *)arg; | ||
1685 | HTB_DBG(0,2,"htb_unbind q=%p cl=%p fref=%d\n",q,cl,cl?cl->filter_cnt:q->filter_cnt); | ||
1686 | if (cl) | ||
1687 | cl->filter_cnt--; | ||
1688 | else | ||
1689 | q->filter_cnt--; | ||
1690 | } | ||
1691 | |||
1692 | static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) | ||
1693 | { | ||
1694 | struct htb_sched *q = qdisc_priv(sch); | ||
1695 | int i; | ||
1696 | |||
1697 | if (arg->stop) | ||
1698 | return; | ||
1699 | |||
1700 | for (i = 0; i < HTB_HSIZE; i++) { | ||
1701 | struct list_head *p; | ||
1702 | list_for_each (p,q->hash+i) { | ||
1703 | struct htb_class *cl = list_entry(p,struct htb_class,hlist); | ||
1704 | if (arg->count < arg->skip) { | ||
1705 | arg->count++; | ||
1706 | continue; | ||
1707 | } | ||
1708 | if (arg->fn(sch, (unsigned long)cl, arg) < 0) { | ||
1709 | arg->stop = 1; | ||
1710 | return; | ||
1711 | } | ||
1712 | arg->count++; | ||
1713 | } | ||
1714 | } | ||
1715 | } | ||
1716 | |||
1717 | static struct Qdisc_class_ops htb_class_ops = { | ||
1718 | .graft = htb_graft, | ||
1719 | .leaf = htb_leaf, | ||
1720 | .get = htb_get, | ||
1721 | .put = htb_put, | ||
1722 | .change = htb_change_class, | ||
1723 | .delete = htb_delete, | ||
1724 | .walk = htb_walk, | ||
1725 | .tcf_chain = htb_find_tcf, | ||
1726 | .bind_tcf = htb_bind_filter, | ||
1727 | .unbind_tcf = htb_unbind_filter, | ||
1728 | .dump = htb_dump_class, | ||
1729 | .dump_stats = htb_dump_class_stats, | ||
1730 | }; | ||
1731 | |||
1732 | static struct Qdisc_ops htb_qdisc_ops = { | ||
1733 | .next = NULL, | ||
1734 | .cl_ops = &htb_class_ops, | ||
1735 | .id = "htb", | ||
1736 | .priv_size = sizeof(struct htb_sched), | ||
1737 | .enqueue = htb_enqueue, | ||
1738 | .dequeue = htb_dequeue, | ||
1739 | .requeue = htb_requeue, | ||
1740 | .drop = htb_drop, | ||
1741 | .init = htb_init, | ||
1742 | .reset = htb_reset, | ||
1743 | .destroy = htb_destroy, | ||
1744 | .change = NULL /* htb_change */, | ||
1745 | .dump = htb_dump, | ||
1746 | .owner = THIS_MODULE, | ||
1747 | }; | ||
1748 | |||
1749 | static int __init htb_module_init(void) | ||
1750 | { | ||
1751 | return register_qdisc(&htb_qdisc_ops); | ||
1752 | } | ||
1753 | static void __exit htb_module_exit(void) | ||
1754 | { | ||
1755 | unregister_qdisc(&htb_qdisc_ops); | ||
1756 | } | ||
1757 | module_init(htb_module_init) | ||
1758 | module_exit(htb_module_exit) | ||
1759 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c new file mode 100644 index 000000000000..8edc32a6ad2f --- /dev/null +++ b/net/sched/sch_ingress.c | |||
@@ -0,0 +1,436 @@ | |||
1 | /* net/sched/sch_ingress.c - Ingress qdisc | ||
2 | * This program is free software; you can redistribute it and/or | ||
3 | * modify it under the terms of the GNU General Public License | ||
4 | * as published by the Free Software Foundation; either version | ||
5 | * 2 of the License, or (at your option) any later version. | ||
6 | * | ||
7 | * Authors: Jamal Hadi Salim 1999 | ||
8 | */ | ||
9 | |||
10 | #include <linux/config.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/types.h> | ||
13 | #include <linux/skbuff.h> | ||
14 | #include <linux/netdevice.h> | ||
15 | #include <linux/rtnetlink.h> | ||
16 | #include <linux/netfilter_ipv4.h> | ||
17 | #include <linux/netfilter_ipv6.h> | ||
18 | #include <linux/netfilter.h> | ||
19 | #include <linux/smp.h> | ||
20 | #include <net/pkt_sched.h> | ||
21 | #include <asm/byteorder.h> | ||
22 | #include <asm/uaccess.h> | ||
23 | #include <linux/kmod.h> | ||
24 | #include <linux/stat.h> | ||
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/list.h> | ||
27 | |||
28 | |||
29 | #undef DEBUG_INGRESS | ||
30 | |||
31 | #ifdef DEBUG_INGRESS /* control */ | ||
32 | #define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) | ||
33 | #else | ||
34 | #define DPRINTK(format,args...) | ||
35 | #endif | ||
36 | |||
37 | #if 0 /* data */ | ||
38 | #define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) | ||
39 | #else | ||
40 | #define D2PRINTK(format,args...) | ||
41 | #endif | ||
42 | |||
43 | |||
44 | #define PRIV(sch) qdisc_priv(sch) | ||
45 | |||
46 | |||
47 | /* Thanks to Doron Oz for this hack | ||
48 | */ | ||
49 | #ifndef CONFIG_NET_CLS_ACT | ||
50 | #ifdef CONFIG_NETFILTER | ||
51 | static int nf_registered; | ||
52 | #endif | ||
53 | #endif | ||
54 | |||
55 | struct ingress_qdisc_data { | ||
56 | struct Qdisc *q; | ||
57 | struct tcf_proto *filter_list; | ||
58 | }; | ||
59 | |||
60 | |||
61 | /* ------------------------- Class/flow operations ------------------------- */ | ||
62 | |||
63 | |||
64 | static int ingress_graft(struct Qdisc *sch,unsigned long arg, | ||
65 | struct Qdisc *new,struct Qdisc **old) | ||
66 | { | ||
67 | #ifdef DEBUG_INGRESS | ||
68 | struct ingress_qdisc_data *p = PRIV(sch); | ||
69 | #endif | ||
70 | |||
71 | DPRINTK("ingress_graft(sch %p,[qdisc %p],new %p,old %p)\n", | ||
72 | sch, p, new, old); | ||
73 | DPRINTK("\n ingress_graft: You cannot add qdiscs to classes"); | ||
74 | return 1; | ||
75 | } | ||
76 | |||
77 | |||
78 | static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) | ||
79 | { | ||
80 | return NULL; | ||
81 | } | ||
82 | |||
83 | |||
84 | static unsigned long ingress_get(struct Qdisc *sch,u32 classid) | ||
85 | { | ||
86 | #ifdef DEBUG_INGRESS | ||
87 | struct ingress_qdisc_data *p = PRIV(sch); | ||
88 | #endif | ||
89 | DPRINTK("ingress_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid); | ||
90 | return TC_H_MIN(classid) + 1; | ||
91 | } | ||
92 | |||
93 | |||
94 | static unsigned long ingress_bind_filter(struct Qdisc *sch, | ||
95 | unsigned long parent, u32 classid) | ||
96 | { | ||
97 | return ingress_get(sch, classid); | ||
98 | } | ||
99 | |||
100 | |||
101 | static void ingress_put(struct Qdisc *sch, unsigned long cl) | ||
102 | { | ||
103 | } | ||
104 | |||
105 | |||
106 | static int ingress_change(struct Qdisc *sch, u32 classid, u32 parent, | ||
107 | struct rtattr **tca, unsigned long *arg) | ||
108 | { | ||
109 | #ifdef DEBUG_INGRESS | ||
110 | struct ingress_qdisc_data *p = PRIV(sch); | ||
111 | #endif | ||
112 | DPRINTK("ingress_change(sch %p,[qdisc %p],classid %x,parent %x)," | ||
113 | "arg 0x%lx\n", sch, p, classid, parent, *arg); | ||
114 | DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment"); | ||
115 | return 0; | ||
116 | } | ||
117 | |||
118 | |||
119 | |||
120 | static void ingress_walk(struct Qdisc *sch,struct qdisc_walker *walker) | ||
121 | { | ||
122 | #ifdef DEBUG_INGRESS | ||
123 | struct ingress_qdisc_data *p = PRIV(sch); | ||
124 | #endif | ||
125 | DPRINTK("ingress_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); | ||
126 | DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment"); | ||
127 | } | ||
128 | |||
129 | |||
130 | static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch,unsigned long cl) | ||
131 | { | ||
132 | struct ingress_qdisc_data *p = PRIV(sch); | ||
133 | |||
134 | return &p->filter_list; | ||
135 | } | ||
136 | |||
137 | |||
138 | /* --------------------------- Qdisc operations ---------------------------- */ | ||
139 | |||
140 | |||
141 | static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch) | ||
142 | { | ||
143 | struct ingress_qdisc_data *p = PRIV(sch); | ||
144 | struct tcf_result res; | ||
145 | int result; | ||
146 | |||
147 | D2PRINTK("ingress_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); | ||
148 | result = tc_classify(skb, p->filter_list, &res); | ||
149 | D2PRINTK("result %d class 0x%04x\n", result, res.classid); | ||
150 | /* | ||
151 | * Unlike normal "enqueue" functions, ingress_enqueue returns a | ||
152 | * firewall FW_* code. | ||
153 | */ | ||
154 | #ifdef CONFIG_NET_CLS_ACT | ||
155 | sch->bstats.packets++; | ||
156 | sch->bstats.bytes += skb->len; | ||
157 | switch (result) { | ||
158 | case TC_ACT_SHOT: | ||
159 | result = TC_ACT_SHOT; | ||
160 | sch->qstats.drops++; | ||
161 | break; | ||
162 | case TC_ACT_STOLEN: | ||
163 | case TC_ACT_QUEUED: | ||
164 | result = TC_ACT_STOLEN; | ||
165 | break; | ||
166 | case TC_ACT_RECLASSIFY: | ||
167 | case TC_ACT_OK: | ||
168 | case TC_ACT_UNSPEC: | ||
169 | default: | ||
170 | skb->tc_index = TC_H_MIN(res.classid); | ||
171 | result = TC_ACT_OK; | ||
172 | break; | ||
173 | }; | ||
174 | /* backward compat */ | ||
175 | #else | ||
176 | #ifdef CONFIG_NET_CLS_POLICE | ||
177 | switch (result) { | ||
178 | case TC_POLICE_SHOT: | ||
179 | result = NF_DROP; | ||
180 | sch->qstats.drops++; | ||
181 | break; | ||
182 | case TC_POLICE_RECLASSIFY: /* DSCP remarking here ? */ | ||
183 | case TC_POLICE_OK: | ||
184 | case TC_POLICE_UNSPEC: | ||
185 | default: | ||
186 | sch->bstats.packets++; | ||
187 | sch->bstats.bytes += skb->len; | ||
188 | result = NF_ACCEPT; | ||
189 | break; | ||
190 | }; | ||
191 | |||
192 | #else | ||
193 | D2PRINTK("Overriding result to ACCEPT\n"); | ||
194 | result = NF_ACCEPT; | ||
195 | sch->bstats.packets++; | ||
196 | sch->bstats.bytes += skb->len; | ||
197 | #endif | ||
198 | #endif | ||
199 | |||
200 | return result; | ||
201 | } | ||
202 | |||
203 | |||
204 | static struct sk_buff *ingress_dequeue(struct Qdisc *sch) | ||
205 | { | ||
206 | /* | ||
207 | struct ingress_qdisc_data *p = PRIV(sch); | ||
208 | D2PRINTK("ingress_dequeue(sch %p,[qdisc %p])\n",sch,PRIV(p)); | ||
209 | */ | ||
210 | return NULL; | ||
211 | } | ||
212 | |||
213 | |||
214 | static int ingress_requeue(struct sk_buff *skb,struct Qdisc *sch) | ||
215 | { | ||
216 | /* | ||
217 | struct ingress_qdisc_data *p = PRIV(sch); | ||
218 | D2PRINTK("ingress_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,PRIV(p)); | ||
219 | */ | ||
220 | return 0; | ||
221 | } | ||
222 | |||
223 | static unsigned int ingress_drop(struct Qdisc *sch) | ||
224 | { | ||
225 | #ifdef DEBUG_INGRESS | ||
226 | struct ingress_qdisc_data *p = PRIV(sch); | ||
227 | #endif | ||
228 | DPRINTK("ingress_drop(sch %p,[qdisc %p])\n", sch, p); | ||
229 | return 0; | ||
230 | } | ||
231 | |||
232 | #ifndef CONFIG_NET_CLS_ACT | ||
233 | #ifdef CONFIG_NETFILTER | ||
234 | static unsigned int | ||
235 | ing_hook(unsigned int hook, struct sk_buff **pskb, | ||
236 | const struct net_device *indev, | ||
237 | const struct net_device *outdev, | ||
238 | int (*okfn)(struct sk_buff *)) | ||
239 | { | ||
240 | |||
241 | struct Qdisc *q; | ||
242 | struct sk_buff *skb = *pskb; | ||
243 | struct net_device *dev = skb->dev; | ||
244 | int fwres=NF_ACCEPT; | ||
245 | |||
246 | DPRINTK("ing_hook: skb %s dev=%s len=%u\n", | ||
247 | skb->sk ? "(owned)" : "(unowned)", | ||
248 | skb->dev ? (*pskb)->dev->name : "(no dev)", | ||
249 | skb->len); | ||
250 | |||
251 | /* | ||
252 | revisit later: use a private lock, since dev->queue_lock is also | ||
253 | used on the egress (might slow things by an iota) | ||
254 | */ | ||
255 | |||
256 | if (dev->qdisc_ingress) { | ||
257 | spin_lock(&dev->queue_lock); | ||
258 | if ((q = dev->qdisc_ingress) != NULL) | ||
259 | fwres = q->enqueue(skb, q); | ||
260 | spin_unlock(&dev->queue_lock); | ||
261 | } | ||
262 | |||
263 | return fwres; | ||
264 | } | ||
265 | |||
266 | /* after ipt_filter */ | ||
267 | static struct nf_hook_ops ing_ops = { | ||
268 | .hook = ing_hook, | ||
269 | .owner = THIS_MODULE, | ||
270 | .pf = PF_INET, | ||
271 | .hooknum = NF_IP_PRE_ROUTING, | ||
272 | .priority = NF_IP_PRI_FILTER + 1, | ||
273 | }; | ||
274 | |||
275 | static struct nf_hook_ops ing6_ops = { | ||
276 | .hook = ing_hook, | ||
277 | .owner = THIS_MODULE, | ||
278 | .pf = PF_INET6, | ||
279 | .hooknum = NF_IP6_PRE_ROUTING, | ||
280 | .priority = NF_IP6_PRI_FILTER + 1, | ||
281 | }; | ||
282 | |||
283 | #endif | ||
284 | #endif | ||
285 | |||
286 | static int ingress_init(struct Qdisc *sch,struct rtattr *opt) | ||
287 | { | ||
288 | struct ingress_qdisc_data *p = PRIV(sch); | ||
289 | |||
290 | /* Make sure either netfilter or preferably CLS_ACT is | ||
291 | * compiled in */ | ||
292 | #ifndef CONFIG_NET_CLS_ACT | ||
293 | #ifndef CONFIG_NETFILTER | ||
294 | printk("You MUST compile classifier actions into the kernel\n"); | ||
295 | return -EINVAL; | ||
296 | #else | ||
297 | printk("Ingress scheduler: Classifier actions prefered over netfilter\n"); | ||
298 | #endif | ||
299 | #endif | ||
300 | |||
301 | #ifndef CONFIG_NET_CLS_ACT | ||
302 | #ifdef CONFIG_NETFILTER | ||
303 | if (!nf_registered) { | ||
304 | if (nf_register_hook(&ing_ops) < 0) { | ||
305 | printk("ingress qdisc registration error \n"); | ||
306 | return -EINVAL; | ||
307 | } | ||
308 | nf_registered++; | ||
309 | |||
310 | if (nf_register_hook(&ing6_ops) < 0) { | ||
311 | printk("IPv6 ingress qdisc registration error, " \ | ||
312 | "disabling IPv6 support.\n"); | ||
313 | } else | ||
314 | nf_registered++; | ||
315 | } | ||
316 | #endif | ||
317 | #endif | ||
318 | |||
319 | DPRINTK("ingress_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); | ||
320 | p->q = &noop_qdisc; | ||
321 | return 0; | ||
322 | } | ||
323 | |||
324 | |||
325 | static void ingress_reset(struct Qdisc *sch) | ||
326 | { | ||
327 | struct ingress_qdisc_data *p = PRIV(sch); | ||
328 | |||
329 | DPRINTK("ingress_reset(sch %p,[qdisc %p])\n", sch, p); | ||
330 | |||
331 | /* | ||
332 | #if 0 | ||
333 | */ | ||
334 | /* for future use */ | ||
335 | qdisc_reset(p->q); | ||
336 | /* | ||
337 | #endif | ||
338 | */ | ||
339 | } | ||
340 | |||
341 | /* ------------------------------------------------------------- */ | ||
342 | |||
343 | |||
344 | /* ------------------------------------------------------------- */ | ||
345 | |||
346 | static void ingress_destroy(struct Qdisc *sch) | ||
347 | { | ||
348 | struct ingress_qdisc_data *p = PRIV(sch); | ||
349 | struct tcf_proto *tp; | ||
350 | |||
351 | DPRINTK("ingress_destroy(sch %p,[qdisc %p])\n", sch, p); | ||
352 | while (p->filter_list) { | ||
353 | tp = p->filter_list; | ||
354 | p->filter_list = tp->next; | ||
355 | tcf_destroy(tp); | ||
356 | } | ||
357 | #if 0 | ||
358 | /* for future use */ | ||
359 | qdisc_destroy(p->q); | ||
360 | #endif | ||
361 | } | ||
362 | |||
363 | |||
364 | static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) | ||
365 | { | ||
366 | unsigned char *b = skb->tail; | ||
367 | struct rtattr *rta; | ||
368 | |||
369 | rta = (struct rtattr *) b; | ||
370 | RTA_PUT(skb, TCA_OPTIONS, 0, NULL); | ||
371 | rta->rta_len = skb->tail - b; | ||
372 | return skb->len; | ||
373 | |||
374 | rtattr_failure: | ||
375 | skb_trim(skb, b - skb->data); | ||
376 | return -1; | ||
377 | } | ||
378 | |||
379 | static struct Qdisc_class_ops ingress_class_ops = { | ||
380 | .graft = ingress_graft, | ||
381 | .leaf = ingress_leaf, | ||
382 | .get = ingress_get, | ||
383 | .put = ingress_put, | ||
384 | .change = ingress_change, | ||
385 | .delete = NULL, | ||
386 | .walk = ingress_walk, | ||
387 | .tcf_chain = ingress_find_tcf, | ||
388 | .bind_tcf = ingress_bind_filter, | ||
389 | .unbind_tcf = ingress_put, | ||
390 | .dump = NULL, | ||
391 | }; | ||
392 | |||
393 | static struct Qdisc_ops ingress_qdisc_ops = { | ||
394 | .next = NULL, | ||
395 | .cl_ops = &ingress_class_ops, | ||
396 | .id = "ingress", | ||
397 | .priv_size = sizeof(struct ingress_qdisc_data), | ||
398 | .enqueue = ingress_enqueue, | ||
399 | .dequeue = ingress_dequeue, | ||
400 | .requeue = ingress_requeue, | ||
401 | .drop = ingress_drop, | ||
402 | .init = ingress_init, | ||
403 | .reset = ingress_reset, | ||
404 | .destroy = ingress_destroy, | ||
405 | .change = NULL, | ||
406 | .dump = ingress_dump, | ||
407 | .owner = THIS_MODULE, | ||
408 | }; | ||
409 | |||
410 | static int __init ingress_module_init(void) | ||
411 | { | ||
412 | int ret = 0; | ||
413 | |||
414 | if ((ret = register_qdisc(&ingress_qdisc_ops)) < 0) { | ||
415 | printk("Unable to register Ingress qdisc\n"); | ||
416 | return ret; | ||
417 | } | ||
418 | |||
419 | return ret; | ||
420 | } | ||
421 | static void __exit ingress_module_exit(void) | ||
422 | { | ||
423 | unregister_qdisc(&ingress_qdisc_ops); | ||
424 | #ifndef CONFIG_NET_CLS_ACT | ||
425 | #ifdef CONFIG_NETFILTER | ||
426 | if (nf_registered) { | ||
427 | nf_unregister_hook(&ing_ops); | ||
428 | if (nf_registered > 1) | ||
429 | nf_unregister_hook(&ing6_ops); | ||
430 | } | ||
431 | #endif | ||
432 | #endif | ||
433 | } | ||
434 | module_init(ingress_module_init) | ||
435 | module_exit(ingress_module_exit) | ||
436 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c new file mode 100644 index 000000000000..31c29deb139d --- /dev/null +++ b/net/sched/sch_netem.c | |||
@@ -0,0 +1,598 @@ | |||
1 | /* | ||
2 | * net/sched/sch_netem.c Network emulator | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Many of the algorithms and ideas for this came from | ||
10 | * NIST Net which is not copyrighted. | ||
11 | * | ||
12 | * Authors: Stephen Hemminger <shemminger@osdl.org> | ||
13 | * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro> | ||
14 | */ | ||
15 | |||
16 | #include <linux/config.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/bitops.h> | ||
19 | #include <linux/types.h> | ||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/errno.h> | ||
22 | #include <linux/netdevice.h> | ||
23 | #include <linux/skbuff.h> | ||
24 | #include <linux/rtnetlink.h> | ||
25 | |||
26 | #include <net/pkt_sched.h> | ||
27 | |||
28 | /* Network Emulation Queuing algorithm. | ||
29 | ==================================== | ||
30 | |||
31 | Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based | ||
32 | Network Emulation Tool | ||
33 | [2] Luigi Rizzo, DummyNet for FreeBSD | ||
34 | |||
35 | ---------------------------------------------------------------- | ||
36 | |||
37 | This started out as a simple way to delay outgoing packets to | ||
38 | test TCP but has grown to include most of the functionality | ||
39 | of a full blown network emulator like NISTnet. It can delay | ||
40 | packets and add random jitter (and correlation). The random | ||
41 | distribution can be loaded from a table as well to provide | ||
42 | normal, Pareto, or experimental curves. Packet loss, | ||
43 | duplication, and reordering can also be emulated. | ||
44 | |||
45 | This qdisc does not do classification; that can be handled by | ||
46 | layering other disciplines. It does not need to do bandwidth | ||
47 | control either, since that can be handled by using a token | ||
48 | bucket or other rate control. | ||
49 | |||
50 | The simulator is limited by the Linux timer resolution | ||
51 | and will create packet bursts on the HZ boundary (1ms). | ||
52 | */ | ||
53 | |||
54 | struct netem_sched_data { | ||
55 | struct Qdisc *qdisc; | ||
56 | struct sk_buff_head delayed; | ||
57 | struct timer_list timer; | ||
58 | |||
59 | u32 latency; | ||
60 | u32 loss; | ||
61 | u32 limit; | ||
62 | u32 counter; | ||
63 | u32 gap; | ||
64 | u32 jitter; | ||
65 | u32 duplicate; | ||
66 | |||
67 | struct crndstate { | ||
68 | unsigned long last; | ||
69 | unsigned long rho; | ||
70 | } delay_cor, loss_cor, dup_cor; | ||
71 | |||
72 | struct disttable { | ||
73 | u32 size; | ||
74 | s16 table[0]; | ||
75 | } *delay_dist; | ||
76 | }; | ||
77 | |||
78 | /* Time stamp put into socket buffer control block */ | ||
79 | struct netem_skb_cb { | ||
80 | psched_time_t time_to_send; | ||
81 | }; | ||
82 | |||
83 | /* init_crandom - initialize correlated random number generator | ||
84 | * Use entropy source for initial seed. | ||
85 | */ | ||
86 | static void init_crandom(struct crndstate *state, unsigned long rho) | ||
87 | { | ||
88 | state->rho = rho; | ||
89 | state->last = net_random(); | ||
90 | } | ||
91 | |||
92 | /* get_crandom - correlated random number generator | ||
93 | * Next number depends on last value. | ||
94 | * rho is scaled to avoid floating point. | ||
95 | */ | ||
96 | static unsigned long get_crandom(struct crndstate *state) | ||
97 | { | ||
98 | u64 value, rho; | ||
99 | unsigned long answer; | ||
100 | |||
101 | if (state->rho == 0) /* no correlation */ | ||
102 | return net_random(); | ||
103 | |||
104 | value = net_random(); | ||
105 | rho = (u64)state->rho + 1; | ||
106 | answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32; | ||
107 | state->last = answer; | ||
108 | return answer; | ||
109 | } | ||
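The blend in get_crandom() is just a fixed-point weighted average of the new uniform sample and the previous output, with weight rho/2^32; that is the whole correlation mechanism, with no floating point needed. A small standalone sketch under that reading (rand() stands in for net_random() and is an assumption for illustration only):

#include <stdint.h>
#include <stdlib.h>

struct crnd { uint32_t last, rho; };

/* same fixed-point blend as get_crandom(): rho == 0 means uncorrelated */
static uint32_t crandom_next(struct crnd *s)
{
        uint64_t value, rho;
        uint32_t out;

        if (s->rho == 0)                        /* no correlation */
                return (uint32_t)rand();        /* stand-in for net_random() */

        value = (uint32_t)rand();
        rho = (uint64_t)s->rho + 1;
        out = (uint32_t)((value * ((1ULL << 32) - rho) +
                          (uint64_t)s->last * rho) >> 32);
        s->last = out;
        return out;
}

With rho near 2^32 each output stays close to the previous one; with rho == 0 the sequence degenerates to plain uniform noise, matching the "no correlation" case above.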
110 | |||
111 | /* tabledist - return a pseudo-randomly distributed value with mean mu and | ||
112 | * std deviation sigma. Uses table lookup to approximate the desired | ||
113 | * distribution, and a uniformly-distributed pseudo-random source. | ||
114 | */ | ||
115 | static long tabledist(unsigned long mu, long sigma, | ||
116 | struct crndstate *state, const struct disttable *dist) | ||
117 | { | ||
118 | long t, x; | ||
119 | unsigned long rnd; | ||
120 | |||
121 | if (sigma == 0) | ||
122 | return mu; | ||
123 | |||
124 | rnd = get_crandom(state); | ||
125 | |||
126 | /* default uniform distribution */ | ||
127 | if (dist == NULL) | ||
128 | return (rnd % (2*sigma)) - sigma + mu; | ||
129 | |||
130 | t = dist->table[rnd % dist->size]; | ||
131 | x = (sigma % NETEM_DIST_SCALE) * t; | ||
132 | if (x >= 0) | ||
133 | x += NETEM_DIST_SCALE/2; | ||
134 | else | ||
135 | x -= NETEM_DIST_SCALE/2; | ||
136 | |||
137 | return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; | ||
138 | } | ||
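As a concrete reading of tabledist(): with mu = 100000 us, sigma = 10000 us and no distribution table loaded, the value (rnd % (2*sigma)) - sigma + mu is spread uniformly over [90000, 110000) us, i.e. 100 ms of delay with +/- 10 ms of jitter; when a table is loaded, the uniform sample only indexes the table and sigma scales the signed table entry around mu instead. (The example numbers are an illustrative assumption, not taken from a real configuration.)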
139 | |||
140 | /* Put skb in the private delayed queue. */ | ||
141 | static int delay_skb(struct Qdisc *sch, struct sk_buff *skb) | ||
142 | { | ||
143 | struct netem_sched_data *q = qdisc_priv(sch); | ||
144 | struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb; | ||
145 | psched_tdiff_t td; | ||
146 | psched_time_t now; | ||
147 | |||
148 | PSCHED_GET_TIME(now); | ||
149 | td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist); | ||
150 | PSCHED_TADD2(now, td, cb->time_to_send); | ||
151 | |||
152 | /* Always queue at tail to keep packets in order */ | ||
153 | if (likely(q->delayed.qlen < q->limit)) { | ||
154 | __skb_queue_tail(&q->delayed, skb); | ||
155 | if (!timer_pending(&q->timer)) { | ||
156 | q->timer.expires = jiffies + PSCHED_US2JIFFIE(td); | ||
157 | add_timer(&q->timer); | ||
158 | } | ||
159 | return NET_XMIT_SUCCESS; | ||
160 | } | ||
161 | |||
162 | kfree_skb(skb); | ||
163 | return NET_XMIT_DROP; | ||
164 | } | ||
165 | |||
166 | static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) | ||
167 | { | ||
168 | struct netem_sched_data *q = qdisc_priv(sch); | ||
169 | struct sk_buff *skb2; | ||
170 | int ret; | ||
171 | |||
172 | pr_debug("netem_enqueue skb=%p @%lu\n", skb, jiffies); | ||
173 | |||
174 | /* Random packet drop 0 => none, ~0 => all */ | ||
175 | if (q->loss && q->loss >= get_crandom(&q->loss_cor)) { | ||
176 | pr_debug("netem_enqueue: random loss\n"); | ||
177 | sch->qstats.drops++; | ||
178 | kfree_skb(skb); | ||
179 | return 0; /* lie about loss so TCP doesn't know */ | ||
180 | } | ||
181 | |||
182 | /* Random duplication */ | ||
183 | if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor) | ||
184 | && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { | ||
185 | pr_debug("netem_enqueue: dup %p\n", skb2); | ||
186 | |||
187 | if (delay_skb(sch, skb2)) { | ||
188 | sch->q.qlen++; | ||
189 | sch->bstats.bytes += skb2->len; | ||
190 | sch->bstats.packets++; | ||
191 | } else | ||
192 | sch->qstats.drops++; | ||
193 | } | ||
194 | |||
195 | /* If doing simple delay then gap == 0, so all packets | ||
196 | * go into the delayed holding queue; | ||
197 | * otherwise, if doing out-of-order delivery, only "1 out of gap" | ||
198 | * packets will be delayed. | ||
199 | */ | ||
200 | if (q->counter < q->gap) { | ||
201 | ++q->counter; | ||
202 | ret = q->qdisc->enqueue(skb, q->qdisc); | ||
203 | } else { | ||
204 | q->counter = 0; | ||
205 | ret = delay_skb(sch, skb); | ||
206 | } | ||
207 | |||
208 | if (likely(ret == NET_XMIT_SUCCESS)) { | ||
209 | sch->q.qlen++; | ||
210 | sch->bstats.bytes += skb->len; | ||
211 | sch->bstats.packets++; | ||
212 | } else | ||
213 | sch->qstats.drops++; | ||
214 | |||
215 | return ret; | ||
216 | } | ||
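To make the gap logic above concrete: with gap == 0 the test q->counter < q->gap is never true, so every packet takes the delay_skb() path; with gap == 4 (an illustrative value), packets 1 to 4 are handed straight to the inner qdisc, only the 5th is held in the delayed queue, and the counter then resets - that held-back packet is what produces the reordering.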
217 | |||
218 | /* Requeue packets but don't change time stamp */ | ||
219 | static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch) | ||
220 | { | ||
221 | struct netem_sched_data *q = qdisc_priv(sch); | ||
222 | int ret; | ||
223 | |||
224 | if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) { | ||
225 | sch->q.qlen++; | ||
226 | sch->qstats.requeues++; | ||
227 | } | ||
228 | |||
229 | return ret; | ||
230 | } | ||
231 | |||
232 | static unsigned int netem_drop(struct Qdisc* sch) | ||
233 | { | ||
234 | struct netem_sched_data *q = qdisc_priv(sch); | ||
235 | unsigned int len; | ||
236 | |||
237 | if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) { | ||
238 | sch->q.qlen--; | ||
239 | sch->qstats.drops++; | ||
240 | } | ||
241 | return len; | ||
242 | } | ||
243 | |||
244 | /* Dequeue packet. | ||
245 | * Move all packets that are ready to send from the delay holding | ||
246 | * list to the underlying qdisc, then just call dequeue | ||
247 | */ | ||
248 | static struct sk_buff *netem_dequeue(struct Qdisc *sch) | ||
249 | { | ||
250 | struct netem_sched_data *q = qdisc_priv(sch); | ||
251 | struct sk_buff *skb; | ||
252 | |||
253 | skb = q->qdisc->dequeue(q->qdisc); | ||
254 | if (skb) | ||
255 | sch->q.qlen--; | ||
256 | return skb; | ||
257 | } | ||
258 | |||
259 | static void netem_watchdog(unsigned long arg) | ||
260 | { | ||
261 | struct Qdisc *sch = (struct Qdisc *)arg; | ||
262 | struct netem_sched_data *q = qdisc_priv(sch); | ||
263 | struct net_device *dev = sch->dev; | ||
264 | struct sk_buff *skb; | ||
265 | psched_time_t now; | ||
266 | |||
267 | pr_debug("netem_watchdog: fired @%lu\n", jiffies); | ||
268 | |||
269 | spin_lock_bh(&dev->queue_lock); | ||
270 | PSCHED_GET_TIME(now); | ||
271 | |||
272 | while ((skb = skb_peek(&q->delayed)) != NULL) { | ||
273 | const struct netem_skb_cb *cb | ||
274 | = (const struct netem_skb_cb *)skb->cb; | ||
275 | long delay | ||
276 | = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now)); | ||
277 | pr_debug("netem_watchdog: skb %p@%lu %ld\n", | ||
278 | skb, jiffies, delay); | ||
279 | |||
280 | /* is there more time remaining? */ | ||
281 | if (delay > 0) { | ||
282 | mod_timer(&q->timer, jiffies + delay); | ||
283 | break; | ||
284 | } | ||
285 | __skb_unlink(skb, &q->delayed); | ||
286 | |||
287 | if (q->qdisc->enqueue(skb, q->qdisc)) { | ||
288 | sch->q.qlen--; | ||
289 | sch->qstats.drops++; | ||
290 | } | ||
291 | } | ||
292 | qdisc_run(dev); | ||
293 | spin_unlock_bh(&dev->queue_lock); | ||
294 | } | ||
295 | |||
296 | static void netem_reset(struct Qdisc *sch) | ||
297 | { | ||
298 | struct netem_sched_data *q = qdisc_priv(sch); | ||
299 | |||
300 | qdisc_reset(q->qdisc); | ||
301 | skb_queue_purge(&q->delayed); | ||
302 | |||
303 | sch->q.qlen = 0; | ||
304 | del_timer_sync(&q->timer); | ||
305 | } | ||
306 | |||
307 | static int set_fifo_limit(struct Qdisc *q, int limit) | ||
308 | { | ||
309 | struct rtattr *rta; | ||
310 | int ret = -ENOMEM; | ||
311 | |||
312 | rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); | ||
313 | if (rta) { | ||
314 | rta->rta_type = RTM_NEWQDISC; | ||
315 | rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt)); | ||
316 | ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit; | ||
317 | |||
318 | ret = q->ops->change(q, rta); | ||
319 | kfree(rta); | ||
320 | } | ||
321 | return ret; | ||
322 | } | ||
323 | |||
324 | /* | ||
325 | * Distribution data is a variable size payload containing | ||
326 | * signed 16 bit values. | ||
327 | */ | ||
328 | static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr) | ||
329 | { | ||
330 | struct netem_sched_data *q = qdisc_priv(sch); | ||
331 | unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16); | ||
332 | const __s16 *data = RTA_DATA(attr); | ||
333 | struct disttable *d; | ||
334 | int i; | ||
335 | |||
336 | if (n > 65536) | ||
337 | return -EINVAL; | ||
338 | |||
339 | d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL); | ||
340 | if (!d) | ||
341 | return -ENOMEM; | ||
342 | |||
343 | d->size = n; | ||
344 | for (i = 0; i < n; i++) | ||
345 | d->table[i] = data[i]; | ||
346 | |||
347 | spin_lock_bh(&sch->dev->queue_lock); | ||
348 | d = xchg(&q->delay_dist, d); | ||
349 | spin_unlock_bh(&sch->dev->queue_lock); | ||
350 | |||
351 | kfree(d); | ||
352 | return 0; | ||
353 | } | ||
354 | |||
355 | static int get_correlation(struct Qdisc *sch, const struct rtattr *attr) | ||
356 | { | ||
357 | struct netem_sched_data *q = qdisc_priv(sch); | ||
358 | const struct tc_netem_corr *c = RTA_DATA(attr); | ||
359 | |||
360 | if (RTA_PAYLOAD(attr) != sizeof(*c)) | ||
361 | return -EINVAL; | ||
362 | |||
363 | init_crandom(&q->delay_cor, c->delay_corr); | ||
364 | init_crandom(&q->loss_cor, c->loss_corr); | ||
365 | init_crandom(&q->dup_cor, c->dup_corr); | ||
366 | return 0; | ||
367 | } | ||
368 | |||
369 | static int netem_change(struct Qdisc *sch, struct rtattr *opt) | ||
370 | { | ||
371 | struct netem_sched_data *q = qdisc_priv(sch); | ||
372 | struct tc_netem_qopt *qopt; | ||
373 | int ret; | ||
374 | |||
375 | if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) | ||
376 | return -EINVAL; | ||
377 | |||
378 | qopt = RTA_DATA(opt); | ||
379 | ret = set_fifo_limit(q->qdisc, qopt->limit); | ||
380 | if (ret) { | ||
381 | pr_debug("netem: can't set fifo limit\n"); | ||
382 | return ret; | ||
383 | } | ||
384 | |||
385 | q->latency = qopt->latency; | ||
386 | q->jitter = qopt->jitter; | ||
387 | q->limit = qopt->limit; | ||
388 | q->gap = qopt->gap; | ||
389 | q->loss = qopt->loss; | ||
390 | q->duplicate = qopt->duplicate; | ||
391 | |||
392 | /* Handle nested options after initial queue options. | ||
393 | * Should have put all options in nested format but too late now. | ||
394 | */ | ||
395 | if (RTA_PAYLOAD(opt) > sizeof(*qopt)) { | ||
396 | struct rtattr *tb[TCA_NETEM_MAX]; | ||
397 | if (rtattr_parse(tb, TCA_NETEM_MAX, | ||
398 | RTA_DATA(opt) + sizeof(*qopt), | ||
399 | RTA_PAYLOAD(opt) - sizeof(*qopt))) | ||
400 | return -EINVAL; | ||
401 | |||
402 | if (tb[TCA_NETEM_CORR-1]) { | ||
403 | ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]); | ||
404 | if (ret) | ||
405 | return ret; | ||
406 | } | ||
407 | |||
408 | if (tb[TCA_NETEM_DELAY_DIST-1]) { | ||
409 | ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]); | ||
410 | if (ret) | ||
411 | return ret; | ||
412 | } | ||
413 | } | ||
414 | |||
415 | |||
416 | return 0; | ||
417 | } | ||
418 | |||
419 | static int netem_init(struct Qdisc *sch, struct rtattr *opt) | ||
420 | { | ||
421 | struct netem_sched_data *q = qdisc_priv(sch); | ||
422 | int ret; | ||
423 | |||
424 | if (!opt) | ||
425 | return -EINVAL; | ||
426 | |||
427 | skb_queue_head_init(&q->delayed); | ||
428 | init_timer(&q->timer); | ||
429 | q->timer.function = netem_watchdog; | ||
430 | q->timer.data = (unsigned long) sch; | ||
431 | q->counter = 0; | ||
432 | |||
433 | q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); | ||
434 | if (!q->qdisc) { | ||
435 | pr_debug("netem: qdisc create failed\n"); | ||
436 | return -ENOMEM; | ||
437 | } | ||
438 | |||
439 | ret = netem_change(sch, opt); | ||
440 | if (ret) { | ||
441 | pr_debug("netem: change failed\n"); | ||
442 | qdisc_destroy(q->qdisc); | ||
443 | } | ||
444 | return ret; | ||
445 | } | ||
446 | |||
447 | static void netem_destroy(struct Qdisc *sch) | ||
448 | { | ||
449 | struct netem_sched_data *q = qdisc_priv(sch); | ||
450 | |||
451 | del_timer_sync(&q->timer); | ||
452 | qdisc_destroy(q->qdisc); | ||
453 | kfree(q->delay_dist); | ||
454 | } | ||
455 | |||
456 | static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) | ||
457 | { | ||
458 | const struct netem_sched_data *q = qdisc_priv(sch); | ||
459 | unsigned char *b = skb->tail; | ||
460 | struct rtattr *rta = (struct rtattr *) b; | ||
461 | struct tc_netem_qopt qopt; | ||
462 | struct tc_netem_corr cor; | ||
463 | |||
464 | qopt.latency = q->latency; | ||
465 | qopt.jitter = q->jitter; | ||
466 | qopt.limit = q->limit; | ||
467 | qopt.loss = q->loss; | ||
468 | qopt.gap = q->gap; | ||
469 | qopt.duplicate = q->duplicate; | ||
470 | RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); | ||
471 | |||
472 | cor.delay_corr = q->delay_cor.rho; | ||
473 | cor.loss_corr = q->loss_cor.rho; | ||
474 | cor.dup_corr = q->dup_cor.rho; | ||
475 | RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor); | ||
476 | rta->rta_len = skb->tail - b; | ||
477 | |||
478 | return skb->len; | ||
479 | |||
480 | rtattr_failure: | ||
481 | skb_trim(skb, b - skb->data); | ||
482 | return -1; | ||
483 | } | ||
484 | |||
485 | static int netem_dump_class(struct Qdisc *sch, unsigned long cl, | ||
486 | struct sk_buff *skb, struct tcmsg *tcm) | ||
487 | { | ||
488 | struct netem_sched_data *q = qdisc_priv(sch); | ||
489 | |||
490 | if (cl != 1) /* only one class */ | ||
491 | return -ENOENT; | ||
492 | |||
493 | tcm->tcm_handle |= TC_H_MIN(1); | ||
494 | tcm->tcm_info = q->qdisc->handle; | ||
495 | |||
496 | return 0; | ||
497 | } | ||
498 | |||
499 | static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, | ||
500 | struct Qdisc **old) | ||
501 | { | ||
502 | struct netem_sched_data *q = qdisc_priv(sch); | ||
503 | |||
504 | if (new == NULL) | ||
505 | new = &noop_qdisc; | ||
506 | |||
507 | sch_tree_lock(sch); | ||
508 | *old = xchg(&q->qdisc, new); | ||
509 | qdisc_reset(*old); | ||
510 | sch->q.qlen = 0; | ||
511 | sch_tree_unlock(sch); | ||
512 | |||
513 | return 0; | ||
514 | } | ||
515 | |||
516 | static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg) | ||
517 | { | ||
518 | struct netem_sched_data *q = qdisc_priv(sch); | ||
519 | return q->qdisc; | ||
520 | } | ||
521 | |||
522 | static unsigned long netem_get(struct Qdisc *sch, u32 classid) | ||
523 | { | ||
524 | return 1; | ||
525 | } | ||
526 | |||
527 | static void netem_put(struct Qdisc *sch, unsigned long arg) | ||
528 | { | ||
529 | } | ||
530 | |||
531 | static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid, | ||
532 | struct rtattr **tca, unsigned long *arg) | ||
533 | { | ||
534 | return -ENOSYS; | ||
535 | } | ||
536 | |||
537 | static int netem_delete(struct Qdisc *sch, unsigned long arg) | ||
538 | { | ||
539 | return -ENOSYS; | ||
540 | } | ||
541 | |||
542 | static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker) | ||
543 | { | ||
544 | if (!walker->stop) { | ||
545 | if (walker->count >= walker->skip) | ||
546 | if (walker->fn(sch, 1, walker) < 0) { | ||
547 | walker->stop = 1; | ||
548 | return; | ||
549 | } | ||
550 | walker->count++; | ||
551 | } | ||
552 | } | ||
553 | |||
554 | static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl) | ||
555 | { | ||
556 | return NULL; | ||
557 | } | ||
558 | |||
559 | static struct Qdisc_class_ops netem_class_ops = { | ||
560 | .graft = netem_graft, | ||
561 | .leaf = netem_leaf, | ||
562 | .get = netem_get, | ||
563 | .put = netem_put, | ||
564 | .change = netem_change_class, | ||
565 | .delete = netem_delete, | ||
566 | .walk = netem_walk, | ||
567 | .tcf_chain = netem_find_tcf, | ||
568 | .dump = netem_dump_class, | ||
569 | }; | ||
570 | |||
571 | static struct Qdisc_ops netem_qdisc_ops = { | ||
572 | .id = "netem", | ||
573 | .cl_ops = &netem_class_ops, | ||
574 | .priv_size = sizeof(struct netem_sched_data), | ||
575 | .enqueue = netem_enqueue, | ||
576 | .dequeue = netem_dequeue, | ||
577 | .requeue = netem_requeue, | ||
578 | .drop = netem_drop, | ||
579 | .init = netem_init, | ||
580 | .reset = netem_reset, | ||
581 | .destroy = netem_destroy, | ||
582 | .change = netem_change, | ||
583 | .dump = netem_dump, | ||
584 | .owner = THIS_MODULE, | ||
585 | }; | ||
586 | |||
587 | |||
588 | static int __init netem_module_init(void) | ||
589 | { | ||
590 | return register_qdisc(&netem_qdisc_ops); | ||
591 | } | ||
592 | static void __exit netem_module_exit(void) | ||
593 | { | ||
594 | unregister_qdisc(&netem_qdisc_ops); | ||
595 | } | ||
596 | module_init(netem_module_init) | ||
597 | module_exit(netem_module_exit) | ||
598 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c new file mode 100644 index 000000000000..3ac0f495bad0 --- /dev/null +++ b/net/sched/sch_prio.c | |||
@@ -0,0 +1,444 @@ | |||
1 | /* | ||
2 | * net/sched/sch_prio.c Simple 3-band priority "scheduler". | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
10 | * Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>: | ||
11 | * Init -- EINVAL when opt undefined | ||
12 | */ | ||
13 | |||
14 | #include <linux/config.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <asm/uaccess.h> | ||
17 | #include <asm/system.h> | ||
18 | #include <linux/bitops.h> | ||
19 | #include <linux/types.h> | ||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/sched.h> | ||
22 | #include <linux/string.h> | ||
23 | #include <linux/mm.h> | ||
24 | #include <linux/socket.h> | ||
25 | #include <linux/sockios.h> | ||
26 | #include <linux/in.h> | ||
27 | #include <linux/errno.h> | ||
28 | #include <linux/interrupt.h> | ||
29 | #include <linux/if_ether.h> | ||
30 | #include <linux/inet.h> | ||
31 | #include <linux/netdevice.h> | ||
32 | #include <linux/etherdevice.h> | ||
33 | #include <linux/notifier.h> | ||
34 | #include <net/ip.h> | ||
35 | #include <net/route.h> | ||
36 | #include <linux/skbuff.h> | ||
37 | #include <net/sock.h> | ||
38 | #include <net/pkt_sched.h> | ||
39 | |||
40 | |||
41 | struct prio_sched_data | ||
42 | { | ||
43 | int bands; | ||
44 | struct tcf_proto *filter_list; | ||
45 | u8 prio2band[TC_PRIO_MAX+1]; | ||
46 | struct Qdisc *queues[TCQ_PRIO_BANDS]; | ||
47 | }; | ||
48 | |||
49 | |||
50 | static struct Qdisc * | ||
51 | prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) | ||
52 | { | ||
53 | struct prio_sched_data *q = qdisc_priv(sch); | ||
54 | u32 band = skb->priority; | ||
55 | struct tcf_result res; | ||
56 | |||
57 | *qerr = NET_XMIT_DROP; | ||
58 | if (TC_H_MAJ(skb->priority) != sch->handle) { | ||
59 | #ifdef CONFIG_NET_CLS_ACT | ||
60 | switch (tc_classify(skb, q->filter_list, &res)) { | ||
61 | case TC_ACT_STOLEN: | ||
62 | case TC_ACT_QUEUED: | ||
63 | *qerr = NET_XMIT_SUCCESS; | ||
64 | case TC_ACT_SHOT: | ||
65 | return NULL; | ||
66 | }; | ||
67 | |||
68 | if (!q->filter_list ) { | ||
69 | #else | ||
70 | if (!q->filter_list || tc_classify(skb, q->filter_list, &res)) { | ||
71 | #endif | ||
72 | if (TC_H_MAJ(band)) | ||
73 | band = 0; | ||
74 | return q->queues[q->prio2band[band&TC_PRIO_MAX]]; | ||
75 | } | ||
76 | band = res.classid; | ||
77 | } | ||
78 | band = TC_H_MIN(band) - 1; | ||
79 | if (band > q->bands) | ||
80 | return q->queues[q->prio2band[0]]; | ||
81 | |||
82 | return q->queues[band]; | ||
83 | } | ||
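A worked example of the band selection above: on a 3-band prio qdisc with handle 1:, a filter verdict of classid 1:3 gives band = TC_H_MIN(1:3) - 1 = 2, so the packet is queued to q->queues[2]; when no filter matches, the fallback maps skb->priority through prio2band[], and a packet whose skb->priority already carries the qdisc's major handle (e.g. 1:2) bypasses the filters and selects its band directly from the minor number.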
84 | |||
85 | static int | ||
86 | prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) | ||
87 | { | ||
88 | struct Qdisc *qdisc; | ||
89 | int ret; | ||
90 | |||
91 | qdisc = prio_classify(skb, sch, &ret); | ||
92 | #ifdef CONFIG_NET_CLS_ACT | ||
93 | if (qdisc == NULL) { | ||
94 | if (ret == NET_XMIT_DROP) | ||
95 | sch->qstats.drops++; | ||
96 | kfree_skb(skb); | ||
97 | return ret; | ||
98 | } | ||
99 | #endif | ||
100 | |||
101 | if ((ret = qdisc->enqueue(skb, qdisc)) == NET_XMIT_SUCCESS) { | ||
102 | sch->bstats.bytes += skb->len; | ||
103 | sch->bstats.packets++; | ||
104 | sch->q.qlen++; | ||
105 | return NET_XMIT_SUCCESS; | ||
106 | } | ||
107 | sch->qstats.drops++; | ||
108 | return ret; | ||
109 | } | ||
110 | |||
111 | |||
112 | static int | ||
113 | prio_requeue(struct sk_buff *skb, struct Qdisc* sch) | ||
114 | { | ||
115 | struct Qdisc *qdisc; | ||
116 | int ret; | ||
117 | |||
118 | qdisc = prio_classify(skb, sch, &ret); | ||
119 | #ifdef CONFIG_NET_CLS_ACT | ||
120 | if (qdisc == NULL) { | ||
121 | if (ret == NET_XMIT_DROP) | ||
122 | sch->qstats.drops++; | ||
123 | kfree_skb(skb); | ||
124 | return ret; | ||
125 | } | ||
126 | #endif | ||
127 | |||
128 | if ((ret = qdisc->ops->requeue(skb, qdisc)) == NET_XMIT_SUCCESS) { | ||
129 | sch->q.qlen++; | ||
130 | sch->qstats.requeues++; | ||
131 | return 0; | ||
132 | } | ||
133 | sch->qstats.drops++; | ||
134 | return NET_XMIT_DROP; | ||
135 | } | ||
136 | |||
137 | |||
138 | static struct sk_buff * | ||
139 | prio_dequeue(struct Qdisc* sch) | ||
140 | { | ||
141 | struct sk_buff *skb; | ||
142 | struct prio_sched_data *q = qdisc_priv(sch); | ||
143 | int prio; | ||
144 | struct Qdisc *qdisc; | ||
145 | |||
146 | for (prio = 0; prio < q->bands; prio++) { | ||
147 | qdisc = q->queues[prio]; | ||
148 | skb = qdisc->dequeue(qdisc); | ||
149 | if (skb) { | ||
150 | sch->q.qlen--; | ||
151 | return skb; | ||
152 | } | ||
153 | } | ||
154 | return NULL; | ||
155 | |||
156 | } | ||
157 | |||
158 | static unsigned int prio_drop(struct Qdisc* sch) | ||
159 | { | ||
160 | struct prio_sched_data *q = qdisc_priv(sch); | ||
161 | int prio; | ||
162 | unsigned int len; | ||
163 | struct Qdisc *qdisc; | ||
164 | |||
165 | for (prio = q->bands-1; prio >= 0; prio--) { | ||
166 | qdisc = q->queues[prio]; | ||
167 | if ((len = qdisc->ops->drop(qdisc)) != 0) { | ||
168 | sch->q.qlen--; | ||
169 | return len; | ||
170 | } | ||
171 | } | ||
172 | return 0; | ||
173 | } | ||
174 | |||
175 | |||
176 | static void | ||
177 | prio_reset(struct Qdisc* sch) | ||
178 | { | ||
179 | int prio; | ||
180 | struct prio_sched_data *q = qdisc_priv(sch); | ||
181 | |||
182 | for (prio=0; prio<q->bands; prio++) | ||
183 | qdisc_reset(q->queues[prio]); | ||
184 | sch->q.qlen = 0; | ||
185 | } | ||
186 | |||
187 | static void | ||
188 | prio_destroy(struct Qdisc* sch) | ||
189 | { | ||
190 | int prio; | ||
191 | struct prio_sched_data *q = qdisc_priv(sch); | ||
192 | struct tcf_proto *tp; | ||
193 | |||
194 | while ((tp = q->filter_list) != NULL) { | ||
195 | q->filter_list = tp->next; | ||
196 | tcf_destroy(tp); | ||
197 | } | ||
198 | |||
199 | for (prio=0; prio<q->bands; prio++) | ||
200 | qdisc_destroy(q->queues[prio]); | ||
201 | } | ||
202 | |||
203 | static int prio_tune(struct Qdisc *sch, struct rtattr *opt) | ||
204 | { | ||
205 | struct prio_sched_data *q = qdisc_priv(sch); | ||
206 | struct tc_prio_qopt *qopt = RTA_DATA(opt); | ||
207 | int i; | ||
208 | |||
209 | if (opt->rta_len < RTA_LENGTH(sizeof(*qopt))) | ||
210 | return -EINVAL; | ||
211 | if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2) | ||
212 | return -EINVAL; | ||
213 | |||
214 | for (i=0; i<=TC_PRIO_MAX; i++) { | ||
215 | if (qopt->priomap[i] >= qopt->bands) | ||
216 | return -EINVAL; | ||
217 | } | ||
218 | |||
219 | sch_tree_lock(sch); | ||
220 | q->bands = qopt->bands; | ||
221 | memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); | ||
222 | |||
223 | for (i=q->bands; i<TCQ_PRIO_BANDS; i++) { | ||
224 | struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc); | ||
225 | if (child != &noop_qdisc) | ||
226 | qdisc_destroy(child); | ||
227 | } | ||
228 | sch_tree_unlock(sch); | ||
229 | |||
230 | for (i=0; i<=TC_PRIO_MAX; i++) { | ||
231 | int band = q->prio2band[i]; | ||
232 | if (q->queues[band] == &noop_qdisc) { | ||
233 | struct Qdisc *child; | ||
234 | child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); | ||
235 | if (child) { | ||
236 | sch_tree_lock(sch); | ||
237 | child = xchg(&q->queues[band], child); | ||
238 | |||
239 | if (child != &noop_qdisc) | ||
240 | qdisc_destroy(child); | ||
241 | sch_tree_unlock(sch); | ||
242 | } | ||
243 | } | ||
244 | } | ||
245 | return 0; | ||
246 | } | ||
247 | |||
248 | static int prio_init(struct Qdisc *sch, struct rtattr *opt) | ||
249 | { | ||
250 | struct prio_sched_data *q = qdisc_priv(sch); | ||
251 | int i; | ||
252 | |||
253 | for (i=0; i<TCQ_PRIO_BANDS; i++) | ||
254 | q->queues[i] = &noop_qdisc; | ||
255 | |||
256 | if (opt == NULL) { | ||
257 | return -EINVAL; | ||
258 | } else { | ||
259 | int err; | ||
260 | |||
261 | if ((err= prio_tune(sch, opt)) != 0) | ||
262 | return err; | ||
263 | } | ||
264 | return 0; | ||
265 | } | ||
266 | |||
267 | static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) | ||
268 | { | ||
269 | struct prio_sched_data *q = qdisc_priv(sch); | ||
270 | unsigned char *b = skb->tail; | ||
271 | struct tc_prio_qopt opt; | ||
272 | |||
273 | opt.bands = q->bands; | ||
274 | memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1); | ||
275 | RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); | ||
276 | return skb->len; | ||
277 | |||
278 | rtattr_failure: | ||
279 | skb_trim(skb, b - skb->data); | ||
280 | return -1; | ||
281 | } | ||
282 | |||
283 | static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, | ||
284 | struct Qdisc **old) | ||
285 | { | ||
286 | struct prio_sched_data *q = qdisc_priv(sch); | ||
287 | unsigned long band = arg - 1; | ||
288 | |||
289 | if (band >= q->bands) | ||
290 | return -EINVAL; | ||
291 | |||
292 | if (new == NULL) | ||
293 | new = &noop_qdisc; | ||
294 | |||
295 | sch_tree_lock(sch); | ||
296 | *old = q->queues[band]; | ||
297 | q->queues[band] = new; | ||
298 | sch->q.qlen -= (*old)->q.qlen; | ||
299 | qdisc_reset(*old); | ||
300 | sch_tree_unlock(sch); | ||
301 | |||
302 | return 0; | ||
303 | } | ||
304 | |||
305 | static struct Qdisc * | ||
306 | prio_leaf(struct Qdisc *sch, unsigned long arg) | ||
307 | { | ||
308 | struct prio_sched_data *q = qdisc_priv(sch); | ||
309 | unsigned long band = arg - 1; | ||
310 | |||
311 | if (band >= q->bands) | ||
312 | return NULL; | ||
313 | |||
314 | return q->queues[band]; | ||
315 | } | ||
316 | |||
317 | static unsigned long prio_get(struct Qdisc *sch, u32 classid) | ||
318 | { | ||
319 | struct prio_sched_data *q = qdisc_priv(sch); | ||
320 | unsigned long band = TC_H_MIN(classid); | ||
321 | |||
322 | if (band - 1 >= q->bands) | ||
323 | return 0; | ||
324 | return band; | ||
325 | } | ||
326 | |||
327 | static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid) | ||
328 | { | ||
329 | return prio_get(sch, classid); | ||
330 | } | ||
331 | |||
332 | |||
333 | static void prio_put(struct Qdisc *q, unsigned long cl) | ||
334 | { | ||
335 | return; | ||
336 | } | ||
337 | |||
338 | static int prio_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg) | ||
339 | { | ||
340 | unsigned long cl = *arg; | ||
341 | struct prio_sched_data *q = qdisc_priv(sch); | ||
342 | |||
343 | if (cl - 1 > q->bands) | ||
344 | return -ENOENT; | ||
345 | return 0; | ||
346 | } | ||
347 | |||
348 | static int prio_delete(struct Qdisc *sch, unsigned long cl) | ||
349 | { | ||
350 | struct prio_sched_data *q = qdisc_priv(sch); | ||
351 | if (cl - 1 > q->bands) | ||
352 | return -ENOENT; | ||
353 | return 0; | ||
354 | } | ||
355 | |||
356 | |||
357 | static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, | ||
358 | struct tcmsg *tcm) | ||
359 | { | ||
360 | struct prio_sched_data *q = qdisc_priv(sch); | ||
361 | |||
362 | if (cl - 1 > q->bands) | ||
363 | return -ENOENT; | ||
364 | tcm->tcm_handle |= TC_H_MIN(cl); | ||
365 | if (q->queues[cl-1]) | ||
366 | tcm->tcm_info = q->queues[cl-1]->handle; | ||
367 | return 0; | ||
368 | } | ||
369 | |||
370 | static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) | ||
371 | { | ||
372 | struct prio_sched_data *q = qdisc_priv(sch); | ||
373 | int prio; | ||
374 | |||
375 | if (arg->stop) | ||
376 | return; | ||
377 | |||
378 | for (prio = 0; prio < q->bands; prio++) { | ||
379 | if (arg->count < arg->skip) { | ||
380 | arg->count++; | ||
381 | continue; | ||
382 | } | ||
383 | if (arg->fn(sch, prio+1, arg) < 0) { | ||
384 | arg->stop = 1; | ||
385 | break; | ||
386 | } | ||
387 | arg->count++; | ||
388 | } | ||
389 | } | ||
390 | |||
391 | static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl) | ||
392 | { | ||
393 | struct prio_sched_data *q = qdisc_priv(sch); | ||
394 | |||
395 | if (cl) | ||
396 | return NULL; | ||
397 | return &q->filter_list; | ||
398 | } | ||
399 | |||
400 | static struct Qdisc_class_ops prio_class_ops = { | ||
401 | .graft = prio_graft, | ||
402 | .leaf = prio_leaf, | ||
403 | .get = prio_get, | ||
404 | .put = prio_put, | ||
405 | .change = prio_change, | ||
406 | .delete = prio_delete, | ||
407 | .walk = prio_walk, | ||
408 | .tcf_chain = prio_find_tcf, | ||
409 | .bind_tcf = prio_bind, | ||
410 | .unbind_tcf = prio_put, | ||
411 | .dump = prio_dump_class, | ||
412 | }; | ||
413 | |||
414 | static struct Qdisc_ops prio_qdisc_ops = { | ||
415 | .next = NULL, | ||
416 | .cl_ops = &prio_class_ops, | ||
417 | .id = "prio", | ||
418 | .priv_size = sizeof(struct prio_sched_data), | ||
419 | .enqueue = prio_enqueue, | ||
420 | .dequeue = prio_dequeue, | ||
421 | .requeue = prio_requeue, | ||
422 | .drop = prio_drop, | ||
423 | .init = prio_init, | ||
424 | .reset = prio_reset, | ||
425 | .destroy = prio_destroy, | ||
426 | .change = prio_tune, | ||
427 | .dump = prio_dump, | ||
428 | .owner = THIS_MODULE, | ||
429 | }; | ||
430 | |||
431 | static int __init prio_module_init(void) | ||
432 | { | ||
433 | return register_qdisc(&prio_qdisc_ops); | ||
434 | } | ||
435 | |||
436 | static void __exit prio_module_exit(void) | ||
437 | { | ||
438 | unregister_qdisc(&prio_qdisc_ops); | ||
439 | } | ||
440 | |||
441 | module_init(prio_module_init) | ||
442 | module_exit(prio_module_exit) | ||
443 | |||
444 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c new file mode 100644 index 000000000000..664d0e47374f --- /dev/null +++ b/net/sched/sch_red.c | |||
@@ -0,0 +1,459 @@ | |||
1 | /* | ||
2 | * net/sched/sch_red.c Random Early Detection queue. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
10 | * | ||
11 | * Changes: | ||
12 | * J Hadi Salim <hadi@nortel.com> 980914: computation fixes | ||
13 | * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly. | ||
14 | * J Hadi Salim <hadi@nortelnetworks.com> 980816: ECN support | ||
15 | */ | ||
16 | |||
17 | #include <linux/config.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <asm/uaccess.h> | ||
20 | #include <asm/system.h> | ||
21 | #include <linux/bitops.h> | ||
22 | #include <linux/types.h> | ||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/sched.h> | ||
25 | #include <linux/string.h> | ||
26 | #include <linux/mm.h> | ||
27 | #include <linux/socket.h> | ||
28 | #include <linux/sockios.h> | ||
29 | #include <linux/in.h> | ||
30 | #include <linux/errno.h> | ||
31 | #include <linux/interrupt.h> | ||
32 | #include <linux/if_ether.h> | ||
33 | #include <linux/inet.h> | ||
34 | #include <linux/netdevice.h> | ||
35 | #include <linux/etherdevice.h> | ||
36 | #include <linux/notifier.h> | ||
37 | #include <net/ip.h> | ||
38 | #include <net/route.h> | ||
39 | #include <linux/skbuff.h> | ||
40 | #include <net/sock.h> | ||
41 | #include <net/pkt_sched.h> | ||
42 | #include <net/inet_ecn.h> | ||
43 | #include <net/dsfield.h> | ||
44 | |||
45 | |||
46 | /* Random Early Detection (RED) algorithm. | ||
47 | ======================================= | ||
48 | |||
49 | Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways | ||
50 | for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking. | ||
51 | |||
52 | This file codes a "divisionless" version of RED algorithm | ||
53 | as written down in Fig.17 of the paper. | ||
54 | |||
55 | Short description. | ||
56 | ------------------ | ||
57 | |||
58 | When a new packet arrives we calculate the average queue length: | ||
59 | |||
60 | avg = (1-W)*avg + W*current_queue_len, | ||
61 | |||
62 | W is the filter time constant (chosen as 2^(-Wlog)), it controls | ||
63 | the inertia of the algorithm. To allow larger bursts, W should be | ||
64 | decreased. | ||
65 | |||
66 | if (avg > th_max) -> packet marked (dropped). | ||
67 | if (avg < th_min) -> packet passes. | ||
68 | if (th_min < avg < th_max) we calculate probability: | ||
69 | |||
70 | Pb = max_P * (avg - th_min)/(th_max-th_min) | ||
71 | |||
72 | and mark (drop) packet with this probability. | ||
73 | Pb changes from 0 (at avg==th_min) to max_P (avg==th_max). | ||
74 | max_P should be small (not 1), usually 0.01..0.02 is good value. | ||
75 | |||
76 | max_P is chosen as a number such that max_P/(th_max-th_min) | ||
77 | is a negative power of two, so that the arithmetic contains | ||
78 | only shifts. | ||
79 | |||
80 | |||
81 | Parameters, settable by user: | ||
82 | ----------------------------- | ||
83 | |||
84 | limit - bytes (must be > qth_max + burst) | ||
85 | |||
86 | Hard limit on queue length; should be chosen >qth_max | ||
87 | to allow packet bursts. This parameter does not | ||
88 | affect the algorithm's behaviour and can be chosen | ||
89 | arbitrarily high (well, less than RAM size). | ||
90 | Really, this limit will never be reached | ||
91 | if RED works correctly. | ||
92 | |||
93 | qth_min - bytes (should be < qth_max/2) | ||
94 | qth_max - bytes (should be at least 2*qth_min and less than limit) | ||
95 | Wlog - bits (<32) log(1/W). | ||
96 | Plog - bits (<32) | ||
97 | |||
98 | Plog is related to max_P by formula: | ||
99 | |||
100 | max_P = (qth_max-qth_min)/2^Plog; | ||
101 | |||
102 | F.e. if qth_max=128K and qth_min=32K, then Plog=22 | ||
103 | corresponds to max_P=0.02 | ||
104 | |||
105 | Scell_log | ||
106 | Stab | ||
107 | |||
108 | Lookup table for log((1-W)^(t/t_ave)). | ||
109 | |||
110 | |||
111 | NOTES: | ||
112 | |||
113 | Upper bound on W. | ||
114 | ----------------- | ||
115 | |||
116 | If you want to allow bursts of L packets of size S, | ||
117 | you should choose W: | ||
118 | |||
119 | L + 1 - th_min/S < (1-(1-W)^L)/W | ||
120 | |||
121 | th_min/S = 32 th_min/S = 4 | ||
122 | |||
123 | log(W) L | ||
124 | -1 33 | ||
125 | -2 35 | ||
126 | -3 39 | ||
127 | -4 46 | ||
128 | -5 57 | ||
129 | -6 75 | ||
130 | -7 101 | ||
131 | -8 135 | ||
132 | -9 190 | ||
133 | etc. | ||
134 | */ | ||
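The Plog example in the comment above can be checked directly: max_P = (qth_max - qth_min) / 2^Plog, so qth_max = 128K, qth_min = 32K and Plog = 22 give 98304 / 4194304 ~= 0.023, i.e. roughly the 0.02 quoted. A tiny sketch of that arithmetic (values copied from the comment, not from a real configuration):

#include <stdio.h>

int main(void)
{
        double qth_min = 32.0 * 1024, qth_max = 128.0 * 1024;
        int plog = 22;
        /* max_P = (qth_max - qth_min) / 2^Plog, per the comment above */
        double max_p = (qth_max - qth_min) / (double)(1UL << plog);

        printf("Plog=%d -> max_P=%.4f\n", plog, max_p);   /* ~0.0234 */
        return 0;
}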
135 | |||
136 | struct red_sched_data | ||
137 | { | ||
138 | /* Parameters */ | ||
139 | u32 limit; /* HARD maximal queue length */ | ||
140 | u32 qth_min; /* Min average length threshold: A scaled */ | ||
141 | u32 qth_max; /* Max average length threshold: A scaled */ | ||
142 | u32 Rmask; | ||
143 | u32 Scell_max; | ||
144 | unsigned char flags; | ||
145 | char Wlog; /* log(W) */ | ||
146 | char Plog; /* random number bits */ | ||
147 | char Scell_log; | ||
148 | u8 Stab[256]; | ||
149 | |||
150 | /* Variables */ | ||
151 | unsigned long qave; /* Average queue length: A scaled */ | ||
152 | int qcount; /* Packets since last random number generation */ | ||
153 | u32 qR; /* Cached random number */ | ||
154 | |||
155 | psched_time_t qidlestart; /* Start of idle period */ | ||
156 | struct tc_red_xstats st; | ||
157 | }; | ||
158 | |||
159 | static int red_ecn_mark(struct sk_buff *skb) | ||
160 | { | ||
161 | if (skb->nh.raw + 20 > skb->tail) | ||
162 | return 0; | ||
163 | |||
164 | switch (skb->protocol) { | ||
165 | case __constant_htons(ETH_P_IP): | ||
166 | if (INET_ECN_is_not_ect(skb->nh.iph->tos)) | ||
167 | return 0; | ||
168 | IP_ECN_set_ce(skb->nh.iph); | ||
169 | return 1; | ||
170 | case __constant_htons(ETH_P_IPV6): | ||
171 | if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h))) | ||
172 | return 0; | ||
173 | IP6_ECN_set_ce(skb->nh.ipv6h); | ||
174 | return 1; | ||
175 | default: | ||
176 | return 0; | ||
177 | } | ||
178 | } | ||
179 | |||
180 | static int | ||
181 | red_enqueue(struct sk_buff *skb, struct Qdisc* sch) | ||
182 | { | ||
183 | struct red_sched_data *q = qdisc_priv(sch); | ||
184 | |||
185 | psched_time_t now; | ||
186 | |||
187 | if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { | ||
188 | long us_idle; | ||
189 | int shift; | ||
190 | |||
191 | PSCHED_GET_TIME(now); | ||
192 | us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); | ||
193 | PSCHED_SET_PASTPERFECT(q->qidlestart); | ||
194 | |||
195 | /* | ||
196 | The problem: ideally, recalculation of the average queue length should | ||
197 | be done over constant clock intervals. This is too expensive, so | ||
198 | the calculation is driven by outgoing packets instead. | ||
199 | When the queue is idle we have to model this clock by hand. | ||
200 | |||
201 | SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth) | ||
202 | dummy packets as a burst after idle time, i.e. | ||
203 | |||
204 | q->qave *= (1-W)^m | ||
205 | |||
206 | This is an apparently overcomplicated solution (f.e. we have to precompute | ||
207 | a table to make this calculation in reasonable time). | ||
208 | I believe that a simpler model could be used here, | ||
209 | but it is a field for experiments. | ||
210 | */ | ||
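 | |||
 | /* A worked form of the shift trick used just below (illustrative, | ||
 |    with W = 2^(-Wlog)): | ||
 | |||
 |	(1-W)^m = 2^(m*log2(1-W)) ~= 2^(-k),  k = round(-m*log2(1-W)) | ||
 | |||
 |    so multiplying q->qave by (1-W)^m reduces to "q->qave >>= k". | ||
 |    Stab[] is expected to be filled by userspace with these k values, | ||
 |    indexed by the idle time scaled down by Scell_log. | ||
 | */ | ||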
211 | shift = q->Stab[us_idle>>q->Scell_log]; | ||
212 | |||
213 | if (shift) { | ||
214 | q->qave >>= shift; | ||
215 | } else { | ||
216 | /* Approximate the initial part of the exponent | ||
217 | with a linear function: | ||
218 | (1-W)^m ~= 1-mW + ... | ||
219 | |||
220 | It seems to be the best solution to the | ||
221 | problem of too coarse exponent tabulation. | ||
222 | */ | ||
223 | |||
224 | us_idle = (q->qave * us_idle)>>q->Scell_log; | ||
225 | if (us_idle < q->qave/2) | ||
226 | q->qave -= us_idle; | ||
227 | else | ||
228 | q->qave >>= 1; | ||
229 | } | ||
230 | } else { | ||
231 | q->qave += sch->qstats.backlog - (q->qave >> q->Wlog); | ||
232 | /* NOTE: | ||
233 | q->qave is a fixed point number with the point at Wlog. | ||
234 | The formula above is equivalent to the floating point | ||
235 | version: | ||
236 | |||
237 | qave = qave*(1-W) + sch->qstats.backlog*W; | ||
238 | --ANK (980924) | ||
239 | */ | ||
240 | } | ||
241 | |||
242 | if (q->qave < q->qth_min) { | ||
243 | q->qcount = -1; | ||
244 | enqueue: | ||
245 | if (sch->qstats.backlog + skb->len <= q->limit) { | ||
246 | __skb_queue_tail(&sch->q, skb); | ||
247 | sch->qstats.backlog += skb->len; | ||
248 | sch->bstats.bytes += skb->len; | ||
249 | sch->bstats.packets++; | ||
250 | return NET_XMIT_SUCCESS; | ||
251 | } else { | ||
252 | q->st.pdrop++; | ||
253 | } | ||
254 | kfree_skb(skb); | ||
255 | sch->qstats.drops++; | ||
256 | return NET_XMIT_DROP; | ||
257 | } | ||
258 | if (q->qave >= q->qth_max) { | ||
259 | q->qcount = -1; | ||
260 | sch->qstats.overlimits++; | ||
261 | mark: | ||
262 | if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) { | ||
263 | q->st.early++; | ||
264 | goto drop; | ||
265 | } | ||
266 | q->st.marked++; | ||
267 | goto enqueue; | ||
268 | } | ||
269 | |||
270 | if (++q->qcount) { | ||
271 | /* The formula used below causes questions. | ||
272 | |||
273 | OK. qR is a random number in the interval 0..Rmask, | ||
274 | i.e. 0..(2^Plog - 1). If we used floating point | ||
275 | arithmetic, it would be (2^Plog)*rnd_num, | ||
276 | where rnd_num is less than 1. | ||
277 | |||
278 | Taking into account that qave has its fixed | ||
279 | point at Wlog, and that Plog is related to max_P by | ||
280 | max_P = (qth_max-qth_min)/2^Plog, the two lines | ||
281 | below have the following floating point equivalent: | ||
282 | |||
283 | max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount | ||
284 | |||
285 | Any questions? --ANK (980924) | ||
286 | */ | ||
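 | |||
 | /* Spelling the same equivalence out (illustrative): q->qave and | ||
 |    q->qth_min both carry the fixed point at Wlog, so | ||
 | |||
 |	(q->qave - q->qth_min) >> Wlog  =  avg - qth_min	(in bytes), | ||
 | |||
 |    and rnd = qR/2^Plog is uniform in [0,1).  Dividing the test below | ||
 |    by 2^Plog and substituting max_P = (qth_max-qth_min)/2^Plog gives | ||
 | |||
 |	(avg - qth_min)*qcount < qR | ||
 |	<=>  max_P*(avg - qth_min)/(qth_max-qth_min) < rnd/qcount | ||
 | */ | ||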
287 | if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR) | ||
288 | goto enqueue; | ||
289 | q->qcount = 0; | ||
290 | q->qR = net_random()&q->Rmask; | ||
291 | sch->qstats.overlimits++; | ||
292 | goto mark; | ||
293 | } | ||
294 | q->qR = net_random()&q->Rmask; | ||
295 | goto enqueue; | ||
296 | |||
297 | drop: | ||
298 | kfree_skb(skb); | ||
299 | sch->qstats.drops++; | ||
300 | return NET_XMIT_CN; | ||
301 | } | ||
302 | |||
303 | static int | ||
304 | red_requeue(struct sk_buff *skb, struct Qdisc* sch) | ||
305 | { | ||
306 | struct red_sched_data *q = qdisc_priv(sch); | ||
307 | |||
308 | PSCHED_SET_PASTPERFECT(q->qidlestart); | ||
309 | |||
310 | __skb_queue_head(&sch->q, skb); | ||
311 | sch->qstats.backlog += skb->len; | ||
312 | sch->qstats.requeues++; | ||
313 | return 0; | ||
314 | } | ||
315 | |||
316 | static struct sk_buff * | ||
317 | red_dequeue(struct Qdisc* sch) | ||
318 | { | ||
319 | struct sk_buff *skb; | ||
320 | struct red_sched_data *q = qdisc_priv(sch); | ||
321 | |||
322 | skb = __skb_dequeue(&sch->q); | ||
323 | if (skb) { | ||
324 | sch->qstats.backlog -= skb->len; | ||
325 | return skb; | ||
326 | } | ||
327 | PSCHED_GET_TIME(q->qidlestart); | ||
328 | return NULL; | ||
329 | } | ||
330 | |||
331 | static unsigned int red_drop(struct Qdisc* sch) | ||
332 | { | ||
333 | struct sk_buff *skb; | ||
334 | struct red_sched_data *q = qdisc_priv(sch); | ||
335 | |||
336 | skb = __skb_dequeue_tail(&sch->q); | ||
337 | if (skb) { | ||
338 | unsigned int len = skb->len; | ||
339 | sch->qstats.backlog -= len; | ||
340 | sch->qstats.drops++; | ||
341 | q->st.other++; | ||
342 | kfree_skb(skb); | ||
343 | return len; | ||
344 | } | ||
345 | PSCHED_GET_TIME(q->qidlestart); | ||
346 | return 0; | ||
347 | } | ||
348 | |||
349 | static void red_reset(struct Qdisc* sch) | ||
350 | { | ||
351 | struct red_sched_data *q = qdisc_priv(sch); | ||
352 | |||
353 | __skb_queue_purge(&sch->q); | ||
354 | sch->qstats.backlog = 0; | ||
355 | PSCHED_SET_PASTPERFECT(q->qidlestart); | ||
356 | q->qave = 0; | ||
357 | q->qcount = -1; | ||
358 | } | ||
359 | |||
360 | static int red_change(struct Qdisc *sch, struct rtattr *opt) | ||
361 | { | ||
362 | struct red_sched_data *q = qdisc_priv(sch); | ||
363 | struct rtattr *tb[TCA_RED_STAB]; | ||
364 | struct tc_red_qopt *ctl; | ||
365 | |||
366 | if (opt == NULL || | ||
367 | rtattr_parse_nested(tb, TCA_RED_STAB, opt) || | ||
368 | tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 || | ||
369 | RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) || | ||
370 | RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256) | ||
371 | return -EINVAL; | ||
372 | |||
373 | ctl = RTA_DATA(tb[TCA_RED_PARMS-1]); | ||
374 | |||
375 | sch_tree_lock(sch); | ||
376 | q->flags = ctl->flags; | ||
377 | q->Wlog = ctl->Wlog; | ||
378 | q->Plog = ctl->Plog; | ||
379 | q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; | ||
380 | q->Scell_log = ctl->Scell_log; | ||
381 | q->Scell_max = (255<<q->Scell_log); | ||
382 | q->qth_min = ctl->qth_min<<ctl->Wlog; | ||
383 | q->qth_max = ctl->qth_max<<ctl->Wlog; | ||
384 | q->limit = ctl->limit; | ||
385 | memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256); | ||
386 | |||
387 | q->qcount = -1; | ||
388 | if (skb_queue_len(&sch->q) == 0) | ||
389 | PSCHED_SET_PASTPERFECT(q->qidlestart); | ||
390 | sch_tree_unlock(sch); | ||
391 | return 0; | ||
392 | } | ||
393 | |||
394 | static int red_init(struct Qdisc* sch, struct rtattr *opt) | ||
395 | { | ||
396 | return red_change(sch, opt); | ||
397 | } | ||
398 | |||
399 | static int red_dump(struct Qdisc *sch, struct sk_buff *skb) | ||
400 | { | ||
401 | struct red_sched_data *q = qdisc_priv(sch); | ||
402 | unsigned char *b = skb->tail; | ||
403 | struct rtattr *rta; | ||
404 | struct tc_red_qopt opt; | ||
405 | |||
406 | rta = (struct rtattr*)b; | ||
407 | RTA_PUT(skb, TCA_OPTIONS, 0, NULL); | ||
408 | opt.limit = q->limit; | ||
409 | opt.qth_min = q->qth_min>>q->Wlog; | ||
410 | opt.qth_max = q->qth_max>>q->Wlog; | ||
411 | opt.Wlog = q->Wlog; | ||
412 | opt.Plog = q->Plog; | ||
413 | opt.Scell_log = q->Scell_log; | ||
414 | opt.flags = q->flags; | ||
415 | RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); | ||
416 | rta->rta_len = skb->tail - b; | ||
417 | |||
418 | return skb->len; | ||
419 | |||
420 | rtattr_failure: | ||
421 | skb_trim(skb, b - skb->data); | ||
422 | return -1; | ||
423 | } | ||
424 | |||
425 | static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) | ||
426 | { | ||
427 | struct red_sched_data *q = qdisc_priv(sch); | ||
428 | |||
429 | return gnet_stats_copy_app(d, &q->st, sizeof(q->st)); | ||
430 | } | ||
431 | |||
432 | static struct Qdisc_ops red_qdisc_ops = { | ||
433 | .next = NULL, | ||
434 | .cl_ops = NULL, | ||
435 | .id = "red", | ||
436 | .priv_size = sizeof(struct red_sched_data), | ||
437 | .enqueue = red_enqueue, | ||
438 | .dequeue = red_dequeue, | ||
439 | .requeue = red_requeue, | ||
440 | .drop = red_drop, | ||
441 | .init = red_init, | ||
442 | .reset = red_reset, | ||
443 | .change = red_change, | ||
444 | .dump = red_dump, | ||
445 | .dump_stats = red_dump_stats, | ||
446 | .owner = THIS_MODULE, | ||
447 | }; | ||
448 | |||
449 | static int __init red_module_init(void) | ||
450 | { | ||
451 | return register_qdisc(&red_qdisc_ops); | ||
452 | } | ||
453 | static void __exit red_module_exit(void) | ||
454 | { | ||
455 | unregister_qdisc(&red_qdisc_ops); | ||
456 | } | ||
457 | module_init(red_module_init) | ||
458 | module_exit(red_module_exit) | ||
459 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c new file mode 100644 index 000000000000..8734bb7280e3 --- /dev/null +++ b/net/sched/sch_sfq.c | |||
@@ -0,0 +1,497 @@ | |||
1 | /* | ||
2 | * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
10 | */ | ||
11 | |||
12 | #include <linux/config.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <asm/uaccess.h> | ||
15 | #include <asm/system.h> | ||
16 | #include <linux/bitops.h> | ||
17 | #include <linux/types.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/jiffies.h> | ||
20 | #include <linux/string.h> | ||
21 | #include <linux/mm.h> | ||
22 | #include <linux/socket.h> | ||
23 | #include <linux/sockios.h> | ||
24 | #include <linux/in.h> | ||
25 | #include <linux/errno.h> | ||
26 | #include <linux/interrupt.h> | ||
27 | #include <linux/if_ether.h> | ||
28 | #include <linux/inet.h> | ||
29 | #include <linux/netdevice.h> | ||
30 | #include <linux/etherdevice.h> | ||
31 | #include <linux/notifier.h> | ||
32 | #include <linux/init.h> | ||
33 | #include <net/ip.h> | ||
34 | #include <linux/ipv6.h> | ||
35 | #include <net/route.h> | ||
36 | #include <linux/skbuff.h> | ||
37 | #include <net/sock.h> | ||
38 | #include <net/pkt_sched.h> | ||
39 | |||
40 | |||
41 | /* Stochastic Fairness Queuing algorithm. | ||
42 | ======================================= | ||
43 | |||
44 | Source: | ||
45 | Paul E. McKenney "Stochastic Fairness Queuing", | ||
46 | IEEE INFOCOMM'90 Proceedings, San Francisco, 1990. | ||
47 | |||
48 | Paul E. McKenney "Stochastic Fairness Queuing", | ||
49 | "Interworking: Research and Experience", v.2, 1991, p.113-131. | ||
50 | |||
51 | |||
52 | See also: | ||
53 | M. Shreedhar and George Varghese "Efficient Fair | ||
54 | Queuing using Deficit Round Robin", Proc. SIGCOMM 95. | ||
55 | |||
56 | |||
57 | This is not the thing that is usually called (W)FQ nowadays. | ||
58 | It does not use any timestamp mechanism, but instead | ||
59 | processes queues in round-robin order. | ||
60 | |||
61 | ADVANTAGE: | ||
62 | |||
63 | - It is very cheap. Both CPU and memory requirements are minimal. | ||
64 | |||
65 | DRAWBACKS: | ||
66 | |||
67 | - "Stochastic" -> It is not 100% fair. | ||
68 | When hash collisions occur, several flows are considered as one. | ||
69 | |||
70 | - "Round-robin" -> It introduces larger delays than virtual clock | ||
71 | based schemes, and should not be used for isolating interactive | ||
72 | traffic from non-interactive. It means that this scheduler | ||
73 | should be used as a leaf of CBQ or P3, which puts interactive traffic | ||
74 | into a higher priority band. | ||
75 | |||
76 | We still need true WFQ for top level CSZ, but using WFQ | ||
77 | for the best effort traffic is absolutely pointless: | ||
78 | SFQ is superior for this purpose. | ||
79 | |||
80 | IMPLEMENTATION: | ||
81 | This implementation limits maximal queue length to 128; | ||
82 | maximal mtu to 2^15-1; number of hash buckets to 1024. | ||
83 | The only goal of these restrictions was that all data | ||
84 | fit into one 4K page :-). Struct sfq_sched_data is | ||
85 | organized in an anti-cache manner: all the data for a bucket | ||
86 | are scattered over different locations. This is not good, | ||
87 | but it allowed me to put it into 4K. | ||
88 | |||
89 | It is easy to increase these values, but not in flight. */ | ||
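 | |||
 | /* A toy userspace model of the round robin described above (purely | ||
 |    illustrative: the names below are made up, the model scans every | ||
 |    slot instead of keeping the non-empty ones on a circular list, | ||
 |    and its allotment bookkeeping is slightly simplified compared | ||
 |    with the real code): | ||
 | |||
 |	#include <stddef.h> | ||
 | |||
 |	#define MODEL_SLOTS 4		// tiny stand-in for the hash buckets | ||
 | |||
 |	struct model_pkt  { int len; struct model_pkt *next; }; | ||
 |	struct model_slot { struct model_pkt *head; int allot; }; | ||
 | |||
 |	// Serve the slots round-robin, one packet per call, charging each | ||
 |	// slot's byte allotment; refill it with "quantum" (>= MTU, so a | ||
 |	// slot can always send at least one full packet) once it is spent. | ||
 |	static struct model_pkt *model_dequeue(struct model_slot *slot, | ||
 |					       int *cur, int quantum) | ||
 |	{ | ||
 |		int i; | ||
 | |||
 |		for (i = 0; i < MODEL_SLOTS; i++, *cur = (*cur + 1) % MODEL_SLOTS) { | ||
 |			struct model_slot *s = &slot[*cur]; | ||
 |			struct model_pkt *p = s->head; | ||
 | |||
 |			if (p == NULL) | ||
 |				continue;		// idle slot, try the next one | ||
 |			s->head = p->next; | ||
 |			if ((s->allot -= p->len) <= 0) { | ||
 |				s->allot += quantum;		// quantum spent, | ||
 |				*cur = (*cur + 1) % MODEL_SLOTS;// move to the next slot | ||
 |			} | ||
 |			return p; | ||
 |		} | ||
 |		return NULL;			// nothing queued anywhere | ||
 |	} | ||
 | */ | ||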
90 | |||
91 | #define SFQ_DEPTH 128 | ||
92 | #define SFQ_HASH_DIVISOR 1024 | ||
93 | |||
94 | /* This type should contain at least SFQ_DEPTH*2 values */ | ||
95 | typedef unsigned char sfq_index; | ||
96 | |||
97 | struct sfq_head | ||
98 | { | ||
99 | sfq_index next; | ||
100 | sfq_index prev; | ||
101 | }; | ||
102 | |||
103 | struct sfq_sched_data | ||
104 | { | ||
105 | /* Parameters */ | ||
106 | int perturb_period; | ||
107 | unsigned quantum; /* Allotment per round: MUST BE >= MTU */ | ||
108 | int limit; | ||
109 | |||
110 | /* Variables */ | ||
111 | struct timer_list perturb_timer; | ||
112 | int perturbation; | ||
113 | sfq_index tail; /* Index of current slot in round */ | ||
114 | sfq_index max_depth; /* Maximal depth */ | ||
115 | |||
116 | sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */ | ||
117 | sfq_index next[SFQ_DEPTH]; /* Active slots link */ | ||
118 | short allot[SFQ_DEPTH]; /* Current allotment per slot */ | ||
119 | unsigned short hash[SFQ_DEPTH]; /* Hash value indexed by slots */ | ||
120 | struct sk_buff_head qs[SFQ_DEPTH]; /* Slot queue */ | ||
121 | struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */ | ||
122 | }; | ||
123 | |||
124 | static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1) | ||
125 | { | ||
126 | int pert = q->perturbation; | ||
127 | |||
128 | /* Have we any rotation primitives? If not, WHY? */ | ||
129 | h ^= (h1<<pert) ^ (h1>>(0x1F - pert)); | ||
130 | h ^= h>>10; | ||
131 | return h & 0x3FF; | ||
132 | } | ||
133 | |||
134 | static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) | ||
135 | { | ||
136 | u32 h, h2; | ||
137 | |||
138 | switch (skb->protocol) { | ||
139 | case __constant_htons(ETH_P_IP): | ||
140 | { | ||
141 | struct iphdr *iph = skb->nh.iph; | ||
142 | h = iph->daddr; | ||
143 | h2 = iph->saddr^iph->protocol; | ||
144 | if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && | ||
145 | (iph->protocol == IPPROTO_TCP || | ||
146 | iph->protocol == IPPROTO_UDP || | ||
147 | iph->protocol == IPPROTO_ESP)) | ||
148 | h2 ^= *(((u32*)iph) + iph->ihl); | ||
149 | break; | ||
150 | } | ||
151 | case __constant_htons(ETH_P_IPV6): | ||
152 | { | ||
153 | struct ipv6hdr *iph = skb->nh.ipv6h; | ||
154 | h = iph->daddr.s6_addr32[3]; | ||
155 | h2 = iph->saddr.s6_addr32[3]^iph->nexthdr; | ||
156 | if (iph->nexthdr == IPPROTO_TCP || | ||
157 | iph->nexthdr == IPPROTO_UDP || | ||
158 | iph->nexthdr == IPPROTO_ESP) | ||
159 | h2 ^= *(u32*)&iph[1]; | ||
160 | break; | ||
161 | } | ||
162 | default: | ||
163 | h = (u32)(unsigned long)skb->dst^skb->protocol; | ||
164 | h2 = (u32)(unsigned long)skb->sk; | ||
165 | } | ||
166 | return sfq_fold_hash(q, h, h2); | ||
167 | } | ||
168 | |||
169 | static inline void sfq_link(struct sfq_sched_data *q, sfq_index x) | ||
170 | { | ||
171 | sfq_index p, n; | ||
172 | int d = q->qs[x].qlen + SFQ_DEPTH; | ||
173 | |||
174 | p = d; | ||
175 | n = q->dep[d].next; | ||
176 | q->dep[x].next = n; | ||
177 | q->dep[x].prev = p; | ||
178 | q->dep[p].next = q->dep[n].prev = x; | ||
179 | } | ||
180 | |||
181 | static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x) | ||
182 | { | ||
183 | sfq_index p, n; | ||
184 | |||
185 | n = q->dep[x].next; | ||
186 | p = q->dep[x].prev; | ||
187 | q->dep[p].next = n; | ||
188 | q->dep[n].prev = p; | ||
189 | |||
190 | if (n == p && q->max_depth == q->qs[x].qlen + 1) | ||
191 | q->max_depth--; | ||
192 | |||
193 | sfq_link(q, x); | ||
194 | } | ||
195 | |||
196 | static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x) | ||
197 | { | ||
198 | sfq_index p, n; | ||
199 | int d; | ||
200 | |||
201 | n = q->dep[x].next; | ||
202 | p = q->dep[x].prev; | ||
203 | q->dep[p].next = n; | ||
204 | q->dep[n].prev = p; | ||
205 | d = q->qs[x].qlen; | ||
206 | if (q->max_depth < d) | ||
207 | q->max_depth = d; | ||
208 | |||
209 | sfq_link(q, x); | ||
210 | } | ||
211 | |||
212 | static unsigned int sfq_drop(struct Qdisc *sch) | ||
213 | { | ||
214 | struct sfq_sched_data *q = qdisc_priv(sch); | ||
215 | sfq_index d = q->max_depth; | ||
216 | struct sk_buff *skb; | ||
217 | unsigned int len; | ||
218 | |||
219 | /* Queue is full! Find the longest slot and | ||
220 | drop a packet from it */ | ||
221 | |||
222 | if (d > 1) { | ||
223 | sfq_index x = q->dep[d+SFQ_DEPTH].next; | ||
224 | skb = q->qs[x].prev; | ||
225 | len = skb->len; | ||
226 | __skb_unlink(skb, &q->qs[x]); | ||
227 | kfree_skb(skb); | ||
228 | sfq_dec(q, x); | ||
229 | sch->q.qlen--; | ||
230 | sch->qstats.drops++; | ||
231 | return len; | ||
232 | } | ||
233 | |||
234 | if (d == 1) { | ||
235 | /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ | ||
236 | d = q->next[q->tail]; | ||
237 | q->next[q->tail] = q->next[d]; | ||
238 | q->allot[q->next[d]] += q->quantum; | ||
239 | skb = q->qs[d].prev; | ||
240 | len = skb->len; | ||
241 | __skb_unlink(skb, &q->qs[d]); | ||
242 | kfree_skb(skb); | ||
243 | sfq_dec(q, d); | ||
244 | sch->q.qlen--; | ||
245 | q->ht[q->hash[d]] = SFQ_DEPTH; | ||
246 | sch->qstats.drops++; | ||
247 | return len; | ||
248 | } | ||
249 | |||
250 | return 0; | ||
251 | } | ||
252 | |||
253 | static int | ||
254 | sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) | ||
255 | { | ||
256 | struct sfq_sched_data *q = qdisc_priv(sch); | ||
257 | unsigned hash = sfq_hash(q, skb); | ||
258 | sfq_index x; | ||
259 | |||
260 | x = q->ht[hash]; | ||
261 | if (x == SFQ_DEPTH) { | ||
262 | q->ht[hash] = x = q->dep[SFQ_DEPTH].next; | ||
263 | q->hash[x] = hash; | ||
264 | } | ||
265 | __skb_queue_tail(&q->qs[x], skb); | ||
266 | sfq_inc(q, x); | ||
267 | if (q->qs[x].qlen == 1) { /* The flow is new */ | ||
268 | if (q->tail == SFQ_DEPTH) { /* It is the first flow */ | ||
269 | q->tail = x; | ||
270 | q->next[x] = x; | ||
271 | q->allot[x] = q->quantum; | ||
272 | } else { | ||
273 | q->next[x] = q->next[q->tail]; | ||
274 | q->next[q->tail] = x; | ||
275 | q->tail = x; | ||
276 | } | ||
277 | } | ||
278 | if (++sch->q.qlen < q->limit-1) { | ||
279 | sch->bstats.bytes += skb->len; | ||
280 | sch->bstats.packets++; | ||
281 | return 0; | ||
282 | } | ||
283 | |||
284 | sfq_drop(sch); | ||
285 | return NET_XMIT_CN; | ||
286 | } | ||
287 | |||
288 | static int | ||
289 | sfq_requeue(struct sk_buff *skb, struct Qdisc* sch) | ||
290 | { | ||
291 | struct sfq_sched_data *q = qdisc_priv(sch); | ||
292 | unsigned hash = sfq_hash(q, skb); | ||
293 | sfq_index x; | ||
294 | |||
295 | x = q->ht[hash]; | ||
296 | if (x == SFQ_DEPTH) { | ||
297 | q->ht[hash] = x = q->dep[SFQ_DEPTH].next; | ||
298 | q->hash[x] = hash; | ||
299 | } | ||
300 | __skb_queue_head(&q->qs[x], skb); | ||
301 | sfq_inc(q, x); | ||
302 | if (q->qs[x].qlen == 1) { /* The flow is new */ | ||
303 | if (q->tail == SFQ_DEPTH) { /* It is the first flow */ | ||
304 | q->tail = x; | ||
305 | q->next[x] = x; | ||
306 | q->allot[x] = q->quantum; | ||
307 | } else { | ||
308 | q->next[x] = q->next[q->tail]; | ||
309 | q->next[q->tail] = x; | ||
310 | q->tail = x; | ||
311 | } | ||
312 | } | ||
313 | if (++sch->q.qlen < q->limit - 1) { | ||
314 | sch->qstats.requeues++; | ||
315 | return 0; | ||
316 | } | ||
317 | |||
318 | sch->qstats.drops++; | ||
319 | sfq_drop(sch); | ||
320 | return NET_XMIT_CN; | ||
321 | } | ||
322 | |||
323 | |||
324 | |||
325 | |||
326 | static struct sk_buff * | ||
327 | sfq_dequeue(struct Qdisc* sch) | ||
328 | { | ||
329 | struct sfq_sched_data *q = qdisc_priv(sch); | ||
330 | struct sk_buff *skb; | ||
331 | sfq_index a, old_a; | ||
332 | |||
333 | /* No active slots */ | ||
334 | if (q->tail == SFQ_DEPTH) | ||
335 | return NULL; | ||
336 | |||
337 | a = old_a = q->next[q->tail]; | ||
338 | |||
339 | /* Grab packet */ | ||
340 | skb = __skb_dequeue(&q->qs[a]); | ||
341 | sfq_dec(q, a); | ||
342 | sch->q.qlen--; | ||
343 | |||
344 | /* Is the slot empty? */ | ||
345 | if (q->qs[a].qlen == 0) { | ||
346 | q->ht[q->hash[a]] = SFQ_DEPTH; | ||
347 | a = q->next[a]; | ||
348 | if (a == old_a) { | ||
349 | q->tail = SFQ_DEPTH; | ||
350 | return skb; | ||
351 | } | ||
352 | q->next[q->tail] = a; | ||
353 | q->allot[a] += q->quantum; | ||
354 | } else if ((q->allot[a] -= skb->len) <= 0) { | ||
355 | q->tail = a; | ||
356 | a = q->next[a]; | ||
357 | q->allot[a] += q->quantum; | ||
358 | } | ||
359 | return skb; | ||
360 | } | ||
361 | |||
362 | static void | ||
363 | sfq_reset(struct Qdisc* sch) | ||
364 | { | ||
365 | struct sk_buff *skb; | ||
366 | |||
367 | while ((skb = sfq_dequeue(sch)) != NULL) | ||
368 | kfree_skb(skb); | ||
369 | } | ||
370 | |||
371 | static void sfq_perturbation(unsigned long arg) | ||
372 | { | ||
373 | struct Qdisc *sch = (struct Qdisc*)arg; | ||
374 | struct sfq_sched_data *q = qdisc_priv(sch); | ||
375 | |||
376 | q->perturbation = net_random()&0x1F; | ||
377 | |||
378 | if (q->perturb_period) { | ||
379 | q->perturb_timer.expires = jiffies + q->perturb_period; | ||
380 | add_timer(&q->perturb_timer); | ||
381 | } | ||
382 | } | ||
383 | |||
384 | static int sfq_change(struct Qdisc *sch, struct rtattr *opt) | ||
385 | { | ||
386 | struct sfq_sched_data *q = qdisc_priv(sch); | ||
387 | struct tc_sfq_qopt *ctl = RTA_DATA(opt); | ||
388 | |||
389 | if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) | ||
390 | return -EINVAL; | ||
391 | |||
392 | sch_tree_lock(sch); | ||
393 | q->quantum = ctl->quantum ? : psched_mtu(sch->dev); | ||
394 | q->perturb_period = ctl->perturb_period*HZ; | ||
395 | if (ctl->limit) | ||
396 | q->limit = min_t(u32, ctl->limit, SFQ_DEPTH); | ||
397 | |||
398 | while (sch->q.qlen >= q->limit-1) | ||
399 | sfq_drop(sch); | ||
400 | |||
401 | del_timer(&q->perturb_timer); | ||
402 | if (q->perturb_period) { | ||
403 | q->perturb_timer.expires = jiffies + q->perturb_period; | ||
404 | add_timer(&q->perturb_timer); | ||
405 | } | ||
406 | sch_tree_unlock(sch); | ||
407 | return 0; | ||
408 | } | ||
409 | |||
410 | static int sfq_init(struct Qdisc *sch, struct rtattr *opt) | ||
411 | { | ||
412 | struct sfq_sched_data *q = qdisc_priv(sch); | ||
413 | int i; | ||
414 | |||
415 | init_timer(&q->perturb_timer); | ||
416 | q->perturb_timer.data = (unsigned long)sch; | ||
417 | q->perturb_timer.function = sfq_perturbation; | ||
418 | |||
419 | for (i=0; i<SFQ_HASH_DIVISOR; i++) | ||
420 | q->ht[i] = SFQ_DEPTH; | ||
421 | for (i=0; i<SFQ_DEPTH; i++) { | ||
422 | skb_queue_head_init(&q->qs[i]); | ||
423 | q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH; | ||
424 | q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH; | ||
425 | } | ||
426 | q->limit = SFQ_DEPTH; | ||
427 | q->max_depth = 0; | ||
428 | q->tail = SFQ_DEPTH; | ||
429 | if (opt == NULL) { | ||
430 | q->quantum = psched_mtu(sch->dev); | ||
431 | q->perturb_period = 0; | ||
432 | } else { | ||
433 | int err = sfq_change(sch, opt); | ||
434 | if (err) | ||
435 | return err; | ||
436 | } | ||
437 | for (i=0; i<SFQ_DEPTH; i++) | ||
438 | sfq_link(q, i); | ||
439 | return 0; | ||
440 | } | ||
441 | |||
442 | static void sfq_destroy(struct Qdisc *sch) | ||
443 | { | ||
444 | struct sfq_sched_data *q = qdisc_priv(sch); | ||
445 | del_timer(&q->perturb_timer); | ||
446 | } | ||
447 | |||
448 | static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) | ||
449 | { | ||
450 | struct sfq_sched_data *q = qdisc_priv(sch); | ||
451 | unsigned char *b = skb->tail; | ||
452 | struct tc_sfq_qopt opt; | ||
453 | |||
454 | opt.quantum = q->quantum; | ||
455 | opt.perturb_period = q->perturb_period/HZ; | ||
456 | |||
457 | opt.limit = q->limit; | ||
458 | opt.divisor = SFQ_HASH_DIVISOR; | ||
459 | opt.flows = q->limit; | ||
460 | |||
461 | RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); | ||
462 | |||
463 | return skb->len; | ||
464 | |||
465 | rtattr_failure: | ||
466 | skb_trim(skb, b - skb->data); | ||
467 | return -1; | ||
468 | } | ||
469 | |||
470 | static struct Qdisc_ops sfq_qdisc_ops = { | ||
471 | .next = NULL, | ||
472 | .cl_ops = NULL, | ||
473 | .id = "sfq", | ||
474 | .priv_size = sizeof(struct sfq_sched_data), | ||
475 | .enqueue = sfq_enqueue, | ||
476 | .dequeue = sfq_dequeue, | ||
477 | .requeue = sfq_requeue, | ||
478 | .drop = sfq_drop, | ||
479 | .init = sfq_init, | ||
480 | .reset = sfq_reset, | ||
481 | .destroy = sfq_destroy, | ||
482 | .change = NULL, | ||
483 | .dump = sfq_dump, | ||
484 | .owner = THIS_MODULE, | ||
485 | }; | ||
486 | |||
487 | static int __init sfq_module_init(void) | ||
488 | { | ||
489 | return register_qdisc(&sfq_qdisc_ops); | ||
490 | } | ||
491 | static void __exit sfq_module_exit(void) | ||
492 | { | ||
493 | unregister_qdisc(&sfq_qdisc_ops); | ||
494 | } | ||
495 | module_init(sfq_module_init) | ||
496 | module_exit(sfq_module_exit) | ||
497 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c new file mode 100644 index 000000000000..cb9711ea8c6c --- /dev/null +++ b/net/sched/sch_tbf.c | |||
@@ -0,0 +1,543 @@ | |||
1 | /* | ||
2 | * net/sched/sch_tbf.c Token Bucket Filter queue. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
10 | * Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs - | ||
11 | * original idea by Martin Devera | ||
12 | * | ||
13 | */ | ||
14 | |||
15 | #include <linux/config.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <asm/uaccess.h> | ||
18 | #include <asm/system.h> | ||
19 | #include <linux/bitops.h> | ||
20 | #include <linux/types.h> | ||
21 | #include <linux/kernel.h> | ||
22 | #include <linux/jiffies.h> | ||
23 | #include <linux/string.h> | ||
24 | #include <linux/mm.h> | ||
25 | #include <linux/socket.h> | ||
26 | #include <linux/sockios.h> | ||
27 | #include <linux/in.h> | ||
28 | #include <linux/errno.h> | ||
29 | #include <linux/interrupt.h> | ||
30 | #include <linux/if_ether.h> | ||
31 | #include <linux/inet.h> | ||
32 | #include <linux/netdevice.h> | ||
33 | #include <linux/etherdevice.h> | ||
34 | #include <linux/notifier.h> | ||
35 | #include <net/ip.h> | ||
36 | #include <net/route.h> | ||
37 | #include <linux/skbuff.h> | ||
38 | #include <net/sock.h> | ||
39 | #include <net/pkt_sched.h> | ||
40 | |||
41 | |||
42 | /* Simple Token Bucket Filter. | ||
43 | ======================================= | ||
44 | |||
45 | SOURCE. | ||
46 | ------- | ||
47 | |||
48 | None. | ||
49 | |||
50 | Description. | ||
51 | ------------ | ||
52 | |||
53 | A data flow obeys TBF with rate R and depth B, if for any | ||
54 | time interval t_i...t_f the number of transmitted bits | ||
55 | does not exceed B + R*(t_f-t_i). | ||
56 | |||
57 | Packetized version of this definition: | ||
58 | The sequence of packets of sizes s_i served at moments t_i | ||
59 | obeys TBF, if for any i<=k: | ||
60 | |||
61 | s_i+....+s_k <= B + R*(t_k - t_i) | ||
62 | |||
63 | Algorithm. | ||
64 | ---------- | ||
65 | |||
66 | Let N(t_i) be B/R initially and N(t) grow continuously with time as: | ||
67 | |||
68 | N(t+delta) = min{B/R, N(t) + delta} | ||
69 | |||
70 | If the first packet in queue has length S, it may be | ||
71 | transmitted only at the time t_* when S/R <= N(t_*), | ||
72 | and in this case N(t) jumps: | ||
73 | |||
74 | N(t_* + 0) = N(t_* - 0) - S/R. | ||
75 | |||
76 | |||
77 | |||
78 | Actually, QoS requires two TBFs to be applied to a data stream. | ||
79 | One of them controls the steady state burst size; the other | ||
80 | one, with rate P (peak rate) and depth M (equal to the link MTU), | ||
81 | limits bursts at a smaller time scale. | ||
82 | |||
83 | It is easy to see that P>R, and B>M. If P is infinity, this double | ||
84 | TBF is equivalent to a single one. | ||
85 | |||
86 | When TBF works in reshaping mode, latency is estimated as: | ||
87 | |||
88 | lat = max ((L-B)/R, (L-M)/P) | ||
89 | |||
90 | |||
91 | NOTES. | ||
92 | ------ | ||
93 | |||
94 | If TBF throttles, it starts a watchdog timer, which will wake it up | ||
95 | when it is ready to transmit. | ||
96 | Note that the minimal timer resolution is 1/HZ. | ||
97 | If no new packets arrive during this period, | ||
98 | or if the device is not awakened by an EOI for some previous packet, | ||
99 | TBF can stop its activity for 1/HZ. | ||
100 | |||
101 | |||
102 | This means that with depth B, the maximal rate is | ||
103 | |||
104 | R_crit = B*HZ | ||
105 | |||
106 | F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes. | ||
107 | |||
108 | Note that the peak rate TBF is much tougher: with MTU 1500, | ||
109 | P_crit = 150Kbytes/sec. So, if you need greater peak | ||
110 | rates, use Alpha with HZ=1000 :-) | ||
111 | |||
112 | With classful TBF, limit is just kept for backwards compatibility. | ||
113 | It is passed to the default bfifo qdisc - if the inner qdisc is | ||
114 | changed the limit is not effective anymore. | ||
115 | */ | ||
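 | |||
 | /* A minimal single-bucket userspace model of the admission rule above | ||
 |    (illustrative only: the names are made up; the real code below keeps | ||
 |    two buckets, R and P, and counts tokens in scheduler time units | ||
 |    taken from rate tables): | ||
 | |||
 |	#include <stdbool.h> | ||
 | |||
 |	struct bucket { | ||
 |		double tokens;	// seconds of accumulated credit, at most B/R | ||
 |		double last;	// time of the previous conforming packet | ||
 |	}; | ||
 | |||
 |	static bool tbf_model_conforms(struct bucket *b, double now, | ||
 |				       double R, double B, double len) | ||
 |	{ | ||
 |		double t = b->tokens + (now - b->last); | ||
 | |||
 |		if (t > B / R) | ||
 |			t = B / R;	// the bucket is full | ||
 |		t -= len / R;		// cost of this packet | ||
 |		if (t < 0) | ||
 |			return false;	// throttle; retry after -t seconds | ||
 |		b->tokens = t; | ||
 |		b->last = now; | ||
 |		return true;		// packet conforms, send it | ||
 |	} | ||
 | |||
 |    Worked instance of R_crit = B*HZ: a 10Mbit (1.25Mbyte/sec) link with | ||
 |    HZ=100 may have to sleep 10ms at a time, so B must cover at least | ||
 |    1.25Mbyte/sec * 10ms ~= 12.5Kbytes - the "~10Kbytes" figure above. | ||
 | */ | ||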
116 | |||
117 | struct tbf_sched_data | ||
118 | { | ||
119 | /* Parameters */ | ||
120 | u32 limit; /* Maximal length of backlog: bytes */ | ||
121 | u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ | ||
122 | u32 mtu; | ||
123 | u32 max_size; | ||
124 | struct qdisc_rate_table *R_tab; | ||
125 | struct qdisc_rate_table *P_tab; | ||
126 | |||
127 | /* Variables */ | ||
128 | long tokens; /* Current number of B tokens */ | ||
129 | long ptokens; /* Current number of P tokens */ | ||
130 | psched_time_t t_c; /* Time check-point */ | ||
131 | struct timer_list wd_timer; /* Watchdog timer */ | ||
132 | struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */ | ||
133 | }; | ||
134 | |||
135 | #define L2T(q,L) ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log]) | ||
136 | #define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log]) | ||
137 | |||
138 | static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) | ||
139 | { | ||
140 | struct tbf_sched_data *q = qdisc_priv(sch); | ||
141 | int ret; | ||
142 | |||
143 | if (skb->len > q->max_size) { | ||
144 | sch->qstats.drops++; | ||
145 | #ifdef CONFIG_NET_CLS_POLICE | ||
146 | if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch)) | ||
147 | #endif | ||
148 | kfree_skb(skb); | ||
149 | |||
150 | return NET_XMIT_DROP; | ||
151 | } | ||
152 | |||
153 | if ((ret = q->qdisc->enqueue(skb, q->qdisc)) != 0) { | ||
154 | sch->qstats.drops++; | ||
155 | return ret; | ||
156 | } | ||
157 | |||
158 | sch->q.qlen++; | ||
159 | sch->bstats.bytes += skb->len; | ||
160 | sch->bstats.packets++; | ||
161 | return 0; | ||
162 | } | ||
163 | |||
164 | static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch) | ||
165 | { | ||
166 | struct tbf_sched_data *q = qdisc_priv(sch); | ||
167 | int ret; | ||
168 | |||
169 | if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) { | ||
170 | sch->q.qlen++; | ||
171 | sch->qstats.requeues++; | ||
172 | } | ||
173 | |||
174 | return ret; | ||
175 | } | ||
176 | |||
177 | static unsigned int tbf_drop(struct Qdisc* sch) | ||
178 | { | ||
179 | struct tbf_sched_data *q = qdisc_priv(sch); | ||
180 | unsigned int len; | ||
181 | |||
182 | if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) { | ||
183 | sch->q.qlen--; | ||
184 | sch->qstats.drops++; | ||
185 | } | ||
186 | return len; | ||
187 | } | ||
188 | |||
189 | static void tbf_watchdog(unsigned long arg) | ||
190 | { | ||
191 | struct Qdisc *sch = (struct Qdisc*)arg; | ||
192 | |||
193 | sch->flags &= ~TCQ_F_THROTTLED; | ||
194 | netif_schedule(sch->dev); | ||
195 | } | ||
196 | |||
197 | static struct sk_buff *tbf_dequeue(struct Qdisc* sch) | ||
198 | { | ||
199 | struct tbf_sched_data *q = qdisc_priv(sch); | ||
200 | struct sk_buff *skb; | ||
201 | |||
202 | skb = q->qdisc->dequeue(q->qdisc); | ||
203 | |||
204 | if (skb) { | ||
205 | psched_time_t now; | ||
206 | long toks, delay; | ||
207 | long ptoks = 0; | ||
208 | unsigned int len = skb->len; | ||
209 | |||
210 | PSCHED_GET_TIME(now); | ||
211 | |||
212 | toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer); | ||
213 | |||
214 | if (q->P_tab) { | ||
215 | ptoks = toks + q->ptokens; | ||
216 | if (ptoks > (long)q->mtu) | ||
217 | ptoks = q->mtu; | ||
218 | ptoks -= L2T_P(q, len); | ||
219 | } | ||
220 | toks += q->tokens; | ||
221 | if (toks > (long)q->buffer) | ||
222 | toks = q->buffer; | ||
223 | toks -= L2T(q, len); | ||
224 | |||
225 | if ((toks|ptoks) >= 0) { | ||
226 | q->t_c = now; | ||
227 | q->tokens = toks; | ||
228 | q->ptokens = ptoks; | ||
229 | sch->q.qlen--; | ||
230 | sch->flags &= ~TCQ_F_THROTTLED; | ||
231 | return skb; | ||
232 | } | ||
233 | |||
234 | delay = PSCHED_US2JIFFIE(max_t(long, -toks, -ptoks)); | ||
235 | |||
236 | if (delay == 0) | ||
237 | delay = 1; | ||
238 | |||
239 | mod_timer(&q->wd_timer, jiffies+delay); | ||
240 | |||
241 | /* Maybe we have a shorter packet in the queue, | ||
242 | which can be sent now. It sounds cool, | ||
243 | but it is wrong in principle. | ||
244 | We MUST NOT reorder packets under these circumstances. | ||
245 | |||
246 | Really, if we split the flow into independent | ||
247 | subflows, it would be a very good solution. | ||
248 | This is the main idea of all FQ algorithms | ||
249 | (cf. CSZ, HPFQ, HFSC) | ||
250 | */ | ||
251 | |||
252 | if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) { | ||
253 | /* When requeue fails skb is dropped */ | ||
254 | sch->q.qlen--; | ||
255 | sch->qstats.drops++; | ||
256 | } | ||
257 | |||
258 | sch->flags |= TCQ_F_THROTTLED; | ||
259 | sch->qstats.overlimits++; | ||
260 | } | ||
261 | return NULL; | ||
262 | } | ||
263 | |||
264 | static void tbf_reset(struct Qdisc* sch) | ||
265 | { | ||
266 | struct tbf_sched_data *q = qdisc_priv(sch); | ||
267 | |||
268 | qdisc_reset(q->qdisc); | ||
269 | sch->q.qlen = 0; | ||
270 | PSCHED_GET_TIME(q->t_c); | ||
271 | q->tokens = q->buffer; | ||
272 | q->ptokens = q->mtu; | ||
273 | sch->flags &= ~TCQ_F_THROTTLED; | ||
274 | del_timer(&q->wd_timer); | ||
275 | } | ||
276 | |||
277 | static struct Qdisc *tbf_create_dflt_qdisc(struct net_device *dev, u32 limit) | ||
278 | { | ||
279 | struct Qdisc *q = qdisc_create_dflt(dev, &bfifo_qdisc_ops); | ||
280 | struct rtattr *rta; | ||
281 | int ret; | ||
282 | |||
283 | if (q) { | ||
284 | rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); | ||
285 | if (rta) { | ||
286 | rta->rta_type = RTM_NEWQDISC; | ||
287 | rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt)); | ||
288 | ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit; | ||
289 | |||
290 | ret = q->ops->change(q, rta); | ||
291 | kfree(rta); | ||
292 | |||
293 | if (ret == 0) | ||
294 | return q; | ||
295 | } | ||
296 | qdisc_destroy(q); | ||
297 | } | ||
298 | |||
299 | return NULL; | ||
300 | } | ||
301 | |||
302 | static int tbf_change(struct Qdisc* sch, struct rtattr *opt) | ||
303 | { | ||
304 | int err = -EINVAL; | ||
305 | struct tbf_sched_data *q = qdisc_priv(sch); | ||
306 | struct rtattr *tb[TCA_TBF_PTAB]; | ||
307 | struct tc_tbf_qopt *qopt; | ||
308 | struct qdisc_rate_table *rtab = NULL; | ||
309 | struct qdisc_rate_table *ptab = NULL; | ||
310 | struct Qdisc *child = NULL; | ||
311 | int max_size,n; | ||
312 | |||
313 | if (rtattr_parse_nested(tb, TCA_TBF_PTAB, opt) || | ||
314 | tb[TCA_TBF_PARMS-1] == NULL || | ||
315 | RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt)) | ||
316 | goto done; | ||
317 | |||
318 | qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]); | ||
319 | rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]); | ||
320 | if (rtab == NULL) | ||
321 | goto done; | ||
322 | |||
323 | if (qopt->peakrate.rate) { | ||
324 | if (qopt->peakrate.rate > qopt->rate.rate) | ||
325 | ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]); | ||
326 | if (ptab == NULL) | ||
327 | goto done; | ||
328 | } | ||
329 | |||
330 | for (n = 0; n < 256; n++) | ||
331 | if (rtab->data[n] > qopt->buffer) break; | ||
332 | max_size = (n << qopt->rate.cell_log)-1; | ||
333 | if (ptab) { | ||
334 | int size; | ||
335 | |||
336 | for (n = 0; n < 256; n++) | ||
337 | if (ptab->data[n] > qopt->mtu) break; | ||
338 | size = (n << qopt->peakrate.cell_log)-1; | ||
339 | if (size < max_size) max_size = size; | ||
340 | } | ||
341 | if (max_size < 0) | ||
342 | goto done; | ||
343 | |||
344 | if (q->qdisc == &noop_qdisc) { | ||
345 | if ((child = tbf_create_dflt_qdisc(sch->dev, qopt->limit)) == NULL) | ||
346 | goto done; | ||
347 | } | ||
348 | |||
349 | sch_tree_lock(sch); | ||
350 | if (child) q->qdisc = child; | ||
351 | q->limit = qopt->limit; | ||
352 | q->mtu = qopt->mtu; | ||
353 | q->max_size = max_size; | ||
354 | q->buffer = qopt->buffer; | ||
355 | q->tokens = q->buffer; | ||
356 | q->ptokens = q->mtu; | ||
357 | rtab = xchg(&q->R_tab, rtab); | ||
358 | ptab = xchg(&q->P_tab, ptab); | ||
359 | sch_tree_unlock(sch); | ||
360 | err = 0; | ||
361 | done: | ||
362 | if (rtab) | ||
363 | qdisc_put_rtab(rtab); | ||
364 | if (ptab) | ||
365 | qdisc_put_rtab(ptab); | ||
366 | return err; | ||
367 | } | ||
368 | |||
369 | static int tbf_init(struct Qdisc* sch, struct rtattr *opt) | ||
370 | { | ||
371 | struct tbf_sched_data *q = qdisc_priv(sch); | ||
372 | |||
373 | if (opt == NULL) | ||
374 | return -EINVAL; | ||
375 | |||
376 | PSCHED_GET_TIME(q->t_c); | ||
377 | init_timer(&q->wd_timer); | ||
378 | q->wd_timer.function = tbf_watchdog; | ||
379 | q->wd_timer.data = (unsigned long)sch; | ||
380 | |||
381 | q->qdisc = &noop_qdisc; | ||
382 | |||
383 | return tbf_change(sch, opt); | ||
384 | } | ||
385 | |||
386 | static void tbf_destroy(struct Qdisc *sch) | ||
387 | { | ||
388 | struct tbf_sched_data *q = qdisc_priv(sch); | ||
389 | |||
390 | del_timer(&q->wd_timer); | ||
391 | |||
392 | if (q->P_tab) | ||
393 | qdisc_put_rtab(q->P_tab); | ||
394 | if (q->R_tab) | ||
395 | qdisc_put_rtab(q->R_tab); | ||
396 | |||
397 | qdisc_destroy(q->qdisc); | ||
398 | } | ||
399 | |||
400 | static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) | ||
401 | { | ||
402 | struct tbf_sched_data *q = qdisc_priv(sch); | ||
403 | unsigned char *b = skb->tail; | ||
404 | struct rtattr *rta; | ||
405 | struct tc_tbf_qopt opt; | ||
406 | |||
407 | rta = (struct rtattr*)b; | ||
408 | RTA_PUT(skb, TCA_OPTIONS, 0, NULL); | ||
409 | |||
410 | opt.limit = q->limit; | ||
411 | opt.rate = q->R_tab->rate; | ||
412 | if (q->P_tab) | ||
413 | opt.peakrate = q->P_tab->rate; | ||
414 | else | ||
415 | memset(&opt.peakrate, 0, sizeof(opt.peakrate)); | ||
416 | opt.mtu = q->mtu; | ||
417 | opt.buffer = q->buffer; | ||
418 | RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt); | ||
419 | rta->rta_len = skb->tail - b; | ||
420 | |||
421 | return skb->len; | ||
422 | |||
423 | rtattr_failure: | ||
424 | skb_trim(skb, b - skb->data); | ||
425 | return -1; | ||
426 | } | ||
427 | |||
428 | static int tbf_dump_class(struct Qdisc *sch, unsigned long cl, | ||
429 | struct sk_buff *skb, struct tcmsg *tcm) | ||
430 | { | ||
431 | struct tbf_sched_data *q = qdisc_priv(sch); | ||
432 | |||
433 | if (cl != 1) /* only one class */ | ||
434 | return -ENOENT; | ||
435 | |||
436 | tcm->tcm_handle |= TC_H_MIN(1); | ||
437 | tcm->tcm_info = q->qdisc->handle; | ||
438 | |||
439 | return 0; | ||
440 | } | ||
441 | |||
442 | static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, | ||
443 | struct Qdisc **old) | ||
444 | { | ||
445 | struct tbf_sched_data *q = qdisc_priv(sch); | ||
446 | |||
447 | if (new == NULL) | ||
448 | new = &noop_qdisc; | ||
449 | |||
450 | sch_tree_lock(sch); | ||
451 | *old = xchg(&q->qdisc, new); | ||
452 | qdisc_reset(*old); | ||
453 | sch->q.qlen = 0; | ||
454 | sch_tree_unlock(sch); | ||
455 | |||
456 | return 0; | ||
457 | } | ||
458 | |||
459 | static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg) | ||
460 | { | ||
461 | struct tbf_sched_data *q = qdisc_priv(sch); | ||
462 | return q->qdisc; | ||
463 | } | ||
464 | |||
465 | static unsigned long tbf_get(struct Qdisc *sch, u32 classid) | ||
466 | { | ||
467 | return 1; | ||
468 | } | ||
469 | |||
470 | static void tbf_put(struct Qdisc *sch, unsigned long arg) | ||
471 | { | ||
472 | } | ||
473 | |||
474 | static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid, | ||
475 | struct rtattr **tca, unsigned long *arg) | ||
476 | { | ||
477 | return -ENOSYS; | ||
478 | } | ||
479 | |||
480 | static int tbf_delete(struct Qdisc *sch, unsigned long arg) | ||
481 | { | ||
482 | return -ENOSYS; | ||
483 | } | ||
484 | |||
485 | static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker) | ||
486 | { | ||
487 | if (!walker->stop) { | ||
488 | if (walker->count >= walker->skip) | ||
489 | if (walker->fn(sch, 1, walker) < 0) { | ||
490 | walker->stop = 1; | ||
491 | return; | ||
492 | } | ||
493 | walker->count++; | ||
494 | } | ||
495 | } | ||
496 | |||
497 | static struct tcf_proto **tbf_find_tcf(struct Qdisc *sch, unsigned long cl) | ||
498 | { | ||
499 | return NULL; | ||
500 | } | ||
501 | |||
502 | static struct Qdisc_class_ops tbf_class_ops = | ||
503 | { | ||
504 | .graft = tbf_graft, | ||
505 | .leaf = tbf_leaf, | ||
506 | .get = tbf_get, | ||
507 | .put = tbf_put, | ||
508 | .change = tbf_change_class, | ||
509 | .delete = tbf_delete, | ||
510 | .walk = tbf_walk, | ||
511 | .tcf_chain = tbf_find_tcf, | ||
512 | .dump = tbf_dump_class, | ||
513 | }; | ||
514 | |||
515 | static struct Qdisc_ops tbf_qdisc_ops = { | ||
516 | .next = NULL, | ||
517 | .cl_ops = &tbf_class_ops, | ||
518 | .id = "tbf", | ||
519 | .priv_size = sizeof(struct tbf_sched_data), | ||
520 | .enqueue = tbf_enqueue, | ||
521 | .dequeue = tbf_dequeue, | ||
522 | .requeue = tbf_requeue, | ||
523 | .drop = tbf_drop, | ||
524 | .init = tbf_init, | ||
525 | .reset = tbf_reset, | ||
526 | .destroy = tbf_destroy, | ||
527 | .change = tbf_change, | ||
528 | .dump = tbf_dump, | ||
529 | .owner = THIS_MODULE, | ||
530 | }; | ||
531 | |||
532 | static int __init tbf_module_init(void) | ||
533 | { | ||
534 | return register_qdisc(&tbf_qdisc_ops); | ||
535 | } | ||
536 | |||
537 | static void __exit tbf_module_exit(void) | ||
538 | { | ||
539 | unregister_qdisc(&tbf_qdisc_ops); | ||
540 | } | ||
541 | module_init(tbf_module_init) | ||
542 | module_exit(tbf_module_exit) | ||
543 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c new file mode 100644 index 000000000000..6cf0342706b5 --- /dev/null +++ b/net/sched/sch_teql.c | |||
@@ -0,0 +1,511 @@ | |||
1 | /* net/sched/sch_teql.c "True" (or "trivial") link equalizer. | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of the GNU General Public License | ||
5 | * as published by the Free Software Foundation; either version | ||
6 | * 2 of the License, or (at your option) any later version. | ||
7 | * | ||
8 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
9 | */ | ||
10 | |||
11 | #include <linux/module.h> | ||
12 | #include <asm/uaccess.h> | ||
13 | #include <asm/system.h> | ||
14 | #include <linux/bitops.h> | ||
15 | #include <linux/types.h> | ||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/sched.h> | ||
18 | #include <linux/string.h> | ||
19 | #include <linux/mm.h> | ||
20 | #include <linux/socket.h> | ||
21 | #include <linux/sockios.h> | ||
22 | #include <linux/in.h> | ||
23 | #include <linux/errno.h> | ||
24 | #include <linux/interrupt.h> | ||
25 | #include <linux/if_ether.h> | ||
26 | #include <linux/inet.h> | ||
27 | #include <linux/netdevice.h> | ||
28 | #include <linux/etherdevice.h> | ||
29 | #include <linux/notifier.h> | ||
30 | #include <linux/init.h> | ||
31 | #include <net/ip.h> | ||
32 | #include <net/route.h> | ||
33 | #include <linux/skbuff.h> | ||
34 | #include <linux/moduleparam.h> | ||
35 | #include <net/sock.h> | ||
36 | #include <net/pkt_sched.h> | ||
37 | |||
38 | /* | ||
39 | How to set it up. | ||
40 | ------------------ | ||
41 | |||
42 | After loading this module you will find a new device teqlN | ||
43 | and a new qdisc with the same name. To join a slave to the equalizer | ||
44 | you should just set this qdisc on a device, f.e. | ||
45 | |||
46 | # tc qdisc add dev eth0 root teql0 | ||
47 | # tc qdisc add dev eth1 root teql0 | ||
48 | |||
49 | That's all. Full PnP 8) | ||
50 | |||
51 | Applicability. | ||
52 | -------------- | ||
53 | |||
54 | 1. Slave devices MUST be active devices, i.e., they must raise the tbusy | ||
55 | signal and generate EOI events. If you want to equalize virtual devices | ||
56 | like tunnels, use a normal eql device. | ||
57 | 2. This device puts no limitations on physical slave characteristics, | ||
58 | f.e. it will equalize a 9600 baud line and 100Mb ethernet perfectly :-) | ||
59 | Certainly, a large difference in link speeds will make the resulting | ||
60 | equalized link unusable because of huge packet reordering. | ||
61 | I estimate the upper useful difference as ~10 times. | ||
62 | 3. If the slave requires address resolution, only protocols using | ||
63 | neighbour cache (IPv4/IPv6) will work over the equalized link. | ||
64 | Other protocols are still allowed to use the slave device directly, | ||
65 | which will not break load balancing, though native slave | ||
66 | traffic will have the highest priority. */ | ||
67 | |||
68 | struct teql_master | ||
69 | { | ||
70 | struct Qdisc_ops qops; | ||
71 | struct net_device *dev; | ||
72 | struct Qdisc *slaves; | ||
73 | struct list_head master_list; | ||
74 | struct net_device_stats stats; | ||
75 | }; | ||
76 | |||
77 | struct teql_sched_data | ||
78 | { | ||
79 | struct Qdisc *next; | ||
80 | struct teql_master *m; | ||
81 | struct neighbour *ncache; | ||
82 | struct sk_buff_head q; | ||
83 | }; | ||
84 | |||
85 | #define NEXT_SLAVE(q) (((struct teql_sched_data*)qdisc_priv(q))->next) | ||
86 | |||
87 | #define FMASK (IFF_BROADCAST|IFF_POINTOPOINT) | ||
88 | |||
89 | /* "teql*" qdisc routines */ | ||
90 | |||
91 | static int | ||
92 | teql_enqueue(struct sk_buff *skb, struct Qdisc* sch) | ||
93 | { | ||
94 | struct net_device *dev = sch->dev; | ||
95 | struct teql_sched_data *q = qdisc_priv(sch); | ||
96 | |||
97 | __skb_queue_tail(&q->q, skb); | ||
98 | if (q->q.qlen <= dev->tx_queue_len) { | ||
99 | sch->bstats.bytes += skb->len; | ||
100 | sch->bstats.packets++; | ||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | __skb_unlink(skb, &q->q); | ||
105 | kfree_skb(skb); | ||
106 | sch->qstats.drops++; | ||
107 | return NET_XMIT_DROP; | ||
108 | } | ||
109 | |||
110 | static int | ||
111 | teql_requeue(struct sk_buff *skb, struct Qdisc* sch) | ||
112 | { | ||
113 | struct teql_sched_data *q = qdisc_priv(sch); | ||
114 | |||
115 | __skb_queue_head(&q->q, skb); | ||
116 | sch->qstats.requeues++; | ||
117 | return 0; | ||
118 | } | ||
119 | |||
120 | static struct sk_buff * | ||
121 | teql_dequeue(struct Qdisc* sch) | ||
122 | { | ||
123 | struct teql_sched_data *dat = qdisc_priv(sch); | ||
124 | struct sk_buff *skb; | ||
125 | |||
126 | skb = __skb_dequeue(&dat->q); | ||
127 | if (skb == NULL) { | ||
128 | struct net_device *m = dat->m->dev->qdisc->dev; | ||
129 | if (m) { | ||
130 | dat->m->slaves = sch; | ||
131 | netif_wake_queue(m); | ||
132 | } | ||
133 | } | ||
134 | sch->q.qlen = dat->q.qlen + dat->m->dev->qdisc->q.qlen; | ||
135 | return skb; | ||
136 | } | ||
137 | |||
138 | static __inline__ void | ||
139 | teql_neigh_release(struct neighbour *n) | ||
140 | { | ||
141 | if (n) | ||
142 | neigh_release(n); | ||
143 | } | ||
144 | |||
145 | static void | ||
146 | teql_reset(struct Qdisc* sch) | ||
147 | { | ||
148 | struct teql_sched_data *dat = qdisc_priv(sch); | ||
149 | |||
150 | skb_queue_purge(&dat->q); | ||
151 | sch->q.qlen = 0; | ||
152 | teql_neigh_release(xchg(&dat->ncache, NULL)); | ||
153 | } | ||
154 | |||
155 | static void | ||
156 | teql_destroy(struct Qdisc* sch) | ||
157 | { | ||
158 | struct Qdisc *q, *prev; | ||
159 | struct teql_sched_data *dat = qdisc_priv(sch); | ||
160 | struct teql_master *master = dat->m; | ||
161 | |||
162 | if ((prev = master->slaves) != NULL) { | ||
163 | do { | ||
164 | q = NEXT_SLAVE(prev); | ||
165 | if (q == sch) { | ||
166 | NEXT_SLAVE(prev) = NEXT_SLAVE(q); | ||
167 | if (q == master->slaves) { | ||
168 | master->slaves = NEXT_SLAVE(q); | ||
169 | if (q == master->slaves) { | ||
170 | master->slaves = NULL; | ||
171 | spin_lock_bh(&master->dev->queue_lock); | ||
172 | qdisc_reset(master->dev->qdisc); | ||
173 | spin_unlock_bh(&master->dev->queue_lock); | ||
174 | } | ||
175 | } | ||
176 | skb_queue_purge(&dat->q); | ||
177 | teql_neigh_release(xchg(&dat->ncache, NULL)); | ||
178 | break; | ||
179 | } | ||
180 | |||
181 | } while ((prev = q) != master->slaves); | ||
182 | } | ||
183 | } | ||
184 | |||
185 | static int teql_qdisc_init(struct Qdisc *sch, struct rtattr *opt) | ||
186 | { | ||
187 | struct net_device *dev = sch->dev; | ||
188 | struct teql_master *m = (struct teql_master*)sch->ops; | ||
189 | struct teql_sched_data *q = qdisc_priv(sch); | ||
190 | |||
191 | if (dev->hard_header_len > m->dev->hard_header_len) | ||
192 | return -EINVAL; | ||
193 | |||
194 | if (m->dev == dev) | ||
195 | return -ELOOP; | ||
196 | |||
197 | q->m = m; | ||
198 | |||
199 | skb_queue_head_init(&q->q); | ||
200 | |||
201 | if (m->slaves) { | ||
202 | if (m->dev->flags & IFF_UP) { | ||
203 | if ((m->dev->flags&IFF_POINTOPOINT && !(dev->flags&IFF_POINTOPOINT)) | ||
204 | || (m->dev->flags&IFF_BROADCAST && !(dev->flags&IFF_BROADCAST)) | ||
205 | || (m->dev->flags&IFF_MULTICAST && !(dev->flags&IFF_MULTICAST)) | ||
206 | || dev->mtu < m->dev->mtu) | ||
207 | return -EINVAL; | ||
208 | } else { | ||
209 | if (!(dev->flags&IFF_POINTOPOINT)) | ||
210 | m->dev->flags &= ~IFF_POINTOPOINT; | ||
211 | if (!(dev->flags&IFF_BROADCAST)) | ||
212 | m->dev->flags &= ~IFF_BROADCAST; | ||
213 | if (!(dev->flags&IFF_MULTICAST)) | ||
214 | m->dev->flags &= ~IFF_MULTICAST; | ||
215 | if (dev->mtu < m->dev->mtu) | ||
216 | m->dev->mtu = dev->mtu; | ||
217 | } | ||
218 | q->next = NEXT_SLAVE(m->slaves); | ||
219 | NEXT_SLAVE(m->slaves) = sch; | ||
220 | } else { | ||
221 | q->next = sch; | ||
222 | m->slaves = sch; | ||
223 | m->dev->mtu = dev->mtu; | ||
224 | m->dev->flags = (m->dev->flags&~FMASK)|(dev->flags&FMASK); | ||
225 | } | ||
226 | return 0; | ||
227 | } | ||
228 | |||
229 | /* "teql*" netdevice routines */ | ||
230 | |||
231 | static int | ||
232 | __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev) | ||
233 | { | ||
234 | struct teql_sched_data *q = qdisc_priv(dev->qdisc); | ||
235 | struct neighbour *mn = skb->dst->neighbour; | ||
236 | struct neighbour *n = q->ncache; | ||
237 | |||
238 | if (mn->tbl == NULL) | ||
239 | return -EINVAL; | ||
240 | if (n && n->tbl == mn->tbl && | ||
241 | memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) { | ||
242 | atomic_inc(&n->refcnt); | ||
243 | } else { | ||
244 | n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev); | ||
245 | if (IS_ERR(n)) | ||
246 | return PTR_ERR(n); | ||
247 | } | ||
248 | if (neigh_event_send(n, skb_res) == 0) { | ||
249 | int err; | ||
250 | read_lock(&n->lock); | ||
251 | err = dev->hard_header(skb, dev, ntohs(skb->protocol), n->ha, NULL, skb->len); | ||
252 | read_unlock(&n->lock); | ||
253 | if (err < 0) { | ||
254 | neigh_release(n); | ||
255 | return -EINVAL; | ||
256 | } | ||
257 | teql_neigh_release(xchg(&q->ncache, n)); | ||
258 | return 0; | ||
259 | } | ||
260 | neigh_release(n); | ||
261 | return (skb_res == NULL) ? -EAGAIN : 1; | ||
262 | } | ||
263 | |||
264 | static __inline__ int | ||
265 | teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev) | ||
266 | { | ||
267 | if (dev->hard_header == NULL || | ||
268 | skb->dst == NULL || | ||
269 | skb->dst->neighbour == NULL) | ||
270 | return 0; | ||
271 | return __teql_resolve(skb, skb_res, dev); | ||
272 | } | ||
273 | |||
274 | static int teql_master_xmit(struct sk_buff *skb, struct net_device *dev) | ||
275 | { | ||
276 | struct teql_master *master = (void*)dev->priv; | ||
277 | struct Qdisc *start, *q; | ||
278 | int busy; | ||
279 | int nores; | ||
280 | int len = skb->len; | ||
281 | struct sk_buff *skb_res = NULL; | ||
282 | |||
283 | start = master->slaves; | ||
284 | |||
285 | restart: | ||
286 | nores = 0; | ||
287 | busy = 0; | ||
288 | |||
289 | if ((q = start) == NULL) | ||
290 | goto drop; | ||
291 | |||
292 | do { | ||
293 | struct net_device *slave = q->dev; | ||
294 | |||
295 | if (slave->qdisc_sleeping != q) | ||
296 | continue; | ||
297 | if (netif_queue_stopped(slave) || ! netif_running(slave)) { | ||
298 | busy = 1; | ||
299 | continue; | ||
300 | } | ||
301 | |||
302 | switch (teql_resolve(skb, skb_res, slave)) { | ||
303 | case 0: | ||
304 | if (spin_trylock(&slave->xmit_lock)) { | ||
305 | slave->xmit_lock_owner = smp_processor_id(); | ||
306 | if (!netif_queue_stopped(slave) && | ||
307 | slave->hard_start_xmit(skb, slave) == 0) { | ||
308 | slave->xmit_lock_owner = -1; | ||
309 | spin_unlock(&slave->xmit_lock); | ||
310 | master->slaves = NEXT_SLAVE(q); | ||
311 | netif_wake_queue(dev); | ||
312 | master->stats.tx_packets++; | ||
313 | master->stats.tx_bytes += len; | ||
314 | return 0; | ||
315 | } | ||
316 | slave->xmit_lock_owner = -1; | ||
317 | spin_unlock(&slave->xmit_lock); | ||
318 | } | ||
319 | if (netif_queue_stopped(dev)) | ||
320 | busy = 1; | ||
321 | break; | ||
322 | case 1: | ||
323 | master->slaves = NEXT_SLAVE(q); | ||
324 | return 0; | ||
325 | default: | ||
326 | nores = 1; | ||
327 | break; | ||
328 | } | ||
329 | __skb_pull(skb, skb->nh.raw - skb->data); | ||
330 | } while ((q = NEXT_SLAVE(q)) != start); | ||
331 | |||
332 | if (nores && skb_res == NULL) { | ||
333 | skb_res = skb; | ||
334 | goto restart; | ||
335 | } | ||
336 | |||
337 | if (busy) { | ||
338 | netif_stop_queue(dev); | ||
339 | return 1; | ||
340 | } | ||
341 | master->stats.tx_errors++; | ||
342 | |||
343 | drop: | ||
344 | master->stats.tx_dropped++; | ||
345 | dev_kfree_skb(skb); | ||
346 | return 0; | ||
347 | } | ||
348 | |||
349 | static int teql_master_open(struct net_device *dev) | ||
350 | { | ||
351 | struct Qdisc * q; | ||
352 | struct teql_master *m = (void*)dev->priv; | ||
353 | int mtu = 0xFFFE; | ||
354 | unsigned flags = IFF_NOARP|IFF_MULTICAST; | ||
355 | |||
356 | if (m->slaves == NULL) | ||
357 | return -EUNATCH; | ||
358 | |||
359 | flags = FMASK; | ||
360 | |||
361 | q = m->slaves; | ||
362 | do { | ||
363 | struct net_device *slave = q->dev; | ||
364 | |||
365 | if (slave == NULL) | ||
366 | return -EUNATCH; | ||
367 | |||
368 | if (slave->mtu < mtu) | ||
369 | mtu = slave->mtu; | ||
370 | if (slave->hard_header_len > LL_MAX_HEADER) | ||
371 | return -EINVAL; | ||
372 | |||
373 | /* If all the slaves are BROADCAST, the master is BROADCAST. | ||
374 | If all the slaves are PtP, the master is PtP. | ||
375 | Otherwise, the master is NBMA. | ||
376 | */ | ||
377 | if (!(slave->flags&IFF_POINTOPOINT)) | ||
378 | flags &= ~IFF_POINTOPOINT; | ||
379 | if (!(slave->flags&IFF_BROADCAST)) | ||
380 | flags &= ~IFF_BROADCAST; | ||
381 | if (!(slave->flags&IFF_MULTICAST)) | ||
382 | flags &= ~IFF_MULTICAST; | ||
383 | } while ((q = NEXT_SLAVE(q)) != m->slaves); | ||
384 | |||
385 | m->dev->mtu = mtu; | ||
386 | m->dev->flags = (m->dev->flags&~FMASK) | flags; | ||
387 | netif_start_queue(m->dev); | ||
388 | return 0; | ||
389 | } | ||
390 | |||
391 | static int teql_master_close(struct net_device *dev) | ||
392 | { | ||
393 | netif_stop_queue(dev); | ||
394 | return 0; | ||
395 | } | ||
396 | |||
397 | static struct net_device_stats *teql_master_stats(struct net_device *dev) | ||
398 | { | ||
399 | struct teql_master *m = (void*)dev->priv; | ||
400 | return &m->stats; | ||
401 | } | ||
402 | |||
403 | static int teql_master_mtu(struct net_device *dev, int new_mtu) | ||
404 | { | ||
405 | struct teql_master *m = (void*)dev->priv; | ||
406 | struct Qdisc *q; | ||
407 | |||
408 | if (new_mtu < 68) | ||
409 | return -EINVAL; | ||
410 | |||
411 | q = m->slaves; | ||
412 | if (q) { | ||
413 | do { | ||
414 | if (new_mtu > q->dev->mtu) | ||
415 | return -EINVAL; | ||
416 | } while ((q=NEXT_SLAVE(q)) != m->slaves); | ||
417 | } | ||
418 | |||
419 | dev->mtu = new_mtu; | ||
420 | return 0; | ||
421 | } | ||
422 | |||
423 | static __init void teql_master_setup(struct net_device *dev) | ||
424 | { | ||
425 | struct teql_master *master = dev->priv; | ||
426 | struct Qdisc_ops *ops = &master->qops; | ||
427 | |||
428 | master->dev = dev; | ||
429 | ops->priv_size = sizeof(struct teql_sched_data); | ||
430 | |||
431 | ops->enqueue = teql_enqueue; | ||
432 | ops->dequeue = teql_dequeue; | ||
433 | ops->requeue = teql_requeue; | ||
434 | ops->init = teql_qdisc_init; | ||
435 | ops->reset = teql_reset; | ||
436 | ops->destroy = teql_destroy; | ||
437 | ops->owner = THIS_MODULE; | ||
438 | |||
439 | dev->open = teql_master_open; | ||
440 | dev->hard_start_xmit = teql_master_xmit; | ||
441 | dev->stop = teql_master_close; | ||
442 | dev->get_stats = teql_master_stats; | ||
443 | dev->change_mtu = teql_master_mtu; | ||
444 | dev->type = ARPHRD_VOID; | ||
445 | dev->mtu = 1500; | ||
446 | dev->tx_queue_len = 100; | ||
447 | dev->flags = IFF_NOARP; | ||
448 | dev->hard_header_len = LL_MAX_HEADER; | ||
449 | SET_MODULE_OWNER(dev); | ||
450 | } | ||
451 | |||
452 | static LIST_HEAD(master_dev_list); | ||
453 | static int max_equalizers = 1; | ||
454 | module_param(max_equalizers, int, 0); | ||
455 | MODULE_PARM_DESC(max_equalizers, "Max number of link equalizers"); | ||
456 | |||
457 | static int __init teql_init(void) | ||
458 | { | ||
459 | int i; | ||
460 | int err = -ENODEV; | ||
461 | |||
462 | for (i = 0; i < max_equalizers; i++) { | ||
463 | struct net_device *dev; | ||
464 | struct teql_master *master; | ||
465 | |||
466 | dev = alloc_netdev(sizeof(struct teql_master), | ||
467 | "teql%d", teql_master_setup); | ||
468 | if (!dev) { | ||
469 | err = -ENOMEM; | ||
470 | break; | ||
471 | } | ||
472 | |||
473 | if ((err = register_netdev(dev))) { | ||
474 | free_netdev(dev); | ||
475 | break; | ||
476 | } | ||
477 | |||
478 | master = dev->priv; | ||
479 | |||
480 | strlcpy(master->qops.id, dev->name, IFNAMSIZ); | ||
481 | err = register_qdisc(&master->qops); | ||
482 | |||
483 | if (err) { | ||
484 | unregister_netdev(dev); | ||
485 | free_netdev(dev); | ||
486 | break; | ||
487 | } | ||
488 | |||
489 | list_add_tail(&master->master_list, &master_dev_list); | ||
490 | } | ||
491 | return i ? 0 : err; | ||
492 | } | ||
493 | |||
494 | static void __exit teql_exit(void) | ||
495 | { | ||
496 | struct teql_master *master, *nxt; | ||
497 | |||
498 | list_for_each_entry_safe(master, nxt, &master_dev_list, master_list) { | ||
499 | |||
500 | list_del(&master->master_list); | ||
501 | |||
502 | unregister_qdisc(&master->qops); | ||
503 | unregister_netdev(master->dev); | ||
504 | free_netdev(master->dev); | ||
505 | } | ||
506 | } | ||
507 | |||
508 | module_init(teql_init); | ||
509 | module_exit(teql_exit); | ||
510 | |||
511 | MODULE_LICENSE("GPL"); | ||