author    Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
committer Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
commit    c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree      ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /net/sched/sch_sfq.c
parent    ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent    6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts:
litmus/sched_cedf.c
Diffstat (limited to 'net/sched/sch_sfq.c')
-rw-r--r--  net/sched/sch_sfq.c  409
1 file changed, 253 insertions(+), 156 deletions(-)
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 201cbac2b32c..b6ea6afa55b0 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -21,6 +21,7 @@
 #include <linux/skbuff.h>
 #include <linux/jhash.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 #include <net/ip.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
@@ -67,55 +68,81 @@
 
 	IMPLEMENTATION:
 	This implementation limits maximal queue length to 128;
-	maximal mtu to 2^15-1; number of hash buckets to 1024.
+	max mtu to 2^18-1; max 128 flows, number of hash buckets to 1024.
 	The only goal of this restrictions was that all data
-	fit into one 4K page :-). Struct sfq_sched_data is
-	organized in anti-cache manner: all the data for a bucket
-	are scattered over different locations. This is not good,
-	but it allowed me to put it into 4K.
+	fit into one 4K page on 32bit arches.
 
 	It is easy to increase these values, but not in flight. */
 
-#define SFQ_DEPTH		128
-#define SFQ_HASH_DIVISOR	1024
+#define SFQ_DEPTH		128 /* max number of packets per flow */
+#define SFQ_SLOTS		128 /* max number of flows */
+#define SFQ_EMPTY_SLOT		255
+#define SFQ_DEFAULT_HASH_DIVISOR 1024
 
-/* This type should contain at least SFQ_DEPTH*2 values */
+/* We use 16 bits to store allot, and want to handle packets up to 64K
+ * Scale allot by 8 (1<<3) so that no overflow occurs.
+ */
+#define SFQ_ALLOT_SHIFT		3
+#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
+
+/* This type should contain at least SFQ_DEPTH + SFQ_SLOTS values */
 typedef unsigned char sfq_index;
 
-struct sfq_head
-{
+/*
+ * We dont use pointers to save space.
+ * Small indexes [0 ... SFQ_SLOTS - 1] are 'pointers' to slots[] array
+ * while following values [SFQ_SLOTS ... SFQ_SLOTS + SFQ_DEPTH - 1]
+ * are 'pointers' to dep[] array
+ */
+struct sfq_head {
 	sfq_index	next;
 	sfq_index	prev;
 };
 
-struct sfq_sched_data
-{
+struct sfq_slot {
+	struct sk_buff	*skblist_next;
+	struct sk_buff	*skblist_prev;
+	sfq_index	qlen;		/* number of skbs in skblist */
+	sfq_index	next;		/* next slot in sfq chain */
+	struct sfq_head	dep;		/* anchor in dep[] chains */
+	unsigned short	hash;		/* hash value (index in ht[]) */
+	short		allot;		/* credit for this slot */
+};
+
+struct sfq_sched_data {
 /* Parameters */
 	int		perturb_period;
-	unsigned	quantum;	/* Allotment per round: MUST BE >= MTU */
+	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
 	int		limit;
-
+	unsigned int	divisor;	/* number of slots in hash table */
 /* Variables */
 	struct tcf_proto *filter_list;
 	struct timer_list perturb_timer;
 	u32		perturbation;
-	sfq_index	tail;		/* Index of current slot in round */
-	sfq_index	max_depth;	/* Maximal depth */
-
-	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
-	sfq_index	next[SFQ_DEPTH];	/* Active slots link */
-	short		allot[SFQ_DEPTH];	/* Current allotment per slot */
-	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slots */
-	struct sk_buff_head	qs[SFQ_DEPTH];	/* Slot queue */
-	struct sfq_head	dep[SFQ_DEPTH*2];	/* Linked list of slots, indexed by depth */
+	sfq_index	cur_depth;	/* depth of longest slot */
+	unsigned short	scaled_quantum;	/* SFQ_ALLOT_SIZE(quantum) */
+	struct sfq_slot	*tail;		/* current slot in round */
+	sfq_index	*ht;		/* Hash table (divisor slots) */
+	struct sfq_slot	slots[SFQ_SLOTS];
+	struct sfq_head	dep[SFQ_DEPTH];	/* Linked list of slots, indexed by depth */
 };
 
-static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
+/*
+ * sfq_head are either in a sfq_slot or in dep[] array
+ */
+static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
 {
-	return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
+	if (val < SFQ_SLOTS)
+		return &q->slots[val].dep;
+	return &q->dep[val - SFQ_SLOTS];
 }
 
-static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
+static unsigned int sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
+{
+	return jhash_2words(h, h1, q->perturbation) & (q->divisor - 1);
+}
+
+static unsigned int sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
 {
 	u32 h, h2;
 
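Note: the rework stores per-slot credit ("allot") in 16 bits, so byte counts are scaled down by 1 << SFQ_ALLOT_SHIFT to keep even 64K packets from overflowing the field. A minimal standalone sketch of that arithmetic (userspace C, not kernel code; the quantum value is just an example):

#include <stdio.h>

#define SFQ_ALLOT_SHIFT 3
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define SFQ_ALLOT_SIZE(x) DIV_ROUND_UP((x), 1 << SFQ_ALLOT_SHIFT)

int main(void)
{
	unsigned int quantum = 1514;	/* one MTU-sized quantum, for example */
	unsigned short scaled = SFQ_ALLOT_SIZE(quantum);

	/* 1514 bytes -> ceil(1514 / 8) = 190 scaled units */
	printf("quantum=%u -> scaled_quantum=%u\n", quantum, scaled);

	/* even a 64K packet stays well inside a 16-bit allot field */
	printf("64K packet costs %u units\n", SFQ_ALLOT_SIZE(65535u));
	return 0;
}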
@@ -123,40 +150,39 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
 	case htons(ETH_P_IP):
 	{
 		const struct iphdr *iph;
+		int poff;
 
 		if (!pskb_network_may_pull(skb, sizeof(*iph)))
 			goto err;
 		iph = ip_hdr(skb);
 		h = (__force u32)iph->daddr;
 		h2 = (__force u32)iph->saddr ^ iph->protocol;
-		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
-		    (iph->protocol == IPPROTO_TCP ||
-		     iph->protocol == IPPROTO_UDP ||
-		     iph->protocol == IPPROTO_UDPLITE ||
-		     iph->protocol == IPPROTO_SCTP ||
-		     iph->protocol == IPPROTO_DCCP ||
-		     iph->protocol == IPPROTO_ESP) &&
-		    pskb_network_may_pull(skb, iph->ihl * 4 + 4))
-			h2 ^= *(((u32*)iph) + iph->ihl);
+		if (iph->frag_off & htons(IP_MF | IP_OFFSET))
+			break;
+		poff = proto_ports_offset(iph->protocol);
+		if (poff >= 0 &&
+		    pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
+			iph = ip_hdr(skb);
+			h2 ^= *(u32 *)((void *)iph + iph->ihl * 4 + poff);
+		}
 		break;
 	}
 	case htons(ETH_P_IPV6):
 	{
-		struct ipv6hdr *iph;
+		const struct ipv6hdr *iph;
+		int poff;
 
 		if (!pskb_network_may_pull(skb, sizeof(*iph)))
 			goto err;
 		iph = ipv6_hdr(skb);
 		h = (__force u32)iph->daddr.s6_addr32[3];
 		h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
-		if ((iph->nexthdr == IPPROTO_TCP ||
-		     iph->nexthdr == IPPROTO_UDP ||
-		     iph->nexthdr == IPPROTO_UDPLITE ||
-		     iph->nexthdr == IPPROTO_SCTP ||
-		     iph->nexthdr == IPPROTO_DCCP ||
-		     iph->nexthdr == IPPROTO_ESP) &&
-		    pskb_network_may_pull(skb, sizeof(*iph) + 4))
-			h2 ^= *(u32*)&iph[1];
+		poff = proto_ports_offset(iph->nexthdr);
+		if (poff >= 0 &&
+		    pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) {
+			iph = ipv6_hdr(skb);
+			h2 ^= *(u32 *)((void *)iph + sizeof(*iph) + poff);
+		}
 		break;
 	}
 	default:
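Note: both address families now delegate the transport-port lookup to proto_ports_offset() instead of an open-coded protocol whitelist, and mix the 32-bit source/destination port word into h2. A rough userspace sketch of the resulting flow key (the mixer below is a toy stand-in, NOT the kernel's jhash_2words(), and the offset table only covers TCP/UDP for illustration):

#include <stdint.h>
#include <stdio.h>

static uint32_t jhash32(uint32_t a, uint32_t b, uint32_t seed)
{
	/* toy mixer for illustration only -- not the kernel jhash */
	uint32_t h = a * 0x9e3779b1u;
	h ^= b + seed;
	h ^= h >> 16;
	return h * 0x85ebca6bu;
}

static int port_offset(uint8_t proto)
{
	switch (proto) {
	case 6:		/* TCP */
	case 17:	/* UDP */
		return 0;	/* ports at start of transport header */
	default:
		return -1;	/* no port word to mix in */
	}
}

int main(void)
{
	uint32_t daddr = 0x0a000001, saddr = 0x0a000002;	/* example IPs */
	uint8_t proto = 6;
	uint32_t ports = (12345u << 16) | 80;	/* sport:dport word */
	uint32_t perturbation = 0xdeadbeef;	/* plays q->perturbation */
	uint32_t divisor = 1024;		/* plays q->divisor, power of 2 */

	uint32_t h = daddr;
	uint32_t h2 = saddr ^ proto;
	if (port_offset(proto) >= 0)
		h2 ^= ports;

	/* final fold masks with divisor - 1, as in sfq_fold_hash() */
	printf("bucket = %u\n", jhash32(h, h2, perturbation) & (divisor - 1));
	return 0;
}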
@@ -177,7 +203,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 
 	if (TC_H_MAJ(skb->priority) == sch->handle &&
 	    TC_H_MIN(skb->priority) > 0 &&
-	    TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
+	    TC_H_MIN(skb->priority) <= q->divisor)
 		return TC_H_MIN(skb->priority);
 
 	if (!q->filter_list)
@@ -195,36 +221,47 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 			return 0;
 		}
 #endif
-		if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
+		if (TC_H_MIN(res.classid) <= q->divisor)
 			return TC_H_MIN(res.classid);
 	}
 	return 0;
 }
 
+/*
+ * x : slot number [0 .. SFQ_SLOTS - 1]
+ */
 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
 {
 	sfq_index p, n;
-	int d = q->qs[x].qlen + SFQ_DEPTH;
+	int qlen = q->slots[x].qlen;
 
-	p = d;
-	n = q->dep[d].next;
-	q->dep[x].next = n;
-	q->dep[x].prev = p;
-	q->dep[p].next = q->dep[n].prev = x;
+	p = qlen + SFQ_SLOTS;
+	n = q->dep[qlen].next;
+
+	q->slots[x].dep.next = n;
+	q->slots[x].dep.prev = p;
+
+	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
+	sfq_dep_head(q, n)->prev = x;
 }
 
+#define sfq_unlink(q, x, n, p)			\
+	n = q->slots[x].dep.next;		\
+	p = q->slots[x].dep.prev;		\
+	sfq_dep_head(q, p)->next = n;		\
+	sfq_dep_head(q, n)->prev = p
+
+
 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
 {
 	sfq_index p, n;
+	int d;
 
-	n = q->dep[x].next;
-	p = q->dep[x].prev;
-	q->dep[p].next = n;
-	q->dep[n].prev = p;
-
-	if (n == p && q->max_depth == q->qs[x].qlen + 1)
-		q->max_depth--;
+	sfq_unlink(q, x, n, p);
 
+	d = q->slots[x].qlen--;
+	if (n == p && q->cur_depth == d)
+		q->cur_depth--;
 	sfq_link(q, x);
 }
 
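Note: sfq_link()/sfq_unlink() maintain rings of slots keyed by queue depth: dep[d] anchors every slot currently holding d packets, which is what lets sfq_drop() find the longest slot in O(1) via cur_depth. A compact userspace sketch of the index convention and the link step (an assumed simplification: only qlen and the dep anchors are modeled, nothing else from the qdisc):

#include <assert.h>

#define SFQ_SLOTS 128
#define SFQ_DEPTH 128

typedef unsigned char sfq_index;

struct sfq_head { sfq_index next, prev; };
struct slot { struct sfq_head dep; unsigned char qlen; };

static struct slot slots[SFQ_SLOTS];
static struct sfq_head dep[SFQ_DEPTH];

/* mirror of sfq_dep_head(): small indexes name slots, big ones anchors */
static struct sfq_head *dep_head(sfq_index val)
{
	return val < SFQ_SLOTS ? &slots[val].dep : &dep[val - SFQ_SLOTS];
}

/* insert slot x on the ring for its current qlen, as sfq_link() does */
static void link_slot(sfq_index x)
{
	sfq_index p = slots[x].qlen + SFQ_SLOTS;
	sfq_index n = dep[slots[x].qlen].next;

	slots[x].dep.next = n;
	slots[x].dep.prev = p;
	dep_head(p)->next = x;
	dep_head(n)->prev = x;
}

int main(void)
{
	for (int i = 0; i < SFQ_DEPTH; i++)
		dep[i].next = dep[i].prev = i + SFQ_SLOTS;	/* empty rings */

	link_slot(0);	/* slot 0 holds 0 packets: lands on the dep[0] ring */
	assert(dep[0].next == 0 && slots[0].dep.prev == SFQ_SLOTS);
	return 0;
}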
@@ -233,34 +270,74 @@ static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
 	sfq_index p, n;
 	int d;
 
-	n = q->dep[x].next;
-	p = q->dep[x].prev;
-	q->dep[p].next = n;
-	q->dep[n].prev = p;
-	d = q->qs[x].qlen;
-	if (q->max_depth < d)
-		q->max_depth = d;
+	sfq_unlink(q, x, n, p);
 
+	d = ++q->slots[x].qlen;
+	if (q->cur_depth < d)
+		q->cur_depth = d;
 	sfq_link(q, x);
 }
 
+/* helper functions : might be changed when/if skb use a standard list_head */
+
+/* remove one skb from tail of slot queue */
+static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
+{
+	struct sk_buff *skb = slot->skblist_prev;
+
+	slot->skblist_prev = skb->prev;
+	skb->prev->next = (struct sk_buff *)slot;
+	skb->next = skb->prev = NULL;
+	return skb;
+}
+
+/* remove one skb from head of slot queue */
+static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
+{
+	struct sk_buff *skb = slot->skblist_next;
+
+	slot->skblist_next = skb->next;
+	skb->next->prev = (struct sk_buff *)slot;
+	skb->next = skb->prev = NULL;
+	return skb;
+}
+
+static inline void slot_queue_init(struct sfq_slot *slot)
+{
+	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
+}
+
+/* add skb to slot queue (tail add) */
+static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
+{
+	skb->prev = slot->skblist_prev;
+	skb->next = (struct sk_buff *)slot;
+	slot->skblist_prev->next = skb;
+	slot->skblist_prev = skb;
+}
+
+#define slot_queue_walk(slot, skb)		\
+	for (skb = slot->skblist_next;		\
+	     skb != (struct sk_buff *)slot;	\
+	     skb = skb->next)
+
 static unsigned int sfq_drop(struct Qdisc *sch)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
-	sfq_index d = q->max_depth;
+	sfq_index x, d = q->cur_depth;
 	struct sk_buff *skb;
 	unsigned int len;
+	struct sfq_slot *slot;
 
-	/* Queue is full! Find the longest slot and
-	   drop a packet from it */
-
+	/* Queue is full! Find the longest slot and drop tail packet from it */
 	if (d > 1) {
-		sfq_index x = q->dep[d + SFQ_DEPTH].next;
-		skb = q->qs[x].prev;
+		x = q->dep[d].next;
+		slot = &q->slots[x];
+drop:
+		skb = slot_dequeue_tail(slot);
 		len = qdisc_pkt_len(skb);
-		__skb_unlink(skb, &q->qs[x]);
-		kfree_skb(skb);
 		sfq_dec(q, x);
+		kfree_skb(skb);
 		sch->q.qlen--;
 		sch->qstats.drops++;
 		sch->qstats.backlog -= len;
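Note: each slot now keeps its skbs on a circular doubly linked list whose head is the slot itself cast to struct sk_buff *. This only works because sfq_slot begins with skblist_next/skblist_prev, mirroring the next/prev placement at the start of sk_buff. A userspace sketch of the same sentinel trick with a plain node type (illustrative only; it relies on the same matching-field-layout assumption the kernel code makes):

#include <assert.h>
#include <stddef.h>

struct node { struct node *next, *prev; int val; };
/* first two fields must line up with node's next/prev for the cast */
struct slot { struct node *skblist_next, *skblist_prev; };

static void slot_queue_init(struct slot *s)
{
	s->skblist_prev = s->skblist_next = (struct node *)s;
}

static void slot_queue_add(struct slot *s, struct node *n)	/* tail add */
{
	n->prev = s->skblist_prev;
	n->next = (struct node *)s;
	s->skblist_prev->next = n;
	s->skblist_prev = n;
}

static struct node *slot_dequeue_head(struct slot *s)
{
	struct node *n = s->skblist_next;

	s->skblist_next = n->next;
	n->next->prev = (struct node *)s;
	n->next = n->prev = NULL;
	return n;
}

int main(void)
{
	struct slot s;
	struct node a = { .val = 1 }, b = { .val = 2 };

	slot_queue_init(&s);
	slot_queue_add(&s, &a);
	slot_queue_add(&s, &b);
	assert(slot_dequeue_head(&s)->val == 1);	/* FIFO order */
	assert(slot_dequeue_head(&s)->val == 2);
	assert(s.skblist_next == (struct node *)&s);	/* empty again */
	return 0;
}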
@@ -269,19 +346,11 @@ static unsigned int sfq_drop(struct Qdisc *sch)
 
 	if (d == 1) {
 		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
-		d = q->next[q->tail];
-		q->next[q->tail] = q->next[d];
-		q->allot[q->next[d]] += q->quantum;
-		skb = q->qs[d].prev;
-		len = qdisc_pkt_len(skb);
-		__skb_unlink(skb, &q->qs[d]);
-		kfree_skb(skb);
-		sfq_dec(q, d);
-		sch->q.qlen--;
-		q->ht[q->hash[d]] = SFQ_DEPTH;
-		sch->qstats.drops++;
-		sch->qstats.backlog -= len;
-		return len;
+		x = q->tail->next;
+		slot = &q->slots[x];
+		q->tail->next = slot->next;
+		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+		goto drop;
 	}
 
 	return 0;
@@ -292,7 +361,8 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
 	unsigned int hash;
-	sfq_index x;
+	sfq_index x, qlen;
+	struct sfq_slot *slot;
 	int uninitialized_var(ret);
 
 	hash = sfq_classify(skb, sch, &ret);
@@ -305,54 +375,42 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		hash--;
 
 	x = q->ht[hash];
-	if (x == SFQ_DEPTH) {
-		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
-		q->hash[x] = hash;
+	slot = &q->slots[x];
+	if (x == SFQ_EMPTY_SLOT) {
+		x = q->dep[0].next; /* get a free slot */
+		q->ht[hash] = x;
+		slot = &q->slots[x];
+		slot->hash = hash;
 	}
 
-	/* If selected queue has length q->limit, this means that
-	 * all another queues are empty and that we do simple tail drop,
+	/* If selected queue has length q->limit, do simple tail drop,
 	 * i.e. drop _this_ packet.
 	 */
-	if (q->qs[x].qlen >= q->limit)
+	if (slot->qlen >= q->limit)
 		return qdisc_drop(skb, sch);
 
 	sch->qstats.backlog += qdisc_pkt_len(skb);
-	__skb_queue_tail(&q->qs[x], skb);
+	slot_queue_add(slot, skb);
 	sfq_inc(q, x);
-	if (q->qs[x].qlen == 1) {		/* The flow is new */
-		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
-			q->tail = x;
-			q->next[x] = x;
-			q->allot[x] = q->quantum;
+	if (slot->qlen == 1) {		/* The flow is new */
+		if (q->tail == NULL) {	/* It is the first flow */
+			slot->next = x;
 		} else {
-			q->next[x] = q->next[q->tail];
-			q->next[q->tail] = x;
-			q->tail = x;
+			slot->next = q->tail->next;
+			q->tail->next = x;
 		}
+		q->tail = slot;
+		slot->allot = q->scaled_quantum;
 	}
-	if (++sch->q.qlen <= q->limit) {
-		sch->bstats.bytes += qdisc_pkt_len(skb);
-		sch->bstats.packets++;
+	if (++sch->q.qlen <= q->limit)
 		return NET_XMIT_SUCCESS;
-	}
 
+	qlen = slot->qlen;
 	sfq_drop(sch);
-	return NET_XMIT_CN;
-}
-
-static struct sk_buff *
-sfq_peek(struct Qdisc *sch)
-{
-	struct sfq_sched_data *q = qdisc_priv(sch);
-	sfq_index a;
-
-	/* No active slots */
-	if (q->tail == SFQ_DEPTH)
-		return NULL;
-
-	a = q->next[q->tail];
-	return skb_peek(&q->qs[a]);
+	/* Return Congestion Notification only if we dropped a packet
+	 * from this flow.
+	 */
+	return (qlen != slot->qlen) ? NET_XMIT_CN : NET_XMIT_SUCCESS;
 }
 
 static struct sk_buff *
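Note: sfq_enqueue() previously returned NET_XMIT_CN unconditionally after sfq_drop(), but the drop targets the longest slot, which may belong to a different flow. The new return value signals congestion only to the flow that actually lost a packet. A toy illustration of the decision (standalone C; all values are made up):

#include <stdio.h>

int main(void)
{
	int qlen_longest = 5;	/* some other flow owns the deepest slot */
	int qlen_mine = 2;	/* the flow that just enqueued */

	int before = qlen_mine;
	qlen_longest--;		/* sfq_drop() trims the longest slot, not ours */

	printf("longest flow now %d deep -> %s\n", qlen_longest,
	       before != qlen_mine ? "NET_XMIT_CN" : "NET_XMIT_SUCCESS");
	return 0;
}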
@@ -360,34 +418,38 @@ sfq_dequeue(struct Qdisc *sch)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
-	sfq_index a, old_a;
+	sfq_index a, next_a;
+	struct sfq_slot *slot;
 
 	/* No active slots */
-	if (q->tail == SFQ_DEPTH)
+	if (q->tail == NULL)
 		return NULL;
 
-	a = old_a = q->next[q->tail];
-
-	/* Grab packet */
-	skb = __skb_dequeue(&q->qs[a]);
+next_slot:
+	a = q->tail->next;
+	slot = &q->slots[a];
+	if (slot->allot <= 0) {
+		q->tail = slot;
+		slot->allot += q->scaled_quantum;
+		goto next_slot;
+	}
+	skb = slot_dequeue_head(slot);
 	sfq_dec(q, a);
+	qdisc_bstats_update(sch, skb);
 	sch->q.qlen--;
 	sch->qstats.backlog -= qdisc_pkt_len(skb);
 
 	/* Is the slot empty? */
-	if (q->qs[a].qlen == 0) {
-		q->ht[q->hash[a]] = SFQ_DEPTH;
-		a = q->next[a];
-		if (a == old_a) {
-			q->tail = SFQ_DEPTH;
+	if (slot->qlen == 0) {
+		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+		next_a = slot->next;
+		if (a == next_a) {
+			q->tail = NULL; /* no more active slots */
 			return skb;
 		}
-		q->next[q->tail] = a;
-		q->allot[a] += q->quantum;
-	} else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
-		q->tail = a;
-		a = q->next[a];
-		q->allot[a] += q->quantum;
+		q->tail->next = next_a;
+	} else {
+		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
 	}
 	return skb;
 }
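Note: dequeue now charges each packet SFQ_ALLOT_SIZE(len) scaled units against the slot's allot and only advances the round-robin rotor once allot goes non-positive, refilling the slot with scaled_quantum for its next round. A standalone sketch of the accounting over a few packets from one slot (userspace C; packet sizes are invented):

#include <stdio.h>

#define SFQ_ALLOT_SHIFT 3
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define SFQ_ALLOT_SIZE(x) DIV_ROUND_UP((x), 1 << SFQ_ALLOT_SHIFT)

int main(void)
{
	short allot = SFQ_ALLOT_SIZE(1514);	/* slot->allot = scaled quantum */
	unsigned int pkts[] = { 600, 600, 600 };

	for (int i = 0; i < 3; i++) {
		if (allot <= 0) {			/* slot exhausted: */
			allot += SFQ_ALLOT_SIZE(1514);	/* refill, rotor moves on */
			printf("rotor advances\n");
		}
		allot -= SFQ_ALLOT_SIZE(pkts[i]);	/* charge this packet */
		printf("sent %u bytes, allot now %d units\n", pkts[i], allot);
	}
	return 0;
}

Because the charge happens after the send, allot may dip slightly negative; the deficit carries into the slot's next round, which is standard deficit round robin behavior.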
@@ -421,12 +483,18 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
 		return -EINVAL;
 
+	if (ctl->divisor &&
+	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
+		return -EINVAL;
+
 	sch_tree_lock(sch);
 	q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
+	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 	q->perturb_period = ctl->perturb_period * HZ;
 	if (ctl->limit)
 		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);
-
+	if (ctl->divisor)
+		q->divisor = ctl->divisor;
 	qlen = sch->q.qlen;
 	while (sch->q.qlen > q->limit)
 		sfq_drop(sch);
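Note: because sfq_fold_hash() masks with divisor - 1, a user-supplied divisor must be a power of two, and it is capped at 65536 now that the table is allocated dynamically. A small sketch of the same check (is_power_of_2() below is a local stand-in for the kernel helper of the same name):

#include <stdbool.h>
#include <stdio.h>

static bool is_power_of_2(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

static int check_divisor(unsigned int divisor)
{
	if (divisor && (!is_power_of_2(divisor) || divisor > 65536))
		return -1;	/* -EINVAL in the kernel */
	return 0;	/* 0 means "keep the current divisor" */
}

int main(void)
{
	printf("%d %d %d\n",
	       check_divisor(1024),	/*  0: ok */
	       check_divisor(1000),	/* -1: not a power of two */
	       check_divisor(131072));	/* -1: too large */
	return 0;
}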
@@ -444,26 +512,25 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
+	size_t sz;
 	int i;
 
 	q->perturb_timer.function = sfq_perturbation;
 	q->perturb_timer.data = (unsigned long)sch;
 	init_timer_deferrable(&q->perturb_timer);
 
-	for (i = 0; i < SFQ_HASH_DIVISOR; i++)
-		q->ht[i] = SFQ_DEPTH;
-
 	for (i = 0; i < SFQ_DEPTH; i++) {
-		skb_queue_head_init(&q->qs[i]);
-		q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
-		q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
+		q->dep[i].next = i + SFQ_SLOTS;
+		q->dep[i].prev = i + SFQ_SLOTS;
 	}
 
 	q->limit = SFQ_DEPTH - 1;
-	q->max_depth = 0;
-	q->tail = SFQ_DEPTH;
+	q->cur_depth = 0;
+	q->tail = NULL;
+	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
 	if (opt == NULL) {
 		q->quantum = psched_mtu(qdisc_dev(sch));
+		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 		q->perturb_period = 0;
 		q->perturbation = net_random();
 	} else {
@@ -472,8 +539,23 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 			return err;
 	}
 
-	for (i = 0; i < SFQ_DEPTH; i++)
+	sz = sizeof(q->ht[0]) * q->divisor;
+	q->ht = kmalloc(sz, GFP_KERNEL);
+	if (!q->ht && sz > PAGE_SIZE)
+		q->ht = vmalloc(sz);
+	if (!q->ht)
+		return -ENOMEM;
+	for (i = 0; i < q->divisor; i++)
+		q->ht[i] = SFQ_EMPTY_SLOT;
+
+	for (i = 0; i < SFQ_SLOTS; i++) {
+		slot_queue_init(&q->slots[i]);
 		sfq_link(q, i);
+	}
+	if (q->limit >= 1)
+		sch->flags |= TCQ_F_CAN_BYPASS;
+	else
+		sch->flags &= ~TCQ_F_CAN_BYPASS;
 	return 0;
 }
 
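Note: with the divisor configurable, the hash table no longer fits in the qdisc's private area. sfq_init() tries kmalloc() first and falls back to vmalloc() for tables larger than a page, and sfq_destroy() later picks the matching free via is_vmalloc_addr(). A userspace simulation of the pattern (malloc stands in for both allocators; the one-page cutoff is the kernel's, the kmalloc failure past a page is simulated):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096

static void *kmalloc_sim(size_t sz) { return sz <= PAGE_SIZE ? malloc(sz) : NULL; }
static void *vmalloc_sim(size_t sz) { return malloc(sz); }

int main(void)
{
	size_t divisor = 65536;			/* worst case: 64K buckets */
	size_t sz = divisor * sizeof(unsigned char);
	bool vmalloced = false;

	void *ht = kmalloc_sim(sz);
	if (!ht && sz > PAGE_SIZE) {		/* small-object alloc failed */
		ht = vmalloc_sim(sz);
		vmalloced = true;
	}
	if (!ht)
		return 1;			/* -ENOMEM in the kernel */

	printf("allocated %zu bytes via %s\n", sz,
	       vmalloced ? "vmalloc" : "kmalloc");
	/* teardown mirrors sfq_destroy(): pick vfree() or kfree() to match */
	free(ht);
	return 0;
}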
@@ -484,6 +566,10 @@ static void sfq_destroy(struct Qdisc *sch)
 	tcf_destroy_chain(&q->filter_list);
 	q->perturb_period = 0;
 	del_timer_sync(&q->perturb_timer);
+	if (is_vmalloc_addr(q->ht))
+		vfree(q->ht);
+	else
+		kfree(q->ht);
 }
 
 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -496,7 +582,7 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	opt.perturb_period = q->perturb_period / HZ;
 
 	opt.limit = q->limit;
-	opt.divisor = SFQ_HASH_DIVISOR;
+	opt.divisor = q->divisor;
 	opt.flows = q->limit;
 
 	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
@@ -521,6 +607,8 @@ static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
 			      u32 classid)
 {
+	/* we cannot bypass queue discipline anymore */
+	sch->flags &= ~TCQ_F_CAN_BYPASS;
 	return 0;
 }
 
@@ -548,10 +636,19 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 				struct gnet_dump *d)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
-	sfq_index idx = q->ht[cl-1];
-	struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
-	struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };
+	sfq_index idx = q->ht[cl - 1];
+	struct gnet_stats_queue qs = { 0 };
+	struct tc_sfq_xstats xstats = { 0 };
+	struct sk_buff *skb;
 
+	if (idx != SFQ_EMPTY_SLOT) {
+		const struct sfq_slot *slot = &q->slots[idx];
+
+		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
+		qs.qlen = slot->qlen;
+		slot_queue_walk(slot, skb)
+			qs.backlog += qdisc_pkt_len(skb);
+	}
 	if (gnet_stats_copy_queue(d, &qs) < 0)
 		return -1;
 	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
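Note: per-class stats are now computed on demand: allot is rescaled back to bytes with << SFQ_ALLOT_SHIFT, and the byte backlog is rebuilt by walking the slot's skb list, since neither is kept as a running byte counter anymore. A trivial sketch of both computations (standalone C; the numbers are illustrative):

#include <stdio.h>

#define SFQ_ALLOT_SHIFT 3

int main(void)
{
	short scaled_allot = 150;	/* slot->allot, in 8-byte units */
	unsigned int pkt_lens[] = { 1514, 52, 590 };
	unsigned int backlog = 0;

	for (int i = 0; i < 3; i++)	/* stand-in for slot_queue_walk() */
		backlog += pkt_lens[i];

	printf("allot=%d bytes, backlog=%u bytes\n",
	       scaled_allot << SFQ_ALLOT_SHIFT, backlog);
	return 0;
}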
@@ -565,8 +662,8 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 	if (arg->stop)
 		return;
 
-	for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
-		if (q->ht[i] == SFQ_DEPTH ||
+	for (i = 0; i < q->divisor; i++) {
+		if (q->ht[i] == SFQ_EMPTY_SLOT ||
 		    arg->count < arg->skip) {
 			arg->count++;
 			continue;
@@ -597,7 +694,7 @@ static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
 	.priv_size	=	sizeof(struct sfq_sched_data),
 	.enqueue	=	sfq_enqueue,
 	.dequeue	=	sfq_dequeue,
-	.peek		=	sfq_peek,
+	.peek		=	qdisc_peek_dequeued,
 	.drop		=	sfq_drop,
 	.init		=	sfq_init,
 	.reset		=	sfq_reset,