author		Eric Dumazet <edumazet@google.com>	2013-08-29 18:49:55 -0400
committer	David S. Miller <davem@davemloft.net>	2013-08-29 21:38:31 -0400
commit		afe4fd062416b158a8a8538b23adc1930a9b88dc (patch)
tree		feceeabeb0454af51c315b7d206a931800ebf97c
parent		7ec06da81d2b98859b558d8d551a0c4e3d9516a3 (diff)
pkt_sched: fq: Fair Queue packet scheduler
- Uses perfect flow match (not stochastic hash like SFQ/FQ_codel)
- Uses the new_flow/old_flow separation from FQ_codel
- New flows get an initial credit allowing IW10 without added delay.
- Special FIFO queue for high prio packets (no need for PRIO + FQ)
- Uses a hash table of RB trees to locate the flows at enqueue() time
- Smart on demand gc (at enqueue() time, RB tree lookup evicts old
  unused flows)
- Dynamic memory allocations.
- Designed to allow millions of concurrent flows per Qdisc.
- Small memory footprint : ~8K per Qdisc, and 104 bytes per flow.
- Single high resolution timer for throttled flows (if any).
- One RB tree to link throttled flows.
- Ability to have a max rate per flow. We might add a socket option
  to add a per socket limitation.

Attempts have been made to add TCP pacing into the TCP stack itself,
but this seems to add complex code to an already complex stack.

TCP pacing is welcome for flows having idle times, as the cwnd permits
the TCP stack to queue a possibly large number of packets.

This removes the 'slow start after idle' choice, which hits large BDP
flows badly, as well as applications delivering chunks of data such as
video streams.

Nicely spaced packets :
Here the interface is 10Gbit, but the flow bottleneck is ~20Mbit

cwin is big, yet FQ avoids the typical bursts generated by TCP
(as in netperf TCP_RR -- -r 100000,100000)

15:01:23.545279 IP A > B: . 78193:81089(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.545394 IP B > A: . ack 81089 win 3668 <nop,nop,timestamp 11597985 1115>
15:01:23.546488 IP A > B: . 81089:83985(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.546565 IP B > A: . ack 83985 win 3668 <nop,nop,timestamp 11597986 1115>
15:01:23.547713 IP A > B: . 83985:86881(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.547778 IP B > A: . ack 86881 win 3668 <nop,nop,timestamp 11597987 1115>
15:01:23.548911 IP A > B: . 86881:89777(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.548949 IP B > A: . ack 89777 win 3668 <nop,nop,timestamp 11597988 1115>
15:01:23.550116 IP A > B: . 89777:92673(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.550182 IP B > A: . ack 92673 win 3668 <nop,nop,timestamp 11597989 1115>
15:01:23.551333 IP A > B: . 92673:95569(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.551406 IP B > A: . ack 95569 win 3668 <nop,nop,timestamp 11597991 1115>
15:01:23.552539 IP A > B: . 95569:98465(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.552576 IP B > A: . ack 98465 win 3668 <nop,nop,timestamp 11597992 1115>
15:01:23.553756 IP A > B: . 98465:99913(1448) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554138 IP A > B: P 99913:100001(88) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554204 IP B > A: . ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.554234 IP B > A: . 65248:68144(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.555620 IP B > A: . 68144:71040(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.557005 IP B > A: . 71040:73936(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.558390 IP B > A: . 73936:76832(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.559773 IP B > A: . 76832:79728(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.561158 IP B > A: . 79728:82624(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.562543 IP B > A: . 82624:85520(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.563928 IP B > A: . 85520:88416(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.565313 IP B > A: . 88416:91312(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.566698 IP B > A: . 91312:94208(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.568083 IP B > A: . 94208:97104(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.569467 IP B > A: . 97104:100000(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.570852 IP B > A: . 100000:102896(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.572237 IP B > A: . 102896:105792(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.573639 IP B > A: . 105792:108688(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.575024 IP B > A: . 108688:111584(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.576408 IP B > A: . 111584:114480(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.577793 IP B > A: . 114480:117376(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>

TCP timestamps show that most packets from B were queued in the same ms
timeframe (TSval 1159799{3,4}), but FQ managed to send them right in
time to avoid a big burst.

In slow start or steady state, very few packets are throttled [1]

FQ gets a bunch of tunables :

  limit : max number of packets on whole Qdisc (default 10000)

  flow_limit : max number of packets per flow (default 100)

  quantum : the credit per RR round (default is 2 MTU)

  initial_quantum : initial credit for new flows (default is 10 MTU)

  maxrate : max per flow rate (default : unlimited)

  buckets : number of RB trees (default : 1024) in hash table.
            (consumes 8 bytes per bucket)

  [no]pacing : disable/enable pacing (default is enable)

All of them can be changed on a live qdisc.

$ tc qd add dev eth0 root fq help
Usage: ... fq [ limit PACKETS ] [ flow_limit PACKETS ]
              [ quantum BYTES ] [ initial_quantum BYTES ]
              [ maxrate RATE ] [ buckets NUMBER ]
              [ [no]pacing ]

$ tc -s -d qd
qdisc fq 8002: dev eth0 root refcnt 32 limit 10000p flow_limit 100p
 buckets 256 quantum 3028 initial_quantum 15140
 Sent 216532416 bytes 148395 pkt (dropped 0, overlimits 0 requeues 14)
 backlog 0b 0p requeues 14
  511 flows, 511 inactive, 0 throttled
  110 gc, 0 highprio, 0 retrans, 1143 throttled, 0 flows_plimit

[1] Except if the initial srtt is overestimated, as when using a
cached srtt from tcp metrics. We'll provide a fix for this issue.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
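Editor's note: to make the spacing in the trace concrete, fq_dequeue()
(later in this patch) computes the gap between two packets of a flow as
packet length * NSEC_PER_SEC / rate, clamped to 125 ms. Below is a
minimal standalone sketch of that arithmetic; the helper name, the
~20 Mbit/s rate and the rough 52-byte-per-packet header estimate are
illustrative assumptions, not values taken from the patch.

/*
 * Minimal sketch (not part of the patch) of the inter-packet gap
 * computation: delay = length * NSEC_PER_SEC / rate, clamped to
 * 125 ms as in fq_dequeue(). Rate and helper name are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC	1000000000ULL
#define NSEC_PER_MSEC	1000000ULL

static uint64_t fq_pacing_gap_ns(uint32_t pkt_len, uint32_t rate_Bps)
{
	uint64_t len = (uint64_t)pkt_len * NSEC_PER_SEC;

	len /= rate_Bps;		/* nanoseconds needed at this rate */
	if (len > 125 * NSEC_PER_MSEC)	/* same clamp as the patch */
		len = 125 * NSEC_PER_MSEC;
	return len;
}

int main(void)
{
	/* ~20 Mbit/s bottleneck -> 2,500,000 bytes per second */
	uint32_t rate = 20 * 1000 * 1000 / 8;

	/* 2896 bytes of payload (two MSS) plus rough header overhead */
	printf("gap = %llu ns\n",
	       (unsigned long long)fq_pacing_gap_ns(2896 + 2 * 52, rate));
	return 0;
}

At ~20 Mbit/s this yields roughly 1.2 ms per two-MSS segment, which is
in the same ballpark as the ~1.2-1.4 ms spacing visible in the trace.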
-rw-r--r--	include/uapi/linux/pkt_sched.h	41
-rw-r--r--	net/sched/Kconfig	14
-rw-r--r--	net/sched/Makefile	1
-rw-r--r--	net/sched/sch_fq.c	792
4 files changed, 848 insertions, 0 deletions
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 09d62b9228ff..9b829134d422 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -744,4 +744,45 @@ struct tc_fq_codel_xstats {
 	};
 };
 
+/* FQ */
+
+enum {
+	TCA_FQ_UNSPEC,
+
+	TCA_FQ_PLIMIT,		/* limit of total number of packets in queue */
+
+	TCA_FQ_FLOW_PLIMIT,	/* limit of packets per flow */
+
+	TCA_FQ_QUANTUM,		/* RR quantum */
+
+	TCA_FQ_INITIAL_QUANTUM,	/* RR quantum for new flow */
+
+	TCA_FQ_RATE_ENABLE,	/* enable/disable rate limiting */
+
+	TCA_FQ_FLOW_DEFAULT_RATE,/* for sockets with unspecified sk_rate,
+				  * use the following rate
+				  */
+
+	TCA_FQ_FLOW_MAX_RATE,	/* per flow max rate */
+
+	TCA_FQ_BUCKETS_LOG,	/* log2(number of buckets) */
+	__TCA_FQ_MAX
+};
+
+#define TCA_FQ_MAX	(__TCA_FQ_MAX - 1)
+
+struct tc_fq_qd_stats {
+	__u64	gc_flows;
+	__u64	highprio_packets;
+	__u64	tcp_retrans;
+	__u64	throttled;
+	__u64	flows_plimit;
+	__u64	pkts_too_long;
+	__u64	allocation_errors;
+	__s64	time_next_delayed_flow;
+	__u32	flows;
+	__u32	inactive_flows;
+	__u32	throttled_flows;
+	__u32	pad;
+};
 #endif
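Editor's note: struct tc_fq_qd_stats above is the application-specific
stats blob the qdisc exports via gnet_stats_copy_app() (see
fq_dump_stats() later in this patch); it is roughly what sits behind the
"511 flows, 511 inactive, 0 throttled" and "110 gc, 0 highprio, ..."
lines of the tc -s output quoted in the changelog. The sketch below is a
hedged illustration of how a userspace reader could interpret that blob;
the helper name is mine, fetching the blob over netlink is out of scope,
and the struct is redeclared locally only to keep the example
self-contained.

/*
 * Hedged sketch: interpret the raw FQ xstats blob, assuming "buf"
 * holds at least sizeof(struct tc_fq_qd_stats) bytes copied verbatim
 * from the kernel.
 */
#include <stdio.h>
#include <string.h>
#include <linux/types.h>

struct tc_fq_qd_stats {		/* mirrors the uapi struct added above */
	__u64	gc_flows;
	__u64	highprio_packets;
	__u64	tcp_retrans;
	__u64	throttled;
	__u64	flows_plimit;
	__u64	pkts_too_long;
	__u64	allocation_errors;
	__s64	time_next_delayed_flow;
	__u32	flows;
	__u32	inactive_flows;
	__u32	throttled_flows;
	__u32	pad;
};

static void print_fq_xstats(const void *buf, size_t len)
{
	struct tc_fq_qd_stats st;

	if (len < sizeof(st))
		return;			/* truncated, or not an FQ qdisc */
	memcpy(&st, buf, sizeof(st));
	printf("  %u flows, %u inactive, %u throttled\n",
	       st.flows, st.inactive_flows, st.throttled_flows);
	printf("  %llu gc, %llu highprio, %llu retrans, %llu throttled, %llu flows_plimit\n",
	       (unsigned long long)st.gc_flows,
	       (unsigned long long)st.highprio_packets,
	       (unsigned long long)st.tcp_retrans,
	       (unsigned long long)st.throttled,
	       (unsigned long long)st.flows_plimit);
}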
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 235e01acac51..c03a32a0418e 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -272,6 +272,20 @@ config NET_SCH_FQ_CODEL
 
 	  If unsure, say N.
 
+config NET_SCH_FQ
+	tristate "Fair Queue"
+	help
+	  Say Y here if you want to use the FQ packet scheduling algorithm.
+
+	  FQ does flow separation, and is able to respect pacing requirements
+	  set by the TCP stack into sk->sk_pacing_rate (for locally generated
+	  traffic).
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_fq.
+
+	  If unsure, say N.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
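Editor's note: the help text above (and the header comment of sch_fq.c
below) say FQ paces flows according to sk->sk_pacing_rate, which the
transport is expected to fill in; this patch does not itself set that
field. As a rough illustration only, here is one way a transport could
derive such a rate from its cwnd and smoothed RTT. The helper name and
the factor of 2 are assumptions, not taken from this patch.

/*
 * Hedged sketch (not from this patch): a pacing rate of roughly
 * "one cwnd of data per smoothed RTT", scaled by 2 so pacing does
 * not itself become the bottleneck while cwnd is still growing.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t pacing_rate_bytes_per_sec(uint32_t cwnd_pkts,
					  uint32_t mss_bytes,
					  uint32_t srtt_us)
{
	uint64_t rate;

	if (!srtt_us)
		return 0;		/* no RTT sample yet: leave unpaced */

	/* bytes per RTT, scaled by 2, converted to bytes per second */
	rate = (uint64_t)cwnd_pkts * mss_bytes * 2 * 1000000ULL;
	rate /= srtt_us;
	return rate > UINT32_MAX ? UINT32_MAX : (uint32_t)rate;
}

int main(void)
{
	/* e.g. cwnd = 40 segments of 1448 bytes, srtt = 50 ms */
	printf("%u bytes/sec\n", pacing_rate_bytes_per_sec(40, 1448, 50000));
	return 0;
}

Whatever value the transport publishes, FQ only uses it to space out
the packets of that flow, and never lets a flow exceed flow_max_rate.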
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 978cbf004e80..e5f9abe9a5db 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
 obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o
 obj-$(CONFIG_NET_SCH_FQ_CODEL)	+= sch_fq_codel.o
+obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
new file mode 100644
index 000000000000..91ceca7bcb52
--- /dev/null
+++ b/net/sched/sch_fq.c
@@ -0,0 +1,792 @@
1/*
2 * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
3 *
4 * Copyright (C) 2013 Eric Dumazet <edumazet@google.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Meant to be mostly used for locally generated traffic :
12 * Fast classification depends on skb->sk being set before reaching us.
13 * If not (router workload), we use rxhash as fallback, with a 32-bit wide hash.
14 * All packets belonging to a socket are considered as a 'flow'.
15 *
16 * Flows are dynamically allocated and stored in a hash table of RB trees
17 * They are also part of one Round Robin 'queues' (new or old flows)
18 *
19 * Burst avoidance (aka pacing) capability :
20 *
21 * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
22 * bunch of packets, and this packet scheduler adds delay between
23 * packets to respect rate limitation.
24 *
25 * enqueue() :
26 * - lookup one RB tree (out of 1024 or more) to find the flow.
27 * If non existent flow, create it, add it to the tree.
28 * Add skb to the per flow list of skb (fifo).
29 * - Use a special fifo for high prio packets
30 *
31 * dequeue() : serves flows in Round Robin
32 * Note : When a flow becomes empty, we do not immediately remove it from
33 * rb trees, for performance reasons (it's expected to send additional packets,
34 * or SLAB cache will reuse socket for another flow)
35 */
36
37#include <linux/module.h>
38#include <linux/types.h>
39#include <linux/kernel.h>
40#include <linux/jiffies.h>
41#include <linux/string.h>
42#include <linux/in.h>
43#include <linux/errno.h>
44#include <linux/init.h>
45#include <linux/skbuff.h>
46#include <linux/slab.h>
47#include <linux/rbtree.h>
48#include <linux/hash.h>
49#include <net/netlink.h>
50#include <net/pkt_sched.h>
51#include <net/sock.h>
52#include <net/tcp_states.h>
53
54/*
55 * Per flow structure, dynamically allocated
56 */
57struct fq_flow {
58 struct sk_buff *head; /* list of skbs for this flow : first skb */
59 union {
60 struct sk_buff *tail; /* last skb in the list */
61 unsigned long age; /* jiffies when flow was emptied, for gc */
62 };
63 struct rb_node fq_node; /* anchor in fq_root[] trees */
64 struct sock *sk;
65 int qlen; /* number of packets in flow queue */
66 int credit;
67 u32 socket_hash; /* sk_hash */
68 struct fq_flow *next; /* next pointer in RR lists, or &detached */
69
70 struct rb_node rate_node; /* anchor in q->delayed tree */
71 u64 time_next_packet;
72};
73
74struct fq_flow_head {
75 struct fq_flow *first;
76 struct fq_flow *last;
77};
78
79struct fq_sched_data {
80 struct fq_flow_head new_flows;
81
82 struct fq_flow_head old_flows;
83
84 struct rb_root delayed; /* for rate limited flows */
85 u64 time_next_delayed_flow;
86
87 struct fq_flow internal; /* for non classified or high prio packets */
88 u32 quantum;
89 u32 initial_quantum;
90 u32 flow_default_rate;/* rate per flow : bytes per second */
91 u32 flow_max_rate; /* optional max rate per flow */
92 u32 flow_plimit; /* max packets per flow */
93 struct rb_root *fq_root;
94 u8 rate_enable;
95 u8 fq_trees_log;
96
97 u32 flows;
98 u32 inactive_flows;
99 u32 throttled_flows;
100
101 u64 stat_gc_flows;
102 u64 stat_internal_packets;
103 u64 stat_tcp_retrans;
104 u64 stat_throttled;
105 u64 stat_flows_plimit;
106 u64 stat_pkts_too_long;
107 u64 stat_allocation_errors;
108 struct qdisc_watchdog watchdog;
109};
110
111/* special value to mark a detached flow (not on old/new list) */
112static struct fq_flow detached, throttled;
113
114static void fq_flow_set_detached(struct fq_flow *f)
115{
116 f->next = &detached;
117}
118
119static bool fq_flow_is_detached(const struct fq_flow *f)
120{
121 return f->next == &detached;
122}
123
124static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
125{
126 struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
127
128 while (*p) {
129 struct fq_flow *aux;
130
131 parent = *p;
132 aux = container_of(parent, struct fq_flow, rate_node);
133 if (f->time_next_packet >= aux->time_next_packet)
134 p = &parent->rb_right;
135 else
136 p = &parent->rb_left;
137 }
138 rb_link_node(&f->rate_node, parent, p);
139 rb_insert_color(&f->rate_node, &q->delayed);
140 q->throttled_flows++;
141 q->stat_throttled++;
142
143 f->next = &throttled;
144 if (q->time_next_delayed_flow > f->time_next_packet)
145 q->time_next_delayed_flow = f->time_next_packet;
146}
147
148
149static struct kmem_cache *fq_flow_cachep __read_mostly;
150
151static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
152{
153 if (head->first)
154 head->last->next = flow;
155 else
156 head->first = flow;
157 head->last = flow;
158 flow->next = NULL;
159}
160
161/* limit number of collected flows per round */
162#define FQ_GC_MAX 8
163#define FQ_GC_AGE (3*HZ)
164
165static bool fq_gc_candidate(const struct fq_flow *f)
166{
167 return fq_flow_is_detached(f) &&
168 time_after(jiffies, f->age + FQ_GC_AGE);
169}
170
171static void fq_gc(struct fq_sched_data *q,
172 struct rb_root *root,
173 struct sock *sk)
174{
175 struct fq_flow *f, *tofree[FQ_GC_MAX];
176 struct rb_node **p, *parent;
177 int fcnt = 0;
178
179 p = &root->rb_node;
180 parent = NULL;
181 while (*p) {
182 parent = *p;
183
184 f = container_of(parent, struct fq_flow, fq_node);
185 if (f->sk == sk)
186 break;
187
188 if (fq_gc_candidate(f)) {
189 tofree[fcnt++] = f;
190 if (fcnt == FQ_GC_MAX)
191 break;
192 }
193
194 if (f->sk > sk)
195 p = &parent->rb_right;
196 else
197 p = &parent->rb_left;
198 }
199
200 q->flows -= fcnt;
201 q->inactive_flows -= fcnt;
202 q->stat_gc_flows += fcnt;
203 while (fcnt) {
204 struct fq_flow *f = tofree[--fcnt];
205
206 rb_erase(&f->fq_node, root);
207 kmem_cache_free(fq_flow_cachep, f);
208 }
209}
210
211static const u8 prio2band[TC_PRIO_MAX + 1] = {
212 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
213};
214
215static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
216{
217 struct rb_node **p, *parent;
218 struct sock *sk = skb->sk;
219 struct rb_root *root;
220 struct fq_flow *f;
221 int band;
222
223 /* warning: no starvation prevention... */
224 band = prio2band[skb->priority & TC_PRIO_MAX];
225 if (unlikely(band == 0))
226 return &q->internal;
227
228 if (unlikely(!sk)) {
229 /* By forcing low order bit to 1, we make sure to not
230 * collide with a local flow (socket pointers are word aligned)
231 */
232 sk = (struct sock *)(skb_get_rxhash(skb) | 1L);
233 }
234
235 root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
236
237 if (q->flows >= (2U << q->fq_trees_log) &&
238 q->inactive_flows > q->flows/2)
239 fq_gc(q, root, sk);
240
241 p = &root->rb_node;
242 parent = NULL;
243 while (*p) {
244 parent = *p;
245
246 f = container_of(parent, struct fq_flow, fq_node);
247 if (f->sk == sk) {
248 /* socket might have been reallocated, so check
249 * if its sk_hash is the same.
250 * If not, we need to refill credit with
251 * initial quantum
252 */
253 if (unlikely(skb->sk &&
254 f->socket_hash != sk->sk_hash)) {
255 f->credit = q->initial_quantum;
256 f->socket_hash = sk->sk_hash;
257 }
258 return f;
259 }
260 if (f->sk > sk)
261 p = &parent->rb_right;
262 else
263 p = &parent->rb_left;
264 }
265
266 f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
267 if (unlikely(!f)) {
268 q->stat_allocation_errors++;
269 return &q->internal;
270 }
271 fq_flow_set_detached(f);
272 f->sk = sk;
273 if (skb->sk)
274 f->socket_hash = sk->sk_hash;
275 f->credit = q->initial_quantum;
276
277 rb_link_node(&f->fq_node, parent, p);
278 rb_insert_color(&f->fq_node, root);
279
280 q->flows++;
281 q->inactive_flows++;
282 return f;
283}
284
285
286/* remove one skb from head of flow queue */
287static struct sk_buff *fq_dequeue_head(struct fq_flow *flow)
288{
289 struct sk_buff *skb = flow->head;
290
291 if (skb) {
292 flow->head = skb->next;
293 skb->next = NULL;
294 flow->qlen--;
295 }
296 return skb;
297}
298
299/* We might add in the future detection of retransmits
300 * For the time being, just return false
301 */
302static bool skb_is_retransmit(struct sk_buff *skb)
303{
304 return false;
305}
306
307/* add skb to flow queue
308 * flow queue is a linked list, kind of FIFO, except for TCP retransmits
309 * We special case tcp retransmits to be transmitted before other packets.
310 * We rely on the fact that TCP retransmits are unlikely, so we do not waste
311 * a separate queue or a pointer.
312 * head-> [retrans pkt 1]
313 * [retrans pkt 2]
314 * [ normal pkt 1]
315 * [ normal pkt 2]
316 * [ normal pkt 3]
317 * tail-> [ normal pkt 4]
318 */
319static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
320{
321 struct sk_buff *prev, *head = flow->head;
322
323 skb->next = NULL;
324 if (!head) {
325 flow->head = skb;
326 flow->tail = skb;
327 return;
328 }
329 if (likely(!skb_is_retransmit(skb))) {
330 flow->tail->next = skb;
331 flow->tail = skb;
332 return;
333 }
334
335 /* This skb is a tcp retransmit,
336 * find the last retrans packet in the queue
337 */
338 prev = NULL;
339 while (skb_is_retransmit(head)) {
340 prev = head;
341 head = head->next;
342 if (!head)
343 break;
344 }
345 if (!prev) { /* no rtx packet in queue, become the new head */
346 skb->next = flow->head;
347 flow->head = skb;
348 } else {
349 if (prev == flow->tail)
350 flow->tail = skb;
351 else
352 skb->next = prev->next;
353 prev->next = skb;
354 }
355}
356
357static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
358{
359 struct fq_sched_data *q = qdisc_priv(sch);
360 struct fq_flow *f;
361
362 if (unlikely(sch->q.qlen >= sch->limit))
363 return qdisc_drop(skb, sch);
364
365 f = fq_classify(skb, q);
366 if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
367 q->stat_flows_plimit++;
368 return qdisc_drop(skb, sch);
369 }
370
371 f->qlen++;
372 flow_queue_add(f, skb);
373 if (skb_is_retransmit(skb))
374 q->stat_tcp_retrans++;
375 sch->qstats.backlog += qdisc_pkt_len(skb);
376 if (fq_flow_is_detached(f)) {
377 fq_flow_add_tail(&q->new_flows, f);
378 if (q->quantum > f->credit)
379 f->credit = q->quantum;
380 q->inactive_flows--;
381 qdisc_unthrottled(sch);
382 }
383 if (unlikely(f == &q->internal)) {
384 q->stat_internal_packets++;
385 qdisc_unthrottled(sch);
386 }
387 sch->q.qlen++;
388
389 return NET_XMIT_SUCCESS;
390}
391
392static void fq_check_throttled(struct fq_sched_data *q, u64 now)
393{
394 struct rb_node *p;
395
396 if (q->time_next_delayed_flow > now)
397 return;
398
399 q->time_next_delayed_flow = ~0ULL;
400 while ((p = rb_first(&q->delayed)) != NULL) {
401 struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
402
403 if (f->time_next_packet > now) {
404 q->time_next_delayed_flow = f->time_next_packet;
405 break;
406 }
407 rb_erase(p, &q->delayed);
408 q->throttled_flows--;
409 fq_flow_add_tail(&q->old_flows, f);
410 }
411}
412
413static struct sk_buff *fq_dequeue(struct Qdisc *sch)
414{
415 struct fq_sched_data *q = qdisc_priv(sch);
416 u64 now = ktime_to_ns(ktime_get());
417 struct fq_flow_head *head;
418 struct sk_buff *skb;
419 struct fq_flow *f;
420
421 skb = fq_dequeue_head(&q->internal);
422 if (skb)
423 goto out;
424 fq_check_throttled(q, now);
425begin:
426 head = &q->new_flows;
427 if (!head->first) {
428 head = &q->old_flows;
429 if (!head->first) {
430 if (q->time_next_delayed_flow != ~0ULL)
431 qdisc_watchdog_schedule_ns(&q->watchdog,
432 q->time_next_delayed_flow);
433 return NULL;
434 }
435 }
436 f = head->first;
437
438 if (f->credit <= 0) {
439 f->credit += q->quantum;
440 head->first = f->next;
441 fq_flow_add_tail(&q->old_flows, f);
442 goto begin;
443 }
444
445 if (unlikely(f->head && now < f->time_next_packet)) {
446 head->first = f->next;
447 fq_flow_set_throttled(q, f);
448 goto begin;
449 }
450
451 skb = fq_dequeue_head(f);
452 if (!skb) {
453 head->first = f->next;
454 /* force a pass through old_flows to prevent starvation */
455 if ((head == &q->new_flows) && q->old_flows.first) {
456 fq_flow_add_tail(&q->old_flows, f);
457 } else {
458 fq_flow_set_detached(f);
459 f->age = jiffies;
460 q->inactive_flows++;
461 }
462 goto begin;
463 }
464 f->time_next_packet = now;
465 f->credit -= qdisc_pkt_len(skb);
466
467 if (f->credit <= 0 &&
468 q->rate_enable &&
469 skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) {
470 u32 rate = skb->sk->sk_pacing_rate ?: q->flow_default_rate;
471
472 rate = min(rate, q->flow_max_rate);
473 if (rate) {
474 u64 len = (u64)qdisc_pkt_len(skb) * NSEC_PER_SEC;
475
476 do_div(len, rate);
477 /* Since socket rate can change later,
478 * clamp the delay to 125 ms.
479 * TODO: maybe segment the too big skb, as in commit
480 * e43ac79a4bc ("sch_tbf: segment too big GSO packets")
481 */
482 if (unlikely(len > 125 * NSEC_PER_MSEC)) {
483 len = 125 * NSEC_PER_MSEC;
484 q->stat_pkts_too_long++;
485 }
486
487 f->time_next_packet = now + len;
488 }
489 }
490out:
491 prefetch(&skb->end);
492 sch->qstats.backlog -= qdisc_pkt_len(skb);
493 qdisc_bstats_update(sch, skb);
494 sch->q.qlen--;
495 qdisc_unthrottled(sch);
496 return skb;
497}
498
499static void fq_reset(struct Qdisc *sch)
500{
501 struct sk_buff *skb;
502
503 while ((skb = fq_dequeue(sch)) != NULL)
504 kfree_skb(skb);
505}
506
507static void fq_rehash(struct fq_sched_data *q,
508 struct rb_root *old_array, u32 old_log,
509 struct rb_root *new_array, u32 new_log)
510{
511 struct rb_node *op, **np, *parent;
512 struct rb_root *oroot, *nroot;
513 struct fq_flow *of, *nf;
514 int fcnt = 0;
515 u32 idx;
516
517 for (idx = 0; idx < (1U << old_log); idx++) {
518 oroot = &old_array[idx];
519 while ((op = rb_first(oroot)) != NULL) {
520 rb_erase(op, oroot);
521 of = container_of(op, struct fq_flow, fq_node);
522 if (fq_gc_candidate(of)) {
523 fcnt++;
524 kmem_cache_free(fq_flow_cachep, of);
525 continue;
526 }
527 nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];
528
529 np = &nroot->rb_node;
530 parent = NULL;
531 while (*np) {
532 parent = *np;
533
534 nf = container_of(parent, struct fq_flow, fq_node);
535 BUG_ON(nf->sk == of->sk);
536
537 if (nf->sk > of->sk)
538 np = &parent->rb_right;
539 else
540 np = &parent->rb_left;
541 }
542
543 rb_link_node(&of->fq_node, parent, np);
544 rb_insert_color(&of->fq_node, nroot);
545 }
546 }
547 q->flows -= fcnt;
548 q->inactive_flows -= fcnt;
549 q->stat_gc_flows += fcnt;
550}
551
552static int fq_resize(struct fq_sched_data *q, u32 log)
553{
554 struct rb_root *array;
555 u32 idx;
556
557 if (q->fq_root && log == q->fq_trees_log)
558 return 0;
559
560 array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL);
561 if (!array)
562 return -ENOMEM;
563
564 for (idx = 0; idx < (1U << log); idx++)
565 array[idx] = RB_ROOT;
566
567 if (q->fq_root) {
568 fq_rehash(q, q->fq_root, q->fq_trees_log, array, log);
569 kfree(q->fq_root);
570 }
571 q->fq_root = array;
572 q->fq_trees_log = log;
573
574 return 0;
575}
576
577static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
578 [TCA_FQ_PLIMIT] = { .type = NLA_U32 },
579 [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
580 [TCA_FQ_QUANTUM] = { .type = NLA_U32 },
581 [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 },
582 [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 },
583 [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 },
584 [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
585 [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
586};
587
588static int fq_change(struct Qdisc *sch, struct nlattr *opt)
589{
590 struct fq_sched_data *q = qdisc_priv(sch);
591 struct nlattr *tb[TCA_FQ_MAX + 1];
592 int err, drop_count = 0;
593 u32 fq_log;
594
595 if (!opt)
596 return -EINVAL;
597
598 err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy);
599 if (err < 0)
600 return err;
601
602 sch_tree_lock(sch);
603
604 fq_log = q->fq_trees_log;
605
606 if (tb[TCA_FQ_BUCKETS_LOG]) {
607 u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
608
609 if (nval >= 1 && nval <= ilog2(256*1024))
610 fq_log = nval;
611 else
612 err = -EINVAL;
613 }
614 if (tb[TCA_FQ_PLIMIT])
615 sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
616
617 if (tb[TCA_FQ_FLOW_PLIMIT])
618 q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
619
620 if (tb[TCA_FQ_QUANTUM])
621 q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
622
623 if (tb[TCA_FQ_INITIAL_QUANTUM])
624 q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
625
626 if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
627 q->flow_default_rate = nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]);
628
629 if (tb[TCA_FQ_FLOW_MAX_RATE])
630 q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
631
632 if (tb[TCA_FQ_RATE_ENABLE]) {
633 u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
634
635 if (enable <= 1)
636 q->rate_enable = enable;
637 else
638 err = -EINVAL;
639 }
640
641 if (!err)
642 err = fq_resize(q, fq_log);
643
644 while (sch->q.qlen > sch->limit) {
645 struct sk_buff *skb = fq_dequeue(sch);
646
647 kfree_skb(skb);
648 drop_count++;
649 }
650 qdisc_tree_decrease_qlen(sch, drop_count);
651
652 sch_tree_unlock(sch);
653 return err;
654}
655
656static void fq_destroy(struct Qdisc *sch)
657{
658 struct fq_sched_data *q = qdisc_priv(sch);
659 struct rb_root *root;
660 struct rb_node *p;
661 unsigned int idx;
662
663 if (q->fq_root) {
664 for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
665 root = &q->fq_root[idx];
666 while ((p = rb_first(root)) != NULL) {
667 rb_erase(p, root);
668 kmem_cache_free(fq_flow_cachep,
669 container_of(p, struct fq_flow, fq_node));
670 }
671 }
672 kfree(q->fq_root);
673 }
674 qdisc_watchdog_cancel(&q->watchdog);
675}
676
677static int fq_init(struct Qdisc *sch, struct nlattr *opt)
678{
679 struct fq_sched_data *q = qdisc_priv(sch);
680 int err;
681
682 sch->limit = 10000;
683 q->flow_plimit = 100;
684 q->quantum = 2 * psched_mtu(qdisc_dev(sch));
685 q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
686 q->flow_default_rate = 0;
687 q->flow_max_rate = ~0U;
688 q->rate_enable = 1;
689 q->new_flows.first = NULL;
690 q->old_flows.first = NULL;
691 q->delayed = RB_ROOT;
692 q->fq_root = NULL;
693 q->fq_trees_log = ilog2(1024);
694 qdisc_watchdog_init(&q->watchdog, sch);
695
696 if (opt)
697 err = fq_change(sch, opt);
698 else
699 err = fq_resize(q, q->fq_trees_log);
700
701 return err;
702}
703
704static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
705{
706 struct fq_sched_data *q = qdisc_priv(sch);
707 struct nlattr *opts;
708
709 opts = nla_nest_start(skb, TCA_OPTIONS);
710 if (opts == NULL)
711 goto nla_put_failure;
712
713 if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
714 nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
715 nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
716 nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
717 nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
718 nla_put_u32(skb, TCA_FQ_FLOW_DEFAULT_RATE, q->flow_default_rate) ||
719 nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
720 nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
721 goto nla_put_failure;
722
723 nla_nest_end(skb, opts);
724 return skb->len;
725
726nla_put_failure:
727 return -1;
728}
729
730static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
731{
732 struct fq_sched_data *q = qdisc_priv(sch);
733 u64 now = ktime_to_ns(ktime_get());
734 struct tc_fq_qd_stats st = {
735 .gc_flows = q->stat_gc_flows,
736 .highprio_packets = q->stat_internal_packets,
737 .tcp_retrans = q->stat_tcp_retrans,
738 .throttled = q->stat_throttled,
739 .flows_plimit = q->stat_flows_plimit,
740 .pkts_too_long = q->stat_pkts_too_long,
741 .allocation_errors = q->stat_allocation_errors,
742 .flows = q->flows,
743 .inactive_flows = q->inactive_flows,
744 .throttled_flows = q->throttled_flows,
745 .time_next_delayed_flow = q->time_next_delayed_flow - now,
746 };
747
748 return gnet_stats_copy_app(d, &st, sizeof(st));
749}
750
751static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
752 .id = "fq",
753 .priv_size = sizeof(struct fq_sched_data),
754
755 .enqueue = fq_enqueue,
756 .dequeue = fq_dequeue,
757 .peek = qdisc_peek_dequeued,
758 .init = fq_init,
759 .reset = fq_reset,
760 .destroy = fq_destroy,
761 .change = fq_change,
762 .dump = fq_dump,
763 .dump_stats = fq_dump_stats,
764 .owner = THIS_MODULE,
765};
766
767static int __init fq_module_init(void)
768{
769 int ret;
770
771 fq_flow_cachep = kmem_cache_create("fq_flow_cache",
772 sizeof(struct fq_flow),
773 0, 0, NULL);
774 if (!fq_flow_cachep)
775 return -ENOMEM;
776
777 ret = register_qdisc(&fq_qdisc_ops);
778 if (ret)
779 kmem_cache_destroy(fq_flow_cachep);
780 return ret;
781}
782
783static void __exit fq_module_exit(void)
784{
785 unregister_qdisc(&fq_qdisc_ops);
786 kmem_cache_destroy(fq_flow_cachep);
787}
788
789module_init(fq_module_init)
790module_exit(fq_module_exit)
791MODULE_AUTHOR("Eric Dumazet");
792MODULE_LICENSE("GPL");