aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2015-05-08 18:05:12 -0400
committerDavid S. Miller <davem@davemloft.net>2015-05-10 19:50:20 -0400
commit80ba92fa1a92dea128283f69f55b02242e213650 (patch)
tree726f9c72b6fba997542ed8439bab7302c0cd763d
parentcf9d0dcc5a46f0a3dcb7905aa6ffa8e4bbc1cc99 (diff)
codel: add ce_threshold attribute
For DCTCP or similar ECN based deployments on fabrics with shallow buffers, hosts are responsible for a good part of the buffering.

This patch adds an optional ce_threshold to codel & fq_codel qdiscs, so that DCTCP can have feedback from queuing in the host.

A DCTCP enabled egress port simply has a queue occupancy threshold above which ECT packets get CE marked.

In codel language this translates to a sojourn time, so that one doesn't have to worry about bytes or bandwidth but delays.

This makes the host an active participant in the health of the whole network.

This also helps experimenting with DCTCP in a setup without DCTCP compliant fabric.

In the following example, ce_threshold is set to 1ms, and we can see from 'ldelay xxx us' that TCP is not trying to go around the 5ms codel target.

The queue has more capacity to absorb inelastic bursts (say from UDP traffic), as queues are maintained to an optimal level.

lpaa23:~# ./tc -s -d qd sh dev eth1
qdisc mq 1: dev eth1 root
 Sent 87910654696 bytes 58065331 pkt (dropped 0, overlimits 0 requeues 42961)
 backlog 3108242b 364p requeues 42961
qdisc codel 8063: dev eth1 parent 1:1 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
 Sent 7363778701 bytes 4863809 pkt (dropped 0, overlimits 0 requeues 5503)
 rate 2348Mbit 193919pps backlog 255866b 46p requeues 5503
  count 0 lastcount 0 ldelay 1.0ms drop_next 0us
  maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 72384
qdisc codel 8064: dev eth1 parent 1:2 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
 Sent 7636486190 bytes 5043942 pkt (dropped 0, overlimits 0 requeues 5186)
 rate 2319Mbit 191538pps backlog 207418b 64p requeues 5186
  count 0 lastcount 0 ldelay 694us drop_next 0us
  maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 69873
qdisc codel 8065: dev eth1 parent 1:3 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
 Sent 11569360142 bytes 7641602 pkt (dropped 0, overlimits 0 requeues 5554)
 rate 3041Mbit 251096pps backlog 210446b 59p requeues 5554
  count 0 lastcount 0 ldelay 889us drop_next 0us
  maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 37780
...

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Glenn Judd <glenn.judd@morganstanley.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/codel.h12
-rw-r--r--include/uapi/linux/pkt_sched.h4
-rw-r--r--net/sched/sch_codel.c15
-rw-r--r--net/sched/sch_fq_codel.c15
4 files changed, 42 insertions, 4 deletions
diff --git a/include/net/codel.h b/include/net/codel.h
index aeee28081245..8c0f78f209e8 100644
--- a/include/net/codel.h
+++ b/include/net/codel.h
@@ -7,7 +7,7 @@
7 * Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com> 7 * Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com>
8 * Copyright (C) 2011-2012 Van Jacobson <van@pollere.net> 8 * Copyright (C) 2011-2012 Van Jacobson <van@pollere.net>
9 * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net> 9 * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
10 * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> 10 * Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com>
11 * 11 *
12 * Redistribution and use in source and binary forms, with or without 12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions 13 * modification, are permitted provided that the following conditions
@@ -119,11 +119,13 @@ static inline u32 codel_time_to_us(codel_time_t val)
119/** 119/**
120 * struct codel_params - contains codel parameters 120 * struct codel_params - contains codel parameters
121 * @target: target queue size (in time units) 121 * @target: target queue size (in time units)
122 * @ce_threshold: threshold for marking packets with ECN CE
122 * @interval: width of moving time window 123 * @interval: width of moving time window
123 * @ecn: is Explicit Congestion Notification enabled 124 * @ecn: is Explicit Congestion Notification enabled
124 */ 125 */
125struct codel_params { 126struct codel_params {
126 codel_time_t target; 127 codel_time_t target;
128 codel_time_t ce_threshold;
127 codel_time_t interval; 129 codel_time_t interval;
128 bool ecn; 130 bool ecn;
129}; 131};
@@ -159,17 +161,22 @@ struct codel_vars {
159 * @maxpacket: largest packet we've seen so far 161 * @maxpacket: largest packet we've seen so far
160 * @drop_count: temp count of dropped packets in dequeue() 162 * @drop_count: temp count of dropped packets in dequeue()
161 * ecn_mark: number of packets we ECN marked instead of dropping 163 * ecn_mark: number of packets we ECN marked instead of dropping
164 * ce_mark: number of packets CE marked because sojourn time was above ce_threshold
162 */ 165 */
163struct codel_stats { 166struct codel_stats {
164 u32 maxpacket; 167 u32 maxpacket;
165 u32 drop_count; 168 u32 drop_count;
166 u32 ecn_mark; 169 u32 ecn_mark;
170 u32 ce_mark;
167}; 171};
168 172
173#define CODEL_DISABLED_THRESHOLD INT_MAX
174
169static void codel_params_init(struct codel_params *params) 175static void codel_params_init(struct codel_params *params)
170{ 176{
171 params->interval = MS2TIME(100); 177 params->interval = MS2TIME(100);
172 params->target = MS2TIME(5); 178 params->target = MS2TIME(5);
179 params->ce_threshold = CODEL_DISABLED_THRESHOLD;
173 params->ecn = false; 180 params->ecn = false;
174} 181}
175 182
@@ -350,6 +357,9 @@ static struct sk_buff *codel_dequeue(struct Qdisc *sch,
350 vars->rec_inv_sqrt); 357 vars->rec_inv_sqrt);
351 } 358 }
352end: 359end:
360 if (skb && codel_time_after(vars->ldelay, params->ce_threshold) &&
361 INET_ECN_set_ce(skb))
362 stats->ce_mark++;
353 return skb; 363 return skb;
354} 364}
355#endif 365#endif
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 534b84710745..69d88b309cc7 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -679,6 +679,7 @@ enum {
679 TCA_CODEL_LIMIT, 679 TCA_CODEL_LIMIT,
680 TCA_CODEL_INTERVAL, 680 TCA_CODEL_INTERVAL,
681 TCA_CODEL_ECN, 681 TCA_CODEL_ECN,
682 TCA_CODEL_CE_THRESHOLD,
682 __TCA_CODEL_MAX 683 __TCA_CODEL_MAX
683}; 684};
684 685
@@ -695,6 +696,7 @@ struct tc_codel_xstats {
695 __u32 drop_overlimit; /* number of time max qdisc packet limit was hit */ 696 __u32 drop_overlimit; /* number of time max qdisc packet limit was hit */
696 __u32 ecn_mark; /* number of packets we ECN marked instead of dropped */ 697 __u32 ecn_mark; /* number of packets we ECN marked instead of dropped */
697 __u32 dropping; /* are we in dropping state ? */ 698 __u32 dropping; /* are we in dropping state ? */
699 __u32 ce_mark; /* number of CE marked packets because of ce_threshold */
698}; 700};
699 701
700/* FQ_CODEL */ 702/* FQ_CODEL */
@@ -707,6 +709,7 @@ enum {
707 TCA_FQ_CODEL_ECN, 709 TCA_FQ_CODEL_ECN,
708 TCA_FQ_CODEL_FLOWS, 710 TCA_FQ_CODEL_FLOWS,
709 TCA_FQ_CODEL_QUANTUM, 711 TCA_FQ_CODEL_QUANTUM,
712 TCA_FQ_CODEL_CE_THRESHOLD,
710 __TCA_FQ_CODEL_MAX 713 __TCA_FQ_CODEL_MAX
711}; 714};
712 715
@@ -730,6 +733,7 @@ struct tc_fq_codel_qd_stats {
730 */ 733 */
731 __u32 new_flows_len; /* count of flows in new list */ 734 __u32 new_flows_len; /* count of flows in new list */
732 __u32 old_flows_len; /* count of flows in old list */ 735 __u32 old_flows_len; /* count of flows in old list */
736 __u32 ce_mark; /* packets above ce_threshold */
733}; 737};
734 738
735struct tc_fq_codel_cl_stats { 739struct tc_fq_codel_cl_stats {
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index de28f8e968e8..1474b6560fac 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -6,7 +6,7 @@
6 * 6 *
7 * Implemented on linux by : 7 * Implemented on linux by :
8 * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net> 8 * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
9 * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> 9 * Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com>
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions 12 * modification, are permitted provided that the following conditions
@@ -109,6 +109,7 @@ static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
109 [TCA_CODEL_LIMIT] = { .type = NLA_U32 }, 109 [TCA_CODEL_LIMIT] = { .type = NLA_U32 },
110 [TCA_CODEL_INTERVAL] = { .type = NLA_U32 }, 110 [TCA_CODEL_INTERVAL] = { .type = NLA_U32 },
111 [TCA_CODEL_ECN] = { .type = NLA_U32 }, 111 [TCA_CODEL_ECN] = { .type = NLA_U32 },
112 [TCA_CODEL_CE_THRESHOLD]= { .type = NLA_U32 },
112}; 113};
113 114
114static int codel_change(struct Qdisc *sch, struct nlattr *opt) 115static int codel_change(struct Qdisc *sch, struct nlattr *opt)
@@ -133,6 +134,12 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt)
133 q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT; 134 q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT;
134 } 135 }
135 136
137 if (tb[TCA_CODEL_CE_THRESHOLD]) {
138 u64 val = nla_get_u32(tb[TCA_CODEL_CE_THRESHOLD]);
139
140 q->params.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT;
141 }
142
136 if (tb[TCA_CODEL_INTERVAL]) { 143 if (tb[TCA_CODEL_INTERVAL]) {
137 u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]); 144 u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]);
138 145
@@ -201,7 +208,10 @@ static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
201 nla_put_u32(skb, TCA_CODEL_ECN, 208 nla_put_u32(skb, TCA_CODEL_ECN,
202 q->params.ecn)) 209 q->params.ecn))
203 goto nla_put_failure; 210 goto nla_put_failure;
204 211 if (q->params.ce_threshold != CODEL_DISABLED_THRESHOLD &&
212 nla_put_u32(skb, TCA_CODEL_CE_THRESHOLD,
213 codel_time_to_us(q->params.ce_threshold)))
214 goto nla_put_failure;
205 return nla_nest_end(skb, opts); 215 return nla_nest_end(skb, opts);
206 216
207nla_put_failure: 217nla_put_failure:
@@ -220,6 +230,7 @@ static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
220 .ldelay = codel_time_to_us(q->vars.ldelay), 230 .ldelay = codel_time_to_us(q->vars.ldelay),
221 .dropping = q->vars.dropping, 231 .dropping = q->vars.dropping,
222 .ecn_mark = q->stats.ecn_mark, 232 .ecn_mark = q->stats.ecn_mark,
233 .ce_mark = q->stats.ce_mark,
223 }; 234 };
224 235
225 if (q->vars.dropping) { 236 if (q->vars.dropping) {
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index a6fc53d69513..778739786b32 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -6,7 +6,7 @@
6 * as published by the Free Software Foundation; either version 6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version. 7 * 2 of the License, or (at your option) any later version.
8 * 8 *
9 * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> 9 * Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com>
10 */ 10 */
11 11
12#include <linux/module.h> 12#include <linux/module.h>
@@ -292,6 +292,7 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
292 [TCA_FQ_CODEL_ECN] = { .type = NLA_U32 }, 292 [TCA_FQ_CODEL_ECN] = { .type = NLA_U32 },
293 [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 }, 293 [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 },
294 [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 }, 294 [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 },
295 [TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 },
295}; 296};
296 297
297static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) 298static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
@@ -322,6 +323,12 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
322 q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT; 323 q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT;
323 } 324 }
324 325
326 if (tb[TCA_FQ_CODEL_CE_THRESHOLD]) {
327 u64 val = nla_get_u32(tb[TCA_FQ_CODEL_CE_THRESHOLD]);
328
329 q->cparams.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT;
330 }
331
325 if (tb[TCA_FQ_CODEL_INTERVAL]) { 332 if (tb[TCA_FQ_CODEL_INTERVAL]) {
326 u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); 333 u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]);
327 334
@@ -441,6 +448,11 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
441 q->flows_cnt)) 448 q->flows_cnt))
442 goto nla_put_failure; 449 goto nla_put_failure;
443 450
451 if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD &&
452 nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD,
453 codel_time_to_us(q->cparams.ce_threshold)))
454 goto nla_put_failure;
455
444 return nla_nest_end(skb, opts); 456 return nla_nest_end(skb, opts);
445 457
446nla_put_failure: 458nla_put_failure:
@@ -459,6 +471,7 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
459 st.qdisc_stats.drop_overlimit = q->drop_overlimit; 471 st.qdisc_stats.drop_overlimit = q->drop_overlimit;
460 st.qdisc_stats.ecn_mark = q->cstats.ecn_mark; 472 st.qdisc_stats.ecn_mark = q->cstats.ecn_mark;
461 st.qdisc_stats.new_flow_count = q->new_flow_count; 473 st.qdisc_stats.new_flow_count = q->new_flow_count;
474 st.qdisc_stats.ce_mark = q->cstats.ce_mark;
462 475
463 list_for_each(pos, &q->new_flows) 476 list_for_each(pos, &q->new_flows)
464 st.qdisc_stats.new_flows_len++; 477 st.qdisc_stats.new_flows_len++;