about summary refs log tree commit diff stats
path: root/net/sched
diff options
context:
space:
mode:
authorVimalkumar <j.vimal@gmail.com>2012-10-31 02:04:11 -0400
committerDavid S. Miller <davem@davemloft.net>2012-11-03 15:24:01 -0400
commit56b765b79e9a78dc7d3f8850ba5e5567205a3ecd (patch)
tree890f819336364d0857624695614f9b5dd77b6a6e /net/sched
parentafb97186f5d8f1d552298e7423e84c4282e48b92 (diff)
htb: improved accuracy at high rates
Current HTB (and TBF) uses rate table computed by the "tc" userspace program, which has the following issue: The rate table has 256 entries to map packet lengths to token (time units). With TSO sized packets, the 256 entry granularity leads to loss/gain of rate, making the token bucket inaccurate. Thus, instead of relying on rate table, this patch explicitly computes the time and accounts for packet transmission times with nanosecond granularity. This greatly improves accuracy of HTB with a wide range of packet sizes. Example: tc qdisc add dev $dev root handle 1: \ htb default 1 tc class add dev $dev classid 1:1 parent 1: \ rate 5Gbit mtu 64k Here is an example of inaccuracy: $ iperf -c host -t 10 -i 1 With old htb: eth4: 34.76 Mb/s In 5827.98 Mb/s Out - 65836.0 p/s In 481273.0 p/s Out [SUM] 9.0-10.0 sec 669 MBytes 5.61 Gbits/sec [SUM] 0.0-10.0 sec 6.50 GBytes 5.58 Gbits/sec With new htb: eth4: 28.36 Mb/s In 5208.06 Mb/s Out - 53704.0 p/s In 430076.0 p/s Out [SUM] 9.0-10.0 sec 594 MBytes 4.98 Gbits/sec [SUM] 0.0-10.0 sec 5.80 GBytes 4.98 Gbits/sec The bits per second on the wire is still 5200Mb/s with new HTB because qdisc accounts for packet length using skb->len, which is smaller than total bytes on the wire if GSO is used. But that is for another patch regardless of how time is accounted. Many thanks to Eric Dumazet for review and feedback. Signed-off-by: Vimalkumar <j.vimal@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/sched')
-rw-r--r--net/sched/sch_htb.c128
1 file changed, 90 insertions, 38 deletions
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 9d75b7761313..32a80977cf54 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -71,6 +71,12 @@ enum htb_cmode {
71 HTB_CAN_SEND /* class can send */ 71 HTB_CAN_SEND /* class can send */
72}; 72};
73 73
74struct htb_rate_cfg {
75 u64 rate_bps;
76 u32 mult;
77 u32 shift;
78};
79
74/* interior & leaf nodes; props specific to leaves are marked L: */ 80/* interior & leaf nodes; props specific to leaves are marked L: */
75struct htb_class { 81struct htb_class {
76 struct Qdisc_class_common common; 82 struct Qdisc_class_common common;
@@ -118,11 +124,11 @@ struct htb_class {
118 int filter_cnt; 124 int filter_cnt;
119 125
120 /* token bucket parameters */ 126 /* token bucket parameters */
121 struct qdisc_rate_table *rate; /* rate table of the class itself */ 127 struct htb_rate_cfg rate;
122 struct qdisc_rate_table *ceil; /* ceiling rate (limits borrows too) */ 128 struct htb_rate_cfg ceil;
123 long buffer, cbuffer; /* token bucket depth/rate */ 129 s64 buffer, cbuffer; /* token bucket depth/rate */
124 psched_tdiff_t mbuffer; /* max wait time */ 130 psched_tdiff_t mbuffer; /* max wait time */
125 long tokens, ctokens; /* current number of tokens */ 131 s64 tokens, ctokens; /* current number of tokens */
126 psched_time_t t_c; /* checkpoint time */ 132 psched_time_t t_c; /* checkpoint time */
127}; 133};
128 134
@@ -162,6 +168,45 @@ struct htb_sched {
162 struct work_struct work; 168 struct work_struct work;
163}; 169};
164 170
171static u64 l2t_ns(struct htb_rate_cfg *r, unsigned int len)
172{
173 return ((u64)len * r->mult) >> r->shift;
174}
175
176static void htb_precompute_ratedata(struct htb_rate_cfg *r)
177{
178 u64 factor;
179 u64 mult;
180 int shift;
181
182 r->shift = 0;
183 r->mult = 1;
184 /*
185 * Calibrate mult, shift so that token counting is accurate
186 * for smallest packet size (64 bytes). Token (time in ns) is
187 * computed as (bytes * 8) * NSEC_PER_SEC / rate_bps. It will
188 * work as long as the smallest packet transfer time can be
189 * accurately represented in nanosec.
190 */
191 if (r->rate_bps > 0) {
192 /*
193 * Higher shift gives better accuracy. Find the largest
194 * shift such that mult fits in 32 bits.
195 */
196 for (shift = 0; shift < 16; shift++) {
197 r->shift = shift;
198 factor = 8LLU * NSEC_PER_SEC * (1 << r->shift);
199 mult = div64_u64(factor, r->rate_bps);
200 if (mult > UINT_MAX)
201 break;
202 }
203
204 r->shift = shift - 1;
205 factor = 8LLU * NSEC_PER_SEC * (1 << r->shift);
206 r->mult = div64_u64(factor, r->rate_bps);
207 }
208}
209
165/* find class in global hash table using given handle */ 210/* find class in global hash table using given handle */
166static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch) 211static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
167{ 212{
@@ -273,7 +318,7 @@ static void htb_add_to_id_tree(struct rb_root *root,
273 * already in the queue. 318 * already in the queue.
274 */ 319 */
275static void htb_add_to_wait_tree(struct htb_sched *q, 320static void htb_add_to_wait_tree(struct htb_sched *q,
276 struct htb_class *cl, long delay) 321 struct htb_class *cl, s64 delay)
277{ 322{
278 struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL; 323 struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
279 324
@@ -441,14 +486,14 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
441 htb_remove_class_from_row(q, cl, mask); 486 htb_remove_class_from_row(q, cl, mask);
442} 487}
443 488
444static inline long htb_lowater(const struct htb_class *cl) 489static inline s64 htb_lowater(const struct htb_class *cl)
445{ 490{
446 if (htb_hysteresis) 491 if (htb_hysteresis)
447 return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0; 492 return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
448 else 493 else
449 return 0; 494 return 0;
450} 495}
451static inline long htb_hiwater(const struct htb_class *cl) 496static inline s64 htb_hiwater(const struct htb_class *cl)
452{ 497{
453 if (htb_hysteresis) 498 if (htb_hysteresis)
454 return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0; 499 return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
@@ -469,9 +514,9 @@ static inline long htb_hiwater(const struct htb_class *cl)
469 * mode transitions per time unit. The speed gain is about 1/6. 514 * mode transitions per time unit. The speed gain is about 1/6.
470 */ 515 */
471static inline enum htb_cmode 516static inline enum htb_cmode
472htb_class_mode(struct htb_class *cl, long *diff) 517htb_class_mode(struct htb_class *cl, s64 *diff)
473{ 518{
474 long toks; 519 s64 toks;
475 520
476 if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) { 521 if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
477 *diff = -toks; 522 *diff = -toks;
@@ -495,7 +540,7 @@ htb_class_mode(struct htb_class *cl, long *diff)
495 * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree). 540 * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
496 */ 541 */
497static void 542static void
498htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff) 543htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
499{ 544{
500 enum htb_cmode new_mode = htb_class_mode(cl, diff); 545 enum htb_cmode new_mode = htb_class_mode(cl, diff);
501 546
@@ -581,26 +626,26 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
581 return NET_XMIT_SUCCESS; 626 return NET_XMIT_SUCCESS;
582} 627}
583 628
584static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, long diff) 629static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
585{ 630{
586 long toks = diff + cl->tokens; 631 s64 toks = diff + cl->tokens;
587 632
588 if (toks > cl->buffer) 633 if (toks > cl->buffer)
589 toks = cl->buffer; 634 toks = cl->buffer;
590 toks -= (long) qdisc_l2t(cl->rate, bytes); 635 toks -= (s64) l2t_ns(&cl->rate, bytes);
591 if (toks <= -cl->mbuffer) 636 if (toks <= -cl->mbuffer)
592 toks = 1 - cl->mbuffer; 637 toks = 1 - cl->mbuffer;
593 638
594 cl->tokens = toks; 639 cl->tokens = toks;
595} 640}
596 641
597static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, long diff) 642static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
598{ 643{
599 long toks = diff + cl->ctokens; 644 s64 toks = diff + cl->ctokens;
600 645
601 if (toks > cl->cbuffer) 646 if (toks > cl->cbuffer)
602 toks = cl->cbuffer; 647 toks = cl->cbuffer;
603 toks -= (long) qdisc_l2t(cl->ceil, bytes); 648 toks -= (s64) l2t_ns(&cl->ceil, bytes);
604 if (toks <= -cl->mbuffer) 649 if (toks <= -cl->mbuffer)
605 toks = 1 - cl->mbuffer; 650 toks = 1 - cl->mbuffer;
606 651
@@ -623,10 +668,10 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
623{ 668{
624 int bytes = qdisc_pkt_len(skb); 669 int bytes = qdisc_pkt_len(skb);
625 enum htb_cmode old_mode; 670 enum htb_cmode old_mode;
626 long diff; 671 s64 diff;
627 672
628 while (cl) { 673 while (cl) {
629 diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer); 674 diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
630 if (cl->level >= level) { 675 if (cl->level >= level) {
631 if (cl->level == level) 676 if (cl->level == level)
632 cl->xstats.lends++; 677 cl->xstats.lends++;
@@ -673,7 +718,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
673 unsigned long stop_at = start + 2; 718 unsigned long stop_at = start + 2;
674 while (time_before(jiffies, stop_at)) { 719 while (time_before(jiffies, stop_at)) {
675 struct htb_class *cl; 720 struct htb_class *cl;
676 long diff; 721 s64 diff;
677 struct rb_node *p = rb_first(&q->wait_pq[level]); 722 struct rb_node *p = rb_first(&q->wait_pq[level]);
678 723
679 if (!p) 724 if (!p)
@@ -684,7 +729,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
684 return cl->pq_key; 729 return cl->pq_key;
685 730
686 htb_safe_rb_erase(p, q->wait_pq + level); 731 htb_safe_rb_erase(p, q->wait_pq + level);
687 diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer); 732 diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
688 htb_change_class_mode(q, cl, &diff); 733 htb_change_class_mode(q, cl, &diff);
689 if (cl->cmode != HTB_CAN_SEND) 734 if (cl->cmode != HTB_CAN_SEND)
690 htb_add_to_wait_tree(q, cl, diff); 735 htb_add_to_wait_tree(q, cl, diff);
@@ -834,7 +879,6 @@ next:
834 } while (cl != start); 879 } while (cl != start);
835 880
836 if (likely(skb != NULL)) { 881 if (likely(skb != NULL)) {
837 bstats_update(&cl->bstats, skb);
838 cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb); 882 cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
839 if (cl->un.leaf.deficit[level] < 0) { 883 if (cl->un.leaf.deficit[level] < 0) {
840 cl->un.leaf.deficit[level] += cl->quantum; 884 cl->un.leaf.deficit[level] += cl->quantum;
@@ -871,10 +915,10 @@ ok:
871 915
872 if (!sch->q.qlen) 916 if (!sch->q.qlen)
873 goto fin; 917 goto fin;
874 q->now = psched_get_time(); 918 q->now = ktime_to_ns(ktime_get());
875 start_at = jiffies; 919 start_at = jiffies;
876 920
877 next_event = q->now + 5 * PSCHED_TICKS_PER_SEC; 921 next_event = q->now + 5 * NSEC_PER_SEC;
878 922
879 for (level = 0; level < TC_HTB_MAXDEPTH; level++) { 923 for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
880 /* common case optimization - skip event handler quickly */ 924 /* common case optimization - skip event handler quickly */
@@ -884,7 +928,7 @@ ok:
884 if (q->now >= q->near_ev_cache[level]) { 928 if (q->now >= q->near_ev_cache[level]) {
885 event = htb_do_events(q, level, start_at); 929 event = htb_do_events(q, level, start_at);
886 if (!event) 930 if (!event)
887 event = q->now + PSCHED_TICKS_PER_SEC; 931 event = q->now + NSEC_PER_SEC;
888 q->near_ev_cache[level] = event; 932 q->near_ev_cache[level] = event;
889 } else 933 } else
890 event = q->near_ev_cache[level]; 934 event = q->near_ev_cache[level];
@@ -903,10 +947,17 @@ ok:
903 } 947 }
904 } 948 }
905 sch->qstats.overlimits++; 949 sch->qstats.overlimits++;
906 if (likely(next_event > q->now)) 950 if (likely(next_event > q->now)) {
907 qdisc_watchdog_schedule(&q->watchdog, next_event); 951 if (!test_bit(__QDISC_STATE_DEACTIVATED,
908 else 952 &qdisc_root_sleeping(q->watchdog.qdisc)->state)) {
953 ktime_t time = ns_to_ktime(next_event);
954 qdisc_throttled(q->watchdog.qdisc);
955 hrtimer_start(&q->watchdog.timer, time,
956 HRTIMER_MODE_ABS);
957 }
958 } else {
909 schedule_work(&q->work); 959 schedule_work(&q->work);
960 }
910fin: 961fin:
911 return skb; 962 return skb;
912} 963}
@@ -1082,9 +1133,9 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1082 1133
1083 memset(&opt, 0, sizeof(opt)); 1134 memset(&opt, 0, sizeof(opt));
1084 1135
1085 opt.rate = cl->rate->rate; 1136 opt.rate.rate = cl->rate.rate_bps >> 3;
1086 opt.buffer = cl->buffer; 1137 opt.buffer = cl->buffer;
1087 opt.ceil = cl->ceil->rate; 1138 opt.ceil.rate = cl->ceil.rate_bps >> 3;
1088 opt.cbuffer = cl->cbuffer; 1139 opt.cbuffer = cl->cbuffer;
1089 opt.quantum = cl->quantum; 1140 opt.quantum = cl->quantum;
1090 opt.prio = cl->prio; 1141 opt.prio = cl->prio;
@@ -1203,9 +1254,6 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1203 qdisc_destroy(cl->un.leaf.q); 1254 qdisc_destroy(cl->un.leaf.q);
1204 } 1255 }
1205 gen_kill_estimator(&cl->bstats, &cl->rate_est); 1256 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1206 qdisc_put_rtab(cl->rate);
1207 qdisc_put_rtab(cl->ceil);
1208
1209 tcf_destroy_chain(&cl->filter_list); 1257 tcf_destroy_chain(&cl->filter_list);
1210 kfree(cl); 1258 kfree(cl);
1211} 1259}
@@ -1460,12 +1508,16 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
1460 1508
1461 cl->buffer = hopt->buffer; 1509 cl->buffer = hopt->buffer;
1462 cl->cbuffer = hopt->cbuffer; 1510 cl->cbuffer = hopt->cbuffer;
1463 if (cl->rate) 1511
1464 qdisc_put_rtab(cl->rate); 1512 cl->rate.rate_bps = (u64)rtab->rate.rate << 3;
1465 cl->rate = rtab; 1513 cl->ceil.rate_bps = (u64)ctab->rate.rate << 3;
1466 if (cl->ceil) 1514
1467 qdisc_put_rtab(cl->ceil); 1515 htb_precompute_ratedata(&cl->rate);
1468 cl->ceil = ctab; 1516 htb_precompute_ratedata(&cl->ceil);
1517
1518 cl->buffer = hopt->buffer << PSCHED_SHIFT;
1519 cl->cbuffer = hopt->buffer << PSCHED_SHIFT;
1520
1469 sch_tree_unlock(sch); 1521 sch_tree_unlock(sch);
1470 1522
1471 qdisc_class_hash_grow(sch, &q->clhash); 1523 qdisc_class_hash_grow(sch, &q->clhash);