diff options
Diffstat (limited to 'net/sched/estimator.c')
-rw-r--r-- | net/sched/estimator.c | 197 |
1 files changed, 197 insertions, 0 deletions
diff --git a/net/sched/estimator.c b/net/sched/estimator.c new file mode 100644 index 000000000000..5d3ae03e22a7 --- /dev/null +++ b/net/sched/estimator.c | |||
@@ -0,0 +1,197 @@ | |||
1 | /* | ||
2 | * net/sched/estimator.c Simple rate estimator. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | * | ||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
10 | */ | ||
11 | |||
12 | #include <asm/uaccess.h> | ||
13 | #include <asm/system.h> | ||
14 | #include <linux/bitops.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/types.h> | ||
17 | #include <linux/kernel.h> | ||
18 | #include <linux/jiffies.h> | ||
19 | #include <linux/string.h> | ||
20 | #include <linux/mm.h> | ||
21 | #include <linux/socket.h> | ||
22 | #include <linux/sockios.h> | ||
23 | #include <linux/in.h> | ||
24 | #include <linux/errno.h> | ||
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/netdevice.h> | ||
27 | #include <linux/skbuff.h> | ||
28 | #include <linux/rtnetlink.h> | ||
29 | #include <linux/init.h> | ||
30 | #include <net/sock.h> | ||
31 | #include <net/pkt_sched.h> | ||
32 | |||
33 | /* | ||
34 | This code is NOT intended to be used for statistics collection, | ||
35 | its purpose is to provide a base for statistical multiplexing | ||
36 | for controlled load service. | ||
37 | If you need only statistics, run a user level daemon which | ||
38 | periodically reads byte counters. | ||
39 | |||
40 | Unfortunately, rate estimation is not a very easy task. | ||
41 | F.e. I did not find a simple way to estimate the current peak rate | ||
42 | and even failed to formulate the problem 8)8) | ||
43 | |||
44 | So I preferred not to built an estimator into the scheduler, | ||
45 | but run this task separately. | ||
46 | Ideally, it should be kernel thread(s), but for now it runs | ||
47 | from timers, which puts apparent top bounds on the number of rated | ||
48 | flows, has minimal overhead on small, but is enough | ||
49 | to handle controlled load service, sets of aggregates. | ||
50 | |||
51 | We measure rate over A=(1<<interval) seconds and evaluate EWMA: | ||
52 | |||
53 | avrate = avrate*(1-W) + rate*W | ||
54 | |||
55 | where W is chosen as negative power of 2: W = 2^(-ewma_log) | ||
56 | |||
57 | The resulting time constant is: | ||
58 | |||
59 | T = A/(-ln(1-W)) | ||
60 | |||
61 | |||
62 | NOTES. | ||
63 | |||
64 | * The stored value for avbps is scaled by 2^5, so that maximal | ||
65 | rate is ~1Gbit, avpps is scaled by 2^10. | ||
66 | |||
67 | * Minimal interval is HZ/4=250msec (it is the greatest common divisor | ||
68 | for HZ=100 and HZ=1024 8)), maximal interval | ||
69 | is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals | ||
70 | are too expensive, longer ones can be implemented | ||
71 | at user level painlessly. | ||
72 | */ | ||
73 | |||
74 | #define EST_MAX_INTERVAL 5 | ||
75 | |||
76 | struct qdisc_estimator | ||
77 | { | ||
78 | struct qdisc_estimator *next; | ||
79 | struct tc_stats *stats; | ||
80 | spinlock_t *stats_lock; | ||
81 | unsigned interval; | ||
82 | int ewma_log; | ||
83 | u64 last_bytes; | ||
84 | u32 last_packets; | ||
85 | u32 avpps; | ||
86 | u32 avbps; | ||
87 | }; | ||
88 | |||
89 | struct qdisc_estimator_head | ||
90 | { | ||
91 | struct timer_list timer; | ||
92 | struct qdisc_estimator *list; | ||
93 | }; | ||
94 | |||
95 | static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1]; | ||
96 | |||
97 | /* Estimator array lock */ | ||
98 | static DEFINE_RWLOCK(est_lock); | ||
99 | |||
100 | static void est_timer(unsigned long arg) | ||
101 | { | ||
102 | int idx = (int)arg; | ||
103 | struct qdisc_estimator *e; | ||
104 | |||
105 | read_lock(&est_lock); | ||
106 | for (e = elist[idx].list; e; e = e->next) { | ||
107 | struct tc_stats *st = e->stats; | ||
108 | u64 nbytes; | ||
109 | u32 npackets; | ||
110 | u32 rate; | ||
111 | |||
112 | spin_lock(e->stats_lock); | ||
113 | nbytes = st->bytes; | ||
114 | npackets = st->packets; | ||
115 | rate = (nbytes - e->last_bytes)<<(7 - idx); | ||
116 | e->last_bytes = nbytes; | ||
117 | e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log; | ||
118 | st->bps = (e->avbps+0xF)>>5; | ||
119 | |||
120 | rate = (npackets - e->last_packets)<<(12 - idx); | ||
121 | e->last_packets = npackets; | ||
122 | e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log; | ||
123 | e->stats->pps = (e->avpps+0x1FF)>>10; | ||
124 | spin_unlock(e->stats_lock); | ||
125 | } | ||
126 | |||
127 | mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4)); | ||
128 | read_unlock(&est_lock); | ||
129 | } | ||
130 | |||
131 | int qdisc_new_estimator(struct tc_stats *stats, spinlock_t *stats_lock, struct rtattr *opt) | ||
132 | { | ||
133 | struct qdisc_estimator *est; | ||
134 | struct tc_estimator *parm = RTA_DATA(opt); | ||
135 | |||
136 | if (RTA_PAYLOAD(opt) < sizeof(*parm)) | ||
137 | return -EINVAL; | ||
138 | |||
139 | if (parm->interval < -2 || parm->interval > 3) | ||
140 | return -EINVAL; | ||
141 | |||
142 | est = kmalloc(sizeof(*est), GFP_KERNEL); | ||
143 | if (est == NULL) | ||
144 | return -ENOBUFS; | ||
145 | |||
146 | memset(est, 0, sizeof(*est)); | ||
147 | est->interval = parm->interval + 2; | ||
148 | est->stats = stats; | ||
149 | est->stats_lock = stats_lock; | ||
150 | est->ewma_log = parm->ewma_log; | ||
151 | est->last_bytes = stats->bytes; | ||
152 | est->avbps = stats->bps<<5; | ||
153 | est->last_packets = stats->packets; | ||
154 | est->avpps = stats->pps<<10; | ||
155 | |||
156 | est->next = elist[est->interval].list; | ||
157 | if (est->next == NULL) { | ||
158 | init_timer(&elist[est->interval].timer); | ||
159 | elist[est->interval].timer.data = est->interval; | ||
160 | elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4); | ||
161 | elist[est->interval].timer.function = est_timer; | ||
162 | add_timer(&elist[est->interval].timer); | ||
163 | } | ||
164 | write_lock_bh(&est_lock); | ||
165 | elist[est->interval].list = est; | ||
166 | write_unlock_bh(&est_lock); | ||
167 | return 0; | ||
168 | } | ||
169 | |||
170 | void qdisc_kill_estimator(struct tc_stats *stats) | ||
171 | { | ||
172 | int idx; | ||
173 | struct qdisc_estimator *est, **pest; | ||
174 | |||
175 | for (idx=0; idx <= EST_MAX_INTERVAL; idx++) { | ||
176 | int killed = 0; | ||
177 | pest = &elist[idx].list; | ||
178 | while ((est=*pest) != NULL) { | ||
179 | if (est->stats != stats) { | ||
180 | pest = &est->next; | ||
181 | continue; | ||
182 | } | ||
183 | |||
184 | write_lock_bh(&est_lock); | ||
185 | *pest = est->next; | ||
186 | write_unlock_bh(&est_lock); | ||
187 | |||
188 | kfree(est); | ||
189 | killed++; | ||
190 | } | ||
191 | if (killed && elist[idx].list == NULL) | ||
192 | del_timer(&elist[idx].timer); | ||
193 | } | ||
194 | } | ||
195 | |||
196 | EXPORT_SYMBOL(qdisc_kill_estimator); | ||
197 | EXPORT_SYMBOL(qdisc_new_estimator); | ||