diff options
-rw-r--r-- | include/linux/pkt_sched.h | 50 | ||||
-rw-r--r-- | include/linux/skbuff.h | 38 | ||||
-rw-r--r-- | include/net/inet_ecn.h | 28 | ||||
-rw-r--r-- | include/net/inet_hashtables.h | 2 | ||||
-rw-r--r-- | include/net/red.h | 325 | ||||
-rw-r--r-- | net/core/stream.c | 12 | ||||
-rw-r--r-- | net/dccp/ipv4.c | 32 | ||||
-rw-r--r-- | net/ipv4/inet_connection_sock.c | 14 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_conntrack_helper_pptp.c | 4 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_conntrack_netlink.c | 19 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_nat_core.c | 6 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_nat_helper_pptp.c | 2 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_nat_proto_gre.c | 4 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_nat_proto_unknown.c | 2 | ||||
-rw-r--r-- | net/ipv4/netfilter/ipt_CONNMARK.c | 1 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 1 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 2 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 15 | ||||
-rw-r--r-- | net/netfilter/nf_queue.c | 2 | ||||
-rw-r--r-- | net/netfilter/nfnetlink_log.c | 6 | ||||
-rw-r--r-- | net/netfilter/nfnetlink_queue.c | 6 | ||||
-rw-r--r-- | net/sched/sch_gred.c | 841 | ||||
-rw-r--r-- | net/sched/sch_netem.c | 122 | ||||
-rw-r--r-- | net/sched/sch_red.c | 418 |
24 files changed, 1066 insertions, 886 deletions
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index 60ffcb9c5791..e87b233615b3 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h | |||
@@ -93,6 +93,7 @@ struct tc_fifo_qopt | |||
93 | /* PRIO section */ | 93 | /* PRIO section */ |
94 | 94 | ||
95 | #define TCQ_PRIO_BANDS 16 | 95 | #define TCQ_PRIO_BANDS 16 |
96 | #define TCQ_MIN_PRIO_BANDS 2 | ||
96 | 97 | ||
97 | struct tc_prio_qopt | 98 | struct tc_prio_qopt |
98 | { | 99 | { |
@@ -169,6 +170,7 @@ struct tc_red_qopt | |||
169 | unsigned char Scell_log; /* cell size for idle damping */ | 170 | unsigned char Scell_log; /* cell size for idle damping */ |
170 | unsigned char flags; | 171 | unsigned char flags; |
171 | #define TC_RED_ECN 1 | 172 | #define TC_RED_ECN 1 |
173 | #define TC_RED_HARDDROP 2 | ||
172 | }; | 174 | }; |
173 | 175 | ||
174 | struct tc_red_xstats | 176 | struct tc_red_xstats |
@@ -194,38 +196,34 @@ enum | |||
194 | 196 | ||
195 | #define TCA_GRED_MAX (__TCA_GRED_MAX - 1) | 197 | #define TCA_GRED_MAX (__TCA_GRED_MAX - 1) |
196 | 198 | ||
197 | #define TCA_SET_OFF TCA_GRED_PARMS | ||
198 | struct tc_gred_qopt | 199 | struct tc_gred_qopt |
199 | { | 200 | { |
200 | __u32 limit; /* HARD maximal queue length (bytes) | 201 | __u32 limit; /* HARD maximal queue length (bytes) */ |
201 | */ | 202 | __u32 qth_min; /* Min average length threshold (bytes) */ |
202 | __u32 qth_min; /* Min average length threshold (bytes) | 203 | __u32 qth_max; /* Max average length threshold (bytes) */ |
203 | */ | 204 | __u32 DP; /* upto 2^32 DPs */ |
204 | __u32 qth_max; /* Max average length threshold (bytes) | 205 | __u32 backlog; |
205 | */ | 206 | __u32 qave; |
206 | __u32 DP; /* upto 2^32 DPs */ | 207 | __u32 forced; |
207 | __u32 backlog; | 208 | __u32 early; |
208 | __u32 qave; | 209 | __u32 other; |
209 | __u32 forced; | 210 | __u32 pdrop; |
210 | __u32 early; | 211 | __u8 Wlog; /* log(W) */ |
211 | __u32 other; | 212 | __u8 Plog; /* log(P_max/(qth_max-qth_min)) */ |
212 | __u32 pdrop; | 213 | __u8 Scell_log; /* cell size for idle damping */ |
213 | 214 | __u8 prio; /* prio of this VQ */ | |
214 | unsigned char Wlog; /* log(W) */ | 215 | __u32 packets; |
215 | unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ | 216 | __u32 bytesin; |
216 | unsigned char Scell_log; /* cell size for idle damping */ | ||
217 | __u8 prio; /* prio of this VQ */ | ||
218 | __u32 packets; | ||
219 | __u32 bytesin; | ||
220 | }; | 217 | }; |
218 | |||
221 | /* gred setup */ | 219 | /* gred setup */ |
222 | struct tc_gred_sopt | 220 | struct tc_gred_sopt |
223 | { | 221 | { |
224 | __u32 DPs; | 222 | __u32 DPs; |
225 | __u32 def_DP; | 223 | __u32 def_DP; |
226 | __u8 grio; | 224 | __u8 grio; |
227 | __u8 pad1; | 225 | __u8 flags; |
228 | __u16 pad2; | 226 | __u16 pad1; |
229 | }; | 227 | }; |
230 | 228 | ||
231 | /* HTB section */ | 229 | /* HTB section */ |
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4286d832166f..fdfb8fe8c38c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h | |||
@@ -603,23 +603,23 @@ static inline void skb_queue_head_init(struct sk_buff_head *list) | |||
603 | */ | 603 | */ |
604 | 604 | ||
605 | /** | 605 | /** |
606 | * __skb_queue_head - queue a buffer at the list head | 606 | * __skb_queue_after - queue a buffer after a given buffer |
607 | * @list: list to use | 607 | * @list: list to use |
608 | * @prev: place after this buffer | ||
608 | * @newsk: buffer to queue | 609 | * @newsk: buffer to queue |
609 | * | 610 | * |
610 | * Queue a buffer at the start of a list. This function takes no locks | 611 | * Queue a buffer in the middle of a list. This function takes no locks |
611 | * and you must therefore hold required locks before calling it. | 612 | * and you must therefore hold required locks before calling it. |
612 | * | 613 | * |
613 | * A buffer cannot be placed on two lists at the same time. | 614 | * A buffer cannot be placed on two lists at the same time. |
614 | */ | 615 | */ |
615 | extern void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk); | 616 | static inline void __skb_queue_after(struct sk_buff_head *list, |
616 | static inline void __skb_queue_head(struct sk_buff_head *list, | 617 | struct sk_buff *prev, |
617 | struct sk_buff *newsk) | 618 | struct sk_buff *newsk) |
618 | { | 619 | { |
619 | struct sk_buff *prev, *next; | 620 | struct sk_buff *next; |
620 | |||
621 | list->qlen++; | 621 | list->qlen++; |
622 | prev = (struct sk_buff *)list; | 622 | |
623 | next = prev->next; | 623 | next = prev->next; |
624 | newsk->next = next; | 624 | newsk->next = next; |
625 | newsk->prev = prev; | 625 | newsk->prev = prev; |
@@ -627,6 +627,23 @@ static inline void __skb_queue_head(struct sk_buff_head *list, | |||
627 | } | 627 | } |
628 | 628 | ||
629 | /** | 629 | /** |
630 | * __skb_queue_head - queue a buffer at the list head | ||
631 | * @list: list to use | ||
632 | * @newsk: buffer to queue | ||
633 | * | ||
634 | * Queue a buffer at the start of a list. This function takes no locks | ||
635 | * and you must therefore hold required locks before calling it. | ||
636 | * | ||
637 | * A buffer cannot be placed on two lists at the same time. | ||
638 | */ | ||
639 | extern void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk); | ||
640 | static inline void __skb_queue_head(struct sk_buff_head *list, | ||
641 | struct sk_buff *newsk) | ||
642 | { | ||
643 | __skb_queue_after(list, (struct sk_buff *)list, newsk); | ||
644 | } | ||
645 | |||
646 | /** | ||
630 | * __skb_queue_tail - queue a buffer at the list tail | 647 | * __skb_queue_tail - queue a buffer at the list tail |
631 | * @list: list to use | 648 | * @list: list to use |
632 | * @newsk: buffer to queue | 649 | * @newsk: buffer to queue |
@@ -1203,6 +1220,11 @@ static inline void kunmap_skb_frag(void *vaddr) | |||
1203 | prefetch(skb->next), (skb != (struct sk_buff *)(queue)); \ | 1220 | prefetch(skb->next), (skb != (struct sk_buff *)(queue)); \ |
1204 | skb = skb->next) | 1221 | skb = skb->next) |
1205 | 1222 | ||
1223 | #define skb_queue_reverse_walk(queue, skb) \ | ||
1224 | for (skb = (queue)->prev; \ | ||
1225 | prefetch(skb->prev), (skb != (struct sk_buff *)(queue)); \ | ||
1226 | skb = skb->prev) | ||
1227 | |||
1206 | 1228 | ||
1207 | extern struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, | 1229 | extern struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, |
1208 | int noblock, int *err); | 1230 | int noblock, int *err); |
diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index f87845e2e965..b0c47e2eccf1 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #define _INET_ECN_H_ | 2 | #define _INET_ECN_H_ |
3 | 3 | ||
4 | #include <linux/ip.h> | 4 | #include <linux/ip.h> |
5 | #include <linux/skbuff.h> | ||
5 | #include <net/dsfield.h> | 6 | #include <net/dsfield.h> |
6 | 7 | ||
7 | enum { | 8 | enum { |
@@ -48,7 +49,7 @@ static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner) | |||
48 | (label) |= __constant_htons(INET_ECN_ECT_0 << 4); \ | 49 | (label) |= __constant_htons(INET_ECN_ECT_0 << 4); \ |
49 | } while (0) | 50 | } while (0) |
50 | 51 | ||
51 | static inline void IP_ECN_set_ce(struct iphdr *iph) | 52 | static inline int IP_ECN_set_ce(struct iphdr *iph) |
52 | { | 53 | { |
53 | u32 check = iph->check; | 54 | u32 check = iph->check; |
54 | u32 ecn = (iph->tos + 1) & INET_ECN_MASK; | 55 | u32 ecn = (iph->tos + 1) & INET_ECN_MASK; |
@@ -61,7 +62,7 @@ static inline void IP_ECN_set_ce(struct iphdr *iph) | |||
61 | * INET_ECN_CE => 00 | 62 | * INET_ECN_CE => 00 |
62 | */ | 63 | */ |
63 | if (!(ecn & 2)) | 64 | if (!(ecn & 2)) |
64 | return; | 65 | return !ecn; |
65 | 66 | ||
66 | /* | 67 | /* |
67 | * The following gives us: | 68 | * The following gives us: |
@@ -72,6 +73,7 @@ static inline void IP_ECN_set_ce(struct iphdr *iph) | |||
72 | 73 | ||
73 | iph->check = check + (check>=0xFFFF); | 74 | iph->check = check + (check>=0xFFFF); |
74 | iph->tos |= INET_ECN_CE; | 75 | iph->tos |= INET_ECN_CE; |
76 | return 1; | ||
75 | } | 77 | } |
76 | 78 | ||
77 | static inline void IP_ECN_clear(struct iphdr *iph) | 79 | static inline void IP_ECN_clear(struct iphdr *iph) |
@@ -87,11 +89,12 @@ static inline void ipv4_copy_dscp(struct iphdr *outer, struct iphdr *inner) | |||
87 | 89 | ||
88 | struct ipv6hdr; | 90 | struct ipv6hdr; |
89 | 91 | ||
90 | static inline void IP6_ECN_set_ce(struct ipv6hdr *iph) | 92 | static inline int IP6_ECN_set_ce(struct ipv6hdr *iph) |
91 | { | 93 | { |
92 | if (INET_ECN_is_not_ect(ipv6_get_dsfield(iph))) | 94 | if (INET_ECN_is_not_ect(ipv6_get_dsfield(iph))) |
93 | return; | 95 | return 0; |
94 | *(u32*)iph |= htonl(INET_ECN_CE << 20); | 96 | *(u32*)iph |= htonl(INET_ECN_CE << 20); |
97 | return 1; | ||
95 | } | 98 | } |
96 | 99 | ||
97 | static inline void IP6_ECN_clear(struct ipv6hdr *iph) | 100 | static inline void IP6_ECN_clear(struct ipv6hdr *iph) |
@@ -105,4 +108,21 @@ static inline void ipv6_copy_dscp(struct ipv6hdr *outer, struct ipv6hdr *inner) | |||
105 | ipv6_change_dsfield(inner, INET_ECN_MASK, dscp); | 108 | ipv6_change_dsfield(inner, INET_ECN_MASK, dscp); |
106 | } | 109 | } |
107 | 110 | ||
111 | static inline int INET_ECN_set_ce(struct sk_buff *skb) | ||
112 | { | ||
113 | switch (skb->protocol) { | ||
114 | case __constant_htons(ETH_P_IP): | ||
115 | if (skb->nh.raw + sizeof(struct iphdr) <= skb->tail) | ||
116 | return IP_ECN_set_ce(skb->nh.iph); | ||
117 | break; | ||
118 | |||
119 | case __constant_htons(ETH_P_IPV6): | ||
120 | if (skb->nh.raw + sizeof(struct ipv6hdr) <= skb->tail) | ||
121 | return IP6_ECN_set_ce(skb->nh.ipv6h); | ||
122 | break; | ||
123 | } | ||
124 | |||
125 | return 0; | ||
126 | } | ||
127 | |||
108 | #endif | 128 | #endif |
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index f50f95968340..07840baa9341 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h | |||
@@ -125,9 +125,7 @@ struct inet_hashinfo { | |||
125 | rwlock_t lhash_lock ____cacheline_aligned; | 125 | rwlock_t lhash_lock ____cacheline_aligned; |
126 | atomic_t lhash_users; | 126 | atomic_t lhash_users; |
127 | wait_queue_head_t lhash_wait; | 127 | wait_queue_head_t lhash_wait; |
128 | spinlock_t portalloc_lock; | ||
129 | kmem_cache_t *bind_bucket_cachep; | 128 | kmem_cache_t *bind_bucket_cachep; |
130 | int port_rover; | ||
131 | }; | 129 | }; |
132 | 130 | ||
133 | static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport, | 131 | static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport, |
diff --git a/include/net/red.h b/include/net/red.h new file mode 100644 index 000000000000..2ed4358e3295 --- /dev/null +++ b/include/net/red.h | |||
@@ -0,0 +1,325 @@ | |||
1 | #ifndef __NET_SCHED_RED_H | ||
2 | #define __NET_SCHED_RED_H | ||
3 | |||
4 | #include <linux/config.h> | ||
5 | #include <linux/types.h> | ||
6 | #include <net/pkt_sched.h> | ||
7 | #include <net/inet_ecn.h> | ||
8 | #include <net/dsfield.h> | ||
9 | |||
10 | /* Random Early Detection (RED) algorithm. | ||
11 | ======================================= | ||
12 | |||
13 | Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways | ||
14 | for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking. | ||
15 | |||
16 | This file codes a "divisionless" version of RED algorithm | ||
17 | as written down in Fig.17 of the paper. | ||
18 | |||
19 | Short description. | ||
20 | ------------------ | ||
21 | |||
22 | When a new packet arrives we calculate the average queue length: | ||
23 | |||
24 | avg = (1-W)*avg + W*current_queue_len, | ||
25 | |||
26 | W is the filter time constant (chosen as 2^(-Wlog)), it controls | ||
27 | the inertia of the algorithm. To allow larger bursts, W should be | ||
28 | decreased. | ||
29 | |||
30 | if (avg > th_max) -> packet marked (dropped). | ||
31 | if (avg < th_min) -> packet passes. | ||
32 | if (th_min < avg < th_max) we calculate probability: | ||
33 | |||
34 | Pb = max_P * (avg - th_min)/(th_max-th_min) | ||
35 | |||
36 | and mark (drop) packet with this probability. | ||
37 | Pb changes from 0 (at avg==th_min) to max_P (avg==th_max). | ||
38 | max_P should be small (not 1), usually 0.01..0.02 is good value. | ||
39 | |||
40 | max_P is chosen as a number, so that max_P/(th_max-th_min) | ||
41 | is a negative power of two, so that the arithmetic contains | ||
42 | only shifts. | ||
43 | |||
44 | |||
45 | Parameters, settable by user: | ||
46 | ----------------------------- | ||
47 | |||
48 | qth_min - bytes (should be < qth_max/2) | ||
49 | qth_max - bytes (should be at least 2*qth_min and less than limit) | ||
50 | Wlog - bits (<32) log(1/W). | ||
51 | Plog - bits (<32) | ||
52 | |||
53 | Plog is related to max_P by formula: | ||
54 | |||
55 | max_P = (qth_max-qth_min)/2^Plog; | ||
56 | |||
57 | F.e. if qth_max=128K and qth_min=32K, then Plog=22 | ||
58 | corresponds to max_P=0.02 | ||
59 | |||
60 | Scell_log | ||
61 | Stab | ||
62 | |||
63 | Lookup table for log((1-W)^(t/t_ave)). | ||
64 | |||
65 | |||
66 | NOTES: | ||
67 | |||
68 | Upper bound on W. | ||
69 | ----------------- | ||
70 | |||
71 | If you want to allow bursts of L packets of size S, | ||
72 | you should choose W: | ||
73 | |||
74 | L + 1 - th_min/S < (1-(1-W)^L)/W | ||
75 | |||
76 | th_min/S = 32 th_min/S = 4 | ||
77 | |||
78 | log(W) L | ||
79 | -1 33 | ||
80 | -2 35 | ||
81 | -3 39 | ||
82 | -4 46 | ||
83 | -5 57 | ||
84 | -6 75 | ||
85 | -7 101 | ||
86 | -8 135 | ||
87 | -9 190 | ||
88 | etc. | ||
89 | */ | ||
90 | |||
91 | #define RED_STAB_SIZE 256 | ||
92 | #define RED_STAB_MASK (RED_STAB_SIZE - 1) | ||
93 | |||
94 | struct red_stats | ||
95 | { | ||
96 | u32 prob_drop; /* Early probability drops */ | ||
97 | u32 prob_mark; /* Early probability marks */ | ||
98 | u32 forced_drop; /* Forced drops, qavg > max_thresh */ | ||
99 | u32 forced_mark; /* Forced marks, qavg > max_thresh */ | ||
100 | u32 pdrop; /* Drops due to queue limits */ | ||
101 | u32 other; /* Drops due to drop() calls */ | ||
102 | u32 backlog; | ||
103 | }; | ||
104 | |||
105 | struct red_parms | ||
106 | { | ||
107 | /* Parameters */ | ||
108 | u32 qth_min; /* Min avg length threshold: A scaled */ | ||
109 | u32 qth_max; /* Max avg length threshold: A scaled */ | ||
110 | u32 Scell_max; | ||
111 | u32 Rmask; /* Cached random mask, see red_rmask */ | ||
112 | u8 Scell_log; | ||
113 | u8 Wlog; /* log(W) */ | ||
114 | u8 Plog; /* random number bits */ | ||
115 | u8 Stab[RED_STAB_SIZE]; | ||
116 | |||
117 | /* Variables */ | ||
118 | int qcount; /* Number of packets since last random | ||
119 | number generation */ | ||
120 | u32 qR; /* Cached random number */ | ||
121 | |||
122 | unsigned long qavg; /* Average queue length: A scaled */ | ||
123 | psched_time_t qidlestart; /* Start of current idle period */ | ||
124 | }; | ||
125 | |||
126 | static inline u32 red_rmask(u8 Plog) | ||
127 | { | ||
128 | return Plog < 32 ? ((1 << Plog) - 1) : ~0UL; | ||
129 | } | ||
130 | |||
131 | static inline void red_set_parms(struct red_parms *p, | ||
132 | u32 qth_min, u32 qth_max, u8 Wlog, u8 Plog, | ||
133 | u8 Scell_log, u8 *stab) | ||
134 | { | ||
135 | /* Reset average queue length, the value is strictly bound | ||
136 | * to the parameters below, resetting hurts a bit but leaving | ||
137 | * it might result in an unreasonable qavg for a while. --TGR | ||
138 | */ | ||
139 | p->qavg = 0; | ||
140 | |||
141 | p->qcount = -1; | ||
142 | p->qth_min = qth_min << Wlog; | ||
143 | p->qth_max = qth_max << Wlog; | ||
144 | p->Wlog = Wlog; | ||
145 | p->Plog = Plog; | ||
146 | p->Rmask = red_rmask(Plog); | ||
147 | p->Scell_log = Scell_log; | ||
148 | p->Scell_max = (255 << Scell_log); | ||
149 | |||
150 | memcpy(p->Stab, stab, sizeof(p->Stab)); | ||
151 | } | ||
152 | |||
153 | static inline int red_is_idling(struct red_parms *p) | ||
154 | { | ||
155 | return !PSCHED_IS_PASTPERFECT(p->qidlestart); | ||
156 | } | ||
157 | |||
158 | static inline void red_start_of_idle_period(struct red_parms *p) | ||
159 | { | ||
160 | PSCHED_GET_TIME(p->qidlestart); | ||
161 | } | ||
162 | |||
163 | static inline void red_end_of_idle_period(struct red_parms *p) | ||
164 | { | ||
165 | PSCHED_SET_PASTPERFECT(p->qidlestart); | ||
166 | } | ||
167 | |||
168 | static inline void red_restart(struct red_parms *p) | ||
169 | { | ||
170 | red_end_of_idle_period(p); | ||
171 | p->qavg = 0; | ||
172 | p->qcount = -1; | ||
173 | } | ||
174 | |||
175 | static inline unsigned long red_calc_qavg_from_idle_time(struct red_parms *p) | ||
176 | { | ||
177 | psched_time_t now; | ||
178 | long us_idle; | ||
179 | int shift; | ||
180 | |||
181 | PSCHED_GET_TIME(now); | ||
182 | us_idle = PSCHED_TDIFF_SAFE(now, p->qidlestart, p->Scell_max); | ||
183 | |||
184 | /* | ||
185 | * The problem: ideally, average queue length recalculation should | ||
186 | * be done over constant clock intervals. This is too expensive, so | ||
187 | * that the calculation is driven by outgoing packets. | ||
188 | * When the queue is idle we have to model this clock by hand. | ||
189 | * | ||
190 | * SF+VJ proposed to "generate": | ||
191 | * | ||
192 | * m = idletime / (average_pkt_size / bandwidth) | ||
193 | * | ||
194 | * dummy packets as a burst after idle time, i.e. | ||
195 | * | ||
196 | * p->qavg *= (1-W)^m | ||
197 | * | ||
198 | * This is an apparently overcomplicated solution (f.e. we have to | ||
199 | * precompute a table to make this calculation in reasonable time) | ||
200 | * I believe that a simpler model may be used here, | ||
201 | * but it is field for experiments. | ||
202 | */ | ||
203 | |||
204 | shift = p->Stab[(us_idle >> p->Scell_log) & RED_STAB_MASK]; | ||
205 | |||
206 | if (shift) | ||
207 | return p->qavg >> shift; | ||
208 | else { | ||
209 | /* Approximate initial part of exponent with linear function: | ||
210 | * | ||
211 | * (1-W)^m ~= 1-mW + ... | ||
212 | * | ||
213 | * Seems, it is the best solution to | ||
214 | * problem of too coarse exponent tabulation. | ||
215 | */ | ||
216 | us_idle = (p->qavg * us_idle) >> p->Scell_log; | ||
217 | |||
218 | if (us_idle < (p->qavg >> 1)) | ||
219 | return p->qavg - us_idle; | ||
220 | else | ||
221 | return p->qavg >> 1; | ||
222 | } | ||
223 | } | ||
224 | |||
225 | static inline unsigned long red_calc_qavg_no_idle_time(struct red_parms *p, | ||
226 | unsigned int backlog) | ||
227 | { | ||
228 | /* | ||
229 | * NOTE: p->qavg is fixed point number with point at Wlog. | ||
230 | * The formula below is equivalent to floating point | ||
231 | * version: | ||
232 | * | ||
233 | * qavg = qavg*(1-W) + backlog*W; | ||
234 | * | ||
235 | * --ANK (980924) | ||
236 | */ | ||
237 | return p->qavg + (backlog - (p->qavg >> p->Wlog)); | ||
238 | } | ||
239 | |||
240 | static inline unsigned long red_calc_qavg(struct red_parms *p, | ||
241 | unsigned int backlog) | ||
242 | { | ||
243 | if (!red_is_idling(p)) | ||
244 | return red_calc_qavg_no_idle_time(p, backlog); | ||
245 | else | ||
246 | return red_calc_qavg_from_idle_time(p); | ||
247 | } | ||
248 | |||
249 | static inline u32 red_random(struct red_parms *p) | ||
250 | { | ||
251 | return net_random() & p->Rmask; | ||
252 | } | ||
253 | |||
254 | static inline int red_mark_probability(struct red_parms *p, unsigned long qavg) | ||
255 | { | ||
256 | /* The formula used below causes questions. | ||
257 | |||
258 | OK. qR is a random number in the interval 0..Rmask, | ||
259 | i.e. 0..(2^Plog - 1). If we used floating point | ||
260 | arithmetic, it would be: (2^Plog)*rnd_num, | ||
261 | where rnd_num is less than 1. | ||
262 | |||
263 | Taking into account, that qavg have fixed | ||
264 | point at Wlog, and Plog is related to max_P by | ||
265 | max_P = (qth_max-qth_min)/2^Plog; two lines | ||
266 | below have the following floating point equivalent: | ||
267 | |||
268 | max_P*(qavg - qth_min)/(qth_max-qth_min) < rnd/qcount | ||
269 | |||
270 | Any questions? --ANK (980924) | ||
271 | */ | ||
272 | return !(((qavg - p->qth_min) >> p->Wlog) * p->qcount < p->qR); | ||
273 | } | ||
274 | |||
275 | enum { | ||
276 | RED_BELOW_MIN_THRESH, | ||
277 | RED_BETWEEN_TRESH, | ||
278 | RED_ABOVE_MAX_TRESH, | ||
279 | }; | ||
280 | |||
281 | static inline int red_cmp_thresh(struct red_parms *p, unsigned long qavg) | ||
282 | { | ||
283 | if (qavg < p->qth_min) | ||
284 | return RED_BELOW_MIN_THRESH; | ||
285 | else if (qavg >= p->qth_max) | ||
286 | return RED_ABOVE_MAX_TRESH; | ||
287 | else | ||
288 | return RED_BETWEEN_TRESH; | ||
289 | } | ||
290 | |||
291 | enum { | ||
292 | RED_DONT_MARK, | ||
293 | RED_PROB_MARK, | ||
294 | RED_HARD_MARK, | ||
295 | }; | ||
296 | |||
297 | static inline int red_action(struct red_parms *p, unsigned long qavg) | ||
298 | { | ||
299 | switch (red_cmp_thresh(p, qavg)) { | ||
300 | case RED_BELOW_MIN_THRESH: | ||
301 | p->qcount = -1; | ||
302 | return RED_DONT_MARK; | ||
303 | |||
304 | case RED_BETWEEN_TRESH: | ||
305 | if (++p->qcount) { | ||
306 | if (red_mark_probability(p, qavg)) { | ||
307 | p->qcount = 0; | ||
308 | p->qR = red_random(p); | ||
309 | return RED_PROB_MARK; | ||
310 | } | ||
311 | } else | ||
312 | p->qR = red_random(p); | ||
313 | |||
314 | return RED_DONT_MARK; | ||
315 | |||
316 | case RED_ABOVE_MAX_TRESH: | ||
317 | p->qcount = -1; | ||
318 | return RED_HARD_MARK; | ||
319 | } | ||
320 | |||
321 | BUG(); | ||
322 | return RED_DONT_MARK; | ||
323 | } | ||
324 | |||
325 | #endif | ||
diff --git a/net/core/stream.c b/net/core/stream.c index ac9edfdf8742..15bfd03e8024 100644 --- a/net/core/stream.c +++ b/net/core/stream.c | |||
@@ -52,8 +52,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) | |||
52 | { | 52 | { |
53 | struct task_struct *tsk = current; | 53 | struct task_struct *tsk = current; |
54 | DEFINE_WAIT(wait); | 54 | DEFINE_WAIT(wait); |
55 | int done; | ||
55 | 56 | ||
56 | while (1) { | 57 | do { |
57 | if (sk->sk_err) | 58 | if (sk->sk_err) |
58 | return sock_error(sk); | 59 | return sock_error(sk); |
59 | if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) | 60 | if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) |
@@ -65,13 +66,12 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) | |||
65 | 66 | ||
66 | prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); | 67 | prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); |
67 | sk->sk_write_pending++; | 68 | sk->sk_write_pending++; |
68 | if (sk_wait_event(sk, timeo_p, | 69 | done = sk_wait_event(sk, timeo_p, |
69 | !((1 << sk->sk_state) & | 70 | !((1 << sk->sk_state) & |
70 | ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))) | 71 | ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); |
71 | break; | ||
72 | finish_wait(sk->sk_sleep, &wait); | 72 | finish_wait(sk->sk_sleep, &wait); |
73 | sk->sk_write_pending--; | 73 | sk->sk_write_pending--; |
74 | } | 74 | } while (!done); |
75 | return 0; | 75 | return 0; |
76 | } | 76 | } |
77 | 77 | ||
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 6298cf58ff9e..4b9bc81ae1a3 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c | |||
@@ -31,8 +31,6 @@ struct inet_hashinfo __cacheline_aligned dccp_hashinfo = { | |||
31 | .lhash_lock = RW_LOCK_UNLOCKED, | 31 | .lhash_lock = RW_LOCK_UNLOCKED, |
32 | .lhash_users = ATOMIC_INIT(0), | 32 | .lhash_users = ATOMIC_INIT(0), |
33 | .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait), | 33 | .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait), |
34 | .portalloc_lock = SPIN_LOCK_UNLOCKED, | ||
35 | .port_rover = 1024 - 1, | ||
36 | }; | 34 | }; |
37 | 35 | ||
38 | EXPORT_SYMBOL_GPL(dccp_hashinfo); | 36 | EXPORT_SYMBOL_GPL(dccp_hashinfo); |
@@ -125,36 +123,15 @@ static int dccp_v4_hash_connect(struct sock *sk) | |||
125 | int ret; | 123 | int ret; |
126 | 124 | ||
127 | if (snum == 0) { | 125 | if (snum == 0) { |
128 | int rover; | ||
129 | int low = sysctl_local_port_range[0]; | 126 | int low = sysctl_local_port_range[0]; |
130 | int high = sysctl_local_port_range[1]; | 127 | int high = sysctl_local_port_range[1]; |
131 | int remaining = (high - low) + 1; | 128 | int remaining = (high - low) + 1; |
129 | int rover = net_random() % (high - low) + low; | ||
132 | struct hlist_node *node; | 130 | struct hlist_node *node; |
133 | struct inet_timewait_sock *tw = NULL; | 131 | struct inet_timewait_sock *tw = NULL; |
134 | 132 | ||
135 | local_bh_disable(); | 133 | local_bh_disable(); |
136 | |||
137 | /* TODO. Actually it is not so bad idea to remove | ||
138 | * dccp_hashinfo.portalloc_lock before next submission to | ||
139 | * Linus. | ||
140 | * As soon as we touch this place at all it is time to think. | ||
141 | * | ||
142 | * Now it protects single _advisory_ variable | ||
143 | * dccp_hashinfo.port_rover, hence it is mostly useless. | ||
144 | * Code will work nicely if we just delete it, but | ||
145 | * I am afraid in contented case it will work not better or | ||
146 | * even worse: another cpu just will hit the same bucket | ||
147 | * and spin there. | ||
148 | * So some cpu salt could remove both contention and | ||
149 | * memory pingpong. Any ideas how to do this in a nice way? | ||
150 | */ | ||
151 | spin_lock(&dccp_hashinfo.portalloc_lock); | ||
152 | rover = dccp_hashinfo.port_rover; | ||
153 | |||
154 | do { | 134 | do { |
155 | rover++; | ||
156 | if ((rover < low) || (rover > high)) | ||
157 | rover = low; | ||
158 | head = &dccp_hashinfo.bhash[inet_bhashfn(rover, | 135 | head = &dccp_hashinfo.bhash[inet_bhashfn(rover, |
159 | dccp_hashinfo.bhash_size)]; | 136 | dccp_hashinfo.bhash_size)]; |
160 | spin_lock(&head->lock); | 137 | spin_lock(&head->lock); |
@@ -187,9 +164,9 @@ static int dccp_v4_hash_connect(struct sock *sk) | |||
187 | 164 | ||
188 | next_port: | 165 | next_port: |
189 | spin_unlock(&head->lock); | 166 | spin_unlock(&head->lock); |
167 | if (++rover > high) | ||
168 | rover = low; | ||
190 | } while (--remaining > 0); | 169 | } while (--remaining > 0); |
191 | dccp_hashinfo.port_rover = rover; | ||
192 | spin_unlock(&dccp_hashinfo.portalloc_lock); | ||
193 | 170 | ||
194 | local_bh_enable(); | 171 | local_bh_enable(); |
195 | 172 | ||
@@ -197,9 +174,6 @@ static int dccp_v4_hash_connect(struct sock *sk) | |||
197 | 174 | ||
198 | ok: | 175 | ok: |
199 | /* All locks still held and bhs disabled */ | 176 | /* All locks still held and bhs disabled */ |
200 | dccp_hashinfo.port_rover = rover; | ||
201 | spin_unlock(&dccp_hashinfo.portalloc_lock); | ||
202 | |||
203 | inet_bind_hash(sk, tb, rover); | 177 | inet_bind_hash(sk, tb, rover); |
204 | if (sk_unhashed(sk)) { | 178 | if (sk_unhashed(sk)) { |
205 | inet_sk(sk)->sport = htons(rover); | 179 | inet_sk(sk)->sport = htons(rover); |
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 94468a76c5b4..3fe021f1a566 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c | |||
@@ -78,17 +78,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo, | |||
78 | int low = sysctl_local_port_range[0]; | 78 | int low = sysctl_local_port_range[0]; |
79 | int high = sysctl_local_port_range[1]; | 79 | int high = sysctl_local_port_range[1]; |
80 | int remaining = (high - low) + 1; | 80 | int remaining = (high - low) + 1; |
81 | int rover; | 81 | int rover = net_random() % (high - low) + low; |
82 | 82 | ||
83 | spin_lock(&hashinfo->portalloc_lock); | ||
84 | if (hashinfo->port_rover < low) | ||
85 | rover = low; | ||
86 | else | ||
87 | rover = hashinfo->port_rover; | ||
88 | do { | 83 | do { |
89 | rover++; | ||
90 | if (rover > high) | ||
91 | rover = low; | ||
92 | head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; | 84 | head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; |
93 | spin_lock(&head->lock); | 85 | spin_lock(&head->lock); |
94 | inet_bind_bucket_for_each(tb, node, &head->chain) | 86 | inet_bind_bucket_for_each(tb, node, &head->chain) |
@@ -97,9 +89,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo, | |||
97 | break; | 89 | break; |
98 | next: | 90 | next: |
99 | spin_unlock(&head->lock); | 91 | spin_unlock(&head->lock); |
92 | if (++rover > high) | ||
93 | rover = low; | ||
100 | } while (--remaining > 0); | 94 | } while (--remaining > 0); |
101 | hashinfo->port_rover = rover; | ||
102 | spin_unlock(&hashinfo->portalloc_lock); | ||
103 | 95 | ||
104 | /* Exhausted local port range during search? It is not | 96 | /* Exhausted local port range during search? It is not |
105 | * possible for us to be holding one of the bind hash | 97 | * possible for us to be holding one of the bind hash |
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c index 926a6684643d..4108a5e12b3c 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c | |||
@@ -270,14 +270,10 @@ exp_gre(struct ip_conntrack *master, | |||
270 | exp_orig->expectfn = pptp_expectfn; | 270 | exp_orig->expectfn = pptp_expectfn; |
271 | exp_orig->flags = 0; | 271 | exp_orig->flags = 0; |
272 | 272 | ||
273 | exp_orig->dir = IP_CT_DIR_ORIGINAL; | ||
274 | |||
275 | /* both expectations are identical apart from tuple */ | 273 | /* both expectations are identical apart from tuple */ |
276 | memcpy(exp_reply, exp_orig, sizeof(*exp_reply)); | 274 | memcpy(exp_reply, exp_orig, sizeof(*exp_reply)); |
277 | memcpy(&exp_reply->tuple, &exp_tuples[1], sizeof(exp_reply->tuple)); | 275 | memcpy(&exp_reply->tuple, &exp_tuples[1], sizeof(exp_reply->tuple)); |
278 | 276 | ||
279 | exp_reply->dir = !exp_orig->dir; | ||
280 | |||
281 | if (ip_nat_pptp_hook_exp_gre) | 277 | if (ip_nat_pptp_hook_exp_gre) |
282 | ret = ip_nat_pptp_hook_exp_gre(exp_orig, exp_reply); | 278 | ret = ip_nat_pptp_hook_exp_gre(exp_orig, exp_reply); |
283 | else { | 279 | else { |
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 166e6069f121..82a65043a8ef 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c | |||
@@ -815,7 +815,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, | |||
815 | IPCTNL_MSG_CT_NEW, 1, ct); | 815 | IPCTNL_MSG_CT_NEW, 1, ct); |
816 | ip_conntrack_put(ct); | 816 | ip_conntrack_put(ct); |
817 | if (err <= 0) | 817 | if (err <= 0) |
818 | goto out; | 818 | goto free; |
819 | 819 | ||
820 | err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); | 820 | err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); |
821 | if (err < 0) | 821 | if (err < 0) |
@@ -824,9 +824,9 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, | |||
824 | DEBUGP("leaving\n"); | 824 | DEBUGP("leaving\n"); |
825 | return 0; | 825 | return 0; |
826 | 826 | ||
827 | free: | ||
828 | kfree_skb(skb2); | ||
827 | out: | 829 | out: |
828 | if (skb2) | ||
829 | kfree_skb(skb2); | ||
830 | return -1; | 830 | return -1; |
831 | } | 831 | } |
832 | 832 | ||
@@ -1322,21 +1322,16 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, | |||
1322 | nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, | 1322 | nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, |
1323 | 1, exp); | 1323 | 1, exp); |
1324 | if (err <= 0) | 1324 | if (err <= 0) |
1325 | goto out; | 1325 | goto free; |
1326 | 1326 | ||
1327 | ip_conntrack_expect_put(exp); | 1327 | ip_conntrack_expect_put(exp); |
1328 | 1328 | ||
1329 | err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); | 1329 | return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); |
1330 | if (err < 0) | ||
1331 | goto free; | ||
1332 | |||
1333 | return err; | ||
1334 | 1330 | ||
1331 | free: | ||
1332 | kfree_skb(skb2); | ||
1335 | out: | 1333 | out: |
1336 | ip_conntrack_expect_put(exp); | 1334 | ip_conntrack_expect_put(exp); |
1337 | free: | ||
1338 | if (skb2) | ||
1339 | kfree_skb(skb2); | ||
1340 | return err; | 1335 | return err; |
1341 | } | 1336 | } |
1342 | 1337 | ||
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index c5e3abd24672..762f4d93936b 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c | |||
@@ -66,10 +66,8 @@ ip_nat_proto_find_get(u_int8_t protonum) | |||
66 | * removed until we've grabbed the reference */ | 66 | * removed until we've grabbed the reference */ |
67 | preempt_disable(); | 67 | preempt_disable(); |
68 | p = __ip_nat_proto_find(protonum); | 68 | p = __ip_nat_proto_find(protonum); |
69 | if (p) { | 69 | if (!try_module_get(p->me)) |
70 | if (!try_module_get(p->me)) | 70 | p = &ip_nat_unknown_protocol; |
71 | p = &ip_nat_unknown_protocol; | ||
72 | } | ||
73 | preempt_enable(); | 71 | preempt_enable(); |
74 | 72 | ||
75 | return p; | 73 | return p; |
diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c index 3cdd0684d30d..ee6ab74ad3a9 100644 --- a/net/ipv4/netfilter/ip_nat_helper_pptp.c +++ b/net/ipv4/netfilter/ip_nat_helper_pptp.c | |||
@@ -216,6 +216,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig, | |||
216 | expect_orig->saved_proto.gre.key = htons(nat_pptp_info->pac_call_id); | 216 | expect_orig->saved_proto.gre.key = htons(nat_pptp_info->pac_call_id); |
217 | expect_orig->tuple.src.u.gre.key = htons(nat_pptp_info->pns_call_id); | 217 | expect_orig->tuple.src.u.gre.key = htons(nat_pptp_info->pns_call_id); |
218 | expect_orig->tuple.dst.u.gre.key = htons(ct_pptp_info->pac_call_id); | 218 | expect_orig->tuple.dst.u.gre.key = htons(ct_pptp_info->pac_call_id); |
219 | expect_orig->dir = IP_CT_DIR_ORIGINAL; | ||
219 | inv_t.src.ip = reply_t->src.ip; | 220 | inv_t.src.ip = reply_t->src.ip; |
220 | inv_t.dst.ip = reply_t->dst.ip; | 221 | inv_t.dst.ip = reply_t->dst.ip; |
221 | inv_t.src.u.gre.key = htons(nat_pptp_info->pac_call_id); | 222 | inv_t.src.u.gre.key = htons(nat_pptp_info->pac_call_id); |
@@ -233,6 +234,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig, | |||
233 | expect_reply->saved_proto.gre.key = htons(nat_pptp_info->pns_call_id); | 234 | expect_reply->saved_proto.gre.key = htons(nat_pptp_info->pns_call_id); |
234 | expect_reply->tuple.src.u.gre.key = htons(nat_pptp_info->pac_call_id); | 235 | expect_reply->tuple.src.u.gre.key = htons(nat_pptp_info->pac_call_id); |
235 | expect_reply->tuple.dst.u.gre.key = htons(ct_pptp_info->pns_call_id); | 236 | expect_reply->tuple.dst.u.gre.key = htons(ct_pptp_info->pns_call_id); |
237 | expect_reply->dir = IP_CT_DIR_REPLY; | ||
236 | inv_t.src.ip = orig_t->src.ip; | 238 | inv_t.src.ip = orig_t->src.ip; |
237 | inv_t.dst.ip = orig_t->dst.ip; | 239 | inv_t.dst.ip = orig_t->dst.ip; |
238 | inv_t.src.u.gre.key = htons(nat_pptp_info->pns_call_id); | 240 | inv_t.src.u.gre.key = htons(nat_pptp_info->pns_call_id); |
diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c index 7c1285401672..f7cad7cf1aec 100644 --- a/net/ipv4/netfilter/ip_nat_proto_gre.c +++ b/net/ipv4/netfilter/ip_nat_proto_gre.c | |||
@@ -139,8 +139,8 @@ gre_manip_pkt(struct sk_buff **pskb, | |||
139 | break; | 139 | break; |
140 | case GRE_VERSION_PPTP: | 140 | case GRE_VERSION_PPTP: |
141 | DEBUGP("call_id -> 0x%04x\n", | 141 | DEBUGP("call_id -> 0x%04x\n", |
142 | ntohl(tuple->dst.u.gre.key)); | 142 | ntohs(tuple->dst.u.gre.key)); |
143 | pgreh->call_id = htons(ntohl(tuple->dst.u.gre.key)); | 143 | pgreh->call_id = tuple->dst.u.gre.key; |
144 | break; | 144 | break; |
145 | default: | 145 | default: |
146 | DEBUGP("can't nat unknown GRE version\n"); | 146 | DEBUGP("can't nat unknown GRE version\n"); |
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c index 99bbef56f84e..f0099a646a0b 100644 --- a/net/ipv4/netfilter/ip_nat_proto_unknown.c +++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c | |||
@@ -62,7 +62,7 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range) | |||
62 | 62 | ||
63 | struct ip_nat_protocol ip_nat_unknown_protocol = { | 63 | struct ip_nat_protocol ip_nat_unknown_protocol = { |
64 | .name = "unknown", | 64 | .name = "unknown", |
65 | .me = THIS_MODULE, | 65 | /* .me isn't set: getting a ref to this cannot fail. */ |
66 | .manip_pkt = unknown_manip_pkt, | 66 | .manip_pkt = unknown_manip_pkt, |
67 | .in_range = unknown_in_range, | 67 | .in_range = unknown_in_range, |
68 | .unique_tuple = unknown_unique_tuple, | 68 | .unique_tuple = unknown_unique_tuple, |
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c index 134638021339..05d66ab59424 100644 --- a/net/ipv4/netfilter/ipt_CONNMARK.c +++ b/net/ipv4/netfilter/ipt_CONNMARK.c | |||
@@ -109,6 +109,7 @@ static struct ipt_target ipt_connmark_reg = { | |||
109 | 109 | ||
110 | static int __init init(void) | 110 | static int __init init(void) |
111 | { | 111 | { |
112 | need_ip_conntrack(); | ||
112 | return ipt_register_target(&ipt_connmark_reg); | 113 | return ipt_register_target(&ipt_connmark_reg); |
113 | } | 114 | } |
114 | 115 | ||
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f3f0013a9580..72b7c22e1ea5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -2112,7 +2112,6 @@ void __init tcp_init(void) | |||
2112 | sysctl_tcp_max_orphans >>= (3 - order); | 2112 | sysctl_tcp_max_orphans >>= (3 - order); |
2113 | sysctl_max_syn_backlog = 128; | 2113 | sysctl_max_syn_backlog = 128; |
2114 | } | 2114 | } |
2115 | tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1; | ||
2116 | 2115 | ||
2117 | sysctl_tcp_mem[0] = 768 << order; | 2116 | sysctl_tcp_mem[0] = 768 << order; |
2118 | sysctl_tcp_mem[1] = 1024 << order; | 2117 | sysctl_tcp_mem[1] = 1024 << order; |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c85819d8474b..49d67cd75edd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -93,8 +93,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { | |||
93 | .lhash_lock = RW_LOCK_UNLOCKED, | 93 | .lhash_lock = RW_LOCK_UNLOCKED, |
94 | .lhash_users = ATOMIC_INIT(0), | 94 | .lhash_users = ATOMIC_INIT(0), |
95 | .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), | 95 | .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), |
96 | .portalloc_lock = SPIN_LOCK_UNLOCKED, | ||
97 | .port_rover = 1024 - 1, | ||
98 | }; | 96 | }; |
99 | 97 | ||
100 | static int tcp_v4_get_port(struct sock *sk, unsigned short snum) | 98 | static int tcp_v4_get_port(struct sock *sk, unsigned short snum) |
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d693cb988b78..d746d3b27efb 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
@@ -114,16 +114,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) | |||
114 | int low = sysctl_local_port_range[0]; | 114 | int low = sysctl_local_port_range[0]; |
115 | int high = sysctl_local_port_range[1]; | 115 | int high = sysctl_local_port_range[1]; |
116 | int remaining = (high - low) + 1; | 116 | int remaining = (high - low) + 1; |
117 | int rover; | 117 | int rover = net_random() % (high - low) + low; |
118 | 118 | ||
119 | spin_lock(&tcp_hashinfo.portalloc_lock); | 119 | do { |
120 | if (tcp_hashinfo.port_rover < low) | ||
121 | rover = low; | ||
122 | else | ||
123 | rover = tcp_hashinfo.port_rover; | ||
124 | do { rover++; | ||
125 | if (rover > high) | ||
126 | rover = low; | ||
127 | head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; | 120 | head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; |
128 | spin_lock(&head->lock); | 121 | spin_lock(&head->lock); |
129 | inet_bind_bucket_for_each(tb, node, &head->chain) | 122 | inet_bind_bucket_for_each(tb, node, &head->chain) |
@@ -132,9 +125,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) | |||
132 | break; | 125 | break; |
133 | next: | 126 | next: |
134 | spin_unlock(&head->lock); | 127 | spin_unlock(&head->lock); |
128 | if (++rover > high) | ||
129 | rover = low; | ||
135 | } while (--remaining > 0); | 130 | } while (--remaining > 0); |
136 | tcp_hashinfo.port_rover = rover; | ||
137 | spin_unlock(&tcp_hashinfo.portalloc_lock); | ||
138 | 131 | ||
139 | /* Exhausted local port range during search? It is not | 132 | /* Exhausted local port range during search? It is not |
140 | * possible for us to be holding one of the bind hash | 133 | * possible for us to be holding one of the bind hash |
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index d10d552d9c40..d3a4f30a7f22 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c | |||
@@ -117,7 +117,7 @@ int nf_queue(struct sk_buff **skb, | |||
117 | 117 | ||
118 | /* QUEUE == DROP if noone is waiting, to be safe. */ | 118 | /* QUEUE == DROP if noone is waiting, to be safe. */ |
119 | read_lock(&queue_handler_lock); | 119 | read_lock(&queue_handler_lock); |
120 | if (!queue_handler[pf]->outfn) { | 120 | if (!queue_handler[pf] || !queue_handler[pf]->outfn) { |
121 | read_unlock(&queue_handler_lock); | 121 | read_unlock(&queue_handler_lock); |
122 | kfree_skb(*skb); | 122 | kfree_skb(*skb); |
123 | return 1; | 123 | return 1; |
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index efcd10f996ba..d194676f3655 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c | |||
@@ -146,11 +146,10 @@ instance_create(u_int16_t group_num, int pid) | |||
146 | goto out_unlock; | 146 | goto out_unlock; |
147 | } | 147 | } |
148 | 148 | ||
149 | inst = kmalloc(sizeof(*inst), GFP_ATOMIC); | 149 | inst = kzalloc(sizeof(*inst), GFP_ATOMIC); |
150 | if (!inst) | 150 | if (!inst) |
151 | goto out_unlock; | 151 | goto out_unlock; |
152 | 152 | ||
153 | memset(inst, 0, sizeof(*inst)); | ||
154 | INIT_HLIST_NODE(&inst->hlist); | 153 | INIT_HLIST_NODE(&inst->hlist); |
155 | inst->lock = SPIN_LOCK_UNLOCKED; | 154 | inst->lock = SPIN_LOCK_UNLOCKED; |
156 | /* needs to be two, since we _put() after creation */ | 155 | /* needs to be two, since we _put() after creation */ |
@@ -962,10 +961,9 @@ static int nful_open(struct inode *inode, struct file *file) | |||
962 | struct iter_state *is; | 961 | struct iter_state *is; |
963 | int ret; | 962 | int ret; |
964 | 963 | ||
965 | is = kmalloc(sizeof(*is), GFP_KERNEL); | 964 | is = kzalloc(sizeof(*is), GFP_KERNEL); |
966 | if (!is) | 965 | if (!is) |
967 | return -ENOMEM; | 966 | return -ENOMEM; |
968 | memset(is, 0, sizeof(*is)); | ||
969 | ret = seq_open(file, &nful_seq_ops); | 967 | ret = seq_open(file, &nful_seq_ops); |
970 | if (ret < 0) | 968 | if (ret < 0) |
971 | goto out_free; | 969 | goto out_free; |
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index eaa44c49567b..f065a6c94953 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c | |||
@@ -136,11 +136,10 @@ instance_create(u_int16_t queue_num, int pid) | |||
136 | goto out_unlock; | 136 | goto out_unlock; |
137 | } | 137 | } |
138 | 138 | ||
139 | inst = kmalloc(sizeof(*inst), GFP_ATOMIC); | 139 | inst = kzalloc(sizeof(*inst), GFP_ATOMIC); |
140 | if (!inst) | 140 | if (!inst) |
141 | goto out_unlock; | 141 | goto out_unlock; |
142 | 142 | ||
143 | memset(inst, 0, sizeof(*inst)); | ||
144 | inst->queue_num = queue_num; | 143 | inst->queue_num = queue_num; |
145 | inst->peer_pid = pid; | 144 | inst->peer_pid = pid; |
146 | inst->queue_maxlen = NFQNL_QMAX_DEFAULT; | 145 | inst->queue_maxlen = NFQNL_QMAX_DEFAULT; |
@@ -1036,10 +1035,9 @@ static int nfqnl_open(struct inode *inode, struct file *file) | |||
1036 | struct iter_state *is; | 1035 | struct iter_state *is; |
1037 | int ret; | 1036 | int ret; |
1038 | 1037 | ||
1039 | is = kmalloc(sizeof(*is), GFP_KERNEL); | 1038 | is = kzalloc(sizeof(*is), GFP_KERNEL); |
1040 | if (!is) | 1039 | if (!is) |
1041 | return -ENOMEM; | 1040 | return -ENOMEM; |
1042 | memset(is, 0, sizeof(*is)); | ||
1043 | ret = seq_open(file, &nfqnl_seq_ops); | 1041 | ret = seq_open(file, &nfqnl_seq_ops); |
1044 | if (ret < 0) | 1042 | if (ret < 0) |
1045 | goto out_free; | 1043 | goto out_free; |
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 25c171c32715..29a2dd9f3029 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c | |||
@@ -15,247 +15,281 @@ | |||
15 | * from Ren Liu | 15 | * from Ren Liu |
16 | * - More error checks | 16 | * - More error checks |
17 | * | 17 | * |
18 | * | 18 | * For all the glorious comments look at include/net/red.h |
19 | * | ||
20 | * For all the glorious comments look at Alexey's sch_red.c | ||
21 | */ | 19 | */ |
22 | 20 | ||
23 | #include <linux/config.h> | 21 | #include <linux/config.h> |
24 | #include <linux/module.h> | 22 | #include <linux/module.h> |
25 | #include <asm/uaccess.h> | ||
26 | #include <asm/system.h> | ||
27 | #include <linux/bitops.h> | ||
28 | #include <linux/types.h> | 23 | #include <linux/types.h> |
29 | #include <linux/kernel.h> | 24 | #include <linux/kernel.h> |
30 | #include <linux/sched.h> | ||
31 | #include <linux/string.h> | ||
32 | #include <linux/mm.h> | ||
33 | #include <linux/socket.h> | ||
34 | #include <linux/sockios.h> | ||
35 | #include <linux/in.h> | ||
36 | #include <linux/errno.h> | ||
37 | #include <linux/interrupt.h> | ||
38 | #include <linux/if_ether.h> | ||
39 | #include <linux/inet.h> | ||
40 | #include <linux/netdevice.h> | 25 | #include <linux/netdevice.h> |
41 | #include <linux/etherdevice.h> | ||
42 | #include <linux/notifier.h> | ||
43 | #include <net/ip.h> | ||
44 | #include <net/route.h> | ||
45 | #include <linux/skbuff.h> | 26 | #include <linux/skbuff.h> |
46 | #include <net/sock.h> | ||
47 | #include <net/pkt_sched.h> | 27 | #include <net/pkt_sched.h> |
28 | #include <net/red.h> | ||
48 | 29 | ||
49 | #if 1 /* control */ | 30 | #define GRED_DEF_PRIO (MAX_DPs / 2) |
50 | #define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) | 31 | #define GRED_VQ_MASK (MAX_DPs - 1) |
51 | #else | ||
52 | #define DPRINTK(format,args...) | ||
53 | #endif | ||
54 | |||
55 | #if 0 /* data */ | ||
56 | #define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) | ||
57 | #else | ||
58 | #define D2PRINTK(format,args...) | ||
59 | #endif | ||
60 | 32 | ||
61 | struct gred_sched_data; | 33 | struct gred_sched_data; |
62 | struct gred_sched; | 34 | struct gred_sched; |
63 | 35 | ||
64 | struct gred_sched_data | 36 | struct gred_sched_data |
65 | { | 37 | { |
66 | /* Parameters */ | ||
67 | u32 limit; /* HARD maximal queue length */ | 38 | u32 limit; /* HARD maximal queue length */ |
68 | u32 qth_min; /* Min average length threshold: A scaled */ | ||
69 | u32 qth_max; /* Max average length threshold: A scaled */ | ||
70 | u32 DP; /* the drop pramaters */ | 39 | u32 DP; /* the drop pramaters */ |
71 | char Wlog; /* log(W) */ | ||
72 | char Plog; /* random number bits */ | ||
73 | u32 Scell_max; | ||
74 | u32 Rmask; | ||
75 | u32 bytesin; /* bytes seen on virtualQ so far*/ | 40 | u32 bytesin; /* bytes seen on virtualQ so far*/ |
76 | u32 packetsin; /* packets seen on virtualQ so far*/ | 41 | u32 packetsin; /* packets seen on virtualQ so far*/ |
77 | u32 backlog; /* bytes on the virtualQ */ | 42 | u32 backlog; /* bytes on the virtualQ */ |
78 | u32 forced; /* packets dropped for exceeding limits */ | 43 | u8 prio; /* the prio of this vq */ |
79 | u32 early; /* packets dropped as a warning */ | 44 | |
80 | u32 other; /* packets dropped by invoking drop() */ | 45 | struct red_parms parms; |
81 | u32 pdrop; /* packets dropped because we exceeded physical queue limits */ | 46 | struct red_stats stats; |
82 | char Scell_log; | 47 | }; |
83 | u8 Stab[256]; | 48 | |
84 | u8 prio; /* the prio of this vq */ | 49 | enum { |
85 | 50 | GRED_WRED_MODE = 1, | |
86 | /* Variables */ | 51 | GRED_RIO_MODE, |
87 | unsigned long qave; /* Average queue length: A scaled */ | ||
88 | int qcount; /* Packets since last random number generation */ | ||
89 | u32 qR; /* Cached random number */ | ||
90 | |||
91 | psched_time_t qidlestart; /* Start of idle period */ | ||
92 | }; | 52 | }; |
93 | 53 | ||
94 | struct gred_sched | 54 | struct gred_sched |
95 | { | 55 | { |
96 | struct gred_sched_data *tab[MAX_DPs]; | 56 | struct gred_sched_data *tab[MAX_DPs]; |
97 | u32 DPs; | 57 | unsigned long flags; |
98 | u32 def; | 58 | u32 red_flags; |
99 | u8 initd; | 59 | u32 DPs; |
100 | u8 grio; | 60 | u32 def; |
101 | u8 eqp; | 61 | struct red_parms wred_set; |
102 | }; | 62 | }; |
103 | 63 | ||
104 | static int | 64 | static inline int gred_wred_mode(struct gred_sched *table) |
105 | gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) | ||
106 | { | 65 | { |
107 | psched_time_t now; | 66 | return test_bit(GRED_WRED_MODE, &table->flags); |
108 | struct gred_sched_data *q=NULL; | 67 | } |
109 | struct gred_sched *t= qdisc_priv(sch); | 68 | |
110 | unsigned long qave=0; | 69 | static inline void gred_enable_wred_mode(struct gred_sched *table) |
111 | int i=0; | 70 | { |
71 | __set_bit(GRED_WRED_MODE, &table->flags); | ||
72 | } | ||
73 | |||
74 | static inline void gred_disable_wred_mode(struct gred_sched *table) | ||
75 | { | ||
76 | __clear_bit(GRED_WRED_MODE, &table->flags); | ||
77 | } | ||
78 | |||
79 | static inline int gred_rio_mode(struct gred_sched *table) | ||
80 | { | ||
81 | return test_bit(GRED_RIO_MODE, &table->flags); | ||
82 | } | ||
83 | |||
84 | static inline void gred_enable_rio_mode(struct gred_sched *table) | ||
85 | { | ||
86 | __set_bit(GRED_RIO_MODE, &table->flags); | ||
87 | } | ||
88 | |||
89 | static inline void gred_disable_rio_mode(struct gred_sched *table) | ||
90 | { | ||
91 | __clear_bit(GRED_RIO_MODE, &table->flags); | ||
92 | } | ||
93 | |||
94 | static inline int gred_wred_mode_check(struct Qdisc *sch) | ||
95 | { | ||
96 | struct gred_sched *table = qdisc_priv(sch); | ||
97 | int i; | ||
112 | 98 | ||
113 | if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) { | 99 | /* Really ugly O(n^2) but shouldn't be necessary too frequent. */ |
114 | D2PRINTK("NO GRED Queues setup yet! Enqueued anyway\n"); | 100 | for (i = 0; i < table->DPs; i++) { |
115 | goto do_enqueue; | 101 | struct gred_sched_data *q = table->tab[i]; |
102 | int n; | ||
103 | |||
104 | if (q == NULL) | ||
105 | continue; | ||
106 | |||
107 | for (n = 0; n < table->DPs; n++) | ||
108 | if (table->tab[n] && table->tab[n] != q && | ||
109 | table->tab[n]->prio == q->prio) | ||
110 | return 1; | ||
116 | } | 111 | } |
117 | 112 | ||
113 | return 0; | ||
114 | } | ||
115 | |||
116 | static inline unsigned int gred_backlog(struct gred_sched *table, | ||
117 | struct gred_sched_data *q, | ||
118 | struct Qdisc *sch) | ||
119 | { | ||
120 | if (gred_wred_mode(table)) | ||
121 | return sch->qstats.backlog; | ||
122 | else | ||
123 | return q->backlog; | ||
124 | } | ||
125 | |||
126 | static inline u16 tc_index_to_dp(struct sk_buff *skb) | ||
127 | { | ||
128 | return skb->tc_index & GRED_VQ_MASK; | ||
129 | } | ||
130 | |||
131 | static inline void gred_load_wred_set(struct gred_sched *table, | ||
132 | struct gred_sched_data *q) | ||
133 | { | ||
134 | q->parms.qavg = table->wred_set.qavg; | ||
135 | q->parms.qidlestart = table->wred_set.qidlestart; | ||
136 | } | ||
137 | |||
138 | static inline void gred_store_wred_set(struct gred_sched *table, | ||
139 | struct gred_sched_data *q) | ||
140 | { | ||
141 | table->wred_set.qavg = q->parms.qavg; | ||
142 | } | ||
143 | |||
144 | static inline int gred_use_ecn(struct gred_sched *t) | ||
145 | { | ||
146 | return t->red_flags & TC_RED_ECN; | ||
147 | } | ||
118 | 148 | ||
119 | if ( ((skb->tc_index&0xf) > (t->DPs -1)) || !(q=t->tab[skb->tc_index&0xf])) { | 149 | static inline int gred_use_harddrop(struct gred_sched *t) |
120 | printk("GRED: setting to default (%d)\n ",t->def); | 150 | { |
121 | if (!(q=t->tab[t->def])) { | 151 | return t->red_flags & TC_RED_HARDDROP; |
122 | DPRINTK("GRED: setting to default FAILED! dropping!! " | 152 | } |
123 | "(%d)\n ", t->def); | 153 | |
124 | goto drop; | 154 | static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) |
155 | { | ||
156 | struct gred_sched_data *q=NULL; | ||
157 | struct gred_sched *t= qdisc_priv(sch); | ||
158 | unsigned long qavg = 0; | ||
159 | u16 dp = tc_index_to_dp(skb); | ||
160 | |||
161 | if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { | ||
162 | dp = t->def; | ||
163 | |||
164 | if ((q = t->tab[dp]) == NULL) { | ||
165 | /* Pass through packets not assigned to a DP | ||
166 | * if no default DP has been configured. This | ||
167 | * allows for DP flows to be left untouched. | ||
168 | */ | ||
169 | if (skb_queue_len(&sch->q) < sch->dev->tx_queue_len) | ||
170 | return qdisc_enqueue_tail(skb, sch); | ||
171 | else | ||
172 | goto drop; | ||
125 | } | 173 | } |
174 | |||
126 | /* fix tc_index? --could be controvesial but needed for | 175 | /* fix tc_index? --could be controvesial but needed for |
127 | requeueing */ | 176 | requeueing */ |
128 | skb->tc_index=(skb->tc_index&0xfffffff0) | t->def; | 177 | skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp; |
129 | } | 178 | } |
130 | 179 | ||
131 | D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d " | 180 | /* sum up all the qaves of prios <= to ours to get the new qave */ |
132 | "general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog, | 181 | if (!gred_wred_mode(t) && gred_rio_mode(t)) { |
133 | sch->qstats.backlog); | 182 | int i; |
134 | /* sum up all the qaves of prios <= to ours to get the new qave*/ | 183 | |
135 | if (!t->eqp && t->grio) { | 184 | for (i = 0; i < t->DPs; i++) { |
136 | for (i=0;i<t->DPs;i++) { | 185 | if (t->tab[i] && t->tab[i]->prio < q->prio && |
137 | if ((!t->tab[i]) || (i==q->DP)) | 186 | !red_is_idling(&t->tab[i]->parms)) |
138 | continue; | 187 | qavg +=t->tab[i]->parms.qavg; |
139 | |||
140 | if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart))) | ||
141 | qave +=t->tab[i]->qave; | ||
142 | } | 188 | } |
143 | 189 | ||
144 | } | 190 | } |
145 | 191 | ||
146 | q->packetsin++; | 192 | q->packetsin++; |
147 | q->bytesin+=skb->len; | 193 | q->bytesin += skb->len; |
148 | 194 | ||
149 | if (t->eqp && t->grio) { | 195 | if (gred_wred_mode(t)) |
150 | qave=0; | 196 | gred_load_wred_set(t, q); |
151 | q->qave=t->tab[t->def]->qave; | ||
152 | q->qidlestart=t->tab[t->def]->qidlestart; | ||
153 | } | ||
154 | 197 | ||
155 | if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { | 198 | q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch)); |
156 | long us_idle; | ||
157 | PSCHED_GET_TIME(now); | ||
158 | us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); | ||
159 | PSCHED_SET_PASTPERFECT(q->qidlestart); | ||
160 | 199 | ||
161 | q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF]; | 200 | if (red_is_idling(&q->parms)) |
162 | } else { | 201 | red_end_of_idle_period(&q->parms); |
163 | if (t->eqp) { | ||
164 | q->qave += sch->qstats.backlog - (q->qave >> q->Wlog); | ||
165 | } else { | ||
166 | q->qave += q->backlog - (q->qave >> q->Wlog); | ||
167 | } | ||
168 | 202 | ||
169 | } | 203 | if (gred_wred_mode(t)) |
170 | 204 | gred_store_wred_set(t, q); | |
171 | |||
172 | if (t->eqp && t->grio) | ||
173 | t->tab[t->def]->qave=q->qave; | ||
174 | |||
175 | if ((q->qave+qave) < q->qth_min) { | ||
176 | q->qcount = -1; | ||
177 | enqueue: | ||
178 | if (q->backlog + skb->len <= q->limit) { | ||
179 | q->backlog += skb->len; | ||
180 | do_enqueue: | ||
181 | __skb_queue_tail(&sch->q, skb); | ||
182 | sch->qstats.backlog += skb->len; | ||
183 | sch->bstats.bytes += skb->len; | ||
184 | sch->bstats.packets++; | ||
185 | return 0; | ||
186 | } else { | ||
187 | q->pdrop++; | ||
188 | } | ||
189 | 205 | ||
190 | drop: | 206 | switch (red_action(&q->parms, q->parms.qavg + qavg)) { |
191 | kfree_skb(skb); | 207 | case RED_DONT_MARK: |
192 | sch->qstats.drops++; | 208 | break; |
193 | return NET_XMIT_DROP; | 209 | |
194 | } | 210 | case RED_PROB_MARK: |
195 | if ((q->qave+qave) >= q->qth_max) { | 211 | sch->qstats.overlimits++; |
196 | q->qcount = -1; | 212 | if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { |
197 | sch->qstats.overlimits++; | 213 | q->stats.prob_drop++; |
198 | q->forced++; | 214 | goto congestion_drop; |
199 | goto drop; | 215 | } |
216 | |||
217 | q->stats.prob_mark++; | ||
218 | break; | ||
219 | |||
220 | case RED_HARD_MARK: | ||
221 | sch->qstats.overlimits++; | ||
222 | if (gred_use_harddrop(t) || !gred_use_ecn(t) || | ||
223 | !INET_ECN_set_ce(skb)) { | ||
224 | q->stats.forced_drop++; | ||
225 | goto congestion_drop; | ||
226 | } | ||
227 | q->stats.forced_mark++; | ||
228 | break; | ||
200 | } | 229 | } |
201 | if (++q->qcount) { | 230 | |
202 | if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR) | 231 | if (q->backlog + skb->len <= q->limit) { |
203 | goto enqueue; | 232 | q->backlog += skb->len; |
204 | q->qcount = 0; | 233 | return qdisc_enqueue_tail(skb, sch); |
205 | q->qR = net_random()&q->Rmask; | ||
206 | sch->qstats.overlimits++; | ||
207 | q->early++; | ||
208 | goto drop; | ||
209 | } | 234 | } |
210 | q->qR = net_random()&q->Rmask; | 235 | |
211 | goto enqueue; | 236 | q->stats.pdrop++; |
237 | drop: | ||
238 | return qdisc_drop(skb, sch); | ||
239 | |||
240 | congestion_drop: | ||
241 | qdisc_drop(skb, sch); | ||
242 | return NET_XMIT_CN; | ||
212 | } | 243 | } |
213 | 244 | ||
214 | static int | 245 | static int gred_requeue(struct sk_buff *skb, struct Qdisc* sch) |
215 | gred_requeue(struct sk_buff *skb, struct Qdisc* sch) | ||
216 | { | 246 | { |
247 | struct gred_sched *t = qdisc_priv(sch); | ||
217 | struct gred_sched_data *q; | 248 | struct gred_sched_data *q; |
218 | struct gred_sched *t= qdisc_priv(sch); | 249 | u16 dp = tc_index_to_dp(skb); |
219 | q= t->tab[(skb->tc_index&0xf)]; | 250 | |
220 | /* error checking here -- probably unnecessary */ | 251 | if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { |
221 | PSCHED_SET_PASTPERFECT(q->qidlestart); | 252 | if (net_ratelimit()) |
222 | 253 | printk(KERN_WARNING "GRED: Unable to relocate VQ 0x%x " | |
223 | __skb_queue_head(&sch->q, skb); | 254 | "for requeue, screwing up backlog.\n", |
224 | sch->qstats.backlog += skb->len; | 255 | tc_index_to_dp(skb)); |
225 | sch->qstats.requeues++; | 256 | } else { |
226 | q->backlog += skb->len; | 257 | if (red_is_idling(&q->parms)) |
227 | return 0; | 258 | red_end_of_idle_period(&q->parms); |
259 | q->backlog += skb->len; | ||
260 | } | ||
261 | |||
262 | return qdisc_requeue(skb, sch); | ||
228 | } | 263 | } |
229 | 264 | ||
230 | static struct sk_buff * | 265 | static struct sk_buff *gred_dequeue(struct Qdisc* sch) |
231 | gred_dequeue(struct Qdisc* sch) | ||
232 | { | 266 | { |
233 | struct sk_buff *skb; | 267 | struct sk_buff *skb; |
234 | struct gred_sched_data *q; | 268 | struct gred_sched *t = qdisc_priv(sch); |
235 | struct gred_sched *t= qdisc_priv(sch); | 269 | |
270 | skb = qdisc_dequeue_head(sch); | ||
236 | 271 | ||
237 | skb = __skb_dequeue(&sch->q); | ||
238 | if (skb) { | 272 | if (skb) { |
239 | sch->qstats.backlog -= skb->len; | 273 | struct gred_sched_data *q; |
240 | q= t->tab[(skb->tc_index&0xf)]; | 274 | u16 dp = tc_index_to_dp(skb); |
241 | if (q) { | 275 | |
242 | q->backlog -= skb->len; | 276 | if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { |
243 | if (!q->backlog && !t->eqp) | 277 | if (net_ratelimit()) |
244 | PSCHED_GET_TIME(q->qidlestart); | 278 | printk(KERN_WARNING "GRED: Unable to relocate " |
279 | "VQ 0x%x after dequeue, screwing up " | ||
280 | "backlog.\n", tc_index_to_dp(skb)); | ||
245 | } else { | 281 | } else { |
246 | D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); | 282 | q->backlog -= skb->len; |
283 | |||
284 | if (!q->backlog && !gred_wred_mode(t)) | ||
285 | red_start_of_idle_period(&q->parms); | ||
247 | } | 286 | } |
287 | |||
248 | return skb; | 288 | return skb; |
249 | } | 289 | } |
250 | 290 | ||
251 | if (t->eqp) { | 291 | if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) |
252 | q= t->tab[t->def]; | 292 | red_start_of_idle_period(&t->wred_set); |
253 | if (!q) | ||
254 | D2PRINTK("no default VQ set: Results will be " | ||
255 | "screwed up\n"); | ||
256 | else | ||
257 | PSCHED_GET_TIME(q->qidlestart); | ||
258 | } | ||
259 | 293 | ||
260 | return NULL; | 294 | return NULL; |
261 | } | 295 | } |
@@ -263,36 +297,34 @@ gred_dequeue(struct Qdisc* sch) | |||
263 | static unsigned int gred_drop(struct Qdisc* sch) | 297 | static unsigned int gred_drop(struct Qdisc* sch) |
264 | { | 298 | { |
265 | struct sk_buff *skb; | 299 | struct sk_buff *skb; |
300 | struct gred_sched *t = qdisc_priv(sch); | ||
266 | 301 | ||
267 | struct gred_sched_data *q; | 302 | skb = qdisc_dequeue_tail(sch); |
268 | struct gred_sched *t= qdisc_priv(sch); | ||
269 | |||
270 | skb = __skb_dequeue_tail(&sch->q); | ||
271 | if (skb) { | 303 | if (skb) { |
272 | unsigned int len = skb->len; | 304 | unsigned int len = skb->len; |
273 | sch->qstats.backlog -= len; | 305 | struct gred_sched_data *q; |
274 | sch->qstats.drops++; | 306 | u16 dp = tc_index_to_dp(skb); |
275 | q= t->tab[(skb->tc_index&0xf)]; | 307 | |
276 | if (q) { | 308 | if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { |
277 | q->backlog -= len; | 309 | if (net_ratelimit()) |
278 | q->other++; | 310 | printk(KERN_WARNING "GRED: Unable to relocate " |
279 | if (!q->backlog && !t->eqp) | 311 | "VQ 0x%x while dropping, screwing up " |
280 | PSCHED_GET_TIME(q->qidlestart); | 312 | "backlog.\n", tc_index_to_dp(skb)); |
281 | } else { | 313 | } else { |
282 | D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); | 314 | q->backlog -= len; |
315 | q->stats.other++; | ||
316 | |||
317 | if (!q->backlog && !gred_wred_mode(t)) | ||
318 | red_start_of_idle_period(&q->parms); | ||
283 | } | 319 | } |
284 | 320 | ||
285 | kfree_skb(skb); | 321 | qdisc_drop(skb, sch); |
286 | return len; | 322 | return len; |
287 | } | 323 | } |
288 | 324 | ||
289 | q=t->tab[t->def]; | 325 | if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) |
290 | if (!q) { | 326 | red_start_of_idle_period(&t->wred_set); |
291 | D2PRINTK("no default VQ set: Results might be screwed up\n"); | ||
292 | return 0; | ||
293 | } | ||
294 | 327 | ||
295 | PSCHED_GET_TIME(q->qidlestart); | ||
296 | return 0; | 328 | return 0; |
297 | 329 | ||
298 | } | 330 | } |
@@ -300,293 +332,241 @@ static unsigned int gred_drop(struct Qdisc* sch) | |||
300 | static void gred_reset(struct Qdisc* sch) | 332 | static void gred_reset(struct Qdisc* sch) |
301 | { | 333 | { |
302 | int i; | 334 | int i; |
303 | struct gred_sched_data *q; | 335 | struct gred_sched *t = qdisc_priv(sch); |
304 | struct gred_sched *t= qdisc_priv(sch); | 336 | |
337 | qdisc_reset_queue(sch); | ||
305 | 338 | ||
306 | __skb_queue_purge(&sch->q); | 339 | for (i = 0; i < t->DPs; i++) { |
340 | struct gred_sched_data *q = t->tab[i]; | ||
307 | 341 | ||
308 | sch->qstats.backlog = 0; | 342 | if (!q) |
343 | continue; | ||
309 | 344 | ||
310 | for (i=0;i<t->DPs;i++) { | 345 | red_restart(&q->parms); |
311 | q= t->tab[i]; | ||
312 | if (!q) | ||
313 | continue; | ||
314 | PSCHED_SET_PASTPERFECT(q->qidlestart); | ||
315 | q->qave = 0; | ||
316 | q->qcount = -1; | ||
317 | q->backlog = 0; | 346 | q->backlog = 0; |
318 | q->other=0; | ||
319 | q->forced=0; | ||
320 | q->pdrop=0; | ||
321 | q->early=0; | ||
322 | } | 347 | } |
323 | } | 348 | } |
324 | 349 | ||
325 | static int gred_change(struct Qdisc *sch, struct rtattr *opt) | 350 | static inline void gred_destroy_vq(struct gred_sched_data *q) |
351 | { | ||
352 | kfree(q); | ||
353 | } | ||
354 | |||
355 | static inline int gred_change_table_def(struct Qdisc *sch, struct rtattr *dps) | ||
326 | { | 356 | { |
327 | struct gred_sched *table = qdisc_priv(sch); | 357 | struct gred_sched *table = qdisc_priv(sch); |
328 | struct gred_sched_data *q; | ||
329 | struct tc_gred_qopt *ctl; | ||
330 | struct tc_gred_sopt *sopt; | 358 | struct tc_gred_sopt *sopt; |
331 | struct rtattr *tb[TCA_GRED_STAB]; | ||
332 | struct rtattr *tb2[TCA_GRED_DPS]; | ||
333 | int i; | 359 | int i; |
334 | 360 | ||
335 | if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt)) | 361 | if (dps == NULL || RTA_PAYLOAD(dps) < sizeof(*sopt)) |
336 | return -EINVAL; | 362 | return -EINVAL; |
337 | 363 | ||
338 | if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) { | 364 | sopt = RTA_DATA(dps); |
339 | rtattr_parse_nested(tb2, TCA_GRED_DPS, opt); | 365 | |
366 | if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs) | ||
367 | return -EINVAL; | ||
340 | 368 | ||
341 | if (tb2[TCA_GRED_DPS-1] == 0) | 369 | sch_tree_lock(sch); |
342 | return -EINVAL; | 370 | table->DPs = sopt->DPs; |
371 | table->def = sopt->def_DP; | ||
372 | table->red_flags = sopt->flags; | ||
373 | |||
374 | /* | ||
375 | * Every entry point to GRED is synchronized with the above code | ||
376 | * and the DP is checked against DPs, i.e. shadowed VQs can no | ||
377 | * longer be found so we can unlock right here. | ||
378 | */ | ||
379 | sch_tree_unlock(sch); | ||
380 | |||
381 | if (sopt->grio) { | ||
382 | gred_enable_rio_mode(table); | ||
383 | gred_disable_wred_mode(table); | ||
384 | if (gred_wred_mode_check(sch)) | ||
385 | gred_enable_wred_mode(table); | ||
386 | } else { | ||
387 | gred_disable_rio_mode(table); | ||
388 | gred_disable_wred_mode(table); | ||
389 | } | ||
343 | 390 | ||
344 | sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); | 391 | for (i = table->DPs; i < MAX_DPs; i++) { |
345 | table->DPs=sopt->DPs; | 392 | if (table->tab[i]) { |
346 | table->def=sopt->def_DP; | 393 | printk(KERN_WARNING "GRED: Warning: Destroying " |
347 | table->grio=sopt->grio; | 394 | "shadowed VQ 0x%x\n", i); |
348 | table->initd=0; | 395 | gred_destroy_vq(table->tab[i]); |
349 | /* probably need to clear all the table DP entries as well */ | 396 | table->tab[i] = NULL; |
350 | return 0; | 397 | } |
351 | } | 398 | } |
352 | 399 | ||
400 | return 0; | ||
401 | } | ||
353 | 402 | ||
354 | if (!table->DPs || tb[TCA_GRED_PARMS-1] == 0 || tb[TCA_GRED_STAB-1] == 0 || | 403 | static inline int gred_change_vq(struct Qdisc *sch, int dp, |
355 | RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) || | 404 | struct tc_gred_qopt *ctl, int prio, u8 *stab) |
356 | RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256) | 405 | { |
357 | return -EINVAL; | 406 | struct gred_sched *table = qdisc_priv(sch); |
407 | struct gred_sched_data *q; | ||
358 | 408 | ||
359 | ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]); | 409 | if (table->tab[dp] == NULL) { |
360 | if (ctl->DP > MAX_DPs-1 ) { | 410 | table->tab[dp] = kmalloc(sizeof(*q), GFP_KERNEL); |
361 | /* misbehaving is punished! Put in the default drop probability */ | 411 | if (table->tab[dp] == NULL) |
362 | DPRINTK("\nGRED: DP %u not in the proper range fixed. New DP " | ||
363 | "set to default at %d\n",ctl->DP,table->def); | ||
364 | ctl->DP=table->def; | ||
365 | } | ||
366 | |||
367 | if (table->tab[ctl->DP] == NULL) { | ||
368 | table->tab[ctl->DP]=kmalloc(sizeof(struct gred_sched_data), | ||
369 | GFP_KERNEL); | ||
370 | if (NULL == table->tab[ctl->DP]) | ||
371 | return -ENOMEM; | 412 | return -ENOMEM; |
372 | memset(table->tab[ctl->DP], 0, (sizeof(struct gred_sched_data))); | 413 | memset(table->tab[dp], 0, sizeof(*q)); |
373 | } | ||
374 | q= table->tab[ctl->DP]; | ||
375 | |||
376 | if (table->grio) { | ||
377 | if (ctl->prio <=0) { | ||
378 | if (table->def && table->tab[table->def]) { | ||
379 | DPRINTK("\nGRED: DP %u does not have a prio" | ||
380 | "setting default to %d\n",ctl->DP, | ||
381 | table->tab[table->def]->prio); | ||
382 | q->prio=table->tab[table->def]->prio; | ||
383 | } else { | ||
384 | DPRINTK("\nGRED: DP %u does not have a prio" | ||
385 | " setting default to 8\n",ctl->DP); | ||
386 | q->prio=8; | ||
387 | } | ||
388 | } else { | ||
389 | q->prio=ctl->prio; | ||
390 | } | ||
391 | } else { | ||
392 | q->prio=8; | ||
393 | } | 414 | } |
394 | 415 | ||
395 | 416 | q = table->tab[dp]; | |
396 | q->DP=ctl->DP; | 417 | q->DP = dp; |
397 | q->Wlog = ctl->Wlog; | 418 | q->prio = prio; |
398 | q->Plog = ctl->Plog; | ||
399 | q->limit = ctl->limit; | 419 | q->limit = ctl->limit; |
400 | q->Scell_log = ctl->Scell_log; | ||
401 | q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; | ||
402 | q->Scell_max = (255<<q->Scell_log); | ||
403 | q->qth_min = ctl->qth_min<<ctl->Wlog; | ||
404 | q->qth_max = ctl->qth_max<<ctl->Wlog; | ||
405 | q->qave=0; | ||
406 | q->backlog=0; | ||
407 | q->qcount = -1; | ||
408 | q->other=0; | ||
409 | q->forced=0; | ||
410 | q->pdrop=0; | ||
411 | q->early=0; | ||
412 | |||
413 | PSCHED_SET_PASTPERFECT(q->qidlestart); | ||
414 | memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256); | ||
415 | |||
416 | if ( table->initd && table->grio) { | ||
417 | /* this looks ugly but it's not in the fast path */ | ||
418 | for (i=0;i<table->DPs;i++) { | ||
419 | if ((!table->tab[i]) || (i==q->DP) ) | ||
420 | continue; | ||
421 | if (table->tab[i]->prio == q->prio ){ | ||
422 | /* WRED mode detected */ | ||
423 | table->eqp=1; | ||
424 | break; | ||
425 | } | ||
426 | } | ||
427 | } | ||
428 | 420 | ||
429 | if (!table->initd) { | 421 | if (q->backlog == 0) |
430 | table->initd=1; | 422 | red_end_of_idle_period(&q->parms); |
431 | /* | ||
432 | the first entry also goes into the default until | ||
433 | over-written | ||
434 | */ | ||
435 | |||
436 | if (table->tab[table->def] == NULL) { | ||
437 | table->tab[table->def]= | ||
438 | kmalloc(sizeof(struct gred_sched_data), GFP_KERNEL); | ||
439 | if (NULL == table->tab[table->def]) | ||
440 | return -ENOMEM; | ||
441 | |||
442 | memset(table->tab[table->def], 0, | ||
443 | (sizeof(struct gred_sched_data))); | ||
444 | } | ||
445 | q= table->tab[table->def]; | ||
446 | q->DP=table->def; | ||
447 | q->Wlog = ctl->Wlog; | ||
448 | q->Plog = ctl->Plog; | ||
449 | q->limit = ctl->limit; | ||
450 | q->Scell_log = ctl->Scell_log; | ||
451 | q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; | ||
452 | q->Scell_max = (255<<q->Scell_log); | ||
453 | q->qth_min = ctl->qth_min<<ctl->Wlog; | ||
454 | q->qth_max = ctl->qth_max<<ctl->Wlog; | ||
455 | |||
456 | if (table->grio) | ||
457 | q->prio=table->tab[ctl->DP]->prio; | ||
458 | else | ||
459 | q->prio=8; | ||
460 | |||
461 | q->qcount = -1; | ||
462 | PSCHED_SET_PASTPERFECT(q->qidlestart); | ||
463 | memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256); | ||
464 | } | ||
465 | return 0; | ||
466 | 423 | ||
424 | red_set_parms(&q->parms, | ||
425 | ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, | ||
426 | ctl->Scell_log, stab); | ||
427 | |||
428 | return 0; | ||
467 | } | 429 | } |
468 | 430 | ||
469 | static int gred_init(struct Qdisc *sch, struct rtattr *opt) | 431 | static int gred_change(struct Qdisc *sch, struct rtattr *opt) |
470 | { | 432 | { |
471 | struct gred_sched *table = qdisc_priv(sch); | 433 | struct gred_sched *table = qdisc_priv(sch); |
472 | struct tc_gred_sopt *sopt; | 434 | struct tc_gred_qopt *ctl; |
473 | struct rtattr *tb[TCA_GRED_STAB]; | 435 | struct rtattr *tb[TCA_GRED_MAX]; |
474 | struct rtattr *tb2[TCA_GRED_DPS]; | 436 | int err = -EINVAL, prio = GRED_DEF_PRIO; |
437 | u8 *stab; | ||
475 | 438 | ||
476 | if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt)) | 439 | if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt)) |
477 | return -EINVAL; | 440 | return -EINVAL; |
478 | 441 | ||
479 | if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) { | 442 | if (tb[TCA_GRED_PARMS-1] == NULL && tb[TCA_GRED_STAB-1] == NULL) |
480 | rtattr_parse_nested(tb2, TCA_GRED_DPS, opt); | 443 | return gred_change_table_def(sch, opt); |
444 | |||
445 | if (tb[TCA_GRED_PARMS-1] == NULL || | ||
446 | RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) || | ||
447 | tb[TCA_GRED_STAB-1] == NULL || | ||
448 | RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256) | ||
449 | return -EINVAL; | ||
450 | |||
451 | ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]); | ||
452 | stab = RTA_DATA(tb[TCA_GRED_STAB-1]); | ||
453 | |||
454 | if (ctl->DP >= table->DPs) | ||
455 | goto errout; | ||
481 | 456 | ||
482 | if (tb2[TCA_GRED_DPS-1] == 0) | 457 | if (gred_rio_mode(table)) { |
483 | return -EINVAL; | 458 | if (ctl->prio == 0) { |
459 | int def_prio = GRED_DEF_PRIO; | ||
484 | 460 | ||
485 | sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); | 461 | if (table->tab[table->def]) |
486 | table->DPs=sopt->DPs; | 462 | def_prio = table->tab[table->def]->prio; |
487 | table->def=sopt->def_DP; | 463 | |
488 | table->grio=sopt->grio; | 464 | printk(KERN_DEBUG "GRED: DP %u does not have a prio " |
489 | table->initd=0; | 465 | "setting default to %d\n", ctl->DP, def_prio); |
490 | return 0; | 466 | |
467 | prio = def_prio; | ||
468 | } else | ||
469 | prio = ctl->prio; | ||
470 | } | ||
471 | |||
472 | sch_tree_lock(sch); | ||
473 | |||
474 | err = gred_change_vq(sch, ctl->DP, ctl, prio, stab); | ||
475 | if (err < 0) | ||
476 | goto errout_locked; | ||
477 | |||
478 | if (gred_rio_mode(table)) { | ||
479 | gred_disable_wred_mode(table); | ||
480 | if (gred_wred_mode_check(sch)) | ||
481 | gred_enable_wred_mode(table); | ||
491 | } | 482 | } |
492 | 483 | ||
493 | DPRINTK("\n GRED_INIT error!\n"); | 484 | err = 0; |
494 | return -EINVAL; | 485 | |
486 | errout_locked: | ||
487 | sch_tree_unlock(sch); | ||
488 | errout: | ||
489 | return err; | ||
495 | } | 490 | } |
496 | 491 | ||
497 | static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) | 492 | static int gred_init(struct Qdisc *sch, struct rtattr *opt) |
498 | { | 493 | { |
499 | unsigned long qave; | 494 | struct rtattr *tb[TCA_GRED_MAX]; |
500 | struct rtattr *rta; | ||
501 | struct tc_gred_qopt *opt = NULL ; | ||
502 | struct tc_gred_qopt *dst; | ||
503 | struct gred_sched *table = qdisc_priv(sch); | ||
504 | struct gred_sched_data *q; | ||
505 | int i; | ||
506 | unsigned char *b = skb->tail; | ||
507 | 495 | ||
508 | rta = (struct rtattr*)b; | 496 | if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt)) |
509 | RTA_PUT(skb, TCA_OPTIONS, 0, NULL); | 497 | return -EINVAL; |
510 | 498 | ||
511 | opt=kmalloc(sizeof(struct tc_gred_qopt)*MAX_DPs, GFP_KERNEL); | 499 | if (tb[TCA_GRED_PARMS-1] || tb[TCA_GRED_STAB-1]) |
500 | return -EINVAL; | ||
512 | 501 | ||
513 | if (opt == NULL) { | 502 | return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]); |
514 | DPRINTK("gred_dump:failed to malloc for %Zd\n", | 503 | } |
515 | sizeof(struct tc_gred_qopt)*MAX_DPs); | ||
516 | goto rtattr_failure; | ||
517 | } | ||
518 | 504 | ||
519 | memset(opt, 0, (sizeof(struct tc_gred_qopt))*table->DPs); | 505 | static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) |
506 | { | ||
507 | struct gred_sched *table = qdisc_priv(sch); | ||
508 | struct rtattr *parms, *opts = NULL; | ||
509 | int i; | ||
510 | struct tc_gred_sopt sopt = { | ||
511 | .DPs = table->DPs, | ||
512 | .def_DP = table->def, | ||
513 | .grio = gred_rio_mode(table), | ||
514 | .flags = table->red_flags, | ||
515 | }; | ||
520 | 516 | ||
521 | if (!table->initd) { | 517 | opts = RTA_NEST(skb, TCA_OPTIONS); |
522 | DPRINTK("NO GRED Queues setup!\n"); | 518 | RTA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt); |
523 | } | 519 | parms = RTA_NEST(skb, TCA_GRED_PARMS); |
520 | |||
521 | for (i = 0; i < MAX_DPs; i++) { | ||
522 | struct gred_sched_data *q = table->tab[i]; | ||
523 | struct tc_gred_qopt opt; | ||
524 | 524 | ||
525 | for (i=0;i<MAX_DPs;i++) { | 525 | memset(&opt, 0, sizeof(opt)); |
526 | dst= &opt[i]; | ||
527 | q= table->tab[i]; | ||
528 | 526 | ||
529 | if (!q) { | 527 | if (!q) { |
530 | /* hack -- fix at some point with proper message | 528 | /* hack -- fix at some point with proper message |
531 | This is how we indicate to tc that there is no VQ | 529 | This is how we indicate to tc that there is no VQ |
532 | at this DP */ | 530 | at this DP */ |
533 | 531 | ||
534 | dst->DP=MAX_DPs+i; | 532 | opt.DP = MAX_DPs + i; |
535 | continue; | 533 | goto append_opt; |
536 | } | 534 | } |
537 | 535 | ||
538 | dst->limit=q->limit; | 536 | opt.limit = q->limit; |
539 | dst->qth_min=q->qth_min>>q->Wlog; | 537 | opt.DP = q->DP; |
540 | dst->qth_max=q->qth_max>>q->Wlog; | 538 | opt.backlog = q->backlog; |
541 | dst->DP=q->DP; | 539 | opt.prio = q->prio; |
542 | dst->backlog=q->backlog; | 540 | opt.qth_min = q->parms.qth_min >> q->parms.Wlog; |
543 | if (q->qave) { | 541 | opt.qth_max = q->parms.qth_max >> q->parms.Wlog; |
544 | if (table->eqp && table->grio) { | 542 | opt.Wlog = q->parms.Wlog; |
545 | q->qidlestart=table->tab[table->def]->qidlestart; | 543 | opt.Plog = q->parms.Plog; |
546 | q->qave=table->tab[table->def]->qave; | 544 | opt.Scell_log = q->parms.Scell_log; |
547 | } | 545 | opt.other = q->stats.other; |
548 | if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { | 546 | opt.early = q->stats.prob_drop; |
549 | long idle; | 547 | opt.forced = q->stats.forced_drop; |
550 | psched_time_t now; | 548 | opt.pdrop = q->stats.pdrop; |
551 | PSCHED_GET_TIME(now); | 549 | opt.packets = q->packetsin; |
552 | idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); | 550 | opt.bytesin = q->bytesin; |
553 | qave = q->qave >> q->Stab[(idle>>q->Scell_log)&0xFF]; | 551 | |
554 | dst->qave = qave >> q->Wlog; | 552 | if (gred_wred_mode(table)) { |
555 | 553 | q->parms.qidlestart = | |
556 | } else { | 554 | table->tab[table->def]->parms.qidlestart; |
557 | dst->qave = q->qave >> q->Wlog; | 555 | q->parms.qavg = table->tab[table->def]->parms.qavg; |
558 | } | ||
559 | } else { | ||
560 | dst->qave = 0; | ||
561 | } | 556 | } |
562 | 557 | ||
563 | 558 | opt.qave = red_calc_qavg(&q->parms, q->parms.qavg); | |
564 | dst->Wlog = q->Wlog; | 559 | |
565 | dst->Plog = q->Plog; | 560 | append_opt: |
566 | dst->Scell_log = q->Scell_log; | 561 | RTA_APPEND(skb, sizeof(opt), &opt); |
567 | dst->other = q->other; | ||
568 | dst->forced = q->forced; | ||
569 | dst->early = q->early; | ||
570 | dst->pdrop = q->pdrop; | ||
571 | dst->prio = q->prio; | ||
572 | dst->packets=q->packetsin; | ||
573 | dst->bytesin=q->bytesin; | ||
574 | } | 562 | } |
575 | 563 | ||
576 | RTA_PUT(skb, TCA_GRED_PARMS, sizeof(struct tc_gred_qopt)*MAX_DPs, opt); | 564 | RTA_NEST_END(skb, parms); |
577 | rta->rta_len = skb->tail - b; | ||
578 | 565 | ||
579 | kfree(opt); | 566 | return RTA_NEST_END(skb, opts); |
580 | return skb->len; | ||
581 | 567 | ||
582 | rtattr_failure: | 568 | rtattr_failure: |
583 | if (opt) | 569 | return RTA_NEST_CANCEL(skb, opts); |
584 | kfree(opt); | ||
585 | DPRINTK("gred_dump: FAILURE!!!!\n"); | ||
586 | |||
587 | /* also free the opt struct here */ | ||
588 | skb_trim(skb, b - skb->data); | ||
589 | return -1; | ||
590 | } | 570 | } |
591 | 571 | ||
592 | static void gred_destroy(struct Qdisc *sch) | 572 | static void gred_destroy(struct Qdisc *sch) |
@@ -594,15 +574,13 @@ static void gred_destroy(struct Qdisc *sch) | |||
594 | struct gred_sched *table = qdisc_priv(sch); | 574 | struct gred_sched *table = qdisc_priv(sch); |
595 | int i; | 575 | int i; |
596 | 576 | ||
597 | for (i = 0;i < table->DPs; i++) { | 577 | for (i = 0; i < table->DPs; i++) { |
598 | if (table->tab[i]) | 578 | if (table->tab[i]) |
599 | kfree(table->tab[i]); | 579 | gred_destroy_vq(table->tab[i]); |
600 | } | 580 | } |
601 | } | 581 | } |
602 | 582 | ||
603 | static struct Qdisc_ops gred_qdisc_ops = { | 583 | static struct Qdisc_ops gred_qdisc_ops = { |
604 | .next = NULL, | ||
605 | .cl_ops = NULL, | ||
606 | .id = "gred", | 584 | .id = "gred", |
607 | .priv_size = sizeof(struct gred_sched), | 585 | .priv_size = sizeof(struct gred_sched), |
608 | .enqueue = gred_enqueue, | 586 | .enqueue = gred_enqueue, |
@@ -621,10 +599,13 @@ static int __init gred_module_init(void) | |||
621 | { | 599 | { |
622 | return register_qdisc(&gred_qdisc_ops); | 600 | return register_qdisc(&gred_qdisc_ops); |
623 | } | 601 | } |
624 | static void __exit gred_module_exit(void) | 602 | |
603 | static void __exit gred_module_exit(void) | ||
625 | { | 604 | { |
626 | unregister_qdisc(&gred_qdisc_ops); | 605 | unregister_qdisc(&gred_qdisc_ops); |
627 | } | 606 | } |
607 | |||
628 | module_init(gred_module_init) | 608 | module_init(gred_module_init) |
629 | module_exit(gred_module_exit) | 609 | module_exit(gred_module_exit) |
610 | |||
630 | MODULE_LICENSE("GPL"); | 611 | MODULE_LICENSE("GPL"); |
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index bb9bf8d5003c..cdc8d283791c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c | |||
@@ -25,6 +25,8 @@ | |||
25 | 25 | ||
26 | #include <net/pkt_sched.h> | 26 | #include <net/pkt_sched.h> |
27 | 27 | ||
28 | #define VERSION "1.1" | ||
29 | |||
28 | /* Network Emulation Queuing algorithm. | 30 | /* Network Emulation Queuing algorithm. |
29 | ==================================== | 31 | ==================================== |
30 | 32 | ||
@@ -185,10 +187,13 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) | |||
185 | || q->counter < q->gap /* inside last reordering gap */ | 187 | || q->counter < q->gap /* inside last reordering gap */ |
186 | || q->reorder < get_crandom(&q->reorder_cor)) { | 188 | || q->reorder < get_crandom(&q->reorder_cor)) { |
187 | psched_time_t now; | 189 | psched_time_t now; |
190 | psched_tdiff_t delay; | ||
191 | |||
192 | delay = tabledist(q->latency, q->jitter, | ||
193 | &q->delay_cor, q->delay_dist); | ||
194 | |||
188 | PSCHED_GET_TIME(now); | 195 | PSCHED_GET_TIME(now); |
189 | PSCHED_TADD2(now, tabledist(q->latency, q->jitter, | 196 | PSCHED_TADD2(now, delay, cb->time_to_send); |
190 | &q->delay_cor, q->delay_dist), | ||
191 | cb->time_to_send); | ||
192 | ++q->counter; | 197 | ++q->counter; |
193 | ret = q->qdisc->enqueue(skb, q->qdisc); | 198 | ret = q->qdisc->enqueue(skb, q->qdisc); |
194 | } else { | 199 | } else { |
@@ -248,24 +253,31 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) | |||
248 | const struct netem_skb_cb *cb | 253 | const struct netem_skb_cb *cb |
249 | = (const struct netem_skb_cb *)skb->cb; | 254 | = (const struct netem_skb_cb *)skb->cb; |
250 | psched_time_t now; | 255 | psched_time_t now; |
251 | long delay; | ||
252 | 256 | ||
253 | /* if more time remaining? */ | 257 | /* if more time remaining? */ |
254 | PSCHED_GET_TIME(now); | 258 | PSCHED_GET_TIME(now); |
255 | delay = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now)); | 259 | |
256 | pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay); | 260 | if (PSCHED_TLESS(cb->time_to_send, now)) { |
257 | if (delay <= 0) { | ||
258 | pr_debug("netem_dequeue: return skb=%p\n", skb); | 261 | pr_debug("netem_dequeue: return skb=%p\n", skb); |
259 | sch->q.qlen--; | 262 | sch->q.qlen--; |
260 | sch->flags &= ~TCQ_F_THROTTLED; | 263 | sch->flags &= ~TCQ_F_THROTTLED; |
261 | return skb; | 264 | return skb; |
262 | } | 265 | } else { |
266 | psched_tdiff_t delay = PSCHED_TDIFF(cb->time_to_send, now); | ||
267 | |||
268 | if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) { | ||
269 | sch->qstats.drops++; | ||
263 | 270 | ||
264 | mod_timer(&q->timer, jiffies + delay); | 271 | /* After this qlen is confused */ |
265 | sch->flags |= TCQ_F_THROTTLED; | 272 | printk(KERN_ERR "netem: queue discpline %s could not requeue\n", |
273 | q->qdisc->ops->id); | ||
266 | 274 | ||
267 | if (q->qdisc->ops->requeue(skb, q->qdisc) != 0) | 275 | sch->q.qlen--; |
268 | sch->qstats.drops++; | 276 | } |
277 | |||
278 | mod_timer(&q->timer, jiffies + PSCHED_US2JIFFIE(delay)); | ||
279 | sch->flags |= TCQ_F_THROTTLED; | ||
280 | } | ||
269 | } | 281 | } |
270 | 282 | ||
271 | return NULL; | 283 | return NULL; |
@@ -290,11 +302,16 @@ static void netem_reset(struct Qdisc *sch) | |||
290 | del_timer_sync(&q->timer); | 302 | del_timer_sync(&q->timer); |
291 | } | 303 | } |
292 | 304 | ||
305 | /* Pass size change message down to embedded FIFO */ | ||
293 | static int set_fifo_limit(struct Qdisc *q, int limit) | 306 | static int set_fifo_limit(struct Qdisc *q, int limit) |
294 | { | 307 | { |
295 | struct rtattr *rta; | 308 | struct rtattr *rta; |
296 | int ret = -ENOMEM; | 309 | int ret = -ENOMEM; |
297 | 310 | ||
311 | /* Hack to avoid sending change message to non-FIFO */ | ||
312 | if (strncmp(q->ops->id + 1, "fifo", 4) != 0) | ||
313 | return 0; | ||
314 | |||
298 | rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); | 315 | rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); |
299 | if (rta) { | 316 | if (rta) { |
300 | rta->rta_type = RTM_NEWQDISC; | 317 | rta->rta_type = RTM_NEWQDISC; |
@@ -426,6 +443,84 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt) | |||
426 | return 0; | 443 | return 0; |
427 | } | 444 | } |
428 | 445 | ||
446 | /* | ||
447 | * Special case version of FIFO queue for use by netem. | ||
448 | * It queues in order based on timestamps in skb's | ||
449 | */ | ||
450 | struct fifo_sched_data { | ||
451 | u32 limit; | ||
452 | }; | ||
453 | |||
454 | static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) | ||
455 | { | ||
456 | struct fifo_sched_data *q = qdisc_priv(sch); | ||
457 | struct sk_buff_head *list = &sch->q; | ||
458 | const struct netem_skb_cb *ncb | ||
459 | = (const struct netem_skb_cb *)nskb->cb; | ||
460 | struct sk_buff *skb; | ||
461 | |||
462 | if (likely(skb_queue_len(list) < q->limit)) { | ||
463 | skb_queue_reverse_walk(list, skb) { | ||
464 | const struct netem_skb_cb *cb | ||
465 | = (const struct netem_skb_cb *)skb->cb; | ||
466 | |||
467 | if (PSCHED_TLESS(cb->time_to_send, ncb->time_to_send)) | ||
468 | break; | ||
469 | } | ||
470 | |||
471 | __skb_queue_after(list, skb, nskb); | ||
472 | |||
473 | sch->qstats.backlog += nskb->len; | ||
474 | sch->bstats.bytes += nskb->len; | ||
475 | sch->bstats.packets++; | ||
476 | |||
477 | return NET_XMIT_SUCCESS; | ||
478 | } | ||
479 | |||
480 | return qdisc_drop(nskb, sch); | ||
481 | } | ||
482 | |||
483 | static int tfifo_init(struct Qdisc *sch, struct rtattr *opt) | ||
484 | { | ||
485 | struct fifo_sched_data *q = qdisc_priv(sch); | ||
486 | |||
487 | if (opt) { | ||
488 | struct tc_fifo_qopt *ctl = RTA_DATA(opt); | ||
489 | if (RTA_PAYLOAD(opt) < sizeof(*ctl)) | ||
490 | return -EINVAL; | ||
491 | |||
492 | q->limit = ctl->limit; | ||
493 | } else | ||
494 | q->limit = max_t(u32, sch->dev->tx_queue_len, 1); | ||
495 | |||
496 | return 0; | ||
497 | } | ||
498 | |||
499 | static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb) | ||
500 | { | ||
501 | struct fifo_sched_data *q = qdisc_priv(sch); | ||
502 | struct tc_fifo_qopt opt = { .limit = q->limit }; | ||
503 | |||
504 | RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); | ||
505 | return skb->len; | ||
506 | |||
507 | rtattr_failure: | ||
508 | return -1; | ||
509 | } | ||
510 | |||
511 | static struct Qdisc_ops tfifo_qdisc_ops = { | ||
512 | .id = "tfifo", | ||
513 | .priv_size = sizeof(struct fifo_sched_data), | ||
514 | .enqueue = tfifo_enqueue, | ||
515 | .dequeue = qdisc_dequeue_head, | ||
516 | .requeue = qdisc_requeue, | ||
517 | .drop = qdisc_queue_drop, | ||
518 | .init = tfifo_init, | ||
519 | .reset = qdisc_reset_queue, | ||
520 | .change = tfifo_init, | ||
521 | .dump = tfifo_dump, | ||
522 | }; | ||
523 | |||
429 | static int netem_init(struct Qdisc *sch, struct rtattr *opt) | 524 | static int netem_init(struct Qdisc *sch, struct rtattr *opt) |
430 | { | 525 | { |
431 | struct netem_sched_data *q = qdisc_priv(sch); | 526 | struct netem_sched_data *q = qdisc_priv(sch); |
@@ -438,7 +533,7 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt) | |||
438 | q->timer.function = netem_watchdog; | 533 | q->timer.function = netem_watchdog; |
439 | q->timer.data = (unsigned long) sch; | 534 | q->timer.data = (unsigned long) sch; |
440 | 535 | ||
441 | q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); | 536 | q->qdisc = qdisc_create_dflt(sch->dev, &tfifo_qdisc_ops); |
442 | if (!q->qdisc) { | 537 | if (!q->qdisc) { |
443 | pr_debug("netem: qdisc create failed\n"); | 538 | pr_debug("netem: qdisc create failed\n"); |
444 | return -ENOMEM; | 539 | return -ENOMEM; |
@@ -601,6 +696,7 @@ static struct Qdisc_ops netem_qdisc_ops = { | |||
601 | 696 | ||
602 | static int __init netem_module_init(void) | 697 | static int __init netem_module_init(void) |
603 | { | 698 | { |
699 | pr_info("netem: version " VERSION "\n"); | ||
604 | return register_qdisc(&netem_qdisc_ops); | 700 | return register_qdisc(&netem_qdisc_ops); |
605 | } | 701 | } |
606 | static void __exit netem_module_exit(void) | 702 | static void __exit netem_module_exit(void) |
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 7845d045eec4..dccfa44c2d71 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c | |||
@@ -9,76 +9,23 @@ | |||
9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | 9 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
10 | * | 10 | * |
11 | * Changes: | 11 | * Changes: |
12 | * J Hadi Salim <hadi@nortel.com> 980914: computation fixes | 12 | * J Hadi Salim 980914: computation fixes |
13 | * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly. | 13 | * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly. |
14 | * J Hadi Salim <hadi@nortelnetworks.com> 980816: ECN support | 14 | * J Hadi Salim 980816: ECN support |
15 | */ | 15 | */ |
16 | 16 | ||
17 | #include <linux/config.h> | 17 | #include <linux/config.h> |
18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
19 | #include <asm/uaccess.h> | ||
20 | #include <asm/system.h> | ||
21 | #include <linux/bitops.h> | ||
22 | #include <linux/types.h> | 19 | #include <linux/types.h> |
23 | #include <linux/kernel.h> | 20 | #include <linux/kernel.h> |
24 | #include <linux/sched.h> | ||
25 | #include <linux/string.h> | ||
26 | #include <linux/mm.h> | ||
27 | #include <linux/socket.h> | ||
28 | #include <linux/sockios.h> | ||
29 | #include <linux/in.h> | ||
30 | #include <linux/errno.h> | ||
31 | #include <linux/interrupt.h> | ||
32 | #include <linux/if_ether.h> | ||
33 | #include <linux/inet.h> | ||
34 | #include <linux/netdevice.h> | 21 | #include <linux/netdevice.h> |
35 | #include <linux/etherdevice.h> | ||
36 | #include <linux/notifier.h> | ||
37 | #include <net/ip.h> | ||
38 | #include <net/route.h> | ||
39 | #include <linux/skbuff.h> | 22 | #include <linux/skbuff.h> |
40 | #include <net/sock.h> | ||
41 | #include <net/pkt_sched.h> | 23 | #include <net/pkt_sched.h> |
42 | #include <net/inet_ecn.h> | 24 | #include <net/inet_ecn.h> |
43 | #include <net/dsfield.h> | 25 | #include <net/red.h> |
44 | 26 | ||
45 | 27 | ||
46 | /* Random Early Detection (RED) algorithm. | 28 | /* Parameters, settable by user: |
47 | ======================================= | ||
48 | |||
49 | Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways | ||
50 | for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking. | ||
51 | |||
52 | This file codes a "divisionless" version of RED algorithm | ||
53 | as written down in Fig.17 of the paper. | ||
54 | |||
55 | Short description. | ||
56 | ------------------ | ||
57 | |||
58 | When a new packet arrives we calculate the average queue length: | ||
59 | |||
60 | avg = (1-W)*avg + W*current_queue_len, | ||
61 | |||
62 | W is the filter time constant (chosen as 2^(-Wlog)), it controls | ||
63 | the inertia of the algorithm. To allow larger bursts, W should be | ||
64 | decreased. | ||
65 | |||
66 | if (avg > th_max) -> packet marked (dropped). | ||
67 | if (avg < th_min) -> packet passes. | ||
68 | if (th_min < avg < th_max) we calculate probability: | ||
69 | |||
70 | Pb = max_P * (avg - th_min)/(th_max-th_min) | ||
71 | |||
72 | and mark (drop) packet with this probability. | ||
73 | Pb changes from 0 (at avg==th_min) to max_P (avg==th_max). | ||
74 | max_P should be small (not 1), usually 0.01..0.02 is good value. | ||
75 | |||
76 | max_P is chosen as a number, so that max_P/(th_max-th_min) | ||
77 | is a negative power of two in order arithmetics to contain | ||
78 | only shifts. | ||
79 | |||
80 | |||
81 | Parameters, settable by user: | ||
82 | ----------------------------- | 29 | ----------------------------- |
83 | 30 | ||
84 | limit - bytes (must be > qth_max + burst) | 31 | limit - bytes (must be > qth_max + burst) |
@@ -89,243 +36,93 @@ Short description. | |||
89 | arbitrarily high (well, less than ram size) | 36 | arbitrarily high (well, less than ram size) |
90 | Really, this limit will never be reached | 37 | Really, this limit will never be reached |
91 | if RED works correctly. | 38 | if RED works correctly. |
92 | |||
93 | qth_min - bytes (should be < qth_max/2) | ||
94 | qth_max - bytes (should be at least 2*qth_min and less limit) | ||
95 | Wlog - bits (<32) log(1/W). | ||
96 | Plog - bits (<32) | ||
97 | |||
98 | Plog is related to max_P by formula: | ||
99 | |||
100 | max_P = (qth_max-qth_min)/2^Plog; | ||
101 | |||
102 | F.e. if qth_max=128K and qth_min=32K, then Plog=22 | ||
103 | corresponds to max_P=0.02 | ||
104 | |||
105 | Scell_log | ||
106 | Stab | ||
107 | |||
108 | Lookup table for log((1-W)^(t/t_ave). | ||
109 | |||
110 | |||
111 | NOTES: | ||
112 | |||
113 | Upper bound on W. | ||
114 | ----------------- | ||
115 | |||
116 | If you want to allow bursts of L packets of size S, | ||
117 | you should choose W: | ||
118 | |||
119 | L + 1 - th_min/S < (1-(1-W)^L)/W | ||
120 | |||
121 | th_min/S = 32 th_min/S = 4 | ||
122 | |||
123 | log(W) L | ||
124 | -1 33 | ||
125 | -2 35 | ||
126 | -3 39 | ||
127 | -4 46 | ||
128 | -5 57 | ||
129 | -6 75 | ||
130 | -7 101 | ||
131 | -8 135 | ||
132 | -9 190 | ||
133 | etc. | ||
134 | */ | 39 | */ |
135 | 40 | ||
136 | struct red_sched_data | 41 | struct red_sched_data |
137 | { | 42 | { |
138 | /* Parameters */ | 43 | u32 limit; /* HARD maximal queue length */ |
139 | u32 limit; /* HARD maximal queue length */ | 44 | unsigned char flags; |
140 | u32 qth_min; /* Min average length threshold: A scaled */ | 45 | struct red_parms parms; |
141 | u32 qth_max; /* Max average length threshold: A scaled */ | 46 | struct red_stats stats; |
142 | u32 Rmask; | ||
143 | u32 Scell_max; | ||
144 | unsigned char flags; | ||
145 | char Wlog; /* log(W) */ | ||
146 | char Plog; /* random number bits */ | ||
147 | char Scell_log; | ||
148 | u8 Stab[256]; | ||
149 | |||
150 | /* Variables */ | ||
151 | unsigned long qave; /* Average queue length: A scaled */ | ||
152 | int qcount; /* Packets since last random number generation */ | ||
153 | u32 qR; /* Cached random number */ | ||
154 | |||
155 | psched_time_t qidlestart; /* Start of idle period */ | ||
156 | struct tc_red_xstats st; | ||
157 | }; | 47 | }; |
158 | 48 | ||
159 | static int red_ecn_mark(struct sk_buff *skb) | 49 | static inline int red_use_ecn(struct red_sched_data *q) |
160 | { | 50 | { |
161 | if (skb->nh.raw + 20 > skb->tail) | 51 | return q->flags & TC_RED_ECN; |
162 | return 0; | ||
163 | |||
164 | switch (skb->protocol) { | ||
165 | case __constant_htons(ETH_P_IP): | ||
166 | if (INET_ECN_is_not_ect(skb->nh.iph->tos)) | ||
167 | return 0; | ||
168 | IP_ECN_set_ce(skb->nh.iph); | ||
169 | return 1; | ||
170 | case __constant_htons(ETH_P_IPV6): | ||
171 | if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h))) | ||
172 | return 0; | ||
173 | IP6_ECN_set_ce(skb->nh.ipv6h); | ||
174 | return 1; | ||
175 | default: | ||
176 | return 0; | ||
177 | } | ||
178 | } | 52 | } |
179 | 53 | ||
180 | static int | 54 | static inline int red_use_harddrop(struct red_sched_data *q) |
181 | red_enqueue(struct sk_buff *skb, struct Qdisc* sch) | 55 | { |
56 | return q->flags & TC_RED_HARDDROP; | ||
57 | } | ||
58 | |||
59 | static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch) | ||
182 | { | 60 | { |
183 | struct red_sched_data *q = qdisc_priv(sch); | 61 | struct red_sched_data *q = qdisc_priv(sch); |
184 | 62 | ||
185 | psched_time_t now; | 63 | q->parms.qavg = red_calc_qavg(&q->parms, sch->qstats.backlog); |
186 | 64 | ||
187 | if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { | 65 | if (red_is_idling(&q->parms)) |
188 | long us_idle; | 66 | red_end_of_idle_period(&q->parms); |
189 | int shift; | ||
190 | 67 | ||
191 | PSCHED_GET_TIME(now); | 68 | switch (red_action(&q->parms, q->parms.qavg)) { |
192 | us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); | 69 | case RED_DONT_MARK: |
193 | PSCHED_SET_PASTPERFECT(q->qidlestart); | 70 | break; |
194 | 71 | ||
195 | /* | 72 | case RED_PROB_MARK: |
196 | The problem: ideally, average length queue recalcultion should | 73 | sch->qstats.overlimits++; |
197 | be done over constant clock intervals. This is too expensive, so that | 74 | if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) { |
198 | the calculation is driven by outgoing packets. | 75 | q->stats.prob_drop++; |
199 | When the queue is idle we have to model this clock by hand. | 76 | goto congestion_drop; |
200 | 77 | } | |
201 | SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth) | ||
202 | dummy packets as a burst after idle time, i.e. | ||
203 | |||
204 | q->qave *= (1-W)^m | ||
205 | |||
206 | This is an apparently overcomplicated solution (f.e. we have to precompute | ||
207 | a table to make this calculation in reasonable time) | ||
208 | I believe that a simpler model may be used here, | ||
209 | but it is field for experiments. | ||
210 | */ | ||
211 | shift = q->Stab[us_idle>>q->Scell_log]; | ||
212 | |||
213 | if (shift) { | ||
214 | q->qave >>= shift; | ||
215 | } else { | ||
216 | /* Approximate initial part of exponent | ||
217 | with linear function: | ||
218 | (1-W)^m ~= 1-mW + ... | ||
219 | |||
220 | Seems, it is the best solution to | ||
221 | problem of too coarce exponent tabulation. | ||
222 | */ | ||
223 | |||
224 | us_idle = (q->qave * us_idle)>>q->Scell_log; | ||
225 | if (us_idle < q->qave/2) | ||
226 | q->qave -= us_idle; | ||
227 | else | ||
228 | q->qave >>= 1; | ||
229 | } | ||
230 | } else { | ||
231 | q->qave += sch->qstats.backlog - (q->qave >> q->Wlog); | ||
232 | /* NOTE: | ||
233 | q->qave is fixed point number with point at Wlog. | ||
234 | The formulae above is equvalent to floating point | ||
235 | version: | ||
236 | |||
237 | qave = qave*(1-W) + sch->qstats.backlog*W; | ||
238 | --ANK (980924) | ||
239 | */ | ||
240 | } | ||
241 | 78 | ||
242 | if (q->qave < q->qth_min) { | 79 | q->stats.prob_mark++; |
243 | q->qcount = -1; | 80 | break; |
244 | enqueue: | 81 | |
245 | if (sch->qstats.backlog + skb->len <= q->limit) { | 82 | case RED_HARD_MARK: |
246 | __skb_queue_tail(&sch->q, skb); | 83 | sch->qstats.overlimits++; |
247 | sch->qstats.backlog += skb->len; | 84 | if (red_use_harddrop(q) || !red_use_ecn(q) || |
248 | sch->bstats.bytes += skb->len; | 85 | !INET_ECN_set_ce(skb)) { |
249 | sch->bstats.packets++; | 86 | q->stats.forced_drop++; |
250 | return NET_XMIT_SUCCESS; | 87 | goto congestion_drop; |
251 | } else { | 88 | } |
252 | q->st.pdrop++; | ||
253 | } | ||
254 | kfree_skb(skb); | ||
255 | sch->qstats.drops++; | ||
256 | return NET_XMIT_DROP; | ||
257 | } | ||
258 | if (q->qave >= q->qth_max) { | ||
259 | q->qcount = -1; | ||
260 | sch->qstats.overlimits++; | ||
261 | mark: | ||
262 | if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) { | ||
263 | q->st.early++; | ||
264 | goto drop; | ||
265 | } | ||
266 | q->st.marked++; | ||
267 | goto enqueue; | ||
268 | } | ||
269 | 89 | ||
270 | if (++q->qcount) { | 90 | q->stats.forced_mark++; |
271 | /* The formula used below causes questions. | 91 | break; |
272 | |||
273 | OK. qR is random number in the interval 0..Rmask | ||
274 | i.e. 0..(2^Plog). If we used floating point | ||
275 | arithmetics, it would be: (2^Plog)*rnd_num, | ||
276 | where rnd_num is less 1. | ||
277 | |||
278 | Taking into account, that qave have fixed | ||
279 | point at Wlog, and Plog is related to max_P by | ||
280 | max_P = (qth_max-qth_min)/2^Plog; two lines | ||
281 | below have the following floating point equivalent: | ||
282 | |||
283 | max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount | ||
284 | |||
285 | Any questions? --ANK (980924) | ||
286 | */ | ||
287 | if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR) | ||
288 | goto enqueue; | ||
289 | q->qcount = 0; | ||
290 | q->qR = net_random()&q->Rmask; | ||
291 | sch->qstats.overlimits++; | ||
292 | goto mark; | ||
293 | } | 92 | } |
294 | q->qR = net_random()&q->Rmask; | ||
295 | goto enqueue; | ||
296 | 93 | ||
297 | drop: | 94 | if (sch->qstats.backlog + skb->len <= q->limit) |
298 | kfree_skb(skb); | 95 | return qdisc_enqueue_tail(skb, sch); |
299 | sch->qstats.drops++; | 96 | |
97 | q->stats.pdrop++; | ||
98 | return qdisc_drop(skb, sch); | ||
99 | |||
100 | congestion_drop: | ||
101 | qdisc_drop(skb, sch); | ||
300 | return NET_XMIT_CN; | 102 | return NET_XMIT_CN; |
301 | } | 103 | } |
302 | 104 | ||
303 | static int | 105 | static int red_requeue(struct sk_buff *skb, struct Qdisc* sch) |
304 | red_requeue(struct sk_buff *skb, struct Qdisc* sch) | ||
305 | { | 106 | { |
306 | struct red_sched_data *q = qdisc_priv(sch); | 107 | struct red_sched_data *q = qdisc_priv(sch); |
307 | 108 | ||
308 | PSCHED_SET_PASTPERFECT(q->qidlestart); | 109 | if (red_is_idling(&q->parms)) |
110 | red_end_of_idle_period(&q->parms); | ||
309 | 111 | ||
310 | __skb_queue_head(&sch->q, skb); | 112 | return qdisc_requeue(skb, sch); |
311 | sch->qstats.backlog += skb->len; | ||
312 | sch->qstats.requeues++; | ||
313 | return 0; | ||
314 | } | 113 | } |
315 | 114 | ||
316 | static struct sk_buff * | 115 | static struct sk_buff * red_dequeue(struct Qdisc* sch) |
317 | red_dequeue(struct Qdisc* sch) | ||
318 | { | 116 | { |
319 | struct sk_buff *skb; | 117 | struct sk_buff *skb; |
320 | struct red_sched_data *q = qdisc_priv(sch); | 118 | struct red_sched_data *q = qdisc_priv(sch); |
321 | 119 | ||
322 | skb = __skb_dequeue(&sch->q); | 120 | skb = qdisc_dequeue_head(sch); |
323 | if (skb) { | 121 | |
324 | sch->qstats.backlog -= skb->len; | 122 | if (skb == NULL && !red_is_idling(&q->parms)) |
325 | return skb; | 123 | red_start_of_idle_period(&q->parms); |
326 | } | 124 | |
327 | PSCHED_GET_TIME(q->qidlestart); | 125 | return skb; |
328 | return NULL; | ||
329 | } | 126 | } |
330 | 127 | ||
331 | static unsigned int red_drop(struct Qdisc* sch) | 128 | static unsigned int red_drop(struct Qdisc* sch) |
@@ -333,16 +130,17 @@ static unsigned int red_drop(struct Qdisc* sch) | |||
333 | struct sk_buff *skb; | 130 | struct sk_buff *skb; |
334 | struct red_sched_data *q = qdisc_priv(sch); | 131 | struct red_sched_data *q = qdisc_priv(sch); |
335 | 132 | ||
336 | skb = __skb_dequeue_tail(&sch->q); | 133 | skb = qdisc_dequeue_tail(sch); |
337 | if (skb) { | 134 | if (skb) { |
338 | unsigned int len = skb->len; | 135 | unsigned int len = skb->len; |
339 | sch->qstats.backlog -= len; | 136 | q->stats.other++; |
340 | sch->qstats.drops++; | 137 | qdisc_drop(skb, sch); |
341 | q->st.other++; | ||
342 | kfree_skb(skb); | ||
343 | return len; | 138 | return len; |
344 | } | 139 | } |
345 | PSCHED_GET_TIME(q->qidlestart); | 140 | |
141 | if (!red_is_idling(&q->parms)) | ||
142 | red_start_of_idle_period(&q->parms); | ||
143 | |||
346 | return 0; | 144 | return 0; |
347 | } | 145 | } |
348 | 146 | ||
@@ -350,43 +148,38 @@ static void red_reset(struct Qdisc* sch) | |||
350 | { | 148 | { |
351 | struct red_sched_data *q = qdisc_priv(sch); | 149 | struct red_sched_data *q = qdisc_priv(sch); |
352 | 150 | ||
353 | __skb_queue_purge(&sch->q); | 151 | qdisc_reset_queue(sch); |
354 | sch->qstats.backlog = 0; | 152 | red_restart(&q->parms); |
355 | PSCHED_SET_PASTPERFECT(q->qidlestart); | ||
356 | q->qave = 0; | ||
357 | q->qcount = -1; | ||
358 | } | 153 | } |
359 | 154 | ||
360 | static int red_change(struct Qdisc *sch, struct rtattr *opt) | 155 | static int red_change(struct Qdisc *sch, struct rtattr *opt) |
361 | { | 156 | { |
362 | struct red_sched_data *q = qdisc_priv(sch); | 157 | struct red_sched_data *q = qdisc_priv(sch); |
363 | struct rtattr *tb[TCA_RED_STAB]; | 158 | struct rtattr *tb[TCA_RED_MAX]; |
364 | struct tc_red_qopt *ctl; | 159 | struct tc_red_qopt *ctl; |
365 | 160 | ||
366 | if (opt == NULL || | 161 | if (opt == NULL || rtattr_parse_nested(tb, TCA_RED_MAX, opt)) |
367 | rtattr_parse_nested(tb, TCA_RED_STAB, opt) || | 162 | return -EINVAL; |
368 | tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 || | 163 | |
164 | if (tb[TCA_RED_PARMS-1] == NULL || | ||
369 | RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) || | 165 | RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) || |
370 | RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256) | 166 | tb[TCA_RED_STAB-1] == NULL || |
167 | RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < RED_STAB_SIZE) | ||
371 | return -EINVAL; | 168 | return -EINVAL; |
372 | 169 | ||
373 | ctl = RTA_DATA(tb[TCA_RED_PARMS-1]); | 170 | ctl = RTA_DATA(tb[TCA_RED_PARMS-1]); |
374 | 171 | ||
375 | sch_tree_lock(sch); | 172 | sch_tree_lock(sch); |
376 | q->flags = ctl->flags; | 173 | q->flags = ctl->flags; |
377 | q->Wlog = ctl->Wlog; | ||
378 | q->Plog = ctl->Plog; | ||
379 | q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; | ||
380 | q->Scell_log = ctl->Scell_log; | ||
381 | q->Scell_max = (255<<q->Scell_log); | ||
382 | q->qth_min = ctl->qth_min<<ctl->Wlog; | ||
383 | q->qth_max = ctl->qth_max<<ctl->Wlog; | ||
384 | q->limit = ctl->limit; | 174 | q->limit = ctl->limit; |
385 | memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256); | ||
386 | 175 | ||
387 | q->qcount = -1; | 176 | red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, |
177 | ctl->Plog, ctl->Scell_log, | ||
178 | RTA_DATA(tb[TCA_RED_STAB-1])); | ||
179 | |||
388 | if (skb_queue_empty(&sch->q)) | 180 | if (skb_queue_empty(&sch->q)) |
389 | PSCHED_SET_PASTPERFECT(q->qidlestart); | 181 | red_end_of_idle_period(&q->parms); |
182 | |||
390 | sch_tree_unlock(sch); | 183 | sch_tree_unlock(sch); |
391 | return 0; | 184 | return 0; |
392 | } | 185 | } |
@@ -399,39 +192,39 @@ static int red_init(struct Qdisc* sch, struct rtattr *opt) | |||
399 | static int red_dump(struct Qdisc *sch, struct sk_buff *skb) | 192 | static int red_dump(struct Qdisc *sch, struct sk_buff *skb) |
400 | { | 193 | { |
401 | struct red_sched_data *q = qdisc_priv(sch); | 194 | struct red_sched_data *q = qdisc_priv(sch); |
402 | unsigned char *b = skb->tail; | 195 | struct rtattr *opts = NULL; |
403 | struct rtattr *rta; | 196 | struct tc_red_qopt opt = { |
404 | struct tc_red_qopt opt; | 197 | .limit = q->limit, |
405 | 198 | .flags = q->flags, | |
406 | rta = (struct rtattr*)b; | 199 | .qth_min = q->parms.qth_min >> q->parms.Wlog, |
407 | RTA_PUT(skb, TCA_OPTIONS, 0, NULL); | 200 | .qth_max = q->parms.qth_max >> q->parms.Wlog, |
408 | opt.limit = q->limit; | 201 | .Wlog = q->parms.Wlog, |
409 | opt.qth_min = q->qth_min>>q->Wlog; | 202 | .Plog = q->parms.Plog, |
410 | opt.qth_max = q->qth_max>>q->Wlog; | 203 | .Scell_log = q->parms.Scell_log, |
411 | opt.Wlog = q->Wlog; | 204 | }; |
412 | opt.Plog = q->Plog; | 205 | |
413 | opt.Scell_log = q->Scell_log; | 206 | opts = RTA_NEST(skb, TCA_OPTIONS); |
414 | opt.flags = q->flags; | ||
415 | RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); | 207 | RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); |
416 | rta->rta_len = skb->tail - b; | 208 | return RTA_NEST_END(skb, opts); |
417 | |||
418 | return skb->len; | ||
419 | 209 | ||
420 | rtattr_failure: | 210 | rtattr_failure: |
421 | skb_trim(skb, b - skb->data); | 211 | return RTA_NEST_CANCEL(skb, opts); |
422 | return -1; | ||
423 | } | 212 | } |
424 | 213 | ||
425 | static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) | 214 | static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) |
426 | { | 215 | { |
427 | struct red_sched_data *q = qdisc_priv(sch); | 216 | struct red_sched_data *q = qdisc_priv(sch); |
428 | 217 | struct tc_red_xstats st = { | |
429 | return gnet_stats_copy_app(d, &q->st, sizeof(q->st)); | 218 | .early = q->stats.prob_drop + q->stats.forced_drop, |
219 | .pdrop = q->stats.pdrop, | ||
220 | .other = q->stats.other, | ||
221 | .marked = q->stats.prob_mark + q->stats.forced_mark, | ||
222 | }; | ||
223 | |||
224 | return gnet_stats_copy_app(d, &st, sizeof(st)); | ||
430 | } | 225 | } |
431 | 226 | ||
432 | static struct Qdisc_ops red_qdisc_ops = { | 227 | static struct Qdisc_ops red_qdisc_ops = { |
433 | .next = NULL, | ||
434 | .cl_ops = NULL, | ||
435 | .id = "red", | 228 | .id = "red", |
436 | .priv_size = sizeof(struct red_sched_data), | 229 | .priv_size = sizeof(struct red_sched_data), |
437 | .enqueue = red_enqueue, | 230 | .enqueue = red_enqueue, |
@@ -450,10 +243,13 @@ static int __init red_module_init(void) | |||
450 | { | 243 | { |
451 | return register_qdisc(&red_qdisc_ops); | 244 | return register_qdisc(&red_qdisc_ops); |
452 | } | 245 | } |
453 | static void __exit red_module_exit(void) | 246 | |
247 | static void __exit red_module_exit(void) | ||
454 | { | 248 | { |
455 | unregister_qdisc(&red_qdisc_ops); | 249 | unregister_qdisc(&red_qdisc_ops); |
456 | } | 250 | } |
251 | |||
457 | module_init(red_module_init) | 252 | module_init(red_module_init) |
458 | module_exit(red_module_exit) | 253 | module_exit(red_module_exit) |
254 | |||
459 | MODULE_LICENSE("GPL"); | 255 | MODULE_LICENSE("GPL"); |