diff options
author | Alexander Frolkin <avf@eldamar.org.uk> | 2013-06-19 05:54:25 -0400 |
---|---|---|
committer | Simon Horman <horms@verge.net.au> | 2013-06-26 05:01:46 -0400 |
commit | eba3b5a78799d21dea05118b294524958f0ab592 (patch) | |
tree | 2f84838b89f17c7b10cee8fa618f232c64e9fc1d /net | |
parent | acaac5d8bbedf6bd96f53960780942e1ad90d70e (diff) |
ipvs: SH fallback and L4 hashing
By default the SH scheduler rejects connections that are hashed onto a
realserver of weight 0. This patch adds a flag to make SH choose a
different realserver in this case, instead of rejecting the connection.
The patch also adds a flag to make SH include the source port (TCP, UDP,
SCTP) in the hash as well as the source address. This basically allows
for deterministic round-robin load balancing (i.e., where any director
in a cluster of directors with identical config will send the same
packet the same way).
The flags are service flags (IP_VS_SVC_F_SCHED*) so that these options
can be set per service. They are set using a new option to ipvsadm.
Signed-off-by: Alexander Frolkin <avf@eldamar.org.uk>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
Diffstat (limited to 'net')
-rw-r--r-- | net/netfilter/ipvs/ip_vs_sh.c | 100 |
1 file changed, 85 insertions, 15 deletions
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index e0d5d1653566..f16c027df15b 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c | |||
@@ -48,6 +48,10 @@ | |||
48 | 48 | ||
49 | #include <net/ip_vs.h> | 49 | #include <net/ip_vs.h> |
50 | 50 | ||
51 | #include <net/tcp.h> | ||
52 | #include <linux/udp.h> | ||
53 | #include <linux/sctp.h> | ||
54 | |||
51 | 55 | ||
52 | /* | 56 | /* |
53 | * IPVS SH bucket | 57 | * IPVS SH bucket |
@@ -71,10 +75,19 @@ struct ip_vs_sh_state { | |||
71 | struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE]; | 75 | struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE]; |
72 | }; | 76 | }; |
73 | 77 | ||
78 | /* Helper function to determine if server is unavailable */ | ||
79 | static inline bool is_unavailable(struct ip_vs_dest *dest) | ||
80 | { | ||
81 | return atomic_read(&dest->weight) <= 0 || | ||
82 | dest->flags & IP_VS_DEST_F_OVERLOAD; | ||
83 | } | ||
84 | |||
74 | /* | 85 | /* |
75 | * Returns hash value for IPVS SH entry | 86 | * Returns hash value for IPVS SH entry |
76 | */ | 87 | */ |
77 | static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr) | 88 | static inline unsigned int |
89 | ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr, | ||
90 | __be16 port, unsigned int offset) | ||
78 | { | 91 | { |
79 | __be32 addr_fold = addr->ip; | 92 | __be32 addr_fold = addr->ip; |
80 | 93 | ||
@@ -83,7 +96,8 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad | |||
83 | addr_fold = addr->ip6[0]^addr->ip6[1]^ | 96 | addr_fold = addr->ip6[0]^addr->ip6[1]^ |
84 | addr->ip6[2]^addr->ip6[3]; | 97 | addr->ip6[2]^addr->ip6[3]; |
85 | #endif | 98 | #endif |
86 | return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK; | 99 | return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) & |
100 | IP_VS_SH_TAB_MASK; | ||
87 | } | 101 | } |
88 | 102 | ||
89 | 103 | ||
@@ -91,12 +105,42 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad | |||
91 | * Get ip_vs_dest associated with supplied parameters. | 105 | * Get ip_vs_dest associated with supplied parameters. |
92 | */ | 106 | */ |
93 | static inline struct ip_vs_dest * | 107 | static inline struct ip_vs_dest * |
94 | ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr) | 108 | ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s, |
109 | const union nf_inet_addr *addr, __be16 port) | ||
95 | { | 110 | { |
96 | return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest); | 111 | unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0); |
112 | struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest); | ||
113 | |||
114 | return (!dest || is_unavailable(dest)) ? NULL : dest; | ||
97 | } | 115 | } |
98 | 116 | ||
99 | 117 | ||
118 | /* As ip_vs_sh_get, but with fallback if selected server is unavailable */ | ||
119 | static inline struct ip_vs_dest * | ||
120 | ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s, | ||
121 | const union nf_inet_addr *addr, __be16 port) | ||
122 | { | ||
123 | unsigned int offset; | ||
124 | unsigned int hash; | ||
125 | struct ip_vs_dest *dest; | ||
126 | |||
127 | for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) { | ||
128 | hash = ip_vs_sh_hashkey(svc->af, addr, port, offset); | ||
129 | dest = rcu_dereference(s->buckets[hash].dest); | ||
130 | if (!dest) | ||
131 | break; | ||
132 | if (is_unavailable(dest)) | ||
133 | IP_VS_DBG_BUF(6, "SH: selected unavailable server " | ||
134 | "%s:%d (offset %d)", | ||
135 | IP_VS_DBG_ADDR(svc->af, &dest->addr), | ||
136 | ntohs(dest->port), offset); | ||
137 | else | ||
138 | return dest; | ||
139 | } | ||
140 | |||
141 | return NULL; | ||
142 | } | ||
143 | |||
100 | /* | 144 | /* |
101 | * Assign all the hash buckets of the specified table with the service. | 145 | * Assign all the hash buckets of the specified table with the service. |
102 | */ | 146 | */ |
@@ -213,13 +257,33 @@ static int ip_vs_sh_dest_changed(struct ip_vs_service *svc, | |||
213 | } | 257 | } |
214 | 258 | ||
215 | 259 | ||
216 | /* | 260 | /* Helper function to get port number */ |
217 | * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, | 261 | static inline __be16 |
218 | * consider that the server is overloaded here. | 262 | ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) |
219 | */ | ||
220 | static inline int is_overloaded(struct ip_vs_dest *dest) | ||
221 | { | 263 | { |
222 | return dest->flags & IP_VS_DEST_F_OVERLOAD; | 264 | __be16 port; |
265 | struct tcphdr _tcph, *th; | ||
266 | struct udphdr _udph, *uh; | ||
267 | sctp_sctphdr_t _sctph, *sh; | ||
268 | |||
269 | switch (iph->protocol) { | ||
270 | case IPPROTO_TCP: | ||
271 | th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); | ||
272 | port = th->source; | ||
273 | break; | ||
274 | case IPPROTO_UDP: | ||
275 | uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); | ||
276 | port = uh->source; | ||
277 | break; | ||
278 | case IPPROTO_SCTP: | ||
279 | sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); | ||
280 | port = sh->source; | ||
281 | break; | ||
282 | default: | ||
283 | port = 0; | ||
284 | } | ||
285 | |||
286 | return port; | ||
223 | } | 287 | } |
224 | 288 | ||
225 | 289 | ||
@@ -232,15 +296,21 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, | |||
232 | { | 296 | { |
233 | struct ip_vs_dest *dest; | 297 | struct ip_vs_dest *dest; |
234 | struct ip_vs_sh_state *s; | 298 | struct ip_vs_sh_state *s; |
299 | __be16 port = 0; | ||
235 | 300 | ||
236 | IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); | 301 | IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); |
237 | 302 | ||
303 | if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT) | ||
304 | port = ip_vs_sh_get_port(skb, iph); | ||
305 | |||
238 | s = (struct ip_vs_sh_state *) svc->sched_data; | 306 | s = (struct ip_vs_sh_state *) svc->sched_data; |
239 | dest = ip_vs_sh_get(svc->af, s, &iph->saddr); | 307 | |
240 | if (!dest | 308 | if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK) |
241 | || !(dest->flags & IP_VS_DEST_F_AVAILABLE) | 309 | dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port); |
242 | || atomic_read(&dest->weight) <= 0 | 310 | else |
243 | || is_overloaded(dest)) { | 311 | dest = ip_vs_sh_get(svc, s, &iph->saddr, port); |
312 | |||
313 | if (!dest) { | ||
244 | ip_vs_scheduler_err(svc, "no destination available"); | 314 | ip_vs_scheduler_err(svc, "no destination available"); |
245 | return NULL; | 315 | return NULL; |
246 | } | 316 | } |