path: root/net/ipv4/ipvs
author    Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
committer Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
commit    1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree      0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/ipv4/ipvs
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'net/ipv4/ipvs')
-rw-r--r--  net/ipv4/ipvs/Kconfig              244
-rw-r--r--  net/ipv4/ipvs/Makefile              34
-rw-r--r--  net/ipv4/ipvs/ip_vs_app.c          658
-rw-r--r--  net/ipv4/ipvs/ip_vs_conn.c         920
-rw-r--r--  net/ipv4/ipvs/ip_vs_core.c        1191
-rw-r--r--  net/ipv4/ipvs/ip_vs_ctl.c         2391
-rw-r--r--  net/ipv4/ipvs/ip_vs_dh.c           258
-rw-r--r--  net/ipv4/ipvs/ip_vs_est.c          200
-rw-r--r--  net/ipv4/ipvs/ip_vs_ftp.c          400
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblc.c         624
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblcr.c        888
-rw-r--r--  net/ipv4/ipvs/ip_vs_lc.c           123
-rw-r--r--  net/ipv4/ipvs/ip_vs_nq.c           161
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto.c        244
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_ah.c     177
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_esp.c    175
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_icmp.c   182
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_tcp.c    640
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_udp.c    427
-rw-r--r--  net/ipv4/ipvs/ip_vs_rr.c           118
-rw-r--r--  net/ipv4/ipvs/ip_vs_sched.c        251
-rw-r--r--  net/ipv4/ipvs/ip_vs_sed.c          163
-rw-r--r--  net/ipv4/ipvs/ip_vs_sh.c           255
-rw-r--r--  net/ipv4/ipvs/ip_vs_sync.c         892
-rw-r--r--  net/ipv4/ipvs/ip_vs_wlc.c          151
-rw-r--r--  net/ipv4/ipvs/ip_vs_wrr.c          235
-rw-r--r--  net/ipv4/ipvs/ip_vs_xmit.c         563
27 files changed, 12565 insertions, 0 deletions
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
new file mode 100644
index 000000000000..63a82b4b64bb
--- /dev/null
+++ b/net/ipv4/ipvs/Kconfig
@@ -0,0 +1,244 @@
1#
2# IP Virtual Server configuration
3#
4menu "IP: Virtual Server Configuration"
5 depends on INET && NETFILTER
6
7config IP_VS
8 tristate "IP virtual server support (EXPERIMENTAL)"
9 depends on INET && NETFILTER
10 ---help---
11 IP Virtual Server support will let you build a high-performance
 12 virtual server based on a cluster of two or more real servers. This
13 option must be enabled for at least one of the clustered computers
14 that will take care of intercepting incoming connections to a
15 single IP address and scheduling them to real servers.
16
 17 Three request dispatching techniques are implemented: virtual
 18 server via NAT, virtual server via tunneling, and virtual server
 19 via direct routing. Several scheduling algorithms can be used to
 20 choose which server a connection is directed to, so that load is
 21 balanced among the servers. For more
22 information and its administration program, please visit the
23 following URL: <http://www.linuxvirtualserver.org/>.
24
 25 If you want to compile it into the kernel, say Y. To compile it as a
26 module, choose M here. If unsure, say N.
27
28config IP_VS_DEBUG
29 bool "IP virtual server debugging"
30 depends on IP_VS
31 ---help---
32 Say Y here if you want to get additional messages useful in
33 debugging the IP virtual server code. You can change the debug
34 level in /proc/sys/net/ipv4/vs/debug_level
35
36config IP_VS_TAB_BITS
37 int "IPVS connection table size (the Nth power of 2)"
38 depends on IP_VS
39 default "12"
40 ---help---
41 The IPVS connection hash table uses the chaining scheme to handle
42 hash collisions. Using a big IPVS connection hash table will greatly
43 reduce conflicts when there are hundreds of thousands of connections
44 in the hash table.
45
 46 Note that the table size must be a power of 2. The table size will
 47 be 2 raised to the power of the number you enter here. The number to
 48 choose is from 8 to 20; the default is 12, which means a table size
 49 of 4096. Don't choose a number that is too small, or you will lose
 50 performance. You can adapt the table size to your virtual server
 51 application: it is good to set the table size not far below the
 52 number of connections per second multiplied by the average time a
 53 connection stays in the table. For example, if your virtual server
 54 gets 200 connections per second and a connection stays in the table
 55 for 200 seconds on average, the table size should be not far below
 56 200x200, so setting the table size to 32768 (2**15) is a good
 57 choice.
58
 59 Note also that each connection effectively occupies 128 bytes and
 60 each hash table entry uses 8 bytes, so you can estimate how much
 61 memory is needed for your box.
62
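
 To make the sizing rule above concrete (illustrative figures only): a box
 receiving 200 connections per second, with connections staying in the table
 for 200 seconds on average, needs room for roughly 200 x 200 = 40000
 concurrent entries, so a value of 15 (2**15 = 32768 buckets) is a sensible
 choice there. The memory cost can then be estimated as 32768 x 8 bytes for
 the table itself (about 256 Kbytes) plus about 128 bytes per tracked
 connection (around 5 Mbytes for 40000 connections).
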
63comment "IPVS transport protocol load balancing support"
64 depends on IP_VS
65
66config IP_VS_PROTO_TCP
67 bool "TCP load balancing support"
68 depends on IP_VS
69 ---help---
 70 This option enables support for load balancing the TCP transport
 71 protocol. Say Y if unsure.
72
73config IP_VS_PROTO_UDP
74 bool "UDP load balancing support"
75 depends on IP_VS
76 ---help---
 77 This option enables support for load balancing the UDP transport
 78 protocol. Say Y if unsure.
79
80config IP_VS_PROTO_ESP
81 bool "ESP load balancing support"
82 depends on IP_VS
83 ---help---
 84 This option enables support for load balancing the ESP (Encapsulating
 85 Security Payload) transport protocol. Say Y if unsure.
86
87config IP_VS_PROTO_AH
88 bool "AH load balancing support"
89 depends on IP_VS
90 ---help---
 91 This option enables support for load balancing the AH (Authentication
 92 Header) transport protocol. Say Y if unsure.
93
94comment "IPVS scheduler"
95 depends on IP_VS
96
97config IP_VS_RR
98 tristate "round-robin scheduling"
99 depends on IP_VS
100 ---help---
 101 The round-robin scheduling algorithm simply directs network
102 connections to different real servers in a round-robin manner.
103
 104 If you want to compile it into the kernel, say Y. To compile it as a
105 module, choose M here. If unsure, say N.
106
107config IP_VS_WRR
108 tristate "weighted round-robin scheduling"
109 depends on IP_VS
110 ---help---
 111 The weighted round-robin scheduling algorithm directs network
 112 connections to different real servers based on server weights
 113 in a round-robin manner. Servers with higher weights receive
 114 new connections before those with lower weights, servers with
 115 higher weights get more connections than those with lower weights,
 116 and servers with equal weights get an equal share of connections.
117
 118 If you want to compile it into the kernel, say Y. To compile it as a
119 module, choose M here. If unsure, say N.
120
121config IP_VS_LC
122 tristate "least-connection scheduling"
123 depends on IP_VS
124 ---help---
125 The least-connection scheduling algorithm directs network
126 connections to the server with the least number of active
127 connections.
128
 129 If you want to compile it into the kernel, say Y. To compile it as a
130 module, choose M here. If unsure, say N.
131
132config IP_VS_WLC
133 tristate "weighted least-connection scheduling"
134 depends on IP_VS
135 ---help---
136 The weighted least-connection scheduling algorithm directs network
137 connections to the server with the least active connections
138 normalized by the server weight.
139
 140 If you want to compile it into the kernel, say Y. To compile it as a
141 module, choose M here. If unsure, say N.
142
143config IP_VS_LBLC
144 tristate "locality-based least-connection scheduling"
145 depends on IP_VS
146 ---help---
 147 The locality-based least-connection scheduling algorithm is for
 148 destination-IP load balancing. It is usually used in cache clusters.
 149 This algorithm normally directs packets destined for an IP address
 150 to its assigned server if that server is alive and not overloaded.
 151 If the server is overloaded (its number of active connections is
 152 larger than its weight) and there is a server running at half of
 153 its load, the weighted least-connection server is assigned to this IP address.
154
 155 If you want to compile it into the kernel, say Y. To compile it as a
156 module, choose M here. If unsure, say N.
157
158config IP_VS_LBLCR
159 tristate "locality-based least-connection with replication scheduling"
160 depends on IP_VS
161 ---help---
 162 The locality-based least-connection with replication scheduling
 163 algorithm is also for destination-IP load balancing. It is
 164 usually used in cache clusters. It differs from LBLC scheduling
 165 as follows: the load balancer maintains mappings from a target
 166 to a set of server nodes that can serve the target. Requests for
 167 a target are assigned to the least-connection node in the target's
 168 server set. If all the nodes in the server set are overloaded,
 169 it picks the least-connection node in the cluster and adds it
 170 to the server set for the target. If the server set has not been
 171 modified for the specified time, the most loaded node is removed
 172 from the server set, in order to avoid a high degree of replication.
173
 174 If you want to compile it into the kernel, say Y. To compile it as a
175 module, choose M here. If unsure, say N.
176
177config IP_VS_DH
178 tristate "destination hashing scheduling"
179 depends on IP_VS
180 ---help---
 181 The destination hashing scheduling algorithm assigns network
 182 connections to the servers by looking up a statically assigned
 183 hash table by their destination IP addresses.
184
 185 If you want to compile it into the kernel, say Y. To compile it as a
186 module, choose M here. If unsure, say N.
187
188config IP_VS_SH
189 tristate "source hashing scheduling"
190 depends on IP_VS
191 ---help---
 192 The source hashing scheduling algorithm assigns network
 193 connections to the servers by looking up a statically assigned
 194 hash table by their source IP addresses.
195
 196 If you want to compile it into the kernel, say Y. To compile it as a
197 module, choose M here. If unsure, say N.
198
199config IP_VS_SED
200 tristate "shortest expected delay scheduling"
201 depends on IP_VS
202 ---help---
203 The shortest expected delay scheduling algorithm assigns network
204 connections to the server with the shortest expected delay. The
205 expected delay that the job will experience is (Ci + 1) / Ui if
206 sent to the ith server, in which Ci is the number of connections
 207 on the ith server and Ui is the fixed service rate (weight)
208 of the ith server.
209
 210 If you want to compile it into the kernel, say Y. To compile it as a
211 module, choose M here. If unsure, say N.
212
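
 As a small worked example of the SED formula above (hypothetical numbers):
 with two servers where server 1 has C1 = 3 active connections and weight
 U1 = 1, and server 2 has C2 = 7 and U2 = 4, the expected delays are
 (3 + 1)/1 = 4 and (7 + 1)/4 = 2, so SED sends the new connection to
 server 2 even though it currently holds more connections.
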
213config IP_VS_NQ
214 tristate "never queue scheduling"
215 depends on IP_VS
216 ---help---
217 The never queue scheduling algorithm adopts a two-speed model.
218 When there is an idle server available, the job will be sent to
219 the idle server, instead of waiting for a fast one. When there
220 is no idle server available, the job will be sent to the server
 221 that minimizes its expected delay (the Shortest Expected Delay
222 scheduling algorithm).
223
 224 If you want to compile it into the kernel, say Y. To compile it as a
225 module, choose M here. If unsure, say N.
226
 227comment "IPVS application helper"
228 depends on IP_VS
229
230config IP_VS_FTP
231 tristate "FTP protocol helper"
232 depends on IP_VS && IP_VS_PROTO_TCP
233 ---help---
 234 FTP is a protocol that transfers IP addresses and/or port numbers in
 235 its payload. In the virtual server via Network Address Translation,
 236 the IP address and port number of a real server cannot be sent to
 237 clients in FTP connections directly, so an FTP protocol helper is
 238 required to track the connection and mangle it back to that of the
 239 virtual service.
240
 241 If you want to compile it into the kernel, say Y. To compile it as a
242 module, choose M here. If unsure, say N.
243
244endmenu
diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile
new file mode 100644
index 000000000000..a788461a40c9
--- /dev/null
+++ b/net/ipv4/ipvs/Makefile
@@ -0,0 +1,34 @@
1#
2# Makefile for the IPVS modules on top of IPv4.
3#
4
5# IPVS transport protocol load balancing support
6ip_vs_proto-objs-y :=
7ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
8ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
9ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o
10ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o
11
12ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
13 ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
14 ip_vs_est.o ip_vs_proto.o ip_vs_proto_icmp.o \
15 $(ip_vs_proto-objs-y)
16
17
18# IPVS core
19obj-$(CONFIG_IP_VS) += ip_vs.o
20
21# IPVS schedulers
22obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
23obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
24obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
25obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
26obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
27obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
28obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
29obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
30obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
31obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
32
33# IPVS application helpers
34obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
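
For orientation, the conditional object lists above compose as follows (assuming, purely for illustration, a configuration with CONFIG_IP_VS=m and only the TCP and UDP protocol options enabled): ip_vs_proto-objs-y expands to "ip_vs_proto_tcp.o ip_vs_proto_udp.o", so the single ip_vs module is linked from the core objects (ip_vs_conn.o, ip_vs_core.o, ip_vs_ctl.o, ip_vs_sched.o, ip_vs_xmit.o, ip_vs_app.o, ip_vs_sync.o, ip_vs_est.o, ip_vs_proto.o, ip_vs_proto_icmp.o) plus those two protocol objects, while each scheduler and the FTP helper is built as a separate module only when its CONFIG_IP_VS_* symbol is set.
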
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
new file mode 100644
index 000000000000..d9212addd193
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -0,0 +1,658 @@
1/*
2 * ip_vs_app.c: Application module support for IPVS
3 *
4 * Version: $Id: ip_vs_app.c,v 1.17 2003/03/22 06:31:21 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
 14 * is that the ip_vs_app module handles the reverse direction (incoming requests
15 * and outgoing responses).
16 *
17 * IP_MASQ_APP application masquerading module
18 *
19 * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/kernel.h>
25#include <linux/skbuff.h>
26#include <linux/in.h>
27#include <linux/ip.h>
28#include <net/protocol.h>
29#include <asm/system.h>
30#include <linux/stat.h>
31#include <linux/proc_fs.h>
32#include <linux/seq_file.h>
33
34#include <net/ip_vs.h>
35
36EXPORT_SYMBOL(register_ip_vs_app);
37EXPORT_SYMBOL(unregister_ip_vs_app);
38EXPORT_SYMBOL(register_ip_vs_app_inc);
39
40/* ipvs application list head */
41static LIST_HEAD(ip_vs_app_list);
42static DECLARE_MUTEX(__ip_vs_app_mutex);
43
44
45/*
46 * Get an ip_vs_app object
47 */
48static inline int ip_vs_app_get(struct ip_vs_app *app)
49{
50 /* test and get the module atomically */
51 if (app->module)
52 return try_module_get(app->module);
53 else
54 return 1;
55}
56
57
58static inline void ip_vs_app_put(struct ip_vs_app *app)
59{
60 if (app->module)
61 module_put(app->module);
62}
63
64
65/*
66 * Allocate/initialize app incarnation and register it in proto apps.
67 */
68static int
69ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
70{
71 struct ip_vs_protocol *pp;
72 struct ip_vs_app *inc;
73 int ret;
74
75 if (!(pp = ip_vs_proto_get(proto)))
76 return -EPROTONOSUPPORT;
77
78 if (!pp->unregister_app)
79 return -EOPNOTSUPP;
80
81 inc = kmalloc(sizeof(struct ip_vs_app), GFP_KERNEL);
82 if (!inc)
83 return -ENOMEM;
84 memcpy(inc, app, sizeof(*inc));
85 INIT_LIST_HEAD(&inc->p_list);
86 INIT_LIST_HEAD(&inc->incs_list);
87 inc->app = app;
88 inc->port = htons(port);
89 atomic_set(&inc->usecnt, 0);
90
91 if (app->timeouts) {
92 inc->timeout_table =
93 ip_vs_create_timeout_table(app->timeouts,
94 app->timeouts_size);
95 if (!inc->timeout_table) {
96 ret = -ENOMEM;
97 goto out;
98 }
99 }
100
101 ret = pp->register_app(inc);
102 if (ret)
103 goto out;
104
105 list_add(&inc->a_list, &app->incs_list);
106 IP_VS_DBG(9, "%s application %s:%u registered\n",
107 pp->name, inc->name, inc->port);
108
109 return 0;
110
111 out:
112 if (inc->timeout_table)
113 kfree(inc->timeout_table);
114 kfree(inc);
115 return ret;
116}
117
118
119/*
120 * Release app incarnation
121 */
122static void
123ip_vs_app_inc_release(struct ip_vs_app *inc)
124{
125 struct ip_vs_protocol *pp;
126
127 if (!(pp = ip_vs_proto_get(inc->protocol)))
128 return;
129
130 if (pp->unregister_app)
131 pp->unregister_app(inc);
132
133 IP_VS_DBG(9, "%s App %s:%u unregistered\n",
134 pp->name, inc->name, inc->port);
135
136 list_del(&inc->a_list);
137
138 if (inc->timeout_table != NULL)
139 kfree(inc->timeout_table);
140 kfree(inc);
141}
142
143
144/*
145 * Get reference to app inc (only called from softirq)
146 *
147 */
148int ip_vs_app_inc_get(struct ip_vs_app *inc)
149{
150 int result;
151
152 atomic_inc(&inc->usecnt);
153 if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
154 atomic_dec(&inc->usecnt);
155 return result;
156}
157
158
159/*
160 * Put the app inc (only called from timer or net softirq)
161 */
162void ip_vs_app_inc_put(struct ip_vs_app *inc)
163{
164 ip_vs_app_put(inc->app);
165 atomic_dec(&inc->usecnt);
166}
167
168
169/*
170 * Register an application incarnation in protocol applications
171 */
172int
173register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
174{
175 int result;
176
177 down(&__ip_vs_app_mutex);
178
179 result = ip_vs_app_inc_new(app, proto, port);
180
181 up(&__ip_vs_app_mutex);
182
183 return result;
184}
185
186
187/*
188 * ip_vs_app registration routine
189 */
190int register_ip_vs_app(struct ip_vs_app *app)
191{
192 /* increase the module use count */
193 ip_vs_use_count_inc();
194
195 down(&__ip_vs_app_mutex);
196
197 list_add(&app->a_list, &ip_vs_app_list);
198
199 up(&__ip_vs_app_mutex);
200
201 return 0;
202}
203
204
205/*
206 * ip_vs_app unregistration routine
207 * We are sure there are no app incarnations attached to services
208 */
209void unregister_ip_vs_app(struct ip_vs_app *app)
210{
211 struct ip_vs_app *inc, *nxt;
212
213 down(&__ip_vs_app_mutex);
214
215 list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
216 ip_vs_app_inc_release(inc);
217 }
218
219 list_del(&app->a_list);
220
221 up(&__ip_vs_app_mutex);
222
223 /* decrease the module use count */
224 ip_vs_use_count_dec();
225}
226
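
A minimal sketch of how an application helper module might use the registration API above: register one ip_vs_app, then one incarnation per port via register_ip_vs_app_inc() (this is roughly what the FTP helper in ip_vs_ftp.c does). The helper name, the port, and the hook body below are invented for illustration; only the ip_vs_* calls and the struct ip_vs_app fields come from this file and ip_vs.h.

/* hypothetical helper module, not part of the patch above */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <net/ip_vs.h>

static int my_helper_pkt_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
			     struct sk_buff **pskb, int *diff)
{
	/* inspect/mangle the outgoing payload here; report any length
	   change through *diff so the TCP sequence deltas get updated */
	if (diff)
		*diff = 0;
	return 1;		/* non-zero: packet handled, keep going */
}

static struct ip_vs_app my_helper = {
	.name		= "myhelper",
	.protocol	= IPPROTO_TCP,
	.module		= THIS_MODULE,
	.incs_list	= LIST_HEAD_INIT(my_helper.incs_list),
	.pkt_out	= my_helper_pkt_out,
};

static int __init my_helper_init(void)
{
	int ret = register_ip_vs_app(&my_helper);
	if (ret)
		return ret;
	/* one incarnation per port to inspect; the port is passed in host
	   order, ip_vs_app_inc_new() converts it with htons() */
	ret = register_ip_vs_app_inc(&my_helper, IPPROTO_TCP, 12345);
	if (ret)
		unregister_ip_vs_app(&my_helper);
	return ret;
}

static void __exit my_helper_exit(void)
{
	/* releases all registered incarnations as well */
	unregister_ip_vs_app(&my_helper);
}

module_init(my_helper_init);
module_exit(my_helper_exit);
MODULE_LICENSE("GPL");
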
227
228#if 0000
229/*
230 * Get reference to app by name (called from user context)
231 */
232struct ip_vs_app *ip_vs_app_get_by_name(char *appname)
233{
234 struct ip_vs_app *app, *a = NULL;
235
236 down(&__ip_vs_app_mutex);
237
 238 list_for_each_entry(app, &ip_vs_app_list, a_list) {
239 if (strcmp(app->name, appname))
240 continue;
241
242 /* softirq may call ip_vs_app_get too, so the caller
243 must disable softirq on the current CPU */
244 if (ip_vs_app_get(app))
245 a = app;
246 break;
247 }
248
249 up(&__ip_vs_app_mutex);
250
251 return a;
252}
253#endif
254
255
256/*
257 * Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
258 */
259int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
260{
261 return pp->app_conn_bind(cp);
262}
263
264
265/*
266 * Unbind cp from application incarnation (called by cp destructor)
267 */
268void ip_vs_unbind_app(struct ip_vs_conn *cp)
269{
270 struct ip_vs_app *inc = cp->app;
271
272 if (!inc)
273 return;
274
275 if (inc->unbind_conn)
276 inc->unbind_conn(inc, cp);
277 if (inc->done_conn)
278 inc->done_conn(inc, cp);
279 ip_vs_app_inc_put(inc);
280 cp->app = NULL;
281}
282
283
284/*
285 * Fixes th->seq based on ip_vs_seq info.
286 */
287static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
288{
289 __u32 seq = ntohl(th->seq);
290
291 /*
292 * Adjust seq with delta-offset for all packets after
293 * the most recent resized pkt seq and with previous_delta offset
294 * for all packets before most recent resized pkt seq.
295 */
296 if (vseq->delta || vseq->previous_delta) {
297 if(after(seq, vseq->init_seq)) {
298 th->seq = htonl(seq + vseq->delta);
299 IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n",
300 vseq->delta);
301 } else {
302 th->seq = htonl(seq + vseq->previous_delta);
303 IP_VS_DBG(9, "vs_fix_seq(): added previous_delta "
304 "(%d) to seq\n", vseq->previous_delta);
305 }
306 }
307}
308
309
310/*
311 * Fixes th->ack_seq based on ip_vs_seq info.
312 */
313static inline void
314vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
315{
316 __u32 ack_seq = ntohl(th->ack_seq);
317
318 /*
319 * Adjust ack_seq with delta-offset for
320 * the packets AFTER most recent resized pkt has caused a shift
321 * for packets before most recent resized pkt, use previous_delta
322 */
323 if (vseq->delta || vseq->previous_delta) {
 324 /* since ack_seq is the sequence number of the next octet
 325 expected to be received, compare it with init_seq+delta */
326 if(after(ack_seq, vseq->init_seq+vseq->delta)) {
327 th->ack_seq = htonl(ack_seq - vseq->delta);
328 IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta "
329 "(%d) from ack_seq\n", vseq->delta);
330
331 } else {
332 th->ack_seq = htonl(ack_seq - vseq->previous_delta);
333 IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted "
334 "previous_delta (%d) from ack_seq\n",
335 vseq->previous_delta);
336 }
337 }
338}
339
340
341/*
342 * Updates ip_vs_seq if pkt has been resized
343 * Assumes already checked proto==IPPROTO_TCP and diff!=0.
344 */
345static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
346 unsigned flag, __u32 seq, int diff)
347{
348 /* spinlock is to keep updating cp->flags atomic */
349 spin_lock(&cp->lock);
350 if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
351 vseq->previous_delta = vseq->delta;
352 vseq->delta += diff;
353 vseq->init_seq = seq;
354 cp->flags |= flag;
355 }
356 spin_unlock(&cp->lock);
357}
358
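
A concrete illustration of the delta bookkeeping above (hypothetical numbers): suppose an application helper rewrites the payload of an outgoing segment whose sequence number is 1000 and grows it by 7 bytes. vs_seq_update() then records init_seq = 1000 and delta = +7 in cp->out_seq (previous_delta keeps the older delta, initially 0). From that point on, vs_fix_seq() adds 7 to the sequence number of every later outgoing segment (seq after 1000), while vs_fix_ack_seq() subtracts 7 from the peer's acknowledgement numbers above 1007, so both directions stay consistent with the resized data.
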
359static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb,
360 struct ip_vs_app *app)
361{
362 int diff;
363 unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4;
364 struct tcphdr *th;
365 __u32 seq;
366
367 if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
368 return 0;
369
370 th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset);
371
372 /*
373 * Remember seq number in case this pkt gets resized
374 */
375 seq = ntohl(th->seq);
376
377 /*
378 * Fix seq stuff if flagged as so.
379 */
380 if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
381 vs_fix_seq(&cp->out_seq, th);
382 if (cp->flags & IP_VS_CONN_F_IN_SEQ)
383 vs_fix_ack_seq(&cp->in_seq, th);
384
385 /*
386 * Call private output hook function
387 */
388 if (app->pkt_out == NULL)
389 return 1;
390
391 if (!app->pkt_out(app, cp, pskb, &diff))
392 return 0;
393
394 /*
395 * Update ip_vs seq stuff if len has changed.
396 */
397 if (diff != 0)
398 vs_seq_update(cp, &cp->out_seq,
399 IP_VS_CONN_F_OUT_SEQ, seq, diff);
400
401 return 1;
402}
403
404/*
405 * Output pkt hook. Will call bound ip_vs_app specific function
406 * called by ipvs packet handler, assumes previously checked cp!=NULL
407 * returns false if it can't handle packet (oom)
408 */
409int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb)
410{
411 struct ip_vs_app *app;
412
413 /*
414 * check if application module is bound to
415 * this ip_vs_conn.
416 */
417 if ((app = cp->app) == NULL)
418 return 1;
419
420 /* TCP is complicated */
421 if (cp->protocol == IPPROTO_TCP)
422 return app_tcp_pkt_out(cp, pskb, app);
423
424 /*
425 * Call private output hook function
426 */
427 if (app->pkt_out == NULL)
428 return 1;
429
430 return app->pkt_out(app, cp, pskb, NULL);
431}
432
433
434static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb,
435 struct ip_vs_app *app)
436{
437 int diff;
438 unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4;
439 struct tcphdr *th;
440 __u32 seq;
441
442 if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
443 return 0;
444
445 th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset);
446
447 /*
448 * Remember seq number in case this pkt gets resized
449 */
450 seq = ntohl(th->seq);
451
452 /*
453 * Fix seq stuff if flagged as so.
454 */
455 if (cp->flags & IP_VS_CONN_F_IN_SEQ)
456 vs_fix_seq(&cp->in_seq, th);
457 if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
458 vs_fix_ack_seq(&cp->out_seq, th);
459
460 /*
461 * Call private input hook function
462 */
463 if (app->pkt_in == NULL)
464 return 1;
465
466 if (!app->pkt_in(app, cp, pskb, &diff))
467 return 0;
468
469 /*
470 * Update ip_vs seq stuff if len has changed.
471 */
472 if (diff != 0)
473 vs_seq_update(cp, &cp->in_seq,
474 IP_VS_CONN_F_IN_SEQ, seq, diff);
475
476 return 1;
477}
478
479/*
480 * Input pkt hook. Will call bound ip_vs_app specific function
481 * called by ipvs packet handler, assumes previously checked cp!=NULL.
482 * returns false if can't handle packet (oom).
483 */
484int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb)
485{
486 struct ip_vs_app *app;
487
488 /*
489 * check if application module is bound to
490 * this ip_vs_conn.
491 */
492 if ((app = cp->app) == NULL)
493 return 1;
494
495 /* TCP is complicated */
496 if (cp->protocol == IPPROTO_TCP)
497 return app_tcp_pkt_in(cp, pskb, app);
498
499 /*
500 * Call private input hook function
501 */
502 if (app->pkt_in == NULL)
503 return 1;
504
505 return app->pkt_in(app, cp, pskb, NULL);
506}
507
508
509#ifdef CONFIG_PROC_FS
510/*
511 * /proc/net/ip_vs_app entry function
512 */
513
514static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
515{
516 struct ip_vs_app *app, *inc;
517
518 list_for_each_entry(app, &ip_vs_app_list, a_list) {
519 list_for_each_entry(inc, &app->incs_list, a_list) {
520 if (pos-- == 0)
521 return inc;
522 }
523 }
524 return NULL;
525
526}
527
528static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
529{
530 down(&__ip_vs_app_mutex);
531
532 return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
533}
534
535static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
536{
537 struct ip_vs_app *inc, *app;
538 struct list_head *e;
539
540 ++*pos;
541 if (v == SEQ_START_TOKEN)
542 return ip_vs_app_idx(0);
543
544 inc = v;
545 app = inc->app;
546
547 if ((e = inc->a_list.next) != &app->incs_list)
548 return list_entry(e, struct ip_vs_app, a_list);
549
550 /* go on to next application */
551 for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
552 app = list_entry(e, struct ip_vs_app, a_list);
553 list_for_each_entry(inc, &app->incs_list, a_list) {
554 return inc;
555 }
556 }
557 return NULL;
558}
559
560static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
561{
562 up(&__ip_vs_app_mutex);
563}
564
565static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
566{
567 if (v == SEQ_START_TOKEN)
568 seq_puts(seq, "prot port usecnt name\n");
569 else {
570 const struct ip_vs_app *inc = v;
571
572 seq_printf(seq, "%-3s %-7u %-6d %-17s\n",
573 ip_vs_proto_name(inc->protocol),
574 ntohs(inc->port),
575 atomic_read(&inc->usecnt),
576 inc->name);
577 }
578 return 0;
579}
580
581static struct seq_operations ip_vs_app_seq_ops = {
582 .start = ip_vs_app_seq_start,
583 .next = ip_vs_app_seq_next,
584 .stop = ip_vs_app_seq_stop,
585 .show = ip_vs_app_seq_show,
586};
587
588static int ip_vs_app_open(struct inode *inode, struct file *file)
589{
590 return seq_open(file, &ip_vs_app_seq_ops);
591}
592
593static struct file_operations ip_vs_app_fops = {
594 .owner = THIS_MODULE,
595 .open = ip_vs_app_open,
596 .read = seq_read,
597 .llseek = seq_lseek,
598 .release = seq_release,
599};
600#endif
601
602
603/*
604 * Replace a segment of data with a new segment
605 */
606int ip_vs_skb_replace(struct sk_buff *skb, int pri,
607 char *o_buf, int o_len, char *n_buf, int n_len)
608{
609 struct iphdr *iph;
610 int diff;
611 int o_offset;
612 int o_left;
613
614 EnterFunction(9);
615
616 diff = n_len - o_len;
617 o_offset = o_buf - (char *)skb->data;
618 /* The length of left data after o_buf+o_len in the skb data */
619 o_left = skb->len - (o_offset + o_len);
620
621 if (diff <= 0) {
622 memmove(o_buf + n_len, o_buf + o_len, o_left);
623 memcpy(o_buf, n_buf, n_len);
624 skb_trim(skb, skb->len + diff);
625 } else if (diff <= skb_tailroom(skb)) {
626 skb_put(skb, diff);
627 memmove(o_buf + n_len, o_buf + o_len, o_left);
628 memcpy(o_buf, n_buf, n_len);
629 } else {
630 if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
631 return -ENOMEM;
632 skb_put(skb, diff);
633 memmove(skb->data + o_offset + n_len,
634 skb->data + o_offset + o_len, o_left);
635 memcpy(skb->data + o_offset, n_buf, n_len);
636 }
637
638 /* must update the iph total length here */
639 iph = skb->nh.iph;
640 iph->tot_len = htons(skb->len);
641
642 LeaveFunction(9);
643 return 0;
644}
645
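
A brief usage sketch (the caller and buffers here are hypothetical, not taken from this patch): an FTP-style helper that has located an address/port string of o_len bytes at pointer p inside a linearized skb, and wants to substitute an n_len-byte replacement held in n_buf, could call

	if (ip_vs_skb_replace(skb, GFP_ATOMIC, p, o_len, n_buf, n_len))
		return 0;	/* -ENOMEM: could not expand the skb */

On success, skb->len and the IP header's tot_len have already been adjusted; the caller is still expected to return n_len - o_len through its *diff parameter so that vs_seq_update() above can record the shift.
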
646
647int ip_vs_app_init(void)
648{
649 /* we will replace it with proc_net_ipvs_create() soon */
650 proc_net_fops_create("ip_vs_app", 0, &ip_vs_app_fops);
651 return 0;
652}
653
654
655void ip_vs_app_cleanup(void)
656{
657 proc_net_remove("ip_vs_app");
658}
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
new file mode 100644
index 000000000000..fd6feb5499fe
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -0,0 +1,920 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 *
19 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
20 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
 21 * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
22 *
23 * Changes:
24 *
25 */
26
27#include <linux/kernel.h>
28#include <linux/vmalloc.h>
29#include <linux/proc_fs.h> /* for proc_net_* */
30#include <linux/seq_file.h>
31#include <linux/jhash.h>
32#include <linux/random.h>
33
34#include <net/ip_vs.h>
35
36
37/*
38 * Connection hash table: for input and output packets lookups of IPVS
39 */
40static struct list_head *ip_vs_conn_tab;
41
42/* SLAB cache for IPVS connections */
43static kmem_cache_t *ip_vs_conn_cachep;
44
45/* counter for current IPVS connections */
46static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
47
48/* counter for no client port connections */
49static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
50
51/* random value for IPVS connection hash */
52static unsigned int ip_vs_conn_rnd;
53
54/*
55 * Fine locking granularity for big connection hash table
56 */
57#define CT_LOCKARRAY_BITS 4
58#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
59#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
60
61struct ip_vs_aligned_lock
62{
63 rwlock_t l;
64} __attribute__((__aligned__(SMP_CACHE_BYTES)));
65
66/* lock array for conn table */
67static struct ip_vs_aligned_lock
68__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
69
70static inline void ct_read_lock(unsigned key)
71{
72 read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
73}
74
75static inline void ct_read_unlock(unsigned key)
76{
77 read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
78}
79
80static inline void ct_write_lock(unsigned key)
81{
82 write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
83}
84
85static inline void ct_write_unlock(unsigned key)
86{
87 write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
88}
89
90static inline void ct_read_lock_bh(unsigned key)
91{
92 read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
93}
94
95static inline void ct_read_unlock_bh(unsigned key)
96{
97 read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
98}
99
100static inline void ct_write_lock_bh(unsigned key)
101{
102 write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
103}
104
105static inline void ct_write_unlock_bh(unsigned key)
106{
107 write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
108}
109
110
111/*
112 * Returns hash value for IPVS connection entry
113 */
114static unsigned int ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port)
115{
116 return jhash_3words(addr, port, proto, ip_vs_conn_rnd)
117 & IP_VS_CONN_TAB_MASK;
118}
119
120
121/*
122 * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
123 * returns bool success.
124 */
125static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
126{
127 unsigned hash;
128 int ret;
129
130 /* Hash by protocol, client address and port */
131 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
132
133 ct_write_lock(hash);
134
135 if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
136 list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
137 cp->flags |= IP_VS_CONN_F_HASHED;
138 atomic_inc(&cp->refcnt);
139 ret = 1;
140 } else {
141 IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
142 "called from %p\n", __builtin_return_address(0));
143 ret = 0;
144 }
145
146 ct_write_unlock(hash);
147
148 return ret;
149}
150
151
152/*
153 * UNhashes ip_vs_conn from ip_vs_conn_tab.
154 * returns bool success.
155 */
156static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
157{
158 unsigned hash;
159 int ret;
160
161 /* unhash it and decrease its reference counter */
162 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
163
164 ct_write_lock(hash);
165
166 if (cp->flags & IP_VS_CONN_F_HASHED) {
167 list_del(&cp->c_list);
168 cp->flags &= ~IP_VS_CONN_F_HASHED;
169 atomic_dec(&cp->refcnt);
170 ret = 1;
171 } else
172 ret = 0;
173
174 ct_write_unlock(hash);
175
176 return ret;
177}
178
179
180/*
181 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
182 * Called for pkts coming from OUTside-to-INside.
183 * s_addr, s_port: pkt source address (foreign host)
184 * d_addr, d_port: pkt dest address (load balancer)
185 */
186static inline struct ip_vs_conn *__ip_vs_conn_in_get
187(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
188{
189 unsigned hash;
190 struct ip_vs_conn *cp;
191
192 hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
193
194 ct_read_lock(hash);
195
196 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
197 if (s_addr==cp->caddr && s_port==cp->cport &&
198 d_port==cp->vport && d_addr==cp->vaddr &&
199 protocol==cp->protocol) {
200 /* HIT */
201 atomic_inc(&cp->refcnt);
202 ct_read_unlock(hash);
203 return cp;
204 }
205 }
206
207 ct_read_unlock(hash);
208
209 return NULL;
210}
211
212struct ip_vs_conn *ip_vs_conn_in_get
213(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
214{
215 struct ip_vs_conn *cp;
216
217 cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
218 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
219 cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
220
221 IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
222 ip_vs_proto_name(protocol),
223 NIPQUAD(s_addr), ntohs(s_port),
224 NIPQUAD(d_addr), ntohs(d_port),
225 cp?"hit":"not hit");
226
227 return cp;
228}
229
230
231/*
232 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
233 * Called for pkts coming from inside-to-OUTside.
234 * s_addr, s_port: pkt source address (inside host)
235 * d_addr, d_port: pkt dest address (foreign host)
236 */
237struct ip_vs_conn *ip_vs_conn_out_get
238(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
239{
240 unsigned hash;
241 struct ip_vs_conn *cp, *ret=NULL;
242
243 /*
244 * Check for "full" addressed entries
245 */
246 hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
247
248 ct_read_lock(hash);
249
250 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
251 if (d_addr == cp->caddr && d_port == cp->cport &&
252 s_port == cp->dport && s_addr == cp->daddr &&
253 protocol == cp->protocol) {
254 /* HIT */
255 atomic_inc(&cp->refcnt);
256 ret = cp;
257 break;
258 }
259 }
260
261 ct_read_unlock(hash);
262
263 IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
264 ip_vs_proto_name(protocol),
265 NIPQUAD(s_addr), ntohs(s_port),
266 NIPQUAD(d_addr), ntohs(d_port),
267 ret?"hit":"not hit");
268
269 return ret;
270}
271
272
273/*
274 * Put back the conn and restart its timer with its timeout
275 */
276void ip_vs_conn_put(struct ip_vs_conn *cp)
277{
 278 /* reset it to expire after its timeout */
279 mod_timer(&cp->timer, jiffies+cp->timeout);
280
281 __ip_vs_conn_put(cp);
282}
283
284
285/*
286 * Fill a no_client_port connection with a client port number
287 */
288void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __u16 cport)
289{
290 if (ip_vs_conn_unhash(cp)) {
291 spin_lock(&cp->lock);
292 if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
293 atomic_dec(&ip_vs_conn_no_cport_cnt);
294 cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
295 cp->cport = cport;
296 }
297 spin_unlock(&cp->lock);
298
 299 /* hash it back, now keyed on the new cport */
300 ip_vs_conn_hash(cp);
301 }
302}
303
304
305/*
306 * Bind a connection entry with the corresponding packet_xmit.
307 * Called by ip_vs_conn_new.
308 */
309static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
310{
311 switch (IP_VS_FWD_METHOD(cp)) {
312 case IP_VS_CONN_F_MASQ:
313 cp->packet_xmit = ip_vs_nat_xmit;
314 break;
315
316 case IP_VS_CONN_F_TUNNEL:
317 cp->packet_xmit = ip_vs_tunnel_xmit;
318 break;
319
320 case IP_VS_CONN_F_DROUTE:
321 cp->packet_xmit = ip_vs_dr_xmit;
322 break;
323
324 case IP_VS_CONN_F_LOCALNODE:
325 cp->packet_xmit = ip_vs_null_xmit;
326 break;
327
328 case IP_VS_CONN_F_BYPASS:
329 cp->packet_xmit = ip_vs_bypass_xmit;
330 break;
331 }
332}
333
334
335static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
336{
337 return atomic_read(&dest->activeconns)
338 + atomic_read(&dest->inactconns);
339}
340
341/*
342 * Bind a connection entry with a virtual service destination
343 * Called just after a new connection entry is created.
344 */
345static inline void
346ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
347{
348 /* if dest is NULL, then return directly */
349 if (!dest)
350 return;
351
352 /* Increase the refcnt counter of the dest */
353 atomic_inc(&dest->refcnt);
354
355 /* Bind with the destination and its corresponding transmitter */
356 cp->flags |= atomic_read(&dest->conn_flags);
357 cp->dest = dest;
358
359 IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
360 "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
361 ip_vs_proto_name(cp->protocol),
362 NIPQUAD(cp->caddr), ntohs(cp->cport),
363 NIPQUAD(cp->vaddr), ntohs(cp->vport),
364 NIPQUAD(cp->daddr), ntohs(cp->dport),
365 ip_vs_fwd_tag(cp), cp->state,
366 cp->flags, atomic_read(&cp->refcnt),
367 atomic_read(&dest->refcnt));
368
369 /* Update the connection counters */
370 if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
371 /* It is a normal connection, so increase the inactive
372 connection counter because it is in TCP SYNRECV
 373 state (inactive) or another protocol's inactive state */
374 atomic_inc(&dest->inactconns);
375 } else {
376 /* It is a persistent connection/template, so increase
 377 the persistent connection counter */
378 atomic_inc(&dest->persistconns);
379 }
380
381 if (dest->u_threshold != 0 &&
382 ip_vs_dest_totalconns(dest) >= dest->u_threshold)
383 dest->flags |= IP_VS_DEST_F_OVERLOAD;
384}
385
386
387/*
388 * Unbind a connection entry with its VS destination
389 * Called by the ip_vs_conn_expire function.
390 */
391static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
392{
393 struct ip_vs_dest *dest = cp->dest;
394
395 if (!dest)
396 return;
397
398 IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
399 "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
400 ip_vs_proto_name(cp->protocol),
401 NIPQUAD(cp->caddr), ntohs(cp->cport),
402 NIPQUAD(cp->vaddr), ntohs(cp->vport),
403 NIPQUAD(cp->daddr), ntohs(cp->dport),
404 ip_vs_fwd_tag(cp), cp->state,
405 cp->flags, atomic_read(&cp->refcnt),
406 atomic_read(&dest->refcnt));
407
408 /* Update the connection counters */
409 if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
410 /* It is a normal connection, so decrease the inactconns
411 or activeconns counter */
412 if (cp->flags & IP_VS_CONN_F_INACTIVE) {
413 atomic_dec(&dest->inactconns);
414 } else {
415 atomic_dec(&dest->activeconns);
416 }
417 } else {
418 /* It is a persistent connection/template, so decrease
 419 the persistent connection counter */
420 atomic_dec(&dest->persistconns);
421 }
422
423 if (dest->l_threshold != 0) {
424 if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
425 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
426 } else if (dest->u_threshold != 0) {
427 if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
428 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
429 } else {
430 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
431 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
432 }
433
434 /*
435 * Simply decrease the refcnt of the dest, because the
436 * dest will be either in service's destination list
437 * or in the trash.
438 */
439 atomic_dec(&dest->refcnt);
440}
441
442
443/*
444 * Checking if the destination of a connection template is available.
445 * If available, return 1, otherwise invalidate this connection
446 * template and return 0.
447 */
448int ip_vs_check_template(struct ip_vs_conn *ct)
449{
450 struct ip_vs_dest *dest = ct->dest;
451
452 /*
453 * Checking the dest server status.
454 */
455 if ((dest == NULL) ||
456 !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
457 (sysctl_ip_vs_expire_quiescent_template &&
458 (atomic_read(&dest->weight) == 0))) {
459 IP_VS_DBG(9, "check_template: dest not available for "
460 "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
461 "-> d:%u.%u.%u.%u:%d\n",
462 ip_vs_proto_name(ct->protocol),
463 NIPQUAD(ct->caddr), ntohs(ct->cport),
464 NIPQUAD(ct->vaddr), ntohs(ct->vport),
465 NIPQUAD(ct->daddr), ntohs(ct->dport));
466
467 /*
468 * Invalidate the connection template
469 */
470 if (ct->cport) {
471 if (ip_vs_conn_unhash(ct)) {
472 ct->dport = 65535;
473 ct->vport = 65535;
474 ct->cport = 0;
475 ip_vs_conn_hash(ct);
476 }
477 }
478
479 /*
480 * Simply decrease the refcnt of the template,
481 * don't restart its timer.
482 */
483 atomic_dec(&ct->refcnt);
484 return 0;
485 }
486 return 1;
487}
488
489static void ip_vs_conn_expire(unsigned long data)
490{
491 struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
492
493 cp->timeout = 60*HZ;
494
495 /*
496 * hey, I'm using it
497 */
498 atomic_inc(&cp->refcnt);
499
500 /*
501 * do I control anybody?
502 */
503 if (atomic_read(&cp->n_control))
504 goto expire_later;
505
506 /*
507 * unhash it if it is hashed in the conn table
508 */
509 if (!ip_vs_conn_unhash(cp))
510 goto expire_later;
511
512 /*
 513 * refcnt==1 implies I'm the only referrer
514 */
515 if (likely(atomic_read(&cp->refcnt) == 1)) {
516 /* delete the timer if it is activated by other users */
517 if (timer_pending(&cp->timer))
518 del_timer(&cp->timer);
519
520 /* does anybody control me? */
521 if (cp->control)
522 ip_vs_control_del(cp);
523
524 if (unlikely(cp->app != NULL))
525 ip_vs_unbind_app(cp);
526 ip_vs_unbind_dest(cp);
527 if (cp->flags & IP_VS_CONN_F_NO_CPORT)
528 atomic_dec(&ip_vs_conn_no_cport_cnt);
529 atomic_dec(&ip_vs_conn_count);
530
531 kmem_cache_free(ip_vs_conn_cachep, cp);
532 return;
533 }
534
535 /* hash it back to the table */
536 ip_vs_conn_hash(cp);
537
538 expire_later:
539 IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
540 atomic_read(&cp->refcnt)-1,
541 atomic_read(&cp->n_control));
542
543 ip_vs_conn_put(cp);
544}
545
546
547void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
548{
549 if (del_timer(&cp->timer))
550 mod_timer(&cp->timer, jiffies);
551 __ip_vs_conn_put(cp);
552}
553
554
555/*
556 * Create a new connection entry and hash it into the ip_vs_conn_tab
557 */
558struct ip_vs_conn *
559ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
560 __u32 daddr, __u16 dport, unsigned flags,
561 struct ip_vs_dest *dest)
562{
563 struct ip_vs_conn *cp;
564 struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
565
566 cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
567 if (cp == NULL) {
568 IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
569 return NULL;
570 }
571
572 memset(cp, 0, sizeof(*cp));
573 INIT_LIST_HEAD(&cp->c_list);
574 init_timer(&cp->timer);
575 cp->timer.data = (unsigned long)cp;
576 cp->timer.function = ip_vs_conn_expire;
577 cp->protocol = proto;
578 cp->caddr = caddr;
579 cp->cport = cport;
580 cp->vaddr = vaddr;
581 cp->vport = vport;
582 cp->daddr = daddr;
583 cp->dport = dport;
584 cp->flags = flags;
585 spin_lock_init(&cp->lock);
586
587 /*
 588 * Mark the entry as referenced by the current thread before hashing
 589 * it in the table, so that another thread running ip_vs_random_dropentry
 590 * cannot drop this entry.
591 */
592 atomic_set(&cp->refcnt, 1);
593
594 atomic_set(&cp->n_control, 0);
595 atomic_set(&cp->in_pkts, 0);
596
597 atomic_inc(&ip_vs_conn_count);
598 if (flags & IP_VS_CONN_F_NO_CPORT)
599 atomic_inc(&ip_vs_conn_no_cport_cnt);
600
601 /* Bind the connection with a destination server */
602 ip_vs_bind_dest(cp, dest);
603
604 /* Set its state and timeout */
605 cp->state = 0;
606 cp->timeout = 3*HZ;
607
608 /* Bind its packet transmitter */
609 ip_vs_bind_xmit(cp);
610
611 if (unlikely(pp && atomic_read(&pp->appcnt)))
612 ip_vs_bind_app(cp, pp);
613
614 /* Hash it in the ip_vs_conn_tab finally */
615 ip_vs_conn_hash(cp);
616
617 return cp;
618}
619
620
621/*
622 * /proc/net/ip_vs_conn entries
623 */
624#ifdef CONFIG_PROC_FS
625
626static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
627{
628 int idx;
629 struct ip_vs_conn *cp;
630
631 for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
632 ct_read_lock_bh(idx);
633 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
634 if (pos-- == 0) {
635 seq->private = &ip_vs_conn_tab[idx];
636 return cp;
637 }
638 }
639 ct_read_unlock_bh(idx);
640 }
641
642 return NULL;
643}
644
645static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
646{
647 seq->private = NULL;
648 return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
649}
650
651static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
652{
653 struct ip_vs_conn *cp = v;
654 struct list_head *e, *l = seq->private;
655 int idx;
656
657 ++*pos;
658 if (v == SEQ_START_TOKEN)
659 return ip_vs_conn_array(seq, 0);
660
661 /* more on same hash chain? */
662 if ((e = cp->c_list.next) != l)
663 return list_entry(e, struct ip_vs_conn, c_list);
664
665 idx = l - ip_vs_conn_tab;
666 ct_read_unlock_bh(idx);
667
668 while (++idx < IP_VS_CONN_TAB_SIZE) {
669 ct_read_lock_bh(idx);
670 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
671 seq->private = &ip_vs_conn_tab[idx];
672 return cp;
673 }
674 ct_read_unlock_bh(idx);
675 }
676 seq->private = NULL;
677 return NULL;
678}
679
680static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
681{
682 struct list_head *l = seq->private;
683
684 if (l)
685 ct_read_unlock_bh(l - ip_vs_conn_tab);
686}
687
688static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
689{
690
691 if (v == SEQ_START_TOKEN)
692 seq_puts(seq,
693 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n");
694 else {
695 const struct ip_vs_conn *cp = v;
696
697 seq_printf(seq,
698 "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n",
699 ip_vs_proto_name(cp->protocol),
700 ntohl(cp->caddr), ntohs(cp->cport),
701 ntohl(cp->vaddr), ntohs(cp->vport),
702 ntohl(cp->daddr), ntohs(cp->dport),
703 ip_vs_state_name(cp->protocol, cp->state),
704 (cp->timer.expires-jiffies)/HZ);
705 }
706 return 0;
707}
708
709static struct seq_operations ip_vs_conn_seq_ops = {
710 .start = ip_vs_conn_seq_start,
711 .next = ip_vs_conn_seq_next,
712 .stop = ip_vs_conn_seq_stop,
713 .show = ip_vs_conn_seq_show,
714};
715
716static int ip_vs_conn_open(struct inode *inode, struct file *file)
717{
718 return seq_open(file, &ip_vs_conn_seq_ops);
719}
720
721static struct file_operations ip_vs_conn_fops = {
722 .owner = THIS_MODULE,
723 .open = ip_vs_conn_open,
724 .read = seq_read,
725 .llseek = seq_lseek,
726 .release = seq_release,
727};
728#endif
729
730
731/*
732 * Randomly drop connection entries before running out of memory
733 */
734static inline int todrop_entry(struct ip_vs_conn *cp)
735{
736 /*
737 * The drop rate array needs tuning for real environments.
738 * Called from timer bh only => no locking
739 */
740 static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
741 static char todrop_counter[9] = {0};
742 int i;
743
744 /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
 745 This will leave enough time for normal connections to get
746 through. */
747 if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
748 return 0;
749
750 /* Don't drop the entry if its number of incoming packets is not
751 located in [0, 8] */
752 i = atomic_read(&cp->in_pkts);
753 if (i > 8 || i < 0) return 0;
754
755 if (!todrop_rate[i]) return 0;
756 if (--todrop_counter[i] > 0) return 0;
757
758 todrop_counter[i] = todrop_rate[i];
759 return 1;
760}
761
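
Spelling out the policy above with the table as shipped: for an eligible entry whose in_pkts count is i (1 <= i <= 8), todrop_rate[i] equals i, so roughly one out of every i such entries examined is dropped (i = 1 means every time, i = 8 means one in eight). Entries with i = 0, or with more than 8 incoming packets, are never dropped by this path, and entries younger than about 60 seconds are always spared.
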
762
763void ip_vs_random_dropentry(void)
764{
765 int idx;
766 struct ip_vs_conn *cp;
767 struct ip_vs_conn *ct;
768
769 /*
770 * Randomly scan 1/32 of the whole table every second
771 */
772 for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
773 unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
774
775 /*
776 * Lock is actually needed in this loop.
777 */
778 ct_write_lock(hash);
779
780 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
781 if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
782 /* connection template */
783 continue;
784
785 if (cp->protocol == IPPROTO_TCP) {
786 switch(cp->state) {
787 case IP_VS_TCP_S_SYN_RECV:
788 case IP_VS_TCP_S_SYNACK:
789 break;
790
791 case IP_VS_TCP_S_ESTABLISHED:
792 if (todrop_entry(cp))
793 break;
794 continue;
795
796 default:
797 continue;
798 }
799 } else {
800 if (!todrop_entry(cp))
801 continue;
802 }
803
804 /*
805 * Drop the entry, and drop its ct if not referenced
806 */
807 atomic_inc(&cp->refcnt);
808 ct_write_unlock(hash);
809
810 if ((ct = cp->control))
811 atomic_inc(&ct->refcnt);
812 IP_VS_DBG(4, "del connection\n");
813 ip_vs_conn_expire_now(cp);
814 if (ct) {
815 IP_VS_DBG(4, "del conn template\n");
816 ip_vs_conn_expire_now(ct);
817 }
818 ct_write_lock(hash);
819 }
820 ct_write_unlock(hash);
821 }
822}
823
824
825/*
826 * Flush all the connection entries in the ip_vs_conn_tab
827 */
828static void ip_vs_conn_flush(void)
829{
830 int idx;
831 struct ip_vs_conn *cp;
832 struct ip_vs_conn *ct;
833
834 flush_again:
835 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
836 /*
837 * Lock is actually needed in this loop.
838 */
839 ct_write_lock_bh(idx);
840
841 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
842 atomic_inc(&cp->refcnt);
843 ct_write_unlock(idx);
844
845 if ((ct = cp->control))
846 atomic_inc(&ct->refcnt);
847 IP_VS_DBG(4, "del connection\n");
848 ip_vs_conn_expire_now(cp);
849 if (ct) {
850 IP_VS_DBG(4, "del conn template\n");
851 ip_vs_conn_expire_now(ct);
852 }
853 ct_write_lock(idx);
854 }
855 ct_write_unlock_bh(idx);
856 }
857
 858 /* the counter may not be zero, because some conn entries may still
 859 be handled by the slow timer handler, or be unhashed but still referenced */
860 if (atomic_read(&ip_vs_conn_count) != 0) {
861 schedule();
862 goto flush_again;
863 }
864}
865
866
867int ip_vs_conn_init(void)
868{
869 int idx;
870
871 /*
872 * Allocate the connection hash table and initialize its list heads
873 */
874 ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
875 if (!ip_vs_conn_tab)
876 return -ENOMEM;
877
878 /* Allocate ip_vs_conn slab cache */
879 ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
880 sizeof(struct ip_vs_conn), 0,
881 SLAB_HWCACHE_ALIGN, NULL, NULL);
882 if (!ip_vs_conn_cachep) {
883 vfree(ip_vs_conn_tab);
884 return -ENOMEM;
885 }
886
887 IP_VS_INFO("Connection hash table configured "
888 "(size=%d, memory=%ldKbytes)\n",
889 IP_VS_CONN_TAB_SIZE,
890 (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
891 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
892 sizeof(struct ip_vs_conn));
893
894 for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
895 INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
896 }
897
898 for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
899 rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
900 }
901
902 proc_net_fops_create("ip_vs_conn", 0, &ip_vs_conn_fops);
903
904 /* calculate the random value for connection hash */
905 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
906
907 return 0;
908}
909
910
911void ip_vs_conn_cleanup(void)
912{
913 /* flush all the connection entries first */
914 ip_vs_conn_flush();
915
916 /* Release the empty cache */
917 kmem_cache_destroy(ip_vs_conn_cachep);
918 proc_net_remove("ip_vs_conn");
919 vfree(ip_vs_conn_tab);
920}
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
new file mode 100644
index 000000000000..5fb257dd07cb
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -0,0 +1,1191 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 *
19 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
20 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
21 * and others.
22 *
23 * Changes:
24 * Paul `Rusty' Russell properly handle non-linear skbs
25 *
26 */
27
28#include <linux/module.h>
29#include <linux/kernel.h>
30#include <linux/ip.h>
31#include <linux/tcp.h>
32#include <linux/icmp.h>
33
34#include <net/ip.h>
35#include <net/tcp.h>
36#include <net/udp.h>
37#include <net/icmp.h> /* for icmp_send */
38#include <net/route.h>
39
40#include <linux/netfilter.h>
41#include <linux/netfilter_ipv4.h>
42
43#include <net/ip_vs.h>
44
45
46EXPORT_SYMBOL(register_ip_vs_scheduler);
47EXPORT_SYMBOL(unregister_ip_vs_scheduler);
48EXPORT_SYMBOL(ip_vs_skb_replace);
49EXPORT_SYMBOL(ip_vs_proto_name);
50EXPORT_SYMBOL(ip_vs_conn_new);
51EXPORT_SYMBOL(ip_vs_conn_in_get);
52EXPORT_SYMBOL(ip_vs_conn_out_get);
53#ifdef CONFIG_IP_VS_PROTO_TCP
54EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
55#endif
56EXPORT_SYMBOL(ip_vs_conn_put);
57#ifdef CONFIG_IP_VS_DEBUG
58EXPORT_SYMBOL(ip_vs_get_debug_level);
59#endif
60EXPORT_SYMBOL(ip_vs_make_skb_writable);
61
62
63/* ID used in ICMP lookups */
64#define icmp_id(icmph) (((icmph)->un).echo.id)
65
66const char *ip_vs_proto_name(unsigned proto)
67{
68 static char buf[20];
69
70 switch (proto) {
71 case IPPROTO_IP:
72 return "IP";
73 case IPPROTO_UDP:
74 return "UDP";
75 case IPPROTO_TCP:
76 return "TCP";
77 case IPPROTO_ICMP:
78 return "ICMP";
79 default:
80 sprintf(buf, "IP_%d", proto);
81 return buf;
82 }
83}
84
85void ip_vs_init_hash_table(struct list_head *table, int rows)
86{
87 while (--rows >= 0)
88 INIT_LIST_HEAD(&table[rows]);
89}
90
91static inline void
92ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
93{
94 struct ip_vs_dest *dest = cp->dest;
95 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
96 spin_lock(&dest->stats.lock);
97 dest->stats.inpkts++;
98 dest->stats.inbytes += skb->len;
99 spin_unlock(&dest->stats.lock);
100
101 spin_lock(&dest->svc->stats.lock);
102 dest->svc->stats.inpkts++;
103 dest->svc->stats.inbytes += skb->len;
104 spin_unlock(&dest->svc->stats.lock);
105
106 spin_lock(&ip_vs_stats.lock);
107 ip_vs_stats.inpkts++;
108 ip_vs_stats.inbytes += skb->len;
109 spin_unlock(&ip_vs_stats.lock);
110 }
111}
112
113
114static inline void
115ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
116{
117 struct ip_vs_dest *dest = cp->dest;
118 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
119 spin_lock(&dest->stats.lock);
120 dest->stats.outpkts++;
121 dest->stats.outbytes += skb->len;
122 spin_unlock(&dest->stats.lock);
123
124 spin_lock(&dest->svc->stats.lock);
125 dest->svc->stats.outpkts++;
126 dest->svc->stats.outbytes += skb->len;
127 spin_unlock(&dest->svc->stats.lock);
128
129 spin_lock(&ip_vs_stats.lock);
130 ip_vs_stats.outpkts++;
131 ip_vs_stats.outbytes += skb->len;
132 spin_unlock(&ip_vs_stats.lock);
133 }
134}
135
136
137static inline void
138ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
139{
140 spin_lock(&cp->dest->stats.lock);
141 cp->dest->stats.conns++;
142 spin_unlock(&cp->dest->stats.lock);
143
144 spin_lock(&svc->stats.lock);
145 svc->stats.conns++;
146 spin_unlock(&svc->stats.lock);
147
148 spin_lock(&ip_vs_stats.lock);
149 ip_vs_stats.conns++;
150 spin_unlock(&ip_vs_stats.lock);
151}
152
153
154static inline int
155ip_vs_set_state(struct ip_vs_conn *cp, int direction,
156 const struct sk_buff *skb,
157 struct ip_vs_protocol *pp)
158{
159 if (unlikely(!pp->state_transition))
160 return 0;
161 return pp->state_transition(cp, direction, skb, pp);
162}
163
164
165int ip_vs_make_skb_writable(struct sk_buff **pskb, int writable_len)
166{
167 struct sk_buff *skb = *pskb;
168
169 /* skb is already used, better copy skb and its payload */
170 if (unlikely(skb_shared(skb) || skb->sk))
171 goto copy_skb;
172
173 /* skb data is already used, copy it */
174 if (unlikely(skb_cloned(skb)))
175 goto copy_data;
176
177 return pskb_may_pull(skb, writable_len);
178
179 copy_data:
180 if (unlikely(writable_len > skb->len))
181 return 0;
182 return !pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
183
184 copy_skb:
185 if (unlikely(writable_len > skb->len))
186 return 0;
187 skb = skb_copy(skb, GFP_ATOMIC);
188 if (!skb)
189 return 0;
190 BUG_ON(skb_is_nonlinear(skb));
191
192 /* Rest of kernel will get very unhappy if we pass it a
193 suddenly-orphaned skbuff */
194 if ((*pskb)->sk)
195 skb_set_owner_w(skb, (*pskb)->sk);
196 kfree_skb(*pskb);
197 *pskb = skb;
198 return 1;
199}
200
201/*
202 * IPVS persistent scheduling function
203 * It creates a connection entry according to its template if exists,
204 * or selects a server and creates a connection entry plus a template.
205 * Locking: we are svc user (svc->refcnt), so we hold all dests too
206 * Protocols supported: TCP, UDP
207 */
208static struct ip_vs_conn *
209ip_vs_sched_persist(struct ip_vs_service *svc,
210 const struct sk_buff *skb,
211 __u16 ports[2])
212{
213 struct ip_vs_conn *cp = NULL;
214 struct iphdr *iph = skb->nh.iph;
215 struct ip_vs_dest *dest;
216 struct ip_vs_conn *ct;
217 __u16 dport; /* destination port to forward */
218 __u32 snet; /* source network of the client, after masking */
219
220 /* Mask saddr with the netmask to adjust template granularity */
221 snet = iph->saddr & svc->netmask;
222
223 IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
224 "mnet %u.%u.%u.%u\n",
225 NIPQUAD(iph->saddr), ntohs(ports[0]),
226 NIPQUAD(iph->daddr), ntohs(ports[1]),
227 NIPQUAD(snet));
228
229 /*
230 * As far as we know, FTP is a very complicated network protocol, and
231 * it uses a control connection and data connections. For active FTP,
232 * the FTP server initiates the data connection to the client, usually
233 * from source port 20. For passive FTP, the FTP server tells the client
234 * the port on which it passively listens, and the client issues the
235 * data connection. In tunneling or direct routing mode, the load
236 * balancer only sees the client-to-server half of the connection, so
237 * the port number is unknown to it. Therefore, a conn template like
238 * <caddr, 0, vaddr, 0, daddr, 0> is created for a persistent FTP
239 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
240 * is created for other persistent services.
241 */
242 if (ports[1] == svc->port) {
243 /* Check if a template already exists */
244 if (svc->port != FTPPORT)
245 ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
246 iph->daddr, ports[1]);
247 else
248 ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
249 iph->daddr, 0);
250
251 if (!ct || !ip_vs_check_template(ct)) {
252 /*
253 * No template found or the dest of the connection
254 * template is not available.
255 */
256 dest = svc->scheduler->schedule(svc, skb);
257 if (dest == NULL) {
258 IP_VS_DBG(1, "p-schedule: no dest found.\n");
259 return NULL;
260 }
261
262 /*
263 * Create a template like <protocol,caddr,0,
264 * vaddr,vport,daddr,dport> for non-ftp service,
265 * and <protocol,caddr,0,vaddr,0,daddr,0>
266 * for ftp service.
267 */
268 if (svc->port != FTPPORT)
269 ct = ip_vs_conn_new(iph->protocol,
270 snet, 0,
271 iph->daddr,
272 ports[1],
273 dest->addr, dest->port,
274 0,
275 dest);
276 else
277 ct = ip_vs_conn_new(iph->protocol,
278 snet, 0,
279 iph->daddr, 0,
280 dest->addr, 0,
281 0,
282 dest);
283 if (ct == NULL)
284 return NULL;
285
286 ct->timeout = svc->timeout;
287 } else {
288 /* set destination with the found template */
289 dest = ct->dest;
290 }
291 dport = dest->port;
292 } else {
293 /*
294 * Note: persistent fwmark-based services and persistent
295 * port zero service are handled here.
296 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
297 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
298 */
299 if (svc->fwmark)
300 ct = ip_vs_conn_in_get(IPPROTO_IP, snet, 0,
301 htonl(svc->fwmark), 0);
302 else
303 ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
304 iph->daddr, 0);
305
306 if (!ct || !ip_vs_check_template(ct)) {
307 /*
308 * If it is not persistent port zero, return NULL,
309 * otherwise create a connection template.
310 */
311 if (svc->port)
312 return NULL;
313
314 dest = svc->scheduler->schedule(svc, skb);
315 if (dest == NULL) {
316 IP_VS_DBG(1, "p-schedule: no dest found.\n");
317 return NULL;
318 }
319
320 /*
321 * Create a template according to the service
322 */
323 if (svc->fwmark)
324 ct = ip_vs_conn_new(IPPROTO_IP,
325 snet, 0,
326 htonl(svc->fwmark), 0,
327 dest->addr, 0,
328 0,
329 dest);
330 else
331 ct = ip_vs_conn_new(iph->protocol,
332 snet, 0,
333 iph->daddr, 0,
334 dest->addr, 0,
335 0,
336 dest);
337 if (ct == NULL)
338 return NULL;
339
340 ct->timeout = svc->timeout;
341 } else {
342 /* set destination with the found template */
343 dest = ct->dest;
344 }
345 dport = ports[1];
346 }
347
348 /*
349 * Create a new connection according to the template
350 */
351 cp = ip_vs_conn_new(iph->protocol,
352 iph->saddr, ports[0],
353 iph->daddr, ports[1],
354 dest->addr, dport,
355 0,
356 dest);
357 if (cp == NULL) {
358 ip_vs_conn_put(ct);
359 return NULL;
360 }
361
362 /*
363 * Add its control
364 */
365 ip_vs_control_add(cp, ct);
366 ip_vs_conn_put(ct);
367
368 ip_vs_conn_stats(cp, svc);
369 return cp;
370}
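/*
 * Summary of the template tuples created above, written as
 * <caddr, cport, vaddr, vport, daddr, dport> with snet = caddr & svc->netmask:
 *
 *   persistent non-FTP service:   <snet, 0, vaddr, vport,       daddr, dport>
 *   persistent FTP service:       <snet, 0, vaddr, 0,           daddr, 0>
 *   fwmark-based service:         <snet, 0, htonl(fwmark), 0,   daddr, 0>  (IPPROTO_IP)
 *   persistent port-zero service: <snet, 0, vaddr, 0,           daddr, 0>
 *
 * The connection created from the template always carries the client's full
 * tuple; it forwards to dest->port when the packet hit the service port, and
 * to the packet's original destination port otherwise.
 */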
371
372
373/*
374 * IPVS main scheduling function
375 * It selects a server according to the virtual service, and
376 * creates a connection entry.
377 * Protocols supported: TCP, UDP
378 */
379struct ip_vs_conn *
380ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
381{
382 struct ip_vs_conn *cp = NULL;
383 struct iphdr *iph = skb->nh.iph;
384 struct ip_vs_dest *dest;
385 __u16 _ports[2], *pptr;
386
387 pptr = skb_header_pointer(skb, iph->ihl*4,
388 sizeof(_ports), _ports);
389 if (pptr == NULL)
390 return NULL;
391
392 /*
393 * Persistent service
394 */
395 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
396 return ip_vs_sched_persist(svc, skb, pptr);
397
398 /*
399 * Non-persistent service
400 */
401 if (!svc->fwmark && pptr[1] != svc->port) {
402 if (!svc->port)
403 IP_VS_ERR("Schedule: port zero only supported "
404 "in persistent services, "
405 "check your ipvs configuration\n");
406 return NULL;
407 }
408
409 dest = svc->scheduler->schedule(svc, skb);
410 if (dest == NULL) {
411 IP_VS_DBG(1, "Schedule: no dest found.\n");
412 return NULL;
413 }
414
415 /*
416 * Create a connection entry.
417 */
418 cp = ip_vs_conn_new(iph->protocol,
419 iph->saddr, pptr[0],
420 iph->daddr, pptr[1],
421 dest->addr, dest->port?dest->port:pptr[1],
422 0,
423 dest);
424 if (cp == NULL)
425 return NULL;
426
427 IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
428 "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
429 ip_vs_fwd_tag(cp),
430 NIPQUAD(cp->caddr), ntohs(cp->cport),
431 NIPQUAD(cp->vaddr), ntohs(cp->vport),
432 NIPQUAD(cp->daddr), ntohs(cp->dport),
433 cp->flags, atomic_read(&cp->refcnt));
434
435 ip_vs_conn_stats(cp, svc);
436 return cp;
437}
438
439
440/*
441 * Pass or drop the packet.
442 * Called by ip_vs_in, when the virtual service is available but
443 * no destination is available for a new connection.
444 */
445int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
446 struct ip_vs_protocol *pp)
447{
448 __u16 _ports[2], *pptr;
449 struct iphdr *iph = skb->nh.iph;
450
451 pptr = skb_header_pointer(skb, iph->ihl*4,
452 sizeof(_ports), _ports);
453 if (pptr == NULL) {
454 ip_vs_service_put(svc);
455 return NF_DROP;
456 }
457
458 /* if it is a fwmark-based service, the cache_bypass sysctl is
459 enabled and the destination is RTN_UNICAST (and not local), then
460 create a cache_bypass connection entry */
461 if (sysctl_ip_vs_cache_bypass && svc->fwmark
462 && (inet_addr_type(iph->daddr) == RTN_UNICAST)) {
463 int ret, cs;
464 struct ip_vs_conn *cp;
465
466 ip_vs_service_put(svc);
467
468 /* create a new connection entry */
469 IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
470 cp = ip_vs_conn_new(iph->protocol,
471 iph->saddr, pptr[0],
472 iph->daddr, pptr[1],
473 0, 0,
474 IP_VS_CONN_F_BYPASS,
475 NULL);
476 if (cp == NULL)
477 return NF_DROP;
478
479 /* statistics */
480 ip_vs_in_stats(cp, skb);
481
482 /* set state */
483 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
484
485 /* transmit the first SYN packet */
486 ret = cp->packet_xmit(skb, cp, pp);
487 /* do not touch skb anymore */
488
489 atomic_inc(&cp->in_pkts);
490 ip_vs_conn_put(cp);
491 return ret;
492 }
493
494 /*
495 * When a virtual FTP service is present, packets destined for
496 * other services on the VIP (except those listed in the ipvs
497 * table) may get here; pass them through, because it is not
498 * ipvs's job to decide whether to drop them.
499 */
500 if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
501 ip_vs_service_put(svc);
502 return NF_ACCEPT;
503 }
504
505 ip_vs_service_put(svc);
506
507 /*
508 * Notify the client that the destination is unreachable, and
509 * release the socket buffer.
510 * Since we are at the IP layer and no TCP socket is actually
511 * created, a TCP RST packet cannot be sent; instead,
512 * ICMP_PORT_UNREACH is sent here whether it is TCP or UDP. --WZ
513 */
514 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
515 return NF_DROP;
516}
517
518
519/*
520 * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING
521 * chain, and is used for VS/NAT.
522 * It detects packets for VS/NAT connections and sends them out
523 * immediately. This avoids having iptable_nat mangle packets
524 * belonging to VS/NAT connections.
525 */
526static unsigned int ip_vs_post_routing(unsigned int hooknum,
527 struct sk_buff **pskb,
528 const struct net_device *in,
529 const struct net_device *out,
530 int (*okfn)(struct sk_buff *))
531{
532 if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY))
533 return NF_ACCEPT;
534
535 /* The packet was sent from IPVS, exit this chain */
536 (*okfn)(*pskb);
537
538 return NF_STOLEN;
539}
540
541u16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
542{
543 return (u16) csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
544}
545
546static inline struct sk_buff *
547ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
548{
549 skb = ip_defrag(skb, user);
550 if (skb)
551 ip_send_check(skb->nh.iph);
552 return skb;
553}
554
555/*
556 * Packet has been made sufficiently writable in caller
557 * - inout: 1=in->out, 0=out->in
558 */
559void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
560 struct ip_vs_conn *cp, int inout)
561{
562 struct iphdr *iph = skb->nh.iph;
563 unsigned int icmp_offset = iph->ihl*4;
564 struct icmphdr *icmph = (struct icmphdr *)(skb->nh.raw + icmp_offset);
565 struct iphdr *ciph = (struct iphdr *)(icmph + 1);
566
567 if (inout) {
568 iph->saddr = cp->vaddr;
569 ip_send_check(iph);
570 ciph->daddr = cp->vaddr;
571 ip_send_check(ciph);
572 } else {
573 iph->daddr = cp->daddr;
574 ip_send_check(iph);
575 ciph->saddr = cp->daddr;
576 ip_send_check(ciph);
577 }
578
579 /* the TCP/UDP port */
580 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
581 __u16 *ports = (void *)ciph + ciph->ihl*4;
582
583 if (inout)
584 ports[1] = cp->vport;
585 else
586 ports[0] = cp->dport;
587 }
588
589 /* And finally the ICMP checksum */
590 icmph->checksum = 0;
591 icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
592 skb->ip_summed = CHECKSUM_UNNECESSARY;
593
594 if (inout)
595 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
596 "Forwarding altered outgoing ICMP");
597 else
598 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
599 "Forwarding altered incoming ICMP");
600}
601
602/*
603 * Handle ICMP messages in the inside-to-outside direction (outgoing).
604 * Find any that might be relevant, check against existing connections,
605 * forward to the right destination host if relevant.
606 * Currently handles error types - unreachable, quench, ttl exceeded.
607 * (Only used in VS/NAT)
608 */
609static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
610{
611 struct sk_buff *skb = *pskb;
612 struct iphdr *iph;
613 struct icmphdr _icmph, *ic;
614 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
615 struct ip_vs_conn *cp;
616 struct ip_vs_protocol *pp;
617 unsigned int offset, ihl, verdict;
618
619 *related = 1;
620
621 /* reassemble IP fragments */
622 if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
623 skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
624 if (!skb)
625 return NF_STOLEN;
626 *pskb = skb;
627 }
628
629 iph = skb->nh.iph;
630 offset = ihl = iph->ihl * 4;
631 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
632 if (ic == NULL)
633 return NF_DROP;
634
635 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
636 ic->type, ntohs(icmp_id(ic)),
637 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
638
639 /*
640 * Work through seeing if this is for us.
641 * These checks are supposed to be in an order that means easy
642 * things are checked first to speed up processing.... however
643 * this means that some packets will manage to get a long way
644 * down this stack and then be rejected, but that's life.
645 */
646 if ((ic->type != ICMP_DEST_UNREACH) &&
647 (ic->type != ICMP_SOURCE_QUENCH) &&
648 (ic->type != ICMP_TIME_EXCEEDED)) {
649 *related = 0;
650 return NF_ACCEPT;
651 }
652
653 /* Now find the contained IP header */
654 offset += sizeof(_icmph);
655 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
656 if (cih == NULL)
657 return NF_ACCEPT; /* The packet looks wrong, ignore */
658
659 pp = ip_vs_proto_get(cih->protocol);
660 if (!pp)
661 return NF_ACCEPT;
662
663 /* Is the embedded protocol header present? */
664 if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
665 pp->dont_defrag))
666 return NF_ACCEPT;
667
668 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
669
670 offset += cih->ihl * 4;
671
672 /* The embedded headers contain source and dest in reverse order */
673 cp = pp->conn_out_get(skb, pp, cih, offset, 1);
674 if (!cp)
675 return NF_ACCEPT;
676
677 verdict = NF_DROP;
678
679 if (IP_VS_FWD_METHOD(cp) != 0) {
680 IP_VS_ERR("shouldn't reach here, because the box is on the "
681 "half connection in the tun/dr module.\n");
682 }
683
684 /* Ensure the checksum is correct */
685 if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
686 ip_vs_checksum_complete(skb, ihl)) {
687 /* Failed checksum! */
688 IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n",
689 NIPQUAD(iph->saddr));
690 goto out;
691 }
692
693 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
694 offset += 2 * sizeof(__u16);
695 if (!ip_vs_make_skb_writable(pskb, offset))
696 goto out;
697 skb = *pskb;
698
699 ip_vs_nat_icmp(skb, pp, cp, 1);
700
701 /* do the statistics and put it back */
702 ip_vs_out_stats(cp, skb);
703
704 skb->nfcache |= NFC_IPVS_PROPERTY;
705 verdict = NF_ACCEPT;
706
707 out:
708 __ip_vs_conn_put(cp);
709
710 return verdict;
711}
712
713static inline int is_tcp_reset(const struct sk_buff *skb)
714{
715 struct tcphdr _tcph, *th;
716
717 th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
718 sizeof(_tcph), &_tcph);
719 if (th == NULL)
720 return 0;
721 return th->rst;
722}
723
724/*
725 * It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
726 * Check if outgoing packet belongs to the established ip_vs_conn,
727 * rewrite addresses of the packet and send it on its way...
728 */
729static unsigned int
730ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
731 const struct net_device *in, const struct net_device *out,
732 int (*okfn)(struct sk_buff *))
733{
734 struct sk_buff *skb = *pskb;
735 struct iphdr *iph;
736 struct ip_vs_protocol *pp;
737 struct ip_vs_conn *cp;
738 int ihl;
739
740 EnterFunction(11);
741
742 if (skb->nfcache & NFC_IPVS_PROPERTY)
743 return NF_ACCEPT;
744
745 iph = skb->nh.iph;
746 if (unlikely(iph->protocol == IPPROTO_ICMP)) {
747 int related, verdict = ip_vs_out_icmp(pskb, &related);
748
749 if (related)
750 return verdict;
751 skb = *pskb;
752 iph = skb->nh.iph;
753 }
754
755 pp = ip_vs_proto_get(iph->protocol);
756 if (unlikely(!pp))
757 return NF_ACCEPT;
758
759 /* reassemble IP fragments */
760 if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) &&
761 !pp->dont_defrag)) {
762 skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
763 if (!skb)
764 return NF_STOLEN;
765 iph = skb->nh.iph;
766 *pskb = skb;
767 }
768
769 ihl = iph->ihl << 2;
770
771 /*
772 * Check if the packet belongs to an existing entry
773 */
774 cp = pp->conn_out_get(skb, pp, iph, ihl, 0);
775
776 if (unlikely(!cp)) {
777 if (sysctl_ip_vs_nat_icmp_send &&
778 (pp->protocol == IPPROTO_TCP ||
779 pp->protocol == IPPROTO_UDP)) {
780 __u16 _ports[2], *pptr;
781
782 pptr = skb_header_pointer(skb, ihl,
783 sizeof(_ports), _ports);
784 if (pptr == NULL)
785 return NF_ACCEPT; /* Not for me */
786 if (ip_vs_lookup_real_service(iph->protocol,
787 iph->saddr, pptr[0])) {
788 /*
789 * Notify the real server that there is no
790 * existing entry, unless the packet is a
791 * TCP RST.
792 */
793 if (iph->protocol != IPPROTO_TCP
794 || !is_tcp_reset(skb)) {
795 icmp_send(skb,ICMP_DEST_UNREACH,
796 ICMP_PORT_UNREACH, 0);
797 return NF_DROP;
798 }
799 }
800 }
801 IP_VS_DBG_PKT(12, pp, skb, 0,
802 "packet continues traversal as normal");
803 return NF_ACCEPT;
804 }
805
806 IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
807
808 if (!ip_vs_make_skb_writable(pskb, ihl))
809 goto drop;
810
811 /* mangle the packet */
812 if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp))
813 goto drop;
814 skb = *pskb;
815 skb->nh.iph->saddr = cp->vaddr;
816 ip_send_check(skb->nh.iph);
817
818 IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
819
820 ip_vs_out_stats(cp, skb);
821 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
822 ip_vs_conn_put(cp);
823
824 skb->nfcache |= NFC_IPVS_PROPERTY;
825
826 LeaveFunction(11);
827 return NF_ACCEPT;
828
829 drop:
830 ip_vs_conn_put(cp);
831 kfree_skb(*pskb);
832 return NF_STOLEN;
833}
834
835
836/*
837 * Handle ICMP messages in the outside-to-inside direction (incoming).
838 * Find any that might be relevant, check against existing connections,
839 * forward to the right destination host if relevant.
840 * Currently handles error types - unreachable, quench, ttl exceeded.
841 */
842static int
843ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
844{
845 struct sk_buff *skb = *pskb;
846 struct iphdr *iph;
847 struct icmphdr _icmph, *ic;
848 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
849 struct ip_vs_conn *cp;
850 struct ip_vs_protocol *pp;
851 unsigned int offset, ihl, verdict;
852
853 *related = 1;
854
855 /* reassemble IP fragments */
856 if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
857 skb = ip_vs_gather_frags(skb,
858 hooknum == NF_IP_LOCAL_IN ?
859 IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD);
860 if (!skb)
861 return NF_STOLEN;
862 *pskb = skb;
863 }
864
865 iph = skb->nh.iph;
866 offset = ihl = iph->ihl * 4;
867 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
868 if (ic == NULL)
869 return NF_DROP;
870
871 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
872 ic->type, ntohs(icmp_id(ic)),
873 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
874
875 /*
876 * Work through seeing if this is for us.
877 * These checks are supposed to be in an order that means easy
878 * things are checked first to speed up processing.... however
879 * this means that some packets will manage to get a long way
880 * down this stack and then be rejected, but that's life.
881 */
882 if ((ic->type != ICMP_DEST_UNREACH) &&
883 (ic->type != ICMP_SOURCE_QUENCH) &&
884 (ic->type != ICMP_TIME_EXCEEDED)) {
885 *related = 0;
886 return NF_ACCEPT;
887 }
888
889 /* Now find the contained IP header */
890 offset += sizeof(_icmph);
891 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
892 if (cih == NULL)
893 return NF_ACCEPT; /* The packet looks wrong, ignore */
894
895 pp = ip_vs_proto_get(cih->protocol);
896 if (!pp)
897 return NF_ACCEPT;
898
899 /* Is the embedded protocol header present? */
900 if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
901 pp->dont_defrag))
902 return NF_ACCEPT;
903
904 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
905
906 offset += cih->ihl * 4;
907
908 /* The embedded headers contain source and dest in reverse order */
909 cp = pp->conn_in_get(skb, pp, cih, offset, 1);
910 if (!cp)
911 return NF_ACCEPT;
912
913 verdict = NF_DROP;
914
915 /* Ensure the checksum is correct */
916 if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
917 ip_vs_checksum_complete(skb, ihl)) {
918 /* Failed checksum! */
919 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
920 NIPQUAD(iph->saddr));
921 goto out;
922 }
923
924 /* do the statistics and put it back */
925 ip_vs_in_stats(cp, skb);
926 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
927 offset += 2 * sizeof(__u16);
928 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
929 /* do not touch skb anymore */
930
931 out:
932 __ip_vs_conn_put(cp);
933
934 return verdict;
935}
936
937/*
938 * Check if it's for virtual services, look it up,
939 * and send it on its way...
940 */
941static unsigned int
942ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
943 const struct net_device *in, const struct net_device *out,
944 int (*okfn)(struct sk_buff *))
945{
946 struct sk_buff *skb = *pskb;
947 struct iphdr *iph;
948 struct ip_vs_protocol *pp;
949 struct ip_vs_conn *cp;
950 int ret, restart;
951 int ihl;
952
953 /*
954 * Big tappo: only PACKET_HOST (neither loopback nor mcasts)
955 * ... don't know why 1st test DOES NOT include 2nd (?)
956 */
957 if (unlikely(skb->pkt_type != PACKET_HOST
958 || skb->dev == &loopback_dev || skb->sk)) {
959 IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
960 skb->pkt_type,
961 skb->nh.iph->protocol,
962 NIPQUAD(skb->nh.iph->daddr));
963 return NF_ACCEPT;
964 }
965
966 iph = skb->nh.iph;
967 if (unlikely(iph->protocol == IPPROTO_ICMP)) {
968 int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum);
969
970 if (related)
971 return verdict;
972 skb = *pskb;
973 iph = skb->nh.iph;
974 }
975
976 /* Protocol supported? */
977 pp = ip_vs_proto_get(iph->protocol);
978 if (unlikely(!pp))
979 return NF_ACCEPT;
980
981 ihl = iph->ihl << 2;
982
983 /*
984 * Check if the packet belongs to an existing connection entry
985 */
986 cp = pp->conn_in_get(skb, pp, iph, ihl, 0);
987
988 if (unlikely(!cp)) {
989 int v;
990
991 if (!pp->conn_schedule(skb, pp, &v, &cp))
992 return v;
993 }
994
995 if (unlikely(!cp)) {
996 /* sorry, all this trouble for a no-hit :) */
997 IP_VS_DBG_PKT(12, pp, skb, 0,
998 "packet continues traversal as normal");
999 return NF_ACCEPT;
1000 }
1001
1002 IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
1003
1004 /* Check the server status */
1005 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1006 /* the destination server is not available */
1007
1008 if (sysctl_ip_vs_expire_nodest_conn) {
1009 /* try to expire the connection immediately */
1010 ip_vs_conn_expire_now(cp);
1011 } else {
1012 /* don't restart its timer, and silently
1013 drop the packet. */
1014 __ip_vs_conn_put(cp);
1015 }
1016 return NF_DROP;
1017 }
1018
1019 ip_vs_in_stats(cp, skb);
1020 restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1021 if (cp->packet_xmit)
1022 ret = cp->packet_xmit(skb, cp, pp);
1023 /* do not touch skb anymore */
1024 else {
1025 IP_VS_DBG_RL("warning: packet_xmit is null");
1026 ret = NF_ACCEPT;
1027 }
1028
1029 /* increase its packet counter and check whether it needs
1030 to be synchronized */
1031 atomic_inc(&cp->in_pkts);
1032 if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1033 (cp->protocol != IPPROTO_TCP ||
1034 cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1035 (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
1036 == sysctl_ip_vs_sync_threshold[0]))
1037 ip_vs_sync_conn(cp);
1038
1039 ip_vs_conn_put(cp);
1040 return ret;
1041}
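As a rough worked example of the sync-threshold test above (illustrative userspace code, not kernel code), with the default sysctl_ip_vs_sync_threshold of {3, 50} an established connection is synced on its 3rd, 53rd, 103rd, ... incoming packet:

#include <stdio.h>

int main(void)
{
	int threshold[2] = { 3, 50 };	/* default sysctl_ip_vs_sync_threshold */
	int in_pkts;

	/* same modulo test that ip_vs_in() applies before ip_vs_sync_conn() */
	for (in_pkts = 1; in_pkts <= 120; in_pkts++)
		if (in_pkts % threshold[1] == threshold[0])
			printf("sync at packet %d\n", in_pkts);	/* 3, 53, 103 */
	return 0;
}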
1042
1043
1044/*
1045 * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP
1046 * related packets destined for 0.0.0.0/0.
1047 * When fwmark-based virtual service is used, such as transparent
1048 * cache cluster, TCP packets can be marked and routed to ip_vs_in,
1049 * but ICMP destined for 0.0.0.0/0 cannot easily be marked and
1050 * sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain
1051 * and send them to ip_vs_in_icmp.
1052 */
1053static unsigned int
1054ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb,
1055 const struct net_device *in, const struct net_device *out,
1056 int (*okfn)(struct sk_buff *))
1057{
1058 int r;
1059
1060 if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP)
1061 return NF_ACCEPT;
1062
1063 return ip_vs_in_icmp(pskb, &r, hooknum);
1064}
1065
1066
1067/* After packet filtering, forward packet through VS/DR, VS/TUN,
1068 or VS/NAT(change destination), so that filtering rules can be
1069 applied to IPVS. */
1070static struct nf_hook_ops ip_vs_in_ops = {
1071 .hook = ip_vs_in,
1072 .owner = THIS_MODULE,
1073 .pf = PF_INET,
1074 .hooknum = NF_IP_LOCAL_IN,
1075 .priority = 100,
1076};
1077
1078/* After packet filtering, change source only for VS/NAT */
1079static struct nf_hook_ops ip_vs_out_ops = {
1080 .hook = ip_vs_out,
1081 .owner = THIS_MODULE,
1082 .pf = PF_INET,
1083 .hooknum = NF_IP_FORWARD,
1084 .priority = 100,
1085};
1086
1087/* After packet filtering (but before ip_vs_out_icmp), catch icmp
1088 destined for 0.0.0.0/0, which is for incoming IPVS connections */
1089static struct nf_hook_ops ip_vs_forward_icmp_ops = {
1090 .hook = ip_vs_forward_icmp,
1091 .owner = THIS_MODULE,
1092 .pf = PF_INET,
1093 .hooknum = NF_IP_FORWARD,
1094 .priority = 99,
1095};
1096
1097/* Before the netfilter connection tracking, exit from POST_ROUTING */
1098static struct nf_hook_ops ip_vs_post_routing_ops = {
1099 .hook = ip_vs_post_routing,
1100 .owner = THIS_MODULE,
1101 .pf = PF_INET,
1102 .hooknum = NF_IP_POST_ROUTING,
1103 .priority = NF_IP_PRI_NAT_SRC-1,
1104};
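/*
 * Ordering note: netfilter invokes the hooks of a chain in ascending
 * priority order, so on NF_IP_FORWARD ip_vs_forward_icmp (priority 99)
 * runs just before ip_vs_out (priority 100), and ip_vs_post_routing
 * runs just before the netfilter source-NAT hook (NF_IP_PRI_NAT_SRC).
 */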
1105
1106
1107/*
1108 * Initialize IP Virtual Server
1109 */
1110static int __init ip_vs_init(void)
1111{
1112 int ret;
1113
1114 ret = ip_vs_control_init();
1115 if (ret < 0) {
1116 IP_VS_ERR("can't setup control.\n");
1117 goto cleanup_nothing;
1118 }
1119
1120 ip_vs_protocol_init();
1121
1122 ret = ip_vs_app_init();
1123 if (ret < 0) {
1124 IP_VS_ERR("can't setup application helper.\n");
1125 goto cleanup_protocol;
1126 }
1127
1128 ret = ip_vs_conn_init();
1129 if (ret < 0) {
1130 IP_VS_ERR("can't setup connection table.\n");
1131 goto cleanup_app;
1132 }
1133
1134 ret = nf_register_hook(&ip_vs_in_ops);
1135 if (ret < 0) {
1136 IP_VS_ERR("can't register in hook.\n");
1137 goto cleanup_conn;
1138 }
1139
1140 ret = nf_register_hook(&ip_vs_out_ops);
1141 if (ret < 0) {
1142 IP_VS_ERR("can't register out hook.\n");
1143 goto cleanup_inops;
1144 }
1145 ret = nf_register_hook(&ip_vs_post_routing_ops);
1146 if (ret < 0) {
1147 IP_VS_ERR("can't register post_routing hook.\n");
1148 goto cleanup_outops;
1149 }
1150 ret = nf_register_hook(&ip_vs_forward_icmp_ops);
1151 if (ret < 0) {
1152 IP_VS_ERR("can't register forward_icmp hook.\n");
1153 goto cleanup_postroutingops;
1154 }
1155
1156 IP_VS_INFO("ipvs loaded.\n");
1157 return ret;
1158
1159 cleanup_postroutingops:
1160 nf_unregister_hook(&ip_vs_post_routing_ops);
1161 cleanup_outops:
1162 nf_unregister_hook(&ip_vs_out_ops);
1163 cleanup_inops:
1164 nf_unregister_hook(&ip_vs_in_ops);
1165 cleanup_conn:
1166 ip_vs_conn_cleanup();
1167 cleanup_app:
1168 ip_vs_app_cleanup();
1169 cleanup_protocol:
1170 ip_vs_protocol_cleanup();
1171 ip_vs_control_cleanup();
1172 cleanup_nothing:
1173 return ret;
1174}
1175
1176static void __exit ip_vs_cleanup(void)
1177{
1178 nf_unregister_hook(&ip_vs_forward_icmp_ops);
1179 nf_unregister_hook(&ip_vs_post_routing_ops);
1180 nf_unregister_hook(&ip_vs_out_ops);
1181 nf_unregister_hook(&ip_vs_in_ops);
1182 ip_vs_conn_cleanup();
1183 ip_vs_app_cleanup();
1184 ip_vs_protocol_cleanup();
1185 ip_vs_control_cleanup();
1186 IP_VS_INFO("ipvs unloaded.\n");
1187}
1188
1189module_init(ip_vs_init);
1190module_exit(ip_vs_cleanup);
1191MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
new file mode 100644
index 000000000000..218d9701036e
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -0,0 +1,2391 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 *
19 * Changes:
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/init.h>
25#include <linux/types.h>
26#include <linux/fs.h>
27#include <linux/sysctl.h>
28#include <linux/proc_fs.h>
29#include <linux/workqueue.h>
30#include <linux/swap.h>
31#include <linux/proc_fs.h>
32#include <linux/seq_file.h>
33
34#include <linux/netfilter.h>
35#include <linux/netfilter_ipv4.h>
36
37#include <net/ip.h>
38#include <net/sock.h>
39
40#include <asm/uaccess.h>
41
42#include <net/ip_vs.h>
43
44/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
45static DECLARE_MUTEX(__ip_vs_mutex);
46
47/* lock for service table */
48static DEFINE_RWLOCK(__ip_vs_svc_lock);
49
50/* lock for table with the real services */
51static DEFINE_RWLOCK(__ip_vs_rs_lock);
52
53/* lock for state and timeout tables */
54static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
55
56/* lock for drop entry handling */
57static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
58
59/* lock for drop packet handling */
60static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
61
62/* 1/rate drop and drop-entry variables */
63int ip_vs_drop_rate = 0;
64int ip_vs_drop_counter = 0;
65static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
66
67/* number of virtual services */
68static int ip_vs_num_services = 0;
69
70/* sysctl variables */
71static int sysctl_ip_vs_drop_entry = 0;
72static int sysctl_ip_vs_drop_packet = 0;
73static int sysctl_ip_vs_secure_tcp = 0;
74static int sysctl_ip_vs_amemthresh = 1024;
75static int sysctl_ip_vs_am_droprate = 10;
76int sysctl_ip_vs_cache_bypass = 0;
77int sysctl_ip_vs_expire_nodest_conn = 0;
78int sysctl_ip_vs_expire_quiescent_template = 0;
79int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
80int sysctl_ip_vs_nat_icmp_send = 0;
81
82
83#ifdef CONFIG_IP_VS_DEBUG
84static int sysctl_ip_vs_debug_level = 0;
85
86int ip_vs_get_debug_level(void)
87{
88 return sysctl_ip_vs_debug_level;
89}
90#endif
91
92/*
93 * update_defense_level is called from keventd and from sysctl.
94 */
95static void update_defense_level(void)
96{
97 struct sysinfo i;
98 static int old_secure_tcp = 0;
99 int availmem;
100 int nomem;
101 int to_change = -1;
102
103 /* we only count free and buffered memory (in pages) */
104 si_meminfo(&i);
105 availmem = i.freeram + i.bufferram;
106 /* however, in linux 2.5 the i.bufferram is the total page cache size;
107 we need to adjust it */
108 /* si_swapinfo(&i); */
109 /* availmem = availmem - (i.totalswap - i.freeswap); */
110
111 nomem = (availmem < sysctl_ip_vs_amemthresh);
112
113 /* drop_entry */
114 spin_lock(&__ip_vs_dropentry_lock);
115 switch (sysctl_ip_vs_drop_entry) {
116 case 0:
117 atomic_set(&ip_vs_dropentry, 0);
118 break;
119 case 1:
120 if (nomem) {
121 atomic_set(&ip_vs_dropentry, 1);
122 sysctl_ip_vs_drop_entry = 2;
123 } else {
124 atomic_set(&ip_vs_dropentry, 0);
125 }
126 break;
127 case 2:
128 if (nomem) {
129 atomic_set(&ip_vs_dropentry, 1);
130 } else {
131 atomic_set(&ip_vs_dropentry, 0);
132 sysctl_ip_vs_drop_entry = 1;
133 };
134 break;
135 case 3:
136 atomic_set(&ip_vs_dropentry, 1);
137 break;
138 }
139 spin_unlock(&__ip_vs_dropentry_lock);
140
141 /* drop_packet */
142 spin_lock(&__ip_vs_droppacket_lock);
143 switch (sysctl_ip_vs_drop_packet) {
144 case 0:
145 ip_vs_drop_rate = 0;
146 break;
147 case 1:
148 if (nomem) {
149 ip_vs_drop_rate = ip_vs_drop_counter
150 = sysctl_ip_vs_amemthresh /
151 (sysctl_ip_vs_amemthresh-availmem);
152 sysctl_ip_vs_drop_packet = 2;
153 } else {
154 ip_vs_drop_rate = 0;
155 }
156 break;
157 case 2:
158 if (nomem) {
159 ip_vs_drop_rate = ip_vs_drop_counter
160 = sysctl_ip_vs_amemthresh /
161 (sysctl_ip_vs_amemthresh-availmem);
162 } else {
163 ip_vs_drop_rate = 0;
164 sysctl_ip_vs_drop_packet = 1;
165 }
166 break;
167 case 3:
168 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
169 break;
170 }
171 spin_unlock(&__ip_vs_droppacket_lock);
172
173 /* secure_tcp */
174 write_lock(&__ip_vs_securetcp_lock);
175 switch (sysctl_ip_vs_secure_tcp) {
176 case 0:
177 if (old_secure_tcp >= 2)
178 to_change = 0;
179 break;
180 case 1:
181 if (nomem) {
182 if (old_secure_tcp < 2)
183 to_change = 1;
184 sysctl_ip_vs_secure_tcp = 2;
185 } else {
186 if (old_secure_tcp >= 2)
187 to_change = 0;
188 }
189 break;
190 case 2:
191 if (nomem) {
192 if (old_secure_tcp < 2)
193 to_change = 1;
194 } else {
195 if (old_secure_tcp >= 2)
196 to_change = 0;
197 sysctl_ip_vs_secure_tcp = 1;
198 }
199 break;
200 case 3:
201 if (old_secure_tcp < 2)
202 to_change = 1;
203 break;
204 }
205 old_secure_tcp = sysctl_ip_vs_secure_tcp;
206 if (to_change >= 0)
207 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
208 write_unlock(&__ip_vs_securetcp_lock);
209}
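As a rough worked example of the drop_packet arithmetic above (illustrative userspace code; the availmem value is made up), with the default amemthresh of 1024 pages and 768 available pages the rate comes out to 4, i.e. roughly one in four:

#include <stdio.h>

int main(void)
{
	int amemthresh = 1024;	/* default sysctl_ip_vs_amemthresh */
	int availmem   = 768;	/* hypothetical free + buffered pages */

	if (availmem < amemthresh) {
		/* same integer division as update_defense_level() */
		int rate = amemthresh / (amemthresh - availmem);
		printf("drop rate = 1 in %d\n", rate);	/* 1 in 4 */
	}
	return 0;
}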
210
211
212/*
213 * Timer for checking the defense
214 */
215#define DEFENSE_TIMER_PERIOD 1*HZ
216static void defense_work_handler(void *data);
217static DECLARE_WORK(defense_work, defense_work_handler, NULL);
218
219static void defense_work_handler(void *data)
220{
221 update_defense_level();
222 if (atomic_read(&ip_vs_dropentry))
223 ip_vs_random_dropentry();
224
225 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
226}
227
228int
229ip_vs_use_count_inc(void)
230{
231 return try_module_get(THIS_MODULE);
232}
233
234void
235ip_vs_use_count_dec(void)
236{
237 module_put(THIS_MODULE);
238}
239
240
241/*
242 * Hash table: for virtual service lookups
243 */
244#define IP_VS_SVC_TAB_BITS 8
245#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
246#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
247
248/* the service table hashed by <protocol, addr, port> */
249static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
250/* the service table hashed by fwmark */
251static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
252
253/*
254 * Hash table: for real service lookups
255 */
256#define IP_VS_RTAB_BITS 4
257#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
258#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
259
260static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
261
262/*
263 * Trash for destinations
264 */
265static LIST_HEAD(ip_vs_dest_trash);
266
267/*
268 * FTP & NULL virtual service counters
269 */
270static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
271static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
272
273
274/*
275 * Returns hash value for virtual service
276 */
277static __inline__ unsigned
278ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
279{
280 register unsigned porth = ntohs(port);
281
282 return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
283 & IP_VS_SVC_TAB_MASK;
284}
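A small illustration of the hash key above (userspace sketch; the address and port are arbitrary and already converted to host byte order): TCP 192.168.0.1:80 lands in bucket 87.

#include <stdio.h>

#define TAB_BITS 8
#define TAB_MASK ((1 << TAB_BITS) - 1)	/* IP_VS_SVC_TAB_MASK for 8 bits */

int main(void)
{
	unsigned proto = 6;		/* IPPROTO_TCP */
	unsigned addr  = 0xC0A80001;	/* 192.168.0.1, host byte order */
	unsigned porth = 80;

	/* same mix as ip_vs_svc_hashkey() after ntohl()/ntohs() */
	unsigned hash = (proto ^ addr ^ (porth >> TAB_BITS) ^ porth) & TAB_MASK;
	printf("bucket = %u\n", hash);	/* prints 87 */
	return 0;
}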
285
286/*
287 * Returns hash value of fwmark for virtual service lookup
288 */
289static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
290{
291 return fwmark & IP_VS_SVC_TAB_MASK;
292}
293
294/*
295 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
296 * or in the ip_vs_svc_fwm_table by fwmark.
297 * Should be called with locked tables.
298 */
299static int ip_vs_svc_hash(struct ip_vs_service *svc)
300{
301 unsigned hash;
302
303 if (svc->flags & IP_VS_SVC_F_HASHED) {
304 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
305 "called from %p\n", __builtin_return_address(0));
306 return 0;
307 }
308
309 if (svc->fwmark == 0) {
310 /*
311 * Hash it by <protocol,addr,port> in ip_vs_svc_table
312 */
313 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
314 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
315 } else {
316 /*
317 * Hash it by fwmark in ip_vs_svc_fwm_table
318 */
319 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
320 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
321 }
322
323 svc->flags |= IP_VS_SVC_F_HASHED;
324 /* increase its refcnt because it is referenced by the svc table */
325 atomic_inc(&svc->refcnt);
326 return 1;
327}
328
329
330/*
331 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
332 * Should be called with locked tables.
333 */
334static int ip_vs_svc_unhash(struct ip_vs_service *svc)
335{
336 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
337 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
338 "called from %p\n", __builtin_return_address(0));
339 return 0;
340 }
341
342 if (svc->fwmark == 0) {
343 /* Remove it from the ip_vs_svc_table table */
344 list_del(&svc->s_list);
345 } else {
346 /* Remove it from the ip_vs_svc_fwm_table table */
347 list_del(&svc->f_list);
348 }
349
350 svc->flags &= ~IP_VS_SVC_F_HASHED;
351 atomic_dec(&svc->refcnt);
352 return 1;
353}
354
355
356/*
357 * Get service by {proto,addr,port} in the service table.
358 */
359static __inline__ struct ip_vs_service *
360__ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
361{
362 unsigned hash;
363 struct ip_vs_service *svc;
364
365 /* Check for "full" addressed entries */
366 hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
367
368 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
369 if ((svc->addr == vaddr)
370 && (svc->port == vport)
371 && (svc->protocol == protocol)) {
372 /* HIT */
373 atomic_inc(&svc->usecnt);
374 return svc;
375 }
376 }
377
378 return NULL;
379}
380
381
382/*
383 * Get service by {fwmark} in the service table.
384 */
385static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
386{
387 unsigned hash;
388 struct ip_vs_service *svc;
389
390 /* Check for fwmark addressed entries */
391 hash = ip_vs_svc_fwm_hashkey(fwmark);
392
393 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
394 if (svc->fwmark == fwmark) {
395 /* HIT */
396 atomic_inc(&svc->usecnt);
397 return svc;
398 }
399 }
400
401 return NULL;
402}
403
404struct ip_vs_service *
405ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
406{
407 struct ip_vs_service *svc;
408
409 read_lock(&__ip_vs_svc_lock);
410
411 /*
412 * Check the table hashed by fwmark first
413 */
414 if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
415 goto out;
416
417 /*
418 * Check the table hashed by <protocol,addr,port>
419 * for "full" addressed entries
420 */
421 svc = __ip_vs_service_get(protocol, vaddr, vport);
422
423 if (svc == NULL
424 && protocol == IPPROTO_TCP
425 && atomic_read(&ip_vs_ftpsvc_counter)
426 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
427 /*
428 * Check if ftp service entry exists, the packet
429 * might belong to FTP data connections.
430 */
431 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
432 }
433
434 if (svc == NULL
435 && atomic_read(&ip_vs_nullsvc_counter)) {
436 /*
437 * Check if the catch-all port (port zero) exists
438 */
439 svc = __ip_vs_service_get(protocol, vaddr, 0);
440 }
441
442 out:
443 read_unlock(&__ip_vs_svc_lock);
444
445 IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
446 fwmark, ip_vs_proto_name(protocol),
447 NIPQUAD(vaddr), ntohs(vport),
448 svc?"hit":"not hit");
449
450 return svc;
451}
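/*
 * Lookup order used above: the fwmark table first (when fwmark != 0), then
 * the <protocol,vaddr,vport> table; on a miss, a TCP packet whose destination
 * port is FTPDATA or an unprivileged port falls back to the
 * <protocol,vaddr,FTPPORT> entry if an FTP virtual service exists, and
 * finally any protocol falls back to the catch-all port-zero entry
 * <protocol,vaddr,0> if one exists.
 */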
452
453
454static inline void
455__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
456{
457 atomic_inc(&svc->refcnt);
458 dest->svc = svc;
459}
460
461static inline void
462__ip_vs_unbind_svc(struct ip_vs_dest *dest)
463{
464 struct ip_vs_service *svc = dest->svc;
465
466 dest->svc = NULL;
467 if (atomic_dec_and_test(&svc->refcnt))
468 kfree(svc);
469}
470
471
472/*
473 * Returns hash value for real service
474 */
475static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
476{
477 register unsigned porth = ntohs(port);
478
479 return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
480 & IP_VS_RTAB_MASK;
481}
482
483/*
484 * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
485 * should be called with locked tables.
486 */
487static int ip_vs_rs_hash(struct ip_vs_dest *dest)
488{
489 unsigned hash;
490
491 if (!list_empty(&dest->d_list)) {
492 return 0;
493 }
494
495 /*
496 * Hash by proto,addr,port,
497 * which are the parameters of the real service.
498 */
499 hash = ip_vs_rs_hashkey(dest->addr, dest->port);
500 list_add(&dest->d_list, &ip_vs_rtable[hash]);
501
502 return 1;
503}
504
505/*
506 * UNhashes ip_vs_dest from ip_vs_rtable.
507 * should be called with locked tables.
508 */
509static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
510{
511 /*
512 * Remove it from the ip_vs_rtable table.
513 */
514 if (!list_empty(&dest->d_list)) {
515 list_del(&dest->d_list);
516 INIT_LIST_HEAD(&dest->d_list);
517 }
518
519 return 1;
520}
521
522/*
523 * Lookup real service by <proto,addr,port> in the real service table.
524 */
525struct ip_vs_dest *
526ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
527{
528 unsigned hash;
529 struct ip_vs_dest *dest;
530
531 /*
532 * Check for "full" addressed entries
533 * Return the first found entry
534 */
535 hash = ip_vs_rs_hashkey(daddr, dport);
536
537 read_lock(&__ip_vs_rs_lock);
538 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
539 if ((dest->addr == daddr)
540 && (dest->port == dport)
541 && ((dest->protocol == protocol) ||
542 dest->vfwmark)) {
543 /* HIT */
544 read_unlock(&__ip_vs_rs_lock);
545 return dest;
546 }
547 }
548 read_unlock(&__ip_vs_rs_lock);
549
550 return NULL;
551}
552
553/*
554 * Lookup destination by {addr,port} in the given service
555 */
556static struct ip_vs_dest *
557ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
558{
559 struct ip_vs_dest *dest;
560
561 /*
562 * Find the destination for the given service
563 */
564 list_for_each_entry(dest, &svc->destinations, n_list) {
565 if ((dest->addr == daddr) && (dest->port == dport)) {
566 /* HIT */
567 return dest;
568 }
569 }
570
571 return NULL;
572}
573
574
575/*
576 * Lookup dest by {svc,addr,port} in the destination trash.
577 * The destination trash is used to hold the destinations that are removed
578 * from the service table but are still referenced by some conn entries.
579 * The trash exists because when a dest is temporarily taken down
580 * (either by the administrator or by a monitor program), it can be
581 * picked back from the trash: the remaining connections to the dest
582 * can continue, and the dest's counters remain useful for
583 * scheduling.
584 */
585static struct ip_vs_dest *
586ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
587{
588 struct ip_vs_dest *dest, *nxt;
589
590 /*
591 * Find the destination in trash
592 */
593 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
594 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
595 "refcnt=%d\n",
596 dest->vfwmark,
597 NIPQUAD(dest->addr), ntohs(dest->port),
598 atomic_read(&dest->refcnt));
599 if (dest->addr == daddr &&
600 dest->port == dport &&
601 dest->vfwmark == svc->fwmark &&
602 dest->protocol == svc->protocol &&
603 (svc->fwmark ||
604 (dest->vaddr == svc->addr &&
605 dest->vport == svc->port))) {
606 /* HIT */
607 return dest;
608 }
609
610 /*
611 * Try to purge the destination from trash if not referenced
612 */
613 if (atomic_read(&dest->refcnt) == 1) {
614 IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
615 "from trash\n",
616 dest->vfwmark,
617 NIPQUAD(dest->addr), ntohs(dest->port));
618 list_del(&dest->n_list);
619 ip_vs_dst_reset(dest);
620 __ip_vs_unbind_svc(dest);
621 kfree(dest);
622 }
623 }
624
625 return NULL;
626}
627
628
629/*
630 * Clean up all the destinations in the trash
631 * Called by the ip_vs_control_cleanup()
632 *
633 * When ip_vs_control_cleanup is invoked on ipvs module exit,
634 * the service tables have already been flushed and all the
635 * connections have expired, so the refcnt of each destination in
636 * the trash must be 1, and we simply release them here.
637 */
638static void ip_vs_trash_cleanup(void)
639{
640 struct ip_vs_dest *dest, *nxt;
641
642 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
643 list_del(&dest->n_list);
644 ip_vs_dst_reset(dest);
645 __ip_vs_unbind_svc(dest);
646 kfree(dest);
647 }
648}
649
650
651static void
652ip_vs_zero_stats(struct ip_vs_stats *stats)
653{
654 spin_lock_bh(&stats->lock);
655 memset(stats, 0, (char *)&stats->lock - (char *)stats);
656 spin_unlock_bh(&stats->lock);
657 ip_vs_zero_estimator(stats);
658}
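A minimal sketch of the partial memset trick above (illustrative userspace code with a toy struct, not the kernel ip_vs_stats layout): only the members laid out before the lock are cleared.

#include <stdio.h>
#include <string.h>

struct toy_stats {
	unsigned long conns;
	unsigned long inpkts;
	int lock;		/* stands in for the spinlock */
	int estimator;		/* members after the lock are preserved */
};

int main(void)
{
	struct toy_stats s = { 1, 2, 42, 7 };

	/* same pointer arithmetic as ip_vs_zero_stats() */
	memset(&s, 0, (char *)&s.lock - (char *)&s);

	printf("conns=%lu lock=%d estimator=%d\n", s.conns, s.lock, s.estimator);
	return 0;	/* prints: conns=0 lock=42 estimator=7 */
}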
659
660/*
661 * Update a destination in the given service
662 */
663static void
664__ip_vs_update_dest(struct ip_vs_service *svc,
665 struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
666{
667 int conn_flags;
668
669 /* set the weight and the flags */
670 atomic_set(&dest->weight, udest->weight);
671 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
672
673 /* check if local node and update the flags */
674 if (inet_addr_type(udest->addr) == RTN_LOCAL) {
675 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
676 | IP_VS_CONN_F_LOCALNODE;
677 }
678
679 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
680 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
681 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
682 } else {
683 /*
684 * Put the real service in ip_vs_rtable if not present.
685 * For now only for NAT!
686 */
687 write_lock_bh(&__ip_vs_rs_lock);
688 ip_vs_rs_hash(dest);
689 write_unlock_bh(&__ip_vs_rs_lock);
690 }
691 atomic_set(&dest->conn_flags, conn_flags);
692
693 /* bind the service */
694 if (!dest->svc) {
695 __ip_vs_bind_svc(dest, svc);
696 } else {
697 if (dest->svc != svc) {
698 __ip_vs_unbind_svc(dest);
699 ip_vs_zero_stats(&dest->stats);
700 __ip_vs_bind_svc(dest, svc);
701 }
702 }
703
704 /* set the dest status flags */
705 dest->flags |= IP_VS_DEST_F_AVAILABLE;
706
707 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
708 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
709 dest->u_threshold = udest->u_threshold;
710 dest->l_threshold = udest->l_threshold;
711}
712
713
714/*
715 * Create a destination for the given service
716 */
717static int
718ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
719 struct ip_vs_dest **dest_p)
720{
721 struct ip_vs_dest *dest;
722 unsigned atype;
723
724 EnterFunction(2);
725
726 atype = inet_addr_type(udest->addr);
727 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
728 return -EINVAL;
729
730 dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
731 if (dest == NULL) {
732 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
733 return -ENOMEM;
734 }
735 memset(dest, 0, sizeof(struct ip_vs_dest));
736
737 dest->protocol = svc->protocol;
738 dest->vaddr = svc->addr;
739 dest->vport = svc->port;
740 dest->vfwmark = svc->fwmark;
741 dest->addr = udest->addr;
742 dest->port = udest->port;
743
744 atomic_set(&dest->activeconns, 0);
745 atomic_set(&dest->inactconns, 0);
746 atomic_set(&dest->persistconns, 0);
747 atomic_set(&dest->refcnt, 0);
748
749 INIT_LIST_HEAD(&dest->d_list);
750 spin_lock_init(&dest->dst_lock);
751 spin_lock_init(&dest->stats.lock);
752 __ip_vs_update_dest(svc, dest, udest);
753 ip_vs_new_estimator(&dest->stats);
754
755 *dest_p = dest;
756
757 LeaveFunction(2);
758 return 0;
759}
760
761
762/*
763 * Add a destination into an existing service
764 */
765static int
766ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
767{
768 struct ip_vs_dest *dest;
769 __u32 daddr = udest->addr;
770 __u16 dport = udest->port;
771 int ret;
772
773 EnterFunction(2);
774
775 if (udest->weight < 0) {
776 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
777 return -ERANGE;
778 }
779
780 if (udest->l_threshold > udest->u_threshold) {
781 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
782 "upper threshold\n");
783 return -ERANGE;
784 }
785
786 /*
787 * Check if the dest already exists in the list
788 */
789 dest = ip_vs_lookup_dest(svc, daddr, dport);
790 if (dest != NULL) {
791 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
792 return -EEXIST;
793 }
794
795 /*
796 * Check if the dest already exists in the trash and
797 * is from the same service
798 */
799 dest = ip_vs_trash_get_dest(svc, daddr, dport);
800 if (dest != NULL) {
801 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
802 "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
803 NIPQUAD(daddr), ntohs(dport),
804 atomic_read(&dest->refcnt),
805 dest->vfwmark,
806 NIPQUAD(dest->vaddr),
807 ntohs(dest->vport));
808 __ip_vs_update_dest(svc, dest, udest);
809
810 /*
811 * Get the destination from the trash
812 */
813 list_del(&dest->n_list);
814
815 ip_vs_new_estimator(&dest->stats);
816
817 write_lock_bh(&__ip_vs_svc_lock);
818
819 /*
820 * Wait until all other svc users go away.
821 */
822 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
823
824 list_add(&dest->n_list, &svc->destinations);
825 svc->num_dests++;
826
827 /* call the update_service function of its scheduler */
828 svc->scheduler->update_service(svc);
829
830 write_unlock_bh(&__ip_vs_svc_lock);
831 return 0;
832 }
833
834 /*
835 * Allocate and initialize the dest structure
836 */
837 ret = ip_vs_new_dest(svc, udest, &dest);
838 if (ret) {
839 return ret;
840 }
841
842 /*
843 * Add the dest entry into the list
844 */
845 atomic_inc(&dest->refcnt);
846
847 write_lock_bh(&__ip_vs_svc_lock);
848
849 /*
850 * Wait until all other svc users go away.
851 */
852 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
853
854 list_add(&dest->n_list, &svc->destinations);
855 svc->num_dests++;
856
857 /* call the update_service function of its scheduler */
858 svc->scheduler->update_service(svc);
859
860 write_unlock_bh(&__ip_vs_svc_lock);
861
862 LeaveFunction(2);
863
864 return 0;
865}
866
867
868/*
869 * Edit a destination in the given service
870 */
871static int
872ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
873{
874 struct ip_vs_dest *dest;
875 __u32 daddr = udest->addr;
876 __u16 dport = udest->port;
877
878 EnterFunction(2);
879
880 if (udest->weight < 0) {
881 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
882 return -ERANGE;
883 }
884
885 if (udest->l_threshold > udest->u_threshold) {
886 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
887 "upper threshold\n");
888 return -ERANGE;
889 }
890
891 /*
892 * Lookup the destination list
893 */
894 dest = ip_vs_lookup_dest(svc, daddr, dport);
895 if (dest == NULL) {
896 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
897 return -ENOENT;
898 }
899
900 __ip_vs_update_dest(svc, dest, udest);
901
902 write_lock_bh(&__ip_vs_svc_lock);
903
904 /* Wait until all other svc users go away */
905 while (atomic_read(&svc->usecnt) > 1) {};
906
907 /* call the update_service, because server weight may be changed */
908 svc->scheduler->update_service(svc);
909
910 write_unlock_bh(&__ip_vs_svc_lock);
911
912 LeaveFunction(2);
913
914 return 0;
915}
916
917
918/*
919 * Delete a destination (must be already unlinked from the service)
920 */
921static void __ip_vs_del_dest(struct ip_vs_dest *dest)
922{
923 ip_vs_kill_estimator(&dest->stats);
924
925 /*
926 * Remove it from the d-linked list with the real services.
927 */
928 write_lock_bh(&__ip_vs_rs_lock);
929 ip_vs_rs_unhash(dest);
930 write_unlock_bh(&__ip_vs_rs_lock);
931
932 /*
933 * Decrease the refcnt of the dest, and free the dest
934 * if nobody refers to it (refcnt=0). Otherwise, throw
935 * the destination into the trash.
936 */
937 if (atomic_dec_and_test(&dest->refcnt)) {
938 ip_vs_dst_reset(dest);
939 /* simply decrease svc->refcnt here, let the caller check
940 and release the service if nobody refers to it.
941 Only user context can release destination and service,
942 and only one user context can update virtual service at a
943 time, so the operation here is OK */
944 atomic_dec(&dest->svc->refcnt);
945 kfree(dest);
946 } else {
947 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
948 NIPQUAD(dest->addr), ntohs(dest->port),
949 atomic_read(&dest->refcnt));
950 list_add(&dest->n_list, &ip_vs_dest_trash);
951 atomic_inc(&dest->refcnt);
952 }
953}
954
955
956/*
957 * Unlink a destination from the given service
958 */
959static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
960 struct ip_vs_dest *dest,
961 int svcupd)
962{
963 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
964
965 /*
966 * Remove it from the d-linked destination list.
967 */
968 list_del(&dest->n_list);
969 svc->num_dests--;
970 if (svcupd) {
971 /*
972 * Call the update_service function of its scheduler
973 */
974 svc->scheduler->update_service(svc);
975 }
976}
977
978
979/*
980 * Delete a destination server in the given service
981 */
982static int
983ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
984{
985 struct ip_vs_dest *dest;
986 __u32 daddr = udest->addr;
987 __u16 dport = udest->port;
988
989 EnterFunction(2);
990
991 dest = ip_vs_lookup_dest(svc, daddr, dport);
992 if (dest == NULL) {
993 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
994 return -ENOENT;
995 }
996
997 write_lock_bh(&__ip_vs_svc_lock);
998
999 /*
1000 * Wait until all other svc users go away.
1001 */
1002 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1003
1004 /*
1005 * Unlink dest from the service
1006 */
1007 __ip_vs_unlink_dest(svc, dest, 1);
1008
1009 write_unlock_bh(&__ip_vs_svc_lock);
1010
1011 /*
1012 * Delete the destination
1013 */
1014 __ip_vs_del_dest(dest);
1015
1016 LeaveFunction(2);
1017
1018 return 0;
1019}
1020
1021
1022/*
1023 * Add a service into the service hash table
1024 */
1025static int
1026ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1027{
1028 int ret = 0;
1029 struct ip_vs_scheduler *sched = NULL;
1030 struct ip_vs_service *svc = NULL;
1031
1032 /* increase the module use count */
1033 ip_vs_use_count_inc();
1034
1035 /* Lookup the scheduler by 'u->sched_name' */
1036 sched = ip_vs_scheduler_get(u->sched_name);
1037 if (sched == NULL) {
1038 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1039 u->sched_name);
1040 ret = -ENOENT;
1041 goto out_mod_dec;
1042 }
1043
1044 svc = (struct ip_vs_service *)
1045 kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1046 if (svc == NULL) {
1047 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1048 ret = -ENOMEM;
1049 goto out_err;
1050 }
1051 memset(svc, 0, sizeof(struct ip_vs_service));
1052
1053 /* I'm the first user of the service */
1054 atomic_set(&svc->usecnt, 1);
1055 atomic_set(&svc->refcnt, 0);
1056
1057 svc->protocol = u->protocol;
1058 svc->addr = u->addr;
1059 svc->port = u->port;
1060 svc->fwmark = u->fwmark;
1061 svc->flags = u->flags;
1062 svc->timeout = u->timeout * HZ;
1063 svc->netmask = u->netmask;
1064
1065 INIT_LIST_HEAD(&svc->destinations);
1066 rwlock_init(&svc->sched_lock);
1067 spin_lock_init(&svc->stats.lock);
1068
1069 /* Bind the scheduler */
1070 ret = ip_vs_bind_scheduler(svc, sched);
1071 if (ret)
1072 goto out_err;
1073 sched = NULL;
1074
1075 /* Update the virtual service counters */
1076 if (svc->port == FTPPORT)
1077 atomic_inc(&ip_vs_ftpsvc_counter);
1078 else if (svc->port == 0)
1079 atomic_inc(&ip_vs_nullsvc_counter);
1080
1081 ip_vs_new_estimator(&svc->stats);
1082 ip_vs_num_services++;
1083
1084 /* Hash the service into the service table */
1085 write_lock_bh(&__ip_vs_svc_lock);
1086 ip_vs_svc_hash(svc);
1087 write_unlock_bh(&__ip_vs_svc_lock);
1088
1089 *svc_p = svc;
1090 return 0;
1091
1092 out_err:
1093 if (svc != NULL) {
1094 if (svc->scheduler)
1095 ip_vs_unbind_scheduler(svc);
1096 if (svc->inc) {
1097 local_bh_disable();
1098 ip_vs_app_inc_put(svc->inc);
1099 local_bh_enable();
1100 }
1101 kfree(svc);
1102 }
1103 ip_vs_scheduler_put(sched);
1104
1105 out_mod_dec:
1106 /* decrease the module use count */
1107 ip_vs_use_count_dec();
1108
1109 return ret;
1110}
1111
1112
1113/*
1114 * Edit a service and bind it with a new scheduler
1115 */
1116static int
1117ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1118{
1119 struct ip_vs_scheduler *sched, *old_sched;
1120 int ret = 0;
1121
1122 /*
1123 * Lookup the scheduler, by 'u->sched_name'
1124 */
1125 sched = ip_vs_scheduler_get(u->sched_name);
1126 if (sched == NULL) {
1127 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1128 u->sched_name);
1129 return -ENOENT;
1130 }
1131 old_sched = sched;
1132
1133 write_lock_bh(&__ip_vs_svc_lock);
1134
1135 /*
1136 * Wait until all other svc users go away.
1137 */
1138 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1139
1140 /*
1141 * Set the flags and timeout value
1142 */
1143 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1144 svc->timeout = u->timeout * HZ;
1145 svc->netmask = u->netmask;
1146
1147 old_sched = svc->scheduler;
1148 if (sched != old_sched) {
1149 /*
1150 * Unbind the old scheduler
1151 */
1152 if ((ret = ip_vs_unbind_scheduler(svc))) {
1153 old_sched = sched;
1154 goto out;
1155 }
1156
1157 /*
1158 * Bind the new scheduler
1159 */
1160 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1161 /*
1162 * If ip_vs_bind_scheduler fails, restore the old
1163 * scheduler.
1164 * The most likely reason for failure is running out of memory.
1165 *
1166 * The question is whether the old scheduler can always
1167 * be restored. TODO: if it cannot be restored at some
1168 * point, we must delete the service, otherwise the
1169 * system may crash.
1170 */
1171 ip_vs_bind_scheduler(svc, old_sched);
1172 old_sched = sched;
1173 goto out;
1174 }
1175 }
1176
1177 out:
1178 write_unlock_bh(&__ip_vs_svc_lock);
1179
1180 if (old_sched)
1181 ip_vs_scheduler_put(old_sched);
1182
1183 return ret;
1184}
1185
1186
1187/*
1188 * Delete a service from the service list
1189 * - The service must be unlinked, unlocked and not referenced!
1190 * - We are called under _bh lock
1191 */
1192static void __ip_vs_del_service(struct ip_vs_service *svc)
1193{
1194 struct ip_vs_dest *dest, *nxt;
1195 struct ip_vs_scheduler *old_sched;
1196
1197 ip_vs_num_services--;
1198 ip_vs_kill_estimator(&svc->stats);
1199
1200 /* Unbind scheduler */
1201 old_sched = svc->scheduler;
1202 ip_vs_unbind_scheduler(svc);
1203 if (old_sched)
1204 ip_vs_scheduler_put(old_sched);
1205
1206 /* Unbind app inc */
1207 if (svc->inc) {
1208 ip_vs_app_inc_put(svc->inc);
1209 svc->inc = NULL;
1210 }
1211
1212 /*
1213 * Unlink the whole destination list
1214 */
1215 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1216 __ip_vs_unlink_dest(svc, dest, 0);
1217 __ip_vs_del_dest(dest);
1218 }
1219
1220 /*
1221 * Update the virtual service counters
1222 */
1223 if (svc->port == FTPPORT)
1224 atomic_dec(&ip_vs_ftpsvc_counter);
1225 else if (svc->port == 0)
1226 atomic_dec(&ip_vs_nullsvc_counter);
1227
1228 /*
1229 * Free the service if nobody refers to it
1230 */
1231 if (atomic_read(&svc->refcnt) == 0)
1232 kfree(svc);
1233
1234 /* decrease the module use count */
1235 ip_vs_use_count_dec();
1236}
1237
1238/*
1239 * Delete a service from the service list
1240 */
1241static int ip_vs_del_service(struct ip_vs_service *svc)
1242{
1243 if (svc == NULL)
1244 return -EEXIST;
1245
1246 /*
1247 * Unhash it from the service table
1248 */
1249 write_lock_bh(&__ip_vs_svc_lock);
1250
1251 ip_vs_svc_unhash(svc);
1252
1253 /*
1254 * Wait until all the svc users go away.
1255 */
1256 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1257
1258 __ip_vs_del_service(svc);
1259
1260 write_unlock_bh(&__ip_vs_svc_lock);
1261
1262 return 0;
1263}
1264
1265
1266/*
1267 * Flush all the virtual services
1268 */
1269static int ip_vs_flush(void)
1270{
1271 int idx;
1272 struct ip_vs_service *svc, *nxt;
1273
1274 /*
1275 * Flush the service table hashed by <protocol,addr,port>
1276 */
1277 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1278 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1279 write_lock_bh(&__ip_vs_svc_lock);
1280 ip_vs_svc_unhash(svc);
1281 /*
1282 * Wait until all the svc users go away.
1283 */
1284 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1285 __ip_vs_del_service(svc);
1286 write_unlock_bh(&__ip_vs_svc_lock);
1287 }
1288 }
1289
1290 /*
1291 * Flush the service table hashed by fwmark
1292 */
1293 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1294 list_for_each_entry_safe(svc, nxt,
1295 &ip_vs_svc_fwm_table[idx], f_list) {
1296 write_lock_bh(&__ip_vs_svc_lock);
1297 ip_vs_svc_unhash(svc);
1298 /*
1299 * Wait until all the svc users go away.
1300 */
1301 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1302 __ip_vs_del_service(svc);
1303 write_unlock_bh(&__ip_vs_svc_lock);
1304 }
1305 }
1306
1307 return 0;
1308}
1309
1310
1311/*
1312 * Zero counters in a service or all services
1313 */
1314static int ip_vs_zero_service(struct ip_vs_service *svc)
1315{
1316 struct ip_vs_dest *dest;
1317
1318 write_lock_bh(&__ip_vs_svc_lock);
1319 list_for_each_entry(dest, &svc->destinations, n_list) {
1320 ip_vs_zero_stats(&dest->stats);
1321 }
1322 ip_vs_zero_stats(&svc->stats);
1323 write_unlock_bh(&__ip_vs_svc_lock);
1324 return 0;
1325}
1326
1327static int ip_vs_zero_all(void)
1328{
1329 int idx;
1330 struct ip_vs_service *svc;
1331
1332 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1333 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1334 ip_vs_zero_service(svc);
1335 }
1336 }
1337
1338 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1339 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1340 ip_vs_zero_service(svc);
1341 }
1342 }
1343
1344 ip_vs_zero_stats(&ip_vs_stats);
1345 return 0;
1346}
1347
1348
1349static int
1350proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1351 void __user *buffer, size_t *lenp, loff_t *ppos)
1352{
1353 int *valp = table->data;
1354 int val = *valp;
1355 int rc;
1356
1357 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1358 if (write && (*valp != val)) {
1359 if ((*valp < 0) || (*valp > 3)) {
1360 /* Restore the correct value */
1361 *valp = val;
1362 } else {
1363 local_bh_disable();
1364 update_defense_level();
1365 local_bh_enable();
1366 }
1367 }
1368 return rc;
1369}
1370
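/*
 * Editor's note: illustrative user-space sketch, not part of the
 * original patch.  proc_do_defense_mode() above backs the drop_entry,
 * drop_packet and secure_tcp sysctls declared further down; it only
 * accepts values 0..3 and silently restores the old value otherwise.
 * A write such as the one below ends up in this handler:
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/vs/drop_entry", "w");

	if (!f) {
		perror("drop_entry");
		return 1;
	}
	fprintf(f, "1\n");	/* any value outside 0..3 would be rejected */
	fclose(f);
	return 0;
}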
1371
1372static int
1373proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1374 void __user *buffer, size_t *lenp, loff_t *ppos)
1375{
1376 int *valp = table->data;
1377 int val[2];
1378 int rc;
1379
1380 /* backup the value first */
1381 memcpy(val, valp, sizeof(val));
1382
1383 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1384 if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1385 /* Restore the correct value */
1386 memcpy(valp, val, sizeof(val));
1387 }
1388 return rc;
1389}
1390
1391
1392/*
1393 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1394 */
1395
1396static struct ctl_table vs_vars[] = {
1397 {
1398 .ctl_name = NET_IPV4_VS_AMEMTHRESH,
1399 .procname = "amemthresh",
1400 .data = &sysctl_ip_vs_amemthresh,
1401 .maxlen = sizeof(int),
1402 .mode = 0644,
1403 .proc_handler = &proc_dointvec,
1404 },
1405#ifdef CONFIG_IP_VS_DEBUG
1406 {
1407 .ctl_name = NET_IPV4_VS_DEBUG_LEVEL,
1408 .procname = "debug_level",
1409 .data = &sysctl_ip_vs_debug_level,
1410 .maxlen = sizeof(int),
1411 .mode = 0644,
1412 .proc_handler = &proc_dointvec,
1413 },
1414#endif
1415 {
1416 .ctl_name = NET_IPV4_VS_AMDROPRATE,
1417 .procname = "am_droprate",
1418 .data = &sysctl_ip_vs_am_droprate,
1419 .maxlen = sizeof(int),
1420 .mode = 0644,
1421 .proc_handler = &proc_dointvec,
1422 },
1423 {
1424 .ctl_name = NET_IPV4_VS_DROP_ENTRY,
1425 .procname = "drop_entry",
1426 .data = &sysctl_ip_vs_drop_entry,
1427 .maxlen = sizeof(int),
1428 .mode = 0644,
1429 .proc_handler = &proc_do_defense_mode,
1430 },
1431 {
1432 .ctl_name = NET_IPV4_VS_DROP_PACKET,
1433 .procname = "drop_packet",
1434 .data = &sysctl_ip_vs_drop_packet,
1435 .maxlen = sizeof(int),
1436 .mode = 0644,
1437 .proc_handler = &proc_do_defense_mode,
1438 },
1439 {
1440 .ctl_name = NET_IPV4_VS_SECURE_TCP,
1441 .procname = "secure_tcp",
1442 .data = &sysctl_ip_vs_secure_tcp,
1443 .maxlen = sizeof(int),
1444 .mode = 0644,
1445 .proc_handler = &proc_do_defense_mode,
1446 },
1447#if 0
1448 {
1449 .ctl_name = NET_IPV4_VS_TO_ES,
1450 .procname = "timeout_established",
1451 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1452 .maxlen = sizeof(int),
1453 .mode = 0644,
1454 .proc_handler = &proc_dointvec_jiffies,
1455 },
1456 {
1457 .ctl_name = NET_IPV4_VS_TO_SS,
1458 .procname = "timeout_synsent",
1459 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1460 .maxlen = sizeof(int),
1461 .mode = 0644,
1462 .proc_handler = &proc_dointvec_jiffies,
1463 },
1464 {
1465 .ctl_name = NET_IPV4_VS_TO_SR,
1466 .procname = "timeout_synrecv",
1467 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1468 .maxlen = sizeof(int),
1469 .mode = 0644,
1470 .proc_handler = &proc_dointvec_jiffies,
1471 },
1472 {
1473 .ctl_name = NET_IPV4_VS_TO_FW,
1474 .procname = "timeout_finwait",
1475 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1476 .maxlen = sizeof(int),
1477 .mode = 0644,
1478 .proc_handler = &proc_dointvec_jiffies,
1479 },
1480 {
1481 .ctl_name = NET_IPV4_VS_TO_TW,
1482 .procname = "timeout_timewait",
1483 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1484 .maxlen = sizeof(int),
1485 .mode = 0644,
1486 .proc_handler = &proc_dointvec_jiffies,
1487 },
1488 {
1489 .ctl_name = NET_IPV4_VS_TO_CL,
1490 .procname = "timeout_close",
1491 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1492 .maxlen = sizeof(int),
1493 .mode = 0644,
1494 .proc_handler = &proc_dointvec_jiffies,
1495 },
1496 {
1497 .ctl_name = NET_IPV4_VS_TO_CW,
1498 .procname = "timeout_closewait",
1499 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1500 .maxlen = sizeof(int),
1501 .mode = 0644,
1502 .proc_handler = &proc_dointvec_jiffies,
1503 },
1504 {
1505 .ctl_name = NET_IPV4_VS_TO_LA,
1506 .procname = "timeout_lastack",
1507 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1508 .maxlen = sizeof(int),
1509 .mode = 0644,
1510 .proc_handler = &proc_dointvec_jiffies,
1511 },
1512 {
1513 .ctl_name = NET_IPV4_VS_TO_LI,
1514 .procname = "timeout_listen",
1515 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1516 .maxlen = sizeof(int),
1517 .mode = 0644,
1518 .proc_handler = &proc_dointvec_jiffies,
1519 },
1520 {
1521 .ctl_name = NET_IPV4_VS_TO_SA,
1522 .procname = "timeout_synack",
1523 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1524 .maxlen = sizeof(int),
1525 .mode = 0644,
1526 .proc_handler = &proc_dointvec_jiffies,
1527 },
1528 {
1529 .ctl_name = NET_IPV4_VS_TO_UDP,
1530 .procname = "timeout_udp",
1531 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1532 .maxlen = sizeof(int),
1533 .mode = 0644,
1534 .proc_handler = &proc_dointvec_jiffies,
1535 },
1536 {
1537 .ctl_name = NET_IPV4_VS_TO_ICMP,
1538 .procname = "timeout_icmp",
1539 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1540 .maxlen = sizeof(int),
1541 .mode = 0644,
1542 .proc_handler = &proc_dointvec_jiffies,
1543 },
1544#endif
1545 {
1546 .ctl_name = NET_IPV4_VS_CACHE_BYPASS,
1547 .procname = "cache_bypass",
1548 .data = &sysctl_ip_vs_cache_bypass,
1549 .maxlen = sizeof(int),
1550 .mode = 0644,
1551 .proc_handler = &proc_dointvec,
1552 },
1553 {
1554 .ctl_name = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1555 .procname = "expire_nodest_conn",
1556 .data = &sysctl_ip_vs_expire_nodest_conn,
1557 .maxlen = sizeof(int),
1558 .mode = 0644,
1559 .proc_handler = &proc_dointvec,
1560 },
1561 {
1562 .ctl_name = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1563 .procname = "expire_quiescent_template",
1564 .data = &sysctl_ip_vs_expire_quiescent_template,
1565 .maxlen = sizeof(int),
1566 .mode = 0644,
1567 .proc_handler = &proc_dointvec,
1568 },
1569 {
1570 .ctl_name = NET_IPV4_VS_SYNC_THRESHOLD,
1571 .procname = "sync_threshold",
1572 .data = &sysctl_ip_vs_sync_threshold,
1573 .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
1574 .mode = 0644,
1575 .proc_handler = &proc_do_sync_threshold,
1576 },
1577 {
1578 .ctl_name = NET_IPV4_VS_NAT_ICMP_SEND,
1579 .procname = "nat_icmp_send",
1580 .data = &sysctl_ip_vs_nat_icmp_send,
1581 .maxlen = sizeof(int),
1582 .mode = 0644,
1583 .proc_handler = &proc_dointvec,
1584 },
1585 { .ctl_name = 0 }
1586};
1587
1588static ctl_table vs_table[] = {
1589 {
1590 .ctl_name = NET_IPV4_VS,
1591 .procname = "vs",
1592 .mode = 0555,
1593 .child = vs_vars
1594 },
1595 { .ctl_name = 0 }
1596};
1597
1598static ctl_table ipv4_table[] = {
1599 {
1600 .ctl_name = NET_IPV4,
1601 .procname = "ipv4",
1602 .mode = 0555,
1603 .child = vs_table,
1604 },
1605 { .ctl_name = 0 }
1606};
1607
1608static ctl_table vs_root_table[] = {
1609 {
1610 .ctl_name = CTL_NET,
1611 .procname = "net",
1612 .mode = 0555,
1613 .child = ipv4_table,
1614 },
1615 { .ctl_name = 0 }
1616};
1617
1618static struct ctl_table_header * sysctl_header;
1619
1620#ifdef CONFIG_PROC_FS
1621
1622struct ip_vs_iter {
1623 struct list_head *table;
1624 int bucket;
1625};
1626
1627/*
1628 * Write the contents of the VS rule table to a PROCfs file.
1629 * (It is kept just for backward compatibility)
1630 */
1631static inline const char *ip_vs_fwd_name(unsigned flags)
1632{
1633 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1634 case IP_VS_CONN_F_LOCALNODE:
1635 return "Local";
1636 case IP_VS_CONN_F_TUNNEL:
1637 return "Tunnel";
1638 case IP_VS_CONN_F_DROUTE:
1639 return "Route";
1640 default:
1641 return "Masq";
1642 }
1643}
1644
1645
1646/* Get the Nth entry in the two lists */
1647static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1648{
1649 struct ip_vs_iter *iter = seq->private;
1650 int idx;
1651 struct ip_vs_service *svc;
1652
1653 /* look in hash by protocol */
1654 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1655 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1656 if (pos-- == 0){
1657 iter->table = ip_vs_svc_table;
1658 iter->bucket = idx;
1659 return svc;
1660 }
1661 }
1662 }
1663
1664 /* keep looking in fwmark */
1665 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1666 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1667 if (pos-- == 0) {
1668 iter->table = ip_vs_svc_fwm_table;
1669 iter->bucket = idx;
1670 return svc;
1671 }
1672 }
1673 }
1674
1675 return NULL;
1676}
1677
1678static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1679{
1680
1681 read_lock_bh(&__ip_vs_svc_lock);
1682 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1683}
1684
1685
1686static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1687{
1688 struct list_head *e;
1689 struct ip_vs_iter *iter;
1690 struct ip_vs_service *svc;
1691
1692 ++*pos;
1693 if (v == SEQ_START_TOKEN)
1694 return ip_vs_info_array(seq,0);
1695
1696 svc = v;
1697 iter = seq->private;
1698
1699 if (iter->table == ip_vs_svc_table) {
1700 /* next service in table hashed by protocol */
1701 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1702 return list_entry(e, struct ip_vs_service, s_list);
1703
1704
1705 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1706 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1707 s_list) {
1708 return svc;
1709 }
1710 }
1711
1712 iter->table = ip_vs_svc_fwm_table;
1713 iter->bucket = -1;
1714 goto scan_fwmark;
1715 }
1716
1717 /* next service in hashed by fwmark */
1718 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1719 return list_entry(e, struct ip_vs_service, f_list);
1720
1721 scan_fwmark:
1722 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1723 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1724 f_list)
1725 return svc;
1726 }
1727
1728 return NULL;
1729}
1730
1731static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1732{
1733 read_unlock_bh(&__ip_vs_svc_lock);
1734}
1735
1736
1737static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1738{
1739 if (v == SEQ_START_TOKEN) {
1740 seq_printf(seq,
1741 "IP Virtual Server version %d.%d.%d (size=%d)\n",
1742 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1743 seq_puts(seq,
1744 "Prot LocalAddress:Port Scheduler Flags\n");
1745 seq_puts(seq,
1746 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1747 } else {
1748 const struct ip_vs_service *svc = v;
1749 const struct ip_vs_iter *iter = seq->private;
1750 const struct ip_vs_dest *dest;
1751
1752 if (iter->table == ip_vs_svc_table)
1753 seq_printf(seq, "%s %08X:%04X %s ",
1754 ip_vs_proto_name(svc->protocol),
1755 ntohl(svc->addr),
1756 ntohs(svc->port),
1757 svc->scheduler->name);
1758 else
1759 seq_printf(seq, "FWM %08X %s ",
1760 svc->fwmark, svc->scheduler->name);
1761
1762 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1763 seq_printf(seq, "persistent %d %08X\n",
1764 svc->timeout,
1765 ntohl(svc->netmask));
1766 else
1767 seq_putc(seq, '\n');
1768
1769 list_for_each_entry(dest, &svc->destinations, n_list) {
1770 seq_printf(seq,
1771 " -> %08X:%04X %-7s %-6d %-10d %-10d\n",
1772 ntohl(dest->addr), ntohs(dest->port),
1773 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1774 atomic_read(&dest->weight),
1775 atomic_read(&dest->activeconns),
1776 atomic_read(&dest->inactconns));
1777 }
1778 }
1779 return 0;
1780}
1781
1782static struct seq_operations ip_vs_info_seq_ops = {
1783 .start = ip_vs_info_seq_start,
1784 .next = ip_vs_info_seq_next,
1785 .stop = ip_vs_info_seq_stop,
1786 .show = ip_vs_info_seq_show,
1787};
1788
1789static int ip_vs_info_open(struct inode *inode, struct file *file)
1790{
1791 struct seq_file *seq;
1792 int rc = -ENOMEM;
1793 struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1794
1795 if (!s)
1796 goto out;
1797
1798 rc = seq_open(file, &ip_vs_info_seq_ops);
1799 if (rc)
1800 goto out_kfree;
1801
1802 seq = file->private_data;
1803 seq->private = s;
1804 memset(s, 0, sizeof(*s));
1805out:
1806 return rc;
1807out_kfree:
1808 kfree(s);
1809 goto out;
1810}
1811
1812static struct file_operations ip_vs_info_fops = {
1813 .owner = THIS_MODULE,
1814 .open = ip_vs_info_open,
1815 .read = seq_read,
1816 .llseek = seq_lseek,
1817 .release = seq_release_private,
1818};
1819
1820#endif
1821
1822struct ip_vs_stats ip_vs_stats;
1823
1824#ifdef CONFIG_PROC_FS
1825static int ip_vs_stats_show(struct seq_file *seq, void *v)
1826{
1827
1828/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1829 seq_puts(seq,
1830 " Total Incoming Outgoing Incoming Outgoing\n");
1831 seq_printf(seq,
1832 " Conns Packets Packets Bytes Bytes\n");
1833
1834 spin_lock_bh(&ip_vs_stats.lock);
1835 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1836 ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1837 (unsigned long long) ip_vs_stats.inbytes,
1838 (unsigned long long) ip_vs_stats.outbytes);
1839
1840/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1841 seq_puts(seq,
1842 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1843 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1844 ip_vs_stats.cps,
1845 ip_vs_stats.inpps,
1846 ip_vs_stats.outpps,
1847 ip_vs_stats.inbps,
1848 ip_vs_stats.outbps);
1849 spin_unlock_bh(&ip_vs_stats.lock);
1850
1851 return 0;
1852}
1853
1854static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1855{
1856 return single_open(file, ip_vs_stats_show, NULL);
1857}
1858
1859static struct file_operations ip_vs_stats_fops = {
1860 .owner = THIS_MODULE,
1861 .open = ip_vs_stats_seq_open,
1862 .read = seq_read,
1863 .llseek = seq_lseek,
1864 .release = single_release,
1865};
1866
1867#endif
1868
1869/*
1870 * Set timeout values for tcp tcpfin udp in the timeout_table.
1871 */
1872static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1873{
1874 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1875 u->tcp_timeout,
1876 u->tcp_fin_timeout,
1877 u->udp_timeout);
1878
1879#ifdef CONFIG_IP_VS_PROTO_TCP
1880 if (u->tcp_timeout) {
1881 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1882 = u->tcp_timeout * HZ;
1883 }
1884
1885 if (u->tcp_fin_timeout) {
1886 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1887 = u->tcp_fin_timeout * HZ;
1888 }
1889#endif
1890
1891#ifdef CONFIG_IP_VS_PROTO_UDP
1892 if (u->udp_timeout) {
1893 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1894 = u->udp_timeout * HZ;
1895 }
1896#endif
1897 return 0;
1898}
1899
1900
1901#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
1902#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
1903#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
1904 sizeof(struct ip_vs_dest_user))
1905#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
1906#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
1907#define MAX_ARG_LEN SVCDEST_ARG_LEN
1908
1909static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1910 [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
1911 [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
1912 [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
1913 [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
1914 [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
1915 [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
1916 [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
1917 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
1918 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
1919 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
1920 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
1921};
1922
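/*
 * Editor's note: illustrative user-space sketch, not part of the
 * original patch.  It shows how these commands reach do_ip_vs_set_ctl():
 * a CAP_NET_ADMIN process issues setsockopt() on a raw IPv4 socket with
 * exactly the argument length listed in set_arglen[] (this is assumed to
 * be how the ipvsadm tool drives the interface; the IP_VS_SO_SET_*
 * constants are taken from the user-visible IPVS header, assumed here to
 * be available as <linux/ip_vs.h> or a local copy of it).
 */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/ip_vs.h>	/* assumed location of the user-visible ABI */

int main(void)
{
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* IP_VS_SO_SET_FLUSH takes a zero-length argument (see set_arglen[]) */
	if (setsockopt(fd, IPPROTO_IP, IP_VS_SO_SET_FLUSH, NULL, 0) < 0)
		perror("IP_VS_SO_SET_FLUSH");
	return 0;
}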
1923static int
1924do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1925{
1926 int ret;
1927 unsigned char arg[MAX_ARG_LEN];
1928 struct ip_vs_service_user *usvc;
1929 struct ip_vs_service *svc;
1930 struct ip_vs_dest_user *udest;
1931
1932 if (!capable(CAP_NET_ADMIN))
1933 return -EPERM;
1934
1935 if (len != set_arglen[SET_CMDID(cmd)]) {
1936 IP_VS_ERR("set_ctl: len %u != %u\n",
1937 len, set_arglen[SET_CMDID(cmd)]);
1938 return -EINVAL;
1939 }
1940
1941 if (copy_from_user(arg, user, len) != 0)
1942 return -EFAULT;
1943
1944 /* increase the module use count */
1945 ip_vs_use_count_inc();
1946
1947 if (down_interruptible(&__ip_vs_mutex)) {
1948 ret = -ERESTARTSYS;
1949 goto out_dec;
1950 }
1951
1952 if (cmd == IP_VS_SO_SET_FLUSH) {
1953 /* Flush the virtual service */
1954 ret = ip_vs_flush();
1955 goto out_unlock;
1956 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1957 /* Set timeout values for (tcp tcpfin udp) */
1958 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1959 goto out_unlock;
1960 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1961 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1962 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1963 goto out_unlock;
1964 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1965 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1966 ret = stop_sync_thread(dm->state);
1967 goto out_unlock;
1968 }
1969
1970 usvc = (struct ip_vs_service_user *)arg;
1971 udest = (struct ip_vs_dest_user *)(usvc + 1);
1972
1973 if (cmd == IP_VS_SO_SET_ZERO) {
1974 /* if no service address is set, zero counters in all */
1975 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1976 ret = ip_vs_zero_all();
1977 goto out_unlock;
1978 }
1979 }
1980
1981 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1982 if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1983 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1984 usvc->protocol, NIPQUAD(usvc->addr),
1985 ntohs(usvc->port), usvc->sched_name);
1986 ret = -EFAULT;
1987 goto out_unlock;
1988 }
1989
1990 /* Lookup the exact service by <protocol, addr, port> or fwmark */
1991 if (usvc->fwmark == 0)
1992 svc = __ip_vs_service_get(usvc->protocol,
1993 usvc->addr, usvc->port);
1994 else
1995 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
1996
1997 if (cmd != IP_VS_SO_SET_ADD
1998 && (svc == NULL || svc->protocol != usvc->protocol)) {
1999 ret = -ESRCH;
2000 goto out_unlock;
2001 }
2002
2003 switch (cmd) {
2004 case IP_VS_SO_SET_ADD:
2005 if (svc != NULL)
2006 ret = -EEXIST;
2007 else
2008 ret = ip_vs_add_service(usvc, &svc);
2009 break;
2010 case IP_VS_SO_SET_EDIT:
2011 ret = ip_vs_edit_service(svc, usvc);
2012 break;
2013 case IP_VS_SO_SET_DEL:
2014 ret = ip_vs_del_service(svc);
2015 if (!ret)
2016 goto out_unlock;
2017 break;
2018 case IP_VS_SO_SET_ZERO:
2019 ret = ip_vs_zero_service(svc);
2020 break;
2021 case IP_VS_SO_SET_ADDDEST:
2022 ret = ip_vs_add_dest(svc, udest);
2023 break;
2024 case IP_VS_SO_SET_EDITDEST:
2025 ret = ip_vs_edit_dest(svc, udest);
2026 break;
2027 case IP_VS_SO_SET_DELDEST:
2028 ret = ip_vs_del_dest(svc, udest);
2029 break;
2030 default:
2031 ret = -EINVAL;
2032 }
2033
2034 if (svc)
2035 ip_vs_service_put(svc);
2036
2037 out_unlock:
2038 up(&__ip_vs_mutex);
2039 out_dec:
2040 /* decrease the module use count */
2041 ip_vs_use_count_dec();
2042
2043 return ret;
2044}
2045
2046
2047static void
2048ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2049{
2050 spin_lock_bh(&src->lock);
2051 memcpy(dst, src, (char*)&src->lock - (char*)src);
2052 spin_unlock_bh(&src->lock);
2053}
2054
2055static void
2056ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2057{
2058 dst->protocol = src->protocol;
2059 dst->addr = src->addr;
2060 dst->port = src->port;
2061 dst->fwmark = src->fwmark;
2062 strcpy(dst->sched_name, src->scheduler->name);
2063 dst->flags = src->flags;
2064 dst->timeout = src->timeout / HZ;
2065 dst->netmask = src->netmask;
2066 dst->num_dests = src->num_dests;
2067 ip_vs_copy_stats(&dst->stats, &src->stats);
2068}
2069
2070static inline int
2071__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2072 struct ip_vs_get_services __user *uptr)
2073{
2074 int idx, count=0;
2075 struct ip_vs_service *svc;
2076 struct ip_vs_service_entry entry;
2077 int ret = 0;
2078
2079 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2080 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2081 if (count >= get->num_services)
2082 goto out;
2083 ip_vs_copy_service(&entry, svc);
2084 if (copy_to_user(&uptr->entrytable[count],
2085 &entry, sizeof(entry))) {
2086 ret = -EFAULT;
2087 goto out;
2088 }
2089 count++;
2090 }
2091 }
2092
2093 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2094 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2095 if (count >= get->num_services)
2096 goto out;
2097 ip_vs_copy_service(&entry, svc);
2098 if (copy_to_user(&uptr->entrytable[count],
2099 &entry, sizeof(entry))) {
2100 ret = -EFAULT;
2101 goto out;
2102 }
2103 count++;
2104 }
2105 }
2106 out:
2107 return ret;
2108}
2109
2110static inline int
2111__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2112 struct ip_vs_get_dests __user *uptr)
2113{
2114 struct ip_vs_service *svc;
2115 int ret = 0;
2116
2117 if (get->fwmark)
2118 svc = __ip_vs_svc_fwm_get(get->fwmark);
2119 else
2120 svc = __ip_vs_service_get(get->protocol,
2121 get->addr, get->port);
2122 if (svc) {
2123 int count = 0;
2124 struct ip_vs_dest *dest;
2125 struct ip_vs_dest_entry entry;
2126
2127 list_for_each_entry(dest, &svc->destinations, n_list) {
2128 if (count >= get->num_dests)
2129 break;
2130
2131 entry.addr = dest->addr;
2132 entry.port = dest->port;
2133 entry.conn_flags = atomic_read(&dest->conn_flags);
2134 entry.weight = atomic_read(&dest->weight);
2135 entry.u_threshold = dest->u_threshold;
2136 entry.l_threshold = dest->l_threshold;
2137 entry.activeconns = atomic_read(&dest->activeconns);
2138 entry.inactconns = atomic_read(&dest->inactconns);
2139 entry.persistconns = atomic_read(&dest->persistconns);
2140 ip_vs_copy_stats(&entry.stats, &dest->stats);
2141 if (copy_to_user(&uptr->entrytable[count],
2142 &entry, sizeof(entry))) {
2143 ret = -EFAULT;
2144 break;
2145 }
2146 count++;
2147 }
2148 ip_vs_service_put(svc);
2149 } else
2150 ret = -ESRCH;
2151 return ret;
2152}
2153
2154static inline void
2155__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2156{
2157#ifdef CONFIG_IP_VS_PROTO_TCP
2158 u->tcp_timeout =
2159 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2160 u->tcp_fin_timeout =
2161 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2162#endif
2163#ifdef CONFIG_IP_VS_PROTO_UDP
2164 u->udp_timeout =
2165 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2166#endif
2167}
2168
2169
2170#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
2171#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
2172#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
2173#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
2174#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
2175#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
2176#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
2177
2178static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2179 [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2180 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2181 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2182 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2183 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2184 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2185 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
2186};
2187
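/*
 * Editor's note: illustrative user-space sketch, not part of the
 * original patch.  do_ip_vs_get_ctl() below answers getsockopt()
 * queries on a raw IPv4 socket; IP_VS_SO_GET_VERSION fills a string
 * of at most 64 bytes (see get_arglen[]).  Constants are assumed to
 * come from the user-visible IPVS header, as in the previous sketch.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/ip_vs.h>	/* assumed location of the user-visible ABI */

int main(void)
{
	char buf[64];
	socklen_t len = sizeof(buf);
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (getsockopt(fd, IPPROTO_IP, IP_VS_SO_GET_VERSION, buf, &len) == 0)
		printf("%s\n", buf);	/* version/size string built in the kernel */
	else
		perror("IP_VS_SO_GET_VERSION");
	return 0;
}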
2188static int
2189do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2190{
2191 unsigned char arg[128];
2192 int ret = 0;
2193
2194 if (!capable(CAP_NET_ADMIN))
2195 return -EPERM;
2196
2197 if (*len < get_arglen[GET_CMDID(cmd)]) {
2198 IP_VS_ERR("get_ctl: len %u < %u\n",
2199 *len, get_arglen[GET_CMDID(cmd)]);
2200 return -EINVAL;
2201 }
2202
2203 if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2204 return -EFAULT;
2205
2206 if (down_interruptible(&__ip_vs_mutex))
2207 return -ERESTARTSYS;
2208
2209 switch (cmd) {
2210 case IP_VS_SO_GET_VERSION:
2211 {
2212 char buf[64];
2213
2214 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2215 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2216 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2217 ret = -EFAULT;
2218 goto out;
2219 }
2220 *len = strlen(buf)+1;
2221 }
2222 break;
2223
2224 case IP_VS_SO_GET_INFO:
2225 {
2226 struct ip_vs_getinfo info;
2227 info.version = IP_VS_VERSION_CODE;
2228 info.size = IP_VS_CONN_TAB_SIZE;
2229 info.num_services = ip_vs_num_services;
2230 if (copy_to_user(user, &info, sizeof(info)) != 0)
2231 ret = -EFAULT;
2232 }
2233 break;
2234
2235 case IP_VS_SO_GET_SERVICES:
2236 {
2237 struct ip_vs_get_services *get;
2238 int size;
2239
2240 get = (struct ip_vs_get_services *)arg;
2241 size = sizeof(*get) +
2242 sizeof(struct ip_vs_service_entry) * get->num_services;
2243 if (*len != size) {
2244 IP_VS_ERR("length: %u != %u\n", *len, size);
2245 ret = -EINVAL;
2246 goto out;
2247 }
2248 ret = __ip_vs_get_service_entries(get, user);
2249 }
2250 break;
2251
2252 case IP_VS_SO_GET_SERVICE:
2253 {
2254 struct ip_vs_service_entry *entry;
2255 struct ip_vs_service *svc;
2256
2257 entry = (struct ip_vs_service_entry *)arg;
2258 if (entry->fwmark)
2259 svc = __ip_vs_svc_fwm_get(entry->fwmark);
2260 else
2261 svc = __ip_vs_service_get(entry->protocol,
2262 entry->addr, entry->port);
2263 if (svc) {
2264 ip_vs_copy_service(entry, svc);
2265 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2266 ret = -EFAULT;
2267 ip_vs_service_put(svc);
2268 } else
2269 ret = -ESRCH;
2270 }
2271 break;
2272
2273 case IP_VS_SO_GET_DESTS:
2274 {
2275 struct ip_vs_get_dests *get;
2276 int size;
2277
2278 get = (struct ip_vs_get_dests *)arg;
2279 size = sizeof(*get) +
2280 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2281 if (*len != size) {
2282 IP_VS_ERR("length: %u != %u\n", *len, size);
2283 ret = -EINVAL;
2284 goto out;
2285 }
2286 ret = __ip_vs_get_dest_entries(get, user);
2287 }
2288 break;
2289
2290 case IP_VS_SO_GET_TIMEOUT:
2291 {
2292 struct ip_vs_timeout_user t;
2293
2294 __ip_vs_get_timeouts(&t);
2295 if (copy_to_user(user, &t, sizeof(t)) != 0)
2296 ret = -EFAULT;
2297 }
2298 break;
2299
2300 case IP_VS_SO_GET_DAEMON:
2301 {
2302 struct ip_vs_daemon_user d[2];
2303
2304 memset(&d, 0, sizeof(d));
2305 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2306 d[0].state = IP_VS_STATE_MASTER;
2307 strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn);
2308 d[0].syncid = ip_vs_master_syncid;
2309 }
2310 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2311 d[1].state = IP_VS_STATE_BACKUP;
2312 strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn);
2313 d[1].syncid = ip_vs_backup_syncid;
2314 }
2315 if (copy_to_user(user, &d, sizeof(d)) != 0)
2316 ret = -EFAULT;
2317 }
2318 break;
2319
2320 default:
2321 ret = -EINVAL;
2322 }
2323
2324 out:
2325 up(&__ip_vs_mutex);
2326 return ret;
2327}
2328
2329
2330static struct nf_sockopt_ops ip_vs_sockopts = {
2331 .pf = PF_INET,
2332 .set_optmin = IP_VS_BASE_CTL,
2333 .set_optmax = IP_VS_SO_SET_MAX+1,
2334 .set = do_ip_vs_set_ctl,
2335 .get_optmin = IP_VS_BASE_CTL,
2336 .get_optmax = IP_VS_SO_GET_MAX+1,
2337 .get = do_ip_vs_get_ctl,
2338};
2339
2340
2341int ip_vs_control_init(void)
2342{
2343 int ret;
2344 int idx;
2345
2346 EnterFunction(2);
2347
2348 ret = nf_register_sockopt(&ip_vs_sockopts);
2349 if (ret) {
2350 IP_VS_ERR("cannot register sockopt.\n");
2351 return ret;
2352 }
2353
2354 proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2355 proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2356
2357 sysctl_header = register_sysctl_table(vs_root_table, 0);
2358
2359 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2360 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2361 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2362 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2363 }
2364 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2365 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2366 }
2367
2368 memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2369 spin_lock_init(&ip_vs_stats.lock);
2370 ip_vs_new_estimator(&ip_vs_stats);
2371
2372 /* Hook the defense timer */
2373 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2374
2375 LeaveFunction(2);
2376 return 0;
2377}
2378
2379
2380void ip_vs_control_cleanup(void)
2381{
2382 EnterFunction(2);
2383 ip_vs_trash_cleanup();
2384 cancel_rearming_delayed_work(&defense_work);
2385 ip_vs_kill_estimator(&ip_vs_stats);
2386 unregister_sysctl_table(sysctl_header);
2387 proc_net_remove("ip_vs_stats");
2388 proc_net_remove("ip_vs");
2389 nf_unregister_sockopt(&ip_vs_sockopts);
2390 LeaveFunction(2);
2391}
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
new file mode 100644
index 000000000000..f3bc320dce93
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_dh.c
@@ -0,0 +1,258 @@
1/*
2 * IPVS: Destination Hashing scheduling module
3 *
4 * Version: $Id: ip_vs_dh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@gnuchina.org>
7 *
8 * Inspired by the consistent hashing scheduler patch from
9 * Thomas Proell <proellt@gmx.de>
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version.
15 *
16 * Changes:
17 *
18 */
19
20/*
21 * The dh algorithm is to select server by the hash key of destination IP
22 * address. The pseudo code is as follows:
23 *
24 * n <- servernode[dest_ip];
25 * if (n is dead) OR
26 * (n is overloaded) OR (n.weight <= 0) then
27 * return NULL;
28 *
29 * return n;
30 *
31 * Note that servernode is a 256-bucket hash table that maps the hash
32 * index derived from the packet destination IP address to the current
33 * server array. If the dh scheduler is used in a cache cluster, it is
34 * good to combine it with the cache_bypass feature. When the statically
35 * assigned server is dead or overloaded, the load balancer can bypass
36 * the cache server and send requests to the original server directly.
37 *
38 */
39
40#include <linux/module.h>
41#include <linux/kernel.h>
42
43#include <net/ip_vs.h>
44
45
46/*
47 * IPVS DH bucket
48 */
49struct ip_vs_dh_bucket {
50 struct ip_vs_dest *dest; /* real server (cache) */
51};
52
53/*
54 * for IPVS DH entry hash table
55 */
56#ifndef CONFIG_IP_VS_DH_TAB_BITS
57#define CONFIG_IP_VS_DH_TAB_BITS 8
58#endif
59#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS
60#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS)
61#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1)
62
63
64/*
65 * Returns hash value for IPVS DH entry
66 */
67static inline unsigned ip_vs_dh_hashkey(__u32 addr)
68{
69 return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK;
70}
71
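/*
 * Editor's note: illustrative user-space sketch, not part of the
 * original patch.  ip_vs_dh_hashkey() above is a multiplicative hash
 * (2654435761 is 2^32 divided by the golden ratio), so nearby
 * destination addresses still spread across the 256 buckets of the
 * default table.  The same expression, on host-order addresses for
 * simplicity:
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t addr = 0xc0a80001;	/* 192.168.0.1 in host byte order */

	for (int i = 0; i < 4; i++, addr++)
		printf("0x%08x -> bucket %u\n", addr,
		       (unsigned int)((addr * 2654435761UL) & 0xFF));
	return 0;
}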
72
73/*
74 * Get ip_vs_dest associated with supplied parameters.
75 */
76static inline struct ip_vs_dest *
77ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __u32 addr)
78{
79 return (tbl[ip_vs_dh_hashkey(addr)]).dest;
80}
81
82
83/*
84 * Assign all the hash buckets of the specified table with the service.
85 */
86static int
87ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
88{
89 int i;
90 struct ip_vs_dh_bucket *b;
91 struct list_head *p;
92 struct ip_vs_dest *dest;
93
94 b = tbl;
95 p = &svc->destinations;
96 for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
97 if (list_empty(p)) {
98 b->dest = NULL;
99 } else {
100 if (p == &svc->destinations)
101 p = p->next;
102
103 dest = list_entry(p, struct ip_vs_dest, n_list);
104 atomic_inc(&dest->refcnt);
105 b->dest = dest;
106
107 p = p->next;
108 }
109 b++;
110 }
111 return 0;
112}
113
114
115/*
116 * Flush all the hash buckets of the specified table.
117 */
118static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
119{
120 int i;
121 struct ip_vs_dh_bucket *b;
122
123 b = tbl;
124 for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
125 if (b->dest) {
126 atomic_dec(&b->dest->refcnt);
127 b->dest = NULL;
128 }
129 b++;
130 }
131}
132
133
134static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
135{
136 struct ip_vs_dh_bucket *tbl;
137
138 /* allocate the DH table for this service */
139 tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
140 GFP_ATOMIC);
141 if (tbl == NULL) {
142 IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n");
143 return -ENOMEM;
144 }
145 svc->sched_data = tbl;
146 IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
147 "current service\n",
148 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
149
150 /* assign the hash buckets with the updated service */
151 ip_vs_dh_assign(tbl, svc);
152
153 return 0;
154}
155
156
157static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
158{
159 struct ip_vs_dh_bucket *tbl = svc->sched_data;
160
161 /* got to clean up hash buckets here */
162 ip_vs_dh_flush(tbl);
163
164 /* release the table itself */
165 kfree(svc->sched_data);
166 IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
167 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
168
169 return 0;
170}
171
172
173static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
174{
175 struct ip_vs_dh_bucket *tbl = svc->sched_data;
176
177 /* got to clean up hash buckets here */
178 ip_vs_dh_flush(tbl);
179
180 /* assign the hash buckets with the updated service */
181 ip_vs_dh_assign(tbl, svc);
182
183 return 0;
184}
185
186
187/*
188 * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
189 * consider that the server is overloaded here.
190 */
191static inline int is_overloaded(struct ip_vs_dest *dest)
192{
193 return dest->flags & IP_VS_DEST_F_OVERLOAD;
194}
195
196
197/*
198 * Destination hashing scheduling
199 */
200static struct ip_vs_dest *
201ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
202{
203 struct ip_vs_dest *dest;
204 struct ip_vs_dh_bucket *tbl;
205 struct iphdr *iph = skb->nh.iph;
206
207 IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
208
209 tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
210 dest = ip_vs_dh_get(tbl, iph->daddr);
211 if (!dest
212 || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
213 || atomic_read(&dest->weight) <= 0
214 || is_overloaded(dest)) {
215 return NULL;
216 }
217
218 IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
219 "--> server %u.%u.%u.%u:%d\n",
220 NIPQUAD(iph->daddr),
221 NIPQUAD(dest->addr),
222 ntohs(dest->port));
223
224 return dest;
225}
226
227
228/*
229 * IPVS DH Scheduler structure
230 */
231static struct ip_vs_scheduler ip_vs_dh_scheduler =
232{
233 .name = "dh",
234 .refcnt = ATOMIC_INIT(0),
235 .module = THIS_MODULE,
236 .init_service = ip_vs_dh_init_svc,
237 .done_service = ip_vs_dh_done_svc,
238 .update_service = ip_vs_dh_update_svc,
239 .schedule = ip_vs_dh_schedule,
240};
241
242
243static int __init ip_vs_dh_init(void)
244{
245 INIT_LIST_HEAD(&ip_vs_dh_scheduler.n_list);
246 return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
247}
248
249
250static void __exit ip_vs_dh_cleanup(void)
251{
252 unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
253}
254
255
256module_init(ip_vs_dh_init);
257module_exit(ip_vs_dh_cleanup);
258MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
new file mode 100644
index 000000000000..67b3e2fc1fa1
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_est.c
@@ -0,0 +1,200 @@
1/*
2 * ip_vs_est.c: simple rate estimator for IPVS
3 *
4 * Version: $Id: ip_vs_est.c,v 1.4 2002/11/30 01:50:35 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 *
15 */
16#include <linux/kernel.h>
17#include <linux/types.h>
18
19#include <net/ip_vs.h>
20
21/*
22 This code estimates the rate over a short interval (such as 8
23 seconds) for virtual services and real servers. To measure the rate
24 over a long interval, it is easy to implement a user-level daemon
25 that periodically reads these statistical counters and computes rates.
26
27 Currently, the measurement is driven by a slow timer handler. Hopefully
28 this measurement will not introduce too much load.
29
30 We measure rate during the last 8 seconds every 2 seconds:
31
32 avgrate = avgrate*(1-W) + rate*W
33
34 where W = 2^(-2)
35
36 NOTES.
37
38 * The stored value for average bps is scaled by 2^5, so that maximal
39 rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10.
40
41 * A lot of code is taken from net/sched/estimator.c
42 */
43
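/*
 * Editor's note: an illustrative user-space sketch, not part of the
 * original patch.  It mirrors the fixed-point EWMA update that
 * estimation_timer() below applies every 2 seconds: the smoothed value
 * is kept scaled by 2^10 and W = 2^(-2) is applied with a shift,
 * exactly as for e->cps / s->cps.
 */
#include <stdio.h>

int main(void)
{
	unsigned int total = 0, last = 0, avg = 0;	/* avg scaled by 2^10 */
	unsigned int samples[] = { 100, 120, 80, 90 };	/* new conns per 2s tick */

	for (int i = 0; i < 4; i++) {
		unsigned int rate;

		total += samples[i];
		rate = (total - last) << 9;	/* per second, scaled by 2^10 */
		last = total;
		avg += ((long)rate - (long)avg) >> 2;	/* avg += (rate - avg) * W */
		printf("tick %d: cps = %u\n", i, (avg + 0x1FF) >> 10);
	}
	return 0;
}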
44
45struct ip_vs_estimator
46{
47 struct ip_vs_estimator *next;
48 struct ip_vs_stats *stats;
49
50 u32 last_conns;
51 u32 last_inpkts;
52 u32 last_outpkts;
53 u64 last_inbytes;
54 u64 last_outbytes;
55
56 u32 cps;
57 u32 inpps;
58 u32 outpps;
59 u32 inbps;
60 u32 outbps;
61};
62
63
64static struct ip_vs_estimator *est_list = NULL;
65static DEFINE_RWLOCK(est_lock);
66static struct timer_list est_timer;
67
68static void estimation_timer(unsigned long arg)
69{
70 struct ip_vs_estimator *e;
71 struct ip_vs_stats *s;
72 u32 n_conns;
73 u32 n_inpkts, n_outpkts;
74 u64 n_inbytes, n_outbytes;
75 u32 rate;
76
77 read_lock(&est_lock);
78 for (e = est_list; e; e = e->next) {
79 s = e->stats;
80
81 spin_lock(&s->lock);
82 n_conns = s->conns;
83 n_inpkts = s->inpkts;
84 n_outpkts = s->outpkts;
85 n_inbytes = s->inbytes;
86 n_outbytes = s->outbytes;
87
88 /* scaled by 2^10, but divided 2 seconds */
89 rate = (n_conns - e->last_conns)<<9;
90 e->last_conns = n_conns;
91 e->cps += ((long)rate - (long)e->cps)>>2;
92 s->cps = (e->cps+0x1FF)>>10;
93
94 rate = (n_inpkts - e->last_inpkts)<<9;
95 e->last_inpkts = n_inpkts;
96 e->inpps += ((long)rate - (long)e->inpps)>>2;
97 s->inpps = (e->inpps+0x1FF)>>10;
98
99 rate = (n_outpkts - e->last_outpkts)<<9;
100 e->last_outpkts = n_outpkts;
101 e->outpps += ((long)rate - (long)e->outpps)>>2;
102 s->outpps = (e->outpps+0x1FF)>>10;
103
104 rate = (n_inbytes - e->last_inbytes)<<4;
105 e->last_inbytes = n_inbytes;
106 e->inbps += ((long)rate - (long)e->inbps)>>2;
107 s->inbps = (e->inbps+0xF)>>5;
108
109 rate = (n_outbytes - e->last_outbytes)<<4;
110 e->last_outbytes = n_outbytes;
111 e->outbps += ((long)rate - (long)e->outbps)>>2;
112 s->outbps = (e->outbps+0xF)>>5;
113 spin_unlock(&s->lock);
114 }
115 read_unlock(&est_lock);
116 mod_timer(&est_timer, jiffies + 2*HZ);
117}
118
119int ip_vs_new_estimator(struct ip_vs_stats *stats)
120{
121 struct ip_vs_estimator *est;
122
123 est = kmalloc(sizeof(*est), GFP_KERNEL);
124 if (est == NULL)
125 return -ENOMEM;
126
127 memset(est, 0, sizeof(*est));
128 est->stats = stats;
129 est->last_conns = stats->conns;
130 est->cps = stats->cps<<10;
131
132 est->last_inpkts = stats->inpkts;
133 est->inpps = stats->inpps<<10;
134
135 est->last_outpkts = stats->outpkts;
136 est->outpps = stats->outpps<<10;
137
138 est->last_inbytes = stats->inbytes;
139 est->inbps = stats->inbps<<5;
140
141 est->last_outbytes = stats->outbytes;
142 est->outbps = stats->outbps<<5;
143
144 write_lock_bh(&est_lock);
145 est->next = est_list;
146 if (est->next == NULL) {
147 init_timer(&est_timer);
148 est_timer.expires = jiffies + 2*HZ;
149 est_timer.function = estimation_timer;
150 add_timer(&est_timer);
151 }
152 est_list = est;
153 write_unlock_bh(&est_lock);
154 return 0;
155}
156
157void ip_vs_kill_estimator(struct ip_vs_stats *stats)
158{
159 struct ip_vs_estimator *est, **pest;
160 int killed = 0;
161
162 write_lock_bh(&est_lock);
163 pest = &est_list;
164 while ((est=*pest) != NULL) {
165 if (est->stats != stats) {
166 pest = &est->next;
167 continue;
168 }
169 *pest = est->next;
170 kfree(est);
171 killed++;
172 }
173 if (killed && est_list == NULL)
174 del_timer_sync(&est_timer);
175 write_unlock_bh(&est_lock);
176}
177
178void ip_vs_zero_estimator(struct ip_vs_stats *stats)
179{
180 struct ip_vs_estimator *e;
181
182 write_lock_bh(&est_lock);
183 for (e = est_list; e; e = e->next) {
184 if (e->stats != stats)
185 continue;
186
187 /* set counters zero */
188 e->last_conns = 0;
189 e->last_inpkts = 0;
190 e->last_outpkts = 0;
191 e->last_inbytes = 0;
192 e->last_outbytes = 0;
193 e->cps = 0;
194 e->inpps = 0;
195 e->outpps = 0;
196 e->inbps = 0;
197 e->outbps = 0;
198 }
199 write_unlock_bh(&est_lock);
200}
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c
new file mode 100644
index 000000000000..a19a33ceb811
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_ftp.c
@@ -0,0 +1,400 @@
1/*
2 * ip_vs_ftp.c: IPVS ftp application module
3 *
4 * Version: $Id: ip_vs_ftp.c,v 1.13 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * Changes:
9 *
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version.
15 *
16 * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
17 * is that the ip_vs_ftp module handles the reverse direction of ip_masq_ftp.
18 *
19 * IP_MASQ_FTP ftp masquerading module
20 *
21 * Version: @(#)ip_masq_ftp.c 0.04 02/05/96
22 *
23 * Author: Wouter Gadeyne
24 *
25 */
26
27#include <linux/module.h>
28#include <linux/moduleparam.h>
29#include <linux/kernel.h>
30#include <linux/skbuff.h>
31#include <linux/in.h>
32#include <linux/ip.h>
33#include <net/protocol.h>
34#include <net/tcp.h>
35
36#include <net/ip_vs.h>
37
38
39#define SERVER_STRING "227 Entering Passive Mode ("
40#define CLIENT_STRING "PORT "
41
42
43/*
44 * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
45 * First port is set to the default port.
46 */
47static int ports[IP_VS_APP_MAX_PORTS] = {21, 0};
48module_param_array(ports, int, NULL, 0);
49
50/*
51 * Debug level
52 */
53#ifdef CONFIG_IP_VS_DEBUG
54static int debug=0;
55module_param(debug, int, 0);
56#endif
57
58
59/* Dummy variable */
60static int ip_vs_ftp_pasv;
61
62
63static int
64ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
65{
66 return 0;
67}
68
69
70static int
71ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
72{
73 return 0;
74}
75
76
77/*
78 * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", which starts
79 * with the "pattern" and is terminated by the "term" character.
80 * <addr,port> is in network order.
81 */
82static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
83 const char *pattern, size_t plen, char term,
84 __u32 *addr, __u16 *port,
85 char **start, char **end)
86{
87 unsigned char p[6];
88 int i = 0;
89
90 if (data_limit - data < plen) {
91 /* check if there is partial match */
92 if (strnicmp(data, pattern, data_limit - data) == 0)
93 return -1;
94 else
95 return 0;
96 }
97
98 if (strnicmp(data, pattern, plen) != 0) {
99 return 0;
100 }
101 *start = data + plen;
102
103 for (data = *start; *data != term; data++) {
104 if (data == data_limit)
105 return -1;
106 }
107 *end = data;
108
109 memset(p, 0, sizeof(p));
110 for (data = *start; data != *end; data++) {
111 if (*data >= '0' && *data <= '9') {
112 p[i] = p[i]*10 + *data - '0';
113 } else if (*data == ',' && i < 5) {
114 i++;
115 } else {
116 /* unexpected character */
117 return -1;
118 }
119 }
120
121 if (i != 5)
122 return -1;
123
124 *addr = (p[3]<<24) | (p[2]<<16) | (p[1]<<8) | p[0];
125 *port = (p[5]<<8) | p[4];
126 return 1;
127}
128
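/*
 * Editor's note: illustrative user-space sketch, not part of the
 * original patch.  It replays the "h1,h2,h3,h4,p1,p2" parsing done by
 * ip_vs_ftp_get_addrport() above: the first four numbers are the IPv4
 * address and the port is p1*256 + p2 (both are handed back by the
 * kernel function in network byte order).
 */
#include <stdio.h>

int main(void)
{
	const char *s = "192,168,0,1,4,1";	/* from "227 Entering Passive Mode (...)" */
	unsigned int p[6] = { 0 };
	int i = 0;

	for (; *s; s++) {
		if (*s >= '0' && *s <= '9')
			p[i] = p[i] * 10 + (*s - '0');
		else if (*s == ',' && i < 5)
			i++;
	}
	printf("addr %u.%u.%u.%u port %u\n",
	       p[0], p[1], p[2], p[3], p[4] * 256 + p[5]);
	return 0;
}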
129
130/*
131 * Look at outgoing ftp packets to catch the response to a PASV command
132 * from the server (inside-to-outside).
133 * When we see one, we build a connection entry with the client address,
134 * client port 0 (unknown at the moment), the server address and the
135 * server port. Mark the current connection entry as a control channel
136 * of the new entry. All this work is just so that the data connection
137 * can be scheduled to the right server later.
138 *
139 * The outgoing packet should be something like
140 * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
141 * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
142 */
143static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
144 struct sk_buff **pskb, int *diff)
145{
146 struct iphdr *iph;
147 struct tcphdr *th;
148 char *data, *data_limit;
149 char *start, *end;
150 __u32 from;
151 __u16 port;
152 struct ip_vs_conn *n_cp;
153 char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
154 unsigned buf_len;
155 int ret;
156
157 *diff = 0;
158
159 /* Only useful for established sessions */
160 if (cp->state != IP_VS_TCP_S_ESTABLISHED)
161 return 1;
162
163 /* Linear packets are much easier to deal with. */
164 if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
165 return 0;
166
167 if (cp->app_data == &ip_vs_ftp_pasv) {
168 iph = (*pskb)->nh.iph;
169 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
170 data = (char *)th + (th->doff << 2);
171 data_limit = (*pskb)->tail;
172
173 if (ip_vs_ftp_get_addrport(data, data_limit,
174 SERVER_STRING,
175 sizeof(SERVER_STRING)-1, ')',
176 &from, &port,
177 &start, &end) != 1)
178 return 1;
179
180 IP_VS_DBG(1-debug, "PASV response (%u.%u.%u.%u:%d) -> "
181 "%u.%u.%u.%u:%d detected\n",
182 NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0);
183
184 /*
185 * Now update or create a connection entry for it
186 */
187 n_cp = ip_vs_conn_out_get(iph->protocol, from, port,
188 cp->caddr, 0);
189 if (!n_cp) {
190 n_cp = ip_vs_conn_new(IPPROTO_TCP,
191 cp->caddr, 0,
192 cp->vaddr, port,
193 from, port,
194 IP_VS_CONN_F_NO_CPORT,
195 cp->dest);
196 if (!n_cp)
197 return 0;
198
199 /* add its controller */
200 ip_vs_control_add(n_cp, cp);
201 }
202
203 /*
204 * Replace the old passive address with the new one
205 */
206 from = n_cp->vaddr;
207 port = n_cp->vport;
208 sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from),
209 port&255, (port>>8)&255);
210 buf_len = strlen(buf);
211
212 /*
213 * Calculate required delta-offset to keep TCP happy
214 */
215 *diff = buf_len - (end-start);
216
217 if (*diff == 0) {
218 /* simply replace it with new passive address */
219 memcpy(start, buf, buf_len);
220 ret = 1;
221 } else {
222 ret = !ip_vs_skb_replace(*pskb, GFP_ATOMIC, start,
223 end-start, buf, buf_len);
224 }
225
226 cp->app_data = NULL;
227 ip_vs_tcp_conn_listen(n_cp);
228 ip_vs_conn_put(n_cp);
229 return ret;
230 }
231 return 1;
232}
233
234
235/*
236 * Look at incoming ftp packets to catch the PASV/PORT command
237 * (outside-to-inside).
238 *
239 * The incoming packet having the PORT command should be something like
240 * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
241 * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
242 * In this case, we create a connection entry using the client address and
243 * port, so that the active ftp data connection from the server can reach
244 * the client.
245 */
246static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
247 struct sk_buff **pskb, int *diff)
248{
249 struct iphdr *iph;
250 struct tcphdr *th;
251 char *data, *data_start, *data_limit;
252 char *start, *end;
253 __u32 to;
254 __u16 port;
255 struct ip_vs_conn *n_cp;
256
257 /* no diff required for incoming packets */
258 *diff = 0;
259
260 /* Only useful for established sessions */
261 if (cp->state != IP_VS_TCP_S_ESTABLISHED)
262 return 1;
263
264 /* Linear packets are much easier to deal with. */
265 if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
266 return 0;
267
268 /*
269 * Detecting whether it is passive
270 */
271 iph = (*pskb)->nh.iph;
272 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
273
274 /* Since there may be options in the TCP packet and HLEN is
275 the length of the header in 32-bit words, it is accurate
276 to calculate the data address as th + HLEN*4 */
277 data = data_start = (char *)th + (th->doff << 2);
278 data_limit = (*pskb)->tail;
279
280 while (data <= data_limit - 6) {
281 if (strnicmp(data, "PASV\r\n", 6) == 0) {
282 /* Passive mode on */
283 IP_VS_DBG(1-debug, "got PASV at %zd of %zd\n",
284 data - data_start,
285 data_limit - data_start);
286 cp->app_data = &ip_vs_ftp_pasv;
287 return 1;
288 }
289 data++;
290 }
291
292 /*
293 * To support a virtual FTP server, the scenario is as follows:
294 * FTP client ----> Load Balancer ----> FTP server
295 * First detect the port number in the application data,
296 * then create a new connection entry for the incoming data
297 * connection.
298 */
299 if (ip_vs_ftp_get_addrport(data_start, data_limit,
300 CLIENT_STRING, sizeof(CLIENT_STRING)-1,
301 '\r', &to, &port,
302 &start, &end) != 1)
303 return 1;
304
305 IP_VS_DBG(1-debug, "PORT %u.%u.%u.%u:%d detected\n",
306 NIPQUAD(to), ntohs(port));
307
308 /* Passive mode off */
309 cp->app_data = NULL;
310
311 /*
312 * Now update or create a connection entry for it
313 */
314 IP_VS_DBG(1-debug, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
315 ip_vs_proto_name(iph->protocol),
316 NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0);
317
318 n_cp = ip_vs_conn_in_get(iph->protocol,
319 to, port,
320 cp->vaddr, htons(ntohs(cp->vport)-1));
321 if (!n_cp) {
322 n_cp = ip_vs_conn_new(IPPROTO_TCP,
323 to, port,
324 cp->vaddr, htons(ntohs(cp->vport)-1),
325 cp->daddr, htons(ntohs(cp->dport)-1),
326 0,
327 cp->dest);
328 if (!n_cp)
329 return 0;
330
331 /* add its controller */
332 ip_vs_control_add(n_cp, cp);
333 }
334
335 /*
336 * Move the new connection to the listening state
337 */
338 ip_vs_tcp_conn_listen(n_cp);
339 ip_vs_conn_put(n_cp);
340
341 return 1;
342}
343
344
345static struct ip_vs_app ip_vs_ftp = {
346 .name = "ftp",
347 .type = IP_VS_APP_TYPE_FTP,
348 .protocol = IPPROTO_TCP,
349 .module = THIS_MODULE,
350 .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list),
351 .init_conn = ip_vs_ftp_init_conn,
352 .done_conn = ip_vs_ftp_done_conn,
353 .bind_conn = NULL,
354 .unbind_conn = NULL,
355 .pkt_out = ip_vs_ftp_out,
356 .pkt_in = ip_vs_ftp_in,
357};
358
359
360/*
361 * ip_vs_ftp initialization
362 */
363static int __init ip_vs_ftp_init(void)
364{
365 int i, ret;
366 struct ip_vs_app *app = &ip_vs_ftp;
367
368 ret = register_ip_vs_app(app);
369 if (ret)
370 return ret;
371
372 for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
373 if (!ports[i])
374 continue;
375 ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
376 if (ret)
377 break;
378 IP_VS_DBG(1-debug, "%s: loaded support on port[%d] = %d\n",
379 app->name, i, ports[i]);
380 }
381
382 if (ret)
383 unregister_ip_vs_app(app);
384
385 return ret;
386}
387
388
389/*
390 * ip_vs_ftp finish.
391 */
392static void __exit ip_vs_ftp_exit(void)
393{
394 unregister_ip_vs_app(&ip_vs_ftp);
395}
396
397
398module_init(ip_vs_ftp_init);
399module_exit(ip_vs_ftp_exit);
400MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
new file mode 100644
index 000000000000..c035838b780a
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -0,0 +1,624 @@
1/*
2 * IPVS: Locality-Based Least-Connection scheduling module
3 *
4 * Version: $Id: ip_vs_lblc.c,v 1.10 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@gnuchina.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 * Martin Hamilton : fixed the terrible locking bugs
15 * *lock(tbl->lock) ==> *lock(&tbl->lock)
16 * Wensong Zhang : fixed the uninitialized tbl->lock bug
17 * Wensong Zhang : added full expiration check to
18 * collect stale entries of 24+ hours when
19 * no partial expire check in a half hour
20 * Julian Anastasov : replaced del_timer call with del_timer_sync
21 * to avoid the possible race between timer
22 * handler and del_timer thread in SMP
23 *
24 */
25
26/*
27 * The lblc algorithm is as follows (pseudo code):
28 *
29 * if cachenode[dest_ip] is null then
30 * n, cachenode[dest_ip] <- {weighted least-conn node};
31 * else
32 * n <- cachenode[dest_ip];
33 * if (n is dead) OR
34 * (n.conns>n.weight AND
35 * there is a node m with m.conns<m.weight/2) then
36 * n, cachenode[dest_ip] <- {weighted least-conn node};
37 *
38 * return n;
39 *
40 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
41 * me to write this module.
42 */
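/*
 * In the code below, cachenode[dest_ip] corresponds to the per-service
 * ip_vs_lblc_entry hash table, and the "weighted least-conn node" is
 * picked by __ip_vs_wlc_schedule().
 */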
43
44#include <linux/module.h>
45#include <linux/kernel.h>
46
47/* for sysctl */
48#include <linux/fs.h>
49#include <linux/sysctl.h>
50
51#include <net/ip_vs.h>
52
53
54/*
55 * It is for garbage collection of stale IPVS lblc entries,
56 * when the table is full.
57 */
58#define CHECK_EXPIRE_INTERVAL (60*HZ)
59#define ENTRY_TIMEOUT (6*60*HZ)
60
61/*
62 * It is for full expiration check.
63 * When there is no partial expiration check (garbage collection)
64 * in a half hour, do a full expiration check to collect stale
65 * entries that haven't been touched for a day.
66 */
67#define COUNT_FOR_FULL_EXPIRATION 30
68static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
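/*
 * Rough timing: the expire timer fires every CHECK_EXPIRE_INTERVAL
 * (60s), and the counter only advances on rounds where no partial
 * collection is needed, so about COUNT_FOR_FULL_EXPIRATION (30) such
 * rounds, i.e. roughly half an hour, trigger the full check.
 */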
69
70
71/*
72 * for IPVS lblc entry hash table
73 */
74#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
75#define CONFIG_IP_VS_LBLC_TAB_BITS 10
76#endif
77#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
78#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
79#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
80
81
82/*
83 * IPVS lblc entry represents an association between destination
84 * IP address and its destination server
85 */
86struct ip_vs_lblc_entry {
87 struct list_head list;
88 __u32 addr; /* destination IP address */
89 struct ip_vs_dest *dest; /* real server (cache) */
90 unsigned long lastuse; /* last used time */
91};
92
93
94/*
95 * IPVS lblc hash table
96 */
97struct ip_vs_lblc_table {
98 rwlock_t lock; /* lock for this table */
99 struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
100 atomic_t entries; /* number of entries */
101 int max_size; /* maximum size of entries */
102 struct timer_list periodic_timer; /* collect stale entries */
103 int rover; /* rover for expire check */
104 int counter; /* counter for no expire */
105};
106
107
108/*
109 * IPVS LBLC sysctl table
110 */
111
112static ctl_table vs_vars_table[] = {
113 {
114 .ctl_name = NET_IPV4_VS_LBLC_EXPIRE,
115 .procname = "lblc_expiration",
116 .data = &sysctl_ip_vs_lblc_expiration,
117 .maxlen = sizeof(int),
118 .mode = 0644,
119 .proc_handler = &proc_dointvec_jiffies,
120 },
121 { .ctl_name = 0 }
122};
123
124static ctl_table vs_table[] = {
125 {
126 .ctl_name = NET_IPV4_VS,
127 .procname = "vs",
128 .mode = 0555,
129 .child = vs_vars_table
130 },
131 { .ctl_name = 0 }
132};
133
134static ctl_table ipv4_table[] = {
135 {
136 .ctl_name = NET_IPV4,
137 .procname = "ipv4",
138 .mode = 0555,
139 .child = vs_table
140 },
141 { .ctl_name = 0 }
142};
143
144static ctl_table lblc_root_table[] = {
145 {
146 .ctl_name = CTL_NET,
147 .procname = "net",
148 .mode = 0555,
149 .child = ipv4_table
150 },
151 { .ctl_name = 0 }
152};
153
154static struct ctl_table_header * sysctl_header;
155
156/*
157 * new/free an ip_vs_lblc_entry, which is a mapping of a destination
158 * IP address to a server.
159 */
160static inline struct ip_vs_lblc_entry *
161ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest)
162{
163 struct ip_vs_lblc_entry *en;
164
165 en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
166 if (en == NULL) {
167 IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
168 return NULL;
169 }
170
171 INIT_LIST_HEAD(&en->list);
172 en->addr = daddr;
173
174 atomic_inc(&dest->refcnt);
175 en->dest = dest;
176
177 return en;
178}
179
180
181static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
182{
183 list_del(&en->list);
184 /*
185 * We don't kfree dest because it is referred either by its service
186 * or the trash dest list.
187 */
188 atomic_dec(&en->dest->refcnt);
189 kfree(en);
190}
191
192
193/*
194 * Returns hash value for IPVS LBLC entry
195 */
196static inline unsigned ip_vs_lblc_hashkey(__u32 addr)
197{
198 return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
199}
200
201
202/*
203 * Hash an entry in the ip_vs_lblc_table.
204 * returns bool success.
205 */
206static int
207ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
208{
209 unsigned hash;
210
211 if (!list_empty(&en->list)) {
212 IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
213 "called from %p\n", __builtin_return_address(0));
214 return 0;
215 }
216
217 /*
218 * Hash by destination IP address
219 */
220 hash = ip_vs_lblc_hashkey(en->addr);
221
222 write_lock(&tbl->lock);
223 list_add(&en->list, &tbl->bucket[hash]);
224 atomic_inc(&tbl->entries);
225 write_unlock(&tbl->lock);
226
227 return 1;
228}
229
230
231#if 0000
232/*
233 * Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
234 * returns bool success.
235 */
236static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
237 struct ip_vs_lblc_entry *en)
238{
239 if (list_empty(&en->list)) {
240 IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
241 "called from %p\n", __builtin_return_address(0));
242 return 0;
243 }
244
245 /*
246 * Remove it from the table
247 */
248 write_lock(&tbl->lock);
249 list_del(&en->list);
250 INIT_LIST_HEAD(&en->list);
251 write_unlock(&tbl->lock);
252
253 return 1;
254}
255#endif
256
257
258/*
259 * Get ip_vs_lblc_entry associated with supplied parameters.
260 */
261static inline struct ip_vs_lblc_entry *
262ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr)
263{
264 unsigned hash;
265 struct ip_vs_lblc_entry *en;
266
267 hash = ip_vs_lblc_hashkey(addr);
268
269 read_lock(&tbl->lock);
270
271 list_for_each_entry(en, &tbl->bucket[hash], list) {
272 if (en->addr == addr) {
273 /* HIT */
274 read_unlock(&tbl->lock);
275 return en;
276 }
277 }
278
279 read_unlock(&tbl->lock);
280
281 return NULL;
282}
283
284
285/*
286 * Flush all the entries of the specified table.
287 */
288static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
289{
290 int i;
291 struct ip_vs_lblc_entry *en, *nxt;
292
293 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
294 write_lock(&tbl->lock);
295 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
296 ip_vs_lblc_free(en);
297 atomic_dec(&tbl->entries);
298 }
299 write_unlock(&tbl->lock);
300 }
301}
302
303
304static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
305{
306 unsigned long now = jiffies;
307 int i, j;
308 struct ip_vs_lblc_entry *en, *nxt;
309
310 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
311 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
312
313 write_lock(&tbl->lock);
314 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
315 if (time_before(now,
316 en->lastuse + sysctl_ip_vs_lblc_expiration))
317 continue;
318
319 ip_vs_lblc_free(en);
320 atomic_dec(&tbl->entries);
321 }
322 write_unlock(&tbl->lock);
323 }
324 tbl->rover = j;
325}
326
327
328/*
329 * Periodic timer handler for the IPVS lblc table
330 * It is used to collect stale entries when the number of entries
331 * exceeds the maximum size of the table.
332 *
333 * Fixme: we probably need a more complicated algorithm to collect
334 * entries that have not been used for a long time even
335 * if the number of entries doesn't exceed the maximum size
336 * of the table.
337 * The full expiration check is for this purpose now.
338 */
339static void ip_vs_lblc_check_expire(unsigned long data)
340{
341 struct ip_vs_lblc_table *tbl;
342 unsigned long now = jiffies;
343 int goal;
344 int i, j;
345 struct ip_vs_lblc_entry *en, *nxt;
346
347 tbl = (struct ip_vs_lblc_table *)data;
348
349 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
350 /* do full expiration check */
351 ip_vs_lblc_full_check(tbl);
352 tbl->counter = 1;
353 goto out;
354 }
355
356 if (atomic_read(&tbl->entries) <= tbl->max_size) {
357 tbl->counter++;
358 goto out;
359 }
360
361 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
362 if (goal > tbl->max_size/2)
363 goal = tbl->max_size/2;
364
365 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
366 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
367
368 write_lock(&tbl->lock);
369 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
370 if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
371 continue;
372
373 ip_vs_lblc_free(en);
374 atomic_dec(&tbl->entries);
375 goal--;
376 }
377 write_unlock(&tbl->lock);
378 if (goal <= 0)
379 break;
380 }
381 tbl->rover = j;
382
383 out:
384 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
385}
386
387
388static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
389{
390 int i;
391 struct ip_vs_lblc_table *tbl;
392
393 /*
394 * Allocate the ip_vs_lblc_table for this service
395 */
396 tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
397 if (tbl == NULL) {
398 IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
399 return -ENOMEM;
400 }
401 svc->sched_data = tbl;
402 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
403 "current service\n",
404 sizeof(struct ip_vs_lblc_table));
405
406 /*
407 * Initialize the hash buckets
408 */
409 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
410 INIT_LIST_HEAD(&tbl->bucket[i]);
411 }
412 rwlock_init(&tbl->lock);
413 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
414 tbl->rover = 0;
415 tbl->counter = 1;
416
417 /*
418 * Hook periodic timer for garbage collection
419 */
420 init_timer(&tbl->periodic_timer);
421 tbl->periodic_timer.data = (unsigned long)tbl;
422 tbl->periodic_timer.function = ip_vs_lblc_check_expire;
423 tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
424 add_timer(&tbl->periodic_timer);
425
426 return 0;
427}
428
429
430static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
431{
432 struct ip_vs_lblc_table *tbl = svc->sched_data;
433
434 /* remove periodic timer */
435 del_timer_sync(&tbl->periodic_timer);
436
437 /* got to clean up table entries here */
438 ip_vs_lblc_flush(tbl);
439
440 /* release the table itself */
441 kfree(svc->sched_data);
442 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
443 sizeof(struct ip_vs_lblc_table));
444
445 return 0;
446}
447
448
449static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
450{
451 return 0;
452}
453
454
455static inline struct ip_vs_dest *
456__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
457{
458 struct ip_vs_dest *dest, *least;
459 int loh, doh;
460
461 /*
462 * We think the overhead of processing active connections is fifty
463 * times higher than that of inactive connections on average. (This
464 * fifty times might not be accurate, we will change it later.) We
465 * use the following formula to estimate the overhead:
466 * dest->activeconns*50 + dest->inactconns
467 * and the load:
468 * (dest overhead) / dest->weight
469 *
470 * Remember -- no floats in kernel mode!!!
471 * The comparison of h1*w2 > h2*w1 is equivalent to that of
472 * h1/w1 > h2/w2
473 * if every weight is larger than zero.
474 *
475 * The server with weight=0 is quiesced and will not receive any
476 * new connection.
477 */
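	/*
	 * Illustrative example: loh=100 with weight 1 versus doh=150 with
	 * weight 2 gives 100*2 > 150*1, so the second server carries the
	 * smaller weighted load and becomes the new "least".
	 */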
478 list_for_each_entry(dest, &svc->destinations, n_list) {
479 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
480 continue;
481 if (atomic_read(&dest->weight) > 0) {
482 least = dest;
483 loh = atomic_read(&least->activeconns) * 50
484 + atomic_read(&least->inactconns);
485 goto nextstage;
486 }
487 }
488 return NULL;
489
490 /*
491 * Find the destination with the least load.
492 */
493 nextstage:
494 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
495 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
496 continue;
497
498 doh = atomic_read(&dest->activeconns) * 50
499 + atomic_read(&dest->inactconns);
500 if (loh * atomic_read(&dest->weight) >
501 doh * atomic_read(&least->weight)) {
502 least = dest;
503 loh = doh;
504 }
505 }
506
507 IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
508 "activeconns %d refcnt %d weight %d overhead %d\n",
509 NIPQUAD(least->addr), ntohs(least->port),
510 atomic_read(&least->activeconns),
511 atomic_read(&least->refcnt),
512 atomic_read(&least->weight), loh);
513
514 return least;
515}
516
517
518/*
519 * If this destination server is overloaded and there is a less loaded
520 * server, then return true.
521 */
522static inline int
523is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
524{
525 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
526 struct ip_vs_dest *d;
527
528 list_for_each_entry(d, &svc->destinations, n_list) {
529 if (atomic_read(&d->activeconns)*2
530 < atomic_read(&d->weight)) {
531 return 1;
532 }
533 }
534 }
535 return 0;
536}
537
538
539/*
540 * Locality-Based (weighted) Least-Connection scheduling
541 */
542static struct ip_vs_dest *
543ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
544{
545 struct ip_vs_dest *dest;
546 struct ip_vs_lblc_table *tbl;
547 struct ip_vs_lblc_entry *en;
548 struct iphdr *iph = skb->nh.iph;
549
550 IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
551
552 tbl = (struct ip_vs_lblc_table *)svc->sched_data;
553 en = ip_vs_lblc_get(tbl, iph->daddr);
554 if (en == NULL) {
555 dest = __ip_vs_wlc_schedule(svc, iph);
556 if (dest == NULL) {
557 IP_VS_DBG(1, "no destination available\n");
558 return NULL;
559 }
560 en = ip_vs_lblc_new(iph->daddr, dest);
561 if (en == NULL) {
562 return NULL;
563 }
564 ip_vs_lblc_hash(tbl, en);
565 } else {
566 dest = en->dest;
567 if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
568 || atomic_read(&dest->weight) <= 0
569 || is_overloaded(dest, svc)) {
570 dest = __ip_vs_wlc_schedule(svc, iph);
571 if (dest == NULL) {
572 IP_VS_DBG(1, "no destination available\n");
573 return NULL;
574 }
575 atomic_dec(&en->dest->refcnt);
576 atomic_inc(&dest->refcnt);
577 en->dest = dest;
578 }
579 }
580 en->lastuse = jiffies;
581
582 IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
583 "--> server %u.%u.%u.%u:%d\n",
584 NIPQUAD(en->addr),
585 NIPQUAD(dest->addr),
586 ntohs(dest->port));
587
588 return dest;
589}
590
591
592/*
593 * IPVS LBLC Scheduler structure
594 */
595static struct ip_vs_scheduler ip_vs_lblc_scheduler =
596{
597 .name = "lblc",
598 .refcnt = ATOMIC_INIT(0),
599 .module = THIS_MODULE,
600 .init_service = ip_vs_lblc_init_svc,
601 .done_service = ip_vs_lblc_done_svc,
602 .update_service = ip_vs_lblc_update_svc,
603 .schedule = ip_vs_lblc_schedule,
604};
605
606
607static int __init ip_vs_lblc_init(void)
608{
609 INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
610 sysctl_header = register_sysctl_table(lblc_root_table, 0);
611 return register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
612}
613
614
615static void __exit ip_vs_lblc_cleanup(void)
616{
617 unregister_sysctl_table(sysctl_header);
618 unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
619}
620
621
622module_init(ip_vs_lblc_init);
623module_exit(ip_vs_lblc_cleanup);
624MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
new file mode 100644
index 000000000000..22b5dd55d271
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -0,0 +1,888 @@
1/*
2 * IPVS: Locality-Based Least-Connection with Replication scheduler
3 *
4 * Version: $Id: ip_vs_lblcr.c,v 1.11 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@gnuchina.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 * Julian Anastasov : Added the missing (dest->weight>0)
15 * condition in the ip_vs_dest_set_max.
16 *
17 */
18
19/*
20 * The lblc/r algorithm is as follows (pseudo code):
21 *
22 * if serverSet[dest_ip] is null then
23 * n, serverSet[dest_ip] <- {weighted least-conn node};
24 * else
25 * n <- {least-conn (alive) node in serverSet[dest_ip]};
26 * if (n is null) OR
27 * (n.conns>n.weight AND
28 * there is a node m with m.conns<m.weight/2) then
29 * n <- {weighted least-conn node};
30 * add n to serverSet[dest_ip];
31 * if |serverSet[dest_ip]| > 1 AND
32 * now - serverSet[dest_ip].lastMod > T then
33 * m <- {most conn node in serverSet[dest_ip]};
34 * remove m from serverSet[dest_ip];
35 * if serverSet[dest_ip] changed then
36 * serverSet[dest_ip].lastMod <- now;
37 *
38 * return n;
39 *
40 */
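/*
 * In the code below, serverSet[dest_ip] is the ip_vs_dest_set attached
 * to each ip_vs_lblcr_entry, the "weighted least-conn node" comes from
 * __ip_vs_wlc_schedule(), and T corresponds to
 * sysctl_ip_vs_lblcr_expiration.
 */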
41
42#include <linux/module.h>
43#include <linux/kernel.h>
44
45/* for sysctl */
46#include <linux/fs.h>
47#include <linux/sysctl.h>
48/* for proc_net_create/proc_net_remove */
49#include <linux/proc_fs.h>
50
51#include <net/ip_vs.h>
52
53
54/*
55 * It is for garbage collection of stale IPVS lblcr entries,
56 * when the table is full.
57 */
58#define CHECK_EXPIRE_INTERVAL (60*HZ)
59#define ENTRY_TIMEOUT (6*60*HZ)
60
61/*
62 * It is for full expiration check.
63 * When there is no partial expiration check (garbage collection)
64 * in a half hour, do a full expiration check to collect stale
65 * entries that haven't been touched for a day.
66 */
67#define COUNT_FOR_FULL_EXPIRATION 30
68static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
69
70
71/*
72 * for IPVS lblcr entry hash table
73 */
74#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
75#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
76#endif
77#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
78#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
79#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
80
81
82/*
83 * IPVS destination set structure and operations
84 */
85struct ip_vs_dest_list {
86 struct ip_vs_dest_list *next; /* list link */
87 struct ip_vs_dest *dest; /* destination server */
88};
89
90struct ip_vs_dest_set {
91 atomic_t size; /* set size */
92 unsigned long lastmod; /* last modified time */
93 struct ip_vs_dest_list *list; /* destination list */
94 rwlock_t lock; /* lock for this list */
95};
96
97
98static struct ip_vs_dest_list *
99ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
100{
101 struct ip_vs_dest_list *e;
102
103 for (e=set->list; e!=NULL; e=e->next) {
104 if (e->dest == dest)
105 /* already existed */
106 return NULL;
107 }
108
109 e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
110 if (e == NULL) {
111 IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
112 return NULL;
113 }
114
115 atomic_inc(&dest->refcnt);
116 e->dest = dest;
117
118 /* link it to the list */
119 write_lock(&set->lock);
120 e->next = set->list;
121 set->list = e;
122 atomic_inc(&set->size);
123 write_unlock(&set->lock);
124
125 set->lastmod = jiffies;
126 return e;
127}
128
129static void
130ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
131{
132 struct ip_vs_dest_list *e, **ep;
133
134 write_lock(&set->lock);
135 for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
136 if (e->dest == dest) {
137 /* HIT */
138 *ep = e->next;
139 atomic_dec(&set->size);
140 set->lastmod = jiffies;
141 atomic_dec(&e->dest->refcnt);
142 kfree(e);
143 break;
144 }
145 ep = &e->next;
146 }
147 write_unlock(&set->lock);
148}
149
150static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
151{
152 struct ip_vs_dest_list *e, **ep;
153
154 write_lock(&set->lock);
155 for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
156 *ep = e->next;
157 /*
158 * We don't kfree dest because it is referred either
159 * by its service or by the trash dest list.
160 */
161 atomic_dec(&e->dest->refcnt);
162 kfree(e);
163 }
164 write_unlock(&set->lock);
165}
166
167/* get weighted least-connection node in the destination set */
168static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
169{
170 register struct ip_vs_dest_list *e;
171 struct ip_vs_dest *dest, *least;
172 int loh, doh;
173
174 if (set == NULL)
175 return NULL;
176
177 read_lock(&set->lock);
178 /* select the first destination server whose weight > 0 */
179 for (e=set->list; e!=NULL; e=e->next) {
180 least = e->dest;
181 if (least->flags & IP_VS_DEST_F_OVERLOAD)
182 continue;
183
184 if ((atomic_read(&least->weight) > 0)
185 && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
186 loh = atomic_read(&least->activeconns) * 50
187 + atomic_read(&least->inactconns);
188 goto nextstage;
189 }
190 }
191 read_unlock(&set->lock);
192 return NULL;
193
194 /* find the destination with the weighted least load */
195 nextstage:
196 for (e=e->next; e!=NULL; e=e->next) {
197 dest = e->dest;
198 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
199 continue;
200
201 doh = atomic_read(&dest->activeconns) * 50
202 + atomic_read(&dest->inactconns);
203 if ((loh * atomic_read(&dest->weight) >
204 doh * atomic_read(&least->weight))
205 && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
206 least = dest;
207 loh = doh;
208 }
209 }
210 read_unlock(&set->lock);
211
212 IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
213 "activeconns %d refcnt %d weight %d overhead %d\n",
214 NIPQUAD(least->addr), ntohs(least->port),
215 atomic_read(&least->activeconns),
216 atomic_read(&least->refcnt),
217 atomic_read(&least->weight), loh);
218 return least;
219}
220
221
222/* get weighted most-connection node in the destination set */
223static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
224{
225 register struct ip_vs_dest_list *e;
226 struct ip_vs_dest *dest, *most;
227 int moh, doh;
228
229 if (set == NULL)
230 return NULL;
231
232 read_lock(&set->lock);
233 /* select the first destination server whose weight > 0 */
234 for (e=set->list; e!=NULL; e=e->next) {
235 most = e->dest;
236 if (atomic_read(&most->weight) > 0) {
237 moh = atomic_read(&most->activeconns) * 50
238 + atomic_read(&most->inactconns);
239 goto nextstage;
240 }
241 }
242 read_unlock(&set->lock);
243 return NULL;
244
245 /* find the destination with the weighted most load */
246 nextstage:
247 for (e=e->next; e!=NULL; e=e->next) {
248 dest = e->dest;
249 doh = atomic_read(&dest->activeconns) * 50
250 + atomic_read(&dest->inactconns);
251 /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
252 if ((moh * atomic_read(&dest->weight) <
253 doh * atomic_read(&most->weight))
254 && (atomic_read(&dest->weight) > 0)) {
255 most = dest;
256 moh = doh;
257 }
258 }
259 read_unlock(&set->lock);
260
261 IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
262 "activeconns %d refcnt %d weight %d overhead %d\n",
263 NIPQUAD(most->addr), ntohs(most->port),
264 atomic_read(&most->activeconns),
265 atomic_read(&most->refcnt),
266 atomic_read(&most->weight), moh);
267 return most;
268}
269
270
271/*
272 * IPVS lblcr entry represents an association between destination
273 * IP address and its destination server set
274 */
275struct ip_vs_lblcr_entry {
276 struct list_head list;
277 __u32 addr; /* destination IP address */
278 struct ip_vs_dest_set set; /* destination server set */
279 unsigned long lastuse; /* last used time */
280};
281
282
283/*
284 * IPVS lblcr hash table
285 */
286struct ip_vs_lblcr_table {
287 rwlock_t lock; /* lock for this table */
288 struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
289 atomic_t entries; /* number of entries */
290 int max_size; /* maximum size of entries */
291 struct timer_list periodic_timer; /* collect stale entries */
292 int rover; /* rover for expire check */
293 int counter; /* counter for no expire */
294};
295
296
297/*
298 * IPVS LBLCR sysctl table
299 */
300
301static ctl_table vs_vars_table[] = {
302 {
303 .ctl_name = NET_IPV4_VS_LBLCR_EXPIRE,
304 .procname = "lblcr_expiration",
305 .data = &sysctl_ip_vs_lblcr_expiration,
306 .maxlen = sizeof(int),
307 .mode = 0644,
308 .proc_handler = &proc_dointvec_jiffies,
309 },
310 { .ctl_name = 0 }
311};
312
313static ctl_table vs_table[] = {
314 {
315 .ctl_name = NET_IPV4_VS,
316 .procname = "vs",
317 .mode = 0555,
318 .child = vs_vars_table
319 },
320 { .ctl_name = 0 }
321};
322
323static ctl_table ipv4_table[] = {
324 {
325 .ctl_name = NET_IPV4,
326 .procname = "ipv4",
327 .mode = 0555,
328 .child = vs_table
329 },
330 { .ctl_name = 0 }
331};
332
333static ctl_table lblcr_root_table[] = {
334 {
335 .ctl_name = CTL_NET,
336 .procname = "net",
337 .mode = 0555,
338 .child = ipv4_table
339 },
340 { .ctl_name = 0 }
341};
342
343static struct ctl_table_header * sysctl_header;
344
345/*
346 * new/free an ip_vs_lblcr_entry, which is a mapping of a destination
347 * IP address to a server.
348 */
349static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr)
350{
351 struct ip_vs_lblcr_entry *en;
352
353 en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
354 if (en == NULL) {
355 IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
356 return NULL;
357 }
358
359 INIT_LIST_HEAD(&en->list);
360 en->addr = daddr;
361
362 /* initialize its dest set */
363 atomic_set(&(en->set.size), 0);
364 en->set.list = NULL;
365 rwlock_init(&en->set.lock);
366
367 return en;
368}
369
370
371static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
372{
373 list_del(&en->list);
374 ip_vs_dest_set_eraseall(&en->set);
375 kfree(en);
376}
377
378
379/*
380 * Returns hash value for IPVS LBLCR entry
381 */
382static inline unsigned ip_vs_lblcr_hashkey(__u32 addr)
383{
384 return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
385}
386
387
388/*
389 * Hash an entry in the ip_vs_lblcr_table.
390 * returns bool success.
391 */
392static int
393ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
394{
395 unsigned hash;
396
397 if (!list_empty(&en->list)) {
398 IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
399 "called from %p\n", __builtin_return_address(0));
400 return 0;
401 }
402
403 /*
404 * Hash by destination IP address
405 */
406 hash = ip_vs_lblcr_hashkey(en->addr);
407
408 write_lock(&tbl->lock);
409 list_add(&en->list, &tbl->bucket[hash]);
410 atomic_inc(&tbl->entries);
411 write_unlock(&tbl->lock);
412
413 return 1;
414}
415
416
417#if 0000
418/*
419 * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table.
420 * returns bool success.
421 */
422static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl,
423 struct ip_vs_lblcr_entry *en)
424{
425 if (list_empty(&en->list)) {
426 IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, "
427 "called from %p\n", __builtin_return_address(0));
428 return 0;
429 }
430
431 /*
432 * Remove it from the table
433 */
434 write_lock(&tbl->lock);
435 list_del(&en->list);
436 INIT_LIST_HEAD(&en->list);
437 write_unlock(&tbl->lock);
438
439 return 1;
440}
441#endif
442
443
444/*
445 * Get ip_vs_lblcr_entry associated with supplied parameters.
446 */
447static inline struct ip_vs_lblcr_entry *
448ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __u32 addr)
449{
450 unsigned hash;
451 struct ip_vs_lblcr_entry *en;
452
453 hash = ip_vs_lblcr_hashkey(addr);
454
455 read_lock(&tbl->lock);
456
457 list_for_each_entry(en, &tbl->bucket[hash], list) {
458 if (en->addr == addr) {
459 /* HIT */
460 read_unlock(&tbl->lock);
461 return en;
462 }
463 }
464
465 read_unlock(&tbl->lock);
466
467 return NULL;
468}
469
470
471/*
472 * Flush all the entries of the specified table.
473 */
474static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
475{
476 int i;
477 struct ip_vs_lblcr_entry *en, *nxt;
478
479 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
480 write_lock(&tbl->lock);
481 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
482 ip_vs_lblcr_free(en);
483 atomic_dec(&tbl->entries);
484 }
485 write_unlock(&tbl->lock);
486 }
487}
488
489
490static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
491{
492 unsigned long now = jiffies;
493 int i, j;
494 struct ip_vs_lblcr_entry *en, *nxt;
495
496 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
497 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
498
499 write_lock(&tbl->lock);
500 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
501 if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
502 now))
503 continue;
504
505 ip_vs_lblcr_free(en);
506 atomic_dec(&tbl->entries);
507 }
508 write_unlock(&tbl->lock);
509 }
510 tbl->rover = j;
511}
512
513
514/*
515 * Periodic timer handler for the IPVS lblcr table
516 * It is used to collect stale entries when the number of entries
517 * exceeds the maximum size of the table.
518 *
519 * Fixme: we probably need a more complicated algorithm to collect
520 * entries that have not been used for a long time even
521 * if the number of entries doesn't exceed the maximum size
522 * of the table.
523 * The full expiration check is for this purpose now.
524 */
525static void ip_vs_lblcr_check_expire(unsigned long data)
526{
527 struct ip_vs_lblcr_table *tbl;
528 unsigned long now = jiffies;
529 int goal;
530 int i, j;
531 struct ip_vs_lblcr_entry *en, *nxt;
532
533 tbl = (struct ip_vs_lblcr_table *)data;
534
535 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
536 /* do full expiration check */
537 ip_vs_lblcr_full_check(tbl);
538 tbl->counter = 1;
539 goto out;
540 }
541
542 if (atomic_read(&tbl->entries) <= tbl->max_size) {
543 tbl->counter++;
544 goto out;
545 }
546
547 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
548 if (goal > tbl->max_size/2)
549 goal = tbl->max_size/2;
550
551 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
552 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
553
554 write_lock(&tbl->lock);
555 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
556 if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
557 continue;
558
559 ip_vs_lblcr_free(en);
560 atomic_dec(&tbl->entries);
561 goal--;
562 }
563 write_unlock(&tbl->lock);
564 if (goal <= 0)
565 break;
566 }
567 tbl->rover = j;
568
569 out:
570 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
571}
572
573
574#ifdef CONFIG_IP_VS_LBLCR_DEBUG
575static struct ip_vs_lblcr_table *lblcr_table_list;
576
577/*
578 * /proc/net/ip_vs_lblcr to display the mappings of
579 * destination IP address <==> its serverSet
580 */
581static int
582ip_vs_lblcr_getinfo(char *buffer, char **start, off_t offset, int length)
583{
584 off_t pos=0, begin;
585 int len=0, size;
586 struct ip_vs_lblcr_table *tbl;
587 unsigned long now = jiffies;
588 int i;
589 struct ip_vs_lblcr_entry *en;
590
591 tbl = lblcr_table_list;
592
593 size = sprintf(buffer, "LastTime Dest IP address Server set\n");
594 pos += size;
595 len += size;
596
597 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
598 read_lock_bh(&tbl->lock);
599 list_for_each_entry(en, &tbl->bucket[i], list) {
600 char tbuf[16];
601 struct ip_vs_dest_list *d;
602
603 sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(en->addr));
604 size = sprintf(buffer+len, "%8lu %-16s ",
605 now-en->lastuse, tbuf);
606
607 read_lock(&en->set.lock);
608 for (d=en->set.list; d!=NULL; d=d->next) {
609 size += sprintf(buffer+len+size,
610 "%u.%u.%u.%u ",
611 NIPQUAD(d->dest->addr));
612 }
613 read_unlock(&en->set.lock);
614 size += sprintf(buffer+len+size, "\n");
615 len += size;
616 pos += size;
617 if (pos <= offset)
618 len=0;
619 if (pos >= offset+length) {
620 read_unlock_bh(&tbl->lock);
621 goto done;
622 }
623 }
624 read_unlock_bh(&tbl->lock);
625 }
626
627 done:
628 begin = len - (pos - offset);
629 *start = buffer + begin;
630 len -= begin;
631 if(len>length)
632 len = length;
633 return len;
634}
635#endif
636
637
638static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
639{
640 int i;
641 struct ip_vs_lblcr_table *tbl;
642
643 /*
644 * Allocate the ip_vs_lblcr_table for this service
645 */
646 tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
647 if (tbl == NULL) {
648 IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
649 return -ENOMEM;
650 }
651 svc->sched_data = tbl;
652 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
653 "current service\n",
654 sizeof(struct ip_vs_lblcr_table));
655
656 /*
657 * Initialize the hash buckets
658 */
659 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
660 INIT_LIST_HEAD(&tbl->bucket[i]);
661 }
662 rwlock_init(&tbl->lock);
663 tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
664 tbl->rover = 0;
665 tbl->counter = 1;
666
667 /*
668 * Hook periodic timer for garbage collection
669 */
670 init_timer(&tbl->periodic_timer);
671 tbl->periodic_timer.data = (unsigned long)tbl;
672 tbl->periodic_timer.function = ip_vs_lblcr_check_expire;
673 tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
674 add_timer(&tbl->periodic_timer);
675
676#ifdef CONFIG_IP_VS_LBLCR_DEBUG
677 lblcr_table_list = tbl;
678#endif
679 return 0;
680}
681
682
683static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
684{
685 struct ip_vs_lblcr_table *tbl = svc->sched_data;
686
687 /* remove periodic timer */
688 del_timer_sync(&tbl->periodic_timer);
689
690 /* got to clean up table entries here */
691 ip_vs_lblcr_flush(tbl);
692
693 /* release the table itself */
694 kfree(svc->sched_data);
695 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
696 sizeof(struct ip_vs_lblcr_table));
697
698 return 0;
699}
700
701
702static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc)
703{
704 return 0;
705}
706
707
708static inline struct ip_vs_dest *
709__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
710{
711 struct ip_vs_dest *dest, *least;
712 int loh, doh;
713
714 /*
715 * We think the overhead of processing active connections is fifty
716 * times higher than that of inactive connections on average. (This
717 * fifty times might not be accurate, we will change it later.) We
718 * use the following formula to estimate the overhead:
719 * dest->activeconns*50 + dest->inactconns
720 * and the load:
721 * (dest overhead) / dest->weight
722 *
723 * Remember -- no floats in kernel mode!!!
724 * The comparison of h1*w2 > h2*w1 is equivalent to that of
725 * h1/w1 > h2/w2
726 * if every weight is larger than zero.
727 *
728 * The server with weight=0 is quiesced and will not receive any
729 * new connection.
730 */
731 list_for_each_entry(dest, &svc->destinations, n_list) {
732 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
733 continue;
734
735 if (atomic_read(&dest->weight) > 0) {
736 least = dest;
737 loh = atomic_read(&least->activeconns) * 50
738 + atomic_read(&least->inactconns);
739 goto nextstage;
740 }
741 }
742 return NULL;
743
744 /*
745 * Find the destination with the least load.
746 */
747 nextstage:
748 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
749 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
750 continue;
751
752 doh = atomic_read(&dest->activeconns) * 50
753 + atomic_read(&dest->inactconns);
754 if (loh * atomic_read(&dest->weight) >
755 doh * atomic_read(&least->weight)) {
756 least = dest;
757 loh = doh;
758 }
759 }
760
761 IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
762 "activeconns %d refcnt %d weight %d overhead %d\n",
763 NIPQUAD(least->addr), ntohs(least->port),
764 atomic_read(&least->activeconns),
765 atomic_read(&least->refcnt),
766 atomic_read(&least->weight), loh);
767
768 return least;
769}
770
771
772/*
773 * If this destination server is overloaded and there is a less loaded
774 * server, then return true.
775 */
776static inline int
777is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
778{
779 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
780 struct ip_vs_dest *d;
781
782 list_for_each_entry(d, &svc->destinations, n_list) {
783 if (atomic_read(&d->activeconns)*2
784 < atomic_read(&d->weight)) {
785 return 1;
786 }
787 }
788 }
789 return 0;
790}
791
792
793/*
794 * Locality-Based (weighted) Least-Connection with Replication scheduling
795 */
796static struct ip_vs_dest *
797ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
798{
799 struct ip_vs_dest *dest;
800 struct ip_vs_lblcr_table *tbl;
801 struct ip_vs_lblcr_entry *en;
802 struct iphdr *iph = skb->nh.iph;
803
804 IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
805
806 tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
807 en = ip_vs_lblcr_get(tbl, iph->daddr);
808 if (en == NULL) {
809 dest = __ip_vs_wlc_schedule(svc, iph);
810 if (dest == NULL) {
811 IP_VS_DBG(1, "no destination available\n");
812 return NULL;
813 }
814 en = ip_vs_lblcr_new(iph->daddr);
815 if (en == NULL) {
816 return NULL;
817 }
818 ip_vs_dest_set_insert(&en->set, dest);
819 ip_vs_lblcr_hash(tbl, en);
820 } else {
821 dest = ip_vs_dest_set_min(&en->set);
822 if (!dest || is_overloaded(dest, svc)) {
823 dest = __ip_vs_wlc_schedule(svc, iph);
824 if (dest == NULL) {
825 IP_VS_DBG(1, "no destination available\n");
826 return NULL;
827 }
828 ip_vs_dest_set_insert(&en->set, dest);
829 }
830 if (atomic_read(&en->set.size) > 1 &&
831 jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
832 struct ip_vs_dest *m;
833 m = ip_vs_dest_set_max(&en->set);
834 if (m)
835 ip_vs_dest_set_erase(&en->set, m);
836 }
837 }
838 en->lastuse = jiffies;
839
840 IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
841 "--> server %u.%u.%u.%u:%d\n",
842 NIPQUAD(en->addr),
843 NIPQUAD(dest->addr),
844 ntohs(dest->port));
845
846 return dest;
847}
848
849
850/*
851 * IPVS LBLCR Scheduler structure
852 */
853static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
854{
855 .name = "lblcr",
856 .refcnt = ATOMIC_INIT(0),
857 .module = THIS_MODULE,
858 .init_service = ip_vs_lblcr_init_svc,
859 .done_service = ip_vs_lblcr_done_svc,
860 .update_service = ip_vs_lblcr_update_svc,
861 .schedule = ip_vs_lblcr_schedule,
862};
863
864
865static int __init ip_vs_lblcr_init(void)
866{
867 INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list);
868 sysctl_header = register_sysctl_table(lblcr_root_table, 0);
869#ifdef CONFIG_IP_VS_LBLCR_DEBUG
870 proc_net_create("ip_vs_lblcr", 0, ip_vs_lblcr_getinfo);
871#endif
872 return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
873}
874
875
876static void __exit ip_vs_lblcr_cleanup(void)
877{
878#ifdef CONFIG_IP_VS_LBLCR_DEBUG
879 proc_net_remove("ip_vs_lblcr");
880#endif
881 unregister_sysctl_table(sysctl_header);
882 unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
883}
884
885
886module_init(ip_vs_lblcr_init);
887module_exit(ip_vs_lblcr_cleanup);
888MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c
new file mode 100644
index 000000000000..d88fef90a641
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_lc.c
@@ -0,0 +1,123 @@
1/*
2 * IPVS: Least-Connection Scheduling module
3 *
4 * Version: $Id: ip_vs_lc.c,v 1.10 2003/04/18 09:03:16 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 * Wensong Zhang : added the ip_vs_lc_update_svc
15 * Wensong Zhang : added any dest with weight=0 is quiesced
16 *
17 */
18
19#include <linux/module.h>
20#include <linux/kernel.h>
21
22#include <net/ip_vs.h>
23
24
25static int ip_vs_lc_init_svc(struct ip_vs_service *svc)
26{
27 return 0;
28}
29
30
31static int ip_vs_lc_done_svc(struct ip_vs_service *svc)
32{
33 return 0;
34}
35
36
37static int ip_vs_lc_update_svc(struct ip_vs_service *svc)
38{
39 return 0;
40}
41
42
43static inline unsigned int
44ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
45{
46 /*
47 * We think the overhead of processing active connections is 256
48 * times higher than that of inactive connections on average. (This
49 * 256 times might not be accurate, we will change it later) We
50 * use the following formula to estimate the overhead now:
51 * dest->activeconns*256 + dest->inactconns
52 */
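	/*
	 * E.g. 3 active and 10 inactive connections give an overhead of
	 * 3*256 + 10 = 778.
	 */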
53 return (atomic_read(&dest->activeconns) << 8) +
54 atomic_read(&dest->inactconns);
55}
56
57
58/*
59 * Least Connection scheduling
60 */
61static struct ip_vs_dest *
62ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
63{
64 struct ip_vs_dest *dest, *least = NULL;
65 unsigned int loh = 0, doh;
66
67 IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
68
69 /*
70 * Simply select the server with the least value of
71 * (activeconns<<8) + inactconns
72 * except those whose weight is zero.
73 * A weight of zero means that the server is quiesced: existing
74 * connections to the server are still served, but no new
75 * connections are assigned to it.
76 */
77
78 list_for_each_entry(dest, &svc->destinations, n_list) {
79 if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
80 atomic_read(&dest->weight) == 0)
81 continue;
82 doh = ip_vs_lc_dest_overhead(dest);
83 if (!least || doh < loh) {
84 least = dest;
85 loh = doh;
86 }
87 }
88
89 if (least)
90 IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n",
91 NIPQUAD(least->addr), ntohs(least->port),
92 atomic_read(&least->activeconns),
93 atomic_read(&least->inactconns));
94
95 return least;
96}
97
98
99static struct ip_vs_scheduler ip_vs_lc_scheduler = {
100 .name = "lc",
101 .refcnt = ATOMIC_INIT(0),
102 .module = THIS_MODULE,
103 .init_service = ip_vs_lc_init_svc,
104 .done_service = ip_vs_lc_done_svc,
105 .update_service = ip_vs_lc_update_svc,
106 .schedule = ip_vs_lc_schedule,
107};
108
109
110static int __init ip_vs_lc_init(void)
111{
112 INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list);
113 return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ;
114}
115
116static void __exit ip_vs_lc_cleanup(void)
117{
118 unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
119}
120
121module_init(ip_vs_lc_init);
122module_exit(ip_vs_lc_cleanup);
123MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c
new file mode 100644
index 000000000000..bc2a9e5f2a7b
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_nq.c
@@ -0,0 +1,161 @@
1/*
2 * IPVS: Never Queue scheduling module
3 *
4 * Version: $Id: ip_vs_nq.c,v 1.2 2003/06/08 09:31:19 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 *
15 */
16
17/*
18 * The NQ algorithm adopts a two-speed model. When there is an idle server
19 * available, the job will be sent to the idle server, instead of waiting
20 * for a fast one. When there is no idle server available, the job will be
21 * sent to the server that minimizes its expected delay (the Shortest
22 * Expected Delay scheduling algorithm).
23 *
24 * See the following paper for more information:
25 * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
26 * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
27 * pages 986-994, 1988.
28 *
29 * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
30 *
31 * The difference between NQ and SED is that NQ can improve overall
32 * system utilization.
33 *
34 */
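/*
 * Mapping to the code below: an "idle server" is one with
 * activeconns == 0 and is returned immediately; otherwise the scheduler
 * falls back to the SED-style comparison of (activeconns+1)/weight,
 * done with cross-multiplication to avoid division.
 */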
35
36#include <linux/module.h>
37#include <linux/kernel.h>
38
39#include <net/ip_vs.h>
40
41
42static int
43ip_vs_nq_init_svc(struct ip_vs_service *svc)
44{
45 return 0;
46}
47
48
49static int
50ip_vs_nq_done_svc(struct ip_vs_service *svc)
51{
52 return 0;
53}
54
55
56static int
57ip_vs_nq_update_svc(struct ip_vs_service *svc)
58{
59 return 0;
60}
61
62
63static inline unsigned int
64ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
65{
66 /*
67 * We only use the active connection number in the cost
68 * calculation here.
69 */
70 return atomic_read(&dest->activeconns) + 1;
71}
72
73
74/*
75 * Never Queue scheduling
76 */
77static struct ip_vs_dest *
78ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
79{
80 struct ip_vs_dest *dest, *least = NULL;
81 unsigned int loh = 0, doh;
82
83 IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");
84
85 /*
86 * We calculate the load of each dest server as follows:
87 * (server expected overhead) / dest->weight
88 *
89 * Remember -- no floats in kernel mode!!!
90 * The comparison of h1*w2 > h2*w1 is equivalent to that of
91 * h1/w1 > h2/w2
92 * if every weight is larger than zero.
93 *
94 * The server with weight=0 is quiesced and will not receive any
95 * new connections.
96 */
97
98 list_for_each_entry(dest, &svc->destinations, n_list) {
99
100 if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
101 !atomic_read(&dest->weight))
102 continue;
103
104 doh = ip_vs_nq_dest_overhead(dest);
105
106 /* return the server directly if it is idle */
107 if (atomic_read(&dest->activeconns) == 0) {
108 least = dest;
109 loh = doh;
110 goto out;
111 }
112
113 if (!least ||
114 (loh * atomic_read(&dest->weight) >
115 doh * atomic_read(&least->weight))) {
116 least = dest;
117 loh = doh;
118 }
119 }
120
121 if (!least)
122 return NULL;
123
124 out:
125 IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u "
126 "activeconns %d refcnt %d weight %d overhead %d\n",
127 NIPQUAD(least->addr), ntohs(least->port),
128 atomic_read(&least->activeconns),
129 atomic_read(&least->refcnt),
130 atomic_read(&least->weight), loh);
131
132 return least;
133}
134
135
136static struct ip_vs_scheduler ip_vs_nq_scheduler =
137{
138 .name = "nq",
139 .refcnt = ATOMIC_INIT(0),
140 .module = THIS_MODULE,
141 .init_service = ip_vs_nq_init_svc,
142 .done_service = ip_vs_nq_done_svc,
143 .update_service = ip_vs_nq_update_svc,
144 .schedule = ip_vs_nq_schedule,
145};
146
147
148static int __init ip_vs_nq_init(void)
149{
150 INIT_LIST_HEAD(&ip_vs_nq_scheduler.n_list);
151 return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
152}
153
154static void __exit ip_vs_nq_cleanup(void)
155{
156 unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
157}
158
159module_init(ip_vs_nq_init);
160module_exit(ip_vs_nq_cleanup);
161MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c
new file mode 100644
index 000000000000..253c46252bd5
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto.c
@@ -0,0 +1,244 @@
1/*
2 * ip_vs_proto.c: transport protocol load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto.c,v 1.2 2003/04/18 09:03:16 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Julian Anastasov <ja@ssi.bg>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 *
16 */
17
18#include <linux/module.h>
19#include <linux/kernel.h>
20#include <linux/skbuff.h>
21#include <linux/in.h>
22#include <linux/ip.h>
23#include <net/protocol.h>
24#include <net/tcp.h>
25#include <net/udp.h>
26#include <asm/system.h>
27#include <linux/stat.h>
28#include <linux/proc_fs.h>
29
30#include <net/ip_vs.h>
31
32
33/*
34 * IPVS protocols can only be registered/unregistered when the ipvs
35 * module is loaded/unloaded, so no lock is needed in accessing the
36 * ipvs protocol table.
37 */
38
39#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */
40#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
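/*
 * The hash is simply the low 5 bits of the IP protocol number, e.g.
 * TCP (6) -> bucket 6, UDP (17) -> bucket 17, ESP (50) -> bucket 18,
 * AH (51) -> bucket 19.
 */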
41
42static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
43
44
45/*
46 * register an ipvs protocol
47 */
48static int register_ip_vs_protocol(struct ip_vs_protocol *pp)
49{
50 unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
51
52 pp->next = ip_vs_proto_table[hash];
53 ip_vs_proto_table[hash] = pp;
54
55 if (pp->init != NULL)
56 pp->init(pp);
57
58 return 0;
59}
60
61
62/*
63 * unregister an ipvs protocol
64 */
65static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
66{
67 struct ip_vs_protocol **pp_p;
68 unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
69
70 pp_p = &ip_vs_proto_table[hash];
71 for (; *pp_p; pp_p = &(*pp_p)->next) {
72 if (*pp_p == pp) {
73 *pp_p = pp->next;
74 if (pp->exit != NULL)
75 pp->exit(pp);
76 return 0;
77 }
78 }
79
80 return -ESRCH;
81}
82
83
84/*
85 * get ip_vs_protocol object by its proto.
86 */
87struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
88{
89 struct ip_vs_protocol *pp;
90 unsigned hash = IP_VS_PROTO_HASH(proto);
91
92 for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
93 if (pp->protocol == proto)
94 return pp;
95 }
96
97 return NULL;
98}
99
100
101/*
102 * Propagate event for state change to all protocols
103 */
104void ip_vs_protocol_timeout_change(int flags)
105{
106 struct ip_vs_protocol *pp;
107 int i;
108
109 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
110 for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
111 if (pp->timeout_change)
112 pp->timeout_change(pp, flags);
113 }
114 }
115}
116
117
118int *
119ip_vs_create_timeout_table(int *table, int size)
120{
121 int *t;
122
123 t = kmalloc(size, GFP_ATOMIC);
124 if (t == NULL)
125 return NULL;
126 memcpy(t, table, size);
127 return t;
128}
129
130
131/*
132 * Set timeout value for state specified by name
133 */
134int
135ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to)
136{
137 int i;
138
139 if (!table || !name || !to)
140 return -EINVAL;
141
142 for (i = 0; i < num; i++) {
143 if (strcmp(names[i], name))
144 continue;
145 table[i] = to * HZ;
146 return 0;
147 }
148 return -ENOENT;
149}
150
151
152const char * ip_vs_state_name(__u16 proto, int state)
153{
154 struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
155
156 if (pp == NULL || pp->state_name == NULL)
157 return "ERR!";
158 return pp->state_name(state);
159}
160
161
162void
163ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
164 const struct sk_buff *skb,
165 int offset,
166 const char *msg)
167{
168 char buf[128];
169 struct iphdr _iph, *ih;
170
171 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
172 if (ih == NULL)
173 sprintf(buf, "%s TRUNCATED", pp->name);
174 else if (ih->frag_off & __constant_htons(IP_OFFSET))
175 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
176 pp->name, NIPQUAD(ih->saddr),
177 NIPQUAD(ih->daddr));
178 else {
179 __u16 _ports[2], *pptr;
180
181 pptr = skb_header_pointer(skb, offset + ih->ihl*4,
182 sizeof(_ports), _ports);
183 if (pptr == NULL)
184 sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u",
185 pp->name,
186 NIPQUAD(ih->saddr),
187 NIPQUAD(ih->daddr));
188 else
189 sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
190 pp->name,
191 NIPQUAD(ih->saddr),
192 ntohs(pptr[0]),
193 NIPQUAD(ih->daddr),
194 ntohs(pptr[1]));
195 }
196
197 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
198}
199
200
201int ip_vs_protocol_init(void)
202{
203 char protocols[64];
204#define REGISTER_PROTOCOL(p) \
205 do { \
206 register_ip_vs_protocol(p); \
207 strcat(protocols, ", "); \
208 strcat(protocols, (p)->name); \
209 } while (0)
210
211 protocols[0] = '\0';
212 protocols[2] = '\0';
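	/*
	 * protocols[2] is cleared as well so that &protocols[2], printed
	 * below to skip the leading ", ", is a valid empty string even
	 * when no protocol gets registered.
	 */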
213#ifdef CONFIG_IP_VS_PROTO_TCP
214 REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
215#endif
216#ifdef CONFIG_IP_VS_PROTO_UDP
217 REGISTER_PROTOCOL(&ip_vs_protocol_udp);
218#endif
219#ifdef CONFIG_IP_VS_PROTO_ICMP
220 REGISTER_PROTOCOL(&ip_vs_protocol_icmp);
221#endif
222#ifdef CONFIG_IP_VS_PROTO_AH
223 REGISTER_PROTOCOL(&ip_vs_protocol_ah);
224#endif
225#ifdef CONFIG_IP_VS_PROTO_ESP
226 REGISTER_PROTOCOL(&ip_vs_protocol_esp);
227#endif
228 IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]);
229
230 return 0;
231}
232
233
234void ip_vs_protocol_cleanup(void)
235{
236 struct ip_vs_protocol *pp;
237 int i;
238
239 /* unregister all the ipvs protocols */
240 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
241 while ((pp = ip_vs_proto_table[i]) != NULL)
242 unregister_ip_vs_protocol(pp);
243 }
244}
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
new file mode 100644
index 000000000000..453e94a0bbd7
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_ah.c
@@ -0,0 +1,177 @@
1/*
2 * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto_ah.c,v 1.1 2003/07/04 15:04:37 wensong Exp $
5 *
6 * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
7 * Wensong Zhang <wensong@linuxvirtualserver.org>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * version 2 as published by the Free Software Foundation;
12 *
13 */
14
15#include <linux/module.h>
16#include <linux/kernel.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19
20#include <net/ip_vs.h>
21
22
23/* TODO:
24
25struct isakmp_hdr {
26 __u8 icookie[8];
27 __u8 rcookie[8];
28 __u8 np;
29 __u8 version;
30 __u8 xchgtype;
31 __u8 flags;
32 __u32 msgid;
33 __u32 length;
34};
35
36*/
37
38#define PORT_ISAKMP 500
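/*
 * AH itself carries no ports, so the handlers below look up the UDP/500
 * (ISAKMP) connection entry; this lets IPsec traffic follow the real
 * server chosen for the IKE negotiation.
 */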
39
40
41static struct ip_vs_conn *
42ah_conn_in_get(const struct sk_buff *skb,
43 struct ip_vs_protocol *pp,
44 const struct iphdr *iph,
45 unsigned int proto_off,
46 int inverse)
47{
48 struct ip_vs_conn *cp;
49
50 if (likely(!inverse)) {
51 cp = ip_vs_conn_in_get(IPPROTO_UDP,
52 iph->saddr,
53 __constant_htons(PORT_ISAKMP),
54 iph->daddr,
55 __constant_htons(PORT_ISAKMP));
56 } else {
57 cp = ip_vs_conn_in_get(IPPROTO_UDP,
58 iph->daddr,
59 __constant_htons(PORT_ISAKMP),
60 iph->saddr,
61 __constant_htons(PORT_ISAKMP));
62 }
63
64 if (!cp) {
65 /*
66 * We are not sure if the packet is from our
67 * service, so our conn_schedule hook should return NF_ACCEPT
68 */
69 IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
70 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
71 inverse ? "ICMP+" : "",
72 pp->name,
73 NIPQUAD(iph->saddr),
74 NIPQUAD(iph->daddr));
75 }
76
77 return cp;
78}
79
80
81static struct ip_vs_conn *
82ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
83 const struct iphdr *iph, unsigned int proto_off, int inverse)
84{
85 struct ip_vs_conn *cp;
86
87 if (likely(!inverse)) {
88 cp = ip_vs_conn_out_get(IPPROTO_UDP,
89 iph->saddr,
90 __constant_htons(PORT_ISAKMP),
91 iph->daddr,
92 __constant_htons(PORT_ISAKMP));
93 } else {
94 cp = ip_vs_conn_out_get(IPPROTO_UDP,
95 iph->daddr,
96 __constant_htons(PORT_ISAKMP),
97 iph->saddr,
98 __constant_htons(PORT_ISAKMP));
99 }
100
101 if (!cp) {
102 IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
103 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
104 inverse ? "ICMP+" : "",
105 pp->name,
106 NIPQUAD(iph->saddr),
107 NIPQUAD(iph->daddr));
108 }
109
110 return cp;
111}
112
113
114static int
115ah_conn_schedule(struct sk_buff *skb,
116 struct ip_vs_protocol *pp,
117 int *verdict, struct ip_vs_conn **cpp)
118{
119 /*
120 * AH is related traffic only. Pass the packet to the IP stack.
121 */
122 *verdict = NF_ACCEPT;
123 return 0;
124}
125
126
127static void
128ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
129 int offset, const char *msg)
130{
131 char buf[256];
132 struct iphdr _iph, *ih;
133
134 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
135 if (ih == NULL)
136 sprintf(buf, "%s TRUNCATED", pp->name);
137 else
138 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
139 pp->name, NIPQUAD(ih->saddr),
140 NIPQUAD(ih->daddr));
141
142 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
143}
144
145
146static void ah_init(struct ip_vs_protocol *pp)
147{
148 /* nothing to do now */
149}
150
151
152static void ah_exit(struct ip_vs_protocol *pp)
153{
154 /* nothing to do now */
155}
156
157
158struct ip_vs_protocol ip_vs_protocol_ah = {
159 .name = "AH",
160 .protocol = IPPROTO_AH,
161 .dont_defrag = 1,
162 .init = ah_init,
163 .exit = ah_exit,
164 .conn_schedule = ah_conn_schedule,
165 .conn_in_get = ah_conn_in_get,
166 .conn_out_get = ah_conn_out_get,
167 .snat_handler = NULL,
168 .dnat_handler = NULL,
169 .csum_check = NULL,
170 .state_transition = NULL,
171 .register_app = NULL,
172 .unregister_app = NULL,
173 .app_conn_bind = NULL,
174 .debug_packet = ah_debug_packet,
175 .timeout_change = NULL, /* ISAKMP */
176 .set_state_timeout = NULL,
177};
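
The handler above never schedules AH packets on its own; both lookup directions key on the ISAKMP control connection, i.e. UDP port 500 on both ends. A minimal user-space sketch of the lookup key it effectively builds (the struct and helper are illustrative only, not part of the kernel sources):

#include <stdint.h>
#include <netinet/in.h>		/* IPPROTO_UDP */
#include <arpa/inet.h>		/* htons */

struct isakmp_key {
	uint8_t  protocol;
	uint32_t saddr, daddr;	/* network byte order */
	uint16_t sport, dport;	/* network byte order */
};

/* Both directions use the same fixed ports: UDP 500 <-> 500. */
static struct isakmp_key make_isakmp_key(uint32_t saddr, uint32_t daddr)
{
	struct isakmp_key k = {
		.protocol = IPPROTO_UDP,
		.saddr = saddr, .daddr = daddr,
		.sport = htons(500), .dport = htons(500),
	};
	return k;
}
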
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
new file mode 100644
index 000000000000..478e5c7c7e8e
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_esp.c
@@ -0,0 +1,175 @@
1/*
2 * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto_esp.c,v 1.1 2003/07/04 15:04:37 wensong Exp $
5 *
6 * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
7 * Wensong Zhang <wensong@linuxvirtualserver.org>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * version 2 as published by the Free Software Foundation;
12 *
13 */
14
15#include <linux/module.h>
16#include <linux/kernel.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19
20#include <net/ip_vs.h>
21
22
23/* TODO:
24
25struct isakmp_hdr {
26 __u8 icookie[8];
27 __u8 rcookie[8];
28 __u8 np;
29 __u8 version;
30 __u8 xchgtype;
31 __u8 flags;
32 __u32 msgid;
33 __u32 length;
34};
35
36*/
37
38#define PORT_ISAKMP 500
39
40
41static struct ip_vs_conn *
42esp_conn_in_get(const struct sk_buff *skb,
43 struct ip_vs_protocol *pp,
44 const struct iphdr *iph,
45 unsigned int proto_off,
46 int inverse)
47{
48 struct ip_vs_conn *cp;
49
50 if (likely(!inverse)) {
51 cp = ip_vs_conn_in_get(IPPROTO_UDP,
52 iph->saddr,
53 __constant_htons(PORT_ISAKMP),
54 iph->daddr,
55 __constant_htons(PORT_ISAKMP));
56 } else {
57 cp = ip_vs_conn_in_get(IPPROTO_UDP,
58 iph->daddr,
59 __constant_htons(PORT_ISAKMP),
60 iph->saddr,
61 __constant_htons(PORT_ISAKMP));
62 }
63
64 if (!cp) {
65 /*
66 * We are not sure if the packet is from our
67 * service, so our conn_schedule hook should return NF_ACCEPT
68 */
69 IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
70 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
71 inverse ? "ICMP+" : "",
72 pp->name,
73 NIPQUAD(iph->saddr),
74 NIPQUAD(iph->daddr));
75 }
76
77 return cp;
78}
79
80
81static struct ip_vs_conn *
82esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
83 const struct iphdr *iph, unsigned int proto_off, int inverse)
84{
85 struct ip_vs_conn *cp;
86
87 if (likely(!inverse)) {
88 cp = ip_vs_conn_out_get(IPPROTO_UDP,
89 iph->saddr,
90 __constant_htons(PORT_ISAKMP),
91 iph->daddr,
92 __constant_htons(PORT_ISAKMP));
93 } else {
94 cp = ip_vs_conn_out_get(IPPROTO_UDP,
95 iph->daddr,
96 __constant_htons(PORT_ISAKMP),
97 iph->saddr,
98 __constant_htons(PORT_ISAKMP));
99 }
100
101 if (!cp) {
102 IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
103 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
104 inverse ? "ICMP+" : "",
105 pp->name,
106 NIPQUAD(iph->saddr),
107 NIPQUAD(iph->daddr));
108 }
109
110 return cp;
111}
112
113
114static int
115esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
116 int *verdict, struct ip_vs_conn **cpp)
117{
118 /*
119	 * ESP packets are only ever related traffic; pass the packet to the IP stack.
120 */
121 *verdict = NF_ACCEPT;
122 return 0;
123}
124
125
126static void
127esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
128 int offset, const char *msg)
129{
130 char buf[256];
131 struct iphdr _iph, *ih;
132
133 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
134 if (ih == NULL)
135 sprintf(buf, "%s TRUNCATED", pp->name);
136 else
137 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
138 pp->name, NIPQUAD(ih->saddr),
139 NIPQUAD(ih->daddr));
140
141 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
142}
143
144
145static void esp_init(struct ip_vs_protocol *pp)
146{
147 /* nothing to do now */
148}
149
150
151static void esp_exit(struct ip_vs_protocol *pp)
152{
153 /* nothing to do now */
154}
155
156
157struct ip_vs_protocol ip_vs_protocol_esp = {
158 .name = "ESP",
159 .protocol = IPPROTO_ESP,
160 .dont_defrag = 1,
161 .init = esp_init,
162 .exit = esp_exit,
163 .conn_schedule = esp_conn_schedule,
164 .conn_in_get = esp_conn_in_get,
165 .conn_out_get = esp_conn_out_get,
166 .snat_handler = NULL,
167 .dnat_handler = NULL,
168 .csum_check = NULL,
169 .state_transition = NULL,
170 .register_app = NULL,
171 .unregister_app = NULL,
172 .app_conn_bind = NULL,
173 .debug_packet = esp_debug_packet,
174 .timeout_change = NULL, /* ISAKMP */
175};
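
ip_vs_proto_esp.c repeats ip_vs_proto_ah.c almost line for line; the two lookup pairs differ only in which connection-table function they call. A possible factoring, shown purely as a sketch (ip_vs_conn_in_get/ip_vs_conn_out_get and the types are the real symbols used above; the shared helper is hypothetical):

typedef struct ip_vs_conn *(*isakmp_lookup_t)(int protocol,
					      __u32 s_addr, __u16 s_port,
					      __u32 d_addr, __u16 d_port);

static struct ip_vs_conn *
isakmp_conn_get(isakmp_lookup_t lookup, const struct iphdr *iph, int inverse)
{
	__u32 saddr = inverse ? iph->daddr : iph->saddr;
	__u32 daddr = inverse ? iph->saddr : iph->daddr;

	/* Key on the ISAKMP control connection, not on the ESP/AH packet. */
	return lookup(IPPROTO_UDP, saddr, __constant_htons(PORT_ISAKMP),
		      daddr, __constant_htons(PORT_ISAKMP));
}
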
diff --git a/net/ipv4/ipvs/ip_vs_proto_icmp.c b/net/ipv4/ipvs/ip_vs_proto_icmp.c
new file mode 100644
index 000000000000..191e94aa1c1f
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_icmp.c
@@ -0,0 +1,182 @@
1/*
2 * ip_vs_proto_icmp.c: ICMP load balancing support for IP Virtual Server
3 *
4 * Authors: Julian Anastasov <ja@ssi.bg>, March 2002
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * version 2 as published by the Free Software Foundation;
9 *
10 */
11
12#include <linux/module.h>
13#include <linux/kernel.h>
14#include <linux/icmp.h>
15#include <linux/netfilter.h>
16#include <linux/netfilter_ipv4.h>
17
18#include <net/ip_vs.h>
19
20
21static int icmp_timeouts[1] = { 1*60*HZ };
22
23static char * icmp_state_name_table[1] = { "ICMP" };
24
25static struct ip_vs_conn *
26icmp_conn_in_get(const struct sk_buff *skb,
27 struct ip_vs_protocol *pp,
28 const struct iphdr *iph,
29 unsigned int proto_off,
30 int inverse)
31{
32#if 0
33 struct ip_vs_conn *cp;
34
35 if (likely(!inverse)) {
36 cp = ip_vs_conn_in_get(iph->protocol,
37 iph->saddr, 0,
38 iph->daddr, 0);
39 } else {
40 cp = ip_vs_conn_in_get(iph->protocol,
41 iph->daddr, 0,
42 iph->saddr, 0);
43 }
44
45 return cp;
46
47#else
48 return NULL;
49#endif
50}
51
52static struct ip_vs_conn *
53icmp_conn_out_get(const struct sk_buff *skb,
54 struct ip_vs_protocol *pp,
55 const struct iphdr *iph,
56 unsigned int proto_off,
57 int inverse)
58{
59#if 0
60 struct ip_vs_conn *cp;
61
62 if (likely(!inverse)) {
63 cp = ip_vs_conn_out_get(iph->protocol,
64 iph->saddr, 0,
65 iph->daddr, 0);
66 } else {
67		cp = ip_vs_conn_out_get(iph->protocol,
68 iph->daddr, 0,
69 iph->saddr, 0);
70 }
71
72 return cp;
73#else
74 return NULL;
75#endif
76}
77
78static int
79icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
80 int *verdict, struct ip_vs_conn **cpp)
81{
82 *verdict = NF_ACCEPT;
83 return 0;
84}
85
86static int
87icmp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
88{
89 if (!(skb->nh.iph->frag_off & __constant_htons(IP_OFFSET))) {
90 if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
91 if (ip_vs_checksum_complete(skb, skb->nh.iph->ihl * 4)) {
92 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "Failed checksum for");
93 return 0;
94 }
95 }
96 }
97 return 1;
98}
99
100static void
101icmp_debug_packet(struct ip_vs_protocol *pp,
102 const struct sk_buff *skb,
103 int offset,
104 const char *msg)
105{
106 char buf[256];
107 struct iphdr _iph, *ih;
108
109 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
110 if (ih == NULL)
111 sprintf(buf, "%s TRUNCATED", pp->name);
112 else if (ih->frag_off & __constant_htons(IP_OFFSET))
113 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
114 pp->name, NIPQUAD(ih->saddr),
115 NIPQUAD(ih->daddr));
116 else {
117 struct icmphdr _icmph, *ic;
118
119 ic = skb_header_pointer(skb, offset + ih->ihl*4,
120 sizeof(_icmph), &_icmph);
121 if (ic == NULL)
122 sprintf(buf, "%s TRUNCATED to %u bytes\n",
123 pp->name, skb->len - offset);
124 else
125 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d",
126 pp->name, NIPQUAD(ih->saddr),
127 NIPQUAD(ih->daddr),
128 ic->type, ic->code);
129 }
130 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
131}
132
133static int
134icmp_state_transition(struct ip_vs_conn *cp, int direction,
135 const struct sk_buff *skb,
136 struct ip_vs_protocol *pp)
137{
138 cp->timeout = pp->timeout_table[IP_VS_ICMP_S_NORMAL];
139 return 1;
140}
141
142static int
143icmp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
144{
145 int num;
146 char **names;
147
148 num = IP_VS_ICMP_S_LAST;
149 names = icmp_state_name_table;
150 return ip_vs_set_state_timeout(pp->timeout_table, num, names, sname, to);
151}
152
153
154static void icmp_init(struct ip_vs_protocol *pp)
155{
156 pp->timeout_table = icmp_timeouts;
157}
158
159static void icmp_exit(struct ip_vs_protocol *pp)
160{
161}
162
163struct ip_vs_protocol ip_vs_protocol_icmp = {
164 .name = "ICMP",
165 .protocol = IPPROTO_ICMP,
166 .dont_defrag = 0,
167 .init = icmp_init,
168 .exit = icmp_exit,
169 .conn_schedule = icmp_conn_schedule,
170 .conn_in_get = icmp_conn_in_get,
171 .conn_out_get = icmp_conn_out_get,
172 .snat_handler = NULL,
173 .dnat_handler = NULL,
174 .csum_check = icmp_csum_check,
175 .state_transition = icmp_state_transition,
176 .register_app = NULL,
177 .unregister_app = NULL,
178 .app_conn_bind = NULL,
179 .debug_packet = icmp_debug_packet,
180 .timeout_change = NULL,
181 .set_state_timeout = icmp_set_state_timeout,
182};
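
icmp_csum_check above verifies the ICMP checksum only for non-fragments that the hardware has not already vouched for, delegating the arithmetic to ip_vs_checksum_complete (built on the kernel's skb_checksum and csum_fold helpers). For reference, a stand-alone user-space version of the underlying 16-bit one's-complement sum from RFC 1071 (illustrative only):

#include <stdint.h>
#include <stddef.h>

static uint16_t inet_checksum(const void *data, size_t len)
{
	const uint8_t *p = data;
	uint32_t sum = 0;

	while (len > 1) {			/* sum 16-bit words */
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)				/* odd trailing byte */
		sum += (uint32_t)p[0] << 8;
	while (sum >> 16)			/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}
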
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
new file mode 100644
index 000000000000..e65de675da74
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -0,0 +1,640 @@
1/*
2 * ip_vs_proto_tcp.c: TCP load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto_tcp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Julian Anastasov <ja@ssi.bg>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/ip.h>
20#include <linux/tcp.h> /* for tcphdr */
21#include <net/ip.h>
22#include <net/tcp.h> /* for csum_tcpudp_magic */
23#include <linux/netfilter_ipv4.h>
24
25#include <net/ip_vs.h>
26
27
28static struct ip_vs_conn *
29tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
30 const struct iphdr *iph, unsigned int proto_off, int inverse)
31{
32 __u16 _ports[2], *pptr;
33
34 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
35 if (pptr == NULL)
36 return NULL;
37
38 if (likely(!inverse)) {
39 return ip_vs_conn_in_get(iph->protocol,
40 iph->saddr, pptr[0],
41 iph->daddr, pptr[1]);
42 } else {
43 return ip_vs_conn_in_get(iph->protocol,
44 iph->daddr, pptr[1],
45 iph->saddr, pptr[0]);
46 }
47}
48
49static struct ip_vs_conn *
50tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
51 const struct iphdr *iph, unsigned int proto_off, int inverse)
52{
53 __u16 _ports[2], *pptr;
54
55 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
56 if (pptr == NULL)
57 return NULL;
58
59 if (likely(!inverse)) {
60 return ip_vs_conn_out_get(iph->protocol,
61 iph->saddr, pptr[0],
62 iph->daddr, pptr[1]);
63 } else {
64 return ip_vs_conn_out_get(iph->protocol,
65 iph->daddr, pptr[1],
66 iph->saddr, pptr[0]);
67 }
68}
69
70
71static int
72tcp_conn_schedule(struct sk_buff *skb,
73 struct ip_vs_protocol *pp,
74 int *verdict, struct ip_vs_conn **cpp)
75{
76 struct ip_vs_service *svc;
77 struct tcphdr _tcph, *th;
78
79 th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
80 sizeof(_tcph), &_tcph);
81 if (th == NULL) {
82 *verdict = NF_DROP;
83 return 0;
84 }
85
86 if (th->syn &&
87 (svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
88 skb->nh.iph->daddr, th->dest))) {
89 if (ip_vs_todrop()) {
90 /*
91 * It seems that we are very loaded.
92 * We have to drop this packet :(
93 */
94 ip_vs_service_put(svc);
95 *verdict = NF_DROP;
96 return 0;
97 }
98
99 /*
100 * Let the virtual server select a real server for the
101 * incoming connection, and create a connection entry.
102 */
103 *cpp = ip_vs_schedule(svc, skb);
104 if (!*cpp) {
105 *verdict = ip_vs_leave(svc, skb, pp);
106 return 0;
107 }
108 ip_vs_service_put(svc);
109 }
110 return 1;
111}
112
113
114static inline void
115tcp_fast_csum_update(struct tcphdr *tcph, u32 oldip, u32 newip,
116 u16 oldport, u16 newport)
117{
118 tcph->check =
119 ip_vs_check_diff(~oldip, newip,
120 ip_vs_check_diff(oldport ^ 0xFFFF,
121 newport, tcph->check));
122}
123
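
tcp_fast_csum_update never touches the payload: ip_vs_check_diff folds the difference between the old and new address and port into the existing checksum, so rewriting a packet costs only a few additions. A stand-alone sketch of the same idea for a single 16-bit field, in the spirit of RFC 1624 (illustrative only, not the kernel helper):

#include <stdint.h>

static uint16_t csum_update16(uint16_t check, uint16_t old_val, uint16_t new_val)
{
	/* HC' = ~(~HC + ~m + m'), all in one's-complement arithmetic */
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old_val;
	sum += new_val;
	while (sum >> 16)		/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}
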
124
125static int
126tcp_snat_handler(struct sk_buff **pskb,
127 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
128{
129 struct tcphdr *tcph;
130 unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
131
132 /* csum_check requires unshared skb */
133 if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
134 return 0;
135
136 if (unlikely(cp->app != NULL)) {
137 /* Some checks before mangling */
138 if (pp->csum_check && !pp->csum_check(*pskb, pp))
139 return 0;
140
141 /* Call application helper if needed */
142 if (!ip_vs_app_pkt_out(cp, pskb))
143 return 0;
144 }
145
146 tcph = (void *)(*pskb)->nh.iph + tcphoff;
147 tcph->source = cp->vport;
148
149 /* Adjust TCP checksums */
150 if (!cp->app) {
151 /* Only port and addr are changed, do fast csum update */
152 tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr,
153 cp->dport, cp->vport);
154 if ((*pskb)->ip_summed == CHECKSUM_HW)
155 (*pskb)->ip_summed = CHECKSUM_NONE;
156 } else {
157 /* full checksum calculation */
158 tcph->check = 0;
159 (*pskb)->csum = skb_checksum(*pskb, tcphoff,
160 (*pskb)->len - tcphoff, 0);
161 tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
162 (*pskb)->len - tcphoff,
163 cp->protocol,
164 (*pskb)->csum);
165 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
166 pp->name, tcph->check,
167 (char*)&(tcph->check) - (char*)tcph);
168 }
169 return 1;
170}
171
172
173static int
174tcp_dnat_handler(struct sk_buff **pskb,
175 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
176{
177 struct tcphdr *tcph;
178 unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
179
180 /* csum_check requires unshared skb */
181 if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
182 return 0;
183
184 if (unlikely(cp->app != NULL)) {
185 /* Some checks before mangling */
186 if (pp->csum_check && !pp->csum_check(*pskb, pp))
187 return 0;
188
189 /*
190 * Attempt ip_vs_app call.
191 * It will fix ip_vs_conn and iph ack_seq stuff
192 */
193 if (!ip_vs_app_pkt_in(cp, pskb))
194 return 0;
195 }
196
197 tcph = (void *)(*pskb)->nh.iph + tcphoff;
198 tcph->dest = cp->dport;
199
200 /*
201 * Adjust TCP checksums
202 */
203 if (!cp->app) {
204 /* Only port and addr are changed, do fast csum update */
205 tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr,
206 cp->vport, cp->dport);
207 if ((*pskb)->ip_summed == CHECKSUM_HW)
208 (*pskb)->ip_summed = CHECKSUM_NONE;
209 } else {
210 /* full checksum calculation */
211 tcph->check = 0;
212 (*pskb)->csum = skb_checksum(*pskb, tcphoff,
213 (*pskb)->len - tcphoff, 0);
214 tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
215 (*pskb)->len - tcphoff,
216 cp->protocol,
217 (*pskb)->csum);
218 (*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
219 }
220 return 1;
221}
222
223
224static int
225tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
226{
227 unsigned int tcphoff = skb->nh.iph->ihl*4;
228
229 switch (skb->ip_summed) {
230 case CHECKSUM_NONE:
231 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
232 case CHECKSUM_HW:
233 if (csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
234 skb->len - tcphoff,
235 skb->nh.iph->protocol, skb->csum)) {
236 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
237 "Failed checksum for");
238 return 0;
239 }
240 break;
241 default:
242 /* CHECKSUM_UNNECESSARY */
243 break;
244 }
245
246 return 1;
247}
248
249
250#define TCP_DIR_INPUT 0
251#define TCP_DIR_OUTPUT 4
252#define TCP_DIR_INPUT_ONLY 8
253
254static int tcp_state_off[IP_VS_DIR_LAST] = {
255 [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
256 [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
257 [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
258};
259
260/*
261 * Timeout table[state]
262 */
263static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
264 [IP_VS_TCP_S_NONE] = 2*HZ,
265 [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
266 [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
267 [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
268 [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
269 [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
270 [IP_VS_TCP_S_CLOSE] = 10*HZ,
271 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
272 [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
273 [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
274 [IP_VS_TCP_S_SYNACK] = 120*HZ,
275 [IP_VS_TCP_S_LAST] = 2*HZ,
276};
277
278
279#if 0
280
281/* FIXME: This is going to die */
282
283static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = {
284 [IP_VS_TCP_S_NONE] = 2*HZ,
285 [IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ,
286 [IP_VS_TCP_S_SYN_SENT] = 60*HZ,
287 [IP_VS_TCP_S_SYN_RECV] = 10*HZ,
288 [IP_VS_TCP_S_FIN_WAIT] = 60*HZ,
289 [IP_VS_TCP_S_TIME_WAIT] = 60*HZ,
290 [IP_VS_TCP_S_CLOSE] = 10*HZ,
291 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
292 [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
293 [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
294 [IP_VS_TCP_S_SYNACK] = 100*HZ,
295 [IP_VS_TCP_S_LAST] = 2*HZ,
296};
297
298#endif
299
300static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
301 [IP_VS_TCP_S_NONE] = "NONE",
302 [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
303 [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
304 [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
305 [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
306 [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
307 [IP_VS_TCP_S_CLOSE] = "CLOSE",
308 [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
309 [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
310 [IP_VS_TCP_S_LISTEN] = "LISTEN",
311 [IP_VS_TCP_S_SYNACK] = "SYNACK",
312 [IP_VS_TCP_S_LAST] = "BUG!",
313};
314
315#define sNO IP_VS_TCP_S_NONE
316#define sES IP_VS_TCP_S_ESTABLISHED
317#define sSS IP_VS_TCP_S_SYN_SENT
318#define sSR IP_VS_TCP_S_SYN_RECV
319#define sFW IP_VS_TCP_S_FIN_WAIT
320#define sTW IP_VS_TCP_S_TIME_WAIT
321#define sCL IP_VS_TCP_S_CLOSE
322#define sCW IP_VS_TCP_S_CLOSE_WAIT
323#define sLA IP_VS_TCP_S_LAST_ACK
324#define sLI IP_VS_TCP_S_LISTEN
325#define sSA IP_VS_TCP_S_SYNACK
326
327struct tcp_states_t {
328 int next_state[IP_VS_TCP_S_LAST];
329};
330
331static const char * tcp_state_name(int state)
332{
333 if (state >= IP_VS_TCP_S_LAST)
334 return "ERR!";
335 return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
336}
337
338static struct tcp_states_t tcp_states [] = {
339/* INPUT */
340/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
341/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
342/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
343/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
344/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
345
346/* OUTPUT */
347/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
348/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
349/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
350/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
351/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
352
353/* INPUT-ONLY */
354/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
355/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
356/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
357/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
358/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
359};
360
361static struct tcp_states_t tcp_states_dos [] = {
362/* INPUT */
363/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
364/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
365/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
366/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
367/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
368
369/* OUTPUT */
370/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
371/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
372/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
373/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
374/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
375
376/* INPUT-ONLY */
377/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
378/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
379/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
380/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
381/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
382};
383
384static struct tcp_states_t *tcp_state_table = tcp_states;
385
386
387static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
388{
389 int on = (flags & 1); /* secure_tcp */
390
391 /*
392 ** FIXME: change secure_tcp to independent sysctl var
393 ** or make it per-service or per-app because it is valid
394 ** for most if not for all of the applications. Something
395 ** like "capabilities" (flags) for each object.
396 */
397 tcp_state_table = (on? tcp_states_dos : tcp_states);
398}
399
400static int
401tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
402{
403 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
404 tcp_state_name_table, sname, to);
405}
406
407static inline int tcp_state_idx(struct tcphdr *th)
408{
409 if (th->rst)
410 return 3;
411 if (th->syn)
412 return 0;
413 if (th->fin)
414 return 1;
415 if (th->ack)
416 return 2;
417 return -1;
418}
419
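
The three blocks of tcp_states[] are indexed by direction offset plus event index, and the current connection state selects the column. A worked example against the default table: an ACK (tcp_state_idx == 2) seen in the OUTPUT direction for a connection in SYN_RECV.

	int row  = TCP_DIR_OUTPUT + 2;	/* "ack" row of the OUTPUT block */
	int next = tcp_states[row].next_state[IP_VS_TCP_S_SYN_RECV];
	/* next == IP_VS_TCP_S_ESTABLISHED (sES); set_tcp_state() below then
	 * gives the connection the ESTABLISHED timeout from tcp_timeouts[]. */
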
420static inline void
421set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
422 int direction, struct tcphdr *th)
423{
424 int state_idx;
425 int new_state = IP_VS_TCP_S_CLOSE;
426 int state_off = tcp_state_off[direction];
427
428 /*
429 * Update state offset to INPUT_ONLY if necessary
430 * or delete NO_OUTPUT flag if output packet detected
431 */
432 if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
433 if (state_off == TCP_DIR_OUTPUT)
434 cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
435 else
436 state_off = TCP_DIR_INPUT_ONLY;
437 }
438
439 if ((state_idx = tcp_state_idx(th)) < 0) {
440 IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
441 goto tcp_state_out;
442 }
443
444 new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
445
446 tcp_state_out:
447 if (new_state != cp->state) {
448 struct ip_vs_dest *dest = cp->dest;
449
450 IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
451 "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
452 pp->name,
453 (state_off==TCP_DIR_OUTPUT)?"output ":"input ",
454 th->syn? 'S' : '.',
455 th->fin? 'F' : '.',
456 th->ack? 'A' : '.',
457 th->rst? 'R' : '.',
458 NIPQUAD(cp->daddr), ntohs(cp->dport),
459 NIPQUAD(cp->caddr), ntohs(cp->cport),
460 tcp_state_name(cp->state),
461 tcp_state_name(new_state),
462 atomic_read(&cp->refcnt));
463 if (dest) {
464 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
465 (new_state != IP_VS_TCP_S_ESTABLISHED)) {
466 atomic_dec(&dest->activeconns);
467 atomic_inc(&dest->inactconns);
468 cp->flags |= IP_VS_CONN_F_INACTIVE;
469 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
470 (new_state == IP_VS_TCP_S_ESTABLISHED)) {
471 atomic_inc(&dest->activeconns);
472 atomic_dec(&dest->inactconns);
473 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
474 }
475 }
476 }
477
478 cp->timeout = pp->timeout_table[cp->state = new_state];
479}
480
481
482/*
483 * Handle state transitions
484 */
485static int
486tcp_state_transition(struct ip_vs_conn *cp, int direction,
487 const struct sk_buff *skb,
488 struct ip_vs_protocol *pp)
489{
490 struct tcphdr _tcph, *th;
491
492 th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
493 sizeof(_tcph), &_tcph);
494 if (th == NULL)
495 return 0;
496
497 spin_lock(&cp->lock);
498 set_tcp_state(pp, cp, direction, th);
499 spin_unlock(&cp->lock);
500
501 return 1;
502}
503
504
505/*
506 * Hash table for TCP application incarnations
507 */
508#define TCP_APP_TAB_BITS 4
509#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
510#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
511
512static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
513static DEFINE_SPINLOCK(tcp_app_lock);
514
515static inline __u16 tcp_app_hashkey(__u16 port)
516{
517 return ((port >> TCP_APP_TAB_BITS) ^ port) & TCP_APP_TAB_MASK;
518}
519
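
Application helpers are filed in this 16-bucket table keyed by their registered port, which is kept in network byte order, the same form later compared against cp->vport. Assuming the FTP helper registers its default control port 21, the bucket would be chosen like this (illustration only):

	__u16 port   = __constant_htons(21);	/* assumed ip_vs_ftp default */
	__u16 bucket = ((port >> TCP_APP_TAB_BITS) ^ port) & TCP_APP_TAB_MASK;
	/* tcp_register_app() then chains the incarnation onto tcp_apps[bucket]. */
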
520
521static int tcp_register_app(struct ip_vs_app *inc)
522{
523 struct ip_vs_app *i;
524 __u16 hash, port = inc->port;
525 int ret = 0;
526
527 hash = tcp_app_hashkey(port);
528
529 spin_lock_bh(&tcp_app_lock);
530 list_for_each_entry(i, &tcp_apps[hash], p_list) {
531 if (i->port == port) {
532 ret = -EEXIST;
533 goto out;
534 }
535 }
536 list_add(&inc->p_list, &tcp_apps[hash]);
537 atomic_inc(&ip_vs_protocol_tcp.appcnt);
538
539 out:
540 spin_unlock_bh(&tcp_app_lock);
541 return ret;
542}
543
544
545static void
546tcp_unregister_app(struct ip_vs_app *inc)
547{
548 spin_lock_bh(&tcp_app_lock);
549 atomic_dec(&ip_vs_protocol_tcp.appcnt);
550 list_del(&inc->p_list);
551 spin_unlock_bh(&tcp_app_lock);
552}
553
554
555static int
556tcp_app_conn_bind(struct ip_vs_conn *cp)
557{
558 int hash;
559 struct ip_vs_app *inc;
560 int result = 0;
561
562 /* Default binding: bind app only for NAT */
563 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
564 return 0;
565
566 /* Lookup application incarnations and bind the right one */
567 hash = tcp_app_hashkey(cp->vport);
568
569 spin_lock(&tcp_app_lock);
570 list_for_each_entry(inc, &tcp_apps[hash], p_list) {
571 if (inc->port == cp->vport) {
572 if (unlikely(!ip_vs_app_inc_get(inc)))
573 break;
574 spin_unlock(&tcp_app_lock);
575
576 IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
577 "%u.%u.%u.%u:%u to app %s on port %u\n",
578 __FUNCTION__,
579 NIPQUAD(cp->caddr), ntohs(cp->cport),
580 NIPQUAD(cp->vaddr), ntohs(cp->vport),
581 inc->name, ntohs(inc->port));
582 cp->app = inc;
583 if (inc->init_conn)
584 result = inc->init_conn(inc, cp);
585 goto out;
586 }
587 }
588 spin_unlock(&tcp_app_lock);
589
590 out:
591 return result;
592}
593
594
595/*
596 * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
597 */
598void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
599{
600 spin_lock(&cp->lock);
601 cp->state = IP_VS_TCP_S_LISTEN;
602 cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
603 spin_unlock(&cp->lock);
604}
605
606
607static void tcp_init(struct ip_vs_protocol *pp)
608{
609 IP_VS_INIT_HASH_TABLE(tcp_apps);
610 pp->timeout_table = tcp_timeouts;
611}
612
613
614static void tcp_exit(struct ip_vs_protocol *pp)
615{
616}
617
618
619struct ip_vs_protocol ip_vs_protocol_tcp = {
620 .name = "TCP",
621 .protocol = IPPROTO_TCP,
622 .dont_defrag = 0,
623 .appcnt = ATOMIC_INIT(0),
624 .init = tcp_init,
625 .exit = tcp_exit,
626 .register_app = tcp_register_app,
627 .unregister_app = tcp_unregister_app,
628 .conn_schedule = tcp_conn_schedule,
629 .conn_in_get = tcp_conn_in_get,
630 .conn_out_get = tcp_conn_out_get,
631 .snat_handler = tcp_snat_handler,
632 .dnat_handler = tcp_dnat_handler,
633 .csum_check = tcp_csum_check,
634 .state_name = tcp_state_name,
635 .state_transition = tcp_state_transition,
636 .app_conn_bind = tcp_app_conn_bind,
637 .debug_packet = ip_vs_tcpudp_debug_packet,
638 .timeout_change = tcp_timeout_change,
639 .set_state_timeout = tcp_set_state_timeout,
640};
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
new file mode 100644
index 000000000000..8ae5f2e0aefa
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -0,0 +1,427 @@
1/*
2 * ip_vs_proto_udp.c: UDP load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto_udp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Julian Anastasov <ja@ssi.bg>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/netfilter_ipv4.h>
20
21#include <net/ip_vs.h>
22
23
24static struct ip_vs_conn *
25udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
26 const struct iphdr *iph, unsigned int proto_off, int inverse)
27{
28 struct ip_vs_conn *cp;
29 __u16 _ports[2], *pptr;
30
31 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
32 if (pptr == NULL)
33 return NULL;
34
35 if (likely(!inverse)) {
36 cp = ip_vs_conn_in_get(iph->protocol,
37 iph->saddr, pptr[0],
38 iph->daddr, pptr[1]);
39 } else {
40 cp = ip_vs_conn_in_get(iph->protocol,
41 iph->daddr, pptr[1],
42 iph->saddr, pptr[0]);
43 }
44
45 return cp;
46}
47
48
49static struct ip_vs_conn *
50udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
51 const struct iphdr *iph, unsigned int proto_off, int inverse)
52{
53 struct ip_vs_conn *cp;
54 __u16 _ports[2], *pptr;
55
56 pptr = skb_header_pointer(skb, skb->nh.iph->ihl*4,
57 sizeof(_ports), _ports);
58 if (pptr == NULL)
59 return NULL;
60
61 if (likely(!inverse)) {
62 cp = ip_vs_conn_out_get(iph->protocol,
63 iph->saddr, pptr[0],
64 iph->daddr, pptr[1]);
65 } else {
66 cp = ip_vs_conn_out_get(iph->protocol,
67 iph->daddr, pptr[1],
68 iph->saddr, pptr[0]);
69 }
70
71 return cp;
72}
73
74
75static int
76udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
77 int *verdict, struct ip_vs_conn **cpp)
78{
79 struct ip_vs_service *svc;
80 struct udphdr _udph, *uh;
81
82 uh = skb_header_pointer(skb, skb->nh.iph->ihl*4,
83 sizeof(_udph), &_udph);
84 if (uh == NULL) {
85 *verdict = NF_DROP;
86 return 0;
87 }
88
89 if ((svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
90 skb->nh.iph->daddr, uh->dest))) {
91 if (ip_vs_todrop()) {
92 /*
93 * It seems that we are very loaded.
94 * We have to drop this packet :(
95 */
96 ip_vs_service_put(svc);
97 *verdict = NF_DROP;
98 return 0;
99 }
100
101 /*
102 * Let the virtual server select a real server for the
103 * incoming connection, and create a connection entry.
104 */
105 *cpp = ip_vs_schedule(svc, skb);
106 if (!*cpp) {
107 *verdict = ip_vs_leave(svc, skb, pp);
108 return 0;
109 }
110 ip_vs_service_put(svc);
111 }
112 return 1;
113}
114
115
116static inline void
117udp_fast_csum_update(struct udphdr *uhdr, u32 oldip, u32 newip,
118 u16 oldport, u16 newport)
119{
120 uhdr->check =
121 ip_vs_check_diff(~oldip, newip,
122 ip_vs_check_diff(oldport ^ 0xFFFF,
123 newport, uhdr->check));
124 if (!uhdr->check)
125 uhdr->check = 0xFFFF;
126}
127
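
Both the fast update above and the full recalculation in the SNAT/DNAT handlers below respect the RFC 768 rule that an all-zero UDP checksum means "no checksum computed": a sum that happens to fold to zero must therefore go on the wire as 0xFFFF (the two encode the same value in one's-complement arithmetic). As a tiny sketch of that rule:

/* Illustration only: map a folded checksum to its on-wire UDP form. */
static inline __u16 udp_wire_csum(__u16 folded)
{
	return folded ? folded : 0xFFFF;
}
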
128static int
129udp_snat_handler(struct sk_buff **pskb,
130 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
131{
132 struct udphdr *udph;
133 unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;
134
135 /* csum_check requires unshared skb */
136 if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
137 return 0;
138
139 if (unlikely(cp->app != NULL)) {
140 /* Some checks before mangling */
141 if (pp->csum_check && !pp->csum_check(*pskb, pp))
142 return 0;
143
144 /*
145 * Call application helper if needed
146 */
147 if (!ip_vs_app_pkt_out(cp, pskb))
148 return 0;
149 }
150
151 udph = (void *)(*pskb)->nh.iph + udphoff;
152 udph->source = cp->vport;
153
154 /*
155 * Adjust UDP checksums
156 */
157 if (!cp->app && (udph->check != 0)) {
158 /* Only port and addr are changed, do fast csum update */
159 udp_fast_csum_update(udph, cp->daddr, cp->vaddr,
160 cp->dport, cp->vport);
161 if ((*pskb)->ip_summed == CHECKSUM_HW)
162 (*pskb)->ip_summed = CHECKSUM_NONE;
163 } else {
164 /* full checksum calculation */
165 udph->check = 0;
166 (*pskb)->csum = skb_checksum(*pskb, udphoff,
167 (*pskb)->len - udphoff, 0);
168 udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
169 (*pskb)->len - udphoff,
170 cp->protocol,
171 (*pskb)->csum);
172 if (udph->check == 0)
173 udph->check = 0xFFFF;
174 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
175 pp->name, udph->check,
176 (char*)&(udph->check) - (char*)udph);
177 }
178 return 1;
179}
180
181
182static int
183udp_dnat_handler(struct sk_buff **pskb,
184 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
185{
186 struct udphdr *udph;
187 unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;
188
189 /* csum_check requires unshared skb */
190 if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
191 return 0;
192
193 if (unlikely(cp->app != NULL)) {
194 /* Some checks before mangling */
195 if (pp->csum_check && !pp->csum_check(*pskb, pp))
196 return 0;
197
198 /*
199 * Attempt ip_vs_app call.
200 * It will fix ip_vs_conn
201 */
202 if (!ip_vs_app_pkt_in(cp, pskb))
203 return 0;
204 }
205
206 udph = (void *)(*pskb)->nh.iph + udphoff;
207 udph->dest = cp->dport;
208
209 /*
210 * Adjust UDP checksums
211 */
212 if (!cp->app && (udph->check != 0)) {
213 /* Only port and addr are changed, do fast csum update */
214 udp_fast_csum_update(udph, cp->vaddr, cp->daddr,
215 cp->vport, cp->dport);
216 if ((*pskb)->ip_summed == CHECKSUM_HW)
217 (*pskb)->ip_summed = CHECKSUM_NONE;
218 } else {
219 /* full checksum calculation */
220 udph->check = 0;
221 (*pskb)->csum = skb_checksum(*pskb, udphoff,
222 (*pskb)->len - udphoff, 0);
223 udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
224 (*pskb)->len - udphoff,
225 cp->protocol,
226 (*pskb)->csum);
227 if (udph->check == 0)
228 udph->check = 0xFFFF;
229 (*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
230 }
231 return 1;
232}
233
234
235static int
236udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
237{
238 struct udphdr _udph, *uh;
239 unsigned int udphoff = skb->nh.iph->ihl*4;
240
241 uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
242 if (uh == NULL)
243 return 0;
244
245 if (uh->check != 0) {
246 switch (skb->ip_summed) {
247 case CHECKSUM_NONE:
248 skb->csum = skb_checksum(skb, udphoff,
249 skb->len - udphoff, 0);
250 case CHECKSUM_HW:
251 if (csum_tcpudp_magic(skb->nh.iph->saddr,
252 skb->nh.iph->daddr,
253 skb->len - udphoff,
254 skb->nh.iph->protocol,
255 skb->csum)) {
256 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
257 "Failed checksum for");
258 return 0;
259 }
260 break;
261 default:
262 /* CHECKSUM_UNNECESSARY */
263 break;
264 }
265 }
266 return 1;
267}
268
269
270/*
271 * Note: the caller guarantees that only one of register_app,
272 * unregister_app or app_conn_bind is called each time.
273 */
274
275#define UDP_APP_TAB_BITS 4
276#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
277#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
278
279static struct list_head udp_apps[UDP_APP_TAB_SIZE];
280static DEFINE_SPINLOCK(udp_app_lock);
281
282static inline __u16 udp_app_hashkey(__u16 port)
283{
284 return ((port >> UDP_APP_TAB_BITS) ^ port) & UDP_APP_TAB_MASK;
285}
286
287
288static int udp_register_app(struct ip_vs_app *inc)
289{
290 struct ip_vs_app *i;
291 __u16 hash, port = inc->port;
292 int ret = 0;
293
294 hash = udp_app_hashkey(port);
295
296
297 spin_lock_bh(&udp_app_lock);
298 list_for_each_entry(i, &udp_apps[hash], p_list) {
299 if (i->port == port) {
300 ret = -EEXIST;
301 goto out;
302 }
303 }
304 list_add(&inc->p_list, &udp_apps[hash]);
305 atomic_inc(&ip_vs_protocol_udp.appcnt);
306
307 out:
308 spin_unlock_bh(&udp_app_lock);
309 return ret;
310}
311
312
313static void
314udp_unregister_app(struct ip_vs_app *inc)
315{
316 spin_lock_bh(&udp_app_lock);
317 atomic_dec(&ip_vs_protocol_udp.appcnt);
318 list_del(&inc->p_list);
319 spin_unlock_bh(&udp_app_lock);
320}
321
322
323static int udp_app_conn_bind(struct ip_vs_conn *cp)
324{
325 int hash;
326 struct ip_vs_app *inc;
327 int result = 0;
328
329 /* Default binding: bind app only for NAT */
330 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
331 return 0;
332
333 /* Lookup application incarnations and bind the right one */
334 hash = udp_app_hashkey(cp->vport);
335
336 spin_lock(&udp_app_lock);
337 list_for_each_entry(inc, &udp_apps[hash], p_list) {
338 if (inc->port == cp->vport) {
339 if (unlikely(!ip_vs_app_inc_get(inc)))
340 break;
341 spin_unlock(&udp_app_lock);
342
343 IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
344 "%u.%u.%u.%u:%u to app %s on port %u\n",
345 __FUNCTION__,
346 NIPQUAD(cp->caddr), ntohs(cp->cport),
347 NIPQUAD(cp->vaddr), ntohs(cp->vport),
348 inc->name, ntohs(inc->port));
349 cp->app = inc;
350 if (inc->init_conn)
351 result = inc->init_conn(inc, cp);
352 goto out;
353 }
354 }
355 spin_unlock(&udp_app_lock);
356
357 out:
358 return result;
359}
360
361
362static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
363 [IP_VS_UDP_S_NORMAL] = 5*60*HZ,
364 [IP_VS_UDP_S_LAST] = 2*HZ,
365};
366
367static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
368 [IP_VS_UDP_S_NORMAL] = "UDP",
369 [IP_VS_UDP_S_LAST] = "BUG!",
370};
371
372
373static int
374udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
375{
376 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
377 udp_state_name_table, sname, to);
378}
379
380static const char * udp_state_name(int state)
381{
382 if (state >= IP_VS_UDP_S_LAST)
383 return "ERR!";
384 return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
385}
386
387static int
388udp_state_transition(struct ip_vs_conn *cp, int direction,
389 const struct sk_buff *skb,
390 struct ip_vs_protocol *pp)
391{
392 cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
393 return 1;
394}
395
396static void udp_init(struct ip_vs_protocol *pp)
397{
398 IP_VS_INIT_HASH_TABLE(udp_apps);
399 pp->timeout_table = udp_timeouts;
400}
401
402static void udp_exit(struct ip_vs_protocol *pp)
403{
404}
405
406
407struct ip_vs_protocol ip_vs_protocol_udp = {
408 .name = "UDP",
409 .protocol = IPPROTO_UDP,
410 .dont_defrag = 0,
411 .init = udp_init,
412 .exit = udp_exit,
413 .conn_schedule = udp_conn_schedule,
414 .conn_in_get = udp_conn_in_get,
415 .conn_out_get = udp_conn_out_get,
416 .snat_handler = udp_snat_handler,
417 .dnat_handler = udp_dnat_handler,
418 .csum_check = udp_csum_check,
419 .state_transition = udp_state_transition,
420 .state_name = udp_state_name,
421 .register_app = udp_register_app,
422 .unregister_app = udp_unregister_app,
423 .app_conn_bind = udp_app_conn_bind,
424 .debug_packet = ip_vs_tcpudp_debug_packet,
425 .timeout_change = NULL,
426 .set_state_timeout = udp_set_state_timeout,
427};
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c
new file mode 100644
index 000000000000..b23bab231cab
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_rr.c
@@ -0,0 +1,118 @@
1/*
2 * IPVS: Round-Robin Scheduling module
3 *
4 * Version: $Id: ip_vs_rr.c,v 1.9 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Peter Kese <peter.kese@ijs.si>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Fixes/Changes:
15 * Wensong Zhang : changed the ip_vs_rr_schedule to return dest
16 * Julian Anastasov : fixed the NULL pointer access bug in debugging
17 *     Wensong Zhang            :     changed some cosmetic things for debugging
18 * Wensong Zhang : changed for the d-linked destination list
19 * Wensong Zhang : added the ip_vs_rr_update_svc
20 *     Wensong Zhang            :     added quiescing of any dest with weight=0
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/kernel.h>
26
27#include <net/ip_vs.h>
28
29
30static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
31{
32 svc->sched_data = &svc->destinations;
33 return 0;
34}
35
36
37static int ip_vs_rr_done_svc(struct ip_vs_service *svc)
38{
39 return 0;
40}
41
42
43static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
44{
45 svc->sched_data = &svc->destinations;
46 return 0;
47}
48
49
50/*
51 * Round-Robin Scheduling
52 */
53static struct ip_vs_dest *
54ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
55{
56 struct list_head *p, *q;
57 struct ip_vs_dest *dest;
58
59 IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
60
61 write_lock(&svc->sched_lock);
62 p = (struct list_head *)svc->sched_data;
63 p = p->next;
64 q = p;
65 do {
66 /* skip list head */
67 if (q == &svc->destinations) {
68 q = q->next;
69 continue;
70 }
71
72 dest = list_entry(q, struct ip_vs_dest, n_list);
73 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
74 atomic_read(&dest->weight) > 0)
75 /* HIT */
76 goto out;
77 q = q->next;
78 } while (q != p);
79 write_unlock(&svc->sched_lock);
80 return NULL;
81
82 out:
83 svc->sched_data = q;
84 write_unlock(&svc->sched_lock);
85 IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u "
86 "activeconns %d refcnt %d weight %d\n",
87 NIPQUAD(dest->addr), ntohs(dest->port),
88 atomic_read(&dest->activeconns),
89 atomic_read(&dest->refcnt), atomic_read(&dest->weight));
90
91 return dest;
92}
93
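
The scheduler starts one entry past the saved cursor (svc->sched_data), skips the list head and any destination that is overloaded or quiesced with weight 0, and records the hit as the new cursor. The same logic over a plain array, as a user-space sketch (illustrative only):

static int rr_pick(const int *weight, int n, int *cursor)
{
	for (int tried = 0; tried < n; tried++) {
		*cursor = (*cursor + 1) % n;	/* advance circularly */
		if (weight[*cursor] > 0)
			return *cursor;		/* HIT */
	}
	return -1;				/* every destination quiesced */
}
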
94
95static struct ip_vs_scheduler ip_vs_rr_scheduler = {
96 .name = "rr", /* name */
97 .refcnt = ATOMIC_INIT(0),
98 .module = THIS_MODULE,
99 .init_service = ip_vs_rr_init_svc,
100 .done_service = ip_vs_rr_done_svc,
101 .update_service = ip_vs_rr_update_svc,
102 .schedule = ip_vs_rr_schedule,
103};
104
105static int __init ip_vs_rr_init(void)
106{
107 INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list);
108 return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
109}
110
111static void __exit ip_vs_rr_cleanup(void)
112{
113 unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
114}
115
116module_init(ip_vs_rr_init);
117module_exit(ip_vs_rr_cleanup);
118MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c
new file mode 100644
index 000000000000..0f7c56a225bd
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sched.c
@@ -0,0 +1,251 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_sched.c,v 1.13 2003/05/10 03:05:23 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
17 *
18 * Changes:
19 *
20 */
21
22#include <linux/module.h>
23#include <linux/sched.h>
24#include <linux/spinlock.h>
25#include <asm/string.h>
26#include <linux/kmod.h>
27
28#include <net/ip_vs.h>
29
30/*
31 * IPVS scheduler list
32 */
33static LIST_HEAD(ip_vs_schedulers);
34
35/* lock for service table */
36static DEFINE_RWLOCK(__ip_vs_sched_lock);
37
38
39/*
40 * Bind a service with a scheduler
41 */
42int ip_vs_bind_scheduler(struct ip_vs_service *svc,
43 struct ip_vs_scheduler *scheduler)
44{
45 int ret;
46
47 if (svc == NULL) {
48 IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
49 return -EINVAL;
50 }
51 if (scheduler == NULL) {
52 IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
53 return -EINVAL;
54 }
55
56 svc->scheduler = scheduler;
57
58 if (scheduler->init_service) {
59 ret = scheduler->init_service(svc);
60 if (ret) {
61 IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
62 return ret;
63 }
64 }
65
66 return 0;
67}
68
69
70/*
71 *  Unbind a service from its scheduler
72 */
73int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
74{
75 struct ip_vs_scheduler *sched;
76
77 if (svc == NULL) {
78 IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
79 return -EINVAL;
80 }
81
82 sched = svc->scheduler;
83 if (sched == NULL) {
84 IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
85 return -EINVAL;
86 }
87
88 if (sched->done_service) {
89 if (sched->done_service(svc) != 0) {
90 IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
91 return -EINVAL;
92 }
93 }
94
95 svc->scheduler = NULL;
96 return 0;
97}
98
99
100/*
101 * Get scheduler in the scheduler list by name
102 */
103static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
104{
105 struct ip_vs_scheduler *sched;
106
107 IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
108 sched_name);
109
110 read_lock_bh(&__ip_vs_sched_lock);
111
112 list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
113 /*
114 * Test and get the modules atomically
115 */
116 if (sched->module && !try_module_get(sched->module)) {
117 /*
118 * This scheduler is just deleted
119 */
120 continue;
121 }
122 if (strcmp(sched_name, sched->name)==0) {
123 /* HIT */
124 read_unlock_bh(&__ip_vs_sched_lock);
125 return sched;
126 }
127 if (sched->module)
128 module_put(sched->module);
129 }
130
131 read_unlock_bh(&__ip_vs_sched_lock);
132 return NULL;
133}
134
135
136/*
137 * Lookup scheduler and try to load it if it doesn't exist
138 */
139struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
140{
141 struct ip_vs_scheduler *sched;
142
143 /*
144 * Search for the scheduler by sched_name
145 */
146 sched = ip_vs_sched_getbyname(sched_name);
147
148 /*
149 * If scheduler not found, load the module and search again
150 */
151 if (sched == NULL) {
152 request_module("ip_vs_%s", sched_name);
153 sched = ip_vs_sched_getbyname(sched_name);
154 }
155
156 return sched;
157}
158
159void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
160{
161 if (scheduler->module)
162 module_put(scheduler->module);
163}
164
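
These helpers are meant to be called in a fixed sequence: look the scheduler up by name (possibly autoloading ip_vs_<name>), bind it to the service, and drop the module reference when the binding fails or is torn down. A condensed sketch of that contract (function name and error handling are illustrative, not taken from ip_vs_ctl.c):

static int example_bind(struct ip_vs_service *svc, const char *name)
{
	struct ip_vs_scheduler *sched;
	int ret;

	sched = ip_vs_scheduler_get(name);	/* may request_module("ip_vs_<name>") */
	if (sched == NULL)
		return -ENOENT;

	ret = ip_vs_bind_scheduler(svc, sched);
	if (ret)
		ip_vs_scheduler_put(sched);	/* give back the module reference */
	return ret;
}
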
165
166/*
167 * Register a scheduler in the scheduler list
168 */
169int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
170{
171 struct ip_vs_scheduler *sched;
172
173 if (!scheduler) {
174 IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
175 return -EINVAL;
176 }
177
178 if (!scheduler->name) {
179 IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
180 return -EINVAL;
181 }
182
183 /* increase the module use count */
184 ip_vs_use_count_inc();
185
186 /*
187 * Make sure that the scheduler with this name doesn't exist
188 * in the scheduler list.
189 */
190 sched = ip_vs_sched_getbyname(scheduler->name);
191 if (sched) {
192 ip_vs_scheduler_put(sched);
193 ip_vs_use_count_dec();
194 IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
195 "already existed in the system\n", scheduler->name);
196 return -EINVAL;
197 }
198
199 write_lock_bh(&__ip_vs_sched_lock);
200
201 if (scheduler->n_list.next != &scheduler->n_list) {
202 write_unlock_bh(&__ip_vs_sched_lock);
203 ip_vs_use_count_dec();
204 IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
205 "already linked\n", scheduler->name);
206 return -EINVAL;
207 }
208
209 /*
210 * Add it into the d-linked scheduler list
211 */
212 list_add(&scheduler->n_list, &ip_vs_schedulers);
213 write_unlock_bh(&__ip_vs_sched_lock);
214
215 IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name);
216
217 return 0;
218}
219
220
221/*
222 * Unregister a scheduler from the scheduler list
223 */
224int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
225{
226 if (!scheduler) {
227 IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n");
228 return -EINVAL;
229 }
230
231 write_lock_bh(&__ip_vs_sched_lock);
232 if (scheduler->n_list.next == &scheduler->n_list) {
233 write_unlock_bh(&__ip_vs_sched_lock);
234 IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler "
235 "is not in the list. failed\n", scheduler->name);
236 return -EINVAL;
237 }
238
239 /*
240 * Remove it from the d-linked scheduler list
241 */
242 list_del(&scheduler->n_list);
243 write_unlock_bh(&__ip_vs_sched_lock);
244
245 /* decrease the module use count */
246 ip_vs_use_count_dec();
247
248 IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name);
249
250 return 0;
251}
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c
new file mode 100644
index 000000000000..ff366f7390d9
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sed.c
@@ -0,0 +1,163 @@
1/*
2 * IPVS: Shortest Expected Delay scheduling module
3 *
4 * Version: $Id: ip_vs_sed.c,v 1.1 2003/05/10 03:06:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 *
15 */
16
17/*
18 * The SED algorithm attempts to minimize each job's expected delay until
19 * completion. The expected delay that the job will experience is
20 * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
21 * jobs on the ith server and Ui is the fixed service rate (weight) of
22 * the ith server. The SED algorithm adopts a greedy policy in which each
23 * job does what is in its own best interest, i.e. it joins the queue that
24 * would minimize its expected delay of completion.
25 *
26 * See the following paper for more information:
27 * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
28 * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
29 * pages 986-994, 1988.
30 *
31 * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
32 *
33 * The difference between SED and WLC is that SED includes the incoming
34 * job in the cost function (the increment of 1). SED may outperform
35 * WLC when scheduling big jobs in highly heterogeneous systems
36 * (where the server weights vary a lot).
37 *
38 */
39
40#include <linux/module.h>
41#include <linux/kernel.h>
42
43#include <net/ip_vs.h>
44
45
46static int
47ip_vs_sed_init_svc(struct ip_vs_service *svc)
48{
49 return 0;
50}
51
52
53static int
54ip_vs_sed_done_svc(struct ip_vs_service *svc)
55{
56 return 0;
57}
58
59
60static int
61ip_vs_sed_update_svc(struct ip_vs_service *svc)
62{
63 return 0;
64}
65
66
67static inline unsigned int
68ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
69{
70 /*
71 * We only use the active connection number in the cost
72 * calculation here.
73 */
74 return atomic_read(&dest->activeconns) + 1;
75}
76
77
78/*
79 *	Shortest Expected Delay scheduling
80 */
81static struct ip_vs_dest *
82ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
83{
84 struct ip_vs_dest *dest, *least;
85 unsigned int loh, doh;
86
87 IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n");
88
89 /*
90 * We calculate the load of each dest server as follows:
91 * (server expected overhead) / dest->weight
92 *
93 * Remember -- no floats in kernel mode!!!
94 * The comparison of h1*w2 > h2*w1 is equivalent to that of
95 * h1/w1 > h2/w2
96 * if every weight is larger than zero.
97 *
98 * The server with weight=0 is quiesced and will not receive any
99 * new connections.
100 */
101
102 list_for_each_entry(dest, &svc->destinations, n_list) {
103 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
104 atomic_read(&dest->weight) > 0) {
105 least = dest;
106 loh = ip_vs_sed_dest_overhead(least);
107 goto nextstage;
108 }
109 }
110 return NULL;
111
112 /*
113 * Find the destination with the least load.
114 */
115 nextstage:
116 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
117 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
118 continue;
119 doh = ip_vs_sed_dest_overhead(dest);
120 if (loh * atomic_read(&dest->weight) >
121 doh * atomic_read(&least->weight)) {
122 least = dest;
123 loh = doh;
124 }
125 }
126
127 IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u "
128 "activeconns %d refcnt %d weight %d overhead %d\n",
129 NIPQUAD(least->addr), ntohs(least->port),
130 atomic_read(&least->activeconns),
131 atomic_read(&least->refcnt),
132 atomic_read(&least->weight), loh);
133
134 return least;
135}
136
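
A worked instance of the integer comparison in the loop above, with illustrative numbers:

/*
 * server A: 3 active connections, weight 2  ->  overhead loh = 3 + 1 = 4
 * server B: 5 active connections, weight 4  ->  overhead doh = 5 + 1 = 6
 *
 * The expected delays are 4/2 = 2.0 and 6/4 = 1.5, but no floats are
 * allowed in kernel mode, so the code cross-multiplies instead:
 *
 *	loh * weight(B) > doh * weight(A)  <=>  4 * 4 > 6 * 2  <=>  16 > 12
 *
 * which holds, so B replaces A as the current "least" server.
 */
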
137
138static struct ip_vs_scheduler ip_vs_sed_scheduler =
139{
140 .name = "sed",
141 .refcnt = ATOMIC_INIT(0),
142 .module = THIS_MODULE,
143 .init_service = ip_vs_sed_init_svc,
144 .done_service = ip_vs_sed_done_svc,
145 .update_service = ip_vs_sed_update_svc,
146 .schedule = ip_vs_sed_schedule,
147};
148
149
150static int __init ip_vs_sed_init(void)
151{
152 INIT_LIST_HEAD(&ip_vs_sed_scheduler.n_list);
153 return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
154}
155
156static void __exit ip_vs_sed_cleanup(void)
157{
158 unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
159}
160
161module_init(ip_vs_sed_init);
162module_exit(ip_vs_sed_cleanup);
163MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
new file mode 100644
index 000000000000..6f7c50e44a39
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sh.c
@@ -0,0 +1,255 @@
1/*
2 * IPVS: Source Hashing scheduling module
3 *
4 * Version: $Id: ip_vs_sh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@gnuchina.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 *
15 */
16
17/*
18 * The sh algorithm selects the server by the hash key of the source IP
19 * address. The pseudo code is as follows:
20 *
21 * n <- servernode[src_ip];
22 * if (n is dead) OR
23 * (n is overloaded) or (n.weight <= 0) then
24 * return NULL;
25 *
26 * return n;
27 *
28 * Note that servernode is a 256-bucket hash table that maps the hash
29 * index derived from the packet source IP address to the current server
30 * array. If the sh scheduler is used in a cache cluster, it is good to
31 * combine it with the cache_bypass feature. When the statically assigned
32 * server is dead or overloaded, the load balancer can bypass the cache
33 * server and send requests to the original server directly.
34 *
35 */
36
37#include <linux/module.h>
38#include <linux/kernel.h>
39
40#include <net/ip_vs.h>
41
42
43/*
44 * IPVS SH bucket
45 */
46struct ip_vs_sh_bucket {
47 struct ip_vs_dest *dest; /* real server (cache) */
48};
49
50/*
51 * for IPVS SH entry hash table
52 */
53#ifndef CONFIG_IP_VS_SH_TAB_BITS
54#define CONFIG_IP_VS_SH_TAB_BITS 8
55#endif
56#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS
57#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
58#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
59
60
61/*
62 * Returns hash value for IPVS SH entry
63 */
64static inline unsigned ip_vs_sh_hashkey(__u32 addr)
65{
66 return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK;
67}
68
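
2654435761 is a constant widely used for multiplicative (Fibonacci-style) hashing, close to 2^32 divided by the golden ratio; multiplying the host-order address by it scrambles the high bits, and the mask keeps the low IP_VS_SH_TAB_BITS of the product. A stand-alone user-space equivalent (illustrative only):

#include <stdint.h>
#include <arpa/inet.h>		/* ntohl, inet_addr */

static unsigned int sh_bucket(uint32_t addr_be, unsigned int tab_mask)
{
	return (uint32_t)(ntohl(addr_be) * 2654435761UL) & tab_mask;
}

/* e.g. sh_bucket(inet_addr("192.0.2.10"), 0xff) always lands the same client
 * in the same bucket, hence on the same real server, as long as the
 * destination table is not reassigned. */
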
69
70/*
71 * Get ip_vs_dest associated with supplied parameters.
72 */
73static inline struct ip_vs_dest *
74ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __u32 addr)
75{
76 return (tbl[ip_vs_sh_hashkey(addr)]).dest;
77}
78
79
80/*
81 * Assign all the hash buckets of the specified table with the service.
82 */
83static int
84ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
85{
86 int i;
87 struct ip_vs_sh_bucket *b;
88 struct list_head *p;
89 struct ip_vs_dest *dest;
90
91 b = tbl;
92 p = &svc->destinations;
93 for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
94 if (list_empty(p)) {
95 b->dest = NULL;
96 } else {
97 if (p == &svc->destinations)
98 p = p->next;
99
100 dest = list_entry(p, struct ip_vs_dest, n_list);
101 atomic_inc(&dest->refcnt);
102 b->dest = dest;
103
104 p = p->next;
105 }
106 b++;
107 }
108 return 0;
109}
110
111
112/*
113 * Flush all the hash buckets of the specified table.
114 */
115static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
116{
117 int i;
118 struct ip_vs_sh_bucket *b;
119
120 b = tbl;
121 for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
122 if (b->dest) {
123 atomic_dec(&b->dest->refcnt);
124 b->dest = NULL;
125 }
126 b++;
127 }
128}
129
130
131static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
132{
133 struct ip_vs_sh_bucket *tbl;
134
135 /* allocate the SH table for this service */
136 tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
137 GFP_ATOMIC);
138 if (tbl == NULL) {
139 IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
140 return -ENOMEM;
141 }
142 svc->sched_data = tbl;
143 IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
144 "current service\n",
145 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
146
147 /* assign the hash buckets with the updated service */
148 ip_vs_sh_assign(tbl, svc);
149
150 return 0;
151}
152
153
154static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
155{
156 struct ip_vs_sh_bucket *tbl = svc->sched_data;
157
158 /* got to clean up hash buckets here */
159 ip_vs_sh_flush(tbl);
160
161 /* release the table itself */
162 kfree(svc->sched_data);
163 IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
164 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
165
166 return 0;
167}
168
169
170static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
171{
172 struct ip_vs_sh_bucket *tbl = svc->sched_data;
173
174 /* got to clean up hash buckets here */
175 ip_vs_sh_flush(tbl);
176
177 /* assign the hash buckets with the updated service */
178 ip_vs_sh_assign(tbl, svc);
179
180 return 0;
181}
182
183
184/*
185 * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
186 * consider that the server is overloaded here.
187 */
188static inline int is_overloaded(struct ip_vs_dest *dest)
189{
190 return dest->flags & IP_VS_DEST_F_OVERLOAD;
191}
192
193
194/*
195 * Source Hashing scheduling
196 */
197static struct ip_vs_dest *
198ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
199{
200 struct ip_vs_dest *dest;
201 struct ip_vs_sh_bucket *tbl;
202 struct iphdr *iph = skb->nh.iph;
203
204 IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
205
206 tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
207 dest = ip_vs_sh_get(tbl, iph->saddr);
208 if (!dest
209 || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
210 || atomic_read(&dest->weight) <= 0
211 || is_overloaded(dest)) {
212 return NULL;
213 }
214
215 IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
216 "--> server %u.%u.%u.%u:%d\n",
217 NIPQUAD(iph->saddr),
218 NIPQUAD(dest->addr),
219 ntohs(dest->port));
220
221 return dest;
222}
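/*
 * In other words, SH provides a stateless form of persistence: every
 * connection from a given client address maps to the same bucket and
 * hence the same real server, and NULL is returned (no fallback inside
 * this scheduler) when that server is unavailable, quiesced (weight 0)
 * or overloaded.
 */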
223
224
225/*
226 * IPVS SH Scheduler structure
227 */
228static struct ip_vs_scheduler ip_vs_sh_scheduler =
229{
230 .name = "sh",
231 .refcnt = ATOMIC_INIT(0),
232 .module = THIS_MODULE,
233 .init_service = ip_vs_sh_init_svc,
234 .done_service = ip_vs_sh_done_svc,
235 .update_service = ip_vs_sh_update_svc,
236 .schedule = ip_vs_sh_schedule,
237};
238
239
240static int __init ip_vs_sh_init(void)
241{
242 INIT_LIST_HEAD(&ip_vs_sh_scheduler.n_list);
243 return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
244}
245
246
247static void __exit ip_vs_sh_cleanup(void)
248{
249 unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
250}
251
252
253module_init(ip_vs_sh_init);
254module_exit(ip_vs_sh_cleanup);
255MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
new file mode 100644
index 000000000000..25c479550a32
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -0,0 +1,892 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 *
12 * ip_vs_sync: sync connection info from master load balancer to backups
13 * through multicast
14 *
15 * Changes:
16 * Alexandre Cassen : Added master & backup support at a time.
17 * Alexandre Cassen : Added SyncID support for incoming sync
18 * messages filtering.
19 * Justin Ossevoort : Fix endian problem on sync message size.
20 */
21
22#include <linux/module.h>
23#include <linux/slab.h>
24#include <linux/net.h>
25#include <linux/completion.h>
26#include <linux/delay.h>
27#include <linux/skbuff.h>
28#include <linux/in.h>
29#include <linux/igmp.h> /* for ip_mc_join_group */
30
31#include <net/ip.h>
32#include <net/sock.h>
33#include <asm/uaccess.h> /* for get_fs and set_fs */
34
35#include <net/ip_vs.h>
36
37#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
38#define IP_VS_SYNC_PORT 8848 /* multicast port */
39
40
41/*
42 * IPVS sync connection entry
43 */
44struct ip_vs_sync_conn {
45 __u8 reserved;
46
47 /* Protocol, addresses and port numbers */
48 __u8 protocol; /* Which protocol (TCP/UDP) */
49 __u16 cport;
50 __u16 vport;
51 __u16 dport;
52 __u32 caddr; /* client address */
53 __u32 vaddr; /* virtual address */
54 __u32 daddr; /* destination address */
55
56 /* Flags and state transition */
57 __u16 flags; /* status flags */
58 __u16 state; /* state info */
59
60 /* The sequence options start here */
61};
62
63struct ip_vs_sync_conn_options {
64 struct ip_vs_seq in_seq; /* incoming seq. struct */
65 struct ip_vs_seq out_seq; /* outgoing seq. struct */
66};
67
68#define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ)
69#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
70#define FULL_CONN_SIZE \
71(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
72
73
74/*
75      The master multicasts messages to the backup load balancers in the
76 following format.
77
78 0 1 2 3
79 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
80 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
81 | Count Conns | SyncID | Size |
82 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83 | |
84 | IPVS Sync Connection (1) |
85 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
86 | . |
87 | . |
88 | . |
89 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 | |
91 | IPVS Sync Connection (n) |
92 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
93*/
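/*
 * For example, a datagram carrying two connection entries without
 * sequence options is SYNC_MESG_HEADER_LEN + 2 * SIMPLE_CONN_SIZE
 * bytes long; "Size" is the total message length and is transmitted in
 * network byte order (see ip_vs_send_sync_msg() and
 * ip_vs_process_message() below).
 */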
94
95#define SYNC_MESG_HEADER_LEN 4
96
97struct ip_vs_sync_mesg {
98 __u8 nr_conns;
99 __u8 syncid;
100 __u16 size;
101
102 /* ip_vs_sync_conn entries start here */
103};
104
105/* the maximum length of sync (sending/receiving) message */
106static int sync_send_mesg_maxlen;
107static int sync_recv_mesg_maxlen;
108
109struct ip_vs_sync_buff {
110 struct list_head list;
111 unsigned long firstuse;
112
113 /* pointers for the message data */
114 struct ip_vs_sync_mesg *mesg;
115 unsigned char *head;
116 unsigned char *end;
117};
118
119
120/* the sync_buff list head and the lock */
121static LIST_HEAD(ip_vs_sync_queue);
122static DEFINE_SPINLOCK(ip_vs_sync_lock);
123
124/* current sync_buff for accepting new conn entries */
125static struct ip_vs_sync_buff *curr_sb = NULL;
126static DEFINE_SPINLOCK(curr_sb_lock);
127
128/* ipvs sync daemon state */
129volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
130volatile int ip_vs_master_syncid = 0;
131volatile int ip_vs_backup_syncid = 0;
132
133/* multicast interface name */
134char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
135char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
136
137/* multicast addr */
138static struct sockaddr_in mcast_addr;
139
140
141static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
142{
143 spin_lock(&ip_vs_sync_lock);
144 list_add_tail(&sb->list, &ip_vs_sync_queue);
145 spin_unlock(&ip_vs_sync_lock);
146}
147
148static inline struct ip_vs_sync_buff * sb_dequeue(void)
149{
150 struct ip_vs_sync_buff *sb;
151
152 spin_lock_bh(&ip_vs_sync_lock);
153 if (list_empty(&ip_vs_sync_queue)) {
154 sb = NULL;
155 } else {
156 sb = list_entry(ip_vs_sync_queue.next,
157 struct ip_vs_sync_buff,
158 list);
159 list_del(&sb->list);
160 }
161 spin_unlock_bh(&ip_vs_sync_lock);
162
163 return sb;
164}
165
166static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
167{
168 struct ip_vs_sync_buff *sb;
169
170 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
171 return NULL;
172
173 if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
174 kfree(sb);
175 return NULL;
176 }
177 sb->mesg->nr_conns = 0;
178 sb->mesg->syncid = ip_vs_master_syncid;
179 sb->mesg->size = 4;
180 sb->head = (unsigned char *)sb->mesg + 4;
181 sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
182 sb->firstuse = jiffies;
183 return sb;
184}
185
186static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
187{
188 kfree(sb->mesg);
189 kfree(sb);
190}
191
192/*
193 * Get the current sync buffer if it has been created for more
194 * than the specified time or the specified time is zero.
195 */
196static inline struct ip_vs_sync_buff *
197get_curr_sync_buff(unsigned long time)
198{
199 struct ip_vs_sync_buff *sb;
200
201 spin_lock_bh(&curr_sb_lock);
202 if (curr_sb && (time == 0 ||
203 time_before(jiffies - curr_sb->firstuse, time))) {
204 sb = curr_sb;
205 curr_sb = NULL;
206 } else
207 sb = NULL;
208 spin_unlock_bh(&curr_sb_lock);
209 return sb;
210}
211
212
213/*
214 *      Add the information of an ip_vs_conn entry into the current sync_buff.
215 * Called by ip_vs_in.
216 */
217void ip_vs_sync_conn(struct ip_vs_conn *cp)
218{
219 struct ip_vs_sync_mesg *m;
220 struct ip_vs_sync_conn *s;
221 int len;
222
223 spin_lock(&curr_sb_lock);
224 if (!curr_sb) {
225 if (!(curr_sb=ip_vs_sync_buff_create())) {
226 spin_unlock(&curr_sb_lock);
227 IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
228 return;
229 }
230 }
231
232 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
233 SIMPLE_CONN_SIZE;
234 m = curr_sb->mesg;
235 s = (struct ip_vs_sync_conn *)curr_sb->head;
236
237 /* copy members */
238 s->protocol = cp->protocol;
239 s->cport = cp->cport;
240 s->vport = cp->vport;
241 s->dport = cp->dport;
242 s->caddr = cp->caddr;
243 s->vaddr = cp->vaddr;
244 s->daddr = cp->daddr;
245 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
246 s->state = htons(cp->state);
247 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
248 struct ip_vs_sync_conn_options *opt =
249 (struct ip_vs_sync_conn_options *)&s[1];
250 memcpy(opt, &cp->in_seq, sizeof(*opt));
251 }
252
253 m->nr_conns++;
254 m->size += len;
255 curr_sb->head += len;
256
257 /* check if there is a space for next one */
258 if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
259 sb_queue_tail(curr_sb);
260 curr_sb = NULL;
261 }
262 spin_unlock(&curr_sb_lock);
263
264	/* synchronize its controller if it has one */
265 if (cp->control)
266 ip_vs_sync_conn(cp->control);
267}
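/*
 * Addresses and ports are copied as-is (they are already in network
 * byte order in the connection entry); only flags and state need the
 * htons() conversion above.  The recursive call at the end also
 * synchronizes the controlling connection (e.g. the FTP control
 * connection of a data connection), if there is one.
 */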
268
269
270/*
271 * Process received multicast message and create the corresponding
272 * ip_vs_conn entries.
273 */
274static void ip_vs_process_message(const char *buffer, const size_t buflen)
275{
276 struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
277 struct ip_vs_sync_conn *s;
278 struct ip_vs_sync_conn_options *opt;
279 struct ip_vs_conn *cp;
280 char *p;
281 int i;
282
283 /* Convert size back to host byte order */
284 m->size = ntohs(m->size);
285
286 if (buflen != m->size) {
287 IP_VS_ERR("bogus message\n");
288 return;
289 }
290
291 /* SyncID sanity check */
292 if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
293 IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
294 m->syncid);
295 return;
296 }
297
298 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
299 for (i=0; i<m->nr_conns; i++) {
300 s = (struct ip_vs_sync_conn *)p;
301 cp = ip_vs_conn_in_get(s->protocol,
302 s->caddr, s->cport,
303 s->vaddr, s->vport);
304 if (!cp) {
305 cp = ip_vs_conn_new(s->protocol,
306 s->caddr, s->cport,
307 s->vaddr, s->vport,
308 s->daddr, s->dport,
309 ntohs(s->flags), NULL);
310 if (!cp) {
311 IP_VS_ERR("ip_vs_conn_new failed\n");
312 return;
313 }
314 cp->state = ntohs(s->state);
315 } else if (!cp->dest) {
316 /* it is an entry created by the synchronization */
317 cp->state = ntohs(s->state);
318 cp->flags = ntohs(s->flags) | IP_VS_CONN_F_HASHED;
319 } /* Note that we don't touch its state and flags
320 if it is a normal entry. */
321
322 if (ntohs(s->flags) & IP_VS_CONN_F_SEQ_MASK) {
323 opt = (struct ip_vs_sync_conn_options *)&s[1];
324 memcpy(&cp->in_seq, opt, sizeof(*opt));
325 p += FULL_CONN_SIZE;
326 } else
327 p += SIMPLE_CONN_SIZE;
328
329 atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
330 cp->timeout = IP_VS_SYNC_CONN_TIMEOUT;
331 ip_vs_conn_put(cp);
332
333 if (p > buffer+buflen) {
334 IP_VS_ERR("bogus message\n");
335 return;
336 }
337 }
338}
339
340
341/*
342 * Setup loopback of outgoing multicasts on a sending socket
343 */
344static void set_mcast_loop(struct sock *sk, u_char loop)
345{
346 struct inet_sock *inet = inet_sk(sk);
347
348 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
349 lock_sock(sk);
350 inet->mc_loop = loop ? 1 : 0;
351 release_sock(sk);
352}
353
354/*
355 * Specify TTL for outgoing multicasts on a sending socket
356 */
357static void set_mcast_ttl(struct sock *sk, u_char ttl)
358{
359 struct inet_sock *inet = inet_sk(sk);
360
361 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
362 lock_sock(sk);
363 inet->mc_ttl = ttl;
364 release_sock(sk);
365}
366
367/*
369 *      Specify the default interface for outgoing multicasts
369 */
370static int set_mcast_if(struct sock *sk, char *ifname)
371{
372 struct net_device *dev;
373 struct inet_sock *inet = inet_sk(sk);
374
375 if ((dev = __dev_get_by_name(ifname)) == NULL)
376 return -ENODEV;
377
378 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
379 return -EINVAL;
380
381 lock_sock(sk);
382 inet->mc_index = dev->ifindex;
383 /* inet->mc_addr = 0; */
384 release_sock(sk);
385
386 return 0;
387}
388
389
390/*
391 * Set the maximum length of sync message according to the
392 * specified interface's MTU.
393 */
394static int set_sync_mesg_maxlen(int sync_state)
395{
396 struct net_device *dev;
397 int num;
398
399 if (sync_state == IP_VS_STATE_MASTER) {
400 if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL)
401 return -ENODEV;
402
403 num = (dev->mtu - sizeof(struct iphdr) -
404 sizeof(struct udphdr) -
405 SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
406 sync_send_mesg_maxlen =
407 SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num;
408 IP_VS_DBG(7, "setting the maximum length of sync sending "
409 "message %d.\n", sync_send_mesg_maxlen);
410 } else if (sync_state == IP_VS_STATE_BACKUP) {
411 if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL)
412 return -ENODEV;
413
414 sync_recv_mesg_maxlen = dev->mtu -
415 sizeof(struct iphdr) - sizeof(struct udphdr);
416 IP_VS_DBG(7, "setting the maximum length of sync receiving "
417 "message %d.\n", sync_recv_mesg_maxlen);
418 }
419
420 return 0;
421}
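/*
 * Worked example (assuming sizeof(struct ip_vs_sync_conn) == 24 on the
 * local ABI): for a master interface with MTU 1500,
 *      num = (1500 - 20 - 8 - 4 - 20) / 24 = 60
 * so sync_send_mesg_maxlen = 4 + 24 * 60 = 1444 bytes, which keeps a
 * full sync message comfortably inside one unfragmented UDP datagram.
 */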
422
423
424/*
425 * Join a multicast group.
426 *      The group is specified by a class D multicast address (224.0.0.0/4)
427 * in the in_addr structure passed in as a parameter.
428 */
429static int
430join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
431{
432 struct ip_mreqn mreq;
433 struct net_device *dev;
434 int ret;
435
436 memset(&mreq, 0, sizeof(mreq));
437 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
438
439 if ((dev = __dev_get_by_name(ifname)) == NULL)
440 return -ENODEV;
441 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
442 return -EINVAL;
443
444 mreq.imr_ifindex = dev->ifindex;
445
446 lock_sock(sk);
447 ret = ip_mc_join_group(sk, &mreq);
448 release_sock(sk);
449
450 return ret;
451}
452
453
454static int bind_mcastif_addr(struct socket *sock, char *ifname)
455{
456 struct net_device *dev;
457 u32 addr;
458 struct sockaddr_in sin;
459
460 if ((dev = __dev_get_by_name(ifname)) == NULL)
461 return -ENODEV;
462
463 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
464 if (!addr)
465 IP_VS_ERR("You probably need to specify IP address on "
466 "multicast interface.\n");
467
468 IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
469 ifname, NIPQUAD(addr));
470
471 /* Now bind the socket with the address of multicast interface */
472 sin.sin_family = AF_INET;
473 sin.sin_addr.s_addr = addr;
474 sin.sin_port = 0;
475
476 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
477}
478
479/*
480 * Set up sending multicast socket over UDP
481 */
482static struct socket * make_send_sock(void)
483{
484 struct socket *sock;
485
486 /* First create a socket */
487 if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
488 IP_VS_ERR("Error during creation of socket; terminating\n");
489 return NULL;
490 }
491
492 if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) {
493 IP_VS_ERR("Error setting outbound mcast interface\n");
494 goto error;
495 }
496
497 set_mcast_loop(sock->sk, 0);
498 set_mcast_ttl(sock->sk, 1);
499
500 if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) {
501 IP_VS_ERR("Error binding address of the mcast interface\n");
502 goto error;
503 }
504
505 if (sock->ops->connect(sock,
506 (struct sockaddr*)&mcast_addr,
507 sizeof(struct sockaddr), 0) < 0) {
508 IP_VS_ERR("Error connecting to the multicast addr\n");
509 goto error;
510 }
511
512 return sock;
513
514 error:
515 sock_release(sock);
516 return NULL;
517}
518
519
520/*
521 * Set up receiving multicast socket over UDP
522 */
523static struct socket * make_receive_sock(void)
524{
525 struct socket *sock;
526
527 /* First create a socket */
528 if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
529 IP_VS_ERR("Error during creation of socket; terminating\n");
530 return NULL;
531 }
532
533 /* it is equivalent to the REUSEADDR option in user-space */
534 sock->sk->sk_reuse = 1;
535
536 if (sock->ops->bind(sock,
537 (struct sockaddr*)&mcast_addr,
538 sizeof(struct sockaddr)) < 0) {
539 IP_VS_ERR("Error binding to the multicast addr\n");
540 goto error;
541 }
542
543 /* join the multicast group */
544 if (join_mcast_group(sock->sk,
545 (struct in_addr*)&mcast_addr.sin_addr,
546 ip_vs_backup_mcast_ifn) < 0) {
547 IP_VS_ERR("Error joining to the multicast group\n");
548 goto error;
549 }
550
551 return sock;
552
553 error:
554 sock_release(sock);
555 return NULL;
556}
557
558
559static int
560ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
561{
562 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
563 struct kvec iov;
564 int len;
565
566 EnterFunction(7);
567 iov.iov_base = (void *)buffer;
568 iov.iov_len = length;
569
570 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
571
572 LeaveFunction(7);
573 return len;
574}
575
576static void
577ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
578{
579 int msize;
580
581 msize = msg->size;
582
583 /* Put size in network byte order */
584 msg->size = htons(msg->size);
585
586 if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
587 IP_VS_ERR("ip_vs_send_async error\n");
588}
589
590static int
591ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
592{
593 struct msghdr msg = {NULL,};
594 struct kvec iov;
595 int len;
596
597 EnterFunction(7);
598
599 /* Receive a packet */
600 iov.iov_base = buffer;
601 iov.iov_len = (size_t)buflen;
602
603 len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
604
605 if (len < 0)
606 return -1;
607
608 LeaveFunction(7);
609 return len;
610}
611
612
613static DECLARE_WAIT_QUEUE_HEAD(sync_wait);
614static pid_t sync_master_pid = 0;
615static pid_t sync_backup_pid = 0;
616
617static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait);
618static int stop_master_sync = 0;
619static int stop_backup_sync = 0;
620
621static void sync_master_loop(void)
622{
623 struct socket *sock;
624 struct ip_vs_sync_buff *sb;
625
626 /* create the sending multicast socket */
627 sock = make_send_sock();
628 if (!sock)
629 return;
630
631 IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
632 "syncid = %d\n",
633 ip_vs_master_mcast_ifn, ip_vs_master_syncid);
634
635 for (;;) {
636 while ((sb=sb_dequeue())) {
637 ip_vs_send_sync_msg(sock, sb->mesg);
638 ip_vs_sync_buff_release(sb);
639 }
640
641 /* check if entries stay in curr_sb for 2 seconds */
642 if ((sb = get_curr_sync_buff(2*HZ))) {
643 ip_vs_send_sync_msg(sock, sb->mesg);
644 ip_vs_sync_buff_release(sb);
645 }
646
647 if (stop_master_sync)
648 break;
649
650 ssleep(1);
651 }
652
653 /* clean up the sync_buff queue */
654 while ((sb=sb_dequeue())) {
655 ip_vs_sync_buff_release(sb);
656 }
657
658 /* clean up the current sync_buff */
659 if ((sb = get_curr_sync_buff(0))) {
660 ip_vs_sync_buff_release(sb);
661 }
662
663 /* release the sending multicast socket */
664 sock_release(sock);
665}
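/*
 * Summary of the loop above: the master wakes up roughly once per
 * second, sends every buffer queued by ip_vs_sync_conn(), and also
 * flushes a partially filled current buffer once it is older than two
 * seconds, so connection state reaches the backups with a delay of at
 * most a few seconds.
 */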
666
667
668static void sync_backup_loop(void)
669{
670 struct socket *sock;
671 char *buf;
672 int len;
673
674 if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) {
675 IP_VS_ERR("sync_backup_loop: kmalloc error\n");
676 return;
677 }
678
679 /* create the receiving multicast socket */
680 sock = make_receive_sock();
681 if (!sock)
682 goto out;
683
684 IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
685 "syncid = %d\n",
686 ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
687
688 for (;;) {
689		/* is there data waiting on the socket? */
690 while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) {
691 if ((len =
692 ip_vs_receive(sock, buf,
693 sync_recv_mesg_maxlen)) <= 0) {
694 IP_VS_ERR("receiving message error\n");
695 break;
696 }
697			/* disable bottom half, because it accesses data
698			   shared with softirq while getting/creating conns */
699 local_bh_disable();
700 ip_vs_process_message(buf, len);
701 local_bh_enable();
702 }
703
704 if (stop_backup_sync)
705 break;
706
707 ssleep(1);
708 }
709
710	/* release the receiving multicast socket */
711 sock_release(sock);
712
713 out:
714 kfree(buf);
715}
716
717
718static void set_sync_pid(int sync_state, pid_t sync_pid)
719{
720 if (sync_state == IP_VS_STATE_MASTER)
721 sync_master_pid = sync_pid;
722 else if (sync_state == IP_VS_STATE_BACKUP)
723 sync_backup_pid = sync_pid;
724}
725
726static void set_stop_sync(int sync_state, int set)
727{
728 if (sync_state == IP_VS_STATE_MASTER)
729 stop_master_sync = set;
730 else if (sync_state == IP_VS_STATE_BACKUP)
731 stop_backup_sync = set;
732 else {
733 stop_master_sync = set;
734 stop_backup_sync = set;
735 }
736}
737
738static int sync_thread(void *startup)
739{
740 DECLARE_WAITQUEUE(wait, current);
741 mm_segment_t oldmm;
742 int state;
743 const char *name;
744
745 /* increase the module use count */
746 ip_vs_use_count_inc();
747
748 if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) {
749 state = IP_VS_STATE_MASTER;
750 name = "ipvs_syncmaster";
751 } else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) {
752 state = IP_VS_STATE_BACKUP;
753 name = "ipvs_syncbackup";
754 } else {
755 IP_VS_BUG();
756 ip_vs_use_count_dec();
757 return -EINVAL;
758 }
759
760 daemonize(name);
761
762 oldmm = get_fs();
763 set_fs(KERNEL_DS);
764
765 /* Block all signals */
766 spin_lock_irq(&current->sighand->siglock);
767 siginitsetinv(&current->blocked, 0);
768 recalc_sigpending();
769 spin_unlock_irq(&current->sighand->siglock);
770
771 /* set the maximum length of sync message */
772 set_sync_mesg_maxlen(state);
773
774 /* set up multicast address */
775 mcast_addr.sin_family = AF_INET;
776 mcast_addr.sin_port = htons(IP_VS_SYNC_PORT);
777 mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP);
778
779 add_wait_queue(&sync_wait, &wait);
780
781 set_sync_pid(state, current->pid);
782 complete((struct completion *)startup);
783
784 /* processing master/backup loop here */
785 if (state == IP_VS_STATE_MASTER)
786 sync_master_loop();
787 else if (state == IP_VS_STATE_BACKUP)
788 sync_backup_loop();
789 else IP_VS_BUG();
790
791 remove_wait_queue(&sync_wait, &wait);
792
793 /* thread exits */
794 set_sync_pid(state, 0);
795 IP_VS_INFO("sync thread stopped!\n");
796
797 set_fs(oldmm);
798
799 /* decrease the module use count */
800 ip_vs_use_count_dec();
801
802 set_stop_sync(state, 0);
803 wake_up(&stop_sync_wait);
804
805 return 0;
806}
807
808
809static int fork_sync_thread(void *startup)
810{
811 pid_t pid;
812
813	/* fork the sync thread here, so that the sync thread is reparented
814	   to the init process once this intermediate thread exits. */
815 repeat:
816 if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) {
817 IP_VS_ERR("could not create sync_thread due to %d... "
818 "retrying.\n", pid);
819 ssleep(1);
820 goto repeat;
821 }
822
823 return 0;
824}
825
826
827int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
828{
829 DECLARE_COMPLETION(startup);
830 pid_t pid;
831
832 if ((state == IP_VS_STATE_MASTER && sync_master_pid) ||
833 (state == IP_VS_STATE_BACKUP && sync_backup_pid))
834 return -EEXIST;
835
836 IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
837	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
838 sizeof(struct ip_vs_sync_conn));
839
840 ip_vs_sync_state |= state;
841 if (state == IP_VS_STATE_MASTER) {
842 strcpy(ip_vs_master_mcast_ifn, mcast_ifn);
843 ip_vs_master_syncid = syncid;
844 } else {
845 strcpy(ip_vs_backup_mcast_ifn, mcast_ifn);
846 ip_vs_backup_syncid = syncid;
847 }
848
849 repeat:
850 if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0) {
851 IP_VS_ERR("could not create fork_sync_thread due to %d... "
852 "retrying.\n", pid);
853 ssleep(1);
854 goto repeat;
855 }
856
857 wait_for_completion(&startup);
858
859 return 0;
860}
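/*
 * start_sync_thread()/stop_sync_thread() are normally driven from user
 * space through ip_vs_ctl.c; for example (assuming the standard
 * ipvsadm tool):
 *
 *      ipvsadm --start-daemon master --mcast-interface eth0 --syncid 1
 *      ipvsadm --start-daemon backup --mcast-interface eth0 --syncid 1
 *
 * starts the master loop on one director and the backup loop on
 * another, both using multicast group 224.0.0.81, port 8848.
 */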
861
862
863int stop_sync_thread(int state)
864{
865 DECLARE_WAITQUEUE(wait, current);
866
867 if ((state == IP_VS_STATE_MASTER && !sync_master_pid) ||
868 (state == IP_VS_STATE_BACKUP && !sync_backup_pid))
869 return -ESRCH;
870
871 IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
872 IP_VS_INFO("stopping sync thread %d ...\n",
873 (state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid);
874
875 __set_current_state(TASK_UNINTERRUPTIBLE);
876 add_wait_queue(&stop_sync_wait, &wait);
877 set_stop_sync(state, 1);
878 ip_vs_sync_state -= state;
879 wake_up(&sync_wait);
880 schedule();
881 __set_current_state(TASK_RUNNING);
882 remove_wait_queue(&stop_sync_wait, &wait);
883
884 /* Note: no need to reap the sync thread, because its parent
885 process is the init process */
886
887 if ((state == IP_VS_STATE_MASTER && stop_master_sync) ||
888 (state == IP_VS_STATE_BACKUP && stop_backup_sync))
889 IP_VS_BUG();
890
891 return 0;
892}
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c
new file mode 100644
index 000000000000..8a9d913261d8
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_wlc.c
@@ -0,0 +1,151 @@
1/*
2 * IPVS: Weighted Least-Connection Scheduling module
3 *
4 * Version: $Id: ip_vs_wlc.c,v 1.13 2003/04/18 09:03:16 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Peter Kese <peter.kese@ijs.si>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
16 * Wensong Zhang : changed to use the inactconns in scheduling
17 *     Wensong Zhang            :     changed some cosmetic things for debugging
18 * Wensong Zhang : changed for the d-linked destination list
19 * Wensong Zhang : added the ip_vs_wlc_update_svc
20 * Wensong Zhang : added any dest with weight=0 is quiesced
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/kernel.h>
26
27#include <net/ip_vs.h>
28
29
30static int
31ip_vs_wlc_init_svc(struct ip_vs_service *svc)
32{
33 return 0;
34}
35
36
37static int
38ip_vs_wlc_done_svc(struct ip_vs_service *svc)
39{
40 return 0;
41}
42
43
44static int
45ip_vs_wlc_update_svc(struct ip_vs_service *svc)
46{
47 return 0;
48}
49
50
51static inline unsigned int
52ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
53{
54 /*
55 * We think the overhead of processing active connections is 256
56	 * times higher than that of inactive connections on average. (This
57	 * factor of 256 may not be accurate; we may change it later.) We
58 * use the following formula to estimate the overhead now:
59 * dest->activeconns*256 + dest->inactconns
60 */
61 return (atomic_read(&dest->activeconns) << 8) +
62 atomic_read(&dest->inactconns);
63}
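/*
 * Example: a destination with 3 active and 10 inactive connections has
 * an overhead of 3*256 + 10 = 778, i.e. active connections dominate
 * unless the inactive count reaches the thousands.
 */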
64
65
66/*
67 * Weighted Least Connection scheduling
68 */
69static struct ip_vs_dest *
70ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
71{
72 struct ip_vs_dest *dest, *least;
73 unsigned int loh, doh;
74
75 IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
76
77 /*
78 * We calculate the load of each dest server as follows:
79 * (dest overhead) / dest->weight
80 *
81 * Remember -- no floats in kernel mode!!!
82 * The comparison of h1*w2 > h2*w1 is equivalent to that of
83 * h1/w1 > h2/w2
84 * if every weight is larger than zero.
85 *
86 * The server with weight=0 is quiesced and will not receive any
87 * new connections.
88 */
89
90 list_for_each_entry(dest, &svc->destinations, n_list) {
91 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
92 atomic_read(&dest->weight) > 0) {
93 least = dest;
94 loh = ip_vs_wlc_dest_overhead(least);
95 goto nextstage;
96 }
97 }
98 return NULL;
99
100 /*
101 * Find the destination with the least load.
102 */
103 nextstage:
104 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
105 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
106 continue;
107 doh = ip_vs_wlc_dest_overhead(dest);
108 if (loh * atomic_read(&dest->weight) >
109 doh * atomic_read(&least->weight)) {
110 least = dest;
111 loh = doh;
112 }
113 }
114
115 IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u "
116 "activeconns %d refcnt %d weight %d overhead %d\n",
117 NIPQUAD(least->addr), ntohs(least->port),
118 atomic_read(&least->activeconns),
119 atomic_read(&least->refcnt),
120 atomic_read(&least->weight), loh);
121
122 return least;
123}
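/*
 * Worked example of the cross-multiplied comparison above: with the
 * current least having loh=778 and weight 2, a candidate with doh=600
 * and weight 2 wins because 778*2 > 600*2, i.e. its overhead per unit
 * of weight (300) is lower than the current best (389).
 */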
124
125
126static struct ip_vs_scheduler ip_vs_wlc_scheduler =
127{
128 .name = "wlc",
129 .refcnt = ATOMIC_INIT(0),
130 .module = THIS_MODULE,
131 .init_service = ip_vs_wlc_init_svc,
132 .done_service = ip_vs_wlc_done_svc,
133 .update_service = ip_vs_wlc_update_svc,
134 .schedule = ip_vs_wlc_schedule,
135};
136
137
138static int __init ip_vs_wlc_init(void)
139{
140 INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list);
141 return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
142}
143
144static void __exit ip_vs_wlc_cleanup(void)
145{
146 unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
147}
148
149module_init(ip_vs_wlc_init);
150module_exit(ip_vs_wlc_cleanup);
151MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c
new file mode 100644
index 000000000000..749fa044eca5
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_wrr.c
@@ -0,0 +1,235 @@
1/*
2 * IPVS: Weighted Round-Robin Scheduling module
3 *
4 * Version: $Id: ip_vs_wrr.c,v 1.12 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
15 *     Wensong Zhang            :     changed some cosmetic things for debugging
16 * Wensong Zhang : changed for the d-linked destination list
17 * Wensong Zhang : added the ip_vs_wrr_update_svc
18 * Julian Anastasov : fixed the bug of returning destination
19 * with weight 0 when all weights are zero
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/kernel.h>
25
26#include <net/ip_vs.h>
27
28/*
29 * current destination pointer for weighted round-robin scheduling
30 */
31struct ip_vs_wrr_mark {
32 struct list_head *cl; /* current list head */
33 int cw; /* current weight */
34 int mw; /* maximum weight */
35 int di; /* decreasing interval */
36};
37
38
39/*
40 * Get the gcd of server weights
41 */
42static int gcd(int a, int b)
43{
44 int c;
45
46 while ((c = a % b)) {
47 a = b;
48 b = c;
49 }
50 return b;
51}
52
53static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
54{
55 struct ip_vs_dest *dest;
56 int weight;
57 int g = 0;
58
59 list_for_each_entry(dest, &svc->destinations, n_list) {
60 weight = atomic_read(&dest->weight);
61 if (weight > 0) {
62 if (g > 0)
63 g = gcd(weight, g);
64 else
65 g = weight;
66 }
67 }
68 return g ? g : 1;
69}
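/*
 * The gcd becomes the "decreasing interval" di of the WRR mark: the
 * current weight cw is lowered by di each time the scan wraps around
 * the destination list, so only weight levels that can actually match
 * are visited.  E.g. weights 4 and 6 give di = 2 and cw steps through
 * 6, 4, 2.
 */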
70
71
72/*
73 * Get the maximum weight of the service destinations.
74 */
75static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
76{
77 struct ip_vs_dest *dest;
78 int weight = 0;
79
80 list_for_each_entry(dest, &svc->destinations, n_list) {
81 if (atomic_read(&dest->weight) > weight)
82 weight = atomic_read(&dest->weight);
83 }
84
85 return weight;
86}
87
88
89static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
90{
91 struct ip_vs_wrr_mark *mark;
92
93 /*
94 * Allocate the mark variable for WRR scheduling
95 */
96 mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
97 if (mark == NULL) {
98 IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
99 return -ENOMEM;
100 }
101 mark->cl = &svc->destinations;
102 mark->cw = 0;
103 mark->mw = ip_vs_wrr_max_weight(svc);
104 mark->di = ip_vs_wrr_gcd_weight(svc);
105 svc->sched_data = mark;
106
107 return 0;
108}
109
110
111static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
112{
113 /*
114 * Release the mark variable
115 */
116 kfree(svc->sched_data);
117
118 return 0;
119}
120
121
122static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
123{
124 struct ip_vs_wrr_mark *mark = svc->sched_data;
125
126 mark->cl = &svc->destinations;
127 mark->mw = ip_vs_wrr_max_weight(svc);
128 mark->di = ip_vs_wrr_gcd_weight(svc);
129 if (mark->cw > mark->mw)
130 mark->cw = 0;
131 return 0;
132}
133
134
135/*
136 * Weighted Round-Robin Scheduling
137 */
138static struct ip_vs_dest *
139ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
140{
141 struct ip_vs_dest *dest;
142 struct ip_vs_wrr_mark *mark = svc->sched_data;
143 struct list_head *p;
144
145 IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
146
147 /*
148	 * This loop will always terminate, because mark->cw is in (0, max_weight]
149 * and at least one server has its weight equal to max_weight.
150 */
151 write_lock(&svc->sched_lock);
152 p = mark->cl;
153 while (1) {
154 if (mark->cl == &svc->destinations) {
155 /* it is at the head of the destination list */
156
157 if (mark->cl == mark->cl->next) {
158 /* no dest entry */
159 dest = NULL;
160 goto out;
161 }
162
163 mark->cl = svc->destinations.next;
164 mark->cw -= mark->di;
165 if (mark->cw <= 0) {
166 mark->cw = mark->mw;
167 /*
168 * Still zero, which means no available servers.
169 */
170 if (mark->cw == 0) {
171 mark->cl = &svc->destinations;
172 IP_VS_INFO("ip_vs_wrr_schedule(): "
173 "no available servers\n");
174 dest = NULL;
175 goto out;
176 }
177 }
178 } else
179 mark->cl = mark->cl->next;
180
181 if (mark->cl != &svc->destinations) {
182 /* not at the head of the list */
183 dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
184 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
185 atomic_read(&dest->weight) >= mark->cw) {
186 /* got it */
187 break;
188 }
189 }
190
191 if (mark->cl == p && mark->cw == mark->di) {
192 /* back to the start, and no dest is found.
193 It is only possible when all dests are OVERLOADED */
194 dest = NULL;
195 goto out;
196 }
197 }
198
199 IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u "
200 "activeconns %d refcnt %d weight %d\n",
201 NIPQUAD(dest->addr), ntohs(dest->port),
202 atomic_read(&dest->activeconns),
203 atomic_read(&dest->refcnt),
204 atomic_read(&dest->weight));
205
206 out:
207 write_unlock(&svc->sched_lock);
208 return dest;
209}
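/*
 * Example: with three destinations of weight 4, 3 and 2 (mw = 4,
 * di = 1), one full cw cycle 4, 3, 2, 1 picks the servers 4, 3 and 2
 * times respectively, so over time connections are distributed in the
 * ratio of the weights.
 */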
210
211
212static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
213 .name = "wrr",
214 .refcnt = ATOMIC_INIT(0),
215 .module = THIS_MODULE,
216 .init_service = ip_vs_wrr_init_svc,
217 .done_service = ip_vs_wrr_done_svc,
218 .update_service = ip_vs_wrr_update_svc,
219 .schedule = ip_vs_wrr_schedule,
220};
221
222static int __init ip_vs_wrr_init(void)
223{
224 INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list);
225	return register_ip_vs_scheduler(&ip_vs_wrr_scheduler);
226}
227
228static void __exit ip_vs_wrr_cleanup(void)
229{
230 unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
231}
232
233module_init(ip_vs_wrr_init);
234module_exit(ip_vs_wrr_cleanup);
235MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
new file mode 100644
index 000000000000..faa6176bbeb1
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -0,0 +1,563 @@
1/*
2 * ip_vs_xmit.c: various packet transmitters for IPVS
3 *
4 * Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Julian Anastasov <ja@ssi.bg>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/ip.h>
20#include <linux/tcp.h> /* for tcphdr */
21#include <net/tcp.h> /* for csum_tcpudp_magic */
22#include <net/udp.h>
23#include <net/icmp.h> /* for icmp_send */
24#include <net/route.h> /* for ip_route_output */
25#include <linux/netfilter.h>
26#include <linux/netfilter_ipv4.h>
27
28#include <net/ip_vs.h>
29
30
31/*
32 * Destination cache to speed up outgoing route lookup
33 */
34static inline void
35__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
36{
37 struct dst_entry *old_dst;
38
39 old_dst = dest->dst_cache;
40 dest->dst_cache = dst;
41 dest->dst_rtos = rtos;
42 dst_release(old_dst);
43}
44
45static inline struct dst_entry *
46__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
47{
48 struct dst_entry *dst = dest->dst_cache;
49
50 if (!dst)
51 return NULL;
52 if ((dst->obsolete || rtos != dest->dst_rtos) &&
53 dst->ops->check(dst, cookie) == NULL) {
54 dest->dst_cache = NULL;
55 dst_release(dst);
56 return NULL;
57 }
58 dst_hold(dst);
59 return dst;
60}
61
62static inline struct rtable *
63__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
64{
65 struct rtable *rt; /* Route to the other host */
66 struct ip_vs_dest *dest = cp->dest;
67
68 if (dest) {
69 spin_lock(&dest->dst_lock);
70 if (!(rt = (struct rtable *)
71 __ip_vs_dst_check(dest, rtos, 0))) {
72 struct flowi fl = {
73 .oif = 0,
74 .nl_u = {
75 .ip4_u = {
76 .daddr = dest->addr,
77 .saddr = 0,
78 .tos = rtos, } },
79 };
80
81 if (ip_route_output_key(&rt, &fl)) {
82 spin_unlock(&dest->dst_lock);
83 IP_VS_DBG_RL("ip_route_output error, "
84 "dest: %u.%u.%u.%u\n",
85 NIPQUAD(dest->addr));
86 return NULL;
87 }
88 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
89 IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
90 NIPQUAD(dest->addr),
91 atomic_read(&rt->u.dst.__refcnt), rtos);
92 }
93 spin_unlock(&dest->dst_lock);
94 } else {
95 struct flowi fl = {
96 .oif = 0,
97 .nl_u = {
98 .ip4_u = {
99 .daddr = cp->daddr,
100 .saddr = 0,
101 .tos = rtos, } },
102 };
103
104 if (ip_route_output_key(&rt, &fl)) {
105 IP_VS_DBG_RL("ip_route_output error, dest: "
106 "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
107 return NULL;
108 }
109 }
110
111 return rt;
112}
113
114
115/*
116 * Release dest->dst_cache before a dest is removed
117 */
118void
119ip_vs_dst_reset(struct ip_vs_dest *dest)
120{
121 struct dst_entry *old_dst;
122
123 old_dst = dest->dst_cache;
124 dest->dst_cache = NULL;
125 dst_release(old_dst);
126}
127
128#define IP_VS_XMIT(skb, rt) \
129do { \
130 nf_reset_debug(skb); \
131 (skb)->nfcache |= NFC_IPVS_PROPERTY; \
132 (skb)->ip_summed = CHECKSUM_NONE; \
133 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
134 (rt)->u.dst.dev, dst_output); \
135} while (0)
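/*
 * IP_VS_XMIT tags the skb with NFC_IPVS_PROPERTY (so IPVS will not
 * process it again), forces software checksumming (CHECKSUM_NONE) and
 * re-injects the packet at the LOCAL_OUT netfilter hook, from where
 * dst_output() sends it along the route already attached to skb->dst.
 */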
136
137
138/*
139 * NULL transmitter (do nothing except return NF_ACCEPT)
140 */
141int
142ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
143 struct ip_vs_protocol *pp)
144{
145 /* we do not touch skb and do not need pskb ptr */
146 return NF_ACCEPT;
147}
148
149
150/*
151 * Bypass transmitter
152 * Let packets bypass the destination when the destination is not
153 * available, it may be only used in transparent cache cluster.
154 */
155int
156ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
157 struct ip_vs_protocol *pp)
158{
159 struct rtable *rt; /* Route to the other host */
160 struct iphdr *iph = skb->nh.iph;
161 u8 tos = iph->tos;
162 int mtu;
163 struct flowi fl = {
164 .oif = 0,
165 .nl_u = {
166 .ip4_u = {
167 .daddr = iph->daddr,
168 .saddr = 0,
169 .tos = RT_TOS(tos), } },
170 };
171
172 EnterFunction(10);
173
174 if (ip_route_output_key(&rt, &fl)) {
175 IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
176 "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
177 goto tx_error_icmp;
178 }
179
180 /* MTU checking */
181 mtu = dst_mtu(&rt->u.dst);
182 if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
183 ip_rt_put(rt);
184 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
185 IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
186 goto tx_error;
187 }
188
189 /*
190 * Call ip_send_check because we are not sure it is called
191 * after ip_defrag. Is copy-on-write needed?
192 */
193 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
194 ip_rt_put(rt);
195 return NF_STOLEN;
196 }
197 ip_send_check(skb->nh.iph);
198
199 /* drop old route */
200 dst_release(skb->dst);
201 skb->dst = &rt->u.dst;
202
203 /* Another hack: avoid icmp_send in ip_fragment */
204 skb->local_df = 1;
205
206 IP_VS_XMIT(skb, rt);
207
208 LeaveFunction(10);
209 return NF_STOLEN;
210
211 tx_error_icmp:
212 dst_link_failure(skb);
213 tx_error:
214 kfree_skb(skb);
215 LeaveFunction(10);
216 return NF_STOLEN;
217}
218
219
220/*
221 * NAT transmitter (only for outside-to-inside nat forwarding)
222 * Not used for related ICMP
223 */
224int
225ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
226 struct ip_vs_protocol *pp)
227{
228 struct rtable *rt; /* Route to the other host */
229 int mtu;
230 struct iphdr *iph = skb->nh.iph;
231
232 EnterFunction(10);
233
234 /* check if it is a connection of no-client-port */
235 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
236 __u16 _pt, *p;
237 p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
238 if (p == NULL)
239 goto tx_error;
240 ip_vs_conn_fill_cport(cp, *p);
241 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
242 }
243
244 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
245 goto tx_error_icmp;
246
247 /* MTU checking */
248 mtu = dst_mtu(&rt->u.dst);
249 if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
250 ip_rt_put(rt);
251 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
252 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
253 goto tx_error;
254 }
255
256 /* copy-on-write the packet before mangling it */
257 if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr)))
258 goto tx_error_put;
259
260 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
261 goto tx_error_put;
262
263 /* drop old route */
264 dst_release(skb->dst);
265 skb->dst = &rt->u.dst;
266
267 /* mangle the packet */
268 if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp))
269 goto tx_error;
270 skb->nh.iph->daddr = cp->daddr;
271 ip_send_check(skb->nh.iph);
272
273 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
274
275 /* FIXME: when application helper enlarges the packet and the length
276 is larger than the MTU of outgoing device, there will be still
277 MTU problem. */
278
279 /* Another hack: avoid icmp_send in ip_fragment */
280 skb->local_df = 1;
281
282 IP_VS_XMIT(skb, rt);
283
284 LeaveFunction(10);
285 return NF_STOLEN;
286
287 tx_error_icmp:
288 dst_link_failure(skb);
289 tx_error:
290 LeaveFunction(10);
291 kfree_skb(skb);
292 return NF_STOLEN;
293 tx_error_put:
294 ip_rt_put(rt);
295 goto tx_error;
296}
297
298
299/*
300 * IP Tunneling transmitter
301 *
302 * This function encapsulates the packet in a new IP packet, its
303 * destination will be set to cp->daddr. Most code of this function
304 * is taken from ipip.c.
305 *
306 * It is used in VS/TUN cluster. The load balancer selects a real
307 * server from a cluster based on a scheduling algorithm,
308 * encapsulates the request packet and forwards it to the selected
309 * server. For example, all real servers are configured with
310 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
311 *   the encapsulated packet, it will decapsulate the packet, process
312 *   the request and return the response packets directly to the client
313 *   without passing through the load balancer. This can greatly
314 *   increase the scalability of the virtual server.
315 *
316 * Used for ANY protocol
317 */
318int
319ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
320 struct ip_vs_protocol *pp)
321{
322 struct rtable *rt; /* Route to the other host */
323 struct net_device *tdev; /* Device to other host */
324 struct iphdr *old_iph = skb->nh.iph;
325 u8 tos = old_iph->tos;
326 u16 df = old_iph->frag_off;
327 struct iphdr *iph; /* Our new IP header */
328 int max_headroom; /* The extra header space needed */
329 int mtu;
330
331 EnterFunction(10);
332
333 if (skb->protocol != __constant_htons(ETH_P_IP)) {
334 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
335 "ETH_P_IP: %d, skb protocol: %d\n",
336 __constant_htons(ETH_P_IP), skb->protocol);
337 goto tx_error;
338 }
339
340 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
341 goto tx_error_icmp;
342
343 tdev = rt->u.dst.dev;
344
345 mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
346 if (mtu < 68) {
347 ip_rt_put(rt);
348 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
349 goto tx_error;
350 }
351 if (skb->dst)
352 skb->dst->ops->update_pmtu(skb->dst, mtu);
353
354 df |= (old_iph->frag_off&__constant_htons(IP_DF));
355
356 if ((old_iph->frag_off&__constant_htons(IP_DF))
357 && mtu < ntohs(old_iph->tot_len)) {
358 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
359 ip_rt_put(rt);
360 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
361 goto tx_error;
362 }
363
364 /*
365 * Okay, now see if we can stuff it in the buffer as-is.
366 */
367 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
368
369 if (skb_headroom(skb) < max_headroom
370 || skb_cloned(skb) || skb_shared(skb)) {
371 struct sk_buff *new_skb =
372 skb_realloc_headroom(skb, max_headroom);
373 if (!new_skb) {
374 ip_rt_put(rt);
375 kfree_skb(skb);
376 IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
377 return NF_STOLEN;
378 }
379 kfree_skb(skb);
380 skb = new_skb;
381 old_iph = skb->nh.iph;
382 }
383
384 skb->h.raw = (void *) old_iph;
385
386 /* fix old IP header checksum */
387 ip_send_check(old_iph);
388
389 skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
390 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
391
392 /* drop old route */
393 dst_release(skb->dst);
394 skb->dst = &rt->u.dst;
395
396 /*
397 * Push down and install the IPIP header.
398 */
399 iph = skb->nh.iph;
400 iph->version = 4;
401 iph->ihl = sizeof(struct iphdr)>>2;
402 iph->frag_off = df;
403 iph->protocol = IPPROTO_IPIP;
404 iph->tos = tos;
405 iph->daddr = rt->rt_dst;
406 iph->saddr = rt->rt_src;
407 iph->ttl = old_iph->ttl;
408 iph->tot_len = htons(skb->len);
409 ip_select_ident(iph, &rt->u.dst, NULL);
410 ip_send_check(iph);
411
412 /* Another hack: avoid icmp_send in ip_fragment */
413 skb->local_df = 1;
414
415 IP_VS_XMIT(skb, rt);
416
417 LeaveFunction(10);
418
419 return NF_STOLEN;
420
421 tx_error_icmp:
422 dst_link_failure(skb);
423 tx_error:
424 kfree_skb(skb);
425 LeaveFunction(10);
426 return NF_STOLEN;
427}
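/*
 * MTU example for the tunnel path: encapsulation adds one struct iphdr
 * (20 bytes), so with a 1500-byte route MTU the usable payload is
 * 1480 bytes; a 1490-byte packet with DF set is bounced back to the
 * sender with ICMP_FRAG_NEEDED advertising an MTU of 1480.
 */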
428
429
430/*
431 * Direct Routing transmitter
432 * Used for ANY protocol
433 */
434int
435ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
436 struct ip_vs_protocol *pp)
437{
438 struct rtable *rt; /* Route to the other host */
439 struct iphdr *iph = skb->nh.iph;
440 int mtu;
441
442 EnterFunction(10);
443
444 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
445 goto tx_error_icmp;
446
447 /* MTU checking */
448 mtu = dst_mtu(&rt->u.dst);
449 if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
450 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
451 ip_rt_put(rt);
452 IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
453 goto tx_error;
454 }
455
456 /*
457 * Call ip_send_check because we are not sure it is called
458 * after ip_defrag. Is copy-on-write needed?
459 */
460 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
461 ip_rt_put(rt);
462 return NF_STOLEN;
463 }
464 ip_send_check(skb->nh.iph);
465
466 /* drop old route */
467 dst_release(skb->dst);
468 skb->dst = &rt->u.dst;
469
470 /* Another hack: avoid icmp_send in ip_fragment */
471 skb->local_df = 1;
472
473 IP_VS_XMIT(skb, rt);
474
475 LeaveFunction(10);
476 return NF_STOLEN;
477
478 tx_error_icmp:
479 dst_link_failure(skb);
480 tx_error:
481 kfree_skb(skb);
482 LeaveFunction(10);
483 return NF_STOLEN;
484}
485
486
487/*
488 * ICMP packet transmitter
489 * called by the ip_vs_in_icmp
490 */
491int
492ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
493 struct ip_vs_protocol *pp, int offset)
494{
495 struct rtable *rt; /* Route to the other host */
496 int mtu;
497 int rc;
498
499 EnterFunction(10);
500
501 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
502 forwarded directly here, because there is no need to
503 translate address/port back */
504 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
505 if (cp->packet_xmit)
506 rc = cp->packet_xmit(skb, cp, pp);
507 else
508 rc = NF_ACCEPT;
509 /* do not touch skb anymore */
510 atomic_inc(&cp->in_pkts);
511 __ip_vs_conn_put(cp);
512 goto out;
513 }
514
515 /*
516 * mangle and send the packet here (only for VS/NAT)
517 */
518
519 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos))))
520 goto tx_error_icmp;
521
522 /* MTU checking */
523 mtu = dst_mtu(&rt->u.dst);
524 if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) {
525 ip_rt_put(rt);
526 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
527 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
528 goto tx_error;
529 }
530
531 /* copy-on-write the packet before mangling it */
532 if (!ip_vs_make_skb_writable(&skb, offset))
533 goto tx_error_put;
534
535 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
536 goto tx_error_put;
537
538 /* drop the old route when skb is not shared */
539 dst_release(skb->dst);
540 skb->dst = &rt->u.dst;
541
542 ip_vs_nat_icmp(skb, pp, cp, 0);
543
544 /* Another hack: avoid icmp_send in ip_fragment */
545 skb->local_df = 1;
546
547 IP_VS_XMIT(skb, rt);
548
549 rc = NF_STOLEN;
550 goto out;
551
552 tx_error_icmp:
553 dst_link_failure(skb);
554 tx_error:
555 dev_kfree_skb(skb);
556 rc = NF_STOLEN;
557 out:
558 LeaveFunction(10);
559 return rc;
560 tx_error_put:
561 ip_rt_put(rt);
562 goto tx_error;
563}