author    Julius Volz <juliusv@google.com>   2008-09-19 06:32:57 -0400
committer Simon Horman <horms@verge.net.au>  2008-10-06 17:38:24 -0400
commit    cb7f6a7b716e801097b564dec3ccb58d330aef56 (patch)
tree      92fa8fa5381e04576c43eab88874ab54ea670767 /net/ipv4/ipvs
parent    8d5803bf6fbe5264000afc8c34bff08e8ecc023b (diff)
IPVS: Move IPVS to net/netfilter/ipvs
Since IPVS now has partial IPv6 support, this patch moves IPVS from
net/ipv4/ipvs to net/netfilter/ipvs. It's a result of:

$ git mv net/ipv4/ipvs net/netfilter

and adapting the relevant Kconfigs/Makefiles to the new path.

Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
Diffstat (limited to 'net/ipv4/ipvs')
-rw-r--r--  net/ipv4/ipvs/Kconfig                  239
-rw-r--r--  net/ipv4/ipvs/Makefile                  33
-rw-r--r--  net/ipv4/ipvs/ip_vs_app.c              622
-rw-r--r--  net/ipv4/ipvs/ip_vs_conn.c            1110
-rw-r--r--  net/ipv4/ipvs/ip_vs_core.c            1542
-rw-r--r--  net/ipv4/ipvs/ip_vs_ctl.c             3443
-rw-r--r--  net/ipv4/ipvs/ip_vs_est.c              166
-rw-r--r--  net/ipv4/ipvs/ip_vs_ftp.c              410
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblc.c             555
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblcr.c            755
-rw-r--r--  net/ipv4/ipvs/ip_vs_lc.c               103
-rw-r--r--  net/ipv4/ipvs/ip_vs_nq.c               138
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto.c            288
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_ah_esp.c     235
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_tcp.c        732
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_udp.c        533
-rw-r--r--  net/ipv4/ipvs/ip_vs_rr.c               112
-rw-r--r--  net/ipv4/ipvs/ip_vs_sched.c            251
-rw-r--r--  net/ipv4/ipvs/ip_vs_sed.c              140
-rw-r--r--  net/ipv4/ipvs/ip_vs_sh.c               258
-rw-r--r--  net/ipv4/ipvs/ip_vs_sync.c             942
-rw-r--r--  net/ipv4/ipvs/ip_vs_wlc.c              128
-rw-r--r--  net/ipv4/ipvs/ip_vs_wrr.c              237
-rw-r--r--  net/ipv4/ipvs/ip_vs_xmit.c            1004
24 files changed, 0 insertions, 13976 deletions
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
deleted file mode 100644
index de6004de80bc..000000000000
--- a/net/ipv4/ipvs/Kconfig
+++ /dev/null
@@ -1,239 +0,0 @@
1#
2# IP Virtual Server configuration
3#
4menuconfig IP_VS
5 tristate "IP virtual server support (EXPERIMENTAL)"
6 depends on NETFILTER
7 ---help---
8 IP Virtual Server support will let you build a high-performance
9 virtual server based on a cluster of two or more real servers. This
10 option must be enabled for at least one of the clustered computers
11 that will take care of intercepting incoming connections to a
12 single IP address and scheduling them to real servers.
13
14 Three request dispatching techniques are implemented: virtual
15 server via NAT, virtual server via tunneling and virtual
16 server via direct routing. Several scheduling algorithms can
17 be used to choose which server a connection is directed to,
18 so load balancing can be achieved among the servers. For more
19 information and its administration program, please visit the
20 following URL: <http://www.linuxvirtualserver.org/>.
21
22 If you want to compile it in kernel, say Y. To compile it as a
23 module, choose M here. If unsure, say N.
24
25if IP_VS
26
27config IP_VS_IPV6
28 bool "IPv6 support for IPVS (DANGEROUS)"
29 depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6)
30 ---help---
31 Add IPv6 support to IPVS. This is incomplete and might be dangerous.
32
33 Say N if unsure.
34
35config IP_VS_DEBUG
36 bool "IP virtual server debugging"
37 ---help---
38 Say Y here if you want to get additional messages useful in
39 debugging the IP virtual server code. You can change the debug
40 level in /proc/sys/net/ipv4/vs/debug_level
41
42config IP_VS_TAB_BITS
43 int "IPVS connection table size (the Nth power of 2)"
44 range 8 20
45 default 12
46 ---help---
47 The IPVS connection hash table uses the chaining scheme to handle
48 hash collisions. Using a big IPVS connection hash table will greatly
49 reduce conflicts when there are hundreds of thousands of connections
50 in the hash table.
51
52 Note that the table size must be a power of 2. The table size will
53 be 2 raised to the power of the number you enter. The number can be
54 chosen from 8 to 20; the default is 12, which means the table size
55 is 4096. Don't choose a number that is too small, otherwise you
56 will lose performance. You can adapt the table size yourself,
57 according to your virtual server application. It is good to set the
58 table size not far less than the number of connections per second
59 multiplied by the average time a connection stays in the table. For
60 example, if your virtual server gets 200 connections per second and
61 a connection stays in the table for 200 seconds on average, the
62 table size should be not far less than 200x200; it is good to set
63 the table size to 32768 (2**15).
64
65 Note also that each connection effectively occupies about 128 bytes
66 and each hash entry uses 8 bytes, so you can estimate how much memory
67 your box needs (a rough calculation is sketched below).
68
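A minimal sketch of that memory estimate (not part of the original Kconfig
text; the helper name and the standalone program are hypothetical, only the
8-byte and 128-byte figures come from the note above):

#include <stdio.h>

/* Rough memory estimate for the IPVS connection table: each hash bucket
 * costs 8 bytes, each tracked connection roughly 128 bytes. */
static unsigned long ipvs_conn_tab_bytes(unsigned int tab_bits,
                                         unsigned long expected_conns)
{
        unsigned long buckets = 1UL << tab_bits;   /* CONFIG_IP_VS_TAB_BITS */

        return buckets * 8 + expected_conns * 128;
}

int main(void)
{
        /* 200 conns/s lasting ~200 s each => about 40000 concurrent entries,
         * so a table of 2**15 = 32768 buckets is a reasonable choice */
        printf("%lu bytes\n", ipvs_conn_tab_bytes(15, 200UL * 200UL));
        return 0;
}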
69comment "IPVS transport protocol load balancing support"
70
71config IP_VS_PROTO_TCP
72 bool "TCP load balancing support"
73 ---help---
74 This option enables support for load balancing TCP transport
75 protocol. Say Y if unsure.
76
77config IP_VS_PROTO_UDP
78 bool "UDP load balancing support"
79 ---help---
80 This option enables support for load balancing UDP transport
81 protocol. Say Y if unsure.
82
83config IP_VS_PROTO_AH_ESP
84 bool
85 depends on UNDEFINED
86
87config IP_VS_PROTO_ESP
88 bool "ESP load balancing support"
89 select IP_VS_PROTO_AH_ESP
90 ---help---
91 This option enables support for load balancing ESP (Encapsulation
92 Security Payload) transport protocol. Say Y if unsure.
93
94config IP_VS_PROTO_AH
95 bool "AH load balancing support"
96 select IP_VS_PROTO_AH_ESP
97 ---help---
98 This option enables support for load balancing AH (Authentication
99 Header) transport protocol. Say Y if unsure.
100
101comment "IPVS scheduler"
102
103config IP_VS_RR
104 tristate "round-robin scheduling"
105 ---help---
106 The round-robin scheduling algorithm simply directs network
107 connections to different real servers in a round-robin manner.
108
109 If you want to compile it in kernel, say Y. To compile it as a
110 module, choose M here. If unsure, say N.
111
112config IP_VS_WRR
113 tristate "weighted round-robin scheduling"
114 ---help---
115 The weighted round-robin scheduling algorithm directs network
116 connections to different real servers based on server weights
117 in a round-robin manner. Servers with higher weights receive
118 new connections before those with lower weights, and servers
119 with higher weights get more connections than those with lower
120 weights; servers with equal weights get an equal share.
121
122 If you want to compile it in kernel, say Y. To compile it as a
123 module, choose M here. If unsure, say N.
124
125config IP_VS_LC
126 tristate "least-connection scheduling"
127 ---help---
128 The least-connection scheduling algorithm directs network
129 connections to the server with the least number of active
130 connections.
131
132 If you want to compile it in kernel, say Y. To compile it as a
133 module, choose M here. If unsure, say N.
134
135config IP_VS_WLC
136 tristate "weighted least-connection scheduling"
137 ---help---
138 The weighted least-connection scheduling algorithm directs network
139 connections to the server with the least active connections
140 normalized by the server weight (see the sketch below).
141
142 If you want to compile it in kernel, say Y. To compile it as a
143 module, choose M here. If unsure, say N.
144
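A minimal sketch of that selection rule (illustrative only, not taken from
ip_vs_wlc.c; the struct and helper names are made up). Dividing connections
by weight can be avoided by cross-multiplying: server A is preferred over B
when connsA * weightB < connsB * weightA.

#include <stddef.h>

struct server {
        long conns;    /* current active connections */
        long weight;   /* configured weight, > 0 to be usable */
};

/* Pick the server with the smallest conns/weight ratio without dividing:
 * compare a.conns * b.weight against b.conns * a.weight. */
static const struct server *wlc_pick(const struct server *srv, size_t n)
{
        const struct server *best = NULL;
        size_t i;

        for (i = 0; i < n; i++) {
                if (srv[i].weight <= 0)
                        continue;      /* weight 0 means "do not schedule" */
                if (!best ||
                    srv[i].conns * best->weight < best->conns * srv[i].weight)
                        best = &srv[i];
        }
        return best;
}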
145config IP_VS_LBLC
146 tristate "locality-based least-connection scheduling"
147 ---help---
148 The locality-based least-connection scheduling algorithm is for
149 destination IP load balancing. It is usually used in cache clusters.
150 This algorithm usually directs packets destined for an IP address to
151 its server if the server is alive and not overloaded. If the server
152 is overloaded (its number of active connections exceeds its weight)
153 and there is a server at half of its load, the weighted
154 least-connection server is allocated to this IP address (see below).
155
156 If you want to compile it in kernel, say Y. To compile it as a
157 module, choose M here. If unsure, say N.
158
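A sketch of the decision described above (illustrative only, not the kernel's
ip_vs_lblc.c; the types, the half-load test and the wlc fallback are assumed
from the help text):

#include <stddef.h>

struct dest {
        long conns;    /* active connections */
        long weight;   /* > 0 for usable servers */
        int  alive;
};

static int overloaded(const struct dest *d)
{
        return d->conns > d->weight;
}

/* 'cached' is the server previously assigned to this destination IP,
 * 'set' is the full server list and 'wlc' is the current weighted
 * least-connection pick from that list. */
static const struct dest *lblc_schedule(const struct dest *cached,
                                        const struct dest *set, size_t n,
                                        const struct dest *wlc)
{
        size_t i;

        if (cached && cached->alive && !overloaded(cached))
                return cached;          /* stick to the cached server */

        if (cached && cached->alive)    /* overloaded: move only if some */
                for (i = 0; i < n; i++) /* server runs at half load      */
                        if (set[i].conns * 2 < set[i].weight)
                                return wlc;

        if (!cached || !cached->alive)
                return wlc;             /* no usable mapping yet */

        return cached;
}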
159config IP_VS_LBLCR
160 tristate "locality-based least-connection with replication scheduling"
161 ---help---
162 The locality-based least-connection with replication scheduling
163 algorithm is also for destination IP load balancing. It is
164 usually used in cache clusters. It differs from the LBLC scheduling
165 as follows: the load balancer maintains mappings from a target
166 to a set of server nodes that can serve the target. Requests for
167 a target are assigned to the least-connection node in the target's
168 server set. If all the nodes in the server set are overloaded,
169 it picks up the least-connection node in the cluster and adds it
170 to the server set for the target. If the server set has not been
171 modified for the specified time, the most loaded node is removed
172 from the server set, in order to avoid a high degree of replication.
173
174 If you want to compile it in kernel, say Y. To compile it as a
175 module, choose M here. If unsure, say N.
176
177config IP_VS_DH
178 tristate "destination hashing scheduling"
179 ---help---
180 The destination hashing scheduling algorithm assigns network
181 connections to the servers by looking up a statically assigned
182 hash table keyed by their destination IP addresses.
183
184 If you want to compile it in kernel, say Y. To compile it as a
185 module, choose M here. If unsure, say N.
186
187config IP_VS_SH
188 tristate "source hashing scheduling"
189 ---help---
190 The source hashing scheduling algorithm assigns network
191 connections to the servers by looking up a statically assigned
192 hash table keyed by their source IP addresses.
193
194 If you want to compile it in kernel, say Y. To compile it as a
195 module, choose M here. If unsure, say N.
196
197config IP_VS_SED
198 tristate "shortest expected delay scheduling"
199 ---help---
200 The shortest expected delay scheduling algorithm assigns network
201 connections to the server with the shortest expected delay. The
202 expected delay that the job will experience is (Ci + 1) / Ui if
203 sent to the ith server, in which Ci is the number of connections
204 on the ith server and Ui is the fixed service rate (weight)
205 of the ith server (see the sketch below).
206
207 If you want to compile it in kernel, say Y. To compile it as a
208 module, choose M here. If unsure, say N.
209
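A minimal sketch of that (Ci + 1) / Ui rule (illustrative only, not the
kernel's ip_vs_sed.c; as with the weighted least-connection sketch above,
the division is avoided by cross-multiplying):

#include <stddef.h>

struct server {
        long conns;    /* Ci: active connections */
        long weight;   /* Ui: fixed service rate */
};

/* Return the index of the server with the shortest expected delay
 * (Ci + 1) / Ui, comparing (Ci + 1) * Uj against (Cj + 1) * Ui. */
static size_t sed_pick(const struct server *srv, size_t n)
{
        size_t best = 0, i;

        for (i = 1; i < n; i++)
                if ((srv[i].conns + 1) * srv[best].weight <
                    (srv[best].conns + 1) * srv[i].weight)
                        best = i;
        return best;
}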
210config IP_VS_NQ
211 tristate "never queue scheduling"
212 ---help---
213 The never queue scheduling algorithm adopts a two-speed model.
214 When there is an idle server available, the job will be sent to
215 the idle server, instead of waiting for a fast one. When there
216 is no idle server available, the job will be sent to the server
217 that minimizes its expected delay (the Shortest Expected Delay
218 scheduling algorithm).
219
220 If you want to compile it in kernel, say Y. To compile it as a
221 module, choose M here. If unsure, say N.
222
223comment 'IPVS application helper'
224
225config IP_VS_FTP
226 tristate "FTP protocol helper"
227 depends on IP_VS_PROTO_TCP
228 ---help---
229 FTP is a protocol that transfers IP address and/or port number in
230 the payload. In the virtual server via Network Address Translation,
231 the IP address and port number of real servers cannot be sent to
232 clients in FTP connections directly, so the FTP protocol helper is
233 required for tracking the connection and mangling it back to that of
234 the virtual service.
235
236 If you want to compile it in kernel, say Y. To compile it as a
237 module, choose M here. If unsure, say N.
238
239endif # IP_VS
diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile
deleted file mode 100644
index 73a46fe1fe4c..000000000000
--- a/net/ipv4/ipvs/Makefile
+++ /dev/null
@@ -1,33 +0,0 @@
1#
2# Makefile for the IPVS modules on top of IPv4.
3#
4
5# IPVS transport protocol load balancing support
6ip_vs_proto-objs-y :=
7ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
8ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
9ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
10
11ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
12 ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
13 ip_vs_est.o ip_vs_proto.o \
14 $(ip_vs_proto-objs-y)
15
16
17# IPVS core
18obj-$(CONFIG_IP_VS) += ip_vs.o
19
20# IPVS schedulers
21obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
22obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
23obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
24obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
25obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
26obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
27obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
28obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
29obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
30obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
31
32# IPVS application helpers
33obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
deleted file mode 100644
index 201b8ea3020d..000000000000
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ /dev/null
@@ -1,622 +0,0 @@
1/*
2 * ip_vs_app.c: Application module support for IPVS
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
12 * is that ip_vs_app module handles the reverse direction (incoming requests
13 * and outgoing responses).
14 *
15 * IP_MASQ_APP application masquerading module
16 *
17 * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
18 *
19 */
20
21#include <linux/module.h>
22#include <linux/kernel.h>
23#include <linux/skbuff.h>
24#include <linux/in.h>
25#include <linux/ip.h>
26#include <linux/netfilter.h>
27#include <net/net_namespace.h>
28#include <net/protocol.h>
29#include <net/tcp.h>
30#include <asm/system.h>
31#include <linux/stat.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34#include <linux/mutex.h>
35
36#include <net/ip_vs.h>
37
38EXPORT_SYMBOL(register_ip_vs_app);
39EXPORT_SYMBOL(unregister_ip_vs_app);
40EXPORT_SYMBOL(register_ip_vs_app_inc);
41
42/* ipvs application list head */
43static LIST_HEAD(ip_vs_app_list);
44static DEFINE_MUTEX(__ip_vs_app_mutex);
45
46
47/*
48 * Get an ip_vs_app object
49 */
50static inline int ip_vs_app_get(struct ip_vs_app *app)
51{
52 return try_module_get(app->module);
53}
54
55
56static inline void ip_vs_app_put(struct ip_vs_app *app)
57{
58 module_put(app->module);
59}
60
61
62/*
63 * Allocate/initialize app incarnation and register it in proto apps.
64 */
65static int
66ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
67{
68 struct ip_vs_protocol *pp;
69 struct ip_vs_app *inc;
70 int ret;
71
72 if (!(pp = ip_vs_proto_get(proto)))
73 return -EPROTONOSUPPORT;
74
75 if (!pp->unregister_app)
76 return -EOPNOTSUPP;
77
78 inc = kmemdup(app, sizeof(*inc), GFP_KERNEL);
79 if (!inc)
80 return -ENOMEM;
81 INIT_LIST_HEAD(&inc->p_list);
82 INIT_LIST_HEAD(&inc->incs_list);
83 inc->app = app;
84 inc->port = htons(port);
85 atomic_set(&inc->usecnt, 0);
86
87 if (app->timeouts) {
88 inc->timeout_table =
89 ip_vs_create_timeout_table(app->timeouts,
90 app->timeouts_size);
91 if (!inc->timeout_table) {
92 ret = -ENOMEM;
93 goto out;
94 }
95 }
96
97 ret = pp->register_app(inc);
98 if (ret)
99 goto out;
100
101 list_add(&inc->a_list, &app->incs_list);
102 IP_VS_DBG(9, "%s application %s:%u registered\n",
103 pp->name, inc->name, inc->port);
104
105 return 0;
106
107 out:
108 kfree(inc->timeout_table);
109 kfree(inc);
110 return ret;
111}
112
113
114/*
115 * Release app incarnation
116 */
117static void
118ip_vs_app_inc_release(struct ip_vs_app *inc)
119{
120 struct ip_vs_protocol *pp;
121
122 if (!(pp = ip_vs_proto_get(inc->protocol)))
123 return;
124
125 if (pp->unregister_app)
126 pp->unregister_app(inc);
127
128 IP_VS_DBG(9, "%s App %s:%u unregistered\n",
129 pp->name, inc->name, inc->port);
130
131 list_del(&inc->a_list);
132
133 kfree(inc->timeout_table);
134 kfree(inc);
135}
136
137
138/*
139 * Get reference to app inc (only called from softirq)
140 *
141 */
142int ip_vs_app_inc_get(struct ip_vs_app *inc)
143{
144 int result;
145
146 atomic_inc(&inc->usecnt);
147 if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
148 atomic_dec(&inc->usecnt);
149 return result;
150}
151
152
153/*
154 * Put the app inc (only called from timer or net softirq)
155 */
156void ip_vs_app_inc_put(struct ip_vs_app *inc)
157{
158 ip_vs_app_put(inc->app);
159 atomic_dec(&inc->usecnt);
160}
161
162
163/*
164 * Register an application incarnation in protocol applications
165 */
166int
167register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
168{
169 int result;
170
171 mutex_lock(&__ip_vs_app_mutex);
172
173 result = ip_vs_app_inc_new(app, proto, port);
174
175 mutex_unlock(&__ip_vs_app_mutex);
176
177 return result;
178}
179
180
181/*
182 * ip_vs_app registration routine
183 */
184int register_ip_vs_app(struct ip_vs_app *app)
185{
186 /* increase the module use count */
187 ip_vs_use_count_inc();
188
189 mutex_lock(&__ip_vs_app_mutex);
190
191 list_add(&app->a_list, &ip_vs_app_list);
192
193 mutex_unlock(&__ip_vs_app_mutex);
194
195 return 0;
196}
197
198
199/*
200 * ip_vs_app unregistration routine
201 * We are sure there are no app incarnations attached to services
202 */
203void unregister_ip_vs_app(struct ip_vs_app *app)
204{
205 struct ip_vs_app *inc, *nxt;
206
207 mutex_lock(&__ip_vs_app_mutex);
208
209 list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
210 ip_vs_app_inc_release(inc);
211 }
212
213 list_del(&app->a_list);
214
215 mutex_unlock(&__ip_vs_app_mutex);
216
217 /* decrease the module use count */
218 ip_vs_use_count_dec();
219}
220
221
222/*
223 * Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
224 */
225int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
226{
227 return pp->app_conn_bind(cp);
228}
229
230
231/*
232 * Unbind cp from application incarnation (called by cp destructor)
233 */
234void ip_vs_unbind_app(struct ip_vs_conn *cp)
235{
236 struct ip_vs_app *inc = cp->app;
237
238 if (!inc)
239 return;
240
241 if (inc->unbind_conn)
242 inc->unbind_conn(inc, cp);
243 if (inc->done_conn)
244 inc->done_conn(inc, cp);
245 ip_vs_app_inc_put(inc);
246 cp->app = NULL;
247}
248
249
250/*
251 * Fixes th->seq based on ip_vs_seq info.
252 */
253static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
254{
255 __u32 seq = ntohl(th->seq);
256
257 /*
258 * Adjust seq with delta-offset for all packets after
259 * the most recent resized pkt seq and with previous_delta offset
260 * for all packets before most recent resized pkt seq.
261 */
262 if (vseq->delta || vseq->previous_delta) {
263 if(after(seq, vseq->init_seq)) {
264 th->seq = htonl(seq + vseq->delta);
265 IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n",
266 vseq->delta);
267 } else {
268 th->seq = htonl(seq + vseq->previous_delta);
269 IP_VS_DBG(9, "vs_fix_seq(): added previous_delta "
270 "(%d) to seq\n", vseq->previous_delta);
271 }
272 }
273}
274
275
276/*
277 * Fixes th->ack_seq based on ip_vs_seq info.
278 */
279static inline void
280vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
281{
282 __u32 ack_seq = ntohl(th->ack_seq);
283
284 /*
285 * Adjust ack_seq with delta-offset for
286 * the packets AFTER most recent resized pkt has caused a shift
287 * for packets before most recent resized pkt, use previous_delta
288 */
289 if (vseq->delta || vseq->previous_delta) {
290 /* since ack_seq is the number of the octet that is expected
291 to be received next, compare it with init_seq+delta */
292 if(after(ack_seq, vseq->init_seq+vseq->delta)) {
293 th->ack_seq = htonl(ack_seq - vseq->delta);
294 IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta "
295 "(%d) from ack_seq\n", vseq->delta);
296
297 } else {
298 th->ack_seq = htonl(ack_seq - vseq->previous_delta);
299 IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted "
300 "previous_delta (%d) from ack_seq\n",
301 vseq->previous_delta);
302 }
303 }
304}
305
306
307/*
308 * Updates ip_vs_seq if pkt has been resized
309 * Assumes already checked proto==IPPROTO_TCP and diff!=0.
310 */
311static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
312 unsigned flag, __u32 seq, int diff)
313{
314 /* spinlock is to keep updating cp->flags atomic */
315 spin_lock(&cp->lock);
316 if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
317 vseq->previous_delta = vseq->delta;
318 vseq->delta += diff;
319 vseq->init_seq = seq;
320 cp->flags |= flag;
321 }
322 spin_unlock(&cp->lock);
323}
324
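/*
 * Illustration (not part of the original file): how the bookkeeping in
 * vs_seq_update() above behaves when an application helper resizes two
 * packets.  The sequence numbers and sizes are made up for the example.
 */
static void vs_seq_update_example(void)
{
        struct ip_vs_seq s = { .init_seq = 0, .delta = 0, .previous_delta = 0 };

        /* first resized packet: original seq 1000 grew by 4 bytes */
        s.previous_delta = s.delta;    /* 0   */
        s.delta += 4;                  /* +4  */
        s.init_seq = 1000;

        /* second resized packet: original seq 2000 grew by 9 bytes */
        s.previous_delta = s.delta;    /* +4  */
        s.delta += 9;                  /* +13 */
        s.init_seq = 2000;

        /* vs_fix_seq() now adds delta (+13) to packets whose original seq
         * is after 2000 and previous_delta (+4) to packets at or before it;
         * vs_fix_ack_seq() subtracts the same amounts from the peer's ACKs. */
}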
325static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
326 struct ip_vs_app *app)
327{
328 int diff;
329 const unsigned int tcp_offset = ip_hdrlen(skb);
330 struct tcphdr *th;
331 __u32 seq;
332
333 if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
334 return 0;
335
336 th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
337
338 /*
339 * Remember seq number in case this pkt gets resized
340 */
341 seq = ntohl(th->seq);
342
343 /*
344 * Fix seq stuff if flagged as so.
345 */
346 if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
347 vs_fix_seq(&cp->out_seq, th);
348 if (cp->flags & IP_VS_CONN_F_IN_SEQ)
349 vs_fix_ack_seq(&cp->in_seq, th);
350
351 /*
352 * Call private output hook function
353 */
354 if (app->pkt_out == NULL)
355 return 1;
356
357 if (!app->pkt_out(app, cp, skb, &diff))
358 return 0;
359
360 /*
361 * Update ip_vs seq stuff if len has changed.
362 */
363 if (diff != 0)
364 vs_seq_update(cp, &cp->out_seq,
365 IP_VS_CONN_F_OUT_SEQ, seq, diff);
366
367 return 1;
368}
369
370/*
371 * Output pkt hook. Will call bound ip_vs_app specific function
372 * called by ipvs packet handler, assumes previously checked cp!=NULL
373 * returns false if it can't handle packet (oom)
374 */
375int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
376{
377 struct ip_vs_app *app;
378
379 /*
380 * check if application module is bound to
381 * this ip_vs_conn.
382 */
383 if ((app = cp->app) == NULL)
384 return 1;
385
386 /* TCP is complicated */
387 if (cp->protocol == IPPROTO_TCP)
388 return app_tcp_pkt_out(cp, skb, app);
389
390 /*
391 * Call private output hook function
392 */
393 if (app->pkt_out == NULL)
394 return 1;
395
396 return app->pkt_out(app, cp, skb, NULL);
397}
398
399
400static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
401 struct ip_vs_app *app)
402{
403 int diff;
404 const unsigned int tcp_offset = ip_hdrlen(skb);
405 struct tcphdr *th;
406 __u32 seq;
407
408 if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
409 return 0;
410
411 th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
412
413 /*
414 * Remember seq number in case this pkt gets resized
415 */
416 seq = ntohl(th->seq);
417
418 /*
419 * Fix seq stuff if flagged as so.
420 */
421 if (cp->flags & IP_VS_CONN_F_IN_SEQ)
422 vs_fix_seq(&cp->in_seq, th);
423 if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
424 vs_fix_ack_seq(&cp->out_seq, th);
425
426 /*
427 * Call private input hook function
428 */
429 if (app->pkt_in == NULL)
430 return 1;
431
432 if (!app->pkt_in(app, cp, skb, &diff))
433 return 0;
434
435 /*
436 * Update ip_vs seq stuff if len has changed.
437 */
438 if (diff != 0)
439 vs_seq_update(cp, &cp->in_seq,
440 IP_VS_CONN_F_IN_SEQ, seq, diff);
441
442 return 1;
443}
444
445/*
446 * Input pkt hook. Will call bound ip_vs_app specific function
447 * called by ipvs packet handler, assumes previously checked cp!=NULL.
448 * returns false if can't handle packet (oom).
449 */
450int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
451{
452 struct ip_vs_app *app;
453
454 /*
455 * check if application module is bound to
456 * this ip_vs_conn.
457 */
458 if ((app = cp->app) == NULL)
459 return 1;
460
461 /* TCP is complicated */
462 if (cp->protocol == IPPROTO_TCP)
463 return app_tcp_pkt_in(cp, skb, app);
464
465 /*
466 * Call private input hook function
467 */
468 if (app->pkt_in == NULL)
469 return 1;
470
471 return app->pkt_in(app, cp, skb, NULL);
472}
473
474
475#ifdef CONFIG_PROC_FS
476/*
477 * /proc/net/ip_vs_app entry function
478 */
479
480static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
481{
482 struct ip_vs_app *app, *inc;
483
484 list_for_each_entry(app, &ip_vs_app_list, a_list) {
485 list_for_each_entry(inc, &app->incs_list, a_list) {
486 if (pos-- == 0)
487 return inc;
488 }
489 }
490 return NULL;
491
492}
493
494static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
495{
496 mutex_lock(&__ip_vs_app_mutex);
497
498 return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
499}
500
501static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
502{
503 struct ip_vs_app *inc, *app;
504 struct list_head *e;
505
506 ++*pos;
507 if (v == SEQ_START_TOKEN)
508 return ip_vs_app_idx(0);
509
510 inc = v;
511 app = inc->app;
512
513 if ((e = inc->a_list.next) != &app->incs_list)
514 return list_entry(e, struct ip_vs_app, a_list);
515
516 /* go on to next application */
517 for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
518 app = list_entry(e, struct ip_vs_app, a_list);
519 list_for_each_entry(inc, &app->incs_list, a_list) {
520 return inc;
521 }
522 }
523 return NULL;
524}
525
526static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
527{
528 mutex_unlock(&__ip_vs_app_mutex);
529}
530
531static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
532{
533 if (v == SEQ_START_TOKEN)
534 seq_puts(seq, "prot port usecnt name\n");
535 else {
536 const struct ip_vs_app *inc = v;
537
538 seq_printf(seq, "%-3s %-7u %-6d %-17s\n",
539 ip_vs_proto_name(inc->protocol),
540 ntohs(inc->port),
541 atomic_read(&inc->usecnt),
542 inc->name);
543 }
544 return 0;
545}
546
547static const struct seq_operations ip_vs_app_seq_ops = {
548 .start = ip_vs_app_seq_start,
549 .next = ip_vs_app_seq_next,
550 .stop = ip_vs_app_seq_stop,
551 .show = ip_vs_app_seq_show,
552};
553
554static int ip_vs_app_open(struct inode *inode, struct file *file)
555{
556 return seq_open(file, &ip_vs_app_seq_ops);
557}
558
559static const struct file_operations ip_vs_app_fops = {
560 .owner = THIS_MODULE,
561 .open = ip_vs_app_open,
562 .read = seq_read,
563 .llseek = seq_lseek,
564 .release = seq_release,
565};
566#endif
567
568
569/*
570 * Replace a segment of data with a new segment
571 */
572int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
573 char *o_buf, int o_len, char *n_buf, int n_len)
574{
575 int diff;
576 int o_offset;
577 int o_left;
578
579 EnterFunction(9);
580
581 diff = n_len - o_len;
582 o_offset = o_buf - (char *)skb->data;
583 /* The length of left data after o_buf+o_len in the skb data */
584 o_left = skb->len - (o_offset + o_len);
585
586 if (diff <= 0) {
587 memmove(o_buf + n_len, o_buf + o_len, o_left);
588 memcpy(o_buf, n_buf, n_len);
589 skb_trim(skb, skb->len + diff);
590 } else if (diff <= skb_tailroom(skb)) {
591 skb_put(skb, diff);
592 memmove(o_buf + n_len, o_buf + o_len, o_left);
593 memcpy(o_buf, n_buf, n_len);
594 } else {
595 if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
596 return -ENOMEM;
597 skb_put(skb, diff);
598 memmove(skb->data + o_offset + n_len,
599 skb->data + o_offset + o_len, o_left);
600 skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len);
601 }
602
603 /* must update the iph total length here */
604 ip_hdr(skb)->tot_len = htons(skb->len);
605
606 LeaveFunction(9);
607 return 0;
608}
609
610
611int __init ip_vs_app_init(void)
612{
613 /* we will replace it with proc_net_ipvs_create() soon */
614 proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops);
615 return 0;
616}
617
618
619void ip_vs_app_cleanup(void)
620{
621 proc_net_remove(&init_net, "ip_vs_app");
622}
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
deleted file mode 100644
index 9a24332fbed8..000000000000
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ /dev/null
@@ -1,1110 +0,0 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 * Julian Anastasov <ja@ssi.bg>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19 * and others. Much code here is taken from the IP MASQ code of kernel 2.2.
20 *
21 * Changes:
22 *
23 */
24
25#include <linux/interrupt.h>
26#include <linux/in.h>
27#include <linux/net.h>
28#include <linux/kernel.h>
29#include <linux/module.h>
30#include <linux/vmalloc.h>
31#include <linux/proc_fs.h> /* for proc_net_* */
32#include <linux/seq_file.h>
33#include <linux/jhash.h>
34#include <linux/random.h>
35
36#include <net/net_namespace.h>
37#include <net/ip_vs.h>
38
39
40/*
41 * Connection hash table: for input and output packets lookups of IPVS
42 */
43static struct list_head *ip_vs_conn_tab;
44
45/* SLAB cache for IPVS connections */
46static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
47
48/* counter for current IPVS connections */
49static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
50
51/* counter for no client port connections */
52static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
53
54/* random value for IPVS connection hash */
55static unsigned int ip_vs_conn_rnd;
56
57/*
58 * Fine locking granularity for big connection hash table
59 */
60#define CT_LOCKARRAY_BITS 4
61#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
62#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
63
64struct ip_vs_aligned_lock
65{
66 rwlock_t l;
67} __attribute__((__aligned__(SMP_CACHE_BYTES)));
68
69/* lock array for conn table */
70static struct ip_vs_aligned_lock
71__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
72
73static inline void ct_read_lock(unsigned key)
74{
75 read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
76}
77
78static inline void ct_read_unlock(unsigned key)
79{
80 read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
81}
82
83static inline void ct_write_lock(unsigned key)
84{
85 write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
86}
87
88static inline void ct_write_unlock(unsigned key)
89{
90 write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
91}
92
93static inline void ct_read_lock_bh(unsigned key)
94{
95 read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
96}
97
98static inline void ct_read_unlock_bh(unsigned key)
99{
100 read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
101}
102
103static inline void ct_write_lock_bh(unsigned key)
104{
105 write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
106}
107
108static inline void ct_write_unlock_bh(unsigned key)
109{
110 write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
111}
112
113
114/*
115 * Returns hash value for IPVS connection entry
116 */
117static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
118 const union nf_inet_addr *addr,
119 __be16 port)
120{
121#ifdef CONFIG_IP_VS_IPV6
122 if (af == AF_INET6)
123 return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
124 (__force u32)port, proto, ip_vs_conn_rnd)
125 & IP_VS_CONN_TAB_MASK;
126#endif
127 return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
128 ip_vs_conn_rnd)
129 & IP_VS_CONN_TAB_MASK;
130}
131
132
133/*
134 * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
135 * returns bool success.
136 */
137static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
138{
139 unsigned hash;
140 int ret;
141
142 /* Hash by protocol, client address and port */
143 hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
144
145 ct_write_lock(hash);
146
147 if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
148 list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
149 cp->flags |= IP_VS_CONN_F_HASHED;
150 atomic_inc(&cp->refcnt);
151 ret = 1;
152 } else {
153 IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
154 "called from %p\n", __builtin_return_address(0));
155 ret = 0;
156 }
157
158 ct_write_unlock(hash);
159
160 return ret;
161}
162
163
164/*
165 * UNhashes ip_vs_conn from ip_vs_conn_tab.
166 * returns bool success.
167 */
168static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
169{
170 unsigned hash;
171 int ret;
172
173 /* unhash it and decrease its reference counter */
174 hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
175
176 ct_write_lock(hash);
177
178 if (cp->flags & IP_VS_CONN_F_HASHED) {
179 list_del(&cp->c_list);
180 cp->flags &= ~IP_VS_CONN_F_HASHED;
181 atomic_dec(&cp->refcnt);
182 ret = 1;
183 } else
184 ret = 0;
185
186 ct_write_unlock(hash);
187
188 return ret;
189}
190
191
192/*
193 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
194 * Called for pkts coming from OUTside-to-INside.
195 * s_addr, s_port: pkt source address (foreign host)
196 * d_addr, d_port: pkt dest address (load balancer)
197 */
198static inline struct ip_vs_conn *__ip_vs_conn_in_get
199(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
200 const union nf_inet_addr *d_addr, __be16 d_port)
201{
202 unsigned hash;
203 struct ip_vs_conn *cp;
204
205 hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
206
207 ct_read_lock(hash);
208
209 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
210 if (cp->af == af &&
211 ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
212 ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
213 s_port == cp->cport && d_port == cp->vport &&
214 ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
215 protocol == cp->protocol) {
216 /* HIT */
217 atomic_inc(&cp->refcnt);
218 ct_read_unlock(hash);
219 return cp;
220 }
221 }
222
223 ct_read_unlock(hash);
224
225 return NULL;
226}
227
228struct ip_vs_conn *ip_vs_conn_in_get
229(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
230 const union nf_inet_addr *d_addr, __be16 d_port)
231{
232 struct ip_vs_conn *cp;
233
234 cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port);
235 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
236 cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr,
237 d_port);
238
239 IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
240 ip_vs_proto_name(protocol),
241 IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
242 IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
243 cp ? "hit" : "not hit");
244
245 return cp;
246}
247
248/* Get reference to connection template */
249struct ip_vs_conn *ip_vs_ct_in_get
250(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
251 const union nf_inet_addr *d_addr, __be16 d_port)
252{
253 unsigned hash;
254 struct ip_vs_conn *cp;
255
256 hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
257
258 ct_read_lock(hash);
259
260 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
261 if (cp->af == af &&
262 ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
263 ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
264 s_port == cp->cport && d_port == cp->vport &&
265 cp->flags & IP_VS_CONN_F_TEMPLATE &&
266 protocol == cp->protocol) {
267 /* HIT */
268 atomic_inc(&cp->refcnt);
269 goto out;
270 }
271 }
272 cp = NULL;
273
274 out:
275 ct_read_unlock(hash);
276
277 IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
278 ip_vs_proto_name(protocol),
279 IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
280 IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
281 cp ? "hit" : "not hit");
282
283 return cp;
284}
285
286/*
287 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
288 * Called for pkts coming from inside-to-OUTside.
289 * s_addr, s_port: pkt source address (inside host)
290 * d_addr, d_port: pkt dest address (foreign host)
291 */
292struct ip_vs_conn *ip_vs_conn_out_get
293(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
294 const union nf_inet_addr *d_addr, __be16 d_port)
295{
296 unsigned hash;
297 struct ip_vs_conn *cp, *ret=NULL;
298
299 /*
300 * Check for "full" addressed entries
301 */
302 hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);
303
304 ct_read_lock(hash);
305
306 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
307 if (cp->af == af &&
308 ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
309 ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
310 d_port == cp->cport && s_port == cp->dport &&
311 protocol == cp->protocol) {
312 /* HIT */
313 atomic_inc(&cp->refcnt);
314 ret = cp;
315 break;
316 }
317 }
318
319 ct_read_unlock(hash);
320
321 IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
322 ip_vs_proto_name(protocol),
323 IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
324 IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
325 ret ? "hit" : "not hit");
326
327 return ret;
328}
329
330
331/*
332 * Put back the conn and restart its timer with its timeout
333 */
334void ip_vs_conn_put(struct ip_vs_conn *cp)
335{
336 /* restart the timer so the entry expires after its timeout */
337 mod_timer(&cp->timer, jiffies+cp->timeout);
338
339 __ip_vs_conn_put(cp);
340}
341
342
343/*
344 * Fill a no_client_port connection with a client port number
345 */
346void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
347{
348 if (ip_vs_conn_unhash(cp)) {
349 spin_lock(&cp->lock);
350 if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
351 atomic_dec(&ip_vs_conn_no_cport_cnt);
352 cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
353 cp->cport = cport;
354 }
355 spin_unlock(&cp->lock);
356
357 /* hash on new cport */
358 ip_vs_conn_hash(cp);
359 }
360}
361
362
363/*
364 * Bind a connection entry with the corresponding packet_xmit.
365 * Called by ip_vs_conn_new.
366 */
367static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
368{
369 switch (IP_VS_FWD_METHOD(cp)) {
370 case IP_VS_CONN_F_MASQ:
371 cp->packet_xmit = ip_vs_nat_xmit;
372 break;
373
374 case IP_VS_CONN_F_TUNNEL:
375 cp->packet_xmit = ip_vs_tunnel_xmit;
376 break;
377
378 case IP_VS_CONN_F_DROUTE:
379 cp->packet_xmit = ip_vs_dr_xmit;
380 break;
381
382 case IP_VS_CONN_F_LOCALNODE:
383 cp->packet_xmit = ip_vs_null_xmit;
384 break;
385
386 case IP_VS_CONN_F_BYPASS:
387 cp->packet_xmit = ip_vs_bypass_xmit;
388 break;
389 }
390}
391
392#ifdef CONFIG_IP_VS_IPV6
393static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
394{
395 switch (IP_VS_FWD_METHOD(cp)) {
396 case IP_VS_CONN_F_MASQ:
397 cp->packet_xmit = ip_vs_nat_xmit_v6;
398 break;
399
400 case IP_VS_CONN_F_TUNNEL:
401 cp->packet_xmit = ip_vs_tunnel_xmit_v6;
402 break;
403
404 case IP_VS_CONN_F_DROUTE:
405 cp->packet_xmit = ip_vs_dr_xmit_v6;
406 break;
407
408 case IP_VS_CONN_F_LOCALNODE:
409 cp->packet_xmit = ip_vs_null_xmit;
410 break;
411
412 case IP_VS_CONN_F_BYPASS:
413 cp->packet_xmit = ip_vs_bypass_xmit_v6;
414 break;
415 }
416}
417#endif
418
419
420static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
421{
422 return atomic_read(&dest->activeconns)
423 + atomic_read(&dest->inactconns);
424}
425
426/*
427 * Bind a connection entry with a virtual service destination
428 * Called just after a new connection entry is created.
429 */
430static inline void
431ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
432{
433 /* if dest is NULL, then return directly */
434 if (!dest)
435 return;
436
437 /* Increase the refcnt counter of the dest */
438 atomic_inc(&dest->refcnt);
439
440 /* Bind with the destination and its corresponding transmitter */
441 if ((cp->flags & IP_VS_CONN_F_SYNC) &&
442 (!(cp->flags & IP_VS_CONN_F_TEMPLATE)))
443 /* if the connection is not template and is created
444 * by sync, preserve the activity flag.
445 */
446 cp->flags |= atomic_read(&dest->conn_flags) &
447 (~IP_VS_CONN_F_INACTIVE);
448 else
449 cp->flags |= atomic_read(&dest->conn_flags);
450 cp->dest = dest;
451
452 IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
453 "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
454 "dest->refcnt:%d\n",
455 ip_vs_proto_name(cp->protocol),
456 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
457 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
458 IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
459 ip_vs_fwd_tag(cp), cp->state,
460 cp->flags, atomic_read(&cp->refcnt),
461 atomic_read(&dest->refcnt));
462
463 /* Update the connection counters */
464 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
465 /* It is a normal connection, so increase the inactive
466 connection counter because it is in TCP SYNRECV
467 state (inactive) or another protocol's inactive state */
468 if ((cp->flags & IP_VS_CONN_F_SYNC) &&
469 (!(cp->flags & IP_VS_CONN_F_INACTIVE)))
470 atomic_inc(&dest->activeconns);
471 else
472 atomic_inc(&dest->inactconns);
473 } else {
474 /* It is a persistent connection/template, so increase
475 the persistent connection counter */
476 atomic_inc(&dest->persistconns);
477 }
478
479 if (dest->u_threshold != 0 &&
480 ip_vs_dest_totalconns(dest) >= dest->u_threshold)
481 dest->flags |= IP_VS_DEST_F_OVERLOAD;
482}
483
484
485/*
486 * Check if there is a destination for the connection, if so
487 * bind the connection to the destination.
488 */
489struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
490{
491 struct ip_vs_dest *dest;
492
493 if ((cp) && (!cp->dest)) {
494 dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
495 &cp->vaddr, cp->vport,
496 cp->protocol);
497 ip_vs_bind_dest(cp, dest);
498 return dest;
499 } else
500 return NULL;
501}
502
503
504/*
505 * Unbind a connection entry with its VS destination
506 * Called by the ip_vs_conn_expire function.
507 */
508static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
509{
510 struct ip_vs_dest *dest = cp->dest;
511
512 if (!dest)
513 return;
514
515 IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
516 "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
517 "dest->refcnt:%d\n",
518 ip_vs_proto_name(cp->protocol),
519 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
520 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
521 IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
522 ip_vs_fwd_tag(cp), cp->state,
523 cp->flags, atomic_read(&cp->refcnt),
524 atomic_read(&dest->refcnt));
525
526 /* Update the connection counters */
527 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
528 /* It is a normal connection, so decrease the inactconns
529 or activeconns counter */
530 if (cp->flags & IP_VS_CONN_F_INACTIVE) {
531 atomic_dec(&dest->inactconns);
532 } else {
533 atomic_dec(&dest->activeconns);
534 }
535 } else {
536 /* It is a persistent connection/template, so decrease
537 the persistent connection counter */
538 atomic_dec(&dest->persistconns);
539 }
540
541 if (dest->l_threshold != 0) {
542 if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
543 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
544 } else if (dest->u_threshold != 0) {
545 if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
546 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
547 } else {
548 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
549 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
550 }
551
552 /*
553 * Simply decrease the refcnt of the dest, because the
554 * dest will be either in service's destination list
555 * or in the trash.
556 */
557 atomic_dec(&dest->refcnt);
558}
559
560
561/*
562 * Checking if the destination of a connection template is available.
563 * If available, return 1, otherwise invalidate this connection
564 * template and return 0.
565 */
566int ip_vs_check_template(struct ip_vs_conn *ct)
567{
568 struct ip_vs_dest *dest = ct->dest;
569
570 /*
571 * Checking the dest server status.
572 */
573 if ((dest == NULL) ||
574 !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
575 (sysctl_ip_vs_expire_quiescent_template &&
576 (atomic_read(&dest->weight) == 0))) {
577 IP_VS_DBG_BUF(9, "check_template: dest not available for "
578 "protocol %s s:%s:%d v:%s:%d "
579 "-> d:%s:%d\n",
580 ip_vs_proto_name(ct->protocol),
581 IP_VS_DBG_ADDR(ct->af, &ct->caddr),
582 ntohs(ct->cport),
583 IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
584 ntohs(ct->vport),
585 IP_VS_DBG_ADDR(ct->af, &ct->daddr),
586 ntohs(ct->dport));
587
588 /*
589 * Invalidate the connection template
590 */
591 if (ct->vport != htons(0xffff)) {
592 if (ip_vs_conn_unhash(ct)) {
593 ct->dport = htons(0xffff);
594 ct->vport = htons(0xffff);
595 ct->cport = 0;
596 ip_vs_conn_hash(ct);
597 }
598 }
599
600 /*
601 * Simply decrease the refcnt of the template,
602 * don't restart its timer.
603 */
604 atomic_dec(&ct->refcnt);
605 return 0;
606 }
607 return 1;
608}
609
610static void ip_vs_conn_expire(unsigned long data)
611{
612 struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
613
614 cp->timeout = 60*HZ;
615
616 /*
617 * hey, I'm using it
618 */
619 atomic_inc(&cp->refcnt);
620
621 /*
622 * do I control anybody?
623 */
624 if (atomic_read(&cp->n_control))
625 goto expire_later;
626
627 /*
628 * unhash it if it is hashed in the conn table
629 */
630 if (!ip_vs_conn_unhash(cp))
631 goto expire_later;
632
633 /*
634 * refcnt==1 implies I'm the only one referrer
635 */
636 if (likely(atomic_read(&cp->refcnt) == 1)) {
637 /* delete the timer if it is activated by other users */
638 if (timer_pending(&cp->timer))
639 del_timer(&cp->timer);
640
641 /* does anybody control me? */
642 if (cp->control)
643 ip_vs_control_del(cp);
644
645 if (unlikely(cp->app != NULL))
646 ip_vs_unbind_app(cp);
647 ip_vs_unbind_dest(cp);
648 if (cp->flags & IP_VS_CONN_F_NO_CPORT)
649 atomic_dec(&ip_vs_conn_no_cport_cnt);
650 atomic_dec(&ip_vs_conn_count);
651
652 kmem_cache_free(ip_vs_conn_cachep, cp);
653 return;
654 }
655
656 /* hash it back to the table */
657 ip_vs_conn_hash(cp);
658
659 expire_later:
660 IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
661 atomic_read(&cp->refcnt)-1,
662 atomic_read(&cp->n_control));
663
664 ip_vs_conn_put(cp);
665}
666
667
668void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
669{
670 if (del_timer(&cp->timer))
671 mod_timer(&cp->timer, jiffies);
672}
673
674
675/*
676 * Create a new connection entry and hash it into the ip_vs_conn_tab
677 */
678struct ip_vs_conn *
679ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
680 const union nf_inet_addr *vaddr, __be16 vport,
681 const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
682 struct ip_vs_dest *dest)
683{
684 struct ip_vs_conn *cp;
685 struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
686
687 cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
688 if (cp == NULL) {
689 IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
690 return NULL;
691 }
692
693 INIT_LIST_HEAD(&cp->c_list);
694 setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
695 cp->af = af;
696 cp->protocol = proto;
697 ip_vs_addr_copy(af, &cp->caddr, caddr);
698 cp->cport = cport;
699 ip_vs_addr_copy(af, &cp->vaddr, vaddr);
700 cp->vport = vport;
701 ip_vs_addr_copy(af, &cp->daddr, daddr);
702 cp->dport = dport;
703 cp->flags = flags;
704 spin_lock_init(&cp->lock);
705
706 /*
707 * Mark the entry as referenced by the current thread before hashing
708 * it into the table, so that another thread running
709 * ip_vs_random_dropentry cannot drop this entry.
710 */
711 atomic_set(&cp->refcnt, 1);
712
713 atomic_set(&cp->n_control, 0);
714 atomic_set(&cp->in_pkts, 0);
715
716 atomic_inc(&ip_vs_conn_count);
717 if (flags & IP_VS_CONN_F_NO_CPORT)
718 atomic_inc(&ip_vs_conn_no_cport_cnt);
719
720 /* Bind the connection with a destination server */
721 ip_vs_bind_dest(cp, dest);
722
723 /* Set its state and timeout */
724 cp->state = 0;
725 cp->timeout = 3*HZ;
726
727 /* Bind its packet transmitter */
728#ifdef CONFIG_IP_VS_IPV6
729 if (af == AF_INET6)
730 ip_vs_bind_xmit_v6(cp);
731 else
732#endif
733 ip_vs_bind_xmit(cp);
734
735 if (unlikely(pp && atomic_read(&pp->appcnt)))
736 ip_vs_bind_app(cp, pp);
737
738 /* Hash it in the ip_vs_conn_tab finally */
739 ip_vs_conn_hash(cp);
740
741 return cp;
742}
743
744
745/*
746 * /proc/net/ip_vs_conn entries
747 */
748#ifdef CONFIG_PROC_FS
749
750static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
751{
752 int idx;
753 struct ip_vs_conn *cp;
754
755 for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
756 ct_read_lock_bh(idx);
757 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
758 if (pos-- == 0) {
759 seq->private = &ip_vs_conn_tab[idx];
760 return cp;
761 }
762 }
763 ct_read_unlock_bh(idx);
764 }
765
766 return NULL;
767}
768
769static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
770{
771 seq->private = NULL;
772 return *pos ? ip_vs_conn_array(seq, *pos - 1) : SEQ_START_TOKEN;
773}
774
775static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
776{
777 struct ip_vs_conn *cp = v;
778 struct list_head *e, *l = seq->private;
779 int idx;
780
781 ++*pos;
782 if (v == SEQ_START_TOKEN)
783 return ip_vs_conn_array(seq, 0);
784
785 /* more on same hash chain? */
786 if ((e = cp->c_list.next) != l)
787 return list_entry(e, struct ip_vs_conn, c_list);
788
789 idx = l - ip_vs_conn_tab;
790 ct_read_unlock_bh(idx);
791
792 while (++idx < IP_VS_CONN_TAB_SIZE) {
793 ct_read_lock_bh(idx);
794 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
795 seq->private = &ip_vs_conn_tab[idx];
796 return cp;
797 }
798 ct_read_unlock_bh(idx);
799 }
800 seq->private = NULL;
801 return NULL;
802}
803
804static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
805{
806 struct list_head *l = seq->private;
807
808 if (l)
809 ct_read_unlock_bh(l - ip_vs_conn_tab);
810}
811
812static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
813{
814
815 if (v == SEQ_START_TOKEN)
816 seq_puts(seq,
817 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n");
818 else {
819 const struct ip_vs_conn *cp = v;
820
821#ifdef CONFIG_IP_VS_IPV6
822 if (cp->af == AF_INET6)
823 seq_printf(seq,
824 "%-3s " NIP6_FMT " %04X " NIP6_FMT
825 " %04X " NIP6_FMT " %04X %-11s %7lu\n",
826 ip_vs_proto_name(cp->protocol),
827 NIP6(cp->caddr.in6), ntohs(cp->cport),
828 NIP6(cp->vaddr.in6), ntohs(cp->vport),
829 NIP6(cp->daddr.in6), ntohs(cp->dport),
830 ip_vs_state_name(cp->protocol, cp->state),
831 (cp->timer.expires-jiffies)/HZ);
832 else
833#endif
834 seq_printf(seq,
835 "%-3s %08X %04X %08X %04X"
836 " %08X %04X %-11s %7lu\n",
837 ip_vs_proto_name(cp->protocol),
838 ntohl(cp->caddr.ip), ntohs(cp->cport),
839 ntohl(cp->vaddr.ip), ntohs(cp->vport),
840 ntohl(cp->daddr.ip), ntohs(cp->dport),
841 ip_vs_state_name(cp->protocol, cp->state),
842 (cp->timer.expires-jiffies)/HZ);
843 }
844 return 0;
845}
846
847static const struct seq_operations ip_vs_conn_seq_ops = {
848 .start = ip_vs_conn_seq_start,
849 .next = ip_vs_conn_seq_next,
850 .stop = ip_vs_conn_seq_stop,
851 .show = ip_vs_conn_seq_show,
852};
853
854static int ip_vs_conn_open(struct inode *inode, struct file *file)
855{
856 return seq_open(file, &ip_vs_conn_seq_ops);
857}
858
859static const struct file_operations ip_vs_conn_fops = {
860 .owner = THIS_MODULE,
861 .open = ip_vs_conn_open,
862 .read = seq_read,
863 .llseek = seq_lseek,
864 .release = seq_release,
865};
866
867static const char *ip_vs_origin_name(unsigned flags)
868{
869 if (flags & IP_VS_CONN_F_SYNC)
870 return "SYNC";
871 else
872 return "LOCAL";
873}
874
875static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
876{
877
878 if (v == SEQ_START_TOKEN)
879 seq_puts(seq,
880 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n");
881 else {
882 const struct ip_vs_conn *cp = v;
883
884#ifdef CONFIG_IP_VS_IPV6
885 if (cp->af == AF_INET6)
886 seq_printf(seq,
887 "%-3s " NIP6_FMT " %04X " NIP6_FMT
888 " %04X " NIP6_FMT " %04X %-11s %-6s %7lu\n",
889 ip_vs_proto_name(cp->protocol),
890 NIP6(cp->caddr.in6), ntohs(cp->cport),
891 NIP6(cp->vaddr.in6), ntohs(cp->vport),
892 NIP6(cp->daddr.in6), ntohs(cp->dport),
893 ip_vs_state_name(cp->protocol, cp->state),
894 ip_vs_origin_name(cp->flags),
895 (cp->timer.expires-jiffies)/HZ);
896 else
897#endif
898 seq_printf(seq,
899 "%-3s %08X %04X %08X %04X "
900 "%08X %04X %-11s %-6s %7lu\n",
901 ip_vs_proto_name(cp->protocol),
902 ntohl(cp->caddr.ip), ntohs(cp->cport),
903 ntohl(cp->vaddr.ip), ntohs(cp->vport),
904 ntohl(cp->daddr.ip), ntohs(cp->dport),
905 ip_vs_state_name(cp->protocol, cp->state),
906 ip_vs_origin_name(cp->flags),
907 (cp->timer.expires-jiffies)/HZ);
908 }
909 return 0;
910}
911
912static const struct seq_operations ip_vs_conn_sync_seq_ops = {
913 .start = ip_vs_conn_seq_start,
914 .next = ip_vs_conn_seq_next,
915 .stop = ip_vs_conn_seq_stop,
916 .show = ip_vs_conn_sync_seq_show,
917};
918
919static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
920{
921 return seq_open(file, &ip_vs_conn_sync_seq_ops);
922}
923
924static const struct file_operations ip_vs_conn_sync_fops = {
925 .owner = THIS_MODULE,
926 .open = ip_vs_conn_sync_open,
927 .read = seq_read,
928 .llseek = seq_lseek,
929 .release = seq_release,
930};
931
932#endif
933
934
935/*
936 * Randomly drop connection entries before running out of memory
937 */
938static inline int todrop_entry(struct ip_vs_conn *cp)
939{
940 /*
941 * The drop rate array needs tuning for real environments.
942 * Called from timer bh only => no locking
943 */
944 static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
945 static char todrop_counter[9] = {0};
946 int i;
947
948 /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
949 This will leave enough time for normal connection to get
950 through. */
951 if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
952 return 0;
953
954 /* Don't drop the entry if its number of incoming packets is not
955 located in [0, 8] */
956 i = atomic_read(&cp->in_pkts);
957 if (i > 8 || i < 0) return 0;
958
959 if (!todrop_rate[i]) return 0;
960 if (--todrop_counter[i] > 0) return 0;
961
962 todrop_counter[i] = todrop_rate[i];
963 return 1;
964}
965
966/* Called from keventd and must protect itself from softirqs */
967void ip_vs_random_dropentry(void)
968{
969 int idx;
970 struct ip_vs_conn *cp;
971
972 /*
973 * Randomly scan 1/32 of the whole table every second
974 */
975 for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
976 unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
977
978 /*
979 * Lock is actually needed in this loop.
980 */
981 ct_write_lock_bh(hash);
982
983 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
984 if (cp->flags & IP_VS_CONN_F_TEMPLATE)
985 /* connection template */
986 continue;
987
988 if (cp->protocol == IPPROTO_TCP) {
989 switch(cp->state) {
990 case IP_VS_TCP_S_SYN_RECV:
991 case IP_VS_TCP_S_SYNACK:
992 break;
993
994 case IP_VS_TCP_S_ESTABLISHED:
995 if (todrop_entry(cp))
996 break;
997 continue;
998
999 default:
1000 continue;
1001 }
1002 } else {
1003 if (!todrop_entry(cp))
1004 continue;
1005 }
1006
1007 IP_VS_DBG(4, "del connection\n");
1008 ip_vs_conn_expire_now(cp);
1009 if (cp->control) {
1010 IP_VS_DBG(4, "del conn template\n");
1011 ip_vs_conn_expire_now(cp->control);
1012 }
1013 }
1014 ct_write_unlock_bh(hash);
1015 }
1016}
1017
1018
1019/*
1020 * Flush all the connection entries in the ip_vs_conn_tab
1021 */
1022static void ip_vs_conn_flush(void)
1023{
1024 int idx;
1025 struct ip_vs_conn *cp;
1026
1027 flush_again:
1028 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
1029 /*
1030 * Lock is actually needed in this loop.
1031 */
1032 ct_write_lock_bh(idx);
1033
1034 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
1035
1036 IP_VS_DBG(4, "del connection\n");
1037 ip_vs_conn_expire_now(cp);
1038 if (cp->control) {
1039 IP_VS_DBG(4, "del conn template\n");
1040 ip_vs_conn_expire_now(cp->control);
1041 }
1042 }
1043 ct_write_unlock_bh(idx);
1044 }
1045
1046 /* the counter may not be zero, because some conn entries may still
1047 be run by slow timer handlers or be unhashed but still referenced */
1048 if (atomic_read(&ip_vs_conn_count) != 0) {
1049 schedule();
1050 goto flush_again;
1051 }
1052}
1053
1054
1055int __init ip_vs_conn_init(void)
1056{
1057 int idx;
1058
1059 /*
1060 * Allocate the connection hash table and initialize its list heads
1061 */
1062 ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
1063 if (!ip_vs_conn_tab)
1064 return -ENOMEM;
1065
1066 /* Allocate ip_vs_conn slab cache */
1067 ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
1068 sizeof(struct ip_vs_conn), 0,
1069 SLAB_HWCACHE_ALIGN, NULL);
1070 if (!ip_vs_conn_cachep) {
1071 vfree(ip_vs_conn_tab);
1072 return -ENOMEM;
1073 }
1074
1075 IP_VS_INFO("Connection hash table configured "
1076 "(size=%d, memory=%ldKbytes)\n",
1077 IP_VS_CONN_TAB_SIZE,
1078 (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
1079 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
1080 sizeof(struct ip_vs_conn));
1081
1082 for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
1083 INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
1084 }
1085
1086 for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
1087 rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
1088 }
1089
1090 proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
1091 proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
1092
1093 /* calculate the random value for connection hash */
1094 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
1095
1096 return 0;
1097}
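
The "memory=...Kbytes" figure printed above covers only the bucket array; each bucket is a bare list_head, so the table costs IP_VS_CONN_TAB_SIZE * sizeof(struct list_head), while the connection entries themselves come from the ip_vs_conn slab cache. A standalone sketch of the same arithmetic, assuming the default of 2^12 buckets and a 64-bit list_head:

	#include <stdio.h>

	struct list_head { struct list_head *next, *prev; };	/* 16 bytes on 64-bit */

	int main(void)
	{
		const unsigned long tab_size = 1UL << 12;	/* assumed default */

		printf("connection hash table: %lu Kbytes\n",
		       (unsigned long)(tab_size * sizeof(struct list_head) / 1024));
		return 0;
	}
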
1098
1099
1100void ip_vs_conn_cleanup(void)
1101{
1102 /* flush all the connection entries first */
1103 ip_vs_conn_flush();
1104
1105 /* Release the empty cache */
1106 kmem_cache_destroy(ip_vs_conn_cachep);
1107 proc_net_remove(&init_net, "ip_vs_conn");
1108 proc_net_remove(&init_net, "ip_vs_conn_sync");
1109 vfree(ip_vs_conn_tab);
1110}
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
deleted file mode 100644
index 958abf3e5f8c..000000000000
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ /dev/null
@@ -1,1542 +0,0 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 * Julian Anastasov <ja@ssi.bg>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19 * and others.
20 *
21 * Changes:
22 * Paul `Rusty' Russell properly handle non-linear skbs
23 * Harald Welte don't use nfcache
24 *
25 */
26
27#include <linux/module.h>
28#include <linux/kernel.h>
29#include <linux/ip.h>
30#include <linux/tcp.h>
31#include <linux/icmp.h>
32
33#include <net/ip.h>
34#include <net/tcp.h>
35#include <net/udp.h>
36#include <net/icmp.h> /* for icmp_send */
37#include <net/route.h>
38
39#include <linux/netfilter.h>
40#include <linux/netfilter_ipv4.h>
41
42#ifdef CONFIG_IP_VS_IPV6
43#include <net/ipv6.h>
44#include <linux/netfilter_ipv6.h>
45#endif
46
47#include <net/ip_vs.h>
48
49
50EXPORT_SYMBOL(register_ip_vs_scheduler);
51EXPORT_SYMBOL(unregister_ip_vs_scheduler);
52EXPORT_SYMBOL(ip_vs_skb_replace);
53EXPORT_SYMBOL(ip_vs_proto_name);
54EXPORT_SYMBOL(ip_vs_conn_new);
55EXPORT_SYMBOL(ip_vs_conn_in_get);
56EXPORT_SYMBOL(ip_vs_conn_out_get);
57#ifdef CONFIG_IP_VS_PROTO_TCP
58EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
59#endif
60EXPORT_SYMBOL(ip_vs_conn_put);
61#ifdef CONFIG_IP_VS_DEBUG
62EXPORT_SYMBOL(ip_vs_get_debug_level);
63#endif
64
65
66/* ID used in ICMP lookups */
67#define icmp_id(icmph) (((icmph)->un).echo.id)
68#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier)
69
70const char *ip_vs_proto_name(unsigned proto)
71{
72 static char buf[20];
73
74 switch (proto) {
75 case IPPROTO_IP:
76 return "IP";
77 case IPPROTO_UDP:
78 return "UDP";
79 case IPPROTO_TCP:
80 return "TCP";
81 case IPPROTO_ICMP:
82 return "ICMP";
83#ifdef CONFIG_IP_VS_IPV6
84 case IPPROTO_ICMPV6:
85 return "ICMPv6";
86#endif
87 default:
88 sprintf(buf, "IP_%d", proto);
89 return buf;
90 }
91}
92
93void ip_vs_init_hash_table(struct list_head *table, int rows)
94{
95 while (--rows >= 0)
96 INIT_LIST_HEAD(&table[rows]);
97}
98
99static inline void
100ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
101{
102 struct ip_vs_dest *dest = cp->dest;
103 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
104 spin_lock(&dest->stats.lock);
105 dest->stats.ustats.inpkts++;
106 dest->stats.ustats.inbytes += skb->len;
107 spin_unlock(&dest->stats.lock);
108
109 spin_lock(&dest->svc->stats.lock);
110 dest->svc->stats.ustats.inpkts++;
111 dest->svc->stats.ustats.inbytes += skb->len;
112 spin_unlock(&dest->svc->stats.lock);
113
114 spin_lock(&ip_vs_stats.lock);
115 ip_vs_stats.ustats.inpkts++;
116 ip_vs_stats.ustats.inbytes += skb->len;
117 spin_unlock(&ip_vs_stats.lock);
118 }
119}
120
121
122static inline void
123ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
124{
125 struct ip_vs_dest *dest = cp->dest;
126 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
127 spin_lock(&dest->stats.lock);
128 dest->stats.ustats.outpkts++;
129 dest->stats.ustats.outbytes += skb->len;
130 spin_unlock(&dest->stats.lock);
131
132 spin_lock(&dest->svc->stats.lock);
133 dest->svc->stats.ustats.outpkts++;
134 dest->svc->stats.ustats.outbytes += skb->len;
135 spin_unlock(&dest->svc->stats.lock);
136
137 spin_lock(&ip_vs_stats.lock);
138 ip_vs_stats.ustats.outpkts++;
139 ip_vs_stats.ustats.outbytes += skb->len;
140 spin_unlock(&ip_vs_stats.lock);
141 }
142}
143
144
145static inline void
146ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
147{
148 spin_lock(&cp->dest->stats.lock);
149 cp->dest->stats.ustats.conns++;
150 spin_unlock(&cp->dest->stats.lock);
151
152 spin_lock(&svc->stats.lock);
153 svc->stats.ustats.conns++;
154 spin_unlock(&svc->stats.lock);
155
156 spin_lock(&ip_vs_stats.lock);
157 ip_vs_stats.ustats.conns++;
158 spin_unlock(&ip_vs_stats.lock);
159}
160
161
162static inline int
163ip_vs_set_state(struct ip_vs_conn *cp, int direction,
164 const struct sk_buff *skb,
165 struct ip_vs_protocol *pp)
166{
167 if (unlikely(!pp->state_transition))
168 return 0;
169 return pp->state_transition(cp, direction, skb, pp);
170}
171
172
173/*
174 * IPVS persistent scheduling function
175 * It creates a connection entry according to its template if one exists,
176 * or selects a server and creates a connection entry plus a template.
177 * Locking: we are svc user (svc->refcnt), so we hold all dests too
178 * Protocols supported: TCP, UDP
179 */
180static struct ip_vs_conn *
181ip_vs_sched_persist(struct ip_vs_service *svc,
182 const struct sk_buff *skb,
183 __be16 ports[2])
184{
185 struct ip_vs_conn *cp = NULL;
186 struct ip_vs_iphdr iph;
187 struct ip_vs_dest *dest;
188 struct ip_vs_conn *ct;
189 __be16 dport; /* destination port to forward */
190 union nf_inet_addr snet; /* source network of the client,
191 after masking */
192
193 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
194
195 /* Mask saddr with the netmask to adjust template granularity */
196#ifdef CONFIG_IP_VS_IPV6
197 if (svc->af == AF_INET6)
198 ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
199 else
200#endif
201 snet.ip = iph.saddr.ip & svc->netmask;
202
203 IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
204 "mnet %s\n",
205 IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
206 IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
207 IP_VS_DBG_ADDR(svc->af, &snet));
208
209 /*
210	 * As far as we know, FTP is a rather complicated network protocol: it
211	 * uses a control connection and separate data connections. For active
212	 * FTP, the FTP server initiates the data connection to the client,
213	 * usually from source port 20. For passive FTP, the FTP server tells
214	 * the client which port it is passively listening on, and the client
215	 * opens the data connection. In tunneling or direct routing mode, the
216	 * load balancer only sees the client-to-server half of the connection,
217	 * so the data port is unknown to it. Therefore, a conn template like
218	 * <caddr, 0, vaddr, 0, daddr, 0> is created for a persistent FTP
219	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
220	 * is created for other persistent services.
221 */
222 if (ports[1] == svc->port) {
223 /* Check if a template already exists */
224 if (svc->port != FTPPORT)
225 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
226 &iph.daddr, ports[1]);
227 else
228 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
229 &iph.daddr, 0);
230
231 if (!ct || !ip_vs_check_template(ct)) {
232 /*
233 * No template found or the dest of the connection
234 * template is not available.
235 */
236 dest = svc->scheduler->schedule(svc, skb);
237 if (dest == NULL) {
238 IP_VS_DBG(1, "p-schedule: no dest found.\n");
239 return NULL;
240 }
241
242 /*
243 * Create a template like <protocol,caddr,0,
244 * vaddr,vport,daddr,dport> for non-ftp service,
245 * and <protocol,caddr,0,vaddr,0,daddr,0>
246 * for ftp service.
247 */
248 if (svc->port != FTPPORT)
249 ct = ip_vs_conn_new(svc->af, iph.protocol,
250 &snet, 0,
251 &iph.daddr,
252 ports[1],
253 &dest->addr, dest->port,
254 IP_VS_CONN_F_TEMPLATE,
255 dest);
256 else
257 ct = ip_vs_conn_new(svc->af, iph.protocol,
258 &snet, 0,
259 &iph.daddr, 0,
260 &dest->addr, 0,
261 IP_VS_CONN_F_TEMPLATE,
262 dest);
263 if (ct == NULL)
264 return NULL;
265
266 ct->timeout = svc->timeout;
267 } else {
268 /* set destination with the found template */
269 dest = ct->dest;
270 }
271 dport = dest->port;
272 } else {
273 /*
274 * Note: persistent fwmark-based services and persistent
275 * port zero service are handled here.
276 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
277 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
278 */
279 if (svc->fwmark) {
280 union nf_inet_addr fwmark = {
281 .all = { 0, 0, 0, htonl(svc->fwmark) }
282 };
283
284 ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
285 &fwmark, 0);
286 } else
287 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
288 &iph.daddr, 0);
289
290 if (!ct || !ip_vs_check_template(ct)) {
291 /*
292 * If it is not persistent port zero, return NULL,
293 * otherwise create a connection template.
294 */
295 if (svc->port)
296 return NULL;
297
298 dest = svc->scheduler->schedule(svc, skb);
299 if (dest == NULL) {
300 IP_VS_DBG(1, "p-schedule: no dest found.\n");
301 return NULL;
302 }
303
304 /*
305 * Create a template according to the service
306 */
307 if (svc->fwmark) {
308 union nf_inet_addr fwmark = {
309 .all = { 0, 0, 0, htonl(svc->fwmark) }
310 };
311
312 ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
313 &snet, 0,
314 &fwmark, 0,
315 &dest->addr, 0,
316 IP_VS_CONN_F_TEMPLATE,
317 dest);
318 } else
319 ct = ip_vs_conn_new(svc->af, iph.protocol,
320 &snet, 0,
321 &iph.daddr, 0,
322 &dest->addr, 0,
323 IP_VS_CONN_F_TEMPLATE,
324 dest);
325 if (ct == NULL)
326 return NULL;
327
328 ct->timeout = svc->timeout;
329 } else {
330 /* set destination with the found template */
331 dest = ct->dest;
332 }
333 dport = ports[1];
334 }
335
336 /*
337 * Create a new connection according to the template
338 */
339 cp = ip_vs_conn_new(svc->af, iph.protocol,
340 &iph.saddr, ports[0],
341 &iph.daddr, ports[1],
342 &dest->addr, dport,
343 0,
344 dest);
345 if (cp == NULL) {
346 ip_vs_conn_put(ct);
347 return NULL;
348 }
349
350 /*
351 * Add its control
352 */
353 ip_vs_control_add(cp, ct);
354 ip_vs_conn_put(ct);
355
356 ip_vs_conn_stats(cp, svc);
357 return cp;
358}
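
The key step in ip_vs_sched_persist() is the masking of the client address with the service netmask near the top: every client in the same masked network maps to the same template and therefore sticks to the same real server. A small user-space sketch of that granularity test; the addresses and the /24 mask are examples only:

	#include <stdio.h>
	#include <arpa/inet.h>

	int main(void)
	{
		/* Two example clients and an example /24 service netmask. */
		struct in_addr c1, c2, mask;

		inet_pton(AF_INET, "192.0.2.17",    &c1);
		inet_pton(AF_INET, "192.0.2.200",   &c2);
		inet_pton(AF_INET, "255.255.255.0", &mask);

		/* Same masked source => same persistence template => same server. */
		printf("same template source? %s\n",
		       (c1.s_addr & mask.s_addr) == (c2.s_addr & mask.s_addr) ?
		       "yes" : "no");
		return 0;
	}
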
359
360
361/*
362 * IPVS main scheduling function
363 * It selects a server according to the virtual service, and
364 * creates a connection entry.
365 * Protocols supported: TCP, UDP
366 */
367struct ip_vs_conn *
368ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
369{
370 struct ip_vs_conn *cp = NULL;
371 struct ip_vs_iphdr iph;
372 struct ip_vs_dest *dest;
373 __be16 _ports[2], *pptr;
374
375 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
376 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
377 if (pptr == NULL)
378 return NULL;
379
380 /*
381 * Persistent service
382 */
383 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
384 return ip_vs_sched_persist(svc, skb, pptr);
385
386 /*
387 * Non-persistent service
388 */
389 if (!svc->fwmark && pptr[1] != svc->port) {
390 if (!svc->port)
391 IP_VS_ERR("Schedule: port zero only supported "
392 "in persistent services, "
393 "check your ipvs configuration\n");
394 return NULL;
395 }
396
397 dest = svc->scheduler->schedule(svc, skb);
398 if (dest == NULL) {
399 IP_VS_DBG(1, "Schedule: no dest found.\n");
400 return NULL;
401 }
402
403 /*
404 * Create a connection entry.
405 */
406 cp = ip_vs_conn_new(svc->af, iph.protocol,
407 &iph.saddr, pptr[0],
408 &iph.daddr, pptr[1],
409 &dest->addr, dest->port ? dest->port : pptr[1],
410 0,
411 dest);
412 if (cp == NULL)
413 return NULL;
414
415 IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
416 "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
417 ip_vs_fwd_tag(cp),
418 IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
419 IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
420 IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
421 cp->flags, atomic_read(&cp->refcnt));
422
423 ip_vs_conn_stats(cp, svc);
424 return cp;
425}
426
427
428/*
429 * Pass or drop the packet.
430 * Called by ip_vs_in, when the virtual service is available but
431 * no destination is available for a new connection.
432 */
433int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
434 struct ip_vs_protocol *pp)
435{
436 __be16 _ports[2], *pptr;
437 struct ip_vs_iphdr iph;
438 int unicast;
439 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
440
441 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
442 if (pptr == NULL) {
443 ip_vs_service_put(svc);
444 return NF_DROP;
445 }
446
447#ifdef CONFIG_IP_VS_IPV6
448 if (svc->af == AF_INET6)
449 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
450 else
451#endif
452 unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
453
454	/* if this is a fwmark-based service, the cache_bypass sysctl is
455	   enabled and the destination is a non-local unicast address,
456	   then create a cache_bypass connection entry */
457 if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
458 int ret, cs;
459 struct ip_vs_conn *cp;
460 union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
461
462 ip_vs_service_put(svc);
463
464 /* create a new connection entry */
465 IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
466 cp = ip_vs_conn_new(svc->af, iph.protocol,
467 &iph.saddr, pptr[0],
468 &iph.daddr, pptr[1],
469 &daddr, 0,
470 IP_VS_CONN_F_BYPASS,
471 NULL);
472 if (cp == NULL)
473 return NF_DROP;
474
475 /* statistics */
476 ip_vs_in_stats(cp, skb);
477
478 /* set state */
479 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
480
481 /* transmit the first SYN packet */
482 ret = cp->packet_xmit(skb, cp, pp);
483 /* do not touch skb anymore */
484
485 atomic_inc(&cp->in_pkts);
486 ip_vs_conn_put(cp);
487 return ret;
488 }
489
490 /*
491	 * When a virtual FTP service is present, packets destined
492	 * for other services on the VIP (except services listed in
493	 * the IPVS table) may get here. Pass them along, because it
494	 * is not IPVS's job to decide to drop them.
495 */
496 if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
497 ip_vs_service_put(svc);
498 return NF_ACCEPT;
499 }
500
501 ip_vs_service_put(svc);
502
503 /*
504 * Notify the client that the destination is unreachable, and
505 * release the socket buffer.
506	 * Since we are at the IP layer and no TCP socket actually
507	 * exists, a TCP RST packet cannot be sent; instead,
508	 * ICMP_PORT_UNREACH is sent here whether it is TCP or UDP. --WZ
509 */
510#ifdef CONFIG_IP_VS_IPV6
511 if (svc->af == AF_INET6)
512 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0,
513 skb->dev);
514 else
515#endif
516 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
517
518 return NF_DROP;
519}
520
521
522/*
523 * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
524 * chain, and is used for VS/NAT.
525 * It detects packets for VS/NAT connections and sends the packets
526 * immediately. This avoids having iptable_nat mangle packets
527 * that belong to VS/NAT connections.
528 */
529static unsigned int ip_vs_post_routing(unsigned int hooknum,
530 struct sk_buff *skb,
531 const struct net_device *in,
532 const struct net_device *out,
533 int (*okfn)(struct sk_buff *))
534{
535 if (!skb->ipvs_property)
536 return NF_ACCEPT;
537 /* The packet was sent from IPVS, exit this chain */
538 return NF_STOP;
539}
540
541__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
542{
543 return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
544}
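
ip_vs_checksum_complete() is just the standard Internet checksum over the rest of the packet: skb_checksum() produces the 32-bit one's-complement sum and csum_fold() folds it to 16 bits and complements it. A user-space sketch of the same fold over a flat buffer, RFC 1071 style, with sample bytes only:

	#include <stdio.h>
	#include <stdint.h>
	#include <stddef.h>

	/* One's-complement sum of 16-bit words, folded and complemented. */
	static uint16_t csum16(const uint8_t *buf, size_t len)
	{
		uint32_t sum = 0;
		size_t i;

		for (i = 0; i + 1 < len; i += 2)
			sum += (uint32_t)buf[i] << 8 | buf[i + 1];
		if (len & 1)
			sum += (uint32_t)buf[len - 1] << 8;	/* pad the odd byte */
		while (sum >> 16)
			sum = (sum & 0xffff) + (sum >> 16);	/* fold the carries */
		return (uint16_t)~sum;
	}

	int main(void)
	{
		const uint8_t data[] = { 0x45, 0x00, 0x00, 0x1c, 0x12, 0x34 };

		printf("checksum = 0x%04x\n", csum16(data, sizeof(data)));
		return 0;
	}
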
545
546static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
547{
548 int err = ip_defrag(skb, user);
549
550 if (!err)
551 ip_send_check(ip_hdr(skb));
552
553 return err;
554}
555
556#ifdef CONFIG_IP_VS_IPV6
557static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
558{
559 /* TODO IPv6: Find out what to do here for IPv6 */
560 return 0;
561}
562#endif
563
564/*
565 * Packet has been made sufficiently writable in caller
566 * - inout: 1=in->out, 0=out->in
567 */
568void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
569 struct ip_vs_conn *cp, int inout)
570{
571 struct iphdr *iph = ip_hdr(skb);
572 unsigned int icmp_offset = iph->ihl*4;
573 struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) +
574 icmp_offset);
575 struct iphdr *ciph = (struct iphdr *)(icmph + 1);
576
577 if (inout) {
578 iph->saddr = cp->vaddr.ip;
579 ip_send_check(iph);
580 ciph->daddr = cp->vaddr.ip;
581 ip_send_check(ciph);
582 } else {
583 iph->daddr = cp->daddr.ip;
584 ip_send_check(iph);
585 ciph->saddr = cp->daddr.ip;
586 ip_send_check(ciph);
587 }
588
589 /* the TCP/UDP port */
590 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
591 __be16 *ports = (void *)ciph + ciph->ihl*4;
592
593 if (inout)
594 ports[1] = cp->vport;
595 else
596 ports[0] = cp->dport;
597 }
598
599 /* And finally the ICMP checksum */
600 icmph->checksum = 0;
601 icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
602 skb->ip_summed = CHECKSUM_UNNECESSARY;
603
604 if (inout)
605 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
606 "Forwarding altered outgoing ICMP");
607 else
608 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
609 "Forwarding altered incoming ICMP");
610}
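
ip_vs_nat_icmp() can rewrite the embedded ports because an ICMP error carries the offending datagram's IP header plus at least the first 8 bytes of its payload, which is exactly where the TCP/UDP ports live; (void *)ciph + ciph->ihl*4 lands on them. A small offset sketch, assuming no IP options in either header:

	#include <stdio.h>

	int main(void)
	{
		/* Assuming no IP options: 20-byte outer IP header, 8-byte ICMP
		 * header, 20-byte embedded IP header, then the embedded ports. */
		const unsigned outer_ip = 20, icmp_hdr = 8, inner_ip = 20;
		const unsigned ports = outer_ip + icmp_hdr + inner_ip;

		printf("embedded src port at offset %u, dst port at offset %u\n",
		       ports, ports + 2);
		return 0;
	}
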
611
612#ifdef CONFIG_IP_VS_IPV6
613void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
614 struct ip_vs_conn *cp, int inout)
615{
616 struct ipv6hdr *iph = ipv6_hdr(skb);
617 unsigned int icmp_offset = sizeof(struct ipv6hdr);
618 struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) +
619 icmp_offset);
620 struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1);
621
622 if (inout) {
623 iph->saddr = cp->vaddr.in6;
624 ciph->daddr = cp->vaddr.in6;
625 } else {
626 iph->daddr = cp->daddr.in6;
627 ciph->saddr = cp->daddr.in6;
628 }
629
630 /* the TCP/UDP port */
631 if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) {
632 __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
633
634 if (inout)
635 ports[1] = cp->vport;
636 else
637 ports[0] = cp->dport;
638 }
639
640 /* And finally the ICMP checksum */
641 icmph->icmp6_cksum = 0;
642 /* TODO IPv6: is this correct for ICMPv6? */
643 ip_vs_checksum_complete(skb, icmp_offset);
644 skb->ip_summed = CHECKSUM_UNNECESSARY;
645
646 if (inout)
647 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
648 "Forwarding altered outgoing ICMPv6");
649 else
650 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
651 "Forwarding altered incoming ICMPv6");
652}
653#endif
654
655/* Handle relevant response ICMP messages - forward to the right
656 * destination host. Used for NAT and local client.
657 */
658static int handle_response_icmp(int af, struct sk_buff *skb,
659 union nf_inet_addr *snet,
660 __u8 protocol, struct ip_vs_conn *cp,
661 struct ip_vs_protocol *pp,
662 unsigned int offset, unsigned int ihl)
663{
664 unsigned int verdict = NF_DROP;
665
666 if (IP_VS_FWD_METHOD(cp) != 0) {
667 IP_VS_ERR("shouldn't reach here, because the box is on the "
668 "half connection in the tun/dr module.\n");
669 }
670
671 /* Ensure the checksum is correct */
672 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
673 /* Failed checksum! */
674 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
675 IP_VS_DBG_ADDR(af, snet));
676 goto out;
677 }
678
679 if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol)
680 offset += 2 * sizeof(__u16);
681 if (!skb_make_writable(skb, offset))
682 goto out;
683
684#ifdef CONFIG_IP_VS_IPV6
685 if (af == AF_INET6)
686 ip_vs_nat_icmp_v6(skb, pp, cp, 1);
687 else
688#endif
689 ip_vs_nat_icmp(skb, pp, cp, 1);
690
691 /* do the statistics and put it back */
692 ip_vs_out_stats(cp, skb);
693
694 skb->ipvs_property = 1;
695 verdict = NF_ACCEPT;
696
697out:
698 __ip_vs_conn_put(cp);
699
700 return verdict;
701}
702
703/*
704 * Handle ICMP messages in the inside-to-outside direction (outgoing).
705 * Find any that might be relevant, check against existing connections.
706 * Currently handles error types - unreachable, quench, ttl exceeded.
707 */
708static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
709{
710 struct iphdr *iph;
711 struct icmphdr _icmph, *ic;
712 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
713 struct ip_vs_iphdr ciph;
714 struct ip_vs_conn *cp;
715 struct ip_vs_protocol *pp;
716 unsigned int offset, ihl;
717 union nf_inet_addr snet;
718
719 *related = 1;
720
721 /* reassemble IP fragments */
722 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
723 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
724 return NF_STOLEN;
725 }
726
727 iph = ip_hdr(skb);
728 offset = ihl = iph->ihl * 4;
729 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
730 if (ic == NULL)
731 return NF_DROP;
732
733 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
734 ic->type, ntohs(icmp_id(ic)),
735 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
736
737 /*
738 * Work through seeing if this is for us.
739 * These checks are supposed to be in an order that means easy
740 * things are checked first to speed up processing.... however
741 * this means that some packets will manage to get a long way
742 * down this stack and then be rejected, but that's life.
743 */
744 if ((ic->type != ICMP_DEST_UNREACH) &&
745 (ic->type != ICMP_SOURCE_QUENCH) &&
746 (ic->type != ICMP_TIME_EXCEEDED)) {
747 *related = 0;
748 return NF_ACCEPT;
749 }
750
751 /* Now find the contained IP header */
752 offset += sizeof(_icmph);
753 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
754 if (cih == NULL)
755 return NF_ACCEPT; /* The packet looks wrong, ignore */
756
757 pp = ip_vs_proto_get(cih->protocol);
758 if (!pp)
759 return NF_ACCEPT;
760
761 /* Is the embedded protocol header present? */
762 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
763 pp->dont_defrag))
764 return NF_ACCEPT;
765
766 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
767
768 offset += cih->ihl * 4;
769
770 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
771 /* The embedded headers contain source and dest in reverse order */
772 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
773 if (!cp)
774 return NF_ACCEPT;
775
776 snet.ip = iph->saddr;
777 return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
778 pp, offset, ihl);
779}
780
781#ifdef CONFIG_IP_VS_IPV6
782static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
783{
784 struct ipv6hdr *iph;
785 struct icmp6hdr _icmph, *ic;
786 struct ipv6hdr _ciph, *cih; /* The ip header contained
787 within the ICMP */
788 struct ip_vs_iphdr ciph;
789 struct ip_vs_conn *cp;
790 struct ip_vs_protocol *pp;
791 unsigned int offset;
792 union nf_inet_addr snet;
793
794 *related = 1;
795
796 /* reassemble IP fragments */
797 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
798 if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
799 return NF_STOLEN;
800 }
801
802 iph = ipv6_hdr(skb);
803 offset = sizeof(struct ipv6hdr);
804 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
805 if (ic == NULL)
806 return NF_DROP;
807
808 IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
809 ic->icmp6_type, ntohs(icmpv6_id(ic)),
810 NIP6(iph->saddr), NIP6(iph->daddr));
811
812 /*
813 * Work through seeing if this is for us.
814 * These checks are supposed to be in an order that means easy
815 * things are checked first to speed up processing.... however
816 * this means that some packets will manage to get a long way
817 * down this stack and then be rejected, but that's life.
818 */
819 if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
820 (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
821 (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
822 *related = 0;
823 return NF_ACCEPT;
824 }
825
826 /* Now find the contained IP header */
827 offset += sizeof(_icmph);
828 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
829 if (cih == NULL)
830 return NF_ACCEPT; /* The packet looks wrong, ignore */
831
832 pp = ip_vs_proto_get(cih->nexthdr);
833 if (!pp)
834 return NF_ACCEPT;
835
836 /* Is the embedded protocol header present? */
837 /* TODO: we don't support fragmentation at the moment anyways */
838 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
839 return NF_ACCEPT;
840
841 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
842
843 offset += sizeof(struct ipv6hdr);
844
845 ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
846 /* The embedded headers contain source and dest in reverse order */
847 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
848 if (!cp)
849 return NF_ACCEPT;
850
851 ipv6_addr_copy(&snet.in6, &iph->saddr);
852 return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
853 pp, offset, sizeof(struct ipv6hdr));
854}
855#endif
856
857static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
858{
859 struct tcphdr _tcph, *th;
860
861 th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
862 if (th == NULL)
863 return 0;
864 return th->rst;
865}
866
867/* Handle response packets: rewrite addresses and send away...
868 * Used for NAT and local client.
869 */
870static unsigned int
871handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
872 struct ip_vs_conn *cp, int ihl)
873{
874 IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
875
876 if (!skb_make_writable(skb, ihl))
877 goto drop;
878
879 /* mangle the packet */
880 if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
881 goto drop;
882
883#ifdef CONFIG_IP_VS_IPV6
884 if (af == AF_INET6)
885 ipv6_hdr(skb)->saddr = cp->vaddr.in6;
886 else
887#endif
888 {
889 ip_hdr(skb)->saddr = cp->vaddr.ip;
890 ip_send_check(ip_hdr(skb));
891 }
892
893 /* For policy routing, packets originating from this
894 * machine itself may be routed differently to packets
895 * passing through. We want this packet to be routed as
896 * if it came from this machine itself. So re-compute
897 * the routing information.
898 */
899#ifdef CONFIG_IP_VS_IPV6
900 if (af == AF_INET6) {
901 if (ip6_route_me_harder(skb) != 0)
902 goto drop;
903 } else
904#endif
905 if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
906 goto drop;
907
908 IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
909
910 ip_vs_out_stats(cp, skb);
911 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
912 ip_vs_conn_put(cp);
913
914 skb->ipvs_property = 1;
915
916 LeaveFunction(11);
917 return NF_ACCEPT;
918
919drop:
920 ip_vs_conn_put(cp);
921 kfree_skb(skb);
922 return NF_STOLEN;
923}
924
925/*
926 * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
927 * Check if outgoing packet belongs to the established ip_vs_conn.
928 */
929static unsigned int
930ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
931 const struct net_device *in, const struct net_device *out,
932 int (*okfn)(struct sk_buff *))
933{
934 struct ip_vs_iphdr iph;
935 struct ip_vs_protocol *pp;
936 struct ip_vs_conn *cp;
937 int af;
938
939 EnterFunction(11);
940
941 af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
942
943 if (skb->ipvs_property)
944 return NF_ACCEPT;
945
946 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
947#ifdef CONFIG_IP_VS_IPV6
948 if (af == AF_INET6) {
949 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
950 int related, verdict = ip_vs_out_icmp_v6(skb, &related);
951
952 if (related)
953 return verdict;
954 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
955 }
956 } else
957#endif
958 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
959 int related, verdict = ip_vs_out_icmp(skb, &related);
960
961 if (related)
962 return verdict;
963 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
964 }
965
966 pp = ip_vs_proto_get(iph.protocol);
967 if (unlikely(!pp))
968 return NF_ACCEPT;
969
970 /* reassemble IP fragments */
971#ifdef CONFIG_IP_VS_IPV6
972 if (af == AF_INET6) {
973 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
974 int related, verdict = ip_vs_out_icmp_v6(skb, &related);
975
976 if (related)
977 return verdict;
978
979 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
980 }
981 } else
982#endif
983 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
984 !pp->dont_defrag)) {
985 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
986 return NF_STOLEN;
987
988 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
989 }
990
991 /*
992 * Check if the packet belongs to an existing entry
993 */
994 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
995
996 if (unlikely(!cp)) {
997 if (sysctl_ip_vs_nat_icmp_send &&
998 (pp->protocol == IPPROTO_TCP ||
999 pp->protocol == IPPROTO_UDP)) {
1000 __be16 _ports[2], *pptr;
1001
1002 pptr = skb_header_pointer(skb, iph.len,
1003 sizeof(_ports), _ports);
1004 if (pptr == NULL)
1005 return NF_ACCEPT; /* Not for me */
1006 if (ip_vs_lookup_real_service(af, iph.protocol,
1007 &iph.saddr,
1008 pptr[0])) {
1009 /*
1010				 * Notify the real server that there is
1011				 * no existing entry, unless the packet
1012				 * is a TCP RST.
1013 */
1014 if (iph.protocol != IPPROTO_TCP
1015 || !is_tcp_reset(skb, iph.len)) {
1016#ifdef CONFIG_IP_VS_IPV6
1017 if (af == AF_INET6)
1018 icmpv6_send(skb,
1019 ICMPV6_DEST_UNREACH,
1020 ICMPV6_PORT_UNREACH,
1021 0, skb->dev);
1022 else
1023#endif
1024 icmp_send(skb,
1025 ICMP_DEST_UNREACH,
1026 ICMP_PORT_UNREACH, 0);
1027 return NF_DROP;
1028 }
1029 }
1030 }
1031 IP_VS_DBG_PKT(12, pp, skb, 0,
1032 "packet continues traversal as normal");
1033 return NF_ACCEPT;
1034 }
1035
1036 return handle_response(af, skb, pp, cp, iph.len);
1037}
1038
1039
1040/*
1041 * Handle ICMP messages in the outside-to-inside direction (incoming).
1042 * Find any that might be relevant, check against existing connections,
1043 * forward to the right destination host if relevant.
1044 * Currently handles error types - unreachable, quench, ttl exceeded.
1045 */
1046static int
1047ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1048{
1049 struct iphdr *iph;
1050 struct icmphdr _icmph, *ic;
1051 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
1052 struct ip_vs_iphdr ciph;
1053 struct ip_vs_conn *cp;
1054 struct ip_vs_protocol *pp;
1055 unsigned int offset, ihl, verdict;
1056 union nf_inet_addr snet;
1057
1058 *related = 1;
1059
1060 /* reassemble IP fragments */
1061 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1062 if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
1063 IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
1064 return NF_STOLEN;
1065 }
1066
1067 iph = ip_hdr(skb);
1068 offset = ihl = iph->ihl * 4;
1069 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1070 if (ic == NULL)
1071 return NF_DROP;
1072
1073 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
1074 ic->type, ntohs(icmp_id(ic)),
1075 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
1076
1077 /*
1078 * Work through seeing if this is for us.
1079 * These checks are supposed to be in an order that means easy
1080 * things are checked first to speed up processing.... however
1081 * this means that some packets will manage to get a long way
1082 * down this stack and then be rejected, but that's life.
1083 */
1084 if ((ic->type != ICMP_DEST_UNREACH) &&
1085 (ic->type != ICMP_SOURCE_QUENCH) &&
1086 (ic->type != ICMP_TIME_EXCEEDED)) {
1087 *related = 0;
1088 return NF_ACCEPT;
1089 }
1090
1091 /* Now find the contained IP header */
1092 offset += sizeof(_icmph);
1093 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1094 if (cih == NULL)
1095 return NF_ACCEPT; /* The packet looks wrong, ignore */
1096
1097 pp = ip_vs_proto_get(cih->protocol);
1098 if (!pp)
1099 return NF_ACCEPT;
1100
1101 /* Is the embedded protocol header present? */
1102 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1103 pp->dont_defrag))
1104 return NF_ACCEPT;
1105
1106 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
1107
1108 offset += cih->ihl * 4;
1109
1110 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1111 /* The embedded headers contain source and dest in reverse order */
1112 cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
1113 if (!cp) {
1114 /* The packet could also belong to a local client */
1115 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
1116 if (cp) {
1117 snet.ip = iph->saddr;
1118 return handle_response_icmp(AF_INET, skb, &snet,
1119 cih->protocol, cp, pp,
1120 offset, ihl);
1121 }
1122 return NF_ACCEPT;
1123 }
1124
1125 verdict = NF_DROP;
1126
1127 /* Ensure the checksum is correct */
1128 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1129 /* Failed checksum! */
1130 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
1131 NIPQUAD(iph->saddr));
1132 goto out;
1133 }
1134
1135 /* do the statistics and put it back */
1136 ip_vs_in_stats(cp, skb);
1137 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1138 offset += 2 * sizeof(__u16);
1139 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
1140 /* do not touch skb anymore */
1141
1142 out:
1143 __ip_vs_conn_put(cp);
1144
1145 return verdict;
1146}
1147
1148#ifdef CONFIG_IP_VS_IPV6
1149static int
1150ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1151{
1152 struct ipv6hdr *iph;
1153 struct icmp6hdr _icmph, *ic;
1154 struct ipv6hdr _ciph, *cih; /* The ip header contained
1155 within the ICMP */
1156 struct ip_vs_iphdr ciph;
1157 struct ip_vs_conn *cp;
1158 struct ip_vs_protocol *pp;
1159 unsigned int offset, verdict;
1160 union nf_inet_addr snet;
1161
1162 *related = 1;
1163
1164 /* reassemble IP fragments */
1165 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1166 if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
1167 IP_DEFRAG_VS_IN :
1168 IP_DEFRAG_VS_FWD))
1169 return NF_STOLEN;
1170 }
1171
1172 iph = ipv6_hdr(skb);
1173 offset = sizeof(struct ipv6hdr);
1174 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1175 if (ic == NULL)
1176 return NF_DROP;
1177
1178 IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
1179 ic->icmp6_type, ntohs(icmpv6_id(ic)),
1180 NIP6(iph->saddr), NIP6(iph->daddr));
1181
1182 /*
1183 * Work through seeing if this is for us.
1184 * These checks are supposed to be in an order that means easy
1185 * things are checked first to speed up processing.... however
1186 * this means that some packets will manage to get a long way
1187 * down this stack and then be rejected, but that's life.
1188 */
1189 if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
1190 (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
1191 (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
1192 *related = 0;
1193 return NF_ACCEPT;
1194 }
1195
1196 /* Now find the contained IP header */
1197 offset += sizeof(_icmph);
1198 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1199 if (cih == NULL)
1200 return NF_ACCEPT; /* The packet looks wrong, ignore */
1201
1202 pp = ip_vs_proto_get(cih->nexthdr);
1203 if (!pp)
1204 return NF_ACCEPT;
1205
1206 /* Is the embedded protocol header present? */
1207 /* TODO: we don't support fragmentation at the moment anyways */
1208 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
1209 return NF_ACCEPT;
1210
1211 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
1212
1213 offset += sizeof(struct ipv6hdr);
1214
1215 ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
1216 /* The embedded headers contain source and dest in reverse order */
1217 cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
1218 if (!cp) {
1219 /* The packet could also belong to a local client */
1220 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
1221 if (cp) {
1222 ipv6_addr_copy(&snet.in6, &iph->saddr);
1223 return handle_response_icmp(AF_INET6, skb, &snet,
1224 cih->nexthdr,
1225 cp, pp, offset,
1226 sizeof(struct ipv6hdr));
1227 }
1228 return NF_ACCEPT;
1229 }
1230
1231 verdict = NF_DROP;
1232
1233 /* do the statistics and put it back */
1234 ip_vs_in_stats(cp, skb);
1235 if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr)
1236 offset += 2 * sizeof(__u16);
1237 verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
1238 /* do not touch skb anymore */
1239
1240 __ip_vs_conn_put(cp);
1241
1242 return verdict;
1243}
1244#endif
1245
1246
1247/*
1248 * Check if it's for virtual services, look it up,
1249 * and send it on its way...
1250 */
1251static unsigned int
1252ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1253 const struct net_device *in, const struct net_device *out,
1254 int (*okfn)(struct sk_buff *))
1255{
1256 struct ip_vs_iphdr iph;
1257 struct ip_vs_protocol *pp;
1258 struct ip_vs_conn *cp;
1259 int ret, restart, af;
1260
1261 af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
1262
1263 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1264
1265 /*
1266 * Big tappo: only PACKET_HOST, including loopback for local client
1267 * Don't handle local packets on IPv6 for now
1268 */
1269 if (unlikely(skb->pkt_type != PACKET_HOST)) {
1270 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
1271 skb->pkt_type,
1272 iph.protocol,
1273 IP_VS_DBG_ADDR(af, &iph.daddr));
1274 return NF_ACCEPT;
1275 }
1276
1277 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1278 int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
1279
1280 if (related)
1281 return verdict;
1282 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1283 }
1284
1285 /* Protocol supported? */
1286 pp = ip_vs_proto_get(iph.protocol);
1287 if (unlikely(!pp))
1288 return NF_ACCEPT;
1289
1290 /*
1291 * Check if the packet belongs to an existing connection entry
1292 */
1293 cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
1294
1295 if (unlikely(!cp)) {
1296 int v;
1297
1298 /* For local client packets, it could be a response */
1299 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1300 if (cp)
1301 return handle_response(af, skb, pp, cp, iph.len);
1302
1303 if (!pp->conn_schedule(af, skb, pp, &v, &cp))
1304 return v;
1305 }
1306
1307 if (unlikely(!cp)) {
1308 /* sorry, all this trouble for a no-hit :) */
1309 IP_VS_DBG_PKT(12, pp, skb, 0,
1310 "packet continues traversal as normal");
1311 return NF_ACCEPT;
1312 }
1313
1314 IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
1315
1316 /* Check the server status */
1317 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1318 /* the destination server is not available */
1319
1320 if (sysctl_ip_vs_expire_nodest_conn) {
1321 /* try to expire the connection immediately */
1322 ip_vs_conn_expire_now(cp);
1323 }
1324 /* don't restart its timer, and silently
1325 drop the packet. */
1326 __ip_vs_conn_put(cp);
1327 return NF_DROP;
1328 }
1329
1330 ip_vs_in_stats(cp, skb);
1331 restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1332 if (cp->packet_xmit)
1333 ret = cp->packet_xmit(skb, cp, pp);
1334 /* do not touch skb anymore */
1335 else {
1336 IP_VS_DBG_RL("warning: packet_xmit is null");
1337 ret = NF_ACCEPT;
1338 }
1339
1340	/* Increase its packet counter and check whether it needs
1341	 * to be synchronized
1342	 *
1343	 * Sync the connection if it is about to close, to
1344	 * encourage the standby servers to update the connection timeouts
1345 */
1346 atomic_inc(&cp->in_pkts);
1347 if (af == AF_INET &&
1348 (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1349 (((cp->protocol != IPPROTO_TCP ||
1350 cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1351 (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
1352 == sysctl_ip_vs_sync_threshold[0])) ||
1353 ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
1354 ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
1355 (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
1356 (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
1357 ip_vs_sync_conn(cp);
1358 cp->old_state = cp->state;
1359
1360 ip_vs_conn_put(cp);
1361 return ret;
1362}
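
The synchronization test near the end of ip_vs_in() fires when in_pkts % sysctl_ip_vs_sync_threshold[1] == sysctl_ip_vs_sync_threshold[0]; with the defaults of {3, 50} (see the sysctl initializers in ip_vs_ctl.c further down) an established connection is synced on its 3rd packet and then every 50 packets, plus on the TCP close-side state transitions. A quick sketch of which packet counts match the modulo test:

	#include <stdio.h>

	int main(void)
	{
		/* Default sync_threshold {3, 50}: sync when in_pkts % 50 == 3. */
		const int start = 3, period = 50;
		int pkts;

		for (pkts = 1; pkts <= 200; pkts++)
			if (pkts % period == start)
				printf("sync at packet %d\n", pkts);
		return 0;
	}
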
1363
1364
1365/*
1366 * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1367 * related packets destined for 0.0.0.0/0.
1368 * When fwmark-based virtual service is used, such as transparent
1369 * cache cluster, TCP packets can be marked and routed to ip_vs_in,
1370 * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
1371 * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1372 * and send them to ip_vs_in_icmp.
1373 */
1374static unsigned int
1375ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1376 const struct net_device *in, const struct net_device *out,
1377 int (*okfn)(struct sk_buff *))
1378{
1379 int r;
1380
1381 if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1382 return NF_ACCEPT;
1383
1384 return ip_vs_in_icmp(skb, &r, hooknum);
1385}
1386
1387#ifdef CONFIG_IP_VS_IPV6
1388static unsigned int
1389ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1390 const struct net_device *in, const struct net_device *out,
1391 int (*okfn)(struct sk_buff *))
1392{
1393 int r;
1394
1395 if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1396 return NF_ACCEPT;
1397
1398 return ip_vs_in_icmp_v6(skb, &r, hooknum);
1399}
1400#endif
1401
1402
1403static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1404 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1405 * or VS/NAT(change destination), so that filtering rules can be
1406 * applied to IPVS. */
1407 {
1408 .hook = ip_vs_in,
1409 .owner = THIS_MODULE,
1410 .pf = PF_INET,
1411 .hooknum = NF_INET_LOCAL_IN,
1412 .priority = 100,
1413 },
1414 /* After packet filtering, change source only for VS/NAT */
1415 {
1416 .hook = ip_vs_out,
1417 .owner = THIS_MODULE,
1418 .pf = PF_INET,
1419 .hooknum = NF_INET_FORWARD,
1420 .priority = 100,
1421 },
1422 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1423 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1424 {
1425 .hook = ip_vs_forward_icmp,
1426 .owner = THIS_MODULE,
1427 .pf = PF_INET,
1428 .hooknum = NF_INET_FORWARD,
1429 .priority = 99,
1430 },
1431 /* Before the netfilter connection tracking, exit from POST_ROUTING */
1432 {
1433 .hook = ip_vs_post_routing,
1434 .owner = THIS_MODULE,
1435 .pf = PF_INET,
1436 .hooknum = NF_INET_POST_ROUTING,
1437 .priority = NF_IP_PRI_NAT_SRC-1,
1438 },
1439#ifdef CONFIG_IP_VS_IPV6
1440 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1441 * or VS/NAT(change destination), so that filtering rules can be
1442 * applied to IPVS. */
1443 {
1444 .hook = ip_vs_in,
1445 .owner = THIS_MODULE,
1446 .pf = PF_INET6,
1447 .hooknum = NF_INET_LOCAL_IN,
1448 .priority = 100,
1449 },
1450 /* After packet filtering, change source only for VS/NAT */
1451 {
1452 .hook = ip_vs_out,
1453 .owner = THIS_MODULE,
1454 .pf = PF_INET6,
1455 .hooknum = NF_INET_FORWARD,
1456 .priority = 100,
1457 },
1458 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1459 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1460 {
1461 .hook = ip_vs_forward_icmp_v6,
1462 .owner = THIS_MODULE,
1463 .pf = PF_INET6,
1464 .hooknum = NF_INET_FORWARD,
1465 .priority = 99,
1466 },
1467 /* Before the netfilter connection tracking, exit from POST_ROUTING */
1468 {
1469 .hook = ip_vs_post_routing,
1470 .owner = THIS_MODULE,
1471 .pf = PF_INET6,
1472 .hooknum = NF_INET_POST_ROUTING,
1473 .priority = NF_IP6_PRI_NAT_SRC-1,
1474 },
1475#endif
1476};
1477
1478
1479/*
1480 * Initialize IP Virtual Server
1481 */
1482static int __init ip_vs_init(void)
1483{
1484 int ret;
1485
1486 ip_vs_estimator_init();
1487
1488 ret = ip_vs_control_init();
1489 if (ret < 0) {
1490 IP_VS_ERR("can't setup control.\n");
1491 goto cleanup_estimator;
1492 }
1493
1494 ip_vs_protocol_init();
1495
1496 ret = ip_vs_app_init();
1497 if (ret < 0) {
1498 IP_VS_ERR("can't setup application helper.\n");
1499 goto cleanup_protocol;
1500 }
1501
1502 ret = ip_vs_conn_init();
1503 if (ret < 0) {
1504 IP_VS_ERR("can't setup connection table.\n");
1505 goto cleanup_app;
1506 }
1507
1508 ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1509 if (ret < 0) {
1510 IP_VS_ERR("can't register hooks.\n");
1511 goto cleanup_conn;
1512 }
1513
1514 IP_VS_INFO("ipvs loaded.\n");
1515 return ret;
1516
1517 cleanup_conn:
1518 ip_vs_conn_cleanup();
1519 cleanup_app:
1520 ip_vs_app_cleanup();
1521 cleanup_protocol:
1522 ip_vs_protocol_cleanup();
1523 ip_vs_control_cleanup();
1524 cleanup_estimator:
1525 ip_vs_estimator_cleanup();
1526 return ret;
1527}
1528
1529static void __exit ip_vs_cleanup(void)
1530{
1531 nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1532 ip_vs_conn_cleanup();
1533 ip_vs_app_cleanup();
1534 ip_vs_protocol_cleanup();
1535 ip_vs_control_cleanup();
1536 ip_vs_estimator_cleanup();
1537 IP_VS_INFO("ipvs unloaded.\n");
1538}
1539
1540module_init(ip_vs_init);
1541module_exit(ip_vs_cleanup);
1542MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
deleted file mode 100644
index 0302cf3e5039..000000000000
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ /dev/null
@@ -1,3443 +0,0 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 * Julian Anastasov <ja@ssi.bg>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * Changes:
18 *
19 */
20
21#include <linux/module.h>
22#include <linux/init.h>
23#include <linux/types.h>
24#include <linux/capability.h>
25#include <linux/fs.h>
26#include <linux/sysctl.h>
27#include <linux/proc_fs.h>
28#include <linux/workqueue.h>
29#include <linux/swap.h>
30#include <linux/seq_file.h>
31
32#include <linux/netfilter.h>
33#include <linux/netfilter_ipv4.h>
34#include <linux/mutex.h>
35
36#include <net/net_namespace.h>
37#include <net/ip.h>
38#ifdef CONFIG_IP_VS_IPV6
39#include <net/ipv6.h>
40#include <net/ip6_route.h>
41#endif
42#include <net/route.h>
43#include <net/sock.h>
44#include <net/genetlink.h>
45
46#include <asm/uaccess.h>
47
48#include <net/ip_vs.h>
49
50/* mutex for IPVS sockopts; [gs]etsockopt may sleep */
51static DEFINE_MUTEX(__ip_vs_mutex);
52
53/* lock for service table */
54static DEFINE_RWLOCK(__ip_vs_svc_lock);
55
56/* lock for table with the real services */
57static DEFINE_RWLOCK(__ip_vs_rs_lock);
58
59/* lock for state and timeout tables */
60static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
61
62/* lock for drop entry handling */
63static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
64
65/* lock for drop packet handling */
66static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
67
68/* 1/rate drop and drop-entry variables */
69int ip_vs_drop_rate = 0;
70int ip_vs_drop_counter = 0;
71static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
72
73/* number of virtual services */
74static int ip_vs_num_services = 0;
75
76/* sysctl variables */
77static int sysctl_ip_vs_drop_entry = 0;
78static int sysctl_ip_vs_drop_packet = 0;
79static int sysctl_ip_vs_secure_tcp = 0;
80static int sysctl_ip_vs_amemthresh = 1024;
81static int sysctl_ip_vs_am_droprate = 10;
82int sysctl_ip_vs_cache_bypass = 0;
83int sysctl_ip_vs_expire_nodest_conn = 0;
84int sysctl_ip_vs_expire_quiescent_template = 0;
85int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
86int sysctl_ip_vs_nat_icmp_send = 0;
87
88
89#ifdef CONFIG_IP_VS_DEBUG
90static int sysctl_ip_vs_debug_level = 0;
91
92int ip_vs_get_debug_level(void)
93{
94 return sysctl_ip_vs_debug_level;
95}
96#endif
97
98#ifdef CONFIG_IP_VS_IPV6
99/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
100static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
101{
102 struct rt6_info *rt;
103 struct flowi fl = {
104 .oif = 0,
105 .nl_u = {
106 .ip6_u = {
107 .daddr = *addr,
108 .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
109 };
110
111 rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
112 if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
113 return 1;
114
115 return 0;
116}
117#endif
118/*
119 * update_defense_level is called from keventd and from sysctl,
120 * so it needs to protect itself from softirqs
121 */
122static void update_defense_level(void)
123{
124 struct sysinfo i;
125 static int old_secure_tcp = 0;
126 int availmem;
127 int nomem;
128 int to_change = -1;
129
130 /* we only count free and buffered memory (in pages) */
131 si_meminfo(&i);
132 availmem = i.freeram + i.bufferram;
133	/* however, in linux 2.5 i.bufferram is the total page cache size,
134	   so we would need to adjust it */
135 /* si_swapinfo(&i); */
136 /* availmem = availmem - (i.totalswap - i.freeswap); */
137
138 nomem = (availmem < sysctl_ip_vs_amemthresh);
139
140 local_bh_disable();
141
142 /* drop_entry */
143 spin_lock(&__ip_vs_dropentry_lock);
144 switch (sysctl_ip_vs_drop_entry) {
145 case 0:
146 atomic_set(&ip_vs_dropentry, 0);
147 break;
148 case 1:
149 if (nomem) {
150 atomic_set(&ip_vs_dropentry, 1);
151 sysctl_ip_vs_drop_entry = 2;
152 } else {
153 atomic_set(&ip_vs_dropentry, 0);
154 }
155 break;
156 case 2:
157 if (nomem) {
158 atomic_set(&ip_vs_dropentry, 1);
159 } else {
160 atomic_set(&ip_vs_dropentry, 0);
161 sysctl_ip_vs_drop_entry = 1;
162 };
163 break;
164 case 3:
165 atomic_set(&ip_vs_dropentry, 1);
166 break;
167 }
168 spin_unlock(&__ip_vs_dropentry_lock);
169
170 /* drop_packet */
171 spin_lock(&__ip_vs_droppacket_lock);
172 switch (sysctl_ip_vs_drop_packet) {
173 case 0:
174 ip_vs_drop_rate = 0;
175 break;
176 case 1:
177 if (nomem) {
178 ip_vs_drop_rate = ip_vs_drop_counter
179 = sysctl_ip_vs_amemthresh /
180 (sysctl_ip_vs_amemthresh-availmem);
181 sysctl_ip_vs_drop_packet = 2;
182 } else {
183 ip_vs_drop_rate = 0;
184 }
185 break;
186 case 2:
187 if (nomem) {
188 ip_vs_drop_rate = ip_vs_drop_counter
189 = sysctl_ip_vs_amemthresh /
190 (sysctl_ip_vs_amemthresh-availmem);
191 } else {
192 ip_vs_drop_rate = 0;
193 sysctl_ip_vs_drop_packet = 1;
194 }
195 break;
196 case 3:
197 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
198 break;
199 }
200 spin_unlock(&__ip_vs_droppacket_lock);
201
202 /* secure_tcp */
203 write_lock(&__ip_vs_securetcp_lock);
204 switch (sysctl_ip_vs_secure_tcp) {
205 case 0:
206 if (old_secure_tcp >= 2)
207 to_change = 0;
208 break;
209 case 1:
210 if (nomem) {
211 if (old_secure_tcp < 2)
212 to_change = 1;
213 sysctl_ip_vs_secure_tcp = 2;
214 } else {
215 if (old_secure_tcp >= 2)
216 to_change = 0;
217 }
218 break;
219 case 2:
220 if (nomem) {
221 if (old_secure_tcp < 2)
222 to_change = 1;
223 } else {
224 if (old_secure_tcp >= 2)
225 to_change = 0;
226 sysctl_ip_vs_secure_tcp = 1;
227 }
228 break;
229 case 3:
230 if (old_secure_tcp < 2)
231 to_change = 1;
232 break;
233 }
234 old_secure_tcp = sysctl_ip_vs_secure_tcp;
235 if (to_change >= 0)
236 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
237 write_unlock(&__ip_vs_securetcp_lock);
238
239 local_bh_enable();
240}
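
In drop_packet modes 1 and 2 the rate is computed as amemthresh / (amemthresh - availmem), so dropping becomes more aggressive as available memory approaches the watermark: with the default threshold of 1024 pages, 896 free pages give "1 in 8", 768 give "1 in 4", and so on down to dropping every packet. A standalone sketch of that formula with example memory levels:

	#include <stdio.h>

	int main(void)
	{
		const int amemthresh = 1024;			/* default, in pages */
		const int samples[] = { 896, 768, 512, 256 };	/* example availmem values */
		unsigned i;

		for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
			int availmem = samples[i];
			int rate = amemthresh / (amemthresh - availmem);

			printf("availmem=%4d pages -> drop 1 packet in %d\n",
			       availmem, rate);
		}
		return 0;
	}
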
241
242
243/*
244 * Timer for checking the defense
245 */
246#define DEFENSE_TIMER_PERIOD 1*HZ
247static void defense_work_handler(struct work_struct *work);
248static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
249
250static void defense_work_handler(struct work_struct *work)
251{
252 update_defense_level();
253 if (atomic_read(&ip_vs_dropentry))
254 ip_vs_random_dropentry();
255
256 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
257}
258
259int
260ip_vs_use_count_inc(void)
261{
262 return try_module_get(THIS_MODULE);
263}
264
265void
266ip_vs_use_count_dec(void)
267{
268 module_put(THIS_MODULE);
269}
270
271
272/*
273 * Hash table: for virtual service lookups
274 */
275#define IP_VS_SVC_TAB_BITS 8
276#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
277#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
278
279/* the service table hashed by <protocol, addr, port> */
280static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
281/* the service table hashed by fwmark */
282static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
283
284/*
285 * Hash table: for real service lookups
286 */
287#define IP_VS_RTAB_BITS 4
288#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
289#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
290
291static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
292
293/*
294 * Trash for destinations
295 */
296static LIST_HEAD(ip_vs_dest_trash);
297
298/*
299 * FTP & NULL virtual service counters
300 */
301static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
302static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
303
304
305/*
306 * Returns hash value for virtual service
307 */
308static __inline__ unsigned
309ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
310 __be16 port)
311{
312 register unsigned porth = ntohs(port);
313 __be32 addr_fold = addr->ip;
314
315#ifdef CONFIG_IP_VS_IPV6
316 if (af == AF_INET6)
317 addr_fold = addr->ip6[0]^addr->ip6[1]^
318 addr->ip6[2]^addr->ip6[3];
319#endif
320
321 return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
322 & IP_VS_SVC_TAB_MASK;
323}
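
The service hash folds the protocol, the (XOR-folded, in the IPv6 case) address and the port into IP_VS_SVC_TAB_BITS bits of bucket index. A user-space rendering of the IPv4 case, using the same expression as above with the 8-bit table size defined a few lines earlier; the VIP and port are examples:

	#include <stdio.h>
	#include <stdint.h>
	#include <arpa/inet.h>

	#define SVC_TAB_BITS 8
	#define SVC_TAB_MASK ((1u << SVC_TAB_BITS) - 1)

	static unsigned svc_hashkey(unsigned proto, uint32_t addr_be, uint16_t port_be)
	{
		unsigned porth = ntohs(port_be);

		return (proto ^ ntohl(addr_be) ^ (porth >> SVC_TAB_BITS) ^ porth)
			& SVC_TAB_MASK;
	}

	int main(void)
	{
		struct in_addr vip;

		inet_pton(AF_INET, "192.0.2.1", &vip);	/* example VIP */
		printf("bucket = %u\n",
		       svc_hashkey(6 /* TCP */, vip.s_addr, htons(80)));
		return 0;
	}
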
324
325/*
326 * Returns hash value of fwmark for virtual service lookup
327 */
328static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
329{
330 return fwmark & IP_VS_SVC_TAB_MASK;
331}
332
333/*
334 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
335 * or in the ip_vs_svc_fwm_table by fwmark.
336 * Should be called with locked tables.
337 */
338static int ip_vs_svc_hash(struct ip_vs_service *svc)
339{
340 unsigned hash;
341
342 if (svc->flags & IP_VS_SVC_F_HASHED) {
343 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
344 "called from %p\n", __builtin_return_address(0));
345 return 0;
346 }
347
348 if (svc->fwmark == 0) {
349 /*
350 * Hash it by <protocol,addr,port> in ip_vs_svc_table
351 */
352 hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr,
353 svc->port);
354 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
355 } else {
356 /*
357 * Hash it by fwmark in ip_vs_svc_fwm_table
358 */
359 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
360 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
361 }
362
363 svc->flags |= IP_VS_SVC_F_HASHED;
364 /* increase its refcnt because it is referenced by the svc table */
365 atomic_inc(&svc->refcnt);
366 return 1;
367}
368
369
370/*
371 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
372 * Should be called with locked tables.
373 */
374static int ip_vs_svc_unhash(struct ip_vs_service *svc)
375{
376 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
377 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
378 "called from %p\n", __builtin_return_address(0));
379 return 0;
380 }
381
382 if (svc->fwmark == 0) {
383 /* Remove it from the ip_vs_svc_table table */
384 list_del(&svc->s_list);
385 } else {
386 /* Remove it from the ip_vs_svc_fwm_table table */
387 list_del(&svc->f_list);
388 }
389
390 svc->flags &= ~IP_VS_SVC_F_HASHED;
391 atomic_dec(&svc->refcnt);
392 return 1;
393}
394
395
396/*
397 * Get service by {proto,addr,port} in the service table.
398 */
399static inline struct ip_vs_service *
400__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
401 __be16 vport)
402{
403 unsigned hash;
404 struct ip_vs_service *svc;
405
406 /* Check for "full" addressed entries */
407 hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport);
408
409 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
410 if ((svc->af == af)
411 && ip_vs_addr_equal(af, &svc->addr, vaddr)
412 && (svc->port == vport)
413 && (svc->protocol == protocol)) {
414 /* HIT */
415 atomic_inc(&svc->usecnt);
416 return svc;
417 }
418 }
419
420 return NULL;
421}
422
423
424/*
425 * Get service by {fwmark} in the service table.
426 */
427static inline struct ip_vs_service *
428__ip_vs_svc_fwm_get(int af, __u32 fwmark)
429{
430 unsigned hash;
431 struct ip_vs_service *svc;
432
433 /* Check for fwmark addressed entries */
434 hash = ip_vs_svc_fwm_hashkey(fwmark);
435
436 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
437 if (svc->fwmark == fwmark && svc->af == af) {
438 /* HIT */
439 atomic_inc(&svc->usecnt);
440 return svc;
441 }
442 }
443
444 return NULL;
445}
446
447struct ip_vs_service *
448ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
449 const union nf_inet_addr *vaddr, __be16 vport)
450{
451 struct ip_vs_service *svc;
452
453 read_lock(&__ip_vs_svc_lock);
454
455 /*
456 * Check the table hashed by fwmark first
457 */
458 if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark)))
459 goto out;
460
461 /*
462 * Check the table hashed by <protocol,addr,port>
463 * for "full" addressed entries
464 */
465 svc = __ip_vs_service_get(af, protocol, vaddr, vport);
466
467 if (svc == NULL
468 && protocol == IPPROTO_TCP
469 && atomic_read(&ip_vs_ftpsvc_counter)
470 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
471 /*
472 * Check if ftp service entry exists, the packet
473 * might belong to FTP data connections.
474 */
475 svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT);
476 }
477
478 if (svc == NULL
479 && atomic_read(&ip_vs_nullsvc_counter)) {
480 /*
481 * Check if the catch-all port (port zero) exists
482 */
483 svc = __ip_vs_service_get(af, protocol, vaddr, 0);
484 }
485
486 out:
487 read_unlock(&__ip_vs_svc_lock);
488
489 IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
490 fwmark, ip_vs_proto_name(protocol),
491 IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
492 svc ? "hit" : "not hit");
493
494 return svc;
495}
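/*
 * Summary of the lookup order above:
 *
 *	1. the fwmark table, if a non-zero fwmark is given
 *	2. an exact <protocol, vaddr, vport> match
 *	3. the FTP control service <protocol, vaddr, FTPPORT>, for TCP
 *	   packets that may belong to FTP data connections (only when an
 *	   FTP virtual service exists)
 *	4. the port-zero ("catch-all") service for <protocol, vaddr>
 *	   (only when such a service exists)
 *
 * The first hit is returned with its usecnt already incremented, so
 * every caller has to release it with ip_vs_service_put().
 */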
496
497
498static inline void
499__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
500{
501 atomic_inc(&svc->refcnt);
502 dest->svc = svc;
503}
504
505static inline void
506__ip_vs_unbind_svc(struct ip_vs_dest *dest)
507{
508 struct ip_vs_service *svc = dest->svc;
509
510 dest->svc = NULL;
511 if (atomic_dec_and_test(&svc->refcnt))
512 kfree(svc);
513}
514
515
516/*
517 * Returns hash value for real service
518 */
519static inline unsigned ip_vs_rs_hashkey(int af,
520 const union nf_inet_addr *addr,
521 __be16 port)
522{
523 register unsigned porth = ntohs(port);
524 __be32 addr_fold = addr->ip;
525
526#ifdef CONFIG_IP_VS_IPV6
527 if (af == AF_INET6)
528 addr_fold = addr->ip6[0]^addr->ip6[1]^
529 addr->ip6[2]^addr->ip6[3];
530#endif
531
532 return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
533 & IP_VS_RTAB_MASK;
534}
535
536/*
537 * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
538 * should be called with locked tables.
539 */
540static int ip_vs_rs_hash(struct ip_vs_dest *dest)
541{
542 unsigned hash;
543
544 if (!list_empty(&dest->d_list)) {
545 return 0;
546 }
547
548 /*
549 * Hash by proto,addr,port,
550 * which are the parameters of the real service.
551 */
552 hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
553
554 list_add(&dest->d_list, &ip_vs_rtable[hash]);
555
556 return 1;
557}
558
559/*
560 * UNhashes ip_vs_dest from ip_vs_rtable.
561 * should be called with locked tables.
562 */
563static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
564{
565 /*
566 * Remove it from the ip_vs_rtable table.
567 */
568 if (!list_empty(&dest->d_list)) {
569 list_del(&dest->d_list);
570 INIT_LIST_HEAD(&dest->d_list);
571 }
572
573 return 1;
574}
575
576/*
577 * Lookup real service by <proto,addr,port> in the real service table.
578 */
579struct ip_vs_dest *
580ip_vs_lookup_real_service(int af, __u16 protocol,
581 const union nf_inet_addr *daddr,
582 __be16 dport)
583{
584 unsigned hash;
585 struct ip_vs_dest *dest;
586
587 /*
588 * Check for "full" addressed entries
589 * Return the first found entry
590 */
591 hash = ip_vs_rs_hashkey(af, daddr, dport);
592
593 read_lock(&__ip_vs_rs_lock);
594 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
595 if ((dest->af == af)
596 && ip_vs_addr_equal(af, &dest->addr, daddr)
597 && (dest->port == dport)
598 && ((dest->protocol == protocol) ||
599 dest->vfwmark)) {
600 /* HIT */
601 read_unlock(&__ip_vs_rs_lock);
602 return dest;
603 }
604 }
605 read_unlock(&__ip_vs_rs_lock);
606
607 return NULL;
608}
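/*
 * This lookup is used on the output/reply path (ip_vs_out() in
 * ip_vs_core.c) to recognize packets whose source is a hashed real
 * server.  Only NAT destinations are hashed into ip_vs_rtable, see
 * __ip_vs_update_dest() below.
 */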
609
610/*
611 * Lookup destination by {addr,port} in the given service
612 */
613static struct ip_vs_dest *
614ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
615 __be16 dport)
616{
617 struct ip_vs_dest *dest;
618
619 /*
620 * Find the destination for the given service
621 */
622 list_for_each_entry(dest, &svc->destinations, n_list) {
623 if ((dest->af == svc->af)
624 && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
625 && (dest->port == dport)) {
626 /* HIT */
627 return dest;
628 }
629 }
630
631 return NULL;
632}
633
634/*
635 * Find destination by {daddr,dport,vaddr,protocol}
636 * Created to be used in ip_vs_process_message() in
637 * the backup synchronization daemon. It finds the
638 * destination to be bound to the received connection
639 * on the backup.
640 *
641 * ip_vs_lookup_real_service() looked promising, but
642 * does not seem to work as expected.
643 */
644struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr,
645 __be16 dport,
646 const union nf_inet_addr *vaddr,
647 __be16 vport, __u16 protocol)
648{
649 struct ip_vs_dest *dest;
650 struct ip_vs_service *svc;
651
652 svc = ip_vs_service_get(af, 0, protocol, vaddr, vport);
653 if (!svc)
654 return NULL;
655 dest = ip_vs_lookup_dest(svc, daddr, dport);
656 if (dest)
657 atomic_inc(&dest->refcnt);
658 ip_vs_service_put(svc);
659 return dest;
660}
661
662/*
663 * Lookup dest by {svc,addr,port} in the destination trash.
664 * The destination trash is used to hold the destinations that are removed
665 * from the service table but are still referenced by some conn entries.
666 * The reason for the destination trash is that when a dest is temporarily
667 * taken down (either by the administrator or by a monitor program), it can
668 * be picked back from the trash, the remaining connections to the dest can
669 * continue, and the counting information of the dest is still useful for
670 * scheduling.
671 */
672static struct ip_vs_dest *
673ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
674 __be16 dport)
675{
676 struct ip_vs_dest *dest, *nxt;
677
678 /*
679 * Find the destination in trash
680 */
681 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
682 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
683 "dest->refcnt=%d\n",
684 dest->vfwmark,
685 IP_VS_DBG_ADDR(svc->af, &dest->addr),
686 ntohs(dest->port),
687 atomic_read(&dest->refcnt));
688 if (dest->af == svc->af &&
689 ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
690 dest->port == dport &&
691 dest->vfwmark == svc->fwmark &&
692 dest->protocol == svc->protocol &&
693 (svc->fwmark ||
694 (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
695 dest->vport == svc->port))) {
696 /* HIT */
697 return dest;
698 }
699
700 /*
701 * Try to purge the destination from trash if not referenced
702 */
703 if (atomic_read(&dest->refcnt) == 1) {
704 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
705 "from trash\n",
706 dest->vfwmark,
707 IP_VS_DBG_ADDR(svc->af, &dest->addr),
708 ntohs(dest->port));
709 list_del(&dest->n_list);
710 ip_vs_dst_reset(dest);
711 __ip_vs_unbind_svc(dest);
712 kfree(dest);
713 }
714 }
715
716 return NULL;
717}
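/*
 * Note on the refcnt check above: __ip_vs_del_dest() below moves a
 * destination into the trash with its refcnt raised by one, so a trash
 * entry whose refcnt has dropped back to 1 is referenced by the trash
 * list only and can safely be purged while walking the list.
 */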
718
719
720/*
721 * Clean up all the destinations in the trash
722 * Called by the ip_vs_control_cleanup()
723 *
724 * When ip_vs_control_cleanup is called at ipvs module exit,
725 * the service tables must have been flushed and all the connections
726 * have expired, and the refcnt of each destination in the trash must
727 * be 1, so we simply release them here.
728 */
729static void ip_vs_trash_cleanup(void)
730{
731 struct ip_vs_dest *dest, *nxt;
732
733 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
734 list_del(&dest->n_list);
735 ip_vs_dst_reset(dest);
736 __ip_vs_unbind_svc(dest);
737 kfree(dest);
738 }
739}
740
741
742static void
743ip_vs_zero_stats(struct ip_vs_stats *stats)
744{
745 spin_lock_bh(&stats->lock);
746
747 memset(&stats->ustats, 0, sizeof(stats->ustats));
748 ip_vs_zero_estimator(stats);
749
750 spin_unlock_bh(&stats->lock);
751}
752
753/*
754 * Update a destination in the given service
755 */
756static void
757__ip_vs_update_dest(struct ip_vs_service *svc,
758 struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest)
759{
760 int conn_flags;
761
762 /* set the weight and the flags */
763 atomic_set(&dest->weight, udest->weight);
764 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
765
766 /* check if local node and update the flags */
767#ifdef CONFIG_IP_VS_IPV6
768 if (svc->af == AF_INET6) {
769 if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) {
770 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
771 | IP_VS_CONN_F_LOCALNODE;
772 }
773 } else
774#endif
775 if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
776 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
777 | IP_VS_CONN_F_LOCALNODE;
778 }
779
780 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
781 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
782 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
783 } else {
784 /*
785 * Put the real service in ip_vs_rtable if not present.
786 * For now only for NAT!
787 */
788 write_lock_bh(&__ip_vs_rs_lock);
789 ip_vs_rs_hash(dest);
790 write_unlock_bh(&__ip_vs_rs_lock);
791 }
792 atomic_set(&dest->conn_flags, conn_flags);
793
794 /* bind the service */
795 if (!dest->svc) {
796 __ip_vs_bind_svc(dest, svc);
797 } else {
798 if (dest->svc != svc) {
799 __ip_vs_unbind_svc(dest);
800 ip_vs_zero_stats(&dest->stats);
801 __ip_vs_bind_svc(dest, svc);
802 }
803 }
804
805 /* set the dest status flags */
806 dest->flags |= IP_VS_DEST_F_AVAILABLE;
807
808 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
809 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
810 dest->u_threshold = udest->u_threshold;
811 dest->l_threshold = udest->l_threshold;
812}
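/*
 * Net effect of the conn_flags handling above:
 *
 *	- dest address local to this host: the forwarding method is
 *	  forced to IP_VS_CONN_F_LOCALNODE
 *	- any non-NAT method (FWD_MASK != 0, i.e. DR, TUN or LOCALNODE):
 *	  IP_VS_CONN_F_NOOUTPUT is set, as no output packet rewriting is
 *	  needed
 *	- NAT (masquerading, FWD_MASK == 0): the dest is hashed into
 *	  ip_vs_rtable so that reply packets from the real server can be
 *	  matched by ip_vs_lookup_real_service()
 */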
813
814
815/*
816 * Create a destination for the given service
817 */
818static int
819ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
820 struct ip_vs_dest **dest_p)
821{
822 struct ip_vs_dest *dest;
823 unsigned atype;
824
825 EnterFunction(2);
826
827#ifdef CONFIG_IP_VS_IPV6
828 if (svc->af == AF_INET6) {
829 atype = ipv6_addr_type(&udest->addr.in6);
830 if ((!(atype & IPV6_ADDR_UNICAST) ||
831 atype & IPV6_ADDR_LINKLOCAL) &&
832 !__ip_vs_addr_is_local_v6(&udest->addr.in6))
833 return -EINVAL;
834 } else
835#endif
836 {
837 atype = inet_addr_type(&init_net, udest->addr.ip);
838 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
839 return -EINVAL;
840 }
841
842 dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
843 if (dest == NULL) {
844 IP_VS_ERR("ip_vs_new_dest: kzalloc failed.\n");
845 return -ENOMEM;
846 }
847
848 dest->af = svc->af;
849 dest->protocol = svc->protocol;
850 dest->vaddr = svc->addr;
851 dest->vport = svc->port;
852 dest->vfwmark = svc->fwmark;
853 ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
854 dest->port = udest->port;
855
856 atomic_set(&dest->activeconns, 0);
857 atomic_set(&dest->inactconns, 0);
858 atomic_set(&dest->persistconns, 0);
859 atomic_set(&dest->refcnt, 0);
860
861 INIT_LIST_HEAD(&dest->d_list);
862 spin_lock_init(&dest->dst_lock);
863 spin_lock_init(&dest->stats.lock);
864 __ip_vs_update_dest(svc, dest, udest);
865 ip_vs_new_estimator(&dest->stats);
866
867 *dest_p = dest;
868
869 LeaveFunction(2);
870 return 0;
871}
872
873
874/*
875 * Add a destination into an existing service
876 */
877static int
878ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
879{
880 struct ip_vs_dest *dest;
881 union nf_inet_addr daddr;
882 __be16 dport = udest->port;
883 int ret;
884
885 EnterFunction(2);
886
887 if (udest->weight < 0) {
888 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
889 return -ERANGE;
890 }
891
892 if (udest->l_threshold > udest->u_threshold) {
893 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
894 "upper threshold\n");
895 return -ERANGE;
896 }
897
898 ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
899
900 /*
901 * Check if the dest already exists in the list
902 */
903 dest = ip_vs_lookup_dest(svc, &daddr, dport);
904
905 if (dest != NULL) {
906 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
907 return -EEXIST;
908 }
909
910 /*
911 * Check if the dest already exists in the trash and
912 * is from the same service
913 */
914 dest = ip_vs_trash_get_dest(svc, &daddr, dport);
915
916 if (dest != NULL) {
917 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
918 "dest->refcnt=%d, service %u/%s:%u\n",
919 IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
920 atomic_read(&dest->refcnt),
921 dest->vfwmark,
922 IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
923 ntohs(dest->vport));
924
925 __ip_vs_update_dest(svc, dest, udest);
926
927 /*
928 * Get the destination from the trash
929 */
930 list_del(&dest->n_list);
931
932 ip_vs_new_estimator(&dest->stats);
933
934 write_lock_bh(&__ip_vs_svc_lock);
935
936 /*
937 * Wait until all other svc users go away.
938 */
939 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
940
941 list_add(&dest->n_list, &svc->destinations);
942 svc->num_dests++;
943
944 /* call the update_service function of its scheduler */
945 if (svc->scheduler->update_service)
946 svc->scheduler->update_service(svc);
947
948 write_unlock_bh(&__ip_vs_svc_lock);
949 return 0;
950 }
951
952 /*
953 * Allocate and initialize the dest structure
954 */
955 ret = ip_vs_new_dest(svc, udest, &dest);
956 if (ret) {
957 return ret;
958 }
959
960 /*
961 * Add the dest entry into the list
962 */
963 atomic_inc(&dest->refcnt);
964
965 write_lock_bh(&__ip_vs_svc_lock);
966
967 /*
968 * Wait until all other svc users go away.
969 */
970 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
971
972 list_add(&dest->n_list, &svc->destinations);
973 svc->num_dests++;
974
975 /* call the update_service function of its scheduler */
976 if (svc->scheduler->update_service)
977 svc->scheduler->update_service(svc);
978
979 write_unlock_bh(&__ip_vs_svc_lock);
980
981 LeaveFunction(2);
982
983 return 0;
984}
985
986
987/*
988 * Edit a destination in the given service
989 */
990static int
991ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
992{
993 struct ip_vs_dest *dest;
994 union nf_inet_addr daddr;
995 __be16 dport = udest->port;
996
997 EnterFunction(2);
998
999 if (udest->weight < 0) {
1000 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
1001 return -ERANGE;
1002 }
1003
1004 if (udest->l_threshold > udest->u_threshold) {
1005 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
1006 "upper threshold\n");
1007 return -ERANGE;
1008 }
1009
1010 ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
1011
1012 /*
1013 * Lookup the destination list
1014 */
1015 dest = ip_vs_lookup_dest(svc, &daddr, dport);
1016
1017 if (dest == NULL) {
1018 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
1019 return -ENOENT;
1020 }
1021
1022 __ip_vs_update_dest(svc, dest, udest);
1023
1024 write_lock_bh(&__ip_vs_svc_lock);
1025
1026 /* Wait until all other svc users go away */
1027 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1028
1029 /* call the update_service, because server weight may be changed */
1030 if (svc->scheduler->update_service)
1031 svc->scheduler->update_service(svc);
1032
1033 write_unlock_bh(&__ip_vs_svc_lock);
1034
1035 LeaveFunction(2);
1036
1037 return 0;
1038}
1039
1040
1041/*
1042 * Delete a destination (must be already unlinked from the service)
1043 */
1044static void __ip_vs_del_dest(struct ip_vs_dest *dest)
1045{
1046 ip_vs_kill_estimator(&dest->stats);
1047
1048 /*
1049 * Remove it from the d-linked list with the real services.
1050 */
1051 write_lock_bh(&__ip_vs_rs_lock);
1052 ip_vs_rs_unhash(dest);
1053 write_unlock_bh(&__ip_vs_rs_lock);
1054
1055 /*
1056 * Decrease the refcnt of the dest, and free the dest
1057 * if nobody refers to it (refcnt=0). Otherwise, throw
1058 * the destination into the trash.
1059 */
1060 if (atomic_dec_and_test(&dest->refcnt)) {
1061 ip_vs_dst_reset(dest);
1062 /* simply decrease svc->refcnt here, let the caller check
1063 and release the service if nobody refers to it.
1064 Only user context can release destination and service,
1065 and only one user context can update virtual service at a
1066 time, so the operation here is OK */
1067 atomic_dec(&dest->svc->refcnt);
1068 kfree(dest);
1069 } else {
1070 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
1071 "dest->refcnt=%d\n",
1072 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1073 ntohs(dest->port),
1074 atomic_read(&dest->refcnt));
1075 list_add(&dest->n_list, &ip_vs_dest_trash);
1076 atomic_inc(&dest->refcnt);
1077 }
1078}
1079
1080
1081/*
1082 * Unlink a destination from the given service
1083 */
1084static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1085 struct ip_vs_dest *dest,
1086 int svcupd)
1087{
1088 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1089
1090 /*
1091 * Remove it from the d-linked destination list.
1092 */
1093 list_del(&dest->n_list);
1094 svc->num_dests--;
1095
1096 /*
1097 * Call the update_service function of its scheduler
1098 */
1099 if (svcupd && svc->scheduler->update_service)
1100 svc->scheduler->update_service(svc);
1101}
1102
1103
1104/*
1105 * Delete a destination server in the given service
1106 */
1107static int
1108ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1109{
1110 struct ip_vs_dest *dest;
1111 __be16 dport = udest->port;
1112
1113 EnterFunction(2);
1114
1115 dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
1116
1117 if (dest == NULL) {
1118 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1119 return -ENOENT;
1120 }
1121
1122 write_lock_bh(&__ip_vs_svc_lock);
1123
1124 /*
1125 * Wait until all other svc users go away.
1126 */
1127 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1128
1129 /*
1130 * Unlink dest from the service
1131 */
1132 __ip_vs_unlink_dest(svc, dest, 1);
1133
1134 write_unlock_bh(&__ip_vs_svc_lock);
1135
1136 /*
1137 * Delete the destination
1138 */
1139 __ip_vs_del_dest(dest);
1140
1141 LeaveFunction(2);
1142
1143 return 0;
1144}
1145
1146
1147/*
1148 * Add a service into the service hash table
1149 */
1150static int
1151ip_vs_add_service(struct ip_vs_service_user_kern *u,
1152 struct ip_vs_service **svc_p)
1153{
1154 int ret = 0;
1155 struct ip_vs_scheduler *sched = NULL;
1156 struct ip_vs_service *svc = NULL;
1157
1158 /* increase the module use count */
1159 ip_vs_use_count_inc();
1160
1161 /* Lookup the scheduler by 'u->sched_name' */
1162 sched = ip_vs_scheduler_get(u->sched_name);
1163 if (sched == NULL) {
1164 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1165 u->sched_name);
1166 ret = -ENOENT;
1167 goto out_mod_dec;
1168 }
1169
1170#ifdef CONFIG_IP_VS_IPV6
1171 if (u->af == AF_INET6) {
1172 if (!sched->supports_ipv6) {
1173 ret = -EAFNOSUPPORT;
1174 goto out_err;
1175 }
1176 if ((u->netmask < 1) || (u->netmask > 128)) {
1177 ret = -EINVAL;
1178 goto out_err;
1179 }
1180 }
1181#endif
1182
1183 svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1184 if (svc == NULL) {
1185 IP_VS_DBG(1, "ip_vs_add_service: kzalloc failed.\n");
1186 ret = -ENOMEM;
1187 goto out_err;
1188 }
1189
1190 /* I'm the first user of the service */
1191 atomic_set(&svc->usecnt, 1);
1192 atomic_set(&svc->refcnt, 0);
1193
1194 svc->af = u->af;
1195 svc->protocol = u->protocol;
1196 ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1197 svc->port = u->port;
1198 svc->fwmark = u->fwmark;
1199 svc->flags = u->flags;
1200 svc->timeout = u->timeout * HZ;
1201 svc->netmask = u->netmask;
1202
1203 INIT_LIST_HEAD(&svc->destinations);
1204 rwlock_init(&svc->sched_lock);
1205 spin_lock_init(&svc->stats.lock);
1206
1207 /* Bind the scheduler */
1208 ret = ip_vs_bind_scheduler(svc, sched);
1209 if (ret)
1210 goto out_err;
1211 sched = NULL;
1212
1213 /* Update the virtual service counters */
1214 if (svc->port == FTPPORT)
1215 atomic_inc(&ip_vs_ftpsvc_counter);
1216 else if (svc->port == 0)
1217 atomic_inc(&ip_vs_nullsvc_counter);
1218
1219 ip_vs_new_estimator(&svc->stats);
1220
1221 /* Count only IPv4 services for old get/setsockopt interface */
1222 if (svc->af == AF_INET)
1223 ip_vs_num_services++;
1224
1225 /* Hash the service into the service table */
1226 write_lock_bh(&__ip_vs_svc_lock);
1227 ip_vs_svc_hash(svc);
1228 write_unlock_bh(&__ip_vs_svc_lock);
1229
1230 *svc_p = svc;
1231 return 0;
1232
1233 out_err:
1234 if (svc != NULL) {
1235 if (svc->scheduler)
1236 ip_vs_unbind_scheduler(svc);
1237 if (svc->inc) {
1238 local_bh_disable();
1239 ip_vs_app_inc_put(svc->inc);
1240 local_bh_enable();
1241 }
1242 kfree(svc);
1243 }
1244 ip_vs_scheduler_put(sched);
1245
1246 out_mod_dec:
1247 /* decrease the module use count */
1248 ip_vs_use_count_dec();
1249
1250 return ret;
1251}
1252
1253
1254/*
1255 * Edit a service and bind it with a new scheduler
1256 */
1257static int
1258ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1259{
1260 struct ip_vs_scheduler *sched, *old_sched;
1261 int ret = 0;
1262
1263 /*
1264 * Lookup the scheduler, by 'u->sched_name'
1265 */
1266 sched = ip_vs_scheduler_get(u->sched_name);
1267 if (sched == NULL) {
1268 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1269 u->sched_name);
1270 return -ENOENT;
1271 }
1272 old_sched = sched;
1273
1274#ifdef CONFIG_IP_VS_IPV6
1275 if (u->af == AF_INET6) {
1276 if (!sched->supports_ipv6) {
1277 ret = -EAFNOSUPPORT;
1278 goto out;
1279 }
1280 if ((u->netmask < 1) || (u->netmask > 128)) {
1281 ret = -EINVAL;
1282 goto out;
1283 }
1284 }
1285#endif
1286
1287 write_lock_bh(&__ip_vs_svc_lock);
1288
1289 /*
1290 * Wait until all other svc users go away.
1291 */
1292 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1293
1294 /*
1295 * Set the flags and timeout value
1296 */
1297 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1298 svc->timeout = u->timeout * HZ;
1299 svc->netmask = u->netmask;
1300
1301 old_sched = svc->scheduler;
1302 if (sched != old_sched) {
1303 /*
1304 * Unbind the old scheduler
1305 */
1306 if ((ret = ip_vs_unbind_scheduler(svc))) {
1307 old_sched = sched;
1308 goto out_unlock;
1309 }
1310
1311 /*
1312 * Bind the new scheduler
1313 */
1314 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1315 /*
1316 * If ip_vs_bind_scheduler fails, restore the old
1317 * scheduler.
1318 * The main reason of failure is out of memory.
1319 *
1320 * The question is whether the old scheduler can
1321 * always be restored. TODO: if it cannot be
1322 * restored at some point, we must delete the service,
1323 * otherwise the system may crash.
1324 */
1325 ip_vs_bind_scheduler(svc, old_sched);
1326 old_sched = sched;
1327 goto out_unlock;
1328 }
1329 }
1330
1331 out_unlock:
1332 write_unlock_bh(&__ip_vs_svc_lock);
1333#ifdef CONFIG_IP_VS_IPV6
1334 out:
1335#endif
1336
1337 if (old_sched)
1338 ip_vs_scheduler_put(old_sched);
1339
1340 return ret;
1341}
1342
1343
1344/*
1345 * Delete a service from the service list
1346 * - The service must be unlinked, unlocked and not referenced!
1347 * - We are called under _bh lock
1348 */
1349static void __ip_vs_del_service(struct ip_vs_service *svc)
1350{
1351 struct ip_vs_dest *dest, *nxt;
1352 struct ip_vs_scheduler *old_sched;
1353
1354 /* Count only IPv4 services for old get/setsockopt interface */
1355 if (svc->af == AF_INET)
1356 ip_vs_num_services--;
1357
1358 ip_vs_kill_estimator(&svc->stats);
1359
1360 /* Unbind scheduler */
1361 old_sched = svc->scheduler;
1362 ip_vs_unbind_scheduler(svc);
1363 if (old_sched)
1364 ip_vs_scheduler_put(old_sched);
1365
1366 /* Unbind app inc */
1367 if (svc->inc) {
1368 ip_vs_app_inc_put(svc->inc);
1369 svc->inc = NULL;
1370 }
1371
1372 /*
1373 * Unlink the whole destination list
1374 */
1375 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1376 __ip_vs_unlink_dest(svc, dest, 0);
1377 __ip_vs_del_dest(dest);
1378 }
1379
1380 /*
1381 * Update the virtual service counters
1382 */
1383 if (svc->port == FTPPORT)
1384 atomic_dec(&ip_vs_ftpsvc_counter);
1385 else if (svc->port == 0)
1386 atomic_dec(&ip_vs_nullsvc_counter);
1387
1388 /*
1389 * Free the service if nobody refers to it
1390 */
1391 if (atomic_read(&svc->refcnt) == 0)
1392 kfree(svc);
1393
1394 /* decrease the module use count */
1395 ip_vs_use_count_dec();
1396}
1397
1398/*
1399 * Delete a service from the service list
1400 */
1401static int ip_vs_del_service(struct ip_vs_service *svc)
1402{
1403 if (svc == NULL)
1404 return -EEXIST;
1405
1406 /*
1407 * Unhash it from the service table
1408 */
1409 write_lock_bh(&__ip_vs_svc_lock);
1410
1411 ip_vs_svc_unhash(svc);
1412
1413 /*
1414 * Wait until all the svc users go away.
1415 */
1416 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1417
1418 __ip_vs_del_service(svc);
1419
1420 write_unlock_bh(&__ip_vs_svc_lock);
1421
1422 return 0;
1423}
1424
1425
1426/*
1427 * Flush all the virtual services
1428 */
1429static int ip_vs_flush(void)
1430{
1431 int idx;
1432 struct ip_vs_service *svc, *nxt;
1433
1434 /*
1435 * Flush the service table hashed by <protocol,addr,port>
1436 */
1437 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1438 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1439 write_lock_bh(&__ip_vs_svc_lock);
1440 ip_vs_svc_unhash(svc);
1441 /*
1442 * Wait until all the svc users go away.
1443 */
1444 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1445 __ip_vs_del_service(svc);
1446 write_unlock_bh(&__ip_vs_svc_lock);
1447 }
1448 }
1449
1450 /*
1451 * Flush the service table hashed by fwmark
1452 */
1453 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1454 list_for_each_entry_safe(svc, nxt,
1455 &ip_vs_svc_fwm_table[idx], f_list) {
1456 write_lock_bh(&__ip_vs_svc_lock);
1457 ip_vs_svc_unhash(svc);
1458 /*
1459 * Wait until all the svc users go away.
1460 */
1461 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1462 __ip_vs_del_service(svc);
1463 write_unlock_bh(&__ip_vs_svc_lock);
1464 }
1465 }
1466
1467 return 0;
1468}
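/*
 * Note the different wait conditions: ip_vs_del_service() above is
 * called with a reference already held by its caller (see the
 * IP_VS_SO_SET_DEL handling in do_ip_vs_set_ctl() below, which also
 * skips ip_vs_service_put() on success), so it waits for usecnt > 1,
 * while ip_vs_flush() walks the tables without such a reference and
 * therefore waits for usecnt > 0.
 */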
1469
1470
1471/*
1472 * Zero counters in a service or all services
1473 */
1474static int ip_vs_zero_service(struct ip_vs_service *svc)
1475{
1476 struct ip_vs_dest *dest;
1477
1478 write_lock_bh(&__ip_vs_svc_lock);
1479 list_for_each_entry(dest, &svc->destinations, n_list) {
1480 ip_vs_zero_stats(&dest->stats);
1481 }
1482 ip_vs_zero_stats(&svc->stats);
1483 write_unlock_bh(&__ip_vs_svc_lock);
1484 return 0;
1485}
1486
1487static int ip_vs_zero_all(void)
1488{
1489 int idx;
1490 struct ip_vs_service *svc;
1491
1492 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1493 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1494 ip_vs_zero_service(svc);
1495 }
1496 }
1497
1498 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1499 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1500 ip_vs_zero_service(svc);
1501 }
1502 }
1503
1504 ip_vs_zero_stats(&ip_vs_stats);
1505 return 0;
1506}
1507
1508
1509static int
1510proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1511 void __user *buffer, size_t *lenp, loff_t *ppos)
1512{
1513 int *valp = table->data;
1514 int val = *valp;
1515 int rc;
1516
1517 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1518 if (write && (*valp != val)) {
1519 if ((*valp < 0) || (*valp > 3)) {
1520 /* Restore the correct value */
1521 *valp = val;
1522 } else {
1523 update_defense_level();
1524 }
1525 }
1526 return rc;
1527}
1528
1529
1530static int
1531proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1532 void __user *buffer, size_t *lenp, loff_t *ppos)
1533{
1534 int *valp = table->data;
1535 int val[2];
1536 int rc;
1537
1538 /* backup the value first */
1539 memcpy(val, valp, sizeof(val));
1540
1541 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1542 if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1543 /* Restore the correct value */
1544 memcpy(valp, val, sizeof(val));
1545 }
1546 return rc;
1547}
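/*
 * With the handler above, a write like
 *
 *	echo "3 50" > /proc/sys/net/ipv4/vs/sync_threshold
 *
 * is accepted (the first value is a per-connection packet-count
 * threshold, the second the period used by the connection
 * synchronization code), while writes where either value is negative
 * or the first is not strictly below the second are silently reverted
 * to the previous setting.
 */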
1548
1549
1550/*
1551 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1552 */
1553
1554static struct ctl_table vs_vars[] = {
1555 {
1556 .procname = "amemthresh",
1557 .data = &sysctl_ip_vs_amemthresh,
1558 .maxlen = sizeof(int),
1559 .mode = 0644,
1560 .proc_handler = &proc_dointvec,
1561 },
1562#ifdef CONFIG_IP_VS_DEBUG
1563 {
1564 .procname = "debug_level",
1565 .data = &sysctl_ip_vs_debug_level,
1566 .maxlen = sizeof(int),
1567 .mode = 0644,
1568 .proc_handler = &proc_dointvec,
1569 },
1570#endif
1571 {
1572 .procname = "am_droprate",
1573 .data = &sysctl_ip_vs_am_droprate,
1574 .maxlen = sizeof(int),
1575 .mode = 0644,
1576 .proc_handler = &proc_dointvec,
1577 },
1578 {
1579 .procname = "drop_entry",
1580 .data = &sysctl_ip_vs_drop_entry,
1581 .maxlen = sizeof(int),
1582 .mode = 0644,
1583 .proc_handler = &proc_do_defense_mode,
1584 },
1585 {
1586 .procname = "drop_packet",
1587 .data = &sysctl_ip_vs_drop_packet,
1588 .maxlen = sizeof(int),
1589 .mode = 0644,
1590 .proc_handler = &proc_do_defense_mode,
1591 },
1592 {
1593 .procname = "secure_tcp",
1594 .data = &sysctl_ip_vs_secure_tcp,
1595 .maxlen = sizeof(int),
1596 .mode = 0644,
1597 .proc_handler = &proc_do_defense_mode,
1598 },
1599#if 0
1600 {
1601 .procname = "timeout_established",
1602 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1603 .maxlen = sizeof(int),
1604 .mode = 0644,
1605 .proc_handler = &proc_dointvec_jiffies,
1606 },
1607 {
1608 .procname = "timeout_synsent",
1609 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1610 .maxlen = sizeof(int),
1611 .mode = 0644,
1612 .proc_handler = &proc_dointvec_jiffies,
1613 },
1614 {
1615 .procname = "timeout_synrecv",
1616 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1617 .maxlen = sizeof(int),
1618 .mode = 0644,
1619 .proc_handler = &proc_dointvec_jiffies,
1620 },
1621 {
1622 .procname = "timeout_finwait",
1623 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1624 .maxlen = sizeof(int),
1625 .mode = 0644,
1626 .proc_handler = &proc_dointvec_jiffies,
1627 },
1628 {
1629 .procname = "timeout_timewait",
1630 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1631 .maxlen = sizeof(int),
1632 .mode = 0644,
1633 .proc_handler = &proc_dointvec_jiffies,
1634 },
1635 {
1636 .procname = "timeout_close",
1637 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1638 .maxlen = sizeof(int),
1639 .mode = 0644,
1640 .proc_handler = &proc_dointvec_jiffies,
1641 },
1642 {
1643 .procname = "timeout_closewait",
1644 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1645 .maxlen = sizeof(int),
1646 .mode = 0644,
1647 .proc_handler = &proc_dointvec_jiffies,
1648 },
1649 {
1650 .procname = "timeout_lastack",
1651 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1652 .maxlen = sizeof(int),
1653 .mode = 0644,
1654 .proc_handler = &proc_dointvec_jiffies,
1655 },
1656 {
1657 .procname = "timeout_listen",
1658 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1659 .maxlen = sizeof(int),
1660 .mode = 0644,
1661 .proc_handler = &proc_dointvec_jiffies,
1662 },
1663 {
1664 .procname = "timeout_synack",
1665 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1666 .maxlen = sizeof(int),
1667 .mode = 0644,
1668 .proc_handler = &proc_dointvec_jiffies,
1669 },
1670 {
1671 .procname = "timeout_udp",
1672 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1673 .maxlen = sizeof(int),
1674 .mode = 0644,
1675 .proc_handler = &proc_dointvec_jiffies,
1676 },
1677 {
1678 .procname = "timeout_icmp",
1679 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1680 .maxlen = sizeof(int),
1681 .mode = 0644,
1682 .proc_handler = &proc_dointvec_jiffies,
1683 },
1684#endif
1685 {
1686 .procname = "cache_bypass",
1687 .data = &sysctl_ip_vs_cache_bypass,
1688 .maxlen = sizeof(int),
1689 .mode = 0644,
1690 .proc_handler = &proc_dointvec,
1691 },
1692 {
1693 .procname = "expire_nodest_conn",
1694 .data = &sysctl_ip_vs_expire_nodest_conn,
1695 .maxlen = sizeof(int),
1696 .mode = 0644,
1697 .proc_handler = &proc_dointvec,
1698 },
1699 {
1700 .procname = "expire_quiescent_template",
1701 .data = &sysctl_ip_vs_expire_quiescent_template,
1702 .maxlen = sizeof(int),
1703 .mode = 0644,
1704 .proc_handler = &proc_dointvec,
1705 },
1706 {
1707 .procname = "sync_threshold",
1708 .data = &sysctl_ip_vs_sync_threshold,
1709 .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
1710 .mode = 0644,
1711 .proc_handler = &proc_do_sync_threshold,
1712 },
1713 {
1714 .procname = "nat_icmp_send",
1715 .data = &sysctl_ip_vs_nat_icmp_send,
1716 .maxlen = sizeof(int),
1717 .mode = 0644,
1718 .proc_handler = &proc_dointvec,
1719 },
1720 { .ctl_name = 0 }
1721};
1722
1723const struct ctl_path net_vs_ctl_path[] = {
1724 { .procname = "net", .ctl_name = CTL_NET, },
1725 { .procname = "ipv4", .ctl_name = NET_IPV4, },
1726 { .procname = "vs", },
1727 { }
1728};
1729EXPORT_SYMBOL_GPL(net_vs_ctl_path);
1730
1731static struct ctl_table_header * sysctl_header;
1732
1733#ifdef CONFIG_PROC_FS
1734
1735struct ip_vs_iter {
1736 struct list_head *table;
1737 int bucket;
1738};
1739
1740/*
1741 * Write the contents of the VS rule table to a PROCfs file.
1742 * (It is kept just for backward compatibility)
1743 */
1744static inline const char *ip_vs_fwd_name(unsigned flags)
1745{
1746 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1747 case IP_VS_CONN_F_LOCALNODE:
1748 return "Local";
1749 case IP_VS_CONN_F_TUNNEL:
1750 return "Tunnel";
1751 case IP_VS_CONN_F_DROUTE:
1752 return "Route";
1753 default:
1754 return "Masq";
1755 }
1756}
1757
1758
1759/* Get the Nth entry in the two lists */
1760static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1761{
1762 struct ip_vs_iter *iter = seq->private;
1763 int idx;
1764 struct ip_vs_service *svc;
1765
1766 /* look in hash by protocol */
1767 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1768 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1769 if (pos-- == 0){
1770 iter->table = ip_vs_svc_table;
1771 iter->bucket = idx;
1772 return svc;
1773 }
1774 }
1775 }
1776
1777 /* keep looking in fwmark */
1778 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1779 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1780 if (pos-- == 0) {
1781 iter->table = ip_vs_svc_fwm_table;
1782 iter->bucket = idx;
1783 return svc;
1784 }
1785 }
1786 }
1787
1788 return NULL;
1789}
1790
1791static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1792__acquires(__ip_vs_svc_lock)
1793{
1794
1795 read_lock_bh(&__ip_vs_svc_lock);
1796 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1797}
1798
1799
1800static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1801{
1802 struct list_head *e;
1803 struct ip_vs_iter *iter;
1804 struct ip_vs_service *svc;
1805
1806 ++*pos;
1807 if (v == SEQ_START_TOKEN)
1808 return ip_vs_info_array(seq,0);
1809
1810 svc = v;
1811 iter = seq->private;
1812
1813 if (iter->table == ip_vs_svc_table) {
1814 /* next service in table hashed by protocol */
1815 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1816 return list_entry(e, struct ip_vs_service, s_list);
1817
1818
1819 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1820 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1821 s_list) {
1822 return svc;
1823 }
1824 }
1825
1826 iter->table = ip_vs_svc_fwm_table;
1827 iter->bucket = -1;
1828 goto scan_fwmark;
1829 }
1830
1831 /* next service in table hashed by fwmark */
1832 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1833 return list_entry(e, struct ip_vs_service, f_list);
1834
1835 scan_fwmark:
1836 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1837 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1838 f_list)
1839 return svc;
1840 }
1841
1842 return NULL;
1843}
1844
1845static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1846__releases(__ip_vs_svc_lock)
1847{
1848 read_unlock_bh(&__ip_vs_svc_lock);
1849}
1850
1851
1852static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1853{
1854 if (v == SEQ_START_TOKEN) {
1855 seq_printf(seq,
1856 "IP Virtual Server version %d.%d.%d (size=%d)\n",
1857 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1858 seq_puts(seq,
1859 "Prot LocalAddress:Port Scheduler Flags\n");
1860 seq_puts(seq,
1861 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1862 } else {
1863 const struct ip_vs_service *svc = v;
1864 const struct ip_vs_iter *iter = seq->private;
1865 const struct ip_vs_dest *dest;
1866
1867 if (iter->table == ip_vs_svc_table) {
1868#ifdef CONFIG_IP_VS_IPV6
1869 if (svc->af == AF_INET6)
1870 seq_printf(seq, "%s [" NIP6_FMT "]:%04X %s ",
1871 ip_vs_proto_name(svc->protocol),
1872 NIP6(svc->addr.in6),
1873 ntohs(svc->port),
1874 svc->scheduler->name);
1875 else
1876#endif
1877 seq_printf(seq, "%s %08X:%04X %s ",
1878 ip_vs_proto_name(svc->protocol),
1879 ntohl(svc->addr.ip),
1880 ntohs(svc->port),
1881 svc->scheduler->name);
1882 } else {
1883 seq_printf(seq, "FWM %08X %s ",
1884 svc->fwmark, svc->scheduler->name);
1885 }
1886
1887 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1888 seq_printf(seq, "persistent %d %08X\n",
1889 svc->timeout,
1890 ntohl(svc->netmask));
1891 else
1892 seq_putc(seq, '\n');
1893
1894 list_for_each_entry(dest, &svc->destinations, n_list) {
1895#ifdef CONFIG_IP_VS_IPV6
1896 if (dest->af == AF_INET6)
1897 seq_printf(seq,
1898 " -> [" NIP6_FMT "]:%04X"
1899 " %-7s %-6d %-10d %-10d\n",
1900 NIP6(dest->addr.in6),
1901 ntohs(dest->port),
1902 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1903 atomic_read(&dest->weight),
1904 atomic_read(&dest->activeconns),
1905 atomic_read(&dest->inactconns));
1906 else
1907#endif
1908 seq_printf(seq,
1909 " -> %08X:%04X "
1910 "%-7s %-6d %-10d %-10d\n",
1911 ntohl(dest->addr.ip),
1912 ntohs(dest->port),
1913 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1914 atomic_read(&dest->weight),
1915 atomic_read(&dest->activeconns),
1916 atomic_read(&dest->inactconns));
1917
1918 }
1919 }
1920 return 0;
1921}
1922
1923static const struct seq_operations ip_vs_info_seq_ops = {
1924 .start = ip_vs_info_seq_start,
1925 .next = ip_vs_info_seq_next,
1926 .stop = ip_vs_info_seq_stop,
1927 .show = ip_vs_info_seq_show,
1928};
1929
1930static int ip_vs_info_open(struct inode *inode, struct file *file)
1931{
1932 return seq_open_private(file, &ip_vs_info_seq_ops,
1933 sizeof(struct ip_vs_iter));
1934}
1935
1936static const struct file_operations ip_vs_info_fops = {
1937 .owner = THIS_MODULE,
1938 .open = ip_vs_info_open,
1939 .read = seq_read,
1940 .llseek = seq_lseek,
1941 .release = seq_release_private,
1942};
1943
1944#endif
1945
1946struct ip_vs_stats ip_vs_stats = {
1947 .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock),
1948};
1949
1950#ifdef CONFIG_PROC_FS
1951static int ip_vs_stats_show(struct seq_file *seq, void *v)
1952{
1953
1954/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1955 seq_puts(seq,
1956 " Total Incoming Outgoing Incoming Outgoing\n");
1957 seq_printf(seq,
1958 " Conns Packets Packets Bytes Bytes\n");
1959
1960 spin_lock_bh(&ip_vs_stats.lock);
1961 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns,
1962 ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts,
1963 (unsigned long long) ip_vs_stats.ustats.inbytes,
1964 (unsigned long long) ip_vs_stats.ustats.outbytes);
1965
1966/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1967 seq_puts(seq,
1968 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1969 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1970 ip_vs_stats.ustats.cps,
1971 ip_vs_stats.ustats.inpps,
1972 ip_vs_stats.ustats.outpps,
1973 ip_vs_stats.ustats.inbps,
1974 ip_vs_stats.ustats.outbps);
1975 spin_unlock_bh(&ip_vs_stats.lock);
1976
1977 return 0;
1978}
1979
1980static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1981{
1982 return single_open(file, ip_vs_stats_show, NULL);
1983}
1984
1985static const struct file_operations ip_vs_stats_fops = {
1986 .owner = THIS_MODULE,
1987 .open = ip_vs_stats_seq_open,
1988 .read = seq_read,
1989 .llseek = seq_lseek,
1990 .release = single_release,
1991};
1992
1993#endif
1994
1995/*
1996 * Set timeout values for tcp tcpfin udp in the timeout_table.
1997 */
1998static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1999{
2000 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
2001 u->tcp_timeout,
2002 u->tcp_fin_timeout,
2003 u->udp_timeout);
2004
2005#ifdef CONFIG_IP_VS_PROTO_TCP
2006 if (u->tcp_timeout) {
2007 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
2008 = u->tcp_timeout * HZ;
2009 }
2010
2011 if (u->tcp_fin_timeout) {
2012 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
2013 = u->tcp_fin_timeout * HZ;
2014 }
2015#endif
2016
2017#ifdef CONFIG_IP_VS_PROTO_UDP
2018 if (u->udp_timeout) {
2019 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
2020 = u->udp_timeout * HZ;
2021 }
2022#endif
2023 return 0;
2024}
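/*
 * The timeouts passed in are in seconds while the protocol timeout
 * tables hold jiffies, hence the "* HZ" above; e.g. with HZ == 250 a
 * tcp_timeout of 900 seconds is stored as 225000 jiffies.
 * __ip_vs_get_timeouts() below divides by HZ again when reporting the
 * values back to userspace.
 */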
2025
2026
2027#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
2028#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
2029#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
2030 sizeof(struct ip_vs_dest_user))
2031#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
2032#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
2033#define MAX_ARG_LEN SVCDEST_ARG_LEN
2034
2035static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
2036 [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
2037 [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
2038 [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
2039 [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
2040 [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
2041 [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
2042 [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
2043 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
2044 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
2045 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
2046 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
2047};
2048
2049static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2050 struct ip_vs_service_user *usvc_compat)
2051{
2052 usvc->af = AF_INET;
2053 usvc->protocol = usvc_compat->protocol;
2054 usvc->addr.ip = usvc_compat->addr;
2055 usvc->port = usvc_compat->port;
2056 usvc->fwmark = usvc_compat->fwmark;
2057
2058 /* Deep copy of sched_name is not needed here */
2059 usvc->sched_name = usvc_compat->sched_name;
2060
2061 usvc->flags = usvc_compat->flags;
2062 usvc->timeout = usvc_compat->timeout;
2063 usvc->netmask = usvc_compat->netmask;
2064}
2065
2066static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2067 struct ip_vs_dest_user *udest_compat)
2068{
2069 udest->addr.ip = udest_compat->addr;
2070 udest->port = udest_compat->port;
2071 udest->conn_flags = udest_compat->conn_flags;
2072 udest->weight = udest_compat->weight;
2073 udest->u_threshold = udest_compat->u_threshold;
2074 udest->l_threshold = udest_compat->l_threshold;
2075}
2076
2077static int
2078do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2079{
2080 int ret;
2081 unsigned char arg[MAX_ARG_LEN];
2082 struct ip_vs_service_user *usvc_compat;
2083 struct ip_vs_service_user_kern usvc;
2084 struct ip_vs_service *svc;
2085 struct ip_vs_dest_user *udest_compat;
2086 struct ip_vs_dest_user_kern udest;
2087
2088 if (!capable(CAP_NET_ADMIN))
2089 return -EPERM;
2090
2091 if (len != set_arglen[SET_CMDID(cmd)]) {
2092 IP_VS_ERR("set_ctl: len %u != %u\n",
2093 len, set_arglen[SET_CMDID(cmd)]);
2094 return -EINVAL;
2095 }
2096
2097 if (copy_from_user(arg, user, len) != 0)
2098 return -EFAULT;
2099
2100 /* increase the module use count */
2101 ip_vs_use_count_inc();
2102
2103 if (mutex_lock_interruptible(&__ip_vs_mutex)) {
2104 ret = -ERESTARTSYS;
2105 goto out_dec;
2106 }
2107
2108 if (cmd == IP_VS_SO_SET_FLUSH) {
2109 /* Flush the virtual service */
2110 ret = ip_vs_flush();
2111 goto out_unlock;
2112 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
2113 /* Set timeout values for (tcp tcpfin udp) */
2114 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
2115 goto out_unlock;
2116 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
2117 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2118 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
2119 goto out_unlock;
2120 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
2121 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2122 ret = stop_sync_thread(dm->state);
2123 goto out_unlock;
2124 }
2125
2126 usvc_compat = (struct ip_vs_service_user *)arg;
2127 udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
2128
2129 /* We only use the new structs internally, so copy userspace compat
2130 * structs to extended internal versions */
2131 ip_vs_copy_usvc_compat(&usvc, usvc_compat);
2132 ip_vs_copy_udest_compat(&udest, udest_compat);
2133
2134 if (cmd == IP_VS_SO_SET_ZERO) {
2135 /* if no service address is set, zero counters in all */
2136 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
2137 ret = ip_vs_zero_all();
2138 goto out_unlock;
2139 }
2140 }
2141
2142 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
2143 if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP) {
2144 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
2145 usvc.protocol, NIPQUAD(usvc.addr.ip),
2146 ntohs(usvc.port), usvc.sched_name);
2147 ret = -EFAULT;
2148 goto out_unlock;
2149 }
2150
2151 /* Lookup the exact service by <protocol, addr, port> or fwmark */
2152 if (usvc.fwmark == 0)
2153 svc = __ip_vs_service_get(usvc.af, usvc.protocol,
2154 &usvc.addr, usvc.port);
2155 else
2156 svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
2157
2158 if (cmd != IP_VS_SO_SET_ADD
2159 && (svc == NULL || svc->protocol != usvc.protocol)) {
2160 ret = -ESRCH;
2161 goto out_unlock;
2162 }
2163
2164 switch (cmd) {
2165 case IP_VS_SO_SET_ADD:
2166 if (svc != NULL)
2167 ret = -EEXIST;
2168 else
2169 ret = ip_vs_add_service(&usvc, &svc);
2170 break;
2171 case IP_VS_SO_SET_EDIT:
2172 ret = ip_vs_edit_service(svc, &usvc);
2173 break;
2174 case IP_VS_SO_SET_DEL:
2175 ret = ip_vs_del_service(svc);
2176 if (!ret)
2177 goto out_unlock;
2178 break;
2179 case IP_VS_SO_SET_ZERO:
2180 ret = ip_vs_zero_service(svc);
2181 break;
2182 case IP_VS_SO_SET_ADDDEST:
2183 ret = ip_vs_add_dest(svc, &udest);
2184 break;
2185 case IP_VS_SO_SET_EDITDEST:
2186 ret = ip_vs_edit_dest(svc, &udest);
2187 break;
2188 case IP_VS_SO_SET_DELDEST:
2189 ret = ip_vs_del_dest(svc, &udest);
2190 break;
2191 default:
2192 ret = -EINVAL;
2193 }
2194
2195 if (svc)
2196 ip_vs_service_put(svc);
2197
2198 out_unlock:
2199 mutex_unlock(&__ip_vs_mutex);
2200 out_dec:
2201 /* decrease the module use count */
2202 ip_vs_use_count_dec();
2203
2204 return ret;
2205}
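/*
 * Hypothetical userspace sketch (not part of this file) of the legacy
 * sockopt interface handled by do_ip_vs_set_ctl() above: add a TCP
 * virtual service 192.168.0.1:80 using the "wlc" scheduler.  It assumes
 * the struct ip_vs_service_user and IP_VS_SO_SET_ADD definitions from
 * the ip_vs userspace headers (as shipped with ipvsadm); the raw socket
 * requires CAP_NET_ADMIN.
 */
#if 0	/* illustrative userspace code, never compiled here */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static int add_service_example(void)
{
	struct ip_vs_service_user svc;
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

	if (fd < 0)
		return -1;

	memset(&svc, 0, sizeof(svc));
	svc.protocol = IPPROTO_TCP;
	svc.addr = inet_addr("192.168.0.1");	/* virtual address */
	svc.port = htons(80);			/* virtual port */
	strncpy(svc.sched_name, "wlc", sizeof(svc.sched_name) - 1);

	/* len must match SERVICE_ARG_LEN, see set_arglen[] above */
	return setsockopt(fd, IPPROTO_IP, IP_VS_SO_SET_ADD,
			  &svc, sizeof(svc));
}
#endif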
2206
2207
2208static void
2209ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2210{
2211 spin_lock_bh(&src->lock);
2212 memcpy(dst, &src->ustats, sizeof(*dst));
2213 spin_unlock_bh(&src->lock);
2214}
2215
2216static void
2217ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2218{
2219 dst->protocol = src->protocol;
2220 dst->addr = src->addr.ip;
2221 dst->port = src->port;
2222 dst->fwmark = src->fwmark;
2223 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2224 dst->flags = src->flags;
2225 dst->timeout = src->timeout / HZ;
2226 dst->netmask = src->netmask;
2227 dst->num_dests = src->num_dests;
2228 ip_vs_copy_stats(&dst->stats, &src->stats);
2229}
2230
2231static inline int
2232__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2233 struct ip_vs_get_services __user *uptr)
2234{
2235 int idx, count=0;
2236 struct ip_vs_service *svc;
2237 struct ip_vs_service_entry entry;
2238 int ret = 0;
2239
2240 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2241 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2242 /* Only expose IPv4 entries to old interface */
2243 if (svc->af != AF_INET)
2244 continue;
2245
2246 if (count >= get->num_services)
2247 goto out;
2248 memset(&entry, 0, sizeof(entry));
2249 ip_vs_copy_service(&entry, svc);
2250 if (copy_to_user(&uptr->entrytable[count],
2251 &entry, sizeof(entry))) {
2252 ret = -EFAULT;
2253 goto out;
2254 }
2255 count++;
2256 }
2257 }
2258
2259 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2260 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2261 /* Only expose IPv4 entries to old interface */
2262 if (svc->af != AF_INET)
2263 continue;
2264
2265 if (count >= get->num_services)
2266 goto out;
2267 memset(&entry, 0, sizeof(entry));
2268 ip_vs_copy_service(&entry, svc);
2269 if (copy_to_user(&uptr->entrytable[count],
2270 &entry, sizeof(entry))) {
2271 ret = -EFAULT;
2272 goto out;
2273 }
2274 count++;
2275 }
2276 }
2277 out:
2278 return ret;
2279}
2280
2281static inline int
2282__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2283 struct ip_vs_get_dests __user *uptr)
2284{
2285 struct ip_vs_service *svc;
2286 union nf_inet_addr addr = { .ip = get->addr };
2287 int ret = 0;
2288
2289 if (get->fwmark)
2290 svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark);
2291 else
2292 svc = __ip_vs_service_get(AF_INET, get->protocol, &addr,
2293 get->port);
2294
2295 if (svc) {
2296 int count = 0;
2297 struct ip_vs_dest *dest;
2298 struct ip_vs_dest_entry entry;
2299
2300 list_for_each_entry(dest, &svc->destinations, n_list) {
2301 if (count >= get->num_dests)
2302 break;
2303
2304 entry.addr = dest->addr.ip;
2305 entry.port = dest->port;
2306 entry.conn_flags = atomic_read(&dest->conn_flags);
2307 entry.weight = atomic_read(&dest->weight);
2308 entry.u_threshold = dest->u_threshold;
2309 entry.l_threshold = dest->l_threshold;
2310 entry.activeconns = atomic_read(&dest->activeconns);
2311 entry.inactconns = atomic_read(&dest->inactconns);
2312 entry.persistconns = atomic_read(&dest->persistconns);
2313 ip_vs_copy_stats(&entry.stats, &dest->stats);
2314 if (copy_to_user(&uptr->entrytable[count],
2315 &entry, sizeof(entry))) {
2316 ret = -EFAULT;
2317 break;
2318 }
2319 count++;
2320 }
2321 ip_vs_service_put(svc);
2322 } else
2323 ret = -ESRCH;
2324 return ret;
2325}
2326
2327static inline void
2328__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2329{
2330#ifdef CONFIG_IP_VS_PROTO_TCP
2331 u->tcp_timeout =
2332 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2333 u->tcp_fin_timeout =
2334 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2335#endif
2336#ifdef CONFIG_IP_VS_PROTO_UDP
2337 u->udp_timeout =
2338 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2339#endif
2340}
2341
2342
2343#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
2344#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
2345#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
2346#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
2347#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
2348#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
2349#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
2350
2351static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2352 [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2353 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2354 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2355 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2356 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2357 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2358 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
2359};
2360
2361static int
2362do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2363{
2364 unsigned char arg[128];
2365 int ret = 0;
2366
2367 if (!capable(CAP_NET_ADMIN))
2368 return -EPERM;
2369
2370 if (*len < get_arglen[GET_CMDID(cmd)]) {
2371 IP_VS_ERR("get_ctl: len %u < %u\n",
2372 *len, get_arglen[GET_CMDID(cmd)]);
2373 return -EINVAL;
2374 }
2375
2376 if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2377 return -EFAULT;
2378
2379 if (mutex_lock_interruptible(&__ip_vs_mutex))
2380 return -ERESTARTSYS;
2381
2382 switch (cmd) {
2383 case IP_VS_SO_GET_VERSION:
2384 {
2385 char buf[64];
2386
2387 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2388 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2389 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2390 ret = -EFAULT;
2391 goto out;
2392 }
2393 *len = strlen(buf)+1;
2394 }
2395 break;
2396
2397 case IP_VS_SO_GET_INFO:
2398 {
2399 struct ip_vs_getinfo info;
2400 info.version = IP_VS_VERSION_CODE;
2401 info.size = IP_VS_CONN_TAB_SIZE;
2402 info.num_services = ip_vs_num_services;
2403 if (copy_to_user(user, &info, sizeof(info)) != 0)
2404 ret = -EFAULT;
2405 }
2406 break;
2407
2408 case IP_VS_SO_GET_SERVICES:
2409 {
2410 struct ip_vs_get_services *get;
2411 int size;
2412
2413 get = (struct ip_vs_get_services *)arg;
2414 size = sizeof(*get) +
2415 sizeof(struct ip_vs_service_entry) * get->num_services;
2416 if (*len != size) {
2417 IP_VS_ERR("length: %u != %u\n", *len, size);
2418 ret = -EINVAL;
2419 goto out;
2420 }
2421 ret = __ip_vs_get_service_entries(get, user);
2422 }
2423 break;
2424
2425 case IP_VS_SO_GET_SERVICE:
2426 {
2427 struct ip_vs_service_entry *entry;
2428 struct ip_vs_service *svc;
2429 union nf_inet_addr addr;
2430
2431 entry = (struct ip_vs_service_entry *)arg;
2432 addr.ip = entry->addr;
2433 if (entry->fwmark)
2434 svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark);
2435 else
2436 svc = __ip_vs_service_get(AF_INET, entry->protocol,
2437 &addr, entry->port);
2438 if (svc) {
2439 ip_vs_copy_service(entry, svc);
2440 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2441 ret = -EFAULT;
2442 ip_vs_service_put(svc);
2443 } else
2444 ret = -ESRCH;
2445 }
2446 break;
2447
2448 case IP_VS_SO_GET_DESTS:
2449 {
2450 struct ip_vs_get_dests *get;
2451 int size;
2452
2453 get = (struct ip_vs_get_dests *)arg;
2454 size = sizeof(*get) +
2455 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2456 if (*len != size) {
2457 IP_VS_ERR("length: %u != %u\n", *len, size);
2458 ret = -EINVAL;
2459 goto out;
2460 }
2461 ret = __ip_vs_get_dest_entries(get, user);
2462 }
2463 break;
2464
2465 case IP_VS_SO_GET_TIMEOUT:
2466 {
2467 struct ip_vs_timeout_user t;
2468
2469 __ip_vs_get_timeouts(&t);
2470 if (copy_to_user(user, &t, sizeof(t)) != 0)
2471 ret = -EFAULT;
2472 }
2473 break;
2474
2475 case IP_VS_SO_GET_DAEMON:
2476 {
2477 struct ip_vs_daemon_user d[2];
2478
2479 memset(&d, 0, sizeof(d));
2480 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2481 d[0].state = IP_VS_STATE_MASTER;
2482 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2483 d[0].syncid = ip_vs_master_syncid;
2484 }
2485 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2486 d[1].state = IP_VS_STATE_BACKUP;
2487 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2488 d[1].syncid = ip_vs_backup_syncid;
2489 }
2490 if (copy_to_user(user, &d, sizeof(d)) != 0)
2491 ret = -EFAULT;
2492 }
2493 break;
2494
2495 default:
2496 ret = -EINVAL;
2497 }
2498
2499 out:
2500 mutex_unlock(&__ip_vs_mutex);
2501 return ret;
2502}
2503
2504
2505static struct nf_sockopt_ops ip_vs_sockopts = {
2506 .pf = PF_INET,
2507 .set_optmin = IP_VS_BASE_CTL,
2508 .set_optmax = IP_VS_SO_SET_MAX+1,
2509 .set = do_ip_vs_set_ctl,
2510 .get_optmin = IP_VS_BASE_CTL,
2511 .get_optmax = IP_VS_SO_GET_MAX+1,
2512 .get = do_ip_vs_get_ctl,
2513 .owner = THIS_MODULE,
2514};
2515
2516/*
2517 * Generic Netlink interface
2518 */
2519
2520/* IPVS genetlink family */
2521static struct genl_family ip_vs_genl_family = {
2522 .id = GENL_ID_GENERATE,
2523 .hdrsize = 0,
2524 .name = IPVS_GENL_NAME,
2525 .version = IPVS_GENL_VERSION,
2526 .maxattr = IPVS_CMD_MAX,
2527};
2528
2529/* Policy used for first-level command attributes */
2530static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
2531 [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED },
2532 [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED },
2533 [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED },
2534 [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 },
2535 [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
2536 [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 },
2537};
2538
2539/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
2540static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
2541 [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 },
2542 [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING,
2543 .len = IP_VS_IFNAME_MAXLEN },
2544 [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 },
2545};
2546
2547/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
2548static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
2549 [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 },
2550 [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 },
2551 [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY,
2552 .len = sizeof(union nf_inet_addr) },
2553 [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 },
2554 [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 },
2555 [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING,
2556 .len = IP_VS_SCHEDNAME_MAXLEN },
2557 [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY,
2558 .len = sizeof(struct ip_vs_flags) },
2559 [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 },
2560 [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 },
2561 [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED },
2562};
2563
2564/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
2565static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
2566 [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY,
2567 .len = sizeof(union nf_inet_addr) },
2568 [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 },
2569 [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 },
2570 [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 },
2571 [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 },
2572 [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 },
2573 [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 },
2574 [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 },
2575 [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 },
2576 [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED },
2577};
2578
2579static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
2580 struct ip_vs_stats *stats)
2581{
2582 struct nlattr *nl_stats = nla_nest_start(skb, container_type);
2583 if (!nl_stats)
2584 return -EMSGSIZE;
2585
2586 spin_lock_bh(&stats->lock);
2587
2588 NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns);
2589 NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts);
2590 NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts);
2591 NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes);
2592 NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes);
2593 NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps);
2594 NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps);
2595 NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps);
2596 NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps);
2597 NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps);
2598
2599 spin_unlock_bh(&stats->lock);
2600
2601 nla_nest_end(skb, nl_stats);
2602
2603 return 0;
2604
2605nla_put_failure:
2606 spin_unlock_bh(&stats->lock);
2607 nla_nest_cancel(skb, nl_stats);
2608 return -EMSGSIZE;
2609}
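A note on the NLA_PUT_U32()/NLA_PUT_U64() calls above and throughout the rest of this file: in this kernel generation they are macros from <net/netlink.h> that jump to the local nla_put_failure label when the attribute does not fit, which is why that label is reachable even though no explicit goto appears in the function body. Roughly (paraphrased from the era's netlink.h, not part of this patch):

#define NLA_PUT(skb, attrtype, attrlen, data)			\
	do {							\
		if (nla_put(skb, attrtype, attrlen, data) < 0)	\
			goto nla_put_failure;			\
	} while (0)

#define NLA_PUT_TYPE(skb, type, attrtype, value)		\
	do {							\
		type __tmp = value;				\
		NLA_PUT(skb, attrtype, sizeof(type), &__tmp);	\
	} while (0)

#define NLA_PUT_U32(skb, attrtype, value)			\
	NLA_PUT_TYPE(skb, u32, attrtype, value)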
2610
2611static int ip_vs_genl_fill_service(struct sk_buff *skb,
2612 struct ip_vs_service *svc)
2613{
2614 struct nlattr *nl_service;
2615 struct ip_vs_flags flags = { .flags = svc->flags,
2616 .mask = ~0 };
2617
2618 nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
2619 if (!nl_service)
2620 return -EMSGSIZE;
2621
2622 NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af);
2623
2624 if (svc->fwmark) {
2625 NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
2626 } else {
2627 NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol);
2628 NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr);
2629 NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port);
2630 }
2631
2632 NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
2633 NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
2634 NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
2635 NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
2636
2637 if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
2638 goto nla_put_failure;
2639
2640 nla_nest_end(skb, nl_service);
2641
2642 return 0;
2643
2644nla_put_failure:
2645 nla_nest_cancel(skb, nl_service);
2646 return -EMSGSIZE;
2647}
2648
2649static int ip_vs_genl_dump_service(struct sk_buff *skb,
2650 struct ip_vs_service *svc,
2651 struct netlink_callback *cb)
2652{
2653 void *hdr;
2654
2655 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2656 &ip_vs_genl_family, NLM_F_MULTI,
2657 IPVS_CMD_NEW_SERVICE);
2658 if (!hdr)
2659 return -EMSGSIZE;
2660
2661 if (ip_vs_genl_fill_service(skb, svc) < 0)
2662 goto nla_put_failure;
2663
2664 return genlmsg_end(skb, hdr);
2665
2666nla_put_failure:
2667 genlmsg_cancel(skb, hdr);
2668 return -EMSGSIZE;
2669}
2670
2671static int ip_vs_genl_dump_services(struct sk_buff *skb,
2672 struct netlink_callback *cb)
2673{
2674 int idx = 0, i;
2675 int start = cb->args[0];
2676 struct ip_vs_service *svc;
2677
2678 mutex_lock(&__ip_vs_mutex);
2679 for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2680 list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
2681 if (++idx <= start)
2682 continue;
2683 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2684 idx--;
2685 goto nla_put_failure;
2686 }
2687 }
2688 }
2689
2690 for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2691 list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
2692 if (++idx <= start)
2693 continue;
2694 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2695 idx--;
2696 goto nla_put_failure;
2697 }
2698 }
2699 }
2700
2701nla_put_failure:
2702 mutex_unlock(&__ip_vs_mutex);
2703 cb->args[0] = idx;
2704
2705 return skb->len;
2706}
2707
2708static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
2709 struct nlattr *nla, int full_entry)
2710{
2711 struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
2712 struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
2713
2714 /* Parse mandatory identifying service fields first */
2715 if (nla == NULL ||
2716 nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
2717 return -EINVAL;
2718
2719 nla_af = attrs[IPVS_SVC_ATTR_AF];
2720 nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL];
2721 nla_addr = attrs[IPVS_SVC_ATTR_ADDR];
2722 nla_port = attrs[IPVS_SVC_ATTR_PORT];
2723 nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK];
2724
2725 if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
2726 return -EINVAL;
2727
2728 usvc->af = nla_get_u16(nla_af);
2729#ifdef CONFIG_IP_VS_IPV6
2730 if (usvc->af != AF_INET && usvc->af != AF_INET6)
2731#else
2732 if (usvc->af != AF_INET)
2733#endif
2734 return -EAFNOSUPPORT;
2735
2736 if (nla_fwmark) {
2737 usvc->protocol = IPPROTO_TCP;
2738 usvc->fwmark = nla_get_u32(nla_fwmark);
2739 } else {
2740 usvc->protocol = nla_get_u16(nla_protocol);
2741 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
2742 usvc->port = nla_get_u16(nla_port);
2743 usvc->fwmark = 0;
2744 }
2745
2746 /* If a full entry was requested, check for the additional fields */
2747 if (full_entry) {
2748 struct nlattr *nla_sched, *nla_flags, *nla_timeout,
2749 *nla_netmask;
2750 struct ip_vs_flags flags;
2751 struct ip_vs_service *svc;
2752
2753 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
2754 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
2755 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
2756 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
2757
2758 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
2759 return -EINVAL;
2760
2761 nla_memcpy(&flags, nla_flags, sizeof(flags));
2762
2763 /* prefill flags from service if it already exists */
2764 if (usvc->fwmark)
2765 svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark);
2766 else
2767 svc = __ip_vs_service_get(usvc->af, usvc->protocol,
2768 &usvc->addr, usvc->port);
2769 if (svc) {
2770 usvc->flags = svc->flags;
2771 ip_vs_service_put(svc);
2772 } else
2773 usvc->flags = 0;
2774
2775 /* set new flags from userland */
2776 usvc->flags = (usvc->flags & ~flags.mask) |
2777 (flags.flags & flags.mask);
2778 usvc->sched_name = nla_data(nla_sched);
2779 usvc->timeout = nla_get_u32(nla_timeout);
2780 usvc->netmask = nla_get_u32(nla_netmask);
2781 }
2782
2783 return 0;
2784}
2785
2786static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
2787{
2788 struct ip_vs_service_user_kern usvc;
2789 int ret;
2790
2791 ret = ip_vs_genl_parse_service(&usvc, nla, 0);
2792 if (ret)
2793 return ERR_PTR(ret);
2794
2795 if (usvc.fwmark)
2796 return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
2797 else
2798 return __ip_vs_service_get(usvc.af, usvc.protocol,
2799 &usvc.addr, usvc.port);
2800}
2801
2802static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
2803{
2804 struct nlattr *nl_dest;
2805
2806 nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
2807 if (!nl_dest)
2808 return -EMSGSIZE;
2809
2810 NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr);
2811 NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port);
2812
2813 NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD,
2814 atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK);
2815 NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight));
2816 NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold);
2817 NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold);
2818 NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
2819 atomic_read(&dest->activeconns));
2820 NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS,
2821 atomic_read(&dest->inactconns));
2822 NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
2823 atomic_read(&dest->persistconns));
2824
2825 if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
2826 goto nla_put_failure;
2827
2828 nla_nest_end(skb, nl_dest);
2829
2830 return 0;
2831
2832nla_put_failure:
2833 nla_nest_cancel(skb, nl_dest);
2834 return -EMSGSIZE;
2835}
2836
2837static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
2838 struct netlink_callback *cb)
2839{
2840 void *hdr;
2841
2842 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2843 &ip_vs_genl_family, NLM_F_MULTI,
2844 IPVS_CMD_NEW_DEST);
2845 if (!hdr)
2846 return -EMSGSIZE;
2847
2848 if (ip_vs_genl_fill_dest(skb, dest) < 0)
2849 goto nla_put_failure;
2850
2851 return genlmsg_end(skb, hdr);
2852
2853nla_put_failure:
2854 genlmsg_cancel(skb, hdr);
2855 return -EMSGSIZE;
2856}
2857
2858static int ip_vs_genl_dump_dests(struct sk_buff *skb,
2859 struct netlink_callback *cb)
2860{
2861 int idx = 0;
2862 int start = cb->args[0];
2863 struct ip_vs_service *svc;
2864 struct ip_vs_dest *dest;
2865 struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
2866
2867 mutex_lock(&__ip_vs_mutex);
2868
2869 /* Try to find the service for which to dump destinations */
2870 if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
2871 IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
2872 goto out_err;
2873
2874 svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]);
2875 if (IS_ERR(svc) || svc == NULL)
2876 goto out_err;
2877
2878 /* Dump the destinations */
2879 list_for_each_entry(dest, &svc->destinations, n_list) {
2880 if (++idx <= start)
2881 continue;
2882 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
2883 idx--;
2884 goto nla_put_failure;
2885 }
2886 }
2887
2888nla_put_failure:
2889 cb->args[0] = idx;
2890 ip_vs_service_put(svc);
2891
2892out_err:
2893 mutex_unlock(&__ip_vs_mutex);
2894
2895 return skb->len;
2896}
2897
2898static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
2899 struct nlattr *nla, int full_entry)
2900{
2901 struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
2902 struct nlattr *nla_addr, *nla_port;
2903
2904 /* Parse mandatory identifying destination fields first */
2905 if (nla == NULL ||
2906 nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
2907 return -EINVAL;
2908
2909 nla_addr = attrs[IPVS_DEST_ATTR_ADDR];
2910 nla_port = attrs[IPVS_DEST_ATTR_PORT];
2911
2912 if (!(nla_addr && nla_port))
2913 return -EINVAL;
2914
2915 nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
2916 udest->port = nla_get_u16(nla_port);
2917
2918 /* If a full entry was requested, check for the additional fields */
2919 if (full_entry) {
2920 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
2921 *nla_l_thresh;
2922
2923 nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
2924 nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
2925 nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH];
2926 nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
2927
2928 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
2929 return -EINVAL;
2930
2931 udest->conn_flags = nla_get_u32(nla_fwd)
2932 & IP_VS_CONN_F_FWD_MASK;
2933 udest->weight = nla_get_u32(nla_weight);
2934 udest->u_threshold = nla_get_u32(nla_u_thresh);
2935 udest->l_threshold = nla_get_u32(nla_l_thresh);
2936 }
2937
2938 return 0;
2939}
2940
2941static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
2942 const char *mcast_ifn, __be32 syncid)
2943{
2944 struct nlattr *nl_daemon;
2945
2946 nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
2947 if (!nl_daemon)
2948 return -EMSGSIZE;
2949
2950 NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state);
2951 NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn);
2952 NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid);
2953
2954 nla_nest_end(skb, nl_daemon);
2955
2956 return 0;
2957
2958nla_put_failure:
2959 nla_nest_cancel(skb, nl_daemon);
2960 return -EMSGSIZE;
2961}
2962
2963static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
2964 const char *mcast_ifn, __be32 syncid,
2965 struct netlink_callback *cb)
2966{
2967 void *hdr;
2968 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2969 &ip_vs_genl_family, NLM_F_MULTI,
2970 IPVS_CMD_NEW_DAEMON);
2971 if (!hdr)
2972 return -EMSGSIZE;
2973
2974 if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
2975 goto nla_put_failure;
2976
2977 return genlmsg_end(skb, hdr);
2978
2979nla_put_failure:
2980 genlmsg_cancel(skb, hdr);
2981 return -EMSGSIZE;
2982}
2983
2984static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
2985 struct netlink_callback *cb)
2986{
2987 mutex_lock(&__ip_vs_mutex);
2988 if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
2989 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
2990 ip_vs_master_mcast_ifn,
2991 ip_vs_master_syncid, cb) < 0)
2992 goto nla_put_failure;
2993
2994 cb->args[0] = 1;
2995 }
2996
2997 if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
2998 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
2999 ip_vs_backup_mcast_ifn,
3000 ip_vs_backup_syncid, cb) < 0)
3001 goto nla_put_failure;
3002
3003 cb->args[1] = 1;
3004 }
3005
3006nla_put_failure:
3007 mutex_unlock(&__ip_vs_mutex);
3008
3009 return skb->len;
3010}
3011
3012static int ip_vs_genl_new_daemon(struct nlattr **attrs)
3013{
3014 if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
3015 attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
3016 attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
3017 return -EINVAL;
3018
3019 return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
3020 nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
3021 nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
3022}
3023
3024static int ip_vs_genl_del_daemon(struct nlattr **attrs)
3025{
3026 if (!attrs[IPVS_DAEMON_ATTR_STATE])
3027 return -EINVAL;
3028
3029 return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3030}
3031
3032static int ip_vs_genl_set_config(struct nlattr **attrs)
3033{
3034 struct ip_vs_timeout_user t;
3035
3036 __ip_vs_get_timeouts(&t);
3037
3038 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
3039 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
3040
3041 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
3042 t.tcp_fin_timeout =
3043 nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
3044
3045 if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
3046 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
3047
3048 return ip_vs_set_timeout(&t);
3049}
3050
3051static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3052{
3053 struct ip_vs_service *svc = NULL;
3054 struct ip_vs_service_user_kern usvc;
3055 struct ip_vs_dest_user_kern udest;
3056 int ret = 0, cmd;
3057 int need_full_svc = 0, need_full_dest = 0;
3058
3059 cmd = info->genlhdr->cmd;
3060
3061 mutex_lock(&__ip_vs_mutex);
3062
3063 if (cmd == IPVS_CMD_FLUSH) {
3064 ret = ip_vs_flush();
3065 goto out;
3066 } else if (cmd == IPVS_CMD_SET_CONFIG) {
3067 ret = ip_vs_genl_set_config(info->attrs);
3068 goto out;
3069 } else if (cmd == IPVS_CMD_NEW_DAEMON ||
3070 cmd == IPVS_CMD_DEL_DAEMON) {
3071
3072 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
3073
3074 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
3075 nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
3076 info->attrs[IPVS_CMD_ATTR_DAEMON],
3077 ip_vs_daemon_policy)) {
3078 ret = -EINVAL;
3079 goto out;
3080 }
3081
3082 if (cmd == IPVS_CMD_NEW_DAEMON)
3083 ret = ip_vs_genl_new_daemon(daemon_attrs);
3084 else
3085 ret = ip_vs_genl_del_daemon(daemon_attrs);
3086 goto out;
3087 } else if (cmd == IPVS_CMD_ZERO &&
3088 !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
3089 ret = ip_vs_zero_all();
3090 goto out;
3091 }
3092
3093 /* All following commands require a service argument, so check if we
3094 * received a valid one. We need a full service specification when
3095 * adding / editing a service. Only identifying members otherwise. */
3096 if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
3097 need_full_svc = 1;
3098
3099 ret = ip_vs_genl_parse_service(&usvc,
3100 info->attrs[IPVS_CMD_ATTR_SERVICE],
3101 need_full_svc);
3102 if (ret)
3103 goto out;
3104
3105 /* Lookup the exact service by <protocol, addr, port> or fwmark */
3106 if (usvc.fwmark == 0)
3107 svc = __ip_vs_service_get(usvc.af, usvc.protocol,
3108 &usvc.addr, usvc.port);
3109 else
3110 svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
3111
3112 /* Unless we're adding a new service, the service must already exist */
3113 if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
3114 ret = -ESRCH;
3115 goto out;
3116 }
3117
3118 /* Destination commands require a valid destination argument. For
3119 * adding / editing a destination, we need a full destination
3120 * specification. */
3121 if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
3122 cmd == IPVS_CMD_DEL_DEST) {
3123 if (cmd != IPVS_CMD_DEL_DEST)
3124 need_full_dest = 1;
3125
3126 ret = ip_vs_genl_parse_dest(&udest,
3127 info->attrs[IPVS_CMD_ATTR_DEST],
3128 need_full_dest);
3129 if (ret)
3130 goto out;
3131 }
3132
3133 switch (cmd) {
3134 case IPVS_CMD_NEW_SERVICE:
3135 if (svc == NULL)
3136 ret = ip_vs_add_service(&usvc, &svc);
3137 else
3138 ret = -EEXIST;
3139 break;
3140 case IPVS_CMD_SET_SERVICE:
3141 ret = ip_vs_edit_service(svc, &usvc);
3142 break;
3143 case IPVS_CMD_DEL_SERVICE:
3144 ret = ip_vs_del_service(svc);
3145 break;
3146 case IPVS_CMD_NEW_DEST:
3147 ret = ip_vs_add_dest(svc, &udest);
3148 break;
3149 case IPVS_CMD_SET_DEST:
3150 ret = ip_vs_edit_dest(svc, &udest);
3151 break;
3152 case IPVS_CMD_DEL_DEST:
3153 ret = ip_vs_del_dest(svc, &udest);
3154 break;
3155 case IPVS_CMD_ZERO:
3156 ret = ip_vs_zero_service(svc);
3157 break;
3158 default:
3159 ret = -EINVAL;
3160 }
3161
3162out:
3163 if (svc)
3164 ip_vs_service_put(svc);
3165 mutex_unlock(&__ip_vs_mutex);
3166
3167 return ret;
3168}
3169
3170static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
3171{
3172 struct sk_buff *msg;
3173 void *reply;
3174 int ret, cmd, reply_cmd;
3175
3176 cmd = info->genlhdr->cmd;
3177
3178 if (cmd == IPVS_CMD_GET_SERVICE)
3179 reply_cmd = IPVS_CMD_NEW_SERVICE;
3180 else if (cmd == IPVS_CMD_GET_INFO)
3181 reply_cmd = IPVS_CMD_SET_INFO;
3182 else if (cmd == IPVS_CMD_GET_CONFIG)
3183 reply_cmd = IPVS_CMD_SET_CONFIG;
3184 else {
3185 IP_VS_ERR("unknown Generic Netlink command\n");
3186 return -EINVAL;
3187 }
3188
3189 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
3190 if (!msg)
3191 return -ENOMEM;
3192
3193 mutex_lock(&__ip_vs_mutex);
3194
3195 reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
3196 if (reply == NULL)
3197 goto nla_put_failure;
3198
3199 switch (cmd) {
3200 case IPVS_CMD_GET_SERVICE:
3201 {
3202 struct ip_vs_service *svc;
3203
3204 svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]);
3205 if (IS_ERR(svc)) {
3206 ret = PTR_ERR(svc);
3207 goto out_err;
3208 } else if (svc) {
3209 ret = ip_vs_genl_fill_service(msg, svc);
3210 ip_vs_service_put(svc);
3211 if (ret)
3212 goto nla_put_failure;
3213 } else {
3214 ret = -ESRCH;
3215 goto out_err;
3216 }
3217
3218 break;
3219 }
3220
3221 case IPVS_CMD_GET_CONFIG:
3222 {
3223 struct ip_vs_timeout_user t;
3224
3225 __ip_vs_get_timeouts(&t);
3226#ifdef CONFIG_IP_VS_PROTO_TCP
3227 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
3228 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
3229 t.tcp_fin_timeout);
3230#endif
3231#ifdef CONFIG_IP_VS_PROTO_UDP
3232 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout);
3233#endif
3234
3235 break;
3236 }
3237
3238 case IPVS_CMD_GET_INFO:
3239 NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
3240 NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
3241 IP_VS_CONN_TAB_SIZE);
3242 break;
3243 }
3244
3245 genlmsg_end(msg, reply);
3246 ret = genlmsg_unicast(msg, info->snd_pid);
3247 goto out;
3248
3249nla_put_failure:
3250 IP_VS_ERR("not enough space in Netlink message\n");
3251 ret = -EMSGSIZE;
3252
3253out_err:
3254 nlmsg_free(msg);
3255out:
3256 mutex_unlock(&__ip_vs_mutex);
3257
3258 return ret;
3259}
3260
3261
3262static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
3263 {
3264 .cmd = IPVS_CMD_NEW_SERVICE,
3265 .flags = GENL_ADMIN_PERM,
3266 .policy = ip_vs_cmd_policy,
3267 .doit = ip_vs_genl_set_cmd,
3268 },
3269 {
3270 .cmd = IPVS_CMD_SET_SERVICE,
3271 .flags = GENL_ADMIN_PERM,
3272 .policy = ip_vs_cmd_policy,
3273 .doit = ip_vs_genl_set_cmd,
3274 },
3275 {
3276 .cmd = IPVS_CMD_DEL_SERVICE,
3277 .flags = GENL_ADMIN_PERM,
3278 .policy = ip_vs_cmd_policy,
3279 .doit = ip_vs_genl_set_cmd,
3280 },
3281 {
3282 .cmd = IPVS_CMD_GET_SERVICE,
3283 .flags = GENL_ADMIN_PERM,
3284 .doit = ip_vs_genl_get_cmd,
3285 .dumpit = ip_vs_genl_dump_services,
3286 .policy = ip_vs_cmd_policy,
3287 },
3288 {
3289 .cmd = IPVS_CMD_NEW_DEST,
3290 .flags = GENL_ADMIN_PERM,
3291 .policy = ip_vs_cmd_policy,
3292 .doit = ip_vs_genl_set_cmd,
3293 },
3294 {
3295 .cmd = IPVS_CMD_SET_DEST,
3296 .flags = GENL_ADMIN_PERM,
3297 .policy = ip_vs_cmd_policy,
3298 .doit = ip_vs_genl_set_cmd,
3299 },
3300 {
3301 .cmd = IPVS_CMD_DEL_DEST,
3302 .flags = GENL_ADMIN_PERM,
3303 .policy = ip_vs_cmd_policy,
3304 .doit = ip_vs_genl_set_cmd,
3305 },
3306 {
3307 .cmd = IPVS_CMD_GET_DEST,
3308 .flags = GENL_ADMIN_PERM,
3309 .policy = ip_vs_cmd_policy,
3310 .dumpit = ip_vs_genl_dump_dests,
3311 },
3312 {
3313 .cmd = IPVS_CMD_NEW_DAEMON,
3314 .flags = GENL_ADMIN_PERM,
3315 .policy = ip_vs_cmd_policy,
3316 .doit = ip_vs_genl_set_cmd,
3317 },
3318 {
3319 .cmd = IPVS_CMD_DEL_DAEMON,
3320 .flags = GENL_ADMIN_PERM,
3321 .policy = ip_vs_cmd_policy,
3322 .doit = ip_vs_genl_set_cmd,
3323 },
3324 {
3325 .cmd = IPVS_CMD_GET_DAEMON,
3326 .flags = GENL_ADMIN_PERM,
3327 .dumpit = ip_vs_genl_dump_daemons,
3328 },
3329 {
3330 .cmd = IPVS_CMD_SET_CONFIG,
3331 .flags = GENL_ADMIN_PERM,
3332 .policy = ip_vs_cmd_policy,
3333 .doit = ip_vs_genl_set_cmd,
3334 },
3335 {
3336 .cmd = IPVS_CMD_GET_CONFIG,
3337 .flags = GENL_ADMIN_PERM,
3338 .doit = ip_vs_genl_get_cmd,
3339 },
3340 {
3341 .cmd = IPVS_CMD_GET_INFO,
3342 .flags = GENL_ADMIN_PERM,
3343 .doit = ip_vs_genl_get_cmd,
3344 },
3345 {
3346 .cmd = IPVS_CMD_ZERO,
3347 .flags = GENL_ADMIN_PERM,
3348 .policy = ip_vs_cmd_policy,
3349 .doit = ip_vs_genl_set_cmd,
3350 },
3351 {
3352 .cmd = IPVS_CMD_FLUSH,
3353 .flags = GENL_ADMIN_PERM,
3354 .doit = ip_vs_genl_set_cmd,
3355 },
3356};
3357
3358static int __init ip_vs_genl_register(void)
3359{
3360 int ret, i;
3361
3362 ret = genl_register_family(&ip_vs_genl_family);
3363 if (ret)
3364 return ret;
3365
3366 for (i = 0; i < ARRAY_SIZE(ip_vs_genl_ops); i++) {
3367 ret = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[i]);
3368 if (ret)
3369 goto err_out;
3370 }
3371 return 0;
3372
3373err_out:
3374 genl_unregister_family(&ip_vs_genl_family);
3375 return ret;
3376}
3377
3378static void ip_vs_genl_unregister(void)
3379{
3380 genl_unregister_family(&ip_vs_genl_family);
3381}
3382
3383/* End of Generic Netlink interface definitions */
3384
3385
3386int __init ip_vs_control_init(void)
3387{
3388 int ret;
3389 int idx;
3390
3391 EnterFunction(2);
3392
3393 ret = nf_register_sockopt(&ip_vs_sockopts);
3394 if (ret) {
3395 IP_VS_ERR("cannot register sockopt.\n");
3396 return ret;
3397 }
3398
3399 ret = ip_vs_genl_register();
3400 if (ret) {
3401 IP_VS_ERR("cannot register Generic Netlink interface.\n");
3402 nf_unregister_sockopt(&ip_vs_sockopts);
3403 return ret;
3404 }
3405
3406 proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
3407 proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
3408
3409 sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
3410
3411 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
3412 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
3413 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
3414 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
3415 }
3416 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
3417 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
3418 }
3419
3420 ip_vs_new_estimator(&ip_vs_stats);
3421
3422 /* Hook the defense timer */
3423 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
3424
3425 LeaveFunction(2);
3426 return 0;
3427}
3428
3429
3430void ip_vs_control_cleanup(void)
3431{
3432 EnterFunction(2);
3433 ip_vs_trash_cleanup();
3434 cancel_rearming_delayed_work(&defense_work);
3435 cancel_work_sync(&defense_work.work);
3436 ip_vs_kill_estimator(&ip_vs_stats);
3437 unregister_sysctl_table(sysctl_header);
3438 proc_net_remove(&init_net, "ip_vs_stats");
3439 proc_net_remove(&init_net, "ip_vs");
3440 ip_vs_genl_unregister();
3441 nf_unregister_sockopt(&ip_vs_sockopts);
3442 LeaveFunction(2);
3443}
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
deleted file mode 100644
index 2eb2860dabb5..000000000000
--- a/net/ipv4/ipvs/ip_vs_est.c
+++ /dev/null
@@ -1,166 +0,0 @@
1/*
2 * ip_vs_est.c: simple rate estimator for IPVS
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 *
13 */
14#include <linux/kernel.h>
15#include <linux/jiffies.h>
16#include <linux/slab.h>
17#include <linux/types.h>
18#include <linux/interrupt.h>
19#include <linux/sysctl.h>
20#include <linux/list.h>
21
22#include <net/ip_vs.h>
23
24/*
25 This code estimates the rate over a short interval (such as 8
26 seconds) for virtual services and real servers. To measure the rate
27 over a longer interval, it is easy to implement a user-level daemon
28 which periodically reads these statistical counters and computes the rate.
29
30 Currently, the measurement is driven by a slow timer handler. Hopefully
31 this measurement will not introduce too much load.
32
33 We measure rate during the last 8 seconds every 2 seconds:
34
35 avgrate = avgrate*(1-W) + rate*W
36
37 where W = 2^(-2)
38
39 NOTES.
40
41 * The stored value for average bps is scaled by 2^5, so that maximal
42 rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10.
43
44 * A lot of code is taken from net/sched/estimator.c
45 */
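The fixed-point form of the formula above, as used by estimation_timer() below: with W = 2^-2 the update is avg += (rate - avg) >> 2, and because samples arrive every 2 seconds while cps/pps values are stored scaled by 2^10, each per-tick counter delta is shifted left by 9 (i.e. <<10 then divided by 2). A small, self-contained userspace illustration of the connection-rate path follows; the variable names are illustrative only, not kernel code.

#include <stdio.h>

int main(void)
{
	unsigned int last_conns = 0, conns = 0;
	long cps_scaled = 0;		/* connections/s, scaled by 2^10 */

	for (int tick = 0; tick < 5; tick++) {
		conns += 100;		/* pretend 100 new conns per 2s => 50 c/s */
		long rate = (long)(conns - last_conns) << 9;
		last_conns = conns;
		cps_scaled += (rate - cps_scaled) >> 2;
		/* round to nearest before unscaling, as s->ustats.cps does */
		printf("tick %d: cps ~= %ld\n", tick, (cps_scaled + 0x1FF) >> 10);
	}
	return 0;	/* output converges toward 50 */
}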
46
47
48static void estimation_timer(unsigned long arg);
49
50static LIST_HEAD(est_list);
51static DEFINE_SPINLOCK(est_lock);
52static DEFINE_TIMER(est_timer, estimation_timer, 0, 0);
53
54static void estimation_timer(unsigned long arg)
55{
56 struct ip_vs_estimator *e;
57 struct ip_vs_stats *s;
58 u32 n_conns;
59 u32 n_inpkts, n_outpkts;
60 u64 n_inbytes, n_outbytes;
61 u32 rate;
62
63 spin_lock(&est_lock);
64 list_for_each_entry(e, &est_list, list) {
65 s = container_of(e, struct ip_vs_stats, est);
66
67 spin_lock(&s->lock);
68 n_conns = s->ustats.conns;
69 n_inpkts = s->ustats.inpkts;
70 n_outpkts = s->ustats.outpkts;
71 n_inbytes = s->ustats.inbytes;
72 n_outbytes = s->ustats.outbytes;
73
74 /* scaled by 2^10, but divided by the 2-second interval */
75 rate = (n_conns - e->last_conns)<<9;
76 e->last_conns = n_conns;
77 e->cps += ((long)rate - (long)e->cps)>>2;
78 s->ustats.cps = (e->cps+0x1FF)>>10;
79
80 rate = (n_inpkts - e->last_inpkts)<<9;
81 e->last_inpkts = n_inpkts;
82 e->inpps += ((long)rate - (long)e->inpps)>>2;
83 s->ustats.inpps = (e->inpps+0x1FF)>>10;
84
85 rate = (n_outpkts - e->last_outpkts)<<9;
86 e->last_outpkts = n_outpkts;
87 e->outpps += ((long)rate - (long)e->outpps)>>2;
88 s->ustats.outpps = (e->outpps+0x1FF)>>10;
89
90 rate = (n_inbytes - e->last_inbytes)<<4;
91 e->last_inbytes = n_inbytes;
92 e->inbps += ((long)rate - (long)e->inbps)>>2;
93 s->ustats.inbps = (e->inbps+0xF)>>5;
94
95 rate = (n_outbytes - e->last_outbytes)<<4;
96 e->last_outbytes = n_outbytes;
97 e->outbps += ((long)rate - (long)e->outbps)>>2;
98 s->ustats.outbps = (e->outbps+0xF)>>5;
99 spin_unlock(&s->lock);
100 }
101 spin_unlock(&est_lock);
102 mod_timer(&est_timer, jiffies + 2*HZ);
103}
104
105void ip_vs_new_estimator(struct ip_vs_stats *stats)
106{
107 struct ip_vs_estimator *est = &stats->est;
108
109 INIT_LIST_HEAD(&est->list);
110
111 est->last_conns = stats->ustats.conns;
112 est->cps = stats->ustats.cps<<10;
113
114 est->last_inpkts = stats->ustats.inpkts;
115 est->inpps = stats->ustats.inpps<<10;
116
117 est->last_outpkts = stats->ustats.outpkts;
118 est->outpps = stats->ustats.outpps<<10;
119
120 est->last_inbytes = stats->ustats.inbytes;
121 est->inbps = stats->ustats.inbps<<5;
122
123 est->last_outbytes = stats->ustats.outbytes;
124 est->outbps = stats->ustats.outbps<<5;
125
126 spin_lock_bh(&est_lock);
127 list_add(&est->list, &est_list);
128 spin_unlock_bh(&est_lock);
129}
130
131void ip_vs_kill_estimator(struct ip_vs_stats *stats)
132{
133 struct ip_vs_estimator *est = &stats->est;
134
135 spin_lock_bh(&est_lock);
136 list_del(&est->list);
137 spin_unlock_bh(&est_lock);
138}
139
140void ip_vs_zero_estimator(struct ip_vs_stats *stats)
141{
142 struct ip_vs_estimator *est = &stats->est;
143
144 /* set counters zero, caller must hold the stats->lock lock */
145 est->last_inbytes = 0;
146 est->last_outbytes = 0;
147 est->last_conns = 0;
148 est->last_inpkts = 0;
149 est->last_outpkts = 0;
150 est->cps = 0;
151 est->inpps = 0;
152 est->outpps = 0;
153 est->inbps = 0;
154 est->outbps = 0;
155}
156
157int __init ip_vs_estimator_init(void)
158{
159 mod_timer(&est_timer, jiffies + 2 * HZ);
160 return 0;
161}
162
163void ip_vs_estimator_cleanup(void)
164{
165 del_timer_sync(&est_timer);
166}
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c
deleted file mode 100644
index 2e7dbd8b73a4..000000000000
--- a/net/ipv4/ipvs/ip_vs_ftp.c
+++ /dev/null
@@ -1,410 +0,0 @@
1/*
2 * ip_vs_ftp.c: IPVS ftp application module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * Changes:
7 *
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
15 * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp.
16 *
17 * IP_MASQ_FTP ftp masquerading module
18 *
19 * Version: @(#)ip_masq_ftp.c 0.04 02/05/96
20 *
21 * Author: Wouter Gadeyne
22 *
23 */
24
25#include <linux/module.h>
26#include <linux/moduleparam.h>
27#include <linux/kernel.h>
28#include <linux/skbuff.h>
29#include <linux/in.h>
30#include <linux/ip.h>
31#include <linux/netfilter.h>
32#include <net/protocol.h>
33#include <net/tcp.h>
34#include <asm/unaligned.h>
35
36#include <net/ip_vs.h>
37
38
39#define SERVER_STRING "227 Entering Passive Mode ("
40#define CLIENT_STRING "PORT "
41
42
43/*
44 * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
45 * First port is set to the default port.
46 */
47static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0};
48module_param_array(ports, ushort, NULL, 0);
49MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
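Since ports[] is a module parameter, a deployment whose FTP control channel runs on a non-standard port could load the helper with something like ports=21,2121 (the second value is purely illustrative); ip_vs_ftp_init() below then registers one application incarnation per listed port via register_ip_vs_app_inc().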
50
51
52/* Dummy variable */
53static int ip_vs_ftp_pasv;
54
55
56static int
57ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
58{
59 return 0;
60}
61
62
63static int
64ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
65{
66 return 0;
67}
68
69
70/*
71 * Get <addr,port> from the string "xxx,xxx,xxx,xxx,ppp,ppp", starting
72 * with the "pattern" and terminated with the "term" character.
73 * <addr,port> is in network order.
74 */
75static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
76 const char *pattern, size_t plen, char term,
77 __be32 *addr, __be16 *port,
78 char **start, char **end)
79{
80 unsigned char p[6];
81 int i = 0;
82
83 if (data_limit - data < plen) {
84 /* check if there is partial match */
85 if (strnicmp(data, pattern, data_limit - data) == 0)
86 return -1;
87 else
88 return 0;
89 }
90
91 if (strnicmp(data, pattern, plen) != 0) {
92 return 0;
93 }
94 *start = data + plen;
95
96 for (data = *start; *data != term; data++) {
97 if (data == data_limit)
98 return -1;
99 }
100 *end = data;
101
102 memset(p, 0, sizeof(p));
103 for (data = *start; data != *end; data++) {
104 if (*data >= '0' && *data <= '9') {
105 p[i] = p[i]*10 + *data - '0';
106 } else if (*data == ',' && i < 5) {
107 i++;
108 } else {
109 /* unexpected character */
110 return -1;
111 }
112 }
113
114 if (i != 5)
115 return -1;
116
117 *addr = get_unaligned((__be32 *)p);
118 *port = get_unaligned((__be16 *)(p + 4));
119 return 1;
120}
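A worked example of what ip_vs_ftp_get_addrport() above extracts (editor's illustration, not part of the patch): for the server reply "227 Entering Passive Mode (192,168,0,10,19,137)" the six decimal fields become p[] = {192,168,0,10,19,137}, so *addr is 192.168.0.10 in network byte order and *port carries the network-order value 19*256+137 = 5001. The port arithmetic can be checked in isolation:

#include <stdio.h>

int main(void)
{
	unsigned char hi = 19, lo = 137;	/* the "ppp,ppp" fields */
	printf("passive data port = %u\n", hi * 256u + lo);	/* prints 5001 */
	return 0;
}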
121
122
123/*
124 * Look at outgoing ftp packets to catch the response to a PASV command
125 * from the server (inside-to-outside).
126 * When we see one, we build a connection entry with the client address,
127 * client port 0 (unknown at the moment), the server address and the
128 * server port. Mark the current connection entry as a control channel
129 * of the new entry. All this work is just so that the data connection
130 * can be scheduled to the right server later.
131 *
132 * The outgoing packet should be something like
133 * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
134 * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
135 */
136static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
137 struct sk_buff *skb, int *diff)
138{
139 struct iphdr *iph;
140 struct tcphdr *th;
141 char *data, *data_limit;
142 char *start, *end;
143 union nf_inet_addr from;
144 __be16 port;
145 struct ip_vs_conn *n_cp;
146 char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
147 unsigned buf_len;
148 int ret;
149
150#ifdef CONFIG_IP_VS_IPV6
151 /* This application helper doesn't work with IPv6 yet,
152 * so turn this into a no-op for IPv6 packets
153 */
154 if (cp->af == AF_INET6)
155 return 1;
156#endif
157
158 *diff = 0;
159
160 /* Only useful for established sessions */
161 if (cp->state != IP_VS_TCP_S_ESTABLISHED)
162 return 1;
163
164 /* Linear packets are much easier to deal with. */
165 if (!skb_make_writable(skb, skb->len))
166 return 0;
167
168 if (cp->app_data == &ip_vs_ftp_pasv) {
169 iph = ip_hdr(skb);
170 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
171 data = (char *)th + (th->doff << 2);
172 data_limit = skb_tail_pointer(skb);
173
174 if (ip_vs_ftp_get_addrport(data, data_limit,
175 SERVER_STRING,
176 sizeof(SERVER_STRING)-1, ')',
177 &from.ip, &port,
178 &start, &end) != 1)
179 return 1;
180
181 IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> "
182 "%u.%u.%u.%u:%d detected\n",
183 NIPQUAD(from.ip), ntohs(port),
184 NIPQUAD(cp->caddr.ip), 0);
185
186 /*
187 * Now update or create a connection entry for it
188 */
189 n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port,
190 &cp->caddr, 0);
191 if (!n_cp) {
192 n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
193 &cp->caddr, 0,
194 &cp->vaddr, port,
195 &from, port,
196 IP_VS_CONN_F_NO_CPORT,
197 cp->dest);
198 if (!n_cp)
199 return 0;
200
201 /* add its controller */
202 ip_vs_control_add(n_cp, cp);
203 }
204
205 /*
206 * Replace the old passive address with the new one
207 */
208 from.ip = n_cp->vaddr.ip;
209 port = n_cp->vport;
210 sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip),
211 (ntohs(port)>>8)&255, ntohs(port)&255);
212 buf_len = strlen(buf);
213
214 /*
215 * Calculate required delta-offset to keep TCP happy
216 */
217 *diff = buf_len - (end-start);
218
219 if (*diff == 0) {
220 /* simply replace it with new passive address */
221 memcpy(start, buf, buf_len);
222 ret = 1;
223 } else {
224 ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start,
225 end-start, buf, buf_len);
226 }
227
228 cp->app_data = NULL;
229 ip_vs_tcp_conn_listen(n_cp);
230 ip_vs_conn_put(n_cp);
231 return ret;
232 }
233 return 1;
234}
235
236
237/*
238 * Look at incoming ftp packets to catch the PASV/PORT command
239 * (outside-to-inside).
240 *
241 * The incoming packet having the PORT command should be something like
242 * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
243 * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
244 * In this case, we create a connection entry using the client address and
245 * port, so that the active ftp data connection from the server can reach
246 * the client.
247 */
248static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
249 struct sk_buff *skb, int *diff)
250{
251 struct iphdr *iph;
252 struct tcphdr *th;
253 char *data, *data_start, *data_limit;
254 char *start, *end;
255 union nf_inet_addr to;
256 __be16 port;
257 struct ip_vs_conn *n_cp;
258
259#ifdef CONFIG_IP_VS_IPV6
260 /* This application helper doesn't work with IPv6 yet,
261 * so turn this into a no-op for IPv6 packets
262 */
263 if (cp->af == AF_INET6)
264 return 1;
265#endif
266
267 /* no diff required for incoming packets */
268 *diff = 0;
269
270 /* Only useful for established sessions */
271 if (cp->state != IP_VS_TCP_S_ESTABLISHED)
272 return 1;
273
274 /* Linear packets are much easier to deal with. */
275 if (!skb_make_writable(skb, skb->len))
276 return 0;
277
278 /*
279 * Detecting whether it is passive
280 */
281 iph = ip_hdr(skb);
282 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
283
284 /* Since there may be options in the TCP header and its data offset
285 (doff) gives the header length in 32-bit words, the payload start
286 is correctly computed as th + doff*4 */
287 data = data_start = (char *)th + (th->doff << 2);
288 data_limit = skb_tail_pointer(skb);
289
290 while (data <= data_limit - 6) {
291 if (strnicmp(data, "PASV\r\n", 6) == 0) {
292 /* Passive mode on */
293 IP_VS_DBG(7, "got PASV at %td of %td\n",
294 data - data_start,
295 data_limit - data_start);
296 cp->app_data = &ip_vs_ftp_pasv;
297 return 1;
298 }
299 data++;
300 }
301
302 /*
303 * To support a virtual FTP server, the scenario is as follows:
304 * FTP client ----> Load Balancer ----> FTP server
305 * First detect the port number in the application data,
306 * then create a new connection entry for the coming data
307 * connection.
308 */
309 if (ip_vs_ftp_get_addrport(data_start, data_limit,
310 CLIENT_STRING, sizeof(CLIENT_STRING)-1,
311 '\r', &to.ip, &port,
312 &start, &end) != 1)
313 return 1;
314
315 IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n",
316 NIPQUAD(to.ip), ntohs(port));
317
318 /* Passive mode off */
319 cp->app_data = NULL;
320
321 /*
322 * Now update or create a connection entry for it
323 */
324 IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
325 ip_vs_proto_name(iph->protocol),
326 NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0);
327
328 n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol,
329 &to, port,
330 &cp->vaddr, htons(ntohs(cp->vport)-1));
331 if (!n_cp) {
332 n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
333 &to, port,
334 &cp->vaddr, htons(ntohs(cp->vport)-1),
335 &cp->daddr, htons(ntohs(cp->dport)-1),
336 0,
337 cp->dest);
338 if (!n_cp)
339 return 0;
340
341 /* add its controller */
342 ip_vs_control_add(n_cp, cp);
343 }
344
345 /*
346 * Move tunnel to listen state
347 */
348 ip_vs_tcp_conn_listen(n_cp);
349 ip_vs_conn_put(n_cp);
350
351 return 1;
352}
353
354
355static struct ip_vs_app ip_vs_ftp = {
356 .name = "ftp",
357 .type = IP_VS_APP_TYPE_FTP,
358 .protocol = IPPROTO_TCP,
359 .module = THIS_MODULE,
360 .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list),
361 .init_conn = ip_vs_ftp_init_conn,
362 .done_conn = ip_vs_ftp_done_conn,
363 .bind_conn = NULL,
364 .unbind_conn = NULL,
365 .pkt_out = ip_vs_ftp_out,
366 .pkt_in = ip_vs_ftp_in,
367};
368
369
370/*
371 * ip_vs_ftp initialization
372 */
373static int __init ip_vs_ftp_init(void)
374{
375 int i, ret;
376 struct ip_vs_app *app = &ip_vs_ftp;
377
378 ret = register_ip_vs_app(app);
379 if (ret)
380 return ret;
381
382 for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
383 if (!ports[i])
384 continue;
385 ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
386 if (ret)
387 break;
388 IP_VS_INFO("%s: loaded support on port[%d] = %d\n",
389 app->name, i, ports[i]);
390 }
391
392 if (ret)
393 unregister_ip_vs_app(app);
394
395 return ret;
396}
397
398
399/*
400 * ip_vs_ftp finish.
401 */
402static void __exit ip_vs_ftp_exit(void)
403{
404 unregister_ip_vs_app(&ip_vs_ftp);
405}
406
407
408module_init(ip_vs_ftp_init);
409module_exit(ip_vs_ftp_exit);
410MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
deleted file mode 100644
index 6ecef3518cac..000000000000
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ /dev/null
@@ -1,555 +0,0 @@
1/*
2 * IPVS: Locality-Based Least-Connection scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@gnuchina.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 * Martin Hamilton : fixed the terrible locking bugs
13 * *lock(tbl->lock) ==> *lock(&tbl->lock)
14 * Wensong Zhang : fixed the uninitialized tbl->lock bug
15 * Wensong Zhang : added doing full expiration check to
16 * collect stale entries of 24+ hours when
17 * no partial expire check in a half hour
18 * Julian Anastasov : replaced del_timer call with del_timer_sync
19 * to avoid the possible race between timer
20 * handler and del_timer thread in SMP
21 *
22 */
23
24/*
25 * The lblc algorithm is as follows (pseudo code):
26 *
27 * if cachenode[dest_ip] is null then
28 * n, cachenode[dest_ip] <- {weighted least-conn node};
29 * else
30 * n <- cachenode[dest_ip];
31 * if (n is dead) OR
32 * (n.conns>n.weight AND
33 * there is a node m with m.conns<m.weight/2) then
34 * n, cachenode[dest_ip] <- {weighted least-conn node};
35 *
36 * return n;
37 *
38 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
39 * me to write this module.
40 */
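A hedged restatement of the reassignment rule in the pseudo code above: the cached node is kept unless it is dead, or it is already past its weight while some other node sits below half of its own weight. As a sketch (illustrative types, not kernel code):

struct node { int alive, conns, weight; };

/* Returns 1 when the cache entry n should be replaced by a fresh
 * weighted least-connection choice, per the pseudo code above. */
static int lblc_should_reassign(const struct node *n,
				const struct node *nodes, int count)
{
	int i;

	if (!n->alive)
		return 1;
	if (n->conns <= n->weight)
		return 0;
	for (i = 0; i < count; i++)
		if (nodes[i].conns * 2 < nodes[i].weight)
			return 1;
	return 0;
}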
41
42#include <linux/ip.h>
43#include <linux/module.h>
44#include <linux/kernel.h>
45#include <linux/skbuff.h>
46#include <linux/jiffies.h>
47
48/* for sysctl */
49#include <linux/fs.h>
50#include <linux/sysctl.h>
51
52#include <net/ip_vs.h>
53
54
55/*
56 * It is for garbage collection of stale IPVS lblc entries,
57 * when the table is full.
58 */
59#define CHECK_EXPIRE_INTERVAL (60*HZ)
60#define ENTRY_TIMEOUT (6*60*HZ)
61
62/*
63 * It is for full expiration check.
64 * When there is no partial expiration check (garbage collection)
65 * in a half hour, do a full expiration check to collect stale
66 * entries that haven't been touched for a day.
67 */
68#define COUNT_FOR_FULL_EXPIRATION 30
69static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
70
71
72/*
73 * for IPVS lblc entry hash table
74 */
75#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
76#define CONFIG_IP_VS_LBLC_TAB_BITS 10
77#endif
78#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
79#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
80#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
81
82
83/*
84 * IPVS lblc entry represents an association between destination
85 * IP address and its destination server
86 */
87struct ip_vs_lblc_entry {
88 struct list_head list;
89 __be32 addr; /* destination IP address */
90 struct ip_vs_dest *dest; /* real server (cache) */
91 unsigned long lastuse; /* last used time */
92};
93
94
95/*
96 * IPVS lblc hash table
97 */
98struct ip_vs_lblc_table {
99 struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
100 atomic_t entries; /* number of entries */
101 int max_size; /* maximum size of entries */
102 struct timer_list periodic_timer; /* collect stale entries */
103 int rover; /* rover for expire check */
104 int counter; /* counter for no expire */
105};
106
107
108/*
109 * IPVS LBLC sysctl table
110 */
111
112static ctl_table vs_vars_table[] = {
113 {
114 .procname = "lblc_expiration",
115 .data = &sysctl_ip_vs_lblc_expiration,
116 .maxlen = sizeof(int),
117 .mode = 0644,
118 .proc_handler = &proc_dointvec_jiffies,
119 },
120 { .ctl_name = 0 }
121};
122
123static struct ctl_table_header * sysctl_header;
124
125static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
126{
127 list_del(&en->list);
128 /*
129 * We don't kfree dest because it is referred to either by its service
130 * or by the trash dest list.
131 */
132 atomic_dec(&en->dest->refcnt);
133 kfree(en);
134}
135
136
137/*
138 * Returns hash value for IPVS LBLC entry
139 */
140static inline unsigned ip_vs_lblc_hashkey(__be32 addr)
141{
142 return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
143}
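The multiplier 2654435761 (0x9E3779B1) used above is the familiar Knuth multiplicative-hashing constant, a prime close to 2^32/φ, so the multiplication scrambles the address bits before the result is masked down to IP_VS_LBLC_TAB_BITS bits to select a bucket.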
144
145
146/*
147 * Hash an entry in the ip_vs_lblc_table.
148 * The entry is added to the head of its hash bucket.
149 */
150static void
151ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
152{
153 unsigned hash = ip_vs_lblc_hashkey(en->addr);
154
155 list_add(&en->list, &tbl->bucket[hash]);
156 atomic_inc(&tbl->entries);
157}
158
159
160/*
161 * Get ip_vs_lblc_entry associated with supplied parameters. Called under read
162 * lock
163 */
164static inline struct ip_vs_lblc_entry *
165ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
166{
167 unsigned hash = ip_vs_lblc_hashkey(addr);
168 struct ip_vs_lblc_entry *en;
169
170 list_for_each_entry(en, &tbl->bucket[hash], list)
171 if (en->addr == addr)
172 return en;
173
174 return NULL;
175}
176
177
178/*
179 * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
180 * address to a server. Called under write lock.
181 */
182static inline struct ip_vs_lblc_entry *
183ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr,
184 struct ip_vs_dest *dest)
185{
186 struct ip_vs_lblc_entry *en;
187
188 en = ip_vs_lblc_get(tbl, daddr);
189 if (!en) {
190 en = kmalloc(sizeof(*en), GFP_ATOMIC);
191 if (!en) {
192 IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
193 return NULL;
194 }
195
196 en->addr = daddr;
197 en->lastuse = jiffies;
198
199 atomic_inc(&dest->refcnt);
200 en->dest = dest;
201
202 ip_vs_lblc_hash(tbl, en);
203 } else if (en->dest != dest) {
204 atomic_dec(&en->dest->refcnt);
205 atomic_inc(&dest->refcnt);
206 en->dest = dest;
207 }
208
209 return en;
210}
211
212
213/*
214 * Flush all the entries of the specified table.
215 */
216static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
217{
218 struct ip_vs_lblc_entry *en, *nxt;
219 int i;
220
221 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
222 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
223 ip_vs_lblc_free(en);
224 atomic_dec(&tbl->entries);
225 }
226 }
227}
228
229
230static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
231{
232 struct ip_vs_lblc_table *tbl = svc->sched_data;
233 struct ip_vs_lblc_entry *en, *nxt;
234 unsigned long now = jiffies;
235 int i, j;
236
237 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
238 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
239
240 write_lock(&svc->sched_lock);
241 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
242 if (time_before(now,
243 en->lastuse + sysctl_ip_vs_lblc_expiration))
244 continue;
245
246 ip_vs_lblc_free(en);
247 atomic_dec(&tbl->entries);
248 }
249 write_unlock(&svc->sched_lock);
250 }
251 tbl->rover = j;
252}
253
254
255/*
256 * Periodic timer handler for the IPVS lblc table
257 * It is used to collect stale entries when the number of entries
258 * exceeds the maximum size of the table.
259 *
260 * Fixme: we probably need a more complicated algorithm to collect
261 * entries that have not been used for a long time even
262 * if the number of entries doesn't exceed the maximum size
263 * of the table.
264 * The full expiration check is for this purpose now.
265 */
266static void ip_vs_lblc_check_expire(unsigned long data)
267{
268 struct ip_vs_service *svc = (struct ip_vs_service *) data;
269 struct ip_vs_lblc_table *tbl = svc->sched_data;
270 unsigned long now = jiffies;
271 int goal;
272 int i, j;
273 struct ip_vs_lblc_entry *en, *nxt;
274
275 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
276 /* do full expiration check */
277 ip_vs_lblc_full_check(svc);
278 tbl->counter = 1;
279 goto out;
280 }
281
282 if (atomic_read(&tbl->entries) <= tbl->max_size) {
283 tbl->counter++;
284 goto out;
285 }
286
287 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
288 if (goal > tbl->max_size/2)
289 goal = tbl->max_size/2;
290
291 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
292 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
293
294 write_lock(&svc->sched_lock);
295 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
296 if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
297 continue;
298
299 ip_vs_lblc_free(en);
300 atomic_dec(&tbl->entries);
301 goal--;
302 }
303 write_unlock(&svc->sched_lock);
304 if (goal <= 0)
305 break;
306 }
307 tbl->rover = j;
308
309 out:
310 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
311}
312
313
314static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
315{
316 int i;
317 struct ip_vs_lblc_table *tbl;
318
319 /*
320 * Allocate the ip_vs_lblc_table for this service
321 */
322 tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
323 if (tbl == NULL) {
324 IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
325 return -ENOMEM;
326 }
327 svc->sched_data = tbl;
328 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
329 "current service\n", sizeof(*tbl));
330
331 /*
332 * Initialize the hash buckets
333 */
334 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
335 INIT_LIST_HEAD(&tbl->bucket[i]);
336 }
337 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
338 tbl->rover = 0;
339 tbl->counter = 1;
340
341 /*
342 * Hook periodic timer for garbage collection
343 */
344 setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
345 (unsigned long)svc);
346 mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
347
348 return 0;
349}
350
351
352static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
353{
354 struct ip_vs_lblc_table *tbl = svc->sched_data;
355
356 /* remove periodic timer */
357 del_timer_sync(&tbl->periodic_timer);
358
359 /* got to clean up table entries here */
360 ip_vs_lblc_flush(tbl);
361
362 /* release the table itself */
363 kfree(tbl);
364 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
365 sizeof(*tbl));
366
367 return 0;
368}
369
370
371static inline struct ip_vs_dest *
372__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
373{
374 struct ip_vs_dest *dest, *least;
375 int loh, doh;
376
377 /*
378 * We think the overhead of processing active connections is fifty
379 * times higher than that of inactive connections on average. (This
380 * factor of fifty might not be accurate; we will change it later.) We
381 * use the following formula to estimate the overhead:
382 * dest->activeconns*50 + dest->inactconns
383 * and the load:
384 * (dest overhead) / dest->weight
385 *
386 * Remember -- no floats in kernel mode!!!
387 * The comparison of h1*w2 > h2*w1 is equivalent to that of
388 * h1/w1 > h2/w2
389 * if every weight is larger than zero.
390 *
391 * The server with weight=0 is quiesced and will not receive any
392 * new connection.
393 */
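	/*
	 * Worked example (editor's illustration): suppose "least" is A with
	 * 10 active / 5 inactive connections and weight 3 (loh = 10*50+5 = 505),
	 * and the candidate dest is B with 4 active / 100 inactive and weight 1
	 * (doh = 4*50+100 = 300). The comparison in the nextstage loop below
	 * tests loh*w_B = 505 against doh*w_A = 900; since 505 is not greater,
	 * A keeps the slot, matching the division form 505/3 ~= 168 < 300/1.
	 */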
394 list_for_each_entry(dest, &svc->destinations, n_list) {
395 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
396 continue;
397 if (atomic_read(&dest->weight) > 0) {
398 least = dest;
399 loh = atomic_read(&least->activeconns) * 50
400 + atomic_read(&least->inactconns);
401 goto nextstage;
402 }
403 }
404 return NULL;
405
406 /*
407 * Find the destination with the least load.
408 */
409 nextstage:
410 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
411 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
412 continue;
413
414 doh = atomic_read(&dest->activeconns) * 50
415 + atomic_read(&dest->inactconns);
416 if (loh * atomic_read(&dest->weight) >
417 doh * atomic_read(&least->weight)) {
418 least = dest;
419 loh = doh;
420 }
421 }
422
423 IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
424 "activeconns %d refcnt %d weight %d overhead %d\n",
425 NIPQUAD(least->addr.ip), ntohs(least->port),
426 atomic_read(&least->activeconns),
427 atomic_read(&least->refcnt),
428 atomic_read(&least->weight), loh);
429
430 return least;
431}
432
433
434/*
435 * If this destination server is overloaded and there is a less loaded
436 * server, then return true.
437 */
438static inline int
439is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
440{
441 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
442 struct ip_vs_dest *d;
443
444 list_for_each_entry(d, &svc->destinations, n_list) {
445 if (atomic_read(&d->activeconns)*2
446 < atomic_read(&d->weight)) {
447 return 1;
448 }
449 }
450 }
451 return 0;
452}
453
454
455/*
456 * Locality-Based (weighted) Least-Connection scheduling
457 */
458static struct ip_vs_dest *
459ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
460{
461 struct ip_vs_lblc_table *tbl = svc->sched_data;
462 struct iphdr *iph = ip_hdr(skb);
463 struct ip_vs_dest *dest = NULL;
464 struct ip_vs_lblc_entry *en;
465
466 IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
467
468 /* First look in our cache */
469 read_lock(&svc->sched_lock);
470 en = ip_vs_lblc_get(tbl, iph->daddr);
471 if (en) {
472 /* We only hold a read lock, but this is atomic */
473 en->lastuse = jiffies;
474
475 /*
476 * If the destination is not available, i.e. it's in the trash,
477 * we must ignore it, as it may be removed from under our feet
478 * if someone drops our reference count. Our caller only makes
479 * sure that destinations that are not in the trash are not
480 * moved to the trash while we are scheduling. But anyone can
481 * free up entries from the trash at any time.
482 */
483
484 if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
485 dest = en->dest;
486 }
487 read_unlock(&svc->sched_lock);
488
489 /* If the destination has a weight and is not overloaded, use it */
490 if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
491 goto out;
492
493 /* No cache entry or it is invalid, time to schedule */
494 dest = __ip_vs_lblc_schedule(svc, iph);
495 if (!dest) {
496 IP_VS_DBG(1, "no destination available\n");
497 return NULL;
498 }
499
500 /* If we fail to create a cache entry, we'll just use the valid dest */
501 write_lock(&svc->sched_lock);
502 ip_vs_lblc_new(tbl, iph->daddr, dest);
503 write_unlock(&svc->sched_lock);
504
505out:
506 IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
507 "--> server %u.%u.%u.%u:%d\n",
508 NIPQUAD(iph->daddr),
509 NIPQUAD(dest->addr.ip),
510 ntohs(dest->port));
511
512 return dest;
513}
514
515
516/*
517 * IPVS LBLC Scheduler structure
518 */
519static struct ip_vs_scheduler ip_vs_lblc_scheduler =
520{
521 .name = "lblc",
522 .refcnt = ATOMIC_INIT(0),
523 .module = THIS_MODULE,
524 .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
525#ifdef CONFIG_IP_VS_IPV6
526 .supports_ipv6 = 0,
527#endif
528 .init_service = ip_vs_lblc_init_svc,
529 .done_service = ip_vs_lblc_done_svc,
530 .schedule = ip_vs_lblc_schedule,
531};
532
533
534static int __init ip_vs_lblc_init(void)
535{
536 int ret;
537
538 sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
539 ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
540 if (ret)
541 unregister_sysctl_table(sysctl_header);
542 return ret;
543}
544
545
546static void __exit ip_vs_lblc_cleanup(void)
547{
548 unregister_sysctl_table(sysctl_header);
549 unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
550}
551
552
553module_init(ip_vs_lblc_init);
554module_exit(ip_vs_lblc_cleanup);
555MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
deleted file mode 100644
index 1f75ea83bcf8..000000000000
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ /dev/null
@@ -1,755 +0,0 @@
1/*
2 * IPVS: Locality-Based Least-Connection with Replication scheduler
3 *
4 * Authors: Wensong Zhang <wensong@gnuchina.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 * Julian Anastasov : Added the missing (dest->weight>0)
13 * condition in the ip_vs_dest_set_max.
14 *
15 */
16
17/*
18 * The lblc/r algorithm is as follows (pseudo code):
19 *
20 * if serverSet[dest_ip] is null then
21 * n, serverSet[dest_ip] <- {weighted least-conn node};
22 * else
23 * n <- {least-conn (alive) node in serverSet[dest_ip]};
24 * if (n is null) OR
25 * (n.conns>n.weight AND
26 * there is a node m with m.conns<m.weight/2) then
27 * n <- {weighted least-conn node};
28 * add n to serverSet[dest_ip];
29 * if |serverSet[dest_ip]| > 1 AND
30 * now - serverSet[dest_ip].lastMod > T then
31 * m <- {most conn node in serverSet[dest_ip]};
32 * remove m from serverSet[dest_ip];
33 * if serverSet[dest_ip] changed then
34 * serverSet[dest_ip].lastMod <- now;
35 *
36 * return n;
37 *
38 */
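
For readers following the pseudo code, here is a minimal standalone C sketch of just the set-expansion test, i.e. the condition that triggers a fresh weighted least-connection pick and a new member for serverSet[dest_ip]; the struct and function names are illustrative, not the kernel's:

#include <stdio.h>

struct node { int conns, weight; };

/* expand when the chosen member n is missing, or when n is above its
 * weight while some node m still has m.conns < m.weight/2 */
static int lblcr_should_expand(const struct node *n,
                               const struct node *nodes, int nnodes)
{
    int i;

    if (n == NULL)
        return 1;
    if (n->conns > n->weight)
        for (i = 0; i < nnodes; i++)
            if (nodes[i].conns < nodes[i].weight / 2)
                return 1;
    return 0;
}

int main(void)
{
    struct node pool[] = { { 12, 10 }, { 1, 8 } };

    /* pool[0] is over its weight and pool[1] has plenty of room */
    printf("%d\n", lblcr_should_expand(&pool[0], pool, 2));  /* 1 */
    printf("%d\n", lblcr_should_expand(&pool[1], pool, 2));  /* 0 */
    return 0;
}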
39
40#include <linux/ip.h>
41#include <linux/module.h>
42#include <linux/kernel.h>
43#include <linux/skbuff.h>
44#include <linux/jiffies.h>
45
46/* for sysctl */
47#include <linux/fs.h>
48#include <linux/sysctl.h>
49#include <net/net_namespace.h>
50
51#include <net/ip_vs.h>
52
53
54/*
55 * It is for garbage collection of stale IPVS lblcr entries,
56 * when the table is full.
57 */
58#define CHECK_EXPIRE_INTERVAL (60*HZ)
59#define ENTRY_TIMEOUT (6*60*HZ)
60
61/*
62 * It is for full expiration check.
63 * When there is no partial expiration check (garbage collection)
64 * in a half hour, do a full expiration check to collect stale
65 * entries that haven't been touched for a day.
66 */
67#define COUNT_FOR_FULL_EXPIRATION 30
68static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
69
70
71/*
72 * for IPVS lblcr entry hash table
73 */
74#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
75#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
76#endif
77#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
78#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
79#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
80
81
82/*
83 * IPVS destination set structure and operations
84 */
85struct ip_vs_dest_list {
86 struct ip_vs_dest_list *next; /* list link */
87 struct ip_vs_dest *dest; /* destination server */
88};
89
90struct ip_vs_dest_set {
91 atomic_t size; /* set size */
92 unsigned long lastmod; /* last modified time */
93 struct ip_vs_dest_list *list; /* destination list */
94 rwlock_t lock; /* lock for this list */
95};
96
97
98static struct ip_vs_dest_list *
99ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
100{
101 struct ip_vs_dest_list *e;
102
103 for (e=set->list; e!=NULL; e=e->next) {
104 if (e->dest == dest)
105 /* already existed */
106 return NULL;
107 }
108
109 e = kmalloc(sizeof(*e), GFP_ATOMIC);
110 if (e == NULL) {
111 IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
112 return NULL;
113 }
114
115 atomic_inc(&dest->refcnt);
116 e->dest = dest;
117
118 /* link it to the list */
119 e->next = set->list;
120 set->list = e;
121 atomic_inc(&set->size);
122
123 set->lastmod = jiffies;
124 return e;
125}
126
127static void
128ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
129{
130 struct ip_vs_dest_list *e, **ep;
131
132 for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
133 if (e->dest == dest) {
134 /* HIT */
135 *ep = e->next;
136 atomic_dec(&set->size);
137 set->lastmod = jiffies;
138 atomic_dec(&e->dest->refcnt);
139 kfree(e);
140 break;
141 }
142 ep = &e->next;
143 }
144}
145
146static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
147{
148 struct ip_vs_dest_list *e, **ep;
149
150 write_lock(&set->lock);
151 for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
152 *ep = e->next;
153 /*
154		 * We don't kfree dest because it is referred to either
155 * by its service or by the trash dest list.
156 */
157 atomic_dec(&e->dest->refcnt);
158 kfree(e);
159 }
160 write_unlock(&set->lock);
161}
162
163/* get weighted least-connection node in the destination set */
164static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
165{
166 register struct ip_vs_dest_list *e;
167 struct ip_vs_dest *dest, *least;
168 int loh, doh;
169
170 if (set == NULL)
171 return NULL;
172
173 /* select the first destination server, whose weight > 0 */
174 for (e=set->list; e!=NULL; e=e->next) {
175 least = e->dest;
176 if (least->flags & IP_VS_DEST_F_OVERLOAD)
177 continue;
178
179 if ((atomic_read(&least->weight) > 0)
180 && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
181 loh = atomic_read(&least->activeconns) * 50
182 + atomic_read(&least->inactconns);
183 goto nextstage;
184 }
185 }
186 return NULL;
187
188 /* find the destination with the weighted least load */
189 nextstage:
190 for (e=e->next; e!=NULL; e=e->next) {
191 dest = e->dest;
192 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
193 continue;
194
195 doh = atomic_read(&dest->activeconns) * 50
196 + atomic_read(&dest->inactconns);
197 if ((loh * atomic_read(&dest->weight) >
198 doh * atomic_read(&least->weight))
199 && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
200 least = dest;
201 loh = doh;
202 }
203 }
204
205 IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
206 "activeconns %d refcnt %d weight %d overhead %d\n",
207 NIPQUAD(least->addr.ip), ntohs(least->port),
208 atomic_read(&least->activeconns),
209 atomic_read(&least->refcnt),
210 atomic_read(&least->weight), loh);
211 return least;
212}
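
The selection above avoids floating point by cross-multiplying: overhead_a/weight_a < overhead_b/weight_b is equivalent to overhead_a*weight_b < overhead_b*weight_a when both weights are positive. A self-contained sketch of the same rule, using the module's activeconns*50 + inactconns overhead estimate (struct and values are illustrative):

#include <stdio.h>

struct server { int active, inact, weight; };

/* overhead estimate used by the lblc/r code: active*50 + inactive */
static int overhead(const struct server *s)
{
    return s->active * 50 + s->inact;
}

/* pick the server with the smallest overhead/weight ratio without
 * floating point: a/wa < b/wb  <=>  a*wb < b*wa for positive weights */
static int pick_least(const struct server *s, int n)
{
    int best = -1, i;

    for (i = 0; i < n; i++) {
        if (s[i].weight <= 0)
            continue;
        if (best < 0 ||
            overhead(&s[i]) * s[best].weight <
            overhead(&s[best]) * s[i].weight)
            best = i;
    }
    return best;
}

int main(void)
{
    struct server pool[] = { {10, 5, 1}, {30, 0, 4}, {2, 100, 1} };

    printf("least loaded: server %d\n", pick_least(pool, 3));  /* server 2 */
    return 0;
}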
213
214
215/* get weighted most-connection node in the destination set */
216static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
217{
218 register struct ip_vs_dest_list *e;
219 struct ip_vs_dest *dest, *most;
220 int moh, doh;
221
222 if (set == NULL)
223 return NULL;
224
225 /* select the first destination server, whose weight > 0 */
226 for (e=set->list; e!=NULL; e=e->next) {
227 most = e->dest;
228 if (atomic_read(&most->weight) > 0) {
229 moh = atomic_read(&most->activeconns) * 50
230 + atomic_read(&most->inactconns);
231 goto nextstage;
232 }
233 }
234 return NULL;
235
236 /* find the destination with the weighted most load */
237 nextstage:
238 for (e=e->next; e!=NULL; e=e->next) {
239 dest = e->dest;
240 doh = atomic_read(&dest->activeconns) * 50
241 + atomic_read(&dest->inactconns);
242 /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
243 if ((moh * atomic_read(&dest->weight) <
244 doh * atomic_read(&most->weight))
245 && (atomic_read(&dest->weight) > 0)) {
246 most = dest;
247 moh = doh;
248 }
249 }
250
251 IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
252 "activeconns %d refcnt %d weight %d overhead %d\n",
253 NIPQUAD(most->addr.ip), ntohs(most->port),
254 atomic_read(&most->activeconns),
255 atomic_read(&most->refcnt),
256 atomic_read(&most->weight), moh);
257 return most;
258}
259
260
261/*
262 * IPVS lblcr entry represents an association between destination
263 * IP address and its destination server set
264 */
265struct ip_vs_lblcr_entry {
266 struct list_head list;
267 __be32 addr; /* destination IP address */
268 struct ip_vs_dest_set set; /* destination server set */
269 unsigned long lastuse; /* last used time */
270};
271
272
273/*
274 * IPVS lblcr hash table
275 */
276struct ip_vs_lblcr_table {
277 struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
278 atomic_t entries; /* number of entries */
279 int max_size; /* maximum size of entries */
280 struct timer_list periodic_timer; /* collect stale entries */
281 int rover; /* rover for expire check */
282 int counter; /* counter for no expire */
283};
284
285
286/*
287 * IPVS LBLCR sysctl table
288 */
289
290static ctl_table vs_vars_table[] = {
291 {
292 .procname = "lblcr_expiration",
293 .data = &sysctl_ip_vs_lblcr_expiration,
294 .maxlen = sizeof(int),
295 .mode = 0644,
296 .proc_handler = &proc_dointvec_jiffies,
297 },
298 { .ctl_name = 0 }
299};
300
301static struct ctl_table_header * sysctl_header;
302
303static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
304{
305 list_del(&en->list);
306 ip_vs_dest_set_eraseall(&en->set);
307 kfree(en);
308}
309
310
311/*
312 * Returns hash value for IPVS LBLCR entry
313 */
314static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
315{
316 return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
317}
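
The hash key above is a Knuth-style multiplicative hash: multiply the host-order address by 2654435761 (roughly 2^32 divided by the golden ratio) and mask down to the table size. A user-space equivalent, assuming the default 10-bit table:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define TAB_BITS 10
#define TAB_SIZE (1 << TAB_BITS)
#define TAB_MASK (TAB_SIZE - 1)

/* mirrors ip_vs_lblcr_hashkey(): multiplicative hash of an IPv4 address */
static unsigned hashkey(uint32_t addr_be)
{
    return (ntohl(addr_be) * 2654435761UL) & TAB_MASK;
}

int main(void)
{
    uint32_t a = htonl(0xc0a80001);   /* 192.168.0.1 */

    printf("bucket = %u of %d\n", hashkey(a), TAB_SIZE);
    return 0;
}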
318
319
320/*
321 * Hash an entry in the ip_vs_lblcr_table.
322 * The entry is linked into its hash bucket; no return value.
323 */
324static void
325ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
326{
327 unsigned hash = ip_vs_lblcr_hashkey(en->addr);
328
329 list_add(&en->list, &tbl->bucket[hash]);
330 atomic_inc(&tbl->entries);
331}
332
333
334/*
335 * Get ip_vs_lblcr_entry associated with supplied parameters. Called under
336 * read lock.
337 */
338static inline struct ip_vs_lblcr_entry *
339ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
340{
341 unsigned hash = ip_vs_lblcr_hashkey(addr);
342 struct ip_vs_lblcr_entry *en;
343
344 list_for_each_entry(en, &tbl->bucket[hash], list)
345 if (en->addr == addr)
346 return en;
347
348 return NULL;
349}
350
351
352/*
353 * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
354 * IP address to a server. Called under write lock.
355 */
356static inline struct ip_vs_lblcr_entry *
357ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr,
358 struct ip_vs_dest *dest)
359{
360 struct ip_vs_lblcr_entry *en;
361
362 en = ip_vs_lblcr_get(tbl, daddr);
363 if (!en) {
364 en = kmalloc(sizeof(*en), GFP_ATOMIC);
365 if (!en) {
366 IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
367 return NULL;
368 }
369
370 en->addr = daddr;
371 en->lastuse = jiffies;
372
373		/* initialize its dest set */
374 atomic_set(&(en->set.size), 0);
375 en->set.list = NULL;
376 rwlock_init(&en->set.lock);
377
378 ip_vs_lblcr_hash(tbl, en);
379 }
380
381 write_lock(&en->set.lock);
382 ip_vs_dest_set_insert(&en->set, dest);
383 write_unlock(&en->set.lock);
384
385 return en;
386}
387
388
389/*
390 * Flush all the entries of the specified table.
391 */
392static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
393{
394 int i;
395 struct ip_vs_lblcr_entry *en, *nxt;
396
397 /* No locking required, only called during cleanup. */
398 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
399 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
400 ip_vs_lblcr_free(en);
401 }
402 }
403}
404
405
406static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
407{
408 struct ip_vs_lblcr_table *tbl = svc->sched_data;
409 unsigned long now = jiffies;
410 int i, j;
411 struct ip_vs_lblcr_entry *en, *nxt;
412
413 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
414 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
415
416 write_lock(&svc->sched_lock);
417 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
418 if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
419 now))
420 continue;
421
422 ip_vs_lblcr_free(en);
423 atomic_dec(&tbl->entries);
424 }
425 write_unlock(&svc->sched_lock);
426 }
427 tbl->rover = j;
428}
429
430
431/*
432 * Periodical timer handler for IPVS lblcr table
433 * It is used to collect stale entries when the number of entries
434 * exceeds the maximum size of the table.
435 *
436 * Fixme: we probably need more complicated algorithm to collect
437 * entries that have not been used for a long time even
438 * if the number of entries doesn't exceed the maximum size
439 * of the table.
440 * The full expiration check is for this purpose now.
441 */
442static void ip_vs_lblcr_check_expire(unsigned long data)
443{
444 struct ip_vs_service *svc = (struct ip_vs_service *) data;
445 struct ip_vs_lblcr_table *tbl = svc->sched_data;
446 unsigned long now = jiffies;
447 int goal;
448 int i, j;
449 struct ip_vs_lblcr_entry *en, *nxt;
450
451 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
452 /* do full expiration check */
453 ip_vs_lblcr_full_check(svc);
454 tbl->counter = 1;
455 goto out;
456 }
457
458 if (atomic_read(&tbl->entries) <= tbl->max_size) {
459 tbl->counter++;
460 goto out;
461 }
462
463 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
464 if (goal > tbl->max_size/2)
465 goal = tbl->max_size/2;
466
467 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
468 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
469
470 write_lock(&svc->sched_lock);
471 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
472 if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
473 continue;
474
475 ip_vs_lblcr_free(en);
476 atomic_dec(&tbl->entries);
477 goal--;
478 }
479 write_unlock(&svc->sched_lock);
480 if (goal <= 0)
481 break;
482 }
483 tbl->rover = j;
484
485 out:
486 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
487}
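
The eviction target computed above is plain integer arithmetic: aim to free about 4/3 of the overshoot, but never more than half the nominal table size. The same computation in isolation (16384 below is the default max_size of IP_VS_LBLCR_TAB_SIZE*16):

#include <stdio.h>

static int gc_goal(int entries, int max_size)
{
    int goal = (entries - max_size) * 4 / 3;

    if (goal > max_size / 2)
        goal = max_size / 2;
    return goal;
}

int main(void)
{
    printf("%d\n", gc_goal(17000, 16384));   /* small overshoot: 821 */
    printf("%d\n", gc_goal(60000, 16384));   /* capped at 8192 */
    return 0;
}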
488
489static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
490{
491 int i;
492 struct ip_vs_lblcr_table *tbl;
493
494 /*
495 * Allocate the ip_vs_lblcr_table for this service
496 */
497 tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
498 if (tbl == NULL) {
499 IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
500 return -ENOMEM;
501 }
502 svc->sched_data = tbl;
503 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
504 "current service\n", sizeof(*tbl));
505
506 /*
507 * Initialize the hash buckets
508 */
509 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
510 INIT_LIST_HEAD(&tbl->bucket[i]);
511 }
512 tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
513 tbl->rover = 0;
514 tbl->counter = 1;
515
516 /*
517 * Hook periodic timer for garbage collection
518 */
519 setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
520 (unsigned long)svc);
521 mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
522
523 return 0;
524}
525
526
527static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
528{
529 struct ip_vs_lblcr_table *tbl = svc->sched_data;
530
531 /* remove periodic timer */
532 del_timer_sync(&tbl->periodic_timer);
533
534 /* got to clean up table entries here */
535 ip_vs_lblcr_flush(tbl);
536
537 /* release the table itself */
538 kfree(tbl);
539 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
540 sizeof(*tbl));
541
542 return 0;
543}
544
545
546static inline struct ip_vs_dest *
547__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
548{
549 struct ip_vs_dest *dest, *least;
550 int loh, doh;
551
552 /*
553 * We think the overhead of processing active connections is fifty
554	 * times higher than that of inactive connections on average. (This
555	 * factor of fifty might not be accurate; we will change it later.) We
556 * use the following formula to estimate the overhead:
557 * dest->activeconns*50 + dest->inactconns
558 * and the load:
559 * (dest overhead) / dest->weight
560 *
561 * Remember -- no floats in kernel mode!!!
562 * The comparison of h1*w2 > h2*w1 is equivalent to that of
563 * h1/w1 > h2/w2
564 * if every weight is larger than zero.
565 *
566 * The server with weight=0 is quiesced and will not receive any
567 * new connection.
568 */
569 list_for_each_entry(dest, &svc->destinations, n_list) {
570 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
571 continue;
572
573 if (atomic_read(&dest->weight) > 0) {
574 least = dest;
575 loh = atomic_read(&least->activeconns) * 50
576 + atomic_read(&least->inactconns);
577 goto nextstage;
578 }
579 }
580 return NULL;
581
582 /*
583 * Find the destination with the least load.
584 */
585 nextstage:
586 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
587 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
588 continue;
589
590 doh = atomic_read(&dest->activeconns) * 50
591 + atomic_read(&dest->inactconns);
592 if (loh * atomic_read(&dest->weight) >
593 doh * atomic_read(&least->weight)) {
594 least = dest;
595 loh = doh;
596 }
597 }
598
599 IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
600 "activeconns %d refcnt %d weight %d overhead %d\n",
601 NIPQUAD(least->addr.ip), ntohs(least->port),
602 atomic_read(&least->activeconns),
603 atomic_read(&least->refcnt),
604 atomic_read(&least->weight), loh);
605
606 return least;
607}
608
609
610/*
611 * If this destination server is overloaded and there is a less loaded
612 * server, then return true.
613 */
614static inline int
615is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
616{
617 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
618 struct ip_vs_dest *d;
619
620 list_for_each_entry(d, &svc->destinations, n_list) {
621 if (atomic_read(&d->activeconns)*2
622 < atomic_read(&d->weight)) {
623 return 1;
624 }
625 }
626 }
627 return 0;
628}
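
A compact user-space rendering of the overload test above: a server counts as overloaded only when it is past its weight and some peer clearly has spare capacity (names are illustrative):

#include <stdio.h>

struct dest { int active, weight; };

static int overloaded(const struct dest *d, const struct dest *pool, int n)
{
    int i;

    if (d->active <= d->weight)
        return 0;
    for (i = 0; i < n; i++)
        if (pool[i].active * 2 < pool[i].weight)
            return 1;       /* someone else has obvious headroom */
    return 0;
}

int main(void)
{
    struct dest pool[] = { { 12, 10 }, { 1, 10 } };

    printf("%d\n", overloaded(&pool[0], pool, 2));  /* 1: busy, idle peer */
    printf("%d\n", overloaded(&pool[1], pool, 2));  /* 0: under its weight */
    return 0;
}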
629
630
631/*
632 * Locality-Based (weighted) Least-Connection scheduling
633 */
634static struct ip_vs_dest *
635ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
636{
637 struct ip_vs_lblcr_table *tbl = svc->sched_data;
638 struct iphdr *iph = ip_hdr(skb);
639 struct ip_vs_dest *dest = NULL;
640 struct ip_vs_lblcr_entry *en;
641
642 IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
643
644 /* First look in our cache */
645 read_lock(&svc->sched_lock);
646 en = ip_vs_lblcr_get(tbl, iph->daddr);
647 if (en) {
648 /* We only hold a read lock, but this is atomic */
649 en->lastuse = jiffies;
650
651 /* Get the least loaded destination */
652 read_lock(&en->set.lock);
653 dest = ip_vs_dest_set_min(&en->set);
654 read_unlock(&en->set.lock);
655
656 /* More than one destination + enough time passed by, cleanup */
657 if (atomic_read(&en->set.size) > 1 &&
658 time_after(jiffies, en->set.lastmod +
659 sysctl_ip_vs_lblcr_expiration)) {
660 struct ip_vs_dest *m;
661
662 write_lock(&en->set.lock);
663 m = ip_vs_dest_set_max(&en->set);
664 if (m)
665 ip_vs_dest_set_erase(&en->set, m);
666 write_unlock(&en->set.lock);
667 }
668
669 /* If the destination is not overloaded, use it */
670 if (dest && !is_overloaded(dest, svc)) {
671 read_unlock(&svc->sched_lock);
672 goto out;
673 }
674
675 /* The cache entry is invalid, time to schedule */
676 dest = __ip_vs_lblcr_schedule(svc, iph);
677 if (!dest) {
678 IP_VS_DBG(1, "no destination available\n");
679 read_unlock(&svc->sched_lock);
680 return NULL;
681 }
682
683 /* Update our cache entry */
684 write_lock(&en->set.lock);
685 ip_vs_dest_set_insert(&en->set, dest);
686 write_unlock(&en->set.lock);
687 }
688 read_unlock(&svc->sched_lock);
689
690 if (dest)
691 goto out;
692
693 /* No cache entry, time to schedule */
694 dest = __ip_vs_lblcr_schedule(svc, iph);
695 if (!dest) {
696 IP_VS_DBG(1, "no destination available\n");
697 return NULL;
698 }
699
700 /* If we fail to create a cache entry, we'll just use the valid dest */
701 write_lock(&svc->sched_lock);
702 ip_vs_lblcr_new(tbl, iph->daddr, dest);
703 write_unlock(&svc->sched_lock);
704
705out:
706 IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
707 "--> server %u.%u.%u.%u:%d\n",
708 NIPQUAD(iph->daddr),
709 NIPQUAD(dest->addr.ip),
710 ntohs(dest->port));
711
712 return dest;
713}
714
715
716/*
717 * IPVS LBLCR Scheduler structure
718 */
719static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
720{
721 .name = "lblcr",
722 .refcnt = ATOMIC_INIT(0),
723 .module = THIS_MODULE,
724 .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
725#ifdef CONFIG_IP_VS_IPV6
726 .supports_ipv6 = 0,
727#endif
728 .init_service = ip_vs_lblcr_init_svc,
729 .done_service = ip_vs_lblcr_done_svc,
730 .schedule = ip_vs_lblcr_schedule,
731};
732
733
734static int __init ip_vs_lblcr_init(void)
735{
736 int ret;
737
738 sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
739 ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
740 if (ret)
741 unregister_sysctl_table(sysctl_header);
742 return ret;
743}
744
745
746static void __exit ip_vs_lblcr_cleanup(void)
747{
748 unregister_sysctl_table(sysctl_header);
749 unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
750}
751
752
753module_init(ip_vs_lblcr_init);
754module_exit(ip_vs_lblcr_cleanup);
755MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c
deleted file mode 100644
index b69f808ac461..000000000000
--- a/net/ipv4/ipvs/ip_vs_lc.c
+++ /dev/null
@@ -1,103 +0,0 @@
1/*
2 * IPVS: Least-Connection Scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 * Wensong Zhang : added the ip_vs_lc_update_svc
13 * Wensong Zhang : added any dest with weight=0 is quiesced
14 *
15 */
16
17#include <linux/module.h>
18#include <linux/kernel.h>
19
20#include <net/ip_vs.h>
21
22
23static inline unsigned int
24ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
25{
26 /*
27 * We think the overhead of processing active connections is 256
28 * times higher than that of inactive connections on average. (This
29 * factor of 256 might not be accurate; we will change it later.) We
30 * use the following formula to estimate the overhead now:
31 * dest->activeconns*256 + dest->inactconns
32 */
33 return (atomic_read(&dest->activeconns) << 8) +
34 atomic_read(&dest->inactconns);
35}
36
37
38/*
39 * Least Connection scheduling
40 */
41static struct ip_vs_dest *
42ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
43{
44 struct ip_vs_dest *dest, *least = NULL;
45 unsigned int loh = 0, doh;
46
47 IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
48
49 /*
50 * Simply select the server with the least overhead, computed as
51 *        (activeconns<<8) + inactconns,
52 * except servers whose weight is equal to zero.
53 * If the weight is equal to zero, it means that the server is
54 * quiesced, the existing connections to the server still get
55 * served, but no new connection is assigned to the server.
56 */
57
58 list_for_each_entry(dest, &svc->destinations, n_list) {
59 if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
60 atomic_read(&dest->weight) == 0)
61 continue;
62 doh = ip_vs_lc_dest_overhead(dest);
63 if (!least || doh < loh) {
64 least = dest;
65 loh = doh;
66 }
67 }
68
69 if (least)
70 IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d inactconns %d\n",
71 IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
72 atomic_read(&least->activeconns),
73 atomic_read(&least->inactconns));
74
75 return least;
76}
77
78
79static struct ip_vs_scheduler ip_vs_lc_scheduler = {
80 .name = "lc",
81 .refcnt = ATOMIC_INIT(0),
82 .module = THIS_MODULE,
83 .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list),
84#ifdef CONFIG_IP_VS_IPV6
85 .supports_ipv6 = 1,
86#endif
87 .schedule = ip_vs_lc_schedule,
88};
89
90
91static int __init ip_vs_lc_init(void)
92{
93 return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ;
94}
95
96static void __exit ip_vs_lc_cleanup(void)
97{
98 unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
99}
100
101module_init(ip_vs_lc_init);
102module_exit(ip_vs_lc_cleanup);
103MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c
deleted file mode 100644
index 9a2d8033f08f..000000000000
--- a/net/ipv4/ipvs/ip_vs_nq.c
+++ /dev/null
@@ -1,138 +0,0 @@
1/*
2 * IPVS: Never Queue scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 *
13 */
14
15/*
16 * The NQ algorithm adopts a two-speed model. When there is an idle server
17 * available, the job will be sent to the idle server, instead of waiting
18 * for a fast one. When there is no idle server available, the job will be
19 * sent to the server that minimize its expected delay (The Shortest
20 * sent to the server that minimizes its expected delay (the Shortest
21 *
22 * See the following paper for more information:
23 * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
24 * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
25 * pages 986-994, 1988.
26 *
27 * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
28 *
29 * The difference between NQ and SED is that NQ can improve overall
30 * system utilization.
31 *
32 */
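
A minimal standalone sketch of the two-speed rule described above, using the same active-connections-plus-one cost as the module and cross-multiplication instead of division (illustrative names):

#include <stdio.h>

struct server { int active, weight; };

/* an idle server wins immediately; otherwise minimize (active+1)/weight */
static int nq_pick(const struct server *s, int n)
{
    int best = -1, i;

    for (i = 0; i < n; i++) {
        if (s[i].weight == 0)
            continue;
        if (s[i].active == 0)
            return i;              /* never queue behind a busy server */
        if (best < 0 ||
            (s[i].active + 1) * s[best].weight <
            (s[best].active + 1) * s[i].weight)
            best = i;
    }
    return best;
}

int main(void)
{
    struct server pool[] = { { 3, 1 }, { 8, 4 }, { 5, 2 } };

    printf("picked server %d\n", nq_pick(pool, 3));   /* server 1 */
    return 0;
}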
33
34#include <linux/module.h>
35#include <linux/kernel.h>
36
37#include <net/ip_vs.h>
38
39
40static inline unsigned int
41ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
42{
43 /*
44 * We only use the active connection number in the cost
45 * calculation here.
46 */
47 return atomic_read(&dest->activeconns) + 1;
48}
49
50
51/*
52 * Never Queue scheduling
53 */
54static struct ip_vs_dest *
55ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
56{
57 struct ip_vs_dest *dest, *least = NULL;
58 unsigned int loh = 0, doh;
59
60 IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");
61
62 /*
63 * We calculate the load of each dest server as follows:
64 * (server expected overhead) / dest->weight
65 *
66 * Remember -- no floats in kernel mode!!!
67 * The comparison of h1*w2 > h2*w1 is equivalent to that of
68 * h1/w1 > h2/w2
69 * if every weight is larger than zero.
70 *
71 * The server with weight=0 is quiesced and will not receive any
72 * new connections.
73 */
74
75 list_for_each_entry(dest, &svc->destinations, n_list) {
76
77 if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
78 !atomic_read(&dest->weight))
79 continue;
80
81 doh = ip_vs_nq_dest_overhead(dest);
82
83 /* return the server directly if it is idle */
84 if (atomic_read(&dest->activeconns) == 0) {
85 least = dest;
86 loh = doh;
87 goto out;
88 }
89
90 if (!least ||
91 (loh * atomic_read(&dest->weight) >
92 doh * atomic_read(&least->weight))) {
93 least = dest;
94 loh = doh;
95 }
96 }
97
98 if (!least)
99 return NULL;
100
101 out:
102 IP_VS_DBG_BUF(6, "NQ: server %s:%u "
103 "activeconns %d refcnt %d weight %d overhead %d\n",
104 IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
105 atomic_read(&least->activeconns),
106 atomic_read(&least->refcnt),
107 atomic_read(&least->weight), loh);
108
109 return least;
110}
111
112
113static struct ip_vs_scheduler ip_vs_nq_scheduler =
114{
115 .name = "nq",
116 .refcnt = ATOMIC_INIT(0),
117 .module = THIS_MODULE,
118 .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list),
119#ifdef CONFIG_IP_VS_IPV6
120 .supports_ipv6 = 1,
121#endif
122 .schedule = ip_vs_nq_schedule,
123};
124
125
126static int __init ip_vs_nq_init(void)
127{
128 return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
129}
130
131static void __exit ip_vs_nq_cleanup(void)
132{
133 unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
134}
135
136module_init(ip_vs_nq_init);
137module_exit(ip_vs_nq_cleanup);
138MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c
deleted file mode 100644
index 0791f9e08feb..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto.c
+++ /dev/null
@@ -1,288 +0,0 @@
1/*
2 * ip_vs_proto.c: transport protocol load balancing support for IPVS
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Julian Anastasov <ja@ssi.bg>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Changes:
13 *
14 */
15
16#include <linux/module.h>
17#include <linux/kernel.h>
18#include <linux/skbuff.h>
19#include <linux/in.h>
20#include <linux/ip.h>
21#include <net/protocol.h>
22#include <net/tcp.h>
23#include <net/udp.h>
24#include <asm/system.h>
25#include <linux/stat.h>
26#include <linux/proc_fs.h>
27
28#include <net/ip_vs.h>
29
30
31/*
32 * IPVS protocols can only be registered/unregistered when the ipvs
33 * module is loaded/unloaded, so no lock is needed in accessing the
34 * ipvs protocol table.
35 */
36
37#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */
38#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
39
40static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
41
42
43/*
44 * register an ipvs protocol
45 */
46static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
47{
48 unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
49
50 pp->next = ip_vs_proto_table[hash];
51 ip_vs_proto_table[hash] = pp;
52
53 if (pp->init != NULL)
54 pp->init(pp);
55
56 return 0;
57}
58
59
60/*
61 * unregister an ipvs protocol
62 */
63static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
64{
65 struct ip_vs_protocol **pp_p;
66 unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
67
68 pp_p = &ip_vs_proto_table[hash];
69 for (; *pp_p; pp_p = &(*pp_p)->next) {
70 if (*pp_p == pp) {
71 *pp_p = pp->next;
72 if (pp->exit != NULL)
73 pp->exit(pp);
74 return 0;
75 }
76 }
77
78 return -ESRCH;
79}
80
81
82/*
83 * get ip_vs_protocol object by its proto.
84 */
85struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
86{
87 struct ip_vs_protocol *pp;
88 unsigned hash = IP_VS_PROTO_HASH(proto);
89
90 for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
91 if (pp->protocol == proto)
92 return pp;
93 }
94
95 return NULL;
96}
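
The protocol table is a small chained hash keyed on the 8-bit protocol number. A user-space sketch of the same register/lookup pattern (types and names are illustrative, not the kernel's):

#include <stdio.h>
#include <stddef.h>

#define TAB_SIZE 32                     /* must be a power of 2 */
#define PROTO_HASH(p) ((p) & (TAB_SIZE - 1))

struct proto {
    unsigned short protocol;
    const char *name;
    struct proto *next;
};

static struct proto *table[TAB_SIZE];

/* push onto the bucket's chain, as register_ip_vs_protocol() does */
static void proto_register(struct proto *pp)
{
    unsigned h = PROTO_HASH(pp->protocol);

    pp->next = table[h];
    table[h] = pp;
}

/* walk the chain, as ip_vs_proto_get() does */
static struct proto *proto_get(unsigned short protocol)
{
    struct proto *pp;

    for (pp = table[PROTO_HASH(protocol)]; pp; pp = pp->next)
        if (pp->protocol == protocol)
            return pp;
    return NULL;
}

int main(void)
{
    static struct proto tcp = { 6, "TCP", NULL }, udp = { 17, "UDP", NULL };

    proto_register(&tcp);
    proto_register(&udp);
    printf("%s\n", proto_get(17) ? proto_get(17)->name : "none");
    return 0;
}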
97
98
99/*
100 * Propagate event for state change to all protocols
101 */
102void ip_vs_protocol_timeout_change(int flags)
103{
104 struct ip_vs_protocol *pp;
105 int i;
106
107 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
108 for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
109 if (pp->timeout_change)
110 pp->timeout_change(pp, flags);
111 }
112 }
113}
114
115
116int *
117ip_vs_create_timeout_table(int *table, int size)
118{
119 return kmemdup(table, size, GFP_ATOMIC);
120}
121
122
123/*
124 * Set timeout value for state specified by name
125 */
126int
127ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to)
128{
129 int i;
130
131 if (!table || !name || !to)
132 return -EINVAL;
133
134 for (i = 0; i < num; i++) {
135 if (strcmp(names[i], name))
136 continue;
137 table[i] = to * HZ;
138 return 0;
139 }
140 return -ENOENT;
141}
142
143
144const char * ip_vs_state_name(__u16 proto, int state)
145{
146 struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
147
148 if (pp == NULL || pp->state_name == NULL)
149 return (IPPROTO_IP == proto) ? "NONE" : "ERR!";
150 return pp->state_name(state);
151}
152
153
154static void
155ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
156 const struct sk_buff *skb,
157 int offset,
158 const char *msg)
159{
160 char buf[128];
161 struct iphdr _iph, *ih;
162
163 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
164 if (ih == NULL)
165 sprintf(buf, "%s TRUNCATED", pp->name);
166 else if (ih->frag_off & htons(IP_OFFSET))
167 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
168 pp->name, NIPQUAD(ih->saddr),
169 NIPQUAD(ih->daddr));
170 else {
171		__be16 _ports[2], *pptr;
172
173 pptr = skb_header_pointer(skb, offset + ih->ihl*4,
174 sizeof(_ports), _ports);
175 if (pptr == NULL)
176 sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u",
177 pp->name,
178 NIPQUAD(ih->saddr),
179 NIPQUAD(ih->daddr));
180 else
181 sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
182 pp->name,
183 NIPQUAD(ih->saddr),
184 ntohs(pptr[0]),
185 NIPQUAD(ih->daddr),
186 ntohs(pptr[1]));
187 }
188
189 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
190}
191
192#ifdef CONFIG_IP_VS_IPV6
193static void
194ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
195 const struct sk_buff *skb,
196 int offset,
197 const char *msg)
198{
199 char buf[192];
200 struct ipv6hdr _iph, *ih;
201
202 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
203 if (ih == NULL)
204 sprintf(buf, "%s TRUNCATED", pp->name);
205 else if (ih->nexthdr == IPPROTO_FRAGMENT)
206 sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag",
207 pp->name, NIP6(ih->saddr),
208 NIP6(ih->daddr));
209 else {
210 __be16 _ports[2], *pptr;
211
212 pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
213 sizeof(_ports), _ports);
214 if (pptr == NULL)
215 sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT,
216 pp->name,
217 NIP6(ih->saddr),
218 NIP6(ih->daddr));
219 else
220 sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u",
221 pp->name,
222 NIP6(ih->saddr),
223 ntohs(pptr[0]),
224 NIP6(ih->daddr),
225 ntohs(pptr[1]));
226 }
227
228 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
229}
230#endif
231
232
233void
234ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
235 const struct sk_buff *skb,
236 int offset,
237 const char *msg)
238{
239#ifdef CONFIG_IP_VS_IPV6
240 if (skb->protocol == htons(ETH_P_IPV6))
241 ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
242 else
243#endif
244 ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
245}
246
247
248int __init ip_vs_protocol_init(void)
249{
250 char protocols[64];
251#define REGISTER_PROTOCOL(p) \
252 do { \
253 register_ip_vs_protocol(p); \
254 strcat(protocols, ", "); \
255 strcat(protocols, (p)->name); \
256 } while (0)
257
258 protocols[0] = '\0';
259 protocols[2] = '\0';
260#ifdef CONFIG_IP_VS_PROTO_TCP
261 REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
262#endif
263#ifdef CONFIG_IP_VS_PROTO_UDP
264 REGISTER_PROTOCOL(&ip_vs_protocol_udp);
265#endif
266#ifdef CONFIG_IP_VS_PROTO_AH
267 REGISTER_PROTOCOL(&ip_vs_protocol_ah);
268#endif
269#ifdef CONFIG_IP_VS_PROTO_ESP
270 REGISTER_PROTOCOL(&ip_vs_protocol_esp);
271#endif
272 IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]);
273
274 return 0;
275}
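
The protocols[0] = '\0'; protocols[2] = '\0' initialization above supports a small formatting trick: every compiled-in protocol is appended as ", NAME" and the message is printed from &protocols[2], which skips the leading separator (or yields an empty string if nothing was registered at all). A user-space rendering of the same trick with a made-up protocol list:

#include <stdio.h>
#include <string.h>

int main(void)
{
    char protocols[64];
    const char *compiled_in[] = { "TCP", "UDP" };
    size_t i;

    protocols[0] = '\0';
    protocols[2] = '\0';               /* safety net for an empty list */
    for (i = 0; i < sizeof(compiled_in) / sizeof(compiled_in[0]); i++) {
        strcat(protocols, ", ");
        strcat(protocols, compiled_in[i]);
    }
    printf("Registered protocols (%s)\n", &protocols[2]);
    return 0;
}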
276
277
278void ip_vs_protocol_cleanup(void)
279{
280 struct ip_vs_protocol *pp;
281 int i;
282
283 /* unregister all the ipvs protocols */
284 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
285 while ((pp = ip_vs_proto_table[i]) != NULL)
286 unregister_ip_vs_protocol(pp);
287 }
288}
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
deleted file mode 100644
index 80ab0c8e5b4a..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
+++ /dev/null
@@ -1,235 +0,0 @@
1/*
2 * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS
3 *
4 * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
5 * Wensong Zhang <wensong@linuxvirtualserver.org>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * version 2 as published by the Free Software Foundation;
10 *
11 */
12
13#include <linux/in.h>
14#include <linux/ip.h>
15#include <linux/module.h>
16#include <linux/kernel.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19
20#include <net/ip_vs.h>
21
22
23/* TODO:
24
25struct isakmp_hdr {
26 __u8 icookie[8];
27 __u8 rcookie[8];
28 __u8 np;
29 __u8 version;
30 __u8 xchgtype;
31 __u8 flags;
32 __u32 msgid;
33 __u32 length;
34};
35
36*/
37
38#define PORT_ISAKMP 500
39
40
41static struct ip_vs_conn *
42ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
43 const struct ip_vs_iphdr *iph, unsigned int proto_off,
44 int inverse)
45{
46 struct ip_vs_conn *cp;
47
48 if (likely(!inverse)) {
49 cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
50 &iph->saddr,
51 htons(PORT_ISAKMP),
52 &iph->daddr,
53 htons(PORT_ISAKMP));
54 } else {
55 cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
56 &iph->daddr,
57 htons(PORT_ISAKMP),
58 &iph->saddr,
59 htons(PORT_ISAKMP));
60 }
61
62 if (!cp) {
63 /*
64 * We are not sure if the packet is from our
65 * service, so our conn_schedule hook should return NF_ACCEPT
66 */
67 IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
68 "%s%s %s->%s\n",
69 inverse ? "ICMP+" : "",
70 pp->name,
71 IP_VS_DBG_ADDR(af, &iph->saddr),
72 IP_VS_DBG_ADDR(af, &iph->daddr));
73 }
74
75 return cp;
76}
77
78
79static struct ip_vs_conn *
80ah_esp_conn_out_get(int af, const struct sk_buff *skb,
81 struct ip_vs_protocol *pp,
82 const struct ip_vs_iphdr *iph,
83 unsigned int proto_off,
84 int inverse)
85{
86 struct ip_vs_conn *cp;
87
88 if (likely(!inverse)) {
89 cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
90 &iph->saddr,
91 htons(PORT_ISAKMP),
92 &iph->daddr,
93 htons(PORT_ISAKMP));
94 } else {
95 cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
96 &iph->daddr,
97 htons(PORT_ISAKMP),
98 &iph->saddr,
99 htons(PORT_ISAKMP));
100 }
101
102 if (!cp) {
103 IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
104 "%s%s %s->%s\n",
105 inverse ? "ICMP+" : "",
106 pp->name,
107 IP_VS_DBG_ADDR(af, &iph->saddr),
108 IP_VS_DBG_ADDR(af, &iph->daddr));
109 }
110
111 return cp;
112}
113
114
115static int
116ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
117 int *verdict, struct ip_vs_conn **cpp)
118{
119 /*
120	 * AH/ESP is only ever related traffic; pass the packet to the IP stack.
121 */
122 *verdict = NF_ACCEPT;
123 return 0;
124}
125
126
127static void
128ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
129 int offset, const char *msg)
130{
131 char buf[256];
132 struct iphdr _iph, *ih;
133
134 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
135 if (ih == NULL)
136 sprintf(buf, "%s TRUNCATED", pp->name);
137 else
138 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
139 pp->name, NIPQUAD(ih->saddr),
140 NIPQUAD(ih->daddr));
141
142 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
143}
144
145#ifdef CONFIG_IP_VS_IPV6
146static void
147ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
148 int offset, const char *msg)
149{
150 char buf[256];
151 struct ipv6hdr _iph, *ih;
152
153 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
154 if (ih == NULL)
155 sprintf(buf, "%s TRUNCATED", pp->name);
156 else
157 sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT,
158 pp->name, NIP6(ih->saddr),
159 NIP6(ih->daddr));
160
161 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
162}
163#endif
164
165static void
166ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
167 int offset, const char *msg)
168{
169#ifdef CONFIG_IP_VS_IPV6
170 if (skb->protocol == htons(ETH_P_IPV6))
171 ah_esp_debug_packet_v6(pp, skb, offset, msg);
172 else
173#endif
174 ah_esp_debug_packet_v4(pp, skb, offset, msg);
175}
176
177
178static void ah_esp_init(struct ip_vs_protocol *pp)
179{
180 /* nothing to do now */
181}
182
183
184static void ah_esp_exit(struct ip_vs_protocol *pp)
185{
186 /* nothing to do now */
187}
188
189
190#ifdef CONFIG_IP_VS_PROTO_AH
191struct ip_vs_protocol ip_vs_protocol_ah = {
192 .name = "AH",
193 .protocol = IPPROTO_AH,
194 .num_states = 1,
195 .dont_defrag = 1,
196 .init = ah_esp_init,
197 .exit = ah_esp_exit,
198 .conn_schedule = ah_esp_conn_schedule,
199 .conn_in_get = ah_esp_conn_in_get,
200 .conn_out_get = ah_esp_conn_out_get,
201 .snat_handler = NULL,
202 .dnat_handler = NULL,
203 .csum_check = NULL,
204 .state_transition = NULL,
205 .register_app = NULL,
206 .unregister_app = NULL,
207 .app_conn_bind = NULL,
208 .debug_packet = ah_esp_debug_packet,
209 .timeout_change = NULL, /* ISAKMP */
210 .set_state_timeout = NULL,
211};
212#endif
213
214#ifdef CONFIG_IP_VS_PROTO_ESP
215struct ip_vs_protocol ip_vs_protocol_esp = {
216 .name = "ESP",
217 .protocol = IPPROTO_ESP,
218 .num_states = 1,
219 .dont_defrag = 1,
220 .init = ah_esp_init,
221 .exit = ah_esp_exit,
222 .conn_schedule = ah_esp_conn_schedule,
223 .conn_in_get = ah_esp_conn_in_get,
224 .conn_out_get = ah_esp_conn_out_get,
225 .snat_handler = NULL,
226 .dnat_handler = NULL,
227 .csum_check = NULL,
228 .state_transition = NULL,
229 .register_app = NULL,
230 .unregister_app = NULL,
231 .app_conn_bind = NULL,
232 .debug_packet = ah_esp_debug_packet,
233 .timeout_change = NULL, /* ISAKMP */
234};
235#endif
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
deleted file mode 100644
index dd4566ea2bff..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ /dev/null
@@ -1,732 +0,0 @@
1/*
2 * ip_vs_proto_tcp.c: TCP load balancing support for IPVS
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Julian Anastasov <ja@ssi.bg>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Changes:
13 *
14 */
15
16#include <linux/kernel.h>
17#include <linux/ip.h>
18#include <linux/tcp.h> /* for tcphdr */
19#include <net/ip.h>
20#include <net/tcp.h> /* for csum_tcpudp_magic */
21#include <net/ip6_checksum.h>
22#include <linux/netfilter.h>
23#include <linux/netfilter_ipv4.h>
24
25#include <net/ip_vs.h>
26
27
28static struct ip_vs_conn *
29tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
30 const struct ip_vs_iphdr *iph, unsigned int proto_off,
31 int inverse)
32{
33 __be16 _ports[2], *pptr;
34
35 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
36 if (pptr == NULL)
37 return NULL;
38
39 if (likely(!inverse)) {
40 return ip_vs_conn_in_get(af, iph->protocol,
41 &iph->saddr, pptr[0],
42 &iph->daddr, pptr[1]);
43 } else {
44 return ip_vs_conn_in_get(af, iph->protocol,
45 &iph->daddr, pptr[1],
46 &iph->saddr, pptr[0]);
47 }
48}
49
50static struct ip_vs_conn *
51tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
52 const struct ip_vs_iphdr *iph, unsigned int proto_off,
53 int inverse)
54{
55 __be16 _ports[2], *pptr;
56
57 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
58 if (pptr == NULL)
59 return NULL;
60
61 if (likely(!inverse)) {
62 return ip_vs_conn_out_get(af, iph->protocol,
63 &iph->saddr, pptr[0],
64 &iph->daddr, pptr[1]);
65 } else {
66 return ip_vs_conn_out_get(af, iph->protocol,
67 &iph->daddr, pptr[1],
68 &iph->saddr, pptr[0]);
69 }
70}
71
72
73static int
74tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
75 int *verdict, struct ip_vs_conn **cpp)
76{
77 struct ip_vs_service *svc;
78 struct tcphdr _tcph, *th;
79 struct ip_vs_iphdr iph;
80
81 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
82
83 th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
84 if (th == NULL) {
85 *verdict = NF_DROP;
86 return 0;
87 }
88
89 if (th->syn &&
90 (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
91 th->dest))) {
92 if (ip_vs_todrop()) {
93 /*
94 * It seems that we are very loaded.
95 * We have to drop this packet :(
96 */
97 ip_vs_service_put(svc);
98 *verdict = NF_DROP;
99 return 0;
100 }
101
102 /*
103 * Let the virtual server select a real server for the
104 * incoming connection, and create a connection entry.
105 */
106 *cpp = ip_vs_schedule(svc, skb);
107 if (!*cpp) {
108 *verdict = ip_vs_leave(svc, skb, pp);
109 return 0;
110 }
111 ip_vs_service_put(svc);
112 }
113 return 1;
114}
115
116
117static inline void
118tcp_fast_csum_update(int af, struct tcphdr *tcph,
119 const union nf_inet_addr *oldip,
120 const union nf_inet_addr *newip,
121 __be16 oldport, __be16 newport)
122{
123#ifdef CONFIG_IP_VS_IPV6
124 if (af == AF_INET6)
125 tcph->check =
126 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
127 ip_vs_check_diff2(oldport, newport,
128 ~csum_unfold(tcph->check))));
129 else
130#endif
131 tcph->check =
132 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
133 ip_vs_check_diff2(oldport, newport,
134 ~csum_unfold(tcph->check))));
135}
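
tcp_fast_csum_update() is the classic incremental Internet-checksum adjustment: rather than re-summing the whole segment, it folds the old 16-bit values out of the checksum and the new ones in (RFC 1624, eqn. 3). A self-contained sketch of that arithmetic for a single 16-bit field, verified against a toy three-word header:

#include <stdio.h>
#include <stdint.h>

/* replace one 16-bit word old -> new in a checksummed header */
static uint16_t csum_replace16(uint16_t check, uint16_t old, uint16_t new)
{
    uint32_t sum = (uint16_t)~check + (uint16_t)~old + new;

    sum = (sum & 0xffff) + (sum >> 16);
    sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t)~sum;
}

int main(void)
{
    /* toy header of two data words plus their ones'-complement checksum */
    uint16_t w[3] = { 0x1234, 0xabcd, 0 };
    uint32_t sum = w[0] + w[1];
    uint16_t updated;

    sum = (sum & 0xffff) + (sum >> 16);
    w[2] = (uint16_t)~sum;                    /* original checksum */

    updated = csum_replace16(w[2], w[1], 0x0042);
    w[1] = 0x0042;

    /* a correct header sums (with its checksum) to 0xffff */
    sum = w[0] + w[1] + updated;
    sum = (sum & 0xffff) + (sum >> 16);
    sum = (sum & 0xffff) + (sum >> 16);
    printf("verify sum = 0x%04x\n", (unsigned)sum);   /* expect 0xffff */
    return 0;
}

This is why the fast path only applies when nothing but the address and port change: the rest of the payload contribution to the checksum is untouched, so only the changed words need folding.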
136
137
138static inline void
139tcp_partial_csum_update(int af, struct tcphdr *tcph,
140 const union nf_inet_addr *oldip,
141 const union nf_inet_addr *newip,
142 __be16 oldlen, __be16 newlen)
143{
144#ifdef CONFIG_IP_VS_IPV6
145 if (af == AF_INET6)
146 tcph->check =
147 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
148 ip_vs_check_diff2(oldlen, newlen,
149 ~csum_unfold(tcph->check))));
150 else
151#endif
152 tcph->check =
153 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
154 ip_vs_check_diff2(oldlen, newlen,
155 ~csum_unfold(tcph->check))));
156}
157
158
159static int
160tcp_snat_handler(struct sk_buff *skb,
161 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
162{
163 struct tcphdr *tcph;
164 unsigned int tcphoff;
165 int oldlen;
166
167#ifdef CONFIG_IP_VS_IPV6
168 if (cp->af == AF_INET6)
169 tcphoff = sizeof(struct ipv6hdr);
170 else
171#endif
172 tcphoff = ip_hdrlen(skb);
173 oldlen = skb->len - tcphoff;
174
175 /* csum_check requires unshared skb */
176 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
177 return 0;
178
179 if (unlikely(cp->app != NULL)) {
180 /* Some checks before mangling */
181 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
182 return 0;
183
184 /* Call application helper if needed */
185 if (!ip_vs_app_pkt_out(cp, skb))
186 return 0;
187 }
188
189 tcph = (void *)skb_network_header(skb) + tcphoff;
190 tcph->source = cp->vport;
191
192 /* Adjust TCP checksums */
193 if (skb->ip_summed == CHECKSUM_PARTIAL) {
194 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
195 htonl(oldlen),
196 htonl(skb->len - tcphoff));
197 } else if (!cp->app) {
198 /* Only port and addr are changed, do fast csum update */
199 tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
200 cp->dport, cp->vport);
201 if (skb->ip_summed == CHECKSUM_COMPLETE)
202 skb->ip_summed = CHECKSUM_NONE;
203 } else {
204 /* full checksum calculation */
205 tcph->check = 0;
206 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
207#ifdef CONFIG_IP_VS_IPV6
208 if (cp->af == AF_INET6)
209 tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
210 &cp->caddr.in6,
211 skb->len - tcphoff,
212 cp->protocol, skb->csum);
213 else
214#endif
215 tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
216 cp->caddr.ip,
217 skb->len - tcphoff,
218 cp->protocol,
219 skb->csum);
220
221 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
222 pp->name, tcph->check,
223 (char*)&(tcph->check) - (char*)tcph);
224 }
225 return 1;
226}
227
228
229static int
230tcp_dnat_handler(struct sk_buff *skb,
231 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
232{
233 struct tcphdr *tcph;
234 unsigned int tcphoff;
235 int oldlen;
236
237#ifdef CONFIG_IP_VS_IPV6
238 if (cp->af == AF_INET6)
239 tcphoff = sizeof(struct ipv6hdr);
240 else
241#endif
242 tcphoff = ip_hdrlen(skb);
243 oldlen = skb->len - tcphoff;
244
245 /* csum_check requires unshared skb */
246 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
247 return 0;
248
249 if (unlikely(cp->app != NULL)) {
250 /* Some checks before mangling */
251 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
252 return 0;
253
254 /*
255 * Attempt ip_vs_app call.
256 * It will fix ip_vs_conn and iph ack_seq stuff
257 */
258 if (!ip_vs_app_pkt_in(cp, skb))
259 return 0;
260 }
261
262 tcph = (void *)skb_network_header(skb) + tcphoff;
263 tcph->dest = cp->dport;
264
265 /*
266 * Adjust TCP checksums
267 */
268 if (skb->ip_summed == CHECKSUM_PARTIAL) {
269 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
270 htonl(oldlen),
271 htonl(skb->len - tcphoff));
272 } else if (!cp->app) {
273 /* Only port and addr are changed, do fast csum update */
274 tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
275 cp->vport, cp->dport);
276 if (skb->ip_summed == CHECKSUM_COMPLETE)
277 skb->ip_summed = CHECKSUM_NONE;
278 } else {
279 /* full checksum calculation */
280 tcph->check = 0;
281 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
282#ifdef CONFIG_IP_VS_IPV6
283 if (cp->af == AF_INET6)
284 tcph->check = csum_ipv6_magic(&cp->caddr.in6,
285 &cp->daddr.in6,
286 skb->len - tcphoff,
287 cp->protocol, skb->csum);
288 else
289#endif
290 tcph->check = csum_tcpudp_magic(cp->caddr.ip,
291 cp->daddr.ip,
292 skb->len - tcphoff,
293 cp->protocol,
294 skb->csum);
295 skb->ip_summed = CHECKSUM_UNNECESSARY;
296 }
297 return 1;
298}
299
300
301static int
302tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
303{
304 unsigned int tcphoff;
305
306#ifdef CONFIG_IP_VS_IPV6
307 if (af == AF_INET6)
308 tcphoff = sizeof(struct ipv6hdr);
309 else
310#endif
311 tcphoff = ip_hdrlen(skb);
312
313 switch (skb->ip_summed) {
314 case CHECKSUM_NONE:
315 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
316 case CHECKSUM_COMPLETE:
317#ifdef CONFIG_IP_VS_IPV6
318 if (af == AF_INET6) {
319 if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
320 &ipv6_hdr(skb)->daddr,
321 skb->len - tcphoff,
322 ipv6_hdr(skb)->nexthdr,
323 skb->csum)) {
324 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
325 "Failed checksum for");
326 return 0;
327 }
328 } else
329#endif
330 if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
331 ip_hdr(skb)->daddr,
332 skb->len - tcphoff,
333 ip_hdr(skb)->protocol,
334 skb->csum)) {
335 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
336 "Failed checksum for");
337 return 0;
338 }
339 break;
340 default:
341 /* No need to checksum. */
342 break;
343 }
344
345 return 1;
346}
347
348
349#define TCP_DIR_INPUT 0
350#define TCP_DIR_OUTPUT 4
351#define TCP_DIR_INPUT_ONLY 8
352
353static const int tcp_state_off[IP_VS_DIR_LAST] = {
354 [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
355 [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
356 [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
357};
358
359/*
360 * Timeout table[state]
361 */
362static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
363 [IP_VS_TCP_S_NONE] = 2*HZ,
364 [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
365 [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
366 [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
367 [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
368 [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
369 [IP_VS_TCP_S_CLOSE] = 10*HZ,
370 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
371 [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
372 [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
373 [IP_VS_TCP_S_SYNACK] = 120*HZ,
374 [IP_VS_TCP_S_LAST] = 2*HZ,
375};
376
377static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
378 [IP_VS_TCP_S_NONE] = "NONE",
379 [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
380 [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
381 [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
382 [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
383 [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
384 [IP_VS_TCP_S_CLOSE] = "CLOSE",
385 [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
386 [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
387 [IP_VS_TCP_S_LISTEN] = "LISTEN",
388 [IP_VS_TCP_S_SYNACK] = "SYNACK",
389 [IP_VS_TCP_S_LAST] = "BUG!",
390};
391
392#define sNO IP_VS_TCP_S_NONE
393#define sES IP_VS_TCP_S_ESTABLISHED
394#define sSS IP_VS_TCP_S_SYN_SENT
395#define sSR IP_VS_TCP_S_SYN_RECV
396#define sFW IP_VS_TCP_S_FIN_WAIT
397#define sTW IP_VS_TCP_S_TIME_WAIT
398#define sCL IP_VS_TCP_S_CLOSE
399#define sCW IP_VS_TCP_S_CLOSE_WAIT
400#define sLA IP_VS_TCP_S_LAST_ACK
401#define sLI IP_VS_TCP_S_LISTEN
402#define sSA IP_VS_TCP_S_SYNACK
403
404struct tcp_states_t {
405 int next_state[IP_VS_TCP_S_LAST];
406};
407
408static const char * tcp_state_name(int state)
409{
410 if (state >= IP_VS_TCP_S_LAST)
411 return "ERR!";
412 return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
413}
414
415static struct tcp_states_t tcp_states [] = {
416/* INPUT */
417/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
418/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
419/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
420/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
421/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
422
423/* OUTPUT */
424/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
425/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
426/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
427/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
428/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
429
430/* INPUT-ONLY */
431/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
432/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
433/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
434/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
435/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
436};
437
438static struct tcp_states_t tcp_states_dos [] = {
439/* INPUT */
440/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
441/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
442/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
443/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
444/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
445
446/* OUTPUT */
447/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
448/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
449/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
450/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
451/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
452
453/* INPUT-ONLY */
454/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
455/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
456/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
457/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
458/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
459};
460
461static struct tcp_states_t *tcp_state_table = tcp_states;
462
463
464static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
465{
466 int on = (flags & 1); /* secure_tcp */
467
468 /*
469 ** FIXME: change secure_tcp to independent sysctl var
470 ** or make it per-service or per-app because it is valid
471 ** for most if not for all of the applications. Something
472 ** like "capabilities" (flags) for each object.
473 */
474 tcp_state_table = (on? tcp_states_dos : tcp_states);
475}
476
477static int
478tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
479{
480 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
481 tcp_state_name_table, sname, to);
482}
483
484static inline int tcp_state_idx(struct tcphdr *th)
485{
486 if (th->rst)
487 return 3;
488 if (th->syn)
489 return 0;
490 if (th->fin)
491 return 1;
492 if (th->ack)
493 return 2;
494 return -1;
495}
496
497static inline void
498set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
499 int direction, struct tcphdr *th)
500{
501 int state_idx;
502 int new_state = IP_VS_TCP_S_CLOSE;
503 int state_off = tcp_state_off[direction];
504
505 /*
506 * Update state offset to INPUT_ONLY if necessary
507 * or delete NO_OUTPUT flag if output packet detected
508 */
509 if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
510 if (state_off == TCP_DIR_OUTPUT)
511 cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
512 else
513 state_off = TCP_DIR_INPUT_ONLY;
514 }
515
516 if ((state_idx = tcp_state_idx(th)) < 0) {
517 IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
518 goto tcp_state_out;
519 }
520
521 new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
522
523 tcp_state_out:
524 if (new_state != cp->state) {
525 struct ip_vs_dest *dest = cp->dest;
526
527 IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
528 "%s:%d state: %s->%s conn->refcnt:%d\n",
529 pp->name,
530 ((state_off == TCP_DIR_OUTPUT) ?
531 "output " : "input "),
532 th->syn ? 'S' : '.',
533 th->fin ? 'F' : '.',
534 th->ack ? 'A' : '.',
535 th->rst ? 'R' : '.',
536 IP_VS_DBG_ADDR(cp->af, &cp->daddr),
537 ntohs(cp->dport),
538 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
539 ntohs(cp->cport),
540 tcp_state_name(cp->state),
541 tcp_state_name(new_state),
542 atomic_read(&cp->refcnt));
543
544 if (dest) {
545 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
546 (new_state != IP_VS_TCP_S_ESTABLISHED)) {
547 atomic_dec(&dest->activeconns);
548 atomic_inc(&dest->inactconns);
549 cp->flags |= IP_VS_CONN_F_INACTIVE;
550 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
551 (new_state == IP_VS_TCP_S_ESTABLISHED)) {
552 atomic_inc(&dest->activeconns);
553 atomic_dec(&dest->inactconns);
554 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
555 }
556 }
557 }
558
559 cp->timeout = pp->timeout_table[cp->state = new_state];
560}
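
The lookup above is new_state = tcp_state_table[state_off + event].next_state[cp->state]: the direction block supplies an offset of 0, 4 or 8, the decoded TCP flag (syn/fin/ack/rst) supplies the row within the block, and the current state selects the column. A toy illustration of that indexing, with a deliberately made-up two-direction, three-state table rather than the real IPVS one:

#include <stdio.h>

#define EV_SYN 0
#define EV_FIN 1
#define EV_ACK 2
#define EV_RST 3
#define EV_MAX 4

#define DIR_INPUT  0
#define DIR_OUTPUT EV_MAX          /* each direction owns EV_MAX rows */

enum { S_NONE, S_ESTABLISHED, S_CLOSE, S_LAST };

struct states { int next_state[S_LAST]; };

static const struct states toy_table[2 * EV_MAX] = {
    /*                           sNO             sES            sCL  */
    [DIR_INPUT  + EV_SYN] = {{ S_ESTABLISHED, S_ESTABLISHED, S_ESTABLISHED }},
    [DIR_INPUT  + EV_FIN] = {{ S_CLOSE,       S_CLOSE,       S_CLOSE }},
    [DIR_INPUT  + EV_ACK] = {{ S_NONE,        S_ESTABLISHED, S_CLOSE }},
    [DIR_INPUT  + EV_RST] = {{ S_CLOSE,       S_CLOSE,       S_CLOSE }},
    /* OUTPUT rows kept trivial ("no change") for brevity */
    [DIR_OUTPUT + EV_SYN] = {{ S_NONE, S_ESTABLISHED, S_CLOSE }},
    [DIR_OUTPUT + EV_FIN] = {{ S_NONE, S_ESTABLISHED, S_CLOSE }},
    [DIR_OUTPUT + EV_ACK] = {{ S_NONE, S_ESTABLISHED, S_CLOSE }},
    [DIR_OUTPUT + EV_RST] = {{ S_NONE, S_ESTABLISHED, S_CLOSE }},
};

int main(void)
{
    int cur = S_NONE;
    int next = toy_table[DIR_INPUT + EV_SYN].next_state[cur];

    printf("NONE + incoming SYN -> %d (ESTABLISHED=%d)\n",
           next, S_ESTABLISHED);
    return 0;
}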
561
562
563/*
564 * Handle state transitions
565 */
566static int
567tcp_state_transition(struct ip_vs_conn *cp, int direction,
568 const struct sk_buff *skb,
569 struct ip_vs_protocol *pp)
570{
571 struct tcphdr _tcph, *th;
572
573#ifdef CONFIG_IP_VS_IPV6
574 int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
575#else
576 int ihl = ip_hdrlen(skb);
577#endif
578
579 th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
580 if (th == NULL)
581 return 0;
582
583 spin_lock(&cp->lock);
584 set_tcp_state(pp, cp, direction, th);
585 spin_unlock(&cp->lock);
586
587 return 1;
588}
589
590
591/*
592 * Hash table for TCP application incarnations
593 */
594#define TCP_APP_TAB_BITS 4
595#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
596#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
597
598static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
599static DEFINE_SPINLOCK(tcp_app_lock);
600
601static inline __u16 tcp_app_hashkey(__be16 port)
602{
603 return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
604 & TCP_APP_TAB_MASK;
605}
606
607
608static int tcp_register_app(struct ip_vs_app *inc)
609{
610 struct ip_vs_app *i;
611 __u16 hash;
612 __be16 port = inc->port;
613 int ret = 0;
614
615 hash = tcp_app_hashkey(port);
616
617 spin_lock_bh(&tcp_app_lock);
618 list_for_each_entry(i, &tcp_apps[hash], p_list) {
619 if (i->port == port) {
620 ret = -EEXIST;
621 goto out;
622 }
623 }
624 list_add(&inc->p_list, &tcp_apps[hash]);
625 atomic_inc(&ip_vs_protocol_tcp.appcnt);
626
627 out:
628 spin_unlock_bh(&tcp_app_lock);
629 return ret;
630}
631
632
633static void
634tcp_unregister_app(struct ip_vs_app *inc)
635{
636 spin_lock_bh(&tcp_app_lock);
637 atomic_dec(&ip_vs_protocol_tcp.appcnt);
638 list_del(&inc->p_list);
639 spin_unlock_bh(&tcp_app_lock);
640}
641
642
643static int
644tcp_app_conn_bind(struct ip_vs_conn *cp)
645{
646 int hash;
647 struct ip_vs_app *inc;
648 int result = 0;
649
650 /* Default binding: bind app only for NAT */
651 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
652 return 0;
653
654 /* Lookup application incarnations and bind the right one */
655 hash = tcp_app_hashkey(cp->vport);
656
657 spin_lock(&tcp_app_lock);
658 list_for_each_entry(inc, &tcp_apps[hash], p_list) {
659 if (inc->port == cp->vport) {
660 if (unlikely(!ip_vs_app_inc_get(inc)))
661 break;
662 spin_unlock(&tcp_app_lock);
663
664 IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
665 "%s:%u to app %s on port %u\n",
666 __func__,
667 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
668 ntohs(cp->cport),
669 IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
670 ntohs(cp->vport),
671 inc->name, ntohs(inc->port));
672
673 cp->app = inc;
674 if (inc->init_conn)
675 result = inc->init_conn(inc, cp);
676 goto out;
677 }
678 }
679 spin_unlock(&tcp_app_lock);
680
681 out:
682 return result;
683}
684
685
686/*
687 * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
688 */
689void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
690{
691 spin_lock(&cp->lock);
692 cp->state = IP_VS_TCP_S_LISTEN;
693 cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
694 spin_unlock(&cp->lock);
695}
696
697
698static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
699{
700 IP_VS_INIT_HASH_TABLE(tcp_apps);
701 pp->timeout_table = tcp_timeouts;
702}
703
704
705static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
706{
707}
708
709
710struct ip_vs_protocol ip_vs_protocol_tcp = {
711 .name = "TCP",
712 .protocol = IPPROTO_TCP,
713 .num_states = IP_VS_TCP_S_LAST,
714 .dont_defrag = 0,
715 .appcnt = ATOMIC_INIT(0),
716 .init = ip_vs_tcp_init,
717 .exit = ip_vs_tcp_exit,
718 .register_app = tcp_register_app,
719 .unregister_app = tcp_unregister_app,
720 .conn_schedule = tcp_conn_schedule,
721 .conn_in_get = tcp_conn_in_get,
722 .conn_out_get = tcp_conn_out_get,
723 .snat_handler = tcp_snat_handler,
724 .dnat_handler = tcp_dnat_handler,
725 .csum_check = tcp_csum_check,
726 .state_name = tcp_state_name,
727 .state_transition = tcp_state_transition,
728 .app_conn_bind = tcp_app_conn_bind,
729 .debug_packet = ip_vs_tcpudp_debug_packet,
730 .timeout_change = tcp_timeout_change,
731 .set_state_timeout = tcp_set_state_timeout,
732};
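
Aside (not part of the commit): the state machine driven by set_tcp_state() above is a two-level table lookup. The packet direction selects a row offset, tcp_state_idx() turns the TCP header flags into an event index (SYN=0, FIN=1, ACK=2, RST=3), and the connection's current state selects the column. The sketch below uses a deliberately tiny, made-up table just to show the indexing shape; it is not the real IPVS tcp_state_table and the state names are toy placeholders.

#include <stdio.h>

enum { S_NONE, S_ESTABLISHED, S_CLOSE, S_LAST };   /* toy states only */
enum { EV_SYN, EV_FIN, EV_ACK, EV_RST, EV_LAST };  /* same order as tcp_state_idx() */

/* toy_table[event][current_state] -> next_state, one direction only */
static const int toy_table[EV_LAST][S_LAST] = {
	[EV_SYN] = { S_ESTABLISHED, S_ESTABLISHED, S_ESTABLISHED },
	[EV_FIN] = { S_CLOSE,       S_CLOSE,       S_CLOSE       },
	[EV_ACK] = { S_NONE,        S_ESTABLISHED, S_CLOSE       },
	[EV_RST] = { S_CLOSE,       S_CLOSE,       S_CLOSE       },
};

static int tcp_event(int syn, int fin, int ack, int rst)
{
	if (rst) return EV_RST;    /* same priority order as the kernel helper */
	if (syn) return EV_SYN;
	if (fin) return EV_FIN;
	if (ack) return EV_ACK;
	return -1;
}

int main(void)
{
	int state = S_NONE;

	state = toy_table[tcp_event(1, 0, 0, 0)][state];  /* SYN      */
	state = toy_table[tcp_event(0, 0, 1, 0)][state];  /* ACK      */
	state = toy_table[tcp_event(0, 1, 1, 0)][state];  /* FIN+ACK  */
	printf("final toy state = %d (S_CLOSE)\n", state);
	return 0;
}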
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
deleted file mode 100644
index 6eb6039d6343..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ /dev/null
@@ -1,533 +0,0 @@
1/*
2 * ip_vs_proto_udp.c: UDP load balancing support for IPVS
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Julian Anastasov <ja@ssi.bg>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Changes:
13 *
14 */
15
16#include <linux/in.h>
17#include <linux/ip.h>
18#include <linux/kernel.h>
19#include <linux/netfilter.h>
20#include <linux/netfilter_ipv4.h>
21#include <linux/udp.h>
22
23#include <net/ip_vs.h>
24#include <net/ip.h>
25#include <net/ip6_checksum.h>
26
27static struct ip_vs_conn *
28udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
29 const struct ip_vs_iphdr *iph, unsigned int proto_off,
30 int inverse)
31{
32 struct ip_vs_conn *cp;
33 __be16 _ports[2], *pptr;
34
35 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
36 if (pptr == NULL)
37 return NULL;
38
39 if (likely(!inverse)) {
40 cp = ip_vs_conn_in_get(af, iph->protocol,
41 &iph->saddr, pptr[0],
42 &iph->daddr, pptr[1]);
43 } else {
44 cp = ip_vs_conn_in_get(af, iph->protocol,
45 &iph->daddr, pptr[1],
46 &iph->saddr, pptr[0]);
47 }
48
49 return cp;
50}
51
52
53static struct ip_vs_conn *
54udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
55 const struct ip_vs_iphdr *iph, unsigned int proto_off,
56 int inverse)
57{
58 struct ip_vs_conn *cp;
59 __be16 _ports[2], *pptr;
60
61 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
62 if (pptr == NULL)
63 return NULL;
64
65 if (likely(!inverse)) {
66 cp = ip_vs_conn_out_get(af, iph->protocol,
67 &iph->saddr, pptr[0],
68 &iph->daddr, pptr[1]);
69 } else {
70 cp = ip_vs_conn_out_get(af, iph->protocol,
71 &iph->daddr, pptr[1],
72 &iph->saddr, pptr[0]);
73 }
74
75 return cp;
76}
77
78
79static int
80udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
81 int *verdict, struct ip_vs_conn **cpp)
82{
83 struct ip_vs_service *svc;
84 struct udphdr _udph, *uh;
85 struct ip_vs_iphdr iph;
86
87 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
88
89 uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
90 if (uh == NULL) {
91 *verdict = NF_DROP;
92 return 0;
93 }
94
95 svc = ip_vs_service_get(af, skb->mark, iph.protocol,
96 &iph.daddr, uh->dest);
97 if (svc) {
98 if (ip_vs_todrop()) {
99 /*
100 * It seems that we are very loaded.
101 * We have to drop this packet :(
102 */
103 ip_vs_service_put(svc);
104 *verdict = NF_DROP;
105 return 0;
106 }
107
108 /*
109 * Let the virtual server select a real server for the
110 * incoming connection, and create a connection entry.
111 */
112 *cpp = ip_vs_schedule(svc, skb);
113 if (!*cpp) {
114 *verdict = ip_vs_leave(svc, skb, pp);
115 return 0;
116 }
117 ip_vs_service_put(svc);
118 }
119 return 1;
120}
121
122
123static inline void
124udp_fast_csum_update(int af, struct udphdr *uhdr,
125 const union nf_inet_addr *oldip,
126 const union nf_inet_addr *newip,
127 __be16 oldport, __be16 newport)
128{
129#ifdef CONFIG_IP_VS_IPV6
130 if (af == AF_INET6)
131 uhdr->check =
132 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
133 ip_vs_check_diff2(oldport, newport,
134 ~csum_unfold(uhdr->check))));
135 else
136#endif
137 uhdr->check =
138 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
139 ip_vs_check_diff2(oldport, newport,
140 ~csum_unfold(uhdr->check))));
141 if (!uhdr->check)
142 uhdr->check = CSUM_MANGLED_0;
143}
144
145static inline void
146udp_partial_csum_update(int af, struct udphdr *uhdr,
147 const union nf_inet_addr *oldip,
148 const union nf_inet_addr *newip,
149 __be16 oldlen, __be16 newlen)
150{
151#ifdef CONFIG_IP_VS_IPV6
152 if (af == AF_INET6)
153 uhdr->check =
154 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
155 ip_vs_check_diff2(oldlen, newlen,
156 ~csum_unfold(uhdr->check))));
157 else
158#endif
159 uhdr->check =
160 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
161 ip_vs_check_diff2(oldlen, newlen,
162 ~csum_unfold(uhdr->check))));
163}
164
165
166static int
167udp_snat_handler(struct sk_buff *skb,
168 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
169{
170 struct udphdr *udph;
171 unsigned int udphoff;
172 int oldlen;
173
174#ifdef CONFIG_IP_VS_IPV6
175 if (cp->af == AF_INET6)
176 udphoff = sizeof(struct ipv6hdr);
177 else
178#endif
179 udphoff = ip_hdrlen(skb);
180 oldlen = skb->len - udphoff;
181
182 /* csum_check requires unshared skb */
183 if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
184 return 0;
185
186 if (unlikely(cp->app != NULL)) {
187 /* Some checks before mangling */
188 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
189 return 0;
190
191 /*
192 * Call application helper if needed
193 */
194 if (!ip_vs_app_pkt_out(cp, skb))
195 return 0;
196 }
197
198 udph = (void *)skb_network_header(skb) + udphoff;
199 udph->source = cp->vport;
200
201 /*
202 * Adjust UDP checksums
203 */
204 if (skb->ip_summed == CHECKSUM_PARTIAL) {
205 udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
206 htonl(oldlen),
207 htonl(skb->len - udphoff));
208 } else if (!cp->app && (udph->check != 0)) {
209 /* Only port and addr are changed, do fast csum update */
210 udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
211 cp->dport, cp->vport);
212 if (skb->ip_summed == CHECKSUM_COMPLETE)
213 skb->ip_summed = CHECKSUM_NONE;
214 } else {
215 /* full checksum calculation */
216 udph->check = 0;
217 skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
218#ifdef CONFIG_IP_VS_IPV6
219 if (cp->af == AF_INET6)
220 udph->check = csum_ipv6_magic(&cp->vaddr.in6,
221 &cp->caddr.in6,
222 skb->len - udphoff,
223 cp->protocol, skb->csum);
224 else
225#endif
226 udph->check = csum_tcpudp_magic(cp->vaddr.ip,
227 cp->caddr.ip,
228 skb->len - udphoff,
229 cp->protocol,
230 skb->csum);
231 if (udph->check == 0)
232 udph->check = CSUM_MANGLED_0;
233 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
234 pp->name, udph->check,
235 (char*)&(udph->check) - (char*)udph);
236 }
237 return 1;
238}
239
240
241static int
242udp_dnat_handler(struct sk_buff *skb,
243 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
244{
245 struct udphdr *udph;
246 unsigned int udphoff;
247 int oldlen;
248
249#ifdef CONFIG_IP_VS_IPV6
250 if (cp->af == AF_INET6)
251 udphoff = sizeof(struct ipv6hdr);
252 else
253#endif
254 udphoff = ip_hdrlen(skb);
255 oldlen = skb->len - udphoff;
256
257 /* csum_check requires unshared skb */
258 if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
259 return 0;
260
261 if (unlikely(cp->app != NULL)) {
262 /* Some checks before mangling */
263 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
264 return 0;
265
266 /*
267 * Attempt ip_vs_app call.
268 * It will fix ip_vs_conn
269 */
270 if (!ip_vs_app_pkt_in(cp, skb))
271 return 0;
272 }
273
274 udph = (void *)skb_network_header(skb) + udphoff;
275 udph->dest = cp->dport;
276
277 /*
278 * Adjust UDP checksums
279 */
280 if (skb->ip_summed == CHECKSUM_PARTIAL) {
281 udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
282 htonl(oldlen),
283 htonl(skb->len - udphoff));
284 } else if (!cp->app && (udph->check != 0)) {
285 /* Only port and addr are changed, do fast csum update */
286 udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
287 cp->vport, cp->dport);
288 if (skb->ip_summed == CHECKSUM_COMPLETE)
289 skb->ip_summed = CHECKSUM_NONE;
290 } else {
291 /* full checksum calculation */
292 udph->check = 0;
293 skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
294#ifdef CONFIG_IP_VS_IPV6
295 if (cp->af == AF_INET6)
296 udph->check = csum_ipv6_magic(&cp->caddr.in6,
297 &cp->daddr.in6,
298 skb->len - udphoff,
299 cp->protocol, skb->csum);
300 else
301#endif
302 udph->check = csum_tcpudp_magic(cp->caddr.ip,
303 cp->daddr.ip,
304 skb->len - udphoff,
305 cp->protocol,
306 skb->csum);
307 if (udph->check == 0)
308 udph->check = CSUM_MANGLED_0;
309 skb->ip_summed = CHECKSUM_UNNECESSARY;
310 }
311 return 1;
312}
313
314
315static int
316udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
317{
318 struct udphdr _udph, *uh;
319 unsigned int udphoff;
320
321#ifdef CONFIG_IP_VS_IPV6
322 if (af == AF_INET6)
323 udphoff = sizeof(struct ipv6hdr);
324 else
325#endif
326 udphoff = ip_hdrlen(skb);
327
328 uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
329 if (uh == NULL)
330 return 0;
331
332 if (uh->check != 0) {
333 switch (skb->ip_summed) {
334 case CHECKSUM_NONE:
335 skb->csum = skb_checksum(skb, udphoff,
336 skb->len - udphoff, 0);
337 case CHECKSUM_COMPLETE:
338#ifdef CONFIG_IP_VS_IPV6
339 if (af == AF_INET6) {
340 if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
341 &ipv6_hdr(skb)->daddr,
342 skb->len - udphoff,
343 ipv6_hdr(skb)->nexthdr,
344 skb->csum)) {
345 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
346 "Failed checksum for");
347 return 0;
348 }
349 } else
350#endif
351 if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
352 ip_hdr(skb)->daddr,
353 skb->len - udphoff,
354 ip_hdr(skb)->protocol,
355 skb->csum)) {
356 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
357 "Failed checksum for");
358 return 0;
359 }
360 break;
361 default:
362 /* No need to checksum. */
363 break;
364 }
365 }
366 return 1;
367}
368
369
370/*
371 * Note: the caller guarantees that only one of register_app,
372 * unregister_app or app_conn_bind is called each time.
373 */
374
375#define UDP_APP_TAB_BITS 4
376#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
377#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
378
379static struct list_head udp_apps[UDP_APP_TAB_SIZE];
380static DEFINE_SPINLOCK(udp_app_lock);
381
382static inline __u16 udp_app_hashkey(__be16 port)
383{
384 return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port)
385 & UDP_APP_TAB_MASK;
386}
387
388
389static int udp_register_app(struct ip_vs_app *inc)
390{
391 struct ip_vs_app *i;
392 __u16 hash;
393 __be16 port = inc->port;
394 int ret = 0;
395
396 hash = udp_app_hashkey(port);
397
398
399 spin_lock_bh(&udp_app_lock);
400 list_for_each_entry(i, &udp_apps[hash], p_list) {
401 if (i->port == port) {
402 ret = -EEXIST;
403 goto out;
404 }
405 }
406 list_add(&inc->p_list, &udp_apps[hash]);
407 atomic_inc(&ip_vs_protocol_udp.appcnt);
408
409 out:
410 spin_unlock_bh(&udp_app_lock);
411 return ret;
412}
413
414
415static void
416udp_unregister_app(struct ip_vs_app *inc)
417{
418 spin_lock_bh(&udp_app_lock);
419 atomic_dec(&ip_vs_protocol_udp.appcnt);
420 list_del(&inc->p_list);
421 spin_unlock_bh(&udp_app_lock);
422}
423
424
425static int udp_app_conn_bind(struct ip_vs_conn *cp)
426{
427 int hash;
428 struct ip_vs_app *inc;
429 int result = 0;
430
431 /* Default binding: bind app only for NAT */
432 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
433 return 0;
434
435 /* Lookup application incarnations and bind the right one */
436 hash = udp_app_hashkey(cp->vport);
437
438 spin_lock(&udp_app_lock);
439 list_for_each_entry(inc, &udp_apps[hash], p_list) {
440 if (inc->port == cp->vport) {
441 if (unlikely(!ip_vs_app_inc_get(inc)))
442 break;
443 spin_unlock(&udp_app_lock);
444
445 IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
446 "%s:%u to app %s on port %u\n",
447 __func__,
448 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
449 ntohs(cp->cport),
450 IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
451 ntohs(cp->vport),
452 inc->name, ntohs(inc->port));
453
454 cp->app = inc;
455 if (inc->init_conn)
456 result = inc->init_conn(inc, cp);
457 goto out;
458 }
459 }
460 spin_unlock(&udp_app_lock);
461
462 out:
463 return result;
464}
465
466
467static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
468 [IP_VS_UDP_S_NORMAL] = 5*60*HZ,
469 [IP_VS_UDP_S_LAST] = 2*HZ,
470};
471
472static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
473 [IP_VS_UDP_S_NORMAL] = "UDP",
474 [IP_VS_UDP_S_LAST] = "BUG!",
475};
476
477
478static int
479udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
480{
481 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
482 udp_state_name_table, sname, to);
483}
484
485static const char * udp_state_name(int state)
486{
487 if (state >= IP_VS_UDP_S_LAST)
488 return "ERR!";
489 return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
490}
491
492static int
493udp_state_transition(struct ip_vs_conn *cp, int direction,
494 const struct sk_buff *skb,
495 struct ip_vs_protocol *pp)
496{
497 cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
498 return 1;
499}
500
501static void udp_init(struct ip_vs_protocol *pp)
502{
503 IP_VS_INIT_HASH_TABLE(udp_apps);
504 pp->timeout_table = udp_timeouts;
505}
506
507static void udp_exit(struct ip_vs_protocol *pp)
508{
509}
510
511
512struct ip_vs_protocol ip_vs_protocol_udp = {
513 .name = "UDP",
514 .protocol = IPPROTO_UDP,
515 .num_states = IP_VS_UDP_S_LAST,
516 .dont_defrag = 0,
517 .init = udp_init,
518 .exit = udp_exit,
519 .conn_schedule = udp_conn_schedule,
520 .conn_in_get = udp_conn_in_get,
521 .conn_out_get = udp_conn_out_get,
522 .snat_handler = udp_snat_handler,
523 .dnat_handler = udp_dnat_handler,
524 .csum_check = udp_csum_check,
525 .state_transition = udp_state_transition,
526 .state_name = udp_state_name,
527 .register_app = udp_register_app,
528 .unregister_app = udp_unregister_app,
529 .app_conn_bind = udp_app_conn_bind,
530 .debug_packet = ip_vs_tcpudp_debug_packet,
531 .timeout_change = NULL,
532 .set_state_timeout = udp_set_state_timeout,
533};
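
Both the TCP and UDP application tables above use the same 16-bucket hash: the upper bits of the service port are XORed into the low four bits. The following is a stand-alone, user-space sketch of that key computation (the main() and variable names are illustrative; ports are taken in network byte order to mirror the __be16 handling in the kernel helpers):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define APP_TAB_BITS 4
#define APP_TAB_MASK ((1 << APP_TAB_BITS) - 1)

/* Same folding as tcp_app_hashkey()/udp_app_hashkey(): XOR the upper bits
 * of the (network byte order) port into the low APP_TAB_BITS bits. */
static uint16_t app_hashkey(uint16_t port_be)
{
	return ((port_be >> APP_TAB_BITS) ^ port_be) & APP_TAB_MASK;
}

int main(void)
{
	uint16_t ftp = htons(21);    /* port of the in-tree ip_vs_ftp helper */
	uint16_t sip = htons(5060);  /* just another sample port */

	printf("bucket(21)   = %u\n", app_hashkey(ftp));
	printf("bucket(5060) = %u\n", app_hashkey(sip));
	return 0;
}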
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c
deleted file mode 100644
index a22195f68ac4..000000000000
--- a/net/ipv4/ipvs/ip_vs_rr.c
+++ /dev/null
@@ -1,112 +0,0 @@
1/*
2 * IPVS: Round-Robin Scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Peter Kese <peter.kese@ijs.si>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Fixes/Changes:
13 * Wensong Zhang : changed the ip_vs_rr_schedule to return dest
14 * Julian Anastasov : fixed the NULL pointer access bug in debugging
15 * Wensong Zhang : changed some cosmetic things for debugging
16 * Wensong Zhang : changed for the d-linked destination list
17 * Wensong Zhang : added the ip_vs_rr_update_svc
18 * Wensong Zhang : added any dest with weight=0 is quiesced
19 *
20 */
21
22#include <linux/module.h>
23#include <linux/kernel.h>
24
25#include <net/ip_vs.h>
26
27
28static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
29{
30 svc->sched_data = &svc->destinations;
31 return 0;
32}
33
34
35static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
36{
37 svc->sched_data = &svc->destinations;
38 return 0;
39}
40
41
42/*
43 * Round-Robin Scheduling
44 */
45static struct ip_vs_dest *
46ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
47{
48 struct list_head *p, *q;
49 struct ip_vs_dest *dest;
50
51 IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
52
53 write_lock(&svc->sched_lock);
54 p = (struct list_head *)svc->sched_data;
55 p = p->next;
56 q = p;
57 do {
58 /* skip list head */
59 if (q == &svc->destinations) {
60 q = q->next;
61 continue;
62 }
63
64 dest = list_entry(q, struct ip_vs_dest, n_list);
65 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
66 atomic_read(&dest->weight) > 0)
67 /* HIT */
68 goto out;
69 q = q->next;
70 } while (q != p);
71 write_unlock(&svc->sched_lock);
72 return NULL;
73
74 out:
75 svc->sched_data = q;
76 write_unlock(&svc->sched_lock);
77 IP_VS_DBG_BUF(6, "RR: server %s:%u "
78 "activeconns %d refcnt %d weight %d\n",
79 IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
80 atomic_read(&dest->activeconns),
81 atomic_read(&dest->refcnt), atomic_read(&dest->weight));
82
83 return dest;
84}
85
86
87static struct ip_vs_scheduler ip_vs_rr_scheduler = {
88 .name = "rr", /* name */
89 .refcnt = ATOMIC_INIT(0),
90 .module = THIS_MODULE,
91 .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
92#ifdef CONFIG_IP_VS_IPV6
93 .supports_ipv6 = 1,
94#endif
95 .init_service = ip_vs_rr_init_svc,
96 .update_service = ip_vs_rr_update_svc,
97 .schedule = ip_vs_rr_schedule,
98};
99
100static int __init ip_vs_rr_init(void)
101{
102 return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
103}
104
105static void __exit ip_vs_rr_cleanup(void)
106{
107 unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
108}
109
110module_init(ip_vs_rr_init);
111module_exit(ip_vs_rr_cleanup);
112MODULE_LICENSE("GPL");
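
The round-robin scheduler above keeps a persistent cursor (svc->sched_data) into the circular destination list and advances it once per scheduling decision, skipping the list head and any overloaded or zero-weight server. Below is a minimal user-space sketch of the same cursor idea over a plain array; the struct and names are illustrative, not the kernel's list API, and at least one usable server is assumed so the demo never dereferences NULL.

#include <stdio.h>

struct dest { const char *name; int weight; int overloaded; };

/* Advance a persistent cursor round-robin, skipping unusable servers;
 * at most one full pass, mirroring the q != p loop above. */
static struct dest *rr_pick(struct dest *tbl, int n, int *cursor)
{
	int i;

	for (i = 0; i < n; i++) {
		int idx = (*cursor + 1 + i) % n;

		if (tbl[idx].weight > 0 && !tbl[idx].overloaded) {
			*cursor = idx;
			return &tbl[idx];
		}
	}
	return NULL;   /* every server quiesced or overloaded */
}

int main(void)
{
	struct dest tbl[] = {
		{ "rs1", 1, 0 }, { "rs2", 0, 0 }, { "rs3", 1, 0 },
	};
	int cursor = -1, i;

	for (i = 0; i < 4; i++)
		printf("%s\n", rr_pick(tbl, 3, &cursor)->name);
	return 0;
}

With rs2 quiesced (weight 0), the picks alternate rs1, rs3, rs1, rs3, just as the kernel loop skips zero-weight destinations.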
diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c
deleted file mode 100644
index a46ad9e35016..000000000000
--- a/net/ipv4/ipvs/ip_vs_sched.c
+++ /dev/null
@@ -1,251 +0,0 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version.
15 *
16 * Changes:
17 *
18 */
19
20#include <linux/module.h>
21#include <linux/spinlock.h>
22#include <linux/interrupt.h>
23#include <asm/string.h>
24#include <linux/kmod.h>
25#include <linux/sysctl.h>
26
27#include <net/ip_vs.h>
28
29/*
30 * IPVS scheduler list
31 */
32static LIST_HEAD(ip_vs_schedulers);
33
34/* lock for service table */
35static DEFINE_RWLOCK(__ip_vs_sched_lock);
36
37
38/*
39 * Bind a service with a scheduler
40 */
41int ip_vs_bind_scheduler(struct ip_vs_service *svc,
42 struct ip_vs_scheduler *scheduler)
43{
44 int ret;
45
46 if (svc == NULL) {
47 IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
48 return -EINVAL;
49 }
50 if (scheduler == NULL) {
51 IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
52 return -EINVAL;
53 }
54
55 svc->scheduler = scheduler;
56
57 if (scheduler->init_service) {
58 ret = scheduler->init_service(svc);
59 if (ret) {
60 IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
61 return ret;
62 }
63 }
64
65 return 0;
66}
67
68
69/*
70 * Unbind a service with its scheduler
71 */
72int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
73{
74 struct ip_vs_scheduler *sched;
75
76 if (svc == NULL) {
77 IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
78 return -EINVAL;
79 }
80
81 sched = svc->scheduler;
82 if (sched == NULL) {
83 IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
84 return -EINVAL;
85 }
86
87 if (sched->done_service) {
88 if (sched->done_service(svc) != 0) {
89 IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
90 return -EINVAL;
91 }
92 }
93
94 svc->scheduler = NULL;
95 return 0;
96}
97
98
99/*
100 * Get scheduler in the scheduler list by name
101 */
102static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
103{
104 struct ip_vs_scheduler *sched;
105
106 IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
107 sched_name);
108
109 read_lock_bh(&__ip_vs_sched_lock);
110
111 list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
112 /*
113 * Test and get the modules atomically
114 */
115 if (sched->module && !try_module_get(sched->module)) {
116 /*
117 * This scheduler has just been deleted
118 */
119 continue;
120 }
121 if (strcmp(sched_name, sched->name)==0) {
122 /* HIT */
123 read_unlock_bh(&__ip_vs_sched_lock);
124 return sched;
125 }
126 if (sched->module)
127 module_put(sched->module);
128 }
129
130 read_unlock_bh(&__ip_vs_sched_lock);
131 return NULL;
132}
133
134
135/*
136 * Lookup scheduler and try to load it if it doesn't exist
137 */
138struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
139{
140 struct ip_vs_scheduler *sched;
141
142 /*
143 * Search for the scheduler by sched_name
144 */
145 sched = ip_vs_sched_getbyname(sched_name);
146
147 /*
148 * If scheduler not found, load the module and search again
149 */
150 if (sched == NULL) {
151 request_module("ip_vs_%s", sched_name);
152 sched = ip_vs_sched_getbyname(sched_name);
153 }
154
155 return sched;
156}
157
158void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
159{
160 if (scheduler->module)
161 module_put(scheduler->module);
162}
163
164
165/*
166 * Register a scheduler in the scheduler list
167 */
168int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
169{
170 struct ip_vs_scheduler *sched;
171
172 if (!scheduler) {
173 IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
174 return -EINVAL;
175 }
176
177 if (!scheduler->name) {
178 IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
179 return -EINVAL;
180 }
181
182 /* increase the module use count */
183 ip_vs_use_count_inc();
184
185 write_lock_bh(&__ip_vs_sched_lock);
186
187 if (!list_empty(&scheduler->n_list)) {
188 write_unlock_bh(&__ip_vs_sched_lock);
189 ip_vs_use_count_dec();
190 IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
191 "already linked\n", scheduler->name);
192 return -EINVAL;
193 }
194
195 /*
196 * Make sure that the scheduler with this name doesn't exist
197 * in the scheduler list.
198 */
199 list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
200 if (strcmp(scheduler->name, sched->name) == 0) {
201 write_unlock_bh(&__ip_vs_sched_lock);
202 ip_vs_use_count_dec();
203 IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
204 "already existed in the system\n",
205 scheduler->name);
206 return -EINVAL;
207 }
208 }
209 /*
210 * Add it into the d-linked scheduler list
211 */
212 list_add(&scheduler->n_list, &ip_vs_schedulers);
213 write_unlock_bh(&__ip_vs_sched_lock);
214
215 IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name);
216
217 return 0;
218}
219
220
221/*
222 * Unregister a scheduler from the scheduler list
223 */
224int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
225{
226 if (!scheduler) {
227 IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n");
228 return -EINVAL;
229 }
230
231 write_lock_bh(&__ip_vs_sched_lock);
232 if (list_empty(&scheduler->n_list)) {
233 write_unlock_bh(&__ip_vs_sched_lock);
234 IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler "
235 "is not in the list. failed\n", scheduler->name);
236 return -EINVAL;
237 }
238
239 /*
240 * Remove it from the d-linked scheduler list
241 */
242 list_del(&scheduler->n_list);
243 write_unlock_bh(&__ip_vs_sched_lock);
244
245 /* decrease the module use count */
246 ip_vs_use_count_dec();
247
248 IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name);
249
250 return 0;
251}
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c
deleted file mode 100644
index 7d2f22f04b83..000000000000
--- a/net/ipv4/ipvs/ip_vs_sed.c
+++ /dev/null
@@ -1,140 +0,0 @@
1/*
2 * IPVS: Shortest Expected Delay scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 *
13 */
14
15/*
16 * The SED algorithm attempts to minimize each job's expected delay until
17 * completion. The expected delay that the job will experience is
18 * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
19 * jobs on the ith server and Ui is the fixed service rate (weight) of
20 * the ith server. The SED algorithm adopts a greedy policy in which each
21 * job does what is in its own best interest, i.e. it joins the queue that
22 * would minimize its expected delay of completion.
23 *
24 * See the following paper for more information:
25 * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
26 * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
27 * pages 986-994, 1988.
28 *
29 * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
30 *
31 * The difference between SED and WLC is that SED includes the incoming
32 * job in the cost function (the increment of 1). SED may outperform
33 * WLC when scheduling big jobs on large heterogeneous systems
34 * (where the server weights vary a lot).
35 *
36 */
37
38#include <linux/module.h>
39#include <linux/kernel.h>
40
41#include <net/ip_vs.h>
42
43
44static inline unsigned int
45ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
46{
47 /*
48 * We only use the active connection number in the cost
49 * calculation here.
50 */
51 return atomic_read(&dest->activeconns) + 1;
52}
53
54
55/*
56 * Shortest Expected Delay scheduling
57 */
58static struct ip_vs_dest *
59ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
60{
61 struct ip_vs_dest *dest, *least;
62 unsigned int loh, doh;
63
64 IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n");
65
66 /*
67 * We calculate the load of each dest server as follows:
68 * (server expected overhead) / dest->weight
69 *
70 * Remember -- no floats in kernel mode!!!
71 * The comparison of h1*w2 > h2*w1 is equivalent to that of
72 * h1/w1 > h2/w2
73 * if every weight is larger than zero.
74 *
75 * The server with weight=0 is quiesced and will not receive any
76 * new connections.
77 */
78
79 list_for_each_entry(dest, &svc->destinations, n_list) {
80 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
81 atomic_read(&dest->weight) > 0) {
82 least = dest;
83 loh = ip_vs_sed_dest_overhead(least);
84 goto nextstage;
85 }
86 }
87 return NULL;
88
89 /*
90 * Find the destination with the least load.
91 */
92 nextstage:
93 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
94 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
95 continue;
96 doh = ip_vs_sed_dest_overhead(dest);
97 if (loh * atomic_read(&dest->weight) >
98 doh * atomic_read(&least->weight)) {
99 least = dest;
100 loh = doh;
101 }
102 }
103
104 IP_VS_DBG_BUF(6, "SED: server %s:%u "
105 "activeconns %d refcnt %d weight %d overhead %d\n",
106 IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
107 atomic_read(&least->activeconns),
108 atomic_read(&least->refcnt),
109 atomic_read(&least->weight), loh);
110
111 return least;
112}
113
114
115static struct ip_vs_scheduler ip_vs_sed_scheduler =
116{
117 .name = "sed",
118 .refcnt = ATOMIC_INIT(0),
119 .module = THIS_MODULE,
120 .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
121#ifdef CONFIG_IP_VS_IPV6
122 .supports_ipv6 = 1,
123#endif
124 .schedule = ip_vs_sed_schedule,
125};
126
127
128static int __init ip_vs_sed_init(void)
129{
130 return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
131}
132
133static void __exit ip_vs_sed_cleanup(void)
134{
135 unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
136}
137
138module_init(ip_vs_sed_init);
139module_exit(ip_vs_sed_cleanup);
140MODULE_LICENSE("GPL");
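
To make the integer-only comparison in ip_vs_sed_schedule() concrete: with overheads loh and doh, the test loh * weight_dest > doh * weight_least is the cross-multiplied form of loh / weight_least > doh / weight_dest, so the per-server cost (activeconns + 1) / weight can be compared without floating point. A small sketch of that selection over a static array follows; the types and names are illustrative, not the kernel structures.

#include <stdio.h>

struct dest { const char *name; int activeconns; int weight; };

/* Pick the server with the smallest (activeconns + 1) / weight,
 * using cross-multiplication instead of division, as ip_vs_sed does. */
static struct dest *sed_pick(struct dest *tbl, int n)
{
	struct dest *least = NULL;
	int i, loh = 0, doh;

	for (i = 0; i < n; i++) {
		if (tbl[i].weight <= 0)
			continue;          /* weight 0 means quiesced */
		doh = tbl[i].activeconns + 1;
		if (!least || loh * tbl[i].weight > doh * least->weight) {
			least = &tbl[i];
			loh = doh;
		}
	}
	return least;
}

int main(void)
{
	struct dest tbl[] = {
		{ "rs1", 10, 2 },   /* (10+1)/2 = 5.5 */
		{ "rs2",  4, 1 },   /* (4+1)/1  = 5.0, expected winner */
		{ "rs3",  0, 0 },   /* quiesced */
	};

	printf("picked %s\n", sed_pick(tbl, 3)->name);
	return 0;
}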
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
deleted file mode 100644
index 1d96de27fefd..000000000000
--- a/net/ipv4/ipvs/ip_vs_sh.c
+++ /dev/null
@@ -1,258 +0,0 @@
1/*
2 * IPVS: Source Hashing scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@gnuchina.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 *
13 */
14
15/*
16 * The sh algorithm selects a server by hashing the source IP
17 * address. The pseudo code is as follows:
18 *
19 * n <- servernode[src_ip];
20 * if (n is dead) OR
21 * (n is overloaded) or (n.weight <= 0) then
22 * return NULL;
23 *
24 * return n;
25 *
26 * Note that servernode is a 256-bucket hash table that maps the hash
27 * index derived from the packet's source IP address to the current server
28 * array. If the sh scheduler is used in a cache cluster, it is good to
29 * combine it with cache_bypass feature. When the statically assigned
30 * server is dead or overloaded, the load balancer can bypass the cache
31 * server and send requests to the original server directly.
32 *
33 */
34
35#include <linux/ip.h>
36#include <linux/module.h>
37#include <linux/kernel.h>
38#include <linux/skbuff.h>
39
40#include <net/ip_vs.h>
41
42
43/*
44 * IPVS SH bucket
45 */
46struct ip_vs_sh_bucket {
47 struct ip_vs_dest *dest; /* real server (cache) */
48};
49
50/*
51 * for IPVS SH entry hash table
52 */
53#ifndef CONFIG_IP_VS_SH_TAB_BITS
54#define CONFIG_IP_VS_SH_TAB_BITS 8
55#endif
56#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS
57#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
58#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
59
60
61/*
62 * Returns hash value for IPVS SH entry
63 */
64static inline unsigned ip_vs_sh_hashkey(__be32 addr)
65{
66 return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK;
67}
68
69
70/*
71 * Get ip_vs_dest associated with supplied parameters.
72 */
73static inline struct ip_vs_dest *
74ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr)
75{
76 return (tbl[ip_vs_sh_hashkey(addr)]).dest;
77}
78
79
80/*
81 * Assign all the hash buckets of the specified table with the service.
82 */
83static int
84ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
85{
86 int i;
87 struct ip_vs_sh_bucket *b;
88 struct list_head *p;
89 struct ip_vs_dest *dest;
90
91 b = tbl;
92 p = &svc->destinations;
93 for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
94 if (list_empty(p)) {
95 b->dest = NULL;
96 } else {
97 if (p == &svc->destinations)
98 p = p->next;
99
100 dest = list_entry(p, struct ip_vs_dest, n_list);
101 atomic_inc(&dest->refcnt);
102 b->dest = dest;
103
104 p = p->next;
105 }
106 b++;
107 }
108 return 0;
109}
110
111
112/*
113 * Flush all the hash buckets of the specified table.
114 */
115static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
116{
117 int i;
118 struct ip_vs_sh_bucket *b;
119
120 b = tbl;
121 for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
122 if (b->dest) {
123 atomic_dec(&b->dest->refcnt);
124 b->dest = NULL;
125 }
126 b++;
127 }
128}
129
130
131static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
132{
133 struct ip_vs_sh_bucket *tbl;
134
135 /* allocate the SH table for this service */
136 tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
137 GFP_ATOMIC);
138 if (tbl == NULL) {
139 IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
140 return -ENOMEM;
141 }
142 svc->sched_data = tbl;
143 IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
144 "current service\n",
145 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
146
147 /* assign the hash buckets with the updated service */
148 ip_vs_sh_assign(tbl, svc);
149
150 return 0;
151}
152
153
154static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
155{
156 struct ip_vs_sh_bucket *tbl = svc->sched_data;
157
158 /* got to clean up hash buckets here */
159 ip_vs_sh_flush(tbl);
160
161 /* release the table itself */
162 kfree(svc->sched_data);
163 IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
164 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
165
166 return 0;
167}
168
169
170static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
171{
172 struct ip_vs_sh_bucket *tbl = svc->sched_data;
173
174 /* got to clean up hash buckets here */
175 ip_vs_sh_flush(tbl);
176
177 /* assign the hash buckets with the updated service */
178 ip_vs_sh_assign(tbl, svc);
179
180 return 0;
181}
182
183
184/*
185 * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
186 * consider that the server is overloaded here.
187 */
188static inline int is_overloaded(struct ip_vs_dest *dest)
189{
190 return dest->flags & IP_VS_DEST_F_OVERLOAD;
191}
192
193
194/*
195 * Source Hashing scheduling
196 */
197static struct ip_vs_dest *
198ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
199{
200 struct ip_vs_dest *dest;
201 struct ip_vs_sh_bucket *tbl;
202 struct iphdr *iph = ip_hdr(skb);
203
204 IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
205
206 tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
207 dest = ip_vs_sh_get(tbl, iph->saddr);
208 if (!dest
209 || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
210 || atomic_read(&dest->weight) <= 0
211 || is_overloaded(dest)) {
212 return NULL;
213 }
214
215 IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
216 "--> server %u.%u.%u.%u:%d\n",
217 NIPQUAD(iph->saddr),
218 NIPQUAD(dest->addr.ip),
219 ntohs(dest->port));
220
221 return dest;
222}
223
224
225/*
226 * IPVS SH Scheduler structure
227 */
228static struct ip_vs_scheduler ip_vs_sh_scheduler =
229{
230 .name = "sh",
231 .refcnt = ATOMIC_INIT(0),
232 .module = THIS_MODULE,
233 .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
234#ifdef CONFIG_IP_VS_IPV6
235 .supports_ipv6 = 0,
236#endif
237 .init_service = ip_vs_sh_init_svc,
238 .done_service = ip_vs_sh_done_svc,
239 .update_service = ip_vs_sh_update_svc,
240 .schedule = ip_vs_sh_schedule,
241};
242
243
244static int __init ip_vs_sh_init(void)
245{
246 return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
247}
248
249
250static void __exit ip_vs_sh_cleanup(void)
251{
252 unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
253}
254
255
256module_init(ip_vs_sh_init);
257module_exit(ip_vs_sh_cleanup);
258MODULE_LICENSE("GPL");
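
The bucket choice in ip_vs_sh_hashkey() above is a multiplicative hash: the source address is converted to host byte order, multiplied by 2654435761 (a golden-ratio constant), and the low eight bits index the 256-entry table. A self-contained sketch, assuming the same 8-bit table size (the RFC 5737 documentation addresses are used only as sample input):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define SH_TAB_BITS 8
#define SH_TAB_SIZE (1 << SH_TAB_BITS)
#define SH_TAB_MASK (SH_TAB_SIZE - 1)

/* Same multiplicative hash as ip_vs_sh_hashkey(): the address arrives
 * in network byte order and is converted with ntohl() first. */
static unsigned sh_hashkey(uint32_t addr_be)
{
	return (ntohl(addr_be) * 2654435761UL) & SH_TAB_MASK;
}

int main(void)
{
	uint32_t a, b;

	inet_pton(AF_INET, "192.0.2.10", &a);
	inet_pton(AF_INET, "192.0.2.11", &b);

	printf("bucket(192.0.2.10) = %u\n", sh_hashkey(a));
	printf("bucket(192.0.2.11) = %u\n", sh_hashkey(b));
	return 0;
}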
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
deleted file mode 100644
index de5e7e118eed..000000000000
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ /dev/null
@@ -1,942 +0,0 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 *
10 * ip_vs_sync: sync connection info from master load balancer to backups
11 * through multicast
12 *
13 * Changes:
14 * Alexandre Cassen : Added master & backup support at a time.
15 * Alexandre Cassen : Added SyncID support for incoming sync
16 * messages filtering.
17 * Justin Ossevoort : Fix endian problem on sync message size.
18 */
19
20#include <linux/module.h>
21#include <linux/slab.h>
22#include <linux/inetdevice.h>
23#include <linux/net.h>
24#include <linux/completion.h>
25#include <linux/delay.h>
26#include <linux/skbuff.h>
27#include <linux/in.h>
28#include <linux/igmp.h> /* for ip_mc_join_group */
29#include <linux/udp.h>
30#include <linux/err.h>
31#include <linux/kthread.h>
32#include <linux/wait.h>
33#include <linux/kernel.h>
34
35#include <net/ip.h>
36#include <net/sock.h>
37
38#include <net/ip_vs.h>
39
40#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
41#define IP_VS_SYNC_PORT 8848 /* multicast port */
42
43
44/*
45 * IPVS sync connection entry
46 */
47struct ip_vs_sync_conn {
48 __u8 reserved;
49
50 /* Protocol, addresses and port numbers */
51 __u8 protocol; /* Which protocol (TCP/UDP) */
52 __be16 cport;
53 __be16 vport;
54 __be16 dport;
55 __be32 caddr; /* client address */
56 __be32 vaddr; /* virtual address */
57 __be32 daddr; /* destination address */
58
59 /* Flags and state transition */
60 __be16 flags; /* status flags */
61 __be16 state; /* state info */
62
63 /* The sequence options start here */
64};
65
66struct ip_vs_sync_conn_options {
67 struct ip_vs_seq in_seq; /* incoming seq. struct */
68 struct ip_vs_seq out_seq; /* outgoing seq. struct */
69};
70
71struct ip_vs_sync_thread_data {
72 struct socket *sock;
73 char *buf;
74};
75
76#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
77#define FULL_CONN_SIZE \
78(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
79
80
81/*
82 The master multicasts messages to the backup load balancers in the
83 following format.
84
85 0 1 2 3
86 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
87 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
88 | Count Conns | SyncID | Size |
89 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 | |
91 | IPVS Sync Connection (1) |
92 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
93 | . |
94 | . |
95 | . |
96 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
97 | |
98 | IPVS Sync Connection (n) |
99 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100*/
101
102#define SYNC_MESG_HEADER_LEN 4
103#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
104
105struct ip_vs_sync_mesg {
106 __u8 nr_conns;
107 __u8 syncid;
108 __u16 size;
109
110 /* ip_vs_sync_conn entries start here */
111};
112
113/* the maximum length of sync (sending/receiving) message */
114static int sync_send_mesg_maxlen;
115static int sync_recv_mesg_maxlen;
116
117struct ip_vs_sync_buff {
118 struct list_head list;
119 unsigned long firstuse;
120
121 /* pointers for the message data */
122 struct ip_vs_sync_mesg *mesg;
123 unsigned char *head;
124 unsigned char *end;
125};
126
127
128/* the sync_buff list head and the lock */
129static LIST_HEAD(ip_vs_sync_queue);
130static DEFINE_SPINLOCK(ip_vs_sync_lock);
131
132/* current sync_buff for accepting new conn entries */
133static struct ip_vs_sync_buff *curr_sb = NULL;
134static DEFINE_SPINLOCK(curr_sb_lock);
135
136/* ipvs sync daemon state */
137volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
138volatile int ip_vs_master_syncid = 0;
139volatile int ip_vs_backup_syncid = 0;
140
141/* multicast interface name */
142char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
143char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
144
145/* sync daemon tasks */
146static struct task_struct *sync_master_thread;
147static struct task_struct *sync_backup_thread;
148
149/* multicast addr */
150static struct sockaddr_in mcast_addr = {
151 .sin_family = AF_INET,
152 .sin_port = __constant_htons(IP_VS_SYNC_PORT),
153 .sin_addr.s_addr = __constant_htonl(IP_VS_SYNC_GROUP),
154};
155
156
157static inline struct ip_vs_sync_buff *sb_dequeue(void)
158{
159 struct ip_vs_sync_buff *sb;
160
161 spin_lock_bh(&ip_vs_sync_lock);
162 if (list_empty(&ip_vs_sync_queue)) {
163 sb = NULL;
164 } else {
165 sb = list_entry(ip_vs_sync_queue.next,
166 struct ip_vs_sync_buff,
167 list);
168 list_del(&sb->list);
169 }
170 spin_unlock_bh(&ip_vs_sync_lock);
171
172 return sb;
173}
174
175static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
176{
177 struct ip_vs_sync_buff *sb;
178
179 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
180 return NULL;
181
182 if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
183 kfree(sb);
184 return NULL;
185 }
186 sb->mesg->nr_conns = 0;
187 sb->mesg->syncid = ip_vs_master_syncid;
188 sb->mesg->size = 4;
189 sb->head = (unsigned char *)sb->mesg + 4;
190 sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
191 sb->firstuse = jiffies;
192 return sb;
193}
194
195static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
196{
197 kfree(sb->mesg);
198 kfree(sb);
199}
200
201static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
202{
203 spin_lock(&ip_vs_sync_lock);
204 if (ip_vs_sync_state & IP_VS_STATE_MASTER)
205 list_add_tail(&sb->list, &ip_vs_sync_queue);
206 else
207 ip_vs_sync_buff_release(sb);
208 spin_unlock(&ip_vs_sync_lock);
209}
210
211/*
212 * Get the current sync buffer if it has been created for more
213 * than the specified time or the specified time is zero.
214 */
215static inline struct ip_vs_sync_buff *
216get_curr_sync_buff(unsigned long time)
217{
218 struct ip_vs_sync_buff *sb;
219
220 spin_lock_bh(&curr_sb_lock);
221 if (curr_sb && (time == 0 ||
222 time_before(jiffies - curr_sb->firstuse, time))) {
223 sb = curr_sb;
224 curr_sb = NULL;
225 } else
226 sb = NULL;
227 spin_unlock_bh(&curr_sb_lock);
228 return sb;
229}
230
231
232/*
233 * Add ip_vs_conn information to the current sync_buff.
234 * Called by ip_vs_in.
235 */
236void ip_vs_sync_conn(struct ip_vs_conn *cp)
237{
238 struct ip_vs_sync_mesg *m;
239 struct ip_vs_sync_conn *s;
240 int len;
241
242 spin_lock(&curr_sb_lock);
243 if (!curr_sb) {
244 if (!(curr_sb=ip_vs_sync_buff_create())) {
245 spin_unlock(&curr_sb_lock);
246 IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
247 return;
248 }
249 }
250
251 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
252 SIMPLE_CONN_SIZE;
253 m = curr_sb->mesg;
254 s = (struct ip_vs_sync_conn *)curr_sb->head;
255
256 /* copy members */
257 s->protocol = cp->protocol;
258 s->cport = cp->cport;
259 s->vport = cp->vport;
260 s->dport = cp->dport;
261 s->caddr = cp->caddr.ip;
262 s->vaddr = cp->vaddr.ip;
263 s->daddr = cp->daddr.ip;
264 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
265 s->state = htons(cp->state);
266 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
267 struct ip_vs_sync_conn_options *opt =
268 (struct ip_vs_sync_conn_options *)&s[1];
269 memcpy(opt, &cp->in_seq, sizeof(*opt));
270 }
271
272 m->nr_conns++;
273 m->size += len;
274 curr_sb->head += len;
275
276 /* check if there is a space for next one */
277 if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
278 sb_queue_tail(curr_sb);
279 curr_sb = NULL;
280 }
281 spin_unlock(&curr_sb_lock);
282
283 /* synchronize its controller if it has one */
284 if (cp->control)
285 ip_vs_sync_conn(cp->control);
286}
287
288
289/*
290 * Process received multicast message and create the corresponding
291 * ip_vs_conn entries.
292 */
293static void ip_vs_process_message(const char *buffer, const size_t buflen)
294{
295 struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
296 struct ip_vs_sync_conn *s;
297 struct ip_vs_sync_conn_options *opt;
298 struct ip_vs_conn *cp;
299 struct ip_vs_protocol *pp;
300 struct ip_vs_dest *dest;
301 char *p;
302 int i;
303
304 if (buflen < sizeof(struct ip_vs_sync_mesg)) {
305 IP_VS_ERR_RL("sync message header too short\n");
306 return;
307 }
308
309 /* Convert size back to host byte order */
310 m->size = ntohs(m->size);
311
312 if (buflen != m->size) {
313 IP_VS_ERR_RL("bogus sync message size\n");
314 return;
315 }
316
317 /* SyncID sanity check */
318 if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
319 IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
320 m->syncid);
321 return;
322 }
323
324 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
325 for (i=0; i<m->nr_conns; i++) {
326 unsigned flags, state;
327
328 if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
329 IP_VS_ERR_RL("bogus conn in sync message\n");
330 return;
331 }
332 s = (struct ip_vs_sync_conn *) p;
333 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
334 flags &= ~IP_VS_CONN_F_HASHED;
335 if (flags & IP_VS_CONN_F_SEQ_MASK) {
336 opt = (struct ip_vs_sync_conn_options *)&s[1];
337 p += FULL_CONN_SIZE;
338 if (p > buffer+buflen) {
339 IP_VS_ERR_RL("bogus conn options in sync message\n");
340 return;
341 }
342 } else {
343 opt = NULL;
344 p += SIMPLE_CONN_SIZE;
345 }
346
347 state = ntohs(s->state);
348 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
349 pp = ip_vs_proto_get(s->protocol);
350 if (!pp) {
351 IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n",
352 s->protocol);
353 continue;
354 }
355 if (state >= pp->num_states) {
356 IP_VS_DBG(2, "Invalid %s state %u in sync msg\n",
357 pp->name, state);
358 continue;
359 }
360 } else {
361 /* protocol in templates is not used for state/timeout */
362 pp = NULL;
363 if (state > 0) {
364 IP_VS_DBG(2, "Invalid template state %u in sync msg\n",
365 state);
366 state = 0;
367 }
368 }
369
370 if (!(flags & IP_VS_CONN_F_TEMPLATE))
371 cp = ip_vs_conn_in_get(AF_INET, s->protocol,
372 (union nf_inet_addr *)&s->caddr,
373 s->cport,
374 (union nf_inet_addr *)&s->vaddr,
375 s->vport);
376 else
377 cp = ip_vs_ct_in_get(AF_INET, s->protocol,
378 (union nf_inet_addr *)&s->caddr,
379 s->cport,
380 (union nf_inet_addr *)&s->vaddr,
381 s->vport);
382 if (!cp) {
383 /*
384 * Find the appropriate destination for the connection.
385 * If it is not found the connection will remain unbound
386 * but still handled.
387 */
388 dest = ip_vs_find_dest(AF_INET,
389 (union nf_inet_addr *)&s->daddr,
390 s->dport,
391 (union nf_inet_addr *)&s->vaddr,
392 s->vport,
393 s->protocol);
394 /* Set the appropriate activity flag */
395 if (s->protocol == IPPROTO_TCP) {
396 if (state != IP_VS_TCP_S_ESTABLISHED)
397 flags |= IP_VS_CONN_F_INACTIVE;
398 else
399 flags &= ~IP_VS_CONN_F_INACTIVE;
400 }
401 cp = ip_vs_conn_new(AF_INET, s->protocol,
402 (union nf_inet_addr *)&s->caddr,
403 s->cport,
404 (union nf_inet_addr *)&s->vaddr,
405 s->vport,
406 (union nf_inet_addr *)&s->daddr,
407 s->dport,
408 flags, dest);
409 if (dest)
410 atomic_dec(&dest->refcnt);
411 if (!cp) {
412 IP_VS_ERR("ip_vs_conn_new failed\n");
413 return;
414 }
415 } else if (!cp->dest) {
416 dest = ip_vs_try_bind_dest(cp);
417 if (dest)
418 atomic_dec(&dest->refcnt);
419 } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
420 (cp->state != state)) {
421 /* update active/inactive flag for the connection */
422 dest = cp->dest;
423 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
424 (state != IP_VS_TCP_S_ESTABLISHED)) {
425 atomic_dec(&dest->activeconns);
426 atomic_inc(&dest->inactconns);
427 cp->flags |= IP_VS_CONN_F_INACTIVE;
428 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
429 (state == IP_VS_TCP_S_ESTABLISHED)) {
430 atomic_inc(&dest->activeconns);
431 atomic_dec(&dest->inactconns);
432 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
433 }
434 }
435
436 if (opt)
437 memcpy(&cp->in_seq, opt, sizeof(*opt));
438 atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
439 cp->state = state;
440 cp->old_state = cp->state;
441 /*
442 * We cannot recover the right timeout for templates
443 * in all cases, because we cannot find the right fwmark
444 * virtual service. If needed, we can do it for
445 * non-fwmark persistent services.
446 */
447 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
448 cp->timeout = pp->timeout_table[state];
449 else
450 cp->timeout = (3*60*HZ);
451 ip_vs_conn_put(cp);
452 }
453}
454
455
456/*
457 * Setup loopback of outgoing multicasts on a sending socket
458 */
459static void set_mcast_loop(struct sock *sk, u_char loop)
460{
461 struct inet_sock *inet = inet_sk(sk);
462
463 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
464 lock_sock(sk);
465 inet->mc_loop = loop ? 1 : 0;
466 release_sock(sk);
467}
468
469/*
470 * Specify TTL for outgoing multicasts on a sending socket
471 */
472static void set_mcast_ttl(struct sock *sk, u_char ttl)
473{
474 struct inet_sock *inet = inet_sk(sk);
475
476 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
477 lock_sock(sk);
478 inet->mc_ttl = ttl;
479 release_sock(sk);
480}
481
482/*
483 * Specify the default interface for outgoing multicasts
484 */
485static int set_mcast_if(struct sock *sk, char *ifname)
486{
487 struct net_device *dev;
488 struct inet_sock *inet = inet_sk(sk);
489
490 if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
491 return -ENODEV;
492
493 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
494 return -EINVAL;
495
496 lock_sock(sk);
497 inet->mc_index = dev->ifindex;
498 /* inet->mc_addr = 0; */
499 release_sock(sk);
500
501 return 0;
502}
503
504
505/*
506 * Set the maximum length of sync message according to the
507 * specified interface's MTU.
508 */
509static int set_sync_mesg_maxlen(int sync_state)
510{
511 struct net_device *dev;
512 int num;
513
514 if (sync_state == IP_VS_STATE_MASTER) {
515 if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
516 return -ENODEV;
517
518 num = (dev->mtu - sizeof(struct iphdr) -
519 sizeof(struct udphdr) -
520 SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
521 sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
522 SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
523 IP_VS_DBG(7, "setting the maximum length of sync sending "
524 "message %d.\n", sync_send_mesg_maxlen);
525 } else if (sync_state == IP_VS_STATE_BACKUP) {
526 if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
527 return -ENODEV;
528
529 sync_recv_mesg_maxlen = dev->mtu -
530 sizeof(struct iphdr) - sizeof(struct udphdr);
531 IP_VS_DBG(7, "setting the maximum length of sync receiving "
532 "message %d.\n", sync_recv_mesg_maxlen);
533 }
534
535 return 0;
536}
537
538
539/*
540 * Join a multicast group.
541 * the group is specified by a class D multicast address 224.0.0.0/8
542 * in the in_addr structure passed in as a parameter.
543 */
544static int
545join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
546{
547 struct ip_mreqn mreq;
548 struct net_device *dev;
549 int ret;
550
551 memset(&mreq, 0, sizeof(mreq));
552 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
553
554 if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
555 return -ENODEV;
556 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
557 return -EINVAL;
558
559 mreq.imr_ifindex = dev->ifindex;
560
561 lock_sock(sk);
562 ret = ip_mc_join_group(sk, &mreq);
563 release_sock(sk);
564
565 return ret;
566}
567
568
569static int bind_mcastif_addr(struct socket *sock, char *ifname)
570{
571 struct net_device *dev;
572 __be32 addr;
573 struct sockaddr_in sin;
574
575 if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
576 return -ENODEV;
577
578 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
579 if (!addr)
580 IP_VS_ERR("You probably need to specify IP address on "
581 "multicast interface.\n");
582
583 IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
584 ifname, NIPQUAD(addr));
585
586 /* Now bind the socket with the address of multicast interface */
587 sin.sin_family = AF_INET;
588 sin.sin_addr.s_addr = addr;
589 sin.sin_port = 0;
590
591 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
592}
593
594/*
595 * Set up sending multicast socket over UDP
596 */
597static struct socket * make_send_sock(void)
598{
599 struct socket *sock;
600 int result;
601
602 /* First create a socket */
603 result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
604 if (result < 0) {
605 IP_VS_ERR("Error during creation of socket; terminating\n");
606 return ERR_PTR(result);
607 }
608
609 result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
610 if (result < 0) {
611 IP_VS_ERR("Error setting outbound mcast interface\n");
612 goto error;
613 }
614
615 set_mcast_loop(sock->sk, 0);
616 set_mcast_ttl(sock->sk, 1);
617
618 result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
619 if (result < 0) {
620 IP_VS_ERR("Error binding address of the mcast interface\n");
621 goto error;
622 }
623
624 result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
625 sizeof(struct sockaddr), 0);
626 if (result < 0) {
627 IP_VS_ERR("Error connecting to the multicast addr\n");
628 goto error;
629 }
630
631 return sock;
632
633 error:
634 sock_release(sock);
635 return ERR_PTR(result);
636}
637
638
639/*
640 * Set up receiving multicast socket over UDP
641 */
642static struct socket * make_receive_sock(void)
643{
644 struct socket *sock;
645 int result;
646
647 /* First create a socket */
648 result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
649 if (result < 0) {
650 IP_VS_ERR("Error during creation of socket; terminating\n");
651 return ERR_PTR(result);
652 }
653
654 /* it is equivalent to the SO_REUSEADDR option in user-space */
655 sock->sk->sk_reuse = 1;
656
657 result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
658 sizeof(struct sockaddr));
659 if (result < 0) {
660 IP_VS_ERR("Error binding to the multicast addr\n");
661 goto error;
662 }
663
664 /* join the multicast group */
665 result = join_mcast_group(sock->sk,
666 (struct in_addr *) &mcast_addr.sin_addr,
667 ip_vs_backup_mcast_ifn);
668 if (result < 0) {
669 IP_VS_ERR("Error joining to the multicast group\n");
670 goto error;
671 }
672
673 return sock;
674
675 error:
676 sock_release(sock);
677 return ERR_PTR(result);
678}
679
680
681static int
682ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
683{
684 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
685 struct kvec iov;
686 int len;
687
688 EnterFunction(7);
689 iov.iov_base = (void *)buffer;
690 iov.iov_len = length;
691
692 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
693
694 LeaveFunction(7);
695 return len;
696}
697
698static void
699ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
700{
701 int msize;
702
703 msize = msg->size;
704
705 /* Put size in network byte order */
706 msg->size = htons(msg->size);
707
708 if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
709 IP_VS_ERR("ip_vs_send_async error\n");
710}
711
712static int
713ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
714{
715 struct msghdr msg = {NULL,};
716 struct kvec iov;
717 int len;
718
719 EnterFunction(7);
720
721 /* Receive a packet */
722 iov.iov_base = buffer;
723 iov.iov_len = (size_t)buflen;
724
725 len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
726
727 if (len < 0)
728 return -1;
729
730 LeaveFunction(7);
731 return len;
732}
733
734
735static int sync_thread_master(void *data)
736{
737 struct ip_vs_sync_thread_data *tinfo = data;
738 struct ip_vs_sync_buff *sb;
739
740 IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
741 "syncid = %d\n",
742 ip_vs_master_mcast_ifn, ip_vs_master_syncid);
743
744 while (!kthread_should_stop()) {
745 while ((sb = sb_dequeue())) {
746 ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
747 ip_vs_sync_buff_release(sb);
748 }
749
750 /* check if entries stay in curr_sb for 2 seconds */
751 sb = get_curr_sync_buff(2 * HZ);
752 if (sb) {
753 ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
754 ip_vs_sync_buff_release(sb);
755 }
756
757 schedule_timeout_interruptible(HZ);
758 }
759
760 /* clean up the sync_buff queue */
761 while ((sb=sb_dequeue())) {
762 ip_vs_sync_buff_release(sb);
763 }
764
765 /* clean up the current sync_buff */
766 if ((sb = get_curr_sync_buff(0))) {
767 ip_vs_sync_buff_release(sb);
768 }
769
770 /* release the sending multicast socket */
771 sock_release(tinfo->sock);
772 kfree(tinfo);
773
774 return 0;
775}
776
777
778static int sync_thread_backup(void *data)
779{
780 struct ip_vs_sync_thread_data *tinfo = data;
781 int len;
782
783 IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
784 "syncid = %d\n",
785 ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
786
787 while (!kthread_should_stop()) {
788 wait_event_interruptible(*tinfo->sock->sk->sk_sleep,
789 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
790 || kthread_should_stop());
791
792 /* do we have data now? */
793 while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
794 len = ip_vs_receive(tinfo->sock, tinfo->buf,
795 sync_recv_mesg_maxlen);
796 if (len <= 0) {
797 IP_VS_ERR("receiving message error\n");
798 break;
799 }
800
801 /* disable bottom half, because it accesses the data
802 shared by softirq while getting/creating conns */
803 local_bh_disable();
804 ip_vs_process_message(tinfo->buf, len);
805 local_bh_enable();
806 }
807 }
808
810	/* release the receiving multicast socket */
810 sock_release(tinfo->sock);
811 kfree(tinfo->buf);
812 kfree(tinfo);
813
814 return 0;
815}
816
817
818int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
819{
820 struct ip_vs_sync_thread_data *tinfo;
821 struct task_struct **realtask, *task;
822 struct socket *sock;
823 char *name, *buf = NULL;
824 int (*threadfn)(void *data);
825 int result = -ENOMEM;
826
827 IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
828 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
829 sizeof(struct ip_vs_sync_conn));
830
831 if (state == IP_VS_STATE_MASTER) {
832 if (sync_master_thread)
833 return -EEXIST;
834
835 strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
836 sizeof(ip_vs_master_mcast_ifn));
837 ip_vs_master_syncid = syncid;
838 realtask = &sync_master_thread;
839 name = "ipvs_syncmaster";
840 threadfn = sync_thread_master;
841 sock = make_send_sock();
842 } else if (state == IP_VS_STATE_BACKUP) {
843 if (sync_backup_thread)
844 return -EEXIST;
845
846 strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
847 sizeof(ip_vs_backup_mcast_ifn));
848 ip_vs_backup_syncid = syncid;
849 realtask = &sync_backup_thread;
850 name = "ipvs_syncbackup";
851 threadfn = sync_thread_backup;
852 sock = make_receive_sock();
853 } else {
854 return -EINVAL;
855 }
856
857 if (IS_ERR(sock)) {
858 result = PTR_ERR(sock);
859 goto out;
860 }
861
862 set_sync_mesg_maxlen(state);
863 if (state == IP_VS_STATE_BACKUP) {
864 buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
865 if (!buf)
866 goto outsocket;
867 }
868
869 tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
870 if (!tinfo)
871 goto outbuf;
872
873 tinfo->sock = sock;
874 tinfo->buf = buf;
875
876 task = kthread_run(threadfn, tinfo, name);
877 if (IS_ERR(task)) {
878 result = PTR_ERR(task);
879 goto outtinfo;
880 }
881
882 /* mark as active */
883 *realtask = task;
884 ip_vs_sync_state |= state;
885
886 /* increase the module use count */
887 ip_vs_use_count_inc();
888
889 return 0;
890
891outtinfo:
892 kfree(tinfo);
893outbuf:
894 kfree(buf);
895outsocket:
896 sock_release(sock);
897out:
898 return result;
899}
900
901
902int stop_sync_thread(int state)
903{
904 IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
905
906 if (state == IP_VS_STATE_MASTER) {
907 if (!sync_master_thread)
908 return -ESRCH;
909
910 IP_VS_INFO("stopping master sync thread %d ...\n",
911 task_pid_nr(sync_master_thread));
912
913 /*
914 * The lock synchronizes with sb_queue_tail(), so that we don't
915	 * add sync buffers to the queue when we are already in the
916	 * process of stopping the master sync daemon.
917 */
918
919 spin_lock_bh(&ip_vs_sync_lock);
920 ip_vs_sync_state &= ~IP_VS_STATE_MASTER;
921 spin_unlock_bh(&ip_vs_sync_lock);
922 kthread_stop(sync_master_thread);
923 sync_master_thread = NULL;
924 } else if (state == IP_VS_STATE_BACKUP) {
925 if (!sync_backup_thread)
926 return -ESRCH;
927
928 IP_VS_INFO("stopping backup sync thread %d ...\n",
929 task_pid_nr(sync_backup_thread));
930
931 ip_vs_sync_state &= ~IP_VS_STATE_BACKUP;
932 kthread_stop(sync_backup_thread);
933 sync_backup_thread = NULL;
934 } else {
935 return -EINVAL;
936 }
937
938 /* decrease the module use count */
939 ip_vs_use_count_dec();
940
941 return 0;
942}
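For context, a hedged sketch of how the IPVS control code (in ip_vs_ctl.c, not part of this file) would drive the two entry points above; the interface name "eth0" and syncid 3 are made-up example values:

/* Sketch only, not part of this file. */
static int example_toggle_master_sync(void)
{
	int ret;

	ret = start_sync_thread(IP_VS_STATE_MASTER, "eth0", 3);
	if (ret)
		return ret;		/* -EEXIST, -EINVAL, -ENOMEM, ... */

	/* ... later, when state synchronization is no longer wanted ... */
	return stop_sync_thread(IP_VS_STATE_MASTER);
}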
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c
deleted file mode 100644
index 8c596e712599..000000000000
--- a/net/ipv4/ipvs/ip_vs_wlc.c
+++ /dev/null
@@ -1,128 +0,0 @@
1/*
2 * IPVS: Weighted Least-Connection Scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Peter Kese <peter.kese@ijs.si>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Changes:
13 * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
14 * Wensong Zhang : changed to use the inactconns in scheduling
15 * Wensong Zhang : changed some cosmetic things for debugging
16 * Wensong Zhang : changed for the d-linked destination list
17 * Wensong Zhang : added the ip_vs_wlc_update_svc
18 * Wensong Zhang : added quiescing of any dest with weight=0
19 *
20 */
21
22#include <linux/module.h>
23#include <linux/kernel.h>
24
25#include <net/ip_vs.h>
26
27
28static inline unsigned int
29ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
30{
31 /*
32 * We think the overhead of processing active connections is 256
33 * times higher than that of inactive connections on average. (This
34 * factor of 256 may not be accurate; we will tune it later.) We
35 * use the following formula to estimate the overhead now:
36 * dest->activeconns*256 + dest->inactconns
37 */
38 return (atomic_read(&dest->activeconns) << 8) +
39 atomic_read(&dest->inactconns);
40}
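A worked instance of the estimate above, with illustrative numbers only:

/* dest A: activeconns = 3, inactconns = 10  -> 3*256 + 10  = 778
 * dest B: activeconns = 1, inactconns = 400 -> 1*256 + 400 = 656
 * B is treated as the less loaded destination despite its many
 * inactive connections. */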
41
42
43/*
44 * Weighted Least Connection scheduling
45 */
46static struct ip_vs_dest *
47ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
48{
49 struct ip_vs_dest *dest, *least;
50 unsigned int loh, doh;
51
52 IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
53
54 /*
55 * We calculate the load of each dest server as follows:
56 * (dest overhead) / dest->weight
57 *
58 * Remember -- no floats in kernel mode!!!
59 * The comparison of h1*w2 > h2*w1 is equivalent to that of
60 * h1/w1 > h2/w2
61 * if every weight is larger than zero.
62 *
63 * The server with weight=0 is quiesced and will not receive any
64 * new connections.
65 */
66
67 list_for_each_entry(dest, &svc->destinations, n_list) {
68 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
69 atomic_read(&dest->weight) > 0) {
70 least = dest;
71 loh = ip_vs_wlc_dest_overhead(least);
72 goto nextstage;
73 }
74 }
75 return NULL;
76
77 /*
78 * Find the destination with the least load.
79 */
80 nextstage:
81 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
82 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
83 continue;
84 doh = ip_vs_wlc_dest_overhead(dest);
85 if (loh * atomic_read(&dest->weight) >
86 doh * atomic_read(&least->weight)) {
87 least = dest;
88 loh = doh;
89 }
90 }
91
92 IP_VS_DBG_BUF(6, "WLC: server %s:%u "
93 "activeconns %d refcnt %d weight %d overhead %d\n",
94 IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
95 atomic_read(&least->activeconns),
96 atomic_read(&least->refcnt),
97 atomic_read(&least->weight), loh);
98
99 return least;
100}
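The loop above never divides; a worked instance of the h1*w2 > h2*w1 comparison, again with made-up numbers:

/* least: loh = 778, weight = 2  -> weighted load 778/2 = 389
 * dest:  doh = 656, weight = 1  -> weighted load 656/1 = 656
 * The test loh*weight(dest) > doh*weight(least) becomes
 * 778*1 = 778 > 656*2 = 1312, which is false, so 'least' keeps its
 * place as the destination with the smaller weighted load. */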
101
102
103static struct ip_vs_scheduler ip_vs_wlc_scheduler =
104{
105 .name = "wlc",
106 .refcnt = ATOMIC_INIT(0),
107 .module = THIS_MODULE,
108 .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
109#ifdef CONFIG_IP_VS_IPV6
110 .supports_ipv6 = 1,
111#endif
112 .schedule = ip_vs_wlc_schedule,
113};
114
115
116static int __init ip_vs_wlc_init(void)
117{
118 return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
119}
120
121static void __exit ip_vs_wlc_cleanup(void)
122{
123 unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
124}
125
126module_init(ip_vs_wlc_init);
127module_exit(ip_vs_wlc_cleanup);
128MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c
deleted file mode 100644
index 7ea92fed50bf..000000000000
--- a/net/ipv4/ipvs/ip_vs_wrr.c
+++ /dev/null
@@ -1,237 +0,0 @@
1/*
2 * IPVS: Weighted Round-Robin Scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
13 * Wensong Zhang : changed some cosmetic things for debugging
14 * Wensong Zhang : changed for the d-linked destination list
15 * Wensong Zhang : added the ip_vs_wrr_update_svc
16 * Julian Anastasov : fixed the bug of returning destination
17 * with weight 0 when all weights are zero
18 *
19 */
20
21#include <linux/module.h>
22#include <linux/kernel.h>
23#include <linux/net.h>
24
25#include <net/ip_vs.h>
26
27/*
28 * current destination pointer for weighted round-robin scheduling
29 */
30struct ip_vs_wrr_mark {
31 struct list_head *cl; /* current list head */
32 int cw; /* current weight */
33 int mw; /* maximum weight */
34 int di; /* decreasing interval */
35};
36
37
38/*
39 * Get the gcd of server weights
40 */
41static int gcd(int a, int b)
42{
43 int c;
44
45 while ((c = a % b)) {
46 a = b;
47 b = c;
48 }
49 return b;
50}
51
52static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
53{
54 struct ip_vs_dest *dest;
55 int weight;
56 int g = 0;
57
58 list_for_each_entry(dest, &svc->destinations, n_list) {
59 weight = atomic_read(&dest->weight);
60 if (weight > 0) {
61 if (g > 0)
62 g = gcd(weight, g);
63 else
64 g = weight;
65 }
66 }
67 return g ? g : 1;
68}
69
70
71/*
72 * Get the maximum weight of the service destinations.
73 */
74static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
75{
76 struct ip_vs_dest *dest;
77 int weight = 0;
78
79 list_for_each_entry(dest, &svc->destinations, n_list) {
80 if (atomic_read(&dest->weight) > weight)
81 weight = atomic_read(&dest->weight);
82 }
83
84 return weight;
85}
86
87
88static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
89{
90 struct ip_vs_wrr_mark *mark;
91
92 /*
93 * Allocate the mark variable for WRR scheduling
94 */
95 mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
96 if (mark == NULL) {
97 IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
98 return -ENOMEM;
99 }
100 mark->cl = &svc->destinations;
101 mark->cw = 0;
102 mark->mw = ip_vs_wrr_max_weight(svc);
103 mark->di = ip_vs_wrr_gcd_weight(svc);
104 svc->sched_data = mark;
105
106 return 0;
107}
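To make the initialization above concrete (example weights only): with destinations weighted 4, 6 and 8, ip_vs_wrr_max_weight() yields mw = 8 and ip_vs_wrr_gcd_weight() yields di = 2, so during scheduling mark->cw will step through 8, 6, 4, 2 before wrapping back to 8:

/* Example state after ip_vs_wrr_init_svc() for weights {4, 6, 8}:
 *   mark->cl = &svc->destinations;  (list head)
 *   mark->cw = 0;                   (forces a reset on the first pass)
 *   mark->mw = 8;
 *   mark->di = 2;
 */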
108
109
110static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
111{
112 /*
113 * Release the mark variable
114 */
115 kfree(svc->sched_data);
116
117 return 0;
118}
119
120
121static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
122{
123 struct ip_vs_wrr_mark *mark = svc->sched_data;
124
125 mark->cl = &svc->destinations;
126 mark->mw = ip_vs_wrr_max_weight(svc);
127 mark->di = ip_vs_wrr_gcd_weight(svc);
128 if (mark->cw > mark->mw)
129 mark->cw = 0;
130 return 0;
131}
132
133
134/*
135 * Weighted Round-Robin Scheduling
136 */
137static struct ip_vs_dest *
138ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
139{
140 struct ip_vs_dest *dest;
141 struct ip_vs_wrr_mark *mark = svc->sched_data;
142 struct list_head *p;
143
144 IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
145
146 /*
147 * This loop will always terminate, because mark->cw is in (0, max_weight]
148 * and at least one server has its weight equal to max_weight.
149 */
150 write_lock(&svc->sched_lock);
151 p = mark->cl;
152 while (1) {
153 if (mark->cl == &svc->destinations) {
154 /* it is at the head of the destination list */
155
156 if (mark->cl == mark->cl->next) {
157 /* no dest entry */
158 dest = NULL;
159 goto out;
160 }
161
162 mark->cl = svc->destinations.next;
163 mark->cw -= mark->di;
164 if (mark->cw <= 0) {
165 mark->cw = mark->mw;
166 /*
167 * Still zero, which means no available servers.
168 */
169 if (mark->cw == 0) {
170 mark->cl = &svc->destinations;
171 IP_VS_ERR_RL("ip_vs_wrr_schedule(): "
172 "no available servers\n");
173 dest = NULL;
174 goto out;
175 }
176 }
177 } else
178 mark->cl = mark->cl->next;
179
180 if (mark->cl != &svc->destinations) {
181 /* not at the head of the list */
182 dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
183 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
184 atomic_read(&dest->weight) >= mark->cw) {
185 /* got it */
186 break;
187 }
188 }
189
190 if (mark->cl == p && mark->cw == mark->di) {
191 /* back to the start, and no dest is found.
192 It is only possible when all dests are OVERLOADED */
193 dest = NULL;
194 goto out;
195 }
196 }
197
198 IP_VS_DBG_BUF(6, "WRR: server %s:%u "
199 "activeconns %d refcnt %d weight %d\n",
200 IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
201 atomic_read(&dest->activeconns),
202 atomic_read(&dest->refcnt),
203 atomic_read(&dest->weight));
204
205 out:
206 write_unlock(&svc->sched_lock);
207 return dest;
208}
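A worked pass through the loop above for three destinations A, B and C with example weights 3, 2 and 1 (so mw = 3, di = 1, and cw starts at 0):

/* call 1: wrap past the head, cw -> 3; pick A (3 >= 3)
 * call 2: B (2 < 3), C (1 < 3), wrap, cw -> 2; pick A (3 >= 2)
 * call 3: pick B (2 >= 2)
 * call 4: C (1 < 2), wrap, cw -> 1; pick A (3 >= 1)
 * call 5: pick B (2 >= 1)
 * call 6: pick C (1 >= 1)
 * i.e. the interleaved sequence A, A, B, A, B, C, repeating. */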
209
210
211static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
212 .name = "wrr",
213 .refcnt = ATOMIC_INIT(0),
214 .module = THIS_MODULE,
215 .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
216#ifdef CONFIG_IP_VS_IPV6
217 .supports_ipv6 = 1,
218#endif
219 .init_service = ip_vs_wrr_init_svc,
220 .done_service = ip_vs_wrr_done_svc,
221 .update_service = ip_vs_wrr_update_svc,
222 .schedule = ip_vs_wrr_schedule,
223};
224
225static int __init ip_vs_wrr_init(void)
226{
227	return register_ip_vs_scheduler(&ip_vs_wrr_scheduler);
228}
229
230static void __exit ip_vs_wrr_cleanup(void)
231{
232 unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
233}
234
235module_init(ip_vs_wrr_init);
236module_exit(ip_vs_wrr_cleanup);
237MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
deleted file mode 100644
index 02ddc2b3ce2e..000000000000
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ /dev/null
@@ -1,1004 +0,0 @@
1/*
2 * ip_vs_xmit.c: various packet transmitters for IPVS
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Julian Anastasov <ja@ssi.bg>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Changes:
13 *
14 */
15
16#include <linux/kernel.h>
17#include <linux/tcp.h> /* for tcphdr */
18#include <net/ip.h>
19#include <net/tcp.h> /* for csum_tcpudp_magic */
20#include <net/udp.h>
21#include <net/icmp.h> /* for icmp_send */
22#include <net/route.h> /* for ip_route_output */
23#include <net/ipv6.h>
24#include <net/ip6_route.h>
25#include <linux/icmpv6.h>
26#include <linux/netfilter.h>
27#include <linux/netfilter_ipv4.h>
28
29#include <net/ip_vs.h>
30
31
32/*
33 * Destination cache to speed up outgoing route lookup
34 */
35static inline void
36__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
37{
38 struct dst_entry *old_dst;
39
40 old_dst = dest->dst_cache;
41 dest->dst_cache = dst;
42 dest->dst_rtos = rtos;
43 dst_release(old_dst);
44}
45
46static inline struct dst_entry *
47__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
48{
49 struct dst_entry *dst = dest->dst_cache;
50
51 if (!dst)
52 return NULL;
53 if ((dst->obsolete
54 || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
55 dst->ops->check(dst, cookie) == NULL) {
56 dest->dst_cache = NULL;
57 dst_release(dst);
58 return NULL;
59 }
60 dst_hold(dst);
61 return dst;
62}
63
64static struct rtable *
65__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
66{
67 struct rtable *rt; /* Route to the other host */
68 struct ip_vs_dest *dest = cp->dest;
69
70 if (dest) {
71 spin_lock(&dest->dst_lock);
72 if (!(rt = (struct rtable *)
73 __ip_vs_dst_check(dest, rtos, 0))) {
74 struct flowi fl = {
75 .oif = 0,
76 .nl_u = {
77 .ip4_u = {
78 .daddr = dest->addr.ip,
79 .saddr = 0,
80 .tos = rtos, } },
81 };
82
83 if (ip_route_output_key(&init_net, &rt, &fl)) {
84 spin_unlock(&dest->dst_lock);
85 IP_VS_DBG_RL("ip_route_output error, "
86 "dest: %u.%u.%u.%u\n",
87 NIPQUAD(dest->addr.ip));
88 return NULL;
89 }
90 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
91 IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
92 NIPQUAD(dest->addr.ip),
93 atomic_read(&rt->u.dst.__refcnt), rtos);
94 }
95 spin_unlock(&dest->dst_lock);
96 } else {
97 struct flowi fl = {
98 .oif = 0,
99 .nl_u = {
100 .ip4_u = {
101 .daddr = cp->daddr.ip,
102 .saddr = 0,
103 .tos = rtos, } },
104 };
105
106 if (ip_route_output_key(&init_net, &rt, &fl)) {
107 IP_VS_DBG_RL("ip_route_output error, dest: "
108 "%u.%u.%u.%u\n", NIPQUAD(cp->daddr.ip));
109 return NULL;
110 }
111 }
112
113 return rt;
114}
115
116#ifdef CONFIG_IP_VS_IPV6
117static struct rt6_info *
118__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
119{
120 struct rt6_info *rt; /* Route to the other host */
121 struct ip_vs_dest *dest = cp->dest;
122
123 if (dest) {
124 spin_lock(&dest->dst_lock);
125 rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
126 if (!rt) {
127 struct flowi fl = {
128 .oif = 0,
129 .nl_u = {
130 .ip6_u = {
131 .daddr = dest->addr.in6,
132 .saddr = {
133 .s6_addr32 =
134 { 0, 0, 0, 0 },
135 },
136 },
137 },
138 };
139
140 rt = (struct rt6_info *)ip6_route_output(&init_net,
141 NULL, &fl);
142 if (!rt) {
143 spin_unlock(&dest->dst_lock);
144 IP_VS_DBG_RL("ip6_route_output error, "
145 "dest: " NIP6_FMT "\n",
146 NIP6(dest->addr.in6));
147 return NULL;
148 }
149 __ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst));
150 IP_VS_DBG(10, "new dst " NIP6_FMT ", refcnt=%d\n",
151 NIP6(dest->addr.in6),
152 atomic_read(&rt->u.dst.__refcnt));
153 }
154 spin_unlock(&dest->dst_lock);
155 } else {
156 struct flowi fl = {
157 .oif = 0,
158 .nl_u = {
159 .ip6_u = {
160 .daddr = cp->daddr.in6,
161 .saddr = {
162 .s6_addr32 = { 0, 0, 0, 0 },
163 },
164 },
165 },
166 };
167
168 rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
169 if (!rt) {
170 IP_VS_DBG_RL("ip6_route_output error, dest: "
171 NIP6_FMT "\n", NIP6(cp->daddr.in6));
172 return NULL;
173 }
174 }
175
176 return rt;
177}
178#endif
179
180
181/*
182 * Release dest->dst_cache before a dest is removed
183 */
184void
185ip_vs_dst_reset(struct ip_vs_dest *dest)
186{
187 struct dst_entry *old_dst;
188
189 old_dst = dest->dst_cache;
190 dest->dst_cache = NULL;
191 dst_release(old_dst);
192}
193
194#define IP_VS_XMIT(pf, skb, rt) \
195do { \
196 (skb)->ipvs_property = 1; \
197 skb_forward_csum(skb); \
198 NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
199 (rt)->u.dst.dev, dst_output); \
200} while (0)
201
202
203/*
204 * NULL transmitter (do nothing except return NF_ACCEPT)
205 */
206int
207ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
208 struct ip_vs_protocol *pp)
209{
210 /* we do not touch skb and do not need pskb ptr */
211 return NF_ACCEPT;
212}
213
214
215/*
216 * Bypass transmitter
217 * Let packets bypass the destination when the destination is not
218 * available; it may only be used in a transparent cache cluster.
219 */
220int
221ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
222 struct ip_vs_protocol *pp)
223{
224 struct rtable *rt; /* Route to the other host */
225 struct iphdr *iph = ip_hdr(skb);
226 u8 tos = iph->tos;
227 int mtu;
228 struct flowi fl = {
229 .oif = 0,
230 .nl_u = {
231 .ip4_u = {
232 .daddr = iph->daddr,
233 .saddr = 0,
234 .tos = RT_TOS(tos), } },
235 };
236
237 EnterFunction(10);
238
239 if (ip_route_output_key(&init_net, &rt, &fl)) {
240 IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
241 "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
242 goto tx_error_icmp;
243 }
244
245 /* MTU checking */
246 mtu = dst_mtu(&rt->u.dst);
247 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
248 ip_rt_put(rt);
249 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
250 IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
251 goto tx_error;
252 }
253
254 /*
255 * Call ip_send_check because we are not sure it is called
256 * after ip_defrag. Is copy-on-write needed?
257 */
258 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
259 ip_rt_put(rt);
260 return NF_STOLEN;
261 }
262 ip_send_check(ip_hdr(skb));
263
264 /* drop old route */
265 dst_release(skb->dst);
266 skb->dst = &rt->u.dst;
267
268 /* Another hack: avoid icmp_send in ip_fragment */
269 skb->local_df = 1;
270
271 IP_VS_XMIT(PF_INET, skb, rt);
272
273 LeaveFunction(10);
274 return NF_STOLEN;
275
276 tx_error_icmp:
277 dst_link_failure(skb);
278 tx_error:
279 kfree_skb(skb);
280 LeaveFunction(10);
281 return NF_STOLEN;
282}
283
284#ifdef CONFIG_IP_VS_IPV6
285int
286ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
287 struct ip_vs_protocol *pp)
288{
289 struct rt6_info *rt; /* Route to the other host */
290 struct ipv6hdr *iph = ipv6_hdr(skb);
291 int mtu;
292 struct flowi fl = {
293 .oif = 0,
294 .nl_u = {
295 .ip6_u = {
296 .daddr = iph->daddr,
297 .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
298 };
299
300 EnterFunction(10);
301
302 rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
303 if (!rt) {
304 IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, "
305 "dest: " NIP6_FMT "\n", NIP6(iph->daddr));
306 goto tx_error_icmp;
307 }
308
309 /* MTU checking */
310 mtu = dst_mtu(&rt->u.dst);
311 if (skb->len > mtu) {
312 dst_release(&rt->u.dst);
313 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
314 IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): frag needed\n");
315 goto tx_error;
316 }
317
318 /*
319 * Call ip_send_check because we are not sure it is called
320 * after ip_defrag. Is copy-on-write needed?
321 */
322 skb = skb_share_check(skb, GFP_ATOMIC);
323 if (unlikely(skb == NULL)) {
324 dst_release(&rt->u.dst);
325 return NF_STOLEN;
326 }
327
328 /* drop old route */
329 dst_release(skb->dst);
330 skb->dst = &rt->u.dst;
331
332 /* Another hack: avoid icmp_send in ip_fragment */
333 skb->local_df = 1;
334
335 IP_VS_XMIT(PF_INET6, skb, rt);
336
337 LeaveFunction(10);
338 return NF_STOLEN;
339
340 tx_error_icmp:
341 dst_link_failure(skb);
342 tx_error:
343 kfree_skb(skb);
344 LeaveFunction(10);
345 return NF_STOLEN;
346}
347#endif
348
349/*
350 * NAT transmitter (only for outside-to-inside nat forwarding)
351 * Not used for related ICMP
352 */
353int
354ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
355 struct ip_vs_protocol *pp)
356{
357 struct rtable *rt; /* Route to the other host */
358 int mtu;
359 struct iphdr *iph = ip_hdr(skb);
360
361 EnterFunction(10);
362
363 /* check if it is a connection of no-client-port */
364 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
365 __be16 _pt, *p;
366 p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
367 if (p == NULL)
368 goto tx_error;
369 ip_vs_conn_fill_cport(cp, *p);
370 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
371 }
372
373 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
374 goto tx_error_icmp;
375
376 /* MTU checking */
377 mtu = dst_mtu(&rt->u.dst);
378 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
379 ip_rt_put(rt);
380 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
381 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
382 goto tx_error;
383 }
384
385 /* copy-on-write the packet before mangling it */
386 if (!skb_make_writable(skb, sizeof(struct iphdr)))
387 goto tx_error_put;
388
389 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
390 goto tx_error_put;
391
392 /* drop old route */
393 dst_release(skb->dst);
394 skb->dst = &rt->u.dst;
395
396 /* mangle the packet */
397 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
398 goto tx_error;
399 ip_hdr(skb)->daddr = cp->daddr.ip;
400 ip_send_check(ip_hdr(skb));
401
402 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
403
404 /* FIXME: when application helper enlarges the packet and the length
405	 is larger than the MTU of the outgoing device, there will still be
406	 an MTU problem. */
407
408 /* Another hack: avoid icmp_send in ip_fragment */
409 skb->local_df = 1;
410
411 IP_VS_XMIT(PF_INET, skb, rt);
412
413 LeaveFunction(10);
414 return NF_STOLEN;
415
416 tx_error_icmp:
417 dst_link_failure(skb);
418 tx_error:
419 LeaveFunction(10);
420 kfree_skb(skb);
421 return NF_STOLEN;
422 tx_error_put:
423 ip_rt_put(rt);
424 goto tx_error;
425}
426
427#ifdef CONFIG_IP_VS_IPV6
428int
429ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
430 struct ip_vs_protocol *pp)
431{
432 struct rt6_info *rt; /* Route to the other host */
433 int mtu;
434
435 EnterFunction(10);
436
437 /* check if it is a connection of no-client-port */
438 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
439 __be16 _pt, *p;
440 p = skb_header_pointer(skb, sizeof(struct ipv6hdr),
441 sizeof(_pt), &_pt);
442 if (p == NULL)
443 goto tx_error;
444 ip_vs_conn_fill_cport(cp, *p);
445 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
446 }
447
448 rt = __ip_vs_get_out_rt_v6(cp);
449 if (!rt)
450 goto tx_error_icmp;
451
452 /* MTU checking */
453 mtu = dst_mtu(&rt->u.dst);
454 if (skb->len > mtu) {
455 dst_release(&rt->u.dst);
456 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
457 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
458 "ip_vs_nat_xmit_v6(): frag needed for");
459 goto tx_error;
460 }
461
462 /* copy-on-write the packet before mangling it */
463 if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
464 goto tx_error_put;
465
466 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
467 goto tx_error_put;
468
469 /* drop old route */
470 dst_release(skb->dst);
471 skb->dst = &rt->u.dst;
472
473 /* mangle the packet */
474 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
475 goto tx_error;
476 ipv6_hdr(skb)->daddr = cp->daddr.in6;
477
478 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
479
480 /* FIXME: when application helper enlarges the packet and the length
481	 is larger than the MTU of the outgoing device, there will still be
482	 an MTU problem. */
483
484 /* Another hack: avoid icmp_send in ip_fragment */
485 skb->local_df = 1;
486
487 IP_VS_XMIT(PF_INET6, skb, rt);
488
489 LeaveFunction(10);
490 return NF_STOLEN;
491
492tx_error_icmp:
493 dst_link_failure(skb);
494tx_error:
495 LeaveFunction(10);
496 kfree_skb(skb);
497 return NF_STOLEN;
498tx_error_put:
499 dst_release(&rt->u.dst);
500 goto tx_error;
501}
502#endif
503
504
505/*
506 * IP Tunneling transmitter
507 *
508 * This function encapsulates the packet in a new IP packet, its
509 * destination will be set to cp->daddr. Most code of this function
510 * is taken from ipip.c.
511 *
512 * It is used in VS/TUN cluster. The load balancer selects a real
513 * server from a cluster based on a scheduling algorithm,
514 * encapsulates the request packet and forwards it to the selected
515 * server. For example, all real servers are configured with
516 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
517 * the encapsulated packet, it will decapsulate the packet, process
518 * the request and return the response packets directly to the client
519 * without passing through the load balancer. This can greatly increase
520 * the scalability of the virtual server.
521 *
522 * Used for ANY protocol
523 */
524int
525ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
526 struct ip_vs_protocol *pp)
527{
528 struct rtable *rt; /* Route to the other host */
529 struct net_device *tdev; /* Device to other host */
530 struct iphdr *old_iph = ip_hdr(skb);
531 u8 tos = old_iph->tos;
532 __be16 df = old_iph->frag_off;
533 sk_buff_data_t old_transport_header = skb->transport_header;
534 struct iphdr *iph; /* Our new IP header */
535 unsigned int max_headroom; /* The extra header space needed */
536 int mtu;
537
538 EnterFunction(10);
539
540 if (skb->protocol != htons(ETH_P_IP)) {
541 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
542 "ETH_P_IP: %d, skb protocol: %d\n",
543 htons(ETH_P_IP), skb->protocol);
544 goto tx_error;
545 }
546
547 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
548 goto tx_error_icmp;
549
550 tdev = rt->u.dst.dev;
551
552 mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
553 if (mtu < 68) {
554 ip_rt_put(rt);
555 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
556 goto tx_error;
557 }
558 if (skb->dst)
559 skb->dst->ops->update_pmtu(skb->dst, mtu);
560
561 df |= (old_iph->frag_off & htons(IP_DF));
562
563 if ((old_iph->frag_off & htons(IP_DF))
564 && mtu < ntohs(old_iph->tot_len)) {
565 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
566 ip_rt_put(rt);
567 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
568 goto tx_error;
569 }
570
571 /*
572 * Okay, now see if we can stuff it in the buffer as-is.
573 */
574 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
575
576 if (skb_headroom(skb) < max_headroom
577 || skb_cloned(skb) || skb_shared(skb)) {
578 struct sk_buff *new_skb =
579 skb_realloc_headroom(skb, max_headroom);
580 if (!new_skb) {
581 ip_rt_put(rt);
582 kfree_skb(skb);
583 IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
584 return NF_STOLEN;
585 }
586 kfree_skb(skb);
587 skb = new_skb;
588 old_iph = ip_hdr(skb);
589 }
590
591 skb->transport_header = old_transport_header;
592
593 /* fix old IP header checksum */
594 ip_send_check(old_iph);
595
596 skb_push(skb, sizeof(struct iphdr));
597 skb_reset_network_header(skb);
598 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
599
600 /* drop old route */
601 dst_release(skb->dst);
602 skb->dst = &rt->u.dst;
603
604 /*
605 * Push down and install the IPIP header.
606 */
607 iph = ip_hdr(skb);
608 iph->version = 4;
609 iph->ihl = sizeof(struct iphdr)>>2;
610 iph->frag_off = df;
611 iph->protocol = IPPROTO_IPIP;
612 iph->tos = tos;
613 iph->daddr = rt->rt_dst;
614 iph->saddr = rt->rt_src;
615 iph->ttl = old_iph->ttl;
616 ip_select_ident(iph, &rt->u.dst, NULL);
617
618 /* Another hack: avoid icmp_send in ip_fragment */
619 skb->local_df = 1;
620
621 ip_local_out(skb);
622
623 LeaveFunction(10);
624
625 return NF_STOLEN;
626
627 tx_error_icmp:
628 dst_link_failure(skb);
629 tx_error:
630 kfree_skb(skb);
631 LeaveFunction(10);
632 return NF_STOLEN;
633}
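A sketch of the packet the code above hands to ip_local_out(); the field values come from the assignments in the function, and the layout is drawn only for illustration:

/*
 *   +---------------------------+----------------------------------+
 *   | new outer IPv4 header     | original client packet           |
 *   |  protocol = IPPROTO_IPIP  |  (its own IP header + payload,   |
 *   |  saddr    = rt->rt_src    |   untouched apart from the       |
 *   |  daddr    = rt->rt_dst    |   refreshed checksum)            |
 *   |  tos, ttl, DF copied from |                                  |
 *   |  the inner header         |                                  |
 *   +---------------------------+----------------------------------+
 *
 * The real server terminates the IPIP tunnel (e.g. on a tunl0 device
 * carrying the virtual IP, as the comment before this function notes)
 * and replies to the client directly.
 */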
634
635#ifdef CONFIG_IP_VS_IPV6
636int
637ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
638 struct ip_vs_protocol *pp)
639{
640 struct rt6_info *rt; /* Route to the other host */
641 struct net_device *tdev; /* Device to other host */
642 struct ipv6hdr *old_iph = ipv6_hdr(skb);
643 sk_buff_data_t old_transport_header = skb->transport_header;
644 struct ipv6hdr *iph; /* Our new IP header */
645 unsigned int max_headroom; /* The extra header space needed */
646 int mtu;
647
648 EnterFunction(10);
649
650 if (skb->protocol != htons(ETH_P_IPV6)) {
651 IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): protocol error, "
652 "ETH_P_IPV6: %d, skb protocol: %d\n",
653 htons(ETH_P_IPV6), skb->protocol);
654 goto tx_error;
655 }
656
657 rt = __ip_vs_get_out_rt_v6(cp);
658 if (!rt)
659 goto tx_error_icmp;
660
661 tdev = rt->u.dst.dev;
662
663 mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr);
664 /* TODO IPv6: do we need this check in IPv6? */
665 if (mtu < 1280) {
666 dst_release(&rt->u.dst);
667 IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n");
668 goto tx_error;
669 }
670 if (skb->dst)
671 skb->dst->ops->update_pmtu(skb->dst, mtu);
672
673 if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
674 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
675 dst_release(&rt->u.dst);
676 IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): frag needed\n");
677 goto tx_error;
678 }
679
680 /*
681 * Okay, now see if we can stuff it in the buffer as-is.
682 */
683 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
684
685 if (skb_headroom(skb) < max_headroom
686 || skb_cloned(skb) || skb_shared(skb)) {
687 struct sk_buff *new_skb =
688 skb_realloc_headroom(skb, max_headroom);
689 if (!new_skb) {
690 dst_release(&rt->u.dst);
691 kfree_skb(skb);
692 IP_VS_ERR_RL("ip_vs_tunnel_xmit_v6(): no memory\n");
693 return NF_STOLEN;
694 }
695 kfree_skb(skb);
696 skb = new_skb;
697 old_iph = ipv6_hdr(skb);
698 }
699
700 skb->transport_header = old_transport_header;
701
702 skb_push(skb, sizeof(struct ipv6hdr));
703 skb_reset_network_header(skb);
704 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
705
706 /* drop old route */
707 dst_release(skb->dst);
708 skb->dst = &rt->u.dst;
709
710 /*
711 * Push down and install the IPIP header.
712 */
713 iph = ipv6_hdr(skb);
714 iph->version = 6;
715 iph->nexthdr = IPPROTO_IPV6;
716	iph->payload_len = htons(ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr));
717 iph->priority = old_iph->priority;
718 memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
719 iph->daddr = rt->rt6i_dst.addr;
720 iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */
721 iph->hop_limit = old_iph->hop_limit;
722
723 /* Another hack: avoid icmp_send in ip_fragment */
724 skb->local_df = 1;
725
726 ip6_local_out(skb);
727
728 LeaveFunction(10);
729
730 return NF_STOLEN;
731
732tx_error_icmp:
733 dst_link_failure(skb);
734tx_error:
735 kfree_skb(skb);
736 LeaveFunction(10);
737 return NF_STOLEN;
738}
739#endif
740
741
742/*
743 * Direct Routing transmitter
744 * Used for ANY protocol
745 */
746int
747ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
748 struct ip_vs_protocol *pp)
749{
750 struct rtable *rt; /* Route to the other host */
751 struct iphdr *iph = ip_hdr(skb);
752 int mtu;
753
754 EnterFunction(10);
755
756 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
757 goto tx_error_icmp;
758
759 /* MTU checking */
760 mtu = dst_mtu(&rt->u.dst);
761 if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
762 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
763 ip_rt_put(rt);
764 IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
765 goto tx_error;
766 }
767
768 /*
769 * Call ip_send_check because we are not sure it is called
770 * after ip_defrag. Is copy-on-write needed?
771 */
772 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
773 ip_rt_put(rt);
774 return NF_STOLEN;
775 }
776 ip_send_check(ip_hdr(skb));
777
778 /* drop old route */
779 dst_release(skb->dst);
780 skb->dst = &rt->u.dst;
781
782 /* Another hack: avoid icmp_send in ip_fragment */
783 skb->local_df = 1;
784
785 IP_VS_XMIT(PF_INET, skb, rt);
786
787 LeaveFunction(10);
788 return NF_STOLEN;
789
790 tx_error_icmp:
791 dst_link_failure(skb);
792 tx_error:
793 kfree_skb(skb);
794 LeaveFunction(10);
795 return NF_STOLEN;
796}
797
798#ifdef CONFIG_IP_VS_IPV6
799int
800ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
801 struct ip_vs_protocol *pp)
802{
803 struct rt6_info *rt; /* Route to the other host */
804 int mtu;
805
806 EnterFunction(10);
807
808 rt = __ip_vs_get_out_rt_v6(cp);
809 if (!rt)
810 goto tx_error_icmp;
811
812 /* MTU checking */
813 mtu = dst_mtu(&rt->u.dst);
814 if (skb->len > mtu) {
815 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
816 dst_release(&rt->u.dst);
817 IP_VS_DBG_RL("ip_vs_dr_xmit_v6(): frag needed\n");
818 goto tx_error;
819 }
820
821 /*
822 * Call ip_send_check because we are not sure it is called
823 * after ip_defrag. Is copy-on-write needed?
824 */
825 skb = skb_share_check(skb, GFP_ATOMIC);
826 if (unlikely(skb == NULL)) {
827 dst_release(&rt->u.dst);
828 return NF_STOLEN;
829 }
830
831 /* drop old route */
832 dst_release(skb->dst);
833 skb->dst = &rt->u.dst;
834
835 /* Another hack: avoid icmp_send in ip_fragment */
836 skb->local_df = 1;
837
838 IP_VS_XMIT(PF_INET6, skb, rt);
839
840 LeaveFunction(10);
841 return NF_STOLEN;
842
843tx_error_icmp:
844 dst_link_failure(skb);
845tx_error:
846 kfree_skb(skb);
847 LeaveFunction(10);
848 return NF_STOLEN;
849}
850#endif
851
852
853/*
854 * ICMP packet transmitter
855 * called by the ip_vs_in_icmp
856 */
857int
858ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
859 struct ip_vs_protocol *pp, int offset)
860{
861 struct rtable *rt; /* Route to the other host */
862 int mtu;
863 int rc;
864
865 EnterFunction(10);
866
867 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
868 forwarded directly here, because there is no need to
869 translate address/port back */
870 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
871 if (cp->packet_xmit)
872 rc = cp->packet_xmit(skb, cp, pp);
873 else
874 rc = NF_ACCEPT;
875 /* do not touch skb anymore */
876 atomic_inc(&cp->in_pkts);
877 goto out;
878 }
879
880 /*
881 * mangle and send the packet here (only for VS/NAT)
882 */
883
884 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
885 goto tx_error_icmp;
886
887 /* MTU checking */
888 mtu = dst_mtu(&rt->u.dst);
889 if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
890 ip_rt_put(rt);
891 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
892 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
893 goto tx_error;
894 }
895
896 /* copy-on-write the packet before mangling it */
897 if (!skb_make_writable(skb, offset))
898 goto tx_error_put;
899
900 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
901 goto tx_error_put;
902
903 /* drop the old route when skb is not shared */
904 dst_release(skb->dst);
905 skb->dst = &rt->u.dst;
906
907 ip_vs_nat_icmp(skb, pp, cp, 0);
908
909 /* Another hack: avoid icmp_send in ip_fragment */
910 skb->local_df = 1;
911
912 IP_VS_XMIT(PF_INET, skb, rt);
913
914 rc = NF_STOLEN;
915 goto out;
916
917 tx_error_icmp:
918 dst_link_failure(skb);
919 tx_error:
920 dev_kfree_skb(skb);
921 rc = NF_STOLEN;
922 out:
923 LeaveFunction(10);
924 return rc;
925 tx_error_put:
926 ip_rt_put(rt);
927 goto tx_error;
928}
929
930#ifdef CONFIG_IP_VS_IPV6
931int
932ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
933 struct ip_vs_protocol *pp, int offset)
934{
935 struct rt6_info *rt; /* Route to the other host */
936 int mtu;
937 int rc;
938
939 EnterFunction(10);
940
941 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
942 forwarded directly here, because there is no need to
943 translate address/port back */
944 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
945 if (cp->packet_xmit)
946 rc = cp->packet_xmit(skb, cp, pp);
947 else
948 rc = NF_ACCEPT;
949 /* do not touch skb anymore */
950 atomic_inc(&cp->in_pkts);
951 goto out;
952 }
953
954 /*
955 * mangle and send the packet here (only for VS/NAT)
956 */
957
958 rt = __ip_vs_get_out_rt_v6(cp);
959 if (!rt)
960 goto tx_error_icmp;
961
962 /* MTU checking */
963 mtu = dst_mtu(&rt->u.dst);
964 if (skb->len > mtu) {
965 dst_release(&rt->u.dst);
966 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
967 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
968 goto tx_error;
969 }
970
971 /* copy-on-write the packet before mangling it */
972 if (!skb_make_writable(skb, offset))
973 goto tx_error_put;
974
975 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
976 goto tx_error_put;
977
978 /* drop the old route when skb is not shared */
979 dst_release(skb->dst);
980 skb->dst = &rt->u.dst;
981
982 ip_vs_nat_icmp_v6(skb, pp, cp, 0);
983
984 /* Another hack: avoid icmp_send in ip_fragment */
985 skb->local_df = 1;
986
987 IP_VS_XMIT(PF_INET6, skb, rt);
988
989 rc = NF_STOLEN;
990 goto out;
991
992tx_error_icmp:
993 dst_link_failure(skb);
994tx_error:
995 dev_kfree_skb(skb);
996 rc = NF_STOLEN;
997out:
998 LeaveFunction(10);
999 return rc;
1000tx_error_put:
1001 dst_release(&rt->u.dst);
1002 goto tx_error;
1003}
1004#endif