author    David S. Miller <davem@davemloft.net>    2008-10-08 17:26:36 -0400
committer David S. Miller <davem@davemloft.net>    2008-10-08 17:26:36 -0400
commit    db2bf2476b2d99d91b4ce87e102dd3a61e92366f (patch)
tree      4b41acff6edf29fd7ea23bdadfb95bcf67165590 /net/ipv4
parent    02015180e2509afd2e3fe3790a333b30708a116b (diff)
parent    a5e8546a8bff5d2047adc279df5753c44ba7b1a1 (diff)
Merge branch 'lvs-next-2.6' of git://git.kernel.org/pub/scm/linux/kernel/git/horms/lvs-2.6
Conflicts: net/netfilter/Kconfig
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig                    |    2
-rw-r--r--  net/ipv4/Makefile                   |    1
-rw-r--r--  net/ipv4/ipvs/Kconfig               |  239
-rw-r--r--  net/ipv4/ipvs/Makefile              |   33
-rw-r--r--  net/ipv4/ipvs/ip_vs_app.c           |  622
-rw-r--r--  net/ipv4/ipvs/ip_vs_conn.c          | 1110
-rw-r--r--  net/ipv4/ipvs/ip_vs_core.c          | 1542
-rw-r--r--  net/ipv4/ipvs/ip_vs_ctl.c           | 3441
-rw-r--r--  net/ipv4/ipvs/ip_vs_est.c           |  166
-rw-r--r--  net/ipv4/ipvs/ip_vs_ftp.c           |  410
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblc.c          |  555
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblcr.c         |  755
-rw-r--r--  net/ipv4/ipvs/ip_vs_lc.c            |  103
-rw-r--r--  net/ipv4/ipvs/ip_vs_nq.c            |  138
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto.c         |  288
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_ah_esp.c  |  235
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_tcp.c     |  732
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_udp.c     |  533
-rw-r--r--  net/ipv4/ipvs/ip_vs_rr.c            |  112
-rw-r--r--  net/ipv4/ipvs/ip_vs_sched.c         |  251
-rw-r--r--  net/ipv4/ipvs/ip_vs_sed.c           |  140
-rw-r--r--  net/ipv4/ipvs/ip_vs_sh.c            |  258
-rw-r--r--  net/ipv4/ipvs/ip_vs_sync.c          |  940
-rw-r--r--  net/ipv4/ipvs/ip_vs_wlc.c           |  128
-rw-r--r--  net/ipv4/ipvs/ip_vs_wrr.c           |  237
-rw-r--r--  net/ipv4/ipvs/ip_vs_xmit.c          | 1004
26 files changed, 0 insertions, 13975 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 591ea23639ca..691268f3a359 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -630,5 +630,3 @@ config TCP_MD5SIG
 
 	  If unsure, say N.
 
-source "net/ipv4/ipvs/Kconfig"
-
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ad40ef3f9ebc..80ff87ce43aa 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -33,7 +33,6 @@ obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
 obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
 obj-$(CONFIG_IP_PNP) += ipconfig.o
 obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
-obj-$(CONFIG_IP_VS) += ipvs/
 obj-$(CONFIG_INET_DIAG) += inet_diag.o
 obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
deleted file mode 100644
index de6004de80bc..000000000000
--- a/net/ipv4/ipvs/Kconfig
+++ /dev/null
@@ -1,239 +0,0 @@
1#
2# IP Virtual Server configuration
3#
4menuconfig IP_VS
5 tristate "IP virtual server support (EXPERIMENTAL)"
6 depends on NETFILTER
7 ---help---
8 IP Virtual Server support will let you build a high-performance
9 virtual server based on cluster of two or more real servers. This
10 option must be enabled for at least one of the clustered computers
11 that will take care of intercepting incoming connections to a
12 single IP address and scheduling them to real servers.
13
14 Three request dispatching techniques are implemented, they are
15 virtual server via NAT, virtual server via tunneling and virtual
16 server via direct routing. The several scheduling algorithms can
17 be used to choose which server the connection is directed to,
18 thus load balancing can be achieved among the servers. For more
19 information and its administration program, please visit the
20 following URL: <http://www.linuxvirtualserver.org/>.
21
22 If you want to compile it in kernel, say Y. To compile it as a
23 module, choose M here. If unsure, say N.
24
25if IP_VS
26
27config IP_VS_IPV6
28 bool "IPv6 support for IPVS (DANGEROUS)"
29 depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6)
30 ---help---
31 Add IPv6 support to IPVS. This is incomplete and might be dangerous.
32
33 Say N if unsure.
34
35config IP_VS_DEBUG
36 bool "IP virtual server debugging"
37 ---help---
38 Say Y here if you want to get additional messages useful in
39 debugging the IP virtual server code. You can change the debug
40 level in /proc/sys/net/ipv4/vs/debug_level
41
42config IP_VS_TAB_BITS
43 int "IPVS connection table size (the Nth power of 2)"
44 range 8 20
45 default 12
46 ---help---
47 The IPVS connection hash table uses the chaining scheme to handle
48 hash collisions. Using a big IPVS connection hash table will greatly
49 reduce conflicts when there are hundreds of thousands of connections
50 in the hash table.
51
52 Note the table size must be power of 2. The table size will be the
53 value of 2 to the your input number power. The number to choose is
54 from 8 to 20, the default number is 12, which means the table size
55 is 4096. Don't input the number too small, otherwise you will lose
56 performance on it. You can adapt the table size yourself, according
57 to your virtual server application. It is good to set the table size
58 not far less than the number of connections per second multiplying
59 average lasting time of connection in the table. For example, your
60 virtual server gets 200 connections per second, the connection lasts
61 for 200 seconds in average in the connection table, the table size
62 should be not far less than 200x200, it is good to set the table
63 size 32768 (2**15).
64
65 Another note that each connection occupies 128 bytes effectively and
66 each hash entry uses 8 bytes, so you can estimate how much memory is
67 needed for your box.
68
69comment "IPVS transport protocol load balancing support"
70
71config IP_VS_PROTO_TCP
72 bool "TCP load balancing support"
73 ---help---
74 This option enables support for load balancing TCP transport
75 protocol. Say Y if unsure.
76
77config IP_VS_PROTO_UDP
78 bool "UDP load balancing support"
79 ---help---
80 This option enables support for load balancing UDP transport
81 protocol. Say Y if unsure.
82
83config IP_VS_PROTO_AH_ESP
84 bool
85 depends on UNDEFINED
86
87config IP_VS_PROTO_ESP
88 bool "ESP load balancing support"
89 select IP_VS_PROTO_AH_ESP
90 ---help---
91 This option enables support for load balancing ESP (Encapsulation
92 Security Payload) transport protocol. Say Y if unsure.
93
94config IP_VS_PROTO_AH
95 bool "AH load balancing support"
96 select IP_VS_PROTO_AH_ESP
97 ---help---
98 This option enables support for load balancing AH (Authentication
99 Header) transport protocol. Say Y if unsure.
100
101comment "IPVS scheduler"
102
103config IP_VS_RR
104 tristate "round-robin scheduling"
105 ---help---
106 The robin-robin scheduling algorithm simply directs network
107 connections to different real servers in a round-robin manner.
108
109 If you want to compile it in kernel, say Y. To compile it as a
110 module, choose M here. If unsure, say N.
111
112config IP_VS_WRR
113 tristate "weighted round-robin scheduling"
114 ---help---
115 The weighted robin-robin scheduling algorithm directs network
116 connections to different real servers based on server weights
117 in a round-robin manner. Servers with higher weights receive
118 new connections first than those with less weights, and servers
119 with higher weights get more connections than those with less
120 weights and servers with equal weights get equal connections.
121
122 If you want to compile it in kernel, say Y. To compile it as a
123 module, choose M here. If unsure, say N.
124
125config IP_VS_LC
126 tristate "least-connection scheduling"
127 ---help---
128 The least-connection scheduling algorithm directs network
129 connections to the server with the least number of active
130 connections.
131
132 If you want to compile it in kernel, say Y. To compile it as a
133 module, choose M here. If unsure, say N.
134
135config IP_VS_WLC
136 tristate "weighted least-connection scheduling"
137 ---help---
138 The weighted least-connection scheduling algorithm directs network
139 connections to the server with the least active connections
140 normalized by the server weight.
141
142 If you want to compile it in kernel, say Y. To compile it as a
143 module, choose M here. If unsure, say N.
144
145config IP_VS_LBLC
146 tristate "locality-based least-connection scheduling"
147 ---help---
148 The locality-based least-connection scheduling algorithm is for
149 destination IP load balancing. It is usually used in cache cluster.
150 This algorithm usually directs packet destined for an IP address to
151 its server if the server is alive and under load. If the server is
152 overloaded (its active connection numbers is larger than its weight)
153 and there is a server in its half load, then allocate the weighted
154 least-connection server to this IP address.
155
156 If you want to compile it in kernel, say Y. To compile it as a
157 module, choose M here. If unsure, say N.
158
159config IP_VS_LBLCR
160 tristate "locality-based least-connection with replication scheduling"
161 ---help---
162 The locality-based least-connection with replication scheduling
163 algorithm is also for destination IP load balancing. It is
164 usually used in cache cluster. It differs from the LBLC scheduling
165 as follows: the load balancer maintains mappings from a target
166 to a set of server nodes that can serve the target. Requests for
167 a target are assigned to the least-connection node in the target's
168 server set. If all the node in the server set are over loaded,
169 it picks up a least-connection node in the cluster and adds it
170 in the sever set for the target. If the server set has not been
171 modified for the specified time, the most loaded node is removed
172 from the server set, in order to avoid high degree of replication.
173
174 If you want to compile it in kernel, say Y. To compile it as a
175 module, choose M here. If unsure, say N.
176
177config IP_VS_DH
178 tristate "destination hashing scheduling"
179 ---help---
180 The destination hashing scheduling algorithm assigns network
181 connections to the servers through looking up a statically assigned
182 hash table by their destination IP addresses.
183
184 If you want to compile it in kernel, say Y. To compile it as a
185 module, choose M here. If unsure, say N.
186
187config IP_VS_SH
188 tristate "source hashing scheduling"
189 ---help---
190 The source hashing scheduling algorithm assigns network
191 connections to the servers through looking up a statically assigned
192 hash table by their source IP addresses.
193
194 If you want to compile it in kernel, say Y. To compile it as a
195 module, choose M here. If unsure, say N.
196
197config IP_VS_SED
198 tristate "shortest expected delay scheduling"
199 ---help---
200 The shortest expected delay scheduling algorithm assigns network
201 connections to the server with the shortest expected delay. The
202 expected delay that the job will experience is (Ci + 1) / Ui if
203 sent to the ith server, in which Ci is the number of connections
204 on the ith server and Ui is the fixed service rate (weight)
205 of the ith server.
206
207 If you want to compile it in kernel, say Y. To compile it as a
208 module, choose M here. If unsure, say N.
209
210config IP_VS_NQ
211 tristate "never queue scheduling"
212 ---help---
213 The never queue scheduling algorithm adopts a two-speed model.
214 When there is an idle server available, the job will be sent to
215 the idle server, instead of waiting for a fast one. When there
216 is no idle server available, the job will be sent to the server
217 that minimize its expected delay (The Shortest Expected Delay
218 scheduling algorithm).
219
220 If you want to compile it in kernel, say Y. To compile it as a
221 module, choose M here. If unsure, say N.
222
223comment 'IPVS application helper'
224
225config IP_VS_FTP
226 tristate "FTP protocol helper"
227 depends on IP_VS_PROTO_TCP
228 ---help---
229 FTP is a protocol that transfers IP address and/or port number in
230 the payload. In the virtual server via Network Address Translation,
231 the IP address and port number of real servers cannot be sent to
232 clients in ftp connections directly, so FTP protocol helper is
233 required for tracking the connection and mangling it back to that of
234 virtual service.
235
236 If you want to compile it in kernel, say Y. To compile it as a
237 module, choose M here. If unsure, say N.
238
239endif # IP_VS
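
The IP_VS_TAB_BITS help text above sizes the connection hash table from the expected connection rate and the average connection lifetime. As a rough illustration of that arithmetic, here is a user-space sketch (not part of this tree; the function name ip_vs_suggest_tab_bits and its clamping to the option's 8..20 range are assumptions made only for the example):

/*
 * Illustrative sketch of the IP_VS_TAB_BITS rule of thumb: table size
 * close to connections-per-second multiplied by average connection
 * lifetime, expressed as a power of two between 2^8 and 2^20.
 */
#include <stdio.h>

static int ip_vs_suggest_tab_bits(unsigned long conns_per_sec,
                                  unsigned long avg_lifetime_sec)
{
        unsigned long target = conns_per_sec * avg_lifetime_sec;
        int bits = 8;                           /* option minimum */

        while (bits < 20 && (1UL << bits) < target)
                bits++;                         /* option maximum is 20 */

        /* prefer the lower power of two when it is closer to the target,
         * as the help text does for its 200 x 200 = 40000 example (2^15) */
        if (bits > 8 && (1UL << bits) >= target &&
            target - (1UL << (bits - 1)) < (1UL << bits) - target)
                bits--;

        return bits;
}

int main(void)
{
        /* the example from the help text: 200 conn/s lasting ~200 s each */
        int bits = ip_vs_suggest_tab_bits(200, 200);

        printf("suggested IP_VS_TAB_BITS = %d (table size %lu)\n",
               bits, 1UL << bits);              /* prints 15 (32768) */
        return 0;
}

For the help text's own example of 200 connections per second lasting about 200 seconds, this picks 2^15 = 32768 entries, matching the value suggested there.
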
diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile
deleted file mode 100644
index 73a46fe1fe4c..000000000000
--- a/net/ipv4/ipvs/Makefile
+++ /dev/null
@@ -1,33 +0,0 @@
1#
2# Makefile for the IPVS modules on top of IPv4.
3#
4
5# IPVS transport protocol load balancing support
6ip_vs_proto-objs-y :=
7ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
8ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
9ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
10
11ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
12 ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
13 ip_vs_est.o ip_vs_proto.o \
14 $(ip_vs_proto-objs-y)
15
16
17# IPVS core
18obj-$(CONFIG_IP_VS) += ip_vs.o
19
20# IPVS schedulers
21obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
22obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
23obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
24obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
25obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
26obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
27obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
28obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
29obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
30obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
31
32# IPVS application helpers
33obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
deleted file mode 100644
index 201b8ea3020d..000000000000
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ /dev/null
@@ -1,622 +0,0 @@
1/*
2 * ip_vs_app.c: Application module support for IPVS
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
12 * is that ip_vs_app module handles the reverse direction (incoming requests
13 * and outgoing responses).
14 *
15 * IP_MASQ_APP application masquerading module
16 *
17 * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
18 *
19 */
20
21#include <linux/module.h>
22#include <linux/kernel.h>
23#include <linux/skbuff.h>
24#include <linux/in.h>
25#include <linux/ip.h>
26#include <linux/netfilter.h>
27#include <net/net_namespace.h>
28#include <net/protocol.h>
29#include <net/tcp.h>
30#include <asm/system.h>
31#include <linux/stat.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34#include <linux/mutex.h>
35
36#include <net/ip_vs.h>
37
38EXPORT_SYMBOL(register_ip_vs_app);
39EXPORT_SYMBOL(unregister_ip_vs_app);
40EXPORT_SYMBOL(register_ip_vs_app_inc);
41
42/* ipvs application list head */
43static LIST_HEAD(ip_vs_app_list);
44static DEFINE_MUTEX(__ip_vs_app_mutex);
45
46
47/*
48 * Get an ip_vs_app object
49 */
50static inline int ip_vs_app_get(struct ip_vs_app *app)
51{
52 return try_module_get(app->module);
53}
54
55
56static inline void ip_vs_app_put(struct ip_vs_app *app)
57{
58 module_put(app->module);
59}
60
61
62/*
63 * Allocate/initialize app incarnation and register it in proto apps.
64 */
65static int
66ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
67{
68 struct ip_vs_protocol *pp;
69 struct ip_vs_app *inc;
70 int ret;
71
72 if (!(pp = ip_vs_proto_get(proto)))
73 return -EPROTONOSUPPORT;
74
75 if (!pp->unregister_app)
76 return -EOPNOTSUPP;
77
78 inc = kmemdup(app, sizeof(*inc), GFP_KERNEL);
79 if (!inc)
80 return -ENOMEM;
81 INIT_LIST_HEAD(&inc->p_list);
82 INIT_LIST_HEAD(&inc->incs_list);
83 inc->app = app;
84 inc->port = htons(port);
85 atomic_set(&inc->usecnt, 0);
86
87 if (app->timeouts) {
88 inc->timeout_table =
89 ip_vs_create_timeout_table(app->timeouts,
90 app->timeouts_size);
91 if (!inc->timeout_table) {
92 ret = -ENOMEM;
93 goto out;
94 }
95 }
96
97 ret = pp->register_app(inc);
98 if (ret)
99 goto out;
100
101 list_add(&inc->a_list, &app->incs_list);
102 IP_VS_DBG(9, "%s application %s:%u registered\n",
103 pp->name, inc->name, inc->port);
104
105 return 0;
106
107 out:
108 kfree(inc->timeout_table);
109 kfree(inc);
110 return ret;
111}
112
113
114/*
115 * Release app incarnation
116 */
117static void
118ip_vs_app_inc_release(struct ip_vs_app *inc)
119{
120 struct ip_vs_protocol *pp;
121
122 if (!(pp = ip_vs_proto_get(inc->protocol)))
123 return;
124
125 if (pp->unregister_app)
126 pp->unregister_app(inc);
127
128 IP_VS_DBG(9, "%s App %s:%u unregistered\n",
129 pp->name, inc->name, inc->port);
130
131 list_del(&inc->a_list);
132
133 kfree(inc->timeout_table);
134 kfree(inc);
135}
136
137
138/*
139 * Get reference to app inc (only called from softirq)
140 *
141 */
142int ip_vs_app_inc_get(struct ip_vs_app *inc)
143{
144 int result;
145
146 atomic_inc(&inc->usecnt);
147 if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
148 atomic_dec(&inc->usecnt);
149 return result;
150}
151
152
153/*
154 * Put the app inc (only called from timer or net softirq)
155 */
156void ip_vs_app_inc_put(struct ip_vs_app *inc)
157{
158 ip_vs_app_put(inc->app);
159 atomic_dec(&inc->usecnt);
160}
161
162
163/*
164 * Register an application incarnation in protocol applications
165 */
166int
167register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
168{
169 int result;
170
171 mutex_lock(&__ip_vs_app_mutex);
172
173 result = ip_vs_app_inc_new(app, proto, port);
174
175 mutex_unlock(&__ip_vs_app_mutex);
176
177 return result;
178}
179
180
181/*
182 * ip_vs_app registration routine
183 */
184int register_ip_vs_app(struct ip_vs_app *app)
185{
186 /* increase the module use count */
187 ip_vs_use_count_inc();
188
189 mutex_lock(&__ip_vs_app_mutex);
190
191 list_add(&app->a_list, &ip_vs_app_list);
192
193 mutex_unlock(&__ip_vs_app_mutex);
194
195 return 0;
196}
197
198
199/*
200 * ip_vs_app unregistration routine
201 * We are sure there are no app incarnations attached to services
202 */
203void unregister_ip_vs_app(struct ip_vs_app *app)
204{
205 struct ip_vs_app *inc, *nxt;
206
207 mutex_lock(&__ip_vs_app_mutex);
208
209 list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
210 ip_vs_app_inc_release(inc);
211 }
212
213 list_del(&app->a_list);
214
215 mutex_unlock(&__ip_vs_app_mutex);
216
217 /* decrease the module use count */
218 ip_vs_use_count_dec();
219}
220
221
222/*
223 * Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
224 */
225int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
226{
227 return pp->app_conn_bind(cp);
228}
229
230
231/*
232 * Unbind cp from application incarnation (called by cp destructor)
233 */
234void ip_vs_unbind_app(struct ip_vs_conn *cp)
235{
236 struct ip_vs_app *inc = cp->app;
237
238 if (!inc)
239 return;
240
241 if (inc->unbind_conn)
242 inc->unbind_conn(inc, cp);
243 if (inc->done_conn)
244 inc->done_conn(inc, cp);
245 ip_vs_app_inc_put(inc);
246 cp->app = NULL;
247}
248
249
250/*
251 * Fixes th->seq based on ip_vs_seq info.
252 */
253static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
254{
255 __u32 seq = ntohl(th->seq);
256
257 /*
258 * Adjust seq with delta-offset for all packets after
259 * the most recent resized pkt seq and with previous_delta offset
260 * for all packets before most recent resized pkt seq.
261 */
262 if (vseq->delta || vseq->previous_delta) {
263 if(after(seq, vseq->init_seq)) {
264 th->seq = htonl(seq + vseq->delta);
265 IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n",
266 vseq->delta);
267 } else {
268 th->seq = htonl(seq + vseq->previous_delta);
269 IP_VS_DBG(9, "vs_fix_seq(): added previous_delta "
270 "(%d) to seq\n", vseq->previous_delta);
271 }
272 }
273}
274
275
276/*
277 * Fixes th->ack_seq based on ip_vs_seq info.
278 */
279static inline void
280vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
281{
282 __u32 ack_seq = ntohl(th->ack_seq);
283
284 /*
285 * Adjust ack_seq with delta-offset for
286 * the packets AFTER most recent resized pkt has caused a shift
287 * for packets before most recent resized pkt, use previous_delta
288 */
289 if (vseq->delta || vseq->previous_delta) {
290 /* since ack_seq is the number of octet that is expected
291 to receive next, so compare it with init_seq+delta */
292 if(after(ack_seq, vseq->init_seq+vseq->delta)) {
293 th->ack_seq = htonl(ack_seq - vseq->delta);
294 IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta "
295 "(%d) from ack_seq\n", vseq->delta);
296
297 } else {
298 th->ack_seq = htonl(ack_seq - vseq->previous_delta);
299 IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted "
300 "previous_delta (%d) from ack_seq\n",
301 vseq->previous_delta);
302 }
303 }
304}
305
306
307/*
308 * Updates ip_vs_seq if pkt has been resized
309 * Assumes already checked proto==IPPROTO_TCP and diff!=0.
310 */
311static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
312 unsigned flag, __u32 seq, int diff)
313{
314 /* spinlock is to keep updating cp->flags atomic */
315 spin_lock(&cp->lock);
316 if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
317 vseq->previous_delta = vseq->delta;
318 vseq->delta += diff;
319 vseq->init_seq = seq;
320 cp->flags |= flag;
321 }
322 spin_unlock(&cp->lock);
323}
324
325static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
326 struct ip_vs_app *app)
327{
328 int diff;
329 const unsigned int tcp_offset = ip_hdrlen(skb);
330 struct tcphdr *th;
331 __u32 seq;
332
333 if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
334 return 0;
335
336 th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
337
338 /*
339 * Remember seq number in case this pkt gets resized
340 */
341 seq = ntohl(th->seq);
342
343 /*
344 * Fix seq stuff if flagged as so.
345 */
346 if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
347 vs_fix_seq(&cp->out_seq, th);
348 if (cp->flags & IP_VS_CONN_F_IN_SEQ)
349 vs_fix_ack_seq(&cp->in_seq, th);
350
351 /*
352 * Call private output hook function
353 */
354 if (app->pkt_out == NULL)
355 return 1;
356
357 if (!app->pkt_out(app, cp, skb, &diff))
358 return 0;
359
360 /*
361 * Update ip_vs seq stuff if len has changed.
362 */
363 if (diff != 0)
364 vs_seq_update(cp, &cp->out_seq,
365 IP_VS_CONN_F_OUT_SEQ, seq, diff);
366
367 return 1;
368}
369
370/*
371 * Output pkt hook. Will call bound ip_vs_app specific function
372 * called by ipvs packet handler, assumes previously checked cp!=NULL
373 * returns false if it can't handle packet (oom)
374 */
375int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
376{
377 struct ip_vs_app *app;
378
379 /*
380 * check if application module is bound to
381 * this ip_vs_conn.
382 */
383 if ((app = cp->app) == NULL)
384 return 1;
385
386 /* TCP is complicated */
387 if (cp->protocol == IPPROTO_TCP)
388 return app_tcp_pkt_out(cp, skb, app);
389
390 /*
391 * Call private output hook function
392 */
393 if (app->pkt_out == NULL)
394 return 1;
395
396 return app->pkt_out(app, cp, skb, NULL);
397}
398
399
400static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
401 struct ip_vs_app *app)
402{
403 int diff;
404 const unsigned int tcp_offset = ip_hdrlen(skb);
405 struct tcphdr *th;
406 __u32 seq;
407
408 if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
409 return 0;
410
411 th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
412
413 /*
414 * Remember seq number in case this pkt gets resized
415 */
416 seq = ntohl(th->seq);
417
418 /*
419 * Fix seq stuff if flagged as so.
420 */
421 if (cp->flags & IP_VS_CONN_F_IN_SEQ)
422 vs_fix_seq(&cp->in_seq, th);
423 if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
424 vs_fix_ack_seq(&cp->out_seq, th);
425
426 /*
427 * Call private input hook function
428 */
429 if (app->pkt_in == NULL)
430 return 1;
431
432 if (!app->pkt_in(app, cp, skb, &diff))
433 return 0;
434
435 /*
436 * Update ip_vs seq stuff if len has changed.
437 */
438 if (diff != 0)
439 vs_seq_update(cp, &cp->in_seq,
440 IP_VS_CONN_F_IN_SEQ, seq, diff);
441
442 return 1;
443}
444
445/*
446 * Input pkt hook. Will call bound ip_vs_app specific function
447 * called by ipvs packet handler, assumes previously checked cp!=NULL.
448 * returns false if can't handle packet (oom).
449 */
450int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
451{
452 struct ip_vs_app *app;
453
454 /*
455 * check if application module is bound to
456 * this ip_vs_conn.
457 */
458 if ((app = cp->app) == NULL)
459 return 1;
460
461 /* TCP is complicated */
462 if (cp->protocol == IPPROTO_TCP)
463 return app_tcp_pkt_in(cp, skb, app);
464
465 /*
466 * Call private input hook function
467 */
468 if (app->pkt_in == NULL)
469 return 1;
470
471 return app->pkt_in(app, cp, skb, NULL);
472}
473
474
475#ifdef CONFIG_PROC_FS
476/*
477 * /proc/net/ip_vs_app entry function
478 */
479
480static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
481{
482 struct ip_vs_app *app, *inc;
483
484 list_for_each_entry(app, &ip_vs_app_list, a_list) {
485 list_for_each_entry(inc, &app->incs_list, a_list) {
486 if (pos-- == 0)
487 return inc;
488 }
489 }
490 return NULL;
491
492}
493
494static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
495{
496 mutex_lock(&__ip_vs_app_mutex);
497
498 return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
499}
500
501static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
502{
503 struct ip_vs_app *inc, *app;
504 struct list_head *e;
505
506 ++*pos;
507 if (v == SEQ_START_TOKEN)
508 return ip_vs_app_idx(0);
509
510 inc = v;
511 app = inc->app;
512
513 if ((e = inc->a_list.next) != &app->incs_list)
514 return list_entry(e, struct ip_vs_app, a_list);
515
516 /* go on to next application */
517 for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
518 app = list_entry(e, struct ip_vs_app, a_list);
519 list_for_each_entry(inc, &app->incs_list, a_list) {
520 return inc;
521 }
522 }
523 return NULL;
524}
525
526static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
527{
528 mutex_unlock(&__ip_vs_app_mutex);
529}
530
531static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
532{
533 if (v == SEQ_START_TOKEN)
534 seq_puts(seq, "prot port usecnt name\n");
535 else {
536 const struct ip_vs_app *inc = v;
537
538 seq_printf(seq, "%-3s %-7u %-6d %-17s\n",
539 ip_vs_proto_name(inc->protocol),
540 ntohs(inc->port),
541 atomic_read(&inc->usecnt),
542 inc->name);
543 }
544 return 0;
545}
546
547static const struct seq_operations ip_vs_app_seq_ops = {
548 .start = ip_vs_app_seq_start,
549 .next = ip_vs_app_seq_next,
550 .stop = ip_vs_app_seq_stop,
551 .show = ip_vs_app_seq_show,
552};
553
554static int ip_vs_app_open(struct inode *inode, struct file *file)
555{
556 return seq_open(file, &ip_vs_app_seq_ops);
557}
558
559static const struct file_operations ip_vs_app_fops = {
560 .owner = THIS_MODULE,
561 .open = ip_vs_app_open,
562 .read = seq_read,
563 .llseek = seq_lseek,
564 .release = seq_release,
565};
566#endif
567
568
569/*
570 * Replace a segment of data with a new segment
571 */
572int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
573 char *o_buf, int o_len, char *n_buf, int n_len)
574{
575 int diff;
576 int o_offset;
577 int o_left;
578
579 EnterFunction(9);
580
581 diff = n_len - o_len;
582 o_offset = o_buf - (char *)skb->data;
583 /* The length of left data after o_buf+o_len in the skb data */
584 o_left = skb->len - (o_offset + o_len);
585
586 if (diff <= 0) {
587 memmove(o_buf + n_len, o_buf + o_len, o_left);
588 memcpy(o_buf, n_buf, n_len);
589 skb_trim(skb, skb->len + diff);
590 } else if (diff <= skb_tailroom(skb)) {
591 skb_put(skb, diff);
592 memmove(o_buf + n_len, o_buf + o_len, o_left);
593 memcpy(o_buf, n_buf, n_len);
594 } else {
595 if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
596 return -ENOMEM;
597 skb_put(skb, diff);
598 memmove(skb->data + o_offset + n_len,
599 skb->data + o_offset + o_len, o_left);
600 skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len);
601 }
602
603 /* must update the iph total length here */
604 ip_hdr(skb)->tot_len = htons(skb->len);
605
606 LeaveFunction(9);
607 return 0;
608}
609
610
611int __init ip_vs_app_init(void)
612{
613 /* we will replace it with proc_net_ipvs_create() soon */
614 proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops);
615 return 0;
616}
617
618
619void ip_vs_app_cleanup(void)
620{
621 proc_net_remove(&init_net, "ip_vs_app");
622}
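
The sequence-number bookkeeping above (vs_fix_seq, vs_fix_ack_seq and vs_seq_update) is how an application helper keeps a TCP stream consistent after it resizes a payload: segments after the most recently resized packet get the new delta, while retransmissions of older data get the previous delta. A minimal user-space sketch of the same idea follows; all names (struct seq_adjust, fix_seq, fix_ack_seq, seq_update) are invented for illustration and are not kernel APIs:

#include <stdint.h>
#include <stdio.h>

struct seq_adjust {
        uint32_t init_seq;      /* seq of the most recently resized packet */
        int32_t delta;          /* cumulative size change so far */
        int32_t previous_delta; /* size change before the latest resize */
};

/* "after" in wrapping sequence-number arithmetic, like the kernel macro */
static int seq_after(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) > 0;
}

static uint32_t fix_seq(const struct seq_adjust *adj, uint32_t seq)
{
        return seq + (seq_after(seq, adj->init_seq) ?
                      adj->delta : adj->previous_delta);
}

static uint32_t fix_ack_seq(const struct seq_adjust *adj, uint32_t ack)
{
        /* the ack names the next expected octet, so compare against
           init_seq + delta, as vs_fix_ack_seq() does above */
        return ack - (seq_after(ack, adj->init_seq + adj->delta) ?
                      adj->delta : adj->previous_delta);
}

static void seq_update(struct seq_adjust *adj, uint32_t seq, int32_t diff)
{
        adj->previous_delta = adj->delta;
        adj->delta += diff;
        adj->init_seq = seq;
}

int main(void)
{
        struct seq_adjust adj = { 0, 0, 0 };

        /* a helper grows the payload of the segment at seq 1000 by 8 bytes */
        seq_update(&adj, 1000, 8);

        printf("%u\n", fix_seq(&adj, 2000));     /* later data: 2008 */
        printf("%u\n", fix_seq(&adj, 500));      /* old retransmit: 500 */
        printf("%u\n", fix_ack_seq(&adj, 2008)); /* peer ack maps back to 2000 */
        return 0;
}
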
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
deleted file mode 100644
index 9a24332fbed8..000000000000
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ /dev/null
@@ -1,1110 +0,0 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 * Julian Anastasov <ja@ssi.bg>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19 * and others. Many code here is taken from IP MASQ code of kernel 2.2.
20 *
21 * Changes:
22 *
23 */
24
25#include <linux/interrupt.h>
26#include <linux/in.h>
27#include <linux/net.h>
28#include <linux/kernel.h>
29#include <linux/module.h>
30#include <linux/vmalloc.h>
31#include <linux/proc_fs.h> /* for proc_net_* */
32#include <linux/seq_file.h>
33#include <linux/jhash.h>
34#include <linux/random.h>
35
36#include <net/net_namespace.h>
37#include <net/ip_vs.h>
38
39
40/*
41 * Connection hash table: for input and output packets lookups of IPVS
42 */
43static struct list_head *ip_vs_conn_tab;
44
45/* SLAB cache for IPVS connections */
46static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
47
48/* counter for current IPVS connections */
49static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
50
51/* counter for no client port connections */
52static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
53
54/* random value for IPVS connection hash */
55static unsigned int ip_vs_conn_rnd;
56
57/*
58 * Fine locking granularity for big connection hash table
59 */
60#define CT_LOCKARRAY_BITS 4
61#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
62#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
63
64struct ip_vs_aligned_lock
65{
66 rwlock_t l;
67} __attribute__((__aligned__(SMP_CACHE_BYTES)));
68
69/* lock array for conn table */
70static struct ip_vs_aligned_lock
71__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
72
73static inline void ct_read_lock(unsigned key)
74{
75 read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
76}
77
78static inline void ct_read_unlock(unsigned key)
79{
80 read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
81}
82
83static inline void ct_write_lock(unsigned key)
84{
85 write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
86}
87
88static inline void ct_write_unlock(unsigned key)
89{
90 write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
91}
92
93static inline void ct_read_lock_bh(unsigned key)
94{
95 read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
96}
97
98static inline void ct_read_unlock_bh(unsigned key)
99{
100 read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
101}
102
103static inline void ct_write_lock_bh(unsigned key)
104{
105 write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
106}
107
108static inline void ct_write_unlock_bh(unsigned key)
109{
110 write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
111}
112
113
114/*
115 * Returns hash value for IPVS connection entry
116 */
117static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
118 const union nf_inet_addr *addr,
119 __be16 port)
120{
121#ifdef CONFIG_IP_VS_IPV6
122 if (af == AF_INET6)
123 return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
124 (__force u32)port, proto, ip_vs_conn_rnd)
125 & IP_VS_CONN_TAB_MASK;
126#endif
127 return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
128 ip_vs_conn_rnd)
129 & IP_VS_CONN_TAB_MASK;
130}
131
132
133/*
134 * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
135 * returns bool success.
136 */
137static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
138{
139 unsigned hash;
140 int ret;
141
142 /* Hash by protocol, client address and port */
143 hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
144
145 ct_write_lock(hash);
146
147 if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
148 list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
149 cp->flags |= IP_VS_CONN_F_HASHED;
150 atomic_inc(&cp->refcnt);
151 ret = 1;
152 } else {
153 IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
154 "called from %p\n", __builtin_return_address(0));
155 ret = 0;
156 }
157
158 ct_write_unlock(hash);
159
160 return ret;
161}
162
163
164/*
165 * UNhashes ip_vs_conn from ip_vs_conn_tab.
166 * returns bool success.
167 */
168static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
169{
170 unsigned hash;
171 int ret;
172
173 /* unhash it and decrease its reference counter */
174 hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
175
176 ct_write_lock(hash);
177
178 if (cp->flags & IP_VS_CONN_F_HASHED) {
179 list_del(&cp->c_list);
180 cp->flags &= ~IP_VS_CONN_F_HASHED;
181 atomic_dec(&cp->refcnt);
182 ret = 1;
183 } else
184 ret = 0;
185
186 ct_write_unlock(hash);
187
188 return ret;
189}
190
191
192/*
193 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
194 * Called for pkts coming from OUTside-to-INside.
195 * s_addr, s_port: pkt source address (foreign host)
196 * d_addr, d_port: pkt dest address (load balancer)
197 */
198static inline struct ip_vs_conn *__ip_vs_conn_in_get
199(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
200 const union nf_inet_addr *d_addr, __be16 d_port)
201{
202 unsigned hash;
203 struct ip_vs_conn *cp;
204
205 hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
206
207 ct_read_lock(hash);
208
209 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
210 if (cp->af == af &&
211 ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
212 ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
213 s_port == cp->cport && d_port == cp->vport &&
214 ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
215 protocol == cp->protocol) {
216 /* HIT */
217 atomic_inc(&cp->refcnt);
218 ct_read_unlock(hash);
219 return cp;
220 }
221 }
222
223 ct_read_unlock(hash);
224
225 return NULL;
226}
227
228struct ip_vs_conn *ip_vs_conn_in_get
229(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
230 const union nf_inet_addr *d_addr, __be16 d_port)
231{
232 struct ip_vs_conn *cp;
233
234 cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port);
235 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
236 cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr,
237 d_port);
238
239 IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
240 ip_vs_proto_name(protocol),
241 IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
242 IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
243 cp ? "hit" : "not hit");
244
245 return cp;
246}
247
248/* Get reference to connection template */
249struct ip_vs_conn *ip_vs_ct_in_get
250(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
251 const union nf_inet_addr *d_addr, __be16 d_port)
252{
253 unsigned hash;
254 struct ip_vs_conn *cp;
255
256 hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
257
258 ct_read_lock(hash);
259
260 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
261 if (cp->af == af &&
262 ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
263 ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
264 s_port == cp->cport && d_port == cp->vport &&
265 cp->flags & IP_VS_CONN_F_TEMPLATE &&
266 protocol == cp->protocol) {
267 /* HIT */
268 atomic_inc(&cp->refcnt);
269 goto out;
270 }
271 }
272 cp = NULL;
273
274 out:
275 ct_read_unlock(hash);
276
277 IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
278 ip_vs_proto_name(protocol),
279 IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
280 IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
281 cp ? "hit" : "not hit");
282
283 return cp;
284}
285
286/*
287 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
288 * Called for pkts coming from inside-to-OUTside.
289 * s_addr, s_port: pkt source address (inside host)
290 * d_addr, d_port: pkt dest address (foreign host)
291 */
292struct ip_vs_conn *ip_vs_conn_out_get
293(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
294 const union nf_inet_addr *d_addr, __be16 d_port)
295{
296 unsigned hash;
297 struct ip_vs_conn *cp, *ret=NULL;
298
299 /*
300 * Check for "full" addressed entries
301 */
302 hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);
303
304 ct_read_lock(hash);
305
306 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
307 if (cp->af == af &&
308 ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
309 ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
310 d_port == cp->cport && s_port == cp->dport &&
311 protocol == cp->protocol) {
312 /* HIT */
313 atomic_inc(&cp->refcnt);
314 ret = cp;
315 break;
316 }
317 }
318
319 ct_read_unlock(hash);
320
321 IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
322 ip_vs_proto_name(protocol),
323 IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
324 IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
325 ret ? "hit" : "not hit");
326
327 return ret;
328}
329
330
331/*
332 * Put back the conn and restart its timer with its timeout
333 */
334void ip_vs_conn_put(struct ip_vs_conn *cp)
335{
336 /* reset it expire in its timeout */
337 mod_timer(&cp->timer, jiffies+cp->timeout);
338
339 __ip_vs_conn_put(cp);
340}
341
342
343/*
344 * Fill a no_client_port connection with a client port number
345 */
346void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
347{
348 if (ip_vs_conn_unhash(cp)) {
349 spin_lock(&cp->lock);
350 if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
351 atomic_dec(&ip_vs_conn_no_cport_cnt);
352 cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
353 cp->cport = cport;
354 }
355 spin_unlock(&cp->lock);
356
357 /* hash on new dport */
358 ip_vs_conn_hash(cp);
359 }
360}
361
362
363/*
364 * Bind a connection entry with the corresponding packet_xmit.
365 * Called by ip_vs_conn_new.
366 */
367static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
368{
369 switch (IP_VS_FWD_METHOD(cp)) {
370 case IP_VS_CONN_F_MASQ:
371 cp->packet_xmit = ip_vs_nat_xmit;
372 break;
373
374 case IP_VS_CONN_F_TUNNEL:
375 cp->packet_xmit = ip_vs_tunnel_xmit;
376 break;
377
378 case IP_VS_CONN_F_DROUTE:
379 cp->packet_xmit = ip_vs_dr_xmit;
380 break;
381
382 case IP_VS_CONN_F_LOCALNODE:
383 cp->packet_xmit = ip_vs_null_xmit;
384 break;
385
386 case IP_VS_CONN_F_BYPASS:
387 cp->packet_xmit = ip_vs_bypass_xmit;
388 break;
389 }
390}
391
392#ifdef CONFIG_IP_VS_IPV6
393static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
394{
395 switch (IP_VS_FWD_METHOD(cp)) {
396 case IP_VS_CONN_F_MASQ:
397 cp->packet_xmit = ip_vs_nat_xmit_v6;
398 break;
399
400 case IP_VS_CONN_F_TUNNEL:
401 cp->packet_xmit = ip_vs_tunnel_xmit_v6;
402 break;
403
404 case IP_VS_CONN_F_DROUTE:
405 cp->packet_xmit = ip_vs_dr_xmit_v6;
406 break;
407
408 case IP_VS_CONN_F_LOCALNODE:
409 cp->packet_xmit = ip_vs_null_xmit;
410 break;
411
412 case IP_VS_CONN_F_BYPASS:
413 cp->packet_xmit = ip_vs_bypass_xmit_v6;
414 break;
415 }
416}
417#endif
418
419
420static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
421{
422 return atomic_read(&dest->activeconns)
423 + atomic_read(&dest->inactconns);
424}
425
426/*
427 * Bind a connection entry with a virtual service destination
428 * Called just after a new connection entry is created.
429 */
430static inline void
431ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
432{
433 /* if dest is NULL, then return directly */
434 if (!dest)
435 return;
436
437 /* Increase the refcnt counter of the dest */
438 atomic_inc(&dest->refcnt);
439
440 /* Bind with the destination and its corresponding transmitter */
441 if ((cp->flags & IP_VS_CONN_F_SYNC) &&
442 (!(cp->flags & IP_VS_CONN_F_TEMPLATE)))
443 /* if the connection is not template and is created
444 * by sync, preserve the activity flag.
445 */
446 cp->flags |= atomic_read(&dest->conn_flags) &
447 (~IP_VS_CONN_F_INACTIVE);
448 else
449 cp->flags |= atomic_read(&dest->conn_flags);
450 cp->dest = dest;
451
452 IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
453 "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
454 "dest->refcnt:%d\n",
455 ip_vs_proto_name(cp->protocol),
456 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
457 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
458 IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
459 ip_vs_fwd_tag(cp), cp->state,
460 cp->flags, atomic_read(&cp->refcnt),
461 atomic_read(&dest->refcnt));
462
463 /* Update the connection counters */
464 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
465 /* It is a normal connection, so increase the inactive
466 connection counter because it is in TCP SYNRECV
467 state (inactive) or other protocol inacive state */
468 if ((cp->flags & IP_VS_CONN_F_SYNC) &&
469 (!(cp->flags & IP_VS_CONN_F_INACTIVE)))
470 atomic_inc(&dest->activeconns);
471 else
472 atomic_inc(&dest->inactconns);
473 } else {
474 /* It is a persistent connection/template, so increase
475 the peristent connection counter */
476 atomic_inc(&dest->persistconns);
477 }
478
479 if (dest->u_threshold != 0 &&
480 ip_vs_dest_totalconns(dest) >= dest->u_threshold)
481 dest->flags |= IP_VS_DEST_F_OVERLOAD;
482}
483
484
485/*
486 * Check if there is a destination for the connection, if so
487 * bind the connection to the destination.
488 */
489struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
490{
491 struct ip_vs_dest *dest;
492
493 if ((cp) && (!cp->dest)) {
494 dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
495 &cp->vaddr, cp->vport,
496 cp->protocol);
497 ip_vs_bind_dest(cp, dest);
498 return dest;
499 } else
500 return NULL;
501}
502
503
504/*
505 * Unbind a connection entry with its VS destination
506 * Called by the ip_vs_conn_expire function.
507 */
508static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
509{
510 struct ip_vs_dest *dest = cp->dest;
511
512 if (!dest)
513 return;
514
515 IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
516 "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
517 "dest->refcnt:%d\n",
518 ip_vs_proto_name(cp->protocol),
519 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
520 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
521 IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
522 ip_vs_fwd_tag(cp), cp->state,
523 cp->flags, atomic_read(&cp->refcnt),
524 atomic_read(&dest->refcnt));
525
526 /* Update the connection counters */
527 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
528 /* It is a normal connection, so decrease the inactconns
529 or activeconns counter */
530 if (cp->flags & IP_VS_CONN_F_INACTIVE) {
531 atomic_dec(&dest->inactconns);
532 } else {
533 atomic_dec(&dest->activeconns);
534 }
535 } else {
536 /* It is a persistent connection/template, so decrease
537 the peristent connection counter */
538 atomic_dec(&dest->persistconns);
539 }
540
541 if (dest->l_threshold != 0) {
542 if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
543 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
544 } else if (dest->u_threshold != 0) {
545 if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
546 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
547 } else {
548 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
549 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
550 }
551
552 /*
553 * Simply decrease the refcnt of the dest, because the
554 * dest will be either in service's destination list
555 * or in the trash.
556 */
557 atomic_dec(&dest->refcnt);
558}
559
560
561/*
562 * Checking if the destination of a connection template is available.
563 * If available, return 1, otherwise invalidate this connection
564 * template and return 0.
565 */
566int ip_vs_check_template(struct ip_vs_conn *ct)
567{
568 struct ip_vs_dest *dest = ct->dest;
569
570 /*
571 * Checking the dest server status.
572 */
573 if ((dest == NULL) ||
574 !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
575 (sysctl_ip_vs_expire_quiescent_template &&
576 (atomic_read(&dest->weight) == 0))) {
577 IP_VS_DBG_BUF(9, "check_template: dest not available for "
578 "protocol %s s:%s:%d v:%s:%d "
579 "-> d:%s:%d\n",
580 ip_vs_proto_name(ct->protocol),
581 IP_VS_DBG_ADDR(ct->af, &ct->caddr),
582 ntohs(ct->cport),
583 IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
584 ntohs(ct->vport),
585 IP_VS_DBG_ADDR(ct->af, &ct->daddr),
586 ntohs(ct->dport));
587
588 /*
589 * Invalidate the connection template
590 */
591 if (ct->vport != htons(0xffff)) {
592 if (ip_vs_conn_unhash(ct)) {
593 ct->dport = htons(0xffff);
594 ct->vport = htons(0xffff);
595 ct->cport = 0;
596 ip_vs_conn_hash(ct);
597 }
598 }
599
600 /*
601 * Simply decrease the refcnt of the template,
602 * don't restart its timer.
603 */
604 atomic_dec(&ct->refcnt);
605 return 0;
606 }
607 return 1;
608}
609
610static void ip_vs_conn_expire(unsigned long data)
611{
612 struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
613
614 cp->timeout = 60*HZ;
615
616 /*
617 * hey, I'm using it
618 */
619 atomic_inc(&cp->refcnt);
620
621 /*
622 * do I control anybody?
623 */
624 if (atomic_read(&cp->n_control))
625 goto expire_later;
626
627 /*
628 * unhash it if it is hashed in the conn table
629 */
630 if (!ip_vs_conn_unhash(cp))
631 goto expire_later;
632
633 /*
634 * refcnt==1 implies I'm the only one referrer
635 */
636 if (likely(atomic_read(&cp->refcnt) == 1)) {
637 /* delete the timer if it is activated by other users */
638 if (timer_pending(&cp->timer))
639 del_timer(&cp->timer);
640
641 /* does anybody control me? */
642 if (cp->control)
643 ip_vs_control_del(cp);
644
645 if (unlikely(cp->app != NULL))
646 ip_vs_unbind_app(cp);
647 ip_vs_unbind_dest(cp);
648 if (cp->flags & IP_VS_CONN_F_NO_CPORT)
649 atomic_dec(&ip_vs_conn_no_cport_cnt);
650 atomic_dec(&ip_vs_conn_count);
651
652 kmem_cache_free(ip_vs_conn_cachep, cp);
653 return;
654 }
655
656 /* hash it back to the table */
657 ip_vs_conn_hash(cp);
658
659 expire_later:
660 IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
661 atomic_read(&cp->refcnt)-1,
662 atomic_read(&cp->n_control));
663
664 ip_vs_conn_put(cp);
665}
666
667
668void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
669{
670 if (del_timer(&cp->timer))
671 mod_timer(&cp->timer, jiffies);
672}
673
674
675/*
676 * Create a new connection entry and hash it into the ip_vs_conn_tab
677 */
678struct ip_vs_conn *
679ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
680 const union nf_inet_addr *vaddr, __be16 vport,
681 const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
682 struct ip_vs_dest *dest)
683{
684 struct ip_vs_conn *cp;
685 struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
686
687 cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
688 if (cp == NULL) {
689 IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
690 return NULL;
691 }
692
693 INIT_LIST_HEAD(&cp->c_list);
694 setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
695 cp->af = af;
696 cp->protocol = proto;
697 ip_vs_addr_copy(af, &cp->caddr, caddr);
698 cp->cport = cport;
699 ip_vs_addr_copy(af, &cp->vaddr, vaddr);
700 cp->vport = vport;
701 ip_vs_addr_copy(af, &cp->daddr, daddr);
702 cp->dport = dport;
703 cp->flags = flags;
704 spin_lock_init(&cp->lock);
705
706 /*
707 * Set the entry is referenced by the current thread before hashing
708 * it in the table, so that other thread run ip_vs_random_dropentry
709 * but cannot drop this entry.
710 */
711 atomic_set(&cp->refcnt, 1);
712
713 atomic_set(&cp->n_control, 0);
714 atomic_set(&cp->in_pkts, 0);
715
716 atomic_inc(&ip_vs_conn_count);
717 if (flags & IP_VS_CONN_F_NO_CPORT)
718 atomic_inc(&ip_vs_conn_no_cport_cnt);
719
720 /* Bind the connection with a destination server */
721 ip_vs_bind_dest(cp, dest);
722
723 /* Set its state and timeout */
724 cp->state = 0;
725 cp->timeout = 3*HZ;
726
727 /* Bind its packet transmitter */
728#ifdef CONFIG_IP_VS_IPV6
729 if (af == AF_INET6)
730 ip_vs_bind_xmit_v6(cp);
731 else
732#endif
733 ip_vs_bind_xmit(cp);
734
735 if (unlikely(pp && atomic_read(&pp->appcnt)))
736 ip_vs_bind_app(cp, pp);
737
738 /* Hash it in the ip_vs_conn_tab finally */
739 ip_vs_conn_hash(cp);
740
741 return cp;
742}
743
744
745/*
746 * /proc/net/ip_vs_conn entries
747 */
748#ifdef CONFIG_PROC_FS
749
750static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
751{
752 int idx;
753 struct ip_vs_conn *cp;
754
755 for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
756 ct_read_lock_bh(idx);
757 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
758 if (pos-- == 0) {
759 seq->private = &ip_vs_conn_tab[idx];
760 return cp;
761 }
762 }
763 ct_read_unlock_bh(idx);
764 }
765
766 return NULL;
767}
768
769static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
770{
771 seq->private = NULL;
772 return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
773}
774
775static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
776{
777 struct ip_vs_conn *cp = v;
778 struct list_head *e, *l = seq->private;
779 int idx;
780
781 ++*pos;
782 if (v == SEQ_START_TOKEN)
783 return ip_vs_conn_array(seq, 0);
784
785 /* more on same hash chain? */
786 if ((e = cp->c_list.next) != l)
787 return list_entry(e, struct ip_vs_conn, c_list);
788
789 idx = l - ip_vs_conn_tab;
790 ct_read_unlock_bh(idx);
791
792 while (++idx < IP_VS_CONN_TAB_SIZE) {
793 ct_read_lock_bh(idx);
794 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
795 seq->private = &ip_vs_conn_tab[idx];
796 return cp;
797 }
798 ct_read_unlock_bh(idx);
799 }
800 seq->private = NULL;
801 return NULL;
802}
803
804static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
805{
806 struct list_head *l = seq->private;
807
808 if (l)
809 ct_read_unlock_bh(l - ip_vs_conn_tab);
810}
811
812static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
813{
814
815 if (v == SEQ_START_TOKEN)
816 seq_puts(seq,
817 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n");
818 else {
819 const struct ip_vs_conn *cp = v;
820
821#ifdef CONFIG_IP_VS_IPV6
822 if (cp->af == AF_INET6)
823 seq_printf(seq,
824 "%-3s " NIP6_FMT " %04X " NIP6_FMT
825 " %04X " NIP6_FMT " %04X %-11s %7lu\n",
826 ip_vs_proto_name(cp->protocol),
827 NIP6(cp->caddr.in6), ntohs(cp->cport),
828 NIP6(cp->vaddr.in6), ntohs(cp->vport),
829 NIP6(cp->daddr.in6), ntohs(cp->dport),
830 ip_vs_state_name(cp->protocol, cp->state),
831 (cp->timer.expires-jiffies)/HZ);
832 else
833#endif
834 seq_printf(seq,
835 "%-3s %08X %04X %08X %04X"
836 " %08X %04X %-11s %7lu\n",
837 ip_vs_proto_name(cp->protocol),
838 ntohl(cp->caddr.ip), ntohs(cp->cport),
839 ntohl(cp->vaddr.ip), ntohs(cp->vport),
840 ntohl(cp->daddr.ip), ntohs(cp->dport),
841 ip_vs_state_name(cp->protocol, cp->state),
842 (cp->timer.expires-jiffies)/HZ);
843 }
844 return 0;
845}
846
847static const struct seq_operations ip_vs_conn_seq_ops = {
848 .start = ip_vs_conn_seq_start,
849 .next = ip_vs_conn_seq_next,
850 .stop = ip_vs_conn_seq_stop,
851 .show = ip_vs_conn_seq_show,
852};
853
854static int ip_vs_conn_open(struct inode *inode, struct file *file)
855{
856 return seq_open(file, &ip_vs_conn_seq_ops);
857}
858
859static const struct file_operations ip_vs_conn_fops = {
860 .owner = THIS_MODULE,
861 .open = ip_vs_conn_open,
862 .read = seq_read,
863 .llseek = seq_lseek,
864 .release = seq_release,
865};
866
867static const char *ip_vs_origin_name(unsigned flags)
868{
869 if (flags & IP_VS_CONN_F_SYNC)
870 return "SYNC";
871 else
872 return "LOCAL";
873}
874
875static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
876{
877
878 if (v == SEQ_START_TOKEN)
879 seq_puts(seq,
880 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n");
881 else {
882 const struct ip_vs_conn *cp = v;
883
884#ifdef CONFIG_IP_VS_IPV6
885 if (cp->af == AF_INET6)
886 seq_printf(seq,
887 "%-3s " NIP6_FMT " %04X " NIP6_FMT
888 " %04X " NIP6_FMT " %04X %-11s %-6s %7lu\n",
889 ip_vs_proto_name(cp->protocol),
890 NIP6(cp->caddr.in6), ntohs(cp->cport),
891 NIP6(cp->vaddr.in6), ntohs(cp->vport),
892 NIP6(cp->daddr.in6), ntohs(cp->dport),
893 ip_vs_state_name(cp->protocol, cp->state),
894 ip_vs_origin_name(cp->flags),
895 (cp->timer.expires-jiffies)/HZ);
896 else
897#endif
898 seq_printf(seq,
899 "%-3s %08X %04X %08X %04X "
900 "%08X %04X %-11s %-6s %7lu\n",
901 ip_vs_proto_name(cp->protocol),
902 ntohl(cp->caddr.ip), ntohs(cp->cport),
903 ntohl(cp->vaddr.ip), ntohs(cp->vport),
904 ntohl(cp->daddr.ip), ntohs(cp->dport),
905 ip_vs_state_name(cp->protocol, cp->state),
906 ip_vs_origin_name(cp->flags),
907 (cp->timer.expires-jiffies)/HZ);
908 }
909 return 0;
910}
911
912static const struct seq_operations ip_vs_conn_sync_seq_ops = {
913 .start = ip_vs_conn_seq_start,
914 .next = ip_vs_conn_seq_next,
915 .stop = ip_vs_conn_seq_stop,
916 .show = ip_vs_conn_sync_seq_show,
917};
918
919static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
920{
921 return seq_open(file, &ip_vs_conn_sync_seq_ops);
922}
923
924static const struct file_operations ip_vs_conn_sync_fops = {
925 .owner = THIS_MODULE,
926 .open = ip_vs_conn_sync_open,
927 .read = seq_read,
928 .llseek = seq_lseek,
929 .release = seq_release,
930};
931
932#endif
933
934
935/*
936 * Randomly drop connection entries before running out of memory
937 */
938static inline int todrop_entry(struct ip_vs_conn *cp)
939{
940 /*
941 * The drop rate array needs tuning for real environments.
942 * Called from timer bh only => no locking
943 */
944 static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
945 static char todrop_counter[9] = {0};
946 int i;
947
948 /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
949 This will leave enough time for normal connection to get
950 through. */
951 if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
952 return 0;
953
954 /* Don't drop the entry if its number of incoming packets is not
955 located in [0, 8] */
956 i = atomic_read(&cp->in_pkts);
957 if (i > 8 || i < 0) return 0;
958
959 if (!todrop_rate[i]) return 0;
960 if (--todrop_counter[i] > 0) return 0;
961
962 todrop_counter[i] = todrop_rate[i];
963 return 1;
964}
965
966/* Called from keventd and must protect itself from softirqs */
967void ip_vs_random_dropentry(void)
968{
969 int idx;
970 struct ip_vs_conn *cp;
971
972 /*
973 * Randomly scan 1/32 of the whole table every second
974 */
975 for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
976 unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
977
978 /*
979 * Lock is actually needed in this loop.
980 */
981 ct_write_lock_bh(hash);
982
983 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
984 if (cp->flags & IP_VS_CONN_F_TEMPLATE)
985 /* connection template */
986 continue;
987
988 if (cp->protocol == IPPROTO_TCP) {
989 switch(cp->state) {
990 case IP_VS_TCP_S_SYN_RECV:
991 case IP_VS_TCP_S_SYNACK:
992 break;
993
994 case IP_VS_TCP_S_ESTABLISHED:
995 if (todrop_entry(cp))
996 break;
997 continue;
998
999 default:
1000 continue;
1001 }
1002 } else {
1003 if (!todrop_entry(cp))
1004 continue;
1005 }
1006
1007 IP_VS_DBG(4, "del connection\n");
1008 ip_vs_conn_expire_now(cp);
1009 if (cp->control) {
1010 IP_VS_DBG(4, "del conn template\n");
1011 ip_vs_conn_expire_now(cp->control);
1012 }
1013 }
1014 ct_write_unlock_bh(hash);
1015 }
1016}
1017
1018
1019/*
1020 * Flush all the connection entries in the ip_vs_conn_tab
1021 */
1022static void ip_vs_conn_flush(void)
1023{
1024 int idx;
1025 struct ip_vs_conn *cp;
1026
1027 flush_again:
1028 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
1029 /*
1030 * Lock is actually needed in this loop.
1031 */
1032 ct_write_lock_bh(idx);
1033
1034 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
1035
1036 IP_VS_DBG(4, "del connection\n");
1037 ip_vs_conn_expire_now(cp);
1038 if (cp->control) {
1039 IP_VS_DBG(4, "del conn template\n");
1040 ip_vs_conn_expire_now(cp->control);
1041 }
1042 }
1043 ct_write_unlock_bh(idx);
1044 }
1045
 1046	/* the counter may not be zero, because some conn entries may still
 1047	   be run by slow timer handlers or be unhashed but still referenced */
1048 if (atomic_read(&ip_vs_conn_count) != 0) {
1049 schedule();
1050 goto flush_again;
1051 }
1052}
1053
1054
1055int __init ip_vs_conn_init(void)
1056{
1057 int idx;
1058
1059 /*
1060 * Allocate the connection hash table and initialize its list heads
1061 */
1062 ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
1063 if (!ip_vs_conn_tab)
1064 return -ENOMEM;
1065
1066 /* Allocate ip_vs_conn slab cache */
1067 ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
1068 sizeof(struct ip_vs_conn), 0,
1069 SLAB_HWCACHE_ALIGN, NULL);
1070 if (!ip_vs_conn_cachep) {
1071 vfree(ip_vs_conn_tab);
1072 return -ENOMEM;
1073 }
1074
1075 IP_VS_INFO("Connection hash table configured "
1076 "(size=%d, memory=%ldKbytes)\n",
1077 IP_VS_CONN_TAB_SIZE,
1078 (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
1079 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
1080 sizeof(struct ip_vs_conn));
1081
1082 for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
1083 INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
1084 }
1085
1086 for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
1087 rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
1088 }
1089
1090 proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
1091 proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
1092
1093 /* calculate the random value for connection hash */
1094 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
1095
1096 return 0;
1097}
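
In concrete terms (assuming the Kconfig default IP_VS_CONN_TAB_BITS of 12 and 16-byte list heads on a 64-bit build), the table allocated above has 4096 buckets, so the vmalloc comes to 4096 * 16 = 64 Kbytes, which is the figure the IP_VS_INFO message prints at load time.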
1098
1099
1100void ip_vs_conn_cleanup(void)
1101{
1102 /* flush all the connection entries first */
1103 ip_vs_conn_flush();
1104
1105 /* Release the empty cache */
1106 kmem_cache_destroy(ip_vs_conn_cachep);
1107 proc_net_remove(&init_net, "ip_vs_conn");
1108 proc_net_remove(&init_net, "ip_vs_conn_sync");
1109 vfree(ip_vs_conn_tab);
1110}
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
deleted file mode 100644
index 958abf3e5f8c..000000000000
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ /dev/null
@@ -1,1542 +0,0 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 * Julian Anastasov <ja@ssi.bg>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19 * and others.
20 *
21 * Changes:
22 * Paul `Rusty' Russell properly handle non-linear skbs
23 * Harald Welte don't use nfcache
24 *
25 */
26
27#include <linux/module.h>
28#include <linux/kernel.h>
29#include <linux/ip.h>
30#include <linux/tcp.h>
31#include <linux/icmp.h>
32
33#include <net/ip.h>
34#include <net/tcp.h>
35#include <net/udp.h>
36#include <net/icmp.h> /* for icmp_send */
37#include <net/route.h>
38
39#include <linux/netfilter.h>
40#include <linux/netfilter_ipv4.h>
41
42#ifdef CONFIG_IP_VS_IPV6
43#include <net/ipv6.h>
44#include <linux/netfilter_ipv6.h>
45#endif
46
47#include <net/ip_vs.h>
48
49
50EXPORT_SYMBOL(register_ip_vs_scheduler);
51EXPORT_SYMBOL(unregister_ip_vs_scheduler);
52EXPORT_SYMBOL(ip_vs_skb_replace);
53EXPORT_SYMBOL(ip_vs_proto_name);
54EXPORT_SYMBOL(ip_vs_conn_new);
55EXPORT_SYMBOL(ip_vs_conn_in_get);
56EXPORT_SYMBOL(ip_vs_conn_out_get);
57#ifdef CONFIG_IP_VS_PROTO_TCP
58EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
59#endif
60EXPORT_SYMBOL(ip_vs_conn_put);
61#ifdef CONFIG_IP_VS_DEBUG
62EXPORT_SYMBOL(ip_vs_get_debug_level);
63#endif
64
65
66/* ID used in ICMP lookups */
67#define icmp_id(icmph) (((icmph)->un).echo.id)
68#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier)
69
70const char *ip_vs_proto_name(unsigned proto)
71{
72 static char buf[20];
73
74 switch (proto) {
75 case IPPROTO_IP:
76 return "IP";
77 case IPPROTO_UDP:
78 return "UDP";
79 case IPPROTO_TCP:
80 return "TCP";
81 case IPPROTO_ICMP:
82 return "ICMP";
83#ifdef CONFIG_IP_VS_IPV6
84 case IPPROTO_ICMPV6:
85 return "ICMPv6";
86#endif
87 default:
88 sprintf(buf, "IP_%d", proto);
89 return buf;
90 }
91}
92
93void ip_vs_init_hash_table(struct list_head *table, int rows)
94{
95 while (--rows >= 0)
96 INIT_LIST_HEAD(&table[rows]);
97}
98
99static inline void
100ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
101{
102 struct ip_vs_dest *dest = cp->dest;
103 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
104 spin_lock(&dest->stats.lock);
105 dest->stats.ustats.inpkts++;
106 dest->stats.ustats.inbytes += skb->len;
107 spin_unlock(&dest->stats.lock);
108
109 spin_lock(&dest->svc->stats.lock);
110 dest->svc->stats.ustats.inpkts++;
111 dest->svc->stats.ustats.inbytes += skb->len;
112 spin_unlock(&dest->svc->stats.lock);
113
114 spin_lock(&ip_vs_stats.lock);
115 ip_vs_stats.ustats.inpkts++;
116 ip_vs_stats.ustats.inbytes += skb->len;
117 spin_unlock(&ip_vs_stats.lock);
118 }
119}
120
121
122static inline void
123ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
124{
125 struct ip_vs_dest *dest = cp->dest;
126 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
127 spin_lock(&dest->stats.lock);
128 dest->stats.ustats.outpkts++;
129 dest->stats.ustats.outbytes += skb->len;
130 spin_unlock(&dest->stats.lock);
131
132 spin_lock(&dest->svc->stats.lock);
133 dest->svc->stats.ustats.outpkts++;
134 dest->svc->stats.ustats.outbytes += skb->len;
135 spin_unlock(&dest->svc->stats.lock);
136
137 spin_lock(&ip_vs_stats.lock);
138 ip_vs_stats.ustats.outpkts++;
139 ip_vs_stats.ustats.outbytes += skb->len;
140 spin_unlock(&ip_vs_stats.lock);
141 }
142}
143
144
145static inline void
146ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
147{
148 spin_lock(&cp->dest->stats.lock);
149 cp->dest->stats.ustats.conns++;
150 spin_unlock(&cp->dest->stats.lock);
151
152 spin_lock(&svc->stats.lock);
153 svc->stats.ustats.conns++;
154 spin_unlock(&svc->stats.lock);
155
156 spin_lock(&ip_vs_stats.lock);
157 ip_vs_stats.ustats.conns++;
158 spin_unlock(&ip_vs_stats.lock);
159}
160
161
162static inline int
163ip_vs_set_state(struct ip_vs_conn *cp, int direction,
164 const struct sk_buff *skb,
165 struct ip_vs_protocol *pp)
166{
167 if (unlikely(!pp->state_transition))
168 return 0;
169 return pp->state_transition(cp, direction, skb, pp);
170}
171
172
173/*
174 * IPVS persistent scheduling function
 175 * It creates a connection entry according to its template, if one exists,
176 * or selects a server and creates a connection entry plus a template.
177 * Locking: we are svc user (svc->refcnt), so we hold all dests too
178 * Protocols supported: TCP, UDP
179 */
180static struct ip_vs_conn *
181ip_vs_sched_persist(struct ip_vs_service *svc,
182 const struct sk_buff *skb,
183 __be16 ports[2])
184{
185 struct ip_vs_conn *cp = NULL;
186 struct ip_vs_iphdr iph;
187 struct ip_vs_dest *dest;
188 struct ip_vs_conn *ct;
189 __be16 dport; /* destination port to forward */
190 union nf_inet_addr snet; /* source network of the client,
191 after masking */
192
193 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
194
195 /* Mask saddr with the netmask to adjust template granularity */
196#ifdef CONFIG_IP_VS_IPV6
197 if (svc->af == AF_INET6)
198 ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
199 else
200#endif
201 snet.ip = iph.saddr.ip & svc->netmask;
202
203 IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
204 "mnet %s\n",
205 IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
206 IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
207 IP_VS_DBG_ADDR(svc->af, &snet));
208
209 /*
 210	 * FTP is a complicated network protocol: it uses a control connection
 211	 * plus separate data connections. For active FTP, the FTP server
 212	 * initiates the data connection to the client, usually from source
 213	 * port 20. For passive FTP, the FTP server tells the client which port
 214	 * it is passively listening on, and the client initiates the data
 215	 * connection. In tunneling or direct routing mode, the load balancer
 216	 * only sees the client-to-server half of the connection, so the data
 217	 * port number is unknown to it. So, a conn template like
218 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
219 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
220 * is created for other persistent services.
221 */
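	/*
	 * Illustrative example (addresses and mask are hypothetical): for a
	 * persistent FTP service on VIP 192.0.2.10:21 with netmask
	 * 255.255.255.0, a client 198.51.100.7 is masked to
	 * snet = 198.51.100.0, giving the template
	 * <TCP, 198.51.100.0, 0, 192.0.2.10, 0, RIP, 0>, so the control
	 * connection and any data connection from that /24 hit the same
	 * real server.  A persistent non-FTP service on port 80 would get
	 * <TCP, 198.51.100.0, 0, 192.0.2.10, 80, RIP, rport> instead.
	 */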
222 if (ports[1] == svc->port) {
223 /* Check if a template already exists */
224 if (svc->port != FTPPORT)
225 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
226 &iph.daddr, ports[1]);
227 else
228 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
229 &iph.daddr, 0);
230
231 if (!ct || !ip_vs_check_template(ct)) {
232 /*
233 * No template found or the dest of the connection
234 * template is not available.
235 */
236 dest = svc->scheduler->schedule(svc, skb);
237 if (dest == NULL) {
238 IP_VS_DBG(1, "p-schedule: no dest found.\n");
239 return NULL;
240 }
241
242 /*
243 * Create a template like <protocol,caddr,0,
244 * vaddr,vport,daddr,dport> for non-ftp service,
245 * and <protocol,caddr,0,vaddr,0,daddr,0>
246 * for ftp service.
247 */
248 if (svc->port != FTPPORT)
249 ct = ip_vs_conn_new(svc->af, iph.protocol,
250 &snet, 0,
251 &iph.daddr,
252 ports[1],
253 &dest->addr, dest->port,
254 IP_VS_CONN_F_TEMPLATE,
255 dest);
256 else
257 ct = ip_vs_conn_new(svc->af, iph.protocol,
258 &snet, 0,
259 &iph.daddr, 0,
260 &dest->addr, 0,
261 IP_VS_CONN_F_TEMPLATE,
262 dest);
263 if (ct == NULL)
264 return NULL;
265
266 ct->timeout = svc->timeout;
267 } else {
268 /* set destination with the found template */
269 dest = ct->dest;
270 }
271 dport = dest->port;
272 } else {
273 /*
274 * Note: persistent fwmark-based services and persistent
275 * port zero service are handled here.
276 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
277 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
278 */
279 if (svc->fwmark) {
280 union nf_inet_addr fwmark = {
281 .all = { 0, 0, 0, htonl(svc->fwmark) }
282 };
283
284 ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
285 &fwmark, 0);
286 } else
287 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
288 &iph.daddr, 0);
289
290 if (!ct || !ip_vs_check_template(ct)) {
291 /*
292 * If it is not persistent port zero, return NULL,
293 * otherwise create a connection template.
294 */
295 if (svc->port)
296 return NULL;
297
298 dest = svc->scheduler->schedule(svc, skb);
299 if (dest == NULL) {
300 IP_VS_DBG(1, "p-schedule: no dest found.\n");
301 return NULL;
302 }
303
304 /*
305 * Create a template according to the service
306 */
307 if (svc->fwmark) {
308 union nf_inet_addr fwmark = {
309 .all = { 0, 0, 0, htonl(svc->fwmark) }
310 };
311
312 ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
313 &snet, 0,
314 &fwmark, 0,
315 &dest->addr, 0,
316 IP_VS_CONN_F_TEMPLATE,
317 dest);
318 } else
319 ct = ip_vs_conn_new(svc->af, iph.protocol,
320 &snet, 0,
321 &iph.daddr, 0,
322 &dest->addr, 0,
323 IP_VS_CONN_F_TEMPLATE,
324 dest);
325 if (ct == NULL)
326 return NULL;
327
328 ct->timeout = svc->timeout;
329 } else {
330 /* set destination with the found template */
331 dest = ct->dest;
332 }
333 dport = ports[1];
334 }
335
336 /*
337 * Create a new connection according to the template
338 */
339 cp = ip_vs_conn_new(svc->af, iph.protocol,
340 &iph.saddr, ports[0],
341 &iph.daddr, ports[1],
342 &dest->addr, dport,
343 0,
344 dest);
345 if (cp == NULL) {
346 ip_vs_conn_put(ct);
347 return NULL;
348 }
349
350 /*
351 * Add its control
352 */
353 ip_vs_control_add(cp, ct);
354 ip_vs_conn_put(ct);
355
356 ip_vs_conn_stats(cp, svc);
357 return cp;
358}
359
360
361/*
362 * IPVS main scheduling function
363 * It selects a server according to the virtual service, and
364 * creates a connection entry.
365 * Protocols supported: TCP, UDP
366 */
367struct ip_vs_conn *
368ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
369{
370 struct ip_vs_conn *cp = NULL;
371 struct ip_vs_iphdr iph;
372 struct ip_vs_dest *dest;
373 __be16 _ports[2], *pptr;
374
375 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
376 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
377 if (pptr == NULL)
378 return NULL;
379
380 /*
381 * Persistent service
382 */
383 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
384 return ip_vs_sched_persist(svc, skb, pptr);
385
386 /*
387 * Non-persistent service
388 */
389 if (!svc->fwmark && pptr[1] != svc->port) {
390 if (!svc->port)
391 IP_VS_ERR("Schedule: port zero only supported "
392 "in persistent services, "
393 "check your ipvs configuration\n");
394 return NULL;
395 }
396
397 dest = svc->scheduler->schedule(svc, skb);
398 if (dest == NULL) {
399 IP_VS_DBG(1, "Schedule: no dest found.\n");
400 return NULL;
401 }
402
403 /*
404 * Create a connection entry.
405 */
406 cp = ip_vs_conn_new(svc->af, iph.protocol,
407 &iph.saddr, pptr[0],
408 &iph.daddr, pptr[1],
409 &dest->addr, dest->port ? dest->port : pptr[1],
410 0,
411 dest);
412 if (cp == NULL)
413 return NULL;
414
415 IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
416 "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
417 ip_vs_fwd_tag(cp),
418 IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
419 IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
420 IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
421 cp->flags, atomic_read(&cp->refcnt));
422
423 ip_vs_conn_stats(cp, svc);
424 return cp;
425}
426
427
428/*
429 * Pass or drop the packet.
430 * Called by ip_vs_in, when the virtual service is available but
431 * no destination is available for a new connection.
432 */
433int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
434 struct ip_vs_protocol *pp)
435{
436 __be16 _ports[2], *pptr;
437 struct ip_vs_iphdr iph;
438 int unicast;
439 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
440
441 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
442 if (pptr == NULL) {
443 ip_vs_service_put(svc);
444 return NF_DROP;
445 }
446
447#ifdef CONFIG_IP_VS_IPV6
448 if (svc->af == AF_INET6)
449 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
450 else
451#endif
452 unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
453
 454	/* if it is a fwmark-based service, the cache_bypass sysctl is up,
 455	   and the destination is a non-local unicast address, then create
 456	   a cache_bypass connection entry */
457 if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
458 int ret, cs;
459 struct ip_vs_conn *cp;
460 union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
461
462 ip_vs_service_put(svc);
463
464 /* create a new connection entry */
465 IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
466 cp = ip_vs_conn_new(svc->af, iph.protocol,
467 &iph.saddr, pptr[0],
468 &iph.daddr, pptr[1],
469 &daddr, 0,
470 IP_VS_CONN_F_BYPASS,
471 NULL);
472 if (cp == NULL)
473 return NF_DROP;
474
475 /* statistics */
476 ip_vs_in_stats(cp, skb);
477
478 /* set state */
479 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
480
481 /* transmit the first SYN packet */
482 ret = cp->packet_xmit(skb, cp, pp);
483 /* do not touch skb anymore */
484
485 atomic_inc(&cp->in_pkts);
486 ip_vs_conn_put(cp);
487 return ret;
488 }
489
490 /*
 491	 * When a virtual ftp service is present, packets destined
 492	 * for other services on the VIP may get here (except services
 493	 * listed in the ipvs table); pass such packets along, because it is
 494	 * not ipvs's job to decide to drop them.
495 */
496 if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
497 ip_vs_service_put(svc);
498 return NF_ACCEPT;
499 }
500
501 ip_vs_service_put(svc);
502
503 /*
504 * Notify the client that the destination is unreachable, and
505 * release the socket buffer.
 506	 * Since we are at the IP layer and the TCP socket is not actually
 507	 * created, a TCP RST packet cannot be sent; instead,
 508	 * ICMP_PORT_UNREACH is sent here whether it is TCP or UDP. --WZ
509 */
510#ifdef CONFIG_IP_VS_IPV6
511 if (svc->af == AF_INET6)
512 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0,
513 skb->dev);
514 else
515#endif
516 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
517
518 return NF_DROP;
519}
520
521
522/*
523 * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
524 * chain, and is used for VS/NAT.
525 * It detects packets for VS/NAT connections and sends the packets
 526 * immediately. This avoids having iptable_nat mangle the packets
 527 * for VS/NAT.
528 */
529static unsigned int ip_vs_post_routing(unsigned int hooknum,
530 struct sk_buff *skb,
531 const struct net_device *in,
532 const struct net_device *out,
533 int (*okfn)(struct sk_buff *))
534{
535 if (!skb->ipvs_property)
536 return NF_ACCEPT;
537 /* The packet was sent from IPVS, exit this chain */
538 return NF_STOP;
539}
540
541__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
542{
543 return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
544}
545
546static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
547{
548 int err = ip_defrag(skb, user);
549
550 if (!err)
551 ip_send_check(ip_hdr(skb));
552
553 return err;
554}
555
556#ifdef CONFIG_IP_VS_IPV6
557static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
558{
559 /* TODO IPv6: Find out what to do here for IPv6 */
560 return 0;
561}
562#endif
563
564/*
565 * Packet has been made sufficiently writable in caller
566 * - inout: 1=in->out, 0=out->in
567 */
568void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
569 struct ip_vs_conn *cp, int inout)
570{
571 struct iphdr *iph = ip_hdr(skb);
572 unsigned int icmp_offset = iph->ihl*4;
573 struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) +
574 icmp_offset);
575 struct iphdr *ciph = (struct iphdr *)(icmph + 1);
576
577 if (inout) {
578 iph->saddr = cp->vaddr.ip;
579 ip_send_check(iph);
580 ciph->daddr = cp->vaddr.ip;
581 ip_send_check(ciph);
582 } else {
583 iph->daddr = cp->daddr.ip;
584 ip_send_check(iph);
585 ciph->saddr = cp->daddr.ip;
586 ip_send_check(ciph);
587 }
588
589 /* the TCP/UDP port */
590 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
591 __be16 *ports = (void *)ciph + ciph->ihl*4;
592
593 if (inout)
594 ports[1] = cp->vport;
595 else
596 ports[0] = cp->dport;
597 }
598
599 /* And finally the ICMP checksum */
600 icmph->checksum = 0;
601 icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
602 skb->ip_summed = CHECKSUM_UNNECESSARY;
603
604 if (inout)
605 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
606 "Forwarding altered outgoing ICMP");
607 else
608 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
609 "Forwarding altered incoming ICMP");
610}
611
612#ifdef CONFIG_IP_VS_IPV6
613void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
614 struct ip_vs_conn *cp, int inout)
615{
616 struct ipv6hdr *iph = ipv6_hdr(skb);
617 unsigned int icmp_offset = sizeof(struct ipv6hdr);
618 struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) +
619 icmp_offset);
620 struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1);
621
622 if (inout) {
623 iph->saddr = cp->vaddr.in6;
624 ciph->daddr = cp->vaddr.in6;
625 } else {
626 iph->daddr = cp->daddr.in6;
627 ciph->saddr = cp->daddr.in6;
628 }
629
630 /* the TCP/UDP port */
631 if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) {
632 __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
633
634 if (inout)
635 ports[1] = cp->vport;
636 else
637 ports[0] = cp->dport;
638 }
639
640 /* And finally the ICMP checksum */
641 icmph->icmp6_cksum = 0;
642 /* TODO IPv6: is this correct for ICMPv6? */
643 ip_vs_checksum_complete(skb, icmp_offset);
644 skb->ip_summed = CHECKSUM_UNNECESSARY;
645
646 if (inout)
647 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
648 "Forwarding altered outgoing ICMPv6");
649 else
650 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
651 "Forwarding altered incoming ICMPv6");
652}
653#endif
654
655/* Handle relevant response ICMP messages - forward to the right
656 * destination host. Used for NAT and local client.
657 */
658static int handle_response_icmp(int af, struct sk_buff *skb,
659 union nf_inet_addr *snet,
660 __u8 protocol, struct ip_vs_conn *cp,
661 struct ip_vs_protocol *pp,
662 unsigned int offset, unsigned int ihl)
663{
664 unsigned int verdict = NF_DROP;
665
666 if (IP_VS_FWD_METHOD(cp) != 0) {
667 IP_VS_ERR("shouldn't reach here, because the box is on the "
668 "half connection in the tun/dr module.\n");
669 }
670
671 /* Ensure the checksum is correct */
672 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
673 /* Failed checksum! */
674 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
675 IP_VS_DBG_ADDR(af, snet));
676 goto out;
677 }
678
679 if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol)
680 offset += 2 * sizeof(__u16);
681 if (!skb_make_writable(skb, offset))
682 goto out;
683
684#ifdef CONFIG_IP_VS_IPV6
685 if (af == AF_INET6)
686 ip_vs_nat_icmp_v6(skb, pp, cp, 1);
687 else
688#endif
689 ip_vs_nat_icmp(skb, pp, cp, 1);
690
691 /* do the statistics and put it back */
692 ip_vs_out_stats(cp, skb);
693
694 skb->ipvs_property = 1;
695 verdict = NF_ACCEPT;
696
697out:
698 __ip_vs_conn_put(cp);
699
700 return verdict;
701}
702
703/*
704 * Handle ICMP messages in the inside-to-outside direction (outgoing).
705 * Find any that might be relevant, check against existing connections.
706 * Currently handles error types - unreachable, quench, ttl exceeded.
707 */
708static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
709{
710 struct iphdr *iph;
711 struct icmphdr _icmph, *ic;
712 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
713 struct ip_vs_iphdr ciph;
714 struct ip_vs_conn *cp;
715 struct ip_vs_protocol *pp;
716 unsigned int offset, ihl;
717 union nf_inet_addr snet;
718
719 *related = 1;
720
721 /* reassemble IP fragments */
722 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
723 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
724 return NF_STOLEN;
725 }
726
727 iph = ip_hdr(skb);
728 offset = ihl = iph->ihl * 4;
729 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
730 if (ic == NULL)
731 return NF_DROP;
732
733 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
734 ic->type, ntohs(icmp_id(ic)),
735 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
736
737 /*
738 * Work through seeing if this is for us.
739 * These checks are supposed to be in an order that means easy
740 * things are checked first to speed up processing.... however
741 * this means that some packets will manage to get a long way
742 * down this stack and then be rejected, but that's life.
743 */
744 if ((ic->type != ICMP_DEST_UNREACH) &&
745 (ic->type != ICMP_SOURCE_QUENCH) &&
746 (ic->type != ICMP_TIME_EXCEEDED)) {
747 *related = 0;
748 return NF_ACCEPT;
749 }
750
751 /* Now find the contained IP header */
752 offset += sizeof(_icmph);
753 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
754 if (cih == NULL)
755 return NF_ACCEPT; /* The packet looks wrong, ignore */
756
757 pp = ip_vs_proto_get(cih->protocol);
758 if (!pp)
759 return NF_ACCEPT;
760
761 /* Is the embedded protocol header present? */
762 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
763 pp->dont_defrag))
764 return NF_ACCEPT;
765
766 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
767
768 offset += cih->ihl * 4;
769
770 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
771 /* The embedded headers contain source and dest in reverse order */
772 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
773 if (!cp)
774 return NF_ACCEPT;
775
776 snet.ip = iph->saddr;
777 return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
778 pp, offset, ihl);
779}
780
781#ifdef CONFIG_IP_VS_IPV6
782static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
783{
784 struct ipv6hdr *iph;
785 struct icmp6hdr _icmph, *ic;
786 struct ipv6hdr _ciph, *cih; /* The ip header contained
787 within the ICMP */
788 struct ip_vs_iphdr ciph;
789 struct ip_vs_conn *cp;
790 struct ip_vs_protocol *pp;
791 unsigned int offset;
792 union nf_inet_addr snet;
793
794 *related = 1;
795
796 /* reassemble IP fragments */
797 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
798 if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
799 return NF_STOLEN;
800 }
801
802 iph = ipv6_hdr(skb);
803 offset = sizeof(struct ipv6hdr);
804 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
805 if (ic == NULL)
806 return NF_DROP;
807
808 IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
809 ic->icmp6_type, ntohs(icmpv6_id(ic)),
810 NIP6(iph->saddr), NIP6(iph->daddr));
811
812 /*
813 * Work through seeing if this is for us.
814 * These checks are supposed to be in an order that means easy
815 * things are checked first to speed up processing.... however
816 * this means that some packets will manage to get a long way
817 * down this stack and then be rejected, but that's life.
818 */
819 if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
820 (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
821 (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
822 *related = 0;
823 return NF_ACCEPT;
824 }
825
826 /* Now find the contained IP header */
827 offset += sizeof(_icmph);
828 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
829 if (cih == NULL)
830 return NF_ACCEPT; /* The packet looks wrong, ignore */
831
832 pp = ip_vs_proto_get(cih->nexthdr);
833 if (!pp)
834 return NF_ACCEPT;
835
836 /* Is the embedded protocol header present? */
837 /* TODO: we don't support fragmentation at the moment anyways */
838 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
839 return NF_ACCEPT;
840
841 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
842
843 offset += sizeof(struct ipv6hdr);
844
845 ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
846 /* The embedded headers contain source and dest in reverse order */
847 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
848 if (!cp)
849 return NF_ACCEPT;
850
851 ipv6_addr_copy(&snet.in6, &iph->saddr);
852 return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
853 pp, offset, sizeof(struct ipv6hdr));
854}
855#endif
856
857static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
858{
859 struct tcphdr _tcph, *th;
860
861 th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
862 if (th == NULL)
863 return 0;
864 return th->rst;
865}
866
867/* Handle response packets: rewrite addresses and send away...
868 * Used for NAT and local client.
869 */
870static unsigned int
871handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
872 struct ip_vs_conn *cp, int ihl)
873{
874 IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
875
876 if (!skb_make_writable(skb, ihl))
877 goto drop;
878
879 /* mangle the packet */
880 if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
881 goto drop;
882
883#ifdef CONFIG_IP_VS_IPV6
884 if (af == AF_INET6)
885 ipv6_hdr(skb)->saddr = cp->vaddr.in6;
886 else
887#endif
888 {
889 ip_hdr(skb)->saddr = cp->vaddr.ip;
890 ip_send_check(ip_hdr(skb));
891 }
892
893 /* For policy routing, packets originating from this
894 * machine itself may be routed differently to packets
895 * passing through. We want this packet to be routed as
896 * if it came from this machine itself. So re-compute
897 * the routing information.
898 */
899#ifdef CONFIG_IP_VS_IPV6
900 if (af == AF_INET6) {
901 if (ip6_route_me_harder(skb) != 0)
902 goto drop;
903 } else
904#endif
905 if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
906 goto drop;
907
908 IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
909
910 ip_vs_out_stats(cp, skb);
911 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
912 ip_vs_conn_put(cp);
913
914 skb->ipvs_property = 1;
915
916 LeaveFunction(11);
917 return NF_ACCEPT;
918
919drop:
920 ip_vs_conn_put(cp);
921 kfree_skb(skb);
922 return NF_STOLEN;
923}
924
925/*
926 * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
927 * Check if outgoing packet belongs to the established ip_vs_conn.
928 */
929static unsigned int
930ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
931 const struct net_device *in, const struct net_device *out,
932 int (*okfn)(struct sk_buff *))
933{
934 struct ip_vs_iphdr iph;
935 struct ip_vs_protocol *pp;
936 struct ip_vs_conn *cp;
937 int af;
938
939 EnterFunction(11);
940
941 af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
942
943 if (skb->ipvs_property)
944 return NF_ACCEPT;
945
946 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
947#ifdef CONFIG_IP_VS_IPV6
948 if (af == AF_INET6) {
949 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
950 int related, verdict = ip_vs_out_icmp_v6(skb, &related);
951
952 if (related)
953 return verdict;
954 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
955 }
956 } else
957#endif
958 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
959 int related, verdict = ip_vs_out_icmp(skb, &related);
960
961 if (related)
962 return verdict;
963 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
964 }
965
966 pp = ip_vs_proto_get(iph.protocol);
967 if (unlikely(!pp))
968 return NF_ACCEPT;
969
970 /* reassemble IP fragments */
971#ifdef CONFIG_IP_VS_IPV6
972 if (af == AF_INET6) {
973 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
974 int related, verdict = ip_vs_out_icmp_v6(skb, &related);
975
976 if (related)
977 return verdict;
978
979 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
980 }
981 } else
982#endif
983 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
984 !pp->dont_defrag)) {
985 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
986 return NF_STOLEN;
987
988 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
989 }
990
991 /*
992 * Check if the packet belongs to an existing entry
993 */
994 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
995
996 if (unlikely(!cp)) {
997 if (sysctl_ip_vs_nat_icmp_send &&
998 (pp->protocol == IPPROTO_TCP ||
999 pp->protocol == IPPROTO_UDP)) {
1000 __be16 _ports[2], *pptr;
1001
1002 pptr = skb_header_pointer(skb, iph.len,
1003 sizeof(_ports), _ports);
1004 if (pptr == NULL)
1005 return NF_ACCEPT; /* Not for me */
1006 if (ip_vs_lookup_real_service(af, iph.protocol,
1007 &iph.saddr,
1008 pptr[0])) {
1009 /*
 1010	 * Notify the real server that there
 1011	 * is no existing entry, unless the
 1012	 * packet is a TCP RST.
1013 */
1014 if (iph.protocol != IPPROTO_TCP
1015 || !is_tcp_reset(skb, iph.len)) {
1016#ifdef CONFIG_IP_VS_IPV6
1017 if (af == AF_INET6)
1018 icmpv6_send(skb,
1019 ICMPV6_DEST_UNREACH,
1020 ICMPV6_PORT_UNREACH,
1021 0, skb->dev);
1022 else
1023#endif
1024 icmp_send(skb,
1025 ICMP_DEST_UNREACH,
1026 ICMP_PORT_UNREACH, 0);
1027 return NF_DROP;
1028 }
1029 }
1030 }
1031 IP_VS_DBG_PKT(12, pp, skb, 0,
1032 "packet continues traversal as normal");
1033 return NF_ACCEPT;
1034 }
1035
1036 return handle_response(af, skb, pp, cp, iph.len);
1037}
1038
1039
1040/*
1041 * Handle ICMP messages in the outside-to-inside direction (incoming).
1042 * Find any that might be relevant, check against existing connections,
1043 * forward to the right destination host if relevant.
1044 * Currently handles error types - unreachable, quench, ttl exceeded.
1045 */
1046static int
1047ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1048{
1049 struct iphdr *iph;
1050 struct icmphdr _icmph, *ic;
1051 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
1052 struct ip_vs_iphdr ciph;
1053 struct ip_vs_conn *cp;
1054 struct ip_vs_protocol *pp;
1055 unsigned int offset, ihl, verdict;
1056 union nf_inet_addr snet;
1057
1058 *related = 1;
1059
1060 /* reassemble IP fragments */
1061 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1062 if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
1063 IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
1064 return NF_STOLEN;
1065 }
1066
1067 iph = ip_hdr(skb);
1068 offset = ihl = iph->ihl * 4;
1069 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1070 if (ic == NULL)
1071 return NF_DROP;
1072
1073 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
1074 ic->type, ntohs(icmp_id(ic)),
1075 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
1076
1077 /*
1078 * Work through seeing if this is for us.
1079 * These checks are supposed to be in an order that means easy
1080 * things are checked first to speed up processing.... however
1081 * this means that some packets will manage to get a long way
1082 * down this stack and then be rejected, but that's life.
1083 */
1084 if ((ic->type != ICMP_DEST_UNREACH) &&
1085 (ic->type != ICMP_SOURCE_QUENCH) &&
1086 (ic->type != ICMP_TIME_EXCEEDED)) {
1087 *related = 0;
1088 return NF_ACCEPT;
1089 }
1090
1091 /* Now find the contained IP header */
1092 offset += sizeof(_icmph);
1093 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1094 if (cih == NULL)
1095 return NF_ACCEPT; /* The packet looks wrong, ignore */
1096
1097 pp = ip_vs_proto_get(cih->protocol);
1098 if (!pp)
1099 return NF_ACCEPT;
1100
1101 /* Is the embedded protocol header present? */
1102 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1103 pp->dont_defrag))
1104 return NF_ACCEPT;
1105
1106 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
1107
1108 offset += cih->ihl * 4;
1109
1110 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1111 /* The embedded headers contain source and dest in reverse order */
1112 cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
1113 if (!cp) {
1114 /* The packet could also belong to a local client */
1115 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
1116 if (cp) {
1117 snet.ip = iph->saddr;
1118 return handle_response_icmp(AF_INET, skb, &snet,
1119 cih->protocol, cp, pp,
1120 offset, ihl);
1121 }
1122 return NF_ACCEPT;
1123 }
1124
1125 verdict = NF_DROP;
1126
1127 /* Ensure the checksum is correct */
1128 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1129 /* Failed checksum! */
1130 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
1131 NIPQUAD(iph->saddr));
1132 goto out;
1133 }
1134
1135 /* do the statistics and put it back */
1136 ip_vs_in_stats(cp, skb);
1137 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1138 offset += 2 * sizeof(__u16);
1139 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
1140 /* do not touch skb anymore */
1141
1142 out:
1143 __ip_vs_conn_put(cp);
1144
1145 return verdict;
1146}
1147
1148#ifdef CONFIG_IP_VS_IPV6
1149static int
1150ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1151{
1152 struct ipv6hdr *iph;
1153 struct icmp6hdr _icmph, *ic;
1154 struct ipv6hdr _ciph, *cih; /* The ip header contained
1155 within the ICMP */
1156 struct ip_vs_iphdr ciph;
1157 struct ip_vs_conn *cp;
1158 struct ip_vs_protocol *pp;
1159 unsigned int offset, verdict;
1160 union nf_inet_addr snet;
1161
1162 *related = 1;
1163
1164 /* reassemble IP fragments */
1165 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1166 if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
1167 IP_DEFRAG_VS_IN :
1168 IP_DEFRAG_VS_FWD))
1169 return NF_STOLEN;
1170 }
1171
1172 iph = ipv6_hdr(skb);
1173 offset = sizeof(struct ipv6hdr);
1174 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1175 if (ic == NULL)
1176 return NF_DROP;
1177
1178 IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
1179 ic->icmp6_type, ntohs(icmpv6_id(ic)),
1180 NIP6(iph->saddr), NIP6(iph->daddr));
1181
1182 /*
1183 * Work through seeing if this is for us.
1184 * These checks are supposed to be in an order that means easy
1185 * things are checked first to speed up processing.... however
1186 * this means that some packets will manage to get a long way
1187 * down this stack and then be rejected, but that's life.
1188 */
1189 if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
1190 (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
1191 (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
1192 *related = 0;
1193 return NF_ACCEPT;
1194 }
1195
1196 /* Now find the contained IP header */
1197 offset += sizeof(_icmph);
1198 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1199 if (cih == NULL)
1200 return NF_ACCEPT; /* The packet looks wrong, ignore */
1201
1202 pp = ip_vs_proto_get(cih->nexthdr);
1203 if (!pp)
1204 return NF_ACCEPT;
1205
1206 /* Is the embedded protocol header present? */
1207 /* TODO: we don't support fragmentation at the moment anyways */
1208 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
1209 return NF_ACCEPT;
1210
1211 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
1212
1213 offset += sizeof(struct ipv6hdr);
1214
1215 ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
1216 /* The embedded headers contain source and dest in reverse order */
1217 cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
1218 if (!cp) {
1219 /* The packet could also belong to a local client */
1220 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
1221 if (cp) {
1222 ipv6_addr_copy(&snet.in6, &iph->saddr);
1223 return handle_response_icmp(AF_INET6, skb, &snet,
1224 cih->nexthdr,
1225 cp, pp, offset,
1226 sizeof(struct ipv6hdr));
1227 }
1228 return NF_ACCEPT;
1229 }
1230
1231 verdict = NF_DROP;
1232
1233 /* do the statistics and put it back */
1234 ip_vs_in_stats(cp, skb);
1235 if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr)
1236 offset += 2 * sizeof(__u16);
1237 verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
1238 /* do not touch skb anymore */
1239
1240 __ip_vs_conn_put(cp);
1241
1242 return verdict;
1243}
1244#endif
1245
1246
1247/*
1248 * Check if it's for virtual services, look it up,
1249 * and send it on its way...
1250 */
1251static unsigned int
1252ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1253 const struct net_device *in, const struct net_device *out,
1254 int (*okfn)(struct sk_buff *))
1255{
1256 struct ip_vs_iphdr iph;
1257 struct ip_vs_protocol *pp;
1258 struct ip_vs_conn *cp;
1259 int ret, restart, af;
1260
1261 af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
1262
1263 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1264
1265 /*
1266 * Big tappo: only PACKET_HOST, including loopback for local client
1267 * Don't handle local packets on IPv6 for now
1268 */
1269 if (unlikely(skb->pkt_type != PACKET_HOST)) {
1270 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
1271 skb->pkt_type,
1272 iph.protocol,
1273 IP_VS_DBG_ADDR(af, &iph.daddr));
1274 return NF_ACCEPT;
1275 }
1276
1277 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1278 int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
1279
1280 if (related)
1281 return verdict;
1282 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1283 }
1284
1285 /* Protocol supported? */
1286 pp = ip_vs_proto_get(iph.protocol);
1287 if (unlikely(!pp))
1288 return NF_ACCEPT;
1289
1290 /*
1291 * Check if the packet belongs to an existing connection entry
1292 */
1293 cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
1294
1295 if (unlikely(!cp)) {
1296 int v;
1297
1298 /* For local client packets, it could be a response */
1299 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1300 if (cp)
1301 return handle_response(af, skb, pp, cp, iph.len);
1302
1303 if (!pp->conn_schedule(af, skb, pp, &v, &cp))
1304 return v;
1305 }
1306
1307 if (unlikely(!cp)) {
1308 /* sorry, all this trouble for a no-hit :) */
1309 IP_VS_DBG_PKT(12, pp, skb, 0,
1310 "packet continues traversal as normal");
1311 return NF_ACCEPT;
1312 }
1313
1314 IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
1315
1316 /* Check the server status */
1317 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1318 /* the destination server is not available */
1319
1320 if (sysctl_ip_vs_expire_nodest_conn) {
1321 /* try to expire the connection immediately */
1322 ip_vs_conn_expire_now(cp);
1323 }
1324 /* don't restart its timer, and silently
1325 drop the packet. */
1326 __ip_vs_conn_put(cp);
1327 return NF_DROP;
1328 }
1329
1330 ip_vs_in_stats(cp, skb);
1331 restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1332 if (cp->packet_xmit)
1333 ret = cp->packet_xmit(skb, cp, pp);
1334 /* do not touch skb anymore */
1335 else {
1336 IP_VS_DBG_RL("warning: packet_xmit is null");
1337 ret = NF_ACCEPT;
1338 }
1339
 1340	/* Increase its packet counter and check whether it needs
 1341	 * to be synchronized
 1342	 *
 1343	 * Sync the connection if it is about to close, to
 1344	 * encourage the standby servers to update the connection's timeout
1345 */
1346 atomic_inc(&cp->in_pkts);
1347 if (af == AF_INET &&
1348 (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1349 (((cp->protocol != IPPROTO_TCP ||
1350 cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1351 (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
1352 == sysctl_ip_vs_sync_threshold[0])) ||
1353 ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
1354 ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
1355 (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
1356 (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
1357 ip_vs_sync_conn(cp);
1358 cp->old_state = cp->state;
1359
1360 ip_vs_conn_put(cp);
1361 return ret;
1362}
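
The sync decision at the end of ip_vs_in() above combines a periodic counter test with a close-transition test. With the default sysctl_ip_vs_sync_threshold of { 3, 50 } (its definition appears in ip_vs_ctl.c later in this diff), a master director syncs an established connection on its 3rd, 53rd, 103rd, ... packet, and additionally whenever a TCP connection changes state into FIN_WAIT, CLOSE_WAIT or TIME_WAIT. A condensed stand-alone sketch of that predicate (parameter names are illustrative, not the kernel's):

	#include <stdio.h>

	/* hypothetical user-space mirror of the in-kernel sync test */
	static int should_sync(int is_master, int proto_tcp, int established,
			       int in_pkts, int state_changed, int closing,
			       const int threshold[2])
	{
		if (!is_master)
			return 0;
		if ((!proto_tcp || established) &&
		    in_pkts % threshold[1] == threshold[0])
			return 1;				/* periodic sync */
		return proto_tcp && state_changed && closing;	/* sync on close */
	}

	int main(void)
	{
		const int threshold[2] = { 3, 50 };	/* default sysctl values */
		int pkts;

		for (pkts = 1; pkts <= 60; pkts++)
			if (should_sync(1, 1, 1, pkts, 0, 0, threshold))
				printf("sync at packet %d\n", pkts);	/* 3, 53 */
		return 0;
	}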
1363
1364
1365/*
1366 * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1367 * related packets destined for 0.0.0.0/0.
 1368 * When a fwmark-based virtual service is used, such as a transparent
 1369 * cache cluster, TCP packets can be marked and routed to ip_vs_in,
 1370 * but ICMP destined for 0.0.0.0/0 cannot be easily marked and
1371 * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1372 * and send them to ip_vs_in_icmp.
1373 */
1374static unsigned int
1375ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1376 const struct net_device *in, const struct net_device *out,
1377 int (*okfn)(struct sk_buff *))
1378{
1379 int r;
1380
1381 if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1382 return NF_ACCEPT;
1383
1384 return ip_vs_in_icmp(skb, &r, hooknum);
1385}
1386
1387#ifdef CONFIG_IP_VS_IPV6
1388static unsigned int
1389ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1390 const struct net_device *in, const struct net_device *out,
1391 int (*okfn)(struct sk_buff *))
1392{
1393 int r;
1394
1395 if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1396 return NF_ACCEPT;
1397
1398 return ip_vs_in_icmp_v6(skb, &r, hooknum);
1399}
1400#endif
1401
1402
1403static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1404 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1405 * or VS/NAT(change destination), so that filtering rules can be
1406 * applied to IPVS. */
1407 {
1408 .hook = ip_vs_in,
1409 .owner = THIS_MODULE,
1410 .pf = PF_INET,
1411 .hooknum = NF_INET_LOCAL_IN,
1412 .priority = 100,
1413 },
1414 /* After packet filtering, change source only for VS/NAT */
1415 {
1416 .hook = ip_vs_out,
1417 .owner = THIS_MODULE,
1418 .pf = PF_INET,
1419 .hooknum = NF_INET_FORWARD,
1420 .priority = 100,
1421 },
1422 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1423 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1424 {
1425 .hook = ip_vs_forward_icmp,
1426 .owner = THIS_MODULE,
1427 .pf = PF_INET,
1428 .hooknum = NF_INET_FORWARD,
1429 .priority = 99,
1430 },
1431 /* Before the netfilter connection tracking, exit from POST_ROUTING */
1432 {
1433 .hook = ip_vs_post_routing,
1434 .owner = THIS_MODULE,
1435 .pf = PF_INET,
1436 .hooknum = NF_INET_POST_ROUTING,
1437 .priority = NF_IP_PRI_NAT_SRC-1,
1438 },
1439#ifdef CONFIG_IP_VS_IPV6
1440 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1441 * or VS/NAT(change destination), so that filtering rules can be
1442 * applied to IPVS. */
1443 {
1444 .hook = ip_vs_in,
1445 .owner = THIS_MODULE,
1446 .pf = PF_INET6,
1447 .hooknum = NF_INET_LOCAL_IN,
1448 .priority = 100,
1449 },
1450 /* After packet filtering, change source only for VS/NAT */
1451 {
1452 .hook = ip_vs_out,
1453 .owner = THIS_MODULE,
1454 .pf = PF_INET6,
1455 .hooknum = NF_INET_FORWARD,
1456 .priority = 100,
1457 },
1458 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1459 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1460 {
1461 .hook = ip_vs_forward_icmp_v6,
1462 .owner = THIS_MODULE,
1463 .pf = PF_INET6,
1464 .hooknum = NF_INET_FORWARD,
1465 .priority = 99,
1466 },
1467 /* Before the netfilter connection tracking, exit from POST_ROUTING */
1468 {
1469 .hook = ip_vs_post_routing,
1470 .owner = THIS_MODULE,
1471 .pf = PF_INET6,
1472 .hooknum = NF_INET_POST_ROUTING,
1473 .priority = NF_IP6_PRI_NAT_SRC-1,
1474 },
1475#endif
1476};
1477
1478
1479/*
1480 * Initialize IP Virtual Server
1481 */
1482static int __init ip_vs_init(void)
1483{
1484 int ret;
1485
1486 ip_vs_estimator_init();
1487
1488 ret = ip_vs_control_init();
1489 if (ret < 0) {
1490 IP_VS_ERR("can't setup control.\n");
1491 goto cleanup_estimator;
1492 }
1493
1494 ip_vs_protocol_init();
1495
1496 ret = ip_vs_app_init();
1497 if (ret < 0) {
1498 IP_VS_ERR("can't setup application helper.\n");
1499 goto cleanup_protocol;
1500 }
1501
1502 ret = ip_vs_conn_init();
1503 if (ret < 0) {
1504 IP_VS_ERR("can't setup connection table.\n");
1505 goto cleanup_app;
1506 }
1507
1508 ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1509 if (ret < 0) {
1510 IP_VS_ERR("can't register hooks.\n");
1511 goto cleanup_conn;
1512 }
1513
1514 IP_VS_INFO("ipvs loaded.\n");
1515 return ret;
1516
1517 cleanup_conn:
1518 ip_vs_conn_cleanup();
1519 cleanup_app:
1520 ip_vs_app_cleanup();
1521 cleanup_protocol:
1522 ip_vs_protocol_cleanup();
1523 ip_vs_control_cleanup();
1524 cleanup_estimator:
1525 ip_vs_estimator_cleanup();
1526 return ret;
1527}
1528
1529static void __exit ip_vs_cleanup(void)
1530{
1531 nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1532 ip_vs_conn_cleanup();
1533 ip_vs_app_cleanup();
1534 ip_vs_protocol_cleanup();
1535 ip_vs_control_cleanup();
1536 ip_vs_estimator_cleanup();
1537 IP_VS_INFO("ipvs unloaded.\n");
1538}
1539
1540module_init(ip_vs_init);
1541module_exit(ip_vs_cleanup);
1542MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
deleted file mode 100644
index 771551d8fba9..000000000000
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ /dev/null
@@ -1,3441 +0,0 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 * Julian Anastasov <ja@ssi.bg>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * Changes:
18 *
19 */
20
21#include <linux/module.h>
22#include <linux/init.h>
23#include <linux/types.h>
24#include <linux/capability.h>
25#include <linux/fs.h>
26#include <linux/sysctl.h>
27#include <linux/proc_fs.h>
28#include <linux/workqueue.h>
29#include <linux/swap.h>
30#include <linux/seq_file.h>
31
32#include <linux/netfilter.h>
33#include <linux/netfilter_ipv4.h>
34#include <linux/mutex.h>
35
36#include <net/net_namespace.h>
37#include <net/ip.h>
38#ifdef CONFIG_IP_VS_IPV6
39#include <net/ipv6.h>
40#include <net/ip6_route.h>
41#endif
42#include <net/route.h>
43#include <net/sock.h>
44#include <net/genetlink.h>
45
46#include <asm/uaccess.h>
47
48#include <net/ip_vs.h>
49
 50/* mutex for IPVS sockopts. And, [gs]etsockopt may sleep. */
51static DEFINE_MUTEX(__ip_vs_mutex);
52
53/* lock for service table */
54static DEFINE_RWLOCK(__ip_vs_svc_lock);
55
56/* lock for table with the real services */
57static DEFINE_RWLOCK(__ip_vs_rs_lock);
58
59/* lock for state and timeout tables */
60static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
61
62/* lock for drop entry handling */
63static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
64
65/* lock for drop packet handling */
66static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
67
68/* 1/rate drop and drop-entry variables */
69int ip_vs_drop_rate = 0;
70int ip_vs_drop_counter = 0;
71static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
72
73/* number of virtual services */
74static int ip_vs_num_services = 0;
75
76/* sysctl variables */
77static int sysctl_ip_vs_drop_entry = 0;
78static int sysctl_ip_vs_drop_packet = 0;
79static int sysctl_ip_vs_secure_tcp = 0;
80static int sysctl_ip_vs_amemthresh = 1024;
81static int sysctl_ip_vs_am_droprate = 10;
82int sysctl_ip_vs_cache_bypass = 0;
83int sysctl_ip_vs_expire_nodest_conn = 0;
84int sysctl_ip_vs_expire_quiescent_template = 0;
85int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
86int sysctl_ip_vs_nat_icmp_send = 0;
87
88
89#ifdef CONFIG_IP_VS_DEBUG
90static int sysctl_ip_vs_debug_level = 0;
91
92int ip_vs_get_debug_level(void)
93{
94 return sysctl_ip_vs_debug_level;
95}
96#endif
97
98#ifdef CONFIG_IP_VS_IPV6
99/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
100static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
101{
102 struct rt6_info *rt;
103 struct flowi fl = {
104 .oif = 0,
105 .nl_u = {
106 .ip6_u = {
107 .daddr = *addr,
108 .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
109 };
110
111 rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
112 if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
113 return 1;
114
115 return 0;
116}
117#endif
118/*
119 * update_defense_level is called from keventd and from sysctl,
120 * so it needs to protect itself from softirqs
121 */
122static void update_defense_level(void)
123{
124 struct sysinfo i;
125 static int old_secure_tcp = 0;
126 int availmem;
127 int nomem;
128 int to_change = -1;
129
130 /* we only count free and buffered memory (in pages) */
131 si_meminfo(&i);
132 availmem = i.freeram + i.bufferram;
 133	/* however, in linux 2.5 i.bufferram is the total page cache size,
 134	   so we need to adjust it */
135 /* si_swapinfo(&i); */
136 /* availmem = availmem - (i.totalswap - i.freeswap); */
137
138 nomem = (availmem < sysctl_ip_vs_amemthresh);
139
140 local_bh_disable();
141
142 /* drop_entry */
143 spin_lock(&__ip_vs_dropentry_lock);
144 switch (sysctl_ip_vs_drop_entry) {
145 case 0:
146 atomic_set(&ip_vs_dropentry, 0);
147 break;
148 case 1:
149 if (nomem) {
150 atomic_set(&ip_vs_dropentry, 1);
151 sysctl_ip_vs_drop_entry = 2;
152 } else {
153 atomic_set(&ip_vs_dropentry, 0);
154 }
155 break;
156 case 2:
157 if (nomem) {
158 atomic_set(&ip_vs_dropentry, 1);
159 } else {
160 atomic_set(&ip_vs_dropentry, 0);
161 sysctl_ip_vs_drop_entry = 1;
162 };
163 break;
164 case 3:
165 atomic_set(&ip_vs_dropentry, 1);
166 break;
167 }
168 spin_unlock(&__ip_vs_dropentry_lock);
169
170 /* drop_packet */
171 spin_lock(&__ip_vs_droppacket_lock);
172 switch (sysctl_ip_vs_drop_packet) {
173 case 0:
174 ip_vs_drop_rate = 0;
175 break;
176 case 1:
177 if (nomem) {
178 ip_vs_drop_rate = ip_vs_drop_counter
179 = sysctl_ip_vs_amemthresh /
180 (sysctl_ip_vs_amemthresh-availmem);
181 sysctl_ip_vs_drop_packet = 2;
182 } else {
183 ip_vs_drop_rate = 0;
184 }
185 break;
186 case 2:
187 if (nomem) {
188 ip_vs_drop_rate = ip_vs_drop_counter
189 = sysctl_ip_vs_amemthresh /
190 (sysctl_ip_vs_amemthresh-availmem);
191 } else {
192 ip_vs_drop_rate = 0;
193 sysctl_ip_vs_drop_packet = 1;
194 }
195 break;
196 case 3:
197 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
198 break;
199 }
200 spin_unlock(&__ip_vs_droppacket_lock);
201
202 /* secure_tcp */
203 write_lock(&__ip_vs_securetcp_lock);
204 switch (sysctl_ip_vs_secure_tcp) {
205 case 0:
206 if (old_secure_tcp >= 2)
207 to_change = 0;
208 break;
209 case 1:
210 if (nomem) {
211 if (old_secure_tcp < 2)
212 to_change = 1;
213 sysctl_ip_vs_secure_tcp = 2;
214 } else {
215 if (old_secure_tcp >= 2)
216 to_change = 0;
217 }
218 break;
219 case 2:
220 if (nomem) {
221 if (old_secure_tcp < 2)
222 to_change = 1;
223 } else {
224 if (old_secure_tcp >= 2)
225 to_change = 0;
226 sysctl_ip_vs_secure_tcp = 1;
227 }
228 break;
229 case 3:
230 if (old_secure_tcp < 2)
231 to_change = 1;
232 break;
233 }
234 old_secure_tcp = sysctl_ip_vs_secure_tcp;
235 if (to_change >= 0)
236 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
237 write_unlock(&__ip_vs_securetcp_lock);
238
239 local_bh_enable();
240}
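
For the drop_packet modes above, the drop rate scales with how far free memory has fallen below the threshold: with the default sysctl_ip_vs_amemthresh of 1024 pages, an availmem of, say, 768 pages gives ip_vs_drop_rate = 1024 / (1024 - 768) = 4, i.e. roughly one new connection request in four is dropped; as availmem approaches zero the rate tends to 1 (drop every request), and once free memory climbs back above the threshold the rate returns to 0.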
241
242
243/*
244 * Timer for checking the defense
245 */
246#define DEFENSE_TIMER_PERIOD 1*HZ
247static void defense_work_handler(struct work_struct *work);
248static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
249
250static void defense_work_handler(struct work_struct *work)
251{
252 update_defense_level();
253 if (atomic_read(&ip_vs_dropentry))
254 ip_vs_random_dropentry();
255
256 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
257}
258
259int
260ip_vs_use_count_inc(void)
261{
262 return try_module_get(THIS_MODULE);
263}
264
265void
266ip_vs_use_count_dec(void)
267{
268 module_put(THIS_MODULE);
269}
270
271
272/*
273 * Hash table: for virtual service lookups
274 */
275#define IP_VS_SVC_TAB_BITS 8
276#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
277#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
278
279/* the service table hashed by <protocol, addr, port> */
280static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
281/* the service table hashed by fwmark */
282static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
283
284/*
285 * Hash table: for real service lookups
286 */
287#define IP_VS_RTAB_BITS 4
288#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
289#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
290
291static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
292
293/*
294 * Trash for destinations
295 */
296static LIST_HEAD(ip_vs_dest_trash);
297
298/*
299 * FTP & NULL virtual service counters
300 */
301static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
302static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
303
304
305/*
306 * Returns hash value for virtual service
307 */
308static __inline__ unsigned
309ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
310 __be16 port)
311{
312 register unsigned porth = ntohs(port);
313 __be32 addr_fold = addr->ip;
314
315#ifdef CONFIG_IP_VS_IPV6
316 if (af == AF_INET6)
317 addr_fold = addr->ip6[0]^addr->ip6[1]^
318 addr->ip6[2]^addr->ip6[3];
319#endif
320
321 return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
322 & IP_VS_SVC_TAB_MASK;
323}
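
A concrete instance of the fold above (service address and port are purely illustrative): for a TCP service on 10.0.0.1 port 80, proto = 6, ntohl(addr_fold) = 0x0a000001 and porth = 80, so the key is (6 ^ 0x0a000001 ^ (80 >> 8) ^ 80) & IP_VS_SVC_TAB_MASK = 0x57, i.e. bucket 87 of the 256-entry service table.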
324
325/*
326 * Returns hash value of fwmark for virtual service lookup
327 */
328static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
329{
330 return fwmark & IP_VS_SVC_TAB_MASK;
331}
332
333/*
334 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
335 * or in the ip_vs_svc_fwm_table by fwmark.
336 * Should be called with locked tables.
337 */
338static int ip_vs_svc_hash(struct ip_vs_service *svc)
339{
340 unsigned hash;
341
342 if (svc->flags & IP_VS_SVC_F_HASHED) {
343 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
344 "called from %p\n", __builtin_return_address(0));
345 return 0;
346 }
347
348 if (svc->fwmark == 0) {
349 /*
350 * Hash it by <protocol,addr,port> in ip_vs_svc_table
351 */
352 hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr,
353 svc->port);
354 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
355 } else {
356 /*
357 * Hash it by fwmark in ip_vs_svc_fwm_table
358 */
359 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
360 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
361 }
362
363 svc->flags |= IP_VS_SVC_F_HASHED;
364 /* increase its refcnt because it is referenced by the svc table */
365 atomic_inc(&svc->refcnt);
366 return 1;
367}
368
369
370/*
371 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
372 * Should be called with locked tables.
373 */
374static int ip_vs_svc_unhash(struct ip_vs_service *svc)
375{
376 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
377 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
378 "called from %p\n", __builtin_return_address(0));
379 return 0;
380 }
381
382 if (svc->fwmark == 0) {
383 /* Remove it from the ip_vs_svc_table table */
384 list_del(&svc->s_list);
385 } else {
386 /* Remove it from the ip_vs_svc_fwm_table table */
387 list_del(&svc->f_list);
388 }
389
390 svc->flags &= ~IP_VS_SVC_F_HASHED;
391 atomic_dec(&svc->refcnt);
392 return 1;
393}
394
395
396/*
397 * Get service by {proto,addr,port} in the service table.
398 */
399static inline struct ip_vs_service *
400__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
401 __be16 vport)
402{
403 unsigned hash;
404 struct ip_vs_service *svc;
405
406 /* Check for "full" addressed entries */
407 hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport);
408
409 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
410 if ((svc->af == af)
411 && ip_vs_addr_equal(af, &svc->addr, vaddr)
412 && (svc->port == vport)
413 && (svc->protocol == protocol)) {
414 /* HIT */
415 atomic_inc(&svc->usecnt);
416 return svc;
417 }
418 }
419
420 return NULL;
421}
422
423
424/*
425 * Get service by {fwmark} in the service table.
426 */
427static inline struct ip_vs_service *
428__ip_vs_svc_fwm_get(int af, __u32 fwmark)
429{
430 unsigned hash;
431 struct ip_vs_service *svc;
432
433 /* Check for fwmark addressed entries */
434 hash = ip_vs_svc_fwm_hashkey(fwmark);
435
436 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
437 if (svc->fwmark == fwmark && svc->af == af) {
438 /* HIT */
439 atomic_inc(&svc->usecnt);
440 return svc;
441 }
442 }
443
444 return NULL;
445}
446
447struct ip_vs_service *
448ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
449 const union nf_inet_addr *vaddr, __be16 vport)
450{
451 struct ip_vs_service *svc;
452
453 read_lock(&__ip_vs_svc_lock);
454
455 /*
456 * Check the table hashed by fwmark first
457 */
458 if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark)))
459 goto out;
460
461 /*
462 * Check the table hashed by <protocol,addr,port>
463 * for "full" addressed entries
464 */
465 svc = __ip_vs_service_get(af, protocol, vaddr, vport);
466
467 if (svc == NULL
468 && protocol == IPPROTO_TCP
469 && atomic_read(&ip_vs_ftpsvc_counter)
470 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
471 /*
472 * Check if ftp service entry exists, the packet
473 * might belong to FTP data connections.
474 */
475 svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT);
476 }
477
478 if (svc == NULL
479 && atomic_read(&ip_vs_nullsvc_counter)) {
480 /*
481 * Check if the catch-all port (port zero) exists
482 */
483 svc = __ip_vs_service_get(af, protocol, vaddr, 0);
484 }
485
486 out:
487 read_unlock(&__ip_vs_svc_lock);
488
489 IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
490 fwmark, ip_vs_proto_name(protocol),
491 IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
492 svc ? "hit" : "not hit");
493
494 return svc;
495}
496
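An added annotation (derived from the code above, not in the original file) summarizing how ip_vs_service_get() resolves a lookup:

/*
 * Lookup order in ip_vs_service_get():
 *   1. by fwmark in ip_vs_svc_fwm_table, when a firewall mark is given;
 *   2. by <protocol, vaddr, vport> in ip_vs_svc_table;
 *   3. for TCP, retry with FTPPORT when FTP services exist and vport is
 *      FTPDATA or an unprivileged port (a possible FTP data connection);
 *   4. finally the port-zero "catch-all" service, if one is configured.
 * The lookup helpers increment svc->usecnt on a hit, so the caller must
 * release the result with ip_vs_service_put().
 */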
497
498static inline void
499__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
500{
501 atomic_inc(&svc->refcnt);
502 dest->svc = svc;
503}
504
505static inline void
506__ip_vs_unbind_svc(struct ip_vs_dest *dest)
507{
508 struct ip_vs_service *svc = dest->svc;
509
510 dest->svc = NULL;
511 if (atomic_dec_and_test(&svc->refcnt))
512 kfree(svc);
513}
514
515
516/*
517 * Returns hash value for real service
518 */
519static inline unsigned ip_vs_rs_hashkey(int af,
520 const union nf_inet_addr *addr,
521 __be16 port)
522{
523 register unsigned porth = ntohs(port);
524 __be32 addr_fold = addr->ip;
525
526#ifdef CONFIG_IP_VS_IPV6
527 if (af == AF_INET6)
528 addr_fold = addr->ip6[0]^addr->ip6[1]^
529 addr->ip6[2]^addr->ip6[3];
530#endif
531
532 return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
533 & IP_VS_RTAB_MASK;
534}
535
536/*
537 * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
538 * should be called with locked tables.
539 */
540static int ip_vs_rs_hash(struct ip_vs_dest *dest)
541{
542 unsigned hash;
543
544 if (!list_empty(&dest->d_list)) {
545 return 0;
546 }
547
548 /*
549 * Hash by proto,addr,port,
550 * which are the parameters of the real service.
551 */
552 hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
553
554 list_add(&dest->d_list, &ip_vs_rtable[hash]);
555
556 return 1;
557}
558
559/*
560 * UNhashes ip_vs_dest from ip_vs_rtable.
561 * should be called with locked tables.
562 */
563static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
564{
565 /*
566 * Remove it from the ip_vs_rtable table.
567 */
568 if (!list_empty(&dest->d_list)) {
569 list_del(&dest->d_list);
570 INIT_LIST_HEAD(&dest->d_list);
571 }
572
573 return 1;
574}
575
576/*
577 * Lookup real service by <proto,addr,port> in the real service table.
578 */
579struct ip_vs_dest *
580ip_vs_lookup_real_service(int af, __u16 protocol,
581 const union nf_inet_addr *daddr,
582 __be16 dport)
583{
584 unsigned hash;
585 struct ip_vs_dest *dest;
586
587 /*
588 * Check for "full" addressed entries
589 * Return the first found entry
590 */
591 hash = ip_vs_rs_hashkey(af, daddr, dport);
592
593 read_lock(&__ip_vs_rs_lock);
594 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
595 if ((dest->af == af)
596 && ip_vs_addr_equal(af, &dest->addr, daddr)
597 && (dest->port == dport)
598 && ((dest->protocol == protocol) ||
599 dest->vfwmark)) {
600 /* HIT */
601 read_unlock(&__ip_vs_rs_lock);
602 return dest;
603 }
604 }
605 read_unlock(&__ip_vs_rs_lock);
606
607 return NULL;
608}
609
610/*
611 * Lookup destination by {addr,port} in the given service
612 */
613static struct ip_vs_dest *
614ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
615 __be16 dport)
616{
617 struct ip_vs_dest *dest;
618
619 /*
620 * Find the destination for the given service
621 */
622 list_for_each_entry(dest, &svc->destinations, n_list) {
623 if ((dest->af == svc->af)
624 && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
625 && (dest->port == dport)) {
626 /* HIT */
627 return dest;
628 }
629 }
630
631 return NULL;
632}
633
634/*
635 * Find destination by {daddr,dport,vaddr,protocol}
636 * Created to be used in ip_vs_process_message() in
637 * the backup synchronization daemon. It finds the
638 * destination to be bound to the received connection
639 * on the backup.
640 *
641 * ip_vs_lookup_real_service() looked promising, but
642 * does not seem to work as expected.
643 */
644struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr,
645 __be16 dport,
646 const union nf_inet_addr *vaddr,
647 __be16 vport, __u16 protocol)
648{
649 struct ip_vs_dest *dest;
650 struct ip_vs_service *svc;
651
652 svc = ip_vs_service_get(af, 0, protocol, vaddr, vport);
653 if (!svc)
654 return NULL;
655 dest = ip_vs_lookup_dest(svc, daddr, dport);
656 if (dest)
657 atomic_inc(&dest->refcnt);
658 ip_vs_service_put(svc);
659 return dest;
660}
661
662/*
663 * Lookup dest by {svc,addr,port} in the destination trash.
664 * The destination trash is used to hold the destinations that are removed
665 * from the service table but are still referenced by some conn entries.
666 * The reason for the destination trash is that a dest may be taken down
667 * only temporarily (either by the administrator or by a monitor program);
668 * such a dest can then be picked back from the trash, the remaining
669 * connections to it can continue, and its counting information is also
670 * useful for scheduling.
671 */
672static struct ip_vs_dest *
673ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
674 __be16 dport)
675{
676 struct ip_vs_dest *dest, *nxt;
677
678 /*
679 * Find the destination in trash
680 */
681 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
682 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
683 "dest->refcnt=%d\n",
684 dest->vfwmark,
685 IP_VS_DBG_ADDR(svc->af, &dest->addr),
686 ntohs(dest->port),
687 atomic_read(&dest->refcnt));
688 if (dest->af == svc->af &&
689 ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
690 dest->port == dport &&
691 dest->vfwmark == svc->fwmark &&
692 dest->protocol == svc->protocol &&
693 (svc->fwmark ||
694 (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
695 dest->vport == svc->port))) {
696 /* HIT */
697 return dest;
698 }
699
700 /*
701 * Try to purge the destination from trash if not referenced
702 */
703 if (atomic_read(&dest->refcnt) == 1) {
704 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
705 "from trash\n",
706 dest->vfwmark,
707 IP_VS_DBG_ADDR(svc->af, &dest->addr),
708 ntohs(dest->port));
709 list_del(&dest->n_list);
710 ip_vs_dst_reset(dest);
711 __ip_vs_unbind_svc(dest);
712 kfree(dest);
713 }
714 }
715
716 return NULL;
717}
718
719
720/*
721 * Clean up all the destinations in the trash
722 * Called by the ip_vs_control_cleanup()
723 *
724 * When ip_vs_control_cleanup is called at ipvs module exit,
725 * the service tables must already have been flushed, all the connections
726 * have expired, and the refcnt of each destination in the trash must
727 * be 1, so we simply release them here.
728 */
729static void ip_vs_trash_cleanup(void)
730{
731 struct ip_vs_dest *dest, *nxt;
732
733 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
734 list_del(&dest->n_list);
735 ip_vs_dst_reset(dest);
736 __ip_vs_unbind_svc(dest);
737 kfree(dest);
738 }
739}
740
741
742static void
743ip_vs_zero_stats(struct ip_vs_stats *stats)
744{
745 spin_lock_bh(&stats->lock);
746
747 memset(&stats->ustats, 0, sizeof(stats->ustats));
748 ip_vs_zero_estimator(stats);
749
750 spin_unlock_bh(&stats->lock);
751}
752
753/*
754 * Update a destination in the given service
755 */
756static void
757__ip_vs_update_dest(struct ip_vs_service *svc,
758 struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest)
759{
760 int conn_flags;
761
762 /* set the weight and the flags */
763 atomic_set(&dest->weight, udest->weight);
764 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
765
766 /* check if local node and update the flags */
767#ifdef CONFIG_IP_VS_IPV6
768 if (svc->af == AF_INET6) {
769 if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) {
770 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
771 | IP_VS_CONN_F_LOCALNODE;
772 }
773 } else
774#endif
775 if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
776 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
777 | IP_VS_CONN_F_LOCALNODE;
778 }
779
780 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
781 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
782 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
783 } else {
784 /*
785 * Put the real service in ip_vs_rtable if not present.
786 * For now only for NAT!
787 */
788 write_lock_bh(&__ip_vs_rs_lock);
789 ip_vs_rs_hash(dest);
790 write_unlock_bh(&__ip_vs_rs_lock);
791 }
792 atomic_set(&dest->conn_flags, conn_flags);
793
794 /* bind the service */
795 if (!dest->svc) {
796 __ip_vs_bind_svc(dest, svc);
797 } else {
798 if (dest->svc != svc) {
799 __ip_vs_unbind_svc(dest);
800 ip_vs_zero_stats(&dest->stats);
801 __ip_vs_bind_svc(dest, svc);
802 }
803 }
804
805 /* set the dest status flags */
806 dest->flags |= IP_VS_DEST_F_AVAILABLE;
807
808 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
809 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
810 dest->u_threshold = udest->u_threshold;
811 dest->l_threshold = udest->l_threshold;
812}
813
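An added annotation restating what __ip_vs_update_dest() above decides, taken directly from its code:

/*
 * __ip_vs_update_dest() applies a userspace request to a dest:
 *   - a destination address that is local gets IP_VS_CONN_F_LOCALNODE;
 *   - only the masquerading/NAT forwarding method keeps the dest hashed
 *     in ip_vs_rtable; any other method sets IP_VS_CONN_F_NOOUTPUT;
 *   - an upper threshold of 0, or one raised above the current value,
 *     clears IP_VS_DEST_F_OVERLOAD so the dest can be scheduled again;
 *   - the dest is (re)bound to the service, zeroing its stats if it was
 *     previously bound to a different service.
 */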
814
815/*
816 * Create a destination for the given service
817 */
818static int
819ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
820 struct ip_vs_dest **dest_p)
821{
822 struct ip_vs_dest *dest;
823 unsigned atype;
824
825 EnterFunction(2);
826
827#ifdef CONFIG_IP_VS_IPV6
828 if (svc->af == AF_INET6) {
829 atype = ipv6_addr_type(&udest->addr.in6);
830 if ((!(atype & IPV6_ADDR_UNICAST) ||
831 atype & IPV6_ADDR_LINKLOCAL) &&
832 !__ip_vs_addr_is_local_v6(&udest->addr.in6))
833 return -EINVAL;
834 } else
835#endif
836 {
837 atype = inet_addr_type(&init_net, udest->addr.ip);
838 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
839 return -EINVAL;
840 }
841
842 dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
843 if (dest == NULL) {
844 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
845 return -ENOMEM;
846 }
847
848 dest->af = svc->af;
849 dest->protocol = svc->protocol;
850 dest->vaddr = svc->addr;
851 dest->vport = svc->port;
852 dest->vfwmark = svc->fwmark;
853 ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
854 dest->port = udest->port;
855
856 atomic_set(&dest->activeconns, 0);
857 atomic_set(&dest->inactconns, 0);
858 atomic_set(&dest->persistconns, 0);
859 atomic_set(&dest->refcnt, 0);
860
861 INIT_LIST_HEAD(&dest->d_list);
862 spin_lock_init(&dest->dst_lock);
863 spin_lock_init(&dest->stats.lock);
864 __ip_vs_update_dest(svc, dest, udest);
865 ip_vs_new_estimator(&dest->stats);
866
867 *dest_p = dest;
868
869 LeaveFunction(2);
870 return 0;
871}
872
873
874/*
875 * Add a destination into an existing service
876 */
877static int
878ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
879{
880 struct ip_vs_dest *dest;
881 union nf_inet_addr daddr;
882 __be16 dport = udest->port;
883 int ret;
884
885 EnterFunction(2);
886
887 if (udest->weight < 0) {
888 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
889 return -ERANGE;
890 }
891
892 if (udest->l_threshold > udest->u_threshold) {
893 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
894 "upper threshold\n");
895 return -ERANGE;
896 }
897
898 ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
899
900 /*
901 * Check if the dest already exists in the list
902 */
903 dest = ip_vs_lookup_dest(svc, &daddr, dport);
904
905 if (dest != NULL) {
906 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
907 return -EEXIST;
908 }
909
910 /*
911 * Check if the dest already exists in the trash and
912 * is from the same service
913 */
914 dest = ip_vs_trash_get_dest(svc, &daddr, dport);
915
916 if (dest != NULL) {
917 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
918 "dest->refcnt=%d, service %u/%s:%u\n",
919 IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
920 atomic_read(&dest->refcnt),
921 dest->vfwmark,
922 IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
923 ntohs(dest->vport));
924
925 __ip_vs_update_dest(svc, dest, udest);
926
927 /*
928 * Get the destination from the trash
929 */
930 list_del(&dest->n_list);
931
932 ip_vs_new_estimator(&dest->stats);
933
934 write_lock_bh(&__ip_vs_svc_lock);
935
936 /*
937 * Wait until all other svc users go away.
938 */
939 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
940
941 list_add(&dest->n_list, &svc->destinations);
942 svc->num_dests++;
943
944 /* call the update_service function of its scheduler */
945 if (svc->scheduler->update_service)
946 svc->scheduler->update_service(svc);
947
948 write_unlock_bh(&__ip_vs_svc_lock);
949 return 0;
950 }
951
952 /*
953 * Allocate and initialize the dest structure
954 */
955 ret = ip_vs_new_dest(svc, udest, &dest);
956 if (ret) {
957 return ret;
958 }
959
960 /*
961 * Add the dest entry into the list
962 */
963 atomic_inc(&dest->refcnt);
964
965 write_lock_bh(&__ip_vs_svc_lock);
966
967 /*
968 * Wait until all other svc users go away.
969 */
970 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
971
972 list_add(&dest->n_list, &svc->destinations);
973 svc->num_dests++;
974
975 /* call the update_service function of its scheduler */
976 if (svc->scheduler->update_service)
977 svc->scheduler->update_service(svc);
978
979 write_unlock_bh(&__ip_vs_svc_lock);
980
981 LeaveFunction(2);
982
983 return 0;
984}
985
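An added note on the IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1) pattern used above and in the other service editors:

/*
 * Writers hold __ip_vs_svc_lock for writing and spin until usecnt drops
 * back to 1, i.e. until the only remaining reference is the one held by
 * the current caller (taken by the lookup in do_ip_vs_set_ctl(), or set
 * at creation time in ip_vs_add_service()). Readers bump usecnt under
 * the read lock and release it with ip_vs_service_put(), so when the
 * wait finishes no other context is using the service and its
 * destination list may be modified.
 */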
986
987/*
988 * Edit a destination in the given service
989 */
990static int
991ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
992{
993 struct ip_vs_dest *dest;
994 union nf_inet_addr daddr;
995 __be16 dport = udest->port;
996
997 EnterFunction(2);
998
999 if (udest->weight < 0) {
1000 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
1001 return -ERANGE;
1002 }
1003
1004 if (udest->l_threshold > udest->u_threshold) {
1005 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
1006 "upper threshold\n");
1007 return -ERANGE;
1008 }
1009
1010 ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
1011
1012 /*
1013 * Lookup the destination list
1014 */
1015 dest = ip_vs_lookup_dest(svc, &daddr, dport);
1016
1017 if (dest == NULL) {
1018 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
1019 return -ENOENT;
1020 }
1021
1022 __ip_vs_update_dest(svc, dest, udest);
1023
1024 write_lock_bh(&__ip_vs_svc_lock);
1025
1026 /* Wait until all other svc users go away */
1027 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1028
1029 /* call the update_service, because server weight may be changed */
1030 if (svc->scheduler->update_service)
1031 svc->scheduler->update_service(svc);
1032
1033 write_unlock_bh(&__ip_vs_svc_lock);
1034
1035 LeaveFunction(2);
1036
1037 return 0;
1038}
1039
1040
1041/*
1042 * Delete a destination (must be already unlinked from the service)
1043 */
1044static void __ip_vs_del_dest(struct ip_vs_dest *dest)
1045{
1046 ip_vs_kill_estimator(&dest->stats);
1047
1048 /*
1049 * Remove it from the d-linked list with the real services.
1050 */
1051 write_lock_bh(&__ip_vs_rs_lock);
1052 ip_vs_rs_unhash(dest);
1053 write_unlock_bh(&__ip_vs_rs_lock);
1054
1055 /*
1056 * Decrease the refcnt of the dest, and free the dest
1057 * if nobody refers to it (refcnt=0). Otherwise, throw
1058 * the destination into the trash.
1059 */
1060 if (atomic_dec_and_test(&dest->refcnt)) {
1061 ip_vs_dst_reset(dest);
1062 /* simply decrease svc->refcnt here, let the caller check
1063 and release the service if nobody refers to it.
1064 Only user context can release destination and service,
1065 and only one user context can update virtual service at a
1066 time, so the operation here is OK */
1067 atomic_dec(&dest->svc->refcnt);
1068 kfree(dest);
1069 } else {
1070 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
1071 "dest->refcnt=%d\n",
1072 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1073 ntohs(dest->port),
1074 atomic_read(&dest->refcnt));
1075 list_add(&dest->n_list, &ip_vs_dest_trash);
1076 atomic_inc(&dest->refcnt);
1077 }
1078}
1079
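An added summary of the destination reference counting implemented by the functions above:

/*
 * - ip_vs_new_dest() starts a dest with refcnt 0; ip_vs_add_dest() takes
 *   one reference before linking it into svc->destinations;
 * - __ip_vs_del_dest() drops that reference: if it was the last one the
 *   dest is freed, otherwise the dest is moved to ip_vs_dest_trash with
 *   an extra reference held on behalf of the trash list;
 * - ip_vs_trash_get_dest() either revives a matching dest for reuse or
 *   purges trash entries whose refcnt has fallen back to 1 (trash only).
 */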
1080
1081/*
1082 * Unlink a destination from the given service
1083 */
1084static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1085 struct ip_vs_dest *dest,
1086 int svcupd)
1087{
1088 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1089
1090 /*
1091 * Remove it from the d-linked destination list.
1092 */
1093 list_del(&dest->n_list);
1094 svc->num_dests--;
1095
1096 /*
1097 * Call the update_service function of its scheduler
1098 */
1099 if (svcupd && svc->scheduler->update_service)
1100 svc->scheduler->update_service(svc);
1101}
1102
1103
1104/*
1105 * Delete a destination server in the given service
1106 */
1107static int
1108ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1109{
1110 struct ip_vs_dest *dest;
1111 __be16 dport = udest->port;
1112
1113 EnterFunction(2);
1114
1115 dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
1116
1117 if (dest == NULL) {
1118 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1119 return -ENOENT;
1120 }
1121
1122 write_lock_bh(&__ip_vs_svc_lock);
1123
1124 /*
1125 * Wait until all other svc users go away.
1126 */
1127 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1128
1129 /*
1130 * Unlink dest from the service
1131 */
1132 __ip_vs_unlink_dest(svc, dest, 1);
1133
1134 write_unlock_bh(&__ip_vs_svc_lock);
1135
1136 /*
1137 * Delete the destination
1138 */
1139 __ip_vs_del_dest(dest);
1140
1141 LeaveFunction(2);
1142
1143 return 0;
1144}
1145
1146
1147/*
1148 * Add a service into the service hash table
1149 */
1150static int
1151ip_vs_add_service(struct ip_vs_service_user_kern *u,
1152 struct ip_vs_service **svc_p)
1153{
1154 int ret = 0;
1155 struct ip_vs_scheduler *sched = NULL;
1156 struct ip_vs_service *svc = NULL;
1157
1158 /* increase the module use count */
1159 ip_vs_use_count_inc();
1160
1161 /* Lookup the scheduler by 'u->sched_name' */
1162 sched = ip_vs_scheduler_get(u->sched_name);
1163 if (sched == NULL) {
1164 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1165 u->sched_name);
1166 ret = -ENOENT;
1167 goto out_mod_dec;
1168 }
1169
1170#ifdef CONFIG_IP_VS_IPV6
1171 if (u->af == AF_INET6) {
1172 if (!sched->supports_ipv6) {
1173 ret = -EAFNOSUPPORT;
1174 goto out_err;
1175 }
1176 if ((u->netmask < 1) || (u->netmask > 128)) {
1177 ret = -EINVAL;
1178 goto out_err;
1179 }
1180 }
1181#endif
1182
1183 svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1184 if (svc == NULL) {
1185 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1186 ret = -ENOMEM;
1187 goto out_err;
1188 }
1189
1190 /* I'm the first user of the service */
1191 atomic_set(&svc->usecnt, 1);
1192 atomic_set(&svc->refcnt, 0);
1193
1194 svc->af = u->af;
1195 svc->protocol = u->protocol;
1196 ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1197 svc->port = u->port;
1198 svc->fwmark = u->fwmark;
1199 svc->flags = u->flags;
1200 svc->timeout = u->timeout * HZ;
1201 svc->netmask = u->netmask;
1202
1203 INIT_LIST_HEAD(&svc->destinations);
1204 rwlock_init(&svc->sched_lock);
1205 spin_lock_init(&svc->stats.lock);
1206
1207 /* Bind the scheduler */
1208 ret = ip_vs_bind_scheduler(svc, sched);
1209 if (ret)
1210 goto out_err;
1211 sched = NULL;
1212
1213 /* Update the virtual service counters */
1214 if (svc->port == FTPPORT)
1215 atomic_inc(&ip_vs_ftpsvc_counter);
1216 else if (svc->port == 0)
1217 atomic_inc(&ip_vs_nullsvc_counter);
1218
1219 ip_vs_new_estimator(&svc->stats);
1220
1221 /* Count only IPv4 services for old get/setsockopt interface */
1222 if (svc->af == AF_INET)
1223 ip_vs_num_services++;
1224
1225 /* Hash the service into the service table */
1226 write_lock_bh(&__ip_vs_svc_lock);
1227 ip_vs_svc_hash(svc);
1228 write_unlock_bh(&__ip_vs_svc_lock);
1229
1230 *svc_p = svc;
1231 return 0;
1232
1233 out_err:
1234 if (svc != NULL) {
1235 if (svc->scheduler)
1236 ip_vs_unbind_scheduler(svc);
1237 if (svc->inc) {
1238 local_bh_disable();
1239 ip_vs_app_inc_put(svc->inc);
1240 local_bh_enable();
1241 }
1242 kfree(svc);
1243 }
1244 ip_vs_scheduler_put(sched);
1245
1246 out_mod_dec:
1247 /* decrease the module use count */
1248 ip_vs_use_count_dec();
1249
1250 return ret;
1251}
1252
1253
1254/*
1255 * Edit a service and bind it with a new scheduler
1256 */
1257static int
1258ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1259{
1260 struct ip_vs_scheduler *sched, *old_sched;
1261 int ret = 0;
1262
1263 /*
1264 * Lookup the scheduler, by 'u->sched_name'
1265 */
1266 sched = ip_vs_scheduler_get(u->sched_name);
1267 if (sched == NULL) {
1268 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1269 u->sched_name);
1270 return -ENOENT;
1271 }
1272 old_sched = sched;
1273
1274#ifdef CONFIG_IP_VS_IPV6
1275 if (u->af == AF_INET6) {
1276 if (!sched->supports_ipv6) {
1277 ret = -EAFNOSUPPORT;
1278 goto out;
1279 }
1280 if ((u->netmask < 1) || (u->netmask > 128)) {
1281 ret = -EINVAL;
1282 goto out;
1283 }
1284 }
1285#endif
1286
1287 write_lock_bh(&__ip_vs_svc_lock);
1288
1289 /*
1290 * Wait until all other svc users go away.
1291 */
1292 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1293
1294 /*
1295 * Set the flags and timeout value
1296 */
1297 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1298 svc->timeout = u->timeout * HZ;
1299 svc->netmask = u->netmask;
1300
1301 old_sched = svc->scheduler;
1302 if (sched != old_sched) {
1303 /*
1304 * Unbind the old scheduler
1305 */
1306 if ((ret = ip_vs_unbind_scheduler(svc))) {
1307 old_sched = sched;
1308 goto out_unlock;
1309 }
1310
1311 /*
1312 * Bind the new scheduler
1313 */
1314 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1315 /*
1316 * If ip_vs_bind_scheduler fails, restore the old
1317 * scheduler.
1318 * The most likely reason for failure is lack of memory.
1319 *
1320 * The open question is whether the old scheduler can always be
1321 * restored. TODO: if at some point it cannot be
1322 * restored, we must delete the service,
1323 * otherwise the system may crash.
1324 */
1325 ip_vs_bind_scheduler(svc, old_sched);
1326 old_sched = sched;
1327 goto out_unlock;
1328 }
1329 }
1330
1331 out_unlock:
1332 write_unlock_bh(&__ip_vs_svc_lock);
1333 out:
1334
1335 if (old_sched)
1336 ip_vs_scheduler_put(old_sched);
1337
1338 return ret;
1339}
1340
1341
1342/*
1343 * Delete a service from the service list
1344 * - The service must be unlinked, unlocked and not referenced!
1345 * - We are called under _bh lock
1346 */
1347static void __ip_vs_del_service(struct ip_vs_service *svc)
1348{
1349 struct ip_vs_dest *dest, *nxt;
1350 struct ip_vs_scheduler *old_sched;
1351
1352 /* Count only IPv4 services for old get/setsockopt interface */
1353 if (svc->af == AF_INET)
1354 ip_vs_num_services--;
1355
1356 ip_vs_kill_estimator(&svc->stats);
1357
1358 /* Unbind scheduler */
1359 old_sched = svc->scheduler;
1360 ip_vs_unbind_scheduler(svc);
1361 if (old_sched)
1362 ip_vs_scheduler_put(old_sched);
1363
1364 /* Unbind app inc */
1365 if (svc->inc) {
1366 ip_vs_app_inc_put(svc->inc);
1367 svc->inc = NULL;
1368 }
1369
1370 /*
1371 * Unlink the whole destination list
1372 */
1373 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1374 __ip_vs_unlink_dest(svc, dest, 0);
1375 __ip_vs_del_dest(dest);
1376 }
1377
1378 /*
1379 * Update the virtual service counters
1380 */
1381 if (svc->port == FTPPORT)
1382 atomic_dec(&ip_vs_ftpsvc_counter);
1383 else if (svc->port == 0)
1384 atomic_dec(&ip_vs_nullsvc_counter);
1385
1386 /*
1387 * Free the service if nobody refers to it
1388 */
1389 if (atomic_read(&svc->refcnt) == 0)
1390 kfree(svc);
1391
1392 /* decrease the module use count */
1393 ip_vs_use_count_dec();
1394}
1395
1396/*
1397 * Delete a service from the service list
1398 */
1399static int ip_vs_del_service(struct ip_vs_service *svc)
1400{
1401 if (svc == NULL)
1402 return -EEXIST;
1403
1404 /*
1405 * Unhash it from the service table
1406 */
1407 write_lock_bh(&__ip_vs_svc_lock);
1408
1409 ip_vs_svc_unhash(svc);
1410
1411 /*
1412 * Wait until all the svc users go away.
1413 */
1414 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1415
1416 __ip_vs_del_service(svc);
1417
1418 write_unlock_bh(&__ip_vs_svc_lock);
1419
1420 return 0;
1421}
1422
1423
1424/*
1425 * Flush all the virtual services
1426 */
1427static int ip_vs_flush(void)
1428{
1429 int idx;
1430 struct ip_vs_service *svc, *nxt;
1431
1432 /*
1433 * Flush the service table hashed by <protocol,addr,port>
1434 */
1435 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1436 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1437 write_lock_bh(&__ip_vs_svc_lock);
1438 ip_vs_svc_unhash(svc);
1439 /*
1440 * Wait until all the svc users go away.
1441 */
1442 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1443 __ip_vs_del_service(svc);
1444 write_unlock_bh(&__ip_vs_svc_lock);
1445 }
1446 }
1447
1448 /*
1449 * Flush the service table hashed by fwmark
1450 */
1451 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1452 list_for_each_entry_safe(svc, nxt,
1453 &ip_vs_svc_fwm_table[idx], f_list) {
1454 write_lock_bh(&__ip_vs_svc_lock);
1455 ip_vs_svc_unhash(svc);
1456 /*
1457 * Wait until all the svc users go away.
1458 */
1459 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1460 __ip_vs_del_service(svc);
1461 write_unlock_bh(&__ip_vs_svc_lock);
1462 }
1463 }
1464
1465 return 0;
1466}
1467
1468
1469/*
1470 * Zero counters in a service or all services
1471 */
1472static int ip_vs_zero_service(struct ip_vs_service *svc)
1473{
1474 struct ip_vs_dest *dest;
1475
1476 write_lock_bh(&__ip_vs_svc_lock);
1477 list_for_each_entry(dest, &svc->destinations, n_list) {
1478 ip_vs_zero_stats(&dest->stats);
1479 }
1480 ip_vs_zero_stats(&svc->stats);
1481 write_unlock_bh(&__ip_vs_svc_lock);
1482 return 0;
1483}
1484
1485static int ip_vs_zero_all(void)
1486{
1487 int idx;
1488 struct ip_vs_service *svc;
1489
1490 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1491 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1492 ip_vs_zero_service(svc);
1493 }
1494 }
1495
1496 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1497 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1498 ip_vs_zero_service(svc);
1499 }
1500 }
1501
1502 ip_vs_zero_stats(&ip_vs_stats);
1503 return 0;
1504}
1505
1506
1507static int
1508proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1509 void __user *buffer, size_t *lenp, loff_t *ppos)
1510{
1511 int *valp = table->data;
1512 int val = *valp;
1513 int rc;
1514
1515 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1516 if (write && (*valp != val)) {
1517 if ((*valp < 0) || (*valp > 3)) {
1518 /* Restore the correct value */
1519 *valp = val;
1520 } else {
1521 update_defense_level();
1522 }
1523 }
1524 return rc;
1525}
1526
1527
1528static int
1529proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1530 void __user *buffer, size_t *lenp, loff_t *ppos)
1531{
1532 int *valp = table->data;
1533 int val[2];
1534 int rc;
1535
1536 /* backup the value first */
1537 memcpy(val, valp, sizeof(val));
1538
1539 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1540 if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1541 /* Restore the correct value */
1542 memcpy(valp, val, sizeof(val));
1543 }
1544 return rc;
1545}
1546
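An added note on the handler above; only the behaviour visible in this file is described:

/*
 * proc_do_sync_threshold() backs a sysctl whose data is a pair of ints.
 * The current pair is saved before proc_dointvec() runs; if a write
 * leaves either value negative or the first >= the second (e.g. "50 3"
 * or "-1 10"), the saved pair is silently restored, so only writes such
 * as "3 50" take effect.
 */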
1547
1548/*
1549 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1550 */
1551
1552static struct ctl_table vs_vars[] = {
1553 {
1554 .procname = "amemthresh",
1555 .data = &sysctl_ip_vs_amemthresh,
1556 .maxlen = sizeof(int),
1557 .mode = 0644,
1558 .proc_handler = &proc_dointvec,
1559 },
1560#ifdef CONFIG_IP_VS_DEBUG
1561 {
1562 .procname = "debug_level",
1563 .data = &sysctl_ip_vs_debug_level,
1564 .maxlen = sizeof(int),
1565 .mode = 0644,
1566 .proc_handler = &proc_dointvec,
1567 },
1568#endif
1569 {
1570 .procname = "am_droprate",
1571 .data = &sysctl_ip_vs_am_droprate,
1572 .maxlen = sizeof(int),
1573 .mode = 0644,
1574 .proc_handler = &proc_dointvec,
1575 },
1576 {
1577 .procname = "drop_entry",
1578 .data = &sysctl_ip_vs_drop_entry,
1579 .maxlen = sizeof(int),
1580 .mode = 0644,
1581 .proc_handler = &proc_do_defense_mode,
1582 },
1583 {
1584 .procname = "drop_packet",
1585 .data = &sysctl_ip_vs_drop_packet,
1586 .maxlen = sizeof(int),
1587 .mode = 0644,
1588 .proc_handler = &proc_do_defense_mode,
1589 },
1590 {
1591 .procname = "secure_tcp",
1592 .data = &sysctl_ip_vs_secure_tcp,
1593 .maxlen = sizeof(int),
1594 .mode = 0644,
1595 .proc_handler = &proc_do_defense_mode,
1596 },
1597#if 0
1598 {
1599 .procname = "timeout_established",
1600 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1601 .maxlen = sizeof(int),
1602 .mode = 0644,
1603 .proc_handler = &proc_dointvec_jiffies,
1604 },
1605 {
1606 .procname = "timeout_synsent",
1607 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1608 .maxlen = sizeof(int),
1609 .mode = 0644,
1610 .proc_handler = &proc_dointvec_jiffies,
1611 },
1612 {
1613 .procname = "timeout_synrecv",
1614 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1615 .maxlen = sizeof(int),
1616 .mode = 0644,
1617 .proc_handler = &proc_dointvec_jiffies,
1618 },
1619 {
1620 .procname = "timeout_finwait",
1621 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1622 .maxlen = sizeof(int),
1623 .mode = 0644,
1624 .proc_handler = &proc_dointvec_jiffies,
1625 },
1626 {
1627 .procname = "timeout_timewait",
1628 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1629 .maxlen = sizeof(int),
1630 .mode = 0644,
1631 .proc_handler = &proc_dointvec_jiffies,
1632 },
1633 {
1634 .procname = "timeout_close",
1635 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1636 .maxlen = sizeof(int),
1637 .mode = 0644,
1638 .proc_handler = &proc_dointvec_jiffies,
1639 },
1640 {
1641 .procname = "timeout_closewait",
1642 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1643 .maxlen = sizeof(int),
1644 .mode = 0644,
1645 .proc_handler = &proc_dointvec_jiffies,
1646 },
1647 {
1648 .procname = "timeout_lastack",
1649 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1650 .maxlen = sizeof(int),
1651 .mode = 0644,
1652 .proc_handler = &proc_dointvec_jiffies,
1653 },
1654 {
1655 .procname = "timeout_listen",
1656 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1657 .maxlen = sizeof(int),
1658 .mode = 0644,
1659 .proc_handler = &proc_dointvec_jiffies,
1660 },
1661 {
1662 .procname = "timeout_synack",
1663 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1664 .maxlen = sizeof(int),
1665 .mode = 0644,
1666 .proc_handler = &proc_dointvec_jiffies,
1667 },
1668 {
1669 .procname = "timeout_udp",
1670 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1671 .maxlen = sizeof(int),
1672 .mode = 0644,
1673 .proc_handler = &proc_dointvec_jiffies,
1674 },
1675 {
1676 .procname = "timeout_icmp",
1677 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1678 .maxlen = sizeof(int),
1679 .mode = 0644,
1680 .proc_handler = &proc_dointvec_jiffies,
1681 },
1682#endif
1683 {
1684 .procname = "cache_bypass",
1685 .data = &sysctl_ip_vs_cache_bypass,
1686 .maxlen = sizeof(int),
1687 .mode = 0644,
1688 .proc_handler = &proc_dointvec,
1689 },
1690 {
1691 .procname = "expire_nodest_conn",
1692 .data = &sysctl_ip_vs_expire_nodest_conn,
1693 .maxlen = sizeof(int),
1694 .mode = 0644,
1695 .proc_handler = &proc_dointvec,
1696 },
1697 {
1698 .procname = "expire_quiescent_template",
1699 .data = &sysctl_ip_vs_expire_quiescent_template,
1700 .maxlen = sizeof(int),
1701 .mode = 0644,
1702 .proc_handler = &proc_dointvec,
1703 },
1704 {
1705 .procname = "sync_threshold",
1706 .data = &sysctl_ip_vs_sync_threshold,
1707 .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
1708 .mode = 0644,
1709 .proc_handler = &proc_do_sync_threshold,
1710 },
1711 {
1712 .procname = "nat_icmp_send",
1713 .data = &sysctl_ip_vs_nat_icmp_send,
1714 .maxlen = sizeof(int),
1715 .mode = 0644,
1716 .proc_handler = &proc_dointvec,
1717 },
1718 { .ctl_name = 0 }
1719};
1720
1721const struct ctl_path net_vs_ctl_path[] = {
1722 { .procname = "net", .ctl_name = CTL_NET, },
1723 { .procname = "ipv4", .ctl_name = NET_IPV4, },
1724 { .procname = "vs", },
1725 { }
1726};
1727EXPORT_SYMBOL_GPL(net_vs_ctl_path);
1728
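For illustration only, a minimal userspace sketch (not part of this file) that reads one of the tunables registered above; it assumes the ipvs module is loaded, so vs_vars is published under the net/ipv4/vs path built from net_vs_ctl_path.

#include <stdio.h>

int main(void)
{
	/* "net/ipv4/vs" comes from net_vs_ctl_path, "amemthresh" from vs_vars. */
	const char *path = "/proc/sys/net/ipv4/vs/amemthresh";
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("amemthresh = %s", buf);
	fclose(f);
	return 0;
}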
1729static struct ctl_table_header * sysctl_header;
1730
1731#ifdef CONFIG_PROC_FS
1732
1733struct ip_vs_iter {
1734 struct list_head *table;
1735 int bucket;
1736};
1737
1738/*
1739 * Write the contents of the VS rule table to a PROCfs file.
1740 * (It is kept just for backward compatibility)
1741 */
1742static inline const char *ip_vs_fwd_name(unsigned flags)
1743{
1744 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1745 case IP_VS_CONN_F_LOCALNODE:
1746 return "Local";
1747 case IP_VS_CONN_F_TUNNEL:
1748 return "Tunnel";
1749 case IP_VS_CONN_F_DROUTE:
1750 return "Route";
1751 default:
1752 return "Masq";
1753 }
1754}
1755
1756
1757/* Get the Nth entry in the two lists */
1758static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1759{
1760 struct ip_vs_iter *iter = seq->private;
1761 int idx;
1762 struct ip_vs_service *svc;
1763
1764 /* look in hash by protocol */
1765 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1766 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1767 if (pos-- == 0){
1768 iter->table = ip_vs_svc_table;
1769 iter->bucket = idx;
1770 return svc;
1771 }
1772 }
1773 }
1774
1775 /* keep looking in fwmark */
1776 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1777 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1778 if (pos-- == 0) {
1779 iter->table = ip_vs_svc_fwm_table;
1780 iter->bucket = idx;
1781 return svc;
1782 }
1783 }
1784 }
1785
1786 return NULL;
1787}
1788
1789static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1790__acquires(__ip_vs_svc_lock)
1791{
1792
1793 read_lock_bh(&__ip_vs_svc_lock);
1794 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1795}
1796
1797
1798static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1799{
1800 struct list_head *e;
1801 struct ip_vs_iter *iter;
1802 struct ip_vs_service *svc;
1803
1804 ++*pos;
1805 if (v == SEQ_START_TOKEN)
1806 return ip_vs_info_array(seq,0);
1807
1808 svc = v;
1809 iter = seq->private;
1810
1811 if (iter->table == ip_vs_svc_table) {
1812 /* next service in table hashed by protocol */
1813 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1814 return list_entry(e, struct ip_vs_service, s_list);
1815
1816
1817 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1818 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1819 s_list) {
1820 return svc;
1821 }
1822 }
1823
1824 iter->table = ip_vs_svc_fwm_table;
1825 iter->bucket = -1;
1826 goto scan_fwmark;
1827 }
1828
1829 /* next service in hashed by fwmark */
1830 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1831 return list_entry(e, struct ip_vs_service, f_list);
1832
1833 scan_fwmark:
1834 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1835 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1836 f_list)
1837 return svc;
1838 }
1839
1840 return NULL;
1841}
1842
1843static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1844__releases(__ip_vs_svc_lock)
1845{
1846 read_unlock_bh(&__ip_vs_svc_lock);
1847}
1848
1849
1850static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1851{
1852 if (v == SEQ_START_TOKEN) {
1853 seq_printf(seq,
1854 "IP Virtual Server version %d.%d.%d (size=%d)\n",
1855 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1856 seq_puts(seq,
1857 "Prot LocalAddress:Port Scheduler Flags\n");
1858 seq_puts(seq,
1859 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1860 } else {
1861 const struct ip_vs_service *svc = v;
1862 const struct ip_vs_iter *iter = seq->private;
1863 const struct ip_vs_dest *dest;
1864
1865 if (iter->table == ip_vs_svc_table) {
1866#ifdef CONFIG_IP_VS_IPV6
1867 if (svc->af == AF_INET6)
1868 seq_printf(seq, "%s [" NIP6_FMT "]:%04X %s ",
1869 ip_vs_proto_name(svc->protocol),
1870 NIP6(svc->addr.in6),
1871 ntohs(svc->port),
1872 svc->scheduler->name);
1873 else
1874#endif
1875 seq_printf(seq, "%s %08X:%04X %s ",
1876 ip_vs_proto_name(svc->protocol),
1877 ntohl(svc->addr.ip),
1878 ntohs(svc->port),
1879 svc->scheduler->name);
1880 } else {
1881 seq_printf(seq, "FWM %08X %s ",
1882 svc->fwmark, svc->scheduler->name);
1883 }
1884
1885 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1886 seq_printf(seq, "persistent %d %08X\n",
1887 svc->timeout,
1888 ntohl(svc->netmask));
1889 else
1890 seq_putc(seq, '\n');
1891
1892 list_for_each_entry(dest, &svc->destinations, n_list) {
1893#ifdef CONFIG_IP_VS_IPV6
1894 if (dest->af == AF_INET6)
1895 seq_printf(seq,
1896 " -> [" NIP6_FMT "]:%04X"
1897 " %-7s %-6d %-10d %-10d\n",
1898 NIP6(dest->addr.in6),
1899 ntohs(dest->port),
1900 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1901 atomic_read(&dest->weight),
1902 atomic_read(&dest->activeconns),
1903 atomic_read(&dest->inactconns));
1904 else
1905#endif
1906 seq_printf(seq,
1907 " -> %08X:%04X "
1908 "%-7s %-6d %-10d %-10d\n",
1909 ntohl(dest->addr.ip),
1910 ntohs(dest->port),
1911 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1912 atomic_read(&dest->weight),
1913 atomic_read(&dest->activeconns),
1914 atomic_read(&dest->inactconns));
1915
1916 }
1917 }
1918 return 0;
1919}
1920
1921static const struct seq_operations ip_vs_info_seq_ops = {
1922 .start = ip_vs_info_seq_start,
1923 .next = ip_vs_info_seq_next,
1924 .stop = ip_vs_info_seq_stop,
1925 .show = ip_vs_info_seq_show,
1926};
1927
1928static int ip_vs_info_open(struct inode *inode, struct file *file)
1929{
1930 return seq_open_private(file, &ip_vs_info_seq_ops,
1931 sizeof(struct ip_vs_iter));
1932}
1933
1934static const struct file_operations ip_vs_info_fops = {
1935 .owner = THIS_MODULE,
1936 .open = ip_vs_info_open,
1937 .read = seq_read,
1938 .llseek = seq_lseek,
1939 .release = seq_release_private,
1940};
1941
1942#endif
1943
1944struct ip_vs_stats ip_vs_stats = {
1945 .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock),
1946};
1947
1948#ifdef CONFIG_PROC_FS
1949static int ip_vs_stats_show(struct seq_file *seq, void *v)
1950{
1951
1952/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1953 seq_puts(seq,
1954 " Total Incoming Outgoing Incoming Outgoing\n");
1955 seq_printf(seq,
1956 " Conns Packets Packets Bytes Bytes\n");
1957
1958 spin_lock_bh(&ip_vs_stats.lock);
1959 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns,
1960 ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts,
1961 (unsigned long long) ip_vs_stats.ustats.inbytes,
1962 (unsigned long long) ip_vs_stats.ustats.outbytes);
1963
1964/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1965 seq_puts(seq,
1966 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1967 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1968 ip_vs_stats.ustats.cps,
1969 ip_vs_stats.ustats.inpps,
1970 ip_vs_stats.ustats.outpps,
1971 ip_vs_stats.ustats.inbps,
1972 ip_vs_stats.ustats.outbps);
1973 spin_unlock_bh(&ip_vs_stats.lock);
1974
1975 return 0;
1976}
1977
1978static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1979{
1980 return single_open(file, ip_vs_stats_show, NULL);
1981}
1982
1983static const struct file_operations ip_vs_stats_fops = {
1984 .owner = THIS_MODULE,
1985 .open = ip_vs_stats_seq_open,
1986 .read = seq_read,
1987 .llseek = seq_lseek,
1988 .release = single_release,
1989};
1990
1991#endif
1992
1993/*
1994 * Set timeout values for tcp tcpfin udp in the timeout_table.
1995 */
1996static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1997{
1998 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1999 u->tcp_timeout,
2000 u->tcp_fin_timeout,
2001 u->udp_timeout);
2002
2003#ifdef CONFIG_IP_VS_PROTO_TCP
2004 if (u->tcp_timeout) {
2005 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
2006 = u->tcp_timeout * HZ;
2007 }
2008
2009 if (u->tcp_fin_timeout) {
2010 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
2011 = u->tcp_fin_timeout * HZ;
2012 }
2013#endif
2014
2015#ifdef CONFIG_IP_VS_PROTO_UDP
2016 if (u->udp_timeout) {
2017 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
2018 = u->udp_timeout * HZ;
2019 }
2020#endif
2021 return 0;
2022}
2023
2024
2025#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
2026#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
2027#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
2028 sizeof(struct ip_vs_dest_user))
2029#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
2030#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
2031#define MAX_ARG_LEN SVCDEST_ARG_LEN
2032
2033static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
2034 [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
2035 [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
2036 [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
2037 [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
2038 [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
2039 [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
2040 [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
2041 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
2042 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
2043 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
2044 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
2045};
2046
2047static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2048 struct ip_vs_service_user *usvc_compat)
2049{
2050 usvc->af = AF_INET;
2051 usvc->protocol = usvc_compat->protocol;
2052 usvc->addr.ip = usvc_compat->addr;
2053 usvc->port = usvc_compat->port;
2054 usvc->fwmark = usvc_compat->fwmark;
2055
2056 /* Deep copy of sched_name is not needed here */
2057 usvc->sched_name = usvc_compat->sched_name;
2058
2059 usvc->flags = usvc_compat->flags;
2060 usvc->timeout = usvc_compat->timeout;
2061 usvc->netmask = usvc_compat->netmask;
2062}
2063
2064static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2065 struct ip_vs_dest_user *udest_compat)
2066{
2067 udest->addr.ip = udest_compat->addr;
2068 udest->port = udest_compat->port;
2069 udest->conn_flags = udest_compat->conn_flags;
2070 udest->weight = udest_compat->weight;
2071 udest->u_threshold = udest_compat->u_threshold;
2072 udest->l_threshold = udest_compat->l_threshold;
2073}
2074
2075static int
2076do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2077{
2078 int ret;
2079 unsigned char arg[MAX_ARG_LEN];
2080 struct ip_vs_service_user *usvc_compat;
2081 struct ip_vs_service_user_kern usvc;
2082 struct ip_vs_service *svc;
2083 struct ip_vs_dest_user *udest_compat;
2084 struct ip_vs_dest_user_kern udest;
2085
2086 if (!capable(CAP_NET_ADMIN))
2087 return -EPERM;
2088
2089 if (len != set_arglen[SET_CMDID(cmd)]) {
2090 IP_VS_ERR("set_ctl: len %u != %u\n",
2091 len, set_arglen[SET_CMDID(cmd)]);
2092 return -EINVAL;
2093 }
2094
2095 if (copy_from_user(arg, user, len) != 0)
2096 return -EFAULT;
2097
2098 /* increase the module use count */
2099 ip_vs_use_count_inc();
2100
2101 if (mutex_lock_interruptible(&__ip_vs_mutex)) {
2102 ret = -ERESTARTSYS;
2103 goto out_dec;
2104 }
2105
2106 if (cmd == IP_VS_SO_SET_FLUSH) {
2107 /* Flush the virtual service */
2108 ret = ip_vs_flush();
2109 goto out_unlock;
2110 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
2111 /* Set timeout values for (tcp tcpfin udp) */
2112 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
2113 goto out_unlock;
2114 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
2115 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2116 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
2117 goto out_unlock;
2118 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
2119 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2120 ret = stop_sync_thread(dm->state);
2121 goto out_unlock;
2122 }
2123
2124 usvc_compat = (struct ip_vs_service_user *)arg;
2125 udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
2126
2127 /* We only use the new structs internally, so copy userspace compat
2128 * structs to extended internal versions */
2129 ip_vs_copy_usvc_compat(&usvc, usvc_compat);
2130 ip_vs_copy_udest_compat(&udest, udest_compat);
2131
2132 if (cmd == IP_VS_SO_SET_ZERO) {
2133 /* if no service address is set, zero counters in all */
2134 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
2135 ret = ip_vs_zero_all();
2136 goto out_unlock;
2137 }
2138 }
2139
2140 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
2141 if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP) {
2142 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
2143 usvc.protocol, NIPQUAD(usvc.addr.ip),
2144 ntohs(usvc.port), usvc.sched_name);
2145 ret = -EFAULT;
2146 goto out_unlock;
2147 }
2148
2149 /* Lookup the exact service by <protocol, addr, port> or fwmark */
2150 if (usvc.fwmark == 0)
2151 svc = __ip_vs_service_get(usvc.af, usvc.protocol,
2152 &usvc.addr, usvc.port);
2153 else
2154 svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
2155
2156 if (cmd != IP_VS_SO_SET_ADD
2157 && (svc == NULL || svc->protocol != usvc.protocol)) {
2158 ret = -ESRCH;
2159 goto out_unlock;
2160 }
2161
2162 switch (cmd) {
2163 case IP_VS_SO_SET_ADD:
2164 if (svc != NULL)
2165 ret = -EEXIST;
2166 else
2167 ret = ip_vs_add_service(&usvc, &svc);
2168 break;
2169 case IP_VS_SO_SET_EDIT:
2170 ret = ip_vs_edit_service(svc, &usvc);
2171 break;
2172 case IP_VS_SO_SET_DEL:
2173 ret = ip_vs_del_service(svc);
2174 if (!ret)
2175 goto out_unlock;
2176 break;
2177 case IP_VS_SO_SET_ZERO:
2178 ret = ip_vs_zero_service(svc);
2179 break;
2180 case IP_VS_SO_SET_ADDDEST:
2181 ret = ip_vs_add_dest(svc, &udest);
2182 break;
2183 case IP_VS_SO_SET_EDITDEST:
2184 ret = ip_vs_edit_dest(svc, &udest);
2185 break;
2186 case IP_VS_SO_SET_DELDEST:
2187 ret = ip_vs_del_dest(svc, &udest);
2188 break;
2189 default:
2190 ret = -EINVAL;
2191 }
2192
2193 if (svc)
2194 ip_vs_service_put(svc);
2195
2196 out_unlock:
2197 mutex_unlock(&__ip_vs_mutex);
2198 out_dec:
2199 /* decrease the module use count */
2200 ip_vs_use_count_dec();
2201
2202 return ret;
2203}
2204
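For illustration only, a minimal userspace sketch of the legacy sockopt interface handled by do_ip_vs_set_ctl() above. It assumes <linux/ip_vs.h> exports IP_VS_SO_SET_FLUSH and that the caller has CAP_NET_ADMIN; the zero-length argument matches set_arglen[SET_CMDID(IP_VS_SO_SET_FLUSH)] == 0.

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/ip_vs.h>	/* assumed to provide IP_VS_SO_SET_FLUSH */

int main(void)
{
	/* The sockopts are registered for PF_INET (see ip_vs_sockopts below). */
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* Flush all virtual services; len must equal the table entry (0). */
	if (setsockopt(fd, IPPROTO_IP, IP_VS_SO_SET_FLUSH, NULL, 0) < 0)
		perror("IP_VS_SO_SET_FLUSH");
	return 0;
}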
2205
2206static void
2207ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2208{
2209 spin_lock_bh(&src->lock);
2210 memcpy(dst, &src->ustats, sizeof(*dst));
2211 spin_unlock_bh(&src->lock);
2212}
2213
2214static void
2215ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2216{
2217 dst->protocol = src->protocol;
2218 dst->addr = src->addr.ip;
2219 dst->port = src->port;
2220 dst->fwmark = src->fwmark;
2221 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2222 dst->flags = src->flags;
2223 dst->timeout = src->timeout / HZ;
2224 dst->netmask = src->netmask;
2225 dst->num_dests = src->num_dests;
2226 ip_vs_copy_stats(&dst->stats, &src->stats);
2227}
2228
2229static inline int
2230__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2231 struct ip_vs_get_services __user *uptr)
2232{
2233 int idx, count=0;
2234 struct ip_vs_service *svc;
2235 struct ip_vs_service_entry entry;
2236 int ret = 0;
2237
2238 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2239 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2240 /* Only expose IPv4 entries to old interface */
2241 if (svc->af != AF_INET)
2242 continue;
2243
2244 if (count >= get->num_services)
2245 goto out;
2246 memset(&entry, 0, sizeof(entry));
2247 ip_vs_copy_service(&entry, svc);
2248 if (copy_to_user(&uptr->entrytable[count],
2249 &entry, sizeof(entry))) {
2250 ret = -EFAULT;
2251 goto out;
2252 }
2253 count++;
2254 }
2255 }
2256
2257 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2258 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2259 /* Only expose IPv4 entries to old interface */
2260 if (svc->af != AF_INET)
2261 continue;
2262
2263 if (count >= get->num_services)
2264 goto out;
2265 memset(&entry, 0, sizeof(entry));
2266 ip_vs_copy_service(&entry, svc);
2267 if (copy_to_user(&uptr->entrytable[count],
2268 &entry, sizeof(entry))) {
2269 ret = -EFAULT;
2270 goto out;
2271 }
2272 count++;
2273 }
2274 }
2275 out:
2276 return ret;
2277}
2278
2279static inline int
2280__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2281 struct ip_vs_get_dests __user *uptr)
2282{
2283 struct ip_vs_service *svc;
2284 union nf_inet_addr addr = { .ip = get->addr };
2285 int ret = 0;
2286
2287 if (get->fwmark)
2288 svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark);
2289 else
2290 svc = __ip_vs_service_get(AF_INET, get->protocol, &addr,
2291 get->port);
2292
2293 if (svc) {
2294 int count = 0;
2295 struct ip_vs_dest *dest;
2296 struct ip_vs_dest_entry entry;
2297
2298 list_for_each_entry(dest, &svc->destinations, n_list) {
2299 if (count >= get->num_dests)
2300 break;
2301
2302 entry.addr = dest->addr.ip;
2303 entry.port = dest->port;
2304 entry.conn_flags = atomic_read(&dest->conn_flags);
2305 entry.weight = atomic_read(&dest->weight);
2306 entry.u_threshold = dest->u_threshold;
2307 entry.l_threshold = dest->l_threshold;
2308 entry.activeconns = atomic_read(&dest->activeconns);
2309 entry.inactconns = atomic_read(&dest->inactconns);
2310 entry.persistconns = atomic_read(&dest->persistconns);
2311 ip_vs_copy_stats(&entry.stats, &dest->stats);
2312 if (copy_to_user(&uptr->entrytable[count],
2313 &entry, sizeof(entry))) {
2314 ret = -EFAULT;
2315 break;
2316 }
2317 count++;
2318 }
2319 ip_vs_service_put(svc);
2320 } else
2321 ret = -ESRCH;
2322 return ret;
2323}
2324
2325static inline void
2326__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2327{
2328#ifdef CONFIG_IP_VS_PROTO_TCP
2329 u->tcp_timeout =
2330 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2331 u->tcp_fin_timeout =
2332 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2333#endif
2334#ifdef CONFIG_IP_VS_PROTO_UDP
2335 u->udp_timeout =
2336 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2337#endif
2338}
2339
2340
2341#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
2342#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
2343#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
2344#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
2345#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
2346#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
2347#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
2348
2349static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2350 [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2351 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2352 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2353 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2354 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2355 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2356 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
2357};
2358
2359static int
2360do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2361{
2362 unsigned char arg[128];
2363 int ret = 0;
2364
2365 if (!capable(CAP_NET_ADMIN))
2366 return -EPERM;
2367
2368 if (*len < get_arglen[GET_CMDID(cmd)]) {
2369 IP_VS_ERR("get_ctl: len %u < %u\n",
2370 *len, get_arglen[GET_CMDID(cmd)]);
2371 return -EINVAL;
2372 }
2373
2374 if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2375 return -EFAULT;
2376
2377 if (mutex_lock_interruptible(&__ip_vs_mutex))
2378 return -ERESTARTSYS;
2379
2380 switch (cmd) {
2381 case IP_VS_SO_GET_VERSION:
2382 {
2383 char buf[64];
2384
2385 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2386 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2387 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2388 ret = -EFAULT;
2389 goto out;
2390 }
2391 *len = strlen(buf)+1;
2392 }
2393 break;
2394
2395 case IP_VS_SO_GET_INFO:
2396 {
2397 struct ip_vs_getinfo info;
2398 info.version = IP_VS_VERSION_CODE;
2399 info.size = IP_VS_CONN_TAB_SIZE;
2400 info.num_services = ip_vs_num_services;
2401 if (copy_to_user(user, &info, sizeof(info)) != 0)
2402 ret = -EFAULT;
2403 }
2404 break;
2405
2406 case IP_VS_SO_GET_SERVICES:
2407 {
2408 struct ip_vs_get_services *get;
2409 int size;
2410
2411 get = (struct ip_vs_get_services *)arg;
2412 size = sizeof(*get) +
2413 sizeof(struct ip_vs_service_entry) * get->num_services;
2414 if (*len != size) {
2415 IP_VS_ERR("length: %u != %u\n", *len, size);
2416 ret = -EINVAL;
2417 goto out;
2418 }
2419 ret = __ip_vs_get_service_entries(get, user);
2420 }
2421 break;
2422
2423 case IP_VS_SO_GET_SERVICE:
2424 {
2425 struct ip_vs_service_entry *entry;
2426 struct ip_vs_service *svc;
2427 union nf_inet_addr addr;
2428
2429 entry = (struct ip_vs_service_entry *)arg;
2430 addr.ip = entry->addr;
2431 if (entry->fwmark)
2432 svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark);
2433 else
2434 svc = __ip_vs_service_get(AF_INET, entry->protocol,
2435 &addr, entry->port);
2436 if (svc) {
2437 ip_vs_copy_service(entry, svc);
2438 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2439 ret = -EFAULT;
2440 ip_vs_service_put(svc);
2441 } else
2442 ret = -ESRCH;
2443 }
2444 break;
2445
2446 case IP_VS_SO_GET_DESTS:
2447 {
2448 struct ip_vs_get_dests *get;
2449 int size;
2450
2451 get = (struct ip_vs_get_dests *)arg;
2452 size = sizeof(*get) +
2453 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2454 if (*len != size) {
2455 IP_VS_ERR("length: %u != %u\n", *len, size);
2456 ret = -EINVAL;
2457 goto out;
2458 }
2459 ret = __ip_vs_get_dest_entries(get, user);
2460 }
2461 break;
2462
2463 case IP_VS_SO_GET_TIMEOUT:
2464 {
2465 struct ip_vs_timeout_user t;
2466
2467 __ip_vs_get_timeouts(&t);
2468 if (copy_to_user(user, &t, sizeof(t)) != 0)
2469 ret = -EFAULT;
2470 }
2471 break;
2472
2473 case IP_VS_SO_GET_DAEMON:
2474 {
2475 struct ip_vs_daemon_user d[2];
2476
2477 memset(&d, 0, sizeof(d));
2478 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2479 d[0].state = IP_VS_STATE_MASTER;
2480 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2481 d[0].syncid = ip_vs_master_syncid;
2482 }
2483 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2484 d[1].state = IP_VS_STATE_BACKUP;
2485 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2486 d[1].syncid = ip_vs_backup_syncid;
2487 }
2488 if (copy_to_user(user, &d, sizeof(d)) != 0)
2489 ret = -EFAULT;
2490 }
2491 break;
2492
2493 default:
2494 ret = -EINVAL;
2495 }
2496
2497 out:
2498 mutex_unlock(&__ip_vs_mutex);
2499 return ret;
2500}
2501
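Similarly, an illustrative sketch of the get path: do_ip_vs_get_ctl() above rejects the request if *len is smaller than GET_INFO_ARG_LEN, then fills a struct ip_vs_getinfo. The header name and calling convention are assumptions; the field names come from the code above.

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/ip_vs.h>	/* assumed to provide IP_VS_SO_GET_INFO */

int main(void)
{
	struct ip_vs_getinfo info;
	socklen_t len = sizeof(info);
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (getsockopt(fd, IPPROTO_IP, IP_VS_SO_GET_INFO, &info, &len) < 0) {
		perror("IP_VS_SO_GET_INFO");
		return 1;
	}
	printf("version %u, conn table size %u, %u services\n",
	       info.version, info.size, info.num_services);
	return 0;
}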
2502
2503static struct nf_sockopt_ops ip_vs_sockopts = {
2504 .pf = PF_INET,
2505 .set_optmin = IP_VS_BASE_CTL,
2506 .set_optmax = IP_VS_SO_SET_MAX+1,
2507 .set = do_ip_vs_set_ctl,
2508 .get_optmin = IP_VS_BASE_CTL,
2509 .get_optmax = IP_VS_SO_GET_MAX+1,
2510 .get = do_ip_vs_get_ctl,
2511 .owner = THIS_MODULE,
2512};
2513
2514/*
2515 * Generic Netlink interface
2516 */
2517
2518/* IPVS genetlink family */
2519static struct genl_family ip_vs_genl_family = {
2520 .id = GENL_ID_GENERATE,
2521 .hdrsize = 0,
2522 .name = IPVS_GENL_NAME,
2523 .version = IPVS_GENL_VERSION,
2524 .maxattr = IPVS_CMD_MAX,
2525};
2526
2527/* Policy used for first-level command attributes */
2528static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
2529 [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED },
2530 [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED },
2531 [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED },
2532 [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 },
2533 [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
2534 [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 },
2535};
2536
2537/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
2538static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
2539 [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 },
2540 [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING,
2541 .len = IP_VS_IFNAME_MAXLEN },
2542 [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 },
2543};
2544
2545/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
2546static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
2547 [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 },
2548 [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 },
2549 [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY,
2550 .len = sizeof(union nf_inet_addr) },
2551 [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 },
2552 [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 },
2553 [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING,
2554 .len = IP_VS_SCHEDNAME_MAXLEN },
2555 [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY,
2556 .len = sizeof(struct ip_vs_flags) },
2557 [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 },
2558 [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 },
2559 [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED },
2560};
2561
2562/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
2563static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
2564 [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY,
2565 .len = sizeof(union nf_inet_addr) },
2566 [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 },
2567 [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 },
2568 [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 },
2569 [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 },
2570 [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 },
2571 [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 },
2572 [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 },
2573 [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 },
2574 [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED },
2575};
2576
2577static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
2578 struct ip_vs_stats *stats)
2579{
2580 struct nlattr *nl_stats = nla_nest_start(skb, container_type);
2581 if (!nl_stats)
2582 return -EMSGSIZE;
2583
2584 spin_lock_bh(&stats->lock);
2585
2586 NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns);
2587 NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts);
2588 NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts);
2589 NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes);
2590 NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes);
2591 NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps);
2592 NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps);
2593 NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps);
2594 NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps);
2595 NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps);
2596
2597 spin_unlock_bh(&stats->lock);
2598
2599 nla_nest_end(skb, nl_stats);
2600
2601 return 0;
2602
2603nla_put_failure:
2604 spin_unlock_bh(&stats->lock);
2605 nla_nest_cancel(skb, nl_stats);
2606 return -EMSGSIZE;
2607}
2608
2609static int ip_vs_genl_fill_service(struct sk_buff *skb,
2610 struct ip_vs_service *svc)
2611{
2612 struct nlattr *nl_service;
2613 struct ip_vs_flags flags = { .flags = svc->flags,
2614 .mask = ~0 };
2615
2616 nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
2617 if (!nl_service)
2618 return -EMSGSIZE;
2619
2620 NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af);
2621
2622 if (svc->fwmark) {
2623 NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
2624 } else {
2625 NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol);
2626 NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr);
2627 NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port);
2628 }
2629
2630 NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
2631 NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
2632 NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
2633 NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
2634
2635 if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
2636 goto nla_put_failure;
2637
2638 nla_nest_end(skb, nl_service);
2639
2640 return 0;
2641
2642nla_put_failure:
2643 nla_nest_cancel(skb, nl_service);
2644 return -EMSGSIZE;
2645}
2646
2647static int ip_vs_genl_dump_service(struct sk_buff *skb,
2648 struct ip_vs_service *svc,
2649 struct netlink_callback *cb)
2650{
2651 void *hdr;
2652
2653 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2654 &ip_vs_genl_family, NLM_F_MULTI,
2655 IPVS_CMD_NEW_SERVICE);
2656 if (!hdr)
2657 return -EMSGSIZE;
2658
2659 if (ip_vs_genl_fill_service(skb, svc) < 0)
2660 goto nla_put_failure;
2661
2662 return genlmsg_end(skb, hdr);
2663
2664nla_put_failure:
2665 genlmsg_cancel(skb, hdr);
2666 return -EMSGSIZE;
2667}
2668
2669static int ip_vs_genl_dump_services(struct sk_buff *skb,
2670 struct netlink_callback *cb)
2671{
2672 int idx = 0, i;
2673 int start = cb->args[0];
2674 struct ip_vs_service *svc;
2675
2676 mutex_lock(&__ip_vs_mutex);
2677 for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2678 list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
2679 if (++idx <= start)
2680 continue;
2681 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2682 idx--;
2683 goto nla_put_failure;
2684 }
2685 }
2686 }
2687
2688 for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2689 list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
2690 if (++idx <= start)
2691 continue;
2692 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2693 idx--;
2694 goto nla_put_failure;
2695 }
2696 }
2697 }
2698
2699nla_put_failure:
2700 mutex_unlock(&__ip_vs_mutex);
2701 cb->args[0] = idx;
2702
2703 return skb->len;
2704}
2705
2706static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
2707 struct nlattr *nla, int full_entry)
2708{
2709 struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
2710 struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
2711
2712 /* Parse mandatory identifying service fields first */
2713 if (nla == NULL ||
2714 nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
2715 return -EINVAL;
2716
2717 nla_af = attrs[IPVS_SVC_ATTR_AF];
2718 nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL];
2719 nla_addr = attrs[IPVS_SVC_ATTR_ADDR];
2720 nla_port = attrs[IPVS_SVC_ATTR_PORT];
2721 nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK];
2722
2723 if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
2724 return -EINVAL;
2725
2726 usvc->af = nla_get_u16(nla_af);
2727#ifdef CONFIG_IP_VS_IPV6
2728 if (usvc->af != AF_INET && usvc->af != AF_INET6)
2729#else
2730 if (usvc->af != AF_INET)
2731#endif
2732 return -EAFNOSUPPORT;
2733
2734 if (nla_fwmark) {
2735 usvc->protocol = IPPROTO_TCP;
2736 usvc->fwmark = nla_get_u32(nla_fwmark);
2737 } else {
2738 usvc->protocol = nla_get_u16(nla_protocol);
2739 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
2740 usvc->port = nla_get_u16(nla_port);
2741 usvc->fwmark = 0;
2742 }
2743
2744 /* If a full entry was requested, check for the additional fields */
2745 if (full_entry) {
2746 struct nlattr *nla_sched, *nla_flags, *nla_timeout,
2747 *nla_netmask;
2748 struct ip_vs_flags flags;
2749 struct ip_vs_service *svc;
2750
2751 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
2752 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
2753 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
2754 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
2755
2756 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
2757 return -EINVAL;
2758
2759 nla_memcpy(&flags, nla_flags, sizeof(flags));
2760
2761 /* prefill flags from service if it already exists */
2762 if (usvc->fwmark)
2763 svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark);
2764 else
2765 svc = __ip_vs_service_get(usvc->af, usvc->protocol,
2766 &usvc->addr, usvc->port);
2767 if (svc) {
2768 usvc->flags = svc->flags;
2769 ip_vs_service_put(svc);
2770 } else
2771 usvc->flags = 0;
2772
2773 /* set new flags from userland */
2774 usvc->flags = (usvc->flags & ~flags.mask) |
2775 (flags.flags & flags.mask);
2776 usvc->sched_name = nla_data(nla_sched);
2777 usvc->timeout = nla_get_u32(nla_timeout);
2778 usvc->netmask = nla_get_u32(nla_netmask);
2779 }
2780
2781 return 0;
2782}
2783
2784static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
2785{
2786 struct ip_vs_service_user_kern usvc;
2787 int ret;
2788
2789 ret = ip_vs_genl_parse_service(&usvc, nla, 0);
2790 if (ret)
2791 return ERR_PTR(ret);
2792
2793 if (usvc.fwmark)
2794 return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
2795 else
2796 return __ip_vs_service_get(usvc.af, usvc.protocol,
2797 &usvc.addr, usvc.port);
2798}
2799
2800static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
2801{
2802 struct nlattr *nl_dest;
2803
2804 nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
2805 if (!nl_dest)
2806 return -EMSGSIZE;
2807
2808 NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr);
2809 NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port);
2810
2811 NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD,
2812 atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK);
2813 NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight));
2814 NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold);
2815 NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold);
2816 NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
2817 atomic_read(&dest->activeconns));
2818 NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS,
2819 atomic_read(&dest->inactconns));
2820 NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
2821 atomic_read(&dest->persistconns));
2822
2823 if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
2824 goto nla_put_failure;
2825
2826 nla_nest_end(skb, nl_dest);
2827
2828 return 0;
2829
2830nla_put_failure:
2831 nla_nest_cancel(skb, nl_dest);
2832 return -EMSGSIZE;
2833}
2834
2835static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
2836 struct netlink_callback *cb)
2837{
2838 void *hdr;
2839
2840 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2841 &ip_vs_genl_family, NLM_F_MULTI,
2842 IPVS_CMD_NEW_DEST);
2843 if (!hdr)
2844 return -EMSGSIZE;
2845
2846 if (ip_vs_genl_fill_dest(skb, dest) < 0)
2847 goto nla_put_failure;
2848
2849 return genlmsg_end(skb, hdr);
2850
2851nla_put_failure:
2852 genlmsg_cancel(skb, hdr);
2853 return -EMSGSIZE;
2854}
2855
2856static int ip_vs_genl_dump_dests(struct sk_buff *skb,
2857 struct netlink_callback *cb)
2858{
2859 int idx = 0;
2860 int start = cb->args[0];
2861 struct ip_vs_service *svc;
2862 struct ip_vs_dest *dest;
2863 struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
2864
2865 mutex_lock(&__ip_vs_mutex);
2866
2867 /* Try to find the service for which to dump destinations */
2868 if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
2869 IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
2870 goto out_err;
2871
2872 svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]);
2873 if (IS_ERR(svc) || svc == NULL)
2874 goto out_err;
2875
2876 /* Dump the destinations */
2877 list_for_each_entry(dest, &svc->destinations, n_list) {
2878 if (++idx <= start)
2879 continue;
2880 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
2881 idx--;
2882 goto nla_put_failure;
2883 }
2884 }
2885
2886nla_put_failure:
2887 cb->args[0] = idx;
2888 ip_vs_service_put(svc);
2889
2890out_err:
2891 mutex_unlock(&__ip_vs_mutex);
2892
2893 return skb->len;
2894}
2895
2896static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
2897 struct nlattr *nla, int full_entry)
2898{
2899 struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
2900 struct nlattr *nla_addr, *nla_port;
2901
2902 /* Parse mandatory identifying destination fields first */
2903 if (nla == NULL ||
2904 nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
2905 return -EINVAL;
2906
2907 nla_addr = attrs[IPVS_DEST_ATTR_ADDR];
2908 nla_port = attrs[IPVS_DEST_ATTR_PORT];
2909
2910 if (!(nla_addr && nla_port))
2911 return -EINVAL;
2912
2913 nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
2914 udest->port = nla_get_u16(nla_port);
2915
2916 /* If a full entry was requested, check for the additional fields */
2917 if (full_entry) {
2918 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
2919 *nla_l_thresh;
2920
2921 nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
2922 nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
2923 nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH];
2924 nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
2925
2926 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
2927 return -EINVAL;
2928
2929 udest->conn_flags = nla_get_u32(nla_fwd)
2930 & IP_VS_CONN_F_FWD_MASK;
2931 udest->weight = nla_get_u32(nla_weight);
2932 udest->u_threshold = nla_get_u32(nla_u_thresh);
2933 udest->l_threshold = nla_get_u32(nla_l_thresh);
2934 }
2935
2936 return 0;
2937}
2938
2939static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
2940 const char *mcast_ifn, __be32 syncid)
2941{
2942 struct nlattr *nl_daemon;
2943
2944 nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
2945 if (!nl_daemon)
2946 return -EMSGSIZE;
2947
2948 NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state);
2949 NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn);
2950 NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid);
2951
2952 nla_nest_end(skb, nl_daemon);
2953
2954 return 0;
2955
2956nla_put_failure:
2957 nla_nest_cancel(skb, nl_daemon);
2958 return -EMSGSIZE;
2959}
2960
2961static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
2962 const char *mcast_ifn, __be32 syncid,
2963 struct netlink_callback *cb)
2964{
2965 void *hdr;
2966 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2967 &ip_vs_genl_family, NLM_F_MULTI,
2968 IPVS_CMD_NEW_DAEMON);
2969 if (!hdr)
2970 return -EMSGSIZE;
2971
2972 if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
2973 goto nla_put_failure;
2974
2975 return genlmsg_end(skb, hdr);
2976
2977nla_put_failure:
2978 genlmsg_cancel(skb, hdr);
2979 return -EMSGSIZE;
2980}
2981
2982static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
2983 struct netlink_callback *cb)
2984{
2985 mutex_lock(&__ip_vs_mutex);
2986 if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
2987 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
2988 ip_vs_master_mcast_ifn,
2989 ip_vs_master_syncid, cb) < 0)
2990 goto nla_put_failure;
2991
2992 cb->args[0] = 1;
2993 }
2994
2995 if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
2996 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
2997 ip_vs_backup_mcast_ifn,
2998 ip_vs_backup_syncid, cb) < 0)
2999 goto nla_put_failure;
3000
3001 cb->args[1] = 1;
3002 }
3003
3004nla_put_failure:
3005 mutex_unlock(&__ip_vs_mutex);
3006
3007 return skb->len;
3008}
3009
3010static int ip_vs_genl_new_daemon(struct nlattr **attrs)
3011{
3012 if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
3013 attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
3014 attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
3015 return -EINVAL;
3016
3017 return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
3018 nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
3019 nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
3020}
3021
3022static int ip_vs_genl_del_daemon(struct nlattr **attrs)
3023{
3024 if (!attrs[IPVS_DAEMON_ATTR_STATE])
3025 return -EINVAL;
3026
3027 return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3028}
3029
3030static int ip_vs_genl_set_config(struct nlattr **attrs)
3031{
3032 struct ip_vs_timeout_user t;
3033
3034 __ip_vs_get_timeouts(&t);
3035
3036 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
3037 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
3038
3039 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
3040 t.tcp_fin_timeout =
3041 nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
3042
3043 if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
3044 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
3045
3046 return ip_vs_set_timeout(&t);
3047}
3048
3049static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3050{
3051 struct ip_vs_service *svc = NULL;
3052 struct ip_vs_service_user_kern usvc;
3053 struct ip_vs_dest_user_kern udest;
3054 int ret = 0, cmd;
3055 int need_full_svc = 0, need_full_dest = 0;
3056
3057 cmd = info->genlhdr->cmd;
3058
3059 mutex_lock(&__ip_vs_mutex);
3060
3061 if (cmd == IPVS_CMD_FLUSH) {
3062 ret = ip_vs_flush();
3063 goto out;
3064 } else if (cmd == IPVS_CMD_SET_CONFIG) {
3065 ret = ip_vs_genl_set_config(info->attrs);
3066 goto out;
3067 } else if (cmd == IPVS_CMD_NEW_DAEMON ||
3068 cmd == IPVS_CMD_DEL_DAEMON) {
3069
3070 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
3071
3072 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
3073 nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
3074 info->attrs[IPVS_CMD_ATTR_DAEMON],
3075 ip_vs_daemon_policy)) {
3076 ret = -EINVAL;
3077 goto out;
3078 }
3079
3080 if (cmd == IPVS_CMD_NEW_DAEMON)
3081 ret = ip_vs_genl_new_daemon(daemon_attrs);
3082 else
3083 ret = ip_vs_genl_del_daemon(daemon_attrs);
3084 goto out;
3085 } else if (cmd == IPVS_CMD_ZERO &&
3086 !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
3087 ret = ip_vs_zero_all();
3088 goto out;
3089 }
3090
3091 /* All following commands require a service argument, so check if we
3092 * received a valid one. We need a full service specification when
3093 * adding / editing a service. Only identifying members otherwise. */
3094 if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
3095 need_full_svc = 1;
3096
3097 ret = ip_vs_genl_parse_service(&usvc,
3098 info->attrs[IPVS_CMD_ATTR_SERVICE],
3099 need_full_svc);
3100 if (ret)
3101 goto out;
3102
3103 /* Lookup the exact service by <protocol, addr, port> or fwmark */
3104 if (usvc.fwmark == 0)
3105 svc = __ip_vs_service_get(usvc.af, usvc.protocol,
3106 &usvc.addr, usvc.port);
3107 else
3108 svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
3109
3110 /* Unless we're adding a new service, the service must already exist */
3111 if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
3112 ret = -ESRCH;
3113 goto out;
3114 }
3115
3116 /* Destination commands require a valid destination argument. For
3117 * adding / editing a destination, we need a full destination
3118 * specification. */
3119 if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
3120 cmd == IPVS_CMD_DEL_DEST) {
3121 if (cmd != IPVS_CMD_DEL_DEST)
3122 need_full_dest = 1;
3123
3124 ret = ip_vs_genl_parse_dest(&udest,
3125 info->attrs[IPVS_CMD_ATTR_DEST],
3126 need_full_dest);
3127 if (ret)
3128 goto out;
3129 }
3130
3131 switch (cmd) {
3132 case IPVS_CMD_NEW_SERVICE:
3133 if (svc == NULL)
3134 ret = ip_vs_add_service(&usvc, &svc);
3135 else
3136 ret = -EEXIST;
3137 break;
3138 case IPVS_CMD_SET_SERVICE:
3139 ret = ip_vs_edit_service(svc, &usvc);
3140 break;
3141 case IPVS_CMD_DEL_SERVICE:
3142 ret = ip_vs_del_service(svc);
3143 break;
3144 case IPVS_CMD_NEW_DEST:
3145 ret = ip_vs_add_dest(svc, &udest);
3146 break;
3147 case IPVS_CMD_SET_DEST:
3148 ret = ip_vs_edit_dest(svc, &udest);
3149 break;
3150 case IPVS_CMD_DEL_DEST:
3151 ret = ip_vs_del_dest(svc, &udest);
3152 break;
3153 case IPVS_CMD_ZERO:
3154 ret = ip_vs_zero_service(svc);
3155 break;
3156 default:
3157 ret = -EINVAL;
3158 }
3159
3160out:
3161 if (svc)
3162 ip_vs_service_put(svc);
3163 mutex_unlock(&__ip_vs_mutex);
3164
3165 return ret;
3166}
3167
3168static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
3169{
3170 struct sk_buff *msg;
3171 void *reply;
3172 int ret, cmd, reply_cmd;
3173
3174 cmd = info->genlhdr->cmd;
3175
3176 if (cmd == IPVS_CMD_GET_SERVICE)
3177 reply_cmd = IPVS_CMD_NEW_SERVICE;
3178 else if (cmd == IPVS_CMD_GET_INFO)
3179 reply_cmd = IPVS_CMD_SET_INFO;
3180 else if (cmd == IPVS_CMD_GET_CONFIG)
3181 reply_cmd = IPVS_CMD_SET_CONFIG;
3182 else {
3183 IP_VS_ERR("unknown Generic Netlink command\n");
3184 return -EINVAL;
3185 }
3186
3187 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
3188 if (!msg)
3189 return -ENOMEM;
3190
3191 mutex_lock(&__ip_vs_mutex);
3192
3193 reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
3194 if (reply == NULL)
3195 goto nla_put_failure;
3196
3197 switch (cmd) {
3198 case IPVS_CMD_GET_SERVICE:
3199 {
3200 struct ip_vs_service *svc;
3201
3202 svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]);
3203 if (IS_ERR(svc)) {
3204 ret = PTR_ERR(svc);
3205 goto out_err;
3206 } else if (svc) {
3207 ret = ip_vs_genl_fill_service(msg, svc);
3208 ip_vs_service_put(svc);
3209 if (ret)
3210 goto nla_put_failure;
3211 } else {
3212 ret = -ESRCH;
3213 goto out_err;
3214 }
3215
3216 break;
3217 }
3218
3219 case IPVS_CMD_GET_CONFIG:
3220 {
3221 struct ip_vs_timeout_user t;
3222
3223 __ip_vs_get_timeouts(&t);
3224#ifdef CONFIG_IP_VS_PROTO_TCP
3225 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
3226 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
3227 t.tcp_fin_timeout);
3228#endif
3229#ifdef CONFIG_IP_VS_PROTO_UDP
3230 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout);
3231#endif
3232
3233 break;
3234 }
3235
3236 case IPVS_CMD_GET_INFO:
3237 NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
3238 NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
3239 IP_VS_CONN_TAB_SIZE);
3240 break;
3241 }
3242
3243 genlmsg_end(msg, reply);
3244 ret = genlmsg_unicast(msg, info->snd_pid);
3245 goto out;
3246
3247nla_put_failure:
3248 IP_VS_ERR("not enough space in Netlink message\n");
3249 ret = -EMSGSIZE;
3250
3251out_err:
3252 nlmsg_free(msg);
3253out:
3254 mutex_unlock(&__ip_vs_mutex);
3255
3256 return ret;
3257}
3258
3259
3260static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
3261 {
3262 .cmd = IPVS_CMD_NEW_SERVICE,
3263 .flags = GENL_ADMIN_PERM,
3264 .policy = ip_vs_cmd_policy,
3265 .doit = ip_vs_genl_set_cmd,
3266 },
3267 {
3268 .cmd = IPVS_CMD_SET_SERVICE,
3269 .flags = GENL_ADMIN_PERM,
3270 .policy = ip_vs_cmd_policy,
3271 .doit = ip_vs_genl_set_cmd,
3272 },
3273 {
3274 .cmd = IPVS_CMD_DEL_SERVICE,
3275 .flags = GENL_ADMIN_PERM,
3276 .policy = ip_vs_cmd_policy,
3277 .doit = ip_vs_genl_set_cmd,
3278 },
3279 {
3280 .cmd = IPVS_CMD_GET_SERVICE,
3281 .flags = GENL_ADMIN_PERM,
3282 .doit = ip_vs_genl_get_cmd,
3283 .dumpit = ip_vs_genl_dump_services,
3284 .policy = ip_vs_cmd_policy,
3285 },
3286 {
3287 .cmd = IPVS_CMD_NEW_DEST,
3288 .flags = GENL_ADMIN_PERM,
3289 .policy = ip_vs_cmd_policy,
3290 .doit = ip_vs_genl_set_cmd,
3291 },
3292 {
3293 .cmd = IPVS_CMD_SET_DEST,
3294 .flags = GENL_ADMIN_PERM,
3295 .policy = ip_vs_cmd_policy,
3296 .doit = ip_vs_genl_set_cmd,
3297 },
3298 {
3299 .cmd = IPVS_CMD_DEL_DEST,
3300 .flags = GENL_ADMIN_PERM,
3301 .policy = ip_vs_cmd_policy,
3302 .doit = ip_vs_genl_set_cmd,
3303 },
3304 {
3305 .cmd = IPVS_CMD_GET_DEST,
3306 .flags = GENL_ADMIN_PERM,
3307 .policy = ip_vs_cmd_policy,
3308 .dumpit = ip_vs_genl_dump_dests,
3309 },
3310 {
3311 .cmd = IPVS_CMD_NEW_DAEMON,
3312 .flags = GENL_ADMIN_PERM,
3313 .policy = ip_vs_cmd_policy,
3314 .doit = ip_vs_genl_set_cmd,
3315 },
3316 {
3317 .cmd = IPVS_CMD_DEL_DAEMON,
3318 .flags = GENL_ADMIN_PERM,
3319 .policy = ip_vs_cmd_policy,
3320 .doit = ip_vs_genl_set_cmd,
3321 },
3322 {
3323 .cmd = IPVS_CMD_GET_DAEMON,
3324 .flags = GENL_ADMIN_PERM,
3325 .dumpit = ip_vs_genl_dump_daemons,
3326 },
3327 {
3328 .cmd = IPVS_CMD_SET_CONFIG,
3329 .flags = GENL_ADMIN_PERM,
3330 .policy = ip_vs_cmd_policy,
3331 .doit = ip_vs_genl_set_cmd,
3332 },
3333 {
3334 .cmd = IPVS_CMD_GET_CONFIG,
3335 .flags = GENL_ADMIN_PERM,
3336 .doit = ip_vs_genl_get_cmd,
3337 },
3338 {
3339 .cmd = IPVS_CMD_GET_INFO,
3340 .flags = GENL_ADMIN_PERM,
3341 .doit = ip_vs_genl_get_cmd,
3342 },
3343 {
3344 .cmd = IPVS_CMD_ZERO,
3345 .flags = GENL_ADMIN_PERM,
3346 .policy = ip_vs_cmd_policy,
3347 .doit = ip_vs_genl_set_cmd,
3348 },
3349 {
3350 .cmd = IPVS_CMD_FLUSH,
3351 .flags = GENL_ADMIN_PERM,
3352 .doit = ip_vs_genl_set_cmd,
3353 },
3354};
3355
3356static int __init ip_vs_genl_register(void)
3357{
3358 int ret, i;
3359
3360 ret = genl_register_family(&ip_vs_genl_family);
3361 if (ret)
3362 return ret;
3363
3364 for (i = 0; i < ARRAY_SIZE(ip_vs_genl_ops); i++) {
3365 ret = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[i]);
3366 if (ret)
3367 goto err_out;
3368 }
3369 return 0;
3370
3371err_out:
3372 genl_unregister_family(&ip_vs_genl_family);
3373 return ret;
3374}
3375
3376static void ip_vs_genl_unregister(void)
3377{
3378 genl_unregister_family(&ip_vs_genl_family);
3379}
3380
3381/* End of Generic Netlink interface definitions */
3382
3383
3384int __init ip_vs_control_init(void)
3385{
3386 int ret;
3387 int idx;
3388
3389 EnterFunction(2);
3390
3391 ret = nf_register_sockopt(&ip_vs_sockopts);
3392 if (ret) {
3393 IP_VS_ERR("cannot register sockopt.\n");
3394 return ret;
3395 }
3396
3397 ret = ip_vs_genl_register();
3398 if (ret) {
3399 IP_VS_ERR("cannot register Generic Netlink interface.\n");
3400 nf_unregister_sockopt(&ip_vs_sockopts);
3401 return ret;
3402 }
3403
3404 proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
3405 proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
3406
3407 sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
3408
3409 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
3410 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
3411 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
3412 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
3413 }
3414 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
3415 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
3416 }
3417
3418 ip_vs_new_estimator(&ip_vs_stats);
3419
3420 /* Hook the defense timer */
3421 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
3422
3423 LeaveFunction(2);
3424 return 0;
3425}
3426
3427
3428void ip_vs_control_cleanup(void)
3429{
3430 EnterFunction(2);
3431 ip_vs_trash_cleanup();
3432 cancel_rearming_delayed_work(&defense_work);
3433 cancel_work_sync(&defense_work.work);
3434 ip_vs_kill_estimator(&ip_vs_stats);
3435 unregister_sysctl_table(sysctl_header);
3436 proc_net_remove(&init_net, "ip_vs_stats");
3437 proc_net_remove(&init_net, "ip_vs");
3438 ip_vs_genl_unregister();
3439 nf_unregister_sockopt(&ip_vs_sockopts);
3440 LeaveFunction(2);
3441}
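
The removed ip_vs_ctl.c exposes the configuration state through two user-space interfaces: the legacy get/set sockopts registered via ip_vs_sockopts and the generic netlink family ip_vs_genl_family. As a rough illustration of the sockopt path served by the IP_VS_SO_GET_INFO handler above, a user-space query could look like the sketch below; this is a hedged example rather than ipvsadm's actual code, the constants and struct layout are copied here only for illustration (in practice they come from the ip_vs.h header shipped with the kernel or ipvsadm), and the call needs CAP_NET_ADMIN.

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

/* Mirrored from the kernel's ip_vs.h purely for this sketch. */
#define IP_VS_BASE_CTL     (64 + 1024 + 64)
#define IP_VS_SO_GET_INFO  (IP_VS_BASE_CTL + 1)

struct ip_vs_getinfo {
        unsigned int version;       /* IP_VS_VERSION_CODE */
        unsigned int size;          /* connection hash table size */
        unsigned int num_services;  /* number of virtual services */
};

int main(void)
{
        struct ip_vs_getinfo info;
        socklen_t len = sizeof(info);
        /* The nf_sockopt handler is reached through an ordinary INET socket. */
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

        if (fd < 0)
                return 1;
        if (getsockopt(fd, IPPROTO_IP, IP_VS_SO_GET_INFO, &info, &len) == 0)
                printf("IPVS %u.%u.%u, conn table size %u, %u services\n",
                       (info.version >> 16) & 0xff, (info.version >> 8) & 0xff,
                       info.version & 0xff, info.size, info.num_services);
        return 0;
}
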
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
deleted file mode 100644
index 2eb2860dabb5..000000000000
--- a/net/ipv4/ipvs/ip_vs_est.c
+++ /dev/null
@@ -1,166 +0,0 @@
1/*
2 * ip_vs_est.c: simple rate estimator for IPVS
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 *
13 */
14#include <linux/kernel.h>
15#include <linux/jiffies.h>
16#include <linux/slab.h>
17#include <linux/types.h>
18#include <linux/interrupt.h>
19#include <linux/sysctl.h>
20#include <linux/list.h>
21
22#include <net/ip_vs.h>
23
24/*
25 This code is to estimate rate in a shorter interval (such as 8
26 seconds) for virtual services and real servers. To measure the rate over a
27 long interval, it is easy to implement a user-level daemon which
28 periodically reads those statistical counters and computes the rate.
29
30 Currently, the measurement is activated by a slow timer handler. Hopefully
31 this measurement will not introduce too much load.
32
33 We measure rate during the last 8 seconds every 2 seconds:
34
35 avgrate = avgrate*(1-W) + rate*W
36
37 where W = 2^(-2)
38
39 NOTES.
40
41 * The stored value for average bps is scaled by 2^5, so that the maximal
42 rate is ~2.15Gbits/s; average pps and cps are scaled by 2^10.
43
44 * A lot of code is taken from net/sched/estimator.c
45 */
46
47
48static void estimation_timer(unsigned long arg);
49
50static LIST_HEAD(est_list);
51static DEFINE_SPINLOCK(est_lock);
52static DEFINE_TIMER(est_timer, estimation_timer, 0, 0);
53
54static void estimation_timer(unsigned long arg)
55{
56 struct ip_vs_estimator *e;
57 struct ip_vs_stats *s;
58 u32 n_conns;
59 u32 n_inpkts, n_outpkts;
60 u64 n_inbytes, n_outbytes;
61 u32 rate;
62
63 spin_lock(&est_lock);
64 list_for_each_entry(e, &est_list, list) {
65 s = container_of(e, struct ip_vs_stats, est);
66
67 spin_lock(&s->lock);
68 n_conns = s->ustats.conns;
69 n_inpkts = s->ustats.inpkts;
70 n_outpkts = s->ustats.outpkts;
71 n_inbytes = s->ustats.inbytes;
72 n_outbytes = s->ustats.outbytes;
73
74 /* scaled by 2^10, but divided by 2 seconds */
75 rate = (n_conns - e->last_conns)<<9;
76 e->last_conns = n_conns;
77 e->cps += ((long)rate - (long)e->cps)>>2;
78 s->ustats.cps = (e->cps+0x1FF)>>10;
79
80 rate = (n_inpkts - e->last_inpkts)<<9;
81 e->last_inpkts = n_inpkts;
82 e->inpps += ((long)rate - (long)e->inpps)>>2;
83 s->ustats.inpps = (e->inpps+0x1FF)>>10;
84
85 rate = (n_outpkts - e->last_outpkts)<<9;
86 e->last_outpkts = n_outpkts;
87 e->outpps += ((long)rate - (long)e->outpps)>>2;
88 s->ustats.outpps = (e->outpps+0x1FF)>>10;
89
90 rate = (n_inbytes - e->last_inbytes)<<4;
91 e->last_inbytes = n_inbytes;
92 e->inbps += ((long)rate - (long)e->inbps)>>2;
93 s->ustats.inbps = (e->inbps+0xF)>>5;
94
95 rate = (n_outbytes - e->last_outbytes)<<4;
96 e->last_outbytes = n_outbytes;
97 e->outbps += ((long)rate - (long)e->outbps)>>2;
98 s->ustats.outbps = (e->outbps+0xF)>>5;
99 spin_unlock(&s->lock);
100 }
101 spin_unlock(&est_lock);
102 mod_timer(&est_timer, jiffies + 2*HZ);
103}
104
105void ip_vs_new_estimator(struct ip_vs_stats *stats)
106{
107 struct ip_vs_estimator *est = &stats->est;
108
109 INIT_LIST_HEAD(&est->list);
110
111 est->last_conns = stats->ustats.conns;
112 est->cps = stats->ustats.cps<<10;
113
114 est->last_inpkts = stats->ustats.inpkts;
115 est->inpps = stats->ustats.inpps<<10;
116
117 est->last_outpkts = stats->ustats.outpkts;
118 est->outpps = stats->ustats.outpps<<10;
119
120 est->last_inbytes = stats->ustats.inbytes;
121 est->inbps = stats->ustats.inbps<<5;
122
123 est->last_outbytes = stats->ustats.outbytes;
124 est->outbps = stats->ustats.outbps<<5;
125
126 spin_lock_bh(&est_lock);
127 list_add(&est->list, &est_list);
128 spin_unlock_bh(&est_lock);
129}
130
131void ip_vs_kill_estimator(struct ip_vs_stats *stats)
132{
133 struct ip_vs_estimator *est = &stats->est;
134
135 spin_lock_bh(&est_lock);
136 list_del(&est->list);
137 spin_unlock_bh(&est_lock);
138}
139
140void ip_vs_zero_estimator(struct ip_vs_stats *stats)
141{
142 struct ip_vs_estimator *est = &stats->est;
143
144 /* set counters zero, caller must hold the stats->lock lock */
145 est->last_inbytes = 0;
146 est->last_outbytes = 0;
147 est->last_conns = 0;
148 est->last_inpkts = 0;
149 est->last_outpkts = 0;
150 est->cps = 0;
151 est->inpps = 0;
152 est->outpps = 0;
153 est->inbps = 0;
154 est->outbps = 0;
155}
156
157int __init ip_vs_estimator_init(void)
158{
159 mod_timer(&est_timer, jiffies + 2 * HZ);
160 return 0;
161}
162
163void ip_vs_estimator_cleanup(void)
164{
165 del_timer_sync(&est_timer);
166}
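
The estimator deleted above keeps its moving averages in fixed point: per-second rates are scaled by 2^10 (2^5 for bytes), and the update e->cps += (rate - e->cps) >> 2 is exactly avgrate = avgrate*(1-W) + rate*W with W = 2^-2. The user-space sketch below replays that arithmetic on made-up connection counters sampled every 2 seconds, purely to show how the scaled value converges and how it is rounded back down, as ustats.cps is in estimation_timer().

#include <stdio.h>

int main(void)
{
        /* New connections seen in each 2-second tick (made-up sample data). */
        unsigned int ticks[] = { 100, 100, 100, 200, 200, 200, 200, 200 };
        unsigned int n_conns = 0, last_conns = 0;
        long cps = 0;   /* average connections/s, scaled by 2^10 */
        unsigned int i;

        for (i = 0; i < sizeof(ticks) / sizeof(ticks[0]); i++) {
                long rate;

                n_conns += ticks[i];            /* cumulative counter */
                /* delta over 2s: per-second rate scaled by 2^10 is <<9 */
                rate = (long)(n_conns - last_conns) << 9;
                last_conns = n_conns;
                /* avg += (rate - avg) * W, with W = 2^-2 */
                cps += (rate - cps) >> 2;
                /* round and unscale, as done for ustats.cps */
                printf("tick %u: cps ~= %ld\n", i, (cps + 0x1FF) >> 10);
        }
        return 0;
}
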
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c
deleted file mode 100644
index 2e7dbd8b73a4..000000000000
--- a/net/ipv4/ipvs/ip_vs_ftp.c
+++ /dev/null
@@ -1,410 +0,0 @@
1/*
2 * ip_vs_ftp.c: IPVS ftp application module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * Changes:
7 *
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
15 * is that the ip_vs_ftp module handles the reverse direction compared to ip_masq_ftp.
16 *
17 * IP_MASQ_FTP ftp masquerading module
18 *
19 * Version: @(#)ip_masq_ftp.c 0.04 02/05/96
20 *
21 * Author: Wouter Gadeyne
22 *
23 */
24
25#include <linux/module.h>
26#include <linux/moduleparam.h>
27#include <linux/kernel.h>
28#include <linux/skbuff.h>
29#include <linux/in.h>
30#include <linux/ip.h>
31#include <linux/netfilter.h>
32#include <net/protocol.h>
33#include <net/tcp.h>
34#include <asm/unaligned.h>
35
36#include <net/ip_vs.h>
37
38
39#define SERVER_STRING "227 Entering Passive Mode ("
40#define CLIENT_STRING "PORT "
41
42
43/*
44 * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
45 * First port is set to the default port.
46 */
47static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0};
48module_param_array(ports, ushort, NULL, 0);
49MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
50
51
52/* Dummy variable */
53static int ip_vs_ftp_pasv;
54
55
56static int
57ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
58{
59 return 0;
60}
61
62
63static int
64ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
65{
66 return 0;
67}
68
69
70/*
71 * Get <addr,port> from the string "xxx,xxx,xxx,xxx,ppp,ppp", starting
72 * with the "pattern" and terminated with the "term" character.
73 * <addr,port> is in network order.
74 */
75static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
76 const char *pattern, size_t plen, char term,
77 __be32 *addr, __be16 *port,
78 char **start, char **end)
79{
80 unsigned char p[6];
81 int i = 0;
82
83 if (data_limit - data < plen) {
84 /* check if there is partial match */
85 if (strnicmp(data, pattern, data_limit - data) == 0)
86 return -1;
87 else
88 return 0;
89 }
90
91 if (strnicmp(data, pattern, plen) != 0) {
92 return 0;
93 }
94 *start = data + plen;
95
96 for (data = *start; *data != term; data++) {
97 if (data == data_limit)
98 return -1;
99 }
100 *end = data;
101
102 memset(p, 0, sizeof(p));
103 for (data = *start; data != *end; data++) {
104 if (*data >= '0' && *data <= '9') {
105 p[i] = p[i]*10 + *data - '0';
106 } else if (*data == ',' && i < 5) {
107 i++;
108 } else {
109 /* unexpected character */
110 return -1;
111 }
112 }
113
114 if (i != 5)
115 return -1;
116
117 *addr = get_unaligned((__be32 *)p);
118 *port = get_unaligned((__be16 *)(p + 4));
119 return 1;
120}
121
122
123/*
124 * Look at outgoing ftp packets to catch the response to a PASV command
125 * from the server (inside-to-outside).
126 * When we see one, we build a connection entry with the client address,
127 * client port 0 (unknown at the moment), the server address and the
128 * server port. Mark the current connection entry as a control channel
129 * of the new entry. All this work is just so that the data connection
130 * can be scheduled to the right server later.
131 *
132 * The outgoing packet should be something like
133 * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
134 * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
135 */
136static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
137 struct sk_buff *skb, int *diff)
138{
139 struct iphdr *iph;
140 struct tcphdr *th;
141 char *data, *data_limit;
142 char *start, *end;
143 union nf_inet_addr from;
144 __be16 port;
145 struct ip_vs_conn *n_cp;
146 char buf[24]; /* xxx,xxx,xxx,xxx,ppp,ppp\000 */
147 unsigned buf_len;
148 int ret;
149
150#ifdef CONFIG_IP_VS_IPV6
151 /* This application helper doesn't work with IPv6 yet,
152 * so turn this into a no-op for IPv6 packets
153 */
154 if (cp->af == AF_INET6)
155 return 1;
156#endif
157
158 *diff = 0;
159
160 /* Only useful for established sessions */
161 if (cp->state != IP_VS_TCP_S_ESTABLISHED)
162 return 1;
163
164 /* Linear packets are much easier to deal with. */
165 if (!skb_make_writable(skb, skb->len))
166 return 0;
167
168 if (cp->app_data == &ip_vs_ftp_pasv) {
169 iph = ip_hdr(skb);
170 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
171 data = (char *)th + (th->doff << 2);
172 data_limit = skb_tail_pointer(skb);
173
174 if (ip_vs_ftp_get_addrport(data, data_limit,
175 SERVER_STRING,
176 sizeof(SERVER_STRING)-1, ')',
177 &from.ip, &port,
178 &start, &end) != 1)
179 return 1;
180
181 IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> "
182 "%u.%u.%u.%u:%d detected\n",
183 NIPQUAD(from.ip), ntohs(port),
184 NIPQUAD(cp->caddr.ip), 0);
185
186 /*
187 * Now update or create a connection entry for it
188 */
189 n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port,
190 &cp->caddr, 0);
191 if (!n_cp) {
192 n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
193 &cp->caddr, 0,
194 &cp->vaddr, port,
195 &from, port,
196 IP_VS_CONN_F_NO_CPORT,
197 cp->dest);
198 if (!n_cp)
199 return 0;
200
201 /* add its controller */
202 ip_vs_control_add(n_cp, cp);
203 }
204
205 /*
206 * Replace the old passive address with the new one
207 */
208 from.ip = n_cp->vaddr.ip;
209 port = n_cp->vport;
210 sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip),
211 (ntohs(port)>>8)&255, ntohs(port)&255);
212 buf_len = strlen(buf);
213
214 /*
215 * Calculate required delta-offset to keep TCP happy
216 */
217 *diff = buf_len - (end-start);
218
219 if (*diff == 0) {
220 /* simply replace it with new passive address */
221 memcpy(start, buf, buf_len);
222 ret = 1;
223 } else {
224 ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start,
225 end-start, buf, buf_len);
226 }
227
228 cp->app_data = NULL;
229 ip_vs_tcp_conn_listen(n_cp);
230 ip_vs_conn_put(n_cp);
231 return ret;
232 }
233 return 1;
234}
235
236
237/*
238 * Look at incoming ftp packets to catch the PASV/PORT command
239 * (outside-to-inside).
240 *
241 * The incoming packet having the PORT command should be something like
242 * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
243 * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
244 * In this case, we create a connection entry using the client address and
245 * port, so that the active ftp data connection from the server can reach
246 * the client.
247 */
248static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
249 struct sk_buff *skb, int *diff)
250{
251 struct iphdr *iph;
252 struct tcphdr *th;
253 char *data, *data_start, *data_limit;
254 char *start, *end;
255 union nf_inet_addr to;
256 __be16 port;
257 struct ip_vs_conn *n_cp;
258
259#ifdef CONFIG_IP_VS_IPV6
260 /* This application helper doesn't work with IPv6 yet,
261 * so turn this into a no-op for IPv6 packets
262 */
263 if (cp->af == AF_INET6)
264 return 1;
265#endif
266
267 /* no diff required for incoming packets */
268 *diff = 0;
269
270 /* Only useful for established sessions */
271 if (cp->state != IP_VS_TCP_S_ESTABLISHED)
272 return 1;
273
274 /* Linear packets are much easier to deal with. */
275 if (!skb_make_writable(skb, skb->len))
276 return 0;
277
278 /*
279 * Detecting whether it is passive
280 */
281 iph = ip_hdr(skb);
282 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
283
284 /* Since there may be OPTIONS in the TCP packet and the HLEN is
285 the length of the header in 32-bit multiples, it is accurate
286 to calculate data address by th+HLEN*4 */
287 data = data_start = (char *)th + (th->doff << 2);
288 data_limit = skb_tail_pointer(skb);
289
290 while (data <= data_limit - 6) {
291 if (strnicmp(data, "PASV\r\n", 6) == 0) {
292 /* Passive mode on */
293 IP_VS_DBG(7, "got PASV at %td of %td\n",
294 data - data_start,
295 data_limit - data_start);
296 cp->app_data = &ip_vs_ftp_pasv;
297 return 1;
298 }
299 data++;
300 }
301
302 /*
303 * To support a virtual FTP server, the scenario is as follows:
304 * FTP client ----> Load Balancer ----> FTP server
305 * First detect the port number in the application data,
306 * then create a new connection entry for the coming data
307 * connection.
308 */
309 if (ip_vs_ftp_get_addrport(data_start, data_limit,
310 CLIENT_STRING, sizeof(CLIENT_STRING)-1,
311 '\r', &to.ip, &port,
312 &start, &end) != 1)
313 return 1;
314
315 IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n",
316 NIPQUAD(to.ip), ntohs(port));
317
318 /* Passive mode off */
319 cp->app_data = NULL;
320
321 /*
322 * Now update or create a connection entry for it
323 */
324 IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
325 ip_vs_proto_name(iph->protocol),
326 NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0);
327
328 n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol,
329 &to, port,
330 &cp->vaddr, htons(ntohs(cp->vport)-1));
331 if (!n_cp) {
332 n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
333 &to, port,
334 &cp->vaddr, htons(ntohs(cp->vport)-1),
335 &cp->daddr, htons(ntohs(cp->dport)-1),
336 0,
337 cp->dest);
338 if (!n_cp)
339 return 0;
340
341 /* add its controller */
342 ip_vs_control_add(n_cp, cp);
343 }
344
345 /*
346 * Move tunnel to listen state
347 */
348 ip_vs_tcp_conn_listen(n_cp);
349 ip_vs_conn_put(n_cp);
350
351 return 1;
352}
353
354
355static struct ip_vs_app ip_vs_ftp = {
356 .name = "ftp",
357 .type = IP_VS_APP_TYPE_FTP,
358 .protocol = IPPROTO_TCP,
359 .module = THIS_MODULE,
360 .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list),
361 .init_conn = ip_vs_ftp_init_conn,
362 .done_conn = ip_vs_ftp_done_conn,
363 .bind_conn = NULL,
364 .unbind_conn = NULL,
365 .pkt_out = ip_vs_ftp_out,
366 .pkt_in = ip_vs_ftp_in,
367};
368
369
370/*
371 * ip_vs_ftp initialization
372 */
373static int __init ip_vs_ftp_init(void)
374{
375 int i, ret;
376 struct ip_vs_app *app = &ip_vs_ftp;
377
378 ret = register_ip_vs_app(app);
379 if (ret)
380 return ret;
381
382 for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
383 if (!ports[i])
384 continue;
385 ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
386 if (ret)
387 break;
388 IP_VS_INFO("%s: loaded support on port[%d] = %d\n",
389 app->name, i, ports[i]);
390 }
391
392 if (ret)
393 unregister_ip_vs_app(app);
394
395 return ret;
396}
397
398
399/*
400 * ip_vs_ftp finish.
401 */
402static void __exit ip_vs_ftp_exit(void)
403{
404 unregister_ip_vs_app(&ip_vs_ftp);
405}
406
407
408module_init(ip_vs_ftp_init);
409module_exit(ip_vs_ftp_exit);
410MODULE_LICENSE("GPL");
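
ip_vs_ftp_get_addrport() above walks the six comma-separated decimal groups by hand because the packet payload is not NUL-terminated and may be cut short. Freed from that constraint, the same conversion is simply "a,b,c,d,p1,p2" -> IPv4 address plus port p1*256 + p2; the user-space sketch below shows it for a PASV reply (the reply string is made up for illustration).

#include <stdio.h>

int main(void)
{
        const char *reply = "227 Entering Passive Mode (192,168,0,10,19,137)";
        unsigned int a, b, c, d, p1, p2;

        /* Parse the address/port tuple that follows SERVER_STRING. */
        if (sscanf(reply, "227 Entering Passive Mode (%u,%u,%u,%u,%u,%u)",
                   &a, &b, &c, &d, &p1, &p2) != 6)
                return 1;

        /* Data-connection endpoint advertised by the real server. */
        printf("addr %u.%u.%u.%u port %u\n", a, b, c, d, p1 * 256 + p2);
        return 0;
}
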
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
deleted file mode 100644
index 6ecef3518cac..000000000000
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ /dev/null
@@ -1,555 +0,0 @@
1/*
2 * IPVS: Locality-Based Least-Connection scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@gnuchina.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 * Martin Hamilton : fixed the terrible locking bugs
13 * *lock(tbl->lock) ==> *lock(&tbl->lock)
14 * Wensong Zhang : fixed the uninitialized tbl->lock bug
15 * Wensong Zhang : added doing full expiration check to
16 * collect stale entries of 24+ hours when
17 * no partial expire check in a half hour
18 * Julian Anastasov : replaced del_timer call with del_timer_sync
19 * to avoid the possible race between timer
20 * handler and del_timer thread in SMP
21 *
22 */
23
24/*
25 * The lblc algorithm is as follows (pseudo code):
26 *
27 * if cachenode[dest_ip] is null then
28 * n, cachenode[dest_ip] <- {weighted least-conn node};
29 * else
30 * n <- cachenode[dest_ip];
31 * if (n is dead) OR
32 * (n.conns>n.weight AND
33 * there is a node m with m.conns<m.weight/2) then
34 * n, cachenode[dest_ip] <- {weighted least-conn node};
35 *
36 * return n;
37 *
38 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
39 * me to write this module.
40 */
41
42#include <linux/ip.h>
43#include <linux/module.h>
44#include <linux/kernel.h>
45#include <linux/skbuff.h>
46#include <linux/jiffies.h>
47
48/* for sysctl */
49#include <linux/fs.h>
50#include <linux/sysctl.h>
51
52#include <net/ip_vs.h>
53
54
55/*
56 * It is for garbage collection of stale IPVS lblc entries,
57 * when the table is full.
58 */
59#define CHECK_EXPIRE_INTERVAL (60*HZ)
60#define ENTRY_TIMEOUT (6*60*HZ)
61
62/*
63 * It is for full expiration check.
64 * When there is no partial expiration check (garbage collection)
65 * in a half hour, do a full expiration check to collect stale
66 * entries that haven't been touched for a day.
67 */
68#define COUNT_FOR_FULL_EXPIRATION 30
69static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
70
71
72/*
73 * for IPVS lblc entry hash table
74 */
75#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
76#define CONFIG_IP_VS_LBLC_TAB_BITS 10
77#endif
78#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
79#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
80#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
81
82
83/*
84 * IPVS lblc entry represents an association between destination
85 * IP address and its destination server
86 */
87struct ip_vs_lblc_entry {
88 struct list_head list;
89 __be32 addr; /* destination IP address */
90 struct ip_vs_dest *dest; /* real server (cache) */
91 unsigned long lastuse; /* last used time */
92};
93
94
95/*
96 * IPVS lblc hash table
97 */
98struct ip_vs_lblc_table {
99 struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
100 atomic_t entries; /* number of entries */
101 int max_size; /* maximum size of entries */
102 struct timer_list periodic_timer; /* collect stale entries */
103 int rover; /* rover for expire check */
104 int counter; /* counter for no expire */
105};
106
107
108/*
109 * IPVS LBLC sysctl table
110 */
111
112static ctl_table vs_vars_table[] = {
113 {
114 .procname = "lblc_expiration",
115 .data = &sysctl_ip_vs_lblc_expiration,
116 .maxlen = sizeof(int),
117 .mode = 0644,
118 .proc_handler = &proc_dointvec_jiffies,
119 },
120 { .ctl_name = 0 }
121};
122
123static struct ctl_table_header * sysctl_header;
124
125static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
126{
127 list_del(&en->list);
128 /*
129 * We don't kfree dest because it is referred to either by its service
130 * or the trash dest list.
131 */
132 atomic_dec(&en->dest->refcnt);
133 kfree(en);
134}
135
136
137/*
138 * Returns hash value for IPVS LBLC entry
139 */
140static inline unsigned ip_vs_lblc_hashkey(__be32 addr)
141{
142 return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
143}
144
145
146/*
147 * Hash an entry in the ip_vs_lblc_table.
148 * The entry is linked into the bucket for its destination address.
149 */
150static void
151ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
152{
153 unsigned hash = ip_vs_lblc_hashkey(en->addr);
154
155 list_add(&en->list, &tbl->bucket[hash]);
156 atomic_inc(&tbl->entries);
157}
158
159
160/*
161 * Get ip_vs_lblc_entry associated with supplied parameters. Called under read
162 * lock
163 */
164static inline struct ip_vs_lblc_entry *
165ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
166{
167 unsigned hash = ip_vs_lblc_hashkey(addr);
168 struct ip_vs_lblc_entry *en;
169
170 list_for_each_entry(en, &tbl->bucket[hash], list)
171 if (en->addr == addr)
172 return en;
173
174 return NULL;
175}
176
177
178/*
179 * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
180 * address to a server. Called under write lock.
181 */
182static inline struct ip_vs_lblc_entry *
183ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr,
184 struct ip_vs_dest *dest)
185{
186 struct ip_vs_lblc_entry *en;
187
188 en = ip_vs_lblc_get(tbl, daddr);
189 if (!en) {
190 en = kmalloc(sizeof(*en), GFP_ATOMIC);
191 if (!en) {
192 IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
193 return NULL;
194 }
195
196 en->addr = daddr;
197 en->lastuse = jiffies;
198
199 atomic_inc(&dest->refcnt);
200 en->dest = dest;
201
202 ip_vs_lblc_hash(tbl, en);
203 } else if (en->dest != dest) {
204 atomic_dec(&en->dest->refcnt);
205 atomic_inc(&dest->refcnt);
206 en->dest = dest;
207 }
208
209 return en;
210}
211
212
213/*
214 * Flush all the entries of the specified table.
215 */
216static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
217{
218 struct ip_vs_lblc_entry *en, *nxt;
219 int i;
220
221 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
222 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
223 ip_vs_lblc_free(en);
224 atomic_dec(&tbl->entries);
225 }
226 }
227}
228
229
230static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
231{
232 struct ip_vs_lblc_table *tbl = svc->sched_data;
233 struct ip_vs_lblc_entry *en, *nxt;
234 unsigned long now = jiffies;
235 int i, j;
236
237 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
238 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
239
240 write_lock(&svc->sched_lock);
241 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
242 if (time_before(now,
243 en->lastuse + sysctl_ip_vs_lblc_expiration))
244 continue;
245
246 ip_vs_lblc_free(en);
247 atomic_dec(&tbl->entries);
248 }
249 write_unlock(&svc->sched_lock);
250 }
251 tbl->rover = j;
252}
253
254
255/*
256 * Periodic timer handler for the IPVS lblc table
257 * It is used to collect stale entries when the number of entries
258 * exceeds the maximum size of the table.
259 *
260 * Fixme: we probably need a more complicated algorithm to collect
261 * entries that have not been used for a long time even
262 * if the number of entries doesn't exceed the maximum size
263 * of the table.
264 * The full expiration check is for this purpose now.
265 */
266static void ip_vs_lblc_check_expire(unsigned long data)
267{
268 struct ip_vs_service *svc = (struct ip_vs_service *) data;
269 struct ip_vs_lblc_table *tbl = svc->sched_data;
270 unsigned long now = jiffies;
271 int goal;
272 int i, j;
273 struct ip_vs_lblc_entry *en, *nxt;
274
275 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
276 /* do full expiration check */
277 ip_vs_lblc_full_check(svc);
278 tbl->counter = 1;
279 goto out;
280 }
281
282 if (atomic_read(&tbl->entries) <= tbl->max_size) {
283 tbl->counter++;
284 goto out;
285 }
286
287 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
288 if (goal > tbl->max_size/2)
289 goal = tbl->max_size/2;
290
291 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
292 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
293
294 write_lock(&svc->sched_lock);
295 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
296 if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
297 continue;
298
299 ip_vs_lblc_free(en);
300 atomic_dec(&tbl->entries);
301 goal--;
302 }
303 write_unlock(&svc->sched_lock);
304 if (goal <= 0)
305 break;
306 }
307 tbl->rover = j;
308
309 out:
310 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
311}
312
313
314static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
315{
316 int i;
317 struct ip_vs_lblc_table *tbl;
318
319 /*
320 * Allocate the ip_vs_lblc_table for this service
321 */
322 tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
323 if (tbl == NULL) {
324 IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
325 return -ENOMEM;
326 }
327 svc->sched_data = tbl;
328 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
329 "current service\n", sizeof(*tbl));
330
331 /*
332 * Initialize the hash buckets
333 */
334 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
335 INIT_LIST_HEAD(&tbl->bucket[i]);
336 }
337 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
338 tbl->rover = 0;
339 tbl->counter = 1;
340
341 /*
342 * Hook periodic timer for garbage collection
343 */
344 setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
345 (unsigned long)svc);
346 mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
347
348 return 0;
349}
350
351
352static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
353{
354 struct ip_vs_lblc_table *tbl = svc->sched_data;
355
356 /* remove periodic timer */
357 del_timer_sync(&tbl->periodic_timer);
358
359 /* got to clean up table entries here */
360 ip_vs_lblc_flush(tbl);
361
362 /* release the table itself */
363 kfree(tbl);
364 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
365 sizeof(*tbl));
366
367 return 0;
368}
369
370
371static inline struct ip_vs_dest *
372__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
373{
374 struct ip_vs_dest *dest, *least;
375 int loh, doh;
376
377 /*
378 * We think the overhead of processing active connections is fifty
379 * times higher than that of inactive connections in average. (This
380 * fifty times might not be accurate, we will change it later.) We
381 * use the following formula to estimate the overhead:
382 * dest->activeconns*50 + dest->inactconns
383 * and the load:
384 * (dest overhead) / dest->weight
385 *
386 * Remember -- no floats in kernel mode!!!
387 * The comparison of h1*w2 > h2*w1 is equivalent to that of
388 * h1/w1 > h2/w2
389 * if every weight is larger than zero.
390 *
391 * The server with weight=0 is quiesced and will not receive any
392 * new connection.
393 */
394 list_for_each_entry(dest, &svc->destinations, n_list) {
395 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
396 continue;
397 if (atomic_read(&dest->weight) > 0) {
398 least = dest;
399 loh = atomic_read(&least->activeconns) * 50
400 + atomic_read(&least->inactconns);
401 goto nextstage;
402 }
403 }
404 return NULL;
405
406 /*
407 * Find the destination with the least load.
408 */
409 nextstage:
410 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
411 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
412 continue;
413
414 doh = atomic_read(&dest->activeconns) * 50
415 + atomic_read(&dest->inactconns);
416 if (loh * atomic_read(&dest->weight) >
417 doh * atomic_read(&least->weight)) {
418 least = dest;
419 loh = doh;
420 }
421 }
422
423 IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
424 "activeconns %d refcnt %d weight %d overhead %d\n",
425 NIPQUAD(least->addr.ip), ntohs(least->port),
426 atomic_read(&least->activeconns),
427 atomic_read(&least->refcnt),
428 atomic_read(&least->weight), loh);
429
430 return least;
431}
432
433
434/*
435 * If this destination server is overloaded and there is a less loaded
436 * server, then return true.
437 */
438static inline int
439is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
440{
441 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
442 struct ip_vs_dest *d;
443
444 list_for_each_entry(d, &svc->destinations, n_list) {
445 if (atomic_read(&d->activeconns)*2
446 < atomic_read(&d->weight)) {
447 return 1;
448 }
449 }
450 }
451 return 0;
452}
453
454
455/*
456 * Locality-Based (weighted) Least-Connection scheduling
457 */
458static struct ip_vs_dest *
459ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
460{
461 struct ip_vs_lblc_table *tbl = svc->sched_data;
462 struct iphdr *iph = ip_hdr(skb);
463 struct ip_vs_dest *dest = NULL;
464 struct ip_vs_lblc_entry *en;
465
466 IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
467
468 /* First look in our cache */
469 read_lock(&svc->sched_lock);
470 en = ip_vs_lblc_get(tbl, iph->daddr);
471 if (en) {
472 /* We only hold a read lock, but this is atomic */
473 en->lastuse = jiffies;
474
475 /*
476 * If the destination is not available, i.e. it's in the trash,
477 * we must ignore it, as it may be removed from under our feet,
478 * if someone drops our reference count. Our caller only makes
479 * sure that destinations, that are not in the trash, are not
480 * moved to the trash, while we are scheduling. But anyone can
481 * free up entries from the trash at any time.
482 */
483
484 if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
485 dest = en->dest;
486 }
487 read_unlock(&svc->sched_lock);
488
489 /* If the destination has a weight and is not overloaded, use it */
490 if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
491 goto out;
492
493 /* No cache entry or it is invalid, time to schedule */
494 dest = __ip_vs_lblc_schedule(svc, iph);
495 if (!dest) {
496 IP_VS_DBG(1, "no destination available\n");
497 return NULL;
498 }
499
500 /* If we fail to create a cache entry, we'll just use the valid dest */
501 write_lock(&svc->sched_lock);
502 ip_vs_lblc_new(tbl, iph->daddr, dest);
503 write_unlock(&svc->sched_lock);
504
505out:
506 IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
507 "--> server %u.%u.%u.%u:%d\n",
508 NIPQUAD(iph->daddr),
509 NIPQUAD(dest->addr.ip),
510 ntohs(dest->port));
511
512 return dest;
513}
514
515
516/*
517 * IPVS LBLC Scheduler structure
518 */
519static struct ip_vs_scheduler ip_vs_lblc_scheduler =
520{
521 .name = "lblc",
522 .refcnt = ATOMIC_INIT(0),
523 .module = THIS_MODULE,
524 .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
525#ifdef CONFIG_IP_VS_IPV6
526 .supports_ipv6 = 0,
527#endif
528 .init_service = ip_vs_lblc_init_svc,
529 .done_service = ip_vs_lblc_done_svc,
530 .schedule = ip_vs_lblc_schedule,
531};
532
533
534static int __init ip_vs_lblc_init(void)
535{
536 int ret;
537
538 sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
539 ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
540 if (ret)
541 unregister_sysctl_table(sysctl_header);
542 return ret;
543}
544
545
546static void __exit ip_vs_lblc_cleanup(void)
547{
548 unregister_sysctl_table(sysctl_header);
549 unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
550}
551
552
553module_init(ip_vs_lblc_init);
554module_exit(ip_vs_lblc_cleanup);
555MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
deleted file mode 100644
index 1f75ea83bcf8..000000000000
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ /dev/null
@@ -1,755 +0,0 @@
1/*
2 * IPVS: Locality-Based Least-Connection with Replication scheduler
3 *
4 * Authors: Wensong Zhang <wensong@gnuchina.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 * Julian Anastasov : Added the missing (dest->weight>0)
13 * condition in the ip_vs_dest_set_max.
14 *
15 */
16
17/*
18 * The lblc/r algorithm is as follows (pseudo code):
19 *
20 * if serverSet[dest_ip] is null then
21 * n, serverSet[dest_ip] <- {weighted least-conn node};
22 * else
23 * n <- {least-conn (alive) node in serverSet[dest_ip]};
24 * if (n is null) OR
25 * (n.conns>n.weight AND
26 * there is a node m with m.conns<m.weight/2) then
27 * n <- {weighted least-conn node};
28 * add n to serverSet[dest_ip];
29 * if |serverSet[dest_ip]| > 1 AND
30 * now - serverSet[dest_ip].lastMod > T then
31 * m <- {most conn node in serverSet[dest_ip]};
32 * remove m from serverSet[dest_ip];
33 * if serverSet[dest_ip] changed then
34 * serverSet[dest_ip].lastMod <- now;
35 *
36 * return n;
37 *
38 */
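/*
 * Roughly, in the code below: serverSet[dest_ip] corresponds to the
 * struct ip_vs_dest_set stored in an ip_vs_lblcr_entry (hashed by
 * destination IP), the "weighted least-conn node" choice to
 * __ip_vs_lblcr_schedule()/ip_vs_dest_set_min(), and the trimming of an
 * oversized set to ip_vs_dest_set_max() + ip_vs_dest_set_erase() in
 * ip_vs_lblcr_schedule().
 */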
39
40#include <linux/ip.h>
41#include <linux/module.h>
42#include <linux/kernel.h>
43#include <linux/skbuff.h>
44#include <linux/jiffies.h>
45
46/* for sysctl */
47#include <linux/fs.h>
48#include <linux/sysctl.h>
49#include <net/net_namespace.h>
50
51#include <net/ip_vs.h>
52
53
54/*
55 * It is for garbage collection of stale IPVS lblcr entries,
56 * when the table is full.
57 */
58#define CHECK_EXPIRE_INTERVAL (60*HZ)
59#define ENTRY_TIMEOUT (6*60*HZ)
60
61/*
62 * It is for full expiration check.
63 * When there is no partial expiration check (garbage collection)
64 * in a half hour, do a full expiration check to collect stale
65 * entries that haven't been touched for a day.
66 */
67#define COUNT_FOR_FULL_EXPIRATION 30
68static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
69
70
71/*
72 * for IPVS lblcr entry hash table
73 */
74#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
75#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
76#endif
77#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
78#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
79#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
80
81
82/*
83 * IPVS destination set structure and operations
84 */
85struct ip_vs_dest_list {
86 struct ip_vs_dest_list *next; /* list link */
87 struct ip_vs_dest *dest; /* destination server */
88};
89
90struct ip_vs_dest_set {
91 atomic_t size; /* set size */
92 unsigned long lastmod; /* last modified time */
93 struct ip_vs_dest_list *list; /* destination list */
94 rwlock_t lock; /* lock for this list */
95};
96
97
98static struct ip_vs_dest_list *
99ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
100{
101 struct ip_vs_dest_list *e;
102
103 for (e=set->list; e!=NULL; e=e->next) {
104 if (e->dest == dest)
105 /* already existed */
106 return NULL;
107 }
108
109 e = kmalloc(sizeof(*e), GFP_ATOMIC);
110 if (e == NULL) {
111 IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
112 return NULL;
113 }
114
115 atomic_inc(&dest->refcnt);
116 e->dest = dest;
117
118 /* link it to the list */
119 e->next = set->list;
120 set->list = e;
121 atomic_inc(&set->size);
122
123 set->lastmod = jiffies;
124 return e;
125}
126
127static void
128ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
129{
130 struct ip_vs_dest_list *e, **ep;
131
132 for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
133 if (e->dest == dest) {
134 /* HIT */
135 *ep = e->next;
136 atomic_dec(&set->size);
137 set->lastmod = jiffies;
138 atomic_dec(&e->dest->refcnt);
139 kfree(e);
140 break;
141 }
142 ep = &e->next;
143 }
144}
145
146static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
147{
148 struct ip_vs_dest_list *e, **ep;
149
150 write_lock(&set->lock);
151 for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
152 *ep = e->next;
153 /*
154			 * We don't kfree dest because it is referenced either
155 * by its service or by the trash dest list.
156 */
157 atomic_dec(&e->dest->refcnt);
158 kfree(e);
159 }
160 write_unlock(&set->lock);
161}
162
163/* get weighted least-connection node in the destination set */
164static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
165{
166 register struct ip_vs_dest_list *e;
167 struct ip_vs_dest *dest, *least;
168 int loh, doh;
169
170 if (set == NULL)
171 return NULL;
172
173	/* select the first destination server whose weight > 0 */
174 for (e=set->list; e!=NULL; e=e->next) {
175 least = e->dest;
176 if (least->flags & IP_VS_DEST_F_OVERLOAD)
177 continue;
178
179 if ((atomic_read(&least->weight) > 0)
180 && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
181 loh = atomic_read(&least->activeconns) * 50
182 + atomic_read(&least->inactconns);
183 goto nextstage;
184 }
185 }
186 return NULL;
187
188 /* find the destination with the weighted least load */
189 nextstage:
190 for (e=e->next; e!=NULL; e=e->next) {
191 dest = e->dest;
192 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
193 continue;
194
195 doh = atomic_read(&dest->activeconns) * 50
196 + atomic_read(&dest->inactconns);
197 if ((loh * atomic_read(&dest->weight) >
198 doh * atomic_read(&least->weight))
199 && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
200 least = dest;
201 loh = doh;
202 }
203 }
204
205 IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
206 "activeconns %d refcnt %d weight %d overhead %d\n",
207 NIPQUAD(least->addr.ip), ntohs(least->port),
208 atomic_read(&least->activeconns),
209 atomic_read(&least->refcnt),
210 atomic_read(&least->weight), loh);
211 return least;
212}
213
214
215/* get weighted most-connection node in the destination set */
216static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
217{
218 register struct ip_vs_dest_list *e;
219 struct ip_vs_dest *dest, *most;
220 int moh, doh;
221
222 if (set == NULL)
223 return NULL;
224
225	/* select the first destination server whose weight > 0 */
226 for (e=set->list; e!=NULL; e=e->next) {
227 most = e->dest;
228 if (atomic_read(&most->weight) > 0) {
229 moh = atomic_read(&most->activeconns) * 50
230 + atomic_read(&most->inactconns);
231 goto nextstage;
232 }
233 }
234 return NULL;
235
236 /* find the destination with the weighted most load */
237 nextstage:
238 for (e=e->next; e!=NULL; e=e->next) {
239 dest = e->dest;
240 doh = atomic_read(&dest->activeconns) * 50
241 + atomic_read(&dest->inactconns);
242 /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
243 if ((moh * atomic_read(&dest->weight) <
244 doh * atomic_read(&most->weight))
245 && (atomic_read(&dest->weight) > 0)) {
246 most = dest;
247 moh = doh;
248 }
249 }
250
251 IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
252 "activeconns %d refcnt %d weight %d overhead %d\n",
253 NIPQUAD(most->addr.ip), ntohs(most->port),
254 atomic_read(&most->activeconns),
255 atomic_read(&most->refcnt),
256 atomic_read(&most->weight), moh);
257 return most;
258}
259
260
261/*
262 * IPVS lblcr entry represents an association between destination
263 * IP address and its destination server set
264 */
265struct ip_vs_lblcr_entry {
266 struct list_head list;
267 __be32 addr; /* destination IP address */
268 struct ip_vs_dest_set set; /* destination server set */
269 unsigned long lastuse; /* last used time */
270};
271
272
273/*
274 * IPVS lblcr hash table
275 */
276struct ip_vs_lblcr_table {
277 struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
278 atomic_t entries; /* number of entries */
279 int max_size; /* maximum size of entries */
280 struct timer_list periodic_timer; /* collect stale entries */
281 int rover; /* rover for expire check */
282 int counter; /* counter for no expire */
283};
284
285
286/*
287 * IPVS LBLCR sysctl table
288 */
289
290static ctl_table vs_vars_table[] = {
291 {
292 .procname = "lblcr_expiration",
293 .data = &sysctl_ip_vs_lblcr_expiration,
294 .maxlen = sizeof(int),
295 .mode = 0644,
296 .proc_handler = &proc_dointvec_jiffies,
297 },
298 { .ctl_name = 0 }
299};
300
301static struct ctl_table_header * sysctl_header;
302
303static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
304{
305 list_del(&en->list);
306 ip_vs_dest_set_eraseall(&en->set);
307 kfree(en);
308}
309
310
311/*
312 * Returns hash value for IPVS LBLCR entry
313 */
314static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
315{
316 return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
317}
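/*
 * 2654435761 (0x9E3779B1) is close to 2^32 divided by the golden ratio;
 * multiplying by it spreads the address bits before the low
 * IP_VS_LBLCR_TAB_BITS bits of the product are taken as the bucket index
 * (Knuth-style multiplicative hashing).
 */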
318
319
320/*
321 * Hash an entry in the ip_vs_lblcr_table.
322 * The entry is added at the head of its hash bucket.
323 */
324static void
325ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
326{
327 unsigned hash = ip_vs_lblcr_hashkey(en->addr);
328
329 list_add(&en->list, &tbl->bucket[hash]);
330 atomic_inc(&tbl->entries);
331}
332
333
334/*
335 * Get ip_vs_lblcr_entry associated with supplied parameters. Called under
336 * read lock.
337 */
338static inline struct ip_vs_lblcr_entry *
339ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
340{
341 unsigned hash = ip_vs_lblcr_hashkey(addr);
342 struct ip_vs_lblcr_entry *en;
343
344 list_for_each_entry(en, &tbl->bucket[hash], list)
345 if (en->addr == addr)
346 return en;
347
348 return NULL;
349}
350
351
352/*
353 * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
354 * IP address to a server. Called under write lock.
355 */
356static inline struct ip_vs_lblcr_entry *
357ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr,
358 struct ip_vs_dest *dest)
359{
360 struct ip_vs_lblcr_entry *en;
361
362 en = ip_vs_lblcr_get(tbl, daddr);
363 if (!en) {
364 en = kmalloc(sizeof(*en), GFP_ATOMIC);
365 if (!en) {
366 IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
367 return NULL;
368 }
369
370 en->addr = daddr;
371 en->lastuse = jiffies;
372
373		/* initialize its dest set */
374 atomic_set(&(en->set.size), 0);
375 en->set.list = NULL;
376 rwlock_init(&en->set.lock);
377
378 ip_vs_lblcr_hash(tbl, en);
379 }
380
381 write_lock(&en->set.lock);
382 ip_vs_dest_set_insert(&en->set, dest);
383 write_unlock(&en->set.lock);
384
385 return en;
386}
387
388
389/*
390 * Flush all the entries of the specified table.
391 */
392static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
393{
394 int i;
395 struct ip_vs_lblcr_entry *en, *nxt;
396
397 /* No locking required, only called during cleanup. */
398 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
399 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
400 ip_vs_lblcr_free(en);
401 }
402 }
403}
404
405
406static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
407{
408 struct ip_vs_lblcr_table *tbl = svc->sched_data;
409 unsigned long now = jiffies;
410 int i, j;
411 struct ip_vs_lblcr_entry *en, *nxt;
412
413 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
414 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
415
416 write_lock(&svc->sched_lock);
417 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
418 if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
419 now))
420 continue;
421
422 ip_vs_lblcr_free(en);
423 atomic_dec(&tbl->entries);
424 }
425 write_unlock(&svc->sched_lock);
426 }
427 tbl->rover = j;
428}
429
430
431/*
432 * Periodic timer handler for the IPVS lblcr table
433 * It is used to collect stale entries when the number of entries
434 * exceeds the maximum size of the table.
435 *
436 * Fixme: we probably need a more complicated algorithm to collect
437 * entries that have not been used for a long time even
438 * if the number of entries doesn't exceed the maximum size
439 * of the table.
440 * The full expiration check is for this purpose now.
441 */
442static void ip_vs_lblcr_check_expire(unsigned long data)
443{
444 struct ip_vs_service *svc = (struct ip_vs_service *) data;
445 struct ip_vs_lblcr_table *tbl = svc->sched_data;
446 unsigned long now = jiffies;
447 int goal;
448 int i, j;
449 struct ip_vs_lblcr_entry *en, *nxt;
450
451 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
452 /* do full expiration check */
453 ip_vs_lblcr_full_check(svc);
454 tbl->counter = 1;
455 goto out;
456 }
457
458 if (atomic_read(&tbl->entries) <= tbl->max_size) {
459 tbl->counter++;
460 goto out;
461 }
462
463 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
464 if (goal > tbl->max_size/2)
465 goal = tbl->max_size/2;
466
467 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
468 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
469
470 write_lock(&svc->sched_lock);
471 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
472 if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
473 continue;
474
475 ip_vs_lblcr_free(en);
476 atomic_dec(&tbl->entries);
477 goal--;
478 }
479 write_unlock(&svc->sched_lock);
480 if (goal <= 0)
481 break;
482 }
483 tbl->rover = j;
484
485 out:
486 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
487}
488
489static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
490{
491 int i;
492 struct ip_vs_lblcr_table *tbl;
493
494 /*
495 * Allocate the ip_vs_lblcr_table for this service
496 */
497 tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
498 if (tbl == NULL) {
499 IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
500 return -ENOMEM;
501 }
502 svc->sched_data = tbl;
503 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
504 "current service\n", sizeof(*tbl));
505
506 /*
507 * Initialize the hash buckets
508 */
509 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
510 INIT_LIST_HEAD(&tbl->bucket[i]);
511 }
512 tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
513 tbl->rover = 0;
514 tbl->counter = 1;
515
516 /*
517 * Hook periodic timer for garbage collection
518 */
519 setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
520 (unsigned long)svc);
521 mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
522
523 return 0;
524}
525
526
527static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
528{
529 struct ip_vs_lblcr_table *tbl = svc->sched_data;
530
531 /* remove periodic timer */
532 del_timer_sync(&tbl->periodic_timer);
533
534 /* got to clean up table entries here */
535 ip_vs_lblcr_flush(tbl);
536
537 /* release the table itself */
538 kfree(tbl);
539 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
540 sizeof(*tbl));
541
542 return 0;
543}
544
545
546static inline struct ip_vs_dest *
547__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
548{
549 struct ip_vs_dest *dest, *least;
550 int loh, doh;
551
552 /*
553 * We think the overhead of processing active connections is fifty
554	 * times higher than that of inactive connections on average. (This
555	 * fifty times might not be accurate; we will change it later.) We
556 * use the following formula to estimate the overhead:
557 * dest->activeconns*50 + dest->inactconns
558 * and the load:
559 * (dest overhead) / dest->weight
560 *
561 * Remember -- no floats in kernel mode!!!
562 * The comparison of h1*w2 > h2*w1 is equivalent to that of
563 * h1/w1 > h2/w2
564 * if every weight is larger than zero.
565 *
566 * The server with weight=0 is quiesced and will not receive any
567 * new connection.
568 */
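	/*
	 * For example, with A (activeconns=10, inactconns=100, weight=3)
	 * examined first and B (activeconns=5, inactconns=50, weight=1):
	 *   overhead(A) = 10*50 + 100 = 600, overhead(B) = 5*50 + 50 = 300.
	 * A stays selected because 600*1 > 300*3 is false, i.e. A's load
	 * 600/3 = 200 is lower than B's 300/1 = 300.
	 */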
569 list_for_each_entry(dest, &svc->destinations, n_list) {
570 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
571 continue;
572
573 if (atomic_read(&dest->weight) > 0) {
574 least = dest;
575 loh = atomic_read(&least->activeconns) * 50
576 + atomic_read(&least->inactconns);
577 goto nextstage;
578 }
579 }
580 return NULL;
581
582 /*
583 * Find the destination with the least load.
584 */
585 nextstage:
586 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
587 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
588 continue;
589
590 doh = atomic_read(&dest->activeconns) * 50
591 + atomic_read(&dest->inactconns);
592 if (loh * atomic_read(&dest->weight) >
593 doh * atomic_read(&least->weight)) {
594 least = dest;
595 loh = doh;
596 }
597 }
598
599 IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
600 "activeconns %d refcnt %d weight %d overhead %d\n",
601 NIPQUAD(least->addr.ip), ntohs(least->port),
602 atomic_read(&least->activeconns),
603 atomic_read(&least->refcnt),
604 atomic_read(&least->weight), loh);
605
606 return least;
607}
608
609
610/*
611 * If this destination server is overloaded and there is a less loaded
612 * server, then return true.
613 */
614static inline int
615is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
616{
617 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
618 struct ip_vs_dest *d;
619
620 list_for_each_entry(d, &svc->destinations, n_list) {
621 if (atomic_read(&d->activeconns)*2
622 < atomic_read(&d->weight)) {
623 return 1;
624 }
625 }
626 }
627 return 0;
628}
629
630
631/*
632 * Locality-Based (weighted) Least-Connection with Replication scheduling
633 */
634static struct ip_vs_dest *
635ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
636{
637 struct ip_vs_lblcr_table *tbl = svc->sched_data;
638 struct iphdr *iph = ip_hdr(skb);
639 struct ip_vs_dest *dest = NULL;
640 struct ip_vs_lblcr_entry *en;
641
642 IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
643
644 /* First look in our cache */
645 read_lock(&svc->sched_lock);
646 en = ip_vs_lblcr_get(tbl, iph->daddr);
647 if (en) {
648 /* We only hold a read lock, but this is atomic */
649 en->lastuse = jiffies;
650
651 /* Get the least loaded destination */
652 read_lock(&en->set.lock);
653 dest = ip_vs_dest_set_min(&en->set);
654 read_unlock(&en->set.lock);
655
656 /* More than one destination + enough time passed by, cleanup */
657 if (atomic_read(&en->set.size) > 1 &&
658 time_after(jiffies, en->set.lastmod +
659 sysctl_ip_vs_lblcr_expiration)) {
660 struct ip_vs_dest *m;
661
662 write_lock(&en->set.lock);
663 m = ip_vs_dest_set_max(&en->set);
664 if (m)
665 ip_vs_dest_set_erase(&en->set, m);
666 write_unlock(&en->set.lock);
667 }
668
669 /* If the destination is not overloaded, use it */
670 if (dest && !is_overloaded(dest, svc)) {
671 read_unlock(&svc->sched_lock);
672 goto out;
673 }
674
675 /* The cache entry is invalid, time to schedule */
676 dest = __ip_vs_lblcr_schedule(svc, iph);
677 if (!dest) {
678 IP_VS_DBG(1, "no destination available\n");
679 read_unlock(&svc->sched_lock);
680 return NULL;
681 }
682
683 /* Update our cache entry */
684 write_lock(&en->set.lock);
685 ip_vs_dest_set_insert(&en->set, dest);
686 write_unlock(&en->set.lock);
687 }
688 read_unlock(&svc->sched_lock);
689
690 if (dest)
691 goto out;
692
693 /* No cache entry, time to schedule */
694 dest = __ip_vs_lblcr_schedule(svc, iph);
695 if (!dest) {
696 IP_VS_DBG(1, "no destination available\n");
697 return NULL;
698 }
699
700 /* If we fail to create a cache entry, we'll just use the valid dest */
701 write_lock(&svc->sched_lock);
702 ip_vs_lblcr_new(tbl, iph->daddr, dest);
703 write_unlock(&svc->sched_lock);
704
705out:
706 IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
707 "--> server %u.%u.%u.%u:%d\n",
708 NIPQUAD(iph->daddr),
709 NIPQUAD(dest->addr.ip),
710 ntohs(dest->port));
711
712 return dest;
713}
714
715
716/*
717 * IPVS LBLCR Scheduler structure
718 */
719static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
720{
721 .name = "lblcr",
722 .refcnt = ATOMIC_INIT(0),
723 .module = THIS_MODULE,
724 .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
725#ifdef CONFIG_IP_VS_IPV6
726 .supports_ipv6 = 0,
727#endif
728 .init_service = ip_vs_lblcr_init_svc,
729 .done_service = ip_vs_lblcr_done_svc,
730 .schedule = ip_vs_lblcr_schedule,
731};
732
733
734static int __init ip_vs_lblcr_init(void)
735{
736 int ret;
737
738 sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
739 ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
740 if (ret)
741 unregister_sysctl_table(sysctl_header);
742 return ret;
743}
744
745
746static void __exit ip_vs_lblcr_cleanup(void)
747{
748 unregister_sysctl_table(sysctl_header);
749 unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
750}
751
752
753module_init(ip_vs_lblcr_init);
754module_exit(ip_vs_lblcr_cleanup);
755MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c
deleted file mode 100644
index b69f808ac461..000000000000
--- a/net/ipv4/ipvs/ip_vs_lc.c
+++ /dev/null
@@ -1,103 +0,0 @@
1/*
2 * IPVS: Least-Connection Scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 * Wensong Zhang : added the ip_vs_lc_update_svc
13 * Wensong Zhang : added any dest with weight=0 is quiesced
14 *
15 */
16
17#include <linux/module.h>
18#include <linux/kernel.h>
19
20#include <net/ip_vs.h>
21
22
23static inline unsigned int
24ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
25{
26 /*
27 * We think the overhead of processing active connections is 256
28	 * times higher than that of inactive connections on average. (This
29	 * 256 times might not be accurate; we will change it later.) We
30 * use the following formula to estimate the overhead now:
31 * dest->activeconns*256 + dest->inactconns
32 */
33 return (atomic_read(&dest->activeconns) << 8) +
34 atomic_read(&dest->inactconns);
35}
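/*
 * For example, activeconns=3 and inactconns=10 give an overhead of
 * (3 << 8) + 10 = 778; one extra active connection outweighs up to 255
 * inactive ones.
 */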
36
37
38/*
39 * Least Connection scheduling
40 */
41static struct ip_vs_dest *
42ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
43{
44 struct ip_vs_dest *dest, *least = NULL;
45 unsigned int loh = 0, doh;
46
47 IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
48
49 /*
50	 * Simply select the server with the smallest value of
51	 * (activeconns<<8) + inactconns,
52	 * skipping any server whose weight is zero.
53	 * A weight of zero means that the server is quiesced: existing
54	 * connections to the server still get served, but no new
55	 * connections are assigned to the server.
56 */
57
58 list_for_each_entry(dest, &svc->destinations, n_list) {
59 if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
60 atomic_read(&dest->weight) == 0)
61 continue;
62 doh = ip_vs_lc_dest_overhead(dest);
63 if (!least || doh < loh) {
64 least = dest;
65 loh = doh;
66 }
67 }
68
69 if (least)
70 IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d inactconns %d\n",
71 IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
72 atomic_read(&least->activeconns),
73 atomic_read(&least->inactconns));
74
75 return least;
76}
77
78
79static struct ip_vs_scheduler ip_vs_lc_scheduler = {
80 .name = "lc",
81 .refcnt = ATOMIC_INIT(0),
82 .module = THIS_MODULE,
83 .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list),
84#ifdef CONFIG_IP_VS_IPV6
85 .supports_ipv6 = 1,
86#endif
87 .schedule = ip_vs_lc_schedule,
88};
89
90
91static int __init ip_vs_lc_init(void)
92{
93	return register_ip_vs_scheduler(&ip_vs_lc_scheduler);
94}
95
96static void __exit ip_vs_lc_cleanup(void)
97{
98 unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
99}
100
101module_init(ip_vs_lc_init);
102module_exit(ip_vs_lc_cleanup);
103MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c
deleted file mode 100644
index 9a2d8033f08f..000000000000
--- a/net/ipv4/ipvs/ip_vs_nq.c
+++ /dev/null
@@ -1,138 +0,0 @@
1/*
2 * IPVS: Never Queue scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 *
13 */
14
15/*
16 * The NQ algorithm adopts a two-speed model. When there is an idle server
17 * available, the job will be sent to the idle server, instead of waiting
18 * for a fast one. When there is no idle server available, the job will be
19 * sent to the server that minimize its expected delay (The Shortest
20 * Expected Delay scheduling algorithm).
21 *
22 * See the following paper for more information:
23 * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
24 * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
25 * pages 986-994, 1988.
26 *
27 * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
28 *
29 * The difference between NQ and SED is that NQ can improve overall
30 * system utilization.
31 *
32 */
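/*
 * For example, with A (activeconns=0, weight=1) and B (activeconns=2,
 * weight=10), A is chosen immediately because it is idle; only when no
 * server is idle does the SED-style comparison of
 * (activeconns+1)/weight decide.
 */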
33
34#include <linux/module.h>
35#include <linux/kernel.h>
36
37#include <net/ip_vs.h>
38
39
40static inline unsigned int
41ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
42{
43 /*
44 * We only use the active connection number in the cost
45 * calculation here.
46 */
47 return atomic_read(&dest->activeconns) + 1;
48}
49
50
51/*
52 * Never Queue scheduling
53 */
54static struct ip_vs_dest *
55ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
56{
57 struct ip_vs_dest *dest, *least = NULL;
58 unsigned int loh = 0, doh;
59
60 IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");
61
62 /*
63 * We calculate the load of each dest server as follows:
64 * (server expected overhead) / dest->weight
65 *
66 * Remember -- no floats in kernel mode!!!
67 * The comparison of h1*w2 > h2*w1 is equivalent to that of
68 * h1/w1 > h2/w2
69 * if every weight is larger than zero.
70 *
71 * The server with weight=0 is quiesced and will not receive any
72 * new connections.
73 */
74
75 list_for_each_entry(dest, &svc->destinations, n_list) {
76
77 if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
78 !atomic_read(&dest->weight))
79 continue;
80
81 doh = ip_vs_nq_dest_overhead(dest);
82
83 /* return the server directly if it is idle */
84 if (atomic_read(&dest->activeconns) == 0) {
85 least = dest;
86 loh = doh;
87 goto out;
88 }
89
90 if (!least ||
91 (loh * atomic_read(&dest->weight) >
92 doh * atomic_read(&least->weight))) {
93 least = dest;
94 loh = doh;
95 }
96 }
97
98 if (!least)
99 return NULL;
100
101 out:
102 IP_VS_DBG_BUF(6, "NQ: server %s:%u "
103 "activeconns %d refcnt %d weight %d overhead %d\n",
104 IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
105 atomic_read(&least->activeconns),
106 atomic_read(&least->refcnt),
107 atomic_read(&least->weight), loh);
108
109 return least;
110}
111
112
113static struct ip_vs_scheduler ip_vs_nq_scheduler =
114{
115 .name = "nq",
116 .refcnt = ATOMIC_INIT(0),
117 .module = THIS_MODULE,
118 .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list),
119#ifdef CONFIG_IP_VS_IPV6
120 .supports_ipv6 = 1,
121#endif
122 .schedule = ip_vs_nq_schedule,
123};
124
125
126static int __init ip_vs_nq_init(void)
127{
128 return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
129}
130
131static void __exit ip_vs_nq_cleanup(void)
132{
133 unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
134}
135
136module_init(ip_vs_nq_init);
137module_exit(ip_vs_nq_cleanup);
138MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c
deleted file mode 100644
index 0791f9e08feb..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto.c
+++ /dev/null
@@ -1,288 +0,0 @@
1/*
2 * ip_vs_proto.c: transport protocol load balancing support for IPVS
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Julian Anastasov <ja@ssi.bg>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Changes:
13 *
14 */
15
16#include <linux/module.h>
17#include <linux/kernel.h>
18#include <linux/skbuff.h>
19#include <linux/in.h>
20#include <linux/ip.h>
21#include <net/protocol.h>
22#include <net/tcp.h>
23#include <net/udp.h>
24#include <asm/system.h>
25#include <linux/stat.h>
26#include <linux/proc_fs.h>
27
28#include <net/ip_vs.h>
29
30
31/*
32 * IPVS protocols can only be registered/unregistered when the ipvs
33 * module is loaded/unloaded, so no lock is needed in accessing the
34 * ipvs protocol table.
35 */
36
37#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */
38#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
39
40static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
41
42
43/*
44 * register an ipvs protocol
45 */
46static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
47{
48 unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
49
50 pp->next = ip_vs_proto_table[hash];
51 ip_vs_proto_table[hash] = pp;
52
53 if (pp->init != NULL)
54 pp->init(pp);
55
56 return 0;
57}
58
59
60/*
61 * unregister an ipvs protocol
62 */
63static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
64{
65 struct ip_vs_protocol **pp_p;
66 unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
67
68 pp_p = &ip_vs_proto_table[hash];
69 for (; *pp_p; pp_p = &(*pp_p)->next) {
70 if (*pp_p == pp) {
71 *pp_p = pp->next;
72 if (pp->exit != NULL)
73 pp->exit(pp);
74 return 0;
75 }
76 }
77
78 return -ESRCH;
79}
80
81
82/*
83 * get ip_vs_protocol object by its proto.
84 */
85struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
86{
87 struct ip_vs_protocol *pp;
88 unsigned hash = IP_VS_PROTO_HASH(proto);
89
90 for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
91 if (pp->protocol == proto)
92 return pp;
93 }
94
95 return NULL;
96}
97
98
99/*
100 * Propagate event for state change to all protocols
101 */
102void ip_vs_protocol_timeout_change(int flags)
103{
104 struct ip_vs_protocol *pp;
105 int i;
106
107 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
108 for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
109 if (pp->timeout_change)
110 pp->timeout_change(pp, flags);
111 }
112 }
113}
114
115
116int *
117ip_vs_create_timeout_table(int *table, int size)
118{
119 return kmemdup(table, size, GFP_ATOMIC);
120}
121
122
123/*
124 * Set timeout value for state specified by name
125 */
126int
127ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to)
128{
129 int i;
130
131 if (!table || !name || !to)
132 return -EINVAL;
133
134 for (i = 0; i < num; i++) {
135 if (strcmp(names[i], name))
136 continue;
137 table[i] = to * HZ;
138 return 0;
139 }
140 return -ENOENT;
141}
142
143
144const char * ip_vs_state_name(__u16 proto, int state)
145{
146 struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
147
148 if (pp == NULL || pp->state_name == NULL)
149 return (IPPROTO_IP == proto) ? "NONE" : "ERR!";
150 return pp->state_name(state);
151}
152
153
154static void
155ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
156 const struct sk_buff *skb,
157 int offset,
158 const char *msg)
159{
160 char buf[128];
161 struct iphdr _iph, *ih;
162
163 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
164 if (ih == NULL)
165 sprintf(buf, "%s TRUNCATED", pp->name);
166 else if (ih->frag_off & htons(IP_OFFSET))
167 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
168 pp->name, NIPQUAD(ih->saddr),
169 NIPQUAD(ih->daddr));
170 else {
171		__be16 _ports[2], *pptr;
172
173 pptr = skb_header_pointer(skb, offset + ih->ihl*4,
174 sizeof(_ports), _ports);
175 if (pptr == NULL)
176 sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u",
177 pp->name,
178 NIPQUAD(ih->saddr),
179 NIPQUAD(ih->daddr));
180 else
181 sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
182 pp->name,
183 NIPQUAD(ih->saddr),
184 ntohs(pptr[0]),
185 NIPQUAD(ih->daddr),
186 ntohs(pptr[1]));
187 }
188
189 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
190}
191
192#ifdef CONFIG_IP_VS_IPV6
193static void
194ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
195 const struct sk_buff *skb,
196 int offset,
197 const char *msg)
198{
199 char buf[192];
200 struct ipv6hdr _iph, *ih;
201
202 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
203 if (ih == NULL)
204 sprintf(buf, "%s TRUNCATED", pp->name);
205 else if (ih->nexthdr == IPPROTO_FRAGMENT)
206 sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag",
207 pp->name, NIP6(ih->saddr),
208 NIP6(ih->daddr));
209 else {
210 __be16 _ports[2], *pptr;
211
212 pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
213 sizeof(_ports), _ports);
214 if (pptr == NULL)
215 sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT,
216 pp->name,
217 NIP6(ih->saddr),
218 NIP6(ih->daddr));
219 else
220 sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u",
221 pp->name,
222 NIP6(ih->saddr),
223 ntohs(pptr[0]),
224 NIP6(ih->daddr),
225 ntohs(pptr[1]));
226 }
227
228 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
229}
230#endif
231
232
233void
234ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
235 const struct sk_buff *skb,
236 int offset,
237 const char *msg)
238{
239#ifdef CONFIG_IP_VS_IPV6
240 if (skb->protocol == htons(ETH_P_IPV6))
241 ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
242 else
243#endif
244 ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
245}
246
247
248int __init ip_vs_protocol_init(void)
249{
250 char protocols[64];
251#define REGISTER_PROTOCOL(p) \
252 do { \
253 register_ip_vs_protocol(p); \
254 strcat(protocols, ", "); \
255 strcat(protocols, (p)->name); \
256 } while (0)
257
258 protocols[0] = '\0';
259 protocols[2] = '\0';
260#ifdef CONFIG_IP_VS_PROTO_TCP
261 REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
262#endif
263#ifdef CONFIG_IP_VS_PROTO_UDP
264 REGISTER_PROTOCOL(&ip_vs_protocol_udp);
265#endif
266#ifdef CONFIG_IP_VS_PROTO_AH
267 REGISTER_PROTOCOL(&ip_vs_protocol_ah);
268#endif
269#ifdef CONFIG_IP_VS_PROTO_ESP
270 REGISTER_PROTOCOL(&ip_vs_protocol_esp);
271#endif
272 IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]);
273
274 return 0;
275}
276
277
278void ip_vs_protocol_cleanup(void)
279{
280 struct ip_vs_protocol *pp;
281 int i;
282
283 /* unregister all the ipvs protocols */
284 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
285 while ((pp = ip_vs_proto_table[i]) != NULL)
286 unregister_ip_vs_protocol(pp);
287 }
288}
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
deleted file mode 100644
index 80ab0c8e5b4a..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
+++ /dev/null
@@ -1,235 +0,0 @@
1/*
2 * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS
3 *
4 * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
5 * Wensong Zhang <wensong@linuxvirtualserver.org>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * version 2 as published by the Free Software Foundation;
10 *
11 */
12
13#include <linux/in.h>
14#include <linux/ip.h>
15#include <linux/module.h>
16#include <linux/kernel.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19
20#include <net/ip_vs.h>
21
22
23/* TODO:
24
25struct isakmp_hdr {
26 __u8 icookie[8];
27 __u8 rcookie[8];
28 __u8 np;
29 __u8 version;
30 __u8 xchgtype;
31 __u8 flags;
32 __u32 msgid;
33 __u32 length;
34};
35
36*/
37
38#define PORT_ISAKMP 500
39
40
41static struct ip_vs_conn *
42ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
43 const struct ip_vs_iphdr *iph, unsigned int proto_off,
44 int inverse)
45{
46 struct ip_vs_conn *cp;
47
48 if (likely(!inverse)) {
49 cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
50 &iph->saddr,
51 htons(PORT_ISAKMP),
52 &iph->daddr,
53 htons(PORT_ISAKMP));
54 } else {
55 cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
56 &iph->daddr,
57 htons(PORT_ISAKMP),
58 &iph->saddr,
59 htons(PORT_ISAKMP));
60 }
61
62 if (!cp) {
63 /*
64 * We are not sure if the packet is from our
65 * service, so our conn_schedule hook should return NF_ACCEPT
66 */
67 IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
68 "%s%s %s->%s\n",
69 inverse ? "ICMP+" : "",
70 pp->name,
71 IP_VS_DBG_ADDR(af, &iph->saddr),
72 IP_VS_DBG_ADDR(af, &iph->daddr));
73 }
74
75 return cp;
76}
77
78
79static struct ip_vs_conn *
80ah_esp_conn_out_get(int af, const struct sk_buff *skb,
81 struct ip_vs_protocol *pp,
82 const struct ip_vs_iphdr *iph,
83 unsigned int proto_off,
84 int inverse)
85{
86 struct ip_vs_conn *cp;
87
88 if (likely(!inverse)) {
89 cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
90 &iph->saddr,
91 htons(PORT_ISAKMP),
92 &iph->daddr,
93 htons(PORT_ISAKMP));
94 } else {
95 cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
96 &iph->daddr,
97 htons(PORT_ISAKMP),
98 &iph->saddr,
99 htons(PORT_ISAKMP));
100 }
101
102 if (!cp) {
103 IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
104 "%s%s %s->%s\n",
105 inverse ? "ICMP+" : "",
106 pp->name,
107 IP_VS_DBG_ADDR(af, &iph->saddr),
108 IP_VS_DBG_ADDR(af, &iph->daddr));
109 }
110
111 return cp;
112}
113
114
115static int
116ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
117 int *verdict, struct ip_vs_conn **cpp)
118{
119 /*
120	 * AH/ESP is handled as related traffic only, so pass the packet to the IP stack.
121 */
122 *verdict = NF_ACCEPT;
123 return 0;
124}
125
126
127static void
128ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
129 int offset, const char *msg)
130{
131 char buf[256];
132 struct iphdr _iph, *ih;
133
134 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
135 if (ih == NULL)
136 sprintf(buf, "%s TRUNCATED", pp->name);
137 else
138 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
139 pp->name, NIPQUAD(ih->saddr),
140 NIPQUAD(ih->daddr));
141
142 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
143}
144
145#ifdef CONFIG_IP_VS_IPV6
146static void
147ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
148 int offset, const char *msg)
149{
150 char buf[256];
151 struct ipv6hdr _iph, *ih;
152
153 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
154 if (ih == NULL)
155 sprintf(buf, "%s TRUNCATED", pp->name);
156 else
157 sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT,
158 pp->name, NIP6(ih->saddr),
159 NIP6(ih->daddr));
160
161 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
162}
163#endif
164
165static void
166ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
167 int offset, const char *msg)
168{
169#ifdef CONFIG_IP_VS_IPV6
170 if (skb->protocol == htons(ETH_P_IPV6))
171 ah_esp_debug_packet_v6(pp, skb, offset, msg);
172 else
173#endif
174 ah_esp_debug_packet_v4(pp, skb, offset, msg);
175}
176
177
178static void ah_esp_init(struct ip_vs_protocol *pp)
179{
180 /* nothing to do now */
181}
182
183
184static void ah_esp_exit(struct ip_vs_protocol *pp)
185{
186 /* nothing to do now */
187}
188
189
190#ifdef CONFIG_IP_VS_PROTO_AH
191struct ip_vs_protocol ip_vs_protocol_ah = {
192 .name = "AH",
193 .protocol = IPPROTO_AH,
194 .num_states = 1,
195 .dont_defrag = 1,
196 .init = ah_esp_init,
197 .exit = ah_esp_exit,
198 .conn_schedule = ah_esp_conn_schedule,
199 .conn_in_get = ah_esp_conn_in_get,
200 .conn_out_get = ah_esp_conn_out_get,
201 .snat_handler = NULL,
202 .dnat_handler = NULL,
203 .csum_check = NULL,
204 .state_transition = NULL,
205 .register_app = NULL,
206 .unregister_app = NULL,
207 .app_conn_bind = NULL,
208 .debug_packet = ah_esp_debug_packet,
209 .timeout_change = NULL, /* ISAKMP */
210 .set_state_timeout = NULL,
211};
212#endif
213
214#ifdef CONFIG_IP_VS_PROTO_ESP
215struct ip_vs_protocol ip_vs_protocol_esp = {
216 .name = "ESP",
217 .protocol = IPPROTO_ESP,
218 .num_states = 1,
219 .dont_defrag = 1,
220 .init = ah_esp_init,
221 .exit = ah_esp_exit,
222 .conn_schedule = ah_esp_conn_schedule,
223 .conn_in_get = ah_esp_conn_in_get,
224 .conn_out_get = ah_esp_conn_out_get,
225 .snat_handler = NULL,
226 .dnat_handler = NULL,
227 .csum_check = NULL,
228 .state_transition = NULL,
229 .register_app = NULL,
230 .unregister_app = NULL,
231 .app_conn_bind = NULL,
232 .debug_packet = ah_esp_debug_packet,
233 .timeout_change = NULL, /* ISAKMP */
234};
235#endif
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
deleted file mode 100644
index dd4566ea2bff..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ /dev/null
@@ -1,732 +0,0 @@
1/*
2 * ip_vs_proto_tcp.c: TCP load balancing support for IPVS
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Julian Anastasov <ja@ssi.bg>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Changes:
13 *
14 */
15
16#include <linux/kernel.h>
17#include <linux/ip.h>
18#include <linux/tcp.h> /* for tcphdr */
19#include <net/ip.h>
20#include <net/tcp.h> /* for csum_tcpudp_magic */
21#include <net/ip6_checksum.h>
22#include <linux/netfilter.h>
23#include <linux/netfilter_ipv4.h>
24
25#include <net/ip_vs.h>
26
27
28static struct ip_vs_conn *
29tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
30 const struct ip_vs_iphdr *iph, unsigned int proto_off,
31 int inverse)
32{
33 __be16 _ports[2], *pptr;
34
35 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
36 if (pptr == NULL)
37 return NULL;
38
39 if (likely(!inverse)) {
40 return ip_vs_conn_in_get(af, iph->protocol,
41 &iph->saddr, pptr[0],
42 &iph->daddr, pptr[1]);
43 } else {
44 return ip_vs_conn_in_get(af, iph->protocol,
45 &iph->daddr, pptr[1],
46 &iph->saddr, pptr[0]);
47 }
48}
49
50static struct ip_vs_conn *
51tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
52 const struct ip_vs_iphdr *iph, unsigned int proto_off,
53 int inverse)
54{
55 __be16 _ports[2], *pptr;
56
57 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
58 if (pptr == NULL)
59 return NULL;
60
61 if (likely(!inverse)) {
62 return ip_vs_conn_out_get(af, iph->protocol,
63 &iph->saddr, pptr[0],
64 &iph->daddr, pptr[1]);
65 } else {
66 return ip_vs_conn_out_get(af, iph->protocol,
67 &iph->daddr, pptr[1],
68 &iph->saddr, pptr[0]);
69 }
70}
71
72
73static int
74tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
75 int *verdict, struct ip_vs_conn **cpp)
76{
77 struct ip_vs_service *svc;
78 struct tcphdr _tcph, *th;
79 struct ip_vs_iphdr iph;
80
81 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
82
83 th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
84 if (th == NULL) {
85 *verdict = NF_DROP;
86 return 0;
87 }
88
89 if (th->syn &&
90 (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
91 th->dest))) {
92 if (ip_vs_todrop()) {
93 /*
94 * It seems that we are very loaded.
95 * We have to drop this packet :(
96 */
97 ip_vs_service_put(svc);
98 *verdict = NF_DROP;
99 return 0;
100 }
101
102 /*
103 * Let the virtual server select a real server for the
104 * incoming connection, and create a connection entry.
105 */
106 *cpp = ip_vs_schedule(svc, skb);
107 if (!*cpp) {
108 *verdict = ip_vs_leave(svc, skb, pp);
109 return 0;
110 }
111 ip_vs_service_put(svc);
112 }
113 return 1;
114}
115
116
117static inline void
118tcp_fast_csum_update(int af, struct tcphdr *tcph,
119 const union nf_inet_addr *oldip,
120 const union nf_inet_addr *newip,
121 __be16 oldport, __be16 newport)
122{
123#ifdef CONFIG_IP_VS_IPV6
124 if (af == AF_INET6)
125 tcph->check =
126 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
127 ip_vs_check_diff2(oldport, newport,
128 ~csum_unfold(tcph->check))));
129 else
130#endif
131 tcph->check =
132 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
133 ip_vs_check_diff2(oldport, newport,
134 ~csum_unfold(tcph->check))));
135}
136
137
138static inline void
139tcp_partial_csum_update(int af, struct tcphdr *tcph,
140 const union nf_inet_addr *oldip,
141 const union nf_inet_addr *newip,
142 __be16 oldlen, __be16 newlen)
143{
144#ifdef CONFIG_IP_VS_IPV6
145 if (af == AF_INET6)
146 tcph->check =
147 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
148 ip_vs_check_diff2(oldlen, newlen,
149 ~csum_unfold(tcph->check))));
150 else
151#endif
152 tcph->check =
153 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
154 ip_vs_check_diff2(oldlen, newlen,
155 ~csum_unfold(tcph->check))));
156}
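/*
 * Both helpers above update the TCP checksum incrementally (in the
 * spirit of RFC 1624): only the difference contributed by the rewritten
 * address/port (or length) words is folded back into the existing
 * checksum. The full recalculation in tcp_snat_handler()/
 * tcp_dnat_handler() is only needed when an application helper may have
 * mangled the payload.
 */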
157
158
159static int
160tcp_snat_handler(struct sk_buff *skb,
161 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
162{
163 struct tcphdr *tcph;
164 unsigned int tcphoff;
165 int oldlen;
166
167#ifdef CONFIG_IP_VS_IPV6
168 if (cp->af == AF_INET6)
169 tcphoff = sizeof(struct ipv6hdr);
170 else
171#endif
172 tcphoff = ip_hdrlen(skb);
173 oldlen = skb->len - tcphoff;
174
175 /* csum_check requires unshared skb */
176 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
177 return 0;
178
179 if (unlikely(cp->app != NULL)) {
180 /* Some checks before mangling */
181 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
182 return 0;
183
184 /* Call application helper if needed */
185 if (!ip_vs_app_pkt_out(cp, skb))
186 return 0;
187 }
188
189 tcph = (void *)skb_network_header(skb) + tcphoff;
190 tcph->source = cp->vport;
191
192 /* Adjust TCP checksums */
193 if (skb->ip_summed == CHECKSUM_PARTIAL) {
194 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
195 htonl(oldlen),
196 htonl(skb->len - tcphoff));
197 } else if (!cp->app) {
198 /* Only port and addr are changed, do fast csum update */
199 tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
200 cp->dport, cp->vport);
201 if (skb->ip_summed == CHECKSUM_COMPLETE)
202 skb->ip_summed = CHECKSUM_NONE;
203 } else {
204 /* full checksum calculation */
205 tcph->check = 0;
206 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
207#ifdef CONFIG_IP_VS_IPV6
208 if (cp->af == AF_INET6)
209 tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
210 &cp->caddr.in6,
211 skb->len - tcphoff,
212 cp->protocol, skb->csum);
213 else
214#endif
215 tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
216 cp->caddr.ip,
217 skb->len - tcphoff,
218 cp->protocol,
219 skb->csum);
220
221 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
222 pp->name, tcph->check,
223 (char*)&(tcph->check) - (char*)tcph);
224 }
225 return 1;
226}
227
228
229static int
230tcp_dnat_handler(struct sk_buff *skb,
231 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
232{
233 struct tcphdr *tcph;
234 unsigned int tcphoff;
235 int oldlen;
236
237#ifdef CONFIG_IP_VS_IPV6
238 if (cp->af == AF_INET6)
239 tcphoff = sizeof(struct ipv6hdr);
240 else
241#endif
242 tcphoff = ip_hdrlen(skb);
243 oldlen = skb->len - tcphoff;
244
245 /* csum_check requires unshared skb */
246 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
247 return 0;
248
249 if (unlikely(cp->app != NULL)) {
250 /* Some checks before mangling */
251 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
252 return 0;
253
254 /*
255 * Attempt ip_vs_app call.
256 * It will fix ip_vs_conn and iph ack_seq stuff
257 */
258 if (!ip_vs_app_pkt_in(cp, skb))
259 return 0;
260 }
261
262 tcph = (void *)skb_network_header(skb) + tcphoff;
263 tcph->dest = cp->dport;
264
265 /*
266 * Adjust TCP checksums
267 */
268 if (skb->ip_summed == CHECKSUM_PARTIAL) {
269 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
270 htonl(oldlen),
271 htonl(skb->len - tcphoff));
272 } else if (!cp->app) {
273 /* Only port and addr are changed, do fast csum update */
274 tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
275 cp->vport, cp->dport);
276 if (skb->ip_summed == CHECKSUM_COMPLETE)
277 skb->ip_summed = CHECKSUM_NONE;
278 } else {
279 /* full checksum calculation */
280 tcph->check = 0;
281 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
282#ifdef CONFIG_IP_VS_IPV6
283 if (cp->af == AF_INET6)
284 tcph->check = csum_ipv6_magic(&cp->caddr.in6,
285 &cp->daddr.in6,
286 skb->len - tcphoff,
287 cp->protocol, skb->csum);
288 else
289#endif
290 tcph->check = csum_tcpudp_magic(cp->caddr.ip,
291 cp->daddr.ip,
292 skb->len - tcphoff,
293 cp->protocol,
294 skb->csum);
295 skb->ip_summed = CHECKSUM_UNNECESSARY;
296 }
297 return 1;
298}
299
300
301static int
302tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
303{
304 unsigned int tcphoff;
305
306#ifdef CONFIG_IP_VS_IPV6
307 if (af == AF_INET6)
308 tcphoff = sizeof(struct ipv6hdr);
309 else
310#endif
311 tcphoff = ip_hdrlen(skb);
312
313 switch (skb->ip_summed) {
314 case CHECKSUM_NONE:
315 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
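		/* fall through to verify the checksum just computed */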
316 case CHECKSUM_COMPLETE:
317#ifdef CONFIG_IP_VS_IPV6
318 if (af == AF_INET6) {
319 if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
320 &ipv6_hdr(skb)->daddr,
321 skb->len - tcphoff,
322 ipv6_hdr(skb)->nexthdr,
323 skb->csum)) {
324 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
325 "Failed checksum for");
326 return 0;
327 }
328 } else
329#endif
330 if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
331 ip_hdr(skb)->daddr,
332 skb->len - tcphoff,
333 ip_hdr(skb)->protocol,
334 skb->csum)) {
335 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
336 "Failed checksum for");
337 return 0;
338 }
339 break;
340 default:
341 /* No need to checksum. */
342 break;
343 }
344
345 return 1;
346}
347
348
349#define TCP_DIR_INPUT 0
350#define TCP_DIR_OUTPUT 4
351#define TCP_DIR_INPUT_ONLY 8
352
353static const int tcp_state_off[IP_VS_DIR_LAST] = {
354 [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
355 [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
356 [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
357};
358
359/*
360 * Timeout table[state]
361 */
362static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
363 [IP_VS_TCP_S_NONE] = 2*HZ,
364 [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
365 [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
366 [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
367 [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
368 [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
369 [IP_VS_TCP_S_CLOSE] = 10*HZ,
370 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
371 [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
372 [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
373 [IP_VS_TCP_S_SYNACK] = 120*HZ,
374 [IP_VS_TCP_S_LAST] = 2*HZ,
375};
376
377static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
378 [IP_VS_TCP_S_NONE] = "NONE",
379 [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
380 [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
381 [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
382 [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
383 [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
384 [IP_VS_TCP_S_CLOSE] = "CLOSE",
385 [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
386 [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
387 [IP_VS_TCP_S_LISTEN] = "LISTEN",
388 [IP_VS_TCP_S_SYNACK] = "SYNACK",
389 [IP_VS_TCP_S_LAST] = "BUG!",
390};
391
392#define sNO IP_VS_TCP_S_NONE
393#define sES IP_VS_TCP_S_ESTABLISHED
394#define sSS IP_VS_TCP_S_SYN_SENT
395#define sSR IP_VS_TCP_S_SYN_RECV
396#define sFW IP_VS_TCP_S_FIN_WAIT
397#define sTW IP_VS_TCP_S_TIME_WAIT
398#define sCL IP_VS_TCP_S_CLOSE
399#define sCW IP_VS_TCP_S_CLOSE_WAIT
400#define sLA IP_VS_TCP_S_LAST_ACK
401#define sLI IP_VS_TCP_S_LISTEN
402#define sSA IP_VS_TCP_S_SYNACK
403
404struct tcp_states_t {
405 int next_state[IP_VS_TCP_S_LAST];
406};
407
408static const char * tcp_state_name(int state)
409{
410 if (state >= IP_VS_TCP_S_LAST)
411 return "ERR!";
412 return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
413}
414
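/*
 * How to read the tables below: each block (INPUT, OUTPUT, INPUT-ONLY,
 * selected via tcp_state_off[]) has one row per TCP flag event (syn,
 * fin, ack, rst); the column is the current connection state and the
 * cell is the next state, as looked up in set_tcp_state().
 */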
415static struct tcp_states_t tcp_states [] = {
416/* INPUT */
417/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
418/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
419/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
420/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
421/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
422
423/* OUTPUT */
424/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
425/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
426/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
427/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
428/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
429
430/* INPUT-ONLY */
431/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
432/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
433/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
434/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
435/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
436};
437
438static struct tcp_states_t tcp_states_dos [] = {
439/* INPUT */
440/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
441/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
442/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
443/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
444/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
445
446/* OUTPUT */
447/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
448/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
449/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
450/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
451/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
452
453/* INPUT-ONLY */
454/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
455/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
456/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
457/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
458/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
459};
460
461static struct tcp_states_t *tcp_state_table = tcp_states;
462
463
464static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
465{
466 int on = (flags & 1); /* secure_tcp */
467
468 /*
469 ** FIXME: change secure_tcp to independent sysctl var
470 ** or make it per-service or per-app because it is valid
471 ** for most if not for all of the applications. Something
472 ** like "capabilities" (flags) for each object.
473 */
474 tcp_state_table = (on? tcp_states_dos : tcp_states);
475}
476
477static int
478tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
479{
480 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
481 tcp_state_name_table, sname, to);
482}
483
484static inline int tcp_state_idx(struct tcphdr *th)
485{
486 if (th->rst)
487 return 3;
488 if (th->syn)
489 return 0;
490 if (th->fin)
491 return 1;
492 if (th->ack)
493 return 2;
494 return -1;
495}
496
497static inline void
498set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
499 int direction, struct tcphdr *th)
500{
501 int state_idx;
502 int new_state = IP_VS_TCP_S_CLOSE;
503 int state_off = tcp_state_off[direction];
504
505 /*
506 * Update state offset to INPUT_ONLY if necessary
507 * or delete NO_OUTPUT flag if output packet detected
508 */
509 if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
510 if (state_off == TCP_DIR_OUTPUT)
511 cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
512 else
513 state_off = TCP_DIR_INPUT_ONLY;
514 }
515
516 if ((state_idx = tcp_state_idx(th)) < 0) {
517 IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
518 goto tcp_state_out;
519 }
520
521 new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
522
523 tcp_state_out:
524 if (new_state != cp->state) {
525 struct ip_vs_dest *dest = cp->dest;
526
527 IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
528 "%s:%d state: %s->%s conn->refcnt:%d\n",
529 pp->name,
530 ((state_off == TCP_DIR_OUTPUT) ?
531 "output " : "input "),
532 th->syn ? 'S' : '.',
533 th->fin ? 'F' : '.',
534 th->ack ? 'A' : '.',
535 th->rst ? 'R' : '.',
536 IP_VS_DBG_ADDR(cp->af, &cp->daddr),
537 ntohs(cp->dport),
538 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
539 ntohs(cp->cport),
540 tcp_state_name(cp->state),
541 tcp_state_name(new_state),
542 atomic_read(&cp->refcnt));
543
544 if (dest) {
545 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
546 (new_state != IP_VS_TCP_S_ESTABLISHED)) {
547 atomic_dec(&dest->activeconns);
548 atomic_inc(&dest->inactconns);
549 cp->flags |= IP_VS_CONN_F_INACTIVE;
550 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
551 (new_state == IP_VS_TCP_S_ESTABLISHED)) {
552 atomic_inc(&dest->activeconns);
553 atomic_dec(&dest->inactconns);
554 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
555 }
556 }
557 }
558
559 cp->timeout = pp->timeout_table[cp->state = new_state];
560}
561
562
563/*
564 * Handle state transitions
565 */
566static int
567tcp_state_transition(struct ip_vs_conn *cp, int direction,
568 const struct sk_buff *skb,
569 struct ip_vs_protocol *pp)
570{
571 struct tcphdr _tcph, *th;
572
573#ifdef CONFIG_IP_VS_IPV6
574 int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
575#else
576 int ihl = ip_hdrlen(skb);
577#endif
578
579 th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
580 if (th == NULL)
581 return 0;
582
583 spin_lock(&cp->lock);
584 set_tcp_state(pp, cp, direction, th);
585 spin_unlock(&cp->lock);
586
587 return 1;
588}
589
590
591/*
592 * Hash table for TCP application incarnations
593 */
594#define TCP_APP_TAB_BITS 4
595#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
596#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
597
598static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
599static DEFINE_SPINLOCK(tcp_app_lock);
600
601static inline __u16 tcp_app_hashkey(__be16 port)
602{
603 return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
604 & TCP_APP_TAB_MASK;
605}
606
607
608static int tcp_register_app(struct ip_vs_app *inc)
609{
610 struct ip_vs_app *i;
611 __u16 hash;
612 __be16 port = inc->port;
613 int ret = 0;
614
615 hash = tcp_app_hashkey(port);
616
617 spin_lock_bh(&tcp_app_lock);
618 list_for_each_entry(i, &tcp_apps[hash], p_list) {
619 if (i->port == port) {
620 ret = -EEXIST;
621 goto out;
622 }
623 }
624 list_add(&inc->p_list, &tcp_apps[hash]);
625 atomic_inc(&ip_vs_protocol_tcp.appcnt);
626
627 out:
628 spin_unlock_bh(&tcp_app_lock);
629 return ret;
630}
631
632
633static void
634tcp_unregister_app(struct ip_vs_app *inc)
635{
636 spin_lock_bh(&tcp_app_lock);
637 atomic_dec(&ip_vs_protocol_tcp.appcnt);
638 list_del(&inc->p_list);
639 spin_unlock_bh(&tcp_app_lock);
640}
641
642
643static int
644tcp_app_conn_bind(struct ip_vs_conn *cp)
645{
646 int hash;
647 struct ip_vs_app *inc;
648 int result = 0;
649
650 /* Default binding: bind app only for NAT */
651 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
652 return 0;
653
654 /* Lookup application incarnations and bind the right one */
655 hash = tcp_app_hashkey(cp->vport);
656
657 spin_lock(&tcp_app_lock);
658 list_for_each_entry(inc, &tcp_apps[hash], p_list) {
659 if (inc->port == cp->vport) {
660 if (unlikely(!ip_vs_app_inc_get(inc)))
661 break;
662 spin_unlock(&tcp_app_lock);
663
664 IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
665 "%s:%u to app %s on port %u\n",
666 __func__,
667 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
668 ntohs(cp->cport),
669 IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
670 ntohs(cp->vport),
671 inc->name, ntohs(inc->port));
672
673 cp->app = inc;
674 if (inc->init_conn)
675 result = inc->init_conn(inc, cp);
676 goto out;
677 }
678 }
679 spin_unlock(&tcp_app_lock);
680
681 out:
682 return result;
683}
684
685
686/*
687 * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
688 */
689void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
690{
691 spin_lock(&cp->lock);
692 cp->state = IP_VS_TCP_S_LISTEN;
693 cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
694 spin_unlock(&cp->lock);
695}
696
697
698static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
699{
700 IP_VS_INIT_HASH_TABLE(tcp_apps);
701 pp->timeout_table = tcp_timeouts;
702}
703
704
705static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
706{
707}
708
709
710struct ip_vs_protocol ip_vs_protocol_tcp = {
711 .name = "TCP",
712 .protocol = IPPROTO_TCP,
713 .num_states = IP_VS_TCP_S_LAST,
714 .dont_defrag = 0,
715 .appcnt = ATOMIC_INIT(0),
716 .init = ip_vs_tcp_init,
717 .exit = ip_vs_tcp_exit,
718 .register_app = tcp_register_app,
719 .unregister_app = tcp_unregister_app,
720 .conn_schedule = tcp_conn_schedule,
721 .conn_in_get = tcp_conn_in_get,
722 .conn_out_get = tcp_conn_out_get,
723 .snat_handler = tcp_snat_handler,
724 .dnat_handler = tcp_dnat_handler,
725 .csum_check = tcp_csum_check,
726 .state_name = tcp_state_name,
727 .state_transition = tcp_state_transition,
728 .app_conn_bind = tcp_app_conn_bind,
729 .debug_packet = ip_vs_tcpudp_debug_packet,
730 .timeout_change = tcp_timeout_change,
731 .set_state_timeout = tcp_set_state_timeout,
732};
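
The TCP handler above drives connection tracking purely from a lookup table: tcp_state_idx() turns the packet's flags into an event index, and tcp_state_table[state_off + state_idx].next_state[cp->state] yields the next state. The following is a minimal userspace sketch of that table-driven idea; the states, the toy two-by-four table and all names here are illustrative, not the kernel's.

/* Minimal userspace sketch of table-driven TCP state tracking.
 * The real tables live in ip_vs_proto_tcp.c; everything here is a toy. */
#include <stdio.h>

enum { EV_SYN, EV_FIN, EV_ACK, EV_RST, EV_LAST };    /* packet "event" */
enum { S_NONE, S_ESTABLISHED, S_CLOSE, S_LAST };     /* connection state */

/* next_state[event][current_state] */
static const int next_state[EV_LAST][S_LAST] = {
	/*            NONE           ESTABLISHED    CLOSE */
	[EV_SYN] = { S_ESTABLISHED, S_ESTABLISHED, S_ESTABLISHED },
	[EV_FIN] = { S_CLOSE,       S_CLOSE,       S_CLOSE },
	[EV_ACK] = { S_NONE,        S_ESTABLISHED, S_CLOSE },
	[EV_RST] = { S_CLOSE,       S_CLOSE,       S_CLOSE },
};

/* Mirrors tcp_state_idx(): priority order RST > SYN > FIN > ACK. */
static int event_from_flags(int syn, int fin, int ack, int rst)
{
	if (rst) return EV_RST;
	if (syn) return EV_SYN;
	if (fin) return EV_FIN;
	if (ack) return EV_ACK;
	return -1;
}

int main(void)
{
	int state = S_NONE;

	state = next_state[event_from_flags(1, 0, 0, 0)][state]; /* SYN */
	state = next_state[event_from_flags(0, 0, 1, 0)][state]; /* ACK */
	printf("after SYN,ACK: state=%d (ESTABLISHED=%d)\n", state, S_ESTABLISHED);

	state = next_state[event_from_flags(0, 1, 0, 0)][state]; /* FIN */
	printf("after FIN: state=%d (CLOSE=%d)\n", state, S_CLOSE);
	return 0;
}
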
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
deleted file mode 100644
index 6eb6039d6343..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ /dev/null
@@ -1,533 +0,0 @@
1/*
2 * ip_vs_proto_udp.c: UDP load balancing support for IPVS
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Julian Anastasov <ja@ssi.bg>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Changes:
13 *
14 */
15
16#include <linux/in.h>
17#include <linux/ip.h>
18#include <linux/kernel.h>
19#include <linux/netfilter.h>
20#include <linux/netfilter_ipv4.h>
21#include <linux/udp.h>
22
23#include <net/ip_vs.h>
24#include <net/ip.h>
25#include <net/ip6_checksum.h>
26
27static struct ip_vs_conn *
28udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
29 const struct ip_vs_iphdr *iph, unsigned int proto_off,
30 int inverse)
31{
32 struct ip_vs_conn *cp;
33 __be16 _ports[2], *pptr;
34
35 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
36 if (pptr == NULL)
37 return NULL;
38
39 if (likely(!inverse)) {
40 cp = ip_vs_conn_in_get(af, iph->protocol,
41 &iph->saddr, pptr[0],
42 &iph->daddr, pptr[1]);
43 } else {
44 cp = ip_vs_conn_in_get(af, iph->protocol,
45 &iph->daddr, pptr[1],
46 &iph->saddr, pptr[0]);
47 }
48
49 return cp;
50}
51
52
53static struct ip_vs_conn *
54udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
55 const struct ip_vs_iphdr *iph, unsigned int proto_off,
56 int inverse)
57{
58 struct ip_vs_conn *cp;
59 __be16 _ports[2], *pptr;
60
61 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
62 if (pptr == NULL)
63 return NULL;
64
65 if (likely(!inverse)) {
66 cp = ip_vs_conn_out_get(af, iph->protocol,
67 &iph->saddr, pptr[0],
68 &iph->daddr, pptr[1]);
69 } else {
70 cp = ip_vs_conn_out_get(af, iph->protocol,
71 &iph->daddr, pptr[1],
72 &iph->saddr, pptr[0]);
73 }
74
75 return cp;
76}
77
78
79static int
80udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
81 int *verdict, struct ip_vs_conn **cpp)
82{
83 struct ip_vs_service *svc;
84 struct udphdr _udph, *uh;
85 struct ip_vs_iphdr iph;
86
87 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
88
89 uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
90 if (uh == NULL) {
91 *verdict = NF_DROP;
92 return 0;
93 }
94
95 svc = ip_vs_service_get(af, skb->mark, iph.protocol,
96 &iph.daddr, uh->dest);
97 if (svc) {
98 if (ip_vs_todrop()) {
99 /*
100 * It seems that we are very loaded.
101 * We have to drop this packet :(
102 */
103 ip_vs_service_put(svc);
104 *verdict = NF_DROP;
105 return 0;
106 }
107
108 /*
109 * Let the virtual server select a real server for the
110 * incoming connection, and create a connection entry.
111 */
112 *cpp = ip_vs_schedule(svc, skb);
113 if (!*cpp) {
114 *verdict = ip_vs_leave(svc, skb, pp);
115 return 0;
116 }
117 ip_vs_service_put(svc);
118 }
119 return 1;
120}
121
122
123static inline void
124udp_fast_csum_update(int af, struct udphdr *uhdr,
125 const union nf_inet_addr *oldip,
126 const union nf_inet_addr *newip,
127 __be16 oldport, __be16 newport)
128{
129#ifdef CONFIG_IP_VS_IPV6
130 if (af == AF_INET6)
131 uhdr->check =
132 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
133 ip_vs_check_diff2(oldport, newport,
134 ~csum_unfold(uhdr->check))));
135 else
136#endif
137 uhdr->check =
138 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
139 ip_vs_check_diff2(oldport, newport,
140 ~csum_unfold(uhdr->check))));
141 if (!uhdr->check)
142 uhdr->check = CSUM_MANGLED_0;
143}
144
145static inline void
146udp_partial_csum_update(int af, struct udphdr *uhdr,
147 const union nf_inet_addr *oldip,
148 const union nf_inet_addr *newip,
149 __be16 oldlen, __be16 newlen)
150{
151#ifdef CONFIG_IP_VS_IPV6
152 if (af == AF_INET6)
153 uhdr->check =
154 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
155 ip_vs_check_diff2(oldlen, newlen,
156 ~csum_unfold(uhdr->check))));
157 else
158#endif
159 uhdr->check =
160 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
161 ip_vs_check_diff2(oldlen, newlen,
162 ~csum_unfold(uhdr->check))));
163}
164
165
166static int
167udp_snat_handler(struct sk_buff *skb,
168 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
169{
170 struct udphdr *udph;
171 unsigned int udphoff;
172 int oldlen;
173
174#ifdef CONFIG_IP_VS_IPV6
175 if (cp->af == AF_INET6)
176 udphoff = sizeof(struct ipv6hdr);
177 else
178#endif
179 udphoff = ip_hdrlen(skb);
180 oldlen = skb->len - udphoff;
181
182 /* csum_check requires unshared skb */
183 if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
184 return 0;
185
186 if (unlikely(cp->app != NULL)) {
187 /* Some checks before mangling */
188 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
189 return 0;
190
191 /*
192 * Call application helper if needed
193 */
194 if (!ip_vs_app_pkt_out(cp, skb))
195 return 0;
196 }
197
198 udph = (void *)skb_network_header(skb) + udphoff;
199 udph->source = cp->vport;
200
201 /*
202 * Adjust UDP checksums
203 */
204 if (skb->ip_summed == CHECKSUM_PARTIAL) {
205 udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
206 htonl(oldlen),
207 htonl(skb->len - udphoff));
208 } else if (!cp->app && (udph->check != 0)) {
209 /* Only port and addr are changed, do fast csum update */
210 udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
211 cp->dport, cp->vport);
212 if (skb->ip_summed == CHECKSUM_COMPLETE)
213 skb->ip_summed = CHECKSUM_NONE;
214 } else {
215 /* full checksum calculation */
216 udph->check = 0;
217 skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
218#ifdef CONFIG_IP_VS_IPV6
219 if (cp->af == AF_INET6)
220 udph->check = csum_ipv6_magic(&cp->vaddr.in6,
221 &cp->caddr.in6,
222 skb->len - udphoff,
223 cp->protocol, skb->csum);
224 else
225#endif
226 udph->check = csum_tcpudp_magic(cp->vaddr.ip,
227 cp->caddr.ip,
228 skb->len - udphoff,
229 cp->protocol,
230 skb->csum);
231 if (udph->check == 0)
232 udph->check = CSUM_MANGLED_0;
233 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
234 pp->name, udph->check,
235 (char*)&(udph->check) - (char*)udph);
236 }
237 return 1;
238}
239
240
241static int
242udp_dnat_handler(struct sk_buff *skb,
243 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
244{
245 struct udphdr *udph;
246 unsigned int udphoff;
247 int oldlen;
248
249#ifdef CONFIG_IP_VS_IPV6
250 if (cp->af == AF_INET6)
251 udphoff = sizeof(struct ipv6hdr);
252 else
253#endif
254 udphoff = ip_hdrlen(skb);
255 oldlen = skb->len - udphoff;
256
257 /* csum_check requires unshared skb */
258 if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
259 return 0;
260
261 if (unlikely(cp->app != NULL)) {
262 /* Some checks before mangling */
263 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
264 return 0;
265
266 /*
267 * Attempt ip_vs_app call.
268 * It will fix ip_vs_conn
269 */
270 if (!ip_vs_app_pkt_in(cp, skb))
271 return 0;
272 }
273
274 udph = (void *)skb_network_header(skb) + udphoff;
275 udph->dest = cp->dport;
276
277 /*
278 * Adjust UDP checksums
279 */
280 if (skb->ip_summed == CHECKSUM_PARTIAL) {
281 udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
282 htonl(oldlen),
283 htonl(skb->len - udphoff));
284 } else if (!cp->app && (udph->check != 0)) {
285 /* Only port and addr are changed, do fast csum update */
286 udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
287 cp->vport, cp->dport);
288 if (skb->ip_summed == CHECKSUM_COMPLETE)
289 skb->ip_summed = CHECKSUM_NONE;
290 } else {
291 /* full checksum calculation */
292 udph->check = 0;
293 skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
294#ifdef CONFIG_IP_VS_IPV6
295 if (cp->af == AF_INET6)
296 udph->check = csum_ipv6_magic(&cp->caddr.in6,
297 &cp->daddr.in6,
298 skb->len - udphoff,
299 cp->protocol, skb->csum);
300 else
301#endif
302 udph->check = csum_tcpudp_magic(cp->caddr.ip,
303 cp->daddr.ip,
304 skb->len - udphoff,
305 cp->protocol,
306 skb->csum);
307 if (udph->check == 0)
308 udph->check = CSUM_MANGLED_0;
309 skb->ip_summed = CHECKSUM_UNNECESSARY;
310 }
311 return 1;
312}
313
314
315static int
316udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
317{
318 struct udphdr _udph, *uh;
319 unsigned int udphoff;
320
321#ifdef CONFIG_IP_VS_IPV6
322 if (af == AF_INET6)
323 udphoff = sizeof(struct ipv6hdr);
324 else
325#endif
326 udphoff = ip_hdrlen(skb);
327
328 uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
329 if (uh == NULL)
330 return 0;
331
332 if (uh->check != 0) {
333 switch (skb->ip_summed) {
334 case CHECKSUM_NONE:
335 skb->csum = skb_checksum(skb, udphoff,
336 skb->len - udphoff, 0);
337 case CHECKSUM_COMPLETE:
338#ifdef CONFIG_IP_VS_IPV6
339 if (af == AF_INET6) {
340 if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
341 &ipv6_hdr(skb)->daddr,
342 skb->len - udphoff,
343 ipv6_hdr(skb)->nexthdr,
344 skb->csum)) {
345 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
346 "Failed checksum for");
347 return 0;
348 }
349 } else
350#endif
351 if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
352 ip_hdr(skb)->daddr,
353 skb->len - udphoff,
354 ip_hdr(skb)->protocol,
355 skb->csum)) {
356 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
357 "Failed checksum for");
358 return 0;
359 }
360 break;
361 default:
362 /* No need to checksum. */
363 break;
364 }
365 }
366 return 1;
367}
368
369
370/*
371 * Note: the caller guarantees that only one of register_app,
372 * unregister_app or app_conn_bind is called each time.
373 */
374
375#define UDP_APP_TAB_BITS 4
376#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
377#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
378
379static struct list_head udp_apps[UDP_APP_TAB_SIZE];
380static DEFINE_SPINLOCK(udp_app_lock);
381
382static inline __u16 udp_app_hashkey(__be16 port)
383{
384 return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port)
385 & UDP_APP_TAB_MASK;
386}
387
388
389static int udp_register_app(struct ip_vs_app *inc)
390{
391 struct ip_vs_app *i;
392 __u16 hash;
393 __be16 port = inc->port;
394 int ret = 0;
395
396 hash = udp_app_hashkey(port);
397
398
399 spin_lock_bh(&udp_app_lock);
400 list_for_each_entry(i, &udp_apps[hash], p_list) {
401 if (i->port == port) {
402 ret = -EEXIST;
403 goto out;
404 }
405 }
406 list_add(&inc->p_list, &udp_apps[hash]);
407 atomic_inc(&ip_vs_protocol_udp.appcnt);
408
409 out:
410 spin_unlock_bh(&udp_app_lock);
411 return ret;
412}
413
414
415static void
416udp_unregister_app(struct ip_vs_app *inc)
417{
418 spin_lock_bh(&udp_app_lock);
419 atomic_dec(&ip_vs_protocol_udp.appcnt);
420 list_del(&inc->p_list);
421 spin_unlock_bh(&udp_app_lock);
422}
423
424
425static int udp_app_conn_bind(struct ip_vs_conn *cp)
426{
427 int hash;
428 struct ip_vs_app *inc;
429 int result = 0;
430
431 /* Default binding: bind app only for NAT */
432 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
433 return 0;
434
435 /* Lookup application incarnations and bind the right one */
436 hash = udp_app_hashkey(cp->vport);
437
438 spin_lock(&udp_app_lock);
439 list_for_each_entry(inc, &udp_apps[hash], p_list) {
440 if (inc->port == cp->vport) {
441 if (unlikely(!ip_vs_app_inc_get(inc)))
442 break;
443 spin_unlock(&udp_app_lock);
444
445 IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
446 "%s:%u to app %s on port %u\n",
447 __func__,
448 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
449 ntohs(cp->cport),
450 IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
451 ntohs(cp->vport),
452 inc->name, ntohs(inc->port));
453
454 cp->app = inc;
455 if (inc->init_conn)
456 result = inc->init_conn(inc, cp);
457 goto out;
458 }
459 }
460 spin_unlock(&udp_app_lock);
461
462 out:
463 return result;
464}
465
466
467static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
468 [IP_VS_UDP_S_NORMAL] = 5*60*HZ,
469 [IP_VS_UDP_S_LAST] = 2*HZ,
470};
471
472static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
473 [IP_VS_UDP_S_NORMAL] = "UDP",
474 [IP_VS_UDP_S_LAST] = "BUG!",
475};
476
477
478static int
479udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
480{
481 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
482 udp_state_name_table, sname, to);
483}
484
485static const char * udp_state_name(int state)
486{
487 if (state >= IP_VS_UDP_S_LAST)
488 return "ERR!";
489 return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
490}
491
492static int
493udp_state_transition(struct ip_vs_conn *cp, int direction,
494 const struct sk_buff *skb,
495 struct ip_vs_protocol *pp)
496{
497 cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
498 return 1;
499}
500
501static void udp_init(struct ip_vs_protocol *pp)
502{
503 IP_VS_INIT_HASH_TABLE(udp_apps);
504 pp->timeout_table = udp_timeouts;
505}
506
507static void udp_exit(struct ip_vs_protocol *pp)
508{
509}
510
511
512struct ip_vs_protocol ip_vs_protocol_udp = {
513 .name = "UDP",
514 .protocol = IPPROTO_UDP,
515 .num_states = IP_VS_UDP_S_LAST,
516 .dont_defrag = 0,
517 .init = udp_init,
518 .exit = udp_exit,
519 .conn_schedule = udp_conn_schedule,
520 .conn_in_get = udp_conn_in_get,
521 .conn_out_get = udp_conn_out_get,
522 .snat_handler = udp_snat_handler,
523 .dnat_handler = udp_dnat_handler,
524 .csum_check = udp_csum_check,
525 .state_transition = udp_state_transition,
526 .state_name = udp_state_name,
527 .register_app = udp_register_app,
528 .unregister_app = udp_unregister_app,
529 .app_conn_bind = udp_app_conn_bind,
530 .debug_packet = ip_vs_tcpudp_debug_packet,
531 .timeout_change = NULL,
532 .set_state_timeout = udp_set_state_timeout,
533};
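
udp_fast_csum_update() above avoids recomputing the UDP checksum over the whole payload: when only an address or port word is rewritten, the stored checksum is adjusted incrementally (the RFC 1624 update HC' = ~(~HC + ~m + m')). Below is a small standalone sketch of that update over a toy four-word "header" rather than a real packet; it shows the incremental fix-up agreeing with a full recompute.

/* Userspace sketch of the RFC 1624 incremental checksum update that
 * udp_fast_csum_update() relies on.  The "packet" is a made-up array. */
#include <stdint.h>
#include <stdio.h>

/* Full ones'-complement checksum over 16-bit words. */
static uint16_t csum16(const uint16_t *data, int n)
{
	uint32_t sum = 0;
	for (int i = 0; i < n; i++)
		sum += data[i];
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

/* RFC 1624: HC' = ~(~HC + ~m + m') for one changed 16-bit field. */
static uint16_t csum16_update(uint16_t check, uint16_t oldval, uint16_t newval)
{
	uint32_t sum = (uint16_t)~check;
	sum += (uint16_t)~oldval;
	sum += newval;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint16_t pkt[4] = { 0x1234, 0x0050, 0xdead, 0xbeef }; /* toy "header" */
	uint16_t check = csum16(pkt, 4);

	/* rewrite the "port" word, as a NAT handler would */
	uint16_t oldport = pkt[1], newport = 0x1f90;
	pkt[1] = newport;

	printf("full recompute : 0x%04x\n", csum16(pkt, 4));
	printf("incremental    : 0x%04x\n", csum16_update(check, oldport, newport));
	return 0;
}
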
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c
deleted file mode 100644
index a22195f68ac4..000000000000
--- a/net/ipv4/ipvs/ip_vs_rr.c
+++ /dev/null
@@ -1,112 +0,0 @@
1/*
2 * IPVS: Round-Robin Scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Peter Kese <peter.kese@ijs.si>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Fixes/Changes:
13 * Wensong Zhang : changed the ip_vs_rr_schedule to return dest
14 * Julian Anastasov : fixed the NULL pointer access bug in debugging
15 * Wensong Zhang : changed some cosmetic things for debugging
16 * Wensong Zhang : changed for the d-linked destination list
17 * Wensong Zhang : added the ip_vs_rr_update_svc
18 * Wensong Zhang : added any dest with weight=0 is quiesced
19 *
20 */
21
22#include <linux/module.h>
23#include <linux/kernel.h>
24
25#include <net/ip_vs.h>
26
27
28static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
29{
30 svc->sched_data = &svc->destinations;
31 return 0;
32}
33
34
35static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
36{
37 svc->sched_data = &svc->destinations;
38 return 0;
39}
40
41
42/*
43 * Round-Robin Scheduling
44 */
45static struct ip_vs_dest *
46ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
47{
48 struct list_head *p, *q;
49 struct ip_vs_dest *dest;
50
51 IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
52
53 write_lock(&svc->sched_lock);
54 p = (struct list_head *)svc->sched_data;
55 p = p->next;
56 q = p;
57 do {
58 /* skip list head */
59 if (q == &svc->destinations) {
60 q = q->next;
61 continue;
62 }
63
64 dest = list_entry(q, struct ip_vs_dest, n_list);
65 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
66 atomic_read(&dest->weight) > 0)
67 /* HIT */
68 goto out;
69 q = q->next;
70 } while (q != p);
71 write_unlock(&svc->sched_lock);
72 return NULL;
73
74 out:
75 svc->sched_data = q;
76 write_unlock(&svc->sched_lock);
77 IP_VS_DBG_BUF(6, "RR: server %s:%u "
78 "activeconns %d refcnt %d weight %d\n",
79 IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
80 atomic_read(&dest->activeconns),
81 atomic_read(&dest->refcnt), atomic_read(&dest->weight));
82
83 return dest;
84}
85
86
87static struct ip_vs_scheduler ip_vs_rr_scheduler = {
88 .name = "rr", /* name */
89 .refcnt = ATOMIC_INIT(0),
90 .module = THIS_MODULE,
91 .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
92#ifdef CONFIG_IP_VS_IPV6
93 .supports_ipv6 = 1,
94#endif
95 .init_service = ip_vs_rr_init_svc,
96 .update_service = ip_vs_rr_update_svc,
97 .schedule = ip_vs_rr_schedule,
98};
99
100static int __init ip_vs_rr_init(void)
101{
102 return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
103}
104
105static void __exit ip_vs_rr_cleanup(void)
106{
107 unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
108}
109
110module_init(ip_vs_rr_init);
111module_exit(ip_vs_rr_cleanup);
112MODULE_LICENSE("GPL");
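
The rr scheduler keeps a cursor (svc->sched_data) into the circular destination list and advances it by one entry per new connection, skipping servers that are overloaded or quiesced (weight 0). A userspace sketch of the same walk follows, with a plain array standing in for the kernel's linked list; the server names are made up.

/* Userspace sketch of the round-robin walk in ip_vs_rr_schedule(). */
#include <stdio.h>

struct dest { const char *name; int weight; };

static struct dest dests[] = {
	{ "10.0.0.1", 1 }, { "10.0.0.2", 0 }, { "10.0.0.3", 2 },
};
#define NDESTS (sizeof(dests) / sizeof(dests[0]))

static size_t cursor;	/* plays the role of svc->sched_data */

static struct dest *rr_schedule(void)
{
	/* try each entry at most once, starting after the saved cursor */
	for (size_t tried = 0; tried < NDESTS; tried++) {
		cursor = (cursor + 1) % NDESTS;
		if (dests[cursor].weight > 0)
			return &dests[cursor];
	}
	return NULL;	/* every server is quiesced */
}

int main(void)
{
	for (int i = 0; i < 5; i++) {
		struct dest *d = rr_schedule();
		printf("conn %d -> %s\n", i, d ? d->name : "(none)");
	}
	return 0;
}
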
diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c
deleted file mode 100644
index a46ad9e35016..000000000000
--- a/net/ipv4/ipvs/ip_vs_sched.c
+++ /dev/null
@@ -1,251 +0,0 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version.
15 *
16 * Changes:
17 *
18 */
19
20#include <linux/module.h>
21#include <linux/spinlock.h>
22#include <linux/interrupt.h>
23#include <asm/string.h>
24#include <linux/kmod.h>
25#include <linux/sysctl.h>
26
27#include <net/ip_vs.h>
28
29/*
30 * IPVS scheduler list
31 */
32static LIST_HEAD(ip_vs_schedulers);
33
34/* lock for service table */
35static DEFINE_RWLOCK(__ip_vs_sched_lock);
36
37
38/*
39 * Bind a service with a scheduler
40 */
41int ip_vs_bind_scheduler(struct ip_vs_service *svc,
42 struct ip_vs_scheduler *scheduler)
43{
44 int ret;
45
46 if (svc == NULL) {
47 IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
48 return -EINVAL;
49 }
50 if (scheduler == NULL) {
51 IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
52 return -EINVAL;
53 }
54
55 svc->scheduler = scheduler;
56
57 if (scheduler->init_service) {
58 ret = scheduler->init_service(svc);
59 if (ret) {
60 IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
61 return ret;
62 }
63 }
64
65 return 0;
66}
67
68
69/*
70 * Unbind a service with its scheduler
71 */
72int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
73{
74 struct ip_vs_scheduler *sched;
75
76 if (svc == NULL) {
77 IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
78 return -EINVAL;
79 }
80
81 sched = svc->scheduler;
82 if (sched == NULL) {
83 IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
84 return -EINVAL;
85 }
86
87 if (sched->done_service) {
88 if (sched->done_service(svc) != 0) {
89 IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
90 return -EINVAL;
91 }
92 }
93
94 svc->scheduler = NULL;
95 return 0;
96}
97
98
99/*
100 * Get scheduler in the scheduler list by name
101 */
102static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
103{
104 struct ip_vs_scheduler *sched;
105
106 IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
107 sched_name);
108
109 read_lock_bh(&__ip_vs_sched_lock);
110
111 list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
112 /*
113 * Test and get the modules atomically
114 */
115 if (sched->module && !try_module_get(sched->module)) {
116 /*
117 * This scheduler is just deleted
118 */
119 continue;
120 }
121 if (strcmp(sched_name, sched->name)==0) {
122 /* HIT */
123 read_unlock_bh(&__ip_vs_sched_lock);
124 return sched;
125 }
126 if (sched->module)
127 module_put(sched->module);
128 }
129
130 read_unlock_bh(&__ip_vs_sched_lock);
131 return NULL;
132}
133
134
135/*
136 * Lookup scheduler and try to load it if it doesn't exist
137 */
138struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
139{
140 struct ip_vs_scheduler *sched;
141
142 /*
143 * Search for the scheduler by sched_name
144 */
145 sched = ip_vs_sched_getbyname(sched_name);
146
147 /*
148 * If scheduler not found, load the module and search again
149 */
150 if (sched == NULL) {
151 request_module("ip_vs_%s", sched_name);
152 sched = ip_vs_sched_getbyname(sched_name);
153 }
154
155 return sched;
156}
157
158void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
159{
160 if (scheduler->module)
161 module_put(scheduler->module);
162}
163
164
165/*
166 * Register a scheduler in the scheduler list
167 */
168int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
169{
170 struct ip_vs_scheduler *sched;
171
172 if (!scheduler) {
173 IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
174 return -EINVAL;
175 }
176
177 if (!scheduler->name) {
178 IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
179 return -EINVAL;
180 }
181
182 /* increase the module use count */
183 ip_vs_use_count_inc();
184
185 write_lock_bh(&__ip_vs_sched_lock);
186
187 if (!list_empty(&scheduler->n_list)) {
188 write_unlock_bh(&__ip_vs_sched_lock);
189 ip_vs_use_count_dec();
190 IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
191 "already linked\n", scheduler->name);
192 return -EINVAL;
193 }
194
195 /*
196 * Make sure that the scheduler with this name doesn't exist
197 * in the scheduler list.
198 */
199 list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
200 if (strcmp(scheduler->name, sched->name) == 0) {
201 write_unlock_bh(&__ip_vs_sched_lock);
202 ip_vs_use_count_dec();
203 IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
204 "already existed in the system\n",
205 scheduler->name);
206 return -EINVAL;
207 }
208 }
209 /*
210 * Add it into the d-linked scheduler list
211 */
212 list_add(&scheduler->n_list, &ip_vs_schedulers);
213 write_unlock_bh(&__ip_vs_sched_lock);
214
215 IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name);
216
217 return 0;
218}
219
220
221/*
222 * Unregister a scheduler from the scheduler list
223 */
224int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
225{
226 if (!scheduler) {
227 IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n");
228 return -EINVAL;
229 }
230
231 write_lock_bh(&__ip_vs_sched_lock);
232 if (list_empty(&scheduler->n_list)) {
233 write_unlock_bh(&__ip_vs_sched_lock);
234 IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler "
235 "is not in the list. failed\n", scheduler->name);
236 return -EINVAL;
237 }
238
239 /*
240 * Remove it from the d-linked scheduler list
241 */
242 list_del(&scheduler->n_list);
243 write_unlock_bh(&__ip_vs_sched_lock);
244
245 /* decrease the module use count */
246 ip_vs_use_count_dec();
247
248 IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name);
249
250 return 0;
251}
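
ip_vs_sched.c is essentially a name-keyed registry: register_ip_vs_scheduler() refuses duplicate names, and ip_vs_scheduler_get() looks a scheduler up by name, falling back to request_module("ip_vs_<name>") when it is not loaded. A stripped-down sketch of that registry pattern follows, with the locking, module reference counting and module loading left out; all names are illustrative.

/* Userspace sketch of the name-keyed scheduler registry. */
#include <stdio.h>
#include <string.h>

struct scheduler {
	const char *name;
	struct scheduler *next;
};

static struct scheduler *schedulers;	/* head of the registry */

static int register_scheduler(struct scheduler *s)
{
	for (struct scheduler *p = schedulers; p; p = p->next)
		if (strcmp(p->name, s->name) == 0)
			return -1;		/* already registered */
	s->next = schedulers;
	schedulers = s;
	return 0;
}

static struct scheduler *scheduler_get(const char *name)
{
	for (struct scheduler *p = schedulers; p; p = p->next)
		if (strcmp(p->name, name) == 0)
			return p;
	return NULL;	/* the kernel would try request_module() here */
}

int main(void)
{
	static struct scheduler rr  = { .name = "rr"  };
	static struct scheduler wlc = { .name = "wlc" };

	register_scheduler(&rr);
	register_scheduler(&wlc);
	printf("lookup \"wlc\": %s\n", scheduler_get("wlc") ? "found" : "missing");
	printf("re-register \"rr\": %d\n", register_scheduler(&rr));
	return 0;
}
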
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c
deleted file mode 100644
index 7d2f22f04b83..000000000000
--- a/net/ipv4/ipvs/ip_vs_sed.c
+++ /dev/null
@@ -1,140 +0,0 @@
1/*
2 * IPVS: Shortest Expected Delay scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 *
13 */
14
15/*
16 * The SED algorithm attempts to minimize each job's expected delay until
17 * completion. The expected delay that the job will experience is
18 * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
19 * jobs on the ith server and Ui is the fixed service rate (weight) of
20 * the ith server. The SED algorithm adopts a greedy policy in which each
21 * job does what is in its own best interest, i.e. joins the queue that
22 * would minimize its expected delay of completion.
23 *
24 * See the following paper for more information:
25 * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
26 * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
27 * pages 986-994, 1988.
28 *
29 * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
30 *
31 * The difference between SED and WLC is that SED includes the incoming
32 * job in the cost function (the increment of 1). SED may outperform
33 * WLC, while scheduling big jobs under larger heterogeneous systems
34 * (the server weight varies a lot).
35 *
36 */
37
38#include <linux/module.h>
39#include <linux/kernel.h>
40
41#include <net/ip_vs.h>
42
43
44static inline unsigned int
45ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
46{
47 /*
48 * We only use the active connection number in the cost
49 * calculation here.
50 */
51 return atomic_read(&dest->activeconns) + 1;
52}
53
54
55/*
56 * Shortest Expected Delay scheduling
57 */
58static struct ip_vs_dest *
59ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
60{
61 struct ip_vs_dest *dest, *least;
62 unsigned int loh, doh;
63
64 IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n");
65
66 /*
67 * We calculate the load of each dest server as follows:
68 * (server expected overhead) / dest->weight
69 *
70 * Remember -- no floats in kernel mode!!!
71 * The comparison of h1*w2 > h2*w1 is equivalent to that of
72 * h1/w1 > h2/w2
73 * if every weight is larger than zero.
74 *
75 * The server with weight=0 is quiesced and will not receive any
76 * new connections.
77 */
78
79 list_for_each_entry(dest, &svc->destinations, n_list) {
80 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
81 atomic_read(&dest->weight) > 0) {
82 least = dest;
83 loh = ip_vs_sed_dest_overhead(least);
84 goto nextstage;
85 }
86 }
87 return NULL;
88
89 /*
90 * Find the destination with the least load.
91 */
92 nextstage:
93 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
94 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
95 continue;
96 doh = ip_vs_sed_dest_overhead(dest);
97 if (loh * atomic_read(&dest->weight) >
98 doh * atomic_read(&least->weight)) {
99 least = dest;
100 loh = doh;
101 }
102 }
103
104 IP_VS_DBG_BUF(6, "SED: server %s:%u "
105 "activeconns %d refcnt %d weight %d overhead %d\n",
106 IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
107 atomic_read(&least->activeconns),
108 atomic_read(&least->refcnt),
109 atomic_read(&least->weight), loh);
110
111 return least;
112}
113
114
115static struct ip_vs_scheduler ip_vs_sed_scheduler =
116{
117 .name = "sed",
118 .refcnt = ATOMIC_INIT(0),
119 .module = THIS_MODULE,
120 .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
121#ifdef CONFIG_IP_VS_IPV6
122 .supports_ipv6 = 1,
123#endif
124 .schedule = ip_vs_sed_schedule,
125};
126
127
128static int __init ip_vs_sed_init(void)
129{
130 return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
131}
132
133static void __exit ip_vs_sed_cleanup(void)
134{
135 unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
136}
137
138module_init(ip_vs_sed_init);
139module_exit(ip_vs_sed_cleanup);
140MODULE_LICENSE("GPL");
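
The SED cost of a server is (activeconns + 1) / weight, and since floating point is unavailable in kernel mode the comparison h1/w1 > h2/w2 is carried out as h1*w2 > h2*w1. A small sketch of a pick under those rules follows; the server data is illustrative.

/* Userspace sketch of the SED pick: lowest (activeconns + 1) / weight,
 * compared without division. */
#include <stdio.h>

struct dest { const char *name; unsigned activeconns; unsigned weight; };

static struct dest *sed_schedule(struct dest *d, int n)
{
	struct dest *least = NULL;
	unsigned loh = 0;

	for (int i = 0; i < n; i++) {
		unsigned doh = d[i].activeconns + 1;	/* expected overhead */

		if (d[i].weight == 0)
			continue;			/* quiesced server */
		/* doh/w_i < loh/w_least  <=>  doh*w_least < loh*w_i */
		if (!least || doh * least->weight < loh * d[i].weight) {
			least = &d[i];
			loh = doh;
		}
	}
	return least;
}

int main(void)
{
	struct dest d[] = {
		{ "10.0.0.1", 10, 1 },	/* cost (10+1)/1 = 11.0 */
		{ "10.0.0.2", 40, 5 },	/* cost (40+1)/5 =  8.2 */
		{ "10.0.0.3", 30, 3 },	/* cost (30+1)/3 ~ 10.3 */
	};
	struct dest *pick = sed_schedule(d, 3);

	printf("SED picks %s\n", pick ? pick->name : "(none)");
	return 0;
}
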
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
deleted file mode 100644
index 1d96de27fefd..000000000000
--- a/net/ipv4/ipvs/ip_vs_sh.c
+++ /dev/null
@@ -1,258 +0,0 @@
1/*
2 * IPVS: Source Hashing scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@gnuchina.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 *
13 */
14
15/*
16 * The sh algorithm is to select server by the hash key of source IP
17 * address. The pseudo code is as follows:
18 *
19 * n <- servernode[src_ip];
20 * if (n is dead) OR
21 * (n is overloaded) or (n.weight <= 0) then
22 * return NULL;
23 *
24 * return n;
25 *
26 * Note that servernode is a 256-bucket hash table that maps the hash
27 * index derived from the packet source IP address to the current server
28 * array. If the sh scheduler is used in a cache cluster, it is good to
29 * combine it with the cache_bypass feature. When the statically assigned
30 * server is dead or overloaded, the load balancer can bypass the cache
31 * server and send requests to the original server directly.
32 *
33 */
34
35#include <linux/ip.h>
36#include <linux/module.h>
37#include <linux/kernel.h>
38#include <linux/skbuff.h>
39
40#include <net/ip_vs.h>
41
42
43/*
44 * IPVS SH bucket
45 */
46struct ip_vs_sh_bucket {
47 struct ip_vs_dest *dest; /* real server (cache) */
48};
49
50/*
51 * for IPVS SH entry hash table
52 */
53#ifndef CONFIG_IP_VS_SH_TAB_BITS
54#define CONFIG_IP_VS_SH_TAB_BITS 8
55#endif
56#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS
57#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
58#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
59
60
61/*
62 * Returns hash value for IPVS SH entry
63 */
64static inline unsigned ip_vs_sh_hashkey(__be32 addr)
65{
66 return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK;
67}
68
69
70/*
71 * Get ip_vs_dest associated with supplied parameters.
72 */
73static inline struct ip_vs_dest *
74ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr)
75{
76 return (tbl[ip_vs_sh_hashkey(addr)]).dest;
77}
78
79
80/*
81 * Assign all the hash buckets of the specified table with the service.
82 */
83static int
84ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
85{
86 int i;
87 struct ip_vs_sh_bucket *b;
88 struct list_head *p;
89 struct ip_vs_dest *dest;
90
91 b = tbl;
92 p = &svc->destinations;
93 for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
94 if (list_empty(p)) {
95 b->dest = NULL;
96 } else {
97 if (p == &svc->destinations)
98 p = p->next;
99
100 dest = list_entry(p, struct ip_vs_dest, n_list);
101 atomic_inc(&dest->refcnt);
102 b->dest = dest;
103
104 p = p->next;
105 }
106 b++;
107 }
108 return 0;
109}
110
111
112/*
113 * Flush all the hash buckets of the specified table.
114 */
115static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
116{
117 int i;
118 struct ip_vs_sh_bucket *b;
119
120 b = tbl;
121 for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
122 if (b->dest) {
123 atomic_dec(&b->dest->refcnt);
124 b->dest = NULL;
125 }
126 b++;
127 }
128}
129
130
131static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
132{
133 struct ip_vs_sh_bucket *tbl;
134
135 /* allocate the SH table for this service */
136 tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
137 GFP_ATOMIC);
138 if (tbl == NULL) {
139 IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
140 return -ENOMEM;
141 }
142 svc->sched_data = tbl;
143 IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
144 "current service\n",
145 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
146
147 /* assign the hash buckets with the updated service */
148 ip_vs_sh_assign(tbl, svc);
149
150 return 0;
151}
152
153
154static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
155{
156 struct ip_vs_sh_bucket *tbl = svc->sched_data;
157
158 /* got to clean up hash buckets here */
159 ip_vs_sh_flush(tbl);
160
161 /* release the table itself */
162 kfree(svc->sched_data);
163 IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
164 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
165
166 return 0;
167}
168
169
170static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
171{
172 struct ip_vs_sh_bucket *tbl = svc->sched_data;
173
174 /* got to clean up hash buckets here */
175 ip_vs_sh_flush(tbl);
176
177 /* assign the hash buckets with the updated service */
178 ip_vs_sh_assign(tbl, svc);
179
180 return 0;
181}
182
183
184/*
185 * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
186 * consider that the server is overloaded here.
187 */
188static inline int is_overloaded(struct ip_vs_dest *dest)
189{
190 return dest->flags & IP_VS_DEST_F_OVERLOAD;
191}
192
193
194/*
195 * Source Hashing scheduling
196 */
197static struct ip_vs_dest *
198ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
199{
200 struct ip_vs_dest *dest;
201 struct ip_vs_sh_bucket *tbl;
202 struct iphdr *iph = ip_hdr(skb);
203
204 IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
205
206 tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
207 dest = ip_vs_sh_get(tbl, iph->saddr);
208 if (!dest
209 || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
210 || atomic_read(&dest->weight) <= 0
211 || is_overloaded(dest)) {
212 return NULL;
213 }
214
215 IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
216 "--> server %u.%u.%u.%u:%d\n",
217 NIPQUAD(iph->saddr),
218 NIPQUAD(dest->addr.ip),
219 ntohs(dest->port));
220
221 return dest;
222}
223
224
225/*
226 * IPVS SH Scheduler structure
227 */
228static struct ip_vs_scheduler ip_vs_sh_scheduler =
229{
230 .name = "sh",
231 .refcnt = ATOMIC_INIT(0),
232 .module = THIS_MODULE,
233 .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
234#ifdef CONFIG_IP_VS_IPV6
235 .supports_ipv6 = 0,
236#endif
237 .init_service = ip_vs_sh_init_svc,
238 .done_service = ip_vs_sh_done_svc,
239 .update_service = ip_vs_sh_update_svc,
240 .schedule = ip_vs_sh_schedule,
241};
242
243
244static int __init ip_vs_sh_init(void)
245{
246 return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
247}
248
249
250static void __exit ip_vs_sh_cleanup(void)
251{
252 unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
253}
254
255
256module_init(ip_vs_sh_init);
257module_exit(ip_vs_sh_cleanup);
258MODULE_LICENSE("GPL");
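
ip_vs_sh maps the client address into the 256-entry bucket table with a multiplicative hash (ntohl(addr) * 2654435761, masked to the table size) and returns whatever destination that bucket holds, so a given source IP keeps hitting the same real server. A userspace sketch of the hash and lookup follows; the bucket assignment and server names are made up.

/* Userspace sketch of the source-hash bucket selection in ip_vs_sh.c. */
#include <stdint.h>
#include <stdio.h>
#include <sys/socket.h>
#include <arpa/inet.h>

#define SH_TAB_BITS 8
#define SH_TAB_SIZE (1 << SH_TAB_BITS)
#define SH_TAB_MASK (SH_TAB_SIZE - 1)

static const char *bucket[SH_TAB_SIZE];	/* stands in for ip_vs_sh_bucket.dest */

static unsigned sh_hashkey(uint32_t addr_net)	/* address in network order */
{
	return (ntohl(addr_net) * 2654435761UL) & SH_TAB_MASK;
}

int main(void)
{
	const char *servers[] = { "rs-a", "rs-b", "rs-c" };
	uint32_t client;
	unsigned h;

	/* fill the buckets round-robin over the real servers,
	 * roughly what ip_vs_sh_assign() does with the destination list */
	for (int i = 0; i < SH_TAB_SIZE; i++)
		bucket[i] = servers[i % 3];

	inet_pton(AF_INET, "192.0.2.77", &client);
	h = sh_hashkey(client);
	printf("192.0.2.77 -> bucket %u -> %s\n", h, bucket[h]);
	return 0;
}
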
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
deleted file mode 100644
index 28237a5f62e2..000000000000
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ /dev/null
@@ -1,940 +0,0 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 *
10 * ip_vs_sync: sync connection info from master load balancer to backups
11 * through multicast
12 *
13 * Changes:
14 * Alexandre Cassen : Added master & backup support at a time.
15 * Alexandre Cassen : Added SyncID support for incoming sync
16 * messages filtering.
17 * Justin Ossevoort : Fix endian problem on sync message size.
18 */
19
20#include <linux/module.h>
21#include <linux/slab.h>
22#include <linux/inetdevice.h>
23#include <linux/net.h>
24#include <linux/completion.h>
25#include <linux/delay.h>
26#include <linux/skbuff.h>
27#include <linux/in.h>
28#include <linux/igmp.h> /* for ip_mc_join_group */
29#include <linux/udp.h>
30#include <linux/err.h>
31#include <linux/kthread.h>
32#include <linux/wait.h>
33
34#include <net/ip.h>
35#include <net/sock.h>
36
37#include <net/ip_vs.h>
38
39#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
40#define IP_VS_SYNC_PORT 8848 /* multicast port */
41
42
43/*
44 * IPVS sync connection entry
45 */
46struct ip_vs_sync_conn {
47 __u8 reserved;
48
49 /* Protocol, addresses and port numbers */
50 __u8 protocol; /* Which protocol (TCP/UDP) */
51 __be16 cport;
52 __be16 vport;
53 __be16 dport;
54 __be32 caddr; /* client address */
55 __be32 vaddr; /* virtual address */
56 __be32 daddr; /* destination address */
57
58 /* Flags and state transition */
59 __be16 flags; /* status flags */
60 __be16 state; /* state info */
61
62 /* The sequence options start here */
63};
64
65struct ip_vs_sync_conn_options {
66 struct ip_vs_seq in_seq; /* incoming seq. struct */
67 struct ip_vs_seq out_seq; /* outgoing seq. struct */
68};
69
70struct ip_vs_sync_thread_data {
71 struct socket *sock;
72 char *buf;
73};
74
75#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
76#define FULL_CONN_SIZE \
77(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
78
79
80/*
81 The master multicasts messages to the backup load balancers in the
82 following format.
83
84 0 1 2 3
85 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
86 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87 | Count Conns | SyncID | Size |
88 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89 | |
90 | IPVS Sync Connection (1) |
91 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
92 | . |
93 | . |
94 | . |
95 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
96 | |
97 | IPVS Sync Connection (n) |
98 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99*/
100
101#define SYNC_MESG_HEADER_LEN 4
102
103struct ip_vs_sync_mesg {
104 __u8 nr_conns;
105 __u8 syncid;
106 __u16 size;
107
108 /* ip_vs_sync_conn entries start here */
109};
110
111/* the maximum length of sync (sending/receiving) message */
112static int sync_send_mesg_maxlen;
113static int sync_recv_mesg_maxlen;
114
115struct ip_vs_sync_buff {
116 struct list_head list;
117 unsigned long firstuse;
118
119 /* pointers for the message data */
120 struct ip_vs_sync_mesg *mesg;
121 unsigned char *head;
122 unsigned char *end;
123};
124
125
126/* the sync_buff list head and the lock */
127static LIST_HEAD(ip_vs_sync_queue);
128static DEFINE_SPINLOCK(ip_vs_sync_lock);
129
130/* current sync_buff for accepting new conn entries */
131static struct ip_vs_sync_buff *curr_sb = NULL;
132static DEFINE_SPINLOCK(curr_sb_lock);
133
134/* ipvs sync daemon state */
135volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
136volatile int ip_vs_master_syncid = 0;
137volatile int ip_vs_backup_syncid = 0;
138
139/* multicast interface name */
140char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
141char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
142
143/* sync daemon tasks */
144static struct task_struct *sync_master_thread;
145static struct task_struct *sync_backup_thread;
146
147/* multicast addr */
148static struct sockaddr_in mcast_addr = {
149 .sin_family = AF_INET,
150 .sin_port = __constant_htons(IP_VS_SYNC_PORT),
151 .sin_addr.s_addr = __constant_htonl(IP_VS_SYNC_GROUP),
152};
153
154
155static inline struct ip_vs_sync_buff *sb_dequeue(void)
156{
157 struct ip_vs_sync_buff *sb;
158
159 spin_lock_bh(&ip_vs_sync_lock);
160 if (list_empty(&ip_vs_sync_queue)) {
161 sb = NULL;
162 } else {
163 sb = list_entry(ip_vs_sync_queue.next,
164 struct ip_vs_sync_buff,
165 list);
166 list_del(&sb->list);
167 }
168 spin_unlock_bh(&ip_vs_sync_lock);
169
170 return sb;
171}
172
173static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
174{
175 struct ip_vs_sync_buff *sb;
176
177 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
178 return NULL;
179
180 if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
181 kfree(sb);
182 return NULL;
183 }
184 sb->mesg->nr_conns = 0;
185 sb->mesg->syncid = ip_vs_master_syncid;
186 sb->mesg->size = 4;
187 sb->head = (unsigned char *)sb->mesg + 4;
188 sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
189 sb->firstuse = jiffies;
190 return sb;
191}
192
193static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
194{
195 kfree(sb->mesg);
196 kfree(sb);
197}
198
199static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
200{
201 spin_lock(&ip_vs_sync_lock);
202 if (ip_vs_sync_state & IP_VS_STATE_MASTER)
203 list_add_tail(&sb->list, &ip_vs_sync_queue);
204 else
205 ip_vs_sync_buff_release(sb);
206 spin_unlock(&ip_vs_sync_lock);
207}
208
209/*
210 * Get the current sync buffer if it has been created for more
211 * than the specified time or the specified time is zero.
212 */
213static inline struct ip_vs_sync_buff *
214get_curr_sync_buff(unsigned long time)
215{
216 struct ip_vs_sync_buff *sb;
217
218 spin_lock_bh(&curr_sb_lock);
219 if (curr_sb && (time == 0 ||
220 time_before(jiffies - curr_sb->firstuse, time))) {
221 sb = curr_sb;
222 curr_sb = NULL;
223 } else
224 sb = NULL;
225 spin_unlock_bh(&curr_sb_lock);
226 return sb;
227}
228
229
230/*
231 * Add an ip_vs_conn information into the current sync_buff.
232 * Called by ip_vs_in.
233 */
234void ip_vs_sync_conn(struct ip_vs_conn *cp)
235{
236 struct ip_vs_sync_mesg *m;
237 struct ip_vs_sync_conn *s;
238 int len;
239
240 spin_lock(&curr_sb_lock);
241 if (!curr_sb) {
242 if (!(curr_sb=ip_vs_sync_buff_create())) {
243 spin_unlock(&curr_sb_lock);
244 IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
245 return;
246 }
247 }
248
249 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
250 SIMPLE_CONN_SIZE;
251 m = curr_sb->mesg;
252 s = (struct ip_vs_sync_conn *)curr_sb->head;
253
254 /* copy members */
255 s->protocol = cp->protocol;
256 s->cport = cp->cport;
257 s->vport = cp->vport;
258 s->dport = cp->dport;
259 s->caddr = cp->caddr.ip;
260 s->vaddr = cp->vaddr.ip;
261 s->daddr = cp->daddr.ip;
262 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
263 s->state = htons(cp->state);
264 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
265 struct ip_vs_sync_conn_options *opt =
266 (struct ip_vs_sync_conn_options *)&s[1];
267 memcpy(opt, &cp->in_seq, sizeof(*opt));
268 }
269
270 m->nr_conns++;
271 m->size += len;
272 curr_sb->head += len;
273
274 /* check if there is a space for next one */
275 if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
276 sb_queue_tail(curr_sb);
277 curr_sb = NULL;
278 }
279 spin_unlock(&curr_sb_lock);
280
281 /* synchronize its controller if it has one */
282 if (cp->control)
283 ip_vs_sync_conn(cp->control);
284}
285
286
287/*
288 * Process received multicast message and create the corresponding
289 * ip_vs_conn entries.
290 */
291static void ip_vs_process_message(const char *buffer, const size_t buflen)
292{
293 struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
294 struct ip_vs_sync_conn *s;
295 struct ip_vs_sync_conn_options *opt;
296 struct ip_vs_conn *cp;
297 struct ip_vs_protocol *pp;
298 struct ip_vs_dest *dest;
299 char *p;
300 int i;
301
302 if (buflen < sizeof(struct ip_vs_sync_mesg)) {
303 IP_VS_ERR_RL("sync message header too short\n");
304 return;
305 }
306
307 /* Convert size back to host byte order */
308 m->size = ntohs(m->size);
309
310 if (buflen != m->size) {
311 IP_VS_ERR_RL("bogus sync message size\n");
312 return;
313 }
314
315 /* SyncID sanity check */
316 if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
317 IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
318 m->syncid);
319 return;
320 }
321
322 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
323 for (i=0; i<m->nr_conns; i++) {
324 unsigned flags, state;
325
326 if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
327 IP_VS_ERR_RL("bogus conn in sync message\n");
328 return;
329 }
330 s = (struct ip_vs_sync_conn *) p;
331 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
332 flags &= ~IP_VS_CONN_F_HASHED;
333 if (flags & IP_VS_CONN_F_SEQ_MASK) {
334 opt = (struct ip_vs_sync_conn_options *)&s[1];
335 p += FULL_CONN_SIZE;
336 if (p > buffer+buflen) {
337 IP_VS_ERR_RL("bogus conn options in sync message\n");
338 return;
339 }
340 } else {
341 opt = NULL;
342 p += SIMPLE_CONN_SIZE;
343 }
344
345 state = ntohs(s->state);
346 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
347 pp = ip_vs_proto_get(s->protocol);
348 if (!pp) {
349 IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n",
350 s->protocol);
351 continue;
352 }
353 if (state >= pp->num_states) {
354 IP_VS_DBG(2, "Invalid %s state %u in sync msg\n",
355 pp->name, state);
356 continue;
357 }
358 } else {
359 /* protocol in templates is not used for state/timeout */
360 pp = NULL;
361 if (state > 0) {
362 IP_VS_DBG(2, "Invalid template state %u in sync msg\n",
363 state);
364 state = 0;
365 }
366 }
367
368 if (!(flags & IP_VS_CONN_F_TEMPLATE))
369 cp = ip_vs_conn_in_get(AF_INET, s->protocol,
370 (union nf_inet_addr *)&s->caddr,
371 s->cport,
372 (union nf_inet_addr *)&s->vaddr,
373 s->vport);
374 else
375 cp = ip_vs_ct_in_get(AF_INET, s->protocol,
376 (union nf_inet_addr *)&s->caddr,
377 s->cport,
378 (union nf_inet_addr *)&s->vaddr,
379 s->vport);
380 if (!cp) {
381 /*
382 * Find the appropriate destination for the connection.
383 * If it is not found the connection will remain unbound
384 * but still handled.
385 */
386 dest = ip_vs_find_dest(AF_INET,
387 (union nf_inet_addr *)&s->daddr,
388 s->dport,
389 (union nf_inet_addr *)&s->vaddr,
390 s->vport,
391 s->protocol);
392 /* Set the appropriate activity flag */
393 if (s->protocol == IPPROTO_TCP) {
394 if (state != IP_VS_TCP_S_ESTABLISHED)
395 flags |= IP_VS_CONN_F_INACTIVE;
396 else
397 flags &= ~IP_VS_CONN_F_INACTIVE;
398 }
399 cp = ip_vs_conn_new(AF_INET, s->protocol,
400 (union nf_inet_addr *)&s->caddr,
401 s->cport,
402 (union nf_inet_addr *)&s->vaddr,
403 s->vport,
404 (union nf_inet_addr *)&s->daddr,
405 s->dport,
406 flags, dest);
407 if (dest)
408 atomic_dec(&dest->refcnt);
409 if (!cp) {
410 IP_VS_ERR("ip_vs_conn_new failed\n");
411 return;
412 }
413 } else if (!cp->dest) {
414 dest = ip_vs_try_bind_dest(cp);
415 if (dest)
416 atomic_dec(&dest->refcnt);
417 } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
418 (cp->state != state)) {
419 /* update active/inactive flag for the connection */
420 dest = cp->dest;
421 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
422 (state != IP_VS_TCP_S_ESTABLISHED)) {
423 atomic_dec(&dest->activeconns);
424 atomic_inc(&dest->inactconns);
425 cp->flags |= IP_VS_CONN_F_INACTIVE;
426 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
427 (state == IP_VS_TCP_S_ESTABLISHED)) {
428 atomic_inc(&dest->activeconns);
429 atomic_dec(&dest->inactconns);
430 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
431 }
432 }
433
434 if (opt)
435 memcpy(&cp->in_seq, opt, sizeof(*opt));
436 atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
437 cp->state = state;
438 cp->old_state = cp->state;
439 /*
440 * We cannot recover the right timeout for templates
441 * in all cases because we cannot find the right fwmark
442 * virtual service. If needed, we can do it for
443 * non-fwmark persistent services.
444 */
445 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
446 cp->timeout = pp->timeout_table[state];
447 else
448 cp->timeout = (3*60*HZ);
449 ip_vs_conn_put(cp);
450 }
451}
452
453
454/*
454 * Set up loopback of outgoing multicasts on a sending socket
456 */
457static void set_mcast_loop(struct sock *sk, u_char loop)
458{
459 struct inet_sock *inet = inet_sk(sk);
460
461 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
462 lock_sock(sk);
463 inet->mc_loop = loop ? 1 : 0;
464 release_sock(sk);
465}
466
467/*
468 * Specify TTL for outgoing multicasts on a sending socket
469 */
470static void set_mcast_ttl(struct sock *sk, u_char ttl)
471{
472 struct inet_sock *inet = inet_sk(sk);
473
474 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
475 lock_sock(sk);
476 inet->mc_ttl = ttl;
477 release_sock(sk);
478}
479
480/*
481 * Specify the default interface for outgoing multicasts
482 */
483static int set_mcast_if(struct sock *sk, char *ifname)
484{
485 struct net_device *dev;
486 struct inet_sock *inet = inet_sk(sk);
487
488 if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
489 return -ENODEV;
490
491 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
492 return -EINVAL;
493
494 lock_sock(sk);
495 inet->mc_index = dev->ifindex;
496 /* inet->mc_addr = 0; */
497 release_sock(sk);
498
499 return 0;
500}
501
502
503/*
504 * Set the maximum length of sync message according to the
505 * specified interface's MTU.
506 */
507static int set_sync_mesg_maxlen(int sync_state)
508{
509 struct net_device *dev;
510 int num;
511
512 if (sync_state == IP_VS_STATE_MASTER) {
513 if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
514 return -ENODEV;
515
516 num = (dev->mtu - sizeof(struct iphdr) -
517 sizeof(struct udphdr) -
518 SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
519 sync_send_mesg_maxlen =
520 SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num;
521 IP_VS_DBG(7, "setting the maximum length of sync sending "
522 "message %d.\n", sync_send_mesg_maxlen);
523 } else if (sync_state == IP_VS_STATE_BACKUP) {
524 if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
525 return -ENODEV;
526
527 sync_recv_mesg_maxlen = dev->mtu -
528 sizeof(struct iphdr) - sizeof(struct udphdr);
529 IP_VS_DBG(7, "setting the maximum length of sync receiving "
530 "message %d.\n", sync_recv_mesg_maxlen);
531 }
532
533 return 0;
534}
535
536
537/*
538 * Join a multicast group.
539 * the group is specified by a class D multicast address (224.0.0.0/4)
540 * in the in_addr structure passed in as a parameter.
541 */
542static int
543join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
544{
545 struct ip_mreqn mreq;
546 struct net_device *dev;
547 int ret;
548
549 memset(&mreq, 0, sizeof(mreq));
550 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
551
552 if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
553 return -ENODEV;
554 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
555 return -EINVAL;
556
557 mreq.imr_ifindex = dev->ifindex;
558
559 lock_sock(sk);
560 ret = ip_mc_join_group(sk, &mreq);
561 release_sock(sk);
562
563 return ret;
564}
565
566
567static int bind_mcastif_addr(struct socket *sock, char *ifname)
568{
569 struct net_device *dev;
570 __be32 addr;
571 struct sockaddr_in sin;
572
573 if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
574 return -ENODEV;
575
576 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
577 if (!addr)
578 IP_VS_ERR("You probably need to specify IP address on "
579 "multicast interface.\n");
580
581 IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
582 ifname, NIPQUAD(addr));
583
584 /* Now bind the socket with the address of multicast interface */
585 sin.sin_family = AF_INET;
586 sin.sin_addr.s_addr = addr;
587 sin.sin_port = 0;
588
589 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
590}
591
592/*
593 * Set up sending multicast socket over UDP
594 */
595static struct socket * make_send_sock(void)
596{
597 struct socket *sock;
598 int result;
599
600 /* First create a socket */
601 result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
602 if (result < 0) {
603 IP_VS_ERR("Error during creation of socket; terminating\n");
604 return ERR_PTR(result);
605 }
606
607 result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
608 if (result < 0) {
609 IP_VS_ERR("Error setting outbound mcast interface\n");
610 goto error;
611 }
612
613 set_mcast_loop(sock->sk, 0);
614 set_mcast_ttl(sock->sk, 1);
615
616 result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
617 if (result < 0) {
618 IP_VS_ERR("Error binding address of the mcast interface\n");
619 goto error;
620 }
621
622 result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
623 sizeof(struct sockaddr), 0);
624 if (result < 0) {
625 IP_VS_ERR("Error connecting to the multicast addr\n");
626 goto error;
627 }
628
629 return sock;
630
631 error:
632 sock_release(sock);
633 return ERR_PTR(result);
634}
635
636
637/*
638 * Set up receiving multicast socket over UDP
639 */
640static struct socket * make_receive_sock(void)
641{
642 struct socket *sock;
643 int result;
644
645 /* First create a socket */
646 result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
647 if (result < 0) {
648 IP_VS_ERR("Error during creation of socket; terminating\n");
649 return ERR_PTR(result);
650 }
651
652 /* it is equivalent to the REUSEADDR option in user-space */
653 sock->sk->sk_reuse = 1;
654
655 result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
656 sizeof(struct sockaddr));
657 if (result < 0) {
658 IP_VS_ERR("Error binding to the multicast addr\n");
659 goto error;
660 }
661
662 /* join the multicast group */
663 result = join_mcast_group(sock->sk,
664 (struct in_addr *) &mcast_addr.sin_addr,
665 ip_vs_backup_mcast_ifn);
666 if (result < 0) {
667 IP_VS_ERR("Error joining to the multicast group\n");
668 goto error;
669 }
670
671 return sock;
672
673 error:
674 sock_release(sock);
675 return ERR_PTR(result);
676}
677
678
679static int
680ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
681{
682 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
683 struct kvec iov;
684 int len;
685
686 EnterFunction(7);
687 iov.iov_base = (void *)buffer;
688 iov.iov_len = length;
689
690 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
691
692 LeaveFunction(7);
693 return len;
694}
695
696static void
697ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
698{
699 int msize;
700
701 msize = msg->size;
702
703 /* Put size in network byte order */
704 msg->size = htons(msg->size);
705
706 if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
707 IP_VS_ERR("ip_vs_send_async error\n");
708}
709
710static int
711ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
712{
713 struct msghdr msg = {NULL,};
714 struct kvec iov;
715 int len;
716
717 EnterFunction(7);
718
719 /* Receive a packet */
720 iov.iov_base = buffer;
721 iov.iov_len = (size_t)buflen;
722
723 len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
724
725 if (len < 0)
726 return -1;
727
728 LeaveFunction(7);
729 return len;
730}
731
732
733static int sync_thread_master(void *data)
734{
735 struct ip_vs_sync_thread_data *tinfo = data;
736 struct ip_vs_sync_buff *sb;
737
738 IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
739 "syncid = %d\n",
740 ip_vs_master_mcast_ifn, ip_vs_master_syncid);
741
742 while (!kthread_should_stop()) {
743 while ((sb = sb_dequeue())) {
744 ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
745 ip_vs_sync_buff_release(sb);
746 }
747
748 /* check if entries stay in curr_sb for 2 seconds */
749 sb = get_curr_sync_buff(2 * HZ);
750 if (sb) {
751 ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
752 ip_vs_sync_buff_release(sb);
753 }
754
755 schedule_timeout_interruptible(HZ);
756 }
757
758 /* clean up the sync_buff queue */
759	while ((sb = sb_dequeue())) {
760 ip_vs_sync_buff_release(sb);
761 }
762
763 /* clean up the current sync_buff */
764 if ((sb = get_curr_sync_buff(0))) {
765 ip_vs_sync_buff_release(sb);
766 }
767
768 /* release the sending multicast socket */
769 sock_release(tinfo->sock);
770 kfree(tinfo);
771
772 return 0;
773}
774
775
776static int sync_thread_backup(void *data)
777{
778 struct ip_vs_sync_thread_data *tinfo = data;
779 int len;
780
781 IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
782 "syncid = %d\n",
783 ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
784
785 while (!kthread_should_stop()) {
786 wait_event_interruptible(*tinfo->sock->sk->sk_sleep,
787 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
788 || kthread_should_stop());
789
790 /* do we have data now? */
791 while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
792 len = ip_vs_receive(tinfo->sock, tinfo->buf,
793 sync_recv_mesg_maxlen);
794 if (len <= 0) {
795				IP_VS_ERR("error receiving message\n");
796 break;
797 }
798
799			/* disable bottom halves, because message processing
800			   accesses data shared with softirq while getting/creating conns */
801 local_bh_disable();
802 ip_vs_process_message(tinfo->buf, len);
803 local_bh_enable();
804 }
805 }
806
807 /* release the sending multicast socket */
808 sock_release(tinfo->sock);
809 kfree(tinfo->buf);
810 kfree(tinfo);
811
812 return 0;
813}
814
815
816int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
817{
818 struct ip_vs_sync_thread_data *tinfo;
819 struct task_struct **realtask, *task;
820 struct socket *sock;
821 char *name, *buf = NULL;
822 int (*threadfn)(void *data);
823 int result = -ENOMEM;
824
825 IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
826 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
827 sizeof(struct ip_vs_sync_conn));
828
829 if (state == IP_VS_STATE_MASTER) {
830 if (sync_master_thread)
831 return -EEXIST;
832
833 strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
834 sizeof(ip_vs_master_mcast_ifn));
835 ip_vs_master_syncid = syncid;
836 realtask = &sync_master_thread;
837 name = "ipvs_syncmaster";
838 threadfn = sync_thread_master;
839 sock = make_send_sock();
840 } else if (state == IP_VS_STATE_BACKUP) {
841 if (sync_backup_thread)
842 return -EEXIST;
843
844 strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
845 sizeof(ip_vs_backup_mcast_ifn));
846 ip_vs_backup_syncid = syncid;
847 realtask = &sync_backup_thread;
848 name = "ipvs_syncbackup";
849 threadfn = sync_thread_backup;
850 sock = make_receive_sock();
851 } else {
852 return -EINVAL;
853 }
854
855 if (IS_ERR(sock)) {
856 result = PTR_ERR(sock);
857 goto out;
858 }
859
860 set_sync_mesg_maxlen(state);
861 if (state == IP_VS_STATE_BACKUP) {
862 buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
863 if (!buf)
864 goto outsocket;
865 }
866
867 tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
868 if (!tinfo)
869 goto outbuf;
870
871 tinfo->sock = sock;
872 tinfo->buf = buf;
873
874 task = kthread_run(threadfn, tinfo, name);
875 if (IS_ERR(task)) {
876 result = PTR_ERR(task);
877 goto outtinfo;
878 }
879
880 /* mark as active */
881 *realtask = task;
882 ip_vs_sync_state |= state;
883
884 /* increase the module use count */
885 ip_vs_use_count_inc();
886
887 return 0;
888
889outtinfo:
890 kfree(tinfo);
891outbuf:
892 kfree(buf);
893outsocket:
894 sock_release(sock);
895out:
896 return result;
897}
898
899
900int stop_sync_thread(int state)
901{
902 IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
903
904 if (state == IP_VS_STATE_MASTER) {
905 if (!sync_master_thread)
906 return -ESRCH;
907
908 IP_VS_INFO("stopping master sync thread %d ...\n",
909 task_pid_nr(sync_master_thread));
910
911 /*
912 * The lock synchronizes with sb_queue_tail(), so that we don't
913		 * add sync buffers to the queue while we are already in the
914		 * process of stopping the master sync daemon.
915 */
916
917 spin_lock_bh(&ip_vs_sync_lock);
918 ip_vs_sync_state &= ~IP_VS_STATE_MASTER;
919 spin_unlock_bh(&ip_vs_sync_lock);
920 kthread_stop(sync_master_thread);
921 sync_master_thread = NULL;
922 } else if (state == IP_VS_STATE_BACKUP) {
923 if (!sync_backup_thread)
924 return -ESRCH;
925
926 IP_VS_INFO("stopping backup sync thread %d ...\n",
927 task_pid_nr(sync_backup_thread));
928
929 ip_vs_sync_state &= ~IP_VS_STATE_BACKUP;
930 kthread_stop(sync_backup_thread);
931 sync_backup_thread = NULL;
932 } else {
933 return -EINVAL;
934 }
935
936 /* decrease the module use count */
937 ip_vs_use_count_dec();
938
939 return 0;
940}
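
A minimal user-space sketch of the same sequence make_receive_sock() performs with in-kernel socket calls: enable address reuse, bind to the sync multicast group, then join it. The group and port (224.0.0.81:8848) are the usual IPVS sync defaults and are an assumption here, not something shown in this hunk; in practice the daemons are typically started with something like "ipvsadm --start-daemon backup --mcast-interface eth0 --syncid 1" rather than by hand.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr;
	struct ip_mreq mreq;
	int on = 1;
	int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* same effect as sk->sk_reuse = 1 in make_receive_sock() */
	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));

	/* bind to the (assumed) sync multicast address and port */
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(8848);
	addr.sin_addr.s_addr = inet_addr("224.0.0.81");
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("bind");
		return 1;
	}

	/* join the group, as join_mcast_group() does in-kernel */
	mreq.imr_multiaddr.s_addr = inet_addr("224.0.0.81");
	mreq.imr_interface.s_addr = htonl(INADDR_ANY);
	if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP,
		       &mreq, sizeof(mreq)) < 0) {
		perror("IP_ADD_MEMBERSHIP");
		return 1;
	}

	close(fd);
	return 0;
}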
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c
deleted file mode 100644
index 8c596e712599..000000000000
--- a/net/ipv4/ipvs/ip_vs_wlc.c
+++ /dev/null
@@ -1,128 +0,0 @@
1/*
2 * IPVS: Weighted Least-Connection Scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Peter Kese <peter.kese@ijs.si>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Changes:
13 * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
14 * Wensong Zhang : changed to use the inactconns in scheduling
15 * Wensong Zhang : changed some cosmetic things for debugging
16 * Wensong Zhang : changed for the doubly-linked destination list
17 * Wensong Zhang : added the ip_vs_wlc_update_svc
18 * Wensong Zhang : added any dest with weight=0 is quiesced
19 *
20 */
21
22#include <linux/module.h>
23#include <linux/kernel.h>
24
25#include <net/ip_vs.h>
26
27
28static inline unsigned int
29ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
30{
31 /*
32 * We think the overhead of processing active connections is 256
33 * times higher than that of inactive connections on average. (This
34 * factor of 256 may not be accurate; we may change it later.) We
35 * use the following formula to estimate the overhead now:
36 * dest->activeconns*256 + dest->inactconns
37 */
38 return (atomic_read(&dest->activeconns) << 8) +
39 atomic_read(&dest->inactconns);
40}
41
42
43/*
44 * Weighted Least Connection scheduling
45 */
46static struct ip_vs_dest *
47ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
48{
49 struct ip_vs_dest *dest, *least;
50 unsigned int loh, doh;
51
52 IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
53
54 /*
55 * We calculate the load of each dest server as follows:
56 * (dest overhead) / dest->weight
57 *
58 * Remember -- no floats in kernel mode!!!
59 * The comparison of h1*w2 > h2*w1 is equivalent to that of
60 * h1/w1 > h2/w2
61 * if every weight is larger than zero.
62 *
63 * The server with weight=0 is quiesced and will not receive any
64 * new connections.
65 */
66
67 list_for_each_entry(dest, &svc->destinations, n_list) {
68 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
69 atomic_read(&dest->weight) > 0) {
70 least = dest;
71 loh = ip_vs_wlc_dest_overhead(least);
72 goto nextstage;
73 }
74 }
75 return NULL;
76
77 /*
78 * Find the destination with the least load.
79 */
80 nextstage:
81 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
82 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
83 continue;
84 doh = ip_vs_wlc_dest_overhead(dest);
85 if (loh * atomic_read(&dest->weight) >
86 doh * atomic_read(&least->weight)) {
87 least = dest;
88 loh = doh;
89 }
90 }
91
92 IP_VS_DBG_BUF(6, "WLC: server %s:%u "
93 "activeconns %d refcnt %d weight %d overhead %d\n",
94 IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
95 atomic_read(&least->activeconns),
96 atomic_read(&least->refcnt),
97 atomic_read(&least->weight), loh);
98
99 return least;
100}
101
102
103static struct ip_vs_scheduler ip_vs_wlc_scheduler =
104{
105 .name = "wlc",
106 .refcnt = ATOMIC_INIT(0),
107 .module = THIS_MODULE,
108 .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
109#ifdef CONFIG_IP_VS_IPV6
110 .supports_ipv6 = 1,
111#endif
112 .schedule = ip_vs_wlc_schedule,
113};
114
115
116static int __init ip_vs_wlc_init(void)
117{
118 return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
119}
120
121static void __exit ip_vs_wlc_cleanup(void)
122{
123 unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
124}
125
126module_init(ip_vs_wlc_init);
127module_exit(ip_vs_wlc_cleanup);
128MODULE_LICENSE("GPL");
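
To make the integer trick in ip_vs_wlc_schedule() concrete, here is a small user-space sketch with hypothetical server data: it picks the destination with the smallest overhead/weight ratio using only the cross-comparison loh * w2 > doh * w1, so no division or floating point is needed.

#include <stdio.h>

struct server {
	const char *name;
	unsigned int activeconns;
	unsigned int inactconns;
	int weight;
};

/* same estimate as ip_vs_wlc_dest_overhead(): active*256 + inactive */
static unsigned int overhead(const struct server *s)
{
	return (s->activeconns << 8) + s->inactconns;
}

int main(void)
{
	struct server servers[] = {
		{ "rs1", 10, 300, 1 },
		{ "rs2", 25, 100, 3 },
		{ "rs3",  5,  50, 2 },
	};
	int n = sizeof(servers) / sizeof(servers[0]);
	const struct server *least = &servers[0];
	unsigned int loh = overhead(least), doh;
	int i;

	for (i = 1; i < n; i++) {
		if (servers[i].weight <= 0)
			continue;	/* weight 0 means quiesced */
		doh = overhead(&servers[i]);
		/*
		 * loh/least->weight > doh/weight  is equivalent to
		 * loh*weight > doh*least->weight  when all weights > 0,
		 * so the comparison stays in integer arithmetic.
		 */
		if ((unsigned long long)loh * servers[i].weight >
		    (unsigned long long)doh * least->weight) {
			least = &servers[i];
			loh = doh;
		}
	}
	printf("least loaded: %s\n", least->name);
	return 0;
}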
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c
deleted file mode 100644
index 7ea92fed50bf..000000000000
--- a/net/ipv4/ipvs/ip_vs_wrr.c
+++ /dev/null
@@ -1,237 +0,0 @@
1/*
2 * IPVS: Weighted Round-Robin Scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
13 * Wensong Zhang : changed some cosmetic things for debugging
14 * Wensong Zhang : changed for the doubly-linked destination list
15 * Wensong Zhang : added the ip_vs_wrr_update_svc
16 * Julian Anastasov : fixed the bug of returning destination
17 * with weight 0 when all weights are zero
18 *
19 */
20
21#include <linux/module.h>
22#include <linux/kernel.h>
23#include <linux/net.h>
24
25#include <net/ip_vs.h>
26
27/*
28 * current destination pointer for weighted round-robin scheduling
29 */
30struct ip_vs_wrr_mark {
31 struct list_head *cl; /* current list head */
32 int cw; /* current weight */
33 int mw; /* maximum weight */
34 int di; /* decreasing interval */
35};
36
37
38/*
39 * Get the gcd of server weights
40 */
41static int gcd(int a, int b)
42{
43 int c;
44
45 while ((c = a % b)) {
46 a = b;
47 b = c;
48 }
49 return b;
50}
51
52static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
53{
54 struct ip_vs_dest *dest;
55 int weight;
56 int g = 0;
57
58 list_for_each_entry(dest, &svc->destinations, n_list) {
59 weight = atomic_read(&dest->weight);
60 if (weight > 0) {
61 if (g > 0)
62 g = gcd(weight, g);
63 else
64 g = weight;
65 }
66 }
67 return g ? g : 1;
68}
69
70
71/*
72 * Get the maximum weight of the service destinations.
73 */
74static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
75{
76 struct ip_vs_dest *dest;
77 int weight = 0;
78
79 list_for_each_entry(dest, &svc->destinations, n_list) {
80 if (atomic_read(&dest->weight) > weight)
81 weight = atomic_read(&dest->weight);
82 }
83
84 return weight;
85}
86
87
88static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
89{
90 struct ip_vs_wrr_mark *mark;
91
92 /*
93 * Allocate the mark variable for WRR scheduling
94 */
95 mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
96 if (mark == NULL) {
97 IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
98 return -ENOMEM;
99 }
100 mark->cl = &svc->destinations;
101 mark->cw = 0;
102 mark->mw = ip_vs_wrr_max_weight(svc);
103 mark->di = ip_vs_wrr_gcd_weight(svc);
104 svc->sched_data = mark;
105
106 return 0;
107}
108
109
110static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
111{
112 /*
113 * Release the mark variable
114 */
115 kfree(svc->sched_data);
116
117 return 0;
118}
119
120
121static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
122{
123 struct ip_vs_wrr_mark *mark = svc->sched_data;
124
125 mark->cl = &svc->destinations;
126 mark->mw = ip_vs_wrr_max_weight(svc);
127 mark->di = ip_vs_wrr_gcd_weight(svc);
128 if (mark->cw > mark->mw)
129 mark->cw = 0;
130 return 0;
131}
132
133
134/*
135 * Weighted Round-Robin Scheduling
136 */
137static struct ip_vs_dest *
138ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
139{
140 struct ip_vs_dest *dest;
141 struct ip_vs_wrr_mark *mark = svc->sched_data;
142 struct list_head *p;
143
144 IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
145
146 /*
147 * This loop will always terminate, because mark->cw in (0, max_weight]
148 * and at least one server has its weight equal to max_weight.
149 */
150 write_lock(&svc->sched_lock);
151 p = mark->cl;
152 while (1) {
153 if (mark->cl == &svc->destinations) {
154 /* it is at the head of the destination list */
155
156 if (mark->cl == mark->cl->next) {
157 /* no dest entry */
158 dest = NULL;
159 goto out;
160 }
161
162 mark->cl = svc->destinations.next;
163 mark->cw -= mark->di;
164 if (mark->cw <= 0) {
165 mark->cw = mark->mw;
166 /*
167 * Still zero, which means no available servers.
168 */
169 if (mark->cw == 0) {
170 mark->cl = &svc->destinations;
171 IP_VS_ERR_RL("ip_vs_wrr_schedule(): "
172 "no available servers\n");
173 dest = NULL;
174 goto out;
175 }
176 }
177 } else
178 mark->cl = mark->cl->next;
179
180 if (mark->cl != &svc->destinations) {
181 /* not at the head of the list */
182 dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
183 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
184 atomic_read(&dest->weight) >= mark->cw) {
185 /* got it */
186 break;
187 }
188 }
189
190 if (mark->cl == p && mark->cw == mark->di) {
191			/* back to the start and no dest was found;
192			   this is only possible when all dests are OVERLOADED */
193 dest = NULL;
194 goto out;
195 }
196 }
197
198 IP_VS_DBG_BUF(6, "WRR: server %s:%u "
199 "activeconns %d refcnt %d weight %d\n",
200 IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
201 atomic_read(&dest->activeconns),
202 atomic_read(&dest->refcnt),
203 atomic_read(&dest->weight));
204
205 out:
206 write_unlock(&svc->sched_lock);
207 return dest;
208}
209
210
211static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
212 .name = "wrr",
213 .refcnt = ATOMIC_INIT(0),
214 .module = THIS_MODULE,
215 .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
216#ifdef CONFIG_IP_VS_IPV6
217 .supports_ipv6 = 1,
218#endif
219 .init_service = ip_vs_wrr_init_svc,
220 .done_service = ip_vs_wrr_done_svc,
221 .update_service = ip_vs_wrr_update_svc,
222 .schedule = ip_vs_wrr_schedule,
223};
224
225static int __init ip_vs_wrr_init(void)
226{
227	return register_ip_vs_scheduler(&ip_vs_wrr_scheduler);
228}
229
230static void __exit ip_vs_wrr_cleanup(void)
231{
232 unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
233}
234
235module_init(ip_vs_wrr_init);
236module_exit(ip_vs_wrr_cleanup);
237MODULE_LICENSE("GPL");
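
The mark->cw / mark->di / mark->mw bookkeeping above is the classic interleaved weighted round-robin pass: the current weight drops by the gcd of the weights each time the list wraps, and a destination is eligible while its weight is at least the current weight. The self-contained sketch below runs the same pass over a plain array with hypothetical weights; over one full cycle each destination i is picked weight[i] times.

#include <stdio.h>

/* gcd of two positive weights, as in ip_vs_wrr_gcd_weight() */
static int gcd(int a, int b)
{
	int c;

	while ((c = a % b)) {
		a = b;
		b = c;
	}
	return b;
}

int main(void)
{
	/* hypothetical weights; a weight of 0 would never be picked */
	int weight[] = { 4, 3, 2 };
	int n = sizeof(weight) / sizeof(weight[0]);
	int mw = 0, di = 0, cw = 0;
	int i = -1, j, pick;

	for (j = 0; j < n; j++) {
		if (weight[j] > mw)
			mw = weight[j];			/* max weight */
		if (weight[j] > 0)
			di = di ? gcd(di, weight[j]) : weight[j];
	}

	/* one full cycle of 4 + 3 + 2 = 9 picks */
	for (pick = 0; pick < 9; pick++) {
		for (;;) {
			i = (i + 1) % n;
			if (i == 0) {
				/* back at the head: lower the current weight */
				cw -= di;
				if (cw <= 0)
					cw = mw;
			}
			if (weight[i] >= cw)
				break;			/* dest i is eligible */
		}
		printf("pick dest %d (weight %d)\n", i, weight[i]);
	}
	return 0;
}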
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
deleted file mode 100644
index 02ddc2b3ce2e..000000000000
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ /dev/null
@@ -1,1004 +0,0 @@
1/*
2 * ip_vs_xmit.c: various packet transmitters for IPVS
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Julian Anastasov <ja@ssi.bg>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Changes:
13 *
14 */
15
16#include <linux/kernel.h>
17#include <linux/tcp.h> /* for tcphdr */
18#include <net/ip.h>
19#include <net/tcp.h> /* for csum_tcpudp_magic */
20#include <net/udp.h>
21#include <net/icmp.h> /* for icmp_send */
22#include <net/route.h> /* for ip_route_output */
23#include <net/ipv6.h>
24#include <net/ip6_route.h>
25#include <linux/icmpv6.h>
26#include <linux/netfilter.h>
27#include <linux/netfilter_ipv4.h>
28
29#include <net/ip_vs.h>
30
31
32/*
33 * Destination cache to speed up outgoing route lookup
34 */
35static inline void
36__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
37{
38 struct dst_entry *old_dst;
39
40 old_dst = dest->dst_cache;
41 dest->dst_cache = dst;
42 dest->dst_rtos = rtos;
43 dst_release(old_dst);
44}
45
46static inline struct dst_entry *
47__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
48{
49 struct dst_entry *dst = dest->dst_cache;
50
51 if (!dst)
52 return NULL;
53 if ((dst->obsolete
54 || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
55 dst->ops->check(dst, cookie) == NULL) {
56 dest->dst_cache = NULL;
57 dst_release(dst);
58 return NULL;
59 }
60 dst_hold(dst);
61 return dst;
62}
63
64static struct rtable *
65__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
66{
67 struct rtable *rt; /* Route to the other host */
68 struct ip_vs_dest *dest = cp->dest;
69
70 if (dest) {
71 spin_lock(&dest->dst_lock);
72 if (!(rt = (struct rtable *)
73 __ip_vs_dst_check(dest, rtos, 0))) {
74 struct flowi fl = {
75 .oif = 0,
76 .nl_u = {
77 .ip4_u = {
78 .daddr = dest->addr.ip,
79 .saddr = 0,
80 .tos = rtos, } },
81 };
82
83 if (ip_route_output_key(&init_net, &rt, &fl)) {
84 spin_unlock(&dest->dst_lock);
85 IP_VS_DBG_RL("ip_route_output error, "
86 "dest: %u.%u.%u.%u\n",
87 NIPQUAD(dest->addr.ip));
88 return NULL;
89 }
90 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
91 IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
92 NIPQUAD(dest->addr.ip),
93 atomic_read(&rt->u.dst.__refcnt), rtos);
94 }
95 spin_unlock(&dest->dst_lock);
96 } else {
97 struct flowi fl = {
98 .oif = 0,
99 .nl_u = {
100 .ip4_u = {
101 .daddr = cp->daddr.ip,
102 .saddr = 0,
103 .tos = rtos, } },
104 };
105
106 if (ip_route_output_key(&init_net, &rt, &fl)) {
107 IP_VS_DBG_RL("ip_route_output error, dest: "
108 "%u.%u.%u.%u\n", NIPQUAD(cp->daddr.ip));
109 return NULL;
110 }
111 }
112
113 return rt;
114}
115
116#ifdef CONFIG_IP_VS_IPV6
117static struct rt6_info *
118__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
119{
120 struct rt6_info *rt; /* Route to the other host */
121 struct ip_vs_dest *dest = cp->dest;
122
123 if (dest) {
124 spin_lock(&dest->dst_lock);
125 rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
126 if (!rt) {
127 struct flowi fl = {
128 .oif = 0,
129 .nl_u = {
130 .ip6_u = {
131 .daddr = dest->addr.in6,
132 .saddr = {
133 .s6_addr32 =
134 { 0, 0, 0, 0 },
135 },
136 },
137 },
138 };
139
140 rt = (struct rt6_info *)ip6_route_output(&init_net,
141 NULL, &fl);
142 if (!rt) {
143 spin_unlock(&dest->dst_lock);
144 IP_VS_DBG_RL("ip6_route_output error, "
145 "dest: " NIP6_FMT "\n",
146 NIP6(dest->addr.in6));
147 return NULL;
148 }
149 __ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst));
150 IP_VS_DBG(10, "new dst " NIP6_FMT ", refcnt=%d\n",
151 NIP6(dest->addr.in6),
152 atomic_read(&rt->u.dst.__refcnt));
153 }
154 spin_unlock(&dest->dst_lock);
155 } else {
156 struct flowi fl = {
157 .oif = 0,
158 .nl_u = {
159 .ip6_u = {
160 .daddr = cp->daddr.in6,
161 .saddr = {
162 .s6_addr32 = { 0, 0, 0, 0 },
163 },
164 },
165 },
166 };
167
168 rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
169 if (!rt) {
170 IP_VS_DBG_RL("ip6_route_output error, dest: "
171 NIP6_FMT "\n", NIP6(cp->daddr.in6));
172 return NULL;
173 }
174 }
175
176 return rt;
177}
178#endif
179
180
181/*
182 * Release dest->dst_cache before a dest is removed
183 */
184void
185ip_vs_dst_reset(struct ip_vs_dest *dest)
186{
187 struct dst_entry *old_dst;
188
189 old_dst = dest->dst_cache;
190 dest->dst_cache = NULL;
191 dst_release(old_dst);
192}
193
194#define IP_VS_XMIT(pf, skb, rt) \
195do { \
196 (skb)->ipvs_property = 1; \
197 skb_forward_csum(skb); \
198 NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
199 (rt)->u.dst.dev, dst_output); \
200} while (0)
201
202
203/*
204 * NULL transmitter (do nothing except return NF_ACCEPT)
205 */
206int
207ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
208 struct ip_vs_protocol *pp)
209{
210 /* we do not touch skb and do not need pskb ptr */
211 return NF_ACCEPT;
212}
213
214
215/*
216 * Bypass transmitter
217 * Let packets bypass the destination when the destination is not
218 * available; it may only be used in a transparent cache cluster.
219 */
220int
221ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
222 struct ip_vs_protocol *pp)
223{
224 struct rtable *rt; /* Route to the other host */
225 struct iphdr *iph = ip_hdr(skb);
226 u8 tos = iph->tos;
227 int mtu;
228 struct flowi fl = {
229 .oif = 0,
230 .nl_u = {
231 .ip4_u = {
232 .daddr = iph->daddr,
233 .saddr = 0,
234 .tos = RT_TOS(tos), } },
235 };
236
237 EnterFunction(10);
238
239 if (ip_route_output_key(&init_net, &rt, &fl)) {
240 IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
241 "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
242 goto tx_error_icmp;
243 }
244
245 /* MTU checking */
246 mtu = dst_mtu(&rt->u.dst);
247 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
248 ip_rt_put(rt);
249 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
250 IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
251 goto tx_error;
252 }
253
254 /*
255 * Call ip_send_check because we are not sure it is called
256 * after ip_defrag. Is copy-on-write needed?
257 */
258 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
259 ip_rt_put(rt);
260 return NF_STOLEN;
261 }
262 ip_send_check(ip_hdr(skb));
263
264 /* drop old route */
265 dst_release(skb->dst);
266 skb->dst = &rt->u.dst;
267
268 /* Another hack: avoid icmp_send in ip_fragment */
269 skb->local_df = 1;
270
271 IP_VS_XMIT(PF_INET, skb, rt);
272
273 LeaveFunction(10);
274 return NF_STOLEN;
275
276 tx_error_icmp:
277 dst_link_failure(skb);
278 tx_error:
279 kfree_skb(skb);
280 LeaveFunction(10);
281 return NF_STOLEN;
282}
283
284#ifdef CONFIG_IP_VS_IPV6
285int
286ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
287 struct ip_vs_protocol *pp)
288{
289 struct rt6_info *rt; /* Route to the other host */
290 struct ipv6hdr *iph = ipv6_hdr(skb);
291 int mtu;
292 struct flowi fl = {
293 .oif = 0,
294 .nl_u = {
295 .ip6_u = {
296 .daddr = iph->daddr,
297 .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
298 };
299
300 EnterFunction(10);
301
302 rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
303 if (!rt) {
304 IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, "
305 "dest: " NIP6_FMT "\n", NIP6(iph->daddr));
306 goto tx_error_icmp;
307 }
308
309 /* MTU checking */
310 mtu = dst_mtu(&rt->u.dst);
311 if (skb->len > mtu) {
312 dst_release(&rt->u.dst);
313 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
314 IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): frag needed\n");
315 goto tx_error;
316 }
317
318 /*
319 * Call ip_send_check because we are not sure it is called
320 * after ip_defrag. Is copy-on-write needed?
321 */
322 skb = skb_share_check(skb, GFP_ATOMIC);
323 if (unlikely(skb == NULL)) {
324 dst_release(&rt->u.dst);
325 return NF_STOLEN;
326 }
327
328 /* drop old route */
329 dst_release(skb->dst);
330 skb->dst = &rt->u.dst;
331
332 /* Another hack: avoid icmp_send in ip_fragment */
333 skb->local_df = 1;
334
335 IP_VS_XMIT(PF_INET6, skb, rt);
336
337 LeaveFunction(10);
338 return NF_STOLEN;
339
340 tx_error_icmp:
341 dst_link_failure(skb);
342 tx_error:
343 kfree_skb(skb);
344 LeaveFunction(10);
345 return NF_STOLEN;
346}
347#endif
348
349/*
350 * NAT transmitter (only for outside-to-inside nat forwarding)
351 * Not used for related ICMP
352 */
353int
354ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
355 struct ip_vs_protocol *pp)
356{
357 struct rtable *rt; /* Route to the other host */
358 int mtu;
359 struct iphdr *iph = ip_hdr(skb);
360
361 EnterFunction(10);
362
363 /* check if it is a connection of no-client-port */
364 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
365 __be16 _pt, *p;
366 p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
367 if (p == NULL)
368 goto tx_error;
369 ip_vs_conn_fill_cport(cp, *p);
370 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
371 }
372
373 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
374 goto tx_error_icmp;
375
376 /* MTU checking */
377 mtu = dst_mtu(&rt->u.dst);
378 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
379 ip_rt_put(rt);
380 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
381 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
382 goto tx_error;
383 }
384
385 /* copy-on-write the packet before mangling it */
386 if (!skb_make_writable(skb, sizeof(struct iphdr)))
387 goto tx_error_put;
388
389 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
390 goto tx_error_put;
391
392 /* drop old route */
393 dst_release(skb->dst);
394 skb->dst = &rt->u.dst;
395
396 /* mangle the packet */
397 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
398 goto tx_error;
399 ip_hdr(skb)->daddr = cp->daddr.ip;
400 ip_send_check(ip_hdr(skb));
401
402 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
403
404 /* FIXME: when application helper enlarges the packet and the length
405 is larger than the MTU of outgoing device, there will be still
406 MTU problem. */
407
408 /* Another hack: avoid icmp_send in ip_fragment */
409 skb->local_df = 1;
410
411 IP_VS_XMIT(PF_INET, skb, rt);
412
413 LeaveFunction(10);
414 return NF_STOLEN;
415
416 tx_error_icmp:
417 dst_link_failure(skb);
418 tx_error:
419 LeaveFunction(10);
420 kfree_skb(skb);
421 return NF_STOLEN;
422 tx_error_put:
423 ip_rt_put(rt);
424 goto tx_error;
425}
426
427#ifdef CONFIG_IP_VS_IPV6
428int
429ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
430 struct ip_vs_protocol *pp)
431{
432 struct rt6_info *rt; /* Route to the other host */
433 int mtu;
434
435 EnterFunction(10);
436
437 /* check if it is a connection of no-client-port */
438 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
439 __be16 _pt, *p;
440 p = skb_header_pointer(skb, sizeof(struct ipv6hdr),
441 sizeof(_pt), &_pt);
442 if (p == NULL)
443 goto tx_error;
444 ip_vs_conn_fill_cport(cp, *p);
445 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
446 }
447
448 rt = __ip_vs_get_out_rt_v6(cp);
449 if (!rt)
450 goto tx_error_icmp;
451
452 /* MTU checking */
453 mtu = dst_mtu(&rt->u.dst);
454 if (skb->len > mtu) {
455 dst_release(&rt->u.dst);
456 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
457 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
458 "ip_vs_nat_xmit_v6(): frag needed for");
459 goto tx_error;
460 }
461
462 /* copy-on-write the packet before mangling it */
463 if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
464 goto tx_error_put;
465
466 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
467 goto tx_error_put;
468
469 /* drop old route */
470 dst_release(skb->dst);
471 skb->dst = &rt->u.dst;
472
473 /* mangle the packet */
474 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
475 goto tx_error;
476 ipv6_hdr(skb)->daddr = cp->daddr.in6;
477
478 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
479
480 /* FIXME: when application helper enlarges the packet and the length
481 is larger than the MTU of outgoing device, there will be still
482 MTU problem. */
483
484 /* Another hack: avoid icmp_send in ip_fragment */
485 skb->local_df = 1;
486
487 IP_VS_XMIT(PF_INET6, skb, rt);
488
489 LeaveFunction(10);
490 return NF_STOLEN;
491
492tx_error_icmp:
493 dst_link_failure(skb);
494tx_error:
495 LeaveFunction(10);
496 kfree_skb(skb);
497 return NF_STOLEN;
498tx_error_put:
499 dst_release(&rt->u.dst);
500 goto tx_error;
501}
502#endif
503
504
505/*
506 * IP Tunneling transmitter
507 *
508 * This function encapsulates the packet in a new IP packet, its
509 * destination will be set to cp->daddr. Most code of this function
510 * is taken from ipip.c.
511 *
512 * It is used in VS/TUN cluster. The load balancer selects a real
513 * server from a cluster based on a scheduling algorithm,
514 * encapsulates the request packet and forwards it to the selected
515 * server. For example, all real servers are configured with
516 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
517 * the encapsulated packet, it will decapsulate the packet, process
518 * the request and return the response packets directly to the client
519 * without passing through the load balancer. This can greatly increase
520 * the scalability of a virtual server.
521 *
522 * Used for ANY protocol
523 */
524int
525ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
526 struct ip_vs_protocol *pp)
527{
528 struct rtable *rt; /* Route to the other host */
529 struct net_device *tdev; /* Device to other host */
530 struct iphdr *old_iph = ip_hdr(skb);
531 u8 tos = old_iph->tos;
532 __be16 df = old_iph->frag_off;
533 sk_buff_data_t old_transport_header = skb->transport_header;
534 struct iphdr *iph; /* Our new IP header */
535 unsigned int max_headroom; /* The extra header space needed */
536 int mtu;
537
538 EnterFunction(10);
539
540 if (skb->protocol != htons(ETH_P_IP)) {
541 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
542 "ETH_P_IP: %d, skb protocol: %d\n",
543 htons(ETH_P_IP), skb->protocol);
544 goto tx_error;
545 }
546
547 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
548 goto tx_error_icmp;
549
550 tdev = rt->u.dst.dev;
551
552 mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
553 if (mtu < 68) {
554 ip_rt_put(rt);
555 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
556 goto tx_error;
557 }
558 if (skb->dst)
559 skb->dst->ops->update_pmtu(skb->dst, mtu);
560
561 df |= (old_iph->frag_off & htons(IP_DF));
562
563 if ((old_iph->frag_off & htons(IP_DF))
564 && mtu < ntohs(old_iph->tot_len)) {
565 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
566 ip_rt_put(rt);
567 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
568 goto tx_error;
569 }
570
571 /*
572 * Okay, now see if we can stuff it in the buffer as-is.
573 */
574 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
575
576 if (skb_headroom(skb) < max_headroom
577 || skb_cloned(skb) || skb_shared(skb)) {
578 struct sk_buff *new_skb =
579 skb_realloc_headroom(skb, max_headroom);
580 if (!new_skb) {
581 ip_rt_put(rt);
582 kfree_skb(skb);
583 IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
584 return NF_STOLEN;
585 }
586 kfree_skb(skb);
587 skb = new_skb;
588 old_iph = ip_hdr(skb);
589 }
590
591 skb->transport_header = old_transport_header;
592
593 /* fix old IP header checksum */
594 ip_send_check(old_iph);
595
596 skb_push(skb, sizeof(struct iphdr));
597 skb_reset_network_header(skb);
598 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
599
600 /* drop old route */
601 dst_release(skb->dst);
602 skb->dst = &rt->u.dst;
603
604 /*
605 * Push down and install the IPIP header.
606 */
607 iph = ip_hdr(skb);
608 iph->version = 4;
609 iph->ihl = sizeof(struct iphdr)>>2;
610 iph->frag_off = df;
611 iph->protocol = IPPROTO_IPIP;
612 iph->tos = tos;
613 iph->daddr = rt->rt_dst;
614 iph->saddr = rt->rt_src;
615 iph->ttl = old_iph->ttl;
616 ip_select_ident(iph, &rt->u.dst, NULL);
617
618 /* Another hack: avoid icmp_send in ip_fragment */
619 skb->local_df = 1;
620
621 ip_local_out(skb);
622
623 LeaveFunction(10);
624
625 return NF_STOLEN;
626
627 tx_error_icmp:
628 dst_link_failure(skb);
629 tx_error:
630 kfree_skb(skb);
631 LeaveFunction(10);
632 return NF_STOLEN;
633}
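
The VS/TUN path above boils down to prepending a second IPv4 header, with protocol IPPROTO_IPIP, in front of the original packet. Below is a small user-space sketch of just that header construction; fill_outer_iph() is a hypothetical helper, the addresses and the inner packet are made up, and identification/checksum are left to the stack (the kernel path uses ip_select_ident() for that).

#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* hypothetical helper mirroring the outer-header setup in
 * ip_vs_tunnel_xmit() */
static void fill_outer_iph(struct iphdr *outer, const struct iphdr *inner,
			   uint32_t saddr, uint32_t daddr)
{
	memset(outer, 0, sizeof(*outer));
	outer->version  = 4;
	outer->ihl      = sizeof(struct iphdr) >> 2;
	outer->tos      = inner->tos;
	outer->tot_len  = htons((uint16_t)(ntohs(inner->tot_len) +
					   sizeof(struct iphdr)));
	outer->frag_off = inner->frag_off & htons(IP_DF);
	outer->ttl      = inner->ttl;
	outer->protocol = IPPROTO_IPIP;
	outer->saddr    = saddr;
	outer->daddr    = daddr;
	/* id and checksum are left to the stack */
}

int main(void)
{
	struct iphdr inner, outer;

	memset(&inner, 0, sizeof(inner));
	inner.version  = 4;
	inner.ihl      = 5;
	inner.tot_len  = htons(84);		/* e.g. a 64-byte ICMP echo */
	inner.ttl      = 64;
	inner.frag_off = htons(IP_DF);

	fill_outer_iph(&outer, &inner,
		       inet_addr("192.0.2.1"),		/* director (made up) */
		       inet_addr("192.0.2.10"));	/* real server (made up) */

	printf("outer tot_len=%u proto=%u\n",
	       ntohs(outer.tot_len), outer.protocol);
	return 0;
}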
634
635#ifdef CONFIG_IP_VS_IPV6
636int
637ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
638 struct ip_vs_protocol *pp)
639{
640 struct rt6_info *rt; /* Route to the other host */
641 struct net_device *tdev; /* Device to other host */
642 struct ipv6hdr *old_iph = ipv6_hdr(skb);
643 sk_buff_data_t old_transport_header = skb->transport_header;
644 struct ipv6hdr *iph; /* Our new IP header */
645 unsigned int max_headroom; /* The extra header space needed */
646 int mtu;
647
648 EnterFunction(10);
649
650 if (skb->protocol != htons(ETH_P_IPV6)) {
651 IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): protocol error, "
652 "ETH_P_IPV6: %d, skb protocol: %d\n",
653 htons(ETH_P_IPV6), skb->protocol);
654 goto tx_error;
655 }
656
657 rt = __ip_vs_get_out_rt_v6(cp);
658 if (!rt)
659 goto tx_error_icmp;
660
661 tdev = rt->u.dst.dev;
662
663 mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr);
664 /* TODO IPv6: do we need this check in IPv6? */
665 if (mtu < 1280) {
666 dst_release(&rt->u.dst);
667 IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n");
668 goto tx_error;
669 }
670 if (skb->dst)
671 skb->dst->ops->update_pmtu(skb->dst, mtu);
672
673 if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
674 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
675 dst_release(&rt->u.dst);
676 IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): frag needed\n");
677 goto tx_error;
678 }
679
680 /*
681 * Okay, now see if we can stuff it in the buffer as-is.
682 */
683 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
684
685 if (skb_headroom(skb) < max_headroom
686 || skb_cloned(skb) || skb_shared(skb)) {
687 struct sk_buff *new_skb =
688 skb_realloc_headroom(skb, max_headroom);
689 if (!new_skb) {
690 dst_release(&rt->u.dst);
691 kfree_skb(skb);
692 IP_VS_ERR_RL("ip_vs_tunnel_xmit_v6(): no memory\n");
693 return NF_STOLEN;
694 }
695 kfree_skb(skb);
696 skb = new_skb;
697 old_iph = ipv6_hdr(skb);
698 }
699
700 skb->transport_header = old_transport_header;
701
702 skb_push(skb, sizeof(struct ipv6hdr));
703 skb_reset_network_header(skb);
704 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
705
706 /* drop old route */
707 dst_release(skb->dst);
708 skb->dst = &rt->u.dst;
709
710 /*
711 * Push down and install the IPIP header.
712 */
713 iph = ipv6_hdr(skb);
714 iph->version = 6;
715 iph->nexthdr = IPPROTO_IPV6;
716	iph->payload_len	=	htons(ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr));
717 iph->priority = old_iph->priority;
718 memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
719 iph->daddr = rt->rt6i_dst.addr;
720 iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */
721 iph->hop_limit = old_iph->hop_limit;
722
723 /* Another hack: avoid icmp_send in ip_fragment */
724 skb->local_df = 1;
725
726 ip6_local_out(skb);
727
728 LeaveFunction(10);
729
730 return NF_STOLEN;
731
732tx_error_icmp:
733 dst_link_failure(skb);
734tx_error:
735 kfree_skb(skb);
736 LeaveFunction(10);
737 return NF_STOLEN;
738}
739#endif
740
741
742/*
743 * Direct Routing transmitter
744 * Used for ANY protocol
745 */
746int
747ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
748 struct ip_vs_protocol *pp)
749{
750 struct rtable *rt; /* Route to the other host */
751 struct iphdr *iph = ip_hdr(skb);
752 int mtu;
753
754 EnterFunction(10);
755
756 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
757 goto tx_error_icmp;
758
759 /* MTU checking */
760 mtu = dst_mtu(&rt->u.dst);
761 if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
762 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
763 ip_rt_put(rt);
764 IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
765 goto tx_error;
766 }
767
768 /*
769 * Call ip_send_check because we are not sure it is called
770 * after ip_defrag. Is copy-on-write needed?
771 */
772 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
773 ip_rt_put(rt);
774 return NF_STOLEN;
775 }
776 ip_send_check(ip_hdr(skb));
777
778 /* drop old route */
779 dst_release(skb->dst);
780 skb->dst = &rt->u.dst;
781
782 /* Another hack: avoid icmp_send in ip_fragment */
783 skb->local_df = 1;
784
785 IP_VS_XMIT(PF_INET, skb, rt);
786
787 LeaveFunction(10);
788 return NF_STOLEN;
789
790 tx_error_icmp:
791 dst_link_failure(skb);
792 tx_error:
793 kfree_skb(skb);
794 LeaveFunction(10);
795 return NF_STOLEN;
796}
797
798#ifdef CONFIG_IP_VS_IPV6
799int
800ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
801 struct ip_vs_protocol *pp)
802{
803 struct rt6_info *rt; /* Route to the other host */
804 int mtu;
805
806 EnterFunction(10);
807
808 rt = __ip_vs_get_out_rt_v6(cp);
809 if (!rt)
810 goto tx_error_icmp;
811
812 /* MTU checking */
813 mtu = dst_mtu(&rt->u.dst);
814 if (skb->len > mtu) {
815 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
816 dst_release(&rt->u.dst);
817 IP_VS_DBG_RL("ip_vs_dr_xmit_v6(): frag needed\n");
818 goto tx_error;
819 }
820
821 /*
822 * Call ip_send_check because we are not sure it is called
823 * after ip_defrag. Is copy-on-write needed?
824 */
825 skb = skb_share_check(skb, GFP_ATOMIC);
826 if (unlikely(skb == NULL)) {
827 dst_release(&rt->u.dst);
828 return NF_STOLEN;
829 }
830
831 /* drop old route */
832 dst_release(skb->dst);
833 skb->dst = &rt->u.dst;
834
835 /* Another hack: avoid icmp_send in ip_fragment */
836 skb->local_df = 1;
837
838 IP_VS_XMIT(PF_INET6, skb, rt);
839
840 LeaveFunction(10);
841 return NF_STOLEN;
842
843tx_error_icmp:
844 dst_link_failure(skb);
845tx_error:
846 kfree_skb(skb);
847 LeaveFunction(10);
848 return NF_STOLEN;
849}
850#endif
851
852
853/*
854 * ICMP packet transmitter
855 * called by the ip_vs_in_icmp
856 */
857int
858ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
859 struct ip_vs_protocol *pp, int offset)
860{
861 struct rtable *rt; /* Route to the other host */
862 int mtu;
863 int rc;
864
865 EnterFunction(10);
866
867 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
868 forwarded directly here, because there is no need to
869 translate address/port back */
870 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
871 if (cp->packet_xmit)
872 rc = cp->packet_xmit(skb, cp, pp);
873 else
874 rc = NF_ACCEPT;
875 /* do not touch skb anymore */
876 atomic_inc(&cp->in_pkts);
877 goto out;
878 }
879
880 /*
881 * mangle and send the packet here (only for VS/NAT)
882 */
883
884 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
885 goto tx_error_icmp;
886
887 /* MTU checking */
888 mtu = dst_mtu(&rt->u.dst);
889 if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
890 ip_rt_put(rt);
891 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
892		IP_VS_DBG_RL("ip_vs_icmp_xmit(): frag needed\n");
893 goto tx_error;
894 }
895
896 /* copy-on-write the packet before mangling it */
897 if (!skb_make_writable(skb, offset))
898 goto tx_error_put;
899
900 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
901 goto tx_error_put;
902
903 /* drop the old route when skb is not shared */
904 dst_release(skb->dst);
905 skb->dst = &rt->u.dst;
906
907 ip_vs_nat_icmp(skb, pp, cp, 0);
908
909 /* Another hack: avoid icmp_send in ip_fragment */
910 skb->local_df = 1;
911
912 IP_VS_XMIT(PF_INET, skb, rt);
913
914 rc = NF_STOLEN;
915 goto out;
916
917 tx_error_icmp:
918 dst_link_failure(skb);
919 tx_error:
920 dev_kfree_skb(skb);
921 rc = NF_STOLEN;
922 out:
923 LeaveFunction(10);
924 return rc;
925 tx_error_put:
926 ip_rt_put(rt);
927 goto tx_error;
928}
929
930#ifdef CONFIG_IP_VS_IPV6
931int
932ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
933 struct ip_vs_protocol *pp, int offset)
934{
935 struct rt6_info *rt; /* Route to the other host */
936 int mtu;
937 int rc;
938
939 EnterFunction(10);
940
941 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
942 forwarded directly here, because there is no need to
943 translate address/port back */
944 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
945 if (cp->packet_xmit)
946 rc = cp->packet_xmit(skb, cp, pp);
947 else
948 rc = NF_ACCEPT;
949 /* do not touch skb anymore */
950 atomic_inc(&cp->in_pkts);
951 goto out;
952 }
953
954 /*
955 * mangle and send the packet here (only for VS/NAT)
956 */
957
958 rt = __ip_vs_get_out_rt_v6(cp);
959 if (!rt)
960 goto tx_error_icmp;
961
962 /* MTU checking */
963 mtu = dst_mtu(&rt->u.dst);
964 if (skb->len > mtu) {
965 dst_release(&rt->u.dst);
966 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
967		IP_VS_DBG_RL("ip_vs_icmp_xmit_v6(): frag needed\n");
968 goto tx_error;
969 }
970
971 /* copy-on-write the packet before mangling it */
972 if (!skb_make_writable(skb, offset))
973 goto tx_error_put;
974
975 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
976 goto tx_error_put;
977
978 /* drop the old route when skb is not shared */
979 dst_release(skb->dst);
980 skb->dst = &rt->u.dst;
981
982 ip_vs_nat_icmp_v6(skb, pp, cp, 0);
983
984 /* Another hack: avoid icmp_send in ip_fragment */
985 skb->local_df = 1;
986
987 IP_VS_XMIT(PF_INET6, skb, rt);
988
989 rc = NF_STOLEN;
990 goto out;
991
992tx_error_icmp:
993 dst_link_failure(skb);
994tx_error:
995 dev_kfree_skb(skb);
996 rc = NF_STOLEN;
997out:
998 LeaveFunction(10);
999 return rc;
1000tx_error_put:
1001 dst_release(&rt->u.dst);
1002 goto tx_error;
1003}
1004#endif