diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/ipv4/ipvs |
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'net/ipv4/ipvs')
27 files changed, 12565 insertions, 0 deletions
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig new file mode 100644 index 000000000000..63a82b4b64bb --- /dev/null +++ b/net/ipv4/ipvs/Kconfig | |||
@@ -0,0 +1,244 @@ | |||
#
# IP Virtual Server configuration
#
menu	"IP: Virtual Server Configuration"
	depends on INET && NETFILTER

config	IP_VS
	tristate "IP virtual server support (EXPERIMENTAL)"
	depends on INET && NETFILTER
	---help---
	  IP Virtual Server support will let you build a high-performance
	  virtual server based on cluster of two or more real servers. This
	  option must be enabled for at least one of the clustered computers
	  that will take care of intercepting incoming connections to a
	  single IP address and scheduling them to real servers.

	  Three request dispatching techniques are implemented, they are
	  virtual server via NAT, virtual server via tunneling and virtual
	  server via direct routing. The several scheduling algorithms can
	  be used to choose which server the connection is directed to,
	  thus load balancing can be achieved among the servers.  For more
	  information and its administration program, please visit the
	  following URL: <http://www.linuxvirtualserver.org/>.

	  If you want to compile it in kernel, say Y. To compile it as a
	  module, choose M here. If unsure, say N.

config	IP_VS_DEBUG
	bool "IP virtual server debugging"
	depends on IP_VS
	---help---
	  Say Y here if you want to get additional messages useful in
	  debugging the IP virtual server code. You can change the debug
	  level in /proc/sys/net/ipv4/vs/debug_level

config	IP_VS_TAB_BITS
	int "IPVS connection table size (the Nth power of 2)"
	depends on IP_VS
	default "12"
	---help---
	  The IPVS connection hash table uses the chaining scheme to handle
	  hash collisions. Using a big IPVS connection hash table will greatly
	  reduce conflicts when there are hundreds of thousands of connections
	  in the hash table.

	  Note the table size must be power of 2. The table size will be the
	  value of 2 to the power of the number you input. The number to
	  choose is from 8 to 20, the default number is 12, which means the
	  table size is 4096. Don't input the number too small, otherwise you
	  will lose performance on it. You can adapt the table size yourself,
	  according to your virtual server application. It is good to set the
	  table size not far less than the number of connections per second
	  multiplying average lasting time of connection in the table. For
	  example, your virtual server gets 200 connections per second, the
	  connection lasts for 200 seconds in average in the connection table,
	  the table size should be not far less than 200x200, it is good to
	  set the table size 32768 (2**15).

	  Another note that each connection occupies 128 bytes effectively and
	  each hash entry uses 8 bytes, so you can estimate how much memory is
	  needed for your box.

comment "IPVS transport protocol load balancing support"
        depends on IP_VS

config	IP_VS_PROTO_TCP
	bool "TCP load balancing support"
	depends on IP_VS
	---help---
	  This option enables support for load balancing TCP transport
	  protocol. Say Y if unsure.

config	IP_VS_PROTO_UDP
	bool "UDP load balancing support"
	depends on IP_VS
	---help---
	  This option enables support for load balancing UDP transport
	  protocol. Say Y if unsure.

config	IP_VS_PROTO_ESP
	bool "ESP load balancing support"
	depends on IP_VS
	---help---
	  This option enables support for load balancing ESP (Encapsulation
	  Security Payload) transport protocol. Say Y if unsure.

config	IP_VS_PROTO_AH
	bool "AH load balancing support"
	depends on IP_VS
	---help---
	  This option enables support for load balancing AH (Authentication
	  Header) transport protocol. Say Y if unsure.

comment "IPVS scheduler"
        depends on IP_VS

config	IP_VS_RR
	tristate "round-robin scheduling"
	depends on IP_VS
	---help---
	  The round-robin scheduling algorithm simply directs network
	  connections to different real servers in a round-robin manner.

	  If you want to compile it in kernel, say Y. To compile it as a
	  module, choose M here. If unsure, say N.
 
config	IP_VS_WRR
        tristate "weighted round-robin scheduling" 
	depends on IP_VS
	---help---
	  The weighted round-robin scheduling algorithm directs network
	  connections to different real servers based on server weights
	  in a round-robin manner. Servers with higher weights receive
	  new connections first than those with less weights, and servers
	  with higher weights get more connections than those with less
	  weights and servers with equal weights get equal connections.

	  If you want to compile it in kernel, say Y. To compile it as a
	  module, choose M here. If unsure, say N.

config	IP_VS_LC
        tristate "least-connection scheduling"
	depends on IP_VS
	---help---
	  The least-connection scheduling algorithm directs network
	  connections to the server with the least number of active 
	  connections.

	  If you want to compile it in kernel, say Y. To compile it as a
	  module, choose M here. If unsure, say N.

config	IP_VS_WLC
        tristate "weighted least-connection scheduling"
	depends on IP_VS
	---help---
	  The weighted least-connection scheduling algorithm directs network
	  connections to the server with the least active connections
	  normalized by the server weight.

	  If you want to compile it in kernel, say Y. To compile it as a
	  module, choose M here. If unsure, say N.

config	IP_VS_LBLC
	tristate "locality-based least-connection scheduling"
	depends on IP_VS
	---help---
	  The locality-based least-connection scheduling algorithm is for
	  destination IP load balancing. It is usually used in cache cluster.
	  This algorithm usually directs packet destined for an IP address to
	  its server if the server is alive and under load. If the server is
	  overloaded (its active connection numbers is larger than its weight)
	  and there is a server in its half load, then allocate the weighted
	  least-connection server to this IP address.

	  If you want to compile it in kernel, say Y. To compile it as a
	  module, choose M here. If unsure, say N.

config  IP_VS_LBLCR
	tristate "locality-based least-connection with replication scheduling"
	depends on IP_VS
	---help---
	  The locality-based least-connection with replication scheduling
	  algorithm is also for destination IP load balancing. It is 
	  usually used in cache cluster. It differs from the LBLC scheduling
	  as follows: the load balancer maintains mappings from a target
	  to a set of server nodes that can serve the target. Requests for
	  a target are assigned to the least-connection node in the target's
	  server set. If all the node in the server set are over loaded,
	  it picks up a least-connection node in the cluster and adds it
	  in the server set for the target. If the server set has not been
	  modified for the specified time, the most loaded node is removed
	  from the server set, in order to avoid high degree of replication.

	  If you want to compile it in kernel, say Y. To compile it as a
	  module, choose M here. If unsure, say N.

config	IP_VS_DH
	tristate "destination hashing scheduling"
	depends on IP_VS
	---help---
	  The destination hashing scheduling algorithm assigns network
	  connections to the servers through looking up a statically assigned
	  hash table by their destination IP addresses.

	  If you want to compile it in kernel, say Y. To compile it as a
	  module, choose M here. If unsure, say N.

config	IP_VS_SH
	tristate "source hashing scheduling"
	depends on IP_VS
	---help---
	  The source hashing scheduling algorithm assigns network
	  connections to the servers through looking up a statically assigned
	  hash table by their source IP addresses.

	  If you want to compile it in kernel, say Y. To compile it as a
	  module, choose M here. If unsure, say N.

config	IP_VS_SED
	tristate "shortest expected delay scheduling"
	depends on IP_VS
	---help---
	  The shortest expected delay scheduling algorithm assigns network
	  connections to the server with the shortest expected delay. The 
	  expected delay that the job will experience is (Ci + 1) / Ui if 
	  sent to the ith server, in which Ci is the number of connections
	  on the ith server and Ui is the fixed service rate (weight) of 
	  the ith server.

	  If you want to compile it in kernel, say Y. To compile it as a
	  module, choose M here. If unsure, say N.

config	IP_VS_NQ
	tristate "never queue scheduling"
	depends on IP_VS
	---help---
	  The never queue scheduling algorithm adopts a two-speed model.
	  When there is an idle server available, the job will be sent to
	  the idle server, instead of waiting for a fast one. When there
	  is no idle server available, the job will be sent to the server
	  that minimize its expected delay (The Shortest Expected Delay
	  scheduling algorithm).

	  If you want to compile it in kernel, say Y. To compile it as a
	  module, choose M here. If unsure, say N.

comment "IPVS application helper"
	depends on IP_VS

config	IP_VS_FTP
  	tristate "FTP protocol helper"
        depends on IP_VS && IP_VS_PROTO_TCP
	---help---
	  FTP is a protocol that transfers IP address and/or port number in
	  the payload. In the virtual server via Network Address Translation,
	  the IP address and port number of real servers cannot be sent to
	  clients in ftp connections directly, so FTP protocol helper is
	  required for tracking the connection and mangling it back to that of
	  virtual service.

	  If you want to compile it in kernel, say Y. To compile it as a
	  module, choose M here. If unsure, say N.

endmenu
diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile new file mode 100644 index 000000000000..a788461a40c9 --- /dev/null +++ b/net/ipv4/ipvs/Makefile | |||
@@ -0,0 +1,34 @@ | |||
#
# Makefile for the IPVS modules on top of IPv4.
#

# IPVS transport protocol load balancing support.
# Each enabled protocol handler object is linked into the single ip_vs
# module (via ip_vs-objs below) rather than built as its own module.
ip_vs_proto-objs-y :=
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o

ip_vs-objs :=	ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o	   \
		ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o	 		   \
		ip_vs_est.o ip_vs_proto.o ip_vs_proto_icmp.o		   \
		$(ip_vs_proto-objs-y)


# IPVS core
obj-$(CONFIG_IP_VS) += ip_vs.o

# IPVS schedulers (each one is an independent module)
obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o

# IPVS application helpers
obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c new file mode 100644 index 000000000000..d9212addd193 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_app.c | |||
@@ -0,0 +1,658 @@ | |||
1 | /* | ||
2 | * ip_vs_app.c: Application module support for IPVS | ||
3 | * | ||
4 | * Version: $Id: ip_vs_app.c,v 1.17 2003/03/22 06:31:21 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License | ||
10 | * as published by the Free Software Foundation; either version | ||
11 | * 2 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference | ||
14 | * is that ip_vs_app module handles the reverse direction (incoming requests | ||
15 | * and outgoing responses). | ||
16 | * | ||
17 | * IP_MASQ_APP application masquerading module | ||
18 | * | ||
19 | * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> | ||
20 | * | ||
21 | */ | ||
22 | |||
23 | #include <linux/module.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/skbuff.h> | ||
26 | #include <linux/in.h> | ||
27 | #include <linux/ip.h> | ||
28 | #include <net/protocol.h> | ||
29 | #include <asm/system.h> | ||
30 | #include <linux/stat.h> | ||
31 | #include <linux/proc_fs.h> | ||
32 | #include <linux/seq_file.h> | ||
33 | |||
34 | #include <net/ip_vs.h> | ||
35 | |||
/* Public entry points used by application helper modules (e.g. ip_vs_ftp) */
EXPORT_SYMBOL(register_ip_vs_app);
EXPORT_SYMBOL(unregister_ip_vs_app);
EXPORT_SYMBOL(register_ip_vs_app_inc);

/* ipvs application list head */
static LIST_HEAD(ip_vs_app_list);
/* serializes list registration/unregistration and the /proc traversal */
static DECLARE_MUTEX(__ip_vs_app_mutex);
43 | |||
44 | |||
45 | /* | ||
46 | * Get an ip_vs_app object | ||
47 | */ | ||
48 | static inline int ip_vs_app_get(struct ip_vs_app *app) | ||
49 | { | ||
50 | /* test and get the module atomically */ | ||
51 | if (app->module) | ||
52 | return try_module_get(app->module); | ||
53 | else | ||
54 | return 1; | ||
55 | } | ||
56 | |||
57 | |||
/*
 * Drop the module reference taken by ip_vs_app_get().  No-op for
 * built-in (non-module) applications.
 */
static inline void ip_vs_app_put(struct ip_vs_app *app)
{
	if (app->module)
		module_put(app->module);
}
63 | |||
64 | |||
65 | /* | ||
66 | * Allocate/initialize app incarnation and register it in proto apps. | ||
67 | */ | ||
68 | static int | ||
69 | ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) | ||
70 | { | ||
71 | struct ip_vs_protocol *pp; | ||
72 | struct ip_vs_app *inc; | ||
73 | int ret; | ||
74 | |||
75 | if (!(pp = ip_vs_proto_get(proto))) | ||
76 | return -EPROTONOSUPPORT; | ||
77 | |||
78 | if (!pp->unregister_app) | ||
79 | return -EOPNOTSUPP; | ||
80 | |||
81 | inc = kmalloc(sizeof(struct ip_vs_app), GFP_KERNEL); | ||
82 | if (!inc) | ||
83 | return -ENOMEM; | ||
84 | memcpy(inc, app, sizeof(*inc)); | ||
85 | INIT_LIST_HEAD(&inc->p_list); | ||
86 | INIT_LIST_HEAD(&inc->incs_list); | ||
87 | inc->app = app; | ||
88 | inc->port = htons(port); | ||
89 | atomic_set(&inc->usecnt, 0); | ||
90 | |||
91 | if (app->timeouts) { | ||
92 | inc->timeout_table = | ||
93 | ip_vs_create_timeout_table(app->timeouts, | ||
94 | app->timeouts_size); | ||
95 | if (!inc->timeout_table) { | ||
96 | ret = -ENOMEM; | ||
97 | goto out; | ||
98 | } | ||
99 | } | ||
100 | |||
101 | ret = pp->register_app(inc); | ||
102 | if (ret) | ||
103 | goto out; | ||
104 | |||
105 | list_add(&inc->a_list, &app->incs_list); | ||
106 | IP_VS_DBG(9, "%s application %s:%u registered\n", | ||
107 | pp->name, inc->name, inc->port); | ||
108 | |||
109 | return 0; | ||
110 | |||
111 | out: | ||
112 | if (inc->timeout_table) | ||
113 | kfree(inc->timeout_table); | ||
114 | kfree(inc); | ||
115 | return ret; | ||
116 | } | ||
117 | |||
118 | |||
119 | /* | ||
120 | * Release app incarnation | ||
121 | */ | ||
122 | static void | ||
123 | ip_vs_app_inc_release(struct ip_vs_app *inc) | ||
124 | { | ||
125 | struct ip_vs_protocol *pp; | ||
126 | |||
127 | if (!(pp = ip_vs_proto_get(inc->protocol))) | ||
128 | return; | ||
129 | |||
130 | if (pp->unregister_app) | ||
131 | pp->unregister_app(inc); | ||
132 | |||
133 | IP_VS_DBG(9, "%s App %s:%u unregistered\n", | ||
134 | pp->name, inc->name, inc->port); | ||
135 | |||
136 | list_del(&inc->a_list); | ||
137 | |||
138 | if (inc->timeout_table != NULL) | ||
139 | kfree(inc->timeout_table); | ||
140 | kfree(inc); | ||
141 | } | ||
142 | |||
143 | |||
144 | /* | ||
145 | * Get reference to app inc (only called from softirq) | ||
146 | * | ||
147 | */ | ||
148 | int ip_vs_app_inc_get(struct ip_vs_app *inc) | ||
149 | { | ||
150 | int result; | ||
151 | |||
152 | atomic_inc(&inc->usecnt); | ||
153 | if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) | ||
154 | atomic_dec(&inc->usecnt); | ||
155 | return result; | ||
156 | } | ||
157 | |||
158 | |||
/*
 *	Put the app inc (only called from timer or net softirq)
 */
void ip_vs_app_inc_put(struct ip_vs_app *inc)
{
	/* drop the module pin first, then the incarnation use count */
	ip_vs_app_put(inc->app);
	atomic_dec(&inc->usecnt);
}
167 | |||
168 | |||
169 | /* | ||
170 | * Register an application incarnation in protocol applications | ||
171 | */ | ||
172 | int | ||
173 | register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) | ||
174 | { | ||
175 | int result; | ||
176 | |||
177 | down(&__ip_vs_app_mutex); | ||
178 | |||
179 | result = ip_vs_app_inc_new(app, proto, port); | ||
180 | |||
181 | up(&__ip_vs_app_mutex); | ||
182 | |||
183 | return result; | ||
184 | } | ||
185 | |||
186 | |||
187 | /* | ||
188 | * ip_vs_app registration routine | ||
189 | */ | ||
190 | int register_ip_vs_app(struct ip_vs_app *app) | ||
191 | { | ||
192 | /* increase the module use count */ | ||
193 | ip_vs_use_count_inc(); | ||
194 | |||
195 | down(&__ip_vs_app_mutex); | ||
196 | |||
197 | list_add(&app->a_list, &ip_vs_app_list); | ||
198 | |||
199 | up(&__ip_vs_app_mutex); | ||
200 | |||
201 | return 0; | ||
202 | } | ||
203 | |||
204 | |||
/*
 *	ip_vs_app unregistration routine
 *	We are sure there are no app incarnations attached to services
 */
void unregister_ip_vs_app(struct ip_vs_app *app)
{
	struct ip_vs_app *inc, *nxt;

	down(&__ip_vs_app_mutex);

	/* _safe iteration: ip_vs_app_inc_release() unlinks and frees each
	 * incarnation (incarnations hang off incs_list via their a_list) */
	list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
		ip_vs_app_inc_release(inc);
	}

	list_del(&app->a_list);

	up(&__ip_vs_app_mutex);

	/* decrease the module use count */
	ip_vs_use_count_dec();
}
226 | |||
227 | |||
#if 0000
/*
 *	Get reference to app by name (called from user context)
 *
 *	NOTE(review): this block is compiled out (#if 0000).  The original
 *	loop iterated with an undeclared variable ("ent") while testing
 *	"app", so it would not even compile if enabled; fixed to iterate
 *	"app" itself.
 */
struct ip_vs_app *ip_vs_app_get_by_name(char *appname)
{
	struct ip_vs_app *app, *a = NULL;

	down(&__ip_vs_app_mutex);

	list_for_each_entry(app, &ip_vs_app_list, a_list) {
		if (strcmp(app->name, appname))
			continue;

		/* softirq may call ip_vs_app_get too, so the caller
		   must disable softirq on the current CPU */
		if (ip_vs_app_get(app))
			a = app;
		break;
	}

	up(&__ip_vs_app_mutex);

	return a;
}
#endif
254 | |||
255 | |||
/*
 *	Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
 *	Delegates to the protocol's app_conn_bind hook and returns its
 *	result.
 */
int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
{
	return pp->app_conn_bind(cp);
}
263 | |||
264 | |||
/*
 *	Unbind cp from application incarnation (called by cp destructor)
 *	Invokes the incarnation's optional unbind_conn and done_conn hooks
 *	(in that order), drops the incarnation reference and clears cp->app.
 */
void ip_vs_unbind_app(struct ip_vs_conn *cp)
{
	struct ip_vs_app *inc = cp->app;

	if (!inc)
		return;

	if (inc->unbind_conn)
		inc->unbind_conn(inc, cp);
	if (inc->done_conn)
		inc->done_conn(inc, cp);
	ip_vs_app_inc_put(inc);
	cp->app = NULL;
}
282 | |||
283 | |||
/*
 *	Fixes th->seq based on ip_vs_seq info.
 */
static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
{
	__u32 seq = ntohl(th->seq);

	/*
	 *	Adjust seq with delta-offset for all packets after
	 *	the most recent resized pkt seq and with previous_delta offset
	 *	for all packets	before most recent resized pkt seq.
	 */
	if (vseq->delta || vseq->previous_delta) {
		if(after(seq, vseq->init_seq)) {
			/* packet is beyond the last resized one: current delta */
			th->seq = htonl(seq + vseq->delta);
			IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n",
				  vseq->delta);
		} else {
			/* retransmit from before the resize: older delta */
			th->seq = htonl(seq + vseq->previous_delta);
			IP_VS_DBG(9, "vs_fix_seq(): added previous_delta "
				  "(%d) to seq\n", vseq->previous_delta);
		}
	}
}
308 | |||
309 | |||
/*
 *	Fixes th->ack_seq based on ip_vs_seq info.
 *	Mirror of vs_fix_seq() for the reverse direction: the delta is
 *	subtracted rather than added.
 */
static inline void
vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
{
	__u32 ack_seq = ntohl(th->ack_seq);

	/*
	 * Adjust ack_seq with delta-offset for
	 * the packets AFTER most recent resized pkt has caused a shift
	 * for packets before most recent resized pkt, use previous_delta
	 */
	if (vseq->delta || vseq->previous_delta) {
		/* since ack_seq is the number of octet that is expected
		   to receive next, so compare it with init_seq+delta */
		if(after(ack_seq, vseq->init_seq+vseq->delta)) {
			th->ack_seq = htonl(ack_seq - vseq->delta);
			IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta "
				  "(%d) from ack_seq\n", vseq->delta);

		} else {
			th->ack_seq = htonl(ack_seq - vseq->previous_delta);
			IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted "
				  "previous_delta (%d) from ack_seq\n",
				  vseq->previous_delta);
		}
	}
}
339 | |||
340 | |||
/*
 *	Updates ip_vs_seq if pkt has been resized
 *	Assumes already checked proto==IPPROTO_TCP and diff!=0.
 *	Records the new resize point (seq) and accumulates the size delta;
 *	the previous delta is kept for packets that predate this resize.
 */
static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
				 unsigned flag, __u32 seq, int diff)
{
	/* spinlock is to keep updating cp->flags atomic */
	spin_lock(&cp->lock);
	/* first resize on this connection, or a resize past the last one */
	if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
		vseq->previous_delta = vseq->delta;
		vseq->delta += diff;
		vseq->init_seq = seq;
		cp->flags |= flag;
	}
	spin_unlock(&cp->lock);
}
358 | |||
/*
 * Run an outgoing TCP packet through the bound application helper:
 * repair seq/ack_seq if earlier packets were resized, call the helper's
 * pkt_out hook, and record any new size change.
 * Returns 0 (failure, e.g. skb not writable / helper OOM) or 1 (ok).
 */
static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb,
				  struct ip_vs_app *app)
{
	int diff;
	unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4;
	struct tcphdr *th;
	__u32 seq;

	/* we are about to modify the TCP header (and maybe payload) */
	if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
		return 0;

	th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset);

	/*
	 *	Remember seq number in case this pkt gets resized
	 */
	seq = ntohl(th->seq);

	/*
	 *	Fix seq stuff if flagged as so.
	 */
	if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
		vs_fix_seq(&cp->out_seq, th);
	if (cp->flags & IP_VS_CONN_F_IN_SEQ)
		vs_fix_ack_seq(&cp->in_seq, th);

	/*
	 *	Call private output hook function
	 */
	if (app->pkt_out == NULL)
		return 1;

	if (!app->pkt_out(app, cp, pskb, &diff))
		return 0;

	/*
	 *	Update ip_vs seq stuff if len has changed.
	 */
	if (diff != 0)
		vs_seq_update(cp, &cp->out_seq,
			      IP_VS_CONN_F_OUT_SEQ, seq, diff);

	return 1;
}
403 | |||
404 | /* | ||
405 | * Output pkt hook. Will call bound ip_vs_app specific function | ||
406 | * called by ipvs packet handler, assumes previously checked cp!=NULL | ||
407 | * returns false if it can't handle packet (oom) | ||
408 | */ | ||
409 | int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb) | ||
410 | { | ||
411 | struct ip_vs_app *app; | ||
412 | |||
413 | /* | ||
414 | * check if application module is bound to | ||
415 | * this ip_vs_conn. | ||
416 | */ | ||
417 | if ((app = cp->app) == NULL) | ||
418 | return 1; | ||
419 | |||
420 | /* TCP is complicated */ | ||
421 | if (cp->protocol == IPPROTO_TCP) | ||
422 | return app_tcp_pkt_out(cp, pskb, app); | ||
423 | |||
424 | /* | ||
425 | * Call private output hook function | ||
426 | */ | ||
427 | if (app->pkt_out == NULL) | ||
428 | return 1; | ||
429 | |||
430 | return app->pkt_out(app, cp, pskb, NULL); | ||
431 | } | ||
432 | |||
433 | |||
/*
 * Mirror of app_tcp_pkt_out() for the incoming direction: repair
 * seq/ack_seq (note the in/out roles are swapped), call the helper's
 * pkt_in hook, and record any new size change.
 * Returns 0 (failure) or 1 (ok).
 */
static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb,
				 struct ip_vs_app *app)
{
	int diff;
	unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4;
	struct tcphdr *th;
	__u32 seq;

	/* we are about to modify the TCP header (and maybe payload) */
	if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
		return 0;

	th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset);

	/*
	 *	Remember seq number in case this pkt gets resized
	 */
	seq = ntohl(th->seq);

	/*
	 *	Fix seq stuff if flagged as so.
	 */
	if (cp->flags & IP_VS_CONN_F_IN_SEQ)
		vs_fix_seq(&cp->in_seq, th);
	if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
		vs_fix_ack_seq(&cp->out_seq, th);

	/*
	 *	Call private input hook function
	 */
	if (app->pkt_in == NULL)
		return 1;

	if (!app->pkt_in(app, cp, pskb, &diff))
		return 0;

	/*
	 *	Update ip_vs seq stuff if len has changed.
	 */
	if (diff != 0)
		vs_seq_update(cp, &cp->in_seq,
			      IP_VS_CONN_F_IN_SEQ, seq, diff);

	return 1;
}
478 | |||
479 | /* | ||
480 | * Input pkt hook. Will call bound ip_vs_app specific function | ||
481 | * called by ipvs packet handler, assumes previously checked cp!=NULL. | ||
482 | * returns false if can't handle packet (oom). | ||
483 | */ | ||
484 | int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb) | ||
485 | { | ||
486 | struct ip_vs_app *app; | ||
487 | |||
488 | /* | ||
489 | * check if application module is bound to | ||
490 | * this ip_vs_conn. | ||
491 | */ | ||
492 | if ((app = cp->app) == NULL) | ||
493 | return 1; | ||
494 | |||
495 | /* TCP is complicated */ | ||
496 | if (cp->protocol == IPPROTO_TCP) | ||
497 | return app_tcp_pkt_in(cp, pskb, app); | ||
498 | |||
499 | /* | ||
500 | * Call private input hook function | ||
501 | */ | ||
502 | if (app->pkt_in == NULL) | ||
503 | return 1; | ||
504 | |||
505 | return app->pkt_in(app, cp, pskb, NULL); | ||
506 | } | ||
507 | |||
508 | |||
#ifdef CONFIG_PROC_FS
/*
 *	/proc/net/ip_vs_app entry function
 */

/*
 * Return the pos'th app incarnation, counting across all registered
 * applications in list order, or NULL when pos runs past the end.
 * Caller must hold __ip_vs_app_mutex.
 */
static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
{
	struct ip_vs_app *app, *inc;

	list_for_each_entry(app, &ip_vs_app_list, a_list) {
		list_for_each_entry(inc, &app->incs_list, a_list) {
			if (pos-- == 0)
				return inc;
		}
	}
	return NULL;

}
527 | |||
528 | static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) | ||
529 | { | ||
530 | down(&__ip_vs_app_mutex); | ||
531 | |||
532 | return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; | ||
533 | } | ||
534 | |||
/*
 * seq_file ->next: advance from the header token to the first
 * incarnation, or from one incarnation to the next (within the same
 * application, then across applications).  Mutex held by ->start.
 */
static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ip_vs_app *inc, *app;
	struct list_head *e;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ip_vs_app_idx(0);

	inc = v;
	app = inc->app;

	/* next incarnation of the same application, if any */
	if ((e = inc->a_list.next) != &app->incs_list)
		return list_entry(e, struct ip_vs_app, a_list);

	/* go on to next application */
	for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
		app = list_entry(e, struct ip_vs_app, a_list);
		/* the inner loop runs at most once: it just returns the
		 * first incarnation of the first app that has one */
		list_for_each_entry(inc, &app->incs_list, a_list) {
			return inc;
		}
	}
	return NULL;
}
559 | |||
/* seq_file ->stop: release the mutex taken in ip_vs_app_seq_start() */
static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
{
	up(&__ip_vs_app_mutex);
}
564 | |||
565 | static int ip_vs_app_seq_show(struct seq_file *seq, void *v) | ||
566 | { | ||
567 | if (v == SEQ_START_TOKEN) | ||
568 | seq_puts(seq, "prot port usecnt name\n"); | ||
569 | else { | ||
570 | const struct ip_vs_app *inc = v; | ||
571 | |||
572 | seq_printf(seq, "%-3s %-7u %-6d %-17s\n", | ||
573 | ip_vs_proto_name(inc->protocol), | ||
574 | ntohs(inc->port), | ||
575 | atomic_read(&inc->usecnt), | ||
576 | inc->name); | ||
577 | } | ||
578 | return 0; | ||
579 | } | ||
580 | |||
/* seq_file operations for /proc/net/ip_vs_app */
static struct seq_operations ip_vs_app_seq_ops = {
	.start = ip_vs_app_seq_start,
	.next  = ip_vs_app_seq_next,
	.stop  = ip_vs_app_seq_stop,
	.show  = ip_vs_app_seq_show,
};
587 | |||
/* open handler for /proc/net/ip_vs_app */
static int ip_vs_app_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &ip_vs_app_seq_ops);
}
592 | |||
/* file operations for /proc/net/ip_vs_app (read-only seq_file) */
static struct file_operations ip_vs_app_fops = {
	.owner	 = THIS_MODULE,
	.open	 = ip_vs_app_open,
	.read	 = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
#endif
601 | |||
602 | |||
/*
 *	Replace a segment of data with a new segment
 *	@skb:   packet to mangle
 *	@pri:   allocation priority (GFP flags) for a possible expansion
 *	@o_buf: start of the old segment, pointing into skb->data
 *	@o_len: length of the old segment
 *	@n_buf: caller-owned replacement data
 *	@n_len: length of the replacement
 *	Returns 0 on success, -ENOMEM if the skb could not be grown.
 */
int ip_vs_skb_replace(struct sk_buff *skb, int pri,
		      char *o_buf, int o_len, char *n_buf, int n_len)
{
	struct iphdr *iph;
	int diff;
	int o_offset;
	int o_left;

	EnterFunction(9);

	diff = n_len - o_len;
	o_offset = o_buf - (char *)skb->data;
	/* The length of left data after o_buf+o_len in the skb data */
	o_left = skb->len - (o_offset + o_len);

	if (diff <= 0) {
		/* same size or shrinking: shift the tail, copy, trim */
		memmove(o_buf + n_len, o_buf + o_len, o_left);
		memcpy(o_buf, n_buf, n_len);
		skb_trim(skb, skb->len + diff);
	} else if (diff <= skb_tailroom(skb)) {
		/* growing, but the extra bytes fit in the tailroom */
		skb_put(skb, diff);
		memmove(o_buf + n_len, o_buf + o_len, o_left);
		memcpy(o_buf, n_buf, n_len);
	} else {
		/* growing beyond tailroom: reallocate the data area.
		 * o_buf is dangling afterwards, so offsets are used. */
		if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
			return -ENOMEM;
		skb_put(skb, diff);
		memmove(skb->data + o_offset + n_len,
			skb->data + o_offset + o_len, o_left);
		memcpy(skb->data + o_offset, n_buf, n_len);
	}

	/* must update the iph total length here */
	iph = skb->nh.iph;
	iph->tot_len = htons(skb->len);

	LeaveFunction(9);
	return 0;
}
645 | |||
646 | |||
/* Subsystem init: create the /proc/net/ip_vs_app entry.  Always 0. */
int ip_vs_app_init(void)
{
	/* we will replace it with proc_net_ipvs_create() soon */
	proc_net_fops_create("ip_vs_app", 0, &ip_vs_app_fops);
	return 0;
}
653 | |||
654 | |||
/* Subsystem teardown: remove the /proc/net/ip_vs_app entry. */
void ip_vs_app_cleanup(void)
{
	proc_net_remove("ip_vs_app");
}
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c new file mode 100644 index 000000000000..fd6feb5499fe --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_conn.c | |||
@@ -0,0 +1,920 @@ | |||
1 | /* | ||
2 | * IPVS An implementation of the IP virtual server support for the | ||
3 | * LINUX operating system. IPVS is now implemented as a module | ||
4 | * over the Netfilter framework. IPVS can be used to build a | ||
5 | * high-performance and highly available server based on a | ||
6 | * cluster of servers. | ||
7 | * | ||
8 | * Version: $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $ | ||
9 | * | ||
10 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
11 | * Peter Kese <peter.kese@ijs.si> | ||
12 | * Julian Anastasov <ja@ssi.bg> | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or | ||
15 | * modify it under the terms of the GNU General Public License | ||
16 | * as published by the Free Software Foundation; either version | ||
17 | * 2 of the License, or (at your option) any later version. | ||
18 | * | ||
19 | * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, | ||
20 | * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms | ||
 * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
22 | * | ||
23 | * Changes: | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | #include <linux/kernel.h> | ||
28 | #include <linux/vmalloc.h> | ||
29 | #include <linux/proc_fs.h> /* for proc_net_* */ | ||
30 | #include <linux/seq_file.h> | ||
31 | #include <linux/jhash.h> | ||
32 | #include <linux/random.h> | ||
33 | |||
34 | #include <net/ip_vs.h> | ||
35 | |||
36 | |||
/*
 *  Connection hash table: for input and output packets lookups of IPVS
 */
static struct list_head *ip_vs_conn_tab;

/* SLAB cache for IPVS connections */
static kmem_cache_t *ip_vs_conn_cachep;

/* counter for current IPVS connections */
static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);

/* counter for no client port connections */
static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);

/* random value for IPVS connection hash, seeded in ip_vs_conn_init() */
static unsigned int ip_vs_conn_rnd;

/*
 *  Fine locking granularity for big connection hash table
 */
#define CT_LOCKARRAY_BITS  4
#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)

/* One rwlock padded out to a full cache line, so locks for different
   buckets do not share a line between CPUs */
struct ip_vs_aligned_lock
{
	rwlock_t	l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));

/* lock array for conn table: the hash buckets share CT_LOCKARRAY_SIZE
   (16) locks, selected by the low CT_LOCKARRAY_BITS of the hash key */
static struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
69 | |||
/*
 * Map a hash key to its lock in __ip_vs_conntbl_lock_array and take or
 * release it.  The _bh variants additionally disable bottom halves and
 * are used where the caller may race with softirq context (the seq_file
 * readers use ct_read_lock_bh, ip_vs_conn_flush uses ct_write_lock_bh).
 */
static inline void ct_read_lock(unsigned key)
{
	read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_read_unlock(unsigned key)
{
	read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_lock(unsigned key)
{
	write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_unlock(unsigned key)
{
	write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_read_lock_bh(unsigned key)
{
	read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_read_unlock_bh(unsigned key)
{
	read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_lock_bh(unsigned key)
{
	write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_unlock_bh(unsigned key)
{
	write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
109 | |||
110 | |||
111 | /* | ||
112 | * Returns hash value for IPVS connection entry | ||
113 | */ | ||
114 | static unsigned int ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port) | ||
115 | { | ||
116 | return jhash_3words(addr, port, proto, ip_vs_conn_rnd) | ||
117 | & IP_VS_CONN_TAB_MASK; | ||
118 | } | ||
119 | |||
120 | |||
/*
 *	Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
 *	returns bool success (0 if the entry was already hashed).
 */
static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
{
	unsigned hash;
	int ret;

	/* Hash by protocol, client address and port */
	hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);

	ct_write_lock(hash);

	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
		list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
		cp->flags |= IP_VS_CONN_F_HASHED;
		/* the table itself holds a reference on the entry */
		atomic_inc(&cp->refcnt);
		ret = 1;
	} else {
		/* hashing an already-hashed entry is a caller bug */
		IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
			  "called from %p\n", __builtin_return_address(0));
		ret = 0;
	}

	ct_write_unlock(hash);

	return ret;
}
150 | |||
151 | |||
/*
 *	UNhashes ip_vs_conn from ip_vs_conn_tab.
 *	returns bool success (0 if the entry was not hashed).
 */
static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
{
	unsigned hash;
	int ret;

	/* unhash it and decrease its reference counter */
	hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);

	ct_write_lock(hash);

	if (cp->flags & IP_VS_CONN_F_HASHED) {
		list_del(&cp->c_list);
		cp->flags &= ~IP_VS_CONN_F_HASHED;
		/* drop the reference held by the table */
		atomic_dec(&cp->refcnt);
		ret = 1;
	} else
		ret = 0;

	ct_write_unlock(hash);

	return ret;
}
178 | |||
179 | |||
180 | /* | ||
181 | * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. | ||
182 | * Called for pkts coming from OUTside-to-INside. | ||
183 | * s_addr, s_port: pkt source address (foreign host) | ||
184 | * d_addr, d_port: pkt dest address (load balancer) | ||
185 | */ | ||
186 | static inline struct ip_vs_conn *__ip_vs_conn_in_get | ||
187 | (int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) | ||
188 | { | ||
189 | unsigned hash; | ||
190 | struct ip_vs_conn *cp; | ||
191 | |||
192 | hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); | ||
193 | |||
194 | ct_read_lock(hash); | ||
195 | |||
196 | list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { | ||
197 | if (s_addr==cp->caddr && s_port==cp->cport && | ||
198 | d_port==cp->vport && d_addr==cp->vaddr && | ||
199 | protocol==cp->protocol) { | ||
200 | /* HIT */ | ||
201 | atomic_inc(&cp->refcnt); | ||
202 | ct_read_unlock(hash); | ||
203 | return cp; | ||
204 | } | ||
205 | } | ||
206 | |||
207 | ct_read_unlock(hash); | ||
208 | |||
209 | return NULL; | ||
210 | } | ||
211 | |||
/*
 *	Public inside-lookup wrapper: if the exact 5-tuple misses and
 *	connections without a known client port exist, retry the lookup
 *	with cport 0.
 */
struct ip_vs_conn *ip_vs_conn_in_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
	struct ip_vs_conn *cp;

	cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
		cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);

	IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
		  ip_vs_proto_name(protocol),
		  NIPQUAD(s_addr), ntohs(s_port),
		  NIPQUAD(d_addr), ntohs(d_port),
		  cp?"hit":"not hit");

	return cp;
}
229 | |||
230 | |||
231 | /* | ||
232 | * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. | ||
233 | * Called for pkts coming from inside-to-OUTside. | ||
234 | * s_addr, s_port: pkt source address (inside host) | ||
235 | * d_addr, d_port: pkt dest address (foreign host) | ||
236 | */ | ||
237 | struct ip_vs_conn *ip_vs_conn_out_get | ||
238 | (int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) | ||
239 | { | ||
240 | unsigned hash; | ||
241 | struct ip_vs_conn *cp, *ret=NULL; | ||
242 | |||
243 | /* | ||
244 | * Check for "full" addressed entries | ||
245 | */ | ||
246 | hash = ip_vs_conn_hashkey(protocol, d_addr, d_port); | ||
247 | |||
248 | ct_read_lock(hash); | ||
249 | |||
250 | list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { | ||
251 | if (d_addr == cp->caddr && d_port == cp->cport && | ||
252 | s_port == cp->dport && s_addr == cp->daddr && | ||
253 | protocol == cp->protocol) { | ||
254 | /* HIT */ | ||
255 | atomic_inc(&cp->refcnt); | ||
256 | ret = cp; | ||
257 | break; | ||
258 | } | ||
259 | } | ||
260 | |||
261 | ct_read_unlock(hash); | ||
262 | |||
263 | IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", | ||
264 | ip_vs_proto_name(protocol), | ||
265 | NIPQUAD(s_addr), ntohs(s_port), | ||
266 | NIPQUAD(d_addr), ntohs(d_port), | ||
267 | ret?"hit":"not hit"); | ||
268 | |||
269 | return ret; | ||
270 | } | ||
271 | |||
272 | |||
/*
 *	Put back the conn and restart its timer with its timeout:
 *	the entry gets another cp->timeout jiffies to live before
 *	ip_vs_conn_expire() fires.
 */
void ip_vs_conn_put(struct ip_vs_conn *cp)
{
	/* reset it expire in its timeout */
	mod_timer(&cp->timer, jiffies+cp->timeout);

	__ip_vs_conn_put(cp);
}
283 | |||
284 | |||
/*
 *	Fill a no_client_port connection with a client port number:
 *	unhash the entry (hashed by cport 0), record the now-known client
 *	port under the per-connection spinlock, and rehash on the new key.
 */
void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __u16 cport)
{
	if (ip_vs_conn_unhash(cp)) {
		spin_lock(&cp->lock);
		if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
			atomic_dec(&ip_vs_conn_no_cport_cnt);
			cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
			cp->cport = cport;
		}
		spin_unlock(&cp->lock);

		/* hash on new cport */
		ip_vs_conn_hash(cp);
	}
}
303 | |||
304 | |||
/*
 *	Bind a connection entry with the corresponding packet_xmit.
 *	Called by ip_vs_conn_new.  If the forwarding method matches none
 *	of the cases, packet_xmit keeps the NULL left by the memset in
 *	ip_vs_conn_new.
 */
static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
{
	switch (IP_VS_FWD_METHOD(cp)) {
	case IP_VS_CONN_F_MASQ:
		cp->packet_xmit = ip_vs_nat_xmit;
		break;

	case IP_VS_CONN_F_TUNNEL:
		cp->packet_xmit = ip_vs_tunnel_xmit;
		break;

	case IP_VS_CONN_F_DROUTE:
		cp->packet_xmit = ip_vs_dr_xmit;
		break;

	case IP_VS_CONN_F_LOCALNODE:
		cp->packet_xmit = ip_vs_null_xmit;
		break;

	case IP_VS_CONN_F_BYPASS:
		cp->packet_xmit = ip_vs_bypass_xmit;
		break;
	}
}
333 | |||
334 | |||
335 | static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) | ||
336 | { | ||
337 | return atomic_read(&dest->activeconns) | ||
338 | + atomic_read(&dest->inactconns); | ||
339 | } | ||
340 | |||
/*
 *	Bind a connection entry with a virtual service destination
 *	Called just after a new connection entry is created.
 */
static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
	/* if dest is NULL, then return directly */
	if (!dest)
		return;

	/* Increase the refcnt counter of the dest */
	atomic_inc(&dest->refcnt);

	/* Bind with the destination and its corresponding transmitter */
	cp->flags |= atomic_read(&dest->conn_flags);
	cp->dest = dest;

	IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
		  "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
		  ip_vs_proto_name(cp->protocol),
		  NIPQUAD(cp->caddr), ntohs(cp->cport),
		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
		  NIPQUAD(cp->daddr), ntohs(cp->dport),
		  ip_vs_fwd_tag(cp), cp->state,
		  cp->flags, atomic_read(&cp->refcnt),
		  atomic_read(&dest->refcnt));

	/* Update the connection counters */
	if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
		/* It is a normal connection, so increase the inactive
		   connection counter because it is in TCP SYNRECV
		   state (inactive) or other protocol inactive state */
		atomic_inc(&dest->inactconns);
	} else {
		/* It is a persistent connection/template, so increase
		   the persistent connection counter */
		atomic_inc(&dest->persistconns);
	}

	/* Mark the destination overloaded once the upper connection
	   threshold is reached (0 disables the threshold) */
	if (dest->u_threshold != 0 &&
	    ip_vs_dest_totalconns(dest) >= dest->u_threshold)
		dest->flags |= IP_VS_DEST_F_OVERLOAD;
}
385 | |||
386 | |||
/*
 *	Unbind a connection entry with its VS destination
 *	Called by the ip_vs_conn_expire function.
 */
static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
{
	struct ip_vs_dest *dest = cp->dest;

	if (!dest)
		return;

	IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
		  "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
		  ip_vs_proto_name(cp->protocol),
		  NIPQUAD(cp->caddr), ntohs(cp->cport),
		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
		  NIPQUAD(cp->daddr), ntohs(cp->dport),
		  ip_vs_fwd_tag(cp), cp->state,
		  cp->flags, atomic_read(&cp->refcnt),
		  atomic_read(&dest->refcnt));

	/* Update the connection counters */
	if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
		/* It is a normal connection, so decrease the inactconns
		   or activeconns counter */
		if (cp->flags & IP_VS_CONN_F_INACTIVE) {
			atomic_dec(&dest->inactconns);
		} else {
			atomic_dec(&dest->activeconns);
		}
	} else {
		/* It is a persistent connection/template, so decrease
		   the persistent connection counter */
		atomic_dec(&dest->persistconns);
	}

	/* Clear the overload flag with hysteresis: below l_threshold if
	   set, otherwise below 3/4 of u_threshold if set, otherwise
	   unconditionally */
	if (dest->l_threshold != 0) {
		if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	} else if (dest->u_threshold != 0) {
		if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	} else {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	}

	/*
	 * Simply decrease the refcnt of the dest, because the
	 * dest will be either in service's destination list
	 * or in the trash.
	 */
	atomic_dec(&dest->refcnt);
}
441 | |||
442 | |||
/*
 *	Checking if the destination of a connection template is available.
 *	If available, return 1, otherwise invalidate this connection
 *	template and return 0.
 */
int ip_vs_check_template(struct ip_vs_conn *ct)
{
	struct ip_vs_dest *dest = ct->dest;

	/*
	 * Checking the dest server status: unavailable when there is no
	 * dest, the dest is down, or quiescent (weight 0) templates are
	 * configured to expire via the sysctl.
	 */
	if ((dest == NULL) ||
	    !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
	    (sysctl_ip_vs_expire_quiescent_template &&
	     (atomic_read(&dest->weight) == 0))) {
		IP_VS_DBG(9, "check_template: dest not available for "
			  "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
			  "-> d:%u.%u.%u.%u:%d\n",
			  ip_vs_proto_name(ct->protocol),
			  NIPQUAD(ct->caddr), ntohs(ct->cport),
			  NIPQUAD(ct->vaddr), ntohs(ct->vport),
			  NIPQUAD(ct->daddr), ntohs(ct->dport));

		/*
		 * Invalidate the connection template by rehashing it
		 * under port values no real lookup can match
		 */
		if (ct->cport) {
			if (ip_vs_conn_unhash(ct)) {
				ct->dport = 65535;
				ct->vport = 65535;
				ct->cport = 0;
				ip_vs_conn_hash(ct);
			}
		}

		/*
		 * Simply decrease the refcnt of the template,
		 * don't restart its timer.
		 */
		atomic_dec(&ct->refcnt);
		return 0;
	}
	return 1;
}
488 | |||
/*
 *	Timer handler: expire a connection entry.  The entry is freed only
 *	if nobody else references it and it controls no other connections;
 *	otherwise it is rehashed (if needed) and retried 60 seconds later.
 */
static void ip_vs_conn_expire(unsigned long data)
{
	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;

	/* retry delay: ip_vs_conn_put below re-arms the timer with
	   cp->timeout on the expire_later path */
	cp->timeout = 60*HZ;

	/*
	 *	hey, I'm using it
	 */
	atomic_inc(&cp->refcnt);

	/*
	 *	do I control anybody?
	 */
	if (atomic_read(&cp->n_control))
		goto expire_later;

	/*
	 *	unhash it if it is hashed in the conn table
	 */
	if (!ip_vs_conn_unhash(cp))
		goto expire_later;

	/*
	 *	refcnt==1 implies we are the only referrer
	 */
	if (likely(atomic_read(&cp->refcnt) == 1)) {
		/* delete the timer if it is activated by other users */
		if (timer_pending(&cp->timer))
			del_timer(&cp->timer);

		/* does anybody control me? */
		if (cp->control)
			ip_vs_control_del(cp);

		if (unlikely(cp->app != NULL))
			ip_vs_unbind_app(cp);
		ip_vs_unbind_dest(cp);
		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
			atomic_dec(&ip_vs_conn_no_cport_cnt);
		atomic_dec(&ip_vs_conn_count);

		kmem_cache_free(ip_vs_conn_cachep, cp);
		return;
	}

	/* hash it back to the table */
	ip_vs_conn_hash(cp);

  expire_later:
	IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
		  atomic_read(&cp->refcnt)-1,
		  atomic_read(&cp->n_control));

	ip_vs_conn_put(cp);
}
545 | |||
546 | |||
/*
 *	Expire a connection immediately: if its timer is pending, make it
 *	fire now, then drop the caller's reference.
 */
void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
	if (del_timer(&cp->timer))
		mod_timer(&cp->timer, jiffies);
	__ip_vs_conn_put(cp);
}
553 | |||
554 | |||
/*
 *	Create a new connection entry and hash it into the ip_vs_conn_tab.
 *	caddr/cport: client, vaddr/vport: virtual service,
 *	daddr/dport: chosen destination (dest may be NULL).
 *	Returns the new entry with refcnt 1 held by the caller,
 *	or NULL if the slab allocation fails.
 */
struct ip_vs_conn *
ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
	       __u32 daddr, __u16 dport, unsigned flags,
	       struct ip_vs_dest *dest)
{
	struct ip_vs_conn *cp;
	struct ip_vs_protocol *pp = ip_vs_proto_get(proto);

	cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
	if (cp == NULL) {
		IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
		return NULL;
	}

	memset(cp, 0, sizeof(*cp));
	INIT_LIST_HEAD(&cp->c_list);
	init_timer(&cp->timer);
	cp->timer.data = (unsigned long)cp;
	cp->timer.function = ip_vs_conn_expire;
	cp->protocol = proto;
	cp->caddr = caddr;
	cp->cport = cport;
	cp->vaddr = vaddr;
	cp->vport = vport;
	cp->daddr = daddr;
	cp->dport = dport;
	cp->flags = flags;
	spin_lock_init(&cp->lock);

	/*
	 * Set the entry is referenced by the current thread before hashing
	 * it in the table, so that other thread run ip_vs_random_dropentry
	 * but cannot drop this entry.
	 */
	atomic_set(&cp->refcnt, 1);

	atomic_set(&cp->n_control, 0);
	atomic_set(&cp->in_pkts, 0);

	atomic_inc(&ip_vs_conn_count);
	if (flags & IP_VS_CONN_F_NO_CPORT)
		atomic_inc(&ip_vs_conn_no_cport_cnt);

	/* Bind the connection with a destination server */
	ip_vs_bind_dest(cp, dest);

	/* Set its state and timeout.  NOTE(review): the 3*HZ initial
	   timeout is presumably replaced by the protocol state machine
	   on the first packet — confirm in the protocol handlers. */
	cp->state = 0;
	cp->timeout = 3*HZ;

	/* Bind its packet transmitter */
	ip_vs_bind_xmit(cp);

	/* only bind an application helper if the protocol has any */
	if (unlikely(pp && atomic_read(&pp->appcnt)))
		ip_vs_bind_app(cp, pp);

	/* Hash it in the ip_vs_conn_tab finally */
	ip_vs_conn_hash(cp);

	return cp;
}
619 | |||
620 | |||
621 | /* | ||
622 | * /proc/net/ip_vs_conn entries | ||
623 | */ | ||
624 | #ifdef CONFIG_PROC_FS | ||
625 | |||
/*
 *	Find the pos-th connection in the table for the seq_file iterator.
 *	On success, returns the entry with its bucket still read-locked
 *	(bh disabled) and seq->private pointing at the bucket head; the
 *	lock is released later by _next/_stop.  Returns NULL with no lock
 *	held when pos is past the end.
 */
static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
{
	int idx;
	struct ip_vs_conn *cp;

	for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
		ct_read_lock_bh(idx);
		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
			if (pos-- == 0) {
				seq->private = &ip_vs_conn_tab[idx];
				return cp;
			}
		}
		ct_read_unlock_bh(idx);
	}

	return NULL;
}
644 | |||
645 | static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) | ||
646 | { | ||
647 | seq->private = NULL; | ||
648 | return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; | ||
649 | } | ||
650 | |||
/*
 *	Advance the iterator: walk the rest of the current bucket first;
 *	when a bucket is exhausted, release its lock before locking and
 *	entering the next non-empty bucket.
 */
static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ip_vs_conn *cp = v;
	struct list_head *e, *l = seq->private;
	int idx;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ip_vs_conn_array(seq, 0);

	/* more on same hash chain? */
	if ((e = cp->c_list.next) != l)
		return list_entry(e, struct ip_vs_conn, c_list);

	/* current bucket exhausted: drop its lock, scan onwards */
	idx = l - ip_vs_conn_tab;
	ct_read_unlock_bh(idx);

	while (++idx < IP_VS_CONN_TAB_SIZE) {
		ct_read_lock_bh(idx);
		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
			/* non-empty bucket found: keep its lock held */
			seq->private = &ip_vs_conn_tab[idx];
			return cp;
		}
		ct_read_unlock_bh(idx);
	}
	seq->private = NULL;
	return NULL;
}
679 | |||
/*
 *	Release the bucket lock left held by _start/_next, if any
 *	(seq->private is NULL when no bucket is locked).
 */
static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
{
	struct list_head *l = seq->private;

	if (l)
		ct_read_unlock_bh(l - ip_vs_conn_tab);
}
687 | |||
/*
 *	Emit one row of /proc/net/ip_vs_conn: the column header for the
 *	start token, otherwise one connection with hex addresses/ports,
 *	its state name, and remaining expiry time in seconds.
 */
static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq,
   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires\n");
	else {
		const struct ip_vs_conn *cp = v;

		seq_printf(seq,
			"%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n",
				ip_vs_proto_name(cp->protocol),
				ntohl(cp->caddr), ntohs(cp->cport),
				ntohl(cp->vaddr), ntohs(cp->vport),
				ntohl(cp->daddr), ntohs(cp->dport),
				ip_vs_state_name(cp->protocol, cp->state),
				(cp->timer.expires-jiffies)/HZ);
	}
	return 0;
}
708 | |||
/* seq_file iterator operations for /proc/net/ip_vs_conn */
static struct seq_operations ip_vs_conn_seq_ops = {
	.start = ip_vs_conn_seq_start,
	.next  = ip_vs_conn_seq_next,
	.stop  = ip_vs_conn_seq_stop,
	.show  = ip_vs_conn_seq_show,
};

/* open: plain seq_open, no per-open private state is allocated */
static int ip_vs_conn_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &ip_vs_conn_seq_ops);
}

/* file operations for /proc/net/ip_vs_conn, registered in
   ip_vs_conn_init() */
static struct file_operations ip_vs_conn_fops = {
	.owner	 = THIS_MODULE,
	.open    = ip_vs_conn_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
#endif
729 | |||
730 | |||
731 | /* | ||
732 | * Randomly drop connection entries before running out of memory | ||
733 | */ | ||
734 | static inline int todrop_entry(struct ip_vs_conn *cp) | ||
735 | { | ||
736 | /* | ||
737 | * The drop rate array needs tuning for real environments. | ||
738 | * Called from timer bh only => no locking | ||
739 | */ | ||
740 | static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; | ||
741 | static char todrop_counter[9] = {0}; | ||
742 | int i; | ||
743 | |||
744 | /* if the conn entry hasn't lasted for 60 seconds, don't drop it. | ||
745 | This will leave enough time for normal connection to get | ||
746 | through. */ | ||
747 | if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) | ||
748 | return 0; | ||
749 | |||
750 | /* Don't drop the entry if its number of incoming packets is not | ||
751 | located in [0, 8] */ | ||
752 | i = atomic_read(&cp->in_pkts); | ||
753 | if (i > 8 || i < 0) return 0; | ||
754 | |||
755 | if (!todrop_rate[i]) return 0; | ||
756 | if (--todrop_counter[i] > 0) return 0; | ||
757 | |||
758 | todrop_counter[i] = todrop_rate[i]; | ||
759 | return 1; | ||
760 | } | ||
761 | |||
762 | |||
/*
 *	Drop random connection entries to relieve memory pressure.
 *	Scans a random 1/32 of the table per call; TCP entries are only
 *	dropped in SYN_RECV/SYNACK (always candidates) or ESTABLISHED
 *	(subject to todrop_entry); other protocols go through
 *	todrop_entry directly.  Connection templates are skipped.
 */
void ip_vs_random_dropentry(void)
{
	int idx;
	struct ip_vs_conn *cp;
	struct ip_vs_conn *ct;

	/*
	 * Randomly scan 1/32 of the whole table every second
	 */
	for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
		unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;

		/*
		 *  Lock is actually needed in this loop.
		 */
		ct_write_lock(hash);

		list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
			if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
				/* connection template */
				continue;

			if (cp->protocol == IPPROTO_TCP) {
				switch(cp->state) {
				case IP_VS_TCP_S_SYN_RECV:
				case IP_VS_TCP_S_SYNACK:
					break;

				case IP_VS_TCP_S_ESTABLISHED:
					if (todrop_entry(cp))
						break;
					continue;

				default:
					continue;
				}
			} else {
				if (!todrop_entry(cp))
					continue;
			}

			/*
			 * Drop the entry, and drop its ct if not referenced.
			 * The refcnt taken here keeps cp alive while the
			 * bucket lock is dropped around expire_now.
			 * NOTE(review): continuing list_for_each_entry after
			 * relocking assumes cp stays on this chain — verify
			 * against concurrent unhash.
			 */
			atomic_inc(&cp->refcnt);
			ct_write_unlock(hash);

			if ((ct = cp->control))
				atomic_inc(&ct->refcnt);
			IP_VS_DBG(4, "del connection\n");
			ip_vs_conn_expire_now(cp);
			if (ct) {
				IP_VS_DBG(4, "del conn template\n");
				ip_vs_conn_expire_now(ct);
			}
			ct_write_lock(hash);
		}
		ct_write_unlock(hash);
	}
}
823 | |||
824 | |||
/*
 *	Flush all the connection entries in the ip_vs_conn_tab.
 *	Loops (yielding via schedule) until every entry has actually been
 *	freed, since some may still be referenced or in timer handlers.
 */
static void ip_vs_conn_flush(void)
{
	int idx;
	struct ip_vs_conn *cp;
	struct ip_vs_conn *ct;

  flush_again:
	for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
		/*
		 *  Lock is actually needed in this loop.
		 */
		ct_write_lock_bh(idx);

		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
			/* hold a reference so the entry survives while
			   the bucket lock is dropped around expire_now */
			atomic_inc(&cp->refcnt);
			ct_write_unlock(idx);

			if ((ct = cp->control))
				atomic_inc(&ct->refcnt);
			IP_VS_DBG(4, "del connection\n");
			ip_vs_conn_expire_now(cp);
			if (ct) {
				IP_VS_DBG(4, "del conn template\n");
				ip_vs_conn_expire_now(ct);
			}
			ct_write_lock(idx);
		}
		ct_write_unlock_bh(idx);
	}

	/* the counter may be not NULL, because maybe some conn entries
	   are run by slow timer handler or unhashed but still referred */
	if (atomic_read(&ip_vs_conn_count) != 0) {
		schedule();
		goto flush_again;
	}
}
865 | |||
866 | |||
/*
 *	Module init: allocate the connection hash table and slab cache,
 *	initialize bucket heads and bucket locks, register
 *	/proc/net/ip_vs_conn, and seed the hash with a random value.
 *	Returns 0 on success, -ENOMEM on allocation failure.
 */
int ip_vs_conn_init(void)
{
	int idx;

	/*
	 * Allocate the connection hash table and initialize its list heads
	 */
	ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
	if (!ip_vs_conn_tab)
		return -ENOMEM;

	/* Allocate ip_vs_conn slab cache */
	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
					      sizeof(struct ip_vs_conn), 0,
					      SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!ip_vs_conn_cachep) {
		/* undo the vmalloc before failing */
		vfree(ip_vs_conn_tab);
		return -ENOMEM;
	}

	IP_VS_INFO("Connection hash table configured "
		   "(size=%d, memory=%ldKbytes)\n",
		   IP_VS_CONN_TAB_SIZE,
		   (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
	IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
		  sizeof(struct ip_vs_conn));

	for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
		INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
	}

	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
		rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
	}

	proc_net_fops_create("ip_vs_conn", 0, &ip_vs_conn_fops);

	/* calculate the random value for connection hash */
	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));

	return 0;
}
909 | |||
910 | |||
/*
 *	Module cleanup: release everything ip_vs_conn_init() set up.
 *	The flush must run first so the slab cache is empty when destroyed.
 */
void ip_vs_conn_cleanup(void)
{
	/* flush all the connection entries first */
	ip_vs_conn_flush();

	/* Release the empty cache */
	kmem_cache_destroy(ip_vs_conn_cachep);
	proc_net_remove("ip_vs_conn");
	vfree(ip_vs_conn_tab);
}
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c new file mode 100644 index 000000000000..5fb257dd07cb --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_core.c | |||
@@ -0,0 +1,1191 @@ | |||
1 | /* | ||
2 | * IPVS An implementation of the IP virtual server support for the | ||
3 | * LINUX operating system. IPVS is now implemented as a module | ||
4 | * over the Netfilter framework. IPVS can be used to build a | ||
5 | * high-performance and highly available server based on a | ||
6 | * cluster of servers. | ||
7 | * | ||
8 | * Version: $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $ | ||
9 | * | ||
10 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
11 | * Peter Kese <peter.kese@ijs.si> | ||
12 | * Julian Anastasov <ja@ssi.bg> | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or | ||
15 | * modify it under the terms of the GNU General Public License | ||
16 | * as published by the Free Software Foundation; either version | ||
17 | * 2 of the License, or (at your option) any later version. | ||
18 | * | ||
19 | * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, | ||
20 | * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms | ||
21 | * and others. | ||
22 | * | ||
23 | * Changes: | ||
24 | * Paul `Rusty' Russell properly handle non-linear skbs | ||
25 | * | ||
26 | */ | ||
27 | |||
28 | #include <linux/module.h> | ||
29 | #include <linux/kernel.h> | ||
30 | #include <linux/ip.h> | ||
31 | #include <linux/tcp.h> | ||
32 | #include <linux/icmp.h> | ||
33 | |||
34 | #include <net/ip.h> | ||
35 | #include <net/tcp.h> | ||
36 | #include <net/udp.h> | ||
37 | #include <net/icmp.h> /* for icmp_send */ | ||
38 | #include <net/route.h> | ||
39 | |||
40 | #include <linux/netfilter.h> | ||
41 | #include <linux/netfilter_ipv4.h> | ||
42 | |||
43 | #include <net/ip_vs.h> | ||
44 | |||
45 | |||
/*
 * Symbols exported for the pluggable IPVS modules (schedulers, protocol
 * handlers and application helpers such as ip_vs_ftp).
 */
EXPORT_SYMBOL(register_ip_vs_scheduler);
EXPORT_SYMBOL(unregister_ip_vs_scheduler);
EXPORT_SYMBOL(ip_vs_skb_replace);
EXPORT_SYMBOL(ip_vs_proto_name);
EXPORT_SYMBOL(ip_vs_conn_new);
EXPORT_SYMBOL(ip_vs_conn_in_get);
EXPORT_SYMBOL(ip_vs_conn_out_get);
#ifdef CONFIG_IP_VS_PROTO_TCP
EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
#endif
EXPORT_SYMBOL(ip_vs_conn_put);
#ifdef CONFIG_IP_VS_DEBUG
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
EXPORT_SYMBOL(ip_vs_make_skb_writable);


/* ID used in ICMP lookups (echo id field of the ICMP header) */
#define icmp_id(icmph)          (((icmph)->un).echo.id)
65 | |||
/*
 * Return a printable name for an IP protocol number.
 *
 * Well-known protocols map to fixed strings; anything else is formatted
 * as "IP_<num>" into a static buffer.  NOTE: the static buffer makes the
 * fallback path non-reentrant; callers only use this for debug/log
 * output where a rare race is harmless.
 */
const char *ip_vs_proto_name(unsigned proto)
{
	static char buf[20];

	switch (proto) {
	case IPPROTO_IP:
		return "IP";
	case IPPROTO_UDP:
		return "UDP";
	case IPPROTO_TCP:
		return "TCP";
	case IPPROTO_ICMP:
		return "ICMP";
	default:
		/* Fixes: bound the write (snprintf, not sprintf) and use
		 * %u to match the unsigned argument (was %d). */
		snprintf(buf, sizeof(buf), "IP_%u", proto);
		return buf;
	}
}
84 | |||
85 | void ip_vs_init_hash_table(struct list_head *table, int rows) | ||
86 | { | ||
87 | while (--rows >= 0) | ||
88 | INIT_LIST_HEAD(&table[rows]); | ||
89 | } | ||
90 | |||
91 | static inline void | ||
92 | ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) | ||
93 | { | ||
94 | struct ip_vs_dest *dest = cp->dest; | ||
95 | if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { | ||
96 | spin_lock(&dest->stats.lock); | ||
97 | dest->stats.inpkts++; | ||
98 | dest->stats.inbytes += skb->len; | ||
99 | spin_unlock(&dest->stats.lock); | ||
100 | |||
101 | spin_lock(&dest->svc->stats.lock); | ||
102 | dest->svc->stats.inpkts++; | ||
103 | dest->svc->stats.inbytes += skb->len; | ||
104 | spin_unlock(&dest->svc->stats.lock); | ||
105 | |||
106 | spin_lock(&ip_vs_stats.lock); | ||
107 | ip_vs_stats.inpkts++; | ||
108 | ip_vs_stats.inbytes += skb->len; | ||
109 | spin_unlock(&ip_vs_stats.lock); | ||
110 | } | ||
111 | } | ||
112 | |||
113 | |||
114 | static inline void | ||
115 | ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) | ||
116 | { | ||
117 | struct ip_vs_dest *dest = cp->dest; | ||
118 | if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { | ||
119 | spin_lock(&dest->stats.lock); | ||
120 | dest->stats.outpkts++; | ||
121 | dest->stats.outbytes += skb->len; | ||
122 | spin_unlock(&dest->stats.lock); | ||
123 | |||
124 | spin_lock(&dest->svc->stats.lock); | ||
125 | dest->svc->stats.outpkts++; | ||
126 | dest->svc->stats.outbytes += skb->len; | ||
127 | spin_unlock(&dest->svc->stats.lock); | ||
128 | |||
129 | spin_lock(&ip_vs_stats.lock); | ||
130 | ip_vs_stats.outpkts++; | ||
131 | ip_vs_stats.outbytes += skb->len; | ||
132 | spin_unlock(&ip_vs_stats.lock); | ||
133 | } | ||
134 | } | ||
135 | |||
136 | |||
137 | static inline void | ||
138 | ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) | ||
139 | { | ||
140 | spin_lock(&cp->dest->stats.lock); | ||
141 | cp->dest->stats.conns++; | ||
142 | spin_unlock(&cp->dest->stats.lock); | ||
143 | |||
144 | spin_lock(&svc->stats.lock); | ||
145 | svc->stats.conns++; | ||
146 | spin_unlock(&svc->stats.lock); | ||
147 | |||
148 | spin_lock(&ip_vs_stats.lock); | ||
149 | ip_vs_stats.conns++; | ||
150 | spin_unlock(&ip_vs_stats.lock); | ||
151 | } | ||
152 | |||
153 | |||
154 | static inline int | ||
155 | ip_vs_set_state(struct ip_vs_conn *cp, int direction, | ||
156 | const struct sk_buff *skb, | ||
157 | struct ip_vs_protocol *pp) | ||
158 | { | ||
159 | if (unlikely(!pp->state_transition)) | ||
160 | return 0; | ||
161 | return pp->state_transition(cp, direction, skb, pp); | ||
162 | } | ||
163 | |||
164 | |||
/*
 * Make the skb at *pskb safe to modify for its first 'writable_len'
 * bytes.  Returns non-zero on success, 0 on failure.  On success *pskb
 * may have been replaced by a private copy, so callers must reload it.
 */
int ip_vs_make_skb_writable(struct sk_buff **pskb, int writable_len)
{
	struct sk_buff *skb = *pskb;

	/* skb is already used, better copy skb and its payload */
	if (unlikely(skb_shared(skb) || skb->sk))
		goto copy_skb;

	/* skb data is already used, copy it */
	if (unlikely(skb_cloned(skb)))
		goto copy_data;

	/* Private and unshared: just make the bytes linear. */
	return pskb_may_pull(skb, writable_len);

  copy_data:
	if (unlikely(writable_len > skb->len))
		return 0;
	/* Unshare the data by reallocating the head (no extra headroom). */
	return !pskb_expand_head(skb, 0, 0, GFP_ATOMIC);

  copy_skb:
	if (unlikely(writable_len > skb->len))
		return 0;
	skb = skb_copy(skb, GFP_ATOMIC);
	if (!skb)
		return 0;
	BUG_ON(skb_is_nonlinear(skb));

	/* Rest of kernel will get very unhappy if we pass it a
	   suddenly-orphaned skbuff */
	if ((*pskb)->sk)
		skb_set_owner_w(skb, (*pskb)->sk);
	kfree_skb(*pskb);
	*pskb = skb;
	return 1;
}
200 | |||
/*
 *  IPVS persistent scheduling function
 *  It creates a connection entry according to its template if exists,
 *  or selects a server and creates a connection entry plus a template.
 *  Locking: we are svc user (svc->refcnt), so we hold all dests too
 *  Protocols supported: TCP, UDP
 *
 *  Returns the new connection with a reference held, or NULL on
 *  failure.  ports[0] is the client source port, ports[1] the
 *  destination (virtual) port, both in network byte order.
 */
static struct ip_vs_conn *
ip_vs_sched_persist(struct ip_vs_service *svc,
		    const struct sk_buff *skb,
		    __u16 ports[2])
{
	struct ip_vs_conn *cp = NULL;
	struct iphdr *iph = skb->nh.iph;
	struct ip_vs_dest *dest;
	struct ip_vs_conn *ct;
	__u16 dport;	/* destination port to forward */
	__u32 snet;	/* source network of the client, after masking */

	/* Mask saddr with the netmask to adjust template granularity */
	snet = iph->saddr & svc->netmask;

	IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
		  "mnet %u.%u.%u.%u\n",
		  NIPQUAD(iph->saddr), ntohs(ports[0]),
		  NIPQUAD(iph->daddr), ntohs(ports[1]),
		  NIPQUAD(snet));

	/*
	 * As far as we know, FTP is a very complicated network protocol, and
	 * it uses control connection and data connections. For active FTP,
	 * FTP server initialize data connection to the client, its source port
	 * is often 20. For passive FTP, FTP server tells the clients the port
	 * that it passively listens to, and the client issues the data
	 * connection. In the tunneling or direct routing mode, the load
	 * balancer is on the client-to-server half of connection, the port
	 * number is unknown to the load balancer. So, a conn template like
	 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
	 * is created for other persistent services.
	 */
	if (ports[1] == svc->port) {
		/* Check if a template already exists */
		if (svc->port != FTPPORT)
			ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
					       iph->daddr, ports[1]);
		else
			ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
					       iph->daddr, 0);

		if (!ct || !ip_vs_check_template(ct)) {
			/*
			 * No template found or the dest of the connection
			 * template is not available.
			 */
			dest = svc->scheduler->schedule(svc, skb);
			if (dest == NULL) {
				IP_VS_DBG(1, "p-schedule: no dest found.\n");
				return NULL;
			}

			/*
			 * Create a template like <protocol,caddr,0,
			 * vaddr,vport,daddr,dport> for non-ftp service,
			 * and <protocol,caddr,0,vaddr,0,daddr,0>
			 * for ftp service.
			 */
			if (svc->port != FTPPORT)
				ct = ip_vs_conn_new(iph->protocol,
						    snet, 0,
						    iph->daddr,
						    ports[1],
						    dest->addr, dest->port,
						    0,
						    dest);
			else
				ct = ip_vs_conn_new(iph->protocol,
						    snet, 0,
						    iph->daddr, 0,
						    dest->addr, 0,
						    0,
						    dest);
			if (ct == NULL)
				return NULL;

			ct->timeout = svc->timeout;
		} else {
			/* set destination with the found template */
			dest = ct->dest;
		}
		dport = dest->port;
	} else {
		/*
		 * Note: persistent fwmark-based services and persistent
		 * port zero service are handled here.
		 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
		 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
		 */
		if (svc->fwmark)
			ct = ip_vs_conn_in_get(IPPROTO_IP, snet, 0,
					       htonl(svc->fwmark), 0);
		else
			ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
					       iph->daddr, 0);

		if (!ct || !ip_vs_check_template(ct)) {
			/*
			 * If it is not persistent port zero, return NULL,
			 * otherwise create a connection template.
			 */
			if (svc->port)
				return NULL;

			dest = svc->scheduler->schedule(svc, skb);
			if (dest == NULL) {
				IP_VS_DBG(1, "p-schedule: no dest found.\n");
				return NULL;
			}

			/*
			 * Create a template according to the service
			 */
			if (svc->fwmark)
				ct = ip_vs_conn_new(IPPROTO_IP,
						    snet, 0,
						    htonl(svc->fwmark), 0,
						    dest->addr, 0,
						    0,
						    dest);
			else
				ct = ip_vs_conn_new(iph->protocol,
						    snet, 0,
						    iph->daddr, 0,
						    dest->addr, 0,
						    0,
						    dest);
			if (ct == NULL)
				return NULL;

			ct->timeout = svc->timeout;
		} else {
			/* set destination with the found template */
			dest = ct->dest;
		}
		/* Port-zero/fwmark service: keep the client's dest port. */
		dport = ports[1];
	}

	/*
	 * Create a new connection according to the template
	 */
	cp = ip_vs_conn_new(iph->protocol,
			    iph->saddr, ports[0],
			    iph->daddr, ports[1],
			    dest->addr, dport,
			    0,
			    dest);
	if (cp == NULL) {
		/* Drop the template reference taken/created above. */
		ip_vs_conn_put(ct);
		return NULL;
	}

	/*
	 * Add its control
	 */
	ip_vs_control_add(cp, ct);
	ip_vs_conn_put(ct);

	ip_vs_conn_stats(cp, svc);
	return cp;
}
371 | |||
372 | |||
/*
 *  IPVS main scheduling function
 *  It selects a server according to the virtual service, and
 *  creates a connection entry.
 *  Protocols supported: TCP, UDP
 *
 *  Returns the new connection (reference held) or NULL if no
 *  destination could be scheduled.
 */
struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
	struct ip_vs_conn *cp = NULL;
	struct iphdr *iph = skb->nh.iph;
	struct ip_vs_dest *dest;
	__u16 _ports[2], *pptr;

	/* Read src/dst transport ports; works on non-linear skbs. */
	pptr = skb_header_pointer(skb, iph->ihl*4,
				  sizeof(_ports), _ports);
	if (pptr == NULL)
		return NULL;

	/*
	 *    Persistent service
	 */
	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
		return ip_vs_sched_persist(svc, skb, pptr);

	/*
	 *    Non-persistent service
	 */
	if (!svc->fwmark && pptr[1] != svc->port) {
		if (!svc->port)
			IP_VS_ERR("Schedule: port zero only supported "
				  "in persistent services, "
				  "check your ipvs configuration\n");
		return NULL;
	}

	dest = svc->scheduler->schedule(svc, skb);
	if (dest == NULL) {
		IP_VS_DBG(1, "Schedule: no dest found.\n");
		return NULL;
	}

	/*
	 *    Create a connection entry.  A port-zero destination keeps
	 *    the client's original destination port.
	 */
	cp = ip_vs_conn_new(iph->protocol,
			    iph->saddr, pptr[0],
			    iph->daddr, pptr[1],
			    dest->addr, dest->port?dest->port:pptr[1],
			    0,
			    dest);
	if (cp == NULL)
		return NULL;

	IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
		  "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
		  ip_vs_fwd_tag(cp),
		  NIPQUAD(cp->caddr), ntohs(cp->cport),
		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
		  NIPQUAD(cp->daddr), ntohs(cp->dport),
		  cp->flags, atomic_read(&cp->refcnt));

	ip_vs_conn_stats(cp, svc);
	return cp;
}
438 | |||
439 | |||
/*
 *  Pass or drop the packet.
 *  Called by ip_vs_in, when the virtual service is available but
 *  no destination is available for a new connection.
 *
 *  Consumes the caller's svc reference on every path
 *  (ip_vs_service_put).  Returns an NF_* verdict.
 */
int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
		struct ip_vs_protocol *pp)
{
	__u16 _ports[2], *pptr;
	struct iphdr *iph = skb->nh.iph;

	pptr = skb_header_pointer(skb, iph->ihl*4,
				  sizeof(_ports), _ports);
	if (pptr == NULL) {
		/* No transport header to look at: drop. */
		ip_vs_service_put(svc);
		return NF_DROP;
	}

	/* if it is fwmark-based service, the cache_bypass sysctl is up
	   and the destination is RTN_UNICAST (and not local), then create
	   a cache_bypass connection entry */
	if (sysctl_ip_vs_cache_bypass && svc->fwmark
	    && (inet_addr_type(iph->daddr) == RTN_UNICAST)) {
		int ret, cs;
		struct ip_vs_conn *cp;

		ip_vs_service_put(svc);

		/* create a new connection entry */
		IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
		cp = ip_vs_conn_new(iph->protocol,
				    iph->saddr, pptr[0],
				    iph->daddr, pptr[1],
				    0, 0,
				    IP_VS_CONN_F_BYPASS,
				    NULL);
		if (cp == NULL)
			return NF_DROP;

		/* statistics */
		ip_vs_in_stats(cp, skb);

		/* set state; NOTE(review): cs is currently unused */
		cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);

		/* transmit the first SYN packet */
		ret = cp->packet_xmit(skb, cp, pp);
		/* do not touch skb anymore */

		atomic_inc(&cp->in_pkts);
		ip_vs_conn_put(cp);
		return ret;
	}

	/*
	 * When the virtual ftp service is presented, packets destined
	 * for other services on the VIP may get here (except services
	 * listed in the ipvs table), pass the packets, because it is
	 * not ipvs job to decide to drop the packets.
	 */
	if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
		ip_vs_service_put(svc);
		return NF_ACCEPT;
	}

	ip_vs_service_put(svc);

	/*
	 * Notify the client that the destination is unreachable, and
	 * release the socket buffer.
	 * Since it is in IP layer, the TCP socket is not actually
	 * created, the TCP RST packet cannot be sent, instead that
	 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
	 */
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
	return NF_DROP;
}
517 | |||
518 | |||
519 | /* | ||
520 | * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING | ||
521 | * chain, and is used for VS/NAT. | ||
522 | * It detects packets for VS/NAT connections and sends the packets | ||
523 | * immediately. This can avoid that iptable_nat mangles the packets | ||
524 | * for VS/NAT. | ||
525 | */ | ||
526 | static unsigned int ip_vs_post_routing(unsigned int hooknum, | ||
527 | struct sk_buff **pskb, | ||
528 | const struct net_device *in, | ||
529 | const struct net_device *out, | ||
530 | int (*okfn)(struct sk_buff *)) | ||
531 | { | ||
532 | if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY)) | ||
533 | return NF_ACCEPT; | ||
534 | |||
535 | /* The packet was sent from IPVS, exit this chain */ | ||
536 | (*okfn)(*pskb); | ||
537 | |||
538 | return NF_STOLEN; | ||
539 | } | ||
540 | |||
/*
 * Fold the Internet checksum over skb data from 'offset' to the end of
 * the packet (zero seed) into a 16-bit value.
 */
u16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
{
	return (u16) csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
}
545 | |||
546 | static inline struct sk_buff * | ||
547 | ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) | ||
548 | { | ||
549 | skb = ip_defrag(skb, user); | ||
550 | if (skb) | ||
551 | ip_send_check(skb->nh.iph); | ||
552 | return skb; | ||
553 | } | ||
554 | |||
/*
 *	NAT-rewrite an ICMP error packet and the headers embedded in its
 *	payload, then recompute all affected checksums.
 *	Packet has been made sufficiently writable in caller
 *	- inout: 1=in->out, 0=out->in
 */
void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
		    struct ip_vs_conn *cp, int inout)
{
	struct iphdr *iph = skb->nh.iph;
	unsigned int icmp_offset = iph->ihl*4;
	struct icmphdr *icmph = (struct icmphdr *)(skb->nh.raw + icmp_offset);
	struct iphdr *ciph = (struct iphdr *)(icmph + 1);

	if (inout) {
		/* Outgoing: outer source and embedded destination both
		 * become the virtual address. */
		iph->saddr = cp->vaddr;
		ip_send_check(iph);
		ciph->daddr = cp->vaddr;
		ip_send_check(ciph);
	} else {
		/* Incoming: outer destination and embedded source both
		 * become the real server address. */
		iph->daddr = cp->daddr;
		ip_send_check(iph);
		ciph->saddr = cp->daddr;
		ip_send_check(ciph);
	}

	/* the TCP/UDP port */
	if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
		__u16 *ports = (void *)ciph + ciph->ihl*4;

		if (inout)
			ports[1] = cp->vport;
		else
			ports[0] = cp->dport;
	}

	/* And finally the ICMP checksum */
	icmph->checksum = 0;
	icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
	skb->ip_summed = CHECKSUM_UNNECESSARY;

	if (inout)
		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
			      "Forwarding altered outgoing ICMP");
	else
		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
			      "Forwarding altered incoming ICMP");
}
601 | |||
/*
 *	Handle ICMP messages in the inside-to-outside direction (outgoing).
 *	Find any that might be relevant, check against existing connections,
 *	forward to the right destination host if relevant.
 *	Currently handles error types - unreachable, quench, ttl exceeded.
 *	(Only used in VS/NAT)
 *
 *	Sets *related to 1 when the ICMP packet belongs to an IPVS
 *	connection (verdict applies); 0 means the caller should continue
 *	normal processing.
 */
static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
{
	struct sk_buff *skb = *pskb;
	struct iphdr *iph;
	struct icmphdr _icmph, *ic;
	struct iphdr _ciph, *cih;	/* The ip header contained within the ICMP */
	struct ip_vs_conn *cp;
	struct ip_vs_protocol *pp;
	unsigned int offset, ihl, verdict;

	*related = 1;

	/* reassemble IP fragments */
	if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
		skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
		if (!skb)
			return NF_STOLEN;
		*pskb = skb;
	}

	iph = skb->nh.iph;
	offset = ihl = iph->ihl * 4;
	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
	if (ic == NULL)
		return NF_DROP;

	IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
		  ic->type, ntohs(icmp_id(ic)),
		  NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));

	/*
	 * Work through seeing if this is for us.
	 * These checks are supposed to be in an order that means easy
	 * things are checked first to speed up processing.... however
	 * this means that some packets will manage to get a long way
	 * down this stack and then be rejected, but that's life.
	 */
	if ((ic->type != ICMP_DEST_UNREACH) &&
	    (ic->type != ICMP_SOURCE_QUENCH) &&
	    (ic->type != ICMP_TIME_EXCEEDED)) {
		*related = 0;
		return NF_ACCEPT;
	}

	/* Now find the contained IP header */
	offset += sizeof(_icmph);
	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
	if (cih == NULL)
		return NF_ACCEPT; /* The packet looks wrong, ignore */

	pp = ip_vs_proto_get(cih->protocol);
	if (!pp)
		return NF_ACCEPT;

	/* Is the embedded protocol header present? */
	if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
		     pp->dont_defrag))
		return NF_ACCEPT;

	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");

	offset += cih->ihl * 4;

	/* The embedded headers contain source and dest in reverse order */
	cp = pp->conn_out_get(skb, pp, cih, offset, 1);
	if (!cp)
		return NF_ACCEPT;

	verdict = NF_DROP;

	if (IP_VS_FWD_METHOD(cp) != 0) {
		IP_VS_ERR("shouldn't reach here, because the box is on the"
			  "half connection in the tun/dr module.\n");
	}

	/* Ensure the checksum is correct */
	if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
	    ip_vs_checksum_complete(skb, ihl)) {
		/* Failed checksum! */
		IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n",
			  NIPQUAD(iph->saddr));
		goto out;
	}

	/* Need the embedded ports writable too for TCP/UDP rewriting. */
	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
		offset += 2 * sizeof(__u16);
	if (!ip_vs_make_skb_writable(pskb, offset))
		goto out;
	skb = *pskb;

	ip_vs_nat_icmp(skb, pp, cp, 1);

	/* do the statistics and put it back */
	ip_vs_out_stats(cp, skb);

	skb->nfcache |= NFC_IPVS_PROPERTY;
	verdict = NF_ACCEPT;

  out:
	__ip_vs_conn_put(cp);

	return verdict;
}
712 | |||
713 | static inline int is_tcp_reset(const struct sk_buff *skb) | ||
714 | { | ||
715 | struct tcphdr _tcph, *th; | ||
716 | |||
717 | th = skb_header_pointer(skb, skb->nh.iph->ihl * 4, | ||
718 | sizeof(_tcph), &_tcph); | ||
719 | if (th == NULL) | ||
720 | return 0; | ||
721 | return th->rst; | ||
722 | } | ||
723 | |||
/*
 *	It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
 *	Check if outgoing packet belongs to the established ip_vs_conn,
 *	rewrite addresses of the packet and send it on its way...
 */
static unsigned int
ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
	  const struct net_device *in, const struct net_device *out,
	  int (*okfn)(struct sk_buff *))
{
	struct sk_buff *skb = *pskb;
	struct iphdr *iph;
	struct ip_vs_protocol *pp;
	struct ip_vs_conn *cp;
	int ihl;

	EnterFunction(11);

	/* Already handled by IPVS on an earlier hook: pass through. */
	if (skb->nfcache & NFC_IPVS_PROPERTY)
		return NF_ACCEPT;

	iph = skb->nh.iph;
	if (unlikely(iph->protocol == IPPROTO_ICMP)) {
		int related, verdict = ip_vs_out_icmp(pskb, &related);

		if (related)
			return verdict;
		/* ip_vs_out_icmp may have replaced the skb: reload. */
		skb = *pskb;
		iph = skb->nh.iph;
	}

	pp = ip_vs_proto_get(iph->protocol);
	if (unlikely(!pp))
		return NF_ACCEPT;

	/* reassemble IP fragments */
	if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) &&
		     !pp->dont_defrag)) {
		skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
		if (!skb)
			return NF_STOLEN;
		iph = skb->nh.iph;
		*pskb = skb;
	}

	ihl = iph->ihl << 2;

	/*
	 * Check if the packet belongs to an existing entry
	 */
	cp = pp->conn_out_get(skb, pp, iph, ihl, 0);

	if (unlikely(!cp)) {
		if (sysctl_ip_vs_nat_icmp_send &&
		    (pp->protocol == IPPROTO_TCP ||
		     pp->protocol == IPPROTO_UDP)) {
			__u16 _ports[2], *pptr;

			pptr = skb_header_pointer(skb, ihl,
						  sizeof(_ports), _ports);
			if (pptr == NULL)
				return NF_ACCEPT;	/* Not for me */
			if (ip_vs_lookup_real_service(iph->protocol,
						      iph->saddr, pptr[0])) {
				/*
				 * Notify the real server: there is no
				 * existing entry if it is not RST
				 * packet or not TCP packet.
				 */
				if (iph->protocol != IPPROTO_TCP
				    || !is_tcp_reset(skb)) {
					icmp_send(skb,ICMP_DEST_UNREACH,
						  ICMP_PORT_UNREACH, 0);
					return NF_DROP;
				}
			}
		}
		IP_VS_DBG_PKT(12, pp, skb, 0,
			      "packet continues traversal as normal");
		return NF_ACCEPT;
	}

	IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");

	if (!ip_vs_make_skb_writable(pskb, ihl))
		goto drop;

	/* mangle the packet */
	if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp))
		goto drop;
	skb = *pskb;
	skb->nh.iph->saddr = cp->vaddr;
	ip_send_check(skb->nh.iph);

	IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");

	ip_vs_out_stats(cp, skb);
	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
	ip_vs_conn_put(cp);

	/* Mark so ip_vs_post_routing steals it from iptable_nat. */
	skb->nfcache |= NFC_IPVS_PROPERTY;

	LeaveFunction(11);
	return NF_ACCEPT;

  drop:
	ip_vs_conn_put(cp);
	kfree_skb(*pskb);
	return NF_STOLEN;
}
834 | |||
835 | |||
836 | /* | ||
/*
 *	Handle ICMP messages in the outside-to-inside direction (incoming).
 *	Find any that might be relevant, check against existing connections,
 *	forward to the right destination host if relevant.
 *	Currently handles error types - unreachable, quench, ttl exceeded.
 *
 *	Sets *related to 1 when the ICMP packet belongs to an IPVS
 *	connection (verdict applies); 0 means the caller should continue
 *	normal processing.
 */
static int
ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
{
	struct sk_buff *skb = *pskb;
	struct iphdr *iph;
	struct icmphdr _icmph, *ic;
	struct iphdr _ciph, *cih;	/* The ip header contained within the ICMP */
	struct ip_vs_conn *cp;
	struct ip_vs_protocol *pp;
	unsigned int offset, ihl, verdict;

	*related = 1;

	/* reassemble IP fragments */
	if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
		skb = ip_vs_gather_frags(skb,
					 hooknum == NF_IP_LOCAL_IN ?
					 IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD);
		if (!skb)
			return NF_STOLEN;
		*pskb = skb;
	}

	iph = skb->nh.iph;
	offset = ihl = iph->ihl * 4;
	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
	if (ic == NULL)
		return NF_DROP;

	IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
		  ic->type, ntohs(icmp_id(ic)),
		  NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));

	/*
	 * Work through seeing if this is for us.
	 * These checks are supposed to be in an order that means easy
	 * things are checked first to speed up processing.... however
	 * this means that some packets will manage to get a long way
	 * down this stack and then be rejected, but that's life.
	 */
	if ((ic->type != ICMP_DEST_UNREACH) &&
	    (ic->type != ICMP_SOURCE_QUENCH) &&
	    (ic->type != ICMP_TIME_EXCEEDED)) {
		*related = 0;
		return NF_ACCEPT;
	}

	/* Now find the contained IP header */
	offset += sizeof(_icmph);
	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
	if (cih == NULL)
		return NF_ACCEPT; /* The packet looks wrong, ignore */

	pp = ip_vs_proto_get(cih->protocol);
	if (!pp)
		return NF_ACCEPT;

	/* Is the embedded protocol header present? */
	if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
		     pp->dont_defrag))
		return NF_ACCEPT;

	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");

	offset += cih->ihl * 4;

	/* The embedded headers contain source and dest in reverse order */
	cp = pp->conn_in_get(skb, pp, cih, offset, 1);
	if (!cp)
		return NF_ACCEPT;

	verdict = NF_DROP;

	/* Ensure the checksum is correct */
	if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
	    ip_vs_checksum_complete(skb, ihl)) {
		/* Failed checksum! */
		IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
			  NIPQUAD(iph->saddr));
		goto out;
	}

	/* do the statistics and put it back */
	ip_vs_in_stats(cp, skb);
	/* Include the embedded TCP/UDP ports in the writable region. */
	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
		offset += 2 * sizeof(__u16);
	verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
	/* do not touch skb anymore */

  out:
	__ip_vs_conn_put(cp);

	return verdict;
}
936 | |||
/*
 *	Check if it's for virtual services, look it up,
 *	and send it on its way...
 *
 *	Netfilter hook registered at NF_IP_LOCAL_IN (see ip_vs_in_ops).
 *	Returns a netfilter verdict: NF_ACCEPT lets the packet continue
 *	normal traversal, NF_DROP discards it, or the verdict chosen by
 *	the transmitter / ICMP handler is propagated.
 */
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
	 const struct net_device *in, const struct net_device *out,
	 int (*okfn)(struct sk_buff *))
{
	struct sk_buff *skb = *pskb;
	struct iphdr *iph;
	struct ip_vs_protocol *pp;
	struct ip_vs_conn *cp;
	int ret, restart;
	int ihl;

	/*
	 *	Big tappo: only PACKET_HOST (neither loopback nor mcasts)
	 *	... don't know why 1st test DOES NOT include 2nd (?)
	 */
	if (unlikely(skb->pkt_type != PACKET_HOST
		     || skb->dev == &loopback_dev || skb->sk)) {
		IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
			  skb->pkt_type,
			  skb->nh.iph->protocol,
			  NIPQUAD(skb->nh.iph->daddr));
		return NF_ACCEPT;
	}

	iph = skb->nh.iph;
	if (unlikely(iph->protocol == IPPROTO_ICMP)) {
		/* ICMP errors related to an IPVS connection get a verdict
		   from ip_vs_in_icmp(); unrelated ICMP falls through.
		   The helper may have reallocated the skb, so reload both
		   skb and iph before continuing. */
		int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum);

		if (related)
			return verdict;
		skb = *pskb;
		iph = skb->nh.iph;
	}

	/* Protocol supported? */
	pp = ip_vs_proto_get(iph->protocol);
	if (unlikely(!pp))
		return NF_ACCEPT;

	/* IP header length in bytes (ihl is in 32-bit words) */
	ihl = iph->ihl << 2;

	/*
	 * Check if the packet belongs to an existing connection entry
	 */
	cp = pp->conn_in_get(skb, pp, iph, ihl, 0);

	if (unlikely(!cp)) {
		int v;

		/* No existing connection: ask the protocol handler to
		   schedule a new one.  On refusal it supplies the
		   verdict in v. */
		if (!pp->conn_schedule(skb, pp, &v, &cp))
			return v;
	}

	if (unlikely(!cp)) {
		/* sorry, all this trouble for a no-hit :) */
		IP_VS_DBG_PKT(12, pp, skb, 0,
			      "packet continues traversal as normal");
		return NF_ACCEPT;
	}

	IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");

	/* Check the server status */
	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
		/* the destination server is not available */

		if (sysctl_ip_vs_expire_nodest_conn) {
			/* try to expire the connection immediately */
			ip_vs_conn_expire_now(cp);
		} else {
			/* don't restart its timer, and silently
			   drop the packet. */
			__ip_vs_conn_put(cp);
		}
		return NF_DROP;
	}

	ip_vs_in_stats(cp, skb);
	/* NOTE(review): restart is assigned but not read in this
	   function — confirm whether ip_vs_set_state()'s return value
	   is intentionally ignored here. */
	restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
	if (cp->packet_xmit)
		ret = cp->packet_xmit(skb, cp, pp);
		/* do not touch skb anymore */
	else {
		IP_VS_DBG_RL("warning: packet_xmit is null");
		ret = NF_ACCEPT;
	}

	/* increase its packet counter and check if it is needed
	   to be synchronized */
	atomic_inc(&cp->in_pkts);
	if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
	    (cp->protocol != IPPROTO_TCP ||
	     cp->state == IP_VS_TCP_S_ESTABLISHED) &&
	    (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
	     == sysctl_ip_vs_sync_threshold[0]))
		ip_vs_sync_conn(cp);

	ip_vs_conn_put(cp);
	return ret;
}
1042 | |||
1043 | |||
1044 | /* | ||
1045 | * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP | ||
1046 | * related packets destined for 0.0.0.0/0. | ||
1047 | * When fwmark-based virtual service is used, such as transparent | ||
1048 | * cache cluster, TCP packets can be marked and routed to ip_vs_in, | ||
1049 | * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and | ||
1050 | * sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain | ||
1051 | * and send them to ip_vs_in_icmp. | ||
1052 | */ | ||
1053 | static unsigned int | ||
1054 | ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb, | ||
1055 | const struct net_device *in, const struct net_device *out, | ||
1056 | int (*okfn)(struct sk_buff *)) | ||
1057 | { | ||
1058 | int r; | ||
1059 | |||
1060 | if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP) | ||
1061 | return NF_ACCEPT; | ||
1062 | |||
1063 | return ip_vs_in_icmp(pskb, &r, hooknum); | ||
1064 | } | ||
1065 | |||
1066 | |||
/* After packet filtering, forward packet through VS/DR, VS/TUN,
   or VS/NAT(change destination), so that filtering rules can be
   applied to IPVS. */
static struct nf_hook_ops ip_vs_in_ops = {
	.hook		= ip_vs_in,
	.owner		= THIS_MODULE,
	.pf		= PF_INET,
	.hooknum        = NF_IP_LOCAL_IN,
	/* runs after the filter table (priority 100 > NF_IP_PRI_FILTER) */
	.priority       = 100,
};
1077 | |||
/* After packet filtering, change source only for VS/NAT */
static struct nf_hook_ops ip_vs_out_ops = {
	.hook		= ip_vs_out,
	.owner		= THIS_MODULE,
	.pf		= PF_INET,
	.hooknum        = NF_IP_FORWARD,
	/* same relative position as ip_vs_in_ops, but on the FORWARD chain */
	.priority       = 100,
};
1086 | |||
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
   destined for 0.0.0.0/0, which is for incoming IPVS connections */
static struct nf_hook_ops ip_vs_forward_icmp_ops = {
	.hook		= ip_vs_forward_icmp,
	.owner		= THIS_MODULE,
	.pf		= PF_INET,
	.hooknum        = NF_IP_FORWARD,
	/* priority 99: invoked before ip_vs_out_ops (100) on this chain */
	.priority       = 99,
};
1096 | |||
/* Before the netfilter connection tracking, exit from POST_ROUTING */
static struct nf_hook_ops ip_vs_post_routing_ops = {
	.hook		= ip_vs_post_routing,
	.owner		= THIS_MODULE,
	.pf		= PF_INET,
	.hooknum        = NF_IP_POST_ROUTING,
	/* just before source NAT, so IPVS-handled packets skip SNAT */
	.priority       = NF_IP_PRI_NAT_SRC-1,
};
1105 | |||
1106 | |||
/*
 *	Initialize IP Virtual Server
 *
 *	Brings up the subsystems in dependency order (control, protocol,
 *	application helper, connection table) and then registers the four
 *	netfilter hooks.  On any failure the goto chain unwinds exactly
 *	the steps that already succeeded, in reverse order.
 *	Returns 0 on success or a negative errno.
 */
static int __init ip_vs_init(void)
{
	int ret;

	ret = ip_vs_control_init();
	if (ret < 0) {
		IP_VS_ERR("can't setup control.\n");
		goto cleanup_nothing;
	}

	/* protocol table setup cannot fail here; no return value checked */
	ip_vs_protocol_init();

	ret = ip_vs_app_init();
	if (ret < 0) {
		IP_VS_ERR("can't setup application helper.\n");
		goto cleanup_protocol;
	}

	ret = ip_vs_conn_init();
	if (ret < 0) {
		IP_VS_ERR("can't setup connection table.\n");
		goto cleanup_app;
	}

	ret = nf_register_hook(&ip_vs_in_ops);
	if (ret < 0) {
		IP_VS_ERR("can't register in hook.\n");
		goto cleanup_conn;
	}

	ret = nf_register_hook(&ip_vs_out_ops);
	if (ret < 0) {
		IP_VS_ERR("can't register out hook.\n");
		goto cleanup_inops;
	}
	ret = nf_register_hook(&ip_vs_post_routing_ops);
	if (ret < 0) {
		IP_VS_ERR("can't register post_routing hook.\n");
		goto cleanup_outops;
	}
	ret = nf_register_hook(&ip_vs_forward_icmp_ops);
	if (ret < 0) {
		IP_VS_ERR("can't register forward_icmp hook.\n");
		goto cleanup_postroutingops;
	}

	IP_VS_INFO("ipvs loaded.\n");
	return ret;

	/* error unwinding: each label undoes one successful step above */
  cleanup_postroutingops:
	nf_unregister_hook(&ip_vs_post_routing_ops);
  cleanup_outops:
	nf_unregister_hook(&ip_vs_out_ops);
  cleanup_inops:
	nf_unregister_hook(&ip_vs_in_ops);
  cleanup_conn:
	ip_vs_conn_cleanup();
  cleanup_app:
	ip_vs_app_cleanup();
  cleanup_protocol:
	ip_vs_protocol_cleanup();
	ip_vs_control_cleanup();
  cleanup_nothing:
	return ret;
}
1175 | |||
/*
 *	Module exit: tear everything down in exact reverse order of
 *	ip_vs_init() — hooks first so no new packets enter IPVS while
 *	the tables are being destroyed.
 */
static void __exit ip_vs_cleanup(void)
{
	nf_unregister_hook(&ip_vs_forward_icmp_ops);
	nf_unregister_hook(&ip_vs_post_routing_ops);
	nf_unregister_hook(&ip_vs_out_ops);
	nf_unregister_hook(&ip_vs_in_ops);
	ip_vs_conn_cleanup();
	ip_vs_app_cleanup();
	ip_vs_protocol_cleanup();
	ip_vs_control_cleanup();
	IP_VS_INFO("ipvs unloaded.\n");
}
1188 | |||
/* Module entry/exit points and license */
module_init(ip_vs_init);
module_exit(ip_vs_cleanup);
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c new file mode 100644 index 000000000000..218d9701036e --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_ctl.c | |||
@@ -0,0 +1,2391 @@ | |||
1 | /* | ||
2 | * IPVS An implementation of the IP virtual server support for the | ||
3 | * LINUX operating system. IPVS is now implemented as a module | ||
4 | * over the NetFilter framework. IPVS can be used to build a | ||
5 | * high-performance and highly available server based on a | ||
6 | * cluster of servers. | ||
7 | * | ||
8 | * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $ | ||
9 | * | ||
10 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
11 | * Peter Kese <peter.kese@ijs.si> | ||
12 | * Julian Anastasov <ja@ssi.bg> | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or | ||
15 | * modify it under the terms of the GNU General Public License | ||
16 | * as published by the Free Software Foundation; either version | ||
17 | * 2 of the License, or (at your option) any later version. | ||
18 | * | ||
19 | * Changes: | ||
20 | * | ||
21 | */ | ||
22 | |||
23 | #include <linux/module.h> | ||
24 | #include <linux/init.h> | ||
25 | #include <linux/types.h> | ||
26 | #include <linux/fs.h> | ||
27 | #include <linux/sysctl.h> | ||
28 | #include <linux/proc_fs.h> | ||
29 | #include <linux/workqueue.h> | ||
30 | #include <linux/swap.h> | ||
31 | #include <linux/proc_fs.h> | ||
32 | #include <linux/seq_file.h> | ||
33 | |||
34 | #include <linux/netfilter.h> | ||
35 | #include <linux/netfilter_ipv4.h> | ||
36 | |||
37 | #include <net/ip.h> | ||
38 | #include <net/sock.h> | ||
39 | |||
40 | #include <asm/uaccess.h> | ||
41 | |||
42 | #include <net/ip_vs.h> | ||
43 | |||
/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
static DECLARE_MUTEX(__ip_vs_mutex);

/* lock for service table */
static DEFINE_RWLOCK(__ip_vs_svc_lock);

/* lock for table with the real services */
static DEFINE_RWLOCK(__ip_vs_rs_lock);

/* lock for state and timeout tables */
static DEFINE_RWLOCK(__ip_vs_securetcp_lock);

/* lock for drop entry handling */
static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);

/* lock for drop packet handling */
static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);

/* 1/rate drop and drop-entry variables */
int ip_vs_drop_rate = 0;
int ip_vs_drop_counter = 0;
static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);

/* number of virtual services */
static int ip_vs_num_services = 0;

/* sysctl variables (defense strategies and connection behaviour) */
static int sysctl_ip_vs_drop_entry = 0;
static int sysctl_ip_vs_drop_packet = 0;
static int sysctl_ip_vs_secure_tcp = 0;
/* available-memory threshold, in pages (see update_defense_level) */
static int sysctl_ip_vs_amemthresh = 1024;
/* fixed drop rate used when drop_packet mode 3 is selected */
static int sysctl_ip_vs_am_droprate = 10;
int sysctl_ip_vs_cache_bypass = 0;
int sysctl_ip_vs_expire_nodest_conn = 0;
int sysctl_ip_vs_expire_quiescent_template = 0;
/* [0]=threshold, [1]=period for sync decisions in ip_vs_in() */
int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
int sysctl_ip_vs_nat_icmp_send = 0;


#ifdef CONFIG_IP_VS_DEBUG
static int sysctl_ip_vs_debug_level = 0;

/* Return the current IPVS debug level (set via sysctl). */
int ip_vs_get_debug_level(void)
{
	return sysctl_ip_vs_debug_level;
}
#endif
91 | |||
/*
 *	update_defense_level is called from keventd and from sysctl.
 *
 *	Drives three independent defense strategies based on available
 *	memory: drop_entry, drop_packet and secure_tcp.  Each sysctl
 *	value acts as a small state machine:
 *	  0 = never, 1 = auto (off, may arm), 2 = auto (armed, may
 *	  disarm), 3 = always on.
 *	States 1 and 2 flip between each other depending on whether
 *	free+buffered memory is below sysctl_ip_vs_amemthresh.
 */
static void update_defense_level(void)
{
	struct sysinfo i;
	static int old_secure_tcp = 0;
	int availmem;
	int nomem;
	int to_change = -1;

	/* we only count free and buffered memory (in pages) */
	si_meminfo(&i);
	availmem = i.freeram + i.bufferram;
	/* however in linux 2.5 the i.bufferram is total page cache size,
	   we need adjust it */
	/* si_swapinfo(&i); */
	/* availmem = availmem - (i.totalswap - i.freeswap); */

	nomem = (availmem < sysctl_ip_vs_amemthresh);

	/* drop_entry: randomly drop connection entries under pressure */
	spin_lock(&__ip_vs_dropentry_lock);
	switch (sysctl_ip_vs_drop_entry) {
	case 0:
		atomic_set(&ip_vs_dropentry, 0);
		break;
	case 1:
		if (nomem) {
			atomic_set(&ip_vs_dropentry, 1);
			sysctl_ip_vs_drop_entry = 2;
		} else {
			atomic_set(&ip_vs_dropentry, 0);
		}
		break;
	case 2:
		if (nomem) {
			atomic_set(&ip_vs_dropentry, 1);
		} else {
			atomic_set(&ip_vs_dropentry, 0);
			sysctl_ip_vs_drop_entry = 1;
		};
		break;
	case 3:
		atomic_set(&ip_vs_dropentry, 1);
		break;
	}
	spin_unlock(&__ip_vs_dropentry_lock);

	/* drop_packet: drop 1 of every ip_vs_drop_rate incoming packets;
	   the rate scales with how far below the threshold we are */
	spin_lock(&__ip_vs_droppacket_lock);
	switch (sysctl_ip_vs_drop_packet) {
	case 0:
		ip_vs_drop_rate = 0;
		break;
	case 1:
		if (nomem) {
			ip_vs_drop_rate = ip_vs_drop_counter
				= sysctl_ip_vs_amemthresh /
				(sysctl_ip_vs_amemthresh-availmem);
			sysctl_ip_vs_drop_packet = 2;
		} else {
			ip_vs_drop_rate = 0;
		}
		break;
	case 2:
		if (nomem) {
			ip_vs_drop_rate = ip_vs_drop_counter
				= sysctl_ip_vs_amemthresh /
				(sysctl_ip_vs_amemthresh-availmem);
		} else {
			ip_vs_drop_rate = 0;
			sysctl_ip_vs_drop_packet = 1;
		}
		break;
	case 3:
		ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
		break;
	}
	spin_unlock(&__ip_vs_droppacket_lock);

	/* secure_tcp: switch protocol timeout tables; the transition is
	   applied only when crossing the <2 / >=2 boundary (to_change) */
	write_lock(&__ip_vs_securetcp_lock);
	switch (sysctl_ip_vs_secure_tcp) {
	case 0:
		if (old_secure_tcp >= 2)
			to_change = 0;
		break;
	case 1:
		if (nomem) {
			if (old_secure_tcp < 2)
				to_change = 1;
			sysctl_ip_vs_secure_tcp = 2;
		} else {
			if (old_secure_tcp >= 2)
				to_change = 0;
		}
		break;
	case 2:
		if (nomem) {
			if (old_secure_tcp < 2)
				to_change = 1;
		} else {
			if (old_secure_tcp >= 2)
				to_change = 0;
			sysctl_ip_vs_secure_tcp = 1;
		}
		break;
	case 3:
		if (old_secure_tcp < 2)
			to_change = 1;
		break;
	}
	old_secure_tcp = sysctl_ip_vs_secure_tcp;
	if (to_change >= 0)
		ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
	write_unlock(&__ip_vs_securetcp_lock);
}
210 | |||
211 | |||
/*
 *	Timer for checking the defense
 */
#define DEFENSE_TIMER_PERIOD	1*HZ
static void defense_work_handler(void *data);
static DECLARE_WORK(defense_work, defense_work_handler, NULL);

/*
 *	Periodic work item: refresh the defense level and, if the
 *	drop-entry strategy is active, randomly expire connection
 *	entries.  Re-arms itself every DEFENSE_TIMER_PERIOD.
 */
static void defense_work_handler(void *data)
{
	update_defense_level();
	if (atomic_read(&ip_vs_dropentry))
		ip_vs_random_dropentry();

	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
}
227 | |||
/*
 *	Take a reference on the ipvs module; returns the result of
 *	try_module_get() (non-zero on success).
 */
int
ip_vs_use_count_inc(void)
{
	return try_module_get(THIS_MODULE);
}
233 | |||
/*
 *	Drop a module reference taken with ip_vs_use_count_inc().
 */
void
ip_vs_use_count_dec(void)
{
	module_put(THIS_MODULE);
}
239 | |||
240 | |||
/*
 *	Hash table: for virtual service lookups
 */
#define IP_VS_SVC_TAB_BITS 8
#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)

/* the service table hashed by <protocol, addr, port> */
static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark */
static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];

/*
 *	Hash table: for real service lookups
 */
#define IP_VS_RTAB_BITS 4
#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)

static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];

/*
 *	Trash for destinations
 *	(removed dests kept while connections still reference them)
 */
static LIST_HEAD(ip_vs_dest_trash);

/*
 *	FTP & NULL virtual service counters
 *	(enable the fallback lookups in ip_vs_service_get)
 */
static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
272 | |||
273 | |||
274 | /* | ||
275 | * Returns hash value for virtual service | ||
276 | */ | ||
277 | static __inline__ unsigned | ||
278 | ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port) | ||
279 | { | ||
280 | register unsigned porth = ntohs(port); | ||
281 | |||
282 | return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth) | ||
283 | & IP_VS_SVC_TAB_MASK; | ||
284 | } | ||
285 | |||
/*
 *	Returns hash value of fwmark for virtual service lookup
 *	(low IP_VS_SVC_TAB_BITS bits of the mark).
 */
static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
{
	return fwmark & IP_VS_SVC_TAB_MASK;
}
293 | |||
294 | /* | ||
295 | * Hashes a service in the ip_vs_svc_table by <proto,addr,port> | ||
296 | * or in the ip_vs_svc_fwm_table by fwmark. | ||
297 | * Should be called with locked tables. | ||
298 | */ | ||
299 | static int ip_vs_svc_hash(struct ip_vs_service *svc) | ||
300 | { | ||
301 | unsigned hash; | ||
302 | |||
303 | if (svc->flags & IP_VS_SVC_F_HASHED) { | ||
304 | IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, " | ||
305 | "called from %p\n", __builtin_return_address(0)); | ||
306 | return 0; | ||
307 | } | ||
308 | |||
309 | if (svc->fwmark == 0) { | ||
310 | /* | ||
311 | * Hash it by <protocol,addr,port> in ip_vs_svc_table | ||
312 | */ | ||
313 | hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port); | ||
314 | list_add(&svc->s_list, &ip_vs_svc_table[hash]); | ||
315 | } else { | ||
316 | /* | ||
317 | * Hash it by fwmark in ip_vs_svc_fwm_table | ||
318 | */ | ||
319 | hash = ip_vs_svc_fwm_hashkey(svc->fwmark); | ||
320 | list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); | ||
321 | } | ||
322 | |||
323 | svc->flags |= IP_VS_SVC_F_HASHED; | ||
324 | /* increase its refcnt because it is referenced by the svc table */ | ||
325 | atomic_inc(&svc->refcnt); | ||
326 | return 1; | ||
327 | } | ||
328 | |||
329 | |||
330 | /* | ||
331 | * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. | ||
332 | * Should be called with locked tables. | ||
333 | */ | ||
334 | static int ip_vs_svc_unhash(struct ip_vs_service *svc) | ||
335 | { | ||
336 | if (!(svc->flags & IP_VS_SVC_F_HASHED)) { | ||
337 | IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, " | ||
338 | "called from %p\n", __builtin_return_address(0)); | ||
339 | return 0; | ||
340 | } | ||
341 | |||
342 | if (svc->fwmark == 0) { | ||
343 | /* Remove it from the ip_vs_svc_table table */ | ||
344 | list_del(&svc->s_list); | ||
345 | } else { | ||
346 | /* Remove it from the ip_vs_svc_fwm_table table */ | ||
347 | list_del(&svc->f_list); | ||
348 | } | ||
349 | |||
350 | svc->flags &= ~IP_VS_SVC_F_HASHED; | ||
351 | atomic_dec(&svc->refcnt); | ||
352 | return 1; | ||
353 | } | ||
354 | |||
355 | |||
356 | /* | ||
357 | * Get service by {proto,addr,port} in the service table. | ||
358 | */ | ||
359 | static __inline__ struct ip_vs_service * | ||
360 | __ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport) | ||
361 | { | ||
362 | unsigned hash; | ||
363 | struct ip_vs_service *svc; | ||
364 | |||
365 | /* Check for "full" addressed entries */ | ||
366 | hash = ip_vs_svc_hashkey(protocol, vaddr, vport); | ||
367 | |||
368 | list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ | ||
369 | if ((svc->addr == vaddr) | ||
370 | && (svc->port == vport) | ||
371 | && (svc->protocol == protocol)) { | ||
372 | /* HIT */ | ||
373 | atomic_inc(&svc->usecnt); | ||
374 | return svc; | ||
375 | } | ||
376 | } | ||
377 | |||
378 | return NULL; | ||
379 | } | ||
380 | |||
381 | |||
382 | /* | ||
383 | * Get service by {fwmark} in the service table. | ||
384 | */ | ||
385 | static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark) | ||
386 | { | ||
387 | unsigned hash; | ||
388 | struct ip_vs_service *svc; | ||
389 | |||
390 | /* Check for fwmark addressed entries */ | ||
391 | hash = ip_vs_svc_fwm_hashkey(fwmark); | ||
392 | |||
393 | list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { | ||
394 | if (svc->fwmark == fwmark) { | ||
395 | /* HIT */ | ||
396 | atomic_inc(&svc->usecnt); | ||
397 | return svc; | ||
398 | } | ||
399 | } | ||
400 | |||
401 | return NULL; | ||
402 | } | ||
403 | |||
/*
 *	Public service lookup.  Tries, in order: the fwmark table (if a
 *	mark is given), the <protocol,addr,port> table, an FTP control
 *	service (TCP only, for packets that may be FTP data), and
 *	finally the catch-all port-zero service.  A returned service
 *	has its usecnt incremented by the helpers above.
 */
struct ip_vs_service *
ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
{
	struct ip_vs_service *svc;

	read_lock(&__ip_vs_svc_lock);

	/*
	 *	Check the table hashed by fwmark first
	 */
	if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
		goto out;

	/*
	 *	Check the table hashed by <protocol,addr,port>
	 *	for "full" addressed entries
	 */
	svc = __ip_vs_service_get(protocol, vaddr, vport);

	if (svc == NULL
	    && protocol == IPPROTO_TCP
	    && atomic_read(&ip_vs_ftpsvc_counter)
	    && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
		/*
		 * Check if ftp service entry exists, the packet
		 * might belong to FTP data connections.
		 */
		svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
	}

	if (svc == NULL
	    && atomic_read(&ip_vs_nullsvc_counter)) {
		/*
		 * Check if the catch-all port (port zero) exists
		 */
		svc = __ip_vs_service_get(protocol, vaddr, 0);
	}

  out:
	read_unlock(&__ip_vs_svc_lock);

	IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
		  fwmark, ip_vs_proto_name(protocol),
		  NIPQUAD(vaddr), ntohs(vport),
		  svc?"hit":"not hit");

	return svc;
}
452 | |||
453 | |||
/*
 *	Bind a destination to a service, taking a reference on the
 *	service for the destination's pointer.
 */
static inline void
__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	atomic_inc(&svc->refcnt);
	dest->svc = svc;
}
460 | |||
/*
 *	Unbind a destination from its service and drop the reference;
 *	frees the service when this was the last reference.
 */
static inline void
__ip_vs_unbind_svc(struct ip_vs_dest *dest)
{
	struct ip_vs_service *svc = dest->svc;

	dest->svc = NULL;
	if (atomic_dec_and_test(&svc->refcnt))
		kfree(svc);
}
470 | |||
471 | |||
472 | /* | ||
473 | * Returns hash value for real service | ||
474 | */ | ||
475 | static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port) | ||
476 | { | ||
477 | register unsigned porth = ntohs(port); | ||
478 | |||
479 | return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth) | ||
480 | & IP_VS_RTAB_MASK; | ||
481 | } | ||
482 | |||
483 | /* | ||
484 | * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>. | ||
485 | * should be called with locked tables. | ||
486 | */ | ||
487 | static int ip_vs_rs_hash(struct ip_vs_dest *dest) | ||
488 | { | ||
489 | unsigned hash; | ||
490 | |||
491 | if (!list_empty(&dest->d_list)) { | ||
492 | return 0; | ||
493 | } | ||
494 | |||
495 | /* | ||
496 | * Hash by proto,addr,port, | ||
497 | * which are the parameters of the real service. | ||
498 | */ | ||
499 | hash = ip_vs_rs_hashkey(dest->addr, dest->port); | ||
500 | list_add(&dest->d_list, &ip_vs_rtable[hash]); | ||
501 | |||
502 | return 1; | ||
503 | } | ||
504 | |||
505 | /* | ||
506 | * UNhashes ip_vs_dest from ip_vs_rtable. | ||
507 | * should be called with locked tables. | ||
508 | */ | ||
509 | static int ip_vs_rs_unhash(struct ip_vs_dest *dest) | ||
510 | { | ||
511 | /* | ||
512 | * Remove it from the ip_vs_rtable table. | ||
513 | */ | ||
514 | if (!list_empty(&dest->d_list)) { | ||
515 | list_del(&dest->d_list); | ||
516 | INIT_LIST_HEAD(&dest->d_list); | ||
517 | } | ||
518 | |||
519 | return 1; | ||
520 | } | ||
521 | |||
/*
 *	Lookup real service by <proto,addr,port> in the real service table.
 *	Returns the first matching entry (fwmark-based dests match any
 *	protocol), or NULL.  NOTE(review): the returned pointer is not
 *	reference-counted here — confirm callers hold something that
 *	keeps the dest alive after the lock is released.
 */
struct ip_vs_dest *
ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
{
	unsigned hash;
	struct ip_vs_dest *dest;

	/*
	 *	Check for "full" addressed entries
	 *	Return the first found entry
	 */
	hash = ip_vs_rs_hashkey(daddr, dport);

	read_lock(&__ip_vs_rs_lock);
	list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
		if ((dest->addr == daddr)
		    && (dest->port == dport)
		    && ((dest->protocol == protocol) ||
			dest->vfwmark)) {
			/* HIT */
			read_unlock(&__ip_vs_rs_lock);
			return dest;
		}
	}
	read_unlock(&__ip_vs_rs_lock);

	return NULL;
}
552 | |||
553 | /* | ||
554 | * Lookup destination by {addr,port} in the given service | ||
555 | */ | ||
556 | static struct ip_vs_dest * | ||
557 | ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport) | ||
558 | { | ||
559 | struct ip_vs_dest *dest; | ||
560 | |||
561 | /* | ||
562 | * Find the destination for the given service | ||
563 | */ | ||
564 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
565 | if ((dest->addr == daddr) && (dest->port == dport)) { | ||
566 | /* HIT */ | ||
567 | return dest; | ||
568 | } | ||
569 | } | ||
570 | |||
571 | return NULL; | ||
572 | } | ||
573 | |||
574 | |||
/*
 *	Lookup dest by {svc,addr,port} in the destination trash.
 *	The destination trash is used to hold the destinations that are removed
 *	from the service table but are still referenced by some conn entries.
 *	The reason to add the destination trash is when the dest is temporary
 *	down (either by administrator or by monitor program), the dest can be
 *	picked back from the trash, the remaining connections to the dest can
 *	continue, and the counting information of the dest is also useful for
 *	scheduling.
 *
 *	Side effect: while scanning, any trash entry whose refcnt has
 *	dropped to 1 (only the trash list holds it) is freed.
 */
static struct ip_vs_dest *
ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
{
	struct ip_vs_dest *dest, *nxt;

	/*
	 *	Find the destination in trash
	 */
	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
		IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
			  "refcnt=%d\n",
			  dest->vfwmark,
			  NIPQUAD(dest->addr), ntohs(dest->port),
			  atomic_read(&dest->refcnt));
		/* match on real <addr,port>, protocol, and the virtual
		   identity (fwmark, or vaddr/vport for normal services) */
		if (dest->addr == daddr &&
		    dest->port == dport &&
		    dest->vfwmark == svc->fwmark &&
		    dest->protocol == svc->protocol &&
		    (svc->fwmark ||
		     (dest->vaddr == svc->addr &&
		      dest->vport == svc->port))) {
			/* HIT */
			return dest;
		}

		/*
		 * Try to purge the destination from trash if not referenced
		 */
		if (atomic_read(&dest->refcnt) == 1) {
			IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
				  "from trash\n",
				  dest->vfwmark,
				  NIPQUAD(dest->addr), ntohs(dest->port));
			list_del(&dest->n_list);
			ip_vs_dst_reset(dest);
			__ip_vs_unbind_svc(dest);
			kfree(dest);
		}
	}

	return NULL;
}
627 | |||
628 | |||
/*
 *	Clean up all the destinations in the trash
 *	Called by the ip_vs_control_cleanup()
 *
 *	When the ip_vs_control_clearup is activated by ipvs module exit,
 *	the service tables must have been flushed and all the connections
 *	are expired, and the refcnt of each destination in the trash must
 *	be 1, so we simply release them here.
 */
static void ip_vs_trash_cleanup(void)
{
	struct ip_vs_dest *dest, *nxt;

	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
		list_del(&dest->n_list);
		ip_vs_dst_reset(dest);
		__ip_vs_unbind_svc(dest);
		kfree(dest);
	}
}
649 | |||
650 | |||
/*
 *	Zero an ip_vs_stats block.  Only the bytes that precede the
 *	embedded lock member are cleared (the counters), so the spinlock
 *	itself stays intact; relies on the lock being placed after the
 *	counter fields in struct ip_vs_stats.
 */
static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
	spin_lock_bh(&stats->lock);
	memset(stats, 0, (char *)&stats->lock - (char *)stats);
	spin_unlock_bh(&stats->lock);
	ip_vs_zero_estimator(stats);
}
659 | |||
/*
 *	Update a destination in the given service
 *
 *	Applies the user-supplied weight, forwarding flags and
 *	thresholds to dest, (re)binds it to svc, and marks it
 *	available.  Called for both new and existing destinations.
 */
static void
__ip_vs_update_dest(struct ip_vs_service *svc,
		    struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
{
	int conn_flags;

	/* set the weight and the flags */
	atomic_set(&dest->weight, udest->weight);
	conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;

	/* check if local node and update the flags */
	if (inet_addr_type(udest->addr) == RTN_LOCAL) {
		conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
			| IP_VS_CONN_F_LOCALNODE;
	}

	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
	} else {
		/*
		 *    Put the real service in ip_vs_rtable if not present.
		 *    For now only for NAT!
		 */
		write_lock_bh(&__ip_vs_rs_lock);
		ip_vs_rs_hash(dest);
		write_unlock_bh(&__ip_vs_rs_lock);
	}
	atomic_set(&dest->conn_flags, conn_flags);

	/* bind the service */
	if (!dest->svc) {
		__ip_vs_bind_svc(dest, svc);
	} else {
		if (dest->svc != svc) {
			/* moving to a different service: drop the old
			   binding and reset stats before rebinding */
			__ip_vs_unbind_svc(dest);
			ip_vs_zero_stats(&dest->stats);
			__ip_vs_bind_svc(dest, svc);
		}
	}

	/* set the dest status flags */
	dest->flags |= IP_VS_DEST_F_AVAILABLE;

	/* clear overload when the upper threshold is raised or removed */
	if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	dest->u_threshold = udest->u_threshold;
	dest->l_threshold = udest->l_threshold;
}
712 | |||
713 | |||
714 | /* | ||
715 | * Create a destination for the given service | ||
716 | */ | ||
717 | static int | ||
718 | ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, | ||
719 | struct ip_vs_dest **dest_p) | ||
720 | { | ||
721 | struct ip_vs_dest *dest; | ||
722 | unsigned atype; | ||
723 | |||
724 | EnterFunction(2); | ||
725 | |||
726 | atype = inet_addr_type(udest->addr); | ||
727 | if (atype != RTN_LOCAL && atype != RTN_UNICAST) | ||
728 | return -EINVAL; | ||
729 | |||
730 | dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); | ||
731 | if (dest == NULL) { | ||
732 | IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n"); | ||
733 | return -ENOMEM; | ||
734 | } | ||
735 | memset(dest, 0, sizeof(struct ip_vs_dest)); | ||
736 | |||
737 | dest->protocol = svc->protocol; | ||
738 | dest->vaddr = svc->addr; | ||
739 | dest->vport = svc->port; | ||
740 | dest->vfwmark = svc->fwmark; | ||
741 | dest->addr = udest->addr; | ||
742 | dest->port = udest->port; | ||
743 | |||
744 | atomic_set(&dest->activeconns, 0); | ||
745 | atomic_set(&dest->inactconns, 0); | ||
746 | atomic_set(&dest->persistconns, 0); | ||
747 | atomic_set(&dest->refcnt, 0); | ||
748 | |||
749 | INIT_LIST_HEAD(&dest->d_list); | ||
750 | spin_lock_init(&dest->dst_lock); | ||
751 | spin_lock_init(&dest->stats.lock); | ||
752 | __ip_vs_update_dest(svc, dest, udest); | ||
753 | ip_vs_new_estimator(&dest->stats); | ||
754 | |||
755 | *dest_p = dest; | ||
756 | |||
757 | LeaveFunction(2); | ||
758 | return 0; | ||
759 | } | ||
760 | |||
761 | |||
762 | /* | ||
763 | * Add a destination into an existing service | ||
764 | */ | ||
765 | static int | ||
766 | ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) | ||
767 | { | ||
768 | struct ip_vs_dest *dest; | ||
769 | __u32 daddr = udest->addr; | ||
770 | __u16 dport = udest->port; | ||
771 | int ret; | ||
772 | |||
773 | EnterFunction(2); | ||
774 | |||
775 | if (udest->weight < 0) { | ||
776 | IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); | ||
777 | return -ERANGE; | ||
778 | } | ||
779 | |||
780 | if (udest->l_threshold > udest->u_threshold) { | ||
781 | IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than " | ||
782 | "upper threshold\n"); | ||
783 | return -ERANGE; | ||
784 | } | ||
785 | |||
786 | /* | ||
787 | * Check if the dest already exists in the list | ||
788 | */ | ||
789 | dest = ip_vs_lookup_dest(svc, daddr, dport); | ||
790 | if (dest != NULL) { | ||
791 | IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); | ||
792 | return -EEXIST; | ||
793 | } | ||
794 | |||
795 | /* | ||
796 | * Check if the dest already exists in the trash and | ||
797 | * is from the same service | ||
798 | */ | ||
799 | dest = ip_vs_trash_get_dest(svc, daddr, dport); | ||
800 | if (dest != NULL) { | ||
801 | IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " | ||
802 | "refcnt=%d, service %u/%u.%u.%u.%u:%u\n", | ||
803 | NIPQUAD(daddr), ntohs(dport), | ||
804 | atomic_read(&dest->refcnt), | ||
805 | dest->vfwmark, | ||
806 | NIPQUAD(dest->vaddr), | ||
807 | ntohs(dest->vport)); | ||
808 | __ip_vs_update_dest(svc, dest, udest); | ||
809 | |||
810 | /* | ||
811 | * Get the destination from the trash | ||
812 | */ | ||
813 | list_del(&dest->n_list); | ||
814 | |||
815 | ip_vs_new_estimator(&dest->stats); | ||
816 | |||
817 | write_lock_bh(&__ip_vs_svc_lock); | ||
818 | |||
819 | /* | ||
820 | * Wait until all other svc users go away. | ||
821 | */ | ||
822 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); | ||
823 | |||
824 | list_add(&dest->n_list, &svc->destinations); | ||
825 | svc->num_dests++; | ||
826 | |||
827 | /* call the update_service function of its scheduler */ | ||
828 | svc->scheduler->update_service(svc); | ||
829 | |||
830 | write_unlock_bh(&__ip_vs_svc_lock); | ||
831 | return 0; | ||
832 | } | ||
833 | |||
834 | /* | ||
835 | * Allocate and initialize the dest structure | ||
836 | */ | ||
837 | ret = ip_vs_new_dest(svc, udest, &dest); | ||
838 | if (ret) { | ||
839 | return ret; | ||
840 | } | ||
841 | |||
842 | /* | ||
843 | * Add the dest entry into the list | ||
844 | */ | ||
845 | atomic_inc(&dest->refcnt); | ||
846 | |||
847 | write_lock_bh(&__ip_vs_svc_lock); | ||
848 | |||
849 | /* | ||
850 | * Wait until all other svc users go away. | ||
851 | */ | ||
852 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); | ||
853 | |||
854 | list_add(&dest->n_list, &svc->destinations); | ||
855 | svc->num_dests++; | ||
856 | |||
857 | /* call the update_service function of its scheduler */ | ||
858 | svc->scheduler->update_service(svc); | ||
859 | |||
860 | write_unlock_bh(&__ip_vs_svc_lock); | ||
861 | |||
862 | LeaveFunction(2); | ||
863 | |||
864 | return 0; | ||
865 | } | ||
866 | |||
867 | |||
868 | /* | ||
869 | * Edit a destination in the given service | ||
870 | */ | ||
871 | static int | ||
872 | ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) | ||
873 | { | ||
874 | struct ip_vs_dest *dest; | ||
875 | __u32 daddr = udest->addr; | ||
876 | __u16 dport = udest->port; | ||
877 | |||
878 | EnterFunction(2); | ||
879 | |||
880 | if (udest->weight < 0) { | ||
881 | IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n"); | ||
882 | return -ERANGE; | ||
883 | } | ||
884 | |||
885 | if (udest->l_threshold > udest->u_threshold) { | ||
886 | IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than " | ||
887 | "upper threshold\n"); | ||
888 | return -ERANGE; | ||
889 | } | ||
890 | |||
891 | /* | ||
892 | * Lookup the destination list | ||
893 | */ | ||
894 | dest = ip_vs_lookup_dest(svc, daddr, dport); | ||
895 | if (dest == NULL) { | ||
896 | IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); | ||
897 | return -ENOENT; | ||
898 | } | ||
899 | |||
900 | __ip_vs_update_dest(svc, dest, udest); | ||
901 | |||
902 | write_lock_bh(&__ip_vs_svc_lock); | ||
903 | |||
904 | /* Wait until all other svc users go away */ | ||
905 | while (atomic_read(&svc->usecnt) > 1) {}; | ||
906 | |||
907 | /* call the update_service, because server weight may be changed */ | ||
908 | svc->scheduler->update_service(svc); | ||
909 | |||
910 | write_unlock_bh(&__ip_vs_svc_lock); | ||
911 | |||
912 | LeaveFunction(2); | ||
913 | |||
914 | return 0; | ||
915 | } | ||
916 | |||
917 | |||
/*
 *	Delete a destination (must be already unlinked from the service).
 *	Drops the list's reference; the dest is freed outright when that was
 *	the last reference, otherwise it is parked in the trash so that
 *	connections still pointing at it keep a valid object.
 */
static void __ip_vs_del_dest(struct ip_vs_dest *dest)
{
	/* Stop the rate estimator first; no new samples for this dest. */
	ip_vs_kill_estimator(&dest->stats);

	/*
	 *  Remove it from the d-linked list with the real services.
	 */
	write_lock_bh(&__ip_vs_rs_lock);
	ip_vs_rs_unhash(dest);
	write_unlock_bh(&__ip_vs_rs_lock);

	/*
	 *  Decrease the refcnt of the dest, and free the dest
	 *  if nobody refers to it (refcnt=0). Otherwise, throw
	 *  the destination into the trash.
	 */
	if (atomic_dec_and_test(&dest->refcnt)) {
		ip_vs_dst_reset(dest);
		/* simply decrease svc->refcnt here, let the caller check
		   and release the service if nobody refers to it.
		   Only user context can release destination and service,
		   and only one user context can update virtual service at a
		   time, so the operation here is OK */
		atomic_dec(&dest->svc->refcnt);
		kfree(dest);
	} else {
		IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
			  NIPQUAD(dest->addr), ntohs(dest->port),
			  atomic_read(&dest->refcnt));
		/* The trash list itself takes one reference (released by
		 * ip_vs_trash_cleanup() or when the dest is reused). */
		list_add(&dest->n_list, &ip_vs_dest_trash);
		atomic_inc(&dest->refcnt);
	}
}
954 | |||
955 | |||
956 | /* | ||
957 | * Unlink a destination from the given service | ||
958 | */ | ||
959 | static void __ip_vs_unlink_dest(struct ip_vs_service *svc, | ||
960 | struct ip_vs_dest *dest, | ||
961 | int svcupd) | ||
962 | { | ||
963 | dest->flags &= ~IP_VS_DEST_F_AVAILABLE; | ||
964 | |||
965 | /* | ||
966 | * Remove it from the d-linked destination list. | ||
967 | */ | ||
968 | list_del(&dest->n_list); | ||
969 | svc->num_dests--; | ||
970 | if (svcupd) { | ||
971 | /* | ||
972 | * Call the update_service function of its scheduler | ||
973 | */ | ||
974 | svc->scheduler->update_service(svc); | ||
975 | } | ||
976 | } | ||
977 | |||
978 | |||
979 | /* | ||
980 | * Delete a destination server in the given service | ||
981 | */ | ||
982 | static int | ||
983 | ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest) | ||
984 | { | ||
985 | struct ip_vs_dest *dest; | ||
986 | __u32 daddr = udest->addr; | ||
987 | __u16 dport = udest->port; | ||
988 | |||
989 | EnterFunction(2); | ||
990 | |||
991 | dest = ip_vs_lookup_dest(svc, daddr, dport); | ||
992 | if (dest == NULL) { | ||
993 | IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); | ||
994 | return -ENOENT; | ||
995 | } | ||
996 | |||
997 | write_lock_bh(&__ip_vs_svc_lock); | ||
998 | |||
999 | /* | ||
1000 | * Wait until all other svc users go away. | ||
1001 | */ | ||
1002 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); | ||
1003 | |||
1004 | /* | ||
1005 | * Unlink dest from the service | ||
1006 | */ | ||
1007 | __ip_vs_unlink_dest(svc, dest, 1); | ||
1008 | |||
1009 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1010 | |||
1011 | /* | ||
1012 | * Delete the destination | ||
1013 | */ | ||
1014 | __ip_vs_del_dest(dest); | ||
1015 | |||
1016 | LeaveFunction(2); | ||
1017 | |||
1018 | return 0; | ||
1019 | } | ||
1020 | |||
1021 | |||
1022 | /* | ||
1023 | * Add a service into the service hash table | ||
1024 | */ | ||
1025 | static int | ||
1026 | ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p) | ||
1027 | { | ||
1028 | int ret = 0; | ||
1029 | struct ip_vs_scheduler *sched = NULL; | ||
1030 | struct ip_vs_service *svc = NULL; | ||
1031 | |||
1032 | /* increase the module use count */ | ||
1033 | ip_vs_use_count_inc(); | ||
1034 | |||
1035 | /* Lookup the scheduler by 'u->sched_name' */ | ||
1036 | sched = ip_vs_scheduler_get(u->sched_name); | ||
1037 | if (sched == NULL) { | ||
1038 | IP_VS_INFO("Scheduler module ip_vs_%s not found\n", | ||
1039 | u->sched_name); | ||
1040 | ret = -ENOENT; | ||
1041 | goto out_mod_dec; | ||
1042 | } | ||
1043 | |||
1044 | svc = (struct ip_vs_service *) | ||
1045 | kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); | ||
1046 | if (svc == NULL) { | ||
1047 | IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); | ||
1048 | ret = -ENOMEM; | ||
1049 | goto out_err; | ||
1050 | } | ||
1051 | memset(svc, 0, sizeof(struct ip_vs_service)); | ||
1052 | |||
1053 | /* I'm the first user of the service */ | ||
1054 | atomic_set(&svc->usecnt, 1); | ||
1055 | atomic_set(&svc->refcnt, 0); | ||
1056 | |||
1057 | svc->protocol = u->protocol; | ||
1058 | svc->addr = u->addr; | ||
1059 | svc->port = u->port; | ||
1060 | svc->fwmark = u->fwmark; | ||
1061 | svc->flags = u->flags; | ||
1062 | svc->timeout = u->timeout * HZ; | ||
1063 | svc->netmask = u->netmask; | ||
1064 | |||
1065 | INIT_LIST_HEAD(&svc->destinations); | ||
1066 | rwlock_init(&svc->sched_lock); | ||
1067 | spin_lock_init(&svc->stats.lock); | ||
1068 | |||
1069 | /* Bind the scheduler */ | ||
1070 | ret = ip_vs_bind_scheduler(svc, sched); | ||
1071 | if (ret) | ||
1072 | goto out_err; | ||
1073 | sched = NULL; | ||
1074 | |||
1075 | /* Update the virtual service counters */ | ||
1076 | if (svc->port == FTPPORT) | ||
1077 | atomic_inc(&ip_vs_ftpsvc_counter); | ||
1078 | else if (svc->port == 0) | ||
1079 | atomic_inc(&ip_vs_nullsvc_counter); | ||
1080 | |||
1081 | ip_vs_new_estimator(&svc->stats); | ||
1082 | ip_vs_num_services++; | ||
1083 | |||
1084 | /* Hash the service into the service table */ | ||
1085 | write_lock_bh(&__ip_vs_svc_lock); | ||
1086 | ip_vs_svc_hash(svc); | ||
1087 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1088 | |||
1089 | *svc_p = svc; | ||
1090 | return 0; | ||
1091 | |||
1092 | out_err: | ||
1093 | if (svc != NULL) { | ||
1094 | if (svc->scheduler) | ||
1095 | ip_vs_unbind_scheduler(svc); | ||
1096 | if (svc->inc) { | ||
1097 | local_bh_disable(); | ||
1098 | ip_vs_app_inc_put(svc->inc); | ||
1099 | local_bh_enable(); | ||
1100 | } | ||
1101 | kfree(svc); | ||
1102 | } | ||
1103 | ip_vs_scheduler_put(sched); | ||
1104 | |||
1105 | out_mod_dec: | ||
1106 | /* decrease the module use count */ | ||
1107 | ip_vs_use_count_dec(); | ||
1108 | |||
1109 | return ret; | ||
1110 | } | ||
1111 | |||
1112 | |||
/*
 *	Edit a service and bind it with a new scheduler.
 *	Invariant for the cleanup at 'out': whatever pointer ends up in
 *	old_sched is the scheduler whose module reference gets dropped.
 */
static int
ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
{
	struct ip_vs_scheduler *sched, *old_sched;
	int ret = 0;

	/*
	 * Lookup the scheduler, by 'u->sched_name'
	 */
	sched = ip_vs_scheduler_get(u->sched_name);
	if (sched == NULL) {
		IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
			   u->sched_name);
		return -ENOENT;
	}
	/* Default: if the scheduler is unchanged, release the reference
	 * we just took (old_sched == sched at 'out'). */
	old_sched = sched;

	write_lock_bh(&__ip_vs_svc_lock);

	/*
	 * Wait until all other svc users go away.
	 */
	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);

	/*
	 * Set the flags and timeout value
	 */
	svc->flags = u->flags | IP_VS_SVC_F_HASHED;
	svc->timeout = u->timeout * HZ;
	svc->netmask = u->netmask;

	old_sched = svc->scheduler;
	if (sched != old_sched) {
		/*
		 * Unbind the old scheduler
		 */
		if ((ret = ip_vs_unbind_scheduler(svc))) {
			/* Unbind failed: keep the old binding, drop the
			 * reference on the newly looked-up scheduler. */
			old_sched = sched;
			goto out;
		}

		/*
		 * Bind the new scheduler
		 */
		if ((ret = ip_vs_bind_scheduler(svc, sched))) {
			/*
			 * If ip_vs_bind_scheduler fails, restore the old
			 * scheduler.
			 * The main reason of failure is out of memory.
			 *
			 * The question is if the old scheduler can be
			 * restored all the time. TODO: if it cannot be
			 * restored some time, we must delete the service,
			 * otherwise the system may crash.
			 */
			ip_vs_bind_scheduler(svc, old_sched);
			old_sched = sched;
			goto out;
		}
	}

  out:
	write_unlock_bh(&__ip_vs_svc_lock);

	/* Drop the module reference of whichever scheduler lost out. */
	if (old_sched)
		ip_vs_scheduler_put(old_sched);

	return ret;
}
1185 | |||
1186 | |||
/*
 *	Delete a service from the service list
 *	- The service must be unlinked, unlocked and not referenced!
 *	- We are called under _bh lock
 */
static void __ip_vs_del_service(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest, *nxt;
	struct ip_vs_scheduler *old_sched;

	ip_vs_num_services--;
	/* Stop the rate estimator before tearing anything else down. */
	ip_vs_kill_estimator(&svc->stats);

	/* Unbind scheduler */
	old_sched = svc->scheduler;
	ip_vs_unbind_scheduler(svc);
	if (old_sched)
		ip_vs_scheduler_put(old_sched);	/* drop scheduler module ref */

	/* Unbind app inc */
	if (svc->inc) {
		ip_vs_app_inc_put(svc->inc);
		svc->inc = NULL;
	}

	/*
	 * Unlink the whole destination list
	 */
	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
		/* svcupd=0: no point notifying the scheduler per dest
		 * while the whole service is going away. */
		__ip_vs_unlink_dest(svc, dest, 0);
		__ip_vs_del_dest(dest);
	}

	/*
	 * Update the virtual service counters
	 */
	if (svc->port == FTPPORT)
		atomic_dec(&ip_vs_ftpsvc_counter);
	else if (svc->port == 0)
		atomic_dec(&ip_vs_nullsvc_counter);

	/*
	 * Free the service if nobody refers to it.
	 * NOTE(review): when refcnt > 0 the struct stays allocated here —
	 * presumably released once the remaining dest references (see
	 * __ip_vs_del_dest, which decrements svc->refcnt) drain; verify.
	 */
	if (atomic_read(&svc->refcnt) == 0)
		kfree(svc);

	/* decrease the module use count */
	ip_vs_use_count_dec();
}
1237 | |||
1238 | /* | ||
1239 | * Delete a service from the service list | ||
1240 | */ | ||
1241 | static int ip_vs_del_service(struct ip_vs_service *svc) | ||
1242 | { | ||
1243 | if (svc == NULL) | ||
1244 | return -EEXIST; | ||
1245 | |||
1246 | /* | ||
1247 | * Unhash it from the service table | ||
1248 | */ | ||
1249 | write_lock_bh(&__ip_vs_svc_lock); | ||
1250 | |||
1251 | ip_vs_svc_unhash(svc); | ||
1252 | |||
1253 | /* | ||
1254 | * Wait until all the svc users go away. | ||
1255 | */ | ||
1256 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); | ||
1257 | |||
1258 | __ip_vs_del_service(svc); | ||
1259 | |||
1260 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1261 | |||
1262 | return 0; | ||
1263 | } | ||
1264 | |||
1265 | |||
1266 | /* | ||
1267 | * Flush all the virtual services | ||
1268 | */ | ||
1269 | static int ip_vs_flush(void) | ||
1270 | { | ||
1271 | int idx; | ||
1272 | struct ip_vs_service *svc, *nxt; | ||
1273 | |||
1274 | /* | ||
1275 | * Flush the service table hashed by <protocol,addr,port> | ||
1276 | */ | ||
1277 | for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
1278 | list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { | ||
1279 | write_lock_bh(&__ip_vs_svc_lock); | ||
1280 | ip_vs_svc_unhash(svc); | ||
1281 | /* | ||
1282 | * Wait until all the svc users go away. | ||
1283 | */ | ||
1284 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); | ||
1285 | __ip_vs_del_service(svc); | ||
1286 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1287 | } | ||
1288 | } | ||
1289 | |||
1290 | /* | ||
1291 | * Flush the service table hashed by fwmark | ||
1292 | */ | ||
1293 | for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
1294 | list_for_each_entry_safe(svc, nxt, | ||
1295 | &ip_vs_svc_fwm_table[idx], f_list) { | ||
1296 | write_lock_bh(&__ip_vs_svc_lock); | ||
1297 | ip_vs_svc_unhash(svc); | ||
1298 | /* | ||
1299 | * Wait until all the svc users go away. | ||
1300 | */ | ||
1301 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); | ||
1302 | __ip_vs_del_service(svc); | ||
1303 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1304 | } | ||
1305 | } | ||
1306 | |||
1307 | return 0; | ||
1308 | } | ||
1309 | |||
1310 | |||
1311 | /* | ||
1312 | * Zero counters in a service or all services | ||
1313 | */ | ||
1314 | static int ip_vs_zero_service(struct ip_vs_service *svc) | ||
1315 | { | ||
1316 | struct ip_vs_dest *dest; | ||
1317 | |||
1318 | write_lock_bh(&__ip_vs_svc_lock); | ||
1319 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
1320 | ip_vs_zero_stats(&dest->stats); | ||
1321 | } | ||
1322 | ip_vs_zero_stats(&svc->stats); | ||
1323 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1324 | return 0; | ||
1325 | } | ||
1326 | |||
1327 | static int ip_vs_zero_all(void) | ||
1328 | { | ||
1329 | int idx; | ||
1330 | struct ip_vs_service *svc; | ||
1331 | |||
1332 | for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
1333 | list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { | ||
1334 | ip_vs_zero_service(svc); | ||
1335 | } | ||
1336 | } | ||
1337 | |||
1338 | for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
1339 | list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { | ||
1340 | ip_vs_zero_service(svc); | ||
1341 | } | ||
1342 | } | ||
1343 | |||
1344 | ip_vs_zero_stats(&ip_vs_stats); | ||
1345 | return 0; | ||
1346 | } | ||
1347 | |||
1348 | |||
1349 | static int | ||
1350 | proc_do_defense_mode(ctl_table *table, int write, struct file * filp, | ||
1351 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1352 | { | ||
1353 | int *valp = table->data; | ||
1354 | int val = *valp; | ||
1355 | int rc; | ||
1356 | |||
1357 | rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); | ||
1358 | if (write && (*valp != val)) { | ||
1359 | if ((*valp < 0) || (*valp > 3)) { | ||
1360 | /* Restore the correct value */ | ||
1361 | *valp = val; | ||
1362 | } else { | ||
1363 | local_bh_disable(); | ||
1364 | update_defense_level(); | ||
1365 | local_bh_enable(); | ||
1366 | } | ||
1367 | } | ||
1368 | return rc; | ||
1369 | } | ||
1370 | |||
1371 | |||
1372 | static int | ||
1373 | proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, | ||
1374 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1375 | { | ||
1376 | int *valp = table->data; | ||
1377 | int val[2]; | ||
1378 | int rc; | ||
1379 | |||
1380 | /* backup the value first */ | ||
1381 | memcpy(val, valp, sizeof(val)); | ||
1382 | |||
1383 | rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); | ||
1384 | if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { | ||
1385 | /* Restore the correct value */ | ||
1386 | memcpy(valp, val, sizeof(val)); | ||
1387 | } | ||
1388 | return rc; | ||
1389 | } | ||
1390 | |||
1391 | |||
1392 | /* | ||
1393 | * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) | ||
1394 | */ | ||
1395 | |||
1396 | static struct ctl_table vs_vars[] = { | ||
1397 | { | ||
1398 | .ctl_name = NET_IPV4_VS_AMEMTHRESH, | ||
1399 | .procname = "amemthresh", | ||
1400 | .data = &sysctl_ip_vs_amemthresh, | ||
1401 | .maxlen = sizeof(int), | ||
1402 | .mode = 0644, | ||
1403 | .proc_handler = &proc_dointvec, | ||
1404 | }, | ||
1405 | #ifdef CONFIG_IP_VS_DEBUG | ||
1406 | { | ||
1407 | .ctl_name = NET_IPV4_VS_DEBUG_LEVEL, | ||
1408 | .procname = "debug_level", | ||
1409 | .data = &sysctl_ip_vs_debug_level, | ||
1410 | .maxlen = sizeof(int), | ||
1411 | .mode = 0644, | ||
1412 | .proc_handler = &proc_dointvec, | ||
1413 | }, | ||
1414 | #endif | ||
1415 | { | ||
1416 | .ctl_name = NET_IPV4_VS_AMDROPRATE, | ||
1417 | .procname = "am_droprate", | ||
1418 | .data = &sysctl_ip_vs_am_droprate, | ||
1419 | .maxlen = sizeof(int), | ||
1420 | .mode = 0644, | ||
1421 | .proc_handler = &proc_dointvec, | ||
1422 | }, | ||
1423 | { | ||
1424 | .ctl_name = NET_IPV4_VS_DROP_ENTRY, | ||
1425 | .procname = "drop_entry", | ||
1426 | .data = &sysctl_ip_vs_drop_entry, | ||
1427 | .maxlen = sizeof(int), | ||
1428 | .mode = 0644, | ||
1429 | .proc_handler = &proc_do_defense_mode, | ||
1430 | }, | ||
1431 | { | ||
1432 | .ctl_name = NET_IPV4_VS_DROP_PACKET, | ||
1433 | .procname = "drop_packet", | ||
1434 | .data = &sysctl_ip_vs_drop_packet, | ||
1435 | .maxlen = sizeof(int), | ||
1436 | .mode = 0644, | ||
1437 | .proc_handler = &proc_do_defense_mode, | ||
1438 | }, | ||
1439 | { | ||
1440 | .ctl_name = NET_IPV4_VS_SECURE_TCP, | ||
1441 | .procname = "secure_tcp", | ||
1442 | .data = &sysctl_ip_vs_secure_tcp, | ||
1443 | .maxlen = sizeof(int), | ||
1444 | .mode = 0644, | ||
1445 | .proc_handler = &proc_do_defense_mode, | ||
1446 | }, | ||
1447 | #if 0 | ||
1448 | { | ||
1449 | .ctl_name = NET_IPV4_VS_TO_ES, | ||
1450 | .procname = "timeout_established", | ||
1451 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], | ||
1452 | .maxlen = sizeof(int), | ||
1453 | .mode = 0644, | ||
1454 | .proc_handler = &proc_dointvec_jiffies, | ||
1455 | }, | ||
1456 | { | ||
1457 | .ctl_name = NET_IPV4_VS_TO_SS, | ||
1458 | .procname = "timeout_synsent", | ||
1459 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], | ||
1460 | .maxlen = sizeof(int), | ||
1461 | .mode = 0644, | ||
1462 | .proc_handler = &proc_dointvec_jiffies, | ||
1463 | }, | ||
1464 | { | ||
1465 | .ctl_name = NET_IPV4_VS_TO_SR, | ||
1466 | .procname = "timeout_synrecv", | ||
1467 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], | ||
1468 | .maxlen = sizeof(int), | ||
1469 | .mode = 0644, | ||
1470 | .proc_handler = &proc_dointvec_jiffies, | ||
1471 | }, | ||
1472 | { | ||
1473 | .ctl_name = NET_IPV4_VS_TO_FW, | ||
1474 | .procname = "timeout_finwait", | ||
1475 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], | ||
1476 | .maxlen = sizeof(int), | ||
1477 | .mode = 0644, | ||
1478 | .proc_handler = &proc_dointvec_jiffies, | ||
1479 | }, | ||
1480 | { | ||
1481 | .ctl_name = NET_IPV4_VS_TO_TW, | ||
1482 | .procname = "timeout_timewait", | ||
1483 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], | ||
1484 | .maxlen = sizeof(int), | ||
1485 | .mode = 0644, | ||
1486 | .proc_handler = &proc_dointvec_jiffies, | ||
1487 | }, | ||
1488 | { | ||
1489 | .ctl_name = NET_IPV4_VS_TO_CL, | ||
1490 | .procname = "timeout_close", | ||
1491 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], | ||
1492 | .maxlen = sizeof(int), | ||
1493 | .mode = 0644, | ||
1494 | .proc_handler = &proc_dointvec_jiffies, | ||
1495 | }, | ||
1496 | { | ||
1497 | .ctl_name = NET_IPV4_VS_TO_CW, | ||
1498 | .procname = "timeout_closewait", | ||
1499 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], | ||
1500 | .maxlen = sizeof(int), | ||
1501 | .mode = 0644, | ||
1502 | .proc_handler = &proc_dointvec_jiffies, | ||
1503 | }, | ||
1504 | { | ||
1505 | .ctl_name = NET_IPV4_VS_TO_LA, | ||
1506 | .procname = "timeout_lastack", | ||
1507 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], | ||
1508 | .maxlen = sizeof(int), | ||
1509 | .mode = 0644, | ||
1510 | .proc_handler = &proc_dointvec_jiffies, | ||
1511 | }, | ||
1512 | { | ||
1513 | .ctl_name = NET_IPV4_VS_TO_LI, | ||
1514 | .procname = "timeout_listen", | ||
1515 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], | ||
1516 | .maxlen = sizeof(int), | ||
1517 | .mode = 0644, | ||
1518 | .proc_handler = &proc_dointvec_jiffies, | ||
1519 | }, | ||
1520 | { | ||
1521 | .ctl_name = NET_IPV4_VS_TO_SA, | ||
1522 | .procname = "timeout_synack", | ||
1523 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], | ||
1524 | .maxlen = sizeof(int), | ||
1525 | .mode = 0644, | ||
1526 | .proc_handler = &proc_dointvec_jiffies, | ||
1527 | }, | ||
1528 | { | ||
1529 | .ctl_name = NET_IPV4_VS_TO_UDP, | ||
1530 | .procname = "timeout_udp", | ||
1531 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP], | ||
1532 | .maxlen = sizeof(int), | ||
1533 | .mode = 0644, | ||
1534 | .proc_handler = &proc_dointvec_jiffies, | ||
1535 | }, | ||
1536 | { | ||
1537 | .ctl_name = NET_IPV4_VS_TO_ICMP, | ||
1538 | .procname = "timeout_icmp", | ||
1539 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], | ||
1540 | .maxlen = sizeof(int), | ||
1541 | .mode = 0644, | ||
1542 | .proc_handler = &proc_dointvec_jiffies, | ||
1543 | }, | ||
1544 | #endif | ||
1545 | { | ||
1546 | .ctl_name = NET_IPV4_VS_CACHE_BYPASS, | ||
1547 | .procname = "cache_bypass", | ||
1548 | .data = &sysctl_ip_vs_cache_bypass, | ||
1549 | .maxlen = sizeof(int), | ||
1550 | .mode = 0644, | ||
1551 | .proc_handler = &proc_dointvec, | ||
1552 | }, | ||
1553 | { | ||
1554 | .ctl_name = NET_IPV4_VS_EXPIRE_NODEST_CONN, | ||
1555 | .procname = "expire_nodest_conn", | ||
1556 | .data = &sysctl_ip_vs_expire_nodest_conn, | ||
1557 | .maxlen = sizeof(int), | ||
1558 | .mode = 0644, | ||
1559 | .proc_handler = &proc_dointvec, | ||
1560 | }, | ||
1561 | { | ||
1562 | .ctl_name = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE, | ||
1563 | .procname = "expire_quiescent_template", | ||
1564 | .data = &sysctl_ip_vs_expire_quiescent_template, | ||
1565 | .maxlen = sizeof(int), | ||
1566 | .mode = 0644, | ||
1567 | .proc_handler = &proc_dointvec, | ||
1568 | }, | ||
1569 | { | ||
1570 | .ctl_name = NET_IPV4_VS_SYNC_THRESHOLD, | ||
1571 | .procname = "sync_threshold", | ||
1572 | .data = &sysctl_ip_vs_sync_threshold, | ||
1573 | .maxlen = sizeof(sysctl_ip_vs_sync_threshold), | ||
1574 | .mode = 0644, | ||
1575 | .proc_handler = &proc_do_sync_threshold, | ||
1576 | }, | ||
1577 | { | ||
1578 | .ctl_name = NET_IPV4_VS_NAT_ICMP_SEND, | ||
1579 | .procname = "nat_icmp_send", | ||
1580 | .data = &sysctl_ip_vs_nat_icmp_send, | ||
1581 | .maxlen = sizeof(int), | ||
1582 | .mode = 0644, | ||
1583 | .proc_handler = &proc_dointvec, | ||
1584 | }, | ||
1585 | { .ctl_name = 0 } | ||
1586 | }; | ||
1587 | |||
/* Directory node /proc/sys/net/ipv4/vs, holding the vs_vars entries. */
static ctl_table vs_table[] = {
	{
		.ctl_name	= NET_IPV4_VS,
		.procname	= "vs",
		.mode		= 0555,
		.child		= vs_vars
	},
	{ .ctl_name = 0 }
};
1597 | |||
/* Directory node /proc/sys/net/ipv4, parent of the vs directory. */
static ctl_table ipv4_table[] = {
	{
		.ctl_name	= NET_IPV4,
		.procname	= "ipv4",
		.mode		= 0555,
		.child		= vs_table,
	},
	{ .ctl_name = 0 }
};
1607 | |||
/* Root of the sysctl tree registered by this module: net -> ipv4 -> vs. */
static ctl_table vs_root_table[] = {
	{
		.ctl_name	= CTL_NET,
		.procname	= "net",
		.mode		= 0555,
		.child		= ipv4_table,
	},
	{ .ctl_name = 0 }
};
1617 | |||
/* Handle for the registered vs_root_table sysctl tree — presumably set
 * at init and used to unregister at cleanup (both outside this chunk). */
static struct ctl_table_header * sysctl_header;
1619 | |||
1620 | #ifdef CONFIG_PROC_FS | ||
1621 | |||
/* seq_file iterator state for /proc listing of virtual services. */
struct ip_vs_iter {
	struct list_head *table;	/* which table: ip_vs_svc_table or
					 * ip_vs_svc_fwm_table */
	int bucket;			/* current bucket in that table */
};
1626 | |||
1627 | /* | ||
1628 | * Write the contents of the VS rule table to a PROCfs file. | ||
1629 | * (It is kept just for backward compatibility) | ||
1630 | */ | ||
1631 | static inline const char *ip_vs_fwd_name(unsigned flags) | ||
1632 | { | ||
1633 | switch (flags & IP_VS_CONN_F_FWD_MASK) { | ||
1634 | case IP_VS_CONN_F_LOCALNODE: | ||
1635 | return "Local"; | ||
1636 | case IP_VS_CONN_F_TUNNEL: | ||
1637 | return "Tunnel"; | ||
1638 | case IP_VS_CONN_F_DROUTE: | ||
1639 | return "Route"; | ||
1640 | default: | ||
1641 | return "Masq"; | ||
1642 | } | ||
1643 | } | ||
1644 | |||
1645 | |||
1646 | /* Get the Nth entry in the two lists */ | ||
1647 | static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) | ||
1648 | { | ||
1649 | struct ip_vs_iter *iter = seq->private; | ||
1650 | int idx; | ||
1651 | struct ip_vs_service *svc; | ||
1652 | |||
1653 | /* look in hash by protocol */ | ||
1654 | for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
1655 | list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { | ||
1656 | if (pos-- == 0){ | ||
1657 | iter->table = ip_vs_svc_table; | ||
1658 | iter->bucket = idx; | ||
1659 | return svc; | ||
1660 | } | ||
1661 | } | ||
1662 | } | ||
1663 | |||
1664 | /* keep looking in fwmark */ | ||
1665 | for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
1666 | list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { | ||
1667 | if (pos-- == 0) { | ||
1668 | iter->table = ip_vs_svc_fwm_table; | ||
1669 | iter->bucket = idx; | ||
1670 | return svc; | ||
1671 | } | ||
1672 | } | ||
1673 | } | ||
1674 | |||
1675 | return NULL; | ||
1676 | } | ||
1677 | |||
1678 | static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) | ||
1679 | { | ||
1680 | |||
1681 | read_lock_bh(&__ip_vs_svc_lock); | ||
1682 | return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; | ||
1683 | } | ||
1684 | |||
1685 | |||
/* Advance the /proc iterator: within the current bucket, then to the next
 * non-empty bucket, then from the protocol table into the fwmark table. */
static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct list_head *e;
	struct ip_vs_iter *iter;
	struct ip_vs_service *svc;

	++*pos;
	/* The header token is followed by the very first service. */
	if (v == SEQ_START_TOKEN)
		return ip_vs_info_array(seq,0);

	svc = v;
	iter = seq->private;

	if (iter->table == ip_vs_svc_table) {
		/* next service in table hashed by protocol */
		if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
			return list_entry(e, struct ip_vs_service, s_list);


		/* Bucket exhausted: the loop body returns the FIRST entry
		 * of the next non-empty bucket (it never truly iterates). */
		while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
			list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
					    s_list) {
				return svc;
			}
		}

		/* Protocol table fully consumed: switch to the fwmark
		 * table, starting before its first bucket. */
		iter->table = ip_vs_svc_fwm_table;
		iter->bucket = -1;
		goto scan_fwmark;
	}

	/* next service in hashed by fwmark */
	if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
		return list_entry(e, struct ip_vs_service, f_list);

 scan_fwmark:
	while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
		/* Same first-entry trick as above for the fwmark table. */
		list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
				    f_list)
			return svc;
	}

	return NULL;
}
1730 | |||
/* seq_file stop: drop the lock taken in ip_vs_info_seq_start(). */
static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock_bh(&__ip_vs_svc_lock);
}
1735 | |||
1736 | |||
/*
 * seq_file show: emit one record of /proc/net/ip_vs.
 *
 * For SEQ_START_TOKEN prints the version banner and column headers;
 * for a real entry prints the service line (address form for
 * protocol-keyed services, "FWM" form for fwmark services) followed
 * by one indented line per destination.  The exact format is ABI for
 * ipvsadm and other parsers — do not change it.
 */
static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq,
			"IP Virtual Server version %d.%d.%d (size=%d)\n",
			NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
		seq_puts(seq,
			 "Prot LocalAddress:Port Scheduler Flags\n");
		seq_puts(seq,
			 "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
	} else {
		const struct ip_vs_service *svc = v;
		const struct ip_vs_iter *iter = seq->private;
		const struct ip_vs_dest *dest;

		/* iter->table tells us which key form this service uses */
		if (iter->table == ip_vs_svc_table)
			seq_printf(seq, "%s  %08X:%04X %s ",
				   ip_vs_proto_name(svc->protocol),
				   ntohl(svc->addr),
				   ntohs(svc->port),
				   svc->scheduler->name);
		else
			seq_printf(seq, "FWM  %08X %s ",
				   svc->fwmark, svc->scheduler->name);

		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
			seq_printf(seq, "persistent %d %08X\n",
				svc->timeout,
				ntohl(svc->netmask));
		else
			seq_putc(seq, '\n');

		list_for_each_entry(dest, &svc->destinations, n_list) {
			seq_printf(seq,
				   "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
				   ntohl(dest->addr), ntohs(dest->port),
				   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
				   atomic_read(&dest->weight),
				   atomic_read(&dest->activeconns),
				   atomic_read(&dest->inactconns));
		}
	}
	return 0;
}
1781 | |||
/* seq_file operations backing /proc/net/ip_vs */
static struct seq_operations ip_vs_info_seq_ops = {
	.start = ip_vs_info_seq_start,
	.next  = ip_vs_info_seq_next,
	.stop  = ip_vs_info_seq_stop,
	.show  = ip_vs_info_seq_show,
};
1788 | |||
1789 | static int ip_vs_info_open(struct inode *inode, struct file *file) | ||
1790 | { | ||
1791 | struct seq_file *seq; | ||
1792 | int rc = -ENOMEM; | ||
1793 | struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
1794 | |||
1795 | if (!s) | ||
1796 | goto out; | ||
1797 | |||
1798 | rc = seq_open(file, &ip_vs_info_seq_ops); | ||
1799 | if (rc) | ||
1800 | goto out_kfree; | ||
1801 | |||
1802 | seq = file->private_data; | ||
1803 | seq->private = s; | ||
1804 | memset(s, 0, sizeof(*s)); | ||
1805 | out: | ||
1806 | return rc; | ||
1807 | out_kfree: | ||
1808 | kfree(s); | ||
1809 | goto out; | ||
1810 | } | ||
1811 | |||
/* file operations for /proc/net/ip_vs; seq_release_private also
 * frees the iterator allocated in ip_vs_info_open() */
static struct file_operations ip_vs_info_fops = {
	.owner	 = THIS_MODULE,
	.open    = ip_vs_info_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};
1819 | |||
1820 | #endif | ||
1821 | |||
1822 | struct ip_vs_stats ip_vs_stats; | ||
1823 | |||
1824 | #ifdef CONFIG_PROC_FS | ||
/*
 * Emit /proc/net/ip_vs_stats: totals and current rates, all printed
 * in hexadecimal (%X).  The stats lock is held across the snapshot
 * so one line is internally consistent.  Format is parsed by user
 * tools — do not change it.
 */
static int ip_vs_stats_show(struct seq_file *seq, void *v)
{

/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
	seq_puts(seq,
		 "   Total Incoming Outgoing         Incoming         Outgoing\n");
	seq_printf(seq,
		   "   Conns  Packets  Packets            Bytes            Bytes\n");

	spin_lock_bh(&ip_vs_stats.lock);
	seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
		   ip_vs_stats.inpkts, ip_vs_stats.outpkts,
		   (unsigned long long) ip_vs_stats.inbytes,
		   (unsigned long long) ip_vs_stats.outbytes);

/*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
	seq_puts(seq,
		   " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
	seq_printf(seq,"%8X %8X %8X %16X %16X\n",
			ip_vs_stats.cps,
			ip_vs_stats.inpps,
			ip_vs_stats.outpps,
			ip_vs_stats.inbps,
			ip_vs_stats.outbps);
	spin_unlock_bh(&ip_vs_stats.lock);

	return 0;
}
1853 | |||
/* Open /proc/net/ip_vs_stats as a single-shot seq_file. */
static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open(file, ip_vs_stats_show, NULL);
}
1858 | |||
/* file operations for /proc/net/ip_vs_stats */
static struct file_operations ip_vs_stats_fops = {
	.owner = THIS_MODULE,
	.open = ip_vs_stats_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
1866 | |||
1867 | #endif | ||
1868 | |||
1869 | /* | ||
1870 | * Set timeout values for tcp tcpfin udp in the timeout_table. | ||
1871 | */ | ||
1872 | static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) | ||
1873 | { | ||
1874 | IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", | ||
1875 | u->tcp_timeout, | ||
1876 | u->tcp_fin_timeout, | ||
1877 | u->udp_timeout); | ||
1878 | |||
1879 | #ifdef CONFIG_IP_VS_PROTO_TCP | ||
1880 | if (u->tcp_timeout) { | ||
1881 | ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] | ||
1882 | = u->tcp_timeout * HZ; | ||
1883 | } | ||
1884 | |||
1885 | if (u->tcp_fin_timeout) { | ||
1886 | ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] | ||
1887 | = u->tcp_fin_timeout * HZ; | ||
1888 | } | ||
1889 | #endif | ||
1890 | |||
1891 | #ifdef CONFIG_IP_VS_PROTO_UDP | ||
1892 | if (u->udp_timeout) { | ||
1893 | ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] | ||
1894 | = u->udp_timeout * HZ; | ||
1895 | } | ||
1896 | #endif | ||
1897 | return 0; | ||
1898 | } | ||
1899 | |||
1900 | |||
/* Index a sockopt command into the 0-based argument-length table */
#define SET_CMDID(cmd)		(cmd - IP_VS_BASE_CTL)
/* Expected setsockopt payload sizes per command family */
#define SERVICE_ARG_LEN		(sizeof(struct ip_vs_service_user))
#define SVCDEST_ARG_LEN		(sizeof(struct ip_vs_service_user) +	\
				 sizeof(struct ip_vs_dest_user))
#define TIMEOUT_ARG_LEN		(sizeof(struct ip_vs_timeout_user))
#define DAEMON_ARG_LEN		(sizeof(struct ip_vs_daemon_user))
#define MAX_ARG_LEN		SVCDEST_ARG_LEN

/* Exact payload length required for each IP_VS_SO_SET_* command;
 * do_ip_vs_set_ctl() rejects any mismatch with -EINVAL */
static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
	[SET_CMDID(IP_VS_SO_SET_ADD)]		= SERVICE_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_EDIT)]		= SERVICE_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_DEL)]		= SERVICE_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_FLUSH)]		= 0,
	[SET_CMDID(IP_VS_SO_SET_ADDDEST)]	= SVCDEST_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_DELDEST)]	= SVCDEST_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_EDITDEST)]	= SVCDEST_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_TIMEOUT)]	= TIMEOUT_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]	= DAEMON_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]	= DAEMON_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_ZERO)]		= SERVICE_ARG_LEN,
};
1922 | |||
/*
 * setsockopt() handler for all IP_VS_SO_SET_* commands.
 *
 * Requires CAP_NET_ADMIN and an exact payload length (set_arglen).
 * Global commands (flush, timeouts, sync daemon control) are handled
 * first; the rest interpret the buffer as a service spec optionally
 * followed by a destination spec.  All table mutation happens under
 * __ip_vs_mutex, with the module use count held across the call.
 *
 * Returns 0 on success or a negative errno.
 * NOTE(review): invalid protocol yields -EFAULT here; -EINVAL would
 * seem more conventional — confirm userspace (ipvsadm) expectations
 * before changing.
 */
static int
do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
	int ret;
	unsigned char arg[MAX_ARG_LEN];
	struct ip_vs_service_user *usvc;
	struct ip_vs_service *svc;
	struct ip_vs_dest_user *udest;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	/* nf_sockopt limits cmd to [optmin, optmax), so the table
	 * index is in range; the length must match exactly */
	if (len != set_arglen[SET_CMDID(cmd)]) {
		IP_VS_ERR("set_ctl: len %u != %u\n",
			  len, set_arglen[SET_CMDID(cmd)]);
		return -EINVAL;
	}

	if (copy_from_user(arg, user, len) != 0)
		return -EFAULT;

	/* increase the module use count */
	ip_vs_use_count_inc();

	if (down_interruptible(&__ip_vs_mutex)) {
		ret = -ERESTARTSYS;
		goto out_dec;
	}

	/* commands that do not address a single service */
	if (cmd == IP_VS_SO_SET_FLUSH) {
		/* Flush the virtual service */
		ret = ip_vs_flush();
		goto out_unlock;
	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
		/* Set timeout values for (tcp tcpfin udp) */
		ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
		goto out_unlock;
	} else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
		ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
		goto out_unlock;
	} else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
		ret = stop_sync_thread(dm->state);
		goto out_unlock;
	}

	/* remaining commands: service spec, then optional dest spec */
	usvc = (struct ip_vs_service_user *)arg;
	udest = (struct ip_vs_dest_user *)(usvc + 1);

	if (cmd == IP_VS_SO_SET_ZERO) {
		/* if no service address is set, zero counters in all */
		if (!usvc->fwmark && !usvc->addr && !usvc->port) {
			ret = ip_vs_zero_all();
			goto out_unlock;
		}
	}

	/* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
	if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
		IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
			  usvc->protocol, NIPQUAD(usvc->addr),
			  ntohs(usvc->port), usvc->sched_name);
		ret = -EFAULT;
		goto out_unlock;
	}

	/* Lookup the exact service by <protocol, addr, port> or fwmark */
	if (usvc->fwmark == 0)
		svc = __ip_vs_service_get(usvc->protocol,
					  usvc->addr, usvc->port);
	else
		svc = __ip_vs_svc_fwm_get(usvc->fwmark);

	/* only ADD may proceed without an existing matching service */
	if (cmd != IP_VS_SO_SET_ADD
	    && (svc == NULL || svc->protocol != usvc->protocol)) {
		ret = -ESRCH;
		goto out_unlock;
	}

	switch (cmd) {
	case IP_VS_SO_SET_ADD:
		if (svc != NULL)
			ret = -EEXIST;
		else
			ret = ip_vs_add_service(usvc, &svc);
		break;
	case IP_VS_SO_SET_EDIT:
		ret = ip_vs_edit_service(svc, usvc);
		break;
	case IP_VS_SO_SET_DEL:
		ret = ip_vs_del_service(svc);
		/* del_service consumed our reference; skip the put below */
		if (!ret)
			goto out_unlock;
		break;
	case IP_VS_SO_SET_ZERO:
		ret = ip_vs_zero_service(svc);
		break;
	case IP_VS_SO_SET_ADDDEST:
		ret = ip_vs_add_dest(svc, udest);
		break;
	case IP_VS_SO_SET_EDITDEST:
		ret = ip_vs_edit_dest(svc, udest);
		break;
	case IP_VS_SO_SET_DELDEST:
		ret = ip_vs_del_dest(svc, udest);
		break;
	default:
		ret = -EINVAL;
	}

	if (svc)
		ip_vs_service_put(svc);

  out_unlock:
	up(&__ip_vs_mutex);
  out_dec:
	/* decrease the module use count */
	ip_vs_use_count_dec();

	return ret;
}
2045 | |||
2046 | |||
/*
 * Snapshot the counters of @src into the userspace-facing @dst under
 * the stats lock.  Copies everything from the start of the struct up
 * to (but not including) the lock member — this relies on the counter
 * fields being laid out before 'lock' in struct ip_vs_stats and on
 * struct ip_vs_stats_user matching that prefix exactly.
 */
static void
ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
{
	spin_lock_bh(&src->lock);
	memcpy(dst, src, (char*)&src->lock - (char*)src);
	spin_unlock_bh(&src->lock);
}
2054 | |||
2055 | static void | ||
2056 | ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) | ||
2057 | { | ||
2058 | dst->protocol = src->protocol; | ||
2059 | dst->addr = src->addr; | ||
2060 | dst->port = src->port; | ||
2061 | dst->fwmark = src->fwmark; | ||
2062 | strcpy(dst->sched_name, src->scheduler->name); | ||
2063 | dst->flags = src->flags; | ||
2064 | dst->timeout = src->timeout / HZ; | ||
2065 | dst->netmask = src->netmask; | ||
2066 | dst->num_dests = src->num_dests; | ||
2067 | ip_vs_copy_stats(&dst->stats, &src->stats); | ||
2068 | } | ||
2069 | |||
2070 | static inline int | ||
2071 | __ip_vs_get_service_entries(const struct ip_vs_get_services *get, | ||
2072 | struct ip_vs_get_services __user *uptr) | ||
2073 | { | ||
2074 | int idx, count=0; | ||
2075 | struct ip_vs_service *svc; | ||
2076 | struct ip_vs_service_entry entry; | ||
2077 | int ret = 0; | ||
2078 | |||
2079 | for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
2080 | list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { | ||
2081 | if (count >= get->num_services) | ||
2082 | goto out; | ||
2083 | ip_vs_copy_service(&entry, svc); | ||
2084 | if (copy_to_user(&uptr->entrytable[count], | ||
2085 | &entry, sizeof(entry))) { | ||
2086 | ret = -EFAULT; | ||
2087 | goto out; | ||
2088 | } | ||
2089 | count++; | ||
2090 | } | ||
2091 | } | ||
2092 | |||
2093 | for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
2094 | list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { | ||
2095 | if (count >= get->num_services) | ||
2096 | goto out; | ||
2097 | ip_vs_copy_service(&entry, svc); | ||
2098 | if (copy_to_user(&uptr->entrytable[count], | ||
2099 | &entry, sizeof(entry))) { | ||
2100 | ret = -EFAULT; | ||
2101 | goto out; | ||
2102 | } | ||
2103 | count++; | ||
2104 | } | ||
2105 | } | ||
2106 | out: | ||
2107 | return ret; | ||
2108 | } | ||
2109 | |||
2110 | static inline int | ||
2111 | __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, | ||
2112 | struct ip_vs_get_dests __user *uptr) | ||
2113 | { | ||
2114 | struct ip_vs_service *svc; | ||
2115 | int ret = 0; | ||
2116 | |||
2117 | if (get->fwmark) | ||
2118 | svc = __ip_vs_svc_fwm_get(get->fwmark); | ||
2119 | else | ||
2120 | svc = __ip_vs_service_get(get->protocol, | ||
2121 | get->addr, get->port); | ||
2122 | if (svc) { | ||
2123 | int count = 0; | ||
2124 | struct ip_vs_dest *dest; | ||
2125 | struct ip_vs_dest_entry entry; | ||
2126 | |||
2127 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
2128 | if (count >= get->num_dests) | ||
2129 | break; | ||
2130 | |||
2131 | entry.addr = dest->addr; | ||
2132 | entry.port = dest->port; | ||
2133 | entry.conn_flags = atomic_read(&dest->conn_flags); | ||
2134 | entry.weight = atomic_read(&dest->weight); | ||
2135 | entry.u_threshold = dest->u_threshold; | ||
2136 | entry.l_threshold = dest->l_threshold; | ||
2137 | entry.activeconns = atomic_read(&dest->activeconns); | ||
2138 | entry.inactconns = atomic_read(&dest->inactconns); | ||
2139 | entry.persistconns = atomic_read(&dest->persistconns); | ||
2140 | ip_vs_copy_stats(&entry.stats, &dest->stats); | ||
2141 | if (copy_to_user(&uptr->entrytable[count], | ||
2142 | &entry, sizeof(entry))) { | ||
2143 | ret = -EFAULT; | ||
2144 | break; | ||
2145 | } | ||
2146 | count++; | ||
2147 | } | ||
2148 | ip_vs_service_put(svc); | ||
2149 | } else | ||
2150 | ret = -ESRCH; | ||
2151 | return ret; | ||
2152 | } | ||
2153 | |||
2154 | static inline void | ||
2155 | __ip_vs_get_timeouts(struct ip_vs_timeout_user *u) | ||
2156 | { | ||
2157 | #ifdef CONFIG_IP_VS_PROTO_TCP | ||
2158 | u->tcp_timeout = | ||
2159 | ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; | ||
2160 | u->tcp_fin_timeout = | ||
2161 | ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; | ||
2162 | #endif | ||
2163 | #ifdef CONFIG_IP_VS_PROTO_UDP | ||
2164 | u->udp_timeout = | ||
2165 | ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; | ||
2166 | #endif | ||
2167 | } | ||
2168 | |||
2169 | |||
/* Index a sockopt command into the 0-based argument-length table */
#define GET_CMDID(cmd)		(cmd - IP_VS_BASE_CTL)
/* Minimum getsockopt payload sizes per command */
#define GET_INFO_ARG_LEN	(sizeof(struct ip_vs_getinfo))
#define GET_SERVICES_ARG_LEN	(sizeof(struct ip_vs_get_services))
#define GET_SERVICE_ARG_LEN	(sizeof(struct ip_vs_service_entry))
#define GET_DESTS_ARG_LEN	(sizeof(struct ip_vs_get_dests))
#define GET_TIMEOUT_ARG_LEN	(sizeof(struct ip_vs_timeout_user))
#define GET_DAEMON_ARG_LEN	(sizeof(struct ip_vs_daemon_user) * 2)

/* Minimum buffer length required for each IP_VS_SO_GET_* command;
 * do_ip_vs_get_ctl() rejects anything shorter with -EINVAL */
static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
	[GET_CMDID(IP_VS_SO_GET_VERSION)]	= 64,
	[GET_CMDID(IP_VS_SO_GET_INFO)]		= GET_INFO_ARG_LEN,
	[GET_CMDID(IP_VS_SO_GET_SERVICES)]	= GET_SERVICES_ARG_LEN,
	[GET_CMDID(IP_VS_SO_GET_SERVICE)]	= GET_SERVICE_ARG_LEN,
	[GET_CMDID(IP_VS_SO_GET_DESTS)]		= GET_DESTS_ARG_LEN,
	[GET_CMDID(IP_VS_SO_GET_TIMEOUT)]	= GET_TIMEOUT_ARG_LEN,
	[GET_CMDID(IP_VS_SO_GET_DAEMON)]	= GET_DAEMON_ARG_LEN,
};
2187 | |||
/*
 * getsockopt() handler for all IP_VS_SO_GET_* commands.
 *
 * Requires CAP_NET_ADMIN.  The fixed-size header of the request is
 * copied into 'arg' (get_arglen bytes, always <= 128); variable-size
 * results are copied straight to the user buffer.  All table reads
 * are serialized by __ip_vs_mutex.
 *
 * NOTE(review): for GET_SERVICES/GET_DESTS the expected size is
 * computed as header + count * entry size with userspace-controlled
 * counts; an int overflow there would let *len pass the equality
 * check with a much smaller buffer — confirm num_services/num_dests
 * are range-limited before relying on this.
 */
static int
do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
	unsigned char arg[128];
	int ret = 0;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	if (*len < get_arglen[GET_CMDID(cmd)]) {
		IP_VS_ERR("get_ctl: len %u < %u\n",
			  *len, get_arglen[GET_CMDID(cmd)]);
		return -EINVAL;
	}

	if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
		return -EFAULT;

	if (down_interruptible(&__ip_vs_mutex))
		return -ERESTARTSYS;

	switch (cmd) {
	case IP_VS_SO_GET_VERSION:
	{
		char buf[64];

		sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
			NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
		if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
			ret = -EFAULT;
			goto out;
		}
		*len = strlen(buf)+1;
	}
	break;

	case IP_VS_SO_GET_INFO:
	{
		struct ip_vs_getinfo info;
		info.version = IP_VS_VERSION_CODE;
		info.size = IP_VS_CONN_TAB_SIZE;
		info.num_services = ip_vs_num_services;
		if (copy_to_user(user, &info, sizeof(info)) != 0)
			ret = -EFAULT;
	}
	break;

	case IP_VS_SO_GET_SERVICES:
	{
		struct ip_vs_get_services *get;
		int size;

		get = (struct ip_vs_get_services *)arg;
		/* user buffer must be exactly header + N entries */
		size = sizeof(*get) +
			sizeof(struct ip_vs_service_entry) * get->num_services;
		if (*len != size) {
			IP_VS_ERR("length: %u != %u\n", *len, size);
			ret = -EINVAL;
			goto out;
		}
		ret = __ip_vs_get_service_entries(get, user);
	}
	break;

	case IP_VS_SO_GET_SERVICE:
	{
		struct ip_vs_service_entry *entry;
		struct ip_vs_service *svc;

		entry = (struct ip_vs_service_entry *)arg;
		if (entry->fwmark)
			svc = __ip_vs_svc_fwm_get(entry->fwmark);
		else
			svc = __ip_vs_service_get(entry->protocol,
						  entry->addr, entry->port);
		if (svc) {
			ip_vs_copy_service(entry, svc);
			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
				ret = -EFAULT;
			ip_vs_service_put(svc);
		} else
			ret = -ESRCH;
	}
	break;

	case IP_VS_SO_GET_DESTS:
	{
		struct ip_vs_get_dests *get;
		int size;

		get = (struct ip_vs_get_dests *)arg;
		/* user buffer must be exactly header + N entries */
		size = sizeof(*get) +
			sizeof(struct ip_vs_dest_entry) * get->num_dests;
		if (*len != size) {
			IP_VS_ERR("length: %u != %u\n", *len, size);
			ret = -EINVAL;
			goto out;
		}
		ret = __ip_vs_get_dest_entries(get, user);
	}
	break;

	case IP_VS_SO_GET_TIMEOUT:
	{
		struct ip_vs_timeout_user t;

		__ip_vs_get_timeouts(&t);
		if (copy_to_user(user, &t, sizeof(t)) != 0)
			ret = -EFAULT;
	}
	break;

	case IP_VS_SO_GET_DAEMON:
	{
		/* d[0] describes the master daemon, d[1] the backup;
		 * slots for daemons that are not running stay zeroed */
		struct ip_vs_daemon_user d[2];

		memset(&d, 0, sizeof(d));
		if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
			d[0].state = IP_VS_STATE_MASTER;
			strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn);
			d[0].syncid = ip_vs_master_syncid;
		}
		if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
			d[1].state = IP_VS_STATE_BACKUP;
			strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn);
			d[1].syncid = ip_vs_backup_syncid;
		}
		if (copy_to_user(user, &d, sizeof(d)) != 0)
			ret = -EFAULT;
	}
	break;

	default:
		ret = -EINVAL;
	}

  out:
	up(&__ip_vs_mutex);
	return ret;
}
2328 | |||
2329 | |||
/* netfilter sockopt registration: routes the IP_VS_BASE_CTL..MAX
 * ranges of set/getsockopt on PF_INET to the handlers above */
static struct nf_sockopt_ops ip_vs_sockopts = {
	.pf		= PF_INET,
	.set_optmin	= IP_VS_BASE_CTL,
	.set_optmax	= IP_VS_SO_SET_MAX+1,
	.set		= do_ip_vs_set_ctl,
	.get_optmin	= IP_VS_BASE_CTL,
	.get_optmax	= IP_VS_SO_GET_MAX+1,
	.get		= do_ip_vs_get_ctl,
};
2339 | |||
2340 | |||
/*
 * Initialize the IPVS control interface: sockopt handlers, /proc
 * files, sysctls, service/dest hash tables, global stats and the
 * periodic defense timer.  Returns 0 or the sockopt registration
 * error; undone by ip_vs_control_cleanup() in reverse order.
 *
 * NOTE(review): the proc_net_fops_create() and register_sysctl_table()
 * return values are not checked here — failures would leave those
 * interfaces silently absent; confirm this is acceptable.
 */
int ip_vs_control_init(void)
{
	int ret;
	int idx;

	EnterFunction(2);

	ret = nf_register_sockopt(&ip_vs_sockopts);
	if (ret) {
		IP_VS_ERR("cannot register sockopt.\n");
		return ret;
	}

	proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
	proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);

	sysctl_header = register_sysctl_table(vs_root_table, 0);

	/* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
	}
	for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
		INIT_LIST_HEAD(&ip_vs_rtable[idx]);
	}

	memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
	spin_lock_init(&ip_vs_stats.lock);
	ip_vs_new_estimator(&ip_vs_stats);

	/* Hook the defense timer */
	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);

	LeaveFunction(2);
	return 0;
}
2378 | |||
2379 | |||
/*
 * Tear down everything set up by ip_vs_control_init(), in reverse
 * order: stop the defense timer, drop the stats estimator, and
 * unregister sysctls, /proc entries and the sockopt handlers.
 */
void ip_vs_control_cleanup(void)
{
	EnterFunction(2);
	ip_vs_trash_cleanup();
	cancel_rearming_delayed_work(&defense_work);
	ip_vs_kill_estimator(&ip_vs_stats);
	unregister_sysctl_table(sysctl_header);
	proc_net_remove("ip_vs_stats");
	proc_net_remove("ip_vs");
	nf_unregister_sockopt(&ip_vs_sockopts);
	LeaveFunction(2);
}
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c new file mode 100644 index 000000000000..f3bc320dce93 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_dh.c | |||
@@ -0,0 +1,258 @@ | |||
1 | /* | ||
2 | * IPVS: Destination Hashing scheduling module | ||
3 | * | ||
4 | * Version: $Id: ip_vs_dh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Wensong Zhang <wensong@gnuchina.org> | ||
7 | * | ||
8 | * Inspired by the consistent hashing scheduler patch from | ||
9 | * Thomas Proell <proellt@gmx.de> | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public License | ||
13 | * as published by the Free Software Foundation; either version | ||
14 | * 2 of the License, or (at your option) any later version. | ||
15 | * | ||
16 | * Changes: | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | /* | ||
21 | * The dh algorithm is to select server by the hash key of destination IP | ||
22 | * address. The pseudo code is as follows: | ||
23 | * | ||
24 | * n <- servernode[dest_ip]; | ||
25 | * if (n is dead) OR | ||
26 | * (n is overloaded) OR (n.weight <= 0) then | ||
27 | * return NULL; | ||
28 | * | ||
29 | * return n; | ||
30 | * | ||
 * Note that servernode is a 256-bucket hash table that maps the hash
32 | * index derived from packet destination IP address to the current server | ||
33 | * array. If the dh scheduler is used in cache cluster, it is good to | ||
34 | * combine it with cache_bypass feature. When the statically assigned | ||
35 | * server is dead or overloaded, the load balancer can bypass the cache | ||
36 | * server and send requests to the original server directly. | ||
37 | * | ||
38 | */ | ||
39 | |||
40 | #include <linux/module.h> | ||
41 | #include <linux/kernel.h> | ||
42 | |||
43 | #include <net/ip_vs.h> | ||
44 | |||
45 | |||
46 | /* | ||
47 | * IPVS DH bucket | ||
48 | */ | ||
/*
 *      IPVS DH bucket: one slot of the per-service hash table, holding
 *      a refcounted pointer to the statically assigned real server.
 */
struct ip_vs_dh_bucket {
	struct ip_vs_dest       *dest;          /* real server (cache) */
};

/*
 *     for IPVS DH entry hash table
 *     (table size is a power of two so the mask below works)
 */
#ifndef CONFIG_IP_VS_DH_TAB_BITS
#define CONFIG_IP_VS_DH_TAB_BITS        8
#endif
#define IP_VS_DH_TAB_BITS               CONFIG_IP_VS_DH_TAB_BITS
#define IP_VS_DH_TAB_SIZE               (1 << IP_VS_DH_TAB_BITS)
#define IP_VS_DH_TAB_MASK               (IP_VS_DH_TAB_SIZE - 1)
62 | |||
63 | |||
64 | /* | ||
65 | * Returns hash value for IPVS DH entry | ||
66 | */ | ||
67 | static inline unsigned ip_vs_dh_hashkey(__u32 addr) | ||
68 | { | ||
69 | return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK; | ||
70 | } | ||
71 | |||
72 | |||
73 | /* | ||
74 | * Get ip_vs_dest associated with supplied parameters. | ||
75 | */ | ||
76 | static inline struct ip_vs_dest * | ||
77 | ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __u32 addr) | ||
78 | { | ||
79 | return (tbl[ip_vs_dh_hashkey(addr)]).dest; | ||
80 | } | ||
81 | |||
82 | |||
83 | /* | ||
84 | * Assign all the hash buckets of the specified table with the service. | ||
85 | */ | ||
/*
 *	Assign all the hash buckets of the specified table with the service.
 *
 *	Walks the service's destination list circularly so the servers
 *	are spread round-robin across the buckets; each assigned bucket
 *	takes a reference on its destination (dropped in
 *	ip_vs_dh_flush).  An empty destination list leaves every bucket
 *	NULL.  Always returns 0.
 */
static int
ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_dh_bucket *b;
	struct list_head *p;
	struct ip_vs_dest *dest;

	b = tbl;
	p = &svc->destinations;
	for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
		if (list_empty(p)) {
			b->dest = NULL;
		} else {
			/* skip the list head when wrapping around */
			if (p == &svc->destinations)
				p = p->next;

			dest = list_entry(p, struct ip_vs_dest, n_list);
			atomic_inc(&dest->refcnt);
			b->dest = dest;

			p = p->next;
		}
		b++;
	}
	return 0;
}
113 | |||
114 | |||
115 | /* | ||
116 | * Flush all the hash buckets of the specified table. | ||
117 | */ | ||
118 | static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) | ||
119 | { | ||
120 | int i; | ||
121 | struct ip_vs_dh_bucket *b; | ||
122 | |||
123 | b = tbl; | ||
124 | for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { | ||
125 | if (b->dest) { | ||
126 | atomic_dec(&b->dest->refcnt); | ||
127 | b->dest = NULL; | ||
128 | } | ||
129 | b++; | ||
130 | } | ||
131 | } | ||
132 | |||
133 | |||
/*
 * Scheduler init hook: allocate this service's DH table and populate
 * it from the current destination list.  Returns 0 or -ENOMEM.
 */
static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
{
	struct ip_vs_dh_bucket *tbl;

	/* allocate the DH table for this service */
	tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
		      GFP_ATOMIC);
	if (tbl == NULL) {
		IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n");
		return -ENOMEM;
	}
	svc->sched_data = tbl;
	IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
		  "current service\n",
		  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);

	/* assign the hash buckets with the updated service */
	ip_vs_dh_assign(tbl, svc);

	return 0;
}
155 | |||
156 | |||
157 | static int ip_vs_dh_done_svc(struct ip_vs_service *svc) | ||
158 | { | ||
159 | struct ip_vs_dh_bucket *tbl = svc->sched_data; | ||
160 | |||
161 | /* got to clean up hash buckets here */ | ||
162 | ip_vs_dh_flush(tbl); | ||
163 | |||
164 | /* release the table itself */ | ||
165 | kfree(svc->sched_data); | ||
166 | IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", | ||
167 | sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); | ||
168 | |||
169 | return 0; | ||
170 | } | ||
171 | |||
172 | |||
/*
 * Scheduler update hook: the destination list changed, so drop the
 * old bucket assignments and rebuild the table.  Always returns 0.
 */
static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
{
	struct ip_vs_dh_bucket *tbl = svc->sched_data;

	/* got to clean up hash buckets here */
	ip_vs_dh_flush(tbl);

	/* assign the hash buckets with the updated service */
	ip_vs_dh_assign(tbl, svc);

	return 0;
}
185 | |||
186 | |||
187 | /* | ||
188 | * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, | ||
189 | * consider that the server is overloaded here. | ||
190 | */ | ||
/*
 *	If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
 *	consider that the server is overloaded here.
 *	(Returns the raw masked flag value; callers only test truth.)
 */
static inline int is_overloaded(struct ip_vs_dest *dest)
{
	return dest->flags & IP_VS_DEST_F_OVERLOAD;
}
195 | |||
196 | |||
197 | /* | ||
198 | * Destination hashing scheduling | ||
199 | */ | ||
/*
 *      Destination hashing scheduling: pick the server statically
 *      assigned to the packet's destination IP.  Returns NULL (no
 *      server) if the bucket is empty or the server is unavailable,
 *      weightless or overloaded — no fallback to another server.
 */
static struct ip_vs_dest *
ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
	struct ip_vs_dest *dest;
	struct ip_vs_dh_bucket *tbl;
	struct iphdr *iph = skb->nh.iph;

	IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");

	tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
	dest = ip_vs_dh_get(tbl, iph->daddr);
	if (!dest
	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
	    || atomic_read(&dest->weight) <= 0
	    || is_overloaded(dest)) {
		return NULL;
	}

	IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
		  "--> server %u.%u.%u.%u:%d\n",
		  NIPQUAD(iph->daddr),
		  NIPQUAD(dest->addr),
		  ntohs(dest->port));

	return dest;
}
226 | |||
227 | |||
228 | /* | ||
229 | * IPVS DH Scheduler structure | ||
230 | */ | ||
/*
 *      IPVS DH Scheduler structure: hooks this module's callbacks
 *      into the IPVS scheduler framework under the name "dh".
 */
static struct ip_vs_scheduler ip_vs_dh_scheduler =
{
	.name =			"dh",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.init_service =		ip_vs_dh_init_svc,
	.done_service =		ip_vs_dh_done_svc,
	.update_service =	ip_vs_dh_update_svc,
	.schedule =		ip_vs_dh_schedule,
};
241 | |||
242 | |||
/* Module init: register the DH scheduler with the IPVS core. */
static int __init ip_vs_dh_init(void)
{
	INIT_LIST_HEAD(&ip_vs_dh_scheduler.n_list);
	return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
}
248 | |||
249 | |||
/* Module exit: unregister the DH scheduler. */
static void __exit ip_vs_dh_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
}
254 | |||
255 | |||
/* module entry points and license */
module_init(ip_vs_dh_init);
module_exit(ip_vs_dh_cleanup);
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c new file mode 100644 index 000000000000..67b3e2fc1fa1 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_est.c | |||
@@ -0,0 +1,200 @@ | |||
1 | /* | ||
2 | * ip_vs_est.c: simple rate estimator for IPVS | ||
3 | * | ||
4 | * Version: $Id: ip_vs_est.c,v 1.4 2002/11/30 01:50:35 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License | ||
10 | * as published by the Free Software Foundation; either version | ||
11 | * 2 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * Changes: | ||
14 | * | ||
15 | */ | ||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/types.h> | ||
18 | |||
19 | #include <net/ip_vs.h> | ||
20 | |||
21 | /* | ||
22 | This code is to estimate rate in a shorter interval (such as 8 | ||
23 | seconds) for virtual services and real servers. For measure rate in a | ||
24 | long interval, it is easy to implement a user level daemon which | ||
25 | periodically reads those statistical counters and measure rate. | ||
26 | |||
27 | Currently, the measurement is activated by slow timer handler. Hope | ||
28 | this measurement will not introduce too much load. | ||
29 | |||
30 | We measure rate during the last 8 seconds every 2 seconds: | ||
31 | |||
32 | avgrate = avgrate*(1-W) + rate*W | ||
33 | |||
34 | where W = 2^(-2) | ||
35 | |||
36 | NOTES. | ||
37 | |||
38 | * The stored value for average bps is scaled by 2^5, so that maximal | ||
39 | rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. | ||
40 | |||
 * A lot of code is taken from net/sched/estimator.c
42 | */ | ||
43 | |||
44 | |||
/*
 * Rate-estimator state, one instance per measured struct ip_vs_stats.
 * Entries live on the singly-linked global est_list (see below).
 */
struct ip_vs_estimator
{
	struct ip_vs_estimator	*next;	/* next entry on est_list */
	struct ip_vs_stats	*stats;	/* the counters being estimated */

	/* raw counter snapshots taken at the previous 2-second tick */
	u32			last_conns;
	u32			last_inpkts;
	u32			last_outpkts;
	u64			last_inbytes;
	u64			last_outbytes;

	/* smoothed rates: cps/pps scaled by 2^10, bps scaled by 2^5
	   (see the scaling note at the top of this file) */
	u32			cps;
	u32			inpps;
	u32			outpps;
	u32			inbps;
	u32			outbps;
};
62 | |||
63 | |||
/* Head of the global estimator list, protected by est_lock.  The 2-second
 * sampling timer runs only while this list is non-empty: it is armed in
 * ip_vs_new_estimator() and stopped in ip_vs_kill_estimator(). */
static struct ip_vs_estimator *est_list = NULL;
static DEFINE_RWLOCK(est_lock);
static struct timer_list est_timer;
67 | |||
/*
 * Timer handler: every 2 seconds sample each registered estimator and
 * fold the observed counter deltas into the exponentially-weighted
 * moving averages (avgrate = avgrate*(1-W) + rate*W with W = 2^-2,
 * per the comment block at the top of this file).
 */
static void estimation_timer(unsigned long arg)
{
	struct ip_vs_estimator *e;
	struct ip_vs_stats *s;
	u32 n_conns;
	u32 n_inpkts, n_outpkts;
	u64 n_inbytes, n_outbytes;
	u32 rate;

	read_lock(&est_lock);
	for (e = est_list; e; e = e->next) {
		s = e->stats;

		/* snapshot all raw counters atomically w.r.t. updaters */
		spin_lock(&s->lock);
		n_conns = s->conns;
		n_inpkts = s->inpkts;
		n_outpkts = s->outpkts;
		n_inbytes = s->inbytes;
		n_outbytes = s->outbytes;

		/* scaled by 2^10, but divided 2 seconds */
		rate = (n_conns - e->last_conns)<<9;
		e->last_conns = n_conns;
		e->cps += ((long)rate - (long)e->cps)>>2;
		/* publish the unscaled average, rounding up */
		s->cps = (e->cps+0x1FF)>>10;

		rate = (n_inpkts - e->last_inpkts)<<9;
		e->last_inpkts = n_inpkts;
		e->inpps += ((long)rate - (long)e->inpps)>>2;
		s->inpps = (e->inpps+0x1FF)>>10;

		rate = (n_outpkts - e->last_outpkts)<<9;
		e->last_outpkts = n_outpkts;
		e->outpps += ((long)rate - (long)e->outpps)>>2;
		s->outpps = (e->outpps+0x1FF)>>10;

		/* byte rates use the smaller 2^5 scale (see header note
		   on the ~2.15Gbit/s maximum representable rate) */
		rate = (n_inbytes - e->last_inbytes)<<4;
		e->last_inbytes = n_inbytes;
		e->inbps += ((long)rate - (long)e->inbps)>>2;
		s->inbps = (e->inbps+0xF)>>5;

		rate = (n_outbytes - e->last_outbytes)<<4;
		e->last_outbytes = n_outbytes;
		e->outbps += ((long)rate - (long)e->outbps)>>2;
		s->outbps = (e->outbps+0xF)>>5;
		spin_unlock(&s->lock);
	}
	read_unlock(&est_lock);
	/* re-arm for the next 2-second sampling interval */
	mod_timer(&est_timer, jiffies + 2*HZ);
}
118 | |||
/*
 * Allocate and register a rate estimator for @stats, seeding its
 * snapshots and scaled averages from the current values so the first
 * sample does not see a huge artificial delta.  Arms the sampling
 * timer when the first estimator is added.
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
int ip_vs_new_estimator(struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est;

	est = kmalloc(sizeof(*est), GFP_KERNEL);
	if (est == NULL)
		return -ENOMEM;

	memset(est, 0, sizeof(*est));
	est->stats = stats;
	est->last_conns = stats->conns;
	est->cps = stats->cps<<10;	/* re-apply the 2^10 scaling */

	est->last_inpkts = stats->inpkts;
	est->inpps = stats->inpps<<10;

	est->last_outpkts = stats->outpkts;
	est->outpps = stats->outpps<<10;

	est->last_inbytes = stats->inbytes;
	est->inbps = stats->inbps<<5;	/* byte rates use 2^5 scaling */

	est->last_outbytes = stats->outbytes;
	est->outbps = stats->outbps<<5;

	write_lock_bh(&est_lock);
	est->next = est_list;
	if (est->next == NULL) {
		/* list was empty: start the 2-second sampling timer */
		init_timer(&est_timer);
		est_timer.expires = jiffies + 2*HZ;
		est_timer.function = estimation_timer;
		add_timer(&est_timer);
	}
	est_list = est;
	write_unlock_bh(&est_lock);
	return 0;
}
156 | |||
157 | void ip_vs_kill_estimator(struct ip_vs_stats *stats) | ||
158 | { | ||
159 | struct ip_vs_estimator *est, **pest; | ||
160 | int killed = 0; | ||
161 | |||
162 | write_lock_bh(&est_lock); | ||
163 | pest = &est_list; | ||
164 | while ((est=*pest) != NULL) { | ||
165 | if (est->stats != stats) { | ||
166 | pest = &est->next; | ||
167 | continue; | ||
168 | } | ||
169 | *pest = est->next; | ||
170 | kfree(est); | ||
171 | killed++; | ||
172 | } | ||
173 | if (killed && est_list == NULL) | ||
174 | del_timer_sync(&est_timer); | ||
175 | write_unlock_bh(&est_lock); | ||
176 | } | ||
177 | |||
/*
 * Reset the estimator state for @stats: clear the counter snapshots
 * and all smoothed averages.
 * NOTE(review): this assumes the caller zeroes the underlying stats
 * counters as well — otherwise the next sample would see the full
 * counter values as one burst; verify against the callers.
 */
void ip_vs_zero_estimator(struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *e;

	write_lock_bh(&est_lock);
	for (e = est_list; e; e = e->next) {
		if (e->stats != stats)
			continue;

		/* set counters zero */
		e->last_conns = 0;
		e->last_inpkts = 0;
		e->last_outpkts = 0;
		e->last_inbytes = 0;
		e->last_outbytes = 0;
		e->cps = 0;
		e->inpps = 0;
		e->outpps = 0;
		e->inbps = 0;
		e->outbps = 0;
	}
	write_unlock_bh(&est_lock);
}
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c new file mode 100644 index 000000000000..a19a33ceb811 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_ftp.c | |||
@@ -0,0 +1,400 @@ | |||
1 | /* | ||
2 | * ip_vs_ftp.c: IPVS ftp application module | ||
3 | * | ||
4 | * Version: $Id: ip_vs_ftp.c,v 1.13 2002/09/15 08:14:08 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
7 | * | ||
8 | * Changes: | ||
9 | * | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public License | ||
13 | * as published by the Free Software Foundation; either version | ||
14 | * 2 of the License, or (at your option) any later version. | ||
15 | * | ||
16 | * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference | ||
17 | * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. | ||
18 | * | ||
19 | * IP_MASQ_FTP ftp masquerading module | ||
20 | * | ||
21 | * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 | ||
22 | * | ||
23 | * Author: Wouter Gadeyne | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | #include <linux/module.h> | ||
28 | #include <linux/moduleparam.h> | ||
29 | #include <linux/kernel.h> | ||
30 | #include <linux/skbuff.h> | ||
31 | #include <linux/in.h> | ||
32 | #include <linux/ip.h> | ||
33 | #include <net/protocol.h> | ||
34 | #include <net/tcp.h> | ||
35 | |||
36 | #include <net/ip_vs.h> | ||
37 | |||
38 | |||
/* FTP payload markers this helper parses (and, for PASV, rewrites) */
#define SERVER_STRING "227 Entering Passive Mode ("
#define CLIENT_STRING "PORT "


/*
 * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
 * First port is set to the default port.
 */
static int ports[IP_VS_APP_MAX_PORTS] = {21, 0};
module_param_array(ports, int, NULL, 0);

/*
 * Debug level
 */
#ifdef CONFIG_IP_VS_DEBUG
static int debug=0;
module_param(debug, int, 0);
#endif


/* Dummy variable: only its address matters — it is stored in
 * cp->app_data by ip_vs_ftp_in() to flag a control connection that has
 * seen a PASV command, and checked/cleared by ip_vs_ftp_out(). */
static int ip_vs_ftp_pasv;
61 | |||
62 | |||
/* Per-connection init hook for the FTP helper: no state to set up. */
static int
ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
{
	return 0;
}
68 | |||
69 | |||
/* Per-connection teardown hook for the FTP helper: nothing to free. */
static int
ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
{
	return 0;
}
75 | |||
76 | |||
77 | /* | ||
78 | * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started | ||
79 | * with the "pattern" and terminated with the "term" character. | ||
80 | * <addr,port> is in network order. | ||
81 | */ | ||
82 | static int ip_vs_ftp_get_addrport(char *data, char *data_limit, | ||
83 | const char *pattern, size_t plen, char term, | ||
84 | __u32 *addr, __u16 *port, | ||
85 | char **start, char **end) | ||
86 | { | ||
87 | unsigned char p[6]; | ||
88 | int i = 0; | ||
89 | |||
90 | if (data_limit - data < plen) { | ||
91 | /* check if there is partial match */ | ||
92 | if (strnicmp(data, pattern, data_limit - data) == 0) | ||
93 | return -1; | ||
94 | else | ||
95 | return 0; | ||
96 | } | ||
97 | |||
98 | if (strnicmp(data, pattern, plen) != 0) { | ||
99 | return 0; | ||
100 | } | ||
101 | *start = data + plen; | ||
102 | |||
103 | for (data = *start; *data != term; data++) { | ||
104 | if (data == data_limit) | ||
105 | return -1; | ||
106 | } | ||
107 | *end = data; | ||
108 | |||
109 | memset(p, 0, sizeof(p)); | ||
110 | for (data = *start; data != *end; data++) { | ||
111 | if (*data >= '0' && *data <= '9') { | ||
112 | p[i] = p[i]*10 + *data - '0'; | ||
113 | } else if (*data == ',' && i < 5) { | ||
114 | i++; | ||
115 | } else { | ||
116 | /* unexpected character */ | ||
117 | return -1; | ||
118 | } | ||
119 | } | ||
120 | |||
121 | if (i != 5) | ||
122 | return -1; | ||
123 | |||
124 | *addr = (p[3]<<24) | (p[2]<<16) | (p[1]<<8) | p[0]; | ||
125 | *port = (p[5]<<8) | p[4]; | ||
126 | return 1; | ||
127 | } | ||
128 | |||
129 | |||
130 | /* | ||
131 | * Look at outgoing ftp packets to catch the response to a PASV command | ||
132 | * from the server (inside-to-outside). | ||
133 | * When we see one, we build a connection entry with the client address, | ||
134 | * client port 0 (unknown at the moment), the server address and the | ||
135 | * server port. Mark the current connection entry as a control channel | ||
136 | * of the new entry. All this work is just to make the data connection | ||
137 | * can be scheduled to the right server later. | ||
138 | * | ||
139 | * The outgoing packet should be something like | ||
140 | * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". | ||
141 | * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. | ||
142 | */ | ||
/*
 * Rewrite the server's "227 Entering Passive Mode (...)" reply so the
 * client sees the virtual address, and create/refresh the connection
 * entry for the expected data connection.  Returns 1 to let the packet
 * continue, 0 to drop it.
 */
static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
			 struct sk_buff **pskb, int *diff)
{
	struct iphdr *iph;
	struct tcphdr *th;
	char *data, *data_limit;
	char *start, *end;
	__u32 from;
	__u16 port;
	struct ip_vs_conn *n_cp;
	char buf[24];		/* xxx.xxx.xxx.xxx,ppp,ppp\000 */
	unsigned buf_len;
	int ret;

	/* no payload length change unless we rewrite a PASV reply */
	*diff = 0;

	/* Only useful for established sessions */
	if (cp->state != IP_VS_TCP_S_ESTABLISHED)
		return 1;

	/* Linear packets are much easier to deal with. */
	if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
		return 0;

	/* app_data was set by ip_vs_ftp_in() when it saw "PASV\r\n";
	   only then do we look for the 227 response */
	if (cp->app_data == &ip_vs_ftp_pasv) {
		iph = (*pskb)->nh.iph;
		th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
		data = (char *)th + (th->doff << 2);
		data_limit = (*pskb)->tail;

		if (ip_vs_ftp_get_addrport(data, data_limit,
					   SERVER_STRING,
					   sizeof(SERVER_STRING)-1, ')',
					   &from, &port,
					   &start, &end) != 1)
			return 1;

		IP_VS_DBG(1-debug, "PASV response (%u.%u.%u.%u:%d) -> "
			  "%u.%u.%u.%u:%d detected\n",
			  NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0);

		/*
		 * Now update or create an connection entry for it
		 */
		n_cp = ip_vs_conn_out_get(iph->protocol, from, port,
					  cp->caddr, 0);
		if (!n_cp) {
			/* client port is unknown yet, hence NO_CPORT */
			n_cp = ip_vs_conn_new(IPPROTO_TCP,
					      cp->caddr, 0,
					      cp->vaddr, port,
					      from, port,
					      IP_VS_CONN_F_NO_CPORT,
					      cp->dest);
			if (!n_cp)
				return 0;

			/* add its controller */
			ip_vs_control_add(n_cp, cp);
		}

		/*
		 * Replace the old passive address with the new one
		 */
		from = n_cp->vaddr;
		port = n_cp->vport;
		/* comma format, port in little-endian octet order */
		sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from),
			port&255, (port>>8)&255);
		buf_len = strlen(buf);

		/*
		 * Calculate required delta-offset to keep TCP happy
		 */
		*diff = buf_len - (end-start);

		if (*diff == 0) {
			/* simply replace it with new passive address */
			memcpy(start, buf, buf_len);
			ret = 1;
		} else {
			/* length changed: splice the new text into the skb */
			ret = !ip_vs_skb_replace(*pskb, GFP_ATOMIC, start,
					  end-start, buf, buf_len);
		}

		/* consume the flag: one 227 rewrite per PASV command */
		cp->app_data = NULL;
		ip_vs_tcp_conn_listen(n_cp);
		ip_vs_conn_put(n_cp);
		return ret;
	}
	return 1;
}
233 | |||
234 | |||
235 | /* | ||
236 | * Look at incoming ftp packets to catch the PASV/PORT command | ||
237 | * (outside-to-inside). | ||
238 | * | ||
239 | * The incoming packet having the PORT command should be something like | ||
240 | * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". | ||
241 | * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. | ||
242 | * In this case, we create a connection entry using the client address and | ||
243 | * port, so that the active ftp data connection from the server can reach | ||
244 | * the client. | ||
245 | */ | ||
/*
 * Inspect client-to-server FTP traffic: flag PASV commands for
 * ip_vs_ftp_out(), and on a PORT command create the connection entry
 * the active-mode data connection will match.  Returns 1 to let the
 * packet continue, 0 to drop it.
 */
static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
			struct sk_buff **pskb, int *diff)
{
	struct iphdr *iph;
	struct tcphdr *th;
	char *data, *data_start, *data_limit;
	char *start, *end;
	__u32 to;
	__u16 port;
	struct ip_vs_conn *n_cp;

	/* no diff required for incoming packets */
	*diff = 0;

	/* Only useful for established sessions */
	if (cp->state != IP_VS_TCP_S_ESTABLISHED)
		return 1;

	/* Linear packets are much easier to deal with. */
	if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
		return 0;

	/*
	 * Detecting whether it is passive
	 */
	iph = (*pskb)->nh.iph;
	th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);

	/* Since there may be OPTIONS in the TCP packet and the HLEN is
	   the length of the header in 32-bit multiples, it is accurate
	   to calculate data address by th+HLEN*4 */
	data = data_start = (char *)th + (th->doff << 2);
	data_limit = (*pskb)->tail;

	/* scan the payload for "PASV\r\n"; if found, mark the control
	   connection so ip_vs_ftp_out() rewrites the 227 response */
	while (data <= data_limit - 6) {
		if (strnicmp(data, "PASV\r\n", 6) == 0) {
			/* Passive mode on */
			IP_VS_DBG(1-debug, "got PASV at %zd of %zd\n",
				  data - data_start,
				  data_limit - data_start);
			cp->app_data = &ip_vs_ftp_pasv;
			return 1;
		}
		data++;
	}

	/*
	 * To support virtual FTP server, the scenario is as follows:
	 *       FTP client ----> Load Balancer ----> FTP server
	 * First detect the port number in the application data,
	 * then create a new connection entry for the coming data
	 * connection.
	 */
	if (ip_vs_ftp_get_addrport(data_start, data_limit,
				   CLIENT_STRING, sizeof(CLIENT_STRING)-1,
				   '\r', &to, &port,
				   &start, &end) != 1)
		return 1;

	IP_VS_DBG(1-debug, "PORT %u.%u.%u.%u:%d detected\n",
		  NIPQUAD(to), ntohs(port));

	/* Passive mode off */
	cp->app_data = NULL;

	/*
	 * Now update or create a connection entry for it
	 */
	IP_VS_DBG(1-debug, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
		  ip_vs_proto_name(iph->protocol),
		  NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0);

	/* ftp-data uses the port just below the control port (20 vs 21),
	   hence the ntohs(...)-1 on the virtual and real ports */
	n_cp = ip_vs_conn_in_get(iph->protocol,
				 to, port,
				 cp->vaddr, htons(ntohs(cp->vport)-1));
	if (!n_cp) {
		n_cp = ip_vs_conn_new(IPPROTO_TCP,
				      to, port,
				      cp->vaddr, htons(ntohs(cp->vport)-1),
				      cp->daddr, htons(ntohs(cp->dport)-1),
				      0,
				      cp->dest);
		if (!n_cp)
			return 0;

		/* add its controller */
		ip_vs_control_add(n_cp, cp);
	}

	/*
	 *	Move tunnel to listen state
	 */
	ip_vs_tcp_conn_listen(n_cp);
	ip_vs_conn_put(n_cp);

	return 1;
}
343 | |||
344 | |||
/*
 * IPVS application descriptor for the FTP helper;
 * registered by ip_vs_ftp_init().
 */
static struct ip_vs_app ip_vs_ftp = {
	.name =		"ftp",
	.type =		IP_VS_APP_TYPE_FTP,
	.protocol =	IPPROTO_TCP,
	.module =	THIS_MODULE,
	.incs_list =	LIST_HEAD_INIT(ip_vs_ftp.incs_list),
	.init_conn =	ip_vs_ftp_init_conn,
	.done_conn =	ip_vs_ftp_done_conn,
	.bind_conn =	NULL,
	.unbind_conn =	NULL,
	.pkt_out =	ip_vs_ftp_out,	/* server-to-client (PASV reply) */
	.pkt_in =	ip_vs_ftp_in,	/* client-to-server (PASV/PORT) */
};
358 | |||
359 | |||
360 | /* | ||
361 | * ip_vs_ftp initialization | ||
362 | */ | ||
/*
 * Module init: register the FTP application and one incarnation per
 * configured control port.  On any per-port failure the whole app
 * registration is rolled back and the error is returned.
 */
static int __init ip_vs_ftp_init(void)
{
	int i, ret;
	struct ip_vs_app *app = &ip_vs_ftp;

	ret = register_ip_vs_app(app);
	if (ret)
		return ret;

	/* unused slots in ports[] are zero and skipped */
	for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
		if (!ports[i])
			continue;
		ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
		if (ret)
			break;
		IP_VS_DBG(1-debug, "%s: loaded support on port[%d] = %d\n",
			  app->name, i, ports[i]);
	}

	/* roll back the app registration if any port failed */
	if (ret)
		unregister_ip_vs_app(app);

	return ret;
}
387 | |||
388 | |||
389 | /* | ||
390 | * ip_vs_ftp finish. | ||
391 | */ | ||
/*
 * Module exit: unregister the FTP application helper.
 */
static void __exit ip_vs_ftp_exit(void)
{
	unregister_ip_vs_app(&ip_vs_ftp);
}
396 | |||
397 | |||
/* Standard module plumbing for the FTP helper. */
module_init(ip_vs_ftp_init);
module_exit(ip_vs_ftp_exit);
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c new file mode 100644 index 000000000000..c035838b780a --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_lblc.c | |||
@@ -0,0 +1,624 @@ | |||
1 | /* | ||
2 | * IPVS: Locality-Based Least-Connection scheduling module | ||
3 | * | ||
4 | * Version: $Id: ip_vs_lblc.c,v 1.10 2002/09/15 08:14:08 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Wensong Zhang <wensong@gnuchina.org> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License | ||
10 | * as published by the Free Software Foundation; either version | ||
11 | * 2 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * Changes: | ||
14 | * Martin Hamilton : fixed the terrible locking bugs | ||
15 | * *lock(tbl->lock) ==> *lock(&tbl->lock) | ||
16 | * Wensong Zhang : fixed the uninitilized tbl->lock bug | ||
17 | * Wensong Zhang : added doing full expiration check to | ||
18 | * collect stale entries of 24+ hours when | ||
19 | * no partial expire check in a half hour | ||
20 | * Julian Anastasov : replaced del_timer call with del_timer_sync | ||
21 | * to avoid the possible race between timer | ||
22 | * handler and del_timer thread in SMP | ||
23 | * | ||
24 | */ | ||
25 | |||
26 | /* | ||
27 | * The lblc algorithm is as follows (pseudo code): | ||
28 | * | ||
29 | * if cachenode[dest_ip] is null then | ||
30 | * n, cachenode[dest_ip] <- {weighted least-conn node}; | ||
31 | * else | ||
32 | * n <- cachenode[dest_ip]; | ||
33 | * if (n is dead) OR | ||
34 | * (n.conns>n.weight AND | ||
35 | * there is a node m with m.conns<m.weight/2) then | ||
36 | * n, cachenode[dest_ip] <- {weighted least-conn node}; | ||
37 | * | ||
38 | * return n; | ||
39 | * | ||
40 | * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing | ||
41 | * me to write this module. | ||
42 | */ | ||
43 | |||
44 | #include <linux/module.h> | ||
45 | #include <linux/kernel.h> | ||
46 | |||
47 | /* for sysctl */ | ||
48 | #include <linux/fs.h> | ||
49 | #include <linux/sysctl.h> | ||
50 | |||
51 | #include <net/ip_vs.h> | ||
52 | |||
53 | |||
54 | /* | ||
55 | * It is for garbage collection of stale IPVS lblc entries, | ||
56 | * when the table is full. | ||
57 | */ | ||
58 | #define CHECK_EXPIRE_INTERVAL (60*HZ) | ||
59 | #define ENTRY_TIMEOUT (6*60*HZ) | ||
60 | |||
61 | /* | ||
62 | * It is for full expiration check. | ||
63 | * When there is no partial expiration check (garbage collection) | ||
64 | * in a half hour, do a full expiration check to collect stale | ||
65 | * entries that haven't been touched for a day. | ||
66 | */ | ||
67 | #define COUNT_FOR_FULL_EXPIRATION 30 | ||
68 | static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; | ||
69 | |||
70 | |||
71 | /* | ||
72 | * for IPVS lblc entry hash table | ||
73 | */ | ||
74 | #ifndef CONFIG_IP_VS_LBLC_TAB_BITS | ||
75 | #define CONFIG_IP_VS_LBLC_TAB_BITS 10 | ||
76 | #endif | ||
77 | #define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS | ||
78 | #define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) | ||
79 | #define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) | ||
80 | |||
81 | |||
/*
 *      IPVS lblc entry represents an association between destination
 *      IP address and its destination server
 */
struct ip_vs_lblc_entry {
	struct list_head        list;		/* hash-bucket linkage */
	__u32                   addr;           /* destination IP address */
	struct ip_vs_dest       *dest;          /* real server (cache) */
	unsigned long           lastuse;        /* last used time (jiffies) */
};


/*
 *      IPVS lblc hash table (one per service using this scheduler)
 */
struct ip_vs_lblc_table {
	rwlock_t	        lock;           /* lock for this table */
	struct list_head        bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */
	atomic_t                entries;        /* number of entries */
	int                     max_size;       /* maximum size of entries */
	struct timer_list       periodic_timer; /* collect stale entries */
	int                     rover;          /* rover for expire check */
	int                     counter;        /* counter for no expire */
};
106 | |||
107 | |||
108 | /* | ||
109 | * IPVS LBLC sysctl table | ||
110 | */ | ||
111 | |||
/* Leaf entry: net.ipv4.vs.lblc_expiration, value kept in jiffies
   (proc_dointvec_jiffies converts seconds <-> jiffies on access). */
static ctl_table vs_vars_table[] = {
	{
		.ctl_name = NET_IPV4_VS_LBLC_EXPIRE,
		.procname = "lblc_expiration",
		.data = &sysctl_ip_vs_lblc_expiration,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec_jiffies,
	},
	{ .ctl_name = 0 }	/* sentinel */
};

/* Directory chain: net -> ipv4 -> vs, rooted at lblc_root_table */
static ctl_table vs_table[] = {
	{
		.ctl_name = NET_IPV4_VS,
		.procname = "vs",
		.mode = 0555,
		.child = vs_vars_table
	},
	{ .ctl_name = 0 }
};

static ctl_table ipv4_table[] = {
	{
		.ctl_name = NET_IPV4,
		.procname = "ipv4",
		.mode = 0555,
		.child = vs_table
	},
	{ .ctl_name = 0 }
};

static ctl_table lblc_root_table[] = {
	{
		.ctl_name = CTL_NET,
		.procname = "net",
		.mode = 0555,
		.child = ipv4_table
	},
	{ .ctl_name = 0 }
};

/* registration handle — presumably set by register_sysctl_table()
   elsewhere in this file and used for cleanup; verify against init */
static struct ctl_table_header * sysctl_header;
155 | |||
156 | /* | ||
157 | * new/free a ip_vs_lblc_entry, which is a mapping of a destionation | ||
158 | * IP address to a server. | ||
159 | */ | ||
/*
 * Allocate a new cache entry mapping @daddr to @dest and take a
 * reference on the destination.  Returns NULL on allocation failure.
 * GFP_ATOMIC — presumably because this can run from the scheduling
 * (softirq) path; confirm against the callers.
 */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest)
{
	struct ip_vs_lblc_entry *en;

	en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
	if (en == NULL) {
		IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
		return NULL;
	}

	/* an empty list node marks the entry as "not hashed yet" */
	INIT_LIST_HEAD(&en->list);
	en->addr = daddr;

	/* hold the destination for as long as it is cached */
	atomic_inc(&dest->refcnt);
	en->dest = dest;

	return en;
}
179 | |||
180 | |||
/*
 * Unlink @en from its bucket, drop its destination reference and free
 * it.  Caller must hold the table lock for writing.
 */
static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
{
	list_del(&en->list);
	/*
	 * We don't kfree dest because it is refered either by its service
	 * or the trash dest list.
	 */
	atomic_dec(&en->dest->refcnt);
	kfree(en);
}
191 | |||
192 | |||
193 | /* | ||
194 | * Returns hash value for IPVS LBLC entry | ||
195 | */ | ||
196 | static inline unsigned ip_vs_lblc_hashkey(__u32 addr) | ||
197 | { | ||
198 | return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; | ||
199 | } | ||
200 | |||
201 | |||
202 | /* | ||
203 | * Hash an entry in the ip_vs_lblc_table. | ||
204 | * returns bool success. | ||
205 | */ | ||
/*
 * Hash an entry in the ip_vs_lblc_table.
 * returns bool success (0 if the entry is already hashed).
 */
static int
ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
{
	unsigned hash;

	/* a non-empty list node means the entry is already on a bucket */
	if (!list_empty(&en->list)) {
		IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
			  "called from %p\n", __builtin_return_address(0));
		return 0;
	}

	/*
	 *	Hash by destination IP address
	 */
	hash = ip_vs_lblc_hashkey(en->addr);

	write_lock(&tbl->lock);
	list_add(&en->list, &tbl->bucket[hash]);
	atomic_inc(&tbl->entries);
	write_unlock(&tbl->lock);

	return 1;
}
229 | |||
230 | |||
#if 0000
/*
 * Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
 * returns bool success.
 * NOTE: dead code — compiled out by the "#if 0000" guard above and
 * kept only for reference.
 */
static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
			     struct ip_vs_lblc_entry *en)
{
	if (list_empty(&en->list)) {
		IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
			  "called from %p\n", __builtin_return_address(0));
		return 0;
	}

	/*
	 * Remove it from the table
	 */
	write_lock(&tbl->lock);
	list_del(&en->list);
	/* re-mark as "not hashed" so a later hash attempt is legal */
	INIT_LIST_HEAD(&en->list);
	write_unlock(&tbl->lock);

	return 1;
}
#endif
256 | |||
257 | |||
258 | /* | ||
259 | * Get ip_vs_lblc_entry associated with supplied parameters. | ||
260 | */ | ||
261 | static inline struct ip_vs_lblc_entry * | ||
262 | ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr) | ||
263 | { | ||
264 | unsigned hash; | ||
265 | struct ip_vs_lblc_entry *en; | ||
266 | |||
267 | hash = ip_vs_lblc_hashkey(addr); | ||
268 | |||
269 | read_lock(&tbl->lock); | ||
270 | |||
271 | list_for_each_entry(en, &tbl->bucket[hash], list) { | ||
272 | if (en->addr == addr) { | ||
273 | /* HIT */ | ||
274 | read_unlock(&tbl->lock); | ||
275 | return en; | ||
276 | } | ||
277 | } | ||
278 | |||
279 | read_unlock(&tbl->lock); | ||
280 | |||
281 | return NULL; | ||
282 | } | ||
283 | |||
284 | |||
285 | /* | ||
286 | * Flush all the entries of the specified table. | ||
287 | */ | ||
/*
 * Flush all the entries of the specified table.
 * Each bucket is emptied under the table write lock; the lock is
 * dropped between buckets.
 */
static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
{
	int i;
	struct ip_vs_lblc_entry *en, *nxt;

	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
		write_lock(&tbl->lock);
		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
			ip_vs_lblc_free(en);
			atomic_dec(&tbl->entries);
		}
		write_unlock(&tbl->lock);
	}
}
302 | |||
303 | |||
/*
 * Full expiration pass: walk every bucket, starting just past the
 * rover, and drop entries unused for longer than
 * sysctl_ip_vs_lblc_expiration (default 24h).
 */
static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
{
	unsigned long now = jiffies;
	int i, j;
	struct ip_vs_lblc_entry *en, *nxt;

	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		write_lock(&tbl->lock);
		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
			/* still fresh: keep it */
			if (time_before(now,
					en->lastuse + sysctl_ip_vs_lblc_expiration))
				continue;

			ip_vs_lblc_free(en);
			atomic_dec(&tbl->entries);
		}
		write_unlock(&tbl->lock);
	}
	/* remember where we stopped for the next partial pass */
	tbl->rover = j;
}
326 | |||
327 | |||
328 | /* | ||
329 | * Periodical timer handler for IPVS lblc table | ||
330 | * It is used to collect stale entries when the number of entries | ||
331 | * exceeds the maximum size of the table. | ||
332 | * | ||
333 | * Fixme: we probably need more complicated algorithm to collect | ||
334 | * entries that have not been used for a long time even | ||
335 | * if the number of entries doesn't exceed the maximum size | ||
336 | * of the table. | ||
337 | * The full expiration check is for this purpose now. | ||
338 | */ | ||
/*
 * Periodic GC timer handler for one service's LBLC table (tbl passed
 * via the timer's data field).  Does a full sweep every
 * COUNT_FOR_FULL_EXPIRATION ticks, otherwise only collects when the
 * table exceeds max_size.
 */
static void ip_vs_lblc_check_expire(unsigned long data)
{
	struct ip_vs_lblc_table *tbl;
	unsigned long now = jiffies;
	int goal;
	int i, j;
	struct ip_vs_lblc_entry *en, *nxt;

	tbl = (struct ip_vs_lblc_table *)data;

	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
		/* do full expiration check */
		ip_vs_lblc_full_check(tbl);
		tbl->counter = 1;
		goto out;
	}

	/* under the size limit: nothing to collect this tick */
	if (atomic_read(&tbl->entries) <= tbl->max_size) {
		tbl->counter++;
		goto out;
	}

	/* aim for a bit more than the current excess (4/3 of it),
	   capped at half of the table's maximum size */
	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
	if (goal > tbl->max_size/2)
		goal = tbl->max_size/2;

	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		write_lock(&tbl->lock);
		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
				continue;

			ip_vs_lblc_free(en);
			atomic_dec(&tbl->entries);
			goal--;
		}
		write_unlock(&tbl->lock);
		if (goal <= 0)
			break;
	}
	tbl->rover = j;

  out:
	/* re-arm for the next GC interval */
	mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
}
386 | |||
387 | |||
/*
 * Per-service init: allocate the LBLC hash table and start the
 * periodic garbage-collection timer.
 * Returns 0 on success, -ENOMEM if the table cannot be allocated.
 */
static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_lblc_table *tbl;

	/*
	 *    Allocate the ip_vs_lblc_table for this service
	 */
	tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
	if (tbl == NULL) {
		IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
		return -ENOMEM;
	}
	svc->sched_data = tbl;
	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
		  "current service\n",
		  sizeof(struct ip_vs_lblc_table));

	/*
	 *    Initialize the hash buckets
	 */
	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
		INIT_LIST_HEAD(&tbl->bucket[i]);
	}
	rwlock_init(&tbl->lock);
	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;	/* GC threshold */
	tbl->rover = 0;
	tbl->counter = 1;

	/*
	 *    Hook periodic timer for garbage collection
	 */
	init_timer(&tbl->periodic_timer);
	tbl->periodic_timer.data = (unsigned long)tbl;
	tbl->periodic_timer.function = ip_vs_lblc_check_expire;
	tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
	add_timer(&tbl->periodic_timer);

	return 0;
}
428 | |||
429 | |||
430 | static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) | ||
431 | { | ||
432 | struct ip_vs_lblc_table *tbl = svc->sched_data; | ||
433 | |||
434 | /* remove periodic timer */ | ||
435 | del_timer_sync(&tbl->periodic_timer); | ||
436 | |||
437 | /* got to clean up table entries here */ | ||
438 | ip_vs_lblc_flush(tbl); | ||
439 | |||
440 | /* release the table itself */ | ||
441 | kfree(svc->sched_data); | ||
442 | IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", | ||
443 | sizeof(struct ip_vs_lblc_table)); | ||
444 | |||
445 | return 0; | ||
446 | } | ||
447 | |||
448 | |||
/* Service update hook: LBLC keeps no state that needs refreshing. */
static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
{
	return 0;
}
453 | |||
454 | |||
/*
 * Weighted least-connection pick over svc->destinations, used as the
 * fallback when the locality cache has no usable server.
 * Returns NULL when no server with weight > 0 is available.
 * NOTE(review): 'iph' is not used in this body — presumably kept for
 * signature parity; confirm before changing.
 */
static inline struct ip_vs_dest *
__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
	struct ip_vs_dest *dest, *least;
	int loh, doh;

	/*
	 * We think the overhead of processing active connections is fifty
	 * times higher than that of inactive connections in average. (This
	 * fifty times might not be accurate, we will change it later.) We
	 * use the following formula to estimate the overhead:
	 *                dest->activeconns*50 + dest->inactconns
	 * and the load:
	 *                (dest overhead) / dest->weight
	 *
	 * Remember -- no floats in kernel mode!!!
	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
	 *                h1/w1 > h2/w2
	 * if every weight is larger than zero.
	 *
	 * The server with weight=0 is quiesced and will not receive any
	 * new connection.
	 */
	list_for_each_entry(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;
		if (atomic_read(&dest->weight) > 0) {
			/* first usable server seeds least/loh */
			least = dest;
			loh = atomic_read(&least->activeconns) * 50
				+ atomic_read(&least->inactconns);
			goto nextstage;
		}
	}
	return NULL;

	/*
	 * Find the destination with the least load.
	 */
  nextstage:
	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		doh = atomic_read(&dest->activeconns) * 50
			+ atomic_read(&dest->inactconns);
		/* loh/lw > doh/dw  <==>  loh*dw > doh*lw (weights > 0) */
		if (loh * atomic_read(&dest->weight) >
		    doh * atomic_read(&least->weight)) {
			least = dest;
			loh = doh;
		}
	}

	IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
		  "activeconns %d refcnt %d weight %d overhead %d\n",
		  NIPQUAD(least->addr), ntohs(least->port),
		  atomic_read(&least->activeconns),
		  atomic_read(&least->refcnt),
		  atomic_read(&least->weight), loh);

	return least;
}
516 | |||
517 | |||
518 | /* | ||
519 | * If this destination server is overloaded and there is a less loaded | ||
520 | * server, then return true. | ||
521 | */ | ||
522 | static inline int | ||
523 | is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) | ||
524 | { | ||
525 | if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { | ||
526 | struct ip_vs_dest *d; | ||
527 | |||
528 | list_for_each_entry(d, &svc->destinations, n_list) { | ||
529 | if (atomic_read(&d->activeconns)*2 | ||
530 | < atomic_read(&d->weight)) { | ||
531 | return 1; | ||
532 | } | ||
533 | } | ||
534 | } | ||
535 | return 0; | ||
536 | } | ||
537 | |||
538 | |||
539 | /* | ||
540 | * Locality-Based (weighted) Least-Connection scheduling | ||
541 | */ | ||
542 | static struct ip_vs_dest * | ||
543 | ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
544 | { | ||
545 | struct ip_vs_dest *dest; | ||
546 | struct ip_vs_lblc_table *tbl; | ||
547 | struct ip_vs_lblc_entry *en; | ||
548 | struct iphdr *iph = skb->nh.iph; | ||
549 | |||
550 | IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); | ||
551 | |||
552 | tbl = (struct ip_vs_lblc_table *)svc->sched_data; | ||
553 | en = ip_vs_lblc_get(tbl, iph->daddr); | ||
554 | if (en == NULL) { | ||
555 | dest = __ip_vs_wlc_schedule(svc, iph); | ||
556 | if (dest == NULL) { | ||
557 | IP_VS_DBG(1, "no destination available\n"); | ||
558 | return NULL; | ||
559 | } | ||
560 | en = ip_vs_lblc_new(iph->daddr, dest); | ||
561 | if (en == NULL) { | ||
562 | return NULL; | ||
563 | } | ||
564 | ip_vs_lblc_hash(tbl, en); | ||
565 | } else { | ||
566 | dest = en->dest; | ||
567 | if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) | ||
568 | || atomic_read(&dest->weight) <= 0 | ||
569 | || is_overloaded(dest, svc)) { | ||
570 | dest = __ip_vs_wlc_schedule(svc, iph); | ||
571 | if (dest == NULL) { | ||
572 | IP_VS_DBG(1, "no destination available\n"); | ||
573 | return NULL; | ||
574 | } | ||
575 | atomic_dec(&en->dest->refcnt); | ||
576 | atomic_inc(&dest->refcnt); | ||
577 | en->dest = dest; | ||
578 | } | ||
579 | } | ||
580 | en->lastuse = jiffies; | ||
581 | |||
582 | IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " | ||
583 | "--> server %u.%u.%u.%u:%d\n", | ||
584 | NIPQUAD(en->addr), | ||
585 | NIPQUAD(dest->addr), | ||
586 | ntohs(dest->port)); | ||
587 | |||
588 | return dest; | ||
589 | } | ||
590 | |||
591 | |||
592 | /* | ||
593 | * IPVS LBLC Scheduler structure | ||
594 | */ | ||
595 | static struct ip_vs_scheduler ip_vs_lblc_scheduler = | ||
596 | { | ||
597 | .name = "lblc", | ||
598 | .refcnt = ATOMIC_INIT(0), | ||
599 | .module = THIS_MODULE, | ||
600 | .init_service = ip_vs_lblc_init_svc, | ||
601 | .done_service = ip_vs_lblc_done_svc, | ||
602 | .update_service = ip_vs_lblc_update_svc, | ||
603 | .schedule = ip_vs_lblc_schedule, | ||
604 | }; | ||
605 | |||
606 | |||
607 | static int __init ip_vs_lblc_init(void) | ||
608 | { | ||
609 | INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list); | ||
610 | sysctl_header = register_sysctl_table(lblc_root_table, 0); | ||
611 | return register_ip_vs_scheduler(&ip_vs_lblc_scheduler); | ||
612 | } | ||
613 | |||
614 | |||
/* Module unload: drop the sysctl entries, then the scheduler. */
static void __exit ip_vs_lblc_cleanup(void)
{
	unregister_sysctl_table(sysctl_header);
	unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
}
620 | |||
621 | |||
/* Module plumbing: register on load, tear down on unload. */
module_init(ip_vs_lblc_init);
module_exit(ip_vs_lblc_cleanup);
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c new file mode 100644 index 000000000000..22b5dd55d271 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_lblcr.c | |||
@@ -0,0 +1,888 @@ | |||
1 | /* | ||
2 | * IPVS: Locality-Based Least-Connection with Replication scheduler | ||
3 | * | ||
4 | * Version: $Id: ip_vs_lblcr.c,v 1.11 2002/09/15 08:14:08 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Wensong Zhang <wensong@gnuchina.org> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License | ||
10 | * as published by the Free Software Foundation; either version | ||
11 | * 2 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * Changes: | ||
14 | * Julian Anastasov : Added the missing (dest->weight>0) | ||
15 | * condition in the ip_vs_dest_set_max. | ||
16 | * | ||
17 | */ | ||
18 | |||
19 | /* | ||
20 | * The lblc/r algorithm is as follows (pseudo code): | ||
21 | * | ||
22 | * if serverSet[dest_ip] is null then | ||
23 | * n, serverSet[dest_ip] <- {weighted least-conn node}; | ||
24 | * else | ||
25 | * n <- {least-conn (alive) node in serverSet[dest_ip]}; | ||
26 | * if (n is null) OR | ||
27 | * (n.conns>n.weight AND | ||
28 | * there is a node m with m.conns<m.weight/2) then | ||
29 | * n <- {weighted least-conn node}; | ||
30 | * add n to serverSet[dest_ip]; | ||
31 | * if |serverSet[dest_ip]| > 1 AND | ||
32 | * now - serverSet[dest_ip].lastMod > T then | ||
33 | * m <- {most conn node in serverSet[dest_ip]}; | ||
34 | * remove m from serverSet[dest_ip]; | ||
35 | * if serverSet[dest_ip] changed then | ||
36 | * serverSet[dest_ip].lastMod <- now; | ||
37 | * | ||
38 | * return n; | ||
39 | * | ||
40 | */ | ||
41 | |||
42 | #include <linux/module.h> | ||
43 | #include <linux/kernel.h> | ||
44 | |||
45 | /* for sysctl */ | ||
46 | #include <linux/fs.h> | ||
47 | #include <linux/sysctl.h> | ||
48 | /* for proc_net_create/proc_net_remove */ | ||
49 | #include <linux/proc_fs.h> | ||
50 | |||
51 | #include <net/ip_vs.h> | ||
52 | |||
53 | |||
54 | /* | ||
55 | * It is for garbage collection of stale IPVS lblcr entries, | ||
56 | * when the table is full. | ||
57 | */ | ||
58 | #define CHECK_EXPIRE_INTERVAL (60*HZ) | ||
59 | #define ENTRY_TIMEOUT (6*60*HZ) | ||
60 | |||
61 | /* | ||
62 | * It is for full expiration check. | ||
63 | * When there is no partial expiration check (garbage collection) | ||
64 | * in a half hour, do a full expiration check to collect stale | ||
65 | * entries that haven't been touched for a day. | ||
66 | */ | ||
67 | #define COUNT_FOR_FULL_EXPIRATION 30 | ||
68 | static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; | ||
69 | |||
70 | |||
71 | /* | ||
72 | * for IPVS lblcr entry hash table | ||
73 | */ | ||
74 | #ifndef CONFIG_IP_VS_LBLCR_TAB_BITS | ||
75 | #define CONFIG_IP_VS_LBLCR_TAB_BITS 10 | ||
76 | #endif | ||
77 | #define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS | ||
78 | #define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) | ||
79 | #define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) | ||
80 | |||
81 | |||
82 | /* | ||
83 | * IPVS destination set structure and operations | ||
84 | */ | ||
85 | struct ip_vs_dest_list { | ||
86 | struct ip_vs_dest_list *next; /* list link */ | ||
87 | struct ip_vs_dest *dest; /* destination server */ | ||
88 | }; | ||
89 | |||
90 | struct ip_vs_dest_set { | ||
91 | atomic_t size; /* set size */ | ||
92 | unsigned long lastmod; /* last modified time */ | ||
93 | struct ip_vs_dest_list *list; /* destination list */ | ||
94 | rwlock_t lock; /* lock for this list */ | ||
95 | }; | ||
96 | |||
97 | |||
/*
 * Add 'dest' to the set, taking a reference on it. Returns the new
 * node, or NULL if 'dest' was already present or allocation failed.
 */
static struct ip_vs_dest_list *
ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
{
	struct ip_vs_dest_list *e;

	/* NOTE(review): this scan runs without set->lock held, so a
	 * concurrent insert could add a duplicate — confirm callers
	 * serialize modifications of the same set. */
	for (e=set->list; e!=NULL; e=e->next) {
		if (e->dest == dest)
			/* already existed */
			return NULL;
	}

	e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
	if (e == NULL) {
		IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
		return NULL;
	}

	/* pin the destination while the set refers to it */
	atomic_inc(&dest->refcnt);
	e->dest = dest;

	/* link it to the list */
	write_lock(&set->lock);
	e->next = set->list;
	set->list = e;
	atomic_inc(&set->size);
	write_unlock(&set->lock);

	/* NOTE(review): lastmod is updated outside the lock */
	set->lastmod = jiffies;
	return e;
}
128 | |||
/*
 * Remove 'dest' from the set (if present), dropping the reference
 * taken by ip_vs_dest_set_insert().
 */
static void
ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
{
	struct ip_vs_dest_list *e, **ep;

	write_lock(&set->lock);
	/* walk via a pointer-to-link so unlinking needs no prev node */
	for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
		if (e->dest == dest) {
			/* HIT */
			*ep = e->next;
			atomic_dec(&set->size);
			set->lastmod = jiffies;
			atomic_dec(&e->dest->refcnt);
			kfree(e);
			break;
		}
		ep = &e->next;
	}
	write_unlock(&set->lock);
}
149 | |||
150 | static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) | ||
151 | { | ||
152 | struct ip_vs_dest_list *e, **ep; | ||
153 | |||
154 | write_lock(&set->lock); | ||
155 | for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { | ||
156 | *ep = e->next; | ||
157 | /* | ||
158 | * We don't kfree dest because it is refered either | ||
159 | * by its service or by the trash dest list. | ||
160 | */ | ||
161 | atomic_dec(&e->dest->refcnt); | ||
162 | kfree(e); | ||
163 | } | ||
164 | write_unlock(&set->lock); | ||
165 | } | ||
166 | |||
/* get weighted least-connection node in the destination set */
static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
{
	register struct ip_vs_dest_list *e;
	struct ip_vs_dest *dest, *least;
	int loh, doh;

	if (set == NULL)
		return NULL;

	read_lock(&set->lock);
	/* select the first destination server, whose weight > 0 */
	for (e=set->list; e!=NULL; e=e->next) {
		least = e->dest;
		if (least->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		if ((atomic_read(&least->weight) > 0)
		    && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
			/* overhead estimate: activeconns*50 + inactconns,
			 * same formula the wlc fallback uses */
			loh = atomic_read(&least->activeconns) * 50
				+ atomic_read(&least->inactconns);
			goto nextstage;
		}
	}
	read_unlock(&set->lock);
	return NULL;

	/* find the destination with the weighted least load */
  nextstage:
	for (e=e->next; e!=NULL; e=e->next) {
		dest = e->dest;
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		doh = atomic_read(&dest->activeconns) * 50
			+ atomic_read(&dest->inactconns);
		/* loh/lw > doh/dw  <==>  loh*dw > doh*lw (weights > 0) */
		if ((loh * atomic_read(&dest->weight) >
		     doh * atomic_read(&least->weight))
		    && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
			least = dest;
			loh = doh;
		}
	}
	read_unlock(&set->lock);

	IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
		  "activeconns %d refcnt %d weight %d overhead %d\n",
		  NIPQUAD(least->addr), ntohs(least->port),
		  atomic_read(&least->activeconns),
		  atomic_read(&least->refcnt),
		  atomic_read(&least->weight), loh);
	return least;
}
220 | |||
221 | |||
/* get weighted most-connection node in the destination set;
 * used to pick the victim when a set is being shrunk.
 * NOTE(review): unlike ip_vs_dest_set_min(), the OVERLOAD/AVAILABLE
 * flags are not checked here — presumably intentional since we are
 * selecting a server to drop, not to use; confirm. */
static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
{
	register struct ip_vs_dest_list *e;
	struct ip_vs_dest *dest, *most;
	int moh, doh;

	if (set == NULL)
		return NULL;

	read_lock(&set->lock);
	/* select the first destination server, whose weight > 0 */
	for (e=set->list; e!=NULL; e=e->next) {
		most = e->dest;
		if (atomic_read(&most->weight) > 0) {
			moh = atomic_read(&most->activeconns) * 50
				+ atomic_read(&most->inactconns);
			goto nextstage;
		}
	}
	read_unlock(&set->lock);
	return NULL;

	/* find the destination with the weighted most load */
  nextstage:
	for (e=e->next; e!=NULL; e=e->next) {
		dest = e->dest;
		doh = atomic_read(&dest->activeconns) * 50
			+ atomic_read(&dest->inactconns);
		/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
		if ((moh * atomic_read(&dest->weight) <
		     doh * atomic_read(&most->weight))
		    && (atomic_read(&dest->weight) > 0)) {
			most = dest;
			moh = doh;
		}
	}
	read_unlock(&set->lock);

	IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
		  "activeconns %d refcnt %d weight %d overhead %d\n",
		  NIPQUAD(most->addr), ntohs(most->port),
		  atomic_read(&most->activeconns),
		  atomic_read(&most->refcnt),
		  atomic_read(&most->weight), moh);
	return most;
}
269 | |||
270 | |||
271 | /* | ||
272 | * IPVS lblcr entry represents an association between destination | ||
273 | * IP address and its destination server set | ||
274 | */ | ||
275 | struct ip_vs_lblcr_entry { | ||
276 | struct list_head list; | ||
277 | __u32 addr; /* destination IP address */ | ||
278 | struct ip_vs_dest_set set; /* destination server set */ | ||
279 | unsigned long lastuse; /* last used time */ | ||
280 | }; | ||
281 | |||
282 | |||
283 | /* | ||
284 | * IPVS lblcr hash table | ||
285 | */ | ||
286 | struct ip_vs_lblcr_table { | ||
287 | rwlock_t lock; /* lock for this table */ | ||
288 | struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ | ||
289 | atomic_t entries; /* number of entries */ | ||
290 | int max_size; /* maximum size of entries */ | ||
291 | struct timer_list periodic_timer; /* collect stale entries */ | ||
292 | int rover; /* rover for expire check */ | ||
293 | int counter; /* counter for no expire */ | ||
294 | }; | ||
295 | |||
296 | |||
297 | /* | ||
298 | * IPVS LBLCR sysctl table | ||
299 | */ | ||
300 | |||
301 | static ctl_table vs_vars_table[] = { | ||
302 | { | ||
303 | .ctl_name = NET_IPV4_VS_LBLCR_EXPIRE, | ||
304 | .procname = "lblcr_expiration", | ||
305 | .data = &sysctl_ip_vs_lblcr_expiration, | ||
306 | .maxlen = sizeof(int), | ||
307 | .mode = 0644, | ||
308 | .proc_handler = &proc_dointvec_jiffies, | ||
309 | }, | ||
310 | { .ctl_name = 0 } | ||
311 | }; | ||
312 | |||
313 | static ctl_table vs_table[] = { | ||
314 | { | ||
315 | .ctl_name = NET_IPV4_VS, | ||
316 | .procname = "vs", | ||
317 | .mode = 0555, | ||
318 | .child = vs_vars_table | ||
319 | }, | ||
320 | { .ctl_name = 0 } | ||
321 | }; | ||
322 | |||
323 | static ctl_table ipv4_table[] = { | ||
324 | { | ||
325 | .ctl_name = NET_IPV4, | ||
326 | .procname = "ipv4", | ||
327 | .mode = 0555, | ||
328 | .child = vs_table | ||
329 | }, | ||
330 | { .ctl_name = 0 } | ||
331 | }; | ||
332 | |||
333 | static ctl_table lblcr_root_table[] = { | ||
334 | { | ||
335 | .ctl_name = CTL_NET, | ||
336 | .procname = "net", | ||
337 | .mode = 0555, | ||
338 | .child = ipv4_table | ||
339 | }, | ||
340 | { .ctl_name = 0 } | ||
341 | }; | ||
342 | |||
343 | static struct ctl_table_header * sysctl_header; | ||
344 | |||
345 | /* | ||
346 | * new/free a ip_vs_lblcr_entry, which is a mapping of a destination | ||
347 | * IP address to a server. | ||
348 | */ | ||
349 | static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr) | ||
350 | { | ||
351 | struct ip_vs_lblcr_entry *en; | ||
352 | |||
353 | en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC); | ||
354 | if (en == NULL) { | ||
355 | IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); | ||
356 | return NULL; | ||
357 | } | ||
358 | |||
359 | INIT_LIST_HEAD(&en->list); | ||
360 | en->addr = daddr; | ||
361 | |||
362 | /* initilize its dest set */ | ||
363 | atomic_set(&(en->set.size), 0); | ||
364 | en->set.list = NULL; | ||
365 | rwlock_init(&en->set.lock); | ||
366 | |||
367 | return en; | ||
368 | } | ||
369 | |||
370 | |||
/* Unlink an entry from its hash bucket, drop all destinations in its
 * set and release it. NOTE(review): list_del() here is unlocked —
 * callers appear to hold tbl->lock; confirm at call sites. */
static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
{
	list_del(&en->list);
	ip_vs_dest_set_eraseall(&en->set);
	kfree(en);
}
377 | |||
378 | |||
379 | /* | ||
380 | * Returns hash value for IPVS LBLCR entry | ||
381 | */ | ||
382 | static inline unsigned ip_vs_lblcr_hashkey(__u32 addr) | ||
383 | { | ||
384 | return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; | ||
385 | } | ||
386 | |||
387 | |||
388 | /* | ||
389 | * Hash an entry in the ip_vs_lblcr_table. | ||
390 | * returns bool success. | ||
391 | */ | ||
392 | static int | ||
393 | ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) | ||
394 | { | ||
395 | unsigned hash; | ||
396 | |||
397 | if (!list_empty(&en->list)) { | ||
398 | IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, " | ||
399 | "called from %p\n", __builtin_return_address(0)); | ||
400 | return 0; | ||
401 | } | ||
402 | |||
403 | /* | ||
404 | * Hash by destination IP address | ||
405 | */ | ||
406 | hash = ip_vs_lblcr_hashkey(en->addr); | ||
407 | |||
408 | write_lock(&tbl->lock); | ||
409 | list_add(&en->list, &tbl->bucket[hash]); | ||
410 | atomic_inc(&tbl->entries); | ||
411 | write_unlock(&tbl->lock); | ||
412 | |||
413 | return 1; | ||
414 | } | ||
415 | |||
416 | |||
/* The helper below is compiled out (#if 0000 is always false); it is
 * kept only as a reference for the bucket-removal protocol. */
#if 0000
/*
 *      Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table.
 *      returns bool success.
 */
static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl,
			      struct ip_vs_lblcr_entry *en)
{
	if (list_empty(&en->list)) {
		IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, "
			  "called from %p\n", __builtin_return_address(0));
		return 0;
	}

	/*
	 *      Remove it from the table
	 */
	write_lock(&tbl->lock);
	list_del(&en->list);
	INIT_LIST_HEAD(&en->list);
	write_unlock(&tbl->lock);

	return 1;
}
#endif
442 | |||
443 | |||
444 | /* | ||
445 | * Get ip_vs_lblcr_entry associated with supplied parameters. | ||
446 | */ | ||
447 | static inline struct ip_vs_lblcr_entry * | ||
448 | ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __u32 addr) | ||
449 | { | ||
450 | unsigned hash; | ||
451 | struct ip_vs_lblcr_entry *en; | ||
452 | |||
453 | hash = ip_vs_lblcr_hashkey(addr); | ||
454 | |||
455 | read_lock(&tbl->lock); | ||
456 | |||
457 | list_for_each_entry(en, &tbl->bucket[hash], list) { | ||
458 | if (en->addr == addr) { | ||
459 | /* HIT */ | ||
460 | read_unlock(&tbl->lock); | ||
461 | return en; | ||
462 | } | ||
463 | } | ||
464 | |||
465 | read_unlock(&tbl->lock); | ||
466 | |||
467 | return NULL; | ||
468 | } | ||
469 | |||
470 | |||
471 | /* | ||
472 | * Flush all the entries of the specified table. | ||
473 | */ | ||
474 | static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) | ||
475 | { | ||
476 | int i; | ||
477 | struct ip_vs_lblcr_entry *en, *nxt; | ||
478 | |||
479 | for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { | ||
480 | write_lock(&tbl->lock); | ||
481 | list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { | ||
482 | ip_vs_lblcr_free(en); | ||
483 | atomic_dec(&tbl->entries); | ||
484 | } | ||
485 | write_unlock(&tbl->lock); | ||
486 | } | ||
487 | } | ||
488 | |||
489 | |||
/*
 * Walk every bucket once and free entries unused for longer than
 * sysctl_ip_vs_lblcr_expiration. Run by the periodic timer every
 * COUNT_FOR_FULL_EXPIRATION rounds.
 */
static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
{
	unsigned long now = jiffies;
	int i, j;
	struct ip_vs_lblcr_entry *en, *nxt;

	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;

		write_lock(&tbl->lock);
		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
			if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
				       now))
				/* still fresh */
				continue;

			ip_vs_lblcr_free(en);
			atomic_dec(&tbl->entries);
		}
		write_unlock(&tbl->lock);
	}
	tbl->rover = j;
}
512 | |||
513 | |||
514 | /* | ||
515 | * Periodical timer handler for IPVS lblcr table | ||
516 | * It is used to collect stale entries when the number of entries | ||
517 | * exceeds the maximum size of the table. | ||
518 | * | ||
519 | * Fixme: we probably need more complicated algorithm to collect | ||
520 | * entries that have not been used for a long time even | ||
521 | * if the number of entries doesn't exceed the maximum size | ||
522 | * of the table. | ||
523 | * The full expiration check is for this purpose now. | ||
524 | */ | ||
525 | static void ip_vs_lblcr_check_expire(unsigned long data) | ||
526 | { | ||
527 | struct ip_vs_lblcr_table *tbl; | ||
528 | unsigned long now = jiffies; | ||
529 | int goal; | ||
530 | int i, j; | ||
531 | struct ip_vs_lblcr_entry *en, *nxt; | ||
532 | |||
533 | tbl = (struct ip_vs_lblcr_table *)data; | ||
534 | |||
535 | if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { | ||
536 | /* do full expiration check */ | ||
537 | ip_vs_lblcr_full_check(tbl); | ||
538 | tbl->counter = 1; | ||
539 | goto out; | ||
540 | } | ||
541 | |||
542 | if (atomic_read(&tbl->entries) <= tbl->max_size) { | ||
543 | tbl->counter++; | ||
544 | goto out; | ||
545 | } | ||
546 | |||
547 | goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; | ||
548 | if (goal > tbl->max_size/2) | ||
549 | goal = tbl->max_size/2; | ||
550 | |||
551 | for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { | ||
552 | j = (j + 1) & IP_VS_LBLCR_TAB_MASK; | ||
553 | |||
554 | write_lock(&tbl->lock); | ||
555 | list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { | ||
556 | if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) | ||
557 | continue; | ||
558 | |||
559 | ip_vs_lblcr_free(en); | ||
560 | atomic_dec(&tbl->entries); | ||
561 | goal--; | ||
562 | } | ||
563 | write_unlock(&tbl->lock); | ||
564 | if (goal <= 0) | ||
565 | break; | ||
566 | } | ||
567 | tbl->rover = j; | ||
568 | |||
569 | out: | ||
570 | mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); | ||
571 | } | ||
572 | |||
573 | |||
#ifdef CONFIG_IP_VS_LBLCR_DEBUG
/* debug-only: points at the most recently created table */
static struct ip_vs_lblcr_table *lblcr_table_list;

/*
 *      /proc/net/ip_vs_lblcr to display the mappings of
 *      destination IP address <==> its serverSet
 *
 * Legacy get_info-style proc read: 'offset'/'length' select a window
 * of the generated text; '*start'/return value describe what landed
 * in 'buffer'.
 * NOTE(review): output is sprintf'd with no bound check — assumes the
 * whole window fits in the buffer procfs supplies; confirm.
 */
static int
ip_vs_lblcr_getinfo(char *buffer, char **start, off_t offset, int length)
{
	off_t pos=0, begin;
	int len=0, size;
	struct ip_vs_lblcr_table *tbl;
	unsigned long now = jiffies;
	int i;
	struct ip_vs_lblcr_entry *en;

	tbl = lblcr_table_list;

	size = sprintf(buffer, "LastTime Dest IP address Server set\n");
	pos += size;
	len += size;

	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
		read_lock_bh(&tbl->lock);
		list_for_each_entry(en, &tbl->bucket[i], list) {
			char tbuf[16];
			struct ip_vs_dest_list *d;

			sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(en->addr));
			size = sprintf(buffer+len, "%8lu %-16s ",
				       now-en->lastuse, tbuf);

			read_lock(&en->set.lock);
			for (d=en->set.list; d!=NULL; d=d->next) {
				size += sprintf(buffer+len+size,
						"%u.%u.%u.%u ",
						NIPQUAD(d->dest->addr));
			}
			read_unlock(&en->set.lock);
			size += sprintf(buffer+len+size, "\n");
			len += size;
			pos += size;
			/* windowing: discard text before 'offset',
			 * stop once the window is filled */
			if (pos <= offset)
				len=0;
			if (pos >= offset+length) {
				read_unlock_bh(&tbl->lock);
				goto done;
			}
		}
		read_unlock_bh(&tbl->lock);
	}

  done:
	begin = len - (pos - offset);
	*start = buffer + begin;
	len -= begin;
	if(len>length)
		len = length;
	return len;
}
#endif
636 | |||
637 | |||
/*
 * Per-service init: allocate the LBLCR hash table and start the
 * periodic garbage-collection timer.
 * Returns 0 on success, -ENOMEM if the table cannot be allocated.
 */
static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_lblcr_table *tbl;

	/*
	 *    Allocate the ip_vs_lblcr_table for this service
	 */
	tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
	if (tbl == NULL) {
		IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
		return -ENOMEM;
	}
	svc->sched_data = tbl;
	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
		  "current service\n",
		  sizeof(struct ip_vs_lblcr_table));

	/*
	 *    Initialize the hash buckets
	 */
	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
		INIT_LIST_HEAD(&tbl->bucket[i]);
	}
	rwlock_init(&tbl->lock);
	tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;	/* GC threshold */
	tbl->rover = 0;
	tbl->counter = 1;

	/*
	 *    Hook periodic timer for garbage collection
	 */
	init_timer(&tbl->periodic_timer);
	tbl->periodic_timer.data = (unsigned long)tbl;
	tbl->periodic_timer.function = ip_vs_lblcr_check_expire;
	tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
	add_timer(&tbl->periodic_timer);

#ifdef CONFIG_IP_VS_LBLCR_DEBUG
	lblcr_table_list = tbl;
#endif
	return 0;
}
681 | |||
682 | |||
/*
 * Per-service teardown: stop garbage collection, flush the table and
 * free it. Always returns 0.
 */
static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
{
	struct ip_vs_lblcr_table *tbl = svc->sched_data;

	/* remove periodic timer; _sync waits for a running handler */
	del_timer_sync(&tbl->periodic_timer);

	/* got to clean up table entries here */
	ip_vs_lblcr_flush(tbl);

	/* release the table itself */
	kfree(svc->sched_data);
	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
		  sizeof(struct ip_vs_lblcr_table));

	return 0;
}
700 | |||
701 | |||
/* Service update hook: LBLCR keeps no state that needs refreshing. */
static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc)
{
	return 0;
}
706 | |||
707 | |||
/*
 * Weighted least-connection pick over svc->destinations, used as the
 * fallback when the server set has no usable member.
 * Returns NULL when no server with weight > 0 is available.
 * NOTE(review): 'iph' is not used in this body — presumably kept for
 * signature parity; confirm before changing.
 */
static inline struct ip_vs_dest *
__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
	struct ip_vs_dest *dest, *least;
	int loh, doh;

	/*
	 * We think the overhead of processing active connections is fifty
	 * times higher than that of inactive connections in average. (This
	 * fifty times might not be accurate, we will change it later.) We
	 * use the following formula to estimate the overhead:
	 *                dest->activeconns*50 + dest->inactconns
	 * and the load:
	 *                (dest overhead) / dest->weight
	 *
	 * Remember -- no floats in kernel mode!!!
	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
	 *                h1/w1 > h2/w2
	 * if every weight is larger than zero.
	 *
	 * The server with weight=0 is quiesced and will not receive any
	 * new connection.
	 */
	list_for_each_entry(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		if (atomic_read(&dest->weight) > 0) {
			/* first usable server seeds least/loh */
			least = dest;
			loh = atomic_read(&least->activeconns) * 50
				+ atomic_read(&least->inactconns);
			goto nextstage;
		}
	}
	return NULL;

	/*
	 * Find the destination with the least load.
	 */
  nextstage:
	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		doh = atomic_read(&dest->activeconns) * 50
			+ atomic_read(&dest->inactconns);
		/* loh/lw > doh/dw  <==>  loh*dw > doh*lw (weights > 0) */
		if (loh * atomic_read(&dest->weight) >
		    doh * atomic_read(&least->weight)) {
			least = dest;
			loh = doh;
		}
	}

	IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
		  "activeconns %d refcnt %d weight %d overhead %d\n",
		  NIPQUAD(least->addr), ntohs(least->port),
		  atomic_read(&least->activeconns),
		  atomic_read(&least->refcnt),
		  atomic_read(&least->weight), loh);

	return least;
}
770 | |||
771 | |||
772 | /* | ||
773 | * If this destination server is overloaded and there is a less loaded | ||
774 | * server, then return true. | ||
775 | */ | ||
776 | static inline int | ||
777 | is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) | ||
778 | { | ||
779 | if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { | ||
780 | struct ip_vs_dest *d; | ||
781 | |||
782 | list_for_each_entry(d, &svc->destinations, n_list) { | ||
783 | if (atomic_read(&d->activeconns)*2 | ||
784 | < atomic_read(&d->weight)) { | ||
785 | return 1; | ||
786 | } | ||
787 | } | ||
788 | } | ||
789 | return 0; | ||
790 | } | ||
791 | |||
792 | |||
793 | /* | ||
794 | * Locality-Based (weighted) Least-Connection scheduling | ||
795 | */ | ||
/*
 * Locality-Based (weighted) Least-Connection with Replication.
 * Look up the destination set cached for the packet's target address;
 * on a miss, pick a server by WLC and create a new single-server set.
 * On a hit, use the least-loaded member of the set, growing the set
 * if that member is overloaded and shrinking a stale multi-member set
 * by evicting its most-loaded server.
 */
static struct ip_vs_dest *
ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
	struct ip_vs_dest *dest;
	struct ip_vs_lblcr_table *tbl;
	struct ip_vs_lblcr_entry *en;
	struct iphdr *iph = skb->nh.iph;

	IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");

	tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
	en = ip_vs_lblcr_get(tbl, iph->daddr);
	if (en == NULL) {
		/* cache miss: schedule by WLC and remember the result */
		dest = __ip_vs_wlc_schedule(svc, iph);
		if (dest == NULL) {
			IP_VS_DBG(1, "no destination available\n");
			return NULL;
		}
		en = ip_vs_lblcr_new(iph->daddr);
		if (en == NULL) {
			return NULL;
		}
		ip_vs_dest_set_insert(&en->set, dest);
		ip_vs_lblcr_hash(tbl, en);
	} else {
		/* cache hit: prefer the least-loaded set member */
		dest = ip_vs_dest_set_min(&en->set);
		if (!dest || is_overloaded(dest, svc)) {
			/* member overloaded/gone: add a fresh WLC pick */
			dest = __ip_vs_wlc_schedule(svc, iph);
			if (dest == NULL) {
				IP_VS_DBG(1, "no destination available\n");
				return NULL;
			}
			ip_vs_dest_set_insert(&en->set, dest);
		}
		/* shrink a stale multi-member set: drop its busiest server */
		if (atomic_read(&en->set.size) > 1 &&
		    jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
			struct ip_vs_dest *m;
			m = ip_vs_dest_set_max(&en->set);
			if (m)
				ip_vs_dest_set_erase(&en->set, m);
		}
	}
	en->lastuse = jiffies;

	IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
		  "--> server %u.%u.%u.%u:%d\n",
		  NIPQUAD(en->addr),
		  NIPQUAD(dest->addr),
		  ntohs(dest->port));

	return dest;
}
848 | |||
849 | |||
850 | /* | ||
851 | * IPVS LBLCR Scheduler structure | ||
852 | */ | ||
/*
 * IPVS LBLCR Scheduler structure: hooks this module's service
 * lifecycle and scheduling callbacks into the IPVS core.
 */
static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
{
	.name =			"lblcr",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.init_service =		ip_vs_lblcr_init_svc,
	.done_service =		ip_vs_lblcr_done_svc,
	.update_service =	ip_vs_lblcr_update_svc,
	.schedule =		ip_vs_lblcr_schedule,
};
863 | |||
864 | |||
865 | static int __init ip_vs_lblcr_init(void) | ||
866 | { | ||
867 | INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list); | ||
868 | sysctl_header = register_sysctl_table(lblcr_root_table, 0); | ||
869 | #ifdef CONFIG_IP_VS_LBLCR_DEBUG | ||
870 | proc_net_create("ip_vs_lblcr", 0, ip_vs_lblcr_getinfo); | ||
871 | #endif | ||
872 | return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); | ||
873 | } | ||
874 | |||
875 | |||
/*
 * Module exit: remove the debug /proc entry and sysctl table, then
 * unregister the scheduler (per-service state is torn down via the
 * done_service hook).
 */
static void __exit ip_vs_lblcr_cleanup(void)
{
#ifdef CONFIG_IP_VS_LBLCR_DEBUG
	proc_net_remove("ip_vs_lblcr");
#endif
	unregister_sysctl_table(sysctl_header);
	unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
}


module_init(ip_vs_lblcr_init);
module_exit(ip_vs_lblcr_cleanup);
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c new file mode 100644 index 000000000000..d88fef90a641 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_lc.c | |||
@@ -0,0 +1,123 @@ | |||
1 | /* | ||
2 | * IPVS: Least-Connection Scheduling module | ||
3 | * | ||
4 | * Version: $Id: ip_vs_lc.c,v 1.10 2003/04/18 09:03:16 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License | ||
10 | * as published by the Free Software Foundation; either version | ||
11 | * 2 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * Changes: | ||
14 | * Wensong Zhang : added the ip_vs_lc_update_svc | ||
15 | * Wensong Zhang : added any dest with weight=0 is quiesced | ||
16 | * | ||
17 | */ | ||
18 | |||
19 | #include <linux/module.h> | ||
20 | #include <linux/kernel.h> | ||
21 | |||
22 | #include <net/ip_vs.h> | ||
23 | |||
24 | |||
/* LC keeps no per-service state, so all three hooks are no-ops. */
static int ip_vs_lc_init_svc(struct ip_vs_service *svc)
{
	return 0;
}


static int ip_vs_lc_done_svc(struct ip_vs_service *svc)
{
	return 0;
}


static int ip_vs_lc_update_svc(struct ip_vs_service *svc)
{
	return 0;
}
41 | |||
42 | |||
43 | static inline unsigned int | ||
44 | ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) | ||
45 | { | ||
46 | /* | ||
47 | * We think the overhead of processing active connections is 256 | ||
48 | * times higher than that of inactive connections in average. (This | ||
49 | * 256 times might not be accurate, we will change it later) We | ||
50 | * use the following formula to estimate the overhead now: | ||
51 | * dest->activeconns*256 + dest->inactconns | ||
52 | */ | ||
53 | return (atomic_read(&dest->activeconns) << 8) + | ||
54 | atomic_read(&dest->inactconns); | ||
55 | } | ||
56 | |||
57 | |||
/*
 * Least Connection scheduling: return the non-overloaded, non-quiesced
 * destination with the smallest overhead, or NULL if none exists.
 */
static struct ip_vs_dest *
ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
	struct ip_vs_dest *dest, *least = NULL;
	unsigned int loh = 0, doh;

	IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");

	/*
	 * Simply select the server with the least overhead
	 *        (activeconns*256) + inactconns
	 * (see ip_vs_lc_dest_overhead), except servers whose weight is
	 * equal to zero.
	 * If the weight is equal to zero, it means that the server is
	 * quiesced, the existing connections to the server still get
	 * served, but no new connection is assigned to the server.
	 */

	list_for_each_entry(dest, &svc->destinations, n_list) {
		if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
		    atomic_read(&dest->weight) == 0)
			continue;
		doh = ip_vs_lc_dest_overhead(dest);
		if (!least || doh < loh) {
			least = dest;
			loh = doh;
		}
	}

	if (least)
	IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n",
		  NIPQUAD(least->addr), ntohs(least->port),
		  atomic_read(&least->activeconns),
		  atomic_read(&least->inactconns));

	return least;
}
97 | |||
98 | |||
/* IPVS "lc" scheduler: wires the no-op hooks and the LC scheduler in. */
static struct ip_vs_scheduler ip_vs_lc_scheduler = {
	.name =			"lc",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.init_service =		ip_vs_lc_init_svc,
	.done_service =		ip_vs_lc_done_svc,
	.update_service =	ip_vs_lc_update_svc,
	.schedule =		ip_vs_lc_schedule,
};
108 | |||
109 | |||
110 | static int __init ip_vs_lc_init(void) | ||
111 | { | ||
112 | INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list); | ||
113 | return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; | ||
114 | } | ||
115 | |||
116 | static void __exit ip_vs_lc_cleanup(void) | ||
117 | { | ||
118 | unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); | ||
119 | } | ||
120 | |||
121 | module_init(ip_vs_lc_init); | ||
122 | module_exit(ip_vs_lc_cleanup); | ||
123 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c new file mode 100644 index 000000000000..bc2a9e5f2a7b --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_nq.c | |||
@@ -0,0 +1,161 @@ | |||
1 | /* | ||
2 | * IPVS: Never Queue scheduling module | ||
3 | * | ||
4 | * Version: $Id: ip_vs_nq.c,v 1.2 2003/06/08 09:31:19 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License | ||
10 | * as published by the Free Software Foundation; either version | ||
11 | * 2 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * Changes: | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | /* | ||
18 | * The NQ algorithm adopts a two-speed model. When there is an idle server | ||
19 | * available, the job will be sent to the idle server, instead of waiting | ||
 * for a fast one. When there is no idle server available, the job will be
 * sent to the server that minimizes its expected delay (the Shortest
 * Expected Delay scheduling algorithm).
23 | * | ||
24 | * See the following paper for more information: | ||
25 | * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing | ||
26 | * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, | ||
27 | * pages 986-994, 1988. | ||
28 | * | ||
29 | * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me. | ||
30 | * | ||
31 | * The difference between NQ and SED is that NQ can improve overall | ||
32 | * system utilization. | ||
33 | * | ||
34 | */ | ||
35 | |||
36 | #include <linux/module.h> | ||
37 | #include <linux/kernel.h> | ||
38 | |||
39 | #include <net/ip_vs.h> | ||
40 | |||
41 | |||
/* NQ keeps no per-service state, so all three hooks are no-ops. */
static int
ip_vs_nq_init_svc(struct ip_vs_service *svc)
{
	return 0;
}


static int
ip_vs_nq_done_svc(struct ip_vs_service *svc)
{
	return 0;
}


static int
ip_vs_nq_update_svc(struct ip_vs_service *svc)
{
	return 0;
}
61 | |||
62 | |||
/*
 * NQ cost of a destination: active connections plus one (the "+1"
 * accounts for the connection about to be scheduled).
 */
static inline unsigned int
ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
{
	/*
	 * We only use the active connection number in the cost
	 * calculation here.
	 */
	return atomic_read(&dest->activeconns) + 1;
}
72 | |||
73 | |||
/*
 * Never Queue scheduling: return an idle server immediately if one
 * exists; otherwise fall back to the Shortest Expected Delay rule
 * (weighted comparison of ip_vs_nq_dest_overhead).
 * (The original comment said "Weighted Least Connection", which does
 * not match this algorithm.)
 */
static struct ip_vs_dest *
ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
	struct ip_vs_dest *dest, *least = NULL;
	unsigned int loh = 0, doh;

	IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");

	/*
	 * We calculate the load of each dest server as follows:
	 *	(server expected overhead) / dest->weight
	 *
	 * Remember -- no floats in kernel mode!!!
	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
	 *	h1/w1 > h2/w2
	 * if every weight is larger than zero.
	 *
	 * The server with weight=0 is quiesced and will not receive any
	 * new connections.
	 */

	list_for_each_entry(dest, &svc->destinations, n_list) {

		if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
		    !atomic_read(&dest->weight))
			continue;

		doh = ip_vs_nq_dest_overhead(dest);

		/* return the server directly if it is idle */
		if (atomic_read(&dest->activeconns) == 0) {
			least = dest;
			loh = doh;
			goto out;
		}

		/* loh/least->weight > doh/dest->weight, cross-multiplied */
		if (!least ||
		    (loh * atomic_read(&dest->weight) >
		     doh * atomic_read(&least->weight))) {
			least = dest;
			loh = doh;
		}
	}

	if (!least)
		return NULL;

  out:
	IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u "
		  "activeconns %d refcnt %d weight %d overhead %d\n",
		  NIPQUAD(least->addr), ntohs(least->port),
		  atomic_read(&least->activeconns),
		  atomic_read(&least->refcnt),
		  atomic_read(&least->weight), loh);

	return least;
}
134 | |||
135 | |||
/* IPVS "nq" scheduler: wires the no-op hooks and the NQ scheduler in. */
static struct ip_vs_scheduler ip_vs_nq_scheduler =
{
	.name =			"nq",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.init_service =		ip_vs_nq_init_svc,
	.done_service =		ip_vs_nq_done_svc,
	.update_service =	ip_vs_nq_update_svc,
	.schedule =		ip_vs_nq_schedule,
};
146 | |||
147 | |||
/* Module init/exit: (un)register the "nq" scheduler with the IPVS core. */
static int __init ip_vs_nq_init(void)
{
	INIT_LIST_HEAD(&ip_vs_nq_scheduler.n_list);
	return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
}

static void __exit ip_vs_nq_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
}

module_init(ip_vs_nq_init);
module_exit(ip_vs_nq_cleanup);
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c new file mode 100644 index 000000000000..253c46252bd5 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto.c | |||
@@ -0,0 +1,244 @@ | |||
1 | /* | ||
2 | * ip_vs_proto.c: transport protocol load balancing support for IPVS | ||
3 | * | ||
4 | * Version: $Id: ip_vs_proto.c,v 1.2 2003/04/18 09:03:16 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
7 | * Julian Anastasov <ja@ssi.bg> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * as published by the Free Software Foundation; either version | ||
12 | * 2 of the License, or (at your option) any later version. | ||
13 | * | ||
14 | * Changes: | ||
15 | * | ||
16 | */ | ||
17 | |||
18 | #include <linux/module.h> | ||
19 | #include <linux/kernel.h> | ||
20 | #include <linux/skbuff.h> | ||
21 | #include <linux/in.h> | ||
22 | #include <linux/ip.h> | ||
23 | #include <net/protocol.h> | ||
24 | #include <net/tcp.h> | ||
25 | #include <net/udp.h> | ||
26 | #include <asm/system.h> | ||
27 | #include <linux/stat.h> | ||
28 | #include <linux/proc_fs.h> | ||
29 | |||
30 | #include <net/ip_vs.h> | ||
31 | |||
32 | |||
/*
 * IPVS protocols can only be registered/unregistered when the ipvs
 * module is loaded/unloaded, so no lock is needed in accessing the
 * ipvs protocol table.
 */

#define IP_VS_PROTO_TAB_SIZE		32	/* must be power of 2 */
#define IP_VS_PROTO_HASH(proto)		((proto) & (IP_VS_PROTO_TAB_SIZE-1))

/* Hash table of registered protocols, chained through pp->next. */
static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
43 | |||
44 | |||
45 | /* | ||
46 | * register an ipvs protocol | ||
47 | */ | ||
/*
 * Register an ipvs protocol: link it at the head of its hash chain
 * and run its optional init hook.  Always returns 0.
 */
static int register_ip_vs_protocol(struct ip_vs_protocol *pp)
{
	unsigned hash = IP_VS_PROTO_HASH(pp->protocol);

	pp->next = ip_vs_proto_table[hash];
	ip_vs_proto_table[hash] = pp;

	if (pp->init != NULL)
		pp->init(pp);

	return 0;
}
60 | |||
61 | |||
62 | /* | ||
63 | * unregister an ipvs protocol | ||
64 | */ | ||
65 | static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) | ||
66 | { | ||
67 | struct ip_vs_protocol **pp_p; | ||
68 | unsigned hash = IP_VS_PROTO_HASH(pp->protocol); | ||
69 | |||
70 | pp_p = &ip_vs_proto_table[hash]; | ||
71 | for (; *pp_p; pp_p = &(*pp_p)->next) { | ||
72 | if (*pp_p == pp) { | ||
73 | *pp_p = pp->next; | ||
74 | if (pp->exit != NULL) | ||
75 | pp->exit(pp); | ||
76 | return 0; | ||
77 | } | ||
78 | } | ||
79 | |||
80 | return -ESRCH; | ||
81 | } | ||
82 | |||
83 | |||
84 | /* | ||
85 | * get ip_vs_protocol object by its proto. | ||
86 | */ | ||
87 | struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) | ||
88 | { | ||
89 | struct ip_vs_protocol *pp; | ||
90 | unsigned hash = IP_VS_PROTO_HASH(proto); | ||
91 | |||
92 | for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { | ||
93 | if (pp->protocol == proto) | ||
94 | return pp; | ||
95 | } | ||
96 | |||
97 | return NULL; | ||
98 | } | ||
99 | |||
100 | |||
101 | /* | ||
102 | * Propagate event for state change to all protocols | ||
103 | */ | ||
/*
 * Propagate a timeout-policy change (e.g. secure_tcp toggling) to
 * every registered protocol that implements the timeout_change hook.
 */
void ip_vs_protocol_timeout_change(int flags)
{
	struct ip_vs_protocol *pp;
	int i;

	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
		for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
			if (pp->timeout_change)
				pp->timeout_change(pp, flags);
		}
	}
}
116 | |||
117 | |||
/*
 * Duplicate a protocol timeout table so it can be modified per
 * instance.  @size is in bytes.  Returns the copy, or NULL on
 * allocation failure.  Caller owns (and must kfree) the result.
 */
int *
ip_vs_create_timeout_table(int *table, int size)
{
	int *t;

	t = kmalloc(size, GFP_ATOMIC);
	if (t == NULL)
		return NULL;
	memcpy(t, table, size);
	return t;
}
129 | |||
130 | |||
131 | /* | ||
132 | * Set timeout value for state specified by name | ||
133 | */ | ||
134 | int | ||
135 | ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to) | ||
136 | { | ||
137 | int i; | ||
138 | |||
139 | if (!table || !name || !to) | ||
140 | return -EINVAL; | ||
141 | |||
142 | for (i = 0; i < num; i++) { | ||
143 | if (strcmp(names[i], name)) | ||
144 | continue; | ||
145 | table[i] = to * HZ; | ||
146 | return 0; | ||
147 | } | ||
148 | return -ENOENT; | ||
149 | } | ||
150 | |||
151 | |||
152 | const char * ip_vs_state_name(__u16 proto, int state) | ||
153 | { | ||
154 | struct ip_vs_protocol *pp = ip_vs_proto_get(proto); | ||
155 | |||
156 | if (pp == NULL || pp->state_name == NULL) | ||
157 | return "ERR!"; | ||
158 | return pp->state_name(state); | ||
159 | } | ||
160 | |||
161 | |||
162 | void | ||
163 | ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, | ||
164 | const struct sk_buff *skb, | ||
165 | int offset, | ||
166 | const char *msg) | ||
167 | { | ||
168 | char buf[128]; | ||
169 | struct iphdr _iph, *ih; | ||
170 | |||
171 | ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); | ||
172 | if (ih == NULL) | ||
173 | sprintf(buf, "%s TRUNCATED", pp->name); | ||
174 | else if (ih->frag_off & __constant_htons(IP_OFFSET)) | ||
175 | sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", | ||
176 | pp->name, NIPQUAD(ih->saddr), | ||
177 | NIPQUAD(ih->daddr)); | ||
178 | else { | ||
179 | __u16 _ports[2], *pptr | ||
180 | ; | ||
181 | pptr = skb_header_pointer(skb, offset + ih->ihl*4, | ||
182 | sizeof(_ports), _ports); | ||
183 | if (pptr == NULL) | ||
184 | sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u", | ||
185 | pp->name, | ||
186 | NIPQUAD(ih->saddr), | ||
187 | NIPQUAD(ih->daddr)); | ||
188 | else | ||
189 | sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u", | ||
190 | pp->name, | ||
191 | NIPQUAD(ih->saddr), | ||
192 | ntohs(pptr[0]), | ||
193 | NIPQUAD(ih->daddr), | ||
194 | ntohs(pptr[1])); | ||
195 | } | ||
196 | |||
197 | printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); | ||
198 | } | ||
199 | |||
200 | |||
/*
 * Register every compiled-in transport protocol handler.
 *
 * The "protocols" buffer accumulates ", <name>" per protocol; the
 * final message prints from &protocols[2] to skip the leading ", "
 * of the first entry.  protocols[2] is pre-terminated so the message
 * is still a valid (empty) string if nothing registers.
 */
int ip_vs_protocol_init(void)
{
	char protocols[64];
#define REGISTER_PROTOCOL(p)			\
	do {					\
		register_ip_vs_protocol(p);	\
		strcat(protocols, ", ");	\
		strcat(protocols, (p)->name);	\
	} while (0)

	protocols[0] = '\0';
	protocols[2] = '\0';
#ifdef CONFIG_IP_VS_PROTO_TCP
	REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
	REGISTER_PROTOCOL(&ip_vs_protocol_udp);
#endif
#ifdef CONFIG_IP_VS_PROTO_ICMP
	REGISTER_PROTOCOL(&ip_vs_protocol_icmp);
#endif
#ifdef CONFIG_IP_VS_PROTO_AH
	REGISTER_PROTOCOL(&ip_vs_protocol_ah);
#endif
#ifdef CONFIG_IP_VS_PROTO_ESP
	REGISTER_PROTOCOL(&ip_vs_protocol_esp);
#endif
	IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]);

	return 0;
}
232 | |||
233 | |||
/*
 * Unregister every protocol still present in the table (running each
 * protocol's exit hook via unregister_ip_vs_protocol).
 */
void ip_vs_protocol_cleanup(void)
{
	struct ip_vs_protocol *pp;
	int i;

	/* unregister all the ipvs protocols */
	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
		while ((pp = ip_vs_proto_table[i]) != NULL)
			unregister_ip_vs_protocol(pp);
	}
}
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c new file mode 100644 index 000000000000..453e94a0bbd7 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c | |||
@@ -0,0 +1,177 @@ | |||
1 | /* | ||
2 | * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS | ||
3 | * | ||
4 | * Version: $Id: ip_vs_proto_ah.c,v 1.1 2003/07/04 15:04:37 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 | ||
7 | * Wensong Zhang <wensong@linuxvirtualserver.org> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * version 2 as published by the Free Software Foundation; | ||
12 | * | ||
13 | */ | ||
14 | |||
15 | #include <linux/module.h> | ||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/netfilter.h> | ||
18 | #include <linux/netfilter_ipv4.h> | ||
19 | |||
20 | #include <net/ip_vs.h> | ||
21 | |||
22 | |||
23 | /* TODO: | ||
24 | |||
25 | struct isakmp_hdr { | ||
26 | __u8 icookie[8]; | ||
27 | __u8 rcookie[8]; | ||
28 | __u8 np; | ||
29 | __u8 version; | ||
30 | __u8 xchgtype; | ||
31 | __u8 flags; | ||
32 | __u32 msgid; | ||
33 | __u32 length; | ||
34 | }; | ||
35 | |||
36 | */ | ||
37 | |||
#define PORT_ISAKMP	500	/* UDP port of the IKE/ISAKMP control traffic */
39 | |||
40 | |||
/*
 * Find the inbound IPVS connection an AH packet belongs to by looking
 * up the ISAKMP (UDP/500) control connection between the same two
 * addresses — AH carries no ports of its own.  @inverse swaps the
 * src/dst lookup direction (ICMP-related case).  Returns NULL when no
 * matching ISAKMP connection exists.
 */
static struct ip_vs_conn *
ah_conn_in_get(const struct sk_buff *skb,
	       struct ip_vs_protocol *pp,
	       const struct iphdr *iph,
	       unsigned int proto_off,
	       int inverse)
{
	struct ip_vs_conn *cp;

	if (likely(!inverse)) {
		cp = ip_vs_conn_in_get(IPPROTO_UDP,
				       iph->saddr,
				       __constant_htons(PORT_ISAKMP),
				       iph->daddr,
				       __constant_htons(PORT_ISAKMP));
	} else {
		cp = ip_vs_conn_in_get(IPPROTO_UDP,
				       iph->daddr,
				       __constant_htons(PORT_ISAKMP),
				       iph->saddr,
				       __constant_htons(PORT_ISAKMP));
	}

	if (!cp) {
		/*
		 * We are not sure if the packet is from our
		 * service, so our conn_schedule hook should return NF_ACCEPT
		 */
		IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
			  "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
			  inverse ? "ICMP+" : "",
			  pp->name,
			  NIPQUAD(iph->saddr),
			  NIPQUAD(iph->daddr));
	}

	return cp;
}
79 | |||
80 | |||
/*
 * Outbound counterpart of ah_conn_in_get: map an AH packet to the
 * ISAKMP (UDP/500) connection in the out-connection table.
 */
static struct ip_vs_conn *
ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
		const struct iphdr *iph, unsigned int proto_off, int inverse)
{
	struct ip_vs_conn *cp;

	if (likely(!inverse)) {
		cp = ip_vs_conn_out_get(IPPROTO_UDP,
					iph->saddr,
					__constant_htons(PORT_ISAKMP),
					iph->daddr,
					__constant_htons(PORT_ISAKMP));
	} else {
		cp = ip_vs_conn_out_get(IPPROTO_UDP,
					iph->daddr,
					__constant_htons(PORT_ISAKMP),
					iph->saddr,
					__constant_htons(PORT_ISAKMP));
	}

	if (!cp) {
		IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
			  "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
			  inverse ? "ICMP+" : "",
			  pp->name,
			  NIPQUAD(iph->saddr),
			  NIPQUAD(iph->daddr));
	}

	return cp;
}
112 | |||
113 | |||
/*
 * AH packets are never scheduled on their own: they are only ever
 * related to an existing ISAKMP connection, so hand the packet back
 * to the IP stack with NF_ACCEPT.
 */
static int
ah_conn_schedule(struct sk_buff *skb,
		 struct ip_vs_protocol *pp,
		 int *verdict, struct ip_vs_conn **cpp)
{
	/*
	 * AH is only related traffic. Pass the packet to IP stack.
	 */
	*verdict = NF_ACCEPT;
	return 0;
}
125 | |||
126 | |||
/*
 * Log a one-line summary of an AH packet ("AH saddr->daddr", or
 * "AH TRUNCATED" if the IP header cannot be read) prefixed by @msg.
 */
static void
ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
		int offset, const char *msg)
{
	char buf[256];
	struct iphdr _iph, *ih;

	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
	if (ih == NULL)
		sprintf(buf, "%s TRUNCATED", pp->name);
	else
		sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
			pp->name, NIPQUAD(ih->saddr),
			NIPQUAD(ih->daddr));

	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
}
144 | |||
145 | |||
/* Protocol init/exit hooks: AH needs no per-protocol state (yet). */
static void ah_init(struct ip_vs_protocol *pp)
{
	/* nothing to do now */
}


static void ah_exit(struct ip_vs_protocol *pp)
{
	/* nothing to do now */
}
156 | |||
157 | |||
/*
 * AH protocol handler.  dont_defrag is set because fragments must be
 * left alone; all NAT/state hooks are NULL since AH traffic is only
 * ever matched against its ISAKMP control connection.
 */
struct ip_vs_protocol ip_vs_protocol_ah = {
	.name =			"AH",
	.protocol =		IPPROTO_AH,
	.dont_defrag =		1,
	.init =			ah_init,
	.exit =			ah_exit,
	.conn_schedule =	ah_conn_schedule,
	.conn_in_get =		ah_conn_in_get,
	.conn_out_get =		ah_conn_out_get,
	.snat_handler =		NULL,
	.dnat_handler =		NULL,
	.csum_check =		NULL,
	.state_transition =	NULL,
	.register_app =		NULL,
	.unregister_app =	NULL,
	.app_conn_bind =	NULL,
	.debug_packet =		ah_debug_packet,
	.timeout_change =	NULL,		/* ISAKMP */
	.set_state_timeout =	NULL,
};
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c new file mode 100644 index 000000000000..478e5c7c7e8e --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_esp.c | |||
@@ -0,0 +1,175 @@ | |||
1 | /* | ||
2 | * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS | ||
3 | * | ||
4 | * Version: $Id: ip_vs_proto_esp.c,v 1.1 2003/07/04 15:04:37 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 | ||
7 | * Wensong Zhang <wensong@linuxvirtualserver.org> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * version 2 as published by the Free Software Foundation; | ||
12 | * | ||
13 | */ | ||
14 | |||
15 | #include <linux/module.h> | ||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/netfilter.h> | ||
18 | #include <linux/netfilter_ipv4.h> | ||
19 | |||
20 | #include <net/ip_vs.h> | ||
21 | |||
22 | |||
23 | /* TODO: | ||
24 | |||
25 | struct isakmp_hdr { | ||
26 | __u8 icookie[8]; | ||
27 | __u8 rcookie[8]; | ||
28 | __u8 np; | ||
29 | __u8 version; | ||
30 | __u8 xchgtype; | ||
31 | __u8 flags; | ||
32 | __u32 msgid; | ||
33 | __u32 length; | ||
34 | }; | ||
35 | |||
36 | */ | ||
37 | |||
#define PORT_ISAKMP	500	/* UDP port of the IKE/ISAKMP control traffic */
39 | |||
40 | |||
/*
 * Find the inbound IPVS connection an ESP packet belongs to by looking
 * up the ISAKMP (UDP/500) control connection between the same two
 * addresses — ESP carries no ports of its own.  @inverse swaps the
 * src/dst lookup direction (ICMP-related case).  Returns NULL when no
 * matching ISAKMP connection exists.
 */
static struct ip_vs_conn *
esp_conn_in_get(const struct sk_buff *skb,
		struct ip_vs_protocol *pp,
		const struct iphdr *iph,
		unsigned int proto_off,
		int inverse)
{
	struct ip_vs_conn *cp;

	if (likely(!inverse)) {
		cp = ip_vs_conn_in_get(IPPROTO_UDP,
				       iph->saddr,
				       __constant_htons(PORT_ISAKMP),
				       iph->daddr,
				       __constant_htons(PORT_ISAKMP));
	} else {
		cp = ip_vs_conn_in_get(IPPROTO_UDP,
				       iph->daddr,
				       __constant_htons(PORT_ISAKMP),
				       iph->saddr,
				       __constant_htons(PORT_ISAKMP));
	}

	if (!cp) {
		/*
		 * We are not sure if the packet is from our
		 * service, so our conn_schedule hook should return NF_ACCEPT
		 */
		IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
			  "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
			  inverse ? "ICMP+" : "",
			  pp->name,
			  NIPQUAD(iph->saddr),
			  NIPQUAD(iph->daddr));
	}

	return cp;
}
79 | |||
80 | |||
/*
 * Outbound counterpart of esp_conn_in_get: map an ESP packet to the
 * ISAKMP (UDP/500) connection in the out-connection table.
 */
static struct ip_vs_conn *
esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
		 const struct iphdr *iph, unsigned int proto_off, int inverse)
{
	struct ip_vs_conn *cp;

	if (likely(!inverse)) {
		cp = ip_vs_conn_out_get(IPPROTO_UDP,
					iph->saddr,
					__constant_htons(PORT_ISAKMP),
					iph->daddr,
					__constant_htons(PORT_ISAKMP));
	} else {
		cp = ip_vs_conn_out_get(IPPROTO_UDP,
					iph->daddr,
					__constant_htons(PORT_ISAKMP),
					iph->saddr,
					__constant_htons(PORT_ISAKMP));
	}

	if (!cp) {
		IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
			  "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
			  inverse ? "ICMP+" : "",
			  pp->name,
			  NIPQUAD(iph->saddr),
			  NIPQUAD(iph->daddr));
	}

	return cp;
}
112 | |||
113 | |||
/*
 * ESP packets are never scheduled on their own: they are only ever
 * related to an existing ISAKMP connection, so hand the packet back
 * to the IP stack with NF_ACCEPT.
 */
static int
esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
		  int *verdict, struct ip_vs_conn **cpp)
{
	/*
	 * ESP is only related traffic. Pass the packet to IP stack.
	 */
	*verdict = NF_ACCEPT;
	return 0;
}
124 | |||
125 | |||
/*
 * Log a one-line summary of an ESP packet ("ESP saddr->daddr", or
 * "ESP TRUNCATED" if the IP header cannot be read) prefixed by @msg.
 */
static void
esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
		 int offset, const char *msg)
{
	char buf[256];
	struct iphdr _iph, *ih;

	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
	if (ih == NULL)
		sprintf(buf, "%s TRUNCATED", pp->name);
	else
		sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
			pp->name, NIPQUAD(ih->saddr),
			NIPQUAD(ih->daddr));

	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
}
143 | |||
144 | |||
/* Protocol init/exit hooks: ESP needs no per-protocol state (yet). */
static void esp_init(struct ip_vs_protocol *pp)
{
	/* nothing to do now */
}


static void esp_exit(struct ip_vs_protocol *pp)
{
	/* nothing to do now */
}
155 | |||
156 | |||
157 | struct ip_vs_protocol ip_vs_protocol_esp = { | ||
158 | .name = "ESP", | ||
159 | .protocol = IPPROTO_ESP, | ||
160 | .dont_defrag = 1, | ||
161 | .init = esp_init, | ||
162 | .exit = esp_exit, | ||
163 | .conn_schedule = esp_conn_schedule, | ||
164 | .conn_in_get = esp_conn_in_get, | ||
165 | .conn_out_get = esp_conn_out_get, | ||
166 | .snat_handler = NULL, | ||
167 | .dnat_handler = NULL, | ||
168 | .csum_check = NULL, | ||
169 | .state_transition = NULL, | ||
170 | .register_app = NULL, | ||
171 | .unregister_app = NULL, | ||
172 | .app_conn_bind = NULL, | ||
173 | .debug_packet = esp_debug_packet, | ||
174 | .timeout_change = NULL, /* ISAKMP */ | ||
175 | }; | ||
diff --git a/net/ipv4/ipvs/ip_vs_proto_icmp.c b/net/ipv4/ipvs/ip_vs_proto_icmp.c new file mode 100644 index 000000000000..191e94aa1c1f --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_icmp.c | |||
@@ -0,0 +1,182 @@ | |||
1 | /* | ||
2 | * ip_vs_proto_icmp.c: ICMP load balancing support for IP Virtual Server | ||
3 | * | ||
4 | * Authors: Julian Anastasov <ja@ssi.bg>, March 2002 | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * version 2 as published by the Free Software Foundation; | ||
9 | * | ||
10 | */ | ||
11 | |||
12 | #include <linux/module.h> | ||
13 | #include <linux/kernel.h> | ||
14 | #include <linux/icmp.h> | ||
15 | #include <linux/netfilter.h> | ||
16 | #include <linux/netfilter_ipv4.h> | ||
17 | |||
18 | #include <net/ip_vs.h> | ||
19 | |||
20 | |||
/* Single-state timeout table for ICMP "connections" (1 minute). */
static int icmp_timeouts[1] =		{ 1*60*HZ };

/* Name of the single ICMP state, for /proc and timeout-by-name lookup. */
static char * icmp_state_name_table[1] = { "ICMP" };
24 | |||
/*
 * Inbound connection lookup for ICMP. The real lookup is compiled out
 * (#if 0), so this currently always returns NULL, i.e. ICMP packets
 * never match an IPVS connection here.
 */
static struct ip_vs_conn *
icmp_conn_in_get(const struct sk_buff *skb,
		 struct ip_vs_protocol *pp,
		 const struct iphdr *iph,
		 unsigned int proto_off,
		 int inverse)
{
#if 0
	struct ip_vs_conn *cp;

	if (likely(!inverse)) {
		cp = ip_vs_conn_in_get(iph->protocol,
				       iph->saddr, 0,
				       iph->daddr, 0);
	} else {
		/* reversed direction: swap source and destination */
		cp = ip_vs_conn_in_get(iph->protocol,
				       iph->daddr, 0,
				       iph->saddr, 0);
	}

	return cp;

#else
	return NULL;
#endif
}
51 | |||
52 | static struct ip_vs_conn * | ||
53 | icmp_conn_out_get(const struct sk_buff *skb, | ||
54 | struct ip_vs_protocol *pp, | ||
55 | const struct iphdr *iph, | ||
56 | unsigned int proto_off, | ||
57 | int inverse) | ||
58 | { | ||
59 | #if 0 | ||
60 | struct ip_vs_conn *cp; | ||
61 | |||
62 | if (likely(!inverse)) { | ||
63 | cp = ip_vs_conn_out_get(iph->protocol, | ||
64 | iph->saddr, 0, | ||
65 | iph->daddr, 0); | ||
66 | } else { | ||
67 | cp = ip_vs_conn_out_get(IPPROTO_UDP, | ||
68 | iph->daddr, 0, | ||
69 | iph->saddr, 0); | ||
70 | } | ||
71 | |||
72 | return cp; | ||
73 | #else | ||
74 | return NULL; | ||
75 | #endif | ||
76 | } | ||
77 | |||
/*
 * Scheduling hook for ICMP: never schedule ICMP to a real server;
 * accept the packet and let the local IP stack handle it.
 * Returns 0 (verdict decided) with *verdict = NF_ACCEPT.
 */
static int
icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
		   int *verdict, struct ip_vs_conn **cpp)
{
	*verdict = NF_ACCEPT;
	return 0;
}
85 | |||
86 | static int | ||
87 | icmp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) | ||
88 | { | ||
89 | if (!(skb->nh.iph->frag_off & __constant_htons(IP_OFFSET))) { | ||
90 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) { | ||
91 | if (ip_vs_checksum_complete(skb, skb->nh.iph->ihl * 4)) { | ||
92 | IP_VS_DBG_RL_PKT(0, pp, skb, 0, "Failed checksum for"); | ||
93 | return 0; | ||
94 | } | ||
95 | } | ||
96 | } | ||
97 | return 1; | ||
98 | } | ||
99 | |||
100 | static void | ||
101 | icmp_debug_packet(struct ip_vs_protocol *pp, | ||
102 | const struct sk_buff *skb, | ||
103 | int offset, | ||
104 | const char *msg) | ||
105 | { | ||
106 | char buf[256]; | ||
107 | struct iphdr _iph, *ih; | ||
108 | |||
109 | ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); | ||
110 | if (ih == NULL) | ||
111 | sprintf(buf, "%s TRUNCATED", pp->name); | ||
112 | else if (ih->frag_off & __constant_htons(IP_OFFSET)) | ||
113 | sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", | ||
114 | pp->name, NIPQUAD(ih->saddr), | ||
115 | NIPQUAD(ih->daddr)); | ||
116 | else { | ||
117 | struct icmphdr _icmph, *ic; | ||
118 | |||
119 | ic = skb_header_pointer(skb, offset + ih->ihl*4, | ||
120 | sizeof(_icmph), &_icmph); | ||
121 | if (ic == NULL) | ||
122 | sprintf(buf, "%s TRUNCATED to %u bytes\n", | ||
123 | pp->name, skb->len - offset); | ||
124 | else | ||
125 | sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d", | ||
126 | pp->name, NIPQUAD(ih->saddr), | ||
127 | NIPQUAD(ih->daddr), | ||
128 | ic->type, ic->code); | ||
129 | } | ||
130 | printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); | ||
131 | } | ||
132 | |||
/*
 * ICMP has a single state: every packet just (re)arms the one NORMAL
 * timeout on the connection. Always returns 1 (state handled).
 */
static int
icmp_state_transition(struct ip_vs_conn *cp, int direction,
		      const struct sk_buff *skb,
		      struct ip_vs_protocol *pp)
{
	cp->timeout = pp->timeout_table[IP_VS_ICMP_S_NORMAL];
	return 1;
}
141 | |||
142 | static int | ||
143 | icmp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) | ||
144 | { | ||
145 | int num; | ||
146 | char **names; | ||
147 | |||
148 | num = IP_VS_ICMP_S_LAST; | ||
149 | names = icmp_state_name_table; | ||
150 | return ip_vs_set_state_timeout(pp->timeout_table, num, names, sname, to); | ||
151 | } | ||
152 | |||
153 | |||
/* Protocol init hook: attach the single-entry ICMP timeout table. */
static void icmp_init(struct ip_vs_protocol *pp)
{
	pp->timeout_table = icmp_timeouts;
}
158 | |||
/* Protocol exit hook: timeout table is static, nothing to release. */
static void icmp_exit(struct ip_vs_protocol *pp)
{
}
162 | |||
/*
 * IPVS protocol descriptor for ICMP. Connection lookup/scheduling are
 * stubs (ICMP is not load-balanced); checksum verification, debug
 * logging and the single-state timeout handling are wired in.
 */
struct ip_vs_protocol ip_vs_protocol_icmp = {
	.name =			"ICMP",
	.protocol =		IPPROTO_ICMP,
	.dont_defrag =		0,
	.init =			icmp_init,
	.exit =			icmp_exit,
	.conn_schedule =	icmp_conn_schedule,
	.conn_in_get =		icmp_conn_in_get,
	.conn_out_get =		icmp_conn_out_get,
	.snat_handler =		NULL,	/* no port rewriting for ICMP */
	.dnat_handler =		NULL,
	.csum_check =		icmp_csum_check,
	.state_transition =	icmp_state_transition,
	.register_app =		NULL,	/* no application helpers */
	.unregister_app =	NULL,
	.app_conn_bind =	NULL,
	.debug_packet =		icmp_debug_packet,
	.timeout_change =	NULL,
	.set_state_timeout =	icmp_set_state_timeout,
};
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c new file mode 100644 index 000000000000..e65de675da74 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c | |||
@@ -0,0 +1,640 @@ | |||
1 | /* | ||
2 | * ip_vs_proto_tcp.c: TCP load balancing support for IPVS | ||
3 | * | ||
4 | * Version: $Id: ip_vs_proto_tcp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
7 | * Julian Anastasov <ja@ssi.bg> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * as published by the Free Software Foundation; either version | ||
12 | * 2 of the License, or (at your option) any later version. | ||
13 | * | ||
14 | * Changes: | ||
15 | * | ||
16 | */ | ||
17 | |||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/ip.h> | ||
20 | #include <linux/tcp.h> /* for tcphdr */ | ||
21 | #include <net/ip.h> | ||
22 | #include <net/tcp.h> /* for csum_tcpudp_magic */ | ||
23 | #include <linux/netfilter_ipv4.h> | ||
24 | |||
25 | #include <net/ip_vs.h> | ||
26 | |||
27 | |||
28 | static struct ip_vs_conn * | ||
29 | tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
30 | const struct iphdr *iph, unsigned int proto_off, int inverse) | ||
31 | { | ||
32 | __u16 _ports[2], *pptr; | ||
33 | |||
34 | pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); | ||
35 | if (pptr == NULL) | ||
36 | return NULL; | ||
37 | |||
38 | if (likely(!inverse)) { | ||
39 | return ip_vs_conn_in_get(iph->protocol, | ||
40 | iph->saddr, pptr[0], | ||
41 | iph->daddr, pptr[1]); | ||
42 | } else { | ||
43 | return ip_vs_conn_in_get(iph->protocol, | ||
44 | iph->daddr, pptr[1], | ||
45 | iph->saddr, pptr[0]); | ||
46 | } | ||
47 | } | ||
48 | |||
49 | static struct ip_vs_conn * | ||
50 | tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
51 | const struct iphdr *iph, unsigned int proto_off, int inverse) | ||
52 | { | ||
53 | __u16 _ports[2], *pptr; | ||
54 | |||
55 | pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); | ||
56 | if (pptr == NULL) | ||
57 | return NULL; | ||
58 | |||
59 | if (likely(!inverse)) { | ||
60 | return ip_vs_conn_out_get(iph->protocol, | ||
61 | iph->saddr, pptr[0], | ||
62 | iph->daddr, pptr[1]); | ||
63 | } else { | ||
64 | return ip_vs_conn_out_get(iph->protocol, | ||
65 | iph->daddr, pptr[1], | ||
66 | iph->saddr, pptr[0]); | ||
67 | } | ||
68 | } | ||
69 | |||
70 | |||
/*
 * Decide whether this TCP packet should be scheduled to a real server.
 * Only SYN packets that match a configured virtual service are
 * considered. Returns 1 to continue normal processing, or 0 when the
 * verdict has been decided (*verdict set, *cpp possibly bound to a new
 * connection).
 */
static int
tcp_conn_schedule(struct sk_buff *skb,
		  struct ip_vs_protocol *pp,
		  int *verdict, struct ip_vs_conn **cpp)
{
	struct ip_vs_service *svc;
	struct tcphdr _tcph, *th;

	th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
				sizeof(_tcph), &_tcph);
	if (th == NULL) {
		/* truncated TCP header: drop */
		*verdict = NF_DROP;
		return 0;
	}

	/* ip_vs_service_get() takes a reference on the service */
	if (th->syn &&
	    (svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
				     skb->nh.iph->daddr, th->dest))) {
		if (ip_vs_todrop()) {
			/*
			 * It seems that we are very loaded.
			 * We have to drop this packet :(
			 */
			ip_vs_service_put(svc);
			*verdict = NF_DROP;
			return 0;
		}

		/*
		 * Let the virtual server select a real server for the
		 * incoming connection, and create a connection entry.
		 */
		*cpp = ip_vs_schedule(svc, skb);
		if (!*cpp) {
			/* NOTE(review): no ip_vs_service_put() on this
			 * path — presumably ip_vs_leave() drops the svc
			 * reference itself; confirm against its
			 * definition. */
			*verdict = ip_vs_leave(svc, skb, pp);
			return 0;
		}
		ip_vs_service_put(svc);
	}
	return 1;
}
112 | |||
113 | |||
/*
 * Incrementally patch the TCP checksum after rewriting one address and
 * one port (NAT fast path): fold out the old ip/port values and fold in
 * the new ones via ip_vs_check_diff(), instead of recomputing the whole
 * pseudo-header checksum. The ~oldip / oldport^0xFFFF forms supply the
 * one's-complement of the old values to the diff helper.
 */
static inline void
tcp_fast_csum_update(struct tcphdr *tcph, u32 oldip, u32 newip,
		     u16 oldport, u16 newport)
{
	tcph->check =
		ip_vs_check_diff(~oldip, newip,
				 ip_vs_check_diff(oldport ^ 0xFFFF,
						  newport, tcph->check));
}
123 | |||
124 | |||
/*
 * Source-NAT a server->client TCP packet: rewrite the source port to
 * the virtual port and fix the TCP checksum. Returns 1 on success,
 * 0 on failure (caller drops the packet).
 */
static int
tcp_snat_handler(struct sk_buff **pskb,
		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
	struct tcphdr *tcph;
	unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;

	/* csum_check requires unshared skb */
	if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
		return 0;

	if (unlikely(cp->app != NULL)) {
		/* Some checks before mangling */
		if (pp->csum_check && !pp->csum_check(*pskb, pp))
			return 0;

		/* Call application helper if needed */
		if (!ip_vs_app_pkt_out(cp, pskb))
			return 0;
	}

	/* re-fetch the header: the app helper may have reallocated the skb */
	tcph = (void *)(*pskb)->nh.iph + tcphoff;
	tcph->source = cp->vport;

	/* Adjust TCP checksums */
	if (!cp->app) {
		/* Only port and addr are changed, do fast csum update */
		tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr,
				     cp->dport, cp->vport);
		/* hardware checksum is stale after mangling; force
		 * software recomputation later */
		if ((*pskb)->ip_summed == CHECKSUM_HW)
			(*pskb)->ip_summed = CHECKSUM_NONE;
	} else {
		/* full checksum calculation (payload may have changed) */
		tcph->check = 0;
		(*pskb)->csum = skb_checksum(*pskb, tcphoff,
					     (*pskb)->len - tcphoff, 0);
		tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
						(*pskb)->len - tcphoff,
						cp->protocol,
						(*pskb)->csum);
		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
			  pp->name, tcph->check,
			  (char*)&(tcph->check) - (char*)tcph);
	}
	return 1;
}
171 | |||
172 | |||
/*
 * Destination-NAT a client->virtual TCP packet: rewrite the destination
 * port to the real server's port and fix the TCP checksum. Returns 1 on
 * success, 0 on failure (caller drops the packet).
 */
static int
tcp_dnat_handler(struct sk_buff **pskb,
		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
	struct tcphdr *tcph;
	unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;

	/* csum_check requires unshared skb */
	if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
		return 0;

	if (unlikely(cp->app != NULL)) {
		/* Some checks before mangling */
		if (pp->csum_check && !pp->csum_check(*pskb, pp))
			return 0;

		/*
		 *	Attempt ip_vs_app call.
		 *	It will fix ip_vs_conn and iph ack_seq stuff
		 */
		if (!ip_vs_app_pkt_in(cp, pskb))
			return 0;
	}

	/* re-fetch the header: the app helper may have reallocated the skb */
	tcph = (void *)(*pskb)->nh.iph + tcphoff;
	tcph->dest = cp->dport;

	/*
	 *	Adjust TCP checksums
	 */
	if (!cp->app) {
		/* Only port and addr are changed, do fast csum update */
		tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr,
				     cp->vport, cp->dport);
		/* hardware checksum is stale after mangling; force
		 * software recomputation later */
		if ((*pskb)->ip_summed == CHECKSUM_HW)
			(*pskb)->ip_summed = CHECKSUM_NONE;
	} else {
		/* full checksum calculation (payload may have changed) */
		tcph->check = 0;
		(*pskb)->csum = skb_checksum(*pskb, tcphoff,
					     (*pskb)->len - tcphoff, 0);
		tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
						(*pskb)->len - tcphoff,
						cp->protocol,
						(*pskb)->csum);
		/* NOTE(review): only the DNAT path marks the csum as
		 * verified here, unlike tcp_snat_handler — presumably
		 * because the packet continues to local delivery;
		 * confirm. */
		(*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
	}
	return 1;
}
222 | |||
223 | |||
/*
 * Verify the TCP checksum over the pseudo-header and segment.
 * Returns 1 if acceptable, 0 on checksum failure (logged rate-limited).
 */
static int
tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
{
	unsigned int tcphoff = skb->nh.iph->ihl*4;

	switch (skb->ip_summed) {
	case CHECKSUM_NONE:
		/* no partial sum available yet: compute it first, then
		 * verify it exactly as in the CHECKSUM_HW case */
		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
		/* fallthrough */
	case CHECKSUM_HW:
		if (csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
				      skb->len - tcphoff,
				      skb->nh.iph->protocol, skb->csum)) {
			IP_VS_DBG_RL_PKT(0, pp, skb, 0,
					 "Failed checksum for");
			return 0;
		}
		break;
	default:
		/* CHECKSUM_UNNECESSARY */
		break;
	}

	return 1;
}
248 | |||
249 | |||
/*
 * Row offsets into the state-transition tables below: each direction
 * owns 4 consecutive rows (one per TCP event: syn/fin/ack/rst), so the
 * offsets step by 4.
 */
#define TCP_DIR_INPUT 0
#define TCP_DIR_OUTPUT 4
#define TCP_DIR_INPUT_ONLY 8

/* Map an IP_VS_DIR_* direction to its row offset. */
static int tcp_state_off[IP_VS_DIR_LAST] = {
	[IP_VS_DIR_INPUT]		=	TCP_DIR_INPUT,
	[IP_VS_DIR_OUTPUT]		=	TCP_DIR_OUTPUT,
	[IP_VS_DIR_INPUT_ONLY]		=	TCP_DIR_INPUT_ONLY,
};

/*
 *	Timeout table[state]
 */
static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
	[IP_VS_TCP_S_NONE]		=	2*HZ,
	[IP_VS_TCP_S_ESTABLISHED]	=	15*60*HZ,
	[IP_VS_TCP_S_SYN_SENT]		=	2*60*HZ,
	[IP_VS_TCP_S_SYN_RECV]		=	1*60*HZ,
	[IP_VS_TCP_S_FIN_WAIT]		=	2*60*HZ,
	[IP_VS_TCP_S_TIME_WAIT]		=	2*60*HZ,
	[IP_VS_TCP_S_CLOSE]		=	10*HZ,
	[IP_VS_TCP_S_CLOSE_WAIT]	=	60*HZ,
	[IP_VS_TCP_S_LAST_ACK]		=	30*HZ,
	[IP_VS_TCP_S_LISTEN]		=	2*60*HZ,
	[IP_VS_TCP_S_SYNACK]		=	120*HZ,
	[IP_VS_TCP_S_LAST]		=	2*HZ,
};


#if 0

/* FIXME: This is going to die */

/* Shorter timeouts for defense ("secure_tcp") mode; currently unused. */
static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = {
	[IP_VS_TCP_S_NONE]		=	2*HZ,
	[IP_VS_TCP_S_ESTABLISHED]	=	8*60*HZ,
	[IP_VS_TCP_S_SYN_SENT]		=	60*HZ,
	[IP_VS_TCP_S_SYN_RECV]		=	10*HZ,
	[IP_VS_TCP_S_FIN_WAIT]		=	60*HZ,
	[IP_VS_TCP_S_TIME_WAIT]		=	60*HZ,
	[IP_VS_TCP_S_CLOSE]		=	10*HZ,
	[IP_VS_TCP_S_CLOSE_WAIT]	=	60*HZ,
	[IP_VS_TCP_S_LAST_ACK]		=	30*HZ,
	[IP_VS_TCP_S_LISTEN]		=	2*60*HZ,
	[IP_VS_TCP_S_SYNACK]		=	100*HZ,
	[IP_VS_TCP_S_LAST]		=	2*HZ,
};

#endif

/* State names for /proc output and timeout-by-name lookups. */
static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
	[IP_VS_TCP_S_NONE]		=	"NONE",
	[IP_VS_TCP_S_ESTABLISHED]	=	"ESTABLISHED",
	[IP_VS_TCP_S_SYN_SENT]		=	"SYN_SENT",
	[IP_VS_TCP_S_SYN_RECV]		=	"SYN_RECV",
	[IP_VS_TCP_S_FIN_WAIT]		=	"FIN_WAIT",
	[IP_VS_TCP_S_TIME_WAIT]		=	"TIME_WAIT",
	[IP_VS_TCP_S_CLOSE]		=	"CLOSE",
	[IP_VS_TCP_S_CLOSE_WAIT]	=	"CLOSE_WAIT",
	[IP_VS_TCP_S_LAST_ACK]		=	"LAST_ACK",
	[IP_VS_TCP_S_LISTEN]		=	"LISTEN",
	[IP_VS_TCP_S_SYNACK]		=	"SYNACK",
	[IP_VS_TCP_S_LAST]		=	"BUG!",
};

/* Short aliases used to keep the transition tables readable. */
#define sNO IP_VS_TCP_S_NONE
#define sES IP_VS_TCP_S_ESTABLISHED
#define sSS IP_VS_TCP_S_SYN_SENT
#define sSR IP_VS_TCP_S_SYN_RECV
#define sFW IP_VS_TCP_S_FIN_WAIT
#define sTW IP_VS_TCP_S_TIME_WAIT
#define sCL IP_VS_TCP_S_CLOSE
#define sCW IP_VS_TCP_S_CLOSE_WAIT
#define sLA IP_VS_TCP_S_LAST_ACK
#define sLI IP_VS_TCP_S_LISTEN
#define sSA IP_VS_TCP_S_SYNACK

/* One table row: next state indexed by the connection's current state. */
struct tcp_states_t {
	int next_state[IP_VS_TCP_S_LAST];
};
330 | |||
331 | static const char * tcp_state_name(int state) | ||
332 | { | ||
333 | if (state >= IP_VS_TCP_S_LAST) | ||
334 | return "ERR!"; | ||
335 | return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; | ||
336 | } | ||
337 | |||
/*
 * Normal-mode state machine: rows are grouped in blocks of 4 per
 * direction (syn/fin/ack/rst), indexed via TCP_DIR_* + tcp_state_idx();
 * columns are the connection's current state.
 */
static struct tcp_states_t tcp_states [] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},

/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};

/*
 * Defense-mode ("secure_tcp") variant: selected instead of tcp_states
 * by tcp_timeout_change(); biases transitions toward SYNACK/shorter
 * lived states.
 */
static struct tcp_states_t tcp_states_dos [] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},

/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};

/* Currently active table; switched by tcp_timeout_change(). */
static struct tcp_states_t *tcp_state_table = tcp_states;
385 | |||
386 | |||
387 | static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) | ||
388 | { | ||
389 | int on = (flags & 1); /* secure_tcp */ | ||
390 | |||
391 | /* | ||
392 | ** FIXME: change secure_tcp to independent sysctl var | ||
393 | ** or make it per-service or per-app because it is valid | ||
394 | ** for most if not for all of the applications. Something | ||
395 | ** like "capabilities" (flags) for each object. | ||
396 | */ | ||
397 | tcp_state_table = (on? tcp_states_dos : tcp_states); | ||
398 | } | ||
399 | |||
/*
 * Set the timeout of the TCP state named by @sname to @to, via the
 * generic name-table helper. Returns the helper's result.
 */
static int
tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
{
	return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
				       tcp_state_name_table, sname, to);
}
406 | |||
407 | static inline int tcp_state_idx(struct tcphdr *th) | ||
408 | { | ||
409 | if (th->rst) | ||
410 | return 3; | ||
411 | if (th->syn) | ||
412 | return 0; | ||
413 | if (th->fin) | ||
414 | return 1; | ||
415 | if (th->ack) | ||
416 | return 2; | ||
417 | return -1; | ||
418 | } | ||
419 | |||
/*
 * Advance the connection's TCP state machine for one packet and rearm
 * its timeout. Also maintains the destination's active/inactive
 * connection counters when the connection crosses the ESTABLISHED
 * boundary. Caller holds cp->lock (see tcp_state_transition).
 */
static inline void
set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
	      int direction, struct tcphdr *th)
{
	int state_idx;
	int new_state = IP_VS_TCP_S_CLOSE;
	int state_off = tcp_state_off[direction];

	/*
	 *    Update state offset to INPUT_ONLY if necessary
	 *    or delete NO_OUTPUT flag if output packet detected
	 */
	if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
		if (state_off == TCP_DIR_OUTPUT)
			cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
		else
			state_off = TCP_DIR_INPUT_ONLY;
	}

	/* no recognizable TCP flag: falls through with CLOSE as the
	 * new state */
	if ((state_idx = tcp_state_idx(th)) < 0) {
		IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
		goto tcp_state_out;
	}

	new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];

  tcp_state_out:
	if (new_state != cp->state) {
		struct ip_vs_dest *dest = cp->dest;

		IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
			  "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
			  pp->name,
			  (state_off==TCP_DIR_OUTPUT)?"output ":"input ",
			  th->syn? 'S' : '.',
			  th->fin? 'F' : '.',
			  th->ack? 'A' : '.',
			  th->rst? 'R' : '.',
			  NIPQUAD(cp->daddr), ntohs(cp->dport),
			  NIPQUAD(cp->caddr), ntohs(cp->cport),
			  tcp_state_name(cp->state),
			  tcp_state_name(new_state),
			  atomic_read(&cp->refcnt));
		if (dest) {
			/* keep dest->activeconns/inactconns in sync with
			 * whether the connection is ESTABLISHED */
			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
			    (new_state != IP_VS_TCP_S_ESTABLISHED)) {
				atomic_dec(&dest->activeconns);
				atomic_inc(&dest->inactconns);
				cp->flags |= IP_VS_CONN_F_INACTIVE;
			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
				   (new_state == IP_VS_TCP_S_ESTABLISHED)) {
				atomic_inc(&dest->activeconns);
				atomic_dec(&dest->inactconns);
				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
			}
		}
	}

	/* commit the new state and rearm its per-state timeout */
	cp->timeout = pp->timeout_table[cp->state = new_state];
}
480 | |||
481 | |||
482 | /* | ||
483 | * Handle state transitions | ||
484 | */ | ||
/*
 *	Handle state transitions
 *
 * Reads the TCP header from the skb and applies the state machine under
 * cp->lock. Returns 0 if the TCP header is truncated, 1 otherwise.
 */
static int
tcp_state_transition(struct ip_vs_conn *cp, int direction,
		     const struct sk_buff *skb,
		     struct ip_vs_protocol *pp)
{
	struct tcphdr _tcph, *th;

	th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
				sizeof(_tcph), &_tcph);
	if (th == NULL)
		return 0;

	/* serialize state/flag updates for this connection */
	spin_lock(&cp->lock);
	set_tcp_state(pp, cp, direction, th);
	spin_unlock(&cp->lock);

	return 1;
}
503 | |||
504 | |||
/*
 *	Hash table for TCP application incarnations
 */
#define	TCP_APP_TAB_BITS	4
#define	TCP_APP_TAB_SIZE	(1 << TCP_APP_TAB_BITS)
#define	TCP_APP_TAB_MASK	(TCP_APP_TAB_SIZE - 1)

/* app lists per bucket, protected by tcp_app_lock */
static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
static DEFINE_SPINLOCK(tcp_app_lock);

/* Bucket index for an app port: XOR-fold the port into TAB_BITS bits.
 * NOTE(review): @port appears to be in network byte order throughout
 * (compared directly against cp->vport) — confirm. */
static inline __u16 tcp_app_hashkey(__u16 port)
{
	return ((port >> TCP_APP_TAB_BITS) ^ port) & TCP_APP_TAB_MASK;
}
519 | |||
520 | |||
/*
 * Register a TCP application incarnation under its port. Returns 0 on
 * success, -EEXIST if another app already claims the port. Bumps the
 * protocol-wide app counter on success.
 */
static int tcp_register_app(struct ip_vs_app *inc)
{
	struct ip_vs_app *i;
	__u16 hash, port = inc->port;
	int ret = 0;

	hash = tcp_app_hashkey(port);

	spin_lock_bh(&tcp_app_lock);
	list_for_each_entry(i, &tcp_apps[hash], p_list) {
		if (i->port == port) {
			ret = -EEXIST;
			goto out;
		}
	}
	list_add(&inc->p_list, &tcp_apps[hash]);
	atomic_inc(&ip_vs_protocol_tcp.appcnt);

  out:
	spin_unlock_bh(&tcp_app_lock);
	return ret;
}
543 | |||
544 | |||
/* Remove a TCP application incarnation from its hash bucket and drop
 * the protocol-wide app counter. */
static void
tcp_unregister_app(struct ip_vs_app *inc)
{
	spin_lock_bh(&tcp_app_lock);
	atomic_dec(&ip_vs_protocol_tcp.appcnt);
	list_del(&inc->p_list);
	spin_unlock_bh(&tcp_app_lock);
}
553 | |||
554 | |||
/*
 * Bind a NAT connection to the application helper registered for its
 * virtual port, if any, and run the helper's init_conn. Returns 0 when
 * no helper matches (or forwarding is not NAT), otherwise the helper's
 * init_conn result.
 */
static int
tcp_app_conn_bind(struct ip_vs_conn *cp)
{
	int hash;
	struct ip_vs_app *inc;
	int result = 0;

	/* Default binding: bind app only for NAT */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
		return 0;

	/* Lookup application incarnations and bind the right one */
	hash = tcp_app_hashkey(cp->vport);

	spin_lock(&tcp_app_lock);
	list_for_each_entry(inc, &tcp_apps[hash], p_list) {
		if (inc->port == cp->vport) {
			if (unlikely(!ip_vs_app_inc_get(inc)))
				break;
			/* reference taken: safe to drop the table lock
			 * before calling into the helper */
			spin_unlock(&tcp_app_lock);

			IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
				  "%u.%u.%u.%u:%u to app %s on port %u\n",
				  __FUNCTION__,
				  NIPQUAD(cp->caddr), ntohs(cp->cport),
				  NIPQUAD(cp->vaddr), ntohs(cp->vport),
				  inc->name, ntohs(inc->port));
			cp->app = inc;
			if (inc->init_conn)
				result = inc->init_conn(inc, cp);
			goto out;
		}
	}
	spin_unlock(&tcp_app_lock);

  out:
	return result;
}
593 | |||
594 | |||
/*
 *	Set LISTEN timeout. (ip_vs_conn_put will setup timer)
 *
 * Forces the connection into LISTEN state with its LISTEN timeout;
 * exported (non-static) for use by other IPVS code.
 */
void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
{
	spin_lock(&cp->lock);
	cp->state = IP_VS_TCP_S_LISTEN;
	cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
	spin_unlock(&cp->lock);
}
605 | |||
606 | |||
/* Protocol init hook: set up the app hash buckets and attach the
 * default TCP timeout table. */
static void tcp_init(struct ip_vs_protocol *pp)
{
	IP_VS_INIT_HASH_TABLE(tcp_apps);
	pp->timeout_table = tcp_timeouts;
}
612 | |||
613 | |||
/* Protocol exit hook: all TCP protocol state is static, nothing to free. */
static void tcp_exit(struct ip_vs_protocol *pp)
{
}
617 | |||
618 | |||
/*
 * IPVS protocol descriptor for TCP: full support for connection lookup,
 * scheduling, NAT (snat/dnat with checksum fixup), the TCP state
 * machine, and application helpers.
 */
struct ip_vs_protocol ip_vs_protocol_tcp = {
	.name =			"TCP",
	.protocol =		IPPROTO_TCP,
	.dont_defrag =		0,
	.appcnt =		ATOMIC_INIT(0),
	.init =			tcp_init,
	.exit =			tcp_exit,
	.register_app =		tcp_register_app,
	.unregister_app =	tcp_unregister_app,
	.conn_schedule =	tcp_conn_schedule,
	.conn_in_get =		tcp_conn_in_get,
	.conn_out_get =		tcp_conn_out_get,
	.snat_handler =		tcp_snat_handler,
	.dnat_handler =		tcp_dnat_handler,
	.csum_check =		tcp_csum_check,
	.state_name =		tcp_state_name,
	.state_transition =	tcp_state_transition,
	.app_conn_bind =	tcp_app_conn_bind,
	.debug_packet =		ip_vs_tcpudp_debug_packet,
	.timeout_change =	tcp_timeout_change,
	.set_state_timeout =	tcp_set_state_timeout,
};
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c new file mode 100644 index 000000000000..8ae5f2e0aefa --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c | |||
@@ -0,0 +1,427 @@ | |||
1 | /* | ||
2 | * ip_vs_proto_udp.c: UDP load balancing support for IPVS | ||
3 | * | ||
4 | * Version: $Id: ip_vs_proto_udp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
7 | * Julian Anastasov <ja@ssi.bg> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * as published by the Free Software Foundation; either version | ||
12 | * 2 of the License, or (at your option) any later version. | ||
13 | * | ||
14 | * Changes: | ||
15 | * | ||
16 | */ | ||
17 | |||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/netfilter_ipv4.h> | ||
20 | |||
21 | #include <net/ip_vs.h> | ||
22 | |||
23 | |||
24 | static struct ip_vs_conn * | ||
25 | udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
26 | const struct iphdr *iph, unsigned int proto_off, int inverse) | ||
27 | { | ||
28 | struct ip_vs_conn *cp; | ||
29 | __u16 _ports[2], *pptr; | ||
30 | |||
31 | pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); | ||
32 | if (pptr == NULL) | ||
33 | return NULL; | ||
34 | |||
35 | if (likely(!inverse)) { | ||
36 | cp = ip_vs_conn_in_get(iph->protocol, | ||
37 | iph->saddr, pptr[0], | ||
38 | iph->daddr, pptr[1]); | ||
39 | } else { | ||
40 | cp = ip_vs_conn_in_get(iph->protocol, | ||
41 | iph->daddr, pptr[1], | ||
42 | iph->saddr, pptr[0]); | ||
43 | } | ||
44 | |||
45 | return cp; | ||
46 | } | ||
47 | |||
48 | |||
49 | static struct ip_vs_conn * | ||
50 | udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
51 | const struct iphdr *iph, unsigned int proto_off, int inverse) | ||
52 | { | ||
53 | struct ip_vs_conn *cp; | ||
54 | __u16 _ports[2], *pptr; | ||
55 | |||
56 | pptr = skb_header_pointer(skb, skb->nh.iph->ihl*4, | ||
57 | sizeof(_ports), _ports); | ||
58 | if (pptr == NULL) | ||
59 | return NULL; | ||
60 | |||
61 | if (likely(!inverse)) { | ||
62 | cp = ip_vs_conn_out_get(iph->protocol, | ||
63 | iph->saddr, pptr[0], | ||
64 | iph->daddr, pptr[1]); | ||
65 | } else { | ||
66 | cp = ip_vs_conn_out_get(iph->protocol, | ||
67 | iph->daddr, pptr[1], | ||
68 | iph->saddr, pptr[0]); | ||
69 | } | ||
70 | |||
71 | return cp; | ||
72 | } | ||
73 | |||
74 | |||
/*
 * Decide whether IPVS takes over a new UDP packet: look up a matching
 * virtual service and let it schedule a real server.  Returns 1 to let
 * the caller continue, 0 when *verdict has already been decided.
 */
static int
udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
		  int *verdict, struct ip_vs_conn **cpp)
{
	struct ip_vs_service *svc;
	struct udphdr _udph, *uh;

	uh = skb_header_pointer(skb, skb->nh.iph->ihl*4,
				sizeof(_udph), &_udph);
	if (uh == NULL) {
		/* Truncated or unreadable UDP header: drop. */
		*verdict = NF_DROP;
		return 0;
	}

	if ((svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
				     skb->nh.iph->daddr, uh->dest))) {
		if (ip_vs_todrop()) {
			/*
			 * It seems that we are very loaded.
			 * We have to drop this packet :(
			 */
			ip_vs_service_put(svc);
			*verdict = NF_DROP;
			return 0;
		}

		/*
		 * Let the virtual server select a real server for the
		 * incoming connection, and create a connection entry.
		 */
		*cpp = ip_vs_schedule(svc, skb);
		if (!*cpp) {
			/* NOTE(review): no ip_vs_service_put() on this
			 * path — presumably ip_vs_leave() consumes the
			 * service reference itself; confirm against its
			 * definition. */
			*verdict = ip_vs_leave(svc, skb, pp);
			return 0;
		}
		ip_vs_service_put(svc);
	}
	return 1;
}
114 | |||
115 | |||
/*
 * Incrementally patch a UDP checksum after one address and one port
 * were rewritten (no need to re-sum the payload).
 */
static inline void
udp_fast_csum_update(struct udphdr *uhdr, u32 oldip, u32 newip,
		     u16 oldport, u16 newport)
{
	uhdr->check =
		ip_vs_check_diff(~oldip, newip,
				 ip_vs_check_diff(oldport ^ 0xFFFF,
						  newport, uhdr->check));
	/* In UDP a checksum of 0 means "no checksum"; a computed zero
	 * must be transmitted as all-ones instead. */
	if (!uhdr->check)
		uhdr->check = 0xFFFF;
}
127 | |||
/*
 * Source-NAT an outbound UDP packet: rewrite the source port to the
 * virtual port and fix up the checksum.  Returns 1 on success, 0 if
 * the packet could not be made writable or an app helper failed.
 */
static int
udp_snat_handler(struct sk_buff **pskb,
		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
	struct udphdr *udph;
	unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;

	/* csum_check requires unshared skb */
	if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
		return 0;

	if (unlikely(cp->app != NULL)) {
		/* Some checks before mangling */
		if (pp->csum_check && !pp->csum_check(*pskb, pp))
			return 0;

		/*
		 * Call application helper if needed
		 */
		if (!ip_vs_app_pkt_out(cp, pskb))
			return 0;
	}

	/* Re-derive the header pointer: the app helper may have
	 * reallocated skb data. */
	udph = (void *)(*pskb)->nh.iph + udphoff;
	udph->source = cp->vport;

	/*
	 *	Adjust UDP checksums
	 */
	if (!cp->app && (udph->check != 0)) {
		/* Only port and addr are changed, do fast csum update */
		udp_fast_csum_update(udph, cp->daddr, cp->vaddr,
				     cp->dport, cp->vport);
		if ((*pskb)->ip_summed == CHECKSUM_HW)
			(*pskb)->ip_summed = CHECKSUM_NONE;
	} else {
		/* full checksum calculation (payload may have been
		 * mangled by the app helper) */
		udph->check = 0;
		(*pskb)->csum = skb_checksum(*pskb, udphoff,
					     (*pskb)->len - udphoff, 0);
		udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
						(*pskb)->len - udphoff,
						cp->protocol,
						(*pskb)->csum);
		/* UDP: a computed zero checksum is sent as all-ones */
		if (udph->check == 0)
			udph->check = 0xFFFF;
		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
			  pp->name, udph->check,
			  (char*)&(udph->check) - (char*)udph);
	}
	return 1;
}
180 | |||
181 | |||
/*
 * Destination-NAT an inbound UDP packet: rewrite the destination port
 * to the real server's port and fix up the checksum.  Returns 1 on
 * success, 0 on failure (mirror of udp_snat_handler above).
 */
static int
udp_dnat_handler(struct sk_buff **pskb,
		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
	struct udphdr *udph;
	unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;

	/* csum_check requires unshared skb */
	if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
		return 0;

	if (unlikely(cp->app != NULL)) {
		/* Some checks before mangling */
		if (pp->csum_check && !pp->csum_check(*pskb, pp))
			return 0;

		/*
		 *	Attempt ip_vs_app call.
		 *	It will fix ip_vs_conn
		 */
		if (!ip_vs_app_pkt_in(cp, pskb))
			return 0;
	}

	udph = (void *)(*pskb)->nh.iph + udphoff;
	udph->dest = cp->dport;

	/*
	 *	Adjust UDP checksums
	 */
	if (!cp->app && (udph->check != 0)) {
		/* Only port and addr are changed, do fast csum update */
		udp_fast_csum_update(udph, cp->vaddr, cp->daddr,
				     cp->vport, cp->dport);
		if ((*pskb)->ip_summed == CHECKSUM_HW)
			(*pskb)->ip_summed = CHECKSUM_NONE;
	} else {
		/* full checksum calculation */
		udph->check = 0;
		(*pskb)->csum = skb_checksum(*pskb, udphoff,
					     (*pskb)->len - udphoff, 0);
		udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
						(*pskb)->len - udphoff,
						cp->protocol,
						(*pskb)->csum);
		/* UDP: a computed zero checksum is sent as all-ones */
		if (udph->check == 0)
			udph->check = 0xFFFF;
		/* Freshly computed, so the stack need not verify it.
		 * NOTE(review): the snat path does not set this — verify
		 * whether the asymmetry is intentional. */
		(*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
	}
	return 1;
}
233 | |||
234 | |||
/*
 * Verify the UDP checksum of skb.  Returns 1 if the checksum is valid
 * (or absent, which UDP permits), 0 if invalid or the header is
 * unreadable.
 */
static int
udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
{
	struct udphdr _udph, *uh;
	unsigned int udphoff = skb->nh.iph->ihl*4;

	uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
	if (uh == NULL)
		return 0;

	/* check == 0 means the sender did not compute a checksum */
	if (uh->check != 0) {
		switch (skb->ip_summed) {
		case CHECKSUM_NONE:
			skb->csum = skb_checksum(skb, udphoff,
						 skb->len - udphoff, 0);
			/* fall through: csum is now filled in, verify it
			 * exactly like the hardware-computed case */
		case CHECKSUM_HW:
			if (csum_tcpudp_magic(skb->nh.iph->saddr,
					      skb->nh.iph->daddr,
					      skb->len - udphoff,
					      skb->nh.iph->protocol,
					      skb->csum)) {
				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
						 "Failed checksum for");
				return 0;
			}
			break;
		default:
			/* CHECKSUM_UNNECESSARY */
			break;
		}
	}
	return 1;
}
268 | |||
269 | |||
270 | /* | ||
271 | * Note: the caller guarantees that only one of register_app, | ||
272 | * unregister_app or app_conn_bind is called each time. | ||
273 | */ | ||
274 | |||
/* 16-bucket hash table of registered UDP application helpers,
 * keyed by virtual port; udp_app_lock guards the chains. */
#define	UDP_APP_TAB_BITS	4
#define	UDP_APP_TAB_SIZE	(1 << UDP_APP_TAB_BITS)
#define	UDP_APP_TAB_MASK	(UDP_APP_TAB_SIZE - 1)

static struct list_head udp_apps[UDP_APP_TAB_SIZE];
static DEFINE_SPINLOCK(udp_app_lock);
281 | |||
282 | static inline __u16 udp_app_hashkey(__u16 port) | ||
283 | { | ||
284 | return ((port >> UDP_APP_TAB_BITS) ^ port) & UDP_APP_TAB_MASK; | ||
285 | } | ||
286 | |||
287 | |||
288 | static int udp_register_app(struct ip_vs_app *inc) | ||
289 | { | ||
290 | struct ip_vs_app *i; | ||
291 | __u16 hash, port = inc->port; | ||
292 | int ret = 0; | ||
293 | |||
294 | hash = udp_app_hashkey(port); | ||
295 | |||
296 | |||
297 | spin_lock_bh(&udp_app_lock); | ||
298 | list_for_each_entry(i, &udp_apps[hash], p_list) { | ||
299 | if (i->port == port) { | ||
300 | ret = -EEXIST; | ||
301 | goto out; | ||
302 | } | ||
303 | } | ||
304 | list_add(&inc->p_list, &udp_apps[hash]); | ||
305 | atomic_inc(&ip_vs_protocol_udp.appcnt); | ||
306 | |||
307 | out: | ||
308 | spin_unlock_bh(&udp_app_lock); | ||
309 | return ret; | ||
310 | } | ||
311 | |||
312 | |||
/*
 * Unregister a UDP application helper; appcnt mirrors the number of
 * helpers linked into udp_apps[].
 */
static void
udp_unregister_app(struct ip_vs_app *inc)
{
	spin_lock_bh(&udp_app_lock);
	atomic_dec(&ip_vs_protocol_udp.appcnt);
	list_del(&inc->p_list);
	spin_unlock_bh(&udp_app_lock);
}
321 | |||
322 | |||
/*
 * Bind a new connection to the application helper registered on its
 * virtual port, if any.  Only NAT (masquerading) connections are
 * bound.  Returns the helper's init_conn() result, or 0.
 */
static int udp_app_conn_bind(struct ip_vs_conn *cp)
{
	int hash;
	struct ip_vs_app *inc;
	int result = 0;

	/* Default binding: bind app only for NAT */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
		return 0;

	/* Lookup application incarnations and bind the right one */
	hash = udp_app_hashkey(cp->vport);

	spin_lock(&udp_app_lock);
	list_for_each_entry(inc, &udp_apps[hash], p_list) {
		if (inc->port == cp->vport) {
			if (unlikely(!ip_vs_app_inc_get(inc)))
				break;
			/* Lock is dropped HERE on the hit path; the
			 * "out" label below must therefore not unlock. */
			spin_unlock(&udp_app_lock);

			IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
				  "%u.%u.%u.%u:%u to app %s on port %u\n",
				  __FUNCTION__,
				  NIPQUAD(cp->caddr), ntohs(cp->cport),
				  NIPQUAD(cp->vaddr), ntohs(cp->vport),
				  inc->name, ntohs(inc->port));
			cp->app = inc;
			if (inc->init_conn)
				result = inc->init_conn(inc, cp);
			goto out;
		}
	}
	spin_unlock(&udp_app_lock);

  out:
	return result;
}
360 | |||
361 | |||
/* Default per-state timeouts in jiffies (tunable at runtime through
 * udp_set_state_timeout below). */
static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
	[IP_VS_UDP_S_NORMAL]		=	5*60*HZ,
	[IP_VS_UDP_S_LAST]		=	2*HZ,
};

/* Printable state names, indexed by IP_VS_UDP_S_* */
static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
	[IP_VS_UDP_S_NORMAL]		=	"UDP",
	[IP_VS_UDP_S_LAST]		=	"BUG!",
};
371 | |||
372 | |||
/* Set the timeout for the state named sname to "to" (delegates to the
 * generic helper, which resolves the name via the table above). */
static int
udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
{
	return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
				       udp_state_name_table, sname, to);
}
379 | |||
380 | static const char * udp_state_name(int state) | ||
381 | { | ||
382 | if (state >= IP_VS_UDP_S_LAST) | ||
383 | return "ERR!"; | ||
384 | return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; | ||
385 | } | ||
386 | |||
/* UDP is stateless from IPVS' point of view: every packet simply
 * refreshes the NORMAL-state timeout.  Always returns 1. */
static int
udp_state_transition(struct ip_vs_conn *cp, int direction,
		     const struct sk_buff *skb,
		     struct ip_vs_protocol *pp)
{
	cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
	return 1;
}
395 | |||
/* Per-protocol init: set up the UDP app hash table and timeout table. */
static void udp_init(struct ip_vs_protocol *pp)
{
	IP_VS_INIT_HASH_TABLE(udp_apps);
	pp->timeout_table = udp_timeouts;
}
401 | |||
/* Per-protocol cleanup: nothing to release for UDP. */
static void udp_exit(struct ip_vs_protocol *pp)
{
}
405 | |||
406 | |||
/*
 * UDP protocol descriptor registered with the IPVS protocol layer;
 * wires the udp_* handlers above into the generic packet path.
 * UDP has no per-state timeout notion, so timeout_change is NULL.
 */
struct ip_vs_protocol ip_vs_protocol_udp = {
	.name =			"UDP",
	.protocol =		IPPROTO_UDP,
	.dont_defrag =		0,
	.init =			udp_init,
	.exit =			udp_exit,
	.conn_schedule =	udp_conn_schedule,
	.conn_in_get =		udp_conn_in_get,
	.conn_out_get =		udp_conn_out_get,
	.snat_handler =		udp_snat_handler,
	.dnat_handler =		udp_dnat_handler,
	.csum_check =		udp_csum_check,
	.state_transition =	udp_state_transition,
	.state_name =		udp_state_name,
	.register_app =		udp_register_app,
	.unregister_app =	udp_unregister_app,
	.app_conn_bind =	udp_app_conn_bind,
	.debug_packet =		ip_vs_tcpudp_debug_packet,
	.timeout_change =	NULL,
	.set_state_timeout =	udp_set_state_timeout,
};
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c new file mode 100644 index 000000000000..b23bab231cab --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_rr.c | |||
@@ -0,0 +1,118 @@ | |||
1 | /* | ||
2 | * IPVS: Round-Robin Scheduling module | ||
3 | * | ||
4 | * Version: $Id: ip_vs_rr.c,v 1.9 2002/09/15 08:14:08 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
7 | * Peter Kese <peter.kese@ijs.si> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * as published by the Free Software Foundation; either version | ||
12 | * 2 of the License, or (at your option) any later version. | ||
13 | * | ||
14 | * Fixes/Changes: | ||
15 | * Wensong Zhang : changed the ip_vs_rr_schedule to return dest | ||
16 | * Julian Anastasov : fixed the NULL pointer access bug in debugging | ||
 *     Wensong Zhang            :   changed some cosmetic things for debugging
18 | * Wensong Zhang : changed for the d-linked destination list | ||
19 | * Wensong Zhang : added the ip_vs_rr_update_svc | ||
20 | * Wensong Zhang : added any dest with weight=0 is quiesced | ||
21 | * | ||
22 | */ | ||
23 | |||
24 | #include <linux/module.h> | ||
25 | #include <linux/kernel.h> | ||
26 | |||
27 | #include <net/ip_vs.h> | ||
28 | |||
29 | |||
/* Point the round-robin cursor (sched_data) at the destination list
 * head so scheduling starts from the first real server. */
static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
{
	svc->sched_data = &svc->destinations;
	return 0;
}
35 | |||
36 | |||
/* Nothing to tear down: the cursor points into svc itself. */
static int ip_vs_rr_done_svc(struct ip_vs_service *svc)
{
	return 0;
}
41 | |||
42 | |||
/* The destination list changed: reset the cursor to the list head so
 * it can never point at an entry that was just unlinked. */
static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
{
	svc->sched_data = &svc->destinations;
	return 0;
}
48 | |||
49 | |||
50 | /* | ||
51 | * Round-Robin Scheduling | ||
52 | */ | ||
53 | static struct ip_vs_dest * | ||
54 | ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
55 | { | ||
56 | struct list_head *p, *q; | ||
57 | struct ip_vs_dest *dest; | ||
58 | |||
59 | IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n"); | ||
60 | |||
61 | write_lock(&svc->sched_lock); | ||
62 | p = (struct list_head *)svc->sched_data; | ||
63 | p = p->next; | ||
64 | q = p; | ||
65 | do { | ||
66 | /* skip list head */ | ||
67 | if (q == &svc->destinations) { | ||
68 | q = q->next; | ||
69 | continue; | ||
70 | } | ||
71 | |||
72 | dest = list_entry(q, struct ip_vs_dest, n_list); | ||
73 | if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && | ||
74 | atomic_read(&dest->weight) > 0) | ||
75 | /* HIT */ | ||
76 | goto out; | ||
77 | q = q->next; | ||
78 | } while (q != p); | ||
79 | write_unlock(&svc->sched_lock); | ||
80 | return NULL; | ||
81 | |||
82 | out: | ||
83 | svc->sched_data = q; | ||
84 | write_unlock(&svc->sched_lock); | ||
85 | IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u " | ||
86 | "activeconns %d refcnt %d weight %d\n", | ||
87 | NIPQUAD(dest->addr), ntohs(dest->port), | ||
88 | atomic_read(&dest->activeconns), | ||
89 | atomic_read(&dest->refcnt), atomic_read(&dest->weight)); | ||
90 | |||
91 | return dest; | ||
92 | } | ||
93 | |||
94 | |||
/* Scheduler descriptor registered with the IPVS core under "rr". */
static struct ip_vs_scheduler ip_vs_rr_scheduler = {
	.name =			"rr",			/* name */
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.init_service =		ip_vs_rr_init_svc,
	.done_service =		ip_vs_rr_done_svc,
	.update_service =	ip_vs_rr_update_svc,
	.schedule =		ip_vs_rr_schedule,
};
104 | |||
/* Register the rr scheduler at module load, remove it at unload. */
static int __init ip_vs_rr_init(void)
{
	INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list);
	return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
}

static void __exit ip_vs_rr_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
}

module_init(ip_vs_rr_init);
module_exit(ip_vs_rr_cleanup);
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c new file mode 100644 index 000000000000..0f7c56a225bd --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_sched.c | |||
@@ -0,0 +1,251 @@ | |||
1 | /* | ||
2 | * IPVS An implementation of the IP virtual server support for the | ||
3 | * LINUX operating system. IPVS is now implemented as a module | ||
4 | * over the Netfilter framework. IPVS can be used to build a | ||
5 | * high-performance and highly available server based on a | ||
6 | * cluster of servers. | ||
7 | * | ||
8 | * Version: $Id: ip_vs_sched.c,v 1.13 2003/05/10 03:05:23 wensong Exp $ | ||
9 | * | ||
10 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
11 | * Peter Kese <peter.kese@ijs.si> | ||
12 | * | ||
13 | * This program is free software; you can redistribute it and/or | ||
14 | * modify it under the terms of the GNU General Public License | ||
15 | * as published by the Free Software Foundation; either version | ||
16 | * 2 of the License, or (at your option) any later version. | ||
17 | * | ||
18 | * Changes: | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | #include <linux/module.h> | ||
23 | #include <linux/sched.h> | ||
24 | #include <linux/spinlock.h> | ||
25 | #include <asm/string.h> | ||
26 | #include <linux/kmod.h> | ||
27 | |||
28 | #include <net/ip_vs.h> | ||
29 | |||
30 | /* | ||
31 | * IPVS scheduler list | ||
32 | */ | ||
33 | static LIST_HEAD(ip_vs_schedulers); | ||
34 | |||
35 | /* lock for service table */ | ||
36 | static DEFINE_RWLOCK(__ip_vs_sched_lock); | ||
37 | |||
38 | |||
39 | /* | ||
40 | * Bind a service with a scheduler | ||
41 | */ | ||
42 | int ip_vs_bind_scheduler(struct ip_vs_service *svc, | ||
43 | struct ip_vs_scheduler *scheduler) | ||
44 | { | ||
45 | int ret; | ||
46 | |||
47 | if (svc == NULL) { | ||
48 | IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n"); | ||
49 | return -EINVAL; | ||
50 | } | ||
51 | if (scheduler == NULL) { | ||
52 | IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n"); | ||
53 | return -EINVAL; | ||
54 | } | ||
55 | |||
56 | svc->scheduler = scheduler; | ||
57 | |||
58 | if (scheduler->init_service) { | ||
59 | ret = scheduler->init_service(svc); | ||
60 | if (ret) { | ||
61 | IP_VS_ERR("ip_vs_bind_scheduler(): init error\n"); | ||
62 | return ret; | ||
63 | } | ||
64 | } | ||
65 | |||
66 | return 0; | ||
67 | } | ||
68 | |||
69 | |||
70 | /* | ||
71 | * Unbind a service with its scheduler | ||
72 | */ | ||
73 | int ip_vs_unbind_scheduler(struct ip_vs_service *svc) | ||
74 | { | ||
75 | struct ip_vs_scheduler *sched; | ||
76 | |||
77 | if (svc == NULL) { | ||
78 | IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n"); | ||
79 | return -EINVAL; | ||
80 | } | ||
81 | |||
82 | sched = svc->scheduler; | ||
83 | if (sched == NULL) { | ||
84 | IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n"); | ||
85 | return -EINVAL; | ||
86 | } | ||
87 | |||
88 | if (sched->done_service) { | ||
89 | if (sched->done_service(svc) != 0) { | ||
90 | IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n"); | ||
91 | return -EINVAL; | ||
92 | } | ||
93 | } | ||
94 | |||
95 | svc->scheduler = NULL; | ||
96 | return 0; | ||
97 | } | ||
98 | |||
99 | |||
100 | /* | ||
101 | * Get scheduler in the scheduler list by name | ||
102 | */ | ||
103 | static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) | ||
104 | { | ||
105 | struct ip_vs_scheduler *sched; | ||
106 | |||
107 | IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n", | ||
108 | sched_name); | ||
109 | |||
110 | read_lock_bh(&__ip_vs_sched_lock); | ||
111 | |||
112 | list_for_each_entry(sched, &ip_vs_schedulers, n_list) { | ||
113 | /* | ||
114 | * Test and get the modules atomically | ||
115 | */ | ||
116 | if (sched->module && !try_module_get(sched->module)) { | ||
117 | /* | ||
118 | * This scheduler is just deleted | ||
119 | */ | ||
120 | continue; | ||
121 | } | ||
122 | if (strcmp(sched_name, sched->name)==0) { | ||
123 | /* HIT */ | ||
124 | read_unlock_bh(&__ip_vs_sched_lock); | ||
125 | return sched; | ||
126 | } | ||
127 | if (sched->module) | ||
128 | module_put(sched->module); | ||
129 | } | ||
130 | |||
131 | read_unlock_bh(&__ip_vs_sched_lock); | ||
132 | return NULL; | ||
133 | } | ||
134 | |||
135 | |||
136 | /* | ||
137 | * Lookup scheduler and try to load it if it doesn't exist | ||
138 | */ | ||
139 | struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) | ||
140 | { | ||
141 | struct ip_vs_scheduler *sched; | ||
142 | |||
143 | /* | ||
144 | * Search for the scheduler by sched_name | ||
145 | */ | ||
146 | sched = ip_vs_sched_getbyname(sched_name); | ||
147 | |||
148 | /* | ||
149 | * If scheduler not found, load the module and search again | ||
150 | */ | ||
151 | if (sched == NULL) { | ||
152 | request_module("ip_vs_%s", sched_name); | ||
153 | sched = ip_vs_sched_getbyname(sched_name); | ||
154 | } | ||
155 | |||
156 | return sched; | ||
157 | } | ||
158 | |||
/* Release the module reference taken by ip_vs_scheduler_get(). */
void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
{
	if (scheduler->module)
		module_put(scheduler->module);
}
164 | |||
165 | |||
166 | /* | ||
167 | * Register a scheduler in the scheduler list | ||
168 | */ | ||
169 | int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) | ||
170 | { | ||
171 | struct ip_vs_scheduler *sched; | ||
172 | |||
173 | if (!scheduler) { | ||
174 | IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n"); | ||
175 | return -EINVAL; | ||
176 | } | ||
177 | |||
178 | if (!scheduler->name) { | ||
179 | IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n"); | ||
180 | return -EINVAL; | ||
181 | } | ||
182 | |||
183 | /* increase the module use count */ | ||
184 | ip_vs_use_count_inc(); | ||
185 | |||
186 | /* | ||
187 | * Make sure that the scheduler with this name doesn't exist | ||
188 | * in the scheduler list. | ||
189 | */ | ||
190 | sched = ip_vs_sched_getbyname(scheduler->name); | ||
191 | if (sched) { | ||
192 | ip_vs_scheduler_put(sched); | ||
193 | ip_vs_use_count_dec(); | ||
194 | IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " | ||
195 | "already existed in the system\n", scheduler->name); | ||
196 | return -EINVAL; | ||
197 | } | ||
198 | |||
199 | write_lock_bh(&__ip_vs_sched_lock); | ||
200 | |||
201 | if (scheduler->n_list.next != &scheduler->n_list) { | ||
202 | write_unlock_bh(&__ip_vs_sched_lock); | ||
203 | ip_vs_use_count_dec(); | ||
204 | IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " | ||
205 | "already linked\n", scheduler->name); | ||
206 | return -EINVAL; | ||
207 | } | ||
208 | |||
209 | /* | ||
210 | * Add it into the d-linked scheduler list | ||
211 | */ | ||
212 | list_add(&scheduler->n_list, &ip_vs_schedulers); | ||
213 | write_unlock_bh(&__ip_vs_sched_lock); | ||
214 | |||
215 | IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name); | ||
216 | |||
217 | return 0; | ||
218 | } | ||
219 | |||
220 | |||
221 | /* | ||
222 | * Unregister a scheduler from the scheduler list | ||
223 | */ | ||
224 | int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) | ||
225 | { | ||
226 | if (!scheduler) { | ||
227 | IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n"); | ||
228 | return -EINVAL; | ||
229 | } | ||
230 | |||
231 | write_lock_bh(&__ip_vs_sched_lock); | ||
232 | if (scheduler->n_list.next == &scheduler->n_list) { | ||
233 | write_unlock_bh(&__ip_vs_sched_lock); | ||
234 | IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler " | ||
235 | "is not in the list. failed\n", scheduler->name); | ||
236 | return -EINVAL; | ||
237 | } | ||
238 | |||
239 | /* | ||
240 | * Remove it from the d-linked scheduler list | ||
241 | */ | ||
242 | list_del(&scheduler->n_list); | ||
243 | write_unlock_bh(&__ip_vs_sched_lock); | ||
244 | |||
245 | /* decrease the module use count */ | ||
246 | ip_vs_use_count_dec(); | ||
247 | |||
248 | IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name); | ||
249 | |||
250 | return 0; | ||
251 | } | ||
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c new file mode 100644 index 000000000000..ff366f7390d9 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_sed.c | |||
@@ -0,0 +1,163 @@ | |||
1 | /* | ||
2 | * IPVS: Shortest Expected Delay scheduling module | ||
3 | * | ||
4 | * Version: $Id: ip_vs_sed.c,v 1.1 2003/05/10 03:06:08 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License | ||
10 | * as published by the Free Software Foundation; either version | ||
11 | * 2 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * Changes: | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | /* | ||
18 | * The SED algorithm attempts to minimize each job's expected delay until | ||
19 | * completion. The expected delay that the job will experience is | ||
20 | * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of | ||
 * jobs on the ith server and Ui is the fixed service rate (weight) of
22 | * the ith server. The SED algorithm adopts a greedy policy that each does | ||
23 | * what is in its own best interest, i.e. to join the queue which would | ||
24 | * minimize its expected delay of completion. | ||
25 | * | ||
26 | * See the following paper for more information: | ||
27 | * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing | ||
28 | * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, | ||
29 | * pages 986-994, 1988. | ||
30 | * | ||
31 | * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me. | ||
32 | * | ||
33 | * The difference between SED and WLC is that SED includes the incoming | ||
34 | * job in the cost function (the increment of 1). SED may outperform | ||
35 | * WLC, while scheduling big jobs under larger heterogeneous systems | ||
36 | * (the server weight varies a lot). | ||
37 | * | ||
38 | */ | ||
39 | |||
40 | #include <linux/module.h> | ||
41 | #include <linux/kernel.h> | ||
42 | |||
43 | #include <net/ip_vs.h> | ||
44 | |||
45 | |||
/* SED keeps no per-service state: nothing to initialize. */
static int
ip_vs_sed_init_svc(struct ip_vs_service *svc)
{
	return 0;
}
51 | |||
52 | |||
/* SED keeps no per-service state: nothing to tear down. */
static int
ip_vs_sed_done_svc(struct ip_vs_service *svc)
{
	return 0;
}
58 | |||
59 | |||
/* Destination-list changes need no SED bookkeeping. */
static int
ip_vs_sed_update_svc(struct ip_vs_service *svc)
{
	return 0;
}
65 | |||
66 | |||
/* Per-destination cost for SED: Ci + 1, i.e. the active connection
 * count plus the incoming job itself (the defining difference from
 * WLC's cost function). */
static inline unsigned int
ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
{
	/*
	 * We only use the active connection number in the cost
	 * calculation here.
	 */
	return atomic_read(&dest->activeconns) + 1;
}
76 | |||
77 | |||
78 | /* | ||
79 | * Weighted Least Connection scheduling | ||
80 | */ | ||
81 | static struct ip_vs_dest * | ||
82 | ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
83 | { | ||
84 | struct ip_vs_dest *dest, *least; | ||
85 | unsigned int loh, doh; | ||
86 | |||
87 | IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n"); | ||
88 | |||
89 | /* | ||
90 | * We calculate the load of each dest server as follows: | ||
91 | * (server expected overhead) / dest->weight | ||
92 | * | ||
93 | * Remember -- no floats in kernel mode!!! | ||
94 | * The comparison of h1*w2 > h2*w1 is equivalent to that of | ||
95 | * h1/w1 > h2/w2 | ||
96 | * if every weight is larger than zero. | ||
97 | * | ||
98 | * The server with weight=0 is quiesced and will not receive any | ||
99 | * new connections. | ||
100 | */ | ||
101 | |||
102 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
103 | if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && | ||
104 | atomic_read(&dest->weight) > 0) { | ||
105 | least = dest; | ||
106 | loh = ip_vs_sed_dest_overhead(least); | ||
107 | goto nextstage; | ||
108 | } | ||
109 | } | ||
110 | return NULL; | ||
111 | |||
112 | /* | ||
113 | * Find the destination with the least load. | ||
114 | */ | ||
115 | nextstage: | ||
116 | list_for_each_entry_continue(dest, &svc->destinations, n_list) { | ||
117 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) | ||
118 | continue; | ||
119 | doh = ip_vs_sed_dest_overhead(dest); | ||
120 | if (loh * atomic_read(&dest->weight) > | ||
121 | doh * atomic_read(&least->weight)) { | ||
122 | least = dest; | ||
123 | loh = doh; | ||
124 | } | ||
125 | } | ||
126 | |||
127 | IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u " | ||
128 | "activeconns %d refcnt %d weight %d overhead %d\n", | ||
129 | NIPQUAD(least->addr), ntohs(least->port), | ||
130 | atomic_read(&least->activeconns), | ||
131 | atomic_read(&least->refcnt), | ||
132 | atomic_read(&least->weight), loh); | ||
133 | |||
134 | return least; | ||
135 | } | ||
136 | |||
137 | |||
/* Scheduler descriptor registered with the IPVS core under "sed". */
static struct ip_vs_scheduler ip_vs_sed_scheduler =
{
	.name =			"sed",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.init_service =		ip_vs_sed_init_svc,
	.done_service =		ip_vs_sed_done_svc,
	.update_service =	ip_vs_sed_update_svc,
	.schedule =		ip_vs_sed_schedule,
};
148 | |||
149 | |||
/* Register the sed scheduler at module load, remove it at unload. */
static int __init ip_vs_sed_init(void)
{
	INIT_LIST_HEAD(&ip_vs_sed_scheduler.n_list);
	return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
}

static void __exit ip_vs_sed_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
}

module_init(ip_vs_sed_init);
module_exit(ip_vs_sed_cleanup);
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c new file mode 100644 index 000000000000..6f7c50e44a39 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_sh.c | |||
@@ -0,0 +1,255 @@ | |||
1 | /* | ||
2 | * IPVS: Source Hashing scheduling module | ||
3 | * | ||
4 | * Version: $Id: ip_vs_sh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Wensong Zhang <wensong@gnuchina.org> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License | ||
10 | * as published by the Free Software Foundation; either version | ||
11 | * 2 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * Changes: | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | /* | ||
18 | * The sh algorithm is to select server by the hash key of source IP | ||
19 | * address. The pseudo code is as follows: | ||
20 | * | ||
21 | * n <- servernode[src_ip]; | ||
22 | * if (n is dead) OR | ||
23 | * (n is overloaded) or (n.weight <= 0) then | ||
24 | * return NULL; | ||
25 | * | ||
26 | * return n; | ||
27 | * | ||
 * Note that servernode is a 256-bucket hash table that maps the hash
29 | * index derived from packet source IP address to the current server | ||
30 | * array. If the sh scheduler is used in cache cluster, it is good to | ||
31 | * combine it with cache_bypass feature. When the statically assigned | ||
32 | * server is dead or overloaded, the load balancer can bypass the cache | ||
33 | * server and send requests to the original server directly. | ||
34 | * | ||
35 | */ | ||
36 | |||
37 | #include <linux/module.h> | ||
38 | #include <linux/kernel.h> | ||
39 | |||
40 | #include <net/ip_vs.h> | ||
41 | |||
42 | |||
43 | /* | ||
44 | * IPVS SH bucket | ||
45 | */ | ||
struct ip_vs_sh_bucket {
	/* real server (cache); a dest refcnt is held while assigned here
	   (taken in ip_vs_sh_assign, dropped in ip_vs_sh_flush) */
	struct ip_vs_dest       *dest;
};
49 | |||
50 | /* | ||
51 | * for IPVS SH entry hash table | ||
52 | */ | ||
/* Hash-table geometry; bit width may be overridden at build time. */
#ifndef CONFIG_IP_VS_SH_TAB_BITS
#define CONFIG_IP_VS_SH_TAB_BITS        8
#endif
#define IP_VS_SH_TAB_BITS               CONFIG_IP_VS_SH_TAB_BITS
#define IP_VS_SH_TAB_SIZE               (1 << IP_VS_SH_TAB_BITS)	/* bucket count */
#define IP_VS_SH_TAB_MASK               (IP_VS_SH_TAB_SIZE - 1)		/* index mask */
59 | |||
60 | |||
61 | /* | ||
62 | * Returns hash value for IPVS SH entry | ||
63 | */ | ||
64 | static inline unsigned ip_vs_sh_hashkey(__u32 addr) | ||
65 | { | ||
66 | return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK; | ||
67 | } | ||
68 | |||
69 | |||
70 | /* | ||
71 | * Get ip_vs_dest associated with supplied parameters. | ||
72 | */ | ||
73 | static inline struct ip_vs_dest * | ||
74 | ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __u32 addr) | ||
75 | { | ||
76 | return (tbl[ip_vs_sh_hashkey(addr)]).dest; | ||
77 | } | ||
78 | |||
79 | |||
80 | /* | ||
81 | * Assign all the hash buckets of the specified table with the service. | ||
82 | */ | ||
/*
 *	Assign all the hash buckets of the specified table with the service.
 *	Walks the service's destination list round-robin, pinning one dest
 *	per bucket (taking a refcnt on each).  With an empty destination
 *	list every bucket is set to NULL.  Always returns 0.
 */
static int
ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_sh_bucket *b;
	struct list_head *p;
	struct ip_vs_dest *dest;

	b = tbl;
	p = &svc->destinations;
	for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
		if (list_empty(p)) {
			b->dest = NULL;
		} else {
			/* skip the list head when wrapping around */
			if (p == &svc->destinations)
				p = p->next;

			dest = list_entry(p, struct ip_vs_dest, n_list);
			atomic_inc(&dest->refcnt);	/* bucket holds a reference */
			b->dest = dest;

			p = p->next;
		}
		b++;
	}
	return 0;
}
110 | |||
111 | |||
112 | /* | ||
113 | * Flush all the hash buckets of the specified table. | ||
114 | */ | ||
115 | static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) | ||
116 | { | ||
117 | int i; | ||
118 | struct ip_vs_sh_bucket *b; | ||
119 | |||
120 | b = tbl; | ||
121 | for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { | ||
122 | if (b->dest) { | ||
123 | atomic_dec(&b->dest->refcnt); | ||
124 | b->dest = NULL; | ||
125 | } | ||
126 | b++; | ||
127 | } | ||
128 | } | ||
129 | |||
130 | |||
/*
 *	Allocate and populate this service's source-hash table.
 *	Uses GFP_ATOMIC — presumably because service setup can run in
 *	atomic context; confirm against the IPVS control path.
 *	Returns 0 on success, -ENOMEM on allocation failure.
 */
static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
{
	struct ip_vs_sh_bucket *tbl;

	/* allocate the SH table for this service */
	tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
		      GFP_ATOMIC);
	if (tbl == NULL) {
		IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
		return -ENOMEM;
	}
	svc->sched_data = tbl;
	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
		  "current service\n",
		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);

	/* assign the hash buckets with the updated service */
	ip_vs_sh_assign(tbl, svc);

	return 0;
}
152 | |||
153 | |||
/*
 *	Tear down this service's source-hash table: drop all bucket
 *	references, then free the table.  Always returns 0.
 */
static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
{
	struct ip_vs_sh_bucket *tbl = svc->sched_data;

	/* got to clean up hash buckets here */
	ip_vs_sh_flush(tbl);

	/* release the table itself */
	kfree(svc->sched_data);
	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);

	return 0;
}
168 | |||
169 | |||
170 | static int ip_vs_sh_update_svc(struct ip_vs_service *svc) | ||
171 | { | ||
172 | struct ip_vs_sh_bucket *tbl = svc->sched_data; | ||
173 | |||
174 | /* got to clean up hash buckets here */ | ||
175 | ip_vs_sh_flush(tbl); | ||
176 | |||
177 | /* assign the hash buckets with the updated service */ | ||
178 | ip_vs_sh_assign(tbl, svc); | ||
179 | |||
180 | return 0; | ||
181 | } | ||
182 | |||
183 | |||
184 | /* | ||
185 | * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, | ||
186 | * consider that the server is overloaded here. | ||
187 | */ | ||
/* Non-zero when the dest is flagged overloaded (returns the raw flag bit). */
static inline int is_overloaded(struct ip_vs_dest *dest)
{
	return dest->flags & IP_VS_DEST_F_OVERLOAD;
}
192 | |||
193 | |||
194 | /* | ||
195 | * Source Hashing scheduling | ||
196 | */ | ||
/*
 *	Source Hashing scheduling: look up the bucket keyed by the
 *	packet's source address and return the statically mapped server.
 *	Returns NULL when the bucket is empty or the mapped server is
 *	unavailable, weightless or overloaded.
 */
static struct ip_vs_dest *
ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
	struct ip_vs_dest *dest;
	struct ip_vs_sh_bucket *tbl;
	struct iphdr *iph = skb->nh.iph;

	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");

	tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
	dest = ip_vs_sh_get(tbl, iph->saddr);
	if (!dest
	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
	    || atomic_read(&dest->weight) <= 0
	    || is_overloaded(dest)) {
		return NULL;
	}

	IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
		  "--> server %u.%u.%u.%u:%d\n",
		  NIPQUAD(iph->saddr),
		  NIPQUAD(dest->addr),
		  ntohs(dest->port));

	return dest;
}
223 | |||
224 | |||
225 | /* | ||
226 | * IPVS SH Scheduler structure | ||
227 | */ | ||
/*
 *	Registration record for the SH (source hashing) scheduler.
 */
static struct ip_vs_scheduler ip_vs_sh_scheduler =
{
	.name =			"sh",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.init_service =		ip_vs_sh_init_svc,
	.done_service =		ip_vs_sh_done_svc,
	.update_service =	ip_vs_sh_update_svc,
	.schedule =		ip_vs_sh_schedule,
};
238 | |||
239 | |||
/* Module init: register the SH scheduler with the IPVS core. */
static int __init ip_vs_sh_init(void)
{
	INIT_LIST_HEAD(&ip_vs_sh_scheduler.n_list);
	return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
}
245 | |||
246 | |||
/* Module exit: unregister the SH scheduler. */
static void __exit ip_vs_sh_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
}
251 | |||
252 | |||
/* Module entry/exit points and license declaration. */
module_init(ip_vs_sh_init);
module_exit(ip_vs_sh_cleanup);
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c new file mode 100644 index 000000000000..25c479550a32 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_sync.c | |||
@@ -0,0 +1,892 @@ | |||
1 | /* | ||
2 | * IPVS An implementation of the IP virtual server support for the | ||
3 | * LINUX operating system. IPVS is now implemented as a module | ||
4 | * over the NetFilter framework. IPVS can be used to build a | ||
5 | * high-performance and highly available server based on a | ||
6 | * cluster of servers. | ||
7 | * | ||
8 | * Version: $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $ | ||
9 | * | ||
10 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
11 | * | ||
12 | * ip_vs_sync: sync connection info from master load balancer to backups | ||
13 | * through multicast | ||
14 | * | ||
15 | * Changes: | ||
16 | * Alexandre Cassen : Added master & backup support at a time. | ||
17 | * Alexandre Cassen : Added SyncID support for incoming sync | ||
18 | * messages filtering. | ||
19 | * Justin Ossevoort : Fix endian problem on sync message size. | ||
20 | */ | ||
21 | |||
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>		/* for memset */
#include <linux/net.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/igmp.h>                 /* for ip_mc_join_group */

#include <net/ip.h>
#include <net/sock.h>
#include <asm/uaccess.h>                /* for get_fs and set_fs */

#include <net/ip_vs.h>
36 | |||
37 | #define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ | ||
38 | #define IP_VS_SYNC_PORT 8848 /* multicast port */ | ||
39 | |||
40 | |||
41 | /* | ||
42 | * IPVS sync connection entry | ||
43 | */ | ||
/*
 *	IPVS sync connection entry — the on-the-wire layout of one
 *	connection record inside a sync message.  Addresses/ports are
 *	copied straight from the connection (already network order);
 *	flags and state are converted with htons on send.
 */
struct ip_vs_sync_conn {
	__u8			reserved;

	/* Protocol, addresses and port numbers */
	__u8			protocol;       /* Which protocol (TCP/UDP) */
	__u16			cport;
	__u16			vport;
	__u16			dport;
	__u32			caddr;          /* client address */
	__u32			vaddr;          /* virtual address */
	__u32			daddr;          /* destination address */

	/* Flags and state transition */
	__u16			flags;          /* status flags */
	__u16			state;          /* state info */

	/* The sequence options start here */
};

/* Optional trailer carrying TCP sequence adjustment state. */
struct ip_vs_sync_conn_options {
	struct ip_vs_seq        in_seq;         /* incoming seq. struct */
	struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
};

/* Timeout applied to connections created from received sync entries. */
#define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ)
#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn))
#define FULL_CONN_SIZE  \
(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
72 | |||
73 | |||
74 | /* | ||
      The master multicasts messages to the backup load balancers in the
76 | following format. | ||
77 | |||
78 | 0 1 2 3 | ||
79 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
80 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
81 | | Count Conns | SyncID | Size | | ||
82 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
83 | | | | ||
84 | | IPVS Sync Connection (1) | | ||
85 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
86 | | . | | ||
87 | | . | | ||
88 | | . | | ||
89 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
90 | | | | ||
91 | | IPVS Sync Connection (n) | | ||
92 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
93 | */ | ||
94 | |||
/* Bytes in the ip_vs_sync_mesg header (nr_conns + syncid + size). */
#define SYNC_MESG_HEADER_LEN	4

struct ip_vs_sync_mesg {
	__u8                    nr_conns;	/* number of entries that follow */
	__u8                    syncid;		/* sender's sync instance id */
	__u16                   size;		/* total length; network order on the wire */

	/* ip_vs_sync_conn entries start here */
};

/* the maximum length of sync (sending/receiving) message */
static int sync_send_mesg_maxlen;
static int sync_recv_mesg_maxlen;

/* In-memory buffer accumulating sync entries before transmission. */
struct ip_vs_sync_buff {
	struct list_head        list;
	unsigned long           firstuse;	/* jiffies when the buffer was created */

	/* pointers for the message data */
	struct ip_vs_sync_mesg  *mesg;
	unsigned char           *head;		/* next free byte */
	unsigned char           *end;		/* one past the last usable byte */
};
118 | |||
119 | |||
/* the sync_buff list head and the lock protecting it */
static LIST_HEAD(ip_vs_sync_queue);
static DEFINE_SPINLOCK(ip_vs_sync_lock);

/* current sync_buff for accepting new conn entries (NULL = none open) */
static struct ip_vs_sync_buff   *curr_sb = NULL;
static DEFINE_SPINLOCK(curr_sb_lock);

/* ipvs sync daemon state */
volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
volatile int ip_vs_master_syncid = 0;
volatile int ip_vs_backup_syncid = 0;

/* multicast interface name */
char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];

/* multicast addr, filled in by sync_thread() */
static struct sockaddr_in mcast_addr;
139 | |||
140 | |||
/* Append a finished buffer to the transmit queue (queue lock held briefly). */
static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
{
	spin_lock(&ip_vs_sync_lock);
	list_add_tail(&sb->list, &ip_vs_sync_queue);
	spin_unlock(&ip_vs_sync_lock);
}
147 | |||
148 | static inline struct ip_vs_sync_buff * sb_dequeue(void) | ||
149 | { | ||
150 | struct ip_vs_sync_buff *sb; | ||
151 | |||
152 | spin_lock_bh(&ip_vs_sync_lock); | ||
153 | if (list_empty(&ip_vs_sync_queue)) { | ||
154 | sb = NULL; | ||
155 | } else { | ||
156 | sb = list_entry(ip_vs_sync_queue.next, | ||
157 | struct ip_vs_sync_buff, | ||
158 | list); | ||
159 | list_del(&sb->list); | ||
160 | } | ||
161 | spin_unlock_bh(&ip_vs_sync_lock); | ||
162 | |||
163 | return sb; | ||
164 | } | ||
165 | |||
166 | static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) | ||
167 | { | ||
168 | struct ip_vs_sync_buff *sb; | ||
169 | |||
170 | if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) | ||
171 | return NULL; | ||
172 | |||
173 | if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { | ||
174 | kfree(sb); | ||
175 | return NULL; | ||
176 | } | ||
177 | sb->mesg->nr_conns = 0; | ||
178 | sb->mesg->syncid = ip_vs_master_syncid; | ||
179 | sb->mesg->size = 4; | ||
180 | sb->head = (unsigned char *)sb->mesg + 4; | ||
181 | sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; | ||
182 | sb->firstuse = jiffies; | ||
183 | return sb; | ||
184 | } | ||
185 | |||
/* Free a sync buffer: message payload first, then the descriptor itself. */
static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
{
	kfree(sb->mesg);
	kfree(sb);
}
191 | |||
/*
 * Get the current sync buffer if it has been created for more
 * than the specified time or the specified time is zero.
 * Ownership of the returned buffer passes to the caller; curr_sb is
 * reset so the next sync entry opens a fresh buffer.
 * NOTE(review): time_before(jiffies - firstuse, time) reads as
 * "age < time", which looks inverted relative to the comment above —
 * confirm the intended semantics before changing either.
 */
static inline struct ip_vs_sync_buff *
get_curr_sync_buff(unsigned long time)
{
	struct ip_vs_sync_buff *sb;

	spin_lock_bh(&curr_sb_lock);
	if (curr_sb && (time == 0 ||
			time_before(jiffies - curr_sb->firstuse, time))) {
		sb = curr_sb;
		curr_sb = NULL;
	} else
		sb = NULL;
	spin_unlock_bh(&curr_sb_lock);
	return sb;
}
211 | |||
212 | |||
213 | /* | ||
214 | * Add an ip_vs_conn information into the current sync_buff. | ||
215 | * Called by ip_vs_in. | ||
216 | */ | ||
/*
 *	Add an ip_vs_conn information into the current sync_buff.
 *	Called by ip_vs_in.  Serialized under curr_sb_lock; allocates a
 *	fresh buffer on demand, and queues the buffer for transmission
 *	once another full-size entry would no longer fit.
 */
void ip_vs_sync_conn(struct ip_vs_conn *cp)
{
	struct ip_vs_sync_mesg *m;
	struct ip_vs_sync_conn *s;
	int len;

	spin_lock(&curr_sb_lock);
	if (!curr_sb) {
		if (!(curr_sb=ip_vs_sync_buff_create())) {
			spin_unlock(&curr_sb_lock);
			IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
			return;
		}
	}

	/* sequence options are appended only for connections that carry
	   sequence adjustment state */
	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
		SIMPLE_CONN_SIZE;
	m = curr_sb->mesg;
	s = (struct ip_vs_sync_conn *)curr_sb->head;

	/* copy members */
	s->protocol = cp->protocol;
	s->cport = cp->cport;
	s->vport = cp->vport;
	s->dport = cp->dport;
	s->caddr = cp->caddr;
	s->vaddr = cp->vaddr;
	s->daddr = cp->daddr;
	/* HASHED is local to this box; never propagate it */
	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
	s->state = htons(cp->state);
	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
		struct ip_vs_sync_conn_options *opt =
			(struct ip_vs_sync_conn_options *)&s[1];
		/* copies in_seq AND out_seq in one go — assumes they are
		   adjacent in ip_vs_conn, mirroring the options struct;
		   confirm against the ip_vs_conn layout */
		memcpy(opt, &cp->in_seq, sizeof(*opt));
	}

	m->nr_conns++;
	m->size += len;
	curr_sb->head += len;

	/* check if there is a space for next one */
	if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
		sb_queue_tail(curr_sb);
		curr_sb = NULL;
	}
	spin_unlock(&curr_sb_lock);

	/* synchronize its controller if it has; recursion depth equals
	   the control-chain length (presumably short — confirm) */
	if (cp->control)
		ip_vs_sync_conn(cp->control);
}
268 | |||
269 | |||
270 | /* | ||
271 | * Process received multicast message and create the corresponding | ||
272 | * ip_vs_conn entries. | ||
273 | */ | ||
274 | static void ip_vs_process_message(const char *buffer, const size_t buflen) | ||
275 | { | ||
276 | struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; | ||
277 | struct ip_vs_sync_conn *s; | ||
278 | struct ip_vs_sync_conn_options *opt; | ||
279 | struct ip_vs_conn *cp; | ||
280 | char *p; | ||
281 | int i; | ||
282 | |||
283 | /* Convert size back to host byte order */ | ||
284 | m->size = ntohs(m->size); | ||
285 | |||
286 | if (buflen != m->size) { | ||
287 | IP_VS_ERR("bogus message\n"); | ||
288 | return; | ||
289 | } | ||
290 | |||
291 | /* SyncID sanity check */ | ||
292 | if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { | ||
293 | IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", | ||
294 | m->syncid); | ||
295 | return; | ||
296 | } | ||
297 | |||
298 | p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); | ||
299 | for (i=0; i<m->nr_conns; i++) { | ||
300 | s = (struct ip_vs_sync_conn *)p; | ||
301 | cp = ip_vs_conn_in_get(s->protocol, | ||
302 | s->caddr, s->cport, | ||
303 | s->vaddr, s->vport); | ||
304 | if (!cp) { | ||
305 | cp = ip_vs_conn_new(s->protocol, | ||
306 | s->caddr, s->cport, | ||
307 | s->vaddr, s->vport, | ||
308 | s->daddr, s->dport, | ||
309 | ntohs(s->flags), NULL); | ||
310 | if (!cp) { | ||
311 | IP_VS_ERR("ip_vs_conn_new failed\n"); | ||
312 | return; | ||
313 | } | ||
314 | cp->state = ntohs(s->state); | ||
315 | } else if (!cp->dest) { | ||
316 | /* it is an entry created by the synchronization */ | ||
317 | cp->state = ntohs(s->state); | ||
318 | cp->flags = ntohs(s->flags) | IP_VS_CONN_F_HASHED; | ||
319 | } /* Note that we don't touch its state and flags | ||
320 | if it is a normal entry. */ | ||
321 | |||
322 | if (ntohs(s->flags) & IP_VS_CONN_F_SEQ_MASK) { | ||
323 | opt = (struct ip_vs_sync_conn_options *)&s[1]; | ||
324 | memcpy(&cp->in_seq, opt, sizeof(*opt)); | ||
325 | p += FULL_CONN_SIZE; | ||
326 | } else | ||
327 | p += SIMPLE_CONN_SIZE; | ||
328 | |||
329 | atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); | ||
330 | cp->timeout = IP_VS_SYNC_CONN_TIMEOUT; | ||
331 | ip_vs_conn_put(cp); | ||
332 | |||
333 | if (p > buffer+buflen) { | ||
334 | IP_VS_ERR("bogus message\n"); | ||
335 | return; | ||
336 | } | ||
337 | } | ||
338 | } | ||
339 | |||
340 | |||
341 | /* | ||
342 | * Setup loopback of outgoing multicasts on a sending socket | ||
343 | */ | ||
/*
 *	Setup loopback of outgoing multicasts on a sending socket.
 *	Kernel-side equivalent of the IP_MULTICAST_LOOP sockopt.
 */
static void set_mcast_loop(struct sock *sk, u_char loop)
{
	struct inet_sock *inet = inet_sk(sk);

	/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
	lock_sock(sk);
	inet->mc_loop = loop ? 1 : 0;
	release_sock(sk);
}
353 | |||
354 | /* | ||
355 | * Specify TTL for outgoing multicasts on a sending socket | ||
356 | */ | ||
/*
 *	Specify TTL for outgoing multicasts on a sending socket.
 *	Kernel-side equivalent of the IP_MULTICAST_TTL sockopt.
 */
static void set_mcast_ttl(struct sock *sk, u_char ttl)
{
	struct inet_sock *inet = inet_sk(sk);

	/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
	lock_sock(sk);
	inet->mc_ttl = ttl;
	release_sock(sk);
}
366 | |||
367 | /* | ||
 *      Specify default interface for outgoing multicasts
369 | */ | ||
/*
 *	Specify default interface for outgoing multicasts by name.
 *	Returns 0, -ENODEV when the device does not exist, or -EINVAL
 *	when the socket is already bound to a different device.
 */
static int set_mcast_if(struct sock *sk, char *ifname)
{
	struct net_device *dev;
	struct inet_sock *inet = inet_sk(sk);

	if ((dev = __dev_get_by_name(ifname)) == NULL)
		return -ENODEV;

	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
		return -EINVAL;

	lock_sock(sk);
	inet->mc_index = dev->ifindex;
	/* inet->mc_addr = 0; */
	release_sock(sk);

	return 0;
}
388 | |||
389 | |||
390 | /* | ||
391 | * Set the maximum length of sync message according to the | ||
392 | * specified interface's MTU. | ||
393 | */ | ||
/*
 *	Set the maximum length of sync message according to the
 *	specified interface's MTU.  Master side rounds down to a whole
 *	number of simple entries; backup side accepts anything up to one
 *	MTU-sized UDP payload.  Returns 0 or -ENODEV.
 */
static int set_sync_mesg_maxlen(int sync_state)
{
	struct net_device *dev;
	int num;

	if (sync_state == IP_VS_STATE_MASTER) {
		if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL)
			return -ENODEV;

		/* the extra 20 bytes of headroom — presumably reserved
		   for IP options; confirm */
		num = (dev->mtu - sizeof(struct iphdr) -
		       sizeof(struct udphdr) -
		       SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
		sync_send_mesg_maxlen =
			SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num;
		IP_VS_DBG(7, "setting the maximum length of sync sending "
			  "message %d.\n", sync_send_mesg_maxlen);
	} else if (sync_state == IP_VS_STATE_BACKUP) {
		if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL)
			return -ENODEV;

		sync_recv_mesg_maxlen = dev->mtu -
			sizeof(struct iphdr) - sizeof(struct udphdr);
		IP_VS_DBG(7, "setting the maximum length of sync receiving "
			  "message %d.\n", sync_recv_mesg_maxlen);
	}

	return 0;
}
422 | |||
423 | |||
424 | /* | ||
425 | * Join a multicast group. | ||
426 | * the group is specified by a class D multicast address 224.0.0.0/8 | ||
427 | * in the in_addr structure passed in as a parameter. | ||
428 | */ | ||
/*
 *	Join a multicast group.
 *	The group is specified by a class D multicast address 224.0.0.0/8
 *	in the in_addr structure passed in as a parameter.  Returns 0 on
 *	success, -ENODEV/-EINVAL on interface problems, or the
 *	ip_mc_join_group() error code.
 */
static int
join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
{
	struct ip_mreqn mreq;
	struct net_device *dev;
	int ret;

	memset(&mreq, 0, sizeof(mreq));
	memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));

	if ((dev = __dev_get_by_name(ifname)) == NULL)
		return -ENODEV;
	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
		return -EINVAL;

	mreq.imr_ifindex = dev->ifindex;

	lock_sock(sk);
	ret = ip_mc_join_group(sk, &mreq);
	release_sock(sk);

	return ret;
}
452 | |||
453 | |||
454 | static int bind_mcastif_addr(struct socket *sock, char *ifname) | ||
455 | { | ||
456 | struct net_device *dev; | ||
457 | u32 addr; | ||
458 | struct sockaddr_in sin; | ||
459 | |||
460 | if ((dev = __dev_get_by_name(ifname)) == NULL) | ||
461 | return -ENODEV; | ||
462 | |||
463 | addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); | ||
464 | if (!addr) | ||
465 | IP_VS_ERR("You probably need to specify IP address on " | ||
466 | "multicast interface.\n"); | ||
467 | |||
468 | IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n", | ||
469 | ifname, NIPQUAD(addr)); | ||
470 | |||
471 | /* Now bind the socket with the address of multicast interface */ | ||
472 | sin.sin_family = AF_INET; | ||
473 | sin.sin_addr.s_addr = addr; | ||
474 | sin.sin_port = 0; | ||
475 | |||
476 | return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); | ||
477 | } | ||
478 | |||
479 | /* | ||
480 | * Set up sending multicast socket over UDP | ||
481 | */ | ||
/*
 *	Set up sending multicast socket over UDP: pick the outgoing
 *	interface, disable loopback, limit TTL to 1 (stay on the local
 *	segment), bind to the interface address and connect to the sync
 *	multicast group.  Returns NULL on any failure.
 */
static struct socket * make_send_sock(void)
{
	struct socket *sock;

	/* First create a socket */
	if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
		IP_VS_ERR("Error during creation of socket; terminating\n");
		return NULL;
	}

	if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) {
		IP_VS_ERR("Error setting outbound mcast interface\n");
		goto error;
	}

	set_mcast_loop(sock->sk, 0);
	set_mcast_ttl(sock->sk, 1);

	if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) {
		IP_VS_ERR("Error binding address of the mcast interface\n");
		goto error;
	}

	if (sock->ops->connect(sock,
			       (struct sockaddr*)&mcast_addr,
			       sizeof(struct sockaddr), 0) < 0) {
		IP_VS_ERR("Error connecting to the multicast addr\n");
		goto error;
	}

	return sock;

  error:
	sock_release(sock);
	return NULL;
}
518 | |||
519 | |||
520 | /* | ||
521 | * Set up receiving multicast socket over UDP | ||
522 | */ | ||
/*
 *	Set up receiving multicast socket over UDP: mark it reusable,
 *	bind to the sync multicast address/port, and join the group on
 *	the configured backup interface.  Returns NULL on any failure.
 */
static struct socket * make_receive_sock(void)
{
	struct socket *sock;

	/* First create a socket */
	if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
		IP_VS_ERR("Error during creation of socket; terminating\n");
		return NULL;
	}

	/* it is equivalent to the REUSEADDR option in user-space */
	sock->sk->sk_reuse = 1;

	if (sock->ops->bind(sock,
			    (struct sockaddr*)&mcast_addr,
			    sizeof(struct sockaddr)) < 0) {
		IP_VS_ERR("Error binding to the multicast addr\n");
		goto error;
	}

	/* join the multicast group */
	if (join_mcast_group(sock->sk,
			     (struct in_addr*)&mcast_addr.sin_addr,
			     ip_vs_backup_mcast_ifn) < 0) {
		IP_VS_ERR("Error joining to the multicast group\n");
		goto error;
	}

	return sock;

  error:
	sock_release(sock);
	return NULL;
}
557 | |||
558 | |||
/*
 *	Send a buffer on the (connected) multicast socket without
 *	blocking.  Returns the kernel_sendmsg() result: bytes sent, or a
 *	negative error.
 */
static int
ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
{
	struct msghdr	msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
	struct kvec	iov;
	int		len;

	EnterFunction(7);
	iov.iov_base     = (void *)buffer;
	iov.iov_len      = length;

	len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));

	LeaveFunction(7);
	return len;
}
575 | |||
/*
 *	Transmit one sync message.  The size field is flipped to network
 *	byte order in place just before sending (the receiver converts it
 *	back in ip_vs_process_message).
 */
static void
ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
{
	int msize;

	msize = msg->size;

	/* Put size in network byte order */
	msg->size = htons(msg->size);

	if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
		IP_VS_ERR("ip_vs_send_async error\n");
}
589 | |||
590 | static int | ||
591 | ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) | ||
592 | { | ||
593 | struct msghdr msg = {NULL,}; | ||
594 | struct kvec iov; | ||
595 | int len; | ||
596 | |||
597 | EnterFunction(7); | ||
598 | |||
599 | /* Receive a packet */ | ||
600 | iov.iov_base = buffer; | ||
601 | iov.iov_len = (size_t)buflen; | ||
602 | |||
603 | len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); | ||
604 | |||
605 | if (len < 0) | ||
606 | return -1; | ||
607 | |||
608 | LeaveFunction(7); | ||
609 | return len; | ||
610 | } | ||
611 | |||
612 | |||
/* Daemon bookkeeping: pids of the running sync threads (0 = not running)
 * and flags requesting them to stop. */
static DECLARE_WAIT_QUEUE_HEAD(sync_wait);
static pid_t sync_master_pid = 0;
static pid_t sync_backup_pid = 0;

static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait);
static int stop_master_sync = 0;
static int stop_backup_sync = 0;
620 | |||
/*
 *	Master daemon main loop: once a second, drain the queue of
 *	finished sync buffers onto the multicast socket, flush the
 *	in-progress buffer via get_curr_sync_buff(2*HZ), and exit when
 *	stop_master_sync is raised (freeing anything still pending).
 */
static void sync_master_loop(void)
{
	struct socket *sock;
	struct ip_vs_sync_buff *sb;

	/* create the sending multicast socket */
	sock = make_send_sock();
	if (!sock)
		return;

	IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
		   "syncid = %d\n",
		   ip_vs_master_mcast_ifn, ip_vs_master_syncid);

	for (;;) {
		while ((sb=sb_dequeue())) {
			ip_vs_send_sync_msg(sock, sb->mesg);
			ip_vs_sync_buff_release(sb);
		}

		/* check if entries stay in curr_sb for 2 seconds */
		if ((sb = get_curr_sync_buff(2*HZ))) {
			ip_vs_send_sync_msg(sock, sb->mesg);
			ip_vs_sync_buff_release(sb);
		}

		if (stop_master_sync)
			break;

		ssleep(1);
	}

	/* clean up the sync_buff queue */
	while ((sb=sb_dequeue())) {
		ip_vs_sync_buff_release(sb);
	}

	/* clean up the current sync_buff */
	if ((sb = get_curr_sync_buff(0))) {
		ip_vs_sync_buff_release(sb);
	}

	/* release the sending multicast socket */
	sock_release(sock);
}
666 | |||
667 | |||
/*
 *	Backup daemon main loop: once a second, drain every datagram
 *	queued on the multicast receive socket and feed it to
 *	ip_vs_process_message() (with BHs disabled, since the connection
 *	tables are also touched from softirq context).  Exits when
 *	stop_backup_sync is raised.
 */
static void sync_backup_loop(void)
{
	struct socket *sock;
	char *buf;
	int len;

	if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) {
		IP_VS_ERR("sync_backup_loop: kmalloc error\n");
		return;
	}

	/* create the receiving multicast socket */
	sock = make_receive_sock();
	if (!sock)
		goto out;

	IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
		   "syncid = %d\n",
		   ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);

	for (;;) {
		/* do you have data now? */
		while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) {
			if ((len =
			     ip_vs_receive(sock, buf,
					   sync_recv_mesg_maxlen)) <= 0) {
				IP_VS_ERR("receiving message error\n");
				break;
			}
			/* disable bottom half, because it accessed the data
			   shared by softirq while getting/creating conns */
			local_bh_disable();
			ip_vs_process_message(buf, len);
			local_bh_enable();
		}

		if (stop_backup_sync)
			break;

		ssleep(1);
	}

	/* release the receiving multicast socket */
	sock_release(sock);

  out:
	kfree(buf);
}
716 | |||
717 | |||
718 | static void set_sync_pid(int sync_state, pid_t sync_pid) | ||
719 | { | ||
720 | if (sync_state == IP_VS_STATE_MASTER) | ||
721 | sync_master_pid = sync_pid; | ||
722 | else if (sync_state == IP_VS_STATE_BACKUP) | ||
723 | sync_backup_pid = sync_pid; | ||
724 | } | ||
725 | |||
726 | static void set_stop_sync(int sync_state, int set) | ||
727 | { | ||
728 | if (sync_state == IP_VS_STATE_MASTER) | ||
729 | stop_master_sync = set; | ||
730 | else if (sync_state == IP_VS_STATE_BACKUP) | ||
731 | stop_backup_sync = set; | ||
732 | else { | ||
733 | stop_master_sync = set; | ||
734 | stop_backup_sync = set; | ||
735 | } | ||
736 | } | ||
737 | |||
/*
 * Body of the master/backup sync daemon. Decides its role from the
 * global sync state, daemonizes, runs the corresponding loop until
 * asked to stop, then acknowledges the stop via stop_sync_wait.
 * Returns 0, or -EINVAL if no role slot is free.
 */
static int sync_thread(void *startup)
{
	DECLARE_WAITQUEUE(wait, current);
	mm_segment_t oldmm;
	int state;
	const char *name;

	/* increase the module use count */
	ip_vs_use_count_inc();

	/* pick the role whose pid slot is still free */
	if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) {
		state = IP_VS_STATE_MASTER;
		name = "ipvs_syncmaster";
	} else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) {
		state = IP_VS_STATE_BACKUP;
		name = "ipvs_syncbackup";
	} else {
		IP_VS_BUG();
		ip_vs_use_count_dec();
		return -EINVAL;
	}

	daemonize(name);

	/* allow socket calls with kernel-space buffers */
	oldmm = get_fs();
	set_fs(KERNEL_DS);

	/* Block all signals */
	spin_lock_irq(&current->sighand->siglock);
	siginitsetinv(&current->blocked, 0);
	recalc_sigpending();
	spin_unlock_irq(&current->sighand->siglock);

	/* set the maximum length of sync message */
	set_sync_mesg_maxlen(state);

	/* set up multicast address */
	mcast_addr.sin_family = AF_INET;
	mcast_addr.sin_port = htons(IP_VS_SYNC_PORT);
	mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP);

	add_wait_queue(&sync_wait, &wait);

	/* publish our pid, then let start_sync_thread() return */
	set_sync_pid(state, current->pid);
	complete((struct completion *)startup);

	/* processing master/backup loop here */
	if (state == IP_VS_STATE_MASTER)
		sync_master_loop();
	else if (state == IP_VS_STATE_BACKUP)
		sync_backup_loop();
	else IP_VS_BUG();

	remove_wait_queue(&sync_wait, &wait);

	/* thread exits */
	set_sync_pid(state, 0);
	IP_VS_INFO("sync thread stopped!\n");

	set_fs(oldmm);

	/* decrease the module use count */
	ip_vs_use_count_dec();

	/* acknowledge the stop request and wake stop_sync_thread() */
	set_stop_sync(state, 0);
	wake_up(&stop_sync_wait);

	return 0;
}
807 | |||
808 | |||
809 | static int fork_sync_thread(void *startup) | ||
810 | { | ||
811 | pid_t pid; | ||
812 | |||
813 | /* fork the sync thread here, then the parent process of the | ||
814 | sync thread is the init process after this thread exits. */ | ||
815 | repeat: | ||
816 | if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) { | ||
817 | IP_VS_ERR("could not create sync_thread due to %d... " | ||
818 | "retrying.\n", pid); | ||
819 | ssleep(1); | ||
820 | goto repeat; | ||
821 | } | ||
822 | |||
823 | return 0; | ||
824 | } | ||
825 | |||
826 | |||
827 | int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) | ||
828 | { | ||
829 | DECLARE_COMPLETION(startup); | ||
830 | pid_t pid; | ||
831 | |||
832 | if ((state == IP_VS_STATE_MASTER && sync_master_pid) || | ||
833 | (state == IP_VS_STATE_BACKUP && sync_backup_pid)) | ||
834 | return -EEXIST; | ||
835 | |||
836 | IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid); | ||
837 | IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %Zd bytes\n", | ||
838 | sizeof(struct ip_vs_sync_conn)); | ||
839 | |||
840 | ip_vs_sync_state |= state; | ||
841 | if (state == IP_VS_STATE_MASTER) { | ||
842 | strcpy(ip_vs_master_mcast_ifn, mcast_ifn); | ||
843 | ip_vs_master_syncid = syncid; | ||
844 | } else { | ||
845 | strcpy(ip_vs_backup_mcast_ifn, mcast_ifn); | ||
846 | ip_vs_backup_syncid = syncid; | ||
847 | } | ||
848 | |||
849 | repeat: | ||
850 | if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0) { | ||
851 | IP_VS_ERR("could not create fork_sync_thread due to %d... " | ||
852 | "retrying.\n", pid); | ||
853 | ssleep(1); | ||
854 | goto repeat; | ||
855 | } | ||
856 | |||
857 | wait_for_completion(&startup); | ||
858 | |||
859 | return 0; | ||
860 | } | ||
861 | |||
862 | |||
/*
 * Stop the master or backup sync daemon and wait until it has
 * actually exited. Returns 0 or -ESRCH if no daemon of that role
 * is running.
 */
int stop_sync_thread(int state)
{
	DECLARE_WAITQUEUE(wait, current);

	if ((state == IP_VS_STATE_MASTER && !sync_master_pid) ||
	    (state == IP_VS_STATE_BACKUP && !sync_backup_pid))
		return -ESRCH;

	IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
	IP_VS_INFO("stopping sync thread %d ...\n",
		   (state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid);

	/* queue ourselves on stop_sync_wait BEFORE raising the stop
	   flag, so the daemon's wake_up() cannot be missed */
	__set_current_state(TASK_UNINTERRUPTIBLE);
	add_wait_queue(&stop_sync_wait, &wait);
	set_stop_sync(state, 1);
	ip_vs_sync_state -= state;
	wake_up(&sync_wait);
	schedule();
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&stop_sync_wait, &wait);

	/* Note: no need to reap the sync thread, because its parent
	   process is the init process */

	/* the daemon clears its stop flag on exit; a still-set flag
	   means it never stopped */
	if ((state == IP_VS_STATE_MASTER && stop_master_sync) ||
	    (state == IP_VS_STATE_BACKUP && stop_backup_sync))
		IP_VS_BUG();

	return 0;
}
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c new file mode 100644 index 000000000000..8a9d913261d8 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_wlc.c | |||
@@ -0,0 +1,151 @@ | |||
1 | /* | ||
2 | * IPVS: Weighted Least-Connection Scheduling module | ||
3 | * | ||
4 | * Version: $Id: ip_vs_wlc.c,v 1.13 2003/04/18 09:03:16 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
7 | * Peter Kese <peter.kese@ijs.si> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * as published by the Free Software Foundation; either version | ||
12 | * 2 of the License, or (at your option) any later version. | ||
13 | * | ||
14 | * Changes: | ||
15 | * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest | ||
16 | * Wensong Zhang : changed to use the inactconns in scheduling | ||
17 | * Wensong Zhang : changed some comestics things for debugging | ||
18 | * Wensong Zhang : changed for the d-linked destination list | ||
19 | * Wensong Zhang : added the ip_vs_wlc_update_svc | ||
20 | * Wensong Zhang : added any dest with weight=0 is quiesced | ||
21 | * | ||
22 | */ | ||
23 | |||
24 | #include <linux/module.h> | ||
25 | #include <linux/kernel.h> | ||
26 | |||
27 | #include <net/ip_vs.h> | ||
28 | |||
29 | |||
/* WLC keeps no per-service state; nothing to initialize. */
static int
ip_vs_wlc_init_svc(struct ip_vs_service *svc)
{
	return 0;
}
35 | |||
36 | |||
/* WLC keeps no per-service state; nothing to release. */
static int
ip_vs_wlc_done_svc(struct ip_vs_service *svc)
{
	return 0;
}
42 | |||
43 | |||
/* WLC recomputes everything per schedule call; nothing to update. */
static int
ip_vs_wlc_update_svc(struct ip_vs_service *svc)
{
	return 0;
}
49 | |||
50 | |||
51 | static inline unsigned int | ||
52 | ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) | ||
53 | { | ||
54 | /* | ||
55 | * We think the overhead of processing active connections is 256 | ||
56 | * times higher than that of inactive connections in average. (This | ||
57 | * 256 times might not be accurate, we will change it later) We | ||
58 | * use the following formula to estimate the overhead now: | ||
59 | * dest->activeconns*256 + dest->inactconns | ||
60 | */ | ||
61 | return (atomic_read(&dest->activeconns) << 8) + | ||
62 | atomic_read(&dest->inactconns); | ||
63 | } | ||
64 | |||
65 | |||
66 | /* | ||
67 | * Weighted Least Connection scheduling | ||
68 | */ | ||
69 | static struct ip_vs_dest * | ||
70 | ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
71 | { | ||
72 | struct ip_vs_dest *dest, *least; | ||
73 | unsigned int loh, doh; | ||
74 | |||
75 | IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); | ||
76 | |||
77 | /* | ||
78 | * We calculate the load of each dest server as follows: | ||
79 | * (dest overhead) / dest->weight | ||
80 | * | ||
81 | * Remember -- no floats in kernel mode!!! | ||
82 | * The comparison of h1*w2 > h2*w1 is equivalent to that of | ||
83 | * h1/w1 > h2/w2 | ||
84 | * if every weight is larger than zero. | ||
85 | * | ||
86 | * The server with weight=0 is quiesced and will not receive any | ||
87 | * new connections. | ||
88 | */ | ||
89 | |||
90 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
91 | if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && | ||
92 | atomic_read(&dest->weight) > 0) { | ||
93 | least = dest; | ||
94 | loh = ip_vs_wlc_dest_overhead(least); | ||
95 | goto nextstage; | ||
96 | } | ||
97 | } | ||
98 | return NULL; | ||
99 | |||
100 | /* | ||
101 | * Find the destination with the least load. | ||
102 | */ | ||
103 | nextstage: | ||
104 | list_for_each_entry_continue(dest, &svc->destinations, n_list) { | ||
105 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) | ||
106 | continue; | ||
107 | doh = ip_vs_wlc_dest_overhead(dest); | ||
108 | if (loh * atomic_read(&dest->weight) > | ||
109 | doh * atomic_read(&least->weight)) { | ||
110 | least = dest; | ||
111 | loh = doh; | ||
112 | } | ||
113 | } | ||
114 | |||
115 | IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u " | ||
116 | "activeconns %d refcnt %d weight %d overhead %d\n", | ||
117 | NIPQUAD(least->addr), ntohs(least->port), | ||
118 | atomic_read(&least->activeconns), | ||
119 | atomic_read(&least->refcnt), | ||
120 | atomic_read(&least->weight), loh); | ||
121 | |||
122 | return least; | ||
123 | } | ||
124 | |||
125 | |||
/* Registration descriptor for the WLC scheduler. */
static struct ip_vs_scheduler ip_vs_wlc_scheduler =
{
	.name =			"wlc",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.init_service =		ip_vs_wlc_init_svc,
	.done_service =		ip_vs_wlc_done_svc,
	.update_service =	ip_vs_wlc_update_svc,
	.schedule =		ip_vs_wlc_schedule,
};
136 | |||
137 | |||
138 | static int __init ip_vs_wlc_init(void) | ||
139 | { | ||
140 | INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list); | ||
141 | return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); | ||
142 | } | ||
143 | |||
/* Module exit: unregister the WLC scheduler. */
static void __exit ip_vs_wlc_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
}
148 | |||
/* Module entry/exit points and license. */
module_init(ip_vs_wlc_init);
module_exit(ip_vs_wlc_cleanup);
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c new file mode 100644 index 000000000000..749fa044eca5 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_wrr.c | |||
@@ -0,0 +1,235 @@ | |||
1 | /* | ||
2 | * IPVS: Weighted Round-Robin Scheduling module | ||
3 | * | ||
4 | * Version: $Id: ip_vs_wrr.c,v 1.12 2002/09/15 08:14:08 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License | ||
10 | * as published by the Free Software Foundation; either version | ||
11 | * 2 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * Changes: | ||
14 | * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest | ||
15 | * Wensong Zhang : changed some comestics things for debugging | ||
16 | * Wensong Zhang : changed for the d-linked destination list | ||
17 | * Wensong Zhang : added the ip_vs_wrr_update_svc | ||
18 | * Julian Anastasov : fixed the bug of returning destination | ||
19 | * with weight 0 when all weights are zero | ||
20 | * | ||
21 | */ | ||
22 | |||
23 | #include <linux/module.h> | ||
24 | #include <linux/kernel.h> | ||
25 | |||
26 | #include <net/ip_vs.h> | ||
27 | |||
28 | /* | ||
29 | * current destination pointer for weighted round-robin scheduling | ||
30 | */ | ||
31 | struct ip_vs_wrr_mark { | ||
32 | struct list_head *cl; /* current list head */ | ||
33 | int cw; /* current weight */ | ||
34 | int mw; /* maximum weight */ | ||
35 | int di; /* decreasing interval */ | ||
36 | }; | ||
37 | |||
38 | |||
39 | /* | ||
40 | * Get the gcd of server weights | ||
41 | */ | ||
/* Euclid's algorithm; callers guarantee b > 0. */
static int gcd(int a, int b)
{
	while (b != 0) {
		int r = a % b;

		a = b;
		b = r;
	}
	return a;
}
52 | |||
53 | static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) | ||
54 | { | ||
55 | struct ip_vs_dest *dest; | ||
56 | int weight; | ||
57 | int g = 0; | ||
58 | |||
59 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
60 | weight = atomic_read(&dest->weight); | ||
61 | if (weight > 0) { | ||
62 | if (g > 0) | ||
63 | g = gcd(weight, g); | ||
64 | else | ||
65 | g = weight; | ||
66 | } | ||
67 | } | ||
68 | return g ? g : 1; | ||
69 | } | ||
70 | |||
71 | |||
72 | /* | ||
73 | * Get the maximum weight of the service destinations. | ||
74 | */ | ||
75 | static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) | ||
76 | { | ||
77 | struct ip_vs_dest *dest; | ||
78 | int weight = 0; | ||
79 | |||
80 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
81 | if (atomic_read(&dest->weight) > weight) | ||
82 | weight = atomic_read(&dest->weight); | ||
83 | } | ||
84 | |||
85 | return weight; | ||
86 | } | ||
87 | |||
88 | |||
89 | static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) | ||
90 | { | ||
91 | struct ip_vs_wrr_mark *mark; | ||
92 | |||
93 | /* | ||
94 | * Allocate the mark variable for WRR scheduling | ||
95 | */ | ||
96 | mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); | ||
97 | if (mark == NULL) { | ||
98 | IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n"); | ||
99 | return -ENOMEM; | ||
100 | } | ||
101 | mark->cl = &svc->destinations; | ||
102 | mark->cw = 0; | ||
103 | mark->mw = ip_vs_wrr_max_weight(svc); | ||
104 | mark->di = ip_vs_wrr_gcd_weight(svc); | ||
105 | svc->sched_data = mark; | ||
106 | |||
107 | return 0; | ||
108 | } | ||
109 | |||
110 | |||
/* Free the per-service WRR state allocated in ip_vs_wrr_init_svc(). */
static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
{
	/*
	 * Release the mark variable
	 */
	kfree(svc->sched_data);

	return 0;
}
120 | |||
121 | |||
/* Destination set or weights changed: restart the walk and
   recompute the weight parameters. */
static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
{
	struct ip_vs_wrr_mark *mark = svc->sched_data;

	mark->cl = &svc->destinations;
	mark->mw = ip_vs_wrr_max_weight(svc);
	mark->di = ip_vs_wrr_gcd_weight(svc);
	/* keep the current weight within the (possibly lowered) maximum */
	if (mark->cw > mark->mw)
		mark->cw = 0;
	return 0;
}
133 | |||
134 | |||
135 | /* | ||
136 | * Weighted Round-Robin Scheduling | ||
137 | */ | ||
138 | static struct ip_vs_dest * | ||
139 | ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
140 | { | ||
141 | struct ip_vs_dest *dest; | ||
142 | struct ip_vs_wrr_mark *mark = svc->sched_data; | ||
143 | struct list_head *p; | ||
144 | |||
145 | IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n"); | ||
146 | |||
147 | /* | ||
148 | * This loop will always terminate, because mark->cw in (0, max_weight] | ||
149 | * and at least one server has its weight equal to max_weight. | ||
150 | */ | ||
151 | write_lock(&svc->sched_lock); | ||
152 | p = mark->cl; | ||
153 | while (1) { | ||
154 | if (mark->cl == &svc->destinations) { | ||
155 | /* it is at the head of the destination list */ | ||
156 | |||
157 | if (mark->cl == mark->cl->next) { | ||
158 | /* no dest entry */ | ||
159 | dest = NULL; | ||
160 | goto out; | ||
161 | } | ||
162 | |||
163 | mark->cl = svc->destinations.next; | ||
164 | mark->cw -= mark->di; | ||
165 | if (mark->cw <= 0) { | ||
166 | mark->cw = mark->mw; | ||
167 | /* | ||
168 | * Still zero, which means no available servers. | ||
169 | */ | ||
170 | if (mark->cw == 0) { | ||
171 | mark->cl = &svc->destinations; | ||
172 | IP_VS_INFO("ip_vs_wrr_schedule(): " | ||
173 | "no available servers\n"); | ||
174 | dest = NULL; | ||
175 | goto out; | ||
176 | } | ||
177 | } | ||
178 | } else | ||
179 | mark->cl = mark->cl->next; | ||
180 | |||
181 | if (mark->cl != &svc->destinations) { | ||
182 | /* not at the head of the list */ | ||
183 | dest = list_entry(mark->cl, struct ip_vs_dest, n_list); | ||
184 | if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && | ||
185 | atomic_read(&dest->weight) >= mark->cw) { | ||
186 | /* got it */ | ||
187 | break; | ||
188 | } | ||
189 | } | ||
190 | |||
191 | if (mark->cl == p && mark->cw == mark->di) { | ||
192 | /* back to the start, and no dest is found. | ||
193 | It is only possible when all dests are OVERLOADED */ | ||
194 | dest = NULL; | ||
195 | goto out; | ||
196 | } | ||
197 | } | ||
198 | |||
199 | IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u " | ||
200 | "activeconns %d refcnt %d weight %d\n", | ||
201 | NIPQUAD(dest->addr), ntohs(dest->port), | ||
202 | atomic_read(&dest->activeconns), | ||
203 | atomic_read(&dest->refcnt), | ||
204 | atomic_read(&dest->weight)); | ||
205 | |||
206 | out: | ||
207 | write_unlock(&svc->sched_lock); | ||
208 | return dest; | ||
209 | } | ||
210 | |||
211 | |||
/* Registration descriptor for the WRR scheduler. */
static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
	.name =			"wrr",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.init_service =		ip_vs_wrr_init_svc,
	.done_service =		ip_vs_wrr_done_svc,
	.update_service =	ip_vs_wrr_update_svc,
	.schedule =		ip_vs_wrr_schedule,
};
221 | |||
222 | static int __init ip_vs_wrr_init(void) | ||
223 | { | ||
224 | INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list); | ||
225 | return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; | ||
226 | } | ||
227 | |||
/* Module exit: unregister the WRR scheduler. */
static void __exit ip_vs_wrr_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
}
232 | |||
/* Module entry/exit points and license. */
module_init(ip_vs_wrr_init);
module_exit(ip_vs_wrr_cleanup);
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c new file mode 100644 index 000000000000..faa6176bbeb1 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_xmit.c | |||
@@ -0,0 +1,563 @@ | |||
1 | /* | ||
2 | * ip_vs_xmit.c: various packet transmitters for IPVS | ||
3 | * | ||
4 | * Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $ | ||
5 | * | ||
6 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
7 | * Julian Anastasov <ja@ssi.bg> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * as published by the Free Software Foundation; either version | ||
12 | * 2 of the License, or (at your option) any later version. | ||
13 | * | ||
14 | * Changes: | ||
15 | * | ||
16 | */ | ||
17 | |||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/ip.h> | ||
20 | #include <linux/tcp.h> /* for tcphdr */ | ||
21 | #include <net/tcp.h> /* for csum_tcpudp_magic */ | ||
22 | #include <net/udp.h> | ||
23 | #include <net/icmp.h> /* for icmp_send */ | ||
24 | #include <net/route.h> /* for ip_route_output */ | ||
25 | #include <linux/netfilter.h> | ||
26 | #include <linux/netfilter_ipv4.h> | ||
27 | |||
28 | #include <net/ip_vs.h> | ||
29 | |||
30 | |||
31 | /* | ||
32 | * Destination cache to speed up outgoing route lookup | ||
33 | */ | ||
34 | static inline void | ||
35 | __ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) | ||
36 | { | ||
37 | struct dst_entry *old_dst; | ||
38 | |||
39 | old_dst = dest->dst_cache; | ||
40 | dest->dst_cache = dst; | ||
41 | dest->dst_rtos = rtos; | ||
42 | dst_release(old_dst); | ||
43 | } | ||
44 | |||
/*
 * Return the cached route for dest with an extra reference for the
 * caller, revalidating it first; NULL if there is no usable entry.
 */
static inline struct dst_entry *
__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
{
	struct dst_entry *dst = dest->dst_cache;

	if (!dst)
		return NULL;
	/* entry is suspect (obsolete, or cached under a different TOS):
	   ask the dst ops to revalidate it and drop it on failure */
	if ((dst->obsolete || rtos != dest->dst_rtos) &&
	    dst->ops->check(dst, cookie) == NULL) {
		dest->dst_cache = NULL;
		dst_release(dst);
		return NULL;
	}
	dst_hold(dst);
	return dst;
}
61 | |||
/*
 * Look up (and cache, when the connection has a dest) the outgoing
 * route for a connection. Returns a held rtable, or NULL on routing
 * failure.
 */
static inline struct rtable *
__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
{
	struct rtable *rt;			/* Route to the other host */
	struct ip_vs_dest *dest = cp->dest;

	if (dest) {
		/* known destination: use the per-dest route cache,
		   refilling it under dst_lock on a miss */
		spin_lock(&dest->dst_lock);
		if (!(rt = (struct rtable *)
		      __ip_vs_dst_check(dest, rtos, 0))) {
			struct flowi fl = {
				.oif = 0,
				.nl_u = {
					.ip4_u = {
						.daddr = dest->addr,
						.saddr = 0,
						.tos = rtos, } },
			};

			if (ip_route_output_key(&rt, &fl)) {
				spin_unlock(&dest->dst_lock);
				IP_VS_DBG_RL("ip_route_output error, "
					     "dest: %u.%u.%u.%u\n",
					     NIPQUAD(dest->addr));
				return NULL;
			}
			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
			IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
				  NIPQUAD(dest->addr),
				  atomic_read(&rt->u.dst.__refcnt), rtos);
		}
		spin_unlock(&dest->dst_lock);
	} else {
		/* no dest bound: uncached lookup on the connection's
		   destination address */
		struct flowi fl = {
			.oif = 0,
			.nl_u = {
				.ip4_u = {
					.daddr = cp->daddr,
					.saddr = 0,
					.tos = rtos, } },
		};

		if (ip_route_output_key(&rt, &fl)) {
			IP_VS_DBG_RL("ip_route_output error, dest: "
				     "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
			return NULL;
		}
	}

	return rt;
}
113 | |||
114 | |||
115 | /* | ||
116 | * Release dest->dst_cache before a dest is removed | ||
117 | */ | ||
118 | void | ||
119 | ip_vs_dst_reset(struct ip_vs_dest *dest) | ||
120 | { | ||
121 | struct dst_entry *old_dst; | ||
122 | |||
123 | old_dst = dest->dst_cache; | ||
124 | dest->dst_cache = NULL; | ||
125 | dst_release(old_dst); | ||
126 | } | ||
127 | |||
/*
 * Common transmit tail: mark the skb as IPVS property, invalidate any
 * stale checksum state and push it out through the LOCAL_OUT netfilter
 * hook on the route's device.
 */
#define IP_VS_XMIT(skb, rt)				\
do {							\
	nf_reset_debug(skb);				\
	(skb)->nfcache |= NFC_IPVS_PROPERTY;		\
	(skb)->ip_summed = CHECKSUM_NONE;		\
	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL,	\
		(rt)->u.dst.dev, dst_output);		\
} while (0)
136 | |||
137 | |||
138 | /* | ||
139 | * NULL transmitter (do nothing except return NF_ACCEPT) | ||
140 | */ | ||
141 | int | ||
142 | ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | ||
143 | struct ip_vs_protocol *pp) | ||
144 | { | ||
145 | /* we do not touch skb and do not need pskb ptr */ | ||
146 | return NF_ACCEPT; | ||
147 | } | ||
148 | |||
149 | |||
150 | /* | ||
151 | * Bypass transmitter | ||
152 | * Let packets bypass the destination when the destination is not | ||
153 | * available, it may be only used in transparent cache cluster. | ||
154 | */ | ||
155 | int | ||
156 | ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | ||
157 | struct ip_vs_protocol *pp) | ||
158 | { | ||
159 | struct rtable *rt; /* Route to the other host */ | ||
160 | struct iphdr *iph = skb->nh.iph; | ||
161 | u8 tos = iph->tos; | ||
162 | int mtu; | ||
163 | struct flowi fl = { | ||
164 | .oif = 0, | ||
165 | .nl_u = { | ||
166 | .ip4_u = { | ||
167 | .daddr = iph->daddr, | ||
168 | .saddr = 0, | ||
169 | .tos = RT_TOS(tos), } }, | ||
170 | }; | ||
171 | |||
172 | EnterFunction(10); | ||
173 | |||
174 | if (ip_route_output_key(&rt, &fl)) { | ||
175 | IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " | ||
176 | "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); | ||
177 | goto tx_error_icmp; | ||
178 | } | ||
179 | |||
180 | /* MTU checking */ | ||
181 | mtu = dst_mtu(&rt->u.dst); | ||
182 | if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { | ||
183 | ip_rt_put(rt); | ||
184 | icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); | ||
185 | IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); | ||
186 | goto tx_error; | ||
187 | } | ||
188 | |||
189 | /* | ||
190 | * Call ip_send_check because we are not sure it is called | ||
191 | * after ip_defrag. Is copy-on-write needed? | ||
192 | */ | ||
193 | if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { | ||
194 | ip_rt_put(rt); | ||
195 | return NF_STOLEN; | ||
196 | } | ||
197 | ip_send_check(skb->nh.iph); | ||
198 | |||
199 | /* drop old route */ | ||
200 | dst_release(skb->dst); | ||
201 | skb->dst = &rt->u.dst; | ||
202 | |||
203 | /* Another hack: avoid icmp_send in ip_fragment */ | ||
204 | skb->local_df = 1; | ||
205 | |||
206 | IP_VS_XMIT(skb, rt); | ||
207 | |||
208 | LeaveFunction(10); | ||
209 | return NF_STOLEN; | ||
210 | |||
211 | tx_error_icmp: | ||
212 | dst_link_failure(skb); | ||
213 | tx_error: | ||
214 | kfree_skb(skb); | ||
215 | LeaveFunction(10); | ||
216 | return NF_STOLEN; | ||
217 | } | ||
218 | |||
219 | |||
220 | /* | ||
221 | * NAT transmitter (only for outside-to-inside nat forwarding) | ||
222 | * Not used for related ICMP | ||
223 | */ | ||
224 | int | ||
225 | ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | ||
226 | struct ip_vs_protocol *pp) | ||
227 | { | ||
228 | struct rtable *rt; /* Route to the other host */ | ||
229 | int mtu; | ||
230 | struct iphdr *iph = skb->nh.iph; | ||
231 | |||
232 | EnterFunction(10); | ||
233 | |||
234 | /* check if it is a connection of no-client-port */ | ||
235 | if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { | ||
236 | __u16 _pt, *p; | ||
237 | p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); | ||
238 | if (p == NULL) | ||
239 | goto tx_error; | ||
240 | ip_vs_conn_fill_cport(cp, *p); | ||
241 | IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); | ||
242 | } | ||
243 | |||
244 | if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) | ||
245 | goto tx_error_icmp; | ||
246 | |||
247 | /* MTU checking */ | ||
248 | mtu = dst_mtu(&rt->u.dst); | ||
249 | if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { | ||
250 | ip_rt_put(rt); | ||
251 | icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); | ||
252 | IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); | ||
253 | goto tx_error; | ||
254 | } | ||
255 | |||
256 | /* copy-on-write the packet before mangling it */ | ||
257 | if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr))) | ||
258 | goto tx_error_put; | ||
259 | |||
260 | if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) | ||
261 | goto tx_error_put; | ||
262 | |||
263 | /* drop old route */ | ||
264 | dst_release(skb->dst); | ||
265 | skb->dst = &rt->u.dst; | ||
266 | |||
267 | /* mangle the packet */ | ||
268 | if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp)) | ||
269 | goto tx_error; | ||
270 | skb->nh.iph->daddr = cp->daddr; | ||
271 | ip_send_check(skb->nh.iph); | ||
272 | |||
273 | IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); | ||
274 | |||
275 | /* FIXME: when application helper enlarges the packet and the length | ||
276 | is larger than the MTU of outgoing device, there will be still | ||
277 | MTU problem. */ | ||
278 | |||
279 | /* Another hack: avoid icmp_send in ip_fragment */ | ||
280 | skb->local_df = 1; | ||
281 | |||
282 | IP_VS_XMIT(skb, rt); | ||
283 | |||
284 | LeaveFunction(10); | ||
285 | return NF_STOLEN; | ||
286 | |||
287 | tx_error_icmp: | ||
288 | dst_link_failure(skb); | ||
289 | tx_error: | ||
290 | LeaveFunction(10); | ||
291 | kfree_skb(skb); | ||
292 | return NF_STOLEN; | ||
293 | tx_error_put: | ||
294 | ip_rt_put(rt); | ||
295 | goto tx_error; | ||
296 | } | ||
297 | |||
298 | |||
299 | /* | ||
300 | * IP Tunneling transmitter | ||
301 | * | ||
302 | * This function encapsulates the packet in a new IP packet, its | ||
303 | * destination will be set to cp->daddr. Most code of this function | ||
304 | * is taken from ipip.c. | ||
305 | * | ||
306 | * It is used in VS/TUN cluster. The load balancer selects a real | ||
307 | * server from a cluster based on a scheduling algorithm, | ||
308 | * encapsulates the request packet and forwards it to the selected | ||
309 | * server. For example, all real servers are configured with | ||
310 | * "ifconfig tunl0 <Virtual IP Address> up". When the server receives | ||
311 | * the encapsulated packet, it will decapsulate the packet, processe | ||
312 | * the request and return the response packets directly to the client | ||
313 | * without passing the load balancer. This can greatly increase the | ||
314 | * scalability of virtual server. | ||
315 | * | ||
316 | * Used for ANY protocol | ||
317 | */ | ||
318 | int | ||
319 | ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | ||
320 | struct ip_vs_protocol *pp) | ||
321 | { | ||
322 | struct rtable *rt; /* Route to the other host */ | ||
323 | struct net_device *tdev; /* Device to other host */ | ||
324 | struct iphdr *old_iph = skb->nh.iph; | ||
325 | u8 tos = old_iph->tos; | ||
326 | u16 df = old_iph->frag_off; | ||
327 | struct iphdr *iph; /* Our new IP header */ | ||
328 | int max_headroom; /* The extra header space needed */ | ||
329 | int mtu; | ||
330 | |||
331 | EnterFunction(10); | ||
332 | |||
333 | if (skb->protocol != __constant_htons(ETH_P_IP)) { | ||
334 | IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " | ||
335 | "ETH_P_IP: %d, skb protocol: %d\n", | ||
336 | __constant_htons(ETH_P_IP), skb->protocol); | ||
337 | goto tx_error; | ||
338 | } | ||
339 | |||
340 | if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) | ||
341 | goto tx_error_icmp; | ||
342 | |||
343 | tdev = rt->u.dst.dev; | ||
344 | |||
345 | mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); | ||
346 | if (mtu < 68) { | ||
347 | ip_rt_put(rt); | ||
348 | IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); | ||
349 | goto tx_error; | ||
350 | } | ||
351 | if (skb->dst) | ||
352 | skb->dst->ops->update_pmtu(skb->dst, mtu); | ||
353 | |||
354 | df |= (old_iph->frag_off&__constant_htons(IP_DF)); | ||
355 | |||
356 | if ((old_iph->frag_off&__constant_htons(IP_DF)) | ||
357 | && mtu < ntohs(old_iph->tot_len)) { | ||
358 | icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); | ||
359 | ip_rt_put(rt); | ||
360 | IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); | ||
361 | goto tx_error; | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * Okay, now see if we can stuff it in the buffer as-is. | ||
366 | */ | ||
367 | max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); | ||
368 | |||
369 | if (skb_headroom(skb) < max_headroom | ||
370 | || skb_cloned(skb) || skb_shared(skb)) { | ||
371 | struct sk_buff *new_skb = | ||
372 | skb_realloc_headroom(skb, max_headroom); | ||
373 | if (!new_skb) { | ||
374 | ip_rt_put(rt); | ||
375 | kfree_skb(skb); | ||
376 | IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); | ||
377 | return NF_STOLEN; | ||
378 | } | ||
379 | kfree_skb(skb); | ||
380 | skb = new_skb; | ||
381 | old_iph = skb->nh.iph; | ||
382 | } | ||
383 | |||
384 | skb->h.raw = (void *) old_iph; | ||
385 | |||
386 | /* fix old IP header checksum */ | ||
387 | ip_send_check(old_iph); | ||
388 | |||
389 | skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); | ||
390 | memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); | ||
391 | |||
392 | /* drop old route */ | ||
393 | dst_release(skb->dst); | ||
394 | skb->dst = &rt->u.dst; | ||
395 | |||
396 | /* | ||
397 | * Push down and install the IPIP header. | ||
398 | */ | ||
399 | iph = skb->nh.iph; | ||
400 | iph->version = 4; | ||
401 | iph->ihl = sizeof(struct iphdr)>>2; | ||
402 | iph->frag_off = df; | ||
403 | iph->protocol = IPPROTO_IPIP; | ||
404 | iph->tos = tos; | ||
405 | iph->daddr = rt->rt_dst; | ||
406 | iph->saddr = rt->rt_src; | ||
407 | iph->ttl = old_iph->ttl; | ||
408 | iph->tot_len = htons(skb->len); | ||
409 | ip_select_ident(iph, &rt->u.dst, NULL); | ||
410 | ip_send_check(iph); | ||
411 | |||
412 | /* Another hack: avoid icmp_send in ip_fragment */ | ||
413 | skb->local_df = 1; | ||
414 | |||
415 | IP_VS_XMIT(skb, rt); | ||
416 | |||
417 | LeaveFunction(10); | ||
418 | |||
419 | return NF_STOLEN; | ||
420 | |||
421 | tx_error_icmp: | ||
422 | dst_link_failure(skb); | ||
423 | tx_error: | ||
424 | kfree_skb(skb); | ||
425 | LeaveFunction(10); | ||
426 | return NF_STOLEN; | ||
427 | } | ||
428 | |||
429 | |||
430 | /* | ||
431 | * Direct Routing transmitter | ||
432 | * Used for ANY protocol | ||
433 | */ | ||
/*
 * Send @skb to the connection's real server via Direct Routing: the
 * packet is transmitted unmodified (no address/port rewriting) over the
 * route to the real server, so only the link-layer destination changes.
 *
 * Returns NF_STOLEN on every path: the skb is either handed to the
 * output path by IP_VS_XMIT or freed here, so the netfilter core must
 * not touch it again.
 */
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	      struct ip_vs_protocol *pp)
{
	struct rtable *rt;			/* Route to the other host */
	struct iphdr  *iph = skb->nh.iph;
	int    mtu;

	EnterFunction(10);

	/* Look up (and take a reference on) the route to the real server,
	 * preserving the original TOS bits.  On failure, signal a link
	 * failure upstream before dropping the packet. */
	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
		goto tx_error_icmp;

	/* MTU checking: a DF-flagged packet larger than the path MTU
	 * cannot be forwarded; tell the sender "fragmentation needed"
	 * and drop.  The route reference must be released on this path. */
	mtu = dst_mtu(&rt->u.dst);
	if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
		ip_rt_put(rt);
		IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
		goto tx_error;
	}

	/*
	 * Call ip_send_check because we are not sure it is called
	 * after ip_defrag. Is copy-on-write needed?
	 */
	if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
		/* skb_share_check drops our reference to a shared skb even
		 * when the clone fails, so only the route reference needs
		 * releasing here — no kfree_skb(). */
		ip_rt_put(rt);
		return NF_STOLEN;
	}
	ip_send_check(skb->nh.iph);

	/* drop old route; skb->dst takes over the reference obtained
	 * from __ip_vs_get_out_rt above */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->local_df = 1;

	IP_VS_XMIT(skb, rt);

	LeaveFunction(10);
	return NF_STOLEN;

  tx_error_icmp:
	dst_link_failure(skb);
  tx_error:
	kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}
485 | |||
486 | |||
487 | /* | ||
488 | * ICMP packet transmitter | ||
489 | * called by the ip_vs_in_icmp | ||
490 | */ | ||
/*
 * Transmit an ICMP packet related to an IPVS connection, called from
 * ip_vs_in_icmp.  For non-NAT forwarding methods the packet is passed
 * to the connection's regular transmitter (or accepted for LOCALNODE);
 * for VS/NAT the embedded addresses are rewritten here first.
 *
 * @offset: number of bytes from the start of the packet that must be
 *          writable before mangling — presumably covering the outer IP
 *          header, the ICMP header and the embedded IP header (set by
 *          the caller; TODO confirm against ip_vs_in_icmp).
 *
 * Returns the netfilter verdict: NF_STOLEN when the skb was consumed
 * (transmitted or freed) here, or the verdict propagated from
 * cp->packet_xmit / NF_ACCEPT for the pass-through cases.
 */
int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		struct ip_vs_protocol *pp, int offset)
{
	struct rtable	*rt;	/* Route to the other host */
	int mtu;
	int rc;

	EnterFunction(10);

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	   forwarded directly here, because there is no need to
	   translate address/port back */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		/* release the connection reference the caller handed us */
		__ip_vs_conn_put(cp);
		goto out;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */

	/* Route to the real server, keeping the original TOS bits. */
	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos))))
		goto tx_error_icmp;

	/* MTU checking: refuse to forward a DF packet that exceeds the
	 * path MTU; report "fragmentation needed" to the sender. */
	mtu = dst_mtu(&rt->u.dst);
	if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) {
		ip_rt_put(rt);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
		IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (!ip_vs_make_skb_writable(&skb, offset))
		goto tx_error_put;

	/* make sure there is headroom for the output device's
	 * link-layer header */
	if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
		goto tx_error_put;

	/* drop the old route when skb is not shared; skb->dst takes
	 * over the route reference obtained above */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/* rewrite the embedded addresses for VS/NAT (outbound direction) */
	ip_vs_nat_icmp(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->local_df = 1;

	IP_VS_XMIT(skb, rt);

	rc = NF_STOLEN;
	goto out;

  tx_error_icmp:
	dst_link_failure(skb);
  tx_error:
	/* NOTE(review): sibling transmitters use kfree_skb() here;
	 * dev_kfree_skb() is equivalent but inconsistent — confirm. */
	dev_kfree_skb(skb);
	rc = NF_STOLEN;
  out:
	LeaveFunction(10);
	return rc;
  tx_error_put:
	/* error after the route was obtained: drop the route reference,
	 * then fall into the common skb-freeing path */
	ip_rt_put(rt);
	goto tx_error;
}