diff options
author | Julius Volz <juliusv@google.com> | 2008-09-19 06:32:57 -0400 |
---|---|---|
committer | Simon Horman <horms@verge.net.au> | 2008-10-06 17:38:24 -0400 |
commit | cb7f6a7b716e801097b564dec3ccb58d330aef56 (patch) | |
tree | 92fa8fa5381e04576c43eab88874ab54ea670767 /net/ipv4/ipvs | |
parent | 8d5803bf6fbe5264000afc8c34bff08e8ecc023b (diff) |
IPVS: Move IPVS to net/netfilter/ipvs
Since IPVS now has partial IPv6 support, this patch moves IPVS from
net/ipv4/ipvs to net/netfilter/ipvs. It's a result of:
$ git mv net/ipv4/ipvs net/netfilter
and adapting the relevant Kconfigs/Makefiles to the new path.
Signed-off-by: Julius Volz <juliusv@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
Diffstat (limited to 'net/ipv4/ipvs')
-rw-r--r-- | net/ipv4/ipvs/Kconfig | 239 | ||||
-rw-r--r-- | net/ipv4/ipvs/Makefile | 33 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_app.c | 622 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_conn.c | 1110 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_core.c | 1542 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_ctl.c | 3443 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_est.c | 166 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_ftp.c | 410 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_lblc.c | 555 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_lblcr.c | 755 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_lc.c | 103 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_nq.c | 138 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_proto.c | 288 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_proto_ah_esp.c | 235 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_proto_tcp.c | 732 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_proto_udp.c | 533 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_rr.c | 112 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_sched.c | 251 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_sed.c | 140 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_sh.c | 258 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_sync.c | 942 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_wlc.c | 128 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_wrr.c | 237 | ||||
-rw-r--r-- | net/ipv4/ipvs/ip_vs_xmit.c | 1004 |
24 files changed, 0 insertions, 13976 deletions
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig deleted file mode 100644 index de6004de80bc..000000000000 --- a/net/ipv4/ipvs/Kconfig +++ /dev/null | |||
@@ -1,239 +0,0 @@ | |||
1 | # | ||
2 | # IP Virtual Server configuration | ||
3 | # | ||
4 | menuconfig IP_VS | ||
5 | tristate "IP virtual server support (EXPERIMENTAL)" | ||
6 | depends on NETFILTER | ||
7 | ---help--- | ||
8 | IP Virtual Server support will let you build a high-performance | ||
9 | virtual server based on cluster of two or more real servers. This | ||
10 | option must be enabled for at least one of the clustered computers | ||
11 | that will take care of intercepting incoming connections to a | ||
12 | single IP address and scheduling them to real servers. | ||
13 | |||
14 | Three request dispatching techniques are implemented, they are | ||
15 | virtual server via NAT, virtual server via tunneling and virtual | ||
16 | server via direct routing. The several scheduling algorithms can | ||
17 | be used to choose which server the connection is directed to, | ||
18 | thus load balancing can be achieved among the servers. For more | ||
19 | information and its administration program, please visit the | ||
20 | following URL: <http://www.linuxvirtualserver.org/>. | ||
21 | |||
22 | If you want to compile it in kernel, say Y. To compile it as a | ||
23 | module, choose M here. If unsure, say N. | ||
24 | |||
25 | if IP_VS | ||
26 | |||
27 | config IP_VS_IPV6 | ||
28 | bool "IPv6 support for IPVS (DANGEROUS)" | ||
29 | depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6) | ||
30 | ---help--- | ||
31 | Add IPv6 support to IPVS. This is incomplete and might be dangerous. | ||
32 | |||
33 | Say N if unsure. | ||
34 | |||
35 | config IP_VS_DEBUG | ||
36 | bool "IP virtual server debugging" | ||
37 | ---help--- | ||
38 | Say Y here if you want to get additional messages useful in | ||
39 | debugging the IP virtual server code. You can change the debug | ||
40 | level in /proc/sys/net/ipv4/vs/debug_level | ||
41 | |||
42 | config IP_VS_TAB_BITS | ||
43 | int "IPVS connection table size (the Nth power of 2)" | ||
44 | range 8 20 | ||
45 | default 12 | ||
46 | ---help--- | ||
47 | The IPVS connection hash table uses the chaining scheme to handle | ||
48 | hash collisions. Using a big IPVS connection hash table will greatly | ||
49 | reduce conflicts when there are hundreds of thousands of connections | ||
50 | in the hash table. | ||
51 | |||
52 | Note the table size must be a power of 2. The table size will be | ||
53 | 2 raised to the power of the number you enter. The number to choose is | ||
54 | from 8 to 20, the default number is 12, which means the table size | ||
55 | is 4096. Don't input the number too small, otherwise you will lose | ||
56 | performance on it. You can adapt the table size yourself, according | ||
57 | to your virtual server application. It is good to set the table size | ||
58 | not far less than the number of connections per second multiplying | ||
59 | average lasting time of connection in the table. For example, your | ||
60 | virtual server gets 200 connections per second, the connection lasts | ||
61 | for 200 seconds in average in the connection table, the table size | ||
62 | should be not far less than 200x200, it is good to set the table | ||
63 | size 32768 (2**15). | ||
64 | |||
65 | Another note that each connection occupies 128 bytes effectively and | ||
66 | each hash entry uses 8 bytes, so you can estimate how much memory is | ||
67 | needed for your box. | ||
68 | |||
69 | comment "IPVS transport protocol load balancing support" | ||
70 | |||
71 | config IP_VS_PROTO_TCP | ||
72 | bool "TCP load balancing support" | ||
73 | ---help--- | ||
74 | This option enables support for load balancing TCP transport | ||
75 | protocol. Say Y if unsure. | ||
76 | |||
77 | config IP_VS_PROTO_UDP | ||
78 | bool "UDP load balancing support" | ||
79 | ---help--- | ||
80 | This option enables support for load balancing UDP transport | ||
81 | protocol. Say Y if unsure. | ||
82 | |||
83 | config IP_VS_PROTO_AH_ESP | ||
84 | bool | ||
85 | depends on UNDEFINED | ||
86 | |||
87 | config IP_VS_PROTO_ESP | ||
88 | bool "ESP load balancing support" | ||
89 | select IP_VS_PROTO_AH_ESP | ||
90 | ---help--- | ||
91 | This option enables support for load balancing ESP (Encapsulation | ||
92 | Security Payload) transport protocol. Say Y if unsure. | ||
93 | |||
94 | config IP_VS_PROTO_AH | ||
95 | bool "AH load balancing support" | ||
96 | select IP_VS_PROTO_AH_ESP | ||
97 | ---help--- | ||
98 | This option enables support for load balancing AH (Authentication | ||
99 | Header) transport protocol. Say Y if unsure. | ||
100 | |||
101 | comment "IPVS scheduler" | ||
102 | |||
103 | config IP_VS_RR | ||
104 | tristate "round-robin scheduling" | ||
105 | ---help--- | ||
106 | The round-robin scheduling algorithm simply directs network | ||
107 | connections to different real servers in a round-robin manner. | ||
108 | |||
109 | If you want to compile it in kernel, say Y. To compile it as a | ||
110 | module, choose M here. If unsure, say N. | ||
111 | |||
112 | config IP_VS_WRR | ||
113 | tristate "weighted round-robin scheduling" | ||
114 | ---help--- | ||
115 | The weighted round-robin scheduling algorithm directs network | ||
116 | connections to different real servers based on server weights | ||
117 | in a round-robin manner. Servers with higher weights receive | ||
118 | new connections before those with lower weights, servers | ||
119 | with higher weights get more connections than those with lower | ||
120 | weights, and servers with equal weights get equal connections. | ||
121 | |||
122 | If you want to compile it in kernel, say Y. To compile it as a | ||
123 | module, choose M here. If unsure, say N. | ||
124 | |||
125 | config IP_VS_LC | ||
126 | tristate "least-connection scheduling" | ||
127 | ---help--- | ||
128 | The least-connection scheduling algorithm directs network | ||
129 | connections to the server with the least number of active | ||
130 | connections. | ||
131 | |||
132 | If you want to compile it in kernel, say Y. To compile it as a | ||
133 | module, choose M here. If unsure, say N. | ||
134 | |||
135 | config IP_VS_WLC | ||
136 | tristate "weighted least-connection scheduling" | ||
137 | ---help--- | ||
138 | The weighted least-connection scheduling algorithm directs network | ||
139 | connections to the server with the least active connections | ||
140 | normalized by the server weight. | ||
141 | |||
142 | If you want to compile it in kernel, say Y. To compile it as a | ||
143 | module, choose M here. If unsure, say N. | ||
144 | |||
145 | config IP_VS_LBLC | ||
146 | tristate "locality-based least-connection scheduling" | ||
147 | ---help--- | ||
148 | The locality-based least-connection scheduling algorithm is for | ||
149 | destination IP load balancing. It is usually used in cache cluster. | ||
150 | This algorithm usually directs packet destined for an IP address to | ||
151 | its server if the server is alive and under load. If the server is | ||
152 | overloaded (its active connection numbers is larger than its weight) | ||
153 | and there is a server in its half load, then allocate the weighted | ||
154 | least-connection server to this IP address. | ||
155 | |||
156 | If you want to compile it in kernel, say Y. To compile it as a | ||
157 | module, choose M here. If unsure, say N. | ||
158 | |||
159 | config IP_VS_LBLCR | ||
160 | tristate "locality-based least-connection with replication scheduling" | ||
161 | ---help--- | ||
162 | The locality-based least-connection with replication scheduling | ||
163 | algorithm is also for destination IP load balancing. It is | ||
164 | usually used in cache cluster. It differs from the LBLC scheduling | ||
165 | as follows: the load balancer maintains mappings from a target | ||
166 | to a set of server nodes that can serve the target. Requests for | ||
167 | a target are assigned to the least-connection node in the target's | ||
168 | server set. If all the nodes in the server set are overloaded, | ||
169 | it picks up a least-connection node in the cluster and adds it | ||
170 | to the server set for the target. If the server set has not been | ||
171 | modified for the specified time, the most loaded node is removed | ||
172 | from the server set, in order to avoid high degree of replication. | ||
173 | |||
174 | If you want to compile it in kernel, say Y. To compile it as a | ||
175 | module, choose M here. If unsure, say N. | ||
176 | |||
177 | config IP_VS_DH | ||
178 | tristate "destination hashing scheduling" | ||
179 | ---help--- | ||
180 | The destination hashing scheduling algorithm assigns network | ||
181 | connections to the servers through looking up a statically assigned | ||
182 | hash table by their destination IP addresses. | ||
183 | |||
184 | If you want to compile it in kernel, say Y. To compile it as a | ||
185 | module, choose M here. If unsure, say N. | ||
186 | |||
187 | config IP_VS_SH | ||
188 | tristate "source hashing scheduling" | ||
189 | ---help--- | ||
190 | The source hashing scheduling algorithm assigns network | ||
191 | connections to the servers through looking up a statically assigned | ||
192 | hash table by their source IP addresses. | ||
193 | |||
194 | If you want to compile it in kernel, say Y. To compile it as a | ||
195 | module, choose M here. If unsure, say N. | ||
196 | |||
197 | config IP_VS_SED | ||
198 | tristate "shortest expected delay scheduling" | ||
199 | ---help--- | ||
200 | The shortest expected delay scheduling algorithm assigns network | ||
201 | connections to the server with the shortest expected delay. The | ||
202 | expected delay that the job will experience is (Ci + 1) / Ui if | ||
203 | sent to the ith server, in which Ci is the number of connections | ||
204 | on the ith server and Ui is the fixed service rate (weight) | ||
205 | of the ith server. | ||
206 | |||
207 | If you want to compile it in kernel, say Y. To compile it as a | ||
208 | module, choose M here. If unsure, say N. | ||
209 | |||
210 | config IP_VS_NQ | ||
211 | tristate "never queue scheduling" | ||
212 | ---help--- | ||
213 | The never queue scheduling algorithm adopts a two-speed model. | ||
214 | When there is an idle server available, the job will be sent to | ||
215 | the idle server, instead of waiting for a fast one. When there | ||
216 | is no idle server available, the job will be sent to the server | ||
217 | that minimizes its expected delay (The Shortest Expected Delay | ||
218 | scheduling algorithm). | ||
219 | |||
220 | If you want to compile it in kernel, say Y. To compile it as a | ||
221 | module, choose M here. If unsure, say N. | ||
222 | |||
223 | comment 'IPVS application helper' | ||
224 | |||
225 | config IP_VS_FTP | ||
226 | tristate "FTP protocol helper" | ||
227 | depends on IP_VS_PROTO_TCP | ||
228 | ---help--- | ||
229 | FTP is a protocol that transfers IP address and/or port number in | ||
230 | the payload. In the virtual server via Network Address Translation, | ||
231 | the IP address and port number of real servers cannot be sent to | ||
232 | clients in ftp connections directly, so FTP protocol helper is | ||
233 | required for tracking the connection and mangling it back to that of | ||
234 | virtual service. | ||
235 | |||
236 | If you want to compile it in kernel, say Y. To compile it as a | ||
237 | module, choose M here. If unsure, say N. | ||
238 | |||
239 | endif # IP_VS | ||
diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile deleted file mode 100644 index 73a46fe1fe4c..000000000000 --- a/net/ipv4/ipvs/Makefile +++ /dev/null | |||
@@ -1,33 +0,0 @@ | |||
1 | # | ||
2 | # Makefile for the IPVS modules on top of IPv4. | ||
3 | # | ||
4 | |||
5 | # IPVS transport protocol load balancing support | ||
6 | ip_vs_proto-objs-y := | ||
7 | ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o | ||
8 | ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o | ||
9 | ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o | ||
10 | |||
11 | ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ | ||
12 | ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ | ||
13 | ip_vs_est.o ip_vs_proto.o \ | ||
14 | $(ip_vs_proto-objs-y) | ||
15 | |||
16 | |||
17 | # IPVS core | ||
18 | obj-$(CONFIG_IP_VS) += ip_vs.o | ||
19 | |||
20 | # IPVS schedulers | ||
21 | obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o | ||
22 | obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o | ||
23 | obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o | ||
24 | obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o | ||
25 | obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o | ||
26 | obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o | ||
27 | obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o | ||
28 | obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o | ||
29 | obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o | ||
30 | obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o | ||
31 | |||
32 | # IPVS application helpers | ||
33 | obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o | ||
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c deleted file mode 100644 index 201b8ea3020d..000000000000 --- a/net/ipv4/ipvs/ip_vs_app.c +++ /dev/null | |||
@@ -1,622 +0,0 @@ | |||
1 | /* | ||
2 | * ip_vs_app.c: Application module support for IPVS | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference | ||
12 | * is that ip_vs_app module handles the reverse direction (incoming requests | ||
13 | * and outgoing responses). | ||
14 | * | ||
15 | * IP_MASQ_APP application masquerading module | ||
16 | * | ||
17 | * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> | ||
18 | * | ||
19 | */ | ||
20 | |||
21 | #include <linux/module.h> | ||
22 | #include <linux/kernel.h> | ||
23 | #include <linux/skbuff.h> | ||
24 | #include <linux/in.h> | ||
25 | #include <linux/ip.h> | ||
26 | #include <linux/netfilter.h> | ||
27 | #include <net/net_namespace.h> | ||
28 | #include <net/protocol.h> | ||
29 | #include <net/tcp.h> | ||
30 | #include <asm/system.h> | ||
31 | #include <linux/stat.h> | ||
32 | #include <linux/proc_fs.h> | ||
33 | #include <linux/seq_file.h> | ||
34 | #include <linux/mutex.h> | ||
35 | |||
36 | #include <net/ip_vs.h> | ||
37 | |||
38 | EXPORT_SYMBOL(register_ip_vs_app); | ||
39 | EXPORT_SYMBOL(unregister_ip_vs_app); | ||
40 | EXPORT_SYMBOL(register_ip_vs_app_inc); | ||
41 | |||
42 | /* ipvs application list head */ | ||
43 | static LIST_HEAD(ip_vs_app_list); | ||
44 | static DEFINE_MUTEX(__ip_vs_app_mutex); | ||
45 | |||
46 | |||
47 | /* | ||
48 | * Get an ip_vs_app object | ||
49 | */ | ||
50 | static inline int ip_vs_app_get(struct ip_vs_app *app) | ||
51 | { | ||
52 | return try_module_get(app->module); | ||
53 | } | ||
54 | |||
55 | |||
56 | static inline void ip_vs_app_put(struct ip_vs_app *app) | ||
57 | { | ||
58 | module_put(app->module); | ||
59 | } | ||
60 | |||
61 | |||
62 | /* | ||
63 | * Allocate/initialize app incarnation and register it in proto apps. | ||
64 | */ | ||
65 | static int | ||
66 | ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) | ||
67 | { | ||
68 | struct ip_vs_protocol *pp; | ||
69 | struct ip_vs_app *inc; | ||
70 | int ret; | ||
71 | |||
72 | if (!(pp = ip_vs_proto_get(proto))) | ||
73 | return -EPROTONOSUPPORT; | ||
74 | |||
75 | if (!pp->unregister_app) | ||
76 | return -EOPNOTSUPP; | ||
77 | |||
78 | inc = kmemdup(app, sizeof(*inc), GFP_KERNEL); | ||
79 | if (!inc) | ||
80 | return -ENOMEM; | ||
81 | INIT_LIST_HEAD(&inc->p_list); | ||
82 | INIT_LIST_HEAD(&inc->incs_list); | ||
83 | inc->app = app; | ||
84 | inc->port = htons(port); | ||
85 | atomic_set(&inc->usecnt, 0); | ||
86 | |||
87 | if (app->timeouts) { | ||
88 | inc->timeout_table = | ||
89 | ip_vs_create_timeout_table(app->timeouts, | ||
90 | app->timeouts_size); | ||
91 | if (!inc->timeout_table) { | ||
92 | ret = -ENOMEM; | ||
93 | goto out; | ||
94 | } | ||
95 | } | ||
96 | |||
97 | ret = pp->register_app(inc); | ||
98 | if (ret) | ||
99 | goto out; | ||
100 | |||
101 | list_add(&inc->a_list, &app->incs_list); | ||
102 | IP_VS_DBG(9, "%s application %s:%u registered\n", | ||
103 | pp->name, inc->name, inc->port); | ||
104 | |||
105 | return 0; | ||
106 | |||
107 | out: | ||
108 | kfree(inc->timeout_table); | ||
109 | kfree(inc); | ||
110 | return ret; | ||
111 | } | ||
112 | |||
113 | |||
114 | /* | ||
115 | * Release app incarnation | ||
116 | */ | ||
117 | static void | ||
118 | ip_vs_app_inc_release(struct ip_vs_app *inc) | ||
119 | { | ||
120 | struct ip_vs_protocol *pp; | ||
121 | |||
122 | if (!(pp = ip_vs_proto_get(inc->protocol))) | ||
123 | return; | ||
124 | |||
125 | if (pp->unregister_app) | ||
126 | pp->unregister_app(inc); | ||
127 | |||
128 | IP_VS_DBG(9, "%s App %s:%u unregistered\n", | ||
129 | pp->name, inc->name, inc->port); | ||
130 | |||
131 | list_del(&inc->a_list); | ||
132 | |||
133 | kfree(inc->timeout_table); | ||
134 | kfree(inc); | ||
135 | } | ||
136 | |||
137 | |||
138 | /* | ||
139 | * Get reference to app inc (only called from softirq) | ||
140 | * | ||
141 | */ | ||
142 | int ip_vs_app_inc_get(struct ip_vs_app *inc) | ||
143 | { | ||
144 | int result; | ||
145 | |||
146 | atomic_inc(&inc->usecnt); | ||
147 | if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) | ||
148 | atomic_dec(&inc->usecnt); | ||
149 | return result; | ||
150 | } | ||
151 | |||
152 | |||
153 | /* | ||
154 | * Put the app inc (only called from timer or net softirq) | ||
155 | */ | ||
156 | void ip_vs_app_inc_put(struct ip_vs_app *inc) | ||
157 | { | ||
158 | ip_vs_app_put(inc->app); | ||
159 | atomic_dec(&inc->usecnt); | ||
160 | } | ||
161 | |||
162 | |||
163 | /* | ||
164 | * Register an application incarnation in protocol applications | ||
165 | */ | ||
166 | int | ||
167 | register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) | ||
168 | { | ||
169 | int result; | ||
170 | |||
171 | mutex_lock(&__ip_vs_app_mutex); | ||
172 | |||
173 | result = ip_vs_app_inc_new(app, proto, port); | ||
174 | |||
175 | mutex_unlock(&__ip_vs_app_mutex); | ||
176 | |||
177 | return result; | ||
178 | } | ||
179 | |||
180 | |||
181 | /* | ||
182 | * ip_vs_app registration routine | ||
183 | */ | ||
184 | int register_ip_vs_app(struct ip_vs_app *app) | ||
185 | { | ||
186 | /* increase the module use count */ | ||
187 | ip_vs_use_count_inc(); | ||
188 | |||
189 | mutex_lock(&__ip_vs_app_mutex); | ||
190 | |||
191 | list_add(&app->a_list, &ip_vs_app_list); | ||
192 | |||
193 | mutex_unlock(&__ip_vs_app_mutex); | ||
194 | |||
195 | return 0; | ||
196 | } | ||
197 | |||
198 | |||
199 | /* | ||
200 | * ip_vs_app unregistration routine | ||
201 | * We are sure there are no app incarnations attached to services | ||
202 | */ | ||
203 | void unregister_ip_vs_app(struct ip_vs_app *app) | ||
204 | { | ||
205 | struct ip_vs_app *inc, *nxt; | ||
206 | |||
207 | mutex_lock(&__ip_vs_app_mutex); | ||
208 | |||
209 | list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { | ||
210 | ip_vs_app_inc_release(inc); | ||
211 | } | ||
212 | |||
213 | list_del(&app->a_list); | ||
214 | |||
215 | mutex_unlock(&__ip_vs_app_mutex); | ||
216 | |||
217 | /* decrease the module use count */ | ||
218 | ip_vs_use_count_dec(); | ||
219 | } | ||
220 | |||
221 | |||
222 | /* | ||
223 | * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) | ||
224 | */ | ||
225 | int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) | ||
226 | { | ||
227 | return pp->app_conn_bind(cp); | ||
228 | } | ||
229 | |||
230 | |||
231 | /* | ||
232 | * Unbind cp from application incarnation (called by cp destructor) | ||
233 | */ | ||
234 | void ip_vs_unbind_app(struct ip_vs_conn *cp) | ||
235 | { | ||
236 | struct ip_vs_app *inc = cp->app; | ||
237 | |||
238 | if (!inc) | ||
239 | return; | ||
240 | |||
241 | if (inc->unbind_conn) | ||
242 | inc->unbind_conn(inc, cp); | ||
243 | if (inc->done_conn) | ||
244 | inc->done_conn(inc, cp); | ||
245 | ip_vs_app_inc_put(inc); | ||
246 | cp->app = NULL; | ||
247 | } | ||
248 | |||
249 | |||
250 | /* | ||
251 | * Fixes th->seq based on ip_vs_seq info. | ||
252 | */ | ||
253 | static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) | ||
254 | { | ||
255 | __u32 seq = ntohl(th->seq); | ||
256 | |||
257 | /* | ||
258 | * Adjust seq with delta-offset for all packets after | ||
259 | * the most recent resized pkt seq and with previous_delta offset | ||
260 | * for all packets before most recent resized pkt seq. | ||
261 | */ | ||
262 | if (vseq->delta || vseq->previous_delta) { | ||
263 | if(after(seq, vseq->init_seq)) { | ||
264 | th->seq = htonl(seq + vseq->delta); | ||
265 | IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n", | ||
266 | vseq->delta); | ||
267 | } else { | ||
268 | th->seq = htonl(seq + vseq->previous_delta); | ||
269 | IP_VS_DBG(9, "vs_fix_seq(): added previous_delta " | ||
270 | "(%d) to seq\n", vseq->previous_delta); | ||
271 | } | ||
272 | } | ||
273 | } | ||
274 | |||
275 | |||
276 | /* | ||
277 | * Fixes th->ack_seq based on ip_vs_seq info. | ||
278 | */ | ||
279 | static inline void | ||
280 | vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) | ||
281 | { | ||
282 | __u32 ack_seq = ntohl(th->ack_seq); | ||
283 | |||
284 | /* | ||
285 | * Adjust ack_seq with delta-offset for | ||
286 | * the packets AFTER most recent resized pkt has caused a shift | ||
287 | * for packets before most recent resized pkt, use previous_delta | ||
288 | */ | ||
289 | if (vseq->delta || vseq->previous_delta) { | ||
290 | /* since ack_seq is the number of octet that is expected | ||
291 | to receive next, so compare it with init_seq+delta */ | ||
292 | if(after(ack_seq, vseq->init_seq+vseq->delta)) { | ||
293 | th->ack_seq = htonl(ack_seq - vseq->delta); | ||
294 | IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta " | ||
295 | "(%d) from ack_seq\n", vseq->delta); | ||
296 | |||
297 | } else { | ||
298 | th->ack_seq = htonl(ack_seq - vseq->previous_delta); | ||
299 | IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted " | ||
300 | "previous_delta (%d) from ack_seq\n", | ||
301 | vseq->previous_delta); | ||
302 | } | ||
303 | } | ||
304 | } | ||
305 | |||
306 | |||
307 | /* | ||
308 | * Updates ip_vs_seq if pkt has been resized | ||
309 | * Assumes already checked proto==IPPROTO_TCP and diff!=0. | ||
310 | */ | ||
311 | static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, | ||
312 | unsigned flag, __u32 seq, int diff) | ||
313 | { | ||
314 | /* spinlock is to keep updating cp->flags atomic */ | ||
315 | spin_lock(&cp->lock); | ||
316 | if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { | ||
317 | vseq->previous_delta = vseq->delta; | ||
318 | vseq->delta += diff; | ||
319 | vseq->init_seq = seq; | ||
320 | cp->flags |= flag; | ||
321 | } | ||
322 | spin_unlock(&cp->lock); | ||
323 | } | ||
324 | |||
325 | static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, | ||
326 | struct ip_vs_app *app) | ||
327 | { | ||
328 | int diff; | ||
329 | const unsigned int tcp_offset = ip_hdrlen(skb); | ||
330 | struct tcphdr *th; | ||
331 | __u32 seq; | ||
332 | |||
333 | if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) | ||
334 | return 0; | ||
335 | |||
336 | th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); | ||
337 | |||
338 | /* | ||
339 | * Remember seq number in case this pkt gets resized | ||
340 | */ | ||
341 | seq = ntohl(th->seq); | ||
342 | |||
343 | /* | ||
344 | * Fix seq stuff if flagged as so. | ||
345 | */ | ||
346 | if (cp->flags & IP_VS_CONN_F_OUT_SEQ) | ||
347 | vs_fix_seq(&cp->out_seq, th); | ||
348 | if (cp->flags & IP_VS_CONN_F_IN_SEQ) | ||
349 | vs_fix_ack_seq(&cp->in_seq, th); | ||
350 | |||
351 | /* | ||
352 | * Call private output hook function | ||
353 | */ | ||
354 | if (app->pkt_out == NULL) | ||
355 | return 1; | ||
356 | |||
357 | if (!app->pkt_out(app, cp, skb, &diff)) | ||
358 | return 0; | ||
359 | |||
360 | /* | ||
361 | * Update ip_vs seq stuff if len has changed. | ||
362 | */ | ||
363 | if (diff != 0) | ||
364 | vs_seq_update(cp, &cp->out_seq, | ||
365 | IP_VS_CONN_F_OUT_SEQ, seq, diff); | ||
366 | |||
367 | return 1; | ||
368 | } | ||
369 | |||
370 | /* | ||
371 | * Output pkt hook. Will call bound ip_vs_app specific function | ||
372 | * called by ipvs packet handler, assumes previously checked cp!=NULL | ||
373 | * returns false if it can't handle packet (oom) | ||
374 | */ | ||
375 | int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) | ||
376 | { | ||
377 | struct ip_vs_app *app; | ||
378 | |||
379 | /* | ||
380 | * check if application module is bound to | ||
381 | * this ip_vs_conn. | ||
382 | */ | ||
383 | if ((app = cp->app) == NULL) | ||
384 | return 1; | ||
385 | |||
386 | /* TCP is complicated */ | ||
387 | if (cp->protocol == IPPROTO_TCP) | ||
388 | return app_tcp_pkt_out(cp, skb, app); | ||
389 | |||
390 | /* | ||
391 | * Call private output hook function | ||
392 | */ | ||
393 | if (app->pkt_out == NULL) | ||
394 | return 1; | ||
395 | |||
396 | return app->pkt_out(app, cp, skb, NULL); | ||
397 | } | ||
398 | |||
399 | |||
400 | static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, | ||
401 | struct ip_vs_app *app) | ||
402 | { | ||
403 | int diff; | ||
404 | const unsigned int tcp_offset = ip_hdrlen(skb); | ||
405 | struct tcphdr *th; | ||
406 | __u32 seq; | ||
407 | |||
408 | if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) | ||
409 | return 0; | ||
410 | |||
411 | th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); | ||
412 | |||
413 | /* | ||
414 | * Remember seq number in case this pkt gets resized | ||
415 | */ | ||
416 | seq = ntohl(th->seq); | ||
417 | |||
418 | /* | ||
419 | * Fix seq stuff if flagged as so. | ||
420 | */ | ||
421 | if (cp->flags & IP_VS_CONN_F_IN_SEQ) | ||
422 | vs_fix_seq(&cp->in_seq, th); | ||
423 | if (cp->flags & IP_VS_CONN_F_OUT_SEQ) | ||
424 | vs_fix_ack_seq(&cp->out_seq, th); | ||
425 | |||
426 | /* | ||
427 | * Call private input hook function | ||
428 | */ | ||
429 | if (app->pkt_in == NULL) | ||
430 | return 1; | ||
431 | |||
432 | if (!app->pkt_in(app, cp, skb, &diff)) | ||
433 | return 0; | ||
434 | |||
435 | /* | ||
436 | * Update ip_vs seq stuff if len has changed. | ||
437 | */ | ||
438 | if (diff != 0) | ||
439 | vs_seq_update(cp, &cp->in_seq, | ||
440 | IP_VS_CONN_F_IN_SEQ, seq, diff); | ||
441 | |||
442 | return 1; | ||
443 | } | ||
444 | |||
445 | /* | ||
446 | * Input pkt hook. Will call bound ip_vs_app specific function | ||
447 | * called by ipvs packet handler, assumes previously checked cp!=NULL. | ||
448 | * returns false if can't handle packet (oom). | ||
449 | */ | ||
450 | int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) | ||
451 | { | ||
452 | struct ip_vs_app *app; | ||
453 | |||
454 | /* | ||
455 | * check if application module is bound to | ||
456 | * this ip_vs_conn. | ||
457 | */ | ||
458 | if ((app = cp->app) == NULL) | ||
459 | return 1; | ||
460 | |||
461 | /* TCP is complicated */ | ||
462 | if (cp->protocol == IPPROTO_TCP) | ||
463 | return app_tcp_pkt_in(cp, skb, app); | ||
464 | |||
465 | /* | ||
466 | * Call private input hook function | ||
467 | */ | ||
468 | if (app->pkt_in == NULL) | ||
469 | return 1; | ||
470 | |||
471 | return app->pkt_in(app, cp, skb, NULL); | ||
472 | } | ||
473 | |||
474 | |||
475 | #ifdef CONFIG_PROC_FS | ||
476 | /* | ||
477 | * /proc/net/ip_vs_app entry function | ||
478 | */ | ||
479 | |||
480 | static struct ip_vs_app *ip_vs_app_idx(loff_t pos) | ||
481 | { | ||
482 | struct ip_vs_app *app, *inc; | ||
483 | |||
484 | list_for_each_entry(app, &ip_vs_app_list, a_list) { | ||
485 | list_for_each_entry(inc, &app->incs_list, a_list) { | ||
486 | if (pos-- == 0) | ||
487 | return inc; | ||
488 | } | ||
489 | } | ||
490 | return NULL; | ||
491 | |||
492 | } | ||
493 | |||
494 | static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) | ||
495 | { | ||
496 | mutex_lock(&__ip_vs_app_mutex); | ||
497 | |||
498 | return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; | ||
499 | } | ||
500 | |||
501 | static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
502 | { | ||
503 | struct ip_vs_app *inc, *app; | ||
504 | struct list_head *e; | ||
505 | |||
506 | ++*pos; | ||
507 | if (v == SEQ_START_TOKEN) | ||
508 | return ip_vs_app_idx(0); | ||
509 | |||
510 | inc = v; | ||
511 | app = inc->app; | ||
512 | |||
513 | if ((e = inc->a_list.next) != &app->incs_list) | ||
514 | return list_entry(e, struct ip_vs_app, a_list); | ||
515 | |||
516 | /* go on to next application */ | ||
517 | for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) { | ||
518 | app = list_entry(e, struct ip_vs_app, a_list); | ||
519 | list_for_each_entry(inc, &app->incs_list, a_list) { | ||
520 | return inc; | ||
521 | } | ||
522 | } | ||
523 | return NULL; | ||
524 | } | ||
525 | |||
526 | static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) | ||
527 | { | ||
528 | mutex_unlock(&__ip_vs_app_mutex); | ||
529 | } | ||
530 | |||
531 | static int ip_vs_app_seq_show(struct seq_file *seq, void *v) | ||
532 | { | ||
533 | if (v == SEQ_START_TOKEN) | ||
534 | seq_puts(seq, "prot port usecnt name\n"); | ||
535 | else { | ||
536 | const struct ip_vs_app *inc = v; | ||
537 | |||
538 | seq_printf(seq, "%-3s %-7u %-6d %-17s\n", | ||
539 | ip_vs_proto_name(inc->protocol), | ||
540 | ntohs(inc->port), | ||
541 | atomic_read(&inc->usecnt), | ||
542 | inc->name); | ||
543 | } | ||
544 | return 0; | ||
545 | } | ||
546 | |||
547 | static const struct seq_operations ip_vs_app_seq_ops = { | ||
548 | .start = ip_vs_app_seq_start, | ||
549 | .next = ip_vs_app_seq_next, | ||
550 | .stop = ip_vs_app_seq_stop, | ||
551 | .show = ip_vs_app_seq_show, | ||
552 | }; | ||
553 | |||
554 | static int ip_vs_app_open(struct inode *inode, struct file *file) | ||
555 | { | ||
556 | return seq_open(file, &ip_vs_app_seq_ops); | ||
557 | } | ||
558 | |||
559 | static const struct file_operations ip_vs_app_fops = { | ||
560 | .owner = THIS_MODULE, | ||
561 | .open = ip_vs_app_open, | ||
562 | .read = seq_read, | ||
563 | .llseek = seq_lseek, | ||
564 | .release = seq_release, | ||
565 | }; | ||
566 | #endif | ||
567 | |||
568 | |||
569 | /* | ||
570 | * Replace a segment of data with a new segment | ||
571 | */ | ||
572 | int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, | ||
573 | char *o_buf, int o_len, char *n_buf, int n_len) | ||
574 | { | ||
575 | int diff; | ||
576 | int o_offset; | ||
577 | int o_left; | ||
578 | |||
579 | EnterFunction(9); | ||
580 | |||
581 | diff = n_len - o_len; | ||
582 | o_offset = o_buf - (char *)skb->data; | ||
583 | /* The length of left data after o_buf+o_len in the skb data */ | ||
584 | o_left = skb->len - (o_offset + o_len); | ||
585 | |||
586 | if (diff <= 0) { | ||
587 | memmove(o_buf + n_len, o_buf + o_len, o_left); | ||
588 | memcpy(o_buf, n_buf, n_len); | ||
589 | skb_trim(skb, skb->len + diff); | ||
590 | } else if (diff <= skb_tailroom(skb)) { | ||
591 | skb_put(skb, diff); | ||
592 | memmove(o_buf + n_len, o_buf + o_len, o_left); | ||
593 | memcpy(o_buf, n_buf, n_len); | ||
594 | } else { | ||
595 | if (pskb_expand_head(skb, skb_headroom(skb), diff, pri)) | ||
596 | return -ENOMEM; | ||
597 | skb_put(skb, diff); | ||
598 | memmove(skb->data + o_offset + n_len, | ||
599 | skb->data + o_offset + o_len, o_left); | ||
600 | skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len); | ||
601 | } | ||
602 | |||
603 | /* must update the iph total length here */ | ||
604 | ip_hdr(skb)->tot_len = htons(skb->len); | ||
605 | |||
606 | LeaveFunction(9); | ||
607 | return 0; | ||
608 | } | ||
609 | |||
610 | |||
611 | int __init ip_vs_app_init(void) | ||
612 | { | ||
613 | /* we will replace it with proc_net_ipvs_create() soon */ | ||
614 | proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops); | ||
615 | return 0; | ||
616 | } | ||
617 | |||
618 | |||
619 | void ip_vs_app_cleanup(void) | ||
620 | { | ||
621 | proc_net_remove(&init_net, "ip_vs_app"); | ||
622 | } | ||
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c deleted file mode 100644 index 9a24332fbed8..000000000000 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ /dev/null | |||
@@ -1,1110 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS An implementation of the IP virtual server support for the | ||
3 | * LINUX operating system. IPVS is now implemented as a module | ||
4 | * over the Netfilter framework. IPVS can be used to build a | ||
5 | * high-performance and highly available server based on a | ||
6 | * cluster of servers. | ||
7 | * | ||
8 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
9 | * Peter Kese <peter.kese@ijs.si> | ||
10 | * Julian Anastasov <ja@ssi.bg> | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or | ||
13 | * modify it under the terms of the GNU General Public License | ||
14 | * as published by the Free Software Foundation; either version | ||
15 | * 2 of the License, or (at your option) any later version. | ||
16 | * | ||
17 | * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, | ||
18 | * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms | ||
19 | * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2. | ||
20 | * | ||
21 | * Changes: | ||
22 | * | ||
23 | */ | ||
24 | |||
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/in.h> | ||
27 | #include <linux/net.h> | ||
28 | #include <linux/kernel.h> | ||
29 | #include <linux/module.h> | ||
30 | #include <linux/vmalloc.h> | ||
31 | #include <linux/proc_fs.h> /* for proc_net_* */ | ||
32 | #include <linux/seq_file.h> | ||
33 | #include <linux/jhash.h> | ||
34 | #include <linux/random.h> | ||
35 | |||
36 | #include <net/net_namespace.h> | ||
37 | #include <net/ip_vs.h> | ||
38 | |||
39 | |||
/*
 *  Connection hash table: for input and output packets lookups of IPVS.
 *  Indexed by the value returned from ip_vs_conn_hashkey(); each slot is
 *  the head of a list of struct ip_vs_conn linked through c_list.
 */
static struct list_head *ip_vs_conn_tab;

/*  SLAB cache for IPVS connections */
static struct kmem_cache *ip_vs_conn_cachep __read_mostly;

/*  counter for current IPVS connections */
static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);

/*  counter for no client port connections (IP_VS_CONN_F_NO_CPORT set) */
static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);

/* random value for IPVS connection hash; passed to jhash as initval */
static unsigned int ip_vs_conn_rnd;

/*
 *  Fine locking granularity for big connection hash table:
 *  the table is guarded by an array of (1 << CT_LOCKARRAY_BITS) rwlocks,
 *  and a hash key selects its lock with (key & CT_LOCKARRAY_MASK).
 */
#define CT_LOCKARRAY_BITS  4
#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)

/* one rwlock per array slot, aligned to SMP_CACHE_BYTES */
struct ip_vs_aligned_lock
{
	rwlock_t	l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));

/* lock array for conn table */
static struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
72 | |||
73 | static inline void ct_read_lock(unsigned key) | ||
74 | { | ||
75 | read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); | ||
76 | } | ||
77 | |||
78 | static inline void ct_read_unlock(unsigned key) | ||
79 | { | ||
80 | read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); | ||
81 | } | ||
82 | |||
83 | static inline void ct_write_lock(unsigned key) | ||
84 | { | ||
85 | write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); | ||
86 | } | ||
87 | |||
88 | static inline void ct_write_unlock(unsigned key) | ||
89 | { | ||
90 | write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); | ||
91 | } | ||
92 | |||
93 | static inline void ct_read_lock_bh(unsigned key) | ||
94 | { | ||
95 | read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); | ||
96 | } | ||
97 | |||
98 | static inline void ct_read_unlock_bh(unsigned key) | ||
99 | { | ||
100 | read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); | ||
101 | } | ||
102 | |||
103 | static inline void ct_write_lock_bh(unsigned key) | ||
104 | { | ||
105 | write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); | ||
106 | } | ||
107 | |||
108 | static inline void ct_write_unlock_bh(unsigned key) | ||
109 | { | ||
110 | write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); | ||
111 | } | ||
112 | |||
113 | |||
114 | /* | ||
115 | * Returns hash value for IPVS connection entry | ||
116 | */ | ||
117 | static unsigned int ip_vs_conn_hashkey(int af, unsigned proto, | ||
118 | const union nf_inet_addr *addr, | ||
119 | __be16 port) | ||
120 | { | ||
121 | #ifdef CONFIG_IP_VS_IPV6 | ||
122 | if (af == AF_INET6) | ||
123 | return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), | ||
124 | (__force u32)port, proto, ip_vs_conn_rnd) | ||
125 | & IP_VS_CONN_TAB_MASK; | ||
126 | #endif | ||
127 | return jhash_3words((__force u32)addr->ip, (__force u32)port, proto, | ||
128 | ip_vs_conn_rnd) | ||
129 | & IP_VS_CONN_TAB_MASK; | ||
130 | } | ||
131 | |||
132 | |||
133 | /* | ||
134 | * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. | ||
135 | * returns bool success. | ||
136 | */ | ||
137 | static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) | ||
138 | { | ||
139 | unsigned hash; | ||
140 | int ret; | ||
141 | |||
142 | /* Hash by protocol, client address and port */ | ||
143 | hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); | ||
144 | |||
145 | ct_write_lock(hash); | ||
146 | |||
147 | if (!(cp->flags & IP_VS_CONN_F_HASHED)) { | ||
148 | list_add(&cp->c_list, &ip_vs_conn_tab[hash]); | ||
149 | cp->flags |= IP_VS_CONN_F_HASHED; | ||
150 | atomic_inc(&cp->refcnt); | ||
151 | ret = 1; | ||
152 | } else { | ||
153 | IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " | ||
154 | "called from %p\n", __builtin_return_address(0)); | ||
155 | ret = 0; | ||
156 | } | ||
157 | |||
158 | ct_write_unlock(hash); | ||
159 | |||
160 | return ret; | ||
161 | } | ||
162 | |||
163 | |||
164 | /* | ||
165 | * UNhashes ip_vs_conn from ip_vs_conn_tab. | ||
166 | * returns bool success. | ||
167 | */ | ||
168 | static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) | ||
169 | { | ||
170 | unsigned hash; | ||
171 | int ret; | ||
172 | |||
173 | /* unhash it and decrease its reference counter */ | ||
174 | hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); | ||
175 | |||
176 | ct_write_lock(hash); | ||
177 | |||
178 | if (cp->flags & IP_VS_CONN_F_HASHED) { | ||
179 | list_del(&cp->c_list); | ||
180 | cp->flags &= ~IP_VS_CONN_F_HASHED; | ||
181 | atomic_dec(&cp->refcnt); | ||
182 | ret = 1; | ||
183 | } else | ||
184 | ret = 0; | ||
185 | |||
186 | ct_write_unlock(hash); | ||
187 | |||
188 | return ret; | ||
189 | } | ||
190 | |||
191 | |||
192 | /* | ||
193 | * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. | ||
194 | * Called for pkts coming from OUTside-to-INside. | ||
195 | * s_addr, s_port: pkt source address (foreign host) | ||
196 | * d_addr, d_port: pkt dest address (load balancer) | ||
197 | */ | ||
198 | static inline struct ip_vs_conn *__ip_vs_conn_in_get | ||
199 | (int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, | ||
200 | const union nf_inet_addr *d_addr, __be16 d_port) | ||
201 | { | ||
202 | unsigned hash; | ||
203 | struct ip_vs_conn *cp; | ||
204 | |||
205 | hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); | ||
206 | |||
207 | ct_read_lock(hash); | ||
208 | |||
209 | list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { | ||
210 | if (cp->af == af && | ||
211 | ip_vs_addr_equal(af, s_addr, &cp->caddr) && | ||
212 | ip_vs_addr_equal(af, d_addr, &cp->vaddr) && | ||
213 | s_port == cp->cport && d_port == cp->vport && | ||
214 | ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && | ||
215 | protocol == cp->protocol) { | ||
216 | /* HIT */ | ||
217 | atomic_inc(&cp->refcnt); | ||
218 | ct_read_unlock(hash); | ||
219 | return cp; | ||
220 | } | ||
221 | } | ||
222 | |||
223 | ct_read_unlock(hash); | ||
224 | |||
225 | return NULL; | ||
226 | } | ||
227 | |||
228 | struct ip_vs_conn *ip_vs_conn_in_get | ||
229 | (int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, | ||
230 | const union nf_inet_addr *d_addr, __be16 d_port) | ||
231 | { | ||
232 | struct ip_vs_conn *cp; | ||
233 | |||
234 | cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port); | ||
235 | if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) | ||
236 | cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr, | ||
237 | d_port); | ||
238 | |||
239 | IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", | ||
240 | ip_vs_proto_name(protocol), | ||
241 | IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), | ||
242 | IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), | ||
243 | cp ? "hit" : "not hit"); | ||
244 | |||
245 | return cp; | ||
246 | } | ||
247 | |||
248 | /* Get reference to connection template */ | ||
249 | struct ip_vs_conn *ip_vs_ct_in_get | ||
250 | (int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, | ||
251 | const union nf_inet_addr *d_addr, __be16 d_port) | ||
252 | { | ||
253 | unsigned hash; | ||
254 | struct ip_vs_conn *cp; | ||
255 | |||
256 | hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); | ||
257 | |||
258 | ct_read_lock(hash); | ||
259 | |||
260 | list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { | ||
261 | if (cp->af == af && | ||
262 | ip_vs_addr_equal(af, s_addr, &cp->caddr) && | ||
263 | ip_vs_addr_equal(af, d_addr, &cp->vaddr) && | ||
264 | s_port == cp->cport && d_port == cp->vport && | ||
265 | cp->flags & IP_VS_CONN_F_TEMPLATE && | ||
266 | protocol == cp->protocol) { | ||
267 | /* HIT */ | ||
268 | atomic_inc(&cp->refcnt); | ||
269 | goto out; | ||
270 | } | ||
271 | } | ||
272 | cp = NULL; | ||
273 | |||
274 | out: | ||
275 | ct_read_unlock(hash); | ||
276 | |||
277 | IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", | ||
278 | ip_vs_proto_name(protocol), | ||
279 | IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), | ||
280 | IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), | ||
281 | cp ? "hit" : "not hit"); | ||
282 | |||
283 | return cp; | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. | ||
288 | * Called for pkts coming from inside-to-OUTside. | ||
289 | * s_addr, s_port: pkt source address (inside host) | ||
290 | * d_addr, d_port: pkt dest address (foreign host) | ||
291 | */ | ||
292 | struct ip_vs_conn *ip_vs_conn_out_get | ||
293 | (int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, | ||
294 | const union nf_inet_addr *d_addr, __be16 d_port) | ||
295 | { | ||
296 | unsigned hash; | ||
297 | struct ip_vs_conn *cp, *ret=NULL; | ||
298 | |||
299 | /* | ||
300 | * Check for "full" addressed entries | ||
301 | */ | ||
302 | hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port); | ||
303 | |||
304 | ct_read_lock(hash); | ||
305 | |||
306 | list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { | ||
307 | if (cp->af == af && | ||
308 | ip_vs_addr_equal(af, d_addr, &cp->caddr) && | ||
309 | ip_vs_addr_equal(af, s_addr, &cp->daddr) && | ||
310 | d_port == cp->cport && s_port == cp->dport && | ||
311 | protocol == cp->protocol) { | ||
312 | /* HIT */ | ||
313 | atomic_inc(&cp->refcnt); | ||
314 | ret = cp; | ||
315 | break; | ||
316 | } | ||
317 | } | ||
318 | |||
319 | ct_read_unlock(hash); | ||
320 | |||
321 | IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", | ||
322 | ip_vs_proto_name(protocol), | ||
323 | IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), | ||
324 | IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), | ||
325 | ret ? "hit" : "not hit"); | ||
326 | |||
327 | return ret; | ||
328 | } | ||
329 | |||
330 | |||
/*
 *      Put back the conn and restart its timer with its timeout.
 *      The timer is re-armed to fire cp->timeout jiffies from now, then
 *      the conn is handed to __ip_vs_conn_put() (defined outside this
 *      file; presumably it drops the caller's reference).
 */
void ip_vs_conn_put(struct ip_vs_conn *cp)
{
	/* reset it expire in its timeout */
	mod_timer(&cp->timer, jiffies+cp->timeout);

	__ip_vs_conn_put(cp);
}
341 | |||
342 | |||
343 | /* | ||
344 | * Fill a no_client_port connection with a client port number | ||
345 | */ | ||
346 | void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) | ||
347 | { | ||
348 | if (ip_vs_conn_unhash(cp)) { | ||
349 | spin_lock(&cp->lock); | ||
350 | if (cp->flags & IP_VS_CONN_F_NO_CPORT) { | ||
351 | atomic_dec(&ip_vs_conn_no_cport_cnt); | ||
352 | cp->flags &= ~IP_VS_CONN_F_NO_CPORT; | ||
353 | cp->cport = cport; | ||
354 | } | ||
355 | spin_unlock(&cp->lock); | ||
356 | |||
357 | /* hash on new dport */ | ||
358 | ip_vs_conn_hash(cp); | ||
359 | } | ||
360 | } | ||
361 | |||
362 | |||
363 | /* | ||
364 | * Bind a connection entry with the corresponding packet_xmit. | ||
365 | * Called by ip_vs_conn_new. | ||
366 | */ | ||
367 | static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) | ||
368 | { | ||
369 | switch (IP_VS_FWD_METHOD(cp)) { | ||
370 | case IP_VS_CONN_F_MASQ: | ||
371 | cp->packet_xmit = ip_vs_nat_xmit; | ||
372 | break; | ||
373 | |||
374 | case IP_VS_CONN_F_TUNNEL: | ||
375 | cp->packet_xmit = ip_vs_tunnel_xmit; | ||
376 | break; | ||
377 | |||
378 | case IP_VS_CONN_F_DROUTE: | ||
379 | cp->packet_xmit = ip_vs_dr_xmit; | ||
380 | break; | ||
381 | |||
382 | case IP_VS_CONN_F_LOCALNODE: | ||
383 | cp->packet_xmit = ip_vs_null_xmit; | ||
384 | break; | ||
385 | |||
386 | case IP_VS_CONN_F_BYPASS: | ||
387 | cp->packet_xmit = ip_vs_bypass_xmit; | ||
388 | break; | ||
389 | } | ||
390 | } | ||
391 | |||
392 | #ifdef CONFIG_IP_VS_IPV6 | ||
393 | static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) | ||
394 | { | ||
395 | switch (IP_VS_FWD_METHOD(cp)) { | ||
396 | case IP_VS_CONN_F_MASQ: | ||
397 | cp->packet_xmit = ip_vs_nat_xmit_v6; | ||
398 | break; | ||
399 | |||
400 | case IP_VS_CONN_F_TUNNEL: | ||
401 | cp->packet_xmit = ip_vs_tunnel_xmit_v6; | ||
402 | break; | ||
403 | |||
404 | case IP_VS_CONN_F_DROUTE: | ||
405 | cp->packet_xmit = ip_vs_dr_xmit_v6; | ||
406 | break; | ||
407 | |||
408 | case IP_VS_CONN_F_LOCALNODE: | ||
409 | cp->packet_xmit = ip_vs_null_xmit; | ||
410 | break; | ||
411 | |||
412 | case IP_VS_CONN_F_BYPASS: | ||
413 | cp->packet_xmit = ip_vs_bypass_xmit_v6; | ||
414 | break; | ||
415 | } | ||
416 | } | ||
417 | #endif | ||
418 | |||
419 | |||
420 | static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) | ||
421 | { | ||
422 | return atomic_read(&dest->activeconns) | ||
423 | + atomic_read(&dest->inactconns); | ||
424 | } | ||
425 | |||
426 | /* | ||
427 | * Bind a connection entry with a virtual service destination | ||
428 | * Called just after a new connection entry is created. | ||
429 | */ | ||
430 | static inline void | ||
431 | ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) | ||
432 | { | ||
433 | /* if dest is NULL, then return directly */ | ||
434 | if (!dest) | ||
435 | return; | ||
436 | |||
437 | /* Increase the refcnt counter of the dest */ | ||
438 | atomic_inc(&dest->refcnt); | ||
439 | |||
440 | /* Bind with the destination and its corresponding transmitter */ | ||
441 | if ((cp->flags & IP_VS_CONN_F_SYNC) && | ||
442 | (!(cp->flags & IP_VS_CONN_F_TEMPLATE))) | ||
443 | /* if the connection is not template and is created | ||
444 | * by sync, preserve the activity flag. | ||
445 | */ | ||
446 | cp->flags |= atomic_read(&dest->conn_flags) & | ||
447 | (~IP_VS_CONN_F_INACTIVE); | ||
448 | else | ||
449 | cp->flags |= atomic_read(&dest->conn_flags); | ||
450 | cp->dest = dest; | ||
451 | |||
452 | IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " | ||
453 | "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " | ||
454 | "dest->refcnt:%d\n", | ||
455 | ip_vs_proto_name(cp->protocol), | ||
456 | IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), | ||
457 | IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), | ||
458 | IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), | ||
459 | ip_vs_fwd_tag(cp), cp->state, | ||
460 | cp->flags, atomic_read(&cp->refcnt), | ||
461 | atomic_read(&dest->refcnt)); | ||
462 | |||
463 | /* Update the connection counters */ | ||
464 | if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { | ||
465 | /* It is a normal connection, so increase the inactive | ||
466 | connection counter because it is in TCP SYNRECV | ||
467 | state (inactive) or other protocol inacive state */ | ||
468 | if ((cp->flags & IP_VS_CONN_F_SYNC) && | ||
469 | (!(cp->flags & IP_VS_CONN_F_INACTIVE))) | ||
470 | atomic_inc(&dest->activeconns); | ||
471 | else | ||
472 | atomic_inc(&dest->inactconns); | ||
473 | } else { | ||
474 | /* It is a persistent connection/template, so increase | ||
475 | the peristent connection counter */ | ||
476 | atomic_inc(&dest->persistconns); | ||
477 | } | ||
478 | |||
479 | if (dest->u_threshold != 0 && | ||
480 | ip_vs_dest_totalconns(dest) >= dest->u_threshold) | ||
481 | dest->flags |= IP_VS_DEST_F_OVERLOAD; | ||
482 | } | ||
483 | |||
484 | |||
485 | /* | ||
486 | * Check if there is a destination for the connection, if so | ||
487 | * bind the connection to the destination. | ||
488 | */ | ||
489 | struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) | ||
490 | { | ||
491 | struct ip_vs_dest *dest; | ||
492 | |||
493 | if ((cp) && (!cp->dest)) { | ||
494 | dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport, | ||
495 | &cp->vaddr, cp->vport, | ||
496 | cp->protocol); | ||
497 | ip_vs_bind_dest(cp, dest); | ||
498 | return dest; | ||
499 | } else | ||
500 | return NULL; | ||
501 | } | ||
502 | |||
503 | |||
504 | /* | ||
505 | * Unbind a connection entry with its VS destination | ||
506 | * Called by the ip_vs_conn_expire function. | ||
507 | */ | ||
508 | static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) | ||
509 | { | ||
510 | struct ip_vs_dest *dest = cp->dest; | ||
511 | |||
512 | if (!dest) | ||
513 | return; | ||
514 | |||
515 | IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d " | ||
516 | "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " | ||
517 | "dest->refcnt:%d\n", | ||
518 | ip_vs_proto_name(cp->protocol), | ||
519 | IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), | ||
520 | IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), | ||
521 | IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), | ||
522 | ip_vs_fwd_tag(cp), cp->state, | ||
523 | cp->flags, atomic_read(&cp->refcnt), | ||
524 | atomic_read(&dest->refcnt)); | ||
525 | |||
526 | /* Update the connection counters */ | ||
527 | if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { | ||
528 | /* It is a normal connection, so decrease the inactconns | ||
529 | or activeconns counter */ | ||
530 | if (cp->flags & IP_VS_CONN_F_INACTIVE) { | ||
531 | atomic_dec(&dest->inactconns); | ||
532 | } else { | ||
533 | atomic_dec(&dest->activeconns); | ||
534 | } | ||
535 | } else { | ||
536 | /* It is a persistent connection/template, so decrease | ||
537 | the peristent connection counter */ | ||
538 | atomic_dec(&dest->persistconns); | ||
539 | } | ||
540 | |||
541 | if (dest->l_threshold != 0) { | ||
542 | if (ip_vs_dest_totalconns(dest) < dest->l_threshold) | ||
543 | dest->flags &= ~IP_VS_DEST_F_OVERLOAD; | ||
544 | } else if (dest->u_threshold != 0) { | ||
545 | if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) | ||
546 | dest->flags &= ~IP_VS_DEST_F_OVERLOAD; | ||
547 | } else { | ||
548 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) | ||
549 | dest->flags &= ~IP_VS_DEST_F_OVERLOAD; | ||
550 | } | ||
551 | |||
552 | /* | ||
553 | * Simply decrease the refcnt of the dest, because the | ||
554 | * dest will be either in service's destination list | ||
555 | * or in the trash. | ||
556 | */ | ||
557 | atomic_dec(&dest->refcnt); | ||
558 | } | ||
559 | |||
560 | |||
561 | /* | ||
562 | * Checking if the destination of a connection template is available. | ||
563 | * If available, return 1, otherwise invalidate this connection | ||
564 | * template and return 0. | ||
565 | */ | ||
566 | int ip_vs_check_template(struct ip_vs_conn *ct) | ||
567 | { | ||
568 | struct ip_vs_dest *dest = ct->dest; | ||
569 | |||
570 | /* | ||
571 | * Checking the dest server status. | ||
572 | */ | ||
573 | if ((dest == NULL) || | ||
574 | !(dest->flags & IP_VS_DEST_F_AVAILABLE) || | ||
575 | (sysctl_ip_vs_expire_quiescent_template && | ||
576 | (atomic_read(&dest->weight) == 0))) { | ||
577 | IP_VS_DBG_BUF(9, "check_template: dest not available for " | ||
578 | "protocol %s s:%s:%d v:%s:%d " | ||
579 | "-> d:%s:%d\n", | ||
580 | ip_vs_proto_name(ct->protocol), | ||
581 | IP_VS_DBG_ADDR(ct->af, &ct->caddr), | ||
582 | ntohs(ct->cport), | ||
583 | IP_VS_DBG_ADDR(ct->af, &ct->vaddr), | ||
584 | ntohs(ct->vport), | ||
585 | IP_VS_DBG_ADDR(ct->af, &ct->daddr), | ||
586 | ntohs(ct->dport)); | ||
587 | |||
588 | /* | ||
589 | * Invalidate the connection template | ||
590 | */ | ||
591 | if (ct->vport != htons(0xffff)) { | ||
592 | if (ip_vs_conn_unhash(ct)) { | ||
593 | ct->dport = htons(0xffff); | ||
594 | ct->vport = htons(0xffff); | ||
595 | ct->cport = 0; | ||
596 | ip_vs_conn_hash(ct); | ||
597 | } | ||
598 | } | ||
599 | |||
600 | /* | ||
601 | * Simply decrease the refcnt of the template, | ||
602 | * don't restart its timer. | ||
603 | */ | ||
604 | atomic_dec(&ct->refcnt); | ||
605 | return 0; | ||
606 | } | ||
607 | return 1; | ||
608 | } | ||
609 | |||
/*
 *	Timer handler for a connection; data is the struct ip_vs_conn
 *	pointer registered with setup_timer() in ip_vs_conn_new().
 *
 *	The connection is torn down and freed only if it controls no other
 *	connection, it could be removed from the hash table, and this
 *	handler then holds the only remaining reference. In every other
 *	case the connection is kept and ip_vs_conn_put() re-arms its timer
 *	with the 60*HZ timeout set at the top.
 */
static void ip_vs_conn_expire(unsigned long data)
{
	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;

	/* timeout used if the expiry has to be retried later */
	cp->timeout = 60*HZ;

	/*
	 *	hey, I'm using it
	 */
	atomic_inc(&cp->refcnt);

	/*
	 *	do I control anybody?
	 */
	if (atomic_read(&cp->n_control))
		goto expire_later;

	/*
	 *	unhash it if it is hashed in the conn table
	 *	(this also drops the reference held by the table)
	 */
	if (!ip_vs_conn_unhash(cp))
		goto expire_later;

	/*
	 *	refcnt==1 implies I'm the only one referrer
	 */
	if (likely(atomic_read(&cp->refcnt) == 1)) {
		/* delete the timer if it is activated by other users */
		if (timer_pending(&cp->timer))
			del_timer(&cp->timer);

		/* does anybody control me? */
		if (cp->control)
			ip_vs_control_del(cp);

		/* release the application helper and destination bindings */
		if (unlikely(cp->app != NULL))
			ip_vs_unbind_app(cp);
		ip_vs_unbind_dest(cp);
		/* undo the global counters bumped in ip_vs_conn_new() */
		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
			atomic_dec(&ip_vs_conn_no_cport_cnt);
		atomic_dec(&ip_vs_conn_count);

		kmem_cache_free(ip_vs_conn_cachep, cp);
		return;
	}

	/* somebody else still holds a reference: hash it back to the table */
	ip_vs_conn_hash(cp);

  expire_later:
	IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
		  atomic_read(&cp->refcnt)-1,
		  atomic_read(&cp->n_control));

	/* re-arms the timer and hands back the reference taken above */
	ip_vs_conn_put(cp);
}
666 | |||
667 | |||
/*
 *	Make a connection's timer fire as soon as possible.
 *	The timer is only re-armed (to the current jiffies) if del_timer()
 *	reports that it was pending; otherwise nothing is done.
 */
void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
	if (del_timer(&cp->timer))
		mod_timer(&cp->timer, jiffies);
}
673 | |||
674 | |||
/*
 *	Create a new connection entry and hash it into the ip_vs_conn_tab
 *
 *	af:            address family of the three addresses
 *	proto:         protocol number, also used to look up the protocol
 *	               handler for application binding
 *	caddr/cport:   client address and port
 *	vaddr/vport:   virtual service address and port
 *	daddr/dport:   destination (real server) address and port
 *	flags:         initial IP_VS_CONN_F_* flags
 *	dest:          destination to bind to, may be NULL
 *
 *	Returns the new entry holding one reference for the caller (plus
 *	the one taken by the hash table), or NULL if the allocation fails.
 *	The allocation uses GFP_ATOMIC.
 */
struct ip_vs_conn *
ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
	       const union nf_inet_addr *vaddr, __be16 vport,
	       const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
	       struct ip_vs_dest *dest)
{
	struct ip_vs_conn *cp;
	struct ip_vs_protocol *pp = ip_vs_proto_get(proto);

	cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
	if (cp == NULL) {
		IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
		return NULL;
	}

	INIT_LIST_HEAD(&cp->c_list);
	/* ip_vs_conn_expire() receives cp back as its data argument */
	setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
	cp->af		   = af;
	cp->protocol	   = proto;
	ip_vs_addr_copy(af, &cp->caddr, caddr);
	cp->cport	   = cport;
	ip_vs_addr_copy(af, &cp->vaddr, vaddr);
	cp->vport	   = vport;
	ip_vs_addr_copy(af, &cp->daddr, daddr);
	cp->dport          = dport;
	cp->flags	   = flags;
	spin_lock_init(&cp->lock);

	/*
	 * Set the entry is referenced by the current thread before hashing
	 * it in the table, so that other thread run ip_vs_random_dropentry
	 * but cannot drop this entry.
	 */
	atomic_set(&cp->refcnt, 1);

	atomic_set(&cp->n_control, 0);
	atomic_set(&cp->in_pkts, 0);

	/* global accounting; undone in ip_vs_conn_expire() */
	atomic_inc(&ip_vs_conn_count);
	if (flags & IP_VS_CONN_F_NO_CPORT)
		atomic_inc(&ip_vs_conn_no_cport_cnt);

	/* Bind the connection with a destination server */
	ip_vs_bind_dest(cp, dest);

	/* Set its state and timeout */
	cp->state = 0;
	cp->timeout = 3*HZ;

	/* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		ip_vs_bind_xmit_v6(cp);
	else
#endif
		ip_vs_bind_xmit(cp);

	/* bind an application helper only if the protocol has any registered */
	if (unlikely(pp && atomic_read(&pp->appcnt)))
		ip_vs_bind_app(cp, pp);

	/* Hash it in the ip_vs_conn_tab finally */
	ip_vs_conn_hash(cp);

	return cp;
}
743 | |||
744 | |||
745 | /* | ||
746 | * /proc/net/ip_vs_conn entries | ||
747 | */ | ||
748 | #ifdef CONFIG_PROC_FS | ||
749 | |||
750 | static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) | ||
751 | { | ||
752 | int idx; | ||
753 | struct ip_vs_conn *cp; | ||
754 | |||
755 | for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { | ||
756 | ct_read_lock_bh(idx); | ||
757 | list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { | ||
758 | if (pos-- == 0) { | ||
759 | seq->private = &ip_vs_conn_tab[idx]; | ||
760 | return cp; | ||
761 | } | ||
762 | } | ||
763 | ct_read_unlock_bh(idx); | ||
764 | } | ||
765 | |||
766 | return NULL; | ||
767 | } | ||
768 | |||
769 | static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) | ||
770 | { | ||
771 | seq->private = NULL; | ||
772 | return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; | ||
773 | } | ||
774 | |||
775 | static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
776 | { | ||
777 | struct ip_vs_conn *cp = v; | ||
778 | struct list_head *e, *l = seq->private; | ||
779 | int idx; | ||
780 | |||
781 | ++*pos; | ||
782 | if (v == SEQ_START_TOKEN) | ||
783 | return ip_vs_conn_array(seq, 0); | ||
784 | |||
785 | /* more on same hash chain? */ | ||
786 | if ((e = cp->c_list.next) != l) | ||
787 | return list_entry(e, struct ip_vs_conn, c_list); | ||
788 | |||
789 | idx = l - ip_vs_conn_tab; | ||
790 | ct_read_unlock_bh(idx); | ||
791 | |||
792 | while (++idx < IP_VS_CONN_TAB_SIZE) { | ||
793 | ct_read_lock_bh(idx); | ||
794 | list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { | ||
795 | seq->private = &ip_vs_conn_tab[idx]; | ||
796 | return cp; | ||
797 | } | ||
798 | ct_read_unlock_bh(idx); | ||
799 | } | ||
800 | seq->private = NULL; | ||
801 | return NULL; | ||
802 | } | ||
803 | |||
/*
 *	seq_file stop callback: if the iteration left a bucket locked
 *	(seq->private points at its list head), release that read lock.
 */
static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
{
	struct list_head *l = seq->private;

	if (l)
		ct_read_unlock_bh(l - ip_vs_conn_tab);
}
811 | |||
812 | static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) | ||
813 | { | ||
814 | |||
815 | if (v == SEQ_START_TOKEN) | ||
816 | seq_puts(seq, | ||
817 | "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); | ||
818 | else { | ||
819 | const struct ip_vs_conn *cp = v; | ||
820 | |||
821 | #ifdef CONFIG_IP_VS_IPV6 | ||
822 | if (cp->af == AF_INET6) | ||
823 | seq_printf(seq, | ||
824 | "%-3s " NIP6_FMT " %04X " NIP6_FMT | ||
825 | " %04X " NIP6_FMT " %04X %-11s %7lu\n", | ||
826 | ip_vs_proto_name(cp->protocol), | ||
827 | NIP6(cp->caddr.in6), ntohs(cp->cport), | ||
828 | NIP6(cp->vaddr.in6), ntohs(cp->vport), | ||
829 | NIP6(cp->daddr.in6), ntohs(cp->dport), | ||
830 | ip_vs_state_name(cp->protocol, cp->state), | ||
831 | (cp->timer.expires-jiffies)/HZ); | ||
832 | else | ||
833 | #endif | ||
834 | seq_printf(seq, | ||
835 | "%-3s %08X %04X %08X %04X" | ||
836 | " %08X %04X %-11s %7lu\n", | ||
837 | ip_vs_proto_name(cp->protocol), | ||
838 | ntohl(cp->caddr.ip), ntohs(cp->cport), | ||
839 | ntohl(cp->vaddr.ip), ntohs(cp->vport), | ||
840 | ntohl(cp->daddr.ip), ntohs(cp->dport), | ||
841 | ip_vs_state_name(cp->protocol, cp->state), | ||
842 | (cp->timer.expires-jiffies)/HZ); | ||
843 | } | ||
844 | return 0; | ||
845 | } | ||
846 | |||
/* seq_file iterator for the plain connection listing */
static const struct seq_operations ip_vs_conn_seq_ops = {
	.start = ip_vs_conn_seq_start,
	.next  = ip_vs_conn_seq_next,
	.stop  = ip_vs_conn_seq_stop,
	.show  = ip_vs_conn_seq_show,
};

/* open handler: attach the iterator above to the file */
static int ip_vs_conn_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &ip_vs_conn_seq_ops);
}

/* file operations for the listing; everything but open is stock seq_file */
static const struct file_operations ip_vs_conn_fops = {
	.owner	 = THIS_MODULE,
	.open    = ip_vs_conn_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
866 | |||
867 | static const char *ip_vs_origin_name(unsigned flags) | ||
868 | { | ||
869 | if (flags & IP_VS_CONN_F_SYNC) | ||
870 | return "SYNC"; | ||
871 | else | ||
872 | return "LOCAL"; | ||
873 | } | ||
874 | |||
875 | static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) | ||
876 | { | ||
877 | |||
878 | if (v == SEQ_START_TOKEN) | ||
879 | seq_puts(seq, | ||
880 | "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); | ||
881 | else { | ||
882 | const struct ip_vs_conn *cp = v; | ||
883 | |||
884 | #ifdef CONFIG_IP_VS_IPV6 | ||
885 | if (cp->af == AF_INET6) | ||
886 | seq_printf(seq, | ||
887 | "%-3s " NIP6_FMT " %04X " NIP6_FMT | ||
888 | " %04X " NIP6_FMT " %04X %-11s %-6s %7lu\n", | ||
889 | ip_vs_proto_name(cp->protocol), | ||
890 | NIP6(cp->caddr.in6), ntohs(cp->cport), | ||
891 | NIP6(cp->vaddr.in6), ntohs(cp->vport), | ||
892 | NIP6(cp->daddr.in6), ntohs(cp->dport), | ||
893 | ip_vs_state_name(cp->protocol, cp->state), | ||
894 | ip_vs_origin_name(cp->flags), | ||
895 | (cp->timer.expires-jiffies)/HZ); | ||
896 | else | ||
897 | #endif | ||
898 | seq_printf(seq, | ||
899 | "%-3s %08X %04X %08X %04X " | ||
900 | "%08X %04X %-11s %-6s %7lu\n", | ||
901 | ip_vs_proto_name(cp->protocol), | ||
902 | ntohl(cp->caddr.ip), ntohs(cp->cport), | ||
903 | ntohl(cp->vaddr.ip), ntohs(cp->vport), | ||
904 | ntohl(cp->daddr.ip), ntohs(cp->dport), | ||
905 | ip_vs_state_name(cp->protocol, cp->state), | ||
906 | ip_vs_origin_name(cp->flags), | ||
907 | (cp->timer.expires-jiffies)/HZ); | ||
908 | } | ||
909 | return 0; | ||
910 | } | ||
911 | |||
/*
 * seq_file iterator for the sync listing: shares start/next/stop with the
 * plain listing and differs only in the show callback.
 */
static const struct seq_operations ip_vs_conn_sync_seq_ops = {
	.start = ip_vs_conn_seq_start,
	.next  = ip_vs_conn_seq_next,
	.stop  = ip_vs_conn_seq_stop,
	.show  = ip_vs_conn_sync_seq_show,
};

/* open handler: attach the sync iterator to the file */
static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &ip_vs_conn_sync_seq_ops);
}

/* file operations for the sync listing; stock seq_file apart from open */
static const struct file_operations ip_vs_conn_sync_fops = {
	.owner	 = THIS_MODULE,
	.open    = ip_vs_conn_sync_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
931 | |||
932 | #endif | ||
933 | |||
934 | |||
935 | /* | ||
936 | * Randomly drop connection entries before running out of memory | ||
937 | */ | ||
938 | static inline int todrop_entry(struct ip_vs_conn *cp) | ||
939 | { | ||
940 | /* | ||
941 | * The drop rate array needs tuning for real environments. | ||
942 | * Called from timer bh only => no locking | ||
943 | */ | ||
944 | static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; | ||
945 | static char todrop_counter[9] = {0}; | ||
946 | int i; | ||
947 | |||
948 | /* if the conn entry hasn't lasted for 60 seconds, don't drop it. | ||
949 | This will leave enough time for normal connection to get | ||
950 | through. */ | ||
951 | if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) | ||
952 | return 0; | ||
953 | |||
954 | /* Don't drop the entry if its number of incoming packets is not | ||
955 | located in [0, 8] */ | ||
956 | i = atomic_read(&cp->in_pkts); | ||
957 | if (i > 8 || i < 0) return 0; | ||
958 | |||
959 | if (!todrop_rate[i]) return 0; | ||
960 | if (--todrop_counter[i] > 0) return 0; | ||
961 | |||
962 | todrop_counter[i] = todrop_rate[i]; | ||
963 | return 1; | ||
964 | } | ||
965 | |||
966 | /* Called from keventd and must protect itself from softirqs */ | ||
967 | void ip_vs_random_dropentry(void) | ||
968 | { | ||
969 | int idx; | ||
970 | struct ip_vs_conn *cp; | ||
971 | |||
972 | /* | ||
973 | * Randomly scan 1/32 of the whole table every second | ||
974 | */ | ||
975 | for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { | ||
976 | unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; | ||
977 | |||
978 | /* | ||
979 | * Lock is actually needed in this loop. | ||
980 | */ | ||
981 | ct_write_lock_bh(hash); | ||
982 | |||
983 | list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { | ||
984 | if (cp->flags & IP_VS_CONN_F_TEMPLATE) | ||
985 | /* connection template */ | ||
986 | continue; | ||
987 | |||
988 | if (cp->protocol == IPPROTO_TCP) { | ||
989 | switch(cp->state) { | ||
990 | case IP_VS_TCP_S_SYN_RECV: | ||
991 | case IP_VS_TCP_S_SYNACK: | ||
992 | break; | ||
993 | |||
994 | case IP_VS_TCP_S_ESTABLISHED: | ||
995 | if (todrop_entry(cp)) | ||
996 | break; | ||
997 | continue; | ||
998 | |||
999 | default: | ||
1000 | continue; | ||
1001 | } | ||
1002 | } else { | ||
1003 | if (!todrop_entry(cp)) | ||
1004 | continue; | ||
1005 | } | ||
1006 | |||
1007 | IP_VS_DBG(4, "del connection\n"); | ||
1008 | ip_vs_conn_expire_now(cp); | ||
1009 | if (cp->control) { | ||
1010 | IP_VS_DBG(4, "del conn template\n"); | ||
1011 | ip_vs_conn_expire_now(cp->control); | ||
1012 | } | ||
1013 | } | ||
1014 | ct_write_unlock_bh(hash); | ||
1015 | } | ||
1016 | } | ||
1017 | |||
1018 | |||
1019 | /* | ||
1020 | * Flush all the connection entries in the ip_vs_conn_tab | ||
1021 | */ | ||
1022 | static void ip_vs_conn_flush(void) | ||
1023 | { | ||
1024 | int idx; | ||
1025 | struct ip_vs_conn *cp; | ||
1026 | |||
1027 | flush_again: | ||
1028 | for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { | ||
1029 | /* | ||
1030 | * Lock is actually needed in this loop. | ||
1031 | */ | ||
1032 | ct_write_lock_bh(idx); | ||
1033 | |||
1034 | list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { | ||
1035 | |||
1036 | IP_VS_DBG(4, "del connection\n"); | ||
1037 | ip_vs_conn_expire_now(cp); | ||
1038 | if (cp->control) { | ||
1039 | IP_VS_DBG(4, "del conn template\n"); | ||
1040 | ip_vs_conn_expire_now(cp->control); | ||
1041 | } | ||
1042 | } | ||
1043 | ct_write_unlock_bh(idx); | ||
1044 | } | ||
1045 | |||
1046 | /* the counter may be not NULL, because maybe some conn entries | ||
1047 | are run by slow timer handler or unhashed but still referred */ | ||
1048 | if (atomic_read(&ip_vs_conn_count) != 0) { | ||
1049 | schedule(); | ||
1050 | goto flush_again; | ||
1051 | } | ||
1052 | } | ||
1053 | |||
1054 | |||
1055 | int __init ip_vs_conn_init(void) | ||
1056 | { | ||
1057 | int idx; | ||
1058 | |||
1059 | /* | ||
1060 | * Allocate the connection hash table and initialize its list heads | ||
1061 | */ | ||
1062 | ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); | ||
1063 | if (!ip_vs_conn_tab) | ||
1064 | return -ENOMEM; | ||
1065 | |||
1066 | /* Allocate ip_vs_conn slab cache */ | ||
1067 | ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", | ||
1068 | sizeof(struct ip_vs_conn), 0, | ||
1069 | SLAB_HWCACHE_ALIGN, NULL); | ||
1070 | if (!ip_vs_conn_cachep) { | ||
1071 | vfree(ip_vs_conn_tab); | ||
1072 | return -ENOMEM; | ||
1073 | } | ||
1074 | |||
1075 | IP_VS_INFO("Connection hash table configured " | ||
1076 | "(size=%d, memory=%ldKbytes)\n", | ||
1077 | IP_VS_CONN_TAB_SIZE, | ||
1078 | (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); | ||
1079 | IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", | ||
1080 | sizeof(struct ip_vs_conn)); | ||
1081 | |||
1082 | for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { | ||
1083 | INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); | ||
1084 | } | ||
1085 | |||
1086 | for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { | ||
1087 | rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); | ||
1088 | } | ||
1089 | |||
1090 | proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); | ||
1091 | proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); | ||
1092 | |||
1093 | /* calculate the random value for connection hash */ | ||
1094 | get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); | ||
1095 | |||
1096 | return 0; | ||
1097 | } | ||
1098 | |||
1099 | |||
/*
 * Tear down what ip_vs_conn_init() set up: expire every connection,
 * destroy the slab cache, remove the two proc entries and free the
 * hash table.
 */
void ip_vs_conn_cleanup(void)
{
	/* flush all the connection entries first */
	ip_vs_conn_flush();

	/* Release the empty cache */
	kmem_cache_destroy(ip_vs_conn_cachep);
	proc_net_remove(&init_net, "ip_vs_conn");
	proc_net_remove(&init_net, "ip_vs_conn_sync");
	/* the table itself goes last */
	vfree(ip_vs_conn_tab);
}
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c deleted file mode 100644 index 958abf3e5f8c..000000000000 --- a/net/ipv4/ipvs/ip_vs_core.c +++ /dev/null | |||
@@ -1,1542 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS An implementation of the IP virtual server support for the | ||
3 | * LINUX operating system. IPVS is now implemented as a module | ||
4 | * over the Netfilter framework. IPVS can be used to build a | ||
5 | * high-performance and highly available server based on a | ||
6 | * cluster of servers. | ||
7 | * | ||
8 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
9 | * Peter Kese <peter.kese@ijs.si> | ||
10 | * Julian Anastasov <ja@ssi.bg> | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or | ||
13 | * modify it under the terms of the GNU General Public License | ||
14 | * as published by the Free Software Foundation; either version | ||
15 | * 2 of the License, or (at your option) any later version. | ||
16 | * | ||
17 | * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, | ||
18 | * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms | ||
19 | * and others. | ||
20 | * | ||
21 | * Changes: | ||
22 | * Paul `Rusty' Russell properly handle non-linear skbs | ||
23 | * Harald Welte don't use nfcache | ||
24 | * | ||
25 | */ | ||
26 | |||
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/icmp.h>

#include <net/ip.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <net/ip6_checksum.h>           /* for csum_ipv6_magic */
#include <linux/netfilter_ipv6.h>
#endif

#include <net/ip_vs.h>
48 | |||
49 | |||
/* Symbols exported to other modules; two are conditional on their config option. */
EXPORT_SYMBOL(register_ip_vs_scheduler);
EXPORT_SYMBOL(unregister_ip_vs_scheduler);
EXPORT_SYMBOL(ip_vs_skb_replace);
EXPORT_SYMBOL(ip_vs_proto_name);
EXPORT_SYMBOL(ip_vs_conn_new);
EXPORT_SYMBOL(ip_vs_conn_in_get);
EXPORT_SYMBOL(ip_vs_conn_out_get);
#ifdef CONFIG_IP_VS_PROTO_TCP
EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
#endif
EXPORT_SYMBOL(ip_vs_conn_put);
#ifdef CONFIG_IP_VS_DEBUG
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
64 | |||
65 | |||
/*
 * ID used in ICMP lookups: the echo identifier field of an ICMP /
 * ICMPv6 header.  The argument is a pointer to the header; it is
 * parenthesized so that any pointer expression may be passed.
 */
#define icmp_id(icmph)          (((icmph)->un).echo.id)
#define icmpv6_id(icmph)        ((icmph)->icmp6_dataun.u_echo.identifier)
69 | |||
/*
 * Map an IP protocol number to a printable name.
 *
 * Known protocols return a string literal.  Any other value is
 * formatted as "IP_<number>" into a static buffer, so that result is
 * overwritten by the next such call and the function is not reentrant
 * for unknown protocols.
 */
const char *ip_vs_proto_name(unsigned proto)
{
	static char buf[20];

	switch (proto) {
	case IPPROTO_IP:
		return "IP";
	case IPPROTO_UDP:
		return "UDP";
	case IPPROTO_TCP:
		return "TCP";
	case IPPROTO_ICMP:
		return "ICMP";
#ifdef CONFIG_IP_VS_IPV6
	case IPPROTO_ICMPV6:
		return "ICMPv6";
#endif
	default:
		/* proto is unsigned: use %u, and bound the write to buf */
		snprintf(buf, sizeof(buf), "IP_%u", proto);
		return buf;
	}
}
92 | |||
93 | void ip_vs_init_hash_table(struct list_head *table, int rows) | ||
94 | { | ||
95 | while (--rows >= 0) | ||
96 | INIT_LIST_HEAD(&table[rows]); | ||
97 | } | ||
98 | |||
99 | static inline void | ||
100 | ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) | ||
101 | { | ||
102 | struct ip_vs_dest *dest = cp->dest; | ||
103 | if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { | ||
104 | spin_lock(&dest->stats.lock); | ||
105 | dest->stats.ustats.inpkts++; | ||
106 | dest->stats.ustats.inbytes += skb->len; | ||
107 | spin_unlock(&dest->stats.lock); | ||
108 | |||
109 | spin_lock(&dest->svc->stats.lock); | ||
110 | dest->svc->stats.ustats.inpkts++; | ||
111 | dest->svc->stats.ustats.inbytes += skb->len; | ||
112 | spin_unlock(&dest->svc->stats.lock); | ||
113 | |||
114 | spin_lock(&ip_vs_stats.lock); | ||
115 | ip_vs_stats.ustats.inpkts++; | ||
116 | ip_vs_stats.ustats.inbytes += skb->len; | ||
117 | spin_unlock(&ip_vs_stats.lock); | ||
118 | } | ||
119 | } | ||
120 | |||
121 | |||
122 | static inline void | ||
123 | ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) | ||
124 | { | ||
125 | struct ip_vs_dest *dest = cp->dest; | ||
126 | if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { | ||
127 | spin_lock(&dest->stats.lock); | ||
128 | dest->stats.ustats.outpkts++; | ||
129 | dest->stats.ustats.outbytes += skb->len; | ||
130 | spin_unlock(&dest->stats.lock); | ||
131 | |||
132 | spin_lock(&dest->svc->stats.lock); | ||
133 | dest->svc->stats.ustats.outpkts++; | ||
134 | dest->svc->stats.ustats.outbytes += skb->len; | ||
135 | spin_unlock(&dest->svc->stats.lock); | ||
136 | |||
137 | spin_lock(&ip_vs_stats.lock); | ||
138 | ip_vs_stats.ustats.outpkts++; | ||
139 | ip_vs_stats.ustats.outbytes += skb->len; | ||
140 | spin_unlock(&ip_vs_stats.lock); | ||
141 | } | ||
142 | } | ||
143 | |||
144 | |||
145 | static inline void | ||
146 | ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) | ||
147 | { | ||
148 | spin_lock(&cp->dest->stats.lock); | ||
149 | cp->dest->stats.ustats.conns++; | ||
150 | spin_unlock(&cp->dest->stats.lock); | ||
151 | |||
152 | spin_lock(&svc->stats.lock); | ||
153 | svc->stats.ustats.conns++; | ||
154 | spin_unlock(&svc->stats.lock); | ||
155 | |||
156 | spin_lock(&ip_vs_stats.lock); | ||
157 | ip_vs_stats.ustats.conns++; | ||
158 | spin_unlock(&ip_vs_stats.lock); | ||
159 | } | ||
160 | |||
161 | |||
162 | static inline int | ||
163 | ip_vs_set_state(struct ip_vs_conn *cp, int direction, | ||
164 | const struct sk_buff *skb, | ||
165 | struct ip_vs_protocol *pp) | ||
166 | { | ||
167 | if (unlikely(!pp->state_transition)) | ||
168 | return 0; | ||
169 | return pp->state_transition(cp, direction, skb, pp); | ||
170 | } | ||
171 | |||
172 | |||
/*
 *  IPVS persistent scheduling function
 *  It creates a connection entry according to its template if exists,
 *  or selects a server and creates a connection entry plus a template.
 *  Locking: we are svc user (svc->refcnt), so we hold all dests too
 *  Protocols supported: TCP, UDP
 *
 *  @svc:   the persistent virtual service the packet matched
 *  @skb:   the packet being scheduled
 *  @ports: source port in ports[0], destination port in ports[1]
 *
 *  Returns the new connection entry, or NULL when no destination could
 *  be selected, when a template or connection could not be created, or
 *  when the packet's port differs from a non-zero service port and no
 *  usable template exists.
 */
static struct ip_vs_conn *
ip_vs_sched_persist(struct ip_vs_service *svc,
		    const struct sk_buff *skb,
		    __be16 ports[2])
{
	struct ip_vs_conn *cp = NULL;
	struct ip_vs_iphdr iph;
	struct ip_vs_dest *dest;
	struct ip_vs_conn *ct;
	__be16 dport;			/* destination port to forward */
	union nf_inet_addr snet;	/* source network of the client,
					   after masking */

	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);

	/* Mask saddr with the netmask to adjust template granularity */
#ifdef CONFIG_IP_VS_IPV6
	if (svc->af == AF_INET6)
		ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
	else
#endif
		snet.ip = iph.saddr.ip & svc->netmask;

	IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
		      "mnet %s\n",
		      IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
		      IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
		      IP_VS_DBG_ADDR(svc->af, &snet));

	/*
	 * As far as we know, FTP is a very complicated network protocol, and
	 * it uses control connection and data connections. For active FTP,
	 * FTP server initialize data connection to the client, its source port
	 * is often 20. For passive FTP, FTP server tells the clients the port
	 * that it passively listens to, and the client issues the data
	 * connection. In the tunneling or direct routing mode, the load
	 * balancer is on the client-to-server half of connection, the port
	 * number is unknown to the load balancer. So, a conn template like
	 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
	 * is created for other persistent services.
	 */
	if (ports[1] == svc->port) {
		/* Check if a template already exists */
		if (svc->port != FTPPORT)
			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
					     &iph.daddr, ports[1]);
		else
			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
					     &iph.daddr, 0);

		if (!ct || !ip_vs_check_template(ct)) {
			/*
			 * No template found or the dest of the connection
			 * template is not available.
			 */
			dest = svc->scheduler->schedule(svc, skb);
			if (dest == NULL) {
				IP_VS_DBG(1, "p-schedule: no dest found.\n");
				return NULL;
			}

			/*
			 * Create a template like <protocol,caddr,0,
			 * vaddr,vport,daddr,dport> for non-ftp service,
			 * and <protocol,caddr,0,vaddr,0,daddr,0>
			 * for ftp service.
			 */
			if (svc->port != FTPPORT)
				ct = ip_vs_conn_new(svc->af, iph.protocol,
						    &snet, 0,
						    &iph.daddr,
						    ports[1],
						    &dest->addr, dest->port,
						    IP_VS_CONN_F_TEMPLATE,
						    dest);
			else
				ct = ip_vs_conn_new(svc->af, iph.protocol,
						    &snet, 0,
						    &iph.daddr, 0,
						    &dest->addr, 0,
						    IP_VS_CONN_F_TEMPLATE,
						    dest);
			if (ct == NULL)
				return NULL;

			/* the template lives for the service's timeout */
			ct->timeout = svc->timeout;
		} else {
			/* set destination with the found template */
			dest = ct->dest;
		}
		/* packet hit the service port: forward to the dest's port */
		dport = dest->port;
	} else {
		/*
		 * Note: persistent fwmark-based services and persistent
		 * port zero service are handled here.
		 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
		 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
		 */
		if (svc->fwmark) {
			/* the fwmark is carried in the last word of the
			   virtual address slot of the template key */
			union nf_inet_addr fwmark = {
				.all = { 0, 0, 0, htonl(svc->fwmark) }
			};

			ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
					     &fwmark, 0);
		} else
			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
					     &iph.daddr, 0);

		if (!ct || !ip_vs_check_template(ct)) {
			/*
			 * If it is not persistent port zero, return NULL,
			 * otherwise create a connection template.
			 */
			if (svc->port)
				return NULL;

			dest = svc->scheduler->schedule(svc, skb);
			if (dest == NULL) {
				IP_VS_DBG(1, "p-schedule: no dest found.\n");
				return NULL;
			}

			/*
			 * Create a template according to the service
			 */
			if (svc->fwmark) {
				union nf_inet_addr fwmark = {
					.all = { 0, 0, 0, htonl(svc->fwmark) }
				};

				ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
						    &snet, 0,
						    &fwmark, 0,
						    &dest->addr, 0,
						    IP_VS_CONN_F_TEMPLATE,
						    dest);
			} else
				ct = ip_vs_conn_new(svc->af, iph.protocol,
						    &snet, 0,
						    &iph.daddr, 0,
						    &dest->addr, 0,
						    IP_VS_CONN_F_TEMPLATE,
						    dest);
			if (ct == NULL)
				return NULL;

			ct->timeout = svc->timeout;
		} else {
			/* set destination with the found template */
			dest = ct->dest;
		}
		/* fwmark / port-zero service: keep the packet's own port */
		dport = ports[1];
	}

	/*
	 *    Create a new connection according to the template
	 */
	cp = ip_vs_conn_new(svc->af, iph.protocol,
			    &iph.saddr, ports[0],
			    &iph.daddr, ports[1],
			    &dest->addr, dport,
			    0,
			    dest);
	if (cp == NULL) {
		/* release the template obtained or created above */
		ip_vs_conn_put(ct);
		return NULL;
	}

	/*
	 *    Add its control
	 */
	ip_vs_control_add(cp, ct);
	/* the template is now linked as cp's control; release our handle */
	ip_vs_conn_put(ct);

	ip_vs_conn_stats(cp, svc);
	return cp;
}
359 | |||
360 | |||
361 | /* | ||
362 | * IPVS main scheduling function | ||
363 | * It selects a server according to the virtual service, and | ||
364 | * creates a connection entry. | ||
365 | * Protocols supported: TCP, UDP | ||
366 | */ | ||
367 | struct ip_vs_conn * | ||
368 | ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
369 | { | ||
370 | struct ip_vs_conn *cp = NULL; | ||
371 | struct ip_vs_iphdr iph; | ||
372 | struct ip_vs_dest *dest; | ||
373 | __be16 _ports[2], *pptr; | ||
374 | |||
375 | ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); | ||
376 | pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); | ||
377 | if (pptr == NULL) | ||
378 | return NULL; | ||
379 | |||
380 | /* | ||
381 | * Persistent service | ||
382 | */ | ||
383 | if (svc->flags & IP_VS_SVC_F_PERSISTENT) | ||
384 | return ip_vs_sched_persist(svc, skb, pptr); | ||
385 | |||
386 | /* | ||
387 | * Non-persistent service | ||
388 | */ | ||
389 | if (!svc->fwmark && pptr[1] != svc->port) { | ||
390 | if (!svc->port) | ||
391 | IP_VS_ERR("Schedule: port zero only supported " | ||
392 | "in persistent services, " | ||
393 | "check your ipvs configuration\n"); | ||
394 | return NULL; | ||
395 | } | ||
396 | |||
397 | dest = svc->scheduler->schedule(svc, skb); | ||
398 | if (dest == NULL) { | ||
399 | IP_VS_DBG(1, "Schedule: no dest found.\n"); | ||
400 | return NULL; | ||
401 | } | ||
402 | |||
403 | /* | ||
404 | * Create a connection entry. | ||
405 | */ | ||
406 | cp = ip_vs_conn_new(svc->af, iph.protocol, | ||
407 | &iph.saddr, pptr[0], | ||
408 | &iph.daddr, pptr[1], | ||
409 | &dest->addr, dest->port ? dest->port : pptr[1], | ||
410 | 0, | ||
411 | dest); | ||
412 | if (cp == NULL) | ||
413 | return NULL; | ||
414 | |||
415 | IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " | ||
416 | "d:%s:%u conn->flags:%X conn->refcnt:%d\n", | ||
417 | ip_vs_fwd_tag(cp), | ||
418 | IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport), | ||
419 | IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport), | ||
420 | IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport), | ||
421 | cp->flags, atomic_read(&cp->refcnt)); | ||
422 | |||
423 | ip_vs_conn_stats(cp, svc); | ||
424 | return cp; | ||
425 | } | ||
426 | |||
427 | |||
428 | /* | ||
429 | * Pass or drop the packet. | ||
430 | * Called by ip_vs_in, when the virtual service is available but | ||
431 | * no destination is available for a new connection. | ||
432 | */ | ||
433 | int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, | ||
434 | struct ip_vs_protocol *pp) | ||
435 | { | ||
436 | __be16 _ports[2], *pptr; | ||
437 | struct ip_vs_iphdr iph; | ||
438 | int unicast; | ||
439 | ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); | ||
440 | |||
441 | pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); | ||
442 | if (pptr == NULL) { | ||
443 | ip_vs_service_put(svc); | ||
444 | return NF_DROP; | ||
445 | } | ||
446 | |||
447 | #ifdef CONFIG_IP_VS_IPV6 | ||
448 | if (svc->af == AF_INET6) | ||
449 | unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; | ||
450 | else | ||
451 | #endif | ||
452 | unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST); | ||
453 | |||
454 | /* if it is fwmark-based service, the cache_bypass sysctl is up | ||
455 | and the destination is a non-local unicast, then create | ||
456 | a cache_bypass connection entry */ | ||
457 | if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { | ||
458 | int ret, cs; | ||
459 | struct ip_vs_conn *cp; | ||
460 | union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; | ||
461 | |||
462 | ip_vs_service_put(svc); | ||
463 | |||
464 | /* create a new connection entry */ | ||
465 | IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); | ||
466 | cp = ip_vs_conn_new(svc->af, iph.protocol, | ||
467 | &iph.saddr, pptr[0], | ||
468 | &iph.daddr, pptr[1], | ||
469 | &daddr, 0, | ||
470 | IP_VS_CONN_F_BYPASS, | ||
471 | NULL); | ||
472 | if (cp == NULL) | ||
473 | return NF_DROP; | ||
474 | |||
475 | /* statistics */ | ||
476 | ip_vs_in_stats(cp, skb); | ||
477 | |||
478 | /* set state */ | ||
479 | cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); | ||
480 | |||
481 | /* transmit the first SYN packet */ | ||
482 | ret = cp->packet_xmit(skb, cp, pp); | ||
483 | /* do not touch skb anymore */ | ||
484 | |||
485 | atomic_inc(&cp->in_pkts); | ||
486 | ip_vs_conn_put(cp); | ||
487 | return ret; | ||
488 | } | ||
489 | |||
490 | /* | ||
491 | * When the virtual ftp service is presented, packets destined | ||
492 | * for other services on the VIP may get here (except services | ||
493 | * listed in the ipvs table), pass the packets, because it is | ||
494 | * not ipvs job to decide to drop the packets. | ||
495 | */ | ||
496 | if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { | ||
497 | ip_vs_service_put(svc); | ||
498 | return NF_ACCEPT; | ||
499 | } | ||
500 | |||
501 | ip_vs_service_put(svc); | ||
502 | |||
503 | /* | ||
504 | * Notify the client that the destination is unreachable, and | ||
505 | * release the socket buffer. | ||
506 | * Since it is in IP layer, the TCP socket is not actually | ||
507 | * created, the TCP RST packet cannot be sent, instead that | ||
508 | * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ | ||
509 | */ | ||
510 | #ifdef CONFIG_IP_VS_IPV6 | ||
511 | if (svc->af == AF_INET6) | ||
512 | icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, | ||
513 | skb->dev); | ||
514 | else | ||
515 | #endif | ||
516 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); | ||
517 | |||
518 | return NF_DROP; | ||
519 | } | ||
520 | |||
521 | |||
522 | /* | ||
523 | * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING | ||
524 | * chain, and is used for VS/NAT. | ||
525 | * It detects packets for VS/NAT connections and sends the packets | ||
526 | * immediately. This can avoid that iptable_nat mangles the packets | ||
527 | * for VS/NAT. | ||
528 | */ | ||
529 | static unsigned int ip_vs_post_routing(unsigned int hooknum, | ||
530 | struct sk_buff *skb, | ||
531 | const struct net_device *in, | ||
532 | const struct net_device *out, | ||
533 | int (*okfn)(struct sk_buff *)) | ||
534 | { | ||
535 | if (!skb->ipvs_property) | ||
536 | return NF_ACCEPT; | ||
537 | /* The packet was sent from IPVS, exit this chain */ | ||
538 | return NF_STOP; | ||
539 | } | ||
540 | |||
541 | __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) | ||
542 | { | ||
543 | return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); | ||
544 | } | ||
545 | |||
546 | static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) | ||
547 | { | ||
548 | int err = ip_defrag(skb, user); | ||
549 | |||
550 | if (!err) | ||
551 | ip_send_check(ip_hdr(skb)); | ||
552 | |||
553 | return err; | ||
554 | } | ||
555 | |||
556 | #ifdef CONFIG_IP_VS_IPV6 | ||
/*
 * IPv6 counterpart of ip_vs_gather_frags().  Currently a stub: no
 * reassembly is attempted and 0 is always returned.
 */
static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
{
	/* TODO IPv6: Find out what to do here for IPv6 */
	return 0;
}
562 | #endif | ||
563 | |||
564 | /* | ||
565 | * Packet has been made sufficiently writable in caller | ||
566 | * - inout: 1=in->out, 0=out->in | ||
567 | */ | ||
568 | void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
569 | struct ip_vs_conn *cp, int inout) | ||
570 | { | ||
571 | struct iphdr *iph = ip_hdr(skb); | ||
572 | unsigned int icmp_offset = iph->ihl*4; | ||
573 | struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + | ||
574 | icmp_offset); | ||
575 | struct iphdr *ciph = (struct iphdr *)(icmph + 1); | ||
576 | |||
577 | if (inout) { | ||
578 | iph->saddr = cp->vaddr.ip; | ||
579 | ip_send_check(iph); | ||
580 | ciph->daddr = cp->vaddr.ip; | ||
581 | ip_send_check(ciph); | ||
582 | } else { | ||
583 | iph->daddr = cp->daddr.ip; | ||
584 | ip_send_check(iph); | ||
585 | ciph->saddr = cp->daddr.ip; | ||
586 | ip_send_check(ciph); | ||
587 | } | ||
588 | |||
589 | /* the TCP/UDP port */ | ||
590 | if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) { | ||
591 | __be16 *ports = (void *)ciph + ciph->ihl*4; | ||
592 | |||
593 | if (inout) | ||
594 | ports[1] = cp->vport; | ||
595 | else | ||
596 | ports[0] = cp->dport; | ||
597 | } | ||
598 | |||
599 | /* And finally the ICMP checksum */ | ||
600 | icmph->checksum = 0; | ||
601 | icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); | ||
602 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
603 | |||
604 | if (inout) | ||
605 | IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, | ||
606 | "Forwarding altered outgoing ICMP"); | ||
607 | else | ||
608 | IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, | ||
609 | "Forwarding altered incoming ICMP"); | ||
610 | } | ||
611 | |||
612 | #ifdef CONFIG_IP_VS_IPV6 | ||
613 | void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
614 | struct ip_vs_conn *cp, int inout) | ||
615 | { | ||
616 | struct ipv6hdr *iph = ipv6_hdr(skb); | ||
617 | unsigned int icmp_offset = sizeof(struct ipv6hdr); | ||
618 | struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) + | ||
619 | icmp_offset); | ||
620 | struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1); | ||
621 | |||
622 | if (inout) { | ||
623 | iph->saddr = cp->vaddr.in6; | ||
624 | ciph->daddr = cp->vaddr.in6; | ||
625 | } else { | ||
626 | iph->daddr = cp->daddr.in6; | ||
627 | ciph->saddr = cp->daddr.in6; | ||
628 | } | ||
629 | |||
630 | /* the TCP/UDP port */ | ||
631 | if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) { | ||
632 | __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr); | ||
633 | |||
634 | if (inout) | ||
635 | ports[1] = cp->vport; | ||
636 | else | ||
637 | ports[0] = cp->dport; | ||
638 | } | ||
639 | |||
640 | /* And finally the ICMP checksum */ | ||
641 | icmph->icmp6_cksum = 0; | ||
642 | /* TODO IPv6: is this correct for ICMPv6? */ | ||
643 | ip_vs_checksum_complete(skb, icmp_offset); | ||
644 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
645 | |||
646 | if (inout) | ||
647 | IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, | ||
648 | "Forwarding altered outgoing ICMPv6"); | ||
649 | else | ||
650 | IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, | ||
651 | "Forwarding altered incoming ICMPv6"); | ||
652 | } | ||
653 | #endif | ||
654 | |||
655 | /* Handle relevant response ICMP messages - forward to the right | ||
656 | * destination host. Used for NAT and local client. | ||
657 | */ | ||
658 | static int handle_response_icmp(int af, struct sk_buff *skb, | ||
659 | union nf_inet_addr *snet, | ||
660 | __u8 protocol, struct ip_vs_conn *cp, | ||
661 | struct ip_vs_protocol *pp, | ||
662 | unsigned int offset, unsigned int ihl) | ||
663 | { | ||
664 | unsigned int verdict = NF_DROP; | ||
665 | |||
666 | if (IP_VS_FWD_METHOD(cp) != 0) { | ||
667 | IP_VS_ERR("shouldn't reach here, because the box is on the " | ||
668 | "half connection in the tun/dr module.\n"); | ||
669 | } | ||
670 | |||
671 | /* Ensure the checksum is correct */ | ||
672 | if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { | ||
673 | /* Failed checksum! */ | ||
674 | IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", | ||
675 | IP_VS_DBG_ADDR(af, snet)); | ||
676 | goto out; | ||
677 | } | ||
678 | |||
679 | if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol) | ||
680 | offset += 2 * sizeof(__u16); | ||
681 | if (!skb_make_writable(skb, offset)) | ||
682 | goto out; | ||
683 | |||
684 | #ifdef CONFIG_IP_VS_IPV6 | ||
685 | if (af == AF_INET6) | ||
686 | ip_vs_nat_icmp_v6(skb, pp, cp, 1); | ||
687 | else | ||
688 | #endif | ||
689 | ip_vs_nat_icmp(skb, pp, cp, 1); | ||
690 | |||
691 | /* do the statistics and put it back */ | ||
692 | ip_vs_out_stats(cp, skb); | ||
693 | |||
694 | skb->ipvs_property = 1; | ||
695 | verdict = NF_ACCEPT; | ||
696 | |||
697 | out: | ||
698 | __ip_vs_conn_put(cp); | ||
699 | |||
700 | return verdict; | ||
701 | } | ||
702 | |||
703 | /* | ||
704 | * Handle ICMP messages in the inside-to-outside direction (outgoing). | ||
705 | * Find any that might be relevant, check against existing connections. | ||
706 | * Currently handles error types - unreachable, quench, ttl exceeded. | ||
707 | */ | ||
708 | static int ip_vs_out_icmp(struct sk_buff *skb, int *related) | ||
709 | { | ||
710 | struct iphdr *iph; | ||
711 | struct icmphdr _icmph, *ic; | ||
712 | struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ | ||
713 | struct ip_vs_iphdr ciph; | ||
714 | struct ip_vs_conn *cp; | ||
715 | struct ip_vs_protocol *pp; | ||
716 | unsigned int offset, ihl; | ||
717 | union nf_inet_addr snet; | ||
718 | |||
719 | *related = 1; | ||
720 | |||
721 | /* reassemble IP fragments */ | ||
722 | if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { | ||
723 | if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) | ||
724 | return NF_STOLEN; | ||
725 | } | ||
726 | |||
727 | iph = ip_hdr(skb); | ||
728 | offset = ihl = iph->ihl * 4; | ||
729 | ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); | ||
730 | if (ic == NULL) | ||
731 | return NF_DROP; | ||
732 | |||
733 | IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", | ||
734 | ic->type, ntohs(icmp_id(ic)), | ||
735 | NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); | ||
736 | |||
737 | /* | ||
738 | * Work through seeing if this is for us. | ||
739 | * These checks are supposed to be in an order that means easy | ||
740 | * things are checked first to speed up processing.... however | ||
741 | * this means that some packets will manage to get a long way | ||
742 | * down this stack and then be rejected, but that's life. | ||
743 | */ | ||
744 | if ((ic->type != ICMP_DEST_UNREACH) && | ||
745 | (ic->type != ICMP_SOURCE_QUENCH) && | ||
746 | (ic->type != ICMP_TIME_EXCEEDED)) { | ||
747 | *related = 0; | ||
748 | return NF_ACCEPT; | ||
749 | } | ||
750 | |||
751 | /* Now find the contained IP header */ | ||
752 | offset += sizeof(_icmph); | ||
753 | cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); | ||
754 | if (cih == NULL) | ||
755 | return NF_ACCEPT; /* The packet looks wrong, ignore */ | ||
756 | |||
757 | pp = ip_vs_proto_get(cih->protocol); | ||
758 | if (!pp) | ||
759 | return NF_ACCEPT; | ||
760 | |||
761 | /* Is the embedded protocol header present? */ | ||
762 | if (unlikely(cih->frag_off & htons(IP_OFFSET) && | ||
763 | pp->dont_defrag)) | ||
764 | return NF_ACCEPT; | ||
765 | |||
766 | IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); | ||
767 | |||
768 | offset += cih->ihl * 4; | ||
769 | |||
770 | ip_vs_fill_iphdr(AF_INET, cih, &ciph); | ||
771 | /* The embedded headers contain source and dest in reverse order */ | ||
772 | cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); | ||
773 | if (!cp) | ||
774 | return NF_ACCEPT; | ||
775 | |||
776 | snet.ip = iph->saddr; | ||
777 | return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, | ||
778 | pp, offset, ihl); | ||
779 | } | ||
780 | |||
781 | #ifdef CONFIG_IP_VS_IPV6 | ||
782 | static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) | ||
783 | { | ||
784 | struct ipv6hdr *iph; | ||
785 | struct icmp6hdr _icmph, *ic; | ||
786 | struct ipv6hdr _ciph, *cih; /* The ip header contained | ||
787 | within the ICMP */ | ||
788 | struct ip_vs_iphdr ciph; | ||
789 | struct ip_vs_conn *cp; | ||
790 | struct ip_vs_protocol *pp; | ||
791 | unsigned int offset; | ||
792 | union nf_inet_addr snet; | ||
793 | |||
794 | *related = 1; | ||
795 | |||
796 | /* reassemble IP fragments */ | ||
797 | if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { | ||
798 | if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT)) | ||
799 | return NF_STOLEN; | ||
800 | } | ||
801 | |||
802 | iph = ipv6_hdr(skb); | ||
803 | offset = sizeof(struct ipv6hdr); | ||
804 | ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); | ||
805 | if (ic == NULL) | ||
806 | return NF_DROP; | ||
807 | |||
808 | IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n", | ||
809 | ic->icmp6_type, ntohs(icmpv6_id(ic)), | ||
810 | NIP6(iph->saddr), NIP6(iph->daddr)); | ||
811 | |||
812 | /* | ||
813 | * Work through seeing if this is for us. | ||
814 | * These checks are supposed to be in an order that means easy | ||
815 | * things are checked first to speed up processing.... however | ||
816 | * this means that some packets will manage to get a long way | ||
817 | * down this stack and then be rejected, but that's life. | ||
818 | */ | ||
819 | if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && | ||
820 | (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && | ||
821 | (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { | ||
822 | *related = 0; | ||
823 | return NF_ACCEPT; | ||
824 | } | ||
825 | |||
826 | /* Now find the contained IP header */ | ||
827 | offset += sizeof(_icmph); | ||
828 | cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); | ||
829 | if (cih == NULL) | ||
830 | return NF_ACCEPT; /* The packet looks wrong, ignore */ | ||
831 | |||
832 | pp = ip_vs_proto_get(cih->nexthdr); | ||
833 | if (!pp) | ||
834 | return NF_ACCEPT; | ||
835 | |||
836 | /* Is the embedded protocol header present? */ | ||
837 | /* TODO: we don't support fragmentation at the moment anyways */ | ||
838 | if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) | ||
839 | return NF_ACCEPT; | ||
840 | |||
841 | IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for"); | ||
842 | |||
843 | offset += sizeof(struct ipv6hdr); | ||
844 | |||
845 | ip_vs_fill_iphdr(AF_INET6, cih, &ciph); | ||
846 | /* The embedded headers contain source and dest in reverse order */ | ||
847 | cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); | ||
848 | if (!cp) | ||
849 | return NF_ACCEPT; | ||
850 | |||
851 | ipv6_addr_copy(&snet.in6, &iph->saddr); | ||
852 | return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, | ||
853 | pp, offset, sizeof(struct ipv6hdr)); | ||
854 | } | ||
855 | #endif | ||
856 | |||
857 | static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) | ||
858 | { | ||
859 | struct tcphdr _tcph, *th; | ||
860 | |||
861 | th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); | ||
862 | if (th == NULL) | ||
863 | return 0; | ||
864 | return th->rst; | ||
865 | } | ||
866 | |||
867 | /* Handle response packets: rewrite addresses and send away... | ||
868 | * Used for NAT and local client. | ||
869 | */ | ||
870 | static unsigned int | ||
871 | handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
872 | struct ip_vs_conn *cp, int ihl) | ||
873 | { | ||
874 | IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); | ||
875 | |||
876 | if (!skb_make_writable(skb, ihl)) | ||
877 | goto drop; | ||
878 | |||
879 | /* mangle the packet */ | ||
880 | if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) | ||
881 | goto drop; | ||
882 | |||
883 | #ifdef CONFIG_IP_VS_IPV6 | ||
884 | if (af == AF_INET6) | ||
885 | ipv6_hdr(skb)->saddr = cp->vaddr.in6; | ||
886 | else | ||
887 | #endif | ||
888 | { | ||
889 | ip_hdr(skb)->saddr = cp->vaddr.ip; | ||
890 | ip_send_check(ip_hdr(skb)); | ||
891 | } | ||
892 | |||
893 | /* For policy routing, packets originating from this | ||
894 | * machine itself may be routed differently to packets | ||
895 | * passing through. We want this packet to be routed as | ||
896 | * if it came from this machine itself. So re-compute | ||
897 | * the routing information. | ||
898 | */ | ||
899 | #ifdef CONFIG_IP_VS_IPV6 | ||
900 | if (af == AF_INET6) { | ||
901 | if (ip6_route_me_harder(skb) != 0) | ||
902 | goto drop; | ||
903 | } else | ||
904 | #endif | ||
905 | if (ip_route_me_harder(skb, RTN_LOCAL) != 0) | ||
906 | goto drop; | ||
907 | |||
908 | IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); | ||
909 | |||
910 | ip_vs_out_stats(cp, skb); | ||
911 | ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); | ||
912 | ip_vs_conn_put(cp); | ||
913 | |||
914 | skb->ipvs_property = 1; | ||
915 | |||
916 | LeaveFunction(11); | ||
917 | return NF_ACCEPT; | ||
918 | |||
919 | drop: | ||
920 | ip_vs_conn_put(cp); | ||
921 | kfree_skb(skb); | ||
922 | return NF_STOLEN; | ||
923 | } | ||
924 | |||
925 | /* | ||
926 | * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. | ||
927 | * Check if outgoing packet belongs to the established ip_vs_conn. | ||
928 | */ | ||
929 | static unsigned int | ||
930 | ip_vs_out(unsigned int hooknum, struct sk_buff *skb, | ||
931 | const struct net_device *in, const struct net_device *out, | ||
932 | int (*okfn)(struct sk_buff *)) | ||
933 | { | ||
934 | struct ip_vs_iphdr iph; | ||
935 | struct ip_vs_protocol *pp; | ||
936 | struct ip_vs_conn *cp; | ||
937 | int af; | ||
938 | |||
939 | EnterFunction(11); | ||
940 | |||
941 | af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; | ||
942 | |||
943 | if (skb->ipvs_property) | ||
944 | return NF_ACCEPT; | ||
945 | |||
946 | ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); | ||
947 | #ifdef CONFIG_IP_VS_IPV6 | ||
948 | if (af == AF_INET6) { | ||
949 | if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { | ||
950 | int related, verdict = ip_vs_out_icmp_v6(skb, &related); | ||
951 | |||
952 | if (related) | ||
953 | return verdict; | ||
954 | ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); | ||
955 | } | ||
956 | } else | ||
957 | #endif | ||
958 | if (unlikely(iph.protocol == IPPROTO_ICMP)) { | ||
959 | int related, verdict = ip_vs_out_icmp(skb, &related); | ||
960 | |||
961 | if (related) | ||
962 | return verdict; | ||
963 | ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); | ||
964 | } | ||
965 | |||
966 | pp = ip_vs_proto_get(iph.protocol); | ||
967 | if (unlikely(!pp)) | ||
968 | return NF_ACCEPT; | ||
969 | |||
970 | /* reassemble IP fragments */ | ||
971 | #ifdef CONFIG_IP_VS_IPV6 | ||
972 | if (af == AF_INET6) { | ||
973 | if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { | ||
974 | int related, verdict = ip_vs_out_icmp_v6(skb, &related); | ||
975 | |||
976 | if (related) | ||
977 | return verdict; | ||
978 | |||
979 | ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); | ||
980 | } | ||
981 | } else | ||
982 | #endif | ||
983 | if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && | ||
984 | !pp->dont_defrag)) { | ||
985 | if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) | ||
986 | return NF_STOLEN; | ||
987 | |||
988 | ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); | ||
989 | } | ||
990 | |||
991 | /* | ||
992 | * Check if the packet belongs to an existing entry | ||
993 | */ | ||
994 | cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); | ||
995 | |||
996 | if (unlikely(!cp)) { | ||
997 | if (sysctl_ip_vs_nat_icmp_send && | ||
998 | (pp->protocol == IPPROTO_TCP || | ||
999 | pp->protocol == IPPROTO_UDP)) { | ||
1000 | __be16 _ports[2], *pptr; | ||
1001 | |||
1002 | pptr = skb_header_pointer(skb, iph.len, | ||
1003 | sizeof(_ports), _ports); | ||
1004 | if (pptr == NULL) | ||
1005 | return NF_ACCEPT; /* Not for me */ | ||
1006 | if (ip_vs_lookup_real_service(af, iph.protocol, | ||
1007 | &iph.saddr, | ||
1008 | pptr[0])) { | ||
1009 | /* | ||
1010 | * Notify the real server: there is no | ||
1011 | * existing entry if it is not RST | ||
1012 | * packet or not TCP packet. | ||
1013 | */ | ||
1014 | if (iph.protocol != IPPROTO_TCP | ||
1015 | || !is_tcp_reset(skb, iph.len)) { | ||
1016 | #ifdef CONFIG_IP_VS_IPV6 | ||
1017 | if (af == AF_INET6) | ||
1018 | icmpv6_send(skb, | ||
1019 | ICMPV6_DEST_UNREACH, | ||
1020 | ICMPV6_PORT_UNREACH, | ||
1021 | 0, skb->dev); | ||
1022 | else | ||
1023 | #endif | ||
1024 | icmp_send(skb, | ||
1025 | ICMP_DEST_UNREACH, | ||
1026 | ICMP_PORT_UNREACH, 0); | ||
1027 | return NF_DROP; | ||
1028 | } | ||
1029 | } | ||
1030 | } | ||
1031 | IP_VS_DBG_PKT(12, pp, skb, 0, | ||
1032 | "packet continues traversal as normal"); | ||
1033 | return NF_ACCEPT; | ||
1034 | } | ||
1035 | |||
1036 | return handle_response(af, skb, pp, cp, iph.len); | ||
1037 | } | ||
1038 | |||
1039 | |||
1040 | /* | ||
1041 | * Handle ICMP messages in the outside-to-inside direction (incoming). | ||
1042 | * Find any that might be relevant, check against existing connections, | ||
1043 | * forward to the right destination host if relevant. | ||
1044 | * Currently handles error types - unreachable, quench, ttl exceeded. | ||
1045 | */ | ||
1046 | static int | ||
1047 | ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) | ||
1048 | { | ||
1049 | struct iphdr *iph; | ||
1050 | struct icmphdr _icmph, *ic; | ||
1051 | struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ | ||
1052 | struct ip_vs_iphdr ciph; | ||
1053 | struct ip_vs_conn *cp; | ||
1054 | struct ip_vs_protocol *pp; | ||
1055 | unsigned int offset, ihl, verdict; | ||
1056 | union nf_inet_addr snet; | ||
1057 | |||
1058 | *related = 1; | ||
1059 | |||
1060 | /* reassemble IP fragments */ | ||
1061 | if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { | ||
1062 | if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ? | ||
1063 | IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD)) | ||
1064 | return NF_STOLEN; | ||
1065 | } | ||
1066 | |||
1067 | iph = ip_hdr(skb); | ||
1068 | offset = ihl = iph->ihl * 4; | ||
1069 | ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); | ||
1070 | if (ic == NULL) | ||
1071 | return NF_DROP; | ||
1072 | |||
1073 | IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", | ||
1074 | ic->type, ntohs(icmp_id(ic)), | ||
1075 | NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); | ||
1076 | |||
1077 | /* | ||
1078 | * Work through seeing if this is for us. | ||
1079 | * These checks are supposed to be in an order that means easy | ||
1080 | * things are checked first to speed up processing.... however | ||
1081 | * this means that some packets will manage to get a long way | ||
1082 | * down this stack and then be rejected, but that's life. | ||
1083 | */ | ||
1084 | if ((ic->type != ICMP_DEST_UNREACH) && | ||
1085 | (ic->type != ICMP_SOURCE_QUENCH) && | ||
1086 | (ic->type != ICMP_TIME_EXCEEDED)) { | ||
1087 | *related = 0; | ||
1088 | return NF_ACCEPT; | ||
1089 | } | ||
1090 | |||
1091 | /* Now find the contained IP header */ | ||
1092 | offset += sizeof(_icmph); | ||
1093 | cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); | ||
1094 | if (cih == NULL) | ||
1095 | return NF_ACCEPT; /* The packet looks wrong, ignore */ | ||
1096 | |||
1097 | pp = ip_vs_proto_get(cih->protocol); | ||
1098 | if (!pp) | ||
1099 | return NF_ACCEPT; | ||
1100 | |||
1101 | /* Is the embedded protocol header present? */ | ||
1102 | if (unlikely(cih->frag_off & htons(IP_OFFSET) && | ||
1103 | pp->dont_defrag)) | ||
1104 | return NF_ACCEPT; | ||
1105 | |||
1106 | IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); | ||
1107 | |||
1108 | offset += cih->ihl * 4; | ||
1109 | |||
1110 | ip_vs_fill_iphdr(AF_INET, cih, &ciph); | ||
1111 | /* The embedded headers contain source and dest in reverse order */ | ||
1112 | cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1); | ||
1113 | if (!cp) { | ||
1114 | /* The packet could also belong to a local client */ | ||
1115 | cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); | ||
1116 | if (cp) { | ||
1117 | snet.ip = iph->saddr; | ||
1118 | return handle_response_icmp(AF_INET, skb, &snet, | ||
1119 | cih->protocol, cp, pp, | ||
1120 | offset, ihl); | ||
1121 | } | ||
1122 | return NF_ACCEPT; | ||
1123 | } | ||
1124 | |||
1125 | verdict = NF_DROP; | ||
1126 | |||
1127 | /* Ensure the checksum is correct */ | ||
1128 | if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { | ||
1129 | /* Failed checksum! */ | ||
1130 | IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n", | ||
1131 | NIPQUAD(iph->saddr)); | ||
1132 | goto out; | ||
1133 | } | ||
1134 | |||
1135 | /* do the statistics and put it back */ | ||
1136 | ip_vs_in_stats(cp, skb); | ||
1137 | if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) | ||
1138 | offset += 2 * sizeof(__u16); | ||
1139 | verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); | ||
1140 | /* do not touch skb anymore */ | ||
1141 | |||
1142 | out: | ||
1143 | __ip_vs_conn_put(cp); | ||
1144 | |||
1145 | return verdict; | ||
1146 | } | ||
1147 | |||
1148 | #ifdef CONFIG_IP_VS_IPV6 | ||
1149 | static int | ||
1150 | ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) | ||
1151 | { | ||
1152 | struct ipv6hdr *iph; | ||
1153 | struct icmp6hdr _icmph, *ic; | ||
1154 | struct ipv6hdr _ciph, *cih; /* The ip header contained | ||
1155 | within the ICMP */ | ||
1156 | struct ip_vs_iphdr ciph; | ||
1157 | struct ip_vs_conn *cp; | ||
1158 | struct ip_vs_protocol *pp; | ||
1159 | unsigned int offset, verdict; | ||
1160 | union nf_inet_addr snet; | ||
1161 | |||
1162 | *related = 1; | ||
1163 | |||
1164 | /* reassemble IP fragments */ | ||
1165 | if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { | ||
1166 | if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ? | ||
1167 | IP_DEFRAG_VS_IN : | ||
1168 | IP_DEFRAG_VS_FWD)) | ||
1169 | return NF_STOLEN; | ||
1170 | } | ||
1171 | |||
1172 | iph = ipv6_hdr(skb); | ||
1173 | offset = sizeof(struct ipv6hdr); | ||
1174 | ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); | ||
1175 | if (ic == NULL) | ||
1176 | return NF_DROP; | ||
1177 | |||
1178 | IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n", | ||
1179 | ic->icmp6_type, ntohs(icmpv6_id(ic)), | ||
1180 | NIP6(iph->saddr), NIP6(iph->daddr)); | ||
1181 | |||
1182 | /* | ||
1183 | * Work through seeing if this is for us. | ||
1184 | * These checks are supposed to be in an order that means easy | ||
1185 | * things are checked first to speed up processing.... however | ||
1186 | * this means that some packets will manage to get a long way | ||
1187 | * down this stack and then be rejected, but that's life. | ||
1188 | */ | ||
1189 | if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && | ||
1190 | (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && | ||
1191 | (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { | ||
1192 | *related = 0; | ||
1193 | return NF_ACCEPT; | ||
1194 | } | ||
1195 | |||
1196 | /* Now find the contained IP header */ | ||
1197 | offset += sizeof(_icmph); | ||
1198 | cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); | ||
1199 | if (cih == NULL) | ||
1200 | return NF_ACCEPT; /* The packet looks wrong, ignore */ | ||
1201 | |||
1202 | pp = ip_vs_proto_get(cih->nexthdr); | ||
1203 | if (!pp) | ||
1204 | return NF_ACCEPT; | ||
1205 | |||
1206 | /* Is the embedded protocol header present? */ | ||
1207 | /* TODO: we don't support fragmentation at the moment anyways */ | ||
1208 | if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) | ||
1209 | return NF_ACCEPT; | ||
1210 | |||
1211 | IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for"); | ||
1212 | |||
1213 | offset += sizeof(struct ipv6hdr); | ||
1214 | |||
1215 | ip_vs_fill_iphdr(AF_INET6, cih, &ciph); | ||
1216 | /* The embedded headers contain source and dest in reverse order */ | ||
1217 | cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1); | ||
1218 | if (!cp) { | ||
1219 | /* The packet could also belong to a local client */ | ||
1220 | cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); | ||
1221 | if (cp) { | ||
1222 | ipv6_addr_copy(&snet.in6, &iph->saddr); | ||
1223 | return handle_response_icmp(AF_INET6, skb, &snet, | ||
1224 | cih->nexthdr, | ||
1225 | cp, pp, offset, | ||
1226 | sizeof(struct ipv6hdr)); | ||
1227 | } | ||
1228 | return NF_ACCEPT; | ||
1229 | } | ||
1230 | |||
1231 | verdict = NF_DROP; | ||
1232 | |||
1233 | /* do the statistics and put it back */ | ||
1234 | ip_vs_in_stats(cp, skb); | ||
1235 | if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr) | ||
1236 | offset += 2 * sizeof(__u16); | ||
1237 | verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset); | ||
1238 | /* do not touch skb anymore */ | ||
1239 | |||
1240 | __ip_vs_conn_put(cp); | ||
1241 | |||
1242 | return verdict; | ||
1243 | } | ||
1244 | #endif | ||
1245 | |||
1246 | |||
1247 | /* | ||
1248 | * Check if it's for virtual services, look it up, | ||
1249 | * and send it on its way... | ||
1250 | */ | ||
1251 | static unsigned int | ||
1252 | ip_vs_in(unsigned int hooknum, struct sk_buff *skb, | ||
1253 | const struct net_device *in, const struct net_device *out, | ||
1254 | int (*okfn)(struct sk_buff *)) | ||
1255 | { | ||
1256 | struct ip_vs_iphdr iph; | ||
1257 | struct ip_vs_protocol *pp; | ||
1258 | struct ip_vs_conn *cp; | ||
1259 | int ret, restart, af; | ||
1260 | |||
1261 | af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; | ||
1262 | |||
1263 | ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); | ||
1264 | |||
1265 | /* | ||
1266 | * Big tappo: only PACKET_HOST, including loopback for local client | ||
1267 | * Don't handle local packets on IPv6 for now | ||
1268 | */ | ||
1269 | if (unlikely(skb->pkt_type != PACKET_HOST)) { | ||
1270 | IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", | ||
1271 | skb->pkt_type, | ||
1272 | iph.protocol, | ||
1273 | IP_VS_DBG_ADDR(af, &iph.daddr)); | ||
1274 | return NF_ACCEPT; | ||
1275 | } | ||
1276 | |||
1277 | if (unlikely(iph.protocol == IPPROTO_ICMP)) { | ||
1278 | int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); | ||
1279 | |||
1280 | if (related) | ||
1281 | return verdict; | ||
1282 | ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); | ||
1283 | } | ||
1284 | |||
1285 | /* Protocol supported? */ | ||
1286 | pp = ip_vs_proto_get(iph.protocol); | ||
1287 | if (unlikely(!pp)) | ||
1288 | return NF_ACCEPT; | ||
1289 | |||
1290 | /* | ||
1291 | * Check if the packet belongs to an existing connection entry | ||
1292 | */ | ||
1293 | cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0); | ||
1294 | |||
1295 | if (unlikely(!cp)) { | ||
1296 | int v; | ||
1297 | |||
1298 | /* For local client packets, it could be a response */ | ||
1299 | cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); | ||
1300 | if (cp) | ||
1301 | return handle_response(af, skb, pp, cp, iph.len); | ||
1302 | |||
1303 | if (!pp->conn_schedule(af, skb, pp, &v, &cp)) | ||
1304 | return v; | ||
1305 | } | ||
1306 | |||
1307 | if (unlikely(!cp)) { | ||
1308 | /* sorry, all this trouble for a no-hit :) */ | ||
1309 | IP_VS_DBG_PKT(12, pp, skb, 0, | ||
1310 | "packet continues traversal as normal"); | ||
1311 | return NF_ACCEPT; | ||
1312 | } | ||
1313 | |||
1314 | IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); | ||
1315 | |||
1316 | /* Check the server status */ | ||
1317 | if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { | ||
1318 | /* the destination server is not available */ | ||
1319 | |||
1320 | if (sysctl_ip_vs_expire_nodest_conn) { | ||
1321 | /* try to expire the connection immediately */ | ||
1322 | ip_vs_conn_expire_now(cp); | ||
1323 | } | ||
1324 | /* don't restart its timer, and silently | ||
1325 | drop the packet. */ | ||
1326 | __ip_vs_conn_put(cp); | ||
1327 | return NF_DROP; | ||
1328 | } | ||
1329 | |||
1330 | ip_vs_in_stats(cp, skb); | ||
1331 | restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); | ||
1332 | if (cp->packet_xmit) | ||
1333 | ret = cp->packet_xmit(skb, cp, pp); | ||
1334 | /* do not touch skb anymore */ | ||
1335 | else { | ||
1336 | IP_VS_DBG_RL("warning: packet_xmit is null"); | ||
1337 | ret = NF_ACCEPT; | ||
1338 | } | ||
1339 | |||
1340 | /* Increase its packet counter and check if it is needed | ||
1341 | * to be synchronized | ||
1342 | * | ||
1343 | * Sync connection if it is about to close to | ||
1344 | * encorage the standby servers to update the connections timeout | ||
1345 | */ | ||
1346 | atomic_inc(&cp->in_pkts); | ||
1347 | if (af == AF_INET && | ||
1348 | (ip_vs_sync_state & IP_VS_STATE_MASTER) && | ||
1349 | (((cp->protocol != IPPROTO_TCP || | ||
1350 | cp->state == IP_VS_TCP_S_ESTABLISHED) && | ||
1351 | (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] | ||
1352 | == sysctl_ip_vs_sync_threshold[0])) || | ||
1353 | ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && | ||
1354 | ((cp->state == IP_VS_TCP_S_FIN_WAIT) || | ||
1355 | (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || | ||
1356 | (cp->state == IP_VS_TCP_S_TIME_WAIT))))) | ||
1357 | ip_vs_sync_conn(cp); | ||
1358 | cp->old_state = cp->state; | ||
1359 | |||
1360 | ip_vs_conn_put(cp); | ||
1361 | return ret; | ||
1362 | } | ||
1363 | |||
1364 | |||
1365 | /* | ||
1366 | * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP | ||
1367 | * related packets destined for 0.0.0.0/0. | ||
1368 | * When fwmark-based virtual service is used, such as transparent | ||
1369 | * cache cluster, TCP packets can be marked and routed to ip_vs_in, | ||
1370 | * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and | ||
1371 | * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain | ||
1372 | * and send them to ip_vs_in_icmp. | ||
1373 | */ | ||
1374 | static unsigned int | ||
1375 | ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, | ||
1376 | const struct net_device *in, const struct net_device *out, | ||
1377 | int (*okfn)(struct sk_buff *)) | ||
1378 | { | ||
1379 | int r; | ||
1380 | |||
1381 | if (ip_hdr(skb)->protocol != IPPROTO_ICMP) | ||
1382 | return NF_ACCEPT; | ||
1383 | |||
1384 | return ip_vs_in_icmp(skb, &r, hooknum); | ||
1385 | } | ||
1386 | |||
1387 | #ifdef CONFIG_IP_VS_IPV6 | ||
1388 | static unsigned int | ||
1389 | ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, | ||
1390 | const struct net_device *in, const struct net_device *out, | ||
1391 | int (*okfn)(struct sk_buff *)) | ||
1392 | { | ||
1393 | int r; | ||
1394 | |||
1395 | if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) | ||
1396 | return NF_ACCEPT; | ||
1397 | |||
1398 | return ip_vs_in_icmp_v6(skb, &r, hooknum); | ||
1399 | } | ||
1400 | #endif | ||
1401 | |||
1402 | |||
1403 | static struct nf_hook_ops ip_vs_ops[] __read_mostly = { | ||
1404 | /* After packet filtering, forward packet through VS/DR, VS/TUN, | ||
1405 | * or VS/NAT(change destination), so that filtering rules can be | ||
1406 | * applied to IPVS. */ | ||
1407 | { | ||
1408 | .hook = ip_vs_in, | ||
1409 | .owner = THIS_MODULE, | ||
1410 | .pf = PF_INET, | ||
1411 | .hooknum = NF_INET_LOCAL_IN, | ||
1412 | .priority = 100, | ||
1413 | }, | ||
1414 | /* After packet filtering, change source only for VS/NAT */ | ||
1415 | { | ||
1416 | .hook = ip_vs_out, | ||
1417 | .owner = THIS_MODULE, | ||
1418 | .pf = PF_INET, | ||
1419 | .hooknum = NF_INET_FORWARD, | ||
1420 | .priority = 100, | ||
1421 | }, | ||
1422 | /* After packet filtering (but before ip_vs_out_icmp), catch icmp | ||
1423 | * destined for 0.0.0.0/0, which is for incoming IPVS connections */ | ||
1424 | { | ||
1425 | .hook = ip_vs_forward_icmp, | ||
1426 | .owner = THIS_MODULE, | ||
1427 | .pf = PF_INET, | ||
1428 | .hooknum = NF_INET_FORWARD, | ||
1429 | .priority = 99, | ||
1430 | }, | ||
1431 | /* Before the netfilter connection tracking, exit from POST_ROUTING */ | ||
1432 | { | ||
1433 | .hook = ip_vs_post_routing, | ||
1434 | .owner = THIS_MODULE, | ||
1435 | .pf = PF_INET, | ||
1436 | .hooknum = NF_INET_POST_ROUTING, | ||
1437 | .priority = NF_IP_PRI_NAT_SRC-1, | ||
1438 | }, | ||
1439 | #ifdef CONFIG_IP_VS_IPV6 | ||
1440 | /* After packet filtering, forward packet through VS/DR, VS/TUN, | ||
1441 | * or VS/NAT(change destination), so that filtering rules can be | ||
1442 | * applied to IPVS. */ | ||
1443 | { | ||
1444 | .hook = ip_vs_in, | ||
1445 | .owner = THIS_MODULE, | ||
1446 | .pf = PF_INET6, | ||
1447 | .hooknum = NF_INET_LOCAL_IN, | ||
1448 | .priority = 100, | ||
1449 | }, | ||
1450 | /* After packet filtering, change source only for VS/NAT */ | ||
1451 | { | ||
1452 | .hook = ip_vs_out, | ||
1453 | .owner = THIS_MODULE, | ||
1454 | .pf = PF_INET6, | ||
1455 | .hooknum = NF_INET_FORWARD, | ||
1456 | .priority = 100, | ||
1457 | }, | ||
1458 | /* After packet filtering (but before ip_vs_out_icmp), catch icmp | ||
1459 | * destined for 0.0.0.0/0, which is for incoming IPVS connections */ | ||
1460 | { | ||
1461 | .hook = ip_vs_forward_icmp_v6, | ||
1462 | .owner = THIS_MODULE, | ||
1463 | .pf = PF_INET6, | ||
1464 | .hooknum = NF_INET_FORWARD, | ||
1465 | .priority = 99, | ||
1466 | }, | ||
1467 | /* Before the netfilter connection tracking, exit from POST_ROUTING */ | ||
1468 | { | ||
1469 | .hook = ip_vs_post_routing, | ||
1470 | .owner = THIS_MODULE, | ||
1471 | .pf = PF_INET6, | ||
1472 | .hooknum = NF_INET_POST_ROUTING, | ||
1473 | .priority = NF_IP6_PRI_NAT_SRC-1, | ||
1474 | }, | ||
1475 | #endif | ||
1476 | }; | ||
1477 | |||
1478 | |||
1479 | /* | ||
1480 | * Initialize IP Virtual Server | ||
1481 | */ | ||
1482 | static int __init ip_vs_init(void) | ||
1483 | { | ||
1484 | int ret; | ||
1485 | |||
1486 | ip_vs_estimator_init(); | ||
1487 | |||
1488 | ret = ip_vs_control_init(); | ||
1489 | if (ret < 0) { | ||
1490 | IP_VS_ERR("can't setup control.\n"); | ||
1491 | goto cleanup_estimator; | ||
1492 | } | ||
1493 | |||
1494 | ip_vs_protocol_init(); | ||
1495 | |||
1496 | ret = ip_vs_app_init(); | ||
1497 | if (ret < 0) { | ||
1498 | IP_VS_ERR("can't setup application helper.\n"); | ||
1499 | goto cleanup_protocol; | ||
1500 | } | ||
1501 | |||
1502 | ret = ip_vs_conn_init(); | ||
1503 | if (ret < 0) { | ||
1504 | IP_VS_ERR("can't setup connection table.\n"); | ||
1505 | goto cleanup_app; | ||
1506 | } | ||
1507 | |||
1508 | ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); | ||
1509 | if (ret < 0) { | ||
1510 | IP_VS_ERR("can't register hooks.\n"); | ||
1511 | goto cleanup_conn; | ||
1512 | } | ||
1513 | |||
1514 | IP_VS_INFO("ipvs loaded.\n"); | ||
1515 | return ret; | ||
1516 | |||
1517 | cleanup_conn: | ||
1518 | ip_vs_conn_cleanup(); | ||
1519 | cleanup_app: | ||
1520 | ip_vs_app_cleanup(); | ||
1521 | cleanup_protocol: | ||
1522 | ip_vs_protocol_cleanup(); | ||
1523 | ip_vs_control_cleanup(); | ||
1524 | cleanup_estimator: | ||
1525 | ip_vs_estimator_cleanup(); | ||
1526 | return ret; | ||
1527 | } | ||
1528 | |||
1529 | static void __exit ip_vs_cleanup(void) | ||
1530 | { | ||
1531 | nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); | ||
1532 | ip_vs_conn_cleanup(); | ||
1533 | ip_vs_app_cleanup(); | ||
1534 | ip_vs_protocol_cleanup(); | ||
1535 | ip_vs_control_cleanup(); | ||
1536 | ip_vs_estimator_cleanup(); | ||
1537 | IP_VS_INFO("ipvs unloaded.\n"); | ||
1538 | } | ||
1539 | |||
1540 | module_init(ip_vs_init); | ||
1541 | module_exit(ip_vs_cleanup); | ||
1542 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c deleted file mode 100644 index 0302cf3e5039..000000000000 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ /dev/null | |||
@@ -1,3443 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS An implementation of the IP virtual server support for the | ||
3 | * LINUX operating system. IPVS is now implemented as a module | ||
4 | * over the NetFilter framework. IPVS can be used to build a | ||
5 | * high-performance and highly available server based on a | ||
6 | * cluster of servers. | ||
7 | * | ||
8 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
9 | * Peter Kese <peter.kese@ijs.si> | ||
10 | * Julian Anastasov <ja@ssi.bg> | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or | ||
13 | * modify it under the terms of the GNU General Public License | ||
14 | * as published by the Free Software Foundation; either version | ||
15 | * 2 of the License, or (at your option) any later version. | ||
16 | * | ||
17 | * Changes: | ||
18 | * | ||
19 | */ | ||
20 | |||
21 | #include <linux/module.h> | ||
22 | #include <linux/init.h> | ||
23 | #include <linux/types.h> | ||
24 | #include <linux/capability.h> | ||
25 | #include <linux/fs.h> | ||
26 | #include <linux/sysctl.h> | ||
27 | #include <linux/proc_fs.h> | ||
28 | #include <linux/workqueue.h> | ||
29 | #include <linux/swap.h> | ||
30 | #include <linux/seq_file.h> | ||
31 | |||
32 | #include <linux/netfilter.h> | ||
33 | #include <linux/netfilter_ipv4.h> | ||
34 | #include <linux/mutex.h> | ||
35 | |||
36 | #include <net/net_namespace.h> | ||
37 | #include <net/ip.h> | ||
38 | #ifdef CONFIG_IP_VS_IPV6 | ||
39 | #include <net/ipv6.h> | ||
40 | #include <net/ip6_route.h> | ||
41 | #endif | ||
42 | #include <net/route.h> | ||
43 | #include <net/sock.h> | ||
44 | #include <net/genetlink.h> | ||
45 | |||
46 | #include <asm/uaccess.h> | ||
47 | |||
48 | #include <net/ip_vs.h> | ||
49 | |||
50 | /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ | ||
51 | static DEFINE_MUTEX(__ip_vs_mutex); | ||
52 | |||
53 | /* lock for service table */ | ||
54 | static DEFINE_RWLOCK(__ip_vs_svc_lock); | ||
55 | |||
56 | /* lock for table with the real services */ | ||
57 | static DEFINE_RWLOCK(__ip_vs_rs_lock); | ||
58 | |||
59 | /* lock for state and timeout tables */ | ||
60 | static DEFINE_RWLOCK(__ip_vs_securetcp_lock); | ||
61 | |||
62 | /* lock for drop entry handling */ | ||
63 | static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); | ||
64 | |||
65 | /* lock for drop packet handling */ | ||
66 | static DEFINE_SPINLOCK(__ip_vs_droppacket_lock); | ||
67 | |||
68 | /* 1/rate drop and drop-entry variables */ | ||
69 | int ip_vs_drop_rate = 0; | ||
70 | int ip_vs_drop_counter = 0; | ||
71 | static atomic_t ip_vs_dropentry = ATOMIC_INIT(0); | ||
72 | |||
73 | /* number of virtual services */ | ||
74 | static int ip_vs_num_services = 0; | ||
75 | |||
76 | /* sysctl variables */ | ||
77 | static int sysctl_ip_vs_drop_entry = 0; | ||
78 | static int sysctl_ip_vs_drop_packet = 0; | ||
79 | static int sysctl_ip_vs_secure_tcp = 0; | ||
80 | static int sysctl_ip_vs_amemthresh = 1024; | ||
81 | static int sysctl_ip_vs_am_droprate = 10; | ||
82 | int sysctl_ip_vs_cache_bypass = 0; | ||
83 | int sysctl_ip_vs_expire_nodest_conn = 0; | ||
84 | int sysctl_ip_vs_expire_quiescent_template = 0; | ||
85 | int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; | ||
86 | int sysctl_ip_vs_nat_icmp_send = 0; | ||
87 | |||
88 | |||
#ifdef CONFIG_IP_VS_DEBUG
/* current debug verbosity; zero-initialised as a static object */
static int sysctl_ip_vs_debug_level;

/*
 *	Accessor for the debug level, only built with CONFIG_IP_VS_DEBUG.
 */
int ip_vs_get_debug_level(void)
{
	return sysctl_ip_vs_debug_level;
}
#endif
97 | |||
98 | #ifdef CONFIG_IP_VS_IPV6 | ||
99 | /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ | ||
100 | static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr) | ||
101 | { | ||
102 | struct rt6_info *rt; | ||
103 | struct flowi fl = { | ||
104 | .oif = 0, | ||
105 | .nl_u = { | ||
106 | .ip6_u = { | ||
107 | .daddr = *addr, | ||
108 | .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, | ||
109 | }; | ||
110 | |||
111 | rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); | ||
112 | if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK)) | ||
113 | return 1; | ||
114 | |||
115 | return 0; | ||
116 | } | ||
117 | #endif | ||
118 | /* | ||
119 | * update_defense_level is called from keventd and from sysctl, | ||
120 | * so it needs to protect itself from softirqs | ||
121 | */ | ||
122 | static void update_defense_level(void) | ||
123 | { | ||
124 | struct sysinfo i; | ||
125 | static int old_secure_tcp = 0; | ||
126 | int availmem; | ||
127 | int nomem; | ||
128 | int to_change = -1; | ||
129 | |||
130 | /* we only count free and buffered memory (in pages) */ | ||
131 | si_meminfo(&i); | ||
132 | availmem = i.freeram + i.bufferram; | ||
133 | /* however in linux 2.5 the i.bufferram is total page cache size, | ||
134 | we need adjust it */ | ||
135 | /* si_swapinfo(&i); */ | ||
136 | /* availmem = availmem - (i.totalswap - i.freeswap); */ | ||
137 | |||
138 | nomem = (availmem < sysctl_ip_vs_amemthresh); | ||
139 | |||
140 | local_bh_disable(); | ||
141 | |||
142 | /* drop_entry */ | ||
143 | spin_lock(&__ip_vs_dropentry_lock); | ||
144 | switch (sysctl_ip_vs_drop_entry) { | ||
145 | case 0: | ||
146 | atomic_set(&ip_vs_dropentry, 0); | ||
147 | break; | ||
148 | case 1: | ||
149 | if (nomem) { | ||
150 | atomic_set(&ip_vs_dropentry, 1); | ||
151 | sysctl_ip_vs_drop_entry = 2; | ||
152 | } else { | ||
153 | atomic_set(&ip_vs_dropentry, 0); | ||
154 | } | ||
155 | break; | ||
156 | case 2: | ||
157 | if (nomem) { | ||
158 | atomic_set(&ip_vs_dropentry, 1); | ||
159 | } else { | ||
160 | atomic_set(&ip_vs_dropentry, 0); | ||
161 | sysctl_ip_vs_drop_entry = 1; | ||
162 | }; | ||
163 | break; | ||
164 | case 3: | ||
165 | atomic_set(&ip_vs_dropentry, 1); | ||
166 | break; | ||
167 | } | ||
168 | spin_unlock(&__ip_vs_dropentry_lock); | ||
169 | |||
170 | /* drop_packet */ | ||
171 | spin_lock(&__ip_vs_droppacket_lock); | ||
172 | switch (sysctl_ip_vs_drop_packet) { | ||
173 | case 0: | ||
174 | ip_vs_drop_rate = 0; | ||
175 | break; | ||
176 | case 1: | ||
177 | if (nomem) { | ||
178 | ip_vs_drop_rate = ip_vs_drop_counter | ||
179 | = sysctl_ip_vs_amemthresh / | ||
180 | (sysctl_ip_vs_amemthresh-availmem); | ||
181 | sysctl_ip_vs_drop_packet = 2; | ||
182 | } else { | ||
183 | ip_vs_drop_rate = 0; | ||
184 | } | ||
185 | break; | ||
186 | case 2: | ||
187 | if (nomem) { | ||
188 | ip_vs_drop_rate = ip_vs_drop_counter | ||
189 | = sysctl_ip_vs_amemthresh / | ||
190 | (sysctl_ip_vs_amemthresh-availmem); | ||
191 | } else { | ||
192 | ip_vs_drop_rate = 0; | ||
193 | sysctl_ip_vs_drop_packet = 1; | ||
194 | } | ||
195 | break; | ||
196 | case 3: | ||
197 | ip_vs_drop_rate = sysctl_ip_vs_am_droprate; | ||
198 | break; | ||
199 | } | ||
200 | spin_unlock(&__ip_vs_droppacket_lock); | ||
201 | |||
202 | /* secure_tcp */ | ||
203 | write_lock(&__ip_vs_securetcp_lock); | ||
204 | switch (sysctl_ip_vs_secure_tcp) { | ||
205 | case 0: | ||
206 | if (old_secure_tcp >= 2) | ||
207 | to_change = 0; | ||
208 | break; | ||
209 | case 1: | ||
210 | if (nomem) { | ||
211 | if (old_secure_tcp < 2) | ||
212 | to_change = 1; | ||
213 | sysctl_ip_vs_secure_tcp = 2; | ||
214 | } else { | ||
215 | if (old_secure_tcp >= 2) | ||
216 | to_change = 0; | ||
217 | } | ||
218 | break; | ||
219 | case 2: | ||
220 | if (nomem) { | ||
221 | if (old_secure_tcp < 2) | ||
222 | to_change = 1; | ||
223 | } else { | ||
224 | if (old_secure_tcp >= 2) | ||
225 | to_change = 0; | ||
226 | sysctl_ip_vs_secure_tcp = 1; | ||
227 | } | ||
228 | break; | ||
229 | case 3: | ||
230 | if (old_secure_tcp < 2) | ||
231 | to_change = 1; | ||
232 | break; | ||
233 | } | ||
234 | old_secure_tcp = sysctl_ip_vs_secure_tcp; | ||
235 | if (to_change >= 0) | ||
236 | ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); | ||
237 | write_unlock(&__ip_vs_securetcp_lock); | ||
238 | |||
239 | local_bh_enable(); | ||
240 | } | ||
241 | |||
242 | |||
243 | /* | ||
244 | * Timer for checking the defense | ||
245 | */ | ||
246 | #define DEFENSE_TIMER_PERIOD 1*HZ | ||
247 | static void defense_work_handler(struct work_struct *work); | ||
248 | static DECLARE_DELAYED_WORK(defense_work, defense_work_handler); | ||
249 | |||
250 | static void defense_work_handler(struct work_struct *work) | ||
251 | { | ||
252 | update_defense_level(); | ||
253 | if (atomic_read(&ip_vs_dropentry)) | ||
254 | ip_vs_random_dropentry(); | ||
255 | |||
256 | schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); | ||
257 | } | ||
258 | |||
259 | int | ||
260 | ip_vs_use_count_inc(void) | ||
261 | { | ||
262 | return try_module_get(THIS_MODULE); | ||
263 | } | ||
264 | |||
265 | void | ||
266 | ip_vs_use_count_dec(void) | ||
267 | { | ||
268 | module_put(THIS_MODULE); | ||
269 | } | ||
270 | |||
271 | |||
272 | /* | ||
273 | * Hash table: for virtual service lookups | ||
274 | */ | ||
275 | #define IP_VS_SVC_TAB_BITS 8 | ||
276 | #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) | ||
277 | #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) | ||
278 | |||
279 | /* the service table hashed by <protocol, addr, port> */ | ||
280 | static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; | ||
281 | /* the service table hashed by fwmark */ | ||
282 | static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; | ||
283 | |||
284 | /* | ||
285 | * Hash table: for real service lookups | ||
286 | */ | ||
287 | #define IP_VS_RTAB_BITS 4 | ||
288 | #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) | ||
289 | #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) | ||
290 | |||
291 | static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; | ||
292 | |||
293 | /* | ||
294 | * Trash for destinations | ||
295 | */ | ||
296 | static LIST_HEAD(ip_vs_dest_trash); | ||
297 | |||
298 | /* | ||
299 | * FTP & NULL virtual service counters | ||
300 | */ | ||
301 | static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); | ||
302 | static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); | ||
303 | |||
304 | |||
305 | /* | ||
306 | * Returns hash value for virtual service | ||
307 | */ | ||
308 | static __inline__ unsigned | ||
309 | ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr, | ||
310 | __be16 port) | ||
311 | { | ||
312 | register unsigned porth = ntohs(port); | ||
313 | __be32 addr_fold = addr->ip; | ||
314 | |||
315 | #ifdef CONFIG_IP_VS_IPV6 | ||
316 | if (af == AF_INET6) | ||
317 | addr_fold = addr->ip6[0]^addr->ip6[1]^ | ||
318 | addr->ip6[2]^addr->ip6[3]; | ||
319 | #endif | ||
320 | |||
321 | return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) | ||
322 | & IP_VS_SVC_TAB_MASK; | ||
323 | } | ||
324 | |||
325 | /* | ||
326 | * Returns hash value of fwmark for virtual service lookup | ||
327 | */ | ||
328 | static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) | ||
329 | { | ||
330 | return fwmark & IP_VS_SVC_TAB_MASK; | ||
331 | } | ||
332 | |||
333 | /* | ||
334 | * Hashes a service in the ip_vs_svc_table by <proto,addr,port> | ||
335 | * or in the ip_vs_svc_fwm_table by fwmark. | ||
336 | * Should be called with locked tables. | ||
337 | */ | ||
338 | static int ip_vs_svc_hash(struct ip_vs_service *svc) | ||
339 | { | ||
340 | unsigned hash; | ||
341 | |||
342 | if (svc->flags & IP_VS_SVC_F_HASHED) { | ||
343 | IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, " | ||
344 | "called from %p\n", __builtin_return_address(0)); | ||
345 | return 0; | ||
346 | } | ||
347 | |||
348 | if (svc->fwmark == 0) { | ||
349 | /* | ||
350 | * Hash it by <protocol,addr,port> in ip_vs_svc_table | ||
351 | */ | ||
352 | hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr, | ||
353 | svc->port); | ||
354 | list_add(&svc->s_list, &ip_vs_svc_table[hash]); | ||
355 | } else { | ||
356 | /* | ||
357 | * Hash it by fwmark in ip_vs_svc_fwm_table | ||
358 | */ | ||
359 | hash = ip_vs_svc_fwm_hashkey(svc->fwmark); | ||
360 | list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); | ||
361 | } | ||
362 | |||
363 | svc->flags |= IP_VS_SVC_F_HASHED; | ||
364 | /* increase its refcnt because it is referenced by the svc table */ | ||
365 | atomic_inc(&svc->refcnt); | ||
366 | return 1; | ||
367 | } | ||
368 | |||
369 | |||
370 | /* | ||
371 | * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. | ||
372 | * Should be called with locked tables. | ||
373 | */ | ||
374 | static int ip_vs_svc_unhash(struct ip_vs_service *svc) | ||
375 | { | ||
376 | if (!(svc->flags & IP_VS_SVC_F_HASHED)) { | ||
377 | IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, " | ||
378 | "called from %p\n", __builtin_return_address(0)); | ||
379 | return 0; | ||
380 | } | ||
381 | |||
382 | if (svc->fwmark == 0) { | ||
383 | /* Remove it from the ip_vs_svc_table table */ | ||
384 | list_del(&svc->s_list); | ||
385 | } else { | ||
386 | /* Remove it from the ip_vs_svc_fwm_table table */ | ||
387 | list_del(&svc->f_list); | ||
388 | } | ||
389 | |||
390 | svc->flags &= ~IP_VS_SVC_F_HASHED; | ||
391 | atomic_dec(&svc->refcnt); | ||
392 | return 1; | ||
393 | } | ||
394 | |||
395 | |||
396 | /* | ||
397 | * Get service by {proto,addr,port} in the service table. | ||
398 | */ | ||
399 | static inline struct ip_vs_service * | ||
400 | __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr, | ||
401 | __be16 vport) | ||
402 | { | ||
403 | unsigned hash; | ||
404 | struct ip_vs_service *svc; | ||
405 | |||
406 | /* Check for "full" addressed entries */ | ||
407 | hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport); | ||
408 | |||
409 | list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ | ||
410 | if ((svc->af == af) | ||
411 | && ip_vs_addr_equal(af, &svc->addr, vaddr) | ||
412 | && (svc->port == vport) | ||
413 | && (svc->protocol == protocol)) { | ||
414 | /* HIT */ | ||
415 | atomic_inc(&svc->usecnt); | ||
416 | return svc; | ||
417 | } | ||
418 | } | ||
419 | |||
420 | return NULL; | ||
421 | } | ||
422 | |||
423 | |||
424 | /* | ||
425 | * Get service by {fwmark} in the service table. | ||
426 | */ | ||
427 | static inline struct ip_vs_service * | ||
428 | __ip_vs_svc_fwm_get(int af, __u32 fwmark) | ||
429 | { | ||
430 | unsigned hash; | ||
431 | struct ip_vs_service *svc; | ||
432 | |||
433 | /* Check for fwmark addressed entries */ | ||
434 | hash = ip_vs_svc_fwm_hashkey(fwmark); | ||
435 | |||
436 | list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { | ||
437 | if (svc->fwmark == fwmark && svc->af == af) { | ||
438 | /* HIT */ | ||
439 | atomic_inc(&svc->usecnt); | ||
440 | return svc; | ||
441 | } | ||
442 | } | ||
443 | |||
444 | return NULL; | ||
445 | } | ||
446 | |||
447 | struct ip_vs_service * | ||
448 | ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, | ||
449 | const union nf_inet_addr *vaddr, __be16 vport) | ||
450 | { | ||
451 | struct ip_vs_service *svc; | ||
452 | |||
453 | read_lock(&__ip_vs_svc_lock); | ||
454 | |||
455 | /* | ||
456 | * Check the table hashed by fwmark first | ||
457 | */ | ||
458 | if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark))) | ||
459 | goto out; | ||
460 | |||
461 | /* | ||
462 | * Check the table hashed by <protocol,addr,port> | ||
463 | * for "full" addressed entries | ||
464 | */ | ||
465 | svc = __ip_vs_service_get(af, protocol, vaddr, vport); | ||
466 | |||
467 | if (svc == NULL | ||
468 | && protocol == IPPROTO_TCP | ||
469 | && atomic_read(&ip_vs_ftpsvc_counter) | ||
470 | && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { | ||
471 | /* | ||
472 | * Check if ftp service entry exists, the packet | ||
473 | * might belong to FTP data connections. | ||
474 | */ | ||
475 | svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT); | ||
476 | } | ||
477 | |||
478 | if (svc == NULL | ||
479 | && atomic_read(&ip_vs_nullsvc_counter)) { | ||
480 | /* | ||
481 | * Check if the catch-all port (port zero) exists | ||
482 | */ | ||
483 | svc = __ip_vs_service_get(af, protocol, vaddr, 0); | ||
484 | } | ||
485 | |||
486 | out: | ||
487 | read_unlock(&__ip_vs_svc_lock); | ||
488 | |||
489 | IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", | ||
490 | fwmark, ip_vs_proto_name(protocol), | ||
491 | IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), | ||
492 | svc ? "hit" : "not hit"); | ||
493 | |||
494 | return svc; | ||
495 | } | ||
496 | |||
497 | |||
498 | static inline void | ||
499 | __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) | ||
500 | { | ||
501 | atomic_inc(&svc->refcnt); | ||
502 | dest->svc = svc; | ||
503 | } | ||
504 | |||
505 | static inline void | ||
506 | __ip_vs_unbind_svc(struct ip_vs_dest *dest) | ||
507 | { | ||
508 | struct ip_vs_service *svc = dest->svc; | ||
509 | |||
510 | dest->svc = NULL; | ||
511 | if (atomic_dec_and_test(&svc->refcnt)) | ||
512 | kfree(svc); | ||
513 | } | ||
514 | |||
515 | |||
516 | /* | ||
517 | * Returns hash value for real service | ||
518 | */ | ||
519 | static inline unsigned ip_vs_rs_hashkey(int af, | ||
520 | const union nf_inet_addr *addr, | ||
521 | __be16 port) | ||
522 | { | ||
523 | register unsigned porth = ntohs(port); | ||
524 | __be32 addr_fold = addr->ip; | ||
525 | |||
526 | #ifdef CONFIG_IP_VS_IPV6 | ||
527 | if (af == AF_INET6) | ||
528 | addr_fold = addr->ip6[0]^addr->ip6[1]^ | ||
529 | addr->ip6[2]^addr->ip6[3]; | ||
530 | #endif | ||
531 | |||
532 | return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) | ||
533 | & IP_VS_RTAB_MASK; | ||
534 | } | ||
535 | |||
536 | /* | ||
537 | * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>. | ||
538 | * should be called with locked tables. | ||
539 | */ | ||
540 | static int ip_vs_rs_hash(struct ip_vs_dest *dest) | ||
541 | { | ||
542 | unsigned hash; | ||
543 | |||
544 | if (!list_empty(&dest->d_list)) { | ||
545 | return 0; | ||
546 | } | ||
547 | |||
548 | /* | ||
549 | * Hash by proto,addr,port, | ||
550 | * which are the parameters of the real service. | ||
551 | */ | ||
552 | hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); | ||
553 | |||
554 | list_add(&dest->d_list, &ip_vs_rtable[hash]); | ||
555 | |||
556 | return 1; | ||
557 | } | ||
558 | |||
559 | /* | ||
560 | * UNhashes ip_vs_dest from ip_vs_rtable. | ||
561 | * should be called with locked tables. | ||
562 | */ | ||
563 | static int ip_vs_rs_unhash(struct ip_vs_dest *dest) | ||
564 | { | ||
565 | /* | ||
566 | * Remove it from the ip_vs_rtable table. | ||
567 | */ | ||
568 | if (!list_empty(&dest->d_list)) { | ||
569 | list_del(&dest->d_list); | ||
570 | INIT_LIST_HEAD(&dest->d_list); | ||
571 | } | ||
572 | |||
573 | return 1; | ||
574 | } | ||
575 | |||
576 | /* | ||
577 | * Lookup real service by <proto,addr,port> in the real service table. | ||
578 | */ | ||
579 | struct ip_vs_dest * | ||
580 | ip_vs_lookup_real_service(int af, __u16 protocol, | ||
581 | const union nf_inet_addr *daddr, | ||
582 | __be16 dport) | ||
583 | { | ||
584 | unsigned hash; | ||
585 | struct ip_vs_dest *dest; | ||
586 | |||
587 | /* | ||
588 | * Check for "full" addressed entries | ||
589 | * Return the first found entry | ||
590 | */ | ||
591 | hash = ip_vs_rs_hashkey(af, daddr, dport); | ||
592 | |||
593 | read_lock(&__ip_vs_rs_lock); | ||
594 | list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { | ||
595 | if ((dest->af == af) | ||
596 | && ip_vs_addr_equal(af, &dest->addr, daddr) | ||
597 | && (dest->port == dport) | ||
598 | && ((dest->protocol == protocol) || | ||
599 | dest->vfwmark)) { | ||
600 | /* HIT */ | ||
601 | read_unlock(&__ip_vs_rs_lock); | ||
602 | return dest; | ||
603 | } | ||
604 | } | ||
605 | read_unlock(&__ip_vs_rs_lock); | ||
606 | |||
607 | return NULL; | ||
608 | } | ||
609 | |||
610 | /* | ||
611 | * Lookup destination by {addr,port} in the given service | ||
612 | */ | ||
613 | static struct ip_vs_dest * | ||
614 | ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, | ||
615 | __be16 dport) | ||
616 | { | ||
617 | struct ip_vs_dest *dest; | ||
618 | |||
619 | /* | ||
620 | * Find the destination for the given service | ||
621 | */ | ||
622 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
623 | if ((dest->af == svc->af) | ||
624 | && ip_vs_addr_equal(svc->af, &dest->addr, daddr) | ||
625 | && (dest->port == dport)) { | ||
626 | /* HIT */ | ||
627 | return dest; | ||
628 | } | ||
629 | } | ||
630 | |||
631 | return NULL; | ||
632 | } | ||
633 | |||
634 | /* | ||
635 | * Find destination by {daddr,dport,vaddr,protocol} | ||
636 | * Cretaed to be used in ip_vs_process_message() in | ||
637 | * the backup synchronization daemon. It finds the | ||
638 | * destination to be bound to the received connection | ||
639 | * on the backup. | ||
640 | * | ||
641 | * ip_vs_lookup_real_service() looked promissing, but | ||
642 | * seems not working as expected. | ||
643 | */ | ||
644 | struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr, | ||
645 | __be16 dport, | ||
646 | const union nf_inet_addr *vaddr, | ||
647 | __be16 vport, __u16 protocol) | ||
648 | { | ||
649 | struct ip_vs_dest *dest; | ||
650 | struct ip_vs_service *svc; | ||
651 | |||
652 | svc = ip_vs_service_get(af, 0, protocol, vaddr, vport); | ||
653 | if (!svc) | ||
654 | return NULL; | ||
655 | dest = ip_vs_lookup_dest(svc, daddr, dport); | ||
656 | if (dest) | ||
657 | atomic_inc(&dest->refcnt); | ||
658 | ip_vs_service_put(svc); | ||
659 | return dest; | ||
660 | } | ||
661 | |||
662 | /* | ||
663 | * Lookup dest by {svc,addr,port} in the destination trash. | ||
664 | * The destination trash is used to hold the destinations that are removed | ||
665 | * from the service table but are still referenced by some conn entries. | ||
666 | * The reason to add the destination trash is when the dest is temporary | ||
667 | * down (either by administrator or by monitor program), the dest can be | ||
668 | * picked back from the trash, the remaining connections to the dest can | ||
669 | * continue, and the counting information of the dest is also useful for | ||
670 | * scheduling. | ||
671 | */ | ||
672 | static struct ip_vs_dest * | ||
673 | ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, | ||
674 | __be16 dport) | ||
675 | { | ||
676 | struct ip_vs_dest *dest, *nxt; | ||
677 | |||
678 | /* | ||
679 | * Find the destination in trash | ||
680 | */ | ||
681 | list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { | ||
682 | IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " | ||
683 | "dest->refcnt=%d\n", | ||
684 | dest->vfwmark, | ||
685 | IP_VS_DBG_ADDR(svc->af, &dest->addr), | ||
686 | ntohs(dest->port), | ||
687 | atomic_read(&dest->refcnt)); | ||
688 | if (dest->af == svc->af && | ||
689 | ip_vs_addr_equal(svc->af, &dest->addr, daddr) && | ||
690 | dest->port == dport && | ||
691 | dest->vfwmark == svc->fwmark && | ||
692 | dest->protocol == svc->protocol && | ||
693 | (svc->fwmark || | ||
694 | (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && | ||
695 | dest->vport == svc->port))) { | ||
696 | /* HIT */ | ||
697 | return dest; | ||
698 | } | ||
699 | |||
700 | /* | ||
701 | * Try to purge the destination from trash if not referenced | ||
702 | */ | ||
703 | if (atomic_read(&dest->refcnt) == 1) { | ||
704 | IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u " | ||
705 | "from trash\n", | ||
706 | dest->vfwmark, | ||
707 | IP_VS_DBG_ADDR(svc->af, &dest->addr), | ||
708 | ntohs(dest->port)); | ||
709 | list_del(&dest->n_list); | ||
710 | ip_vs_dst_reset(dest); | ||
711 | __ip_vs_unbind_svc(dest); | ||
712 | kfree(dest); | ||
713 | } | ||
714 | } | ||
715 | |||
716 | return NULL; | ||
717 | } | ||
718 | |||
719 | |||
720 | /* | ||
721 | * Clean up all the destinations in the trash | ||
722 | * Called by the ip_vs_control_cleanup() | ||
723 | * | ||
724 | * When the ip_vs_control_clearup is activated by ipvs module exit, | ||
725 | * the service tables must have been flushed and all the connections | ||
726 | * are expired, and the refcnt of each destination in the trash must | ||
727 | * be 1, so we simply release them here. | ||
728 | */ | ||
729 | static void ip_vs_trash_cleanup(void) | ||
730 | { | ||
731 | struct ip_vs_dest *dest, *nxt; | ||
732 | |||
733 | list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { | ||
734 | list_del(&dest->n_list); | ||
735 | ip_vs_dst_reset(dest); | ||
736 | __ip_vs_unbind_svc(dest); | ||
737 | kfree(dest); | ||
738 | } | ||
739 | } | ||
740 | |||
741 | |||
742 | static void | ||
743 | ip_vs_zero_stats(struct ip_vs_stats *stats) | ||
744 | { | ||
745 | spin_lock_bh(&stats->lock); | ||
746 | |||
747 | memset(&stats->ustats, 0, sizeof(stats->ustats)); | ||
748 | ip_vs_zero_estimator(stats); | ||
749 | |||
750 | spin_unlock_bh(&stats->lock); | ||
751 | } | ||
752 | |||
753 | /* | ||
754 | * Update a destination in the given service | ||
755 | */ | ||
756 | static void | ||
757 | __ip_vs_update_dest(struct ip_vs_service *svc, | ||
758 | struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest) | ||
759 | { | ||
760 | int conn_flags; | ||
761 | |||
762 | /* set the weight and the flags */ | ||
763 | atomic_set(&dest->weight, udest->weight); | ||
764 | conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; | ||
765 | |||
766 | /* check if local node and update the flags */ | ||
767 | #ifdef CONFIG_IP_VS_IPV6 | ||
768 | if (svc->af == AF_INET6) { | ||
769 | if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) { | ||
770 | conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) | ||
771 | | IP_VS_CONN_F_LOCALNODE; | ||
772 | } | ||
773 | } else | ||
774 | #endif | ||
775 | if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) { | ||
776 | conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) | ||
777 | | IP_VS_CONN_F_LOCALNODE; | ||
778 | } | ||
779 | |||
780 | /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ | ||
781 | if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { | ||
782 | conn_flags |= IP_VS_CONN_F_NOOUTPUT; | ||
783 | } else { | ||
784 | /* | ||
785 | * Put the real service in ip_vs_rtable if not present. | ||
786 | * For now only for NAT! | ||
787 | */ | ||
788 | write_lock_bh(&__ip_vs_rs_lock); | ||
789 | ip_vs_rs_hash(dest); | ||
790 | write_unlock_bh(&__ip_vs_rs_lock); | ||
791 | } | ||
792 | atomic_set(&dest->conn_flags, conn_flags); | ||
793 | |||
794 | /* bind the service */ | ||
795 | if (!dest->svc) { | ||
796 | __ip_vs_bind_svc(dest, svc); | ||
797 | } else { | ||
798 | if (dest->svc != svc) { | ||
799 | __ip_vs_unbind_svc(dest); | ||
800 | ip_vs_zero_stats(&dest->stats); | ||
801 | __ip_vs_bind_svc(dest, svc); | ||
802 | } | ||
803 | } | ||
804 | |||
805 | /* set the dest status flags */ | ||
806 | dest->flags |= IP_VS_DEST_F_AVAILABLE; | ||
807 | |||
808 | if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) | ||
809 | dest->flags &= ~IP_VS_DEST_F_OVERLOAD; | ||
810 | dest->u_threshold = udest->u_threshold; | ||
811 | dest->l_threshold = udest->l_threshold; | ||
812 | } | ||
813 | |||
814 | |||
815 | /* | ||
816 | * Create a destination for the given service | ||
817 | */ | ||
818 | static int | ||
819 | ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, | ||
820 | struct ip_vs_dest **dest_p) | ||
821 | { | ||
822 | struct ip_vs_dest *dest; | ||
823 | unsigned atype; | ||
824 | |||
825 | EnterFunction(2); | ||
826 | |||
827 | #ifdef CONFIG_IP_VS_IPV6 | ||
828 | if (svc->af == AF_INET6) { | ||
829 | atype = ipv6_addr_type(&udest->addr.in6); | ||
830 | if ((!(atype & IPV6_ADDR_UNICAST) || | ||
831 | atype & IPV6_ADDR_LINKLOCAL) && | ||
832 | !__ip_vs_addr_is_local_v6(&udest->addr.in6)) | ||
833 | return -EINVAL; | ||
834 | } else | ||
835 | #endif | ||
836 | { | ||
837 | atype = inet_addr_type(&init_net, udest->addr.ip); | ||
838 | if (atype != RTN_LOCAL && atype != RTN_UNICAST) | ||
839 | return -EINVAL; | ||
840 | } | ||
841 | |||
842 | dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); | ||
843 | if (dest == NULL) { | ||
844 | IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n"); | ||
845 | return -ENOMEM; | ||
846 | } | ||
847 | |||
848 | dest->af = svc->af; | ||
849 | dest->protocol = svc->protocol; | ||
850 | dest->vaddr = svc->addr; | ||
851 | dest->vport = svc->port; | ||
852 | dest->vfwmark = svc->fwmark; | ||
853 | ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr); | ||
854 | dest->port = udest->port; | ||
855 | |||
856 | atomic_set(&dest->activeconns, 0); | ||
857 | atomic_set(&dest->inactconns, 0); | ||
858 | atomic_set(&dest->persistconns, 0); | ||
859 | atomic_set(&dest->refcnt, 0); | ||
860 | |||
861 | INIT_LIST_HEAD(&dest->d_list); | ||
862 | spin_lock_init(&dest->dst_lock); | ||
863 | spin_lock_init(&dest->stats.lock); | ||
864 | __ip_vs_update_dest(svc, dest, udest); | ||
865 | ip_vs_new_estimator(&dest->stats); | ||
866 | |||
867 | *dest_p = dest; | ||
868 | |||
869 | LeaveFunction(2); | ||
870 | return 0; | ||
871 | } | ||
872 | |||
873 | |||
874 | /* | ||
875 | * Add a destination into an existing service | ||
876 | */ | ||
877 | static int | ||
878 | ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) | ||
879 | { | ||
880 | struct ip_vs_dest *dest; | ||
881 | union nf_inet_addr daddr; | ||
882 | __be16 dport = udest->port; | ||
883 | int ret; | ||
884 | |||
885 | EnterFunction(2); | ||
886 | |||
887 | if (udest->weight < 0) { | ||
888 | IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); | ||
889 | return -ERANGE; | ||
890 | } | ||
891 | |||
892 | if (udest->l_threshold > udest->u_threshold) { | ||
893 | IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than " | ||
894 | "upper threshold\n"); | ||
895 | return -ERANGE; | ||
896 | } | ||
897 | |||
898 | ip_vs_addr_copy(svc->af, &daddr, &udest->addr); | ||
899 | |||
900 | /* | ||
901 | * Check if the dest already exists in the list | ||
902 | */ | ||
903 | dest = ip_vs_lookup_dest(svc, &daddr, dport); | ||
904 | |||
905 | if (dest != NULL) { | ||
906 | IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); | ||
907 | return -EEXIST; | ||
908 | } | ||
909 | |||
910 | /* | ||
911 | * Check if the dest already exists in the trash and | ||
912 | * is from the same service | ||
913 | */ | ||
914 | dest = ip_vs_trash_get_dest(svc, &daddr, dport); | ||
915 | |||
916 | if (dest != NULL) { | ||
917 | IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " | ||
918 | "dest->refcnt=%d, service %u/%s:%u\n", | ||
919 | IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport), | ||
920 | atomic_read(&dest->refcnt), | ||
921 | dest->vfwmark, | ||
922 | IP_VS_DBG_ADDR(svc->af, &dest->vaddr), | ||
923 | ntohs(dest->vport)); | ||
924 | |||
925 | __ip_vs_update_dest(svc, dest, udest); | ||
926 | |||
927 | /* | ||
928 | * Get the destination from the trash | ||
929 | */ | ||
930 | list_del(&dest->n_list); | ||
931 | |||
932 | ip_vs_new_estimator(&dest->stats); | ||
933 | |||
934 | write_lock_bh(&__ip_vs_svc_lock); | ||
935 | |||
936 | /* | ||
937 | * Wait until all other svc users go away. | ||
938 | */ | ||
939 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); | ||
940 | |||
941 | list_add(&dest->n_list, &svc->destinations); | ||
942 | svc->num_dests++; | ||
943 | |||
944 | /* call the update_service function of its scheduler */ | ||
945 | if (svc->scheduler->update_service) | ||
946 | svc->scheduler->update_service(svc); | ||
947 | |||
948 | write_unlock_bh(&__ip_vs_svc_lock); | ||
949 | return 0; | ||
950 | } | ||
951 | |||
952 | /* | ||
953 | * Allocate and initialize the dest structure | ||
954 | */ | ||
955 | ret = ip_vs_new_dest(svc, udest, &dest); | ||
956 | if (ret) { | ||
957 | return ret; | ||
958 | } | ||
959 | |||
960 | /* | ||
961 | * Add the dest entry into the list | ||
962 | */ | ||
963 | atomic_inc(&dest->refcnt); | ||
964 | |||
965 | write_lock_bh(&__ip_vs_svc_lock); | ||
966 | |||
967 | /* | ||
968 | * Wait until all other svc users go away. | ||
969 | */ | ||
970 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); | ||
971 | |||
972 | list_add(&dest->n_list, &svc->destinations); | ||
973 | svc->num_dests++; | ||
974 | |||
975 | /* call the update_service function of its scheduler */ | ||
976 | if (svc->scheduler->update_service) | ||
977 | svc->scheduler->update_service(svc); | ||
978 | |||
979 | write_unlock_bh(&__ip_vs_svc_lock); | ||
980 | |||
981 | LeaveFunction(2); | ||
982 | |||
983 | return 0; | ||
984 | } | ||
985 | |||
986 | |||
987 | /* | ||
988 | * Edit a destination in the given service | ||
989 | */ | ||
990 | static int | ||
991 | ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) | ||
992 | { | ||
993 | struct ip_vs_dest *dest; | ||
994 | union nf_inet_addr daddr; | ||
995 | __be16 dport = udest->port; | ||
996 | |||
997 | EnterFunction(2); | ||
998 | |||
999 | if (udest->weight < 0) { | ||
1000 | IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n"); | ||
1001 | return -ERANGE; | ||
1002 | } | ||
1003 | |||
1004 | if (udest->l_threshold > udest->u_threshold) { | ||
1005 | IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than " | ||
1006 | "upper threshold\n"); | ||
1007 | return -ERANGE; | ||
1008 | } | ||
1009 | |||
1010 | ip_vs_addr_copy(svc->af, &daddr, &udest->addr); | ||
1011 | |||
1012 | /* | ||
1013 | * Lookup the destination list | ||
1014 | */ | ||
1015 | dest = ip_vs_lookup_dest(svc, &daddr, dport); | ||
1016 | |||
1017 | if (dest == NULL) { | ||
1018 | IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); | ||
1019 | return -ENOENT; | ||
1020 | } | ||
1021 | |||
1022 | __ip_vs_update_dest(svc, dest, udest); | ||
1023 | |||
1024 | write_lock_bh(&__ip_vs_svc_lock); | ||
1025 | |||
1026 | /* Wait until all other svc users go away */ | ||
1027 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); | ||
1028 | |||
1029 | /* call the update_service, because server weight may be changed */ | ||
1030 | if (svc->scheduler->update_service) | ||
1031 | svc->scheduler->update_service(svc); | ||
1032 | |||
1033 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1034 | |||
1035 | LeaveFunction(2); | ||
1036 | |||
1037 | return 0; | ||
1038 | } | ||
1039 | |||
1040 | |||
1041 | /* | ||
1042 | * Delete a destination (must be already unlinked from the service) | ||
1043 | */ | ||
1044 | static void __ip_vs_del_dest(struct ip_vs_dest *dest) | ||
1045 | { | ||
1046 | ip_vs_kill_estimator(&dest->stats); | ||
1047 | |||
1048 | /* | ||
1049 | * Remove it from the d-linked list with the real services. | ||
1050 | */ | ||
1051 | write_lock_bh(&__ip_vs_rs_lock); | ||
1052 | ip_vs_rs_unhash(dest); | ||
1053 | write_unlock_bh(&__ip_vs_rs_lock); | ||
1054 | |||
1055 | /* | ||
1056 | * Decrease the refcnt of the dest, and free the dest | ||
1057 | * if nobody refers to it (refcnt=0). Otherwise, throw | ||
1058 | * the destination into the trash. | ||
1059 | */ | ||
1060 | if (atomic_dec_and_test(&dest->refcnt)) { | ||
1061 | ip_vs_dst_reset(dest); | ||
1062 | /* simply decrease svc->refcnt here, let the caller check | ||
1063 | and release the service if nobody refers to it. | ||
1064 | Only user context can release destination and service, | ||
1065 | and only one user context can update virtual service at a | ||
1066 | time, so the operation here is OK */ | ||
1067 | atomic_dec(&dest->svc->refcnt); | ||
1068 | kfree(dest); | ||
1069 | } else { | ||
1070 | IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " | ||
1071 | "dest->refcnt=%d\n", | ||
1072 | IP_VS_DBG_ADDR(dest->af, &dest->addr), | ||
1073 | ntohs(dest->port), | ||
1074 | atomic_read(&dest->refcnt)); | ||
1075 | list_add(&dest->n_list, &ip_vs_dest_trash); | ||
1076 | atomic_inc(&dest->refcnt); | ||
1077 | } | ||
1078 | } | ||
1079 | |||
1080 | |||
1081 | /* | ||
1082 | * Unlink a destination from the given service | ||
1083 | */ | ||
1084 | static void __ip_vs_unlink_dest(struct ip_vs_service *svc, | ||
1085 | struct ip_vs_dest *dest, | ||
1086 | int svcupd) | ||
1087 | { | ||
1088 | dest->flags &= ~IP_VS_DEST_F_AVAILABLE; | ||
1089 | |||
1090 | /* | ||
1091 | * Remove it from the d-linked destination list. | ||
1092 | */ | ||
1093 | list_del(&dest->n_list); | ||
1094 | svc->num_dests--; | ||
1095 | |||
1096 | /* | ||
1097 | * Call the update_service function of its scheduler | ||
1098 | */ | ||
1099 | if (svcupd && svc->scheduler->update_service) | ||
1100 | svc->scheduler->update_service(svc); | ||
1101 | } | ||
1102 | |||
1103 | |||
1104 | /* | ||
1105 | * Delete a destination server in the given service | ||
1106 | */ | ||
1107 | static int | ||
1108 | ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) | ||
1109 | { | ||
1110 | struct ip_vs_dest *dest; | ||
1111 | __be16 dport = udest->port; | ||
1112 | |||
1113 | EnterFunction(2); | ||
1114 | |||
1115 | dest = ip_vs_lookup_dest(svc, &udest->addr, dport); | ||
1116 | |||
1117 | if (dest == NULL) { | ||
1118 | IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); | ||
1119 | return -ENOENT; | ||
1120 | } | ||
1121 | |||
1122 | write_lock_bh(&__ip_vs_svc_lock); | ||
1123 | |||
1124 | /* | ||
1125 | * Wait until all other svc users go away. | ||
1126 | */ | ||
1127 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); | ||
1128 | |||
1129 | /* | ||
1130 | * Unlink dest from the service | ||
1131 | */ | ||
1132 | __ip_vs_unlink_dest(svc, dest, 1); | ||
1133 | |||
1134 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1135 | |||
1136 | /* | ||
1137 | * Delete the destination | ||
1138 | */ | ||
1139 | __ip_vs_del_dest(dest); | ||
1140 | |||
1141 | LeaveFunction(2); | ||
1142 | |||
1143 | return 0; | ||
1144 | } | ||
1145 | |||
1146 | |||
1147 | /* | ||
1148 | * Add a service into the service hash table | ||
1149 | */ | ||
1150 | static int | ||
1151 | ip_vs_add_service(struct ip_vs_service_user_kern *u, | ||
1152 | struct ip_vs_service **svc_p) | ||
1153 | { | ||
1154 | int ret = 0; | ||
1155 | struct ip_vs_scheduler *sched = NULL; | ||
1156 | struct ip_vs_service *svc = NULL; | ||
1157 | |||
1158 | /* increase the module use count */ | ||
1159 | ip_vs_use_count_inc(); | ||
1160 | |||
1161 | /* Lookup the scheduler by 'u->sched_name' */ | ||
1162 | sched = ip_vs_scheduler_get(u->sched_name); | ||
1163 | if (sched == NULL) { | ||
1164 | IP_VS_INFO("Scheduler module ip_vs_%s not found\n", | ||
1165 | u->sched_name); | ||
1166 | ret = -ENOENT; | ||
1167 | goto out_mod_dec; | ||
1168 | } | ||
1169 | |||
1170 | #ifdef CONFIG_IP_VS_IPV6 | ||
1171 | if (u->af == AF_INET6) { | ||
1172 | if (!sched->supports_ipv6) { | ||
1173 | ret = -EAFNOSUPPORT; | ||
1174 | goto out_err; | ||
1175 | } | ||
1176 | if ((u->netmask < 1) || (u->netmask > 128)) { | ||
1177 | ret = -EINVAL; | ||
1178 | goto out_err; | ||
1179 | } | ||
1180 | } | ||
1181 | #endif | ||
1182 | |||
1183 | svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); | ||
1184 | if (svc == NULL) { | ||
1185 | IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); | ||
1186 | ret = -ENOMEM; | ||
1187 | goto out_err; | ||
1188 | } | ||
1189 | |||
1190 | /* I'm the first user of the service */ | ||
1191 | atomic_set(&svc->usecnt, 1); | ||
1192 | atomic_set(&svc->refcnt, 0); | ||
1193 | |||
1194 | svc->af = u->af; | ||
1195 | svc->protocol = u->protocol; | ||
1196 | ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); | ||
1197 | svc->port = u->port; | ||
1198 | svc->fwmark = u->fwmark; | ||
1199 | svc->flags = u->flags; | ||
1200 | svc->timeout = u->timeout * HZ; | ||
1201 | svc->netmask = u->netmask; | ||
1202 | |||
1203 | INIT_LIST_HEAD(&svc->destinations); | ||
1204 | rwlock_init(&svc->sched_lock); | ||
1205 | spin_lock_init(&svc->stats.lock); | ||
1206 | |||
1207 | /* Bind the scheduler */ | ||
1208 | ret = ip_vs_bind_scheduler(svc, sched); | ||
1209 | if (ret) | ||
1210 | goto out_err; | ||
1211 | sched = NULL; | ||
1212 | |||
1213 | /* Update the virtual service counters */ | ||
1214 | if (svc->port == FTPPORT) | ||
1215 | atomic_inc(&ip_vs_ftpsvc_counter); | ||
1216 | else if (svc->port == 0) | ||
1217 | atomic_inc(&ip_vs_nullsvc_counter); | ||
1218 | |||
1219 | ip_vs_new_estimator(&svc->stats); | ||
1220 | |||
1221 | /* Count only IPv4 services for old get/setsockopt interface */ | ||
1222 | if (svc->af == AF_INET) | ||
1223 | ip_vs_num_services++; | ||
1224 | |||
1225 | /* Hash the service into the service table */ | ||
1226 | write_lock_bh(&__ip_vs_svc_lock); | ||
1227 | ip_vs_svc_hash(svc); | ||
1228 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1229 | |||
1230 | *svc_p = svc; | ||
1231 | return 0; | ||
1232 | |||
1233 | out_err: | ||
1234 | if (svc != NULL) { | ||
1235 | if (svc->scheduler) | ||
1236 | ip_vs_unbind_scheduler(svc); | ||
1237 | if (svc->inc) { | ||
1238 | local_bh_disable(); | ||
1239 | ip_vs_app_inc_put(svc->inc); | ||
1240 | local_bh_enable(); | ||
1241 | } | ||
1242 | kfree(svc); | ||
1243 | } | ||
1244 | ip_vs_scheduler_put(sched); | ||
1245 | |||
1246 | out_mod_dec: | ||
1247 | /* decrease the module use count */ | ||
1248 | ip_vs_use_count_dec(); | ||
1249 | |||
1250 | return ret; | ||
1251 | } | ||
1252 | |||
1253 | |||
1254 | /* | ||
1255 | * Edit a service and bind it with a new scheduler | ||
1256 | */ | ||
1257 | static int | ||
1258 | ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) | ||
1259 | { | ||
1260 | struct ip_vs_scheduler *sched, *old_sched; | ||
1261 | int ret = 0; | ||
1262 | |||
1263 | /* | ||
1264 | * Lookup the scheduler, by 'u->sched_name' | ||
1265 | */ | ||
1266 | sched = ip_vs_scheduler_get(u->sched_name); | ||
1267 | if (sched == NULL) { | ||
1268 | IP_VS_INFO("Scheduler module ip_vs_%s not found\n", | ||
1269 | u->sched_name); | ||
1270 | return -ENOENT; | ||
1271 | } | ||
1272 | old_sched = sched; | ||
1273 | |||
1274 | #ifdef CONFIG_IP_VS_IPV6 | ||
1275 | if (u->af == AF_INET6) { | ||
1276 | if (!sched->supports_ipv6) { | ||
1277 | ret = -EAFNOSUPPORT; | ||
1278 | goto out; | ||
1279 | } | ||
1280 | if ((u->netmask < 1) || (u->netmask > 128)) { | ||
1281 | ret = -EINVAL; | ||
1282 | goto out; | ||
1283 | } | ||
1284 | } | ||
1285 | #endif | ||
1286 | |||
1287 | write_lock_bh(&__ip_vs_svc_lock); | ||
1288 | |||
1289 | /* | ||
1290 | * Wait until all other svc users go away. | ||
1291 | */ | ||
1292 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); | ||
1293 | |||
1294 | /* | ||
1295 | * Set the flags and timeout value | ||
1296 | */ | ||
1297 | svc->flags = u->flags | IP_VS_SVC_F_HASHED; | ||
1298 | svc->timeout = u->timeout * HZ; | ||
1299 | svc->netmask = u->netmask; | ||
1300 | |||
1301 | old_sched = svc->scheduler; | ||
1302 | if (sched != old_sched) { | ||
1303 | /* | ||
1304 | * Unbind the old scheduler | ||
1305 | */ | ||
1306 | if ((ret = ip_vs_unbind_scheduler(svc))) { | ||
1307 | old_sched = sched; | ||
1308 | goto out_unlock; | ||
1309 | } | ||
1310 | |||
1311 | /* | ||
1312 | * Bind the new scheduler | ||
1313 | */ | ||
1314 | if ((ret = ip_vs_bind_scheduler(svc, sched))) { | ||
1315 | /* | ||
1316 | * If ip_vs_bind_scheduler fails, restore the old | ||
1317 | * scheduler. | ||
1318 | * The main reason of failure is out of memory. | ||
1319 | * | ||
1320 | * The question is if the old scheduler can be | ||
1321 | * restored all the time. TODO: if it cannot be | ||
1322 | * restored some time, we must delete the service, | ||
1323 | * otherwise the system may crash. | ||
1324 | */ | ||
1325 | ip_vs_bind_scheduler(svc, old_sched); | ||
1326 | old_sched = sched; | ||
1327 | goto out_unlock; | ||
1328 | } | ||
1329 | } | ||
1330 | |||
1331 | out_unlock: | ||
1332 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1333 | #ifdef CONFIG_IP_VS_IPV6 | ||
1334 | out: | ||
1335 | #endif | ||
1336 | |||
1337 | if (old_sched) | ||
1338 | ip_vs_scheduler_put(old_sched); | ||
1339 | |||
1340 | return ret; | ||
1341 | } | ||
1342 | |||
1343 | |||
1344 | /* | ||
1345 | * Delete a service from the service list | ||
1346 | * - The service must be unlinked, unlocked and not referenced! | ||
1347 | * - We are called under _bh lock | ||
1348 | */ | ||
1349 | static void __ip_vs_del_service(struct ip_vs_service *svc) | ||
1350 | { | ||
1351 | struct ip_vs_dest *dest, *nxt; | ||
1352 | struct ip_vs_scheduler *old_sched; | ||
1353 | |||
1354 | /* Count only IPv4 services for old get/setsockopt interface */ | ||
1355 | if (svc->af == AF_INET) | ||
1356 | ip_vs_num_services--; | ||
1357 | |||
1358 | ip_vs_kill_estimator(&svc->stats); | ||
1359 | |||
1360 | /* Unbind scheduler */ | ||
1361 | old_sched = svc->scheduler; | ||
1362 | ip_vs_unbind_scheduler(svc); | ||
1363 | if (old_sched) | ||
1364 | ip_vs_scheduler_put(old_sched); | ||
1365 | |||
1366 | /* Unbind app inc */ | ||
1367 | if (svc->inc) { | ||
1368 | ip_vs_app_inc_put(svc->inc); | ||
1369 | svc->inc = NULL; | ||
1370 | } | ||
1371 | |||
1372 | /* | ||
1373 | * Unlink the whole destination list | ||
1374 | */ | ||
1375 | list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { | ||
1376 | __ip_vs_unlink_dest(svc, dest, 0); | ||
1377 | __ip_vs_del_dest(dest); | ||
1378 | } | ||
1379 | |||
1380 | /* | ||
1381 | * Update the virtual service counters | ||
1382 | */ | ||
1383 | if (svc->port == FTPPORT) | ||
1384 | atomic_dec(&ip_vs_ftpsvc_counter); | ||
1385 | else if (svc->port == 0) | ||
1386 | atomic_dec(&ip_vs_nullsvc_counter); | ||
1387 | |||
1388 | /* | ||
1389 | * Free the service if nobody refers to it | ||
1390 | */ | ||
1391 | if (atomic_read(&svc->refcnt) == 0) | ||
1392 | kfree(svc); | ||
1393 | |||
1394 | /* decrease the module use count */ | ||
1395 | ip_vs_use_count_dec(); | ||
1396 | } | ||
1397 | |||
1398 | /* | ||
1399 | * Delete a service from the service list | ||
1400 | */ | ||
1401 | static int ip_vs_del_service(struct ip_vs_service *svc) | ||
1402 | { | ||
1403 | if (svc == NULL) | ||
1404 | return -EEXIST; | ||
1405 | |||
1406 | /* | ||
1407 | * Unhash it from the service table | ||
1408 | */ | ||
1409 | write_lock_bh(&__ip_vs_svc_lock); | ||
1410 | |||
1411 | ip_vs_svc_unhash(svc); | ||
1412 | |||
1413 | /* | ||
1414 | * Wait until all the svc users go away. | ||
1415 | */ | ||
1416 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); | ||
1417 | |||
1418 | __ip_vs_del_service(svc); | ||
1419 | |||
1420 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1421 | |||
1422 | return 0; | ||
1423 | } | ||
1424 | |||
1425 | |||
1426 | /* | ||
1427 | * Flush all the virtual services | ||
1428 | */ | ||
1429 | static int ip_vs_flush(void) | ||
1430 | { | ||
1431 | int idx; | ||
1432 | struct ip_vs_service *svc, *nxt; | ||
1433 | |||
1434 | /* | ||
1435 | * Flush the service table hashed by <protocol,addr,port> | ||
1436 | */ | ||
1437 | for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
1438 | list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { | ||
1439 | write_lock_bh(&__ip_vs_svc_lock); | ||
1440 | ip_vs_svc_unhash(svc); | ||
1441 | /* | ||
1442 | * Wait until all the svc users go away. | ||
1443 | */ | ||
1444 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); | ||
1445 | __ip_vs_del_service(svc); | ||
1446 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1447 | } | ||
1448 | } | ||
1449 | |||
1450 | /* | ||
1451 | * Flush the service table hashed by fwmark | ||
1452 | */ | ||
1453 | for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
1454 | list_for_each_entry_safe(svc, nxt, | ||
1455 | &ip_vs_svc_fwm_table[idx], f_list) { | ||
1456 | write_lock_bh(&__ip_vs_svc_lock); | ||
1457 | ip_vs_svc_unhash(svc); | ||
1458 | /* | ||
1459 | * Wait until all the svc users go away. | ||
1460 | */ | ||
1461 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); | ||
1462 | __ip_vs_del_service(svc); | ||
1463 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1464 | } | ||
1465 | } | ||
1466 | |||
1467 | return 0; | ||
1468 | } | ||
1469 | |||
1470 | |||
1471 | /* | ||
1472 | * Zero counters in a service or all services | ||
1473 | */ | ||
1474 | static int ip_vs_zero_service(struct ip_vs_service *svc) | ||
1475 | { | ||
1476 | struct ip_vs_dest *dest; | ||
1477 | |||
1478 | write_lock_bh(&__ip_vs_svc_lock); | ||
1479 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
1480 | ip_vs_zero_stats(&dest->stats); | ||
1481 | } | ||
1482 | ip_vs_zero_stats(&svc->stats); | ||
1483 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1484 | return 0; | ||
1485 | } | ||
1486 | |||
1487 | static int ip_vs_zero_all(void) | ||
1488 | { | ||
1489 | int idx; | ||
1490 | struct ip_vs_service *svc; | ||
1491 | |||
1492 | for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
1493 | list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { | ||
1494 | ip_vs_zero_service(svc); | ||
1495 | } | ||
1496 | } | ||
1497 | |||
1498 | for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
1499 | list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { | ||
1500 | ip_vs_zero_service(svc); | ||
1501 | } | ||
1502 | } | ||
1503 | |||
1504 | ip_vs_zero_stats(&ip_vs_stats); | ||
1505 | return 0; | ||
1506 | } | ||
1507 | |||
1508 | |||
1509 | static int | ||
1510 | proc_do_defense_mode(ctl_table *table, int write, struct file * filp, | ||
1511 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1512 | { | ||
1513 | int *valp = table->data; | ||
1514 | int val = *valp; | ||
1515 | int rc; | ||
1516 | |||
1517 | rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); | ||
1518 | if (write && (*valp != val)) { | ||
1519 | if ((*valp < 0) || (*valp > 3)) { | ||
1520 | /* Restore the correct value */ | ||
1521 | *valp = val; | ||
1522 | } else { | ||
1523 | update_defense_level(); | ||
1524 | } | ||
1525 | } | ||
1526 | return rc; | ||
1527 | } | ||
1528 | |||
1529 | |||
1530 | static int | ||
1531 | proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, | ||
1532 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1533 | { | ||
1534 | int *valp = table->data; | ||
1535 | int val[2]; | ||
1536 | int rc; | ||
1537 | |||
1538 | /* backup the value first */ | ||
1539 | memcpy(val, valp, sizeof(val)); | ||
1540 | |||
1541 | rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); | ||
1542 | if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { | ||
1543 | /* Restore the correct value */ | ||
1544 | memcpy(valp, val, sizeof(val)); | ||
1545 | } | ||
1546 | return rc; | ||
1547 | } | ||
1548 | |||
1549 | |||
1550 | /* | ||
1551 | * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) | ||
1552 | */ | ||
1553 | |||
1554 | static struct ctl_table vs_vars[] = { | ||
1555 | { | ||
1556 | .procname = "amemthresh", | ||
1557 | .data = &sysctl_ip_vs_amemthresh, | ||
1558 | .maxlen = sizeof(int), | ||
1559 | .mode = 0644, | ||
1560 | .proc_handler = &proc_dointvec, | ||
1561 | }, | ||
1562 | #ifdef CONFIG_IP_VS_DEBUG | ||
1563 | { | ||
1564 | .procname = "debug_level", | ||
1565 | .data = &sysctl_ip_vs_debug_level, | ||
1566 | .maxlen = sizeof(int), | ||
1567 | .mode = 0644, | ||
1568 | .proc_handler = &proc_dointvec, | ||
1569 | }, | ||
1570 | #endif | ||
1571 | { | ||
1572 | .procname = "am_droprate", | ||
1573 | .data = &sysctl_ip_vs_am_droprate, | ||
1574 | .maxlen = sizeof(int), | ||
1575 | .mode = 0644, | ||
1576 | .proc_handler = &proc_dointvec, | ||
1577 | }, | ||
1578 | { | ||
1579 | .procname = "drop_entry", | ||
1580 | .data = &sysctl_ip_vs_drop_entry, | ||
1581 | .maxlen = sizeof(int), | ||
1582 | .mode = 0644, | ||
1583 | .proc_handler = &proc_do_defense_mode, | ||
1584 | }, | ||
1585 | { | ||
1586 | .procname = "drop_packet", | ||
1587 | .data = &sysctl_ip_vs_drop_packet, | ||
1588 | .maxlen = sizeof(int), | ||
1589 | .mode = 0644, | ||
1590 | .proc_handler = &proc_do_defense_mode, | ||
1591 | }, | ||
1592 | { | ||
1593 | .procname = "secure_tcp", | ||
1594 | .data = &sysctl_ip_vs_secure_tcp, | ||
1595 | .maxlen = sizeof(int), | ||
1596 | .mode = 0644, | ||
1597 | .proc_handler = &proc_do_defense_mode, | ||
1598 | }, | ||
1599 | #if 0 | ||
1600 | { | ||
1601 | .procname = "timeout_established", | ||
1602 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], | ||
1603 | .maxlen = sizeof(int), | ||
1604 | .mode = 0644, | ||
1605 | .proc_handler = &proc_dointvec_jiffies, | ||
1606 | }, | ||
1607 | { | ||
1608 | .procname = "timeout_synsent", | ||
1609 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], | ||
1610 | .maxlen = sizeof(int), | ||
1611 | .mode = 0644, | ||
1612 | .proc_handler = &proc_dointvec_jiffies, | ||
1613 | }, | ||
1614 | { | ||
1615 | .procname = "timeout_synrecv", | ||
1616 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], | ||
1617 | .maxlen = sizeof(int), | ||
1618 | .mode = 0644, | ||
1619 | .proc_handler = &proc_dointvec_jiffies, | ||
1620 | }, | ||
1621 | { | ||
1622 | .procname = "timeout_finwait", | ||
1623 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], | ||
1624 | .maxlen = sizeof(int), | ||
1625 | .mode = 0644, | ||
1626 | .proc_handler = &proc_dointvec_jiffies, | ||
1627 | }, | ||
1628 | { | ||
1629 | .procname = "timeout_timewait", | ||
1630 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], | ||
1631 | .maxlen = sizeof(int), | ||
1632 | .mode = 0644, | ||
1633 | .proc_handler = &proc_dointvec_jiffies, | ||
1634 | }, | ||
1635 | { | ||
1636 | .procname = "timeout_close", | ||
1637 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], | ||
1638 | .maxlen = sizeof(int), | ||
1639 | .mode = 0644, | ||
1640 | .proc_handler = &proc_dointvec_jiffies, | ||
1641 | }, | ||
1642 | { | ||
1643 | .procname = "timeout_closewait", | ||
1644 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], | ||
1645 | .maxlen = sizeof(int), | ||
1646 | .mode = 0644, | ||
1647 | .proc_handler = &proc_dointvec_jiffies, | ||
1648 | }, | ||
1649 | { | ||
1650 | .procname = "timeout_lastack", | ||
1651 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], | ||
1652 | .maxlen = sizeof(int), | ||
1653 | .mode = 0644, | ||
1654 | .proc_handler = &proc_dointvec_jiffies, | ||
1655 | }, | ||
1656 | { | ||
1657 | .procname = "timeout_listen", | ||
1658 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], | ||
1659 | .maxlen = sizeof(int), | ||
1660 | .mode = 0644, | ||
1661 | .proc_handler = &proc_dointvec_jiffies, | ||
1662 | }, | ||
1663 | { | ||
1664 | .procname = "timeout_synack", | ||
1665 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], | ||
1666 | .maxlen = sizeof(int), | ||
1667 | .mode = 0644, | ||
1668 | .proc_handler = &proc_dointvec_jiffies, | ||
1669 | }, | ||
1670 | { | ||
1671 | .procname = "timeout_udp", | ||
1672 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP], | ||
1673 | .maxlen = sizeof(int), | ||
1674 | .mode = 0644, | ||
1675 | .proc_handler = &proc_dointvec_jiffies, | ||
1676 | }, | ||
1677 | { | ||
1678 | .procname = "timeout_icmp", | ||
1679 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], | ||
1680 | .maxlen = sizeof(int), | ||
1681 | .mode = 0644, | ||
1682 | .proc_handler = &proc_dointvec_jiffies, | ||
1683 | }, | ||
1684 | #endif | ||
1685 | { | ||
1686 | .procname = "cache_bypass", | ||
1687 | .data = &sysctl_ip_vs_cache_bypass, | ||
1688 | .maxlen = sizeof(int), | ||
1689 | .mode = 0644, | ||
1690 | .proc_handler = &proc_dointvec, | ||
1691 | }, | ||
1692 | { | ||
1693 | .procname = "expire_nodest_conn", | ||
1694 | .data = &sysctl_ip_vs_expire_nodest_conn, | ||
1695 | .maxlen = sizeof(int), | ||
1696 | .mode = 0644, | ||
1697 | .proc_handler = &proc_dointvec, | ||
1698 | }, | ||
1699 | { | ||
1700 | .procname = "expire_quiescent_template", | ||
1701 | .data = &sysctl_ip_vs_expire_quiescent_template, | ||
1702 | .maxlen = sizeof(int), | ||
1703 | .mode = 0644, | ||
1704 | .proc_handler = &proc_dointvec, | ||
1705 | }, | ||
1706 | { | ||
1707 | .procname = "sync_threshold", | ||
1708 | .data = &sysctl_ip_vs_sync_threshold, | ||
1709 | .maxlen = sizeof(sysctl_ip_vs_sync_threshold), | ||
1710 | .mode = 0644, | ||
1711 | .proc_handler = &proc_do_sync_threshold, | ||
1712 | }, | ||
1713 | { | ||
1714 | .procname = "nat_icmp_send", | ||
1715 | .data = &sysctl_ip_vs_nat_icmp_send, | ||
1716 | .maxlen = sizeof(int), | ||
1717 | .mode = 0644, | ||
1718 | .proc_handler = &proc_dointvec, | ||
1719 | }, | ||
1720 | { .ctl_name = 0 } | ||
1721 | }; | ||
1722 | |||
1723 | const struct ctl_path net_vs_ctl_path[] = { | ||
1724 | { .procname = "net", .ctl_name = CTL_NET, }, | ||
1725 | { .procname = "ipv4", .ctl_name = NET_IPV4, }, | ||
1726 | { .procname = "vs", }, | ||
1727 | { } | ||
1728 | }; | ||
1729 | EXPORT_SYMBOL_GPL(net_vs_ctl_path); | ||
1730 | |||
1731 | static struct ctl_table_header * sysctl_header; | ||
1732 | |||
1733 | #ifdef CONFIG_PROC_FS | ||
1734 | |||
/* Per-open iterator state for the service listing seq_file */
struct ip_vs_iter {
	struct list_head *table;	/* ip_vs_svc_table or ip_vs_svc_fwm_table */
	int bucket;			/* index of the current bucket in table */
};
1739 | |||
1740 | /* | ||
1741 | * Write the contents of the VS rule table to a PROCfs file. | ||
1742 | * (It is kept just for backward compatibility) | ||
1743 | */ | ||
1744 | static inline const char *ip_vs_fwd_name(unsigned flags) | ||
1745 | { | ||
1746 | switch (flags & IP_VS_CONN_F_FWD_MASK) { | ||
1747 | case IP_VS_CONN_F_LOCALNODE: | ||
1748 | return "Local"; | ||
1749 | case IP_VS_CONN_F_TUNNEL: | ||
1750 | return "Tunnel"; | ||
1751 | case IP_VS_CONN_F_DROUTE: | ||
1752 | return "Route"; | ||
1753 | default: | ||
1754 | return "Masq"; | ||
1755 | } | ||
1756 | } | ||
1757 | |||
1758 | |||
1759 | /* Get the Nth entry in the two lists */ | ||
1760 | static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) | ||
1761 | { | ||
1762 | struct ip_vs_iter *iter = seq->private; | ||
1763 | int idx; | ||
1764 | struct ip_vs_service *svc; | ||
1765 | |||
1766 | /* look in hash by protocol */ | ||
1767 | for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
1768 | list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { | ||
1769 | if (pos-- == 0){ | ||
1770 | iter->table = ip_vs_svc_table; | ||
1771 | iter->bucket = idx; | ||
1772 | return svc; | ||
1773 | } | ||
1774 | } | ||
1775 | } | ||
1776 | |||
1777 | /* keep looking in fwmark */ | ||
1778 | for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
1779 | list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { | ||
1780 | if (pos-- == 0) { | ||
1781 | iter->table = ip_vs_svc_fwm_table; | ||
1782 | iter->bucket = idx; | ||
1783 | return svc; | ||
1784 | } | ||
1785 | } | ||
1786 | } | ||
1787 | |||
1788 | return NULL; | ||
1789 | } | ||
1790 | |||
1791 | static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) | ||
1792 | __acquires(__ip_vs_svc_lock) | ||
1793 | { | ||
1794 | |||
1795 | read_lock_bh(&__ip_vs_svc_lock); | ||
1796 | return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; | ||
1797 | } | ||
1798 | |||
1799 | |||
1800 | static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
1801 | { | ||
1802 | struct list_head *e; | ||
1803 | struct ip_vs_iter *iter; | ||
1804 | struct ip_vs_service *svc; | ||
1805 | |||
1806 | ++*pos; | ||
1807 | if (v == SEQ_START_TOKEN) | ||
1808 | return ip_vs_info_array(seq,0); | ||
1809 | |||
1810 | svc = v; | ||
1811 | iter = seq->private; | ||
1812 | |||
1813 | if (iter->table == ip_vs_svc_table) { | ||
1814 | /* next service in table hashed by protocol */ | ||
1815 | if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) | ||
1816 | return list_entry(e, struct ip_vs_service, s_list); | ||
1817 | |||
1818 | |||
1819 | while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { | ||
1820 | list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], | ||
1821 | s_list) { | ||
1822 | return svc; | ||
1823 | } | ||
1824 | } | ||
1825 | |||
1826 | iter->table = ip_vs_svc_fwm_table; | ||
1827 | iter->bucket = -1; | ||
1828 | goto scan_fwmark; | ||
1829 | } | ||
1830 | |||
1831 | /* next service in hashed by fwmark */ | ||
1832 | if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) | ||
1833 | return list_entry(e, struct ip_vs_service, f_list); | ||
1834 | |||
1835 | scan_fwmark: | ||
1836 | while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { | ||
1837 | list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], | ||
1838 | f_list) | ||
1839 | return svc; | ||
1840 | } | ||
1841 | |||
1842 | return NULL; | ||
1843 | } | ||
1844 | |||
/* seq_file stop: drop the read lock taken in ip_vs_info_seq_start() */
static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
__releases(__ip_vs_svc_lock)
{
	read_unlock_bh(&__ip_vs_svc_lock);
}
1850 | |||
1851 | |||
/*
 *	seq_file show: print the banner and column headers for
 *	SEQ_START_TOKEN, otherwise one line for the service v followed by
 *	one line per destination of that service.  Always returns 0.
 *
 *	NOTE(review): as written, the column headers and the row formats
 *	below are not padded to matching widths; confirm the literal spacing
 *	against the layout that readers of this file expect.
 */
static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq,
			"IP Virtual Server version %d.%d.%d (size=%d)\n",
			NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
		seq_puts(seq,
			 "Prot LocalAddress:Port Scheduler Flags\n");
		seq_puts(seq,
			 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
	} else {
		const struct ip_vs_service *svc = v;
		const struct ip_vs_iter *iter = seq->private;
		const struct ip_vs_dest *dest;

		/* Service line: protocol/address/port form or fwmark form */
		if (iter->table == ip_vs_svc_table) {
#ifdef CONFIG_IP_VS_IPV6
			if (svc->af == AF_INET6)
				seq_printf(seq, "%s [" NIP6_FMT "]:%04X %s ",
					   ip_vs_proto_name(svc->protocol),
					   NIP6(svc->addr.in6),
					   ntohs(svc->port),
					   svc->scheduler->name);
			else
#endif
				seq_printf(seq, "%s %08X:%04X %s ",
					   ip_vs_proto_name(svc->protocol),
					   ntohl(svc->addr.ip),
					   ntohs(svc->port),
					   svc->scheduler->name);
		} else {
			seq_printf(seq, "FWM %08X %s ",
				   svc->fwmark, svc->scheduler->name);
		}

		/* Persistent services also print timeout and netmask */
		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
			seq_printf(seq, "persistent %d %08X\n",
				svc->timeout,
				ntohl(svc->netmask));
		else
			seq_putc(seq, '\n');

		/* One line per destination of this service */
		list_for_each_entry(dest, &svc->destinations, n_list) {
#ifdef CONFIG_IP_VS_IPV6
			if (dest->af == AF_INET6)
				seq_printf(seq,
					   " -> [" NIP6_FMT "]:%04X"
					   " %-7s %-6d %-10d %-10d\n",
					   NIP6(dest->addr.in6),
					   ntohs(dest->port),
					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
					   atomic_read(&dest->weight),
					   atomic_read(&dest->activeconns),
					   atomic_read(&dest->inactconns));
			else
#endif
				seq_printf(seq,
					   " -> %08X:%04X "
					   "%-7s %-6d %-10d %-10d\n",
					   ntohl(dest->addr.ip),
					   ntohs(dest->port),
					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
					   atomic_read(&dest->weight),
					   atomic_read(&dest->activeconns),
					   atomic_read(&dest->inactconns));

		}
	}
	return 0;
}
1922 | |||
/* seq_file callbacks for the service listing */
static const struct seq_operations ip_vs_info_seq_ops = {
	.start = ip_vs_info_seq_start,
	.next  = ip_vs_info_seq_next,
	.stop  = ip_vs_info_seq_stop,
	.show  = ip_vs_info_seq_show,
};
1929 | |||
/*
 * Open handler for the service listing: opens the seq_file with a private
 * area sized for one struct ip_vs_iter.
 */
static int ip_vs_info_open(struct inode *inode, struct file *file)
{
	return seq_open_private(file, &ip_vs_info_seq_ops,
			sizeof(struct ip_vs_iter));
}
1935 | |||
/* File operations for the service listing; release frees the private iter */
static const struct file_operations ip_vs_info_fops = {
	.owner	 = THIS_MODULE,
	.open    = ip_vs_info_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};
1943 | |||
1944 | #endif | ||
1945 | |||
/* Global IPVS statistics; only the spinlock needs a static initializer */
struct ip_vs_stats ip_vs_stats = {
	.lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock),
};
1949 | |||
1950 | #ifdef CONFIG_PROC_FS | ||
1951 | static int ip_vs_stats_show(struct seq_file *seq, void *v) | ||
1952 | { | ||
1953 | |||
1954 | /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ | ||
1955 | seq_puts(seq, | ||
1956 | " Total Incoming Outgoing Incoming Outgoing\n"); | ||
1957 | seq_printf(seq, | ||
1958 | " Conns Packets Packets Bytes Bytes\n"); | ||
1959 | |||
1960 | spin_lock_bh(&ip_vs_stats.lock); | ||
1961 | seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns, | ||
1962 | ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts, | ||
1963 | (unsigned long long) ip_vs_stats.ustats.inbytes, | ||
1964 | (unsigned long long) ip_vs_stats.ustats.outbytes); | ||
1965 | |||
1966 | /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ | ||
1967 | seq_puts(seq, | ||
1968 | " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); | ||
1969 | seq_printf(seq,"%8X %8X %8X %16X %16X\n", | ||
1970 | ip_vs_stats.ustats.cps, | ||
1971 | ip_vs_stats.ustats.inpps, | ||
1972 | ip_vs_stats.ustats.outpps, | ||
1973 | ip_vs_stats.ustats.inbps, | ||
1974 | ip_vs_stats.ustats.outbps); | ||
1975 | spin_unlock_bh(&ip_vs_stats.lock); | ||
1976 | |||
1977 | return 0; | ||
1978 | } | ||
1979 | |||
1980 | static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) | ||
1981 | { | ||
1982 | return single_open(file, ip_vs_stats_show, NULL); | ||
1983 | } | ||
1984 | |||
1985 | static const struct file_operations ip_vs_stats_fops = { | ||
1986 | .owner = THIS_MODULE, | ||
1987 | .open = ip_vs_stats_seq_open, | ||
1988 | .read = seq_read, | ||
1989 | .llseek = seq_lseek, | ||
1990 | .release = single_release, | ||
1991 | }; | ||
1992 | |||
1993 | #endif | ||
1994 | |||
1995 | /* | ||
1996 | * Set timeout values for tcp tcpfin udp in the timeout_table. | ||
1997 | */ | ||
1998 | static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) | ||
1999 | { | ||
2000 | IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", | ||
2001 | u->tcp_timeout, | ||
2002 | u->tcp_fin_timeout, | ||
2003 | u->udp_timeout); | ||
2004 | |||
2005 | #ifdef CONFIG_IP_VS_PROTO_TCP | ||
2006 | if (u->tcp_timeout) { | ||
2007 | ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] | ||
2008 | = u->tcp_timeout * HZ; | ||
2009 | } | ||
2010 | |||
2011 | if (u->tcp_fin_timeout) { | ||
2012 | ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] | ||
2013 | = u->tcp_fin_timeout * HZ; | ||
2014 | } | ||
2015 | #endif | ||
2016 | |||
2017 | #ifdef CONFIG_IP_VS_PROTO_UDP | ||
2018 | if (u->udp_timeout) { | ||
2019 | ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] | ||
2020 | = u->udp_timeout * HZ; | ||
2021 | } | ||
2022 | #endif | ||
2023 | return 0; | ||
2024 | } | ||
2025 | |||
2026 | |||
2027 | #define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) | ||
2028 | #define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user)) | ||
2029 | #define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \ | ||
2030 | sizeof(struct ip_vs_dest_user)) | ||
2031 | #define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) | ||
2032 | #define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) | ||
2033 | #define MAX_ARG_LEN SVCDEST_ARG_LEN | ||
2034 | |||
2035 | static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { | ||
2036 | [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, | ||
2037 | [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, | ||
2038 | [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, | ||
2039 | [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0, | ||
2040 | [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN, | ||
2041 | [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN, | ||
2042 | [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN, | ||
2043 | [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN, | ||
2044 | [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN, | ||
2045 | [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN, | ||
2046 | [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, | ||
2047 | }; | ||
2048 | |||
2049 | static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, | ||
2050 | struct ip_vs_service_user *usvc_compat) | ||
2051 | { | ||
2052 | usvc->af = AF_INET; | ||
2053 | usvc->protocol = usvc_compat->protocol; | ||
2054 | usvc->addr.ip = usvc_compat->addr; | ||
2055 | usvc->port = usvc_compat->port; | ||
2056 | usvc->fwmark = usvc_compat->fwmark; | ||
2057 | |||
2058 | /* Deep copy of sched_name is not needed here */ | ||
2059 | usvc->sched_name = usvc_compat->sched_name; | ||
2060 | |||
2061 | usvc->flags = usvc_compat->flags; | ||
2062 | usvc->timeout = usvc_compat->timeout; | ||
2063 | usvc->netmask = usvc_compat->netmask; | ||
2064 | } | ||
2065 | |||
2066 | static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, | ||
2067 | struct ip_vs_dest_user *udest_compat) | ||
2068 | { | ||
2069 | udest->addr.ip = udest_compat->addr; | ||
2070 | udest->port = udest_compat->port; | ||
2071 | udest->conn_flags = udest_compat->conn_flags; | ||
2072 | udest->weight = udest_compat->weight; | ||
2073 | udest->u_threshold = udest_compat->u_threshold; | ||
2074 | udest->l_threshold = udest_compat->l_threshold; | ||
2075 | } | ||
2076 | |||
2077 | static int | ||
2078 | do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) | ||
2079 | { | ||
2080 | int ret; | ||
2081 | unsigned char arg[MAX_ARG_LEN]; | ||
2082 | struct ip_vs_service_user *usvc_compat; | ||
2083 | struct ip_vs_service_user_kern usvc; | ||
2084 | struct ip_vs_service *svc; | ||
2085 | struct ip_vs_dest_user *udest_compat; | ||
2086 | struct ip_vs_dest_user_kern udest; | ||
2087 | |||
2088 | if (!capable(CAP_NET_ADMIN)) | ||
2089 | return -EPERM; | ||
2090 | |||
2091 | if (len != set_arglen[SET_CMDID(cmd)]) { | ||
2092 | IP_VS_ERR("set_ctl: len %u != %u\n", | ||
2093 | len, set_arglen[SET_CMDID(cmd)]); | ||
2094 | return -EINVAL; | ||
2095 | } | ||
2096 | |||
2097 | if (copy_from_user(arg, user, len) != 0) | ||
2098 | return -EFAULT; | ||
2099 | |||
2100 | /* increase the module use count */ | ||
2101 | ip_vs_use_count_inc(); | ||
2102 | |||
2103 | if (mutex_lock_interruptible(&__ip_vs_mutex)) { | ||
2104 | ret = -ERESTARTSYS; | ||
2105 | goto out_dec; | ||
2106 | } | ||
2107 | |||
2108 | if (cmd == IP_VS_SO_SET_FLUSH) { | ||
2109 | /* Flush the virtual service */ | ||
2110 | ret = ip_vs_flush(); | ||
2111 | goto out_unlock; | ||
2112 | } else if (cmd == IP_VS_SO_SET_TIMEOUT) { | ||
2113 | /* Set timeout values for (tcp tcpfin udp) */ | ||
2114 | ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg); | ||
2115 | goto out_unlock; | ||
2116 | } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { | ||
2117 | struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; | ||
2118 | ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid); | ||
2119 | goto out_unlock; | ||
2120 | } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { | ||
2121 | struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; | ||
2122 | ret = stop_sync_thread(dm->state); | ||
2123 | goto out_unlock; | ||
2124 | } | ||
2125 | |||
2126 | usvc_compat = (struct ip_vs_service_user *)arg; | ||
2127 | udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); | ||
2128 | |||
2129 | /* We only use the new structs internally, so copy userspace compat | ||
2130 | * structs to extended internal versions */ | ||
2131 | ip_vs_copy_usvc_compat(&usvc, usvc_compat); | ||
2132 | ip_vs_copy_udest_compat(&udest, udest_compat); | ||
2133 | |||
2134 | if (cmd == IP_VS_SO_SET_ZERO) { | ||
2135 | /* if no service address is set, zero counters in all */ | ||
2136 | if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { | ||
2137 | ret = ip_vs_zero_all(); | ||
2138 | goto out_unlock; | ||
2139 | } | ||
2140 | } | ||
2141 | |||
2142 | /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */ | ||
2143 | if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP) { | ||
2144 | IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n", | ||
2145 | usvc.protocol, NIPQUAD(usvc.addr.ip), | ||
2146 | ntohs(usvc.port), usvc.sched_name); | ||
2147 | ret = -EFAULT; | ||
2148 | goto out_unlock; | ||
2149 | } | ||
2150 | |||
2151 | /* Lookup the exact service by <protocol, addr, port> or fwmark */ | ||
2152 | if (usvc.fwmark == 0) | ||
2153 | svc = __ip_vs_service_get(usvc.af, usvc.protocol, | ||
2154 | &usvc.addr, usvc.port); | ||
2155 | else | ||
2156 | svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); | ||
2157 | |||
2158 | if (cmd != IP_VS_SO_SET_ADD | ||
2159 | && (svc == NULL || svc->protocol != usvc.protocol)) { | ||
2160 | ret = -ESRCH; | ||
2161 | goto out_unlock; | ||
2162 | } | ||
2163 | |||
2164 | switch (cmd) { | ||
2165 | case IP_VS_SO_SET_ADD: | ||
2166 | if (svc != NULL) | ||
2167 | ret = -EEXIST; | ||
2168 | else | ||
2169 | ret = ip_vs_add_service(&usvc, &svc); | ||
2170 | break; | ||
2171 | case IP_VS_SO_SET_EDIT: | ||
2172 | ret = ip_vs_edit_service(svc, &usvc); | ||
2173 | break; | ||
2174 | case IP_VS_SO_SET_DEL: | ||
2175 | ret = ip_vs_del_service(svc); | ||
2176 | if (!ret) | ||
2177 | goto out_unlock; | ||
2178 | break; | ||
2179 | case IP_VS_SO_SET_ZERO: | ||
2180 | ret = ip_vs_zero_service(svc); | ||
2181 | break; | ||
2182 | case IP_VS_SO_SET_ADDDEST: | ||
2183 | ret = ip_vs_add_dest(svc, &udest); | ||
2184 | break; | ||
2185 | case IP_VS_SO_SET_EDITDEST: | ||
2186 | ret = ip_vs_edit_dest(svc, &udest); | ||
2187 | break; | ||
2188 | case IP_VS_SO_SET_DELDEST: | ||
2189 | ret = ip_vs_del_dest(svc, &udest); | ||
2190 | break; | ||
2191 | default: | ||
2192 | ret = -EINVAL; | ||
2193 | } | ||
2194 | |||
2195 | if (svc) | ||
2196 | ip_vs_service_put(svc); | ||
2197 | |||
2198 | out_unlock: | ||
2199 | mutex_unlock(&__ip_vs_mutex); | ||
2200 | out_dec: | ||
2201 | /* decrease the module use count */ | ||
2202 | ip_vs_use_count_dec(); | ||
2203 | |||
2204 | return ret; | ||
2205 | } | ||
2206 | |||
2207 | |||
2208 | static void | ||
2209 | ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) | ||
2210 | { | ||
2211 | spin_lock_bh(&src->lock); | ||
2212 | memcpy(dst, &src->ustats, sizeof(*dst)); | ||
2213 | spin_unlock_bh(&src->lock); | ||
2214 | } | ||
2215 | |||
2216 | static void | ||
2217 | ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) | ||
2218 | { | ||
2219 | dst->protocol = src->protocol; | ||
2220 | dst->addr = src->addr.ip; | ||
2221 | dst->port = src->port; | ||
2222 | dst->fwmark = src->fwmark; | ||
2223 | strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); | ||
2224 | dst->flags = src->flags; | ||
2225 | dst->timeout = src->timeout / HZ; | ||
2226 | dst->netmask = src->netmask; | ||
2227 | dst->num_dests = src->num_dests; | ||
2228 | ip_vs_copy_stats(&dst->stats, &src->stats); | ||
2229 | } | ||
2230 | |||
2231 | static inline int | ||
2232 | __ip_vs_get_service_entries(const struct ip_vs_get_services *get, | ||
2233 | struct ip_vs_get_services __user *uptr) | ||
2234 | { | ||
2235 | int idx, count=0; | ||
2236 | struct ip_vs_service *svc; | ||
2237 | struct ip_vs_service_entry entry; | ||
2238 | int ret = 0; | ||
2239 | |||
2240 | for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
2241 | list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { | ||
2242 | /* Only expose IPv4 entries to old interface */ | ||
2243 | if (svc->af != AF_INET) | ||
2244 | continue; | ||
2245 | |||
2246 | if (count >= get->num_services) | ||
2247 | goto out; | ||
2248 | memset(&entry, 0, sizeof(entry)); | ||
2249 | ip_vs_copy_service(&entry, svc); | ||
2250 | if (copy_to_user(&uptr->entrytable[count], | ||
2251 | &entry, sizeof(entry))) { | ||
2252 | ret = -EFAULT; | ||
2253 | goto out; | ||
2254 | } | ||
2255 | count++; | ||
2256 | } | ||
2257 | } | ||
2258 | |||
2259 | for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
2260 | list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { | ||
2261 | /* Only expose IPv4 entries to old interface */ | ||
2262 | if (svc->af != AF_INET) | ||
2263 | continue; | ||
2264 | |||
2265 | if (count >= get->num_services) | ||
2266 | goto out; | ||
2267 | memset(&entry, 0, sizeof(entry)); | ||
2268 | ip_vs_copy_service(&entry, svc); | ||
2269 | if (copy_to_user(&uptr->entrytable[count], | ||
2270 | &entry, sizeof(entry))) { | ||
2271 | ret = -EFAULT; | ||
2272 | goto out; | ||
2273 | } | ||
2274 | count++; | ||
2275 | } | ||
2276 | } | ||
2277 | out: | ||
2278 | return ret; | ||
2279 | } | ||
2280 | |||
2281 | static inline int | ||
2282 | __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, | ||
2283 | struct ip_vs_get_dests __user *uptr) | ||
2284 | { | ||
2285 | struct ip_vs_service *svc; | ||
2286 | union nf_inet_addr addr = { .ip = get->addr }; | ||
2287 | int ret = 0; | ||
2288 | |||
2289 | if (get->fwmark) | ||
2290 | svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark); | ||
2291 | else | ||
2292 | svc = __ip_vs_service_get(AF_INET, get->protocol, &addr, | ||
2293 | get->port); | ||
2294 | |||
2295 | if (svc) { | ||
2296 | int count = 0; | ||
2297 | struct ip_vs_dest *dest; | ||
2298 | struct ip_vs_dest_entry entry; | ||
2299 | |||
2300 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
2301 | if (count >= get->num_dests) | ||
2302 | break; | ||
2303 | |||
2304 | entry.addr = dest->addr.ip; | ||
2305 | entry.port = dest->port; | ||
2306 | entry.conn_flags = atomic_read(&dest->conn_flags); | ||
2307 | entry.weight = atomic_read(&dest->weight); | ||
2308 | entry.u_threshold = dest->u_threshold; | ||
2309 | entry.l_threshold = dest->l_threshold; | ||
2310 | entry.activeconns = atomic_read(&dest->activeconns); | ||
2311 | entry.inactconns = atomic_read(&dest->inactconns); | ||
2312 | entry.persistconns = atomic_read(&dest->persistconns); | ||
2313 | ip_vs_copy_stats(&entry.stats, &dest->stats); | ||
2314 | if (copy_to_user(&uptr->entrytable[count], | ||
2315 | &entry, sizeof(entry))) { | ||
2316 | ret = -EFAULT; | ||
2317 | break; | ||
2318 | } | ||
2319 | count++; | ||
2320 | } | ||
2321 | ip_vs_service_put(svc); | ||
2322 | } else | ||
2323 | ret = -ESRCH; | ||
2324 | return ret; | ||
2325 | } | ||
2326 | |||
2327 | static inline void | ||
2328 | __ip_vs_get_timeouts(struct ip_vs_timeout_user *u) | ||
2329 | { | ||
2330 | #ifdef CONFIG_IP_VS_PROTO_TCP | ||
2331 | u->tcp_timeout = | ||
2332 | ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; | ||
2333 | u->tcp_fin_timeout = | ||
2334 | ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; | ||
2335 | #endif | ||
2336 | #ifdef CONFIG_IP_VS_PROTO_UDP | ||
2337 | u->udp_timeout = | ||
2338 | ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; | ||
2339 | #endif | ||
2340 | } | ||
2341 | |||
2342 | |||
2343 | #define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) | ||
2344 | #define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo)) | ||
2345 | #define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services)) | ||
2346 | #define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry)) | ||
2347 | #define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests)) | ||
2348 | #define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) | ||
2349 | #define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) | ||
2350 | |||
2351 | static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { | ||
2352 | [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, | ||
2353 | [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, | ||
2354 | [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, | ||
2355 | [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN, | ||
2356 | [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN, | ||
2357 | [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN, | ||
2358 | [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN, | ||
2359 | }; | ||
2360 | |||
2361 | static int | ||
2362 | do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) | ||
2363 | { | ||
2364 | unsigned char arg[128]; | ||
2365 | int ret = 0; | ||
2366 | |||
2367 | if (!capable(CAP_NET_ADMIN)) | ||
2368 | return -EPERM; | ||
2369 | |||
2370 | if (*len < get_arglen[GET_CMDID(cmd)]) { | ||
2371 | IP_VS_ERR("get_ctl: len %u < %u\n", | ||
2372 | *len, get_arglen[GET_CMDID(cmd)]); | ||
2373 | return -EINVAL; | ||
2374 | } | ||
2375 | |||
2376 | if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0) | ||
2377 | return -EFAULT; | ||
2378 | |||
2379 | if (mutex_lock_interruptible(&__ip_vs_mutex)) | ||
2380 | return -ERESTARTSYS; | ||
2381 | |||
2382 | switch (cmd) { | ||
2383 | case IP_VS_SO_GET_VERSION: | ||
2384 | { | ||
2385 | char buf[64]; | ||
2386 | |||
2387 | sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", | ||
2388 | NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); | ||
2389 | if (copy_to_user(user, buf, strlen(buf)+1) != 0) { | ||
2390 | ret = -EFAULT; | ||
2391 | goto out; | ||
2392 | } | ||
2393 | *len = strlen(buf)+1; | ||
2394 | } | ||
2395 | break; | ||
2396 | |||
2397 | case IP_VS_SO_GET_INFO: | ||
2398 | { | ||
2399 | struct ip_vs_getinfo info; | ||
2400 | info.version = IP_VS_VERSION_CODE; | ||
2401 | info.size = IP_VS_CONN_TAB_SIZE; | ||
2402 | info.num_services = ip_vs_num_services; | ||
2403 | if (copy_to_user(user, &info, sizeof(info)) != 0) | ||
2404 | ret = -EFAULT; | ||
2405 | } | ||
2406 | break; | ||
2407 | |||
2408 | case IP_VS_SO_GET_SERVICES: | ||
2409 | { | ||
2410 | struct ip_vs_get_services *get; | ||
2411 | int size; | ||
2412 | |||
2413 | get = (struct ip_vs_get_services *)arg; | ||
2414 | size = sizeof(*get) + | ||
2415 | sizeof(struct ip_vs_service_entry) * get->num_services; | ||
2416 | if (*len != size) { | ||
2417 | IP_VS_ERR("length: %u != %u\n", *len, size); | ||
2418 | ret = -EINVAL; | ||
2419 | goto out; | ||
2420 | } | ||
2421 | ret = __ip_vs_get_service_entries(get, user); | ||
2422 | } | ||
2423 | break; | ||
2424 | |||
2425 | case IP_VS_SO_GET_SERVICE: | ||
2426 | { | ||
2427 | struct ip_vs_service_entry *entry; | ||
2428 | struct ip_vs_service *svc; | ||
2429 | union nf_inet_addr addr; | ||
2430 | |||
2431 | entry = (struct ip_vs_service_entry *)arg; | ||
2432 | addr.ip = entry->addr; | ||
2433 | if (entry->fwmark) | ||
2434 | svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark); | ||
2435 | else | ||
2436 | svc = __ip_vs_service_get(AF_INET, entry->protocol, | ||
2437 | &addr, entry->port); | ||
2438 | if (svc) { | ||
2439 | ip_vs_copy_service(entry, svc); | ||
2440 | if (copy_to_user(user, entry, sizeof(*entry)) != 0) | ||
2441 | ret = -EFAULT; | ||
2442 | ip_vs_service_put(svc); | ||
2443 | } else | ||
2444 | ret = -ESRCH; | ||
2445 | } | ||
2446 | break; | ||
2447 | |||
2448 | case IP_VS_SO_GET_DESTS: | ||
2449 | { | ||
2450 | struct ip_vs_get_dests *get; | ||
2451 | int size; | ||
2452 | |||
2453 | get = (struct ip_vs_get_dests *)arg; | ||
2454 | size = sizeof(*get) + | ||
2455 | sizeof(struct ip_vs_dest_entry) * get->num_dests; | ||
2456 | if (*len != size) { | ||
2457 | IP_VS_ERR("length: %u != %u\n", *len, size); | ||
2458 | ret = -EINVAL; | ||
2459 | goto out; | ||
2460 | } | ||
2461 | ret = __ip_vs_get_dest_entries(get, user); | ||
2462 | } | ||
2463 | break; | ||
2464 | |||
2465 | case IP_VS_SO_GET_TIMEOUT: | ||
2466 | { | ||
2467 | struct ip_vs_timeout_user t; | ||
2468 | |||
2469 | __ip_vs_get_timeouts(&t); | ||
2470 | if (copy_to_user(user, &t, sizeof(t)) != 0) | ||
2471 | ret = -EFAULT; | ||
2472 | } | ||
2473 | break; | ||
2474 | |||
2475 | case IP_VS_SO_GET_DAEMON: | ||
2476 | { | ||
2477 | struct ip_vs_daemon_user d[2]; | ||
2478 | |||
2479 | memset(&d, 0, sizeof(d)); | ||
2480 | if (ip_vs_sync_state & IP_VS_STATE_MASTER) { | ||
2481 | d[0].state = IP_VS_STATE_MASTER; | ||
2482 | strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn)); | ||
2483 | d[0].syncid = ip_vs_master_syncid; | ||
2484 | } | ||
2485 | if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { | ||
2486 | d[1].state = IP_VS_STATE_BACKUP; | ||
2487 | strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn)); | ||
2488 | d[1].syncid = ip_vs_backup_syncid; | ||
2489 | } | ||
2490 | if (copy_to_user(user, &d, sizeof(d)) != 0) | ||
2491 | ret = -EFAULT; | ||
2492 | } | ||
2493 | break; | ||
2494 | |||
2495 | default: | ||
2496 | ret = -EINVAL; | ||
2497 | } | ||
2498 | |||
2499 | out: | ||
2500 | mutex_unlock(&__ip_vs_mutex); | ||
2501 | return ret; | ||
2502 | } | ||
2503 | |||
2504 | |||
2505 | static struct nf_sockopt_ops ip_vs_sockopts = { | ||
2506 | .pf = PF_INET, | ||
2507 | .set_optmin = IP_VS_BASE_CTL, | ||
2508 | .set_optmax = IP_VS_SO_SET_MAX+1, | ||
2509 | .set = do_ip_vs_set_ctl, | ||
2510 | .get_optmin = IP_VS_BASE_CTL, | ||
2511 | .get_optmax = IP_VS_SO_GET_MAX+1, | ||
2512 | .get = do_ip_vs_get_ctl, | ||
2513 | .owner = THIS_MODULE, | ||
2514 | }; | ||
2515 | |||
2516 | /* | ||
2517 | * Generic Netlink interface | ||
2518 | */ | ||
2519 | |||
2520 | /* IPVS genetlink family */ | ||
2521 | static struct genl_family ip_vs_genl_family = { | ||
2522 | .id = GENL_ID_GENERATE, | ||
2523 | .hdrsize = 0, | ||
2524 | .name = IPVS_GENL_NAME, | ||
2525 | .version = IPVS_GENL_VERSION, | ||
2526 | .maxattr = IPVS_CMD_MAX, | ||
2527 | }; | ||
2528 | |||
2529 | /* Policy used for first-level command attributes */ | ||
2530 | static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { | ||
2531 | [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, | ||
2532 | [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, | ||
2533 | [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, | ||
2534 | [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, | ||
2535 | [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, | ||
2536 | [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, | ||
2537 | }; | ||
2538 | |||
2539 | /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ | ||
2540 | static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { | ||
2541 | [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, | ||
2542 | [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, | ||
2543 | .len = IP_VS_IFNAME_MAXLEN }, | ||
2544 | [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, | ||
2545 | }; | ||
2546 | |||
2547 | /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ | ||
2548 | static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { | ||
2549 | [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, | ||
2550 | [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, | ||
2551 | [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, | ||
2552 | .len = sizeof(union nf_inet_addr) }, | ||
2553 | [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, | ||
2554 | [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, | ||
2555 | [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, | ||
2556 | .len = IP_VS_SCHEDNAME_MAXLEN }, | ||
2557 | [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, | ||
2558 | .len = sizeof(struct ip_vs_flags) }, | ||
2559 | [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, | ||
2560 | [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, | ||
2561 | [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, | ||
2562 | }; | ||
2563 | |||
2564 | /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ | ||
2565 | static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { | ||
2566 | [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, | ||
2567 | .len = sizeof(union nf_inet_addr) }, | ||
2568 | [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, | ||
2569 | [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, | ||
2570 | [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, | ||
2571 | [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, | ||
2572 | [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, | ||
2573 | [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, | ||
2574 | [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, | ||
2575 | [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, | ||
2576 | [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, | ||
2577 | }; | ||
2578 | |||
2579 | static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, | ||
2580 | struct ip_vs_stats *stats) | ||
2581 | { | ||
2582 | struct nlattr *nl_stats = nla_nest_start(skb, container_type); | ||
2583 | if (!nl_stats) | ||
2584 | return -EMSGSIZE; | ||
2585 | |||
2586 | spin_lock_bh(&stats->lock); | ||
2587 | |||
2588 | NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns); | ||
2589 | NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts); | ||
2590 | NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts); | ||
2591 | NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes); | ||
2592 | NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes); | ||
2593 | NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps); | ||
2594 | NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps); | ||
2595 | NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps); | ||
2596 | NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps); | ||
2597 | NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps); | ||
2598 | |||
2599 | spin_unlock_bh(&stats->lock); | ||
2600 | |||
2601 | nla_nest_end(skb, nl_stats); | ||
2602 | |||
2603 | return 0; | ||
2604 | |||
2605 | nla_put_failure: | ||
2606 | spin_unlock_bh(&stats->lock); | ||
2607 | nla_nest_cancel(skb, nl_stats); | ||
2608 | return -EMSGSIZE; | ||
2609 | } | ||
2610 | |||
2611 | static int ip_vs_genl_fill_service(struct sk_buff *skb, | ||
2612 | struct ip_vs_service *svc) | ||
2613 | { | ||
2614 | struct nlattr *nl_service; | ||
2615 | struct ip_vs_flags flags = { .flags = svc->flags, | ||
2616 | .mask = ~0 }; | ||
2617 | |||
2618 | nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); | ||
2619 | if (!nl_service) | ||
2620 | return -EMSGSIZE; | ||
2621 | |||
2622 | NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af); | ||
2623 | |||
2624 | if (svc->fwmark) { | ||
2625 | NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark); | ||
2626 | } else { | ||
2627 | NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol); | ||
2628 | NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr); | ||
2629 | NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port); | ||
2630 | } | ||
2631 | |||
2632 | NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); | ||
2633 | NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); | ||
2634 | NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); | ||
2635 | NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); | ||
2636 | |||
2637 | if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) | ||
2638 | goto nla_put_failure; | ||
2639 | |||
2640 | nla_nest_end(skb, nl_service); | ||
2641 | |||
2642 | return 0; | ||
2643 | |||
2644 | nla_put_failure: | ||
2645 | nla_nest_cancel(skb, nl_service); | ||
2646 | return -EMSGSIZE; | ||
2647 | } | ||
2648 | |||
2649 | static int ip_vs_genl_dump_service(struct sk_buff *skb, | ||
2650 | struct ip_vs_service *svc, | ||
2651 | struct netlink_callback *cb) | ||
2652 | { | ||
2653 | void *hdr; | ||
2654 | |||
2655 | hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, | ||
2656 | &ip_vs_genl_family, NLM_F_MULTI, | ||
2657 | IPVS_CMD_NEW_SERVICE); | ||
2658 | if (!hdr) | ||
2659 | return -EMSGSIZE; | ||
2660 | |||
2661 | if (ip_vs_genl_fill_service(skb, svc) < 0) | ||
2662 | goto nla_put_failure; | ||
2663 | |||
2664 | return genlmsg_end(skb, hdr); | ||
2665 | |||
2666 | nla_put_failure: | ||
2667 | genlmsg_cancel(skb, hdr); | ||
2668 | return -EMSGSIZE; | ||
2669 | } | ||
2670 | |||
2671 | static int ip_vs_genl_dump_services(struct sk_buff *skb, | ||
2672 | struct netlink_callback *cb) | ||
2673 | { | ||
2674 | int idx = 0, i; | ||
2675 | int start = cb->args[0]; | ||
2676 | struct ip_vs_service *svc; | ||
2677 | |||
2678 | mutex_lock(&__ip_vs_mutex); | ||
2679 | for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { | ||
2680 | list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { | ||
2681 | if (++idx <= start) | ||
2682 | continue; | ||
2683 | if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { | ||
2684 | idx--; | ||
2685 | goto nla_put_failure; | ||
2686 | } | ||
2687 | } | ||
2688 | } | ||
2689 | |||
2690 | for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { | ||
2691 | list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { | ||
2692 | if (++idx <= start) | ||
2693 | continue; | ||
2694 | if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { | ||
2695 | idx--; | ||
2696 | goto nla_put_failure; | ||
2697 | } | ||
2698 | } | ||
2699 | } | ||
2700 | |||
2701 | nla_put_failure: | ||
2702 | mutex_unlock(&__ip_vs_mutex); | ||
2703 | cb->args[0] = idx; | ||
2704 | |||
2705 | return skb->len; | ||
2706 | } | ||
2707 | |||
2708 | static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, | ||
2709 | struct nlattr *nla, int full_entry) | ||
2710 | { | ||
2711 | struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; | ||
2712 | struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; | ||
2713 | |||
2714 | /* Parse mandatory identifying service fields first */ | ||
2715 | if (nla == NULL || | ||
2716 | nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy)) | ||
2717 | return -EINVAL; | ||
2718 | |||
2719 | nla_af = attrs[IPVS_SVC_ATTR_AF]; | ||
2720 | nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; | ||
2721 | nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; | ||
2722 | nla_port = attrs[IPVS_SVC_ATTR_PORT]; | ||
2723 | nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; | ||
2724 | |||
2725 | if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) | ||
2726 | return -EINVAL; | ||
2727 | |||
2728 | usvc->af = nla_get_u16(nla_af); | ||
2729 | #ifdef CONFIG_IP_VS_IPV6 | ||
2730 | if (usvc->af != AF_INET && usvc->af != AF_INET6) | ||
2731 | #else | ||
2732 | if (usvc->af != AF_INET) | ||
2733 | #endif | ||
2734 | return -EAFNOSUPPORT; | ||
2735 | |||
2736 | if (nla_fwmark) { | ||
2737 | usvc->protocol = IPPROTO_TCP; | ||
2738 | usvc->fwmark = nla_get_u32(nla_fwmark); | ||
2739 | } else { | ||
2740 | usvc->protocol = nla_get_u16(nla_protocol); | ||
2741 | nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); | ||
2742 | usvc->port = nla_get_u16(nla_port); | ||
2743 | usvc->fwmark = 0; | ||
2744 | } | ||
2745 | |||
2746 | /* If a full entry was requested, check for the additional fields */ | ||
2747 | if (full_entry) { | ||
2748 | struct nlattr *nla_sched, *nla_flags, *nla_timeout, | ||
2749 | *nla_netmask; | ||
2750 | struct ip_vs_flags flags; | ||
2751 | struct ip_vs_service *svc; | ||
2752 | |||
2753 | nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; | ||
2754 | nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; | ||
2755 | nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; | ||
2756 | nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; | ||
2757 | |||
2758 | if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) | ||
2759 | return -EINVAL; | ||
2760 | |||
2761 | nla_memcpy(&flags, nla_flags, sizeof(flags)); | ||
2762 | |||
2763 | /* prefill flags from service if it already exists */ | ||
2764 | if (usvc->fwmark) | ||
2765 | svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark); | ||
2766 | else | ||
2767 | svc = __ip_vs_service_get(usvc->af, usvc->protocol, | ||
2768 | &usvc->addr, usvc->port); | ||
2769 | if (svc) { | ||
2770 | usvc->flags = svc->flags; | ||
2771 | ip_vs_service_put(svc); | ||
2772 | } else | ||
2773 | usvc->flags = 0; | ||
2774 | |||
2775 | /* set new flags from userland */ | ||
2776 | usvc->flags = (usvc->flags & ~flags.mask) | | ||
2777 | (flags.flags & flags.mask); | ||
2778 | usvc->sched_name = nla_data(nla_sched); | ||
2779 | usvc->timeout = nla_get_u32(nla_timeout); | ||
2780 | usvc->netmask = nla_get_u32(nla_netmask); | ||
2781 | } | ||
2782 | |||
2783 | return 0; | ||
2784 | } | ||
2785 | |||
2786 | static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) | ||
2787 | { | ||
2788 | struct ip_vs_service_user_kern usvc; | ||
2789 | int ret; | ||
2790 | |||
2791 | ret = ip_vs_genl_parse_service(&usvc, nla, 0); | ||
2792 | if (ret) | ||
2793 | return ERR_PTR(ret); | ||
2794 | |||
2795 | if (usvc.fwmark) | ||
2796 | return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); | ||
2797 | else | ||
2798 | return __ip_vs_service_get(usvc.af, usvc.protocol, | ||
2799 | &usvc.addr, usvc.port); | ||
2800 | } | ||
2801 | |||
2802 | static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) | ||
2803 | { | ||
2804 | struct nlattr *nl_dest; | ||
2805 | |||
2806 | nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST); | ||
2807 | if (!nl_dest) | ||
2808 | return -EMSGSIZE; | ||
2809 | |||
2810 | NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr); | ||
2811 | NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port); | ||
2812 | |||
2813 | NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD, | ||
2814 | atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK); | ||
2815 | NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)); | ||
2816 | NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold); | ||
2817 | NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold); | ||
2818 | NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, | ||
2819 | atomic_read(&dest->activeconns)); | ||
2820 | NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS, | ||
2821 | atomic_read(&dest->inactconns)); | ||
2822 | NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, | ||
2823 | atomic_read(&dest->persistconns)); | ||
2824 | |||
2825 | if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) | ||
2826 | goto nla_put_failure; | ||
2827 | |||
2828 | nla_nest_end(skb, nl_dest); | ||
2829 | |||
2830 | return 0; | ||
2831 | |||
2832 | nla_put_failure: | ||
2833 | nla_nest_cancel(skb, nl_dest); | ||
2834 | return -EMSGSIZE; | ||
2835 | } | ||
2836 | |||
2837 | static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, | ||
2838 | struct netlink_callback *cb) | ||
2839 | { | ||
2840 | void *hdr; | ||
2841 | |||
2842 | hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, | ||
2843 | &ip_vs_genl_family, NLM_F_MULTI, | ||
2844 | IPVS_CMD_NEW_DEST); | ||
2845 | if (!hdr) | ||
2846 | return -EMSGSIZE; | ||
2847 | |||
2848 | if (ip_vs_genl_fill_dest(skb, dest) < 0) | ||
2849 | goto nla_put_failure; | ||
2850 | |||
2851 | return genlmsg_end(skb, hdr); | ||
2852 | |||
2853 | nla_put_failure: | ||
2854 | genlmsg_cancel(skb, hdr); | ||
2855 | return -EMSGSIZE; | ||
2856 | } | ||
2857 | |||
2858 | static int ip_vs_genl_dump_dests(struct sk_buff *skb, | ||
2859 | struct netlink_callback *cb) | ||
2860 | { | ||
2861 | int idx = 0; | ||
2862 | int start = cb->args[0]; | ||
2863 | struct ip_vs_service *svc; | ||
2864 | struct ip_vs_dest *dest; | ||
2865 | struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; | ||
2866 | |||
2867 | mutex_lock(&__ip_vs_mutex); | ||
2868 | |||
2869 | /* Try to find the service for which to dump destinations */ | ||
2870 | if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, | ||
2871 | IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy)) | ||
2872 | goto out_err; | ||
2873 | |||
2874 | svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]); | ||
2875 | if (IS_ERR(svc) || svc == NULL) | ||
2876 | goto out_err; | ||
2877 | |||
2878 | /* Dump the destinations */ | ||
2879 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
2880 | if (++idx <= start) | ||
2881 | continue; | ||
2882 | if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { | ||
2883 | idx--; | ||
2884 | goto nla_put_failure; | ||
2885 | } | ||
2886 | } | ||
2887 | |||
2888 | nla_put_failure: | ||
2889 | cb->args[0] = idx; | ||
2890 | ip_vs_service_put(svc); | ||
2891 | |||
2892 | out_err: | ||
2893 | mutex_unlock(&__ip_vs_mutex); | ||
2894 | |||
2895 | return skb->len; | ||
2896 | } | ||
2897 | |||
2898 | static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, | ||
2899 | struct nlattr *nla, int full_entry) | ||
2900 | { | ||
2901 | struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; | ||
2902 | struct nlattr *nla_addr, *nla_port; | ||
2903 | |||
2904 | /* Parse mandatory identifying destination fields first */ | ||
2905 | if (nla == NULL || | ||
2906 | nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy)) | ||
2907 | return -EINVAL; | ||
2908 | |||
2909 | nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; | ||
2910 | nla_port = attrs[IPVS_DEST_ATTR_PORT]; | ||
2911 | |||
2912 | if (!(nla_addr && nla_port)) | ||
2913 | return -EINVAL; | ||
2914 | |||
2915 | nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); | ||
2916 | udest->port = nla_get_u16(nla_port); | ||
2917 | |||
2918 | /* If a full entry was requested, check for the additional fields */ | ||
2919 | if (full_entry) { | ||
2920 | struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, | ||
2921 | *nla_l_thresh; | ||
2922 | |||
2923 | nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; | ||
2924 | nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; | ||
2925 | nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; | ||
2926 | nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; | ||
2927 | |||
2928 | if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) | ||
2929 | return -EINVAL; | ||
2930 | |||
2931 | udest->conn_flags = nla_get_u32(nla_fwd) | ||
2932 | & IP_VS_CONN_F_FWD_MASK; | ||
2933 | udest->weight = nla_get_u32(nla_weight); | ||
2934 | udest->u_threshold = nla_get_u32(nla_u_thresh); | ||
2935 | udest->l_threshold = nla_get_u32(nla_l_thresh); | ||
2936 | } | ||
2937 | |||
2938 | return 0; | ||
2939 | } | ||
2940 | |||
2941 | static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, | ||
2942 | const char *mcast_ifn, __be32 syncid) | ||
2943 | { | ||
2944 | struct nlattr *nl_daemon; | ||
2945 | |||
2946 | nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON); | ||
2947 | if (!nl_daemon) | ||
2948 | return -EMSGSIZE; | ||
2949 | |||
2950 | NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state); | ||
2951 | NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn); | ||
2952 | NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid); | ||
2953 | |||
2954 | nla_nest_end(skb, nl_daemon); | ||
2955 | |||
2956 | return 0; | ||
2957 | |||
2958 | nla_put_failure: | ||
2959 | nla_nest_cancel(skb, nl_daemon); | ||
2960 | return -EMSGSIZE; | ||
2961 | } | ||
2962 | |||
2963 | static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state, | ||
2964 | const char *mcast_ifn, __be32 syncid, | ||
2965 | struct netlink_callback *cb) | ||
2966 | { | ||
2967 | void *hdr; | ||
2968 | hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, | ||
2969 | &ip_vs_genl_family, NLM_F_MULTI, | ||
2970 | IPVS_CMD_NEW_DAEMON); | ||
2971 | if (!hdr) | ||
2972 | return -EMSGSIZE; | ||
2973 | |||
2974 | if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) | ||
2975 | goto nla_put_failure; | ||
2976 | |||
2977 | return genlmsg_end(skb, hdr); | ||
2978 | |||
2979 | nla_put_failure: | ||
2980 | genlmsg_cancel(skb, hdr); | ||
2981 | return -EMSGSIZE; | ||
2982 | } | ||
2983 | |||
2984 | static int ip_vs_genl_dump_daemons(struct sk_buff *skb, | ||
2985 | struct netlink_callback *cb) | ||
2986 | { | ||
2987 | mutex_lock(&__ip_vs_mutex); | ||
2988 | if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { | ||
2989 | if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, | ||
2990 | ip_vs_master_mcast_ifn, | ||
2991 | ip_vs_master_syncid, cb) < 0) | ||
2992 | goto nla_put_failure; | ||
2993 | |||
2994 | cb->args[0] = 1; | ||
2995 | } | ||
2996 | |||
2997 | if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { | ||
2998 | if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, | ||
2999 | ip_vs_backup_mcast_ifn, | ||
3000 | ip_vs_backup_syncid, cb) < 0) | ||
3001 | goto nla_put_failure; | ||
3002 | |||
3003 | cb->args[1] = 1; | ||
3004 | } | ||
3005 | |||
3006 | nla_put_failure: | ||
3007 | mutex_unlock(&__ip_vs_mutex); | ||
3008 | |||
3009 | return skb->len; | ||
3010 | } | ||
3011 | |||
3012 | static int ip_vs_genl_new_daemon(struct nlattr **attrs) | ||
3013 | { | ||
3014 | if (!(attrs[IPVS_DAEMON_ATTR_STATE] && | ||
3015 | attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && | ||
3016 | attrs[IPVS_DAEMON_ATTR_SYNC_ID])) | ||
3017 | return -EINVAL; | ||
3018 | |||
3019 | return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), | ||
3020 | nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), | ||
3021 | nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); | ||
3022 | } | ||
3023 | |||
3024 | static int ip_vs_genl_del_daemon(struct nlattr **attrs) | ||
3025 | { | ||
3026 | if (!attrs[IPVS_DAEMON_ATTR_STATE]) | ||
3027 | return -EINVAL; | ||
3028 | |||
3029 | return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); | ||
3030 | } | ||
3031 | |||
3032 | static int ip_vs_genl_set_config(struct nlattr **attrs) | ||
3033 | { | ||
3034 | struct ip_vs_timeout_user t; | ||
3035 | |||
3036 | __ip_vs_get_timeouts(&t); | ||
3037 | |||
3038 | if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) | ||
3039 | t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); | ||
3040 | |||
3041 | if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) | ||
3042 | t.tcp_fin_timeout = | ||
3043 | nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); | ||
3044 | |||
3045 | if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) | ||
3046 | t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); | ||
3047 | |||
3048 | return ip_vs_set_timeout(&t); | ||
3049 | } | ||
3050 | |||
3051 | static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) | ||
3052 | { | ||
3053 | struct ip_vs_service *svc = NULL; | ||
3054 | struct ip_vs_service_user_kern usvc; | ||
3055 | struct ip_vs_dest_user_kern udest; | ||
3056 | int ret = 0, cmd; | ||
3057 | int need_full_svc = 0, need_full_dest = 0; | ||
3058 | |||
3059 | cmd = info->genlhdr->cmd; | ||
3060 | |||
3061 | mutex_lock(&__ip_vs_mutex); | ||
3062 | |||
3063 | if (cmd == IPVS_CMD_FLUSH) { | ||
3064 | ret = ip_vs_flush(); | ||
3065 | goto out; | ||
3066 | } else if (cmd == IPVS_CMD_SET_CONFIG) { | ||
3067 | ret = ip_vs_genl_set_config(info->attrs); | ||
3068 | goto out; | ||
3069 | } else if (cmd == IPVS_CMD_NEW_DAEMON || | ||
3070 | cmd == IPVS_CMD_DEL_DAEMON) { | ||
3071 | |||
3072 | struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; | ||
3073 | |||
3074 | if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || | ||
3075 | nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, | ||
3076 | info->attrs[IPVS_CMD_ATTR_DAEMON], | ||
3077 | ip_vs_daemon_policy)) { | ||
3078 | ret = -EINVAL; | ||
3079 | goto out; | ||
3080 | } | ||
3081 | |||
3082 | if (cmd == IPVS_CMD_NEW_DAEMON) | ||
3083 | ret = ip_vs_genl_new_daemon(daemon_attrs); | ||
3084 | else | ||
3085 | ret = ip_vs_genl_del_daemon(daemon_attrs); | ||
3086 | goto out; | ||
3087 | } else if (cmd == IPVS_CMD_ZERO && | ||
3088 | !info->attrs[IPVS_CMD_ATTR_SERVICE]) { | ||
3089 | ret = ip_vs_zero_all(); | ||
3090 | goto out; | ||
3091 | } | ||
3092 | |||
3093 | /* All following commands require a service argument, so check if we | ||
3094 | * received a valid one. We need a full service specification when | ||
3095 | * adding / editing a service. Only identifying members otherwise. */ | ||
3096 | if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) | ||
3097 | need_full_svc = 1; | ||
3098 | |||
3099 | ret = ip_vs_genl_parse_service(&usvc, | ||
3100 | info->attrs[IPVS_CMD_ATTR_SERVICE], | ||
3101 | need_full_svc); | ||
3102 | if (ret) | ||
3103 | goto out; | ||
3104 | |||
3105 | /* Lookup the exact service by <protocol, addr, port> or fwmark */ | ||
3106 | if (usvc.fwmark == 0) | ||
3107 | svc = __ip_vs_service_get(usvc.af, usvc.protocol, | ||
3108 | &usvc.addr, usvc.port); | ||
3109 | else | ||
3110 | svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); | ||
3111 | |||
3112 | /* Unless we're adding a new service, the service must already exist */ | ||
3113 | if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { | ||
3114 | ret = -ESRCH; | ||
3115 | goto out; | ||
3116 | } | ||
3117 | |||
3118 | /* Destination commands require a valid destination argument. For | ||
3119 | * adding / editing a destination, we need a full destination | ||
3120 | * specification. */ | ||
3121 | if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || | ||
3122 | cmd == IPVS_CMD_DEL_DEST) { | ||
3123 | if (cmd != IPVS_CMD_DEL_DEST) | ||
3124 | need_full_dest = 1; | ||
3125 | |||
3126 | ret = ip_vs_genl_parse_dest(&udest, | ||
3127 | info->attrs[IPVS_CMD_ATTR_DEST], | ||
3128 | need_full_dest); | ||
3129 | if (ret) | ||
3130 | goto out; | ||
3131 | } | ||
3132 | |||
3133 | switch (cmd) { | ||
3134 | case IPVS_CMD_NEW_SERVICE: | ||
3135 | if (svc == NULL) | ||
3136 | ret = ip_vs_add_service(&usvc, &svc); | ||
3137 | else | ||
3138 | ret = -EEXIST; | ||
3139 | break; | ||
3140 | case IPVS_CMD_SET_SERVICE: | ||
3141 | ret = ip_vs_edit_service(svc, &usvc); | ||
3142 | break; | ||
3143 | case IPVS_CMD_DEL_SERVICE: | ||
3144 | ret = ip_vs_del_service(svc); | ||
3145 | break; | ||
3146 | case IPVS_CMD_NEW_DEST: | ||
3147 | ret = ip_vs_add_dest(svc, &udest); | ||
3148 | break; | ||
3149 | case IPVS_CMD_SET_DEST: | ||
3150 | ret = ip_vs_edit_dest(svc, &udest); | ||
3151 | break; | ||
3152 | case IPVS_CMD_DEL_DEST: | ||
3153 | ret = ip_vs_del_dest(svc, &udest); | ||
3154 | break; | ||
3155 | case IPVS_CMD_ZERO: | ||
3156 | ret = ip_vs_zero_service(svc); | ||
3157 | break; | ||
3158 | default: | ||
3159 | ret = -EINVAL; | ||
3160 | } | ||
3161 | |||
3162 | out: | ||
3163 | if (svc) | ||
3164 | ip_vs_service_put(svc); | ||
3165 | mutex_unlock(&__ip_vs_mutex); | ||
3166 | |||
3167 | return ret; | ||
3168 | } | ||
3169 | |||
3170 | static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) | ||
3171 | { | ||
3172 | struct sk_buff *msg; | ||
3173 | void *reply; | ||
3174 | int ret, cmd, reply_cmd; | ||
3175 | |||
3176 | cmd = info->genlhdr->cmd; | ||
3177 | |||
3178 | if (cmd == IPVS_CMD_GET_SERVICE) | ||
3179 | reply_cmd = IPVS_CMD_NEW_SERVICE; | ||
3180 | else if (cmd == IPVS_CMD_GET_INFO) | ||
3181 | reply_cmd = IPVS_CMD_SET_INFO; | ||
3182 | else if (cmd == IPVS_CMD_GET_CONFIG) | ||
3183 | reply_cmd = IPVS_CMD_SET_CONFIG; | ||
3184 | else { | ||
3185 | IP_VS_ERR("unknown Generic Netlink command\n"); | ||
3186 | return -EINVAL; | ||
3187 | } | ||
3188 | |||
3189 | msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); | ||
3190 | if (!msg) | ||
3191 | return -ENOMEM; | ||
3192 | |||
3193 | mutex_lock(&__ip_vs_mutex); | ||
3194 | |||
3195 | reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); | ||
3196 | if (reply == NULL) | ||
3197 | goto nla_put_failure; | ||
3198 | |||
3199 | switch (cmd) { | ||
3200 | case IPVS_CMD_GET_SERVICE: | ||
3201 | { | ||
3202 | struct ip_vs_service *svc; | ||
3203 | |||
3204 | svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]); | ||
3205 | if (IS_ERR(svc)) { | ||
3206 | ret = PTR_ERR(svc); | ||
3207 | goto out_err; | ||
3208 | } else if (svc) { | ||
3209 | ret = ip_vs_genl_fill_service(msg, svc); | ||
3210 | ip_vs_service_put(svc); | ||
3211 | if (ret) | ||
3212 | goto nla_put_failure; | ||
3213 | } else { | ||
3214 | ret = -ESRCH; | ||
3215 | goto out_err; | ||
3216 | } | ||
3217 | |||
3218 | break; | ||
3219 | } | ||
3220 | |||
3221 | case IPVS_CMD_GET_CONFIG: | ||
3222 | { | ||
3223 | struct ip_vs_timeout_user t; | ||
3224 | |||
3225 | __ip_vs_get_timeouts(&t); | ||
3226 | #ifdef CONFIG_IP_VS_PROTO_TCP | ||
3227 | NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); | ||
3228 | NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, | ||
3229 | t.tcp_fin_timeout); | ||
3230 | #endif | ||
3231 | #ifdef CONFIG_IP_VS_PROTO_UDP | ||
3232 | NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout); | ||
3233 | #endif | ||
3234 | |||
3235 | break; | ||
3236 | } | ||
3237 | |||
3238 | case IPVS_CMD_GET_INFO: | ||
3239 | NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); | ||
3240 | NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, | ||
3241 | IP_VS_CONN_TAB_SIZE); | ||
3242 | break; | ||
3243 | } | ||
3244 | |||
3245 | genlmsg_end(msg, reply); | ||
3246 | ret = genlmsg_unicast(msg, info->snd_pid); | ||
3247 | goto out; | ||
3248 | |||
3249 | nla_put_failure: | ||
3250 | IP_VS_ERR("not enough space in Netlink message\n"); | ||
3251 | ret = -EMSGSIZE; | ||
3252 | |||
3253 | out_err: | ||
3254 | nlmsg_free(msg); | ||
3255 | out: | ||
3256 | mutex_unlock(&__ip_vs_mutex); | ||
3257 | |||
3258 | return ret; | ||
3259 | } | ||
3260 | |||
3261 | |||
3262 | static struct genl_ops ip_vs_genl_ops[] __read_mostly = { | ||
3263 | { | ||
3264 | .cmd = IPVS_CMD_NEW_SERVICE, | ||
3265 | .flags = GENL_ADMIN_PERM, | ||
3266 | .policy = ip_vs_cmd_policy, | ||
3267 | .doit = ip_vs_genl_set_cmd, | ||
3268 | }, | ||
3269 | { | ||
3270 | .cmd = IPVS_CMD_SET_SERVICE, | ||
3271 | .flags = GENL_ADMIN_PERM, | ||
3272 | .policy = ip_vs_cmd_policy, | ||
3273 | .doit = ip_vs_genl_set_cmd, | ||
3274 | }, | ||
3275 | { | ||
3276 | .cmd = IPVS_CMD_DEL_SERVICE, | ||
3277 | .flags = GENL_ADMIN_PERM, | ||
3278 | .policy = ip_vs_cmd_policy, | ||
3279 | .doit = ip_vs_genl_set_cmd, | ||
3280 | }, | ||
3281 | { | ||
3282 | .cmd = IPVS_CMD_GET_SERVICE, | ||
3283 | .flags = GENL_ADMIN_PERM, | ||
3284 | .doit = ip_vs_genl_get_cmd, | ||
3285 | .dumpit = ip_vs_genl_dump_services, | ||
3286 | .policy = ip_vs_cmd_policy, | ||
3287 | }, | ||
3288 | { | ||
3289 | .cmd = IPVS_CMD_NEW_DEST, | ||
3290 | .flags = GENL_ADMIN_PERM, | ||
3291 | .policy = ip_vs_cmd_policy, | ||
3292 | .doit = ip_vs_genl_set_cmd, | ||
3293 | }, | ||
3294 | { | ||
3295 | .cmd = IPVS_CMD_SET_DEST, | ||
3296 | .flags = GENL_ADMIN_PERM, | ||
3297 | .policy = ip_vs_cmd_policy, | ||
3298 | .doit = ip_vs_genl_set_cmd, | ||
3299 | }, | ||
3300 | { | ||
3301 | .cmd = IPVS_CMD_DEL_DEST, | ||
3302 | .flags = GENL_ADMIN_PERM, | ||
3303 | .policy = ip_vs_cmd_policy, | ||
3304 | .doit = ip_vs_genl_set_cmd, | ||
3305 | }, | ||
3306 | { | ||
3307 | .cmd = IPVS_CMD_GET_DEST, | ||
3308 | .flags = GENL_ADMIN_PERM, | ||
3309 | .policy = ip_vs_cmd_policy, | ||
3310 | .dumpit = ip_vs_genl_dump_dests, | ||
3311 | }, | ||
3312 | { | ||
3313 | .cmd = IPVS_CMD_NEW_DAEMON, | ||
3314 | .flags = GENL_ADMIN_PERM, | ||
3315 | .policy = ip_vs_cmd_policy, | ||
3316 | .doit = ip_vs_genl_set_cmd, | ||
3317 | }, | ||
3318 | { | ||
3319 | .cmd = IPVS_CMD_DEL_DAEMON, | ||
3320 | .flags = GENL_ADMIN_PERM, | ||
3321 | .policy = ip_vs_cmd_policy, | ||
3322 | .doit = ip_vs_genl_set_cmd, | ||
3323 | }, | ||
3324 | { | ||
3325 | .cmd = IPVS_CMD_GET_DAEMON, | ||
3326 | .flags = GENL_ADMIN_PERM, | ||
3327 | .dumpit = ip_vs_genl_dump_daemons, | ||
3328 | }, | ||
3329 | { | ||
3330 | .cmd = IPVS_CMD_SET_CONFIG, | ||
3331 | .flags = GENL_ADMIN_PERM, | ||
3332 | .policy = ip_vs_cmd_policy, | ||
3333 | .doit = ip_vs_genl_set_cmd, | ||
3334 | }, | ||
3335 | { | ||
3336 | .cmd = IPVS_CMD_GET_CONFIG, | ||
3337 | .flags = GENL_ADMIN_PERM, | ||
3338 | .doit = ip_vs_genl_get_cmd, | ||
3339 | }, | ||
3340 | { | ||
3341 | .cmd = IPVS_CMD_GET_INFO, | ||
3342 | .flags = GENL_ADMIN_PERM, | ||
3343 | .doit = ip_vs_genl_get_cmd, | ||
3344 | }, | ||
3345 | { | ||
3346 | .cmd = IPVS_CMD_ZERO, | ||
3347 | .flags = GENL_ADMIN_PERM, | ||
3348 | .policy = ip_vs_cmd_policy, | ||
3349 | .doit = ip_vs_genl_set_cmd, | ||
3350 | }, | ||
3351 | { | ||
3352 | .cmd = IPVS_CMD_FLUSH, | ||
3353 | .flags = GENL_ADMIN_PERM, | ||
3354 | .doit = ip_vs_genl_set_cmd, | ||
3355 | }, | ||
3356 | }; | ||
3357 | |||
3358 | static int __init ip_vs_genl_register(void) | ||
3359 | { | ||
3360 | int ret, i; | ||
3361 | |||
3362 | ret = genl_register_family(&ip_vs_genl_family); | ||
3363 | if (ret) | ||
3364 | return ret; | ||
3365 | |||
3366 | for (i = 0; i < ARRAY_SIZE(ip_vs_genl_ops); i++) { | ||
3367 | ret = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[i]); | ||
3368 | if (ret) | ||
3369 | goto err_out; | ||
3370 | } | ||
3371 | return 0; | ||
3372 | |||
3373 | err_out: | ||
3374 | genl_unregister_family(&ip_vs_genl_family); | ||
3375 | return ret; | ||
3376 | } | ||
3377 | |||
3378 | static void ip_vs_genl_unregister(void) | ||
3379 | { | ||
3380 | genl_unregister_family(&ip_vs_genl_family); | ||
3381 | } | ||
3382 | |||
3383 | /* End of Generic Netlink interface definitions */ | ||
3384 | |||
3385 | |||
3386 | int __init ip_vs_control_init(void) | ||
3387 | { | ||
3388 | int ret; | ||
3389 | int idx; | ||
3390 | |||
3391 | EnterFunction(2); | ||
3392 | |||
3393 | ret = nf_register_sockopt(&ip_vs_sockopts); | ||
3394 | if (ret) { | ||
3395 | IP_VS_ERR("cannot register sockopt.\n"); | ||
3396 | return ret; | ||
3397 | } | ||
3398 | |||
3399 | ret = ip_vs_genl_register(); | ||
3400 | if (ret) { | ||
3401 | IP_VS_ERR("cannot register Generic Netlink interface.\n"); | ||
3402 | nf_unregister_sockopt(&ip_vs_sockopts); | ||
3403 | return ret; | ||
3404 | } | ||
3405 | |||
3406 | proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); | ||
3407 | proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); | ||
3408 | |||
3409 | sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); | ||
3410 | |||
3411 | /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ | ||
3412 | for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
3413 | INIT_LIST_HEAD(&ip_vs_svc_table[idx]); | ||
3414 | INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); | ||
3415 | } | ||
3416 | for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { | ||
3417 | INIT_LIST_HEAD(&ip_vs_rtable[idx]); | ||
3418 | } | ||
3419 | |||
3420 | ip_vs_new_estimator(&ip_vs_stats); | ||
3421 | |||
3422 | /* Hook the defense timer */ | ||
3423 | schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); | ||
3424 | |||
3425 | LeaveFunction(2); | ||
3426 | return 0; | ||
3427 | } | ||
3428 | |||
3429 | |||
3430 | void ip_vs_control_cleanup(void) | ||
3431 | { | ||
3432 | EnterFunction(2); | ||
3433 | ip_vs_trash_cleanup(); | ||
3434 | cancel_rearming_delayed_work(&defense_work); | ||
3435 | cancel_work_sync(&defense_work.work); | ||
3436 | ip_vs_kill_estimator(&ip_vs_stats); | ||
3437 | unregister_sysctl_table(sysctl_header); | ||
3438 | proc_net_remove(&init_net, "ip_vs_stats"); | ||
3439 | proc_net_remove(&init_net, "ip_vs"); | ||
3440 | ip_vs_genl_unregister(); | ||
3441 | nf_unregister_sockopt(&ip_vs_sockopts); | ||
3442 | LeaveFunction(2); | ||
3443 | } | ||
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c deleted file mode 100644 index 2eb2860dabb5..000000000000 --- a/net/ipv4/ipvs/ip_vs_est.c +++ /dev/null | |||
@@ -1,166 +0,0 @@ | |||
1 | /* | ||
2 | * ip_vs_est.c: simple rate estimator for IPVS | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Changes: | ||
12 | * | ||
13 | */ | ||
14 | #include <linux/kernel.h> | ||
15 | #include <linux/jiffies.h> | ||
16 | #include <linux/slab.h> | ||
17 | #include <linux/types.h> | ||
18 | #include <linux/interrupt.h> | ||
19 | #include <linux/sysctl.h> | ||
20 | #include <linux/list.h> | ||
21 | |||
22 | #include <net/ip_vs.h> | ||
23 | |||
24 | /* | ||
25 | This code is to estimate rate in a shorter interval (such as 8 | ||
26 | seconds) for virtual services and real servers. To measure rate over a | ||
27 | long interval, it is easy to implement a user level daemon which | ||
28 | periodically reads those statistical counters and measures the rate. | ||
29 | |||
30 | Currently, the measurement is activated by slow timer handler. Hope | ||
31 | this measurement will not introduce too much load. | ||
32 | |||
33 | We measure rate during the last 8 seconds every 2 seconds: | ||
34 | |||
35 | avgrate = avgrate*(1-W) + rate*W | ||
36 | |||
37 | where W = 2^(-2) | ||
38 | |||
39 | NOTES. | ||
40 | |||
41 | * The stored value for average bps is scaled by 2^5, so that maximal | ||
42 | rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. | ||
43 | |||
44 | * A lot of code is taken from net/sched/estimator.c | ||
45 | */ | ||
46 | |||
47 | |||
48 | static void estimation_timer(unsigned long arg); | ||
49 | |||
50 | static LIST_HEAD(est_list); | ||
51 | static DEFINE_SPINLOCK(est_lock); | ||
52 | static DEFINE_TIMER(est_timer, estimation_timer, 0, 0); | ||
53 | |||
54 | static void estimation_timer(unsigned long arg) | ||
55 | { | ||
56 | struct ip_vs_estimator *e; | ||
57 | struct ip_vs_stats *s; | ||
58 | u32 n_conns; | ||
59 | u32 n_inpkts, n_outpkts; | ||
60 | u64 n_inbytes, n_outbytes; | ||
61 | u32 rate; | ||
62 | |||
63 | spin_lock(&est_lock); | ||
64 | list_for_each_entry(e, &est_list, list) { | ||
65 | s = container_of(e, struct ip_vs_stats, est); | ||
66 | |||
67 | spin_lock(&s->lock); | ||
68 | n_conns = s->ustats.conns; | ||
69 | n_inpkts = s->ustats.inpkts; | ||
70 | n_outpkts = s->ustats.outpkts; | ||
71 | n_inbytes = s->ustats.inbytes; | ||
72 | n_outbytes = s->ustats.outbytes; | ||
73 | |||
74 | /* scaled by 2^10, but divided 2 seconds */ | ||
75 | rate = (n_conns - e->last_conns)<<9; | ||
76 | e->last_conns = n_conns; | ||
77 | e->cps += ((long)rate - (long)e->cps)>>2; | ||
78 | s->ustats.cps = (e->cps+0x1FF)>>10; | ||
79 | |||
80 | rate = (n_inpkts - e->last_inpkts)<<9; | ||
81 | e->last_inpkts = n_inpkts; | ||
82 | e->inpps += ((long)rate - (long)e->inpps)>>2; | ||
83 | s->ustats.inpps = (e->inpps+0x1FF)>>10; | ||
84 | |||
85 | rate = (n_outpkts - e->last_outpkts)<<9; | ||
86 | e->last_outpkts = n_outpkts; | ||
87 | e->outpps += ((long)rate - (long)e->outpps)>>2; | ||
88 | s->ustats.outpps = (e->outpps+0x1FF)>>10; | ||
89 | |||
90 | rate = (n_inbytes - e->last_inbytes)<<4; | ||
91 | e->last_inbytes = n_inbytes; | ||
92 | e->inbps += ((long)rate - (long)e->inbps)>>2; | ||
93 | s->ustats.inbps = (e->inbps+0xF)>>5; | ||
94 | |||
95 | rate = (n_outbytes - e->last_outbytes)<<4; | ||
96 | e->last_outbytes = n_outbytes; | ||
97 | e->outbps += ((long)rate - (long)e->outbps)>>2; | ||
98 | s->ustats.outbps = (e->outbps+0xF)>>5; | ||
99 | spin_unlock(&s->lock); | ||
100 | } | ||
101 | spin_unlock(&est_lock); | ||
102 | mod_timer(&est_timer, jiffies + 2*HZ); | ||
103 | } | ||
104 | |||
105 | void ip_vs_new_estimator(struct ip_vs_stats *stats) | ||
106 | { | ||
107 | struct ip_vs_estimator *est = &stats->est; | ||
108 | |||
109 | INIT_LIST_HEAD(&est->list); | ||
110 | |||
111 | est->last_conns = stats->ustats.conns; | ||
112 | est->cps = stats->ustats.cps<<10; | ||
113 | |||
114 | est->last_inpkts = stats->ustats.inpkts; | ||
115 | est->inpps = stats->ustats.inpps<<10; | ||
116 | |||
117 | est->last_outpkts = stats->ustats.outpkts; | ||
118 | est->outpps = stats->ustats.outpps<<10; | ||
119 | |||
120 | est->last_inbytes = stats->ustats.inbytes; | ||
121 | est->inbps = stats->ustats.inbps<<5; | ||
122 | |||
123 | est->last_outbytes = stats->ustats.outbytes; | ||
124 | est->outbps = stats->ustats.outbps<<5; | ||
125 | |||
126 | spin_lock_bh(&est_lock); | ||
127 | list_add(&est->list, &est_list); | ||
128 | spin_unlock_bh(&est_lock); | ||
129 | } | ||
130 | |||
131 | void ip_vs_kill_estimator(struct ip_vs_stats *stats) | ||
132 | { | ||
133 | struct ip_vs_estimator *est = &stats->est; | ||
134 | |||
135 | spin_lock_bh(&est_lock); | ||
136 | list_del(&est->list); | ||
137 | spin_unlock_bh(&est_lock); | ||
138 | } | ||
139 | |||
140 | void ip_vs_zero_estimator(struct ip_vs_stats *stats) | ||
141 | { | ||
142 | struct ip_vs_estimator *est = &stats->est; | ||
143 | |||
144 | /* set counters zero, caller must hold the stats->lock lock */ | ||
145 | est->last_inbytes = 0; | ||
146 | est->last_outbytes = 0; | ||
147 | est->last_conns = 0; | ||
148 | est->last_inpkts = 0; | ||
149 | est->last_outpkts = 0; | ||
150 | est->cps = 0; | ||
151 | est->inpps = 0; | ||
152 | est->outpps = 0; | ||
153 | est->inbps = 0; | ||
154 | est->outbps = 0; | ||
155 | } | ||
156 | |||
157 | int __init ip_vs_estimator_init(void) | ||
158 | { | ||
159 | mod_timer(&est_timer, jiffies + 2 * HZ); | ||
160 | return 0; | ||
161 | } | ||
162 | |||
163 | void ip_vs_estimator_cleanup(void) | ||
164 | { | ||
165 | del_timer_sync(&est_timer); | ||
166 | } | ||
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c deleted file mode 100644 index 2e7dbd8b73a4..000000000000 --- a/net/ipv4/ipvs/ip_vs_ftp.c +++ /dev/null | |||
@@ -1,410 +0,0 @@ | |||
1 | /* | ||
2 | * ip_vs_ftp.c: IPVS ftp application module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * | ||
6 | * Changes: | ||
7 | * | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * as published by the Free Software Foundation; either version | ||
12 | * 2 of the License, or (at your option) any later version. | ||
13 | * | ||
14 | * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference | ||
15 | * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. | ||
16 | * | ||
17 | * IP_MASQ_FTP ftp masquerading module | ||
18 | * | ||
19 | * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 | ||
20 | * | ||
21 | * Author: Wouter Gadeyne | ||
22 | * | ||
23 | */ | ||
24 | |||
25 | #include <linux/module.h> | ||
26 | #include <linux/moduleparam.h> | ||
27 | #include <linux/kernel.h> | ||
28 | #include <linux/skbuff.h> | ||
29 | #include <linux/in.h> | ||
30 | #include <linux/ip.h> | ||
31 | #include <linux/netfilter.h> | ||
32 | #include <net/protocol.h> | ||
33 | #include <net/tcp.h> | ||
34 | #include <asm/unaligned.h> | ||
35 | |||
36 | #include <net/ip_vs.h> | ||
37 | |||
38 | |||
39 | #define SERVER_STRING "227 Entering Passive Mode (" | ||
40 | #define CLIENT_STRING "PORT " | ||
41 | |||
42 | |||
/*
 * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
 * First port is set to the default port (21); the remaining slots are 0.
 * Slots that are 0 are skipped when the helper registers its ports in
 * ip_vs_ftp_init().
 */
static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0};
module_param_array(ports, ushort, NULL, 0);
MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
50 | |||
51 | |||
/*
 * Dummy variable: only its address is used.  ip_vs_ftp_in() stores
 * &ip_vs_ftp_pasv in cp->app_data when it sees a PASV command, and
 * ip_vs_ftp_out() compares cp->app_data against it to decide whether to
 * look for the "227 Entering Passive Mode" response.
 */
static int ip_vs_ftp_pasv;
54 | |||
55 | |||
/* Connection-init hook of the ftp helper: nothing to set up, always 0. */
static int
ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
{
	return 0;
}
61 | |||
62 | |||
/* Connection-done hook of the ftp helper: nothing to tear down, always 0. */
static int
ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
{
	return 0;
}
68 | |||
69 | |||
70 | /* | ||
71 | * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started | ||
72 | * with the "pattern" and terminated with the "term" character. | ||
73 | * <addr,port> is in network order. | ||
74 | */ | ||
75 | static int ip_vs_ftp_get_addrport(char *data, char *data_limit, | ||
76 | const char *pattern, size_t plen, char term, | ||
77 | __be32 *addr, __be16 *port, | ||
78 | char **start, char **end) | ||
79 | { | ||
80 | unsigned char p[6]; | ||
81 | int i = 0; | ||
82 | |||
83 | if (data_limit - data < plen) { | ||
84 | /* check if there is partial match */ | ||
85 | if (strnicmp(data, pattern, data_limit - data) == 0) | ||
86 | return -1; | ||
87 | else | ||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | if (strnicmp(data, pattern, plen) != 0) { | ||
92 | return 0; | ||
93 | } | ||
94 | *start = data + plen; | ||
95 | |||
96 | for (data = *start; *data != term; data++) { | ||
97 | if (data == data_limit) | ||
98 | return -1; | ||
99 | } | ||
100 | *end = data; | ||
101 | |||
102 | memset(p, 0, sizeof(p)); | ||
103 | for (data = *start; data != *end; data++) { | ||
104 | if (*data >= '0' && *data <= '9') { | ||
105 | p[i] = p[i]*10 + *data - '0'; | ||
106 | } else if (*data == ',' && i < 5) { | ||
107 | i++; | ||
108 | } else { | ||
109 | /* unexpected character */ | ||
110 | return -1; | ||
111 | } | ||
112 | } | ||
113 | |||
114 | if (i != 5) | ||
115 | return -1; | ||
116 | |||
117 | *addr = get_unaligned((__be32 *)p); | ||
118 | *port = get_unaligned((__be16 *)(p + 4)); | ||
119 | return 1; | ||
120 | } | ||
121 | |||
122 | |||
123 | /* | ||
124 | * Look at outgoing ftp packets to catch the response to a PASV command | ||
125 | * from the server (inside-to-outside). | ||
126 | * When we see one, we build a connection entry with the client address, | ||
127 | * client port 0 (unknown at the moment), the server address and the | ||
128 | * server port. Mark the current connection entry as a control channel | ||
129 | * of the new entry. All this work is just to make the data connection | ||
130 | * can be scheduled to the right server later. | ||
131 | * | ||
132 | * The outgoing packet should be something like | ||
133 | * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". | ||
134 | * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. | ||
135 | */ | ||
136 | static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, | ||
137 | struct sk_buff *skb, int *diff) | ||
138 | { | ||
139 | struct iphdr *iph; | ||
140 | struct tcphdr *th; | ||
141 | char *data, *data_limit; | ||
142 | char *start, *end; | ||
143 | union nf_inet_addr from; | ||
144 | __be16 port; | ||
145 | struct ip_vs_conn *n_cp; | ||
146 | char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ | ||
147 | unsigned buf_len; | ||
148 | int ret; | ||
149 | |||
150 | #ifdef CONFIG_IP_VS_IPV6 | ||
151 | /* This application helper doesn't work with IPv6 yet, | ||
152 | * so turn this into a no-op for IPv6 packets | ||
153 | */ | ||
154 | if (cp->af == AF_INET6) | ||
155 | return 1; | ||
156 | #endif | ||
157 | |||
158 | *diff = 0; | ||
159 | |||
160 | /* Only useful for established sessions */ | ||
161 | if (cp->state != IP_VS_TCP_S_ESTABLISHED) | ||
162 | return 1; | ||
163 | |||
164 | /* Linear packets are much easier to deal with. */ | ||
165 | if (!skb_make_writable(skb, skb->len)) | ||
166 | return 0; | ||
167 | |||
168 | if (cp->app_data == &ip_vs_ftp_pasv) { | ||
169 | iph = ip_hdr(skb); | ||
170 | th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); | ||
171 | data = (char *)th + (th->doff << 2); | ||
172 | data_limit = skb_tail_pointer(skb); | ||
173 | |||
174 | if (ip_vs_ftp_get_addrport(data, data_limit, | ||
175 | SERVER_STRING, | ||
176 | sizeof(SERVER_STRING)-1, ')', | ||
177 | &from.ip, &port, | ||
178 | &start, &end) != 1) | ||
179 | return 1; | ||
180 | |||
181 | IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> " | ||
182 | "%u.%u.%u.%u:%d detected\n", | ||
183 | NIPQUAD(from.ip), ntohs(port), | ||
184 | NIPQUAD(cp->caddr.ip), 0); | ||
185 | |||
186 | /* | ||
187 | * Now update or create an connection entry for it | ||
188 | */ | ||
189 | n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port, | ||
190 | &cp->caddr, 0); | ||
191 | if (!n_cp) { | ||
192 | n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, | ||
193 | &cp->caddr, 0, | ||
194 | &cp->vaddr, port, | ||
195 | &from, port, | ||
196 | IP_VS_CONN_F_NO_CPORT, | ||
197 | cp->dest); | ||
198 | if (!n_cp) | ||
199 | return 0; | ||
200 | |||
201 | /* add its controller */ | ||
202 | ip_vs_control_add(n_cp, cp); | ||
203 | } | ||
204 | |||
205 | /* | ||
206 | * Replace the old passive address with the new one | ||
207 | */ | ||
208 | from.ip = n_cp->vaddr.ip; | ||
209 | port = n_cp->vport; | ||
210 | sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip), | ||
211 | (ntohs(port)>>8)&255, ntohs(port)&255); | ||
212 | buf_len = strlen(buf); | ||
213 | |||
214 | /* | ||
215 | * Calculate required delta-offset to keep TCP happy | ||
216 | */ | ||
217 | *diff = buf_len - (end-start); | ||
218 | |||
219 | if (*diff == 0) { | ||
220 | /* simply replace it with new passive address */ | ||
221 | memcpy(start, buf, buf_len); | ||
222 | ret = 1; | ||
223 | } else { | ||
224 | ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start, | ||
225 | end-start, buf, buf_len); | ||
226 | } | ||
227 | |||
228 | cp->app_data = NULL; | ||
229 | ip_vs_tcp_conn_listen(n_cp); | ||
230 | ip_vs_conn_put(n_cp); | ||
231 | return ret; | ||
232 | } | ||
233 | return 1; | ||
234 | } | ||
235 | |||
236 | |||
237 | /* | ||
238 | * Look at incoming ftp packets to catch the PASV/PORT command | ||
239 | * (outside-to-inside). | ||
240 | * | ||
241 | * The incoming packet having the PORT command should be something like | ||
242 | * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". | ||
243 | * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. | ||
244 | * In this case, we create a connection entry using the client address and | ||
245 | * port, so that the active ftp data connection from the server can reach | ||
246 | * the client. | ||
247 | */ | ||
248 | static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, | ||
249 | struct sk_buff *skb, int *diff) | ||
250 | { | ||
251 | struct iphdr *iph; | ||
252 | struct tcphdr *th; | ||
253 | char *data, *data_start, *data_limit; | ||
254 | char *start, *end; | ||
255 | union nf_inet_addr to; | ||
256 | __be16 port; | ||
257 | struct ip_vs_conn *n_cp; | ||
258 | |||
259 | #ifdef CONFIG_IP_VS_IPV6 | ||
260 | /* This application helper doesn't work with IPv6 yet, | ||
261 | * so turn this into a no-op for IPv6 packets | ||
262 | */ | ||
263 | if (cp->af == AF_INET6) | ||
264 | return 1; | ||
265 | #endif | ||
266 | |||
267 | /* no diff required for incoming packets */ | ||
268 | *diff = 0; | ||
269 | |||
270 | /* Only useful for established sessions */ | ||
271 | if (cp->state != IP_VS_TCP_S_ESTABLISHED) | ||
272 | return 1; | ||
273 | |||
274 | /* Linear packets are much easier to deal with. */ | ||
275 | if (!skb_make_writable(skb, skb->len)) | ||
276 | return 0; | ||
277 | |||
278 | /* | ||
279 | * Detecting whether it is passive | ||
280 | */ | ||
281 | iph = ip_hdr(skb); | ||
282 | th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); | ||
283 | |||
284 | /* Since there may be OPTIONS in the TCP packet and the HLEN is | ||
285 | the length of the header in 32-bit multiples, it is accurate | ||
286 | to calculate data address by th+HLEN*4 */ | ||
287 | data = data_start = (char *)th + (th->doff << 2); | ||
288 | data_limit = skb_tail_pointer(skb); | ||
289 | |||
290 | while (data <= data_limit - 6) { | ||
291 | if (strnicmp(data, "PASV\r\n", 6) == 0) { | ||
292 | /* Passive mode on */ | ||
293 | IP_VS_DBG(7, "got PASV at %td of %td\n", | ||
294 | data - data_start, | ||
295 | data_limit - data_start); | ||
296 | cp->app_data = &ip_vs_ftp_pasv; | ||
297 | return 1; | ||
298 | } | ||
299 | data++; | ||
300 | } | ||
301 | |||
302 | /* | ||
303 | * To support virtual FTP server, the scenerio is as follows: | ||
304 | * FTP client ----> Load Balancer ----> FTP server | ||
305 | * First detect the port number in the application data, | ||
306 | * then create a new connection entry for the coming data | ||
307 | * connection. | ||
308 | */ | ||
309 | if (ip_vs_ftp_get_addrport(data_start, data_limit, | ||
310 | CLIENT_STRING, sizeof(CLIENT_STRING)-1, | ||
311 | '\r', &to.ip, &port, | ||
312 | &start, &end) != 1) | ||
313 | return 1; | ||
314 | |||
315 | IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n", | ||
316 | NIPQUAD(to.ip), ntohs(port)); | ||
317 | |||
318 | /* Passive mode off */ | ||
319 | cp->app_data = NULL; | ||
320 | |||
321 | /* | ||
322 | * Now update or create a connection entry for it | ||
323 | */ | ||
324 | IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", | ||
325 | ip_vs_proto_name(iph->protocol), | ||
326 | NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0); | ||
327 | |||
328 | n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol, | ||
329 | &to, port, | ||
330 | &cp->vaddr, htons(ntohs(cp->vport)-1)); | ||
331 | if (!n_cp) { | ||
332 | n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, | ||
333 | &to, port, | ||
334 | &cp->vaddr, htons(ntohs(cp->vport)-1), | ||
335 | &cp->daddr, htons(ntohs(cp->dport)-1), | ||
336 | 0, | ||
337 | cp->dest); | ||
338 | if (!n_cp) | ||
339 | return 0; | ||
340 | |||
341 | /* add its controller */ | ||
342 | ip_vs_control_add(n_cp, cp); | ||
343 | } | ||
344 | |||
345 | /* | ||
346 | * Move tunnel to listen state | ||
347 | */ | ||
348 | ip_vs_tcp_conn_listen(n_cp); | ||
349 | ip_vs_conn_put(n_cp); | ||
350 | |||
351 | return 1; | ||
352 | } | ||
353 | |||
354 | |||
355 | static struct ip_vs_app ip_vs_ftp = { | ||
356 | .name = "ftp", | ||
357 | .type = IP_VS_APP_TYPE_FTP, | ||
358 | .protocol = IPPROTO_TCP, | ||
359 | .module = THIS_MODULE, | ||
360 | .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), | ||
361 | .init_conn = ip_vs_ftp_init_conn, | ||
362 | .done_conn = ip_vs_ftp_done_conn, | ||
363 | .bind_conn = NULL, | ||
364 | .unbind_conn = NULL, | ||
365 | .pkt_out = ip_vs_ftp_out, | ||
366 | .pkt_in = ip_vs_ftp_in, | ||
367 | }; | ||
368 | |||
369 | |||
370 | /* | ||
371 | * ip_vs_ftp initialization | ||
372 | */ | ||
373 | static int __init ip_vs_ftp_init(void) | ||
374 | { | ||
375 | int i, ret; | ||
376 | struct ip_vs_app *app = &ip_vs_ftp; | ||
377 | |||
378 | ret = register_ip_vs_app(app); | ||
379 | if (ret) | ||
380 | return ret; | ||
381 | |||
382 | for (i=0; i<IP_VS_APP_MAX_PORTS; i++) { | ||
383 | if (!ports[i]) | ||
384 | continue; | ||
385 | ret = register_ip_vs_app_inc(app, app->protocol, ports[i]); | ||
386 | if (ret) | ||
387 | break; | ||
388 | IP_VS_INFO("%s: loaded support on port[%d] = %d\n", | ||
389 | app->name, i, ports[i]); | ||
390 | } | ||
391 | |||
392 | if (ret) | ||
393 | unregister_ip_vs_app(app); | ||
394 | |||
395 | return ret; | ||
396 | } | ||
397 | |||
398 | |||
399 | /* | ||
400 | * ip_vs_ftp finish. | ||
401 | */ | ||
402 | static void __exit ip_vs_ftp_exit(void) | ||
403 | { | ||
404 | unregister_ip_vs_app(&ip_vs_ftp); | ||
405 | } | ||
406 | |||
407 | |||
408 | module_init(ip_vs_ftp_init); | ||
409 | module_exit(ip_vs_ftp_exit); | ||
410 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c deleted file mode 100644 index 6ecef3518cac..000000000000 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ /dev/null | |||
@@ -1,555 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Locality-Based Least-Connection scheduling module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@gnuchina.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Changes: | ||
12 | * Martin Hamilton : fixed the terrible locking bugs | ||
13 | * *lock(tbl->lock) ==> *lock(&tbl->lock) | ||
14 | * Wensong Zhang : fixed the uninitialized tbl->lock bug | ||
15 | * Wensong Zhang : added doing full expiration check to | ||
16 | * collect stale entries of 24+ hours when | ||
17 | * no partial expire check in a half hour | ||
18 | * Julian Anastasov : replaced del_timer call with del_timer_sync | ||
19 | * to avoid the possible race between timer | ||
20 | * handler and del_timer thread in SMP | ||
21 | * | ||
22 | */ | ||
23 | |||
24 | /* | ||
25 | * The lblc algorithm is as follows (pseudo code): | ||
26 | * | ||
27 | * if cachenode[dest_ip] is null then | ||
28 | * n, cachenode[dest_ip] <- {weighted least-conn node}; | ||
29 | * else | ||
30 | * n <- cachenode[dest_ip]; | ||
31 | * if (n is dead) OR | ||
32 | * (n.conns>n.weight AND | ||
33 | * there is a node m with m.conns<m.weight/2) then | ||
34 | * n, cachenode[dest_ip] <- {weighted least-conn node}; | ||
35 | * | ||
36 | * return n; | ||
37 | * | ||
38 | * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing | ||
39 | * me to write this module. | ||
40 | */ | ||
41 | |||
42 | #include <linux/ip.h> | ||
43 | #include <linux/module.h> | ||
44 | #include <linux/kernel.h> | ||
45 | #include <linux/skbuff.h> | ||
46 | #include <linux/jiffies.h> | ||
47 | |||
48 | /* for sysctl */ | ||
49 | #include <linux/fs.h> | ||
50 | #include <linux/sysctl.h> | ||
51 | |||
52 | #include <net/ip_vs.h> | ||
53 | |||
54 | |||
55 | /* | ||
56 | * It is for garbage collection of stale IPVS lblc entries, | ||
57 | * when the table is full. | ||
58 | */ | ||
59 | #define CHECK_EXPIRE_INTERVAL (60*HZ) | ||
60 | #define ENTRY_TIMEOUT (6*60*HZ) | ||
61 | |||
62 | /* | ||
63 | * It is for full expiration check. | ||
64 | * When there is no partial expiration check (garbage collection) | ||
65 | * in a half hour, do a full expiration check to collect stale | ||
66 | * entries that haven't been touched for a day. | ||
67 | */ | ||
68 | #define COUNT_FOR_FULL_EXPIRATION 30 | ||
69 | static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; | ||
70 | |||
71 | |||
72 | /* | ||
73 | * for IPVS lblc entry hash table | ||
74 | */ | ||
75 | #ifndef CONFIG_IP_VS_LBLC_TAB_BITS | ||
76 | #define CONFIG_IP_VS_LBLC_TAB_BITS 10 | ||
77 | #endif | ||
78 | #define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS | ||
79 | #define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) | ||
80 | #define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) | ||
81 | |||
82 | |||
83 | /* | ||
84 | * IPVS lblc entry represents an association between destination | ||
85 | * IP address and its destination server | ||
86 | */ | ||
/*
 * One cache entry: maps a destination IP address to the real server
 * last chosen for it.  The entry holds a reference on @dest (taken in
 * ip_vs_lblc_new(), dropped in ip_vs_lblc_free()).
 */
struct ip_vs_lblc_entry {
	struct list_head	list;		/* link in a hash bucket */
	__be32			addr;		/* destination IP address */
	struct ip_vs_dest	*dest;		/* real server (cache) */
	unsigned long		lastuse;	/* last used time (jiffies) */
};
93 | |||
94 | |||
95 | /* | ||
96 | * IPVS lblc hash table | ||
97 | */ | ||
/*
 * Per-service LBLC state, stored in svc->sched_data: the hash table of
 * cache entries plus the bookkeeping used by the periodic expiry timer.
 */
struct ip_vs_lblc_table {
	struct list_head	bucket[IP_VS_LBLC_TAB_SIZE];	/* hash bucket */
	atomic_t		entries;	/* number of entries */
	int			max_size;	/* maximum size of entries */
	struct timer_list	periodic_timer;	/* collect stale entries */
	int			rover;		/* rover for expire check */
	int			counter;	/* counter for no expire */
};
106 | |||
107 | |||
108 | /* | ||
109 | * IPVS LBLC sysctl table | ||
110 | */ | ||
111 | |||
/*
 * Single tunable: "lblc_expiration", backed by
 * sysctl_ip_vs_lblc_expiration and handled by proc_dointvec_jiffies.
 */
static ctl_table vs_vars_table[] = {
	{
		.procname	= "lblc_expiration",
		.data		= &sysctl_ip_vs_lblc_expiration,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	/* presumably the sentinel entry that terminates the table */
	{ .ctl_name = 0 }
};
122 | |||
/* Handle returned by register_sysctl_paths(); needed to unregister. */
static struct ctl_table_header * sysctl_header;
124 | |||
125 | static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) | ||
126 | { | ||
127 | list_del(&en->list); | ||
128 | /* | ||
129 | * We don't kfree dest because it is refered either by its service | ||
130 | * or the trash dest list. | ||
131 | */ | ||
132 | atomic_dec(&en->dest->refcnt); | ||
133 | kfree(en); | ||
134 | } | ||
135 | |||
136 | |||
137 | /* | ||
138 | * Returns hash value for IPVS LBLC entry | ||
139 | */ | ||
140 | static inline unsigned ip_vs_lblc_hashkey(__be32 addr) | ||
141 | { | ||
142 | return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; | ||
143 | } | ||
144 | |||
145 | |||
146 | /* | ||
147 | * Hash an entry in the ip_vs_lblc_table. | ||
148 | * returns bool success. | ||
149 | */ | ||
150 | static void | ||
151 | ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) | ||
152 | { | ||
153 | unsigned hash = ip_vs_lblc_hashkey(en->addr); | ||
154 | |||
155 | list_add(&en->list, &tbl->bucket[hash]); | ||
156 | atomic_inc(&tbl->entries); | ||
157 | } | ||
158 | |||
159 | |||
160 | /* | ||
161 | * Get ip_vs_lblc_entry associated with supplied parameters. Called under read | ||
162 | * lock | ||
163 | */ | ||
164 | static inline struct ip_vs_lblc_entry * | ||
165 | ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) | ||
166 | { | ||
167 | unsigned hash = ip_vs_lblc_hashkey(addr); | ||
168 | struct ip_vs_lblc_entry *en; | ||
169 | |||
170 | list_for_each_entry(en, &tbl->bucket[hash], list) | ||
171 | if (en->addr == addr) | ||
172 | return en; | ||
173 | |||
174 | return NULL; | ||
175 | } | ||
176 | |||
177 | |||
178 | /* | ||
179 | * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP | ||
180 | * address to a server. Called under write lock. | ||
181 | */ | ||
182 | static inline struct ip_vs_lblc_entry * | ||
183 | ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr, | ||
184 | struct ip_vs_dest *dest) | ||
185 | { | ||
186 | struct ip_vs_lblc_entry *en; | ||
187 | |||
188 | en = ip_vs_lblc_get(tbl, daddr); | ||
189 | if (!en) { | ||
190 | en = kmalloc(sizeof(*en), GFP_ATOMIC); | ||
191 | if (!en) { | ||
192 | IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); | ||
193 | return NULL; | ||
194 | } | ||
195 | |||
196 | en->addr = daddr; | ||
197 | en->lastuse = jiffies; | ||
198 | |||
199 | atomic_inc(&dest->refcnt); | ||
200 | en->dest = dest; | ||
201 | |||
202 | ip_vs_lblc_hash(tbl, en); | ||
203 | } else if (en->dest != dest) { | ||
204 | atomic_dec(&en->dest->refcnt); | ||
205 | atomic_inc(&dest->refcnt); | ||
206 | en->dest = dest; | ||
207 | } | ||
208 | |||
209 | return en; | ||
210 | } | ||
211 | |||
212 | |||
213 | /* | ||
214 | * Flush all the entries of the specified table. | ||
215 | */ | ||
216 | static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) | ||
217 | { | ||
218 | struct ip_vs_lblc_entry *en, *nxt; | ||
219 | int i; | ||
220 | |||
221 | for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { | ||
222 | list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { | ||
223 | ip_vs_lblc_free(en); | ||
224 | atomic_dec(&tbl->entries); | ||
225 | } | ||
226 | } | ||
227 | } | ||
228 | |||
229 | |||
230 | static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) | ||
231 | { | ||
232 | struct ip_vs_lblc_table *tbl = svc->sched_data; | ||
233 | struct ip_vs_lblc_entry *en, *nxt; | ||
234 | unsigned long now = jiffies; | ||
235 | int i, j; | ||
236 | |||
237 | for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { | ||
238 | j = (j + 1) & IP_VS_LBLC_TAB_MASK; | ||
239 | |||
240 | write_lock(&svc->sched_lock); | ||
241 | list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { | ||
242 | if (time_before(now, | ||
243 | en->lastuse + sysctl_ip_vs_lblc_expiration)) | ||
244 | continue; | ||
245 | |||
246 | ip_vs_lblc_free(en); | ||
247 | atomic_dec(&tbl->entries); | ||
248 | } | ||
249 | write_unlock(&svc->sched_lock); | ||
250 | } | ||
251 | tbl->rover = j; | ||
252 | } | ||
253 | |||
254 | |||
255 | /* | ||
256 | * Periodical timer handler for IPVS lblc table | ||
257 | * It is used to collect stale entries when the number of entries | ||
258 | * exceeds the maximum size of the table. | ||
259 | * | ||
260 | * Fixme: we probably need more complicated algorithm to collect | ||
261 | * entries that have not been used for a long time even | ||
262 | * if the number of entries doesn't exceed the maximum size | ||
263 | * of the table. | ||
264 | * The full expiration check is for this purpose now. | ||
265 | */ | ||
266 | static void ip_vs_lblc_check_expire(unsigned long data) | ||
267 | { | ||
268 | struct ip_vs_service *svc = (struct ip_vs_service *) data; | ||
269 | struct ip_vs_lblc_table *tbl = svc->sched_data; | ||
270 | unsigned long now = jiffies; | ||
271 | int goal; | ||
272 | int i, j; | ||
273 | struct ip_vs_lblc_entry *en, *nxt; | ||
274 | |||
275 | if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { | ||
276 | /* do full expiration check */ | ||
277 | ip_vs_lblc_full_check(svc); | ||
278 | tbl->counter = 1; | ||
279 | goto out; | ||
280 | } | ||
281 | |||
282 | if (atomic_read(&tbl->entries) <= tbl->max_size) { | ||
283 | tbl->counter++; | ||
284 | goto out; | ||
285 | } | ||
286 | |||
287 | goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; | ||
288 | if (goal > tbl->max_size/2) | ||
289 | goal = tbl->max_size/2; | ||
290 | |||
291 | for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { | ||
292 | j = (j + 1) & IP_VS_LBLC_TAB_MASK; | ||
293 | |||
294 | write_lock(&svc->sched_lock); | ||
295 | list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { | ||
296 | if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) | ||
297 | continue; | ||
298 | |||
299 | ip_vs_lblc_free(en); | ||
300 | atomic_dec(&tbl->entries); | ||
301 | goal--; | ||
302 | } | ||
303 | write_unlock(&svc->sched_lock); | ||
304 | if (goal <= 0) | ||
305 | break; | ||
306 | } | ||
307 | tbl->rover = j; | ||
308 | |||
309 | out: | ||
310 | mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); | ||
311 | } | ||
312 | |||
313 | |||
314 | static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) | ||
315 | { | ||
316 | int i; | ||
317 | struct ip_vs_lblc_table *tbl; | ||
318 | |||
319 | /* | ||
320 | * Allocate the ip_vs_lblc_table for this service | ||
321 | */ | ||
322 | tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); | ||
323 | if (tbl == NULL) { | ||
324 | IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); | ||
325 | return -ENOMEM; | ||
326 | } | ||
327 | svc->sched_data = tbl; | ||
328 | IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " | ||
329 | "current service\n", sizeof(*tbl)); | ||
330 | |||
331 | /* | ||
332 | * Initialize the hash buckets | ||
333 | */ | ||
334 | for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { | ||
335 | INIT_LIST_HEAD(&tbl->bucket[i]); | ||
336 | } | ||
337 | tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; | ||
338 | tbl->rover = 0; | ||
339 | tbl->counter = 1; | ||
340 | |||
341 | /* | ||
342 | * Hook periodic timer for garbage collection | ||
343 | */ | ||
344 | setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, | ||
345 | (unsigned long)svc); | ||
346 | mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); | ||
347 | |||
348 | return 0; | ||
349 | } | ||
350 | |||
351 | |||
352 | static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) | ||
353 | { | ||
354 | struct ip_vs_lblc_table *tbl = svc->sched_data; | ||
355 | |||
356 | /* remove periodic timer */ | ||
357 | del_timer_sync(&tbl->periodic_timer); | ||
358 | |||
359 | /* got to clean up table entries here */ | ||
360 | ip_vs_lblc_flush(tbl); | ||
361 | |||
362 | /* release the table itself */ | ||
363 | kfree(tbl); | ||
364 | IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", | ||
365 | sizeof(*tbl)); | ||
366 | |||
367 | return 0; | ||
368 | } | ||
369 | |||
370 | |||
371 | static inline struct ip_vs_dest * | ||
372 | __ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph) | ||
373 | { | ||
374 | struct ip_vs_dest *dest, *least; | ||
375 | int loh, doh; | ||
376 | |||
377 | /* | ||
378 | * We think the overhead of processing active connections is fifty | ||
379 | * times higher than that of inactive connections in average. (This | ||
380 | * fifty times might not be accurate, we will change it later.) We | ||
381 | * use the following formula to estimate the overhead: | ||
382 | * dest->activeconns*50 + dest->inactconns | ||
383 | * and the load: | ||
384 | * (dest overhead) / dest->weight | ||
385 | * | ||
386 | * Remember -- no floats in kernel mode!!! | ||
387 | * The comparison of h1*w2 > h2*w1 is equivalent to that of | ||
388 | * h1/w1 > h2/w2 | ||
389 | * if every weight is larger than zero. | ||
390 | * | ||
391 | * The server with weight=0 is quiesced and will not receive any | ||
392 | * new connection. | ||
393 | */ | ||
394 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
395 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) | ||
396 | continue; | ||
397 | if (atomic_read(&dest->weight) > 0) { | ||
398 | least = dest; | ||
399 | loh = atomic_read(&least->activeconns) * 50 | ||
400 | + atomic_read(&least->inactconns); | ||
401 | goto nextstage; | ||
402 | } | ||
403 | } | ||
404 | return NULL; | ||
405 | |||
406 | /* | ||
407 | * Find the destination with the least load. | ||
408 | */ | ||
409 | nextstage: | ||
410 | list_for_each_entry_continue(dest, &svc->destinations, n_list) { | ||
411 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) | ||
412 | continue; | ||
413 | |||
414 | doh = atomic_read(&dest->activeconns) * 50 | ||
415 | + atomic_read(&dest->inactconns); | ||
416 | if (loh * atomic_read(&dest->weight) > | ||
417 | doh * atomic_read(&least->weight)) { | ||
418 | least = dest; | ||
419 | loh = doh; | ||
420 | } | ||
421 | } | ||
422 | |||
423 | IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " | ||
424 | "activeconns %d refcnt %d weight %d overhead %d\n", | ||
425 | NIPQUAD(least->addr.ip), ntohs(least->port), | ||
426 | atomic_read(&least->activeconns), | ||
427 | atomic_read(&least->refcnt), | ||
428 | atomic_read(&least->weight), loh); | ||
429 | |||
430 | return least; | ||
431 | } | ||
432 | |||
433 | |||
434 | /* | ||
435 | * If this destination server is overloaded and there is a less loaded | ||
436 | * server, then return true. | ||
437 | */ | ||
438 | static inline int | ||
439 | is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) | ||
440 | { | ||
441 | if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { | ||
442 | struct ip_vs_dest *d; | ||
443 | |||
444 | list_for_each_entry(d, &svc->destinations, n_list) { | ||
445 | if (atomic_read(&d->activeconns)*2 | ||
446 | < atomic_read(&d->weight)) { | ||
447 | return 1; | ||
448 | } | ||
449 | } | ||
450 | } | ||
451 | return 0; | ||
452 | } | ||
453 | |||
454 | |||
455 | /* | ||
456 | * Locality-Based (weighted) Least-Connection scheduling | ||
457 | */ | ||
458 | static struct ip_vs_dest * | ||
459 | ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
460 | { | ||
461 | struct ip_vs_lblc_table *tbl = svc->sched_data; | ||
462 | struct iphdr *iph = ip_hdr(skb); | ||
463 | struct ip_vs_dest *dest = NULL; | ||
464 | struct ip_vs_lblc_entry *en; | ||
465 | |||
466 | IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); | ||
467 | |||
468 | /* First look in our cache */ | ||
469 | read_lock(&svc->sched_lock); | ||
470 | en = ip_vs_lblc_get(tbl, iph->daddr); | ||
471 | if (en) { | ||
472 | /* We only hold a read lock, but this is atomic */ | ||
473 | en->lastuse = jiffies; | ||
474 | |||
475 | /* | ||
476 | * If the destination is not available, i.e. it's in the trash, | ||
477 | * we must ignore it, as it may be removed from under our feet, | ||
478 | * if someone drops our reference count. Our caller only makes | ||
479 | * sure that destinations, that are not in the trash, are not | ||
480 | * moved to the trash, while we are scheduling. But anyone can | ||
481 | * free up entries from the trash at any time. | ||
482 | */ | ||
483 | |||
484 | if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) | ||
485 | dest = en->dest; | ||
486 | } | ||
487 | read_unlock(&svc->sched_lock); | ||
488 | |||
489 | /* If the destination has a weight and is not overloaded, use it */ | ||
490 | if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) | ||
491 | goto out; | ||
492 | |||
493 | /* No cache entry or it is invalid, time to schedule */ | ||
494 | dest = __ip_vs_lblc_schedule(svc, iph); | ||
495 | if (!dest) { | ||
496 | IP_VS_DBG(1, "no destination available\n"); | ||
497 | return NULL; | ||
498 | } | ||
499 | |||
500 | /* If we fail to create a cache entry, we'll just use the valid dest */ | ||
501 | write_lock(&svc->sched_lock); | ||
502 | ip_vs_lblc_new(tbl, iph->daddr, dest); | ||
503 | write_unlock(&svc->sched_lock); | ||
504 | |||
505 | out: | ||
506 | IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " | ||
507 | "--> server %u.%u.%u.%u:%d\n", | ||
508 | NIPQUAD(iph->daddr), | ||
509 | NIPQUAD(dest->addr.ip), | ||
510 | ntohs(dest->port)); | ||
511 | |||
512 | return dest; | ||
513 | } | ||
514 | |||
515 | |||
516 | /* | ||
517 | * IPVS LBLC Scheduler structure | ||
518 | */ | ||
519 | static struct ip_vs_scheduler ip_vs_lblc_scheduler = | ||
520 | { | ||
521 | .name = "lblc", | ||
522 | .refcnt = ATOMIC_INIT(0), | ||
523 | .module = THIS_MODULE, | ||
524 | .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), | ||
525 | #ifdef CONFIG_IP_VS_IPV6 | ||
526 | .supports_ipv6 = 0, | ||
527 | #endif | ||
528 | .init_service = ip_vs_lblc_init_svc, | ||
529 | .done_service = ip_vs_lblc_done_svc, | ||
530 | .schedule = ip_vs_lblc_schedule, | ||
531 | }; | ||
532 | |||
533 | |||
534 | static int __init ip_vs_lblc_init(void) | ||
535 | { | ||
536 | int ret; | ||
537 | |||
538 | sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); | ||
539 | ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); | ||
540 | if (ret) | ||
541 | unregister_sysctl_table(sysctl_header); | ||
542 | return ret; | ||
543 | } | ||
544 | |||
545 | |||
546 | static void __exit ip_vs_lblc_cleanup(void) | ||
547 | { | ||
548 | unregister_sysctl_table(sysctl_header); | ||
549 | unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); | ||
550 | } | ||
551 | |||
552 | |||
553 | module_init(ip_vs_lblc_init); | ||
554 | module_exit(ip_vs_lblc_cleanup); | ||
555 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c deleted file mode 100644 index 1f75ea83bcf8..000000000000 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ /dev/null | |||
@@ -1,755 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Locality-Based Least-Connection with Replication scheduler | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@gnuchina.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Changes: | ||
12 | * Julian Anastasov : Added the missing (dest->weight>0) | ||
13 | * condition in the ip_vs_dest_set_max. | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | /* | ||
18 | * The lblc/r algorithm is as follows (pseudo code): | ||
19 | * | ||
20 | * if serverSet[dest_ip] is null then | ||
21 | * n, serverSet[dest_ip] <- {weighted least-conn node}; | ||
22 | * else | ||
23 | * n <- {least-conn (alive) node in serverSet[dest_ip]}; | ||
24 | * if (n is null) OR | ||
25 | * (n.conns>n.weight AND | ||
26 | * there is a node m with m.conns<m.weight/2) then | ||
27 | * n <- {weighted least-conn node}; | ||
28 | * add n to serverSet[dest_ip]; | ||
29 | * if |serverSet[dest_ip]| > 1 AND | ||
30 | * now - serverSet[dest_ip].lastMod > T then | ||
31 | * m <- {most conn node in serverSet[dest_ip]}; | ||
32 | * remove m from serverSet[dest_ip]; | ||
33 | * if serverSet[dest_ip] changed then | ||
34 | * serverSet[dest_ip].lastMod <- now; | ||
35 | * | ||
36 | * return n; | ||
37 | * | ||
38 | */ | ||
39 | |||
40 | #include <linux/ip.h> | ||
41 | #include <linux/module.h> | ||
42 | #include <linux/kernel.h> | ||
43 | #include <linux/skbuff.h> | ||
44 | #include <linux/jiffies.h> | ||
45 | |||
46 | /* for sysctl */ | ||
47 | #include <linux/fs.h> | ||
48 | #include <linux/sysctl.h> | ||
49 | #include <net/net_namespace.h> | ||
50 | |||
51 | #include <net/ip_vs.h> | ||
52 | |||
53 | |||
54 | /* | ||
55 | * It is for garbage collection of stale IPVS lblcr entries, | ||
56 | * when the table is full. | ||
57 | */ | ||
58 | #define CHECK_EXPIRE_INTERVAL (60*HZ) | ||
59 | #define ENTRY_TIMEOUT (6*60*HZ) | ||
60 | |||
61 | /* | ||
62 | * It is for full expiration check. | ||
63 | * When there is no partial expiration check (garbage collection) | ||
64 | * in a half hour, do a full expiration check to collect stale | ||
65 | * entries that haven't been touched for a day. | ||
66 | */ | ||
67 | #define COUNT_FOR_FULL_EXPIRATION 30 | ||
68 | static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; | ||
69 | |||
70 | |||
71 | /* | ||
72 | * for IPVS lblcr entry hash table | ||
73 | */ | ||
74 | #ifndef CONFIG_IP_VS_LBLCR_TAB_BITS | ||
75 | #define CONFIG_IP_VS_LBLCR_TAB_BITS 10 | ||
76 | #endif | ||
77 | #define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS | ||
78 | #define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) | ||
79 | #define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) | ||
80 | |||
81 | |||
/*
 *	Destination set: a singly linked list of servers.
 *	ip_vs_dest_list is one node of that list.
 */
struct ip_vs_dest_list {
	struct ip_vs_dest_list	*next;	/* next node, NULL at the tail */
	struct ip_vs_dest	*dest;	/* server this node refers to */
};
89 | |||
90 | struct ip_vs_dest_set { | ||
91 | atomic_t size; /* set size */ | ||
92 | unsigned long lastmod; /* last modified time */ | ||
93 | struct ip_vs_dest_list *list; /* destination list */ | ||
94 | rwlock_t lock; /* lock for this list */ | ||
95 | }; | ||
96 | |||
97 | |||
98 | static struct ip_vs_dest_list * | ||
99 | ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) | ||
100 | { | ||
101 | struct ip_vs_dest_list *e; | ||
102 | |||
103 | for (e=set->list; e!=NULL; e=e->next) { | ||
104 | if (e->dest == dest) | ||
105 | /* already existed */ | ||
106 | return NULL; | ||
107 | } | ||
108 | |||
109 | e = kmalloc(sizeof(*e), GFP_ATOMIC); | ||
110 | if (e == NULL) { | ||
111 | IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); | ||
112 | return NULL; | ||
113 | } | ||
114 | |||
115 | atomic_inc(&dest->refcnt); | ||
116 | e->dest = dest; | ||
117 | |||
118 | /* link it to the list */ | ||
119 | e->next = set->list; | ||
120 | set->list = e; | ||
121 | atomic_inc(&set->size); | ||
122 | |||
123 | set->lastmod = jiffies; | ||
124 | return e; | ||
125 | } | ||
126 | |||
127 | static void | ||
128 | ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) | ||
129 | { | ||
130 | struct ip_vs_dest_list *e, **ep; | ||
131 | |||
132 | for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { | ||
133 | if (e->dest == dest) { | ||
134 | /* HIT */ | ||
135 | *ep = e->next; | ||
136 | atomic_dec(&set->size); | ||
137 | set->lastmod = jiffies; | ||
138 | atomic_dec(&e->dest->refcnt); | ||
139 | kfree(e); | ||
140 | break; | ||
141 | } | ||
142 | ep = &e->next; | ||
143 | } | ||
144 | } | ||
145 | |||
146 | static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) | ||
147 | { | ||
148 | struct ip_vs_dest_list *e, **ep; | ||
149 | |||
150 | write_lock(&set->lock); | ||
151 | for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { | ||
152 | *ep = e->next; | ||
153 | /* | ||
154 | * We don't kfree dest because it is refered either | ||
155 | * by its service or by the trash dest list. | ||
156 | */ | ||
157 | atomic_dec(&e->dest->refcnt); | ||
158 | kfree(e); | ||
159 | } | ||
160 | write_unlock(&set->lock); | ||
161 | } | ||
162 | |||
163 | /* get weighted least-connection node in the destination set */ | ||
164 | static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) | ||
165 | { | ||
166 | register struct ip_vs_dest_list *e; | ||
167 | struct ip_vs_dest *dest, *least; | ||
168 | int loh, doh; | ||
169 | |||
170 | if (set == NULL) | ||
171 | return NULL; | ||
172 | |||
173 | /* select the first destination server, whose weight > 0 */ | ||
174 | for (e=set->list; e!=NULL; e=e->next) { | ||
175 | least = e->dest; | ||
176 | if (least->flags & IP_VS_DEST_F_OVERLOAD) | ||
177 | continue; | ||
178 | |||
179 | if ((atomic_read(&least->weight) > 0) | ||
180 | && (least->flags & IP_VS_DEST_F_AVAILABLE)) { | ||
181 | loh = atomic_read(&least->activeconns) * 50 | ||
182 | + atomic_read(&least->inactconns); | ||
183 | goto nextstage; | ||
184 | } | ||
185 | } | ||
186 | return NULL; | ||
187 | |||
188 | /* find the destination with the weighted least load */ | ||
189 | nextstage: | ||
190 | for (e=e->next; e!=NULL; e=e->next) { | ||
191 | dest = e->dest; | ||
192 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) | ||
193 | continue; | ||
194 | |||
195 | doh = atomic_read(&dest->activeconns) * 50 | ||
196 | + atomic_read(&dest->inactconns); | ||
197 | if ((loh * atomic_read(&dest->weight) > | ||
198 | doh * atomic_read(&least->weight)) | ||
199 | && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { | ||
200 | least = dest; | ||
201 | loh = doh; | ||
202 | } | ||
203 | } | ||
204 | |||
205 | IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " | ||
206 | "activeconns %d refcnt %d weight %d overhead %d\n", | ||
207 | NIPQUAD(least->addr.ip), ntohs(least->port), | ||
208 | atomic_read(&least->activeconns), | ||
209 | atomic_read(&least->refcnt), | ||
210 | atomic_read(&least->weight), loh); | ||
211 | return least; | ||
212 | } | ||
213 | |||
214 | |||
215 | /* get weighted most-connection node in the destination set */ | ||
216 | static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) | ||
217 | { | ||
218 | register struct ip_vs_dest_list *e; | ||
219 | struct ip_vs_dest *dest, *most; | ||
220 | int moh, doh; | ||
221 | |||
222 | if (set == NULL) | ||
223 | return NULL; | ||
224 | |||
225 | /* select the first destination server, whose weight > 0 */ | ||
226 | for (e=set->list; e!=NULL; e=e->next) { | ||
227 | most = e->dest; | ||
228 | if (atomic_read(&most->weight) > 0) { | ||
229 | moh = atomic_read(&most->activeconns) * 50 | ||
230 | + atomic_read(&most->inactconns); | ||
231 | goto nextstage; | ||
232 | } | ||
233 | } | ||
234 | return NULL; | ||
235 | |||
236 | /* find the destination with the weighted most load */ | ||
237 | nextstage: | ||
238 | for (e=e->next; e!=NULL; e=e->next) { | ||
239 | dest = e->dest; | ||
240 | doh = atomic_read(&dest->activeconns) * 50 | ||
241 | + atomic_read(&dest->inactconns); | ||
242 | /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ | ||
243 | if ((moh * atomic_read(&dest->weight) < | ||
244 | doh * atomic_read(&most->weight)) | ||
245 | && (atomic_read(&dest->weight) > 0)) { | ||
246 | most = dest; | ||
247 | moh = doh; | ||
248 | } | ||
249 | } | ||
250 | |||
251 | IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " | ||
252 | "activeconns %d refcnt %d weight %d overhead %d\n", | ||
253 | NIPQUAD(most->addr.ip), ntohs(most->port), | ||
254 | atomic_read(&most->activeconns), | ||
255 | atomic_read(&most->refcnt), | ||
256 | atomic_read(&most->weight), moh); | ||
257 | return most; | ||
258 | } | ||
259 | |||
260 | |||
261 | /* | ||
262 | * IPVS lblcr entry represents an association between destination | ||
263 | * IP address and its destination server set | ||
264 | */ | ||
265 | struct ip_vs_lblcr_entry { | ||
266 | struct list_head list; | ||
267 | __be32 addr; /* destination IP address */ | ||
268 | struct ip_vs_dest_set set; /* destination server set */ | ||
269 | unsigned long lastuse; /* last used time */ | ||
270 | }; | ||
271 | |||
272 | |||
273 | /* | ||
274 | * IPVS lblcr hash table | ||
275 | */ | ||
276 | struct ip_vs_lblcr_table { | ||
277 | struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ | ||
278 | atomic_t entries; /* number of entries */ | ||
279 | int max_size; /* maximum size of entries */ | ||
280 | struct timer_list periodic_timer; /* collect stale entries */ | ||
281 | int rover; /* rover for expire check */ | ||
282 | int counter; /* counter for no expire */ | ||
283 | }; | ||
284 | |||
285 | |||
286 | /* | ||
287 | * IPVS LBLCR sysctl table | ||
288 | */ | ||
289 | |||
290 | static ctl_table vs_vars_table[] = { | ||
291 | { | ||
292 | .procname = "lblcr_expiration", | ||
293 | .data = &sysctl_ip_vs_lblcr_expiration, | ||
294 | .maxlen = sizeof(int), | ||
295 | .mode = 0644, | ||
296 | .proc_handler = &proc_dointvec_jiffies, | ||
297 | }, | ||
298 | { .ctl_name = 0 } | ||
299 | }; | ||
300 | |||
301 | static struct ctl_table_header * sysctl_header; | ||
302 | |||
303 | static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) | ||
304 | { | ||
305 | list_del(&en->list); | ||
306 | ip_vs_dest_set_eraseall(&en->set); | ||
307 | kfree(en); | ||
308 | } | ||
309 | |||
310 | |||
311 | /* | ||
312 | * Returns hash value for IPVS LBLCR entry | ||
313 | */ | ||
314 | static inline unsigned ip_vs_lblcr_hashkey(__be32 addr) | ||
315 | { | ||
316 | return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; | ||
317 | } | ||
318 | |||
319 | |||
320 | /* | ||
321 | * Hash an entry in the ip_vs_lblcr_table. | ||
322 | * returns bool success. | ||
323 | */ | ||
324 | static void | ||
325 | ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) | ||
326 | { | ||
327 | unsigned hash = ip_vs_lblcr_hashkey(en->addr); | ||
328 | |||
329 | list_add(&en->list, &tbl->bucket[hash]); | ||
330 | atomic_inc(&tbl->entries); | ||
331 | } | ||
332 | |||
333 | |||
334 | /* | ||
335 | * Get ip_vs_lblcr_entry associated with supplied parameters. Called under | ||
336 | * read lock. | ||
337 | */ | ||
338 | static inline struct ip_vs_lblcr_entry * | ||
339 | ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr) | ||
340 | { | ||
341 | unsigned hash = ip_vs_lblcr_hashkey(addr); | ||
342 | struct ip_vs_lblcr_entry *en; | ||
343 | |||
344 | list_for_each_entry(en, &tbl->bucket[hash], list) | ||
345 | if (en->addr == addr) | ||
346 | return en; | ||
347 | |||
348 | return NULL; | ||
349 | } | ||
350 | |||
351 | |||
352 | /* | ||
353 | * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination | ||
354 | * IP address to a server. Called under write lock. | ||
355 | */ | ||
356 | static inline struct ip_vs_lblcr_entry * | ||
357 | ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr, | ||
358 | struct ip_vs_dest *dest) | ||
359 | { | ||
360 | struct ip_vs_lblcr_entry *en; | ||
361 | |||
362 | en = ip_vs_lblcr_get(tbl, daddr); | ||
363 | if (!en) { | ||
364 | en = kmalloc(sizeof(*en), GFP_ATOMIC); | ||
365 | if (!en) { | ||
366 | IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); | ||
367 | return NULL; | ||
368 | } | ||
369 | |||
370 | en->addr = daddr; | ||
371 | en->lastuse = jiffies; | ||
372 | |||
373 | /* initilize its dest set */ | ||
374 | atomic_set(&(en->set.size), 0); | ||
375 | en->set.list = NULL; | ||
376 | rwlock_init(&en->set.lock); | ||
377 | |||
378 | ip_vs_lblcr_hash(tbl, en); | ||
379 | } | ||
380 | |||
381 | write_lock(&en->set.lock); | ||
382 | ip_vs_dest_set_insert(&en->set, dest); | ||
383 | write_unlock(&en->set.lock); | ||
384 | |||
385 | return en; | ||
386 | } | ||
387 | |||
388 | |||
389 | /* | ||
390 | * Flush all the entries of the specified table. | ||
391 | */ | ||
392 | static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) | ||
393 | { | ||
394 | int i; | ||
395 | struct ip_vs_lblcr_entry *en, *nxt; | ||
396 | |||
397 | /* No locking required, only called during cleanup. */ | ||
398 | for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { | ||
399 | list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { | ||
400 | ip_vs_lblcr_free(en); | ||
401 | } | ||
402 | } | ||
403 | } | ||
404 | |||
405 | |||
406 | static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) | ||
407 | { | ||
408 | struct ip_vs_lblcr_table *tbl = svc->sched_data; | ||
409 | unsigned long now = jiffies; | ||
410 | int i, j; | ||
411 | struct ip_vs_lblcr_entry *en, *nxt; | ||
412 | |||
413 | for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { | ||
414 | j = (j + 1) & IP_VS_LBLCR_TAB_MASK; | ||
415 | |||
416 | write_lock(&svc->sched_lock); | ||
417 | list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { | ||
418 | if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, | ||
419 | now)) | ||
420 | continue; | ||
421 | |||
422 | ip_vs_lblcr_free(en); | ||
423 | atomic_dec(&tbl->entries); | ||
424 | } | ||
425 | write_unlock(&svc->sched_lock); | ||
426 | } | ||
427 | tbl->rover = j; | ||
428 | } | ||
429 | |||
430 | |||
431 | /* | ||
432 | * Periodical timer handler for IPVS lblcr table | ||
433 | * It is used to collect stale entries when the number of entries | ||
434 | * exceeds the maximum size of the table. | ||
435 | * | ||
436 | * Fixme: we probably need more complicated algorithm to collect | ||
437 | * entries that have not been used for a long time even | ||
438 | * if the number of entries doesn't exceed the maximum size | ||
439 | * of the table. | ||
440 | * The full expiration check is for this purpose now. | ||
441 | */ | ||
442 | static void ip_vs_lblcr_check_expire(unsigned long data) | ||
443 | { | ||
444 | struct ip_vs_service *svc = (struct ip_vs_service *) data; | ||
445 | struct ip_vs_lblcr_table *tbl = svc->sched_data; | ||
446 | unsigned long now = jiffies; | ||
447 | int goal; | ||
448 | int i, j; | ||
449 | struct ip_vs_lblcr_entry *en, *nxt; | ||
450 | |||
451 | if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { | ||
452 | /* do full expiration check */ | ||
453 | ip_vs_lblcr_full_check(svc); | ||
454 | tbl->counter = 1; | ||
455 | goto out; | ||
456 | } | ||
457 | |||
458 | if (atomic_read(&tbl->entries) <= tbl->max_size) { | ||
459 | tbl->counter++; | ||
460 | goto out; | ||
461 | } | ||
462 | |||
463 | goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; | ||
464 | if (goal > tbl->max_size/2) | ||
465 | goal = tbl->max_size/2; | ||
466 | |||
467 | for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { | ||
468 | j = (j + 1) & IP_VS_LBLCR_TAB_MASK; | ||
469 | |||
470 | write_lock(&svc->sched_lock); | ||
471 | list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { | ||
472 | if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) | ||
473 | continue; | ||
474 | |||
475 | ip_vs_lblcr_free(en); | ||
476 | atomic_dec(&tbl->entries); | ||
477 | goal--; | ||
478 | } | ||
479 | write_unlock(&svc->sched_lock); | ||
480 | if (goal <= 0) | ||
481 | break; | ||
482 | } | ||
483 | tbl->rover = j; | ||
484 | |||
485 | out: | ||
486 | mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); | ||
487 | } | ||
488 | |||
489 | static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) | ||
490 | { | ||
491 | int i; | ||
492 | struct ip_vs_lblcr_table *tbl; | ||
493 | |||
494 | /* | ||
495 | * Allocate the ip_vs_lblcr_table for this service | ||
496 | */ | ||
497 | tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); | ||
498 | if (tbl == NULL) { | ||
499 | IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); | ||
500 | return -ENOMEM; | ||
501 | } | ||
502 | svc->sched_data = tbl; | ||
503 | IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " | ||
504 | "current service\n", sizeof(*tbl)); | ||
505 | |||
506 | /* | ||
507 | * Initialize the hash buckets | ||
508 | */ | ||
509 | for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { | ||
510 | INIT_LIST_HEAD(&tbl->bucket[i]); | ||
511 | } | ||
512 | tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; | ||
513 | tbl->rover = 0; | ||
514 | tbl->counter = 1; | ||
515 | |||
516 | /* | ||
517 | * Hook periodic timer for garbage collection | ||
518 | */ | ||
519 | setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, | ||
520 | (unsigned long)svc); | ||
521 | mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); | ||
522 | |||
523 | return 0; | ||
524 | } | ||
525 | |||
526 | |||
527 | static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) | ||
528 | { | ||
529 | struct ip_vs_lblcr_table *tbl = svc->sched_data; | ||
530 | |||
531 | /* remove periodic timer */ | ||
532 | del_timer_sync(&tbl->periodic_timer); | ||
533 | |||
534 | /* got to clean up table entries here */ | ||
535 | ip_vs_lblcr_flush(tbl); | ||
536 | |||
537 | /* release the table itself */ | ||
538 | kfree(tbl); | ||
539 | IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", | ||
540 | sizeof(*tbl)); | ||
541 | |||
542 | return 0; | ||
543 | } | ||
544 | |||
545 | |||
546 | static inline struct ip_vs_dest * | ||
547 | __ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph) | ||
548 | { | ||
549 | struct ip_vs_dest *dest, *least; | ||
550 | int loh, doh; | ||
551 | |||
552 | /* | ||
553 | * We think the overhead of processing active connections is fifty | ||
554 | * times higher than that of inactive connections in average. (This | ||
555 | * fifty times might not be accurate, we will change it later.) We | ||
556 | * use the following formula to estimate the overhead: | ||
557 | * dest->activeconns*50 + dest->inactconns | ||
558 | * and the load: | ||
559 | * (dest overhead) / dest->weight | ||
560 | * | ||
561 | * Remember -- no floats in kernel mode!!! | ||
562 | * The comparison of h1*w2 > h2*w1 is equivalent to that of | ||
563 | * h1/w1 > h2/w2 | ||
564 | * if every weight is larger than zero. | ||
565 | * | ||
566 | * The server with weight=0 is quiesced and will not receive any | ||
567 | * new connection. | ||
568 | */ | ||
569 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
570 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) | ||
571 | continue; | ||
572 | |||
573 | if (atomic_read(&dest->weight) > 0) { | ||
574 | least = dest; | ||
575 | loh = atomic_read(&least->activeconns) * 50 | ||
576 | + atomic_read(&least->inactconns); | ||
577 | goto nextstage; | ||
578 | } | ||
579 | } | ||
580 | return NULL; | ||
581 | |||
582 | /* | ||
583 | * Find the destination with the least load. | ||
584 | */ | ||
585 | nextstage: | ||
586 | list_for_each_entry_continue(dest, &svc->destinations, n_list) { | ||
587 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) | ||
588 | continue; | ||
589 | |||
590 | doh = atomic_read(&dest->activeconns) * 50 | ||
591 | + atomic_read(&dest->inactconns); | ||
592 | if (loh * atomic_read(&dest->weight) > | ||
593 | doh * atomic_read(&least->weight)) { | ||
594 | least = dest; | ||
595 | loh = doh; | ||
596 | } | ||
597 | } | ||
598 | |||
599 | IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " | ||
600 | "activeconns %d refcnt %d weight %d overhead %d\n", | ||
601 | NIPQUAD(least->addr.ip), ntohs(least->port), | ||
602 | atomic_read(&least->activeconns), | ||
603 | atomic_read(&least->refcnt), | ||
604 | atomic_read(&least->weight), loh); | ||
605 | |||
606 | return least; | ||
607 | } | ||
608 | |||
609 | |||
610 | /* | ||
611 | * If this destination server is overloaded and there is a less loaded | ||
612 | * server, then return true. | ||
613 | */ | ||
614 | static inline int | ||
615 | is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) | ||
616 | { | ||
617 | if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { | ||
618 | struct ip_vs_dest *d; | ||
619 | |||
620 | list_for_each_entry(d, &svc->destinations, n_list) { | ||
621 | if (atomic_read(&d->activeconns)*2 | ||
622 | < atomic_read(&d->weight)) { | ||
623 | return 1; | ||
624 | } | ||
625 | } | ||
626 | } | ||
627 | return 0; | ||
628 | } | ||
629 | |||
630 | |||
631 | /* | ||
632 | * Locality-Based (weighted) Least-Connection scheduling | ||
633 | */ | ||
634 | static struct ip_vs_dest * | ||
635 | ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
636 | { | ||
637 | struct ip_vs_lblcr_table *tbl = svc->sched_data; | ||
638 | struct iphdr *iph = ip_hdr(skb); | ||
639 | struct ip_vs_dest *dest = NULL; | ||
640 | struct ip_vs_lblcr_entry *en; | ||
641 | |||
642 | IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); | ||
643 | |||
644 | /* First look in our cache */ | ||
645 | read_lock(&svc->sched_lock); | ||
646 | en = ip_vs_lblcr_get(tbl, iph->daddr); | ||
647 | if (en) { | ||
648 | /* We only hold a read lock, but this is atomic */ | ||
649 | en->lastuse = jiffies; | ||
650 | |||
651 | /* Get the least loaded destination */ | ||
652 | read_lock(&en->set.lock); | ||
653 | dest = ip_vs_dest_set_min(&en->set); | ||
654 | read_unlock(&en->set.lock); | ||
655 | |||
656 | /* More than one destination + enough time passed by, cleanup */ | ||
657 | if (atomic_read(&en->set.size) > 1 && | ||
658 | time_after(jiffies, en->set.lastmod + | ||
659 | sysctl_ip_vs_lblcr_expiration)) { | ||
660 | struct ip_vs_dest *m; | ||
661 | |||
662 | write_lock(&en->set.lock); | ||
663 | m = ip_vs_dest_set_max(&en->set); | ||
664 | if (m) | ||
665 | ip_vs_dest_set_erase(&en->set, m); | ||
666 | write_unlock(&en->set.lock); | ||
667 | } | ||
668 | |||
669 | /* If the destination is not overloaded, use it */ | ||
670 | if (dest && !is_overloaded(dest, svc)) { | ||
671 | read_unlock(&svc->sched_lock); | ||
672 | goto out; | ||
673 | } | ||
674 | |||
675 | /* The cache entry is invalid, time to schedule */ | ||
676 | dest = __ip_vs_lblcr_schedule(svc, iph); | ||
677 | if (!dest) { | ||
678 | IP_VS_DBG(1, "no destination available\n"); | ||
679 | read_unlock(&svc->sched_lock); | ||
680 | return NULL; | ||
681 | } | ||
682 | |||
683 | /* Update our cache entry */ | ||
684 | write_lock(&en->set.lock); | ||
685 | ip_vs_dest_set_insert(&en->set, dest); | ||
686 | write_unlock(&en->set.lock); | ||
687 | } | ||
688 | read_unlock(&svc->sched_lock); | ||
689 | |||
690 | if (dest) | ||
691 | goto out; | ||
692 | |||
693 | /* No cache entry, time to schedule */ | ||
694 | dest = __ip_vs_lblcr_schedule(svc, iph); | ||
695 | if (!dest) { | ||
696 | IP_VS_DBG(1, "no destination available\n"); | ||
697 | return NULL; | ||
698 | } | ||
699 | |||
700 | /* If we fail to create a cache entry, we'll just use the valid dest */ | ||
701 | write_lock(&svc->sched_lock); | ||
702 | ip_vs_lblcr_new(tbl, iph->daddr, dest); | ||
703 | write_unlock(&svc->sched_lock); | ||
704 | |||
705 | out: | ||
706 | IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " | ||
707 | "--> server %u.%u.%u.%u:%d\n", | ||
708 | NIPQUAD(iph->daddr), | ||
709 | NIPQUAD(dest->addr.ip), | ||
710 | ntohs(dest->port)); | ||
711 | |||
712 | return dest; | ||
713 | } | ||
714 | |||
715 | |||
716 | /* | ||
717 | * IPVS LBLCR Scheduler structure | ||
718 | */ | ||
719 | static struct ip_vs_scheduler ip_vs_lblcr_scheduler = | ||
720 | { | ||
721 | .name = "lblcr", | ||
722 | .refcnt = ATOMIC_INIT(0), | ||
723 | .module = THIS_MODULE, | ||
724 | .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), | ||
725 | #ifdef CONFIG_IP_VS_IPV6 | ||
726 | .supports_ipv6 = 0, | ||
727 | #endif | ||
728 | .init_service = ip_vs_lblcr_init_svc, | ||
729 | .done_service = ip_vs_lblcr_done_svc, | ||
730 | .schedule = ip_vs_lblcr_schedule, | ||
731 | }; | ||
732 | |||
733 | |||
734 | static int __init ip_vs_lblcr_init(void) | ||
735 | { | ||
736 | int ret; | ||
737 | |||
738 | sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); | ||
739 | ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); | ||
740 | if (ret) | ||
741 | unregister_sysctl_table(sysctl_header); | ||
742 | return ret; | ||
743 | } | ||
744 | |||
745 | |||
746 | static void __exit ip_vs_lblcr_cleanup(void) | ||
747 | { | ||
748 | unregister_sysctl_table(sysctl_header); | ||
749 | unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); | ||
750 | } | ||
751 | |||
752 | |||
753 | module_init(ip_vs_lblcr_init); | ||
754 | module_exit(ip_vs_lblcr_cleanup); | ||
755 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c deleted file mode 100644 index b69f808ac461..000000000000 --- a/net/ipv4/ipvs/ip_vs_lc.c +++ /dev/null | |||
@@ -1,103 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Least-Connection Scheduling module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Changes: | ||
12 | * Wensong Zhang : added the ip_vs_lc_update_svc | ||
13 | * Wensong Zhang : added any dest with weight=0 is quiesced | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | #include <linux/module.h> | ||
18 | #include <linux/kernel.h> | ||
19 | |||
20 | #include <net/ip_vs.h> | ||
21 | |||
22 | |||
23 | static inline unsigned int | ||
24 | ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) | ||
25 | { | ||
26 | /* | ||
27 | * We think the overhead of processing active connections is 256 | ||
28 | * times higher than that of inactive connections in average. (This | ||
29 | * 256 times might not be accurate, we will change it later) We | ||
30 | * use the following formula to estimate the overhead now: | ||
31 | * dest->activeconns*256 + dest->inactconns | ||
32 | */ | ||
33 | return (atomic_read(&dest->activeconns) << 8) + | ||
34 | atomic_read(&dest->inactconns); | ||
35 | } | ||
36 | |||
37 | |||
38 | /* | ||
39 | * Least Connection scheduling | ||
40 | */ | ||
41 | static struct ip_vs_dest * | ||
42 | ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
43 | { | ||
44 | struct ip_vs_dest *dest, *least = NULL; | ||
45 | unsigned int loh = 0, doh; | ||
46 | |||
47 | IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n"); | ||
48 | |||
49 | /* | ||
50 | * Simply select the server with the least number of | ||
51 | * (activeconns<<5) + inactconns | ||
52 | * Except whose weight is equal to zero. | ||
53 | * If the weight is equal to zero, it means that the server is | ||
54 | * quiesced, the existing connections to the server still get | ||
55 | * served, but no new connection is assigned to the server. | ||
56 | */ | ||
57 | |||
58 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
59 | if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || | ||
60 | atomic_read(&dest->weight) == 0) | ||
61 | continue; | ||
62 | doh = ip_vs_lc_dest_overhead(dest); | ||
63 | if (!least || doh < loh) { | ||
64 | least = dest; | ||
65 | loh = doh; | ||
66 | } | ||
67 | } | ||
68 | |||
69 | if (least) | ||
70 | IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d inactconns %d\n", | ||
71 | IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), | ||
72 | atomic_read(&least->activeconns), | ||
73 | atomic_read(&least->inactconns)); | ||
74 | |||
75 | return least; | ||
76 | } | ||
77 | |||
78 | |||
79 | static struct ip_vs_scheduler ip_vs_lc_scheduler = { | ||
80 | .name = "lc", | ||
81 | .refcnt = ATOMIC_INIT(0), | ||
82 | .module = THIS_MODULE, | ||
83 | .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), | ||
84 | #ifdef CONFIG_IP_VS_IPV6 | ||
85 | .supports_ipv6 = 1, | ||
86 | #endif | ||
87 | .schedule = ip_vs_lc_schedule, | ||
88 | }; | ||
89 | |||
90 | |||
91 | static int __init ip_vs_lc_init(void) | ||
92 | { | ||
93 | return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; | ||
94 | } | ||
95 | |||
96 | static void __exit ip_vs_lc_cleanup(void) | ||
97 | { | ||
98 | unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); | ||
99 | } | ||
100 | |||
101 | module_init(ip_vs_lc_init); | ||
102 | module_exit(ip_vs_lc_cleanup); | ||
103 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c deleted file mode 100644 index 9a2d8033f08f..000000000000 --- a/net/ipv4/ipvs/ip_vs_nq.c +++ /dev/null | |||
@@ -1,138 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Never Queue scheduling module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Changes: | ||
12 | * | ||
13 | */ | ||
14 | |||
15 | /* | ||
16 | * The NQ algorithm adopts a two-speed model. When there is an idle server | ||
17 | * available, the job will be sent to the idle server, instead of waiting | ||
18 | * for a fast one. When there is no idle server available, the job will be | ||
19 | * sent to the server that minimize its expected delay (The Shortest | ||
20 | * Expected Delay scheduling algorithm). | ||
21 | * | ||
22 | * See the following paper for more information: | ||
23 | * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing | ||
24 | * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, | ||
25 | * pages 986-994, 1988. | ||
26 | * | ||
27 | * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me. | ||
28 | * | ||
29 | * The difference between NQ and SED is that NQ can improve overall | ||
30 | * system utilization. | ||
31 | * | ||
32 | */ | ||
33 | |||
34 | #include <linux/module.h> | ||
35 | #include <linux/kernel.h> | ||
36 | |||
37 | #include <net/ip_vs.h> | ||
38 | |||
39 | |||
40 | static inline unsigned int | ||
41 | ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) | ||
42 | { | ||
43 | /* | ||
44 | * We only use the active connection number in the cost | ||
45 | * calculation here. | ||
46 | */ | ||
47 | return atomic_read(&dest->activeconns) + 1; | ||
48 | } | ||
49 | |||
50 | |||
51 | /* | ||
52 | * Weighted Least Connection scheduling | ||
53 | */ | ||
54 | static struct ip_vs_dest * | ||
55 | ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
56 | { | ||
57 | struct ip_vs_dest *dest, *least = NULL; | ||
58 | unsigned int loh = 0, doh; | ||
59 | |||
60 | IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n"); | ||
61 | |||
62 | /* | ||
63 | * We calculate the load of each dest server as follows: | ||
64 | * (server expected overhead) / dest->weight | ||
65 | * | ||
66 | * Remember -- no floats in kernel mode!!! | ||
67 | * The comparison of h1*w2 > h2*w1 is equivalent to that of | ||
68 | * h1/w1 > h2/w2 | ||
69 | * if every weight is larger than zero. | ||
70 | * | ||
71 | * The server with weight=0 is quiesced and will not receive any | ||
72 | * new connections. | ||
73 | */ | ||
74 | |||
75 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
76 | |||
77 | if (dest->flags & IP_VS_DEST_F_OVERLOAD || | ||
78 | !atomic_read(&dest->weight)) | ||
79 | continue; | ||
80 | |||
81 | doh = ip_vs_nq_dest_overhead(dest); | ||
82 | |||
83 | /* return the server directly if it is idle */ | ||
84 | if (atomic_read(&dest->activeconns) == 0) { | ||
85 | least = dest; | ||
86 | loh = doh; | ||
87 | goto out; | ||
88 | } | ||
89 | |||
90 | if (!least || | ||
91 | (loh * atomic_read(&dest->weight) > | ||
92 | doh * atomic_read(&least->weight))) { | ||
93 | least = dest; | ||
94 | loh = doh; | ||
95 | } | ||
96 | } | ||
97 | |||
98 | if (!least) | ||
99 | return NULL; | ||
100 | |||
101 | out: | ||
102 | IP_VS_DBG_BUF(6, "NQ: server %s:%u " | ||
103 | "activeconns %d refcnt %d weight %d overhead %d\n", | ||
104 | IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), | ||
105 | atomic_read(&least->activeconns), | ||
106 | atomic_read(&least->refcnt), | ||
107 | atomic_read(&least->weight), loh); | ||
108 | |||
109 | return least; | ||
110 | } | ||
111 | |||
112 | |||
113 | static struct ip_vs_scheduler ip_vs_nq_scheduler = | ||
114 | { | ||
115 | .name = "nq", | ||
116 | .refcnt = ATOMIC_INIT(0), | ||
117 | .module = THIS_MODULE, | ||
118 | .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), | ||
119 | #ifdef CONFIG_IP_VS_IPV6 | ||
120 | .supports_ipv6 = 1, | ||
121 | #endif | ||
122 | .schedule = ip_vs_nq_schedule, | ||
123 | }; | ||
124 | |||
125 | |||
126 | static int __init ip_vs_nq_init(void) | ||
127 | { | ||
128 | return register_ip_vs_scheduler(&ip_vs_nq_scheduler); | ||
129 | } | ||
130 | |||
131 | static void __exit ip_vs_nq_cleanup(void) | ||
132 | { | ||
133 | unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); | ||
134 | } | ||
135 | |||
136 | module_init(ip_vs_nq_init); | ||
137 | module_exit(ip_vs_nq_cleanup); | ||
138 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c deleted file mode 100644 index 0791f9e08feb..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ /dev/null | |||
@@ -1,288 +0,0 @@ | |||
1 | /* | ||
2 | * ip_vs_proto.c: transport protocol load balancing support for IPVS | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * Julian Anastasov <ja@ssi.bg> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | * | ||
12 | * Changes: | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/module.h> | ||
17 | #include <linux/kernel.h> | ||
18 | #include <linux/skbuff.h> | ||
19 | #include <linux/in.h> | ||
20 | #include <linux/ip.h> | ||
21 | #include <net/protocol.h> | ||
22 | #include <net/tcp.h> | ||
23 | #include <net/udp.h> | ||
24 | #include <asm/system.h> | ||
25 | #include <linux/stat.h> | ||
26 | #include <linux/proc_fs.h> | ||
27 | |||
28 | #include <net/ip_vs.h> | ||
29 | |||
30 | |||
/*
 *	Table of registered IPVS protocols.
 *
 *	Protocols are only registered/unregistered while the ipvs module
 *	itself is being loaded/unloaded, so the table is accessed without
 *	taking any lock.
 */

/* Number of buckets; must be a power of 2 because the hash is a mask. */
#define IP_VS_PROTO_TAB_SIZE		32
/* Bucket index for a protocol number. */
#define IP_VS_PROTO_HASH(proto)		((proto) & (IP_VS_PROTO_TAB_SIZE-1))

static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
41 | |||
42 | |||
43 | /* | ||
44 | * register an ipvs protocol | ||
45 | */ | ||
46 | static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) | ||
47 | { | ||
48 | unsigned hash = IP_VS_PROTO_HASH(pp->protocol); | ||
49 | |||
50 | pp->next = ip_vs_proto_table[hash]; | ||
51 | ip_vs_proto_table[hash] = pp; | ||
52 | |||
53 | if (pp->init != NULL) | ||
54 | pp->init(pp); | ||
55 | |||
56 | return 0; | ||
57 | } | ||
58 | |||
59 | |||
60 | /* | ||
61 | * unregister an ipvs protocol | ||
62 | */ | ||
63 | static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) | ||
64 | { | ||
65 | struct ip_vs_protocol **pp_p; | ||
66 | unsigned hash = IP_VS_PROTO_HASH(pp->protocol); | ||
67 | |||
68 | pp_p = &ip_vs_proto_table[hash]; | ||
69 | for (; *pp_p; pp_p = &(*pp_p)->next) { | ||
70 | if (*pp_p == pp) { | ||
71 | *pp_p = pp->next; | ||
72 | if (pp->exit != NULL) | ||
73 | pp->exit(pp); | ||
74 | return 0; | ||
75 | } | ||
76 | } | ||
77 | |||
78 | return -ESRCH; | ||
79 | } | ||
80 | |||
81 | |||
82 | /* | ||
83 | * get ip_vs_protocol object by its proto. | ||
84 | */ | ||
85 | struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) | ||
86 | { | ||
87 | struct ip_vs_protocol *pp; | ||
88 | unsigned hash = IP_VS_PROTO_HASH(proto); | ||
89 | |||
90 | for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { | ||
91 | if (pp->protocol == proto) | ||
92 | return pp; | ||
93 | } | ||
94 | |||
95 | return NULL; | ||
96 | } | ||
97 | |||
98 | |||
99 | /* | ||
100 | * Propagate event for state change to all protocols | ||
101 | */ | ||
102 | void ip_vs_protocol_timeout_change(int flags) | ||
103 | { | ||
104 | struct ip_vs_protocol *pp; | ||
105 | int i; | ||
106 | |||
107 | for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { | ||
108 | for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) { | ||
109 | if (pp->timeout_change) | ||
110 | pp->timeout_change(pp, flags); | ||
111 | } | ||
112 | } | ||
113 | } | ||
114 | |||
115 | |||
116 | int * | ||
117 | ip_vs_create_timeout_table(int *table, int size) | ||
118 | { | ||
119 | return kmemdup(table, size, GFP_ATOMIC); | ||
120 | } | ||
121 | |||
122 | |||
123 | /* | ||
124 | * Set timeout value for state specified by name | ||
125 | */ | ||
126 | int | ||
127 | ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to) | ||
128 | { | ||
129 | int i; | ||
130 | |||
131 | if (!table || !name || !to) | ||
132 | return -EINVAL; | ||
133 | |||
134 | for (i = 0; i < num; i++) { | ||
135 | if (strcmp(names[i], name)) | ||
136 | continue; | ||
137 | table[i] = to * HZ; | ||
138 | return 0; | ||
139 | } | ||
140 | return -ENOENT; | ||
141 | } | ||
142 | |||
143 | |||
144 | const char * ip_vs_state_name(__u16 proto, int state) | ||
145 | { | ||
146 | struct ip_vs_protocol *pp = ip_vs_proto_get(proto); | ||
147 | |||
148 | if (pp == NULL || pp->state_name == NULL) | ||
149 | return (IPPROTO_IP == proto) ? "NONE" : "ERR!"; | ||
150 | return pp->state_name(state); | ||
151 | } | ||
152 | |||
153 | |||
154 | static void | ||
155 | ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, | ||
156 | const struct sk_buff *skb, | ||
157 | int offset, | ||
158 | const char *msg) | ||
159 | { | ||
160 | char buf[128]; | ||
161 | struct iphdr _iph, *ih; | ||
162 | |||
163 | ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); | ||
164 | if (ih == NULL) | ||
165 | sprintf(buf, "%s TRUNCATED", pp->name); | ||
166 | else if (ih->frag_off & htons(IP_OFFSET)) | ||
167 | sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", | ||
168 | pp->name, NIPQUAD(ih->saddr), | ||
169 | NIPQUAD(ih->daddr)); | ||
170 | else { | ||
171 | __be16 _ports[2], *pptr | ||
172 | ; | ||
173 | pptr = skb_header_pointer(skb, offset + ih->ihl*4, | ||
174 | sizeof(_ports), _ports); | ||
175 | if (pptr == NULL) | ||
176 | sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u", | ||
177 | pp->name, | ||
178 | NIPQUAD(ih->saddr), | ||
179 | NIPQUAD(ih->daddr)); | ||
180 | else | ||
181 | sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u", | ||
182 | pp->name, | ||
183 | NIPQUAD(ih->saddr), | ||
184 | ntohs(pptr[0]), | ||
185 | NIPQUAD(ih->daddr), | ||
186 | ntohs(pptr[1])); | ||
187 | } | ||
188 | |||
189 | printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); | ||
190 | } | ||
191 | |||
192 | #ifdef CONFIG_IP_VS_IPV6 | ||
193 | static void | ||
194 | ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, | ||
195 | const struct sk_buff *skb, | ||
196 | int offset, | ||
197 | const char *msg) | ||
198 | { | ||
199 | char buf[192]; | ||
200 | struct ipv6hdr _iph, *ih; | ||
201 | |||
202 | ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); | ||
203 | if (ih == NULL) | ||
204 | sprintf(buf, "%s TRUNCATED", pp->name); | ||
205 | else if (ih->nexthdr == IPPROTO_FRAGMENT) | ||
206 | sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag", | ||
207 | pp->name, NIP6(ih->saddr), | ||
208 | NIP6(ih->daddr)); | ||
209 | else { | ||
210 | __be16 _ports[2], *pptr; | ||
211 | |||
212 | pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), | ||
213 | sizeof(_ports), _ports); | ||
214 | if (pptr == NULL) | ||
215 | sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT, | ||
216 | pp->name, | ||
217 | NIP6(ih->saddr), | ||
218 | NIP6(ih->daddr)); | ||
219 | else | ||
220 | sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u", | ||
221 | pp->name, | ||
222 | NIP6(ih->saddr), | ||
223 | ntohs(pptr[0]), | ||
224 | NIP6(ih->daddr), | ||
225 | ntohs(pptr[1])); | ||
226 | } | ||
227 | |||
228 | printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); | ||
229 | } | ||
230 | #endif | ||
231 | |||
232 | |||
/*
 *	Debug-print a TCP/UDP packet, dispatching on skb->protocol to
 *	the IPv6 printer (when built with CONFIG_IP_VS_IPV6) or to the
 *	IPv4 printer otherwise.
 */
void
ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
			  const struct sk_buff *skb,
			  int offset,
			  const char *msg)
{
#ifdef CONFIG_IP_VS_IPV6
	if (skb->protocol == htons(ETH_P_IPV6)) {
		ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
		return;
	}
#endif
	ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
}
246 | |||
247 | |||
248 | int __init ip_vs_protocol_init(void) | ||
249 | { | ||
250 | char protocols[64]; | ||
251 | #define REGISTER_PROTOCOL(p) \ | ||
252 | do { \ | ||
253 | register_ip_vs_protocol(p); \ | ||
254 | strcat(protocols, ", "); \ | ||
255 | strcat(protocols, (p)->name); \ | ||
256 | } while (0) | ||
257 | |||
258 | protocols[0] = '\0'; | ||
259 | protocols[2] = '\0'; | ||
260 | #ifdef CONFIG_IP_VS_PROTO_TCP | ||
261 | REGISTER_PROTOCOL(&ip_vs_protocol_tcp); | ||
262 | #endif | ||
263 | #ifdef CONFIG_IP_VS_PROTO_UDP | ||
264 | REGISTER_PROTOCOL(&ip_vs_protocol_udp); | ||
265 | #endif | ||
266 | #ifdef CONFIG_IP_VS_PROTO_AH | ||
267 | REGISTER_PROTOCOL(&ip_vs_protocol_ah); | ||
268 | #endif | ||
269 | #ifdef CONFIG_IP_VS_PROTO_ESP | ||
270 | REGISTER_PROTOCOL(&ip_vs_protocol_esp); | ||
271 | #endif | ||
272 | IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]); | ||
273 | |||
274 | return 0; | ||
275 | } | ||
276 | |||
277 | |||
278 | void ip_vs_protocol_cleanup(void) | ||
279 | { | ||
280 | struct ip_vs_protocol *pp; | ||
281 | int i; | ||
282 | |||
283 | /* unregister all the ipvs protocols */ | ||
284 | for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { | ||
285 | while ((pp = ip_vs_proto_table[i]) != NULL) | ||
286 | unregister_ip_vs_protocol(pp); | ||
287 | } | ||
288 | } | ||
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c deleted file mode 100644 index 80ab0c8e5b4a..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c +++ /dev/null | |||
@@ -1,235 +0,0 @@ | |||
1 | /* | ||
2 | * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS | ||
3 | * | ||
4 | * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 | ||
5 | * Wensong Zhang <wensong@linuxvirtualserver.org> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * version 2 as published by the Free Software Foundation; | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/in.h> | ||
14 | #include <linux/ip.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/netfilter.h> | ||
18 | #include <linux/netfilter_ipv4.h> | ||
19 | |||
20 | #include <net/ip_vs.h> | ||
21 | |||
22 | |||
23 | /* TODO: | ||
24 | |||
25 | struct isakmp_hdr { | ||
26 | __u8 icookie[8]; | ||
27 | __u8 rcookie[8]; | ||
28 | __u8 np; | ||
29 | __u8 version; | ||
30 | __u8 xchgtype; | ||
31 | __u8 flags; | ||
32 | __u32 msgid; | ||
33 | __u32 length; | ||
34 | }; | ||
35 | |||
36 | */ | ||
37 | |||
/* port used on both ends when looking up the ISAKMP connection entry */
#define PORT_ISAKMP	500
39 | |||
40 | |||
41 | static struct ip_vs_conn * | ||
42 | ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
43 | const struct ip_vs_iphdr *iph, unsigned int proto_off, | ||
44 | int inverse) | ||
45 | { | ||
46 | struct ip_vs_conn *cp; | ||
47 | |||
48 | if (likely(!inverse)) { | ||
49 | cp = ip_vs_conn_in_get(af, IPPROTO_UDP, | ||
50 | &iph->saddr, | ||
51 | htons(PORT_ISAKMP), | ||
52 | &iph->daddr, | ||
53 | htons(PORT_ISAKMP)); | ||
54 | } else { | ||
55 | cp = ip_vs_conn_in_get(af, IPPROTO_UDP, | ||
56 | &iph->daddr, | ||
57 | htons(PORT_ISAKMP), | ||
58 | &iph->saddr, | ||
59 | htons(PORT_ISAKMP)); | ||
60 | } | ||
61 | |||
62 | if (!cp) { | ||
63 | /* | ||
64 | * We are not sure if the packet is from our | ||
65 | * service, so our conn_schedule hook should return NF_ACCEPT | ||
66 | */ | ||
67 | IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " | ||
68 | "%s%s %s->%s\n", | ||
69 | inverse ? "ICMP+" : "", | ||
70 | pp->name, | ||
71 | IP_VS_DBG_ADDR(af, &iph->saddr), | ||
72 | IP_VS_DBG_ADDR(af, &iph->daddr)); | ||
73 | } | ||
74 | |||
75 | return cp; | ||
76 | } | ||
77 | |||
78 | |||
79 | static struct ip_vs_conn * | ||
80 | ah_esp_conn_out_get(int af, const struct sk_buff *skb, | ||
81 | struct ip_vs_protocol *pp, | ||
82 | const struct ip_vs_iphdr *iph, | ||
83 | unsigned int proto_off, | ||
84 | int inverse) | ||
85 | { | ||
86 | struct ip_vs_conn *cp; | ||
87 | |||
88 | if (likely(!inverse)) { | ||
89 | cp = ip_vs_conn_out_get(af, IPPROTO_UDP, | ||
90 | &iph->saddr, | ||
91 | htons(PORT_ISAKMP), | ||
92 | &iph->daddr, | ||
93 | htons(PORT_ISAKMP)); | ||
94 | } else { | ||
95 | cp = ip_vs_conn_out_get(af, IPPROTO_UDP, | ||
96 | &iph->daddr, | ||
97 | htons(PORT_ISAKMP), | ||
98 | &iph->saddr, | ||
99 | htons(PORT_ISAKMP)); | ||
100 | } | ||
101 | |||
102 | if (!cp) { | ||
103 | IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " | ||
104 | "%s%s %s->%s\n", | ||
105 | inverse ? "ICMP+" : "", | ||
106 | pp->name, | ||
107 | IP_VS_DBG_ADDR(af, &iph->saddr), | ||
108 | IP_VS_DBG_ADDR(af, &iph->daddr)); | ||
109 | } | ||
110 | |||
111 | return cp; | ||
112 | } | ||
113 | |||
114 | |||
115 | static int | ||
116 | ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
117 | int *verdict, struct ip_vs_conn **cpp) | ||
118 | { | ||
119 | /* | ||
120 | * AH/ESP is only related traffic. Pass the packet to IP stack. | ||
121 | */ | ||
122 | *verdict = NF_ACCEPT; | ||
123 | return 0; | ||
124 | } | ||
125 | |||
126 | |||
127 | static void | ||
128 | ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, | ||
129 | int offset, const char *msg) | ||
130 | { | ||
131 | char buf[256]; | ||
132 | struct iphdr _iph, *ih; | ||
133 | |||
134 | ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); | ||
135 | if (ih == NULL) | ||
136 | sprintf(buf, "%s TRUNCATED", pp->name); | ||
137 | else | ||
138 | sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", | ||
139 | pp->name, NIPQUAD(ih->saddr), | ||
140 | NIPQUAD(ih->daddr)); | ||
141 | |||
142 | printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); | ||
143 | } | ||
144 | |||
145 | #ifdef CONFIG_IP_VS_IPV6 | ||
146 | static void | ||
147 | ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, | ||
148 | int offset, const char *msg) | ||
149 | { | ||
150 | char buf[256]; | ||
151 | struct ipv6hdr _iph, *ih; | ||
152 | |||
153 | ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); | ||
154 | if (ih == NULL) | ||
155 | sprintf(buf, "%s TRUNCATED", pp->name); | ||
156 | else | ||
157 | sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT, | ||
158 | pp->name, NIP6(ih->saddr), | ||
159 | NIP6(ih->daddr)); | ||
160 | |||
161 | printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); | ||
162 | } | ||
163 | #endif | ||
164 | |||
/*
 *	debug_packet hook for AH/ESP: dispatch on skb->protocol to the
 *	IPv6 printer (when built with CONFIG_IP_VS_IPV6) or to the IPv4
 *	printer otherwise.
 */
static void
ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
		    int offset, const char *msg)
{
#ifdef CONFIG_IP_VS_IPV6
	if (skb->protocol == htons(ETH_P_IPV6)) {
		ah_esp_debug_packet_v6(pp, skb, offset, msg);
		return;
	}
#endif
	ah_esp_debug_packet_v4(pp, skb, offset, msg);
}
176 | |||
177 | |||
/* init hook shared by AH and ESP; intentionally empty for now */
static void ah_esp_init(struct ip_vs_protocol *pp)
{
}
182 | |||
183 | |||
/* exit hook shared by AH and ESP; intentionally empty for now */
static void ah_esp_exit(struct ip_vs_protocol *pp)
{
}
188 | |||
189 | |||
190 | #ifdef CONFIG_IP_VS_PROTO_AH | ||
191 | struct ip_vs_protocol ip_vs_protocol_ah = { | ||
192 | .name = "AH", | ||
193 | .protocol = IPPROTO_AH, | ||
194 | .num_states = 1, | ||
195 | .dont_defrag = 1, | ||
196 | .init = ah_esp_init, | ||
197 | .exit = ah_esp_exit, | ||
198 | .conn_schedule = ah_esp_conn_schedule, | ||
199 | .conn_in_get = ah_esp_conn_in_get, | ||
200 | .conn_out_get = ah_esp_conn_out_get, | ||
201 | .snat_handler = NULL, | ||
202 | .dnat_handler = NULL, | ||
203 | .csum_check = NULL, | ||
204 | .state_transition = NULL, | ||
205 | .register_app = NULL, | ||
206 | .unregister_app = NULL, | ||
207 | .app_conn_bind = NULL, | ||
208 | .debug_packet = ah_esp_debug_packet, | ||
209 | .timeout_change = NULL, /* ISAKMP */ | ||
210 | .set_state_timeout = NULL, | ||
211 | }; | ||
212 | #endif | ||
213 | |||
214 | #ifdef CONFIG_IP_VS_PROTO_ESP | ||
215 | struct ip_vs_protocol ip_vs_protocol_esp = { | ||
216 | .name = "ESP", | ||
217 | .protocol = IPPROTO_ESP, | ||
218 | .num_states = 1, | ||
219 | .dont_defrag = 1, | ||
220 | .init = ah_esp_init, | ||
221 | .exit = ah_esp_exit, | ||
222 | .conn_schedule = ah_esp_conn_schedule, | ||
223 | .conn_in_get = ah_esp_conn_in_get, | ||
224 | .conn_out_get = ah_esp_conn_out_get, | ||
225 | .snat_handler = NULL, | ||
226 | .dnat_handler = NULL, | ||
227 | .csum_check = NULL, | ||
228 | .state_transition = NULL, | ||
229 | .register_app = NULL, | ||
230 | .unregister_app = NULL, | ||
231 | .app_conn_bind = NULL, | ||
232 | .debug_packet = ah_esp_debug_packet, | ||
233 | .timeout_change = NULL, /* ISAKMP */ | ||
234 | }; | ||
235 | #endif | ||
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c deleted file mode 100644 index dd4566ea2bff..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ /dev/null | |||
@@ -1,732 +0,0 @@ | |||
1 | /* | ||
2 | * ip_vs_proto_tcp.c: TCP load balancing support for IPVS | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * Julian Anastasov <ja@ssi.bg> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | * | ||
12 | * Changes: | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/ip.h> | ||
18 | #include <linux/tcp.h> /* for tcphdr */ | ||
19 | #include <net/ip.h> | ||
20 | #include <net/tcp.h> /* for csum_tcpudp_magic */ | ||
21 | #include <net/ip6_checksum.h> | ||
22 | #include <linux/netfilter.h> | ||
23 | #include <linux/netfilter_ipv4.h> | ||
24 | |||
25 | #include <net/ip_vs.h> | ||
26 | |||
27 | |||
28 | static struct ip_vs_conn * | ||
29 | tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
30 | const struct ip_vs_iphdr *iph, unsigned int proto_off, | ||
31 | int inverse) | ||
32 | { | ||
33 | __be16 _ports[2], *pptr; | ||
34 | |||
35 | pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); | ||
36 | if (pptr == NULL) | ||
37 | return NULL; | ||
38 | |||
39 | if (likely(!inverse)) { | ||
40 | return ip_vs_conn_in_get(af, iph->protocol, | ||
41 | &iph->saddr, pptr[0], | ||
42 | &iph->daddr, pptr[1]); | ||
43 | } else { | ||
44 | return ip_vs_conn_in_get(af, iph->protocol, | ||
45 | &iph->daddr, pptr[1], | ||
46 | &iph->saddr, pptr[0]); | ||
47 | } | ||
48 | } | ||
49 | |||
50 | static struct ip_vs_conn * | ||
51 | tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
52 | const struct ip_vs_iphdr *iph, unsigned int proto_off, | ||
53 | int inverse) | ||
54 | { | ||
55 | __be16 _ports[2], *pptr; | ||
56 | |||
57 | pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); | ||
58 | if (pptr == NULL) | ||
59 | return NULL; | ||
60 | |||
61 | if (likely(!inverse)) { | ||
62 | return ip_vs_conn_out_get(af, iph->protocol, | ||
63 | &iph->saddr, pptr[0], | ||
64 | &iph->daddr, pptr[1]); | ||
65 | } else { | ||
66 | return ip_vs_conn_out_get(af, iph->protocol, | ||
67 | &iph->daddr, pptr[1], | ||
68 | &iph->saddr, pptr[0]); | ||
69 | } | ||
70 | } | ||
71 | |||
72 | |||
73 | static int | ||
74 | tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
75 | int *verdict, struct ip_vs_conn **cpp) | ||
76 | { | ||
77 | struct ip_vs_service *svc; | ||
78 | struct tcphdr _tcph, *th; | ||
79 | struct ip_vs_iphdr iph; | ||
80 | |||
81 | ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); | ||
82 | |||
83 | th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph); | ||
84 | if (th == NULL) { | ||
85 | *verdict = NF_DROP; | ||
86 | return 0; | ||
87 | } | ||
88 | |||
89 | if (th->syn && | ||
90 | (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, | ||
91 | th->dest))) { | ||
92 | if (ip_vs_todrop()) { | ||
93 | /* | ||
94 | * It seems that we are very loaded. | ||
95 | * We have to drop this packet :( | ||
96 | */ | ||
97 | ip_vs_service_put(svc); | ||
98 | *verdict = NF_DROP; | ||
99 | return 0; | ||
100 | } | ||
101 | |||
102 | /* | ||
103 | * Let the virtual server select a real server for the | ||
104 | * incoming connection, and create a connection entry. | ||
105 | */ | ||
106 | *cpp = ip_vs_schedule(svc, skb); | ||
107 | if (!*cpp) { | ||
108 | *verdict = ip_vs_leave(svc, skb, pp); | ||
109 | return 0; | ||
110 | } | ||
111 | ip_vs_service_put(svc); | ||
112 | } | ||
113 | return 1; | ||
114 | } | ||
115 | |||
116 | |||
117 | static inline void | ||
118 | tcp_fast_csum_update(int af, struct tcphdr *tcph, | ||
119 | const union nf_inet_addr *oldip, | ||
120 | const union nf_inet_addr *newip, | ||
121 | __be16 oldport, __be16 newport) | ||
122 | { | ||
123 | #ifdef CONFIG_IP_VS_IPV6 | ||
124 | if (af == AF_INET6) | ||
125 | tcph->check = | ||
126 | csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, | ||
127 | ip_vs_check_diff2(oldport, newport, | ||
128 | ~csum_unfold(tcph->check)))); | ||
129 | else | ||
130 | #endif | ||
131 | tcph->check = | ||
132 | csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, | ||
133 | ip_vs_check_diff2(oldport, newport, | ||
134 | ~csum_unfold(tcph->check)))); | ||
135 | } | ||
136 | |||
137 | |||
138 | static inline void | ||
139 | tcp_partial_csum_update(int af, struct tcphdr *tcph, | ||
140 | const union nf_inet_addr *oldip, | ||
141 | const union nf_inet_addr *newip, | ||
142 | __be16 oldlen, __be16 newlen) | ||
143 | { | ||
144 | #ifdef CONFIG_IP_VS_IPV6 | ||
145 | if (af == AF_INET6) | ||
146 | tcph->check = | ||
147 | csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, | ||
148 | ip_vs_check_diff2(oldlen, newlen, | ||
149 | ~csum_unfold(tcph->check)))); | ||
150 | else | ||
151 | #endif | ||
152 | tcph->check = | ||
153 | csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, | ||
154 | ip_vs_check_diff2(oldlen, newlen, | ||
155 | ~csum_unfold(tcph->check)))); | ||
156 | } | ||
157 | |||
158 | |||
159 | static int | ||
160 | tcp_snat_handler(struct sk_buff *skb, | ||
161 | struct ip_vs_protocol *pp, struct ip_vs_conn *cp) | ||
162 | { | ||
163 | struct tcphdr *tcph; | ||
164 | unsigned int tcphoff; | ||
165 | int oldlen; | ||
166 | |||
167 | #ifdef CONFIG_IP_VS_IPV6 | ||
168 | if (cp->af == AF_INET6) | ||
169 | tcphoff = sizeof(struct ipv6hdr); | ||
170 | else | ||
171 | #endif | ||
172 | tcphoff = ip_hdrlen(skb); | ||
173 | oldlen = skb->len - tcphoff; | ||
174 | |||
175 | /* csum_check requires unshared skb */ | ||
176 | if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) | ||
177 | return 0; | ||
178 | |||
179 | if (unlikely(cp->app != NULL)) { | ||
180 | /* Some checks before mangling */ | ||
181 | if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) | ||
182 | return 0; | ||
183 | |||
184 | /* Call application helper if needed */ | ||
185 | if (!ip_vs_app_pkt_out(cp, skb)) | ||
186 | return 0; | ||
187 | } | ||
188 | |||
189 | tcph = (void *)skb_network_header(skb) + tcphoff; | ||
190 | tcph->source = cp->vport; | ||
191 | |||
192 | /* Adjust TCP checksums */ | ||
193 | if (skb->ip_summed == CHECKSUM_PARTIAL) { | ||
194 | tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, | ||
195 | htonl(oldlen), | ||
196 | htonl(skb->len - tcphoff)); | ||
197 | } else if (!cp->app) { | ||
198 | /* Only port and addr are changed, do fast csum update */ | ||
199 | tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, | ||
200 | cp->dport, cp->vport); | ||
201 | if (skb->ip_summed == CHECKSUM_COMPLETE) | ||
202 | skb->ip_summed = CHECKSUM_NONE; | ||
203 | } else { | ||
204 | /* full checksum calculation */ | ||
205 | tcph->check = 0; | ||
206 | skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); | ||
207 | #ifdef CONFIG_IP_VS_IPV6 | ||
208 | if (cp->af == AF_INET6) | ||
209 | tcph->check = csum_ipv6_magic(&cp->vaddr.in6, | ||
210 | &cp->caddr.in6, | ||
211 | skb->len - tcphoff, | ||
212 | cp->protocol, skb->csum); | ||
213 | else | ||
214 | #endif | ||
215 | tcph->check = csum_tcpudp_magic(cp->vaddr.ip, | ||
216 | cp->caddr.ip, | ||
217 | skb->len - tcphoff, | ||
218 | cp->protocol, | ||
219 | skb->csum); | ||
220 | |||
221 | IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", | ||
222 | pp->name, tcph->check, | ||
223 | (char*)&(tcph->check) - (char*)tcph); | ||
224 | } | ||
225 | return 1; | ||
226 | } | ||
227 | |||
228 | |||
229 | static int | ||
230 | tcp_dnat_handler(struct sk_buff *skb, | ||
231 | struct ip_vs_protocol *pp, struct ip_vs_conn *cp) | ||
232 | { | ||
233 | struct tcphdr *tcph; | ||
234 | unsigned int tcphoff; | ||
235 | int oldlen; | ||
236 | |||
237 | #ifdef CONFIG_IP_VS_IPV6 | ||
238 | if (cp->af == AF_INET6) | ||
239 | tcphoff = sizeof(struct ipv6hdr); | ||
240 | else | ||
241 | #endif | ||
242 | tcphoff = ip_hdrlen(skb); | ||
243 | oldlen = skb->len - tcphoff; | ||
244 | |||
245 | /* csum_check requires unshared skb */ | ||
246 | if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) | ||
247 | return 0; | ||
248 | |||
249 | if (unlikely(cp->app != NULL)) { | ||
250 | /* Some checks before mangling */ | ||
251 | if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) | ||
252 | return 0; | ||
253 | |||
254 | /* | ||
255 | * Attempt ip_vs_app call. | ||
256 | * It will fix ip_vs_conn and iph ack_seq stuff | ||
257 | */ | ||
258 | if (!ip_vs_app_pkt_in(cp, skb)) | ||
259 | return 0; | ||
260 | } | ||
261 | |||
262 | tcph = (void *)skb_network_header(skb) + tcphoff; | ||
263 | tcph->dest = cp->dport; | ||
264 | |||
265 | /* | ||
266 | * Adjust TCP checksums | ||
267 | */ | ||
268 | if (skb->ip_summed == CHECKSUM_PARTIAL) { | ||
269 | tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, | ||
270 | htonl(oldlen), | ||
271 | htonl(skb->len - tcphoff)); | ||
272 | } else if (!cp->app) { | ||
273 | /* Only port and addr are changed, do fast csum update */ | ||
274 | tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, | ||
275 | cp->vport, cp->dport); | ||
276 | if (skb->ip_summed == CHECKSUM_COMPLETE) | ||
277 | skb->ip_summed = CHECKSUM_NONE; | ||
278 | } else { | ||
279 | /* full checksum calculation */ | ||
280 | tcph->check = 0; | ||
281 | skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); | ||
282 | #ifdef CONFIG_IP_VS_IPV6 | ||
283 | if (cp->af == AF_INET6) | ||
284 | tcph->check = csum_ipv6_magic(&cp->caddr.in6, | ||
285 | &cp->daddr.in6, | ||
286 | skb->len - tcphoff, | ||
287 | cp->protocol, skb->csum); | ||
288 | else | ||
289 | #endif | ||
290 | tcph->check = csum_tcpudp_magic(cp->caddr.ip, | ||
291 | cp->daddr.ip, | ||
292 | skb->len - tcphoff, | ||
293 | cp->protocol, | ||
294 | skb->csum); | ||
295 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
296 | } | ||
297 | return 1; | ||
298 | } | ||
299 | |||
300 | |||
301 | static int | ||
302 | tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) | ||
303 | { | ||
304 | unsigned int tcphoff; | ||
305 | |||
306 | #ifdef CONFIG_IP_VS_IPV6 | ||
307 | if (af == AF_INET6) | ||
308 | tcphoff = sizeof(struct ipv6hdr); | ||
309 | else | ||
310 | #endif | ||
311 | tcphoff = ip_hdrlen(skb); | ||
312 | |||
313 | switch (skb->ip_summed) { | ||
314 | case CHECKSUM_NONE: | ||
315 | skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); | ||
316 | case CHECKSUM_COMPLETE: | ||
317 | #ifdef CONFIG_IP_VS_IPV6 | ||
318 | if (af == AF_INET6) { | ||
319 | if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, | ||
320 | &ipv6_hdr(skb)->daddr, | ||
321 | skb->len - tcphoff, | ||
322 | ipv6_hdr(skb)->nexthdr, | ||
323 | skb->csum)) { | ||
324 | IP_VS_DBG_RL_PKT(0, pp, skb, 0, | ||
325 | "Failed checksum for"); | ||
326 | return 0; | ||
327 | } | ||
328 | } else | ||
329 | #endif | ||
330 | if (csum_tcpudp_magic(ip_hdr(skb)->saddr, | ||
331 | ip_hdr(skb)->daddr, | ||
332 | skb->len - tcphoff, | ||
333 | ip_hdr(skb)->protocol, | ||
334 | skb->csum)) { | ||
335 | IP_VS_DBG_RL_PKT(0, pp, skb, 0, | ||
336 | "Failed checksum for"); | ||
337 | return 0; | ||
338 | } | ||
339 | break; | ||
340 | default: | ||
341 | /* No need to checksum. */ | ||
342 | break; | ||
343 | } | ||
344 | |||
345 | return 1; | ||
346 | } | ||
347 | |||
348 | |||
349 | #define TCP_DIR_INPUT 0 | ||
350 | #define TCP_DIR_OUTPUT 4 | ||
351 | #define TCP_DIR_INPUT_ONLY 8 | ||
352 | |||
353 | static const int tcp_state_off[IP_VS_DIR_LAST] = { | ||
354 | [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, | ||
355 | [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, | ||
356 | [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, | ||
357 | }; | ||
358 | |||
359 | /* | ||
360 | * Timeout table[state] | ||
361 | */ | ||
362 | static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { | ||
363 | [IP_VS_TCP_S_NONE] = 2*HZ, | ||
364 | [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, | ||
365 | [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, | ||
366 | [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, | ||
367 | [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, | ||
368 | [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, | ||
369 | [IP_VS_TCP_S_CLOSE] = 10*HZ, | ||
370 | [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, | ||
371 | [IP_VS_TCP_S_LAST_ACK] = 30*HZ, | ||
372 | [IP_VS_TCP_S_LISTEN] = 2*60*HZ, | ||
373 | [IP_VS_TCP_S_SYNACK] = 120*HZ, | ||
374 | [IP_VS_TCP_S_LAST] = 2*HZ, | ||
375 | }; | ||
376 | |||
377 | static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { | ||
378 | [IP_VS_TCP_S_NONE] = "NONE", | ||
379 | [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", | ||
380 | [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", | ||
381 | [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", | ||
382 | [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", | ||
383 | [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", | ||
384 | [IP_VS_TCP_S_CLOSE] = "CLOSE", | ||
385 | [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", | ||
386 | [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", | ||
387 | [IP_VS_TCP_S_LISTEN] = "LISTEN", | ||
388 | [IP_VS_TCP_S_SYNACK] = "SYNACK", | ||
389 | [IP_VS_TCP_S_LAST] = "BUG!", | ||
390 | }; | ||
391 | |||
/* Two-letter state abbreviations, used to keep the tables below readable. */
#define sNO	IP_VS_TCP_S_NONE
#define sES	IP_VS_TCP_S_ESTABLISHED
#define sSS	IP_VS_TCP_S_SYN_SENT
#define sSR	IP_VS_TCP_S_SYN_RECV
#define sFW	IP_VS_TCP_S_FIN_WAIT
#define sTW	IP_VS_TCP_S_TIME_WAIT
#define sCL	IP_VS_TCP_S_CLOSE
#define sCW	IP_VS_TCP_S_CLOSE_WAIT
#define sLA	IP_VS_TCP_S_LAST_ACK
#define sLI	IP_VS_TCP_S_LISTEN
#define sSA	IP_VS_TCP_S_SYNACK
403 | |||
404 | struct tcp_states_t { | ||
405 | int next_state[IP_VS_TCP_S_LAST]; | ||
406 | }; | ||
407 | |||
408 | static const char * tcp_state_name(int state) | ||
409 | { | ||
410 | if (state >= IP_VS_TCP_S_LAST) | ||
411 | return "ERR!"; | ||
412 | return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; | ||
413 | } | ||
414 | |||
415 | static struct tcp_states_t tcp_states [] = { | ||
416 | /* INPUT */ | ||
417 | /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ | ||
418 | /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, | ||
419 | /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, | ||
420 | /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, | ||
421 | /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, | ||
422 | |||
423 | /* OUTPUT */ | ||
424 | /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ | ||
425 | /*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, | ||
426 | /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, | ||
427 | /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, | ||
428 | /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, | ||
429 | |||
430 | /* INPUT-ONLY */ | ||
431 | /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ | ||
432 | /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, | ||
433 | /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, | ||
434 | /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, | ||
435 | /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, | ||
436 | }; | ||
437 | |||
438 | static struct tcp_states_t tcp_states_dos [] = { /* alternative table, selected by tcp_timeout_change() when bit 0 (secure_tcp) of its flags is set; same row/column layout as tcp_states */ | ||
439 | /* INPUT */ | ||
440 | /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ | ||
441 | /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, | ||
442 | /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, | ||
443 | /*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, | ||
444 | /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, | ||
445 | |||
446 | /* OUTPUT */ | ||
447 | /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ | ||
448 | /*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, | ||
449 | /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, | ||
450 | /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, | ||
451 | /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, | ||
452 | |||
453 | /* INPUT-ONLY: rows used for non-output packets while the connection still has IP_VS_CONN_F_NOOUTPUT set */ | ||
454 | /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ | ||
455 | /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, | ||
456 | /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, | ||
457 | /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, | ||
458 | /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, | ||
459 | }; | ||
460 | |||
461 | static struct tcp_states_t *tcp_state_table = tcp_states; /* table consulted by set_tcp_state(); re-pointed by tcp_timeout_change() */ | ||
462 | |||
463 | |||
464 | static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) | ||
465 | { | ||
466 | int on = (flags & 1); /* secure_tcp */ | ||
467 | |||
468 | /* | ||
469 | ** FIXME: change secure_tcp to independent sysctl var | ||
470 | ** or make it per-service or per-app because it is valid | ||
471 | ** for most if not for all of the applications. Something | ||
472 | ** like "capabilities" (flags) for each object. | ||
473 | */ | ||
474 | tcp_state_table = (on? tcp_states_dos : tcp_states); | ||
475 | } | ||
476 | |||
477 | static int | ||
478 | tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) | ||
479 | { | ||
480 | return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST, | ||
481 | tcp_state_name_table, sname, to); | ||
482 | } | ||
483 | |||
484 | static inline int tcp_state_idx(struct tcphdr *th) | ||
485 | { | ||
486 | if (th->rst) | ||
487 | return 3; | ||
488 | if (th->syn) | ||
489 | return 0; | ||
490 | if (th->fin) | ||
491 | return 1; | ||
492 | if (th->ack) | ||
493 | return 2; | ||
494 | return -1; | ||
495 | } | ||
496 | |||
497 | static inline void | ||
498 | set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, | ||
499 | int direction, struct tcphdr *th) | ||
500 | { | ||
501 | int state_idx; | ||
502 | int new_state = IP_VS_TCP_S_CLOSE; | ||
503 | int state_off = tcp_state_off[direction]; | ||
504 | |||
505 | /* | ||
506 | * Update state offset to INPUT_ONLY if necessary | ||
507 | * or delete NO_OUTPUT flag if output packet detected | ||
508 | */ | ||
509 | if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { | ||
510 | if (state_off == TCP_DIR_OUTPUT) | ||
511 | cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; | ||
512 | else | ||
513 | state_off = TCP_DIR_INPUT_ONLY; | ||
514 | } | ||
515 | |||
516 | if ((state_idx = tcp_state_idx(th)) < 0) { | ||
517 | IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); | ||
518 | goto tcp_state_out; | ||
519 | } | ||
520 | |||
521 | new_state = tcp_state_table[state_off+state_idx].next_state[cp->state]; | ||
522 | |||
523 | tcp_state_out: | ||
524 | if (new_state != cp->state) { | ||
525 | struct ip_vs_dest *dest = cp->dest; | ||
526 | |||
527 | IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" | ||
528 | "%s:%d state: %s->%s conn->refcnt:%d\n", | ||
529 | pp->name, | ||
530 | ((state_off == TCP_DIR_OUTPUT) ? | ||
531 | "output " : "input "), | ||
532 | th->syn ? 'S' : '.', | ||
533 | th->fin ? 'F' : '.', | ||
534 | th->ack ? 'A' : '.', | ||
535 | th->rst ? 'R' : '.', | ||
536 | IP_VS_DBG_ADDR(cp->af, &cp->daddr), | ||
537 | ntohs(cp->dport), | ||
538 | IP_VS_DBG_ADDR(cp->af, &cp->caddr), | ||
539 | ntohs(cp->cport), | ||
540 | tcp_state_name(cp->state), | ||
541 | tcp_state_name(new_state), | ||
542 | atomic_read(&cp->refcnt)); | ||
543 | |||
544 | if (dest) { | ||
545 | if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && | ||
546 | (new_state != IP_VS_TCP_S_ESTABLISHED)) { | ||
547 | atomic_dec(&dest->activeconns); | ||
548 | atomic_inc(&dest->inactconns); | ||
549 | cp->flags |= IP_VS_CONN_F_INACTIVE; | ||
550 | } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && | ||
551 | (new_state == IP_VS_TCP_S_ESTABLISHED)) { | ||
552 | atomic_inc(&dest->activeconns); | ||
553 | atomic_dec(&dest->inactconns); | ||
554 | cp->flags &= ~IP_VS_CONN_F_INACTIVE; | ||
555 | } | ||
556 | } | ||
557 | } | ||
558 | |||
559 | cp->timeout = pp->timeout_table[cp->state = new_state]; | ||
560 | } | ||
561 | |||
562 | |||
563 | /* | ||
564 | * Handle state transitions | ||
565 | */ | ||
566 | static int | ||
567 | tcp_state_transition(struct ip_vs_conn *cp, int direction, | ||
568 | const struct sk_buff *skb, | ||
569 | struct ip_vs_protocol *pp) | ||
570 | { | ||
571 | struct tcphdr _tcph, *th; | ||
572 | |||
573 | #ifdef CONFIG_IP_VS_IPV6 | ||
574 | int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); | ||
575 | #else | ||
576 | int ihl = ip_hdrlen(skb); | ||
577 | #endif | ||
578 | |||
579 | th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); | ||
580 | if (th == NULL) | ||
581 | return 0; | ||
582 | |||
583 | spin_lock(&cp->lock); | ||
584 | set_tcp_state(pp, cp, direction, th); | ||
585 | spin_unlock(&cp->lock); | ||
586 | |||
587 | return 1; | ||
588 | } | ||
589 | |||
590 | |||
591 | /* | ||
592 | * Hash table for TCP application incarnations, keyed by port via tcp_app_hashkey() | ||
593 | */ | ||
594 | #define TCP_APP_TAB_BITS 4 /* log2 of the number of buckets */ | ||
595 | #define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) /* number of buckets */ | ||
596 | #define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) /* reduces a hash value to a bucket index */ | ||
597 | |||
598 | static struct list_head tcp_apps[TCP_APP_TAB_SIZE]; /* buckets; entries are chained through ip_vs_app.p_list */ | ||
599 | static DEFINE_SPINLOCK(tcp_app_lock); /* held around every walk or update of tcp_apps in this file */ | ||
600 | |||
601 | static inline __u16 tcp_app_hashkey(__be16 port) | ||
602 | { | ||
603 | return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) | ||
604 | & TCP_APP_TAB_MASK; | ||
605 | } | ||
606 | |||
607 | |||
608 | static int tcp_register_app(struct ip_vs_app *inc) | ||
609 | { | ||
610 | struct ip_vs_app *i; | ||
611 | __u16 hash; | ||
612 | __be16 port = inc->port; | ||
613 | int ret = 0; | ||
614 | |||
615 | hash = tcp_app_hashkey(port); | ||
616 | |||
617 | spin_lock_bh(&tcp_app_lock); | ||
618 | list_for_each_entry(i, &tcp_apps[hash], p_list) { | ||
619 | if (i->port == port) { | ||
620 | ret = -EEXIST; | ||
621 | goto out; | ||
622 | } | ||
623 | } | ||
624 | list_add(&inc->p_list, &tcp_apps[hash]); | ||
625 | atomic_inc(&ip_vs_protocol_tcp.appcnt); | ||
626 | |||
627 | out: | ||
628 | spin_unlock_bh(&tcp_app_lock); | ||
629 | return ret; | ||
630 | } | ||
631 | |||
632 | |||
633 | static void | ||
634 | tcp_unregister_app(struct ip_vs_app *inc) | ||
635 | { | ||
636 | spin_lock_bh(&tcp_app_lock); | ||
637 | atomic_dec(&ip_vs_protocol_tcp.appcnt); | ||
638 | list_del(&inc->p_list); | ||
639 | spin_unlock_bh(&tcp_app_lock); | ||
640 | } | ||
641 | |||
642 | |||
643 | static int | ||
644 | tcp_app_conn_bind(struct ip_vs_conn *cp) | ||
645 | { | ||
646 | int hash; | ||
647 | struct ip_vs_app *inc; | ||
648 | int result = 0; | ||
649 | |||
650 | /* Default binding: bind app only for NAT */ | ||
651 | if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) | ||
652 | return 0; | ||
653 | |||
654 | /* Lookup application incarnations and bind the right one */ | ||
655 | hash = tcp_app_hashkey(cp->vport); | ||
656 | |||
657 | spin_lock(&tcp_app_lock); | ||
658 | list_for_each_entry(inc, &tcp_apps[hash], p_list) { | ||
659 | if (inc->port == cp->vport) { | ||
660 | if (unlikely(!ip_vs_app_inc_get(inc))) | ||
661 | break; | ||
662 | spin_unlock(&tcp_app_lock); | ||
663 | |||
664 | IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" | ||
665 | "%s:%u to app %s on port %u\n", | ||
666 | __func__, | ||
667 | IP_VS_DBG_ADDR(cp->af, &cp->caddr), | ||
668 | ntohs(cp->cport), | ||
669 | IP_VS_DBG_ADDR(cp->af, &cp->vaddr), | ||
670 | ntohs(cp->vport), | ||
671 | inc->name, ntohs(inc->port)); | ||
672 | |||
673 | cp->app = inc; | ||
674 | if (inc->init_conn) | ||
675 | result = inc->init_conn(inc, cp); | ||
676 | goto out; | ||
677 | } | ||
678 | } | ||
679 | spin_unlock(&tcp_app_lock); | ||
680 | |||
681 | out: | ||
682 | return result; | ||
683 | } | ||
684 | |||
685 | |||
686 | /* | ||
687 | * Set LISTEN timeout. (ip_vs_conn_put will setup timer) | ||
688 | */ | ||
689 | void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) | ||
690 | { | ||
691 | spin_lock(&cp->lock); | ||
692 | cp->state = IP_VS_TCP_S_LISTEN; | ||
693 | cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN]; | ||
694 | spin_unlock(&cp->lock); | ||
695 | } | ||
696 | |||
697 | |||
698 | static void ip_vs_tcp_init(struct ip_vs_protocol *pp) | ||
699 | { | ||
700 | IP_VS_INIT_HASH_TABLE(tcp_apps); | ||
701 | pp->timeout_table = tcp_timeouts; | ||
702 | } | ||
703 | |||
704 | |||
/*
 * Protocol exit hook: does nothing.
 */
static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
{
	/* intentionally empty */
}
708 | |||
709 | |||
710 | struct ip_vs_protocol ip_vs_protocol_tcp = { /* TCP protocol descriptor: name, protocol number, state count and handler callbacks */ | ||
711 | .name = "TCP", | ||
712 | .protocol = IPPROTO_TCP, | ||
713 | .num_states = IP_VS_TCP_S_LAST, | ||
714 | .dont_defrag = 0, | ||
715 | .appcnt = ATOMIC_INIT(0), /* changed by tcp_register_app() / tcp_unregister_app() */ | ||
716 | .init = ip_vs_tcp_init, | ||
717 | .exit = ip_vs_tcp_exit, | ||
718 | .register_app = tcp_register_app, | ||
719 | .unregister_app = tcp_unregister_app, | ||
720 | .conn_schedule = tcp_conn_schedule, | ||
721 | .conn_in_get = tcp_conn_in_get, | ||
722 | .conn_out_get = tcp_conn_out_get, | ||
723 | .snat_handler = tcp_snat_handler, | ||
724 | .dnat_handler = tcp_dnat_handler, | ||
725 | .csum_check = tcp_csum_check, | ||
726 | .state_name = tcp_state_name, | ||
727 | .state_transition = tcp_state_transition, | ||
728 | .app_conn_bind = tcp_app_conn_bind, | ||
729 | .debug_packet = ip_vs_tcpudp_debug_packet, | ||
730 | .timeout_change = tcp_timeout_change, /* switches between tcp_states and tcp_states_dos */ | ||
731 | .set_state_timeout = tcp_set_state_timeout, | ||
732 | }; | ||
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c deleted file mode 100644 index 6eb6039d6343..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ /dev/null | |||
@@ -1,533 +0,0 @@ | |||
1 | /* | ||
2 | * ip_vs_proto_udp.c: UDP load balancing support for IPVS | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * Julian Anastasov <ja@ssi.bg> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | * | ||
12 | * Changes: | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/in.h> | ||
17 | #include <linux/ip.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/netfilter.h> | ||
20 | #include <linux/netfilter_ipv4.h> | ||
21 | #include <linux/udp.h> | ||
22 | |||
23 | #include <net/ip_vs.h> | ||
24 | #include <net/ip.h> | ||
25 | #include <net/ip6_checksum.h> | ||
26 | |||
27 | static struct ip_vs_conn * | ||
28 | udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
29 | const struct ip_vs_iphdr *iph, unsigned int proto_off, | ||
30 | int inverse) | ||
31 | { | ||
32 | struct ip_vs_conn *cp; | ||
33 | __be16 _ports[2], *pptr; | ||
34 | |||
35 | pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); | ||
36 | if (pptr == NULL) | ||
37 | return NULL; | ||
38 | |||
39 | if (likely(!inverse)) { | ||
40 | cp = ip_vs_conn_in_get(af, iph->protocol, | ||
41 | &iph->saddr, pptr[0], | ||
42 | &iph->daddr, pptr[1]); | ||
43 | } else { | ||
44 | cp = ip_vs_conn_in_get(af, iph->protocol, | ||
45 | &iph->daddr, pptr[1], | ||
46 | &iph->saddr, pptr[0]); | ||
47 | } | ||
48 | |||
49 | return cp; | ||
50 | } | ||
51 | |||
52 | |||
53 | static struct ip_vs_conn * | ||
54 | udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
55 | const struct ip_vs_iphdr *iph, unsigned int proto_off, | ||
56 | int inverse) | ||
57 | { | ||
58 | struct ip_vs_conn *cp; | ||
59 | __be16 _ports[2], *pptr; | ||
60 | |||
61 | pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); | ||
62 | if (pptr == NULL) | ||
63 | return NULL; | ||
64 | |||
65 | if (likely(!inverse)) { | ||
66 | cp = ip_vs_conn_out_get(af, iph->protocol, | ||
67 | &iph->saddr, pptr[0], | ||
68 | &iph->daddr, pptr[1]); | ||
69 | } else { | ||
70 | cp = ip_vs_conn_out_get(af, iph->protocol, | ||
71 | &iph->daddr, pptr[1], | ||
72 | &iph->saddr, pptr[0]); | ||
73 | } | ||
74 | |||
75 | return cp; | ||
76 | } | ||
77 | |||
78 | |||
79 | static int | ||
80 | udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
81 | int *verdict, struct ip_vs_conn **cpp) | ||
82 | { | ||
83 | struct ip_vs_service *svc; | ||
84 | struct udphdr _udph, *uh; | ||
85 | struct ip_vs_iphdr iph; | ||
86 | |||
87 | ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); | ||
88 | |||
89 | uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph); | ||
90 | if (uh == NULL) { | ||
91 | *verdict = NF_DROP; | ||
92 | return 0; | ||
93 | } | ||
94 | |||
95 | svc = ip_vs_service_get(af, skb->mark, iph.protocol, | ||
96 | &iph.daddr, uh->dest); | ||
97 | if (svc) { | ||
98 | if (ip_vs_todrop()) { | ||
99 | /* | ||
100 | * It seems that we are very loaded. | ||
101 | * We have to drop this packet :( | ||
102 | */ | ||
103 | ip_vs_service_put(svc); | ||
104 | *verdict = NF_DROP; | ||
105 | return 0; | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * Let the virtual server select a real server for the | ||
110 | * incoming connection, and create a connection entry. | ||
111 | */ | ||
112 | *cpp = ip_vs_schedule(svc, skb); | ||
113 | if (!*cpp) { | ||
114 | *verdict = ip_vs_leave(svc, skb, pp); | ||
115 | return 0; | ||
116 | } | ||
117 | ip_vs_service_put(svc); | ||
118 | } | ||
119 | return 1; | ||
120 | } | ||
121 | |||
122 | |||
123 | static inline void | ||
124 | udp_fast_csum_update(int af, struct udphdr *uhdr, | ||
125 | const union nf_inet_addr *oldip, | ||
126 | const union nf_inet_addr *newip, | ||
127 | __be16 oldport, __be16 newport) | ||
128 | { | ||
129 | #ifdef CONFIG_IP_VS_IPV6 | ||
130 | if (af == AF_INET6) | ||
131 | uhdr->check = | ||
132 | csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, | ||
133 | ip_vs_check_diff2(oldport, newport, | ||
134 | ~csum_unfold(uhdr->check)))); | ||
135 | else | ||
136 | #endif | ||
137 | uhdr->check = | ||
138 | csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, | ||
139 | ip_vs_check_diff2(oldport, newport, | ||
140 | ~csum_unfold(uhdr->check)))); | ||
141 | if (!uhdr->check) | ||
142 | uhdr->check = CSUM_MANGLED_0; | ||
143 | } | ||
144 | |||
145 | static inline void | ||
146 | udp_partial_csum_update(int af, struct udphdr *uhdr, | ||
147 | const union nf_inet_addr *oldip, | ||
148 | const union nf_inet_addr *newip, | ||
149 | __be16 oldlen, __be16 newlen) | ||
150 | { | ||
151 | #ifdef CONFIG_IP_VS_IPV6 | ||
152 | if (af == AF_INET6) | ||
153 | uhdr->check = | ||
154 | csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, | ||
155 | ip_vs_check_diff2(oldlen, newlen, | ||
156 | ~csum_unfold(uhdr->check)))); | ||
157 | else | ||
158 | #endif | ||
159 | uhdr->check = | ||
160 | csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, | ||
161 | ip_vs_check_diff2(oldlen, newlen, | ||
162 | ~csum_unfold(uhdr->check)))); | ||
163 | } | ||
164 | |||
165 | |||
166 | static int | ||
167 | udp_snat_handler(struct sk_buff *skb, | ||
168 | struct ip_vs_protocol *pp, struct ip_vs_conn *cp) | ||
169 | { | ||
170 | struct udphdr *udph; | ||
171 | unsigned int udphoff; | ||
172 | int oldlen; | ||
173 | |||
174 | #ifdef CONFIG_IP_VS_IPV6 | ||
175 | if (cp->af == AF_INET6) | ||
176 | udphoff = sizeof(struct ipv6hdr); | ||
177 | else | ||
178 | #endif | ||
179 | udphoff = ip_hdrlen(skb); | ||
180 | oldlen = skb->len - udphoff; | ||
181 | |||
182 | /* csum_check requires unshared skb */ | ||
183 | if (!skb_make_writable(skb, udphoff+sizeof(*udph))) | ||
184 | return 0; | ||
185 | |||
186 | if (unlikely(cp->app != NULL)) { | ||
187 | /* Some checks before mangling */ | ||
188 | if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) | ||
189 | return 0; | ||
190 | |||
191 | /* | ||
192 | * Call application helper if needed | ||
193 | */ | ||
194 | if (!ip_vs_app_pkt_out(cp, skb)) | ||
195 | return 0; | ||
196 | } | ||
197 | |||
198 | udph = (void *)skb_network_header(skb) + udphoff; | ||
199 | udph->source = cp->vport; | ||
200 | |||
201 | /* | ||
202 | * Adjust UDP checksums | ||
203 | */ | ||
204 | if (skb->ip_summed == CHECKSUM_PARTIAL) { | ||
205 | udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, | ||
206 | htonl(oldlen), | ||
207 | htonl(skb->len - udphoff)); | ||
208 | } else if (!cp->app && (udph->check != 0)) { | ||
209 | /* Only port and addr are changed, do fast csum update */ | ||
210 | udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, | ||
211 | cp->dport, cp->vport); | ||
212 | if (skb->ip_summed == CHECKSUM_COMPLETE) | ||
213 | skb->ip_summed = CHECKSUM_NONE; | ||
214 | } else { | ||
215 | /* full checksum calculation */ | ||
216 | udph->check = 0; | ||
217 | skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); | ||
218 | #ifdef CONFIG_IP_VS_IPV6 | ||
219 | if (cp->af == AF_INET6) | ||
220 | udph->check = csum_ipv6_magic(&cp->vaddr.in6, | ||
221 | &cp->caddr.in6, | ||
222 | skb->len - udphoff, | ||
223 | cp->protocol, skb->csum); | ||
224 | else | ||
225 | #endif | ||
226 | udph->check = csum_tcpudp_magic(cp->vaddr.ip, | ||
227 | cp->caddr.ip, | ||
228 | skb->len - udphoff, | ||
229 | cp->protocol, | ||
230 | skb->csum); | ||
231 | if (udph->check == 0) | ||
232 | udph->check = CSUM_MANGLED_0; | ||
233 | IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", | ||
234 | pp->name, udph->check, | ||
235 | (char*)&(udph->check) - (char*)udph); | ||
236 | } | ||
237 | return 1; | ||
238 | } | ||
239 | |||
240 | |||
241 | static int | ||
242 | udp_dnat_handler(struct sk_buff *skb, | ||
243 | struct ip_vs_protocol *pp, struct ip_vs_conn *cp) | ||
244 | { | ||
245 | struct udphdr *udph; | ||
246 | unsigned int udphoff; | ||
247 | int oldlen; | ||
248 | |||
249 | #ifdef CONFIG_IP_VS_IPV6 | ||
250 | if (cp->af == AF_INET6) | ||
251 | udphoff = sizeof(struct ipv6hdr); | ||
252 | else | ||
253 | #endif | ||
254 | udphoff = ip_hdrlen(skb); | ||
255 | oldlen = skb->len - udphoff; | ||
256 | |||
257 | /* csum_check requires unshared skb */ | ||
258 | if (!skb_make_writable(skb, udphoff+sizeof(*udph))) | ||
259 | return 0; | ||
260 | |||
261 | if (unlikely(cp->app != NULL)) { | ||
262 | /* Some checks before mangling */ | ||
263 | if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) | ||
264 | return 0; | ||
265 | |||
266 | /* | ||
267 | * Attempt ip_vs_app call. | ||
268 | * It will fix ip_vs_conn | ||
269 | */ | ||
270 | if (!ip_vs_app_pkt_in(cp, skb)) | ||
271 | return 0; | ||
272 | } | ||
273 | |||
274 | udph = (void *)skb_network_header(skb) + udphoff; | ||
275 | udph->dest = cp->dport; | ||
276 | |||
277 | /* | ||
278 | * Adjust UDP checksums | ||
279 | */ | ||
280 | if (skb->ip_summed == CHECKSUM_PARTIAL) { | ||
281 | udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, | ||
282 | htonl(oldlen), | ||
283 | htonl(skb->len - udphoff)); | ||
284 | } else if (!cp->app && (udph->check != 0)) { | ||
285 | /* Only port and addr are changed, do fast csum update */ | ||
286 | udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, | ||
287 | cp->vport, cp->dport); | ||
288 | if (skb->ip_summed == CHECKSUM_COMPLETE) | ||
289 | skb->ip_summed = CHECKSUM_NONE; | ||
290 | } else { | ||
291 | /* full checksum calculation */ | ||
292 | udph->check = 0; | ||
293 | skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); | ||
294 | #ifdef CONFIG_IP_VS_IPV6 | ||
295 | if (cp->af == AF_INET6) | ||
296 | udph->check = csum_ipv6_magic(&cp->caddr.in6, | ||
297 | &cp->daddr.in6, | ||
298 | skb->len - udphoff, | ||
299 | cp->protocol, skb->csum); | ||
300 | else | ||
301 | #endif | ||
302 | udph->check = csum_tcpudp_magic(cp->caddr.ip, | ||
303 | cp->daddr.ip, | ||
304 | skb->len - udphoff, | ||
305 | cp->protocol, | ||
306 | skb->csum); | ||
307 | if (udph->check == 0) | ||
308 | udph->check = CSUM_MANGLED_0; | ||
309 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
310 | } | ||
311 | return 1; | ||
312 | } | ||
313 | |||
314 | |||
315 | static int | ||
316 | udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) | ||
317 | { | ||
318 | struct udphdr _udph, *uh; | ||
319 | unsigned int udphoff; | ||
320 | |||
321 | #ifdef CONFIG_IP_VS_IPV6 | ||
322 | if (af == AF_INET6) | ||
323 | udphoff = sizeof(struct ipv6hdr); | ||
324 | else | ||
325 | #endif | ||
326 | udphoff = ip_hdrlen(skb); | ||
327 | |||
328 | uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); | ||
329 | if (uh == NULL) | ||
330 | return 0; | ||
331 | |||
332 | if (uh->check != 0) { | ||
333 | switch (skb->ip_summed) { | ||
334 | case CHECKSUM_NONE: | ||
335 | skb->csum = skb_checksum(skb, udphoff, | ||
336 | skb->len - udphoff, 0); | ||
337 | case CHECKSUM_COMPLETE: | ||
338 | #ifdef CONFIG_IP_VS_IPV6 | ||
339 | if (af == AF_INET6) { | ||
340 | if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, | ||
341 | &ipv6_hdr(skb)->daddr, | ||
342 | skb->len - udphoff, | ||
343 | ipv6_hdr(skb)->nexthdr, | ||
344 | skb->csum)) { | ||
345 | IP_VS_DBG_RL_PKT(0, pp, skb, 0, | ||
346 | "Failed checksum for"); | ||
347 | return 0; | ||
348 | } | ||
349 | } else | ||
350 | #endif | ||
351 | if (csum_tcpudp_magic(ip_hdr(skb)->saddr, | ||
352 | ip_hdr(skb)->daddr, | ||
353 | skb->len - udphoff, | ||
354 | ip_hdr(skb)->protocol, | ||
355 | skb->csum)) { | ||
356 | IP_VS_DBG_RL_PKT(0, pp, skb, 0, | ||
357 | "Failed checksum for"); | ||
358 | return 0; | ||
359 | } | ||
360 | break; | ||
361 | default: | ||
362 | /* No need to checksum. */ | ||
363 | break; | ||
364 | } | ||
365 | } | ||
366 | return 1; | ||
367 | } | ||
368 | |||
369 | |||
370 | /* | ||
371 | * Note: the caller guarantees that only one of register_app, | ||
372 | * unregister_app or app_conn_bind is called each time. | ||
373 | */ | ||
374 | |||
375 | #define UDP_APP_TAB_BITS 4 /* log2 of the number of buckets */ | ||
376 | #define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) /* number of buckets */ | ||
377 | #define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) /* reduces a hash value to a bucket index */ | ||
378 | |||
379 | static struct list_head udp_apps[UDP_APP_TAB_SIZE]; /* buckets; entries are chained through ip_vs_app.p_list */ | ||
380 | static DEFINE_SPINLOCK(udp_app_lock); /* held around every walk or update of udp_apps in this file */ | ||
381 | |||
382 | static inline __u16 udp_app_hashkey(__be16 port) | ||
383 | { | ||
384 | return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) | ||
385 | & UDP_APP_TAB_MASK; | ||
386 | } | ||
387 | |||
388 | |||
389 | static int udp_register_app(struct ip_vs_app *inc) | ||
390 | { | ||
391 | struct ip_vs_app *i; | ||
392 | __u16 hash; | ||
393 | __be16 port = inc->port; | ||
394 | int ret = 0; | ||
395 | |||
396 | hash = udp_app_hashkey(port); | ||
397 | |||
398 | |||
399 | spin_lock_bh(&udp_app_lock); | ||
400 | list_for_each_entry(i, &udp_apps[hash], p_list) { | ||
401 | if (i->port == port) { | ||
402 | ret = -EEXIST; | ||
403 | goto out; | ||
404 | } | ||
405 | } | ||
406 | list_add(&inc->p_list, &udp_apps[hash]); | ||
407 | atomic_inc(&ip_vs_protocol_udp.appcnt); | ||
408 | |||
409 | out: | ||
410 | spin_unlock_bh(&udp_app_lock); | ||
411 | return ret; | ||
412 | } | ||
413 | |||
414 | |||
415 | static void | ||
416 | udp_unregister_app(struct ip_vs_app *inc) | ||
417 | { | ||
418 | spin_lock_bh(&udp_app_lock); | ||
419 | atomic_dec(&ip_vs_protocol_udp.appcnt); | ||
420 | list_del(&inc->p_list); | ||
421 | spin_unlock_bh(&udp_app_lock); | ||
422 | } | ||
423 | |||
424 | |||
425 | static int udp_app_conn_bind(struct ip_vs_conn *cp) | ||
426 | { | ||
427 | int hash; | ||
428 | struct ip_vs_app *inc; | ||
429 | int result = 0; | ||
430 | |||
431 | /* Default binding: bind app only for NAT */ | ||
432 | if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) | ||
433 | return 0; | ||
434 | |||
435 | /* Lookup application incarnations and bind the right one */ | ||
436 | hash = udp_app_hashkey(cp->vport); | ||
437 | |||
438 | spin_lock(&udp_app_lock); | ||
439 | list_for_each_entry(inc, &udp_apps[hash], p_list) { | ||
440 | if (inc->port == cp->vport) { | ||
441 | if (unlikely(!ip_vs_app_inc_get(inc))) | ||
442 | break; | ||
443 | spin_unlock(&udp_app_lock); | ||
444 | |||
445 | IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" | ||
446 | "%s:%u to app %s on port %u\n", | ||
447 | __func__, | ||
448 | IP_VS_DBG_ADDR(cp->af, &cp->caddr), | ||
449 | ntohs(cp->cport), | ||
450 | IP_VS_DBG_ADDR(cp->af, &cp->vaddr), | ||
451 | ntohs(cp->vport), | ||
452 | inc->name, ntohs(inc->port)); | ||
453 | |||
454 | cp->app = inc; | ||
455 | if (inc->init_conn) | ||
456 | result = inc->init_conn(inc, cp); | ||
457 | goto out; | ||
458 | } | ||
459 | } | ||
460 | spin_unlock(&udp_app_lock); | ||
461 | |||
462 | out: | ||
463 | return result; | ||
464 | } | ||
465 | |||
466 | |||
467 | static int udp_timeouts[IP_VS_UDP_S_LAST+1] = { /* per-state timeouts; installed as pp->timeout_table by udp_init() */ | ||
468 | [IP_VS_UDP_S_NORMAL] = 5*60*HZ, /* 5 * 60 * HZ ticks; the value udp_state_transition() loads into cp->timeout */ | ||
469 | [IP_VS_UDP_S_LAST] = 2*HZ, /* extra slot one past the real states */ | ||
470 | }; | ||
471 | |||
472 | static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = { /* state names, indexed by state; read by udp_state_name() and udp_set_state_timeout() */ | ||
473 | [IP_VS_UDP_S_NORMAL] = "UDP", | ||
474 | [IP_VS_UDP_S_LAST] = "BUG!", /* extra slot one past the real states; udp_state_name() never returns it */ | ||
475 | }; | ||
476 | |||
477 | |||
478 | static int | ||
479 | udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) | ||
480 | { | ||
481 | return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST, | ||
482 | udp_state_name_table, sname, to); | ||
483 | } | ||
484 | |||
485 | static const char * udp_state_name(int state) | ||
486 | { | ||
487 | if (state >= IP_VS_UDP_S_LAST) | ||
488 | return "ERR!"; | ||
489 | return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; | ||
490 | } | ||
491 | |||
492 | static int | ||
493 | udp_state_transition(struct ip_vs_conn *cp, int direction, | ||
494 | const struct sk_buff *skb, | ||
495 | struct ip_vs_protocol *pp) | ||
496 | { | ||
497 | cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; | ||
498 | return 1; | ||
499 | } | ||
500 | |||
501 | static void udp_init(struct ip_vs_protocol *pp) | ||
502 | { | ||
503 | IP_VS_INIT_HASH_TABLE(udp_apps); | ||
504 | pp->timeout_table = udp_timeouts; | ||
505 | } | ||
506 | |||
/*
 * Protocol exit hook: does nothing.
 */
static void udp_exit(struct ip_vs_protocol *pp)
{
	/* intentionally empty */
}
510 | |||
511 | |||
512 | struct ip_vs_protocol ip_vs_protocol_udp = { /* UDP protocol descriptor: name, protocol number, state count and handler callbacks; .appcnt is not listed, so it takes the implicit zero value */ | ||
513 | .name = "UDP", | ||
514 | .protocol = IPPROTO_UDP, | ||
515 | .num_states = IP_VS_UDP_S_LAST, | ||
516 | .dont_defrag = 0, | ||
517 | .init = udp_init, | ||
518 | .exit = udp_exit, | ||
519 | .conn_schedule = udp_conn_schedule, | ||
520 | .conn_in_get = udp_conn_in_get, | ||
521 | .conn_out_get = udp_conn_out_get, | ||
522 | .snat_handler = udp_snat_handler, | ||
523 | .dnat_handler = udp_dnat_handler, | ||
524 | .csum_check = udp_csum_check, | ||
525 | .state_transition = udp_state_transition, | ||
526 | .state_name = udp_state_name, | ||
527 | .register_app = udp_register_app, | ||
528 | .unregister_app = udp_unregister_app, | ||
529 | .app_conn_bind = udp_app_conn_bind, | ||
530 | .debug_packet = ip_vs_tcpudp_debug_packet, | ||
531 | .timeout_change = NULL, /* no timeout_change hook for UDP */ | ||
532 | .set_state_timeout = udp_set_state_timeout, | ||
533 | }; | ||
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c deleted file mode 100644 index a22195f68ac4..000000000000 --- a/net/ipv4/ipvs/ip_vs_rr.c +++ /dev/null | |||
@@ -1,112 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Round-Robin Scheduling module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * Peter Kese <peter.kese@ijs.si> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | * | ||
12 | * Fixes/Changes: | ||
13 | * Wensong Zhang : changed the ip_vs_rr_schedule to return dest | ||
14 | * Julian Anastasov : fixed the NULL pointer access bug in debugging | ||
15 | * Wensong Zhang : changed some cosmetic things for debugging | ||
16 | * Wensong Zhang : changed for the d-linked destination list | ||
17 | * Wensong Zhang : added the ip_vs_rr_update_svc | ||
18 | * Wensong Zhang : added any dest with weight=0 is quiesced | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | #include <linux/module.h> | ||
23 | #include <linux/kernel.h> | ||
24 | |||
25 | #include <net/ip_vs.h> | ||
26 | |||
27 | |||
28 | static int ip_vs_rr_init_svc(struct ip_vs_service *svc) | ||
29 | { | ||
30 | svc->sched_data = &svc->destinations; | ||
31 | return 0; | ||
32 | } | ||
33 | |||
34 | |||
35 | static int ip_vs_rr_update_svc(struct ip_vs_service *svc) | ||
36 | { | ||
37 | svc->sched_data = &svc->destinations; | ||
38 | return 0; | ||
39 | } | ||
40 | |||
41 | |||
42 | /* | ||
43 | * Round-Robin Scheduling | ||
44 | */ | ||
45 | static struct ip_vs_dest * | ||
46 | ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
47 | { | ||
48 | struct list_head *p, *q; | ||
49 | struct ip_vs_dest *dest; | ||
50 | |||
51 | IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n"); | ||
52 | |||
53 | write_lock(&svc->sched_lock); | ||
54 | p = (struct list_head *)svc->sched_data; | ||
55 | p = p->next; | ||
56 | q = p; | ||
57 | do { | ||
58 | /* skip list head */ | ||
59 | if (q == &svc->destinations) { | ||
60 | q = q->next; | ||
61 | continue; | ||
62 | } | ||
63 | |||
64 | dest = list_entry(q, struct ip_vs_dest, n_list); | ||
65 | if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && | ||
66 | atomic_read(&dest->weight) > 0) | ||
67 | /* HIT */ | ||
68 | goto out; | ||
69 | q = q->next; | ||
70 | } while (q != p); | ||
71 | write_unlock(&svc->sched_lock); | ||
72 | return NULL; | ||
73 | |||
74 | out: | ||
75 | svc->sched_data = q; | ||
76 | write_unlock(&svc->sched_lock); | ||
77 | IP_VS_DBG_BUF(6, "RR: server %s:%u " | ||
78 | "activeconns %d refcnt %d weight %d\n", | ||
79 | IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), | ||
80 | atomic_read(&dest->activeconns), | ||
81 | atomic_read(&dest->refcnt), atomic_read(&dest->weight)); | ||
82 | |||
83 | return dest; | ||
84 | } | ||
85 | |||
86 | |||
87 | static struct ip_vs_scheduler ip_vs_rr_scheduler = { /* descriptor passed to register_ip_vs_scheduler() / unregister_ip_vs_scheduler() below */ | ||
88 | .name = "rr", /* name */ | ||
89 | .refcnt = ATOMIC_INIT(0), | ||
90 | .module = THIS_MODULE, | ||
91 | .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), | ||
92 | #ifdef CONFIG_IP_VS_IPV6 | ||
93 | .supports_ipv6 = 1, /* field is only set when IPv6 support is configured */ | ||
94 | #endif | ||
95 | .init_service = ip_vs_rr_init_svc, /* puts the cursor at the list head */ | ||
96 | .update_service = ip_vs_rr_update_svc, /* resets the cursor to the list head */ | ||
97 | .schedule = ip_vs_rr_schedule, /* picks the next eligible destination */ | ||
98 | }; | ||
99 | |||
100 | static int __init ip_vs_rr_init(void) | ||
101 | { | ||
102 | return register_ip_vs_scheduler(&ip_vs_rr_scheduler); | ||
103 | } | ||
104 | |||
105 | static void __exit ip_vs_rr_cleanup(void) | ||
106 | { | ||
107 | unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); | ||
108 | } | ||
109 | |||
110 | module_init(ip_vs_rr_init); /* ip_vs_rr_init() registers the "rr" scheduler */ | ||
111 | module_exit(ip_vs_rr_cleanup); /* ip_vs_rr_cleanup() unregisters it again */ | ||
112 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c deleted file mode 100644 index a46ad9e35016..000000000000 --- a/net/ipv4/ipvs/ip_vs_sched.c +++ /dev/null | |||
@@ -1,251 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS An implementation of the IP virtual server support for the | ||
3 | * LINUX operating system. IPVS is now implemented as a module | ||
4 | * over the Netfilter framework. IPVS can be used to build a | ||
5 | * high-performance and highly available server based on a | ||
6 | * cluster of servers. | ||
7 | * | ||
8 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
9 | * Peter Kese <peter.kese@ijs.si> | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public License | ||
13 | * as published by the Free Software Foundation; either version | ||
14 | * 2 of the License, or (at your option) any later version. | ||
15 | * | ||
16 | * Changes: | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | #include <linux/module.h> | ||
21 | #include <linux/spinlock.h> | ||
22 | #include <linux/interrupt.h> | ||
23 | #include <asm/string.h> | ||
24 | #include <linux/kmod.h> | ||
25 | #include <linux/sysctl.h> | ||
26 | |||
27 | #include <net/ip_vs.h> | ||
28 | |||
29 | /* | ||
30 | * IPVS scheduler list | ||
31 | */ | ||
32 | static LIST_HEAD(ip_vs_schedulers); | ||
33 | |||
34 | /* lock protecting the ip_vs_schedulers list */ | ||
35 | static DEFINE_RWLOCK(__ip_vs_sched_lock); | ||
36 | |||
37 | |||
38 | /* | ||
39 | * Bind a service with a scheduler | ||
40 | */ | ||
41 | int ip_vs_bind_scheduler(struct ip_vs_service *svc, | ||
42 | struct ip_vs_scheduler *scheduler) | ||
43 | { | ||
44 | int ret; | ||
45 | |||
46 | if (svc == NULL) { | ||
47 | IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n"); | ||
48 | return -EINVAL; | ||
49 | } | ||
50 | if (scheduler == NULL) { | ||
51 | IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n"); | ||
52 | return -EINVAL; | ||
53 | } | ||
54 | |||
55 | svc->scheduler = scheduler; | ||
56 | |||
57 | if (scheduler->init_service) { | ||
58 | ret = scheduler->init_service(svc); | ||
59 | if (ret) { | ||
60 | IP_VS_ERR("ip_vs_bind_scheduler(): init error\n"); | ||
61 | return ret; | ||
62 | } | ||
63 | } | ||
64 | |||
65 | return 0; | ||
66 | } | ||
67 | |||
68 | |||
69 | /* | ||
70 | * Unbind a service with its scheduler | ||
71 | */ | ||
72 | int ip_vs_unbind_scheduler(struct ip_vs_service *svc) | ||
73 | { | ||
74 | struct ip_vs_scheduler *sched; | ||
75 | |||
76 | if (svc == NULL) { | ||
77 | IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n"); | ||
78 | return -EINVAL; | ||
79 | } | ||
80 | |||
81 | sched = svc->scheduler; | ||
82 | if (sched == NULL) { | ||
83 | IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n"); | ||
84 | return -EINVAL; | ||
85 | } | ||
86 | |||
87 | if (sched->done_service) { | ||
88 | if (sched->done_service(svc) != 0) { | ||
89 | IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n"); | ||
90 | return -EINVAL; | ||
91 | } | ||
92 | } | ||
93 | |||
94 | svc->scheduler = NULL; | ||
95 | return 0; | ||
96 | } | ||
97 | |||
98 | |||
99 | /* | ||
100 | * Get scheduler in the scheduler list by name | ||
101 | */ | ||
102 | static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) | ||
103 | { | ||
104 | struct ip_vs_scheduler *sched; | ||
105 | |||
106 | IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n", | ||
107 | sched_name); | ||
108 | |||
109 | read_lock_bh(&__ip_vs_sched_lock); | ||
110 | |||
111 | list_for_each_entry(sched, &ip_vs_schedulers, n_list) { | ||
112 | /* | ||
113 | * Test and get the modules atomically | ||
114 | */ | ||
115 | if (sched->module && !try_module_get(sched->module)) { | ||
116 | /* | ||
117 | * This scheduler is just deleted | ||
118 | */ | ||
119 | continue; | ||
120 | } | ||
121 | if (strcmp(sched_name, sched->name)==0) { | ||
122 | /* HIT */ | ||
123 | read_unlock_bh(&__ip_vs_sched_lock); | ||
124 | return sched; | ||
125 | } | ||
126 | if (sched->module) | ||
127 | module_put(sched->module); | ||
128 | } | ||
129 | |||
130 | read_unlock_bh(&__ip_vs_sched_lock); | ||
131 | return NULL; | ||
132 | } | ||
133 | |||
134 | |||
/*
 *  Find a scheduler by name; if it is not registered yet, ask for the
 *  "ip_vs_<name>" module to be loaded and look once more.
 *  Returns the scheduler, or NULL if it is still not found.
 */
struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
{
	struct ip_vs_scheduler *hit = ip_vs_sched_getbyname(sched_name);

	if (hit)
		return hit;

	/* Not registered: try loading the module, then search again. */
	request_module("ip_vs_%s", sched_name);
	return ip_vs_sched_getbyname(sched_name);
}
157 | |||
158 | void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) | ||
159 | { | ||
160 | if (scheduler->module) | ||
161 | module_put(scheduler->module); | ||
162 | } | ||
163 | |||
164 | |||
165 | /* | ||
166 | * Register a scheduler in the scheduler list | ||
167 | */ | ||
168 | int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) | ||
169 | { | ||
170 | struct ip_vs_scheduler *sched; | ||
171 | |||
172 | if (!scheduler) { | ||
173 | IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n"); | ||
174 | return -EINVAL; | ||
175 | } | ||
176 | |||
177 | if (!scheduler->name) { | ||
178 | IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n"); | ||
179 | return -EINVAL; | ||
180 | } | ||
181 | |||
182 | /* increase the module use count */ | ||
183 | ip_vs_use_count_inc(); | ||
184 | |||
185 | write_lock_bh(&__ip_vs_sched_lock); | ||
186 | |||
187 | if (!list_empty(&scheduler->n_list)) { | ||
188 | write_unlock_bh(&__ip_vs_sched_lock); | ||
189 | ip_vs_use_count_dec(); | ||
190 | IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " | ||
191 | "already linked\n", scheduler->name); | ||
192 | return -EINVAL; | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * Make sure that the scheduler with this name doesn't exist | ||
197 | * in the scheduler list. | ||
198 | */ | ||
199 | list_for_each_entry(sched, &ip_vs_schedulers, n_list) { | ||
200 | if (strcmp(scheduler->name, sched->name) == 0) { | ||
201 | write_unlock_bh(&__ip_vs_sched_lock); | ||
202 | ip_vs_use_count_dec(); | ||
203 | IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " | ||
204 | "already existed in the system\n", | ||
205 | scheduler->name); | ||
206 | return -EINVAL; | ||
207 | } | ||
208 | } | ||
209 | /* | ||
210 | * Add it into the d-linked scheduler list | ||
211 | */ | ||
212 | list_add(&scheduler->n_list, &ip_vs_schedulers); | ||
213 | write_unlock_bh(&__ip_vs_sched_lock); | ||
214 | |||
215 | IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name); | ||
216 | |||
217 | return 0; | ||
218 | } | ||
219 | |||
220 | |||
221 | /* | ||
222 | * Unregister a scheduler from the scheduler list | ||
223 | */ | ||
224 | int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) | ||
225 | { | ||
226 | if (!scheduler) { | ||
227 | IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n"); | ||
228 | return -EINVAL; | ||
229 | } | ||
230 | |||
231 | write_lock_bh(&__ip_vs_sched_lock); | ||
232 | if (list_empty(&scheduler->n_list)) { | ||
233 | write_unlock_bh(&__ip_vs_sched_lock); | ||
234 | IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler " | ||
235 | "is not in the list. failed\n", scheduler->name); | ||
236 | return -EINVAL; | ||
237 | } | ||
238 | |||
239 | /* | ||
240 | * Remove it from the d-linked scheduler list | ||
241 | */ | ||
242 | list_del(&scheduler->n_list); | ||
243 | write_unlock_bh(&__ip_vs_sched_lock); | ||
244 | |||
245 | /* decrease the module use count */ | ||
246 | ip_vs_use_count_dec(); | ||
247 | |||
248 | IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name); | ||
249 | |||
250 | return 0; | ||
251 | } | ||
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c deleted file mode 100644 index 7d2f22f04b83..000000000000 --- a/net/ipv4/ipvs/ip_vs_sed.c +++ /dev/null | |||
@@ -1,140 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Shortest Expected Delay scheduling module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Changes: | ||
12 | * | ||
13 | */ | ||
14 | |||
15 | /* | ||
16 | * The SED algorithm attempts to minimize each job's expected delay until | ||
17 | * completion. The expected delay that the job will experience is | ||
18 | * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of | ||
19 | * jobs on the ith server and Ui is the fixed service rate (weight) of | ||
20 | * the ith server. The SED algorithm adopts a greedy policy that each does | ||
21 | * what is in its own best interest, i.e. to join the queue which would | ||
22 | * minimize its expected delay of completion. | ||
23 | * | ||
24 | * See the following paper for more information: | ||
25 | * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing | ||
26 | * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, | ||
27 | * pages 986-994, 1988. | ||
28 | * | ||
29 | * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me. | ||
30 | * | ||
31 | * The difference between SED and WLC is that SED includes the incoming | ||
32 | * job in the cost function (the increment of 1). SED may outperform | ||
33 | * WLC, while scheduling big jobs under larger heterogeneous systems | ||
34 | * (the server weight varies a lot). | ||
35 | * | ||
36 | */ | ||
37 | |||
38 | #include <linux/module.h> | ||
39 | #include <linux/kernel.h> | ||
40 | |||
41 | #include <net/ip_vs.h> | ||
42 | |||
43 | |||
44 | static inline unsigned int | ||
45 | ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) | ||
46 | { | ||
47 | /* | ||
48 | * We only use the active connection number in the cost | ||
49 | * calculation here. | ||
50 | */ | ||
51 | return atomic_read(&dest->activeconns) + 1; | ||
52 | } | ||
53 | |||
54 | |||
55 | /* | ||
56 | * Weighted Least Connection scheduling | ||
57 | */ | ||
58 | static struct ip_vs_dest * | ||
59 | ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
60 | { | ||
61 | struct ip_vs_dest *dest, *least; | ||
62 | unsigned int loh, doh; | ||
63 | |||
64 | IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n"); | ||
65 | |||
66 | /* | ||
67 | * We calculate the load of each dest server as follows: | ||
68 | * (server expected overhead) / dest->weight | ||
69 | * | ||
70 | * Remember -- no floats in kernel mode!!! | ||
71 | * The comparison of h1*w2 > h2*w1 is equivalent to that of | ||
72 | * h1/w1 > h2/w2 | ||
73 | * if every weight is larger than zero. | ||
74 | * | ||
75 | * The server with weight=0 is quiesced and will not receive any | ||
76 | * new connections. | ||
77 | */ | ||
78 | |||
79 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
80 | if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && | ||
81 | atomic_read(&dest->weight) > 0) { | ||
82 | least = dest; | ||
83 | loh = ip_vs_sed_dest_overhead(least); | ||
84 | goto nextstage; | ||
85 | } | ||
86 | } | ||
87 | return NULL; | ||
88 | |||
89 | /* | ||
90 | * Find the destination with the least load. | ||
91 | */ | ||
92 | nextstage: | ||
93 | list_for_each_entry_continue(dest, &svc->destinations, n_list) { | ||
94 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) | ||
95 | continue; | ||
96 | doh = ip_vs_sed_dest_overhead(dest); | ||
97 | if (loh * atomic_read(&dest->weight) > | ||
98 | doh * atomic_read(&least->weight)) { | ||
99 | least = dest; | ||
100 | loh = doh; | ||
101 | } | ||
102 | } | ||
103 | |||
104 | IP_VS_DBG_BUF(6, "SED: server %s:%u " | ||
105 | "activeconns %d refcnt %d weight %d overhead %d\n", | ||
106 | IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), | ||
107 | atomic_read(&least->activeconns), | ||
108 | atomic_read(&least->refcnt), | ||
109 | atomic_read(&least->weight), loh); | ||
110 | |||
111 | return least; | ||
112 | } | ||
113 | |||
114 | |||
115 | static struct ip_vs_scheduler ip_vs_sed_scheduler = | ||
116 | { | ||
117 | .name = "sed", | ||
118 | .refcnt = ATOMIC_INIT(0), | ||
119 | .module = THIS_MODULE, | ||
120 | .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), | ||
121 | #ifdef CONFIG_IP_VS_IPV6 | ||
122 | .supports_ipv6 = 1, | ||
123 | #endif | ||
124 | .schedule = ip_vs_sed_schedule, | ||
125 | }; | ||
126 | |||
127 | |||
128 | static int __init ip_vs_sed_init(void) | ||
129 | { | ||
130 | return register_ip_vs_scheduler(&ip_vs_sed_scheduler); | ||
131 | } | ||
132 | |||
133 | static void __exit ip_vs_sed_cleanup(void) | ||
134 | { | ||
135 | unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); | ||
136 | } | ||
137 | |||
138 | module_init(ip_vs_sed_init); | ||
139 | module_exit(ip_vs_sed_cleanup); | ||
140 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c deleted file mode 100644 index 1d96de27fefd..000000000000 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ /dev/null | |||
@@ -1,258 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Source Hashing scheduling module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@gnuchina.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Changes: | ||
12 | * | ||
13 | */ | ||
14 | |||
15 | /* | ||
16 | * The sh algorithm is to select server by the hash key of source IP | ||
17 | * address. The pseudo code is as follows: | ||
18 | * | ||
19 | * n <- servernode[src_ip]; | ||
20 | * if (n is dead) OR | ||
21 | * (n is overloaded) or (n.weight <= 0) then | ||
22 | * return NULL; | ||
23 | * | ||
24 | * return n; | ||
25 | * | ||
26 | * Notes that servernode is a 256-bucket hash table that maps the hash | ||
27 | * index derived from packet source IP address to the current server | ||
28 | * array. If the sh scheduler is used in cache cluster, it is good to | ||
29 | * combine it with cache_bypass feature. When the statically assigned | ||
30 | * server is dead or overloaded, the load balancer can bypass the cache | ||
31 | * server and send requests to the original server directly. | ||
32 | * | ||
33 | */ | ||
34 | |||
35 | #include <linux/ip.h> | ||
36 | #include <linux/module.h> | ||
37 | #include <linux/kernel.h> | ||
38 | #include <linux/skbuff.h> | ||
39 | |||
40 | #include <net/ip_vs.h> | ||
41 | |||
42 | |||
/*
 *      One slot of the SH hash table.  A slot either holds a destination
 *      (with a refcnt taken by ip_vs_sh_assign()) or NULL.
 */
struct ip_vs_sh_bucket {
	struct ip_vs_dest       *dest;          /* real server (cache) */
};
49 | |||
50 | /* | ||
51 | * for IPVS SH entry hash table | ||
52 | */ | ||
53 | #ifndef CONFIG_IP_VS_SH_TAB_BITS | ||
54 | #define CONFIG_IP_VS_SH_TAB_BITS 8 | ||
55 | #endif | ||
56 | #define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS | ||
57 | #define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) | ||
58 | #define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) | ||
59 | |||
60 | |||
61 | /* | ||
62 | * Returns hash value for IPVS SH entry | ||
63 | */ | ||
64 | static inline unsigned ip_vs_sh_hashkey(__be32 addr) | ||
65 | { | ||
66 | return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK; | ||
67 | } | ||
68 | |||
69 | |||
70 | /* | ||
71 | * Get ip_vs_dest associated with supplied parameters. | ||
72 | */ | ||
73 | static inline struct ip_vs_dest * | ||
74 | ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr) | ||
75 | { | ||
76 | return (tbl[ip_vs_sh_hashkey(addr)]).dest; | ||
77 | } | ||
78 | |||
79 | |||
80 | /* | ||
81 | * Assign all the hash buckets of the specified table with the service. | ||
82 | */ | ||
83 | static int | ||
84 | ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) | ||
85 | { | ||
86 | int i; | ||
87 | struct ip_vs_sh_bucket *b; | ||
88 | struct list_head *p; | ||
89 | struct ip_vs_dest *dest; | ||
90 | |||
91 | b = tbl; | ||
92 | p = &svc->destinations; | ||
93 | for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { | ||
94 | if (list_empty(p)) { | ||
95 | b->dest = NULL; | ||
96 | } else { | ||
97 | if (p == &svc->destinations) | ||
98 | p = p->next; | ||
99 | |||
100 | dest = list_entry(p, struct ip_vs_dest, n_list); | ||
101 | atomic_inc(&dest->refcnt); | ||
102 | b->dest = dest; | ||
103 | |||
104 | p = p->next; | ||
105 | } | ||
106 | b++; | ||
107 | } | ||
108 | return 0; | ||
109 | } | ||
110 | |||
111 | |||
112 | /* | ||
113 | * Flush all the hash buckets of the specified table. | ||
114 | */ | ||
115 | static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) | ||
116 | { | ||
117 | int i; | ||
118 | struct ip_vs_sh_bucket *b; | ||
119 | |||
120 | b = tbl; | ||
121 | for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { | ||
122 | if (b->dest) { | ||
123 | atomic_dec(&b->dest->refcnt); | ||
124 | b->dest = NULL; | ||
125 | } | ||
126 | b++; | ||
127 | } | ||
128 | } | ||
129 | |||
130 | |||
131 | static int ip_vs_sh_init_svc(struct ip_vs_service *svc) | ||
132 | { | ||
133 | struct ip_vs_sh_bucket *tbl; | ||
134 | |||
135 | /* allocate the SH table for this service */ | ||
136 | tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, | ||
137 | GFP_ATOMIC); | ||
138 | if (tbl == NULL) { | ||
139 | IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n"); | ||
140 | return -ENOMEM; | ||
141 | } | ||
142 | svc->sched_data = tbl; | ||
143 | IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " | ||
144 | "current service\n", | ||
145 | sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); | ||
146 | |||
147 | /* assign the hash buckets with the updated service */ | ||
148 | ip_vs_sh_assign(tbl, svc); | ||
149 | |||
150 | return 0; | ||
151 | } | ||
152 | |||
153 | |||
154 | static int ip_vs_sh_done_svc(struct ip_vs_service *svc) | ||
155 | { | ||
156 | struct ip_vs_sh_bucket *tbl = svc->sched_data; | ||
157 | |||
158 | /* got to clean up hash buckets here */ | ||
159 | ip_vs_sh_flush(tbl); | ||
160 | |||
161 | /* release the table itself */ | ||
162 | kfree(svc->sched_data); | ||
163 | IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", | ||
164 | sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); | ||
165 | |||
166 | return 0; | ||
167 | } | ||
168 | |||
169 | |||
170 | static int ip_vs_sh_update_svc(struct ip_vs_service *svc) | ||
171 | { | ||
172 | struct ip_vs_sh_bucket *tbl = svc->sched_data; | ||
173 | |||
174 | /* got to clean up hash buckets here */ | ||
175 | ip_vs_sh_flush(tbl); | ||
176 | |||
177 | /* assign the hash buckets with the updated service */ | ||
178 | ip_vs_sh_assign(tbl, svc); | ||
179 | |||
180 | return 0; | ||
181 | } | ||
182 | |||
183 | |||
184 | /* | ||
185 | * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, | ||
186 | * consider that the server is overloaded here. | ||
187 | */ | ||
188 | static inline int is_overloaded(struct ip_vs_dest *dest) | ||
189 | { | ||
190 | return dest->flags & IP_VS_DEST_F_OVERLOAD; | ||
191 | } | ||
192 | |||
193 | |||
194 | /* | ||
195 | * Source Hashing scheduling | ||
196 | */ | ||
197 | static struct ip_vs_dest * | ||
198 | ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
199 | { | ||
200 | struct ip_vs_dest *dest; | ||
201 | struct ip_vs_sh_bucket *tbl; | ||
202 | struct iphdr *iph = ip_hdr(skb); | ||
203 | |||
204 | IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); | ||
205 | |||
206 | tbl = (struct ip_vs_sh_bucket *)svc->sched_data; | ||
207 | dest = ip_vs_sh_get(tbl, iph->saddr); | ||
208 | if (!dest | ||
209 | || !(dest->flags & IP_VS_DEST_F_AVAILABLE) | ||
210 | || atomic_read(&dest->weight) <= 0 | ||
211 | || is_overloaded(dest)) { | ||
212 | return NULL; | ||
213 | } | ||
214 | |||
215 | IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u " | ||
216 | "--> server %u.%u.%u.%u:%d\n", | ||
217 | NIPQUAD(iph->saddr), | ||
218 | NIPQUAD(dest->addr.ip), | ||
219 | ntohs(dest->port)); | ||
220 | |||
221 | return dest; | ||
222 | } | ||
223 | |||
224 | |||
225 | /* | ||
226 | * IPVS SH Scheduler structure | ||
227 | */ | ||
228 | static struct ip_vs_scheduler ip_vs_sh_scheduler = | ||
229 | { | ||
230 | .name = "sh", | ||
231 | .refcnt = ATOMIC_INIT(0), | ||
232 | .module = THIS_MODULE, | ||
233 | .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), | ||
234 | #ifdef CONFIG_IP_VS_IPV6 | ||
235 | .supports_ipv6 = 0, | ||
236 | #endif | ||
237 | .init_service = ip_vs_sh_init_svc, | ||
238 | .done_service = ip_vs_sh_done_svc, | ||
239 | .update_service = ip_vs_sh_update_svc, | ||
240 | .schedule = ip_vs_sh_schedule, | ||
241 | }; | ||
242 | |||
243 | |||
244 | static int __init ip_vs_sh_init(void) | ||
245 | { | ||
246 | return register_ip_vs_scheduler(&ip_vs_sh_scheduler); | ||
247 | } | ||
248 | |||
249 | |||
250 | static void __exit ip_vs_sh_cleanup(void) | ||
251 | { | ||
252 | unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); | ||
253 | } | ||
254 | |||
255 | |||
256 | module_init(ip_vs_sh_init); | ||
257 | module_exit(ip_vs_sh_cleanup); | ||
258 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c deleted file mode 100644 index de5e7e118eed..000000000000 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ /dev/null | |||
@@ -1,942 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS An implementation of the IP virtual server support for the | ||
3 | * LINUX operating system. IPVS is now implemented as a module | ||
4 | * over the NetFilter framework. IPVS can be used to build a | ||
5 | * high-performance and highly available server based on a | ||
6 | * cluster of servers. | ||
7 | * | ||
8 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
9 | * | ||
10 | * ip_vs_sync: sync connection info from master load balancer to backups | ||
11 | * through multicast | ||
12 | * | ||
13 | * Changes: | ||
14 | * Alexandre Cassen : Added master & backup support at a time. | ||
15 | * Alexandre Cassen : Added SyncID support for incoming sync | ||
16 | * messages filtering. | ||
17 | * Justin Ossevoort : Fix endian problem on sync message size. | ||
18 | */ | ||
19 | |||
20 | #include <linux/module.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/inetdevice.h> | ||
23 | #include <linux/net.h> | ||
24 | #include <linux/completion.h> | ||
25 | #include <linux/delay.h> | ||
26 | #include <linux/skbuff.h> | ||
27 | #include <linux/in.h> | ||
28 | #include <linux/igmp.h> /* for ip_mc_join_group */ | ||
29 | #include <linux/udp.h> | ||
30 | #include <linux/err.h> | ||
31 | #include <linux/kthread.h> | ||
32 | #include <linux/wait.h> | ||
33 | #include <linux/kernel.h> | ||
34 | |||
35 | #include <net/ip.h> | ||
36 | #include <net/sock.h> | ||
37 | |||
38 | #include <net/ip_vs.h> | ||
39 | |||
40 | #define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ | ||
41 | #define IP_VS_SYNC_PORT 8848 /* multicast port */ | ||
42 | |||
43 | |||
44 | /* | ||
45 | * IPVS sync connection entry | ||
46 | */ | ||
47 | struct ip_vs_sync_conn { | ||
48 | __u8 reserved; | ||
49 | |||
50 | /* Protocol, addresses and port numbers */ | ||
51 | __u8 protocol; /* Which protocol (TCP/UDP) */ | ||
52 | __be16 cport; | ||
53 | __be16 vport; | ||
54 | __be16 dport; | ||
55 | __be32 caddr; /* client address */ | ||
56 | __be32 vaddr; /* virtual address */ | ||
57 | __be32 daddr; /* destination address */ | ||
58 | |||
59 | /* Flags and state transition */ | ||
60 | __be16 flags; /* status flags */ | ||
61 | __be16 state; /* state info */ | ||
62 | |||
63 | /* The sequence options start here */ | ||
64 | }; | ||
65 | |||
66 | struct ip_vs_sync_conn_options { | ||
67 | struct ip_vs_seq in_seq; /* incoming seq. struct */ | ||
68 | struct ip_vs_seq out_seq; /* outgoing seq. struct */ | ||
69 | }; | ||
70 | |||
/* Per-thread data for a sync daemon thread: its socket and a buffer. */
struct ip_vs_sync_thread_data {
	struct socket *sock;
	char *buf;
};
75 | |||
76 | #define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) | ||
77 | #define FULL_CONN_SIZE \ | ||
78 | (sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) | ||
79 | |||
80 | |||
81 | /* | ||
82 | The master multicasts messages to the backup load balancers in the | ||
83 | following format. | ||
84 | |||
85 | 0 1 2 3 | ||
86 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
87 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
88 | | Count Conns | SyncID | Size | | ||
89 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
90 | | | | ||
91 | | IPVS Sync Connection (1) | | ||
92 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
93 | | . | | ||
94 | | . | | ||
95 | | . | | ||
96 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
97 | | | | ||
98 | | IPVS Sync Connection (n) | | ||
99 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
100 | */ | ||
101 | |||
102 | #define SYNC_MESG_HEADER_LEN 4 | ||
103 | #define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ | ||
104 | |||
105 | struct ip_vs_sync_mesg { | ||
106 | __u8 nr_conns; | ||
107 | __u8 syncid; | ||
108 | __u16 size; | ||
109 | |||
110 | /* ip_vs_sync_conn entries start here */ | ||
111 | }; | ||
112 | |||
113 | /* the maximum length of sync (sending/receiving) message */ | ||
114 | static int sync_send_mesg_maxlen; | ||
115 | static int sync_recv_mesg_maxlen; | ||
116 | |||
117 | struct ip_vs_sync_buff { | ||
118 | struct list_head list; | ||
119 | unsigned long firstuse; | ||
120 | |||
121 | /* pointers for the message data */ | ||
122 | struct ip_vs_sync_mesg *mesg; | ||
123 | unsigned char *head; | ||
124 | unsigned char *end; | ||
125 | }; | ||
126 | |||
127 | |||
128 | /* the sync_buff list head and the lock */ | ||
129 | static LIST_HEAD(ip_vs_sync_queue); | ||
130 | static DEFINE_SPINLOCK(ip_vs_sync_lock); | ||
131 | |||
132 | /* current sync_buff for accepting new conn entries */ | ||
133 | static struct ip_vs_sync_buff *curr_sb = NULL; | ||
134 | static DEFINE_SPINLOCK(curr_sb_lock); | ||
135 | |||
136 | /* ipvs sync daemon state */ | ||
137 | volatile int ip_vs_sync_state = IP_VS_STATE_NONE; | ||
138 | volatile int ip_vs_master_syncid = 0; | ||
139 | volatile int ip_vs_backup_syncid = 0; | ||
140 | |||
141 | /* multicast interface name */ | ||
142 | char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; | ||
143 | char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; | ||
144 | |||
145 | /* sync daemon tasks */ | ||
146 | static struct task_struct *sync_master_thread; | ||
147 | static struct task_struct *sync_backup_thread; | ||
148 | |||
149 | /* multicast addr */ | ||
150 | static struct sockaddr_in mcast_addr = { | ||
151 | .sin_family = AF_INET, | ||
152 | .sin_port = __constant_htons(IP_VS_SYNC_PORT), | ||
153 | .sin_addr.s_addr = __constant_htonl(IP_VS_SYNC_GROUP), | ||
154 | }; | ||
155 | |||
156 | |||
157 | static inline struct ip_vs_sync_buff *sb_dequeue(void) | ||
158 | { | ||
159 | struct ip_vs_sync_buff *sb; | ||
160 | |||
161 | spin_lock_bh(&ip_vs_sync_lock); | ||
162 | if (list_empty(&ip_vs_sync_queue)) { | ||
163 | sb = NULL; | ||
164 | } else { | ||
165 | sb = list_entry(ip_vs_sync_queue.next, | ||
166 | struct ip_vs_sync_buff, | ||
167 | list); | ||
168 | list_del(&sb->list); | ||
169 | } | ||
170 | spin_unlock_bh(&ip_vs_sync_lock); | ||
171 | |||
172 | return sb; | ||
173 | } | ||
174 | |||
175 | static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) | ||
176 | { | ||
177 | struct ip_vs_sync_buff *sb; | ||
178 | |||
179 | if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) | ||
180 | return NULL; | ||
181 | |||
182 | if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { | ||
183 | kfree(sb); | ||
184 | return NULL; | ||
185 | } | ||
186 | sb->mesg->nr_conns = 0; | ||
187 | sb->mesg->syncid = ip_vs_master_syncid; | ||
188 | sb->mesg->size = 4; | ||
189 | sb->head = (unsigned char *)sb->mesg + 4; | ||
190 | sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; | ||
191 | sb->firstuse = jiffies; | ||
192 | return sb; | ||
193 | } | ||
194 | |||
195 | static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) | ||
196 | { | ||
197 | kfree(sb->mesg); | ||
198 | kfree(sb); | ||
199 | } | ||
200 | |||
201 | static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) | ||
202 | { | ||
203 | spin_lock(&ip_vs_sync_lock); | ||
204 | if (ip_vs_sync_state & IP_VS_STATE_MASTER) | ||
205 | list_add_tail(&sb->list, &ip_vs_sync_queue); | ||
206 | else | ||
207 | ip_vs_sync_buff_release(sb); | ||
208 | spin_unlock(&ip_vs_sync_lock); | ||
209 | } | ||
210 | |||
211 | /* | ||
212 | * Get the current sync buffer if it has been created for more | ||
213 | * than the specified time or the specified time is zero. | ||
214 | */ | ||
215 | static inline struct ip_vs_sync_buff * | ||
216 | get_curr_sync_buff(unsigned long time) | ||
217 | { | ||
218 | struct ip_vs_sync_buff *sb; | ||
219 | |||
220 | spin_lock_bh(&curr_sb_lock); | ||
221 | if (curr_sb && (time == 0 || | ||
222 | time_before(jiffies - curr_sb->firstuse, time))) { | ||
223 | sb = curr_sb; | ||
224 | curr_sb = NULL; | ||
225 | } else | ||
226 | sb = NULL; | ||
227 | spin_unlock_bh(&curr_sb_lock); | ||
228 | return sb; | ||
229 | } | ||
230 | |||
231 | |||
232 | /* | ||
233 | * Add an ip_vs_conn information into the current sync_buff. | ||
234 | * Called by ip_vs_in. | ||
235 | */ | ||
236 | void ip_vs_sync_conn(struct ip_vs_conn *cp) | ||
237 | { | ||
238 | struct ip_vs_sync_mesg *m; | ||
239 | struct ip_vs_sync_conn *s; | ||
240 | int len; | ||
241 | |||
242 | spin_lock(&curr_sb_lock); | ||
243 | if (!curr_sb) { | ||
244 | if (!(curr_sb=ip_vs_sync_buff_create())) { | ||
245 | spin_unlock(&curr_sb_lock); | ||
246 | IP_VS_ERR("ip_vs_sync_buff_create failed.\n"); | ||
247 | return; | ||
248 | } | ||
249 | } | ||
250 | |||
251 | len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : | ||
252 | SIMPLE_CONN_SIZE; | ||
253 | m = curr_sb->mesg; | ||
254 | s = (struct ip_vs_sync_conn *)curr_sb->head; | ||
255 | |||
256 | /* copy members */ | ||
257 | s->protocol = cp->protocol; | ||
258 | s->cport = cp->cport; | ||
259 | s->vport = cp->vport; | ||
260 | s->dport = cp->dport; | ||
261 | s->caddr = cp->caddr.ip; | ||
262 | s->vaddr = cp->vaddr.ip; | ||
263 | s->daddr = cp->daddr.ip; | ||
264 | s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); | ||
265 | s->state = htons(cp->state); | ||
266 | if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { | ||
267 | struct ip_vs_sync_conn_options *opt = | ||
268 | (struct ip_vs_sync_conn_options *)&s[1]; | ||
269 | memcpy(opt, &cp->in_seq, sizeof(*opt)); | ||
270 | } | ||
271 | |||
272 | m->nr_conns++; | ||
273 | m->size += len; | ||
274 | curr_sb->head += len; | ||
275 | |||
276 | /* check if there is a space for next one */ | ||
277 | if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { | ||
278 | sb_queue_tail(curr_sb); | ||
279 | curr_sb = NULL; | ||
280 | } | ||
281 | spin_unlock(&curr_sb_lock); | ||
282 | |||
283 | /* synchronize its controller if it has */ | ||
284 | if (cp->control) | ||
285 | ip_vs_sync_conn(cp->control); | ||
286 | } | ||
287 | |||
288 | |||
/*
 *      Process received multicast message and create the corresponding
 *      ip_vs_conn entries.
 *
 *      @buffer: start of the received sync message (header + conn entries)
 *      @buflen: number of bytes received
 *
 *      The message is dropped as a whole when the header is too short, when
 *      the size field does not match @buflen, or when the syncid does not
 *      match a non-zero ip_vs_backup_syncid.  Individual entries with an
 *      unsupported protocol or an out-of-range state are skipped.
 *
 *      NOTE: the size field is converted to host byte order in place, i.e.
 *      the function writes into @buffer although it is declared const.
 */
static void ip_vs_process_message(const char *buffer, const size_t buflen)
{
	struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
	struct ip_vs_sync_conn *s;
	struct ip_vs_sync_conn_options *opt;
	struct ip_vs_conn *cp;
	struct ip_vs_protocol *pp;
	struct ip_vs_dest *dest;
	char *p;
	int i;

	if (buflen < sizeof(struct ip_vs_sync_mesg)) {
		IP_VS_ERR_RL("sync message header too short\n");
		return;
	}

	/* Convert size back to host byte order */
	m->size = ntohs(m->size);

	if (buflen != m->size) {
		IP_VS_ERR_RL("bogus sync message size\n");
		return;
	}

	/* SyncID sanity check */
	if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
		IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
			  m->syncid);
		return;
	}

	/* p walks the connection entries that follow the message header */
	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
	for (i=0; i<m->nr_conns; i++) {
		unsigned flags, state;

		/* every entry is at least SIMPLE_CONN_SIZE bytes long */
		if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
			IP_VS_ERR_RL("bogus conn in sync message\n");
			return;
		}
		s = (struct ip_vs_sync_conn *) p;
		/* mark as synced, never trust the sender's HASHED bit */
		flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
		flags &= ~IP_VS_CONN_F_HASHED;
		if (flags & IP_VS_CONN_F_SEQ_MASK) {
			/* options directly follow the fixed part */
			opt = (struct ip_vs_sync_conn_options *)&s[1];
			p += FULL_CONN_SIZE;
			if (p > buffer+buflen) {
				IP_VS_ERR_RL("bogus conn options in sync message\n");
				return;
			}
		} else {
			opt = NULL;
			p += SIMPLE_CONN_SIZE;
		}

		/* validate the state; p already points to the next entry */
		state = ntohs(s->state);
		if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
			pp = ip_vs_proto_get(s->protocol);
			if (!pp) {
				IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n",
					     s->protocol);
				continue;
			}
			if (state >= pp->num_states) {
				IP_VS_DBG(2, "Invalid %s state %u in sync msg\n",
					  pp->name, state);
				continue;
			}
		} else {
			/* protocol in templates is not used for state/timeout */
			pp = NULL;
			if (state > 0) {
				IP_VS_DBG(2, "Invalid template state %u in sync msg\n",
					  state);
				state = 0;
			}
		}

		/* look up an existing connection or template first */
		if (!(flags & IP_VS_CONN_F_TEMPLATE))
			cp = ip_vs_conn_in_get(AF_INET, s->protocol,
					       (union nf_inet_addr *)&s->caddr,
					       s->cport,
					       (union nf_inet_addr *)&s->vaddr,
					       s->vport);
		else
			cp = ip_vs_ct_in_get(AF_INET, s->protocol,
					     (union nf_inet_addr *)&s->caddr,
					     s->cport,
					     (union nf_inet_addr *)&s->vaddr,
					     s->vport);
		if (!cp) {
			/*
			 * Find the appropriate destination for the connection.
			 * If it is not found the connection will remain unbound
			 * but still handled.
			 */
			dest = ip_vs_find_dest(AF_INET,
					       (union nf_inet_addr *)&s->daddr,
					       s->dport,
					       (union nf_inet_addr *)&s->vaddr,
					       s->vport,
					       s->protocol);
			/* Set the appropriate activity flag */
			if (s->protocol == IPPROTO_TCP) {
				if (state != IP_VS_TCP_S_ESTABLISHED)
					flags |= IP_VS_CONN_F_INACTIVE;
				else
					flags &= ~IP_VS_CONN_F_INACTIVE;
			}
			cp = ip_vs_conn_new(AF_INET, s->protocol,
					    (union nf_inet_addr *)&s->caddr,
					    s->cport,
					    (union nf_inet_addr *)&s->vaddr,
					    s->vport,
					    (union nf_inet_addr *)&s->daddr,
					    s->dport,
					    flags, dest);
			/*
			 * NOTE(review): presumably drops the reference taken
			 * by ip_vs_find_dest() -- confirm against its contract.
			 */
			if (dest)
				atomic_dec(&dest->refcnt);
			if (!cp) {
				/* the rest of the message is abandoned */
				IP_VS_ERR("ip_vs_conn_new failed\n");
				return;
			}
		} else if (!cp->dest) {
			/* known connection without destination: try to bind */
			dest = ip_vs_try_bind_dest(cp);
			if (dest)
				atomic_dec(&dest->refcnt);
		} else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
			   (cp->state != state)) {
			/* update active/inactive flag for the connection */
			dest = cp->dest;
			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
			    (state != IP_VS_TCP_S_ESTABLISHED)) {
				atomic_dec(&dest->activeconns);
				atomic_inc(&dest->inactconns);
				cp->flags |= IP_VS_CONN_F_INACTIVE;
			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
				   (state == IP_VS_TCP_S_ESTABLISHED)) {
				atomic_inc(&dest->activeconns);
				atomic_dec(&dest->inactconns);
				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
			}
		}

		/* copy the sequence options, if the entry carried any */
		if (opt)
			memcpy(&cp->in_seq, opt, sizeof(*opt));
		atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
		cp->state = state;
		cp->old_state = cp->state;
		/*
		 * We can not recover the right timeout for templates
		 * in all cases, we can not find the right fwmark
		 * virtual service. If needed, we can do it for
		 * non-fwmark persistent services.
		 */
		/* pp is only dereferenced for non-templates, where it is set */
		if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
			cp->timeout = pp->timeout_table[state];
		else
			cp->timeout = (3*60*HZ);
		ip_vs_conn_put(cp);
	}
}
454 | |||
455 | |||
456 | /* | ||
457 | * Setup loopback of outgoing multicasts on a sending socket | ||
458 | */ | ||
459 | static void set_mcast_loop(struct sock *sk, u_char loop) | ||
460 | { | ||
461 | struct inet_sock *inet = inet_sk(sk); | ||
462 | |||
463 | /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ | ||
464 | lock_sock(sk); | ||
465 | inet->mc_loop = loop ? 1 : 0; | ||
466 | release_sock(sk); | ||
467 | } | ||
468 | |||
469 | /* | ||
470 | * Specify TTL for outgoing multicasts on a sending socket | ||
471 | */ | ||
472 | static void set_mcast_ttl(struct sock *sk, u_char ttl) | ||
473 | { | ||
474 | struct inet_sock *inet = inet_sk(sk); | ||
475 | |||
476 | /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ | ||
477 | lock_sock(sk); | ||
478 | inet->mc_ttl = ttl; | ||
479 | release_sock(sk); | ||
480 | } | ||
481 | |||
482 | /* | ||
483 | * Specifiy default interface for outgoing multicasts | ||
484 | */ | ||
485 | static int set_mcast_if(struct sock *sk, char *ifname) | ||
486 | { | ||
487 | struct net_device *dev; | ||
488 | struct inet_sock *inet = inet_sk(sk); | ||
489 | |||
490 | if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) | ||
491 | return -ENODEV; | ||
492 | |||
493 | if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) | ||
494 | return -EINVAL; | ||
495 | |||
496 | lock_sock(sk); | ||
497 | inet->mc_index = dev->ifindex; | ||
498 | /* inet->mc_addr = 0; */ | ||
499 | release_sock(sk); | ||
500 | |||
501 | return 0; | ||
502 | } | ||
503 | |||
504 | |||
505 | /* | ||
506 | * Set the maximum length of sync message according to the | ||
507 | * specified interface's MTU. | ||
508 | */ | ||
509 | static int set_sync_mesg_maxlen(int sync_state) | ||
510 | { | ||
511 | struct net_device *dev; | ||
512 | int num; | ||
513 | |||
514 | if (sync_state == IP_VS_STATE_MASTER) { | ||
515 | if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) | ||
516 | return -ENODEV; | ||
517 | |||
518 | num = (dev->mtu - sizeof(struct iphdr) - | ||
519 | sizeof(struct udphdr) - | ||
520 | SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; | ||
521 | sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN + | ||
522 | SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); | ||
523 | IP_VS_DBG(7, "setting the maximum length of sync sending " | ||
524 | "message %d.\n", sync_send_mesg_maxlen); | ||
525 | } else if (sync_state == IP_VS_STATE_BACKUP) { | ||
526 | if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) | ||
527 | return -ENODEV; | ||
528 | |||
529 | sync_recv_mesg_maxlen = dev->mtu - | ||
530 | sizeof(struct iphdr) - sizeof(struct udphdr); | ||
531 | IP_VS_DBG(7, "setting the maximum length of sync receiving " | ||
532 | "message %d.\n", sync_recv_mesg_maxlen); | ||
533 | } | ||
534 | |||
535 | return 0; | ||
536 | } | ||
537 | |||
538 | |||
539 | /* | ||
540 | * Join a multicast group. | ||
541 | * the group is specified by a class D multicast address 224.0.0.0/8 | ||
542 | * in the in_addr structure passed in as a parameter. | ||
543 | */ | ||
544 | static int | ||
545 | join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) | ||
546 | { | ||
547 | struct ip_mreqn mreq; | ||
548 | struct net_device *dev; | ||
549 | int ret; | ||
550 | |||
551 | memset(&mreq, 0, sizeof(mreq)); | ||
552 | memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); | ||
553 | |||
554 | if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) | ||
555 | return -ENODEV; | ||
556 | if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) | ||
557 | return -EINVAL; | ||
558 | |||
559 | mreq.imr_ifindex = dev->ifindex; | ||
560 | |||
561 | lock_sock(sk); | ||
562 | ret = ip_mc_join_group(sk, &mreq); | ||
563 | release_sock(sk); | ||
564 | |||
565 | return ret; | ||
566 | } | ||
567 | |||
568 | |||
569 | static int bind_mcastif_addr(struct socket *sock, char *ifname) | ||
570 | { | ||
571 | struct net_device *dev; | ||
572 | __be32 addr; | ||
573 | struct sockaddr_in sin; | ||
574 | |||
575 | if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) | ||
576 | return -ENODEV; | ||
577 | |||
578 | addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); | ||
579 | if (!addr) | ||
580 | IP_VS_ERR("You probably need to specify IP address on " | ||
581 | "multicast interface.\n"); | ||
582 | |||
583 | IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n", | ||
584 | ifname, NIPQUAD(addr)); | ||
585 | |||
586 | /* Now bind the socket with the address of multicast interface */ | ||
587 | sin.sin_family = AF_INET; | ||
588 | sin.sin_addr.s_addr = addr; | ||
589 | sin.sin_port = 0; | ||
590 | |||
591 | return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); | ||
592 | } | ||
593 | |||
594 | /* | ||
595 | * Set up sending multicast socket over UDP | ||
596 | */ | ||
597 | static struct socket * make_send_sock(void) | ||
598 | { | ||
599 | struct socket *sock; | ||
600 | int result; | ||
601 | |||
602 | /* First create a socket */ | ||
603 | result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); | ||
604 | if (result < 0) { | ||
605 | IP_VS_ERR("Error during creation of socket; terminating\n"); | ||
606 | return ERR_PTR(result); | ||
607 | } | ||
608 | |||
609 | result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn); | ||
610 | if (result < 0) { | ||
611 | IP_VS_ERR("Error setting outbound mcast interface\n"); | ||
612 | goto error; | ||
613 | } | ||
614 | |||
615 | set_mcast_loop(sock->sk, 0); | ||
616 | set_mcast_ttl(sock->sk, 1); | ||
617 | |||
618 | result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn); | ||
619 | if (result < 0) { | ||
620 | IP_VS_ERR("Error binding address of the mcast interface\n"); | ||
621 | goto error; | ||
622 | } | ||
623 | |||
624 | result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, | ||
625 | sizeof(struct sockaddr), 0); | ||
626 | if (result < 0) { | ||
627 | IP_VS_ERR("Error connecting to the multicast addr\n"); | ||
628 | goto error; | ||
629 | } | ||
630 | |||
631 | return sock; | ||
632 | |||
633 | error: | ||
634 | sock_release(sock); | ||
635 | return ERR_PTR(result); | ||
636 | } | ||
637 | |||
638 | |||
639 | /* | ||
640 | * Set up receiving multicast socket over UDP | ||
641 | */ | ||
642 | static struct socket * make_receive_sock(void) | ||
643 | { | ||
644 | struct socket *sock; | ||
645 | int result; | ||
646 | |||
647 | /* First create a socket */ | ||
648 | result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); | ||
649 | if (result < 0) { | ||
650 | IP_VS_ERR("Error during creation of socket; terminating\n"); | ||
651 | return ERR_PTR(result); | ||
652 | } | ||
653 | |||
654 | /* it is equivalent to the REUSEADDR option in user-space */ | ||
655 | sock->sk->sk_reuse = 1; | ||
656 | |||
657 | result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, | ||
658 | sizeof(struct sockaddr)); | ||
659 | if (result < 0) { | ||
660 | IP_VS_ERR("Error binding to the multicast addr\n"); | ||
661 | goto error; | ||
662 | } | ||
663 | |||
664 | /* join the multicast group */ | ||
665 | result = join_mcast_group(sock->sk, | ||
666 | (struct in_addr *) &mcast_addr.sin_addr, | ||
667 | ip_vs_backup_mcast_ifn); | ||
668 | if (result < 0) { | ||
669 | IP_VS_ERR("Error joining to the multicast group\n"); | ||
670 | goto error; | ||
671 | } | ||
672 | |||
673 | return sock; | ||
674 | |||
675 | error: | ||
676 | sock_release(sock); | ||
677 | return ERR_PTR(result); | ||
678 | } | ||
679 | |||
680 | |||
681 | static int | ||
682 | ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) | ||
683 | { | ||
684 | struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; | ||
685 | struct kvec iov; | ||
686 | int len; | ||
687 | |||
688 | EnterFunction(7); | ||
689 | iov.iov_base = (void *)buffer; | ||
690 | iov.iov_len = length; | ||
691 | |||
692 | len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); | ||
693 | |||
694 | LeaveFunction(7); | ||
695 | return len; | ||
696 | } | ||
697 | |||
698 | static void | ||
699 | ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) | ||
700 | { | ||
701 | int msize; | ||
702 | |||
703 | msize = msg->size; | ||
704 | |||
705 | /* Put size in network byte order */ | ||
706 | msg->size = htons(msg->size); | ||
707 | |||
708 | if (ip_vs_send_async(sock, (char *)msg, msize) != msize) | ||
709 | IP_VS_ERR("ip_vs_send_async error\n"); | ||
710 | } | ||
711 | |||
712 | static int | ||
713 | ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) | ||
714 | { | ||
715 | struct msghdr msg = {NULL,}; | ||
716 | struct kvec iov; | ||
717 | int len; | ||
718 | |||
719 | EnterFunction(7); | ||
720 | |||
721 | /* Receive a packet */ | ||
722 | iov.iov_base = buffer; | ||
723 | iov.iov_len = (size_t)buflen; | ||
724 | |||
725 | len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); | ||
726 | |||
727 | if (len < 0) | ||
728 | return -1; | ||
729 | |||
730 | LeaveFunction(7); | ||
731 | return len; | ||
732 | } | ||
733 | |||
734 | |||
735 | static int sync_thread_master(void *data) | ||
736 | { | ||
737 | struct ip_vs_sync_thread_data *tinfo = data; | ||
738 | struct ip_vs_sync_buff *sb; | ||
739 | |||
740 | IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, " | ||
741 | "syncid = %d\n", | ||
742 | ip_vs_master_mcast_ifn, ip_vs_master_syncid); | ||
743 | |||
744 | while (!kthread_should_stop()) { | ||
745 | while ((sb = sb_dequeue())) { | ||
746 | ip_vs_send_sync_msg(tinfo->sock, sb->mesg); | ||
747 | ip_vs_sync_buff_release(sb); | ||
748 | } | ||
749 | |||
750 | /* check if entries stay in curr_sb for 2 seconds */ | ||
751 | sb = get_curr_sync_buff(2 * HZ); | ||
752 | if (sb) { | ||
753 | ip_vs_send_sync_msg(tinfo->sock, sb->mesg); | ||
754 | ip_vs_sync_buff_release(sb); | ||
755 | } | ||
756 | |||
757 | schedule_timeout_interruptible(HZ); | ||
758 | } | ||
759 | |||
760 | /* clean up the sync_buff queue */ | ||
761 | while ((sb=sb_dequeue())) { | ||
762 | ip_vs_sync_buff_release(sb); | ||
763 | } | ||
764 | |||
765 | /* clean up the current sync_buff */ | ||
766 | if ((sb = get_curr_sync_buff(0))) { | ||
767 | ip_vs_sync_buff_release(sb); | ||
768 | } | ||
769 | |||
770 | /* release the sending multicast socket */ | ||
771 | sock_release(tinfo->sock); | ||
772 | kfree(tinfo); | ||
773 | |||
774 | return 0; | ||
775 | } | ||
776 | |||
777 | |||
778 | static int sync_thread_backup(void *data) | ||
779 | { | ||
780 | struct ip_vs_sync_thread_data *tinfo = data; | ||
781 | int len; | ||
782 | |||
783 | IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, " | ||
784 | "syncid = %d\n", | ||
785 | ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); | ||
786 | |||
787 | while (!kthread_should_stop()) { | ||
788 | wait_event_interruptible(*tinfo->sock->sk->sk_sleep, | ||
789 | !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) | ||
790 | || kthread_should_stop()); | ||
791 | |||
792 | /* do we have data now? */ | ||
793 | while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { | ||
794 | len = ip_vs_receive(tinfo->sock, tinfo->buf, | ||
795 | sync_recv_mesg_maxlen); | ||
796 | if (len <= 0) { | ||
797 | IP_VS_ERR("receiving message error\n"); | ||
798 | break; | ||
799 | } | ||
800 | |||
801 | /* disable bottom half, because it accesses the data | ||
802 | shared by softirq while getting/creating conns */ | ||
803 | local_bh_disable(); | ||
804 | ip_vs_process_message(tinfo->buf, len); | ||
805 | local_bh_enable(); | ||
806 | } | ||
807 | } | ||
808 | |||
809 | /* release the sending multicast socket */ | ||
810 | sock_release(tinfo->sock); | ||
811 | kfree(tinfo->buf); | ||
812 | kfree(tinfo); | ||
813 | |||
814 | return 0; | ||
815 | } | ||
816 | |||
817 | |||
818 | int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) | ||
819 | { | ||
820 | struct ip_vs_sync_thread_data *tinfo; | ||
821 | struct task_struct **realtask, *task; | ||
822 | struct socket *sock; | ||
823 | char *name, *buf = NULL; | ||
824 | int (*threadfn)(void *data); | ||
825 | int result = -ENOMEM; | ||
826 | |||
827 | IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); | ||
828 | IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", | ||
829 | sizeof(struct ip_vs_sync_conn)); | ||
830 | |||
831 | if (state == IP_VS_STATE_MASTER) { | ||
832 | if (sync_master_thread) | ||
833 | return -EEXIST; | ||
834 | |||
835 | strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, | ||
836 | sizeof(ip_vs_master_mcast_ifn)); | ||
837 | ip_vs_master_syncid = syncid; | ||
838 | realtask = &sync_master_thread; | ||
839 | name = "ipvs_syncmaster"; | ||
840 | threadfn = sync_thread_master; | ||
841 | sock = make_send_sock(); | ||
842 | } else if (state == IP_VS_STATE_BACKUP) { | ||
843 | if (sync_backup_thread) | ||
844 | return -EEXIST; | ||
845 | |||
846 | strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, | ||
847 | sizeof(ip_vs_backup_mcast_ifn)); | ||
848 | ip_vs_backup_syncid = syncid; | ||
849 | realtask = &sync_backup_thread; | ||
850 | name = "ipvs_syncbackup"; | ||
851 | threadfn = sync_thread_backup; | ||
852 | sock = make_receive_sock(); | ||
853 | } else { | ||
854 | return -EINVAL; | ||
855 | } | ||
856 | |||
857 | if (IS_ERR(sock)) { | ||
858 | result = PTR_ERR(sock); | ||
859 | goto out; | ||
860 | } | ||
861 | |||
862 | set_sync_mesg_maxlen(state); | ||
863 | if (state == IP_VS_STATE_BACKUP) { | ||
864 | buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL); | ||
865 | if (!buf) | ||
866 | goto outsocket; | ||
867 | } | ||
868 | |||
869 | tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); | ||
870 | if (!tinfo) | ||
871 | goto outbuf; | ||
872 | |||
873 | tinfo->sock = sock; | ||
874 | tinfo->buf = buf; | ||
875 | |||
876 | task = kthread_run(threadfn, tinfo, name); | ||
877 | if (IS_ERR(task)) { | ||
878 | result = PTR_ERR(task); | ||
879 | goto outtinfo; | ||
880 | } | ||
881 | |||
882 | /* mark as active */ | ||
883 | *realtask = task; | ||
884 | ip_vs_sync_state |= state; | ||
885 | |||
886 | /* increase the module use count */ | ||
887 | ip_vs_use_count_inc(); | ||
888 | |||
889 | return 0; | ||
890 | |||
891 | outtinfo: | ||
892 | kfree(tinfo); | ||
893 | outbuf: | ||
894 | kfree(buf); | ||
895 | outsocket: | ||
896 | sock_release(sock); | ||
897 | out: | ||
898 | return result; | ||
899 | } | ||
900 | |||
901 | |||
902 | int stop_sync_thread(int state) | ||
903 | { | ||
904 | IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); | ||
905 | |||
906 | if (state == IP_VS_STATE_MASTER) { | ||
907 | if (!sync_master_thread) | ||
908 | return -ESRCH; | ||
909 | |||
910 | IP_VS_INFO("stopping master sync thread %d ...\n", | ||
911 | task_pid_nr(sync_master_thread)); | ||
912 | |||
913 | /* | ||
914 | * The lock synchronizes with sb_queue_tail(), so that we don't | ||
915 | * add sync buffers to the queue, when we are already in | ||
916 | * progress of stopping the master sync daemon. | ||
917 | */ | ||
918 | |||
919 | spin_lock_bh(&ip_vs_sync_lock); | ||
920 | ip_vs_sync_state &= ~IP_VS_STATE_MASTER; | ||
921 | spin_unlock_bh(&ip_vs_sync_lock); | ||
922 | kthread_stop(sync_master_thread); | ||
923 | sync_master_thread = NULL; | ||
924 | } else if (state == IP_VS_STATE_BACKUP) { | ||
925 | if (!sync_backup_thread) | ||
926 | return -ESRCH; | ||
927 | |||
928 | IP_VS_INFO("stopping backup sync thread %d ...\n", | ||
929 | task_pid_nr(sync_backup_thread)); | ||
930 | |||
931 | ip_vs_sync_state &= ~IP_VS_STATE_BACKUP; | ||
932 | kthread_stop(sync_backup_thread); | ||
933 | sync_backup_thread = NULL; | ||
934 | } else { | ||
935 | return -EINVAL; | ||
936 | } | ||
937 | |||
938 | /* decrease the module use count */ | ||
939 | ip_vs_use_count_dec(); | ||
940 | |||
941 | return 0; | ||
942 | } | ||
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c deleted file mode 100644 index 8c596e712599..000000000000 --- a/net/ipv4/ipvs/ip_vs_wlc.c +++ /dev/null | |||
@@ -1,128 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Weighted Least-Connection Scheduling module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * Peter Kese <peter.kese@ijs.si> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | * | ||
12 | * Changes: | ||
13 | * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest | ||
14 | * Wensong Zhang : changed to use the inactconns in scheduling | ||
15 | * Wensong Zhang : changed some cosmetic things for debugging | ||
16 | * Wensong Zhang : changed for the d-linked destination list | ||
17 | * Wensong Zhang : added the ip_vs_wlc_update_svc | ||
18 | * Wensong Zhang : added any dest with weight=0 is quiesced | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | #include <linux/module.h> | ||
23 | #include <linux/kernel.h> | ||
24 | |||
25 | #include <net/ip_vs.h> | ||
26 | |||
27 | |||
28 | static inline unsigned int | ||
29 | ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) | ||
30 | { | ||
31 | /* | ||
32 | * We think the overhead of processing active connections is 256 | ||
33 | * times higher than that of inactive connections in average. (This | ||
34 | * 256 times might not be accurate, we will change it later) We | ||
35 | * use the following formula to estimate the overhead now: | ||
36 | * dest->activeconns*256 + dest->inactconns | ||
37 | */ | ||
38 | return (atomic_read(&dest->activeconns) << 8) + | ||
39 | atomic_read(&dest->inactconns); | ||
40 | } | ||
41 | |||
42 | |||
43 | /* | ||
44 | * Weighted Least Connection scheduling | ||
45 | */ | ||
46 | static struct ip_vs_dest * | ||
47 | ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
48 | { | ||
49 | struct ip_vs_dest *dest, *least; | ||
50 | unsigned int loh, doh; | ||
51 | |||
52 | IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); | ||
53 | |||
54 | /* | ||
55 | * We calculate the load of each dest server as follows: | ||
56 | * (dest overhead) / dest->weight | ||
57 | * | ||
58 | * Remember -- no floats in kernel mode!!! | ||
59 | * The comparison of h1*w2 > h2*w1 is equivalent to that of | ||
60 | * h1/w1 > h2/w2 | ||
61 | * if every weight is larger than zero. | ||
62 | * | ||
63 | * The server with weight=0 is quiesced and will not receive any | ||
64 | * new connections. | ||
65 | */ | ||
66 | |||
67 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
68 | if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && | ||
69 | atomic_read(&dest->weight) > 0) { | ||
70 | least = dest; | ||
71 | loh = ip_vs_wlc_dest_overhead(least); | ||
72 | goto nextstage; | ||
73 | } | ||
74 | } | ||
75 | return NULL; | ||
76 | |||
77 | /* | ||
78 | * Find the destination with the least load. | ||
79 | */ | ||
80 | nextstage: | ||
81 | list_for_each_entry_continue(dest, &svc->destinations, n_list) { | ||
82 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) | ||
83 | continue; | ||
84 | doh = ip_vs_wlc_dest_overhead(dest); | ||
85 | if (loh * atomic_read(&dest->weight) > | ||
86 | doh * atomic_read(&least->weight)) { | ||
87 | least = dest; | ||
88 | loh = doh; | ||
89 | } | ||
90 | } | ||
91 | |||
92 | IP_VS_DBG_BUF(6, "WLC: server %s:%u " | ||
93 | "activeconns %d refcnt %d weight %d overhead %d\n", | ||
94 | IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), | ||
95 | atomic_read(&least->activeconns), | ||
96 | atomic_read(&least->refcnt), | ||
97 | atomic_read(&least->weight), loh); | ||
98 | |||
99 | return least; | ||
100 | } | ||
101 | |||
102 | |||
103 | static struct ip_vs_scheduler ip_vs_wlc_scheduler = | ||
104 | { | ||
105 | .name = "wlc", | ||
106 | .refcnt = ATOMIC_INIT(0), | ||
107 | .module = THIS_MODULE, | ||
108 | .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list), | ||
109 | #ifdef CONFIG_IP_VS_IPV6 | ||
110 | .supports_ipv6 = 1, | ||
111 | #endif | ||
112 | .schedule = ip_vs_wlc_schedule, | ||
113 | }; | ||
114 | |||
115 | |||
116 | static int __init ip_vs_wlc_init(void) | ||
117 | { | ||
118 | return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); | ||
119 | } | ||
120 | |||
121 | static void __exit ip_vs_wlc_cleanup(void) | ||
122 | { | ||
123 | unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); | ||
124 | } | ||
125 | |||
/* hook the wlc scheduler's init/cleanup into module load/unload */
module_init(ip_vs_wlc_init);
module_exit(ip_vs_wlc_cleanup);
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c deleted file mode 100644 index 7ea92fed50bf..000000000000 --- a/net/ipv4/ipvs/ip_vs_wrr.c +++ /dev/null | |||
@@ -1,237 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Weighted Round-Robin Scheduling module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Changes: | ||
12 | * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest | ||
13 | * Wensong Zhang : changed some cosmetic things for debugging | ||
14 | * Wensong Zhang : changed for the d-linked destination list | ||
15 | * Wensong Zhang : added the ip_vs_wrr_update_svc | ||
16 | * Julian Anastasov : fixed the bug of returning destination | ||
17 | * with weight 0 when all weights are zero | ||
18 | * | ||
19 | */ | ||
20 | |||
21 | #include <linux/module.h> | ||
22 | #include <linux/kernel.h> | ||
23 | #include <linux/net.h> | ||
24 | |||
25 | #include <net/ip_vs.h> | ||
26 | |||
/*
 * current destination pointer for weighted round-robin scheduling
 *
 * One instance is allocated per service by ip_vs_wrr_init_svc() and kept
 * in svc->sched_data.
 */
struct ip_vs_wrr_mark {
	struct list_head *cl;	/* current list head */
	int cw;			/* current weight */
	int mw;			/* maximum weight */
	int di;			/* decreasing interval */
};
36 | |||
37 | |||
/*
 *    Get the gcd of server weights
 *
 *    Euclid's algorithm on (a, b).  gcd(a, 0) returns a instead of
 *    evaluating a % 0, which would be undefined behaviour.  Callers are
 *    expected to pass non-negative values.
 */
static int gcd(int a, int b)
{
	int c;

	/* guard the modulo below against a zero divisor */
	if (b == 0)
		return a;

	while ((c = a % b)) {
		a = b;
		b = c;
	}
	return b;
}
51 | |||
52 | static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) | ||
53 | { | ||
54 | struct ip_vs_dest *dest; | ||
55 | int weight; | ||
56 | int g = 0; | ||
57 | |||
58 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
59 | weight = atomic_read(&dest->weight); | ||
60 | if (weight > 0) { | ||
61 | if (g > 0) | ||
62 | g = gcd(weight, g); | ||
63 | else | ||
64 | g = weight; | ||
65 | } | ||
66 | } | ||
67 | return g ? g : 1; | ||
68 | } | ||
69 | |||
70 | |||
71 | /* | ||
72 | * Get the maximum weight of the service destinations. | ||
73 | */ | ||
74 | static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) | ||
75 | { | ||
76 | struct ip_vs_dest *dest; | ||
77 | int weight = 0; | ||
78 | |||
79 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
80 | if (atomic_read(&dest->weight) > weight) | ||
81 | weight = atomic_read(&dest->weight); | ||
82 | } | ||
83 | |||
84 | return weight; | ||
85 | } | ||
86 | |||
87 | |||
88 | static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) | ||
89 | { | ||
90 | struct ip_vs_wrr_mark *mark; | ||
91 | |||
92 | /* | ||
93 | * Allocate the mark variable for WRR scheduling | ||
94 | */ | ||
95 | mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); | ||
96 | if (mark == NULL) { | ||
97 | IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n"); | ||
98 | return -ENOMEM; | ||
99 | } | ||
100 | mark->cl = &svc->destinations; | ||
101 | mark->cw = 0; | ||
102 | mark->mw = ip_vs_wrr_max_weight(svc); | ||
103 | mark->di = ip_vs_wrr_gcd_weight(svc); | ||
104 | svc->sched_data = mark; | ||
105 | |||
106 | return 0; | ||
107 | } | ||
108 | |||
109 | |||
110 | static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) | ||
111 | { | ||
112 | /* | ||
113 | * Release the mark variable | ||
114 | */ | ||
115 | kfree(svc->sched_data); | ||
116 | |||
117 | return 0; | ||
118 | } | ||
119 | |||
120 | |||
121 | static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) | ||
122 | { | ||
123 | struct ip_vs_wrr_mark *mark = svc->sched_data; | ||
124 | |||
125 | mark->cl = &svc->destinations; | ||
126 | mark->mw = ip_vs_wrr_max_weight(svc); | ||
127 | mark->di = ip_vs_wrr_gcd_weight(svc); | ||
128 | if (mark->cw > mark->mw) | ||
129 | mark->cw = 0; | ||
130 | return 0; | ||
131 | } | ||
132 | |||
133 | |||
/*
 *    Weighted Round-Robin Scheduling
 *
 *    Advances the per-service cursor (mark->cl) through the destination
 *    list under svc->sched_lock and returns the first non-overloaded
 *    destination whose weight is at least the current weight mark->cw.
 *    Each time the cursor wraps past the list head, cw is decreased by
 *    mark->di and reset to mark->mw once it drops to zero or below.
 *
 *    Returns NULL when the list is empty, when the maximum weight is 0,
 *    or when a full cycle finds no eligible destination.
 */
static struct ip_vs_dest *
ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
	struct ip_vs_dest *dest;
	struct ip_vs_wrr_mark *mark = svc->sched_data;
	struct list_head *p;

	IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");

	/*
	 * This loop will always terminate, because mark->cw in (0, max_weight]
	 * and at least one server has its weight equal to max_weight.
	 */
	write_lock(&svc->sched_lock);
	/* remember the starting position to detect a fruitless full cycle */
	p = mark->cl;
	while (1) {
		if (mark->cl == &svc->destinations) {
			/* it is at the head of the destination list */

			if (mark->cl == mark->cl->next) {
				/* no dest entry */
				dest = NULL;
				goto out;
			}

			/* start a new pass with a lowered current weight */
			mark->cl = svc->destinations.next;
			mark->cw -= mark->di;
			if (mark->cw <= 0) {
				mark->cw = mark->mw;
				/*
				 * Still zero, which means no available servers.
				 */
				if (mark->cw == 0) {
					mark->cl = &svc->destinations;
					IP_VS_ERR_RL("ip_vs_wrr_schedule(): "
						     "no available servers\n");
					dest = NULL;
					goto out;
				}
			}
		} else
			mark->cl = mark->cl->next;

		if (mark->cl != &svc->destinations) {
			/* not at the head of the list */
			dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
			if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
			    atomic_read(&dest->weight) >= mark->cw) {
				/* got it */
				break;
			}
		}

		if (mark->cl == p && mark->cw == mark->di) {
			/* back to the start, and no dest is found.
			   It is only possible when all dests are OVERLOADED */
			dest = NULL;
			goto out;
		}
	}

	IP_VS_DBG_BUF(6, "WRR: server %s:%u "
		      "activeconns %d refcnt %d weight %d\n",
		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
		      atomic_read(&dest->activeconns),
		      atomic_read(&dest->refcnt),
		      atomic_read(&dest->weight));

  out:
	/* common exit: dest is either the chosen destination or NULL */
	write_unlock(&svc->sched_lock);
	return dest;
}
209 | |||
210 | |||
211 | static struct ip_vs_scheduler ip_vs_wrr_scheduler = { | ||
212 | .name = "wrr", | ||
213 | .refcnt = ATOMIC_INIT(0), | ||
214 | .module = THIS_MODULE, | ||
215 | .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), | ||
216 | #ifdef CONFIG_IP_VS_IPV6 | ||
217 | .supports_ipv6 = 1, | ||
218 | #endif | ||
219 | .init_service = ip_vs_wrr_init_svc, | ||
220 | .done_service = ip_vs_wrr_done_svc, | ||
221 | .update_service = ip_vs_wrr_update_svc, | ||
222 | .schedule = ip_vs_wrr_schedule, | ||
223 | }; | ||
224 | |||
225 | static int __init ip_vs_wrr_init(void) | ||
226 | { | ||
227 | return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; | ||
228 | } | ||
229 | |||
230 | static void __exit ip_vs_wrr_cleanup(void) | ||
231 | { | ||
232 | unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); | ||
233 | } | ||
234 | |||
/* hook the wrr scheduler's init/cleanup into module load/unload */
module_init(ip_vs_wrr_init);
module_exit(ip_vs_wrr_cleanup);
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c deleted file mode 100644 index 02ddc2b3ce2e..000000000000 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ /dev/null | |||
@@ -1,1004 +0,0 @@ | |||
1 | /* | ||
2 | * ip_vs_xmit.c: various packet transmitters for IPVS | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * Julian Anastasov <ja@ssi.bg> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | * | ||
12 | * Changes: | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/tcp.h> /* for tcphdr */ | ||
18 | #include <net/ip.h> | ||
19 | #include <net/tcp.h> /* for csum_tcpudp_magic */ | ||
20 | #include <net/udp.h> | ||
21 | #include <net/icmp.h> /* for icmp_send */ | ||
22 | #include <net/route.h> /* for ip_route_output */ | ||
23 | #include <net/ipv6.h> | ||
24 | #include <net/ip6_route.h> | ||
25 | #include <linux/icmpv6.h> | ||
26 | #include <linux/netfilter.h> | ||
27 | #include <linux/netfilter_ipv4.h> | ||
28 | |||
29 | #include <net/ip_vs.h> | ||
30 | |||
31 | |||
32 | /* | ||
33 | * Destination cache to speed up outgoing route lookup | ||
34 | */ | ||
35 | static inline void | ||
36 | __ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) | ||
37 | { | ||
38 | struct dst_entry *old_dst; | ||
39 | |||
40 | old_dst = dest->dst_cache; | ||
41 | dest->dst_cache = dst; | ||
42 | dest->dst_rtos = rtos; | ||
43 | dst_release(old_dst); | ||
44 | } | ||
45 | |||
46 | static inline struct dst_entry * | ||
47 | __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) | ||
48 | { | ||
49 | struct dst_entry *dst = dest->dst_cache; | ||
50 | |||
51 | if (!dst) | ||
52 | return NULL; | ||
53 | if ((dst->obsolete | ||
54 | || (dest->af == AF_INET && rtos != dest->dst_rtos)) && | ||
55 | dst->ops->check(dst, cookie) == NULL) { | ||
56 | dest->dst_cache = NULL; | ||
57 | dst_release(dst); | ||
58 | return NULL; | ||
59 | } | ||
60 | dst_hold(dst); | ||
61 | return dst; | ||
62 | } | ||
63 | |||
64 | static struct rtable * | ||
65 | __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) | ||
66 | { | ||
67 | struct rtable *rt; /* Route to the other host */ | ||
68 | struct ip_vs_dest *dest = cp->dest; | ||
69 | |||
70 | if (dest) { | ||
71 | spin_lock(&dest->dst_lock); | ||
72 | if (!(rt = (struct rtable *) | ||
73 | __ip_vs_dst_check(dest, rtos, 0))) { | ||
74 | struct flowi fl = { | ||
75 | .oif = 0, | ||
76 | .nl_u = { | ||
77 | .ip4_u = { | ||
78 | .daddr = dest->addr.ip, | ||
79 | .saddr = 0, | ||
80 | .tos = rtos, } }, | ||
81 | }; | ||
82 | |||
83 | if (ip_route_output_key(&init_net, &rt, &fl)) { | ||
84 | spin_unlock(&dest->dst_lock); | ||
85 | IP_VS_DBG_RL("ip_route_output error, " | ||
86 | "dest: %u.%u.%u.%u\n", | ||
87 | NIPQUAD(dest->addr.ip)); | ||
88 | return NULL; | ||
89 | } | ||
90 | __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); | ||
91 | IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", | ||
92 | NIPQUAD(dest->addr.ip), | ||
93 | atomic_read(&rt->u.dst.__refcnt), rtos); | ||
94 | } | ||
95 | spin_unlock(&dest->dst_lock); | ||
96 | } else { | ||
97 | struct flowi fl = { | ||
98 | .oif = 0, | ||
99 | .nl_u = { | ||
100 | .ip4_u = { | ||
101 | .daddr = cp->daddr.ip, | ||
102 | .saddr = 0, | ||
103 | .tos = rtos, } }, | ||
104 | }; | ||
105 | |||
106 | if (ip_route_output_key(&init_net, &rt, &fl)) { | ||
107 | IP_VS_DBG_RL("ip_route_output error, dest: " | ||
108 | "%u.%u.%u.%u\n", NIPQUAD(cp->daddr.ip)); | ||
109 | return NULL; | ||
110 | } | ||
111 | } | ||
112 | |||
113 | return rt; | ||
114 | } | ||
115 | |||
#ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 counterpart of __ip_vs_get_out_rt().
 *
 * With a bound destination the per-destination cache is consulted under
 * dest->dst_lock (rtos is always passed as 0 here); on a miss
 * ip6_route_output() is asked for a route to dest->addr.in6 and
 * dst_clone() of the result is cached.  Without a bound destination a
 * one-off lookup towards cp->daddr.in6 is done.
 *
 * Returns the route, or NULL when ip6_route_output() returns NULL.
 */
static struct rt6_info *
__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
{
	struct ip_vs_dest *dest = cp->dest;
	struct rt6_info *rt;			/* Route to the other host */

	if (!dest) {
		struct flowi fl = {
			.oif = 0,
			.nl_u = {
				.ip6_u = {
					.daddr = cp->daddr.in6,
					.saddr = {
						.s6_addr32 = { 0, 0, 0, 0 },
					},
				},
			},
		};

		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
		if (!rt) {
			IP_VS_DBG_RL("ip6_route_output error, dest: "
				     NIP6_FMT "\n", NIP6(cp->daddr.in6));
			return NULL;
		}
		return rt;
	}

	spin_lock(&dest->dst_lock);
	rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
	if (rt == NULL) {
		struct flowi fl = {
			.oif = 0,
			.nl_u = {
				.ip6_u = {
					.daddr = dest->addr.in6,
					.saddr = {
						.s6_addr32 = { 0, 0, 0, 0 },
					},
				},
			},
		};

		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
		if (rt == NULL) {
			spin_unlock(&dest->dst_lock);
			IP_VS_DBG_RL("ip6_route_output error, "
				     "dest: " NIP6_FMT "\n",
				     NIP6(dest->addr.in6));
			return NULL;
		}
		__ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst));
		IP_VS_DBG(10, "new dst " NIP6_FMT ", refcnt=%d\n",
			  NIP6(dest->addr.in6),
			  atomic_read(&rt->u.dst.__refcnt));
	}
	spin_unlock(&dest->dst_lock);

	return rt;
}
#endif
179 | |||
180 | |||
181 | /* | ||
182 | * Release dest->dst_cache before a dest is removed | ||
183 | */ | ||
184 | void | ||
185 | ip_vs_dst_reset(struct ip_vs_dest *dest) | ||
186 | { | ||
187 | struct dst_entry *old_dst; | ||
188 | |||
189 | old_dst = dest->dst_cache; | ||
190 | dest->dst_cache = NULL; | ||
191 | dst_release(old_dst); | ||
192 | } | ||
193 | |||
/*
 * Common tail of the transmitters: mark the skb as IPVS property, call
 * skb_forward_csum() on it and run it through the NF_INET_LOCAL_OUT hook
 * of protocol family @pf towards the device of route @rt, with
 * dst_output as the continuation.
 *
 * Note: @skb is evaluated more than once, so pass a plain variable.
 */
#define IP_VS_XMIT(pf, skb, rt)					\
do {								\
	(skb)->ipvs_property = 1;				\
	skb_forward_csum((skb));				\
	NF_HOOK((pf), NF_INET_LOCAL_OUT, (skb), NULL,		\
		(rt)->u.dst.dev, dst_output);			\
} while (0)
201 | |||
202 | |||
203 | /* | ||
204 | * NULL transmitter (do nothing except return NF_ACCEPT) | ||
205 | */ | ||
206 | int | ||
207 | ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | ||
208 | struct ip_vs_protocol *pp) | ||
209 | { | ||
210 | /* we do not touch skb and do not need pskb ptr */ | ||
211 | return NF_ACCEPT; | ||
212 | } | ||
213 | |||
214 | |||
215 | /* | ||
216 | * Bypass transmitter | ||
217 | * Let packets bypass the destination when the destination is not | ||
218 | * available, it may be only used in transparent cache cluster. | ||
219 | */ | ||
220 | int | ||
221 | ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | ||
222 | struct ip_vs_protocol *pp) | ||
223 | { | ||
224 | struct rtable *rt; /* Route to the other host */ | ||
225 | struct iphdr *iph = ip_hdr(skb); | ||
226 | u8 tos = iph->tos; | ||
227 | int mtu; | ||
228 | struct flowi fl = { | ||
229 | .oif = 0, | ||
230 | .nl_u = { | ||
231 | .ip4_u = { | ||
232 | .daddr = iph->daddr, | ||
233 | .saddr = 0, | ||
234 | .tos = RT_TOS(tos), } }, | ||
235 | }; | ||
236 | |||
237 | EnterFunction(10); | ||
238 | |||
239 | if (ip_route_output_key(&init_net, &rt, &fl)) { | ||
240 | IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " | ||
241 | "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); | ||
242 | goto tx_error_icmp; | ||
243 | } | ||
244 | |||
245 | /* MTU checking */ | ||
246 | mtu = dst_mtu(&rt->u.dst); | ||
247 | if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { | ||
248 | ip_rt_put(rt); | ||
249 | icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); | ||
250 | IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); | ||
251 | goto tx_error; | ||
252 | } | ||
253 | |||
254 | /* | ||
255 | * Call ip_send_check because we are not sure it is called | ||
256 | * after ip_defrag. Is copy-on-write needed? | ||
257 | */ | ||
258 | if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { | ||
259 | ip_rt_put(rt); | ||
260 | return NF_STOLEN; | ||
261 | } | ||
262 | ip_send_check(ip_hdr(skb)); | ||
263 | |||
264 | /* drop old route */ | ||
265 | dst_release(skb->dst); | ||
266 | skb->dst = &rt->u.dst; | ||
267 | |||
268 | /* Another hack: avoid icmp_send in ip_fragment */ | ||
269 | skb->local_df = 1; | ||
270 | |||
271 | IP_VS_XMIT(PF_INET, skb, rt); | ||
272 | |||
273 | LeaveFunction(10); | ||
274 | return NF_STOLEN; | ||
275 | |||
276 | tx_error_icmp: | ||
277 | dst_link_failure(skb); | ||
278 | tx_error: | ||
279 | kfree_skb(skb); | ||
280 | LeaveFunction(10); | ||
281 | return NF_STOLEN; | ||
282 | } | ||
283 | |||
#ifdef CONFIG_IP_VS_IPV6
/*
 *      IPv6 bypass transmitter
 *
 *      Routes the packet towards the destination address of its own IPv6
 *      header; @cp and @pp are not consulted.  A packet longer than the
 *      route MTU is answered with ICMPV6_PKT_TOOBIG and dropped.  Every
 *      path returns NF_STOLEN.
 */
int
ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		     struct ip_vs_protocol *pp)
{
	struct ipv6hdr *iph = ipv6_hdr(skb);
	struct rt6_info *rt;			/* Route to the other host */
	struct flowi fl = {
		.oif = 0,
		.nl_u = {
			.ip6_u = {
				.daddr = iph->daddr,
				.saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
	};
	int mtu;

	EnterFunction(10);

	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
	if (rt == NULL) {
		IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, "
			     "dest: " NIP6_FMT "\n", NIP6(iph->daddr));
		goto no_route;
	}

	/* MTU checking */
	mtu = dst_mtu(&rt->u.dst);
	if (skb->len > mtu) {
		dst_release(&rt->u.dst);
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): frag needed\n");
		goto drop;
	}

	/* Make sure we are not working on a shared skb. */
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (unlikely(!skb)) {
		dst_release(&rt->u.dst);
		return NF_STOLEN;
	}

	/* replace the old route by the one just looked up */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->local_df = 1;

	IP_VS_XMIT(PF_INET6, skb, rt);

	LeaveFunction(10);
	return NF_STOLEN;

no_route:
	dst_link_failure(skb);
drop:
	kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}
#endif
348 | |||
349 | /* | ||
350 | * NAT transmitter (only for outside-to-inside nat forwarding) | ||
351 | * Not used for related ICMP | ||
352 | */ | ||
353 | int | ||
354 | ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | ||
355 | struct ip_vs_protocol *pp) | ||
356 | { | ||
357 | struct rtable *rt; /* Route to the other host */ | ||
358 | int mtu; | ||
359 | struct iphdr *iph = ip_hdr(skb); | ||
360 | |||
361 | EnterFunction(10); | ||
362 | |||
363 | /* check if it is a connection of no-client-port */ | ||
364 | if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { | ||
365 | __be16 _pt, *p; | ||
366 | p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); | ||
367 | if (p == NULL) | ||
368 | goto tx_error; | ||
369 | ip_vs_conn_fill_cport(cp, *p); | ||
370 | IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); | ||
371 | } | ||
372 | |||
373 | if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) | ||
374 | goto tx_error_icmp; | ||
375 | |||
376 | /* MTU checking */ | ||
377 | mtu = dst_mtu(&rt->u.dst); | ||
378 | if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { | ||
379 | ip_rt_put(rt); | ||
380 | icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); | ||
381 | IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); | ||
382 | goto tx_error; | ||
383 | } | ||
384 | |||
385 | /* copy-on-write the packet before mangling it */ | ||
386 | if (!skb_make_writable(skb, sizeof(struct iphdr))) | ||
387 | goto tx_error_put; | ||
388 | |||
389 | if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) | ||
390 | goto tx_error_put; | ||
391 | |||
392 | /* drop old route */ | ||
393 | dst_release(skb->dst); | ||
394 | skb->dst = &rt->u.dst; | ||
395 | |||
396 | /* mangle the packet */ | ||
397 | if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) | ||
398 | goto tx_error; | ||
399 | ip_hdr(skb)->daddr = cp->daddr.ip; | ||
400 | ip_send_check(ip_hdr(skb)); | ||
401 | |||
402 | IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); | ||
403 | |||
404 | /* FIXME: when application helper enlarges the packet and the length | ||
405 | is larger than the MTU of outgoing device, there will be still | ||
406 | MTU problem. */ | ||
407 | |||
408 | /* Another hack: avoid icmp_send in ip_fragment */ | ||
409 | skb->local_df = 1; | ||
410 | |||
411 | IP_VS_XMIT(PF_INET, skb, rt); | ||
412 | |||
413 | LeaveFunction(10); | ||
414 | return NF_STOLEN; | ||
415 | |||
416 | tx_error_icmp: | ||
417 | dst_link_failure(skb); | ||
418 | tx_error: | ||
419 | LeaveFunction(10); | ||
420 | kfree_skb(skb); | ||
421 | return NF_STOLEN; | ||
422 | tx_error_put: | ||
423 | ip_rt_put(rt); | ||
424 | goto tx_error; | ||
425 | } | ||
426 | |||
#ifdef CONFIG_IP_VS_IPV6
/*
 *      IPv6 NAT transmitter
 *
 *      Same sequence as ip_vs_nat_xmit(): fill in the client port for
 *      no-client-port connections (read at offset sizeof(struct ipv6hdr)),
 *      obtain the route, refuse packets longer than the route MTU with
 *      ICMPV6_PKT_TOOBIG, make the header writable, run the protocol's
 *      dnat_handler (if any), rewrite the destination address to
 *      cp->daddr.in6 and transmit.  Every path returns NF_STOLEN.
 */
int
ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp)
{
	struct rt6_info *rt;			/* Route to the other host */
	int mtu;

	EnterFunction(10);

	/* check if it is a connection of no-client-port */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
		__be16 port_buf, *cport;

		cport = skb_header_pointer(skb, sizeof(struct ipv6hdr),
					   sizeof(port_buf), &port_buf);
		if (!cport)
			goto drop;
		ip_vs_conn_fill_cport(cp, *cport);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*cport));
	}

	rt = __ip_vs_get_out_rt_v6(cp);
	if (rt == NULL)
		goto no_route;

	/* MTU checking */
	mtu = dst_mtu(&rt->u.dst);
	if (skb->len > mtu) {
		dst_release(&rt->u.dst);
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP_VS_DBG_RL_PKT(0, pp, skb, 0,
				 "ip_vs_nat_xmit_v6(): frag needed for");
		goto drop;
	}

	/* copy-on-write the packet before mangling it */
	if (!skb_make_writable(skb, sizeof(struct ipv6hdr)) ||
	    skb_cow(skb, rt->u.dst.dev->hard_header_len))
		goto drop_put_rt;

	/* drop old route; from here on the skb carries the new one */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/* mangle the packet */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
		goto drop;
	ipv6_hdr(skb)->daddr = cp->daddr.in6;

	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");

	/* FIXME: when application helper enlarges the packet and the length
	   is larger than the MTU of outgoing device, there will be still
	   MTU problem. */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->local_df = 1;

	IP_VS_XMIT(PF_INET6, skb, rt);

	LeaveFunction(10);
	return NF_STOLEN;

drop_put_rt:
	dst_release(&rt->u.dst);
	goto drop;
no_route:
	dst_link_failure(skb);
drop:
	LeaveFunction(10);
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif
503 | |||
504 | |||
505 | /* | ||
506 | * IP Tunneling transmitter | ||
507 | * | ||
508 | * This function encapsulates the packet in a new IP packet, its | ||
509 | * destination will be set to cp->daddr. Most code of this function | ||
510 | * is taken from ipip.c. | ||
511 | * | ||
512 | * It is used in VS/TUN cluster. The load balancer selects a real | ||
513 | * server from a cluster based on a scheduling algorithm, | ||
514 | * encapsulates the request packet and forwards it to the selected | ||
515 | * server. For example, all real servers are configured with | ||
516 | * "ifconfig tunl0 <Virtual IP Address> up". When the server receives | ||
517 | * the encapsulated packet, it will decapsulate the packet, processe | ||
518 | * the request and return the response packets directly to the client | ||
519 | * without passing the load balancer. This can greatly increase the | ||
520 | * scalability of virtual server. | ||
521 | * | ||
522 | * Used for ANY protocol | ||
523 | */ | ||
524 | int | ||
525 | ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | ||
526 | struct ip_vs_protocol *pp) | ||
527 | { | ||
528 | struct rtable *rt; /* Route to the other host */ | ||
529 | struct net_device *tdev; /* Device to other host */ | ||
530 | struct iphdr *old_iph = ip_hdr(skb); | ||
531 | u8 tos = old_iph->tos; | ||
532 | __be16 df = old_iph->frag_off; | ||
533 | sk_buff_data_t old_transport_header = skb->transport_header; | ||
534 | struct iphdr *iph; /* Our new IP header */ | ||
535 | unsigned int max_headroom; /* The extra header space needed */ | ||
536 | int mtu; | ||
537 | |||
538 | EnterFunction(10); | ||
539 | |||
540 | if (skb->protocol != htons(ETH_P_IP)) { | ||
541 | IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " | ||
542 | "ETH_P_IP: %d, skb protocol: %d\n", | ||
543 | htons(ETH_P_IP), skb->protocol); | ||
544 | goto tx_error; | ||
545 | } | ||
546 | |||
547 | if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) | ||
548 | goto tx_error_icmp; | ||
549 | |||
550 | tdev = rt->u.dst.dev; | ||
551 | |||
552 | mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); | ||
553 | if (mtu < 68) { | ||
554 | ip_rt_put(rt); | ||
555 | IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); | ||
556 | goto tx_error; | ||
557 | } | ||
558 | if (skb->dst) | ||
559 | skb->dst->ops->update_pmtu(skb->dst, mtu); | ||
560 | |||
561 | df |= (old_iph->frag_off & htons(IP_DF)); | ||
562 | |||
563 | if ((old_iph->frag_off & htons(IP_DF)) | ||
564 | && mtu < ntohs(old_iph->tot_len)) { | ||
565 | icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); | ||
566 | ip_rt_put(rt); | ||
567 | IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); | ||
568 | goto tx_error; | ||
569 | } | ||
570 | |||
571 | /* | ||
572 | * Okay, now see if we can stuff it in the buffer as-is. | ||
573 | */ | ||
574 | max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); | ||
575 | |||
576 | if (skb_headroom(skb) < max_headroom | ||
577 | || skb_cloned(skb) || skb_shared(skb)) { | ||
578 | struct sk_buff *new_skb = | ||
579 | skb_realloc_headroom(skb, max_headroom); | ||
580 | if (!new_skb) { | ||
581 | ip_rt_put(rt); | ||
582 | kfree_skb(skb); | ||
583 | IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); | ||
584 | return NF_STOLEN; | ||
585 | } | ||
586 | kfree_skb(skb); | ||
587 | skb = new_skb; | ||
588 | old_iph = ip_hdr(skb); | ||
589 | } | ||
590 | |||
591 | skb->transport_header = old_transport_header; | ||
592 | |||
593 | /* fix old IP header checksum */ | ||
594 | ip_send_check(old_iph); | ||
595 | |||
596 | skb_push(skb, sizeof(struct iphdr)); | ||
597 | skb_reset_network_header(skb); | ||
598 | memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); | ||
599 | |||
600 | /* drop old route */ | ||
601 | dst_release(skb->dst); | ||
602 | skb->dst = &rt->u.dst; | ||
603 | |||
604 | /* | ||
605 | * Push down and install the IPIP header. | ||
606 | */ | ||
607 | iph = ip_hdr(skb); | ||
608 | iph->version = 4; | ||
609 | iph->ihl = sizeof(struct iphdr)>>2; | ||
610 | iph->frag_off = df; | ||
611 | iph->protocol = IPPROTO_IPIP; | ||
612 | iph->tos = tos; | ||
613 | iph->daddr = rt->rt_dst; | ||
614 | iph->saddr = rt->rt_src; | ||
615 | iph->ttl = old_iph->ttl; | ||
616 | ip_select_ident(iph, &rt->u.dst, NULL); | ||
617 | |||
618 | /* Another hack: avoid icmp_send in ip_fragment */ | ||
619 | skb->local_df = 1; | ||
620 | |||
621 | ip_local_out(skb); | ||
622 | |||
623 | LeaveFunction(10); | ||
624 | |||
625 | return NF_STOLEN; | ||
626 | |||
627 | tx_error_icmp: | ||
628 | dst_link_failure(skb); | ||
629 | tx_error: | ||
630 | kfree_skb(skb); | ||
631 | LeaveFunction(10); | ||
632 | return NF_STOLEN; | ||
633 | } | ||
634 | |||
#ifdef CONFIG_IP_VS_IPV6
/*
 *   IPv6 tunneling transmitter
 *
 *   Encapsulates the IPv6 packet in a new IPv6 header (next header
 *   IPPROTO_IPV6) whose destination is taken from the route to the real
 *   server and whose source is cp->vaddr.in6, then sends it with
 *   ip6_local_out().
 *
 *   Packets that do not fit into the route MTU minus one IPv6 header are
 *   answered with ICMPV6_PKT_TOOBIG and dropped.  Every path returns
 *   NF_STOLEN.
 */
int
ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		     struct ip_vs_protocol *pp)
{
	struct rt6_info *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct ipv6hdr  *old_iph = ipv6_hdr(skb);
	sk_buff_data_t old_transport_header = skb->transport_header;
	struct ipv6hdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    mtu;

	EnterFunction(10);

	if (skb->protocol != htons(ETH_P_IPV6)) {
		IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): protocol error, "
			     "ETH_P_IPV6: %d, skb protocol: %d\n",
			     htons(ETH_P_IPV6), skb->protocol);
		goto tx_error;
	}

	rt = __ip_vs_get_out_rt_v6(cp);
	if (!rt)
		goto tx_error_icmp;

	tdev = rt->u.dst.dev;

	/* room left for the inner packet once the outer header is added */
	mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr);
	/* TODO IPv6: do we need this check in IPv6? */
	if (mtu < 1280) {
		dst_release(&rt->u.dst);
		IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n");
		goto tx_error;
	}
	if (skb->dst)
		skb->dst->ops->update_pmtu(skb->dst, mtu);

	if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		dst_release(&rt->u.dst);
		IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): frag needed\n");
		goto tx_error;
	}

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);

	if (skb_headroom(skb) < max_headroom
	    || skb_cloned(skb) || skb_shared(skb)) {
		struct sk_buff *new_skb =
			skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			dst_release(&rt->u.dst);
			kfree_skb(skb);
			IP_VS_ERR_RL("ip_vs_tunnel_xmit_v6(): no memory\n");
			return NF_STOLEN;
		}
		kfree_skb(skb);
		skb = new_skb;
		/* the header moved along with the data */
		old_iph = ipv6_hdr(skb);
	}

	skb->transport_header = old_transport_header;

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/* drop old route */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/*
	 *	Push down and install the outer IPv6 header.
	 */
	iph			=	ipv6_hdr(skb);
	iph->version		=	6;
	iph->nexthdr		=	IPPROTO_IPV6;
	/*
	 * The outer payload is the complete inner packet: inner payload plus
	 * the inner IPv6 header.  payload_len is big-endian on the wire, so
	 * the addition has to be done in host order, and the size added must
	 * be that of the header structure, not of the pointer to it.
	 */
	iph->payload_len	=	htons(ntohs(old_iph->payload_len) +
					      sizeof(*old_iph));
	iph->priority		=	old_iph->priority;
	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
	iph->daddr		=	rt->rt6i_dst.addr;
	iph->saddr		=	cp->vaddr.in6; /* rt->rt6i_src.addr; */
	iph->hop_limit		=	old_iph->hop_limit;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->local_df = 1;

	ip6_local_out(skb);

	LeaveFunction(10);

	return NF_STOLEN;

tx_error_icmp:
	dst_link_failure(skb);
tx_error:
	kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}
#endif
740 | |||
741 | |||
742 | /* | ||
743 | * Direct Routing transmitter | ||
744 | * Used for ANY protocol | ||
745 | */ | ||
746 | int | ||
747 | ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | ||
748 | struct ip_vs_protocol *pp) | ||
749 | { | ||
750 | struct rtable *rt; /* Route to the other host */ | ||
751 | struct iphdr *iph = ip_hdr(skb); | ||
752 | int mtu; | ||
753 | |||
754 | EnterFunction(10); | ||
755 | |||
756 | if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) | ||
757 | goto tx_error_icmp; | ||
758 | |||
759 | /* MTU checking */ | ||
760 | mtu = dst_mtu(&rt->u.dst); | ||
761 | if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { | ||
762 | icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); | ||
763 | ip_rt_put(rt); | ||
764 | IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); | ||
765 | goto tx_error; | ||
766 | } | ||
767 | |||
768 | /* | ||
769 | * Call ip_send_check because we are not sure it is called | ||
770 | * after ip_defrag. Is copy-on-write needed? | ||
771 | */ | ||
772 | if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { | ||
773 | ip_rt_put(rt); | ||
774 | return NF_STOLEN; | ||
775 | } | ||
776 | ip_send_check(ip_hdr(skb)); | ||
777 | |||
778 | /* drop old route */ | ||
779 | dst_release(skb->dst); | ||
780 | skb->dst = &rt->u.dst; | ||
781 | |||
782 | /* Another hack: avoid icmp_send in ip_fragment */ | ||
783 | skb->local_df = 1; | ||
784 | |||
785 | IP_VS_XMIT(PF_INET, skb, rt); | ||
786 | |||
787 | LeaveFunction(10); | ||
788 | return NF_STOLEN; | ||
789 | |||
790 | tx_error_icmp: | ||
791 | dst_link_failure(skb); | ||
792 | tx_error: | ||
793 | kfree_skb(skb); | ||
794 | LeaveFunction(10); | ||
795 | return NF_STOLEN; | ||
796 | } | ||
797 | |||
#ifdef CONFIG_IP_VS_IPV6
/*
 *      IPv6 Direct Routing transmitter
 *
 *      Re-routes the unmodified packet over the route to the real server
 *      of @cp.  A packet longer than the route MTU is answered with
 *      ICMPV6_PKT_TOOBIG and dropped.  Every path returns NF_STOLEN.
 */
int
ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		 struct ip_vs_protocol *pp)
{
	struct rt6_info *rt = __ip_vs_get_out_rt_v6(cp);  /* Route to the other host */
	int mtu;

	EnterFunction(10);

	if (rt == NULL)
		goto no_route;

	/* MTU checking */
	mtu = dst_mtu(&rt->u.dst);
	if (skb->len > mtu) {
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		dst_release(&rt->u.dst);
		IP_VS_DBG_RL("ip_vs_dr_xmit_v6(): frag needed\n");
		goto drop;
	}

	/* Make sure we are not working on a shared skb. */
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (unlikely(!skb)) {
		dst_release(&rt->u.dst);
		return NF_STOLEN;
	}

	/* replace the old route by the one to the real server */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->local_df = 1;

	IP_VS_XMIT(PF_INET6, skb, rt);

	LeaveFunction(10);
	return NF_STOLEN;

no_route:
	dst_link_failure(skb);
drop:
	kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}
#endif
851 | |||
852 | |||
853 | /* | ||
854 | * ICMP packet transmitter | ||
855 | * called by the ip_vs_in_icmp | ||
856 | */ | ||
857 | int | ||
858 | ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | ||
859 | struct ip_vs_protocol *pp, int offset) | ||
860 | { | ||
861 | struct rtable *rt; /* Route to the other host */ | ||
862 | int mtu; | ||
863 | int rc; | ||
864 | |||
865 | EnterFunction(10); | ||
866 | |||
867 | /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be | ||
868 | forwarded directly here, because there is no need to | ||
869 | translate address/port back */ | ||
870 | if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { | ||
871 | if (cp->packet_xmit) | ||
872 | rc = cp->packet_xmit(skb, cp, pp); | ||
873 | else | ||
874 | rc = NF_ACCEPT; | ||
875 | /* do not touch skb anymore */ | ||
876 | atomic_inc(&cp->in_pkts); | ||
877 | goto out; | ||
878 | } | ||
879 | |||
880 | /* | ||
881 | * mangle and send the packet here (only for VS/NAT) | ||
882 | */ | ||
883 | |||
884 | if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) | ||
885 | goto tx_error_icmp; | ||
886 | |||
887 | /* MTU checking */ | ||
888 | mtu = dst_mtu(&rt->u.dst); | ||
889 | if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { | ||
890 | ip_rt_put(rt); | ||
891 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); | ||
892 | IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); | ||
893 | goto tx_error; | ||
894 | } | ||
895 | |||
896 | /* copy-on-write the packet before mangling it */ | ||
897 | if (!skb_make_writable(skb, offset)) | ||
898 | goto tx_error_put; | ||
899 | |||
900 | if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) | ||
901 | goto tx_error_put; | ||
902 | |||
903 | /* drop the old route when skb is not shared */ | ||
904 | dst_release(skb->dst); | ||
905 | skb->dst = &rt->u.dst; | ||
906 | |||
907 | ip_vs_nat_icmp(skb, pp, cp, 0); | ||
908 | |||
909 | /* Another hack: avoid icmp_send in ip_fragment */ | ||
910 | skb->local_df = 1; | ||
911 | |||
912 | IP_VS_XMIT(PF_INET, skb, rt); | ||
913 | |||
914 | rc = NF_STOLEN; | ||
915 | goto out; | ||
916 | |||
917 | tx_error_icmp: | ||
918 | dst_link_failure(skb); | ||
919 | tx_error: | ||
920 | dev_kfree_skb(skb); | ||
921 | rc = NF_STOLEN; | ||
922 | out: | ||
923 | LeaveFunction(10); | ||
924 | return rc; | ||
925 | tx_error_put: | ||
926 | ip_rt_put(rt); | ||
927 | goto tx_error; | ||
928 | } | ||
929 | |||
#ifdef CONFIG_IP_VS_IPV6
/*
 *	ICMPv6 packet transmitter
 *
 *	For every forwarding method other than masquerading the packet is
 *	passed to cp->packet_xmit (NF_ACCEPT if that is not set) and that
 *	verdict is returned.  For VS/NAT the packet is mangled through
 *	ip_vs_nat_icmp_v6() and transmitted here, returning NF_STOLEN.
 *
 *	@offset is the number of leading bytes that must be made writable
 *	before mangling (it is handed to skb_make_writable()).
 */
int
ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		   struct ip_vs_protocol *pp, int offset)
{
	struct rt6_info *rt;			/* Route to the other host */
	int mtu;
	int rc;

	EnterFunction(10);

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	   forwarded directly here, because there is no need to
	   translate address/port back */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		rc = cp->packet_xmit ? cp->packet_xmit(skb, cp, pp)
				     : NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		goto out;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */

	rt = __ip_vs_get_out_rt_v6(cp);
	if (rt == NULL)
		goto no_route;

	/* MTU checking */
	mtu = dst_mtu(&rt->u.dst);
	if (skb->len > mtu) {
		dst_release(&rt->u.dst);
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
		goto drop;
	}

	/* copy-on-write the packet before mangling it */
	if (!skb_make_writable(skb, offset) ||
	    skb_cow(skb, rt->u.dst.dev->hard_header_len))
		goto drop_put_rt;

	/* drop the old route when skb is not shared */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	ip_vs_nat_icmp_v6(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->local_df = 1;

	IP_VS_XMIT(PF_INET6, skb, rt);

	rc = NF_STOLEN;
	goto out;

drop_put_rt:
	dst_release(&rt->u.dst);
	goto drop;
no_route:
	dst_link_failure(skb);
drop:
	dev_kfree_skb(skb);
	rc = NF_STOLEN;
out:
	LeaveFunction(10);
	return rc;
}
#endif