author    Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
committer Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
commit    1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree      0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/ipv4
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig  411
-rw-r--r--  net/ipv4/Makefile  33
-rw-r--r--  net/ipv4/af_inet.c  1188
-rw-r--r--  net/ipv4/ah4.c  335
-rw-r--r--  net/ipv4/arp.c  1425
-rw-r--r--  net/ipv4/datagram.c  73
-rw-r--r--  net/ipv4/devinet.c  1508
-rw-r--r--  net/ipv4/esp4.c  510
-rw-r--r--  net/ipv4/fib_frontend.c  611
-rw-r--r--  net/ipv4/fib_hash.c  1086
-rw-r--r--  net/ipv4/fib_lookup.h  43
-rw-r--r--  net/ipv4/fib_rules.c  437
-rw-r--r--  net/ipv4/fib_semantics.c  1332
-rw-r--r--  net/ipv4/icmp.c  1143
-rw-r--r--  net/ipv4/igmp.c  2473
-rw-r--r--  net/ipv4/inetpeer.c  460
-rw-r--r--  net/ipv4/ip_forward.c  127
-rw-r--r--  net/ipv4/ip_fragment.c  691
-rw-r--r--  net/ipv4/ip_gre.c  1290
-rw-r--r--  net/ipv4/ip_input.c  431
-rw-r--r--  net/ipv4/ip_options.c  625
-rw-r--r--  net/ipv4/ip_output.c  1359
-rw-r--r--  net/ipv4/ip_sockglue.c  1093
-rw-r--r--  net/ipv4/ipcomp.c  524
-rw-r--r--  net/ipv4/ipconfig.c  1507
-rw-r--r--  net/ipv4/ipip.c  905
-rw-r--r--  net/ipv4/ipmr.c  1900
-rw-r--r--  net/ipv4/ipvs/Kconfig  244
-rw-r--r--  net/ipv4/ipvs/Makefile  34
-rw-r--r--  net/ipv4/ipvs/ip_vs_app.c  658
-rw-r--r--  net/ipv4/ipvs/ip_vs_conn.c  920
-rw-r--r--  net/ipv4/ipvs/ip_vs_core.c  1191
-rw-r--r--  net/ipv4/ipvs/ip_vs_ctl.c  2391
-rw-r--r--  net/ipv4/ipvs/ip_vs_dh.c  258
-rw-r--r--  net/ipv4/ipvs/ip_vs_est.c  200
-rw-r--r--  net/ipv4/ipvs/ip_vs_ftp.c  400
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblc.c  624
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblcr.c  888
-rw-r--r--  net/ipv4/ipvs/ip_vs_lc.c  123
-rw-r--r--  net/ipv4/ipvs/ip_vs_nq.c  161
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto.c  244
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_ah.c  177
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_esp.c  175
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_icmp.c  182
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_tcp.c  640
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_udp.c  427
-rw-r--r--  net/ipv4/ipvs/ip_vs_rr.c  118
-rw-r--r--  net/ipv4/ipvs/ip_vs_sched.c  251
-rw-r--r--  net/ipv4/ipvs/ip_vs_sed.c  163
-rw-r--r--  net/ipv4/ipvs/ip_vs_sh.c  255
-rw-r--r--  net/ipv4/ipvs/ip_vs_sync.c  892
-rw-r--r--  net/ipv4/ipvs/ip_vs_wlc.c  151
-rw-r--r--  net/ipv4/ipvs/ip_vs_wrr.c  235
-rw-r--r--  net/ipv4/ipvs/ip_vs_xmit.c  563
-rw-r--r--  net/ipv4/multipath.c  55
-rw-r--r--  net/ipv4/multipath_drr.c  265
-rw-r--r--  net/ipv4/multipath_random.c  128
-rw-r--r--  net/ipv4/multipath_rr.c  115
-rw-r--r--  net/ipv4/multipath_wrandom.c  344
-rw-r--r--  net/ipv4/netfilter/Kconfig  696
-rw-r--r--  net/ipv4/netfilter/Makefile  89
-rw-r--r--  net/ipv4/netfilter/arp_tables.c  1333
-rw-r--r--  net/ipv4/netfilter/arpt_mangle.c  104
-rw-r--r--  net/ipv4/netfilter/arptable_filter.c  214
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_amanda.c  167
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_core.c  1247
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_ftp.c  501
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_irc.c  313
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_generic.c  75
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_icmp.c  279
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_sctp.c  649
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_tcp.c  1098
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_udp.c  146
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_standalone.c  961
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_tftp.c  159
-rw-r--r--  net/ipv4/netfilter/ip_nat_amanda.c  88
-rw-r--r--  net/ipv4/netfilter/ip_nat_core.c  556
-rw-r--r--  net/ipv4/netfilter/ip_nat_ftp.c  183
-rw-r--r--  net/ipv4/netfilter/ip_nat_helper.c  430
-rw-r--r--  net/ipv4/netfilter/ip_nat_irc.c  125
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_icmp.c  115
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_tcp.c  178
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_udp.c  165
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_unknown.c  70
-rw-r--r--  net/ipv4/netfilter/ip_nat_rule.c  319
-rw-r--r--  net/ipv4/netfilter/ip_nat_snmp_basic.c  1347
-rw-r--r--  net/ipv4/netfilter/ip_nat_standalone.c  349
-rw-r--r--  net/ipv4/netfilter/ip_nat_tftp.c  70
-rw-r--r--  net/ipv4/netfilter/ip_queue.c  741
-rw-r--r--  net/ipv4/netfilter/ip_tables.c  1964
-rw-r--r--  net/ipv4/netfilter/ipt_CLASSIFY.c  92
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c  761
-rw-r--r--  net/ipv4/netfilter/ipt_CONNMARK.c  118
-rw-r--r--  net/ipv4/netfilter/ipt_DSCP.c  106
-rw-r--r--  net/ipv4/netfilter/ipt_ECN.c  175
-rw-r--r--  net/ipv4/netfilter/ipt_LOG.c  485
-rw-r--r--  net/ipv4/netfilter/ipt_MARK.c  162
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c  207
-rw-r--r--  net/ipv4/netfilter/ipt_NETMAP.c  117
-rw-r--r--  net/ipv4/netfilter/ipt_NOTRACK.c  76
-rw-r--r--  net/ipv4/netfilter/ipt_REDIRECT.c  129
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c  335
-rw-r--r--  net/ipv4/netfilter/ipt_SAME.c  211
-rw-r--r--  net/ipv4/netfilter/ipt_TCPMSS.c  262
-rw-r--r--  net/ipv4/netfilter/ipt_TOS.c  105
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c  419
-rw-r--r--  net/ipv4/netfilter/ipt_addrtype.c  77
-rw-r--r--  net/ipv4/netfilter/ipt_ah.c  117
-rw-r--r--  net/ipv4/netfilter/ipt_comment.c  59
-rw-r--r--  net/ipv4/netfilter/ipt_connmark.c  81
-rw-r--r--  net/ipv4/netfilter/ipt_conntrack.c  136
-rw-r--r--  net/ipv4/netfilter/ipt_dscp.c  63
-rw-r--r--  net/ipv4/netfilter/ipt_ecn.c  131
-rw-r--r--  net/ipv4/netfilter/ipt_esp.c  118
-rw-r--r--  net/ipv4/netfilter/ipt_hashlimit.c  731
-rw-r--r--  net/ipv4/netfilter/ipt_helper.c  113
-rw-r--r--  net/ipv4/netfilter/ipt_iprange.c  99
-rw-r--r--  net/ipv4/netfilter/ipt_length.c  64
-rw-r--r--  net/ipv4/netfilter/ipt_limit.c  157
-rw-r--r--  net/ipv4/netfilter/ipt_mac.c  79
-rw-r--r--  net/ipv4/netfilter/ipt_mark.c  64
-rw-r--r--  net/ipv4/netfilter/ipt_multiport.c  212
-rw-r--r--  net/ipv4/netfilter/ipt_owner.c  217
-rw-r--r--  net/ipv4/netfilter/ipt_physdev.c  134
-rw-r--r--  net/ipv4/netfilter/ipt_pkttype.c  70
-rw-r--r--  net/ipv4/netfilter/ipt_realm.c  76
-rw-r--r--  net/ipv4/netfilter/ipt_recent.c  1002
-rw-r--r--  net/ipv4/netfilter/ipt_sctp.c  203
-rw-r--r--  net/ipv4/netfilter/ipt_state.c  74
-rw-r--r--  net/ipv4/netfilter/ipt_tcpmss.c  127
-rw-r--r--  net/ipv4/netfilter/ipt_tos.c  64
-rw-r--r--  net/ipv4/netfilter/ipt_ttl.c  79
-rw-r--r--  net/ipv4/netfilter/iptable_filter.c  194
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c  260
-rw-r--r--  net/ipv4/netfilter/iptable_raw.c  156
-rw-r--r--  net/ipv4/proc.c  382
-rw-r--r--  net/ipv4/protocol.c  101
-rw-r--r--  net/ipv4/raw.c  888
-rw-r--r--  net/ipv4/route.c  3177
-rw-r--r--  net/ipv4/syncookies.c  279
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c  698
-rw-r--r--  net/ipv4/tcp.c  2386
-rw-r--r--  net/ipv4/tcp_diag.c  802
-rw-r--r--  net/ipv4/tcp_input.c  4959
-rw-r--r--  net/ipv4/tcp_ipv4.c  2663
-rw-r--r--  net/ipv4/tcp_minisocks.c  1077
-rw-r--r--  net/ipv4/tcp_output.c  1739
-rw-r--r--  net/ipv4/tcp_timer.c  656
-rw-r--r--  net/ipv4/udp.c  1575
-rw-r--r--  net/ipv4/utils.c  59
-rw-r--r--  net/ipv4/xfrm4_input.c  160
-rw-r--r--  net/ipv4/xfrm4_output.c  141
-rw-r--r--  net/ipv4/xfrm4_policy.c  281
-rw-r--r--  net/ipv4/xfrm4_state.c  126
-rw-r--r--  net/ipv4/xfrm4_tunnel.c  144
155 files changed, 82733 insertions, 0 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
new file mode 100644
index 000000000000..6d3e8b1bd1f2
--- /dev/null
+++ b/net/ipv4/Kconfig
@@ -0,0 +1,411 @@
1#
2# IP configuration
3#
4config IP_MULTICAST
5 bool "IP: multicasting"
6 depends on INET
7 help
8 This is code for addressing several networked computers at once,
9 enlarging your kernel by about 2 KB. You need multicasting if you
10 intend to participate in the MBONE, a high bandwidth network on top
11 of the Internet which carries audio and video broadcasts. More
12 information about the MBONE is on the WWW at
13 <http://www-itg.lbl.gov/mbone/>. Information about the multicast
14 capabilities of the various network cards is contained in
15 <file:Documentation/networking/multicast.txt>. For most people, it's
16 safe to say N.
17
18config IP_ADVANCED_ROUTER
19 bool "IP: advanced router"
20 depends on INET
21 ---help---
22 If you intend to run your Linux box mostly as a router, i.e. as a
23 computer that forwards and redistributes network packets, say Y; you
24 will then be presented with several options that allow more precise
25 control about the routing process.
26
27 The answer to this question won't directly affect the kernel:
28 answering N will just cause the configurator to skip all the
29 questions about advanced routing.
30
31 Note that your box can only act as a router if you enable IP
32 forwarding in your kernel; you can do that by saying Y to "/proc
33 file system support" and "Sysctl support" below and executing the
34 line
35
36 echo "1" > /proc/sys/net/ipv4/ip_forward
37
38 at boot time after the /proc file system has been mounted.
39
40 If you turn on IP forwarding, you will also get the rp_filter, which
41 automatically rejects incoming packets if the routing table entry
42 for their source address doesn't match the network interface they're
43 arriving on. This has security advantages because it prevents the
44 so-called IP spoofing, however it can pose problems if you use
45 asymmetric routing (packets from you to a host take a different path
46 than packets from that host to you) or if you operate a non-routing
47 host which has several IP addresses on different interfaces. To turn
48 rp_filter off use:
49
50 echo 0 > /proc/sys/net/ipv4/conf/<device>/rp_filter
51 or
52 echo 0 > /proc/sys/net/ipv4/conf/all/rp_filter
53
54 If unsure, say N here.
55
56config IP_MULTIPLE_TABLES
57 bool "IP: policy routing"
58 depends on IP_ADVANCED_ROUTER
59 ---help---
60 Normally, a router decides what to do with a received packet based
61 solely on the packet's final destination address. If you say Y here,
62 the Linux router will also be able to take the packet's source
63 address into account. Furthermore, the TOS (Type-Of-Service) field
64 of the packet can be used for routing decisions as well.
65
66 If you are interested in this, please see the preliminary
67 documentation at <http://www.compendium.com.ar/policy-routing.txt>
68 and <ftp://post.tepkom.ru/pub/vol2/Linux/docs/advanced-routing.tex>.
69 You will need supporting software from
70 <ftp://ftp.tux.org/pub/net/ip-routing/>.
71
72 If unsure, say N.
73
74config IP_ROUTE_FWMARK
75 bool "IP: use netfilter MARK value as routing key"
76 depends on IP_MULTIPLE_TABLES && NETFILTER
77 help
78 If you say Y here, you will be able to specify different routes for
79 packets with different mark values (see iptables(8), MARK target).
80
81config IP_ROUTE_MULTIPATH
82 bool "IP: equal cost multipath"
83 depends on IP_ADVANCED_ROUTER
84 help
85 Normally, the routing tables specify a single action to be taken in
86 a deterministic manner for a given packet. If you say Y here
87 however, it becomes possible to attach several actions to a packet
88 pattern, in effect specifying several alternative paths to travel
89 for those packets. The router considers all these paths to be of
90 equal "cost" and chooses one of them in a non-deterministic fashion
91 if a matching packet arrives.
92
93config IP_ROUTE_MULTIPATH_CACHED
94 bool "IP: equal cost multipath with caching support (EXPERIMENTAL)"
95 depends on IP_ROUTE_MULTIPATH
96 help
97 Normally, equal cost multipath routing is not supported by the
98 routing cache. If you say Y here, alternative routes are cached
99 and on cache lookup a route is chosen in a configurable fashion.
100
101 If unsure, say N.
102
103config IP_ROUTE_MULTIPATH_RR
104 tristate "MULTIPATH: round robin algorithm"
105 depends on IP_ROUTE_MULTIPATH_CACHED
106 help
107 Multipath routes are chosen according to Round Robin.
108
109config IP_ROUTE_MULTIPATH_RANDOM
110 tristate "MULTIPATH: random algorithm"
111 depends on IP_ROUTE_MULTIPATH_CACHED
112 help
113 Multipath routes are chosen in a random fashion. Actually,
114 there is no weight for a route. The advantage of this policy
115 is that it is implemented statelessly and therefore introduces only
116 a very small delay.
117
118config IP_ROUTE_MULTIPATH_WRANDOM
119 tristate "MULTIPATH: weighted random algorithm"
120 depends on IP_ROUTE_MULTIPATH_CACHED
121 help
122 Multipath routes are chosen in a weighted random fashion.
123 The per-route weights are the weights visible via ip route 2. As the
124 corresponding state management introduces some overhead, routing delay
125 is increased.
126
127config IP_ROUTE_MULTIPATH_DRR
128 tristate "MULTIPATH: interface round robin algorithm"
129 depends on IP_ROUTE_MULTIPATH_CACHED
130 help
131 Connections are distributed in a round robin fashion over the
132 available interfaces. This policy makes sense if the connections
133 should be primarily distributed on interfaces and not on routes.
134
135config IP_ROUTE_VERBOSE
136 bool "IP: verbose route monitoring"
137 depends on IP_ADVANCED_ROUTER
138 help
139 If you say Y here, which is recommended, then the kernel will print
140 verbose messages regarding the routing, for example warnings about
141 received packets which look strange and could be evidence of an
142 attack or a misconfigured system somewhere. The information is
143 handled by the klogd daemon which is responsible for kernel messages
144 ("man klogd").
145
146config IP_PNP
147 bool "IP: kernel level autoconfiguration"
148 depends on INET
149 help
150 This enables automatic configuration of IP addresses of devices and
151 of the routing table during kernel boot, based on either information
152 supplied on the kernel command line or by BOOTP or RARP protocols.
153 You need to say Y only for diskless machines requiring network
154 access to boot (in which case you want to say Y to "Root file system
155 on NFS" as well), because all other machines configure the network
156 in their startup scripts.
157
158config IP_PNP_DHCP
159 bool "IP: DHCP support"
160 depends on IP_PNP
161 ---help---
162 If you want your Linux box to mount its whole root file system (the
163 one containing the directory /) from some other computer over the
164 net via NFS and you want the IP address of your computer to be
165 discovered automatically at boot time using the DHCP protocol (a
166 special protocol designed for doing this job), say Y here. In case
167 the boot ROM of your network card was designed for booting Linux and
168 does DHCP itself, providing all necessary information on the kernel
169 command line, you can say N here.
170
171 If unsure, say Y. Note that if you want to use DHCP, a DHCP server
172 must be operating on your network. Read
173 <file:Documentation/nfsroot.txt> for details.
174
175config IP_PNP_BOOTP
176 bool "IP: BOOTP support"
177 depends on IP_PNP
178 ---help---
179 If you want your Linux box to mount its whole root file system (the
180 one containing the directory /) from some other computer over the
181 net via NFS and you want the IP address of your computer to be
182 discovered automatically at boot time using the BOOTP protocol (a
183 special protocol designed for doing this job), say Y here. In case
184 the boot ROM of your network card was designed for booting Linux and
185 does BOOTP itself, providing all necessary information on the kernel
186 command line, you can say N here. If unsure, say Y. Note that if you
187 want to use BOOTP, a BOOTP server must be operating on your network.
188 Read <file:Documentation/nfsroot.txt> for details.
189
190config IP_PNP_RARP
191 bool "IP: RARP support"
192 depends on IP_PNP
193 help
194 If you want your Linux box to mount its whole root file system (the
195 one containing the directory /) from some other computer over the
196 net via NFS and you want the IP address of your computer to be
197 discovered automatically at boot time using the RARP protocol (an
198 older protocol which is being obsoleted by BOOTP and DHCP), say Y
199 here. Note that if you want to use RARP, a RARP server must be
200 operating on your network. Read <file:Documentation/nfsroot.txt> for
201 details.
202
203# not yet ready..
204# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
205config NET_IPIP
206 tristate "IP: tunneling"
207 depends on INET
208 select INET_TUNNEL
209 ---help---
210 Tunneling means encapsulating data of one protocol type within
211 another protocol and sending it over a channel that understands the
212 encapsulating protocol. This particular tunneling driver implements
213 encapsulation of IP within IP, which sounds kind of pointless, but
214 can be useful if you want to make your (or some other) machine
215 appear on a different network than it physically is, or to use
216 mobile-IP facilities (allowing laptops to seamlessly move between
217 networks without changing their IP addresses).
218
219 Saying Y to this option will produce two modules ( = code which can
220 be inserted in and removed from the running kernel whenever you
221 want). Most people won't need this and can say N.
222
223config NET_IPGRE
224 tristate "IP: GRE tunnels over IP"
225 depends on INET
226 select XFRM
227 help
228 Tunneling means encapsulating data of one protocol type within
229 another protocol and sending it over a channel that understands the
230 encapsulating protocol. This particular tunneling driver implements
231 GRE (Generic Routing Encapsulation) and at this time allows
232 encapsulation of IPv4 or IPv6 over existing IPv4 infrastructure.
233 This driver is useful if the other endpoint is a Cisco router: Cisco
234 likes GRE much better than the other Linux tunneling driver ("IP
235 tunneling" above). In addition, GRE allows multicast redistribution
236 through the tunnel.
237
238config NET_IPGRE_BROADCAST
239 bool "IP: broadcast GRE over IP"
240 depends on IP_MULTICAST && NET_IPGRE
241 help
242 One application of GRE/IP is to construct a broadcast WAN (Wide Area
243 Network), which looks like a normal Ethernet LAN (Local Area
244 Network), but can be distributed all over the Internet. If you want
245 to do that, say Y here and to "IP multicast routing" below.
246
247config IP_MROUTE
248 bool "IP: multicast routing"
249 depends on IP_MULTICAST
250 help
251 This is used if you want your machine to act as a router for IP
252 packets that have several destination addresses. It is needed on the
253 MBONE, a high bandwidth network on top of the Internet which carries
254 audio and video broadcasts. In order to do that, you would most
255 likely run the program mrouted. Information about the multicast
256 capabilities of the various network cards is contained in
257 <file:Documentation/networking/multicast.txt>. If you haven't heard
258 about it, you don't need it.
259
260config IP_PIMSM_V1
261 bool "IP: PIM-SM version 1 support"
262 depends on IP_MROUTE
263 help
264 Kernel side support for Sparse Mode PIM (Protocol Independent
265 Multicast) version 1. This multicast routing protocol is used widely
266 because Cisco supports it. You need special software to use it
267 (pimd-v1). Please see <http://netweb.usc.edu/pim/> for more
268 information about PIM.
269
270 Say Y if you want to use PIM-SM v1. Note that you can say N here if
271 you just want to use Dense Mode PIM.
272
273config IP_PIMSM_V2
274 bool "IP: PIM-SM version 2 support"
275 depends on IP_MROUTE
276 help
277 Kernel side support for Sparse Mode PIM version 2. In order to use
278 this, you need an experimental routing daemon supporting it (pimd or
279 gated-5). This routing protocol is not used widely, so say N unless
280 you want to play with it.
281
282config ARPD
283 bool "IP: ARP daemon support (EXPERIMENTAL)"
284 depends on INET && EXPERIMENTAL
285 ---help---
286 Normally, the kernel maintains an internal cache which maps IP
287 addresses to hardware addresses on the local network, so that
288 Ethernet/Token Ring/ etc. frames are sent to the proper address on
289 the physical networking layer. For small networks having a few
290 hundred directly connected hosts or less, keeping this address
291 resolution (ARP) cache inside the kernel works well. However,
292 maintaining an internal ARP cache does not work well for very large
293 switched networks, and will use a lot of kernel memory if TCP/IP
294 connections are made to many machines on the network.
295
296 If you say Y here, the kernel's internal ARP cache will never grow
297 to more than 256 entries (the oldest entries are expired in a LIFO
298 manner) and communication will be attempted with the user space ARP
299 daemon arpd. Arpd then answers the address resolution request either
300 from its own cache or by asking the net.
301
302 This code is experimental and also obsolete. If you want to use it,
303 you need to find a version of the daemon arpd on the net somewhere,
304 and you should also say Y to "Kernel/User network link driver",
305 below. If unsure, say N.
306
307config SYN_COOKIES
308 bool "IP: TCP syncookie support (disabled per default)"
309 depends on INET
310 ---help---
311 Normal TCP/IP networking is open to an attack known as "SYN
312 flooding". This denial-of-service attack prevents legitimate remote
313 users from being able to connect to your computer during an ongoing
314 attack and requires very little work from the attacker, who can
315 operate from anywhere on the Internet.
316
317 SYN cookies provide protection against this type of attack. If you
318 say Y here, the TCP/IP stack will use a cryptographic challenge
319 protocol known as "SYN cookies" to enable legitimate users to
320 continue to connect, even when your machine is under attack. There
321 is no need for the legitimate users to change their TCP/IP software;
322 SYN cookies work transparently to them. For technical information
323 about SYN cookies, check out <http://cr.yp.to/syncookies.html>.
324
325 If you are SYN flooded, the source address reported by the kernel is
326 likely to have been forged by the attacker; it is only reported as
327 an aid in tracing the packets to their actual source and should not
328 be taken as absolute truth.
329
330 SYN cookies may prevent correct error reporting on clients when the
331 server is really overloaded. If this happens frequently, it is better
332 to turn them off.
333
334 If you say Y here, note that SYN cookies aren't enabled by default;
335 you can enable them by saying Y to "/proc file system support" and
336 "Sysctl support" below and executing the command
337
338 echo 1 >/proc/sys/net/ipv4/tcp_syncookies
339
340 at boot time after the /proc file system has been mounted.
341
342 If unsure, say N.
343
344config INET_AH
345 tristate "IP: AH transformation"
346 depends on INET
347 select XFRM
348 select CRYPTO
349 select CRYPTO_HMAC
350 select CRYPTO_MD5
351 select CRYPTO_SHA1
352 ---help---
353 Support for IPsec AH.
354
355 If unsure, say Y.
356
357config INET_ESP
358 tristate "IP: ESP transformation"
359 depends on INET
360 select XFRM
361 select CRYPTO
362 select CRYPTO_HMAC
363 select CRYPTO_MD5
364 select CRYPTO_SHA1
365 select CRYPTO_DES
366 ---help---
367 Support for IPsec ESP.
368
369 If unsure, say Y.
370
371config INET_IPCOMP
372 tristate "IP: IPComp transformation"
373 depends on INET
374 select XFRM
375 select INET_TUNNEL
376 select CRYPTO
377 select CRYPTO_DEFLATE
378 ---help---
379 Support for IP Payload Compression Protocol (IPComp) (RFC3173),
380 typically needed for IPsec.
381
382 If unsure, say Y.
383
384config INET_TUNNEL
385 tristate "IP: tunnel transformation"
386 depends on INET
387 select XFRM
388 ---help---
389 Support for generic IP tunnel transformation, which is required by
390 the IP tunneling module as well as tunnel mode IPComp.
391
392 If unsure, say Y.
393
394config IP_TCPDIAG
395 tristate "IP: TCP socket monitoring interface"
396 depends on INET
397 default y
398 ---help---
399 Support for TCP socket monitoring interface used by native Linux
400 tools such as ss. ss is included in iproute2, currently downloadable
401 at <http://developer.osdl.org/dev/iproute2>. If you want IPv6 support
402 and have selected IPv6 as a module, you need to build this as a
403 module too.
404
405 If unsure, say Y.
406
407config IP_TCPDIAG_IPV6
408 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
409
410source "net/ipv4/ipvs/Kconfig"
411
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
new file mode 100644
index 000000000000..8b379627ebb6
--- /dev/null
+++ b/net/ipv4/Makefile
@@ -0,0 +1,33 @@
1#
2# Makefile for the Linux TCP/IP (INET) layer.
3#
4
5obj-y := utils.o route.o inetpeer.o protocol.o \
6 ip_input.o ip_fragment.o ip_forward.o ip_options.o \
7 ip_output.o ip_sockglue.o \
8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \
9 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
10 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o
11
12obj-$(CONFIG_PROC_FS) += proc.o
13obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
14obj-$(CONFIG_IP_MROUTE) += ipmr.o
15obj-$(CONFIG_NET_IPIP) += ipip.o
16obj-$(CONFIG_NET_IPGRE) += ip_gre.o
17obj-$(CONFIG_SYN_COOKIES) += syncookies.o
18obj-$(CONFIG_INET_AH) += ah4.o
19obj-$(CONFIG_INET_ESP) += esp4.o
20obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
21obj-$(CONFIG_INET_TUNNEL) += xfrm4_tunnel.o
22obj-$(CONFIG_IP_PNP) += ipconfig.o
23obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o
24obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o
25obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o
26obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o
27obj-$(CONFIG_NETFILTER) += netfilter/
28obj-$(CONFIG_IP_VS) += ipvs/
29obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o
30obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
31
32obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
33 xfrm4_output.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
new file mode 100644
index 000000000000..c34dab67e461
--- /dev/null
+++ b/net/ipv4/af_inet.c
@@ -0,0 +1,1188 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PF_INET protocol family socket handler.
7 *
8 * Version: $Id: af_inet.c,v 1.137 2002/02/01 22:01:03 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Changes (see also sock.c)
16 *
17 * piggy,
18 * Karl Knutson : Socket protocol table
19 * A.N.Kuznetsov : Socket death error in accept().
20 * John Richardson : Fix non blocking error in connect()
21 * so sockets that fail to connect
22 * don't return -EINPROGRESS.
23 * Alan Cox : Asynchronous I/O support
24 * Alan Cox : Keep correct socket pointer on sock
25 * structures
26 * when accept() ed
27 * Alan Cox : Semantics of SO_LINGER aren't state
28 * moved to close when you look carefully.
29 * With this fixed and the accept bug fixed
30 * some RPC stuff seems happier.
31 * Niibe Yutaka : 4.4BSD style write async I/O
32 * Alan Cox,
33 * Tony Gale : Fixed reuse semantics.
34 * Alan Cox : bind() shouldn't abort existing but dead
35 * sockets. Stops FTP netin:.. I hope.
36 * Alan Cox : bind() works correctly for RAW sockets.
37 * Note that FreeBSD at least was broken
38 * in this respect so be careful with
39 * compatibility tests...
40 * Alan Cox : routing cache support
41 * Alan Cox : memzero the socket structure for
42 * compactness.
43 * Matt Day : nonblock connect error handler
44 * Alan Cox : Allow large numbers of pending sockets
45 * (eg for big web sites), but only if
46 * specifically application requested.
47 * Alan Cox : New buffering throughout IP. Used
48 * dumbly.
49 * Alan Cox : New buffering now used smartly.
50 * Alan Cox : BSD rather than common sense
51 * interpretation of listen.
52 * Germano Caronni : Assorted small races.
53 * Alan Cox : sendmsg/recvmsg basic support.
54 * Alan Cox : Only sendmsg/recvmsg now supported.
55 * Alan Cox : Locked down bind (see security list).
56 * Alan Cox : Loosened bind a little.
57 * Mike McLagan : ADD/DEL DLCI Ioctls
58 * Willy Konynenberg : Transparent proxying support.
59 * David S. Miller : New socket lookup architecture.
60 * Some other random speedups.
61 * Cyrus Durgin : Cleaned up file for kmod hacks.
62 * Andi Kleen : Fix inet_stream_connect TCP race.
63 *
64 * This program is free software; you can redistribute it and/or
65 * modify it under the terms of the GNU General Public License
66 * as published by the Free Software Foundation; either version
67 * 2 of the License, or (at your option) any later version.
68 */
69
70#include <linux/config.h>
71#include <linux/errno.h>
72#include <linux/types.h>
73#include <linux/socket.h>
74#include <linux/in.h>
75#include <linux/kernel.h>
76#include <linux/major.h>
77#include <linux/module.h>
78#include <linux/sched.h>
79#include <linux/timer.h>
80#include <linux/string.h>
81#include <linux/sockios.h>
82#include <linux/net.h>
83#include <linux/fcntl.h>
84#include <linux/mm.h>
85#include <linux/interrupt.h>
86#include <linux/stat.h>
87#include <linux/init.h>
88#include <linux/poll.h>
89#include <linux/netfilter_ipv4.h>
90
91#include <asm/uaccess.h>
92#include <asm/system.h>
93
94#include <linux/smp_lock.h>
95#include <linux/inet.h>
96#include <linux/igmp.h>
97#include <linux/netdevice.h>
98#include <net/ip.h>
99#include <net/protocol.h>
100#include <net/arp.h>
101#include <net/route.h>
102#include <net/ip_fib.h>
103#include <net/tcp.h>
104#include <net/udp.h>
105#include <linux/skbuff.h>
106#include <net/sock.h>
107#include <net/raw.h>
108#include <net/icmp.h>
109#include <net/ipip.h>
110#include <net/inet_common.h>
111#include <net/xfrm.h>
112#ifdef CONFIG_IP_MROUTE
113#include <linux/mroute.h>
114#endif
115
116DEFINE_SNMP_STAT(struct linux_mib, net_statistics);
117
118#ifdef INET_REFCNT_DEBUG
119atomic_t inet_sock_nr;
120#endif
121
122extern void ip_mc_drop_socket(struct sock *sk);
123
124/* The inetsw table contains everything that inet_create needs to
125 * build a new socket.
126 */
127static struct list_head inetsw[SOCK_MAX];
128static DEFINE_SPINLOCK(inetsw_lock);
129
130/* New destruction routine */
131
132void inet_sock_destruct(struct sock *sk)
133{
134 struct inet_sock *inet = inet_sk(sk);
135
136 __skb_queue_purge(&sk->sk_receive_queue);
137 __skb_queue_purge(&sk->sk_error_queue);
138
139 if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
140 printk("Attempt to release TCP socket in state %d %p\n",
141 sk->sk_state, sk);
142 return;
143 }
144 if (!sock_flag(sk, SOCK_DEAD)) {
145 printk("Attempt to release alive inet socket %p\n", sk);
146 return;
147 }
148
149 BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
150 BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
151 BUG_TRAP(!sk->sk_wmem_queued);
152 BUG_TRAP(!sk->sk_forward_alloc);
153
154 if (inet->opt)
155 kfree(inet->opt);
156 dst_release(sk->sk_dst_cache);
157#ifdef INET_REFCNT_DEBUG
158 atomic_dec(&inet_sock_nr);
159 printk(KERN_DEBUG "INET socket %p released, %d are still alive\n",
160 sk, atomic_read(&inet_sock_nr));
161#endif
162}
163
164/*
165 * The routines beyond this point handle the behaviour of an AF_INET
166 * socket object. Mostly it punts to the subprotocols of IP to do
167 * the work.
168 */
169
170/*
171 * Automatically bind an unbound socket.
172 */
173
174static int inet_autobind(struct sock *sk)
175{
176 struct inet_sock *inet;
177 /* We may need to bind the socket. */
178 lock_sock(sk);
179 inet = inet_sk(sk);
180 if (!inet->num) {
181 if (sk->sk_prot->get_port(sk, 0)) {
182 release_sock(sk);
183 return -EAGAIN;
184 }
185 inet->sport = htons(inet->num);
186 }
187 release_sock(sk);
188 return 0;
189}
190
191/*
192 * Move a socket into listening state.
193 */
194int inet_listen(struct socket *sock, int backlog)
195{
196 struct sock *sk = sock->sk;
197 unsigned char old_state;
198 int err;
199
200 lock_sock(sk);
201
202 err = -EINVAL;
203 if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
204 goto out;
205
206 old_state = sk->sk_state;
207 if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
208 goto out;
209
210 /* Really, if the socket is already in listen state
211 * we can only allow the backlog to be adjusted.
212 */
213 if (old_state != TCP_LISTEN) {
214 err = tcp_listen_start(sk);
215 if (err)
216 goto out;
217 }
218 sk->sk_max_ack_backlog = backlog;
219 err = 0;
220
221out:
222 release_sock(sk);
223 return err;
224}
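
As the code above shows, calling listen(2) on a socket that is already in TCP_LISTEN is not an error; only sk_max_ack_backlog is adjusted. A small userspace sketch, illustrative only (the port and helper name are made up for the example):

/* Illustrative only: listen(2) may be called again on a listening
 * socket purely to resize the accept backlog. */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static int listening_socket(unsigned short port)
{
        struct sockaddr_in sa;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;

        memset(&sa, 0, sizeof(sa));
        sa.sin_family = AF_INET;
        sa.sin_addr.s_addr = htonl(INADDR_ANY);
        sa.sin_port = htons(port);

        if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0 ||
            listen(fd, 16) < 0) {       /* enters TCP_LISTEN via tcp_listen_start() */
                close(fd);
                return -1;
        }

        /* Already listening: inet_listen() only updates the backlog. */
        listen(fd, 128);
        return fd;
}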
225
226/*
227 * Create an inet socket.
228 */
229
230static int inet_create(struct socket *sock, int protocol)
231{
232 struct sock *sk;
233 struct list_head *p;
234 struct inet_protosw *answer;
235 struct inet_sock *inet;
236 struct proto *answer_prot;
237 unsigned char answer_flags;
238 char answer_no_check;
239 int err;
240
241 sock->state = SS_UNCONNECTED;
242
243 /* Look for the requested type/protocol pair. */
244 answer = NULL;
245 rcu_read_lock();
246 list_for_each_rcu(p, &inetsw[sock->type]) {
247 answer = list_entry(p, struct inet_protosw, list);
248
249 /* Check the non-wild match. */
250 if (protocol == answer->protocol) {
251 if (protocol != IPPROTO_IP)
252 break;
253 } else {
254 /* Check for the two wild cases. */
255 if (IPPROTO_IP == protocol) {
256 protocol = answer->protocol;
257 break;
258 }
259 if (IPPROTO_IP == answer->protocol)
260 break;
261 }
262 answer = NULL;
263 }
264
265 err = -ESOCKTNOSUPPORT;
266 if (!answer)
267 goto out_rcu_unlock;
268 err = -EPERM;
269 if (answer->capability > 0 && !capable(answer->capability))
270 goto out_rcu_unlock;
271 err = -EPROTONOSUPPORT;
272 if (!protocol)
273 goto out_rcu_unlock;
274
275 sock->ops = answer->ops;
276 answer_prot = answer->prot;
277 answer_no_check = answer->no_check;
278 answer_flags = answer->flags;
279 rcu_read_unlock();
280
281 BUG_TRAP(answer_prot->slab != NULL);
282
283 err = -ENOBUFS;
284 sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1);
285 if (sk == NULL)
286 goto out;
287
288 err = 0;
289 sk->sk_no_check = answer_no_check;
290 if (INET_PROTOSW_REUSE & answer_flags)
291 sk->sk_reuse = 1;
292
293 inet = inet_sk(sk);
294
295 if (SOCK_RAW == sock->type) {
296 inet->num = protocol;
297 if (IPPROTO_RAW == protocol)
298 inet->hdrincl = 1;
299 }
300
301 if (ipv4_config.no_pmtu_disc)
302 inet->pmtudisc = IP_PMTUDISC_DONT;
303 else
304 inet->pmtudisc = IP_PMTUDISC_WANT;
305
306 inet->id = 0;
307
308 sock_init_data(sock, sk);
309
310 sk->sk_destruct = inet_sock_destruct;
311 sk->sk_family = PF_INET;
312 sk->sk_protocol = protocol;
313 sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
314
315 inet->uc_ttl = -1;
316 inet->mc_loop = 1;
317 inet->mc_ttl = 1;
318 inet->mc_index = 0;
319 inet->mc_list = NULL;
320
321#ifdef INET_REFCNT_DEBUG
322 atomic_inc(&inet_sock_nr);
323#endif
324
325 if (inet->num) {
326 /* It assumes that any protocol which allows
327 * the user to assign a number at socket
328 * creation time automatically
329 * shares.
330 */
331 inet->sport = htons(inet->num);
332 /* Add to protocol hash chains. */
333 sk->sk_prot->hash(sk);
334 }
335
336 if (sk->sk_prot->init) {
337 err = sk->sk_prot->init(sk);
338 if (err)
339 sk_common_release(sk);
340 }
341out:
342 return err;
343out_rcu_unlock:
344 rcu_read_unlock();
345 goto out;
346}
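
The matching loop above is what a plain socket(2) call from userspace ends up exercising: an exact type/protocol pair wins, IPPROTO_IP (0) acts as a wild card on either side, and answer->capability gates privileged entries such as SOCK_RAW. A minimal userspace sketch, illustrative only and not part of this commit, showing the three cases:

/* Illustrative only: how socket(2) arguments map onto the inetsw
 * lookup in inet_create(). Build as an ordinary user program. */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
        /* Wild-card protocol: 0 is IPPROTO_IP, so the first SOCK_STREAM
         * entry (tcp_prot) is selected and the protocol becomes TCP. */
        int tcp = socket(AF_INET, SOCK_STREAM, 0);

        /* Exact match against the SOCK_DGRAM/IPPROTO_UDP entry. */
        int udp = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);

        /* The SOCK_RAW entry carries .capability = CAP_NET_RAW, so this
         * fails with EPERM for an unprivileged caller. */
        int raw = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
        if (raw < 0)
                printf("raw socket: %s\n", strerror(errno));

        if (tcp >= 0) close(tcp);
        if (udp >= 0) close(udp);
        if (raw >= 0) close(raw);
        return 0;
}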
347
348
349/*
350 * The peer socket should always be NULL (or else). When we call this
351 * function we are destroying the object and from then on nobody
352 * should refer to it.
353 */
354int inet_release(struct socket *sock)
355{
356 struct sock *sk = sock->sk;
357
358 if (sk) {
359 long timeout;
360
361 /* Applications forget to leave groups before exiting */
362 ip_mc_drop_socket(sk);
363
364 /* If linger is set, we don't return until the close
365 * is complete. Otherwise we return immediately. The
366 * actually closing is done the same either way.
367 *
368 * If the close is due to the process exiting, we never
369 * linger..
370 */
371 timeout = 0;
372 if (sock_flag(sk, SOCK_LINGER) &&
373 !(current->flags & PF_EXITING))
374 timeout = sk->sk_lingertime;
375 sock->sk = NULL;
376 sk->sk_prot->close(sk, timeout);
377 }
378 return 0;
379}
380
381/* It is off by default, see below. */
382int sysctl_ip_nonlocal_bind;
383
384int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
385{
386 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
387 struct sock *sk = sock->sk;
388 struct inet_sock *inet = inet_sk(sk);
389 unsigned short snum;
390 int chk_addr_ret;
391 int err;
392
393 /* If the socket has its own bind function then use it. (RAW) */
394 if (sk->sk_prot->bind) {
395 err = sk->sk_prot->bind(sk, uaddr, addr_len);
396 goto out;
397 }
398 err = -EINVAL;
399 if (addr_len < sizeof(struct sockaddr_in))
400 goto out;
401
402 chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
403
404 /* Not specified by any standard per se, however it breaks too
405 * many applications when removed. It is unfortunate since
406 * allowing applications to make a non-local bind solves
407 * several problems with systems using dynamic addressing.
408 * (ie. your servers still start up even if your ISDN link
409 * is temporarily down)
410 */
411 err = -EADDRNOTAVAIL;
412 if (!sysctl_ip_nonlocal_bind &&
413 !inet->freebind &&
414 addr->sin_addr.s_addr != INADDR_ANY &&
415 chk_addr_ret != RTN_LOCAL &&
416 chk_addr_ret != RTN_MULTICAST &&
417 chk_addr_ret != RTN_BROADCAST)
418 goto out;
419
420 snum = ntohs(addr->sin_port);
421 err = -EACCES;
422 if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
423 goto out;
424
425 /* We keep a pair of addresses. rcv_saddr is the one
426 * used by hash lookups, and saddr is used for transmit.
427 *
428 * In the BSD API these are the same except where it
429 * would be illegal to use them (multicast/broadcast) in
430 * which case the sending device address is used.
431 */
432 lock_sock(sk);
433
434 /* Check these errors (active socket, double bind). */
435 err = -EINVAL;
436 if (sk->sk_state != TCP_CLOSE || inet->num)
437 goto out_release_sock;
438
439 inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
440 if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
441 inet->saddr = 0; /* Use device */
442
443 /* Make sure we are allowed to bind here. */
444 if (sk->sk_prot->get_port(sk, snum)) {
445 inet->saddr = inet->rcv_saddr = 0;
446 err = -EADDRINUSE;
447 goto out_release_sock;
448 }
449
450 if (inet->rcv_saddr)
451 sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
452 if (snum)
453 sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
454 inet->sport = htons(inet->num);
455 inet->daddr = 0;
456 inet->dport = 0;
457 sk_dst_reset(sk);
458 err = 0;
459out_release_sock:
460 release_sock(sk);
461out:
462 return err;
463}
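
For orientation, the checks above are what userspace bind(2) callers run into: ports below PROT_SOCK (1024) need CAP_NET_BIND_SERVICE, non-local addresses are refused unless ip_nonlocal_bind or the per-socket freebind flag is set, and a second bind on an already-bound socket fails with EINVAL. A hedged userspace sketch, not part of the patch; the port numbers are just examples:

/* Illustrative only: exercises the checks in inet_bind(). */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
        struct sockaddr_in sa;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        memset(&sa, 0, sizeof(sa));
        sa.sin_family = AF_INET;
        sa.sin_addr.s_addr = htonl(INADDR_ANY);

        /* Port 80 is below PROT_SOCK (1024): without CAP_NET_BIND_SERVICE
         * this fails with EACCES. */
        sa.sin_port = htons(80);
        if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
                printf("bind(80): %s\n", strerror(errno));

        /* An unprivileged port is fine for any user. Binding the same
         * socket again after a successful bind would fail with EINVAL
         * (the "double bind" check above). */
        sa.sin_port = htons(8080);
        if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
                printf("bind(8080): %s\n", strerror(errno));

        close(fd);
        return 0;
}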
464
465int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
466 int addr_len, int flags)
467{
468 struct sock *sk = sock->sk;
469
470 if (uaddr->sa_family == AF_UNSPEC)
471 return sk->sk_prot->disconnect(sk, flags);
472
473 if (!inet_sk(sk)->num && inet_autobind(sk))
474 return -EAGAIN;
475 return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
476}
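
The AF_UNSPEC branch above implements the documented way to dissolve a datagram association: connect(2) with the family set to AF_UNSPEC routes into the protocol's disconnect handler. A short illustrative sketch (the address 192.0.2.1 and port 53 are only documentation examples):

/* Illustrative only: set and then dissolve a UDP association. */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
        struct sockaddr_in dst;
        struct sockaddr unspec;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        memset(&dst, 0, sizeof(dst));
        dst.sin_family = AF_INET;
        dst.sin_port = htons(53);
        dst.sin_addr.s_addr = inet_addr("192.0.2.1");

        /* Sets the default destination; an unbound socket is autobound
         * to a local port first (inet_autobind() above). */
        connect(fd, (struct sockaddr *)&dst, sizeof(dst));

        /* AF_UNSPEC: taken by the sk->sk_prot->disconnect() branch,
         * removing the association again. */
        memset(&unspec, 0, sizeof(unspec));
        unspec.sa_family = AF_UNSPEC;
        connect(fd, &unspec, sizeof(unspec));

        close(fd);
        return 0;
}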
477
478static long inet_wait_for_connect(struct sock *sk, long timeo)
479{
480 DEFINE_WAIT(wait);
481
482 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
483
484 /* Basic assumption: if someone sets sk->sk_err, he _must_
485 * change state of the socket from TCP_SYN_*.
486 * Connect() does not allow to get error notifications
487 * without closing the socket.
488 */
489 while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
490 release_sock(sk);
491 timeo = schedule_timeout(timeo);
492 lock_sock(sk);
493 if (signal_pending(current) || !timeo)
494 break;
495 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
496 }
497 finish_wait(sk->sk_sleep, &wait);
498 return timeo;
499}
500
501/*
502 * Connect to a remote host. There is regrettably still a little
503 * TCP 'magic' in here.
504 */
505int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
506 int addr_len, int flags)
507{
508 struct sock *sk = sock->sk;
509 int err;
510 long timeo;
511
512 lock_sock(sk);
513
514 if (uaddr->sa_family == AF_UNSPEC) {
515 err = sk->sk_prot->disconnect(sk, flags);
516 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
517 goto out;
518 }
519
520 switch (sock->state) {
521 default:
522 err = -EINVAL;
523 goto out;
524 case SS_CONNECTED:
525 err = -EISCONN;
526 goto out;
527 case SS_CONNECTING:
528 err = -EALREADY;
529 /* Fall out of switch with err, set for this state */
530 break;
531 case SS_UNCONNECTED:
532 err = -EISCONN;
533 if (sk->sk_state != TCP_CLOSE)
534 goto out;
535
536 err = sk->sk_prot->connect(sk, uaddr, addr_len);
537 if (err < 0)
538 goto out;
539
540 sock->state = SS_CONNECTING;
541
542 /* Just entered SS_CONNECTING state; the only
543 * difference is that return value in non-blocking
544 * case is EINPROGRESS, rather than EALREADY.
545 */
546 err = -EINPROGRESS;
547 break;
548 }
549
550 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
551
552 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
553 /* Error code is set above */
554 if (!timeo || !inet_wait_for_connect(sk, timeo))
555 goto out;
556
557 err = sock_intr_errno(timeo);
558 if (signal_pending(current))
559 goto out;
560 }
561
562 /* Connection was closed by RST, timeout, ICMP error
563 * or another process disconnected us.
564 */
565 if (sk->sk_state == TCP_CLOSE)
566 goto sock_error;
567
568 /* sk->sk_err may be not zero now, if RECVERR was ordered by user
569 * and error was received after socket entered established state.
570 * Hence, it is handled normally after connect() return successfully.
571 */
572
573 sock->state = SS_CONNECTED;
574 err = 0;
575out:
576 release_sock(sk);
577 return err;
578
579sock_error:
580 err = sock_error(sk) ? : -ECONNABORTED;
581 sock->state = SS_UNCONNECTED;
582 if (sk->sk_prot->disconnect(sk, flags))
583 sock->state = SS_DISCONNECTING;
584 goto out;
585}
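
The SS_UNCONNECTED/SS_CONNECTING handling above is what gives non-blocking connect(2) its familiar shape: the first call returns EINPROGRESS, a repeated call while still connecting returns EALREADY, and completion is observed by waiting for writability and reading SO_ERROR. A userspace sketch of that pattern, illustrative only (the helper name and timeout are arbitrary):

/* Illustrative only: non-blocking connect against the state machine
 * in inet_stream_connect(). */
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static int nonblocking_connect(const char *ip, unsigned short port)
{
        struct sockaddr_in sa;
        struct pollfd pfd;
        int fd, flags, err = 0;
        socklen_t len = sizeof(err);

        fd = socket(AF_INET, SOCK_STREAM, 0);
        if (fd < 0)
                return -1;
        flags = fcntl(fd, F_GETFL, 0);
        fcntl(fd, F_SETFL, flags | O_NONBLOCK);

        memset(&sa, 0, sizeof(sa));
        sa.sin_family = AF_INET;
        sa.sin_port = htons(port);
        sa.sin_addr.s_addr = inet_addr(ip);

        if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) == 0)
                return fd;              /* connected straight away */
        if (errno != EINPROGRESS) {     /* hard failure */
                close(fd);
                return -1;
        }

        /* SS_CONNECTING: wait for writability, then read SO_ERROR to
         * see how the handshake ended. */
        pfd.fd = fd;
        pfd.events = POLLOUT;
        if (poll(&pfd, 1, 5000) <= 0 ||
            getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0 || err) {
                close(fd);
                return -1;
        }
        return fd;
}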
586
587/*
588 * Accept a pending connection. The TCP layer now gives BSD semantics.
589 */
590
591int inet_accept(struct socket *sock, struct socket *newsock, int flags)
592{
593 struct sock *sk1 = sock->sk;
594 int err = -EINVAL;
595 struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
596
597 if (!sk2)
598 goto do_err;
599
600 lock_sock(sk2);
601
602 BUG_TRAP((1 << sk2->sk_state) &
603 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE));
604
605 sock_graft(sk2, newsock);
606
607 newsock->state = SS_CONNECTED;
608 err = 0;
609 release_sock(sk2);
610do_err:
611 return err;
612}
613
614
615/*
616 * This does both peername and sockname.
617 */
618int inet_getname(struct socket *sock, struct sockaddr *uaddr,
619 int *uaddr_len, int peer)
620{
621 struct sock *sk = sock->sk;
622 struct inet_sock *inet = inet_sk(sk);
623 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
624
625 sin->sin_family = AF_INET;
626 if (peer) {
627 if (!inet->dport ||
628 (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
629 peer == 1))
630 return -ENOTCONN;
631 sin->sin_port = inet->dport;
632 sin->sin_addr.s_addr = inet->daddr;
633 } else {
634 __u32 addr = inet->rcv_saddr;
635 if (!addr)
636 addr = inet->saddr;
637 sin->sin_port = inet->sport;
638 sin->sin_addr.s_addr = addr;
639 }
640 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
641 *uaddr_len = sizeof(*sin);
642 return 0;
643}
644
645int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
646 size_t size)
647{
648 struct sock *sk = sock->sk;
649
650 /* We may need to bind the socket. */
651 if (!inet_sk(sk)->num && inet_autobind(sk))
652 return -EAGAIN;
653
654 return sk->sk_prot->sendmsg(iocb, sk, msg, size);
655}
656
657
658static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
659{
660 struct sock *sk = sock->sk;
661
662 /* We may need to bind the socket. */
663 if (!inet_sk(sk)->num && inet_autobind(sk))
664 return -EAGAIN;
665
666 if (sk->sk_prot->sendpage)
667 return sk->sk_prot->sendpage(sk, page, offset, size, flags);
668 return sock_no_sendpage(sock, page, offset, size, flags);
669}
670
671
672int inet_shutdown(struct socket *sock, int how)
673{
674 struct sock *sk = sock->sk;
675 int err = 0;
676
677 /* This should really check to make sure
678 * the socket is a TCP socket. (WHY AC...)
679 */
680 how++; /* maps 0->1 has the advantage of making bit 1 rcvs and
681 1->2 bit 2 snds.
682 2->3 */
683 if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */
684 return -EINVAL;
685
686 lock_sock(sk);
687 if (sock->state == SS_CONNECTING) {
688 if ((1 << sk->sk_state) &
689 (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
690 sock->state = SS_DISCONNECTING;
691 else
692 sock->state = SS_CONNECTED;
693 }
694
695 switch (sk->sk_state) {
696 case TCP_CLOSE:
697 err = -ENOTCONN;
698 /* Hack to wake up other listeners, who can poll for
699 POLLHUP, even on eg. unconnected UDP sockets -- RR */
700 default:
701 sk->sk_shutdown |= how;
702 if (sk->sk_prot->shutdown)
703 sk->sk_prot->shutdown(sk, how);
704 break;
705
706 /* Remaining two branches are temporary solution for missing
707 * close() in multithreaded environment. It is _not_ a good idea,
708 * but we have no choice until close() is repaired at VFS level.
709 */
710 case TCP_LISTEN:
711 if (!(how & RCV_SHUTDOWN))
712 break;
713 /* Fall through */
714 case TCP_SYN_SENT:
715 err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
716 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
717 break;
718 }
719
720 /* Wake up anyone sleeping in poll. */
721 sk->sk_state_change(sk);
722 release_sock(sk);
723 return err;
724}
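
The how++ trick above maps the userspace constants SHUT_RD (0), SHUT_WR (1) and SHUT_RDWR (2) onto the RCV_SHUTDOWN/SEND_SHUTDOWN bit masks (1, 2, 3). Its most common use is the TCP half-close; a brief illustrative sketch (the helper name is made up):

/* Illustrative only: half-close a connected TCP socket, then drain
 * whatever the peer still has to send. */
#include <sys/types.h>
#include <sys/socket.h>

static ssize_t send_then_drain(int fd, const void *req, size_t len)
{
        char buf[4096];
        ssize_t n, total = 0;

        if (send(fd, req, len, 0) < 0)
                return -1;

        /* SHUT_WR is 1; after the how++ mapping it becomes SEND_SHUTDOWN:
         * our FIN goes out, the receive side stays open. */
        if (shutdown(fd, SHUT_WR) < 0)
                return -1;

        /* Read until the peer closes its side as well. */
        while ((n = recv(fd, buf, sizeof(buf), 0)) > 0)
                total += n;
        return n < 0 ? -1 : total;
}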
725
726/*
727 * ioctl() calls you can issue on an INET socket. Most of these are
728 * device configuration and stuff and very rarely used. Some ioctls
729 * pass on to the socket itself.
730 *
731 * NOTE: I like the idea of a module for the config stuff. ie ifconfig
732 * loads the devconfigure module does its configuring and unloads it.
733 * There's a good 20K of config code hanging around the kernel.
734 */
735
736int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
737{
738 struct sock *sk = sock->sk;
739 int err = 0;
740
741 switch (cmd) {
742 case SIOCGSTAMP:
743 err = sock_get_timestamp(sk, (struct timeval __user *)arg);
744 break;
745 case SIOCADDRT:
746 case SIOCDELRT:
747 case SIOCRTMSG:
748 err = ip_rt_ioctl(cmd, (void __user *)arg);
749 break;
750 case SIOCDARP:
751 case SIOCGARP:
752 case SIOCSARP:
753 err = arp_ioctl(cmd, (void __user *)arg);
754 break;
755 case SIOCGIFADDR:
756 case SIOCSIFADDR:
757 case SIOCGIFBRDADDR:
758 case SIOCSIFBRDADDR:
759 case SIOCGIFNETMASK:
760 case SIOCSIFNETMASK:
761 case SIOCGIFDSTADDR:
762 case SIOCSIFDSTADDR:
763 case SIOCSIFPFLAGS:
764 case SIOCGIFPFLAGS:
765 case SIOCSIFFLAGS:
766 err = devinet_ioctl(cmd, (void __user *)arg);
767 break;
768 default:
769 if (!sk->sk_prot->ioctl ||
770 (err = sk->sk_prot->ioctl(sk, cmd, arg)) ==
771 -ENOIOCTLCMD)
772 err = dev_ioctl(cmd, (void __user *)arg);
773 break;
774 }
775 return err;
776}
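
Most of the ioctls dispatched above (SIOCGIFADDR and friends) are the classic interface-configuration calls that ifconfig issues on an ordinary INET socket. A small illustrative sketch, not part of this file; the interface name "eth0" is only an example:

/* Illustrative only: fetch an interface's IPv4 address through the
 * devinet_ioctl() path above. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <net/if.h>

int main(void)
{
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);

        if (ioctl(fd, SIOCGIFADDR, &ifr) == 0) {
                struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
                printf("eth0: %s\n", inet_ntoa(sin->sin_addr));
        }
        close(fd);
        return 0;
}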
777
778struct proto_ops inet_stream_ops = {
779 .family = PF_INET,
780 .owner = THIS_MODULE,
781 .release = inet_release,
782 .bind = inet_bind,
783 .connect = inet_stream_connect,
784 .socketpair = sock_no_socketpair,
785 .accept = inet_accept,
786 .getname = inet_getname,
787 .poll = tcp_poll,
788 .ioctl = inet_ioctl,
789 .listen = inet_listen,
790 .shutdown = inet_shutdown,
791 .setsockopt = sock_common_setsockopt,
792 .getsockopt = sock_common_getsockopt,
793 .sendmsg = inet_sendmsg,
794 .recvmsg = sock_common_recvmsg,
795 .mmap = sock_no_mmap,
796 .sendpage = tcp_sendpage
797};
798
799struct proto_ops inet_dgram_ops = {
800 .family = PF_INET,
801 .owner = THIS_MODULE,
802 .release = inet_release,
803 .bind = inet_bind,
804 .connect = inet_dgram_connect,
805 .socketpair = sock_no_socketpair,
806 .accept = sock_no_accept,
807 .getname = inet_getname,
808 .poll = udp_poll,
809 .ioctl = inet_ioctl,
810 .listen = sock_no_listen,
811 .shutdown = inet_shutdown,
812 .setsockopt = sock_common_setsockopt,
813 .getsockopt = sock_common_getsockopt,
814 .sendmsg = inet_sendmsg,
815 .recvmsg = sock_common_recvmsg,
816 .mmap = sock_no_mmap,
817 .sendpage = inet_sendpage,
818};
819
820/*
821 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
822 * udp_poll
823 */
824static struct proto_ops inet_sockraw_ops = {
825 .family = PF_INET,
826 .owner = THIS_MODULE,
827 .release = inet_release,
828 .bind = inet_bind,
829 .connect = inet_dgram_connect,
830 .socketpair = sock_no_socketpair,
831 .accept = sock_no_accept,
832 .getname = inet_getname,
833 .poll = datagram_poll,
834 .ioctl = inet_ioctl,
835 .listen = sock_no_listen,
836 .shutdown = inet_shutdown,
837 .setsockopt = sock_common_setsockopt,
838 .getsockopt = sock_common_getsockopt,
839 .sendmsg = inet_sendmsg,
840 .recvmsg = sock_common_recvmsg,
841 .mmap = sock_no_mmap,
842 .sendpage = inet_sendpage,
843};
844
845static struct net_proto_family inet_family_ops = {
846 .family = PF_INET,
847 .create = inet_create,
848 .owner = THIS_MODULE,
849};
850
851
852extern void tcp_init(void);
853extern void tcp_v4_init(struct net_proto_family *);
854
855/* Upon startup we insert all the elements in inetsw_array[] into
856 * the linked list inetsw.
857 */
858static struct inet_protosw inetsw_array[] =
859{
860 {
861 .type = SOCK_STREAM,
862 .protocol = IPPROTO_TCP,
863 .prot = &tcp_prot,
864 .ops = &inet_stream_ops,
865 .capability = -1,
866 .no_check = 0,
867 .flags = INET_PROTOSW_PERMANENT,
868 },
869
870 {
871 .type = SOCK_DGRAM,
872 .protocol = IPPROTO_UDP,
873 .prot = &udp_prot,
874 .ops = &inet_dgram_ops,
875 .capability = -1,
876 .no_check = UDP_CSUM_DEFAULT,
877 .flags = INET_PROTOSW_PERMANENT,
878 },
879
880
881 {
882 .type = SOCK_RAW,
883 .protocol = IPPROTO_IP, /* wild card */
884 .prot = &raw_prot,
885 .ops = &inet_sockraw_ops,
886 .capability = CAP_NET_RAW,
887 .no_check = UDP_CSUM_DEFAULT,
888 .flags = INET_PROTOSW_REUSE,
889 }
890};
891
892#define INETSW_ARRAY_LEN (sizeof(inetsw_array) / sizeof(struct inet_protosw))
893
894void inet_register_protosw(struct inet_protosw *p)
895{
896 struct list_head *lh;
897 struct inet_protosw *answer;
898 int protocol = p->protocol;
899 struct list_head *last_perm;
900
901 spin_lock_bh(&inetsw_lock);
902
903 if (p->type >= SOCK_MAX)
904 goto out_illegal;
905
906 /* If we are trying to override a permanent protocol, bail. */
907 answer = NULL;
908 last_perm = &inetsw[p->type];
909 list_for_each(lh, &inetsw[p->type]) {
910 answer = list_entry(lh, struct inet_protosw, list);
911
912 /* Check only the non-wild match. */
913 if (INET_PROTOSW_PERMANENT & answer->flags) {
914 if (protocol == answer->protocol)
915 break;
916 last_perm = lh;
917 }
918
919 answer = NULL;
920 }
921 if (answer)
922 goto out_permanent;
923
924 /* Add the new entry after the last permanent entry if any, so that
925 * the new entry does not override a permanent entry when matched with
926 * a wild-card protocol. But it is allowed to override any existing
927 * non-permanent entry. This means that when we remove this entry, the
928 * system automatically returns to the old behavior.
929 */
930 list_add_rcu(&p->list, last_perm);
931out:
932 spin_unlock_bh(&inetsw_lock);
933
934 synchronize_net();
935
936 return;
937
938out_permanent:
939 printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
940 protocol);
941 goto out;
942
943out_illegal:
944 printk(KERN_ERR
945 "Ignoring attempt to register invalid socket type %d.\n",
946 p->type);
947 goto out;
948}
949
950void inet_unregister_protosw(struct inet_protosw *p)
951{
952 if (INET_PROTOSW_PERMANENT & p->flags) {
953 printk(KERN_ERR
954 "Attempt to unregister permanent protocol %d.\n",
955 p->protocol);
956 } else {
957 spin_lock_bh(&inetsw_lock);
958 list_del_rcu(&p->list);
959 spin_unlock_bh(&inetsw_lock);
960
961 synchronize_net();
962 }
963}
964
965#ifdef CONFIG_IP_MULTICAST
966static struct net_protocol igmp_protocol = {
967 .handler = igmp_rcv,
968};
969#endif
970
971static struct net_protocol tcp_protocol = {
972 .handler = tcp_v4_rcv,
973 .err_handler = tcp_v4_err,
974 .no_policy = 1,
975};
976
977static struct net_protocol udp_protocol = {
978 .handler = udp_rcv,
979 .err_handler = udp_err,
980 .no_policy = 1,
981};
982
983static struct net_protocol icmp_protocol = {
984 .handler = icmp_rcv,
985};
986
987static int __init init_ipv4_mibs(void)
988{
989 net_statistics[0] = alloc_percpu(struct linux_mib);
990 net_statistics[1] = alloc_percpu(struct linux_mib);
991 ip_statistics[0] = alloc_percpu(struct ipstats_mib);
992 ip_statistics[1] = alloc_percpu(struct ipstats_mib);
993 icmp_statistics[0] = alloc_percpu(struct icmp_mib);
994 icmp_statistics[1] = alloc_percpu(struct icmp_mib);
995 tcp_statistics[0] = alloc_percpu(struct tcp_mib);
996 tcp_statistics[1] = alloc_percpu(struct tcp_mib);
997 udp_statistics[0] = alloc_percpu(struct udp_mib);
998 udp_statistics[1] = alloc_percpu(struct udp_mib);
 999 if (!(net_statistics[0] && net_statistics[1] && ip_statistics[0] &&
1000 ip_statistics[1] && icmp_statistics[0] && icmp_statistics[1] &&
1001 tcp_statistics[0] && tcp_statistics[1] && udp_statistics[0] &&
1002 udp_statistics[1]))
1003 return -ENOMEM;
1004
1005 (void) tcp_mib_init();
1006
1007 return 0;
1008}
1009
1010static int ipv4_proc_init(void);
1011extern void ipfrag_init(void);
1012
1013static int __init inet_init(void)
1014{
1015 struct sk_buff *dummy_skb;
1016 struct inet_protosw *q;
1017 struct list_head *r;
1018 int rc = -EINVAL;
1019
1020 if (sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)) {
1021 printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
1022 goto out;
1023 }
1024
1025 rc = proto_register(&tcp_prot, 1);
1026 if (rc)
1027 goto out;
1028
1029 rc = proto_register(&udp_prot, 1);
1030 if (rc)
1031 goto out_unregister_tcp_proto;
1032
1033 rc = proto_register(&raw_prot, 1);
1034 if (rc)
1035 goto out_unregister_udp_proto;
1036
1037 /*
1038 * Tell SOCKET that we are alive...
1039 */
1040
1041 (void)sock_register(&inet_family_ops);
1042
1043 /*
1044 * Add all the base protocols.
1045 */
1046
1047 if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
1048 printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
1049 if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
1050 printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
1051 if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
1052 printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
1053#ifdef CONFIG_IP_MULTICAST
1054 if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
1055 printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
1056#endif
1057
1058 /* Register the socket-side information for inet_create. */
1059 for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
1060 INIT_LIST_HEAD(r);
1061
1062 for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
1063 inet_register_protosw(q);
1064
1065 /*
1066 * Set the ARP module up
1067 */
1068
1069 arp_init();
1070
1071 /*
1072 * Set the IP module up
1073 */
1074
1075 ip_init();
1076
1077 tcp_v4_init(&inet_family_ops);
1078
1079 /* Setup TCP slab cache for open requests. */
1080 tcp_init();
1081
1082
1083 /*
1084 * Set the ICMP layer up
1085 */
1086
1087 icmp_init(&inet_family_ops);
1088
1089 /*
1090 * Initialise the multicast router
1091 */
1092#if defined(CONFIG_IP_MROUTE)
1093 ip_mr_init();
1094#endif
1095 /*
1096 * Initialise per-cpu ipv4 mibs
1097 */
1098
1099 if (init_ipv4_mibs())
1100 printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n");
1101
1102 ipv4_proc_init();
1103
1104 ipfrag_init();
1105
1106 rc = 0;
1107out:
1108 return rc;
1109out_unregister_tcp_proto:
1110 proto_unregister(&tcp_prot);
1111out_unregister_udp_proto:
1112 proto_unregister(&udp_prot);
1113 goto out;
1114}
1115
1116module_init(inet_init);
1117
1118/* ------------------------------------------------------------------------ */
1119
1120#ifdef CONFIG_PROC_FS
1121extern int fib_proc_init(void);
1122extern void fib_proc_exit(void);
1123extern int ip_misc_proc_init(void);
1124extern int raw_proc_init(void);
1125extern void raw_proc_exit(void);
1126extern int tcp4_proc_init(void);
1127extern void tcp4_proc_exit(void);
1128extern int udp4_proc_init(void);
1129extern void udp4_proc_exit(void);
1130
1131static int __init ipv4_proc_init(void)
1132{
1133 int rc = 0;
1134
1135 if (raw_proc_init())
1136 goto out_raw;
1137 if (tcp4_proc_init())
1138 goto out_tcp;
1139 if (udp4_proc_init())
1140 goto out_udp;
1141 if (fib_proc_init())
1142 goto out_fib;
1143 if (ip_misc_proc_init())
1144 goto out_misc;
1145out:
1146 return rc;
1147out_misc:
1148 fib_proc_exit();
1149out_fib:
1150 udp4_proc_exit();
1151out_udp:
1152 tcp4_proc_exit();
1153out_tcp:
1154 raw_proc_exit();
1155out_raw:
1156 rc = -ENOMEM;
1157 goto out;
1158}
1159
1160#else /* CONFIG_PROC_FS */
1161static int __init ipv4_proc_init(void)
1162{
1163 return 0;
1164}
1165#endif /* CONFIG_PROC_FS */
1166
1167MODULE_ALIAS_NETPROTO(PF_INET);
1168
1169EXPORT_SYMBOL(inet_accept);
1170EXPORT_SYMBOL(inet_bind);
1171EXPORT_SYMBOL(inet_dgram_connect);
1172EXPORT_SYMBOL(inet_dgram_ops);
1173EXPORT_SYMBOL(inet_getname);
1174EXPORT_SYMBOL(inet_ioctl);
1175EXPORT_SYMBOL(inet_listen);
1176EXPORT_SYMBOL(inet_register_protosw);
1177EXPORT_SYMBOL(inet_release);
1178EXPORT_SYMBOL(inet_sendmsg);
1179EXPORT_SYMBOL(inet_shutdown);
1180EXPORT_SYMBOL(inet_sock_destruct);
1181EXPORT_SYMBOL(inet_stream_connect);
1182EXPORT_SYMBOL(inet_stream_ops);
1183EXPORT_SYMBOL(inet_unregister_protosw);
1184EXPORT_SYMBOL(net_statistics);
1185
1186#ifdef INET_REFCNT_DEBUG
1187EXPORT_SYMBOL(inet_sock_nr);
1188#endif
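
The protocol registration sequence in inet_init() above is also the pattern an out-of-tree transport would follow: fill in a struct net_protocol, point .handler at the receive routine, and pass it to inet_add_protocol() with the IP protocol number it should own. A minimal sketch under those assumptions (foo_rcv, foo_protocol and the experimental protocol number 253 are placeholders, not part of this tree):

static int foo_rcv(struct sk_buff *skb)
{
	/* A real handler would parse the payload; this sketch just drops it. */
	kfree_skb(skb);
	return 0;
}

static struct net_protocol foo_protocol = {
	.handler	= foo_rcv,
	.no_policy	= 1,
};

static int __init foo_init(void)
{
	/* inet_add_protocol() fails if another handler already owns the slot. */
	if (inet_add_protocol(&foo_protocol, 253) < 0)
		return -EAGAIN;
	return 0;
}
module_init(foo_init);
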
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
new file mode 100644
index 000000000000..0e98f2235b6e
--- /dev/null
+++ b/net/ipv4/ah4.c
@@ -0,0 +1,335 @@
1#include <linux/config.h>
2#include <linux/module.h>
3#include <net/ip.h>
4#include <net/xfrm.h>
5#include <net/ah.h>
6#include <linux/crypto.h>
7#include <linux/pfkeyv2.h>
8#include <net/icmp.h>
9#include <asm/scatterlist.h>
10
11
12/* Clear mutable options and find final destination to substitute
13 * into IP header for icv calculation. Options are already checked
14 * for validity, so paranoia is not required. */
15
16static int ip_clear_mutable_options(struct iphdr *iph, u32 *daddr)
17{
18 unsigned char * optptr = (unsigned char*)(iph+1);
19 int l = iph->ihl*4 - sizeof(struct iphdr);
20 int optlen;
21
22 while (l > 0) {
23 switch (*optptr) {
24 case IPOPT_END:
25 return 0;
26 case IPOPT_NOOP:
27 l--;
28 optptr++;
29 continue;
30 }
31 optlen = optptr[1];
32 if (optlen<2 || optlen>l)
33 return -EINVAL;
34 switch (*optptr) {
35 case IPOPT_SEC:
36 case 0x85: /* Some "Extended Security" crap. */
37 case 0x86: /* Another "Commercial Security" crap. */
38 case IPOPT_RA:
39 case 0x80|21: /* RFC1770 */
40 break;
41 case IPOPT_LSRR:
42 case IPOPT_SSRR:
43 if (optlen < 6)
44 return -EINVAL;
45 memcpy(daddr, optptr+optlen-4, 4);
46 /* Fall through */
47 default:
48 memset(optptr+2, 0, optlen-2);
49 }
50 l -= optlen;
51 optptr += optlen;
52 }
53 return 0;
54}
55
56static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
57{
58 int err;
59 struct iphdr *iph, *top_iph;
60 struct ip_auth_hdr *ah;
61 struct ah_data *ahp;
62 union {
63 struct iphdr iph;
64 char buf[60];
65 } tmp_iph;
66
67 top_iph = skb->nh.iph;
68 iph = &tmp_iph.iph;
69
70 iph->tos = top_iph->tos;
71 iph->ttl = top_iph->ttl;
72 iph->frag_off = top_iph->frag_off;
73
74 if (top_iph->ihl != 5) {
75 iph->daddr = top_iph->daddr;
76 memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
77 err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
78 if (err)
79 goto error;
80 }
81
82 ah = (struct ip_auth_hdr *)((char *)top_iph+top_iph->ihl*4);
83 ah->nexthdr = top_iph->protocol;
84
85 top_iph->tos = 0;
86 top_iph->tot_len = htons(skb->len);
87 top_iph->frag_off = 0;
88 top_iph->ttl = 0;
89 top_iph->protocol = IPPROTO_AH;
90 top_iph->check = 0;
91
92 ahp = x->data;
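	/* The AH "payload length" field is the header size in 32-bit words
	 * minus 2 (RFC 2402).  For example, assuming HMAC-SHA1-96 (12-byte
	 * base header plus a 12-byte truncated ICV, already 8-byte aligned):
	 * (24 >> 2) - 2 == 4.
	 */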
93 ah->hdrlen = (XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
94 ahp->icv_trunc_len) >> 2) - 2;
95
96 ah->reserved = 0;
97 ah->spi = x->id.spi;
98 ah->seq_no = htonl(++x->replay.oseq);
99 ahp->icv(ahp, skb, ah->auth_data);
100
101 top_iph->tos = iph->tos;
102 top_iph->ttl = iph->ttl;
103 top_iph->frag_off = iph->frag_off;
104 if (top_iph->ihl != 5) {
105 top_iph->daddr = iph->daddr;
106 memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
107 }
108
109 ip_send_check(top_iph);
110
111 err = 0;
112
113error:
114 return err;
115}
116
117static int ah_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
118{
119 int ah_hlen;
120 struct iphdr *iph;
121 struct ip_auth_hdr *ah;
122 struct ah_data *ahp;
123 char work_buf[60];
124
125 if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr)))
126 goto out;
127
128 ah = (struct ip_auth_hdr*)skb->data;
129 ahp = x->data;
130 ah_hlen = (ah->hdrlen + 2) << 2;
131
132 if (ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_full_len) &&
133 ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len))
134 goto out;
135
136 if (!pskb_may_pull(skb, ah_hlen))
137 goto out;
138
 139	/* We are going to _remove_ the AH header to keep sockets happy,
 140	 * so... Later this can change. */
141 if (skb_cloned(skb) &&
142 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
143 goto out;
144
145 skb->ip_summed = CHECKSUM_NONE;
146
147 ah = (struct ip_auth_hdr*)skb->data;
148 iph = skb->nh.iph;
149
150 memcpy(work_buf, iph, iph->ihl*4);
151
152 iph->ttl = 0;
153 iph->tos = 0;
154 iph->frag_off = 0;
155 iph->check = 0;
156 if (iph->ihl != 5) {
157 u32 dummy;
158 if (ip_clear_mutable_options(iph, &dummy))
159 goto out;
160 }
161 {
162 u8 auth_data[MAX_AH_AUTH_LEN];
163
164 memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
165 skb_push(skb, skb->data - skb->nh.raw);
166 ahp->icv(ahp, skb, ah->auth_data);
167 if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) {
168 x->stats.integrity_failed++;
169 goto out;
170 }
171 }
172 ((struct iphdr*)work_buf)->protocol = ah->nexthdr;
173 skb->nh.raw = skb_pull(skb, ah_hlen);
174 memcpy(skb->nh.raw, work_buf, iph->ihl*4);
175 skb->nh.iph->tot_len = htons(skb->len);
176 skb_pull(skb, skb->nh.iph->ihl*4);
177 skb->h.raw = skb->data;
178
179 return 0;
180
181out:
182 return -EINVAL;
183}
184
185static void ah4_err(struct sk_buff *skb, u32 info)
186{
187 struct iphdr *iph = (struct iphdr*)skb->data;
188 struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+(iph->ihl<<2));
189 struct xfrm_state *x;
190
191 if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
192 skb->h.icmph->code != ICMP_FRAG_NEEDED)
193 return;
194
195 x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
196 if (!x)
197 return;
198 printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
199 ntohl(ah->spi), ntohl(iph->daddr));
200 xfrm_state_put(x);
201}
202
203static int ah_init_state(struct xfrm_state *x, void *args)
204{
205 struct ah_data *ahp = NULL;
206 struct xfrm_algo_desc *aalg_desc;
207
208 if (!x->aalg)
209 goto error;
210
211 /* null auth can use a zero length key */
212 if (x->aalg->alg_key_len > 512)
213 goto error;
214
215 if (x->encap)
216 goto error;
217
218 ahp = kmalloc(sizeof(*ahp), GFP_KERNEL);
219 if (ahp == NULL)
220 return -ENOMEM;
221
222 memset(ahp, 0, sizeof(*ahp));
223
224 ahp->key = x->aalg->alg_key;
225 ahp->key_len = (x->aalg->alg_key_len+7)/8;
226 ahp->tfm = crypto_alloc_tfm(x->aalg->alg_name, 0);
227 if (!ahp->tfm)
228 goto error;
229 ahp->icv = ah_hmac_digest;
230
231 /*
232 * Lookup the algorithm description maintained by xfrm_algo,
233 * verify crypto transform properties, and store information
234 * we need for AH processing. This lookup cannot fail here
235 * after a successful crypto_alloc_tfm().
236 */
237 aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
238 BUG_ON(!aalg_desc);
239
240 if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
241 crypto_tfm_alg_digestsize(ahp->tfm)) {
242 printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
243 x->aalg->alg_name, crypto_tfm_alg_digestsize(ahp->tfm),
244 aalg_desc->uinfo.auth.icv_fullbits/8);
245 goto error;
246 }
247
248 ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
249 ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
250
251 BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
252
253 ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL);
254 if (!ahp->work_icv)
255 goto error;
256
257 x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len);
258 if (x->props.mode)
259 x->props.header_len += sizeof(struct iphdr);
260 x->data = ahp;
261
262 return 0;
263
264error:
265 if (ahp) {
266 if (ahp->work_icv)
267 kfree(ahp->work_icv);
268 if (ahp->tfm)
269 crypto_free_tfm(ahp->tfm);
270 kfree(ahp);
271 }
272 return -EINVAL;
273}
274
275static void ah_destroy(struct xfrm_state *x)
276{
277 struct ah_data *ahp = x->data;
278
279 if (!ahp)
280 return;
281
282 if (ahp->work_icv) {
283 kfree(ahp->work_icv);
284 ahp->work_icv = NULL;
285 }
286 if (ahp->tfm) {
287 crypto_free_tfm(ahp->tfm);
288 ahp->tfm = NULL;
289 }
290 kfree(ahp);
291}
292
293
294static struct xfrm_type ah_type =
295{
296 .description = "AH4",
297 .owner = THIS_MODULE,
298 .proto = IPPROTO_AH,
299 .init_state = ah_init_state,
300 .destructor = ah_destroy,
301 .input = ah_input,
302 .output = ah_output
303};
304
305static struct net_protocol ah4_protocol = {
306 .handler = xfrm4_rcv,
307 .err_handler = ah4_err,
308 .no_policy = 1,
309};
310
311static int __init ah4_init(void)
312{
313 if (xfrm_register_type(&ah_type, AF_INET) < 0) {
314 printk(KERN_INFO "ip ah init: can't add xfrm type\n");
315 return -EAGAIN;
316 }
317 if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) {
318 printk(KERN_INFO "ip ah init: can't add protocol\n");
319 xfrm_unregister_type(&ah_type, AF_INET);
320 return -EAGAIN;
321 }
322 return 0;
323}
324
325static void __exit ah4_fini(void)
326{
327 if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0)
328 printk(KERN_INFO "ip ah close: can't remove protocol\n");
329 if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
330 printk(KERN_INFO "ip ah close: can't remove xfrm type\n");
331}
332
333module_init(ah4_init);
334module_exit(ah4_fini);
335MODULE_LICENSE("GPL");
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
new file mode 100644
index 000000000000..a642fd612853
--- /dev/null
+++ b/net/ipv4/arp.c
@@ -0,0 +1,1425 @@
1/* linux/net/inet/arp.c
2 *
3 * Version: $Id: arp.c,v 1.99 2001/08/30 22:55:42 davem Exp $
4 *
5 * Copyright (C) 1994 by Florian La Roche
6 *
7 * This module implements the Address Resolution Protocol ARP (RFC 826),
8 * which is used to convert IP addresses (or in the future maybe other
9 * high-level addresses) into a low-level hardware address (like an Ethernet
10 * address).
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * Fixes:
18 * Alan Cox : Removed the Ethernet assumptions in
19 * Florian's code
20 * Alan Cox : Fixed some small errors in the ARP
21 * logic
22 * Alan Cox : Allow >4K in /proc
23 * Alan Cox : Make ARP add its own protocol entry
24 * Ross Martin : Rewrote arp_rcv() and arp_get_info()
25 * Stephen Henson : Add AX25 support to arp_get_info()
26 * Alan Cox : Drop data when a device is downed.
27 * Alan Cox : Use init_timer().
28 * Alan Cox : Double lock fixes.
29 * Martin Seine : Move the arphdr structure
 30 *		to if_arp.h for compatibility
 31 *		with BSD based programs.
32 * Andrew Tridgell : Added ARP netmask code and
33 * re-arranged proxy handling.
34 * Alan Cox : Changed to use notifiers.
35 * Niibe Yutaka : Reply for this device or proxies only.
36 * Alan Cox : Don't proxy across hardware types!
37 * Jonathan Naylor : Added support for NET/ROM.
38 * Mike Shaver : RFC1122 checks.
39 * Jonathan Naylor : Only lookup the hardware address for
40 * the correct hardware type.
41 * Germano Caronni : Assorted subtle races.
42 * Craig Schlenter : Don't modify permanent entry
43 * during arp_rcv.
44 * Russ Nelson : Tidied up a few bits.
45 * Alexey Kuznetsov: Major changes to caching and behaviour,
46 * eg intelligent arp probing and
47 * generation
48 * of host down events.
49 * Alan Cox : Missing unlock in device events.
50 * Eckes : ARP ioctl control errors.
51 * Alexey Kuznetsov: Arp free fix.
52 * Manuel Rodriguez: Gratuitous ARP.
53 * Jonathan Layes : Added arpd support through kerneld
54 * message queue (960314)
55 * Mike Shaver : /proc/sys/net/ipv4/arp_* support
56 * Mike McLagan : Routing by source
57 * Stuart Cheshire : Metricom and grat arp fixes
58 * *** FOR 2.1 clean this up ***
59 * Lawrence V. Stefani: (08/12/96) Added FDDI support.
60 * Alan Cox : Took the AP1000 nasty FDDI hack and
61 * folded into the mainstream FDDI code.
62 * Ack spit, Linus how did you allow that
63 * one in...
64 * Jes Sorensen : Make FDDI work again in 2.1.x and
65 * clean up the APFDDI & gen. FDDI bits.
66 * Alexey Kuznetsov: new arp state machine;
67 * now it is in net/core/neighbour.c.
68 * Krzysztof Halasa: Added Frame Relay ARP support.
69 * Arnaldo C. Melo : convert /proc/net/arp to seq_file
70 * Shmulik Hen: Split arp_send to arp_create and
71 * arp_xmit so intermediate drivers like
72 * bonding can change the skb before
73 * sending (e.g. insert 8021q tag).
74 * Harald Welte : convert to make use of jenkins hash
75 */
76
77#include <linux/module.h>
78#include <linux/types.h>
79#include <linux/string.h>
80#include <linux/kernel.h>
81#include <linux/sched.h>
82#include <linux/config.h>
83#include <linux/socket.h>
84#include <linux/sockios.h>
85#include <linux/errno.h>
86#include <linux/in.h>
87#include <linux/mm.h>
88#include <linux/inet.h>
89#include <linux/netdevice.h>
90#include <linux/etherdevice.h>
91#include <linux/fddidevice.h>
92#include <linux/if_arp.h>
93#include <linux/trdevice.h>
94#include <linux/skbuff.h>
95#include <linux/proc_fs.h>
96#include <linux/seq_file.h>
97#include <linux/stat.h>
98#include <linux/init.h>
99#include <linux/net.h>
100#include <linux/rcupdate.h>
101#include <linux/jhash.h>
102#ifdef CONFIG_SYSCTL
103#include <linux/sysctl.h>
104#endif
105
106#include <net/ip.h>
107#include <net/icmp.h>
108#include <net/route.h>
109#include <net/protocol.h>
110#include <net/tcp.h>
111#include <net/sock.h>
112#include <net/arp.h>
113#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
114#include <net/ax25.h>
115#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
116#include <net/netrom.h>
117#endif
118#endif
119#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
120#include <net/atmclip.h>
121struct neigh_table *clip_tbl_hook;
122#endif
123
124#include <asm/system.h>
125#include <asm/uaccess.h>
126
127#include <linux/netfilter_arp.h>
128
129/*
130 * Interface to generic neighbour cache.
131 */
132static u32 arp_hash(const void *pkey, const struct net_device *dev);
133static int arp_constructor(struct neighbour *neigh);
134static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
135static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
136static void parp_redo(struct sk_buff *skb);
137
138static struct neigh_ops arp_generic_ops = {
139 .family = AF_INET,
140 .solicit = arp_solicit,
141 .error_report = arp_error_report,
142 .output = neigh_resolve_output,
143 .connected_output = neigh_connected_output,
144 .hh_output = dev_queue_xmit,
145 .queue_xmit = dev_queue_xmit,
146};
147
148static struct neigh_ops arp_hh_ops = {
149 .family = AF_INET,
150 .solicit = arp_solicit,
151 .error_report = arp_error_report,
152 .output = neigh_resolve_output,
153 .connected_output = neigh_resolve_output,
154 .hh_output = dev_queue_xmit,
155 .queue_xmit = dev_queue_xmit,
156};
157
158static struct neigh_ops arp_direct_ops = {
159 .family = AF_INET,
160 .output = dev_queue_xmit,
161 .connected_output = dev_queue_xmit,
162 .hh_output = dev_queue_xmit,
163 .queue_xmit = dev_queue_xmit,
164};
165
166struct neigh_ops arp_broken_ops = {
167 .family = AF_INET,
168 .solicit = arp_solicit,
169 .error_report = arp_error_report,
170 .output = neigh_compat_output,
171 .connected_output = neigh_compat_output,
172 .hh_output = dev_queue_xmit,
173 .queue_xmit = dev_queue_xmit,
174};
175
176struct neigh_table arp_tbl = {
177 .family = AF_INET,
178 .entry_size = sizeof(struct neighbour) + 4,
179 .key_len = 4,
180 .hash = arp_hash,
181 .constructor = arp_constructor,
182 .proxy_redo = parp_redo,
183 .id = "arp_cache",
184 .parms = {
185 .tbl = &arp_tbl,
186 .base_reachable_time = 30 * HZ,
187 .retrans_time = 1 * HZ,
188 .gc_staletime = 60 * HZ,
189 .reachable_time = 30 * HZ,
190 .delay_probe_time = 5 * HZ,
191 .queue_len = 3,
192 .ucast_probes = 3,
193 .mcast_probes = 3,
194 .anycast_delay = 1 * HZ,
195 .proxy_delay = (8 * HZ) / 10,
196 .proxy_qlen = 64,
197 .locktime = 1 * HZ,
198 },
199 .gc_interval = 30 * HZ,
200 .gc_thresh1 = 128,
201 .gc_thresh2 = 512,
202 .gc_thresh3 = 1024,
203};
204
205int arp_mc_map(u32 addr, u8 *haddr, struct net_device *dev, int dir)
206{
207 switch (dev->type) {
208 case ARPHRD_ETHER:
209 case ARPHRD_FDDI:
210 case ARPHRD_IEEE802:
211 ip_eth_mc_map(addr, haddr);
212 return 0;
213 case ARPHRD_IEEE802_TR:
214 ip_tr_mc_map(addr, haddr);
215 return 0;
216 case ARPHRD_INFINIBAND:
217 ip_ib_mc_map(addr, haddr);
218 return 0;
219 default:
220 if (dir) {
221 memcpy(haddr, dev->broadcast, dev->addr_len);
222 return 0;
223 }
224 }
225 return -EINVAL;
226}
227
228
229static u32 arp_hash(const void *pkey, const struct net_device *dev)
230{
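	/* Mix the 32-bit IPv4 key with the interface index so the same
	 * address seen through different devices lands in different hash
	 * buckets; hash_rnd randomizes the layout per table. */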
231 return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd);
232}
233
234static int arp_constructor(struct neighbour *neigh)
235{
236 u32 addr = *(u32*)neigh->primary_key;
237 struct net_device *dev = neigh->dev;
238 struct in_device *in_dev;
239 struct neigh_parms *parms;
240
241 neigh->type = inet_addr_type(addr);
242
243 rcu_read_lock();
244 in_dev = rcu_dereference(__in_dev_get(dev));
245 if (in_dev == NULL) {
246 rcu_read_unlock();
247 return -EINVAL;
248 }
249
250 parms = in_dev->arp_parms;
251 __neigh_parms_put(neigh->parms);
252 neigh->parms = neigh_parms_clone(parms);
253 rcu_read_unlock();
254
255 if (dev->hard_header == NULL) {
256 neigh->nud_state = NUD_NOARP;
257 neigh->ops = &arp_direct_ops;
258 neigh->output = neigh->ops->queue_xmit;
259 } else {
260 /* Good devices (checked by reading texts, but only Ethernet is
261 tested)
262
263 ARPHRD_ETHER: (ethernet, apfddi)
264 ARPHRD_FDDI: (fddi)
265 ARPHRD_IEEE802: (tr)
266 ARPHRD_METRICOM: (strip)
267 ARPHRD_ARCNET:
268 etc. etc. etc.
269
270 ARPHRD_IPDDP will also work, if author repairs it.
 271	   I did not fix it, because this driver does not work even
 272	   in the old paradigm.
273 */
274
275#if 1
276 /* So... these "amateur" devices are hopeless.
 277	   The only thing that I can say now is:
 278	   it is very sad that we need to keep ugly obsolete
 279	   code to make them happy.
 280
 281	   They should be moved to a more reasonable state; right now
 282	   they use rebuild_header INSTEAD OF hard_start_xmit!!!
 283	   Besides that, they are sort of out of date
 284	   (a lot of redundant clones/copies, useless in 2.1),
 285	   and I wonder why people believe that they work.
286 */
287 switch (dev->type) {
288 default:
289 break;
290 case ARPHRD_ROSE:
291#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
292 case ARPHRD_AX25:
293#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
294 case ARPHRD_NETROM:
295#endif
296 neigh->ops = &arp_broken_ops;
297 neigh->output = neigh->ops->output;
298 return 0;
299#endif
300 ;}
301#endif
302 if (neigh->type == RTN_MULTICAST) {
303 neigh->nud_state = NUD_NOARP;
304 arp_mc_map(addr, neigh->ha, dev, 1);
305 } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
306 neigh->nud_state = NUD_NOARP;
307 memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
308 } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) {
309 neigh->nud_state = NUD_NOARP;
310 memcpy(neigh->ha, dev->broadcast, dev->addr_len);
311 }
312 if (dev->hard_header_cache)
313 neigh->ops = &arp_hh_ops;
314 else
315 neigh->ops = &arp_generic_ops;
316 if (neigh->nud_state&NUD_VALID)
317 neigh->output = neigh->ops->connected_output;
318 else
319 neigh->output = neigh->ops->output;
320 }
321 return 0;
322}
323
324static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
325{
326 dst_link_failure(skb);
327 kfree_skb(skb);
328}
329
330static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
331{
332 u32 saddr = 0;
333 u8 *dst_ha = NULL;
334 struct net_device *dev = neigh->dev;
335 u32 target = *(u32*)neigh->primary_key;
336 int probes = atomic_read(&neigh->probes);
337 struct in_device *in_dev = in_dev_get(dev);
338
339 if (!in_dev)
340 return;
341
342 switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
343 default:
344 case 0: /* By default announce any local IP */
345 if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL)
346 saddr = skb->nh.iph->saddr;
347 break;
348 case 1: /* Restrict announcements of saddr in same subnet */
349 if (!skb)
350 break;
351 saddr = skb->nh.iph->saddr;
352 if (inet_addr_type(saddr) == RTN_LOCAL) {
353 /* saddr should be known to target */
354 if (inet_addr_onlink(in_dev, target, saddr))
355 break;
356 }
357 saddr = 0;
358 break;
359 case 2: /* Avoid secondary IPs, get a primary/preferred one */
360 break;
361 }
362
363 if (in_dev)
364 in_dev_put(in_dev);
365 if (!saddr)
366 saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
367
368 if ((probes -= neigh->parms->ucast_probes) < 0) {
369 if (!(neigh->nud_state&NUD_VALID))
370 printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
371 dst_ha = neigh->ha;
372 read_lock_bh(&neigh->lock);
373 } else if ((probes -= neigh->parms->app_probes) < 0) {
374#ifdef CONFIG_ARPD
375 neigh_app_ns(neigh);
376#endif
377 return;
378 }
379
380 arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
381 dst_ha, dev->dev_addr, NULL);
382 if (dst_ha)
383 read_unlock_bh(&neigh->lock);
384}
385
386static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
387 u32 sip, u32 tip)
388{
389 int scope;
390
391 switch (IN_DEV_ARP_IGNORE(in_dev)) {
392 case 0: /* Reply, the tip is already validated */
393 return 0;
394 case 1: /* Reply only if tip is configured on the incoming interface */
395 sip = 0;
396 scope = RT_SCOPE_HOST;
397 break;
398 case 2: /*
399 * Reply only if tip is configured on the incoming interface
400 * and is in same subnet as sip
401 */
402 scope = RT_SCOPE_HOST;
403 break;
404 case 3: /* Do not reply for scope host addresses */
405 sip = 0;
406 scope = RT_SCOPE_LINK;
407 dev = NULL;
408 break;
409 case 4: /* Reserved */
410 case 5:
411 case 6:
412 case 7:
413 return 0;
414 case 8: /* Do not reply */
415 return 1;
416 default:
417 return 0;
418 }
419 return !inet_confirm_addr(dev, sip, tip, scope);
420}
421
422static int arp_filter(__u32 sip, __u32 tip, struct net_device *dev)
423{
424 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip,
425 .saddr = tip } } };
426 struct rtable *rt;
427 int flag = 0;
428 /*unsigned long now; */
429
430 if (ip_route_output_key(&rt, &fl) < 0)
431 return 1;
432 if (rt->u.dst.dev != dev) {
433 NET_INC_STATS_BH(LINUX_MIB_ARPFILTER);
434 flag = 1;
435 }
436 ip_rt_put(rt);
437 return flag;
438}
439
440/* OBSOLETE FUNCTIONS */
441
442/*
443 * Find an arp mapping in the cache. If not found, post a request.
444 *
 445 * This is a very UGLY routine: it DOES NOT use skb->dst->neighbour,
 446 * even if it exists. It is assumed that skb->dev was mangled
 447 * by a virtual device (eql, shaper). Nobody but broken devices
 448 * is allowed to use this function; it is scheduled to be removed. --ANK
449 */
450
451static int arp_set_predefined(int addr_hint, unsigned char * haddr, u32 paddr, struct net_device * dev)
452{
453 switch (addr_hint) {
454 case RTN_LOCAL:
455 printk(KERN_DEBUG "ARP: arp called for own IP address\n");
456 memcpy(haddr, dev->dev_addr, dev->addr_len);
457 return 1;
458 case RTN_MULTICAST:
459 arp_mc_map(paddr, haddr, dev, 1);
460 return 1;
461 case RTN_BROADCAST:
462 memcpy(haddr, dev->broadcast, dev->addr_len);
463 return 1;
464 }
465 return 0;
466}
467
468
469int arp_find(unsigned char *haddr, struct sk_buff *skb)
470{
471 struct net_device *dev = skb->dev;
472 u32 paddr;
473 struct neighbour *n;
474
475 if (!skb->dst) {
476 printk(KERN_DEBUG "arp_find is called with dst==NULL\n");
477 kfree_skb(skb);
478 return 1;
479 }
480
481 paddr = ((struct rtable*)skb->dst)->rt_gateway;
482
483 if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev))
484 return 0;
485
486 n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
487
488 if (n) {
489 n->used = jiffies;
490 if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
491 read_lock_bh(&n->lock);
492 memcpy(haddr, n->ha, dev->addr_len);
493 read_unlock_bh(&n->lock);
494 neigh_release(n);
495 return 0;
496 }
497 neigh_release(n);
498 } else
499 kfree_skb(skb);
500 return 1;
501}
502
503/* END OF OBSOLETE FUNCTIONS */
504
505int arp_bind_neighbour(struct dst_entry *dst)
506{
507 struct net_device *dev = dst->dev;
508 struct neighbour *n = dst->neighbour;
509
510 if (dev == NULL)
511 return -EINVAL;
512 if (n == NULL) {
513 u32 nexthop = ((struct rtable*)dst)->rt_gateway;
514 if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
515 nexthop = 0;
516 n = __neigh_lookup_errno(
517#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
518 dev->type == ARPHRD_ATM ? clip_tbl_hook :
519#endif
520 &arp_tbl, &nexthop, dev);
521 if (IS_ERR(n))
522 return PTR_ERR(n);
523 dst->neighbour = n;
524 }
525 return 0;
526}
527
528/*
529 * Check if we can use proxy ARP for this path
530 */
531
532static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt)
533{
534 struct in_device *out_dev;
535 int imi, omi = -1;
536
537 if (!IN_DEV_PROXY_ARP(in_dev))
538 return 0;
539
540 if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0)
541 return 1;
542 if (imi == -1)
543 return 0;
544
545 /* place to check for proxy_arp for routes */
546
547 if ((out_dev = in_dev_get(rt->u.dst.dev)) != NULL) {
548 omi = IN_DEV_MEDIUM_ID(out_dev);
549 in_dev_put(out_dev);
550 }
551 return (omi != imi && omi != -1);
552}
553
554/*
555 * Interface to link layer: send routine and receive handler.
556 */
557
558/*
559 * Create an arp packet. If (dest_hw == NULL), we create a broadcast
560 * message.
561 */
562struct sk_buff *arp_create(int type, int ptype, u32 dest_ip,
563 struct net_device *dev, u32 src_ip,
564 unsigned char *dest_hw, unsigned char *src_hw,
565 unsigned char *target_hw)
566{
567 struct sk_buff *skb;
568 struct arphdr *arp;
569 unsigned char *arp_ptr;
570
571 /*
572 * Allocate a buffer
573 */
574
575 skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4)
576 + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
577 if (skb == NULL)
578 return NULL;
579
580 skb_reserve(skb, LL_RESERVED_SPACE(dev));
581 skb->nh.raw = skb->data;
582 arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4));
583 skb->dev = dev;
584 skb->protocol = htons(ETH_P_ARP);
585 if (src_hw == NULL)
586 src_hw = dev->dev_addr;
587 if (dest_hw == NULL)
588 dest_hw = dev->broadcast;
589
590 /*
591 * Fill the device header for the ARP frame
592 */
593 if (dev->hard_header &&
594 dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len) < 0)
595 goto out;
596
597 /*
598 * Fill out the arp protocol part.
599 *
600 * The arp hardware type should match the device type, except for FDDI,
601 * which (according to RFC 1390) should always equal 1 (Ethernet).
602 */
603 /*
604 * Exceptions everywhere. AX.25 uses the AX.25 PID value not the
605 * DIX code for the protocol. Make these device structure fields.
606 */
607 switch (dev->type) {
608 default:
609 arp->ar_hrd = htons(dev->type);
610 arp->ar_pro = htons(ETH_P_IP);
611 break;
612
613#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
614 case ARPHRD_AX25:
615 arp->ar_hrd = htons(ARPHRD_AX25);
616 arp->ar_pro = htons(AX25_P_IP);
617 break;
618
619#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
620 case ARPHRD_NETROM:
621 arp->ar_hrd = htons(ARPHRD_NETROM);
622 arp->ar_pro = htons(AX25_P_IP);
623 break;
624#endif
625#endif
626
627#ifdef CONFIG_FDDI
628 case ARPHRD_FDDI:
629 arp->ar_hrd = htons(ARPHRD_ETHER);
630 arp->ar_pro = htons(ETH_P_IP);
631 break;
632#endif
633#ifdef CONFIG_TR
634 case ARPHRD_IEEE802_TR:
635 arp->ar_hrd = htons(ARPHRD_IEEE802);
636 arp->ar_pro = htons(ETH_P_IP);
637 break;
638#endif
639 }
640
641 arp->ar_hln = dev->addr_len;
642 arp->ar_pln = 4;
643 arp->ar_op = htons(type);
644
645 arp_ptr=(unsigned char *)(arp+1);
646
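	/* The variable-length body filled in below is: sender hw address,
	 * sender IP, target hw address, target IP.  For Ethernet
	 * (addr_len == 6) that is 6+4+6+4 = 20 bytes after the 8-byte
	 * arphdr, i.e. a 28-byte ARP message. */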
647 memcpy(arp_ptr, src_hw, dev->addr_len);
648 arp_ptr+=dev->addr_len;
649 memcpy(arp_ptr, &src_ip,4);
650 arp_ptr+=4;
651 if (target_hw != NULL)
652 memcpy(arp_ptr, target_hw, dev->addr_len);
653 else
654 memset(arp_ptr, 0, dev->addr_len);
655 arp_ptr+=dev->addr_len;
656 memcpy(arp_ptr, &dest_ip, 4);
657
658 return skb;
659
660out:
661 kfree_skb(skb);
662 return NULL;
663}
664
665/*
666 * Send an arp packet.
667 */
668void arp_xmit(struct sk_buff *skb)
669{
670 /* Send it off, maybe filter it using firewalling first. */
671 NF_HOOK(NF_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
672}
673
674/*
675 * Create and send an arp packet.
676 */
677void arp_send(int type, int ptype, u32 dest_ip,
678 struct net_device *dev, u32 src_ip,
679 unsigned char *dest_hw, unsigned char *src_hw,
680 unsigned char *target_hw)
681{
682 struct sk_buff *skb;
683
684 /*
685 * No arp on this interface.
686 */
687
688 if (dev->flags&IFF_NOARP)
689 return;
690
691 skb = arp_create(type, ptype, dest_ip, dev, src_ip,
692 dest_hw, src_hw, target_hw);
693 if (skb == NULL) {
694 return;
695 }
696
697 arp_xmit(skb);
698}
699
700static void parp_redo(struct sk_buff *skb)
701{
702 nf_reset(skb);
703 arp_rcv(skb, skb->dev, NULL);
704}
705
706/*
707 * Process an arp request.
708 */
709
710static int arp_process(struct sk_buff *skb)
711{
712 struct net_device *dev = skb->dev;
713 struct in_device *in_dev = in_dev_get(dev);
714 struct arphdr *arp;
715 unsigned char *arp_ptr;
716 struct rtable *rt;
717 unsigned char *sha, *tha;
718 u32 sip, tip;
719 u16 dev_type = dev->type;
720 int addr_type;
721 struct neighbour *n;
722
723 /* arp_rcv below verifies the ARP header and verifies the device
724 * is ARP'able.
725 */
726
727 if (in_dev == NULL)
728 goto out;
729
730 arp = skb->nh.arph;
731
732 switch (dev_type) {
733 default:
734 if (arp->ar_pro != htons(ETH_P_IP) ||
735 htons(dev_type) != arp->ar_hrd)
736 goto out;
737 break;
738#ifdef CONFIG_NET_ETHERNET
739 case ARPHRD_ETHER:
740#endif
741#ifdef CONFIG_TR
742 case ARPHRD_IEEE802_TR:
743#endif
744#ifdef CONFIG_FDDI
745 case ARPHRD_FDDI:
746#endif
747#ifdef CONFIG_NET_FC
748 case ARPHRD_IEEE802:
749#endif
750#if defined(CONFIG_NET_ETHERNET) || defined(CONFIG_TR) || \
751 defined(CONFIG_FDDI) || defined(CONFIG_NET_FC)
752 /*
 753	 * Ethernet, Token Ring and Fibre Channel (which are IEEE 802
 754	 * devices, according to RFC 2625) will accept ARP hardware
 755	 * types of either 1 (Ethernet) or 6 (IEEE 802.2).
 756	 * This is also the case for FDDI, where RFC 1390 says that
 757	 * FDDI devices should accept an ARP hardware type of 1 (Ethernet);
 758	 * however, to be more robust, we accept either 1 (Ethernet)
 759	 * or 6 (IEEE 802.2).
760 */
761 if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
762 arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
763 arp->ar_pro != htons(ETH_P_IP))
764 goto out;
765 break;
766#endif
767#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
768 case ARPHRD_AX25:
769 if (arp->ar_pro != htons(AX25_P_IP) ||
770 arp->ar_hrd != htons(ARPHRD_AX25))
771 goto out;
772 break;
773#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
774 case ARPHRD_NETROM:
775 if (arp->ar_pro != htons(AX25_P_IP) ||
776 arp->ar_hrd != htons(ARPHRD_NETROM))
777 goto out;
778 break;
779#endif
780#endif
781 }
782
783 /* Understand only these message types */
784
785 if (arp->ar_op != htons(ARPOP_REPLY) &&
786 arp->ar_op != htons(ARPOP_REQUEST))
787 goto out;
788
789/*
790 * Extract fields
791 */
792 arp_ptr= (unsigned char *)(arp+1);
793 sha = arp_ptr;
794 arp_ptr += dev->addr_len;
795 memcpy(&sip, arp_ptr, 4);
796 arp_ptr += 4;
797 tha = arp_ptr;
798 arp_ptr += dev->addr_len;
799 memcpy(&tip, arp_ptr, 4);
800/*
801 * Check for bad requests for 127.x.x.x and requests for multicast
802 * addresses. If this is one such, delete it.
803 */
804 if (LOOPBACK(tip) || MULTICAST(tip))
805 goto out;
806
807/*
808 * Special case: We must set Frame Relay source Q.922 address
809 */
810 if (dev_type == ARPHRD_DLCI)
811 sha = dev->broadcast;
812
813/*
814 * Process entry. The idea here is we want to send a reply if it is a
815 * request for us or if it is a request for someone else that we hold
816 * a proxy for. We want to add an entry to our cache if it is a reply
817 * to us or if it is a request for our address.
818 * (The assumption for this last is that if someone is requesting our
819 * address, they are probably intending to talk to us, so it saves time
820 * if we cache their address. Their address is also probably not in
821 * our cache, since ours is not in their cache.)
822 *
823 * Putting this another way, we only care about replies if they are to
824 * us, in which case we add them to the cache. For requests, we care
825 * about those for us and those for our proxies. We reply to both,
826 * and in the case of requests for us we add the requester to the arp
827 * cache.
828 */
829
830 /* Special case: IPv4 duplicate address detection packet (RFC2131) */
831 if (sip == 0) {
832 if (arp->ar_op == htons(ARPOP_REQUEST) &&
833 inet_addr_type(tip) == RTN_LOCAL &&
834 !arp_ignore(in_dev,dev,sip,tip))
835 arp_send(ARPOP_REPLY,ETH_P_ARP,tip,dev,tip,sha,dev->dev_addr,dev->dev_addr);
836 goto out;
837 }
838
839 if (arp->ar_op == htons(ARPOP_REQUEST) &&
840 ip_route_input(skb, tip, sip, 0, dev) == 0) {
841
842 rt = (struct rtable*)skb->dst;
843 addr_type = rt->rt_type;
844
845 if (addr_type == RTN_LOCAL) {
846 n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
847 if (n) {
848 int dont_send = 0;
849
850 if (!dont_send)
851 dont_send |= arp_ignore(in_dev,dev,sip,tip);
852 if (!dont_send && IN_DEV_ARPFILTER(in_dev))
853 dont_send |= arp_filter(sip,tip,dev);
854 if (!dont_send)
855 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
856
857 neigh_release(n);
858 }
859 goto out;
860 } else if (IN_DEV_FORWARD(in_dev)) {
861 if ((rt->rt_flags&RTCF_DNAT) ||
862 (addr_type == RTN_UNICAST && rt->u.dst.dev != dev &&
863 (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) {
864 n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
865 if (n)
866 neigh_release(n);
867
868 if (skb->stamp.tv_sec == LOCALLY_ENQUEUED ||
869 skb->pkt_type == PACKET_HOST ||
870 in_dev->arp_parms->proxy_delay == 0) {
871 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
872 } else {
873 pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
874 in_dev_put(in_dev);
875 return 0;
876 }
877 goto out;
878 }
879 }
880 }
881
882 /* Update our ARP tables */
883
884 n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
885
886#ifdef CONFIG_IP_ACCEPT_UNSOLICITED_ARP
887 /* Unsolicited ARP is not accepted by default.
 888	   It is possible that this option should be enabled for some
 889	   devices (strip is a candidate)
890 */
891 if (n == NULL &&
892 arp->ar_op == htons(ARPOP_REPLY) &&
893 inet_addr_type(sip) == RTN_UNICAST)
894 n = __neigh_lookup(&arp_tbl, &sip, dev, -1);
895#endif
896
897 if (n) {
898 int state = NUD_REACHABLE;
899 int override;
900
 901		/* If several different ARP replies follow back-to-back,
 902		   use the FIRST one.  This is possible if several proxy
 903		   agents are active.  Taking the first reply prevents
 904		   ARP thrashing and chooses the fastest router.
905 */
906 override = time_after(jiffies, n->updated + n->parms->locktime);
907
908 /* Broadcast replies and request packets
909 do not assert neighbour reachability.
910 */
911 if (arp->ar_op != htons(ARPOP_REPLY) ||
912 skb->pkt_type != PACKET_HOST)
913 state = NUD_STALE;
914 neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0);
915 neigh_release(n);
916 }
917
918out:
919 if (in_dev)
920 in_dev_put(in_dev);
921 kfree_skb(skb);
922 return 0;
923}
924
925
926/*
927 * Receive an arp request from the device layer.
928 */
929
930int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
931{
932 struct arphdr *arp;
933
934 /* ARP header, plus 2 device addresses, plus 2 IP addresses. */
935 if (!pskb_may_pull(skb, (sizeof(struct arphdr) +
936 (2 * dev->addr_len) +
937 (2 * sizeof(u32)))))
938 goto freeskb;
939
940 arp = skb->nh.arph;
941 if (arp->ar_hln != dev->addr_len ||
942 dev->flags & IFF_NOARP ||
943 skb->pkt_type == PACKET_OTHERHOST ||
944 skb->pkt_type == PACKET_LOOPBACK ||
945 arp->ar_pln != 4)
946 goto freeskb;
947
948 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
949 goto out_of_mem;
950
951 return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
952
953freeskb:
954 kfree_skb(skb);
955out_of_mem:
956 return 0;
957}
958
959/*
960 * User level interface (ioctl)
961 */
962
963/*
964 * Set (create) an ARP cache entry.
965 */
966
967static int arp_req_set(struct arpreq *r, struct net_device * dev)
968{
969 u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
970 struct neighbour *neigh;
971 int err;
972
973 if (r->arp_flags&ATF_PUBL) {
974 u32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr;
975 if (mask && mask != 0xFFFFFFFF)
976 return -EINVAL;
977 if (!dev && (r->arp_flags & ATF_COM)) {
978 dev = dev_getbyhwaddr(r->arp_ha.sa_family, r->arp_ha.sa_data);
979 if (!dev)
980 return -ENODEV;
981 }
982 if (mask) {
983 if (pneigh_lookup(&arp_tbl, &ip, dev, 1) == NULL)
984 return -ENOBUFS;
985 return 0;
986 }
987 if (dev == NULL) {
988 ipv4_devconf.proxy_arp = 1;
989 return 0;
990 }
991 if (__in_dev_get(dev)) {
992 __in_dev_get(dev)->cnf.proxy_arp = 1;
993 return 0;
994 }
995 return -ENXIO;
996 }
997
998 if (r->arp_flags & ATF_PERM)
999 r->arp_flags |= ATF_COM;
1000 if (dev == NULL) {
1001 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
1002 .tos = RTO_ONLINK } } };
1003 struct rtable * rt;
1004 if ((err = ip_route_output_key(&rt, &fl)) != 0)
1005 return err;
1006 dev = rt->u.dst.dev;
1007 ip_rt_put(rt);
1008 if (!dev)
1009 return -EINVAL;
1010 }
1011 switch (dev->type) {
1012#ifdef CONFIG_FDDI
1013 case ARPHRD_FDDI:
1014 /*
1015 * According to RFC 1390, FDDI devices should accept ARP
1016 * hardware types of 1 (Ethernet). However, to be more
1017 * robust, we'll accept hardware types of either 1 (Ethernet)
1018 * or 6 (IEEE 802.2).
1019 */
1020 if (r->arp_ha.sa_family != ARPHRD_FDDI &&
1021 r->arp_ha.sa_family != ARPHRD_ETHER &&
1022 r->arp_ha.sa_family != ARPHRD_IEEE802)
1023 return -EINVAL;
1024 break;
1025#endif
1026 default:
1027 if (r->arp_ha.sa_family != dev->type)
1028 return -EINVAL;
1029 break;
1030 }
1031
1032 neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
1033 err = PTR_ERR(neigh);
1034 if (!IS_ERR(neigh)) {
1035 unsigned state = NUD_STALE;
1036 if (r->arp_flags & ATF_PERM)
1037 state = NUD_PERMANENT;
1038 err = neigh_update(neigh, (r->arp_flags&ATF_COM) ?
1039 r->arp_ha.sa_data : NULL, state,
1040 NEIGH_UPDATE_F_OVERRIDE|
1041 NEIGH_UPDATE_F_ADMIN);
1042 neigh_release(neigh);
1043 }
1044 return err;
1045}
1046
1047static unsigned arp_state_to_flags(struct neighbour *neigh)
1048{
1049 unsigned flags = 0;
1050 if (neigh->nud_state&NUD_PERMANENT)
1051 flags = ATF_PERM|ATF_COM;
1052 else if (neigh->nud_state&NUD_VALID)
1053 flags = ATF_COM;
1054 return flags;
1055}
1056
1057/*
1058 * Get an ARP cache entry.
1059 */
1060
1061static int arp_req_get(struct arpreq *r, struct net_device *dev)
1062{
1063 u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
1064 struct neighbour *neigh;
1065 int err = -ENXIO;
1066
1067 neigh = neigh_lookup(&arp_tbl, &ip, dev);
1068 if (neigh) {
1069 read_lock_bh(&neigh->lock);
1070 memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
1071 r->arp_flags = arp_state_to_flags(neigh);
1072 read_unlock_bh(&neigh->lock);
1073 r->arp_ha.sa_family = dev->type;
1074 strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
1075 neigh_release(neigh);
1076 err = 0;
1077 }
1078 return err;
1079}
1080
1081static int arp_req_delete(struct arpreq *r, struct net_device * dev)
1082{
1083 int err;
1084 u32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
1085 struct neighbour *neigh;
1086
1087 if (r->arp_flags & ATF_PUBL) {
1088 u32 mask =
1089 ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
1090 if (mask == 0xFFFFFFFF)
1091 return pneigh_delete(&arp_tbl, &ip, dev);
1092 if (mask == 0) {
1093 if (dev == NULL) {
1094 ipv4_devconf.proxy_arp = 0;
1095 return 0;
1096 }
1097 if (__in_dev_get(dev)) {
1098 __in_dev_get(dev)->cnf.proxy_arp = 0;
1099 return 0;
1100 }
1101 return -ENXIO;
1102 }
1103 return -EINVAL;
1104 }
1105
1106 if (dev == NULL) {
1107 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
1108 .tos = RTO_ONLINK } } };
1109 struct rtable * rt;
1110 if ((err = ip_route_output_key(&rt, &fl)) != 0)
1111 return err;
1112 dev = rt->u.dst.dev;
1113 ip_rt_put(rt);
1114 if (!dev)
1115 return -EINVAL;
1116 }
1117 err = -ENXIO;
1118 neigh = neigh_lookup(&arp_tbl, &ip, dev);
1119 if (neigh) {
1120 if (neigh->nud_state&~NUD_NOARP)
1121 err = neigh_update(neigh, NULL, NUD_FAILED,
1122 NEIGH_UPDATE_F_OVERRIDE|
1123 NEIGH_UPDATE_F_ADMIN);
1124 neigh_release(neigh);
1125 }
1126 return err;
1127}
1128
1129/*
1130 * Handle an ARP layer I/O control request.
1131 */
1132
1133int arp_ioctl(unsigned int cmd, void __user *arg)
1134{
1135 int err;
1136 struct arpreq r;
1137 struct net_device *dev = NULL;
1138
1139 switch (cmd) {
1140 case SIOCDARP:
1141 case SIOCSARP:
1142 if (!capable(CAP_NET_ADMIN))
1143 return -EPERM;
1144 case SIOCGARP:
1145 err = copy_from_user(&r, arg, sizeof(struct arpreq));
1146 if (err)
1147 return -EFAULT;
1148 break;
1149 default:
1150 return -EINVAL;
1151 }
1152
1153 if (r.arp_pa.sa_family != AF_INET)
1154 return -EPFNOSUPPORT;
1155
1156 if (!(r.arp_flags & ATF_PUBL) &&
1157 (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB)))
1158 return -EINVAL;
1159 if (!(r.arp_flags & ATF_NETMASK))
1160 ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
1161 htonl(0xFFFFFFFFUL);
1162 rtnl_lock();
1163 if (r.arp_dev[0]) {
1164 err = -ENODEV;
1165 if ((dev = __dev_get_by_name(r.arp_dev)) == NULL)
1166 goto out;
1167
1168 /* Mmmm... It is wrong... ARPHRD_NETROM==0 */
1169 if (!r.arp_ha.sa_family)
1170 r.arp_ha.sa_family = dev->type;
1171 err = -EINVAL;
1172 if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type)
1173 goto out;
1174 } else if (cmd == SIOCGARP) {
1175 err = -ENODEV;
1176 goto out;
1177 }
1178
1179 switch(cmd) {
1180 case SIOCDARP:
1181 err = arp_req_delete(&r, dev);
1182 break;
1183 case SIOCSARP:
1184 err = arp_req_set(&r, dev);
1185 break;
1186 case SIOCGARP:
1187 err = arp_req_get(&r, dev);
1188 if (!err && copy_to_user(arg, &r, sizeof(r)))
1189 err = -EFAULT;
1190 break;
1191 }
1192out:
1193 rtnl_unlock();
1194 return err;
1195}
1196
1197static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
1198{
1199 struct net_device *dev = ptr;
1200
1201 switch (event) {
1202 case NETDEV_CHANGEADDR:
1203 neigh_changeaddr(&arp_tbl, dev);
1204 rt_cache_flush(0);
1205 break;
1206 default:
1207 break;
1208 }
1209
1210 return NOTIFY_DONE;
1211}
1212
1213static struct notifier_block arp_netdev_notifier = {
1214 .notifier_call = arp_netdev_event,
1215};
1216
1217/* Note that this is not on the notifier chain.
1218   It is necessary that this routine be called after the route cache
1219   has been flushed.
1220 */
1221void arp_ifdown(struct net_device *dev)
1222{
1223 neigh_ifdown(&arp_tbl, dev);
1224}
1225
1226
1227/*
1228 * Called once on startup.
1229 */
1230
1231static struct packet_type arp_packet_type = {
1232 .type = __constant_htons(ETH_P_ARP),
1233 .func = arp_rcv,
1234};
1235
1236static int arp_proc_init(void);
1237
1238void __init arp_init(void)
1239{
1240 neigh_table_init(&arp_tbl);
1241
1242 dev_add_pack(&arp_packet_type);
1243 arp_proc_init();
1244#ifdef CONFIG_SYSCTL
1245 neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4,
1246 NET_IPV4_NEIGH, "ipv4", NULL, NULL);
1247#endif
1248 register_netdevice_notifier(&arp_netdev_notifier);
1249}
1250
1251#ifdef CONFIG_PROC_FS
1252#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
1253
1254/* ------------------------------------------------------------------------ */
1255/*
1256 * ax25 -> ASCII conversion
1257 */
1258static char *ax2asc2(ax25_address *a, char *buf)
1259{
1260 char c, *s;
1261 int n;
1262
1263 for (n = 0, s = buf; n < 6; n++) {
1264 c = (a->ax25_call[n] >> 1) & 0x7F;
1265
1266 if (c != ' ') *s++ = c;
1267 }
1268
1269 *s++ = '-';
1270
1271 if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) {
1272 *s++ = '1';
1273 n -= 10;
1274 }
1275
1276 *s++ = n + '0';
1277 *s++ = '\0';
1278
1279 if (*buf == '\0' || *buf == '-')
1280 return "*";
1281
1282 return buf;
1283
1284}
1285#endif /* CONFIG_AX25 */
1286
1287#define HBUFFERLEN 30
1288
1289static void arp_format_neigh_entry(struct seq_file *seq,
1290 struct neighbour *n)
1291{
1292 char hbuffer[HBUFFERLEN];
1293 const char hexbuf[] = "0123456789ABCDEF";
1294 int k, j;
1295 char tbuf[16];
1296 struct net_device *dev = n->dev;
1297 int hatype = dev->type;
1298
1299 read_lock(&n->lock);
1300 /* Convert hardware address to XX:XX:XX:XX ... form. */
1301#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
1302 if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
1303 ax2asc2((ax25_address *)n->ha, hbuffer);
1304 else {
1305#endif
1306 for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
1307 hbuffer[k++] = hexbuf[(n->ha[j] >> 4) & 15];
1308 hbuffer[k++] = hexbuf[n->ha[j] & 15];
1309 hbuffer[k++] = ':';
1310 }
1311 hbuffer[--k] = 0;
1312#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
1313 }
1314#endif
1315 sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->primary_key));
1316 seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
1317 tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
1318 read_unlock(&n->lock);
1319}
1320
1321static void arp_format_pneigh_entry(struct seq_file *seq,
1322 struct pneigh_entry *n)
1323{
1324 struct net_device *dev = n->dev;
1325 int hatype = dev ? dev->type : 0;
1326 char tbuf[16];
1327
1328 sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->key));
1329 seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
1330 tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00",
1331 dev ? dev->name : "*");
1332}
1333
1334static int arp_seq_show(struct seq_file *seq, void *v)
1335{
1336 if (v == SEQ_START_TOKEN) {
1337 seq_puts(seq, "IP address HW type Flags "
1338 "HW address Mask Device\n");
1339 } else {
1340 struct neigh_seq_state *state = seq->private;
1341
1342 if (state->flags & NEIGH_SEQ_IS_PNEIGH)
1343 arp_format_pneigh_entry(seq, v);
1344 else
1345 arp_format_neigh_entry(seq, v);
1346 }
1347
1348 return 0;
1349}
1350
1351static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
1352{
1353 /* Don't want to confuse "arp -a" w/ magic entries,
1354 * so we tell the generic iterator to skip NUD_NOARP.
1355 */
1356 return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP);
1357}
1358
1359/* ------------------------------------------------------------------------ */
1360
1361static struct seq_operations arp_seq_ops = {
1362 .start = arp_seq_start,
1363 .next = neigh_seq_next,
1364 .stop = neigh_seq_stop,
1365 .show = arp_seq_show,
1366};
1367
1368static int arp_seq_open(struct inode *inode, struct file *file)
1369{
1370 struct seq_file *seq;
1371 int rc = -ENOMEM;
1372 struct neigh_seq_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
1373
1374 if (!s)
1375 goto out;
1376
1377 memset(s, 0, sizeof(*s));
1378 rc = seq_open(file, &arp_seq_ops);
1379 if (rc)
1380 goto out_kfree;
1381
1382 seq = file->private_data;
1383 seq->private = s;
1384out:
1385 return rc;
1386out_kfree:
1387 kfree(s);
1388 goto out;
1389}
1390
1391static struct file_operations arp_seq_fops = {
1392 .owner = THIS_MODULE,
1393 .open = arp_seq_open,
1394 .read = seq_read,
1395 .llseek = seq_lseek,
1396 .release = seq_release_private,
1397};
1398
1399static int __init arp_proc_init(void)
1400{
1401 if (!proc_net_fops_create("arp", S_IRUGO, &arp_seq_fops))
1402 return -ENOMEM;
1403 return 0;
1404}
1405
1406#else /* CONFIG_PROC_FS */
1407
1408static int __init arp_proc_init(void)
1409{
1410 return 0;
1411}
1412
1413#endif /* CONFIG_PROC_FS */
1414
1415EXPORT_SYMBOL(arp_broken_ops);
1416EXPORT_SYMBOL(arp_find);
1417EXPORT_SYMBOL(arp_rcv);
1418EXPORT_SYMBOL(arp_create);
1419EXPORT_SYMBOL(arp_xmit);
1420EXPORT_SYMBOL(arp_send);
1421EXPORT_SYMBOL(arp_tbl);
1422
1423#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1424EXPORT_SYMBOL(clip_tbl_hook);
1425#endif
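
arp_send() above is the whole transmit-side API: arp_create() builds the frame and arp_xmit() pushes it through the NF_ARP_OUT hook. As a usage sketch only, a helper announcing a device's own address with a gratuitous ARP request might look like this (gratuitous_arp_send() is an illustrative name, not something defined in this file):

static void gratuitous_arp_send(struct net_device *dev, u32 addr)
{
	/* Sender and target IP are both 'addr'; a NULL destination
	 * hardware address makes arp_create() broadcast the frame. */
	arp_send(ARPOP_REQUEST, ETH_P_ARP, addr, dev, addr,
		 NULL, dev->dev_addr, NULL);
}
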
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
new file mode 100644
index 000000000000..b1db561f2542
--- /dev/null
+++ b/net/ipv4/datagram.c
@@ -0,0 +1,73 @@
1/*
2 * common UDP/RAW code
3 * Linux INET implementation
4 *
5 * Authors:
6 * Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 */
13
14#include <linux/config.h>
15#include <linux/types.h>
16#include <linux/module.h>
17#include <linux/ip.h>
18#include <linux/in.h>
19#include <net/sock.h>
20#include <net/tcp.h>
21#include <net/route.h>
22
23int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
24{
25 struct inet_sock *inet = inet_sk(sk);
26 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
27 struct rtable *rt;
28 u32 saddr;
29 int oif;
30 int err;
31
32
33 if (addr_len < sizeof(*usin))
34 return -EINVAL;
35
36 if (usin->sin_family != AF_INET)
37 return -EAFNOSUPPORT;
38
39 sk_dst_reset(sk);
40
41 oif = sk->sk_bound_dev_if;
42 saddr = inet->saddr;
43 if (MULTICAST(usin->sin_addr.s_addr)) {
44 if (!oif)
45 oif = inet->mc_index;
46 if (!saddr)
47 saddr = inet->mc_addr;
48 }
49 err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr,
50 RT_CONN_FLAGS(sk), oif,
51 sk->sk_protocol,
52 inet->sport, usin->sin_port, sk);
53 if (err)
54 return err;
55 if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) {
56 ip_rt_put(rt);
57 return -EACCES;
58 }
59 if (!inet->saddr)
60 inet->saddr = rt->rt_src; /* Update source address */
61 if (!inet->rcv_saddr)
62 inet->rcv_saddr = rt->rt_src;
63 inet->daddr = rt->rt_dst;
64 inet->dport = usin->sin_port;
65 sk->sk_state = TCP_ESTABLISHED;
66 inet->id = jiffies;
67
68 sk_dst_set(sk, &rt->u.dst);
69 return(0);
70}
71
72EXPORT_SYMBOL(ip4_datagram_connect);
73
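ip4_datagram_connect() is what services connect(2) on UDP and raw IPv4 sockets: it resolves the route once, records source, destination and port in the inet_sock, and caches the route with sk_dst_set(), so later sends can reuse that state. A minimal user-space sketch of the path it serves (the address and port are documentation placeholders):

#include <arpa/inet.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in dst;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9);	/* discard service, placeholder */
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

	/* For SOCK_DGRAM this ends up in the kernel's ip4_datagram_connect(). */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) == 0)
		send(fd, "ping", 4, 0);	/* no per-call destination needed now */

	close(fd);
	return 0;
}
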
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
new file mode 100644
index 000000000000..eea7ef010776
--- /dev/null
+++ b/net/ipv4/devinet.c
@@ -0,0 +1,1508 @@
1/*
2 * NET3 IP device support routines.
3 *
4 * Version: $Id: devinet.c,v 1.44 2001/10/31 21:55:54 davem Exp $
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Derived from the IP parts of dev.c 1.0.19
12 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
13 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 * Mark Evans, <evansmp@uhura.aston.ac.uk>
15 *
16 * Additional Authors:
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
19 *
20 * Changes:
21 * Alexey Kuznetsov: pa_* fields are replaced with ifaddr
22 * lists.
23 * Cyrus Durgin: updated for kmod
24 * Matthias Andree: in devinet_ioctl, compare label and
25 * address (4.4BSD alias style support),
26 * fall back to comparing just the label
27 * if no match found.
28 */
29
30#include <linux/config.h>
31
32#include <asm/uaccess.h>
33#include <asm/system.h>
34#include <linux/bitops.h>
35#include <linux/module.h>
36#include <linux/types.h>
37#include <linux/kernel.h>
38#include <linux/sched.h>
39#include <linux/string.h>
40#include <linux/mm.h>
41#include <linux/socket.h>
42#include <linux/sockios.h>
43#include <linux/in.h>
44#include <linux/errno.h>
45#include <linux/interrupt.h>
46#include <linux/if_ether.h>
47#include <linux/inet.h>
48#include <linux/netdevice.h>
49#include <linux/etherdevice.h>
50#include <linux/skbuff.h>
51#include <linux/rtnetlink.h>
52#include <linux/init.h>
53#include <linux/notifier.h>
54#include <linux/inetdevice.h>
55#include <linux/igmp.h>
56#ifdef CONFIG_SYSCTL
57#include <linux/sysctl.h>
58#endif
59#include <linux/kmod.h>
60
61#include <net/ip.h>
62#include <net/route.h>
63#include <net/ip_fib.h>
64
65struct ipv4_devconf ipv4_devconf = {
66 .accept_redirects = 1,
67 .send_redirects = 1,
68 .secure_redirects = 1,
69 .shared_media = 1,
70};
71
72static struct ipv4_devconf ipv4_devconf_dflt = {
73 .accept_redirects = 1,
74 .send_redirects = 1,
75 .secure_redirects = 1,
76 .shared_media = 1,
77 .accept_source_route = 1,
78};
79
80static void rtmsg_ifa(int event, struct in_ifaddr *);
81
82static struct notifier_block *inetaddr_chain;
83static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
84 int destroy);
85#ifdef CONFIG_SYSCTL
86static void devinet_sysctl_register(struct in_device *in_dev,
87 struct ipv4_devconf *p);
88static void devinet_sysctl_unregister(struct ipv4_devconf *p);
89#endif
90
91/* Locks all the inet devices. */
92
93static struct in_ifaddr *inet_alloc_ifa(void)
94{
95 struct in_ifaddr *ifa = kmalloc(sizeof(*ifa), GFP_KERNEL);
96
97 if (ifa) {
98 memset(ifa, 0, sizeof(*ifa));
99 INIT_RCU_HEAD(&ifa->rcu_head);
100 }
101
102 return ifa;
103}
104
105static void inet_rcu_free_ifa(struct rcu_head *head)
106{
107 struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head);
108 if (ifa->ifa_dev)
109 in_dev_put(ifa->ifa_dev);
110 kfree(ifa);
111}
112
113static inline void inet_free_ifa(struct in_ifaddr *ifa)
114{
115 call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
116}
117
118void in_dev_finish_destroy(struct in_device *idev)
119{
120 struct net_device *dev = idev->dev;
121
122 BUG_TRAP(!idev->ifa_list);
123 BUG_TRAP(!idev->mc_list);
124#ifdef NET_REFCNT_DEBUG
125 printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n",
126 idev, dev ? dev->name : "NIL");
127#endif
128 dev_put(dev);
129 if (!idev->dead)
130 printk("Freeing alive in_device %p\n", idev);
131 else {
132 kfree(idev);
133 }
134}
135
136struct in_device *inetdev_init(struct net_device *dev)
137{
138 struct in_device *in_dev;
139
140 ASSERT_RTNL();
141
142 in_dev = kmalloc(sizeof(*in_dev), GFP_KERNEL);
143 if (!in_dev)
144 goto out;
145 memset(in_dev, 0, sizeof(*in_dev));
146 INIT_RCU_HEAD(&in_dev->rcu_head);
147 memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf));
148 in_dev->cnf.sysctl = NULL;
149 in_dev->dev = dev;
150 if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL)
151 goto out_kfree;
152 /* Reference in_dev->dev */
153 dev_hold(dev);
154#ifdef CONFIG_SYSCTL
155 neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
156 NET_IPV4_NEIGH, "ipv4", NULL, NULL);
157#endif
158
159 /* Account for reference dev->ip_ptr */
160 in_dev_hold(in_dev);
161 rcu_assign_pointer(dev->ip_ptr, in_dev);
162
163#ifdef CONFIG_SYSCTL
164 devinet_sysctl_register(in_dev, &in_dev->cnf);
165#endif
166 ip_mc_init_dev(in_dev);
167 if (dev->flags & IFF_UP)
168 ip_mc_up(in_dev);
169out:
170 return in_dev;
171out_kfree:
172 kfree(in_dev);
173 in_dev = NULL;
174 goto out;
175}
176
177static void in_dev_rcu_put(struct rcu_head *head)
178{
179 struct in_device *idev = container_of(head, struct in_device, rcu_head);
180 in_dev_put(idev);
181}
182
183static void inetdev_destroy(struct in_device *in_dev)
184{
185 struct in_ifaddr *ifa;
186 struct net_device *dev;
187
188 ASSERT_RTNL();
189
190 dev = in_dev->dev;
191 if (dev == &loopback_dev)
192 return;
193
194 in_dev->dead = 1;
195
196 ip_mc_destroy_dev(in_dev);
197
198 while ((ifa = in_dev->ifa_list) != NULL) {
199 inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
200 inet_free_ifa(ifa);
201 }
202
203#ifdef CONFIG_SYSCTL
204 devinet_sysctl_unregister(&in_dev->cnf);
205#endif
206
207 dev->ip_ptr = NULL;
208
209#ifdef CONFIG_SYSCTL
210 neigh_sysctl_unregister(in_dev->arp_parms);
211#endif
212 neigh_parms_release(&arp_tbl, in_dev->arp_parms);
213 arp_ifdown(dev);
214
215 call_rcu(&in_dev->rcu_head, in_dev_rcu_put);
216}
217
218int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b)
219{
220 rcu_read_lock();
221 for_primary_ifa(in_dev) {
222 if (inet_ifa_match(a, ifa)) {
223 if (!b || inet_ifa_match(b, ifa)) {
224 rcu_read_unlock();
225 return 1;
226 }
227 }
228 } endfor_ifa(in_dev);
229 rcu_read_unlock();
230 return 0;
231}
232
233static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
234 int destroy)
235{
236 struct in_ifaddr *ifa1 = *ifap;
237
238 ASSERT_RTNL();
239
 240	/* 1. Deleting the primary ifaddr forces deletion of all secondaries */
241
242 if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
243 struct in_ifaddr *ifa;
244 struct in_ifaddr **ifap1 = &ifa1->ifa_next;
245
246 while ((ifa = *ifap1) != NULL) {
247 if (!(ifa->ifa_flags & IFA_F_SECONDARY) ||
248 ifa1->ifa_mask != ifa->ifa_mask ||
249 !inet_ifa_match(ifa1->ifa_address, ifa)) {
250 ifap1 = &ifa->ifa_next;
251 continue;
252 }
253
254 *ifap1 = ifa->ifa_next;
255
256 rtmsg_ifa(RTM_DELADDR, ifa);
257 notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
258 inet_free_ifa(ifa);
259 }
260 }
261
262 /* 2. Unlink it */
263
264 *ifap = ifa1->ifa_next;
265
266 /* 3. Announce address deletion */
267
268 /* Send message first, then call notifier.
 269	   At first sight, the FIB update triggered by the notifier
 270	   will refer to an already deleted ifaddr, which could confuse
 271	   netlink listeners. It is not true: look, gated sees
 272	   that the route was deleted and, if it still thinks the ifaddr
 273	   is valid, it will try to restore the deleted routes... Grr.
 274	   So this order is correct.
275 */
276 rtmsg_ifa(RTM_DELADDR, ifa1);
277 notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
278 if (destroy) {
279 inet_free_ifa(ifa1);
280
281 if (!in_dev->ifa_list)
282 inetdev_destroy(in_dev);
283 }
284}
285
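/* Link an ifaddr into its device's list, deciding whether it is a primary
 * or a secondary address.  Illustrative example of the rules below: with
 * 192.168.1.1/24 already configured on the device, adding 192.168.1.2/24
 * matches the existing mask and subnet, so the new address is flagged
 * IFA_F_SECONDARY; re-adding 192.168.1.1/24 itself returns -EEXIST, and
 * adding it with a different scope returns -EINVAL.
 */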
286static int inet_insert_ifa(struct in_ifaddr *ifa)
287{
288 struct in_device *in_dev = ifa->ifa_dev;
289 struct in_ifaddr *ifa1, **ifap, **last_primary;
290
291 ASSERT_RTNL();
292
293 if (!ifa->ifa_local) {
294 inet_free_ifa(ifa);
295 return 0;
296 }
297
298 ifa->ifa_flags &= ~IFA_F_SECONDARY;
299 last_primary = &in_dev->ifa_list;
300
301 for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
302 ifap = &ifa1->ifa_next) {
303 if (!(ifa1->ifa_flags & IFA_F_SECONDARY) &&
304 ifa->ifa_scope <= ifa1->ifa_scope)
305 last_primary = &ifa1->ifa_next;
306 if (ifa1->ifa_mask == ifa->ifa_mask &&
307 inet_ifa_match(ifa1->ifa_address, ifa)) {
308 if (ifa1->ifa_local == ifa->ifa_local) {
309 inet_free_ifa(ifa);
310 return -EEXIST;
311 }
312 if (ifa1->ifa_scope != ifa->ifa_scope) {
313 inet_free_ifa(ifa);
314 return -EINVAL;
315 }
316 ifa->ifa_flags |= IFA_F_SECONDARY;
317 }
318 }
319
320 if (!(ifa->ifa_flags & IFA_F_SECONDARY)) {
321 net_srandom(ifa->ifa_local);
322 ifap = last_primary;
323 }
324
325 ifa->ifa_next = *ifap;
326 *ifap = ifa;
327
328	/* Send message first, then call notifier.
329	   The notifier will trigger a FIB update, so that
330	   netlink listeners will know about the new ifaddr */
331 rtmsg_ifa(RTM_NEWADDR, ifa);
332 notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
333
334 return 0;
335}
336
337static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
338{
339 struct in_device *in_dev = __in_dev_get(dev);
340
341 ASSERT_RTNL();
342
343 if (!in_dev) {
344 in_dev = inetdev_init(dev);
345 if (!in_dev) {
346 inet_free_ifa(ifa);
347 return -ENOBUFS;
348 }
349 }
350 if (ifa->ifa_dev != in_dev) {
351 BUG_TRAP(!ifa->ifa_dev);
352 in_dev_hold(in_dev);
353 ifa->ifa_dev = in_dev;
354 }
355 if (LOOPBACK(ifa->ifa_local))
356 ifa->ifa_scope = RT_SCOPE_HOST;
357 return inet_insert_ifa(ifa);
358}
359
360struct in_device *inetdev_by_index(int ifindex)
361{
362 struct net_device *dev;
363 struct in_device *in_dev = NULL;
364 read_lock(&dev_base_lock);
365 dev = __dev_get_by_index(ifindex);
366 if (dev)
367 in_dev = in_dev_get(dev);
368 read_unlock(&dev_base_lock);
369 return in_dev;
370}
371
372/* Called only from RTNL semaphored context. No locks. */
373
374struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix,
375 u32 mask)
376{
377 ASSERT_RTNL();
378
379 for_primary_ifa(in_dev) {
380 if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa))
381 return ifa;
382 } endfor_ifa(in_dev);
383 return NULL;
384}
385
386static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
387{
388 struct rtattr **rta = arg;
389 struct in_device *in_dev;
390 struct ifaddrmsg *ifm = NLMSG_DATA(nlh);
391 struct in_ifaddr *ifa, **ifap;
392
393 ASSERT_RTNL();
394
395 if ((in_dev = inetdev_by_index(ifm->ifa_index)) == NULL)
396 goto out;
397 __in_dev_put(in_dev);
398
399 for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
400 ifap = &ifa->ifa_next) {
401 if ((rta[IFA_LOCAL - 1] &&
402 memcmp(RTA_DATA(rta[IFA_LOCAL - 1]),
403 &ifa->ifa_local, 4)) ||
404 (rta[IFA_LABEL - 1] &&
405 rtattr_strcmp(rta[IFA_LABEL - 1], ifa->ifa_label)) ||
406 (rta[IFA_ADDRESS - 1] &&
407 (ifm->ifa_prefixlen != ifa->ifa_prefixlen ||
408 !inet_ifa_match(*(u32*)RTA_DATA(rta[IFA_ADDRESS - 1]),
409 ifa))))
410 continue;
411 inet_del_ifa(in_dev, ifap, 1);
412 return 0;
413 }
414out:
415 return -EADDRNOTAVAIL;
416}
417
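/* rtnetlink RTM_NEWADDR handler.  As an illustration (assuming an
 * iproute2-style caller, which is not part of this file), a command such as
 * "ip addr add 192.168.1.2/24 dev eth0" arrives here as an ifaddrmsg with
 * ifa_prefixlen == 24 and an IFA_LOCAL attribute; when no IFA_ADDRESS
 * (peer) attribute is supplied, the code below reuses IFA_LOCAL for it.
 */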
418static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
419{
420 struct rtattr **rta = arg;
421 struct net_device *dev;
422 struct in_device *in_dev;
423 struct ifaddrmsg *ifm = NLMSG_DATA(nlh);
424 struct in_ifaddr *ifa;
425 int rc = -EINVAL;
426
427 ASSERT_RTNL();
428
429 if (ifm->ifa_prefixlen > 32 || !rta[IFA_LOCAL - 1])
430 goto out;
431
432 rc = -ENODEV;
433 if ((dev = __dev_get_by_index(ifm->ifa_index)) == NULL)
434 goto out;
435
436 rc = -ENOBUFS;
437 if ((in_dev = __in_dev_get(dev)) == NULL) {
438 in_dev = inetdev_init(dev);
439 if (!in_dev)
440 goto out;
441 }
442
443 if ((ifa = inet_alloc_ifa()) == NULL)
444 goto out;
445
446 if (!rta[IFA_ADDRESS - 1])
447 rta[IFA_ADDRESS - 1] = rta[IFA_LOCAL - 1];
448 memcpy(&ifa->ifa_local, RTA_DATA(rta[IFA_LOCAL - 1]), 4);
449 memcpy(&ifa->ifa_address, RTA_DATA(rta[IFA_ADDRESS - 1]), 4);
450 ifa->ifa_prefixlen = ifm->ifa_prefixlen;
451 ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
452 if (rta[IFA_BROADCAST - 1])
453 memcpy(&ifa->ifa_broadcast,
454 RTA_DATA(rta[IFA_BROADCAST - 1]), 4);
455 if (rta[IFA_ANYCAST - 1])
456 memcpy(&ifa->ifa_anycast, RTA_DATA(rta[IFA_ANYCAST - 1]), 4);
457 ifa->ifa_flags = ifm->ifa_flags;
458 ifa->ifa_scope = ifm->ifa_scope;
459 in_dev_hold(in_dev);
460 ifa->ifa_dev = in_dev;
461 if (rta[IFA_LABEL - 1])
462 rtattr_strlcpy(ifa->ifa_label, rta[IFA_LABEL - 1], IFNAMSIZ);
463 else
464 memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
465
466 rc = inet_insert_ifa(ifa);
467out:
468 return rc;
469}
470
471/*
472 * Determine a default network mask, based on the IP address.
473 */
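/* Illustrative values under the classful rules: 10.1.2.3 is class A, so
 * inet_abc_len() returns 8; 172.16.0.1 is class B -> 16; 192.168.1.1 is
 * class C -> 24; 0.0.0.0 (ZERONET) -> 0; a multicast address such as
 * 224.0.0.1 falls through to the default of -1.
 */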
474
475static __inline__ int inet_abc_len(u32 addr)
476{
477 int rc = -1; /* Something else, probably a multicast. */
478
479 if (ZERONET(addr))
480 rc = 0;
481 else {
482 addr = ntohl(addr);
483
484 if (IN_CLASSA(addr))
485 rc = 8;
486 else if (IN_CLASSB(addr))
487 rc = 16;
488 else if (IN_CLASSC(addr))
489 rc = 24;
490 }
491
492 return rc;
493}
494
495
496int devinet_ioctl(unsigned int cmd, void __user *arg)
497{
498 struct ifreq ifr;
499 struct sockaddr_in sin_orig;
500 struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
501 struct in_device *in_dev;
502 struct in_ifaddr **ifap = NULL;
503 struct in_ifaddr *ifa = NULL;
504 struct net_device *dev;
505 char *colon;
506 int ret = -EFAULT;
507 int tryaddrmatch = 0;
508
509 /*
510 * Fetch the caller's info block into kernel space
511 */
512
513 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
514 goto out;
515 ifr.ifr_name[IFNAMSIZ - 1] = 0;
516
517 /* save original address for comparison */
518 memcpy(&sin_orig, sin, sizeof(*sin));
519
520 colon = strchr(ifr.ifr_name, ':');
521 if (colon)
522 *colon = 0;
523
524#ifdef CONFIG_KMOD
525 dev_load(ifr.ifr_name);
526#endif
527
528 switch(cmd) {
529 case SIOCGIFADDR: /* Get interface address */
530 case SIOCGIFBRDADDR: /* Get the broadcast address */
531 case SIOCGIFDSTADDR: /* Get the destination address */
532 case SIOCGIFNETMASK: /* Get the netmask for the interface */
533 /* Note that these ioctls will not sleep,
534 so that we do not impose a lock.
535 One day we will be forced to put shlock here (I mean SMP)
536 */
537 tryaddrmatch = (sin_orig.sin_family == AF_INET);
538 memset(sin, 0, sizeof(*sin));
539 sin->sin_family = AF_INET;
540 break;
541
542 case SIOCSIFFLAGS:
543 ret = -EACCES;
544 if (!capable(CAP_NET_ADMIN))
545 goto out;
546 break;
547 case SIOCSIFADDR: /* Set interface address (and family) */
548 case SIOCSIFBRDADDR: /* Set the broadcast address */
549 case SIOCSIFDSTADDR: /* Set the destination address */
550 case SIOCSIFNETMASK: /* Set the netmask for the interface */
551 ret = -EACCES;
552 if (!capable(CAP_NET_ADMIN))
553 goto out;
554 ret = -EINVAL;
555 if (sin->sin_family != AF_INET)
556 goto out;
557 break;
558 default:
559 ret = -EINVAL;
560 goto out;
561 }
562
563 rtnl_lock();
564
565 ret = -ENODEV;
566 if ((dev = __dev_get_by_name(ifr.ifr_name)) == NULL)
567 goto done;
568
569 if (colon)
570 *colon = ':';
571
572 if ((in_dev = __in_dev_get(dev)) != NULL) {
573 if (tryaddrmatch) {
574 /* Matthias Andree */
575 /* compare label and address (4.4BSD style) */
576 /* note: we only do this for a limited set of ioctls
577 and only if the original address family was AF_INET.
578 This is checked above. */
579 for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
580 ifap = &ifa->ifa_next) {
581 if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&
582 sin_orig.sin_addr.s_addr ==
583 ifa->ifa_address) {
584 break; /* found */
585 }
586 }
587 }
588 /* we didn't get a match, maybe the application is
589 4.3BSD-style and passed in junk so we fall back to
590 comparing just the label */
591 if (!ifa) {
592 for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
593 ifap = &ifa->ifa_next)
594 if (!strcmp(ifr.ifr_name, ifa->ifa_label))
595 break;
596 }
597 }
598
599 ret = -EADDRNOTAVAIL;
600 if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS)
601 goto done;
602
603 switch(cmd) {
604 case SIOCGIFADDR: /* Get interface address */
605 sin->sin_addr.s_addr = ifa->ifa_local;
606 goto rarok;
607
608 case SIOCGIFBRDADDR: /* Get the broadcast address */
609 sin->sin_addr.s_addr = ifa->ifa_broadcast;
610 goto rarok;
611
612 case SIOCGIFDSTADDR: /* Get the destination address */
613 sin->sin_addr.s_addr = ifa->ifa_address;
614 goto rarok;
615
616 case SIOCGIFNETMASK: /* Get the netmask for the interface */
617 sin->sin_addr.s_addr = ifa->ifa_mask;
618 goto rarok;
619
620 case SIOCSIFFLAGS:
621 if (colon) {
622 ret = -EADDRNOTAVAIL;
623 if (!ifa)
624 break;
625 ret = 0;
626 if (!(ifr.ifr_flags & IFF_UP))
627 inet_del_ifa(in_dev, ifap, 1);
628 break;
629 }
630 ret = dev_change_flags(dev, ifr.ifr_flags);
631 break;
632
633 case SIOCSIFADDR: /* Set interface address (and family) */
634 ret = -EINVAL;
635 if (inet_abc_len(sin->sin_addr.s_addr) < 0)
636 break;
637
638 if (!ifa) {
639 ret = -ENOBUFS;
640 if ((ifa = inet_alloc_ifa()) == NULL)
641 break;
642 if (colon)
643 memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
644 else
645 memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
646 } else {
647 ret = 0;
648 if (ifa->ifa_local == sin->sin_addr.s_addr)
649 break;
650 inet_del_ifa(in_dev, ifap, 0);
651 ifa->ifa_broadcast = 0;
652 ifa->ifa_anycast = 0;
653 }
654
655 ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr;
656
657 if (!(dev->flags & IFF_POINTOPOINT)) {
658 ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address);
659 ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
660 if ((dev->flags & IFF_BROADCAST) &&
661 ifa->ifa_prefixlen < 31)
662 ifa->ifa_broadcast = ifa->ifa_address |
663 ~ifa->ifa_mask;
664 } else {
665 ifa->ifa_prefixlen = 32;
666 ifa->ifa_mask = inet_make_mask(32);
667 }
668 ret = inet_set_ifa(dev, ifa);
669 break;
670
671 case SIOCSIFBRDADDR: /* Set the broadcast address */
672 ret = 0;
673 if (ifa->ifa_broadcast != sin->sin_addr.s_addr) {
674 inet_del_ifa(in_dev, ifap, 0);
675 ifa->ifa_broadcast = sin->sin_addr.s_addr;
676 inet_insert_ifa(ifa);
677 }
678 break;
679
680 case SIOCSIFDSTADDR: /* Set the destination address */
681 ret = 0;
682 if (ifa->ifa_address == sin->sin_addr.s_addr)
683 break;
684 ret = -EINVAL;
685 if (inet_abc_len(sin->sin_addr.s_addr) < 0)
686 break;
687 ret = 0;
688 inet_del_ifa(in_dev, ifap, 0);
689 ifa->ifa_address = sin->sin_addr.s_addr;
690 inet_insert_ifa(ifa);
691 break;
692
693 case SIOCSIFNETMASK: /* Set the netmask for the interface */
694
695 /*
696 * The mask we set must be legal.
697 */
698 ret = -EINVAL;
699 if (bad_mask(sin->sin_addr.s_addr, 0))
700 break;
701 ret = 0;
702 if (ifa->ifa_mask != sin->sin_addr.s_addr) {
703 inet_del_ifa(in_dev, ifap, 0);
704 ifa->ifa_mask = sin->sin_addr.s_addr;
705 ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask);
706
707			/* If the current broadcast address matches
708			 * the current netmask, recalculate
709			 * the broadcast address. Otherwise it's a
710			 * funny address, so don't touch it since
711			 * the user seems to know what (s)he's doing...
712			 */
713 if ((dev->flags & IFF_BROADCAST) &&
714 (ifa->ifa_prefixlen < 31) &&
715 (ifa->ifa_broadcast ==
716 (ifa->ifa_local|~ifa->ifa_mask))) {
717 ifa->ifa_broadcast = (ifa->ifa_local |
718 ~sin->sin_addr.s_addr);
719 }
720 inet_insert_ifa(ifa);
721 }
722 break;
723 }
724done:
725 rtnl_unlock();
726out:
727 return ret;
728rarok:
729 rtnl_unlock();
730 ret = copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0;
731 goto out;
732}
733
734static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
735{
736 struct in_device *in_dev = __in_dev_get(dev);
737 struct in_ifaddr *ifa;
738 struct ifreq ifr;
739 int done = 0;
740
741 if (!in_dev || (ifa = in_dev->ifa_list) == NULL)
742 goto out;
743
744 for (; ifa; ifa = ifa->ifa_next) {
745 if (!buf) {
746 done += sizeof(ifr);
747 continue;
748 }
749 if (len < (int) sizeof(ifr))
750 break;
751 memset(&ifr, 0, sizeof(struct ifreq));
752 if (ifa->ifa_label)
753 strcpy(ifr.ifr_name, ifa->ifa_label);
754 else
755 strcpy(ifr.ifr_name, dev->name);
756
757 (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET;
758 (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr =
759 ifa->ifa_local;
760
761 if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) {
762 done = -EFAULT;
763 break;
764 }
765 buf += sizeof(struct ifreq);
766 len -= sizeof(struct ifreq);
767 done += sizeof(struct ifreq);
768 }
769out:
770 return done;
771}
772
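/* Pick a source address for talking to dst: prefer a primary address on
 * dev whose scope is acceptable and whose subnet contains dst, fall back to
 * any acceptable primary on dev, and finally scan the other devices for a
 * non-link-scope address.  E.g. with 192.168.1.1/24 on eth0,
 * inet_select_addr(eth0, 192.168.1.42, RT_SCOPE_UNIVERSE) returns
 * 192.168.1.1 (illustrative addresses).
 */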
773u32 inet_select_addr(const struct net_device *dev, u32 dst, int scope)
774{
775 u32 addr = 0;
776 struct in_device *in_dev;
777
778 rcu_read_lock();
779 in_dev = __in_dev_get(dev);
780 if (!in_dev)
781 goto no_in_dev;
782
783 for_primary_ifa(in_dev) {
784 if (ifa->ifa_scope > scope)
785 continue;
786 if (!dst || inet_ifa_match(dst, ifa)) {
787 addr = ifa->ifa_local;
788 break;
789 }
790 if (!addr)
791 addr = ifa->ifa_local;
792 } endfor_ifa(in_dev);
793no_in_dev:
794 rcu_read_unlock();
795
796 if (addr)
797 goto out;
798
799	/* Non-loopback addresses on loopback should be preferred
800	   in this case. It is important that lo is the first interface
801	   in the dev_base list.
802	 */
803 read_lock(&dev_base_lock);
804 rcu_read_lock();
805 for (dev = dev_base; dev; dev = dev->next) {
806 if ((in_dev = __in_dev_get(dev)) == NULL)
807 continue;
808
809 for_primary_ifa(in_dev) {
810 if (ifa->ifa_scope != RT_SCOPE_LINK &&
811 ifa->ifa_scope <= scope) {
812 addr = ifa->ifa_local;
813 goto out_unlock_both;
814 }
815 } endfor_ifa(in_dev);
816 }
817out_unlock_both:
818 read_unlock(&dev_base_lock);
819 rcu_read_unlock();
820out:
821 return addr;
822}
823
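/* Helper for inet_confirm_addr() below: scan one device's address list and
 * return a confirmed local address for the (dst, local, scope) wildcards,
 * or 0 if this device cannot confirm the combination.  Roughly: when
 * 'local' is non-zero it is returned if it is one of the device's own
 * addresses and the wildcards match; when it is zero an address of
 * acceptable scope is auto-selected, preferably inside the dst subnet.
 */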
824static u32 confirm_addr_indev(struct in_device *in_dev, u32 dst,
825 u32 local, int scope)
826{
827 int same = 0;
828 u32 addr = 0;
829
830 for_ifa(in_dev) {
831 if (!addr &&
832 (local == ifa->ifa_local || !local) &&
833 ifa->ifa_scope <= scope) {
834 addr = ifa->ifa_local;
835 if (same)
836 break;
837 }
838 if (!same) {
839 same = (!local || inet_ifa_match(local, ifa)) &&
840 (!dst || inet_ifa_match(dst, ifa));
841 if (same && addr) {
842 if (local || !dst)
843 break;
844				/* Is the selected addr in the dst subnet? */
845 if (inet_ifa_match(addr, ifa))
846 break;
847 /* No, then can we use new local src? */
848 if (ifa->ifa_scope <= scope) {
849 addr = ifa->ifa_local;
850 break;
851 }
852 /* search for large dst subnet for addr */
853 same = 0;
854 }
855 }
856 } endfor_ifa(in_dev);
857
858	return same ? addr : 0;
859}
860
861/*
862 * Confirm that local IP address exists using wildcards:
863 * - dev: only on this interface, 0=any interface
864 * - dst: only in the same subnet as dst, 0=any dst
865 * - local: address, 0=autoselect the local address
866 * - scope: maximum allowed scope value for the local address
867 */
868u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scope)
869{
870 u32 addr = 0;
871 struct in_device *in_dev;
872
873 if (dev) {
874 rcu_read_lock();
875 if ((in_dev = __in_dev_get(dev)))
876 addr = confirm_addr_indev(in_dev, dst, local, scope);
877 rcu_read_unlock();
878
879 return addr;
880 }
881
882 read_lock(&dev_base_lock);
883 rcu_read_lock();
884 for (dev = dev_base; dev; dev = dev->next) {
885 if ((in_dev = __in_dev_get(dev))) {
886 addr = confirm_addr_indev(in_dev, dst, local, scope);
887 if (addr)
888 break;
889 }
890 }
891 rcu_read_unlock();
892 read_unlock(&dev_base_lock);
893
894 return addr;
895}
896
897/*
898 * Device notifier
899 */
900
901int register_inetaddr_notifier(struct notifier_block *nb)
902{
903 return notifier_chain_register(&inetaddr_chain, nb);
904}
905
906int unregister_inetaddr_notifier(struct notifier_block *nb)
907{
908 return notifier_chain_unregister(&inetaddr_chain, nb);
909}
910
911/* Rename ifa_labels for a device name change. Make some effort to preserve existing
912 * alias numbering and to create unique labels if possible.
913*/
914static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
915{
916 struct in_ifaddr *ifa;
917 int named = 0;
918
919 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
920 char old[IFNAMSIZ], *dot;
921
922 memcpy(old, ifa->ifa_label, IFNAMSIZ);
923 memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
924 if (named++ == 0)
925 continue;
926 dot = strchr(ifa->ifa_label, ':');
927 if (dot == NULL) {
928 sprintf(old, ":%d", named);
929 dot = old;
930 }
931 if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) {
932 strcat(ifa->ifa_label, dot);
933 } else {
934 strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot);
935 }
936 }
937}
938
939/* Called only under RTNL semaphore */
940
941static int inetdev_event(struct notifier_block *this, unsigned long event,
942 void *ptr)
943{
944 struct net_device *dev = ptr;
945 struct in_device *in_dev = __in_dev_get(dev);
946
947 ASSERT_RTNL();
948
949 if (!in_dev) {
950 if (event == NETDEV_REGISTER && dev == &loopback_dev) {
951 in_dev = inetdev_init(dev);
952 if (!in_dev)
953 panic("devinet: Failed to create loopback\n");
954 in_dev->cnf.no_xfrm = 1;
955 in_dev->cnf.no_policy = 1;
956 }
957 goto out;
958 }
959
960 switch (event) {
961 case NETDEV_REGISTER:
962 printk(KERN_DEBUG "inetdev_event: bug\n");
963 dev->ip_ptr = NULL;
964 break;
965 case NETDEV_UP:
966 if (dev->mtu < 68)
967 break;
968 if (dev == &loopback_dev) {
969 struct in_ifaddr *ifa;
970 if ((ifa = inet_alloc_ifa()) != NULL) {
971 ifa->ifa_local =
972 ifa->ifa_address = htonl(INADDR_LOOPBACK);
973 ifa->ifa_prefixlen = 8;
974 ifa->ifa_mask = inet_make_mask(8);
975 in_dev_hold(in_dev);
976 ifa->ifa_dev = in_dev;
977 ifa->ifa_scope = RT_SCOPE_HOST;
978 memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
979 inet_insert_ifa(ifa);
980 }
981 }
982 ip_mc_up(in_dev);
983 break;
984 case NETDEV_DOWN:
985 ip_mc_down(in_dev);
986 break;
987 case NETDEV_CHANGEMTU:
988 if (dev->mtu >= 68)
989 break;
990		/* MTU fell below 68, disable IP */
991 case NETDEV_UNREGISTER:
992 inetdev_destroy(in_dev);
993 break;
994 case NETDEV_CHANGENAME:
995		/* Do not notify about label change; this event is
996		 * not interesting to applications using netlink.
997		 */
998 inetdev_changename(dev, in_dev);
999
1000#ifdef CONFIG_SYSCTL
1001 devinet_sysctl_unregister(&in_dev->cnf);
1002 neigh_sysctl_unregister(in_dev->arp_parms);
1003 neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
1004 NET_IPV4_NEIGH, "ipv4", NULL, NULL);
1005 devinet_sysctl_register(in_dev, &in_dev->cnf);
1006#endif
1007 break;
1008 }
1009out:
1010 return NOTIFY_DONE;
1011}
1012
1013static struct notifier_block ip_netdev_notifier = {
1014	.notifier_call = inetdev_event,
1015};
1016
1017static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
1018 u32 pid, u32 seq, int event)
1019{
1020 struct ifaddrmsg *ifm;
1021 struct nlmsghdr *nlh;
1022 unsigned char *b = skb->tail;
1023
1024 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
1025 if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
1026 ifm = NLMSG_DATA(nlh);
1027 ifm->ifa_family = AF_INET;
1028 ifm->ifa_prefixlen = ifa->ifa_prefixlen;
1029 ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT;
1030 ifm->ifa_scope = ifa->ifa_scope;
1031 ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
1032 if (ifa->ifa_address)
1033 RTA_PUT(skb, IFA_ADDRESS, 4, &ifa->ifa_address);
1034 if (ifa->ifa_local)
1035 RTA_PUT(skb, IFA_LOCAL, 4, &ifa->ifa_local);
1036 if (ifa->ifa_broadcast)
1037 RTA_PUT(skb, IFA_BROADCAST, 4, &ifa->ifa_broadcast);
1038 if (ifa->ifa_anycast)
1039 RTA_PUT(skb, IFA_ANYCAST, 4, &ifa->ifa_anycast);
1040 if (ifa->ifa_label[0])
1041 RTA_PUT(skb, IFA_LABEL, IFNAMSIZ, &ifa->ifa_label);
1042 nlh->nlmsg_len = skb->tail - b;
1043 return skb->len;
1044
1045nlmsg_failure:
1046rtattr_failure:
1047 skb_trim(skb, b - skb->data);
1048 return -1;
1049}
1050
1051static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
1052{
1053 int idx, ip_idx;
1054 struct net_device *dev;
1055 struct in_device *in_dev;
1056 struct in_ifaddr *ifa;
1057 int s_ip_idx, s_idx = cb->args[0];
1058
1059 s_ip_idx = ip_idx = cb->args[1];
1060 read_lock(&dev_base_lock);
1061 for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
1062 if (idx < s_idx)
1063 continue;
1064 if (idx > s_idx)
1065 s_ip_idx = 0;
1066 rcu_read_lock();
1067 if ((in_dev = __in_dev_get(dev)) == NULL) {
1068 rcu_read_unlock();
1069 continue;
1070 }
1071
1072 for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
1073 ifa = ifa->ifa_next, ip_idx++) {
1074 if (ip_idx < s_ip_idx)
1075 continue;
1076 if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
1077 cb->nlh->nlmsg_seq,
1078 RTM_NEWADDR) <= 0) {
1079 rcu_read_unlock();
1080 goto done;
1081 }
1082 }
1083 rcu_read_unlock();
1084 }
1085
1086done:
1087 read_unlock(&dev_base_lock);
1088 cb->args[0] = idx;
1089 cb->args[1] = ip_idx;
1090
1091 return skb->len;
1092}
1093
1094static void rtmsg_ifa(int event, struct in_ifaddr* ifa)
1095{
1096 int size = NLMSG_SPACE(sizeof(struct ifaddrmsg) + 128);
1097 struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);
1098
1099 if (!skb)
1100 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS);
1101 else if (inet_fill_ifaddr(skb, ifa, 0, 0, event) < 0) {
1102 kfree_skb(skb);
1103 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL);
1104 } else {
1105 NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR;
1106 netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL);
1107 }
1108}
1109
1110static struct rtnetlink_link inet_rtnetlink_table[RTM_MAX - RTM_BASE + 1] = {
1111 [4] = { .doit = inet_rtm_newaddr, },
1112 [5] = { .doit = inet_rtm_deladdr, },
1113 [6] = { .dumpit = inet_dump_ifaddr, },
1114 [8] = { .doit = inet_rtm_newroute, },
1115 [9] = { .doit = inet_rtm_delroute, },
1116 [10] = { .doit = inet_rtm_getroute, .dumpit = inet_dump_fib, },
1117#ifdef CONFIG_IP_MULTIPLE_TABLES
1118 [16] = { .doit = inet_rtm_newrule, },
1119 [17] = { .doit = inet_rtm_delrule, },
1120 [18] = { .dumpit = inet_dump_rules, },
1121#endif
1122};
1123
1124#ifdef CONFIG_SYSCTL
1125
1126void inet_forward_change(void)
1127{
1128 struct net_device *dev;
1129 int on = ipv4_devconf.forwarding;
1130
1131 ipv4_devconf.accept_redirects = !on;
1132 ipv4_devconf_dflt.forwarding = on;
1133
1134 read_lock(&dev_base_lock);
1135 for (dev = dev_base; dev; dev = dev->next) {
1136 struct in_device *in_dev;
1137 rcu_read_lock();
1138 in_dev = __in_dev_get(dev);
1139 if (in_dev)
1140 in_dev->cnf.forwarding = on;
1141 rcu_read_unlock();
1142 }
1143 read_unlock(&dev_base_lock);
1144
1145 rt_cache_flush(0);
1146}
1147
1148static int devinet_sysctl_forward(ctl_table *ctl, int write,
1149 struct file* filp, void __user *buffer,
1150 size_t *lenp, loff_t *ppos)
1151{
1152 int *valp = ctl->data;
1153 int val = *valp;
1154 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
1155
1156 if (write && *valp != val) {
1157 if (valp == &ipv4_devconf.forwarding)
1158 inet_forward_change();
1159 else if (valp != &ipv4_devconf_dflt.forwarding)
1160 rt_cache_flush(0);
1161 }
1162
1163 return ret;
1164}
1165
1166int ipv4_doint_and_flush(ctl_table *ctl, int write,
1167 struct file* filp, void __user *buffer,
1168 size_t *lenp, loff_t *ppos)
1169{
1170 int *valp = ctl->data;
1171 int val = *valp;
1172 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
1173
1174 if (write && *valp != val)
1175 rt_cache_flush(0);
1176
1177 return ret;
1178}
1179
1180int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen,
1181 void __user *oldval, size_t __user *oldlenp,
1182 void __user *newval, size_t newlen,
1183 void **context)
1184{
1185 int *valp = table->data;
1186 int new;
1187
1188 if (!newval || !newlen)
1189 return 0;
1190
1191 if (newlen != sizeof(int))
1192 return -EINVAL;
1193
1194 if (get_user(new, (int __user *)newval))
1195 return -EFAULT;
1196
1197 if (new == *valp)
1198 return 0;
1199
1200 if (oldval && oldlenp) {
1201 size_t len;
1202
1203 if (get_user(len, oldlenp))
1204 return -EFAULT;
1205
1206 if (len) {
1207 if (len > table->maxlen)
1208 len = table->maxlen;
1209 if (copy_to_user(oldval, valp, len))
1210 return -EFAULT;
1211 if (put_user(len, oldlenp))
1212 return -EFAULT;
1213 }
1214 }
1215
1216 *valp = new;
1217 rt_cache_flush(0);
1218 return 1;
1219}
1220
1221
1222static struct devinet_sysctl_table {
1223 struct ctl_table_header *sysctl_header;
1224 ctl_table devinet_vars[__NET_IPV4_CONF_MAX];
1225 ctl_table devinet_dev[2];
1226 ctl_table devinet_conf_dir[2];
1227 ctl_table devinet_proto_dir[2];
1228 ctl_table devinet_root_dir[2];
1229} devinet_sysctl = {
1230 .devinet_vars = {
1231 {
1232 .ctl_name = NET_IPV4_CONF_FORWARDING,
1233 .procname = "forwarding",
1234 .data = &ipv4_devconf.forwarding,
1235 .maxlen = sizeof(int),
1236 .mode = 0644,
1237 .proc_handler = &devinet_sysctl_forward,
1238 },
1239 {
1240 .ctl_name = NET_IPV4_CONF_MC_FORWARDING,
1241 .procname = "mc_forwarding",
1242 .data = &ipv4_devconf.mc_forwarding,
1243 .maxlen = sizeof(int),
1244 .mode = 0444,
1245 .proc_handler = &proc_dointvec,
1246 },
1247 {
1248 .ctl_name = NET_IPV4_CONF_ACCEPT_REDIRECTS,
1249 .procname = "accept_redirects",
1250 .data = &ipv4_devconf.accept_redirects,
1251 .maxlen = sizeof(int),
1252 .mode = 0644,
1253 .proc_handler = &proc_dointvec,
1254 },
1255 {
1256 .ctl_name = NET_IPV4_CONF_SECURE_REDIRECTS,
1257 .procname = "secure_redirects",
1258 .data = &ipv4_devconf.secure_redirects,
1259 .maxlen = sizeof(int),
1260 .mode = 0644,
1261 .proc_handler = &proc_dointvec,
1262 },
1263 {
1264 .ctl_name = NET_IPV4_CONF_SHARED_MEDIA,
1265 .procname = "shared_media",
1266 .data = &ipv4_devconf.shared_media,
1267 .maxlen = sizeof(int),
1268 .mode = 0644,
1269 .proc_handler = &proc_dointvec,
1270 },
1271 {
1272 .ctl_name = NET_IPV4_CONF_RP_FILTER,
1273 .procname = "rp_filter",
1274 .data = &ipv4_devconf.rp_filter,
1275 .maxlen = sizeof(int),
1276 .mode = 0644,
1277 .proc_handler = &proc_dointvec,
1278 },
1279 {
1280 .ctl_name = NET_IPV4_CONF_SEND_REDIRECTS,
1281 .procname = "send_redirects",
1282 .data = &ipv4_devconf.send_redirects,
1283 .maxlen = sizeof(int),
1284 .mode = 0644,
1285 .proc_handler = &proc_dointvec,
1286 },
1287 {
1288 .ctl_name = NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE,
1289 .procname = "accept_source_route",
1290 .data = &ipv4_devconf.accept_source_route,
1291 .maxlen = sizeof(int),
1292 .mode = 0644,
1293 .proc_handler = &proc_dointvec,
1294 },
1295 {
1296 .ctl_name = NET_IPV4_CONF_PROXY_ARP,
1297 .procname = "proxy_arp",
1298 .data = &ipv4_devconf.proxy_arp,
1299 .maxlen = sizeof(int),
1300 .mode = 0644,
1301 .proc_handler = &proc_dointvec,
1302 },
1303 {
1304 .ctl_name = NET_IPV4_CONF_MEDIUM_ID,
1305 .procname = "medium_id",
1306 .data = &ipv4_devconf.medium_id,
1307 .maxlen = sizeof(int),
1308 .mode = 0644,
1309 .proc_handler = &proc_dointvec,
1310 },
1311 {
1312 .ctl_name = NET_IPV4_CONF_BOOTP_RELAY,
1313 .procname = "bootp_relay",
1314 .data = &ipv4_devconf.bootp_relay,
1315 .maxlen = sizeof(int),
1316 .mode = 0644,
1317 .proc_handler = &proc_dointvec,
1318 },
1319 {
1320 .ctl_name = NET_IPV4_CONF_LOG_MARTIANS,
1321 .procname = "log_martians",
1322 .data = &ipv4_devconf.log_martians,
1323 .maxlen = sizeof(int),
1324 .mode = 0644,
1325 .proc_handler = &proc_dointvec,
1326 },
1327 {
1328 .ctl_name = NET_IPV4_CONF_TAG,
1329 .procname = "tag",
1330 .data = &ipv4_devconf.tag,
1331 .maxlen = sizeof(int),
1332 .mode = 0644,
1333 .proc_handler = &proc_dointvec,
1334 },
1335 {
1336 .ctl_name = NET_IPV4_CONF_ARPFILTER,
1337 .procname = "arp_filter",
1338 .data = &ipv4_devconf.arp_filter,
1339 .maxlen = sizeof(int),
1340 .mode = 0644,
1341 .proc_handler = &proc_dointvec,
1342 },
1343 {
1344 .ctl_name = NET_IPV4_CONF_ARP_ANNOUNCE,
1345 .procname = "arp_announce",
1346 .data = &ipv4_devconf.arp_announce,
1347 .maxlen = sizeof(int),
1348 .mode = 0644,
1349 .proc_handler = &proc_dointvec,
1350 },
1351 {
1352 .ctl_name = NET_IPV4_CONF_ARP_IGNORE,
1353 .procname = "arp_ignore",
1354 .data = &ipv4_devconf.arp_ignore,
1355 .maxlen = sizeof(int),
1356 .mode = 0644,
1357 .proc_handler = &proc_dointvec,
1358 },
1359 {
1360 .ctl_name = NET_IPV4_CONF_NOXFRM,
1361 .procname = "disable_xfrm",
1362 .data = &ipv4_devconf.no_xfrm,
1363 .maxlen = sizeof(int),
1364 .mode = 0644,
1365 .proc_handler = &ipv4_doint_and_flush,
1366 .strategy = &ipv4_doint_and_flush_strategy,
1367 },
1368 {
1369 .ctl_name = NET_IPV4_CONF_NOPOLICY,
1370 .procname = "disable_policy",
1371 .data = &ipv4_devconf.no_policy,
1372 .maxlen = sizeof(int),
1373 .mode = 0644,
1374 .proc_handler = &ipv4_doint_and_flush,
1375 .strategy = &ipv4_doint_and_flush_strategy,
1376 },
1377 {
1378 .ctl_name = NET_IPV4_CONF_FORCE_IGMP_VERSION,
1379 .procname = "force_igmp_version",
1380 .data = &ipv4_devconf.force_igmp_version,
1381 .maxlen = sizeof(int),
1382 .mode = 0644,
1383 .proc_handler = &ipv4_doint_and_flush,
1384 .strategy = &ipv4_doint_and_flush_strategy,
1385 },
1386 },
1387 .devinet_dev = {
1388 {
1389 .ctl_name = NET_PROTO_CONF_ALL,
1390 .procname = "all",
1391 .mode = 0555,
1392 .child = devinet_sysctl.devinet_vars,
1393 },
1394 },
1395 .devinet_conf_dir = {
1396 {
1397 .ctl_name = NET_IPV4_CONF,
1398 .procname = "conf",
1399 .mode = 0555,
1400 .child = devinet_sysctl.devinet_dev,
1401 },
1402 },
1403 .devinet_proto_dir = {
1404 {
1405 .ctl_name = NET_IPV4,
1406 .procname = "ipv4",
1407 .mode = 0555,
1408 .child = devinet_sysctl.devinet_conf_dir,
1409 },
1410 },
1411 .devinet_root_dir = {
1412 {
1413 .ctl_name = CTL_NET,
1414 .procname = "net",
1415 .mode = 0555,
1416 .child = devinet_sysctl.devinet_proto_dir,
1417 },
1418 },
1419};
1420
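/* Instantiate a per-device copy of the template above.  The resulting
 * hierarchy is net/ipv4/conf/(device name)/..., so writing to e.g.
 * /proc/sys/net/ipv4/conf/eth0/rp_filter (illustrative path) updates the
 * in_dev->cnf copy for eth0, while the NULL-device call from devinet_init()
 * creates the .../conf/default/ subtree backed by ipv4_devconf_dflt.
 */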
1421static void devinet_sysctl_register(struct in_device *in_dev,
1422 struct ipv4_devconf *p)
1423{
1424 int i;
1425 struct net_device *dev = in_dev ? in_dev->dev : NULL;
1426 struct devinet_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL);
1427 char *dev_name = NULL;
1428
1429 if (!t)
1430 return;
1431 memcpy(t, &devinet_sysctl, sizeof(*t));
1432 for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) {
1433 t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf;
1434 t->devinet_vars[i].de = NULL;
1435 }
1436
1437 if (dev) {
1438 dev_name = dev->name;
1439 t->devinet_dev[0].ctl_name = dev->ifindex;
1440 } else {
1441 dev_name = "default";
1442 t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT;
1443 }
1444
1445 /*
1446 * Make a copy of dev_name, because '.procname' is regarded as const
1447 * by sysctl and we wouldn't want anyone to change it under our feet
1448 * (see SIOCSIFNAME).
1449 */
1450 dev_name = net_sysctl_strdup(dev_name);
1451 if (!dev_name)
1452 goto free;
1453
1454 t->devinet_dev[0].procname = dev_name;
1455 t->devinet_dev[0].child = t->devinet_vars;
1456 t->devinet_dev[0].de = NULL;
1457 t->devinet_conf_dir[0].child = t->devinet_dev;
1458 t->devinet_conf_dir[0].de = NULL;
1459 t->devinet_proto_dir[0].child = t->devinet_conf_dir;
1460 t->devinet_proto_dir[0].de = NULL;
1461 t->devinet_root_dir[0].child = t->devinet_proto_dir;
1462 t->devinet_root_dir[0].de = NULL;
1463
1464 t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0);
1465 if (!t->sysctl_header)
1466 goto free_procname;
1467
1468 p->sysctl = t;
1469 return;
1470
1471 /* error path */
1472 free_procname:
1473 kfree(dev_name);
1474 free:
1475 kfree(t);
1476 return;
1477}
1478
1479static void devinet_sysctl_unregister(struct ipv4_devconf *p)
1480{
1481 if (p->sysctl) {
1482 struct devinet_sysctl_table *t = p->sysctl;
1483 p->sysctl = NULL;
1484 unregister_sysctl_table(t->sysctl_header);
1485 kfree(t->devinet_dev[0].procname);
1486 kfree(t);
1487 }
1488}
1489#endif
1490
1491void __init devinet_init(void)
1492{
1493 register_gifconf(PF_INET, inet_gifconf);
1494 register_netdevice_notifier(&ip_netdev_notifier);
1495 rtnetlink_links[PF_INET] = inet_rtnetlink_table;
1496#ifdef CONFIG_SYSCTL
1497 devinet_sysctl.sysctl_header =
1498 register_sysctl_table(devinet_sysctl.devinet_root_dir, 0);
1499 devinet_sysctl_register(NULL, &ipv4_devconf_dflt);
1500#endif
1501}
1502
1503EXPORT_SYMBOL(devinet_ioctl);
1504EXPORT_SYMBOL(in_dev_finish_destroy);
1505EXPORT_SYMBOL(inet_select_addr);
1506EXPORT_SYMBOL(inetdev_by_index);
1507EXPORT_SYMBOL(register_inetaddr_notifier);
1508EXPORT_SYMBOL(unregister_inetaddr_notifier);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
new file mode 100644
index 000000000000..053a883247ba
--- /dev/null
+++ b/net/ipv4/esp4.c
@@ -0,0 +1,510 @@
1#include <linux/config.h>
2#include <linux/module.h>
3#include <net/ip.h>
4#include <net/xfrm.h>
5#include <net/esp.h>
6#include <asm/scatterlist.h>
7#include <linux/crypto.h>
8#include <linux/pfkeyv2.h>
9#include <linux/random.h>
10#include <net/icmp.h>
11#include <net/udp.h>
12
13/* decapsulation data for use when post-processing */
14struct esp_decap_data {
15 xfrm_address_t saddr;
16 __u16 sport;
17 __u8 proto;
18};
19
20static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
21{
22 int err;
23 struct iphdr *top_iph;
24 struct ip_esp_hdr *esph;
25 struct crypto_tfm *tfm;
26 struct esp_data *esp;
27 struct sk_buff *trailer;
28 int blksize;
29 int clen;
30 int alen;
31 int nfrags;
32
33 /* Strip IP+ESP header. */
34 __skb_pull(skb, skb->h.raw - skb->data);
35 /* Now skb is pure payload to encrypt */
36
37 err = -ENOMEM;
38
39 /* Round to block size */
40 clen = skb->len;
41
42 esp = x->data;
43 alen = esp->auth.icv_trunc_len;
44 tfm = esp->conf.tfm;
45 blksize = (crypto_tfm_alg_blocksize(tfm) + 3) & ~3;
46 clen = (clen + 2 + blksize-1)&~(blksize-1);
47 if (esp->conf.padlen)
48 clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1);
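	/* Worked example of the rounding above (illustrative values, assuming
	 * a 16-byte cipher block and conf.padlen == 0): for a 100-byte
	 * payload, clen = (100 + 2 + 15) & ~15 = 112, so the trailer filled
	 * in below holds 10 self-describing pad bytes (1..10), a pad-length
	 * byte of 10 and the next-header byte, i.e. 12 bytes in total, with
	 * the ICV appended after encryption.
	 */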
49
50 if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0)
51 goto error;
52
53 /* Fill padding... */
54 do {
55 int i;
56 for (i=0; i<clen-skb->len - 2; i++)
57 *(u8*)(trailer->tail + i) = i+1;
58 } while (0);
59 *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2;
60 pskb_put(skb, trailer, clen - skb->len);
61
62 __skb_push(skb, skb->data - skb->nh.raw);
63 top_iph = skb->nh.iph;
64 esph = (struct ip_esp_hdr *)(skb->nh.raw + top_iph->ihl*4);
65 top_iph->tot_len = htons(skb->len + alen);
66 *(u8*)(trailer->tail - 1) = top_iph->protocol;
67
68 /* this is non-NULL only with UDP Encapsulation */
69 if (x->encap) {
70 struct xfrm_encap_tmpl *encap = x->encap;
71 struct udphdr *uh;
72 u32 *udpdata32;
73
74 uh = (struct udphdr *)esph;
75 uh->source = encap->encap_sport;
76 uh->dest = encap->encap_dport;
77 uh->len = htons(skb->len + alen - top_iph->ihl*4);
78 uh->check = 0;
79
80 switch (encap->encap_type) {
81 default:
82 case UDP_ENCAP_ESPINUDP:
83 esph = (struct ip_esp_hdr *)(uh + 1);
84 break;
85 case UDP_ENCAP_ESPINUDP_NON_IKE:
86 udpdata32 = (u32 *)(uh + 1);
87 udpdata32[0] = udpdata32[1] = 0;
88 esph = (struct ip_esp_hdr *)(udpdata32 + 2);
89 break;
90 }
91
92 top_iph->protocol = IPPROTO_UDP;
93 } else
94 top_iph->protocol = IPPROTO_ESP;
95
96 esph->spi = x->id.spi;
97 esph->seq_no = htonl(++x->replay.oseq);
98
99 if (esp->conf.ivlen)
100 crypto_cipher_set_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
101
102 do {
103 struct scatterlist *sg = &esp->sgbuf[0];
104
105 if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
106 sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
107 if (!sg)
108 goto error;
109 }
110 skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen);
111 crypto_cipher_encrypt(tfm, sg, sg, clen);
112 if (unlikely(sg != &esp->sgbuf[0]))
113 kfree(sg);
114 } while (0);
115
116 if (esp->conf.ivlen) {
117 memcpy(esph->enc_data, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
118 crypto_cipher_get_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
119 }
120
121 if (esp->auth.icv_full_len) {
122 esp->auth.icv(esp, skb, (u8*)esph-skb->data,
123 sizeof(struct ip_esp_hdr) + esp->conf.ivlen+clen, trailer->tail);
124 pskb_put(skb, trailer, alen);
125 }
126
127 ip_send_check(top_iph);
128
129 err = 0;
130
131error:
132 return err;
133}
134
135/*
136 * Note: detecting truncated vs. non-truncated authentication data is very
137 * expensive, so we only support truncated data, which is the recommended
138 * and common case.
139 */
140static int esp_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
141{
142 struct iphdr *iph;
143 struct ip_esp_hdr *esph;
144 struct esp_data *esp = x->data;
145 struct sk_buff *trailer;
146 int blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
147 int alen = esp->auth.icv_trunc_len;
148 int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen;
149 int nfrags;
150 int encap_len = 0;
151
152 if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr)))
153 goto out;
154
155 if (elen <= 0 || (elen & (blksize-1)))
156 goto out;
157
158 /* If integrity check is required, do this. */
159 if (esp->auth.icv_full_len) {
160 u8 sum[esp->auth.icv_full_len];
161 u8 sum1[alen];
162
163 esp->auth.icv(esp, skb, 0, skb->len-alen, sum);
164
165 if (skb_copy_bits(skb, skb->len-alen, sum1, alen))
166 BUG();
167
168 if (unlikely(memcmp(sum, sum1, alen))) {
169 x->stats.integrity_failed++;
170 goto out;
171 }
172 }
173
174 if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0)
175 goto out;
176
177 skb->ip_summed = CHECKSUM_NONE;
178
179 esph = (struct ip_esp_hdr*)skb->data;
180 iph = skb->nh.iph;
181
182	/* Get ivec. This can be wrong; check against other implementations. */
183 if (esp->conf.ivlen)
184 crypto_cipher_set_iv(esp->conf.tfm, esph->enc_data, crypto_tfm_alg_ivsize(esp->conf.tfm));
185
186 {
187 u8 nexthdr[2];
188 struct scatterlist *sg = &esp->sgbuf[0];
189 u8 workbuf[60];
190 int padlen;
191
192 if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
193 sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
194 if (!sg)
195 goto out;
196 }
197 skb_to_sgvec(skb, sg, sizeof(struct ip_esp_hdr) + esp->conf.ivlen, elen);
198 crypto_cipher_decrypt(esp->conf.tfm, sg, sg, elen);
199 if (unlikely(sg != &esp->sgbuf[0]))
200 kfree(sg);
201
202 if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
203 BUG();
204
205 padlen = nexthdr[0];
206 if (padlen+2 >= elen)
207 goto out;
208
209 /* ... check padding bits here. Silly. :-) */
210
211 if (x->encap && decap && decap->decap_type) {
212 struct esp_decap_data *encap_data;
213 struct udphdr *uh = (struct udphdr *) (iph+1);
214
215 encap_data = (struct esp_decap_data *) (decap->decap_data);
216 encap_data->proto = 0;
217
218 switch (decap->decap_type) {
219 case UDP_ENCAP_ESPINUDP:
220 case UDP_ENCAP_ESPINUDP_NON_IKE:
221 encap_data->proto = AF_INET;
222 encap_data->saddr.a4 = iph->saddr;
223 encap_data->sport = uh->source;
224 encap_len = (void*)esph - (void*)uh;
225 break;
226
227 default:
228 goto out;
229 }
230 }
231
232 iph->protocol = nexthdr[1];
233 pskb_trim(skb, skb->len - alen - padlen - 2);
234 memcpy(workbuf, skb->nh.raw, iph->ihl*4);
235 skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen);
236 skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
237 memcpy(skb->nh.raw, workbuf, iph->ihl*4);
238 skb->nh.iph->tot_len = htons(skb->len);
239 }
240
241 return 0;
242
243out:
244 return -EINVAL;
245}
246
247static int esp_post_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
248{
249
250 if (x->encap) {
251 struct xfrm_encap_tmpl *encap;
252 struct esp_decap_data *decap_data;
253
254 encap = x->encap;
255 decap_data = (struct esp_decap_data *)(decap->decap_data);
256
257 /* first, make sure that the decap type == the encap type */
258 if (encap->encap_type != decap->decap_type)
259 return -EINVAL;
260
261 switch (encap->encap_type) {
262 default:
263 case UDP_ENCAP_ESPINUDP:
264 case UDP_ENCAP_ESPINUDP_NON_IKE:
265 /*
266 * 1) if the NAT-T peer's IP or port changed then
267			 *    advertise the change to the keying daemon.
268 * This is an inbound SA, so just compare
269 * SRC ports.
270 */
271 if (decap_data->proto == AF_INET &&
272 (decap_data->saddr.a4 != x->props.saddr.a4 ||
273 decap_data->sport != encap->encap_sport)) {
274 xfrm_address_t ipaddr;
275
276 ipaddr.a4 = decap_data->saddr.a4;
277 km_new_mapping(x, &ipaddr, decap_data->sport);
278
279 /* XXX: perhaps add an extra
280 * policy check here, to see
281 * if we should allow or
282 * reject a packet from a
283 * different source
284 * address/port.
285 */
286 }
287
288 /*
289 * 2) ignore UDP/TCP checksums in case
290 * of NAT-T in Transport Mode, or
291 * perform other post-processing fixes
292			 *    as per draft-ietf-ipsec-udp-encaps-06,
293 * section 3.1.2
294 */
295 if (!x->props.mode)
296 skb->ip_summed = CHECKSUM_UNNECESSARY;
297
298 break;
299 }
300 }
301 return 0;
302}
303
304static u32 esp4_get_max_size(struct xfrm_state *x, int mtu)
305{
306 struct esp_data *esp = x->data;
307 u32 blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
308
309 if (x->props.mode) {
310 mtu = (mtu + 2 + blksize-1)&~(blksize-1);
311 } else {
312 /* The worst case. */
313 mtu += 2 + blksize;
314 }
315 if (esp->conf.padlen)
316 mtu = (mtu + esp->conf.padlen-1)&~(esp->conf.padlen-1);
317
318 return mtu + x->props.header_len + esp->auth.icv_trunc_len;
319}
320
321static void esp4_err(struct sk_buff *skb, u32 info)
322{
323 struct iphdr *iph = (struct iphdr*)skb->data;
324 struct ip_esp_hdr *esph = (struct ip_esp_hdr*)(skb->data+(iph->ihl<<2));
325 struct xfrm_state *x;
326
327 if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
328 skb->h.icmph->code != ICMP_FRAG_NEEDED)
329 return;
330
331 x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
332 if (!x)
333 return;
334 NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
335 ntohl(esph->spi), ntohl(iph->daddr)));
336 xfrm_state_put(x);
337}
338
339static void esp_destroy(struct xfrm_state *x)
340{
341 struct esp_data *esp = x->data;
342
343 if (!esp)
344 return;
345
346 if (esp->conf.tfm) {
347 crypto_free_tfm(esp->conf.tfm);
348 esp->conf.tfm = NULL;
349 }
350 if (esp->conf.ivec) {
351 kfree(esp->conf.ivec);
352 esp->conf.ivec = NULL;
353 }
354 if (esp->auth.tfm) {
355 crypto_free_tfm(esp->auth.tfm);
356 esp->auth.tfm = NULL;
357 }
358 if (esp->auth.work_icv) {
359 kfree(esp->auth.work_icv);
360 esp->auth.work_icv = NULL;
361 }
362 kfree(esp);
363}
364
365static int esp_init_state(struct xfrm_state *x, void *args)
366{
367 struct esp_data *esp = NULL;
368
369 /* null auth and encryption can have zero length keys */
370 if (x->aalg) {
371 if (x->aalg->alg_key_len > 512)
372 goto error;
373 }
374 if (x->ealg == NULL)
375 goto error;
376
377 esp = kmalloc(sizeof(*esp), GFP_KERNEL);
378 if (esp == NULL)
379 return -ENOMEM;
380
381 memset(esp, 0, sizeof(*esp));
382
383 if (x->aalg) {
384 struct xfrm_algo_desc *aalg_desc;
385
386 esp->auth.key = x->aalg->alg_key;
387 esp->auth.key_len = (x->aalg->alg_key_len+7)/8;
388 esp->auth.tfm = crypto_alloc_tfm(x->aalg->alg_name, 0);
389 if (esp->auth.tfm == NULL)
390 goto error;
391 esp->auth.icv = esp_hmac_digest;
392
393 aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
394 BUG_ON(!aalg_desc);
395
396 if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
397 crypto_tfm_alg_digestsize(esp->auth.tfm)) {
398 NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n",
399 x->aalg->alg_name,
400 crypto_tfm_alg_digestsize(esp->auth.tfm),
401 aalg_desc->uinfo.auth.icv_fullbits/8));
402 goto error;
403 }
404
405 esp->auth.icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
406 esp->auth.icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
407
408 esp->auth.work_icv = kmalloc(esp->auth.icv_full_len, GFP_KERNEL);
409 if (!esp->auth.work_icv)
410 goto error;
411 }
412 esp->conf.key = x->ealg->alg_key;
413 esp->conf.key_len = (x->ealg->alg_key_len+7)/8;
414 if (x->props.ealgo == SADB_EALG_NULL)
415 esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_ECB);
416 else
417 esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_CBC);
418 if (esp->conf.tfm == NULL)
419 goto error;
420 esp->conf.ivlen = crypto_tfm_alg_ivsize(esp->conf.tfm);
421 esp->conf.padlen = 0;
422 if (esp->conf.ivlen) {
423 esp->conf.ivec = kmalloc(esp->conf.ivlen, GFP_KERNEL);
424 if (unlikely(esp->conf.ivec == NULL))
425 goto error;
426 get_random_bytes(esp->conf.ivec, esp->conf.ivlen);
427 }
428 if (crypto_cipher_setkey(esp->conf.tfm, esp->conf.key, esp->conf.key_len))
429 goto error;
430 x->props.header_len = sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
431 if (x->props.mode)
432 x->props.header_len += sizeof(struct iphdr);
433 if (x->encap) {
434 struct xfrm_encap_tmpl *encap = x->encap;
435
436 switch (encap->encap_type) {
437 default:
438 goto error;
439 case UDP_ENCAP_ESPINUDP:
440 x->props.header_len += sizeof(struct udphdr);
441 break;
442 case UDP_ENCAP_ESPINUDP_NON_IKE:
443 x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
444 break;
445 }
446 }
447 x->data = esp;
448 x->props.trailer_len = esp4_get_max_size(x, 0) - x->props.header_len;
449 return 0;
450
451error:
452 x->data = esp;
453 esp_destroy(x);
454 x->data = NULL;
455 return -EINVAL;
456}
457
458static struct xfrm_type esp_type =
459{
460 .description = "ESP4",
461 .owner = THIS_MODULE,
462 .proto = IPPROTO_ESP,
463 .init_state = esp_init_state,
464 .destructor = esp_destroy,
465 .get_max_size = esp4_get_max_size,
466 .input = esp_input,
467 .post_input = esp_post_input,
468 .output = esp_output
469};
470
471static struct net_protocol esp4_protocol = {
472 .handler = xfrm4_rcv,
473 .err_handler = esp4_err,
474 .no_policy = 1,
475};
476
477static int __init esp4_init(void)
478{
479 struct xfrm_decap_state decap;
480
481 if (sizeof(struct esp_decap_data) <
482 sizeof(decap.decap_data)) {
483 extern void decap_data_too_small(void);
484
485 decap_data_too_small();
486 }
487
488 if (xfrm_register_type(&esp_type, AF_INET) < 0) {
489 printk(KERN_INFO "ip esp init: can't add xfrm type\n");
490 return -EAGAIN;
491 }
492 if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) {
493 printk(KERN_INFO "ip esp init: can't add protocol\n");
494 xfrm_unregister_type(&esp_type, AF_INET);
495 return -EAGAIN;
496 }
497 return 0;
498}
499
500static void __exit esp4_fini(void)
501{
502 if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0)
503 printk(KERN_INFO "ip esp close: can't remove protocol\n");
504 if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
505 printk(KERN_INFO "ip esp close: can't remove xfrm type\n");
506}
507
508module_init(esp4_init);
509module_exit(esp4_fini);
510MODULE_LICENSE("GPL");
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
new file mode 100644
index 000000000000..563e7d612706
--- /dev/null
+++ b/net/ipv4/fib_frontend.c
@@ -0,0 +1,611 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * IPv4 Forwarding Information Base: FIB frontend.
7 *
8 * Version: $Id: fib_frontend.c,v 1.26 2001/10/31 21:55:54 davem Exp $
9 *
10 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 */
17
18#include <linux/config.h>
19#include <linux/module.h>
20#include <asm/uaccess.h>
21#include <asm/system.h>
22#include <linux/bitops.h>
23#include <linux/types.h>
24#include <linux/kernel.h>
25#include <linux/sched.h>
26#include <linux/mm.h>
27#include <linux/string.h>
28#include <linux/socket.h>
29#include <linux/sockios.h>
30#include <linux/errno.h>
31#include <linux/in.h>
32#include <linux/inet.h>
33#include <linux/netdevice.h>
34#include <linux/if_arp.h>
35#include <linux/skbuff.h>
36#include <linux/netlink.h>
37#include <linux/init.h>
38
39#include <net/ip.h>
40#include <net/protocol.h>
41#include <net/route.h>
42#include <net/tcp.h>
43#include <net/sock.h>
44#include <net/icmp.h>
45#include <net/arp.h>
46#include <net/ip_fib.h>
47
48#define FFprint(a...) printk(KERN_DEBUG a)
49
50#ifndef CONFIG_IP_MULTIPLE_TABLES
51
52#define RT_TABLE_MIN RT_TABLE_MAIN
53
54struct fib_table *ip_fib_local_table;
55struct fib_table *ip_fib_main_table;
56
57#else
58
59#define RT_TABLE_MIN 1
60
61struct fib_table *fib_tables[RT_TABLE_MAX+1];
62
63struct fib_table *__fib_new_table(int id)
64{
65 struct fib_table *tb;
66
67 tb = fib_hash_init(id);
68 if (!tb)
69 return NULL;
70 fib_tables[id] = tb;
71 return tb;
72}
73
74
75#endif /* CONFIG_IP_MULTIPLE_TABLES */
76
77
78static void fib_flush(void)
79{
80 int flushed = 0;
81#ifdef CONFIG_IP_MULTIPLE_TABLES
82 struct fib_table *tb;
83 int id;
84
85 for (id = RT_TABLE_MAX; id>0; id--) {
86 if ((tb = fib_get_table(id))==NULL)
87 continue;
88 flushed += tb->tb_flush(tb);
89 }
90#else /* CONFIG_IP_MULTIPLE_TABLES */
91 flushed += ip_fib_main_table->tb_flush(ip_fib_main_table);
92 flushed += ip_fib_local_table->tb_flush(ip_fib_local_table);
93#endif /* CONFIG_IP_MULTIPLE_TABLES */
94
95 if (flushed)
96 rt_cache_flush(-1);
97}
98
99/*
100 * Find the first device with a given source address.
101 */
102
103struct net_device * ip_dev_find(u32 addr)
104{
105 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
106 struct fib_result res;
107 struct net_device *dev = NULL;
108
109#ifdef CONFIG_IP_MULTIPLE_TABLES
110 res.r = NULL;
111#endif
112
113 if (!ip_fib_local_table ||
114 ip_fib_local_table->tb_lookup(ip_fib_local_table, &fl, &res))
115 return NULL;
116 if (res.type != RTN_LOCAL)
117 goto out;
118 dev = FIB_RES_DEV(res);
119
120 if (dev)
121 dev_hold(dev);
122out:
123 fib_res_put(&res);
124 return dev;
125}
126
127unsigned inet_addr_type(u32 addr)
128{
129 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
130 struct fib_result res;
131 unsigned ret = RTN_BROADCAST;
132
133 if (ZERONET(addr) || BADCLASS(addr))
134 return RTN_BROADCAST;
135 if (MULTICAST(addr))
136 return RTN_MULTICAST;
137
138#ifdef CONFIG_IP_MULTIPLE_TABLES
139 res.r = NULL;
140#endif
141
142 if (ip_fib_local_table) {
143 ret = RTN_UNICAST;
144 if (!ip_fib_local_table->tb_lookup(ip_fib_local_table,
145 &fl, &res)) {
146 ret = res.type;
147 fib_res_put(&res);
148 }
149 }
150 return ret;
151}
152
153/* Given (packet source, input interface) and optional (dst, oif, tos):
154   - (main) check that the source is valid, i.e. not broadcast or one of
155     our local addresses.
156   - figure out what "logical" interface this packet arrived on
157     and calculate the "specific destination" address.
158   - check that the packet arrived from the expected physical interface.
159 */
160
161int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
162 struct net_device *dev, u32 *spec_dst, u32 *itag)
163{
164 struct in_device *in_dev;
165 struct flowi fl = { .nl_u = { .ip4_u =
166 { .daddr = src,
167 .saddr = dst,
168 .tos = tos } },
169 .iif = oif };
170 struct fib_result res;
171 int no_addr, rpf;
172 int ret;
173
174 no_addr = rpf = 0;
175 rcu_read_lock();
176 in_dev = __in_dev_get(dev);
177 if (in_dev) {
178 no_addr = in_dev->ifa_list == NULL;
179 rpf = IN_DEV_RPFILTER(in_dev);
180 }
181 rcu_read_unlock();
182
183 if (in_dev == NULL)
184 goto e_inval;
185
186 if (fib_lookup(&fl, &res))
187 goto last_resort;
188 if (res.type != RTN_UNICAST)
189 goto e_inval_res;
190 *spec_dst = FIB_RES_PREFSRC(res);
191 fib_combine_itag(itag, &res);
192#ifdef CONFIG_IP_ROUTE_MULTIPATH
193 if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
194#else
195 if (FIB_RES_DEV(res) == dev)
196#endif
197 {
198 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
199 fib_res_put(&res);
200 return ret;
201 }
202 fib_res_put(&res);
203 if (no_addr)
204 goto last_resort;
205 if (rpf)
206 goto e_inval;
207 fl.oif = dev->ifindex;
208
209 ret = 0;
210 if (fib_lookup(&fl, &res) == 0) {
211 if (res.type == RTN_UNICAST) {
212 *spec_dst = FIB_RES_PREFSRC(res);
213 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
214 }
215 fib_res_put(&res);
216 }
217 return ret;
218
219last_resort:
220 if (rpf)
221 goto e_inval;
222 *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
223 *itag = 0;
224 return 0;
225
226e_inval_res:
227 fib_res_put(&res);
228e_inval:
229 return -EINVAL;
230}
231
232#ifndef CONFIG_IP_NOSIOCRT
233
234/*
235 * Handle IP routing ioctl calls. These are used to manipulate the routing tables
236 */
237
238int ip_rt_ioctl(unsigned int cmd, void __user *arg)
239{
240 int err;
241 struct kern_rta rta;
242 struct rtentry r;
243 struct {
244 struct nlmsghdr nlh;
245 struct rtmsg rtm;
246 } req;
247
248 switch (cmd) {
249 case SIOCADDRT: /* Add a route */
250 case SIOCDELRT: /* Delete a route */
251 if (!capable(CAP_NET_ADMIN))
252 return -EPERM;
253 if (copy_from_user(&r, arg, sizeof(struct rtentry)))
254 return -EFAULT;
255 rtnl_lock();
256 err = fib_convert_rtentry(cmd, &req.nlh, &req.rtm, &rta, &r);
257 if (err == 0) {
258 if (cmd == SIOCDELRT) {
259 struct fib_table *tb = fib_get_table(req.rtm.rtm_table);
260 err = -ESRCH;
261 if (tb)
262 err = tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL);
263 } else {
264 struct fib_table *tb = fib_new_table(req.rtm.rtm_table);
265 err = -ENOBUFS;
266 if (tb)
267 err = tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL);
268 }
269 if (rta.rta_mx)
270 kfree(rta.rta_mx);
271 }
272 rtnl_unlock();
273 return err;
274 }
275 return -EINVAL;
276}
277
278#else
279
280int ip_rt_ioctl(unsigned int cmd, void *arg)
281{
282 return -EINVAL;
283}
284
285#endif
286
287static int inet_check_attr(struct rtmsg *r, struct rtattr **rta)
288{
289 int i;
290
291 for (i=1; i<=RTA_MAX; i++) {
292 struct rtattr *attr = rta[i-1];
293 if (attr) {
294 if (RTA_PAYLOAD(attr) < 4)
295 return -EINVAL;
296 if (i != RTA_MULTIPATH && i != RTA_METRICS)
297 rta[i-1] = (struct rtattr*)RTA_DATA(attr);
298 }
299 }
300 return 0;
301}
302
303int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
304{
305 struct fib_table * tb;
306 struct rtattr **rta = arg;
307 struct rtmsg *r = NLMSG_DATA(nlh);
308
309 if (inet_check_attr(r, rta))
310 return -EINVAL;
311
312 tb = fib_get_table(r->rtm_table);
313 if (tb)
314 return tb->tb_delete(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb));
315 return -ESRCH;
316}
317
318int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
319{
320 struct fib_table * tb;
321 struct rtattr **rta = arg;
322 struct rtmsg *r = NLMSG_DATA(nlh);
323
324 if (inet_check_attr(r, rta))
325 return -EINVAL;
326
327 tb = fib_new_table(r->rtm_table);
328 if (tb)
329 return tb->tb_insert(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb));
330 return -ENOBUFS;
331}
332
333int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
334{
335 int t;
336 int s_t;
337 struct fib_table *tb;
338
339 if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) &&
340 ((struct rtmsg*)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED)
341 return ip_rt_dump(skb, cb);
342
343 s_t = cb->args[0];
344 if (s_t == 0)
345 s_t = cb->args[0] = RT_TABLE_MIN;
346
347 for (t=s_t; t<=RT_TABLE_MAX; t++) {
348 if (t < s_t) continue;
349 if (t > s_t)
350 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
351 if ((tb = fib_get_table(t))==NULL)
352 continue;
353 if (tb->tb_dump(tb, skb, cb) < 0)
354 break;
355 }
356
357 cb->args[0] = t;
358
359 return skb->len;
360}
361
362/* Prepare and feed an intra-kernel routing request.
363   Really, it should be a netlink message, but :-( netlink
364   may not be configured, so we feed it directly
365   to the fib engine. It is legal, because all events occur
366   only when netlink is already locked.
367 */
368
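/* For example, fib_add_ifaddr() below reacts to 192.168.1.1/24 being
 * configured on an up, non-loopback interface by calling
 * fib_magic(RTM_NEWROUTE, RTN_LOCAL, 192.168.1.1, 32, ...) for the local
 * table and fib_magic(RTM_NEWROUTE, RTN_UNICAST, 192.168.1.0, 24, ...) for
 * the connected subnet, plus RTN_BROADCAST /32 entries for the subnet's
 * broadcast addresses.
 */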
369static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr *ifa)
370{
371 struct fib_table * tb;
372 struct {
373 struct nlmsghdr nlh;
374 struct rtmsg rtm;
375 } req;
376 struct kern_rta rta;
377
378 memset(&req.rtm, 0, sizeof(req.rtm));
379 memset(&rta, 0, sizeof(rta));
380
381 if (type == RTN_UNICAST)
382 tb = fib_new_table(RT_TABLE_MAIN);
383 else
384 tb = fib_new_table(RT_TABLE_LOCAL);
385
386 if (tb == NULL)
387 return;
388
389 req.nlh.nlmsg_len = sizeof(req);
390 req.nlh.nlmsg_type = cmd;
391 req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND;
392 req.nlh.nlmsg_pid = 0;
393 req.nlh.nlmsg_seq = 0;
394
395 req.rtm.rtm_dst_len = dst_len;
396 req.rtm.rtm_table = tb->tb_id;
397 req.rtm.rtm_protocol = RTPROT_KERNEL;
398 req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST);
399 req.rtm.rtm_type = type;
400
401 rta.rta_dst = &dst;
402 rta.rta_prefsrc = &ifa->ifa_local;
403 rta.rta_oif = &ifa->ifa_dev->dev->ifindex;
404
405 if (cmd == RTM_NEWROUTE)
406 tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL);
407 else
408 tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL);
409}
410
411static void fib_add_ifaddr(struct in_ifaddr *ifa)
412{
413 struct in_device *in_dev = ifa->ifa_dev;
414 struct net_device *dev = in_dev->dev;
415 struct in_ifaddr *prim = ifa;
416 u32 mask = ifa->ifa_mask;
417 u32 addr = ifa->ifa_local;
418 u32 prefix = ifa->ifa_address&mask;
419
420 if (ifa->ifa_flags&IFA_F_SECONDARY) {
421 prim = inet_ifa_byprefix(in_dev, prefix, mask);
422 if (prim == NULL) {
423 printk(KERN_DEBUG "fib_add_ifaddr: bug: prim == NULL\n");
424 return;
425 }
426 }
427
428 fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
429
430 if (!(dev->flags&IFF_UP))
431 return;
432
433 /* Add broadcast address, if it is explicitly assigned. */
434 if (ifa->ifa_broadcast && ifa->ifa_broadcast != 0xFFFFFFFF)
435 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
436
437 if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
438 (prefix != addr || ifa->ifa_prefixlen < 32)) {
439 fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
440 RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
441
442 /* Add network-specific broadcasts, when it makes sense */
443 if (ifa->ifa_prefixlen < 31) {
444 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
445 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim);
446 }
447 }
448}
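/* A worked example of the above (addresses from the 192.0.2.0/24
 * documentation range, chosen only for illustration): bringing up a primary
 * address 192.0.2.1/24 with broadcast 192.0.2.255 on a non-loopback device
 * that is IFF_UP produces the following fib_magic() calls:
 *
 *   RTN_LOCAL      192.0.2.1/32    -> local table
 *   RTN_BROADCAST  192.0.2.255/32  -> local table   (explicit broadcast)
 *   RTN_UNICAST    192.0.2.0/24    -> main table    (connected route)
 *   RTN_BROADCAST  192.0.2.0/32    -> local table
 *   RTN_BROADCAST  192.0.2.255/32  -> local table   (prefix | ~mask)
 *
 * fib_magic() picks the table itself: RTN_UNICAST goes to RT_TABLE_MAIN,
 * everything else to RT_TABLE_LOCAL.
 */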
449
450static void fib_del_ifaddr(struct in_ifaddr *ifa)
451{
452 struct in_device *in_dev = ifa->ifa_dev;
453 struct net_device *dev = in_dev->dev;
454 struct in_ifaddr *ifa1;
455 struct in_ifaddr *prim = ifa;
456 u32 brd = ifa->ifa_address|~ifa->ifa_mask;
457 u32 any = ifa->ifa_address&ifa->ifa_mask;
458#define LOCAL_OK 1
459#define BRD_OK 2
460#define BRD0_OK 4
461#define BRD1_OK 8
462 unsigned ok = 0;
463
464 if (!(ifa->ifa_flags&IFA_F_SECONDARY))
465 fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
466 RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
467 else {
468 prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
469 if (prim == NULL) {
470 printk(KERN_DEBUG "fib_del_ifaddr: bug: prim == NULL\n");
471 return;
472 }
473 }
474
475 /* Deletion is more complicated than addition:
476 we must take care not to delete too much. :-)
477
478 Scan the address list to be sure the addresses are really gone.
479 */
480
481 for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
482 if (ifa->ifa_local == ifa1->ifa_local)
483 ok |= LOCAL_OK;
484 if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
485 ok |= BRD_OK;
486 if (brd == ifa1->ifa_broadcast)
487 ok |= BRD1_OK;
488 if (any == ifa1->ifa_broadcast)
489 ok |= BRD0_OK;
490 }
491
492 if (!(ok&BRD_OK))
493 fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
494 if (!(ok&BRD1_OK))
495 fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
496 if (!(ok&BRD0_OK))
497 fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
498 if (!(ok&LOCAL_OK)) {
499 fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
500
501 /* Check that this local address has finally disappeared. */
502 if (inet_addr_type(ifa->ifa_local) != RTN_LOCAL) {
503 /* And last, but not least:
504 we must flush stray FIB entries.
505
506 First we scan the fib_info list for stray nexthop
507 entries, then trigger fib_flush.
508 */
509 if (fib_sync_down(ifa->ifa_local, NULL, 0))
510 fib_flush();
511 }
512 }
513#undef LOCAL_OK
514#undef BRD_OK
515#undef BRD0_OK
516#undef BRD1_OK
517}
518
519static void fib_disable_ip(struct net_device *dev, int force)
520{
521 if (fib_sync_down(0, dev, force))
522 fib_flush();
523 rt_cache_flush(0);
524 arp_ifdown(dev);
525}
526
527static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
528{
529 struct in_ifaddr *ifa = (struct in_ifaddr*)ptr;
530
531 switch (event) {
532 case NETDEV_UP:
533 fib_add_ifaddr(ifa);
534#ifdef CONFIG_IP_ROUTE_MULTIPATH
535 fib_sync_up(ifa->ifa_dev->dev);
536#endif
537 rt_cache_flush(-1);
538 break;
539 case NETDEV_DOWN:
540 fib_del_ifaddr(ifa);
541 if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) {
542 /* Last address was deleted from this interface.
543 Disable IP.
544 */
545 fib_disable_ip(ifa->ifa_dev->dev, 1);
546 } else {
547 rt_cache_flush(-1);
548 }
549 break;
550 }
551 return NOTIFY_DONE;
552}
553
554static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
555{
556 struct net_device *dev = ptr;
557 struct in_device *in_dev = __in_dev_get(dev);
558
559 if (event == NETDEV_UNREGISTER) {
560 fib_disable_ip(dev, 2);
561 return NOTIFY_DONE;
562 }
563
564 if (!in_dev)
565 return NOTIFY_DONE;
566
567 switch (event) {
568 case NETDEV_UP:
569 for_ifa(in_dev) {
570 fib_add_ifaddr(ifa);
571 } endfor_ifa(in_dev);
572#ifdef CONFIG_IP_ROUTE_MULTIPATH
573 fib_sync_up(dev);
574#endif
575 rt_cache_flush(-1);
576 break;
577 case NETDEV_DOWN:
578 fib_disable_ip(dev, 0);
579 break;
580 case NETDEV_CHANGEMTU:
581 case NETDEV_CHANGE:
582 rt_cache_flush(0);
583 break;
584 }
585 return NOTIFY_DONE;
586}
587
588static struct notifier_block fib_inetaddr_notifier = {
589 .notifier_call = fib_inetaddr_event,
590};
591
592static struct notifier_block fib_netdev_notifier = {
593 .notifier_call = fib_netdev_event,
594};
595
596void __init ip_fib_init(void)
597{
598#ifndef CONFIG_IP_MULTIPLE_TABLES
599 ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL);
600 ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN);
601#else
602 fib_rules_init();
603#endif
604
605 register_netdevice_notifier(&fib_netdev_notifier);
606 register_inetaddr_notifier(&fib_inetaddr_notifier);
607}
608
609EXPORT_SYMBOL(inet_addr_type);
610EXPORT_SYMBOL(ip_dev_find);
611EXPORT_SYMBOL(ip_rt_ioctl);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
new file mode 100644
index 000000000000..6506dcc01b46
--- /dev/null
+++ b/net/ipv4/fib_hash.c
@@ -0,0 +1,1086 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * IPv4 FIB: lookup engine and maintenance routines.
7 *
8 * Version: $Id: fib_hash.c,v 1.13 2001/10/31 21:55:54 davem Exp $
9 *
10 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 */
17
18#include <linux/config.h>
19#include <asm/uaccess.h>
20#include <asm/system.h>
21#include <linux/bitops.h>
22#include <linux/types.h>
23#include <linux/kernel.h>
24#include <linux/sched.h>
25#include <linux/mm.h>
26#include <linux/string.h>
27#include <linux/socket.h>
28#include <linux/sockios.h>
29#include <linux/errno.h>
30#include <linux/in.h>
31#include <linux/inet.h>
32#include <linux/netdevice.h>
33#include <linux/if_arp.h>
34#include <linux/proc_fs.h>
35#include <linux/skbuff.h>
36#include <linux/netlink.h>
37#include <linux/init.h>
38
39#include <net/ip.h>
40#include <net/protocol.h>
41#include <net/route.h>
42#include <net/tcp.h>
43#include <net/sock.h>
44#include <net/ip_fib.h>
45
46#include "fib_lookup.h"
47
48static kmem_cache_t *fn_hash_kmem;
49static kmem_cache_t *fn_alias_kmem;
50
51struct fib_node {
52 struct hlist_node fn_hash;
53 struct list_head fn_alias;
54 u32 fn_key;
55};
56
57struct fn_zone {
58 struct fn_zone *fz_next; /* Next not empty zone */
59 struct hlist_head *fz_hash; /* Hash table pointer */
60 int fz_nent; /* Number of entries */
61
62 int fz_divisor; /* Hash divisor */
63 u32 fz_hashmask; /* (fz_divisor - 1) */
64#define FZ_HASHMASK(fz) ((fz)->fz_hashmask)
65
66 int fz_order; /* Zone order */
67 u32 fz_mask;
68#define FZ_MASK(fz) ((fz)->fz_mask)
69};
70
71/* NOTE: on fast computers, evaluating fz_hashmask and fz_mask can be
72 * cheaper than a memory lookup, which is why the FZ_* macros are used.
73 */
74
75struct fn_hash {
76 struct fn_zone *fn_zones[33];
77 struct fn_zone *fn_zone_list;
78};
79
80static inline u32 fn_hash(u32 key, struct fn_zone *fz)
81{
82 u32 h = ntohl(key)>>(32 - fz->fz_order);
83 h ^= (h>>20);
84 h ^= (h>>10);
85 h ^= (h>>5);
86 h &= FZ_HASHMASK(fz);
87 return h;
88}
89
90static inline u32 fz_key(u32 dst, struct fn_zone *fz)
91{
92 return dst & FZ_MASK(fz);
93}
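/* A rough userspace sketch of the bucketing above, illustrative only: it
 * re-implements fn_hash()/fz_key() outside the kernel so the arithmetic can
 * be tried by hand. The /24 zone order, the 16-slot divisor and the example
 * address are assumptions picked for the demo (a fresh non-zero zone starts
 * with 16 buckets, see fn_new_zone() below). */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

static uint32_t zone_hash(uint32_t key, int order, uint32_t hashmask)
{
	uint32_t h = ntohl(key) >> (32 - order);	/* top 'order' bits of the key */
	h ^= (h >> 20);
	h ^= (h >> 10);
	h ^= (h >> 5);
	return h & hashmask;
}

int main(void)
{
	uint32_t mask = htonl(~0u << (32 - 24));	/* /24 zone mask, network order */
	uint32_t dst  = inet_addr("192.0.2.7");		/* destination, network order */
	uint32_t key  = dst & mask;			/* what fz_key() would return */

	printf("key 0x%08x -> bucket %u of 16\n",
	       (unsigned)ntohl(key), (unsigned)zone_hash(key, 24, 16 - 1));
	return 0;
}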
94
95static DEFINE_RWLOCK(fib_hash_lock);
96static unsigned int fib_hash_genid;
97
98#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
99
100static struct hlist_head *fz_hash_alloc(int divisor)
101{
102 unsigned long size = divisor * sizeof(struct hlist_head);
103
104 if (size <= PAGE_SIZE) {
105 return kmalloc(size, GFP_KERNEL);
106 } else {
107 return (struct hlist_head *)
108 __get_free_pages(GFP_KERNEL, get_order(size));
109 }
110}
111
112/* The fib hash lock must be held when this is called. */
113static inline void fn_rebuild_zone(struct fn_zone *fz,
114 struct hlist_head *old_ht,
115 int old_divisor)
116{
117 int i;
118
119 for (i = 0; i < old_divisor; i++) {
120 struct hlist_node *node, *n;
121 struct fib_node *f;
122
123 hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
124 struct hlist_head *new_head;
125
126 hlist_del(&f->fn_hash);
127
128 new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
129 hlist_add_head(&f->fn_hash, new_head);
130 }
131 }
132}
133
134static void fz_hash_free(struct hlist_head *hash, int divisor)
135{
136 unsigned long size = divisor * sizeof(struct hlist_head);
137
138 if (size <= PAGE_SIZE)
139 kfree(hash);
140 else
141 free_pages((unsigned long)hash, get_order(size));
142}
143
144static void fn_rehash_zone(struct fn_zone *fz)
145{
146 struct hlist_head *ht, *old_ht;
147 int old_divisor, new_divisor;
148 u32 new_hashmask;
149
150 old_divisor = fz->fz_divisor;
151
152 switch (old_divisor) {
153 case 16:
154 new_divisor = 256;
155 break;
156 case 256:
157 new_divisor = 1024;
158 break;
159 default:
160 if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
161 printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
162 return;
163 }
164 new_divisor = (old_divisor << 1);
165 break;
166 }
167
168 new_hashmask = (new_divisor - 1);
169
170#if RT_CACHE_DEBUG >= 2
171 printk("fn_rehash_zone: hash for zone %d grows from %d\n", fz->fz_order, old_divisor);
172#endif
173
174 ht = fz_hash_alloc(new_divisor);
175
176 if (ht) {
177 memset(ht, 0, new_divisor * sizeof(struct hlist_head));
178
179 write_lock_bh(&fib_hash_lock);
180 old_ht = fz->fz_hash;
181 fz->fz_hash = ht;
182 fz->fz_hashmask = new_hashmask;
183 fz->fz_divisor = new_divisor;
184 fn_rebuild_zone(fz, old_ht, old_divisor);
185 fib_hash_genid++;
186 write_unlock_bh(&fib_hash_lock);
187
188 fz_hash_free(old_ht, old_divisor);
189 }
190}
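/* Rehashing policy, as implemented above and triggered from fn_hash_insert()
 * below: once a zone holds more than twice as many nodes as it has buckets,
 * the table grows 16 -> 256 -> 1024 and then keeps doubling, up to
 * FZ_MAX_DIVISOR. The swap and rebuild run under fib_hash_lock, so readers
 * never see a half-built table.
 */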
191
192static inline void fn_free_node(struct fib_node * f)
193{
194 kmem_cache_free(fn_hash_kmem, f);
195}
196
197static inline void fn_free_alias(struct fib_alias *fa)
198{
199 fib_release_info(fa->fa_info);
200 kmem_cache_free(fn_alias_kmem, fa);
201}
202
203static struct fn_zone *
204fn_new_zone(struct fn_hash *table, int z)
205{
206 int i;
207 struct fn_zone *fz = kmalloc(sizeof(struct fn_zone), GFP_KERNEL);
208 if (!fz)
209 return NULL;
210
211 memset(fz, 0, sizeof(struct fn_zone));
212 if (z) {
213 fz->fz_divisor = 16;
214 } else {
215 fz->fz_divisor = 1;
216 }
217 fz->fz_hashmask = (fz->fz_divisor - 1);
218 fz->fz_hash = fz_hash_alloc(fz->fz_divisor);
219 if (!fz->fz_hash) {
220 kfree(fz);
221 return NULL;
222 }
223 memset(fz->fz_hash, 0, fz->fz_divisor * sizeof(struct hlist_head *));
224 fz->fz_order = z;
225 fz->fz_mask = inet_make_mask(z);
226
227 /* Find the first non-empty zone with a more specific mask */
228 for (i=z+1; i<=32; i++)
229 if (table->fn_zones[i])
230 break;
231 write_lock_bh(&fib_hash_lock);
232 if (i>32) {
233 /* No more specific masks, we are the first. */
234 fz->fz_next = table->fn_zone_list;
235 table->fn_zone_list = fz;
236 } else {
237 fz->fz_next = table->fn_zones[i]->fz_next;
238 table->fn_zones[i]->fz_next = fz;
239 }
240 table->fn_zones[z] = fz;
241 fib_hash_genid++;
242 write_unlock_bh(&fib_hash_lock);
243 return fz;
244}
245
246static int
247fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
248{
249 int err;
250 struct fn_zone *fz;
251 struct fn_hash *t = (struct fn_hash*)tb->tb_data;
252
253 read_lock(&fib_hash_lock);
254 for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
255 struct hlist_head *head;
256 struct hlist_node *node;
257 struct fib_node *f;
258 u32 k = fz_key(flp->fl4_dst, fz);
259
260 head = &fz->fz_hash[fn_hash(k, fz)];
261 hlist_for_each_entry(f, node, head, fn_hash) {
262 if (f->fn_key != k)
263 continue;
264
265 err = fib_semantic_match(&f->fn_alias,
266 flp, res,
267 f->fn_key, fz->fz_mask,
268 fz->fz_order);
269 if (err <= 0)
270 goto out;
271 }
272 }
273 err = 1;
274out:
275 read_unlock(&fib_hash_lock);
276 return err;
277}
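/* The zone list is kept ordered from the most specific mask to the least
 * specific one (see the ordered insert in fn_new_zone() above), so the scan
 * above implements longest-prefix match. As an illustration only: with a
 * 192.0.2.0/24 route and a 0.0.0.0/0 default installed, a lookup for
 * 192.0.2.7 is answered from the /24 zone (assuming its alias matches
 * semantically) before the default zone is ever consulted.
 */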
278
279static int fn_hash_last_dflt=-1;
280
281static void
282fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
283{
284 int order, last_idx;
285 struct hlist_node *node;
286 struct fib_node *f;
287 struct fib_info *fi = NULL;
288 struct fib_info *last_resort;
289 struct fn_hash *t = (struct fn_hash*)tb->tb_data;
290 struct fn_zone *fz = t->fn_zones[0];
291
292 if (fz == NULL)
293 return;
294
295 last_idx = -1;
296 last_resort = NULL;
297 order = -1;
298
299 read_lock(&fib_hash_lock);
300 hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) {
301 struct fib_alias *fa;
302
303 list_for_each_entry(fa, &f->fn_alias, fa_list) {
304 struct fib_info *next_fi = fa->fa_info;
305
306 if (fa->fa_scope != res->scope ||
307 fa->fa_type != RTN_UNICAST)
308 continue;
309
310 if (next_fi->fib_priority > res->fi->fib_priority)
311 break;
312 if (!next_fi->fib_nh[0].nh_gw ||
313 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
314 continue;
315 fa->fa_state |= FA_S_ACCESSED;
316
317 if (fi == NULL) {
318 if (next_fi != res->fi)
319 break;
320 } else if (!fib_detect_death(fi, order, &last_resort,
321 &last_idx, &fn_hash_last_dflt)) {
322 if (res->fi)
323 fib_info_put(res->fi);
324 res->fi = fi;
325 atomic_inc(&fi->fib_clntref);
326 fn_hash_last_dflt = order;
327 goto out;
328 }
329 fi = next_fi;
330 order++;
331 }
332 }
333
334 if (order <= 0 || fi == NULL) {
335 fn_hash_last_dflt = -1;
336 goto out;
337 }
338
339 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &fn_hash_last_dflt)) {
340 if (res->fi)
341 fib_info_put(res->fi);
342 res->fi = fi;
343 atomic_inc(&fi->fib_clntref);
344 fn_hash_last_dflt = order;
345 goto out;
346 }
347
348 if (last_idx >= 0) {
349 if (res->fi)
350 fib_info_put(res->fi);
351 res->fi = last_resort;
352 if (last_resort)
353 atomic_inc(&last_resort->fib_clntref);
354 }
355 fn_hash_last_dflt = last_idx;
356out:
357 read_unlock(&fib_hash_lock);
358}
359
360/* Insert node F into FZ. */
361static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
362{
363 struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
364
365 hlist_add_head(&f->fn_hash, head);
366}
367
368/* Return the node in FZ matching KEY. */
369static struct fib_node *fib_find_node(struct fn_zone *fz, u32 key)
370{
371 struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)];
372 struct hlist_node *node;
373 struct fib_node *f;
374
375 hlist_for_each_entry(f, node, head, fn_hash) {
376 if (f->fn_key == key)
377 return f;
378 }
379
380 return NULL;
381}
382
383static int
384fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
385 struct nlmsghdr *n, struct netlink_skb_parms *req)
386{
387 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
388 struct fib_node *new_f, *f;
389 struct fib_alias *fa, *new_fa;
390 struct fn_zone *fz;
391 struct fib_info *fi;
392 int z = r->rtm_dst_len;
393 int type = r->rtm_type;
394 u8 tos = r->rtm_tos;
395 u32 key;
396 int err;
397
398 if (z > 32)
399 return -EINVAL;
400 fz = table->fn_zones[z];
401 if (!fz && !(fz = fn_new_zone(table, z)))
402 return -ENOBUFS;
403
404 key = 0;
405 if (rta->rta_dst) {
406 u32 dst;
407 memcpy(&dst, rta->rta_dst, 4);
408 if (dst & ~FZ_MASK(fz))
409 return -EINVAL;
410 key = fz_key(dst, fz);
411 }
412
413 if ((fi = fib_create_info(r, rta, n, &err)) == NULL)
414 return err;
415
416 if (fz->fz_nent > (fz->fz_divisor<<1) &&
417 fz->fz_divisor < FZ_MAX_DIVISOR &&
418 (z==32 || (1<<z) > fz->fz_divisor))
419 fn_rehash_zone(fz);
420
421 f = fib_find_node(fz, key);
422
423 if (!f)
424 fa = NULL;
425 else
426 fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
427
428 /* Now fa, if non-NULL, points to the first fib alias
429 * with the same keys [prefix,tos,priority], if such a key already
430 * exists, or to the node before which we will insert the new one.
431 *
432 * If fa is NULL, we will need to allocate a new alias and
433 * insert it at the head of f.
434 *
435 * If f is NULL, no fib node matched the destination key
436 * and we need to allocate a new one of those as well.
437 */
438
439 if (fa && fa->fa_tos == tos &&
440 fa->fa_info->fib_priority == fi->fib_priority) {
441 struct fib_alias *fa_orig;
442
443 err = -EEXIST;
444 if (n->nlmsg_flags & NLM_F_EXCL)
445 goto out;
446
447 if (n->nlmsg_flags & NLM_F_REPLACE) {
448 struct fib_info *fi_drop;
449 u8 state;
450
451 write_lock_bh(&fib_hash_lock);
452 fi_drop = fa->fa_info;
453 fa->fa_info = fi;
454 fa->fa_type = type;
455 fa->fa_scope = r->rtm_scope;
456 state = fa->fa_state;
457 fa->fa_state &= ~FA_S_ACCESSED;
458 fib_hash_genid++;
459 write_unlock_bh(&fib_hash_lock);
460
461 fib_release_info(fi_drop);
462 if (state & FA_S_ACCESSED)
463 rt_cache_flush(-1);
464 return 0;
465 }
466
467 /* Error if we find a perfect match which
468 * uses the same scope, type, and nexthop
469 * information.
470 */
471 fa_orig = fa;
472 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
473 list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
474 if (fa->fa_tos != tos)
475 break;
476 if (fa->fa_info->fib_priority != fi->fib_priority)
477 break;
478 if (fa->fa_type == type &&
479 fa->fa_scope == r->rtm_scope &&
480 fa->fa_info == fi)
481 goto out;
482 }
483 if (!(n->nlmsg_flags & NLM_F_APPEND))
484 fa = fa_orig;
485 }
486
487 err = -ENOENT;
488 if (!(n->nlmsg_flags&NLM_F_CREATE))
489 goto out;
490
491 err = -ENOBUFS;
492 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
493 if (new_fa == NULL)
494 goto out;
495
496 new_f = NULL;
497 if (!f) {
498 new_f = kmem_cache_alloc(fn_hash_kmem, SLAB_KERNEL);
499 if (new_f == NULL)
500 goto out_free_new_fa;
501
502 INIT_HLIST_NODE(&new_f->fn_hash);
503 INIT_LIST_HEAD(&new_f->fn_alias);
504 new_f->fn_key = key;
505 f = new_f;
506 }
507
508 new_fa->fa_info = fi;
509 new_fa->fa_tos = tos;
510 new_fa->fa_type = type;
511 new_fa->fa_scope = r->rtm_scope;
512 new_fa->fa_state = 0;
513
514 /*
515 * Insert new entry to the list.
516 */
517
518 write_lock_bh(&fib_hash_lock);
519 if (new_f)
520 fib_insert_node(fz, new_f);
521 list_add_tail(&new_fa->fa_list,
522 (fa ? &fa->fa_list : &f->fn_alias));
523 fib_hash_genid++;
524 write_unlock_bh(&fib_hash_lock);
525
526 if (new_f)
527 fz->fz_nent++;
528 rt_cache_flush(-1);
529
530 rtmsg_fib(RTM_NEWROUTE, key, new_fa, z, tb->tb_id, n, req);
531 return 0;
532
533out_free_new_fa:
534 kmem_cache_free(fn_alias_kmem, new_fa);
535out:
536 fib_release_info(fi);
537 return err;
538}
539
540
541static int
542fn_hash_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
543 struct nlmsghdr *n, struct netlink_skb_parms *req)
544{
545 struct fn_hash *table = (struct fn_hash*)tb->tb_data;
546 struct fib_node *f;
547 struct fib_alias *fa, *fa_to_delete;
548 int z = r->rtm_dst_len;
549 struct fn_zone *fz;
550 u32 key;
551 u8 tos = r->rtm_tos;
552
553 if (z > 32)
554 return -EINVAL;
555 if ((fz = table->fn_zones[z]) == NULL)
556 return -ESRCH;
557
558 key = 0;
559 if (rta->rta_dst) {
560 u32 dst;
561 memcpy(&dst, rta->rta_dst, 4);
562 if (dst & ~FZ_MASK(fz))
563 return -EINVAL;
564 key = fz_key(dst, fz);
565 }
566
567 f = fib_find_node(fz, key);
568
569 if (!f)
570 fa = NULL;
571 else
572 fa = fib_find_alias(&f->fn_alias, tos, 0);
573 if (!fa)
574 return -ESRCH;
575
576 fa_to_delete = NULL;
577 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
578 list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
579 struct fib_info *fi = fa->fa_info;
580
581 if (fa->fa_tos != tos)
582 break;
583
584 if ((!r->rtm_type ||
585 fa->fa_type == r->rtm_type) &&
586 (r->rtm_scope == RT_SCOPE_NOWHERE ||
587 fa->fa_scope == r->rtm_scope) &&
588 (!r->rtm_protocol ||
589 fi->fib_protocol == r->rtm_protocol) &&
590 fib_nh_match(r, n, rta, fi) == 0) {
591 fa_to_delete = fa;
592 break;
593 }
594 }
595
596 if (fa_to_delete) {
597 int kill_fn;
598
599 fa = fa_to_delete;
600 rtmsg_fib(RTM_DELROUTE, key, fa, z, tb->tb_id, n, req);
601
602 kill_fn = 0;
603 write_lock_bh(&fib_hash_lock);
604 list_del(&fa->fa_list);
605 if (list_empty(&f->fn_alias)) {
606 hlist_del(&f->fn_hash);
607 kill_fn = 1;
608 }
609 fib_hash_genid++;
610 write_unlock_bh(&fib_hash_lock);
611
612 if (fa->fa_state & FA_S_ACCESSED)
613 rt_cache_flush(-1);
614 fn_free_alias(fa);
615 if (kill_fn) {
616 fn_free_node(f);
617 fz->fz_nent--;
618 }
619
620 return 0;
621 }
622 return -ESRCH;
623}
624
625static int fn_flush_list(struct fn_zone *fz, int idx)
626{
627 struct hlist_head *head = &fz->fz_hash[idx];
628 struct hlist_node *node, *n;
629 struct fib_node *f;
630 int found = 0;
631
632 hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
633 struct fib_alias *fa, *fa_node;
634 int kill_f;
635
636 kill_f = 0;
637 list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
638 struct fib_info *fi = fa->fa_info;
639
640 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
641 write_lock_bh(&fib_hash_lock);
642 list_del(&fa->fa_list);
643 if (list_empty(&f->fn_alias)) {
644 hlist_del(&f->fn_hash);
645 kill_f = 1;
646 }
647 fib_hash_genid++;
648 write_unlock_bh(&fib_hash_lock);
649
650 fn_free_alias(fa);
651 found++;
652 }
653 }
654 if (kill_f) {
655 fn_free_node(f);
656 fz->fz_nent--;
657 }
658 }
659 return found;
660}
661
662static int fn_hash_flush(struct fib_table *tb)
663{
664 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
665 struct fn_zone *fz;
666 int found = 0;
667
668 for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
669 int i;
670
671 for (i = fz->fz_divisor - 1; i >= 0; i--)
672 found += fn_flush_list(fz, i);
673 }
674 return found;
675}
676
677
678static inline int
679fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
680 struct fib_table *tb,
681 struct fn_zone *fz,
682 struct hlist_head *head)
683{
684 struct hlist_node *node;
685 struct fib_node *f;
686 int i, s_i;
687
688 s_i = cb->args[3];
689 i = 0;
690 hlist_for_each_entry(f, node, head, fn_hash) {
691 struct fib_alias *fa;
692
693 list_for_each_entry(fa, &f->fn_alias, fa_list) {
694 if (i < s_i)
695 goto next;
696
697 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
698 cb->nlh->nlmsg_seq,
699 RTM_NEWROUTE,
700 tb->tb_id,
701 fa->fa_type,
702 fa->fa_scope,
703 &f->fn_key,
704 fz->fz_order,
705 fa->fa_tos,
706 fa->fa_info) < 0) {
707 cb->args[3] = i;
708 return -1;
709 }
710 next:
711 i++;
712 }
713 }
714 cb->args[3] = i;
715 return skb->len;
716}
717
718static inline int
719fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
720 struct fib_table *tb,
721 struct fn_zone *fz)
722{
723 int h, s_h;
724
725 s_h = cb->args[2];
726 for (h=0; h < fz->fz_divisor; h++) {
727 if (h < s_h) continue;
728 if (h > s_h)
729 memset(&cb->args[3], 0,
730 sizeof(cb->args) - 3*sizeof(cb->args[0]));
731 if (fz->fz_hash == NULL ||
732 hlist_empty(&fz->fz_hash[h]))
733 continue;
734 if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h])<0) {
735 cb->args[2] = h;
736 return -1;
737 }
738 }
739 cb->args[2] = h;
740 return skb->len;
741}
742
743static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
744{
745 int m, s_m;
746 struct fn_zone *fz;
747 struct fn_hash *table = (struct fn_hash*)tb->tb_data;
748
749 s_m = cb->args[1];
750 read_lock(&fib_hash_lock);
751 for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
752 if (m < s_m) continue;
753 if (m > s_m)
754 memset(&cb->args[2], 0,
755 sizeof(cb->args) - 2*sizeof(cb->args[0]));
756 if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
757 cb->args[1] = m;
758 read_unlock(&fib_hash_lock);
759 return -1;
760 }
761 }
762 read_unlock(&fib_hash_lock);
763 cb->args[1] = m;
764 return skb->len;
765}
766
767#ifdef CONFIG_IP_MULTIPLE_TABLES
768struct fib_table * fib_hash_init(int id)
769#else
770struct fib_table * __init fib_hash_init(int id)
771#endif
772{
773 struct fib_table *tb;
774
775 if (fn_hash_kmem == NULL)
776 fn_hash_kmem = kmem_cache_create("ip_fib_hash",
777 sizeof(struct fib_node),
778 0, SLAB_HWCACHE_ALIGN,
779 NULL, NULL);
780
781 if (fn_alias_kmem == NULL)
782 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
783 sizeof(struct fib_alias),
784 0, SLAB_HWCACHE_ALIGN,
785 NULL, NULL);
786
787 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
788 GFP_KERNEL);
789 if (tb == NULL)
790 return NULL;
791
792 tb->tb_id = id;
793 tb->tb_lookup = fn_hash_lookup;
794 tb->tb_insert = fn_hash_insert;
795 tb->tb_delete = fn_hash_delete;
796 tb->tb_flush = fn_hash_flush;
797 tb->tb_select_default = fn_hash_select_default;
798 tb->tb_dump = fn_hash_dump;
799 memset(tb->tb_data, 0, sizeof(struct fn_hash));
800 return tb;
801}
802
803/* ------------------------------------------------------------------------ */
804#ifdef CONFIG_PROC_FS
805
806struct fib_iter_state {
807 struct fn_zone *zone;
808 int bucket;
809 struct hlist_head *hash_head;
810 struct fib_node *fn;
811 struct fib_alias *fa;
812 loff_t pos;
813 unsigned int genid;
814 int valid;
815};
816
817static struct fib_alias *fib_get_first(struct seq_file *seq)
818{
819 struct fib_iter_state *iter = seq->private;
820 struct fn_hash *table = (struct fn_hash *) ip_fib_main_table->tb_data;
821
822 iter->bucket = 0;
823 iter->hash_head = NULL;
824 iter->fn = NULL;
825 iter->fa = NULL;
826 iter->pos = 0;
827 iter->genid = fib_hash_genid;
828 iter->valid = 1;
829
830 for (iter->zone = table->fn_zone_list; iter->zone;
831 iter->zone = iter->zone->fz_next) {
832 int maxslot;
833
834 if (!iter->zone->fz_nent)
835 continue;
836
837 iter->hash_head = iter->zone->fz_hash;
838 maxslot = iter->zone->fz_divisor;
839
840 for (iter->bucket = 0; iter->bucket < maxslot;
841 ++iter->bucket, ++iter->hash_head) {
842 struct hlist_node *node;
843 struct fib_node *fn;
844
845 hlist_for_each_entry(fn,node,iter->hash_head,fn_hash) {
846 struct fib_alias *fa;
847
848 list_for_each_entry(fa,&fn->fn_alias,fa_list) {
849 iter->fn = fn;
850 iter->fa = fa;
851 goto out;
852 }
853 }
854 }
855 }
856out:
857 return iter->fa;
858}
859
860static struct fib_alias *fib_get_next(struct seq_file *seq)
861{
862 struct fib_iter_state *iter = seq->private;
863 struct fib_node *fn;
864 struct fib_alias *fa;
865
866 /* Advance FA, if any. */
867 fn = iter->fn;
868 fa = iter->fa;
869 if (fa) {
870 BUG_ON(!fn);
871 list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
872 iter->fa = fa;
873 goto out;
874 }
875 }
876
877 fa = iter->fa = NULL;
878
879 /* Advance FN. */
880 if (fn) {
881 struct hlist_node *node = &fn->fn_hash;
882 hlist_for_each_entry_continue(fn, node, fn_hash) {
883 iter->fn = fn;
884
885 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
886 iter->fa = fa;
887 goto out;
888 }
889 }
890 }
891
892 fn = iter->fn = NULL;
893
894 /* Advance hash chain. */
895 if (!iter->zone)
896 goto out;
897
898 for (;;) {
899 struct hlist_node *node;
900 int maxslot;
901
902 maxslot = iter->zone->fz_divisor;
903
904 while (++iter->bucket < maxslot) {
905 iter->hash_head++;
906
907 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
908 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
909 iter->fn = fn;
910 iter->fa = fa;
911 goto out;
912 }
913 }
914 }
915
916 iter->zone = iter->zone->fz_next;
917
918 if (!iter->zone)
919 goto out;
920
921 iter->bucket = 0;
922 iter->hash_head = iter->zone->fz_hash;
923
924 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
925 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
926 iter->fn = fn;
927 iter->fa = fa;
928 goto out;
929 }
930 }
931 }
932out:
933 iter->pos++;
934 return fa;
935}
936
937static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
938{
939 struct fib_iter_state *iter = seq->private;
940 struct fib_alias *fa;
941
942 if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
943 fa = iter->fa;
944 pos -= iter->pos;
945 } else
946 fa = fib_get_first(seq);
947
948 if (fa)
949 while (pos && (fa = fib_get_next(seq)))
950 --pos;
951 return pos ? NULL : fa;
952}
953
954static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
955{
956 void *v = NULL;
957
958 read_lock(&fib_hash_lock);
959 if (ip_fib_main_table)
960 v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
961 return v;
962}
963
964static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
965{
966 ++*pos;
967 return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
968}
969
970static void fib_seq_stop(struct seq_file *seq, void *v)
971{
972 read_unlock(&fib_hash_lock);
973}
974
975static unsigned fib_flag_trans(int type, u32 mask, struct fib_info *fi)
976{
977 static unsigned type2flags[RTN_MAX + 1] = {
978 [7] = RTF_REJECT, [8] = RTF_REJECT,
979 };
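	/* Indices 7 and 8 above are RTN_UNREACHABLE and RTN_PROHIBIT; both
	 * show up as rejecting routes (RTF_REJECT) in /proc/net/route. */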
980 unsigned flags = type2flags[type];
981
982 if (fi && fi->fib_nh->nh_gw)
983 flags |= RTF_GATEWAY;
984 if (mask == 0xFFFFFFFF)
985 flags |= RTF_HOST;
986 flags |= RTF_UP;
987 return flags;
988}
989
990/*
991 * This outputs /proc/net/route.
992 *
993 * It always works in backward compatibility mode.
994 * The format of the file is not supposed to be changed.
995 */
996static int fib_seq_show(struct seq_file *seq, void *v)
997{
998 struct fib_iter_state *iter;
999 char bf[128];
1000 u32 prefix, mask;
1001 unsigned flags;
1002 struct fib_node *f;
1003 struct fib_alias *fa;
1004 struct fib_info *fi;
1005
1006 if (v == SEQ_START_TOKEN) {
1007 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
1008 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
1009 "\tWindow\tIRTT");
1010 goto out;
1011 }
1012
1013 iter = seq->private;
1014 f = iter->fn;
1015 fa = iter->fa;
1016 fi = fa->fa_info;
1017 prefix = f->fn_key;
1018 mask = FZ_MASK(iter->zone);
1019 flags = fib_flag_trans(fa->fa_type, mask, fi);
1020 if (fi)
1021 snprintf(bf, sizeof(bf),
1022 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
1023 fi->fib_dev ? fi->fib_dev->name : "*", prefix,
1024 fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
1025 mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
1026 fi->fib_window,
1027 fi->fib_rtt >> 3);
1028 else
1029 snprintf(bf, sizeof(bf),
1030 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
1031 prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0);
1032 seq_printf(seq, "%-127s\n", bf);
1033out:
1034 return 0;
1035}
1036
1037static struct seq_operations fib_seq_ops = {
1038 .start = fib_seq_start,
1039 .next = fib_seq_next,
1040 .stop = fib_seq_stop,
1041 .show = fib_seq_show,
1042};
1043
1044static int fib_seq_open(struct inode *inode, struct file *file)
1045{
1046 struct seq_file *seq;
1047 int rc = -ENOMEM;
1048 struct fib_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
1049
1050 if (!s)
1051 goto out;
1052
1053 rc = seq_open(file, &fib_seq_ops);
1054 if (rc)
1055 goto out_kfree;
1056
1057 seq = file->private_data;
1058 seq->private = s;
1059 memset(s, 0, sizeof(*s));
1060out:
1061 return rc;
1062out_kfree:
1063 kfree(s);
1064 goto out;
1065}
1066
1067static struct file_operations fib_seq_fops = {
1068 .owner = THIS_MODULE,
1069 .open = fib_seq_open,
1070 .read = seq_read,
1071 .llseek = seq_lseek,
1072 .release = seq_release_private,
1073};
1074
1075int __init fib_proc_init(void)
1076{
1077 if (!proc_net_fops_create("route", S_IRUGO, &fib_seq_fops))
1078 return -ENOMEM;
1079 return 0;
1080}
1081
1082void __init fib_proc_exit(void)
1083{
1084 proc_net_remove("route");
1085}
1086#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
new file mode 100644
index 000000000000..ac4485f75e97
--- /dev/null
+++ b/net/ipv4/fib_lookup.h
@@ -0,0 +1,43 @@
1#ifndef _FIB_LOOKUP_H
2#define _FIB_LOOKUP_H
3
4#include <linux/types.h>
5#include <linux/list.h>
6#include <net/ip_fib.h>
7
8struct fib_alias {
9 struct list_head fa_list;
10 struct fib_info *fa_info;
11 u8 fa_tos;
12 u8 fa_type;
13 u8 fa_scope;
14 u8 fa_state;
15};
16
17#define FA_S_ACCESSED 0x01
18
19/* Exported by fib_semantics.c */
20extern int fib_semantic_match(struct list_head *head,
21 const struct flowi *flp,
22 struct fib_result *res, __u32 zone, __u32 mask,
23 int prefixlen);
24extern void fib_release_info(struct fib_info *);
25extern struct fib_info *fib_create_info(const struct rtmsg *r,
26 struct kern_rta *rta,
27 const struct nlmsghdr *,
28 int *err);
29extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *,
30 struct kern_rta *rta, struct fib_info *fi);
31extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
32 u8 tb_id, u8 type, u8 scope, void *dst,
33 int dst_len, u8 tos, struct fib_info *fi);
34extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
35 int z, int tb_id,
36 struct nlmsghdr *n, struct netlink_skb_parms *req);
37extern struct fib_alias *fib_find_alias(struct list_head *fah,
38 u8 tos, u32 prio);
39extern int fib_detect_death(struct fib_info *fi, int order,
40 struct fib_info **last_resort,
41 int *last_idx, int *dflt);
42
43#endif /* _FIB_LOOKUP_H */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
new file mode 100644
index 000000000000..39d0aadb9a2a
--- /dev/null
+++ b/net/ipv4/fib_rules.c
@@ -0,0 +1,437 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * IPv4 Forwarding Information Base: policy rules.
7 *
8 * Version: $Id: fib_rules.c,v 1.17 2001/10/31 21:55:54 davem Exp $
9 *
10 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * Fixes:
18 * Rani Assaf : local_rule cannot be deleted
19 * Marc Boucher : routing by fwmark
20 */
21
22#include <linux/config.h>
23#include <asm/uaccess.h>
24#include <asm/system.h>
25#include <linux/bitops.h>
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/sched.h>
29#include <linux/mm.h>
30#include <linux/string.h>
31#include <linux/socket.h>
32#include <linux/sockios.h>
33#include <linux/errno.h>
34#include <linux/in.h>
35#include <linux/inet.h>
36#include <linux/netdevice.h>
37#include <linux/if_arp.h>
38#include <linux/proc_fs.h>
39#include <linux/skbuff.h>
40#include <linux/netlink.h>
41#include <linux/init.h>
42
43#include <net/ip.h>
44#include <net/protocol.h>
45#include <net/route.h>
46#include <net/tcp.h>
47#include <net/sock.h>
48#include <net/ip_fib.h>
49
50#define FRprintk(a...)
51
52struct fib_rule
53{
54 struct fib_rule *r_next;
55 atomic_t r_clntref;
56 u32 r_preference;
57 unsigned char r_table;
58 unsigned char r_action;
59 unsigned char r_dst_len;
60 unsigned char r_src_len;
61 u32 r_src;
62 u32 r_srcmask;
63 u32 r_dst;
64 u32 r_dstmask;
65 u32 r_srcmap;
66 u8 r_flags;
67 u8 r_tos;
68#ifdef CONFIG_IP_ROUTE_FWMARK
69 u32 r_fwmark;
70#endif
71 int r_ifindex;
72#ifdef CONFIG_NET_CLS_ROUTE
73 __u32 r_tclassid;
74#endif
75 char r_ifname[IFNAMSIZ];
76 int r_dead;
77};
78
79static struct fib_rule default_rule = {
80 .r_clntref = ATOMIC_INIT(2),
81 .r_preference = 0x7FFF,
82 .r_table = RT_TABLE_DEFAULT,
83 .r_action = RTN_UNICAST,
84};
85
86static struct fib_rule main_rule = {
87 .r_next = &default_rule,
88 .r_clntref = ATOMIC_INIT(2),
89 .r_preference = 0x7FFE,
90 .r_table = RT_TABLE_MAIN,
91 .r_action = RTN_UNICAST,
92};
93
94static struct fib_rule local_rule = {
95 .r_next = &main_rule,
96 .r_clntref = ATOMIC_INIT(2),
97 .r_table = RT_TABLE_LOCAL,
98 .r_action = RTN_UNICAST,
99};
100
101static struct fib_rule *fib_rules = &local_rule;
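/* Together these three static rules form the default policy chain:
 * preference 0 -> the local table, 0x7FFE (32766) -> main, and
 * 0x7FFF (32767) -> default. This is the same ordering that "ip rule"
 * shows on a system with no extra rules configured. */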
102static DEFINE_RWLOCK(fib_rules_lock);
103
104int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
105{
106 struct rtattr **rta = arg;
107 struct rtmsg *rtm = NLMSG_DATA(nlh);
108 struct fib_rule *r, **rp;
109 int err = -ESRCH;
110
111 for (rp=&fib_rules; (r=*rp) != NULL; rp=&r->r_next) {
112 if ((!rta[RTA_SRC-1] || memcmp(RTA_DATA(rta[RTA_SRC-1]), &r->r_src, 4) == 0) &&
113 rtm->rtm_src_len == r->r_src_len &&
114 rtm->rtm_dst_len == r->r_dst_len &&
115 (!rta[RTA_DST-1] || memcmp(RTA_DATA(rta[RTA_DST-1]), &r->r_dst, 4) == 0) &&
116 rtm->rtm_tos == r->r_tos &&
117#ifdef CONFIG_IP_ROUTE_FWMARK
118 (!rta[RTA_PROTOINFO-1] || memcmp(RTA_DATA(rta[RTA_PROTOINFO-1]), &r->r_fwmark, 4) == 0) &&
119#endif
120 (!rtm->rtm_type || rtm->rtm_type == r->r_action) &&
121 (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) &&
122 (!rta[RTA_IIF-1] || rtattr_strcmp(rta[RTA_IIF-1], r->r_ifname) == 0) &&
123 (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) {
124 err = -EPERM;
125 if (r == &local_rule)
126 break;
127
128 write_lock_bh(&fib_rules_lock);
129 *rp = r->r_next;
130 r->r_dead = 1;
131 write_unlock_bh(&fib_rules_lock);
132 fib_rule_put(r);
133 err = 0;
134 break;
135 }
136 }
137 return err;
138}
139
140/* Allocate new unique table id */
141
142static struct fib_table *fib_empty_table(void)
143{
144 int id;
145
146 for (id = 1; id <= RT_TABLE_MAX; id++)
147 if (fib_tables[id] == NULL)
148 return __fib_new_table(id);
149 return NULL;
150}
151
152void fib_rule_put(struct fib_rule *r)
153{
154 if (atomic_dec_and_test(&r->r_clntref)) {
155 if (r->r_dead)
156 kfree(r);
157 else
158 printk("Freeing alive rule %p\n", r);
159 }
160}
161
162int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
163{
164 struct rtattr **rta = arg;
165 struct rtmsg *rtm = NLMSG_DATA(nlh);
166 struct fib_rule *r, *new_r, **rp;
167 unsigned char table_id;
168
169 if (rtm->rtm_src_len > 32 || rtm->rtm_dst_len > 32 ||
170 (rtm->rtm_tos & ~IPTOS_TOS_MASK))
171 return -EINVAL;
172
173 if (rta[RTA_IIF-1] && RTA_PAYLOAD(rta[RTA_IIF-1]) > IFNAMSIZ)
174 return -EINVAL;
175
176 table_id = rtm->rtm_table;
177 if (table_id == RT_TABLE_UNSPEC) {
178 struct fib_table *table;
179 if (rtm->rtm_type == RTN_UNICAST) {
180 if ((table = fib_empty_table()) == NULL)
181 return -ENOBUFS;
182 table_id = table->tb_id;
183 }
184 }
185
186 new_r = kmalloc(sizeof(*new_r), GFP_KERNEL);
187 if (!new_r)
188 return -ENOMEM;
189 memset(new_r, 0, sizeof(*new_r));
190 if (rta[RTA_SRC-1])
191 memcpy(&new_r->r_src, RTA_DATA(rta[RTA_SRC-1]), 4);
192 if (rta[RTA_DST-1])
193 memcpy(&new_r->r_dst, RTA_DATA(rta[RTA_DST-1]), 4);
194 if (rta[RTA_GATEWAY-1])
195 memcpy(&new_r->r_srcmap, RTA_DATA(rta[RTA_GATEWAY-1]), 4);
196 new_r->r_src_len = rtm->rtm_src_len;
197 new_r->r_dst_len = rtm->rtm_dst_len;
198 new_r->r_srcmask = inet_make_mask(rtm->rtm_src_len);
199 new_r->r_dstmask = inet_make_mask(rtm->rtm_dst_len);
200 new_r->r_tos = rtm->rtm_tos;
201#ifdef CONFIG_IP_ROUTE_FWMARK
202 if (rta[RTA_PROTOINFO-1])
203 memcpy(&new_r->r_fwmark, RTA_DATA(rta[RTA_PROTOINFO-1]), 4);
204#endif
205 new_r->r_action = rtm->rtm_type;
206 new_r->r_flags = rtm->rtm_flags;
207 if (rta[RTA_PRIORITY-1])
208 memcpy(&new_r->r_preference, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
209 new_r->r_table = table_id;
210 if (rta[RTA_IIF-1]) {
211 struct net_device *dev;
212 rtattr_strlcpy(new_r->r_ifname, rta[RTA_IIF-1], IFNAMSIZ);
213 new_r->r_ifindex = -1;
214 dev = __dev_get_by_name(new_r->r_ifname);
215 if (dev)
216 new_r->r_ifindex = dev->ifindex;
217 }
218#ifdef CONFIG_NET_CLS_ROUTE
219 if (rta[RTA_FLOW-1])
220 memcpy(&new_r->r_tclassid, RTA_DATA(rta[RTA_FLOW-1]), 4);
221#endif
222
223 rp = &fib_rules;
224 if (!new_r->r_preference) {
225 r = fib_rules;
226 if (r && (r = r->r_next) != NULL) {
227 rp = &fib_rules->r_next;
228 if (r->r_preference)
229 new_r->r_preference = r->r_preference - 1;
230 }
231 }
232
233 while ( (r = *rp) != NULL ) {
234 if (r->r_preference > new_r->r_preference)
235 break;
236 rp = &r->r_next;
237 }
238
239 new_r->r_next = r;
240 atomic_inc(&new_r->r_clntref);
241 write_lock_bh(&fib_rules_lock);
242 *rp = new_r;
243 write_unlock_bh(&fib_rules_lock);
244 return 0;
245}
246
247#ifdef CONFIG_NET_CLS_ROUTE
248u32 fib_rules_tclass(struct fib_result *res)
249{
250 if (res->r)
251 return res->r->r_tclassid;
252 return 0;
253}
254#endif
255
256
257static void fib_rules_detach(struct net_device *dev)
258{
259 struct fib_rule *r;
260
261 for (r=fib_rules; r; r=r->r_next) {
262 if (r->r_ifindex == dev->ifindex) {
263 write_lock_bh(&fib_rules_lock);
264 r->r_ifindex = -1;
265 write_unlock_bh(&fib_rules_lock);
266 }
267 }
268}
269
270static void fib_rules_attach(struct net_device *dev)
271{
272 struct fib_rule *r;
273
274 for (r=fib_rules; r; r=r->r_next) {
275 if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0) {
276 write_lock_bh(&fib_rules_lock);
277 r->r_ifindex = dev->ifindex;
278 write_unlock_bh(&fib_rules_lock);
279 }
280 }
281}
282
283int fib_lookup(const struct flowi *flp, struct fib_result *res)
284{
285 int err;
286 struct fib_rule *r, *policy;
287 struct fib_table *tb;
288
289 u32 daddr = flp->fl4_dst;
290 u32 saddr = flp->fl4_src;
291
292FRprintk("Lookup: %u.%u.%u.%u <- %u.%u.%u.%u ",
293 NIPQUAD(flp->fl4_dst), NIPQUAD(flp->fl4_src));
294 read_lock(&fib_rules_lock);
295 for (r = fib_rules; r; r=r->r_next) {
296 if (((saddr^r->r_src) & r->r_srcmask) ||
297 ((daddr^r->r_dst) & r->r_dstmask) ||
298 (r->r_tos && r->r_tos != flp->fl4_tos) ||
299#ifdef CONFIG_IP_ROUTE_FWMARK
300 (r->r_fwmark && r->r_fwmark != flp->fl4_fwmark) ||
301#endif
302 (r->r_ifindex && r->r_ifindex != flp->iif))
303 continue;
304
305FRprintk("tb %d r %d ", r->r_table, r->r_action);
306 switch (r->r_action) {
307 case RTN_UNICAST:
308 policy = r;
309 break;
310 case RTN_UNREACHABLE:
311 read_unlock(&fib_rules_lock);
312 return -ENETUNREACH;
313 default:
314 case RTN_BLACKHOLE:
315 read_unlock(&fib_rules_lock);
316 return -EINVAL;
317 case RTN_PROHIBIT:
318 read_unlock(&fib_rules_lock);
319 return -EACCES;
320 }
321
322 if ((tb = fib_get_table(r->r_table)) == NULL)
323 continue;
324 err = tb->tb_lookup(tb, flp, res);
325 if (err == 0) {
326 res->r = policy;
327 if (policy)
328 atomic_inc(&policy->r_clntref);
329 read_unlock(&fib_rules_lock);
330 return 0;
331 }
332 if (err < 0 && err != -EAGAIN) {
333 read_unlock(&fib_rules_lock);
334 return err;
335 }
336 }
337FRprintk("FAILURE\n");
338 read_unlock(&fib_rules_lock);
339 return -ENETUNREACH;
340}
341
342void fib_select_default(const struct flowi *flp, struct fib_result *res)
343{
344 if (res->r && res->r->r_action == RTN_UNICAST &&
345 FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) {
346 struct fib_table *tb;
347 if ((tb = fib_get_table(res->r->r_table)) != NULL)
348 tb->tb_select_default(tb, flp, res);
349 }
350}
351
352static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr)
353{
354 struct net_device *dev = ptr;
355
356 if (event == NETDEV_UNREGISTER)
357 fib_rules_detach(dev);
358 else if (event == NETDEV_REGISTER)
359 fib_rules_attach(dev);
360 return NOTIFY_DONE;
361}
362
363
364static struct notifier_block fib_rules_notifier = {
365 .notifier_call = fib_rules_event,
366};
367
368static __inline__ int inet_fill_rule(struct sk_buff *skb,
369 struct fib_rule *r,
370 struct netlink_callback *cb)
371{
372 struct rtmsg *rtm;
373 struct nlmsghdr *nlh;
374 unsigned char *b = skb->tail;
375
376 nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm));
377 rtm = NLMSG_DATA(nlh);
378 rtm->rtm_family = AF_INET;
379 rtm->rtm_dst_len = r->r_dst_len;
380 rtm->rtm_src_len = r->r_src_len;
381 rtm->rtm_tos = r->r_tos;
382#ifdef CONFIG_IP_ROUTE_FWMARK
383 if (r->r_fwmark)
384 RTA_PUT(skb, RTA_PROTOINFO, 4, &r->r_fwmark);
385#endif
386 rtm->rtm_table = r->r_table;
387 rtm->rtm_protocol = 0;
388 rtm->rtm_scope = 0;
389 rtm->rtm_type = r->r_action;
390 rtm->rtm_flags = r->r_flags;
391
392 if (r->r_dst_len)
393 RTA_PUT(skb, RTA_DST, 4, &r->r_dst);
394 if (r->r_src_len)
395 RTA_PUT(skb, RTA_SRC, 4, &r->r_src);
396 if (r->r_ifname[0])
397 RTA_PUT(skb, RTA_IIF, IFNAMSIZ, &r->r_ifname);
398 if (r->r_preference)
399 RTA_PUT(skb, RTA_PRIORITY, 4, &r->r_preference);
400 if (r->r_srcmap)
401 RTA_PUT(skb, RTA_GATEWAY, 4, &r->r_srcmap);
402#ifdef CONFIG_NET_CLS_ROUTE
403 if (r->r_tclassid)
404 RTA_PUT(skb, RTA_FLOW, 4, &r->r_tclassid);
405#endif
406 nlh->nlmsg_len = skb->tail - b;
407 return skb->len;
408
409nlmsg_failure:
410rtattr_failure:
411 skb_trim(skb, b - skb->data);
412 return -1;
413}
414
415int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
416{
417 int idx;
418 int s_idx = cb->args[0];
419 struct fib_rule *r;
420
421 read_lock(&fib_rules_lock);
422 for (r=fib_rules, idx=0; r; r = r->r_next, idx++) {
423 if (idx < s_idx)
424 continue;
425 if (inet_fill_rule(skb, r, cb) < 0)
426 break;
427 }
428 read_unlock(&fib_rules_lock);
429 cb->args[0] = idx;
430
431 return skb->len;
432}
433
434void __init fib_rules_init(void)
435{
436 register_netdevice_notifier(&fib_rules_notifier);
437}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
new file mode 100644
index 000000000000..029362d66135
--- /dev/null
+++ b/net/ipv4/fib_semantics.c
@@ -0,0 +1,1332 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * IPv4 Forwarding Information Base: semantics.
7 *
8 * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
9 *
10 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 */
17
18#include <linux/config.h>
19#include <asm/uaccess.h>
20#include <asm/system.h>
21#include <linux/bitops.h>
22#include <linux/types.h>
23#include <linux/kernel.h>
24#include <linux/jiffies.h>
25#include <linux/mm.h>
26#include <linux/string.h>
27#include <linux/socket.h>
28#include <linux/sockios.h>
29#include <linux/errno.h>
30#include <linux/in.h>
31#include <linux/inet.h>
32#include <linux/netdevice.h>
33#include <linux/if_arp.h>
34#include <linux/proc_fs.h>
35#include <linux/skbuff.h>
36#include <linux/netlink.h>
37#include <linux/init.h>
38
39#include <net/ip.h>
40#include <net/protocol.h>
41#include <net/route.h>
42#include <net/tcp.h>
43#include <net/sock.h>
44#include <net/ip_fib.h>
45#include <net/ip_mp_alg.h>
46
47#include "fib_lookup.h"
48
49#define FSprintk(a...)
50
51static DEFINE_RWLOCK(fib_info_lock);
52static struct hlist_head *fib_info_hash;
53static struct hlist_head *fib_info_laddrhash;
54static unsigned int fib_hash_size;
55static unsigned int fib_info_cnt;
56
57#define DEVINDEX_HASHBITS 8
58#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
59static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
60
61#ifdef CONFIG_IP_ROUTE_MULTIPATH
62
63static DEFINE_SPINLOCK(fib_multipath_lock);
64
65#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
66for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
67
68#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
69for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
70
71#else /* CONFIG_IP_ROUTE_MULTIPATH */
72
73/* We hope that gcc will optimize away the dummy loop */
74
75#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
76for (nhsel=0; nhsel < 1; nhsel++)
77
78#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
79for (nhsel=0; nhsel < 1; nhsel++)
80
81#endif /* CONFIG_IP_ROUTE_MULTIPATH */
82
83#define endfor_nexthops(fi) }
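/* These macros open a block that iterates over the nexthops of a fib_info
 * and must be closed with endfor_nexthops(), e.g. (as in free_fib_info()
 * below):
 *
 *	change_nexthops(fi) {
 *		if (nh->nh_dev)
 *			dev_put(nh->nh_dev);
 *	} endfor_nexthops(fi);
 *
 * In the non-multipath build the "loop" collapses to a single pass over
 * fib_nh[0].
 */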
84
85
86static struct
87{
88 int error;
89 u8 scope;
90} fib_props[RTA_MAX + 1] = {
91 {
92 .error = 0,
93 .scope = RT_SCOPE_NOWHERE,
94 }, /* RTN_UNSPEC */
95 {
96 .error = 0,
97 .scope = RT_SCOPE_UNIVERSE,
98 }, /* RTN_UNICAST */
99 {
100 .error = 0,
101 .scope = RT_SCOPE_HOST,
102 }, /* RTN_LOCAL */
103 {
104 .error = 0,
105 .scope = RT_SCOPE_LINK,
106 }, /* RTN_BROADCAST */
107 {
108 .error = 0,
109 .scope = RT_SCOPE_LINK,
110 }, /* RTN_ANYCAST */
111 {
112 .error = 0,
113 .scope = RT_SCOPE_UNIVERSE,
114 }, /* RTN_MULTICAST */
115 {
116 .error = -EINVAL,
117 .scope = RT_SCOPE_UNIVERSE,
118 }, /* RTN_BLACKHOLE */
119 {
120 .error = -EHOSTUNREACH,
121 .scope = RT_SCOPE_UNIVERSE,
122 }, /* RTN_UNREACHABLE */
123 {
124 .error = -EACCES,
125 .scope = RT_SCOPE_UNIVERSE,
126 }, /* RTN_PROHIBIT */
127 {
128 .error = -EAGAIN,
129 .scope = RT_SCOPE_UNIVERSE,
130 }, /* RTN_THROW */
131 {
132 .error = -EINVAL,
133 .scope = RT_SCOPE_NOWHERE,
134 }, /* RTN_NAT */
135 {
136 .error = -EINVAL,
137 .scope = RT_SCOPE_NOWHERE,
138 }, /* RTN_XRESOLVE */
139};
140
141
142/* Release a nexthop info record */
143
144void free_fib_info(struct fib_info *fi)
145{
146 if (fi->fib_dead == 0) {
147 printk("Freeing alive fib_info %p\n", fi);
148 return;
149 }
150 change_nexthops(fi) {
151 if (nh->nh_dev)
152 dev_put(nh->nh_dev);
153 nh->nh_dev = NULL;
154 } endfor_nexthops(fi);
155 fib_info_cnt--;
156 kfree(fi);
157}
158
159void fib_release_info(struct fib_info *fi)
160{
161 write_lock(&fib_info_lock);
162 if (fi && --fi->fib_treeref == 0) {
163 hlist_del(&fi->fib_hash);
164 if (fi->fib_prefsrc)
165 hlist_del(&fi->fib_lhash);
166 change_nexthops(fi) {
167 if (!nh->nh_dev)
168 continue;
169 hlist_del(&nh->nh_hash);
170 } endfor_nexthops(fi)
171 fi->fib_dead = 1;
172 fib_info_put(fi);
173 }
174 write_unlock(&fib_info_lock);
175}
176
177static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
178{
179 const struct fib_nh *onh = ofi->fib_nh;
180
181 for_nexthops(fi) {
182 if (nh->nh_oif != onh->nh_oif ||
183 nh->nh_gw != onh->nh_gw ||
184 nh->nh_scope != onh->nh_scope ||
185#ifdef CONFIG_IP_ROUTE_MULTIPATH
186 nh->nh_weight != onh->nh_weight ||
187#endif
188#ifdef CONFIG_NET_CLS_ROUTE
189 nh->nh_tclassid != onh->nh_tclassid ||
190#endif
191 ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
192 return -1;
193 onh++;
194 } endfor_nexthops(fi);
195 return 0;
196}
197
198static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
199{
200 unsigned int mask = (fib_hash_size - 1);
201 unsigned int val = fi->fib_nhs;
202
203 val ^= fi->fib_protocol;
204 val ^= fi->fib_prefsrc;
205 val ^= fi->fib_priority;
206
207 return (val ^ (val >> 7) ^ (val >> 12)) & mask;
208}
209
210static struct fib_info *fib_find_info(const struct fib_info *nfi)
211{
212 struct hlist_head *head;
213 struct hlist_node *node;
214 struct fib_info *fi;
215 unsigned int hash;
216
217 hash = fib_info_hashfn(nfi);
218 head = &fib_info_hash[hash];
219
220 hlist_for_each_entry(fi, node, head, fib_hash) {
221 if (fi->fib_nhs != nfi->fib_nhs)
222 continue;
223 if (nfi->fib_protocol == fi->fib_protocol &&
224 nfi->fib_prefsrc == fi->fib_prefsrc &&
225 nfi->fib_priority == fi->fib_priority &&
226 memcmp(nfi->fib_metrics, fi->fib_metrics,
227 sizeof(fi->fib_metrics)) == 0 &&
228 ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
229 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
230 return fi;
231 }
232
233 return NULL;
234}
235
236static inline unsigned int fib_devindex_hashfn(unsigned int val)
237{
238 unsigned int mask = DEVINDEX_HASHSIZE - 1;
239
240 return (val ^
241 (val >> DEVINDEX_HASHBITS) ^
242 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
243}
244
245/* Check that the gateway is already configured.
246 Used only by the redirect-accept routine.
247 */
248
249int ip_fib_check_default(u32 gw, struct net_device *dev)
250{
251 struct hlist_head *head;
252 struct hlist_node *node;
253 struct fib_nh *nh;
254 unsigned int hash;
255
256 read_lock(&fib_info_lock);
257
258 hash = fib_devindex_hashfn(dev->ifindex);
259 head = &fib_info_devhash[hash];
260 hlist_for_each_entry(nh, node, head, nh_hash) {
261 if (nh->nh_dev == dev &&
262 nh->nh_gw == gw &&
263 !(nh->nh_flags&RTNH_F_DEAD)) {
264 read_unlock(&fib_info_lock);
265 return 0;
266 }
267 }
268
269 read_unlock(&fib_info_lock);
270
271 return -1;
272}
273
274void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
275 int z, int tb_id,
276 struct nlmsghdr *n, struct netlink_skb_parms *req)
277{
278 struct sk_buff *skb;
279 u32 pid = req ? req->pid : 0;
280 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
281
282 skb = alloc_skb(size, GFP_KERNEL);
283 if (!skb)
284 return;
285
286 if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
287 fa->fa_type, fa->fa_scope, &key, z,
288 fa->fa_tos,
289 fa->fa_info) < 0) {
290 kfree_skb(skb);
291 return;
292 }
293 NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE;
294 if (n->nlmsg_flags&NLM_F_ECHO)
295 atomic_inc(&skb->users);
296 netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL);
297 if (n->nlmsg_flags&NLM_F_ECHO)
298 netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
299}
300
301/* Return the first fib alias matching TOS with
302 * priority less than or equal to PRIO.
303 */
304struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
305{
306 if (fah) {
307 struct fib_alias *fa;
308 list_for_each_entry(fa, fah, fa_list) {
309 if (fa->fa_tos > tos)
310 continue;
311 if (fa->fa_info->fib_priority >= prio ||
312 fa->fa_tos < tos)
313 return fa;
314 }
315 }
316 return NULL;
317}
318
319int fib_detect_death(struct fib_info *fi, int order,
320 struct fib_info **last_resort, int *last_idx, int *dflt)
321{
322 struct neighbour *n;
323 int state = NUD_NONE;
324
325 n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
326 if (n) {
327 state = n->nud_state;
328 neigh_release(n);
329 }
330 if (state==NUD_REACHABLE)
331 return 0;
332 if ((state&NUD_VALID) && order != *dflt)
333 return 0;
334 if ((state&NUD_VALID) ||
335 (*last_idx<0 && order > *dflt)) {
336 *last_resort = fi;
337 *last_idx = order;
338 }
339 return 1;
340}
341
342#ifdef CONFIG_IP_ROUTE_MULTIPATH
343
344static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
345{
346 while (RTA_OK(attr,attrlen)) {
347 if (attr->rta_type == type)
348 return *(u32*)RTA_DATA(attr);
349 attr = RTA_NEXT(attr, attrlen);
350 }
351 return 0;
352}
353
354static int
355fib_count_nexthops(struct rtattr *rta)
356{
357 int nhs = 0;
358 struct rtnexthop *nhp = RTA_DATA(rta);
359 int nhlen = RTA_PAYLOAD(rta);
360
361 while (nhlen >= (int)sizeof(struct rtnexthop)) {
362 if ((nhlen -= nhp->rtnh_len) < 0)
363 return 0;
364 nhs++;
365 nhp = RTNH_NEXT(nhp);
366 };
367 return nhs;
368}
369
370static int
371fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
372{
373 struct rtnexthop *nhp = RTA_DATA(rta);
374 int nhlen = RTA_PAYLOAD(rta);
375
376 change_nexthops(fi) {
377 int attrlen = nhlen - sizeof(struct rtnexthop);
378 if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
379 return -EINVAL;
380 nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
381 nh->nh_oif = nhp->rtnh_ifindex;
382 nh->nh_weight = nhp->rtnh_hops + 1;
383 if (attrlen) {
384 nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
385#ifdef CONFIG_NET_CLS_ROUTE
386 nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
387#endif
388 }
389 nhp = RTNH_NEXT(nhp);
390 } endfor_nexthops(fi);
391 return 0;
392}
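/* Both helpers above walk the RTA_MULTIPATH payload, which is a packed
 * sequence of nexthop records, roughly:
 *
 *	struct rtnexthop { rtnh_len; rtnh_flags; rtnh_hops; rtnh_ifindex; }
 *	[ optional nested attributes: RTA_GATEWAY, RTA_FLOW ]
 *	struct rtnexthop { ... }
 *	...
 *
 * rtnh_len covers each record together with its nested attributes, which is
 * why the loops advance with RTNH_NEXT() and read the nested part through
 * RTNH_DATA().
 */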
393
394#endif
395
396int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
397 struct fib_info *fi)
398{
399#ifdef CONFIG_IP_ROUTE_MULTIPATH
400 struct rtnexthop *nhp;
401 int nhlen;
402#endif
403
404 if (rta->rta_priority &&
405 *rta->rta_priority != fi->fib_priority)
406 return 1;
407
408 if (rta->rta_oif || rta->rta_gw) {
409 if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
410 (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0))
411 return 0;
412 return 1;
413 }
414
415#ifdef CONFIG_IP_ROUTE_MULTIPATH
416 if (rta->rta_mp == NULL)
417 return 0;
418 nhp = RTA_DATA(rta->rta_mp);
419 nhlen = RTA_PAYLOAD(rta->rta_mp);
420
421 for_nexthops(fi) {
422 int attrlen = nhlen - sizeof(struct rtnexthop);
423 u32 gw;
424
425 if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
426 return -EINVAL;
427 if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
428 return 1;
429 if (attrlen) {
430 gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
431 if (gw && gw != nh->nh_gw)
432 return 1;
433#ifdef CONFIG_NET_CLS_ROUTE
434 gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
435 if (gw && gw != nh->nh_tclassid)
436 return 1;
437#endif
438 }
439 nhp = RTNH_NEXT(nhp);
440 } endfor_nexthops(fi);
441#endif
442 return 0;
443}
444
445
446/*
447 Picture
448 -------
449
450   The semantics of nexthops are messy for historical reasons.
451   We have to take into account that:
452   a) the gateway can actually be a local interface address,
453      so that a gatewayed route is direct.
454   b) the gateway must be an on-link address, possibly
455      described not by an ifaddr but by a direct route.
456   c) if both a gateway and an interface are specified, they must not
457      contradict each other.
458   d) if we use tunnel routes, the gateway may not be on-link.
459
460   Attempting to reconcile all of these (alas, self-contradictory)
461   conditions results in pretty ugly and hairy code with obscure logic.
462
463   I chose to generalize it instead, so that the size
464   of the code barely increases, but it becomes
465   much more general.
466   Every prefix is assigned a "scope" value: "host" is a local address,
467   "link" is a direct route,
468   [ ... "site" ...  "interior" ... ]
469   and "universe" is a true gateway route with global meaning.
470
471   Every prefix refers to a set of "nexthops" (gw, oif),
472   where the gw must have a narrower scope. This recursion stops
473   when the gw has LOCAL scope or when the "nexthop" is declared ONLINK,
474   which means the gw is forced to be on-link.
475
476   The code is still hairy, but now it is apparently logically
477   consistent and very flexible. E.g., as a by-product, it allows
478   independent exterior and interior routing processes to
479   coexist in peace.
480
481   Normally it looks like this:
482
483 {universe prefix} -> (gw, oif) [scope link]
484 |
485 |-> {link prefix} -> (gw, oif) [scope local]
486 |
487 |-> {local prefix} (terminal node)
488 */
489
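/*
 * Illustrative sketch only (addresses are made-up documentation values,
 * not part of the original source): with the scope rules above, a table
 * such as
 *
 *      203.0.113.0/24 via 192.0.2.1       (scope universe)
 *      192.0.2.0/24 dev eth0              (scope link)
 *      local 192.0.2.10 dev eth0          (scope host)
 *
 * resolves the /24 route's gateway through the link-scope route, which in
 * turn terminates at the host-scope local address, exactly as in the
 * picture above.  fib_check_nh() below enforces the "narrower scope" rule
 * by looking the gateway up with scope r->rtm_scope + 1 (clamped to at
 * least RT_SCOPE_LINK).
 */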
490static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh)
491{
492 int err;
493
494 if (nh->nh_gw) {
495 struct fib_result res;
496
497#ifdef CONFIG_IP_ROUTE_PERVASIVE
498 if (nh->nh_flags&RTNH_F_PERVASIVE)
499 return 0;
500#endif
501 if (nh->nh_flags&RTNH_F_ONLINK) {
502 struct net_device *dev;
503
504 if (r->rtm_scope >= RT_SCOPE_LINK)
505 return -EINVAL;
506 if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
507 return -EINVAL;
508 if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
509 return -ENODEV;
510 if (!(dev->flags&IFF_UP))
511 return -ENETDOWN;
512 nh->nh_dev = dev;
513 dev_hold(dev);
514 nh->nh_scope = RT_SCOPE_LINK;
515 return 0;
516 }
517 {
518 struct flowi fl = { .nl_u = { .ip4_u =
519 { .daddr = nh->nh_gw,
520 .scope = r->rtm_scope + 1 } },
521 .oif = nh->nh_oif };
522
523 /* It is not necessary, but requires a bit of thinking */
524 if (fl.fl4_scope < RT_SCOPE_LINK)
525 fl.fl4_scope = RT_SCOPE_LINK;
526 if ((err = fib_lookup(&fl, &res)) != 0)
527 return err;
528 }
529 err = -EINVAL;
530 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
531 goto out;
532 nh->nh_scope = res.scope;
533 nh->nh_oif = FIB_RES_OIF(res);
534 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
535 goto out;
536 dev_hold(nh->nh_dev);
537 err = -ENETDOWN;
538 if (!(nh->nh_dev->flags & IFF_UP))
539 goto out;
540 err = 0;
541out:
542 fib_res_put(&res);
543 return err;
544 } else {
545 struct in_device *in_dev;
546
547 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
548 return -EINVAL;
549
550 in_dev = inetdev_by_index(nh->nh_oif);
551 if (in_dev == NULL)
552 return -ENODEV;
553 if (!(in_dev->dev->flags&IFF_UP)) {
554 in_dev_put(in_dev);
555 return -ENETDOWN;
556 }
557 nh->nh_dev = in_dev->dev;
558 dev_hold(nh->nh_dev);
559 nh->nh_scope = RT_SCOPE_HOST;
560 in_dev_put(in_dev);
561 }
562 return 0;
563}
564
565static inline unsigned int fib_laddr_hashfn(u32 val)
566{
567 unsigned int mask = (fib_hash_size - 1);
568
569 return (val ^ (val >> 7) ^ (val >> 14)) & mask;
570}
571
572static struct hlist_head *fib_hash_alloc(int bytes)
573{
574 if (bytes <= PAGE_SIZE)
575 return kmalloc(bytes, GFP_KERNEL);
576 else
577 return (struct hlist_head *)
578 __get_free_pages(GFP_KERNEL, get_order(bytes));
579}
580
581static void fib_hash_free(struct hlist_head *hash, int bytes)
582{
583 if (!hash)
584 return;
585
586 if (bytes <= PAGE_SIZE)
587 kfree(hash);
588 else
589 free_pages((unsigned long) hash, get_order(bytes));
590}
591
592static void fib_hash_move(struct hlist_head *new_info_hash,
593 struct hlist_head *new_laddrhash,
594 unsigned int new_size)
595{
596 unsigned int old_size = fib_hash_size;
597 unsigned int i;
598
599 write_lock(&fib_info_lock);
600 fib_hash_size = new_size;
601
602 for (i = 0; i < old_size; i++) {
603 struct hlist_head *head = &fib_info_hash[i];
604 struct hlist_node *node, *n;
605 struct fib_info *fi;
606
607 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
608 struct hlist_head *dest;
609 unsigned int new_hash;
610
611 hlist_del(&fi->fib_hash);
612
613 new_hash = fib_info_hashfn(fi);
614 dest = &new_info_hash[new_hash];
615 hlist_add_head(&fi->fib_hash, dest);
616 }
617 }
618 fib_info_hash = new_info_hash;
619
620 for (i = 0; i < old_size; i++) {
621 struct hlist_head *lhead = &fib_info_laddrhash[i];
622 struct hlist_node *node, *n;
623 struct fib_info *fi;
624
625 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
626 struct hlist_head *ldest;
627 unsigned int new_hash;
628
629 hlist_del(&fi->fib_lhash);
630
631 new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
632 ldest = &new_laddrhash[new_hash];
633 hlist_add_head(&fi->fib_lhash, ldest);
634 }
635 }
636 fib_info_laddrhash = new_laddrhash;
637
638 write_unlock(&fib_info_lock);
639}
640
641struct fib_info *
642fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
643 const struct nlmsghdr *nlh, int *errp)
644{
645 int err;
646 struct fib_info *fi = NULL;
647 struct fib_info *ofi;
648#ifdef CONFIG_IP_ROUTE_MULTIPATH
649 int nhs = 1;
650#else
651 const int nhs = 1;
652#endif
653#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
654 u32 mp_alg = IP_MP_ALG_NONE;
655#endif
656
657 /* Fast check to catch the most weird cases */
658 if (fib_props[r->rtm_type].scope > r->rtm_scope)
659 goto err_inval;
660
661#ifdef CONFIG_IP_ROUTE_MULTIPATH
662 if (rta->rta_mp) {
663 nhs = fib_count_nexthops(rta->rta_mp);
664 if (nhs == 0)
665 goto err_inval;
666 }
667#endif
668#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
669 if (rta->rta_mp_alg) {
670 mp_alg = *rta->rta_mp_alg;
671
672 if (mp_alg < IP_MP_ALG_NONE ||
673 mp_alg > IP_MP_ALG_MAX)
674 goto err_inval;
675 }
676#endif
677
678 err = -ENOBUFS;
679 if (fib_info_cnt >= fib_hash_size) {
680 unsigned int new_size = fib_hash_size << 1;
681 struct hlist_head *new_info_hash;
682 struct hlist_head *new_laddrhash;
683 unsigned int bytes;
684
685 if (!new_size)
686 new_size = 1;
687 bytes = new_size * sizeof(struct hlist_head *);
688 new_info_hash = fib_hash_alloc(bytes);
689 new_laddrhash = fib_hash_alloc(bytes);
690 if (!new_info_hash || !new_laddrhash) {
691 fib_hash_free(new_info_hash, bytes);
692 fib_hash_free(new_laddrhash, bytes);
693 } else {
694 memset(new_info_hash, 0, bytes);
695 memset(new_laddrhash, 0, bytes);
696
697 fib_hash_move(new_info_hash, new_laddrhash, new_size);
698 }
699
700 if (!fib_hash_size)
701 goto failure;
702 }
703
704 fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
705 if (fi == NULL)
706 goto failure;
707 fib_info_cnt++;
708 memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh));
709
710 fi->fib_protocol = r->rtm_protocol;
711
712 fi->fib_nhs = nhs;
713 change_nexthops(fi) {
714 nh->nh_parent = fi;
715 } endfor_nexthops(fi)
716
717 fi->fib_flags = r->rtm_flags;
718 if (rta->rta_priority)
719 fi->fib_priority = *rta->rta_priority;
720 if (rta->rta_mx) {
721 int attrlen = RTA_PAYLOAD(rta->rta_mx);
722 struct rtattr *attr = RTA_DATA(rta->rta_mx);
723
724 while (RTA_OK(attr, attrlen)) {
725 unsigned flavor = attr->rta_type;
726 if (flavor) {
727 if (flavor > RTAX_MAX)
728 goto err_inval;
729 fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
730 }
731 attr = RTA_NEXT(attr, attrlen);
732 }
733 }
734 if (rta->rta_prefsrc)
735 memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);
736
737 if (rta->rta_mp) {
738#ifdef CONFIG_IP_ROUTE_MULTIPATH
739 if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
740 goto failure;
741 if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
742 goto err_inval;
743 if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4))
744 goto err_inval;
745#ifdef CONFIG_NET_CLS_ROUTE
746 if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4))
747 goto err_inval;
748#endif
749#else
750 goto err_inval;
751#endif
752 } else {
753 struct fib_nh *nh = fi->fib_nh;
754 if (rta->rta_oif)
755 nh->nh_oif = *rta->rta_oif;
756 if (rta->rta_gw)
757 memcpy(&nh->nh_gw, rta->rta_gw, 4);
758#ifdef CONFIG_NET_CLS_ROUTE
759 if (rta->rta_flow)
760 memcpy(&nh->nh_tclassid, rta->rta_flow, 4);
761#endif
762 nh->nh_flags = r->rtm_flags;
763#ifdef CONFIG_IP_ROUTE_MULTIPATH
764 nh->nh_weight = 1;
765#endif
766 }
767
768#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
769 fi->fib_mp_alg = mp_alg;
770#endif
771
772 if (fib_props[r->rtm_type].error) {
773 if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
774 goto err_inval;
775 goto link_it;
776 }
777
778 if (r->rtm_scope > RT_SCOPE_HOST)
779 goto err_inval;
780
781 if (r->rtm_scope == RT_SCOPE_HOST) {
782 struct fib_nh *nh = fi->fib_nh;
783
784 /* Local address is added. */
785 if (nhs != 1 || nh->nh_gw)
786 goto err_inval;
787 nh->nh_scope = RT_SCOPE_NOWHERE;
788 nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
789 err = -ENODEV;
790 if (nh->nh_dev == NULL)
791 goto failure;
792 } else {
793 change_nexthops(fi) {
794 if ((err = fib_check_nh(r, fi, nh)) != 0)
795 goto failure;
796 } endfor_nexthops(fi)
797 }
798
799 if (fi->fib_prefsrc) {
800 if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
801 memcmp(&fi->fib_prefsrc, rta->rta_dst, 4))
802 if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
803 goto err_inval;
804 }
805
806link_it:
807 if ((ofi = fib_find_info(fi)) != NULL) {
808 fi->fib_dead = 1;
809 free_fib_info(fi);
810 ofi->fib_treeref++;
811 return ofi;
812 }
813
814 fi->fib_treeref++;
815 atomic_inc(&fi->fib_clntref);
816 write_lock(&fib_info_lock);
817 hlist_add_head(&fi->fib_hash,
818 &fib_info_hash[fib_info_hashfn(fi)]);
819 if (fi->fib_prefsrc) {
820 struct hlist_head *head;
821
822 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
823 hlist_add_head(&fi->fib_lhash, head);
824 }
825 change_nexthops(fi) {
826 struct hlist_head *head;
827 unsigned int hash;
828
829 if (!nh->nh_dev)
830 continue;
831 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
832 head = &fib_info_devhash[hash];
833 hlist_add_head(&nh->nh_hash, head);
834 } endfor_nexthops(fi)
835 write_unlock(&fib_info_lock);
836 return fi;
837
838err_inval:
839 err = -EINVAL;
840
841failure:
842 *errp = err;
843 if (fi) {
844 fi->fib_dead = 1;
845 free_fib_info(fi);
846 }
847 return NULL;
848}
849
850int fib_semantic_match(struct list_head *head, const struct flowi *flp,
851 struct fib_result *res, __u32 zone, __u32 mask,
852 int prefixlen)
853{
854 struct fib_alias *fa;
855 int nh_sel = 0;
856
857 list_for_each_entry(fa, head, fa_list) {
858 int err;
859
860 if (fa->fa_tos &&
861 fa->fa_tos != flp->fl4_tos)
862 continue;
863
864 if (fa->fa_scope < flp->fl4_scope)
865 continue;
866
867 fa->fa_state |= FA_S_ACCESSED;
868
869 err = fib_props[fa->fa_type].error;
870 if (err == 0) {
871 struct fib_info *fi = fa->fa_info;
872
873 if (fi->fib_flags & RTNH_F_DEAD)
874 continue;
875
876 switch (fa->fa_type) {
877 case RTN_UNICAST:
878 case RTN_LOCAL:
879 case RTN_BROADCAST:
880 case RTN_ANYCAST:
881 case RTN_MULTICAST:
882 for_nexthops(fi) {
883 if (nh->nh_flags&RTNH_F_DEAD)
884 continue;
885 if (!flp->oif || flp->oif == nh->nh_oif)
886 break;
887 }
888#ifdef CONFIG_IP_ROUTE_MULTIPATH
889 if (nhsel < fi->fib_nhs) {
890 nh_sel = nhsel;
891 goto out_fill_res;
892 }
893#else
894 if (nhsel < 1) {
895 goto out_fill_res;
896 }
897#endif
898 endfor_nexthops(fi);
899 continue;
900
901 default:
902 printk(KERN_DEBUG "impossible 102\n");
903 return -EINVAL;
904 };
905 }
906 return err;
907 }
908 return 1;
909
910out_fill_res:
911 res->prefixlen = prefixlen;
912 res->nh_sel = nh_sel;
913 res->type = fa->fa_type;
914 res->scope = fa->fa_scope;
915 res->fi = fa->fa_info;
916#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
917 res->netmask = mask;
918 res->network = zone &
919 (0xFFFFFFFF >> (32 - prefixlen));
920#endif
921 atomic_inc(&res->fi->fib_clntref);
922 return 0;
923}
924
925/* Find appropriate source address to this destination */
926
927u32 __fib_res_prefsrc(struct fib_result *res)
928{
929 return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
930}
931
932int
933fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
934 u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
935 struct fib_info *fi)
936{
937 struct rtmsg *rtm;
938 struct nlmsghdr *nlh;
939 unsigned char *b = skb->tail;
940
941 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
942 rtm = NLMSG_DATA(nlh);
943 rtm->rtm_family = AF_INET;
944 rtm->rtm_dst_len = dst_len;
945 rtm->rtm_src_len = 0;
946 rtm->rtm_tos = tos;
947 rtm->rtm_table = tb_id;
948 rtm->rtm_type = type;
949 rtm->rtm_flags = fi->fib_flags;
950 rtm->rtm_scope = scope;
951 if (rtm->rtm_dst_len)
952 RTA_PUT(skb, RTA_DST, 4, dst);
953 rtm->rtm_protocol = fi->fib_protocol;
954 if (fi->fib_priority)
955 RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
956#ifdef CONFIG_NET_CLS_ROUTE
957 if (fi->fib_nh[0].nh_tclassid)
958 RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
959#endif
960 if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
961 goto rtattr_failure;
962 if (fi->fib_prefsrc)
963 RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
964 if (fi->fib_nhs == 1) {
965 if (fi->fib_nh->nh_gw)
966 RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw);
967 if (fi->fib_nh->nh_oif)
968 RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
969 }
970#ifdef CONFIG_IP_ROUTE_MULTIPATH
971 if (fi->fib_nhs > 1) {
972 struct rtnexthop *nhp;
973 struct rtattr *mp_head;
974 if (skb_tailroom(skb) <= RTA_SPACE(0))
975 goto rtattr_failure;
976 mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));
977
978 for_nexthops(fi) {
979 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
980 goto rtattr_failure;
981 nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
982 nhp->rtnh_flags = nh->nh_flags & 0xFF;
983 nhp->rtnh_hops = nh->nh_weight-1;
984 nhp->rtnh_ifindex = nh->nh_oif;
985 if (nh->nh_gw)
986 RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
987 nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
988 } endfor_nexthops(fi);
989 mp_head->rta_type = RTA_MULTIPATH;
990 mp_head->rta_len = skb->tail - (u8*)mp_head;
991 }
992#endif
993 nlh->nlmsg_len = skb->tail - b;
994 return skb->len;
995
996nlmsg_failure:
997rtattr_failure:
998 skb_trim(skb, b - skb->data);
999 return -1;
1000}
1001
1002#ifndef CONFIG_IP_NOSIOCRT
1003
1004int
1005fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
1006 struct kern_rta *rta, struct rtentry *r)
1007{
1008 int plen;
1009 u32 *ptr;
1010
1011 memset(rtm, 0, sizeof(*rtm));
1012 memset(rta, 0, sizeof(*rta));
1013
1014 if (r->rt_dst.sa_family != AF_INET)
1015 return -EAFNOSUPPORT;
1016
1017 /* Check mask for validity:
1018 a) it must be contiguous.
1019 b) destination must have all host bits clear.
1020 c) if application forgot to set correct family (AF_INET),
1021 reject request unless it is absolutely clear i.e.
1022 both family and mask are zero.
1023 */
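	/*
	 * Illustration (made-up values, not part of the original source):
	 * a request with RTF_HOST keeps plen = 32; otherwise a genmask of
	 * 255.255.255.0 gives plen = 24 via inet_mask_len(), while a
	 * non-contiguous mask, or a destination with host bits set under
	 * the mask, is rejected by bad_mask() below.
	 */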
1024 plen = 32;
1025 ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr;
1026 if (!(r->rt_flags&RTF_HOST)) {
1027 u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr;
1028 if (r->rt_genmask.sa_family != AF_INET) {
1029 if (mask || r->rt_genmask.sa_family)
1030 return -EAFNOSUPPORT;
1031 }
1032 if (bad_mask(mask, *ptr))
1033 return -EINVAL;
1034 plen = inet_mask_len(mask);
1035 }
1036
1037 nl->nlmsg_flags = NLM_F_REQUEST;
1038 nl->nlmsg_pid = 0;
1039 nl->nlmsg_seq = 0;
1040 nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
1041 if (cmd == SIOCDELRT) {
1042 nl->nlmsg_type = RTM_DELROUTE;
1043 nl->nlmsg_flags = 0;
1044 } else {
1045 nl->nlmsg_type = RTM_NEWROUTE;
1046 nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE;
1047 rtm->rtm_protocol = RTPROT_BOOT;
1048 }
1049
1050 rtm->rtm_dst_len = plen;
1051 rta->rta_dst = ptr;
1052
1053 if (r->rt_metric) {
1054 *(u32*)&r->rt_pad3 = r->rt_metric - 1;
1055 rta->rta_priority = (u32*)&r->rt_pad3;
1056 }
1057 if (r->rt_flags&RTF_REJECT) {
1058 rtm->rtm_scope = RT_SCOPE_HOST;
1059 rtm->rtm_type = RTN_UNREACHABLE;
1060 return 0;
1061 }
1062 rtm->rtm_scope = RT_SCOPE_NOWHERE;
1063 rtm->rtm_type = RTN_UNICAST;
1064
1065 if (r->rt_dev) {
1066 char *colon;
1067 struct net_device *dev;
1068 char devname[IFNAMSIZ];
1069
1070 if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1))
1071 return -EFAULT;
1072 devname[IFNAMSIZ-1] = 0;
1073 colon = strchr(devname, ':');
1074 if (colon)
1075 *colon = 0;
1076 dev = __dev_get_by_name(devname);
1077 if (!dev)
1078 return -ENODEV;
1079 rta->rta_oif = &dev->ifindex;
1080 if (colon) {
1081 struct in_ifaddr *ifa;
1082 struct in_device *in_dev = __in_dev_get(dev);
1083 if (!in_dev)
1084 return -ENODEV;
1085 *colon = ':';
1086 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
1087 if (strcmp(ifa->ifa_label, devname) == 0)
1088 break;
1089 if (ifa == NULL)
1090 return -ENODEV;
1091 rta->rta_prefsrc = &ifa->ifa_local;
1092 }
1093 }
1094
1095 ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr;
1096 if (r->rt_gateway.sa_family == AF_INET && *ptr) {
1097 rta->rta_gw = ptr;
1098 if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST)
1099 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1100 }
1101
1102 if (cmd == SIOCDELRT)
1103 return 0;
1104
1105 if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)
1106 return -EINVAL;
1107
1108 if (rtm->rtm_scope == RT_SCOPE_NOWHERE)
1109 rtm->rtm_scope = RT_SCOPE_LINK;
1110
1111 if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
1112 struct rtattr *rec;
1113 struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);
1114 if (mx == NULL)
1115 return -ENOMEM;
1116 rta->rta_mx = mx;
1117 mx->rta_type = RTA_METRICS;
1118 mx->rta_len = RTA_LENGTH(0);
1119 if (r->rt_flags&RTF_MTU) {
1120 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1121 rec->rta_type = RTAX_ADVMSS;
1122 rec->rta_len = RTA_LENGTH(4);
1123 mx->rta_len += RTA_LENGTH(4);
1124 *(u32*)RTA_DATA(rec) = r->rt_mtu - 40;
1125 }
1126 if (r->rt_flags&RTF_WINDOW) {
1127 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1128 rec->rta_type = RTAX_WINDOW;
1129 rec->rta_len = RTA_LENGTH(4);
1130 mx->rta_len += RTA_LENGTH(4);
1131 *(u32*)RTA_DATA(rec) = r->rt_window;
1132 }
1133 if (r->rt_flags&RTF_IRTT) {
1134 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1135 rec->rta_type = RTAX_RTT;
1136 rec->rta_len = RTA_LENGTH(4);
1137 mx->rta_len += RTA_LENGTH(4);
1138 *(u32*)RTA_DATA(rec) = r->rt_irtt<<3;
1139 }
1140 }
1141 return 0;
1142}
1143
1144#endif
1145
1146/*
1147 Update FIB if:
1148 - local address disappeared -> we must delete all the entries
1149 referring to it.
1150   - device went down -> we must shut down all nexthops going via it.
1151 */
1152
1153int fib_sync_down(u32 local, struct net_device *dev, int force)
1154{
1155 int ret = 0;
1156 int scope = RT_SCOPE_NOWHERE;
1157
1158 if (force)
1159 scope = -1;
1160
1161 if (local && fib_info_laddrhash) {
1162 unsigned int hash = fib_laddr_hashfn(local);
1163 struct hlist_head *head = &fib_info_laddrhash[hash];
1164 struct hlist_node *node;
1165 struct fib_info *fi;
1166
1167 hlist_for_each_entry(fi, node, head, fib_lhash) {
1168 if (fi->fib_prefsrc == local) {
1169 fi->fib_flags |= RTNH_F_DEAD;
1170 ret++;
1171 }
1172 }
1173 }
1174
1175 if (dev) {
1176 struct fib_info *prev_fi = NULL;
1177 unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1178 struct hlist_head *head = &fib_info_devhash[hash];
1179 struct hlist_node *node;
1180 struct fib_nh *nh;
1181
1182 hlist_for_each_entry(nh, node, head, nh_hash) {
1183 struct fib_info *fi = nh->nh_parent;
1184 int dead;
1185
1186 BUG_ON(!fi->fib_nhs);
1187 if (nh->nh_dev != dev || fi == prev_fi)
1188 continue;
1189 prev_fi = fi;
1190 dead = 0;
1191 change_nexthops(fi) {
1192 if (nh->nh_flags&RTNH_F_DEAD)
1193 dead++;
1194 else if (nh->nh_dev == dev &&
1195 nh->nh_scope != scope) {
1196 nh->nh_flags |= RTNH_F_DEAD;
1197#ifdef CONFIG_IP_ROUTE_MULTIPATH
1198 spin_lock_bh(&fib_multipath_lock);
1199 fi->fib_power -= nh->nh_power;
1200 nh->nh_power = 0;
1201 spin_unlock_bh(&fib_multipath_lock);
1202#endif
1203 dead++;
1204 }
1205#ifdef CONFIG_IP_ROUTE_MULTIPATH
1206 if (force > 1 && nh->nh_dev == dev) {
1207 dead = fi->fib_nhs;
1208 break;
1209 }
1210#endif
1211 } endfor_nexthops(fi)
1212 if (dead == fi->fib_nhs) {
1213 fi->fib_flags |= RTNH_F_DEAD;
1214 ret++;
1215 }
1216 }
1217 }
1218
1219 return ret;
1220}
1221
1222#ifdef CONFIG_IP_ROUTE_MULTIPATH
1223
1224/*
1225 Dead device goes up. We wake up dead nexthops.
1226   This makes sense only on multipath routes.
1227 */
1228
1229int fib_sync_up(struct net_device *dev)
1230{
1231 struct fib_info *prev_fi;
1232 unsigned int hash;
1233 struct hlist_head *head;
1234 struct hlist_node *node;
1235 struct fib_nh *nh;
1236 int ret;
1237
1238 if (!(dev->flags&IFF_UP))
1239 return 0;
1240
1241 prev_fi = NULL;
1242 hash = fib_devindex_hashfn(dev->ifindex);
1243 head = &fib_info_devhash[hash];
1244 ret = 0;
1245
1246 hlist_for_each_entry(nh, node, head, nh_hash) {
1247 struct fib_info *fi = nh->nh_parent;
1248 int alive;
1249
1250 BUG_ON(!fi->fib_nhs);
1251 if (nh->nh_dev != dev || fi == prev_fi)
1252 continue;
1253
1254 prev_fi = fi;
1255 alive = 0;
1256 change_nexthops(fi) {
1257 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1258 alive++;
1259 continue;
1260 }
1261 if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1262 continue;
1263 if (nh->nh_dev != dev || __in_dev_get(dev) == NULL)
1264 continue;
1265 alive++;
1266 spin_lock_bh(&fib_multipath_lock);
1267 nh->nh_power = 0;
1268 nh->nh_flags &= ~RTNH_F_DEAD;
1269 spin_unlock_bh(&fib_multipath_lock);
1270 } endfor_nexthops(fi)
1271
1272 if (alive > 0) {
1273 fi->fib_flags &= ~RTNH_F_DEAD;
1274 ret++;
1275 }
1276 }
1277
1278 return ret;
1279}
1280
1281/*
1282 The algorithm is suboptimal, but it provides really
1283 fair weighted route distribution.
1284 */
1285
1286void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1287{
1288 struct fib_info *fi = res->fi;
1289 int w;
1290
1291 spin_lock_bh(&fib_multipath_lock);
1292 if (fi->fib_power <= 0) {
1293 int power = 0;
1294 change_nexthops(fi) {
1295 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1296 power += nh->nh_weight;
1297 nh->nh_power = nh->nh_weight;
1298 }
1299 } endfor_nexthops(fi);
1300 fi->fib_power = power;
1301 if (power <= 0) {
1302 spin_unlock_bh(&fib_multipath_lock);
1303 /* Race condition: route has just become dead. */
1304 res->nh_sel = 0;
1305 return;
1306 }
1307 }
1308
1309
1310	/* w should be a random number in [0..fi->fib_power-1];
1311	   jiffies is a pretty bad approximation of one.
1312 */
1313
1314 w = jiffies % fi->fib_power;
1315
1316 change_nexthops(fi) {
1317 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1318 if ((w -= nh->nh_power) <= 0) {
1319 nh->nh_power--;
1320 fi->fib_power--;
1321 res->nh_sel = nhsel;
1322 spin_unlock_bh(&fib_multipath_lock);
1323 return;
1324 }
1325 }
1326 } endfor_nexthops(fi);
1327
1328 /* Race condition: route has just become dead. */
1329 res->nh_sel = 0;
1330 spin_unlock_bh(&fib_multipath_lock);
1331}
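
/*
 * Worked example (illustration only, not part of the original source):
 * with two live nexthops of weight 3 and 1, fib_power starts at 4 and
 * nh_power at {3, 1}.  Each selection subtracts nexthop powers from
 * w = jiffies % fib_power until w drops to zero or below, then decrements
 * that nexthop's power and fib_power.  Over one refill cycle of four
 * selections the first nexthop is chosen three times and the second once,
 * i.e. traffic is split 3:1 as configured via rtnh_hops + 1.
 */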
1332#endif
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
new file mode 100644
index 000000000000..85bf0d3e294b
--- /dev/null
+++ b/net/ipv4/icmp.c
@@ -0,0 +1,1143 @@
1/*
2 * NET3: Implementation of the ICMP protocol layer.
3 *
4 * Alan Cox, <alan@redhat.com>
5 *
6 * Version: $Id: icmp.c,v 1.85 2002/02/01 22:01:03 davem Exp $
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Some of the function names and the icmp unreach table for this
14 * module were derived from [icmp.c 1.0.11 06/02/93] by
15 * Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting.
16 * Other than that this module is a complete rewrite.
17 *
18 * Fixes:
19 * Clemens Fruhwirth : introduce global icmp rate limiting
20 * with icmp type masking ability instead
21 * of broken per type icmp timeouts.
22 * Mike Shaver : RFC1122 checks.
23 * Alan Cox : Multicast ping reply as self.
24 * Alan Cox : Fix atomicity lockup in ip_build_xmit
25 * call.
26 * Alan Cox : Added 216,128 byte paths to the MTU
27 * code.
28 * Martin Mares : RFC1812 checks.
29 * Martin Mares : Can be configured to follow redirects
30 * if acting as a router _without_ a
31 * routing protocol (RFC 1812).
32 * Martin Mares : Echo requests may be configured to
33 * be ignored (RFC 1812).
34 * Martin Mares : Limitation of ICMP error message
35 * transmit rate (RFC 1812).
36 * Martin Mares : TOS and Precedence set correctly
37 * (RFC 1812).
38 * Martin Mares : Now copying as much data from the
39 * original packet as we can without
40 * exceeding 576 bytes (RFC 1812).
41 * Willy Konynenberg : Transparent proxying support.
42 * Keith Owens : RFC1191 correction for 4.2BSD based
43 * path MTU bug.
44 * Thomas Quinot : ICMP Dest Unreach codes up to 15 are
45 * valid (RFC 1812).
46 * Andi Kleen : Check all packet lengths properly
47 * and moved all kfree_skb() up to
48 * icmp_rcv.
49 * Andi Kleen : Move the rate limit bookkeeping
50 * into the dest entry and use a token
51 * bucket filter (thanks to ANK). Make
52 * the rates sysctl configurable.
53 * Yu Tianli : Fixed two ugly bugs in icmp_send
54 * - IP option length was accounted wrongly
55 * - ICMP header length was not accounted
56 * at all.
57 * Tristan Greaves : Added sysctl option to ignore bogus
58 * broadcast responses from broken routers.
59 *
60 * To Fix:
61 *
62 * - Should use skb_pull() instead of all the manual checking.
63 *	  This would also greatly simplify some upper layer error handlers. --AK
64 *
65 */
66
67#include <linux/config.h>
68#include <linux/module.h>
69#include <linux/types.h>
70#include <linux/jiffies.h>
71#include <linux/kernel.h>
72#include <linux/fcntl.h>
73#include <linux/socket.h>
74#include <linux/in.h>
75#include <linux/inet.h>
76#include <linux/netdevice.h>
77#include <linux/string.h>
78#include <linux/netfilter_ipv4.h>
79#include <net/snmp.h>
80#include <net/ip.h>
81#include <net/route.h>
82#include <net/protocol.h>
83#include <net/icmp.h>
84#include <net/tcp.h>
85#include <net/udp.h>
86#include <net/raw.h>
87#include <linux/skbuff.h>
88#include <net/sock.h>
89#include <linux/errno.h>
90#include <linux/timer.h>
91#include <linux/init.h>
92#include <asm/system.h>
93#include <asm/uaccess.h>
94#include <net/checksum.h>
95
96/*
97 * Build xmit assembly blocks
98 */
99
100struct icmp_bxm {
101 struct sk_buff *skb;
102 int offset;
103 int data_len;
104
105 struct {
106 struct icmphdr icmph;
107 __u32 times[3];
108 } data;
109 int head_len;
110 struct ip_options replyopts;
111 unsigned char optbuf[40];
112};
113
114/*
115 * Statistics
116 */
117DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics);
118
119/* An array of errno values for error messages from dest unreach. */
120/* RFC 1122: 3.2.2.1 states that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
121
122struct icmp_err icmp_err_convert[] = {
123 {
124 .errno = ENETUNREACH, /* ICMP_NET_UNREACH */
125 .fatal = 0,
126 },
127 {
128 .errno = EHOSTUNREACH, /* ICMP_HOST_UNREACH */
129 .fatal = 0,
130 },
131 {
132 .errno = ENOPROTOOPT /* ICMP_PROT_UNREACH */,
133 .fatal = 1,
134 },
135 {
136 .errno = ECONNREFUSED, /* ICMP_PORT_UNREACH */
137 .fatal = 1,
138 },
139 {
140 .errno = EMSGSIZE, /* ICMP_FRAG_NEEDED */
141 .fatal = 0,
142 },
143 {
144 .errno = EOPNOTSUPP, /* ICMP_SR_FAILED */
145 .fatal = 0,
146 },
147 {
148 .errno = ENETUNREACH, /* ICMP_NET_UNKNOWN */
149 .fatal = 1,
150 },
151 {
152 .errno = EHOSTDOWN, /* ICMP_HOST_UNKNOWN */
153 .fatal = 1,
154 },
155 {
156 .errno = ENONET, /* ICMP_HOST_ISOLATED */
157 .fatal = 1,
158 },
159 {
160 .errno = ENETUNREACH, /* ICMP_NET_ANO */
161 .fatal = 1,
162 },
163 {
164 .errno = EHOSTUNREACH, /* ICMP_HOST_ANO */
165 .fatal = 1,
166 },
167 {
168 .errno = ENETUNREACH, /* ICMP_NET_UNR_TOS */
169 .fatal = 0,
170 },
171 {
172 .errno = EHOSTUNREACH, /* ICMP_HOST_UNR_TOS */
173 .fatal = 0,
174 },
175 {
176 .errno = EHOSTUNREACH, /* ICMP_PKT_FILTERED */
177 .fatal = 1,
178 },
179 {
180 .errno = EHOSTUNREACH, /* ICMP_PREC_VIOLATION */
181 .fatal = 1,
182 },
183 {
184 .errno = EHOSTUNREACH, /* ICMP_PREC_CUTOFF */
185 .fatal = 1,
186 },
187};
188
189/* Control parameters for ECHO replies. */
190int sysctl_icmp_echo_ignore_all;
191int sysctl_icmp_echo_ignore_broadcasts;
192
193/* Control parameter - ignore bogus broadcast responses? */
194int sysctl_icmp_ignore_bogus_error_responses;
195
196/*
197 * Configurable global rate limit.
198 *
199 * ratelimit defines tokens/packet consumed for the dst->rate_tokens bucket
200 * ratemask defines which icmp types are rate limited by setting
201 * their bit positions.
202 *
203 * default:
204 * dest unreachable (3), source quench (4),
205 * time exceeded (11), parameter problem (12)
206 */
207
208int sysctl_icmp_ratelimit = 1 * HZ;
209int sysctl_icmp_ratemask = 0x1818;
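/*
 * Worked example (illustration only, not part of the original source):
 * the default mask above is the OR of the bits for the four types listed
 * in the comment,
 *
 *      (1 << 3) | (1 << 4) | (1 << 11) | (1 << 12) == 0x1818,
 *
 * so icmpv4_xrlim_allow() below rate limits exactly those types and lets
 * every other type through unthrottled.
 */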
210
211/*
212 * ICMP control array. This specifies what to do with each ICMP.
213 */
214
215struct icmp_control {
216 int output_entry; /* Field for increment on output */
217 int input_entry; /* Field for increment on input */
218 void (*handler)(struct sk_buff *skb);
219 short error; /* This ICMP is classed as an error message */
220};
221
222static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
223
224/*
225 * The ICMP socket(s). This is the most convenient way to flow control
226 * our ICMP output as well as maintain a clean interface throughout
227 * all layers. All Socketless IP sends will soon be gone.
228 *
229 * On SMP we have one ICMP socket per-cpu.
230 */
231static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL;
232#define icmp_socket __get_cpu_var(__icmp_socket)
233
234static __inline__ int icmp_xmit_lock(void)
235{
236 local_bh_disable();
237
238 if (unlikely(!spin_trylock(&icmp_socket->sk->sk_lock.slock))) {
239 /* This can happen if the output path signals a
240 * dst_link_failure() for an outgoing ICMP packet.
241 */
242 local_bh_enable();
243 return 1;
244 }
245 return 0;
246}
247
248static void icmp_xmit_unlock(void)
249{
250 spin_unlock_bh(&icmp_socket->sk->sk_lock.slock);
251}
252
253/*
254 * Send an ICMP frame.
255 */
256
257/*
258 * Check transmit rate limitation for given message.
259 * The rate information is held in the destination cache now.
260 * This function is generic and could be used for other purposes
261 * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
262 *
263 * Note that the same dst_entry fields are modified by functions in
264 * route.c too, but these work for packet destinations while xrlim_allow
265 * works for icmp destinations. This means the rate limiting information
266 * for one "ip object" is shared - and these ICMPs are twice limited:
267 * by source and by destination.
268 *
269 * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
270 * SHOULD allow setting of rate limits
271 *
272 * Shared between ICMPv4 and ICMPv6.
273 */
274#define XRLIM_BURST_FACTOR 6
275int xrlim_allow(struct dst_entry *dst, int timeout)
276{
277 unsigned long now;
278 int rc = 0;
279
280 now = jiffies;
281 dst->rate_tokens += now - dst->rate_last;
282 dst->rate_last = now;
283 if (dst->rate_tokens > XRLIM_BURST_FACTOR * timeout)
284 dst->rate_tokens = XRLIM_BURST_FACTOR * timeout;
285 if (dst->rate_tokens >= timeout) {
286 dst->rate_tokens -= timeout;
287 rc = 1;
288 }
289 return rc;
290}
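
/*
 * Worked example (illustration only, not part of the original source):
 * with timeout = sysctl_icmp_ratelimit = 1*HZ, the bucket above holds at
 * most XRLIM_BURST_FACTOR * HZ = 6*HZ tokens (a burst of 6 packets), each
 * transmitted packet consumes HZ tokens, and an idle destination earns one
 * token per jiffy, so the steady-state rate is one rate-limited ICMP per
 * destination per second.
 */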
291
292static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code)
293{
294 struct dst_entry *dst = &rt->u.dst;
295 int rc = 1;
296
297 if (type > NR_ICMP_TYPES)
298 goto out;
299
300 /* Don't limit PMTU discovery. */
301 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
302 goto out;
303
304 /* No rate limit on loopback */
305 if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
306 goto out;
307
308 /* Limit if icmp type is enabled in ratemask. */
309 if ((1 << type) & sysctl_icmp_ratemask)
310 rc = xrlim_allow(dst, sysctl_icmp_ratelimit);
311out:
312 return rc;
313}
314
315/*
316 * Maintain the counters used in the SNMP statistics for outgoing ICMP
317 */
318static void icmp_out_count(int type)
319{
320 if (type <= NR_ICMP_TYPES) {
321 ICMP_INC_STATS(icmp_pointers[type].output_entry);
322 ICMP_INC_STATS(ICMP_MIB_OUTMSGS);
323 }
324}
325
326/*
327 * Checksum each fragment, and on the first include the headers and final
328 * checksum.
329 */
330static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
331 struct sk_buff *skb)
332{
333 struct icmp_bxm *icmp_param = (struct icmp_bxm *)from;
334 unsigned int csum;
335
336 csum = skb_copy_and_csum_bits(icmp_param->skb,
337 icmp_param->offset + offset,
338 to, len, 0);
339
340 skb->csum = csum_block_add(skb->csum, csum, odd);
341 if (icmp_pointers[icmp_param->data.icmph.type].error)
342 nf_ct_attach(skb, icmp_param->skb);
343 return 0;
344}
345
346static void icmp_push_reply(struct icmp_bxm *icmp_param,
347 struct ipcm_cookie *ipc, struct rtable *rt)
348{
349 struct sk_buff *skb;
350
351 ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param,
352 icmp_param->data_len+icmp_param->head_len,
353 icmp_param->head_len,
354 ipc, rt, MSG_DONTWAIT);
355
356 if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) {
357 struct icmphdr *icmph = skb->h.icmph;
358 unsigned int csum = 0;
359 struct sk_buff *skb1;
360
361 skb_queue_walk(&icmp_socket->sk->sk_write_queue, skb1) {
362 csum = csum_add(csum, skb1->csum);
363 }
364 csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
365 (char *)icmph,
366 icmp_param->head_len, csum);
367 icmph->checksum = csum_fold(csum);
368 skb->ip_summed = CHECKSUM_NONE;
369 ip_push_pending_frames(icmp_socket->sk);
370 }
371}
372
373/*
374 * Driving logic for building and sending ICMP messages.
375 */
376
377static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
378{
379 struct sock *sk = icmp_socket->sk;
380 struct inet_sock *inet = inet_sk(sk);
381 struct ipcm_cookie ipc;
382 struct rtable *rt = (struct rtable *)skb->dst;
383 u32 daddr;
384
385 if (ip_options_echo(&icmp_param->replyopts, skb))
386 goto out;
387
388 if (icmp_xmit_lock())
389 return;
390
391 icmp_param->data.icmph.checksum = 0;
392 icmp_out_count(icmp_param->data.icmph.type);
393
394 inet->tos = skb->nh.iph->tos;
395 daddr = ipc.addr = rt->rt_src;
396 ipc.opt = NULL;
397 if (icmp_param->replyopts.optlen) {
398 ipc.opt = &icmp_param->replyopts;
399 if (ipc.opt->srr)
400 daddr = icmp_param->replyopts.faddr;
401 }
402 {
403 struct flowi fl = { .nl_u = { .ip4_u =
404 { .daddr = daddr,
405 .saddr = rt->rt_spec_dst,
406 .tos = RT_TOS(skb->nh.iph->tos) } },
407 .proto = IPPROTO_ICMP };
408 if (ip_route_output_key(&rt, &fl))
409 goto out_unlock;
410 }
411 if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type,
412 icmp_param->data.icmph.code))
413 icmp_push_reply(icmp_param, &ipc, rt);
414 ip_rt_put(rt);
415out_unlock:
416 icmp_xmit_unlock();
417out:;
418}
419
420
421/*
422 * Send an ICMP message in response to a situation
423 *
424 * RFC 1122: 3.2.2 MUST send at least the IP header and 8 bytes of header.
425 * MAY send more (we do).
426 * MUST NOT change this header information.
427 * MUST NOT reply to a multicast/broadcast IP address.
428 * MUST NOT reply to a multicast/broadcast MAC address.
429 * MUST reply to only the first fragment.
430 */
431
432void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info)
433{
434 struct iphdr *iph;
435 int room;
436 struct icmp_bxm icmp_param;
437 struct rtable *rt = (struct rtable *)skb_in->dst;
438 struct ipcm_cookie ipc;
439 u32 saddr;
440 u8 tos;
441
442 if (!rt)
443 goto out;
444
445 /*
446 * Find the original header. It is expected to be valid, of course.
447	 * Check this anyway; icmp_send is sometimes called from the most
448	 * obscure devices.
449 */
450 iph = skb_in->nh.iph;
451
452 if ((u8 *)iph < skb_in->head || (u8 *)(iph + 1) > skb_in->tail)
453 goto out;
454
455 /*
456 * No replies to physical multicast/broadcast
457 */
458 if (skb_in->pkt_type != PACKET_HOST)
459 goto out;
460
461 /*
462 * Now check at the protocol level
463 */
464 if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
465 goto out;
466
467 /*
468 * Only reply to fragment 0. We byte re-order the constant
469 * mask for efficiency.
470 */
471 if (iph->frag_off & htons(IP_OFFSET))
472 goto out;
473
474 /*
475	 * If we send an ICMP error in response to an ICMP error, a mess would result.
476 */
477 if (icmp_pointers[type].error) {
478 /*
479 * We are an error, check if we are replying to an
480 * ICMP error
481 */
482 if (iph->protocol == IPPROTO_ICMP) {
483 u8 _inner_type, *itp;
484
485 itp = skb_header_pointer(skb_in,
486 skb_in->nh.raw +
487 (iph->ihl << 2) +
488 offsetof(struct icmphdr,
489 type) -
490 skb_in->data,
491 sizeof(_inner_type),
492 &_inner_type);
493 if (itp == NULL)
494 goto out;
495
496 /*
497 * Assume any unknown ICMP type is an error. This
498 * isn't specified by the RFC, but think about it..
499 */
500 if (*itp > NR_ICMP_TYPES ||
501 icmp_pointers[*itp].error)
502 goto out;
503 }
504 }
505
506 if (icmp_xmit_lock())
507 return;
508
509 /*
510 * Construct source address and options.
511 */
512
513 saddr = iph->daddr;
514 if (!(rt->rt_flags & RTCF_LOCAL))
515 saddr = 0;
516
517 tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
518 IPTOS_PREC_INTERNETCONTROL) :
519 iph->tos;
520
521 if (ip_options_echo(&icmp_param.replyopts, skb_in))
522 goto ende;
523
524
525 /*
526 * Prepare data for ICMP header.
527 */
528
529 icmp_param.data.icmph.type = type;
530 icmp_param.data.icmph.code = code;
531 icmp_param.data.icmph.un.gateway = info;
532 icmp_param.data.icmph.checksum = 0;
533 icmp_param.skb = skb_in;
534 icmp_param.offset = skb_in->nh.raw - skb_in->data;
535 icmp_out_count(icmp_param.data.icmph.type);
536 inet_sk(icmp_socket->sk)->tos = tos;
537 ipc.addr = iph->saddr;
538 ipc.opt = &icmp_param.replyopts;
539
540 {
541 struct flowi fl = {
542 .nl_u = {
543 .ip4_u = {
544 .daddr = icmp_param.replyopts.srr ?
545 icmp_param.replyopts.faddr :
546 iph->saddr,
547 .saddr = saddr,
548 .tos = RT_TOS(tos)
549 }
550 },
551 .proto = IPPROTO_ICMP,
552 .uli_u = {
553 .icmpt = {
554 .type = type,
555 .code = code
556 }
557 }
558 };
559 if (ip_route_output_key(&rt, &fl))
560 goto out_unlock;
561 }
562
563 if (!icmpv4_xrlim_allow(rt, type, code))
564 goto ende;
565
566 /* RFC says return as much as we can without exceeding 576 bytes. */
567
568 room = dst_mtu(&rt->u.dst);
569 if (room > 576)
570 room = 576;
571 room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
572 room -= sizeof(struct icmphdr);
573
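	/*
	 * Worked example (illustration only, not part of the original
	 * source): on a path with MTU >= 576 and no echoed IP options,
	 * room = 576 - 20 - 8 = 548, so at most 548 bytes of the
	 * offending datagram are quoted in the ICMP error below.
	 */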
574 icmp_param.data_len = skb_in->len - icmp_param.offset;
575 if (icmp_param.data_len > room)
576 icmp_param.data_len = room;
577 icmp_param.head_len = sizeof(struct icmphdr);
578
579 icmp_push_reply(&icmp_param, &ipc, rt);
580ende:
581 ip_rt_put(rt);
582out_unlock:
583 icmp_xmit_unlock();
584out:;
585}
586
587
588/*
589 * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH.
590 */
591
592static void icmp_unreach(struct sk_buff *skb)
593{
594 struct iphdr *iph;
595 struct icmphdr *icmph;
596 int hash, protocol;
597 struct net_protocol *ipprot;
598 struct sock *raw_sk;
599 u32 info = 0;
600
601 /*
602 * Incomplete header ?
603	 * Only checks for the IP header; there should be an
604	 * additional check for longer headers at upper levels.
605 */
606
607 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
608 goto out_err;
609
610 icmph = skb->h.icmph;
611 iph = (struct iphdr *)skb->data;
612
613 if (iph->ihl < 5) /* Mangled header, drop. */
614 goto out_err;
615
616 if (icmph->type == ICMP_DEST_UNREACH) {
617 switch (icmph->code & 15) {
618 case ICMP_NET_UNREACH:
619 case ICMP_HOST_UNREACH:
620 case ICMP_PROT_UNREACH:
621 case ICMP_PORT_UNREACH:
622 break;
623 case ICMP_FRAG_NEEDED:
624 if (ipv4_config.no_pmtu_disc) {
625 LIMIT_NETDEBUG(
626 printk(KERN_INFO "ICMP: %u.%u.%u.%u: "
627 "fragmentation needed "
628 "and DF set.\n",
629 NIPQUAD(iph->daddr)));
630 } else {
631 info = ip_rt_frag_needed(iph,
632 ntohs(icmph->un.frag.mtu));
633 if (!info)
634 goto out;
635 }
636 break;
637 case ICMP_SR_FAILED:
638 LIMIT_NETDEBUG(
639 printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
640 "Route Failed.\n",
641 NIPQUAD(iph->daddr)));
642 break;
643 default:
644 break;
645 }
646 if (icmph->code > NR_ICMP_UNREACH)
647 goto out;
648 } else if (icmph->type == ICMP_PARAMETERPROB)
649 info = ntohl(icmph->un.gateway) >> 24;
650
651 /*
652 * Throw it at our lower layers
653 *
654 * RFC 1122: 3.2.2 MUST extract the protocol ID from the passed
655 * header.
656 * RFC 1122: 3.2.2.1 MUST pass ICMP unreach messages to the
657 * transport layer.
658 * RFC 1122: 3.2.2.2 MUST pass ICMP time expired messages to
659 * transport layer.
660 */
661
662 /*
663	 * Check that the other end isn't violating RFC 1122. Some routers send
664	 * bogus responses to broadcast frames. If you see this message,
665	 * first check that your netmasks match at both ends; if they do,
666	 * get the other vendor to fix their kit.
667 */
668
669 if (!sysctl_icmp_ignore_bogus_error_responses &&
670 inet_addr_type(iph->daddr) == RTN_BROADCAST) {
671 if (net_ratelimit())
672 printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP "
673 "type %u, code %u "
674 "error to a broadcast: %u.%u.%u.%u on %s\n",
675 NIPQUAD(skb->nh.iph->saddr),
676 icmph->type, icmph->code,
677 NIPQUAD(iph->daddr),
678 skb->dev->name);
679 goto out;
680 }
681
682	/* Pull in the full IP header plus 8 bytes of protocol to
683	 * avoid additional coding in the protocol handlers.
684 */
685 if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
686 goto out;
687
688 iph = (struct iphdr *)skb->data;
689 protocol = iph->protocol;
690
691 /*
692 * Deliver ICMP message to raw sockets. Pretty useless feature?
693 */
694
695 /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
696 hash = protocol & (MAX_INET_PROTOS - 1);
697 read_lock(&raw_v4_lock);
698 if ((raw_sk = sk_head(&raw_v4_htable[hash])) != NULL) {
699 while ((raw_sk = __raw_v4_lookup(raw_sk, protocol, iph->daddr,
700 iph->saddr,
701 skb->dev->ifindex)) != NULL) {
702 raw_err(raw_sk, skb, info);
703 raw_sk = sk_next(raw_sk);
704 iph = (struct iphdr *)skb->data;
705 }
706 }
707 read_unlock(&raw_v4_lock);
708
709 rcu_read_lock();
710 ipprot = rcu_dereference(inet_protos[hash]);
711 if (ipprot && ipprot->err_handler)
712 ipprot->err_handler(skb, info);
713 rcu_read_unlock();
714
715out:
716 return;
717out_err:
718 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
719 goto out;
720}
721
722
723/*
724 * Handle ICMP_REDIRECT.
725 */
726
727static void icmp_redirect(struct sk_buff *skb)
728{
729 struct iphdr *iph;
730 unsigned long ip;
731
732 if (skb->len < sizeof(struct iphdr))
733 goto out_err;
734
735 /*
736 * Get the copied header of the packet that caused the redirect
737 */
738 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
739 goto out;
740
741 iph = (struct iphdr *)skb->data;
742 ip = iph->daddr;
743
744 switch (skb->h.icmph->code & 7) {
745 case ICMP_REDIR_NET:
746 case ICMP_REDIR_NETTOS:
747 /*
748 * As per RFC recommendations now handle it as a host redirect.
749 */
750 case ICMP_REDIR_HOST:
751 case ICMP_REDIR_HOSTTOS:
752 ip_rt_redirect(skb->nh.iph->saddr, ip, skb->h.icmph->un.gateway,
753 iph->saddr, iph->tos, skb->dev);
754 break;
755 }
756out:
757 return;
758out_err:
759 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
760 goto out;
761}
762
763/*
764 * Handle ICMP_ECHO ("ping") requests.
765 *
766 * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
767 * requests.
768 * RFC 1122: 3.2.2.6 Data received in the ICMP_ECHO request MUST be
769 * included in the reply.
770 * RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring
771 * echo requests, MUST have default=NOT.
772 * See also WRT handling of options once they are done and working.
773 */
774
775static void icmp_echo(struct sk_buff *skb)
776{
777 if (!sysctl_icmp_echo_ignore_all) {
778 struct icmp_bxm icmp_param;
779
780 icmp_param.data.icmph = *skb->h.icmph;
781 icmp_param.data.icmph.type = ICMP_ECHOREPLY;
782 icmp_param.skb = skb;
783 icmp_param.offset = 0;
784 icmp_param.data_len = skb->len;
785 icmp_param.head_len = sizeof(struct icmphdr);
786 icmp_reply(&icmp_param, skb);
787 }
788}
789
790/*
791 * Handle ICMP Timestamp requests.
792 * RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests.
793 * SHOULD be in the kernel for minimum random latency.
794 * MUST be accurate to a few minutes.
795 * MUST be updated at least at 15Hz.
796 */
797static void icmp_timestamp(struct sk_buff *skb)
798{
799 struct timeval tv;
800 struct icmp_bxm icmp_param;
801 /*
802 * Too short.
803 */
804 if (skb->len < 4)
805 goto out_err;
806
807 /*
808 * Fill in the current time as ms since midnight UT:
809 */
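	/*
	 * Worked example (illustration only, not part of the original
	 * source): at 01:00:00.250 UT, tv.tv_sec % 86400 == 3600, so the
	 * value stored below is 3600 * 1000 + 250 == 3600250 ms.
	 */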
810 do_gettimeofday(&tv);
811 icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * 1000 +
812 tv.tv_usec / 1000);
813 icmp_param.data.times[2] = icmp_param.data.times[1];
814 if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
815 BUG();
816 icmp_param.data.icmph = *skb->h.icmph;
817 icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY;
818 icmp_param.data.icmph.code = 0;
819 icmp_param.skb = skb;
820 icmp_param.offset = 0;
821 icmp_param.data_len = 0;
822 icmp_param.head_len = sizeof(struct icmphdr) + 12;
823 icmp_reply(&icmp_param, skb);
824out:
825 return;
826out_err:
827 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
828 goto out;
829}
830
831
832/*
833 * Handle ICMP_ADDRESS_MASK requests. (RFC950)
834 *
835 * RFC1122 (3.2.2.9). A host MUST only send replies to
836 * ADDRESS_MASK requests if it's been configured as an address mask
837 * agent. Receiving a request doesn't constitute implicit permission to
838 * act as one. Of course, implementing this correctly requires (SHOULD)
839 * a way to turn the functionality on and off. Another one for sysctl(),
840 * I guess. -- MS
841 *
842 * RFC1812 (4.3.3.9). A router MUST implement it.
843 * A router SHOULD have a switch for turning it on/off.
844 * This switch MUST be ON by default.
845 *
846 * Gratuitous replies and zero-source replies are not implemented,
847 * which complies with the RFC. DO NOT implement them!!! The whole idea
848 * of broadcast addrmask replies as specified in RFC950 is broken.
849 * The problem is that it is not uncommon to have several prefixes
850 * on one physical interface. Moreover, the addrmask agent may not
851 * even be aware of the other prefixes.
852 * If the source is zero, the addrmask agent cannot choose the correct prefix.
853 * Gratuitous mask announcements suffer from the same problem.
854 * RFC1812 explains this, but still allows the use of ADDRMASK,
855 * which is pretty silly. --ANK
856 *
857 * All these rules are so bizarre that I removed kernel addrmask
858 * support entirely. It is wrong, it is obsolete, and nobody uses it
859 * in any case. --ANK
860 *
861 * Furthermore you can do it with a usermode address agent program
862 * anyway...
863 */
864
865static void icmp_address(struct sk_buff *skb)
866{
867#if 0
868 if (net_ratelimit())
869 printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n");
870#endif
871}
872
873/*
874 * RFC1812 (4.3.3.9). A router SHOULD listen to all replies, and complain
875 * loudly if an inconsistency is found.
876 */
877
878static void icmp_address_reply(struct sk_buff *skb)
879{
880 struct rtable *rt = (struct rtable *)skb->dst;
881 struct net_device *dev = skb->dev;
882 struct in_device *in_dev;
883 struct in_ifaddr *ifa;
884
885 if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
886 goto out;
887
888 in_dev = in_dev_get(dev);
889 if (!in_dev)
890 goto out;
891 rcu_read_lock();
892 if (in_dev->ifa_list &&
893 IN_DEV_LOG_MARTIANS(in_dev) &&
894 IN_DEV_FORWARD(in_dev)) {
895 u32 _mask, *mp;
896
897 mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask);
898 if (mp == NULL)
899 BUG();
900 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
901 if (*mp == ifa->ifa_mask &&
902 inet_ifa_match(rt->rt_src, ifa))
903 break;
904 }
905 if (!ifa && net_ratelimit()) {
906 printk(KERN_INFO "Wrong address mask %u.%u.%u.%u from "
907 "%s/%u.%u.%u.%u\n",
908 NIPQUAD(*mp), dev->name, NIPQUAD(rt->rt_src));
909 }
910 }
911 rcu_read_unlock();
912 in_dev_put(in_dev);
913out:;
914}
915
916static void icmp_discard(struct sk_buff *skb)
917{
918}
919
920/*
921 * Deal with incoming ICMP packets.
922 */
923int icmp_rcv(struct sk_buff *skb)
924{
925 struct icmphdr *icmph;
926 struct rtable *rt = (struct rtable *)skb->dst;
927
928 ICMP_INC_STATS_BH(ICMP_MIB_INMSGS);
929
930 switch (skb->ip_summed) {
931 case CHECKSUM_HW:
932 if (!(u16)csum_fold(skb->csum))
933 break;
934 NETDEBUG(if (net_ratelimit())
935 printk(KERN_DEBUG "icmp v4 hw csum failure\n"));
936 case CHECKSUM_NONE:
937 if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
938 goto error;
939 default:;
940 }
941
942 if (!pskb_pull(skb, sizeof(struct icmphdr)))
943 goto error;
944
945 icmph = skb->h.icmph;
946
947 /*
948 * 18 is the highest 'known' ICMP type. Anything else is a mystery
949 *
950	 * RFC 1122: 3.2.2 Unknown ICMP message types MUST be silently
951 * discarded.
952 */
953 if (icmph->type > NR_ICMP_TYPES)
954 goto error;
955
956
957 /*
958 * Parse the ICMP message
959 */
960
961 if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
962 /*
963 * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
964 * silently ignored (we let user decide with a sysctl).
965 * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
966 * discarded if to broadcast/multicast.
967 */
968 if (icmph->type == ICMP_ECHO &&
969 sysctl_icmp_echo_ignore_broadcasts) {
970 goto error;
971 }
972 if (icmph->type != ICMP_ECHO &&
973 icmph->type != ICMP_TIMESTAMP &&
974 icmph->type != ICMP_ADDRESS &&
975 icmph->type != ICMP_ADDRESSREPLY) {
976 goto error;
977 }
978 }
979
980 ICMP_INC_STATS_BH(icmp_pointers[icmph->type].input_entry);
981 icmp_pointers[icmph->type].handler(skb);
982
983drop:
984 kfree_skb(skb);
985 return 0;
986error:
987 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
988 goto drop;
989}
990
991/*
992 * This table is the definition of how we handle ICMP.
993 */
994static struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
995 [ICMP_ECHOREPLY] = {
996 .output_entry = ICMP_MIB_OUTECHOREPS,
997 .input_entry = ICMP_MIB_INECHOREPS,
998 .handler = icmp_discard,
999 },
1000 [1] = {
1001 .output_entry = ICMP_MIB_DUMMY,
1002 .input_entry = ICMP_MIB_INERRORS,
1003 .handler = icmp_discard,
1004 .error = 1,
1005 },
1006 [2] = {
1007 .output_entry = ICMP_MIB_DUMMY,
1008 .input_entry = ICMP_MIB_INERRORS,
1009 .handler = icmp_discard,
1010 .error = 1,
1011 },
1012 [ICMP_DEST_UNREACH] = {
1013 .output_entry = ICMP_MIB_OUTDESTUNREACHS,
1014 .input_entry = ICMP_MIB_INDESTUNREACHS,
1015 .handler = icmp_unreach,
1016 .error = 1,
1017 },
1018 [ICMP_SOURCE_QUENCH] = {
1019 .output_entry = ICMP_MIB_OUTSRCQUENCHS,
1020 .input_entry = ICMP_MIB_INSRCQUENCHS,
1021 .handler = icmp_unreach,
1022 .error = 1,
1023 },
1024 [ICMP_REDIRECT] = {
1025 .output_entry = ICMP_MIB_OUTREDIRECTS,
1026 .input_entry = ICMP_MIB_INREDIRECTS,
1027 .handler = icmp_redirect,
1028 .error = 1,
1029 },
1030 [6] = {
1031 .output_entry = ICMP_MIB_DUMMY,
1032 .input_entry = ICMP_MIB_INERRORS,
1033 .handler = icmp_discard,
1034 .error = 1,
1035 },
1036 [7] = {
1037 .output_entry = ICMP_MIB_DUMMY,
1038 .input_entry = ICMP_MIB_INERRORS,
1039 .handler = icmp_discard,
1040 .error = 1,
1041 },
1042 [ICMP_ECHO] = {
1043 .output_entry = ICMP_MIB_OUTECHOS,
1044 .input_entry = ICMP_MIB_INECHOS,
1045 .handler = icmp_echo,
1046 },
1047 [9] = {
1048 .output_entry = ICMP_MIB_DUMMY,
1049 .input_entry = ICMP_MIB_INERRORS,
1050 .handler = icmp_discard,
1051 .error = 1,
1052 },
1053 [10] = {
1054 .output_entry = ICMP_MIB_DUMMY,
1055 .input_entry = ICMP_MIB_INERRORS,
1056 .handler = icmp_discard,
1057 .error = 1,
1058 },
1059 [ICMP_TIME_EXCEEDED] = {
1060 .output_entry = ICMP_MIB_OUTTIMEEXCDS,
1061 .input_entry = ICMP_MIB_INTIMEEXCDS,
1062 .handler = icmp_unreach,
1063 .error = 1,
1064 },
1065 [ICMP_PARAMETERPROB] = {
1066 .output_entry = ICMP_MIB_OUTPARMPROBS,
1067 .input_entry = ICMP_MIB_INPARMPROBS,
1068 .handler = icmp_unreach,
1069 .error = 1,
1070 },
1071 [ICMP_TIMESTAMP] = {
1072 .output_entry = ICMP_MIB_OUTTIMESTAMPS,
1073 .input_entry = ICMP_MIB_INTIMESTAMPS,
1074 .handler = icmp_timestamp,
1075 },
1076 [ICMP_TIMESTAMPREPLY] = {
1077 .output_entry = ICMP_MIB_OUTTIMESTAMPREPS,
1078 .input_entry = ICMP_MIB_INTIMESTAMPREPS,
1079 .handler = icmp_discard,
1080 },
1081 [ICMP_INFO_REQUEST] = {
1082 .output_entry = ICMP_MIB_DUMMY,
1083 .input_entry = ICMP_MIB_DUMMY,
1084 .handler = icmp_discard,
1085 },
1086 [ICMP_INFO_REPLY] = {
1087 .output_entry = ICMP_MIB_DUMMY,
1088 .input_entry = ICMP_MIB_DUMMY,
1089 .handler = icmp_discard,
1090 },
1091 [ICMP_ADDRESS] = {
1092 .output_entry = ICMP_MIB_OUTADDRMASKS,
1093 .input_entry = ICMP_MIB_INADDRMASKS,
1094 .handler = icmp_address,
1095 },
1096 [ICMP_ADDRESSREPLY] = {
1097 .output_entry = ICMP_MIB_OUTADDRMASKREPS,
1098 .input_entry = ICMP_MIB_INADDRMASKREPS,
1099 .handler = icmp_address_reply,
1100 },
1101};
1102
1103void __init icmp_init(struct net_proto_family *ops)
1104{
1105 struct inet_sock *inet;
1106 int i;
1107
1108 for (i = 0; i < NR_CPUS; i++) {
1109 int err;
1110
1111 if (!cpu_possible(i))
1112 continue;
1113
1114 err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP,
1115 &per_cpu(__icmp_socket, i));
1116
1117 if (err < 0)
1118 panic("Failed to create the ICMP control socket.\n");
1119
1120 per_cpu(__icmp_socket, i)->sk->sk_allocation = GFP_ATOMIC;
1121
1122 /* Enough space for 2 64K ICMP packets, including
1123 * sk_buff struct overhead.
1124 */
1125 per_cpu(__icmp_socket, i)->sk->sk_sndbuf =
1126 (2 * ((64 * 1024) + sizeof(struct sk_buff)));
1127
1128 inet = inet_sk(per_cpu(__icmp_socket, i)->sk);
1129 inet->uc_ttl = -1;
1130 inet->pmtudisc = IP_PMTUDISC_DONT;
1131
1132 /* Unhash it so that IP input processing does not even
1133 * see it, we do not wish this socket to see incoming
1134 * packets.
1135 */
1136 per_cpu(__icmp_socket, i)->sk->sk_prot->unhash(per_cpu(__icmp_socket, i)->sk);
1137 }
1138}
1139
1140EXPORT_SYMBOL(icmp_err_convert);
1141EXPORT_SYMBOL(icmp_send);
1142EXPORT_SYMBOL(icmp_statistics);
1143EXPORT_SYMBOL(xrlim_allow);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
new file mode 100644
index 000000000000..1f3183168a90
--- /dev/null
+++ b/net/ipv4/igmp.c
@@ -0,0 +1,2473 @@
1/*
2 * Linux NET3: Internet Group Management Protocol [IGMP]
3 *
4 * This code implements the IGMP protocol as defined in RFC1112. There has
5 * been a further revision of this protocol since then, which is also supported.
6 *
7 * If you have trouble with this module, be careful which gcc you have used;
8 * the older version didn't come out right using gcc 2.5.8, and the newer one
9 * seems to fall out with gcc 2.6.2.
10 *
11 * Version: $Id: igmp.c,v 1.47 2002/02/01 22:01:03 davem Exp $
12 *
13 * Authors:
14 * Alan Cox <Alan.Cox@linux.org>
15 *
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version
19 * 2 of the License, or (at your option) any later version.
20 *
21 * Fixes:
22 *
23 * Alan Cox : Added lots of __inline__ to optimise
24 * the memory usage of all the tiny little
25 * functions.
26 * Alan Cox : Dumped the header building experiment.
27 * Alan Cox : Minor tweaks ready for multicast routing
28 * and extended IGMP protocol.
29 * Alan Cox : Removed a load of inline directives. Gcc 2.5.8
30 * writes utterly bogus code otherwise (sigh)
31 * fixed IGMP loopback to behave in the manner
32 * desired by mrouted, fixed the fact it has been
33 * broken since 1.3.6 and cleaned up a few minor
34 * points.
35 *
36 * Chih-Jen Chang : Tried to revise IGMP to Version 2
37 * Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu
38 * The enhancements are mainly based on Steve Deering's
39 * ipmulti-3.5 source code.
40 * Chih-Jen Chang : Added the igmp_get_mrouter_info and
41 * Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of
42 * the mrouted version on that device.
43 * Chih-Jen Chang : Added the max_resp_time parameter to
44 * Tsu-Sheng Tsao igmp_heard_query(). Using this parameter
45 * to identify the multicast router version
46 * and do what the IGMP version 2 specified.
47 * Chih-Jen Chang : Added a timer to revert to IGMP V2 router
48 * Tsu-Sheng Tsao if the specified time expired.
49 * Alan Cox : Stop IGMP from 0.0.0.0 being accepted.
50 * Alan Cox : Use GFP_ATOMIC in the right places.
51 * Christian Daudt : igmp timer wasn't set for local group
52 * memberships but was being deleted,
53 * which caused a "del_timer() called
54 * from %p with timer not initialized\n"
55 * message (960131).
56 * Christian Daudt : removed del_timer from
57 * igmp_timer_expire function (960205).
58 * Christian Daudt : igmp_heard_report now only calls
59 * igmp_timer_expire if tm->running is
60 * true (960216).
61 * Malcolm Beattie : ttl comparison wrong in igmp_rcv made
62 * igmp_heard_query never trigger. Expiry
63 * miscalculation fixed in igmp_heard_query
64 * and random() made to return unsigned to
65 * prevent negative expiry times.
66 * Alexey Kuznetsov: Wrong group leaving behaviour, backport
67 * fix from pending 2.1.x patches.
68 * Alan Cox: Forgot to enable FDDI support earlier.
69 * Alexey Kuznetsov: Fixed leaving groups on device down.
70 * Alexey Kuznetsov: Accordance to igmp-v2-06 draft.
71 * David L Stevens: IGMPv3 support, with help from
72 * Vinay Kulkarni
73 */
74
75#include <linux/config.h>
76#include <linux/module.h>
77#include <asm/uaccess.h>
78#include <asm/system.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
81#include <linux/jiffies.h>
82#include <linux/string.h>
83#include <linux/socket.h>
84#include <linux/sockios.h>
85#include <linux/in.h>
86#include <linux/inet.h>
87#include <linux/netdevice.h>
88#include <linux/skbuff.h>
89#include <linux/inetdevice.h>
90#include <linux/igmp.h>
91#include <linux/if_arp.h>
92#include <linux/rtnetlink.h>
93#include <linux/times.h>
94#include <net/ip.h>
95#include <net/protocol.h>
96#include <net/route.h>
97#include <net/sock.h>
98#include <net/checksum.h>
99#include <linux/netfilter_ipv4.h>
100#ifdef CONFIG_IP_MROUTE
101#include <linux/mroute.h>
102#endif
103#ifdef CONFIG_PROC_FS
104#include <linux/proc_fs.h>
105#include <linux/seq_file.h>
106#endif
107
108#define IP_MAX_MEMBERSHIPS 20
109#define IP_MAX_MSF 10
110
111#ifdef CONFIG_IP_MULTICAST
112/* Parameter names and values are taken from igmp-v2-06 draft */
113
114#define IGMP_V1_Router_Present_Timeout (400*HZ)
115#define IGMP_V2_Router_Present_Timeout (400*HZ)
116#define IGMP_Unsolicited_Report_Interval (10*HZ)
117#define IGMP_Query_Response_Interval (10*HZ)
118#define IGMP_Unsolicited_Report_Count 2
119
120
121#define IGMP_Initial_Report_Delay (1)
122
123/* IGMP_Initial_Report_Delay is not from the IGMP specs!
124 * The IGMP specs require membership to be reported immediately after
125 * joining a group, but we delay the first report by a
126 * small interval. It seems more natural and still does not
127 * contradict the specs, provided this delay is small enough.
128 */
129
130#define IGMP_V1_SEEN(in_dev) (ipv4_devconf.force_igmp_version == 1 || \
131 (in_dev)->cnf.force_igmp_version == 1 || \
132 ((in_dev)->mr_v1_seen && \
133 time_before(jiffies, (in_dev)->mr_v1_seen)))
134#define IGMP_V2_SEEN(in_dev) (ipv4_devconf.force_igmp_version == 2 || \
135 (in_dev)->cnf.force_igmp_version == 2 || \
136 ((in_dev)->mr_v2_seen && \
137 time_before(jiffies, (in_dev)->mr_v2_seen)))
138
139static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
140static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr);
141static void igmpv3_clear_delrec(struct in_device *in_dev);
142static int sf_setstate(struct ip_mc_list *pmc);
143static void sf_markstate(struct ip_mc_list *pmc);
144#endif
145static void ip_mc_clear_src(struct ip_mc_list *pmc);
146static int ip_mc_add_src(struct in_device *in_dev, __u32 *pmca, int sfmode,
147 int sfcount, __u32 *psfsrc, int delta);
148
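/* Release one reference on a multicast group entry; once the last
 * reference is gone, drop the hold on its interface and free it.
 */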
149static void ip_ma_put(struct ip_mc_list *im)
150{
151 if (atomic_dec_and_test(&im->refcnt)) {
152 in_dev_put(im->interface);
153 kfree(im);
154 }
155}
156
157#ifdef CONFIG_IP_MULTICAST
158
159/*
160 * Timer management
161 */
162
163static __inline__ void igmp_stop_timer(struct ip_mc_list *im)
164{
165 spin_lock_bh(&im->lock);
166 if (del_timer(&im->timer))
167 atomic_dec(&im->refcnt);
168 im->tm_running=0;
169 im->reporter = 0;
170 im->unsolicit_count = 0;
171 spin_unlock_bh(&im->lock);
172}
173
174/* It must be called with locked im->lock */
175static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
176{
177 int tv=net_random() % max_delay;
178
179 im->tm_running=1;
180 if (!mod_timer(&im->timer, jiffies+tv+2))
181 atomic_inc(&im->refcnt);
182}
183
184static void igmp_gq_start_timer(struct in_device *in_dev)
185{
186 int tv = net_random() % in_dev->mr_maxdelay;
187
188 in_dev->mr_gq_running = 1;
189 if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2))
190 in_dev_hold(in_dev);
191}
192
193static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)
194{
195 int tv = net_random() % delay;
196
197 if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2))
198 in_dev_hold(in_dev);
199}
200
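/* (Re)arm the membership report timer without ever lengthening it: if the
 * timer is already due to fire within max_delay the old expiry is kept,
 * otherwise a fresh random delay below max_delay is chosen.
 */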
201static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
202{
203 spin_lock_bh(&im->lock);
204 im->unsolicit_count = 0;
205 if (del_timer(&im->timer)) {
206 if ((long)(im->timer.expires-jiffies) < max_delay) {
207 add_timer(&im->timer);
208 im->tm_running=1;
209 spin_unlock_bh(&im->lock);
210 return;
211 }
212 atomic_dec(&im->refcnt);
213 }
214 igmp_start_timer(im, max_delay);
215 spin_unlock_bh(&im->lock);
216}
217
218
219/*
220 * Send an IGMP report.
221 */
222
223#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4)
224
225
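/* Decide whether source psf should be listed in an IGMPv3 group record of
 * the given type, taking into account whether the group (gdeleted) or the
 * source (sdeleted) entry sits on a tomb (deleted) list.
 */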
226static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type,
227 int gdeleted, int sdeleted)
228{
229 switch (type) {
230 case IGMPV3_MODE_IS_INCLUDE:
231 case IGMPV3_MODE_IS_EXCLUDE:
232 if (gdeleted || sdeleted)
233 return 0;
234 return !(pmc->gsquery && !psf->sf_gsresp);
235 case IGMPV3_CHANGE_TO_INCLUDE:
236 if (gdeleted || sdeleted)
237 return 0;
238 return psf->sf_count[MCAST_INCLUDE] != 0;
239 case IGMPV3_CHANGE_TO_EXCLUDE:
240 if (gdeleted || sdeleted)
241 return 0;
242 if (pmc->sfcount[MCAST_EXCLUDE] == 0 ||
243 psf->sf_count[MCAST_INCLUDE])
244 return 0;
245 return pmc->sfcount[MCAST_EXCLUDE] ==
246 psf->sf_count[MCAST_EXCLUDE];
247 case IGMPV3_ALLOW_NEW_SOURCES:
248 if (gdeleted || !psf->sf_crcount)
249 return 0;
250 return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted;
251 case IGMPV3_BLOCK_OLD_SOURCES:
252 if (pmc->sfmode == MCAST_INCLUDE)
253 return gdeleted || (psf->sf_crcount && sdeleted);
254 return psf->sf_crcount && !gdeleted && !sdeleted;
255 }
256 return 0;
257}
258
259static int
260igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
261{
262 struct ip_sf_list *psf;
263 int scount = 0;
264
265 for (psf=pmc->sources; psf; psf=psf->sf_next) {
266 if (!is_in(pmc, psf, type, gdeleted, sdeleted))
267 continue;
268 scount++;
269 }
270 return scount;
271}
272
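/* Allocate a fresh IGMPv3 report packet: an IP header with TTL 1, TOS 0xc0
 * and a Router Alert option, routed towards the all-IGMPv3-routers address,
 * followed by an empty report header whose group records are filled in by
 * the callers below.
 */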
273static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
274{
275 struct sk_buff *skb;
276 struct rtable *rt;
277 struct iphdr *pip;
278 struct igmpv3_report *pig;
279
280 skb = alloc_skb(size + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
281 if (skb == NULL)
282 return NULL;
283
284 {
285 struct flowi fl = { .oif = dev->ifindex,
286 .nl_u = { .ip4_u = {
287 .daddr = IGMPV3_ALL_MCR } },
288 .proto = IPPROTO_IGMP };
289 if (ip_route_output_key(&rt, &fl)) {
290 kfree_skb(skb);
291 return NULL;
292 }
293 }
294 if (rt->rt_src == 0) {
295 kfree_skb(skb);
296 ip_rt_put(rt);
297 return NULL;
298 }
299
300 skb->dst = &rt->u.dst;
301 skb->dev = dev;
302
303 skb_reserve(skb, LL_RESERVED_SPACE(dev));
304
305 skb->nh.iph = pip =(struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4);
306
307 pip->version = 4;
308 pip->ihl = (sizeof(struct iphdr)+4)>>2;
309 pip->tos = 0xc0;
310 pip->frag_off = htons(IP_DF);
311 pip->ttl = 1;
312 pip->daddr = rt->rt_dst;
313 pip->saddr = rt->rt_src;
314 pip->protocol = IPPROTO_IGMP;
315 pip->tot_len = 0; /* filled in later */
316 ip_select_ident(pip, &rt->u.dst, NULL);
317 ((u8*)&pip[1])[0] = IPOPT_RA;
318 ((u8*)&pip[1])[1] = 4;
319 ((u8*)&pip[1])[2] = 0;
320 ((u8*)&pip[1])[3] = 0;
321
322 pig =(struct igmpv3_report *)skb_put(skb, sizeof(*pig));
323 skb->h.igmph = (struct igmphdr *)pig;
324 pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT;
325 pig->resv1 = 0;
326 pig->csum = 0;
327 pig->resv2 = 0;
328 pig->ngrec = 0;
329 return skb;
330}
331
332static int igmpv3_sendpack(struct sk_buff *skb)
333{
334 struct iphdr *pip = skb->nh.iph;
335 struct igmphdr *pig = skb->h.igmph;
336 int iplen, igmplen;
337
338 iplen = skb->tail - (unsigned char *)skb->nh.iph;
339 pip->tot_len = htons(iplen);
340 ip_send_check(pip);
341
342 igmplen = skb->tail - (unsigned char *)skb->h.igmph;
343 pig->csum = ip_compute_csum((void *)skb->h.igmph, igmplen);
344
345 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev,
346 dst_output);
347}
348
349static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
350{
351 return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc,type,gdel,sdel);
352}
353
354static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
355 int type, struct igmpv3_grec **ppgr)
356{
357 struct net_device *dev = pmc->interface->dev;
358 struct igmpv3_report *pih;
359 struct igmpv3_grec *pgr;
360
361 if (!skb)
362 skb = igmpv3_newpack(dev, dev->mtu);
363 if (!skb)
364 return NULL;
365 pgr = (struct igmpv3_grec *)skb_put(skb, sizeof(struct igmpv3_grec));
366 pgr->grec_type = type;
367 pgr->grec_auxwords = 0;
368 pgr->grec_nsrcs = 0;
369 pgr->grec_mca = pmc->multiaddr;
370 pih = (struct igmpv3_report *)skb->h.igmph;
371 pih->ngrec = htons(ntohs(pih->ngrec)+1);
372 *ppgr = pgr;
373 return skb;
374}
375
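/* Bytes still usable in the report being built: bounded by the device MTU
 * once the skb is bound to a device, by the allocated tailroom otherwise,
 * and zero when no skb has been allocated yet.
 */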
376#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \
377 skb_tailroom(skb)) : 0)
378
379static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
380 int type, int gdeleted, int sdeleted)
381{
382 struct net_device *dev = pmc->interface->dev;
383 struct igmpv3_report *pih;
384 struct igmpv3_grec *pgr = NULL;
385 struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
386 int scount, first, isquery, truncate;
387
388 if (pmc->multiaddr == IGMP_ALL_HOSTS)
389 return skb;
390
391 isquery = type == IGMPV3_MODE_IS_INCLUDE ||
392 type == IGMPV3_MODE_IS_EXCLUDE;
393 truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
394 type == IGMPV3_CHANGE_TO_EXCLUDE;
395
396 psf_list = sdeleted ? &pmc->tomb : &pmc->sources;
397
398 if (!*psf_list) {
399 if (type == IGMPV3_ALLOW_NEW_SOURCES ||
400 type == IGMPV3_BLOCK_OLD_SOURCES)
401 return skb;
402 if (pmc->crcount || isquery) {
403 /* make sure we have room for group header and at
404 * least one source.
405 */
406 if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)+
407 sizeof(__u32)) {
408 igmpv3_sendpack(skb);
409 skb = NULL; /* add_grhead will get a new one */
410 }
411 skb = add_grhead(skb, pmc, type, &pgr);
412 }
413 return skb;
414 }
415 pih = skb ? (struct igmpv3_report *)skb->h.igmph : NULL;
416
417 /* EX and TO_EX get a fresh packet, if needed */
418 if (truncate) {
419 if (pih && pih->ngrec &&
420 AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
421 if (skb)
422 igmpv3_sendpack(skb);
423 skb = igmpv3_newpack(dev, dev->mtu);
424 }
425 }
426 first = 1;
427 scount = 0;
428 psf_prev = NULL;
429 for (psf=*psf_list; psf; psf=psf_next) {
430 u32 *psrc;
431
432 psf_next = psf->sf_next;
433
434 if (!is_in(pmc, psf, type, gdeleted, sdeleted)) {
435 psf_prev = psf;
436 continue;
437 }
438
439 /* clear marks on query responses */
440 if (isquery)
441 psf->sf_gsresp = 0;
442
443 if (AVAILABLE(skb) < sizeof(u32) +
444 first*sizeof(struct igmpv3_grec)) {
445 if (truncate && !first)
446 break; /* truncate these */
447 if (pgr)
448 pgr->grec_nsrcs = htons(scount);
449 if (skb)
450 igmpv3_sendpack(skb);
451 skb = igmpv3_newpack(dev, dev->mtu);
452 first = 1;
453 scount = 0;
454 }
455 if (first) {
456 skb = add_grhead(skb, pmc, type, &pgr);
457 first = 0;
458 }
459 psrc = (u32 *)skb_put(skb, sizeof(u32));
460 *psrc = psf->sf_inaddr;
461 scount++;
462 if ((type == IGMPV3_ALLOW_NEW_SOURCES ||
463 type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
464 psf->sf_crcount--;
465 if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
466 if (psf_prev)
467 psf_prev->sf_next = psf->sf_next;
468 else
469 *psf_list = psf->sf_next;
470 kfree(psf);
471 continue;
472 }
473 }
474 psf_prev = psf;
475 }
476 if (pgr)
477 pgr->grec_nsrcs = htons(scount);
478
479 if (isquery)
480 pmc->gsquery = 0; /* clear query state on report */
481 return skb;
482}
483
484static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
485{
486 struct sk_buff *skb = NULL;
487 int type;
488
489 if (!pmc) {
490 read_lock(&in_dev->mc_list_lock);
491 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
492 if (pmc->multiaddr == IGMP_ALL_HOSTS)
493 continue;
494 spin_lock_bh(&pmc->lock);
495 if (pmc->sfcount[MCAST_EXCLUDE])
496 type = IGMPV3_MODE_IS_EXCLUDE;
497 else
498 type = IGMPV3_MODE_IS_INCLUDE;
499 skb = add_grec(skb, pmc, type, 0, 0);
500 spin_unlock_bh(&pmc->lock);
501 }
502 read_unlock(&in_dev->mc_list_lock);
503 } else {
504 spin_lock_bh(&pmc->lock);
505 if (pmc->sfcount[MCAST_EXCLUDE])
506 type = IGMPV3_MODE_IS_EXCLUDE;
507 else
508 type = IGMPV3_MODE_IS_INCLUDE;
509 skb = add_grec(skb, pmc, type, 0, 0);
510 spin_unlock_bh(&pmc->lock);
511 }
512 if (!skb)
513 return 0;
514 return igmpv3_sendpack(skb);
515}
516
517/*
518 * remove zero-count source records from a source filter list
519 */
520static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
521{
522 struct ip_sf_list *psf_prev, *psf_next, *psf;
523
524 psf_prev = NULL;
525 for (psf=*ppsf; psf; psf = psf_next) {
526 psf_next = psf->sf_next;
527 if (psf->sf_crcount == 0) {
528 if (psf_prev)
529 psf_prev->sf_next = psf->sf_next;
530 else
531 *ppsf = psf->sf_next;
532 kfree(psf);
533 } else
534 psf_prev = psf;
535 }
536}
537
538static void igmpv3_send_cr(struct in_device *in_dev)
539{
540 struct ip_mc_list *pmc, *pmc_prev, *pmc_next;
541 struct sk_buff *skb = NULL;
542 int type, dtype;
543
544 read_lock(&in_dev->mc_list_lock);
545 spin_lock_bh(&in_dev->mc_tomb_lock);
546
547 /* deleted MCA's */
548 pmc_prev = NULL;
549 for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) {
550 pmc_next = pmc->next;
551 if (pmc->sfmode == MCAST_INCLUDE) {
552 type = IGMPV3_BLOCK_OLD_SOURCES;
553 dtype = IGMPV3_BLOCK_OLD_SOURCES;
554 skb = add_grec(skb, pmc, type, 1, 0);
555 skb = add_grec(skb, pmc, dtype, 1, 1);
556 }
557 if (pmc->crcount) {
558 pmc->crcount--;
559 if (pmc->sfmode == MCAST_EXCLUDE) {
560 type = IGMPV3_CHANGE_TO_INCLUDE;
561 skb = add_grec(skb, pmc, type, 1, 0);
562 }
563 if (pmc->crcount == 0) {
564 igmpv3_clear_zeros(&pmc->tomb);
565 igmpv3_clear_zeros(&pmc->sources);
566 }
567 }
568 if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) {
569 if (pmc_prev)
570 pmc_prev->next = pmc_next;
571 else
572 in_dev->mc_tomb = pmc_next;
573 in_dev_put(pmc->interface);
574 kfree(pmc);
575 } else
576 pmc_prev = pmc;
577 }
578 spin_unlock_bh(&in_dev->mc_tomb_lock);
579
580 /* change recs */
581 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
582 spin_lock_bh(&pmc->lock);
583 if (pmc->sfcount[MCAST_EXCLUDE]) {
584 type = IGMPV3_BLOCK_OLD_SOURCES;
585 dtype = IGMPV3_ALLOW_NEW_SOURCES;
586 } else {
587 type = IGMPV3_ALLOW_NEW_SOURCES;
588 dtype = IGMPV3_BLOCK_OLD_SOURCES;
589 }
590 skb = add_grec(skb, pmc, type, 0, 0);
591 skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */
592
593 /* filter mode changes */
594 if (pmc->crcount) {
595 pmc->crcount--;
596 if (pmc->sfmode == MCAST_EXCLUDE)
597 type = IGMPV3_CHANGE_TO_EXCLUDE;
598 else
599 type = IGMPV3_CHANGE_TO_INCLUDE;
600 skb = add_grec(skb, pmc, type, 0, 0);
601 }
602 spin_unlock_bh(&pmc->lock);
603 }
604 read_unlock(&in_dev->mc_list_lock);
605
606 if (!skb)
607 return;
608 (void) igmpv3_sendpack(skb);
609}
610
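/* Send a single IGMPv1/v2 report or leave message for pmc; IGMPv3 reports
 * are delegated to igmpv3_send_report() above. Leave messages go to the
 * all-routers group, plain reports to the group address itself.
 */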
611static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
612 int type)
613{
614 struct sk_buff *skb;
615 struct iphdr *iph;
616 struct igmphdr *ih;
617 struct rtable *rt;
618 struct net_device *dev = in_dev->dev;
619 u32 group = pmc ? pmc->multiaddr : 0;
620 u32 dst;
621
622 if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
623 return igmpv3_send_report(in_dev, pmc);
624 else if (type == IGMP_HOST_LEAVE_MESSAGE)
625 dst = IGMP_ALL_ROUTER;
626 else
627 dst = group;
628
629 {
630 struct flowi fl = { .oif = dev->ifindex,
631 .nl_u = { .ip4_u = { .daddr = dst } },
632 .proto = IPPROTO_IGMP };
633 if (ip_route_output_key(&rt, &fl))
634 return -1;
635 }
636 if (rt->rt_src == 0) {
637 ip_rt_put(rt);
638 return -1;
639 }
640
641 skb=alloc_skb(IGMP_SIZE+LL_RESERVED_SPACE(dev), GFP_ATOMIC);
642 if (skb == NULL) {
643 ip_rt_put(rt);
644 return -1;
645 }
646
647 skb->dst = &rt->u.dst;
648
649 skb_reserve(skb, LL_RESERVED_SPACE(dev));
650
651 skb->nh.iph = iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4);
652
653 iph->version = 4;
654 iph->ihl = (sizeof(struct iphdr)+4)>>2;
655 iph->tos = 0xc0;
656 iph->frag_off = htons(IP_DF);
657 iph->ttl = 1;
658 iph->daddr = dst;
659 iph->saddr = rt->rt_src;
660 iph->protocol = IPPROTO_IGMP;
661 iph->tot_len = htons(IGMP_SIZE);
662 ip_select_ident(iph, &rt->u.dst, NULL);
663 ((u8*)&iph[1])[0] = IPOPT_RA;
664 ((u8*)&iph[1])[1] = 4;
665 ((u8*)&iph[1])[2] = 0;
666 ((u8*)&iph[1])[3] = 0;
667 ip_send_check(iph);
668
669 ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
670 ih->type=type;
671 ih->code=0;
672 ih->csum=0;
673 ih->group=group;
674 ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr));
675
676 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
677 dst_output);
678}
679
680static void igmp_gq_timer_expire(unsigned long data)
681{
682 struct in_device *in_dev = (struct in_device *)data;
683
684 in_dev->mr_gq_running = 0;
685 igmpv3_send_report(in_dev, NULL);
686 __in_dev_put(in_dev);
687}
688
689static void igmp_ifc_timer_expire(unsigned long data)
690{
691 struct in_device *in_dev = (struct in_device *)data;
692
693 igmpv3_send_cr(in_dev);
694 if (in_dev->mr_ifc_count) {
695 in_dev->mr_ifc_count--;
696 igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval);
697 }
698 __in_dev_put(in_dev);
699}
700
701static void igmp_ifc_event(struct in_device *in_dev)
702{
703 if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
704 return;
705 in_dev->mr_ifc_count = in_dev->mr_qrv ? in_dev->mr_qrv :
706 IGMP_Unsolicited_Report_Count;
707 igmp_ifc_start_timer(in_dev, 1);
708}
709
710
711static void igmp_timer_expire(unsigned long data)
712{
713 struct ip_mc_list *im=(struct ip_mc_list *)data;
714 struct in_device *in_dev = im->interface;
715
716 spin_lock(&im->lock);
717 im->tm_running=0;
718
719 if (im->unsolicit_count) {
720 im->unsolicit_count--;
721 igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
722 }
723 im->reporter = 1;
724 spin_unlock(&im->lock);
725
726 if (IGMP_V1_SEEN(in_dev))
727 igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT);
728 else if (IGMP_V2_SEEN(in_dev))
729 igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT);
730 else
731 igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT);
732
733 ip_ma_put(im);
734}
735
736static void igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __u32 *srcs)
737{
738 struct ip_sf_list *psf;
739 int i, scount;
740
741 scount = 0;
742 for (psf=pmc->sources; psf; psf=psf->sf_next) {
743 if (scount == nsrcs)
744 break;
745 for (i=0; i<nsrcs; i++)
746 if (srcs[i] == psf->sf_inaddr) {
747 psf->sf_gsresp = 1;
748 scount++;
749 break;
750 }
751 }
752}
753
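/* Another member answered the query for this group on our interface, so
 * cancel our own pending report for it.
 */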
754static void igmp_heard_report(struct in_device *in_dev, u32 group)
755{
756 struct ip_mc_list *im;
757
758 /* Timers are only set for non-local groups */
759
760 if (group == IGMP_ALL_HOSTS)
761 return;
762
763 read_lock(&in_dev->mc_list_lock);
764 for (im=in_dev->mc_list; im!=NULL; im=im->next) {
765 if (im->multiaddr == group) {
766 igmp_stop_timer(im);
767 break;
768 }
769 }
770 read_unlock(&in_dev->mc_list_lock);
771}
772
773static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
774 int len)
775{
776 struct igmphdr *ih = skb->h.igmph;
777 struct igmpv3_query *ih3 = (struct igmpv3_query *)ih;
778 struct ip_mc_list *im;
779 u32 group = ih->group;
780 int max_delay;
781 int mark = 0;
782
783
784 if (len == 8) {
785 if (ih->code == 0) {
786			/* Alas, an old v1 router is present here. */
787
788 max_delay = IGMP_Query_Response_Interval;
789 in_dev->mr_v1_seen = jiffies +
790 IGMP_V1_Router_Present_Timeout;
791 group = 0;
792 } else {
793 /* v2 router present */
794 max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
795 in_dev->mr_v2_seen = jiffies +
796 IGMP_V2_Router_Present_Timeout;
797 }
798 /* cancel the interface change timer */
799 in_dev->mr_ifc_count = 0;
800 if (del_timer(&in_dev->mr_ifc_timer))
801 __in_dev_put(in_dev);
802 /* clear deleted report items */
803 igmpv3_clear_delrec(in_dev);
804 } else if (len < 12) {
805 return; /* ignore bogus packet; freed by caller */
806 } else { /* v3 */
807 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
808 return;
809
810 ih3 = (struct igmpv3_query *) skb->h.raw;
811 if (ih3->nsrcs) {
812 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
813 + ntohs(ih3->nsrcs)*sizeof(__u32)))
814 return;
815 ih3 = (struct igmpv3_query *) skb->h.raw;
816 }
817
818 max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
819 if (!max_delay)
820 max_delay = 1; /* can't mod w/ 0 */
821 in_dev->mr_maxdelay = max_delay;
822 if (ih3->qrv)
823 in_dev->mr_qrv = ih3->qrv;
824 if (!group) { /* general query */
825 if (ih3->nsrcs)
826 return; /* no sources allowed */
827 igmp_gq_start_timer(in_dev);
828 return;
829 }
830 /* mark sources to include, if group & source-specific */
831 mark = ih3->nsrcs != 0;
832 }
833
834 /*
835 * - Start the timers in all of our membership records
836 * that the query applies to for the interface on
837 * which the query arrived excl. those that belong
838 * to a "local" group (224.0.0.X)
839 * - For timers already running check if they need to
840 * be reset.
841 * - Use the igmp->igmp_code field as the maximum
842 * delay possible
843 */
844 read_lock(&in_dev->mc_list_lock);
845 for (im=in_dev->mc_list; im!=NULL; im=im->next) {
846 if (group && group != im->multiaddr)
847 continue;
848 if (im->multiaddr == IGMP_ALL_HOSTS)
849 continue;
850 spin_lock_bh(&im->lock);
851 if (im->tm_running)
852 im->gsquery = im->gsquery && mark;
853 else
854 im->gsquery = mark;
855 if (im->gsquery)
856 igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs);
857 spin_unlock_bh(&im->lock);
858 igmp_mod_timer(im, max_delay);
859 }
860 read_unlock(&in_dev->mc_list_lock);
861}
862
863int igmp_rcv(struct sk_buff *skb)
864{
865 /* This basically follows the spec line by line -- see RFC1112 */
866 struct igmphdr *ih;
867 struct in_device *in_dev = in_dev_get(skb->dev);
868 int len = skb->len;
869
870 if (in_dev==NULL) {
871 kfree_skb(skb);
872 return 0;
873 }
874
875 if (!pskb_may_pull(skb, sizeof(struct igmphdr)) ||
876 (u16)csum_fold(skb_checksum(skb, 0, len, 0))) {
877 in_dev_put(in_dev);
878 kfree_skb(skb);
879 return 0;
880 }
881
882 ih = skb->h.igmph;
883 switch (ih->type) {
884 case IGMP_HOST_MEMBERSHIP_QUERY:
885 igmp_heard_query(in_dev, skb, len);
886 break;
887 case IGMP_HOST_MEMBERSHIP_REPORT:
888 case IGMPV2_HOST_MEMBERSHIP_REPORT:
889 case IGMPV3_HOST_MEMBERSHIP_REPORT:
890 /* Is it our report looped back? */
891 if (((struct rtable*)skb->dst)->fl.iif == 0)
892 break;
893 igmp_heard_report(in_dev, ih->group);
894 break;
895 case IGMP_PIM:
896#ifdef CONFIG_IP_PIMSM_V1
897 in_dev_put(in_dev);
898 return pim_rcv_v1(skb);
899#endif
900 case IGMP_DVMRP:
901 case IGMP_TRACE:
902 case IGMP_HOST_LEAVE_MESSAGE:
903 case IGMP_MTRACE:
904 case IGMP_MTRACE_RESP:
905 break;
906 default:
907		NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why do we not know about it?\n", ih->type));
908 }
909 in_dev_put(in_dev);
910 kfree_skb(skb);
911 return 0;
912}
913
914#endif
915
916
917/*
918 * Add a filter to a device
919 */
920
921static void ip_mc_filter_add(struct in_device *in_dev, u32 addr)
922{
923 char buf[MAX_ADDR_LEN];
924 struct net_device *dev = in_dev->dev;
925
926 /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.
927	   We will get multicast token leakage when IFF_MULTICAST
928	   is changed. This check should be done in the dev->set_multicast_list
929	   routine. Something along the lines of:
930 if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
931 --ANK
932 */
933 if (arp_mc_map(addr, buf, dev, 0) == 0)
934 dev_mc_add(dev,buf,dev->addr_len,0);
935}
936
937/*
938 * Remove a filter from a device
939 */
940
941static void ip_mc_filter_del(struct in_device *in_dev, u32 addr)
942{
943 char buf[MAX_ADDR_LEN];
944 struct net_device *dev = in_dev->dev;
945
946 if (arp_mc_map(addr, buf, dev, 0) == 0)
947 dev_mc_delete(dev,buf,dev->addr_len,0);
948}
949
950#ifdef CONFIG_IP_MULTICAST
951/*
952 * deleted ip_mc_list manipulation
953 */
954static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
955{
956 struct ip_mc_list *pmc;
957
958 /* this is an "ip_mc_list" for convenience; only the fields below
959 * are actually used. In particular, the refcnt and users are not
960 * used for management of the delete list. Using the same structure
961 * for deleted items allows change reports to use common code with
962 * non-deleted or query-response MCA's.
963 */
964 pmc = (struct ip_mc_list *)kmalloc(sizeof(*pmc), GFP_KERNEL);
965 if (!pmc)
966 return;
967 memset(pmc, 0, sizeof(*pmc));
968 spin_lock_bh(&im->lock);
969 pmc->interface = im->interface;
970 in_dev_hold(in_dev);
971 pmc->multiaddr = im->multiaddr;
972 pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
973 IGMP_Unsolicited_Report_Count;
974 pmc->sfmode = im->sfmode;
975 if (pmc->sfmode == MCAST_INCLUDE) {
976 struct ip_sf_list *psf;
977
978 pmc->tomb = im->tomb;
979 pmc->sources = im->sources;
980 im->tomb = im->sources = NULL;
981 for (psf=pmc->sources; psf; psf=psf->sf_next)
982 psf->sf_crcount = pmc->crcount;
983 }
984 spin_unlock_bh(&im->lock);
985
986 spin_lock_bh(&in_dev->mc_tomb_lock);
987 pmc->next = in_dev->mc_tomb;
988 in_dev->mc_tomb = pmc;
989 spin_unlock_bh(&in_dev->mc_tomb_lock);
990}
991
992static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr)
993{
994 struct ip_mc_list *pmc, *pmc_prev;
995 struct ip_sf_list *psf, *psf_next;
996
997 spin_lock_bh(&in_dev->mc_tomb_lock);
998 pmc_prev = NULL;
999 for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) {
1000 if (pmc->multiaddr == multiaddr)
1001 break;
1002 pmc_prev = pmc;
1003 }
1004 if (pmc) {
1005 if (pmc_prev)
1006 pmc_prev->next = pmc->next;
1007 else
1008 in_dev->mc_tomb = pmc->next;
1009 }
1010 spin_unlock_bh(&in_dev->mc_tomb_lock);
1011 if (pmc) {
1012 for (psf=pmc->tomb; psf; psf=psf_next) {
1013 psf_next = psf->sf_next;
1014 kfree(psf);
1015 }
1016 in_dev_put(pmc->interface);
1017 kfree(pmc);
1018 }
1019}
1020
1021static void igmpv3_clear_delrec(struct in_device *in_dev)
1022{
1023 struct ip_mc_list *pmc, *nextpmc;
1024
1025 spin_lock_bh(&in_dev->mc_tomb_lock);
1026 pmc = in_dev->mc_tomb;
1027 in_dev->mc_tomb = NULL;
1028 spin_unlock_bh(&in_dev->mc_tomb_lock);
1029
1030 for (; pmc; pmc = nextpmc) {
1031 nextpmc = pmc->next;
1032 ip_mc_clear_src(pmc);
1033 in_dev_put(pmc->interface);
1034 kfree(pmc);
1035 }
1036 /* clear dead sources, too */
1037 read_lock(&in_dev->mc_list_lock);
1038 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
1039 struct ip_sf_list *psf, *psf_next;
1040
1041 spin_lock_bh(&pmc->lock);
1042 psf = pmc->tomb;
1043 pmc->tomb = NULL;
1044 spin_unlock_bh(&pmc->lock);
1045 for (; psf; psf=psf_next) {
1046 psf_next = psf->sf_next;
1047 kfree(psf);
1048 }
1049 }
1050 read_unlock(&in_dev->mc_list_lock);
1051}
1052#endif
1053
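/* The last local user of im is gone: remove the hardware filter, stop any
 * pending report and, depending on the querier version seen, send a v2
 * leave message or queue IGMPv3 change records, then clear the source
 * filter lists.
 */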
1054static void igmp_group_dropped(struct ip_mc_list *im)
1055{
1056 struct in_device *in_dev = im->interface;
1057#ifdef CONFIG_IP_MULTICAST
1058 int reporter;
1059#endif
1060
1061 if (im->loaded) {
1062 im->loaded = 0;
1063 ip_mc_filter_del(in_dev, im->multiaddr);
1064 }
1065
1066#ifdef CONFIG_IP_MULTICAST
1067 if (im->multiaddr == IGMP_ALL_HOSTS)
1068 return;
1069
1070 reporter = im->reporter;
1071 igmp_stop_timer(im);
1072
1073 if (!in_dev->dead) {
1074 if (IGMP_V1_SEEN(in_dev))
1075 goto done;
1076 if (IGMP_V2_SEEN(in_dev)) {
1077 if (reporter)
1078 igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE);
1079 goto done;
1080 }
1081 /* IGMPv3 */
1082 igmpv3_add_delrec(in_dev, im);
1083
1084 igmp_ifc_event(in_dev);
1085 }
1086done:
1087#endif
1088 ip_mc_clear_src(im);
1089}
1090
1091static void igmp_group_added(struct ip_mc_list *im)
1092{
1093 struct in_device *in_dev = im->interface;
1094
1095 if (im->loaded == 0) {
1096 im->loaded = 1;
1097 ip_mc_filter_add(in_dev, im->multiaddr);
1098 }
1099
1100#ifdef CONFIG_IP_MULTICAST
1101 if (im->multiaddr == IGMP_ALL_HOSTS)
1102 return;
1103
1104 if (in_dev->dead)
1105 return;
1106 if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
1107 spin_lock_bh(&im->lock);
1108 igmp_start_timer(im, IGMP_Initial_Report_Delay);
1109 spin_unlock_bh(&im->lock);
1110 return;
1111 }
1112 /* else, v3 */
1113
1114 im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
1115 IGMP_Unsolicited_Report_Count;
1116 igmp_ifc_event(in_dev);
1117#endif
1118}
1119
1120
1121/*
1122 * Multicast list managers
1123 */
1124
1125
1126/*
1127 * A socket has joined a multicast group on device dev.
1128 */
1129
1130void ip_mc_inc_group(struct in_device *in_dev, u32 addr)
1131{
1132 struct ip_mc_list *im;
1133
1134 ASSERT_RTNL();
1135
1136 for (im=in_dev->mc_list; im; im=im->next) {
1137 if (im->multiaddr == addr) {
1138 im->users++;
1139 ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
1140 goto out;
1141 }
1142 }
1143
1144 im = (struct ip_mc_list *)kmalloc(sizeof(*im), GFP_KERNEL);
1145 if (!im)
1146 goto out;
1147
1148 im->users=1;
1149 im->interface=in_dev;
1150 in_dev_hold(in_dev);
1151 im->multiaddr=addr;
1152 /* initial mode is (EX, empty) */
1153 im->sfmode = MCAST_EXCLUDE;
1154 im->sfcount[MCAST_INCLUDE] = 0;
1155 im->sfcount[MCAST_EXCLUDE] = 1;
1156 im->sources = NULL;
1157 im->tomb = NULL;
1158 im->crcount = 0;
1159 atomic_set(&im->refcnt, 1);
1160 spin_lock_init(&im->lock);
1161#ifdef CONFIG_IP_MULTICAST
1162 im->tm_running=0;
1163 init_timer(&im->timer);
1164 im->timer.data=(unsigned long)im;
1165 im->timer.function=&igmp_timer_expire;
1166 im->unsolicit_count = IGMP_Unsolicited_Report_Count;
1167 im->reporter = 0;
1168 im->gsquery = 0;
1169#endif
1170 im->loaded = 0;
1171 write_lock_bh(&in_dev->mc_list_lock);
1172 im->next=in_dev->mc_list;
1173 in_dev->mc_list=im;
1174 write_unlock_bh(&in_dev->mc_list_lock);
1175#ifdef CONFIG_IP_MULTICAST
1176 igmpv3_del_delrec(in_dev, im->multiaddr);
1177#endif
1178 igmp_group_added(im);
1179 if (!in_dev->dead)
1180 ip_rt_multicast_event(in_dev);
1181out:
1182 return;
1183}
1184
1185/*
1186 * A socket has left a multicast group on device dev
1187 */
1188
1189void ip_mc_dec_group(struct in_device *in_dev, u32 addr)
1190{
1191 struct ip_mc_list *i, **ip;
1192
1193 ASSERT_RTNL();
1194
1195 for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) {
1196 if (i->multiaddr==addr) {
1197 if (--i->users == 0) {
1198 write_lock_bh(&in_dev->mc_list_lock);
1199 *ip = i->next;
1200 write_unlock_bh(&in_dev->mc_list_lock);
1201 igmp_group_dropped(i);
1202
1203 if (!in_dev->dead)
1204 ip_rt_multicast_event(in_dev);
1205
1206 ip_ma_put(i);
1207 return;
1208 }
1209 break;
1210 }
1211 }
1212}
1213
1214/* Device going down */
1215
1216void ip_mc_down(struct in_device *in_dev)
1217{
1218 struct ip_mc_list *i;
1219
1220 ASSERT_RTNL();
1221
1222 for (i=in_dev->mc_list; i; i=i->next)
1223 igmp_group_dropped(i);
1224
1225#ifdef CONFIG_IP_MULTICAST
1226 in_dev->mr_ifc_count = 0;
1227 if (del_timer(&in_dev->mr_ifc_timer))
1228 __in_dev_put(in_dev);
1229 in_dev->mr_gq_running = 0;
1230 if (del_timer(&in_dev->mr_gq_timer))
1231 __in_dev_put(in_dev);
1232 igmpv3_clear_delrec(in_dev);
1233#endif
1234
1235 ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
1236}
1237
1238void ip_mc_init_dev(struct in_device *in_dev)
1239{
1240 ASSERT_RTNL();
1241
1242 in_dev->mc_tomb = NULL;
1243#ifdef CONFIG_IP_MULTICAST
1244 in_dev->mr_gq_running = 0;
1245 init_timer(&in_dev->mr_gq_timer);
1246 in_dev->mr_gq_timer.data=(unsigned long) in_dev;
1247 in_dev->mr_gq_timer.function=&igmp_gq_timer_expire;
1248 in_dev->mr_ifc_count = 0;
1249 init_timer(&in_dev->mr_ifc_timer);
1250 in_dev->mr_ifc_timer.data=(unsigned long) in_dev;
1251 in_dev->mr_ifc_timer.function=&igmp_ifc_timer_expire;
1252 in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
1253#endif
1254
1255 rwlock_init(&in_dev->mc_list_lock);
1256 spin_lock_init(&in_dev->mc_tomb_lock);
1257}
1258
1259/* Device going up */
1260
1261void ip_mc_up(struct in_device *in_dev)
1262{
1263 struct ip_mc_list *i;
1264
1265 ASSERT_RTNL();
1266
1267 ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
1268
1269 for (i=in_dev->mc_list; i; i=i->next)
1270 igmp_group_added(i);
1271}
1272
1273/*
1274 * Device is about to be destroyed: clean up.
1275 */
1276
1277void ip_mc_destroy_dev(struct in_device *in_dev)
1278{
1279 struct ip_mc_list *i;
1280
1281 ASSERT_RTNL();
1282
1283 /* Deactivate timers */
1284 ip_mc_down(in_dev);
1285
1286 write_lock_bh(&in_dev->mc_list_lock);
1287 while ((i = in_dev->mc_list) != NULL) {
1288 in_dev->mc_list = i->next;
1289 write_unlock_bh(&in_dev->mc_list_lock);
1290
1291 igmp_group_dropped(i);
1292 ip_ma_put(i);
1293
1294 write_lock_bh(&in_dev->mc_list_lock);
1295 }
1296 write_unlock_bh(&in_dev->mc_list_lock);
1297}
1298
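/* Work out which in_device a membership request refers to: an explicit
 * ifindex wins, then a match on the local interface address, and as a last
 * resort the device a route to the multicast address would use.
 */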
1299static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
1300{
1301 struct flowi fl = { .nl_u = { .ip4_u =
1302 { .daddr = imr->imr_multiaddr.s_addr } } };
1303 struct rtable *rt;
1304 struct net_device *dev = NULL;
1305 struct in_device *idev = NULL;
1306
1307 if (imr->imr_ifindex) {
1308 idev = inetdev_by_index(imr->imr_ifindex);
1309 if (idev)
1310 __in_dev_put(idev);
1311 return idev;
1312 }
1313 if (imr->imr_address.s_addr) {
1314 dev = ip_dev_find(imr->imr_address.s_addr);
1315 if (!dev)
1316 return NULL;
1317 __dev_put(dev);
1318 }
1319
1320 if (!dev && !ip_route_output_key(&rt, &fl)) {
1321 dev = rt->u.dst.dev;
1322 ip_rt_put(rt);
1323 }
1324 if (dev) {
1325 imr->imr_ifindex = dev->ifindex;
1326 idev = __in_dev_get(dev);
1327 }
1328 return idev;
1329}
1330
1331/*
1332 * Join a socket to a group
1333 */
1334int sysctl_igmp_max_memberships = IP_MAX_MEMBERSHIPS;
1335int sysctl_igmp_max_msf = IP_MAX_MSF;
1336
1337
1338static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
1339 __u32 *psfsrc)
1340{
1341 struct ip_sf_list *psf, *psf_prev;
1342 int rv = 0;
1343
1344 psf_prev = NULL;
1345 for (psf=pmc->sources; psf; psf=psf->sf_next) {
1346 if (psf->sf_inaddr == *psfsrc)
1347 break;
1348 psf_prev = psf;
1349 }
1350 if (!psf || psf->sf_count[sfmode] == 0) {
1351 /* source filter not found, or count wrong => bug */
1352 return -ESRCH;
1353 }
1354 psf->sf_count[sfmode]--;
1355 if (psf->sf_count[sfmode] == 0) {
1356 ip_rt_multicast_event(pmc->interface);
1357 }
1358 if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
1359#ifdef CONFIG_IP_MULTICAST
1360 struct in_device *in_dev = pmc->interface;
1361#endif
1362
1363 /* no more filters for this source */
1364 if (psf_prev)
1365 psf_prev->sf_next = psf->sf_next;
1366 else
1367 pmc->sources = psf->sf_next;
1368#ifdef CONFIG_IP_MULTICAST
1369 if (psf->sf_oldin &&
1370 !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
1371 psf->sf_crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
1372 IGMP_Unsolicited_Report_Count;
1373 psf->sf_next = pmc->tomb;
1374 pmc->tomb = psf;
1375 rv = 1;
1376 } else
1377#endif
1378 kfree(psf);
1379 }
1380 return rv;
1381}
1382
1383#ifndef CONFIG_IP_MULTICAST
1384#define igmp_ifc_event(x) do { } while (0)
1385#endif
1386
1387static int ip_mc_del_src(struct in_device *in_dev, __u32 *pmca, int sfmode,
1388 int sfcount, __u32 *psfsrc, int delta)
1389{
1390 struct ip_mc_list *pmc;
1391 int changerec = 0;
1392 int i, err;
1393
1394 if (!in_dev)
1395 return -ENODEV;
1396 read_lock(&in_dev->mc_list_lock);
1397 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
1398 if (*pmca == pmc->multiaddr)
1399 break;
1400 }
1401 if (!pmc) {
1402 /* MCA not found?? bug */
1403 read_unlock(&in_dev->mc_list_lock);
1404 return -ESRCH;
1405 }
1406 spin_lock_bh(&pmc->lock);
1407 read_unlock(&in_dev->mc_list_lock);
1408#ifdef CONFIG_IP_MULTICAST
1409 sf_markstate(pmc);
1410#endif
1411 if (!delta) {
1412 err = -EINVAL;
1413 if (!pmc->sfcount[sfmode])
1414 goto out_unlock;
1415 pmc->sfcount[sfmode]--;
1416 }
1417 err = 0;
1418 for (i=0; i<sfcount; i++) {
1419 int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
1420
1421 changerec |= rv > 0;
1422 if (!err && rv < 0)
1423 err = rv;
1424 }
1425 if (pmc->sfmode == MCAST_EXCLUDE &&
1426 pmc->sfcount[MCAST_EXCLUDE] == 0 &&
1427 pmc->sfcount[MCAST_INCLUDE]) {
1428#ifdef CONFIG_IP_MULTICAST
1429 struct ip_sf_list *psf;
1430#endif
1431
1432 /* filter mode change */
1433 pmc->sfmode = MCAST_INCLUDE;
1434#ifdef CONFIG_IP_MULTICAST
1435 pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
1436 IGMP_Unsolicited_Report_Count;
1437 in_dev->mr_ifc_count = pmc->crcount;
1438 for (psf=pmc->sources; psf; psf = psf->sf_next)
1439 psf->sf_crcount = 0;
1440 igmp_ifc_event(pmc->interface);
1441 } else if (sf_setstate(pmc) || changerec) {
1442 igmp_ifc_event(pmc->interface);
1443#endif
1444 }
1445out_unlock:
1446 spin_unlock_bh(&pmc->lock);
1447 return err;
1448}
1449
1450/*
1451 * Add multicast single-source filter to the interface list
1452 */
1453static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
1454 __u32 *psfsrc, int delta)
1455{
1456 struct ip_sf_list *psf, *psf_prev;
1457
1458 psf_prev = NULL;
1459 for (psf=pmc->sources; psf; psf=psf->sf_next) {
1460 if (psf->sf_inaddr == *psfsrc)
1461 break;
1462 psf_prev = psf;
1463 }
1464 if (!psf) {
1465 psf = (struct ip_sf_list *)kmalloc(sizeof(*psf), GFP_ATOMIC);
1466 if (!psf)
1467 return -ENOBUFS;
1468 memset(psf, 0, sizeof(*psf));
1469 psf->sf_inaddr = *psfsrc;
1470 if (psf_prev) {
1471 psf_prev->sf_next = psf;
1472 } else
1473 pmc->sources = psf;
1474 }
1475 psf->sf_count[sfmode]++;
1476 if (psf->sf_count[sfmode] == 1) {
1477 ip_rt_multicast_event(pmc->interface);
1478 }
1479 return 0;
1480}
1481
1482#ifdef CONFIG_IP_MULTICAST
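/* Per-source change tracking for IGMPv3: sf_markstate() records whether
 * each source was passing traffic before a filter update, sf_setstate()
 * compares against the state afterwards, arms sf_crcount for every source
 * that changed, and returns the number of changes.
 */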
1483static void sf_markstate(struct ip_mc_list *pmc)
1484{
1485 struct ip_sf_list *psf;
1486 int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
1487
1488 for (psf=pmc->sources; psf; psf=psf->sf_next)
1489 if (pmc->sfcount[MCAST_EXCLUDE]) {
1490 psf->sf_oldin = mca_xcount ==
1491 psf->sf_count[MCAST_EXCLUDE] &&
1492 !psf->sf_count[MCAST_INCLUDE];
1493 } else
1494 psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0;
1495}
1496
1497static int sf_setstate(struct ip_mc_list *pmc)
1498{
1499 struct ip_sf_list *psf;
1500 int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
1501 int qrv = pmc->interface->mr_qrv;
1502 int new_in, rv;
1503
1504 rv = 0;
1505 for (psf=pmc->sources; psf; psf=psf->sf_next) {
1506 if (pmc->sfcount[MCAST_EXCLUDE]) {
1507 new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
1508 !psf->sf_count[MCAST_INCLUDE];
1509 } else
1510 new_in = psf->sf_count[MCAST_INCLUDE] != 0;
1511 if (new_in != psf->sf_oldin) {
1512 psf->sf_crcount = qrv;
1513 rv++;
1514 }
1515 }
1516 return rv;
1517}
1518#endif
1519
1520/*
1521 * Add multicast source filter list to the interface list
1522 */
1523static int ip_mc_add_src(struct in_device *in_dev, __u32 *pmca, int sfmode,
1524 int sfcount, __u32 *psfsrc, int delta)
1525{
1526 struct ip_mc_list *pmc;
1527 int isexclude;
1528 int i, err;
1529
1530 if (!in_dev)
1531 return -ENODEV;
1532 read_lock(&in_dev->mc_list_lock);
1533 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
1534 if (*pmca == pmc->multiaddr)
1535 break;
1536 }
1537 if (!pmc) {
1538 /* MCA not found?? bug */
1539 read_unlock(&in_dev->mc_list_lock);
1540 return -ESRCH;
1541 }
1542 spin_lock_bh(&pmc->lock);
1543 read_unlock(&in_dev->mc_list_lock);
1544
1545#ifdef CONFIG_IP_MULTICAST
1546 sf_markstate(pmc);
1547#endif
1548 isexclude = pmc->sfmode == MCAST_EXCLUDE;
1549 if (!delta)
1550 pmc->sfcount[sfmode]++;
1551 err = 0;
1552 for (i=0; i<sfcount; i++) {
1553 err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i], delta);
1554 if (err)
1555 break;
1556 }
1557 if (err) {
1558 int j;
1559
1560 pmc->sfcount[sfmode]--;
1561 for (j=0; j<i; j++)
1562			(void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]);
1563 } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
1564#ifdef CONFIG_IP_MULTICAST
1565 struct in_device *in_dev = pmc->interface;
1566 struct ip_sf_list *psf;
1567#endif
1568
1569 /* filter mode change */
1570 if (pmc->sfcount[MCAST_EXCLUDE])
1571 pmc->sfmode = MCAST_EXCLUDE;
1572 else if (pmc->sfcount[MCAST_INCLUDE])
1573 pmc->sfmode = MCAST_INCLUDE;
1574#ifdef CONFIG_IP_MULTICAST
1575 /* else no filters; keep old mode for reports */
1576
1577 pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
1578 IGMP_Unsolicited_Report_Count;
1579 in_dev->mr_ifc_count = pmc->crcount;
1580 for (psf=pmc->sources; psf; psf = psf->sf_next)
1581 psf->sf_crcount = 0;
1582 igmp_ifc_event(in_dev);
1583 } else if (sf_setstate(pmc)) {
1584 igmp_ifc_event(in_dev);
1585#endif
1586 }
1587 spin_unlock_bh(&pmc->lock);
1588 return err;
1589}
1590
1591static void ip_mc_clear_src(struct ip_mc_list *pmc)
1592{
1593 struct ip_sf_list *psf, *nextpsf;
1594
1595 for (psf=pmc->tomb; psf; psf=nextpsf) {
1596 nextpsf = psf->sf_next;
1597 kfree(psf);
1598 }
1599 pmc->tomb = NULL;
1600 for (psf=pmc->sources; psf; psf=nextpsf) {
1601 nextpsf = psf->sf_next;
1602 kfree(psf);
1603 }
1604 pmc->sources = NULL;
1605 pmc->sfmode = MCAST_EXCLUDE;
1606	pmc->sfcount[MCAST_INCLUDE] = 0;
1607 pmc->sfcount[MCAST_EXCLUDE] = 1;
1608}
1609
1610
1611/*
1612 * Join a multicast group
1613 */
1614int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1615{
1616 int err;
1617 u32 addr = imr->imr_multiaddr.s_addr;
1618 struct ip_mc_socklist *iml, *i;
1619 struct in_device *in_dev;
1620 struct inet_sock *inet = inet_sk(sk);
1621 int count = 0;
1622
1623 if (!MULTICAST(addr))
1624 return -EINVAL;
1625
1626 rtnl_shlock();
1627
1628 in_dev = ip_mc_find_dev(imr);
1629
1630 if (!in_dev) {
1631 iml = NULL;
1632 err = -ENODEV;
1633 goto done;
1634 }
1635
1636 iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
1637
1638 err = -EADDRINUSE;
1639 for (i = inet->mc_list; i; i = i->next) {
1640 if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) {
1641 /* New style additions are reference counted */
1642 if (imr->imr_address.s_addr == 0) {
1643 i->count++;
1644 err = 0;
1645 }
1646 goto done;
1647 }
1648 count++;
1649 }
1650 err = -ENOBUFS;
1651 if (iml == NULL || count >= sysctl_igmp_max_memberships)
1652 goto done;
1653 memcpy(&iml->multi, imr, sizeof(*imr));
1654 iml->next = inet->mc_list;
1655 iml->count = 1;
1656 iml->sflist = NULL;
1657 iml->sfmode = MCAST_EXCLUDE;
1658 inet->mc_list = iml;
1659 ip_mc_inc_group(in_dev, addr);
1660 iml = NULL;
1661 err = 0;
1662
1663done:
1664 rtnl_shunlock();
1665 if (iml)
1666 sock_kfree_s(sk, iml, sizeof(*iml));
1667 return err;
1668}
1669
1670static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
1671 struct in_device *in_dev)
1672{
1673 int err;
1674
1675 if (iml->sflist == 0) {
1676 /* any-source empty exclude case */
1677 return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
1678 iml->sfmode, 0, NULL, 0);
1679 }
1680 err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
1681 iml->sfmode, iml->sflist->sl_count,
1682 iml->sflist->sl_addr, 0);
1683 sock_kfree_s(sk, iml->sflist, IP_SFLSIZE(iml->sflist->sl_max));
1684 iml->sflist = NULL;
1685 return err;
1686}
1687
1688/*
1689 * Ask a socket to leave a group.
1690 */
1691
1692int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1693{
1694 struct inet_sock *inet = inet_sk(sk);
1695 struct ip_mc_socklist *iml, **imlp;
1696
1697 rtnl_lock();
1698 for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) {
1699 if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr &&
1700 iml->multi.imr_address.s_addr==imr->imr_address.s_addr &&
1701 (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) {
1702 struct in_device *in_dev;
1703
1704 in_dev = inetdev_by_index(iml->multi.imr_ifindex);
1705 if (in_dev)
1706 (void) ip_mc_leave_src(sk, iml, in_dev);
1707 if (--iml->count) {
1708 rtnl_unlock();
1709 if (in_dev)
1710 in_dev_put(in_dev);
1711 return 0;
1712 }
1713
1714 *imlp = iml->next;
1715
1716 if (in_dev) {
1717 ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
1718 in_dev_put(in_dev);
1719 }
1720 rtnl_unlock();
1721 sock_kfree_s(sk, iml, sizeof(*iml));
1722 return 0;
1723 }
1724 }
1725 rtnl_unlock();
1726 return -EADDRNOTAVAIL;
1727}
1728
1729int ip_mc_source(int add, int omode, struct sock *sk, struct
1730 ip_mreq_source *mreqs, int ifindex)
1731{
1732 int err;
1733 struct ip_mreqn imr;
1734 u32 addr = mreqs->imr_multiaddr;
1735 struct ip_mc_socklist *pmc;
1736 struct in_device *in_dev = NULL;
1737 struct inet_sock *inet = inet_sk(sk);
1738 struct ip_sf_socklist *psl;
1739 int i, j, rv;
1740
1741 if (!MULTICAST(addr))
1742 return -EINVAL;
1743
1744 rtnl_shlock();
1745
1746 imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr;
1747 imr.imr_address.s_addr = mreqs->imr_interface;
1748 imr.imr_ifindex = ifindex;
1749 in_dev = ip_mc_find_dev(&imr);
1750
1751 if (!in_dev) {
1752 err = -ENODEV;
1753 goto done;
1754 }
1755 err = -EADDRNOTAVAIL;
1756
1757 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1758 if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0)
1759 break;
1760 }
1761 if (!pmc) /* must have a prior join */
1762 goto done;
1763 /* if a source filter was set, must be the same mode as before */
1764 if (pmc->sflist) {
1765 if (pmc->sfmode != omode)
1766 goto done;
1767 } else if (pmc->sfmode != omode) {
1768 /* allow mode switches for empty-set filters */
1769 ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
1770 ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0,
1771 NULL, 0);
1772 pmc->sfmode = omode;
1773 }
1774
1775 psl = pmc->sflist;
1776 if (!add) {
1777 if (!psl)
1778 goto done;
1779 rv = !0;
1780 for (i=0; i<psl->sl_count; i++) {
1781 rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
1782 sizeof(__u32));
1783 if (rv == 0)
1784 break;
1785 }
1786 if (rv) /* source not found */
1787 goto done;
1788
1789 /* update the interface filter */
1790 ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
1791 &mreqs->imr_sourceaddr, 1);
1792
1793 for (j=i+1; j<psl->sl_count; j++)
1794 psl->sl_addr[j-1] = psl->sl_addr[j];
1795 psl->sl_count--;
1796 err = 0;
1797 goto done;
1798 }
1799 /* else, add a new source to the filter */
1800
1801 if (psl && psl->sl_count >= sysctl_igmp_max_msf) {
1802 err = -ENOBUFS;
1803 goto done;
1804 }
1805 if (!psl || psl->sl_count == psl->sl_max) {
1806 struct ip_sf_socklist *newpsl;
1807 int count = IP_SFBLOCK;
1808
1809 if (psl)
1810 count += psl->sl_max;
1811 newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
1812 IP_SFLSIZE(count), GFP_KERNEL);
1813 if (!newpsl) {
1814 err = -ENOBUFS;
1815 goto done;
1816 }
1817 newpsl->sl_max = count;
1818 newpsl->sl_count = count - IP_SFBLOCK;
1819 if (psl) {
1820 for (i=0; i<psl->sl_count; i++)
1821 newpsl->sl_addr[i] = psl->sl_addr[i];
1822 sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max));
1823 }
1824 pmc->sflist = psl = newpsl;
1825 }
1826 rv = 1; /* > 0 for insert logic below if sl_count is 0 */
1827 for (i=0; i<psl->sl_count; i++) {
1828 rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
1829 sizeof(__u32));
1830 if (rv == 0)
1831 break;
1832 }
1833 if (rv == 0) /* address already there is an error */
1834 goto done;
1835 for (j=psl->sl_count-1; j>=i; j--)
1836 psl->sl_addr[j+1] = psl->sl_addr[j];
1837 psl->sl_addr[i] = mreqs->imr_sourceaddr;
1838 psl->sl_count++;
1839 err = 0;
1840 /* update the interface list */
1841 ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
1842 &mreqs->imr_sourceaddr, 1);
1843done:
1844 rtnl_shunlock();
1845 return err;
1846}
1847
1848int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1849{
1850 int err;
1851 struct ip_mreqn imr;
1852 u32 addr = msf->imsf_multiaddr;
1853 struct ip_mc_socklist *pmc;
1854 struct in_device *in_dev;
1855 struct inet_sock *inet = inet_sk(sk);
1856 struct ip_sf_socklist *newpsl, *psl;
1857
1858 if (!MULTICAST(addr))
1859 return -EINVAL;
1860 if (msf->imsf_fmode != MCAST_INCLUDE &&
1861 msf->imsf_fmode != MCAST_EXCLUDE)
1862 return -EINVAL;
1863
1864 rtnl_shlock();
1865
1866 imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
1867 imr.imr_address.s_addr = msf->imsf_interface;
1868 imr.imr_ifindex = ifindex;
1869 in_dev = ip_mc_find_dev(&imr);
1870
1871 if (!in_dev) {
1872 err = -ENODEV;
1873 goto done;
1874 }
1875 err = -EADDRNOTAVAIL;
1876
1877 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1878 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
1879 pmc->multi.imr_ifindex == imr.imr_ifindex)
1880 break;
1881 }
1882 if (!pmc) /* must have a prior join */
1883 goto done;
1884 if (msf->imsf_numsrc) {
1885 newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
1886 IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL);
1887 if (!newpsl) {
1888 err = -ENOBUFS;
1889 goto done;
1890 }
1891 newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc;
1892 memcpy(newpsl->sl_addr, msf->imsf_slist,
1893 msf->imsf_numsrc * sizeof(msf->imsf_slist[0]));
1894 err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
1895 msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0);
1896 if (err) {
1897 sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max));
1898 goto done;
1899 }
1900 } else
1901 newpsl = NULL;
1902 psl = pmc->sflist;
1903 if (psl) {
1904 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
1905 psl->sl_count, psl->sl_addr, 0);
1906 sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max));
1907 } else
1908 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
1909 0, NULL, 0);
1910 pmc->sflist = newpsl;
1911 pmc->sfmode = msf->imsf_fmode;
1912done:
1913 rtnl_shunlock();
1914 return err;
1915}
1916
1917int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
1918 struct ip_msfilter __user *optval, int __user *optlen)
1919{
1920 int err, len, count, copycount;
1921 struct ip_mreqn imr;
1922 u32 addr = msf->imsf_multiaddr;
1923 struct ip_mc_socklist *pmc;
1924 struct in_device *in_dev;
1925 struct inet_sock *inet = inet_sk(sk);
1926 struct ip_sf_socklist *psl;
1927
1928 if (!MULTICAST(addr))
1929 return -EINVAL;
1930
1931 rtnl_shlock();
1932
1933 imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
1934 imr.imr_address.s_addr = msf->imsf_interface;
1935 imr.imr_ifindex = 0;
1936 in_dev = ip_mc_find_dev(&imr);
1937
1938 if (!in_dev) {
1939 err = -ENODEV;
1940 goto done;
1941 }
1942 err = -EADDRNOTAVAIL;
1943
1944 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1945 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
1946 pmc->multi.imr_ifindex == imr.imr_ifindex)
1947 break;
1948 }
1949 if (!pmc) /* must have a prior join */
1950 goto done;
1951 msf->imsf_fmode = pmc->sfmode;
1952 psl = pmc->sflist;
1953 rtnl_shunlock();
1954 if (!psl) {
1955 len = 0;
1956 count = 0;
1957 } else {
1958 count = psl->sl_count;
1959 }
1960 copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc;
1961 len = copycount * sizeof(psl->sl_addr[0]);
1962 msf->imsf_numsrc = count;
1963 if (put_user(IP_MSFILTER_SIZE(copycount), optlen) ||
1964 copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) {
1965 return -EFAULT;
1966 }
1967 if (len &&
1968 copy_to_user(&optval->imsf_slist[0], psl->sl_addr, len))
1969 return -EFAULT;
1970 return 0;
1971done:
1972 rtnl_shunlock();
1973 return err;
1974}
1975
1976int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
1977 struct group_filter __user *optval, int __user *optlen)
1978{
1979 int err, i, count, copycount;
1980 struct sockaddr_in *psin;
1981 u32 addr;
1982 struct ip_mc_socklist *pmc;
1983 struct inet_sock *inet = inet_sk(sk);
1984 struct ip_sf_socklist *psl;
1985
1986 psin = (struct sockaddr_in *)&gsf->gf_group;
1987 if (psin->sin_family != AF_INET)
1988 return -EINVAL;
1989 addr = psin->sin_addr.s_addr;
1990 if (!MULTICAST(addr))
1991 return -EINVAL;
1992
1993 rtnl_shlock();
1994
1995 err = -EADDRNOTAVAIL;
1996
1997 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1998 if (pmc->multi.imr_multiaddr.s_addr == addr &&
1999 pmc->multi.imr_ifindex == gsf->gf_interface)
2000 break;
2001 }
2002 if (!pmc) /* must have a prior join */
2003 goto done;
2004 gsf->gf_fmode = pmc->sfmode;
2005 psl = pmc->sflist;
2006 rtnl_shunlock();
2007 count = psl ? psl->sl_count : 0;
2008 copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
2009 gsf->gf_numsrc = count;
2010 if (put_user(GROUP_FILTER_SIZE(copycount), optlen) ||
2011 copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
2012 return -EFAULT;
2013 }
2014 for (i=0; i<copycount; i++) {
2015 struct sockaddr_in *psin;
2016 struct sockaddr_storage ss;
2017
2018 psin = (struct sockaddr_in *)&ss;
2019 memset(&ss, 0, sizeof(ss));
2020 psin->sin_family = AF_INET;
2021 psin->sin_addr.s_addr = psl->sl_addr[i];
2022 if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss)))
2023 return -EFAULT;
2024 }
2025 return 0;
2026done:
2027 rtnl_shunlock();
2028 return err;
2029}
2030
2031/*
2032 * check if a multicast source filter allows delivery for a given <src,dst,intf>
2033 */
2034int ip_mc_sf_allow(struct sock *sk, u32 loc_addr, u32 rmt_addr, int dif)
2035{
2036 struct inet_sock *inet = inet_sk(sk);
2037 struct ip_mc_socklist *pmc;
2038 struct ip_sf_socklist *psl;
2039 int i;
2040
2041 if (!MULTICAST(loc_addr))
2042 return 1;
2043
2044 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
2045 if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
2046 pmc->multi.imr_ifindex == dif)
2047 break;
2048 }
2049 if (!pmc)
2050 return 1;
2051 psl = pmc->sflist;
2052 if (!psl)
2053 return pmc->sfmode == MCAST_EXCLUDE;
2054
2055 for (i=0; i<psl->sl_count; i++) {
2056 if (psl->sl_addr[i] == rmt_addr)
2057 break;
2058 }
2059 if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
2060 return 0;
2061 if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
2062 return 0;
2063 return 1;
2064}
2065
2066/*
2067 * A socket is closing.
2068 */
2069
2070void ip_mc_drop_socket(struct sock *sk)
2071{
2072 struct inet_sock *inet = inet_sk(sk);
2073 struct ip_mc_socklist *iml;
2074
2075 if (inet->mc_list == NULL)
2076 return;
2077
2078 rtnl_lock();
2079 while ((iml = inet->mc_list) != NULL) {
2080 struct in_device *in_dev;
2081 inet->mc_list = iml->next;
2082
2083 if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL) {
2084 (void) ip_mc_leave_src(sk, iml, in_dev);
2085 ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
2086 in_dev_put(in_dev);
2087 }
2088 sock_kfree_s(sk, iml, sizeof(*iml));
2089
2090 }
2091 rtnl_unlock();
2092}
2093
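/* Check whether a packet for mc_addr from src_addr may be delivered on this
 * interface: the group must be joined and, when a source address is given,
 * the per-source include/exclude counters must allow it. IGMP itself is
 * accepted for any joined group.
 */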
2094int ip_check_mc(struct in_device *in_dev, u32 mc_addr, u32 src_addr, u16 proto)
2095{
2096 struct ip_mc_list *im;
2097 struct ip_sf_list *psf;
2098 int rv = 0;
2099
2100 read_lock(&in_dev->mc_list_lock);
2101 for (im=in_dev->mc_list; im; im=im->next) {
2102 if (im->multiaddr == mc_addr)
2103 break;
2104 }
2105 if (im && proto == IPPROTO_IGMP) {
2106 rv = 1;
2107 } else if (im) {
2108 if (src_addr) {
2109 for (psf=im->sources; psf; psf=psf->sf_next) {
2110 if (psf->sf_inaddr == src_addr)
2111 break;
2112 }
2113 if (psf)
2114 rv = psf->sf_count[MCAST_INCLUDE] ||
2115 psf->sf_count[MCAST_EXCLUDE] !=
2116 im->sfcount[MCAST_EXCLUDE];
2117 else
2118 rv = im->sfcount[MCAST_EXCLUDE] != 0;
2119 } else
2120 rv = 1; /* unspecified source; tentatively allow */
2121 }
2122 read_unlock(&in_dev->mc_list_lock);
2123 return rv;
2124}
2125
2126#if defined(CONFIG_PROC_FS)
2127struct igmp_mc_iter_state {
2128 struct net_device *dev;
2129 struct in_device *in_dev;
2130};
2131
2132#define igmp_mc_seq_private(seq) ((struct igmp_mc_iter_state *)(seq)->private)
2133
2134static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
2135{
2136 struct ip_mc_list *im = NULL;
2137 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2138
2139 for (state->dev = dev_base, state->in_dev = NULL;
2140 state->dev;
2141 state->dev = state->dev->next) {
2142 struct in_device *in_dev;
2143 in_dev = in_dev_get(state->dev);
2144 if (!in_dev)
2145 continue;
2146 read_lock(&in_dev->mc_list_lock);
2147 im = in_dev->mc_list;
2148 if (im) {
2149 state->in_dev = in_dev;
2150 break;
2151 }
2152 read_unlock(&in_dev->mc_list_lock);
2153 in_dev_put(in_dev);
2154 }
2155 return im;
2156}
2157
2158static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
2159{
2160 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2161 im = im->next;
2162 while (!im) {
2163 if (likely(state->in_dev != NULL)) {
2164 read_unlock(&state->in_dev->mc_list_lock);
2165 in_dev_put(state->in_dev);
2166 }
2167 state->dev = state->dev->next;
2168 if (!state->dev) {
2169 state->in_dev = NULL;
2170 break;
2171 }
2172 state->in_dev = in_dev_get(state->dev);
2173 if (!state->in_dev)
2174 continue;
2175 read_lock(&state->in_dev->mc_list_lock);
2176 im = state->in_dev->mc_list;
2177 }
2178 return im;
2179}
2180
2181static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
2182{
2183 struct ip_mc_list *im = igmp_mc_get_first(seq);
2184 if (im)
2185 while (pos && (im = igmp_mc_get_next(seq, im)) != NULL)
2186 --pos;
2187 return pos ? NULL : im;
2188}
2189
2190static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
2191{
2192 read_lock(&dev_base_lock);
2193 return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2194}
2195
2196static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2197{
2198 struct ip_mc_list *im;
2199 if (v == SEQ_START_TOKEN)
2200 im = igmp_mc_get_first(seq);
2201 else
2202 im = igmp_mc_get_next(seq, v);
2203 ++*pos;
2204 return im;
2205}
2206
2207static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
2208{
2209 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2210 if (likely(state->in_dev != NULL)) {
2211 read_unlock(&state->in_dev->mc_list_lock);
2212 in_dev_put(state->in_dev);
2213 state->in_dev = NULL;
2214 }
2215 state->dev = NULL;
2216 read_unlock(&dev_base_lock);
2217}
2218
2219static int igmp_mc_seq_show(struct seq_file *seq, void *v)
2220{
2221 if (v == SEQ_START_TOKEN)
2222 seq_puts(seq,
2223 "Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n");
2224 else {
2225 struct ip_mc_list *im = (struct ip_mc_list *)v;
2226 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2227 char *querier;
2228#ifdef CONFIG_IP_MULTICAST
2229 querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
2230 IGMP_V2_SEEN(state->in_dev) ? "V2" :
2231 "V3";
2232#else
2233 querier = "NONE";
2234#endif
2235
2236 if (state->in_dev->mc_list == im) {
2237 seq_printf(seq, "%d\t%-10s: %5d %7s\n",
2238 state->dev->ifindex, state->dev->name, state->dev->mc_count, querier);
2239 }
2240
2241 seq_printf(seq,
2242 "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n",
2243 im->multiaddr, im->users,
2244 im->tm_running, im->tm_running ?
2245 jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
2246 im->reporter);
2247 }
2248 return 0;
2249}
2250
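Taken together, these format strings produce one header line, one line per device (emitted before that device's first group), and one indented line per group. As a purely illustrative example (hypothetical values; little-endian byte order for the %08lX group address), a host whose loopback interface has joined only 224.0.0.1 might show something like:

Idx	Device : Count Querier	Group Users Timer	Reporter
1	lo        :     1      V3
				010000E0     1 0:00000000		0
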
2251static struct seq_operations igmp_mc_seq_ops = {
2252 .start = igmp_mc_seq_start,
2253 .next = igmp_mc_seq_next,
2254 .stop = igmp_mc_seq_stop,
2255 .show = igmp_mc_seq_show,
2256};
2257
2258static int igmp_mc_seq_open(struct inode *inode, struct file *file)
2259{
2260 struct seq_file *seq;
2261 int rc = -ENOMEM;
2262 struct igmp_mc_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
2263
2264 if (!s)
2265 goto out;
2266 rc = seq_open(file, &igmp_mc_seq_ops);
2267 if (rc)
2268 goto out_kfree;
2269
2270 seq = file->private_data;
2271 seq->private = s;
2272 memset(s, 0, sizeof(*s));
2273out:
2274 return rc;
2275out_kfree:
2276 kfree(s);
2277 goto out;
2278}
2279
2280static struct file_operations igmp_mc_seq_fops = {
2281 .owner = THIS_MODULE,
2282 .open = igmp_mc_seq_open,
2283 .read = seq_read,
2284 .llseek = seq_lseek,
2285 .release = seq_release_private,
2286};
2287
2288struct igmp_mcf_iter_state {
2289 struct net_device *dev;
2290 struct in_device *idev;
2291 struct ip_mc_list *im;
2292};
2293
2294#define igmp_mcf_seq_private(seq) ((struct igmp_mcf_iter_state *)(seq)->private)
2295
2296static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
2297{
2298 struct ip_sf_list *psf = NULL;
2299 struct ip_mc_list *im = NULL;
2300 struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
2301
2302 for (state->dev = dev_base, state->idev = NULL, state->im = NULL;
2303 state->dev;
2304 state->dev = state->dev->next) {
2305 struct in_device *idev;
2306 idev = in_dev_get(state->dev);
2307 if (unlikely(idev == NULL))
2308 continue;
2309 read_lock(&idev->mc_list_lock);
2310 im = idev->mc_list;
2311 if (likely(im != NULL)) {
2312 spin_lock_bh(&im->lock);
2313 psf = im->sources;
2314 if (likely(psf != NULL)) {
2315 state->im = im;
2316 state->idev = idev;
2317 break;
2318 }
2319 spin_unlock_bh(&im->lock);
2320 }
2321 read_unlock(&idev->mc_list_lock);
2322 in_dev_put(idev);
2323 }
2324 return psf;
2325}
2326
2327static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf)
2328{
2329 struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
2330
2331 psf = psf->sf_next;
2332 while (!psf) {
2333 spin_unlock_bh(&state->im->lock);
2334 state->im = state->im->next;
2335 while (!state->im) {
2336 if (likely(state->idev != NULL)) {
2337 read_unlock(&state->idev->mc_list_lock);
2338 in_dev_put(state->idev);
2339 }
2340 state->dev = state->dev->next;
2341 if (!state->dev) {
2342 state->idev = NULL;
2343 goto out;
2344 }
2345 state->idev = in_dev_get(state->dev);
2346 if (!state->idev)
2347 continue;
2348 read_lock(&state->idev->mc_list_lock);
2349 state->im = state->idev->mc_list;
2350 }
2351 if (!state->im)
2352 break;
2353 spin_lock_bh(&state->im->lock);
2354 psf = state->im->sources;
2355 }
2356out:
2357 return psf;
2358}
2359
2360static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
2361{
2362 struct ip_sf_list *psf = igmp_mcf_get_first(seq);
2363 if (psf)
2364 while (pos && (psf = igmp_mcf_get_next(seq, psf)) != NULL)
2365 --pos;
2366 return pos ? NULL : psf;
2367}
2368
2369static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
2370{
2371 read_lock(&dev_base_lock);
2372 return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2373}
2374
2375static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2376{
2377 struct ip_sf_list *psf;
2378 if (v == SEQ_START_TOKEN)
2379 psf = igmp_mcf_get_first(seq);
2380 else
2381 psf = igmp_mcf_get_next(seq, v);
2382 ++*pos;
2383 return psf;
2384}
2385
2386static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
2387{
2388 struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
2389 if (likely(state->im != NULL)) {
2390 spin_unlock_bh(&state->im->lock);
2391 state->im = NULL;
2392 }
2393 if (likely(state->idev != NULL)) {
2394 read_unlock(&state->idev->mc_list_lock);
2395 in_dev_put(state->idev);
2396 state->idev = NULL;
2397 }
2398 state->dev = NULL;
2399 read_unlock(&dev_base_lock);
2400}
2401
2402static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
2403{
2404 struct ip_sf_list *psf = (struct ip_sf_list *)v;
2405 struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
2406
2407 if (v == SEQ_START_TOKEN) {
2408 seq_printf(seq,
2409 "%3s %6s "
2410 "%10s %10s %6s %6s\n", "Idx",
2411 "Device", "MCA",
2412 "SRC", "INC", "EXC");
2413 } else {
2414 seq_printf(seq,
2415 "%3d %6.6s 0x%08x "
2416 "0x%08x %6lu %6lu\n",
2417 state->dev->ifindex, state->dev->name,
2418 ntohl(state->im->multiaddr),
2419 ntohl(psf->sf_inaddr),
2420 psf->sf_count[MCAST_INCLUDE],
2421 psf->sf_count[MCAST_EXCLUDE]);
2422 }
2423 return 0;
2424}
2425
2426static struct seq_operations igmp_mcf_seq_ops = {
2427 .start = igmp_mcf_seq_start,
2428 .next = igmp_mcf_seq_next,
2429 .stop = igmp_mcf_seq_stop,
2430 .show = igmp_mcf_seq_show,
2431};
2432
2433static int igmp_mcf_seq_open(struct inode *inode, struct file *file)
2434{
2435 struct seq_file *seq;
2436 int rc = -ENOMEM;
2437 struct igmp_mcf_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
2438
2439 if (!s)
2440 goto out;
2441 rc = seq_open(file, &igmp_mcf_seq_ops);
2442 if (rc)
2443 goto out_kfree;
2444
2445 seq = file->private_data;
2446 seq->private = s;
2447 memset(s, 0, sizeof(*s));
2448out:
2449 return rc;
2450out_kfree:
2451 kfree(s);
2452 goto out;
2453}
2454
2455static struct file_operations igmp_mcf_seq_fops = {
2456 .owner = THIS_MODULE,
2457 .open = igmp_mcf_seq_open,
2458 .read = seq_read,
2459 .llseek = seq_lseek,
2460 .release = seq_release_private,
2461};
2462
2463int __init igmp_mc_proc_init(void)
2464{
2465 proc_net_fops_create("igmp", S_IRUGO, &igmp_mc_seq_fops);
2466 proc_net_fops_create("mcfilter", S_IRUGO, &igmp_mcf_seq_fops);
2467 return 0;
2468}
2469#endif
2470
2471EXPORT_SYMBOL(ip_mc_dec_group);
2472EXPORT_SYMBOL(ip_mc_inc_group);
2473EXPORT_SYMBOL(ip_mc_join_group);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
new file mode 100644
index 000000000000..95473953c406
--- /dev/null
+++ b/net/ipv4/inetpeer.c
@@ -0,0 +1,460 @@
1/*
2 * INETPEER - A storage for permanent information about peers
3 *
4 * This source is covered by the GNU GPL, the same as all kernel sources.
5 *
6 * Version: $Id: inetpeer.c,v 1.7 2001/09/20 21:22:50 davem Exp $
7 *
8 * Authors: Andrey V. Savochkin <saw@msu.ru>
9 */
10
11#include <linux/module.h>
12#include <linux/types.h>
13#include <linux/slab.h>
14#include <linux/interrupt.h>
15#include <linux/spinlock.h>
16#include <linux/random.h>
17#include <linux/sched.h>
18#include <linux/timer.h>
19#include <linux/time.h>
20#include <linux/kernel.h>
21#include <linux/mm.h>
22#include <linux/net.h>
23#include <net/inetpeer.h>
24
25/*
26 * Theory of operations.
 27 * We keep one entry for each peer IP address. Each node contains long-living
 28 * information about the peer which doesn't depend on routes.
 29 * At the moment this information consists only of the ID field for the next
 30 * outgoing IP packet. This field is incremented with each packet, as encoded
 31 * in the inet_getid() function (include/net/inetpeer.h).
 32 * At the time of writing, the identifier of IP packets is made unpredictable
 33 * by this code only for packets subjected (actually or potentially) to
 34 * defragmentation. I.e. DF packets smaller than the PMTU use a constant ID
 35 * and do not use this code (see
 36 * ip_select_ident() in include/net/ip.h).
37 *
38 * Route cache entries hold references to our nodes.
39 * New cache entries get references via lookup by destination IP address in
 40 * the avl tree. The reference is grabbed only when it's needed, i.e. only
 41 * when we try to output an IP packet which needs an unpredictable ID (see
 42 * __ip_select_ident() in net/ipv4/route.c).
 43 * Nodes are removed only when the reference counter goes to 0.
 44 * When that happens, the node is actually removed only after a sufficient
 45 * amount of time has passed since its last use. The least-recently-used
 46 * entry can also be removed if the pool is overloaded, i.e. if the total
 47 * number of entries is greater than or equal to the threshold.
48 *
49 * Node pool is organised as an AVL tree.
50 * Such an implementation has been chosen not just for fun. It's a way to
51 * prevent easy and efficient DoS attacks by creating hash collisions. A huge
 52 * number of long-living nodes in a single hash slot would significantly delay
 53 * lookups performed with BHs disabled.
54 *
55 * Serialisation issues.
56 * 1. Nodes may appear in the tree only with the pool write lock held.
57 * 2. Nodes may disappear from the tree only with the pool write lock held
58 * AND reference count being 0.
 59 * 3. Nodes appear and disappear from the unused node list only under
60 * "inet_peer_unused_lock".
61 * 4. Global variable peer_total is modified under the pool lock.
62 * 5. struct inet_peer fields modification:
63 * avl_left, avl_right, avl_parent, avl_height: pool lock
64 * unused_next, unused_prevp: unused node list lock
65 * refcnt: atomically against modifications on other CPU;
66 * usually under some other lock to prevent node disappearing
67 * dtime: unused node list lock
68 * v4daddr: unchangeable
69 * ip_id_count: idlock
70 */
71
72/* Exported for inet_getid inline function. */
73DEFINE_SPINLOCK(inet_peer_idlock);
74
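The idlock above serialises the per-peer packet ID counter that the theory-of-operations comment refers to. As a rough sketch of that use (the real helper is inet_getid() in include/net/inetpeer.h and may differ in detail; the function name below is purely illustrative):

/* Hedged sketch only: shows how inet_peer_idlock guards ip_id_count,
 * assuming ip_id_count is the peer's 16-bit packet ID counter. */
static inline __u16 peer_next_ip_id(struct inet_peer *p)
{
	__u16 id;

	spin_lock_bh(&inet_peer_idlock);	/* serialise against other CPUs/BH */
	id = p->ip_id_count++;			/* one ID per outgoing packet */
	spin_unlock_bh(&inet_peer_idlock);
	return id;
}
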
75static kmem_cache_t *peer_cachep;
76
77#define node_height(x) x->avl_height
78static struct inet_peer peer_fake_node = {
79 .avl_left = &peer_fake_node,
80 .avl_right = &peer_fake_node,
81 .avl_height = 0
82};
83#define peer_avl_empty (&peer_fake_node)
84static struct inet_peer *peer_root = peer_avl_empty;
85static DEFINE_RWLOCK(peer_pool_lock);
86#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
87
88static volatile int peer_total;
89/* Exported for sysctl_net_ipv4. */
90int inet_peer_threshold = 65536 + 128; /* start to throw entries more
91 * aggressively at this stage */
92int inet_peer_minttl = 120 * HZ; /* TTL under high load: 120 sec */
93int inet_peer_maxttl = 10 * 60 * HZ; /* usual time to live: 10 min */
94
95static struct inet_peer *inet_peer_unused_head;
96/* Exported for inet_putpeer inline function. */
97struct inet_peer **inet_peer_unused_tailp = &inet_peer_unused_head;
98DEFINE_SPINLOCK(inet_peer_unused_lock);
99#define PEER_MAX_CLEANUP_WORK 30
100
101static void peer_check_expire(unsigned long dummy);
102static struct timer_list peer_periodic_timer =
103 TIMER_INITIALIZER(peer_check_expire, 0, 0);
104
105/* Exported for sysctl_net_ipv4. */
106int inet_peer_gc_mintime = 10 * HZ,
107 inet_peer_gc_maxtime = 120 * HZ;
108
109/* Called from ip_output.c:ip_init */
110void __init inet_initpeers(void)
111{
112 struct sysinfo si;
113
114 /* Use the straight interface to information about memory. */
115 si_meminfo(&si);
116 /* The values below were suggested by Alexey Kuznetsov
117 * <kuznet@ms2.inr.ac.ru>. I don't have any opinion about the values
118 * myself. --SAW
119 */
120 if (si.totalram <= (32768*1024)/PAGE_SIZE)
121 inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */
122 if (si.totalram <= (16384*1024)/PAGE_SIZE)
123 inet_peer_threshold >>= 1; /* about 512KB */
124 if (si.totalram <= (8192*1024)/PAGE_SIZE)
125 inet_peer_threshold >>= 2; /* about 128KB */
126
127 peer_cachep = kmem_cache_create("inet_peer_cache",
128 sizeof(struct inet_peer),
129 0, SLAB_HWCACHE_ALIGN,
130 NULL, NULL);
131
132 if (!peer_cachep)
133 panic("cannot create inet_peer_cache");
134
135 /* All the timers, started at system startup tend
136 to synchronize. Perturb it a bit.
137 */
138 peer_periodic_timer.expires = jiffies
139 + net_random() % inet_peer_gc_maxtime
140 + inet_peer_gc_maxtime;
141 add_timer(&peer_periodic_timer);
142}
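To make the scaling above concrete: with 4 KB pages the three tests translate to 32 MB, 16 MB and 8 MB of total RAM, and they apply cumulatively. An 8 MB machine therefore ends up with ((65664 >> 1) >> 1) >> 2 = 4104 peer entries, a 16 MB machine with 16416, a 32 MB machine with 32832, and anything larger keeps the default 65664.
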
143
144/* Called with or without local BH being disabled. */
145static void unlink_from_unused(struct inet_peer *p)
146{
147 spin_lock_bh(&inet_peer_unused_lock);
148 if (p->unused_prevp != NULL) {
149 /* On unused list. */
150 *p->unused_prevp = p->unused_next;
151 if (p->unused_next != NULL)
152 p->unused_next->unused_prevp = p->unused_prevp;
153 else
154 inet_peer_unused_tailp = p->unused_prevp;
155 p->unused_prevp = NULL; /* mark it as removed */
156 }
157 spin_unlock_bh(&inet_peer_unused_lock);
158}
159
160/* Called with local BH disabled and the pool lock held. */
161#define lookup(daddr) \
162({ \
163 struct inet_peer *u, **v; \
164 stackptr = stack; \
165 *stackptr++ = &peer_root; \
166 for (u = peer_root; u != peer_avl_empty; ) { \
167 if (daddr == u->v4daddr) \
168 break; \
169 if (daddr < u->v4daddr) \
170 v = &u->avl_left; \
171 else \
172 v = &u->avl_right; \
173 *stackptr++ = v; \
174 u = *v; \
175 } \
176 u; \
177})
178
179/* Called with local BH disabled and the pool write lock held. */
180#define lookup_rightempty(start) \
181({ \
182 struct inet_peer *u, **v; \
183 *stackptr++ = &start->avl_left; \
184 v = &start->avl_left; \
185 for (u = *v; u->avl_right != peer_avl_empty; ) { \
186 v = &u->avl_right; \
187 *stackptr++ = v; \
188 u = *v; \
189 } \
190 u; \
191})
192
193/* Called with local BH disabled and the pool write lock held.
194 * Variable names are the proof of operation correctness.
195 * Look into mm/map_avl.c for a more detailed description of the ideas. */
196static void peer_avl_rebalance(struct inet_peer **stack[],
197 struct inet_peer ***stackend)
198{
199 struct inet_peer **nodep, *node, *l, *r;
200 int lh, rh;
201
202 while (stackend > stack) {
203 nodep = *--stackend;
204 node = *nodep;
205 l = node->avl_left;
206 r = node->avl_right;
207 lh = node_height(l);
208 rh = node_height(r);
209 if (lh > rh + 1) { /* l: RH+2 */
210 struct inet_peer *ll, *lr, *lrl, *lrr;
211 int lrh;
212 ll = l->avl_left;
213 lr = l->avl_right;
214 lrh = node_height(lr);
215 if (lrh <= node_height(ll)) { /* ll: RH+1 */
216 node->avl_left = lr; /* lr: RH or RH+1 */
217 node->avl_right = r; /* r: RH */
218 node->avl_height = lrh + 1; /* RH+1 or RH+2 */
219 l->avl_left = ll; /* ll: RH+1 */
220 l->avl_right = node; /* node: RH+1 or RH+2 */
221 l->avl_height = node->avl_height + 1;
222 *nodep = l;
223 } else { /* ll: RH, lr: RH+1 */
224 lrl = lr->avl_left; /* lrl: RH or RH-1 */
225 lrr = lr->avl_right; /* lrr: RH or RH-1 */
226 node->avl_left = lrr; /* lrr: RH or RH-1 */
227 node->avl_right = r; /* r: RH */
228 node->avl_height = rh + 1; /* node: RH+1 */
229 l->avl_left = ll; /* ll: RH */
230 l->avl_right = lrl; /* lrl: RH or RH-1 */
231 l->avl_height = rh + 1; /* l: RH+1 */
232 lr->avl_left = l; /* l: RH+1 */
233 lr->avl_right = node; /* node: RH+1 */
234 lr->avl_height = rh + 2;
235 *nodep = lr;
236 }
237 } else if (rh > lh + 1) { /* r: LH+2 */
238 struct inet_peer *rr, *rl, *rlr, *rll;
239 int rlh;
240 rr = r->avl_right;
241 rl = r->avl_left;
242 rlh = node_height(rl);
243 if (rlh <= node_height(rr)) { /* rr: LH+1 */
244 node->avl_right = rl; /* rl: LH or LH+1 */
245 node->avl_left = l; /* l: LH */
246 node->avl_height = rlh + 1; /* LH+1 or LH+2 */
247 r->avl_right = rr; /* rr: LH+1 */
248 r->avl_left = node; /* node: LH+1 or LH+2 */
249 r->avl_height = node->avl_height + 1;
250 *nodep = r;
251 } else { /* rr: RH, rl: RH+1 */
252 rlr = rl->avl_right; /* rlr: LH or LH-1 */
253 rll = rl->avl_left; /* rll: LH or LH-1 */
254 node->avl_right = rll; /* rll: LH or LH-1 */
255 node->avl_left = l; /* l: LH */
256 node->avl_height = lh + 1; /* node: LH+1 */
257 r->avl_right = rr; /* rr: LH */
258 r->avl_left = rlr; /* rlr: LH or LH-1 */
259 r->avl_height = lh + 1; /* r: LH+1 */
260 rl->avl_right = r; /* r: LH+1 */
261 rl->avl_left = node; /* node: LH+1 */
262 rl->avl_height = lh + 2;
263 *nodep = rl;
264 }
265 } else {
266 node->avl_height = (lh > rh ? lh : rh) + 1;
267 }
268 }
269}
270
271/* Called with local BH disabled and the pool write lock held. */
272#define link_to_pool(n) \
273do { \
274 n->avl_height = 1; \
275 n->avl_left = peer_avl_empty; \
276 n->avl_right = peer_avl_empty; \
277 **--stackptr = n; \
278 peer_avl_rebalance(stack, stackptr); \
279} while(0)
280
281/* May be called with local BH enabled. */
282static void unlink_from_pool(struct inet_peer *p)
283{
284 int do_free;
285
286 do_free = 0;
287
288 write_lock_bh(&peer_pool_lock);
289 /* Check the reference counter. It was artificially incremented by 1
290	 * in the cleanup() function to prevent it from suddenly disappearing. If the
291 * reference count is still 1 then the node is referenced only as `p'
292 * here and from the pool. So under the exclusive pool lock it's safe
293 * to remove the node and free it later. */
294 if (atomic_read(&p->refcnt) == 1) {
295 struct inet_peer **stack[PEER_MAXDEPTH];
296 struct inet_peer ***stackptr, ***delp;
297 if (lookup(p->v4daddr) != p)
298 BUG();
299 delp = stackptr - 1; /* *delp[0] == p */
300 if (p->avl_left == peer_avl_empty) {
301 *delp[0] = p->avl_right;
302 --stackptr;
303 } else {
304 /* look for a node to insert instead of p */
305 struct inet_peer *t;
306 t = lookup_rightempty(p);
307 if (*stackptr[-1] != t)
308 BUG();
309 **--stackptr = t->avl_left;
310 /* t is removed, t->v4daddr > x->v4daddr for any
311 * x in p->avl_left subtree.
312 * Put t in the old place of p. */
313 *delp[0] = t;
314 t->avl_left = p->avl_left;
315 t->avl_right = p->avl_right;
316 t->avl_height = p->avl_height;
317 if (delp[1] != &p->avl_left)
318 BUG();
319 delp[1] = &t->avl_left; /* was &p->avl_left */
320 }
321 peer_avl_rebalance(stack, stackptr);
322 peer_total--;
323 do_free = 1;
324 }
325 write_unlock_bh(&peer_pool_lock);
326
327 if (do_free)
328 kmem_cache_free(peer_cachep, p);
329 else
330 /* The node is used again. Decrease the reference counter
331 * back. The loop "cleanup -> unlink_from_unused
332 * -> unlink_from_pool -> putpeer -> link_to_unused
333 * -> cleanup (for the same node)"
334 * doesn't really exist because the entry will have a
335 * recent deletion time and will not be cleaned again soon. */
336 inet_putpeer(p);
337}
338
339/* May be called with local BH enabled. */
340static int cleanup_once(unsigned long ttl)
341{
342 struct inet_peer *p;
343
344 /* Remove the first entry from the list of unused nodes. */
345 spin_lock_bh(&inet_peer_unused_lock);
346 p = inet_peer_unused_head;
347 if (p != NULL) {
348 if (time_after(p->dtime + ttl, jiffies)) {
349 /* Do not prune fresh entries. */
350 spin_unlock_bh(&inet_peer_unused_lock);
351 return -1;
352 }
353 inet_peer_unused_head = p->unused_next;
354 if (p->unused_next != NULL)
355 p->unused_next->unused_prevp = p->unused_prevp;
356 else
357 inet_peer_unused_tailp = p->unused_prevp;
358 p->unused_prevp = NULL; /* mark as not on the list */
359 /* Grab an extra reference to prevent node disappearing
360 * before unlink_from_pool() call. */
361 atomic_inc(&p->refcnt);
362 }
363 spin_unlock_bh(&inet_peer_unused_lock);
364
365 if (p == NULL)
366 /* It means that the total number of USED entries has
367 * grown over inet_peer_threshold. It shouldn't really
368 * happen because of entry limits in route cache. */
369 return -1;
370
371 unlink_from_pool(p);
372 return 0;
373}
374
375/* Called with or without local BH being disabled. */
376struct inet_peer *inet_getpeer(__u32 daddr, int create)
377{
378 struct inet_peer *p, *n;
379 struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr;
380
381	/* Look up the address quickly. */
382 read_lock_bh(&peer_pool_lock);
383 p = lookup(daddr);
384 if (p != peer_avl_empty)
385 atomic_inc(&p->refcnt);
386 read_unlock_bh(&peer_pool_lock);
387
388 if (p != peer_avl_empty) {
389 /* The existing node has been found. */
390 /* Remove the entry from unused list if it was there. */
391 unlink_from_unused(p);
392 return p;
393 }
394
395 if (!create)
396 return NULL;
397
398 /* Allocate the space outside the locked region. */
399 n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
400 if (n == NULL)
401 return NULL;
402 n->v4daddr = daddr;
403 atomic_set(&n->refcnt, 1);
404 n->ip_id_count = secure_ip_id(daddr);
405 n->tcp_ts_stamp = 0;
406
407 write_lock_bh(&peer_pool_lock);
408 /* Check if an entry has suddenly appeared. */
409 p = lookup(daddr);
410 if (p != peer_avl_empty)
411 goto out_free;
412
413 /* Link the node. */
414 link_to_pool(n);
415 n->unused_prevp = NULL; /* not on the list */
416 peer_total++;
417 write_unlock_bh(&peer_pool_lock);
418
419 if (peer_total >= inet_peer_threshold)
420 /* Remove one less-recently-used entry. */
421 cleanup_once(0);
422
423 return n;
424
425out_free:
426 /* The appropriate node is already in the pool. */
427 atomic_inc(&p->refcnt);
428 write_unlock_bh(&peer_pool_lock);
429 /* Remove the entry from unused list if it was there. */
430 unlink_from_unused(p);
431	/* Free the preallocated node. */
432 kmem_cache_free(peer_cachep, n);
433 return p;
434}
435
436/* Called with local BH disabled. */
437static void peer_check_expire(unsigned long dummy)
438{
439 int i;
440 int ttl;
441
442 if (peer_total >= inet_peer_threshold)
443 ttl = inet_peer_minttl;
444 else
445 ttl = inet_peer_maxttl
446 - (inet_peer_maxttl - inet_peer_minttl) / HZ *
447 peer_total / inet_peer_threshold * HZ;
448 for (i = 0; i < PEER_MAX_CLEANUP_WORK && !cleanup_once(ttl); i++);
449
450 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
451 * interval depending on the total number of entries (more entries,
452 * less interval). */
453 peer_periodic_timer.expires = jiffies
454 + inet_peer_gc_maxtime
455 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
456 peer_total / inet_peer_threshold * HZ;
457 add_timer(&peer_periodic_timer);
458}
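Both expressions are linear interpolations between the min and max values, written so that the integer division by HZ happens first and the result is scaled back to jiffies at the end. For example, with the default threshold of 65664 and peer_total = 32832 (pool half full), the eviction TTL works out to 600 - (480 * 32832 / 65664) = 360 seconds, and the next garbage-collection run is scheduled 120 - (110 * 32832 / 65664) = 65 seconds ahead; an empty pool uses the full 600 s / 120 s, a full one drops to 120 s / 10 s.
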
459
460EXPORT_SYMBOL(inet_peer_idlock);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
new file mode 100644
index 000000000000..77094aac6c28
--- /dev/null
+++ b/net/ipv4/ip_forward.c
@@ -0,0 +1,127 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The IP forwarding functionality.
7 *
8 * Version: $Id: ip_forward.c,v 1.48 2000/12/13 18:31:48 davem Exp $
9 *
10 * Authors: see ip.c
11 *
12 * Fixes:
13 * Many : Split from ip.c , see ip_input.c for
14 * history.
15 * Dave Gregorich : NULL ip_rt_put fix for multicast
16 * routing.
17 * Jos Vos : Add call_out_firewall before sending,
18 * use output device for accounting.
19 * Jos Vos : Call forward firewall after routing
20 * (always use output device).
21 * Mike McLagan : Routing by source
22 */
23
24#include <linux/config.h>
25#include <linux/types.h>
26#include <linux/mm.h>
27#include <linux/sched.h>
28#include <linux/skbuff.h>
29#include <linux/ip.h>
30#include <linux/icmp.h>
31#include <linux/netdevice.h>
32#include <net/sock.h>
33#include <net/ip.h>
34#include <net/tcp.h>
35#include <net/udp.h>
36#include <net/icmp.h>
37#include <linux/tcp.h>
38#include <linux/udp.h>
39#include <linux/netfilter_ipv4.h>
40#include <net/checksum.h>
41#include <linux/route.h>
42#include <net/route.h>
43#include <net/xfrm.h>
44
45static inline int ip_forward_finish(struct sk_buff *skb)
46{
47 struct ip_options * opt = &(IPCB(skb)->opt);
48
49 IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
50
51 if (unlikely(opt->optlen))
52 ip_forward_options(skb);
53
54 return dst_output(skb);
55}
56
57int ip_forward(struct sk_buff *skb)
58{
59 struct iphdr *iph; /* Our header */
60 struct rtable *rt; /* Route we use */
61 struct ip_options * opt = &(IPCB(skb)->opt);
62
63 if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
64 goto drop;
65
66 if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
67 return NET_RX_SUCCESS;
68
69 if (skb->pkt_type != PACKET_HOST)
70 goto drop;
71
72 skb->ip_summed = CHECKSUM_NONE;
73
74 /*
75 * According to the RFC, we must first decrease the TTL field. If
 76	 * that reaches zero, we must reply with an ICMP control message telling
77 * that the packet's lifetime expired.
78 */
79
80 iph = skb->nh.iph;
81
82 if (iph->ttl <= 1)
83 goto too_many_hops;
84
85 if (!xfrm4_route_forward(skb))
86 goto drop;
87
88 iph = skb->nh.iph;
89 rt = (struct rtable*)skb->dst;
90
91 if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
92 goto sr_failed;
93
94 /* We are about to mangle packet. Copy it! */
95 if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
96 goto drop;
97 iph = skb->nh.iph;
98
99 /* Decrease ttl after skb cow done */
100 ip_decrease_ttl(iph);
101
102 /*
103 * We now generate an ICMP HOST REDIRECT giving the route
104 * we calculated.
105 */
106 if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr)
107 ip_rt_send_redirect(skb);
108
109 skb->priority = rt_tos2priority(iph->tos);
110
111 return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, rt->u.dst.dev,
112 ip_forward_finish);
113
114sr_failed:
115 /*
116 * Strict routing permits no gatewaying
117 */
118 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
119 goto drop;
120
121too_many_hops:
122 /* Tell the sender its packet died... */
123 icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
124drop:
125 kfree_skb(skb);
126 return NET_RX_DROP;
127}
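The "Decrease ttl after skb cow done" step above works without recomputing the whole header checksum because the checksum can be patched incrementally (RFC 1624): lowering TTL by one lowers the 16-bit word holding TTL and protocol by 0x0100, so the stored one's-complement checksum must rise by the same amount, with the carry folded back in. A hedged sketch of that update, in the spirit of ip_decrease_ttl() from include/net/ip.h (the in-tree helper may differ in detail; the name below is illustrative):

/* Sketch only: incremental TTL decrement with RFC 1624 checksum fix-up. */
static inline void ip_ttl_dec_sketch(struct iphdr *iph)
{
	u32 check = iph->check;

	check += htons(0x0100);			/* header sum dropped by 0x0100, so ~sum rises */
	iph->check = check + (check >= 0xFFFF);	/* fold the end-around carry */
	iph->ttl--;
}
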
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
new file mode 100644
index 000000000000..7f68e27eb4ea
--- /dev/null
+++ b/net/ipv4/ip_fragment.c
@@ -0,0 +1,691 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The IP fragmentation functionality.
7 *
8 * Version: $Id: ip_fragment.c,v 1.59 2002/01/12 07:54:56 davem Exp $
9 *
10 * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox <Alan.Cox@linux.org>
12 *
13 * Fixes:
14 * Alan Cox : Split from ip.c , see ip_input.c for history.
15 * David S. Miller : Begin massive cleanup...
16 * Andi Kleen : Add sysctls.
17 * xxxx : Overlapfrag bug.
18 * Ultima : ip_expire() kernel panic.
19 * Bill Hawes : Frag accounting and evictor fixes.
20 * John McDonald : 0 length frag bug.
21 * Alexey Kuznetsov: SMP races, threading, cleanup.
22 * Patrick McHardy : LRU queue of frag heads for evictor.
23 */
24
25#include <linux/config.h>
26#include <linux/module.h>
27#include <linux/types.h>
28#include <linux/mm.h>
29#include <linux/jiffies.h>
30#include <linux/skbuff.h>
31#include <linux/list.h>
32#include <linux/ip.h>
33#include <linux/icmp.h>
34#include <linux/netdevice.h>
35#include <linux/jhash.h>
36#include <linux/random.h>
37#include <net/sock.h>
38#include <net/ip.h>
39#include <net/icmp.h>
40#include <net/checksum.h>
41#include <linux/tcp.h>
42#include <linux/udp.h>
43#include <linux/inet.h>
44#include <linux/netfilter_ipv4.h>
45
46/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
47 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
48 * as well. Or notify me, at least. --ANK
49 */
50
51/* Fragment cache limits. We will commit 256K at one time. Should we
52 * cross that limit we will prune down to 192K. This should cope with
53 * even the most extreme cases without allowing an attacker to measurably
54 * harm machine performance.
55 */
56int sysctl_ipfrag_high_thresh = 256*1024;
57int sysctl_ipfrag_low_thresh = 192*1024;
58
59/* Important NOTE! Fragment queue must be destroyed before MSL expires.
 60 * RFC 791 is wrong in proposing to prolong the timer by the TTL on each fragment arrival.
61 */
62int sysctl_ipfrag_time = IP_FRAG_TIME;
63
64struct ipfrag_skb_cb
65{
66 struct inet_skb_parm h;
67 int offset;
68};
69
70#define FRAG_CB(skb) ((struct ipfrag_skb_cb*)((skb)->cb))
71
72/* Describe an entry in the "incomplete datagrams" queue. */
73struct ipq {
74 struct ipq *next; /* linked list pointers */
75 struct list_head lru_list; /* lru list member */
76 u32 user;
77 u32 saddr;
78 u32 daddr;
79 u16 id;
80 u8 protocol;
81 u8 last_in;
82#define COMPLETE 4
83#define FIRST_IN 2
84#define LAST_IN 1
85
86 struct sk_buff *fragments; /* linked list of received fragments */
87 int len; /* total length of original datagram */
88 int meat;
89 spinlock_t lock;
90 atomic_t refcnt;
91 struct timer_list timer; /* when will this queue expire? */
92 struct ipq **pprev;
93 int iif;
94 struct timeval stamp;
95};
96
97/* Hash table. */
98
99#define IPQ_HASHSZ 64
100
101/* Per-bucket lock is easy to add now. */
102static struct ipq *ipq_hash[IPQ_HASHSZ];
103static DEFINE_RWLOCK(ipfrag_lock);
104static u32 ipfrag_hash_rnd;
105static LIST_HEAD(ipq_lru_list);
106int ip_frag_nqueues = 0;
107
108static __inline__ void __ipq_unlink(struct ipq *qp)
109{
110 if(qp->next)
111 qp->next->pprev = qp->pprev;
112 *qp->pprev = qp->next;
113 list_del(&qp->lru_list);
114 ip_frag_nqueues--;
115}
116
117static __inline__ void ipq_unlink(struct ipq *ipq)
118{
119 write_lock(&ipfrag_lock);
120 __ipq_unlink(ipq);
121 write_unlock(&ipfrag_lock);
122}
123
124static unsigned int ipqhashfn(u16 id, u32 saddr, u32 daddr, u8 prot)
125{
126 return jhash_3words((u32)id << 16 | prot, saddr, daddr,
127 ipfrag_hash_rnd) & (IPQ_HASHSZ - 1);
128}
129
130static struct timer_list ipfrag_secret_timer;
131int sysctl_ipfrag_secret_interval = 10 * 60 * HZ;
132
133static void ipfrag_secret_rebuild(unsigned long dummy)
134{
135 unsigned long now = jiffies;
136 int i;
137
138 write_lock(&ipfrag_lock);
139 get_random_bytes(&ipfrag_hash_rnd, sizeof(u32));
140 for (i = 0; i < IPQ_HASHSZ; i++) {
141 struct ipq *q;
142
143 q = ipq_hash[i];
144 while (q) {
145 struct ipq *next = q->next;
146 unsigned int hval = ipqhashfn(q->id, q->saddr,
147 q->daddr, q->protocol);
148
149 if (hval != i) {
150 /* Unlink. */
151 if (q->next)
152 q->next->pprev = q->pprev;
153 *q->pprev = q->next;
154
155 /* Relink to new hash chain. */
156 if ((q->next = ipq_hash[hval]) != NULL)
157 q->next->pprev = &q->next;
158 ipq_hash[hval] = q;
159 q->pprev = &ipq_hash[hval];
160 }
161
162 q = next;
163 }
164 }
165 write_unlock(&ipfrag_lock);
166
167 mod_timer(&ipfrag_secret_timer, now + sysctl_ipfrag_secret_interval);
168}
169
170atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */
171
172/* Memory Tracking Functions. */
173static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work)
174{
175 if (work)
176 *work -= skb->truesize;
177 atomic_sub(skb->truesize, &ip_frag_mem);
178 kfree_skb(skb);
179}
180
181static __inline__ void frag_free_queue(struct ipq *qp, int *work)
182{
183 if (work)
184 *work -= sizeof(struct ipq);
185 atomic_sub(sizeof(struct ipq), &ip_frag_mem);
186 kfree(qp);
187}
188
189static __inline__ struct ipq *frag_alloc_queue(void)
190{
191 struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC);
192
193 if(!qp)
194 return NULL;
195 atomic_add(sizeof(struct ipq), &ip_frag_mem);
196 return qp;
197}
198
199
200/* Destruction primitives. */
201
202/* Complete destruction of ipq. */
203static void ip_frag_destroy(struct ipq *qp, int *work)
204{
205 struct sk_buff *fp;
206
207 BUG_TRAP(qp->last_in&COMPLETE);
208 BUG_TRAP(del_timer(&qp->timer) == 0);
209
210 /* Release all fragment data. */
211 fp = qp->fragments;
212 while (fp) {
213 struct sk_buff *xp = fp->next;
214
215 frag_kfree_skb(fp, work);
216 fp = xp;
217 }
218
219 /* Finally, release the queue descriptor itself. */
220 frag_free_queue(qp, work);
221}
222
223static __inline__ void ipq_put(struct ipq *ipq, int *work)
224{
225 if (atomic_dec_and_test(&ipq->refcnt))
226 ip_frag_destroy(ipq, work);
227}
228
229/* Kill ipq entry. It is not destroyed immediately,
230 * because caller (and someone more) holds reference count.
231 */
232static void ipq_kill(struct ipq *ipq)
233{
234 if (del_timer(&ipq->timer))
235 atomic_dec(&ipq->refcnt);
236
237 if (!(ipq->last_in & COMPLETE)) {
238 ipq_unlink(ipq);
239 atomic_dec(&ipq->refcnt);
240 ipq->last_in |= COMPLETE;
241 }
242}
243
244/* Memory limiting on fragments. Evictor trashes the oldest
245 * fragment queue until we are back under the threshold.
246 */
247static void ip_evictor(void)
248{
249 struct ipq *qp;
250 struct list_head *tmp;
251 int work;
252
253 work = atomic_read(&ip_frag_mem) - sysctl_ipfrag_low_thresh;
254 if (work <= 0)
255 return;
256
257 while (work > 0) {
258 read_lock(&ipfrag_lock);
259 if (list_empty(&ipq_lru_list)) {
260 read_unlock(&ipfrag_lock);
261 return;
262 }
263 tmp = ipq_lru_list.next;
264 qp = list_entry(tmp, struct ipq, lru_list);
265 atomic_inc(&qp->refcnt);
266 read_unlock(&ipfrag_lock);
267
268 spin_lock(&qp->lock);
269 if (!(qp->last_in&COMPLETE))
270 ipq_kill(qp);
271 spin_unlock(&qp->lock);
272
273 ipq_put(qp, &work);
274 IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
275 }
276}
277
278/*
279 * Oops, a fragment queue timed out. Kill it and send an ICMP reply.
280 */
281static void ip_expire(unsigned long arg)
282{
283 struct ipq *qp = (struct ipq *) arg;
284
285 spin_lock(&qp->lock);
286
287 if (qp->last_in & COMPLETE)
288 goto out;
289
290 ipq_kill(qp);
291
292 IP_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT);
293 IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
294
295 if ((qp->last_in&FIRST_IN) && qp->fragments != NULL) {
296 struct sk_buff *head = qp->fragments;
297 /* Send an ICMP "Fragment Reassembly Timeout" message. */
298 if ((head->dev = dev_get_by_index(qp->iif)) != NULL) {
299 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
300 dev_put(head->dev);
301 }
302 }
303out:
304 spin_unlock(&qp->lock);
305 ipq_put(qp, NULL);
306}
307
308/* Creation primitives. */
309
310static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in)
311{
312 struct ipq *qp;
313
314 write_lock(&ipfrag_lock);
315#ifdef CONFIG_SMP
316	/* With SMP we have to recheck the hash table, because such an
317	 * entry could have been created on another cpu while we switched
318	 * from the read lock to the write lock.
319 */
320 for(qp = ipq_hash[hash]; qp; qp = qp->next) {
321 if(qp->id == qp_in->id &&
322 qp->saddr == qp_in->saddr &&
323 qp->daddr == qp_in->daddr &&
324 qp->protocol == qp_in->protocol &&
325 qp->user == qp_in->user) {
326 atomic_inc(&qp->refcnt);
327 write_unlock(&ipfrag_lock);
328 qp_in->last_in |= COMPLETE;
329 ipq_put(qp_in, NULL);
330 return qp;
331 }
332 }
333#endif
334 qp = qp_in;
335
336 if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time))
337 atomic_inc(&qp->refcnt);
338
339 atomic_inc(&qp->refcnt);
340 if((qp->next = ipq_hash[hash]) != NULL)
341 qp->next->pprev = &qp->next;
342 ipq_hash[hash] = qp;
343 qp->pprev = &ipq_hash[hash];
344 INIT_LIST_HEAD(&qp->lru_list);
345 list_add_tail(&qp->lru_list, &ipq_lru_list);
346 ip_frag_nqueues++;
347 write_unlock(&ipfrag_lock);
348 return qp;
349}
350
351/* Add an entry to the 'ipq' queue for a newly received IP datagram. */
352static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
353{
354 struct ipq *qp;
355
356 if ((qp = frag_alloc_queue()) == NULL)
357 goto out_nomem;
358
359 qp->protocol = iph->protocol;
360 qp->last_in = 0;
361 qp->id = iph->id;
362 qp->saddr = iph->saddr;
363 qp->daddr = iph->daddr;
364 qp->user = user;
365 qp->len = 0;
366 qp->meat = 0;
367 qp->fragments = NULL;
368 qp->iif = 0;
369
370 /* Initialize a timer for this entry. */
371 init_timer(&qp->timer);
372 qp->timer.data = (unsigned long) qp; /* pointer to queue */
373 qp->timer.function = ip_expire; /* expire function */
374 spin_lock_init(&qp->lock);
375 atomic_set(&qp->refcnt, 1);
376
377 return ip_frag_intern(hash, qp);
378
379out_nomem:
380 NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n"));
381 return NULL;
382}
383
384/* Find the correct entry in the "incomplete datagrams" queue for
385 * this IP datagram, and create a new one if nothing is found.
386 */
387static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
388{
389 __u16 id = iph->id;
390 __u32 saddr = iph->saddr;
391 __u32 daddr = iph->daddr;
392 __u8 protocol = iph->protocol;
393 unsigned int hash = ipqhashfn(id, saddr, daddr, protocol);
394 struct ipq *qp;
395
396 read_lock(&ipfrag_lock);
397 for(qp = ipq_hash[hash]; qp; qp = qp->next) {
398 if(qp->id == id &&
399 qp->saddr == saddr &&
400 qp->daddr == daddr &&
401 qp->protocol == protocol &&
402 qp->user == user) {
403 atomic_inc(&qp->refcnt);
404 read_unlock(&ipfrag_lock);
405 return qp;
406 }
407 }
408 read_unlock(&ipfrag_lock);
409
410 return ip_frag_create(hash, iph, user);
411}
412
413/* Add new segment to existing queue. */
414static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
415{
416 struct sk_buff *prev, *next;
417 int flags, offset;
418 int ihl, end;
419
420 if (qp->last_in & COMPLETE)
421 goto err;
422
423 offset = ntohs(skb->nh.iph->frag_off);
424 flags = offset & ~IP_OFFSET;
425 offset &= IP_OFFSET;
426 offset <<= 3; /* offset is in 8-byte chunks */
427 ihl = skb->nh.iph->ihl * 4;
428
429 /* Determine the position of this fragment. */
430 end = offset + skb->len - ihl;
431
432 /* Is this the final fragment? */
433 if ((flags & IP_MF) == 0) {
434 /* If we already have some bits beyond end
435 * or have different end, the segment is corrrupted.
436 */
437 if (end < qp->len ||
438 ((qp->last_in & LAST_IN) && end != qp->len))
439 goto err;
440 qp->last_in |= LAST_IN;
441 qp->len = end;
442 } else {
443 if (end&7) {
444 end &= ~7;
445 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
446 skb->ip_summed = CHECKSUM_NONE;
447 }
448 if (end > qp->len) {
449 /* Some bits beyond end -> corruption. */
450 if (qp->last_in & LAST_IN)
451 goto err;
452 qp->len = end;
453 }
454 }
455 if (end == offset)
456 goto err;
457
458 if (pskb_pull(skb, ihl) == NULL)
459 goto err;
460 if (pskb_trim(skb, end-offset))
461 goto err;
462
463 /* Find out which fragments are in front and at the back of us
464 * in the chain of fragments so far. We must know where to put
465 * this fragment, right?
466 */
467 prev = NULL;
468 for(next = qp->fragments; next != NULL; next = next->next) {
469 if (FRAG_CB(next)->offset >= offset)
470 break; /* bingo! */
471 prev = next;
472 }
473
474 /* We found where to put this one. Check for overlap with
475 * preceding fragment, and, if needed, align things so that
476 * any overlaps are eliminated.
477 */
478 if (prev) {
479 int i = (FRAG_CB(prev)->offset + prev->len) - offset;
480
481 if (i > 0) {
482 offset += i;
483 if (end <= offset)
484 goto err;
485 if (!pskb_pull(skb, i))
486 goto err;
487 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
488 skb->ip_summed = CHECKSUM_NONE;
489 }
490 }
491
492 while (next && FRAG_CB(next)->offset < end) {
493 int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
494
495 if (i < next->len) {
496 /* Eat head of the next overlapped fragment
497 * and leave the loop. The next ones cannot overlap.
498 */
499 if (!pskb_pull(next, i))
500 goto err;
501 FRAG_CB(next)->offset += i;
502 qp->meat -= i;
503 if (next->ip_summed != CHECKSUM_UNNECESSARY)
504 next->ip_summed = CHECKSUM_NONE;
505 break;
506 } else {
507 struct sk_buff *free_it = next;
508
509			/* Old fragment is completely overridden by the
510			 * new one; drop it.
511 */
512 next = next->next;
513
514 if (prev)
515 prev->next = next;
516 else
517 qp->fragments = next;
518
519 qp->meat -= free_it->len;
520 frag_kfree_skb(free_it, NULL);
521 }
522 }
523
524 FRAG_CB(skb)->offset = offset;
525
526 /* Insert this fragment in the chain of fragments. */
527 skb->next = next;
528 if (prev)
529 prev->next = skb;
530 else
531 qp->fragments = skb;
532
533 if (skb->dev)
534 qp->iif = skb->dev->ifindex;
535 skb->dev = NULL;
536 qp->stamp = skb->stamp;
537 qp->meat += skb->len;
538 atomic_add(skb->truesize, &ip_frag_mem);
539 if (offset == 0)
540 qp->last_in |= FIRST_IN;
541
542 write_lock(&ipfrag_lock);
543 list_move_tail(&qp->lru_list, &ipq_lru_list);
544 write_unlock(&ipfrag_lock);
545
546 return;
547
548err:
549 kfree_skb(skb);
550}
551
552
553/* Build a new IP datagram from all its fragments. */
554
555static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
556{
557 struct iphdr *iph;
558 struct sk_buff *fp, *head = qp->fragments;
559 int len;
560 int ihlen;
561
562 ipq_kill(qp);
563
564 BUG_TRAP(head != NULL);
565 BUG_TRAP(FRAG_CB(head)->offset == 0);
566
567 /* Allocate a new buffer for the datagram. */
568 ihlen = head->nh.iph->ihl*4;
569 len = ihlen + qp->len;
570
571 if(len > 65535)
572 goto out_oversize;
573
574 /* Head of list must not be cloned. */
575 if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
576 goto out_nomem;
577
578 /* If the first fragment is fragmented itself, we split
579	 * it into two chunks: the first with the data and paged part
580	 * and the second holding only fragments. */
581 if (skb_shinfo(head)->frag_list) {
582 struct sk_buff *clone;
583 int i, plen = 0;
584
585 if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
586 goto out_nomem;
587 clone->next = head->next;
588 head->next = clone;
589 skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
590 skb_shinfo(head)->frag_list = NULL;
591 for (i=0; i<skb_shinfo(head)->nr_frags; i++)
592 plen += skb_shinfo(head)->frags[i].size;
593 clone->len = clone->data_len = head->data_len - plen;
594 head->data_len -= clone->len;
595 head->len -= clone->len;
596 clone->csum = 0;
597 clone->ip_summed = head->ip_summed;
598 atomic_add(clone->truesize, &ip_frag_mem);
599 }
600
601 skb_shinfo(head)->frag_list = head->next;
602 skb_push(head, head->data - head->nh.raw);
603 atomic_sub(head->truesize, &ip_frag_mem);
604
605 for (fp=head->next; fp; fp = fp->next) {
606 head->data_len += fp->len;
607 head->len += fp->len;
608 if (head->ip_summed != fp->ip_summed)
609 head->ip_summed = CHECKSUM_NONE;
610 else if (head->ip_summed == CHECKSUM_HW)
611 head->csum = csum_add(head->csum, fp->csum);
612 head->truesize += fp->truesize;
613 atomic_sub(fp->truesize, &ip_frag_mem);
614 }
615
616 head->next = NULL;
617 head->dev = dev;
618 head->stamp = qp->stamp;
619
620 iph = head->nh.iph;
621 iph->frag_off = 0;
622 iph->tot_len = htons(len);
623 IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS);
624 qp->fragments = NULL;
625 return head;
626
627out_nomem:
628 NETDEBUG(if (net_ratelimit())
629 printk(KERN_ERR
630 "IP: queue_glue: no memory for gluing queue %p\n",
631 qp));
632 goto out_fail;
633out_oversize:
634 if (net_ratelimit())
635 printk(KERN_INFO
636 "Oversized IP packet from %d.%d.%d.%d.\n",
637 NIPQUAD(qp->saddr));
638out_fail:
639 IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
640 return NULL;
641}
642
643/* Process an incoming IP datagram fragment. */
644struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user)
645{
646 struct iphdr *iph = skb->nh.iph;
647 struct ipq *qp;
648 struct net_device *dev;
649
650 IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS);
651
652 /* Start by cleaning up the memory. */
653 if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh)
654 ip_evictor();
655
656 dev = skb->dev;
657
658 /* Lookup (or create) queue header */
659 if ((qp = ip_find(iph, user)) != NULL) {
660 struct sk_buff *ret = NULL;
661
662 spin_lock(&qp->lock);
663
664 ip_frag_queue(qp, skb);
665
666 if (qp->last_in == (FIRST_IN|LAST_IN) &&
667 qp->meat == qp->len)
668 ret = ip_frag_reasm(qp, dev);
669
670 spin_unlock(&qp->lock);
671 ipq_put(qp, NULL);
672 return ret;
673 }
674
675 IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
676 kfree_skb(skb);
677 return NULL;
678}
679
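A quick usage note: callers hand every fragment to ip_defrag() and only keep processing when it returns a reassembled skb; a NULL return means the fragment was queued (or dropped) and there is nothing further to do. A hedged sketch of a typical call site, modelled on ip_local_deliver() in ip_input.c (the IP_DEFRAG_LOCAL_DELIVER tag stands in for the `user` argument above and is assumed here):

	/* Sketch of a caller: reassemble before local delivery. */
	if (skb->nh.iph->frag_off & htons(IP_MF | IP_OFFSET)) {
		skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER);
		if (!skb)
			return 0;	/* queued or dropped; reassembly not finished */
	}
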
680void ipfrag_init(void)
681{
682 ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
683 (jiffies ^ (jiffies >> 6)));
684
685 init_timer(&ipfrag_secret_timer);
686 ipfrag_secret_timer.function = ipfrag_secret_rebuild;
687 ipfrag_secret_timer.expires = jiffies + sysctl_ipfrag_secret_interval;
688 add_timer(&ipfrag_secret_timer);
689}
690
691EXPORT_SYMBOL(ip_defrag);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
new file mode 100644
index 000000000000..884835522224
--- /dev/null
+++ b/net/ipv4/ip_gre.c
@@ -0,0 +1,1290 @@
1/*
2 * Linux NET3: GRE over IP protocol decoder.
3 *
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
13#include <linux/config.h>
14#include <linux/module.h>
15#include <linux/types.h>
16#include <linux/sched.h>
17#include <linux/kernel.h>
18#include <asm/uaccess.h>
19#include <linux/skbuff.h>
20#include <linux/netdevice.h>
21#include <linux/in.h>
22#include <linux/tcp.h>
23#include <linux/udp.h>
24#include <linux/if_arp.h>
25#include <linux/mroute.h>
26#include <linux/init.h>
27#include <linux/in6.h>
28#include <linux/inetdevice.h>
29#include <linux/igmp.h>
30#include <linux/netfilter_ipv4.h>
31
32#include <net/sock.h>
33#include <net/ip.h>
34#include <net/icmp.h>
35#include <net/protocol.h>
36#include <net/ipip.h>
37#include <net/arp.h>
38#include <net/checksum.h>
39#include <net/dsfield.h>
40#include <net/inet_ecn.h>
41#include <net/xfrm.h>
42
43#ifdef CONFIG_IPV6
44#include <net/ipv6.h>
45#include <net/ip6_fib.h>
46#include <net/ip6_route.h>
47#endif
48
49/*
50 Problems & solutions
51 --------------------
52
53 1. The most important issue is detecting local dead loops.
54 They would cause complete host lockup in transmit, which
55 would be "resolved" by stack overflow or, if queueing is enabled,
56 with infinite looping in net_bh.
57
 58	 We cannot track such dead loops during route installation,
 59	 it is an infeasible task. The most general solution would be
 60	 to keep an skb->encapsulation counter (a sort of local ttl),
 61	 and silently drop the packet when it expires. It is the best
 62	 solution, but it supposes maintaining a new variable in ALL
 63	 skbs, even if no tunneling is used.
64
 65	 Current solution: t->recursion lock breaks dead loops. It looks
 66	 like the dev->tbusy flag, but I preferred a new variable, because
 67	 the semantics are different. One day, when hard_start_xmit
 68	 becomes multithreaded, we will have to use skb->encapsulation.
69
70
71
 72	 2. Networking dead loops would not kill routers, but would really
 73	 kill the network. The IP hop limit plays the role of "t->recursion" in this
 74	 case, if we copy it from the packet being encapsulated to the upper header.
 75	 It is a very good solution, but it introduces two problems:
76
77 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
78 do not work over tunnels.
 79	 - traceroute does not work. I planned to relay ICMP from the tunnel,
 80	 so that this problem would be solved and traceroute output
 81	 would be even more informative. This idea appeared to be wrong:
 82	 only Linux complies with rfc1812 now (yes, guys, Linux is the only
 83	 true router now :-)), all routers (at least, in my neighbourhood)
 84	 return only 8 bytes of payload. It is the end.
85
 86	 Hence, if we want OSPF to work or traceroute to say something reasonable,
 87	 we should search for another solution.
 88
 89	 One of them is to parse the packet, trying to detect inner encapsulation
 90	 made by our node. It is difficult or even impossible, especially
 91	 taking fragmentation into account. In short, it is not a solution at all.
92
93 Current solution: The solution was UNEXPECTEDLY SIMPLE.
94 We force DF flag on tunnels with preconfigured hop limit,
 95	 that is ALL. :-) Well, it does not remove the problem completely,
 96	 but the exponential growth of network traffic is changed to linear
 97	 (branches that exceed the pmtu are pruned) and the tunnel mtu
 98	 quickly degrades to a value <68, where looping stops.
 99	 Yes, it is not good if there is a router in the loop
100	 which does not force DF, even when encapsulating packets have DF set.
101	 But it is not our problem! Nobody could accuse us, we did
102	 all that we could do. Even if it was your gated that injected a
103	 fatal route into the network, even if it was you who configured a
104	 fatal static route: you are innocent. :-)
105
106
107
108	 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
109	 practically identical code. It would be good to glue them
110	 together, but it is not very evident how to make them modular.
111	 sit is an integral part of IPv6, while ipip and gre are naturally modular.
112	 We could extract the common parts (hash table, ioctl etc.)
113	 to a separate module (ip_tunnel.c).
114
115 Alexey Kuznetsov.
116 */
117
118static int ipgre_tunnel_init(struct net_device *dev);
119static void ipgre_tunnel_setup(struct net_device *dev);
120
121/* Fallback tunnel: no source, no destination, no key, no options */
122
123static int ipgre_fb_tunnel_init(struct net_device *dev);
124
125static struct net_device *ipgre_fb_tunnel_dev;
126
127/* Tunnel hash table */
128
129/*
130 4 hash tables:
131
132 3: (remote,local)
133 2: (remote,*)
134 1: (*,local)
135 0: (*,*)
136
137	 We require an exact key match, i.e. if a key is present in the packet
138	 it will match only a tunnel with the same key; if it is not present,
139	 it will match only a keyless tunnel.
140
141	 All keyless packets that do not match a configured keyless tunnel
142	 will match the fallback tunnel.
143 */
144
145#define HASH_SIZE 16
146#define HASH(addr) ((addr^(addr>>4))&0xF)
147
148static struct ip_tunnel *tunnels[4][HASH_SIZE];
149
150#define tunnels_r_l (tunnels[3])
151#define tunnels_r (tunnels[2])
152#define tunnels_l (tunnels[1])
153#define tunnels_wc (tunnels[0])
154
155static DEFINE_RWLOCK(ipgre_lock);
156
157/* Given src, dst and key, find the appropriate tunnel for input. */
158
159static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key)
160{
161 unsigned h0 = HASH(remote);
162 unsigned h1 = HASH(key);
163 struct ip_tunnel *t;
164
165 for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
166 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
167 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
168 return t;
169 }
170 }
171 for (t = tunnels_r[h0^h1]; t; t = t->next) {
172 if (remote == t->parms.iph.daddr) {
173 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
174 return t;
175 }
176 }
177 for (t = tunnels_l[h1]; t; t = t->next) {
178 if (local == t->parms.iph.saddr ||
179 (local == t->parms.iph.daddr && MULTICAST(local))) {
180 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
181 return t;
182 }
183 }
184 for (t = tunnels_wc[h1]; t; t = t->next) {
185 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
186 return t;
187 }
188
189 if (ipgre_fb_tunnel_dev->flags&IFF_UP)
190 return ipgre_fb_tunnel_dev->priv;
191 return NULL;
192}
193
194static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
195{
196 u32 remote = t->parms.iph.daddr;
197 u32 local = t->parms.iph.saddr;
198 u32 key = t->parms.i_key;
199 unsigned h = HASH(key);
200 int prio = 0;
201
202 if (local)
203 prio |= 1;
204 if (remote && !MULTICAST(remote)) {
205 prio |= 2;
206 h ^= HASH(remote);
207 }
208
209 return &tunnels[prio][h];
210}
211
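Concretely, ipgre_bucket() sends a tunnel with both endpoints configured and no key to tunnels[3] (prio 1|2); since HASH(0) is 0, its bucket is simply HASH(remote). A tunnel with only a key and wildcard addresses stays in tunnels[0] at bucket HASH(key). This matches the lookup order used by ipgre_tunnel_lookup() above, which tries the most specific table first.
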
212static void ipgre_tunnel_link(struct ip_tunnel *t)
213{
214 struct ip_tunnel **tp = ipgre_bucket(t);
215
216 t->next = *tp;
217 write_lock_bh(&ipgre_lock);
218 *tp = t;
219 write_unlock_bh(&ipgre_lock);
220}
221
222static void ipgre_tunnel_unlink(struct ip_tunnel *t)
223{
224 struct ip_tunnel **tp;
225
226 for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
227 if (t == *tp) {
228 write_lock_bh(&ipgre_lock);
229 *tp = t->next;
230 write_unlock_bh(&ipgre_lock);
231 break;
232 }
233 }
234}
235
236static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
237{
238 u32 remote = parms->iph.daddr;
239 u32 local = parms->iph.saddr;
240 u32 key = parms->i_key;
241 struct ip_tunnel *t, **tp, *nt;
242 struct net_device *dev;
243 unsigned h = HASH(key);
244 int prio = 0;
245 char name[IFNAMSIZ];
246
247 if (local)
248 prio |= 1;
249 if (remote && !MULTICAST(remote)) {
250 prio |= 2;
251 h ^= HASH(remote);
252 }
253 for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
254 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
255 if (key == t->parms.i_key)
256 return t;
257 }
258 }
259 if (!create)
260 return NULL;
261
262 if (parms->name[0])
263 strlcpy(name, parms->name, IFNAMSIZ);
264 else {
265 int i;
266 for (i=1; i<100; i++) {
267 sprintf(name, "gre%d", i);
268 if (__dev_get_by_name(name) == NULL)
269 break;
270 }
271 if (i==100)
272 goto failed;
273 }
274
275 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
276 if (!dev)
277 return NULL;
278
279 dev->init = ipgre_tunnel_init;
280 nt = dev->priv;
281 nt->parms = *parms;
282
283 if (register_netdevice(dev) < 0) {
284 free_netdev(dev);
285 goto failed;
286 }
287
288 nt = dev->priv;
289 nt->parms = *parms;
290
291 dev_hold(dev);
292 ipgre_tunnel_link(nt);
293 /* Do not decrement MOD_USE_COUNT here. */
294 return nt;
295
296failed:
297 return NULL;
298}
299
300static void ipgre_tunnel_uninit(struct net_device *dev)
301{
302 ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv);
303 dev_put(dev);
304}
305
306
307static void ipgre_err(struct sk_buff *skb, u32 info)
308{
309#ifndef I_WISH_WORLD_WERE_PERFECT
310
311/* It is not :-( All the routers (except for Linux) return only
312   8 bytes of packet payload. It means that precise relaying of
313   ICMP in the real Internet is absolutely infeasible.
314
315   Moreover, Cisco "wise men" put the GRE key in the third word
316   of the GRE header. That makes it impossible to maintain even soft state
317   for keyed GRE tunnels with checksum enabled. Tell them "thank you".
318
319   Well, I wonder, rfc1812 was written by a Cisco employee, so what the
320   hell are these idiots doing breaking standards established
321   by themselves???
322 */
323
324 struct iphdr *iph = (struct iphdr*)skb->data;
325 u16 *p = (u16*)(skb->data+(iph->ihl<<2));
326 int grehlen = (iph->ihl<<2) + 4;
327 int type = skb->h.icmph->type;
328 int code = skb->h.icmph->code;
329 struct ip_tunnel *t;
330 u16 flags;
331
332 flags = p[0];
333 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
334 if (flags&(GRE_VERSION|GRE_ROUTING))
335 return;
336 if (flags&GRE_KEY) {
337 grehlen += 4;
338 if (flags&GRE_CSUM)
339 grehlen += 4;
340 }
341 }
342
343	/* If only 8 bytes are returned, a keyed message will be dropped here */
344 if (skb_headlen(skb) < grehlen)
345 return;
346
347 switch (type) {
348 default:
349 case ICMP_PARAMETERPROB:
350 return;
351
352 case ICMP_DEST_UNREACH:
353 switch (code) {
354 case ICMP_SR_FAILED:
355 case ICMP_PORT_UNREACH:
356 /* Impossible event. */
357 return;
358 case ICMP_FRAG_NEEDED:
359 /* Soft state for pmtu is maintained by IP core. */
360 return;
361 default:
362 /* All others are translated to HOST_UNREACH.
363 rfc2003 contains "deep thoughts" about NET_UNREACH,
364 I believe they are just ether pollution. --ANK
365 */
366 break;
367 }
368 break;
369 case ICMP_TIME_EXCEEDED:
370 if (code != ICMP_EXC_TTL)
371 return;
372 break;
373 }
374
375 read_lock(&ipgre_lock);
376 t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((u32*)p) + (grehlen>>2) - 1) : 0);
377 if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr))
378 goto out;
379
380 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
381 goto out;
382
383 if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
384 t->err_count++;
385 else
386 t->err_count = 1;
387 t->err_time = jiffies;
388out:
389 read_unlock(&ipgre_lock);
390 return;
391#else
392 struct iphdr *iph = (struct iphdr*)dp;
393 struct iphdr *eiph;
394 u16 *p = (u16*)(dp+(iph->ihl<<2));
395 int type = skb->h.icmph->type;
396 int code = skb->h.icmph->code;
397 int rel_type = 0;
398 int rel_code = 0;
399 int rel_info = 0;
400 u16 flags;
401 int grehlen = (iph->ihl<<2) + 4;
402 struct sk_buff *skb2;
403 struct flowi fl;
404 struct rtable *rt;
405
406 if (p[1] != htons(ETH_P_IP))
407 return;
408
409 flags = p[0];
410 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
411 if (flags&(GRE_VERSION|GRE_ROUTING))
412 return;
413 if (flags&GRE_CSUM)
414 grehlen += 4;
415 if (flags&GRE_KEY)
416 grehlen += 4;
417 if (flags&GRE_SEQ)
418 grehlen += 4;
419 }
420 if (len < grehlen + sizeof(struct iphdr))
421 return;
422 eiph = (struct iphdr*)(dp + grehlen);
423
424 switch (type) {
425 default:
426 return;
427 case ICMP_PARAMETERPROB:
428 if (skb->h.icmph->un.gateway < (iph->ihl<<2))
429 return;
430
431		/* So... This guy found something strange INSIDE the encapsulated
432		   packet. Well, he is a fool, but what can we do?
433		 */
434 rel_type = ICMP_PARAMETERPROB;
435 rel_info = skb->h.icmph->un.gateway - grehlen;
436 break;
437
438 case ICMP_DEST_UNREACH:
439 switch (code) {
440 case ICMP_SR_FAILED:
441 case ICMP_PORT_UNREACH:
442 /* Impossible event. */
443 return;
444 case ICMP_FRAG_NEEDED:
445 /* And it is the only really necessary thing :-) */
446 rel_info = ntohs(skb->h.icmph->un.frag.mtu);
447 if (rel_info < grehlen+68)
448 return;
449 rel_info -= grehlen;
450 /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
451 if (rel_info > ntohs(eiph->tot_len))
452 return;
453 break;
454 default:
455 /* All others are translated to HOST_UNREACH.
456 rfc2003 contains "deep thoughts" about NET_UNREACH,
457 I believe, it is just ether pollution. --ANK
458 */
459 rel_type = ICMP_DEST_UNREACH;
460 rel_code = ICMP_HOST_UNREACH;
461 break;
462 }
463 break;
464 case ICMP_TIME_EXCEEDED:
465 if (code != ICMP_EXC_TTL)
466 return;
467 break;
468 }
469
470 /* Prepare fake skb to feed it to icmp_send */
471 skb2 = skb_clone(skb, GFP_ATOMIC);
472 if (skb2 == NULL)
473 return;
474 dst_release(skb2->dst);
475 skb2->dst = NULL;
476 skb_pull(skb2, skb->data - (u8*)eiph);
477 skb2->nh.raw = skb2->data;
478
479 /* Try to guess incoming interface */
480 memset(&fl, 0, sizeof(fl));
481 fl.fl4_dst = eiph->saddr;
482 fl.fl4_tos = RT_TOS(eiph->tos);
483 fl.proto = IPPROTO_GRE;
484 if (ip_route_output_key(&rt, &fl)) {
485 kfree_skb(skb2);
486 return;
487 }
488 skb2->dev = rt->u.dst.dev;
489
490 /* route "incoming" packet */
491 if (rt->rt_flags&RTCF_LOCAL) {
492 ip_rt_put(rt);
493 rt = NULL;
494 fl.fl4_dst = eiph->daddr;
495 fl.fl4_src = eiph->saddr;
496 fl.fl4_tos = eiph->tos;
497 if (ip_route_output_key(&rt, &fl) ||
498 rt->u.dst.dev->type != ARPHRD_IPGRE) {
499 ip_rt_put(rt);
500 kfree_skb(skb2);
501 return;
502 }
503 } else {
504 ip_rt_put(rt);
505 if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
506 skb2->dst->dev->type != ARPHRD_IPGRE) {
507 kfree_skb(skb2);
508 return;
509 }
510 }
511
512 /* change mtu on this route */
513 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
514 if (rel_info > dst_mtu(skb2->dst)) {
515 kfree_skb(skb2);
516 return;
517 }
518 skb2->dst->ops->update_pmtu(skb2->dst, rel_info);
519 rel_info = htonl(rel_info);
520 } else if (type == ICMP_TIME_EXCEEDED) {
521 struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv;
522 if (t->parms.iph.ttl) {
523 rel_type = ICMP_DEST_UNREACH;
524 rel_code = ICMP_HOST_UNREACH;
525 }
526 }
527
528 icmp_send(skb2, rel_type, rel_code, rel_info);
529 kfree_skb(skb2);
530#endif
531}
532
533static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
534{
535 if (INET_ECN_is_ce(iph->tos)) {
536 if (skb->protocol == htons(ETH_P_IP)) {
537 IP_ECN_set_ce(skb->nh.iph);
538 } else if (skb->protocol == htons(ETH_P_IPV6)) {
539 IP6_ECN_set_ce(skb->nh.ipv6h);
540 }
541 }
542}
543
544static inline u8
545ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
546{
547 u8 inner = 0;
548 if (skb->protocol == htons(ETH_P_IP))
549 inner = old_iph->tos;
550 else if (skb->protocol == htons(ETH_P_IPV6))
551 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
552 return INET_ECN_encapsulate(tos, inner);
553}
554
555static int ipgre_rcv(struct sk_buff *skb)
556{
557 struct iphdr *iph;
558 u8 *h;
559 u16 flags;
560 u16 csum = 0;
561 u32 key = 0;
562 u32 seqno = 0;
563 struct ip_tunnel *tunnel;
564 int offset = 4;
565
566 if (!pskb_may_pull(skb, 16))
567 goto drop_nolock;
568
569 iph = skb->nh.iph;
570 h = skb->data;
571 flags = *(u16*)h;
572
573 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
574 /* - Version must be 0.
575 - We do not support routing headers.
576 */
577 if (flags&(GRE_VERSION|GRE_ROUTING))
578 goto drop_nolock;
579
580 if (flags&GRE_CSUM) {
581 if (skb->ip_summed == CHECKSUM_HW) {
582 csum = (u16)csum_fold(skb->csum);
583 if (csum)
584 skb->ip_summed = CHECKSUM_NONE;
585 }
586 if (skb->ip_summed == CHECKSUM_NONE) {
587 skb->csum = skb_checksum(skb, 0, skb->len, 0);
588 skb->ip_summed = CHECKSUM_HW;
589 csum = (u16)csum_fold(skb->csum);
590 }
591 offset += 4;
592 }
593 if (flags&GRE_KEY) {
594 key = *(u32*)(h + offset);
595 offset += 4;
596 }
597 if (flags&GRE_SEQ) {
598 seqno = ntohl(*(u32*)(h + offset));
599 offset += 4;
600 }
601 }
602
603 read_lock(&ipgre_lock);
604 if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {
605 secpath_reset(skb);
606
607 skb->protocol = *(u16*)(h + 2);
608		/* WCCP version 1 and 2 protocol decoding.
609		 * - Change the protocol to IP
610		 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
611		 */
612 if (flags == 0 &&
613 skb->protocol == __constant_htons(ETH_P_WCCP)) {
614 skb->protocol = __constant_htons(ETH_P_IP);
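			/* A first payload byte whose high nibble is not 4
			   cannot be an IPv4 version/ihl octet, so it is
			   assumed to be the extra 4-byte WCCPv2 redirect
			   header and skipped. */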
615 if ((*(h + offset) & 0xF0) != 0x40)
616 offset += 4;
617 }
618
619 skb->mac.raw = skb->nh.raw;
620 skb->nh.raw = __pskb_pull(skb, offset);
621 skb_postpull_rcsum(skb, skb->mac.raw, offset);
622 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
623 skb->pkt_type = PACKET_HOST;
624#ifdef CONFIG_NET_IPGRE_BROADCAST
625 if (MULTICAST(iph->daddr)) {
626 /* Looped back packet, drop it! */
627 if (((struct rtable*)skb->dst)->fl.iif == 0)
628 goto drop;
629 tunnel->stat.multicast++;
630 skb->pkt_type = PACKET_BROADCAST;
631 }
632#endif
633
634 if (((flags&GRE_CSUM) && csum) ||
635 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
636 tunnel->stat.rx_crc_errors++;
637 tunnel->stat.rx_errors++;
638 goto drop;
639 }
640 if (tunnel->parms.i_flags&GRE_SEQ) {
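			/* Wraparound-safe signed comparison: a datagram whose
			   sequence number is at or behind the last one accepted
			   is treated as out of order and dropped. */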
641 if (!(flags&GRE_SEQ) ||
642 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
643 tunnel->stat.rx_fifo_errors++;
644 tunnel->stat.rx_errors++;
645 goto drop;
646 }
647 tunnel->i_seqno = seqno + 1;
648 }
649 tunnel->stat.rx_packets++;
650 tunnel->stat.rx_bytes += skb->len;
651 skb->dev = tunnel->dev;
652 dst_release(skb->dst);
653 skb->dst = NULL;
654 nf_reset(skb);
655 ipgre_ecn_decapsulate(iph, skb);
656 netif_rx(skb);
657 read_unlock(&ipgre_lock);
658 return(0);
659 }
660 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
661
662drop:
663 read_unlock(&ipgre_lock);
664drop_nolock:
665 kfree_skb(skb);
666 return(0);
667}
668
669static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
670{
671 struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
672 struct net_device_stats *stats = &tunnel->stat;
673 struct iphdr *old_iph = skb->nh.iph;
674 struct iphdr *tiph;
675 u8 tos;
676 u16 df;
677 struct rtable *rt; /* Route to the other host */
678 struct net_device *tdev; /* Device to other host */
679 struct iphdr *iph; /* Our new IP header */
680 int max_headroom; /* The extra header space needed */
681 int gre_hlen;
682 u32 dst;
683 int mtu;
684
685 if (tunnel->recursion++) {
686 tunnel->stat.collisions++;
687 goto tx_error;
688 }
689
690 if (dev->hard_header) {
691 gre_hlen = 0;
692 tiph = (struct iphdr*)skb->data;
693 } else {
694 gre_hlen = tunnel->hlen;
695 tiph = &tunnel->parms.iph;
696 }
697
698 if ((dst = tiph->daddr) == 0) {
699 /* NBMA tunnel */
700
701 if (skb->dst == NULL) {
702 tunnel->stat.tx_fifo_errors++;
703 goto tx_error;
704 }
705
706 if (skb->protocol == htons(ETH_P_IP)) {
707 rt = (struct rtable*)skb->dst;
708 if ((dst = rt->rt_gateway) == 0)
709 goto tx_error_icmp;
710 }
711#ifdef CONFIG_IPV6
712 else if (skb->protocol == htons(ETH_P_IPV6)) {
713 struct in6_addr *addr6;
714 int addr_type;
715 struct neighbour *neigh = skb->dst->neighbour;
716
717 if (neigh == NULL)
718 goto tx_error;
719
720 addr6 = (struct in6_addr*)&neigh->primary_key;
721 addr_type = ipv6_addr_type(addr6);
722
723 if (addr_type == IPV6_ADDR_ANY) {
724 addr6 = &skb->nh.ipv6h->daddr;
725 addr_type = ipv6_addr_type(addr6);
726 }
727
728 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
729 goto tx_error_icmp;
730
731 dst = addr6->s6_addr32[3];
732 }
733#endif
734 else
735 goto tx_error;
736 }
737
738 tos = tiph->tos;
739 if (tos&1) {
740 if (skb->protocol == htons(ETH_P_IP))
741 tos = old_iph->tos;
742 tos &= ~1;
743 }
744
745 {
746 struct flowi fl = { .oif = tunnel->parms.link,
747 .nl_u = { .ip4_u =
748 { .daddr = dst,
749 .saddr = tiph->saddr,
750 .tos = RT_TOS(tos) } },
751 .proto = IPPROTO_GRE };
752 if (ip_route_output_key(&rt, &fl)) {
753 tunnel->stat.tx_carrier_errors++;
754 goto tx_error;
755 }
756 }
757 tdev = rt->u.dst.dev;
758
759 if (tdev == dev) {
760 ip_rt_put(rt);
761 tunnel->stat.collisions++;
762 goto tx_error;
763 }
764
765 df = tiph->frag_off;
766 if (df)
767 mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
768 else
769 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
770
771 if (skb->dst)
772 skb->dst->ops->update_pmtu(skb->dst, mtu);
773
774 if (skb->protocol == htons(ETH_P_IP)) {
775 df |= (old_iph->frag_off&htons(IP_DF));
776
777 if ((old_iph->frag_off&htons(IP_DF)) &&
778 mtu < ntohs(old_iph->tot_len)) {
779 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
780 ip_rt_put(rt);
781 goto tx_error;
782 }
783 }
784#ifdef CONFIG_IPV6
785 else if (skb->protocol == htons(ETH_P_IPV6)) {
786 struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
787
788 if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
789 if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
790 rt6->rt6i_dst.plen == 128) {
791 rt6->rt6i_flags |= RTF_MODIFIED;
792 skb->dst->metrics[RTAX_MTU-1] = mtu;
793 }
794 }
795
796 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
797 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
798 ip_rt_put(rt);
799 goto tx_error;
800 }
801 }
802#endif
803
804 if (tunnel->err_count > 0) {
805 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
806 tunnel->err_count--;
807
808 dst_link_failure(skb);
809 } else
810 tunnel->err_count = 0;
811 }
812
813 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
814
815 if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
816 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
817 if (!new_skb) {
818 ip_rt_put(rt);
819 stats->tx_dropped++;
820 dev_kfree_skb(skb);
821 tunnel->recursion--;
822 return 0;
823 }
824 if (skb->sk)
825 skb_set_owner_w(new_skb, skb->sk);
826 dev_kfree_skb(skb);
827 skb = new_skb;
828 old_iph = skb->nh.iph;
829 }
830
831 skb->h.raw = skb->nh.raw;
832 skb->nh.raw = skb_push(skb, gre_hlen);
833 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
834 dst_release(skb->dst);
835 skb->dst = &rt->u.dst;
836
837 /*
838 * Push down and install the IPIP header.
839 */
840
841 iph = skb->nh.iph;
842 iph->version = 4;
843 iph->ihl = sizeof(struct iphdr) >> 2;
844 iph->frag_off = df;
845 iph->protocol = IPPROTO_GRE;
846 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
847 iph->daddr = rt->rt_dst;
848 iph->saddr = rt->rt_src;
849
850 if ((iph->ttl = tiph->ttl) == 0) {
851 if (skb->protocol == htons(ETH_P_IP))
852 iph->ttl = old_iph->ttl;
853#ifdef CONFIG_IPV6
854 else if (skb->protocol == htons(ETH_P_IPV6))
855 iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
856#endif
857 else
858 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
859 }
860
861 ((u16*)(iph+1))[0] = tunnel->parms.o_flags;
862 ((u16*)(iph+1))[1] = skb->protocol;
863
864 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
865 u32 *ptr = (u32*)(((u8*)iph) + tunnel->hlen - 4);
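		/* ptr starts at the last optional 32-bit word of the GRE
		   header and walks backwards, so the fields land in on-wire
		   order: checksum, key, sequence number. */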
866
867 if (tunnel->parms.o_flags&GRE_SEQ) {
868 ++tunnel->o_seqno;
869 *ptr = htonl(tunnel->o_seqno);
870 ptr--;
871 }
872 if (tunnel->parms.o_flags&GRE_KEY) {
873 *ptr = tunnel->parms.o_key;
874 ptr--;
875 }
876 if (tunnel->parms.o_flags&GRE_CSUM) {
877 *ptr = 0;
878 *(__u16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
879 }
880 }
881
882 nf_reset(skb);
883
884 IPTUNNEL_XMIT();
885 tunnel->recursion--;
886 return 0;
887
888tx_error_icmp:
889 dst_link_failure(skb);
890
891tx_error:
892 stats->tx_errors++;
893 dev_kfree_skb(skb);
894 tunnel->recursion--;
895 return 0;
896}
897
898static int
899ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
900{
901 int err = 0;
902 struct ip_tunnel_parm p;
903 struct ip_tunnel *t;
904
905 switch (cmd) {
906 case SIOCGETTUNNEL:
907 t = NULL;
908 if (dev == ipgre_fb_tunnel_dev) {
909 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
910 err = -EFAULT;
911 break;
912 }
913 t = ipgre_tunnel_locate(&p, 0);
914 }
915 if (t == NULL)
916 t = (struct ip_tunnel*)dev->priv;
917 memcpy(&p, &t->parms, sizeof(p));
918 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
919 err = -EFAULT;
920 break;
921
922 case SIOCADDTUNNEL:
923 case SIOCCHGTUNNEL:
924 err = -EPERM;
925 if (!capable(CAP_NET_ADMIN))
926 goto done;
927
928 err = -EFAULT;
929 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
930 goto done;
931
932 err = -EINVAL;
933 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
934 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
935 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
936 goto done;
937 if (p.iph.ttl)
938 p.iph.frag_off |= htons(IP_DF);
939
940 if (!(p.i_flags&GRE_KEY))
941 p.i_key = 0;
942 if (!(p.o_flags&GRE_KEY))
943 p.o_key = 0;
944
945 t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
946
947 if (dev != ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
948 if (t != NULL) {
949 if (t->dev != dev) {
950 err = -EEXIST;
951 break;
952 }
953 } else {
954 unsigned nflags=0;
955
956 t = (struct ip_tunnel*)dev->priv;
957
958 if (MULTICAST(p.iph.daddr))
959 nflags = IFF_BROADCAST;
960 else if (p.iph.daddr)
961 nflags = IFF_POINTOPOINT;
962
963 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
964 err = -EINVAL;
965 break;
966 }
967 ipgre_tunnel_unlink(t);
968 t->parms.iph.saddr = p.iph.saddr;
969 t->parms.iph.daddr = p.iph.daddr;
970 t->parms.i_key = p.i_key;
971 t->parms.o_key = p.o_key;
972 memcpy(dev->dev_addr, &p.iph.saddr, 4);
973 memcpy(dev->broadcast, &p.iph.daddr, 4);
974 ipgre_tunnel_link(t);
975 netdev_state_change(dev);
976 }
977 }
978
979 if (t) {
980 err = 0;
981 if (cmd == SIOCCHGTUNNEL) {
982 t->parms.iph.ttl = p.iph.ttl;
983 t->parms.iph.tos = p.iph.tos;
984 t->parms.iph.frag_off = p.iph.frag_off;
985 }
986 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
987 err = -EFAULT;
988 } else
989 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
990 break;
991
992 case SIOCDELTUNNEL:
993 err = -EPERM;
994 if (!capable(CAP_NET_ADMIN))
995 goto done;
996
997 if (dev == ipgre_fb_tunnel_dev) {
998 err = -EFAULT;
999 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1000 goto done;
1001 err = -ENOENT;
1002 if ((t = ipgre_tunnel_locate(&p, 0)) == NULL)
1003 goto done;
1004 err = -EPERM;
1005 if (t == ipgre_fb_tunnel_dev->priv)
1006 goto done;
1007 dev = t->dev;
1008 }
1009 err = unregister_netdevice(dev);
1010 break;
1011
1012 default:
1013 err = -EINVAL;
1014 }
1015
1016done:
1017 return err;
1018}
1019
1020static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
1021{
1022 return &(((struct ip_tunnel*)dev->priv)->stat);
1023}
1024
1025static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1026{
1027 struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
1028 if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
1029 return -EINVAL;
1030 dev->mtu = new_mtu;
1031 return 0;
1032}
1033
1034#ifdef CONFIG_NET_IPGRE_BROADCAST
1035/* Nice toy. Unfortunately, useless in real life :-)
1036 It allows constructing a virtual multiprotocol broadcast "LAN"
1037 over the Internet, provided multicast routing is set up.
1038
1039
1040 I have no idea whether this bicycle was invented before me,
1041 so I had to set ARPHRD_IPGRE to a random value.
1042 I have the impression that Cisco could do something similar,
1043 but this feature is apparently missing in IOS<=11.2(8).
1044
1045 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1046 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1047
1048 ping -t 255 224.66.66.66
1049
1050 If nobody answers, mbone does not work.
1051
1052 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1053 ip addr add 10.66.66.<somewhat>/24 dev Universe
1054 ifconfig Universe up
1055 ifconfig Universe add fe80::<Your_real_addr>/10
1056 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1057 ftp 10.66.66.66
1058 ...
1059 ftp fec0:6666:6666::193.233.7.65
1060 ...
1061
1062 */
1063
1064static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
1065 void *daddr, void *saddr, unsigned len)
1066{
1067 struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
1068 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1069 u16 *p = (u16*)(iph+1);
1070
1071 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1072 p[0] = t->parms.o_flags;
1073 p[1] = htons(type);
1074
1075 /*
1076 * Set the source hardware address.
1077 */
1078
1079 if (saddr)
1080 memcpy(&iph->saddr, saddr, 4);
1081
1082 if (daddr) {
1083 memcpy(&iph->daddr, daddr, 4);
1084 return t->hlen;
1085 }
1086 if (iph->daddr && !MULTICAST(iph->daddr))
1087 return t->hlen;
1088
1089 return -t->hlen;
1090}
1091
1092static int ipgre_open(struct net_device *dev)
1093{
1094 struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
1095
1096 if (MULTICAST(t->parms.iph.daddr)) {
1097 struct flowi fl = { .oif = t->parms.link,
1098 .nl_u = { .ip4_u =
1099 { .daddr = t->parms.iph.daddr,
1100 .saddr = t->parms.iph.saddr,
1101 .tos = RT_TOS(t->parms.iph.tos) } },
1102 .proto = IPPROTO_GRE };
1103 struct rtable *rt;
1104 if (ip_route_output_key(&rt, &fl))
1105 return -EADDRNOTAVAIL;
1106 dev = rt->u.dst.dev;
1107 ip_rt_put(rt);
1108 if (__in_dev_get(dev) == NULL)
1109 return -EADDRNOTAVAIL;
1110 t->mlink = dev->ifindex;
1111 ip_mc_inc_group(__in_dev_get(dev), t->parms.iph.daddr);
1112 }
1113 return 0;
1114}
1115
1116static int ipgre_close(struct net_device *dev)
1117{
1118 struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
1119 if (MULTICAST(t->parms.iph.daddr) && t->mlink) {
1120 struct in_device *in_dev = inetdev_by_index(t->mlink);
1121 if (in_dev) {
1122 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1123 in_dev_put(in_dev);
1124 }
1125 }
1126 return 0;
1127}
1128
1129#endif
1130
1131static void ipgre_tunnel_setup(struct net_device *dev)
1132{
1133 SET_MODULE_OWNER(dev);
1134 dev->uninit = ipgre_tunnel_uninit;
1135 dev->destructor = free_netdev;
1136 dev->hard_start_xmit = ipgre_tunnel_xmit;
1137 dev->get_stats = ipgre_tunnel_get_stats;
1138 dev->do_ioctl = ipgre_tunnel_ioctl;
1139 dev->change_mtu = ipgre_tunnel_change_mtu;
1140
1141 dev->type = ARPHRD_IPGRE;
1142 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1143 dev->mtu = 1500 - sizeof(struct iphdr) - 4;
1144 dev->flags = IFF_NOARP;
1145 dev->iflink = 0;
1146 dev->addr_len = 4;
1147}
1148
1149static int ipgre_tunnel_init(struct net_device *dev)
1150{
1151 struct net_device *tdev = NULL;
1152 struct ip_tunnel *tunnel;
1153 struct iphdr *iph;
1154 int hlen = LL_MAX_HEADER;
1155 int mtu = 1500;
1156 int addend = sizeof(struct iphdr) + 4;
1157
1158 tunnel = (struct ip_tunnel*)dev->priv;
1159 iph = &tunnel->parms.iph;
1160
1161 tunnel->dev = dev;
1162 strcpy(tunnel->parms.name, dev->name);
1163
1164 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1165 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1166
1167 /* Guess output device to choose reasonable mtu and hard_header_len */
1168
1169 if (iph->daddr) {
1170 struct flowi fl = { .oif = tunnel->parms.link,
1171 .nl_u = { .ip4_u =
1172 { .daddr = iph->daddr,
1173 .saddr = iph->saddr,
1174 .tos = RT_TOS(iph->tos) } },
1175 .proto = IPPROTO_GRE };
1176 struct rtable *rt;
1177 if (!ip_route_output_key(&rt, &fl)) {
1178 tdev = rt->u.dst.dev;
1179 ip_rt_put(rt);
1180 }
1181
1182 dev->flags |= IFF_POINTOPOINT;
1183
1184#ifdef CONFIG_NET_IPGRE_BROADCAST
1185 if (MULTICAST(iph->daddr)) {
1186 if (!iph->saddr)
1187 return -EINVAL;
1188 dev->flags = IFF_BROADCAST;
1189 dev->hard_header = ipgre_header;
1190 dev->open = ipgre_open;
1191 dev->stop = ipgre_close;
1192 }
1193#endif
1194 }
1195
1196 if (!tdev && tunnel->parms.link)
1197 tdev = __dev_get_by_index(tunnel->parms.link);
1198
1199 if (tdev) {
1200 hlen = tdev->hard_header_len;
1201 mtu = tdev->mtu;
1202 }
1203 dev->iflink = tunnel->parms.link;
1204
1205 /* Precalculate GRE options length */
1206 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
1207 if (tunnel->parms.o_flags&GRE_CSUM)
1208 addend += 4;
1209 if (tunnel->parms.o_flags&GRE_KEY)
1210 addend += 4;
1211 if (tunnel->parms.o_flags&GRE_SEQ)
1212 addend += 4;
1213 }
1214 dev->hard_header_len = hlen + addend;
1215 dev->mtu = mtu - addend;
1216 tunnel->hlen = addend;
1217 return 0;
1218}
1219
1220int __init ipgre_fb_tunnel_init(struct net_device *dev)
1221{
1222 struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
1223 struct iphdr *iph = &tunnel->parms.iph;
1224
1225 tunnel->dev = dev;
1226 strcpy(tunnel->parms.name, dev->name);
1227
1228 iph->version = 4;
1229 iph->protocol = IPPROTO_GRE;
1230 iph->ihl = 5;
1231 tunnel->hlen = sizeof(struct iphdr) + 4;
1232
1233 dev_hold(dev);
1234 tunnels_wc[0] = tunnel;
1235 return 0;
1236}
1237
1238
1239static struct net_protocol ipgre_protocol = {
1240 .handler = ipgre_rcv,
1241 .err_handler = ipgre_err,
1242};
1243
1244
1245/*
1246 * And now the module code and kernel interface.
1247 */
1248
1249static int __init ipgre_init(void)
1250{
1251 int err;
1252
1253 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1254
1255 if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1256 printk(KERN_INFO "ipgre init: can't add protocol\n");
1257 return -EAGAIN;
1258 }
1259
1260 ipgre_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1261 ipgre_tunnel_setup);
1262 if (!ipgre_fb_tunnel_dev) {
1263 err = -ENOMEM;
1264 goto err1;
1265 }
1266
1267 ipgre_fb_tunnel_dev->init = ipgre_fb_tunnel_init;
1268
1269 if ((err = register_netdev(ipgre_fb_tunnel_dev)))
1270 goto err2;
1271out:
1272 return err;
1273err2:
1274 free_netdev(ipgre_fb_tunnel_dev);
1275err1:
1276 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1277 goto out;
1278}
1279
1280static void ipgre_fini(void)
1281{
1282 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1283 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1284
1285 unregister_netdev(ipgre_fb_tunnel_dev);
1286}
1287
1288module_init(ipgre_init);
1289module_exit(ipgre_fini);
1290MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
new file mode 100644
index 000000000000..a0d0833034be
--- /dev/null
+++ b/net/ipv4/ip_input.c
@@ -0,0 +1,431 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) module.
7 *
8 * Version: $Id: ip_input.c,v 1.55 2002/01/12 07:39:45 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Donald Becker, <becker@super.org>
13 * Alan Cox, <Alan.Cox@linux.org>
14 * Richard Underwood
15 * Stefan Becker, <stefanb@yello.ping.de>
16 * Jorge Cwik, <jorge@laser.satlink.net>
17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18 *
19 *
20 * Fixes:
21 * Alan Cox : Commented a couple of minor bits of surplus code
22 * Alan Cox : Undefining IP_FORWARD doesn't include the code
23 * (just stops a compiler warning).
24 * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes
25 * are junked rather than corrupting things.
26 * Alan Cox : Frames to bad broadcast subnets are dumped
27 * We used to process them non broadcast and
28 * boy could that cause havoc.
29 * Alan Cox : ip_forward sets the free flag on the
30 * new frame it queues. Still crap because
31 * it copies the frame but at least it
32 * doesn't eat memory too.
33 * Alan Cox : Generic queue code and memory fixes.
34 * Fred Van Kempen : IP fragment support (borrowed from NET2E)
35 * Gerhard Koerting: Forward fragmented frames correctly.
36 * Gerhard Koerting: Fixes to my fix of the above 8-).
37 * Gerhard Koerting: IP interface addressing fix.
38 * Linus Torvalds : More robustness checks
39 * Alan Cox : Even more checks: Still not as robust as it ought to be
40 * Alan Cox : Save IP header pointer for later
41 * Alan Cox : ip option setting
42 * Alan Cox : Use ip_tos/ip_ttl settings
43 * Alan Cox : Fragmentation bogosity removed
44 * (Thanks to Mark.Bush@prg.ox.ac.uk)
45 * Dmitry Gorodchanin : Send of a raw packet crash fix.
46 * Alan Cox : Silly ip bug when an overlength
47 * fragment turns up. Now frees the
48 * queue.
49 * Linus Torvalds/ : Memory leakage on fragmentation
50 * Alan Cox : handling.
51 * Gerhard Koerting: Forwarding uses IP priority hints
52 * Teemu Rantanen : Fragment problems.
53 * Alan Cox : General cleanup, comments and reformat
54 * Alan Cox : SNMP statistics
55 * Alan Cox : BSD address rule semantics. Also see
56 * UDP as there is a nasty checksum issue
57 * if you do things the wrong way.
58 * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file
59 * Alan Cox : IP options adjust sk->priority.
60 * Pedro Roque : Fix mtu/length error in ip_forward.
61 * Alan Cox : Avoid ip_chk_addr when possible.
62 * Richard Underwood : IP multicasting.
63 * Alan Cox : Cleaned up multicast handlers.
64 * Alan Cox : RAW sockets demultiplex in the BSD style.
65 * Gunther Mayer : Fix the SNMP reporting typo
66 * Alan Cox : Always in group 224.0.0.1
67 * Pauline Middelink : Fast ip_checksum update when forwarding
68 * Masquerading support.
69 * Alan Cox : Multicast loopback error for 224.0.0.1
70 * Alan Cox : IP_MULTICAST_LOOP option.
71 * Alan Cox : Use notifiers.
72 * Bjorn Ekwall : Removed ip_csum (from slhc.c too)
73 * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!)
74 * Stefan Becker : Send out ICMP HOST REDIRECT
75 * Arnt Gulbrandsen : ip_build_xmit
76 * Alan Cox : Per socket routing cache
77 * Alan Cox : Fixed routing cache, added header cache.
78 * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it.
79 * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net.
80 * Alan Cox : Incoming IP option handling.
81 * Alan Cox : Set saddr on raw output frames as per BSD.
82 * Alan Cox : Stopped broadcast source route explosions.
83 * Alan Cox : Can disable source routing
84 * Takeshi Sone : Masquerading didn't work.
85 * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible.
86 * Alan Cox : Memory leaks, tramples, misc debugging.
87 * Alan Cox : Fixed multicast (by popular demand 8))
88 * Alan Cox : Fixed forwarding (by even more popular demand 8))
89 * Alan Cox : Fixed SNMP statistics [I think]
90 * Gerhard Koerting : IP fragmentation forwarding fix
91 * Alan Cox : Device lock against page fault.
92 * Alan Cox : IP_HDRINCL facility.
93 * Werner Almesberger : Zero fragment bug
94 * Alan Cox : RAW IP frame length bug
95 * Alan Cox : Outgoing firewall on build_xmit
96 * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel
97 * Alan Cox : Multicast routing hooks
98 * Jos Vos : Do accounting *before* call_in_firewall
99 * Willy Konynenberg : Transparent proxying support
100 *
101 *
102 *
103 * To Fix:
104 * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient
105 * and could be made very efficient with the addition of some virtual memory hacks to permit
106 * the allocation of a buffer that can then be 'grown' by twiddling page tables.
107 * Output fragmentation wants updating along with the buffer management to use a single
108 * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet
109 * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause
110 * fragmentation anyway.
111 *
112 * This program is free software; you can redistribute it and/or
113 * modify it under the terms of the GNU General Public License
114 * as published by the Free Software Foundation; either version
115 * 2 of the License, or (at your option) any later version.
116 */
117
118#include <asm/system.h>
119#include <linux/module.h>
120#include <linux/types.h>
121#include <linux/kernel.h>
122#include <linux/string.h>
123#include <linux/errno.h>
124#include <linux/config.h>
125
126#include <linux/net.h>
127#include <linux/socket.h>
128#include <linux/sockios.h>
129#include <linux/in.h>
130#include <linux/inet.h>
131#include <linux/netdevice.h>
132#include <linux/etherdevice.h>
133
134#include <net/snmp.h>
135#include <net/ip.h>
136#include <net/protocol.h>
137#include <net/route.h>
138#include <linux/skbuff.h>
139#include <net/sock.h>
140#include <net/arp.h>
141#include <net/icmp.h>
142#include <net/raw.h>
143#include <net/checksum.h>
144#include <linux/netfilter_ipv4.h>
145#include <net/xfrm.h>
146#include <linux/mroute.h>
147#include <linux/netlink.h>
148
149/*
150 * SNMP management statistics
151 */
152
153DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics);
154
155/*
156 * Process Router Attention IP option
157 */
158int ip_call_ra_chain(struct sk_buff *skb)
159{
160 struct ip_ra_chain *ra;
161 u8 protocol = skb->nh.iph->protocol;
162 struct sock *last = NULL;
163
164 read_lock(&ip_ra_lock);
165 for (ra = ip_ra_chain; ra; ra = ra->next) {
166 struct sock *sk = ra->sk;
167
168 /* If socket is bound to an interface, only report
169 * the packet if it came from that interface.
170 */
171 if (sk && inet_sk(sk)->num == protocol &&
172 (!sk->sk_bound_dev_if ||
173 sk->sk_bound_dev_if == skb->dev->ifindex)) {
174 if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
175 skb = ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN);
176 if (skb == NULL) {
177 read_unlock(&ip_ra_lock);
178 return 1;
179 }
180 }
181 if (last) {
182 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
183 if (skb2)
184 raw_rcv(last, skb2);
185 }
186 last = sk;
187 }
188 }
189
190 if (last) {
191 raw_rcv(last, skb);
192 read_unlock(&ip_ra_lock);
193 return 1;
194 }
195 read_unlock(&ip_ra_lock);
196 return 0;
197}
198
199static inline int ip_local_deliver_finish(struct sk_buff *skb)
200{
201 int ihl = skb->nh.iph->ihl*4;
202
203#ifdef CONFIG_NETFILTER_DEBUG
204 nf_debug_ip_local_deliver(skb);
205#endif /*CONFIG_NETFILTER_DEBUG*/
206
207 __skb_pull(skb, ihl);
208
209        /* Free the reference early: we don't need it any more, and it may
210           keep the ip_conntrack module loaded indefinitely. */
211 nf_reset(skb);
212
213 /* Point into the IP datagram, just past the header. */
214 skb->h.raw = skb->data;
215
216 rcu_read_lock();
217 {
218 /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
219 int protocol = skb->nh.iph->protocol;
220 int hash;
221 struct sock *raw_sk;
222 struct net_protocol *ipprot;
223
224 resubmit:
225 hash = protocol & (MAX_INET_PROTOS - 1);
226 raw_sk = sk_head(&raw_v4_htable[hash]);
227
228		/* If there may be a raw socket we must check; if not, we
229		 * couldn't care less
230		 */
231 if (raw_sk)
232 raw_v4_input(skb, skb->nh.iph, hash);
233
234 if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
235 int ret;
236
237 if (!ipprot->no_policy &&
238 !xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
239 kfree_skb(skb);
240 goto out;
241 }
242 ret = ipprot->handler(skb);
243 if (ret < 0) {
244 protocol = -ret;
245 goto resubmit;
246 }
247 IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
248 } else {
249 if (!raw_sk) {
250 if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
251 IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
252 icmp_send(skb, ICMP_DEST_UNREACH,
253 ICMP_PROT_UNREACH, 0);
254 }
255 } else
256 IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
257 kfree_skb(skb);
258 }
259 }
260 out:
261 rcu_read_unlock();
262
263 return 0;
264}
265
266/*
267 * Deliver IP Packets to the higher protocol layers.
268 */
269int ip_local_deliver(struct sk_buff *skb)
270{
271 /*
272 * Reassemble IP fragments.
273 */
274
275 if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
276 skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER);
277 if (!skb)
278 return 0;
279 }
280
281 return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL,
282 ip_local_deliver_finish);
283}
284
285static inline int ip_rcv_finish(struct sk_buff *skb)
286{
287 struct net_device *dev = skb->dev;
288 struct iphdr *iph = skb->nh.iph;
289
290 /*
291 * Initialise the virtual path cache for the packet. It describes
292 * how the packet travels inside Linux networking.
293 */
294 if (skb->dst == NULL) {
295 if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
296 goto drop;
297 }
298
299#ifdef CONFIG_NET_CLS_ROUTE
300 if (skb->dst->tclassid) {
301 struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id();
302 u32 idx = skb->dst->tclassid;
303 st[idx&0xFF].o_packets++;
304 st[idx&0xFF].o_bytes+=skb->len;
305 st[(idx>>16)&0xFF].i_packets++;
306 st[(idx>>16)&0xFF].i_bytes+=skb->len;
307 }
308#endif
309
310 if (iph->ihl > 5) {
311 struct ip_options *opt;
312
313		/* It looks like overkill, because not all
314		   IP options require packet mangling.
315		   But it is the easiest way for now, especially taking
316		   into account that the combination of IP options
317		   and a running sniffer is an extremely rare condition.
318		                                      --ANK (980813)
319		*/
320
321 if (skb_cow(skb, skb_headroom(skb))) {
322 IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
323 goto drop;
324 }
325 iph = skb->nh.iph;
326
327 if (ip_options_compile(NULL, skb))
328 goto inhdr_error;
329
330 opt = &(IPCB(skb)->opt);
331 if (opt->srr) {
332 struct in_device *in_dev = in_dev_get(dev);
333 if (in_dev) {
334 if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
335 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
336 printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n",
337 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
338 in_dev_put(in_dev);
339 goto drop;
340 }
341 in_dev_put(in_dev);
342 }
343 if (ip_options_rcv_srr(skb))
344 goto drop;
345 }
346 }
347
348 return dst_input(skb);
349
350inhdr_error:
351 IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
352drop:
353 kfree_skb(skb);
354 return NET_RX_DROP;
355}
356
357/*
358 * Main IP Receive routine.
359 */
360int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
361{
362 struct iphdr *iph;
363
364 /* When the interface is in promisc. mode, drop all the crap
365 * that it receives, do not try to analyse it.
366 */
367 if (skb->pkt_type == PACKET_OTHERHOST)
368 goto drop;
369
370 IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES);
371
372 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
373 IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
374 goto out;
375 }
376
377 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
378 goto inhdr_error;
379
380 iph = skb->nh.iph;
381
382 /*
383 * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum.
384 *
385 * Is the datagram acceptable?
386 *
387 * 1. Length at least the size of an ip header
388 * 2. Version of 4
389 * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
390 * 4. Doesn't have a bogus length
391 */
392
393 if (iph->ihl < 5 || iph->version != 4)
394 goto inhdr_error;
395
396 if (!pskb_may_pull(skb, iph->ihl*4))
397 goto inhdr_error;
398
399 iph = skb->nh.iph;
400
401 if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
402 goto inhdr_error;
403
404 {
405 __u32 len = ntohs(iph->tot_len);
406 if (skb->len < len || len < (iph->ihl<<2))
407 goto inhdr_error;
408
409		/* Our transport medium may have padded the buffer out. Now that we know it
410		 * is IP, we can trim to the true length of the frame.
411 * Note this now means skb->len holds ntohs(iph->tot_len).
412 */
413 if (pskb_trim_rcsum(skb, len)) {
414 IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
415 goto drop;
416 }
417 }
418
419 return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
420 ip_rcv_finish);
421
422inhdr_error:
423 IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
424drop:
425 kfree_skb(skb);
426out:
427 return NET_RX_DROP;
428}
429
430EXPORT_SYMBOL(ip_rcv);
431EXPORT_SYMBOL(ip_statistics);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
new file mode 100644
index 000000000000..6d89f3f3e701
--- /dev/null
+++ b/net/ipv4/ip_options.c
@@ -0,0 +1,625 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The options processing module for ip.c
7 *
8 * Version: $Id: ip_options.c,v 1.21 2001/09/01 00:31:50 davem Exp $
9 *
10 * Authors: A.N.Kuznetsov
11 *
12 */
13
14#include <linux/module.h>
15#include <linux/types.h>
16#include <asm/uaccess.h>
17#include <linux/skbuff.h>
18#include <linux/ip.h>
19#include <linux/icmp.h>
20#include <linux/netdevice.h>
21#include <linux/rtnetlink.h>
22#include <net/sock.h>
23#include <net/ip.h>
24#include <net/icmp.h>
25
26/*
27 * Write options to the IP header, record the destination address in the
28 * source route option and the address of the outgoing interface
29 * (we should already know it, so this function may be
30 * called only after the routing decision), and the timestamp,
31 * if we originate this datagram.
32 *
33 * daddr is the real destination address; the next hop is recorded in the IP header.
34 * saddr is the address of the outgoing interface.
35 */
36
37void ip_options_build(struct sk_buff * skb, struct ip_options * opt,
38 u32 daddr, struct rtable *rt, int is_frag)
39{
40 unsigned char * iph = skb->nh.raw;
41
42 memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
43 memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
44 opt = &(IPCB(skb)->opt);
45 opt->is_data = 0;
46
47 if (opt->srr)
48 memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);
49
50 if (!is_frag) {
51 if (opt->rr_needaddr)
52 ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt);
53 if (opt->ts_needaddr)
54 ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, rt);
55 if (opt->ts_needtime) {
56 struct timeval tv;
57 __u32 midtime;
58 do_gettimeofday(&tv);
59 midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000);
60 memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
61 }
62 return;
63 }
64 if (opt->rr) {
65 memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]);
66 opt->rr = 0;
67 opt->rr_needaddr = 0;
68 }
69 if (opt->ts) {
70 memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]);
71 opt->ts = 0;
72 opt->ts_needaddr = opt->ts_needtime = 0;
73 }
74}
75
76/*
77 * Provided (sopt, skb) points to the received options,
78 * build in dopt a compiled option set appropriate for answering,
79 * i.e. invert the SRR option, copy the others,
80 * and grab room in the RR/TS options.
81 *
82 * NOTE: dopt cannot point to skb.
83 */
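/*
 * Rough idea: the hop addresses recorded in the incoming source route
 * are copied out in reverse order so that the reply retraces the path,
 * and dopt->faddr is set to the last recorded hop, which becomes the
 * first hop of the reply.
 */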
84
85int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
86{
87 struct ip_options *sopt;
88 unsigned char *sptr, *dptr;
89 int soffset, doffset;
90 int optlen;
91 u32 daddr;
92
93 memset(dopt, 0, sizeof(struct ip_options));
94
95 dopt->is_data = 1;
96
97 sopt = &(IPCB(skb)->opt);
98
99 if (sopt->optlen == 0) {
100 dopt->optlen = 0;
101 return 0;
102 }
103
104 sptr = skb->nh.raw;
105 dptr = dopt->__data;
106
107 if (skb->dst)
108 daddr = ((struct rtable*)skb->dst)->rt_spec_dst;
109 else
110 daddr = skb->nh.iph->daddr;
111
112 if (sopt->rr) {
113 optlen = sptr[sopt->rr+1];
114 soffset = sptr[sopt->rr+2];
115 dopt->rr = dopt->optlen + sizeof(struct iphdr);
116 memcpy(dptr, sptr+sopt->rr, optlen);
117 if (sopt->rr_needaddr && soffset <= optlen) {
118 if (soffset + 3 > optlen)
119 return -EINVAL;
120 dptr[2] = soffset + 4;
121 dopt->rr_needaddr = 1;
122 }
123 dptr += optlen;
124 dopt->optlen += optlen;
125 }
126 if (sopt->ts) {
127 optlen = sptr[sopt->ts+1];
128 soffset = sptr[sopt->ts+2];
129 dopt->ts = dopt->optlen + sizeof(struct iphdr);
130 memcpy(dptr, sptr+sopt->ts, optlen);
131 if (soffset <= optlen) {
132 if (sopt->ts_needaddr) {
133 if (soffset + 3 > optlen)
134 return -EINVAL;
135 dopt->ts_needaddr = 1;
136 soffset += 4;
137 }
138 if (sopt->ts_needtime) {
139 if (soffset + 3 > optlen)
140 return -EINVAL;
141 if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) {
142 dopt->ts_needtime = 1;
143 soffset += 4;
144 } else {
145 dopt->ts_needtime = 0;
146
147 if (soffset + 8 <= optlen) {
148 __u32 addr;
149
150 memcpy(&addr, sptr+soffset-1, 4);
151 if (inet_addr_type(addr) != RTN_LOCAL) {
152 dopt->ts_needtime = 1;
153 soffset += 8;
154 }
155 }
156 }
157 }
158 dptr[2] = soffset;
159 }
160 dptr += optlen;
161 dopt->optlen += optlen;
162 }
163 if (sopt->srr) {
164 unsigned char * start = sptr+sopt->srr;
165 u32 faddr;
166
167 optlen = start[1];
168 soffset = start[2];
169 doffset = 0;
170 if (soffset > optlen)
171 soffset = optlen + 1;
172 soffset -= 4;
173 if (soffset > 3) {
174 memcpy(&faddr, &start[soffset-1], 4);
175 for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4)
176 memcpy(&dptr[doffset-1], &start[soffset-1], 4);
177 /*
178			 *	   RFC1812 requires fixing illegal source routes.
179 */
180 if (memcmp(&skb->nh.iph->saddr, &start[soffset+3], 4) == 0)
181 doffset -= 4;
182 }
183 if (doffset > 3) {
184 memcpy(&start[doffset-1], &daddr, 4);
185 dopt->faddr = faddr;
186 dptr[0] = start[0];
187 dptr[1] = doffset+3;
188 dptr[2] = 4;
189 dptr += doffset+3;
190 dopt->srr = dopt->optlen + sizeof(struct iphdr);
191 dopt->optlen += doffset+3;
192 dopt->is_strictroute = sopt->is_strictroute;
193 }
194 }
195 while (dopt->optlen & 3) {
196 *dptr++ = IPOPT_END;
197 dopt->optlen++;
198 }
199 return 0;
200}
201
202/*
203 * Options "fragmenting", just fill options not
204 * allowed in fragments with NOOPs.
205 * Simple and stupid 8), but the most efficient way.
206 */
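/*
 * For example, a Record Route option (copy bit clear) gets overwritten
 * with NOOPs here, while a Strict Source Route option (copy bit set)
 * is left intact, matching the RFC 791 rule that only "copied" options
 * are replicated into fragments.
 */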
207
208void ip_options_fragment(struct sk_buff * skb)
209{
210 unsigned char * optptr = skb->nh.raw;
211 struct ip_options * opt = &(IPCB(skb)->opt);
212 int l = opt->optlen;
213 int optlen;
214
215 while (l > 0) {
216 switch (*optptr) {
217 case IPOPT_END:
218 return;
219 case IPOPT_NOOP:
220 l--;
221 optptr++;
222 continue;
223 }
224 optlen = optptr[1];
225 if (optlen<2 || optlen>l)
226 return;
227 if (!IPOPT_COPIED(*optptr))
228 memset(optptr, IPOPT_NOOP, optlen);
229 l -= optlen;
230 optptr += optlen;
231 }
232 opt->ts = 0;
233 opt->rr = 0;
234 opt->rr_needaddr = 0;
235 opt->ts_needaddr = 0;
236 opt->ts_needtime = 0;
237 return;
238}
239
240/*
241 * Verify options and fill pointers in struct options.
242 * Caller should clear *opt, and set opt->data.
243 * If opt == NULL, then skb->data should point to IP header.
244 */
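/*
 * Rough layout of the multi-byte options parsed below (RFC 791):
 *
 *	optptr[0]	type octet (copy flag, class, number)
 *	optptr[1]	total option length in octets
 *	optptr[2]	1-based pointer to the next free slot (SRR/RR/TS)
 *	optptr[3]	overflow count and flags (timestamp option only)
 *
 * The pointer starts at 4 for SRR/RR and at 5 for TS, which is what the
 * sanity checks below enforce.
 */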
245
246int ip_options_compile(struct ip_options * opt, struct sk_buff * skb)
247{
248 int l;
249 unsigned char * iph;
250 unsigned char * optptr;
251 int optlen;
252 unsigned char * pp_ptr = NULL;
253 struct rtable *rt = skb ? (struct rtable*)skb->dst : NULL;
254
255 if (!opt) {
256 opt = &(IPCB(skb)->opt);
257 memset(opt, 0, sizeof(struct ip_options));
258 iph = skb->nh.raw;
259 opt->optlen = ((struct iphdr *)iph)->ihl*4 - sizeof(struct iphdr);
260 optptr = iph + sizeof(struct iphdr);
261 opt->is_data = 0;
262 } else {
263 optptr = opt->is_data ? opt->__data : (unsigned char*)&(skb->nh.iph[1]);
264 iph = optptr - sizeof(struct iphdr);
265 }
266
267 for (l = opt->optlen; l > 0; ) {
268 switch (*optptr) {
269 case IPOPT_END:
270 for (optptr++, l--; l>0; optptr++, l--) {
271 if (*optptr != IPOPT_END) {
272 *optptr = IPOPT_END;
273 opt->is_changed = 1;
274 }
275 }
276 goto eol;
277 case IPOPT_NOOP:
278 l--;
279 optptr++;
280 continue;
281 }
282 optlen = optptr[1];
283 if (optlen<2 || optlen>l) {
284 pp_ptr = optptr;
285 goto error;
286 }
287 switch (*optptr) {
288 case IPOPT_SSRR:
289 case IPOPT_LSRR:
290 if (optlen < 3) {
291 pp_ptr = optptr + 1;
292 goto error;
293 }
294 if (optptr[2] < 4) {
295 pp_ptr = optptr + 2;
296 goto error;
297 }
298 /* NB: cf RFC-1812 5.2.4.1 */
299 if (opt->srr) {
300 pp_ptr = optptr;
301 goto error;
302 }
303 if (!skb) {
304 if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) {
305 pp_ptr = optptr + 1;
306 goto error;
307 }
308 memcpy(&opt->faddr, &optptr[3], 4);
309 if (optlen > 7)
310 memmove(&optptr[3], &optptr[7], optlen-7);
311 }
312 opt->is_strictroute = (optptr[0] == IPOPT_SSRR);
313 opt->srr = optptr - iph;
314 break;
315 case IPOPT_RR:
316 if (opt->rr) {
317 pp_ptr = optptr;
318 goto error;
319 }
320 if (optlen < 3) {
321 pp_ptr = optptr + 1;
322 goto error;
323 }
324 if (optptr[2] < 4) {
325 pp_ptr = optptr + 2;
326 goto error;
327 }
328 if (optptr[2] <= optlen) {
329 if (optptr[2]+3 > optlen) {
330 pp_ptr = optptr + 2;
331 goto error;
332 }
333 if (skb) {
334 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
335 opt->is_changed = 1;
336 }
337 optptr[2] += 4;
338 opt->rr_needaddr = 1;
339 }
340 opt->rr = optptr - iph;
341 break;
342 case IPOPT_TIMESTAMP:
343 if (opt->ts) {
344 pp_ptr = optptr;
345 goto error;
346 }
347 if (optlen < 4) {
348 pp_ptr = optptr + 1;
349 goto error;
350 }
351 if (optptr[2] < 5) {
352 pp_ptr = optptr + 2;
353 goto error;
354 }
355 if (optptr[2] <= optlen) {
356 __u32 * timeptr = NULL;
357 if (optptr[2]+3 > optptr[1]) {
358 pp_ptr = optptr + 2;
359 goto error;
360 }
361 switch (optptr[3]&0xF) {
362 case IPOPT_TS_TSONLY:
363 opt->ts = optptr - iph;
364 if (skb)
365 timeptr = (__u32*)&optptr[optptr[2]-1];
366 opt->ts_needtime = 1;
367 optptr[2] += 4;
368 break;
369 case IPOPT_TS_TSANDADDR:
370 if (optptr[2]+7 > optptr[1]) {
371 pp_ptr = optptr + 2;
372 goto error;
373 }
374 opt->ts = optptr - iph;
375 if (skb) {
376 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
377 timeptr = (__u32*)&optptr[optptr[2]+3];
378 }
379 opt->ts_needaddr = 1;
380 opt->ts_needtime = 1;
381 optptr[2] += 8;
382 break;
383 case IPOPT_TS_PRESPEC:
384 if (optptr[2]+7 > optptr[1]) {
385 pp_ptr = optptr + 2;
386 goto error;
387 }
388 opt->ts = optptr - iph;
389 {
390 u32 addr;
391 memcpy(&addr, &optptr[optptr[2]-1], 4);
392 if (inet_addr_type(addr) == RTN_UNICAST)
393 break;
394 if (skb)
395 timeptr = (__u32*)&optptr[optptr[2]+3];
396 }
397 opt->ts_needtime = 1;
398 optptr[2] += 8;
399 break;
400 default:
401 if (!skb && !capable(CAP_NET_RAW)) {
402 pp_ptr = optptr + 3;
403 goto error;
404 }
405 break;
406 }
407 if (timeptr) {
408 struct timeval tv;
409 __u32 midtime;
410 do_gettimeofday(&tv);
411 midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000);
412 memcpy(timeptr, &midtime, sizeof(__u32));
413 opt->is_changed = 1;
414 }
415 } else {
416 unsigned overflow = optptr[3]>>4;
417 if (overflow == 15) {
418 pp_ptr = optptr + 3;
419 goto error;
420 }
421 opt->ts = optptr - iph;
422 if (skb) {
423 optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4);
424 opt->is_changed = 1;
425 }
426 }
427 break;
428 case IPOPT_RA:
429 if (optlen < 4) {
430 pp_ptr = optptr + 1;
431 goto error;
432 }
433 if (optptr[2] == 0 && optptr[3] == 0)
434 opt->router_alert = optptr - iph;
435 break;
436 case IPOPT_SEC:
437 case IPOPT_SID:
438 default:
439 if (!skb && !capable(CAP_NET_RAW)) {
440 pp_ptr = optptr;
441 goto error;
442 }
443 break;
444 }
445 l -= optlen;
446 optptr += optlen;
447 }
448
449eol:
450 if (!pp_ptr)
451 return 0;
452
453error:
454 if (skb) {
455 icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24));
456 }
457 return -EINVAL;
458}
459
460
461/*
462 * Undo all the changes done by ip_options_compile().
463 */
464
465void ip_options_undo(struct ip_options * opt)
466{
467 if (opt->srr) {
468 unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr);
469 memmove(optptr+7, optptr+3, optptr[1]-7);
470 memcpy(optptr+3, &opt->faddr, 4);
471 }
472 if (opt->rr_needaddr) {
473 unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr);
474 optptr[2] -= 4;
475 memset(&optptr[optptr[2]-1], 0, 4);
476 }
477 if (opt->ts) {
478 unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr);
479 if (opt->ts_needtime) {
480 optptr[2] -= 4;
481 memset(&optptr[optptr[2]-1], 0, 4);
482 if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC)
483 optptr[2] -= 4;
484 }
485 if (opt->ts_needaddr) {
486 optptr[2] -= 4;
487 memset(&optptr[optptr[2]-1], 0, 4);
488 }
489 }
490}
491
492int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user)
493{
494 struct ip_options *opt;
495
496 opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL);
497 if (!opt)
498 return -ENOMEM;
499 memset(opt, 0, sizeof(struct ip_options));
500 if (optlen) {
501 if (user) {
502 if (copy_from_user(opt->__data, data, optlen)) {
503 kfree(opt);
504 return -EFAULT;
505 }
506 } else
507 memcpy(opt->__data, data, optlen);
508 }
509 while (optlen & 3)
510 opt->__data[optlen++] = IPOPT_END;
511 opt->optlen = optlen;
512 opt->is_data = 1;
513 opt->is_setbyuser = 1;
514 if (optlen && ip_options_compile(opt, NULL)) {
515 kfree(opt);
516 return -EINVAL;
517 }
518 if (*optp)
519 kfree(*optp);
520 *optp = opt;
521 return 0;
522}
523
524void ip_forward_options(struct sk_buff *skb)
525{
526 struct ip_options * opt = &(IPCB(skb)->opt);
527 unsigned char * optptr;
528 struct rtable *rt = (struct rtable*)skb->dst;
529 unsigned char *raw = skb->nh.raw;
530
531 if (opt->rr_needaddr) {
532 optptr = (unsigned char *)raw + opt->rr;
533 ip_rt_get_source(&optptr[optptr[2]-5], rt);
534 opt->is_changed = 1;
535 }
536 if (opt->srr_is_hit) {
537 int srrptr, srrspace;
538
539 optptr = raw + opt->srr;
540
541 for ( srrptr=optptr[2], srrspace = optptr[1];
542 srrptr <= srrspace;
543 srrptr += 4
544 ) {
545 if (srrptr + 3 > srrspace)
546 break;
547 if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0)
548 break;
549 }
550 if (srrptr + 3 <= srrspace) {
551 opt->is_changed = 1;
552 ip_rt_get_source(&optptr[srrptr-1], rt);
553 skb->nh.iph->daddr = rt->rt_dst;
554 optptr[2] = srrptr+4;
555 } else if (net_ratelimit())
556 printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
557 if (opt->ts_needaddr) {
558 optptr = raw + opt->ts;
559 ip_rt_get_source(&optptr[optptr[2]-9], rt);
560 opt->is_changed = 1;
561 }
562 }
563 if (opt->is_changed) {
564 opt->is_changed = 0;
565 ip_send_check(skb->nh.iph);
566 }
567}
568
569int ip_options_rcv_srr(struct sk_buff *skb)
570{
571 struct ip_options *opt = &(IPCB(skb)->opt);
572 int srrspace, srrptr;
573 u32 nexthop;
574 struct iphdr *iph = skb->nh.iph;
575 unsigned char * optptr = skb->nh.raw + opt->srr;
576 struct rtable *rt = (struct rtable*)skb->dst;
577 struct rtable *rt2;
578 int err;
579
580 if (!opt->srr)
581 return 0;
582
583 if (skb->pkt_type != PACKET_HOST)
584 return -EINVAL;
585 if (rt->rt_type == RTN_UNICAST) {
586 if (!opt->is_strictroute)
587 return 0;
588 icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24));
589 return -EINVAL;
590 }
591 if (rt->rt_type != RTN_LOCAL)
592 return -EINVAL;
593
594 for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
595 if (srrptr + 3 > srrspace) {
596 icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));
597 return -EINVAL;
598 }
599 memcpy(&nexthop, &optptr[srrptr-1], 4);
600
601 rt = (struct rtable*)skb->dst;
602 skb->dst = NULL;
603 err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
604 rt2 = (struct rtable*)skb->dst;
605 if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
606 ip_rt_put(rt2);
607 skb->dst = &rt->u.dst;
608 return -EINVAL;
609 }
610 ip_rt_put(rt);
611 if (rt2->rt_type != RTN_LOCAL)
612 break;
613 /* Superfast 8) loopback forward */
614 memcpy(&iph->daddr, &optptr[srrptr-1], 4);
615 opt->is_changed = 1;
616 }
617 if (srrptr <= srrspace) {
618 opt->srr_is_hit = 1;
619 opt->is_changed = 1;
620 }
621 return 0;
622}
623
624EXPORT_SYMBOL(ip_options_compile);
625EXPORT_SYMBOL(ip_options_undo);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
new file mode 100644
index 000000000000..30ab7b6ab761
--- /dev/null
+++ b/net/ipv4/ip_output.c
@@ -0,0 +1,1359 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) output module.
7 *
8 * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Donald Becker, <becker@super.org>
13 * Alan Cox, <Alan.Cox@linux.org>
14 * Richard Underwood
15 * Stefan Becker, <stefanb@yello.ping.de>
16 * Jorge Cwik, <jorge@laser.satlink.net>
17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18 * Hirokazu Takahashi, <taka@valinux.co.jp>
19 *
20 * See ip_input.c for original log
21 *
22 * Fixes:
23 * Alan Cox : Missing nonblock feature in ip_build_xmit.
24 * Mike Kilburn : htons() missing in ip_build_xmit.
25 * Bradford Johnson: Fix faulty handling of some frames when
26 * no route is found.
27 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
28 * (in case if packet not accepted by
29 * output firewall rules)
30 * Mike McLagan : Routing by source
31 * Alexey Kuznetsov: use new route cache
32 * Andi Kleen: Fix broken PMTU recovery and remove
33 * some redundant tests.
34 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
35 * Andi Kleen : Replace ip_reply with ip_send_reply.
36 * Andi Kleen : Split fast and slow ip_build_xmit path
37 * for decreased register pressure on x86
38 * and more readibility.
39 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
40 * silently drop skb instead of failing with -EPERM.
41 * Detlev Wengorz : Copy protocol for fragments.
42 * Hirokazu Takahashi: HW checksumming for outgoing UDP
43 * datagrams.
44 * Hirokazu Takahashi: sendfile() on UDP works now.
45 */
46
47#include <asm/uaccess.h>
48#include <asm/system.h>
49#include <linux/module.h>
50#include <linux/types.h>
51#include <linux/kernel.h>
52#include <linux/sched.h>
53#include <linux/mm.h>
54#include <linux/string.h>
55#include <linux/errno.h>
56#include <linux/config.h>
57
58#include <linux/socket.h>
59#include <linux/sockios.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/etherdevice.h>
64#include <linux/proc_fs.h>
65#include <linux/stat.h>
66#include <linux/init.h>
67
68#include <net/snmp.h>
69#include <net/ip.h>
70#include <net/protocol.h>
71#include <net/route.h>
72#include <net/tcp.h>
73#include <net/udp.h>
74#include <linux/skbuff.h>
75#include <net/sock.h>
76#include <net/arp.h>
77#include <net/icmp.h>
78#include <net/raw.h>
79#include <net/checksum.h>
80#include <net/inetpeer.h>
81#include <net/checksum.h>
82#include <linux/igmp.h>
83#include <linux/netfilter_ipv4.h>
84#include <linux/netfilter_bridge.h>
85#include <linux/mroute.h>
86#include <linux/netlink.h>
87
88/*
89 * Shall we try to damage output packets if routing dev changes?
90 */
91
92int sysctl_ip_dynaddr;
93int sysctl_ip_default_ttl = IPDEFTTL;
94
95/* Generate a checksum for an outgoing IP datagram. */
96__inline__ void ip_send_check(struct iphdr *iph)
97{
98 iph->check = 0;
99 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
100}
101
102/* dev_loopback_xmit for use with netfilter. */
103static int ip_dev_loopback_xmit(struct sk_buff *newskb)
104{
105 newskb->mac.raw = newskb->data;
106 __skb_pull(newskb, newskb->nh.raw - newskb->data);
107 newskb->pkt_type = PACKET_LOOPBACK;
108 newskb->ip_summed = CHECKSUM_UNNECESSARY;
109 BUG_TRAP(newskb->dst);
110
111#ifdef CONFIG_NETFILTER_DEBUG
112 nf_debug_ip_loopback_xmit(newskb);
113#endif
114 netif_rx(newskb);
115 return 0;
116}
117
118static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
119{
120 int ttl = inet->uc_ttl;
121
122 if (ttl < 0)
123 ttl = dst_metric(dst, RTAX_HOPLIMIT);
124 return ttl;
125}
126
127/*
128 * Add an ip header to a skbuff and send it out.
129 *
130 */
131int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
132 u32 saddr, u32 daddr, struct ip_options *opt)
133{
134 struct inet_sock *inet = inet_sk(sk);
135 struct rtable *rt = (struct rtable *)skb->dst;
136 struct iphdr *iph;
137
138 /* Build the IP header. */
139 if (opt)
140 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
141 else
142 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
143
144 iph->version = 4;
145 iph->ihl = 5;
146 iph->tos = inet->tos;
147 if (ip_dont_fragment(sk, &rt->u.dst))
148 iph->frag_off = htons(IP_DF);
149 else
150 iph->frag_off = 0;
151 iph->ttl = ip_select_ttl(inet, &rt->u.dst);
152 iph->daddr = rt->rt_dst;
153 iph->saddr = rt->rt_src;
154 iph->protocol = sk->sk_protocol;
155 iph->tot_len = htons(skb->len);
156 ip_select_ident(iph, &rt->u.dst, sk);
157 skb->nh.iph = iph;
158
159 if (opt && opt->optlen) {
160 iph->ihl += opt->optlen>>2;
161 ip_options_build(skb, opt, daddr, rt, 0);
162 }
163 ip_send_check(iph);
164
165 skb->priority = sk->sk_priority;
166
167 /* Send it out. */
168 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
169 dst_output);
170}
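/*
 * A small numeric illustration of the header-length handling above
 * (figures are examples only): ihl counts 32-bit words, so the bare
 * 20 byte header is ihl = 5; appending, say, 8 bytes of options gives
 * optlen >> 2 == 2 and hence ihl = 7, i.e. a 28 byte header, which is
 * exactly the region ip_fast_csum() covers when ip_send_check() runs.
 */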
171
172static inline int ip_finish_output2(struct sk_buff *skb)
173{
174 struct dst_entry *dst = skb->dst;
175 struct hh_cache *hh = dst->hh;
176 struct net_device *dev = dst->dev;
177 int hh_len = LL_RESERVED_SPACE(dev);
178
179 /* Be paranoid, rather than too clever. */
180 if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
181 struct sk_buff *skb2;
182
183 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
184 if (skb2 == NULL) {
185 kfree_skb(skb);
186 return -ENOMEM;
187 }
188 if (skb->sk)
189 skb_set_owner_w(skb2, skb->sk);
190 kfree_skb(skb);
191 skb = skb2;
192 }
193
194#ifdef CONFIG_NETFILTER_DEBUG
195 nf_debug_ip_finish_output2(skb);
196#endif /*CONFIG_NETFILTER_DEBUG*/
197
198 if (hh) {
199 int hh_alen;
200
201 read_lock_bh(&hh->hh_lock);
202 hh_alen = HH_DATA_ALIGN(hh->hh_len);
203 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
204 read_unlock_bh(&hh->hh_lock);
205 skb_push(skb, hh->hh_len);
206 return hh->hh_output(skb);
207 } else if (dst->neighbour)
208 return dst->neighbour->output(skb);
209
210 if (net_ratelimit())
211 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
212 kfree_skb(skb);
213 return -EINVAL;
214}
215
216int ip_finish_output(struct sk_buff *skb)
217{
218 struct net_device *dev = skb->dst->dev;
219
220 skb->dev = dev;
221 skb->protocol = htons(ETH_P_IP);
222
223 return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
224 ip_finish_output2);
225}
226
227int ip_mc_output(struct sk_buff *skb)
228{
229 struct sock *sk = skb->sk;
230 struct rtable *rt = (struct rtable*)skb->dst;
231 struct net_device *dev = rt->u.dst.dev;
232
233 /*
234 * If the indicated interface is up and running, send the packet.
235 */
236 IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
237
238 skb->dev = dev;
239 skb->protocol = htons(ETH_P_IP);
240
241 /*
242 * Multicasts are looped back for other local users
243 */
244
245 if (rt->rt_flags&RTCF_MULTICAST) {
246 if ((!sk || inet_sk(sk)->mc_loop)
247#ifdef CONFIG_IP_MROUTE
 248		    /* Small optimization: do not loop back non-local frames
 249		       that were returned after forwarding; they will be dropped
 250		       by ip_mr_input in any case.
 251		       Note that local frames are looped back to be delivered
 252		       to local recipients.
 253
 254		       This check is duplicated in ip_mr_input at the moment.
 255		     */
256 && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
257#endif
258 ) {
259 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
260 if (newskb)
261 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
262 newskb->dev,
263 ip_dev_loopback_xmit);
264 }
265
266 /* Multicasts with ttl 0 must not go beyond the host */
267
268 if (skb->nh.iph->ttl == 0) {
269 kfree_skb(skb);
270 return 0;
271 }
272 }
273
274 if (rt->rt_flags&RTCF_BROADCAST) {
275 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
276 if (newskb)
277 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
278 newskb->dev, ip_dev_loopback_xmit);
279 }
280
281 if (skb->len > dst_mtu(&rt->u.dst))
282 return ip_fragment(skb, ip_finish_output);
283 else
284 return ip_finish_output(skb);
285}
286
287int ip_output(struct sk_buff *skb)
288{
289 IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
290
291 if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
292 return ip_fragment(skb, ip_finish_output);
293 else
294 return ip_finish_output(skb);
295}
296
297int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
298{
299 struct sock *sk = skb->sk;
300 struct inet_sock *inet = inet_sk(sk);
301 struct ip_options *opt = inet->opt;
302 struct rtable *rt;
303 struct iphdr *iph;
304
305 /* Skip all of this if the packet is already routed,
 306	 * e.g. by something like SCTP.
307 */
308 rt = (struct rtable *) skb->dst;
309 if (rt != NULL)
310 goto packet_routed;
311
312 /* Make sure we can route this packet. */
313 rt = (struct rtable *)__sk_dst_check(sk, 0);
314 if (rt == NULL) {
315 u32 daddr;
316
317 /* Use correct destination address if we have options. */
318 daddr = inet->daddr;
319 if(opt && opt->srr)
320 daddr = opt->faddr;
321
322 {
323 struct flowi fl = { .oif = sk->sk_bound_dev_if,
324 .nl_u = { .ip4_u =
325 { .daddr = daddr,
326 .saddr = inet->saddr,
327 .tos = RT_CONN_FLAGS(sk) } },
328 .proto = sk->sk_protocol,
329 .uli_u = { .ports =
330 { .sport = inet->sport,
331 .dport = inet->dport } } };
332
 333			/* If this fails, the retransmit mechanism of the transport layer will
 334			 * keep trying until a route appears or the connection times
335 * itself out.
336 */
337 if (ip_route_output_flow(&rt, &fl, sk, 0))
338 goto no_route;
339 }
340 __sk_dst_set(sk, &rt->u.dst);
341 tcp_v4_setup_caps(sk, &rt->u.dst);
342 }
343 skb->dst = dst_clone(&rt->u.dst);
344
345packet_routed:
346 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
347 goto no_route;
348
349 /* OK, we know where to send it, allocate and build IP header. */
350 iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
351 *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
352 iph->tot_len = htons(skb->len);
353 if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
354 iph->frag_off = htons(IP_DF);
355 else
356 iph->frag_off = 0;
357 iph->ttl = ip_select_ttl(inet, &rt->u.dst);
358 iph->protocol = sk->sk_protocol;
359 iph->saddr = rt->rt_src;
360 iph->daddr = rt->rt_dst;
361 skb->nh.iph = iph;
 362	/* The transport layer sets skb->h.foo itself. */
363
364 if (opt && opt->optlen) {
365 iph->ihl += opt->optlen >> 2;
366 ip_options_build(skb, opt, inet->daddr, rt, 0);
367 }
368
369 ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
370
371 /* Add an IP checksum. */
372 ip_send_check(iph);
373
374 skb->priority = sk->sk_priority;
375
376 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
377 dst_output);
378
379no_route:
380 IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
381 kfree_skb(skb);
382 return -EHOSTUNREACH;
383}
384
385
386static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
387{
388 to->pkt_type = from->pkt_type;
389 to->priority = from->priority;
390 to->protocol = from->protocol;
391 to->security = from->security;
392 dst_release(to->dst);
393 to->dst = dst_clone(from->dst);
394 to->dev = from->dev;
395
396 /* Copy the flags to each fragment. */
397 IPCB(to)->flags = IPCB(from)->flags;
398
399#ifdef CONFIG_NET_SCHED
400 to->tc_index = from->tc_index;
401#endif
402#ifdef CONFIG_NETFILTER
403 to->nfmark = from->nfmark;
404 to->nfcache = from->nfcache;
405 /* Connection association is same as pre-frag packet */
406 nf_conntrack_put(to->nfct);
407 to->nfct = from->nfct;
408 nf_conntrack_get(to->nfct);
409 to->nfctinfo = from->nfctinfo;
410#ifdef CONFIG_BRIDGE_NETFILTER
411 nf_bridge_put(to->nf_bridge);
412 to->nf_bridge = from->nf_bridge;
413 nf_bridge_get(to->nf_bridge);
414#endif
415#ifdef CONFIG_NETFILTER_DEBUG
416 to->nf_debug = from->nf_debug;
417#endif
418#endif
419}
420
 421/*
 422 *	This IP datagram is too large to be sent in one piece.  Break it up into
 423 *	smaller pieces (each of a size equal to the IP header plus a block of
 424 *	the data of the original IP datagram) that will still fit in a
 425 *	single device frame, and queue such frames for sending.
 426 */
427
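/*
 * A worked example of the arithmetic below (figures are illustrative):
 * with a 1500 byte MTU and a 20 byte header each fragment carries at most
 * 1480 bytes of payload, and non-final fragments are trimmed to a multiple
 * of 8 (1480 already is). A 4000 byte payload therefore goes out as
 * 1480 + 1480 + 1040 bytes with fragment offsets 0, 185 and 370 (the
 * offset field counts 8-byte units) and IP_MF set on all but the last.
 */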
428int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
429{
430 struct iphdr *iph;
431 int raw = 0;
432 int ptr;
433 struct net_device *dev;
434 struct sk_buff *skb2;
435 unsigned int mtu, hlen, left, len, ll_rs;
436 int offset;
437 int not_last_frag;
438 struct rtable *rt = (struct rtable*)skb->dst;
439 int err = 0;
440
441 dev = rt->u.dst.dev;
442
443 /*
444 * Point into the IP datagram header.
445 */
446
447 iph = skb->nh.iph;
448
449 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
450 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
451 htonl(dst_mtu(&rt->u.dst)));
452 kfree_skb(skb);
453 return -EMSGSIZE;
454 }
455
456 /*
457 * Setup starting values.
458 */
459
460 hlen = iph->ihl * 4;
461 mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
462
 463	/* When a frag_list is given, use it. First, check its validity:
 464	 * some transformers could create a wrong frag_list or break an existing
 465	 * one; that is not prohibited. In this case fall back to copying.
 466	 *
 467	 * LATER: this step can be merged into the real generation of fragments;
 468	 * we can switch to copying when we see the first bad fragment.
 469	 */
470 if (skb_shinfo(skb)->frag_list) {
471 struct sk_buff *frag;
472 int first_len = skb_pagelen(skb);
473
474 if (first_len - hlen > mtu ||
475 ((first_len - hlen) & 7) ||
476 (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
477 skb_cloned(skb))
478 goto slow_path;
479
480 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
481 /* Correct geometry. */
482 if (frag->len > mtu ||
483 ((frag->len & 7) && frag->next) ||
484 skb_headroom(frag) < hlen)
485 goto slow_path;
486
487 /* Partially cloned skb? */
488 if (skb_shared(frag))
489 goto slow_path;
490 }
491
492 /* Everything is OK. Generate! */
493
494 err = 0;
495 offset = 0;
496 frag = skb_shinfo(skb)->frag_list;
497 skb_shinfo(skb)->frag_list = NULL;
498 skb->data_len = first_len - skb_headlen(skb);
499 skb->len = first_len;
500 iph->tot_len = htons(first_len);
501 iph->frag_off = htons(IP_MF);
502 ip_send_check(iph);
503
504 for (;;) {
 505			/* Prepare the header of the next frame
 506			 * before the previous one goes down. */
507 if (frag) {
508 frag->ip_summed = CHECKSUM_NONE;
509 frag->h.raw = frag->data;
510 frag->nh.raw = __skb_push(frag, hlen);
511 memcpy(frag->nh.raw, iph, hlen);
512 iph = frag->nh.iph;
513 iph->tot_len = htons(frag->len);
514 ip_copy_metadata(frag, skb);
515 if (offset == 0)
516 ip_options_fragment(frag);
517 offset += skb->len - hlen;
518 iph->frag_off = htons(offset>>3);
519 if (frag->next != NULL)
520 iph->frag_off |= htons(IP_MF);
521 /* Ready, complete checksum */
522 ip_send_check(iph);
523 }
524
525 err = output(skb);
526
527 if (err || !frag)
528 break;
529
530 skb = frag;
531 frag = skb->next;
532 skb->next = NULL;
533 }
534
535 if (err == 0) {
536 IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
537 return 0;
538 }
539
540 while (frag) {
541 skb = frag->next;
542 kfree_skb(frag);
543 frag = skb;
544 }
545 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
546 return err;
547 }
548
549slow_path:
550 left = skb->len - hlen; /* Space per frame */
551 ptr = raw + hlen; /* Where to start from */
552
553#ifdef CONFIG_BRIDGE_NETFILTER
 554	/* for bridged IP traffic encapsulated inside e.g. a VLAN header,
 555	 * we need to make room for the encapsulating header */
556 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
557 mtu -= nf_bridge_pad(skb);
558#else
559 ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
560#endif
561 /*
562 * Fragment the datagram.
563 */
564
565 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
566 not_last_frag = iph->frag_off & htons(IP_MF);
567
568 /*
569 * Keep copying data until we run out.
570 */
571
572 while(left > 0) {
573 len = left;
574 /* IF: it doesn't fit, use 'mtu' - the data space left */
575 if (len > mtu)
576 len = mtu;
 577		/* IF: we are not sending up to and including the packet end
 578		   then align the next start on an eight byte boundary */
579 if (len < left) {
580 len &= ~7;
581 }
582 /*
583 * Allocate buffer.
584 */
585
586 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
587 NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
588 err = -ENOMEM;
589 goto fail;
590 }
591
592 /*
593 * Set up data on packet
594 */
595
596 ip_copy_metadata(skb2, skb);
597 skb_reserve(skb2, ll_rs);
598 skb_put(skb2, len + hlen);
599 skb2->nh.raw = skb2->data;
600 skb2->h.raw = skb2->data + hlen;
601
602 /*
603 * Charge the memory for the fragment to any owner
604 * it might possess
605 */
606
607 if (skb->sk)
608 skb_set_owner_w(skb2, skb->sk);
609
610 /*
611 * Copy the packet header into the new buffer.
612 */
613
614 memcpy(skb2->nh.raw, skb->data, hlen);
615
616 /*
617 * Copy a block of the IP datagram.
618 */
619 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
620 BUG();
621 left -= len;
622
623 /*
624 * Fill in the new header fields.
625 */
626 iph = skb2->nh.iph;
627 iph->frag_off = htons((offset >> 3));
628
 629		/* ANK: dirty, but effective trick. Upgrade the options only if
 630		 * the segment to be fragmented was THE FIRST (otherwise the
 631		 * options are already fixed) and do it ONCE
 632		 * on the initial skb, so that all the following fragments
 633		 * will inherit the fixed options.
 634		 */
635 if (offset == 0)
636 ip_options_fragment(skb);
637
638 /*
639 * Added AC : If we are fragmenting a fragment that's not the
 640		 *	last fragment then keep the MF bit set on each fragment
641 */
642 if (left > 0 || not_last_frag)
643 iph->frag_off |= htons(IP_MF);
644 ptr += len;
645 offset += len;
646
647 /*
648 * Put this fragment into the sending queue.
649 */
650
651 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
652
653 iph->tot_len = htons(len + hlen);
654
655 ip_send_check(iph);
656
657 err = output(skb2);
658 if (err)
659 goto fail;
660 }
661 kfree_skb(skb);
662 IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
663 return err;
664
665fail:
666 kfree_skb(skb);
667 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
668 return err;
669}
670
671int
672ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
673{
674 struct iovec *iov = from;
675
676 if (skb->ip_summed == CHECKSUM_HW) {
677 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
678 return -EFAULT;
679 } else {
680 unsigned int csum = 0;
681 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
682 return -EFAULT;
683 skb->csum = csum_block_add(skb->csum, csum, odd);
684 }
685 return 0;
686}
687
688static inline unsigned int
689csum_page(struct page *page, int offset, int copy)
690{
691 char *kaddr;
692 unsigned int csum;
693 kaddr = kmap(page);
694 csum = csum_partial(kaddr + offset, copy, 0);
695 kunmap(page);
696 return csum;
697}
698
 699/*
 700 *	ip_append_data() and ip_append_page() can make one large IP datagram
 701 *	from many pieces of data. Each piece will be held on the socket
 702 *	until ip_push_pending_frames() is called. Each piece can be a page
 703 *	or non-page data.
 704 *
 705 *	Transport protocols other than UDP - e.g. raw sockets - can
 706 *	potentially use this interface as well.
 707 *
 708 *	LATER: length must be adjusted by the pad at the tail, when required.
 709 */
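/*
 * A rough sketch of how a datagram sender can drive this interface (the
 * sequence is illustrative and not lifted from any particular protocol;
 * msg, ipc and rt stand for the caller's msghdr, ipcm_cookie and route):
 *
 *	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len,
 *			     sizeof(struct udphdr), &ipc, rt,
 *			     msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk);
 */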
710int ip_append_data(struct sock *sk,
711 int getfrag(void *from, char *to, int offset, int len,
712 int odd, struct sk_buff *skb),
713 void *from, int length, int transhdrlen,
714 struct ipcm_cookie *ipc, struct rtable *rt,
715 unsigned int flags)
716{
717 struct inet_sock *inet = inet_sk(sk);
718 struct sk_buff *skb;
719
720 struct ip_options *opt = NULL;
721 int hh_len;
722 int exthdrlen;
723 int mtu;
724 int copy;
725 int err;
726 int offset = 0;
727 unsigned int maxfraglen, fragheaderlen;
728 int csummode = CHECKSUM_NONE;
729
730 if (flags&MSG_PROBE)
731 return 0;
732
733 if (skb_queue_empty(&sk->sk_write_queue)) {
734 /*
735 * setup for corking.
736 */
737 opt = ipc->opt;
738 if (opt) {
739 if (inet->cork.opt == NULL) {
740 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
741 if (unlikely(inet->cork.opt == NULL))
742 return -ENOBUFS;
743 }
744 memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
745 inet->cork.flags |= IPCORK_OPT;
746 inet->cork.addr = ipc->addr;
747 }
748 dst_hold(&rt->u.dst);
749 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
750 inet->cork.rt = rt;
751 inet->cork.length = 0;
752 sk->sk_sndmsg_page = NULL;
753 sk->sk_sndmsg_off = 0;
754 if ((exthdrlen = rt->u.dst.header_len) != 0) {
755 length += exthdrlen;
756 transhdrlen += exthdrlen;
757 }
758 } else {
759 rt = inet->cork.rt;
760 if (inet->cork.flags & IPCORK_OPT)
761 opt = inet->cork.opt;
762
763 transhdrlen = 0;
764 exthdrlen = 0;
765 mtu = inet->cork.fragsize;
766 }
767 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
768
769 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
770 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
771
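	/*
	 * Numeric illustration (example figures only): with mtu = 1500 and
	 * no IP options, fragheaderlen = 20 and maxfraglen = 1500; with
	 * 12 bytes of options fragheaderlen = 32 and maxfraglen =
	 * ((1500 - 32) & ~7) + 32 = 1496, so every fragment's payload stays
	 * a multiple of 8 bytes.
	 */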
772 if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
773 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
774 return -EMSGSIZE;
775 }
776
777 /*
 778	 * transhdrlen > 0 means that this is the first fragment and we wish
 779	 * it not to be fragmented in the future.
780 */
781 if (transhdrlen &&
782 length + fragheaderlen <= mtu &&
783 rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
784 !exthdrlen)
785 csummode = CHECKSUM_HW;
786
787 inet->cork.length += length;
788
789 /* So, what's going on in the loop below?
790 *
 791	 * We use the calculated fragment length to generate a chained skb;
 792	 * each of its segments is an IP fragment ready for sending to the
 793	 * network after adding the appropriate IP header.
794 */
795
796 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
797 goto alloc_new_skb;
798
799 while (length > 0) {
800 /* Check if the remaining data fits into current packet. */
801 copy = mtu - skb->len;
802 if (copy < length)
803 copy = maxfraglen - skb->len;
804 if (copy <= 0) {
805 char *data;
806 unsigned int datalen;
807 unsigned int fraglen;
808 unsigned int fraggap;
809 unsigned int alloclen;
810 struct sk_buff *skb_prev;
811alloc_new_skb:
812 skb_prev = skb;
813 if (skb_prev)
814 fraggap = skb_prev->len - maxfraglen;
815 else
816 fraggap = 0;
817
818 /*
819 * If remaining data exceeds the mtu,
820 * we know we need more fragment(s).
821 */
822 datalen = length + fraggap;
823 if (datalen > mtu - fragheaderlen)
824 datalen = maxfraglen - fragheaderlen;
825 fraglen = datalen + fragheaderlen;
826
827 if ((flags & MSG_MORE) &&
828 !(rt->u.dst.dev->features&NETIF_F_SG))
829 alloclen = mtu;
830 else
831 alloclen = datalen + fragheaderlen;
832
 833			/* The last fragment gets additional space at the tail.
 834			 * Note that with MSG_MORE we overallocate on fragments,
 835			 * because we have no idea which fragment will be
 836			 * the last.
837 */
838 if (datalen == length)
839 alloclen += rt->u.dst.trailer_len;
840
841 if (transhdrlen) {
842 skb = sock_alloc_send_skb(sk,
843 alloclen + hh_len + 15,
844 (flags & MSG_DONTWAIT), &err);
845 } else {
846 skb = NULL;
847 if (atomic_read(&sk->sk_wmem_alloc) <=
848 2 * sk->sk_sndbuf)
849 skb = sock_wmalloc(sk,
850 alloclen + hh_len + 15, 1,
851 sk->sk_allocation);
852 if (unlikely(skb == NULL))
853 err = -ENOBUFS;
854 }
855 if (skb == NULL)
856 goto error;
857
858 /*
859 * Fill in the control structures
860 */
861 skb->ip_summed = csummode;
862 skb->csum = 0;
863 skb_reserve(skb, hh_len);
864
865 /*
866 * Find where to start putting bytes.
867 */
868 data = skb_put(skb, fraglen);
869 skb->nh.raw = data + exthdrlen;
870 data += fragheaderlen;
871 skb->h.raw = data + exthdrlen;
872
873 if (fraggap) {
874 skb->csum = skb_copy_and_csum_bits(
875 skb_prev, maxfraglen,
876 data + transhdrlen, fraggap, 0);
877 skb_prev->csum = csum_sub(skb_prev->csum,
878 skb->csum);
879 data += fraggap;
880 skb_trim(skb_prev, maxfraglen);
881 }
882
883 copy = datalen - transhdrlen - fraggap;
884 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
885 err = -EFAULT;
886 kfree_skb(skb);
887 goto error;
888 }
889
890 offset += copy;
891 length -= datalen - fraggap;
892 transhdrlen = 0;
893 exthdrlen = 0;
894 csummode = CHECKSUM_NONE;
895
896 /*
897 * Put the packet on the pending queue.
898 */
899 __skb_queue_tail(&sk->sk_write_queue, skb);
900 continue;
901 }
902
903 if (copy > length)
904 copy = length;
905
906 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
907 unsigned int off;
908
909 off = skb->len;
910 if (getfrag(from, skb_put(skb, copy),
911 offset, copy, off, skb) < 0) {
912 __skb_trim(skb, off);
913 err = -EFAULT;
914 goto error;
915 }
916 } else {
917 int i = skb_shinfo(skb)->nr_frags;
918 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
919 struct page *page = sk->sk_sndmsg_page;
920 int off = sk->sk_sndmsg_off;
921 unsigned int left;
922
923 if (page && (left = PAGE_SIZE - off) > 0) {
924 if (copy >= left)
925 copy = left;
926 if (page != frag->page) {
927 if (i == MAX_SKB_FRAGS) {
928 err = -EMSGSIZE;
929 goto error;
930 }
931 get_page(page);
932 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
933 frag = &skb_shinfo(skb)->frags[i];
934 }
935 } else if (i < MAX_SKB_FRAGS) {
936 if (copy > PAGE_SIZE)
937 copy = PAGE_SIZE;
938 page = alloc_pages(sk->sk_allocation, 0);
939 if (page == NULL) {
940 err = -ENOMEM;
941 goto error;
942 }
943 sk->sk_sndmsg_page = page;
944 sk->sk_sndmsg_off = 0;
945
946 skb_fill_page_desc(skb, i, page, 0, 0);
947 frag = &skb_shinfo(skb)->frags[i];
948 skb->truesize += PAGE_SIZE;
949 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
950 } else {
951 err = -EMSGSIZE;
952 goto error;
953 }
954 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
955 err = -EFAULT;
956 goto error;
957 }
958 sk->sk_sndmsg_off += copy;
959 frag->size += copy;
960 skb->len += copy;
961 skb->data_len += copy;
962 }
963 offset += copy;
964 length -= copy;
965 }
966
967 return 0;
968
969error:
970 inet->cork.length -= length;
971 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
972 return err;
973}
974
975ssize_t ip_append_page(struct sock *sk, struct page *page,
976 int offset, size_t size, int flags)
977{
978 struct inet_sock *inet = inet_sk(sk);
979 struct sk_buff *skb;
980 struct rtable *rt;
981 struct ip_options *opt = NULL;
982 int hh_len;
983 int mtu;
984 int len;
985 int err;
986 unsigned int maxfraglen, fragheaderlen, fraggap;
987
988 if (inet->hdrincl)
989 return -EPERM;
990
991 if (flags&MSG_PROBE)
992 return 0;
993
994 if (skb_queue_empty(&sk->sk_write_queue))
995 return -EINVAL;
996
997 rt = inet->cork.rt;
998 if (inet->cork.flags & IPCORK_OPT)
999 opt = inet->cork.opt;
1000
1001 if (!(rt->u.dst.dev->features&NETIF_F_SG))
1002 return -EOPNOTSUPP;
1003
1004 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1005 mtu = inet->cork.fragsize;
1006
1007 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1008 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1009
1010 if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1011 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1012 return -EMSGSIZE;
1013 }
1014
1015 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1016 return -EINVAL;
1017
1018 inet->cork.length += size;
1019
1020 while (size > 0) {
1021 int i;
1022
1023 /* Check if the remaining data fits into current packet. */
1024 len = mtu - skb->len;
1025 if (len < size)
1026 len = maxfraglen - skb->len;
1027 if (len <= 0) {
1028 struct sk_buff *skb_prev;
1029 char *data;
1030 struct iphdr *iph;
1031 int alloclen;
1032
1033 skb_prev = skb;
1034 if (skb_prev)
1035 fraggap = skb_prev->len - maxfraglen;
1036 else
1037 fraggap = 0;
1038
1039 alloclen = fragheaderlen + hh_len + fraggap + 15;
1040 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1041 if (unlikely(!skb)) {
1042 err = -ENOBUFS;
1043 goto error;
1044 }
1045
1046 /*
1047 * Fill in the control structures
1048 */
1049 skb->ip_summed = CHECKSUM_NONE;
1050 skb->csum = 0;
1051 skb_reserve(skb, hh_len);
1052
1053 /*
1054 * Find where to start putting bytes.
1055 */
1056 data = skb_put(skb, fragheaderlen + fraggap);
1057 skb->nh.iph = iph = (struct iphdr *)data;
1058 data += fragheaderlen;
1059 skb->h.raw = data;
1060
1061 if (fraggap) {
1062 skb->csum = skb_copy_and_csum_bits(
1063 skb_prev, maxfraglen,
1064 data, fraggap, 0);
1065 skb_prev->csum = csum_sub(skb_prev->csum,
1066 skb->csum);
1067 skb_trim(skb_prev, maxfraglen);
1068 }
1069
1070 /*
1071 * Put the packet on the pending queue.
1072 */
1073 __skb_queue_tail(&sk->sk_write_queue, skb);
1074 continue;
1075 }
1076
1077 i = skb_shinfo(skb)->nr_frags;
1078 if (len > size)
1079 len = size;
1080 if (skb_can_coalesce(skb, i, page, offset)) {
1081 skb_shinfo(skb)->frags[i-1].size += len;
1082 } else if (i < MAX_SKB_FRAGS) {
1083 get_page(page);
1084 skb_fill_page_desc(skb, i, page, offset, len);
1085 } else {
1086 err = -EMSGSIZE;
1087 goto error;
1088 }
1089
1090 if (skb->ip_summed == CHECKSUM_NONE) {
1091 unsigned int csum;
1092 csum = csum_page(page, offset, len);
1093 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1094 }
1095
1096 skb->len += len;
1097 skb->data_len += len;
1098 offset += len;
1099 size -= len;
1100 }
1101 return 0;
1102
1103error:
1104 inet->cork.length -= size;
1105 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1106 return err;
1107}
1108
1109/*
 1110 *	Combine all pending IP fragments on the socket into one IP datagram
1111 * and push them out.
1112 */
1113int ip_push_pending_frames(struct sock *sk)
1114{
1115 struct sk_buff *skb, *tmp_skb;
1116 struct sk_buff **tail_skb;
1117 struct inet_sock *inet = inet_sk(sk);
1118 struct ip_options *opt = NULL;
1119 struct rtable *rt = inet->cork.rt;
1120 struct iphdr *iph;
1121 int df = 0;
1122 __u8 ttl;
1123 int err = 0;
1124
1125 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1126 goto out;
1127 tail_skb = &(skb_shinfo(skb)->frag_list);
1128
1129 /* move skb->data to ip header from ext header */
1130 if (skb->data < skb->nh.raw)
1131 __skb_pull(skb, skb->nh.raw - skb->data);
1132 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1133 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1134 *tail_skb = tmp_skb;
1135 tail_skb = &(tmp_skb->next);
1136 skb->len += tmp_skb->len;
1137 skb->data_len += tmp_skb->len;
1138 skb->truesize += tmp_skb->truesize;
1139 __sock_put(tmp_skb->sk);
1140 tmp_skb->destructor = NULL;
1141 tmp_skb->sk = NULL;
1142 }
1143
 1144	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
 1145	 * allow the frame generated here to be fragmented. No matter how
 1146	 * transforms change the size of the packet, it will come out.
1147 */
1148 if (inet->pmtudisc != IP_PMTUDISC_DO)
1149 skb->local_df = 1;
1150
 1151	/* The DF bit is set when we want to see DF on outgoing frames.
 1152	 * If local_df is set too, we still allow this frame to be fragmented
 1153	 * locally. */
1154 if (inet->pmtudisc == IP_PMTUDISC_DO ||
1155 (skb->len <= dst_mtu(&rt->u.dst) &&
1156 ip_dont_fragment(sk, &rt->u.dst)))
1157 df = htons(IP_DF);
1158
1159 if (inet->cork.flags & IPCORK_OPT)
1160 opt = inet->cork.opt;
1161
1162 if (rt->rt_type == RTN_MULTICAST)
1163 ttl = inet->mc_ttl;
1164 else
1165 ttl = ip_select_ttl(inet, &rt->u.dst);
1166
1167 iph = (struct iphdr *)skb->data;
1168 iph->version = 4;
1169 iph->ihl = 5;
1170 if (opt) {
1171 iph->ihl += opt->optlen>>2;
1172 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1173 }
1174 iph->tos = inet->tos;
1175 iph->tot_len = htons(skb->len);
1176 iph->frag_off = df;
1177 if (!df) {
1178 __ip_select_ident(iph, &rt->u.dst, 0);
1179 } else {
1180 iph->id = htons(inet->id++);
1181 }
1182 iph->ttl = ttl;
1183 iph->protocol = sk->sk_protocol;
1184 iph->saddr = rt->rt_src;
1185 iph->daddr = rt->rt_dst;
1186 ip_send_check(iph);
1187
1188 skb->priority = sk->sk_priority;
1189 skb->dst = dst_clone(&rt->u.dst);
1190
 1191	/* Netfilter gets the whole, not yet fragmented skb. */
1192 err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1193 skb->dst->dev, dst_output);
1194 if (err) {
1195 if (err > 0)
1196 err = inet->recverr ? net_xmit_errno(err) : 0;
1197 if (err)
1198 goto error;
1199 }
1200
1201out:
1202 inet->cork.flags &= ~IPCORK_OPT;
1203 if (inet->cork.opt) {
1204 kfree(inet->cork.opt);
1205 inet->cork.opt = NULL;
1206 }
1207 if (inet->cork.rt) {
1208 ip_rt_put(inet->cork.rt);
1209 inet->cork.rt = NULL;
1210 }
1211 return err;
1212
1213error:
1214 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1215 goto out;
1216}
1217
1218/*
1219 * Throw away all pending data on the socket.
1220 */
1221void ip_flush_pending_frames(struct sock *sk)
1222{
1223 struct inet_sock *inet = inet_sk(sk);
1224 struct sk_buff *skb;
1225
1226 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1227 kfree_skb(skb);
1228
1229 inet->cork.flags &= ~IPCORK_OPT;
1230 if (inet->cork.opt) {
1231 kfree(inet->cork.opt);
1232 inet->cork.opt = NULL;
1233 }
1234 if (inet->cork.rt) {
1235 ip_rt_put(inet->cork.rt);
1236 inet->cork.rt = NULL;
1237 }
1238}
1239
1240
1241/*
1242 * Fetch data from kernel space and fill in checksum if needed.
1243 */
1244static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1245 int len, int odd, struct sk_buff *skb)
1246{
1247 unsigned int csum;
1248
1249 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1250 skb->csum = csum_block_add(skb->csum, csum, odd);
1251 return 0;
1252}
1253
1254/*
 1255 *	Generic function to send a packet as a reply to another packet.
 1256 *	Used to send TCP resets so far. ICMP should use this function too.
 1257 *
 1258 *	Should run single-threaded per socket because it uses the sock
1259 * structure to pass arguments.
1260 *
1261 * LATER: switch from ip_build_xmit to ip_append_*
1262 */
1263void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1264 unsigned int len)
1265{
1266 struct inet_sock *inet = inet_sk(sk);
1267 struct {
1268 struct ip_options opt;
1269 char data[40];
1270 } replyopts;
1271 struct ipcm_cookie ipc;
1272 u32 daddr;
1273 struct rtable *rt = (struct rtable*)skb->dst;
1274
1275 if (ip_options_echo(&replyopts.opt, skb))
1276 return;
1277
1278 daddr = ipc.addr = rt->rt_src;
1279 ipc.opt = NULL;
1280
1281 if (replyopts.opt.optlen) {
1282 ipc.opt = &replyopts.opt;
1283
1284 if (ipc.opt->srr)
1285 daddr = replyopts.opt.faddr;
1286 }
1287
1288 {
1289 struct flowi fl = { .nl_u = { .ip4_u =
1290 { .daddr = daddr,
1291 .saddr = rt->rt_spec_dst,
1292 .tos = RT_TOS(skb->nh.iph->tos) } },
1293 /* Not quite clean, but right. */
1294 .uli_u = { .ports =
1295 { .sport = skb->h.th->dest,
1296 .dport = skb->h.th->source } },
1297 .proto = sk->sk_protocol };
1298 if (ip_route_output_key(&rt, &fl))
1299 return;
1300 }
1301
1302 /* And let IP do all the hard work.
1303
 1304	   This chunk is not reentrant, hence the spinlock.
 1305	   Note that it relies on the fact that this function is called
 1306	   with BH locally disabled and that sk cannot already be spinlocked.
1307 */
1308 bh_lock_sock(sk);
1309 inet->tos = skb->nh.iph->tos;
1310 sk->sk_priority = skb->priority;
1311 sk->sk_protocol = skb->nh.iph->protocol;
1312 ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1313 &ipc, rt, MSG_DONTWAIT);
1314 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1315 if (arg->csumoffset >= 0)
1316 *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1317 skb->ip_summed = CHECKSUM_NONE;
1318 ip_push_pending_frames(sk);
1319 }
1320
1321 bh_unlock_sock(sk);
1322
1323 ip_rt_put(rt);
1324}
1325
1326/*
1327 * IP protocol layer initialiser
1328 */
1329
1330static struct packet_type ip_packet_type = {
1331 .type = __constant_htons(ETH_P_IP),
1332 .func = ip_rcv,
1333};
1334
1335/*
1336 * IP registers the packet type and then calls the subprotocol initialisers
1337 */
1338
1339void __init ip_init(void)
1340{
1341 dev_add_pack(&ip_packet_type);
1342
1343 ip_rt_init();
1344 inet_initpeers();
1345
1346#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1347 igmp_mc_proc_init();
1348#endif
1349}
1350
1351EXPORT_SYMBOL(ip_finish_output);
1352EXPORT_SYMBOL(ip_fragment);
1353EXPORT_SYMBOL(ip_generic_getfrag);
1354EXPORT_SYMBOL(ip_queue_xmit);
1355EXPORT_SYMBOL(ip_send_check);
1356
1357#ifdef CONFIG_SYSCTL
1358EXPORT_SYMBOL(sysctl_ip_default_ttl);
1359#endif
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
new file mode 100644
index 000000000000..47012b93cad2
--- /dev/null
+++ b/net/ipv4/ip_sockglue.c
@@ -0,0 +1,1093 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The IP to API glue.
7 *
8 * Version: $Id: ip_sockglue.c,v 1.62 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors: see ip.c
11 *
12 * Fixes:
13 * Many : Split from ip.c , see ip.c for history.
14 * Martin Mares : TOS setting fixed.
15 * Alan Cox : Fixed a couple of oopses in Martin's
16 * TOS tweaks.
17 * Mike McLagan : Routing by source
18 */
19
20#include <linux/config.h>
21#include <linux/module.h>
22#include <linux/types.h>
23#include <linux/mm.h>
24#include <linux/sched.h>
25#include <linux/skbuff.h>
26#include <linux/ip.h>
27#include <linux/icmp.h>
28#include <linux/netdevice.h>
29#include <net/sock.h>
30#include <net/ip.h>
31#include <net/icmp.h>
32#include <net/tcp.h>
33#include <linux/tcp.h>
34#include <linux/udp.h>
35#include <linux/igmp.h>
36#include <linux/netfilter.h>
37#include <linux/route.h>
38#include <linux/mroute.h>
39#include <net/route.h>
40#include <net/xfrm.h>
41#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
42#include <net/transp_v6.h>
43#endif
44
45#include <linux/errqueue.h>
46#include <asm/uaccess.h>
47
48#define IP_CMSG_PKTINFO 1
49#define IP_CMSG_TTL 2
50#define IP_CMSG_TOS 4
51#define IP_CMSG_RECVOPTS 8
52#define IP_CMSG_RETOPTS 16
53
54/*
55 * SOL_IP control messages.
56 */
57
58static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
59{
60 struct in_pktinfo info;
61 struct rtable *rt = (struct rtable *)skb->dst;
62
63 info.ipi_addr.s_addr = skb->nh.iph->daddr;
64 if (rt) {
65 info.ipi_ifindex = rt->rt_iif;
66 info.ipi_spec_dst.s_addr = rt->rt_spec_dst;
67 } else {
68 info.ipi_ifindex = 0;
69 info.ipi_spec_dst.s_addr = 0;
70 }
71
72 put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
73}
74
75static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb)
76{
77 int ttl = skb->nh.iph->ttl;
78 put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl);
79}
80
81static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb)
82{
83 put_cmsg(msg, SOL_IP, IP_TOS, 1, &skb->nh.iph->tos);
84}
85
86static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
87{
88 if (IPCB(skb)->opt.optlen == 0)
89 return;
90
91 put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, skb->nh.iph+1);
92}
93
94
95static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
96{
97 unsigned char optbuf[sizeof(struct ip_options) + 40];
98 struct ip_options * opt = (struct ip_options*)optbuf;
99
100 if (IPCB(skb)->opt.optlen == 0)
101 return;
102
103 if (ip_options_echo(opt, skb)) {
104 msg->msg_flags |= MSG_CTRUNC;
105 return;
106 }
107 ip_options_undo(opt);
108
109 put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
110}
111
112
113void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
114{
115 struct inet_sock *inet = inet_sk(skb->sk);
116 unsigned flags = inet->cmsg_flags;
117
118 /* Ordered by supposed usage frequency */
119 if (flags & 1)
120 ip_cmsg_recv_pktinfo(msg, skb);
121 if ((flags>>=1) == 0)
122 return;
123
124 if (flags & 1)
125 ip_cmsg_recv_ttl(msg, skb);
126 if ((flags>>=1) == 0)
127 return;
128
129 if (flags & 1)
130 ip_cmsg_recv_tos(msg, skb);
131 if ((flags>>=1) == 0)
132 return;
133
134 if (flags & 1)
135 ip_cmsg_recv_opts(msg, skb);
136 if ((flags>>=1) == 0)
137 return;
138
139 if (flags & 1)
140 ip_cmsg_recv_retopts(msg, skb);
141}
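/*
 * Userspace view of the dispatch above (a rough sketch, fd is just a
 * placeholder descriptor): after
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_IP, IP_PKTINFO, &on, sizeof(on));
 *
 * every recvmsg() on the socket carries a SOL_IP/IP_PKTINFO control
 * message whose payload is the struct in_pktinfo filled in by
 * ip_cmsg_recv_pktinfo() above.
 */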
142
143int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
144{
145 int err;
146 struct cmsghdr *cmsg;
147
148 for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
149 if (!CMSG_OK(msg, cmsg))
150 return -EINVAL;
151 if (cmsg->cmsg_level != SOL_IP)
152 continue;
153 switch (cmsg->cmsg_type) {
154 case IP_RETOPTS:
155 err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
156 err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0);
157 if (err)
158 return err;
159 break;
160 case IP_PKTINFO:
161 {
162 struct in_pktinfo *info;
163 if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo)))
164 return -EINVAL;
165 info = (struct in_pktinfo *)CMSG_DATA(cmsg);
166 ipc->oif = info->ipi_ifindex;
167 ipc->addr = info->ipi_spec_dst.s_addr;
168 break;
169 }
170 default:
171 return -EINVAL;
172 }
173 }
174 return 0;
175}
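/*
 * The sending-side counterpart (descriptive note, not tied to any
 * particular caller): a sender may attach IP_PKTINFO or IP_RETOPTS
 * control messages to sendmsg(); ip_cmsg_send() above copies the
 * requested interface index, source address hint and options into the
 * ipcm_cookie that the output path (e.g. ip_append_data()) then uses.
 */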
176
177
 178/* Special input handler for packets caught by the router alert option.
 179   They are selected only by the protocol field, and then processed like
 180   local ones; but only if someone wants them! Otherwise, a router
 181   not running rsvpd would kill RSVP.
 182
 183   What user level does with them is its own problem.
 184   I have no idea how it will masquerade or NAT them (it is a joke, joke :-)),
 185   but the receiver should be clever enough, e.g., to forward mtrace requests
 186   sent to a multicast group towards the destination's designated router.
 187 */
188struct ip_ra_chain *ip_ra_chain;
189DEFINE_RWLOCK(ip_ra_lock);
190
191int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *))
192{
193 struct ip_ra_chain *ra, *new_ra, **rap;
194
195 if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num == IPPROTO_RAW)
196 return -EINVAL;
197
198 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
199
200 write_lock_bh(&ip_ra_lock);
201 for (rap = &ip_ra_chain; (ra=*rap) != NULL; rap = &ra->next) {
202 if (ra->sk == sk) {
203 if (on) {
204 write_unlock_bh(&ip_ra_lock);
205 if (new_ra)
206 kfree(new_ra);
207 return -EADDRINUSE;
208 }
209 *rap = ra->next;
210 write_unlock_bh(&ip_ra_lock);
211
212 if (ra->destructor)
213 ra->destructor(sk);
214 sock_put(sk);
215 kfree(ra);
216 return 0;
217 }
218 }
219 if (new_ra == NULL) {
220 write_unlock_bh(&ip_ra_lock);
221 return -ENOBUFS;
222 }
223 new_ra->sk = sk;
224 new_ra->destructor = destructor;
225
226 new_ra->next = ra;
227 *rap = new_ra;
228 sock_hold(sk);
229 write_unlock_bh(&ip_ra_lock);
230
231 return 0;
232}
233
234void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
235 u16 port, u32 info, u8 *payload)
236{
237 struct inet_sock *inet = inet_sk(sk);
238 struct sock_exterr_skb *serr;
239
240 if (!inet->recverr)
241 return;
242
243 skb = skb_clone(skb, GFP_ATOMIC);
244 if (!skb)
245 return;
246
247 serr = SKB_EXT_ERR(skb);
248 serr->ee.ee_errno = err;
249 serr->ee.ee_origin = SO_EE_ORIGIN_ICMP;
250 serr->ee.ee_type = skb->h.icmph->type;
251 serr->ee.ee_code = skb->h.icmph->code;
252 serr->ee.ee_pad = 0;
253 serr->ee.ee_info = info;
254 serr->ee.ee_data = 0;
255 serr->addr_offset = (u8*)&(((struct iphdr*)(skb->h.icmph+1))->daddr) - skb->nh.raw;
256 serr->port = port;
257
258 skb->h.raw = payload;
259 if (!skb_pull(skb, payload - skb->data) ||
260 sock_queue_err_skb(sk, skb))
261 kfree_skb(skb);
262}
263
264void ip_local_error(struct sock *sk, int err, u32 daddr, u16 port, u32 info)
265{
266 struct inet_sock *inet = inet_sk(sk);
267 struct sock_exterr_skb *serr;
268 struct iphdr *iph;
269 struct sk_buff *skb;
270
271 if (!inet->recverr)
272 return;
273
274 skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC);
275 if (!skb)
276 return;
277
278 iph = (struct iphdr*)skb_put(skb, sizeof(struct iphdr));
279 skb->nh.iph = iph;
280 iph->daddr = daddr;
281
282 serr = SKB_EXT_ERR(skb);
283 serr->ee.ee_errno = err;
284 serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
285 serr->ee.ee_type = 0;
286 serr->ee.ee_code = 0;
287 serr->ee.ee_pad = 0;
288 serr->ee.ee_info = info;
289 serr->ee.ee_data = 0;
290 serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw;
291 serr->port = port;
292
293 skb->h.raw = skb->tail;
294 __skb_pull(skb, skb->tail - skb->data);
295
296 if (sock_queue_err_skb(sk, skb))
297 kfree_skb(skb);
298}
299
300/*
301 * Handle MSG_ERRQUEUE
302 */
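/*
 * Userspace sketch of the flow this serves (illustrative only, fd is a
 * placeholder descriptor): after
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_IP, IP_RECVERR, &on, sizeof(on));
 *
 * a queued error is later collected with recvmsg(fd, &msg, MSG_ERRQUEUE),
 * which returns the offending payload together with an IP_RECVERR control
 * message holding the struct sock_extended_err that ip_icmp_error() or
 * ip_local_error() built above.
 */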
303int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
304{
305 struct sock_exterr_skb *serr;
306 struct sk_buff *skb, *skb2;
307 struct sockaddr_in *sin;
308 struct {
309 struct sock_extended_err ee;
310 struct sockaddr_in offender;
311 } errhdr;
312 int err;
313 int copied;
314
315 err = -EAGAIN;
316 skb = skb_dequeue(&sk->sk_error_queue);
317 if (skb == NULL)
318 goto out;
319
320 copied = skb->len;
321 if (copied > len) {
322 msg->msg_flags |= MSG_TRUNC;
323 copied = len;
324 }
325 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
326 if (err)
327 goto out_free_skb;
328
329 sock_recv_timestamp(msg, sk, skb);
330
331 serr = SKB_EXT_ERR(skb);
332
333 sin = (struct sockaddr_in *)msg->msg_name;
334 if (sin) {
335 sin->sin_family = AF_INET;
336 sin->sin_addr.s_addr = *(u32*)(skb->nh.raw + serr->addr_offset);
337 sin->sin_port = serr->port;
338 memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
339 }
340
341 memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
342 sin = &errhdr.offender;
343 sin->sin_family = AF_UNSPEC;
344 if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) {
345 struct inet_sock *inet = inet_sk(sk);
346
347 sin->sin_family = AF_INET;
348 sin->sin_addr.s_addr = skb->nh.iph->saddr;
349 sin->sin_port = 0;
350 memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
351 if (inet->cmsg_flags)
352 ip_cmsg_recv(msg, skb);
353 }
354
355 put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr);
356
357 /* Now we could try to dump offended packet options */
358
359 msg->msg_flags |= MSG_ERRQUEUE;
360 err = copied;
361
362 /* Reset and regenerate socket error */
363 spin_lock_irq(&sk->sk_error_queue.lock);
364 sk->sk_err = 0;
365 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
366 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
367 spin_unlock_irq(&sk->sk_error_queue.lock);
368 sk->sk_error_report(sk);
369 } else
370 spin_unlock_irq(&sk->sk_error_queue.lock);
371
372out_free_skb:
373 kfree_skb(skb);
374out:
375 return err;
376}
377
378
379/*
 380 *	Socket option code for IP. This is the end of the line after any TCP, UDP, etc. options on
381 * an IP socket.
382 */
383
384int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen)
385{
386 struct inet_sock *inet = inet_sk(sk);
387 int val=0,err;
388
389 if (level != SOL_IP)
390 return -ENOPROTOOPT;
391
392 if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) |
393 (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) |
394 (1<<IP_RETOPTS) | (1<<IP_TOS) |
395 (1<<IP_TTL) | (1<<IP_HDRINCL) |
396 (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
397 (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND))) ||
398 optname == IP_MULTICAST_TTL ||
399 optname == IP_MULTICAST_LOOP) {
400 if (optlen >= sizeof(int)) {
401 if (get_user(val, (int __user *) optval))
402 return -EFAULT;
403 } else if (optlen >= sizeof(char)) {
404 unsigned char ucval;
405
406 if (get_user(ucval, (unsigned char __user *) optval))
407 return -EFAULT;
408 val = (int) ucval;
409 }
410 }
411
412 /* If optlen==0, it is equivalent to val == 0 */
413
414#ifdef CONFIG_IP_MROUTE
415 if (optname >= MRT_BASE && optname <= (MRT_BASE + 10))
416 return ip_mroute_setsockopt(sk,optname,optval,optlen);
417#endif
418
419 err = 0;
420 lock_sock(sk);
421
422 switch (optname) {
423 case IP_OPTIONS:
424 {
425 struct ip_options * opt = NULL;
426 if (optlen > 40 || optlen < 0)
427 goto e_inval;
428 err = ip_options_get(&opt, optval, optlen, 1);
429 if (err)
430 break;
431 if (sk->sk_type == SOCK_STREAM) {
432 struct tcp_sock *tp = tcp_sk(sk);
433#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
434 if (sk->sk_family == PF_INET ||
435 (!((1 << sk->sk_state) &
436 (TCPF_LISTEN | TCPF_CLOSE)) &&
437 inet->daddr != LOOPBACK4_IPV6)) {
438#endif
439 if (inet->opt)
440 tp->ext_header_len -= inet->opt->optlen;
441 if (opt)
442 tp->ext_header_len += opt->optlen;
443 tcp_sync_mss(sk, tp->pmtu_cookie);
444#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
445 }
446#endif
447 }
448 opt = xchg(&inet->opt, opt);
449 if (opt)
450 kfree(opt);
451 break;
452 }
453 case IP_PKTINFO:
454 if (val)
455 inet->cmsg_flags |= IP_CMSG_PKTINFO;
456 else
457 inet->cmsg_flags &= ~IP_CMSG_PKTINFO;
458 break;
459 case IP_RECVTTL:
460 if (val)
461 inet->cmsg_flags |= IP_CMSG_TTL;
462 else
463 inet->cmsg_flags &= ~IP_CMSG_TTL;
464 break;
465 case IP_RECVTOS:
466 if (val)
467 inet->cmsg_flags |= IP_CMSG_TOS;
468 else
469 inet->cmsg_flags &= ~IP_CMSG_TOS;
470 break;
471 case IP_RECVOPTS:
472 if (val)
473 inet->cmsg_flags |= IP_CMSG_RECVOPTS;
474 else
475 inet->cmsg_flags &= ~IP_CMSG_RECVOPTS;
476 break;
477 case IP_RETOPTS:
478 if (val)
479 inet->cmsg_flags |= IP_CMSG_RETOPTS;
480 else
481 inet->cmsg_flags &= ~IP_CMSG_RETOPTS;
482 break;
483 case IP_TOS: /* This sets both TOS and Precedence */
484 if (sk->sk_type == SOCK_STREAM) {
485 val &= ~3;
486 val |= inet->tos & 3;
487 }
488 if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP &&
489 !capable(CAP_NET_ADMIN)) {
490 err = -EPERM;
491 break;
492 }
493 if (inet->tos != val) {
494 inet->tos = val;
495 sk->sk_priority = rt_tos2priority(val);
496 sk_dst_reset(sk);
497 }
498 break;
499 case IP_TTL:
500 if (optlen<1)
501 goto e_inval;
502 if (val != -1 && (val < 1 || val>255))
503 goto e_inval;
504 inet->uc_ttl = val;
505 break;
506 case IP_HDRINCL:
507 if (sk->sk_type != SOCK_RAW) {
508 err = -ENOPROTOOPT;
509 break;
510 }
511 inet->hdrincl = val ? 1 : 0;
512 break;
513 case IP_MTU_DISCOVER:
514 if (val<0 || val>2)
515 goto e_inval;
516 inet->pmtudisc = val;
517 break;
518 case IP_RECVERR:
519 inet->recverr = !!val;
520 if (!val)
521 skb_queue_purge(&sk->sk_error_queue);
522 break;
523 case IP_MULTICAST_TTL:
524 if (sk->sk_type == SOCK_STREAM)
525 goto e_inval;
526 if (optlen<1)
527 goto e_inval;
528 if (val==-1)
529 val = 1;
530 if (val < 0 || val > 255)
531 goto e_inval;
532 inet->mc_ttl = val;
533 break;
534 case IP_MULTICAST_LOOP:
535 if (optlen<1)
536 goto e_inval;
537 inet->mc_loop = !!val;
538 break;
539 case IP_MULTICAST_IF:
540 {
541 struct ip_mreqn mreq;
542 struct net_device *dev = NULL;
543
544 if (sk->sk_type == SOCK_STREAM)
545 goto e_inval;
546 /*
547 * Check the arguments are allowable
548 */
549
550 err = -EFAULT;
551 if (optlen >= sizeof(struct ip_mreqn)) {
552 if (copy_from_user(&mreq,optval,sizeof(mreq)))
553 break;
554 } else {
555 memset(&mreq, 0, sizeof(mreq));
556 if (optlen >= sizeof(struct in_addr) &&
557 copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr)))
558 break;
559 }
560
561 if (!mreq.imr_ifindex) {
562 if (mreq.imr_address.s_addr == INADDR_ANY) {
563 inet->mc_index = 0;
564 inet->mc_addr = 0;
565 err = 0;
566 break;
567 }
568 dev = ip_dev_find(mreq.imr_address.s_addr);
569 if (dev) {
570 mreq.imr_ifindex = dev->ifindex;
571 dev_put(dev);
572 }
573 } else
574 dev = __dev_get_by_index(mreq.imr_ifindex);
575
576
577 err = -EADDRNOTAVAIL;
578 if (!dev)
579 break;
580
581 err = -EINVAL;
582 if (sk->sk_bound_dev_if &&
583 mreq.imr_ifindex != sk->sk_bound_dev_if)
584 break;
585
586 inet->mc_index = mreq.imr_ifindex;
587 inet->mc_addr = mreq.imr_address.s_addr;
588 err = 0;
589 break;
590 }
591
592 case IP_ADD_MEMBERSHIP:
593 case IP_DROP_MEMBERSHIP:
594 {
595 struct ip_mreqn mreq;
596
597 if (optlen < sizeof(struct ip_mreq))
598 goto e_inval;
599 err = -EFAULT;
600 if (optlen >= sizeof(struct ip_mreqn)) {
601 if(copy_from_user(&mreq,optval,sizeof(mreq)))
602 break;
603 } else {
604 memset(&mreq, 0, sizeof(mreq));
605 if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq)))
606 break;
607 }
608
609 if (optname == IP_ADD_MEMBERSHIP)
610 err = ip_mc_join_group(sk, &mreq);
611 else
612 err = ip_mc_leave_group(sk, &mreq);
613 break;
614 }
615 case IP_MSFILTER:
616 {
617 extern int sysctl_optmem_max;
618 extern int sysctl_igmp_max_msf;
619 struct ip_msfilter *msf;
620
621 if (optlen < IP_MSFILTER_SIZE(0))
622 goto e_inval;
623 if (optlen > sysctl_optmem_max) {
624 err = -ENOBUFS;
625 break;
626 }
627 msf = (struct ip_msfilter *)kmalloc(optlen, GFP_KERNEL);
628 if (msf == 0) {
629 err = -ENOBUFS;
630 break;
631 }
632 err = -EFAULT;
633 if (copy_from_user(msf, optval, optlen)) {
634 kfree(msf);
635 break;
636 }
637 /* numsrc >= (1G-4) overflow in 32 bits */
638 if (msf->imsf_numsrc >= 0x3ffffffcU ||
639 msf->imsf_numsrc > sysctl_igmp_max_msf) {
640 kfree(msf);
641 err = -ENOBUFS;
642 break;
643 }
644 if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) {
645 kfree(msf);
646 err = -EINVAL;
647 break;
648 }
649 err = ip_mc_msfilter(sk, msf, 0);
650 kfree(msf);
651 break;
652 }
653 case IP_BLOCK_SOURCE:
654 case IP_UNBLOCK_SOURCE:
655 case IP_ADD_SOURCE_MEMBERSHIP:
656 case IP_DROP_SOURCE_MEMBERSHIP:
657 {
658 struct ip_mreq_source mreqs;
659 int omode, add;
660
661 if (optlen != sizeof(struct ip_mreq_source))
662 goto e_inval;
663 if (copy_from_user(&mreqs, optval, sizeof(mreqs))) {
664 err = -EFAULT;
665 break;
666 }
667 if (optname == IP_BLOCK_SOURCE) {
668 omode = MCAST_EXCLUDE;
669 add = 1;
670 } else if (optname == IP_UNBLOCK_SOURCE) {
671 omode = MCAST_EXCLUDE;
672 add = 0;
673 } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) {
674 struct ip_mreqn mreq;
675
676 mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
677 mreq.imr_address.s_addr = mreqs.imr_interface;
678 mreq.imr_ifindex = 0;
679 err = ip_mc_join_group(sk, &mreq);
680 if (err)
681 break;
682 omode = MCAST_INCLUDE;
683 add = 1;
684 } else /*IP_DROP_SOURCE_MEMBERSHIP */ {
685 omode = MCAST_INCLUDE;
686 add = 0;
687 }
688 err = ip_mc_source(add, omode, sk, &mreqs, 0);
689 break;
690 }
691 case MCAST_JOIN_GROUP:
692 case MCAST_LEAVE_GROUP:
693 {
694 struct group_req greq;
695 struct sockaddr_in *psin;
696 struct ip_mreqn mreq;
697
698 if (optlen < sizeof(struct group_req))
699 goto e_inval;
700 err = -EFAULT;
701 if(copy_from_user(&greq, optval, sizeof(greq)))
702 break;
703 psin = (struct sockaddr_in *)&greq.gr_group;
704 if (psin->sin_family != AF_INET)
705 goto e_inval;
706 memset(&mreq, 0, sizeof(mreq));
707 mreq.imr_multiaddr = psin->sin_addr;
708 mreq.imr_ifindex = greq.gr_interface;
709
710 if (optname == MCAST_JOIN_GROUP)
711 err = ip_mc_join_group(sk, &mreq);
712 else
713 err = ip_mc_leave_group(sk, &mreq);
714 break;
715 }
716 case MCAST_JOIN_SOURCE_GROUP:
717 case MCAST_LEAVE_SOURCE_GROUP:
718 case MCAST_BLOCK_SOURCE:
719 case MCAST_UNBLOCK_SOURCE:
720 {
721 struct group_source_req greqs;
722 struct ip_mreq_source mreqs;
723 struct sockaddr_in *psin;
724 int omode, add;
725
726 if (optlen != sizeof(struct group_source_req))
727 goto e_inval;
728 if (copy_from_user(&greqs, optval, sizeof(greqs))) {
729 err = -EFAULT;
730 break;
731 }
732 if (greqs.gsr_group.ss_family != AF_INET ||
733 greqs.gsr_source.ss_family != AF_INET) {
734 err = -EADDRNOTAVAIL;
735 break;
736 }
737 psin = (struct sockaddr_in *)&greqs.gsr_group;
738 mreqs.imr_multiaddr = psin->sin_addr.s_addr;
739 psin = (struct sockaddr_in *)&greqs.gsr_source;
740 mreqs.imr_sourceaddr = psin->sin_addr.s_addr;
741 mreqs.imr_interface = 0; /* use index for mc_source */
742
743 if (optname == MCAST_BLOCK_SOURCE) {
744 omode = MCAST_EXCLUDE;
745 add = 1;
746 } else if (optname == MCAST_UNBLOCK_SOURCE) {
747 omode = MCAST_EXCLUDE;
748 add = 0;
749 } else if (optname == MCAST_JOIN_SOURCE_GROUP) {
750 struct ip_mreqn mreq;
751
752 psin = (struct sockaddr_in *)&greqs.gsr_group;
753 mreq.imr_multiaddr = psin->sin_addr;
754 mreq.imr_address.s_addr = 0;
755 mreq.imr_ifindex = greqs.gsr_interface;
756 err = ip_mc_join_group(sk, &mreq);
757 if (err)
758 break;
759 greqs.gsr_interface = mreq.imr_ifindex;
760 omode = MCAST_INCLUDE;
761 add = 1;
762 } else /* MCAST_LEAVE_SOURCE_GROUP */ {
763 omode = MCAST_INCLUDE;
764 add = 0;
765 }
766 err = ip_mc_source(add, omode, sk, &mreqs,
767 greqs.gsr_interface);
768 break;
769 }
770 case MCAST_MSFILTER:
771 {
772 extern int sysctl_optmem_max;
773 extern int sysctl_igmp_max_msf;
774 struct sockaddr_in *psin;
775 struct ip_msfilter *msf = NULL;
776 struct group_filter *gsf = NULL;
777 int msize, i, ifindex;
778
779 if (optlen < GROUP_FILTER_SIZE(0))
780 goto e_inval;
781 if (optlen > sysctl_optmem_max) {
782 err = -ENOBUFS;
783 break;
784 }
785 gsf = (struct group_filter *)kmalloc(optlen,GFP_KERNEL);
786 if (gsf == 0) {
787 err = -ENOBUFS;
788 break;
789 }
790 err = -EFAULT;
791 if (copy_from_user(gsf, optval, optlen)) {
792 goto mc_msf_out;
793 }
794 /* numsrc >= (4G-140)/128 overflow in 32 bits */
795 if (gsf->gf_numsrc >= 0x1ffffff ||
796 gsf->gf_numsrc > sysctl_igmp_max_msf) {
797 err = -ENOBUFS;
798 goto mc_msf_out;
799 }
800 if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
801 err = -EINVAL;
802 goto mc_msf_out;
803 }
804 msize = IP_MSFILTER_SIZE(gsf->gf_numsrc);
805 msf = (struct ip_msfilter *)kmalloc(msize,GFP_KERNEL);
806 if (msf == 0) {
807 err = -ENOBUFS;
808 goto mc_msf_out;
809 }
810 ifindex = gsf->gf_interface;
811 psin = (struct sockaddr_in *)&gsf->gf_group;
812 if (psin->sin_family != AF_INET) {
813 err = -EADDRNOTAVAIL;
814 goto mc_msf_out;
815 }
816 msf->imsf_multiaddr = psin->sin_addr.s_addr;
817 msf->imsf_interface = 0;
818 msf->imsf_fmode = gsf->gf_fmode;
819 msf->imsf_numsrc = gsf->gf_numsrc;
820 err = -EADDRNOTAVAIL;
821 for (i=0; i<gsf->gf_numsrc; ++i) {
822 psin = (struct sockaddr_in *)&gsf->gf_slist[i];
823
824 if (psin->sin_family != AF_INET)
825 goto mc_msf_out;
826 msf->imsf_slist[i] = psin->sin_addr.s_addr;
827 }
828 kfree(gsf);
829 gsf = NULL;
830
831 err = ip_mc_msfilter(sk, msf, ifindex);
832mc_msf_out:
833 if (msf)
834 kfree(msf);
835 if (gsf)
836 kfree(gsf);
837 break;
838 }
839 case IP_ROUTER_ALERT:
840 err = ip_ra_control(sk, val ? 1 : 0, NULL);
841 break;
842
843 case IP_FREEBIND:
844 if (optlen<1)
845 goto e_inval;
846 inet->freebind = !!val;
847 break;
848
849 case IP_IPSEC_POLICY:
850 case IP_XFRM_POLICY:
851 err = xfrm_user_policy(sk, optname, optval, optlen);
852 break;
853
854 default:
855#ifdef CONFIG_NETFILTER
856 err = nf_setsockopt(sk, PF_INET, optname, optval,
857 optlen);
858#else
859 err = -ENOPROTOOPT;
860#endif
861 break;
862 }
863 release_sock(sk);
864 return err;
865
866e_inval:
867 release_sock(sk);
868 return -EINVAL;
869}
870
871/*
 872 *	Get the options. Note for future reference: the GET of IP options gets
 873 *	the _received_ ones; the SET sets the _sent_ ones.
874 */
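/*
 * Userspace sketch (illustrative only, fd is a placeholder descriptor):
 * the options can be read back with
 *
 *	unsigned char opts[40];
 *	socklen_t optlen = sizeof(opts);
 *	getsockopt(fd, SOL_IP, IP_OPTIONS, opts, &optlen);
 *
 * while setsockopt(IP_OPTIONS) installs the options to be sent.
 */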
875
876int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen)
877{
878 struct inet_sock *inet = inet_sk(sk);
879 int val;
880 int len;
881
882 if(level!=SOL_IP)
883 return -EOPNOTSUPP;
884
885#ifdef CONFIG_IP_MROUTE
886 if(optname>=MRT_BASE && optname <=MRT_BASE+10)
887 {
888 return ip_mroute_getsockopt(sk,optname,optval,optlen);
889 }
890#endif
891
892 if(get_user(len,optlen))
893 return -EFAULT;
894 if(len < 0)
895 return -EINVAL;
896
897 lock_sock(sk);
898
899 switch(optname) {
900 case IP_OPTIONS:
901 {
902 unsigned char optbuf[sizeof(struct ip_options)+40];
903 struct ip_options * opt = (struct ip_options*)optbuf;
904 opt->optlen = 0;
905 if (inet->opt)
906 memcpy(optbuf, inet->opt,
907 sizeof(struct ip_options)+
908 inet->opt->optlen);
909 release_sock(sk);
910
911 if (opt->optlen == 0)
912 return put_user(0, optlen);
913
914 ip_options_undo(opt);
915
916 len = min_t(unsigned int, len, opt->optlen);
917 if(put_user(len, optlen))
918 return -EFAULT;
919 if(copy_to_user(optval, opt->__data, len))
920 return -EFAULT;
921 return 0;
922 }
923 case IP_PKTINFO:
924 val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0;
925 break;
926 case IP_RECVTTL:
927 val = (inet->cmsg_flags & IP_CMSG_TTL) != 0;
928 break;
929 case IP_RECVTOS:
930 val = (inet->cmsg_flags & IP_CMSG_TOS) != 0;
931 break;
932 case IP_RECVOPTS:
933 val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0;
934 break;
935 case IP_RETOPTS:
936 val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0;
937 break;
938 case IP_TOS:
939 val = inet->tos;
940 break;
941 case IP_TTL:
942 val = (inet->uc_ttl == -1 ?
943 sysctl_ip_default_ttl :
944 inet->uc_ttl);
945 break;
946 case IP_HDRINCL:
947 val = inet->hdrincl;
948 break;
949 case IP_MTU_DISCOVER:
950 val = inet->pmtudisc;
951 break;
952 case IP_MTU:
953 {
954 struct dst_entry *dst;
955 val = 0;
956 dst = sk_dst_get(sk);
957 if (dst) {
958 val = dst_mtu(dst);
959 dst_release(dst);
960 }
961 if (!val) {
962 release_sock(sk);
963 return -ENOTCONN;
964 }
965 break;
966 }
967 case IP_RECVERR:
968 val = inet->recverr;
969 break;
970 case IP_MULTICAST_TTL:
971 val = inet->mc_ttl;
972 break;
973 case IP_MULTICAST_LOOP:
974 val = inet->mc_loop;
975 break;
976 case IP_MULTICAST_IF:
977 {
978 struct in_addr addr;
979 len = min_t(unsigned int, len, sizeof(struct in_addr));
980 addr.s_addr = inet->mc_addr;
981 release_sock(sk);
982
983 if(put_user(len, optlen))
984 return -EFAULT;
985 if(copy_to_user(optval, &addr, len))
986 return -EFAULT;
987 return 0;
988 }
989 case IP_MSFILTER:
990 {
991 struct ip_msfilter msf;
992 int err;
993
994 if (len < IP_MSFILTER_SIZE(0)) {
995 release_sock(sk);
996 return -EINVAL;
997 }
998 if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
999 release_sock(sk);
1000 return -EFAULT;
1001 }
1002 err = ip_mc_msfget(sk, &msf,
1003 (struct ip_msfilter __user *)optval, optlen);
1004 release_sock(sk);
1005 return err;
1006 }
1007 case MCAST_MSFILTER:
1008 {
1009 struct group_filter gsf;
1010 int err;
1011
1012 if (len < GROUP_FILTER_SIZE(0)) {
1013 release_sock(sk);
1014 return -EINVAL;
1015 }
1016 if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
1017 release_sock(sk);
1018 return -EFAULT;
1019 }
1020 err = ip_mc_gsfget(sk, &gsf,
1021 (struct group_filter __user *)optval, optlen);
1022 release_sock(sk);
1023 return err;
1024 }
1025 case IP_PKTOPTIONS:
1026 {
1027 struct msghdr msg;
1028
1029 release_sock(sk);
1030
1031 if (sk->sk_type != SOCK_STREAM)
1032 return -ENOPROTOOPT;
1033
1034 msg.msg_control = optval;
1035 msg.msg_controllen = len;
1036 msg.msg_flags = 0;
1037
1038 if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
1039 struct in_pktinfo info;
1040
1041 info.ipi_addr.s_addr = inet->rcv_saddr;
1042 info.ipi_spec_dst.s_addr = inet->rcv_saddr;
1043 info.ipi_ifindex = inet->mc_index;
1044 put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
1045 }
1046 if (inet->cmsg_flags & IP_CMSG_TTL) {
1047 int hlim = inet->mc_ttl;
1048 put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
1049 }
1050 len -= msg.msg_controllen;
1051 return put_user(len, optlen);
1052 }
1053 case IP_FREEBIND:
1054 val = inet->freebind;
1055 break;
1056 default:
1057#ifdef CONFIG_NETFILTER
1058 val = nf_getsockopt(sk, PF_INET, optname, optval,
1059 &len);
1060 release_sock(sk);
1061 if (val >= 0)
1062 val = put_user(len, optlen);
1063 return val;
1064#else
1065 release_sock(sk);
1066 return -ENOPROTOOPT;
1067#endif
1068 }
1069 release_sock(sk);
1070
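	/* Compatibility shim: a caller that supplied a buffer smaller than an
	 * int still gets an answer for small non-negative values, returned as
	 * a single byte, rather than an error.
	 */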
1071 if (len < sizeof(int) && len > 0 && val>=0 && val<255) {
1072 unsigned char ucval = (unsigned char)val;
1073 len = 1;
1074 if(put_user(len, optlen))
1075 return -EFAULT;
1076 if(copy_to_user(optval,&ucval,1))
1077 return -EFAULT;
1078 } else {
1079 len = min_t(unsigned int, sizeof(int), len);
1080 if(put_user(len, optlen))
1081 return -EFAULT;
1082 if(copy_to_user(optval,&val,len))
1083 return -EFAULT;
1084 }
1085 return 0;
1086}
1087
1088EXPORT_SYMBOL(ip_cmsg_recv);
1089
1090#ifdef CONFIG_IP_SCTP_MODULE
1091EXPORT_SYMBOL(ip_getsockopt);
1092EXPORT_SYMBOL(ip_setsockopt);
1093#endif
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
new file mode 100644
index 000000000000..1a23c5263b99
--- /dev/null
+++ b/net/ipv4/ipcomp.c
@@ -0,0 +1,524 @@
1/*
2 * IP Payload Compression Protocol (IPComp) - RFC3173.
3 *
4 * Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation; either version 2 of the License, or (at your option)
9 * any later version.
10 *
11 * Todo:
12 * - Tunable compression parameters.
13 * - Compression stats.
14 * - Adaptive compression.
15 */
16#include <linux/config.h>
17#include <linux/module.h>
18#include <asm/scatterlist.h>
19#include <asm/semaphore.h>
20#include <linux/crypto.h>
21#include <linux/pfkeyv2.h>
22#include <linux/percpu.h>
23#include <linux/smp.h>
24#include <linux/list.h>
25#include <linux/vmalloc.h>
26#include <linux/rtnetlink.h>
27#include <net/ip.h>
28#include <net/xfrm.h>
29#include <net/icmp.h>
30#include <net/ipcomp.h>
31
32struct ipcomp_tfms {
33 struct list_head list;
34 struct crypto_tfm **tfms;
35 int users;
36};
37
38static DECLARE_MUTEX(ipcomp_resource_sem);
39static void **ipcomp_scratches;
40static int ipcomp_scratch_users;
41static LIST_HEAD(ipcomp_tfms_list);
42
43static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
44{
45 int err, plen, dlen;
46 struct iphdr *iph;
47 struct ipcomp_data *ipcd = x->data;
48 u8 *start, *scratch;
49 struct crypto_tfm *tfm;
50 int cpu;
51
52 plen = skb->len;
53 dlen = IPCOMP_SCRATCH_SIZE;
54 start = skb->data;
55
56 cpu = get_cpu();
57 scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
58 tfm = *per_cpu_ptr(ipcd->tfms, cpu);
59
60 err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen);
61 if (err)
62 goto out;
63
64 if (dlen < (plen + sizeof(struct ip_comp_hdr))) {
65 err = -EINVAL;
66 goto out;
67 }
68
69 err = pskb_expand_head(skb, 0, dlen - plen, GFP_ATOMIC);
70 if (err)
71 goto out;
72
73 skb_put(skb, dlen - plen);
74 memcpy(skb->data, scratch, dlen);
75 iph = skb->nh.iph;
76 iph->tot_len = htons(dlen + iph->ihl * 4);
77out:
78 put_cpu();
79 return err;
80}
81
82static int ipcomp_input(struct xfrm_state *x,
83 struct xfrm_decap_state *decap, struct sk_buff *skb)
84{
85 u8 nexthdr;
86 int err = 0;
87 struct iphdr *iph;
88 union {
89 struct iphdr iph;
90 char buf[60];
91 } tmp_iph;
92
93
94 if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
95 skb_linearize(skb, GFP_ATOMIC) != 0) {
96 err = -ENOMEM;
97 goto out;
98 }
99
100 skb->ip_summed = CHECKSUM_NONE;
101
102 /* Remove ipcomp header and decompress original payload */
103 iph = skb->nh.iph;
104 memcpy(&tmp_iph, iph, iph->ihl * 4);
105 nexthdr = *(u8 *)skb->data;
106 skb_pull(skb, sizeof(struct ip_comp_hdr));
107 skb->nh.raw += sizeof(struct ip_comp_hdr);
108 memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4);
109 iph = skb->nh.iph;
110 iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr));
111 iph->protocol = nexthdr;
112 skb->h.raw = skb->data;
113 err = ipcomp_decompress(x, skb);
114
115out:
116 return err;
117}
118
119static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
120{
121 int err, plen, dlen, ihlen;
122 struct iphdr *iph = skb->nh.iph;
123 struct ipcomp_data *ipcd = x->data;
124 u8 *start, *scratch;
125 struct crypto_tfm *tfm;
126 int cpu;
127
128 ihlen = iph->ihl * 4;
129 plen = skb->len - ihlen;
130 dlen = IPCOMP_SCRATCH_SIZE;
131 start = skb->data + ihlen;
132
133 cpu = get_cpu();
134 scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
135 tfm = *per_cpu_ptr(ipcd->tfms, cpu);
136
137 err = crypto_comp_compress(tfm, start, plen, scratch, &dlen);
138 if (err)
139 goto out;
140
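	/* If the compressed payload plus the IPComp header would not be smaller
	 * than the original payload, compressing is not worth it; the caller
	 * then transmits the packet uncompressed.
	 */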
141 if ((dlen + sizeof(struct ip_comp_hdr)) >= plen) {
142 err = -EMSGSIZE;
143 goto out;
144 }
145
146 memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen);
147 put_cpu();
148
149 pskb_trim(skb, ihlen + dlen + sizeof(struct ip_comp_hdr));
150 return 0;
151
152out:
153 put_cpu();
154 return err;
155}
156
157static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
158{
159 int err;
160 struct iphdr *iph;
161 struct ip_comp_hdr *ipch;
162 struct ipcomp_data *ipcd = x->data;
163 int hdr_len = 0;
164
165 iph = skb->nh.iph;
166 iph->tot_len = htons(skb->len);
167 hdr_len = iph->ihl * 4;
168 if ((skb->len - hdr_len) < ipcd->threshold) {
169 /* Don't bother compressing */
170 goto out_ok;
171 }
172
173 if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
174 skb_linearize(skb, GFP_ATOMIC) != 0) {
175 goto out_ok;
176 }
177
178 err = ipcomp_compress(x, skb);
179 iph = skb->nh.iph;
180
181 if (err) {
182 goto out_ok;
183 }
184
185 /* Install ipcomp header, convert into ipcomp datagram. */
186 iph->tot_len = htons(skb->len);
187 ipch = (struct ip_comp_hdr *)((char *)iph + iph->ihl * 4);
188 ipch->nexthdr = iph->protocol;
189 ipch->flags = 0;
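	/* RFC 3173 CPIs are only 16 bits wide; the low 16 bits of the SPI
	 * serve as the on-the-wire CPI here.
	 */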
190 ipch->cpi = htons((u16 )ntohl(x->id.spi));
191 iph->protocol = IPPROTO_COMP;
192 ip_send_check(iph);
193 return 0;
194
195out_ok:
196 if (x->props.mode)
197 ip_send_check(iph);
198 return 0;
199}
200
201static void ipcomp4_err(struct sk_buff *skb, u32 info)
202{
203 u32 spi;
204 struct iphdr *iph = (struct iphdr *)skb->data;
205 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
206 struct xfrm_state *x;
207
208 if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
209 skb->h.icmph->code != ICMP_FRAG_NEEDED)
210 return;
211
212 spi = ntohl(ntohs(ipch->cpi));
213 x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr,
214 spi, IPPROTO_COMP, AF_INET);
215 if (!x)
216 return;
217 NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
218 spi, NIPQUAD(iph->daddr)));
219 xfrm_state_put(x);
220}
221
222/* We always hold one tunnel user reference to indicate a tunnel */
223static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
224{
225 struct xfrm_state *t;
226
227 t = xfrm_state_alloc();
228 if (t == NULL)
229 goto out;
230
231 t->id.proto = IPPROTO_IPIP;
232 t->id.spi = x->props.saddr.a4;
233 t->id.daddr.a4 = x->id.daddr.a4;
234 memcpy(&t->sel, &x->sel, sizeof(t->sel));
235 t->props.family = AF_INET;
236 t->props.mode = 1;
237 t->props.saddr.a4 = x->props.saddr.a4;
238 t->props.flags = x->props.flags;
239
240 t->type = xfrm_get_type(IPPROTO_IPIP, t->props.family);
241 if (t->type == NULL)
242 goto error;
243
244 if (t->type->init_state(t, NULL))
245 goto error;
246
247 t->km.state = XFRM_STATE_VALID;
248 atomic_set(&t->tunnel_users, 1);
249out:
250 return t;
251
252error:
253 t->km.state = XFRM_STATE_DEAD;
254 xfrm_state_put(t);
255 t = NULL;
256 goto out;
257}
258
259/*
260 * Must be protected by xfrm_cfg_sem. State and tunnel user references are
261 * always incremented on success.
262 */
263static int ipcomp_tunnel_attach(struct xfrm_state *x)
264{
265 int err = 0;
266 struct xfrm_state *t;
267
268 t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr.a4,
269 x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
270 if (!t) {
271 t = ipcomp_tunnel_create(x);
272 if (!t) {
273 err = -EINVAL;
274 goto out;
275 }
276 xfrm_state_insert(t);
277 xfrm_state_hold(t);
278 }
279 x->tunnel = t;
280 atomic_inc(&t->tunnel_users);
281out:
282 return err;
283}
284
285static void ipcomp_free_scratches(void)
286{
287 int i;
288 void **scratches;
289
290 if (--ipcomp_scratch_users)
291 return;
292
293 scratches = ipcomp_scratches;
294 if (!scratches)
295 return;
296
297 for_each_cpu(i) {
298 void *scratch = *per_cpu_ptr(scratches, i);
299 if (scratch)
300 vfree(scratch);
301 }
302
303 free_percpu(scratches);
304}
305
306static void **ipcomp_alloc_scratches(void)
307{
308 int i;
309 void **scratches;
310
311 if (ipcomp_scratch_users++)
312 return ipcomp_scratches;
313
314 scratches = alloc_percpu(void *);
315 if (!scratches)
316 return NULL;
317
318 ipcomp_scratches = scratches;
319
320 for_each_cpu(i) {
321 void *scratch = vmalloc(IPCOMP_SCRATCH_SIZE);
322 if (!scratch)
323 return NULL;
324 *per_cpu_ptr(scratches, i) = scratch;
325 }
326
327 return scratches;
328}
329
330static void ipcomp_free_tfms(struct crypto_tfm **tfms)
331{
332 struct ipcomp_tfms *pos;
333 int cpu;
334
335 list_for_each_entry(pos, &ipcomp_tfms_list, list) {
336 if (pos->tfms == tfms)
337 break;
338 }
339
340 BUG_TRAP(pos);
341
342 if (--pos->users)
343 return;
344
345 list_del(&pos->list);
346 kfree(pos);
347
348 if (!tfms)
349 return;
350
351 for_each_cpu(cpu) {
352 struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu);
353 if (tfm)
354 crypto_free_tfm(tfm);
355 }
356 free_percpu(tfms);
357}
358
359static struct crypto_tfm **ipcomp_alloc_tfms(const char *alg_name)
360{
361 struct ipcomp_tfms *pos;
362 struct crypto_tfm **tfms;
363 int cpu;
364
365 /* This can be any valid CPU ID so we don't need locking. */
366 cpu = smp_processor_id();
367
368 list_for_each_entry(pos, &ipcomp_tfms_list, list) {
369 struct crypto_tfm *tfm;
370
371 tfms = pos->tfms;
372 tfm = *per_cpu_ptr(tfms, cpu);
373
374 if (!strcmp(crypto_tfm_alg_name(tfm), alg_name)) {
375 pos->users++;
376 return tfms;
377 }
378 }
379
380 pos = kmalloc(sizeof(*pos), GFP_KERNEL);
381 if (!pos)
382 return NULL;
383
384 pos->users = 1;
385 INIT_LIST_HEAD(&pos->list);
386 list_add(&pos->list, &ipcomp_tfms_list);
387
388 pos->tfms = tfms = alloc_percpu(struct crypto_tfm *);
389 if (!tfms)
390 goto error;
391
392 for_each_cpu(cpu) {
393 struct crypto_tfm *tfm = crypto_alloc_tfm(alg_name, 0);
394 if (!tfm)
395 goto error;
396 *per_cpu_ptr(tfms, cpu) = tfm;
397 }
398
399 return tfms;
400
401error:
402 ipcomp_free_tfms(tfms);
403 return NULL;
404}
405
406static void ipcomp_free_data(struct ipcomp_data *ipcd)
407{
408 if (ipcd->tfms)
409 ipcomp_free_tfms(ipcd->tfms);
410 ipcomp_free_scratches();
411}
412
413static void ipcomp_destroy(struct xfrm_state *x)
414{
415 struct ipcomp_data *ipcd = x->data;
416 if (!ipcd)
417 return;
418 xfrm_state_delete_tunnel(x);
419 down(&ipcomp_resource_sem);
420 ipcomp_free_data(ipcd);
421 up(&ipcomp_resource_sem);
422 kfree(ipcd);
423}
424
425static int ipcomp_init_state(struct xfrm_state *x, void *args)
426{
427 int err;
428 struct ipcomp_data *ipcd;
429 struct xfrm_algo_desc *calg_desc;
430
431 err = -EINVAL;
432 if (!x->calg)
433 goto out;
434
435 if (x->encap)
436 goto out;
437
438 err = -ENOMEM;
439 ipcd = kmalloc(sizeof(*ipcd), GFP_KERNEL);
440 if (!ipcd)
441 goto out;
442
443 memset(ipcd, 0, sizeof(*ipcd));
444 x->props.header_len = 0;
445 if (x->props.mode)
446 x->props.header_len += sizeof(struct iphdr);
447
448 down(&ipcomp_resource_sem);
449 if (!ipcomp_alloc_scratches())
450 goto error;
451
452 ipcd->tfms = ipcomp_alloc_tfms(x->calg->alg_name);
453 if (!ipcd->tfms)
454 goto error;
455 up(&ipcomp_resource_sem);
456
457 if (x->props.mode) {
458 err = ipcomp_tunnel_attach(x);
459 if (err)
460 goto error_tunnel;
461 }
462
463 calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0);
464 BUG_ON(!calg_desc);
465 ipcd->threshold = calg_desc->uinfo.comp.threshold;
466 x->data = ipcd;
467 err = 0;
468out:
469 return err;
470
471error_tunnel:
472 down(&ipcomp_resource_sem);
473error:
474 ipcomp_free_data(ipcd);
475 up(&ipcomp_resource_sem);
476 kfree(ipcd);
477 goto out;
478}
479
480static struct xfrm_type ipcomp_type = {
481 .description = "IPCOMP4",
482 .owner = THIS_MODULE,
483 .proto = IPPROTO_COMP,
484 .init_state = ipcomp_init_state,
485 .destructor = ipcomp_destroy,
486 .input = ipcomp_input,
487 .output = ipcomp_output
488};
489
490static struct net_protocol ipcomp4_protocol = {
491 .handler = xfrm4_rcv,
492 .err_handler = ipcomp4_err,
493 .no_policy = 1,
494};
495
496static int __init ipcomp4_init(void)
497{
498 if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) {
499 printk(KERN_INFO "ipcomp init: can't add xfrm type\n");
500 return -EAGAIN;
501 }
502 if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
503 printk(KERN_INFO "ipcomp init: can't add protocol\n");
504 xfrm_unregister_type(&ipcomp_type, AF_INET);
505 return -EAGAIN;
506 }
507 return 0;
508}
509
510static void __exit ipcomp4_fini(void)
511{
512 if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0)
513 printk(KERN_INFO "ip ipcomp close: can't remove protocol\n");
514 if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
515 printk(KERN_INFO "ip ipcomp close: can't remove xfrm type\n");
516}
517
518module_init(ipcomp4_init);
519module_exit(ipcomp4_fini);
520
521MODULE_LICENSE("GPL");
522MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) - RFC3173");
523MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
524
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
new file mode 100644
index 000000000000..f2509034ce72
--- /dev/null
+++ b/net/ipv4/ipconfig.c
@@ -0,0 +1,1507 @@
1/*
2 * $Id: ipconfig.c,v 1.46 2002/02/01 22:01:04 davem Exp $
3 *
4 * Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or
5 * user-supplied information to configure own IP address and routes.
6 *
7 * Copyright (C) 1996-1998 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
8 *
9 * Derived from network configuration code in fs/nfs/nfsroot.c,
10 * originally Copyright (C) 1995, 1996 Gero Kuhlmann and me.
11 *
12 * BOOTP rewritten to construct and analyse packets itself instead
13 * of misusing the IP layer. num_bugs_causing_wrong_arp_replies--;
14 * -- MJ, December 1998
15 *
16 * Fixed ip_auto_config_setup calling at startup in the new "Linker Magic"
17 * initialization scheme.
18 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 08/11/1999
19 *
20 * DHCP support added. To users this looks like a whole separate
21 * protocol, but we know it's just a bag on the side of BOOTP.
22 * -- Chip Salzenberg <chip@valinux.com>, May 2000
23 *
24 * Ported DHCP support from 2.2.16 to 2.4.0-test4
25 * -- Eric Biederman <ebiederman@lnxi.com>, 30 Aug 2000
26 *
27 * Merged changes from 2.2.19 into 2.4.3
28 * -- Eric Biederman <ebiederman@lnxi.com>, 22 April Aug 2001
29 *
30 * Multiple Nameservers in /proc/net/pnp
31 * -- Josef Siemes <jsiemes@web.de>, Aug 2002
32 */
33
34#include <linux/config.h>
35#include <linux/types.h>
36#include <linux/string.h>
37#include <linux/kernel.h>
38#include <linux/jiffies.h>
39#include <linux/random.h>
40#include <linux/init.h>
41#include <linux/utsname.h>
42#include <linux/in.h>
43#include <linux/if.h>
44#include <linux/inet.h>
45#include <linux/netdevice.h>
46#include <linux/if_arp.h>
47#include <linux/skbuff.h>
48#include <linux/ip.h>
49#include <linux/socket.h>
50#include <linux/route.h>
51#include <linux/udp.h>
52#include <linux/proc_fs.h>
53#include <linux/seq_file.h>
54#include <linux/major.h>
55#include <linux/root_dev.h>
56#include <linux/delay.h>
57#include <net/arp.h>
58#include <net/ip.h>
59#include <net/ipconfig.h>
60
61#include <asm/uaccess.h>
62#include <net/checksum.h>
63#include <asm/processor.h>
64
65/* Define this to allow debugging output */
66#undef IPCONFIG_DEBUG
67
68#ifdef IPCONFIG_DEBUG
69#define DBG(x) printk x
70#else
71#define DBG(x) do { } while(0)
72#endif
73
74#if defined(CONFIG_IP_PNP_DHCP)
75#define IPCONFIG_DHCP
76#endif
77#if defined(CONFIG_IP_PNP_BOOTP) || defined(CONFIG_IP_PNP_DHCP)
78#define IPCONFIG_BOOTP
79#endif
80#if defined(CONFIG_IP_PNP_RARP)
81#define IPCONFIG_RARP
82#endif
83#if defined(IPCONFIG_BOOTP) || defined(IPCONFIG_RARP)
84#define IPCONFIG_DYNAMIC
85#endif
86
87/* Define the friendly delay before and after opening net devices */
88#define CONF_PRE_OPEN 500 /* Before opening: 1/2 second */
89#define CONF_POST_OPEN 1 /* After opening: 1 second */
90
91/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
92#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */
93#define CONF_SEND_RETRIES 6 /* Send six requests per open */
94#define CONF_INTER_TIMEOUT (HZ/2) /* Inter-device timeout: 1/2 second */
95#define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */
96#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */
97#define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */
98#define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */
99#define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers
100 - '3' from resolv.h */
101
102
103/*
104 * Public IP configuration
105 */
106
107/* This is used by platforms which might be able to set the ipconfig
108 * variables using firmware environment vars. If ic_set_manually is set,
109 * such platforms should ignore their firmware variables.
110 */
111int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */
112
113static int ic_enable __initdata = 0; /* IP config enabled? */
114
115/* Protocol choice */
116int ic_proto_enabled __initdata = 0
117#ifdef IPCONFIG_BOOTP
118 | IC_BOOTP
119#endif
120#ifdef CONFIG_IP_PNP_DHCP
121 | IC_USE_DHCP
122#endif
123#ifdef IPCONFIG_RARP
124 | IC_RARP
125#endif
126 ;
127
128static int ic_host_name_set __initdata = 0; /* Host name set by us? */
129
130u32 ic_myaddr = INADDR_NONE; /* My IP address */
131static u32 ic_netmask = INADDR_NONE; /* Netmask for local subnet */
132u32 ic_gateway = INADDR_NONE; /* Gateway IP address */
133
134u32 ic_servaddr = INADDR_NONE; /* Boot server IP address */
135
136u32 root_server_addr = INADDR_NONE; /* Address of NFS server */
137u8 root_server_path[256] = { 0, }; /* Path to mount as root */
138
139/* Persistent data: */
140
141static int ic_proto_used; /* Protocol used, if any */
142static u32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
143static u8 ic_domain[64]; /* DNS (not NIS) domain name */
144
145/*
146 * Private state.
147 */
148
149/* Name of user-selected boot device */
150static char user_dev_name[IFNAMSIZ] __initdata = { 0, };
151
152/* Protocols supported by available interfaces */
153static int ic_proto_have_if __initdata = 0;
154
155#ifdef IPCONFIG_DYNAMIC
156static DEFINE_SPINLOCK(ic_recv_lock);
157static volatile int ic_got_reply __initdata = 0; /* Proto(s) that replied */
158#endif
159#ifdef IPCONFIG_DHCP
160static int ic_dhcp_msgtype __initdata = 0; /* DHCP msg type received */
161#endif
162
163
164/*
165 * Network devices
166 */
167
168struct ic_device {
169 struct ic_device *next;
170 struct net_device *dev;
171 unsigned short flags;
172 short able;
173 u32 xid;
174};
175
176static struct ic_device *ic_first_dev __initdata = NULL;/* List of open devices */
177static struct net_device *ic_dev __initdata = NULL; /* Selected device */
178
179static int __init ic_open_devs(void)
180{
181 struct ic_device *d, **last;
182 struct net_device *dev;
183 unsigned short oflags;
184
185 last = &ic_first_dev;
186 rtnl_shlock();
187
188 /* bring loopback device up first */
189 if (dev_change_flags(&loopback_dev, loopback_dev.flags | IFF_UP) < 0)
190 printk(KERN_ERR "IP-Config: Failed to open %s\n", loopback_dev.name);
191
192 for (dev = dev_base; dev; dev = dev->next) {
193 if (dev == &loopback_dev)
194 continue;
195 if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
196 (!(dev->flags & IFF_LOOPBACK) &&
197 (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
198 strncmp(dev->name, "dummy", 5))) {
199 int able = 0;
200 if (dev->mtu >= 364)
201 able |= IC_BOOTP;
202 else
203				printk(KERN_WARNING "DHCP/BOOTP: Ignoring device %s, MTU %d too small\n", dev->name, dev->mtu);
204 if (!(dev->flags & IFF_NOARP))
205 able |= IC_RARP;
206 able &= ic_proto_enabled;
207 if (ic_proto_enabled && !able)
208 continue;
209 oflags = dev->flags;
210 if (dev_change_flags(dev, oflags | IFF_UP) < 0) {
211 printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
212 continue;
213 }
214 if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
215 rtnl_shunlock();
216 return -1;
217 }
218 d->dev = dev;
219 *last = d;
220 last = &d->next;
221 d->flags = oflags;
222 d->able = able;
223 if (able & IC_BOOTP)
224 get_random_bytes(&d->xid, sizeof(u32));
225 else
226 d->xid = 0;
227 ic_proto_have_if |= able;
228 DBG(("IP-Config: %s UP (able=%d, xid=%08x)\n",
229 dev->name, able, d->xid));
230 }
231 }
232 rtnl_shunlock();
233
234 *last = NULL;
235
236 if (!ic_first_dev) {
237 if (user_dev_name[0])
238 printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name);
239 else
240 printk(KERN_ERR "IP-Config: No network devices available.\n");
241 return -1;
242 }
243 return 0;
244}
245
246static void __init ic_close_devs(void)
247{
248 struct ic_device *d, *next;
249 struct net_device *dev;
250
251 rtnl_shlock();
252 next = ic_first_dev;
253 while ((d = next)) {
254 next = d->next;
255 dev = d->dev;
256 if (dev != ic_dev) {
257 DBG(("IP-Config: Downing %s\n", dev->name));
258 dev_change_flags(dev, d->flags);
259 }
260 kfree(d);
261 }
262 rtnl_shunlock();
263}
264
265/*
266 * Interface to various network functions.
267 */
268
269static inline void
270set_sockaddr(struct sockaddr_in *sin, u32 addr, u16 port)
271{
272 sin->sin_family = AF_INET;
273 sin->sin_addr.s_addr = addr;
274 sin->sin_port = port;
275}
276
277static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)
278{
279 int res;
280
281 mm_segment_t oldfs = get_fs();
282 set_fs(get_ds());
283 res = devinet_ioctl(cmd, (struct ifreq __user *) arg);
284 set_fs(oldfs);
285 return res;
286}
287
288static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg)
289{
290 int res;
291
292 mm_segment_t oldfs = get_fs();
293 set_fs(get_ds());
294 res = ip_rt_ioctl(cmd, (void __user *) arg);
295 set_fs(oldfs);
296 return res;
297}
298
299/*
300 * Set up interface addresses and routes.
301 */
302
303static int __init ic_setup_if(void)
304{
305 struct ifreq ir;
306 struct sockaddr_in *sin = (void *) &ir.ifr_ifru.ifru_addr;
307 int err;
308
309 memset(&ir, 0, sizeof(ir));
310 strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name);
311 set_sockaddr(sin, ic_myaddr, 0);
312 if ((err = ic_dev_ioctl(SIOCSIFADDR, &ir)) < 0) {
313 printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err);
314 return -1;
315 }
316 set_sockaddr(sin, ic_netmask, 0);
317 if ((err = ic_dev_ioctl(SIOCSIFNETMASK, &ir)) < 0) {
318 printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err);
319 return -1;
320 }
321 set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0);
322 if ((err = ic_dev_ioctl(SIOCSIFBRDADDR, &ir)) < 0) {
323 printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err);
324 return -1;
325 }
326 return 0;
327}
328
329static int __init ic_setup_routes(void)
330{
331 /* No need to setup device routes, only the default route... */
332
333 if (ic_gateway != INADDR_NONE) {
334 struct rtentry rm;
335 int err;
336
337 memset(&rm, 0, sizeof(rm));
338 if ((ic_gateway ^ ic_myaddr) & ic_netmask) {
339 printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n");
340 return -1;
341 }
342 set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0);
343 set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0);
344 set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0);
345 rm.rt_flags = RTF_UP | RTF_GATEWAY;
346 if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) {
347 printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err);
348 return -1;
349 }
350 }
351
352 return 0;
353}
354
355/*
356 * Fill in default values for all missing parameters.
357 */
358
359static int __init ic_defaults(void)
360{
361 /*
362 * At this point we have no userspace running so need not
363 * claim locks on system_utsname
364 */
365
366 if (!ic_host_name_set)
367 sprintf(system_utsname.nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr));
368
369 if (root_server_addr == INADDR_NONE)
370 root_server_addr = ic_servaddr;
371
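	/* Classful fallback, e.g.: a 10.x.x.x address gets 255.0.0.0, a
	 * 172.16.x.x address gets 255.255.0.0 and a 192.168.x.x address gets
	 * 255.255.255.0.
	 */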
372 if (ic_netmask == INADDR_NONE) {
373 if (IN_CLASSA(ntohl(ic_myaddr)))
374 ic_netmask = htonl(IN_CLASSA_NET);
375 else if (IN_CLASSB(ntohl(ic_myaddr)))
376 ic_netmask = htonl(IN_CLASSB_NET);
377 else if (IN_CLASSC(ntohl(ic_myaddr)))
378 ic_netmask = htonl(IN_CLASSC_NET);
379 else {
380 printk(KERN_ERR "IP-Config: Unable to guess netmask for address %u.%u.%u.%u\n",
381 NIPQUAD(ic_myaddr));
382 return -1;
383 }
384 printk("IP-Config: Guessing netmask %u.%u.%u.%u\n", NIPQUAD(ic_netmask));
385 }
386
387 return 0;
388}
389
390/*
391 * RARP support.
392 */
393
394#ifdef IPCONFIG_RARP
395
396static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
397
398static struct packet_type rarp_packet_type __initdata = {
399 .type = __constant_htons(ETH_P_RARP),
400 .func = ic_rarp_recv,
401};
402
403static inline void ic_rarp_init(void)
404{
405 dev_add_pack(&rarp_packet_type);
406}
407
408static inline void ic_rarp_cleanup(void)
409{
410 dev_remove_pack(&rarp_packet_type);
411}
412
413/*
414 * Process received RARP packet.
415 */
416static int __init
417ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
418{
419 struct arphdr *rarp;
420 unsigned char *rarp_ptr;
421 unsigned long sip, tip;
422 unsigned char *sha, *tha; /* s for "source", t for "target" */
423 struct ic_device *d;
424
425 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
426 return NET_RX_DROP;
427
428 if (!pskb_may_pull(skb, sizeof(struct arphdr)))
429 goto drop;
430
431 /* Basic sanity checks can be done without the lock. */
432 rarp = (struct arphdr *)skb->h.raw;
433
434 /* If this test doesn't pass, it's not IP, or we should
435 * ignore it anyway.
436 */
437 if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd))
438 goto drop;
439
440 /* If it's not a RARP reply, delete it. */
441 if (rarp->ar_op != htons(ARPOP_RREPLY))
442 goto drop;
443
444 /* If it's not Ethernet, delete it. */
445 if (rarp->ar_pro != htons(ETH_P_IP))
446 goto drop;
447
448 if (!pskb_may_pull(skb,
449 sizeof(struct arphdr) +
450 (2 * dev->addr_len) +
451 (2 * 4)))
452 goto drop;
453
454 /* OK, it is all there and looks valid, process... */
455 rarp = (struct arphdr *)skb->h.raw;
456 rarp_ptr = (unsigned char *) (rarp + 1);
457
458 /* One reply at a time, please. */
459 spin_lock(&ic_recv_lock);
460
461 /* If we already have a reply, just drop the packet */
462 if (ic_got_reply)
463 goto drop_unlock;
464
465 /* Find the ic_device that the packet arrived on */
466 d = ic_first_dev;
467 while (d && d->dev != dev)
468 d = d->next;
469 if (!d)
470 goto drop_unlock; /* should never happen */
471
472 /* Extract variable-width fields */
473 sha = rarp_ptr;
474 rarp_ptr += dev->addr_len;
475 memcpy(&sip, rarp_ptr, 4);
476 rarp_ptr += 4;
477 tha = rarp_ptr;
478 rarp_ptr += dev->addr_len;
479 memcpy(&tip, rarp_ptr, 4);
480
481 /* Discard packets which are not meant for us. */
482 if (memcmp(tha, dev->dev_addr, dev->addr_len))
483 goto drop_unlock;
484
485 /* Discard packets which are not from specified server. */
486 if (ic_servaddr != INADDR_NONE && ic_servaddr != sip)
487 goto drop_unlock;
488
489 /* We have a winner! */
490 ic_dev = dev;
491 if (ic_myaddr == INADDR_NONE)
492 ic_myaddr = tip;
493 ic_servaddr = sip;
494 ic_got_reply = IC_RARP;
495
496drop_unlock:
497 /* Show's over. Nothing to see here. */
498 spin_unlock(&ic_recv_lock);
499
500drop:
501 /* Throw the packet out. */
502 kfree_skb(skb);
503 return 0;
504}
505
506
507/*
508 * Send RARP request packet over a single interface.
509 */
510static void __init ic_rarp_send_if(struct ic_device *d)
511{
512 struct net_device *dev = d->dev;
513 arp_send(ARPOP_RREQUEST, ETH_P_RARP, 0, dev, 0, NULL,
514 dev->dev_addr, dev->dev_addr);
515}
516#endif
517
518/*
519 * DHCP/BOOTP support.
520 */
521
522#ifdef IPCONFIG_BOOTP
523
524struct bootp_pkt { /* BOOTP packet format */
525 struct iphdr iph; /* IP header */
526 struct udphdr udph; /* UDP header */
527 u8 op; /* 1=request, 2=reply */
528 u8 htype; /* HW address type */
529 u8 hlen; /* HW address length */
530 u8 hops; /* Used only by gateways */
531 u32 xid; /* Transaction ID */
532 u16 secs; /* Seconds since we started */
533 u16 flags; /* Just what it says */
534 u32 client_ip; /* Client's IP address if known */
535 u32 your_ip; /* Assigned IP address */
536 u32 server_ip; /* (Next, e.g. NFS) Server's IP address */
537 u32 relay_ip; /* IP address of BOOTP relay */
538 u8 hw_addr[16]; /* Client's HW address */
539 u8 serv_name[64]; /* Server host name */
540 u8 boot_file[128]; /* Name of boot file */
541 u8 exten[312]; /* DHCP options / BOOTP vendor extensions */
542};
543
544/* packet ops */
545#define BOOTP_REQUEST 1
546#define BOOTP_REPLY 2
547
548/* DHCP message types */
549#define DHCPDISCOVER 1
550#define DHCPOFFER 2
551#define DHCPREQUEST 3
552#define DHCPDECLINE 4
553#define DHCPACK 5
554#define DHCPNAK 6
555#define DHCPRELEASE 7
556#define DHCPINFORM 8
557
558static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
559
560static struct packet_type bootp_packet_type __initdata = {
561 .type = __constant_htons(ETH_P_IP),
562 .func = ic_bootp_recv,
563};
564
565
566/*
567 * Initialize DHCP/BOOTP extension fields in the request.
568 */
569
570static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 };
571
572#ifdef IPCONFIG_DHCP
573
574static void __init
575ic_dhcp_init_options(u8 *options)
576{
577 u8 mt = ((ic_servaddr == INADDR_NONE)
578 ? DHCPDISCOVER : DHCPREQUEST);
579 u8 *e = options;
580
581#ifdef IPCONFIG_DEBUG
582 printk("DHCP: Sending message type %d\n", mt);
583#endif
584
585 memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */
586 e += 4;
587
588 *e++ = 53; /* DHCP message type */
589 *e++ = 1;
590 *e++ = mt;
591
592 if (mt == DHCPREQUEST) {
593 *e++ = 54; /* Server ID (IP address) */
594 *e++ = 4;
595 memcpy(e, &ic_servaddr, 4);
596 e += 4;
597
598 *e++ = 50; /* Requested IP address */
599 *e++ = 4;
600 memcpy(e, &ic_myaddr, 4);
601 e += 4;
602 }
603
604 /* always? */
605 {
606 static const u8 ic_req_params[] = {
607 1, /* Subnet mask */
608 3, /* Default gateway */
609 6, /* DNS server */
610 12, /* Host name */
611 15, /* Domain name */
612 17, /* Boot path */
613 40, /* NIS domain name */
614 };
615
616 *e++ = 55; /* Parameter request list */
617 *e++ = sizeof(ic_req_params);
618 memcpy(e, ic_req_params, sizeof(ic_req_params));
619 e += sizeof(ic_req_params);
620 }
621
622 *e++ = 255; /* End of the list */
623}
624
625#endif /* IPCONFIG_DHCP */
626
627static void __init ic_bootp_init_ext(u8 *e)
628{
629 memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */
630 e += 4;
631 *e++ = 1; /* Subnet mask request */
632 *e++ = 4;
633 e += 4;
634 *e++ = 3; /* Default gateway request */
635 *e++ = 4;
636 e += 4;
637 *e++ = 5; /* Name server request */
638 *e++ = 8;
639 e += 8;
640 *e++ = 12; /* Host name request */
641 *e++ = 32;
642 e += 32;
643 *e++ = 40; /* NIS Domain name request */
644 *e++ = 32;
645 e += 32;
646 *e++ = 17; /* Boot path */
647 *e++ = 40;
648 e += 40;
649
650 *e++ = 57; /* set extension buffer size for reply */
651 *e++ = 2;
652 *e++ = 1; /* 128+236+8+20+14, see dhcpd sources */
653 *e++ = 150;
654
655 *e++ = 255; /* End of the list */
656}
657
658
659/*
660 * Initialize the DHCP/BOOTP mechanism.
661 */
662static inline void ic_bootp_init(void)
663{
664 int i;
665
666 for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
667 ic_nameservers[i] = INADDR_NONE;
668
669 dev_add_pack(&bootp_packet_type);
670}
671
672
673/*
674 * DHCP/BOOTP cleanup.
675 */
676static inline void ic_bootp_cleanup(void)
677{
678 dev_remove_pack(&bootp_packet_type);
679}
680
681
682/*
683 * Send DHCP/BOOTP request to single interface.
684 */
685static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_diff)
686{
687 struct net_device *dev = d->dev;
688 struct sk_buff *skb;
689 struct bootp_pkt *b;
690 int hh_len = LL_RESERVED_SPACE(dev);
691 struct iphdr *h;
692
693 /* Allocate packet */
694 skb = alloc_skb(sizeof(struct bootp_pkt) + hh_len + 15, GFP_KERNEL);
695 if (!skb)
696 return;
697 skb_reserve(skb, hh_len);
698 b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt));
699 memset(b, 0, sizeof(struct bootp_pkt));
700
701 /* Construct IP header */
702 skb->nh.iph = h = &b->iph;
703 h->version = 4;
704 h->ihl = 5;
705 h->tot_len = htons(sizeof(struct bootp_pkt));
706 h->frag_off = htons(IP_DF);
707 h->ttl = 64;
708 h->protocol = IPPROTO_UDP;
709 h->daddr = INADDR_BROADCAST;
710 h->check = ip_fast_csum((unsigned char *) h, h->ihl);
711
712 /* Construct UDP header */
713 b->udph.source = htons(68);
714 b->udph.dest = htons(67);
715 b->udph.len = htons(sizeof(struct bootp_pkt) - sizeof(struct iphdr));
716 /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
717
718 /* Construct DHCP/BOOTP header */
719 b->op = BOOTP_REQUEST;
720 if (dev->type < 256) /* check for false types */
721 b->htype = dev->type;
722 else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */
723 b->htype = ARPHRD_IEEE802;
724 else if (dev->type == ARPHRD_FDDI)
725 b->htype = ARPHRD_ETHER;
726 else {
727 printk("Unknown ARP type 0x%04x for device %s\n", dev->type, dev->name);
728 b->htype = dev->type; /* can cause undefined behavior */
729 }
730 b->hlen = dev->addr_len;
731 b->your_ip = INADDR_NONE;
732 b->server_ip = INADDR_NONE;
733 memcpy(b->hw_addr, dev->dev_addr, dev->addr_len);
734 b->secs = htons(jiffies_diff / HZ);
735 b->xid = d->xid;
736
737 /* add DHCP options or BOOTP extensions */
738#ifdef IPCONFIG_DHCP
739 if (ic_proto_enabled & IC_USE_DHCP)
740 ic_dhcp_init_options(b->exten);
741 else
742#endif
743 ic_bootp_init_ext(b->exten);
744
745 /* Chain packet down the line... */
746 skb->dev = dev;
747 skb->protocol = htons(ETH_P_IP);
748 if ((dev->hard_header &&
749 dev->hard_header(skb, dev, ntohs(skb->protocol), dev->broadcast, dev->dev_addr, skb->len) < 0) ||
750 dev_queue_xmit(skb) < 0)
751 printk("E");
752}
753
754
755/*
756 * Copy BOOTP-supplied string if not already set.
757 */
758static int __init ic_bootp_string(char *dest, char *src, int len, int max)
759{
760 if (!len)
761 return 0;
762 if (len > max-1)
763 len = max-1;
764 memcpy(dest, src, len);
765 dest[len] = '\0';
766 return 1;
767}
768
769
770/*
771 * Process BOOTP extensions.
772 */
773static void __init ic_do_bootp_ext(u8 *ext)
774{
775 u8 servers;
776 int i;
777
778#ifdef IPCONFIG_DEBUG
779 u8 *c;
780
781 printk("DHCP/BOOTP: Got extension %d:",*ext);
782 for(c=ext+2; c<ext+2+ext[1]; c++)
783 printk(" %02x", *c);
784 printk("\n");
785#endif
786
787 switch (*ext++) {
788 case 1: /* Subnet mask */
789 if (ic_netmask == INADDR_NONE)
790 memcpy(&ic_netmask, ext+1, 4);
791 break;
792 case 3: /* Default gateway */
793 if (ic_gateway == INADDR_NONE)
794 memcpy(&ic_gateway, ext+1, 4);
795 break;
796 case 6: /* DNS server */
797 servers= *ext/4;
798 if (servers > CONF_NAMESERVERS_MAX)
799 servers = CONF_NAMESERVERS_MAX;
800 for (i = 0; i < servers; i++) {
801 if (ic_nameservers[i] == INADDR_NONE)
802 memcpy(&ic_nameservers[i], ext+1+4*i, 4);
803 }
804 break;
805 case 12: /* Host name */
806 ic_bootp_string(system_utsname.nodename, ext+1, *ext, __NEW_UTS_LEN);
807 ic_host_name_set = 1;
808 break;
809 case 15: /* Domain name (DNS) */
810 ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain));
811 break;
812 case 17: /* Root path */
813 if (!root_server_path[0])
814 ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path));
815 break;
816 case 40: /* NIS Domain name (_not_ DNS) */
817 ic_bootp_string(system_utsname.domainname, ext+1, *ext, __NEW_UTS_LEN);
818 break;
819 }
820}
821
822
823/*
824 * Receive BOOTP reply.
825 */
826static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
827{
828 struct bootp_pkt *b;
829 struct iphdr *h;
830 struct ic_device *d;
831 int len, ext_len;
832
833 /* Perform verifications before taking the lock. */
834 if (skb->pkt_type == PACKET_OTHERHOST)
835 goto drop;
836
837 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
838 return NET_RX_DROP;
839
840 if (!pskb_may_pull(skb,
841 sizeof(struct iphdr) +
842 sizeof(struct udphdr)))
843 goto drop;
844
845 b = (struct bootp_pkt *) skb->nh.iph;
846 h = &b->iph;
847
848 if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
849 goto drop;
850
851 /* Fragments are not supported */
852 if (h->frag_off & htons(IP_OFFSET | IP_MF)) {
853 if (net_ratelimit())
854 printk(KERN_ERR "DHCP/BOOTP: Ignoring fragmented "
855 "reply.\n");
856 goto drop;
857 }
858
859 if (skb->len < ntohs(h->tot_len))
860 goto drop;
861
862 if (ip_fast_csum((char *) h, h->ihl))
863 goto drop;
864
865 if (b->udph.source != htons(67) || b->udph.dest != htons(68))
866 goto drop;
867
868 if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr))
869 goto drop;
870
871 len = ntohs(b->udph.len) - sizeof(struct udphdr);
872 ext_len = len - (sizeof(*b) -
873 sizeof(struct iphdr) -
874 sizeof(struct udphdr) -
875 sizeof(b->exten));
876 if (ext_len < 0)
877 goto drop;
878
879 /* Ok the front looks good, make sure we can get at the rest. */
880 if (!pskb_may_pull(skb, skb->len))
881 goto drop;
882
883 b = (struct bootp_pkt *) skb->nh.iph;
884 h = &b->iph;
885
886 /* One reply at a time, please. */
887 spin_lock(&ic_recv_lock);
888
889 /* If we already have a reply, just drop the packet */
890 if (ic_got_reply)
891 goto drop_unlock;
892
893 /* Find the ic_device that the packet arrived on */
894 d = ic_first_dev;
895 while (d && d->dev != dev)
896 d = d->next;
897 if (!d)
898 goto drop_unlock; /* should never happen */
899
900 /* Is it a reply to our BOOTP request? */
901 if (b->op != BOOTP_REPLY ||
902 b->xid != d->xid) {
903 if (net_ratelimit())
904 printk(KERN_ERR "DHCP/BOOTP: Reply not for us, "
905 "op[%x] xid[%x]\n",
906 b->op, b->xid);
907 goto drop_unlock;
908 }
909
910 /* Parse extensions */
911 if (ext_len >= 4 &&
912 !memcmp(b->exten, ic_bootp_cookie, 4)) { /* Check magic cookie */
913 u8 *end = (u8 *) b + ntohs(b->iph.tot_len);
914 u8 *ext;
915
916#ifdef IPCONFIG_DHCP
917 if (ic_proto_enabled & IC_USE_DHCP) {
918 u32 server_id = INADDR_NONE;
919 int mt = 0;
920
921 ext = &b->exten[4];
922 while (ext < end && *ext != 0xff) {
923 u8 *opt = ext++;
924 if (*opt == 0) /* Padding */
925 continue;
926 ext += *ext + 1;
927 if (ext >= end)
928 break;
929 switch (*opt) {
930 case 53: /* Message type */
931 if (opt[1])
932 mt = opt[2];
933 break;
934 case 54: /* Server ID (IP address) */
935 if (opt[1] >= 4)
936 memcpy(&server_id, opt + 2, 4);
937 break;
938 };
939 }
940
941#ifdef IPCONFIG_DEBUG
942 printk("DHCP: Got message type %d\n", mt);
943#endif
944
945 switch (mt) {
946 case DHCPOFFER:
947 /* While in the process of accepting one offer,
948 * ignore all others.
949 */
950 if (ic_myaddr != INADDR_NONE)
951 goto drop_unlock;
952
953 /* Let's accept that offer. */
954 ic_myaddr = b->your_ip;
955 ic_servaddr = server_id;
956#ifdef IPCONFIG_DEBUG
957 printk("DHCP: Offered address %u.%u.%u.%u",
958 NIPQUAD(ic_myaddr));
959 printk(" by server %u.%u.%u.%u\n",
960 NIPQUAD(ic_servaddr));
961#endif
962 /* The DHCP indicated server address takes
963 * precedence over the bootp header one if
964 * they are different.
965 */
966 if ((server_id != INADDR_NONE) &&
967 (b->server_ip != server_id))
968 b->server_ip = ic_servaddr;
969 break;
970
971 case DHCPACK:
972 if (memcmp(dev->dev_addr, b->hw_addr, dev->addr_len) != 0)
973 goto drop_unlock;
974
975 /* Yeah! */
976 break;
977
978 default:
979 /* Urque. Forget it*/
980 ic_myaddr = INADDR_NONE;
981 ic_servaddr = INADDR_NONE;
982 goto drop_unlock;
983 };
984
985 ic_dhcp_msgtype = mt;
986
987 }
988#endif /* IPCONFIG_DHCP */
989
990 ext = &b->exten[4];
991 while (ext < end && *ext != 0xff) {
992 u8 *opt = ext++;
993 if (*opt == 0) /* Padding */
994 continue;
995 ext += *ext + 1;
996 if (ext < end)
997 ic_do_bootp_ext(opt);
998 }
999 }
1000
1001 /* We have a winner! */
1002 ic_dev = dev;
1003 ic_myaddr = b->your_ip;
1004 ic_servaddr = b->server_ip;
1005 if (ic_gateway == INADDR_NONE && b->relay_ip)
1006 ic_gateway = b->relay_ip;
1007 if (ic_nameservers[0] == INADDR_NONE)
1008 ic_nameservers[0] = ic_servaddr;
1009 ic_got_reply = IC_BOOTP;
1010
1011drop_unlock:
1012 /* Show's over. Nothing to see here. */
1013 spin_unlock(&ic_recv_lock);
1014
1015drop:
1016 /* Throw the packet out. */
1017 kfree_skb(skb);
1018
1019 return 0;
1020}
1021
1022
1023#endif
1024
1025
1026/*
1027 * Dynamic IP configuration -- DHCP, BOOTP, RARP.
1028 */
1029
1030#ifdef IPCONFIG_DYNAMIC
1031
1032static int __init ic_dynamic(void)
1033{
1034 int retries;
1035 struct ic_device *d;
1036 unsigned long start_jiffies, timeout, jiff;
1037 int do_bootp = ic_proto_have_if & IC_BOOTP;
1038 int do_rarp = ic_proto_have_if & IC_RARP;
1039
1040 /*
1041 * If none of DHCP/BOOTP/RARP was selected, return with an error.
1042 * This routine gets only called when some pieces of information
1043 * are missing, and without DHCP/BOOTP/RARP we are unable to get it.
1044 */
1045 if (!ic_proto_enabled) {
1046 printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
1047 return -1;
1048 }
1049
1050#ifdef IPCONFIG_BOOTP
1051 if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP)
1052 printk(KERN_ERR "DHCP/BOOTP: No suitable device found.\n");
1053#endif
1054#ifdef IPCONFIG_RARP
1055 if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP)
1056 printk(KERN_ERR "RARP: No suitable device found.\n");
1057#endif
1058
1059 if (!ic_proto_have_if)
1060 /* Error message already printed */
1061 return -1;
1062
1063 /*
1064 * Setup protocols
1065 */
1066#ifdef IPCONFIG_BOOTP
1067 if (do_bootp)
1068 ic_bootp_init();
1069#endif
1070#ifdef IPCONFIG_RARP
1071 if (do_rarp)
1072 ic_rarp_init();
1073#endif
1074
1075 /*
1076 * Send requests and wait, until we get an answer. This loop
1077 * seems to be a terrible waste of CPU time, but actually there is
1078 * only one process running at all, so we don't need to use any
1079 * scheduler functions.
1080 * [Actually we could now, but the nothing else running note still
1081 * applies.. - AC]
1082 */
1083 printk(KERN_NOTICE "Sending %s%s%s requests .",
1084 do_bootp
1085 ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "",
1086 (do_bootp && do_rarp) ? " and " : "",
1087 do_rarp ? "RARP" : "");
1088
1089 start_jiffies = jiffies;
1090 d = ic_first_dev;
1091 retries = CONF_SEND_RETRIES;
1092 get_random_bytes(&timeout, sizeof(timeout));
1093 timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM);
1094 for(;;) {
1095#ifdef IPCONFIG_BOOTP
1096 if (do_bootp && (d->able & IC_BOOTP))
1097 ic_bootp_send_if(d, jiffies - start_jiffies);
1098#endif
1099#ifdef IPCONFIG_RARP
1100 if (do_rarp && (d->able & IC_RARP))
1101 ic_rarp_send_if(d);
1102#endif
1103
1104 jiff = jiffies + (d->next ? CONF_INTER_TIMEOUT : timeout);
1105 while (time_before(jiffies, jiff) && !ic_got_reply) {
1106 set_current_state(TASK_UNINTERRUPTIBLE);
1107 schedule_timeout(1);
1108 }
1109#ifdef IPCONFIG_DHCP
1110 /* DHCP isn't done until we get a DHCPACK. */
1111 if ((ic_got_reply & IC_BOOTP)
1112 && (ic_proto_enabled & IC_USE_DHCP)
1113 && ic_dhcp_msgtype != DHCPACK)
1114 {
1115 ic_got_reply = 0;
1116 printk(",");
1117 continue;
1118 }
1119#endif /* IPCONFIG_DHCP */
1120
1121 if (ic_got_reply) {
1122 printk(" OK\n");
1123 break;
1124 }
1125
1126 if ((d = d->next))
1127 continue;
1128
1129 if (! --retries) {
1130 printk(" timed out!\n");
1131 break;
1132 }
1133
1134 d = ic_first_dev;
1135
1136 timeout = timeout CONF_TIMEOUT_MULT;
1137 if (timeout > CONF_TIMEOUT_MAX)
1138 timeout = CONF_TIMEOUT_MAX;
1139
1140 printk(".");
1141 }
1142
1143#ifdef IPCONFIG_BOOTP
1144 if (do_bootp)
1145 ic_bootp_cleanup();
1146#endif
1147#ifdef IPCONFIG_RARP
1148 if (do_rarp)
1149 ic_rarp_cleanup();
1150#endif
1151
1152 if (!ic_got_reply)
1153 return -1;
1154
1155 printk("IP-Config: Got %s answer from %u.%u.%u.%u, ",
1156 ((ic_got_reply & IC_RARP) ? "RARP"
1157 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
1158 NIPQUAD(ic_servaddr));
1159 printk("my address is %u.%u.%u.%u\n", NIPQUAD(ic_myaddr));
1160
1161 return 0;
1162}
1163
1164#endif /* IPCONFIG_DYNAMIC */
1165
1166#ifdef CONFIG_PROC_FS
1167
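/* The resulting /proc/net/pnp reads like a resolv.conf fragment; with
 * hypothetical values it might look like:
 *
 *	#PROTO: DHCP
 *	domain example.org
 *	nameserver 192.168.1.1
 *	bootserver 192.168.1.2
 */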
1168static int pnp_seq_show(struct seq_file *seq, void *v)
1169{
1170 int i;
1171
1172 if (ic_proto_used & IC_PROTO)
1173 seq_printf(seq, "#PROTO: %s\n",
1174 (ic_proto_used & IC_RARP) ? "RARP"
1175 : (ic_proto_used & IC_USE_DHCP) ? "DHCP" : "BOOTP");
1176 else
1177 seq_puts(seq, "#MANUAL\n");
1178
1179 if (ic_domain[0])
1180 seq_printf(seq,
1181 "domain %s\n", ic_domain);
1182 for (i = 0; i < CONF_NAMESERVERS_MAX; i++) {
1183 if (ic_nameservers[i] != INADDR_NONE)
1184 seq_printf(seq,
1185 "nameserver %u.%u.%u.%u\n",
1186 NIPQUAD(ic_nameservers[i]));
1187 }
1188 if (ic_servaddr != INADDR_NONE)
1189 seq_printf(seq,
1190 "bootserver %u.%u.%u.%u\n",
1191 NIPQUAD(ic_servaddr));
1192 return 0;
1193}
1194
1195static int pnp_seq_open(struct inode *inode, struct file *file)
1196{
1197 return single_open(file, pnp_seq_show, NULL);
1198}
1199
1200static struct file_operations pnp_seq_fops = {
1201 .owner = THIS_MODULE,
1202 .open = pnp_seq_open,
1203 .read = seq_read,
1204 .llseek = seq_lseek,
1205 .release = single_release,
1206};
1207#endif /* CONFIG_PROC_FS */
1208
1209/*
1210 * Extract IP address from the parameter string if needed. Note that we
1211 * need to have root_server_addr set _before_ IPConfig gets called as it
1212 * can override it.
1213 */
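/* For example (hypothetical input): given name = "10.0.0.1:/export/root" the
 * function returns the address 10.0.0.1 and rewrites name in place to
 * "/export/root"; if no dotted-quad prefix is found it returns INADDR_NONE
 * and leaves name untouched.
 */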
1214u32 __init root_nfs_parse_addr(char *name)
1215{
1216 u32 addr;
1217 int octets = 0;
1218 char *cp, *cq;
1219
1220 cp = cq = name;
1221 while (octets < 4) {
1222 while (*cp >= '0' && *cp <= '9')
1223 cp++;
1224 if (cp == cq || cp - cq > 3)
1225 break;
1226 if (*cp == '.' || octets == 3)
1227 octets++;
1228 if (octets < 4)
1229 cp++;
1230 cq = cp;
1231 }
1232 if (octets == 4 && (*cp == ':' || *cp == '\0')) {
1233 if (*cp == ':')
1234 *cp++ = '\0';
1235 addr = in_aton(name);
1236 memmove(name, cp, strlen(cp) + 1);
1237 } else
1238 addr = INADDR_NONE;
1239
1240 return addr;
1241}
1242
1243/*
1244 * IP Autoconfig dispatcher.
1245 */
1246
1247static int __init ip_auto_config(void)
1248{
1249 u32 addr;
1250
1251#ifdef CONFIG_PROC_FS
1252 proc_net_fops_create("pnp", S_IRUGO, &pnp_seq_fops);
1253#endif /* CONFIG_PROC_FS */
1254
1255 if (!ic_enable)
1256 return 0;
1257
1258 DBG(("IP-Config: Entered.\n"));
1259#ifdef IPCONFIG_DYNAMIC
1260 try_try_again:
1261#endif
1262 /* Give hardware a chance to settle */
1263 msleep(CONF_PRE_OPEN);
1264
1265 /* Setup all network devices */
1266 if (ic_open_devs() < 0)
1267 return -1;
1268
1269 /* Give drivers a chance to settle */
1270 ssleep(CONF_POST_OPEN);
1271
1272 /*
1273 * If the config information is insufficient (e.g., our IP address or
1274 * IP address of the boot server is missing or we have multiple network
1275 * interfaces and no default was set), use BOOTP or RARP to get the
1276 * missing values.
1277 */
1278 if (ic_myaddr == INADDR_NONE ||
1279#ifdef CONFIG_ROOT_NFS
1280 (MAJOR(ROOT_DEV) == UNNAMED_MAJOR
1281 && root_server_addr == INADDR_NONE
1282 && ic_servaddr == INADDR_NONE) ||
1283#endif
1284 ic_first_dev->next) {
1285#ifdef IPCONFIG_DYNAMIC
1286
1287 int retries = CONF_OPEN_RETRIES;
1288
1289 if (ic_dynamic() < 0) {
1290 ic_close_devs();
1291
1292 /*
1293 * I don't know why, but sometimes the
1294 * eepro100 driver (at least) gets upset and
1295 * doesn't work the first time it's opened.
1296 * But then if you close it and reopen it, it
1297 * works just fine. So we need to try that at
1298 * least once before giving up.
1299 *
1300 * Also, if the root will be NFS-mounted, we
1301 * have nowhere to go if DHCP fails. So we
1302 * just have to keep trying forever.
1303 *
1304 * -- Chip
1305 */
1306#ifdef CONFIG_ROOT_NFS
1307 if (ROOT_DEV == Root_NFS) {
1308 printk(KERN_ERR
1309 "IP-Config: Retrying forever (NFS root)...\n");
1310 goto try_try_again;
1311 }
1312#endif
1313
1314 if (--retries) {
1315 printk(KERN_ERR
1316 "IP-Config: Reopening network devices...\n");
1317 goto try_try_again;
1318 }
1319
1320 /* Oh, well. At least we tried. */
1321 printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n");
1322 return -1;
1323 }
1324#else /* !DYNAMIC */
1325 printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
1326 ic_close_devs();
1327 return -1;
1328#endif /* IPCONFIG_DYNAMIC */
1329 } else {
1330 /* Device selected manually or only one device -> use it */
1331 ic_dev = ic_first_dev->dev;
1332 }
1333
1334 addr = root_nfs_parse_addr(root_server_path);
1335 if (root_server_addr == INADDR_NONE)
1336 root_server_addr = addr;
1337
1338 /*
1339 * Use defaults wherever applicable.
1340 */
1341 if (ic_defaults() < 0)
1342 return -1;
1343
1344 /*
1345 * Close all network devices except the device we've
1346 * autoconfigured and set up routes.
1347 */
1348 ic_close_devs();
1349 if (ic_setup_if() < 0 || ic_setup_routes() < 0)
1350 return -1;
1351
1352 /*
1353 * Record which protocol was actually used.
1354 */
1355#ifdef IPCONFIG_DYNAMIC
1356 ic_proto_used = ic_got_reply | (ic_proto_enabled & IC_USE_DHCP);
1357#endif
1358
1359#ifndef IPCONFIG_SILENT
1360 /*
1361 * Clue in the operator.
1362 */
1363 printk("IP-Config: Complete:");
1364 printk("\n device=%s", ic_dev->name);
1365 printk(", addr=%u.%u.%u.%u", NIPQUAD(ic_myaddr));
1366 printk(", mask=%u.%u.%u.%u", NIPQUAD(ic_netmask));
1367 printk(", gw=%u.%u.%u.%u", NIPQUAD(ic_gateway));
1368 printk(",\n host=%s, domain=%s, nis-domain=%s",
1369 system_utsname.nodename, ic_domain, system_utsname.domainname);
1370 printk(",\n bootserver=%u.%u.%u.%u", NIPQUAD(ic_servaddr));
1371 printk(", rootserver=%u.%u.%u.%u", NIPQUAD(root_server_addr));
1372 printk(", rootpath=%s", root_server_path);
1373 printk("\n");
1374#endif /* !SILENT */
1375
1376 return 0;
1377}
1378
1379late_initcall(ip_auto_config);
1380
1381
1382/*
1383 * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel
1384 * command line parameter. It consists of option fields separated by colons in
1385 * the following order:
1386 *
1387 * <client-ip>:<server-ip>:<gw-ip>:<netmask>:<host name>:<device>:<PROTO>
1388 *
1389 * Any of the fields can be empty which means to use a default value:
1390 * <client-ip> - address given by BOOTP or RARP
1391 * <server-ip> - address of host returning BOOTP or RARP packet
1392 * <gw-ip> - none, or the address returned by BOOTP
1393 * <netmask> - automatically determined from <client-ip>, or the
1394 * one returned by BOOTP
1395 * <host name> - <client-ip> in ASCII notation, or the name returned
1396 * by BOOTP
1397 * <device> - use all available devices
1398 * <PROTO>:
1399 * off|none - don't do autoconfig at all (DEFAULT)
1400 * on|any - use any configured protocol
1401 * dhcp|bootp|rarp - use only the specified protocol
1402 * both - use both BOOTP and RARP (not DHCP)
1403 */
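/* Examples (hypothetical addresses): a fully static configuration with
 * autoconfiguration disabled,
 *
 *	ip=192.168.1.10:192.168.1.2:192.168.1.1:255.255.255.0:client:eth0:off
 *
 * and, assuming DHCP support is compiled in, a configuration that lets DHCP
 * fill in everything on whichever interface answers first:
 *
 *	ip=dhcp
 */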
1404static int __init ic_proto_name(char *name)
1405{
1406 if (!strcmp(name, "on") || !strcmp(name, "any")) {
1407 return 1;
1408 }
1409#ifdef CONFIG_IP_PNP_DHCP
1410 else if (!strcmp(name, "dhcp")) {
1411 ic_proto_enabled &= ~IC_RARP;
1412 return 1;
1413 }
1414#endif
1415#ifdef CONFIG_IP_PNP_BOOTP
1416 else if (!strcmp(name, "bootp")) {
1417 ic_proto_enabled &= ~(IC_RARP | IC_USE_DHCP);
1418 return 1;
1419 }
1420#endif
1421#ifdef CONFIG_IP_PNP_RARP
1422 else if (!strcmp(name, "rarp")) {
1423 ic_proto_enabled &= ~(IC_BOOTP | IC_USE_DHCP);
1424 return 1;
1425 }
1426#endif
1427#ifdef IPCONFIG_DYNAMIC
1428 else if (!strcmp(name, "both")) {
1429 ic_proto_enabled &= ~IC_USE_DHCP; /* backward compat :-( */
1430 return 1;
1431 }
1432#endif
1433 return 0;
1434}
1435
1436static int __init ip_auto_config_setup(char *addrs)
1437{
1438 char *cp, *ip, *dp;
1439 int num = 0;
1440
1441 ic_set_manually = 1;
1442
1443 ic_enable = (*addrs &&
1444 (strcmp(addrs, "off") != 0) &&
1445 (strcmp(addrs, "none") != 0));
1446 if (!ic_enable)
1447 return 1;
1448
1449 if (ic_proto_name(addrs))
1450 return 1;
1451
1452 /* Parse the whole string */
1453 ip = addrs;
1454 while (ip && *ip) {
1455 if ((cp = strchr(ip, ':')))
1456 *cp++ = '\0';
1457 if (strlen(ip) > 0) {
1458 DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip));
1459 switch (num) {
1460 case 0:
1461 if ((ic_myaddr = in_aton(ip)) == INADDR_ANY)
1462 ic_myaddr = INADDR_NONE;
1463 break;
1464 case 1:
1465 if ((ic_servaddr = in_aton(ip)) == INADDR_ANY)
1466 ic_servaddr = INADDR_NONE;
1467 break;
1468 case 2:
1469 if ((ic_gateway = in_aton(ip)) == INADDR_ANY)
1470 ic_gateway = INADDR_NONE;
1471 break;
1472 case 3:
1473 if ((ic_netmask = in_aton(ip)) == INADDR_ANY)
1474 ic_netmask = INADDR_NONE;
1475 break;
1476 case 4:
1477 if ((dp = strchr(ip, '.'))) {
1478 *dp++ = '\0';
1479 strlcpy(system_utsname.domainname, dp,
1480 sizeof(system_utsname.domainname));
1481 }
1482 strlcpy(system_utsname.nodename, ip,
1483 sizeof(system_utsname.nodename));
1484 ic_host_name_set = 1;
1485 break;
1486 case 5:
1487 strlcpy(user_dev_name, ip, sizeof(user_dev_name));
1488 break;
1489 case 6:
1490 ic_proto_name(ip);
1491 break;
1492 }
1493 }
1494 ip = cp;
1495 num++;
1496 }
1497
1498 return 1;
1499}
1500
1501static int __init nfsaddrs_config_setup(char *addrs)
1502{
1503 return ip_auto_config_setup(addrs);
1504}
1505
1506__setup("ip=", ip_auto_config_setup);
1507__setup("nfsaddrs=", nfsaddrs_config_setup);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
new file mode 100644
index 000000000000..68a78731f722
--- /dev/null
+++ b/net/ipv4/ipip.c
@@ -0,0 +1,905 @@
1/*
2 * Linux NET3: IP/IP protocol decoder.
3 *
4 * Version: $Id: ipip.c,v 1.50 2001/10/02 02:22:36 davem Exp $
5 *
6 * Authors:
7 * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
8 *
9 * Fixes:
10 * Alan Cox : Merged and made usable non modular (its so tiny its silly as
11 * a module taking up 2 pages).
12 * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
13 * to keep ip_forward happy.
14 * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
15 * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL
16 * David Woodhouse : Perform some basic ICMP handling.
17 * IPIP Routing without decapsulation.
18 * Carlos Picoto : GRE over IP support
19 * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
20 * I do not want to merge them together.
21 *
22 * This program is free software; you can redistribute it and/or
23 * modify it under the terms of the GNU General Public License
24 * as published by the Free Software Foundation; either version
25 * 2 of the License, or (at your option) any later version.
26 *
27 */
28
29/* tunnel.c: an IP tunnel driver
30
31 The purpose of this driver is to provide an IP tunnel through
32 which you can tunnel network traffic transparently across subnets.
33
34 This was written by looking at Nick Holloway's dummy driver
35 Thanks for the great code!
36
37 -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
38
39 Minor tweaks:
40 Cleaned up the code a little and added some pre-1.3.0 tweaks.
41 dev->hard_header/hard_header_len changed to use no headers.
42 Comments/bracketing tweaked.
43 Made the tunnels use dev->name not tunnel: when error reporting.
44 Added tx_dropped stat
45
46 -Alan Cox (Alan.Cox@linux.org) 21 March 95
47
48 Reworked:
49 Changed to tunnel to destination gateway in addition to the
50 tunnel's pointopoint address
51 Almost completely rewritten
52 Note: There is currently no firewall or ICMP handling done.
53
54 -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96
55
56*/
57
58/* Things I wish I had known when writing the tunnel driver:
59
60 When the tunnel_xmit() function is called, the skb contains the
61 packet to be sent (plus a great deal of extra info), and dev
62 contains the tunnel device that _we_ are.
63
64 When we are passed a packet, we are expected to fill in the
65 source address with our source IP address.
66
67 What is the proper way to allocate, copy and free a buffer?
68 After you allocate it, it is a "0 length" chunk of memory
69 starting at zero. If you want to add headers to the buffer
70 later, you'll have to call "skb_reserve(skb, amount)" with
71 the amount of memory you want reserved. Then, you call
72 "skb_put(skb, amount)" with the amount of space you want in
73 the buffer. skb_put() returns a pointer to the top (#0) of
74 that buffer. skb->len is set to the amount of space you have
75 "allocated" with skb_put(). You can then write up to skb->len
76 bytes to that buffer. If you need more, you can call skb_put()
77 again with the additional amount of space you need. You can
78 find out how much more space you can allocate by calling
79 "skb_tailroom(skb)".
80 Now, to add header space, call "skb_push(skb, header_len)".
81 This creates space at the beginning of the buffer and returns
82 a pointer to this new space. If later you need to strip a
83 header from a buffer, call "skb_pull(skb, header_len)".
84 skb_headroom() will return how much space is left at the top
85 of the buffer (before the main data). Remember, this headroom
86 space must be reserved before the skb_put() function is called.
87 */
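/* Illustrative sketch (added in this edit, not part of the original file):
 * the calls described above, in the order a transmit-style path would use
 * them.  The "payload"/"payload_len" names and the headroom size are
 * hypothetical.
 *
 *	skb = alloc_skb(hdr_room + payload_len, GFP_ATOMIC);
 *	skb_reserve(skb, hdr_room);                          reserve headroom
 *	memcpy(skb_put(skb, payload_len), payload, payload_len);
 *	                                                     append payload data
 *	skb_push(skb, sizeof(struct iphdr));                 prepend a header
 *	skb_pull(skb, sizeof(struct iphdr));                 strip it again
 *
 * At any point skb_headroom()/skb_tailroom() report how much room remains
 * at the front and back of the buffer.
 */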
88
89/*
90 This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c
91
92 For comments look at net/ipv4/ip_gre.c --ANK
93 */
94
95
96#include <linux/config.h>
97#include <linux/module.h>
98#include <linux/types.h>
99#include <linux/sched.h>
100#include <linux/kernel.h>
101#include <asm/uaccess.h>
102#include <linux/skbuff.h>
103#include <linux/netdevice.h>
104#include <linux/in.h>
105#include <linux/tcp.h>
106#include <linux/udp.h>
107#include <linux/if_arp.h>
108#include <linux/mroute.h>
109#include <linux/init.h>
110#include <linux/netfilter_ipv4.h>
111
112#include <net/sock.h>
113#include <net/ip.h>
114#include <net/icmp.h>
115#include <net/protocol.h>
116#include <net/ipip.h>
117#include <net/inet_ecn.h>
118#include <net/xfrm.h>
119
120#define HASH_SIZE 16
121#define HASH(addr) ((addr^(addr>>4))&0xF)
122
123static int ipip_fb_tunnel_init(struct net_device *dev);
124static int ipip_tunnel_init(struct net_device *dev);
125static void ipip_tunnel_setup(struct net_device *dev);
126
127static struct net_device *ipip_fb_tunnel_dev;
128
129static struct ip_tunnel *tunnels_r_l[HASH_SIZE];
130static struct ip_tunnel *tunnels_r[HASH_SIZE];
131static struct ip_tunnel *tunnels_l[HASH_SIZE];
132static struct ip_tunnel *tunnels_wc[1];
133static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l };
134
135static DEFINE_RWLOCK(ipip_lock);
136
137static struct ip_tunnel * ipip_tunnel_lookup(u32 remote, u32 local)
138{
139 unsigned h0 = HASH(remote);
140 unsigned h1 = HASH(local);
141 struct ip_tunnel *t;
142
143 for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
144 if (local == t->parms.iph.saddr &&
145 remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
146 return t;
147 }
148 for (t = tunnels_r[h0]; t; t = t->next) {
149 if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
150 return t;
151 }
152 for (t = tunnels_l[h1]; t; t = t->next) {
153 if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
154 return t;
155 }
156 if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP))
157 return t;
158 return NULL;
159}
160
161static struct ip_tunnel **ipip_bucket(struct ip_tunnel *t)
162{
163 u32 remote = t->parms.iph.daddr;
164 u32 local = t->parms.iph.saddr;
165 unsigned h = 0;
166 int prio = 0;
167
168 if (remote) {
169 prio |= 2;
170 h ^= HASH(remote);
171 }
172 if (local) {
173 prio |= 1;
174 h ^= HASH(local);
175 }
176 return &tunnels[prio][h];
177}
178
179
180static void ipip_tunnel_unlink(struct ip_tunnel *t)
181{
182 struct ip_tunnel **tp;
183
184 for (tp = ipip_bucket(t); *tp; tp = &(*tp)->next) {
185 if (t == *tp) {
186 write_lock_bh(&ipip_lock);
187 *tp = t->next;
188 write_unlock_bh(&ipip_lock);
189 break;
190 }
191 }
192}
193
194static void ipip_tunnel_link(struct ip_tunnel *t)
195{
196 struct ip_tunnel **tp = ipip_bucket(t);
197
198 t->next = *tp;
199 write_lock_bh(&ipip_lock);
200 *tp = t;
201 write_unlock_bh(&ipip_lock);
202}
203
204static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create)
205{
206 u32 remote = parms->iph.daddr;
207 u32 local = parms->iph.saddr;
208 struct ip_tunnel *t, **tp, *nt;
209 struct net_device *dev;
210 unsigned h = 0;
211 int prio = 0;
212 char name[IFNAMSIZ];
213
214 if (remote) {
215 prio |= 2;
216 h ^= HASH(remote);
217 }
218 if (local) {
219 prio |= 1;
220 h ^= HASH(local);
221 }
222 for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
223 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
224 return t;
225 }
226 if (!create)
227 return NULL;
228
229 if (parms->name[0])
230 strlcpy(name, parms->name, IFNAMSIZ);
231 else {
232 int i;
233 for (i=1; i<100; i++) {
234 sprintf(name, "tunl%d", i);
235 if (__dev_get_by_name(name) == NULL)
236 break;
237 }
238 if (i==100)
239 goto failed;
240 }
241
242 dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
243 if (dev == NULL)
244 return NULL;
245
246 nt = dev->priv;
247 SET_MODULE_OWNER(dev);
248 dev->init = ipip_tunnel_init;
249 nt->parms = *parms;
250
251 if (register_netdevice(dev) < 0) {
252 free_netdev(dev);
253 goto failed;
254 }
255
256 dev_hold(dev);
257 ipip_tunnel_link(nt);
258 /* Do not decrement MOD_USE_COUNT here. */
259 return nt;
260
261failed:
262 return NULL;
263}
264
265static void ipip_tunnel_uninit(struct net_device *dev)
266{
267 if (dev == ipip_fb_tunnel_dev) {
268 write_lock_bh(&ipip_lock);
269 tunnels_wc[0] = NULL;
270 write_unlock_bh(&ipip_lock);
271 } else
272 ipip_tunnel_unlink((struct ip_tunnel*)dev->priv);
273 dev_put(dev);
274}
275
276static void ipip_err(struct sk_buff *skb, void *__unused)
277{
278#ifndef I_WISH_WORLD_WERE_PERFECT
279
280/* It is not :-( All the routers (except for Linux) return only
281 8 bytes of packet payload. It means, that precise relaying of
282 ICMP in the real Internet is absolutely infeasible.
283 */
284 struct iphdr *iph = (struct iphdr*)skb->data;
285 int type = skb->h.icmph->type;
286 int code = skb->h.icmph->code;
287 struct ip_tunnel *t;
288
289 switch (type) {
290 default:
291 case ICMP_PARAMETERPROB:
292 return;
293
294 case ICMP_DEST_UNREACH:
295 switch (code) {
296 case ICMP_SR_FAILED:
297 case ICMP_PORT_UNREACH:
298 /* Impossible event. */
299 return;
300 case ICMP_FRAG_NEEDED:
301 /* Soft state for pmtu is maintained by IP core. */
302 return;
303 default:
304 /* All others are translated to HOST_UNREACH.
305 rfc2003 contains "deep thoughts" about NET_UNREACH,
306 I believe they are just ether pollution. --ANK
307 */
308 break;
309 }
310 break;
311 case ICMP_TIME_EXCEEDED:
312 if (code != ICMP_EXC_TTL)
313 return;
314 break;
315 }
316
317 read_lock(&ipip_lock);
318 t = ipip_tunnel_lookup(iph->daddr, iph->saddr);
319 if (t == NULL || t->parms.iph.daddr == 0)
320 goto out;
321 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
322 goto out;
323
324 if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
325 t->err_count++;
326 else
327 t->err_count = 1;
328 t->err_time = jiffies;
329out:
330 read_unlock(&ipip_lock);
331 return;
332#else
333	struct iphdr *iph = (struct iphdr*)skb->data;
334 int hlen = iph->ihl<<2;
335 struct iphdr *eiph;
336 int type = skb->h.icmph->type;
337 int code = skb->h.icmph->code;
338 int rel_type = 0;
339 int rel_code = 0;
340 int rel_info = 0;
341 struct sk_buff *skb2;
342 struct flowi fl;
343 struct rtable *rt;
344
345	if (skb->len < hlen + sizeof(struct iphdr))
346 return;
347	eiph = (struct iphdr*)(skb->data + hlen);
348
349 switch (type) {
350 default:
351 return;
352 case ICMP_PARAMETERPROB:
353 if (skb->h.icmph->un.gateway < hlen)
354 return;
355
356 /* So... This guy found something strange INSIDE encapsulated
357		   packet. Well, he is a fool, but what can we do?
358 */
359 rel_type = ICMP_PARAMETERPROB;
360 rel_info = skb->h.icmph->un.gateway - hlen;
361 break;
362
363 case ICMP_DEST_UNREACH:
364 switch (code) {
365 case ICMP_SR_FAILED:
366 case ICMP_PORT_UNREACH:
367 /* Impossible event. */
368 return;
369 case ICMP_FRAG_NEEDED:
370 /* And it is the only really necessary thing :-) */
371 rel_info = ntohs(skb->h.icmph->un.frag.mtu);
372 if (rel_info < hlen+68)
373 return;
374 rel_info -= hlen;
375 /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
376 if (rel_info > ntohs(eiph->tot_len))
377 return;
378 break;
379 default:
380 /* All others are translated to HOST_UNREACH.
381 rfc2003 contains "deep thoughts" about NET_UNREACH,
382 I believe, it is just ether pollution. --ANK
383 */
384 rel_type = ICMP_DEST_UNREACH;
385 rel_code = ICMP_HOST_UNREACH;
386 break;
387 }
388 break;
389 case ICMP_TIME_EXCEEDED:
390 if (code != ICMP_EXC_TTL)
391 return;
392 break;
393 }
394
395 /* Prepare fake skb to feed it to icmp_send */
396 skb2 = skb_clone(skb, GFP_ATOMIC);
397 if (skb2 == NULL)
398 return;
399 dst_release(skb2->dst);
400 skb2->dst = NULL;
401 skb_pull(skb2, skb->data - (u8*)eiph);
402 skb2->nh.raw = skb2->data;
403
404 /* Try to guess incoming interface */
405 memset(&fl, 0, sizeof(fl));
406 fl.fl4_daddr = eiph->saddr;
407 fl.fl4_tos = RT_TOS(eiph->tos);
408 fl.proto = IPPROTO_IPIP;
409	if (ip_route_output_key(&rt, &fl)) {
410 kfree_skb(skb2);
411 return;
412 }
413 skb2->dev = rt->u.dst.dev;
414
415 /* route "incoming" packet */
416 if (rt->rt_flags&RTCF_LOCAL) {
417 ip_rt_put(rt);
418 rt = NULL;
419 fl.fl4_daddr = eiph->daddr;
420 fl.fl4_src = eiph->saddr;
421 fl.fl4_tos = eiph->tos;
422 if (ip_route_output_key(&rt, &fl) ||
423 rt->u.dst.dev->type != ARPHRD_TUNNEL) {
424 ip_rt_put(rt);
425 kfree_skb(skb2);
426 return;
427 }
428 } else {
429 ip_rt_put(rt);
430 if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
431 skb2->dst->dev->type != ARPHRD_TUNNEL) {
432 kfree_skb(skb2);
433 return;
434 }
435 }
436
437 /* change mtu on this route */
438 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
439 if (rel_info > dst_mtu(skb2->dst)) {
440 kfree_skb(skb2);
441 return;
442 }
443 skb2->dst->ops->update_pmtu(skb2->dst, rel_info);
444 rel_info = htonl(rel_info);
445 } else if (type == ICMP_TIME_EXCEEDED) {
446 struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv;
447 if (t->parms.iph.ttl) {
448 rel_type = ICMP_DEST_UNREACH;
449 rel_code = ICMP_HOST_UNREACH;
450 }
451 }
452
453 icmp_send(skb2, rel_type, rel_code, rel_info);
454 kfree_skb(skb2);
455 return;
456#endif
457}
458
459static inline void ipip_ecn_decapsulate(struct iphdr *outer_iph, struct sk_buff *skb)
460{
461 struct iphdr *inner_iph = skb->nh.iph;
462
463 if (INET_ECN_is_ce(outer_iph->tos))
464 IP_ECN_set_ce(inner_iph);
465}
466
467static int ipip_rcv(struct sk_buff *skb)
468{
469 struct iphdr *iph;
470 struct ip_tunnel *tunnel;
471
472 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
473 goto out;
474
475 iph = skb->nh.iph;
476
477 read_lock(&ipip_lock);
478 if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) {
479 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
480 read_unlock(&ipip_lock);
481 kfree_skb(skb);
482 return 0;
483 }
484
485 secpath_reset(skb);
486
487 skb->mac.raw = skb->nh.raw;
488 skb->nh.raw = skb->data;
489 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
490 skb->protocol = htons(ETH_P_IP);
491 skb->pkt_type = PACKET_HOST;
492
493 tunnel->stat.rx_packets++;
494 tunnel->stat.rx_bytes += skb->len;
495 skb->dev = tunnel->dev;
496 dst_release(skb->dst);
497 skb->dst = NULL;
498 nf_reset(skb);
499 ipip_ecn_decapsulate(iph, skb);
500 netif_rx(skb);
501 read_unlock(&ipip_lock);
502 return 0;
503 }
504 read_unlock(&ipip_lock);
505
506out:
507 return -1;
508}
509
510/*
511 * This function assumes it is being called from dev_queue_xmit()
512 * and that skb is filled properly by that function.
513 */
514
515static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
516{
517 struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
518 struct net_device_stats *stats = &tunnel->stat;
519 struct iphdr *tiph = &tunnel->parms.iph;
520 u8 tos = tunnel->parms.iph.tos;
521 u16 df = tiph->frag_off;
522 struct rtable *rt; /* Route to the other host */
523 struct net_device *tdev; /* Device to other host */
524 struct iphdr *old_iph = skb->nh.iph;
525 struct iphdr *iph; /* Our new IP header */
526 int max_headroom; /* The extra header space needed */
527 u32 dst = tiph->daddr;
528 int mtu;
529
530 if (tunnel->recursion++) {
531 tunnel->stat.collisions++;
532 goto tx_error;
533 }
534
535 if (skb->protocol != htons(ETH_P_IP))
536 goto tx_error;
537
538 if (tos&1)
539 tos = old_iph->tos;
540
541 if (!dst) {
542 /* NBMA tunnel */
543 if ((rt = (struct rtable*)skb->dst) == NULL) {
544 tunnel->stat.tx_fifo_errors++;
545 goto tx_error;
546 }
547 if ((dst = rt->rt_gateway) == 0)
548 goto tx_error_icmp;
549 }
550
551 {
552 struct flowi fl = { .oif = tunnel->parms.link,
553 .nl_u = { .ip4_u =
554 { .daddr = dst,
555 .saddr = tiph->saddr,
556 .tos = RT_TOS(tos) } },
557 .proto = IPPROTO_IPIP };
558 if (ip_route_output_key(&rt, &fl)) {
559 tunnel->stat.tx_carrier_errors++;
560 goto tx_error_icmp;
561 }
562 }
563 tdev = rt->u.dst.dev;
564
565 if (tdev == dev) {
566 ip_rt_put(rt);
567 tunnel->stat.collisions++;
568 goto tx_error;
569 }
570
571 if (tiph->frag_off)
572 mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
573 else
574 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
575
576 if (mtu < 68) {
577 tunnel->stat.collisions++;
578 ip_rt_put(rt);
579 goto tx_error;
580 }
581 if (skb->dst)
582 skb->dst->ops->update_pmtu(skb->dst, mtu);
583
584 df |= (old_iph->frag_off&htons(IP_DF));
585
586 if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) {
587 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
588 ip_rt_put(rt);
589 goto tx_error;
590 }
591
592 if (tunnel->err_count > 0) {
593 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
594 tunnel->err_count--;
595 dst_link_failure(skb);
596 } else
597 tunnel->err_count = 0;
598 }
599
600 /*
601 * Okay, now see if we can stuff it in the buffer as-is.
602 */
603 max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr));
604
605 if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
606 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
607 if (!new_skb) {
608 ip_rt_put(rt);
609 stats->tx_dropped++;
610 dev_kfree_skb(skb);
611 tunnel->recursion--;
612 return 0;
613 }
614 if (skb->sk)
615 skb_set_owner_w(new_skb, skb->sk);
616 dev_kfree_skb(skb);
617 skb = new_skb;
618 old_iph = skb->nh.iph;
619 }
620
621 skb->h.raw = skb->nh.raw;
622 skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
623 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
624 dst_release(skb->dst);
625 skb->dst = &rt->u.dst;
626
627 /*
628 * Push down and install the IPIP header.
629 */
630
631 iph = skb->nh.iph;
632 iph->version = 4;
633 iph->ihl = sizeof(struct iphdr)>>2;
634 iph->frag_off = df;
635 iph->protocol = IPPROTO_IPIP;
636 iph->tos = INET_ECN_encapsulate(tos, old_iph->tos);
637 iph->daddr = rt->rt_dst;
638 iph->saddr = rt->rt_src;
639
640 if ((iph->ttl = tiph->ttl) == 0)
641 iph->ttl = old_iph->ttl;
642
643 nf_reset(skb);
644
645 IPTUNNEL_XMIT();
646 tunnel->recursion--;
647 return 0;
648
649tx_error_icmp:
650 dst_link_failure(skb);
651tx_error:
652 stats->tx_errors++;
653 dev_kfree_skb(skb);
654 tunnel->recursion--;
655 return 0;
656}
657
658static int
659ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
660{
661 int err = 0;
662 struct ip_tunnel_parm p;
663 struct ip_tunnel *t;
664
665 switch (cmd) {
666 case SIOCGETTUNNEL:
667 t = NULL;
668 if (dev == ipip_fb_tunnel_dev) {
669 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
670 err = -EFAULT;
671 break;
672 }
673 t = ipip_tunnel_locate(&p, 0);
674 }
675 if (t == NULL)
676 t = (struct ip_tunnel*)dev->priv;
677 memcpy(&p, &t->parms, sizeof(p));
678 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
679 err = -EFAULT;
680 break;
681
682 case SIOCADDTUNNEL:
683 case SIOCCHGTUNNEL:
684 err = -EPERM;
685 if (!capable(CAP_NET_ADMIN))
686 goto done;
687
688 err = -EFAULT;
689 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
690 goto done;
691
692 err = -EINVAL;
693 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
694 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
695 goto done;
696 if (p.iph.ttl)
697 p.iph.frag_off |= htons(IP_DF);
698
699 t = ipip_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
700
701 if (dev != ipip_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
702 if (t != NULL) {
703 if (t->dev != dev) {
704 err = -EEXIST;
705 break;
706 }
707 } else {
708 if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
709 (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
710 err = -EINVAL;
711 break;
712 }
713 t = (struct ip_tunnel*)dev->priv;
714 ipip_tunnel_unlink(t);
715 t->parms.iph.saddr = p.iph.saddr;
716 t->parms.iph.daddr = p.iph.daddr;
717 memcpy(dev->dev_addr, &p.iph.saddr, 4);
718 memcpy(dev->broadcast, &p.iph.daddr, 4);
719 ipip_tunnel_link(t);
720 netdev_state_change(dev);
721 }
722 }
723
724 if (t) {
725 err = 0;
726 if (cmd == SIOCCHGTUNNEL) {
727 t->parms.iph.ttl = p.iph.ttl;
728 t->parms.iph.tos = p.iph.tos;
729 t->parms.iph.frag_off = p.iph.frag_off;
730 }
731 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
732 err = -EFAULT;
733 } else
734 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
735 break;
736
737 case SIOCDELTUNNEL:
738 err = -EPERM;
739 if (!capable(CAP_NET_ADMIN))
740 goto done;
741
742 if (dev == ipip_fb_tunnel_dev) {
743 err = -EFAULT;
744 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
745 goto done;
746 err = -ENOENT;
747 if ((t = ipip_tunnel_locate(&p, 0)) == NULL)
748 goto done;
749 err = -EPERM;
750 if (t->dev == ipip_fb_tunnel_dev)
751 goto done;
752 dev = t->dev;
753 }
754 err = unregister_netdevice(dev);
755 break;
756
757 default:
758 err = -EINVAL;
759 }
760
761done:
762 return err;
763}
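/* Illustrative sketch (added in this edit, not part of the original file):
 * how user space typically drives the ioctl handler above to create a new
 * tunnel.  The file descriptor and addresses are hypothetical; the request
 * is addressed to the fallback device "tunl0", exactly as ipmr_new_tunnel()
 * does from inside the kernel in ipmr.c.  fd can be any AF_INET socket.
 *
 *	struct ip_tunnel_parm p;
 *	struct ifreq ifr;
 *
 *	memset(&p, 0, sizeof(p));
 *	p.iph.version  = 4;
 *	p.iph.ihl      = 5;
 *	p.iph.protocol = IPPROTO_IPIP;
 *	p.iph.saddr    = local_addr;
 *	p.iph.daddr    = remote_addr;
 *	strcpy(p.name, "mytunl");
 *	strcpy(ifr.ifr_name, "tunl0");
 *	ifr.ifr_ifru.ifru_data = (void *)&p;
 *	err = ioctl(fd, SIOCADDTUNNEL, &ifr);
 */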
764
765static struct net_device_stats *ipip_tunnel_get_stats(struct net_device *dev)
766{
767 return &(((struct ip_tunnel*)dev->priv)->stat);
768}
769
770static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
771{
772 if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
773 return -EINVAL;
774 dev->mtu = new_mtu;
775 return 0;
776}
777
778static void ipip_tunnel_setup(struct net_device *dev)
779{
780 SET_MODULE_OWNER(dev);
781 dev->uninit = ipip_tunnel_uninit;
782 dev->hard_start_xmit = ipip_tunnel_xmit;
783 dev->get_stats = ipip_tunnel_get_stats;
784 dev->do_ioctl = ipip_tunnel_ioctl;
785 dev->change_mtu = ipip_tunnel_change_mtu;
786 dev->destructor = free_netdev;
787
788 dev->type = ARPHRD_TUNNEL;
789 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
790 dev->mtu = 1500 - sizeof(struct iphdr);
791 dev->flags = IFF_NOARP;
792 dev->iflink = 0;
793 dev->addr_len = 4;
794}
795
796static int ipip_tunnel_init(struct net_device *dev)
797{
798 struct net_device *tdev = NULL;
799 struct ip_tunnel *tunnel;
800 struct iphdr *iph;
801
802 tunnel = (struct ip_tunnel*)dev->priv;
803 iph = &tunnel->parms.iph;
804
805 tunnel->dev = dev;
806 strcpy(tunnel->parms.name, dev->name);
807
808 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
809 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
810
811 if (iph->daddr) {
812 struct flowi fl = { .oif = tunnel->parms.link,
813 .nl_u = { .ip4_u =
814 { .daddr = iph->daddr,
815 .saddr = iph->saddr,
816 .tos = RT_TOS(iph->tos) } },
817 .proto = IPPROTO_IPIP };
818 struct rtable *rt;
819 if (!ip_route_output_key(&rt, &fl)) {
820 tdev = rt->u.dst.dev;
821 ip_rt_put(rt);
822 }
823 dev->flags |= IFF_POINTOPOINT;
824 }
825
826 if (!tdev && tunnel->parms.link)
827 tdev = __dev_get_by_index(tunnel->parms.link);
828
829 if (tdev) {
830 dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
831 dev->mtu = tdev->mtu - sizeof(struct iphdr);
832 }
833 dev->iflink = tunnel->parms.link;
834
835 return 0;
836}
837
838static int __init ipip_fb_tunnel_init(struct net_device *dev)
839{
840 struct ip_tunnel *tunnel = dev->priv;
841 struct iphdr *iph = &tunnel->parms.iph;
842
843 tunnel->dev = dev;
844 strcpy(tunnel->parms.name, dev->name);
845
846 iph->version = 4;
847 iph->protocol = IPPROTO_IPIP;
848 iph->ihl = 5;
849
850 dev_hold(dev);
851 tunnels_wc[0] = tunnel;
852 return 0;
853}
854
855static struct xfrm_tunnel ipip_handler = {
856 .handler = ipip_rcv,
857 .err_handler = ipip_err,
858};
859
860static char banner[] __initdata =
861 KERN_INFO "IPv4 over IPv4 tunneling driver\n";
862
863static int __init ipip_init(void)
864{
865 int err;
866
867 printk(banner);
868
869 if (xfrm4_tunnel_register(&ipip_handler) < 0) {
870 printk(KERN_INFO "ipip init: can't register tunnel\n");
871 return -EAGAIN;
872 }
873
874 ipip_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
875 "tunl0",
876 ipip_tunnel_setup);
877 if (!ipip_fb_tunnel_dev) {
878 err = -ENOMEM;
879 goto err1;
880 }
881
882 ipip_fb_tunnel_dev->init = ipip_fb_tunnel_init;
883
884 if ((err = register_netdev(ipip_fb_tunnel_dev)))
885 goto err2;
886 out:
887 return err;
888 err2:
889 free_netdev(ipip_fb_tunnel_dev);
890 err1:
891 xfrm4_tunnel_deregister(&ipip_handler);
892 goto out;
893}
894
895static void __exit ipip_fini(void)
896{
897 if (xfrm4_tunnel_deregister(&ipip_handler) < 0)
898 printk(KERN_INFO "ipip close: can't deregister tunnel\n");
899
900 unregister_netdev(ipip_fb_tunnel_dev);
901}
902
903module_init(ipip_init);
904module_exit(ipip_fini);
905MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
new file mode 100644
index 000000000000..e21c049ec62a
--- /dev/null
+++ b/net/ipv4/ipmr.c
@@ -0,0 +1,1900 @@
1/*
2 * IP multicast routing support for mrouted 3.6/3.8
3 *
4 * (c) 1995 Alan Cox, <alan@redhat.com>
5 * Linux Consultancy and Custom Driver Development
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
13 *
14 * Fixes:
15 * Michael Chastain : Incorrect size of copying.
16 * Alan Cox : Added the cache manager code
17 * Alan Cox : Fixed the clone/copy bug and device race.
18 * Mike McLagan : Routing by source
19 * Malcolm Beattie : Buffer handling fixes.
20 * Alexey Kuznetsov : Double buffer free and other fixes.
21 * SVR Anand : Fixed several multicast bugs and problems.
22 * Alexey Kuznetsov : Status, optimisations and more.
23 * Brad Parker : Better behaviour on mrouted upcall
24 * overflow.
25 * Carlos Picoto : PIMv1 Support
26 * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header
27 *			Relax this requirement to work with older peers.
28 *
29 */
30
31#include <linux/config.h>
32#include <asm/system.h>
33#include <asm/uaccess.h>
34#include <linux/types.h>
35#include <linux/sched.h>
36#include <linux/errno.h>
37#include <linux/timer.h>
38#include <linux/mm.h>
39#include <linux/kernel.h>
40#include <linux/fcntl.h>
41#include <linux/stat.h>
42#include <linux/socket.h>
43#include <linux/in.h>
44#include <linux/inet.h>
45#include <linux/netdevice.h>
46#include <linux/inetdevice.h>
47#include <linux/igmp.h>
48#include <linux/proc_fs.h>
49#include <linux/seq_file.h>
50#include <linux/mroute.h>
51#include <linux/init.h>
52#include <net/ip.h>
53#include <net/protocol.h>
54#include <linux/skbuff.h>
55#include <net/sock.h>
56#include <net/icmp.h>
57#include <net/udp.h>
58#include <net/raw.h>
59#include <linux/notifier.h>
60#include <linux/if_arp.h>
61#include <linux/netfilter_ipv4.h>
62#include <net/ipip.h>
63#include <net/checksum.h>
64
65#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
66#define CONFIG_IP_PIMSM 1
67#endif
68
69static struct sock *mroute_socket;
70
71
72/* Big lock, protecting vif table, mrt cache and mroute socket state.
73 Note that the changes are semaphored via rtnl_lock.
74 */
75
76static DEFINE_RWLOCK(mrt_lock);
77
78/*
79 * Multicast router control variables
80 */
81
82static struct vif_device vif_table[MAXVIFS]; /* Devices */
83static int maxvif;
84
85#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
86
87static int mroute_do_assert; /* Set in PIM assert */
88static int mroute_do_pim;
89
90static struct mfc_cache *mfc_cache_array[MFC_LINES]; /* Forwarding cache */
91
92static struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */
93static atomic_t cache_resolve_queue_len; /* Size of unresolved */
94
95/* Special spinlock for queue of unresolved entries */
96static DEFINE_SPINLOCK(mfc_unres_lock);
97
98/* We return to original Alan's scheme. Hash table of resolved
99 entries is changed only in process context and protected
100 with weak lock mrt_lock. Queue of unresolved entries is protected
101 with strong spinlock mfc_unres_lock.
102
103 In this case data path is free of exclusive locks at all.
104 */
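/* Illustrative sketch (added in this edit, not part of the original file):
 * the locking pattern described above, as used throughout this file.
 *
 *	data path (softirq):		read_lock(&mrt_lock);
 *					... look up vif_table / mfc_cache_array ...
 *					read_unlock(&mrt_lock);
 *
 *	updates (process context):	write_lock_bh(&mrt_lock);
 *					... link/unlink a cache or vif entry ...
 *					write_unlock_bh(&mrt_lock);
 *
 *	unresolved queue:		spin_lock_bh(&mfc_unres_lock);
 *					... walk or modify mfc_unres_queue ...
 *					spin_unlock_bh(&mfc_unres_lock);
 */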
105
106static kmem_cache_t *mrt_cachep;
107
108static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
109static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
110static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
111
112#ifdef CONFIG_IP_PIMSM_V2
113static struct net_protocol pim_protocol;
114#endif
115
116static struct timer_list ipmr_expire_timer;
117
118/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
119
120static
121struct net_device *ipmr_new_tunnel(struct vifctl *v)
122{
123 struct net_device *dev;
124
125 dev = __dev_get_by_name("tunl0");
126
127 if (dev) {
128 int err;
129 struct ifreq ifr;
130 mm_segment_t oldfs;
131 struct ip_tunnel_parm p;
132 struct in_device *in_dev;
133
134 memset(&p, 0, sizeof(p));
135 p.iph.daddr = v->vifc_rmt_addr.s_addr;
136 p.iph.saddr = v->vifc_lcl_addr.s_addr;
137 p.iph.version = 4;
138 p.iph.ihl = 5;
139 p.iph.protocol = IPPROTO_IPIP;
140 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
141 ifr.ifr_ifru.ifru_data = (void*)&p;
142
143 oldfs = get_fs(); set_fs(KERNEL_DS);
144 err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
145 set_fs(oldfs);
146
147 dev = NULL;
148
149 if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
150 dev->flags |= IFF_MULTICAST;
151
152 in_dev = __in_dev_get(dev);
153 if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
154 goto failure;
155 in_dev->cnf.rp_filter = 0;
156
157 if (dev_open(dev))
158 goto failure;
159 }
160 }
161 return dev;
162
163failure:
164 /* allow the register to be completed before unregistering. */
165 rtnl_unlock();
166 rtnl_lock();
167
168 unregister_netdevice(dev);
169 return NULL;
170}
171
172#ifdef CONFIG_IP_PIMSM
173
174static int reg_vif_num = -1;
175
176static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
177{
178 read_lock(&mrt_lock);
179 ((struct net_device_stats*)dev->priv)->tx_bytes += skb->len;
180 ((struct net_device_stats*)dev->priv)->tx_packets++;
181 ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
182 read_unlock(&mrt_lock);
183 kfree_skb(skb);
184 return 0;
185}
186
187static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
188{
189 return (struct net_device_stats*)dev->priv;
190}
191
192static void reg_vif_setup(struct net_device *dev)
193{
194 dev->type = ARPHRD_PIMREG;
195 dev->mtu = 1500 - sizeof(struct iphdr) - 8;
196 dev->flags = IFF_NOARP;
197 dev->hard_start_xmit = reg_vif_xmit;
198 dev->get_stats = reg_vif_get_stats;
199 dev->destructor = free_netdev;
200}
201
202static struct net_device *ipmr_reg_vif(void)
203{
204 struct net_device *dev;
205 struct in_device *in_dev;
206
207 dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
208 reg_vif_setup);
209
210 if (dev == NULL)
211 return NULL;
212
213 if (register_netdevice(dev)) {
214 free_netdev(dev);
215 return NULL;
216 }
217 dev->iflink = 0;
218
219 if ((in_dev = inetdev_init(dev)) == NULL)
220 goto failure;
221
222 in_dev->cnf.rp_filter = 0;
223
224 if (dev_open(dev))
225 goto failure;
226
227 return dev;
228
229failure:
230 /* allow the register to be completed before unregistering. */
231 rtnl_unlock();
232 rtnl_lock();
233
234 unregister_netdevice(dev);
235 return NULL;
236}
237#endif
238
239/*
240 * Delete a VIF entry
241 */
242
243static int vif_delete(int vifi)
244{
245 struct vif_device *v;
246 struct net_device *dev;
247 struct in_device *in_dev;
248
249 if (vifi < 0 || vifi >= maxvif)
250 return -EADDRNOTAVAIL;
251
252 v = &vif_table[vifi];
253
254 write_lock_bh(&mrt_lock);
255 dev = v->dev;
256 v->dev = NULL;
257
258 if (!dev) {
259 write_unlock_bh(&mrt_lock);
260 return -EADDRNOTAVAIL;
261 }
262
263#ifdef CONFIG_IP_PIMSM
264 if (vifi == reg_vif_num)
265 reg_vif_num = -1;
266#endif
267
268 if (vifi+1 == maxvif) {
269 int tmp;
270 for (tmp=vifi-1; tmp>=0; tmp--) {
271 if (VIF_EXISTS(tmp))
272 break;
273 }
274 maxvif = tmp+1;
275 }
276
277 write_unlock_bh(&mrt_lock);
278
279 dev_set_allmulti(dev, -1);
280
281 if ((in_dev = __in_dev_get(dev)) != NULL) {
282 in_dev->cnf.mc_forwarding--;
283 ip_rt_multicast_event(in_dev);
284 }
285
286 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
287 unregister_netdevice(dev);
288
289 dev_put(dev);
290 return 0;
291}
292
293/* Destroy an unresolved cache entry, killing queued skbs
294 and reporting error to netlink readers.
295 */
296
297static void ipmr_destroy_unres(struct mfc_cache *c)
298{
299 struct sk_buff *skb;
300
301 atomic_dec(&cache_resolve_queue_len);
302
303 while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
304 if (skb->nh.iph->version == 0) {
305 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
306 nlh->nlmsg_type = NLMSG_ERROR;
307 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
308 skb_trim(skb, nlh->nlmsg_len);
309 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT;
310 netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
311 } else
312 kfree_skb(skb);
313 }
314
315 kmem_cache_free(mrt_cachep, c);
316}
317
318
319/* Single timer process for all the unresolved queue. */
320
321static void ipmr_expire_process(unsigned long dummy)
322{
323 unsigned long now;
324 unsigned long expires;
325 struct mfc_cache *c, **cp;
326
327 if (!spin_trylock(&mfc_unres_lock)) {
328 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
329 return;
330 }
331
332 if (atomic_read(&cache_resolve_queue_len) == 0)
333 goto out;
334
335 now = jiffies;
336 expires = 10*HZ;
337 cp = &mfc_unres_queue;
338
339 while ((c=*cp) != NULL) {
340 if (time_after(c->mfc_un.unres.expires, now)) {
341 unsigned long interval = c->mfc_un.unres.expires - now;
342 if (interval < expires)
343 expires = interval;
344 cp = &c->next;
345 continue;
346 }
347
348 *cp = c->next;
349
350 ipmr_destroy_unres(c);
351 }
352
353 if (atomic_read(&cache_resolve_queue_len))
354 mod_timer(&ipmr_expire_timer, jiffies + expires);
355
356out:
357 spin_unlock(&mfc_unres_lock);
358}
359
360/* Fill oifs list. It is called under write locked mrt_lock. */
361
362static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls)
363{
364 int vifi;
365
366 cache->mfc_un.res.minvif = MAXVIFS;
367 cache->mfc_un.res.maxvif = 0;
368 memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
369
370 for (vifi=0; vifi<maxvif; vifi++) {
371 if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
372 cache->mfc_un.res.ttls[vifi] = ttls[vifi];
373 if (cache->mfc_un.res.minvif > vifi)
374 cache->mfc_un.res.minvif = vifi;
375 if (cache->mfc_un.res.maxvif <= vifi)
376 cache->mfc_un.res.maxvif = vifi + 1;
377 }
378 }
379}
380
381static int vif_add(struct vifctl *vifc, int mrtsock)
382{
383 int vifi = vifc->vifc_vifi;
384 struct vif_device *v = &vif_table[vifi];
385 struct net_device *dev;
386 struct in_device *in_dev;
387
388 /* Is vif busy ? */
389 if (VIF_EXISTS(vifi))
390 return -EADDRINUSE;
391
392 switch (vifc->vifc_flags) {
393#ifdef CONFIG_IP_PIMSM
394 case VIFF_REGISTER:
395 /*
396 * Special Purpose VIF in PIM
397 * All the packets will be sent to the daemon
398 */
399 if (reg_vif_num >= 0)
400 return -EADDRINUSE;
401 dev = ipmr_reg_vif();
402 if (!dev)
403 return -ENOBUFS;
404 break;
405#endif
406 case VIFF_TUNNEL:
407 dev = ipmr_new_tunnel(vifc);
408 if (!dev)
409 return -ENOBUFS;
410 break;
411 case 0:
412 dev=ip_dev_find(vifc->vifc_lcl_addr.s_addr);
413 if (!dev)
414 return -EADDRNOTAVAIL;
415 __dev_put(dev);
416 break;
417 default:
418 return -EINVAL;
419 }
420
421 if ((in_dev = __in_dev_get(dev)) == NULL)
422 return -EADDRNOTAVAIL;
423 in_dev->cnf.mc_forwarding++;
424 dev_set_allmulti(dev, +1);
425 ip_rt_multicast_event(in_dev);
426
427 /*
428 * Fill in the VIF structures
429 */
430 v->rate_limit=vifc->vifc_rate_limit;
431 v->local=vifc->vifc_lcl_addr.s_addr;
432 v->remote=vifc->vifc_rmt_addr.s_addr;
433 v->flags=vifc->vifc_flags;
434 if (!mrtsock)
435 v->flags |= VIFF_STATIC;
436 v->threshold=vifc->vifc_threshold;
437 v->bytes_in = 0;
438 v->bytes_out = 0;
439 v->pkt_in = 0;
440 v->pkt_out = 0;
441 v->link = dev->ifindex;
442 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
443 v->link = dev->iflink;
444
445 /* And finish update writing critical data */
446 write_lock_bh(&mrt_lock);
447 dev_hold(dev);
448 v->dev=dev;
449#ifdef CONFIG_IP_PIMSM
450 if (v->flags&VIFF_REGISTER)
451 reg_vif_num = vifi;
452#endif
453 if (vifi+1 > maxvif)
454 maxvif = vifi+1;
455 write_unlock_bh(&mrt_lock);
456 return 0;
457}
458
459static struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp)
460{
461 int line=MFC_HASH(mcastgrp,origin);
462 struct mfc_cache *c;
463
464 for (c=mfc_cache_array[line]; c; c = c->next) {
465 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
466 break;
467 }
468 return c;
469}
470
471/*
472 * Allocate a multicast cache entry
473 */
474static struct mfc_cache *ipmr_cache_alloc(void)
475{
476 struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL);
477 if(c==NULL)
478 return NULL;
479 memset(c, 0, sizeof(*c));
480 c->mfc_un.res.minvif = MAXVIFS;
481 return c;
482}
483
484static struct mfc_cache *ipmr_cache_alloc_unres(void)
485{
486 struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);
487 if(c==NULL)
488 return NULL;
489 memset(c, 0, sizeof(*c));
490 skb_queue_head_init(&c->mfc_un.unres.unresolved);
491 c->mfc_un.unres.expires = jiffies + 10*HZ;
492 return c;
493}
494
495/*
496 * A cache entry has gone into a resolved state from queued
497 */
498
499static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
500{
501 struct sk_buff *skb;
502
503 /*
504 * Play the pending entries through our router
505 */
506
507 while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
508 if (skb->nh.iph->version == 0) {
509 int err;
510 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
511
512 if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
513 nlh->nlmsg_len = skb->tail - (u8*)nlh;
514 } else {
515 nlh->nlmsg_type = NLMSG_ERROR;
516 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
517 skb_trim(skb, nlh->nlmsg_len);
518 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE;
519 }
520 err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
521 } else
522 ip_mr_forward(skb, c, 0);
523 }
524}
525
526/*
527 * Bounce a cache query up to mrouted. We could use netlink for this but mrouted
528 * expects the following bizarre scheme.
529 *
530 * Called under mrt_lock.
531 */
532
533static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
534{
535 struct sk_buff *skb;
536 int ihl = pkt->nh.iph->ihl<<2;
537 struct igmphdr *igmp;
538 struct igmpmsg *msg;
539 int ret;
540
541#ifdef CONFIG_IP_PIMSM
542 if (assert == IGMPMSG_WHOLEPKT)
543 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
544 else
545#endif
546 skb = alloc_skb(128, GFP_ATOMIC);
547
548 if(!skb)
549 return -ENOBUFS;
550
551#ifdef CONFIG_IP_PIMSM
552 if (assert == IGMPMSG_WHOLEPKT) {
553 /* Ugly, but we have no choice with this interface.
554 Duplicate old header, fix ihl, length etc.
555 And all this only to mangle msg->im_msgtype and
556 to set msg->im_mbz to "mbz" :-)
557 */
558 msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
559 skb->nh.raw = skb->h.raw = (u8*)msg;
560 memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
561 msg->im_msgtype = IGMPMSG_WHOLEPKT;
562 msg->im_mbz = 0;
563 msg->im_vif = reg_vif_num;
564 skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
565 skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
566 } else
567#endif
568 {
569
570 /*
571 * Copy the IP header
572 */
573
574 skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
575 memcpy(skb->data,pkt->data,ihl);
576 skb->nh.iph->protocol = 0; /* Flag to the kernel this is a route add */
577 msg = (struct igmpmsg*)skb->nh.iph;
578 msg->im_vif = vifi;
579 skb->dst = dst_clone(pkt->dst);
580
581 /*
582 * Add our header
583 */
584
585 igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
586 igmp->type =
587 msg->im_msgtype = assert;
588 igmp->code = 0;
589 skb->nh.iph->tot_len=htons(skb->len); /* Fix the length */
590 skb->h.raw = skb->nh.raw;
591 }
592
593 if (mroute_socket == NULL) {
594 kfree_skb(skb);
595 return -EINVAL;
596 }
597
598 /*
599 * Deliver to mrouted
600 */
601 if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
602 if (net_ratelimit())
603 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
604 kfree_skb(skb);
605 }
606
607 return ret;
608}
609
610/*
611 * Queue a packet for resolution. It gets locked cache entry!
612 */
613
614static int
615ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
616{
617 int err;
618 struct mfc_cache *c;
619
620 spin_lock_bh(&mfc_unres_lock);
621 for (c=mfc_unres_queue; c; c=c->next) {
622 if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
623 c->mfc_origin == skb->nh.iph->saddr)
624 break;
625 }
626
627 if (c == NULL) {
628 /*
629 * Create a new entry if allowable
630 */
631
632 if (atomic_read(&cache_resolve_queue_len)>=10 ||
633 (c=ipmr_cache_alloc_unres())==NULL) {
634 spin_unlock_bh(&mfc_unres_lock);
635
636 kfree_skb(skb);
637 return -ENOBUFS;
638 }
639
640 /*
641 * Fill in the new cache entry
642 */
643 c->mfc_parent=-1;
644 c->mfc_origin=skb->nh.iph->saddr;
645 c->mfc_mcastgrp=skb->nh.iph->daddr;
646
647 /*
648 * Reflect first query at mrouted.
649 */
650 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
651 /* If the report failed throw the cache entry
652 out - Brad Parker
653 */
654 spin_unlock_bh(&mfc_unres_lock);
655
656 kmem_cache_free(mrt_cachep, c);
657 kfree_skb(skb);
658 return err;
659 }
660
661 atomic_inc(&cache_resolve_queue_len);
662 c->next = mfc_unres_queue;
663 mfc_unres_queue = c;
664
665 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
666 }
667
668 /*
669 * See if we can append the packet
670 */
671 if (c->mfc_un.unres.unresolved.qlen>3) {
672 kfree_skb(skb);
673 err = -ENOBUFS;
674 } else {
675 skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
676 err = 0;
677 }
678
679 spin_unlock_bh(&mfc_unres_lock);
680 return err;
681}
682
683/*
684 * MFC cache manipulation by user space mroute daemon
685 */
686
687static int ipmr_mfc_delete(struct mfcctl *mfc)
688{
689 int line;
690 struct mfc_cache *c, **cp;
691
692 line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
693
694 for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
695 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
696 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
697 write_lock_bh(&mrt_lock);
698 *cp = c->next;
699 write_unlock_bh(&mrt_lock);
700
701 kmem_cache_free(mrt_cachep, c);
702 return 0;
703 }
704 }
705 return -ENOENT;
706}
707
708static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
709{
710 int line;
711 struct mfc_cache *uc, *c, **cp;
712
713 line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
714
715 for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
716 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
717 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
718 break;
719 }
720
721 if (c != NULL) {
722 write_lock_bh(&mrt_lock);
723 c->mfc_parent = mfc->mfcc_parent;
724 ipmr_update_threshoulds(c, mfc->mfcc_ttls);
725 if (!mrtsock)
726 c->mfc_flags |= MFC_STATIC;
727 write_unlock_bh(&mrt_lock);
728 return 0;
729 }
730
731 if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
732 return -EINVAL;
733
734 c=ipmr_cache_alloc();
735 if (c==NULL)
736 return -ENOMEM;
737
738 c->mfc_origin=mfc->mfcc_origin.s_addr;
739 c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
740 c->mfc_parent=mfc->mfcc_parent;
741 ipmr_update_threshoulds(c, mfc->mfcc_ttls);
742 if (!mrtsock)
743 c->mfc_flags |= MFC_STATIC;
744
745 write_lock_bh(&mrt_lock);
746 c->next = mfc_cache_array[line];
747 mfc_cache_array[line] = c;
748 write_unlock_bh(&mrt_lock);
749
750 /*
751 * Check to see if we resolved a queued list. If so we
752 * need to send on the frames and tidy up.
753 */
754 spin_lock_bh(&mfc_unres_lock);
755 for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
756 cp = &uc->next) {
757 if (uc->mfc_origin == c->mfc_origin &&
758 uc->mfc_mcastgrp == c->mfc_mcastgrp) {
759 *cp = uc->next;
760 if (atomic_dec_and_test(&cache_resolve_queue_len))
761 del_timer(&ipmr_expire_timer);
762 break;
763 }
764 }
765 spin_unlock_bh(&mfc_unres_lock);
766
767 if (uc) {
768 ipmr_cache_resolve(uc, c);
769 kmem_cache_free(mrt_cachep, uc);
770 }
771 return 0;
772}
773
774/*
775 * Close the multicast socket, and clear the vif tables etc
776 */
777
778static void mroute_clean_tables(struct sock *sk)
779{
780 int i;
781
782 /*
783 * Shut down all active vif entries
784 */
785 for(i=0; i<maxvif; i++) {
786 if (!(vif_table[i].flags&VIFF_STATIC))
787 vif_delete(i);
788 }
789
790 /*
791 * Wipe the cache
792 */
793 for (i=0;i<MFC_LINES;i++) {
794 struct mfc_cache *c, **cp;
795
796 cp = &mfc_cache_array[i];
797 while ((c = *cp) != NULL) {
798 if (c->mfc_flags&MFC_STATIC) {
799 cp = &c->next;
800 continue;
801 }
802 write_lock_bh(&mrt_lock);
803 *cp = c->next;
804 write_unlock_bh(&mrt_lock);
805
806 kmem_cache_free(mrt_cachep, c);
807 }
808 }
809
810 if (atomic_read(&cache_resolve_queue_len) != 0) {
811 struct mfc_cache *c;
812
813 spin_lock_bh(&mfc_unres_lock);
814 while (mfc_unres_queue != NULL) {
815 c = mfc_unres_queue;
816 mfc_unres_queue = c->next;
817 spin_unlock_bh(&mfc_unres_lock);
818
819 ipmr_destroy_unres(c);
820
821 spin_lock_bh(&mfc_unres_lock);
822 }
823 spin_unlock_bh(&mfc_unres_lock);
824 }
825}
826
827static void mrtsock_destruct(struct sock *sk)
828{
829 rtnl_lock();
830 if (sk == mroute_socket) {
831 ipv4_devconf.mc_forwarding--;
832
833 write_lock_bh(&mrt_lock);
834 mroute_socket=NULL;
835 write_unlock_bh(&mrt_lock);
836
837 mroute_clean_tables(sk);
838 }
839 rtnl_unlock();
840}
841
842/*
843 * Socket options and virtual interface manipulation. The whole
844 * virtual interface system is a complete heap, but unfortunately
845 * that's how BSD mrouted happens to think. Maybe one day with a proper
846 * MOSPF/PIM router set up we can clean this up.
847 */
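/* Illustrative sketch (added in this edit, not part of the original file):
 * the sequence a routing daemon such as mrouted follows against the
 * interface below.  Variable names and the local address are hypothetical;
 * the MRT_* values come from <linux/mroute.h>.
 *
 *	int one = 1;
 *	struct vifctl vc;
 *
 *	s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);	(required by MRT_INIT)
 *	setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *
 *	memset(&vc, 0, sizeof(vc));
 *	vc.vifc_vifi = 0;
 *	vc.vifc_threshold = 1;
 *	vc.vifc_lcl_addr.s_addr = local_addr;
 *	setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 *	...
 *	setsockopt(s, IPPROTO_IP, MRT_DONE, NULL, 0);	(tear down on exit)
 */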
848
849int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
850{
851 int ret;
852 struct vifctl vif;
853 struct mfcctl mfc;
854
855 if(optname!=MRT_INIT)
856 {
857 if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
858 return -EACCES;
859 }
860
861 switch(optname)
862 {
863 case MRT_INIT:
864 if (sk->sk_type != SOCK_RAW ||
865 inet_sk(sk)->num != IPPROTO_IGMP)
866 return -EOPNOTSUPP;
867 if(optlen!=sizeof(int))
868 return -ENOPROTOOPT;
869
870 rtnl_lock();
871 if (mroute_socket) {
872 rtnl_unlock();
873 return -EADDRINUSE;
874 }
875
876 ret = ip_ra_control(sk, 1, mrtsock_destruct);
877 if (ret == 0) {
878 write_lock_bh(&mrt_lock);
879 mroute_socket=sk;
880 write_unlock_bh(&mrt_lock);
881
882 ipv4_devconf.mc_forwarding++;
883 }
884 rtnl_unlock();
885 return ret;
886 case MRT_DONE:
887 if (sk!=mroute_socket)
888 return -EACCES;
889 return ip_ra_control(sk, 0, NULL);
890 case MRT_ADD_VIF:
891 case MRT_DEL_VIF:
892 if(optlen!=sizeof(vif))
893 return -EINVAL;
894 if (copy_from_user(&vif,optval,sizeof(vif)))
895 return -EFAULT;
896 if(vif.vifc_vifi >= MAXVIFS)
897 return -ENFILE;
898 rtnl_lock();
899 if (optname==MRT_ADD_VIF) {
900 ret = vif_add(&vif, sk==mroute_socket);
901 } else {
902 ret = vif_delete(vif.vifc_vifi);
903 }
904 rtnl_unlock();
905 return ret;
906
907 /*
908 * Manipulate the forwarding caches. These live
909 * in a sort of kernel/user symbiosis.
910 */
911 case MRT_ADD_MFC:
912 case MRT_DEL_MFC:
913 if(optlen!=sizeof(mfc))
914 return -EINVAL;
915 if (copy_from_user(&mfc,optval, sizeof(mfc)))
916 return -EFAULT;
917 rtnl_lock();
918 if (optname==MRT_DEL_MFC)
919 ret = ipmr_mfc_delete(&mfc);
920 else
921 ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
922 rtnl_unlock();
923 return ret;
924 /*
925 * Control PIM assert.
926 */
927 case MRT_ASSERT:
928 {
929 int v;
930 if(get_user(v,(int __user *)optval))
931 return -EFAULT;
932 mroute_do_assert=(v)?1:0;
933 return 0;
934 }
935#ifdef CONFIG_IP_PIMSM
936 case MRT_PIM:
937 {
938 int v, ret;
939 if(get_user(v,(int __user *)optval))
940 return -EFAULT;
941 v = (v)?1:0;
942 rtnl_lock();
943 ret = 0;
944 if (v != mroute_do_pim) {
945 mroute_do_pim = v;
946 mroute_do_assert = v;
947#ifdef CONFIG_IP_PIMSM_V2
948 if (mroute_do_pim)
949 ret = inet_add_protocol(&pim_protocol,
950 IPPROTO_PIM);
951 else
952 ret = inet_del_protocol(&pim_protocol,
953 IPPROTO_PIM);
954 if (ret < 0)
955 ret = -EAGAIN;
956#endif
957 }
958 rtnl_unlock();
959 return ret;
960 }
961#endif
962 /*
963 * Spurious command, or MRT_VERSION which you cannot
964 * set.
965 */
966 default:
967 return -ENOPROTOOPT;
968 }
969}
970
971/*
972 * Getsock opt support for the multicast routing system.
973 */
974
975int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
976{
977 int olr;
978 int val;
979
980 if(optname!=MRT_VERSION &&
981#ifdef CONFIG_IP_PIMSM
982 optname!=MRT_PIM &&
983#endif
984 optname!=MRT_ASSERT)
985 return -ENOPROTOOPT;
986
987 if (get_user(olr, optlen))
988 return -EFAULT;
989
990 olr = min_t(unsigned int, olr, sizeof(int));
991 if (olr < 0)
992 return -EINVAL;
993
994 if(put_user(olr,optlen))
995 return -EFAULT;
996 if(optname==MRT_VERSION)
997 val=0x0305;
998#ifdef CONFIG_IP_PIMSM
999 else if(optname==MRT_PIM)
1000 val=mroute_do_pim;
1001#endif
1002 else
1003 val=mroute_do_assert;
1004 if(copy_to_user(optval,&val,olr))
1005 return -EFAULT;
1006 return 0;
1007}
1008
1009/*
1010 * The IP multicast ioctl support routines.
1011 */
1012
1013int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1014{
1015 struct sioc_sg_req sr;
1016 struct sioc_vif_req vr;
1017 struct vif_device *vif;
1018 struct mfc_cache *c;
1019
1020 switch(cmd)
1021 {
1022 case SIOCGETVIFCNT:
1023 if (copy_from_user(&vr,arg,sizeof(vr)))
1024 return -EFAULT;
1025 if(vr.vifi>=maxvif)
1026 return -EINVAL;
1027 read_lock(&mrt_lock);
1028 vif=&vif_table[vr.vifi];
1029 if(VIF_EXISTS(vr.vifi)) {
1030 vr.icount=vif->pkt_in;
1031 vr.ocount=vif->pkt_out;
1032 vr.ibytes=vif->bytes_in;
1033 vr.obytes=vif->bytes_out;
1034 read_unlock(&mrt_lock);
1035
1036 if (copy_to_user(arg,&vr,sizeof(vr)))
1037 return -EFAULT;
1038 return 0;
1039 }
1040 read_unlock(&mrt_lock);
1041 return -EADDRNOTAVAIL;
1042 case SIOCGETSGCNT:
1043 if (copy_from_user(&sr,arg,sizeof(sr)))
1044 return -EFAULT;
1045
1046 read_lock(&mrt_lock);
1047 c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1048 if (c) {
1049 sr.pktcnt = c->mfc_un.res.pkt;
1050 sr.bytecnt = c->mfc_un.res.bytes;
1051 sr.wrong_if = c->mfc_un.res.wrong_if;
1052 read_unlock(&mrt_lock);
1053
1054 if (copy_to_user(arg,&sr,sizeof(sr)))
1055 return -EFAULT;
1056 return 0;
1057 }
1058 read_unlock(&mrt_lock);
1059 return -EADDRNOTAVAIL;
1060 default:
1061 return -ENOIOCTLCMD;
1062 }
1063}
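/* Illustrative sketch (added in this edit, not part of the original file):
 * reading the per-VIF and per-(S,G) counters maintained above from user
 * space.  "mrsock" is assumed to be the raw IGMP socket on which MRT_INIT
 * was issued; src/grp are hypothetical addresses.
 *
 *	struct sioc_vif_req vr;
 *	struct sioc_sg_req sr;
 *
 *	vr.vifi = 0;
 *	ioctl(mrsock, SIOCGETVIFCNT, &vr);	vr.icount/ocount/ibytes/obytes
 *
 *	sr.src.s_addr = src;
 *	sr.grp.s_addr = grp;
 *	ioctl(mrsock, SIOCGETSGCNT, &sr);	sr.pktcnt/bytecnt/wrong_if
 */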
1064
1065
1066static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1067{
1068 struct vif_device *v;
1069 int ct;
1070 if (event != NETDEV_UNREGISTER)
1071 return NOTIFY_DONE;
1072 v=&vif_table[0];
1073 for(ct=0;ct<maxvif;ct++,v++) {
1074 if (v->dev==ptr)
1075 vif_delete(ct);
1076 }
1077 return NOTIFY_DONE;
1078}
1079
1080
1081static struct notifier_block ip_mr_notifier={
1082 .notifier_call = ipmr_device_event,
1083};
1084
1085/*
1086 * Encapsulate a packet by attaching a valid IPIP header to it.
1087 * This avoids tunnel drivers and other mess and gives us the speed so
1088 * important for multicast video.
1089 */
1090
1091static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr)
1092{
1093 struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));
1094
1095 iph->version = 4;
1096 iph->tos = skb->nh.iph->tos;
1097 iph->ttl = skb->nh.iph->ttl;
1098 iph->frag_off = 0;
1099 iph->daddr = daddr;
1100 iph->saddr = saddr;
1101 iph->protocol = IPPROTO_IPIP;
1102 iph->ihl = 5;
1103 iph->tot_len = htons(skb->len);
1104 ip_select_ident(iph, skb->dst, NULL);
1105 ip_send_check(iph);
1106
1107 skb->h.ipiph = skb->nh.iph;
1108 skb->nh.iph = iph;
1109 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1110 nf_reset(skb);
1111}
1112
1113static inline int ipmr_forward_finish(struct sk_buff *skb)
1114{
1115 struct ip_options * opt = &(IPCB(skb)->opt);
1116
1117 IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
1118
1119 if (unlikely(opt->optlen))
1120 ip_forward_options(skb);
1121
1122 return dst_output(skb);
1123}
1124
1125/*
1126 * Processing handlers for ipmr_forward
1127 */
1128
1129static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1130{
1131 struct iphdr *iph = skb->nh.iph;
1132 struct vif_device *vif = &vif_table[vifi];
1133 struct net_device *dev;
1134 struct rtable *rt;
1135 int encap = 0;
1136
1137 if (vif->dev == NULL)
1138 goto out_free;
1139
1140#ifdef CONFIG_IP_PIMSM
1141 if (vif->flags & VIFF_REGISTER) {
1142 vif->pkt_out++;
1143 vif->bytes_out+=skb->len;
1144 ((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len;
1145 ((struct net_device_stats*)vif->dev->priv)->tx_packets++;
1146 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1147 kfree_skb(skb);
1148 return;
1149 }
1150#endif
1151
1152 if (vif->flags&VIFF_TUNNEL) {
1153 struct flowi fl = { .oif = vif->link,
1154 .nl_u = { .ip4_u =
1155 { .daddr = vif->remote,
1156 .saddr = vif->local,
1157 .tos = RT_TOS(iph->tos) } },
1158 .proto = IPPROTO_IPIP };
1159 if (ip_route_output_key(&rt, &fl))
1160 goto out_free;
1161 encap = sizeof(struct iphdr);
1162 } else {
1163 struct flowi fl = { .oif = vif->link,
1164 .nl_u = { .ip4_u =
1165 { .daddr = iph->daddr,
1166 .tos = RT_TOS(iph->tos) } },
1167 .proto = IPPROTO_IPIP };
1168 if (ip_route_output_key(&rt, &fl))
1169 goto out_free;
1170 }
1171
1172 dev = rt->u.dst.dev;
1173
1174 if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1175		/* Do not fragment multicasts. Alas, IPv4 does not
1176		   allow us to send ICMP here, so such packets will
1177		   simply disappear into a black hole.
1178 */
1179
1180 IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
1181 ip_rt_put(rt);
1182 goto out_free;
1183 }
1184
1185 encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1186
1187 if (skb_cow(skb, encap)) {
1188 ip_rt_put(rt);
1189 goto out_free;
1190 }
1191
1192 vif->pkt_out++;
1193 vif->bytes_out+=skb->len;
1194
1195 dst_release(skb->dst);
1196 skb->dst = &rt->u.dst;
1197 iph = skb->nh.iph;
1198 ip_decrease_ttl(iph);
1199
1200 /* FIXME: forward and output firewalls used to be called here.
1201 * What do we do with netfilter? -- RR */
1202 if (vif->flags & VIFF_TUNNEL) {
1203 ip_encap(skb, vif->local, vif->remote);
1204 /* FIXME: extra output firewall step used to be here. --RR */
1205 ((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++;
1206 ((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb->len;
1207 }
1208
1209 IPCB(skb)->flags |= IPSKB_FORWARDED;
1210
1211 /*
1212	 * RFC1584 teaches that a DVMRP/PIM router must deliver packets locally
1213	 * not only before forwarding, but also after forwarding on all output
1214	 * interfaces. Clearly, if the mrouter runs a multicasting
1215	 * program, it should receive packets regardless of which interface
1216	 * the program is joined on.
1217	 * If we did not do this, the program would have to join on all
1218	 * interfaces. On the other hand, a multihomed host (or router, but
1219	 * not mrouter) cannot join on more than one interface - it would
1220	 * result in receiving multiple copies of each packet.
1221 */
1222 NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
1223 ipmr_forward_finish);
1224 return;
1225
1226out_free:
1227 kfree_skb(skb);
1228 return;
1229}
1230
1231static int ipmr_find_vif(struct net_device *dev)
1232{
1233 int ct;
1234 for (ct=maxvif-1; ct>=0; ct--) {
1235 if (vif_table[ct].dev == dev)
1236 break;
1237 }
1238 return ct;
1239}
1240
1241/* "local" means that we should preserve one skb (for local delivery) */
1242
1243static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1244{
1245 int psend = -1;
1246 int vif, ct;
1247
1248 vif = cache->mfc_parent;
1249 cache->mfc_un.res.pkt++;
1250 cache->mfc_un.res.bytes += skb->len;
1251
1252 /*
1253 * Wrong interface: drop packet and (maybe) send PIM assert.
1254 */
1255 if (vif_table[vif].dev != skb->dev) {
1256 int true_vifi;
1257
1258 if (((struct rtable*)skb->dst)->fl.iif == 0) {
1259 /* It is our own packet, looped back.
1260 Very complicated situation...
1261
1262			   The best workaround until the routing daemons are
1263			   fixed is not to redistribute a packet if it was
1264			   sent through the wrong interface. It means that
1265 multicast applications WILL NOT work for
1266 (S,G), which have default multicast route pointing
1267 to wrong oif. In any case, it is not a good
1268 idea to use multicasting applications on router.
1269 */
1270 goto dont_forward;
1271 }
1272
1273 cache->mfc_un.res.wrong_if++;
1274 true_vifi = ipmr_find_vif(skb->dev);
1275
1276 if (true_vifi >= 0 && mroute_do_assert &&
1277 /* pimsm uses asserts, when switching from RPT to SPT,
1278 so that we cannot check that packet arrived on an oif.
1279 It is bad, but otherwise we would need to move pretty
1280 large chunk of pimd to kernel. Ough... --ANK
1281 */
1282 (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1283 time_after(jiffies,
1284 cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1285 cache->mfc_un.res.last_assert = jiffies;
1286 ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1287 }
1288 goto dont_forward;
1289 }
1290
1291 vif_table[vif].pkt_in++;
1292 vif_table[vif].bytes_in+=skb->len;
1293
1294 /*
1295 * Forward the frame
1296 */
1297 for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1298 if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
1299 if (psend != -1) {
1300 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1301 if (skb2)
1302 ipmr_queue_xmit(skb2, cache, psend);
1303 }
1304 psend=ct;
1305 }
1306 }
1307 if (psend != -1) {
1308 if (local) {
1309 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1310 if (skb2)
1311 ipmr_queue_xmit(skb2, cache, psend);
1312 } else {
1313 ipmr_queue_xmit(skb, cache, psend);
1314 return 0;
1315 }
1316 }
1317
1318dont_forward:
1319 if (!local)
1320 kfree_skb(skb);
1321 return 0;
1322}
1323
1324
1325/*
1326 * Multicast packets for forwarding arrive here
1327 */
1328
1329int ip_mr_input(struct sk_buff *skb)
1330{
1331 struct mfc_cache *cache;
1332 int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
1333
1334	/* Packet is looped back after forwarding; it should not be
1335	   forwarded a second time, but it can still be delivered locally.
1336 */
1337 if (IPCB(skb)->flags&IPSKB_FORWARDED)
1338 goto dont_forward;
1339
1340 if (!local) {
1341 if (IPCB(skb)->opt.router_alert) {
1342 if (ip_call_ra_chain(skb))
1343 return 0;
1344 } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
1345			/* IGMPv1 (and broken IGMPv2 implementations such as
1346			   Cisco IOS <= 11.2(8)) do not put the router alert
1347			   option into IGMP packets destined to routable
1348 groups. It is very bad, because it means
1349 that we can forward NO IGMP messages.
1350 */
1351 read_lock(&mrt_lock);
1352 if (mroute_socket) {
1353 raw_rcv(mroute_socket, skb);
1354 read_unlock(&mrt_lock);
1355 return 0;
1356 }
1357 read_unlock(&mrt_lock);
1358 }
1359 }
1360
1361 read_lock(&mrt_lock);
1362 cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);
1363
1364 /*
1365 * No usable cache entry
1366 */
1367 if (cache==NULL) {
1368 int vif;
1369
1370 if (local) {
1371 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1372 ip_local_deliver(skb);
1373 if (skb2 == NULL) {
1374 read_unlock(&mrt_lock);
1375 return -ENOBUFS;
1376 }
1377 skb = skb2;
1378 }
1379
1380 vif = ipmr_find_vif(skb->dev);
1381 if (vif >= 0) {
1382 int err = ipmr_cache_unresolved(vif, skb);
1383 read_unlock(&mrt_lock);
1384
1385 return err;
1386 }
1387 read_unlock(&mrt_lock);
1388 kfree_skb(skb);
1389 return -ENODEV;
1390 }
1391
1392 ip_mr_forward(skb, cache, local);
1393
1394 read_unlock(&mrt_lock);
1395
1396 if (local)
1397 return ip_local_deliver(skb);
1398
1399 return 0;
1400
1401dont_forward:
1402 if (local)
1403 return ip_local_deliver(skb);
1404 kfree_skb(skb);
1405 return 0;
1406}
1407
1408#ifdef CONFIG_IP_PIMSM_V1
1409/*
1410 * Handle IGMP messages of PIMv1
1411 */
1412
1413int pim_rcv_v1(struct sk_buff * skb)
1414{
1415 struct igmphdr *pim;
1416 struct iphdr *encap;
1417 struct net_device *reg_dev = NULL;
1418
1419 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1420 goto drop;
1421
1422 pim = (struct igmphdr*)skb->h.raw;
1423
1424 if (!mroute_do_pim ||
1425 skb->len < sizeof(*pim) + sizeof(*encap) ||
1426 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1427 goto drop;
1428
1429 encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
1430 /*
1431 Check that:
1432 a. packet is really destined to a multicast group
1433 b. packet is not a NULL-REGISTER
1434 c. packet is not truncated
1435 */
1436 if (!MULTICAST(encap->daddr) ||
1437 encap->tot_len == 0 ||
1438 ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1439 goto drop;
1440
1441 read_lock(&mrt_lock);
1442 if (reg_vif_num >= 0)
1443 reg_dev = vif_table[reg_vif_num].dev;
1444 if (reg_dev)
1445 dev_hold(reg_dev);
1446 read_unlock(&mrt_lock);
1447
1448 if (reg_dev == NULL)
1449 goto drop;
1450
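	/*
	 * Strip the outer IP header and the PIMv1 register header, then
	 * re-inject the inner multicast packet as if it had arrived on the
	 * register vif device, so that it goes through normal multicast
	 * forwarding.
	 */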
1451 skb->mac.raw = skb->nh.raw;
1452 skb_pull(skb, (u8*)encap - skb->data);
1453 skb->nh.iph = (struct iphdr *)skb->data;
1454 skb->dev = reg_dev;
1455 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
1456 skb->protocol = htons(ETH_P_IP);
1457 skb->ip_summed = 0;
1458 skb->pkt_type = PACKET_HOST;
1459 dst_release(skb->dst);
1460 skb->dst = NULL;
1461 ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
1462 ((struct net_device_stats*)reg_dev->priv)->rx_packets++;
1463 nf_reset(skb);
1464 netif_rx(skb);
1465 dev_put(reg_dev);
1466 return 0;
1467 drop:
1468 kfree_skb(skb);
1469 return 0;
1470}
1471#endif
1472
1473#ifdef CONFIG_IP_PIMSM_V2
1474static int pim_rcv(struct sk_buff * skb)
1475{
1476 struct pimreghdr *pim;
1477 struct iphdr *encap;
1478 struct net_device *reg_dev = NULL;
1479
1480 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1481 goto drop;
1482
1483 pim = (struct pimreghdr*)skb->h.raw;
1484 if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1485 (pim->flags&PIM_NULL_REGISTER) ||
1486 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1487 (u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1488 goto drop;
1489
1490 /* check if the inner packet is destined to mcast group */
1491 encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
1492 if (!MULTICAST(encap->daddr) ||
1493 encap->tot_len == 0 ||
1494 ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1495 goto drop;
1496
1497 read_lock(&mrt_lock);
1498 if (reg_vif_num >= 0)
1499 reg_dev = vif_table[reg_vif_num].dev;
1500 if (reg_dev)
1501 dev_hold(reg_dev);
1502 read_unlock(&mrt_lock);
1503
1504 if (reg_dev == NULL)
1505 goto drop;
1506
1507 skb->mac.raw = skb->nh.raw;
1508 skb_pull(skb, (u8*)encap - skb->data);
1509 skb->nh.iph = (struct iphdr *)skb->data;
1510 skb->dev = reg_dev;
1511 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
1512 skb->protocol = htons(ETH_P_IP);
1513 skb->ip_summed = 0;
1514 skb->pkt_type = PACKET_HOST;
1515 dst_release(skb->dst);
1516 ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
1517 ((struct net_device_stats*)reg_dev->priv)->rx_packets++;
1518 skb->dst = NULL;
1519 nf_reset(skb);
1520 netif_rx(skb);
1521 dev_put(reg_dev);
1522 return 0;
1523 drop:
1524 kfree_skb(skb);
1525 return 0;
1526}
1527#endif
1528
1529static int
1530ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1531{
1532 int ct;
1533 struct rtnexthop *nhp;
1534 struct net_device *dev = vif_table[c->mfc_parent].dev;
1535 u8 *b = skb->tail;
1536 struct rtattr *mp_head;
1537
1538 if (dev)
1539 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1540
1541 mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1542
1543 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1544 if (c->mfc_un.res.ttls[ct] < 255) {
1545 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1546 goto rtattr_failure;
1547 nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1548 nhp->rtnh_flags = 0;
1549 nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1550 nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1551 nhp->rtnh_len = sizeof(*nhp);
1552 }
1553 }
1554 mp_head->rta_type = RTA_MULTIPATH;
1555 mp_head->rta_len = skb->tail - (u8*)mp_head;
1556 rtm->rtm_type = RTN_MULTICAST;
1557 return 1;
1558
1559rtattr_failure:
1560 skb_trim(skb, b - skb->data);
1561 return -EMSGSIZE;
1562}
1563
1564int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1565{
1566 int err;
1567 struct mfc_cache *cache;
1568 struct rtable *rt = (struct rtable*)skb->dst;
1569
1570 read_lock(&mrt_lock);
1571 cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1572
1573 if (cache==NULL) {
1574 struct net_device *dev;
1575 int vif;
1576
1577 if (nowait) {
1578 read_unlock(&mrt_lock);
1579 return -EAGAIN;
1580 }
1581
1582 dev = skb->dev;
1583 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1584 read_unlock(&mrt_lock);
1585 return -ENODEV;
1586 }
1587 skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
1588 skb->nh.iph->ihl = sizeof(struct iphdr)>>2;
1589 skb->nh.iph->saddr = rt->rt_src;
1590 skb->nh.iph->daddr = rt->rt_dst;
1591 skb->nh.iph->version = 0;
1592 err = ipmr_cache_unresolved(vif, skb);
1593 read_unlock(&mrt_lock);
1594 return err;
1595 }
1596
1597 if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1598 cache->mfc_flags |= MFC_NOTIFY;
1599 err = ipmr_fill_mroute(skb, cache, rtm);
1600 read_unlock(&mrt_lock);
1601 return err;
1602}
1603
1604#ifdef CONFIG_PROC_FS
1605/*
1606 * The /proc interfaces to multicast routing: /proc/ip_mr_cache and /proc/ip_mr_vif
1607 */
1608struct ipmr_vif_iter {
1609 int ct;
1610};
1611
1612static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1613 loff_t pos)
1614{
1615 for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1616 if(!VIF_EXISTS(iter->ct))
1617 continue;
1618 if (pos-- == 0)
1619 return &vif_table[iter->ct];
1620 }
1621 return NULL;
1622}
1623
1624static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1625{
1626 read_lock(&mrt_lock);
1627 return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1628 : SEQ_START_TOKEN;
1629}
1630
1631static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1632{
1633 struct ipmr_vif_iter *iter = seq->private;
1634
1635 ++*pos;
1636 if (v == SEQ_START_TOKEN)
1637 return ipmr_vif_seq_idx(iter, 0);
1638
1639 while (++iter->ct < maxvif) {
1640 if(!VIF_EXISTS(iter->ct))
1641 continue;
1642 return &vif_table[iter->ct];
1643 }
1644 return NULL;
1645}
1646
1647static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1648{
1649 read_unlock(&mrt_lock);
1650}
1651
1652static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1653{
1654 if (v == SEQ_START_TOKEN) {
1655 seq_puts(seq,
1656 "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n");
1657 } else {
1658 const struct vif_device *vif = v;
1659 const char *name = vif->dev ? vif->dev->name : "none";
1660
1661 seq_printf(seq,
1662 "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
1663 vif - vif_table,
1664 name, vif->bytes_in, vif->pkt_in,
1665 vif->bytes_out, vif->pkt_out,
1666 vif->flags, vif->local, vif->remote);
1667 }
1668 return 0;
1669}
1670
1671static struct seq_operations ipmr_vif_seq_ops = {
1672 .start = ipmr_vif_seq_start,
1673 .next = ipmr_vif_seq_next,
1674 .stop = ipmr_vif_seq_stop,
1675 .show = ipmr_vif_seq_show,
1676};
1677
1678static int ipmr_vif_open(struct inode *inode, struct file *file)
1679{
1680 struct seq_file *seq;
1681 int rc = -ENOMEM;
1682 struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1683
1684 if (!s)
1685 goto out;
1686
1687 rc = seq_open(file, &ipmr_vif_seq_ops);
1688 if (rc)
1689 goto out_kfree;
1690
1691 s->ct = 0;
1692 seq = file->private_data;
1693 seq->private = s;
1694out:
1695 return rc;
1696out_kfree:
1697 kfree(s);
1698 goto out;
1699
1700}
1701
1702static struct file_operations ipmr_vif_fops = {
1703 .owner = THIS_MODULE,
1704 .open = ipmr_vif_open,
1705 .read = seq_read,
1706 .llseek = seq_lseek,
1707 .release = seq_release_private,
1708};
1709
1710struct ipmr_mfc_iter {
1711 struct mfc_cache **cache;
1712 int ct;
1713};
1714
1715
1716static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1717{
1718 struct mfc_cache *mfc;
1719
1720 it->cache = mfc_cache_array;
1721 read_lock(&mrt_lock);
1722 for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1723 for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1724 if (pos-- == 0)
1725 return mfc;
1726 read_unlock(&mrt_lock);
1727
1728 it->cache = &mfc_unres_queue;
1729 spin_lock_bh(&mfc_unres_lock);
1730 for(mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1731 if (pos-- == 0)
1732 return mfc;
1733 spin_unlock_bh(&mfc_unres_lock);
1734
1735 it->cache = NULL;
1736 return NULL;
1737}
1738
1739
1740static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1741{
1742 struct ipmr_mfc_iter *it = seq->private;
1743 it->cache = NULL;
1744 it->ct = 0;
1745 return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1746 : SEQ_START_TOKEN;
1747}
1748
1749static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1750{
1751 struct mfc_cache *mfc = v;
1752 struct ipmr_mfc_iter *it = seq->private;
1753
1754 ++*pos;
1755
1756 if (v == SEQ_START_TOKEN)
1757 return ipmr_mfc_seq_idx(seq->private, 0);
1758
1759 if (mfc->next)
1760 return mfc->next;
1761
1762 if (it->cache == &mfc_unres_queue)
1763 goto end_of_list;
1764
1765 BUG_ON(it->cache != mfc_cache_array);
1766
1767 while (++it->ct < MFC_LINES) {
1768 mfc = mfc_cache_array[it->ct];
1769 if (mfc)
1770 return mfc;
1771 }
1772
1773 /* exhausted cache_array, show unresolved */
1774 read_unlock(&mrt_lock);
1775 it->cache = &mfc_unres_queue;
1776 it->ct = 0;
1777
1778 spin_lock_bh(&mfc_unres_lock);
1779 mfc = mfc_unres_queue;
1780 if (mfc)
1781 return mfc;
1782
1783 end_of_list:
1784 spin_unlock_bh(&mfc_unres_lock);
1785 it->cache = NULL;
1786
1787 return NULL;
1788}
1789
1790static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1791{
1792 struct ipmr_mfc_iter *it = seq->private;
1793
1794 if (it->cache == &mfc_unres_queue)
1795 spin_unlock_bh(&mfc_unres_lock);
1796 else if (it->cache == mfc_cache_array)
1797 read_unlock(&mrt_lock);
1798}
1799
1800static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1801{
1802 int n;
1803
1804 if (v == SEQ_START_TOKEN) {
1805 seq_puts(seq,
1806 "Group Origin Iif Pkts Bytes Wrong Oifs\n");
1807 } else {
1808 const struct mfc_cache *mfc = v;
1809 const struct ipmr_mfc_iter *it = seq->private;
1810
1811 seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1812 (unsigned long) mfc->mfc_mcastgrp,
1813 (unsigned long) mfc->mfc_origin,
1814 mfc->mfc_parent,
1815 mfc->mfc_un.res.pkt,
1816 mfc->mfc_un.res.bytes,
1817 mfc->mfc_un.res.wrong_if);
1818
1819 if (it->cache != &mfc_unres_queue) {
1820 for(n = mfc->mfc_un.res.minvif;
1821 n < mfc->mfc_un.res.maxvif; n++ ) {
1822 if(VIF_EXISTS(n)
1823 && mfc->mfc_un.res.ttls[n] < 255)
1824 seq_printf(seq,
1825 " %2d:%-3d",
1826 n, mfc->mfc_un.res.ttls[n]);
1827 }
1828 }
1829 seq_putc(seq, '\n');
1830 }
1831 return 0;
1832}
1833
1834static struct seq_operations ipmr_mfc_seq_ops = {
1835 .start = ipmr_mfc_seq_start,
1836 .next = ipmr_mfc_seq_next,
1837 .stop = ipmr_mfc_seq_stop,
1838 .show = ipmr_mfc_seq_show,
1839};
1840
1841static int ipmr_mfc_open(struct inode *inode, struct file *file)
1842{
1843 struct seq_file *seq;
1844 int rc = -ENOMEM;
1845 struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1846
1847 if (!s)
1848 goto out;
1849
1850 rc = seq_open(file, &ipmr_mfc_seq_ops);
1851 if (rc)
1852 goto out_kfree;
1853
1854 seq = file->private_data;
1855 seq->private = s;
1856out:
1857 return rc;
1858out_kfree:
1859 kfree(s);
1860 goto out;
1861
1862}
1863
1864static struct file_operations ipmr_mfc_fops = {
1865 .owner = THIS_MODULE,
1866 .open = ipmr_mfc_open,
1867 .read = seq_read,
1868 .llseek = seq_lseek,
1869 .release = seq_release_private,
1870};
1871#endif
1872
1873#ifdef CONFIG_IP_PIMSM_V2
1874static struct net_protocol pim_protocol = {
1875 .handler = pim_rcv,
1876};
1877#endif
1878
1879
1880/*
1881 * Setup for IP multicast routing
1882 */
1883
1884void __init ip_mr_init(void)
1885{
1886 mrt_cachep = kmem_cache_create("ip_mrt_cache",
1887 sizeof(struct mfc_cache),
1888 0, SLAB_HWCACHE_ALIGN,
1889 NULL, NULL);
1890 if (!mrt_cachep)
1891 panic("cannot allocate ip_mrt_cache");
1892
1893 init_timer(&ipmr_expire_timer);
1894 ipmr_expire_timer.function=ipmr_expire_process;
1895 register_netdevice_notifier(&ip_mr_notifier);
1896#ifdef CONFIG_PROC_FS
1897 proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
1898 proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
1899#endif
1900}
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
new file mode 100644
index 000000000000..63a82b4b64bb
--- /dev/null
+++ b/net/ipv4/ipvs/Kconfig
@@ -0,0 +1,244 @@
1#
2# IP Virtual Server configuration
3#
4menu "IP: Virtual Server Configuration"
5 depends on INET && NETFILTER
6
7config IP_VS
8 tristate "IP virtual server support (EXPERIMENTAL)"
9 depends on INET && NETFILTER
10 ---help---
11 IP Virtual Server support will let you build a high-performance
12 virtual server based on a cluster of two or more real servers. This
13 option must be enabled for at least one of the clustered computers
14 that will take care of intercepting incoming connections to a
15 single IP address and scheduling them to real servers.
16
17 Three request dispatching techniques are implemented: virtual
18 server via NAT, virtual server via tunneling, and virtual
19 server via direct routing. Several scheduling algorithms can
20 be used to choose which server the connection is directed to,
21 thus load balancing can be achieved among the servers. For more
22 information and its administration program, please visit the
23 following URL: <http://www.linuxvirtualserver.org/>.
24
25 If you want to compile it in kernel, say Y. To compile it as a
26 module, choose M here. If unsure, say N.
27
28config IP_VS_DEBUG
29 bool "IP virtual server debugging"
30 depends on IP_VS
31 ---help---
32 Say Y here if you want to get additional messages useful in
33 debugging the IP virtual server code. You can change the debug
34 level in /proc/sys/net/ipv4/vs/debug_level
35
36config IP_VS_TAB_BITS
37 int "IPVS connection table size (the Nth power of 2)"
38 depends on IP_VS
39 default "12"
40 ---help---
41 The IPVS connection hash table uses the chaining scheme to handle
42 hash collisions. Using a big IPVS connection hash table will greatly
43 reduce conflicts when there are hundreds of thousands of connections
44 in the hash table.
45
46 Note that the table size must be a power of 2. The table size will
47 be 2 raised to the power of the number you enter. The number to
48 choose is from 8 to 20; the default is 12, which means a table size
49 of 4096. Don't choose a number that is too small, otherwise you
50 will lose performance. You can adapt the table size yourself,
51 according to your virtual server application. It is good to set the
52 table size to not much less than the number of connections per
53 second multiplied by the average duration of a connection in the
54 table. For example, if your virtual server gets 200 connections per
55 second and a connection stays in the table for 200 seconds on
56 average, the table size should be not much less than 200x200; it is
57 good to set it to 32768 (2**15).
58
59 Note also that each connection effectively occupies 128 bytes and
60 each hash entry uses 8 bytes, so you can estimate how much memory is
61 needed for your box.
62
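# A rough estimate based on the sizes quoted above: with the default of
# 12 bits the hash table itself takes 2^12 * 8 = 32 KiB, and 100,000
# concurrent connections add roughly 100,000 * 128 bytes, i.e. about 12 MiB.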
63comment "IPVS transport protocol load balancing support"
64 depends on IP_VS
65
66config IP_VS_PROTO_TCP
67 bool "TCP load balancing support"
68 depends on IP_VS
69 ---help---
70 This option enables support for load balancing TCP transport
71 protocol. Say Y if unsure.
72
73config IP_VS_PROTO_UDP
74 bool "UDP load balancing support"
75 depends on IP_VS
76 ---help---
77 This option enables support for load balancing UDP transport
78 protocol. Say Y if unsure.
79
80config IP_VS_PROTO_ESP
81 bool "ESP load balancing support"
82 depends on IP_VS
83 ---help---
84 This option enables support for load balancing ESP (Encapsulation
85 Security Payload) transport protocol. Say Y if unsure.
86
87config IP_VS_PROTO_AH
88 bool "AH load balancing support"
89 depends on IP_VS
90 ---help---
91 This option enables support for load balancing AH (Authentication
92 Header) transport protocol. Say Y if unsure.
93
94comment "IPVS scheduler"
95 depends on IP_VS
96
97config IP_VS_RR
98 tristate "round-robin scheduling"
99 depends on IP_VS
100 ---help---
101 The round-robin scheduling algorithm simply directs network
102 connections to different real servers in a round-robin manner.
103
104 If you want to compile it in kernel, say Y. To compile it as a
105 module, choose M here. If unsure, say N.
106
107config IP_VS_WRR
108 tristate "weighted round-robin scheduling"
109 depends on IP_VS
110 ---help---
111 The weighted round-robin scheduling algorithm directs network
112 connections to different real servers based on server weights
113 in a round-robin manner. Servers with higher weights receive
114 new connections before those with lower weights and get more
115 connections than those with lower weights; servers with equal
116 weights get an equal share of new connections.
117
118 If you want to compile it in kernel, say Y. To compile it as a
119 module, choose M here. If unsure, say N.
120
121config IP_VS_LC
122 tristate "least-connection scheduling"
123 depends on IP_VS
124 ---help---
125 The least-connection scheduling algorithm directs network
126 connections to the server with the least number of active
127 connections.
128
129 If you want to compile it in kernel, say Y. To compile it as a
130 module, choose M here. If unsure, say N.
131
132config IP_VS_WLC
133 tristate "weighted least-connection scheduling"
134 depends on IP_VS
135 ---help---
136 The weighted least-connection scheduling algorithm directs network
137 connections to the server with the least active connections
138 normalized by the server weight.
139
140 If you want to compile it in kernel, say Y. To compile it as a
141 module, choose M here. If unsure, say N.
142
143config IP_VS_LBLC
144 tristate "locality-based least-connection scheduling"
145 depends on IP_VS
146 ---help---
147 The locality-based least-connection scheduling algorithm is for
148 destination IP load balancing. It is usually used in cache clusters.
149 This algorithm usually directs packets destined for an IP address to
150 its server if the server is alive and under load. If the server is
151 overloaded (its number of active connections is larger than its
152 weight) and there is a server running at half of its load, then the
153 weighted least-connection server is allocated to this IP address.
154
155 If you want to compile it in kernel, say Y. To compile it as a
156 module, choose M here. If unsure, say N.
157
158config IP_VS_LBLCR
159 tristate "locality-based least-connection with replication scheduling"
160 depends on IP_VS
161 ---help---
162 The locality-based least-connection with replication scheduling
163 algorithm is also for destination IP load balancing. It is
164 usually used in cache cluster. It differs from the LBLC scheduling
165 as follows: the load balancer maintains mappings from a target
166 to a set of server nodes that can serve the target. Requests for
167 a target are assigned to the least-connection node in the target's
168 server set. If all the nodes in the server set are overloaded,
169 it picks a least-connection node from the cluster and adds it
170 to the server set for the target. If the server set has not been
171 modified for the specified time, the most loaded node is removed
172 from the server set, in order to avoid a high degree of replication.
173
174 If you want to compile it in kernel, say Y. To compile it as a
175 module, choose M here. If unsure, say N.
176
177config IP_VS_DH
178 tristate "destination hashing scheduling"
179 depends on IP_VS
180 ---help---
181 The destination hashing scheduling algorithm assigns network
182 connections to the servers through looking up a statically assigned
183 hash table by their destination IP addresses.
184
185 If you want to compile it in kernel, say Y. To compile it as a
186 module, choose M here. If unsure, say N.
187
188config IP_VS_SH
189 tristate "source hashing scheduling"
190 depends on IP_VS
191 ---help---
192 The source hashing scheduling algorithm assigns network
193 connections to the servers through looking up a statically assigned
194 hash table by their source IP addresses.
195
196 If you want to compile it in kernel, say Y. To compile it as a
197 module, choose M here. If unsure, say N.
198
199config IP_VS_SED
200 tristate "shortest expected delay scheduling"
201 depends on IP_VS
202 ---help---
203 The shortest expected delay scheduling algorithm assigns network
204 connections to the server with the shortest expected delay. The
205 expected delay that the job will experience is (Ci + 1) / Ui if
206 sent to the ith server, in which Ci is the number of connections
207 on the ith server and Ui is the fixed service rate (weight)
208 of the ith server.
209
210 If you want to compile it in kernel, say Y. To compile it as a
211 module, choose M here. If unsure, say N.
212
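# A small illustration of the formula above (hypothetical numbers): with
# weights U1=1, U2=3 and current connection counts C1=0, C2=3, the expected
# delays are (0+1)/1 = 1.00 and (3+1)/3 = 1.33, so server 1 would be chosen.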
213config IP_VS_NQ
214 tristate "never queue scheduling"
215 depends on IP_VS
216 ---help---
217 The never queue scheduling algorithm adopts a two-speed model.
218 When there is an idle server available, the job will be sent to
219 the idle server, instead of waiting for a fast one. When there
220 is no idle server available, the job will be sent to the server
221 that minimizes its expected delay (the Shortest Expected Delay
222 scheduling algorithm).
223
224 If you want to compile it in kernel, say Y. To compile it as a
225 module, choose M here. If unsure, say N.
226
227comment "IPVS application helper"
228 depends on IP_VS
229
230config IP_VS_FTP
231 tristate "FTP protocol helper"
232 depends on IP_VS && IP_VS_PROTO_TCP
233 ---help---
234 FTP is a protocol that transfers IP addresses and/or port numbers in
235 the payload. In the virtual server via Network Address Translation,
236 the IP address and port number of real servers cannot be sent to
237 clients in ftp connections directly, so the FTP protocol helper is
238 required for tracking the connection and mangling it back to that of
239 the virtual service.
240
241 If you want to compile it in kernel, say Y. To compile it as a
242 module, choose M here. If unsure, say N.
243
244endmenu
diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile
new file mode 100644
index 000000000000..a788461a40c9
--- /dev/null
+++ b/net/ipv4/ipvs/Makefile
@@ -0,0 +1,34 @@
1#
2# Makefile for the IPVS modules on top of IPv4.
3#
4
5# IPVS transport protocol load balancing support
6ip_vs_proto-objs-y :=
7ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
8ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
9ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o
10ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o
11
12ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
13 ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
14 ip_vs_est.o ip_vs_proto.o ip_vs_proto_icmp.o \
15 $(ip_vs_proto-objs-y)
16
17
18# IPVS core
19obj-$(CONFIG_IP_VS) += ip_vs.o
20
21# IPVS schedulers
22obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
23obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
24obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
25obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
26obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
27obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
28obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
29obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
30obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
31obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
32
33# IPVS application helpers
34obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
new file mode 100644
index 000000000000..d9212addd193
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -0,0 +1,658 @@
1/*
2 * ip_vs_app.c: Application module support for IPVS
3 *
4 * Version: $Id: ip_vs_app.c,v 1.17 2003/03/22 06:31:21 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
14 * is that ip_vs_app module handles the reverse direction (incoming requests
15 * and outgoing responses).
16 *
17 * IP_MASQ_APP application masquerading module
18 *
19 * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/kernel.h>
25#include <linux/skbuff.h>
26#include <linux/in.h>
27#include <linux/ip.h>
28#include <net/protocol.h>
29#include <asm/system.h>
30#include <linux/stat.h>
31#include <linux/proc_fs.h>
32#include <linux/seq_file.h>
33
34#include <net/ip_vs.h>
35
36EXPORT_SYMBOL(register_ip_vs_app);
37EXPORT_SYMBOL(unregister_ip_vs_app);
38EXPORT_SYMBOL(register_ip_vs_app_inc);
39
40/* ipvs application list head */
41static LIST_HEAD(ip_vs_app_list);
42static DECLARE_MUTEX(__ip_vs_app_mutex);
43
44
45/*
46 * Get an ip_vs_app object
47 */
48static inline int ip_vs_app_get(struct ip_vs_app *app)
49{
50 /* test and get the module atomically */
51 if (app->module)
52 return try_module_get(app->module);
53 else
54 return 1;
55}
56
57
58static inline void ip_vs_app_put(struct ip_vs_app *app)
59{
60 if (app->module)
61 module_put(app->module);
62}
63
64
65/*
66 * Allocate/initialize app incarnation and register it in proto apps.
67 */
68static int
69ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
70{
71 struct ip_vs_protocol *pp;
72 struct ip_vs_app *inc;
73 int ret;
74
75 if (!(pp = ip_vs_proto_get(proto)))
76 return -EPROTONOSUPPORT;
77
78 if (!pp->unregister_app)
79 return -EOPNOTSUPP;
80
81 inc = kmalloc(sizeof(struct ip_vs_app), GFP_KERNEL);
82 if (!inc)
83 return -ENOMEM;
84 memcpy(inc, app, sizeof(*inc));
85 INIT_LIST_HEAD(&inc->p_list);
86 INIT_LIST_HEAD(&inc->incs_list);
87 inc->app = app;
88 inc->port = htons(port);
89 atomic_set(&inc->usecnt, 0);
90
91 if (app->timeouts) {
92 inc->timeout_table =
93 ip_vs_create_timeout_table(app->timeouts,
94 app->timeouts_size);
95 if (!inc->timeout_table) {
96 ret = -ENOMEM;
97 goto out;
98 }
99 }
100
101 ret = pp->register_app(inc);
102 if (ret)
103 goto out;
104
105 list_add(&inc->a_list, &app->incs_list);
106 IP_VS_DBG(9, "%s application %s:%u registered\n",
107 pp->name, inc->name, inc->port);
108
109 return 0;
110
111 out:
112 if (inc->timeout_table)
113 kfree(inc->timeout_table);
114 kfree(inc);
115 return ret;
116}
117
118
119/*
120 * Release app incarnation
121 */
122static void
123ip_vs_app_inc_release(struct ip_vs_app *inc)
124{
125 struct ip_vs_protocol *pp;
126
127 if (!(pp = ip_vs_proto_get(inc->protocol)))
128 return;
129
130 if (pp->unregister_app)
131 pp->unregister_app(inc);
132
133 IP_VS_DBG(9, "%s App %s:%u unregistered\n",
134 pp->name, inc->name, inc->port);
135
136 list_del(&inc->a_list);
137
138 if (inc->timeout_table != NULL)
139 kfree(inc->timeout_table);
140 kfree(inc);
141}
142
143
144/*
145 * Get reference to app inc (only called from softirq)
146 *
147 */
148int ip_vs_app_inc_get(struct ip_vs_app *inc)
149{
150 int result;
151
152 atomic_inc(&inc->usecnt);
153 if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
154 atomic_dec(&inc->usecnt);
155 return result;
156}
157
158
159/*
160 * Put the app inc (only called from timer or net softirq)
161 */
162void ip_vs_app_inc_put(struct ip_vs_app *inc)
163{
164 ip_vs_app_put(inc->app);
165 atomic_dec(&inc->usecnt);
166}
167
168
169/*
170 * Register an application incarnation in protocol applications
171 */
172int
173register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
174{
175 int result;
176
177 down(&__ip_vs_app_mutex);
178
179 result = ip_vs_app_inc_new(app, proto, port);
180
181 up(&__ip_vs_app_mutex);
182
183 return result;
184}
185
186
187/*
188 * ip_vs_app registration routine
189 */
190int register_ip_vs_app(struct ip_vs_app *app)
191{
192 /* increase the module use count */
193 ip_vs_use_count_inc();
194
195 down(&__ip_vs_app_mutex);
196
197 list_add(&app->a_list, &ip_vs_app_list);
198
199 up(&__ip_vs_app_mutex);
200
201 return 0;
202}
203
204
205/*
206 * ip_vs_app unregistration routine
207 * We are sure there are no app incarnations attached to services
208 */
209void unregister_ip_vs_app(struct ip_vs_app *app)
210{
211 struct ip_vs_app *inc, *nxt;
212
213 down(&__ip_vs_app_mutex);
214
215 list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
216 ip_vs_app_inc_release(inc);
217 }
218
219 list_del(&app->a_list);
220
221 up(&__ip_vs_app_mutex);
222
223 /* decrease the module use count */
224 ip_vs_use_count_dec();
225}
226
227
228#if 0000
229/*
230 * Get reference to app by name (called from user context)
231 */
232struct ip_vs_app *ip_vs_app_get_by_name(char *appname)
233{
234 struct ip_vs_app *app, *a = NULL;
235
236 down(&__ip_vs_app_mutex);
237
238 list_for_each_entry(app, &ip_vs_app_list, a_list) {
239 if (strcmp(app->name, appname))
240 continue;
241
242 /* softirq may call ip_vs_app_get too, so the caller
243 must disable softirq on the current CPU */
244 if (ip_vs_app_get(app))
245 a = app;
246 break;
247 }
248
249 up(&__ip_vs_app_mutex);
250
251 return a;
252}
253#endif
254
255
256/*
257 * Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
258 */
259int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
260{
261 return pp->app_conn_bind(cp);
262}
263
264
265/*
266 * Unbind cp from application incarnation (called by cp destructor)
267 */
268void ip_vs_unbind_app(struct ip_vs_conn *cp)
269{
270 struct ip_vs_app *inc = cp->app;
271
272 if (!inc)
273 return;
274
275 if (inc->unbind_conn)
276 inc->unbind_conn(inc, cp);
277 if (inc->done_conn)
278 inc->done_conn(inc, cp);
279 ip_vs_app_inc_put(inc);
280 cp->app = NULL;
281}
282
283
284/*
285 * Fixes th->seq based on ip_vs_seq info.
286 */
287static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
288{
289 __u32 seq = ntohl(th->seq);
290
291 /*
292 * Adjust seq with delta-offset for all packets after
293 * the most recent resized pkt seq and with previous_delta offset
294 * for all packets before most recent resized pkt seq.
295 */
296 if (vseq->delta || vseq->previous_delta) {
297 if(after(seq, vseq->init_seq)) {
298 th->seq = htonl(seq + vseq->delta);
299 IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n",
300 vseq->delta);
301 } else {
302 th->seq = htonl(seq + vseq->previous_delta);
303 IP_VS_DBG(9, "vs_fix_seq(): added previous_delta "
304 "(%d) to seq\n", vseq->previous_delta);
305 }
306 }
307}
308
309
310/*
311 * Fixes th->ack_seq based on ip_vs_seq info.
312 */
313static inline void
314vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
315{
316 __u32 ack_seq = ntohl(th->ack_seq);
317
318 /*
319 * Adjust ack_seq with delta-offset for
320 * the packets AFTER the most recent resized pkt has caused a shift;
321 * for packets before the most recent resized pkt, use previous_delta
322 */
323 if (vseq->delta || vseq->previous_delta) {
324 /* since ack_seq is the number of the octet that is expected
325 to be received next, compare it with init_seq+delta */
326 if(after(ack_seq, vseq->init_seq+vseq->delta)) {
327 th->ack_seq = htonl(ack_seq - vseq->delta);
328 IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta "
329 "(%d) from ack_seq\n", vseq->delta);
330
331 } else {
332 th->ack_seq = htonl(ack_seq - vseq->previous_delta);
333 IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted "
334 "previous_delta (%d) from ack_seq\n",
335 vseq->previous_delta);
336 }
337 }
338}
339
340
341/*
342 * Updates ip_vs_seq if pkt has been resized
343 * Assumes already checked proto==IPPROTO_TCP and diff!=0.
344 */
345static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
346 unsigned flag, __u32 seq, int diff)
347{
348 /* spinlock is to keep updating cp->flags atomic */
349 spin_lock(&cp->lock);
350 if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
351 vseq->previous_delta = vseq->delta;
352 vseq->delta += diff;
353 vseq->init_seq = seq;
354 cp->flags |= flag;
355 }
356 spin_unlock(&cp->lock);
357}
358
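/*
 * Illustration (hypothetical numbers): if an application helper grows an
 * outgoing payload by 10 bytes at sequence 1000, vs_seq_update() records
 * init_seq=1000 and delta=+10 in cp->out_seq.  vs_fix_seq() then adds 10
 * to the sequence number of later outgoing segments, and vs_fix_ack_seq()
 * subtracts 10 from incoming acknowledgements that cover data beyond that
 * point, so both ends keep seeing consistent sequence numbers.
 */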
359static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb,
360 struct ip_vs_app *app)
361{
362 int diff;
363 unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4;
364 struct tcphdr *th;
365 __u32 seq;
366
367 if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
368 return 0;
369
370 th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset);
371
372 /*
373 * Remember seq number in case this pkt gets resized
374 */
375 seq = ntohl(th->seq);
376
377 /*
378 * Fix seq stuff if flagged as so.
379 */
380 if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
381 vs_fix_seq(&cp->out_seq, th);
382 if (cp->flags & IP_VS_CONN_F_IN_SEQ)
383 vs_fix_ack_seq(&cp->in_seq, th);
384
385 /*
386 * Call private output hook function
387 */
388 if (app->pkt_out == NULL)
389 return 1;
390
391 if (!app->pkt_out(app, cp, pskb, &diff))
392 return 0;
393
394 /*
395 * Update ip_vs seq stuff if len has changed.
396 */
397 if (diff != 0)
398 vs_seq_update(cp, &cp->out_seq,
399 IP_VS_CONN_F_OUT_SEQ, seq, diff);
400
401 return 1;
402}
403
404/*
405 * Output pkt hook. Will call the bound ip_vs_app specific function.
406 * Called by the ipvs packet handler; assumes cp!=NULL was checked previously.
407 * Returns false if it can't handle the packet (oom).
408 */
409int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb)
410{
411 struct ip_vs_app *app;
412
413 /*
414 * check if application module is bound to
415 * this ip_vs_conn.
416 */
417 if ((app = cp->app) == NULL)
418 return 1;
419
420 /* TCP is complicated */
421 if (cp->protocol == IPPROTO_TCP)
422 return app_tcp_pkt_out(cp, pskb, app);
423
424 /*
425 * Call private output hook function
426 */
427 if (app->pkt_out == NULL)
428 return 1;
429
430 return app->pkt_out(app, cp, pskb, NULL);
431}
432
433
434static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb,
435 struct ip_vs_app *app)
436{
437 int diff;
438 unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4;
439 struct tcphdr *th;
440 __u32 seq;
441
442 if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
443 return 0;
444
445 th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset);
446
447 /*
448 * Remember seq number in case this pkt gets resized
449 */
450 seq = ntohl(th->seq);
451
452 /*
453 * Fix seq stuff if flagged as so.
454 */
455 if (cp->flags & IP_VS_CONN_F_IN_SEQ)
456 vs_fix_seq(&cp->in_seq, th);
457 if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
458 vs_fix_ack_seq(&cp->out_seq, th);
459
460 /*
461 * Call private input hook function
462 */
463 if (app->pkt_in == NULL)
464 return 1;
465
466 if (!app->pkt_in(app, cp, pskb, &diff))
467 return 0;
468
469 /*
470 * Update ip_vs seq stuff if len has changed.
471 */
472 if (diff != 0)
473 vs_seq_update(cp, &cp->in_seq,
474 IP_VS_CONN_F_IN_SEQ, seq, diff);
475
476 return 1;
477}
478
479/*
480 * Input pkt hook. Will call the bound ip_vs_app specific function.
481 * Called by the ipvs packet handler; assumes cp!=NULL was checked previously.
482 * Returns false if it can't handle the packet (oom).
483 */
484int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb)
485{
486 struct ip_vs_app *app;
487
488 /*
489 * check if application module is bound to
490 * this ip_vs_conn.
491 */
492 if ((app = cp->app) == NULL)
493 return 1;
494
495 /* TCP is complicated */
496 if (cp->protocol == IPPROTO_TCP)
497 return app_tcp_pkt_in(cp, pskb, app);
498
499 /*
500 * Call private input hook function
501 */
502 if (app->pkt_in == NULL)
503 return 1;
504
505 return app->pkt_in(app, cp, pskb, NULL);
506}
507
508
509#ifdef CONFIG_PROC_FS
510/*
511 * /proc/net/ip_vs_app entry function
512 */
513
514static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
515{
516 struct ip_vs_app *app, *inc;
517
518 list_for_each_entry(app, &ip_vs_app_list, a_list) {
519 list_for_each_entry(inc, &app->incs_list, a_list) {
520 if (pos-- == 0)
521 return inc;
522 }
523 }
524 return NULL;
525
526}
527
528static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
529{
530 down(&__ip_vs_app_mutex);
531
532 return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
533}
534
535static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
536{
537 struct ip_vs_app *inc, *app;
538 struct list_head *e;
539
540 ++*pos;
541 if (v == SEQ_START_TOKEN)
542 return ip_vs_app_idx(0);
543
544 inc = v;
545 app = inc->app;
546
547 if ((e = inc->a_list.next) != &app->incs_list)
548 return list_entry(e, struct ip_vs_app, a_list);
549
550 /* go on to next application */
551 for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
552 app = list_entry(e, struct ip_vs_app, a_list);
553 list_for_each_entry(inc, &app->incs_list, a_list) {
554 return inc;
555 }
556 }
557 return NULL;
558}
559
560static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
561{
562 up(&__ip_vs_app_mutex);
563}
564
565static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
566{
567 if (v == SEQ_START_TOKEN)
568 seq_puts(seq, "prot port usecnt name\n");
569 else {
570 const struct ip_vs_app *inc = v;
571
572 seq_printf(seq, "%-3s %-7u %-6d %-17s\n",
573 ip_vs_proto_name(inc->protocol),
574 ntohs(inc->port),
575 atomic_read(&inc->usecnt),
576 inc->name);
577 }
578 return 0;
579}
580
581static struct seq_operations ip_vs_app_seq_ops = {
582 .start = ip_vs_app_seq_start,
583 .next = ip_vs_app_seq_next,
584 .stop = ip_vs_app_seq_stop,
585 .show = ip_vs_app_seq_show,
586};
587
588static int ip_vs_app_open(struct inode *inode, struct file *file)
589{
590 return seq_open(file, &ip_vs_app_seq_ops);
591}
592
593static struct file_operations ip_vs_app_fops = {
594 .owner = THIS_MODULE,
595 .open = ip_vs_app_open,
596 .read = seq_read,
597 .llseek = seq_lseek,
598 .release = seq_release,
599};
600#endif
601
602
603/*
604 * Replace a segment of data with a new segment
605 */
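/*
 * Three cases are handled below: the new segment is not longer than the
 * old one (shift the trailing data left and trim the skb), the extra
 * bytes fit in the existing tailroom (extend in place), or the skb head
 * must first be reallocated with pskb_expand_head().
 */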
606int ip_vs_skb_replace(struct sk_buff *skb, int pri,
607 char *o_buf, int o_len, char *n_buf, int n_len)
608{
609 struct iphdr *iph;
610 int diff;
611 int o_offset;
612 int o_left;
613
614 EnterFunction(9);
615
616 diff = n_len - o_len;
617 o_offset = o_buf - (char *)skb->data;
618 /* The length of left data after o_buf+o_len in the skb data */
619 o_left = skb->len - (o_offset + o_len);
620
621 if (diff <= 0) {
622 memmove(o_buf + n_len, o_buf + o_len, o_left);
623 memcpy(o_buf, n_buf, n_len);
624 skb_trim(skb, skb->len + diff);
625 } else if (diff <= skb_tailroom(skb)) {
626 skb_put(skb, diff);
627 memmove(o_buf + n_len, o_buf + o_len, o_left);
628 memcpy(o_buf, n_buf, n_len);
629 } else {
630 if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
631 return -ENOMEM;
632 skb_put(skb, diff);
633 memmove(skb->data + o_offset + n_len,
634 skb->data + o_offset + o_len, o_left);
635 memcpy(skb->data + o_offset, n_buf, n_len);
636 }
637
638 /* must update the iph total length here */
639 iph = skb->nh.iph;
640 iph->tot_len = htons(skb->len);
641
642 LeaveFunction(9);
643 return 0;
644}
645
646
647int ip_vs_app_init(void)
648{
649 /* we will replace it with proc_net_ipvs_create() soon */
650 proc_net_fops_create("ip_vs_app", 0, &ip_vs_app_fops);
651 return 0;
652}
653
654
655void ip_vs_app_cleanup(void)
656{
657 proc_net_remove("ip_vs_app");
658}
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
new file mode 100644
index 000000000000..fd6feb5499fe
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -0,0 +1,920 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 *
19 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
20 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
21 * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
22 *
23 * Changes:
24 *
25 */
26
27#include <linux/kernel.h>
28#include <linux/vmalloc.h>
29#include <linux/proc_fs.h> /* for proc_net_* */
30#include <linux/seq_file.h>
31#include <linux/jhash.h>
32#include <linux/random.h>
33
34#include <net/ip_vs.h>
35
36
37/*
38 * Connection hash table: for input and output packets lookups of IPVS
39 */
40static struct list_head *ip_vs_conn_tab;
41
42/* SLAB cache for IPVS connections */
43static kmem_cache_t *ip_vs_conn_cachep;
44
45/* counter for current IPVS connections */
46static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
47
48/* counter for no client port connections */
49static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
50
51/* random value for IPVS connection hash */
52static unsigned int ip_vs_conn_rnd;
53
54/*
55 * Fine locking granularity for big connection hash table
56 */
57#define CT_LOCKARRAY_BITS 4
58#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
59#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
60
61struct ip_vs_aligned_lock
62{
63 rwlock_t l;
64} __attribute__((__aligned__(SMP_CACHE_BYTES)));
65
66/* lock array for conn table */
67static struct ip_vs_aligned_lock
68__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
69
70static inline void ct_read_lock(unsigned key)
71{
72 read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
73}
74
75static inline void ct_read_unlock(unsigned key)
76{
77 read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
78}
79
80static inline void ct_write_lock(unsigned key)
81{
82 write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
83}
84
85static inline void ct_write_unlock(unsigned key)
86{
87 write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
88}
89
90static inline void ct_read_lock_bh(unsigned key)
91{
92 read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
93}
94
95static inline void ct_read_unlock_bh(unsigned key)
96{
97 read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
98}
99
100static inline void ct_write_lock_bh(unsigned key)
101{
102 write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
103}
104
105static inline void ct_write_unlock_bh(unsigned key)
106{
107 write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
108}
109
110
111/*
112 * Returns hash value for IPVS connection entry
113 */
114static unsigned int ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port)
115{
116 return jhash_3words(addr, port, proto, ip_vs_conn_rnd)
117 & IP_VS_CONN_TAB_MASK;
118}
119
120
121/*
122 * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
123 * returns bool success.
124 */
125static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
126{
127 unsigned hash;
128 int ret;
129
130 /* Hash by protocol, client address and port */
131 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
132
133 ct_write_lock(hash);
134
135 if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
136 list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
137 cp->flags |= IP_VS_CONN_F_HASHED;
138 atomic_inc(&cp->refcnt);
139 ret = 1;
140 } else {
141 IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
142 "called from %p\n", __builtin_return_address(0));
143 ret = 0;
144 }
145
146 ct_write_unlock(hash);
147
148 return ret;
149}
150
151
152/*
153 * UNhashes ip_vs_conn from ip_vs_conn_tab.
154 * returns bool success.
155 */
156static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
157{
158 unsigned hash;
159 int ret;
160
161 /* unhash it and decrease its reference counter */
162 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
163
164 ct_write_lock(hash);
165
166 if (cp->flags & IP_VS_CONN_F_HASHED) {
167 list_del(&cp->c_list);
168 cp->flags &= ~IP_VS_CONN_F_HASHED;
169 atomic_dec(&cp->refcnt);
170 ret = 1;
171 } else
172 ret = 0;
173
174 ct_write_unlock(hash);
175
176 return ret;
177}
178
179
180/*
181 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
182 * Called for pkts coming from OUTside-to-INside.
183 * s_addr, s_port: pkt source address (foreign host)
184 * d_addr, d_port: pkt dest address (load balancer)
185 */
186static inline struct ip_vs_conn *__ip_vs_conn_in_get
187(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
188{
189 unsigned hash;
190 struct ip_vs_conn *cp;
191
192 hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
193
194 ct_read_lock(hash);
195
196 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
197 if (s_addr==cp->caddr && s_port==cp->cport &&
198 d_port==cp->vport && d_addr==cp->vaddr &&
199 protocol==cp->protocol) {
200 /* HIT */
201 atomic_inc(&cp->refcnt);
202 ct_read_unlock(hash);
203 return cp;
204 }
205 }
206
207 ct_read_unlock(hash);
208
209 return NULL;
210}
211
212struct ip_vs_conn *ip_vs_conn_in_get
213(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
214{
215 struct ip_vs_conn *cp;
216
217 cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
218 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
219 cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
220
221 IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
222 ip_vs_proto_name(protocol),
223 NIPQUAD(s_addr), ntohs(s_port),
224 NIPQUAD(d_addr), ntohs(d_port),
225 cp?"hit":"not hit");
226
227 return cp;
228}
229
230
231/*
232 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
233 * Called for pkts coming from inside-to-OUTside.
234 * s_addr, s_port: pkt source address (inside host)
235 * d_addr, d_port: pkt dest address (foreign host)
236 */
237struct ip_vs_conn *ip_vs_conn_out_get
238(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
239{
240 unsigned hash;
241 struct ip_vs_conn *cp, *ret=NULL;
242
243 /*
244 * Check for "full" addressed entries
245 */
246 hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
247
248 ct_read_lock(hash);
249
250 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
251 if (d_addr == cp->caddr && d_port == cp->cport &&
252 s_port == cp->dport && s_addr == cp->daddr &&
253 protocol == cp->protocol) {
254 /* HIT */
255 atomic_inc(&cp->refcnt);
256 ret = cp;
257 break;
258 }
259 }
260
261 ct_read_unlock(hash);
262
263 IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
264 ip_vs_proto_name(protocol),
265 NIPQUAD(s_addr), ntohs(s_port),
266 NIPQUAD(d_addr), ntohs(d_port),
267 ret?"hit":"not hit");
268
269 return ret;
270}
271
272
273/*
274 * Put back the conn and restart its timer with its timeout
275 */
276void ip_vs_conn_put(struct ip_vs_conn *cp)
277{
278 /* re-arm the timer so it expires after the connection's timeout */
279 mod_timer(&cp->timer, jiffies+cp->timeout);
280
281 __ip_vs_conn_put(cp);
282}
283
284
285/*
286 * Fill a no_client_port connection with a client port number
287 */
288void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __u16 cport)
289{
290 if (ip_vs_conn_unhash(cp)) {
291 spin_lock(&cp->lock);
292 if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
293 atomic_dec(&ip_vs_conn_no_cport_cnt);
294 cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
295 cp->cport = cport;
296 }
297 spin_unlock(&cp->lock);
298
299 /* hash it back on the new cport */
300 ip_vs_conn_hash(cp);
301 }
302}
303
304
305/*
306 * Bind a connection entry with the corresponding packet_xmit.
307 * Called by ip_vs_conn_new.
308 */
309static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
310{
311 switch (IP_VS_FWD_METHOD(cp)) {
312 case IP_VS_CONN_F_MASQ:
313 cp->packet_xmit = ip_vs_nat_xmit;
314 break;
315
316 case IP_VS_CONN_F_TUNNEL:
317 cp->packet_xmit = ip_vs_tunnel_xmit;
318 break;
319
320 case IP_VS_CONN_F_DROUTE:
321 cp->packet_xmit = ip_vs_dr_xmit;
322 break;
323
324 case IP_VS_CONN_F_LOCALNODE:
325 cp->packet_xmit = ip_vs_null_xmit;
326 break;
327
328 case IP_VS_CONN_F_BYPASS:
329 cp->packet_xmit = ip_vs_bypass_xmit;
330 break;
331 }
332}
333
334
335static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
336{
337 return atomic_read(&dest->activeconns)
338 + atomic_read(&dest->inactconns);
339}
340
341/*
342 * Bind a connection entry with a virtual service destination
343 * Called just after a new connection entry is created.
344 */
345static inline void
346ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
347{
348 /* if dest is NULL, then return directly */
349 if (!dest)
350 return;
351
352 /* Increase the refcnt counter of the dest */
353 atomic_inc(&dest->refcnt);
354
355 /* Bind with the destination and its corresponding transmitter */
356 cp->flags |= atomic_read(&dest->conn_flags);
357 cp->dest = dest;
358
359 IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
360 "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
361 ip_vs_proto_name(cp->protocol),
362 NIPQUAD(cp->caddr), ntohs(cp->cport),
363 NIPQUAD(cp->vaddr), ntohs(cp->vport),
364 NIPQUAD(cp->daddr), ntohs(cp->dport),
365 ip_vs_fwd_tag(cp), cp->state,
366 cp->flags, atomic_read(&cp->refcnt),
367 atomic_read(&dest->refcnt));
368
369 /* Update the connection counters */
370 if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
371 /* It is a normal connection, so increase the inactive
372 connection counter because it is in TCP SYNRECV
373 state (inactive) or another protocol's inactive state */
374 atomic_inc(&dest->inactconns);
375 } else {
376 /* It is a persistent connection/template, so increase
377 the persistent connection counter */
378 atomic_inc(&dest->persistconns);
379 }
380
381 if (dest->u_threshold != 0 &&
382 ip_vs_dest_totalconns(dest) >= dest->u_threshold)
383 dest->flags |= IP_VS_DEST_F_OVERLOAD;
384}
385
386
387/*
388 * Unbind a connection entry with its VS destination
389 * Called by the ip_vs_conn_expire function.
390 */
391static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
392{
393 struct ip_vs_dest *dest = cp->dest;
394
395 if (!dest)
396 return;
397
398 IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
399 "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
400 ip_vs_proto_name(cp->protocol),
401 NIPQUAD(cp->caddr), ntohs(cp->cport),
402 NIPQUAD(cp->vaddr), ntohs(cp->vport),
403 NIPQUAD(cp->daddr), ntohs(cp->dport),
404 ip_vs_fwd_tag(cp), cp->state,
405 cp->flags, atomic_read(&cp->refcnt),
406 atomic_read(&dest->refcnt));
407
408 /* Update the connection counters */
409 if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
410 /* It is a normal connection, so decrease the inactconns
411 or activeconns counter */
412 if (cp->flags & IP_VS_CONN_F_INACTIVE) {
413 atomic_dec(&dest->inactconns);
414 } else {
415 atomic_dec(&dest->activeconns);
416 }
417 } else {
418 /* It is a persistent connection/template, so decrease
419 the persistent connection counter */
420 atomic_dec(&dest->persistconns);
421 }
422
423 if (dest->l_threshold != 0) {
424 if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
425 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
426 } else if (dest->u_threshold != 0) {
427 if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
428 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
429 } else {
430 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
431 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
432 }
433
434 /*
435 * Simply decrease the refcnt of the dest, because the
436 * dest will be either in service's destination list
437 * or in the trash.
438 */
439 atomic_dec(&dest->refcnt);
440}
441
442
443/*
444 * Checking if the destination of a connection template is available.
445 * If available, return 1, otherwise invalidate this connection
446 * template and return 0.
447 */
448int ip_vs_check_template(struct ip_vs_conn *ct)
449{
450 struct ip_vs_dest *dest = ct->dest;
451
452 /*
453 * Checking the dest server status.
454 */
455 if ((dest == NULL) ||
456 !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
457 (sysctl_ip_vs_expire_quiescent_template &&
458 (atomic_read(&dest->weight) == 0))) {
459 IP_VS_DBG(9, "check_template: dest not available for "
460 "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
461 "-> d:%u.%u.%u.%u:%d\n",
462 ip_vs_proto_name(ct->protocol),
463 NIPQUAD(ct->caddr), ntohs(ct->cport),
464 NIPQUAD(ct->vaddr), ntohs(ct->vport),
465 NIPQUAD(ct->daddr), ntohs(ct->dport));
466
467 /*
468 * Invalidate the connection template
469 */
470 if (ct->cport) {
471 if (ip_vs_conn_unhash(ct)) {
472 ct->dport = 65535;
473 ct->vport = 65535;
474 ct->cport = 0;
475 ip_vs_conn_hash(ct);
476 }
477 }
478
479 /*
480 * Simply decrease the refcnt of the template,
481 * don't restart its timer.
482 */
483 atomic_dec(&ct->refcnt);
484 return 0;
485 }
486 return 1;
487}
488
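/*
 * Timer callback for connection expiry: the entry is freed only when it
 * controls no other connections, can be unhashed, and its reference
 * count has dropped to one; otherwise it is kept (re-hashed if it was
 * unhashed) and ip_vs_conn_put() re-arms the timer for another timeout.
 */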
489static void ip_vs_conn_expire(unsigned long data)
490{
491 struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
492
493 cp->timeout = 60*HZ;
494
495 /*
496 * hey, I'm using it
497 */
498 atomic_inc(&cp->refcnt);
499
500 /*
501 * do I control anybody?
502 */
503 if (atomic_read(&cp->n_control))
504 goto expire_later;
505
506 /*
507 * unhash it if it is hashed in the conn table
508 */
509 if (!ip_vs_conn_unhash(cp))
510 goto expire_later;
511
512 /*
513 * refcnt==1 implies I'm the only referrer
514 */
515 if (likely(atomic_read(&cp->refcnt) == 1)) {
516 /* delete the timer if it is activated by other users */
517 if (timer_pending(&cp->timer))
518 del_timer(&cp->timer);
519
520 /* does anybody control me? */
521 if (cp->control)
522 ip_vs_control_del(cp);
523
524 if (unlikely(cp->app != NULL))
525 ip_vs_unbind_app(cp);
526 ip_vs_unbind_dest(cp);
527 if (cp->flags & IP_VS_CONN_F_NO_CPORT)
528 atomic_dec(&ip_vs_conn_no_cport_cnt);
529 atomic_dec(&ip_vs_conn_count);
530
531 kmem_cache_free(ip_vs_conn_cachep, cp);
532 return;
533 }
534
535 /* hash it back to the table */
536 ip_vs_conn_hash(cp);
537
538 expire_later:
539 IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
540 atomic_read(&cp->refcnt)-1,
541 atomic_read(&cp->n_control));
542
543 ip_vs_conn_put(cp);
544}
545
546
547void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
548{
549 if (del_timer(&cp->timer))
550 mod_timer(&cp->timer, jiffies);
551 __ip_vs_conn_put(cp);
552}
553
554
555/*
556 * Create a new connection entry and hash it into the ip_vs_conn_tab
557 */
558struct ip_vs_conn *
559ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
560 __u32 daddr, __u16 dport, unsigned flags,
561 struct ip_vs_dest *dest)
562{
563 struct ip_vs_conn *cp;
564 struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
565
566 cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
567 if (cp == NULL) {
568 IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
569 return NULL;
570 }
571
572 memset(cp, 0, sizeof(*cp));
573 INIT_LIST_HEAD(&cp->c_list);
574 init_timer(&cp->timer);
575 cp->timer.data = (unsigned long)cp;
576 cp->timer.function = ip_vs_conn_expire;
577 cp->protocol = proto;
578 cp->caddr = caddr;
579 cp->cport = cport;
580 cp->vaddr = vaddr;
581 cp->vport = vport;
582 cp->daddr = daddr;
583 cp->dport = dport;
584 cp->flags = flags;
585 spin_lock_init(&cp->lock);
586
587 /*
588 * Mark the entry as referenced by the current thread before hashing
589 * it into the table, so that another thread running
590 * ip_vs_random_dropentry cannot drop this entry.
591 */
592 atomic_set(&cp->refcnt, 1);
593
594 atomic_set(&cp->n_control, 0);
595 atomic_set(&cp->in_pkts, 0);
596
597 atomic_inc(&ip_vs_conn_count);
598 if (flags & IP_VS_CONN_F_NO_CPORT)
599 atomic_inc(&ip_vs_conn_no_cport_cnt);
600
601 /* Bind the connection with a destination server */
602 ip_vs_bind_dest(cp, dest);
603
604 /* Set its state and timeout */
605 cp->state = 0;
606 cp->timeout = 3*HZ;
607
608 /* Bind its packet transmitter */
609 ip_vs_bind_xmit(cp);
610
611 if (unlikely(pp && atomic_read(&pp->appcnt)))
612 ip_vs_bind_app(cp, pp);
613
614 /* Hash it in the ip_vs_conn_tab finally */
615 ip_vs_conn_hash(cp);
616
617 return cp;
618}
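/*
 * Usage sketch for ip_vs_conn_new() (all addresses and ports below are
 * purely illustrative): a NAT forwarding entry for client
 * 192.168.1.7:34567 hitting VIP 10.0.0.1:80 and scheduled to real server
 * 192.168.10.2:8080 would be created roughly as
 *
 *	cp = ip_vs_conn_new(IPPROTO_TCP,
 *			    htonl(0xC0A80107), htons(34567),
 *			    htonl(0x0A000001), htons(80),
 *			    htonl(0xC0A80A02), htons(8080),
 *			    0, dest);
 *
 * The entry comes back with refcnt set to 1 and is already hashed into
 * ip_vs_conn_tab; the caller is expected to drop its reference with
 * ip_vs_conn_put() when it is done with the entry.
 */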
619
620
621/*
622 * /proc/net/ip_vs_conn entries
623 */
624#ifdef CONFIG_PROC_FS
625
626static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
627{
628 int idx;
629 struct ip_vs_conn *cp;
630
631 for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
632 ct_read_lock_bh(idx);
633 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
634 if (pos-- == 0) {
635 seq->private = &ip_vs_conn_tab[idx];
636 return cp;
637 }
638 }
639 ct_read_unlock_bh(idx);
640 }
641
642 return NULL;
643}
644
645static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
646{
647 seq->private = NULL;
648	return *pos ? ip_vs_conn_array(seq, *pos - 1) : SEQ_START_TOKEN;
649}
650
651static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
652{
653 struct ip_vs_conn *cp = v;
654 struct list_head *e, *l = seq->private;
655 int idx;
656
657 ++*pos;
658 if (v == SEQ_START_TOKEN)
659 return ip_vs_conn_array(seq, 0);
660
661 /* more on same hash chain? */
662 if ((e = cp->c_list.next) != l)
663 return list_entry(e, struct ip_vs_conn, c_list);
664
665 idx = l - ip_vs_conn_tab;
666 ct_read_unlock_bh(idx);
667
668 while (++idx < IP_VS_CONN_TAB_SIZE) {
669 ct_read_lock_bh(idx);
670 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
671 seq->private = &ip_vs_conn_tab[idx];
672 return cp;
673 }
674 ct_read_unlock_bh(idx);
675 }
676 seq->private = NULL;
677 return NULL;
678}
679
680static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
681{
682 struct list_head *l = seq->private;
683
684 if (l)
685 ct_read_unlock_bh(l - ip_vs_conn_tab);
686}
687
688static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
689{
690
691 if (v == SEQ_START_TOKEN)
692 seq_puts(seq,
693 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n");
694 else {
695 const struct ip_vs_conn *cp = v;
696
697 seq_printf(seq,
698 "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n",
699 ip_vs_proto_name(cp->protocol),
700 ntohl(cp->caddr), ntohs(cp->cport),
701 ntohl(cp->vaddr), ntohs(cp->vport),
702 ntohl(cp->daddr), ntohs(cp->dport),
703 ip_vs_state_name(cp->protocol, cp->state),
704 (cp->timer.expires-jiffies)/HZ);
705 }
706 return 0;
707}
708
709static struct seq_operations ip_vs_conn_seq_ops = {
710 .start = ip_vs_conn_seq_start,
711 .next = ip_vs_conn_seq_next,
712 .stop = ip_vs_conn_seq_stop,
713 .show = ip_vs_conn_seq_show,
714};
715
716static int ip_vs_conn_open(struct inode *inode, struct file *file)
717{
718 return seq_open(file, &ip_vs_conn_seq_ops);
719}
720
721static struct file_operations ip_vs_conn_fops = {
722 .owner = THIS_MODULE,
723 .open = ip_vs_conn_open,
724 .read = seq_read,
725 .llseek = seq_lseek,
726 .release = seq_release,
727};
728#endif
729
730
731/*
732 * Randomly drop connection entries before running out of memory
733 */
734static inline int todrop_entry(struct ip_vs_conn *cp)
735{
736 /*
737 * The drop rate array needs tuning for real environments.
738 * Called from timer bh only => no locking
739 */
740 static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
741 static char todrop_counter[9] = {0};
742 int i;
743
744 /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
745	   This leaves enough time for normal connections to get
746 through. */
747 if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
748 return 0;
749
750	/* Don't drop the entry if its number of incoming packets is
751	   outside the range [0, 8] */
752 i = atomic_read(&cp->in_pkts);
753 if (i > 8 || i < 0) return 0;
754
755 if (!todrop_rate[i]) return 0;
756 if (--todrop_counter[i] > 0) return 0;
757
758 todrop_counter[i] = todrop_rate[i];
759 return 1;
760}
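/*
 * Worked example of the drop-rate table above: an entry whose in_pkts
 * counter reads 3 uses todrop_rate[3] = 3, so roughly one of every three
 * such entries examined is dropped (todrop_counter[3] cycles 3, 2, 1 and the
 * drop fires when it reaches zero).  Entries with in_pkts == 0 are never
 * dropped here, and the busier an entry is (up to 8 packets), the smaller
 * its individual chance of being picked.
 */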
761
762
763void ip_vs_random_dropentry(void)
764{
765 int idx;
766 struct ip_vs_conn *cp;
767 struct ip_vs_conn *ct;
768
769 /*
770 * Randomly scan 1/32 of the whole table every second
771 */
772 for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
773 unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
774
775 /*
776 * Lock is actually needed in this loop.
777 */
778 ct_write_lock(hash);
779
780 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
781 if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
782 /* connection template */
783 continue;
784
785 if (cp->protocol == IPPROTO_TCP) {
786 switch(cp->state) {
787 case IP_VS_TCP_S_SYN_RECV:
788 case IP_VS_TCP_S_SYNACK:
789 break;
790
791 case IP_VS_TCP_S_ESTABLISHED:
792 if (todrop_entry(cp))
793 break;
794 continue;
795
796 default:
797 continue;
798 }
799 } else {
800 if (!todrop_entry(cp))
801 continue;
802 }
803
804 /*
805 * Drop the entry, and drop its ct if not referenced
806 */
807 atomic_inc(&cp->refcnt);
808 ct_write_unlock(hash);
809
810 if ((ct = cp->control))
811 atomic_inc(&ct->refcnt);
812 IP_VS_DBG(4, "del connection\n");
813 ip_vs_conn_expire_now(cp);
814 if (ct) {
815 IP_VS_DBG(4, "del conn template\n");
816 ip_vs_conn_expire_now(ct);
817 }
818 ct_write_lock(hash);
819 }
820 ct_write_unlock(hash);
821 }
822}
823
824
825/*
826 * Flush all the connection entries in the ip_vs_conn_tab
827 */
828static void ip_vs_conn_flush(void)
829{
830 int idx;
831 struct ip_vs_conn *cp;
832 struct ip_vs_conn *ct;
833
834 flush_again:
835 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
836 /*
837 * Lock is actually needed in this loop.
838 */
839 ct_write_lock_bh(idx);
840
841 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
842 atomic_inc(&cp->refcnt);
843 ct_write_unlock(idx);
844
845 if ((ct = cp->control))
846 atomic_inc(&ct->refcnt);
847 IP_VS_DBG(4, "del connection\n");
848 ip_vs_conn_expire_now(cp);
849 if (ct) {
850 IP_VS_DBG(4, "del conn template\n");
851 ip_vs_conn_expire_now(ct);
852 }
853 ct_write_lock(idx);
854 }
855 ct_write_unlock_bh(idx);
856 }
857
858	/* the counter may not be zero, because some conn entries may still
859	   be held by a slow timer handler or be unhashed but still referenced */
860 if (atomic_read(&ip_vs_conn_count) != 0) {
861 schedule();
862 goto flush_again;
863 }
864}
865
866
867int ip_vs_conn_init(void)
868{
869 int idx;
870
871 /*
872 * Allocate the connection hash table and initialize its list heads
873 */
874 ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
875 if (!ip_vs_conn_tab)
876 return -ENOMEM;
877
878 /* Allocate ip_vs_conn slab cache */
879 ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
880 sizeof(struct ip_vs_conn), 0,
881 SLAB_HWCACHE_ALIGN, NULL, NULL);
882 if (!ip_vs_conn_cachep) {
883 vfree(ip_vs_conn_tab);
884 return -ENOMEM;
885 }
886
887 IP_VS_INFO("Connection hash table configured "
888 "(size=%d, memory=%ldKbytes)\n",
889 IP_VS_CONN_TAB_SIZE,
890 (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
891 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
892 sizeof(struct ip_vs_conn));
893
894 for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
895 INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
896 }
897
898 for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
899 rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
900 }
901
902 proc_net_fops_create("ip_vs_conn", 0, &ip_vs_conn_fops);
903
904 /* calculate the random value for connection hash */
905 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
906
907 return 0;
908}
909
910
911void ip_vs_conn_cleanup(void)
912{
913 /* flush all the connection entries first */
914 ip_vs_conn_flush();
915
916 /* Release the empty cache */
917 kmem_cache_destroy(ip_vs_conn_cachep);
918 proc_net_remove("ip_vs_conn");
919 vfree(ip_vs_conn_tab);
920}
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
new file mode 100644
index 000000000000..5fb257dd07cb
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -0,0 +1,1191 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 *
19 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
20 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
21 * and others.
22 *
23 * Changes:
24 * Paul `Rusty' Russell properly handle non-linear skbs
25 *
26 */
27
28#include <linux/module.h>
29#include <linux/kernel.h>
30#include <linux/ip.h>
31#include <linux/tcp.h>
32#include <linux/icmp.h>
33
34#include <net/ip.h>
35#include <net/tcp.h>
36#include <net/udp.h>
37#include <net/icmp.h> /* for icmp_send */
38#include <net/route.h>
39
40#include <linux/netfilter.h>
41#include <linux/netfilter_ipv4.h>
42
43#include <net/ip_vs.h>
44
45
46EXPORT_SYMBOL(register_ip_vs_scheduler);
47EXPORT_SYMBOL(unregister_ip_vs_scheduler);
48EXPORT_SYMBOL(ip_vs_skb_replace);
49EXPORT_SYMBOL(ip_vs_proto_name);
50EXPORT_SYMBOL(ip_vs_conn_new);
51EXPORT_SYMBOL(ip_vs_conn_in_get);
52EXPORT_SYMBOL(ip_vs_conn_out_get);
53#ifdef CONFIG_IP_VS_PROTO_TCP
54EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
55#endif
56EXPORT_SYMBOL(ip_vs_conn_put);
57#ifdef CONFIG_IP_VS_DEBUG
58EXPORT_SYMBOL(ip_vs_get_debug_level);
59#endif
60EXPORT_SYMBOL(ip_vs_make_skb_writable);
61
62
63/* ID used in ICMP lookups */
64#define icmp_id(icmph) (((icmph)->un).echo.id)
65
66const char *ip_vs_proto_name(unsigned proto)
67{
68 static char buf[20];
69
70 switch (proto) {
71 case IPPROTO_IP:
72 return "IP";
73 case IPPROTO_UDP:
74 return "UDP";
75 case IPPROTO_TCP:
76 return "TCP";
77 case IPPROTO_ICMP:
78 return "ICMP";
79 default:
80 sprintf(buf, "IP_%d", proto);
81 return buf;
82 }
83}
84
85void ip_vs_init_hash_table(struct list_head *table, int rows)
86{
87 while (--rows >= 0)
88 INIT_LIST_HEAD(&table[rows]);
89}
90
91static inline void
92ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
93{
94 struct ip_vs_dest *dest = cp->dest;
95 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
96 spin_lock(&dest->stats.lock);
97 dest->stats.inpkts++;
98 dest->stats.inbytes += skb->len;
99 spin_unlock(&dest->stats.lock);
100
101 spin_lock(&dest->svc->stats.lock);
102 dest->svc->stats.inpkts++;
103 dest->svc->stats.inbytes += skb->len;
104 spin_unlock(&dest->svc->stats.lock);
105
106 spin_lock(&ip_vs_stats.lock);
107 ip_vs_stats.inpkts++;
108 ip_vs_stats.inbytes += skb->len;
109 spin_unlock(&ip_vs_stats.lock);
110 }
111}
112
113
114static inline void
115ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
116{
117 struct ip_vs_dest *dest = cp->dest;
118 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
119 spin_lock(&dest->stats.lock);
120 dest->stats.outpkts++;
121 dest->stats.outbytes += skb->len;
122 spin_unlock(&dest->stats.lock);
123
124 spin_lock(&dest->svc->stats.lock);
125 dest->svc->stats.outpkts++;
126 dest->svc->stats.outbytes += skb->len;
127 spin_unlock(&dest->svc->stats.lock);
128
129 spin_lock(&ip_vs_stats.lock);
130 ip_vs_stats.outpkts++;
131 ip_vs_stats.outbytes += skb->len;
132 spin_unlock(&ip_vs_stats.lock);
133 }
134}
135
136
137static inline void
138ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
139{
140 spin_lock(&cp->dest->stats.lock);
141 cp->dest->stats.conns++;
142 spin_unlock(&cp->dest->stats.lock);
143
144 spin_lock(&svc->stats.lock);
145 svc->stats.conns++;
146 spin_unlock(&svc->stats.lock);
147
148 spin_lock(&ip_vs_stats.lock);
149 ip_vs_stats.conns++;
150 spin_unlock(&ip_vs_stats.lock);
151}
152
153
154static inline int
155ip_vs_set_state(struct ip_vs_conn *cp, int direction,
156 const struct sk_buff *skb,
157 struct ip_vs_protocol *pp)
158{
159 if (unlikely(!pp->state_transition))
160 return 0;
161 return pp->state_transition(cp, direction, skb, pp);
162}
163
164
165int ip_vs_make_skb_writable(struct sk_buff **pskb, int writable_len)
166{
167 struct sk_buff *skb = *pskb;
168
169 /* skb is already used, better copy skb and its payload */
170 if (unlikely(skb_shared(skb) || skb->sk))
171 goto copy_skb;
172
173 /* skb data is already used, copy it */
174 if (unlikely(skb_cloned(skb)))
175 goto copy_data;
176
177 return pskb_may_pull(skb, writable_len);
178
179 copy_data:
180 if (unlikely(writable_len > skb->len))
181 return 0;
182 return !pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
183
184 copy_skb:
185 if (unlikely(writable_len > skb->len))
186 return 0;
187 skb = skb_copy(skb, GFP_ATOMIC);
188 if (!skb)
189 return 0;
190 BUG_ON(skb_is_nonlinear(skb));
191
192 /* Rest of kernel will get very unhappy if we pass it a
193 suddenly-orphaned skbuff */
194 if ((*pskb)->sk)
195 skb_set_owner_w(skb, (*pskb)->sk);
196 kfree_skb(*pskb);
197 *pskb = skb;
198 return 1;
199}
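/*
 * Typical calling pattern (as used by ip_vs_out() below): make the headers
 * writable before mangling them, and reload the skb pointer afterwards,
 * because the helper may have replaced the buffer:
 *
 *	if (!ip_vs_make_skb_writable(pskb, ihl))
 *		goto drop;
 *	skb = *pskb;
 *
 * A return value of 0 means the packet could not be made writable (e.g. the
 * requested length exceeds the packet, or allocation failed) and should be
 * dropped.
 */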
200
201/*
202 * IPVS persistent scheduling function
203 * It creates a connection entry according to its template if exists,
204 * or selects a server and creates a connection entry plus a template.
205 * Locking: we are svc user (svc->refcnt), so we hold all dests too
206 * Protocols supported: TCP, UDP
207 */
208static struct ip_vs_conn *
209ip_vs_sched_persist(struct ip_vs_service *svc,
210 const struct sk_buff *skb,
211 __u16 ports[2])
212{
213 struct ip_vs_conn *cp = NULL;
214 struct iphdr *iph = skb->nh.iph;
215 struct ip_vs_dest *dest;
216 struct ip_vs_conn *ct;
217 __u16 dport; /* destination port to forward */
218 __u32 snet; /* source network of the client, after masking */
219
220 /* Mask saddr with the netmask to adjust template granularity */
221 snet = iph->saddr & svc->netmask;
222
223 IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
224 "mnet %u.%u.%u.%u\n",
225 NIPQUAD(iph->saddr), ntohs(ports[0]),
226 NIPQUAD(iph->daddr), ntohs(ports[1]),
227 NIPQUAD(snet));
228
229 /*
230	 * FTP is a complicated protocol: it uses one control connection and
231	 * separate data connections. For active FTP, the FTP server initiates
232	 * the data connection to the client, usually from source port 20. For
233	 * passive FTP, the FTP server tells the client which port it passively
234	 * listens on, and the client opens the data connection. In tunneling
235	 * or direct routing mode, the load balancer only sees the
236	 * client-to-server half of the connection, so the data port is unknown
237	 * to it. Therefore a conn template like
238	 * <caddr, 0, vaddr, 0, daddr, 0> is created for a persistent FTP
239	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
240	 * is created for other persistent services.
241 */
242 if (ports[1] == svc->port) {
243 /* Check if a template already exists */
244 if (svc->port != FTPPORT)
245 ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
246 iph->daddr, ports[1]);
247 else
248 ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
249 iph->daddr, 0);
250
251 if (!ct || !ip_vs_check_template(ct)) {
252 /*
253 * No template found or the dest of the connection
254 * template is not available.
255 */
256 dest = svc->scheduler->schedule(svc, skb);
257 if (dest == NULL) {
258 IP_VS_DBG(1, "p-schedule: no dest found.\n");
259 return NULL;
260 }
261
262 /*
263 * Create a template like <protocol,caddr,0,
264 * vaddr,vport,daddr,dport> for non-ftp service,
265 * and <protocol,caddr,0,vaddr,0,daddr,0>
266 * for ftp service.
267 */
268 if (svc->port != FTPPORT)
269 ct = ip_vs_conn_new(iph->protocol,
270 snet, 0,
271 iph->daddr,
272 ports[1],
273 dest->addr, dest->port,
274 0,
275 dest);
276 else
277 ct = ip_vs_conn_new(iph->protocol,
278 snet, 0,
279 iph->daddr, 0,
280 dest->addr, 0,
281 0,
282 dest);
283 if (ct == NULL)
284 return NULL;
285
286 ct->timeout = svc->timeout;
287 } else {
288 /* set destination with the found template */
289 dest = ct->dest;
290 }
291 dport = dest->port;
292 } else {
293 /*
294 * Note: persistent fwmark-based services and persistent
295 * port zero service are handled here.
296 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
297 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
298 */
299 if (svc->fwmark)
300 ct = ip_vs_conn_in_get(IPPROTO_IP, snet, 0,
301 htonl(svc->fwmark), 0);
302 else
303 ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
304 iph->daddr, 0);
305
306 if (!ct || !ip_vs_check_template(ct)) {
307 /*
308 * If it is not persistent port zero, return NULL,
309 * otherwise create a connection template.
310 */
311 if (svc->port)
312 return NULL;
313
314 dest = svc->scheduler->schedule(svc, skb);
315 if (dest == NULL) {
316 IP_VS_DBG(1, "p-schedule: no dest found.\n");
317 return NULL;
318 }
319
320 /*
321 * Create a template according to the service
322 */
323 if (svc->fwmark)
324 ct = ip_vs_conn_new(IPPROTO_IP,
325 snet, 0,
326 htonl(svc->fwmark), 0,
327 dest->addr, 0,
328 0,
329 dest);
330 else
331 ct = ip_vs_conn_new(iph->protocol,
332 snet, 0,
333 iph->daddr, 0,
334 dest->addr, 0,
335 0,
336 dest);
337 if (ct == NULL)
338 return NULL;
339
340 ct->timeout = svc->timeout;
341 } else {
342 /* set destination with the found template */
343 dest = ct->dest;
344 }
345 dport = ports[1];
346 }
347
348 /*
349 * Create a new connection according to the template
350 */
351 cp = ip_vs_conn_new(iph->protocol,
352 iph->saddr, ports[0],
353 iph->daddr, ports[1],
354 dest->addr, dport,
355 0,
356 dest);
357 if (cp == NULL) {
358 ip_vs_conn_put(ct);
359 return NULL;
360 }
361
362 /*
363 * Add its control
364 */
365 ip_vs_control_add(cp, ct);
366 ip_vs_conn_put(ct);
367
368 ip_vs_conn_stats(cp, svc);
369 return cp;
370}
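/*
 * Example of the template/connection pair built above (addresses are
 * illustrative): for a persistent TCP service 10.0.0.1:80 with netmask
 * 255.255.255.0, a first packet from client 192.168.1.7:34567 that is
 * scheduled to real server 192.168.10.2:80 creates
 *
 *	template:   <TCP, 192.168.1.0, 0,     10.0.0.1, 80, 192.168.10.2, 80>
 *	connection: <TCP, 192.168.1.7, 34567, 10.0.0.1, 80, 192.168.10.2, 80>
 *
 * with the connection controlled by the template.  Any later connection
 * from the same /24 to that virtual service finds the template and is
 * therefore sent to the same real server for as long as the template
 * (svc->timeout) lives.
 */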
371
372
373/*
374 * IPVS main scheduling function
375 * It selects a server according to the virtual service, and
376 * creates a connection entry.
377 * Protocols supported: TCP, UDP
378 */
379struct ip_vs_conn *
380ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
381{
382 struct ip_vs_conn *cp = NULL;
383 struct iphdr *iph = skb->nh.iph;
384 struct ip_vs_dest *dest;
385 __u16 _ports[2], *pptr;
386
387 pptr = skb_header_pointer(skb, iph->ihl*4,
388 sizeof(_ports), _ports);
389 if (pptr == NULL)
390 return NULL;
391
392 /*
393 * Persistent service
394 */
395 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
396 return ip_vs_sched_persist(svc, skb, pptr);
397
398 /*
399 * Non-persistent service
400 */
401 if (!svc->fwmark && pptr[1] != svc->port) {
402 if (!svc->port)
403 IP_VS_ERR("Schedule: port zero only supported "
404 "in persistent services, "
405 "check your ipvs configuration\n");
406 return NULL;
407 }
408
409 dest = svc->scheduler->schedule(svc, skb);
410 if (dest == NULL) {
411 IP_VS_DBG(1, "Schedule: no dest found.\n");
412 return NULL;
413 }
414
415 /*
416 * Create a connection entry.
417 */
418 cp = ip_vs_conn_new(iph->protocol,
419 iph->saddr, pptr[0],
420 iph->daddr, pptr[1],
421 dest->addr, dest->port?dest->port:pptr[1],
422 0,
423 dest);
424 if (cp == NULL)
425 return NULL;
426
427 IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
428 "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
429 ip_vs_fwd_tag(cp),
430 NIPQUAD(cp->caddr), ntohs(cp->cport),
431 NIPQUAD(cp->vaddr), ntohs(cp->vport),
432 NIPQUAD(cp->daddr), ntohs(cp->dport),
433 cp->flags, atomic_read(&cp->refcnt));
434
435 ip_vs_conn_stats(cp, svc);
436 return cp;
437}
438
439
440/*
441 * Pass or drop the packet.
442 * Called by ip_vs_in, when the virtual service is available but
443 * no destination is available for a new connection.
444 */
445int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
446 struct ip_vs_protocol *pp)
447{
448 __u16 _ports[2], *pptr;
449 struct iphdr *iph = skb->nh.iph;
450
451 pptr = skb_header_pointer(skb, iph->ihl*4,
452 sizeof(_ports), _ports);
453 if (pptr == NULL) {
454 ip_vs_service_put(svc);
455 return NF_DROP;
456 }
457
458	/* if it is a fwmark-based service, the cache_bypass sysctl is on
459 and the destination is RTN_UNICAST (and not local), then create
460 a cache_bypass connection entry */
461 if (sysctl_ip_vs_cache_bypass && svc->fwmark
462 && (inet_addr_type(iph->daddr) == RTN_UNICAST)) {
463 int ret, cs;
464 struct ip_vs_conn *cp;
465
466 ip_vs_service_put(svc);
467
468 /* create a new connection entry */
469 IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
470 cp = ip_vs_conn_new(iph->protocol,
471 iph->saddr, pptr[0],
472 iph->daddr, pptr[1],
473 0, 0,
474 IP_VS_CONN_F_BYPASS,
475 NULL);
476 if (cp == NULL)
477 return NF_DROP;
478
479 /* statistics */
480 ip_vs_in_stats(cp, skb);
481
482 /* set state */
483 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
484
485 /* transmit the first SYN packet */
486 ret = cp->packet_xmit(skb, cp, pp);
487 /* do not touch skb anymore */
488
489 atomic_inc(&cp->in_pkts);
490 ip_vs_conn_put(cp);
491 return ret;
492 }
493
494 /*
495	 * When a virtual ftp service is present, packets destined
496	 * for other services on the VIP (except services listed in the
497	 * ipvs table) may get here; pass them through, because it is
498	 * not IPVS's job to decide to drop such packets.
499 */
500 if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
501 ip_vs_service_put(svc);
502 return NF_ACCEPT;
503 }
504
505 ip_vs_service_put(svc);
506
507 /*
508 * Notify the client that the destination is unreachable, and
509 * release the socket buffer.
510	 * Since we are at the IP layer and no TCP socket actually
511	 * exists, a TCP RST cannot be sent; ICMP_PORT_UNREACH is sent
512	 * here instead, for both TCP and UDP. --WZ
513 */
514 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
515 return NF_DROP;
516}
517
518
519/*
520 * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING
521 * chain, and is used for VS/NAT.
522 * It detects packets for VS/NAT connections and sends the packets
523	 * immediately. This avoids iptable_nat mangling packets that
524	 * belong to VS/NAT connections.
525 */
526static unsigned int ip_vs_post_routing(unsigned int hooknum,
527 struct sk_buff **pskb,
528 const struct net_device *in,
529 const struct net_device *out,
530 int (*okfn)(struct sk_buff *))
531{
532 if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY))
533 return NF_ACCEPT;
534
535 /* The packet was sent from IPVS, exit this chain */
536 (*okfn)(*pskb);
537
538 return NF_STOLEN;
539}
540
541u16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
542{
543 return (u16) csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
544}
545
546static inline struct sk_buff *
547ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
548{
549 skb = ip_defrag(skb, user);
550 if (skb)
551 ip_send_check(skb->nh.iph);
552 return skb;
553}
554
555/*
556 * Packet has been made sufficiently writable in caller
557 * - inout: 1=in->out, 0=out->in
558 */
559void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
560 struct ip_vs_conn *cp, int inout)
561{
562 struct iphdr *iph = skb->nh.iph;
563 unsigned int icmp_offset = iph->ihl*4;
564 struct icmphdr *icmph = (struct icmphdr *)(skb->nh.raw + icmp_offset);
565 struct iphdr *ciph = (struct iphdr *)(icmph + 1);
566
567 if (inout) {
568 iph->saddr = cp->vaddr;
569 ip_send_check(iph);
570 ciph->daddr = cp->vaddr;
571 ip_send_check(ciph);
572 } else {
573 iph->daddr = cp->daddr;
574 ip_send_check(iph);
575 ciph->saddr = cp->daddr;
576 ip_send_check(ciph);
577 }
578
579 /* the TCP/UDP port */
580 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
581 __u16 *ports = (void *)ciph + ciph->ihl*4;
582
583 if (inout)
584 ports[1] = cp->vport;
585 else
586 ports[0] = cp->dport;
587 }
588
589 /* And finally the ICMP checksum */
590 icmph->checksum = 0;
591 icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
592 skb->ip_summed = CHECKSUM_UNNECESSARY;
593
594 if (inout)
595 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
596 "Forwarding altered outgoing ICMP");
597 else
598 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
599 "Forwarding altered incoming ICMP");
600}
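/*
 * Example of the rewrite performed above (addresses illustrative): for an
 * outgoing (inout == 1) ICMP_DEST_UNREACH generated by real server
 * 192.168.10.2 about a client packet 192.168.1.7 -> 192.168.10.2:8080, the
 * outer source 192.168.10.2 becomes the VIP (cp->vaddr) and, inside the
 * quoted header, the destination 192.168.10.2:8080 becomes
 * cp->vaddr:cp->vport, so the client only ever sees the virtual address.
 * The incoming (inout == 0) case is symmetric: the outer destination and
 * the quoted source/port are rewritten to cp->daddr/cp->dport so the ICMP
 * reaches and matches the real server.
 */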
601
602/*
603 * Handle ICMP messages in the inside-to-outside direction (outgoing).
604 * Find any that might be relevant, check against existing connections,
605 * forward to the right destination host if relevant.
606 * Currently handles error types - unreachable, quench, ttl exceeded.
607 * (Only used in VS/NAT)
608 */
609static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
610{
611 struct sk_buff *skb = *pskb;
612 struct iphdr *iph;
613 struct icmphdr _icmph, *ic;
614 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
615 struct ip_vs_conn *cp;
616 struct ip_vs_protocol *pp;
617 unsigned int offset, ihl, verdict;
618
619 *related = 1;
620
621 /* reassemble IP fragments */
622 if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
623 skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
624 if (!skb)
625 return NF_STOLEN;
626 *pskb = skb;
627 }
628
629 iph = skb->nh.iph;
630 offset = ihl = iph->ihl * 4;
631 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
632 if (ic == NULL)
633 return NF_DROP;
634
635 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
636 ic->type, ntohs(icmp_id(ic)),
637 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
638
639 /*
640 * Work through seeing if this is for us.
641 * These checks are supposed to be in an order that means easy
642 * things are checked first to speed up processing.... however
643 * this means that some packets will manage to get a long way
644 * down this stack and then be rejected, but that's life.
645 */
646 if ((ic->type != ICMP_DEST_UNREACH) &&
647 (ic->type != ICMP_SOURCE_QUENCH) &&
648 (ic->type != ICMP_TIME_EXCEEDED)) {
649 *related = 0;
650 return NF_ACCEPT;
651 }
652
653 /* Now find the contained IP header */
654 offset += sizeof(_icmph);
655 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
656 if (cih == NULL)
657 return NF_ACCEPT; /* The packet looks wrong, ignore */
658
659 pp = ip_vs_proto_get(cih->protocol);
660 if (!pp)
661 return NF_ACCEPT;
662
663 /* Is the embedded protocol header present? */
664 if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
665 pp->dont_defrag))
666 return NF_ACCEPT;
667
668 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
669
670 offset += cih->ihl * 4;
671
672 /* The embedded headers contain source and dest in reverse order */
673 cp = pp->conn_out_get(skb, pp, cih, offset, 1);
674 if (!cp)
675 return NF_ACCEPT;
676
677 verdict = NF_DROP;
678
679 if (IP_VS_FWD_METHOD(cp) != 0) {
680		IP_VS_ERR("shouldn't reach here, because the box is on the "
681			  "half connection in the tun/dr module.\n");
682 }
683
684 /* Ensure the checksum is correct */
685 if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
686 ip_vs_checksum_complete(skb, ihl)) {
687 /* Failed checksum! */
688		IP_VS_DBG(1, "Forward ICMP: failed checksum from %u.%u.%u.%u!\n",
689 NIPQUAD(iph->saddr));
690 goto out;
691 }
692
693 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
694 offset += 2 * sizeof(__u16);
695 if (!ip_vs_make_skb_writable(pskb, offset))
696 goto out;
697 skb = *pskb;
698
699 ip_vs_nat_icmp(skb, pp, cp, 1);
700
701 /* do the statistics and put it back */
702 ip_vs_out_stats(cp, skb);
703
704 skb->nfcache |= NFC_IPVS_PROPERTY;
705 verdict = NF_ACCEPT;
706
707 out:
708 __ip_vs_conn_put(cp);
709
710 return verdict;
711}
712
713static inline int is_tcp_reset(const struct sk_buff *skb)
714{
715 struct tcphdr _tcph, *th;
716
717 th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
718 sizeof(_tcph), &_tcph);
719 if (th == NULL)
720 return 0;
721 return th->rst;
722}
723
724/*
725 * It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
726 * Check if outgoing packet belongs to the established ip_vs_conn,
727 * rewrite addresses of the packet and send it on its way...
728 */
729static unsigned int
730ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
731 const struct net_device *in, const struct net_device *out,
732 int (*okfn)(struct sk_buff *))
733{
734 struct sk_buff *skb = *pskb;
735 struct iphdr *iph;
736 struct ip_vs_protocol *pp;
737 struct ip_vs_conn *cp;
738 int ihl;
739
740 EnterFunction(11);
741
742 if (skb->nfcache & NFC_IPVS_PROPERTY)
743 return NF_ACCEPT;
744
745 iph = skb->nh.iph;
746 if (unlikely(iph->protocol == IPPROTO_ICMP)) {
747 int related, verdict = ip_vs_out_icmp(pskb, &related);
748
749 if (related)
750 return verdict;
751 skb = *pskb;
752 iph = skb->nh.iph;
753 }
754
755 pp = ip_vs_proto_get(iph->protocol);
756 if (unlikely(!pp))
757 return NF_ACCEPT;
758
759 /* reassemble IP fragments */
760 if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) &&
761 !pp->dont_defrag)) {
762 skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
763 if (!skb)
764 return NF_STOLEN;
765 iph = skb->nh.iph;
766 *pskb = skb;
767 }
768
769 ihl = iph->ihl << 2;
770
771 /*
772 * Check if the packet belongs to an existing entry
773 */
774 cp = pp->conn_out_get(skb, pp, iph, ihl, 0);
775
776 if (unlikely(!cp)) {
777 if (sysctl_ip_vs_nat_icmp_send &&
778 (pp->protocol == IPPROTO_TCP ||
779 pp->protocol == IPPROTO_UDP)) {
780 __u16 _ports[2], *pptr;
781
782 pptr = skb_header_pointer(skb, ihl,
783 sizeof(_ports), _ports);
784 if (pptr == NULL)
785 return NF_ACCEPT; /* Not for me */
786 if (ip_vs_lookup_real_service(iph->protocol,
787 iph->saddr, pptr[0])) {
788 /*
789				 * Notify the real server that there is
790				 * no existing entry, unless the packet
791				 * is a TCP RST.
792 */
793 if (iph->protocol != IPPROTO_TCP
794 || !is_tcp_reset(skb)) {
795 icmp_send(skb,ICMP_DEST_UNREACH,
796 ICMP_PORT_UNREACH, 0);
797 return NF_DROP;
798 }
799 }
800 }
801 IP_VS_DBG_PKT(12, pp, skb, 0,
802 "packet continues traversal as normal");
803 return NF_ACCEPT;
804 }
805
806 IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
807
808 if (!ip_vs_make_skb_writable(pskb, ihl))
809 goto drop;
810
811 /* mangle the packet */
812 if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp))
813 goto drop;
814 skb = *pskb;
815 skb->nh.iph->saddr = cp->vaddr;
816 ip_send_check(skb->nh.iph);
817
818 IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
819
820 ip_vs_out_stats(cp, skb);
821 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
822 ip_vs_conn_put(cp);
823
824 skb->nfcache |= NFC_IPVS_PROPERTY;
825
826 LeaveFunction(11);
827 return NF_ACCEPT;
828
829 drop:
830 ip_vs_conn_put(cp);
831 kfree_skb(*pskb);
832 return NF_STOLEN;
833}
834
835
836/*
837 * Handle ICMP messages in the outside-to-inside direction (incoming).
838 * Find any that might be relevant, check against existing connections,
839 * forward to the right destination host if relevant.
840 * Currently handles error types - unreachable, quench, ttl exceeded.
841 */
842static int
843ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
844{
845 struct sk_buff *skb = *pskb;
846 struct iphdr *iph;
847 struct icmphdr _icmph, *ic;
848 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
849 struct ip_vs_conn *cp;
850 struct ip_vs_protocol *pp;
851 unsigned int offset, ihl, verdict;
852
853 *related = 1;
854
855 /* reassemble IP fragments */
856 if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
857 skb = ip_vs_gather_frags(skb,
858 hooknum == NF_IP_LOCAL_IN ?
859 IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD);
860 if (!skb)
861 return NF_STOLEN;
862 *pskb = skb;
863 }
864
865 iph = skb->nh.iph;
866 offset = ihl = iph->ihl * 4;
867 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
868 if (ic == NULL)
869 return NF_DROP;
870
871 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
872 ic->type, ntohs(icmp_id(ic)),
873 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
874
875 /*
876 * Work through seeing if this is for us.
877 * These checks are supposed to be in an order that means easy
878 * things are checked first to speed up processing.... however
879 * this means that some packets will manage to get a long way
880 * down this stack and then be rejected, but that's life.
881 */
882 if ((ic->type != ICMP_DEST_UNREACH) &&
883 (ic->type != ICMP_SOURCE_QUENCH) &&
884 (ic->type != ICMP_TIME_EXCEEDED)) {
885 *related = 0;
886 return NF_ACCEPT;
887 }
888
889 /* Now find the contained IP header */
890 offset += sizeof(_icmph);
891 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
892 if (cih == NULL)
893 return NF_ACCEPT; /* The packet looks wrong, ignore */
894
895 pp = ip_vs_proto_get(cih->protocol);
896 if (!pp)
897 return NF_ACCEPT;
898
899 /* Is the embedded protocol header present? */
900 if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
901 pp->dont_defrag))
902 return NF_ACCEPT;
903
904 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
905
906 offset += cih->ihl * 4;
907
908 /* The embedded headers contain source and dest in reverse order */
909 cp = pp->conn_in_get(skb, pp, cih, offset, 1);
910 if (!cp)
911 return NF_ACCEPT;
912
913 verdict = NF_DROP;
914
915 /* Ensure the checksum is correct */
916 if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
917 ip_vs_checksum_complete(skb, ihl)) {
918 /* Failed checksum! */
919		IP_VS_DBG(1, "Incoming ICMP: failed checksum from %u.%u.%u.%u!\n",
920 NIPQUAD(iph->saddr));
921 goto out;
922 }
923
924 /* do the statistics and put it back */
925 ip_vs_in_stats(cp, skb);
926 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
927 offset += 2 * sizeof(__u16);
928 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
929 /* do not touch skb anymore */
930
931 out:
932 __ip_vs_conn_put(cp);
933
934 return verdict;
935}
936
937/*
938 * Check if it's for virtual services, look it up,
939 * and send it on its way...
940 */
941static unsigned int
942ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
943 const struct net_device *in, const struct net_device *out,
944 int (*okfn)(struct sk_buff *))
945{
946 struct sk_buff *skb = *pskb;
947 struct iphdr *iph;
948 struct ip_vs_protocol *pp;
949 struct ip_vs_conn *cp;
950 int ret, restart;
951 int ihl;
952
953 /*
954 * Big tappo: only PACKET_HOST (neither loopback nor mcasts)
955 * ... don't know why 1st test DOES NOT include 2nd (?)
956 */
957 if (unlikely(skb->pkt_type != PACKET_HOST
958 || skb->dev == &loopback_dev || skb->sk)) {
959 IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
960 skb->pkt_type,
961 skb->nh.iph->protocol,
962 NIPQUAD(skb->nh.iph->daddr));
963 return NF_ACCEPT;
964 }
965
966 iph = skb->nh.iph;
967 if (unlikely(iph->protocol == IPPROTO_ICMP)) {
968 int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum);
969
970 if (related)
971 return verdict;
972 skb = *pskb;
973 iph = skb->nh.iph;
974 }
975
976 /* Protocol supported? */
977 pp = ip_vs_proto_get(iph->protocol);
978 if (unlikely(!pp))
979 return NF_ACCEPT;
980
981 ihl = iph->ihl << 2;
982
983 /*
984 * Check if the packet belongs to an existing connection entry
985 */
986 cp = pp->conn_in_get(skb, pp, iph, ihl, 0);
987
988 if (unlikely(!cp)) {
989 int v;
990
991 if (!pp->conn_schedule(skb, pp, &v, &cp))
992 return v;
993 }
994
995 if (unlikely(!cp)) {
996 /* sorry, all this trouble for a no-hit :) */
997 IP_VS_DBG_PKT(12, pp, skb, 0,
998 "packet continues traversal as normal");
999 return NF_ACCEPT;
1000 }
1001
1002 IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
1003
1004 /* Check the server status */
1005 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1006 /* the destination server is not available */
1007
1008 if (sysctl_ip_vs_expire_nodest_conn) {
1009 /* try to expire the connection immediately */
1010 ip_vs_conn_expire_now(cp);
1011 } else {
1012 /* don't restart its timer, and silently
1013 drop the packet. */
1014 __ip_vs_conn_put(cp);
1015 }
1016 return NF_DROP;
1017 }
1018
1019 ip_vs_in_stats(cp, skb);
1020 restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1021 if (cp->packet_xmit)
1022 ret = cp->packet_xmit(skb, cp, pp);
1023 /* do not touch skb anymore */
1024 else {
1025 IP_VS_DBG_RL("warning: packet_xmit is null");
1026 ret = NF_ACCEPT;
1027 }
1028
1029	/* increase its packet counter and check whether it needs
1030	   to be synchronized */
1031 atomic_inc(&cp->in_pkts);
1032 if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1033 (cp->protocol != IPPROTO_TCP ||
1034 cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1035 (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
1036 == sysctl_ip_vs_sync_threshold[0]))
1037 ip_vs_sync_conn(cp);
1038
1039 ip_vs_conn_put(cp);
1040 return ret;
1041}
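/*
 * Worked example of the sync check above: with the default
 * sysctl_ip_vs_sync_threshold[] = { 3, 50 } and this box acting as sync
 * master, an established TCP connection is pushed to the backup daemon on
 * its 3rd, 53rd, 103rd, ... incoming packet (in_pkts % 50 == 3), so very
 * short-lived connections are never synced and long-lived ones are
 * refreshed roughly once per 50 packets.
 */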
1042
1043
1044/*
1045 * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP
1046 * related packets destined for 0.0.0.0/0.
1047 * When fwmark-based virtual service is used, such as transparent
1048 * cache cluster, TCP packets can be marked and routed to ip_vs_in,
1049 * but ICMP destined for 0.0.0.0/0 cannot easily be marked and
1050 * sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain
1051 * and send them to ip_vs_in_icmp.
1052 */
1053static unsigned int
1054ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb,
1055 const struct net_device *in, const struct net_device *out,
1056 int (*okfn)(struct sk_buff *))
1057{
1058 int r;
1059
1060 if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP)
1061 return NF_ACCEPT;
1062
1063 return ip_vs_in_icmp(pskb, &r, hooknum);
1064}
1065
1066
1067/* After packet filtering, forward packet through VS/DR, VS/TUN,
1068 or VS/NAT(change destination), so that filtering rules can be
1069 applied to IPVS. */
1070static struct nf_hook_ops ip_vs_in_ops = {
1071 .hook = ip_vs_in,
1072 .owner = THIS_MODULE,
1073 .pf = PF_INET,
1074 .hooknum = NF_IP_LOCAL_IN,
1075 .priority = 100,
1076};
1077
1078/* After packet filtering, change source only for VS/NAT */
1079static struct nf_hook_ops ip_vs_out_ops = {
1080 .hook = ip_vs_out,
1081 .owner = THIS_MODULE,
1082 .pf = PF_INET,
1083 .hooknum = NF_IP_FORWARD,
1084 .priority = 100,
1085};
1086
1087/* After packet filtering (but before ip_vs_out_icmp), catch icmp
1088 destined for 0.0.0.0/0, which is for incoming IPVS connections */
1089static struct nf_hook_ops ip_vs_forward_icmp_ops = {
1090 .hook = ip_vs_forward_icmp,
1091 .owner = THIS_MODULE,
1092 .pf = PF_INET,
1093 .hooknum = NF_IP_FORWARD,
1094 .priority = 99,
1095};
1096
1097/* Before the netfilter connection tracking, exit from POST_ROUTING */
1098static struct nf_hook_ops ip_vs_post_routing_ops = {
1099 .hook = ip_vs_post_routing,
1100 .owner = THIS_MODULE,
1101 .pf = PF_INET,
1102 .hooknum = NF_IP_POST_ROUTING,
1103 .priority = NF_IP_PRI_NAT_SRC-1,
1104};
1105
1106
1107/*
1108 * Initialize IP Virtual Server
1109 */
1110static int __init ip_vs_init(void)
1111{
1112 int ret;
1113
1114 ret = ip_vs_control_init();
1115 if (ret < 0) {
1116 IP_VS_ERR("can't setup control.\n");
1117 goto cleanup_nothing;
1118 }
1119
1120 ip_vs_protocol_init();
1121
1122 ret = ip_vs_app_init();
1123 if (ret < 0) {
1124 IP_VS_ERR("can't setup application helper.\n");
1125 goto cleanup_protocol;
1126 }
1127
1128 ret = ip_vs_conn_init();
1129 if (ret < 0) {
1130 IP_VS_ERR("can't setup connection table.\n");
1131 goto cleanup_app;
1132 }
1133
1134 ret = nf_register_hook(&ip_vs_in_ops);
1135 if (ret < 0) {
1136 IP_VS_ERR("can't register in hook.\n");
1137 goto cleanup_conn;
1138 }
1139
1140 ret = nf_register_hook(&ip_vs_out_ops);
1141 if (ret < 0) {
1142 IP_VS_ERR("can't register out hook.\n");
1143 goto cleanup_inops;
1144 }
1145 ret = nf_register_hook(&ip_vs_post_routing_ops);
1146 if (ret < 0) {
1147 IP_VS_ERR("can't register post_routing hook.\n");
1148 goto cleanup_outops;
1149 }
1150 ret = nf_register_hook(&ip_vs_forward_icmp_ops);
1151 if (ret < 0) {
1152 IP_VS_ERR("can't register forward_icmp hook.\n");
1153 goto cleanup_postroutingops;
1154 }
1155
1156 IP_VS_INFO("ipvs loaded.\n");
1157 return ret;
1158
1159 cleanup_postroutingops:
1160 nf_unregister_hook(&ip_vs_post_routing_ops);
1161 cleanup_outops:
1162 nf_unregister_hook(&ip_vs_out_ops);
1163 cleanup_inops:
1164 nf_unregister_hook(&ip_vs_in_ops);
1165 cleanup_conn:
1166 ip_vs_conn_cleanup();
1167 cleanup_app:
1168 ip_vs_app_cleanup();
1169 cleanup_protocol:
1170 ip_vs_protocol_cleanup();
1171 ip_vs_control_cleanup();
1172 cleanup_nothing:
1173 return ret;
1174}
1175
1176static void __exit ip_vs_cleanup(void)
1177{
1178 nf_unregister_hook(&ip_vs_forward_icmp_ops);
1179 nf_unregister_hook(&ip_vs_post_routing_ops);
1180 nf_unregister_hook(&ip_vs_out_ops);
1181 nf_unregister_hook(&ip_vs_in_ops);
1182 ip_vs_conn_cleanup();
1183 ip_vs_app_cleanup();
1184 ip_vs_protocol_cleanup();
1185 ip_vs_control_cleanup();
1186 IP_VS_INFO("ipvs unloaded.\n");
1187}
1188
1189module_init(ip_vs_init);
1190module_exit(ip_vs_cleanup);
1191MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
new file mode 100644
index 000000000000..218d9701036e
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -0,0 +1,2391 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 *
19 * Changes:
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/init.h>
25#include <linux/types.h>
26#include <linux/fs.h>
27#include <linux/sysctl.h>
28#include <linux/proc_fs.h>
29#include <linux/workqueue.h>
30#include <linux/swap.h>
31#include <linux/proc_fs.h>
32#include <linux/seq_file.h>
33
34#include <linux/netfilter.h>
35#include <linux/netfilter_ipv4.h>
36
37#include <net/ip.h>
38#include <net/sock.h>
39
40#include <asm/uaccess.h>
41
42#include <net/ip_vs.h>
43
44/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
45static DECLARE_MUTEX(__ip_vs_mutex);
46
47/* lock for service table */
48static DEFINE_RWLOCK(__ip_vs_svc_lock);
49
50/* lock for table with the real services */
51static DEFINE_RWLOCK(__ip_vs_rs_lock);
52
53/* lock for state and timeout tables */
54static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
55
56/* lock for drop entry handling */
57static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
58
59/* lock for drop packet handling */
60static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
61
62/* 1/rate drop and drop-entry variables */
63int ip_vs_drop_rate = 0;
64int ip_vs_drop_counter = 0;
65static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
66
67/* number of virtual services */
68static int ip_vs_num_services = 0;
69
70/* sysctl variables */
71static int sysctl_ip_vs_drop_entry = 0;
72static int sysctl_ip_vs_drop_packet = 0;
73static int sysctl_ip_vs_secure_tcp = 0;
74static int sysctl_ip_vs_amemthresh = 1024;
75static int sysctl_ip_vs_am_droprate = 10;
76int sysctl_ip_vs_cache_bypass = 0;
77int sysctl_ip_vs_expire_nodest_conn = 0;
78int sysctl_ip_vs_expire_quiescent_template = 0;
79int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
80int sysctl_ip_vs_nat_icmp_send = 0;
81
82
83#ifdef CONFIG_IP_VS_DEBUG
84static int sysctl_ip_vs_debug_level = 0;
85
86int ip_vs_get_debug_level(void)
87{
88 return sysctl_ip_vs_debug_level;
89}
90#endif
91
92/*
93 * update_defense_level is called from keventd and from sysctl.
94 */
95static void update_defense_level(void)
96{
97 struct sysinfo i;
98 static int old_secure_tcp = 0;
99 int availmem;
100 int nomem;
101 int to_change = -1;
102
103 /* we only count free and buffered memory (in pages) */
104 si_meminfo(&i);
105 availmem = i.freeram + i.bufferram;
106	/* however, in linux 2.5 i.bufferram is the total page cache size,
107	   so we would need to adjust it */
108 /* si_swapinfo(&i); */
109 /* availmem = availmem - (i.totalswap - i.freeswap); */
110
111 nomem = (availmem < sysctl_ip_vs_amemthresh);
112
113 /* drop_entry */
114 spin_lock(&__ip_vs_dropentry_lock);
115 switch (sysctl_ip_vs_drop_entry) {
116 case 0:
117 atomic_set(&ip_vs_dropentry, 0);
118 break;
119 case 1:
120 if (nomem) {
121 atomic_set(&ip_vs_dropentry, 1);
122 sysctl_ip_vs_drop_entry = 2;
123 } else {
124 atomic_set(&ip_vs_dropentry, 0);
125 }
126 break;
127 case 2:
128 if (nomem) {
129 atomic_set(&ip_vs_dropentry, 1);
130 } else {
131 atomic_set(&ip_vs_dropentry, 0);
132 sysctl_ip_vs_drop_entry = 1;
133		}
134 break;
135 case 3:
136 atomic_set(&ip_vs_dropentry, 1);
137 break;
138 }
139 spin_unlock(&__ip_vs_dropentry_lock);
140
141 /* drop_packet */
142 spin_lock(&__ip_vs_droppacket_lock);
143 switch (sysctl_ip_vs_drop_packet) {
144 case 0:
145 ip_vs_drop_rate = 0;
146 break;
147 case 1:
148 if (nomem) {
149 ip_vs_drop_rate = ip_vs_drop_counter
150 = sysctl_ip_vs_amemthresh /
151 (sysctl_ip_vs_amemthresh-availmem);
152 sysctl_ip_vs_drop_packet = 2;
153 } else {
154 ip_vs_drop_rate = 0;
155 }
156 break;
157 case 2:
158 if (nomem) {
159 ip_vs_drop_rate = ip_vs_drop_counter
160 = sysctl_ip_vs_amemthresh /
161 (sysctl_ip_vs_amemthresh-availmem);
162 } else {
163 ip_vs_drop_rate = 0;
164 sysctl_ip_vs_drop_packet = 1;
165 }
166 break;
167 case 3:
168 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
169 break;
170 }
171 spin_unlock(&__ip_vs_droppacket_lock);
172
173 /* secure_tcp */
174 write_lock(&__ip_vs_securetcp_lock);
175 switch (sysctl_ip_vs_secure_tcp) {
176 case 0:
177 if (old_secure_tcp >= 2)
178 to_change = 0;
179 break;
180 case 1:
181 if (nomem) {
182 if (old_secure_tcp < 2)
183 to_change = 1;
184 sysctl_ip_vs_secure_tcp = 2;
185 } else {
186 if (old_secure_tcp >= 2)
187 to_change = 0;
188 }
189 break;
190 case 2:
191 if (nomem) {
192 if (old_secure_tcp < 2)
193 to_change = 1;
194 } else {
195 if (old_secure_tcp >= 2)
196 to_change = 0;
197 sysctl_ip_vs_secure_tcp = 1;
198 }
199 break;
200 case 3:
201 if (old_secure_tcp < 2)
202 to_change = 1;
203 break;
204 }
205 old_secure_tcp = sysctl_ip_vs_secure_tcp;
206 if (to_change >= 0)
207 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
208 write_unlock(&__ip_vs_securetcp_lock);
209}
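/*
 * Worked example of the drop_packet rate computed above: with
 * sysctl_ip_vs_amemthresh at its default of 1024 pages and only
 * availmem = 768 pages free, nomem is true and
 *
 *	ip_vs_drop_rate = 1024 / (1024 - 768) = 4
 *
 * so, while memory stays low, roughly one of every four new connection
 * requests is refused by the drop_packet defense; the closer availmem gets
 * to zero, the closer the rate gets to 1 (drop every new request).
 */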
210
211
212/*
213 * Timer for checking the defense
214 */
215#define DEFENSE_TIMER_PERIOD 1*HZ
216static void defense_work_handler(void *data);
217static DECLARE_WORK(defense_work, defense_work_handler, NULL);
218
219static void defense_work_handler(void *data)
220{
221 update_defense_level();
222 if (atomic_read(&ip_vs_dropentry))
223 ip_vs_random_dropentry();
224
225 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
226}
227
228int
229ip_vs_use_count_inc(void)
230{
231 return try_module_get(THIS_MODULE);
232}
233
234void
235ip_vs_use_count_dec(void)
236{
237 module_put(THIS_MODULE);
238}
239
240
241/*
242 * Hash table: for virtual service lookups
243 */
244#define IP_VS_SVC_TAB_BITS 8
245#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
246#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
247
248/* the service table hashed by <protocol, addr, port> */
249static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
250/* the service table hashed by fwmark */
251static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
252
253/*
254 * Hash table: for real service lookups
255 */
256#define IP_VS_RTAB_BITS 4
257#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
258#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
259
260static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
261
262/*
263 * Trash for destinations
264 */
265static LIST_HEAD(ip_vs_dest_trash);
266
267/*
268 * FTP & NULL virtual service counters
269 */
270static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
271static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
272
273
274/*
275 * Returns hash value for virtual service
276 */
277static __inline__ unsigned
278ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
279{
280 register unsigned porth = ntohs(port);
281
282 return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
283 & IP_VS_SVC_TAB_MASK;
284}
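/*
 * Example hash computation (values illustrative): for a TCP service on
 * 10.0.0.1:80, porth = 80, ntohl(addr) = 0x0A000001 and
 *
 *	(6 ^ 0x0A000001 ^ (80 >> 8) ^ 80) & 0xFF = 0x57
 *
 * so the service lands in bucket 87 of ip_vs_svc_table.  Folding the high
 * bits of the port into the low bits helps keep services that differ only
 * in port number from clustering in a few buckets.
 */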
285
286/*
287 * Returns hash value of fwmark for virtual service lookup
288 */
289static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
290{
291 return fwmark & IP_VS_SVC_TAB_MASK;
292}
293
294/*
295 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
296 * or in the ip_vs_svc_fwm_table by fwmark.
297 * Should be called with locked tables.
298 */
299static int ip_vs_svc_hash(struct ip_vs_service *svc)
300{
301 unsigned hash;
302
303 if (svc->flags & IP_VS_SVC_F_HASHED) {
304 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
305 "called from %p\n", __builtin_return_address(0));
306 return 0;
307 }
308
309 if (svc->fwmark == 0) {
310 /*
311 * Hash it by <protocol,addr,port> in ip_vs_svc_table
312 */
313 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
314 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
315 } else {
316 /*
317 * Hash it by fwmark in ip_vs_svc_fwm_table
318 */
319 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
320 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
321 }
322
323 svc->flags |= IP_VS_SVC_F_HASHED;
324 /* increase its refcnt because it is referenced by the svc table */
325 atomic_inc(&svc->refcnt);
326 return 1;
327}
328
329
330/*
331 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
332 * Should be called with locked tables.
333 */
334static int ip_vs_svc_unhash(struct ip_vs_service *svc)
335{
336 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
337 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
338 "called from %p\n", __builtin_return_address(0));
339 return 0;
340 }
341
342 if (svc->fwmark == 0) {
343 /* Remove it from the ip_vs_svc_table table */
344 list_del(&svc->s_list);
345 } else {
346 /* Remove it from the ip_vs_svc_fwm_table table */
347 list_del(&svc->f_list);
348 }
349
350 svc->flags &= ~IP_VS_SVC_F_HASHED;
351 atomic_dec(&svc->refcnt);
352 return 1;
353}
354
355
356/*
357 * Get service by {proto,addr,port} in the service table.
358 */
359static __inline__ struct ip_vs_service *
360__ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
361{
362 unsigned hash;
363 struct ip_vs_service *svc;
364
365 /* Check for "full" addressed entries */
366 hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
367
368 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
369 if ((svc->addr == vaddr)
370 && (svc->port == vport)
371 && (svc->protocol == protocol)) {
372 /* HIT */
373 atomic_inc(&svc->usecnt);
374 return svc;
375 }
376 }
377
378 return NULL;
379}
380
381
382/*
383 * Get service by {fwmark} in the service table.
384 */
385static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
386{
387 unsigned hash;
388 struct ip_vs_service *svc;
389
390 /* Check for fwmark addressed entries */
391 hash = ip_vs_svc_fwm_hashkey(fwmark);
392
393 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
394 if (svc->fwmark == fwmark) {
395 /* HIT */
396 atomic_inc(&svc->usecnt);
397 return svc;
398 }
399 }
400
401 return NULL;
402}
403
404struct ip_vs_service *
405ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
406{
407 struct ip_vs_service *svc;
408
409 read_lock(&__ip_vs_svc_lock);
410
411 /*
412 * Check the table hashed by fwmark first
413 */
414 if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
415 goto out;
416
417 /*
418 * Check the table hashed by <protocol,addr,port>
419 * for "full" addressed entries
420 */
421 svc = __ip_vs_service_get(protocol, vaddr, vport);
422
423 if (svc == NULL
424 && protocol == IPPROTO_TCP
425 && atomic_read(&ip_vs_ftpsvc_counter)
426 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
427 /*
428 * Check if ftp service entry exists, the packet
429 * might belong to FTP data connections.
430 */
431 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
432 }
433
434 if (svc == NULL
435 && atomic_read(&ip_vs_nullsvc_counter)) {
436 /*
437 * Check if the catch-all port (port zero) exists
438 */
439 svc = __ip_vs_service_get(protocol, vaddr, 0);
440 }
441
442 out:
443 read_unlock(&__ip_vs_svc_lock);
444
445 IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
446 fwmark, ip_vs_proto_name(protocol),
447 NIPQUAD(vaddr), ntohs(vport),
448 svc?"hit":"not hit");
449
450 return svc;
451}
452
453
454static inline void
455__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
456{
457 atomic_inc(&svc->refcnt);
458 dest->svc = svc;
459}
460
461static inline void
462__ip_vs_unbind_svc(struct ip_vs_dest *dest)
463{
464 struct ip_vs_service *svc = dest->svc;
465
466 dest->svc = NULL;
467 if (atomic_dec_and_test(&svc->refcnt))
468 kfree(svc);
469}
470
471
472/*
473 * Returns hash value for real service
474 */
475static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
476{
477 register unsigned porth = ntohs(port);
478
479 return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
480 & IP_VS_RTAB_MASK;
481}
482
483/*
484 * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
485 * should be called with locked tables.
486 */
487static int ip_vs_rs_hash(struct ip_vs_dest *dest)
488{
489 unsigned hash;
490
491 if (!list_empty(&dest->d_list)) {
492 return 0;
493 }
494
495 /*
496 * Hash by proto,addr,port,
497 * which are the parameters of the real service.
498 */
499 hash = ip_vs_rs_hashkey(dest->addr, dest->port);
500 list_add(&dest->d_list, &ip_vs_rtable[hash]);
501
502 return 1;
503}
504
505/*
506 * UNhashes ip_vs_dest from ip_vs_rtable.
507 * should be called with locked tables.
508 */
509static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
510{
511 /*
512 * Remove it from the ip_vs_rtable table.
513 */
514 if (!list_empty(&dest->d_list)) {
515 list_del(&dest->d_list);
516 INIT_LIST_HEAD(&dest->d_list);
517 }
518
519 return 1;
520}
521
522/*
523 * Lookup real service by <proto,addr,port> in the real service table.
524 */
525struct ip_vs_dest *
526ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
527{
528 unsigned hash;
529 struct ip_vs_dest *dest;
530
531 /*
532 * Check for "full" addressed entries
533 * Return the first found entry
534 */
535 hash = ip_vs_rs_hashkey(daddr, dport);
536
537 read_lock(&__ip_vs_rs_lock);
538 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
539 if ((dest->addr == daddr)
540 && (dest->port == dport)
541 && ((dest->protocol == protocol) ||
542 dest->vfwmark)) {
543 /* HIT */
544 read_unlock(&__ip_vs_rs_lock);
545 return dest;
546 }
547 }
548 read_unlock(&__ip_vs_rs_lock);
549
550 return NULL;
551}
552
553/*
554 * Lookup destination by {addr,port} in the given service
555 */
556static struct ip_vs_dest *
557ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
558{
559 struct ip_vs_dest *dest;
560
561 /*
562 * Find the destination for the given service
563 */
564 list_for_each_entry(dest, &svc->destinations, n_list) {
565 if ((dest->addr == daddr) && (dest->port == dport)) {
566 /* HIT */
567 return dest;
568 }
569 }
570
571 return NULL;
572}
573
574
575/*
576 * Lookup dest by {svc,addr,port} in the destination trash.
577 * The destination trash is used to hold the destinations that are removed
578 * from the service table but are still referenced by some conn entries.
579	 * The trash exists because a dest may be only temporarily down (taken
580	 * down by the administrator or by a monitor program); it can then be
581	 * picked back from the trash, the remaining connections to it can
582	 * continue, and its counting information remains useful for
583	 * scheduling.
584 */
585static struct ip_vs_dest *
586ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
587{
588 struct ip_vs_dest *dest, *nxt;
589
590 /*
591 * Find the destination in trash
592 */
593 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
594 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
595 "refcnt=%d\n",
596 dest->vfwmark,
597 NIPQUAD(dest->addr), ntohs(dest->port),
598 atomic_read(&dest->refcnt));
599 if (dest->addr == daddr &&
600 dest->port == dport &&
601 dest->vfwmark == svc->fwmark &&
602 dest->protocol == svc->protocol &&
603 (svc->fwmark ||
604 (dest->vaddr == svc->addr &&
605 dest->vport == svc->port))) {
606 /* HIT */
607 return dest;
608 }
609
610 /*
611 * Try to purge the destination from trash if not referenced
612 */
613 if (atomic_read(&dest->refcnt) == 1) {
614 IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
615 "from trash\n",
616 dest->vfwmark,
617 NIPQUAD(dest->addr), ntohs(dest->port));
618 list_del(&dest->n_list);
619 ip_vs_dst_reset(dest);
620 __ip_vs_unbind_svc(dest);
621 kfree(dest);
622 }
623 }
624
625 return NULL;
626}
627
628
629/*
630 * Clean up all the destinations in the trash
631 * Called by the ip_vs_control_cleanup()
632 *
633 * When ip_vs_control_cleanup is activated by the ipvs module exit,
634 * the service tables must have been flushed and all the connections
635 * have expired, and the refcnt of each destination in the trash must
636 * be 1, so we simply release them here.
637 */
638static void ip_vs_trash_cleanup(void)
639{
640 struct ip_vs_dest *dest, *nxt;
641
642 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
643 list_del(&dest->n_list);
644 ip_vs_dst_reset(dest);
645 __ip_vs_unbind_svc(dest);
646 kfree(dest);
647 }
648}
649
650
651static void
652ip_vs_zero_stats(struct ip_vs_stats *stats)
653{
654 spin_lock_bh(&stats->lock);
655 memset(stats, 0, (char *)&stats->lock - (char *)stats);
656 spin_unlock_bh(&stats->lock);
657 ip_vs_zero_estimator(stats);
658}
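/*
 * The memset above clears only the counter fields that precede the lock,
 * relying on the spinlock being declared after the counters in
 * struct ip_vs_stats.  A user-space sketch of the same pointer-arithmetic
 * idiom, with an illustrative (non-kernel) struct:
 */
#include <stdio.h>
#include <string.h>

struct demo_stats {
	unsigned int conns;		/* counters to be cleared */
	unsigned int inpkts;
	unsigned int outpkts;
	int lock;			/* stand-in for the spinlock; stays last */
};

static void demo_zero_stats(struct demo_stats *s)
{
	/* zero everything up to, but not including, the lock */
	memset(s, 0, (char *)&s->lock - (char *)s);
}

int main(void)
{
	struct demo_stats s = { 1, 2, 3, 42 };

	demo_zero_stats(&s);
	printf("%u %u %u lock=%d\n", s.conns, s.inpkts, s.outpkts, s.lock);
	return 0;			/* prints: 0 0 0 lock=42 */
}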
659
660/*
661 * Update a destination in the given service
662 */
663static void
664__ip_vs_update_dest(struct ip_vs_service *svc,
665 struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
666{
667 int conn_flags;
668
669 /* set the weight and the flags */
670 atomic_set(&dest->weight, udest->weight);
671 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
672
673 /* check if local node and update the flags */
674 if (inet_addr_type(udest->addr) == RTN_LOCAL) {
675 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
676 | IP_VS_CONN_F_LOCALNODE;
677 }
678
679 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
680 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
681 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
682 } else {
683 /*
684 * Put the real service in ip_vs_rtable if not present.
685 * For now only for NAT!
686 */
687 write_lock_bh(&__ip_vs_rs_lock);
688 ip_vs_rs_hash(dest);
689 write_unlock_bh(&__ip_vs_rs_lock);
690 }
691 atomic_set(&dest->conn_flags, conn_flags);
692
693 /* bind the service */
694 if (!dest->svc) {
695 __ip_vs_bind_svc(dest, svc);
696 } else {
697 if (dest->svc != svc) {
698 __ip_vs_unbind_svc(dest);
699 ip_vs_zero_stats(&dest->stats);
700 __ip_vs_bind_svc(dest, svc);
701 }
702 }
703
704 /* set the dest status flags */
705 dest->flags |= IP_VS_DEST_F_AVAILABLE;
706
707 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
708 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
709 dest->u_threshold = udest->u_threshold;
710 dest->l_threshold = udest->l_threshold;
711}
712
713
714/*
715 * Create a destination for the given service
716 */
717static int
718ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
719 struct ip_vs_dest **dest_p)
720{
721 struct ip_vs_dest *dest;
722 unsigned atype;
723
724 EnterFunction(2);
725
726 atype = inet_addr_type(udest->addr);
727 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
728 return -EINVAL;
729
730 dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
731 if (dest == NULL) {
732 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
733 return -ENOMEM;
734 }
735 memset(dest, 0, sizeof(struct ip_vs_dest));
736
737 dest->protocol = svc->protocol;
738 dest->vaddr = svc->addr;
739 dest->vport = svc->port;
740 dest->vfwmark = svc->fwmark;
741 dest->addr = udest->addr;
742 dest->port = udest->port;
743
744 atomic_set(&dest->activeconns, 0);
745 atomic_set(&dest->inactconns, 0);
746 atomic_set(&dest->persistconns, 0);
747 atomic_set(&dest->refcnt, 0);
748
749 INIT_LIST_HEAD(&dest->d_list);
750 spin_lock_init(&dest->dst_lock);
751 spin_lock_init(&dest->stats.lock);
752 __ip_vs_update_dest(svc, dest, udest);
753 ip_vs_new_estimator(&dest->stats);
754
755 *dest_p = dest;
756
757 LeaveFunction(2);
758 return 0;
759}
760
761
762/*
763 * Add a destination into an existing service
764 */
765static int
766ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
767{
768 struct ip_vs_dest *dest;
769 __u32 daddr = udest->addr;
770 __u16 dport = udest->port;
771 int ret;
772
773 EnterFunction(2);
774
775 if (udest->weight < 0) {
776 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
777 return -ERANGE;
778 }
779
780 if (udest->l_threshold > udest->u_threshold) {
781 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
782 "upper threshold\n");
783 return -ERANGE;
784 }
785
786 /*
787 * Check if the dest already exists in the list
788 */
789 dest = ip_vs_lookup_dest(svc, daddr, dport);
790 if (dest != NULL) {
791 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
792 return -EEXIST;
793 }
794
795 /*
796 * Check if the dest already exists in the trash and
797 * is from the same service
798 */
799 dest = ip_vs_trash_get_dest(svc, daddr, dport);
800 if (dest != NULL) {
801 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
802 "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
803 NIPQUAD(daddr), ntohs(dport),
804 atomic_read(&dest->refcnt),
805 dest->vfwmark,
806 NIPQUAD(dest->vaddr),
807 ntohs(dest->vport));
808 __ip_vs_update_dest(svc, dest, udest);
809
810 /*
811 * Get the destination from the trash
812 */
813 list_del(&dest->n_list);
814
815 ip_vs_new_estimator(&dest->stats);
816
817 write_lock_bh(&__ip_vs_svc_lock);
818
819 /*
820 * Wait until all other svc users go away.
821 */
822 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
823
824 list_add(&dest->n_list, &svc->destinations);
825 svc->num_dests++;
826
827 /* call the update_service function of its scheduler */
828 svc->scheduler->update_service(svc);
829
830 write_unlock_bh(&__ip_vs_svc_lock);
831 return 0;
832 }
833
834 /*
835 * Allocate and initialize the dest structure
836 */
837 ret = ip_vs_new_dest(svc, udest, &dest);
838 if (ret) {
839 return ret;
840 }
841
842 /*
843 * Add the dest entry into the list
844 */
845 atomic_inc(&dest->refcnt);
846
847 write_lock_bh(&__ip_vs_svc_lock);
848
849 /*
850 * Wait until all other svc users go away.
851 */
852 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
853
854 list_add(&dest->n_list, &svc->destinations);
855 svc->num_dests++;
856
857 /* call the update_service function of its scheduler */
858 svc->scheduler->update_service(svc);
859
860 write_unlock_bh(&__ip_vs_svc_lock);
861
862 LeaveFunction(2);
863
864 return 0;
865}
866
867
868/*
869 * Edit a destination in the given service
870 */
871static int
872ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
873{
874 struct ip_vs_dest *dest;
875 __u32 daddr = udest->addr;
876 __u16 dport = udest->port;
877
878 EnterFunction(2);
879
880 if (udest->weight < 0) {
881 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
882 return -ERANGE;
883 }
884
885 if (udest->l_threshold > udest->u_threshold) {
886 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
887 "upper threshold\n");
888 return -ERANGE;
889 }
890
891 /*
892 * Lookup the destination list
893 */
894 dest = ip_vs_lookup_dest(svc, daddr, dport);
895 if (dest == NULL) {
896 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
897 return -ENOENT;
898 }
899
900 __ip_vs_update_dest(svc, dest, udest);
901
902 write_lock_bh(&__ip_vs_svc_lock);
903
904 /* Wait until all other svc users go away */
905	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
906
907 /* call the update_service, because server weight may be changed */
908 svc->scheduler->update_service(svc);
909
910 write_unlock_bh(&__ip_vs_svc_lock);
911
912 LeaveFunction(2);
913
914 return 0;
915}
916
917
918/*
919 * Delete a destination (must be already unlinked from the service)
920 */
921static void __ip_vs_del_dest(struct ip_vs_dest *dest)
922{
923 ip_vs_kill_estimator(&dest->stats);
924
925 /*
926	 * Remove it from the doubly-linked list of real services.
927 */
928 write_lock_bh(&__ip_vs_rs_lock);
929 ip_vs_rs_unhash(dest);
930 write_unlock_bh(&__ip_vs_rs_lock);
931
932 /*
933 * Decrease the refcnt of the dest, and free the dest
934 * if nobody refers to it (refcnt=0). Otherwise, throw
935 * the destination into the trash.
936 */
937 if (atomic_dec_and_test(&dest->refcnt)) {
938 ip_vs_dst_reset(dest);
939 /* simply decrease svc->refcnt here, let the caller check
940 and release the service if nobody refers to it.
941 Only user context can release destination and service,
942 and only one user context can update virtual service at a
943 time, so the operation here is OK */
944 atomic_dec(&dest->svc->refcnt);
945 kfree(dest);
946 } else {
947 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
948 NIPQUAD(dest->addr), ntohs(dest->port),
949 atomic_read(&dest->refcnt));
950 list_add(&dest->n_list, &ip_vs_dest_trash);
951 atomic_inc(&dest->refcnt);
952 }
953}
954
955
956/*
957 * Unlink a destination from the given service
958 */
959static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
960 struct ip_vs_dest *dest,
961 int svcupd)
962{
963 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
964
965 /*
966	 * Remove it from the doubly-linked destination list.
967 */
968 list_del(&dest->n_list);
969 svc->num_dests--;
970 if (svcupd) {
971 /*
972 * Call the update_service function of its scheduler
973 */
974 svc->scheduler->update_service(svc);
975 }
976}
977
978
979/*
980 * Delete a destination server in the given service
981 */
982static int
983ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
984{
985 struct ip_vs_dest *dest;
986 __u32 daddr = udest->addr;
987 __u16 dport = udest->port;
988
989 EnterFunction(2);
990
991 dest = ip_vs_lookup_dest(svc, daddr, dport);
992 if (dest == NULL) {
993 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
994 return -ENOENT;
995 }
996
997 write_lock_bh(&__ip_vs_svc_lock);
998
999 /*
1000 * Wait until all other svc users go away.
1001 */
1002 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1003
1004 /*
1005 * Unlink dest from the service
1006 */
1007 __ip_vs_unlink_dest(svc, dest, 1);
1008
1009 write_unlock_bh(&__ip_vs_svc_lock);
1010
1011 /*
1012 * Delete the destination
1013 */
1014 __ip_vs_del_dest(dest);
1015
1016 LeaveFunction(2);
1017
1018 return 0;
1019}
1020
1021
1022/*
1023 * Add a service into the service hash table
1024 */
1025static int
1026ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1027{
1028 int ret = 0;
1029 struct ip_vs_scheduler *sched = NULL;
1030 struct ip_vs_service *svc = NULL;
1031
1032 /* increase the module use count */
1033 ip_vs_use_count_inc();
1034
1035 /* Lookup the scheduler by 'u->sched_name' */
1036 sched = ip_vs_scheduler_get(u->sched_name);
1037 if (sched == NULL) {
1038 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1039 u->sched_name);
1040 ret = -ENOENT;
1041 goto out_mod_dec;
1042 }
1043
1044 svc = (struct ip_vs_service *)
1045 kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1046 if (svc == NULL) {
1047 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1048 ret = -ENOMEM;
1049 goto out_err;
1050 }
1051 memset(svc, 0, sizeof(struct ip_vs_service));
1052
1053 /* I'm the first user of the service */
1054 atomic_set(&svc->usecnt, 1);
1055 atomic_set(&svc->refcnt, 0);
1056
1057 svc->protocol = u->protocol;
1058 svc->addr = u->addr;
1059 svc->port = u->port;
1060 svc->fwmark = u->fwmark;
1061 svc->flags = u->flags;
1062 svc->timeout = u->timeout * HZ;
1063 svc->netmask = u->netmask;
1064
1065 INIT_LIST_HEAD(&svc->destinations);
1066 rwlock_init(&svc->sched_lock);
1067 spin_lock_init(&svc->stats.lock);
1068
1069 /* Bind the scheduler */
1070 ret = ip_vs_bind_scheduler(svc, sched);
1071 if (ret)
1072 goto out_err;
1073 sched = NULL;
1074
1075 /* Update the virtual service counters */
1076 if (svc->port == FTPPORT)
1077 atomic_inc(&ip_vs_ftpsvc_counter);
1078 else if (svc->port == 0)
1079 atomic_inc(&ip_vs_nullsvc_counter);
1080
1081 ip_vs_new_estimator(&svc->stats);
1082 ip_vs_num_services++;
1083
1084 /* Hash the service into the service table */
1085 write_lock_bh(&__ip_vs_svc_lock);
1086 ip_vs_svc_hash(svc);
1087 write_unlock_bh(&__ip_vs_svc_lock);
1088
1089 *svc_p = svc;
1090 return 0;
1091
1092 out_err:
1093 if (svc != NULL) {
1094 if (svc->scheduler)
1095 ip_vs_unbind_scheduler(svc);
1096 if (svc->inc) {
1097 local_bh_disable();
1098 ip_vs_app_inc_put(svc->inc);
1099 local_bh_enable();
1100 }
1101 kfree(svc);
1102 }
1103 ip_vs_scheduler_put(sched);
1104
1105 out_mod_dec:
1106 /* decrease the module use count */
1107 ip_vs_use_count_dec();
1108
1109 return ret;
1110}
1111
1112
1113/*
1114 * Edit a service and bind it with a new scheduler
1115 */
1116static int
1117ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1118{
1119 struct ip_vs_scheduler *sched, *old_sched;
1120 int ret = 0;
1121
1122 /*
1123 * Lookup the scheduler, by 'u->sched_name'
1124 */
1125 sched = ip_vs_scheduler_get(u->sched_name);
1126 if (sched == NULL) {
1127 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1128 u->sched_name);
1129 return -ENOENT;
1130 }
1131 old_sched = sched;
1132
1133 write_lock_bh(&__ip_vs_svc_lock);
1134
1135 /*
1136 * Wait until all other svc users go away.
1137 */
1138 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1139
1140 /*
1141 * Set the flags and timeout value
1142 */
1143 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1144 svc->timeout = u->timeout * HZ;
1145 svc->netmask = u->netmask;
1146
1147 old_sched = svc->scheduler;
1148 if (sched != old_sched) {
1149 /*
1150 * Unbind the old scheduler
1151 */
1152 if ((ret = ip_vs_unbind_scheduler(svc))) {
1153 old_sched = sched;
1154 goto out;
1155 }
1156
1157 /*
1158 * Bind the new scheduler
1159 */
1160 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1161 /*
1162 * If ip_vs_bind_scheduler fails, restore the old
1163 * scheduler.
1164			 * The main reason for failure is lack of memory.
1165			 *
1166			 * The question is whether the old scheduler can always
1167			 * be restored. TODO: if at some point it cannot be
1168			 * restored, we must delete the service,
1169 * otherwise the system may crash.
1170 */
1171 ip_vs_bind_scheduler(svc, old_sched);
1172 old_sched = sched;
1173 goto out;
1174 }
1175 }
1176
1177 out:
1178 write_unlock_bh(&__ip_vs_svc_lock);
1179
1180 if (old_sched)
1181 ip_vs_scheduler_put(old_sched);
1182
1183 return ret;
1184}
1185
1186
1187/*
1188 * Delete a service from the service list
1189 * - The service must be unlinked, unlocked and not referenced!
1190 * - We are called under _bh lock
1191 */
1192static void __ip_vs_del_service(struct ip_vs_service *svc)
1193{
1194 struct ip_vs_dest *dest, *nxt;
1195 struct ip_vs_scheduler *old_sched;
1196
1197 ip_vs_num_services--;
1198 ip_vs_kill_estimator(&svc->stats);
1199
1200 /* Unbind scheduler */
1201 old_sched = svc->scheduler;
1202 ip_vs_unbind_scheduler(svc);
1203 if (old_sched)
1204 ip_vs_scheduler_put(old_sched);
1205
1206 /* Unbind app inc */
1207 if (svc->inc) {
1208 ip_vs_app_inc_put(svc->inc);
1209 svc->inc = NULL;
1210 }
1211
1212 /*
1213 * Unlink the whole destination list
1214 */
1215 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1216 __ip_vs_unlink_dest(svc, dest, 0);
1217 __ip_vs_del_dest(dest);
1218 }
1219
1220 /*
1221 * Update the virtual service counters
1222 */
1223 if (svc->port == FTPPORT)
1224 atomic_dec(&ip_vs_ftpsvc_counter);
1225 else if (svc->port == 0)
1226 atomic_dec(&ip_vs_nullsvc_counter);
1227
1228 /*
1229 * Free the service if nobody refers to it
1230 */
1231 if (atomic_read(&svc->refcnt) == 0)
1232 kfree(svc);
1233
1234 /* decrease the module use count */
1235 ip_vs_use_count_dec();
1236}
1237
1238/*
1239 * Delete a service from the service list
1240 */
1241static int ip_vs_del_service(struct ip_vs_service *svc)
1242{
1243 if (svc == NULL)
1244 return -EEXIST;
1245
1246 /*
1247 * Unhash it from the service table
1248 */
1249 write_lock_bh(&__ip_vs_svc_lock);
1250
1251 ip_vs_svc_unhash(svc);
1252
1253 /*
1254 * Wait until all the svc users go away.
1255 */
1256 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1257
1258 __ip_vs_del_service(svc);
1259
1260 write_unlock_bh(&__ip_vs_svc_lock);
1261
1262 return 0;
1263}
1264
1265
1266/*
1267 * Flush all the virtual services
1268 */
1269static int ip_vs_flush(void)
1270{
1271 int idx;
1272 struct ip_vs_service *svc, *nxt;
1273
1274 /*
1275 * Flush the service table hashed by <protocol,addr,port>
1276 */
1277 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1278 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1279 write_lock_bh(&__ip_vs_svc_lock);
1280 ip_vs_svc_unhash(svc);
1281 /*
1282 * Wait until all the svc users go away.
1283 */
1284 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1285 __ip_vs_del_service(svc);
1286 write_unlock_bh(&__ip_vs_svc_lock);
1287 }
1288 }
1289
1290 /*
1291 * Flush the service table hashed by fwmark
1292 */
1293 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1294 list_for_each_entry_safe(svc, nxt,
1295 &ip_vs_svc_fwm_table[idx], f_list) {
1296 write_lock_bh(&__ip_vs_svc_lock);
1297 ip_vs_svc_unhash(svc);
1298 /*
1299 * Wait until all the svc users go away.
1300 */
1301 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1302 __ip_vs_del_service(svc);
1303 write_unlock_bh(&__ip_vs_svc_lock);
1304 }
1305 }
1306
1307 return 0;
1308}
1309
1310
1311/*
1312 * Zero counters in a service or all services
1313 */
1314static int ip_vs_zero_service(struct ip_vs_service *svc)
1315{
1316 struct ip_vs_dest *dest;
1317
1318 write_lock_bh(&__ip_vs_svc_lock);
1319 list_for_each_entry(dest, &svc->destinations, n_list) {
1320 ip_vs_zero_stats(&dest->stats);
1321 }
1322 ip_vs_zero_stats(&svc->stats);
1323 write_unlock_bh(&__ip_vs_svc_lock);
1324 return 0;
1325}
1326
1327static int ip_vs_zero_all(void)
1328{
1329 int idx;
1330 struct ip_vs_service *svc;
1331
1332 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1333 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1334 ip_vs_zero_service(svc);
1335 }
1336 }
1337
1338 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1339 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1340 ip_vs_zero_service(svc);
1341 }
1342 }
1343
1344 ip_vs_zero_stats(&ip_vs_stats);
1345 return 0;
1346}
1347
1348
1349static int
1350proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1351 void __user *buffer, size_t *lenp, loff_t *ppos)
1352{
1353 int *valp = table->data;
1354 int val = *valp;
1355 int rc;
1356
1357 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1358 if (write && (*valp != val)) {
1359 if ((*valp < 0) || (*valp > 3)) {
1360 /* Restore the correct value */
1361 *valp = val;
1362 } else {
1363 local_bh_disable();
1364 update_defense_level();
1365 local_bh_enable();
1366 }
1367 }
1368 return rc;
1369}
1370
1371
1372static int
1373proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1374 void __user *buffer, size_t *lenp, loff_t *ppos)
1375{
1376 int *valp = table->data;
1377 int val[2];
1378 int rc;
1379
1380 /* backup the value first */
1381 memcpy(val, valp, sizeof(val));
1382
1383 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1384 if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1385 /* Restore the correct value */
1386 memcpy(valp, val, sizeof(val));
1387 }
1388 return rc;
1389}
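/*
 * A small user-space sketch of driving this handler, assuming the usual
 * /proc/sys/net/ipv4/vs/ path built from the ctl_table chain defined below
 * (root privileges required).  The value is a pair of integers, commonly a
 * sync threshold and period; the handler silently restores the previous pair
 * unless 0 <= first < second.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/vs/sync_threshold", "w");

	if (!f)
		return 1;
	fprintf(f, "3 50\n");	/* accepted: 0 <= 3 < 50 */
	fclose(f);
	return 0;
}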
1390
1391
1392/*
1393 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1394 */
1395
1396static struct ctl_table vs_vars[] = {
1397 {
1398 .ctl_name = NET_IPV4_VS_AMEMTHRESH,
1399 .procname = "amemthresh",
1400 .data = &sysctl_ip_vs_amemthresh,
1401 .maxlen = sizeof(int),
1402 .mode = 0644,
1403 .proc_handler = &proc_dointvec,
1404 },
1405#ifdef CONFIG_IP_VS_DEBUG
1406 {
1407 .ctl_name = NET_IPV4_VS_DEBUG_LEVEL,
1408 .procname = "debug_level",
1409 .data = &sysctl_ip_vs_debug_level,
1410 .maxlen = sizeof(int),
1411 .mode = 0644,
1412 .proc_handler = &proc_dointvec,
1413 },
1414#endif
1415 {
1416 .ctl_name = NET_IPV4_VS_AMDROPRATE,
1417 .procname = "am_droprate",
1418 .data = &sysctl_ip_vs_am_droprate,
1419 .maxlen = sizeof(int),
1420 .mode = 0644,
1421 .proc_handler = &proc_dointvec,
1422 },
1423 {
1424 .ctl_name = NET_IPV4_VS_DROP_ENTRY,
1425 .procname = "drop_entry",
1426 .data = &sysctl_ip_vs_drop_entry,
1427 .maxlen = sizeof(int),
1428 .mode = 0644,
1429 .proc_handler = &proc_do_defense_mode,
1430 },
1431 {
1432 .ctl_name = NET_IPV4_VS_DROP_PACKET,
1433 .procname = "drop_packet",
1434 .data = &sysctl_ip_vs_drop_packet,
1435 .maxlen = sizeof(int),
1436 .mode = 0644,
1437 .proc_handler = &proc_do_defense_mode,
1438 },
1439 {
1440 .ctl_name = NET_IPV4_VS_SECURE_TCP,
1441 .procname = "secure_tcp",
1442 .data = &sysctl_ip_vs_secure_tcp,
1443 .maxlen = sizeof(int),
1444 .mode = 0644,
1445 .proc_handler = &proc_do_defense_mode,
1446 },
1447#if 0
1448 {
1449 .ctl_name = NET_IPV4_VS_TO_ES,
1450 .procname = "timeout_established",
1451 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1452 .maxlen = sizeof(int),
1453 .mode = 0644,
1454 .proc_handler = &proc_dointvec_jiffies,
1455 },
1456 {
1457 .ctl_name = NET_IPV4_VS_TO_SS,
1458 .procname = "timeout_synsent",
1459 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1460 .maxlen = sizeof(int),
1461 .mode = 0644,
1462 .proc_handler = &proc_dointvec_jiffies,
1463 },
1464 {
1465 .ctl_name = NET_IPV4_VS_TO_SR,
1466 .procname = "timeout_synrecv",
1467 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1468 .maxlen = sizeof(int),
1469 .mode = 0644,
1470 .proc_handler = &proc_dointvec_jiffies,
1471 },
1472 {
1473 .ctl_name = NET_IPV4_VS_TO_FW,
1474 .procname = "timeout_finwait",
1475 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1476 .maxlen = sizeof(int),
1477 .mode = 0644,
1478 .proc_handler = &proc_dointvec_jiffies,
1479 },
1480 {
1481 .ctl_name = NET_IPV4_VS_TO_TW,
1482 .procname = "timeout_timewait",
1483 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1484 .maxlen = sizeof(int),
1485 .mode = 0644,
1486 .proc_handler = &proc_dointvec_jiffies,
1487 },
1488 {
1489 .ctl_name = NET_IPV4_VS_TO_CL,
1490 .procname = "timeout_close",
1491 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1492 .maxlen = sizeof(int),
1493 .mode = 0644,
1494 .proc_handler = &proc_dointvec_jiffies,
1495 },
1496 {
1497 .ctl_name = NET_IPV4_VS_TO_CW,
1498 .procname = "timeout_closewait",
1499 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1500 .maxlen = sizeof(int),
1501 .mode = 0644,
1502 .proc_handler = &proc_dointvec_jiffies,
1503 },
1504 {
1505 .ctl_name = NET_IPV4_VS_TO_LA,
1506 .procname = "timeout_lastack",
1507 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1508 .maxlen = sizeof(int),
1509 .mode = 0644,
1510 .proc_handler = &proc_dointvec_jiffies,
1511 },
1512 {
1513 .ctl_name = NET_IPV4_VS_TO_LI,
1514 .procname = "timeout_listen",
1515 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1516 .maxlen = sizeof(int),
1517 .mode = 0644,
1518 .proc_handler = &proc_dointvec_jiffies,
1519 },
1520 {
1521 .ctl_name = NET_IPV4_VS_TO_SA,
1522 .procname = "timeout_synack",
1523 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1524 .maxlen = sizeof(int),
1525 .mode = 0644,
1526 .proc_handler = &proc_dointvec_jiffies,
1527 },
1528 {
1529 .ctl_name = NET_IPV4_VS_TO_UDP,
1530 .procname = "timeout_udp",
1531 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1532 .maxlen = sizeof(int),
1533 .mode = 0644,
1534 .proc_handler = &proc_dointvec_jiffies,
1535 },
1536 {
1537 .ctl_name = NET_IPV4_VS_TO_ICMP,
1538 .procname = "timeout_icmp",
1539 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1540 .maxlen = sizeof(int),
1541 .mode = 0644,
1542 .proc_handler = &proc_dointvec_jiffies,
1543 },
1544#endif
1545 {
1546 .ctl_name = NET_IPV4_VS_CACHE_BYPASS,
1547 .procname = "cache_bypass",
1548 .data = &sysctl_ip_vs_cache_bypass,
1549 .maxlen = sizeof(int),
1550 .mode = 0644,
1551 .proc_handler = &proc_dointvec,
1552 },
1553 {
1554 .ctl_name = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1555 .procname = "expire_nodest_conn",
1556 .data = &sysctl_ip_vs_expire_nodest_conn,
1557 .maxlen = sizeof(int),
1558 .mode = 0644,
1559 .proc_handler = &proc_dointvec,
1560 },
1561 {
1562 .ctl_name = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1563 .procname = "expire_quiescent_template",
1564 .data = &sysctl_ip_vs_expire_quiescent_template,
1565 .maxlen = sizeof(int),
1566 .mode = 0644,
1567 .proc_handler = &proc_dointvec,
1568 },
1569 {
1570 .ctl_name = NET_IPV4_VS_SYNC_THRESHOLD,
1571 .procname = "sync_threshold",
1572 .data = &sysctl_ip_vs_sync_threshold,
1573 .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
1574 .mode = 0644,
1575 .proc_handler = &proc_do_sync_threshold,
1576 },
1577 {
1578 .ctl_name = NET_IPV4_VS_NAT_ICMP_SEND,
1579 .procname = "nat_icmp_send",
1580 .data = &sysctl_ip_vs_nat_icmp_send,
1581 .maxlen = sizeof(int),
1582 .mode = 0644,
1583 .proc_handler = &proc_dointvec,
1584 },
1585 { .ctl_name = 0 }
1586};
1587
1588static ctl_table vs_table[] = {
1589 {
1590 .ctl_name = NET_IPV4_VS,
1591 .procname = "vs",
1592 .mode = 0555,
1593 .child = vs_vars
1594 },
1595 { .ctl_name = 0 }
1596};
1597
1598static ctl_table ipv4_table[] = {
1599 {
1600 .ctl_name = NET_IPV4,
1601 .procname = "ipv4",
1602 .mode = 0555,
1603 .child = vs_table,
1604 },
1605 { .ctl_name = 0 }
1606};
1607
1608static ctl_table vs_root_table[] = {
1609 {
1610 .ctl_name = CTL_NET,
1611 .procname = "net",
1612 .mode = 0555,
1613 .child = ipv4_table,
1614 },
1615 { .ctl_name = 0 }
1616};
1617
1618static struct ctl_table_header * sysctl_header;
1619
1620#ifdef CONFIG_PROC_FS
1621
1622struct ip_vs_iter {
1623 struct list_head *table;
1624 int bucket;
1625};
1626
1627/*
1628 * Write the contents of the VS rule table to a PROCfs file.
1629 * (It is kept just for backward compatibility)
1630 */
1631static inline const char *ip_vs_fwd_name(unsigned flags)
1632{
1633 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1634 case IP_VS_CONN_F_LOCALNODE:
1635 return "Local";
1636 case IP_VS_CONN_F_TUNNEL:
1637 return "Tunnel";
1638 case IP_VS_CONN_F_DROUTE:
1639 return "Route";
1640 default:
1641 return "Masq";
1642 }
1643}
1644
1645
1646/* Get the Nth entry in the two lists */
1647static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1648{
1649 struct ip_vs_iter *iter = seq->private;
1650 int idx;
1651 struct ip_vs_service *svc;
1652
1653 /* look in hash by protocol */
1654 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1655 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1656 if (pos-- == 0){
1657 iter->table = ip_vs_svc_table;
1658 iter->bucket = idx;
1659 return svc;
1660 }
1661 }
1662 }
1663
1664 /* keep looking in fwmark */
1665 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1666 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1667 if (pos-- == 0) {
1668 iter->table = ip_vs_svc_fwm_table;
1669 iter->bucket = idx;
1670 return svc;
1671 }
1672 }
1673 }
1674
1675 return NULL;
1676}
1677
1678static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1679{
1680
1681 read_lock_bh(&__ip_vs_svc_lock);
1682 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1683}
1684
1685
1686static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1687{
1688 struct list_head *e;
1689 struct ip_vs_iter *iter;
1690 struct ip_vs_service *svc;
1691
1692 ++*pos;
1693 if (v == SEQ_START_TOKEN)
1694 return ip_vs_info_array(seq,0);
1695
1696 svc = v;
1697 iter = seq->private;
1698
1699 if (iter->table == ip_vs_svc_table) {
1700 /* next service in table hashed by protocol */
1701 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1702 return list_entry(e, struct ip_vs_service, s_list);
1703
1704
1705 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1706 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1707 s_list) {
1708 return svc;
1709 }
1710 }
1711
1712 iter->table = ip_vs_svc_fwm_table;
1713 iter->bucket = -1;
1714 goto scan_fwmark;
1715 }
1716
1717 /* next service in hashed by fwmark */
1718 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1719 return list_entry(e, struct ip_vs_service, f_list);
1720
1721 scan_fwmark:
1722 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1723 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1724 f_list)
1725 return svc;
1726 }
1727
1728 return NULL;
1729}
1730
1731static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1732{
1733 read_unlock_bh(&__ip_vs_svc_lock);
1734}
1735
1736
1737static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1738{
1739 if (v == SEQ_START_TOKEN) {
1740 seq_printf(seq,
1741 "IP Virtual Server version %d.%d.%d (size=%d)\n",
1742 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1743 seq_puts(seq,
1744 "Prot LocalAddress:Port Scheduler Flags\n");
1745 seq_puts(seq,
1746 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1747 } else {
1748 const struct ip_vs_service *svc = v;
1749 const struct ip_vs_iter *iter = seq->private;
1750 const struct ip_vs_dest *dest;
1751
1752 if (iter->table == ip_vs_svc_table)
1753 seq_printf(seq, "%s %08X:%04X %s ",
1754 ip_vs_proto_name(svc->protocol),
1755 ntohl(svc->addr),
1756 ntohs(svc->port),
1757 svc->scheduler->name);
1758 else
1759 seq_printf(seq, "FWM %08X %s ",
1760 svc->fwmark, svc->scheduler->name);
1761
1762 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1763 seq_printf(seq, "persistent %d %08X\n",
1764 svc->timeout,
1765 ntohl(svc->netmask));
1766 else
1767 seq_putc(seq, '\n');
1768
1769 list_for_each_entry(dest, &svc->destinations, n_list) {
1770 seq_printf(seq,
1771 " -> %08X:%04X %-7s %-6d %-10d %-10d\n",
1772 ntohl(dest->addr), ntohs(dest->port),
1773 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1774 atomic_read(&dest->weight),
1775 atomic_read(&dest->activeconns),
1776 atomic_read(&dest->inactconns));
1777 }
1778 }
1779 return 0;
1780}
1781
1782static struct seq_operations ip_vs_info_seq_ops = {
1783 .start = ip_vs_info_seq_start,
1784 .next = ip_vs_info_seq_next,
1785 .stop = ip_vs_info_seq_stop,
1786 .show = ip_vs_info_seq_show,
1787};
1788
1789static int ip_vs_info_open(struct inode *inode, struct file *file)
1790{
1791 struct seq_file *seq;
1792 int rc = -ENOMEM;
1793 struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1794
1795 if (!s)
1796 goto out;
1797
1798 rc = seq_open(file, &ip_vs_info_seq_ops);
1799 if (rc)
1800 goto out_kfree;
1801
1802 seq = file->private_data;
1803 seq->private = s;
1804 memset(s, 0, sizeof(*s));
1805out:
1806 return rc;
1807out_kfree:
1808 kfree(s);
1809 goto out;
1810}
1811
1812static struct file_operations ip_vs_info_fops = {
1813 .owner = THIS_MODULE,
1814 .open = ip_vs_info_open,
1815 .read = seq_read,
1816 .llseek = seq_lseek,
1817 .release = seq_release_private,
1818};
1819
1820#endif
1821
1822struct ip_vs_stats ip_vs_stats;
1823
1824#ifdef CONFIG_PROC_FS
1825static int ip_vs_stats_show(struct seq_file *seq, void *v)
1826{
1827
1828/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1829 seq_puts(seq,
1830 " Total Incoming Outgoing Incoming Outgoing\n");
1831 seq_printf(seq,
1832 " Conns Packets Packets Bytes Bytes\n");
1833
1834 spin_lock_bh(&ip_vs_stats.lock);
1835 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1836 ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1837 (unsigned long long) ip_vs_stats.inbytes,
1838 (unsigned long long) ip_vs_stats.outbytes);
1839
1840/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1841 seq_puts(seq,
1842 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1843 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1844 ip_vs_stats.cps,
1845 ip_vs_stats.inpps,
1846 ip_vs_stats.outpps,
1847 ip_vs_stats.inbps,
1848 ip_vs_stats.outbps);
1849 spin_unlock_bh(&ip_vs_stats.lock);
1850
1851 return 0;
1852}
1853
1854static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1855{
1856 return single_open(file, ip_vs_stats_show, NULL);
1857}
1858
1859static struct file_operations ip_vs_stats_fops = {
1860 .owner = THIS_MODULE,
1861 .open = ip_vs_stats_seq_open,
1862 .read = seq_read,
1863 .llseek = seq_lseek,
1864 .release = single_release,
1865};
1866
1867#endif
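/*
 * A minimal sketch of a user-space reader for the /proc/net/ip_vs_stats file
 * created in ip_vs_control_init() below.  Note that ip_vs_stats_show() prints
 * the counters and rates in hexadecimal (%8X / %16LX), so a parser must use
 * %x / %llx rather than decimal conversions.
 */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/ip_vs_stats", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}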
1868
1869/*
1870 * Set timeout values for tcp tcpfin udp in the timeout_table.
1871 */
1872static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1873{
1874 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1875 u->tcp_timeout,
1876 u->tcp_fin_timeout,
1877 u->udp_timeout);
1878
1879#ifdef CONFIG_IP_VS_PROTO_TCP
1880 if (u->tcp_timeout) {
1881 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1882 = u->tcp_timeout * HZ;
1883 }
1884
1885 if (u->tcp_fin_timeout) {
1886 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1887 = u->tcp_fin_timeout * HZ;
1888 }
1889#endif
1890
1891#ifdef CONFIG_IP_VS_PROTO_UDP
1892 if (u->udp_timeout) {
1893 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1894 = u->udp_timeout * HZ;
1895 }
1896#endif
1897 return 0;
1898}
1899
1900
1901#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
1902#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
1903#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
1904 sizeof(struct ip_vs_dest_user))
1905#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
1906#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
1907#define MAX_ARG_LEN SVCDEST_ARG_LEN
1908
1909static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1910 [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
1911 [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
1912 [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
1913 [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
1914 [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
1915 [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
1916 [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
1917 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
1918 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
1919 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
1920 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
1921};
1922
1923static int
1924do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1925{
1926 int ret;
1927 unsigned char arg[MAX_ARG_LEN];
1928 struct ip_vs_service_user *usvc;
1929 struct ip_vs_service *svc;
1930 struct ip_vs_dest_user *udest;
1931
1932 if (!capable(CAP_NET_ADMIN))
1933 return -EPERM;
1934
1935 if (len != set_arglen[SET_CMDID(cmd)]) {
1936 IP_VS_ERR("set_ctl: len %u != %u\n",
1937 len, set_arglen[SET_CMDID(cmd)]);
1938 return -EINVAL;
1939 }
1940
1941 if (copy_from_user(arg, user, len) != 0)
1942 return -EFAULT;
1943
1944 /* increase the module use count */
1945 ip_vs_use_count_inc();
1946
1947 if (down_interruptible(&__ip_vs_mutex)) {
1948 ret = -ERESTARTSYS;
1949 goto out_dec;
1950 }
1951
1952 if (cmd == IP_VS_SO_SET_FLUSH) {
1953 /* Flush the virtual service */
1954 ret = ip_vs_flush();
1955 goto out_unlock;
1956 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1957 /* Set timeout values for (tcp tcpfin udp) */
1958 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1959 goto out_unlock;
1960 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1961 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1962 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1963 goto out_unlock;
1964 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1965 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1966 ret = stop_sync_thread(dm->state);
1967 goto out_unlock;
1968 }
1969
1970 usvc = (struct ip_vs_service_user *)arg;
1971 udest = (struct ip_vs_dest_user *)(usvc + 1);
1972
1973 if (cmd == IP_VS_SO_SET_ZERO) {
1974 /* if no service address is set, zero counters in all */
1975 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1976 ret = ip_vs_zero_all();
1977 goto out_unlock;
1978 }
1979 }
1980
1981 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1982 if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1983 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1984 usvc->protocol, NIPQUAD(usvc->addr),
1985 ntohs(usvc->port), usvc->sched_name);
1986 ret = -EFAULT;
1987 goto out_unlock;
1988 }
1989
1990 /* Lookup the exact service by <protocol, addr, port> or fwmark */
1991 if (usvc->fwmark == 0)
1992 svc = __ip_vs_service_get(usvc->protocol,
1993 usvc->addr, usvc->port);
1994 else
1995 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
1996
1997 if (cmd != IP_VS_SO_SET_ADD
1998 && (svc == NULL || svc->protocol != usvc->protocol)) {
1999 ret = -ESRCH;
2000 goto out_unlock;
2001 }
2002
2003 switch (cmd) {
2004 case IP_VS_SO_SET_ADD:
2005 if (svc != NULL)
2006 ret = -EEXIST;
2007 else
2008 ret = ip_vs_add_service(usvc, &svc);
2009 break;
2010 case IP_VS_SO_SET_EDIT:
2011 ret = ip_vs_edit_service(svc, usvc);
2012 break;
2013 case IP_VS_SO_SET_DEL:
2014 ret = ip_vs_del_service(svc);
2015 if (!ret)
2016 goto out_unlock;
2017 break;
2018 case IP_VS_SO_SET_ZERO:
2019 ret = ip_vs_zero_service(svc);
2020 break;
2021 case IP_VS_SO_SET_ADDDEST:
2022 ret = ip_vs_add_dest(svc, udest);
2023 break;
2024 case IP_VS_SO_SET_EDITDEST:
2025 ret = ip_vs_edit_dest(svc, udest);
2026 break;
2027 case IP_VS_SO_SET_DELDEST:
2028 ret = ip_vs_del_dest(svc, udest);
2029 break;
2030 default:
2031 ret = -EINVAL;
2032 }
2033
2034 if (svc)
2035 ip_vs_service_put(svc);
2036
2037 out_unlock:
2038 up(&__ip_vs_mutex);
2039 out_dec:
2040 /* decrease the module use count */
2041 ip_vs_use_count_dec();
2042
2043 return ret;
2044}
2045
2046
2047static void
2048ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2049{
2050 spin_lock_bh(&src->lock);
2051 memcpy(dst, src, (char*)&src->lock - (char*)src);
2052 spin_unlock_bh(&src->lock);
2053}
2054
2055static void
2056ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2057{
2058 dst->protocol = src->protocol;
2059 dst->addr = src->addr;
2060 dst->port = src->port;
2061 dst->fwmark = src->fwmark;
2062 strcpy(dst->sched_name, src->scheduler->name);
2063 dst->flags = src->flags;
2064 dst->timeout = src->timeout / HZ;
2065 dst->netmask = src->netmask;
2066 dst->num_dests = src->num_dests;
2067 ip_vs_copy_stats(&dst->stats, &src->stats);
2068}
2069
2070static inline int
2071__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2072 struct ip_vs_get_services __user *uptr)
2073{
2074 int idx, count=0;
2075 struct ip_vs_service *svc;
2076 struct ip_vs_service_entry entry;
2077 int ret = 0;
2078
2079 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2080 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2081 if (count >= get->num_services)
2082 goto out;
2083 ip_vs_copy_service(&entry, svc);
2084 if (copy_to_user(&uptr->entrytable[count],
2085 &entry, sizeof(entry))) {
2086 ret = -EFAULT;
2087 goto out;
2088 }
2089 count++;
2090 }
2091 }
2092
2093 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2094 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2095 if (count >= get->num_services)
2096 goto out;
2097 ip_vs_copy_service(&entry, svc);
2098 if (copy_to_user(&uptr->entrytable[count],
2099 &entry, sizeof(entry))) {
2100 ret = -EFAULT;
2101 goto out;
2102 }
2103 count++;
2104 }
2105 }
2106 out:
2107 return ret;
2108}
2109
2110static inline int
2111__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2112 struct ip_vs_get_dests __user *uptr)
2113{
2114 struct ip_vs_service *svc;
2115 int ret = 0;
2116
2117 if (get->fwmark)
2118 svc = __ip_vs_svc_fwm_get(get->fwmark);
2119 else
2120 svc = __ip_vs_service_get(get->protocol,
2121 get->addr, get->port);
2122 if (svc) {
2123 int count = 0;
2124 struct ip_vs_dest *dest;
2125 struct ip_vs_dest_entry entry;
2126
2127 list_for_each_entry(dest, &svc->destinations, n_list) {
2128 if (count >= get->num_dests)
2129 break;
2130
2131 entry.addr = dest->addr;
2132 entry.port = dest->port;
2133 entry.conn_flags = atomic_read(&dest->conn_flags);
2134 entry.weight = atomic_read(&dest->weight);
2135 entry.u_threshold = dest->u_threshold;
2136 entry.l_threshold = dest->l_threshold;
2137 entry.activeconns = atomic_read(&dest->activeconns);
2138 entry.inactconns = atomic_read(&dest->inactconns);
2139 entry.persistconns = atomic_read(&dest->persistconns);
2140 ip_vs_copy_stats(&entry.stats, &dest->stats);
2141 if (copy_to_user(&uptr->entrytable[count],
2142 &entry, sizeof(entry))) {
2143 ret = -EFAULT;
2144 break;
2145 }
2146 count++;
2147 }
2148 ip_vs_service_put(svc);
2149 } else
2150 ret = -ESRCH;
2151 return ret;
2152}
2153
2154static inline void
2155__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2156{
2157#ifdef CONFIG_IP_VS_PROTO_TCP
2158 u->tcp_timeout =
2159 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2160 u->tcp_fin_timeout =
2161 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2162#endif
2163#ifdef CONFIG_IP_VS_PROTO_UDP
2164 u->udp_timeout =
2165 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2166#endif
2167}
2168
2169
2170#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
2171#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
2172#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
2173#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
2174#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
2175#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
2176#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
2177
2178static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2179 [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2180 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2181 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2182 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2183 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2184 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2185 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
2186};
2187
2188static int
2189do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2190{
2191 unsigned char arg[128];
2192 int ret = 0;
2193
2194 if (!capable(CAP_NET_ADMIN))
2195 return -EPERM;
2196
2197 if (*len < get_arglen[GET_CMDID(cmd)]) {
2198 IP_VS_ERR("get_ctl: len %u < %u\n",
2199 *len, get_arglen[GET_CMDID(cmd)]);
2200 return -EINVAL;
2201 }
2202
2203 if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2204 return -EFAULT;
2205
2206 if (down_interruptible(&__ip_vs_mutex))
2207 return -ERESTARTSYS;
2208
2209 switch (cmd) {
2210 case IP_VS_SO_GET_VERSION:
2211 {
2212 char buf[64];
2213
2214 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2215 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2216 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2217 ret = -EFAULT;
2218 goto out;
2219 }
2220 *len = strlen(buf)+1;
2221 }
2222 break;
2223
2224 case IP_VS_SO_GET_INFO:
2225 {
2226 struct ip_vs_getinfo info;
2227 info.version = IP_VS_VERSION_CODE;
2228 info.size = IP_VS_CONN_TAB_SIZE;
2229 info.num_services = ip_vs_num_services;
2230 if (copy_to_user(user, &info, sizeof(info)) != 0)
2231 ret = -EFAULT;
2232 }
2233 break;
2234
2235 case IP_VS_SO_GET_SERVICES:
2236 {
2237 struct ip_vs_get_services *get;
2238 int size;
2239
2240 get = (struct ip_vs_get_services *)arg;
2241 size = sizeof(*get) +
2242 sizeof(struct ip_vs_service_entry) * get->num_services;
2243 if (*len != size) {
2244 IP_VS_ERR("length: %u != %u\n", *len, size);
2245 ret = -EINVAL;
2246 goto out;
2247 }
2248 ret = __ip_vs_get_service_entries(get, user);
2249 }
2250 break;
2251
2252 case IP_VS_SO_GET_SERVICE:
2253 {
2254 struct ip_vs_service_entry *entry;
2255 struct ip_vs_service *svc;
2256
2257 entry = (struct ip_vs_service_entry *)arg;
2258 if (entry->fwmark)
2259 svc = __ip_vs_svc_fwm_get(entry->fwmark);
2260 else
2261 svc = __ip_vs_service_get(entry->protocol,
2262 entry->addr, entry->port);
2263 if (svc) {
2264 ip_vs_copy_service(entry, svc);
2265 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2266 ret = -EFAULT;
2267 ip_vs_service_put(svc);
2268 } else
2269 ret = -ESRCH;
2270 }
2271 break;
2272
2273 case IP_VS_SO_GET_DESTS:
2274 {
2275 struct ip_vs_get_dests *get;
2276 int size;
2277
2278 get = (struct ip_vs_get_dests *)arg;
2279 size = sizeof(*get) +
2280 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2281 if (*len != size) {
2282 IP_VS_ERR("length: %u != %u\n", *len, size);
2283 ret = -EINVAL;
2284 goto out;
2285 }
2286 ret = __ip_vs_get_dest_entries(get, user);
2287 }
2288 break;
2289
2290 case IP_VS_SO_GET_TIMEOUT:
2291 {
2292 struct ip_vs_timeout_user t;
2293
2294 __ip_vs_get_timeouts(&t);
2295 if (copy_to_user(user, &t, sizeof(t)) != 0)
2296 ret = -EFAULT;
2297 }
2298 break;
2299
2300 case IP_VS_SO_GET_DAEMON:
2301 {
2302 struct ip_vs_daemon_user d[2];
2303
2304 memset(&d, 0, sizeof(d));
2305 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2306 d[0].state = IP_VS_STATE_MASTER;
2307 strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn);
2308 d[0].syncid = ip_vs_master_syncid;
2309 }
2310 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2311 d[1].state = IP_VS_STATE_BACKUP;
2312 strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn);
2313 d[1].syncid = ip_vs_backup_syncid;
2314 }
2315 if (copy_to_user(user, &d, sizeof(d)) != 0)
2316 ret = -EFAULT;
2317 }
2318 break;
2319
2320 default:
2321 ret = -EINVAL;
2322 }
2323
2324 out:
2325 up(&__ip_vs_mutex);
2326 return ret;
2327}
2328
2329
2330static struct nf_sockopt_ops ip_vs_sockopts = {
2331 .pf = PF_INET,
2332 .set_optmin = IP_VS_BASE_CTL,
2333 .set_optmax = IP_VS_SO_SET_MAX+1,
2334 .set = do_ip_vs_set_ctl,
2335 .get_optmin = IP_VS_BASE_CTL,
2336 .get_optmax = IP_VS_SO_GET_MAX+1,
2337 .get = do_ip_vs_get_ctl,
2338};
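/*
 * A hedged user-space sketch of how a management tool reaches these hooks
 * through plain {get,set}sockopt() calls at the IP level.  The "ip_vs.h"
 * include stands for a user-space copy of the IPVS definitions (the kernel
 * header is not normally exported), and the raw socket mirrors what tools
 * like ipvsadm typically open; both are assumptions.  CAP_NET_ADMIN is
 * required, as enforced in do_ip_vs_set_ctl()/do_ip_vs_get_ctl() above.
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include "ip_vs.h"		/* assumed user-space IPVS definitions */

int main(void)
{
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
	char buf[64];
	socklen_t len = sizeof(buf);
	struct ip_vs_timeout_user to;

	if (fd < 0)
		return 1;

	/* returns the same banner that do_ip_vs_get_ctl() formats */
	if (getsockopt(fd, IPPROTO_IP, IP_VS_SO_GET_VERSION, buf, &len) == 0)
		printf("%s\n", buf);

	/* zero fields are left unchanged by ip_vs_set_timeout() */
	memset(&to, 0, sizeof(to));
	to.tcp_timeout = 900;		/* seconds */
	to.tcp_fin_timeout = 120;
	to.udp_timeout = 300;
	if (setsockopt(fd, IPPROTO_IP, IP_VS_SO_SET_TIMEOUT, &to, sizeof(to)))
		perror("IP_VS_SO_SET_TIMEOUT");
	return 0;
}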
2339
2340
2341int ip_vs_control_init(void)
2342{
2343 int ret;
2344 int idx;
2345
2346 EnterFunction(2);
2347
2348 ret = nf_register_sockopt(&ip_vs_sockopts);
2349 if (ret) {
2350 IP_VS_ERR("cannot register sockopt.\n");
2351 return ret;
2352 }
2353
2354 proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2355 proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2356
2357 sysctl_header = register_sysctl_table(vs_root_table, 0);
2358
2359 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2360 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2361 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2362 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2363 }
2364 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2365 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2366 }
2367
2368 memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2369 spin_lock_init(&ip_vs_stats.lock);
2370 ip_vs_new_estimator(&ip_vs_stats);
2371
2372 /* Hook the defense timer */
2373 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2374
2375 LeaveFunction(2);
2376 return 0;
2377}
2378
2379
2380void ip_vs_control_cleanup(void)
2381{
2382 EnterFunction(2);
2383 ip_vs_trash_cleanup();
2384 cancel_rearming_delayed_work(&defense_work);
2385 ip_vs_kill_estimator(&ip_vs_stats);
2386 unregister_sysctl_table(sysctl_header);
2387 proc_net_remove("ip_vs_stats");
2388 proc_net_remove("ip_vs");
2389 nf_unregister_sockopt(&ip_vs_sockopts);
2390 LeaveFunction(2);
2391}
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
new file mode 100644
index 000000000000..f3bc320dce93
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_dh.c
@@ -0,0 +1,258 @@
1/*
2 * IPVS: Destination Hashing scheduling module
3 *
4 * Version: $Id: ip_vs_dh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@gnuchina.org>
7 *
8 * Inspired by the consistent hashing scheduler patch from
9 * Thomas Proell <proellt@gmx.de>
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version.
15 *
16 * Changes:
17 *
18 */
19
20/*
21 * The dh algorithm selects a server by the hash key of the destination IP
22 * address. The pseudo code is as follows:
23 *
24 * n <- servernode[dest_ip];
25 * if (n is dead) OR
26 * (n is overloaded) OR (n.weight <= 0) then
27 * return NULL;
28 *
29 * return n;
30 *
31 * Note that servernode is a 256-bucket hash table that maps the hash
32 * index derived from the packet destination IP address to the current server
33 * array. If the dh scheduler is used in a cache cluster, it is good to
34 * combine it with the cache_bypass feature. When the statically assigned
35 * server is dead or overloaded, the load balancer can bypass the cache
36 * server and send requests to the original server directly.
37 *
38 */
39
40#include <linux/module.h>
41#include <linux/kernel.h>
42
43#include <net/ip_vs.h>
44
45
46/*
47 * IPVS DH bucket
48 */
49struct ip_vs_dh_bucket {
50 struct ip_vs_dest *dest; /* real server (cache) */
51};
52
53/*
54 * for IPVS DH entry hash table
55 */
56#ifndef CONFIG_IP_VS_DH_TAB_BITS
57#define CONFIG_IP_VS_DH_TAB_BITS 8
58#endif
59#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS
60#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS)
61#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1)
62
63
64/*
65 * Returns hash value for IPVS DH entry
66 */
67static inline unsigned ip_vs_dh_hashkey(__u32 addr)
68{
69 return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK;
70}
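/*
 * A small user-space sketch of the multiplicative hash above: 2654435761 is
 * the 32-bit golden-ratio constant, and masking with IP_VS_DH_TAB_MASK keeps
 * the low 8 bits as the bucket index.  The sample address is arbitrary.
 */
#include <stdio.h>
#include <arpa/inet.h>

#define DH_TAB_MASK 0xff		/* 256 buckets, as IP_VS_DH_TAB_SIZE */

static unsigned dh_hashkey(unsigned int addr_be)
{
	/* same computation as ip_vs_dh_hashkey() */
	return (ntohl(addr_be) * 2654435761UL) & DH_TAB_MASK;
}

int main(void)
{
	struct in_addr a;

	if (inet_aton("192.168.0.1", &a) == 0)
		return 1;
	printf("bucket = %u\n", dh_hashkey(a.s_addr));
	return 0;
}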
71
72
73/*
74 * Get ip_vs_dest associated with supplied parameters.
75 */
76static inline struct ip_vs_dest *
77ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __u32 addr)
78{
79 return (tbl[ip_vs_dh_hashkey(addr)]).dest;
80}
81
82
83/*
84 * Assign all the hash buckets of the specified table with the service.
85 */
86static int
87ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
88{
89 int i;
90 struct ip_vs_dh_bucket *b;
91 struct list_head *p;
92 struct ip_vs_dest *dest;
93
94 b = tbl;
95 p = &svc->destinations;
96 for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
97 if (list_empty(p)) {
98 b->dest = NULL;
99 } else {
100 if (p == &svc->destinations)
101 p = p->next;
102
103 dest = list_entry(p, struct ip_vs_dest, n_list);
104 atomic_inc(&dest->refcnt);
105 b->dest = dest;
106
107 p = p->next;
108 }
109 b++;
110 }
111 return 0;
112}
113
114
115/*
116 * Flush all the hash buckets of the specified table.
117 */
118static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
119{
120 int i;
121 struct ip_vs_dh_bucket *b;
122
123 b = tbl;
124 for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
125 if (b->dest) {
126 atomic_dec(&b->dest->refcnt);
127 b->dest = NULL;
128 }
129 b++;
130 }
131}
132
133
134static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
135{
136 struct ip_vs_dh_bucket *tbl;
137
138 /* allocate the DH table for this service */
139 tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
140 GFP_ATOMIC);
141 if (tbl == NULL) {
142 IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n");
143 return -ENOMEM;
144 }
145 svc->sched_data = tbl;
146 IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
147 "current service\n",
148 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
149
150 /* assign the hash buckets with the updated service */
151 ip_vs_dh_assign(tbl, svc);
152
153 return 0;
154}
155
156
157static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
158{
159 struct ip_vs_dh_bucket *tbl = svc->sched_data;
160
161 /* got to clean up hash buckets here */
162 ip_vs_dh_flush(tbl);
163
164 /* release the table itself */
165 kfree(svc->sched_data);
166 IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
167 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
168
169 return 0;
170}
171
172
173static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
174{
175 struct ip_vs_dh_bucket *tbl = svc->sched_data;
176
177 /* got to clean up hash buckets here */
178 ip_vs_dh_flush(tbl);
179
180 /* assign the hash buckets with the updated service */
181 ip_vs_dh_assign(tbl, svc);
182
183 return 0;
184}
185
186
187/*
188 * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
189 * consider that the server is overloaded here.
190 */
191static inline int is_overloaded(struct ip_vs_dest *dest)
192{
193 return dest->flags & IP_VS_DEST_F_OVERLOAD;
194}
195
196
197/*
198 * Destination hashing scheduling
199 */
200static struct ip_vs_dest *
201ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
202{
203 struct ip_vs_dest *dest;
204 struct ip_vs_dh_bucket *tbl;
205 struct iphdr *iph = skb->nh.iph;
206
207 IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
208
209 tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
210 dest = ip_vs_dh_get(tbl, iph->daddr);
211 if (!dest
212 || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
213 || atomic_read(&dest->weight) <= 0
214 || is_overloaded(dest)) {
215 return NULL;
216 }
217
218 IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
219 "--> server %u.%u.%u.%u:%d\n",
220 NIPQUAD(iph->daddr),
221 NIPQUAD(dest->addr),
222 ntohs(dest->port));
223
224 return dest;
225}
226
227
228/*
229 * IPVS DH Scheduler structure
230 */
231static struct ip_vs_scheduler ip_vs_dh_scheduler =
232{
233 .name = "dh",
234 .refcnt = ATOMIC_INIT(0),
235 .module = THIS_MODULE,
236 .init_service = ip_vs_dh_init_svc,
237 .done_service = ip_vs_dh_done_svc,
238 .update_service = ip_vs_dh_update_svc,
239 .schedule = ip_vs_dh_schedule,
240};
241
242
243static int __init ip_vs_dh_init(void)
244{
245 INIT_LIST_HEAD(&ip_vs_dh_scheduler.n_list);
246 return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
247}
248
249
250static void __exit ip_vs_dh_cleanup(void)
251{
252 unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
253}
254
255
256module_init(ip_vs_dh_init);
257module_exit(ip_vs_dh_cleanup);
258MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
new file mode 100644
index 000000000000..67b3e2fc1fa1
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_est.c
@@ -0,0 +1,200 @@
1/*
2 * ip_vs_est.c: simple rate estimator for IPVS
3 *
4 * Version: $Id: ip_vs_est.c,v 1.4 2002/11/30 01:50:35 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 *
15 */
16#include <linux/kernel.h>
17#include <linux/types.h>
18
19#include <net/ip_vs.h>
20
21/*
22  This code estimates the rate over a short interval (such as 8
23  seconds) for virtual services and real servers. To measure the rate over
24  a long interval, it is easy to implement a user-level daemon which
25  periodically reads those statistical counters and computes the rate.
26
27  Currently, the measurement is activated by a slow timer handler. Hopefully
28  this measurement will not introduce too much load.
29
30 We measure rate during the last 8 seconds every 2 seconds:
31
32 avgrate = avgrate*(1-W) + rate*W
33
34 where W = 2^(-2)
35
36 NOTES.
37
38  * The stored value for average bps is scaled by 2^5, so that the maximal
39    rate is ~2.15Gbits/s; average pps and cps are scaled by 2^10.
40
41  * A lot of code is taken from net/sched/estimator.c
42 */
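/*
 * A minimal user-space sketch (not kernel code) of the fixed-point form of
 * the formula above: avg += (rate - avg) * W with W = 2^-2, counters scaled
 * by 2^10.  The <<9 (rather than <<10) folds in the division by the 2-second
 * sampling interval, and the +0x1FF before >>10 rounds to the nearest unit,
 * mirroring the cps update in estimation_timer() below.  The sample counter
 * values are made up for illustration.
 */
#include <stdio.h>

int main(void)
{
	unsigned int samples[] = { 0, 200, 400, 600, 820, 1040 };
	unsigned int last_conns = 0, cps_scaled = 0;
	unsigned int i;

	for (i = 1; i < sizeof(samples) / sizeof(samples[0]); i++) {
		/* scaled by 2^10, but divided by 2 seconds */
		unsigned int rate = (samples[i] - last_conns) << 9;

		last_conns = samples[i];
		cps_scaled += ((long)rate - (long)cps_scaled) >> 2;
		printf("sample %u: ~%u conns/s\n", i, (cps_scaled + 0x1FF) >> 10);
	}
	return 0;
}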
43
44
45struct ip_vs_estimator
46{
47 struct ip_vs_estimator *next;
48 struct ip_vs_stats *stats;
49
50 u32 last_conns;
51 u32 last_inpkts;
52 u32 last_outpkts;
53 u64 last_inbytes;
54 u64 last_outbytes;
55
56 u32 cps;
57 u32 inpps;
58 u32 outpps;
59 u32 inbps;
60 u32 outbps;
61};
62
63
64static struct ip_vs_estimator *est_list = NULL;
65static DEFINE_RWLOCK(est_lock);
66static struct timer_list est_timer;
67
68static void estimation_timer(unsigned long arg)
69{
70 struct ip_vs_estimator *e;
71 struct ip_vs_stats *s;
72 u32 n_conns;
73 u32 n_inpkts, n_outpkts;
74 u64 n_inbytes, n_outbytes;
75 u32 rate;
76
77 read_lock(&est_lock);
78 for (e = est_list; e; e = e->next) {
79 s = e->stats;
80
81 spin_lock(&s->lock);
82 n_conns = s->conns;
83 n_inpkts = s->inpkts;
84 n_outpkts = s->outpkts;
85 n_inbytes = s->inbytes;
86 n_outbytes = s->outbytes;
87
88 /* scaled by 2^10, but divided 2 seconds */
89 rate = (n_conns - e->last_conns)<<9;
90 e->last_conns = n_conns;
91 e->cps += ((long)rate - (long)e->cps)>>2;
92 s->cps = (e->cps+0x1FF)>>10;
93
94 rate = (n_inpkts - e->last_inpkts)<<9;
95 e->last_inpkts = n_inpkts;
96 e->inpps += ((long)rate - (long)e->inpps)>>2;
97 s->inpps = (e->inpps+0x1FF)>>10;
98
99 rate = (n_outpkts - e->last_outpkts)<<9;
100 e->last_outpkts = n_outpkts;
101 e->outpps += ((long)rate - (long)e->outpps)>>2;
102 s->outpps = (e->outpps+0x1FF)>>10;
103
104 rate = (n_inbytes - e->last_inbytes)<<4;
105 e->last_inbytes = n_inbytes;
106 e->inbps += ((long)rate - (long)e->inbps)>>2;
107 s->inbps = (e->inbps+0xF)>>5;
108
109 rate = (n_outbytes - e->last_outbytes)<<4;
110 e->last_outbytes = n_outbytes;
111 e->outbps += ((long)rate - (long)e->outbps)>>2;
112 s->outbps = (e->outbps+0xF)>>5;
113 spin_unlock(&s->lock);
114 }
115 read_unlock(&est_lock);
116 mod_timer(&est_timer, jiffies + 2*HZ);
117}
118
119int ip_vs_new_estimator(struct ip_vs_stats *stats)
120{
121 struct ip_vs_estimator *est;
122
123 est = kmalloc(sizeof(*est), GFP_KERNEL);
124 if (est == NULL)
125 return -ENOMEM;
126
127 memset(est, 0, sizeof(*est));
128 est->stats = stats;
129 est->last_conns = stats->conns;
130 est->cps = stats->cps<<10;
131
132 est->last_inpkts = stats->inpkts;
133 est->inpps = stats->inpps<<10;
134
135 est->last_outpkts = stats->outpkts;
136 est->outpps = stats->outpps<<10;
137
138 est->last_inbytes = stats->inbytes;
139 est->inbps = stats->inbps<<5;
140
141 est->last_outbytes = stats->outbytes;
142 est->outbps = stats->outbps<<5;
143
144 write_lock_bh(&est_lock);
145 est->next = est_list;
146 if (est->next == NULL) {
147 init_timer(&est_timer);
148 est_timer.expires = jiffies + 2*HZ;
149 est_timer.function = estimation_timer;
150 add_timer(&est_timer);
151 }
152 est_list = est;
153 write_unlock_bh(&est_lock);
154 return 0;
155}
156
157void ip_vs_kill_estimator(struct ip_vs_stats *stats)
158{
159 struct ip_vs_estimator *est, **pest;
160 int killed = 0;
161
162 write_lock_bh(&est_lock);
163 pest = &est_list;
164 while ((est=*pest) != NULL) {
165 if (est->stats != stats) {
166 pest = &est->next;
167 continue;
168 }
169 *pest = est->next;
170 kfree(est);
171 killed++;
172 }
173 if (killed && est_list == NULL)
174 del_timer_sync(&est_timer);
175 write_unlock_bh(&est_lock);
176}
177
178void ip_vs_zero_estimator(struct ip_vs_stats *stats)
179{
180 struct ip_vs_estimator *e;
181
182 write_lock_bh(&est_lock);
183 for (e = est_list; e; e = e->next) {
184 if (e->stats != stats)
185 continue;
186
187 /* set counters zero */
188 e->last_conns = 0;
189 e->last_inpkts = 0;
190 e->last_outpkts = 0;
191 e->last_inbytes = 0;
192 e->last_outbytes = 0;
193 e->cps = 0;
194 e->inpps = 0;
195 e->outpps = 0;
196 e->inbps = 0;
197 e->outbps = 0;
198 }
199 write_unlock_bh(&est_lock);
200}
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c
new file mode 100644
index 000000000000..a19a33ceb811
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_ftp.c
@@ -0,0 +1,400 @@
1/*
2 * ip_vs_ftp.c: IPVS ftp application module
3 *
4 * Version: $Id: ip_vs_ftp.c,v 1.13 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * Changes:
9 *
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version.
15 *
16 * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
17 * is that the ip_vs_ftp module handles the reverse direction of ip_masq_ftp.
18 *
19 * IP_MASQ_FTP ftp masquerading module
20 *
21 * Version: @(#)ip_masq_ftp.c 0.04 02/05/96
22 *
23 * Author: Wouter Gadeyne
24 *
25 */
26
27#include <linux/module.h>
28#include <linux/moduleparam.h>
29#include <linux/kernel.h>
30#include <linux/skbuff.h>
31#include <linux/in.h>
32#include <linux/ip.h>
33#include <net/protocol.h>
34#include <net/tcp.h>
35
36#include <net/ip_vs.h>
37
38
39#define SERVER_STRING "227 Entering Passive Mode ("
40#define CLIENT_STRING "PORT "
41
42
43/*
44 * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by the helper.
45 * The first port is set to the default port.
46 */
47static int ports[IP_VS_APP_MAX_PORTS] = {21, 0};
48module_param_array(ports, int, NULL, 0);
49
50/*
51 * Debug level
52 */
53#ifdef CONFIG_IP_VS_DEBUG
54static int debug=0;
55module_param(debug, int, 0);
56#endif
57
58
59/* Dummy variable */
60static int ip_vs_ftp_pasv;
61
62
63static int
64ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
65{
66 return 0;
67}
68
69
70static int
71ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
72{
73 return 0;
74}
75
76
77/*
78 * Get <addr,port> from a string of the form "xxx,xxx,xxx,xxx,ppp,ppp",
79 * starting with the "pattern" and terminated by the "term" character.
80 * <addr,port> is in network order.
81 */
82static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
83 const char *pattern, size_t plen, char term,
84 __u32 *addr, __u16 *port,
85 char **start, char **end)
86{
87 unsigned char p[6];
88 int i = 0;
89
90 if (data_limit - data < plen) {
91 /* check if there is partial match */
92 if (strnicmp(data, pattern, data_limit - data) == 0)
93 return -1;
94 else
95 return 0;
96 }
97
98 if (strnicmp(data, pattern, plen) != 0) {
99 return 0;
100 }
101 *start = data + plen;
102
103 for (data = *start; *data != term; data++) {
104 if (data == data_limit)
105 return -1;
106 }
107 *end = data;
108
109 memset(p, 0, sizeof(p));
110 for (data = *start; data != *end; data++) {
111 if (*data >= '0' && *data <= '9') {
112 p[i] = p[i]*10 + *data - '0';
113 } else if (*data == ',' && i < 5) {
114 i++;
115 } else {
116 /* unexpected character */
117 return -1;
118 }
119 }
120
121 if (i != 5)
122 return -1;
123
124 *addr = (p[3]<<24) | (p[2]<<16) | (p[1]<<8) | p[0];
125 *port = (p[5]<<8) | p[4];
126 return 1;
127}
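As a worked example with invented values, the payload fragment "192,168,0,10,19,137" parses into p[] = {192,168,0,10,19,137}, i.e. address 192.168.0.10 and data port 19*256 + 137 = 5001. A stand-alone sketch of the same digit/comma walk, host byte order only:

	#include <stdio.h>

	int main(void)
	{
		const char *s = "192,168,0,10,19,137";	/* invented PASV/PORT payload */
		unsigned p[6] = { 0 };
		int i = 0;

		for (; *s; s++) {
			if (*s >= '0' && *s <= '9')
				p[i] = p[i] * 10 + (*s - '0');
			else if (*s == ',' && i < 5)
				i++;
		}
		printf("addr %u.%u.%u.%u port %u\n",
		       p[0], p[1], p[2], p[3], p[4] * 256 + p[5]);
		return 0;
	}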
128
129
130/*
131 * Look at outgoing ftp packets to catch the response to a PASV command
132 * from the server (inside-to-outside).
133 * When we see one, we build a connection entry with the client address,
134 * client port 0 (unknown at the moment), the server address and the
135 * server port. Mark the current connection entry as a control channel
136 * of the new entry. All this work is just so that the data connection
137 * can be scheduled to the right server later.
138 *
139 * The outgoing packet should be something like
140 * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
141 * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
142 */
143static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
144 struct sk_buff **pskb, int *diff)
145{
146 struct iphdr *iph;
147 struct tcphdr *th;
148 char *data, *data_limit;
149 char *start, *end;
150 __u32 from;
151 __u16 port;
152 struct ip_vs_conn *n_cp;
153 char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
154 unsigned buf_len;
155 int ret;
156
157 *diff = 0;
158
159 /* Only useful for established sessions */
160 if (cp->state != IP_VS_TCP_S_ESTABLISHED)
161 return 1;
162
163 /* Linear packets are much easier to deal with. */
164 if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
165 return 0;
166
167 if (cp->app_data == &ip_vs_ftp_pasv) {
168 iph = (*pskb)->nh.iph;
169 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
170 data = (char *)th + (th->doff << 2);
171 data_limit = (*pskb)->tail;
172
173 if (ip_vs_ftp_get_addrport(data, data_limit,
174 SERVER_STRING,
175 sizeof(SERVER_STRING)-1, ')',
176 &from, &port,
177 &start, &end) != 1)
178 return 1;
179
180 IP_VS_DBG(1-debug, "PASV response (%u.%u.%u.%u:%d) -> "
181 "%u.%u.%u.%u:%d detected\n",
182 NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0);
183
184 /*
185	 * Now update or create a connection entry for it
186 */
187 n_cp = ip_vs_conn_out_get(iph->protocol, from, port,
188 cp->caddr, 0);
189 if (!n_cp) {
190 n_cp = ip_vs_conn_new(IPPROTO_TCP,
191 cp->caddr, 0,
192 cp->vaddr, port,
193 from, port,
194 IP_VS_CONN_F_NO_CPORT,
195 cp->dest);
196 if (!n_cp)
197 return 0;
198
199 /* add its controller */
200 ip_vs_control_add(n_cp, cp);
201 }
202
203 /*
204 * Replace the old passive address with the new one
205 */
206 from = n_cp->vaddr;
207 port = n_cp->vport;
208 sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from),
209 port&255, (port>>8)&255);
210 buf_len = strlen(buf);
211
212 /*
213 * Calculate required delta-offset to keep TCP happy
214 */
215 *diff = buf_len - (end-start);
216
217 if (*diff == 0) {
218 /* simply replace it with new passive address */
219 memcpy(start, buf, buf_len);
220 ret = 1;
221 } else {
222 ret = !ip_vs_skb_replace(*pskb, GFP_ATOMIC, start,
223 end-start, buf, buf_len);
224 }
225
226 cp->app_data = NULL;
227 ip_vs_tcp_conn_listen(n_cp);
228 ip_vs_conn_put(n_cp);
229 return ret;
230 }
231 return 1;
232}
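A concrete illustration of the rewrite performed above, with invented addresses: if the real server answers

	227 Entering Passive Mode (10,0,0,5,19,137)

and the virtual address of the service is 192.168.1.100, the helper substitutes the template's <vaddr,vport> so the client instead sees

	227 Entering Passive Mode (192,168,1,100,19,137)

and any change in payload length is recorded in *diff so that the TCP sequence numbers of later packets can be adjusted accordingly.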
233
234
235/*
236 * Look at incoming ftp packets to catch the PASV/PORT command
237 * (outside-to-inside).
238 *
239 * The incoming packet having the PORT command should be something like
240 * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
241 * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
242 * In this case, we create a connection entry using the client address and
243 * port, so that the active ftp data connection from the server can reach
244 * the client.
245 */
246static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
247 struct sk_buff **pskb, int *diff)
248{
249 struct iphdr *iph;
250 struct tcphdr *th;
251 char *data, *data_start, *data_limit;
252 char *start, *end;
253 __u32 to;
254 __u16 port;
255 struct ip_vs_conn *n_cp;
256
257 /* no diff required for incoming packets */
258 *diff = 0;
259
260 /* Only useful for established sessions */
261 if (cp->state != IP_VS_TCP_S_ESTABLISHED)
262 return 1;
263
264 /* Linear packets are much easier to deal with. */
265 if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
266 return 0;
267
268 /*
269	 * Detect whether the client has requested passive mode
270 */
271 iph = (*pskb)->nh.iph;
272 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
273
274	/* Since there may be options in the TCP header and doff gives
275	   the header length in 32-bit words, the payload start is
276	   correctly computed as th + doff*4 */
277 data = data_start = (char *)th + (th->doff << 2);
278 data_limit = (*pskb)->tail;
279
280 while (data <= data_limit - 6) {
281 if (strnicmp(data, "PASV\r\n", 6) == 0) {
282 /* Passive mode on */
283 IP_VS_DBG(1-debug, "got PASV at %zd of %zd\n",
284 data - data_start,
285 data_limit - data_start);
286 cp->app_data = &ip_vs_ftp_pasv;
287 return 1;
288 }
289 data++;
290 }
291
292 /*
293	 * To support a virtual FTP server, the scenario is as follows:
294 * FTP client ----> Load Balancer ----> FTP server
295 * First detect the port number in the application data,
296 * then create a new connection entry for the coming data
297 * connection.
298 */
299 if (ip_vs_ftp_get_addrport(data_start, data_limit,
300 CLIENT_STRING, sizeof(CLIENT_STRING)-1,
301 '\r', &to, &port,
302 &start, &end) != 1)
303 return 1;
304
305 IP_VS_DBG(1-debug, "PORT %u.%u.%u.%u:%d detected\n",
306 NIPQUAD(to), ntohs(port));
307
308 /* Passive mode off */
309 cp->app_data = NULL;
310
311 /*
312 * Now update or create a connection entry for it
313 */
314 IP_VS_DBG(1-debug, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
315 ip_vs_proto_name(iph->protocol),
316 NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0);
317
318 n_cp = ip_vs_conn_in_get(iph->protocol,
319 to, port,
320 cp->vaddr, htons(ntohs(cp->vport)-1));
321 if (!n_cp) {
322 n_cp = ip_vs_conn_new(IPPROTO_TCP,
323 to, port,
324 cp->vaddr, htons(ntohs(cp->vport)-1),
325 cp->daddr, htons(ntohs(cp->dport)-1),
326 0,
327 cp->dest);
328 if (!n_cp)
329 return 0;
330
331 /* add its controller */
332 ip_vs_control_add(n_cp, cp);
333 }
334
335 /*
336	 * Move the new connection to the listen state
337 */
338 ip_vs_tcp_conn_listen(n_cp);
339 ip_vs_conn_put(n_cp);
340
341 return 1;
342}
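For the active-mode case, a client line such as "PORT 192,168,0,10,19,137\r\n" (addresses invented) yields client 192.168.0.10:5001, and the template is keyed on the ftp-data ports, one below the control ports, so the active data connection made from the ftp-data port can be matched and forwarded back to the client. A tiny sketch of that port arithmetic:

	#include <stdint.h>
	#include <stdio.h>
	#include <arpa/inet.h>

	int main(void)
	{
		/* control runs on the ftp port; ftp-data is control port - 1 */
		uint16_t vport = htons(21);
		uint16_t data_port = htons(ntohs(vport) - 1);

		printf("control %d, data %d\n", ntohs(vport), ntohs(data_port));
		return 0;
	}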
343
344
345static struct ip_vs_app ip_vs_ftp = {
346 .name = "ftp",
347 .type = IP_VS_APP_TYPE_FTP,
348 .protocol = IPPROTO_TCP,
349 .module = THIS_MODULE,
350 .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list),
351 .init_conn = ip_vs_ftp_init_conn,
352 .done_conn = ip_vs_ftp_done_conn,
353 .bind_conn = NULL,
354 .unbind_conn = NULL,
355 .pkt_out = ip_vs_ftp_out,
356 .pkt_in = ip_vs_ftp_in,
357};
358
359
360/*
361 * ip_vs_ftp initialization
362 */
363static int __init ip_vs_ftp_init(void)
364{
365 int i, ret;
366 struct ip_vs_app *app = &ip_vs_ftp;
367
368 ret = register_ip_vs_app(app);
369 if (ret)
370 return ret;
371
372 for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
373 if (!ports[i])
374 continue;
375 ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
376 if (ret)
377 break;
378 IP_VS_DBG(1-debug, "%s: loaded support on port[%d] = %d\n",
379 app->name, i, ports[i]);
380 }
381
382 if (ret)
383 unregister_ip_vs_app(app);
384
385 return ret;
386}
387
388
389/*
390 * ip_vs_ftp finish.
391 */
392static void __exit ip_vs_ftp_exit(void)
393{
394 unregister_ip_vs_app(&ip_vs_ftp);
395}
396
397
398module_init(ip_vs_ftp_init);
399module_exit(ip_vs_ftp_exit);
400MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
new file mode 100644
index 000000000000..c035838b780a
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -0,0 +1,624 @@
1/*
2 * IPVS: Locality-Based Least-Connection scheduling module
3 *
4 * Version: $Id: ip_vs_lblc.c,v 1.10 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@gnuchina.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 * Martin Hamilton : fixed the terrible locking bugs
15 * *lock(tbl->lock) ==> *lock(&tbl->lock)
16 * Wensong Zhang : fixed the uninitialized tbl->lock bug
17 * Wensong Zhang : added doing full expiration check to
18 * collect stale entries of 24+ hours when
19 * no partial expire check in a half hour
20 * Julian Anastasov : replaced del_timer call with del_timer_sync
21 * to avoid the possible race between timer
22 * handler and del_timer thread in SMP
23 *
24 */
25
26/*
27 * The lblc algorithm is as follows (pseudo code):
28 *
29 * if cachenode[dest_ip] is null then
30 * n, cachenode[dest_ip] <- {weighted least-conn node};
31 * else
32 * n <- cachenode[dest_ip];
33 * if (n is dead) OR
34 * (n.conns>n.weight AND
35 * there is a node m with m.conns<m.weight/2) then
36 * n, cachenode[dest_ip] <- {weighted least-conn node};
37 *
38 * return n;
39 *
40 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
41 * me to write this module.
42 */
43
44#include <linux/module.h>
45#include <linux/kernel.h>
46
47/* for sysctl */
48#include <linux/fs.h>
49#include <linux/sysctl.h>
50
51#include <net/ip_vs.h>
52
53
54/*
55 * It is for garbage collection of stale IPVS lblc entries,
56 * when the table is full.
57 */
58#define CHECK_EXPIRE_INTERVAL (60*HZ)
59#define ENTRY_TIMEOUT (6*60*HZ)
60
61/*
62 * It is for full expiration check.
63 * When there is no partial expiration check (garbage collection)
64 * in a half hour, do a full expiration check to collect stale
65 * entries that haven't been touched for a day.
66 */
67#define COUNT_FOR_FULL_EXPIRATION 30
68static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
69
70
71/*
72 * for IPVS lblc entry hash table
73 */
74#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
75#define CONFIG_IP_VS_LBLC_TAB_BITS 10
76#endif
77#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
78#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
79#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
80
81
82/*
83 * IPVS lblc entry represents an association between destination
84 * IP address and its destination server
85 */
86struct ip_vs_lblc_entry {
87 struct list_head list;
88 __u32 addr; /* destination IP address */
89 struct ip_vs_dest *dest; /* real server (cache) */
90 unsigned long lastuse; /* last used time */
91};
92
93
94/*
95 * IPVS lblc hash table
96 */
97struct ip_vs_lblc_table {
98 rwlock_t lock; /* lock for this table */
99 struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
100 atomic_t entries; /* number of entries */
101 int max_size; /* maximum size of entries */
102 struct timer_list periodic_timer; /* collect stale entries */
103 int rover; /* rover for expire check */
104 int counter; /* counter for no expire */
105};
106
107
108/*
109 * IPVS LBLC sysctl table
110 */
111
112static ctl_table vs_vars_table[] = {
113 {
114 .ctl_name = NET_IPV4_VS_LBLC_EXPIRE,
115 .procname = "lblc_expiration",
116 .data = &sysctl_ip_vs_lblc_expiration,
117 .maxlen = sizeof(int),
118 .mode = 0644,
119 .proc_handler = &proc_dointvec_jiffies,
120 },
121 { .ctl_name = 0 }
122};
123
124static ctl_table vs_table[] = {
125 {
126 .ctl_name = NET_IPV4_VS,
127 .procname = "vs",
128 .mode = 0555,
129 .child = vs_vars_table
130 },
131 { .ctl_name = 0 }
132};
133
134static ctl_table ipv4_table[] = {
135 {
136 .ctl_name = NET_IPV4,
137 .procname = "ipv4",
138 .mode = 0555,
139 .child = vs_table
140 },
141 { .ctl_name = 0 }
142};
143
144static ctl_table lblc_root_table[] = {
145 {
146 .ctl_name = CTL_NET,
147 .procname = "net",
148 .mode = 0555,
149 .child = ipv4_table
150 },
151 { .ctl_name = 0 }
152};
153
154static struct ctl_table_header * sysctl_header;
155
156/*
157 * new/free an ip_vs_lblc_entry, which is a mapping of a destination
158 * IP address to a server.
159 */
160static inline struct ip_vs_lblc_entry *
161ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest)
162{
163 struct ip_vs_lblc_entry *en;
164
165 en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
166 if (en == NULL) {
167 IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
168 return NULL;
169 }
170
171 INIT_LIST_HEAD(&en->list);
172 en->addr = daddr;
173
174 atomic_inc(&dest->refcnt);
175 en->dest = dest;
176
177 return en;
178}
179
180
181static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
182{
183 list_del(&en->list);
184 /*
185	 * We don't kfree dest because it is referred to either by its
186	 * service or by the trash dest list.
187 */
188 atomic_dec(&en->dest->refcnt);
189 kfree(en);
190}
191
192
193/*
194 * Returns hash value for IPVS LBLC entry
195 */
196static inline unsigned ip_vs_lblc_hashkey(__u32 addr)
197{
198 return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
199}
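The multiplier 2654435761 is the usual prime close to 2^32/phi used for Knuth-style multiplicative hashing, intended to spread nearby destination addresses over the 2^10 buckets. A quick stand-alone check (table size and addresses are illustrative only):

	#include <stdint.h>
	#include <stdio.h>

	#define TAB_MASK 1023	/* 2^10 buckets, as with IP_VS_LBLC_TAB_MASK */

	static unsigned hashkey(uint32_t host_order_addr)
	{
		return (host_order_addr * 2654435761UL) & TAB_MASK;
	}

	int main(void)
	{
		/* consecutive destination addresses land in well-spread buckets */
		for (uint32_t a = 0xc0a80001; a <= 0xc0a80004; a++)	/* 192.168.0.1 .. .4 */
			printf("%#x -> bucket %u\n", a, hashkey(a));
		return 0;
	}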
200
201
202/*
203 * Hash an entry in the ip_vs_lblc_table.
204 * returns bool success.
205 */
206static int
207ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
208{
209 unsigned hash;
210
211 if (!list_empty(&en->list)) {
212 IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
213 "called from %p\n", __builtin_return_address(0));
214 return 0;
215 }
216
217 /*
218 * Hash by destination IP address
219 */
220 hash = ip_vs_lblc_hashkey(en->addr);
221
222 write_lock(&tbl->lock);
223 list_add(&en->list, &tbl->bucket[hash]);
224 atomic_inc(&tbl->entries);
225 write_unlock(&tbl->lock);
226
227 return 1;
228}
229
230
231#if 0000
232/*
233 * Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
234 * returns bool success.
235 */
236static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
237 struct ip_vs_lblc_entry *en)
238{
239 if (list_empty(&en->list)) {
240 IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
241 "called from %p\n", __builtin_return_address(0));
242 return 0;
243 }
244
245 /*
246 * Remove it from the table
247 */
248 write_lock(&tbl->lock);
249 list_del(&en->list);
250 INIT_LIST_HEAD(&en->list);
251 write_unlock(&tbl->lock);
252
253 return 1;
254}
255#endif
256
257
258/*
259 * Get ip_vs_lblc_entry associated with supplied parameters.
260 */
261static inline struct ip_vs_lblc_entry *
262ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr)
263{
264 unsigned hash;
265 struct ip_vs_lblc_entry *en;
266
267 hash = ip_vs_lblc_hashkey(addr);
268
269 read_lock(&tbl->lock);
270
271 list_for_each_entry(en, &tbl->bucket[hash], list) {
272 if (en->addr == addr) {
273 /* HIT */
274 read_unlock(&tbl->lock);
275 return en;
276 }
277 }
278
279 read_unlock(&tbl->lock);
280
281 return NULL;
282}
283
284
285/*
286 * Flush all the entries of the specified table.
287 */
288static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
289{
290 int i;
291 struct ip_vs_lblc_entry *en, *nxt;
292
293 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
294 write_lock(&tbl->lock);
295 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
296 ip_vs_lblc_free(en);
297 atomic_dec(&tbl->entries);
298 }
299 write_unlock(&tbl->lock);
300 }
301}
302
303
304static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
305{
306 unsigned long now = jiffies;
307 int i, j;
308 struct ip_vs_lblc_entry *en, *nxt;
309
310 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
311 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
312
313 write_lock(&tbl->lock);
314 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
315 if (time_before(now,
316 en->lastuse + sysctl_ip_vs_lblc_expiration))
317 continue;
318
319 ip_vs_lblc_free(en);
320 atomic_dec(&tbl->entries);
321 }
322 write_unlock(&tbl->lock);
323 }
324 tbl->rover = j;
325}
326
327
328/*
329 * Periodical timer handler for IPVS lblc table
330 * It is used to collect stale entries when the number of entries
331 * exceeds the maximum size of the table.
332 *
333 * Fixme: we probably need a more complicated algorithm to collect
334 * entries that have not been used for a long time even
335 * if the number of entries doesn't exceed the maximum size
336 * of the table.
337 * The full expiration check is for this purpose now.
338 */
339static void ip_vs_lblc_check_expire(unsigned long data)
340{
341 struct ip_vs_lblc_table *tbl;
342 unsigned long now = jiffies;
343 int goal;
344 int i, j;
345 struct ip_vs_lblc_entry *en, *nxt;
346
347 tbl = (struct ip_vs_lblc_table *)data;
348
349 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
350 /* do full expiration check */
351 ip_vs_lblc_full_check(tbl);
352 tbl->counter = 1;
353 goto out;
354 }
355
356 if (atomic_read(&tbl->entries) <= tbl->max_size) {
357 tbl->counter++;
358 goto out;
359 }
360
361 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
362 if (goal > tbl->max_size/2)
363 goal = tbl->max_size/2;
364
365 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
366 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
367
368 write_lock(&tbl->lock);
369 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
370 if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
371 continue;
372
373 ip_vs_lblc_free(en);
374 atomic_dec(&tbl->entries);
375 goal--;
376 }
377 write_unlock(&tbl->lock);
378 if (goal <= 0)
379 break;
380 }
381 tbl->rover = j;
382
383 out:
384 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
385}
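To make the collection goal above concrete with invented numbers: with max_size = 1024*16 = 16384 and 20000 entries in the table, goal = (20000 - 16384)*4/3 = 4821, which is below the max_size/2 = 8192 cap, so roughly that many stale entries are reclaimed before the rover stops.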
386
387
388static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
389{
390 int i;
391 struct ip_vs_lblc_table *tbl;
392
393 /*
394 * Allocate the ip_vs_lblc_table for this service
395 */
396 tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
397 if (tbl == NULL) {
398 IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
399 return -ENOMEM;
400 }
401 svc->sched_data = tbl;
402 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
403 "current service\n",
404 sizeof(struct ip_vs_lblc_table));
405
406 /*
407 * Initialize the hash buckets
408 */
409 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
410 INIT_LIST_HEAD(&tbl->bucket[i]);
411 }
412 rwlock_init(&tbl->lock);
413 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
414 tbl->rover = 0;
415 tbl->counter = 1;
416
417 /*
418 * Hook periodic timer for garbage collection
419 */
420 init_timer(&tbl->periodic_timer);
421 tbl->periodic_timer.data = (unsigned long)tbl;
422 tbl->periodic_timer.function = ip_vs_lblc_check_expire;
423 tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
424 add_timer(&tbl->periodic_timer);
425
426 return 0;
427}
428
429
430static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
431{
432 struct ip_vs_lblc_table *tbl = svc->sched_data;
433
434 /* remove periodic timer */
435 del_timer_sync(&tbl->periodic_timer);
436
437 /* got to clean up table entries here */
438 ip_vs_lblc_flush(tbl);
439
440 /* release the table itself */
441 kfree(svc->sched_data);
442 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
443 sizeof(struct ip_vs_lblc_table));
444
445 return 0;
446}
447
448
449static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
450{
451 return 0;
452}
453
454
455static inline struct ip_vs_dest *
456__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
457{
458 struct ip_vs_dest *dest, *least;
459 int loh, doh;
460
461 /*
462 * We think the overhead of processing active connections is fifty
463	 * times higher than that of inactive connections on average. (This
464	 * factor of fifty might not be accurate; we will change it later.) We
465 * use the following formula to estimate the overhead:
466 * dest->activeconns*50 + dest->inactconns
467 * and the load:
468 * (dest overhead) / dest->weight
469 *
470 * Remember -- no floats in kernel mode!!!
471 * The comparison of h1*w2 > h2*w1 is equivalent to that of
472 * h1/w1 > h2/w2
473 * if every weight is larger than zero.
474 *
475 * The server with weight=0 is quiesced and will not receive any
476 * new connection.
477 */
478 list_for_each_entry(dest, &svc->destinations, n_list) {
479 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
480 continue;
481 if (atomic_read(&dest->weight) > 0) {
482 least = dest;
483 loh = atomic_read(&least->activeconns) * 50
484 + atomic_read(&least->inactconns);
485 goto nextstage;
486 }
487 }
488 return NULL;
489
490 /*
491 * Find the destination with the least load.
492 */
493 nextstage:
494 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
495 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
496 continue;
497
498 doh = atomic_read(&dest->activeconns) * 50
499 + atomic_read(&dest->inactconns);
500 if (loh * atomic_read(&dest->weight) >
501 doh * atomic_read(&least->weight)) {
502 least = dest;
503 loh = doh;
504 }
505 }
506
507 IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
508 "activeconns %d refcnt %d weight %d overhead %d\n",
509 NIPQUAD(least->addr), ntohs(least->port),
510 atomic_read(&least->activeconns),
511 atomic_read(&least->refcnt),
512 atomic_read(&least->weight), loh);
513
514 return least;
515}
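A worked instance of the integer comparison above, with invented numbers: if the current candidate has loh = 120 and weight 2 while another destination has doh = 200 and weight 5, then 120*5 = 600 > 200*2 = 400, so the second server (load 200/5 = 40 versus 120/2 = 60) becomes the new least-loaded choice without any floating-point arithmetic.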
516
517
518/*
519 * If this destination server is overloaded and there is a less loaded
520 * server, then return true.
521 */
522static inline int
523is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
524{
525 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
526 struct ip_vs_dest *d;
527
528 list_for_each_entry(d, &svc->destinations, n_list) {
529 if (atomic_read(&d->activeconns)*2
530 < atomic_read(&d->weight)) {
531 return 1;
532 }
533 }
534 }
535 return 0;
536}
537
538
539/*
540 * Locality-Based (weighted) Least-Connection scheduling
541 */
542static struct ip_vs_dest *
543ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
544{
545 struct ip_vs_dest *dest;
546 struct ip_vs_lblc_table *tbl;
547 struct ip_vs_lblc_entry *en;
548 struct iphdr *iph = skb->nh.iph;
549
550 IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
551
552 tbl = (struct ip_vs_lblc_table *)svc->sched_data;
553 en = ip_vs_lblc_get(tbl, iph->daddr);
554 if (en == NULL) {
555 dest = __ip_vs_wlc_schedule(svc, iph);
556 if (dest == NULL) {
557 IP_VS_DBG(1, "no destination available\n");
558 return NULL;
559 }
560 en = ip_vs_lblc_new(iph->daddr, dest);
561 if (en == NULL) {
562 return NULL;
563 }
564 ip_vs_lblc_hash(tbl, en);
565 } else {
566 dest = en->dest;
567 if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
568 || atomic_read(&dest->weight) <= 0
569 || is_overloaded(dest, svc)) {
570 dest = __ip_vs_wlc_schedule(svc, iph);
571 if (dest == NULL) {
572 IP_VS_DBG(1, "no destination available\n");
573 return NULL;
574 }
575 atomic_dec(&en->dest->refcnt);
576 atomic_inc(&dest->refcnt);
577 en->dest = dest;
578 }
579 }
580 en->lastuse = jiffies;
581
582 IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
583 "--> server %u.%u.%u.%u:%d\n",
584 NIPQUAD(en->addr),
585 NIPQUAD(dest->addr),
586 ntohs(dest->port));
587
588 return dest;
589}
590
591
592/*
593 * IPVS LBLC Scheduler structure
594 */
595static struct ip_vs_scheduler ip_vs_lblc_scheduler =
596{
597 .name = "lblc",
598 .refcnt = ATOMIC_INIT(0),
599 .module = THIS_MODULE,
600 .init_service = ip_vs_lblc_init_svc,
601 .done_service = ip_vs_lblc_done_svc,
602 .update_service = ip_vs_lblc_update_svc,
603 .schedule = ip_vs_lblc_schedule,
604};
605
606
607static int __init ip_vs_lblc_init(void)
608{
609 INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
610 sysctl_header = register_sysctl_table(lblc_root_table, 0);
611 return register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
612}
613
614
615static void __exit ip_vs_lblc_cleanup(void)
616{
617 unregister_sysctl_table(sysctl_header);
618 unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
619}
620
621
622module_init(ip_vs_lblc_init);
623module_exit(ip_vs_lblc_cleanup);
624MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
new file mode 100644
index 000000000000..22b5dd55d271
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -0,0 +1,888 @@
1/*
2 * IPVS: Locality-Based Least-Connection with Replication scheduler
3 *
4 * Version: $Id: ip_vs_lblcr.c,v 1.11 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@gnuchina.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 * Julian Anastasov : Added the missing (dest->weight>0)
15 * condition in the ip_vs_dest_set_max.
16 *
17 */
18
19/*
20 * The lblc/r algorithm is as follows (pseudo code):
21 *
22 * if serverSet[dest_ip] is null then
23 * n, serverSet[dest_ip] <- {weighted least-conn node};
24 * else
25 * n <- {least-conn (alive) node in serverSet[dest_ip]};
26 * if (n is null) OR
27 * (n.conns>n.weight AND
28 * there is a node m with m.conns<m.weight/2) then
29 * n <- {weighted least-conn node};
30 * add n to serverSet[dest_ip];
31 * if |serverSet[dest_ip]| > 1 AND
32 * now - serverSet[dest_ip].lastMod > T then
33 * m <- {most conn node in serverSet[dest_ip]};
34 * remove m from serverSet[dest_ip];
35 * if serverSet[dest_ip] changed then
36 * serverSet[dest_ip].lastMod <- now;
37 *
38 * return n;
39 *
40 */
41
42#include <linux/module.h>
43#include <linux/kernel.h>
44
45/* for sysctl */
46#include <linux/fs.h>
47#include <linux/sysctl.h>
48/* for proc_net_create/proc_net_remove */
49#include <linux/proc_fs.h>
50
51#include <net/ip_vs.h>
52
53
54/*
55 * It is for garbage collection of stale IPVS lblcr entries,
56 * when the table is full.
57 */
58#define CHECK_EXPIRE_INTERVAL (60*HZ)
59#define ENTRY_TIMEOUT (6*60*HZ)
60
61/*
62 * It is for full expiration check.
63 * When there is no partial expiration check (garbage collection)
64 * in a half hour, do a full expiration check to collect stale
65 * entries that haven't been touched for a day.
66 */
67#define COUNT_FOR_FULL_EXPIRATION 30
68static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
69
70
71/*
72 * for IPVS lblcr entry hash table
73 */
74#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
75#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
76#endif
77#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
78#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
79#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
80
81
82/*
83 * IPVS destination set structure and operations
84 */
85struct ip_vs_dest_list {
86 struct ip_vs_dest_list *next; /* list link */
87 struct ip_vs_dest *dest; /* destination server */
88};
89
90struct ip_vs_dest_set {
91 atomic_t size; /* set size */
92 unsigned long lastmod; /* last modified time */
93 struct ip_vs_dest_list *list; /* destination list */
94 rwlock_t lock; /* lock for this list */
95};
96
97
98static struct ip_vs_dest_list *
99ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
100{
101 struct ip_vs_dest_list *e;
102
103 for (e=set->list; e!=NULL; e=e->next) {
104 if (e->dest == dest)
105			/* already exists */
106 return NULL;
107 }
108
109 e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
110 if (e == NULL) {
111 IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
112 return NULL;
113 }
114
115 atomic_inc(&dest->refcnt);
116 e->dest = dest;
117
118 /* link it to the list */
119 write_lock(&set->lock);
120 e->next = set->list;
121 set->list = e;
122 atomic_inc(&set->size);
123 write_unlock(&set->lock);
124
125 set->lastmod = jiffies;
126 return e;
127}
128
129static void
130ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
131{
132 struct ip_vs_dest_list *e, **ep;
133
134 write_lock(&set->lock);
135 for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
136 if (e->dest == dest) {
137 /* HIT */
138 *ep = e->next;
139 atomic_dec(&set->size);
140 set->lastmod = jiffies;
141 atomic_dec(&e->dest->refcnt);
142 kfree(e);
143 break;
144 }
145 ep = &e->next;
146 }
147 write_unlock(&set->lock);
148}
149
150static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
151{
152 struct ip_vs_dest_list *e, **ep;
153
154 write_lock(&set->lock);
155 for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
156 *ep = e->next;
157 /*
158		 * We don't kfree dest because it is referred to either
159 * by its service or by the trash dest list.
160 */
161 atomic_dec(&e->dest->refcnt);
162 kfree(e);
163 }
164 write_unlock(&set->lock);
165}
166
167/* get weighted least-connection node in the destination set */
168static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
169{
170 register struct ip_vs_dest_list *e;
171 struct ip_vs_dest *dest, *least;
172 int loh, doh;
173
174 if (set == NULL)
175 return NULL;
176
177 read_lock(&set->lock);
178 /* select the first destination server, whose weight > 0 */
179 for (e=set->list; e!=NULL; e=e->next) {
180 least = e->dest;
181 if (least->flags & IP_VS_DEST_F_OVERLOAD)
182 continue;
183
184 if ((atomic_read(&least->weight) > 0)
185 && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
186 loh = atomic_read(&least->activeconns) * 50
187 + atomic_read(&least->inactconns);
188 goto nextstage;
189 }
190 }
191 read_unlock(&set->lock);
192 return NULL;
193
194 /* find the destination with the weighted least load */
195 nextstage:
196 for (e=e->next; e!=NULL; e=e->next) {
197 dest = e->dest;
198 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
199 continue;
200
201 doh = atomic_read(&dest->activeconns) * 50
202 + atomic_read(&dest->inactconns);
203 if ((loh * atomic_read(&dest->weight) >
204 doh * atomic_read(&least->weight))
205 && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
206 least = dest;
207 loh = doh;
208 }
209 }
210 read_unlock(&set->lock);
211
212 IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
213 "activeconns %d refcnt %d weight %d overhead %d\n",
214 NIPQUAD(least->addr), ntohs(least->port),
215 atomic_read(&least->activeconns),
216 atomic_read(&least->refcnt),
217 atomic_read(&least->weight), loh);
218 return least;
219}
220
221
222/* get weighted most-connection node in the destination set */
223static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
224{
225 register struct ip_vs_dest_list *e;
226 struct ip_vs_dest *dest, *most;
227 int moh, doh;
228
229 if (set == NULL)
230 return NULL;
231
232 read_lock(&set->lock);
233 /* select the first destination server, whose weight > 0 */
234 for (e=set->list; e!=NULL; e=e->next) {
235 most = e->dest;
236 if (atomic_read(&most->weight) > 0) {
237 moh = atomic_read(&most->activeconns) * 50
238 + atomic_read(&most->inactconns);
239 goto nextstage;
240 }
241 }
242 read_unlock(&set->lock);
243 return NULL;
244
245 /* find the destination with the weighted most load */
246 nextstage:
247 for (e=e->next; e!=NULL; e=e->next) {
248 dest = e->dest;
249 doh = atomic_read(&dest->activeconns) * 50
250 + atomic_read(&dest->inactconns);
251 /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
252 if ((moh * atomic_read(&dest->weight) <
253 doh * atomic_read(&most->weight))
254 && (atomic_read(&dest->weight) > 0)) {
255 most = dest;
256 moh = doh;
257 }
258 }
259 read_unlock(&set->lock);
260
261 IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
262 "activeconns %d refcnt %d weight %d overhead %d\n",
263 NIPQUAD(most->addr), ntohs(most->port),
264 atomic_read(&most->activeconns),
265 atomic_read(&most->refcnt),
266 atomic_read(&most->weight), moh);
267 return most;
268}
269
270
271/*
272 * IPVS lblcr entry represents an association between destination
273 * IP address and its destination server set
274 */
275struct ip_vs_lblcr_entry {
276 struct list_head list;
277 __u32 addr; /* destination IP address */
278 struct ip_vs_dest_set set; /* destination server set */
279 unsigned long lastuse; /* last used time */
280};
281
282
283/*
284 * IPVS lblcr hash table
285 */
286struct ip_vs_lblcr_table {
287 rwlock_t lock; /* lock for this table */
288 struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
289 atomic_t entries; /* number of entries */
290 int max_size; /* maximum size of entries */
291 struct timer_list periodic_timer; /* collect stale entries */
292 int rover; /* rover for expire check */
293 int counter; /* counter for no expire */
294};
295
296
297/*
298 * IPVS LBLCR sysctl table
299 */
300
301static ctl_table vs_vars_table[] = {
302 {
303 .ctl_name = NET_IPV4_VS_LBLCR_EXPIRE,
304 .procname = "lblcr_expiration",
305 .data = &sysctl_ip_vs_lblcr_expiration,
306 .maxlen = sizeof(int),
307 .mode = 0644,
308 .proc_handler = &proc_dointvec_jiffies,
309 },
310 { .ctl_name = 0 }
311};
312
313static ctl_table vs_table[] = {
314 {
315 .ctl_name = NET_IPV4_VS,
316 .procname = "vs",
317 .mode = 0555,
318 .child = vs_vars_table
319 },
320 { .ctl_name = 0 }
321};
322
323static ctl_table ipv4_table[] = {
324 {
325 .ctl_name = NET_IPV4,
326 .procname = "ipv4",
327 .mode = 0555,
328 .child = vs_table
329 },
330 { .ctl_name = 0 }
331};
332
333static ctl_table lblcr_root_table[] = {
334 {
335 .ctl_name = CTL_NET,
336 .procname = "net",
337 .mode = 0555,
338 .child = ipv4_table
339 },
340 { .ctl_name = 0 }
341};
342
343static struct ctl_table_header * sysctl_header;
344
345/*
346 * new/free an ip_vs_lblcr_entry, which is a mapping of a destination
347 * IP address to a server.
348 */
349static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr)
350{
351 struct ip_vs_lblcr_entry *en;
352
353 en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
354 if (en == NULL) {
355 IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
356 return NULL;
357 }
358
359 INIT_LIST_HEAD(&en->list);
360 en->addr = daddr;
361
362	/* initialize its dest set */
363 atomic_set(&(en->set.size), 0);
364 en->set.list = NULL;
365 rwlock_init(&en->set.lock);
366
367 return en;
368}
369
370
371static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
372{
373 list_del(&en->list);
374 ip_vs_dest_set_eraseall(&en->set);
375 kfree(en);
376}
377
378
379/*
380 * Returns hash value for IPVS LBLCR entry
381 */
382static inline unsigned ip_vs_lblcr_hashkey(__u32 addr)
383{
384 return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
385}
386
387
388/*
389 * Hash an entry in the ip_vs_lblcr_table.
390 * returns bool success.
391 */
392static int
393ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
394{
395 unsigned hash;
396
397 if (!list_empty(&en->list)) {
398 IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
399 "called from %p\n", __builtin_return_address(0));
400 return 0;
401 }
402
403 /*
404 * Hash by destination IP address
405 */
406 hash = ip_vs_lblcr_hashkey(en->addr);
407
408 write_lock(&tbl->lock);
409 list_add(&en->list, &tbl->bucket[hash]);
410 atomic_inc(&tbl->entries);
411 write_unlock(&tbl->lock);
412
413 return 1;
414}
415
416
417#if 0000
418/*
419 * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table.
420 * returns bool success.
421 */
422static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl,
423 struct ip_vs_lblcr_entry *en)
424{
425 if (list_empty(&en->list)) {
426 IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, "
427 "called from %p\n", __builtin_return_address(0));
428 return 0;
429 }
430
431 /*
432 * Remove it from the table
433 */
434 write_lock(&tbl->lock);
435 list_del(&en->list);
436 INIT_LIST_HEAD(&en->list);
437 write_unlock(&tbl->lock);
438
439 return 1;
440}
441#endif
442
443
444/*
445 * Get ip_vs_lblcr_entry associated with supplied parameters.
446 */
447static inline struct ip_vs_lblcr_entry *
448ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __u32 addr)
449{
450 unsigned hash;
451 struct ip_vs_lblcr_entry *en;
452
453 hash = ip_vs_lblcr_hashkey(addr);
454
455 read_lock(&tbl->lock);
456
457 list_for_each_entry(en, &tbl->bucket[hash], list) {
458 if (en->addr == addr) {
459 /* HIT */
460 read_unlock(&tbl->lock);
461 return en;
462 }
463 }
464
465 read_unlock(&tbl->lock);
466
467 return NULL;
468}
469
470
471/*
472 * Flush all the entries of the specified table.
473 */
474static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
475{
476 int i;
477 struct ip_vs_lblcr_entry *en, *nxt;
478
479 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
480 write_lock(&tbl->lock);
481 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
482 ip_vs_lblcr_free(en);
483 atomic_dec(&tbl->entries);
484 }
485 write_unlock(&tbl->lock);
486 }
487}
488
489
490static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
491{
492 unsigned long now = jiffies;
493 int i, j;
494 struct ip_vs_lblcr_entry *en, *nxt;
495
496 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
497 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
498
499 write_lock(&tbl->lock);
500 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
501 if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
502 now))
503 continue;
504
505 ip_vs_lblcr_free(en);
506 atomic_dec(&tbl->entries);
507 }
508 write_unlock(&tbl->lock);
509 }
510 tbl->rover = j;
511}
512
513
514/*
515 * Periodical timer handler for IPVS lblcr table
516 * It is used to collect stale entries when the number of entries
517 * exceeds the maximum size of the table.
518 *
519 * Fixme: we probably need a more complicated algorithm to collect
520 * entries that have not been used for a long time even
521 * if the number of entries doesn't exceed the maximum size
522 * of the table.
523 * The full expiration check is for this purpose now.
524 */
525static void ip_vs_lblcr_check_expire(unsigned long data)
526{
527 struct ip_vs_lblcr_table *tbl;
528 unsigned long now = jiffies;
529 int goal;
530 int i, j;
531 struct ip_vs_lblcr_entry *en, *nxt;
532
533 tbl = (struct ip_vs_lblcr_table *)data;
534
535 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
536 /* do full expiration check */
537 ip_vs_lblcr_full_check(tbl);
538 tbl->counter = 1;
539 goto out;
540 }
541
542 if (atomic_read(&tbl->entries) <= tbl->max_size) {
543 tbl->counter++;
544 goto out;
545 }
546
547 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
548 if (goal > tbl->max_size/2)
549 goal = tbl->max_size/2;
550
551 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
552 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
553
554 write_lock(&tbl->lock);
555 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
556 if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
557 continue;
558
559 ip_vs_lblcr_free(en);
560 atomic_dec(&tbl->entries);
561 goal--;
562 }
563 write_unlock(&tbl->lock);
564 if (goal <= 0)
565 break;
566 }
567 tbl->rover = j;
568
569 out:
570 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
571}
572
573
574#ifdef CONFIG_IP_VS_LBLCR_DEBUG
575static struct ip_vs_lblcr_table *lblcr_table_list;
576
577/*
578 * /proc/net/ip_vs_lblcr to display the mappings of
579 * destination IP address <==> its serverSet
580 */
581static int
582ip_vs_lblcr_getinfo(char *buffer, char **start, off_t offset, int length)
583{
584 off_t pos=0, begin;
585 int len=0, size;
586 struct ip_vs_lblcr_table *tbl;
587 unsigned long now = jiffies;
588 int i;
589 struct ip_vs_lblcr_entry *en;
590
591 tbl = lblcr_table_list;
592
593 size = sprintf(buffer, "LastTime Dest IP address Server set\n");
594 pos += size;
595 len += size;
596
597 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
598 read_lock_bh(&tbl->lock);
599 list_for_each_entry(en, &tbl->bucket[i], list) {
600 char tbuf[16];
601 struct ip_vs_dest_list *d;
602
603 sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(en->addr));
604 size = sprintf(buffer+len, "%8lu %-16s ",
605 now-en->lastuse, tbuf);
606
607 read_lock(&en->set.lock);
608 for (d=en->set.list; d!=NULL; d=d->next) {
609 size += sprintf(buffer+len+size,
610 "%u.%u.%u.%u ",
611 NIPQUAD(d->dest->addr));
612 }
613 read_unlock(&en->set.lock);
614 size += sprintf(buffer+len+size, "\n");
615 len += size;
616 pos += size;
617 if (pos <= offset)
618 len=0;
619 if (pos >= offset+length) {
620 read_unlock_bh(&tbl->lock);
621 goto done;
622 }
623 }
624 read_unlock_bh(&tbl->lock);
625 }
626
627 done:
628 begin = len - (pos - offset);
629 *start = buffer + begin;
630 len -= begin;
631 if(len>length)
632 len = length;
633 return len;
634}
635#endif
636
637
638static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
639{
640 int i;
641 struct ip_vs_lblcr_table *tbl;
642
643 /*
644 * Allocate the ip_vs_lblcr_table for this service
645 */
646 tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
647 if (tbl == NULL) {
648 IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
649 return -ENOMEM;
650 }
651 svc->sched_data = tbl;
652 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
653 "current service\n",
654 sizeof(struct ip_vs_lblcr_table));
655
656 /*
657 * Initialize the hash buckets
658 */
659 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
660 INIT_LIST_HEAD(&tbl->bucket[i]);
661 }
662 rwlock_init(&tbl->lock);
663 tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
664 tbl->rover = 0;
665 tbl->counter = 1;
666
667 /*
668 * Hook periodic timer for garbage collection
669 */
670 init_timer(&tbl->periodic_timer);
671 tbl->periodic_timer.data = (unsigned long)tbl;
672 tbl->periodic_timer.function = ip_vs_lblcr_check_expire;
673 tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
674 add_timer(&tbl->periodic_timer);
675
676#ifdef CONFIG_IP_VS_LBLCR_DEBUG
677 lblcr_table_list = tbl;
678#endif
679 return 0;
680}
681
682
683static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
684{
685 struct ip_vs_lblcr_table *tbl = svc->sched_data;
686
687 /* remove periodic timer */
688 del_timer_sync(&tbl->periodic_timer);
689
690 /* got to clean up table entries here */
691 ip_vs_lblcr_flush(tbl);
692
693 /* release the table itself */
694 kfree(svc->sched_data);
695 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
696 sizeof(struct ip_vs_lblcr_table));
697
698 return 0;
699}
700
701
702static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc)
703{
704 return 0;
705}
706
707
708static inline struct ip_vs_dest *
709__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
710{
711 struct ip_vs_dest *dest, *least;
712 int loh, doh;
713
714 /*
715 * We think the overhead of processing active connections is fifty
716	 * times higher than that of inactive connections on average. (This
717	 * factor of fifty might not be accurate; we will change it later.) We
718 * use the following formula to estimate the overhead:
719 * dest->activeconns*50 + dest->inactconns
720 * and the load:
721 * (dest overhead) / dest->weight
722 *
723 * Remember -- no floats in kernel mode!!!
724 * The comparison of h1*w2 > h2*w1 is equivalent to that of
725 * h1/w1 > h2/w2
726 * if every weight is larger than zero.
727 *
728 * The server with weight=0 is quiesced and will not receive any
729 * new connection.
730 */
731 list_for_each_entry(dest, &svc->destinations, n_list) {
732 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
733 continue;
734
735 if (atomic_read(&dest->weight) > 0) {
736 least = dest;
737 loh = atomic_read(&least->activeconns) * 50
738 + atomic_read(&least->inactconns);
739 goto nextstage;
740 }
741 }
742 return NULL;
743
744 /*
745 * Find the destination with the least load.
746 */
747 nextstage:
748 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
749 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
750 continue;
751
752 doh = atomic_read(&dest->activeconns) * 50
753 + atomic_read(&dest->inactconns);
754 if (loh * atomic_read(&dest->weight) >
755 doh * atomic_read(&least->weight)) {
756 least = dest;
757 loh = doh;
758 }
759 }
760
761 IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
762 "activeconns %d refcnt %d weight %d overhead %d\n",
763 NIPQUAD(least->addr), ntohs(least->port),
764 atomic_read(&least->activeconns),
765 atomic_read(&least->refcnt),
766 atomic_read(&least->weight), loh);
767
768 return least;
769}
770
771
772/*
773 * If this destination server is overloaded and there is a less loaded
774 * server, then return true.
775 */
776static inline int
777is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
778{
779 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
780 struct ip_vs_dest *d;
781
782 list_for_each_entry(d, &svc->destinations, n_list) {
783 if (atomic_read(&d->activeconns)*2
784 < atomic_read(&d->weight)) {
785 return 1;
786 }
787 }
788 }
789 return 0;
790}
791
792
793/*
794 * Locality-Based (weighted) Least-Connection scheduling
795 */
796static struct ip_vs_dest *
797ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
798{
799 struct ip_vs_dest *dest;
800 struct ip_vs_lblcr_table *tbl;
801 struct ip_vs_lblcr_entry *en;
802 struct iphdr *iph = skb->nh.iph;
803
804 IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
805
806 tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
807 en = ip_vs_lblcr_get(tbl, iph->daddr);
808 if (en == NULL) {
809 dest = __ip_vs_wlc_schedule(svc, iph);
810 if (dest == NULL) {
811 IP_VS_DBG(1, "no destination available\n");
812 return NULL;
813 }
814 en = ip_vs_lblcr_new(iph->daddr);
815 if (en == NULL) {
816 return NULL;
817 }
818 ip_vs_dest_set_insert(&en->set, dest);
819 ip_vs_lblcr_hash(tbl, en);
820 } else {
821 dest = ip_vs_dest_set_min(&en->set);
822 if (!dest || is_overloaded(dest, svc)) {
823 dest = __ip_vs_wlc_schedule(svc, iph);
824 if (dest == NULL) {
825 IP_VS_DBG(1, "no destination available\n");
826 return NULL;
827 }
828 ip_vs_dest_set_insert(&en->set, dest);
829 }
830 if (atomic_read(&en->set.size) > 1 &&
831 jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
832 struct ip_vs_dest *m;
833 m = ip_vs_dest_set_max(&en->set);
834 if (m)
835 ip_vs_dest_set_erase(&en->set, m);
836 }
837 }
838 en->lastuse = jiffies;
839
840 IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
841 "--> server %u.%u.%u.%u:%d\n",
842 NIPQUAD(en->addr),
843 NIPQUAD(dest->addr),
844 ntohs(dest->port));
845
846 return dest;
847}
848
849
850/*
851 * IPVS LBLCR Scheduler structure
852 */
853static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
854{
855 .name = "lblcr",
856 .refcnt = ATOMIC_INIT(0),
857 .module = THIS_MODULE,
858 .init_service = ip_vs_lblcr_init_svc,
859 .done_service = ip_vs_lblcr_done_svc,
860 .update_service = ip_vs_lblcr_update_svc,
861 .schedule = ip_vs_lblcr_schedule,
862};
863
864
865static int __init ip_vs_lblcr_init(void)
866{
867 INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list);
868 sysctl_header = register_sysctl_table(lblcr_root_table, 0);
869#ifdef CONFIG_IP_VS_LBLCR_DEBUG
870 proc_net_create("ip_vs_lblcr", 0, ip_vs_lblcr_getinfo);
871#endif
872 return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
873}
874
875
876static void __exit ip_vs_lblcr_cleanup(void)
877{
878#ifdef CONFIG_IP_VS_LBLCR_DEBUG
879 proc_net_remove("ip_vs_lblcr");
880#endif
881 unregister_sysctl_table(sysctl_header);
882 unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
883}
884
885
886module_init(ip_vs_lblcr_init);
887module_exit(ip_vs_lblcr_cleanup);
888MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c
new file mode 100644
index 000000000000..d88fef90a641
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_lc.c
@@ -0,0 +1,123 @@
1/*
2 * IPVS: Least-Connection Scheduling module
3 *
4 * Version: $Id: ip_vs_lc.c,v 1.10 2003/04/18 09:03:16 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 * Wensong Zhang : added the ip_vs_lc_update_svc
15 * Wensong Zhang : added any dest with weight=0 is quiesced
16 *
17 */
18
19#include <linux/module.h>
20#include <linux/kernel.h>
21
22#include <net/ip_vs.h>
23
24
25static int ip_vs_lc_init_svc(struct ip_vs_service *svc)
26{
27 return 0;
28}
29
30
31static int ip_vs_lc_done_svc(struct ip_vs_service *svc)
32{
33 return 0;
34}
35
36
37static int ip_vs_lc_update_svc(struct ip_vs_service *svc)
38{
39 return 0;
40}
41
42
43static inline unsigned int
44ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
45{
46 /*
47 * We think the overhead of processing active connections is 256
48	 * times higher than that of inactive connections on average. (This
49	 * factor of 256 might not be accurate; we will change it later.) We
50 * use the following formula to estimate the overhead now:
51 * dest->activeconns*256 + dest->inactconns
52 */
53 return (atomic_read(&dest->activeconns) << 8) +
54 atomic_read(&dest->inactconns);
55}
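With invented counts, a destination holding 3 active and 40 inactive connections gets overhead (3<<8) + 40 = 808, while one holding 2 active and 300 inactive gets 812, so the former is still preferred: one active connection outweighs roughly 256 inactive ones in this estimate.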
56
57
58/*
59 * Least Connection scheduling
60 */
61static struct ip_vs_dest *
62ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
63{
64 struct ip_vs_dest *dest, *least = NULL;
65 unsigned int loh = 0, doh;
66
67 IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
68
69 /*
70	 * Simply select the server with the smallest value of
71	 * (activeconns<<8) + inactconns,
72	 * except servers whose weight is equal to zero.
73	 * A weight of zero means that the server is quiesced: the existing
74	 * connections to the server still get served, but no new
75	 * connections are assigned to it.
76 */
77
78 list_for_each_entry(dest, &svc->destinations, n_list) {
79 if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
80 atomic_read(&dest->weight) == 0)
81 continue;
82 doh = ip_vs_lc_dest_overhead(dest);
83 if (!least || doh < loh) {
84 least = dest;
85 loh = doh;
86 }
87 }
88
89 if (least)
90 IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n",
91 NIPQUAD(least->addr), ntohs(least->port),
92 atomic_read(&least->activeconns),
93 atomic_read(&least->inactconns));
94
95 return least;
96}
97
98
99static struct ip_vs_scheduler ip_vs_lc_scheduler = {
100 .name = "lc",
101 .refcnt = ATOMIC_INIT(0),
102 .module = THIS_MODULE,
103 .init_service = ip_vs_lc_init_svc,
104 .done_service = ip_vs_lc_done_svc,
105 .update_service = ip_vs_lc_update_svc,
106 .schedule = ip_vs_lc_schedule,
107};
108
109
110static int __init ip_vs_lc_init(void)
111{
112 INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list);
113 return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ;
114}
115
116static void __exit ip_vs_lc_cleanup(void)
117{
118 unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
119}
120
121module_init(ip_vs_lc_init);
122module_exit(ip_vs_lc_cleanup);
123MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c
new file mode 100644
index 000000000000..bc2a9e5f2a7b
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_nq.c
@@ -0,0 +1,161 @@
1/*
2 * IPVS: Never Queue scheduling module
3 *
4 * Version: $Id: ip_vs_nq.c,v 1.2 2003/06/08 09:31:19 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 *
15 */
16
17/*
18 * The NQ algorithm adopts a two-speed model. When there is an idle server
19 * available, the job will be sent to the idle server, instead of waiting
20 * for a fast one. When there is no idle server available, the job will be
21 * sent to the server that minimizes its expected delay (the Shortest
22 * Expected Delay scheduling algorithm).
23 *
24 * See the following paper for more information:
25 * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
26 * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
27 * pages 986-994, 1988.
28 *
29 * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
30 *
31 * The difference between NQ and SED is that NQ can improve overall
32 * system utilization.
33 *
34 */
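A worked decision with invented numbers: an idle server (0 active connections) is picked immediately, regardless of weight. Otherwise, with server A at 4 active connections and weight 1 (overhead 4+1 = 5) and server B at 12 active connections and weight 3 (overhead 13), the cross-multiplied comparison 5*3 = 15 > 13*1 = 13 selects B, matching SED's (activeconns+1)/weight ordering without floating point.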
35
36#include <linux/module.h>
37#include <linux/kernel.h>
38
39#include <net/ip_vs.h>
40
41
42static int
43ip_vs_nq_init_svc(struct ip_vs_service *svc)
44{
45 return 0;
46}
47
48
49static int
50ip_vs_nq_done_svc(struct ip_vs_service *svc)
51{
52 return 0;
53}
54
55
56static int
57ip_vs_nq_update_svc(struct ip_vs_service *svc)
58{
59 return 0;
60}
61
62
63static inline unsigned int
64ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
65{
66 /*
67 * We only use the active connection number in the cost
68 * calculation here.
69 */
70 return atomic_read(&dest->activeconns) + 1;
71}
72
73
74/*
75 * Never Queue scheduling
76 */
77static struct ip_vs_dest *
78ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
79{
80 struct ip_vs_dest *dest, *least = NULL;
81 unsigned int loh = 0, doh;
82
83 IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");
84
85 /*
86 * We calculate the load of each dest server as follows:
87 * (server expected overhead) / dest->weight
88 *
89 * Remember -- no floats in kernel mode!!!
90 * The comparison of h1*w2 > h2*w1 is equivalent to that of
91 * h1/w1 > h2/w2
92 * if every weight is larger than zero.
93 *
94 * The server with weight=0 is quiesced and will not receive any
95 * new connections.
96 */
97
98 list_for_each_entry(dest, &svc->destinations, n_list) {
99
100 if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
101 !atomic_read(&dest->weight))
102 continue;
103
104 doh = ip_vs_nq_dest_overhead(dest);
105
106 /* return the server directly if it is idle */
107 if (atomic_read(&dest->activeconns) == 0) {
108 least = dest;
109 loh = doh;
110 goto out;
111 }
112
113 if (!least ||
114 (loh * atomic_read(&dest->weight) >
115 doh * atomic_read(&least->weight))) {
116 least = dest;
117 loh = doh;
118 }
119 }
120
121 if (!least)
122 return NULL;
123
124 out:
125 IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u "
126 "activeconns %d refcnt %d weight %d overhead %d\n",
127 NIPQUAD(least->addr), ntohs(least->port),
128 atomic_read(&least->activeconns),
129 atomic_read(&least->refcnt),
130 atomic_read(&least->weight), loh);
131
132 return least;
133}
134
135
136static struct ip_vs_scheduler ip_vs_nq_scheduler =
137{
138 .name = "nq",
139 .refcnt = ATOMIC_INIT(0),
140 .module = THIS_MODULE,
141 .init_service = ip_vs_nq_init_svc,
142 .done_service = ip_vs_nq_done_svc,
143 .update_service = ip_vs_nq_update_svc,
144 .schedule = ip_vs_nq_schedule,
145};
146
147
148static int __init ip_vs_nq_init(void)
149{
150 INIT_LIST_HEAD(&ip_vs_nq_scheduler.n_list);
151 return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
152}
153
154static void __exit ip_vs_nq_cleanup(void)
155{
156 unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
157}
158
159module_init(ip_vs_nq_init);
160module_exit(ip_vs_nq_cleanup);
161MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c
new file mode 100644
index 000000000000..253c46252bd5
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto.c
@@ -0,0 +1,244 @@
1/*
2 * ip_vs_proto.c: transport protocol load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto.c,v 1.2 2003/04/18 09:03:16 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Julian Anastasov <ja@ssi.bg>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 *
16 */
17
18#include <linux/module.h>
19#include <linux/kernel.h>
20#include <linux/skbuff.h>
21#include <linux/in.h>
22#include <linux/ip.h>
23#include <net/protocol.h>
24#include <net/tcp.h>
25#include <net/udp.h>
26#include <asm/system.h>
27#include <linux/stat.h>
28#include <linux/proc_fs.h>
29
30#include <net/ip_vs.h>
31
32
33/*
34 * IPVS protocols can only be registered/unregistered when the ipvs
35 * module is loaded/unloaded, so no lock is needed in accessing the
36 * ipvs protocol table.
37 */
38
39#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */
40#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
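/*
 * e.g. TCP (protocol 6) hashes to bucket 6 and ESP (protocol 50) to
 * bucket 50 & 31 = 18; with only a handful of protocols ever registered,
 * each chain stays very short.
 */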
41
42static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
43
44
45/*
46 * register an ipvs protocol
47 */
48static int register_ip_vs_protocol(struct ip_vs_protocol *pp)
49{
50 unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
51
52 pp->next = ip_vs_proto_table[hash];
53 ip_vs_proto_table[hash] = pp;
54
55 if (pp->init != NULL)
56 pp->init(pp);
57
58 return 0;
59}
60
61
62/*
63 * unregister an ipvs protocol
64 */
65static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
66{
67 struct ip_vs_protocol **pp_p;
68 unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
69
70 pp_p = &ip_vs_proto_table[hash];
71 for (; *pp_p; pp_p = &(*pp_p)->next) {
72 if (*pp_p == pp) {
73 *pp_p = pp->next;
74 if (pp->exit != NULL)
75 pp->exit(pp);
76 return 0;
77 }
78 }
79
80 return -ESRCH;
81}
82
83
84/*
85 * get ip_vs_protocol object by its proto.
86 */
87struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
88{
89 struct ip_vs_protocol *pp;
90 unsigned hash = IP_VS_PROTO_HASH(proto);
91
92 for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
93 if (pp->protocol == proto)
94 return pp;
95 }
96
97 return NULL;
98}
99
100
101/*
102 * Propagate event for state change to all protocols
103 */
104void ip_vs_protocol_timeout_change(int flags)
105{
106 struct ip_vs_protocol *pp;
107 int i;
108
109 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
110 for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
111 if (pp->timeout_change)
112 pp->timeout_change(pp, flags);
113 }
114 }
115}
116
117
118int *
119ip_vs_create_timeout_table(int *table, int size)
120{
121 int *t;
122
123 t = kmalloc(size, GFP_ATOMIC);
124 if (t == NULL)
125 return NULL;
126 memcpy(t, table, size);
127 return t;
128}
129
130
131/*
132 * Set timeout value for state specified by name
133 */
134int
135ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to)
136{
137 int i;
138
139 if (!table || !name || !to)
140 return -EINVAL;
141
142 for (i = 0; i < num; i++) {
143 if (strcmp(names[i], name))
144 continue;
145 table[i] = to * HZ;
146 return 0;
147 }
148 return -ENOENT;
149}
150
151
152const char * ip_vs_state_name(__u16 proto, int state)
153{
154 struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
155
156 if (pp == NULL || pp->state_name == NULL)
157 return "ERR!";
158 return pp->state_name(state);
159}
160
161
162void
163ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
164 const struct sk_buff *skb,
165 int offset,
166 const char *msg)
167{
168 char buf[128];
169 struct iphdr _iph, *ih;
170
171 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
172 if (ih == NULL)
173 sprintf(buf, "%s TRUNCATED", pp->name);
174 else if (ih->frag_off & __constant_htons(IP_OFFSET))
175 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
176 pp->name, NIPQUAD(ih->saddr),
177 NIPQUAD(ih->daddr));
178 else {
179 __u16 _ports[2], *pptr;
181 pptr = skb_header_pointer(skb, offset + ih->ihl*4,
182 sizeof(_ports), _ports);
183 if (pptr == NULL)
184 sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u",
185 pp->name,
186 NIPQUAD(ih->saddr),
187 NIPQUAD(ih->daddr));
188 else
189 sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
190 pp->name,
191 NIPQUAD(ih->saddr),
192 ntohs(pptr[0]),
193 NIPQUAD(ih->daddr),
194 ntohs(pptr[1]));
195 }
196
197 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
198}
199
200
201int ip_vs_protocol_init(void)
202{
203 char protocols[64];
204#define REGISTER_PROTOCOL(p) \
205 do { \
206 register_ip_vs_protocol(p); \
207 strcat(protocols, ", "); \
208 strcat(protocols, (p)->name); \
209 } while (0)
210
211 protocols[0] = '\0';
212 protocols[2] = '\0';
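	/*
	 * Each REGISTER_PROTOCOL() above prepends ", " before the name, so
	 * the message below starts at &protocols[2] to skip the leading
	 * separator; the extra '\0' written at protocols[2] keeps that
	 * string valid (empty) when no protocol is compiled in.
	 */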
213#ifdef CONFIG_IP_VS_PROTO_TCP
214 REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
215#endif
216#ifdef CONFIG_IP_VS_PROTO_UDP
217 REGISTER_PROTOCOL(&ip_vs_protocol_udp);
218#endif
219#ifdef CONFIG_IP_VS_PROTO_ICMP
220 REGISTER_PROTOCOL(&ip_vs_protocol_icmp);
221#endif
222#ifdef CONFIG_IP_VS_PROTO_AH
223 REGISTER_PROTOCOL(&ip_vs_protocol_ah);
224#endif
225#ifdef CONFIG_IP_VS_PROTO_ESP
226 REGISTER_PROTOCOL(&ip_vs_protocol_esp);
227#endif
228 IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]);
229
230 return 0;
231}
232
233
234void ip_vs_protocol_cleanup(void)
235{
236 struct ip_vs_protocol *pp;
237 int i;
238
239 /* unregister all the ipvs protocols */
240 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
241 while ((pp = ip_vs_proto_table[i]) != NULL)
242 unregister_ip_vs_protocol(pp);
243 }
244}
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
new file mode 100644
index 000000000000..453e94a0bbd7
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_ah.c
@@ -0,0 +1,177 @@
1/*
2 * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto_ah.c,v 1.1 2003/07/04 15:04:37 wensong Exp $
5 *
6 * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
7 * Wensong Zhang <wensong@linuxvirtualserver.org>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * version 2 as published by the Free Software Foundation;
12 *
13 */
14
15#include <linux/module.h>
16#include <linux/kernel.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19
20#include <net/ip_vs.h>
21
22
23/* TODO:
24
25struct isakmp_hdr {
26 __u8 icookie[8];
27 __u8 rcookie[8];
28 __u8 np;
29 __u8 version;
30 __u8 xchgtype;
31 __u8 flags;
32 __u32 msgid;
33 __u32 length;
34};
35
36*/
37
38#define PORT_ISAKMP 500
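
/*
 * AH itself carries no port numbers, so the lookups below reuse the UDP
 * connection created by the ISAKMP/IKE exchange (port 500) between the
 * same two peers; this lets the IPsec data packets follow the
 * negotiation to the same real server.
 */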
39
40
41static struct ip_vs_conn *
42ah_conn_in_get(const struct sk_buff *skb,
43 struct ip_vs_protocol *pp,
44 const struct iphdr *iph,
45 unsigned int proto_off,
46 int inverse)
47{
48 struct ip_vs_conn *cp;
49
50 if (likely(!inverse)) {
51 cp = ip_vs_conn_in_get(IPPROTO_UDP,
52 iph->saddr,
53 __constant_htons(PORT_ISAKMP),
54 iph->daddr,
55 __constant_htons(PORT_ISAKMP));
56 } else {
57 cp = ip_vs_conn_in_get(IPPROTO_UDP,
58 iph->daddr,
59 __constant_htons(PORT_ISAKMP),
60 iph->saddr,
61 __constant_htons(PORT_ISAKMP));
62 }
63
64 if (!cp) {
65 /*
66 * We are not sure if the packet is from our
67 * service, so our conn_schedule hook should return NF_ACCEPT
68 */
69 IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
70 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
71 inverse ? "ICMP+" : "",
72 pp->name,
73 NIPQUAD(iph->saddr),
74 NIPQUAD(iph->daddr));
75 }
76
77 return cp;
78}
79
80
81static struct ip_vs_conn *
82ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
83 const struct iphdr *iph, unsigned int proto_off, int inverse)
84{
85 struct ip_vs_conn *cp;
86
87 if (likely(!inverse)) {
88 cp = ip_vs_conn_out_get(IPPROTO_UDP,
89 iph->saddr,
90 __constant_htons(PORT_ISAKMP),
91 iph->daddr,
92 __constant_htons(PORT_ISAKMP));
93 } else {
94 cp = ip_vs_conn_out_get(IPPROTO_UDP,
95 iph->daddr,
96 __constant_htons(PORT_ISAKMP),
97 iph->saddr,
98 __constant_htons(PORT_ISAKMP));
99 }
100
101 if (!cp) {
102 IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
103 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
104 inverse ? "ICMP+" : "",
105 pp->name,
106 NIPQUAD(iph->saddr),
107 NIPQUAD(iph->daddr));
108 }
109
110 return cp;
111}
112
113
114static int
115ah_conn_schedule(struct sk_buff *skb,
116 struct ip_vs_protocol *pp,
117 int *verdict, struct ip_vs_conn **cpp)
118{
119 /*
120 * AH packets are handled only as related traffic, so pass the packet on to the IP stack.
121 */
122 *verdict = NF_ACCEPT;
123 return 0;
124}
125
126
127static void
128ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
129 int offset, const char *msg)
130{
131 char buf[256];
132 struct iphdr _iph, *ih;
133
134 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
135 if (ih == NULL)
136 sprintf(buf, "%s TRUNCATED", pp->name);
137 else
138 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
139 pp->name, NIPQUAD(ih->saddr),
140 NIPQUAD(ih->daddr));
141
142 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
143}
144
145
146static void ah_init(struct ip_vs_protocol *pp)
147{
148 /* nothing to do now */
149}
150
151
152static void ah_exit(struct ip_vs_protocol *pp)
153{
154 /* nothing to do now */
155}
156
157
158struct ip_vs_protocol ip_vs_protocol_ah = {
159 .name = "AH",
160 .protocol = IPPROTO_AH,
161 .dont_defrag = 1,
162 .init = ah_init,
163 .exit = ah_exit,
164 .conn_schedule = ah_conn_schedule,
165 .conn_in_get = ah_conn_in_get,
166 .conn_out_get = ah_conn_out_get,
167 .snat_handler = NULL,
168 .dnat_handler = NULL,
169 .csum_check = NULL,
170 .state_transition = NULL,
171 .register_app = NULL,
172 .unregister_app = NULL,
173 .app_conn_bind = NULL,
174 .debug_packet = ah_debug_packet,
175 .timeout_change = NULL, /* ISAKMP */
176 .set_state_timeout = NULL,
177};
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
new file mode 100644
index 000000000000..478e5c7c7e8e
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_esp.c
@@ -0,0 +1,175 @@
1/*
2 * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto_esp.c,v 1.1 2003/07/04 15:04:37 wensong Exp $
5 *
6 * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
7 * Wensong Zhang <wensong@linuxvirtualserver.org>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * version 2 as published by the Free Software Foundation;
12 *
13 */
14
15#include <linux/module.h>
16#include <linux/kernel.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19
20#include <net/ip_vs.h>
21
22
23/* TODO:
24
25struct isakmp_hdr {
26 __u8 icookie[8];
27 __u8 rcookie[8];
28 __u8 np;
29 __u8 version;
30 __u8 xchgtype;
31 __u8 flags;
32 __u32 msgid;
33 __u32 length;
34};
35
36*/
37
38#define PORT_ISAKMP 500
39
40
41static struct ip_vs_conn *
42esp_conn_in_get(const struct sk_buff *skb,
43 struct ip_vs_protocol *pp,
44 const struct iphdr *iph,
45 unsigned int proto_off,
46 int inverse)
47{
48 struct ip_vs_conn *cp;
49
50 if (likely(!inverse)) {
51 cp = ip_vs_conn_in_get(IPPROTO_UDP,
52 iph->saddr,
53 __constant_htons(PORT_ISAKMP),
54 iph->daddr,
55 __constant_htons(PORT_ISAKMP));
56 } else {
57 cp = ip_vs_conn_in_get(IPPROTO_UDP,
58 iph->daddr,
59 __constant_htons(PORT_ISAKMP),
60 iph->saddr,
61 __constant_htons(PORT_ISAKMP));
62 }
63
64 if (!cp) {
65 /*
66 * We are not sure if the packet is from our
67 * service, so our conn_schedule hook should return NF_ACCEPT
68 */
69 IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
70 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
71 inverse ? "ICMP+" : "",
72 pp->name,
73 NIPQUAD(iph->saddr),
74 NIPQUAD(iph->daddr));
75 }
76
77 return cp;
78}
79
80
81static struct ip_vs_conn *
82esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
83 const struct iphdr *iph, unsigned int proto_off, int inverse)
84{
85 struct ip_vs_conn *cp;
86
87 if (likely(!inverse)) {
88 cp = ip_vs_conn_out_get(IPPROTO_UDP,
89 iph->saddr,
90 __constant_htons(PORT_ISAKMP),
91 iph->daddr,
92 __constant_htons(PORT_ISAKMP));
93 } else {
94 cp = ip_vs_conn_out_get(IPPROTO_UDP,
95 iph->daddr,
96 __constant_htons(PORT_ISAKMP),
97 iph->saddr,
98 __constant_htons(PORT_ISAKMP));
99 }
100
101 if (!cp) {
102 IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
103 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
104 inverse ? "ICMP+" : "",
105 pp->name,
106 NIPQUAD(iph->saddr),
107 NIPQUAD(iph->daddr));
108 }
109
110 return cp;
111}
112
113
114static int
115esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
116 int *verdict, struct ip_vs_conn **cpp)
117{
118 /*
119 * ESP packets are handled only as related traffic, so pass the packet on to the IP stack.
120 */
121 *verdict = NF_ACCEPT;
122 return 0;
123}
124
125
126static void
127esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
128 int offset, const char *msg)
129{
130 char buf[256];
131 struct iphdr _iph, *ih;
132
133 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
134 if (ih == NULL)
135 sprintf(buf, "%s TRUNCATED", pp->name);
136 else
137 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
138 pp->name, NIPQUAD(ih->saddr),
139 NIPQUAD(ih->daddr));
140
141 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
142}
143
144
145static void esp_init(struct ip_vs_protocol *pp)
146{
147 /* nothing to do now */
148}
149
150
151static void esp_exit(struct ip_vs_protocol *pp)
152{
153 /* nothing to do now */
154}
155
156
157struct ip_vs_protocol ip_vs_protocol_esp = {
158 .name = "ESP",
159 .protocol = IPPROTO_ESP,
160 .dont_defrag = 1,
161 .init = esp_init,
162 .exit = esp_exit,
163 .conn_schedule = esp_conn_schedule,
164 .conn_in_get = esp_conn_in_get,
165 .conn_out_get = esp_conn_out_get,
166 .snat_handler = NULL,
167 .dnat_handler = NULL,
168 .csum_check = NULL,
169 .state_transition = NULL,
170 .register_app = NULL,
171 .unregister_app = NULL,
172 .app_conn_bind = NULL,
173 .debug_packet = esp_debug_packet,
174 .timeout_change = NULL, /* ISAKMP */
175};
diff --git a/net/ipv4/ipvs/ip_vs_proto_icmp.c b/net/ipv4/ipvs/ip_vs_proto_icmp.c
new file mode 100644
index 000000000000..191e94aa1c1f
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_icmp.c
@@ -0,0 +1,182 @@
1/*
2 * ip_vs_proto_icmp.c: ICMP load balancing support for IP Virtual Server
3 *
4 * Authors: Julian Anastasov <ja@ssi.bg>, March 2002
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * version 2 as published by the Free Software Foundation;
9 *
10 */
11
12#include <linux/module.h>
13#include <linux/kernel.h>
14#include <linux/icmp.h>
15#include <linux/netfilter.h>
16#include <linux/netfilter_ipv4.h>
17
18#include <net/ip_vs.h>
19
20
21static int icmp_timeouts[1] = { 1*60*HZ };
22
23static char * icmp_state_name_table[1] = { "ICMP" };
24
25static struct ip_vs_conn *
26icmp_conn_in_get(const struct sk_buff *skb,
27 struct ip_vs_protocol *pp,
28 const struct iphdr *iph,
29 unsigned int proto_off,
30 int inverse)
31{
32#if 0
33 struct ip_vs_conn *cp;
34
35 if (likely(!inverse)) {
36 cp = ip_vs_conn_in_get(iph->protocol,
37 iph->saddr, 0,
38 iph->daddr, 0);
39 } else {
40 cp = ip_vs_conn_in_get(iph->protocol,
41 iph->daddr, 0,
42 iph->saddr, 0);
43 }
44
45 return cp;
46
47#else
48 return NULL;
49#endif
50}
51
52static struct ip_vs_conn *
53icmp_conn_out_get(const struct sk_buff *skb,
54 struct ip_vs_protocol *pp,
55 const struct iphdr *iph,
56 unsigned int proto_off,
57 int inverse)
58{
59#if 0
60 struct ip_vs_conn *cp;
61
62 if (likely(!inverse)) {
63 cp = ip_vs_conn_out_get(iph->protocol,
64 iph->saddr, 0,
65 iph->daddr, 0);
66 } else {
67 cp = ip_vs_conn_out_get(iph->protocol,
68 iph->daddr, 0,
69 iph->saddr, 0);
70 }
71
72 return cp;
73#else
74 return NULL;
75#endif
76}
77
78static int
79icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
80 int *verdict, struct ip_vs_conn **cpp)
81{
82 *verdict = NF_ACCEPT;
83 return 0;
84}
85
86static int
87icmp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
88{
89 if (!(skb->nh.iph->frag_off & __constant_htons(IP_OFFSET))) {
90 if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
91 if (ip_vs_checksum_complete(skb, skb->nh.iph->ihl * 4)) {
92 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "Failed checksum for");
93 return 0;
94 }
95 }
96 }
97 return 1;
98}
99
100static void
101icmp_debug_packet(struct ip_vs_protocol *pp,
102 const struct sk_buff *skb,
103 int offset,
104 const char *msg)
105{
106 char buf[256];
107 struct iphdr _iph, *ih;
108
109 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
110 if (ih == NULL)
111 sprintf(buf, "%s TRUNCATED", pp->name);
112 else if (ih->frag_off & __constant_htons(IP_OFFSET))
113 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
114 pp->name, NIPQUAD(ih->saddr),
115 NIPQUAD(ih->daddr));
116 else {
117 struct icmphdr _icmph, *ic;
118
119 ic = skb_header_pointer(skb, offset + ih->ihl*4,
120 sizeof(_icmph), &_icmph);
121 if (ic == NULL)
122 sprintf(buf, "%s TRUNCATED to %u bytes",
123 pp->name, skb->len - offset);
124 else
125 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d",
126 pp->name, NIPQUAD(ih->saddr),
127 NIPQUAD(ih->daddr),
128 ic->type, ic->code);
129 }
130 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
131}
132
133static int
134icmp_state_transition(struct ip_vs_conn *cp, int direction,
135 const struct sk_buff *skb,
136 struct ip_vs_protocol *pp)
137{
138 cp->timeout = pp->timeout_table[IP_VS_ICMP_S_NORMAL];
139 return 1;
140}
141
142static int
143icmp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
144{
145 int num;
146 char **names;
147
148 num = IP_VS_ICMP_S_LAST;
149 names = icmp_state_name_table;
150 return ip_vs_set_state_timeout(pp->timeout_table, num, names, sname, to);
151}
152
153
154static void icmp_init(struct ip_vs_protocol *pp)
155{
156 pp->timeout_table = icmp_timeouts;
157}
158
159static void icmp_exit(struct ip_vs_protocol *pp)
160{
161}
162
163struct ip_vs_protocol ip_vs_protocol_icmp = {
164 .name = "ICMP",
165 .protocol = IPPROTO_ICMP,
166 .dont_defrag = 0,
167 .init = icmp_init,
168 .exit = icmp_exit,
169 .conn_schedule = icmp_conn_schedule,
170 .conn_in_get = icmp_conn_in_get,
171 .conn_out_get = icmp_conn_out_get,
172 .snat_handler = NULL,
173 .dnat_handler = NULL,
174 .csum_check = icmp_csum_check,
175 .state_transition = icmp_state_transition,
176 .register_app = NULL,
177 .unregister_app = NULL,
178 .app_conn_bind = NULL,
179 .debug_packet = icmp_debug_packet,
180 .timeout_change = NULL,
181 .set_state_timeout = icmp_set_state_timeout,
182};
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
new file mode 100644
index 000000000000..e65de675da74
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -0,0 +1,640 @@
1/*
2 * ip_vs_proto_tcp.c: TCP load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto_tcp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Julian Anastasov <ja@ssi.bg>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/ip.h>
20#include <linux/tcp.h> /* for tcphdr */
21#include <net/ip.h>
22#include <net/tcp.h> /* for csum_tcpudp_magic */
23#include <linux/netfilter_ipv4.h>
24
25#include <net/ip_vs.h>
26
27
28static struct ip_vs_conn *
29tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
30 const struct iphdr *iph, unsigned int proto_off, int inverse)
31{
32 __u16 _ports[2], *pptr;
33
34 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
35 if (pptr == NULL)
36 return NULL;
37
38 if (likely(!inverse)) {
39 return ip_vs_conn_in_get(iph->protocol,
40 iph->saddr, pptr[0],
41 iph->daddr, pptr[1]);
42 } else {
43 return ip_vs_conn_in_get(iph->protocol,
44 iph->daddr, pptr[1],
45 iph->saddr, pptr[0]);
46 }
47}
48
49static struct ip_vs_conn *
50tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
51 const struct iphdr *iph, unsigned int proto_off, int inverse)
52{
53 __u16 _ports[2], *pptr;
54
55 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
56 if (pptr == NULL)
57 return NULL;
58
59 if (likely(!inverse)) {
60 return ip_vs_conn_out_get(iph->protocol,
61 iph->saddr, pptr[0],
62 iph->daddr, pptr[1]);
63 } else {
64 return ip_vs_conn_out_get(iph->protocol,
65 iph->daddr, pptr[1],
66 iph->saddr, pptr[0]);
67 }
68}
69
70
71static int
72tcp_conn_schedule(struct sk_buff *skb,
73 struct ip_vs_protocol *pp,
74 int *verdict, struct ip_vs_conn **cpp)
75{
76 struct ip_vs_service *svc;
77 struct tcphdr _tcph, *th;
78
79 th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
80 sizeof(_tcph), &_tcph);
81 if (th == NULL) {
82 *verdict = NF_DROP;
83 return 0;
84 }
85
86 if (th->syn &&
87 (svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
88 skb->nh.iph->daddr, th->dest))) {
89 if (ip_vs_todrop()) {
90 /*
91 * It seems that we are very loaded.
92 * We have to drop this packet :(
93 */
94 ip_vs_service_put(svc);
95 *verdict = NF_DROP;
96 return 0;
97 }
98
99 /*
100 * Let the virtual server select a real server for the
101 * incoming connection, and create a connection entry.
102 */
103 *cpp = ip_vs_schedule(svc, skb);
104 if (!*cpp) {
105 *verdict = ip_vs_leave(svc, skb, pp);
106 return 0;
107 }
108 ip_vs_service_put(svc);
109 }
110 return 1;
111}
112
113
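
/*
 * The helper below does an incremental (RFC 1624 style) checksum update:
 * the old address and old port are folded out of the stored checksum (as
 * their one's complements) and the new values are folded in, so only the
 * rewritten fields are touched instead of re-summing the whole segment.
 */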
114static inline void
115tcp_fast_csum_update(struct tcphdr *tcph, u32 oldip, u32 newip,
116 u16 oldport, u16 newport)
117{
118 tcph->check =
119 ip_vs_check_diff(~oldip, newip,
120 ip_vs_check_diff(oldport ^ 0xFFFF,
121 newport, tcph->check));
122}
123
124
125static int
126tcp_snat_handler(struct sk_buff **pskb,
127 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
128{
129 struct tcphdr *tcph;
130 unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
131
132 /* csum_check requires unshared skb */
133 if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
134 return 0;
135
136 if (unlikely(cp->app != NULL)) {
137 /* Some checks before mangling */
138 if (pp->csum_check && !pp->csum_check(*pskb, pp))
139 return 0;
140
141 /* Call application helper if needed */
142 if (!ip_vs_app_pkt_out(cp, pskb))
143 return 0;
144 }
145
146 tcph = (void *)(*pskb)->nh.iph + tcphoff;
147 tcph->source = cp->vport;
148
149 /* Adjust TCP checksums */
150 if (!cp->app) {
151 /* Only port and addr are changed, do fast csum update */
152 tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr,
153 cp->dport, cp->vport);
154 if ((*pskb)->ip_summed == CHECKSUM_HW)
155 (*pskb)->ip_summed = CHECKSUM_NONE;
156 } else {
157 /* full checksum calculation */
158 tcph->check = 0;
159 (*pskb)->csum = skb_checksum(*pskb, tcphoff,
160 (*pskb)->len - tcphoff, 0);
161 tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
162 (*pskb)->len - tcphoff,
163 cp->protocol,
164 (*pskb)->csum);
165 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
166 pp->name, tcph->check,
167 (char*)&(tcph->check) - (char*)tcph);
168 }
169 return 1;
170}
171
172
173static int
174tcp_dnat_handler(struct sk_buff **pskb,
175 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
176{
177 struct tcphdr *tcph;
178 unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
179
180 /* csum_check requires unshared skb */
181 if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
182 return 0;
183
184 if (unlikely(cp->app != NULL)) {
185 /* Some checks before mangling */
186 if (pp->csum_check && !pp->csum_check(*pskb, pp))
187 return 0;
188
189 /*
190 * Attempt ip_vs_app call.
191 * It will fix ip_vs_conn and iph ack_seq stuff
192 */
193 if (!ip_vs_app_pkt_in(cp, pskb))
194 return 0;
195 }
196
197 tcph = (void *)(*pskb)->nh.iph + tcphoff;
198 tcph->dest = cp->dport;
199
200 /*
201 * Adjust TCP checksums
202 */
203 if (!cp->app) {
204 /* Only port and addr are changed, do fast csum update */
205 tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr,
206 cp->vport, cp->dport);
207 if ((*pskb)->ip_summed == CHECKSUM_HW)
208 (*pskb)->ip_summed = CHECKSUM_NONE;
209 } else {
210 /* full checksum calculation */
211 tcph->check = 0;
212 (*pskb)->csum = skb_checksum(*pskb, tcphoff,
213 (*pskb)->len - tcphoff, 0);
214 tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
215 (*pskb)->len - tcphoff,
216 cp->protocol,
217 (*pskb)->csum);
218 (*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
219 }
220 return 1;
221}
222
223
224static int
225tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
226{
227 unsigned int tcphoff = skb->nh.iph->ihl*4;
228
229 switch (skb->ip_summed) {
230 case CHECKSUM_NONE:
231 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
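		/* fall through: the checksum computed above is verified below */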
232 case CHECKSUM_HW:
233 if (csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
234 skb->len - tcphoff,
235 skb->nh.iph->protocol, skb->csum)) {
236 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
237 "Failed checksum for");
238 return 0;
239 }
240 break;
241 default:
242 /* CHECKSUM_UNNECESSARY */
243 break;
244 }
245
246 return 1;
247}
248
249
250#define TCP_DIR_INPUT 0
251#define TCP_DIR_OUTPUT 4
252#define TCP_DIR_INPUT_ONLY 8
253
254static int tcp_state_off[IP_VS_DIR_LAST] = {
255 [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
256 [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
257 [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
258};
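
/*
 * The state tables below are indexed as
 *	tcp_state_table[direction_offset + event].next_state[current_state],
 * where event is 0..3 for syn/fin/ack/rst (see tcp_state_idx() further
 * down), which is why the per-direction offsets above step by four.
 */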
259
260/*
261 * Timeout table[state]
262 */
263static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
264 [IP_VS_TCP_S_NONE] = 2*HZ,
265 [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
266 [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
267 [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
268 [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
269 [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
270 [IP_VS_TCP_S_CLOSE] = 10*HZ,
271 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
272 [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
273 [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
274 [IP_VS_TCP_S_SYNACK] = 120*HZ,
275 [IP_VS_TCP_S_LAST] = 2*HZ,
276};
277
278
279#if 0
280
281/* FIXME: This is going to die */
282
283static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = {
284 [IP_VS_TCP_S_NONE] = 2*HZ,
285 [IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ,
286 [IP_VS_TCP_S_SYN_SENT] = 60*HZ,
287 [IP_VS_TCP_S_SYN_RECV] = 10*HZ,
288 [IP_VS_TCP_S_FIN_WAIT] = 60*HZ,
289 [IP_VS_TCP_S_TIME_WAIT] = 60*HZ,
290 [IP_VS_TCP_S_CLOSE] = 10*HZ,
291 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
292 [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
293 [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
294 [IP_VS_TCP_S_SYNACK] = 100*HZ,
295 [IP_VS_TCP_S_LAST] = 2*HZ,
296};
297
298#endif
299
300static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
301 [IP_VS_TCP_S_NONE] = "NONE",
302 [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
303 [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
304 [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
305 [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
306 [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
307 [IP_VS_TCP_S_CLOSE] = "CLOSE",
308 [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
309 [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
310 [IP_VS_TCP_S_LISTEN] = "LISTEN",
311 [IP_VS_TCP_S_SYNACK] = "SYNACK",
312 [IP_VS_TCP_S_LAST] = "BUG!",
313};
314
315#define sNO IP_VS_TCP_S_NONE
316#define sES IP_VS_TCP_S_ESTABLISHED
317#define sSS IP_VS_TCP_S_SYN_SENT
318#define sSR IP_VS_TCP_S_SYN_RECV
319#define sFW IP_VS_TCP_S_FIN_WAIT
320#define sTW IP_VS_TCP_S_TIME_WAIT
321#define sCL IP_VS_TCP_S_CLOSE
322#define sCW IP_VS_TCP_S_CLOSE_WAIT
323#define sLA IP_VS_TCP_S_LAST_ACK
324#define sLI IP_VS_TCP_S_LISTEN
325#define sSA IP_VS_TCP_S_SYNACK
326
327struct tcp_states_t {
328 int next_state[IP_VS_TCP_S_LAST];
329};
330
331static const char * tcp_state_name(int state)
332{
333 if (state >= IP_VS_TCP_S_LAST)
334 return "ERR!";
335 return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
336}
337
338static struct tcp_states_t tcp_states [] = {
339/* INPUT */
340/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
341/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
342/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
343/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
344/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
345
346/* OUTPUT */
347/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
348/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
349/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
350/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
351/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
352
353/* INPUT-ONLY */
354/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
355/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
356/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
357/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
358/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
359};
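
/*
 * Reading the normal table above: an incoming SYN while the connection
 * is still in sNO moves it to sSR, an incoming ACK while in sSA
 * completes the handshake and moves it to sES, and an incoming RST in
 * sES drops the connection straight to sCL.
 */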
360
361static struct tcp_states_t tcp_states_dos [] = {
362/* INPUT */
363/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
364/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
365/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
366/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
367/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
368
369/* OUTPUT */
370/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
371/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
372/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
373/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
374/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
375
376/* INPUT-ONLY */
377/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
378/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
379/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
380/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
381/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
382};
383
384static struct tcp_states_t *tcp_state_table = tcp_states;
385
386
387static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
388{
389 int on = (flags & 1); /* secure_tcp */
390
391 /*
392 ** FIXME: change secure_tcp to independent sysctl var
393 ** or make it per-service or per-app because it is valid
394 ** for most if not for all of the applications. Something
395 ** like "capabilities" (flags) for each object.
396 */
397 tcp_state_table = (on? tcp_states_dos : tcp_states);
398}
399
400static int
401tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
402{
403 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
404 tcp_state_name_table, sname, to);
405}
406
407static inline int tcp_state_idx(struct tcphdr *th)
408{
409 if (th->rst)
410 return 3;
411 if (th->syn)
412 return 0;
413 if (th->fin)
414 return 1;
415 if (th->ack)
416 return 2;
417 return -1;
418}
419
420static inline void
421set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
422 int direction, struct tcphdr *th)
423{
424 int state_idx;
425 int new_state = IP_VS_TCP_S_CLOSE;
426 int state_off = tcp_state_off[direction];
427
428 /*
429 * Update state offset to INPUT_ONLY if necessary
430 * or delete NO_OUTPUT flag if output packet detected
431 */
432 if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
433 if (state_off == TCP_DIR_OUTPUT)
434 cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
435 else
436 state_off = TCP_DIR_INPUT_ONLY;
437 }
438
439 if ((state_idx = tcp_state_idx(th)) < 0) {
440 IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
441 goto tcp_state_out;
442 }
443
444 new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
445
446 tcp_state_out:
447 if (new_state != cp->state) {
448 struct ip_vs_dest *dest = cp->dest;
449
450 IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
451 "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
452 pp->name,
453 (state_off==TCP_DIR_OUTPUT)?"output ":"input ",
454 th->syn? 'S' : '.',
455 th->fin? 'F' : '.',
456 th->ack? 'A' : '.',
457 th->rst? 'R' : '.',
458 NIPQUAD(cp->daddr), ntohs(cp->dport),
459 NIPQUAD(cp->caddr), ntohs(cp->cport),
460 tcp_state_name(cp->state),
461 tcp_state_name(new_state),
462 atomic_read(&cp->refcnt));
463 if (dest) {
464 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
465 (new_state != IP_VS_TCP_S_ESTABLISHED)) {
466 atomic_dec(&dest->activeconns);
467 atomic_inc(&dest->inactconns);
468 cp->flags |= IP_VS_CONN_F_INACTIVE;
469 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
470 (new_state == IP_VS_TCP_S_ESTABLISHED)) {
471 atomic_inc(&dest->activeconns);
472 atomic_dec(&dest->inactconns);
473 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
474 }
475 }
476 }
477
478 cp->timeout = pp->timeout_table[cp->state = new_state];
479}
480
481
482/*
483 * Handle state transitions
484 */
485static int
486tcp_state_transition(struct ip_vs_conn *cp, int direction,
487 const struct sk_buff *skb,
488 struct ip_vs_protocol *pp)
489{
490 struct tcphdr _tcph, *th;
491
492 th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
493 sizeof(_tcph), &_tcph);
494 if (th == NULL)
495 return 0;
496
497 spin_lock(&cp->lock);
498 set_tcp_state(pp, cp, direction, th);
499 spin_unlock(&cp->lock);
500
501 return 1;
502}
503
504
505/*
506 * Hash table for TCP application incarnations
507 */
508#define TCP_APP_TAB_BITS 4
509#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
510#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
511
512static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
513static DEFINE_SPINLOCK(tcp_app_lock);
514
515static inline __u16 tcp_app_hashkey(__u16 port)
516{
517 return ((port >> TCP_APP_TAB_BITS) ^ port) & TCP_APP_TAB_MASK;
518}
519
520
521static int tcp_register_app(struct ip_vs_app *inc)
522{
523 struct ip_vs_app *i;
524 __u16 hash, port = inc->port;
525 int ret = 0;
526
527 hash = tcp_app_hashkey(port);
528
529 spin_lock_bh(&tcp_app_lock);
530 list_for_each_entry(i, &tcp_apps[hash], p_list) {
531 if (i->port == port) {
532 ret = -EEXIST;
533 goto out;
534 }
535 }
536 list_add(&inc->p_list, &tcp_apps[hash]);
537 atomic_inc(&ip_vs_protocol_tcp.appcnt);
538
539 out:
540 spin_unlock_bh(&tcp_app_lock);
541 return ret;
542}
543
544
545static void
546tcp_unregister_app(struct ip_vs_app *inc)
547{
548 spin_lock_bh(&tcp_app_lock);
549 atomic_dec(&ip_vs_protocol_tcp.appcnt);
550 list_del(&inc->p_list);
551 spin_unlock_bh(&tcp_app_lock);
552}
553
554
555static int
556tcp_app_conn_bind(struct ip_vs_conn *cp)
557{
558 int hash;
559 struct ip_vs_app *inc;
560 int result = 0;
561
562 /* Default binding: bind app only for NAT */
563 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
564 return 0;
565
566 /* Lookup application incarnations and bind the right one */
567 hash = tcp_app_hashkey(cp->vport);
568
569 spin_lock(&tcp_app_lock);
570 list_for_each_entry(inc, &tcp_apps[hash], p_list) {
571 if (inc->port == cp->vport) {
572 if (unlikely(!ip_vs_app_inc_get(inc)))
573 break;
574 spin_unlock(&tcp_app_lock);
575
576 IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
577 "%u.%u.%u.%u:%u to app %s on port %u\n",
578 __FUNCTION__,
579 NIPQUAD(cp->caddr), ntohs(cp->cport),
580 NIPQUAD(cp->vaddr), ntohs(cp->vport),
581 inc->name, ntohs(inc->port));
582 cp->app = inc;
583 if (inc->init_conn)
584 result = inc->init_conn(inc, cp);
585 goto out;
586 }
587 }
588 spin_unlock(&tcp_app_lock);
589
590 out:
591 return result;
592}
593
594
595/*
596 * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
597 */
598void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
599{
600 spin_lock(&cp->lock);
601 cp->state = IP_VS_TCP_S_LISTEN;
602 cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
603 spin_unlock(&cp->lock);
604}
605
606
607static void tcp_init(struct ip_vs_protocol *pp)
608{
609 IP_VS_INIT_HASH_TABLE(tcp_apps);
610 pp->timeout_table = tcp_timeouts;
611}
612
613
614static void tcp_exit(struct ip_vs_protocol *pp)
615{
616}
617
618
619struct ip_vs_protocol ip_vs_protocol_tcp = {
620 .name = "TCP",
621 .protocol = IPPROTO_TCP,
622 .dont_defrag = 0,
623 .appcnt = ATOMIC_INIT(0),
624 .init = tcp_init,
625 .exit = tcp_exit,
626 .register_app = tcp_register_app,
627 .unregister_app = tcp_unregister_app,
628 .conn_schedule = tcp_conn_schedule,
629 .conn_in_get = tcp_conn_in_get,
630 .conn_out_get = tcp_conn_out_get,
631 .snat_handler = tcp_snat_handler,
632 .dnat_handler = tcp_dnat_handler,
633 .csum_check = tcp_csum_check,
634 .state_name = tcp_state_name,
635 .state_transition = tcp_state_transition,
636 .app_conn_bind = tcp_app_conn_bind,
637 .debug_packet = ip_vs_tcpudp_debug_packet,
638 .timeout_change = tcp_timeout_change,
639 .set_state_timeout = tcp_set_state_timeout,
640};
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
new file mode 100644
index 000000000000..8ae5f2e0aefa
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -0,0 +1,427 @@
1/*
2 * ip_vs_proto_udp.c: UDP load balancing support for IPVS
3 *
4 * Version: $Id: ip_vs_proto_udp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Julian Anastasov <ja@ssi.bg>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/netfilter_ipv4.h>
20
21#include <net/ip_vs.h>
22
23
24static struct ip_vs_conn *
25udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
26 const struct iphdr *iph, unsigned int proto_off, int inverse)
27{
28 struct ip_vs_conn *cp;
29 __u16 _ports[2], *pptr;
30
31 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
32 if (pptr == NULL)
33 return NULL;
34
35 if (likely(!inverse)) {
36 cp = ip_vs_conn_in_get(iph->protocol,
37 iph->saddr, pptr[0],
38 iph->daddr, pptr[1]);
39 } else {
40 cp = ip_vs_conn_in_get(iph->protocol,
41 iph->daddr, pptr[1],
42 iph->saddr, pptr[0]);
43 }
44
45 return cp;
46}
47
48
49static struct ip_vs_conn *
50udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
51 const struct iphdr *iph, unsigned int proto_off, int inverse)
52{
53 struct ip_vs_conn *cp;
54 __u16 _ports[2], *pptr;
55
56 pptr = skb_header_pointer(skb, skb->nh.iph->ihl*4,
57 sizeof(_ports), _ports);
58 if (pptr == NULL)
59 return NULL;
60
61 if (likely(!inverse)) {
62 cp = ip_vs_conn_out_get(iph->protocol,
63 iph->saddr, pptr[0],
64 iph->daddr, pptr[1]);
65 } else {
66 cp = ip_vs_conn_out_get(iph->protocol,
67 iph->daddr, pptr[1],
68 iph->saddr, pptr[0]);
69 }
70
71 return cp;
72}
73
74
75static int
76udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
77 int *verdict, struct ip_vs_conn **cpp)
78{
79 struct ip_vs_service *svc;
80 struct udphdr _udph, *uh;
81
82 uh = skb_header_pointer(skb, skb->nh.iph->ihl*4,
83 sizeof(_udph), &_udph);
84 if (uh == NULL) {
85 *verdict = NF_DROP;
86 return 0;
87 }
88
89 if ((svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
90 skb->nh.iph->daddr, uh->dest))) {
91 if (ip_vs_todrop()) {
92 /*
93 * It seems that we are very loaded.
94 * We have to drop this packet :(
95 */
96 ip_vs_service_put(svc);
97 *verdict = NF_DROP;
98 return 0;
99 }
100
101 /*
102 * Let the virtual server select a real server for the
103 * incoming connection, and create a connection entry.
104 */
105 *cpp = ip_vs_schedule(svc, skb);
106 if (!*cpp) {
107 *verdict = ip_vs_leave(svc, skb, pp);
108 return 0;
109 }
110 ip_vs_service_put(svc);
111 }
112 return 1;
113}
114
115
116static inline void
117udp_fast_csum_update(struct udphdr *uhdr, u32 oldip, u32 newip,
118 u16 oldport, u16 newport)
119{
120 uhdr->check =
121 ip_vs_check_diff(~oldip, newip,
122 ip_vs_check_diff(oldport ^ 0xFFFF,
123 newport, uhdr->check));
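	/* In UDP a checksum field of zero means "no checksum", so a result that folds to zero is transmitted as 0xFFFF instead (RFC 768) */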
124 if (!uhdr->check)
125 uhdr->check = 0xFFFF;
126}
127
128static int
129udp_snat_handler(struct sk_buff **pskb,
130 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
131{
132 struct udphdr *udph;
133 unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;
134
135 /* csum_check requires unshared skb */
136 if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
137 return 0;
138
139 if (unlikely(cp->app != NULL)) {
140 /* Some checks before mangling */
141 if (pp->csum_check && !pp->csum_check(*pskb, pp))
142 return 0;
143
144 /*
145 * Call application helper if needed
146 */
147 if (!ip_vs_app_pkt_out(cp, pskb))
148 return 0;
149 }
150
151 udph = (void *)(*pskb)->nh.iph + udphoff;
152 udph->source = cp->vport;
153
154 /*
155 * Adjust UDP checksums
156 */
157 if (!cp->app && (udph->check != 0)) {
158 /* Only port and addr are changed, do fast csum update */
159 udp_fast_csum_update(udph, cp->daddr, cp->vaddr,
160 cp->dport, cp->vport);
161 if ((*pskb)->ip_summed == CHECKSUM_HW)
162 (*pskb)->ip_summed = CHECKSUM_NONE;
163 } else {
164 /* full checksum calculation */
165 udph->check = 0;
166 (*pskb)->csum = skb_checksum(*pskb, udphoff,
167 (*pskb)->len - udphoff, 0);
168 udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
169 (*pskb)->len - udphoff,
170 cp->protocol,
171 (*pskb)->csum);
172 if (udph->check == 0)
173 udph->check = 0xFFFF;
174 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
175 pp->name, udph->check,
176 (char*)&(udph->check) - (char*)udph);
177 }
178 return 1;
179}
180
181
182static int
183udp_dnat_handler(struct sk_buff **pskb,
184 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
185{
186 struct udphdr *udph;
187 unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;
188
189 /* csum_check requires unshared skb */
190 if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
191 return 0;
192
193 if (unlikely(cp->app != NULL)) {
194 /* Some checks before mangling */
195 if (pp->csum_check && !pp->csum_check(*pskb, pp))
196 return 0;
197
198 /*
199 * Attempt ip_vs_app call.
200 * It will fix ip_vs_conn
201 */
202 if (!ip_vs_app_pkt_in(cp, pskb))
203 return 0;
204 }
205
206 udph = (void *)(*pskb)->nh.iph + udphoff;
207 udph->dest = cp->dport;
208
209 /*
210 * Adjust UDP checksums
211 */
212 if (!cp->app && (udph->check != 0)) {
213 /* Only port and addr are changed, do fast csum update */
214 udp_fast_csum_update(udph, cp->vaddr, cp->daddr,
215 cp->vport, cp->dport);
216 if ((*pskb)->ip_summed == CHECKSUM_HW)
217 (*pskb)->ip_summed = CHECKSUM_NONE;
218 } else {
219 /* full checksum calculation */
220 udph->check = 0;
221 (*pskb)->csum = skb_checksum(*pskb, udphoff,
222 (*pskb)->len - udphoff, 0);
223 udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
224 (*pskb)->len - udphoff,
225 cp->protocol,
226 (*pskb)->csum);
227 if (udph->check == 0)
228 udph->check = 0xFFFF;
229 (*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
230 }
231 return 1;
232}
233
234
235static int
236udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
237{
238 struct udphdr _udph, *uh;
239 unsigned int udphoff = skb->nh.iph->ihl*4;
240
241 uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
242 if (uh == NULL)
243 return 0;
244
245 if (uh->check != 0) {
246 switch (skb->ip_summed) {
247 case CHECKSUM_NONE:
248 skb->csum = skb_checksum(skb, udphoff,
249 skb->len - udphoff, 0);
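		/* fall through: the checksum computed above is verified below */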
250 case CHECKSUM_HW:
251 if (csum_tcpudp_magic(skb->nh.iph->saddr,
252 skb->nh.iph->daddr,
253 skb->len - udphoff,
254 skb->nh.iph->protocol,
255 skb->csum)) {
256 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
257 "Failed checksum for");
258 return 0;
259 }
260 break;
261 default:
262 /* CHECKSUM_UNNECESSARY */
263 break;
264 }
265 }
266 return 1;
267}
268
269
270/*
271 * Note: the caller guarantees that only one of register_app,
272 * unregister_app or app_conn_bind is called each time.
273 */
274
275#define UDP_APP_TAB_BITS 4
276#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
277#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
278
279static struct list_head udp_apps[UDP_APP_TAB_SIZE];
280static DEFINE_SPINLOCK(udp_app_lock);
281
282static inline __u16 udp_app_hashkey(__u16 port)
283{
284 return ((port >> UDP_APP_TAB_BITS) ^ port) & UDP_APP_TAB_MASK;
285}
286
287
288static int udp_register_app(struct ip_vs_app *inc)
289{
290 struct ip_vs_app *i;
291 __u16 hash, port = inc->port;
292 int ret = 0;
293
294 hash = udp_app_hashkey(port);
295
296
297 spin_lock_bh(&udp_app_lock);
298 list_for_each_entry(i, &udp_apps[hash], p_list) {
299 if (i->port == port) {
300 ret = -EEXIST;
301 goto out;
302 }
303 }
304 list_add(&inc->p_list, &udp_apps[hash]);
305 atomic_inc(&ip_vs_protocol_udp.appcnt);
306
307 out:
308 spin_unlock_bh(&udp_app_lock);
309 return ret;
310}
311
312
313static void
314udp_unregister_app(struct ip_vs_app *inc)
315{
316 spin_lock_bh(&udp_app_lock);
317 atomic_dec(&ip_vs_protocol_udp.appcnt);
318 list_del(&inc->p_list);
319 spin_unlock_bh(&udp_app_lock);
320}
321
322
323static int udp_app_conn_bind(struct ip_vs_conn *cp)
324{
325 int hash;
326 struct ip_vs_app *inc;
327 int result = 0;
328
329 /* Default binding: bind app only for NAT */
330 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
331 return 0;
332
333 /* Lookup application incarnations and bind the right one */
334 hash = udp_app_hashkey(cp->vport);
335
336 spin_lock(&udp_app_lock);
337 list_for_each_entry(inc, &udp_apps[hash], p_list) {
338 if (inc->port == cp->vport) {
339 if (unlikely(!ip_vs_app_inc_get(inc)))
340 break;
341 spin_unlock(&udp_app_lock);
342
343 IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
344 "%u.%u.%u.%u:%u to app %s on port %u\n",
345 __FUNCTION__,
346 NIPQUAD(cp->caddr), ntohs(cp->cport),
347 NIPQUAD(cp->vaddr), ntohs(cp->vport),
348 inc->name, ntohs(inc->port));
349 cp->app = inc;
350 if (inc->init_conn)
351 result = inc->init_conn(inc, cp);
352 goto out;
353 }
354 }
355 spin_unlock(&udp_app_lock);
356
357 out:
358 return result;
359}
360
361
362static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
363 [IP_VS_UDP_S_NORMAL] = 5*60*HZ,
364 [IP_VS_UDP_S_LAST] = 2*HZ,
365};
366
367static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
368 [IP_VS_UDP_S_NORMAL] = "UDP",
369 [IP_VS_UDP_S_LAST] = "BUG!",
370};
371
372
373static int
374udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
375{
376 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
377 udp_state_name_table, sname, to);
378}
379
380static const char * udp_state_name(int state)
381{
382 if (state >= IP_VS_UDP_S_LAST)
383 return "ERR!";
384 return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
385}
386
387static int
388udp_state_transition(struct ip_vs_conn *cp, int direction,
389 const struct sk_buff *skb,
390 struct ip_vs_protocol *pp)
391{
392 cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
393 return 1;
394}
395
396static void udp_init(struct ip_vs_protocol *pp)
397{
398 IP_VS_INIT_HASH_TABLE(udp_apps);
399 pp->timeout_table = udp_timeouts;
400}
401
402static void udp_exit(struct ip_vs_protocol *pp)
403{
404}
405
406
407struct ip_vs_protocol ip_vs_protocol_udp = {
408 .name = "UDP",
409 .protocol = IPPROTO_UDP,
410 .dont_defrag = 0,
411 .init = udp_init,
412 .exit = udp_exit,
413 .conn_schedule = udp_conn_schedule,
414 .conn_in_get = udp_conn_in_get,
415 .conn_out_get = udp_conn_out_get,
416 .snat_handler = udp_snat_handler,
417 .dnat_handler = udp_dnat_handler,
418 .csum_check = udp_csum_check,
419 .state_transition = udp_state_transition,
420 .state_name = udp_state_name,
421 .register_app = udp_register_app,
422 .unregister_app = udp_unregister_app,
423 .app_conn_bind = udp_app_conn_bind,
424 .debug_packet = ip_vs_tcpudp_debug_packet,
425 .timeout_change = NULL,
426 .set_state_timeout = udp_set_state_timeout,
427};
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c
new file mode 100644
index 000000000000..b23bab231cab
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_rr.c
@@ -0,0 +1,118 @@
1/*
2 * IPVS: Round-Robin Scheduling module
3 *
4 * Version: $Id: ip_vs_rr.c,v 1.9 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Peter Kese <peter.kese@ijs.si>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Fixes/Changes:
15 * Wensong Zhang : changed the ip_vs_rr_schedule to return dest
16 * Julian Anastasov : fixed the NULL pointer access bug in debugging
17 * Wensong Zhang : changed some cosmetic things for debugging
18 * Wensong Zhang : changed for the d-linked destination list
19 * Wensong Zhang : added the ip_vs_rr_update_svc
20 * Wensong Zhang : added any dest with weight=0 is quiesced
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/kernel.h>
26
27#include <net/ip_vs.h>
28
29
30static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
31{
32 svc->sched_data = &svc->destinations;
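	/* sched_data keeps the list position of the last destination handed out; start at the head */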
33 return 0;
34}
35
36
37static int ip_vs_rr_done_svc(struct ip_vs_service *svc)
38{
39 return 0;
40}
41
42
43static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
44{
45 svc->sched_data = &svc->destinations;
46 return 0;
47}
48
49
50/*
51 * Round-Robin Scheduling
52 */
53static struct ip_vs_dest *
54ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
55{
56 struct list_head *p, *q;
57 struct ip_vs_dest *dest;
58
59 IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
60
61 write_lock(&svc->sched_lock);
62 p = (struct list_head *)svc->sched_data;
63 p = p->next;
64 q = p;
65 do {
66 /* skip list head */
67 if (q == &svc->destinations) {
68 q = q->next;
69 continue;
70 }
71
72 dest = list_entry(q, struct ip_vs_dest, n_list);
73 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
74 atomic_read(&dest->weight) > 0)
75 /* HIT */
76 goto out;
77 q = q->next;
78 } while (q != p);
79 write_unlock(&svc->sched_lock);
80 return NULL;
81
82 out:
83 svc->sched_data = q;
84 write_unlock(&svc->sched_lock);
85 IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u "
86 "activeconns %d refcnt %d weight %d\n",
87 NIPQUAD(dest->addr), ntohs(dest->port),
88 atomic_read(&dest->activeconns),
89 atomic_read(&dest->refcnt), atomic_read(&dest->weight));
90
91 return dest;
92}
93
94
95static struct ip_vs_scheduler ip_vs_rr_scheduler = {
96 .name = "rr", /* name */
97 .refcnt = ATOMIC_INIT(0),
98 .module = THIS_MODULE,
99 .init_service = ip_vs_rr_init_svc,
100 .done_service = ip_vs_rr_done_svc,
101 .update_service = ip_vs_rr_update_svc,
102 .schedule = ip_vs_rr_schedule,
103};
104
105static int __init ip_vs_rr_init(void)
106{
107 INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list);
108 return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
109}
110
111static void __exit ip_vs_rr_cleanup(void)
112{
113 unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
114}
115
116module_init(ip_vs_rr_init);
117module_exit(ip_vs_rr_cleanup);
118MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c
new file mode 100644
index 000000000000..0f7c56a225bd
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sched.c
@@ -0,0 +1,251 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_sched.c,v 1.13 2003/05/10 03:05:23 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
17 *
18 * Changes:
19 *
20 */
21
22#include <linux/module.h>
23#include <linux/sched.h>
24#include <linux/spinlock.h>
25#include <asm/string.h>
26#include <linux/kmod.h>
27
28#include <net/ip_vs.h>
29
30/*
31 * IPVS scheduler list
32 */
33static LIST_HEAD(ip_vs_schedulers);
34
35/* lock for service table */
36static DEFINE_RWLOCK(__ip_vs_sched_lock);
37
38
39/*
40 * Bind a service with a scheduler
41 */
42int ip_vs_bind_scheduler(struct ip_vs_service *svc,
43 struct ip_vs_scheduler *scheduler)
44{
45 int ret;
46
47 if (svc == NULL) {
48 IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
49 return -EINVAL;
50 }
51 if (scheduler == NULL) {
52 IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
53 return -EINVAL;
54 }
55
56 svc->scheduler = scheduler;
57
58 if (scheduler->init_service) {
59 ret = scheduler->init_service(svc);
60 if (ret) {
61 IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
62 return ret;
63 }
64 }
65
66 return 0;
67}
68
69
70/*
71 * Unbind a service with its scheduler
72 */
73int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
74{
75 struct ip_vs_scheduler *sched;
76
77 if (svc == NULL) {
78 IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
79 return -EINVAL;
80 }
81
82 sched = svc->scheduler;
83 if (sched == NULL) {
84 IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
85 return -EINVAL;
86 }
87
88 if (sched->done_service) {
89 if (sched->done_service(svc) != 0) {
90 IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
91 return -EINVAL;
92 }
93 }
94
95 svc->scheduler = NULL;
96 return 0;
97}
98
99
100/*
101 * Get scheduler in the scheduler list by name
102 */
103static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
104{
105 struct ip_vs_scheduler *sched;
106
107 IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
108 sched_name);
109
110 read_lock_bh(&__ip_vs_sched_lock);
111
112 list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
113 /*
114 * Test and get the modules atomically
115 */
116 if (sched->module && !try_module_get(sched->module)) {
117 /*
118 * This scheduler is just deleted
119 */
120 continue;
121 }
122 if (strcmp(sched_name, sched->name)==0) {
123 /* HIT */
124 read_unlock_bh(&__ip_vs_sched_lock);
125 return sched;
126 }
127 if (sched->module)
128 module_put(sched->module);
129 }
130
131 read_unlock_bh(&__ip_vs_sched_lock);
132 return NULL;
133}
134
135
136/*
137 * Lookup scheduler and try to load it if it doesn't exist
138 */
139struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
140{
141 struct ip_vs_scheduler *sched;
142
143 /*
144 * Search for the scheduler by sched_name
145 */
146 sched = ip_vs_sched_getbyname(sched_name);
147
148 /*
149 * If scheduler not found, load the module and search again
150 */
151 if (sched == NULL) {
152 request_module("ip_vs_%s", sched_name);
153 sched = ip_vs_sched_getbyname(sched_name);
154 }
155
156 return sched;
157}
158
159void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
160{
161 if (scheduler->module)
162 module_put(scheduler->module);
163}
164
165
166/*
167 * Register a scheduler in the scheduler list
168 */
169int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
170{
171 struct ip_vs_scheduler *sched;
172
173 if (!scheduler) {
174 IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
175 return -EINVAL;
176 }
177
178 if (!scheduler->name) {
179 IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
180 return -EINVAL;
181 }
182
183 /* increase the module use count */
184 ip_vs_use_count_inc();
185
186 /*
187 * Make sure that the scheduler with this name doesn't exist
188 * in the scheduler list.
189 */
190 sched = ip_vs_sched_getbyname(scheduler->name);
191 if (sched) {
192 ip_vs_scheduler_put(sched);
193 ip_vs_use_count_dec();
194 IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
195 "already exists in the system\n", scheduler->name);
196 return -EINVAL;
197 }
198
199 write_lock_bh(&__ip_vs_sched_lock);
200
201 if (scheduler->n_list.next != &scheduler->n_list) {
202 write_unlock_bh(&__ip_vs_sched_lock);
203 ip_vs_use_count_dec();
204 IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
205 "already linked\n", scheduler->name);
206 return -EINVAL;
207 }
208
209 /*
210 * Add it into the d-linked scheduler list
211 */
212 list_add(&scheduler->n_list, &ip_vs_schedulers);
213 write_unlock_bh(&__ip_vs_sched_lock);
214
215 IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name);
216
217 return 0;
218}
219
220
221/*
222 * Unregister a scheduler from the scheduler list
223 */
224int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
225{
226 if (!scheduler) {
227 IP_VS_ERR("unregister_ip_vs_scheduler(): NULL arg\n");
228 return -EINVAL;
229 }
230
231 write_lock_bh(&__ip_vs_sched_lock);
232 if (scheduler->n_list.next == &scheduler->n_list) {
233 write_unlock_bh(&__ip_vs_sched_lock);
234 IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler "
235 "is not in the list. failed\n", scheduler->name);
236 return -EINVAL;
237 }
238
239 /*
240 * Remove it from the d-linked scheduler list
241 */
242 list_del(&scheduler->n_list);
243 write_unlock_bh(&__ip_vs_sched_lock);
244
245 /* decrease the module use count */
246 ip_vs_use_count_dec();
247
248 IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name);
249
250 return 0;
251}
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c
new file mode 100644
index 000000000000..ff366f7390d9
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sed.c
@@ -0,0 +1,163 @@
1/*
2 * IPVS: Shortest Expected Delay scheduling module
3 *
4 * Version: $Id: ip_vs_sed.c,v 1.1 2003/05/10 03:06:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 *
15 */
16
17/*
18 * The SED algorithm attempts to minimize each job's expected delay until
19 * completion. The expected delay that the job will experience is
20 * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
 21 * jobs on the ith server and Ui is the fixed service rate (weight) of
 22 * the ith server. The SED algorithm adopts a greedy policy in which each
 23 * job acts in its own best interest, i.e. it joins the queue that would
 24 * minimize its expected delay of completion.
25 *
26 * See the following paper for more information:
27 * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
28 * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
29 * pages 986-994, 1988.
30 *
31 * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
32 *
33 * The difference between SED and WLC is that SED includes the incoming
34 * job in the cost function (the increment of 1). SED may outperform
 35 * WLC when scheduling big jobs in large heterogeneous systems
 36 * (where the server weights vary a lot).
37 *
38 */
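/*
 * A small worked illustration (hypothetical numbers): with two servers
 * A (Ca = 0 active jobs, weight Ua = 1) and B (Cb = 1, Ub = 3), a
 * WLC-style comparison of Ca/Ua = 0 with Cb/Ub = 1/3 picks A, while SED
 * compares (Ca + 1)/Ua = 1 with (Cb + 1)/Ub = 2/3 and picks B, because
 * the incoming job itself is counted in the expected delay.
 */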
39
40#include <linux/module.h>
41#include <linux/kernel.h>
42
43#include <net/ip_vs.h>
44
45
46static int
47ip_vs_sed_init_svc(struct ip_vs_service *svc)
48{
49 return 0;
50}
51
52
53static int
54ip_vs_sed_done_svc(struct ip_vs_service *svc)
55{
56 return 0;
57}
58
59
60static int
61ip_vs_sed_update_svc(struct ip_vs_service *svc)
62{
63 return 0;
64}
65
66
67static inline unsigned int
68ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
69{
70 /*
71 * We only use the active connection number in the cost
72 * calculation here.
73 */
74 return atomic_read(&dest->activeconns) + 1;
75}
76
77
78/*
 79 * Shortest Expected Delay scheduling
80 */
81static struct ip_vs_dest *
82ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
83{
84 struct ip_vs_dest *dest, *least;
85 unsigned int loh, doh;
86
87 IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n");
88
89 /*
90 * We calculate the load of each dest server as follows:
91 * (server expected overhead) / dest->weight
92 *
93 * Remember -- no floats in kernel mode!!!
94 * The comparison of h1*w2 > h2*w1 is equivalent to that of
95 * h1/w1 > h2/w2
96 * if every weight is larger than zero.
97 *
98 * The server with weight=0 is quiesced and will not receive any
99 * new connections.
100 */
101
102 list_for_each_entry(dest, &svc->destinations, n_list) {
103 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
104 atomic_read(&dest->weight) > 0) {
105 least = dest;
106 loh = ip_vs_sed_dest_overhead(least);
107 goto nextstage;
108 }
109 }
110 return NULL;
111
112 /*
113 * Find the destination with the least load.
114 */
115 nextstage:
116 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
117 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
118 continue;
119 doh = ip_vs_sed_dest_overhead(dest);
120 if (loh * atomic_read(&dest->weight) >
121 doh * atomic_read(&least->weight)) {
122 least = dest;
123 loh = doh;
124 }
125 }
126
127 IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u "
128 "activeconns %d refcnt %d weight %d overhead %d\n",
129 NIPQUAD(least->addr), ntohs(least->port),
130 atomic_read(&least->activeconns),
131 atomic_read(&least->refcnt),
132 atomic_read(&least->weight), loh);
133
134 return least;
135}
136
137
138static struct ip_vs_scheduler ip_vs_sed_scheduler =
139{
140 .name = "sed",
141 .refcnt = ATOMIC_INIT(0),
142 .module = THIS_MODULE,
143 .init_service = ip_vs_sed_init_svc,
144 .done_service = ip_vs_sed_done_svc,
145 .update_service = ip_vs_sed_update_svc,
146 .schedule = ip_vs_sed_schedule,
147};
148
149
150static int __init ip_vs_sed_init(void)
151{
152 INIT_LIST_HEAD(&ip_vs_sed_scheduler.n_list);
153 return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
154}
155
156static void __exit ip_vs_sed_cleanup(void)
157{
158 unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
159}
160
161module_init(ip_vs_sed_init);
162module_exit(ip_vs_sed_cleanup);
163MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
new file mode 100644
index 000000000000..6f7c50e44a39
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sh.c
@@ -0,0 +1,255 @@
1/*
2 * IPVS: Source Hashing scheduling module
3 *
4 * Version: $Id: ip_vs_sh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@gnuchina.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 *
15 */
16
17/*
18 * The sh algorithm is to select server by the hash key of source IP
19 * address. The pseudo code is as follows:
20 *
21 * n <- servernode[src_ip];
22 * if (n is dead) OR
23 * (n is overloaded) or (n.weight <= 0) then
24 * return NULL;
25 *
26 * return n;
27 *
 28 * Note that servernode is a 256-bucket hash table that maps the hash
 29 * index derived from the packet source IP address to the current server
 30 * array. If the sh scheduler is used in a cache cluster, it is good to
 31 * combine it with the cache_bypass feature. When the statically assigned
32 * server is dead or overloaded, the load balancer can bypass the cache
33 * server and send requests to the original server directly.
34 *
35 */
36
37#include <linux/module.h>
38#include <linux/kernel.h>
39
40#include <net/ip_vs.h>
41
42
43/*
44 * IPVS SH bucket
45 */
46struct ip_vs_sh_bucket {
47 struct ip_vs_dest *dest; /* real server (cache) */
48};
49
50/*
51 * for IPVS SH entry hash table
52 */
53#ifndef CONFIG_IP_VS_SH_TAB_BITS
54#define CONFIG_IP_VS_SH_TAB_BITS 8
55#endif
56#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS
57#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
58#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
59
60
61/*
62 * Returns hash value for IPVS SH entry
63 */
64static inline unsigned ip_vs_sh_hashkey(__u32 addr)
65{
66 return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK;
67}
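/*
 * The multiplier 2654435761 is the classic golden-ratio (Fibonacci)
 * multiplicative hashing constant, roughly 2^32 / phi, so nearby source
 * addresses are spread fairly evenly over the table.  A minimal
 * user-space sketch of the same mapping, assuming the default 8-bit
 * (256-bucket) table, for illustration only:
 *
 *	unsigned int sh_bucket(unsigned int saddr_host_order)
 *	{
 *		return (saddr_host_order * 2654435761UL) & 255;
 *	}
 */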
68
69
70/*
71 * Get ip_vs_dest associated with supplied parameters.
72 */
73static inline struct ip_vs_dest *
74ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __u32 addr)
75{
76 return (tbl[ip_vs_sh_hashkey(addr)]).dest;
77}
78
79
80/*
81 * Assign all the hash buckets of the specified table with the service.
82 */
83static int
84ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
85{
86 int i;
87 struct ip_vs_sh_bucket *b;
88 struct list_head *p;
89 struct ip_vs_dest *dest;
90
91 b = tbl;
92 p = &svc->destinations;
93 for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
94 if (list_empty(p)) {
95 b->dest = NULL;
96 } else {
97 if (p == &svc->destinations)
98 p = p->next;
99
100 dest = list_entry(p, struct ip_vs_dest, n_list);
101 atomic_inc(&dest->refcnt);
102 b->dest = dest;
103
104 p = p->next;
105 }
106 b++;
107 }
108 return 0;
109}
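/*
 * For illustration: with, say, three destinations D1, D2, D3 on the
 * list, the loop above fills the buckets cyclically as D1, D2, D3,
 * D1, D2, D3, ... and takes one reference on the destination for each
 * bucket it occupies; with an empty destination list every bucket is
 * left NULL and ip_vs_sh_schedule() will simply return NULL.
 */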
110
111
112/*
113 * Flush all the hash buckets of the specified table.
114 */
115static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
116{
117 int i;
118 struct ip_vs_sh_bucket *b;
119
120 b = tbl;
121 for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
122 if (b->dest) {
123 atomic_dec(&b->dest->refcnt);
124 b->dest = NULL;
125 }
126 b++;
127 }
128}
129
130
131static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
132{
133 struct ip_vs_sh_bucket *tbl;
134
135 /* allocate the SH table for this service */
136 tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
137 GFP_ATOMIC);
138 if (tbl == NULL) {
139 IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
140 return -ENOMEM;
141 }
142 svc->sched_data = tbl;
143 IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
144 "current service\n",
145 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
146
147 /* assign the hash buckets with the updated service */
148 ip_vs_sh_assign(tbl, svc);
149
150 return 0;
151}
152
153
154static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
155{
156 struct ip_vs_sh_bucket *tbl = svc->sched_data;
157
158 /* got to clean up hash buckets here */
159 ip_vs_sh_flush(tbl);
160
161 /* release the table itself */
162 kfree(svc->sched_data);
163 IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
164 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
165
166 return 0;
167}
168
169
170static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
171{
172 struct ip_vs_sh_bucket *tbl = svc->sched_data;
173
174 /* got to clean up hash buckets here */
175 ip_vs_sh_flush(tbl);
176
177 /* assign the hash buckets with the updated service */
178 ip_vs_sh_assign(tbl, svc);
179
180 return 0;
181}
182
183
184/*
185 * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
186 * consider that the server is overloaded here.
187 */
188static inline int is_overloaded(struct ip_vs_dest *dest)
189{
190 return dest->flags & IP_VS_DEST_F_OVERLOAD;
191}
192
193
194/*
195 * Source Hashing scheduling
196 */
197static struct ip_vs_dest *
198ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
199{
200 struct ip_vs_dest *dest;
201 struct ip_vs_sh_bucket *tbl;
202 struct iphdr *iph = skb->nh.iph;
203
204 IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
205
206 tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
207 dest = ip_vs_sh_get(tbl, iph->saddr);
208 if (!dest
209 || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
210 || atomic_read(&dest->weight) <= 0
211 || is_overloaded(dest)) {
212 return NULL;
213 }
214
215 IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
216 "--> server %u.%u.%u.%u:%d\n",
217 NIPQUAD(iph->saddr),
218 NIPQUAD(dest->addr),
219 ntohs(dest->port));
220
221 return dest;
222}
223
224
225/*
226 * IPVS SH Scheduler structure
227 */
228static struct ip_vs_scheduler ip_vs_sh_scheduler =
229{
230 .name = "sh",
231 .refcnt = ATOMIC_INIT(0),
232 .module = THIS_MODULE,
233 .init_service = ip_vs_sh_init_svc,
234 .done_service = ip_vs_sh_done_svc,
235 .update_service = ip_vs_sh_update_svc,
236 .schedule = ip_vs_sh_schedule,
237};
238
239
240static int __init ip_vs_sh_init(void)
241{
242 INIT_LIST_HEAD(&ip_vs_sh_scheduler.n_list);
243 return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
244}
245
246
247static void __exit ip_vs_sh_cleanup(void)
248{
249 unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
250}
251
252
253module_init(ip_vs_sh_init);
254module_exit(ip_vs_sh_cleanup);
255MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
new file mode 100644
index 000000000000..25c479550a32
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -0,0 +1,892 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 *
12 * ip_vs_sync: sync connection info from master load balancer to backups
13 * through multicast
14 *
15 * Changes:
16 * Alexandre Cassen : Added master & backup support at a time.
17 * Alexandre Cassen : Added SyncID support for incoming sync
18 * messages filtering.
19 * Justin Ossevoort : Fix endian problem on sync message size.
20 */
21
22#include <linux/module.h>
23#include <linux/slab.h>
24#include <linux/net.h>
25#include <linux/completion.h>
26#include <linux/delay.h>
27#include <linux/skbuff.h>
28#include <linux/in.h>
29#include <linux/igmp.h> /* for ip_mc_join_group */
30
31#include <net/ip.h>
32#include <net/sock.h>
33#include <asm/uaccess.h> /* for get_fs and set_fs */
34
35#include <net/ip_vs.h>
36
37#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
38#define IP_VS_SYNC_PORT 8848 /* multicast port */
39
40
41/*
42 * IPVS sync connection entry
43 */
44struct ip_vs_sync_conn {
45 __u8 reserved;
46
47 /* Protocol, addresses and port numbers */
48 __u8 protocol; /* Which protocol (TCP/UDP) */
49 __u16 cport;
50 __u16 vport;
51 __u16 dport;
52 __u32 caddr; /* client address */
53 __u32 vaddr; /* virtual address */
54 __u32 daddr; /* destination address */
55
56 /* Flags and state transition */
57 __u16 flags; /* status flags */
58 __u16 state; /* state info */
59
60 /* The sequence options start here */
61};
62
63struct ip_vs_sync_conn_options {
64 struct ip_vs_seq in_seq; /* incoming seq. struct */
65 struct ip_vs_seq out_seq; /* outgoing seq. struct */
66};
67
68#define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ)
69#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
70#define FULL_CONN_SIZE \
71(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
72
73
74/*
75 The master multicasts messages to the backup load balancers in the
76 following format.
77
78 0 1 2 3
79 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
80 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
81 | Count Conns | SyncID | Size |
82 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83 | |
84 | IPVS Sync Connection (1) |
85 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
86 | . |
87 | . |
88 | . |
89 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 | |
91 | IPVS Sync Connection (n) |
92 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
93*/
94
95#define SYNC_MESG_HEADER_LEN 4
96
97struct ip_vs_sync_mesg {
98 __u8 nr_conns;
99 __u8 syncid;
100 __u16 size;
101
102 /* ip_vs_sync_conn entries start here */
103};
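/*
 * Note on byte order: only the 16-bit size field of this header is
 * converted with htons()/ntohs() (see ip_vs_send_sync_msg() and
 * ip_vs_process_message() below); nr_conns and syncid are single
 * bytes.  Within each ip_vs_sync_conn entry, flags and state are
 * likewise converted, while the addresses and ports are copied
 * without conversion.
 */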
104
105/* the maximum length of sync (sending/receiving) message */
106static int sync_send_mesg_maxlen;
107static int sync_recv_mesg_maxlen;
108
109struct ip_vs_sync_buff {
110 struct list_head list;
111 unsigned long firstuse;
112
113 /* pointers for the message data */
114 struct ip_vs_sync_mesg *mesg;
115 unsigned char *head;
116 unsigned char *end;
117};
118
119
120/* the sync_buff list head and the lock */
121static LIST_HEAD(ip_vs_sync_queue);
122static DEFINE_SPINLOCK(ip_vs_sync_lock);
123
124/* current sync_buff for accepting new conn entries */
125static struct ip_vs_sync_buff *curr_sb = NULL;
126static DEFINE_SPINLOCK(curr_sb_lock);
127
128/* ipvs sync daemon state */
129volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
130volatile int ip_vs_master_syncid = 0;
131volatile int ip_vs_backup_syncid = 0;
132
133/* multicast interface name */
134char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
135char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
136
137/* multicast addr */
138static struct sockaddr_in mcast_addr;
139
140
141static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
142{
143 spin_lock(&ip_vs_sync_lock);
144 list_add_tail(&sb->list, &ip_vs_sync_queue);
145 spin_unlock(&ip_vs_sync_lock);
146}
147
148static inline struct ip_vs_sync_buff * sb_dequeue(void)
149{
150 struct ip_vs_sync_buff *sb;
151
152 spin_lock_bh(&ip_vs_sync_lock);
153 if (list_empty(&ip_vs_sync_queue)) {
154 sb = NULL;
155 } else {
156 sb = list_entry(ip_vs_sync_queue.next,
157 struct ip_vs_sync_buff,
158 list);
159 list_del(&sb->list);
160 }
161 spin_unlock_bh(&ip_vs_sync_lock);
162
163 return sb;
164}
165
166static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
167{
168 struct ip_vs_sync_buff *sb;
169
170 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
171 return NULL;
172
173 if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
174 kfree(sb);
175 return NULL;
176 }
177 sb->mesg->nr_conns = 0;
178 sb->mesg->syncid = ip_vs_master_syncid;
179 sb->mesg->size = 4;
180 sb->head = (unsigned char *)sb->mesg + 4;
181 sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
182 sb->firstuse = jiffies;
183 return sb;
184}
185
186static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
187{
188 kfree(sb->mesg);
189 kfree(sb);
190}
191
192/*
193 * Get the current sync buffer if it has been created for more
194 * than the specified time or the specified time is zero.
195 */
196static inline struct ip_vs_sync_buff *
197get_curr_sync_buff(unsigned long time)
198{
199 struct ip_vs_sync_buff *sb;
200
201 spin_lock_bh(&curr_sb_lock);
202 if (curr_sb && (time == 0 ||
203 time_before(jiffies - curr_sb->firstuse, time))) {
204 sb = curr_sb;
205 curr_sb = NULL;
206 } else
207 sb = NULL;
208 spin_unlock_bh(&curr_sb_lock);
209 return sb;
210}
211
212
213/*
214 * Add an ip_vs_conn information into the current sync_buff.
215 * Called by ip_vs_in.
216 */
217void ip_vs_sync_conn(struct ip_vs_conn *cp)
218{
219 struct ip_vs_sync_mesg *m;
220 struct ip_vs_sync_conn *s;
221 int len;
222
223 spin_lock(&curr_sb_lock);
224 if (!curr_sb) {
225 if (!(curr_sb=ip_vs_sync_buff_create())) {
226 spin_unlock(&curr_sb_lock);
227 IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
228 return;
229 }
230 }
231
232 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
233 SIMPLE_CONN_SIZE;
234 m = curr_sb->mesg;
235 s = (struct ip_vs_sync_conn *)curr_sb->head;
236
237 /* copy members */
238 s->protocol = cp->protocol;
239 s->cport = cp->cport;
240 s->vport = cp->vport;
241 s->dport = cp->dport;
242 s->caddr = cp->caddr;
243 s->vaddr = cp->vaddr;
244 s->daddr = cp->daddr;
245 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
246 s->state = htons(cp->state);
247 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
248 struct ip_vs_sync_conn_options *opt =
249 (struct ip_vs_sync_conn_options *)&s[1];
250 memcpy(opt, &cp->in_seq, sizeof(*opt));
251 }
252
253 m->nr_conns++;
254 m->size += len;
255 curr_sb->head += len;
256
257 /* check if there is a space for next one */
258 if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
259 sb_queue_tail(curr_sb);
260 curr_sb = NULL;
261 }
262 spin_unlock(&curr_sb_lock);
263
264 /* synchronize its controller if it has */
265 if (cp->control)
266 ip_vs_sync_conn(cp->control);
267}
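/*
 * Each call above appends one SIMPLE_CONN_SIZE or FULL_CONN_SIZE
 * record to the current buffer, and the buffer is queued for
 * transmission as soon as another full-size record would no longer
 * fit.  Note the tail recursion: syncing a controlled connection
 * (e.g. an FTP data connection) also syncs its controlling
 * connection via cp->control.
 */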
268
269
270/*
271 * Process received multicast message and create the corresponding
272 * ip_vs_conn entries.
273 */
274static void ip_vs_process_message(const char *buffer, const size_t buflen)
275{
276 struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
277 struct ip_vs_sync_conn *s;
278 struct ip_vs_sync_conn_options *opt;
279 struct ip_vs_conn *cp;
280 char *p;
281 int i;
282
283 /* Convert size back to host byte order */
284 m->size = ntohs(m->size);
285
286 if (buflen != m->size) {
287 IP_VS_ERR("bogus message\n");
288 return;
289 }
290
291 /* SyncID sanity check */
292 if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
293 IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
294 m->syncid);
295 return;
296 }
297
298 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
299 for (i=0; i<m->nr_conns; i++) {
300 s = (struct ip_vs_sync_conn *)p;
301 cp = ip_vs_conn_in_get(s->protocol,
302 s->caddr, s->cport,
303 s->vaddr, s->vport);
304 if (!cp) {
305 cp = ip_vs_conn_new(s->protocol,
306 s->caddr, s->cport,
307 s->vaddr, s->vport,
308 s->daddr, s->dport,
309 ntohs(s->flags), NULL);
310 if (!cp) {
311 IP_VS_ERR("ip_vs_conn_new failed\n");
312 return;
313 }
314 cp->state = ntohs(s->state);
315 } else if (!cp->dest) {
316 /* it is an entry created by the synchronization */
317 cp->state = ntohs(s->state);
318 cp->flags = ntohs(s->flags) | IP_VS_CONN_F_HASHED;
319 } /* Note that we don't touch its state and flags
320 if it is a normal entry. */
321
322 if (ntohs(s->flags) & IP_VS_CONN_F_SEQ_MASK) {
323 opt = (struct ip_vs_sync_conn_options *)&s[1];
324 memcpy(&cp->in_seq, opt, sizeof(*opt));
325 p += FULL_CONN_SIZE;
326 } else
327 p += SIMPLE_CONN_SIZE;
328
329 atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
330 cp->timeout = IP_VS_SYNC_CONN_TIMEOUT;
331 ip_vs_conn_put(cp);
332
333 if (p > buffer+buflen) {
334 IP_VS_ERR("bogus message\n");
335 return;
336 }
337 }
338}
339
340
341/*
342 * Setup loopback of outgoing multicasts on a sending socket
343 */
344static void set_mcast_loop(struct sock *sk, u_char loop)
345{
346 struct inet_sock *inet = inet_sk(sk);
347
348 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
349 lock_sock(sk);
350 inet->mc_loop = loop ? 1 : 0;
351 release_sock(sk);
352}
353
354/*
355 * Specify TTL for outgoing multicasts on a sending socket
356 */
357static void set_mcast_ttl(struct sock *sk, u_char ttl)
358{
359 struct inet_sock *inet = inet_sk(sk);
360
361 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
362 lock_sock(sk);
363 inet->mc_ttl = ttl;
364 release_sock(sk);
365}
366
367/*
368 * Specify the default interface for outgoing multicasts
369 */
370static int set_mcast_if(struct sock *sk, char *ifname)
371{
372 struct net_device *dev;
373 struct inet_sock *inet = inet_sk(sk);
374
375 if ((dev = __dev_get_by_name(ifname)) == NULL)
376 return -ENODEV;
377
378 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
379 return -EINVAL;
380
381 lock_sock(sk);
382 inet->mc_index = dev->ifindex;
383 /* inet->mc_addr = 0; */
384 release_sock(sk);
385
386 return 0;
387}
388
389
390/*
391 * Set the maximum length of sync message according to the
392 * specified interface's MTU.
393 */
394static int set_sync_mesg_maxlen(int sync_state)
395{
396 struct net_device *dev;
397 int num;
398
399 if (sync_state == IP_VS_STATE_MASTER) {
400 if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL)
401 return -ENODEV;
402
403 num = (dev->mtu - sizeof(struct iphdr) -
404 sizeof(struct udphdr) -
405 SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
406 sync_send_mesg_maxlen =
407 SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num;
408 IP_VS_DBG(7, "setting the maximum length of sync sending "
409 "message %d.\n", sync_send_mesg_maxlen);
410 } else if (sync_state == IP_VS_STATE_BACKUP) {
411 if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL)
412 return -ENODEV;
413
414 sync_recv_mesg_maxlen = dev->mtu -
415 sizeof(struct iphdr) - sizeof(struct udphdr);
416 IP_VS_DBG(7, "setting the maximum length of sync receiving "
417 "message %d.\n", sync_recv_mesg_maxlen);
418 }
419
420 return 0;
421}
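/*
 * A worked example (illustration only), assuming a 1500-byte Ethernet
 * MTU and a SIMPLE_CONN_SIZE of 24 bytes:
 *
 *	master:	num = (1500 - 20 - 8 - 4 - 20) / 24 = 60
 *		sync_send_mesg_maxlen = 4 + 24 * 60 = 1444 bytes,
 *		i.e. at most 60 simple entries per multicast datagram
 *	backup:	sync_recv_mesg_maxlen = 1500 - 20 - 8 = 1472 bytes
 */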
422
423
424/*
425 * Join a multicast group.
426 * the group is specified by a class D multicast address (224.0.0.0/4)
427 * in the in_addr structure passed in as a parameter.
428 */
429static int
430join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
431{
432 struct ip_mreqn mreq;
433 struct net_device *dev;
434 int ret;
435
436 memset(&mreq, 0, sizeof(mreq));
437 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
438
439 if ((dev = __dev_get_by_name(ifname)) == NULL)
440 return -ENODEV;
441 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
442 return -EINVAL;
443
444 mreq.imr_ifindex = dev->ifindex;
445
446 lock_sock(sk);
447 ret = ip_mc_join_group(sk, &mreq);
448 release_sock(sk);
449
450 return ret;
451}
452
453
454static int bind_mcastif_addr(struct socket *sock, char *ifname)
455{
456 struct net_device *dev;
457 u32 addr;
458 struct sockaddr_in sin;
459
460 if ((dev = __dev_get_by_name(ifname)) == NULL)
461 return -ENODEV;
462
463 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
464 if (!addr)
465 IP_VS_ERR("You probably need to specify IP address on "
466 "multicast interface.\n");
467
468 IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
469 ifname, NIPQUAD(addr));
470
471 /* Now bind the socket with the address of multicast interface */
472 sin.sin_family = AF_INET;
473 sin.sin_addr.s_addr = addr;
474 sin.sin_port = 0;
475
476 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
477}
478
479/*
480 * Set up sending multicast socket over UDP
481 */
482static struct socket * make_send_sock(void)
483{
484 struct socket *sock;
485
486 /* First create a socket */
487 if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
488 IP_VS_ERR("Error during creation of socket; terminating\n");
489 return NULL;
490 }
491
492 if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) {
493 IP_VS_ERR("Error setting outbound mcast interface\n");
494 goto error;
495 }
496
497 set_mcast_loop(sock->sk, 0);
498 set_mcast_ttl(sock->sk, 1);
499
500 if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) {
501 IP_VS_ERR("Error binding address of the mcast interface\n");
502 goto error;
503 }
504
505 if (sock->ops->connect(sock,
506 (struct sockaddr*)&mcast_addr,
507 sizeof(struct sockaddr), 0) < 0) {
508 IP_VS_ERR("Error connecting to the multicast addr\n");
509 goto error;
510 }
511
512 return sock;
513
514 error:
515 sock_release(sock);
516 return NULL;
517}
518
519
520/*
521 * Set up receiving multicast socket over UDP
522 */
523static struct socket * make_receive_sock(void)
524{
525 struct socket *sock;
526
527 /* First create a socket */
528 if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
529 IP_VS_ERR("Error during creation of socket; terminating\n");
530 return NULL;
531 }
532
533 /* it is equivalent to the REUSEADDR option in user-space */
534 sock->sk->sk_reuse = 1;
535
536 if (sock->ops->bind(sock,
537 (struct sockaddr*)&mcast_addr,
538 sizeof(struct sockaddr)) < 0) {
539 IP_VS_ERR("Error binding to the multicast addr\n");
540 goto error;
541 }
542
543 /* join the multicast group */
544 if (join_mcast_group(sock->sk,
545 (struct in_addr*)&mcast_addr.sin_addr,
546 ip_vs_backup_mcast_ifn) < 0) {
547 IP_VS_ERR("Error joining to the multicast group\n");
548 goto error;
549 }
550
551 return sock;
552
553 error:
554 sock_release(sock);
555 return NULL;
556}
557
558
559static int
560ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
561{
562 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
563 struct kvec iov;
564 int len;
565
566 EnterFunction(7);
567 iov.iov_base = (void *)buffer;
568 iov.iov_len = length;
569
570 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
571
572 LeaveFunction(7);
573 return len;
574}
575
576static void
577ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
578{
579 int msize;
580
581 msize = msg->size;
582
583 /* Put size in network byte order */
584 msg->size = htons(msg->size);
585
586 if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
587 IP_VS_ERR("ip_vs_send_async error\n");
588}
589
590static int
591ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
592{
593 struct msghdr msg = {NULL,};
594 struct kvec iov;
595 int len;
596
597 EnterFunction(7);
598
599 /* Receive a packet */
600 iov.iov_base = buffer;
601 iov.iov_len = (size_t)buflen;
602
603 len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
604
605 if (len < 0)
606 return -1;
607
608 LeaveFunction(7);
609 return len;
610}
611
612
613static DECLARE_WAIT_QUEUE_HEAD(sync_wait);
614static pid_t sync_master_pid = 0;
615static pid_t sync_backup_pid = 0;
616
617static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait);
618static int stop_master_sync = 0;
619static int stop_backup_sync = 0;
620
621static void sync_master_loop(void)
622{
623 struct socket *sock;
624 struct ip_vs_sync_buff *sb;
625
626 /* create the sending multicast socket */
627 sock = make_send_sock();
628 if (!sock)
629 return;
630
631 IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
632 "syncid = %d\n",
633 ip_vs_master_mcast_ifn, ip_vs_master_syncid);
634
635 for (;;) {
636 while ((sb=sb_dequeue())) {
637 ip_vs_send_sync_msg(sock, sb->mesg);
638 ip_vs_sync_buff_release(sb);
639 }
640
641 /* check if entries stay in curr_sb for 2 seconds */
642 if ((sb = get_curr_sync_buff(2*HZ))) {
643 ip_vs_send_sync_msg(sock, sb->mesg);
644 ip_vs_sync_buff_release(sb);
645 }
646
647 if (stop_master_sync)
648 break;
649
650 ssleep(1);
651 }
652
653 /* clean up the sync_buff queue */
654 while ((sb=sb_dequeue())) {
655 ip_vs_sync_buff_release(sb);
656 }
657
658 /* clean up the current sync_buff */
659 if ((sb = get_curr_sync_buff(0))) {
660 ip_vs_sync_buff_release(sb);
661 }
662
663 /* release the sending multicast socket */
664 sock_release(sock);
665}
666
667
668static void sync_backup_loop(void)
669{
670 struct socket *sock;
671 char *buf;
672 int len;
673
674 if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) {
675 IP_VS_ERR("sync_backup_loop: kmalloc error\n");
676 return;
677 }
678
679 /* create the receiving multicast socket */
680 sock = make_receive_sock();
681 if (!sock)
682 goto out;
683
684 IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
685 "syncid = %d\n",
686 ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
687
688 for (;;) {
689 /* do you have data now? */
690 while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) {
691 if ((len =
692 ip_vs_receive(sock, buf,
693 sync_recv_mesg_maxlen)) <= 0) {
694 IP_VS_ERR("receiving message error\n");
695 break;
696 }
697 /* disable bottom halves, because this code accesses data
698 shared with softirq context while getting/creating conns */
699 local_bh_disable();
700 ip_vs_process_message(buf, len);
701 local_bh_enable();
702 }
703
704 if (stop_backup_sync)
705 break;
706
707 ssleep(1);
708 }
709
710 /* release the sending multicast socket */
711 sock_release(sock);
712
713 out:
714 kfree(buf);
715}
716
717
718static void set_sync_pid(int sync_state, pid_t sync_pid)
719{
720 if (sync_state == IP_VS_STATE_MASTER)
721 sync_master_pid = sync_pid;
722 else if (sync_state == IP_VS_STATE_BACKUP)
723 sync_backup_pid = sync_pid;
724}
725
726static void set_stop_sync(int sync_state, int set)
727{
728 if (sync_state == IP_VS_STATE_MASTER)
729 stop_master_sync = set;
730 else if (sync_state == IP_VS_STATE_BACKUP)
731 stop_backup_sync = set;
732 else {
733 stop_master_sync = set;
734 stop_backup_sync = set;
735 }
736}
737
738static int sync_thread(void *startup)
739{
740 DECLARE_WAITQUEUE(wait, current);
741 mm_segment_t oldmm;
742 int state;
743 const char *name;
744
745 /* increase the module use count */
746 ip_vs_use_count_inc();
747
748 if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) {
749 state = IP_VS_STATE_MASTER;
750 name = "ipvs_syncmaster";
751 } else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) {
752 state = IP_VS_STATE_BACKUP;
753 name = "ipvs_syncbackup";
754 } else {
755 IP_VS_BUG();
756 ip_vs_use_count_dec();
757 return -EINVAL;
758 }
759
760 daemonize(name);
761
762 oldmm = get_fs();
763 set_fs(KERNEL_DS);
764
765 /* Block all signals */
766 spin_lock_irq(&current->sighand->siglock);
767 siginitsetinv(&current->blocked, 0);
768 recalc_sigpending();
769 spin_unlock_irq(&current->sighand->siglock);
770
771 /* set the maximum length of sync message */
772 set_sync_mesg_maxlen(state);
773
774 /* set up multicast address */
775 mcast_addr.sin_family = AF_INET;
776 mcast_addr.sin_port = htons(IP_VS_SYNC_PORT);
777 mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP);
778
779 add_wait_queue(&sync_wait, &wait);
780
781 set_sync_pid(state, current->pid);
782 complete((struct completion *)startup);
783
784 /* processing master/backup loop here */
785 if (state == IP_VS_STATE_MASTER)
786 sync_master_loop();
787 else if (state == IP_VS_STATE_BACKUP)
788 sync_backup_loop();
789 else IP_VS_BUG();
790
791 remove_wait_queue(&sync_wait, &wait);
792
793 /* thread exits */
794 set_sync_pid(state, 0);
795 IP_VS_INFO("sync thread stopped!\n");
796
797 set_fs(oldmm);
798
799 /* decrease the module use count */
800 ip_vs_use_count_dec();
801
802 set_stop_sync(state, 0);
803 wake_up(&stop_sync_wait);
804
805 return 0;
806}
807
808
809static int fork_sync_thread(void *startup)
810{
811 pid_t pid;
812
813 /* fork the sync thread here; after this thread exits, the
814 sync thread is reparented to the init process. */
815 repeat:
816 if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) {
817 IP_VS_ERR("could not create sync_thread due to %d... "
818 "retrying.\n", pid);
819 ssleep(1);
820 goto repeat;
821 }
822
823 return 0;
824}
825
826
827int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
828{
829 DECLARE_COMPLETION(startup);
830 pid_t pid;
831
832 if ((state == IP_VS_STATE_MASTER && sync_master_pid) ||
833 (state == IP_VS_STATE_BACKUP && sync_backup_pid))
834 return -EEXIST;
835
836 IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
837 IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %Zd bytes\n",
838 sizeof(struct ip_vs_sync_conn));
839
840 ip_vs_sync_state |= state;
841 if (state == IP_VS_STATE_MASTER) {
842 strcpy(ip_vs_master_mcast_ifn, mcast_ifn);
843 ip_vs_master_syncid = syncid;
844 } else {
845 strcpy(ip_vs_backup_mcast_ifn, mcast_ifn);
846 ip_vs_backup_syncid = syncid;
847 }
848
849 repeat:
850 if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0) {
851 IP_VS_ERR("could not create fork_sync_thread due to %d... "
852 "retrying.\n", pid);
853 ssleep(1);
854 goto repeat;
855 }
856
857 wait_for_completion(&startup);
858
859 return 0;
860}
861
862
863int stop_sync_thread(int state)
864{
865 DECLARE_WAITQUEUE(wait, current);
866
867 if ((state == IP_VS_STATE_MASTER && !sync_master_pid) ||
868 (state == IP_VS_STATE_BACKUP && !sync_backup_pid))
869 return -ESRCH;
870
871 IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
872 IP_VS_INFO("stopping sync thread %d ...\n",
873 (state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid);
874
875 __set_current_state(TASK_UNINTERRUPTIBLE);
876 add_wait_queue(&stop_sync_wait, &wait);
877 set_stop_sync(state, 1);
878 ip_vs_sync_state -= state;
879 wake_up(&sync_wait);
880 schedule();
881 __set_current_state(TASK_RUNNING);
882 remove_wait_queue(&stop_sync_wait, &wait);
883
884 /* Note: no need to reap the sync thread, because its parent
885 process is the init process */
886
887 if ((state == IP_VS_STATE_MASTER && stop_master_sync) ||
888 (state == IP_VS_STATE_BACKUP && stop_backup_sync))
889 IP_VS_BUG();
890
891 return 0;
892}
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c
new file mode 100644
index 000000000000..8a9d913261d8
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_wlc.c
@@ -0,0 +1,151 @@
1/*
2 * IPVS: Weighted Least-Connection Scheduling module
3 *
4 * Version: $Id: ip_vs_wlc.c,v 1.13 2003/04/18 09:03:16 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Peter Kese <peter.kese@ijs.si>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
16 * Wensong Zhang : changed to use the inactconns in scheduling
 17 * Wensong Zhang : changed some cosmetic things for debugging
18 * Wensong Zhang : changed for the d-linked destination list
19 * Wensong Zhang : added the ip_vs_wlc_update_svc
20 * Wensong Zhang : added any dest with weight=0 is quiesced
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/kernel.h>
26
27#include <net/ip_vs.h>
28
29
30static int
31ip_vs_wlc_init_svc(struct ip_vs_service *svc)
32{
33 return 0;
34}
35
36
37static int
38ip_vs_wlc_done_svc(struct ip_vs_service *svc)
39{
40 return 0;
41}
42
43
44static int
45ip_vs_wlc_update_svc(struct ip_vs_service *svc)
46{
47 return 0;
48}
49
50
51static inline unsigned int
52ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
53{
54 /*
 55 * We think the overhead of processing active connections is 256
 56 * times higher than that of inactive connections on average. (This
 57 * factor of 256 may not be accurate; we will change it later.) We
 58 * use the following formula to estimate the overhead now:
59 * dest->activeconns*256 + dest->inactconns
60 */
61 return (atomic_read(&dest->activeconns) << 8) +
62 atomic_read(&dest->inactconns);
63}
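/*
 * For example (with equal weights): a destination with 3 active and
 * 10 inactive connections gets an overhead of 3*256 + 10 = 778, while
 * one with 1 active and 600 inactive connections gets 1*256 + 600 =
 * 856, so the first is still preferred; inactive connections only
 * dominate the estimate in large numbers.
 */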
64
65
66/*
67 * Weighted Least Connection scheduling
68 */
69static struct ip_vs_dest *
70ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
71{
72 struct ip_vs_dest *dest, *least;
73 unsigned int loh, doh;
74
75 IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
76
77 /*
78 * We calculate the load of each dest server as follows:
79 * (dest overhead) / dest->weight
80 *
81 * Remember -- no floats in kernel mode!!!
82 * The comparison of h1*w2 > h2*w1 is equivalent to that of
83 * h1/w1 > h2/w2
84 * if every weight is larger than zero.
85 *
86 * The server with weight=0 is quiesced and will not receive any
87 * new connections.
88 */
89
90 list_for_each_entry(dest, &svc->destinations, n_list) {
91 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
92 atomic_read(&dest->weight) > 0) {
93 least = dest;
94 loh = ip_vs_wlc_dest_overhead(least);
95 goto nextstage;
96 }
97 }
98 return NULL;
99
100 /*
101 * Find the destination with the least load.
102 */
103 nextstage:
104 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
105 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
106 continue;
107 doh = ip_vs_wlc_dest_overhead(dest);
108 if (loh * atomic_read(&dest->weight) >
109 doh * atomic_read(&least->weight)) {
110 least = dest;
111 loh = doh;
112 }
113 }
114
115 IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u "
116 "activeconns %d refcnt %d weight %d overhead %d\n",
117 NIPQUAD(least->addr), ntohs(least->port),
118 atomic_read(&least->activeconns),
119 atomic_read(&least->refcnt),
120 atomic_read(&least->weight), loh);
121
122 return least;
123}
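/*
 * Sketch of the integer-only comparison above (hypothetical numbers):
 * testing loh/least->weight > doh/dest->weight as
 * loh*dest->weight > doh*least->weight, e.g. with loh = 512,
 * least->weight = 1 and doh = 900, dest->weight = 2 we get
 * 512*2 = 1024 > 900*1 = 900, so the candidate (450 overhead per unit
 * of weight) replaces the current choice (512 per unit of weight).
 */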
124
125
126static struct ip_vs_scheduler ip_vs_wlc_scheduler =
127{
128 .name = "wlc",
129 .refcnt = ATOMIC_INIT(0),
130 .module = THIS_MODULE,
131 .init_service = ip_vs_wlc_init_svc,
132 .done_service = ip_vs_wlc_done_svc,
133 .update_service = ip_vs_wlc_update_svc,
134 .schedule = ip_vs_wlc_schedule,
135};
136
137
138static int __init ip_vs_wlc_init(void)
139{
140 INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list);
141 return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
142}
143
144static void __exit ip_vs_wlc_cleanup(void)
145{
146 unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
147}
148
149module_init(ip_vs_wlc_init);
150module_exit(ip_vs_wlc_cleanup);
151MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c
new file mode 100644
index 000000000000..749fa044eca5
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_wrr.c
@@ -0,0 +1,235 @@
1/*
2 * IPVS: Weighted Round-Robin Scheduling module
3 *
4 * Version: $Id: ip_vs_wrr.c,v 1.12 2002/09/15 08:14:08 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
 15 * Wensong Zhang : changed some cosmetic things for debugging
16 * Wensong Zhang : changed for the d-linked destination list
17 * Wensong Zhang : added the ip_vs_wrr_update_svc
18 * Julian Anastasov : fixed the bug of returning destination
19 * with weight 0 when all weights are zero
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/kernel.h>
25
26#include <net/ip_vs.h>
27
28/*
29 * current destination pointer for weighted round-robin scheduling
30 */
31struct ip_vs_wrr_mark {
32 struct list_head *cl; /* current list head */
33 int cw; /* current weight */
34 int mw; /* maximum weight */
35 int di; /* decreasing interval */
36};
37
38
39/*
40 * Get the gcd of server weights
41 */
42static int gcd(int a, int b)
43{
44 int c;
45
46 while ((c = a % b)) {
47 a = b;
48 b = c;
49 }
50 return b;
51}
52
53static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
54{
55 struct ip_vs_dest *dest;
56 int weight;
57 int g = 0;
58
59 list_for_each_entry(dest, &svc->destinations, n_list) {
60 weight = atomic_read(&dest->weight);
61 if (weight > 0) {
62 if (g > 0)
63 g = gcd(weight, g);
64 else
65 g = weight;
66 }
67 }
68 return g ? g : 1;
69}
70
71
72/*
73 * Get the maximum weight of the service destinations.
74 */
75static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
76{
77 struct ip_vs_dest *dest;
78 int weight = 0;
79
80 list_for_each_entry(dest, &svc->destinations, n_list) {
81 if (atomic_read(&dest->weight) > weight)
82 weight = atomic_read(&dest->weight);
83 }
84
85 return weight;
86}
87
88
89static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
90{
91 struct ip_vs_wrr_mark *mark;
92
93 /*
94 * Allocate the mark variable for WRR scheduling
95 */
96 mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
97 if (mark == NULL) {
98 IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
99 return -ENOMEM;
100 }
101 mark->cl = &svc->destinations;
102 mark->cw = 0;
103 mark->mw = ip_vs_wrr_max_weight(svc);
104 mark->di = ip_vs_wrr_gcd_weight(svc);
105 svc->sched_data = mark;
106
107 return 0;
108}
109
110
111static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
112{
113 /*
114 * Release the mark variable
115 */
116 kfree(svc->sched_data);
117
118 return 0;
119}
120
121
122static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
123{
124 struct ip_vs_wrr_mark *mark = svc->sched_data;
125
126 mark->cl = &svc->destinations;
127 mark->mw = ip_vs_wrr_max_weight(svc);
128 mark->di = ip_vs_wrr_gcd_weight(svc);
129 if (mark->cw > mark->mw)
130 mark->cw = 0;
131 return 0;
132}
133
134
135/*
136 * Weighted Round-Robin Scheduling
137 */
138static struct ip_vs_dest *
139ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
140{
141 struct ip_vs_dest *dest;
142 struct ip_vs_wrr_mark *mark = svc->sched_data;
143 struct list_head *p;
144
145 IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
146
147 /*
148 * This loop will always terminate, because mark->cw is in (0, max_weight]
149 * and at least one server has its weight equal to max_weight.
150 */
151 write_lock(&svc->sched_lock);
152 p = mark->cl;
153 while (1) {
154 if (mark->cl == &svc->destinations) {
155 /* it is at the head of the destination list */
156
157 if (mark->cl == mark->cl->next) {
158 /* no dest entry */
159 dest = NULL;
160 goto out;
161 }
162
163 mark->cl = svc->destinations.next;
164 mark->cw -= mark->di;
165 if (mark->cw <= 0) {
166 mark->cw = mark->mw;
167 /*
168 * Still zero, which means no available servers.
169 */
170 if (mark->cw == 0) {
171 mark->cl = &svc->destinations;
172 IP_VS_INFO("ip_vs_wrr_schedule(): "
173 "no available servers\n");
174 dest = NULL;
175 goto out;
176 }
177 }
178 } else
179 mark->cl = mark->cl->next;
180
181 if (mark->cl != &svc->destinations) {
182 /* not at the head of the list */
183 dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
184 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
185 atomic_read(&dest->weight) >= mark->cw) {
186 /* got it */
187 break;
188 }
189 }
190
191 if (mark->cl == p && mark->cw == mark->di) {
192 /* back to the start, and no dest is found.
193 It is only possible when all dests are OVERLOADED */
194 dest = NULL;
195 goto out;
196 }
197 }
198
199 IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u "
200 "activeconns %d refcnt %d weight %d\n",
201 NIPQUAD(dest->addr), ntohs(dest->port),
202 atomic_read(&dest->activeconns),
203 atomic_read(&dest->refcnt),
204 atomic_read(&dest->weight));
205
206 out:
207 write_unlock(&svc->sched_lock);
208 return dest;
209}
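/*
 * A worked trace (illustration only), assuming three destinations in
 * list order A, B, C with weights 4, 3, 2: then mw = 4, di = gcd = 1,
 * and successive calls pick A A B A B C A B C before the 9-slot cycle
 * repeats, i.e. each server gets a share proportional to its weight
 * and the heavier servers are served earlier within each cycle.
 */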
210
211
212static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
213 .name = "wrr",
214 .refcnt = ATOMIC_INIT(0),
215 .module = THIS_MODULE,
216 .init_service = ip_vs_wrr_init_svc,
217 .done_service = ip_vs_wrr_done_svc,
218 .update_service = ip_vs_wrr_update_svc,
219 .schedule = ip_vs_wrr_schedule,
220};
221
222static int __init ip_vs_wrr_init(void)
223{
224 INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list);
225 return register_ip_vs_scheduler(&ip_vs_wrr_scheduler);
226}
227
228static void __exit ip_vs_wrr_cleanup(void)
229{
230 unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
231}
232
233module_init(ip_vs_wrr_init);
234module_exit(ip_vs_wrr_cleanup);
235MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
new file mode 100644
index 000000000000..faa6176bbeb1
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -0,0 +1,563 @@
1/*
2 * ip_vs_xmit.c: various packet transmitters for IPVS
3 *
4 * Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $
5 *
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Julian Anastasov <ja@ssi.bg>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/ip.h>
20#include <linux/tcp.h> /* for tcphdr */
21#include <net/tcp.h> /* for csum_tcpudp_magic */
22#include <net/udp.h>
23#include <net/icmp.h> /* for icmp_send */
24#include <net/route.h> /* for ip_route_output */
25#include <linux/netfilter.h>
26#include <linux/netfilter_ipv4.h>
27
28#include <net/ip_vs.h>
29
30
31/*
32 * Destination cache to speed up outgoing route lookup
33 */
34static inline void
35__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
36{
37 struct dst_entry *old_dst;
38
39 old_dst = dest->dst_cache;
40 dest->dst_cache = dst;
41 dest->dst_rtos = rtos;
42 dst_release(old_dst);
43}
44
45static inline struct dst_entry *
46__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
47{
48 struct dst_entry *dst = dest->dst_cache;
49
50 if (!dst)
51 return NULL;
52 if ((dst->obsolete || rtos != dest->dst_rtos) &&
53 dst->ops->check(dst, cookie) == NULL) {
54 dest->dst_cache = NULL;
55 dst_release(dst);
56 return NULL;
57 }
58 dst_hold(dst);
59 return dst;
60}
61
62static inline struct rtable *
63__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
64{
65 struct rtable *rt; /* Route to the other host */
66 struct ip_vs_dest *dest = cp->dest;
67
68 if (dest) {
69 spin_lock(&dest->dst_lock);
70 if (!(rt = (struct rtable *)
71 __ip_vs_dst_check(dest, rtos, 0))) {
72 struct flowi fl = {
73 .oif = 0,
74 .nl_u = {
75 .ip4_u = {
76 .daddr = dest->addr,
77 .saddr = 0,
78 .tos = rtos, } },
79 };
80
81 if (ip_route_output_key(&rt, &fl)) {
82 spin_unlock(&dest->dst_lock);
83 IP_VS_DBG_RL("ip_route_output error, "
84 "dest: %u.%u.%u.%u\n",
85 NIPQUAD(dest->addr));
86 return NULL;
87 }
88 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
89 IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
90 NIPQUAD(dest->addr),
91 atomic_read(&rt->u.dst.__refcnt), rtos);
92 }
93 spin_unlock(&dest->dst_lock);
94 } else {
95 struct flowi fl = {
96 .oif = 0,
97 .nl_u = {
98 .ip4_u = {
99 .daddr = cp->daddr,
100 .saddr = 0,
101 .tos = rtos, } },
102 };
103
104 if (ip_route_output_key(&rt, &fl)) {
105 IP_VS_DBG_RL("ip_route_output error, dest: "
106 "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
107 return NULL;
108 }
109 }
110
111 return rt;
112}
113
114
115/*
116 * Release dest->dst_cache before a dest is removed
117 */
118void
119ip_vs_dst_reset(struct ip_vs_dest *dest)
120{
121 struct dst_entry *old_dst;
122
123 old_dst = dest->dst_cache;
124 dest->dst_cache = NULL;
125 dst_release(old_dst);
126}
127
128#define IP_VS_XMIT(skb, rt) \
129do { \
130 nf_reset_debug(skb); \
131 (skb)->nfcache |= NFC_IPVS_PROPERTY; \
132 (skb)->ip_summed = CHECKSUM_NONE; \
133 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
134 (rt)->u.dst.dev, dst_output); \
135} while (0)
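/*
 * The macro tags the skb with NFC_IPVS_PROPERTY so that the IPVS
 * netfilter hooks can recognize packets already handled by IPVS,
 * clears any stale checksum state and then reinjects the packet at
 * the LOCAL_OUT hook, letting dst_output() send it via the route
 * attached just before.
 */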
136
137
138/*
139 * NULL transmitter (do nothing except return NF_ACCEPT)
140 */
141int
142ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
143 struct ip_vs_protocol *pp)
144{
145 /* we do not touch skb and do not need pskb ptr */
146 return NF_ACCEPT;
147}
148
149
150/*
151 * Bypass transmitter
152 * Let packets bypass the destination when the destination is not
153 * available; it may only be used in a transparent cache cluster.
154 */
155int
156ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
157 struct ip_vs_protocol *pp)
158{
159 struct rtable *rt; /* Route to the other host */
160 struct iphdr *iph = skb->nh.iph;
161 u8 tos = iph->tos;
162 int mtu;
163 struct flowi fl = {
164 .oif = 0,
165 .nl_u = {
166 .ip4_u = {
167 .daddr = iph->daddr,
168 .saddr = 0,
169 .tos = RT_TOS(tos), } },
170 };
171
172 EnterFunction(10);
173
174 if (ip_route_output_key(&rt, &fl)) {
175 IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
176 "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
177 goto tx_error_icmp;
178 }
179
180 /* MTU checking */
181 mtu = dst_mtu(&rt->u.dst);
182 if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
183 ip_rt_put(rt);
184 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
185 IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
186 goto tx_error;
187 }
188
189 /*
190 * Call ip_send_check because we are not sure it is called
191 * after ip_defrag. Is copy-on-write needed?
192 */
193 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
194 ip_rt_put(rt);
195 return NF_STOLEN;
196 }
197 ip_send_check(skb->nh.iph);
198
199 /* drop old route */
200 dst_release(skb->dst);
201 skb->dst = &rt->u.dst;
202
203 /* Another hack: avoid icmp_send in ip_fragment */
204 skb->local_df = 1;
205
206 IP_VS_XMIT(skb, rt);
207
208 LeaveFunction(10);
209 return NF_STOLEN;
210
211 tx_error_icmp:
212 dst_link_failure(skb);
213 tx_error:
214 kfree_skb(skb);
215 LeaveFunction(10);
216 return NF_STOLEN;
217}
218
219
220/*
221 * NAT transmitter (only for outside-to-inside nat forwarding)
222 * Not used for related ICMP
223 */
224int
225ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
226 struct ip_vs_protocol *pp)
227{
228 struct rtable *rt; /* Route to the other host */
229 int mtu;
230 struct iphdr *iph = skb->nh.iph;
231
232 EnterFunction(10);
233
234 /* check if it is a connection of no-client-port */
235 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
236 __u16 _pt, *p;
237 p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
238 if (p == NULL)
239 goto tx_error;
240 ip_vs_conn_fill_cport(cp, *p);
241 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
242 }
243
244 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
245 goto tx_error_icmp;
246
247 /* MTU checking */
248 mtu = dst_mtu(&rt->u.dst);
249 if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
250 ip_rt_put(rt);
251 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
252 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
253 goto tx_error;
254 }
255
256 /* copy-on-write the packet before mangling it */
257 if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr)))
258 goto tx_error_put;
259
260 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
261 goto tx_error_put;
262
263 /* drop old route */
264 dst_release(skb->dst);
265 skb->dst = &rt->u.dst;
266
267 /* mangle the packet */
268 if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp))
269 goto tx_error;
270 skb->nh.iph->daddr = cp->daddr;
271 ip_send_check(skb->nh.iph);
272
273 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
274
275 /* FIXME: when the application helper enlarges the packet and the length
276 is larger than the MTU of the outgoing device, there will still
277 be an MTU problem. */
278
279 /* Another hack: avoid icmp_send in ip_fragment */
280 skb->local_df = 1;
281
282 IP_VS_XMIT(skb, rt);
283
284 LeaveFunction(10);
285 return NF_STOLEN;
286
287 tx_error_icmp:
288 dst_link_failure(skb);
289 tx_error:
290 LeaveFunction(10);
291 kfree_skb(skb);
292 return NF_STOLEN;
293 tx_error_put:
294 ip_rt_put(rt);
295 goto tx_error;
296}
297
298
299/*
300 * IP Tunneling transmitter
301 *
302 * This function encapsulates the packet in a new IP packet whose
303 * destination will be set to cp->daddr. Most of the code in this
304 * function is taken from ipip.c.
305 *
306 * It is used in VS/TUN cluster. The load balancer selects a real
307 * server from a cluster based on a scheduling algorithm,
308 * encapsulates the request packet and forwards it to the selected
309 * server. For example, all real servers are configured with
310 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
311 * the encapsulated packet, it will decapsulate the packet, process
312 * the request and return the response packets directly to the client
313 * without passing through the load balancer. This can greatly increase
314 * the scalability of the virtual server.
315 *
316 * Used for ANY protocol
317 */
318int
319ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
320 struct ip_vs_protocol *pp)
321{
322 struct rtable *rt; /* Route to the other host */
323 struct net_device *tdev; /* Device to other host */
324 struct iphdr *old_iph = skb->nh.iph;
325 u8 tos = old_iph->tos;
326 u16 df = old_iph->frag_off;
327 struct iphdr *iph; /* Our new IP header */
328 int max_headroom; /* The extra header space needed */
329 int mtu;
330
331 EnterFunction(10);
332
333 if (skb->protocol != __constant_htons(ETH_P_IP)) {
334 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
335 "ETH_P_IP: %d, skb protocol: %d\n",
336 __constant_htons(ETH_P_IP), skb->protocol);
337 goto tx_error;
338 }
339
340 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
341 goto tx_error_icmp;
342
343 tdev = rt->u.dst.dev;
344
345 mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
346 if (mtu < 68) {
347 ip_rt_put(rt);
348 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
349 goto tx_error;
350 }
351 if (skb->dst)
352 skb->dst->ops->update_pmtu(skb->dst, mtu);
353
354 df |= (old_iph->frag_off&__constant_htons(IP_DF));
355
356 if ((old_iph->frag_off&__constant_htons(IP_DF))
357 && mtu < ntohs(old_iph->tot_len)) {
358 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
359 ip_rt_put(rt);
360 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
361 goto tx_error;
362 }
363
364 /*
365 * Okay, now see if we can stuff it in the buffer as-is.
366 */
367 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
368
369 if (skb_headroom(skb) < max_headroom
370 || skb_cloned(skb) || skb_shared(skb)) {
371 struct sk_buff *new_skb =
372 skb_realloc_headroom(skb, max_headroom);
373 if (!new_skb) {
374 ip_rt_put(rt);
375 kfree_skb(skb);
376 IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
377 return NF_STOLEN;
378 }
379 kfree_skb(skb);
380 skb = new_skb;
381 old_iph = skb->nh.iph;
382 }
383
384 skb->h.raw = (void *) old_iph;
385
386 /* fix old IP header checksum */
387 ip_send_check(old_iph);
388
389 skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
390 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
391
392 /* drop old route */
393 dst_release(skb->dst);
394 skb->dst = &rt->u.dst;
395
396 /*
397 * Push down and install the IPIP header.
398 */
399 iph = skb->nh.iph;
400 iph->version = 4;
401 iph->ihl = sizeof(struct iphdr)>>2;
402 iph->frag_off = df;
403 iph->protocol = IPPROTO_IPIP;
404 iph->tos = tos;
405 iph->daddr = rt->rt_dst;
406 iph->saddr = rt->rt_src;
407 iph->ttl = old_iph->ttl;
408 iph->tot_len = htons(skb->len);
409 ip_select_ident(iph, &rt->u.dst, NULL);
410 ip_send_check(iph);
411
412 /* Another hack: avoid icmp_send in ip_fragment */
413 skb->local_df = 1;
414
415 IP_VS_XMIT(skb, rt);
416
417 LeaveFunction(10);
418
419 return NF_STOLEN;
420
421 tx_error_icmp:
422 dst_link_failure(skb);
423 tx_error:
424 kfree_skb(skb);
425 LeaveFunction(10);
426 return NF_STOLEN;
427}
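/*
 * Sketch of the VS/TUN packet built above:
 *
 *   [ outer iphdr: saddr = rt->rt_src, daddr = rt->rt_dst,
 *     protocol = IPPROTO_IPIP, tot_len = skb->len ]
 *   [ original iphdr (checksum refreshed) + payload ]
 *
 * Encapsulation costs sizeof(struct iphdr) of path MTU, which is why
 * mtu is reduced by that amount before the DF/fragmentation check.
 */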
428
429
430/*
431 * Direct Routing transmitter
432 * Used for ANY protocol
433 */
434int
435ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
436 struct ip_vs_protocol *pp)
437{
438 struct rtable *rt; /* Route to the other host */
439 struct iphdr *iph = skb->nh.iph;
440 int mtu;
441
442 EnterFunction(10);
443
444 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
445 goto tx_error_icmp;
446
447 /* MTU checking */
448 mtu = dst_mtu(&rt->u.dst);
449 if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
450 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
451 ip_rt_put(rt);
452 IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
453 goto tx_error;
454 }
455
456 /*
457 * Call ip_send_check because we are not sure it is called
458 * after ip_defrag. Is copy-on-write needed?
459 */
460 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
461 ip_rt_put(rt);
462 return NF_STOLEN;
463 }
464 ip_send_check(skb->nh.iph);
465
466 /* drop old route */
467 dst_release(skb->dst);
468 skb->dst = &rt->u.dst;
469
470 /* Another hack: avoid icmp_send in ip_fragment */
471 skb->local_df = 1;
472
473 IP_VS_XMIT(skb, rt);
474
475 LeaveFunction(10);
476 return NF_STOLEN;
477
478 tx_error_icmp:
479 dst_link_failure(skb);
480 tx_error:
481 kfree_skb(skb);
482 LeaveFunction(10);
483 return NF_STOLEN;
484}
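/*
 * Note on VS/DR above: the IP header is forwarded unmodified (only its
 * checksum is refreshed by ip_send_check()); the packet is simply
 * re-routed towards the chosen real server, which is commonly set up to
 * accept traffic for the virtual IP on a non-ARPing local interface.
 */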
485
486
487/*
488 * ICMP packet transmitter
489 * called by the ip_vs_in_icmp
490 */
491int
492ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
493 struct ip_vs_protocol *pp, int offset)
494{
495 struct rtable *rt; /* Route to the other host */
496 int mtu;
497 int rc;
498
499 EnterFunction(10);
500
501 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
502 forwarded directly here, because there is no need to
503 translate address/port back */
504 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
505 if (cp->packet_xmit)
506 rc = cp->packet_xmit(skb, cp, pp);
507 else
508 rc = NF_ACCEPT;
509 /* do not touch skb anymore */
510 atomic_inc(&cp->in_pkts);
511 __ip_vs_conn_put(cp);
512 goto out;
513 }
514
515 /*
516 * mangle and send the packet here (only for VS/NAT)
517 */
518
519 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos))))
520 goto tx_error_icmp;
521
522 /* MTU checking */
523 mtu = dst_mtu(&rt->u.dst);
524 if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) {
525 ip_rt_put(rt);
526 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
527 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
528 goto tx_error;
529 }
530
531 /* copy-on-write the packet before mangling it */
532 if (!ip_vs_make_skb_writable(&skb, offset))
533 goto tx_error_put;
534
535 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
536 goto tx_error_put;
537
538 /* drop the old route when skb is not shared */
539 dst_release(skb->dst);
540 skb->dst = &rt->u.dst;
541
542 ip_vs_nat_icmp(skb, pp, cp, 0);
543
544 /* Another hack: avoid icmp_send in ip_fragment */
545 skb->local_df = 1;
546
547 IP_VS_XMIT(skb, rt);
548
549 rc = NF_STOLEN;
550 goto out;
551
552 tx_error_icmp:
553 dst_link_failure(skb);
554 tx_error:
555 dev_kfree_skb(skb);
556 rc = NF_STOLEN;
557 out:
558 LeaveFunction(10);
559 return rc;
560 tx_error_put:
561 ip_rt_put(rt);
562 goto tx_error;
563}
diff --git a/net/ipv4/multipath.c b/net/ipv4/multipath.c
new file mode 100644
index 000000000000..4e9ca7c76407
--- /dev/null
+++ b/net/ipv4/multipath.c
@@ -0,0 +1,55 @@
1/* multipath.c: IPV4 multipath algorithm support.
2 *
3 * Copyright (C) 2004, 2005 Einar Lueck <elueck@de.ibm.com>
4 * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
5 */
6
7#include <linux/module.h>
8#include <linux/errno.h>
9#include <linux/netdevice.h>
10#include <linux/spinlock.h>
11
12#include <net/ip_mp_alg.h>
13
14static DEFINE_SPINLOCK(alg_table_lock);
15struct ip_mp_alg_ops *ip_mp_alg_table[IP_MP_ALG_MAX + 1];
16
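/*
 * Register @ops in slot @n of ip_mp_alg_table.  Returns -EINVAL for an
 * out-of-range slot or a missing mp_alg_select_route hook, -EBUSY if
 * another algorithm already occupies the slot, 0 on success.
 */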
17int multipath_alg_register(struct ip_mp_alg_ops *ops, enum ip_mp_alg n)
18{
19 struct ip_mp_alg_ops **slot;
20 int err;
21
22 if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX ||
23 !ops->mp_alg_select_route)
24 return -EINVAL;
25
26 spin_lock(&alg_table_lock);
27 slot = &ip_mp_alg_table[n];
28 if (*slot != NULL) {
29 err = -EBUSY;
30 } else {
31 *slot = ops;
32 err = 0;
33 }
34 spin_unlock(&alg_table_lock);
35
36 return err;
37}
38EXPORT_SYMBOL(multipath_alg_register);
39
40void multipath_alg_unregister(struct ip_mp_alg_ops *ops, enum ip_mp_alg n)
41{
42 struct ip_mp_alg_ops **slot;
43
44 if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX)
45 return;
46
47 spin_lock(&alg_table_lock);
48 slot = &ip_mp_alg_table[n];
49 if (*slot == ops)
50 *slot = NULL;
51 spin_unlock(&alg_table_lock);
52
53 synchronize_net();
54}
55EXPORT_SYMBOL(multipath_alg_unregister);
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
new file mode 100644
index 000000000000..9349686131fc
--- /dev/null
+++ b/net/ipv4/multipath_drr.c
@@ -0,0 +1,265 @@
1/*
2 * Device round robin policy for multipath.
3 *
4 *
5 * Version: $Id: multipath_drr.c,v 1.1.2.1 2004/09/16 07:42:34 elueck Exp $
6 *
7 * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 */
14
15#include <linux/config.h>
16#include <asm/system.h>
17#include <asm/uaccess.h>
18#include <linux/types.h>
19#include <linux/sched.h>
20#include <linux/errno.h>
21#include <linux/timer.h>
22#include <linux/mm.h>
23#include <linux/kernel.h>
24#include <linux/fcntl.h>
25#include <linux/stat.h>
26#include <linux/socket.h>
27#include <linux/in.h>
28#include <linux/inet.h>
29#include <linux/netdevice.h>
30#include <linux/inetdevice.h>
31#include <linux/igmp.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34#include <linux/mroute.h>
35#include <linux/init.h>
36#include <net/ip.h>
37#include <net/protocol.h>
38#include <linux/skbuff.h>
39#include <net/sock.h>
40#include <net/icmp.h>
41#include <net/udp.h>
42#include <net/raw.h>
43#include <linux/notifier.h>
44#include <linux/if_arp.h>
45#include <linux/netfilter_ipv4.h>
46#include <net/ipip.h>
47#include <net/checksum.h>
48#include <net/ip_mp_alg.h>
49
50struct multipath_device {
51 int ifi; /* interface index of device */
52 atomic_t usecount;
53 int allocated;
54};
55
56#define MULTIPATH_MAX_DEVICECANDIDATES 10
57
58static struct multipath_device state[MULTIPATH_MAX_DEVICECANDIDATES];
59static DEFINE_SPINLOCK(state_lock);
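/*
 * Route handed out by the previous selection; reused when a flow sets
 * FLOWI_FLAG_MULTIPATHOLDROUTE and cleared again in drr_remove().
 * Note that it is read and written without taking state_lock.
 */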
60static struct rtable *last_selection = NULL;
61
62static int inline __multipath_findslot(void)
63{
64 int i;
65
66 for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) {
67 if (state[i].allocated == 0)
68 return i;
69 }
70 return -1;
71}
72
73static int inline __multipath_finddev(int ifindex)
74{
75 int i;
76
77 for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) {
78 if (state[i].allocated != 0 &&
79 state[i].ifi == ifindex)
80 return i;
81 }
82 return -1;
83}
84
85static int drr_dev_event(struct notifier_block *this,
86 unsigned long event, void *ptr)
87{
88 struct net_device *dev = ptr;
89 int devidx;
90
91 switch (event) {
92 case NETDEV_UNREGISTER:
93 case NETDEV_DOWN:
94 spin_lock_bh(&state_lock);
95
96 devidx = __multipath_finddev(dev->ifindex);
97 if (devidx != -1) {
98 state[devidx].allocated = 0;
99 state[devidx].ifi = 0;
100 atomic_set(&state[devidx].usecount, 0);
101 }
102
103 spin_unlock_bh(&state_lock);
104 break;
105 };
106
107 return NOTIFY_DONE;
108}
109
110struct notifier_block drr_dev_notifier = {
111 .notifier_call = drr_dev_event,
112};
113
114static void drr_remove(struct rtable *rt)
115{
116 if (last_selection == rt)
117 last_selection = NULL;
118}
119
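/*
 * Bump a device's use counter; if the value has overflowed (reads as
 * non-positive), reset every counter so the minimum-use comparison in
 * drr_select_route() stays meaningful.
 */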
120static void drr_safe_inc(atomic_t *usecount)
121{
122 int n;
123
124 atomic_inc(usecount);
125
126 n = atomic_read(usecount);
127 if (n <= 0) {
128 int i;
129
130 spin_lock_bh(&state_lock);
131
132 for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++)
133 atomic_set(&state[i].usecount, 0);
134
135 spin_unlock_bh(&state_lock);
136 }
137}
138
139static void drr_select_route(const struct flowi *flp,
140 struct rtable *first, struct rtable **rp)
141{
142 struct rtable *nh, *result, *cur_min;
143 int min_usecount = -1;
144 int devidx = -1;
145 int cur_min_devidx = -1;
146
147 /* if necessary and possible utilize the old alternative */
148 if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 &&
149 last_selection != NULL) {
150 result = last_selection;
151 *rp = result;
152 return;
153 }
154
155 /* 1. make sure all alt. nexthops have the same GC related data */
156 /* 2. determine the new candidate to be returned */
157 result = NULL;
158 cur_min = NULL;
159 for (nh = rcu_dereference(first); nh;
160 nh = rcu_dereference(nh->u.rt_next)) {
161 if ((nh->u.dst.flags & DST_BALANCED) != 0 &&
162 multipath_comparekeys(&nh->fl, flp)) {
163 int nh_ifidx = nh->u.dst.dev->ifindex;
164
165 nh->u.dst.lastuse = jiffies;
166 nh->u.dst.__use++;
167 if (result != NULL)
168 continue;
169
170 /* search for the output interface */
171
172 /* this is not SMP safe, only add/remove are
173 * SMP safe as wrong usecount updates have no big
174 * impact
175 */
176 devidx = __multipath_finddev(nh_ifidx);
177 if (devidx == -1) {
178 /* add the interface to the array
179 * SMP safe
180 */
181 spin_lock_bh(&state_lock);
182
183 /* due to SMP: search again */
184 devidx = __multipath_finddev(nh_ifidx);
185 if (devidx == -1) {
186 /* add entry for device */
187 devidx = __multipath_findslot();
188 if (devidx == -1) {
189 /* unlikely but possible */
190 continue;
191 }
192
193 state[devidx].allocated = 1;
194 state[devidx].ifi = nh_ifidx;
195 atomic_set(&state[devidx].usecount, 0);
196 min_usecount = 0;
197 }
198
199 spin_unlock_bh(&state_lock);
200 }
201
202 if (min_usecount == 0) {
203 /* if the device has not been used it is
204 * the primary target
205 */
206 drr_safe_inc(&state[devidx].usecount);
207 result = nh;
208 } else {
209 int count =
210 atomic_read(&state[devidx].usecount);
211
212 if (min_usecount == -1 ||
213 count < min_usecount) {
214 cur_min = nh;
215 cur_min_devidx = devidx;
216 min_usecount = count;
217 }
218 }
219 }
220 }
221
222 if (!result) {
223 if (cur_min) {
224 drr_safe_inc(&state[cur_min_devidx].usecount);
225 result = cur_min;
226 } else {
227 result = first;
228 }
229 }
230
231 *rp = result;
232 last_selection = result;
233}
234
235static struct ip_mp_alg_ops drr_ops = {
236 .mp_alg_select_route = drr_select_route,
237 .mp_alg_remove = drr_remove,
238};
239
240static int __init drr_init(void)
241{
242 int err = register_netdevice_notifier(&drr_dev_notifier);
243
244 if (err)
245 return err;
246
247 err = multipath_alg_register(&drr_ops, IP_MP_ALG_RR);
248 if (err)
249 goto fail;
250
251 return 0;
252
253fail:
254 unregister_netdevice_notifier(&drr_dev_notifier);
255 return err;
256}
257
258static void __exit drr_exit(void)
259{
260 unregister_netdevice_notifier(&drr_dev_notifier);
261 multipath_alg_unregister(&drr_ops, IP_MP_ALG_DRR);
262}
263
264module_init(drr_init);
265module_exit(drr_exit);
diff --git a/net/ipv4/multipath_random.c b/net/ipv4/multipath_random.c
new file mode 100644
index 000000000000..805a16e47de5
--- /dev/null
+++ b/net/ipv4/multipath_random.c
@@ -0,0 +1,128 @@
1/*
2 * Random policy for multipath.
3 *
4 *
5 * Version: $Id: multipath_random.c,v 1.1.2.3 2004/09/21 08:42:11 elueck Exp $
6 *
7 * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 */
14
15#include <linux/config.h>
16#include <asm/system.h>
17#include <asm/uaccess.h>
18#include <linux/types.h>
19#include <linux/sched.h>
20#include <linux/errno.h>
21#include <linux/timer.h>
22#include <linux/mm.h>
23#include <linux/kernel.h>
24#include <linux/fcntl.h>
25#include <linux/stat.h>
26#include <linux/socket.h>
27#include <linux/in.h>
28#include <linux/inet.h>
29#include <linux/netdevice.h>
30#include <linux/inetdevice.h>
31#include <linux/igmp.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34#include <linux/mroute.h>
35#include <linux/init.h>
36#include <net/ip.h>
37#include <net/protocol.h>
38#include <linux/skbuff.h>
39#include <net/sock.h>
40#include <net/icmp.h>
41#include <net/udp.h>
42#include <net/raw.h>
43#include <linux/notifier.h>
44#include <linux/if_arp.h>
45#include <linux/netfilter_ipv4.h>
46#include <net/ipip.h>
47#include <net/checksum.h>
48#include <net/ip_mp_alg.h>
49
50#define MULTIPATH_MAX_CANDIDATES 40
51
52/* interface to random number generation */
53static unsigned int RANDOM_SEED = 93186752;
54
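/*
 * random() below is a small multiplicative-congruential generator: the
 * constants satisfy a*q + r = 2^32 - 5, so the update roughly computes
 * seed = a * seed mod (2^32 - 5) (a Schrage-style split, ignoring 32-bit
 * overflow).  It starts from a fixed seed and only aims to spread route
 * choices, not to be unpredictable.
 */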
55static inline unsigned int random(unsigned int ubound)
56{
57 static unsigned int a = 1588635695,
58 q = 2,
59 r = 1117695901;
60
61 RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q);
62
63 return RANDOM_SEED % ubound;
64}
65
66
67static void random_select_route(const struct flowi *flp,
68 struct rtable *first,
69 struct rtable **rp)
70{
71 struct rtable *rt;
72 struct rtable *decision;
73 unsigned char candidate_count = 0;
74
75 /* count all candidate */
76 for (rt = rcu_dereference(first); rt;
77 rt = rcu_dereference(rt->u.rt_next)) {
78 if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
79 multipath_comparekeys(&rt->fl, flp))
80 ++candidate_count;
81 }
82
83 /* choose a random candidate */
84 decision = first;
85 if (candidate_count > 1) {
86 unsigned char i = 0;
87 unsigned char candidate_no = (unsigned char)
88 random(candidate_count);
89
90 /* find chosen candidate and adjust GC data for all candidates
91 * to ensure they stay in cache
92 */
93 for (rt = first; rt; rt = rt->u.rt_next) {
94 if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
95 multipath_comparekeys(&rt->fl, flp)) {
96 rt->u.dst.lastuse = jiffies;
97
98 if (i == candidate_no)
99 decision = rt;
100
101 if (i >= candidate_count)
102 break;
103
104 i++;
105 }
106 }
107 }
108
109 decision->u.dst.__use++;
110 *rp = decision;
111}
112
113static struct ip_mp_alg_ops random_ops = {
114 .mp_alg_select_route = random_select_route,
115};
116
117static int __init random_init(void)
118{
119 return multipath_alg_register(&random_ops, IP_MP_ALG_RANDOM);
120}
121
122static void __exit random_exit(void)
123{
124 multipath_alg_unregister(&random_ops, IP_MP_ALG_RANDOM);
125}
126
127module_init(random_init);
128module_exit(random_exit);
diff --git a/net/ipv4/multipath_rr.c b/net/ipv4/multipath_rr.c
new file mode 100644
index 000000000000..554a82568160
--- /dev/null
+++ b/net/ipv4/multipath_rr.c
@@ -0,0 +1,115 @@
1/*
2 * Round robin policy for multipath.
3 *
4 *
5 * Version: $Id: multipath_rr.c,v 1.1.2.2 2004/09/16 07:42:34 elueck Exp $
6 *
7 * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 */
14
15#include <linux/config.h>
16#include <asm/system.h>
17#include <asm/uaccess.h>
18#include <linux/types.h>
19#include <linux/sched.h>
20#include <linux/errno.h>
21#include <linux/timer.h>
22#include <linux/mm.h>
23#include <linux/kernel.h>
24#include <linux/fcntl.h>
25#include <linux/stat.h>
26#include <linux/socket.h>
27#include <linux/in.h>
28#include <linux/inet.h>
29#include <linux/netdevice.h>
30#include <linux/inetdevice.h>
31#include <linux/igmp.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34#include <linux/mroute.h>
35#include <linux/init.h>
36#include <net/ip.h>
37#include <net/protocol.h>
38#include <linux/skbuff.h>
39#include <net/sock.h>
40#include <net/icmp.h>
41#include <net/udp.h>
42#include <net/raw.h>
43#include <linux/notifier.h>
44#include <linux/if_arp.h>
45#include <linux/netfilter_ipv4.h>
46#include <net/ipip.h>
47#include <net/checksum.h>
48#include <net/ip_mp_alg.h>
49
50#define MULTIPATH_MAX_CANDIDATES 40
51
52static struct rtable* last_used = NULL;
53
54static void rr_remove(struct rtable *rt)
55{
56 if (last_used == rt)
57 last_used = NULL;
58}
59
60static void rr_select_route(const struct flowi *flp,
61 struct rtable *first, struct rtable **rp)
62{
63 struct rtable *nh, *result, *min_use_cand = NULL;
64 int min_use = -1;
65
66 /* if necessary and possible utilize the old alternative */
67 if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 &&
68 last_used != NULL) {
69 result = last_used;
70 goto out;
71 }
72
73 /* 1. make sure all alt. nexthops have the same GC related data
74 * 2. determine the new candidate to be returned
75 */
76 result = NULL;
77 for (nh = rcu_dereference(first); nh;
78 nh = rcu_dereference(nh->u.rt_next)) {
79 if ((nh->u.dst.flags & DST_BALANCED) != 0 &&
80 multipath_comparekeys(&nh->fl, flp)) {
81 nh->u.dst.lastuse = jiffies;
82
83 if (min_use == -1 || nh->u.dst.__use < min_use) {
84 min_use = nh->u.dst.__use;
85 min_use_cand = nh;
86 }
87 }
88 }
89 result = min_use_cand;
90 if (!result)
91 result = first;
92
93out:
94 last_used = result;
95 result->u.dst.__use++;
96 *rp = result;
97}
98
99static struct ip_mp_alg_ops rr_ops = {
100 .mp_alg_select_route = rr_select_route,
101 .mp_alg_remove = rr_remove,
102};
103
104static int __init rr_init(void)
105{
106 return multipath_alg_register(&rr_ops, IP_MP_ALG_RR);
107}
108
109static void __exit rr_exit(void)
110{
111 multipath_alg_unregister(&rr_ops, IP_MP_ALG_RR);
112}
113
114module_init(rr_init);
115module_exit(rr_exit);
diff --git a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c
new file mode 100644
index 000000000000..10b23e1bece6
--- /dev/null
+++ b/net/ipv4/multipath_wrandom.c
@@ -0,0 +1,344 @@
1/*
2 * Weighted random policy for multipath.
3 *
4 *
5 * Version: $Id: multipath_wrandom.c,v 1.1.2.3 2004/09/22 07:51:40 elueck Exp $
6 *
7 * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 */
14
15#include <linux/config.h>
16#include <asm/system.h>
17#include <asm/uaccess.h>
18#include <linux/types.h>
19#include <linux/sched.h>
20#include <linux/errno.h>
21#include <linux/timer.h>
22#include <linux/mm.h>
23#include <linux/kernel.h>
24#include <linux/fcntl.h>
25#include <linux/stat.h>
26#include <linux/socket.h>
27#include <linux/in.h>
28#include <linux/inet.h>
29#include <linux/netdevice.h>
30#include <linux/inetdevice.h>
31#include <linux/igmp.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34#include <linux/mroute.h>
35#include <linux/init.h>
36#include <net/ip.h>
37#include <net/protocol.h>
38#include <linux/skbuff.h>
39#include <net/sock.h>
40#include <net/icmp.h>
41#include <net/udp.h>
42#include <net/raw.h>
43#include <linux/notifier.h>
44#include <linux/if_arp.h>
45#include <linux/netfilter_ipv4.h>
46#include <net/ipip.h>
47#include <net/checksum.h>
48#include <net/ip_fib.h>
49#include <net/ip_mp_alg.h>
50
51#define MULTIPATH_STATE_SIZE 15
52
53struct multipath_candidate {
54 struct multipath_candidate *next;
55 int power;
56 struct rtable *rt;
57};
58
59struct multipath_dest {
60 struct list_head list;
61
62 const struct fib_nh *nh_info;
63 __u32 netmask;
64 __u32 network;
65 unsigned char prefixlen;
66
67 struct rcu_head rcu;
68};
69
70struct multipath_bucket {
71 struct list_head head;
72 spinlock_t lock;
73};
74
75struct multipath_route {
76 struct list_head list;
77
78 int oif;
79 __u32 gw;
80 struct list_head dests;
81
82 struct rcu_head rcu;
83};
84
85/* state: primarily weight per route information */
86static struct multipath_bucket state[MULTIPATH_STATE_SIZE];
87
88/* interface to random number generation */
89static unsigned int RANDOM_SEED = 93186752;
90
91static inline unsigned int random(unsigned int ubound)
92{
93 static unsigned int a = 1588635695,
94 q = 2,
95 r = 1117695901;
96 RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q);
97 return RANDOM_SEED % ubound;
98}
99
100static unsigned char __multipath_lookup_weight(const struct flowi *fl,
101 const struct rtable *rt)
102{
103 const int state_idx = rt->idev->dev->ifindex % MULTIPATH_STATE_SIZE;
104 struct multipath_route *r;
105 struct multipath_route *target_route = NULL;
106 struct multipath_dest *d;
107 int weight = 1;
108
109 /* lookup the weight information for a certain route */
110 rcu_read_lock();
111
112 /* find state entry for gateway or add one if necessary */
113 list_for_each_entry_rcu(r, &state[state_idx].head, list) {
114 if (r->gw == rt->rt_gateway &&
115 r->oif == rt->idev->dev->ifindex) {
116 target_route = r;
117 break;
118 }
119 }
120
121 if (!target_route) {
122 /* this should not happen... but we are prepared */
123 printk( KERN_CRIT"%s: missing state for gateway: %u and " \
124 "device %d\n", __FUNCTION__, rt->rt_gateway,
125 rt->idev->dev->ifindex);
126 goto out;
127 }
128
129 /* find state entry for destination */
130 list_for_each_entry_rcu(d, &target_route->dests, list) {
131 __u32 targetnetwork = fl->fl4_dst &
132 (0xFFFFFFFF >> (32 - d->prefixlen));
133
134 if ((targetnetwork & d->netmask) == d->network) {
135 weight = d->nh_info->nh_weight;
136 goto out;
137 }
138 }
139
140out:
141 rcu_read_unlock();
142 return weight;
143}
144
145static void wrandom_init_state(void)
146{
147 int i;
148
149 for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) {
150 INIT_LIST_HEAD(&state[i].head);
151 spin_lock_init(&state[i].lock);
152 }
153}
154
155static void wrandom_select_route(const struct flowi *flp,
156 struct rtable *first,
157 struct rtable **rp)
158{
159 struct rtable *rt;
160 struct rtable *decision;
161 struct multipath_candidate *first_mpc = NULL;
162 struct multipath_candidate *mpc, *last_mpc = NULL;
163 int power = 0;
164 int last_power;
165 int selector;
166 const size_t size_mpc = sizeof(struct multipath_candidate);
167
168 /* collect all candidates and identify their weights */
169 for (rt = rcu_dereference(first); rt;
170 rt = rcu_dereference(rt->u.rt_next)) {
171 if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
172 multipath_comparekeys(&rt->fl, flp)) {
173 struct multipath_candidate* mpc =
174 (struct multipath_candidate*)
175 kmalloc(size_mpc, GFP_KERNEL);
176
177 if (!mpc)
178 return;
179
180 power += __multipath_lookup_weight(flp, rt) * 10000;
181
182 mpc->power = power;
183 mpc->rt = rt;
184 mpc->next = NULL;
185
186 if (!first_mpc)
187 first_mpc = mpc;
188 else
189 last_mpc->next = mpc;
190
191 last_mpc = mpc;
192 }
193 }
194
195 /* choose a weighted random candidate */
196 decision = first;
197 selector = random(power);
198 last_power = 0;
199
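	/*
	 * Each candidate's ->power holds the running total of the weights
	 * seen so far, so (given at least one candidate) the selector drawn
	 * from [0, power) falls into exactly one [last_power, mpc->power)
	 * interval; that candidate is picked below.
	 */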
200 /* select candidate, adjust GC data and cleanup local state */
201 decision = first;
202 last_mpc = NULL;
203 for (mpc = first_mpc; mpc; mpc = mpc->next) {
204 mpc->rt->u.dst.lastuse = jiffies;
205 if (last_power <= selector && selector < mpc->power)
206 decision = mpc->rt;
207
208 last_power = mpc->power;
209 if (last_mpc)
210 kfree(last_mpc);
211
212 last_mpc = mpc;
213 }
214
215 if (last_mpc) {
216 /* concurrent __multipath_flush may lead to !last_mpc */
217 kfree(last_mpc);
218 }
219
220 decision->u.dst.__use++;
221 *rp = decision;
222}
223
224static void wrandom_set_nhinfo(__u32 network,
225 __u32 netmask,
226 unsigned char prefixlen,
227 const struct fib_nh *nh)
228{
229 const int state_idx = nh->nh_oif % MULTIPATH_STATE_SIZE;
230 struct multipath_route *r, *target_route = NULL;
231 struct multipath_dest *d, *target_dest = NULL;
232
233 /* store the weight information for a certain route */
234 spin_lock(&state[state_idx].lock);
235
236 /* find state entry for gateway or add one if necessary */
237 list_for_each_entry_rcu(r, &state[state_idx].head, list) {
238 if (r->gw == nh->nh_gw && r->oif == nh->nh_oif) {
239 target_route = r;
240 break;
241 }
242 }
243
244 if (!target_route) {
245 const size_t size_rt = sizeof(struct multipath_route);
246 target_route = (struct multipath_route *)
247 kmalloc(size_rt, GFP_KERNEL);
248
249 target_route->gw = nh->nh_gw;
250 target_route->oif = nh->nh_oif;
251 memset(&target_route->rcu, 0, sizeof(struct rcu_head));
252 INIT_LIST_HEAD(&target_route->dests);
253
254 list_add_rcu(&target_route->list, &state[state_idx].head);
255 }
256
257 /* find state entry for destination or add one if necessary */
258 list_for_each_entry_rcu(d, &target_route->dests, list) {
259 if (d->nh_info == nh) {
260 target_dest = d;
261 break;
262 }
263 }
264
265 if (!target_dest) {
266 const size_t size_dst = sizeof(struct multipath_dest);
267 target_dest = (struct multipath_dest*)
268 kmalloc(size_dst, GFP_KERNEL);
269
270 target_dest->nh_info = nh;
271 target_dest->network = network;
272 target_dest->netmask = netmask;
273 target_dest->prefixlen = prefixlen;
274 memset(&target_dest->rcu, 0, sizeof(struct rcu_head));
275
276 list_add_rcu(&target_dest->list, &target_route->dests);
277 }
278 /* else: we already stored this info for another destination =>
279 * we are finished
280 */
281
282 spin_unlock(&state[state_idx].lock);
283}
284
285static void __multipath_free(struct rcu_head *head)
286{
287 struct multipath_route *rt = container_of(head, struct multipath_route,
288 rcu);
289 kfree(rt);
290}
291
292static void __multipath_free_dst(struct rcu_head *head)
293{
294 struct multipath_dest *dst = container_of(head,
295 struct multipath_dest,
296 rcu);
297 kfree(dst);
298}
299
300static void wrandom_flush(void)
301{
302 int i;
303
304	/* defer deletion of all entries */
305 for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) {
306 struct multipath_route *r;
307
308 spin_lock(&state[i].lock);
309 list_for_each_entry_rcu(r, &state[i].head, list) {
310 struct multipath_dest *d;
311 list_for_each_entry_rcu(d, &r->dests, list) {
312 list_del_rcu(&d->list);
313 call_rcu(&d->rcu,
314 __multipath_free_dst);
315 }
316 list_del_rcu(&r->list);
317 call_rcu(&r->rcu,
318 __multipath_free);
319 }
320
321 spin_unlock(&state[i].lock);
322 }
323}
324
325static struct ip_mp_alg_ops wrandom_ops = {
326 .mp_alg_select_route = wrandom_select_route,
327 .mp_alg_flush = wrandom_flush,
328 .mp_alg_set_nhinfo = wrandom_set_nhinfo,
329};
330
331static int __init wrandom_init(void)
332{
333 wrandom_init_state();
334
335 return multipath_alg_register(&wrandom_ops, IP_MP_ALG_WRANDOM);
336}
337
338static void __exit wrandom_exit(void)
339{
340 multipath_alg_unregister(&wrandom_ops, IP_MP_ALG_WRANDOM);
341}
342
343module_init(wrandom_init);
344module_exit(wrandom_exit);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
new file mode 100644
index 000000000000..46d4cb1c06f0
--- /dev/null
+++ b/net/ipv4/netfilter/Kconfig
@@ -0,0 +1,696 @@
1#
2# IP netfilter configuration
3#
4
5menu "IP: Netfilter Configuration"
6 depends on INET && NETFILTER
7
8# connection tracking, helpers and protocols
9config IP_NF_CONNTRACK
10 tristate "Connection tracking (required for masq/NAT)"
11 ---help---
12 Connection tracking keeps a record of what packets have passed
13 through your machine, in order to figure out how they are related
14 into connections.
15
16 This is required to do Masquerading or other kinds of Network
17 Address Translation (except for Fast NAT). It can also be used to
18 enhance packet filtering (see `Connection state match support'
19 below).
20
21 To compile it as a module, choose M here. If unsure, say N.
22
23config IP_NF_CT_ACCT
24 bool "Connection tracking flow accounting"
25 depends on IP_NF_CONNTRACK
26 help
27 If this option is enabled, the connection tracking code will
28 keep per-flow packet and byte counters.
29
30 Those counters can be used for flow-based accounting or the
31 `connbytes' match.
32
33 If unsure, say `N'.
34
35config IP_NF_CONNTRACK_MARK
36 bool 'Connection mark tracking support'
37 help
38 This option enables support for connection marks, used by the
39 `CONNMARK' target and `connmark' match. Similar to the mark value
40 of packets, but this mark value is kept in the conntrack session
41 instead of the individual packets.
42
43config IP_NF_CT_PROTO_SCTP
44 tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)'
45 depends on IP_NF_CONNTRACK && EXPERIMENTAL
46 help
47 With this option enabled, the connection tracking code will
48 be able to do state tracking on SCTP connections.
49
50 If you want to compile it as a module, say M here and read
51 <file:Documentation/modules.txt>. If unsure, say `N'.
52
53config IP_NF_FTP
54 tristate "FTP protocol support"
55 depends on IP_NF_CONNTRACK
56 help
57 Tracking FTP connections is problematic: special helpers are
58 required for tracking them, and doing masquerading and other forms
59 of Network Address Translation on them.
60
61 To compile it as a module, choose M here. If unsure, say Y.
62
63config IP_NF_IRC
64 tristate "IRC protocol support"
65 depends on IP_NF_CONNTRACK
66 ---help---
67 There is a commonly-used extension to IRC called
68 Direct Client-to-Client Protocol (DCC). This enables users to send
69 files to each other, and also chat to each other without the need
70 of a server. DCC Sending is used anywhere you send files over IRC,
71 and DCC Chat is most commonly used by Eggdrop bots. If you are
72 using NAT, this extension will enable you to send files and initiate
73 chats. Note that you do NOT need this extension to get files or
74 have others initiate chats, or for anything else in IRC.
75
76 To compile it as a module, choose M here. If unsure, say Y.
77
78config IP_NF_TFTP
79 tristate "TFTP protocol support"
80 depends on IP_NF_CONNTRACK
81 help
82 TFTP connection tracking helper; this is required depending
83 on how restrictive your ruleset is.
84 If you are using a tftp client behind -j SNAT or -j MASQUERADE,
85 you will need this.
86
87 To compile it as a module, choose M here. If unsure, say Y.
88
89config IP_NF_AMANDA
90 tristate "Amanda backup protocol support"
91 depends on IP_NF_CONNTRACK
92 help
93 If you are running the Amanda backup package <http://www.amanda.org/>
94 on this machine or machines that will be MASQUERADED through this
95 machine, then you may want to enable this feature. This allows the
96 connection tracking and natting code to allow the sub-channels that
97 Amanda requires for communication of the backup data, messages and
98 index.
99
100 To compile it as a module, choose M here. If unsure, say Y.
101
102config IP_NF_QUEUE
103 tristate "Userspace queueing via NETLINK"
104 help
105 Netfilter has the ability to queue packets to user space: the
106 netlink device can be used to access them using this driver.
107
108 To compile it as a module, choose M here. If unsure, say N.
109
110config IP_NF_IPTABLES
111 tristate "IP tables support (required for filtering/masq/NAT)"
112 help
113 iptables is a general, extensible packet identification framework.
114 The packet filtering and full NAT (masquerading, port forwarding,
115 etc) subsystems now use this: say `Y' or `M' here if you want to use
116 either of those.
117
118 To compile it as a module, choose M here. If unsure, say N.
119
120# The matches.
121config IP_NF_MATCH_LIMIT
122 tristate "limit match support"
123 depends on IP_NF_IPTABLES
124 help
125 limit matching allows you to control the rate at which a rule can be
126 matched: mainly useful in combination with the LOG target ("LOG
127 target support", below) and to avoid some Denial of Service attacks.
128
129 To compile it as a module, choose M here. If unsure, say N.
130
131config IP_NF_MATCH_IPRANGE
132 tristate "IP range match support"
133 depends on IP_NF_IPTABLES
134 help
135 This option makes it possible to match IP addresses against IP address
136 ranges.
137
138 To compile it as a module, choose M here. If unsure, say N.
139
140config IP_NF_MATCH_MAC
141 tristate "MAC address match support"
142 depends on IP_NF_IPTABLES
143 help
144 MAC matching allows you to match packets based on the source
145 Ethernet address of the packet.
146
147 To compile it as a module, choose M here. If unsure, say N.
148
149config IP_NF_MATCH_PKTTYPE
150 tristate "Packet type match support"
151 depends on IP_NF_IPTABLES
152 help
153 Packet type matching allows you to match a packet by
154 its "class", e.g. BROADCAST, MULTICAST, ...
155
156 Typical usage:
157 iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG
158
159 To compile it as a module, choose M here. If unsure, say N.
160
161config IP_NF_MATCH_MARK
162 tristate "netfilter MARK match support"
163 depends on IP_NF_IPTABLES
164 help
165 Netfilter mark matching allows you to match packets based on the
166 `nfmark' value in the packet. This can be set by the MARK target
167 (see below).
168
169 To compile it as a module, choose M here. If unsure, say N.
170
171config IP_NF_MATCH_MULTIPORT
172 tristate "Multiple port match support"
173 depends on IP_NF_IPTABLES
174 help
175 Multiport matching allows you to match TCP or UDP packets based on
176 a series of source or destination ports: normally a rule can only
177 match a single range of ports.
178
179 To compile it as a module, choose M here. If unsure, say N.
180
181config IP_NF_MATCH_TOS
182 tristate "TOS match support"
183 depends on IP_NF_IPTABLES
184 help
185 TOS matching allows you to match packets based on the Type Of
186 Service fields of the IP packet.
187
188 To compile it as a module, choose M here. If unsure, say N.
189
190config IP_NF_MATCH_RECENT
191 tristate "recent match support"
192 depends on IP_NF_IPTABLES
193 help
194 This match is used for creating one or many lists of recently
195 used addresses and then matching against that/those list(s).
196
197 Short options are available by using 'iptables -m recent -h'
198 Official Website: <http://snowman.net/projects/ipt_recent/>
199
200 To compile it as a module, choose M here. If unsure, say N.
201
202config IP_NF_MATCH_ECN
203 tristate "ECN match support"
204 depends on IP_NF_IPTABLES
205 help
206 This option adds a `ECN' match, which allows you to match against
207 the IPv4 and TCP header ECN fields.
208
209 To compile it as a module, choose M here. If unsure, say N.
210
211config IP_NF_MATCH_DSCP
212 tristate "DSCP match support"
213 depends on IP_NF_IPTABLES
214 help
215 This option adds a `DSCP' match, which allows you to match against
216 the IPv4 header DSCP field (DSCP codepoint).
217
218 The DSCP codepoint can have any value between 0x0 and 0x3f.
219
220 To compile it as a module, choose M here. If unsure, say N.
221
222config IP_NF_MATCH_AH_ESP
223 tristate "AH/ESP match support"
224 depends on IP_NF_IPTABLES
225 help
226 These two match extensions (`ah' and `esp') allow you to match a
227 range of SPIs inside AH or ESP headers of IPSec packets.
228
229 To compile it as a module, choose M here. If unsure, say N.
230
231config IP_NF_MATCH_LENGTH
232 tristate "LENGTH match support"
233 depends on IP_NF_IPTABLES
234 help
235 This option allows you to match the length of a packet against a
236 specific value or range of values.
237
238 To compile it as a module, choose M here. If unsure, say N.
239
240config IP_NF_MATCH_TTL
241 tristate "TTL match support"
242 depends on IP_NF_IPTABLES
243 help
244 This adds the CONFIG_IP_NF_MATCH_TTL option, which enables the user
245 to match packets by their TTL value.
246
247 To compile it as a module, choose M here. If unsure, say N.
248
249config IP_NF_MATCH_TCPMSS
250 tristate "tcpmss match support"
251 depends on IP_NF_IPTABLES
252 help
253 This option adds a `tcpmss' match, which allows you to examine the
254 MSS value of TCP SYN packets, which control the maximum packet size
255 for that connection.
256
257 To compile it as a module, choose M here. If unsure, say N.
258
259config IP_NF_MATCH_HELPER
260 tristate "Helper match support"
261 depends on IP_NF_CONNTRACK && IP_NF_IPTABLES
262 help
263 Helper matching allows you to match packets in dynamic connections
264 tracked by a conntrack-helper, i.e. ip_conntrack_ftp
265
266 To compile it as a module, choose M here. If unsure, say Y.
267
268config IP_NF_MATCH_STATE
269 tristate "Connection state match support"
270 depends on IP_NF_CONNTRACK && IP_NF_IPTABLES
271 help
272 Connection state matching allows you to match packets based on their
273 relationship to a tracked connection (ie. previous packets). This
274 is a powerful tool for packet classification.
275
276 To compile it as a module, choose M here. If unsure, say N.
277
278config IP_NF_MATCH_CONNTRACK
279 tristate "Connection tracking match support"
280 depends on IP_NF_CONNTRACK && IP_NF_IPTABLES
281 help
282 This is a general conntrack match module, a superset of the state match.
283
284 It allows matching on additional conntrack information, which is
285 useful in complex configurations, such as NAT gateways with multiple
286 internet links or tunnels.
287
288 To compile it as a module, choose M here. If unsure, say N.
289
290config IP_NF_MATCH_OWNER
291 tristate "Owner match support"
292 depends on IP_NF_IPTABLES
293 help
294 Packet owner matching allows you to match locally-generated packets
295 based on who created them: the user, group, process or session.
296
297 To compile it as a module, choose M here. If unsure, say N.
298
299config IP_NF_MATCH_PHYSDEV
300 tristate "Physdev match support"
301 depends on IP_NF_IPTABLES && BRIDGE_NETFILTER
302 help
303 Physdev packet matching matches against the physical bridge ports
304 the IP packet arrived on or will leave by.
305
306 To compile it as a module, choose M here. If unsure, say N.
307
308config IP_NF_MATCH_ADDRTYPE
309 tristate 'address type match support'
310 depends on IP_NF_IPTABLES
311 help
312 This option allows you to match what routing thinks of an address,
313 e.g. UNICAST, LOCAL, BROADCAST, ...
314
315 If you want to compile it as a module, say M here and read
316 <file:Documentation/modules.txt>. If unsure, say `N'.
317
318config IP_NF_MATCH_REALM
319 tristate 'realm match support'
320 depends on IP_NF_IPTABLES
321 select NET_CLS_ROUTE
322 help
323 This option adds a `realm' match, which allows you to use the realm
324 key from the routing subsystem inside iptables.
325
326 This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option
327 in tc world.
328
329 If you want to compile it as a module, say M here and read
330 <file:Documentation/modules.txt>. If unsure, say `N'.
331
332config IP_NF_MATCH_SCTP
333 tristate 'SCTP protocol match support'
334 depends on IP_NF_IPTABLES
335 help
336 With this option enabled, you will be able to use the iptables
337 `sctp' match in order to match on SCTP source/destination ports
338 and SCTP chunk types.
339
340 If you want to compile it as a module, say M here and read
341 <file:Documentation/modules.txt>. If unsure, say `N'.
342
343config IP_NF_MATCH_COMMENT
344 tristate 'comment match support'
345 depends on IP_NF_IPTABLES
346 help
347 This option adds a `comment' dummy-match, which allows you to put
348 comments in your iptables ruleset.
349
350 If you want to compile it as a module, say M here and read
351 <file:Documentation/modules.txt>. If unsure, say `N'.
352
353config IP_NF_MATCH_CONNMARK
354 tristate 'Connection mark match support'
355 depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES
356 help
357 This option adds a `connmark' match, which allows you to match the
358 connection mark value previously set for the session by `CONNMARK'.
359
360 If you want to compile it as a module, say M here and read
361 <file:Documentation/modules.txt>. The module will be called
362 ipt_connmark.o. If unsure, say `N'.
363
364config IP_NF_MATCH_HASHLIMIT
365 tristate 'hashlimit match support'
366 depends on IP_NF_IPTABLES
367 help
368 This option adds a new iptables `hashlimit' match.
369
370 As opposed to `limit', this match dynamically creates a hash table
371 of limit buckets, based on your selection of source/destination
372 ip addresses and/or ports.
373
374 It enables you to express policies like `10kpps for any given
375 destination IP' or `500pps from any given source IP' with a single
377 iptables rule.
377
378# `filter', generic and specific targets
379config IP_NF_FILTER
380 tristate "Packet filtering"
381 depends on IP_NF_IPTABLES
382 help
383 Packet filtering defines a table `filter', which has a series of
384 rules for simple packet filtering at local input, forwarding and
385 local output. See the man page for iptables(8).
386
387 To compile it as a module, choose M here. If unsure, say N.
388
389config IP_NF_TARGET_REJECT
390 tristate "REJECT target support"
391 depends on IP_NF_FILTER
392 help
393 The REJECT target allows a filtering rule to specify that an ICMP
394 error should be issued in response to an incoming packet, rather
395 than silently being dropped.
396
397 To compile it as a module, choose M here. If unsure, say N.
398
399config IP_NF_TARGET_LOG
400 tristate "LOG target support"
401 depends on IP_NF_IPTABLES
402 help
403 This option adds a `LOG' target, which allows you to create rules in
404 any iptables table which records the packet header to the syslog.
405
406 To compile it as a module, choose M here. If unsure, say N.
407
408config IP_NF_TARGET_ULOG
409 tristate "ULOG target support"
410 depends on IP_NF_IPTABLES
411 ---help---
412 This option adds a `ULOG' target, which allows you to create rules in
413 any iptables table. The packet is passed to a userspace logging
414 daemon using netlink multicast sockets, unlike the LOG target,
415 whose output can only be viewed through syslog.
416
417 The appropriate userspace logging daemon (ulogd) may be obtained from
418 <http://www.gnumonks.org/projects/ulogd/>
419
420 To compile it as a module, choose M here. If unsure, say N.
421
422config IP_NF_TARGET_TCPMSS
423 tristate "TCPMSS target support"
424 depends on IP_NF_IPTABLES
425 ---help---
426 This option adds a `TCPMSS' target, which allows you to alter the
427 MSS value of TCP SYN packets, to control the maximum size for that
428 connection (usually limiting it to your outgoing interface's MTU
429 minus 40).
430
431 This is used to overcome criminally braindead ISPs or servers which
432 block ICMP Fragmentation Needed packets. The symptoms of this
433 problem are that everything works fine from your Linux
434 firewall/router, but machines behind it can never exchange large
435 packets:
436 1) Web browsers connect, then hang with no data received.
437 2) Small mail works fine, but large emails hang.
438 3) ssh works fine, but scp hangs after initial handshaking.
439
440 Workaround: activate this option and add a rule to your firewall
441 configuration like:
442
443 iptables -A FORWARD -p tcp --tcp-flags SYN,RST SYN \
444 -j TCPMSS --clamp-mss-to-pmtu
445
446 To compile it as a module, choose M here. If unsure, say N.
447
448# NAT + specific targets
449config IP_NF_NAT
450 tristate "Full NAT"
451 depends on IP_NF_IPTABLES && IP_NF_CONNTRACK
452 help
453 The Full NAT option allows masquerading, port forwarding and other
454 forms of full Network Address Port Translation. It is controlled by
455 the `nat' table in iptables: see the man page for iptables(8).
456
457 To compile it as a module, choose M here. If unsure, say N.
458
459config IP_NF_NAT_NEEDED
460 bool
461 depends on IP_NF_NAT != n
462 default y
463
464config IP_NF_TARGET_MASQUERADE
465 tristate "MASQUERADE target support"
466 depends on IP_NF_NAT
467 help
468 Masquerading is a special case of NAT: all outgoing connections are
469 changed to seem to come from a particular interface's address, and
470 if the interface goes down, those connections are lost. This is
471 only useful for dialup accounts with dynamic IP address (ie. your IP
472 address will be different on next dialup).
473
474 To compile it as a module, choose M here. If unsure, say N.
475
476config IP_NF_TARGET_REDIRECT
477 tristate "REDIRECT target support"
478 depends on IP_NF_NAT
479 help
480 REDIRECT is a special case of NAT: all incoming connections are
481 mapped onto the incoming interface's address, causing the packets to
482 come to the local machine instead of passing through. This is
483 useful for transparent proxies.
484
485 To compile it as a module, choose M here. If unsure, say N.
486
487config IP_NF_TARGET_NETMAP
488 tristate "NETMAP target support"
489 depends on IP_NF_NAT
490 help
491 NETMAP is an implementation of static 1:1 NAT mapping of network
492 addresses. It maps the network address part, while keeping the host
493 address part intact. It is similar to Fast NAT, except that
494 Netfilter's connection tracking doesn't work well with Fast NAT.
495
496 To compile it as a module, choose M here. If unsure, say N.
497
498config IP_NF_TARGET_SAME
499 tristate "SAME target support"
500 depends on IP_NF_NAT
501 help
502 This option adds a `SAME' target, which works like the standard SNAT
503 target, but attempts to give clients the same IP for all connections.
504
505 To compile it as a module, choose M here. If unsure, say N.
506
507config IP_NF_NAT_SNMP_BASIC
508 tristate "Basic SNMP-ALG support (EXPERIMENTAL)"
509 depends on EXPERIMENTAL && IP_NF_NAT
510 ---help---
511
512 This module implements an Application Layer Gateway (ALG) for
513 SNMP payloads. In conjunction with NAT, it allows a network
514 management system to access multiple private networks with
515 conflicting addresses. It works by modifying IP addresses
516 inside SNMP payloads to match IP-layer NAT mapping.
517
518 This is the "basic" form of SNMP-ALG, as described in RFC 2962.
519
520 To compile it as a module, choose M here. If unsure, say N.
521
522config IP_NF_NAT_IRC
523 tristate
524 depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
525 default IP_NF_NAT if IP_NF_IRC=y
526 default m if IP_NF_IRC=m
527
528# If they want FTP, set to $CONFIG_IP_NF_NAT (m or y),
529# or $CONFIG_IP_NF_FTP (m or y), whichever is weaker. Argh.
530config IP_NF_NAT_FTP
531 tristate
532 depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
533 default IP_NF_NAT if IP_NF_FTP=y
534 default m if IP_NF_FTP=m
535
536config IP_NF_NAT_TFTP
537 tristate
538 depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
539 default IP_NF_NAT if IP_NF_TFTP=y
540 default m if IP_NF_TFTP=m
541
542config IP_NF_NAT_AMANDA
543 tristate
544 depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
545 default IP_NF_NAT if IP_NF_AMANDA=y
546 default m if IP_NF_AMANDA=m
547
548# mangle + specific targets
549config IP_NF_MANGLE
550 tristate "Packet mangling"
551 depends on IP_NF_IPTABLES
552 help
553 This option adds a `mangle' table to iptables: see the man page for
554 iptables(8). This table is used for various packet alterations
555 which can affect how the packet is routed.
556
557 To compile it as a module, choose M here. If unsure, say N.
558
559config IP_NF_TARGET_TOS
560 tristate "TOS target support"
561 depends on IP_NF_MANGLE
562 help
563 This option adds a `TOS' target, which allows you to create rules in
564 the `mangle' table which alter the Type Of Service field of an IP
565 packet prior to routing.
566
567 To compile it as a module, choose M here. If unsure, say N.
568
569config IP_NF_TARGET_ECN
570 tristate "ECN target support"
571 depends on IP_NF_MANGLE
572 ---help---
573 This option adds a `ECN' target, which can be used in the iptables mangle
574 table.
575
576 You can use this target to remove the ECN bits from the IPv4 header of
577 an IP packet. This is particularly useful, if you need to work around
578 existing ECN blackholes on the internet, but don't want to disable
579 ECN support in general.
580
581 To compile it as a module, choose M here. If unsure, say N.
582
583config IP_NF_TARGET_DSCP
584 tristate "DSCP target support"
585 depends on IP_NF_MANGLE
586 help
587 This option adds a `DSCP' target, which allows you to alter the
588 value of the IPv4 header DSCP field (the DSCP codepoint).
589
590 The DSCP codepoint can have any value between 0x0 and 0x3f.
591
592 To compile it as a module, choose M here. If unsure, say N.
593
594config IP_NF_TARGET_MARK
595 tristate "MARK target support"
596 depends on IP_NF_MANGLE
597 help
598 This option adds a `MARK' target, which allows you to create rules
599 in the `mangle' table which alter the netfilter mark (nfmark) field
600 associated with the packet prior to routing. This can change
601 the routing method (see `Use netfilter MARK value as routing
602 key') and can also be used by other subsystems to change their
603 behavior.
604
605 To compile it as a module, choose M here. If unsure, say N.
606
607config IP_NF_TARGET_CLASSIFY
608 tristate "CLASSIFY target support"
609 depends on IP_NF_MANGLE
610 help
611 This option adds a `CLASSIFY' target, which enables the user to set
612 the priority of a packet. Some qdiscs can use this value for
613 classification, among these are:
614
615 atm, cbq, dsmark, pfifo_fast, htb, prio
616
617 To compile it as a module, choose M here. If unsure, say N.
618
619config IP_NF_TARGET_CONNMARK
620 tristate 'CONNMARK target support'
621 depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE
622 help
623 This option adds a `CONNMARK' target, which allows one to manipulate
624 the connection mark value. Similar to the MARK target, but
625 affects the connection mark value rather than the packet mark value.
626
627 If you want to compile it as a module, say M here and read
628 <file:Documentation/modules.txt>. The module will be called
629 ipt_CONNMARK.o. If unsure, say `N'.
630
631config IP_NF_TARGET_CLUSTERIP
632 tristate "CLUSTERIP target support (EXPERIMENTAL)"
633 depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES && EXPERIMENTAL
634 help
635 The CLUSTERIP target allows you to build load-balancing clusters of
636 network servers without having a dedicated load-balancing
637 router/server/switch.
638
639 To compile it as a module, choose M here. If unsure, say N.
640
641# raw + specific targets
642config IP_NF_RAW
643 tristate 'raw table support (required for NOTRACK/TRACE)'
644 depends on IP_NF_IPTABLES
645 help
646 This option adds a `raw' table to iptables. This table is the very
647 first in the netfilter framework and hooks in at the PREROUTING
648 and OUTPUT chains.
649
650 If you want to compile it as a module, say M here and read
651 <file:Documentation/modules.txt>. If unsure, say `N'.
652
653config IP_NF_TARGET_NOTRACK
654 tristate 'NOTRACK target support'
655 depends on IP_NF_RAW
656 depends on IP_NF_CONNTRACK
657 help
658 The NOTRACK target allows rules in the raw table to specify
659 which packets should *not* enter the conntrack/NAT
660 subsystem, with all the consequences (no ICMP error tracking,
661 no protocol helpers for the selected packets).
662
663 If you want to compile it as a module, say M here and read
664 <file:Documentation/modules.txt>. If unsure, say `N'.
665
666
667# ARP tables
668config IP_NF_ARPTABLES
669 tristate "ARP tables support"
670 help
671 arptables is a general, extensible packet identification framework.
672 The ARP packet filtering and mangling (manipulation) subsystems
673 use this: say Y or M here if you want to use either of those.
674
675 To compile it as a module, choose M here. If unsure, say N.
676
677config IP_NF_ARPFILTER
678 tristate "ARP packet filtering"
679 depends on IP_NF_ARPTABLES
680 help
681 ARP packet filtering defines a table `filter', which has a series of
682 rules for simple ARP packet filtering at local input and
683 local output. On a bridge, you can also specify filtering rules
684 for forwarded ARP packets. See the man page for arptables(8).
685
686 To compile it as a module, choose M here. If unsure, say N.
687
688config IP_NF_ARP_MANGLE
689 tristate "ARP payload mangling"
690 depends on IP_NF_ARPTABLES
691 help
692 Allows altering the ARP packet payload: source and destination
693 hardware and network addresses.
694
695endmenu
696
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
new file mode 100644
index 000000000000..45796d5924dd
--- /dev/null
+++ b/net/ipv4/netfilter/Makefile
@@ -0,0 +1,89 @@
1#
2# Makefile for the netfilter modules on top of IPv4.
3#
4
5# objects for the standalone - connection tracking / NAT
6ip_conntrack-objs := ip_conntrack_standalone.o ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o
7iptable_nat-objs := ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o
8
9# connection tracking
10obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
11
12# SCTP protocol connection tracking
13obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o
14
15# connection tracking helpers
16obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
17obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
18obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
19obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
20
21# NAT helpers
22obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
23obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
24obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
25obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
26
27# generic IP tables
28obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
29
30# the three instances of ip_tables
31obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
32obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
33obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
34obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
35
36# matches
37obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o
38obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o
39obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o
40obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o
41obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o
42obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o
43obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o
44obj-$(CONFIG_IP_NF_MATCH_PKTTYPE) += ipt_pkttype.o
45obj-$(CONFIG_IP_NF_MATCH_MULTIPORT) += ipt_multiport.o
46obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o
47obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o
48obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o
49obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
50obj-$(CONFIG_IP_NF_MATCH_DSCP) += ipt_dscp.o
51obj-$(CONFIG_IP_NF_MATCH_AH_ESP) += ipt_ah.o ipt_esp.o
52obj-$(CONFIG_IP_NF_MATCH_LENGTH) += ipt_length.o
53obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
54obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o
55obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o
56obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o
57obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o
58obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o
59obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
60obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o
61obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o
62
63# targets
64obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
65obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o
66obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
67obj-$(CONFIG_IP_NF_TARGET_DSCP) += ipt_DSCP.o
68obj-$(CONFIG_IP_NF_TARGET_MARK) += ipt_MARK.o
69obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
70obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
71obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
72obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o
73obj-$(CONFIG_IP_NF_TARGET_CLASSIFY) += ipt_CLASSIFY.o
74obj-$(CONFIG_IP_NF_NAT_SNMP_BASIC) += ip_nat_snmp_basic.o
75obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o
76obj-$(CONFIG_IP_NF_TARGET_CONNMARK) += ipt_CONNMARK.o
77obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
78obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
79obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o
80obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
81
82# generic ARP tables
83obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
84obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
85
86# just filtering instance of ARP tables for now
87obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
88
89obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
new file mode 100644
index 000000000000..df79f5ed6a0a
--- /dev/null
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -0,0 +1,1333 @@
1/*
2 * Packet matching code for ARP packets.
3 *
4 * Based heavily, if not almost entirely, upon ip_tables.c framework.
5 *
6 * Some ARP specific bits are:
7 *
8 * Copyright (C) 2002 David S. Miller (davem@redhat.com)
9 *
10 */
11
12#include <linux/config.h>
13#include <linux/kernel.h>
14#include <linux/skbuff.h>
15#include <linux/netdevice.h>
16#include <linux/if_arp.h>
17#include <linux/kmod.h>
18#include <linux/vmalloc.h>
19#include <linux/proc_fs.h>
20#include <linux/module.h>
21#include <linux/init.h>
22
23#include <asm/uaccess.h>
24#include <asm/semaphore.h>
25
26#include <linux/netfilter_arp/arp_tables.h>
27
28MODULE_LICENSE("GPL");
29MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
30MODULE_DESCRIPTION("arptables core");
31
32/*#define DEBUG_ARP_TABLES*/
33/*#define DEBUG_ARP_TABLES_USER*/
34
35#ifdef DEBUG_ARP_TABLES
36#define dprintf(format, args...) printk(format , ## args)
37#else
38#define dprintf(format, args...)
39#endif
40
41#ifdef DEBUG_ARP_TABLES_USER
42#define duprintf(format, args...) printk(format , ## args)
43#else
44#define duprintf(format, args...)
45#endif
46
47#ifdef CONFIG_NETFILTER_DEBUG
48#define ARP_NF_ASSERT(x) \
49do { \
50 if (!(x)) \
51 printk("ARP_NF_ASSERT: %s:%s:%u\n", \
52 __FUNCTION__, __FILE__, __LINE__); \
53} while(0)
54#else
55#define ARP_NF_ASSERT(x)
56#endif
57#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
58
59static DECLARE_MUTEX(arpt_mutex);
60
61#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
62#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
63#include <linux/netfilter_ipv4/lockhelp.h>
64#include <linux/netfilter_ipv4/listhelp.h>
65
66struct arpt_table_info {
67 unsigned int size;
68 unsigned int number;
69 unsigned int initial_entries;
70 unsigned int hook_entry[NF_ARP_NUMHOOKS];
71 unsigned int underflow[NF_ARP_NUMHOOKS];
72 char entries[0] __attribute__((aligned(SMP_CACHE_BYTES)));
73};
74
75static LIST_HEAD(arpt_target);
76static LIST_HEAD(arpt_tables);
77#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
78
79#ifdef CONFIG_SMP
80#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
81#else
82#define TABLE_OFFSET(t,p) 0
83#endif
84
85static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
86 char *hdr_addr, int len)
87{
88 int i, ret;
89
90 if (len > ARPT_DEV_ADDR_LEN_MAX)
91 len = ARPT_DEV_ADDR_LEN_MAX;
92
93 ret = 0;
94 for (i = 0; i < len; i++)
95 ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
96
97 return (ret != 0);
98}
99
100/* Returns whether packet matches rule or not. */
101static inline int arp_packet_match(const struct arphdr *arphdr,
102 struct net_device *dev,
103 const char *indev,
104 const char *outdev,
105 const struct arpt_arp *arpinfo)
106{
107 char *arpptr = (char *)(arphdr + 1);
108 char *src_devaddr, *tgt_devaddr;
109 u32 src_ipaddr, tgt_ipaddr;
110 int i, ret;
111
112#define FWINV(bool,invflg) ((bool) ^ !!(arpinfo->invflags & invflg))
113
114 if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop,
115 ARPT_INV_ARPOP)) {
116 dprintf("ARP operation field mismatch.\n");
117 dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n",
118 arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask);
119 return 0;
120 }
121
122 if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd,
123 ARPT_INV_ARPHRD)) {
124 dprintf("ARP hardware address format mismatch.\n");
125 dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n",
126 arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask);
127 return 0;
128 }
129
130 if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro,
131 ARPT_INV_ARPPRO)) {
132 dprintf("ARP protocol address format mismatch.\n");
133 dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n",
134 arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask);
135 return 0;
136 }
137
138 if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln,
139 ARPT_INV_ARPHLN)) {
140 dprintf("ARP hardware address length mismatch.\n");
141 dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n",
142 arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask);
143 return 0;
144 }
145
146 src_devaddr = arpptr;
147 arpptr += dev->addr_len;
148 memcpy(&src_ipaddr, arpptr, sizeof(u32));
149 arpptr += sizeof(u32);
150 tgt_devaddr = arpptr;
151 arpptr += dev->addr_len;
152 memcpy(&tgt_ipaddr, arpptr, sizeof(u32));
153
154 if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len),
155 ARPT_INV_SRCDEVADDR) ||
156 FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len),
157 ARPT_INV_TGTDEVADDR)) {
158 dprintf("Source or target device address mismatch.\n");
159
160 return 0;
161 }
162
163 if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr,
164 ARPT_INV_SRCIP) ||
165 FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr),
166 ARPT_INV_TGTIP)) {
167 dprintf("Source or target IP address mismatch.\n");
168
169 dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n",
170 NIPQUAD(src_ipaddr),
171 NIPQUAD(arpinfo->smsk.s_addr),
172 NIPQUAD(arpinfo->src.s_addr),
173 arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : "");
174 dprintf("TGT: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n",
175 NIPQUAD(tgt_ipaddr),
176 NIPQUAD(arpinfo->tmsk.s_addr),
177 NIPQUAD(arpinfo->tgt.s_addr),
178 arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : "");
179 return 0;
180 }
181
182 /* Look for ifname matches. */
183 for (i = 0, ret = 0; i < IFNAMSIZ; i++) {
184 ret |= (indev[i] ^ arpinfo->iniface[i])
185 & arpinfo->iniface_mask[i];
186 }
187
188 if (FWINV(ret != 0, ARPT_INV_VIA_IN)) {
189 dprintf("VIA in mismatch (%s vs %s).%s\n",
190 indev, arpinfo->iniface,
191 arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":"");
192 return 0;
193 }
194
195 for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
196 unsigned long odev;
197 memcpy(&odev, outdev + i*sizeof(unsigned long),
198 sizeof(unsigned long));
199 ret |= (odev
200 ^ ((const unsigned long *)arpinfo->outiface)[i])
201 & ((const unsigned long *)arpinfo->outiface_mask)[i];
202 }
203
204 if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) {
205 dprintf("VIA out mismatch (%s vs %s).%s\n",
206 outdev, arpinfo->outiface,
207 arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":"");
208 return 0;
209 }
210
211 return 1;
212}
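
arp_packet_match() applies each comparison under a per-field mask and feeds the result through FWINV(), which XORs in the matching ARPT_INV_* flag so a rule can be negated without writing the comparison twice. A minimal userspace sketch of that idiom, with made-up names rather than the kernel structures:

#include <stdio.h>
#include <stdint.h>

/* Compare value against want under mask; invert flips accept/reject,
 * which is what the FWINV() macro does with the ARPT_INV_* flags. */
static int masked_match(uint32_t value, uint32_t want, uint32_t mask,
                        int invert)
{
    int mismatch = (value & mask) != want;
    return !(mismatch ^ !!invert);
}

int main(void)
{
    uint32_t src = 0x0a0000fe;            /* 10.0.0.254, host byte order */

    /* match sources in 10.0.0.0/8, then the same rule negated */
    printf("plain:    %d\n", masked_match(src, 0x0a000000, 0xff000000, 0));
    printf("inverted: %d\n", masked_match(src, 0x0a000000, 0xff000000, 1));
    return 0;
}
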
213
214static inline int arp_checkentry(const struct arpt_arp *arp)
215{
216 if (arp->flags & ~ARPT_F_MASK) {
217 duprintf("Unknown flag bits set: %08X\n",
218 arp->flags & ~ARPT_F_MASK);
219 return 0;
220 }
221 if (arp->invflags & ~ARPT_INV_MASK) {
222 duprintf("Unknown invflag bits set: %08X\n",
223 arp->invflags & ~ARPT_INV_MASK);
224 return 0;
225 }
226
227 return 1;
228}
229
230static unsigned int arpt_error(struct sk_buff **pskb,
231 unsigned int hooknum,
232 const struct net_device *in,
233 const struct net_device *out,
234 const void *targinfo,
235 void *userinfo)
236{
237 if (net_ratelimit())
238 printk("arp_tables: error: '%s'\n", (char *)targinfo);
239
240 return NF_DROP;
241}
242
243static inline struct arpt_entry *get_entry(void *base, unsigned int offset)
244{
245 return (struct arpt_entry *)(base + offset);
246}
247
248unsigned int arpt_do_table(struct sk_buff **pskb,
249 unsigned int hook,
250 const struct net_device *in,
251 const struct net_device *out,
252 struct arpt_table *table,
253 void *userdata)
254{
255 static const char nulldevname[IFNAMSIZ];
256 unsigned int verdict = NF_DROP;
257 struct arphdr *arp;
258 int hotdrop = 0;
259 struct arpt_entry *e, *back;
260 const char *indev, *outdev;
261 void *table_base;
262
263 /* ARP header, plus 2 device addresses, plus 2 IP addresses. */
264 if (!pskb_may_pull((*pskb), (sizeof(struct arphdr) +
265 (2 * (*pskb)->dev->addr_len) +
266 (2 * sizeof(u32)))))
267 return NF_DROP;
268
269 indev = in ? in->name : nulldevname;
270 outdev = out ? out->name : nulldevname;
271
272 read_lock_bh(&table->lock);
273 table_base = (void *)table->private->entries
274 + TABLE_OFFSET(table->private,
275 smp_processor_id());
276 e = get_entry(table_base, table->private->hook_entry[hook]);
277 back = get_entry(table_base, table->private->underflow[hook]);
278
279 arp = (*pskb)->nh.arph;
280 do {
281 if (arp_packet_match(arp, (*pskb)->dev, indev, outdev, &e->arp)) {
282 struct arpt_entry_target *t;
283 int hdr_len;
284
285 hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
286 (2 * (*pskb)->dev->addr_len);
287 ADD_COUNTER(e->counters, hdr_len, 1);
288
289 t = arpt_get_target(e);
290
291 /* Standard target? */
292 if (!t->u.kernel.target->target) {
293 int v;
294
295 v = ((struct arpt_standard_target *)t)->verdict;
296 if (v < 0) {
297 /* Pop from stack? */
298 if (v != ARPT_RETURN) {
299 verdict = (unsigned)(-v) - 1;
300 break;
301 }
302 e = back;
303 back = get_entry(table_base,
304 back->comefrom);
305 continue;
306 }
307 if (table_base + v
308 != (void *)e + e->next_offset) {
309 /* Save old back ptr in next entry */
310 struct arpt_entry *next
311 = (void *)e + e->next_offset;
312 next->comefrom =
313 (void *)back - table_base;
314
315 /* set back pointer to next entry */
316 back = next;
317 }
318
319 e = get_entry(table_base, v);
320 } else {
321 /* Targets which reenter must return
322 * abs. verdicts
323 */
324 verdict = t->u.kernel.target->target(pskb,
325 hook,
326 in, out,
327 t->data,
328 userdata);
329
330 /* Target might have changed stuff. */
331 arp = (*pskb)->nh.arph;
332
333 if (verdict == ARPT_CONTINUE)
334 e = (void *)e + e->next_offset;
335 else
336 /* Verdict */
337 break;
338 }
339 } else {
340 e = (void *)e + e->next_offset;
341 }
342 } while (!hotdrop);
343 read_unlock_bh(&table->lock);
344
345 if (hotdrop)
346 return NF_DROP;
347 else
348 return verdict;
349}
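
arpt_do_table() does not walk an array of fixed-size rules: the table is one flat blob in which every entry carries its own next_offset, and a standard target's verdict is either a non-negative byte offset to jump to or a negative value encoding RETURN or a final NF_* verdict. A much simplified userspace sketch of walking such a blob (fixed-size toy entries, no jump or RETURN handling):

#include <stdio.h>

/* Toy fixed-size entry standing in for a variable-size arpt_entry. */
struct toy_entry {
    unsigned int next_offset;   /* bytes from this entry to the next one */
    int verdict;                /* <0: final verdict, >=0: keep walking  */
};

static struct toy_entry *get_entry(void *base, unsigned int offset)
{
    return (struct toy_entry *)((char *)base + offset);
}

int main(void)
{
    struct toy_entry rules[3] = {
        { sizeof(struct toy_entry), 0 },    /* no match: fall through */
        { sizeof(struct toy_entry), 0 },    /* no match: fall through */
        { sizeof(struct toy_entry), -2 },   /* final verdict          */
    };
    unsigned int off = 0;
    struct toy_entry *e = get_entry(rules, 0);

    while (e->verdict >= 0) {               /* walk the blob by offsets */
        off += e->next_offset;
        e = get_entry(rules, off);
    }
    printf("verdict %d at offset %u\n", e->verdict, off);
    return 0;
}
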
350
351static inline void *find_inlist_lock_noload(struct list_head *head,
352 const char *name,
353 int *error,
354 struct semaphore *mutex)
355{
356 void *ret;
357
358 *error = down_interruptible(mutex);
359 if (*error != 0)
360 return NULL;
361
362 ret = list_named_find(head, name);
363 if (!ret) {
364 *error = -ENOENT;
365 up(mutex);
366 }
367 return ret;
368}
369
370#ifndef CONFIG_KMOD
371#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m))
372#else
373static void *
374find_inlist_lock(struct list_head *head,
375 const char *name,
376 const char *prefix,
377 int *error,
378 struct semaphore *mutex)
379{
380 void *ret;
381
382 ret = find_inlist_lock_noload(head, name, error, mutex);
383 if (!ret) {
384 duprintf("find_inlist: loading `%s%s'.\n", prefix, name);
385 request_module("%s%s", prefix, name);
386 ret = find_inlist_lock_noload(head, name, error, mutex);
387 }
388
389 return ret;
390}
391#endif
392
393static inline struct arpt_table *arpt_find_table_lock(const char *name, int *error, struct semaphore *mutex)
394{
395 return find_inlist_lock(&arpt_tables, name, "arptable_", error, mutex);
396}
397
398static struct arpt_target *arpt_find_target_lock(const char *name, int *error, struct semaphore *mutex)
399{
400 return find_inlist_lock(&arpt_target, name, "arpt_", error, mutex);
401}
402
403/* All zeroes == unconditional rule. */
404static inline int unconditional(const struct arpt_arp *arp)
405{
406 unsigned int i;
407
408 for (i = 0; i < sizeof(*arp)/sizeof(__u32); i++)
409 if (((__u32 *)arp)[i])
410 return 0;
411
412 return 1;
413}
414
415/* Figures out from what hook each rule can be called: returns 0 if
416 * there are loops. Puts hook bitmask in comefrom.
417 */
418static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks)
419{
420 unsigned int hook;
421
422 /* No recursion; use packet counter to save back ptrs (reset
423 * to 0 as we leave), and comefrom to save source hook bitmask.
424 */
425 for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
426 unsigned int pos = newinfo->hook_entry[hook];
427 struct arpt_entry *e
428 = (struct arpt_entry *)(newinfo->entries + pos);
429
430 if (!(valid_hooks & (1 << hook)))
431 continue;
432
433 /* Set initial back pointer. */
434 e->counters.pcnt = pos;
435
436 for (;;) {
437 struct arpt_standard_target *t
438 = (void *)arpt_get_target(e);
439
440 if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) {
441 printk("arptables: loop hook %u pos %u %08X.\n",
442 hook, pos, e->comefrom);
443 return 0;
444 }
445 e->comefrom
446 |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
447
448 /* Unconditional return/END. */
449 if (e->target_offset == sizeof(struct arpt_entry)
450 && (strcmp(t->target.u.user.name,
451 ARPT_STANDARD_TARGET) == 0)
452 && t->verdict < 0
453 && unconditional(&e->arp)) {
454 unsigned int oldpos, size;
455
456 /* Return: backtrack through the last
457 * big jump.
458 */
459 do {
460 e->comefrom ^= (1<<NF_ARP_NUMHOOKS);
461 oldpos = pos;
462 pos = e->counters.pcnt;
463 e->counters.pcnt = 0;
464
465 /* We're at the start. */
466 if (pos == oldpos)
467 goto next;
468
469 e = (struct arpt_entry *)
470 (newinfo->entries + pos);
471 } while (oldpos == pos + e->next_offset);
472
473 /* Move along one */
474 size = e->next_offset;
475 e = (struct arpt_entry *)
476 (newinfo->entries + pos + size);
477 e->counters.pcnt = pos;
478 pos += size;
479 } else {
480 int newpos = t->verdict;
481
482 if (strcmp(t->target.u.user.name,
483 ARPT_STANDARD_TARGET) == 0
484 && newpos >= 0) {
485 /* This a jump; chase it. */
486 duprintf("Jump rule %u -> %u\n",
487 pos, newpos);
488 } else {
489 /* ... this is a fallthru */
490 newpos = pos + e->next_offset;
491 }
492 e = (struct arpt_entry *)
493 (newinfo->entries + newpos);
494 e->counters.pcnt = pos;
495 pos = newpos;
496 }
497 }
498 next:
499 duprintf("Finished chain %u\n", hook);
500 }
501 return 1;
502}
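
mark_source_chains() checks, for every hook, that following jumps and fallthroughs from the hook entry can never revisit a rule, and refuses the table otherwise; to avoid recursion it parks back pointers in the not-yet-used packet counters. A simplified userspace illustration of the question it answers (a plain visited array instead of the kernel's backtracking):

#include <stdio.h>

#define NRULES 4

/* Each toy rule either jumps to another rule (>= 0) or returns (-1). */
static int jump_to[NRULES] = { 2, -1, 3, 0 };   /* 0 -> 2 -> 3 -> 0: a loop */

static int chain_loops(int start)
{
    int seen[NRULES] = { 0 };
    int pos = start;

    while (pos >= 0) {
        if (seen[pos])
            return 1;       /* revisited a rule: the chain loops */
        seen[pos] = 1;
        pos = jump_to[pos];
    }
    return 0;
}

int main(void)
{
    printf("loop from rule 0: %d\n", chain_loops(0));
    printf("loop from rule 1: %d\n", chain_loops(1));
    return 0;
}
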
503
504static inline int standard_check(const struct arpt_entry_target *t,
505 unsigned int max_offset)
506{
507 struct arpt_standard_target *targ = (void *)t;
508
509 /* Check standard info. */
510 if (t->u.target_size
511 != ARPT_ALIGN(sizeof(struct arpt_standard_target))) {
512 duprintf("arpt_standard_check: target size %u != %Zu\n",
513 t->u.target_size,
514 ARPT_ALIGN(sizeof(struct arpt_standard_target)));
515 return 0;
516 }
517
518 if (targ->verdict >= 0
519 && targ->verdict > max_offset - sizeof(struct arpt_entry)) {
520 duprintf("arpt_standard_check: bad verdict (%i)\n",
521 targ->verdict);
522 return 0;
523 }
524
525 if (targ->verdict < -NF_MAX_VERDICT - 1) {
526 duprintf("arpt_standard_check: bad negative verdict (%i)\n",
527 targ->verdict);
528 return 0;
529 }
530 return 1;
531}
532
533static struct arpt_target arpt_standard_target;
534
535static inline int check_entry(struct arpt_entry *e, const char *name, unsigned int size,
536 unsigned int *i)
537{
538 struct arpt_entry_target *t;
539 struct arpt_target *target;
540 int ret;
541
542 if (!arp_checkentry(&e->arp)) {
543 duprintf("arp_tables: arp check failed %p %s.\n", e, name);
544 return -EINVAL;
545 }
546
547 t = arpt_get_target(e);
548 target = arpt_find_target_lock(t->u.user.name, &ret, &arpt_mutex);
549 if (!target) {
550 duprintf("check_entry: `%s' not found\n", t->u.user.name);
551 goto out;
552 }
553 if (!try_module_get((target->me))) {
554 ret = -ENOENT;
555 goto out_unlock;
556 }
557 t->u.kernel.target = target;
558 up(&arpt_mutex);
559
560 if (t->u.kernel.target == &arpt_standard_target) {
561 if (!standard_check(t, size)) {
562 ret = -EINVAL;
563 goto out;
564 }
565 } else if (t->u.kernel.target->checkentry
566 && !t->u.kernel.target->checkentry(name, e, t->data,
567 t->u.target_size
568 - sizeof(*t),
569 e->comefrom)) {
570 module_put(t->u.kernel.target->me);
571 duprintf("arp_tables: check failed for `%s'.\n",
572 t->u.kernel.target->name);
573 ret = -EINVAL;
574 goto out;
575 }
576
577 (*i)++;
578 return 0;
579
580out_unlock:
581 up(&arpt_mutex);
582out:
583 return ret;
584}
585
586static inline int check_entry_size_and_hooks(struct arpt_entry *e,
587 struct arpt_table_info *newinfo,
588 unsigned char *base,
589 unsigned char *limit,
590 const unsigned int *hook_entries,
591 const unsigned int *underflows,
592 unsigned int *i)
593{
594 unsigned int h;
595
596 if ((unsigned long)e % __alignof__(struct arpt_entry) != 0
597 || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) {
598 duprintf("Bad offset %p\n", e);
599 return -EINVAL;
600 }
601
602 if (e->next_offset
603 < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) {
604 duprintf("checking: element %p size %u\n",
605 e, e->next_offset);
606 return -EINVAL;
607 }
608
609 /* Check hooks & underflows */
610 for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
611 if ((unsigned char *)e - base == hook_entries[h])
612 newinfo->hook_entry[h] = hook_entries[h];
613 if ((unsigned char *)e - base == underflows[h])
614 newinfo->underflow[h] = underflows[h];
615 }
616
617 /* FIXME: underflows must be unconditional, standard verdicts
618 < 0 (not ARPT_RETURN). --RR */
619
620 /* Clear counters and comefrom */
621 e->counters = ((struct arpt_counters) { 0, 0 });
622 e->comefrom = 0;
623
624 (*i)++;
625 return 0;
626}
627
628static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
629{
630 struct arpt_entry_target *t;
631
632 if (i && (*i)-- == 0)
633 return 1;
634
635 t = arpt_get_target(e);
636 if (t->u.kernel.target->destroy)
637 t->u.kernel.target->destroy(t->data,
638 t->u.target_size - sizeof(*t));
639 module_put(t->u.kernel.target->me);
640 return 0;
641}
642
643/* Checks and translates the user-supplied table segment (held in
644 * newinfo).
645 */
646static int translate_table(const char *name,
647 unsigned int valid_hooks,
648 struct arpt_table_info *newinfo,
649 unsigned int size,
650 unsigned int number,
651 const unsigned int *hook_entries,
652 const unsigned int *underflows)
653{
654 unsigned int i;
655 int ret;
656
657 newinfo->size = size;
658 newinfo->number = number;
659
660 /* Init all hooks to impossible value. */
661 for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
662 newinfo->hook_entry[i] = 0xFFFFFFFF;
663 newinfo->underflow[i] = 0xFFFFFFFF;
664 }
665
666 duprintf("translate_table: size %u\n", newinfo->size);
667 i = 0;
668
669 /* Walk through entries, checking offsets. */
670 ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
671 check_entry_size_and_hooks,
672 newinfo,
673 newinfo->entries,
674 newinfo->entries + size,
675 hook_entries, underflows, &i);
676 duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
677 if (ret != 0)
678 return ret;
679
680 if (i != number) {
681 duprintf("translate_table: %u not %u entries\n",
682 i, number);
683 return -EINVAL;
684 }
685
686 /* Check hooks all assigned */
687 for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
688 /* Only hooks which are valid */
689 if (!(valid_hooks & (1 << i)))
690 continue;
691 if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
692 duprintf("Invalid hook entry %u %u\n",
693 i, hook_entries[i]);
694 return -EINVAL;
695 }
696 if (newinfo->underflow[i] == 0xFFFFFFFF) {
697 duprintf("Invalid underflow %u %u\n",
698 i, underflows[i]);
699 return -EINVAL;
700 }
701 }
702
703 if (!mark_source_chains(newinfo, valid_hooks)) {
704 duprintf("Looping hook\n");
705 return -ELOOP;
706 }
707
708 /* Finally, each sanity check must pass */
709 i = 0;
710 ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
711 check_entry, name, size, &i);
712
713 if (ret != 0) {
714 ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
715 cleanup_entry, &i);
716 return ret;
717 }
718
719 /* And one copy for every other CPU */
720 for (i = 1; i < num_possible_cpus(); i++) {
721 memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
722 newinfo->entries,
723 SMP_ALIGN(newinfo->size));
724 }
725
726 return ret;
727}
728
729static struct arpt_table_info *replace_table(struct arpt_table *table,
730 unsigned int num_counters,
731 struct arpt_table_info *newinfo,
732 int *error)
733{
734 struct arpt_table_info *oldinfo;
735
736 /* Do the substitution. */
737 write_lock_bh(&table->lock);
738 /* Check inside lock: is the old number correct? */
739 if (num_counters != table->private->number) {
740 duprintf("num_counters != table->private->number (%u/%u)\n",
741 num_counters, table->private->number);
742 write_unlock_bh(&table->lock);
743 *error = -EAGAIN;
744 return NULL;
745 }
746 oldinfo = table->private;
747 table->private = newinfo;
748 newinfo->initial_entries = oldinfo->initial_entries;
749 write_unlock_bh(&table->lock);
750
751 return oldinfo;
752}
753
754/* Gets counters. */
755static inline int add_entry_to_counter(const struct arpt_entry *e,
756 struct arpt_counters total[],
757 unsigned int *i)
758{
759 ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
760
761 (*i)++;
762 return 0;
763}
764
765static void get_counters(const struct arpt_table_info *t,
766 struct arpt_counters counters[])
767{
768 unsigned int cpu;
769 unsigned int i;
770
771 for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
772 i = 0;
773 ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
774 t->size,
775 add_entry_to_counter,
776 counters,
777 &i);
778 }
779}
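
translate_table() keeps one copy of the rule blob per possible CPU, so each CPU bumps the counters in its own copy under the read lock, and get_counters() totals a rule by summing that slot across all copies. A small userspace sketch of the same bookkeeping:

#include <stdio.h>

#define NCPUS  4
#define NRULES 2

/* One counter pair per rule, replicated per CPU as translate_table() does. */
struct counters { unsigned long bcnt, pcnt; };
static struct counters percpu[NCPUS][NRULES];

static void hit(int cpu, int rule, unsigned long bytes)
{
    percpu[cpu][rule].bcnt += bytes;    /* ADD_COUNTER() on this CPU's copy */
    percpu[cpu][rule].pcnt += 1;
}

static struct counters total(int rule)
{
    struct counters sum = { 0, 0 };
    for (int cpu = 0; cpu < NCPUS; cpu++) {
        sum.bcnt += percpu[cpu][rule].bcnt;
        sum.pcnt += percpu[cpu][rule].pcnt;
    }
    return sum;
}

int main(void)
{
    hit(0, 1, 60);
    hit(3, 1, 40);
    struct counters t = total(1);
    printf("rule 1: %lu packets, %lu bytes\n", t.pcnt, t.bcnt);
    return 0;
}
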
780
781static int copy_entries_to_user(unsigned int total_size,
782 struct arpt_table *table,
783 void __user *userptr)
784{
785 unsigned int off, num, countersize;
786 struct arpt_entry *e;
787 struct arpt_counters *counters;
788 int ret = 0;
789
790 /* We need atomic snapshot of counters: rest doesn't change
791 * (other than comefrom, which userspace doesn't care
792 * about).
793 */
794 countersize = sizeof(struct arpt_counters) * table->private->number;
795 counters = vmalloc(countersize);
796
797 if (counters == NULL)
798 return -ENOMEM;
799
800 /* First, sum counters... */
801 memset(counters, 0, countersize);
802 write_lock_bh(&table->lock);
803 get_counters(table->private, counters);
804 write_unlock_bh(&table->lock);
805
806 /* ... then copy entire thing from CPU 0... */
807 if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
808 ret = -EFAULT;
809 goto free_counters;
810 }
811
812 /* FIXME: use iterator macros --RR */
813 /* ... then go back and fix counters and names */
814 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
815 struct arpt_entry_target *t;
816
817 e = (struct arpt_entry *)(table->private->entries + off);
818 if (copy_to_user(userptr + off
819 + offsetof(struct arpt_entry, counters),
820 &counters[num],
821 sizeof(counters[num])) != 0) {
822 ret = -EFAULT;
823 goto free_counters;
824 }
825
826 t = arpt_get_target(e);
827 if (copy_to_user(userptr + off + e->target_offset
828 + offsetof(struct arpt_entry_target,
829 u.user.name),
830 t->u.kernel.target->name,
831 strlen(t->u.kernel.target->name)+1) != 0) {
832 ret = -EFAULT;
833 goto free_counters;
834 }
835 }
836
837 free_counters:
838 vfree(counters);
839 return ret;
840}
841
842static int get_entries(const struct arpt_get_entries *entries,
843 struct arpt_get_entries __user *uptr)
844{
845 int ret;
846 struct arpt_table *t;
847
848 t = arpt_find_table_lock(entries->name, &ret, &arpt_mutex);
849 if (t) {
850 duprintf("t->private->number = %u\n",
851 t->private->number);
852 if (entries->size == t->private->size)
853 ret = copy_entries_to_user(t->private->size,
854 t, uptr->entrytable);
855 else {
856 duprintf("get_entries: I've got %u not %u!\n",
857 t->private->size,
858 entries->size);
859 ret = -EINVAL;
860 }
861 up(&arpt_mutex);
862 } else
863 duprintf("get_entries: Can't find %s!\n",
864 entries->name);
865
866 return ret;
867}
868
869static int do_replace(void __user *user, unsigned int len)
870{
871 int ret;
872 struct arpt_replace tmp;
873 struct arpt_table *t;
874 struct arpt_table_info *newinfo, *oldinfo;
875 struct arpt_counters *counters;
876
877 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
878 return -EFAULT;
879
880 /* Hack: Causes ipchains to give correct error msg --RR */
881 if (len != sizeof(tmp) + tmp.size)
882 return -ENOPROTOOPT;
883
884 /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
885 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
886 return -ENOMEM;
887
888 newinfo = vmalloc(sizeof(struct arpt_table_info)
889 + SMP_ALIGN(tmp.size) * num_possible_cpus());
890 if (!newinfo)
891 return -ENOMEM;
892
893 if (copy_from_user(newinfo->entries, user + sizeof(tmp),
894 tmp.size) != 0) {
895 ret = -EFAULT;
896 goto free_newinfo;
897 }
898
899 counters = vmalloc(tmp.num_counters * sizeof(struct arpt_counters));
900 if (!counters) {
901 ret = -ENOMEM;
902 goto free_newinfo;
903 }
904 memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters));
905
906 ret = translate_table(tmp.name, tmp.valid_hooks,
907 newinfo, tmp.size, tmp.num_entries,
908 tmp.hook_entry, tmp.underflow);
909 if (ret != 0)
910 goto free_newinfo_counters;
911
912 duprintf("arp_tables: Translated table\n");
913
914 t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex);
915 if (!t)
916 goto free_newinfo_counters_untrans;
917
918 /* You lied! */
919 if (tmp.valid_hooks != t->valid_hooks) {
920 duprintf("Valid hook crap: %08X vs %08X\n",
921 tmp.valid_hooks, t->valid_hooks);
922 ret = -EINVAL;
923 goto free_newinfo_counters_untrans_unlock;
924 }
925
926 /* Get a reference in advance, we're not allowed to fail later */
927 if (!try_module_get(t->me)) {
928 ret = -EBUSY;
929 goto free_newinfo_counters_untrans_unlock;
930 }
931
932 oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret);
933 if (!oldinfo)
934 goto put_module;
935
936 /* Update module usage count based on number of rules */
937 duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
938 oldinfo->number, oldinfo->initial_entries, newinfo->number);
939 if ((oldinfo->number > oldinfo->initial_entries) ||
940 (newinfo->number <= oldinfo->initial_entries))
941 module_put(t->me);
942 if ((oldinfo->number > oldinfo->initial_entries) &&
943 (newinfo->number <= oldinfo->initial_entries))
944 module_put(t->me);
945
946 /* Get the old counters. */
947 get_counters(oldinfo, counters);
948 /* Decrease module usage counts and free resource */
949 ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
950 vfree(oldinfo);
951 if (copy_to_user(tmp.counters, counters,
952 sizeof(struct arpt_counters) * tmp.num_counters) != 0)
953 ret = -EFAULT;
954 vfree(counters);
955 up(&arpt_mutex);
956 return ret;
957
958 put_module:
959 module_put(t->me);
960 free_newinfo_counters_untrans_unlock:
961 up(&arpt_mutex);
962 free_newinfo_counters_untrans:
963 ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL);
964 free_newinfo_counters:
965 vfree(counters);
966 free_newinfo:
967 vfree(newinfo);
968 return ret;
969}
970
971/* We're lazy, and add to the first CPU; overflow works its fey magic
972 * and everything is OK.
973 */
974static inline int add_counter_to_entry(struct arpt_entry *e,
975 const struct arpt_counters addme[],
976 unsigned int *i)
977{
978
979 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
980
981 (*i)++;
982 return 0;
983}
984
985static int do_add_counters(void __user *user, unsigned int len)
986{
987 unsigned int i;
988 struct arpt_counters_info tmp, *paddc;
989 struct arpt_table *t;
990 int ret;
991
992 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
993 return -EFAULT;
994
995 if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct arpt_counters))
996 return -EINVAL;
997
998 paddc = vmalloc(len);
999 if (!paddc)
1000 return -ENOMEM;
1001
1002 if (copy_from_user(paddc, user, len) != 0) {
1003 ret = -EFAULT;
1004 goto free;
1005 }
1006
1007 t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex);
1008 if (!t)
1009 goto free;
1010
1011 write_lock_bh(&t->lock);
1012 if (t->private->number != paddc->num_counters) {
1013 ret = -EINVAL;
1014 goto unlock_up_free;
1015 }
1016
1017 i = 0;
1018 ARPT_ENTRY_ITERATE(t->private->entries,
1019 t->private->size,
1020 add_counter_to_entry,
1021 paddc->counters,
1022 &i);
1023 unlock_up_free:
1024 write_unlock_bh(&t->lock);
1025 up(&arpt_mutex);
1026 free:
1027 vfree(paddc);
1028
1029 return ret;
1030}
1031
1032static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1033{
1034 int ret;
1035
1036 if (!capable(CAP_NET_ADMIN))
1037 return -EPERM;
1038
1039 switch (cmd) {
1040 case ARPT_SO_SET_REPLACE:
1041 ret = do_replace(user, len);
1042 break;
1043
1044 case ARPT_SO_SET_ADD_COUNTERS:
1045 ret = do_add_counters(user, len);
1046 break;
1047
1048 default:
1049 duprintf("do_arpt_set_ctl: unknown request %i\n", cmd);
1050 ret = -EINVAL;
1051 }
1052
1053 return ret;
1054}
1055
1056static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
1057{
1058 int ret;
1059
1060 if (!capable(CAP_NET_ADMIN))
1061 return -EPERM;
1062
1063 switch (cmd) {
1064 case ARPT_SO_GET_INFO: {
1065 char name[ARPT_TABLE_MAXNAMELEN];
1066 struct arpt_table *t;
1067
1068 if (*len != sizeof(struct arpt_getinfo)) {
1069 duprintf("length %u != %Zu\n", *len,
1070 sizeof(struct arpt_getinfo));
1071 ret = -EINVAL;
1072 break;
1073 }
1074
1075 if (copy_from_user(name, user, sizeof(name)) != 0) {
1076 ret = -EFAULT;
1077 break;
1078 }
1079 name[ARPT_TABLE_MAXNAMELEN-1] = '\0';
1080 t = arpt_find_table_lock(name, &ret, &arpt_mutex);
1081 if (t) {
1082 struct arpt_getinfo info;
1083
1084 info.valid_hooks = t->valid_hooks;
1085 memcpy(info.hook_entry, t->private->hook_entry,
1086 sizeof(info.hook_entry));
1087 memcpy(info.underflow, t->private->underflow,
1088 sizeof(info.underflow));
1089 info.num_entries = t->private->number;
1090 info.size = t->private->size;
1091 strcpy(info.name, name);
1092
1093 if (copy_to_user(user, &info, *len) != 0)
1094 ret = -EFAULT;
1095 else
1096 ret = 0;
1097
1098 up(&arpt_mutex);
1099 }
1100 }
1101 break;
1102
1103 case ARPT_SO_GET_ENTRIES: {
1104 struct arpt_get_entries get;
1105
1106 if (*len < sizeof(get)) {
1107 duprintf("get_entries: %u < %Zu\n", *len, sizeof(get));
1108 ret = -EINVAL;
1109 } else if (copy_from_user(&get, user, sizeof(get)) != 0) {
1110 ret = -EFAULT;
1111 } else if (*len != sizeof(struct arpt_get_entries) + get.size) {
1112 duprintf("get_entries: %u != %Zu\n", *len,
1113 sizeof(struct arpt_get_entries) + get.size);
1114 ret = -EINVAL;
1115 } else
1116 ret = get_entries(&get, user);
1117 break;
1118 }
1119
1120 default:
1121 duprintf("do_arpt_get_ctl: unknown request %i\n", cmd);
1122 ret = -EINVAL;
1123 }
1124
1125 return ret;
1126}
1127
1128/* Registration hooks for targets. */
1129int arpt_register_target(struct arpt_target *target)
1130{
1131 int ret;
1132
1133 ret = down_interruptible(&arpt_mutex);
1134 if (ret != 0)
1135 return ret;
1136
1137 if (!list_named_insert(&arpt_target, target)) {
1138 duprintf("arpt_register_target: `%s' already in list!\n",
1139 target->name);
1140 ret = -EINVAL;
1141 }
1142 up(&arpt_mutex);
1143 return ret;
1144}
1145
1146void arpt_unregister_target(struct arpt_target *target)
1147{
1148 down(&arpt_mutex);
1149 LIST_DELETE(&arpt_target, target);
1150 up(&arpt_mutex);
1151}
1152
1153int arpt_register_table(struct arpt_table *table,
1154 const struct arpt_replace *repl)
1155{
1156 int ret;
1157 struct arpt_table_info *newinfo;
1158 static struct arpt_table_info bootstrap
1159 = { 0, 0, 0, { 0 }, { 0 }, { } };
1160
1161 newinfo = vmalloc(sizeof(struct arpt_table_info)
1162 + SMP_ALIGN(repl->size) * num_possible_cpus());
1163 if (!newinfo) {
1164 ret = -ENOMEM;
1165 return ret;
1166 }
1167 memcpy(newinfo->entries, repl->entries, repl->size);
1168
1169 ret = translate_table(table->name, table->valid_hooks,
1170 newinfo, repl->size,
1171 repl->num_entries,
1172 repl->hook_entry,
1173 repl->underflow);
1174 duprintf("arpt_register_table: translate table gives %d\n", ret);
1175 if (ret != 0) {
1176 vfree(newinfo);
1177 return ret;
1178 }
1179
1180 ret = down_interruptible(&arpt_mutex);
1181 if (ret != 0) {
1182 vfree(newinfo);
1183 return ret;
1184 }
1185
1186 /* Don't autoload: we'd eat our tail... */
1187 if (list_named_find(&arpt_tables, table->name)) {
1188 ret = -EEXIST;
1189 goto free_unlock;
1190 }
1191
1192 /* Simplifies replace_table code. */
1193 table->private = &bootstrap;
1194 if (!replace_table(table, 0, newinfo, &ret))
1195 goto free_unlock;
1196
1197 duprintf("table->private->number = %u\n",
1198 table->private->number);
1199
1200 /* save number of initial entries */
1201 table->private->initial_entries = table->private->number;
1202
1203 rwlock_init(&table->lock);
1204 list_prepend(&arpt_tables, table);
1205
1206 unlock:
1207 up(&arpt_mutex);
1208 return ret;
1209
1210 free_unlock:
1211 vfree(newinfo);
1212 goto unlock;
1213}
1214
1215void arpt_unregister_table(struct arpt_table *table)
1216{
1217 down(&arpt_mutex);
1218 LIST_DELETE(&arpt_tables, table);
1219 up(&arpt_mutex);
1220
1221 /* Decrease module usage counts and free resources */
1222 ARPT_ENTRY_ITERATE(table->private->entries, table->private->size,
1223 cleanup_entry, NULL);
1224 vfree(table->private);
1225}
1226
1227/* The built-in targets: standard (NULL) and error. */
1228static struct arpt_target arpt_standard_target = {
1229 .name = ARPT_STANDARD_TARGET,
1230};
1231
1232static struct arpt_target arpt_error_target = {
1233 .name = ARPT_ERROR_TARGET,
1234 .target = arpt_error,
1235};
1236
1237static struct nf_sockopt_ops arpt_sockopts = {
1238 .pf = PF_INET,
1239 .set_optmin = ARPT_BASE_CTL,
1240 .set_optmax = ARPT_SO_SET_MAX+1,
1241 .set = do_arpt_set_ctl,
1242 .get_optmin = ARPT_BASE_CTL,
1243 .get_optmax = ARPT_SO_GET_MAX+1,
1244 .get = do_arpt_get_ctl,
1245};
1246
1247#ifdef CONFIG_PROC_FS
1248static inline int print_name(const struct arpt_table *t,
1249 off_t start_offset, char *buffer, int length,
1250 off_t *pos, unsigned int *count)
1251{
1252 if ((*count)++ >= start_offset) {
1253 unsigned int namelen;
1254
1255 namelen = sprintf(buffer + *pos, "%s\n", t->name);
1256 if (*pos + namelen > length) {
1257 /* Stop iterating */
1258 return 1;
1259 }
1260 *pos += namelen;
1261 }
1262 return 0;
1263}
1264
1265static int arpt_get_tables(char *buffer, char **start, off_t offset, int length)
1266{
1267 off_t pos = 0;
1268 unsigned int count = 0;
1269
1270 if (down_interruptible(&arpt_mutex) != 0)
1271 return 0;
1272
1273 LIST_FIND(&arpt_tables, print_name, struct arpt_table *,
1274 offset, buffer, length, &pos, &count);
1275
1276 up(&arpt_mutex);
1277
1278 /* `start' hack - see fs/proc/generic.c line ~105 */
1279 *start=(char *)((unsigned long)count-offset);
1280 return pos;
1281}
1282#endif /*CONFIG_PROC_FS*/
1283
1284static int __init init(void)
1285{
1286 int ret;
1287
1288 /* No one else will be downing the sem now, so we won't sleep */
1289 down(&arpt_mutex);
1290 list_append(&arpt_target, &arpt_standard_target);
1291 list_append(&arpt_target, &arpt_error_target);
1292 up(&arpt_mutex);
1293
1294 /* Register setsockopt */
1295 ret = nf_register_sockopt(&arpt_sockopts);
1296 if (ret < 0) {
1297 duprintf("Unable to register sockopts.\n");
1298 return ret;
1299 }
1300
1301#ifdef CONFIG_PROC_FS
1302 {
1303 struct proc_dir_entry *proc;
1304
1305 proc = proc_net_create("arp_tables_names", 0, arpt_get_tables);
1306 if (!proc) {
1307 nf_unregister_sockopt(&arpt_sockopts);
1308 return -ENOMEM;
1309 }
1310 proc->owner = THIS_MODULE;
1311 }
1312#endif
1313
1314 printk("arp_tables: (C) 2002 David S. Miller\n");
1315 return 0;
1316}
1317
1318static void __exit fini(void)
1319{
1320 nf_unregister_sockopt(&arpt_sockopts);
1321#ifdef CONFIG_PROC_FS
1322 proc_net_remove("arp_tables_names");
1323#endif
1324}
1325
1326EXPORT_SYMBOL(arpt_register_table);
1327EXPORT_SYMBOL(arpt_unregister_table);
1328EXPORT_SYMBOL(arpt_do_table);
1329EXPORT_SYMBOL(arpt_register_target);
1330EXPORT_SYMBOL(arpt_unregister_target);
1331
1332module_init(init);
1333module_exit(fini);
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
new file mode 100644
index 000000000000..3e592ec86482
--- /dev/null
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -0,0 +1,104 @@
1/* module that allows mangling of the arp payload */
2#include <linux/module.h>
3#include <linux/netfilter_arp/arpt_mangle.h>
4#include <net/sock.h>
5
6MODULE_LICENSE("GPL");
7MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
8MODULE_DESCRIPTION("arptables arp payload mangle target");
9
10static unsigned int
11target(struct sk_buff **pskb, unsigned int hooknum, const struct net_device *in,
12 const struct net_device *out, const void *targinfo, void *userinfo)
13{
14 const struct arpt_mangle *mangle = targinfo;
15 struct arphdr *arp;
16 unsigned char *arpptr;
17 int pln, hln;
18
19 if (skb_shared(*pskb) || skb_cloned(*pskb)) {
20 struct sk_buff *nskb;
21
22 nskb = skb_copy(*pskb, GFP_ATOMIC);
23 if (!nskb)
24 return NF_DROP;
25 if ((*pskb)->sk)
26 skb_set_owner_w(nskb, (*pskb)->sk);
27 kfree_skb(*pskb);
28 *pskb = nskb;
29 }
30
31 arp = (*pskb)->nh.arph;
32 arpptr = (*pskb)->nh.raw + sizeof(*arp);
33 pln = arp->ar_pln;
34 hln = arp->ar_hln;
35 /* We assume that pln and hln were checked in the match */
36 if (mangle->flags & ARPT_MANGLE_SDEV) {
37 if (ARPT_DEV_ADDR_LEN_MAX < hln ||
38 (arpptr + hln > (**pskb).tail))
39 return NF_DROP;
40 memcpy(arpptr, mangle->src_devaddr, hln);
41 }
42 arpptr += hln;
43 if (mangle->flags & ARPT_MANGLE_SIP) {
44 if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
45 (arpptr + pln > (**pskb).tail))
46 return NF_DROP;
47 memcpy(arpptr, &mangle->u_s.src_ip, pln);
48 }
49 arpptr += pln;
50 if (mangle->flags & ARPT_MANGLE_TDEV) {
51 if (ARPT_DEV_ADDR_LEN_MAX < hln ||
52 (arpptr + hln > (**pskb).tail))
53 return NF_DROP;
54 memcpy(arpptr, mangle->tgt_devaddr, hln);
55 }
56 arpptr += hln;
57 if (mangle->flags & ARPT_MANGLE_TIP) {
58 if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
59 (arpptr + pln > (**pskb).tail))
60 return NF_DROP;
61 memcpy(arpptr, &mangle->u_t.tgt_ip, pln);
62 }
63 return mangle->target;
64}
65
66static int
67checkentry(const char *tablename, const struct arpt_entry *e, void *targinfo,
68 unsigned int targinfosize, unsigned int hook_mask)
69{
70 const struct arpt_mangle *mangle = targinfo;
71
72 if (mangle->flags & ~ARPT_MANGLE_MASK ||
73 !(mangle->flags & ARPT_MANGLE_MASK))
74 return 0;
75
76 if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
77 mangle->target != ARPT_CONTINUE)
78 return 0;
79 return 1;
80}
81
82static struct arpt_target arpt_mangle_reg
83= {
84 .name = "mangle",
85 .target = target,
86 .checkentry = checkentry,
87 .me = THIS_MODULE,
88};
89
90static int __init init(void)
91{
92 if (arpt_register_target(&arpt_mangle_reg))
93 return -EINVAL;
94
95 return 0;
96}
97
98static void __exit fini(void)
99{
100 arpt_unregister_target(&arpt_mangle_reg);
101}
102
103module_init(init);
104module_exit(fini);
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
new file mode 100644
index 000000000000..0d759f5a4ef0
--- /dev/null
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -0,0 +1,214 @@
1/*
2 * Filtering ARP tables module.
3 *
4 * Copyright (C) 2002 David S. Miller (davem@redhat.com)
5 *
6 */
7
8#include <linux/module.h>
9#include <linux/netfilter_arp/arp_tables.h>
10
11MODULE_LICENSE("GPL");
12MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
13MODULE_DESCRIPTION("arptables filter table");
14
15#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
16 (1 << NF_ARP_FORWARD))
17
18/* Standard entry. */
19struct arpt_standard
20{
21 struct arpt_entry entry;
22 struct arpt_standard_target target;
23};
24
25struct arpt_error_target
26{
27 struct arpt_entry_target target;
28 char errorname[ARPT_FUNCTION_MAXNAMELEN];
29};
30
31struct arpt_error
32{
33 struct arpt_entry entry;
34 struct arpt_error_target target;
35};
36
37static struct
38{
39 struct arpt_replace repl;
40 struct arpt_standard entries[3];
41 struct arpt_error term;
42} initial_table __initdata
43= { { "filter", FILTER_VALID_HOOKS, 4,
44 sizeof(struct arpt_standard) * 3 + sizeof(struct arpt_error),
45 { [NF_ARP_IN] = 0,
46 [NF_ARP_OUT] = sizeof(struct arpt_standard),
47 [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), },
48 { [NF_ARP_IN] = 0,
49 [NF_ARP_OUT] = sizeof(struct arpt_standard),
50 [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), },
51 0, NULL, { } },
52 {
53 /* ARP_IN */
54 {
55 {
56 {
57 { 0 }, { 0 }, { 0 }, { 0 },
58 0, 0,
59 { { 0, }, { 0, } },
60 { { 0, }, { 0, } },
61 0, 0,
62 0, 0,
63 0, 0,
64 "", "", { 0 }, { 0 },
65 0, 0
66 },
67 sizeof(struct arpt_entry),
68 sizeof(struct arpt_standard),
69 0,
70 { 0, 0 }, { } },
71 { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } },
72 -NF_ACCEPT - 1 }
73 },
74 /* ARP_OUT */
75 {
76 {
77 {
78 { 0 }, { 0 }, { 0 }, { 0 },
79 0, 0,
80 { { 0, }, { 0, } },
81 { { 0, }, { 0, } },
82 0, 0,
83 0, 0,
84 0, 0,
85 "", "", { 0 }, { 0 },
86 0, 0
87 },
88 sizeof(struct arpt_entry),
89 sizeof(struct arpt_standard),
90 0,
91 { 0, 0 }, { } },
92 { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } },
93 -NF_ACCEPT - 1 }
94 },
95 /* ARP_FORWARD */
96 {
97 {
98 {
99 { 0 }, { 0 }, { 0 }, { 0 },
100 0, 0,
101 { { 0, }, { 0, } },
102 { { 0, }, { 0, } },
103 0, 0,
104 0, 0,
105 0, 0,
106 "", "", { 0 }, { 0 },
107 0, 0
108 },
109 sizeof(struct arpt_entry),
110 sizeof(struct arpt_standard),
111 0,
112 { 0, 0 }, { } },
113 { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } },
114 -NF_ACCEPT - 1 }
115 }
116 },
117 /* ERROR */
118 {
119 {
120 {
121 { 0 }, { 0 }, { 0 }, { 0 },
122 0, 0,
123 { { 0, }, { 0, } },
124 { { 0, }, { 0, } },
125 0, 0,
126 0, 0,
127 0, 0,
128 "", "", { 0 }, { 0 },
129 0, 0
130 },
131 sizeof(struct arpt_entry),
132 sizeof(struct arpt_error),
133 0,
134 { 0, 0 }, { } },
135 { { { { ARPT_ALIGN(sizeof(struct arpt_error_target)), ARPT_ERROR_TARGET } },
136 { } },
137 "ERROR"
138 }
139 }
140};
141
142static struct arpt_table packet_filter = {
143 .name = "filter",
144 .valid_hooks = FILTER_VALID_HOOKS,
145 .lock = RW_LOCK_UNLOCKED,
146 .private = NULL,
147 .me = THIS_MODULE,
148};
149
150/* The work comes in here from netfilter.c */
151static unsigned int arpt_hook(unsigned int hook,
152 struct sk_buff **pskb,
153 const struct net_device *in,
154 const struct net_device *out,
155 int (*okfn)(struct sk_buff *))
156{
157 return arpt_do_table(pskb, hook, in, out, &packet_filter, NULL);
158}
159
160static struct nf_hook_ops arpt_ops[] = {
161 {
162 .hook = arpt_hook,
163 .owner = THIS_MODULE,
164 .pf = NF_ARP,
165 .hooknum = NF_ARP_IN,
166 },
167 {
168 .hook = arpt_hook,
169 .owner = THIS_MODULE,
170 .pf = NF_ARP,
171 .hooknum = NF_ARP_OUT,
172 },
173 {
174 .hook = arpt_hook,
175 .owner = THIS_MODULE,
176 .pf = NF_ARP,
177 .hooknum = NF_ARP_FORWARD,
178 },
179};
180
181static int __init init(void)
182{
183 int ret, i;
184
185 /* Register table */
186 ret = arpt_register_table(&packet_filter, &initial_table.repl);
187 if (ret < 0)
188 return ret;
189
190 for (i = 0; i < ARRAY_SIZE(arpt_ops); i++)
191 if ((ret = nf_register_hook(&arpt_ops[i])) < 0)
192 goto cleanup_hooks;
193 return ret;
194
195cleanup_hooks:
196 while (--i >= 0)
197 nf_unregister_hook(&arpt_ops[i]);
198
199 arpt_unregister_table(&packet_filter);
200 return ret;
201}
202
203static void __exit fini(void)
204{
205 unsigned int i;
206
207 for (i = 0; i < ARRAY_SIZE(arpt_ops); i++)
208 nf_unregister_hook(&arpt_ops[i]);
209
210 arpt_unregister_table(&packet_filter);
211}
212
213module_init(init);
214module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
new file mode 100644
index 000000000000..3dbddd062605
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -0,0 +1,167 @@
1/* Amanda extension for IP connection tracking, Version 0.2
2 * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
3 * based on HW's ip_conntrack_irc.c as well as other modules
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version
8 * 2 of the License, or (at your option) any later version.
9 *
10 * Module load syntax:
11 * insmod ip_conntrack_amanda.o [master_timeout=n]
12 *
13 * Where master_timeout is the timeout (in seconds) of the master
14 * connection (port 10080). This defaults to 5 minutes but if
15 * your clients take longer than 5 minutes to do their work
16 * before getting back to the Amanda server, you can increase
17 * this value.
18 *
19 */
20
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/netfilter.h>
24#include <linux/ip.h>
25#include <linux/moduleparam.h>
26#include <net/checksum.h>
27#include <net/udp.h>
28
29#include <linux/netfilter_ipv4/lockhelp.h>
30#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
31#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
32
33static unsigned int master_timeout = 300;
34
35MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
36MODULE_DESCRIPTION("Amanda connection tracking module");
37MODULE_LICENSE("GPL");
38module_param(master_timeout, int, 0600);
39MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
40
41static char *conns[] = { "DATA ", "MESG ", "INDEX " };
42
43/* This is slow, but it's simple. --RR */
44static char amanda_buffer[65536];
45static DECLARE_LOCK(amanda_buffer_lock);
46
47unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
48 enum ip_conntrack_info ctinfo,
49 unsigned int matchoff,
50 unsigned int matchlen,
51 struct ip_conntrack_expect *exp);
52EXPORT_SYMBOL_GPL(ip_nat_amanda_hook);
53
54static int help(struct sk_buff **pskb,
55 struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
56{
57 struct ip_conntrack_expect *exp;
58 char *data, *data_limit, *tmp;
59 unsigned int dataoff, i;
60 u_int16_t port, len;
61 int ret = NF_ACCEPT;
62
63 /* Only look at packets from the Amanda server */
64 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
65 return NF_ACCEPT;
66
67 /* increase the UDP timeout of the master connection as replies from
68 * Amanda clients to the server can be quite delayed */
69 ip_ct_refresh_acct(ct, ctinfo, NULL, master_timeout * HZ);
70
71 /* No data? */
72 dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
73 if (dataoff >= (*pskb)->len) {
74 if (net_ratelimit())
75 printk("amanda_help: skblen = %u\n", (*pskb)->len);
76 return NF_ACCEPT;
77 }
78
79 LOCK_BH(&amanda_buffer_lock);
80 skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff);
81 data = amanda_buffer;
82 data_limit = amanda_buffer + (*pskb)->len - dataoff;
83 *data_limit = '\0';
84
85 /* Search for the CONNECT string */
86 data = strstr(data, "CONNECT ");
87 if (!data)
88 goto out;
89 data += strlen("CONNECT ");
90
91 /* Only search first line. */
92 if ((tmp = strchr(data, '\n')))
93 *tmp = '\0';
94
95 for (i = 0; i < ARRAY_SIZE(conns); i++) {
96 char *match = strstr(data, conns[i]);
97 if (!match)
98 continue;
99 tmp = data = match + strlen(conns[i]);
100 port = simple_strtoul(data, &data, 10);
101 len = data - tmp;
102 if (port == 0 || len > 5)
103 break;
104
105 exp = ip_conntrack_expect_alloc();
106 if (exp == NULL) {
107 ret = NF_DROP;
108 goto out;
109 }
110
111 exp->expectfn = NULL;
112 exp->master = ct;
113
114 exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
115 exp->tuple.src.u.tcp.port = 0;
116 exp->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip;
117 exp->tuple.dst.protonum = IPPROTO_TCP;
118 exp->tuple.dst.u.tcp.port = htons(port);
119
120 exp->mask.src.ip = 0xFFFFFFFF;
121 exp->mask.src.u.tcp.port = 0;
122 exp->mask.dst.ip = 0xFFFFFFFF;
123 exp->mask.dst.protonum = 0xFF;
124 exp->mask.dst.u.tcp.port = 0xFFFF;
125
126 if (ip_nat_amanda_hook)
127 ret = ip_nat_amanda_hook(pskb, ctinfo,
128 tmp - amanda_buffer,
129 len, exp);
130 else if (ip_conntrack_expect_related(exp) != 0) {
131 ip_conntrack_expect_free(exp);
132 ret = NF_DROP;
133 }
134 }
135
136out:
137 UNLOCK_BH(&amanda_buffer_lock);
138 return ret;
139}
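
The help() function above copies the UDP payload into a static buffer, finds "CONNECT ", restricts itself to the first line, and for each of DATA/MESG/INDEX parses the decimal port that follows so an expectation can be registered for the coming TCP connection. A userspace sketch of just that parsing step, on a made-up payload:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static const char *conns[] = { "DATA ", "MESG ", "INDEX " };

int main(void)
{
    /* Hypothetical reply fragment; real Amanda payloads carry more fields. */
    char payload[] = "CONNECT DATA 50100 MESG 50101 INDEX 50102\n";
    char *data = strstr(payload, "CONNECT ");
    char *nl;

    if (!data)
        return 1;
    data += strlen("CONNECT ");
    if ((nl = strchr(data, '\n')))
        *nl = '\0';                       /* only look at the first line */

    for (size_t i = 0; i < sizeof(conns) / sizeof(conns[0]); i++) {
        char *match = strstr(data, conns[i]);
        if (!match)
            continue;
        long port = strtol(match + strlen(conns[i]), NULL, 10);
        printf("%swould be expected on TCP port %ld\n", conns[i], port);
    }
    return 0;
}
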
140
141static struct ip_conntrack_helper amanda_helper = {
142 .max_expected = ARRAY_SIZE(conns),
143 .timeout = 180,
144 .me = THIS_MODULE,
145 .help = help,
146 .name = "amanda",
147
148 .tuple = { .src = { .u = { __constant_htons(10080) } },
149 .dst = { .protonum = IPPROTO_UDP },
150 },
151 .mask = { .src = { .u = { 0xFFFF } },
152 .dst = { .protonum = 0xFF },
153 },
154};
155
156static void __exit fini(void)
157{
158 ip_conntrack_helper_unregister(&amanda_helper);
159}
160
161static int __init init(void)
162{
163 return ip_conntrack_helper_register(&amanda_helper);
164}
165
166module_init(init);
167module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
new file mode 100644
index 000000000000..28d9425d5c39
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -0,0 +1,1247 @@
1/* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
18 * */
19
20#include <linux/config.h>
21#include <linux/types.h>
22#include <linux/icmp.h>
23#include <linux/ip.h>
24#include <linux/netfilter.h>
25#include <linux/netfilter_ipv4.h>
26#include <linux/module.h>
27#include <linux/skbuff.h>
28#include <linux/proc_fs.h>
29#include <linux/vmalloc.h>
30#include <net/checksum.h>
31#include <net/ip.h>
32#include <linux/stddef.h>
33#include <linux/sysctl.h>
34#include <linux/slab.h>
35#include <linux/random.h>
36#include <linux/jhash.h>
37#include <linux/err.h>
38#include <linux/percpu.h>
39#include <linux/moduleparam.h>
40
41/* This rwlock protects the main hash table, protocol/helper/expected
42 registrations, and conntrack timers. */
43#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
44#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
45
46#include <linux/netfilter_ipv4/ip_conntrack.h>
47#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
48#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
49#include <linux/netfilter_ipv4/ip_conntrack_core.h>
50#include <linux/netfilter_ipv4/listhelp.h>
51
52#define IP_CONNTRACK_VERSION "2.1"
53
54#if 0
55#define DEBUGP printk
56#else
57#define DEBUGP(format, args...)
58#endif
59
60DECLARE_RWLOCK(ip_conntrack_lock);
61
62/* ip_conntrack_standalone needs this */
63atomic_t ip_conntrack_count = ATOMIC_INIT(0);
64
65void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
66LIST_HEAD(ip_conntrack_expect_list);
67struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
68static LIST_HEAD(helpers);
69unsigned int ip_conntrack_htable_size = 0;
70int ip_conntrack_max;
71struct list_head *ip_conntrack_hash;
72static kmem_cache_t *ip_conntrack_cachep;
73static kmem_cache_t *ip_conntrack_expect_cachep;
74struct ip_conntrack ip_conntrack_untracked;
75unsigned int ip_ct_log_invalid;
76static LIST_HEAD(unconfirmed);
77static int ip_conntrack_vmalloc;
78
79DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
80
81void
82ip_conntrack_put(struct ip_conntrack *ct)
83{
84 IP_NF_ASSERT(ct);
85 nf_conntrack_put(&ct->ct_general);
86}
87
88static int ip_conntrack_hash_rnd_initted;
89static unsigned int ip_conntrack_hash_rnd;
90
91static u_int32_t
92hash_conntrack(const struct ip_conntrack_tuple *tuple)
93{
94#if 0
95 dump_tuple(tuple);
96#endif
97 return (jhash_3words(tuple->src.ip,
98 (tuple->dst.ip ^ tuple->dst.protonum),
99 (tuple->src.u.all | (tuple->dst.u.all << 16)),
100 ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
101}
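
Conntrack entries live in a hash table keyed by tuple: hash_conntrack() mixes the source address, the destination address XOR protocol, and both port fields through jhash_3words() with a random seed, then reduces modulo ip_conntrack_htable_size. A toy userspace version of the same idea (a deliberately simple mixing function, not the kernel's jhash):

#include <stdio.h>
#include <stdint.h>

#define HTABLE_SIZE 512

struct tuple { uint32_t src_ip, dst_ip; uint16_t src_port, dst_port; uint8_t proto; };

/* Simple multiplicative mix; the kernel uses jhash_3words() plus a random seed. */
static unsigned int hash_tuple(const struct tuple *t, uint32_t seed)
{
    uint32_t h = seed;
    h = h * 2654435761u ^ t->src_ip;
    h = h * 2654435761u ^ (t->dst_ip ^ t->proto);
    h = h * 2654435761u ^ ((uint32_t)t->src_port | ((uint32_t)t->dst_port << 16));
    return h % HTABLE_SIZE;
}

int main(void)
{
    struct tuple t = { 0x0a000001, 0x0a000002, 12345, 80, 6 };
    printf("bucket %u of %u\n", hash_tuple(&t, 0xdeadbeef), HTABLE_SIZE);
    return 0;
}
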
102
103int
104ip_ct_get_tuple(const struct iphdr *iph,
105 const struct sk_buff *skb,
106 unsigned int dataoff,
107 struct ip_conntrack_tuple *tuple,
108 const struct ip_conntrack_protocol *protocol)
109{
110 /* Should never happen. */
111 if (iph->frag_off & htons(IP_OFFSET)) {
112 printk("ip_conntrack_core: Frag of proto %u.\n",
113 iph->protocol);
114 return 0;
115 }
116
117 tuple->src.ip = iph->saddr;
118 tuple->dst.ip = iph->daddr;
119 tuple->dst.protonum = iph->protocol;
120 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
121
122 return protocol->pkt_to_tuple(skb, dataoff, tuple);
123}
124
125int
126ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
127 const struct ip_conntrack_tuple *orig,
128 const struct ip_conntrack_protocol *protocol)
129{
130 inverse->src.ip = orig->dst.ip;
131 inverse->dst.ip = orig->src.ip;
132 inverse->dst.protonum = orig->dst.protonum;
133 inverse->dst.dir = !orig->dst.dir;
134
135 return protocol->invert_tuple(inverse, orig);
136}
137
138
139/* ip_conntrack_expect helper functions */
140static void destroy_expect(struct ip_conntrack_expect *exp)
141{
142 ip_conntrack_put(exp->master);
143 IP_NF_ASSERT(!timer_pending(&exp->timeout));
144 kmem_cache_free(ip_conntrack_expect_cachep, exp);
145 CONNTRACK_STAT_INC(expect_delete);
146}
147
148static void unlink_expect(struct ip_conntrack_expect *exp)
149{
150 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
151 list_del(&exp->list);
152 /* Logically in destroy_expect, but we hold the lock here. */
153 exp->master->expecting--;
154}
155
156static void expectation_timed_out(unsigned long ul_expect)
157{
158 struct ip_conntrack_expect *exp = (void *)ul_expect;
159
160 WRITE_LOCK(&ip_conntrack_lock);
161 unlink_expect(exp);
162 WRITE_UNLOCK(&ip_conntrack_lock);
163 destroy_expect(exp);
164}
165
166/* If an expectation for this connection is found, it gets deleted from
167 * the global list and then returned. */
168static struct ip_conntrack_expect *
169find_expectation(const struct ip_conntrack_tuple *tuple)
170{
171 struct ip_conntrack_expect *i;
172
173 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
174 /* If master is not in the hash table yet (i.e. the packet hasn't left
175 this machine yet), how can the other end know about the expectation?
176 Hence these are not the droids you are looking for (if
177 master ct never got confirmed, we'd hold a reference to it
178 and weird things would happen to future packets). */
179 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
180 && is_confirmed(i->master)
181 && del_timer(&i->timeout)) {
182 unlink_expect(i);
183 return i;
184 }
185 }
186 return NULL;
187}
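
find_expectation() compares the incoming tuple with each pending expectation under the expectation's mask, so a helper can leave unpredictable fields unconstrained (the Amanda helper above, for instance, leaves the source port as a wildcard). A minimal sketch of that masked tuple comparison with toy structures:

#include <stdio.h>
#include <stdint.h>

struct toy_tuple { uint32_t src_ip, dst_ip; uint16_t dst_port; };

/* Match when tuple and expectation agree on every bit the mask keeps. */
static int mask_cmp(const struct toy_tuple *t, const struct toy_tuple *exp,
                    const struct toy_tuple *mask)
{
    return ((t->src_ip ^ exp->src_ip) & mask->src_ip) == 0 &&
           ((t->dst_ip ^ exp->dst_ip) & mask->dst_ip) == 0 &&
           ((t->dst_port ^ exp->dst_port) & mask->dst_port) == 0;
}

int main(void)
{
    struct toy_tuple exp  = { 0x0a000001, 0x0a000002, 50100 };
    struct toy_tuple mask = { 0xffffffff, 0xffffffff, 0xffff };
    struct toy_tuple pkt  = { 0x0a000001, 0x0a000002, 50100 };

    printf("expected packet:  %d\n", mask_cmp(&pkt, &exp, &mask));
    pkt.dst_port = 22;
    printf("unrelated packet: %d\n", mask_cmp(&pkt, &exp, &mask));
    return 0;
}
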
188
189/* delete all expectations for this conntrack */
190static void remove_expectations(struct ip_conntrack *ct)
191{
192 struct ip_conntrack_expect *i, *tmp;
193
194 /* Optimization: most connections never expect any others. */
195 if (ct->expecting == 0)
196 return;
197
198 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
199 if (i->master == ct && del_timer(&i->timeout)) {
200 unlink_expect(i);
201 destroy_expect(i);
202 }
203 }
204}
205
206static void
207clean_from_lists(struct ip_conntrack *ct)
208{
209 unsigned int ho, hr;
210
211 DEBUGP("clean_from_lists(%p)\n", ct);
212 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
213
214 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
215 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
216 LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
217 LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
218
219 /* Destroy all pending expectations */
220 remove_expectations(ct);
221}
222
223static void
224destroy_conntrack(struct nf_conntrack *nfct)
225{
226 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
227 struct ip_conntrack_protocol *proto;
228
229 DEBUGP("destroy_conntrack(%p)\n", ct);
230 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
231 IP_NF_ASSERT(!timer_pending(&ct->timeout));
232
233 /* To make sure we don't get any weird locking issues here:
234 * destroy_conntrack() MUST NOT be called with a write lock
235 * to ip_conntrack_lock!!! -HW */
236 proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
237 if (proto && proto->destroy)
238 proto->destroy(ct);
239
240 if (ip_conntrack_destroyed)
241 ip_conntrack_destroyed(ct);
242
243 WRITE_LOCK(&ip_conntrack_lock);
244 /* Expectations will have been removed in clean_from_lists,
245 * except TFTP can create an expectation on the first packet,
246 * before the connection is in the list, so we need to clean here,
247 * too. */
248 remove_expectations(ct);
249
250 /* We overload first tuple to link into unconfirmed list. */
251 if (!is_confirmed(ct)) {
252 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
253 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
254 }
255
256 CONNTRACK_STAT_INC(delete);
257 WRITE_UNLOCK(&ip_conntrack_lock);
258
259 if (ct->master)
260 ip_conntrack_put(ct->master);
261
262 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
263 kmem_cache_free(ip_conntrack_cachep, ct);
264 atomic_dec(&ip_conntrack_count);
265}
266
267static void death_by_timeout(unsigned long ul_conntrack)
268{
269 struct ip_conntrack *ct = (void *)ul_conntrack;
270
271 WRITE_LOCK(&ip_conntrack_lock);
272 /* Inside lock so preempt is disabled on module removal path.
273 * Otherwise we can get spurious warnings. */
274 CONNTRACK_STAT_INC(delete_list);
275 clean_from_lists(ct);
276 WRITE_UNLOCK(&ip_conntrack_lock);
277 ip_conntrack_put(ct);
278}
279
280static inline int
281conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
282 const struct ip_conntrack_tuple *tuple,
283 const struct ip_conntrack *ignored_conntrack)
284{
285 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
286 return tuplehash_to_ctrack(i) != ignored_conntrack
287 && ip_ct_tuple_equal(tuple, &i->tuple);
288}
289
290static struct ip_conntrack_tuple_hash *
291__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
292 const struct ip_conntrack *ignored_conntrack)
293{
294 struct ip_conntrack_tuple_hash *h;
295 unsigned int hash = hash_conntrack(tuple);
296
297 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
298 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
299 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
300 CONNTRACK_STAT_INC(found);
301 return h;
302 }
303 CONNTRACK_STAT_INC(searched);
304 }
305
306 return NULL;
307}
308
309/* Find a connection corresponding to a tuple. */
310struct ip_conntrack_tuple_hash *
311ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
312 const struct ip_conntrack *ignored_conntrack)
313{
314 struct ip_conntrack_tuple_hash *h;
315
316 READ_LOCK(&ip_conntrack_lock);
317 h = __ip_conntrack_find(tuple, ignored_conntrack);
318 if (h)
319 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
320 READ_UNLOCK(&ip_conntrack_lock);
321
322 return h;
323}
324
325/* Confirm a connection given skb; places it in hash table */
326int
327__ip_conntrack_confirm(struct sk_buff **pskb)
328{
329 unsigned int hash, repl_hash;
330 struct ip_conntrack *ct;
331 enum ip_conntrack_info ctinfo;
332
333 ct = ip_conntrack_get(*pskb, &ctinfo);
334
335 /* ipt_REJECT uses ip_conntrack_attach to attach related
336 ICMP/TCP RST packets in other direction. Actual packet
337 which created connection will be IP_CT_NEW or for an
338 expected connection, IP_CT_RELATED. */
339 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
340 return NF_ACCEPT;
341
342 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
343 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
344
345 /* We're not in hash table, and we refuse to set up related
346 connections for unconfirmed conns. But packet copies and
347 REJECT will give spurious warnings here. */
348 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
349
350 /* No external references means no one else could have
351 confirmed us. */
352 IP_NF_ASSERT(!is_confirmed(ct));
353 DEBUGP("Confirming conntrack %p\n", ct);
354
355 WRITE_LOCK(&ip_conntrack_lock);
356
357 /* See if there's one in the list already, including reverse:
358 NAT could have grabbed it without realizing, since we're
359 not in the hash. If there is, we lost the race. */
360 if (!LIST_FIND(&ip_conntrack_hash[hash],
361 conntrack_tuple_cmp,
362 struct ip_conntrack_tuple_hash *,
363 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
364 && !LIST_FIND(&ip_conntrack_hash[repl_hash],
365 conntrack_tuple_cmp,
366 struct ip_conntrack_tuple_hash *,
367 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
368 /* Remove from unconfirmed list */
369 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
370
371 list_prepend(&ip_conntrack_hash[hash],
372 &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
373 list_prepend(&ip_conntrack_hash[repl_hash],
374 &ct->tuplehash[IP_CT_DIR_REPLY]);
375 /* Timer relative to confirmation time, not original
376 setting time, otherwise we'd get timer wrap in
377 weird delay cases. */
378 ct->timeout.expires += jiffies;
379 add_timer(&ct->timeout);
380 atomic_inc(&ct->ct_general.use);
381 set_bit(IPS_CONFIRMED_BIT, &ct->status);
382 CONNTRACK_STAT_INC(insert);
383 WRITE_UNLOCK(&ip_conntrack_lock);
384 return NF_ACCEPT;
385 }
386
387 CONNTRACK_STAT_INC(insert_failed);
388 WRITE_UNLOCK(&ip_conntrack_lock);
389
390 return NF_DROP;
391}
392
393/* Returns true if a connection corresponds to the tuple (required
394 for NAT). */
395int
396ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
397 const struct ip_conntrack *ignored_conntrack)
398{
399 struct ip_conntrack_tuple_hash *h;
400
401 READ_LOCK(&ip_conntrack_lock);
402 h = __ip_conntrack_find(tuple, ignored_conntrack);
403 READ_UNLOCK(&ip_conntrack_lock);
404
405 return h != NULL;
406}
407
408/* There's a small race here where we may free a just-assured
409 connection. Too bad: we're in trouble anyway. */
410static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
411{
412 return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
413}
414
415static int early_drop(struct list_head *chain)
416{
417 /* Traverse backwards: gives us oldest, which is roughly LRU */
418 struct ip_conntrack_tuple_hash *h;
419 struct ip_conntrack *ct = NULL;
420 int dropped = 0;
421
422 READ_LOCK(&ip_conntrack_lock);
423 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
424 if (h) {
425 ct = tuplehash_to_ctrack(h);
426 atomic_inc(&ct->ct_general.use);
427 }
428 READ_UNLOCK(&ip_conntrack_lock);
429
430 if (!ct)
431 return dropped;
432
433 if (del_timer(&ct->timeout)) {
434 death_by_timeout((unsigned long)ct);
435 dropped = 1;
436 CONNTRACK_STAT_INC(early_drop);
437 }
438 ip_conntrack_put(ct);
439 return dropped;
440}
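/* Note: early_drop() only ever evicts entries that the protocol tracker has
 * not yet marked IPS_ASSURED (see unreplied() above), so a full table sheds
 * new/unreplied flows first and leaves assured, established connections
 * untouched. */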
441
442static inline int helper_cmp(const struct ip_conntrack_helper *i,
443 const struct ip_conntrack_tuple *rtuple)
444{
445 return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
446}
447
448static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
449{
450 return LIST_FIND(&helpers, helper_cmp,
451 struct ip_conntrack_helper *,
452 tuple);
453}
454
455/* Allocate a new conntrack: we return -ENOMEM if classification
456 failed due to stress. Otherwise it really is unclassifiable. */
457static struct ip_conntrack_tuple_hash *
458init_conntrack(const struct ip_conntrack_tuple *tuple,
459 struct ip_conntrack_protocol *protocol,
460 struct sk_buff *skb)
461{
462 struct ip_conntrack *conntrack;
463 struct ip_conntrack_tuple repl_tuple;
464 size_t hash;
465 struct ip_conntrack_expect *exp;
466
467 if (!ip_conntrack_hash_rnd_initted) {
468 get_random_bytes(&ip_conntrack_hash_rnd, 4);
469 ip_conntrack_hash_rnd_initted = 1;
470 }
471
472 hash = hash_conntrack(tuple);
473
474 if (ip_conntrack_max
475 && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
476 /* Try dropping from this hash chain. */
477 if (!early_drop(&ip_conntrack_hash[hash])) {
478 if (net_ratelimit())
479 printk(KERN_WARNING
480 "ip_conntrack: table full, dropping"
481 " packet.\n");
482 return ERR_PTR(-ENOMEM);
483 }
484 }
485
486 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
487 DEBUGP("Can't invert tuple.\n");
488 return NULL;
489 }
490
491 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
492 if (!conntrack) {
493 DEBUGP("Can't allocate conntrack.\n");
494 return ERR_PTR(-ENOMEM);
495 }
496
497 memset(conntrack, 0, sizeof(*conntrack));
498 atomic_set(&conntrack->ct_general.use, 1);
499 conntrack->ct_general.destroy = destroy_conntrack;
500 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
501 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
502 if (!protocol->new(conntrack, skb)) {
503 kmem_cache_free(ip_conntrack_cachep, conntrack);
504 return NULL;
505 }
506 /* Don't set timer yet: wait for confirmation */
507 init_timer(&conntrack->timeout);
508 conntrack->timeout.data = (unsigned long)conntrack;
509 conntrack->timeout.function = death_by_timeout;
510
511 WRITE_LOCK(&ip_conntrack_lock);
512 exp = find_expectation(tuple);
513
514 if (exp) {
515 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
516 conntrack, exp);
517 /* Welcome, Mr. Bond. We've been expecting you... */
518 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
519 conntrack->master = exp->master;
520#ifdef CONFIG_IP_NF_CONNTRACK_MARK
521 conntrack->mark = exp->master->mark;
522#endif
523 nf_conntrack_get(&conntrack->master->ct_general);
524 CONNTRACK_STAT_INC(expect_new);
525 } else {
526 conntrack->helper = ip_ct_find_helper(&repl_tuple);
527
528 CONNTRACK_STAT_INC(new);
529 }
530
531 /* Overload tuple linked list to put us in unconfirmed list. */
532 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
533
534 atomic_inc(&ip_conntrack_count);
535 WRITE_UNLOCK(&ip_conntrack_lock);
536
537 if (exp) {
538 if (exp->expectfn)
539 exp->expectfn(conntrack, exp);
540 destroy_expect(exp);
541 }
542
543 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
544}
545
546/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
547static inline struct ip_conntrack *
548resolve_normal_ct(struct sk_buff *skb,
549 struct ip_conntrack_protocol *proto,
550 int *set_reply,
551 unsigned int hooknum,
552 enum ip_conntrack_info *ctinfo)
553{
554 struct ip_conntrack_tuple tuple;
555 struct ip_conntrack_tuple_hash *h;
556 struct ip_conntrack *ct;
557
558 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
559
560 if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
561 &tuple,proto))
562 return NULL;
563
564 /* look for tuple match */
565 h = ip_conntrack_find_get(&tuple, NULL);
566 if (!h) {
567 h = init_conntrack(&tuple, proto, skb);
568 if (!h)
569 return NULL;
570 if (IS_ERR(h))
571 return (void *)h;
572 }
573 ct = tuplehash_to_ctrack(h);
574
575 /* It exists; we have (non-exclusive) reference. */
576 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
577 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
578 /* Please set reply bit if this packet OK */
579 *set_reply = 1;
580 } else {
581 /* Once we've had two way comms, always ESTABLISHED. */
582 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
583 DEBUGP("ip_conntrack_in: normal packet for %p\n",
584 ct);
585 *ctinfo = IP_CT_ESTABLISHED;
586 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
587 DEBUGP("ip_conntrack_in: related packet for %p\n",
588 ct);
589 *ctinfo = IP_CT_RELATED;
590 } else {
591 DEBUGP("ip_conntrack_in: new packet for %p\n",
592 ct);
593 *ctinfo = IP_CT_NEW;
594 }
595 *set_reply = 0;
596 }
597 skb->nfct = &ct->ct_general;
598 skb->nfctinfo = *ctinfo;
599 return ct;
600}
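/* Summary of the ctinfo values set above:
 *   reply direction              -> IP_CT_ESTABLISHED + IP_CT_IS_REPLY
 *   original dir, reply seen     -> IP_CT_ESTABLISHED
 *   original dir, was expected   -> IP_CT_RELATED
 *   original dir, otherwise      -> IP_CT_NEW
 * These are what the iptables `state' match reports as
 * NEW/RELATED/ESTABLISHED. */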
601
602/* Netfilter hook itself. */
603unsigned int ip_conntrack_in(unsigned int hooknum,
604 struct sk_buff **pskb,
605 const struct net_device *in,
606 const struct net_device *out,
607 int (*okfn)(struct sk_buff *))
608{
609 struct ip_conntrack *ct;
610 enum ip_conntrack_info ctinfo;
611 struct ip_conntrack_protocol *proto;
612 int set_reply;
613 int ret;
614
615 /* Previously seen (loopback or untracked)? Ignore. */
616 if ((*pskb)->nfct) {
617 CONNTRACK_STAT_INC(ignore);
618 return NF_ACCEPT;
619 }
620
621 /* Should never happen: fragments are reassembled before conntrack sees them. */
622 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
623 if (net_ratelimit()) {
624 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
625 (*pskb)->nh.iph->protocol, hooknum);
626 }
627 return NF_DROP;
628 }
629
630 /* FIXME: Do this right please. --RR */
631 (*pskb)->nfcache |= NFC_UNKNOWN;
632
633/* Doesn't cover locally-generated broadcast, so not worth it. */
634#if 0
635 /* Ignore broadcast: no `connection'. */
636 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
637 printk("Broadcast packet!\n");
638 return NF_ACCEPT;
639 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
640 == htonl(0x000000FF)) {
641 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
642 NIPQUAD((*pskb)->nh.iph->saddr),
643 NIPQUAD((*pskb)->nh.iph->daddr),
644 (*pskb)->sk, (*pskb)->pkt_type);
645 }
646#endif
647
648 proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
649
650 /* It may be a special packet: error, unclean...
651 * The inverse of the return code tells the netfilter
652 * core what to do with the packet. */
653 if (proto->error != NULL
654 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
655 CONNTRACK_STAT_INC(error);
656 CONNTRACK_STAT_INC(invalid);
657 return -ret;
658 }
659
660 if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
661 /* Not valid part of a connection */
662 CONNTRACK_STAT_INC(invalid);
663 return NF_ACCEPT;
664 }
665
666 if (IS_ERR(ct)) {
667 /* Too stressed to deal. */
668 CONNTRACK_STAT_INC(drop);
669 return NF_DROP;
670 }
671
672 IP_NF_ASSERT((*pskb)->nfct);
673
674 ret = proto->packet(ct, *pskb, ctinfo);
675 if (ret < 0) {
676 /* Invalid: inverse of the return code tells
677 * the netfilter core what to do*/
678 nf_conntrack_put((*pskb)->nfct);
679 (*pskb)->nfct = NULL;
680 CONNTRACK_STAT_INC(invalid);
681 return -ret;
682 }
683
684 if (set_reply)
685 set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
686
687 return ret;
688}
689
690int invert_tuplepr(struct ip_conntrack_tuple *inverse,
691 const struct ip_conntrack_tuple *orig)
692{
693 return ip_ct_invert_tuple(inverse, orig,
694 ip_ct_find_proto(orig->dst.protonum));
695}
696
697/* Would two expected things clash? */
698static inline int expect_clash(const struct ip_conntrack_expect *a,
699 const struct ip_conntrack_expect *b)
700{
701 /* Part covered by intersection of masks must be unequal,
702 otherwise they clash */
703 struct ip_conntrack_tuple intersect_mask
704 = { { a->mask.src.ip & b->mask.src.ip,
705 { a->mask.src.u.all & b->mask.src.u.all } },
706 { a->mask.dst.ip & b->mask.dst.ip,
707 { a->mask.dst.u.all & b->mask.dst.u.all },
708 a->mask.dst.protonum & b->mask.dst.protonum } };
709
710 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
711}
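/* Worked example (illustrative values): expectation A covers
 * 10.0.0.1 -> 10.0.0.2:2021/tcp with a full mask, while expectation B
 * covers 10.0.0.1 -> 10.0.0.2:any (dst port mask 0).  The intersection
 * mask has the port bits cleared, so only the addresses and protocol are
 * compared; they are equal, hence the two expectations clash. */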
712
713static inline int expect_matches(const struct ip_conntrack_expect *a,
714 const struct ip_conntrack_expect *b)
715{
716 return a->master == b->master
717 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
718 && ip_ct_tuple_equal(&a->mask, &b->mask);
719}
720
721/* Generally a bad idea to call this: could have matched already. */
722void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
723{
724 struct ip_conntrack_expect *i;
725
726 WRITE_LOCK(&ip_conntrack_lock);
727 /* choose the oldest expectation to evict */
728 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
729 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
730 unlink_expect(i);
731 WRITE_UNLOCK(&ip_conntrack_lock);
732 destroy_expect(i);
733 return;
734 }
735 }
736 WRITE_UNLOCK(&ip_conntrack_lock);
737}
738
739struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
740{
741 struct ip_conntrack_expect *new;
742
743 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
744 if (!new) {
745 DEBUGP("expect_related: OOM allocating expect\n");
746 return NULL;
747 }
748 new->master = NULL;
749 return new;
750}
751
752void ip_conntrack_expect_free(struct ip_conntrack_expect *expect)
753{
754 kmem_cache_free(ip_conntrack_expect_cachep, expect);
755}
756
757static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
758{
759 atomic_inc(&exp->master->ct_general.use);
760 exp->master->expecting++;
761 list_add(&exp->list, &ip_conntrack_expect_list);
762
763 if (exp->master->helper->timeout) {
764 init_timer(&exp->timeout);
765 exp->timeout.data = (unsigned long)exp;
766 exp->timeout.function = expectation_timed_out;
767 exp->timeout.expires
768 = jiffies + exp->master->helper->timeout * HZ;
769 add_timer(&exp->timeout);
770 } else
771 exp->timeout.function = NULL;
772
773 CONNTRACK_STAT_INC(expect_create);
774}
775
776/* Race with expectations being used means we could have none to find; OK. */
777static void evict_oldest_expect(struct ip_conntrack *master)
778{
779 struct ip_conntrack_expect *i;
780
781 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
782 if (i->master == master) {
783 if (del_timer(&i->timeout)) {
784 unlink_expect(i);
785 destroy_expect(i);
786 }
787 break;
788 }
789 }
790}
791
792static inline int refresh_timer(struct ip_conntrack_expect *i)
793{
794 if (!del_timer(&i->timeout))
795 return 0;
796
797 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
798 add_timer(&i->timeout);
799 return 1;
800}
801
802int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
803{
804 struct ip_conntrack_expect *i;
805 int ret;
806
807 DEBUGP("ip_conntrack_expect_related %p\n", expect);
808 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
809 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
810
811 WRITE_LOCK(&ip_conntrack_lock);
812 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
813 if (expect_matches(i, expect)) {
814 /* Refresh timer: if it's dying, ignore.. */
815 if (refresh_timer(i)) {
816 ret = 0;
817 /* We don't need the one they've given us. */
818 ip_conntrack_expect_free(expect);
819 goto out;
820 }
821 } else if (expect_clash(i, expect)) {
822 ret = -EBUSY;
823 goto out;
824 }
825 }
826
827 /* Will be over limit? */
828 if (expect->master->helper->max_expected &&
829 expect->master->expecting >= expect->master->helper->max_expected)
830 evict_oldest_expect(expect->master);
831
832 ip_conntrack_expect_insert(expect);
833 ret = 0;
834out:
835 WRITE_UNLOCK(&ip_conntrack_lock);
836 return ret;
837}
838
839/* Alter reply tuple (maybe alter helper). This is for NAT, and is
840 implicitly racy: see __ip_conntrack_confirm */
841void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
842 const struct ip_conntrack_tuple *newreply)
843{
844 WRITE_LOCK(&ip_conntrack_lock);
845 /* Should be unconfirmed, so not in hash table yet */
846 IP_NF_ASSERT(!is_confirmed(conntrack));
847
848 DEBUGP("Altering reply tuple of %p to ", conntrack);
849 DUMP_TUPLE(newreply);
850
851 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
852 if (!conntrack->master && conntrack->expecting == 0)
853 conntrack->helper = ip_ct_find_helper(newreply);
854 WRITE_UNLOCK(&ip_conntrack_lock);
855}
856
857int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
858{
859 BUG_ON(me->timeout == 0);
860 WRITE_LOCK(&ip_conntrack_lock);
861 list_prepend(&helpers, me);
862 WRITE_UNLOCK(&ip_conntrack_lock);
863
864 return 0;
865}
866
867static inline int unhelp(struct ip_conntrack_tuple_hash *i,
868 const struct ip_conntrack_helper *me)
869{
870 if (tuplehash_to_ctrack(i)->helper == me)
871 tuplehash_to_ctrack(i)->helper = NULL;
872 return 0;
873}
874
875void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
876{
877 unsigned int i;
878 struct ip_conntrack_expect *exp, *tmp;
879
880 /* Need write lock here, to delete helper. */
881 WRITE_LOCK(&ip_conntrack_lock);
882 LIST_DELETE(&helpers, me);
883
884 /* Get rid of expectations */
885 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
886 if (exp->master->helper == me && del_timer(&exp->timeout)) {
887 unlink_expect(exp);
888 destroy_expect(exp);
889 }
890 }
891 /* Clear the helper pointer of any remaining conntracks (unconfirmed and hashed). */
892 LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
893 for (i = 0; i < ip_conntrack_htable_size; i++)
894 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
895 struct ip_conntrack_tuple_hash *, me);
896 WRITE_UNLOCK(&ip_conntrack_lock);
897
898 /* Someone could be still looking at the helper in a bh. */
899 synchronize_net();
900}
901
902static inline void ct_add_counters(struct ip_conntrack *ct,
903 enum ip_conntrack_info ctinfo,
904 const struct sk_buff *skb)
905{
906#ifdef CONFIG_IP_NF_CT_ACCT
907 if (skb) {
908 ct->counters[CTINFO2DIR(ctinfo)].packets++;
909 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
910 ntohs(skb->nh.iph->tot_len);
911 }
912#endif
913}
914
915/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
916void ip_ct_refresh_acct(struct ip_conntrack *ct,
917 enum ip_conntrack_info ctinfo,
918 const struct sk_buff *skb,
919 unsigned long extra_jiffies)
920{
921 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
922
923 /* If not in hash table, timer will not be active yet */
924 if (!is_confirmed(ct)) {
925 ct->timeout.expires = extra_jiffies;
926 ct_add_counters(ct, ctinfo, skb);
927 } else {
928 WRITE_LOCK(&ip_conntrack_lock);
929 /* Need del_timer for race avoidance (may already be dying). */
930 if (del_timer(&ct->timeout)) {
931 ct->timeout.expires = jiffies + extra_jiffies;
932 add_timer(&ct->timeout);
933 }
934 ct_add_counters(ct, ctinfo, skb);
935 WRITE_UNLOCK(&ip_conntrack_lock);
936 }
937}
938
939/* Returns new sk_buff, or NULL */
940struct sk_buff *
941ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
942{
943 struct sock *sk = skb->sk;
944#ifdef CONFIG_NETFILTER_DEBUG
945 unsigned int olddebug = skb->nf_debug;
946#endif
947
948 if (sk) {
949 sock_hold(sk);
950 skb_orphan(skb);
951 }
952
953 local_bh_disable();
954 skb = ip_defrag(skb, user);
955 local_bh_enable();
956
957 if (!skb) {
958 if (sk)
959 sock_put(sk);
960 return skb;
961 }
962
963 if (sk) {
964 skb_set_owner_w(skb, sk);
965 sock_put(sk);
966 }
967
968 ip_send_check(skb->nh.iph);
969 skb->nfcache |= NFC_ALTERED;
970#ifdef CONFIG_NETFILTER_DEBUG
971 /* Packet path as if nothing had happened. */
972 skb->nf_debug = olddebug;
973#endif
974 return skb;
975}
976
977/* Used by ipt_REJECT. */
978static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
979{
980 struct ip_conntrack *ct;
981 enum ip_conntrack_info ctinfo;
982
983 /* This ICMP is in reverse direction to the packet which caused it */
984 ct = ip_conntrack_get(skb, &ctinfo);
985
986 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
987 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
988 else
989 ctinfo = IP_CT_RELATED;
990
991 /* Attach to new skbuff, and increment count */
992 nskb->nfct = &ct->ct_general;
993 nskb->nfctinfo = ctinfo;
994 nf_conntrack_get(nskb->nfct);
995}
996
997static inline int
998do_iter(const struct ip_conntrack_tuple_hash *i,
999 int (*iter)(struct ip_conntrack *i, void *data),
1000 void *data)
1001{
1002 return iter(tuplehash_to_ctrack(i), data);
1003}
1004
1005/* Bring out ya dead! */
1006static struct ip_conntrack_tuple_hash *
1007get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1008 void *data, unsigned int *bucket)
1009{
1010 struct ip_conntrack_tuple_hash *h = NULL;
1011
1012 WRITE_LOCK(&ip_conntrack_lock);
1013 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1014 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1015 struct ip_conntrack_tuple_hash *, iter, data);
1016 if (h)
1017 break;
1018 }
1019 if (!h)
1020 h = LIST_FIND_W(&unconfirmed, do_iter,
1021 struct ip_conntrack_tuple_hash *, iter, data);
1022 if (h)
1023 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1024 WRITE_UNLOCK(&ip_conntrack_lock);
1025
1026 return h;
1027}
1028
1029void
1030ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1031{
1032 struct ip_conntrack_tuple_hash *h;
1033 unsigned int bucket = 0;
1034
1035 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1036 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1037 /* Time to push up daises... */
1038 if (del_timer(&ct->timeout))
1039 death_by_timeout((unsigned long)ct);
1040 /* ... else the timer will get him soon. */
1041
1042 ip_conntrack_put(ct);
1043 }
1044}
1045
1046/* Fast function for those who don't want to parse /proc (and I don't
1047 blame them). */
1048/* Reversing the socket's dst/src point of view gives us the reply
1049 mapping. */
1050static int
1051getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1052{
1053 struct inet_sock *inet = inet_sk(sk);
1054 struct ip_conntrack_tuple_hash *h;
1055 struct ip_conntrack_tuple tuple;
1056
1057 IP_CT_TUPLE_U_BLANK(&tuple);
1058 tuple.src.ip = inet->rcv_saddr;
1059 tuple.src.u.tcp.port = inet->sport;
1060 tuple.dst.ip = inet->daddr;
1061 tuple.dst.u.tcp.port = inet->dport;
1062 tuple.dst.protonum = IPPROTO_TCP;
1063
1064 /* We only do TCP at the moment: is there a better way? */
1065 if (strcmp(sk->sk_prot->name, "TCP")) {
1066 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1067 return -ENOPROTOOPT;
1068 }
1069
1070 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1071 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1072 *len, sizeof(struct sockaddr_in));
1073 return -EINVAL;
1074 }
1075
1076 h = ip_conntrack_find_get(&tuple, NULL);
1077 if (h) {
1078 struct sockaddr_in sin;
1079 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1080
1081 sin.sin_family = AF_INET;
1082 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1083 .tuple.dst.u.tcp.port;
1084 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1085 .tuple.dst.ip;
1086
1087 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1088 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1089 ip_conntrack_put(ct);
1090 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1091 return -EFAULT;
1092 else
1093 return 0;
1094 }
1095 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1096 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1097 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1098 return -ENOENT;
1099}
1100
1101static struct nf_sockopt_ops so_getorigdst = {
1102 .pf = PF_INET,
1103 .get_optmin = SO_ORIGINAL_DST,
1104 .get_optmax = SO_ORIGINAL_DST+1,
1105 .get = &getorigdst,
1106};
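/* Typical userspace use of this socket option (sketch, not part of this
 * file): a transparent proxy that accepted a REDIRECTed connection can
 * recover the pre-NAT destination with something like
 *
 *	struct sockaddr_in dst;
 *	socklen_t len = sizeof(dst);
 *	getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len);
 *
 * which reaches getorigdst() above through the netfilter sockopt
 * dispatcher. */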
1107
1108static int kill_all(struct ip_conntrack *i, void *data)
1109{
1110 return 1;
1111}
1112
1113static void free_conntrack_hash(void)
1114{
1115 if (ip_conntrack_vmalloc)
1116 vfree(ip_conntrack_hash);
1117 else
1118 free_pages((unsigned long)ip_conntrack_hash,
1119 get_order(sizeof(struct list_head)
1120 * ip_conntrack_htable_size));
1121}
1122
1123/* Mishearing the voices in his head, our hero wonders how he's
1124 supposed to kill the mall. */
1125void ip_conntrack_cleanup(void)
1126{
1127 ip_ct_attach = NULL;
1128 /* This makes sure all current packets have passed through
1129 netfilter framework. Roll on, two-stage module
1130 delete... */
1131 synchronize_net();
1132
1133 i_see_dead_people:
1134 ip_ct_iterate_cleanup(kill_all, NULL);
1135 if (atomic_read(&ip_conntrack_count) != 0) {
1136 schedule();
1137 goto i_see_dead_people;
1138 }
1139
1140 kmem_cache_destroy(ip_conntrack_cachep);
1141 kmem_cache_destroy(ip_conntrack_expect_cachep);
1142 free_conntrack_hash();
1143 nf_unregister_sockopt(&so_getorigdst);
1144}
1145
1146static int hashsize;
1147module_param(hashsize, int, 0400);
1148
1149int __init ip_conntrack_init(void)
1150{
1151 unsigned int i;
1152 int ret;
1153
1154 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1155 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1156 if (hashsize) {
1157 ip_conntrack_htable_size = hashsize;
1158 } else {
1159 ip_conntrack_htable_size
1160 = (((num_physpages << PAGE_SHIFT) / 16384)
1161 / sizeof(struct list_head));
1162 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1163 ip_conntrack_htable_size = 8192;
1164 if (ip_conntrack_htable_size < 16)
1165 ip_conntrack_htable_size = 16;
1166 }
1167 ip_conntrack_max = 8 * ip_conntrack_htable_size;
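/* Worked example of the sizing above: a 32 MB i386 box has
 * num_physpages << PAGE_SHIFT == 32 MB; 32 MB / 16384 == 2048 bytes of
 * bucket space; with an 8-byte struct list_head that is 256 buckets, and
 * ip_conntrack_max == 8 * 256 == 2048 tracked connections, matching the
 * comment above. */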
1168
1169 printk("ip_conntrack version %s (%u buckets, %d max)"
1170 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1171 ip_conntrack_htable_size, ip_conntrack_max,
1172 sizeof(struct ip_conntrack));
1173
1174 ret = nf_register_sockopt(&so_getorigdst);
1175 if (ret != 0) {
1176 printk(KERN_ERR "Unable to register netfilter socket option\n");
1177 return ret;
1178 }
1179
1180 /* AK: the hash table is twice as big as needed because it
1181 uses list_head. It would be much friendlier to the caches to
1182 use a single-pointer list head here. */
1183 ip_conntrack_vmalloc = 0;
1184 ip_conntrack_hash
1185 =(void*)__get_free_pages(GFP_KERNEL,
1186 get_order(sizeof(struct list_head)
1187 *ip_conntrack_htable_size));
1188 if (!ip_conntrack_hash) {
1189 ip_conntrack_vmalloc = 1;
1190 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1191 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1192 * ip_conntrack_htable_size);
1193 }
1194 if (!ip_conntrack_hash) {
1195 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1196 goto err_unreg_sockopt;
1197 }
1198
1199 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1200 sizeof(struct ip_conntrack), 0,
1201 0, NULL, NULL);
1202 if (!ip_conntrack_cachep) {
1203 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1204 goto err_free_hash;
1205 }
1206
1207 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1208 sizeof(struct ip_conntrack_expect),
1209 0, 0, NULL, NULL);
1210 if (!ip_conntrack_expect_cachep) {
1211 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1212 goto err_free_conntrack_slab;
1213 }
1214
1215 /* Don't NEED lock here, but good form anyway. */
1216 WRITE_LOCK(&ip_conntrack_lock);
1217 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1218 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1219 /* Sew in builtin protocols. */
1220 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1221 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1222 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1223 WRITE_UNLOCK(&ip_conntrack_lock);
1224
1225 for (i = 0; i < ip_conntrack_htable_size; i++)
1226 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1227
1228 /* For use by ipt_REJECT */
1229 ip_ct_attach = ip_conntrack_attach;
1230
1231 /* Set up fake conntrack:
1232 - to never be deleted, not in any hashes */
1233 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1234 /* - and make it look like a confirmed connection */
1235 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1236
1237 return ret;
1238
1239err_free_conntrack_slab:
1240 kmem_cache_destroy(ip_conntrack_cachep);
1241err_free_hash:
1242 free_conntrack_hash();
1243err_unreg_sockopt:
1244 nf_unregister_sockopt(&so_getorigdst);
1245
1246 return -ENOMEM;
1247}
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
new file mode 100644
index 000000000000..12b88cbb11db
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -0,0 +1,501 @@
1/* FTP extension for IP connection tracking. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/config.h>
12#include <linux/module.h>
13#include <linux/netfilter.h>
14#include <linux/ip.h>
15#include <linux/ctype.h>
16#include <net/checksum.h>
17#include <net/tcp.h>
18
19#include <linux/netfilter_ipv4/lockhelp.h>
20#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
21#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
22#include <linux/moduleparam.h>
23
24MODULE_LICENSE("GPL");
25MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
26MODULE_DESCRIPTION("ftp connection tracking helper");
27
28/* This is slow, but it's simple. --RR */
29static char ftp_buffer[65536];
30
31static DECLARE_LOCK(ip_ftp_lock);
32
33#define MAX_PORTS 8
34static int ports[MAX_PORTS];
35static int ports_c;
36module_param_array(ports, int, &ports_c, 0400);
37
38static int loose;
39module_param(loose, int, 0600);
40
41unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb,
42 enum ip_conntrack_info ctinfo,
43 enum ip_ct_ftp_type type,
44 unsigned int matchoff,
45 unsigned int matchlen,
46 struct ip_conntrack_expect *exp,
47 u32 *seq);
48EXPORT_SYMBOL_GPL(ip_nat_ftp_hook);
49
50#if 0
51#define DEBUGP printk
52#else
53#define DEBUGP(format, args...)
54#endif
55
56static int try_rfc959(const char *, size_t, u_int32_t [], char);
57static int try_eprt(const char *, size_t, u_int32_t [], char);
58static int try_epsv_response(const char *, size_t, u_int32_t [], char);
59
60static struct ftp_search {
61 enum ip_conntrack_dir dir;
62 const char *pattern;
63 size_t plen;
64 char skip;
65 char term;
66 enum ip_ct_ftp_type ftptype;
67 int (*getnum)(const char *, size_t, u_int32_t[], char);
68} search[] = {
69 {
70 IP_CT_DIR_ORIGINAL,
71 "PORT", sizeof("PORT") - 1, ' ', '\r',
72 IP_CT_FTP_PORT,
73 try_rfc959,
74 },
75 {
76 IP_CT_DIR_REPLY,
77 "227 ", sizeof("227 ") - 1, '(', ')',
78 IP_CT_FTP_PASV,
79 try_rfc959,
80 },
81 {
82 IP_CT_DIR_ORIGINAL,
83 "EPRT", sizeof("EPRT") - 1, ' ', '\r',
84 IP_CT_FTP_EPRT,
85 try_eprt,
86 },
87 {
88 IP_CT_DIR_REPLY,
89 "229 ", sizeof("229 ") - 1, '(', ')',
90 IP_CT_FTP_EPSV,
91 try_epsv_response,
92 },
93};
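/* Example matches for the table above (illustrative traffic):
 *   client: "PORT 10,0,0,1,31,64\r\n"                         -> IP_CT_FTP_PORT
 *   server: "227 Entering Passive Mode (10,0,0,1,31,64)"      -> IP_CT_FTP_PASV
 *   client: "EPRT |1|10.0.0.1|8000|\r\n"                      -> IP_CT_FTP_EPRT
 *   server: "229 Entering Extended Passive Mode (|||8000|)"   -> IP_CT_FTP_EPSV
 * In each case the text between `skip' and `term' is handed to the entry's
 * getnum() parser. */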
94
95static int try_number(const char *data, size_t dlen, u_int32_t array[],
96 int array_size, char sep, char term)
97{
98 u_int32_t i, len;
99
100 memset(array, 0, sizeof(array[0])*array_size);
101
102 /* Keep data pointing at next char. */
103 for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) {
104 if (*data >= '0' && *data <= '9') {
105 array[i] = array[i]*10 + *data - '0';
106 }
107 else if (*data == sep)
108 i++;
109 else {
110 /* Unexpected character; true if it's the
111 terminator and we're finished. */
112 if (*data == term && i == array_size - 1)
113 return len;
114
115 DEBUGP("Char %u (got %u nums) `%u' unexpected\n",
116 len, i, *data);
117 return 0;
118 }
119 }
120 DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep);
121
122 return 0;
123}
124
125/* Returns 0, or length of numbers: 192,168,1,1,5,6 */
126static int try_rfc959(const char *data, size_t dlen, u_int32_t array[6],
127 char term)
128{
129 return try_number(data, dlen, array, 6, ',', term);
130}
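/* Example: for "192,168,1,1,5,6" the array becomes
 * { 192, 168, 1, 1, 5, 6 }: array[0..3] is the IP address and the port is
 * array[4] * 256 + array[5] == 1286, which is how help() below rebuilds
 * the expected tuple. */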
131
132/* Grab port: number up to delimiter */
133static int get_port(const char *data, int start, size_t dlen, char delim,
134 u_int32_t array[2])
135{
136 u_int16_t port = 0;
137 int i;
138
139 for (i = start; i < dlen; i++) {
140 /* Finished? */
141 if (data[i] == delim) {
142 if (port == 0)
143 break;
144 array[0] = port >> 8;
145 array[1] = port;
146 return i + 1;
147 }
148 else if (data[i] >= '0' && data[i] <= '9')
149 port = port*10 + data[i] - '0';
150 else /* Some other crap */
151 break;
152 }
153 return 0;
154}
155
156/* Returns 0, or length of numbers: |1|132.235.1.2|6275| */
157static int try_eprt(const char *data, size_t dlen, u_int32_t array[6],
158 char term)
159{
160 char delim;
161 int length;
162
163 /* First character is delimiter, then "1" for IPv4, then
164 delimiter again. */
165 if (dlen <= 3) return 0;
166 delim = data[0];
167 if (isdigit(delim) || delim < 33 || delim > 126
168 || data[1] != '1' || data[2] != delim)
169 return 0;
170
171 DEBUGP("EPRT: Got |1|!\n");
172 /* Now we have IP address. */
173 length = try_number(data + 3, dlen - 3, array, 4, '.', delim);
174 if (length == 0)
175 return 0;
176
177 DEBUGP("EPRT: Got IP address!\n");
178 /* Start offset includes initial "|1|", and trailing delimiter */
179 return get_port(data, 3 + length + 1, dlen, delim, array+4);
180}
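/* Example: "|1|132.235.1.2|6275|" yields array[0..3] = { 132, 235, 1, 2 },
 * and get_port() stores 6275 as array[4] = 24, array[5] = 131
 * (24 * 256 + 131 == 6275). */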
181
182/* Returns 0, or length of numbers: |||6446| */
183static int try_epsv_response(const char *data, size_t dlen, u_int32_t array[6],
184 char term)
185{
186 char delim;
187
188 /* Three delimiters. */
189 if (dlen <= 3) return 0;
190 delim = data[0];
191 if (isdigit(delim) || delim < 33 || delim > 126
192 || data[1] != delim || data[2] != delim)
193 return 0;
194
195 return get_port(data, 3, dlen, delim, array+4);
196}
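/* Example: "|||6446|" only carries a port, so get_port() fills
 * array[4] = 25, array[5] = 46 (25 * 256 + 46 == 6446); the IP part of the
 * array is pre-filled from the connection tuple in help() below, since
 * EPSV responses do not mention an address. */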
197
198/* Return 1 for match, 0 for no match (caller accepts), -1 for partial match. */
199static int find_pattern(const char *data, size_t dlen,
200 const char *pattern, size_t plen,
201 char skip, char term,
202 unsigned int *numoff,
203 unsigned int *numlen,
204 u_int32_t array[6],
205 int (*getnum)(const char *, size_t, u_int32_t[], char))
206{
207 size_t i;
208
209 DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen);
210 if (dlen == 0)
211 return 0;
212
213 if (dlen <= plen) {
214 /* Short packet: try for partial? */
215 if (strnicmp(data, pattern, dlen) == 0)
216 return -1;
217 else return 0;
218 }
219
220 if (strnicmp(data, pattern, plen) != 0) {
221#if 0
222 size_t i;
223
224 DEBUGP("ftp: string mismatch\n");
225 for (i = 0; i < plen; i++) {
226 DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
227 i, data[i], data[i],
228 pattern[i], pattern[i]);
229 }
230#endif
231 return 0;
232 }
233
234 DEBUGP("Pattern matches!\n");
235 /* Now we've found the constant string, try to skip
236 to the 'skip' character */
237 for (i = plen; data[i] != skip; i++)
238 if (i == dlen - 1) return -1;
239
240 /* Skip over the last character */
241 i++;
242
243 DEBUGP("Skipped up to `%c'!\n", skip);
244
245 *numoff = i;
246 *numlen = getnum(data + i, dlen - i, array, term);
247 if (!*numlen)
248 return -1;
249
250 DEBUGP("Match succeeded!\n");
251 return 1;
252}
253
254/* Look up to see if we're just after a \n. */
255static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir)
256{
257 unsigned int i;
258
259 for (i = 0; i < info->seq_aft_nl_num[dir]; i++)
260 if (info->seq_aft_nl[dir][i] == seq)
261 return 1;
262 return 0;
263}
264
265/* We don't update if it's older than what we have. */
266static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir)
267{
268 unsigned int i, oldest = NUM_SEQ_TO_REMEMBER;
269
270 /* Look for oldest: if we find exact match, we're done. */
271 for (i = 0; i < info->seq_aft_nl_num[dir]; i++) {
272 if (info->seq_aft_nl[dir][i] == nl_seq)
273 return;
274
275 if (oldest == info->seq_aft_nl_num[dir]
276 || before(info->seq_aft_nl[dir][i], oldest))
277 oldest = i;
278 }
279
280 if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER)
281 info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
282 else if (oldest != NUM_SEQ_TO_REMEMBER)
283 info->seq_aft_nl[dir][oldest] = nl_seq;
284}
285
286static int help(struct sk_buff **pskb,
287 struct ip_conntrack *ct,
288 enum ip_conntrack_info ctinfo)
289{
290 unsigned int dataoff, datalen;
291 struct tcphdr _tcph, *th;
292 char *fb_ptr;
293 int ret;
294 u32 seq, array[6] = { 0 };
295 int dir = CTINFO2DIR(ctinfo);
296 unsigned int matchlen, matchoff;
297 struct ip_ct_ftp_master *ct_ftp_info = &ct->help.ct_ftp_info;
298 struct ip_conntrack_expect *exp;
299 unsigned int i;
300 int found = 0, ends_in_nl;
301
302 /* Until there's been traffic both ways, don't look in packets. */
303 if (ctinfo != IP_CT_ESTABLISHED
304 && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) {
305 DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo);
306 return NF_ACCEPT;
307 }
308
309 th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
310 sizeof(_tcph), &_tcph);
311 if (th == NULL)
312 return NF_ACCEPT;
313
314 dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4;
315 /* No data? */
316 if (dataoff >= (*pskb)->len) {
317 DEBUGP("ftp: pskblen = %u\n", (*pskb)->len);
318 return NF_ACCEPT;
319 }
320 datalen = (*pskb)->len - dataoff;
321
322 LOCK_BH(&ip_ftp_lock);
323 fb_ptr = skb_header_pointer(*pskb, dataoff,
324 (*pskb)->len - dataoff, ftp_buffer);
325 BUG_ON(fb_ptr == NULL);
326
327 ends_in_nl = (fb_ptr[datalen - 1] == '\n');
328 seq = ntohl(th->seq) + datalen;
329
330 /* Look up to see if we're just after a \n. */
331 if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) {
332 /* Now if this ends in \n, update ftp info. */
333 DEBUGP("ip_conntrack_ftp_help: seq %u is not right after "
334 "a cached newline for this direction; ignoring\n",
335 ntohl(th->seq));
336 ret = NF_ACCEPT;
337 goto out_update_nl;
338 }
339
340 /* Initialize IP array to expected address (it's not mentioned
341 in EPSV responses) */
342 array[0] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 24) & 0xFF;
343 array[1] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 16) & 0xFF;
344 array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF;
345 array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF;
346
347 for (i = 0; i < ARRAY_SIZE(search); i++) {
348 if (search[i].dir != dir) continue;
349
350 found = find_pattern(fb_ptr, (*pskb)->len - dataoff,
351 search[i].pattern,
352 search[i].plen,
353 search[i].skip,
354 search[i].term,
355 &matchoff, &matchlen,
356 array,
357 search[i].getnum);
358 if (found) break;
359 }
360 if (found == -1) {
361 /* We don't usually drop packets. After all, this is
362 connection tracking, not packet filtering.
363 However, it is necessary for accurate tracking in
364 this case. */
365 if (net_ratelimit())
366 printk("conntrack_ftp: partial %s %u+%u\n",
367 search[i].pattern,
368 ntohl(th->seq), datalen);
369 ret = NF_DROP;
370 goto out;
371 } else if (found == 0) { /* No match */
372 ret = NF_ACCEPT;
373 goto out_update_nl;
374 }
375
376 DEBUGP("conntrack_ftp: match `%s' (%u bytes at %u)\n",
377 fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff);
378
379 /* Allocate expectation which will be inserted */
380 exp = ip_conntrack_expect_alloc();
381 if (exp == NULL) {
382 ret = NF_DROP;
383 goto out;
384 }
385
386 /* We refer to the reverse direction ("!dir") tuples here,
387 * because we're expecting something in the other direction.
388 * Doesn't matter unless NAT is happening. */
389 exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
390
391 if (htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3])
392 != ct->tuplehash[dir].tuple.src.ip) {
393 /* Enrico Scholz's passive FTP to partially RNAT'd ftp
394 server: it really wants us to connect to a
395 different IP address. Simply don't record it for
396 NAT. */
397 DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n",
398 array[0], array[1], array[2], array[3],
399 NIPQUAD(ct->tuplehash[dir].tuple.src.ip));
400
401 /* Thanks to Cristiano Lincoln Mattos
402 <lincoln@cesar.org.br> for reporting this potential
403 problem (DMZ machines opening holes to internal
404 networks, or the packet filter itself). */
405 if (!loose) {
406 ret = NF_ACCEPT;
407 ip_conntrack_expect_free(exp);
408 goto out_update_nl;
409 }
410 exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16)
411 | (array[2] << 8) | array[3]);
412 }
413
414 exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
415 exp->tuple.dst.u.tcp.port = htons(array[4] << 8 | array[5]);
416 exp->tuple.src.u.tcp.port = 0; /* Don't care. */
417 exp->tuple.dst.protonum = IPPROTO_TCP;
418 exp->mask = ((struct ip_conntrack_tuple)
419 { { 0xFFFFFFFF, { 0 } },
420 { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
421
422 exp->expectfn = NULL;
423 exp->master = ct;
424
425 /* Now, NAT might want to mangle the packet, and register the
426 * (possibly changed) expectation itself. */
427 if (ip_nat_ftp_hook)
428 ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype,
429 matchoff, matchlen, exp, &seq);
430 else {
431 /* Can't expect this? Best to drop packet now. */
432 if (ip_conntrack_expect_related(exp) != 0) {
433 ip_conntrack_expect_free(exp);
434 ret = NF_DROP;
435 } else
436 ret = NF_ACCEPT;
437 }
438
439out_update_nl:
440 /* Now if this ends in \n, update ftp info. Seq may have been
441 * adjusted by NAT code. */
442 if (ends_in_nl)
443 update_nl_seq(seq, ct_ftp_info,dir);
444 out:
445 UNLOCK_BH(&ip_ftp_lock);
446 return ret;
447}
448
449static struct ip_conntrack_helper ftp[MAX_PORTS];
450static char ftp_names[MAX_PORTS][10];
451
452/* Not __exit: called from init() */
453static void fini(void)
454{
455 int i;
456 for (i = 0; i < ports_c; i++) {
457 DEBUGP("ip_ct_ftp: unregistering helper for port %d\n",
458 ports[i]);
459 ip_conntrack_helper_unregister(&ftp[i]);
460 }
461}
462
463static int __init init(void)
464{
465 int i, ret;
466 char *tmpname;
467
468 if (ports_c == 0)
469 ports[ports_c++] = FTP_PORT;
470
471 for (i = 0; i < ports_c; i++) {
472 ftp[i].tuple.src.u.tcp.port = htons(ports[i]);
473 ftp[i].tuple.dst.protonum = IPPROTO_TCP;
474 ftp[i].mask.src.u.tcp.port = 0xFFFF;
475 ftp[i].mask.dst.protonum = 0xFF;
476 ftp[i].max_expected = 1;
477 ftp[i].timeout = 5 * 60; /* 5 minutes */
478 ftp[i].me = THIS_MODULE;
479 ftp[i].help = help;
480
481 tmpname = &ftp_names[i][0];
482 if (ports[i] == FTP_PORT)
483 sprintf(tmpname, "ftp");
484 else
485 sprintf(tmpname, "ftp-%d", ports[i]);
486 ftp[i].name = tmpname;
487
488 DEBUGP("ip_ct_ftp: registering helper for port %d\n",
489 ports[i]);
490 ret = ip_conntrack_helper_register(&ftp[i]);
491
492 if (ret) {
493 fini();
494 return ret;
495 }
496 }
497 return 0;
498}
499
500module_init(init);
501module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
new file mode 100644
index 000000000000..33cc7348b6ee
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -0,0 +1,313 @@
1/* IRC extension for IP connection tracking, Version 1.21
2 * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org>
3 * based on RR's ip_conntrack_ftp.c
4 *
5 * ip_conntrack_irc.c,v 1.21 2002/02/05 14:49:26 laforge Exp
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Module load syntax:
13 * insmod ip_conntrack_irc.o ports=port1,port2,...port<MAX_PORTS>
14 * max_dcc_channels=n dcc_timeout=secs
15 *
16 * Please give the ports of all IRC servers you wish to connect to.
17 * If you don't specify ports, the default will be port 6667.
18 * With max_dcc_channels you can define the maximum number of
19 * not-yet-answered DCC channels per IRC session (default 8).
20 * With dcc_timeout you can specify how long the system waits for
21 * an expected DCC channel (default 300 seconds).
22 *
23 */
24
25#include <linux/config.h>
26#include <linux/module.h>
27#include <linux/netfilter.h>
28#include <linux/ip.h>
29#include <net/checksum.h>
30#include <net/tcp.h>
31
32#include <linux/netfilter_ipv4/lockhelp.h>
33#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
34#include <linux/netfilter_ipv4/ip_conntrack_irc.h>
35#include <linux/moduleparam.h>
36
37#define MAX_PORTS 8
38static int ports[MAX_PORTS];
39static int ports_c;
40static int max_dcc_channels = 8;
41static unsigned int dcc_timeout = 300;
42/* This is slow, but it's simple. --RR */
43static char irc_buffer[65536];
44static DECLARE_LOCK(irc_buffer_lock);
45
46unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
47 enum ip_conntrack_info ctinfo,
48 unsigned int matchoff,
49 unsigned int matchlen,
50 struct ip_conntrack_expect *exp);
51EXPORT_SYMBOL_GPL(ip_nat_irc_hook);
52
53MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
54MODULE_DESCRIPTION("IRC (DCC) connection tracking helper");
55MODULE_LICENSE("GPL");
56module_param_array(ports, int, &ports_c, 0400);
57MODULE_PARM_DESC(ports, "port numbers of IRC servers");
58module_param(max_dcc_channels, int, 0400);
59MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session");
60module_param(dcc_timeout, int, 0400);
61MODULE_PARM_DESC(dcc_timeout, "timeout (in seconds) for unestablished DCC channels");
62
63static char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " };
64#define MINMATCHLEN 5
65
66#if 0
67#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s:" format, \
68 __FILE__, __FUNCTION__ , ## args)
69#else
70#define DEBUGP(format, args...)
71#endif
72
73static int parse_dcc(char *data, char *data_end, u_int32_t *ip,
74 u_int16_t *port, char **ad_beg_p, char **ad_end_p)
75/* tries to get the ip_addr and port out of a dcc command
76 return value: -1 on failure, 0 on success
77 data pointer to first byte of DCC command data
78 data_end pointer to last byte of dcc command data
79 ip returns parsed ip of dcc command
80 port returns parsed port of dcc command
81 ad_beg_p returns pointer to first byte of addr data
82 ad_end_p returns pointer to last byte of addr data */
83{
84
85 /* at least 12: "AAAAAAAA P\1\n" */
86 while (*data++ != ' ')
87 if (data > data_end - 12)
88 return -1;
89
90 *ad_beg_p = data;
91 *ip = simple_strtoul(data, &data, 10);
92
93 /* skip blanks between ip and port */
94 while (*data == ' ') {
95 if (data >= data_end)
96 return -1;
97 data++;
98 }
99
100 *port = simple_strtoul(data, &data, 10);
101 *ad_end_p = data;
102
103 return 0;
104}
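/* Example (illustrative values): for the request
 * "\1DCC SEND file.txt 3232235778 6881\1" the helper below hands this
 * function the text after "SEND "; it skips the file name, then returns
 * ip = 3232235778 (192.168.1.2 in host byte order) and port = 6881, with
 * *ad_beg_p/*ad_end_p bracketing "3232235778 6881" so NAT can rewrite it
 * in place. */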
105
106static int help(struct sk_buff **pskb,
107 struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
108{
109 unsigned int dataoff;
110 struct tcphdr _tcph, *th;
111 char *data, *data_limit, *ib_ptr;
112 int dir = CTINFO2DIR(ctinfo);
113 struct ip_conntrack_expect *exp;
114 u32 seq;
115 u_int32_t dcc_ip;
116 u_int16_t dcc_port;
117 int i, ret = NF_ACCEPT;
118 char *addr_beg_p, *addr_end_p;
119
120 DEBUGP("entered\n");
121
122 /* If packet is coming from IRC server */
123 if (dir == IP_CT_DIR_REPLY)
124 return NF_ACCEPT;
125
126 /* Until there's been traffic both ways, don't look in packets. */
127 if (ctinfo != IP_CT_ESTABLISHED
128 && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
129 DEBUGP("Conntrackinfo = %u\n", ctinfo);
130 return NF_ACCEPT;
131 }
132
133 /* Not a full tcp header? */
134 th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
135 sizeof(_tcph), &_tcph);
136 if (th == NULL)
137 return NF_ACCEPT;
138
139 /* No data? */
140 dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4;
141 if (dataoff >= (*pskb)->len)
142 return NF_ACCEPT;
143
144 LOCK_BH(&irc_buffer_lock);
145 ib_ptr = skb_header_pointer(*pskb, dataoff,
146 (*pskb)->len - dataoff, irc_buffer);
147 BUG_ON(ib_ptr == NULL);
148
149 data = ib_ptr;
150 data_limit = ib_ptr + (*pskb)->len - dataoff;
151
152 /* strlen("\1DCC SEND t AAAAAAAA P\1\n") == 24,
153 * i.e. 5 ("\1DCC ") + MINMATCHLEN + strlen("t AAAAAAAA P\1\n") == 5+5+14 */
154 while (data < (data_limit - (19 + MINMATCHLEN))) {
155 if (memcmp(data, "\1DCC ", 5)) {
156 data++;
157 continue;
158 }
159
160 data += 5;
161 /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */
162
163 DEBUGP("DCC found in master %u.%u.%u.%u:%u %u.%u.%u.%u:%u...\n",
164 NIPQUAD((*pskb)->nh.iph->saddr), ntohs(th->source),
165 NIPQUAD((*pskb)->nh.iph->daddr), ntohs(th->dest));
166
167 for (i = 0; i < ARRAY_SIZE(dccprotos); i++) {
168 if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) {
169 /* no match */
170 continue;
171 }
172
173 DEBUGP("DCC %s detected\n", dccprotos[i]);
174 data += strlen(dccprotos[i]);
175 /* we have at least
176 * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid
177 * data left (== 14/13 bytes) */
178 if (parse_dcc((char *)data, data_limit, &dcc_ip,
179 &dcc_port, &addr_beg_p, &addr_end_p)) {
180 /* unable to parse */
181 DEBUGP("unable to parse dcc command\n");
182 continue;
183 }
184 DEBUGP("DCC bound ip/port: %u.%u.%u.%u:%u\n",
185 HIPQUAD(dcc_ip), dcc_port);
186
187 /* dcc_ip can be the internal OR external (NAT'ed) IP
188 * Tiago Sousa <mirage@kaotik.org> */
189 if (ct->tuplehash[dir].tuple.src.ip != htonl(dcc_ip)
190 && ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip != htonl(dcc_ip)) {
191 if (net_ratelimit())
192 printk(KERN_WARNING
193 "Forged DCC command from "
194 "%u.%u.%u.%u: %u.%u.%u.%u:%u\n",
195 NIPQUAD(ct->tuplehash[dir].tuple.src.ip),
196 HIPQUAD(dcc_ip), dcc_port);
197
198 continue;
199 }
200
201 exp = ip_conntrack_expect_alloc();
202 if (exp == NULL) {
203 ret = NF_DROP;
204 goto out;
205 }
206
207 /* save position of address in dcc string,
208 * necessary for NAT */
209 DEBUGP("tcph->seq = %u\n", th->seq);
210 seq = ntohl(th->seq) + (addr_beg_p - ib_ptr);
211
212 /* We refer to the reverse direction ("!dir")
213 * tuples here, because we're expecting
214 * something in the other direction.
215 * Doesn't matter unless NAT is happening. */
216 exp->tuple = ((struct ip_conntrack_tuple)
217 { { 0, { 0 } },
218 { ct->tuplehash[!dir].tuple.dst.ip,
219 { .tcp = { htons(dcc_port) } },
220 IPPROTO_TCP }});
221 exp->mask = ((struct ip_conntrack_tuple)
222 { { 0, { 0 } },
223 { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
224 exp->expectfn = NULL;
225 exp->master = ct;
226 if (ip_nat_irc_hook)
227 ret = ip_nat_irc_hook(pskb, ctinfo,
228 addr_beg_p - ib_ptr,
229 addr_end_p - addr_beg_p,
230 exp);
231 else if (ip_conntrack_expect_related(exp) != 0) {
232 ip_conntrack_expect_free(exp);
233 ret = NF_DROP;
234 }
235 goto out;
236 } /* for .. NUM_DCCPROTO */
237 } /* while data < ... */
238
239 out:
240 UNLOCK_BH(&irc_buffer_lock);
241 return ret;
242}
243
244static struct ip_conntrack_helper irc_helpers[MAX_PORTS];
245static char irc_names[MAX_PORTS][10];
246
247static void fini(void);
248
249static int __init init(void)
250{
251 int i, ret;
252 struct ip_conntrack_helper *hlpr;
253 char *tmpname;
254
255 if (max_dcc_channels < 1) {
256 printk("ip_conntrack_irc: max_dcc_channels must be a positive integer\n");
257 return -EBUSY;
258 }
259 if (dcc_timeout < 1) {
260 printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n");
261 return -EBUSY;
262 }
263
264 /* If no port given, default to standard irc port */
265 if (ports_c == 0)
266 ports[ports_c++] = IRC_PORT;
267
268 for (i = 0; i < ports_c; i++) {
269 hlpr = &irc_helpers[i];
270 hlpr->tuple.src.u.tcp.port = htons(ports[i]);
271 hlpr->tuple.dst.protonum = IPPROTO_TCP;
272 hlpr->mask.src.u.tcp.port = 0xFFFF;
273 hlpr->mask.dst.protonum = 0xFF;
274 hlpr->max_expected = max_dcc_channels;
275 hlpr->timeout = dcc_timeout;
276 hlpr->me = THIS_MODULE;
277 hlpr->help = help;
278
279 tmpname = &irc_names[i][0];
280 if (ports[i] == IRC_PORT)
281 sprintf(tmpname, "irc");
282 else
283 sprintf(tmpname, "irc-%d", ports[i]);
284 hlpr->name = tmpname;
285
286 DEBUGP("port #%d: %d\n", i, ports[i]);
287
288 ret = ip_conntrack_helper_register(hlpr);
289
290 if (ret) {
291 printk("ip_conntrack_irc: ERROR registering port %d\n",
292 ports[i]);
293 fini();
294 return -EBUSY;
295 }
296 }
297 return 0;
298}
299
300/* This function is intentionally _NOT_ defined as __exit, because
301 * it is needed by the init function */
302static void fini(void)
303{
304 int i;
305 for (i = 0; i < ports_c; i++) {
306 DEBUGP("unregistering port %d\n",
307 ports[i]);
308 ip_conntrack_helper_unregister(&irc_helpers[i]);
309 }
310}
311
312module_init(init);
313module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c
new file mode 100644
index 000000000000..88c3712bd251
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_generic.c
@@ -0,0 +1,75 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/sched.h>
11#include <linux/timer.h>
12#include <linux/netfilter.h>
13#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
14
15unsigned long ip_ct_generic_timeout = 600*HZ;
16
17static int generic_pkt_to_tuple(const struct sk_buff *skb,
18 unsigned int dataoff,
19 struct ip_conntrack_tuple *tuple)
20{
21 tuple->src.u.all = 0;
22 tuple->dst.u.all = 0;
23
24 return 1;
25}
26
27static int generic_invert_tuple(struct ip_conntrack_tuple *tuple,
28 const struct ip_conntrack_tuple *orig)
29{
30 tuple->src.u.all = 0;
31 tuple->dst.u.all = 0;
32
33 return 1;
34}
35
36/* Print out the per-protocol part of the tuple. */
37static int generic_print_tuple(struct seq_file *s,
38 const struct ip_conntrack_tuple *tuple)
39{
40 return 0;
41}
42
43/* Print out the private part of the conntrack. */
44static int generic_print_conntrack(struct seq_file *s,
45 const struct ip_conntrack *state)
46{
47 return 0;
48}
49
50/* Returns verdict for packet, or -1 for invalid. */
51static int packet(struct ip_conntrack *conntrack,
52 const struct sk_buff *skb,
53 enum ip_conntrack_info ctinfo)
54{
55 ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout);
56 return NF_ACCEPT;
57}
58
59/* Called when a new connection for this protocol found. */
60static int new(struct ip_conntrack *conntrack, const struct sk_buff *skb)
61{
62 return 1;
63}
64
65struct ip_conntrack_protocol ip_conntrack_generic_protocol =
66{
67 .proto = 0,
68 .name = "unknown",
69 .pkt_to_tuple = generic_pkt_to_tuple,
70 .invert_tuple = generic_invert_tuple,
71 .print_tuple = generic_print_tuple,
72 .print_conntrack = generic_print_conntrack,
73 .packet = packet,
74 .new = new,
75};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
new file mode 100644
index 000000000000..602c74db3252
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -0,0 +1,279 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/sched.h>
11#include <linux/timer.h>
12#include <linux/netfilter.h>
13#include <linux/in.h>
14#include <linux/icmp.h>
15#include <linux/seq_file.h>
16#include <net/ip.h>
17#include <net/checksum.h>
18#include <linux/netfilter.h>
19#include <linux/netfilter_ipv4.h>
20#include <linux/netfilter_ipv4/ip_conntrack.h>
21#include <linux/netfilter_ipv4/ip_conntrack_core.h>
22#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
23
24unsigned long ip_ct_icmp_timeout = 30*HZ;
25
26#if 0
27#define DEBUGP printk
28#else
29#define DEBUGP(format, args...)
30#endif
31
32static int icmp_pkt_to_tuple(const struct sk_buff *skb,
33 unsigned int dataoff,
34 struct ip_conntrack_tuple *tuple)
35{
36 struct icmphdr _hdr, *hp;
37
38 hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
39 if (hp == NULL)
40 return 0;
41
42 tuple->dst.u.icmp.type = hp->type;
43 tuple->src.u.icmp.id = hp->un.echo.id;
44 tuple->dst.u.icmp.code = hp->code;
45
46 return 1;
47}
48
49static int icmp_invert_tuple(struct ip_conntrack_tuple *tuple,
50 const struct ip_conntrack_tuple *orig)
51{
52 /* Add 1; spaces filled with 0. */
53 static u_int8_t invmap[]
54 = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1,
55 [ICMP_ECHOREPLY] = ICMP_ECHO + 1,
56 [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
57 [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
58 [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
59 [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
60 [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
61 [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1};
62
63 if (orig->dst.u.icmp.type >= sizeof(invmap)
64 || !invmap[orig->dst.u.icmp.type])
65 return 0;
66
67 tuple->src.u.icmp.id = orig->src.u.icmp.id;
68 tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;
69 tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
70 return 1;
71}
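/*
 * Worked example of the "+ 1" encoding above (a sketch, not in the patch):
 * ICMP_ECHOREPLY is 0, so a plain invmap[type] == 0 could mean either
 * "maps to echo reply" or "no mapping".  Storing the target + 1 keeps 0
 * free as the "no mapping" marker:
 *
 *   invmap[ICMP_ECHO] == ICMP_ECHOREPLY + 1 == 1
 *       -> inverted type = 1 - 1 = ICMP_ECHOREPLY
 *   invmap[ICMP_DEST_UNREACH] == 0
 *       -> icmp_invert_tuple() returns 0 (not invertible)
 */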
72
73/* Print out the per-protocol part of the tuple. */
74static int icmp_print_tuple(struct seq_file *s,
75 const struct ip_conntrack_tuple *tuple)
76{
77 return seq_printf(s, "type=%u code=%u id=%u ",
78 tuple->dst.u.icmp.type,
79 tuple->dst.u.icmp.code,
80 ntohs(tuple->src.u.icmp.id));
81}
82
83/* Print out the private part of the conntrack. */
84static int icmp_print_conntrack(struct seq_file *s,
85 const struct ip_conntrack *conntrack)
86{
87 return 0;
88}
89
90/* Returns verdict for packet, or -1 for invalid. */
91static int icmp_packet(struct ip_conntrack *ct,
92 const struct sk_buff *skb,
93 enum ip_conntrack_info ctinfo)
94{
95 /* Try to delete connection immediately after all replies:
96 won't actually vanish as we still have skb, and del_timer
97 means this will only run once even if count hits zero twice
98 (theoretically possible with SMP) */
99 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
100 if (atomic_dec_and_test(&ct->proto.icmp.count)
101 && del_timer(&ct->timeout))
102 ct->timeout.function((unsigned long)ct);
103 } else {
104 atomic_inc(&ct->proto.icmp.count);
105 ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
106 }
107
108 return NF_ACCEPT;
109}
110
111/* Called when a new connection for this protocol found. */
112static int icmp_new(struct ip_conntrack *conntrack,
113 const struct sk_buff *skb)
114{
115 static u_int8_t valid_new[]
116 = { [ICMP_ECHO] = 1,
117 [ICMP_TIMESTAMP] = 1,
118 [ICMP_INFO_REQUEST] = 1,
119 [ICMP_ADDRESS] = 1 };
120
121 if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
122 || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
123 /* Can't create a new ICMP `conn' with this. */
124 DEBUGP("icmp: can't create new conn with type %u\n",
125 conntrack->tuplehash[0].tuple.dst.u.icmp.type);
126 DUMP_TUPLE(&conntrack->tuplehash[0].tuple);
127 return 0;
128 }
129 atomic_set(&conntrack->proto.icmp.count, 0);
130 return 1;
131}
132
133static int
134icmp_error_message(struct sk_buff *skb,
135 enum ip_conntrack_info *ctinfo,
136 unsigned int hooknum)
137{
138 struct ip_conntrack_tuple innertuple, origtuple;
139 struct {
140 struct icmphdr icmp;
141 struct iphdr ip;
142 } _in, *inside;
143 struct ip_conntrack_protocol *innerproto;
144 struct ip_conntrack_tuple_hash *h;
145 int dataoff;
146
147 IP_NF_ASSERT(skb->nfct == NULL);
148
149 /* Not enough header? */
150 inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in);
151 if (inside == NULL)
152 return NF_ACCEPT;
153
154 /* Ignore ICMP's containing fragments (shouldn't happen) */
155 if (inside->ip.frag_off & htons(IP_OFFSET)) {
156 DEBUGP("icmp_error_track: fragment of proto %u\n",
157 inside->ip.protocol);
158 return NF_ACCEPT;
159 }
160
161 innerproto = ip_ct_find_proto(inside->ip.protocol);
162 dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4;
163 /* Are they talking about one of our connections? */
164 if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) {
165 DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol);
166 return NF_ACCEPT;
167 }
168
169 /* Ordinarily, we'd expect the inverted tupleproto, but it's
170 been preserved inside the ICMP. */
171 if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
172 DEBUGP("icmp_error_track: Can't invert tuple\n");
173 return NF_ACCEPT;
174 }
175
176 *ctinfo = IP_CT_RELATED;
177
178 h = ip_conntrack_find_get(&innertuple, NULL);
179 if (!h) {
180 /* Locally generated ICMPs will match inverted if they
181 haven't been SNAT'ed yet */
182 /* FIXME: NAT code has to handle half-done double NAT --RR */
183 if (hooknum == NF_IP_LOCAL_OUT)
184 h = ip_conntrack_find_get(&origtuple, NULL);
185
186 if (!h) {
187 DEBUGP("icmp_error_track: no match\n");
188 return NF_ACCEPT;
189 }
190 /* Reverse direction from that found */
191 if (DIRECTION(h) != IP_CT_DIR_REPLY)
192 *ctinfo += IP_CT_IS_REPLY;
193 } else {
194 if (DIRECTION(h) == IP_CT_DIR_REPLY)
195 *ctinfo += IP_CT_IS_REPLY;
196 }
197
198 /* Update skb to refer to this connection */
199 skb->nfct = &tuplehash_to_ctrack(h)->ct_general;
200 skb->nfctinfo = *ctinfo;
201 return -NF_ACCEPT;
202}
203
204/* Small and modified version of icmp_rcv */
205static int
206icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
207 unsigned int hooknum)
208{
209 struct icmphdr _ih, *icmph;
210
211 /* Not enough header? */
212 icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih);
213 if (icmph == NULL) {
214 if (LOG_INVALID(IPPROTO_ICMP))
215 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
216 "ip_ct_icmp: short packet ");
217 return -NF_ACCEPT;
218 }
219
220 /* See ip_conntrack_proto_tcp.c */
221 if (hooknum != NF_IP_PRE_ROUTING)
222 goto checksum_skipped;
223
224 switch (skb->ip_summed) {
225 case CHECKSUM_HW:
226 if (!(u16)csum_fold(skb->csum))
227 break;
228 if (LOG_INVALID(IPPROTO_ICMP))
229 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
230 "ip_ct_icmp: bad HW ICMP checksum ");
231 return -NF_ACCEPT;
232 case CHECKSUM_NONE:
233 if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) {
234 if (LOG_INVALID(IPPROTO_ICMP))
235 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
236 "ip_ct_icmp: bad ICMP checksum ");
237 return -NF_ACCEPT;
238 }
239 default:
240 break;
241 }
242
243checksum_skipped:
244 /*
245 * 18 is the highest 'known' ICMP type. Anything else is a mystery
246 *
247	 * RFC 1122: 3.2.2 Unknown ICMP message types MUST be silently
248 * discarded.
249 */
250 if (icmph->type > NR_ICMP_TYPES) {
251 if (LOG_INVALID(IPPROTO_ICMP))
252 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
253 "ip_ct_icmp: invalid ICMP type ");
254 return -NF_ACCEPT;
255 }
256
257 /* Need to track icmp error message? */
258 if (icmph->type != ICMP_DEST_UNREACH
259 && icmph->type != ICMP_SOURCE_QUENCH
260 && icmph->type != ICMP_TIME_EXCEEDED
261 && icmph->type != ICMP_PARAMETERPROB
262 && icmph->type != ICMP_REDIRECT)
263 return NF_ACCEPT;
264
265 return icmp_error_message(skb, ctinfo, hooknum);
266}
267
268struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
269{
270 .proto = IPPROTO_ICMP,
271 .name = "icmp",
272 .pkt_to_tuple = icmp_pkt_to_tuple,
273 .invert_tuple = icmp_invert_tuple,
274 .print_tuple = icmp_print_tuple,
275 .print_conntrack = icmp_print_conntrack,
276 .packet = icmp_packet,
277 .new = icmp_new,
278 .error = icmp_error,
279};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
new file mode 100644
index 000000000000..ff8c34a860ff
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -0,0 +1,649 @@
1/*
2 * Connection tracking protocol helper module for SCTP.
3 *
4 * SCTP is defined in RFC 2960. References to various sections in this code
5 * are to this RFC.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12/*
13 * Added support for proc manipulation of timeouts.
14 */
15
16#include <linux/types.h>
17#include <linux/sched.h>
18#include <linux/timer.h>
19#include <linux/netfilter.h>
20#include <linux/module.h>
21#include <linux/in.h>
22#include <linux/ip.h>
23#include <linux/sctp.h>
24#include <linux/string.h>
25#include <linux/seq_file.h>
26
27#include <linux/netfilter_ipv4/ip_conntrack.h>
28#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
29#include <linux/netfilter_ipv4/lockhelp.h>
30
31#if 0
32#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__)
33#else
34#define DEBUGP(format, args...)
35#endif
36
37/* Protects conntrack->proto.sctp */
38static DECLARE_RWLOCK(sctp_lock);
39
40/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
41 closely. They're more complex. --RR
42
43 And so for me for SCTP :D -Kiran */
44
45static const char *sctp_conntrack_names[] = {
46 "NONE",
47 "CLOSED",
48 "COOKIE_WAIT",
49 "COOKIE_ECHOED",
50 "ESTABLISHED",
51 "SHUTDOWN_SENT",
52 "SHUTDOWN_RECD",
53 "SHUTDOWN_ACK_SENT",
54};
55
56#define SECS * HZ
57#define MINS * 60 SECS
58#define HOURS * 60 MINS
59#define DAYS * 24 HOURS
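/*
 * Expansion example (sketch): "3 SECS" becomes "3 * HZ" and "5 DAYS"
 * becomes "5 * 24 * 60 * 60 * HZ", so the timeout values below are plain
 * jiffies counts.
 */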
60
61static unsigned long ip_ct_sctp_timeout_closed = 10 SECS;
62static unsigned long ip_ct_sctp_timeout_cookie_wait = 3 SECS;
63static unsigned long ip_ct_sctp_timeout_cookie_echoed = 3 SECS;
64static unsigned long ip_ct_sctp_timeout_established = 5 DAYS;
65static unsigned long ip_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000;
66static unsigned long ip_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000;
67static unsigned long ip_ct_sctp_timeout_shutdown_ack_sent = 3 SECS;
68
69static unsigned long * sctp_timeouts[]
70= { NULL, /* SCTP_CONNTRACK_NONE */
71 &ip_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */
72 &ip_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */
73 &ip_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */
74 &ip_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */
75 &ip_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */
76 &ip_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */
77 &ip_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */
78 };
79
80#define sNO SCTP_CONNTRACK_NONE
81#define sCL SCTP_CONNTRACK_CLOSED
82#define sCW SCTP_CONNTRACK_COOKIE_WAIT
83#define sCE SCTP_CONNTRACK_COOKIE_ECHOED
84#define sES SCTP_CONNTRACK_ESTABLISHED
85#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT
86#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD
87#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
88#define sIV SCTP_CONNTRACK_MAX
89
90/*
91 These are the descriptions of the states:
92
93NOTE: These state names are tantalizingly similar to the states of an
94SCTP endpoint. But the interpretation of the states is a little different,
95considering that these are the states of the connection and not of an end
96point. Please note the subtleties. -Kiran
97
98NONE - Nothing so far.
99COOKIE WAIT - We have seen an INIT chunk in the original direction, or also
100 an INIT_ACK chunk in the reply direction.
101COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction.
102ESTABLISHED - We have seen a COOKIE_ACK in the reply direction.
103SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction.
104SHUTDOWN_RECD     - We have seen a SHUTDOWN chunk in the reply direction.
105SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
106 to that of the SHUTDOWN chunk.
107CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
108 the SHUTDOWN chunk. Connection is closed.
109*/
110
111/* TODO
112 - I have assumed that the first INIT is in the original direction.
113	This messes things up when an INIT comes in the reply direction in CLOSED
114 state.
115 - Check the error type in the reply dir before transitioning from
116cookie echoed to closed.
117 - Sec 5.2.4 of RFC 2960
118 - Multi Homing support.
119*/
120
121/* SCTP conntrack state transitions */
122static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
123 {
124/* ORIGINAL */
125/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
126/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA},
127/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},
128/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
129/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA},
130/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA},
131/* error       */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie*/
132/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */
133/* cookie_ack   */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */
134/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL}
135 },
136 {
137/* REPLY */
138/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
139/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */
140/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},
141/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
142/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA},
143/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA},
144/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA},
145/* cookie_echo  */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */
146/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA},
147/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL}
148 }
149};
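/*
 * Indexing example (sketch): an INIT chunk seen in the ORIGINAL direction
 * while no state exists yet looks up row 0 ("init") of the first table:
 *
 *   sctp_conntracks[IP_CT_DIR_ORIGINAL][0][SCTP_CONNTRACK_NONE] == sCW
 *
 * i.e. the connection moves to COOKIE_WAIT, matching the state
 * descriptions above.  new_state() maps chunk types to the row index.
 */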
150
151static int sctp_pkt_to_tuple(const struct sk_buff *skb,
152 unsigned int dataoff,
153 struct ip_conntrack_tuple *tuple)
154{
155 sctp_sctphdr_t _hdr, *hp;
156
157 DEBUGP(__FUNCTION__);
158 DEBUGP("\n");
159
160 /* Actually only need first 8 bytes. */
161 hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
162 if (hp == NULL)
163 return 0;
164
165 tuple->src.u.sctp.port = hp->source;
166 tuple->dst.u.sctp.port = hp->dest;
167 return 1;
168}
169
170static int sctp_invert_tuple(struct ip_conntrack_tuple *tuple,
171 const struct ip_conntrack_tuple *orig)
172{
173 DEBUGP(__FUNCTION__);
174 DEBUGP("\n");
175
176 tuple->src.u.sctp.port = orig->dst.u.sctp.port;
177 tuple->dst.u.sctp.port = orig->src.u.sctp.port;
178 return 1;
179}
180
181/* Print out the per-protocol part of the tuple. */
182static int sctp_print_tuple(struct seq_file *s,
183 const struct ip_conntrack_tuple *tuple)
184{
185 DEBUGP(__FUNCTION__);
186 DEBUGP("\n");
187
188 return seq_printf(s, "sport=%hu dport=%hu ",
189 ntohs(tuple->src.u.sctp.port),
190 ntohs(tuple->dst.u.sctp.port));
191}
192
193/* Print out the private part of the conntrack. */
194static int sctp_print_conntrack(struct seq_file *s,
195 const struct ip_conntrack *conntrack)
196{
197 enum sctp_conntrack state;
198
199 DEBUGP(__FUNCTION__);
200 DEBUGP("\n");
201
202 READ_LOCK(&sctp_lock);
203 state = conntrack->proto.sctp.state;
204 READ_UNLOCK(&sctp_lock);
205
206 return seq_printf(s, "%s ", sctp_conntrack_names[state]);
207}
208
209#define for_each_sctp_chunk(skb, sch, _sch, offset, count) \
210for (offset = skb->nh.iph->ihl * 4 + sizeof(sctp_sctphdr_t), count = 0; \
211 offset < skb->len && \
212 (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \
213 offset += (htons(sch->length) + 3) & ~3, count++)
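/*
 * Offset arithmetic example (sketch): once the on-wire length field is
 * converted, chunk lengths are padded to a 4-byte boundary before
 * advancing, so a chunk of length 7 steps the offset by
 * (7 + 3) & ~3 == 8 bytes to reach the next chunk header.
 */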
214
215/* Some validity checks to make sure the chunks are fine */
216static int do_basic_checks(struct ip_conntrack *conntrack,
217 const struct sk_buff *skb,
218 char *map)
219{
220 u_int32_t offset, count;
221 sctp_chunkhdr_t _sch, *sch;
222 int flag;
223
224 DEBUGP(__FUNCTION__);
225 DEBUGP("\n");
226
227 flag = 0;
228
229 for_each_sctp_chunk (skb, sch, _sch, offset, count) {
230 DEBUGP("Chunk Num: %d Type: %d\n", count, sch->type);
231
232 if (sch->type == SCTP_CID_INIT
233 || sch->type == SCTP_CID_INIT_ACK
234 || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
235 flag = 1;
236 }
237
238 /* Cookie Ack/Echo chunks not the first OR
239 Init / Init Ack / Shutdown compl chunks not the only chunks */
240 if ((sch->type == SCTP_CID_COOKIE_ACK
241 || sch->type == SCTP_CID_COOKIE_ECHO
242 || flag)
243 && count !=0 ) {
244 DEBUGP("Basic checks failed\n");
245 return 1;
246 }
247
248 if (map) {
249 set_bit(sch->type, (void *)map);
250 }
251 }
252
253 DEBUGP("Basic checks passed\n");
254 return 0;
255}
256
257static int new_state(enum ip_conntrack_dir dir,
258 enum sctp_conntrack cur_state,
259 int chunk_type)
260{
261 int i;
262
263 DEBUGP(__FUNCTION__);
264 DEBUGP("\n");
265
266 DEBUGP("Chunk type: %d\n", chunk_type);
267
268 switch (chunk_type) {
269 case SCTP_CID_INIT:
270 DEBUGP("SCTP_CID_INIT\n");
271 i = 0; break;
272 case SCTP_CID_INIT_ACK:
273 DEBUGP("SCTP_CID_INIT_ACK\n");
274 i = 1; break;
275 case SCTP_CID_ABORT:
276 DEBUGP("SCTP_CID_ABORT\n");
277 i = 2; break;
278 case SCTP_CID_SHUTDOWN:
279 DEBUGP("SCTP_CID_SHUTDOWN\n");
280 i = 3; break;
281 case SCTP_CID_SHUTDOWN_ACK:
282 DEBUGP("SCTP_CID_SHUTDOWN_ACK\n");
283 i = 4; break;
284 case SCTP_CID_ERROR:
285 DEBUGP("SCTP_CID_ERROR\n");
286 i = 5; break;
287 case SCTP_CID_COOKIE_ECHO:
288 DEBUGP("SCTP_CID_COOKIE_ECHO\n");
289 i = 6; break;
290 case SCTP_CID_COOKIE_ACK:
291 DEBUGP("SCTP_CID_COOKIE_ACK\n");
292 i = 7; break;
293 case SCTP_CID_SHUTDOWN_COMPLETE:
294 DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n");
295 i = 8; break;
296 default:
297 /* Other chunks like DATA, SACK, HEARTBEAT and
298 its ACK do not cause a change in state */
299 DEBUGP("Unknown chunk type, Will stay in %s\n",
300 sctp_conntrack_names[cur_state]);
301 return cur_state;
302 }
303
304 DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n",
305 dir, sctp_conntrack_names[cur_state], chunk_type,
306 sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]);
307
308 return sctp_conntracks[dir][i][cur_state];
309}
310
311/* Returns verdict for packet, or -1 for invalid. */
312static int sctp_packet(struct ip_conntrack *conntrack,
313 const struct sk_buff *skb,
314 enum ip_conntrack_info ctinfo)
315{
316 enum sctp_conntrack newconntrack, oldsctpstate;
317 struct iphdr *iph = skb->nh.iph;
318 sctp_sctphdr_t _sctph, *sh;
319 sctp_chunkhdr_t _sch, *sch;
320 u_int32_t offset, count;
321 char map[256 / sizeof (char)] = {0};
322
323 DEBUGP(__FUNCTION__);
324 DEBUGP("\n");
325
326 sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph);
327 if (sh == NULL)
328 return -1;
329
330 if (do_basic_checks(conntrack, skb, map) != 0)
331 return -1;
332
333 /* Check the verification tag (Sec 8.5) */
334 if (!test_bit(SCTP_CID_INIT, (void *)map)
335 && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)
336 && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map)
337 && !test_bit(SCTP_CID_ABORT, (void *)map)
338 && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map)
339 && (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
340 DEBUGP("Verification tag check failed\n");
341 return -1;
342 }
343
344 oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX;
345 for_each_sctp_chunk (skb, sch, _sch, offset, count) {
346 WRITE_LOCK(&sctp_lock);
347
348 /* Special cases of Verification tag check (Sec 8.5.1) */
349 if (sch->type == SCTP_CID_INIT) {
350 /* Sec 8.5.1 (A) */
351 if (sh->vtag != 0) {
352 WRITE_UNLOCK(&sctp_lock);
353 return -1;
354 }
355 } else if (sch->type == SCTP_CID_ABORT) {
356 /* Sec 8.5.1 (B) */
357 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
358 && !(sh->vtag == conntrack->proto.sctp.vtag
359 [1 - CTINFO2DIR(ctinfo)])) {
360 WRITE_UNLOCK(&sctp_lock);
361 return -1;
362 }
363 } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
364 /* Sec 8.5.1 (C) */
365 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
366 && !(sh->vtag == conntrack->proto.sctp.vtag
367 [1 - CTINFO2DIR(ctinfo)]
368 && (sch->flags & 1))) {
369 WRITE_UNLOCK(&sctp_lock);
370 return -1;
371 }
372 } else if (sch->type == SCTP_CID_COOKIE_ECHO) {
373 /* Sec 8.5.1 (D) */
374 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
375 WRITE_UNLOCK(&sctp_lock);
376 return -1;
377 }
378 }
379
380 oldsctpstate = conntrack->proto.sctp.state;
381 newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type);
382
383 /* Invalid */
384 if (newconntrack == SCTP_CONNTRACK_MAX) {
385 DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n",
386 CTINFO2DIR(ctinfo), sch->type, oldsctpstate);
387 WRITE_UNLOCK(&sctp_lock);
388 return -1;
389 }
390
391 /* If it is an INIT or an INIT ACK note down the vtag */
392 if (sch->type == SCTP_CID_INIT
393 || sch->type == SCTP_CID_INIT_ACK) {
394 sctp_inithdr_t _inithdr, *ih;
395
396 ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
397 sizeof(_inithdr), &_inithdr);
398 if (ih == NULL) {
399 WRITE_UNLOCK(&sctp_lock);
400 return -1;
401 }
402 DEBUGP("Setting vtag %x for dir %d\n",
403 ih->init_tag, !CTINFO2DIR(ctinfo));
404 conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag;
405 }
406
407 conntrack->proto.sctp.state = newconntrack;
408 WRITE_UNLOCK(&sctp_lock);
409 }
410
411 ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);
412
413 if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED
414 && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY
415 && newconntrack == SCTP_CONNTRACK_ESTABLISHED) {
416 DEBUGP("Setting assured bit\n");
417 set_bit(IPS_ASSURED_BIT, &conntrack->status);
418 }
419
420 return NF_ACCEPT;
421}
422
423/* Called when a new connection for this protocol found. */
424static int sctp_new(struct ip_conntrack *conntrack,
425 const struct sk_buff *skb)
426{
427 enum sctp_conntrack newconntrack;
428 struct iphdr *iph = skb->nh.iph;
429 sctp_sctphdr_t _sctph, *sh;
430 sctp_chunkhdr_t _sch, *sch;
431 u_int32_t offset, count;
432 char map[256 / sizeof (char)] = {0};
433
434 DEBUGP(__FUNCTION__);
435 DEBUGP("\n");
436
437 sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph);
438 if (sh == NULL)
439 return 0;
440
441 if (do_basic_checks(conntrack, skb, map) != 0)
442 return 0;
443
444 /* If an OOTB packet has any of these chunks discard (Sec 8.4) */
445 if ((test_bit (SCTP_CID_ABORT, (void *)map))
446 || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map))
447 || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) {
448 return 0;
449 }
450
451 newconntrack = SCTP_CONNTRACK_MAX;
452 for_each_sctp_chunk (skb, sch, _sch, offset, count) {
453 /* Don't need lock here: this conntrack not in circulation yet */
454 newconntrack = new_state (IP_CT_DIR_ORIGINAL,
455 SCTP_CONNTRACK_NONE, sch->type);
456
457 /* Invalid: delete conntrack */
458 if (newconntrack == SCTP_CONNTRACK_MAX) {
459 DEBUGP("ip_conntrack_sctp: invalid new deleting.\n");
460 return 0;
461 }
462
463 /* Copy the vtag into the state info */
464 if (sch->type == SCTP_CID_INIT) {
465 if (sh->vtag == 0) {
466 sctp_inithdr_t _inithdr, *ih;
467
468 ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
469 sizeof(_inithdr), &_inithdr);
470 if (ih == NULL)
471 return 0;
472
473 DEBUGP("Setting vtag %x for new conn\n",
474 ih->init_tag);
475
476 conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] =
477 ih->init_tag;
478 } else {
479 /* Sec 8.5.1 (A) */
480 return 0;
481 }
482 }
483 /* If it is a shutdown ack OOTB packet, we expect a return
484 shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
485 else {
486 DEBUGP("Setting vtag %x for new conn OOTB\n",
487 sh->vtag);
488 conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
489 }
490
491 conntrack->proto.sctp.state = newconntrack;
492 }
493
494 return 1;
495}
496
497static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = {
498 .proto = IPPROTO_SCTP,
499 .name = "sctp",
500 .pkt_to_tuple = sctp_pkt_to_tuple,
501 .invert_tuple = sctp_invert_tuple,
502 .print_tuple = sctp_print_tuple,
503 .print_conntrack = sctp_print_conntrack,
504 .packet = sctp_packet,
505 .new = sctp_new,
506 .destroy = NULL,
507 .me = THIS_MODULE
508};
509
510#ifdef CONFIG_SYSCTL
511static ctl_table ip_ct_sysctl_table[] = {
512 {
513 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED,
514 .procname = "ip_conntrack_sctp_timeout_closed",
515 .data = &ip_ct_sctp_timeout_closed,
516 .maxlen = sizeof(unsigned int),
517 .mode = 0644,
518 .proc_handler = &proc_dointvec_jiffies,
519 },
520 {
521 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT,
522 .procname = "ip_conntrack_sctp_timeout_cookie_wait",
523 .data = &ip_ct_sctp_timeout_cookie_wait,
524 .maxlen = sizeof(unsigned int),
525 .mode = 0644,
526 .proc_handler = &proc_dointvec_jiffies,
527 },
528 {
529 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED,
530 .procname = "ip_conntrack_sctp_timeout_cookie_echoed",
531 .data = &ip_ct_sctp_timeout_cookie_echoed,
532 .maxlen = sizeof(unsigned int),
533 .mode = 0644,
534 .proc_handler = &proc_dointvec_jiffies,
535 },
536 {
537 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED,
538 .procname = "ip_conntrack_sctp_timeout_established",
539 .data = &ip_ct_sctp_timeout_established,
540 .maxlen = sizeof(unsigned int),
541 .mode = 0644,
542 .proc_handler = &proc_dointvec_jiffies,
543 },
544 {
545 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT,
546 .procname = "ip_conntrack_sctp_timeout_shutdown_sent",
547 .data = &ip_ct_sctp_timeout_shutdown_sent,
548 .maxlen = sizeof(unsigned int),
549 .mode = 0644,
550 .proc_handler = &proc_dointvec_jiffies,
551 },
552 {
553 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD,
554 .procname = "ip_conntrack_sctp_timeout_shutdown_recd",
555 .data = &ip_ct_sctp_timeout_shutdown_recd,
556 .maxlen = sizeof(unsigned int),
557 .mode = 0644,
558 .proc_handler = &proc_dointvec_jiffies,
559 },
560 {
561 .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT,
562 .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent",
563 .data = &ip_ct_sctp_timeout_shutdown_ack_sent,
564 .maxlen = sizeof(unsigned int),
565 .mode = 0644,
566 .proc_handler = &proc_dointvec_jiffies,
567 },
568 { .ctl_name = 0 }
569};
570
571static ctl_table ip_ct_netfilter_table[] = {
572 {
573 .ctl_name = NET_IPV4_NETFILTER,
574 .procname = "netfilter",
575 .mode = 0555,
576 .child = ip_ct_sysctl_table,
577 },
578 { .ctl_name = 0 }
579};
580
581static ctl_table ip_ct_ipv4_table[] = {
582 {
583 .ctl_name = NET_IPV4,
584 .procname = "ipv4",
585 .mode = 0555,
586 .child = ip_ct_netfilter_table,
587 },
588 { .ctl_name = 0 }
589};
590
591static ctl_table ip_ct_net_table[] = {
592 {
593 .ctl_name = CTL_NET,
594 .procname = "net",
595 .mode = 0555,
596 .child = ip_ct_ipv4_table,
597 },
598 { .ctl_name = 0 }
599};
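/*
 * Resulting sysctl paths (sketch): the nested tables above register the
 * SCTP timeouts under net -> ipv4 -> netfilter, e.g.
 *
 *   /proc/sys/net/ipv4/netfilter/ip_conntrack_sctp_timeout_established
 *
 * and proc_dointvec_jiffies converts values written there from seconds
 * into jiffies.
 */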
600
601static struct ctl_table_header *ip_ct_sysctl_header;
602#endif
603
604static int __init init(void)
605{
606 int ret;
607
608 ret = ip_conntrack_protocol_register(&ip_conntrack_protocol_sctp);
609 if (ret) {
610 printk("ip_conntrack_proto_sctp: protocol register failed\n");
611 goto out;
612 }
613
614#ifdef CONFIG_SYSCTL
615 ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0);
616 if (ip_ct_sysctl_header == NULL) {
617 ret = -ENOMEM;
618 printk("ip_conntrack_proto_sctp: can't register to sysctl.\n");
619 goto cleanup;
620 }
621#endif
622
623 return ret;
624
625#ifdef CONFIG_SYSCTL
626 cleanup:
627 ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp);
628#endif
629 out:
630 DEBUGP("SCTP conntrack module loading %s\n",
631 ret ? "failed": "succeeded");
632 return ret;
633}
634
635static void __exit fini(void)
636{
637 ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp);
638#ifdef CONFIG_SYSCTL
639 unregister_sysctl_table(ip_ct_sysctl_header);
640#endif
641 DEBUGP("SCTP conntrack module unloaded\n");
642}
643
644module_init(init);
645module_exit(fini);
646
647MODULE_LICENSE("GPL");
648MODULE_AUTHOR("Kiran Kumar Immidi");
649MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP");
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
new file mode 100644
index 000000000000..e800b16fc920
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -0,0 +1,1098 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>:
9 * - Real stateful connection tracking
10 * - Modified state transitions table
11 * - Window scaling support added
12 * - SACK support added
13 *
14 * Willy Tarreau:
15 * - State table bugfixes
16 * - More robust state changes
17 * - Tuning timer parameters
18 *
19 * version 2.2
20 */
21
22#include <linux/config.h>
23#include <linux/types.h>
24#include <linux/sched.h>
25#include <linux/timer.h>
26#include <linux/netfilter.h>
27#include <linux/module.h>
28#include <linux/in.h>
29#include <linux/ip.h>
30#include <linux/tcp.h>
31#include <linux/spinlock.h>
32
33#include <net/tcp.h>
34
35#include <linux/netfilter.h>
36#include <linux/netfilter_ipv4.h>
37#include <linux/netfilter_ipv4/ip_conntrack.h>
38#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
39#include <linux/netfilter_ipv4/lockhelp.h>
40
41#if 0
42#define DEBUGP printk
43#define DEBUGP_VARS
44#else
45#define DEBUGP(format, args...)
46#endif
47
48/* Protects conntrack->proto.tcp */
49static DECLARE_RWLOCK(tcp_lock);
50
51/* "Be conservative in what you do,
52 be liberal in what you accept from others."
53 If it's non-zero, we mark only out of window RST segments as INVALID. */
54int ip_ct_tcp_be_liberal = 0;
55
56/* When connection is picked up from the middle, how many packets are required
57 to pass in each direction when we assume we are in sync - if any side uses
58 window scaling, we lost the game.
59 If it is set to zero, we disable picking up already established
60 connections. */
61int ip_ct_tcp_loose = 3;
62
63/* Max number of the retransmitted packets without receiving an (acceptable)
64 ACK from the destination. If this number is reached, a shorter timer
65 will be started. */
66int ip_ct_tcp_max_retrans = 3;
67
68 /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
69 closely. They're more complex. --RR */
70
71static const char *tcp_conntrack_names[] = {
72 "NONE",
73 "SYN_SENT",
74 "SYN_RECV",
75 "ESTABLISHED",
76 "FIN_WAIT",
77 "CLOSE_WAIT",
78 "LAST_ACK",
79 "TIME_WAIT",
80 "CLOSE",
81 "LISTEN"
82};
83
84#define SECS * HZ
85#define MINS * 60 SECS
86#define HOURS * 60 MINS
87#define DAYS * 24 HOURS
88
89unsigned long ip_ct_tcp_timeout_syn_sent = 2 MINS;
90unsigned long ip_ct_tcp_timeout_syn_recv = 60 SECS;
91unsigned long ip_ct_tcp_timeout_established = 5 DAYS;
92unsigned long ip_ct_tcp_timeout_fin_wait = 2 MINS;
93unsigned long ip_ct_tcp_timeout_close_wait = 60 SECS;
94unsigned long ip_ct_tcp_timeout_last_ack = 30 SECS;
95unsigned long ip_ct_tcp_timeout_time_wait = 2 MINS;
96unsigned long ip_ct_tcp_timeout_close = 10 SECS;
97
98/* RFC1122 says the R2 limit should be at least 100 seconds.
99 Linux uses 15 packets as limit, which corresponds
100 to ~13-30min depending on RTO. */
101unsigned long ip_ct_tcp_timeout_max_retrans = 5 MINS;
102
103static unsigned long * tcp_timeouts[]
104= { NULL, /* TCP_CONNTRACK_NONE */
105 &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */
106 &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */
107 &ip_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */
108 &ip_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */
109 &ip_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */
110 &ip_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */
111 &ip_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */
112 &ip_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */
113 NULL, /* TCP_CONNTRACK_LISTEN */
114 };
115
116#define sNO TCP_CONNTRACK_NONE
117#define sSS TCP_CONNTRACK_SYN_SENT
118#define sSR TCP_CONNTRACK_SYN_RECV
119#define sES TCP_CONNTRACK_ESTABLISHED
120#define sFW TCP_CONNTRACK_FIN_WAIT
121#define sCW TCP_CONNTRACK_CLOSE_WAIT
122#define sLA TCP_CONNTRACK_LAST_ACK
123#define sTW TCP_CONNTRACK_TIME_WAIT
124#define sCL TCP_CONNTRACK_CLOSE
125#define sLI TCP_CONNTRACK_LISTEN
126#define sIV TCP_CONNTRACK_MAX
127#define sIG TCP_CONNTRACK_IGNORE
128
129/* What TCP flags are set from RST/SYN/FIN/ACK. */
130enum tcp_bit_set {
131 TCP_SYN_SET,
132 TCP_SYNACK_SET,
133 TCP_FIN_SET,
134 TCP_ACK_SET,
135 TCP_RST_SET,
136 TCP_NONE_SET,
137};
138
139/*
140 * The TCP state transition table needs a few words...
141 *
142 * We are the man in the middle. All the packets go through us
143 * but might get lost in transit to the destination.
144 * It is assumed that the destinations can't receive segments
145 * we haven't seen.
146 *
147 * The checked segment is in window, but our windows are *not*
148 * equivalent with the ones of the sender/receiver. We always
149 * try to guess the state of the current sender.
150 *
151 * The meaning of the states are:
152 *
153 * NONE: initial state
154 * SYN_SENT: SYN-only packet seen
155 * SYN_RECV: SYN-ACK packet seen
156 * ESTABLISHED: ACK packet seen
157 * FIN_WAIT: FIN packet seen
158 * CLOSE_WAIT: ACK seen (after FIN)
159 * LAST_ACK: FIN seen (after FIN)
160 * TIME_WAIT: last ACK seen
161 * CLOSE: closed connection
162 *
163 * LISTEN state is not used.
164 *
165 * Packets marked as IGNORED (sIG):
166 * if they may be either invalid or valid
167 * and the receiver may send back a connection
168 * closing RST or a SYN/ACK.
169 *
170 * Packets marked as INVALID (sIV):
171 * if they are invalid
172 * or we do not support the request (simultaneous open)
173 */
174static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
175 {
176/* ORIGINAL */
177/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
178/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV },
179/*
180 * sNO -> sSS Initialize a new connection
181 * sSS -> sSS Retransmitted SYN
182 * sSR -> sIG Late retransmitted SYN?
183 * sES -> sIG Error: SYNs in window outside the SYN_SENT state
184 * are errors. Receiver will reply with RST
185 * and close the connection.
186 * Or we are not in sync and hold a dead connection.
187 * sFW -> sIG
188 * sCW -> sIG
189 * sLA -> sIG
190 * sTW -> sSS Reopened connection (RFC 1122).
191 * sCL -> sSS
192 */
193/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
194/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
195/*
196 * A SYN/ACK from the client is always invalid:
197 * - either it tries to set up a simultaneous open, which is
198 * not supported;
199 * - or the firewall has just been inserted between the two hosts
200 * during the session set-up. The SYN will be retransmitted
201 * by the true client (or it'll time out).
202 */
203/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
204/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
205/*
206 * sNO -> sIV Too late and no reason to do anything...
207 *	sSS -> sIV	Client might not send FIN in this state:
208 * we enforce waiting for a SYN/ACK reply first.
209 * sSR -> sFW Close started.
210 * sES -> sFW
211 * sFW -> sLA FIN seen in both directions, waiting for
212 * the last ACK.
213 *		Might be a retransmitted FIN as well...
214 * sCW -> sLA
215 * sLA -> sLA Retransmitted FIN. Remain in the same state.
216 * sTW -> sTW
217 * sCL -> sCL
218 */
219/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
220/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
221/*
222 * sNO -> sES Assumed.
223 * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet.
224 * sSR -> sES Established state is reached.
225 * sES -> sES :-)
226 * sFW -> sCW Normal close request answered by ACK.
227 * sCW -> sCW
228 * sLA -> sTW Last ACK detected.
229 * sTW -> sTW Retransmitted last ACK. Remain in the same state.
230 * sCL -> sCL
231 */
232/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
233/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
234/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
235 },
236 {
237/* REPLY */
238/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
239/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
240/*
241 * sNO -> sIV Never reached.
242 * sSS -> sIV Simultaneous open, not supported
243 * sSR -> sIV Simultaneous open, not supported.
244 * sES -> sIV Server may not initiate a connection.
245 * sFW -> sIV
246 * sCW -> sIV
247 * sLA -> sIV
248 * sTW -> sIV Reopened connection, but server may not do it.
249 * sCL -> sIV
250 */
251/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
252/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV },
253/*
254 * sSS -> sSR Standard open.
255 * sSR -> sSR Retransmitted SYN/ACK.
256 * sES -> sIG Late retransmitted SYN/ACK?
257 * sFW -> sIG Might be SYN/ACK answering ignored SYN
258 * sCW -> sIG
259 * sLA -> sIG
260 * sTW -> sIG
261 * sCL -> sIG
262 */
263/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
264/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
265/*
266 * sSS -> sIV Server might not send FIN in this state.
267 * sSR -> sFW Close started.
268 * sES -> sFW
269 * sFW -> sLA FIN seen in both directions.
270 * sCW -> sLA
271 * sLA -> sLA Retransmitted FIN.
272 * sTW -> sTW
273 * sCL -> sCL
274 */
275/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
276/*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV },
277/*
278 * sSS -> sIV Might be a half-open connection.
279 * sSR -> sSR Might answer late resent SYN.
280 * sES -> sES :-)
281 * sFW -> sCW Normal close request answered by ACK.
282 * sCW -> sCW
283 * sLA -> sTW Last ACK detected.
284 * sTW -> sTW Retransmitted last ACK.
285 * sCL -> sCL
286 */
287/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
288/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
289/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
290 }
291};
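/*
 * Indexing example (sketch): tcp_packet() computes
 *
 *   new_state = tcp_conntracks[dir][get_conntrack_index(th)][old_state];
 *
 * so a client SYN on a fresh entry is
 * tcp_conntracks[IP_CT_DIR_ORIGINAL][TCP_SYN_SET][sNO] == sSS, and the
 * server's SYN/ACK reply is
 * tcp_conntracks[IP_CT_DIR_REPLY][TCP_SYNACK_SET][sSS] == sSR, giving the
 * normal NONE -> SYN_SENT -> SYN_RECV walk annotated above.
 */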
292
293static int tcp_pkt_to_tuple(const struct sk_buff *skb,
294 unsigned int dataoff,
295 struct ip_conntrack_tuple *tuple)
296{
297 struct tcphdr _hdr, *hp;
298
299 /* Actually only need first 8 bytes. */
300 hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
301 if (hp == NULL)
302 return 0;
303
304 tuple->src.u.tcp.port = hp->source;
305 tuple->dst.u.tcp.port = hp->dest;
306
307 return 1;
308}
309
310static int tcp_invert_tuple(struct ip_conntrack_tuple *tuple,
311 const struct ip_conntrack_tuple *orig)
312{
313 tuple->src.u.tcp.port = orig->dst.u.tcp.port;
314 tuple->dst.u.tcp.port = orig->src.u.tcp.port;
315 return 1;
316}
317
318/* Print out the per-protocol part of the tuple. */
319static int tcp_print_tuple(struct seq_file *s,
320 const struct ip_conntrack_tuple *tuple)
321{
322 return seq_printf(s, "sport=%hu dport=%hu ",
323 ntohs(tuple->src.u.tcp.port),
324 ntohs(tuple->dst.u.tcp.port));
325}
326
327/* Print out the private part of the conntrack. */
328static int tcp_print_conntrack(struct seq_file *s,
329 const struct ip_conntrack *conntrack)
330{
331 enum tcp_conntrack state;
332
333 READ_LOCK(&tcp_lock);
334 state = conntrack->proto.tcp.state;
335 READ_UNLOCK(&tcp_lock);
336
337 return seq_printf(s, "%s ", tcp_conntrack_names[state]);
338}
339
340static unsigned int get_conntrack_index(const struct tcphdr *tcph)
341{
342 if (tcph->rst) return TCP_RST_SET;
343 else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
344 else if (tcph->fin) return TCP_FIN_SET;
345 else if (tcph->ack) return TCP_ACK_SET;
346 else return TCP_NONE_SET;
347}
348
349/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
350 in IP Filter' by Guido van Rooij.
351
352 http://www.nluug.nl/events/sane2000/papers.html
353 http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
354
355 The boundaries and the conditions are changed according to RFC793:
356 the packet must intersect the window (i.e. segments may be
357 after the right or before the left edge) and thus receivers may ACK
358 segments after the right edge of the window.
359
360 td_maxend = max(sack + max(win,1)) seen in reply packets
361 td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
362 td_maxwin += seq + len - sender.td_maxend
363 if seq + len > sender.td_maxend
364 td_end = max(seq + len) seen in sent packets
365
366 I. Upper bound for valid data: seq <= sender.td_maxend
367 II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin
368 III. Upper bound for valid ack: sack <= receiver.td_end
369 IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW
370
371 where sack is the highest right edge of sack block found in the packet.
372
373 The upper bound limit for a valid ack is not ignored -
374    we don't have to deal with fragments.
375*/
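/*
 * Numeric illustration of checks I-IV (sketch): suppose
 * sender.td_end = 1000, sender.td_maxend = 1500,
 * receiver.td_end = 2000, receiver.td_maxwin = 300.  A segment with
 * seq = 1100, 100 bytes of data (end = 1200) and ack = sack = 1900 passes:
 *   I:   1100 <= 1500
 *   II:  1200 >= 1000 - 300
 *   III: 1900 <= 2000
 *   IV:  1900 >= 2000 - MAXACKWINDOW
 * A segment with seq = 1600 would already fail check I.
 */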
376
377static inline __u32 segment_seq_plus_len(__u32 seq,
378 size_t len,
379 struct iphdr *iph,
380 struct tcphdr *tcph)
381{
382 return (seq + len - (iph->ihl + tcph->doff)*4
383 + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
384}
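/*
 * Worked example (sketch): len is the full IP packet length, so for a bare
 * SYN with a 20-byte IP header and a 20-byte TCP header, len = 40,
 * (iph->ihl + tcph->doff) * 4 = 40, and end = seq + 0 + 1 (SYN) = seq + 1.
 * A pure data segment carrying 100 payload bytes gives end = seq + 100.
 */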
385
386/* Fixme: what about big packets? */
387#define MAXACKWINCONST 66000
388#define MAXACKWINDOW(sender) \
389 ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \
390 : MAXACKWINCONST)
391
392/*
393 * Simplified tcp_parse_options routine from tcp_input.c
394 */
395static void tcp_options(const struct sk_buff *skb,
396 struct iphdr *iph,
397 struct tcphdr *tcph,
398 struct ip_ct_tcp_state *state)
399{
400 unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
401 unsigned char *ptr;
402 int length = (tcph->doff*4) - sizeof(struct tcphdr);
403
404 if (!length)
405 return;
406
407 ptr = skb_header_pointer(skb,
408 (iph->ihl * 4) + sizeof(struct tcphdr),
409 length, buff);
410 BUG_ON(ptr == NULL);
411
412 state->td_scale =
413 state->flags = 0;
414
415 while (length > 0) {
416 int opcode=*ptr++;
417 int opsize;
418
419 switch (opcode) {
420 case TCPOPT_EOL:
421 return;
422 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
423 length--;
424 continue;
425 default:
426 opsize=*ptr++;
427 if (opsize < 2) /* "silly options" */
428 return;
429 if (opsize > length)
430 break; /* don't parse partial options */
431
432 if (opcode == TCPOPT_SACK_PERM
433 && opsize == TCPOLEN_SACK_PERM)
434 state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
435 else if (opcode == TCPOPT_WINDOW
436 && opsize == TCPOLEN_WINDOW) {
437 state->td_scale = *(u_int8_t *)ptr;
438
439 if (state->td_scale > 14) {
440 /* See RFC1323 */
441 state->td_scale = 14;
442 }
443 state->flags |=
444 IP_CT_TCP_FLAG_WINDOW_SCALE;
445 }
446 ptr += opsize - 2;
447 length -= opsize;
448 }
449 }
450}
451
452static void tcp_sack(const struct sk_buff *skb,
453 struct iphdr *iph,
454 struct tcphdr *tcph,
455 __u32 *sack)
456{
457 unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
458 unsigned char *ptr;
459 int length = (tcph->doff*4) - sizeof(struct tcphdr);
460 __u32 tmp;
461
462 if (!length)
463 return;
464
465 ptr = skb_header_pointer(skb,
466 (iph->ihl * 4) + sizeof(struct tcphdr),
467 length, buff);
468 BUG_ON(ptr == NULL);
469
470 /* Fast path for timestamp-only option */
471 if (length == TCPOLEN_TSTAMP_ALIGNED*4
472 && *(__u32 *)ptr ==
473 __constant_ntohl((TCPOPT_NOP << 24)
474 | (TCPOPT_NOP << 16)
475 | (TCPOPT_TIMESTAMP << 8)
476 | TCPOLEN_TIMESTAMP))
477 return;
478
479 while (length > 0) {
480 int opcode=*ptr++;
481 int opsize, i;
482
483 switch (opcode) {
484 case TCPOPT_EOL:
485 return;
486 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
487 length--;
488 continue;
489 default:
490 opsize=*ptr++;
491 if (opsize < 2) /* "silly options" */
492 return;
493 if (opsize > length)
494 break; /* don't parse partial options */
495
496 if (opcode == TCPOPT_SACK
497 && opsize >= (TCPOLEN_SACK_BASE
498 + TCPOLEN_SACK_PERBLOCK)
499 && !((opsize - TCPOLEN_SACK_BASE)
500 % TCPOLEN_SACK_PERBLOCK)) {
501 for (i = 0;
502 i < (opsize - TCPOLEN_SACK_BASE);
503 i += TCPOLEN_SACK_PERBLOCK) {
504 tmp = ntohl(*((u_int32_t *)(ptr+i)+1));
505
506 if (after(tmp, *sack))
507 *sack = tmp;
508 }
509 return;
510 }
511 ptr += opsize - 2;
512 length -= opsize;
513 }
514 }
515}
516
517static int tcp_in_window(struct ip_ct_tcp *state,
518 enum ip_conntrack_dir dir,
519 unsigned int index,
520 const struct sk_buff *skb,
521 struct iphdr *iph,
522 struct tcphdr *tcph)
523{
524 struct ip_ct_tcp_state *sender = &state->seen[dir];
525 struct ip_ct_tcp_state *receiver = &state->seen[!dir];
526 __u32 seq, ack, sack, end, win, swin;
527 int res;
528
529 /*
530 * Get the required data from the packet.
531 */
532 seq = ntohl(tcph->seq);
533 ack = sack = ntohl(tcph->ack_seq);
534 win = ntohs(tcph->window);
535 end = segment_seq_plus_len(seq, skb->len, iph, tcph);
536
537 if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
538 tcp_sack(skb, iph, tcph, &sack);
539
540 DEBUGP("tcp_in_window: START\n");
541 DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
542 "seq=%u ack=%u sack=%u win=%u end=%u\n",
543 NIPQUAD(iph->saddr), ntohs(tcph->source),
544 NIPQUAD(iph->daddr), ntohs(tcph->dest),
545 seq, ack, sack, win, end);
546 DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
547 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
548 sender->td_end, sender->td_maxend, sender->td_maxwin,
549 sender->td_scale,
550 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
551 receiver->td_scale);
552
553 if (sender->td_end == 0) {
554 /*
555 * Initialize sender data.
556 */
557 if (tcph->syn && tcph->ack) {
558 /*
559 * Outgoing SYN-ACK in reply to a SYN.
560 */
561 sender->td_end =
562 sender->td_maxend = end;
563 sender->td_maxwin = (win == 0 ? 1 : win);
564
565 tcp_options(skb, iph, tcph, sender);
566 /*
567 * RFC 1323:
568 * Both sides must send the Window Scale option
569 * to enable window scaling in either direction.
570 */
571 if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
572 && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
573 sender->td_scale =
574 receiver->td_scale = 0;
575 } else {
576 /*
577 * We are in the middle of a connection,
578		 * its history is lost to us.
579 * Let's try to use the data from the packet.
580 */
581 sender->td_end = end;
582 sender->td_maxwin = (win == 0 ? 1 : win);
583 sender->td_maxend = end + sender->td_maxwin;
584 }
585 } else if (((state->state == TCP_CONNTRACK_SYN_SENT
586 && dir == IP_CT_DIR_ORIGINAL)
587 || (state->state == TCP_CONNTRACK_SYN_RECV
588 && dir == IP_CT_DIR_REPLY))
589 && after(end, sender->td_end)) {
590 /*
591 * RFC 793: "if a TCP is reinitialized ... then it need
592 * not wait at all; it must only be sure to use sequence
593 * numbers larger than those recently used."
594 */
595 sender->td_end =
596 sender->td_maxend = end;
597 sender->td_maxwin = (win == 0 ? 1 : win);
598
599 tcp_options(skb, iph, tcph, sender);
600 }
601
602 if (!(tcph->ack)) {
603 /*
604 * If there is no ACK, just pretend it was set and OK.
605 */
606 ack = sack = receiver->td_end;
607 } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
608 (TCP_FLAG_ACK|TCP_FLAG_RST))
609 && (ack == 0)) {
610 /*
611		 * Broken TCP stacks that set ACK in RST packets as well,
612		 * with a zero ack value.
613 */
614 ack = sack = receiver->td_end;
615 }
616
617 if (seq == end
618 && (!tcph->rst
619 || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)))
620 /*
621		 * Packet contains no data: we assume it is valid
622		 * and check the ack value only.
623		 * However, RST segments are always validated by their
624		 * SEQ number, except when seq == 0 (reset sent answering
625		 * a SYN).
626 */
627 seq = end = sender->td_end;
628
629 DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
630 "seq=%u ack=%u sack =%u win=%u end=%u\n",
631 NIPQUAD(iph->saddr), ntohs(tcph->source),
632 NIPQUAD(iph->daddr), ntohs(tcph->dest),
633 seq, ack, sack, win, end);
634 DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
635 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
636 sender->td_end, sender->td_maxend, sender->td_maxwin,
637 sender->td_scale,
638 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
639 receiver->td_scale);
640
641 DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
642 before(seq, sender->td_maxend + 1),
643 after(end, sender->td_end - receiver->td_maxwin - 1),
644 before(sack, receiver->td_end + 1),
645 after(ack, receiver->td_end - MAXACKWINDOW(sender)));
646
647 if (sender->loose || receiver->loose ||
648 (before(seq, sender->td_maxend + 1) &&
649 after(end, sender->td_end - receiver->td_maxwin - 1) &&
650 before(sack, receiver->td_end + 1) &&
651 after(ack, receiver->td_end - MAXACKWINDOW(sender)))) {
652 /*
653 * Take into account window scaling (RFC 1323).
654 */
655 if (!tcph->syn)
656 win <<= sender->td_scale;
657
658 /*
659 * Update sender data.
660 */
661 swin = win + (sack - ack);
662 if (sender->td_maxwin < swin)
663 sender->td_maxwin = swin;
664 if (after(end, sender->td_end))
665 sender->td_end = end;
666 /*
667 * Update receiver data.
668 */
669 if (after(end, sender->td_maxend))
670 receiver->td_maxwin += end - sender->td_maxend;
671 if (after(sack + win, receiver->td_maxend - 1)) {
672 receiver->td_maxend = sack + win;
673 if (win == 0)
674 receiver->td_maxend++;
675 }
676
677 /*
678 * Check retransmissions.
679 */
680 if (index == TCP_ACK_SET) {
681 if (state->last_dir == dir
682 && state->last_seq == seq
683 && state->last_ack == ack
684 && state->last_end == end)
685 state->retrans++;
686 else {
687 state->last_dir = dir;
688 state->last_seq = seq;
689 state->last_ack = ack;
690 state->last_end = end;
691 state->retrans = 0;
692 }
693 }
694 /*
695 * Close the window of disabled window tracking :-)
696 */
697 if (sender->loose)
698 sender->loose--;
699
700 res = 1;
701 } else {
702 if (LOG_INVALID(IPPROTO_TCP))
703 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
704 "ip_ct_tcp: %s ",
705 before(seq, sender->td_maxend + 1) ?
706 after(end, sender->td_end - receiver->td_maxwin - 1) ?
707 before(sack, receiver->td_end + 1) ?
708 after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG"
709 : "ACK is under the lower bound (possible overly delayed ACK)"
710 : "ACK is over the upper bound (ACKed data not seen yet)"
711 : "SEQ is under the lower bound (already ACKed data retransmitted)"
712 : "SEQ is over the upper bound (over the window of the receiver)");
713
714 res = ip_ct_tcp_be_liberal;
715 }
716
717 DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u "
718 "receiver end=%u maxend=%u maxwin=%u\n",
719 res, sender->td_end, sender->td_maxend, sender->td_maxwin,
720 receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
721
722 return res;
723}
724
725#ifdef CONFIG_IP_NF_NAT_NEEDED
726/* Update sender->td_end after NAT successfully mangled the packet */
727void ip_conntrack_tcp_update(struct sk_buff *skb,
728 struct ip_conntrack *conntrack,
729 enum ip_conntrack_dir dir)
730{
731 struct iphdr *iph = skb->nh.iph;
732 struct tcphdr *tcph = (void *)skb->nh.iph + skb->nh.iph->ihl*4;
733 __u32 end;
734#ifdef DEBUGP_VARS
735 struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir];
736 struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir];
737#endif
738
739 end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph);
740
741 WRITE_LOCK(&tcp_lock);
742 /*
743 * We have to worry for the ack in the reply packet only...
744 */
745 if (after(end, conntrack->proto.tcp.seen[dir].td_end))
746 conntrack->proto.tcp.seen[dir].td_end = end;
747 conntrack->proto.tcp.last_end = end;
748 WRITE_UNLOCK(&tcp_lock);
749 DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
750 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
751 sender->td_end, sender->td_maxend, sender->td_maxwin,
752 sender->td_scale,
753 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
754 receiver->td_scale);
755}
756
757#endif
758
759#define TH_FIN 0x01
760#define TH_SYN 0x02
761#define TH_RST 0x04
762#define TH_PUSH 0x08
763#define TH_ACK 0x10
764#define TH_URG 0x20
765#define TH_ECE 0x40
766#define TH_CWR 0x80
767
768/* table of valid flag combinations - ECE and CWR are always valid */
769static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] =
770{
771 [TH_SYN] = 1,
772 [TH_SYN|TH_ACK] = 1,
773 [TH_RST] = 1,
774 [TH_RST|TH_ACK] = 1,
775 [TH_RST|TH_ACK|TH_PUSH] = 1,
776 [TH_FIN|TH_ACK] = 1,
777 [TH_ACK] = 1,
778 [TH_ACK|TH_PUSH] = 1,
779 [TH_ACK|TH_URG] = 1,
780 [TH_ACK|TH_URG|TH_PUSH] = 1,
781 [TH_FIN|TH_ACK|TH_PUSH] = 1,
782 [TH_FIN|TH_ACK|TH_URG] = 1,
783 [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1,
784};
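/*
 * Lookup example (sketch): tcp_error() below masks byte 13 of the TCP
 * header with ~(TH_ECE|TH_CWR) and indexes this table, so an illegal
 * SYN|FIN segment yields tcp_valid_flags[TH_SYN|TH_FIN] == 0 and is
 * rejected, while SYN|ACK|ECE|CWR reduces to TH_SYN|TH_ACK and passes.
 */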
785
786/* Protect conntrack against broken packets. Code taken from ipt_unclean.c. */
787static int tcp_error(struct sk_buff *skb,
788 enum ip_conntrack_info *ctinfo,
789 unsigned int hooknum)
790{
791 struct iphdr *iph = skb->nh.iph;
792 struct tcphdr _tcph, *th;
793 unsigned int tcplen = skb->len - iph->ihl * 4;
794 u_int8_t tcpflags;
795
796	/* Smaller than minimal TCP header? */
797 th = skb_header_pointer(skb, iph->ihl * 4,
798 sizeof(_tcph), &_tcph);
799 if (th == NULL) {
800 if (LOG_INVALID(IPPROTO_TCP))
801 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
802 "ip_ct_tcp: short packet ");
803 return -NF_ACCEPT;
804 }
805
806 /* Not whole TCP header or malformed packet */
807 if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
808 if (LOG_INVALID(IPPROTO_TCP))
809 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
810 "ip_ct_tcp: truncated/malformed packet ");
811 return -NF_ACCEPT;
812 }
813
814 /* Checksum invalid? Ignore.
815 * We skip checking packets on the outgoing path
816 * because the semantic of CHECKSUM_HW is different there
817 * and moreover root might send raw packets.
818 */
819 /* FIXME: Source route IP option packets --RR */
820 if (hooknum == NF_IP_PRE_ROUTING
821 && csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP,
822 skb->ip_summed == CHECKSUM_HW ? skb->csum
823 : skb_checksum(skb, iph->ihl*4, tcplen, 0))) {
824 if (LOG_INVALID(IPPROTO_TCP))
825 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
826 "ip_ct_tcp: bad TCP checksum ");
827 return -NF_ACCEPT;
828 }
829
830 /* Check TCP flags. */
831 tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
832 if (!tcp_valid_flags[tcpflags]) {
833 if (LOG_INVALID(IPPROTO_TCP))
834 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
835 "ip_ct_tcp: invalid TCP flag combination ");
836 return -NF_ACCEPT;
837 }
838
839 return NF_ACCEPT;
840}
841
842/* Returns verdict for packet, or -1 for invalid. */
843static int tcp_packet(struct ip_conntrack *conntrack,
844 const struct sk_buff *skb,
845 enum ip_conntrack_info ctinfo)
846{
847 enum tcp_conntrack new_state, old_state;
848 enum ip_conntrack_dir dir;
849 struct iphdr *iph = skb->nh.iph;
850 struct tcphdr *th, _tcph;
851 unsigned long timeout;
852 unsigned int index;
853
854 th = skb_header_pointer(skb, iph->ihl * 4,
855 sizeof(_tcph), &_tcph);
856 BUG_ON(th == NULL);
857
858 WRITE_LOCK(&tcp_lock);
859 old_state = conntrack->proto.tcp.state;
860 dir = CTINFO2DIR(ctinfo);
861 index = get_conntrack_index(th);
862 new_state = tcp_conntracks[dir][index][old_state];
863
864 switch (new_state) {
865 case TCP_CONNTRACK_IGNORE:
866 /* Either SYN in ORIGINAL
867 * or SYN/ACK in REPLY. */
868 if (index == TCP_SYNACK_SET
869 && conntrack->proto.tcp.last_index == TCP_SYN_SET
870 && conntrack->proto.tcp.last_dir != dir
871 && ntohl(th->ack_seq) ==
872 conntrack->proto.tcp.last_end) {
873 /* This SYN/ACK acknowledges a SYN that we earlier
874 * ignored as invalid. This means that the client and
875 * the server are both in sync, while the firewall is
876 * not. We kill this session and block the SYN/ACK so
877 * that the client cannot but retransmit its SYN and
878 * thus initiate a clean new session.
879 */
880 WRITE_UNLOCK(&tcp_lock);
881 if (LOG_INVALID(IPPROTO_TCP))
882 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
883 "ip_ct_tcp: killing out of sync session ");
884 if (del_timer(&conntrack->timeout))
885 conntrack->timeout.function((unsigned long)
886 conntrack);
887 return -NF_DROP;
888 }
889 conntrack->proto.tcp.last_index = index;
890 conntrack->proto.tcp.last_dir = dir;
891 conntrack->proto.tcp.last_seq = ntohl(th->seq);
892 conntrack->proto.tcp.last_end =
893 segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th);
894
895 WRITE_UNLOCK(&tcp_lock);
896 if (LOG_INVALID(IPPROTO_TCP))
897 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
898 "ip_ct_tcp: invalid packet ignored ");
899 return NF_ACCEPT;
900 case TCP_CONNTRACK_MAX:
901 /* Invalid packet */
902 DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
903 dir, get_conntrack_index(th),
904 old_state);
905 WRITE_UNLOCK(&tcp_lock);
906 if (LOG_INVALID(IPPROTO_TCP))
907 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
908 "ip_ct_tcp: invalid state ");
909 return -NF_ACCEPT;
910 case TCP_CONNTRACK_SYN_SENT:
911 if (old_state < TCP_CONNTRACK_TIME_WAIT)
912 break;
913 if ((conntrack->proto.tcp.seen[dir].flags &
914 IP_CT_TCP_FLAG_CLOSE_INIT)
915 || after(ntohl(th->seq),
916 conntrack->proto.tcp.seen[dir].td_end)) {
917 /* Attempt to reopen a closed connection.
918 * Delete this connection and look up again. */
919 WRITE_UNLOCK(&tcp_lock);
920 if (del_timer(&conntrack->timeout))
921 conntrack->timeout.function((unsigned long)
922 conntrack);
923 return -NF_REPEAT;
924 } else {
925 WRITE_UNLOCK(&tcp_lock);
926 if (LOG_INVALID(IPPROTO_TCP))
927 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
928 "ip_ct_tcp: invalid SYN");
929 return -NF_ACCEPT;
930 }
931 case TCP_CONNTRACK_CLOSE:
932 if (index == TCP_RST_SET
933 && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
934 && conntrack->proto.tcp.last_index == TCP_SYN_SET
935 && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) {
 936		/* RST sent to an invalid SYN that we had let through:
 937		 * the SYN was in window then, so tear down the connection.
938 * We skip window checking, because packet might ACK
939 * segments we ignored in the SYN. */
940 goto in_window;
941 }
 942	/* Just fall through */
943 default:
944 /* Keep compilers happy. */
945 break;
946 }
947
948 if (!tcp_in_window(&conntrack->proto.tcp, dir, index,
949 skb, iph, th)) {
950 WRITE_UNLOCK(&tcp_lock);
951 return -NF_ACCEPT;
952 }
953 in_window:
954 /* From now on we have got in-window packets */
955 conntrack->proto.tcp.last_index = index;
956
957 DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
958 "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
959 NIPQUAD(iph->saddr), ntohs(th->source),
960 NIPQUAD(iph->daddr), ntohs(th->dest),
961 (th->syn ? 1 : 0), (th->ack ? 1 : 0),
962 (th->fin ? 1 : 0), (th->rst ? 1 : 0),
963 old_state, new_state);
964
965 conntrack->proto.tcp.state = new_state;
966 if (old_state != new_state
967 && (new_state == TCP_CONNTRACK_FIN_WAIT
968 || new_state == TCP_CONNTRACK_CLOSE))
969 conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
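	/* Once enough retransmissions have been seen without progress,
	 * clamp the timeout: the (shorter) max_retrans timeout is used
	 * instead of the per-state one below. */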
970 timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans
971 && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans
972 ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
973 WRITE_UNLOCK(&tcp_lock);
974
975 if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
976 /* If only reply is a RST, we can consider ourselves not to
977 have an established connection: this is a fairly common
978 problem case, so we can delete the conntrack
979 immediately. --RR */
980 if (th->rst) {
981 if (del_timer(&conntrack->timeout))
982 conntrack->timeout.function((unsigned long)
983 conntrack);
984 return NF_ACCEPT;
985 }
986 } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
987 && (old_state == TCP_CONNTRACK_SYN_RECV
988 || old_state == TCP_CONNTRACK_ESTABLISHED)
989 && new_state == TCP_CONNTRACK_ESTABLISHED) {
 990		/* Set ASSURED if we see a valid ack in ESTABLISHED
991 after SYN_RECV or a valid answer for a picked up
992 connection. */
993 set_bit(IPS_ASSURED_BIT, &conntrack->status);
994 }
995 ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout);
996
997 return NF_ACCEPT;
998}
999
1000/* Called when a new connection for this protocol found. */
1001static int tcp_new(struct ip_conntrack *conntrack,
1002 const struct sk_buff *skb)
1003{
1004 enum tcp_conntrack new_state;
1005 struct iphdr *iph = skb->nh.iph;
1006 struct tcphdr *th, _tcph;
1007#ifdef DEBUGP_VARS
1008 struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0];
1009 struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1];
1010#endif
1011
1012 th = skb_header_pointer(skb, iph->ihl * 4,
1013 sizeof(_tcph), &_tcph);
1014 BUG_ON(th == NULL);
1015
1016 /* Don't need lock here: this conntrack not in circulation yet */
1017 new_state
1018 = tcp_conntracks[0][get_conntrack_index(th)]
1019 [TCP_CONNTRACK_NONE];
1020
1021 /* Invalid: delete conntrack */
1022 if (new_state >= TCP_CONNTRACK_MAX) {
1023 DEBUGP("ip_ct_tcp: invalid new deleting.\n");
1024 return 0;
1025 }
1026
1027 if (new_state == TCP_CONNTRACK_SYN_SENT) {
1028 /* SYN packet */
1029 conntrack->proto.tcp.seen[0].td_end =
1030 segment_seq_plus_len(ntohl(th->seq), skb->len,
1031 iph, th);
1032 conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1033 if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
1034 conntrack->proto.tcp.seen[0].td_maxwin = 1;
1035 conntrack->proto.tcp.seen[0].td_maxend =
1036 conntrack->proto.tcp.seen[0].td_end;
1037
1038 tcp_options(skb, iph, th, &conntrack->proto.tcp.seen[0]);
1039 conntrack->proto.tcp.seen[1].flags = 0;
1040 conntrack->proto.tcp.seen[0].loose =
1041 conntrack->proto.tcp.seen[1].loose = 0;
1042 } else if (ip_ct_tcp_loose == 0) {
1043 /* Don't try to pick up connections. */
1044 return 0;
1045 } else {
1046 /*
1047 * We are in the middle of a connection,
1048 * its history is lost for us.
1049 * Let's try to use the data from the packet.
1050 */
1051 conntrack->proto.tcp.seen[0].td_end =
1052 segment_seq_plus_len(ntohl(th->seq), skb->len,
1053 iph, th);
1054 conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1055 if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
1056 conntrack->proto.tcp.seen[0].td_maxwin = 1;
1057 conntrack->proto.tcp.seen[0].td_maxend =
1058 conntrack->proto.tcp.seen[0].td_end +
1059 conntrack->proto.tcp.seen[0].td_maxwin;
1060 conntrack->proto.tcp.seen[0].td_scale = 0;
1061
1062 /* We assume SACK. Should we assume window scaling too? */
1063 conntrack->proto.tcp.seen[0].flags =
1064 conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM;
1065 conntrack->proto.tcp.seen[0].loose =
1066 conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose;
1067 }
1068
1069 conntrack->proto.tcp.seen[1].td_end = 0;
1070 conntrack->proto.tcp.seen[1].td_maxend = 0;
1071 conntrack->proto.tcp.seen[1].td_maxwin = 1;
1072 conntrack->proto.tcp.seen[1].td_scale = 0;
1073
1074 /* tcp_packet will set them */
1075 conntrack->proto.tcp.state = TCP_CONNTRACK_NONE;
1076 conntrack->proto.tcp.last_index = TCP_NONE_SET;
1077
1078 DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
1079 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
1080 sender->td_end, sender->td_maxend, sender->td_maxwin,
1081 sender->td_scale,
1082 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
1083 receiver->td_scale);
1084 return 1;
1085}
1086
1087struct ip_conntrack_protocol ip_conntrack_protocol_tcp =
1088{
1089 .proto = IPPROTO_TCP,
1090 .name = "tcp",
1091 .pkt_to_tuple = tcp_pkt_to_tuple,
1092 .invert_tuple = tcp_invert_tuple,
1093 .print_tuple = tcp_print_tuple,
1094 .print_conntrack = tcp_print_conntrack,
1095 .packet = tcp_packet,
1096 .new = tcp_new,
1097 .error = tcp_error,
1098};
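/* Editorial sketch (not part of this patch): the built-in TCP/UDP/ICMP
 * trackers are installed directly at conntrack init time, but the same
 * struct layout is how a loadable tracker would be plugged in through
 * ip_conntrack_protocol_register()/_unregister(), which are defined and
 * exported further down in ip_conntrack_standalone.c. The module below is
 * hypothetical; "my_ipproto_tracker" stands for a struct filled in like
 * ip_conntrack_protocol_tcp above. */
#include <linux/module.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>

extern struct ip_conntrack_protocol my_ipproto_tracker;	/* hypothetical */

static int __init my_tracker_init(void)
{
	/* Fails with -EBUSY if another tracker already owns the protocol. */
	return ip_conntrack_protocol_register(&my_ipproto_tracker);
}

static void __exit my_tracker_fini(void)
{
	/* Also flushes existing conntrack entries for this protocol. */
	ip_conntrack_protocol_unregister(&my_ipproto_tracker);
}

module_init(my_tracker_init);
module_exit(my_tracker_fini);
MODULE_LICENSE("GPL");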
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
new file mode 100644
index 000000000000..5bc28a224623
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -0,0 +1,146 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/sched.h>
11#include <linux/timer.h>
12#include <linux/netfilter.h>
13#include <linux/in.h>
14#include <linux/udp.h>
15#include <linux/seq_file.h>
16#include <net/checksum.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
20
21unsigned long ip_ct_udp_timeout = 30*HZ;
22unsigned long ip_ct_udp_timeout_stream = 180*HZ;
23
24static int udp_pkt_to_tuple(const struct sk_buff *skb,
25 unsigned int dataoff,
26 struct ip_conntrack_tuple *tuple)
27{
28 struct udphdr _hdr, *hp;
29
30 /* Actually only need first 8 bytes. */
31 hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
32 if (hp == NULL)
33 return 0;
34
35 tuple->src.u.udp.port = hp->source;
36 tuple->dst.u.udp.port = hp->dest;
37
38 return 1;
39}
40
41static int udp_invert_tuple(struct ip_conntrack_tuple *tuple,
42 const struct ip_conntrack_tuple *orig)
43{
44 tuple->src.u.udp.port = orig->dst.u.udp.port;
45 tuple->dst.u.udp.port = orig->src.u.udp.port;
46 return 1;
47}
48
49/* Print out the per-protocol part of the tuple. */
50static int udp_print_tuple(struct seq_file *s,
51 const struct ip_conntrack_tuple *tuple)
52{
53 return seq_printf(s, "sport=%hu dport=%hu ",
54 ntohs(tuple->src.u.udp.port),
55 ntohs(tuple->dst.u.udp.port));
56}
57
58/* Print out the private part of the conntrack. */
59static int udp_print_conntrack(struct seq_file *s,
60 const struct ip_conntrack *conntrack)
61{
62 return 0;
63}
64
 65/* Returns verdict for packet, and may modify conntrack type */
66static int udp_packet(struct ip_conntrack *conntrack,
67 const struct sk_buff *skb,
68 enum ip_conntrack_info ctinfo)
69{
70 /* If we've seen traffic both ways, this is some kind of UDP
71 stream. Extend timeout. */
72 if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
73 ip_ct_refresh_acct(conntrack, ctinfo, skb,
74 ip_ct_udp_timeout_stream);
75 /* Also, more likely to be important, and not a probe */
76 set_bit(IPS_ASSURED_BIT, &conntrack->status);
77 } else
78 ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
79
80 return NF_ACCEPT;
81}
82
83/* Called when a new connection for this protocol found. */
84static int udp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb)
85{
86 return 1;
87}
88
89static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
90 unsigned int hooknum)
91{
92 struct iphdr *iph = skb->nh.iph;
93 unsigned int udplen = skb->len - iph->ihl * 4;
94 struct udphdr _hdr, *hdr;
95
96 /* Header is too small? */
97 hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr);
98 if (hdr == NULL) {
99 if (LOG_INVALID(IPPROTO_UDP))
100 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
101 "ip_ct_udp: short packet ");
102 return -NF_ACCEPT;
103 }
104
105 /* Truncated/malformed packets */
106 if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
107 if (LOG_INVALID(IPPROTO_UDP))
108 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
109 "ip_ct_udp: truncated/malformed packet ");
110 return -NF_ACCEPT;
111 }
112
113 /* Packet with no checksum */
114 if (!hdr->check)
115 return NF_ACCEPT;
116
117 /* Checksum invalid? Ignore.
118 * We skip checking packets on the outgoing path
119 * because the semantic of CHECKSUM_HW is different there
120 * and moreover root might send raw packets.
121 * FIXME: Source route IP option packets --RR */
122 if (hooknum == NF_IP_PRE_ROUTING
123 && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP,
124 skb->ip_summed == CHECKSUM_HW ? skb->csum
125 : skb_checksum(skb, iph->ihl*4, udplen, 0))) {
126 if (LOG_INVALID(IPPROTO_UDP))
127 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
128 "ip_ct_udp: bad UDP checksum ");
129 return -NF_ACCEPT;
130 }
131
132 return NF_ACCEPT;
133}
134
135struct ip_conntrack_protocol ip_conntrack_protocol_udp =
136{
137 .proto = IPPROTO_UDP,
138 .name = "udp",
139 .pkt_to_tuple = udp_pkt_to_tuple,
140 .invert_tuple = udp_invert_tuple,
141 .print_tuple = udp_print_tuple,
142 .print_conntrack = udp_print_conntrack,
143 .packet = udp_packet,
144 .new = udp_new,
145 .error = udp_error,
146};
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
new file mode 100644
index 000000000000..80a7bde2a57a
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -0,0 +1,961 @@
1/* This file contains all the functions required for the standalone
2 ip_conntrack module.
3
4 These are not required by the compatibility layer.
5*/
6
7/* (C) 1999-2001 Paul `Rusty' Russell
8 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 */
14
15#include <linux/config.h>
16#include <linux/types.h>
17#include <linux/ip.h>
18#include <linux/netfilter.h>
19#include <linux/netfilter_ipv4.h>
20#include <linux/module.h>
21#include <linux/skbuff.h>
22#include <linux/proc_fs.h>
23#include <linux/seq_file.h>
24#include <linux/percpu.h>
25#ifdef CONFIG_SYSCTL
26#include <linux/sysctl.h>
27#endif
28#include <net/checksum.h>
29#include <net/ip.h>
30
31#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
32#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
33
34#include <linux/netfilter_ipv4/ip_conntrack.h>
35#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
36#include <linux/netfilter_ipv4/ip_conntrack_core.h>
37#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
38#include <linux/netfilter_ipv4/listhelp.h>
39
40#if 0
41#define DEBUGP printk
42#else
43#define DEBUGP(format, args...)
44#endif
45
46MODULE_LICENSE("GPL");
47
48extern atomic_t ip_conntrack_count;
49DECLARE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
50
51static int kill_proto(struct ip_conntrack *i, void *data)
52{
53 return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum ==
54 *((u_int8_t *) data));
55}
56
57#ifdef CONFIG_PROC_FS
58static int
59print_tuple(struct seq_file *s, const struct ip_conntrack_tuple *tuple,
60 struct ip_conntrack_protocol *proto)
61{
62 seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ",
63 NIPQUAD(tuple->src.ip), NIPQUAD(tuple->dst.ip));
64 return proto->print_tuple(s, tuple);
65}
66
67#ifdef CONFIG_IP_NF_CT_ACCT
68static unsigned int
69seq_print_counters(struct seq_file *s,
70 const struct ip_conntrack_counter *counter)
71{
72 return seq_printf(s, "packets=%llu bytes=%llu ",
73 (unsigned long long)counter->packets,
74 (unsigned long long)counter->bytes);
75}
76#else
77#define seq_print_counters(x, y) 0
78#endif
79
80struct ct_iter_state {
81 unsigned int bucket;
82};
83
84static struct list_head *ct_get_first(struct seq_file *seq)
85{
86 struct ct_iter_state *st = seq->private;
87
88 for (st->bucket = 0;
89 st->bucket < ip_conntrack_htable_size;
90 st->bucket++) {
91 if (!list_empty(&ip_conntrack_hash[st->bucket]))
92 return ip_conntrack_hash[st->bucket].next;
93 }
94 return NULL;
95}
96
97static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head)
98{
99 struct ct_iter_state *st = seq->private;
100
101 head = head->next;
102 while (head == &ip_conntrack_hash[st->bucket]) {
103 if (++st->bucket >= ip_conntrack_htable_size)
104 return NULL;
105 head = ip_conntrack_hash[st->bucket].next;
106 }
107 return head;
108}
109
110static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos)
111{
112 struct list_head *head = ct_get_first(seq);
113
114 if (head)
115 while (pos && (head = ct_get_next(seq, head)))
116 pos--;
117 return pos ? NULL : head;
118}
119
120static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
121{
122 READ_LOCK(&ip_conntrack_lock);
123 return ct_get_idx(seq, *pos);
124}
125
126static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
127{
128 (*pos)++;
129 return ct_get_next(s, v);
130}
131
132static void ct_seq_stop(struct seq_file *s, void *v)
133{
134 READ_UNLOCK(&ip_conntrack_lock);
135}
136
137static int ct_seq_show(struct seq_file *s, void *v)
138{
139 const struct ip_conntrack_tuple_hash *hash = v;
140 const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash);
141 struct ip_conntrack_protocol *proto;
142
143 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
144 IP_NF_ASSERT(conntrack);
145
146 /* we only want to print DIR_ORIGINAL */
147 if (DIRECTION(hash))
148 return 0;
149
150 proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
151 .tuple.dst.protonum);
152 IP_NF_ASSERT(proto);
153
154 if (seq_printf(s, "%-8s %u %ld ",
155 proto->name,
156 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum,
157 timer_pending(&conntrack->timeout)
158 ? (long)(conntrack->timeout.expires - jiffies)/HZ
159 : 0) != 0)
160 return -ENOSPC;
161
162 if (proto->print_conntrack(s, conntrack))
163 return -ENOSPC;
164
165 if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
166 proto))
167 return -ENOSPC;
168
169 if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL]))
170 return -ENOSPC;
171
172 if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)))
173 if (seq_printf(s, "[UNREPLIED] "))
174 return -ENOSPC;
175
176 if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple,
177 proto))
178 return -ENOSPC;
179
180 if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY]))
181 return -ENOSPC;
182
183 if (test_bit(IPS_ASSURED_BIT, &conntrack->status))
184 if (seq_printf(s, "[ASSURED] "))
185 return -ENOSPC;
186
187#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
188 if (seq_printf(s, "mark=%lu ", conntrack->mark))
189 return -ENOSPC;
190#endif
191
192 if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
193 return -ENOSPC;
194
195 return 0;
196}
197
198static struct seq_operations ct_seq_ops = {
199 .start = ct_seq_start,
200 .next = ct_seq_next,
201 .stop = ct_seq_stop,
202 .show = ct_seq_show
203};
204
205static int ct_open(struct inode *inode, struct file *file)
206{
207 struct seq_file *seq;
208 struct ct_iter_state *st;
209 int ret;
210
211 st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL);
212 if (st == NULL)
213 return -ENOMEM;
214 ret = seq_open(file, &ct_seq_ops);
215 if (ret)
216 goto out_free;
217 seq = file->private_data;
218 seq->private = st;
219 memset(st, 0, sizeof(struct ct_iter_state));
220 return ret;
221out_free:
222 kfree(st);
223 return ret;
224}
225
226static struct file_operations ct_file_ops = {
227 .owner = THIS_MODULE,
228 .open = ct_open,
229 .read = seq_read,
230 .llseek = seq_lseek,
231 .release = seq_release_private,
232};
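/* Editorial aside: ct_file_ops above backs the /proc/net/ip_conntrack file
 * created further down in init_or_cleanup(). A throwaway userspace reader
 * (an illustration, not part of this patch) only has to stream the file;
 * each line is one entry in the format produced by ct_seq_show(). The file
 * is created with mode 0440, so this needs the right privileges. */
#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/net/ip_conntrack", "r");

	if (!f) {
		perror("/proc/net/ip_conntrack");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}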
233
234/* expects */
235static void *exp_seq_start(struct seq_file *s, loff_t *pos)
236{
237 struct list_head *e = &ip_conntrack_expect_list;
238 loff_t i;
239
 240	/* The seq_file API calls stop() even if start() fails,
 241	 * so we need to grab the lock here, since stop() unlocks. */
242 READ_LOCK(&ip_conntrack_lock);
243
244 if (list_empty(e))
245 return NULL;
246
247 for (i = 0; i <= *pos; i++) {
248 e = e->next;
249 if (e == &ip_conntrack_expect_list)
250 return NULL;
251 }
252 return e;
253}
254
255static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
256{
257 struct list_head *e = v;
258
259 e = e->next;
260
261 if (e == &ip_conntrack_expect_list)
262 return NULL;
263
264 return e;
265}
266
267static void exp_seq_stop(struct seq_file *s, void *v)
268{
269 READ_UNLOCK(&ip_conntrack_lock);
270}
271
272static int exp_seq_show(struct seq_file *s, void *v)
273{
274 struct ip_conntrack_expect *expect = v;
275
276 if (expect->timeout.function)
277 seq_printf(s, "%ld ", timer_pending(&expect->timeout)
278 ? (long)(expect->timeout.expires - jiffies)/HZ : 0);
279 else
280 seq_printf(s, "- ");
281
282 seq_printf(s, "proto=%u ", expect->tuple.dst.protonum);
283
284 print_tuple(s, &expect->tuple,
285 ip_ct_find_proto(expect->tuple.dst.protonum));
286 return seq_putc(s, '\n');
287}
288
289static struct seq_operations exp_seq_ops = {
290 .start = exp_seq_start,
291 .next = exp_seq_next,
292 .stop = exp_seq_stop,
293 .show = exp_seq_show
294};
295
296static int exp_open(struct inode *inode, struct file *file)
297{
298 return seq_open(file, &exp_seq_ops);
299}
300
301static struct file_operations exp_file_ops = {
302 .owner = THIS_MODULE,
303 .open = exp_open,
304 .read = seq_read,
305 .llseek = seq_lseek,
306 .release = seq_release
307};
308
309static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
310{
311 int cpu;
312
313 if (*pos == 0)
314 return SEQ_START_TOKEN;
315
316 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
317 if (!cpu_possible(cpu))
318 continue;
319 *pos = cpu+1;
320 return &per_cpu(ip_conntrack_stat, cpu);
321 }
322
323 return NULL;
324}
325
326static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
327{
328 int cpu;
329
330 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
331 if (!cpu_possible(cpu))
332 continue;
333 *pos = cpu+1;
334 return &per_cpu(ip_conntrack_stat, cpu);
335 }
336
337 return NULL;
338}
339
340static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
341{
342}
343
344static int ct_cpu_seq_show(struct seq_file *seq, void *v)
345{
346 unsigned int nr_conntracks = atomic_read(&ip_conntrack_count);
347 struct ip_conntrack_stat *st = v;
348
349 if (v == SEQ_START_TOKEN) {
350 seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n");
351 return 0;
352 }
353
354 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
355 "%08x %08x %08x %08x %08x %08x %08x %08x \n",
356 nr_conntracks,
357 st->searched,
358 st->found,
359 st->new,
360 st->invalid,
361 st->ignore,
362 st->delete,
363 st->delete_list,
364 st->insert,
365 st->insert_failed,
366 st->drop,
367 st->early_drop,
368 st->error,
369
370 st->expect_new,
371 st->expect_create,
372 st->expect_delete
373 );
374 return 0;
375}
376
377static struct seq_operations ct_cpu_seq_ops = {
378 .start = ct_cpu_seq_start,
379 .next = ct_cpu_seq_next,
380 .stop = ct_cpu_seq_stop,
381 .show = ct_cpu_seq_show,
382};
383
384static int ct_cpu_seq_open(struct inode *inode, struct file *file)
385{
386 return seq_open(file, &ct_cpu_seq_ops);
387}
388
389static struct file_operations ct_cpu_seq_fops = {
390 .owner = THIS_MODULE,
391 .open = ct_cpu_seq_open,
392 .read = seq_read,
393 .llseek = seq_lseek,
394 .release = seq_release_private,
395};
396#endif
397
398static unsigned int ip_confirm(unsigned int hooknum,
399 struct sk_buff **pskb,
400 const struct net_device *in,
401 const struct net_device *out,
402 int (*okfn)(struct sk_buff *))
403{
404 struct ip_conntrack *ct;
405 enum ip_conntrack_info ctinfo;
406
407 /* This is where we call the helper: as the packet goes out. */
408 ct = ip_conntrack_get(*pskb, &ctinfo);
409 if (ct && ct->helper) {
410 unsigned int ret;
411 ret = ct->helper->help(pskb, ct, ctinfo);
412 if (ret != NF_ACCEPT)
413 return ret;
414 }
415
416 /* We've seen it coming out the other side: confirm it */
417 return ip_conntrack_confirm(pskb);
418}
419
420static unsigned int ip_conntrack_defrag(unsigned int hooknum,
421 struct sk_buff **pskb,
422 const struct net_device *in,
423 const struct net_device *out,
424 int (*okfn)(struct sk_buff *))
425{
426#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE)
427 /* Previously seen (loopback)? Ignore. Do this before
428 fragment check. */
429 if ((*pskb)->nfct)
430 return NF_ACCEPT;
431#endif
432
433 /* Gather fragments. */
434 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
435 *pskb = ip_ct_gather_frags(*pskb,
436 hooknum == NF_IP_PRE_ROUTING ?
437 IP_DEFRAG_CONNTRACK_IN :
438 IP_DEFRAG_CONNTRACK_OUT);
439 if (!*pskb)
440 return NF_STOLEN;
441 }
442 return NF_ACCEPT;
443}
444
445static unsigned int ip_refrag(unsigned int hooknum,
446 struct sk_buff **pskb,
447 const struct net_device *in,
448 const struct net_device *out,
449 int (*okfn)(struct sk_buff *))
450{
451 struct rtable *rt = (struct rtable *)(*pskb)->dst;
452
453 /* We've seen it coming out the other side: confirm */
454 if (ip_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT)
455 return NF_DROP;
456
457 /* Local packets are never produced too large for their
 458	   interface. We defragment them at LOCAL_OUT, however,
459 so we have to refragment them here. */
460 if ((*pskb)->len > dst_mtu(&rt->u.dst) &&
461 !skb_shinfo(*pskb)->tso_size) {
462 /* No hook can be after us, so this should be OK. */
463 ip_fragment(*pskb, okfn);
464 return NF_STOLEN;
465 }
466 return NF_ACCEPT;
467}
468
469static unsigned int ip_conntrack_local(unsigned int hooknum,
470 struct sk_buff **pskb,
471 const struct net_device *in,
472 const struct net_device *out,
473 int (*okfn)(struct sk_buff *))
474{
475 /* root is playing with raw sockets. */
476 if ((*pskb)->len < sizeof(struct iphdr)
477 || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
478 if (net_ratelimit())
479 printk("ipt_hook: happy cracking.\n");
480 return NF_ACCEPT;
481 }
482 return ip_conntrack_in(hooknum, pskb, in, out, okfn);
483}
484
485/* Connection tracking may drop packets, but never alters them, so
486 make it the first hook. */
487static struct nf_hook_ops ip_conntrack_defrag_ops = {
488 .hook = ip_conntrack_defrag,
489 .owner = THIS_MODULE,
490 .pf = PF_INET,
491 .hooknum = NF_IP_PRE_ROUTING,
492 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
493};
494
495static struct nf_hook_ops ip_conntrack_in_ops = {
496 .hook = ip_conntrack_in,
497 .owner = THIS_MODULE,
498 .pf = PF_INET,
499 .hooknum = NF_IP_PRE_ROUTING,
500 .priority = NF_IP_PRI_CONNTRACK,
501};
502
503static struct nf_hook_ops ip_conntrack_defrag_local_out_ops = {
504 .hook = ip_conntrack_defrag,
505 .owner = THIS_MODULE,
506 .pf = PF_INET,
507 .hooknum = NF_IP_LOCAL_OUT,
508 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
509};
510
511static struct nf_hook_ops ip_conntrack_local_out_ops = {
512 .hook = ip_conntrack_local,
513 .owner = THIS_MODULE,
514 .pf = PF_INET,
515 .hooknum = NF_IP_LOCAL_OUT,
516 .priority = NF_IP_PRI_CONNTRACK,
517};
518
519/* Refragmenter; last chance. */
520static struct nf_hook_ops ip_conntrack_out_ops = {
521 .hook = ip_refrag,
522 .owner = THIS_MODULE,
523 .pf = PF_INET,
524 .hooknum = NF_IP_POST_ROUTING,
525 .priority = NF_IP_PRI_LAST,
526};
527
528static struct nf_hook_ops ip_conntrack_local_in_ops = {
529 .hook = ip_confirm,
530 .owner = THIS_MODULE,
531 .pf = PF_INET,
532 .hooknum = NF_IP_LOCAL_IN,
533 .priority = NF_IP_PRI_LAST-1,
534};
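/* Net effect of the six nf_hook_ops above: incoming packets are defragmented
 * and then tracked in PRE_ROUTING, locally generated ones in LOCAL_OUT, and
 * every connection is confirmed at the last hook its packets traverse -
 * LOCAL_IN for traffic addressed to this host, POST_ROUTING (where ip_refrag
 * also re-fragments) for forwarded and locally generated traffic. */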
535
536/* Sysctl support */
537
538#ifdef CONFIG_SYSCTL
539
540/* From ip_conntrack_core.c */
541extern int ip_conntrack_max;
542extern unsigned int ip_conntrack_htable_size;
543
544/* From ip_conntrack_proto_tcp.c */
545extern unsigned long ip_ct_tcp_timeout_syn_sent;
546extern unsigned long ip_ct_tcp_timeout_syn_recv;
547extern unsigned long ip_ct_tcp_timeout_established;
548extern unsigned long ip_ct_tcp_timeout_fin_wait;
549extern unsigned long ip_ct_tcp_timeout_close_wait;
550extern unsigned long ip_ct_tcp_timeout_last_ack;
551extern unsigned long ip_ct_tcp_timeout_time_wait;
552extern unsigned long ip_ct_tcp_timeout_close;
553extern unsigned long ip_ct_tcp_timeout_max_retrans;
554extern int ip_ct_tcp_loose;
555extern int ip_ct_tcp_be_liberal;
556extern int ip_ct_tcp_max_retrans;
557
558/* From ip_conntrack_proto_udp.c */
559extern unsigned long ip_ct_udp_timeout;
560extern unsigned long ip_ct_udp_timeout_stream;
561
562/* From ip_conntrack_proto_icmp.c */
563extern unsigned long ip_ct_icmp_timeout;
564
 565/* From ip_conntrack_proto_generic.c */
566extern unsigned long ip_ct_generic_timeout;
567
568/* Log invalid packets of a given protocol */
569static int log_invalid_proto_min = 0;
570static int log_invalid_proto_max = 255;
571
572static struct ctl_table_header *ip_ct_sysctl_header;
573
574static ctl_table ip_ct_sysctl_table[] = {
575 {
576 .ctl_name = NET_IPV4_NF_CONNTRACK_MAX,
577 .procname = "ip_conntrack_max",
578 .data = &ip_conntrack_max,
579 .maxlen = sizeof(int),
580 .mode = 0644,
581 .proc_handler = &proc_dointvec,
582 },
583 {
584 .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT,
585 .procname = "ip_conntrack_count",
586 .data = &ip_conntrack_count,
587 .maxlen = sizeof(int),
588 .mode = 0444,
589 .proc_handler = &proc_dointvec,
590 },
591 {
592 .ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS,
593 .procname = "ip_conntrack_buckets",
594 .data = &ip_conntrack_htable_size,
595 .maxlen = sizeof(unsigned int),
596 .mode = 0444,
597 .proc_handler = &proc_dointvec,
598 },
599 {
600 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
601 .procname = "ip_conntrack_tcp_timeout_syn_sent",
602 .data = &ip_ct_tcp_timeout_syn_sent,
603 .maxlen = sizeof(unsigned int),
604 .mode = 0644,
605 .proc_handler = &proc_dointvec_jiffies,
606 },
607 {
608 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV,
609 .procname = "ip_conntrack_tcp_timeout_syn_recv",
610 .data = &ip_ct_tcp_timeout_syn_recv,
611 .maxlen = sizeof(unsigned int),
612 .mode = 0644,
613 .proc_handler = &proc_dointvec_jiffies,
614 },
615 {
616 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED,
617 .procname = "ip_conntrack_tcp_timeout_established",
618 .data = &ip_ct_tcp_timeout_established,
619 .maxlen = sizeof(unsigned int),
620 .mode = 0644,
621 .proc_handler = &proc_dointvec_jiffies,
622 },
623 {
624 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT,
625 .procname = "ip_conntrack_tcp_timeout_fin_wait",
626 .data = &ip_ct_tcp_timeout_fin_wait,
627 .maxlen = sizeof(unsigned int),
628 .mode = 0644,
629 .proc_handler = &proc_dointvec_jiffies,
630 },
631 {
632 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT,
633 .procname = "ip_conntrack_tcp_timeout_close_wait",
634 .data = &ip_ct_tcp_timeout_close_wait,
635 .maxlen = sizeof(unsigned int),
636 .mode = 0644,
637 .proc_handler = &proc_dointvec_jiffies,
638 },
639 {
640 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK,
641 .procname = "ip_conntrack_tcp_timeout_last_ack",
642 .data = &ip_ct_tcp_timeout_last_ack,
643 .maxlen = sizeof(unsigned int),
644 .mode = 0644,
645 .proc_handler = &proc_dointvec_jiffies,
646 },
647 {
648 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT,
649 .procname = "ip_conntrack_tcp_timeout_time_wait",
650 .data = &ip_ct_tcp_timeout_time_wait,
651 .maxlen = sizeof(unsigned int),
652 .mode = 0644,
653 .proc_handler = &proc_dointvec_jiffies,
654 },
655 {
656 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE,
657 .procname = "ip_conntrack_tcp_timeout_close",
658 .data = &ip_ct_tcp_timeout_close,
659 .maxlen = sizeof(unsigned int),
660 .mode = 0644,
661 .proc_handler = &proc_dointvec_jiffies,
662 },
663 {
664 .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT,
665 .procname = "ip_conntrack_udp_timeout",
666 .data = &ip_ct_udp_timeout,
667 .maxlen = sizeof(unsigned int),
668 .mode = 0644,
669 .proc_handler = &proc_dointvec_jiffies,
670 },
671 {
672 .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM,
673 .procname = "ip_conntrack_udp_timeout_stream",
674 .data = &ip_ct_udp_timeout_stream,
675 .maxlen = sizeof(unsigned int),
676 .mode = 0644,
677 .proc_handler = &proc_dointvec_jiffies,
678 },
679 {
680 .ctl_name = NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT,
681 .procname = "ip_conntrack_icmp_timeout",
682 .data = &ip_ct_icmp_timeout,
683 .maxlen = sizeof(unsigned int),
684 .mode = 0644,
685 .proc_handler = &proc_dointvec_jiffies,
686 },
687 {
688 .ctl_name = NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT,
689 .procname = "ip_conntrack_generic_timeout",
690 .data = &ip_ct_generic_timeout,
691 .maxlen = sizeof(unsigned int),
692 .mode = 0644,
693 .proc_handler = &proc_dointvec_jiffies,
694 },
695 {
696 .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID,
697 .procname = "ip_conntrack_log_invalid",
698 .data = &ip_ct_log_invalid,
699 .maxlen = sizeof(unsigned int),
700 .mode = 0644,
701 .proc_handler = &proc_dointvec_minmax,
702 .strategy = &sysctl_intvec,
703 .extra1 = &log_invalid_proto_min,
704 .extra2 = &log_invalid_proto_max,
705 },
706 {
707 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS,
708 .procname = "ip_conntrack_tcp_timeout_max_retrans",
709 .data = &ip_ct_tcp_timeout_max_retrans,
710 .maxlen = sizeof(unsigned int),
711 .mode = 0644,
712 .proc_handler = &proc_dointvec_jiffies,
713 },
714 {
715 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_LOOSE,
716 .procname = "ip_conntrack_tcp_loose",
717 .data = &ip_ct_tcp_loose,
718 .maxlen = sizeof(unsigned int),
719 .mode = 0644,
720 .proc_handler = &proc_dointvec,
721 },
722 {
723 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL,
724 .procname = "ip_conntrack_tcp_be_liberal",
725 .data = &ip_ct_tcp_be_liberal,
726 .maxlen = sizeof(unsigned int),
727 .mode = 0644,
728 .proc_handler = &proc_dointvec,
729 },
730 {
731 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS,
732 .procname = "ip_conntrack_tcp_max_retrans",
733 .data = &ip_ct_tcp_max_retrans,
734 .maxlen = sizeof(unsigned int),
735 .mode = 0644,
736 .proc_handler = &proc_dointvec,
737 },
738 { .ctl_name = 0 }
739};
740
741#define NET_IP_CONNTRACK_MAX 2089
742
743static ctl_table ip_ct_netfilter_table[] = {
744 {
745 .ctl_name = NET_IPV4_NETFILTER,
746 .procname = "netfilter",
747 .mode = 0555,
748 .child = ip_ct_sysctl_table,
749 },
750 {
751 .ctl_name = NET_IP_CONNTRACK_MAX,
752 .procname = "ip_conntrack_max",
753 .data = &ip_conntrack_max,
754 .maxlen = sizeof(int),
755 .mode = 0644,
756 .proc_handler = &proc_dointvec
757 },
758 { .ctl_name = 0 }
759};
760
761static ctl_table ip_ct_ipv4_table[] = {
762 {
763 .ctl_name = NET_IPV4,
764 .procname = "ipv4",
765 .mode = 0555,
766 .child = ip_ct_netfilter_table,
767 },
768 { .ctl_name = 0 }
769};
770
771static ctl_table ip_ct_net_table[] = {
772 {
773 .ctl_name = CTL_NET,
774 .procname = "net",
775 .mode = 0555,
776 .child = ip_ct_ipv4_table,
777 },
778 { .ctl_name = 0 }
779};
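/* Editorial sketch (not from this patch): the nesting above lands the table
 * under /proc/sys/net/ipv4/netfilter/. Entries handled by
 * proc_dointvec_jiffies are read and written in seconds, so a privileged
 * process can retune, for example, the established-TCP timeout like this
 * (path and value are illustrative): */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/netfilter/"
			"ip_conntrack_tcp_timeout_established", "w");

	if (!f) {
		perror("sysctl open");
		return 1;
	}
	fprintf(f, "%d\n", 3600);	/* one hour, in seconds */
	return fclose(f) ? 1 : 0;
}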
780
781EXPORT_SYMBOL(ip_ct_log_invalid);
782#endif /* CONFIG_SYSCTL */
783
784static int init_or_cleanup(int init)
785{
786#ifdef CONFIG_PROC_FS
787 struct proc_dir_entry *proc, *proc_exp, *proc_stat;
788#endif
789 int ret = 0;
790
791 if (!init) goto cleanup;
792
793 ret = ip_conntrack_init();
794 if (ret < 0)
795 goto cleanup_nothing;
796
797#ifdef CONFIG_PROC_FS
798 ret = -ENOMEM;
799 proc = proc_net_fops_create("ip_conntrack", 0440, &ct_file_ops);
800 if (!proc) goto cleanup_init;
801
802 proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440,
803 &exp_file_ops);
804 if (!proc_exp) goto cleanup_proc;
805
806 proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat);
807 if (!proc_stat)
808 goto cleanup_proc_exp;
809
810 proc_stat->proc_fops = &ct_cpu_seq_fops;
811 proc_stat->owner = THIS_MODULE;
812#endif
813
814 ret = nf_register_hook(&ip_conntrack_defrag_ops);
815 if (ret < 0) {
816 printk("ip_conntrack: can't register pre-routing defrag hook.\n");
817 goto cleanup_proc_stat;
818 }
819 ret = nf_register_hook(&ip_conntrack_defrag_local_out_ops);
820 if (ret < 0) {
821 printk("ip_conntrack: can't register local_out defrag hook.\n");
822 goto cleanup_defragops;
823 }
824 ret = nf_register_hook(&ip_conntrack_in_ops);
825 if (ret < 0) {
826 printk("ip_conntrack: can't register pre-routing hook.\n");
827 goto cleanup_defraglocalops;
828 }
829 ret = nf_register_hook(&ip_conntrack_local_out_ops);
830 if (ret < 0) {
831 printk("ip_conntrack: can't register local out hook.\n");
832 goto cleanup_inops;
833 }
834 ret = nf_register_hook(&ip_conntrack_out_ops);
835 if (ret < 0) {
836 printk("ip_conntrack: can't register post-routing hook.\n");
837 goto cleanup_inandlocalops;
838 }
839 ret = nf_register_hook(&ip_conntrack_local_in_ops);
840 if (ret < 0) {
841 printk("ip_conntrack: can't register local in hook.\n");
842 goto cleanup_inoutandlocalops;
843 }
844#ifdef CONFIG_SYSCTL
845 ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0);
846 if (ip_ct_sysctl_header == NULL) {
847 printk("ip_conntrack: can't register to sysctl.\n");
848 ret = -ENOMEM;
849 goto cleanup_localinops;
850 }
851#endif
852
853 return ret;
854
855 cleanup:
856#ifdef CONFIG_SYSCTL
857 unregister_sysctl_table(ip_ct_sysctl_header);
858 cleanup_localinops:
859#endif
860 nf_unregister_hook(&ip_conntrack_local_in_ops);
861 cleanup_inoutandlocalops:
862 nf_unregister_hook(&ip_conntrack_out_ops);
863 cleanup_inandlocalops:
864 nf_unregister_hook(&ip_conntrack_local_out_ops);
865 cleanup_inops:
866 nf_unregister_hook(&ip_conntrack_in_ops);
867 cleanup_defraglocalops:
868 nf_unregister_hook(&ip_conntrack_defrag_local_out_ops);
869 cleanup_defragops:
870 nf_unregister_hook(&ip_conntrack_defrag_ops);
871 cleanup_proc_stat:
872#ifdef CONFIG_PROC_FS
873 remove_proc_entry("ip_conntrack", proc_net_stat);
874 cleanup_proc_exp:
875 proc_net_remove("ip_conntrack_expect");
876 cleanup_proc:
877 proc_net_remove("ip_conntrack");
878 cleanup_init:
879#endif /* CONFIG_PROC_FS */
880 ip_conntrack_cleanup();
881 cleanup_nothing:
882 return ret;
883}
884
885/* FIXME: Allow NULL functions and sub in pointers to generic for
886 them. --RR */
887int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto)
888{
889 int ret = 0;
890
891 WRITE_LOCK(&ip_conntrack_lock);
892 if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) {
893 ret = -EBUSY;
894 goto out;
895 }
896 ip_ct_protos[proto->proto] = proto;
897 out:
898 WRITE_UNLOCK(&ip_conntrack_lock);
899 return ret;
900}
901
902void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto)
903{
904 WRITE_LOCK(&ip_conntrack_lock);
905 ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol;
906 WRITE_UNLOCK(&ip_conntrack_lock);
907
 908	/* Somebody could still be looking at the proto in bh. */
909 synchronize_net();
910
 911	/* Remove all conntrack entries for this protocol */
912 ip_ct_iterate_cleanup(kill_proto, &proto->proto);
913}
914
915static int __init init(void)
916{
917 return init_or_cleanup(1);
918}
919
920static void __exit fini(void)
921{
922 init_or_cleanup(0);
923}
924
925module_init(init);
926module_exit(fini);
927
928/* Some modules need us, but don't depend directly on any symbol.
929 They should call this. */
930void need_ip_conntrack(void)
931{
932}
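/* Editorial sketch (hypothetical module, not in this patch): a match or
 * target that only consults conntrack state may have no other symbol
 * reference into this module, so it calls the empty function above purely
 * to create a link-time dependency that pulls in ip_conntrack and keeps it
 * pinned while the caller is loaded. The declaration is assumed to come
 * from ip_conntrack.h. */
#include <linux/module.h>
#include <linux/netfilter_ipv4/ip_conntrack.h>

static int __init my_ct_user_init(void)
{
	need_ip_conntrack();
	return 0;
}

static void __exit my_ct_user_fini(void)
{
}

module_init(my_ct_user_init);
module_exit(my_ct_user_fini);
MODULE_LICENSE("GPL");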
933
934EXPORT_SYMBOL(ip_conntrack_protocol_register);
935EXPORT_SYMBOL(ip_conntrack_protocol_unregister);
936EXPORT_SYMBOL(ip_ct_get_tuple);
937EXPORT_SYMBOL(invert_tuplepr);
938EXPORT_SYMBOL(ip_conntrack_alter_reply);
939EXPORT_SYMBOL(ip_conntrack_destroyed);
940EXPORT_SYMBOL(need_ip_conntrack);
941EXPORT_SYMBOL(ip_conntrack_helper_register);
942EXPORT_SYMBOL(ip_conntrack_helper_unregister);
943EXPORT_SYMBOL(ip_ct_iterate_cleanup);
944EXPORT_SYMBOL(ip_ct_refresh_acct);
945EXPORT_SYMBOL(ip_ct_protos);
946EXPORT_SYMBOL(ip_ct_find_proto);
947EXPORT_SYMBOL(ip_conntrack_expect_alloc);
948EXPORT_SYMBOL(ip_conntrack_expect_free);
949EXPORT_SYMBOL(ip_conntrack_expect_related);
950EXPORT_SYMBOL(ip_conntrack_unexpect_related);
951EXPORT_SYMBOL(ip_conntrack_tuple_taken);
952EXPORT_SYMBOL(ip_ct_gather_frags);
953EXPORT_SYMBOL(ip_conntrack_htable_size);
954EXPORT_SYMBOL(ip_conntrack_lock);
955EXPORT_SYMBOL(ip_conntrack_hash);
956EXPORT_SYMBOL(ip_conntrack_untracked);
957EXPORT_SYMBOL_GPL(ip_conntrack_find_get);
958EXPORT_SYMBOL_GPL(ip_conntrack_put);
959#ifdef CONFIG_IP_NF_NAT_NEEDED
960EXPORT_SYMBOL(ip_conntrack_tcp_update);
961#endif
diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c
new file mode 100644
index 000000000000..992fac3e36ee
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_tftp.c
@@ -0,0 +1,159 @@
1/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License version 2 as
5 * published by the Free Software Foundation.
6 *
7 * Version: 0.0.7
8 *
9 * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org>
10 * - port to newnat API
11 *
12 */
13
14#include <linux/module.h>
15#include <linux/ip.h>
16#include <linux/udp.h>
17
18#include <linux/netfilter.h>
19#include <linux/netfilter_ipv4/ip_tables.h>
20#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
21#include <linux/netfilter_ipv4/ip_conntrack_tftp.h>
22#include <linux/moduleparam.h>
23
24MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
25MODULE_DESCRIPTION("tftp connection tracking helper");
26MODULE_LICENSE("GPL");
27
28#define MAX_PORTS 8
29static int ports[MAX_PORTS];
30static int ports_c;
31module_param_array(ports, int, &ports_c, 0400);
32MODULE_PARM_DESC(ports, "port numbers of tftp servers");
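/* Example usage (illustrative, not from this patch): loading with
 * "modprobe ip_conntrack_tftp ports=69,1069" registers one helper per
 * listed server port in init() below; with no argument only TFTP_PORT
 * is tracked. */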
33
34#if 0
35#define DEBUGP(format, args...) printk("%s:%s:" format, \
36 __FILE__, __FUNCTION__ , ## args)
37#else
38#define DEBUGP(format, args...)
39#endif
40
41unsigned int (*ip_nat_tftp_hook)(struct sk_buff **pskb,
42 enum ip_conntrack_info ctinfo,
43 struct ip_conntrack_expect *exp);
44EXPORT_SYMBOL_GPL(ip_nat_tftp_hook);
45
46static int tftp_help(struct sk_buff **pskb,
47 struct ip_conntrack *ct,
48 enum ip_conntrack_info ctinfo)
49{
50 struct tftphdr _tftph, *tfh;
51 struct ip_conntrack_expect *exp;
52 unsigned int ret = NF_ACCEPT;
53
54 tfh = skb_header_pointer(*pskb,
55 (*pskb)->nh.iph->ihl*4+sizeof(struct udphdr),
56 sizeof(_tftph), &_tftph);
57 if (tfh == NULL)
58 return NF_ACCEPT;
59
60 switch (ntohs(tfh->opcode)) {
 61	/* RRQ and WRQ work the same way */
62 case TFTP_OPCODE_READ:
63 case TFTP_OPCODE_WRITE:
64 DEBUGP("");
65 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
66 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
67
68 exp = ip_conntrack_expect_alloc();
69 if (exp == NULL)
70 return NF_DROP;
71
72 exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
73 exp->mask.src.ip = 0xffffffff;
74 exp->mask.dst.ip = 0xffffffff;
75 exp->mask.dst.u.udp.port = 0xffff;
76 exp->mask.dst.protonum = 0xff;
77 exp->expectfn = NULL;
78 exp->master = ct;
79
80 DEBUGP("expect: ");
81 DUMP_TUPLE(&exp->tuple);
82 DUMP_TUPLE(&exp->mask);
83 if (ip_nat_tftp_hook)
84 ret = ip_nat_tftp_hook(pskb, ctinfo, exp);
85 else if (ip_conntrack_expect_related(exp) != 0) {
86 ip_conntrack_expect_free(exp);
87 ret = NF_DROP;
88 }
89 break;
90 case TFTP_OPCODE_DATA:
91 case TFTP_OPCODE_ACK:
92 DEBUGP("Data/ACK opcode\n");
93 break;
94 case TFTP_OPCODE_ERROR:
95 DEBUGP("Error opcode\n");
96 break;
97 default:
98 DEBUGP("Unknown opcode\n");
99 }
100 return NF_ACCEPT;
101}
102
103static struct ip_conntrack_helper tftp[MAX_PORTS];
104static char tftp_names[MAX_PORTS][10];
105
106static void fini(void)
107{
108 int i;
109
110 for (i = 0 ; i < ports_c; i++) {
111 DEBUGP("unregistering helper for port %d\n",
112 ports[i]);
113 ip_conntrack_helper_unregister(&tftp[i]);
114 }
115}
116
117static int __init init(void)
118{
119 int i, ret;
120 char *tmpname;
121
122 if (ports_c == 0)
123 ports[ports_c++] = TFTP_PORT;
124
125 for (i = 0; i < ports_c; i++) {
126 /* Create helper structure */
127 memset(&tftp[i], 0, sizeof(struct ip_conntrack_helper));
128
129 tftp[i].tuple.dst.protonum = IPPROTO_UDP;
130 tftp[i].tuple.src.u.udp.port = htons(ports[i]);
131 tftp[i].mask.dst.protonum = 0xFF;
132 tftp[i].mask.src.u.udp.port = 0xFFFF;
133 tftp[i].max_expected = 1;
134 tftp[i].timeout = 5 * 60; /* 5 minutes */
135 tftp[i].me = THIS_MODULE;
136 tftp[i].help = tftp_help;
137
138 tmpname = &tftp_names[i][0];
139 if (ports[i] == TFTP_PORT)
140 sprintf(tmpname, "tftp");
141 else
142 sprintf(tmpname, "tftp-%d", i);
143 tftp[i].name = tmpname;
144
145 DEBUGP("port #%d: %d\n", i, ports[i]);
146
147 ret=ip_conntrack_helper_register(&tftp[i]);
148 if (ret) {
149 printk("ERROR registering helper for port %d\n",
150 ports[i]);
151 fini();
152 return(ret);
153 }
154 }
155 return(0);
156}
157
158module_init(init);
159module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_amanda.c b/net/ipv4/netfilter/ip_nat_amanda.c
new file mode 100644
index 000000000000..da1f412583ed
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_amanda.c
@@ -0,0 +1,88 @@
1/* Amanda extension for TCP NAT alteration.
2 * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
3 * based on a copy of HW's ip_nat_irc.c as well as other modules
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version
8 * 2 of the License, or (at your option) any later version.
9 *
10 * Module load syntax:
11 * insmod ip_nat_amanda.o
12 */
13
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/netfilter.h>
17#include <linux/skbuff.h>
18#include <linux/ip.h>
19#include <linux/udp.h>
20#include <net/tcp.h>
21#include <net/udp.h>
22
23#include <linux/netfilter_ipv4.h>
24#include <linux/netfilter_ipv4/ip_nat.h>
25#include <linux/netfilter_ipv4/ip_nat_helper.h>
26#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
27#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
28
29
30MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
31MODULE_DESCRIPTION("Amanda NAT helper");
32MODULE_LICENSE("GPL");
33
34static unsigned int help(struct sk_buff **pskb,
35 enum ip_conntrack_info ctinfo,
36 unsigned int matchoff,
37 unsigned int matchlen,
38 struct ip_conntrack_expect *exp)
39{
40 char buffer[sizeof("65535")];
41 u_int16_t port;
42 unsigned int ret;
43
44 /* Connection comes from client. */
45 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
46 exp->dir = IP_CT_DIR_ORIGINAL;
47
 48	/* When we see the expected packet, we need to NAT it the same as
 49	 * this one (ie. same IP: it will be TCP and the master is UDP). */
50 exp->expectfn = ip_nat_follow_master;
51
52 /* Try to get same port: if not, try to change it. */
53 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
54 exp->tuple.dst.u.tcp.port = htons(port);
55 if (ip_conntrack_expect_related(exp) == 0)
56 break;
57 }
58
59 if (port == 0) {
60 ip_conntrack_expect_free(exp);
61 return NF_DROP;
62 }
63
64 sprintf(buffer, "%u", port);
65 ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo,
66 matchoff, matchlen,
67 buffer, strlen(buffer));
68 if (ret != NF_ACCEPT)
69 ip_conntrack_unexpect_related(exp);
70 return ret;
71}
72
73static void __exit fini(void)
74{
75 ip_nat_amanda_hook = NULL;
 76	/* Make sure no one calls it in the meantime. */
77 synchronize_net();
78}
79
80static int __init init(void)
81{
82 BUG_ON(ip_nat_amanda_hook);
83 ip_nat_amanda_hook = help;
84 return 0;
85}
86
87module_init(init);
88module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
new file mode 100644
index 000000000000..162ceacfc29a
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -0,0 +1,556 @@
1/* NAT for netfilter; shared with compatibility layer. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/types.h>
13#include <linux/timer.h>
14#include <linux/skbuff.h>
15#include <linux/netfilter_ipv4.h>
16#include <linux/vmalloc.h>
17#include <net/checksum.h>
18#include <net/icmp.h>
19#include <net/ip.h>
20#include <net/tcp.h> /* For tcp_prot in getorigdst */
21#include <linux/icmp.h>
22#include <linux/udp.h>
23#include <linux/jhash.h>
24
25#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
26#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
27
28#include <linux/netfilter_ipv4/ip_conntrack.h>
29#include <linux/netfilter_ipv4/ip_conntrack_core.h>
30#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
31#include <linux/netfilter_ipv4/ip_nat.h>
32#include <linux/netfilter_ipv4/ip_nat_protocol.h>
33#include <linux/netfilter_ipv4/ip_nat_core.h>
34#include <linux/netfilter_ipv4/ip_nat_helper.h>
35#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
36#include <linux/netfilter_ipv4/listhelp.h>
37
38#if 0
39#define DEBUGP printk
40#else
41#define DEBUGP(format, args...)
42#endif
43
44DECLARE_RWLOCK(ip_nat_lock);
45
46/* Calculated at init based on memory size */
47static unsigned int ip_nat_htable_size;
48
49static struct list_head *bysource;
50struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
51
52
53/* We keep an extra hash for each conntrack, for fast searching. */
54static inline unsigned int
55hash_by_src(const struct ip_conntrack_tuple *tuple)
56{
57 /* Original src, to ensure we map it consistently if poss. */
58 return jhash_3words(tuple->src.ip, tuple->src.u.all,
59 tuple->dst.protonum, 0) % ip_nat_htable_size;
60}
61
 62/* No one is using the conntrack by the time this is called. */
63static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
64{
65 if (!(conn->status & IPS_NAT_DONE_MASK))
66 return;
67
68 WRITE_LOCK(&ip_nat_lock);
69 list_del(&conn->nat.info.bysource);
70 WRITE_UNLOCK(&ip_nat_lock);
71}
72
73/* We do checksum mangling, so if they were wrong before they're still
74 * wrong. Also works for incomplete packets (eg. ICMP dest
75 * unreachables.) */
76u_int16_t
77ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
78{
79 u_int32_t diffs[] = { oldvalinv, newval };
80 return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
81 oldcheck^0xFFFF));
82}
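/* Editorial sketch of the caller pattern (an assumption based on this
 * helper's contract; the per-protocol NAT modules that actually do this are
 * not part of this hunk). Rewriting a UDP source port and source IP only
 * needs two nested incremental fixups of the existing checksum, instead of
 * re-summing the whole packet. All values are in network byte order, and
 * "example_nat_udp_source" is a hypothetical name. */
static void example_nat_udp_source(struct udphdr *hdr,
				   u_int32_t oldip, u_int32_t newip,
				   u_int16_t newport)
{
	if (hdr->check)		/* zero means "no UDP checksum present" */
		hdr->check = ip_nat_cheat_check(~oldip, newip,
				ip_nat_cheat_check(hdr->source ^ 0xFFFF,
						   newport, hdr->check));
	hdr->source = newport;
}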
83
84/* Is this tuple already taken? (not by us) */
85int
86ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
87 const struct ip_conntrack *ignored_conntrack)
88{
89 /* Conntrack tracking doesn't keep track of outgoing tuples; only
90 incoming ones. NAT means they don't have a fixed mapping,
91 so we invert the tuple and look for the incoming reply.
92
93 We could keep a separate hash if this proves too slow. */
94 struct ip_conntrack_tuple reply;
95
96 invert_tuplepr(&reply, tuple);
97 return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
98}
99
 100/* If we source map this tuple so the reply looks like reply_tuple, will
 101 * that meet the constraints of the given range? */
102static int
103in_range(const struct ip_conntrack_tuple *tuple,
104 const struct ip_nat_range *range)
105{
106 struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum);
107
108 /* If we are supposed to map IPs, then we must be in the
109 range specified, otherwise let this drag us onto a new src IP. */
110 if (range->flags & IP_NAT_RANGE_MAP_IPS) {
111 if (ntohl(tuple->src.ip) < ntohl(range->min_ip)
112 || ntohl(tuple->src.ip) > ntohl(range->max_ip))
113 return 0;
114 }
115
116 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
117 || proto->in_range(tuple, IP_NAT_MANIP_SRC,
118 &range->min, &range->max))
119 return 1;
120
121 return 0;
122}
123
124static inline int
125same_src(const struct ip_conntrack *ct,
126 const struct ip_conntrack_tuple *tuple)
127{
128 return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
129 == tuple->dst.protonum
130 && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
131 == tuple->src.ip
132 && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
133 == tuple->src.u.all);
134}
135
136/* Only called for SRC manip */
137static int
138find_appropriate_src(const struct ip_conntrack_tuple *tuple,
139 struct ip_conntrack_tuple *result,
140 const struct ip_nat_range *range)
141{
142 unsigned int h = hash_by_src(tuple);
143 struct ip_conntrack *ct;
144
145 READ_LOCK(&ip_nat_lock);
146 list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
147 if (same_src(ct, tuple)) {
148 /* Copy source part from reply tuple. */
149 invert_tuplepr(result,
150 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
151 result->dst = tuple->dst;
152
153 if (in_range(result, range)) {
154 READ_UNLOCK(&ip_nat_lock);
155 return 1;
156 }
157 }
158 }
159 READ_UNLOCK(&ip_nat_lock);
160 return 0;
161}
162
163/* For [FUTURE] fragmentation handling, we want the least-used
164 src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
165 if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
166 1-65535, we don't do pro-rata allocation based on ports; we choose
167 the ip with the lowest src-ip/dst-ip/proto usage.
168*/
169static void
170find_best_ips_proto(struct ip_conntrack_tuple *tuple,
171 const struct ip_nat_range *range,
172 const struct ip_conntrack *conntrack,
173 enum ip_nat_manip_type maniptype)
174{
175 u_int32_t *var_ipp;
176 /* Host order */
177 u_int32_t minip, maxip, j;
178
179 /* No IP mapping? Do nothing. */
180 if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
181 return;
182
183 if (maniptype == IP_NAT_MANIP_SRC)
184 var_ipp = &tuple->src.ip;
185 else
186 var_ipp = &tuple->dst.ip;
187
188 /* Fast path: only one choice. */
189 if (range->min_ip == range->max_ip) {
190 *var_ipp = range->min_ip;
191 return;
192 }
193
194 /* Hashing source and destination IPs gives a fairly even
195 * spread in practice (if there are a small number of IPs
196 * involved, there usually aren't that many connections
197 * anyway). The consistency means that servers see the same
198 * client coming from the same IP (some Internet Banking sites
199 * like this), even across reboots. */
200 minip = ntohl(range->min_ip);
201 maxip = ntohl(range->max_ip);
202 j = jhash_2words(tuple->src.ip, tuple->dst.ip, 0);
203 *var_ipp = htonl(minip + j % (maxip - minip + 1));
204}
205
206/* Manipulate the tuple into the range given. For NF_IP_POST_ROUTING,
207 * we change the source to map into the range. For NF_IP_PRE_ROUTING
208 * and NF_IP_LOCAL_OUT, we change the destination to map into the
209 * range. It might not be possible to get a unique tuple, but we try.
210 * At worst (or if we race), we will end up with a final duplicate in
211 * __ip_conntrack_confirm and drop the packet. */
212static void
213get_unique_tuple(struct ip_conntrack_tuple *tuple,
214 const struct ip_conntrack_tuple *orig_tuple,
215 const struct ip_nat_range *range,
216 struct ip_conntrack *conntrack,
217 enum ip_nat_manip_type maniptype)
218{
219 struct ip_nat_protocol *proto
220 = ip_nat_find_proto(orig_tuple->dst.protonum);
221
222 /* 1) If this srcip/proto/src-proto-part is currently mapped,
223 and that same mapping gives a unique tuple within the given
224 range, use that.
225
226 This is only required for source (ie. NAT/masq) mappings.
227 So far, we don't do local source mappings, so multiple
228 manips not an issue. */
229 if (maniptype == IP_NAT_MANIP_SRC) {
230 if (find_appropriate_src(orig_tuple, tuple, range)) {
231 DEBUGP("get_unique_tuple: Found current src map\n");
232 if (!ip_nat_used_tuple(tuple, conntrack))
233 return;
234 }
235 }
236
237 /* 2) Select the least-used IP/proto combination in the given
238 range. */
239 *tuple = *orig_tuple;
240 find_best_ips_proto(tuple, range, conntrack, maniptype);
241
242 /* 3) The per-protocol part of the manip is made to map into
243 the range to make a unique tuple. */
244
245 /* Only bother mapping if it's not already in range and unique */
246 if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
247 || proto->in_range(tuple, maniptype, &range->min, &range->max))
248 && !ip_nat_used_tuple(tuple, conntrack))
249 return;
250
 251	/* Last chance: get the protocol to try to obtain a unique tuple. */
252 proto->unique_tuple(tuple, range, maniptype, conntrack);
253}
254
255unsigned int
256ip_nat_setup_info(struct ip_conntrack *conntrack,
257 const struct ip_nat_range *range,
258 unsigned int hooknum)
259{
260 struct ip_conntrack_tuple curr_tuple, new_tuple;
261 struct ip_nat_info *info = &conntrack->nat.info;
262 int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK);
263 enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);
264
265 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
266 || hooknum == NF_IP_POST_ROUTING
267 || hooknum == NF_IP_LOCAL_IN
268 || hooknum == NF_IP_LOCAL_OUT);
269 BUG_ON(ip_nat_initialized(conntrack, maniptype));
270
271 /* What we've got will look like inverse of reply. Normally
272 this is what is in the conntrack, except for prior
273 manipulations (future optimization: if num_manips == 0,
274 orig_tp =
275 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
276 invert_tuplepr(&curr_tuple,
277 &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
278
279 get_unique_tuple(&new_tuple, &curr_tuple, range, conntrack, maniptype);
280
281 if (!ip_ct_tuple_equal(&new_tuple, &curr_tuple)) {
282 struct ip_conntrack_tuple reply;
283
284 /* Alter conntrack table so will recognize replies. */
285 invert_tuplepr(&reply, &new_tuple);
286 ip_conntrack_alter_reply(conntrack, &reply);
287
288 /* Non-atomic: we own this at the moment. */
289 if (maniptype == IP_NAT_MANIP_SRC)
290 conntrack->status |= IPS_SRC_NAT;
291 else
292 conntrack->status |= IPS_DST_NAT;
293 }
294
295 /* Place in source hash if this is the first time. */
296 if (have_to_hash) {
297 unsigned int srchash
298 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
299 .tuple);
300 WRITE_LOCK(&ip_nat_lock);
301 list_add(&info->bysource, &bysource[srchash]);
302 WRITE_UNLOCK(&ip_nat_lock);
303 }
304
305 /* It's done. */
306 if (maniptype == IP_NAT_MANIP_DST)
307 set_bit(IPS_DST_NAT_DONE_BIT, &conntrack->status);
308 else
309 set_bit(IPS_SRC_NAT_DONE_BIT, &conntrack->status);
310
311 return NF_ACCEPT;
312}
313
314/* Returns true if succeeded. */
315static int
316manip_pkt(u_int16_t proto,
317 struct sk_buff **pskb,
318 unsigned int iphdroff,
319 const struct ip_conntrack_tuple *target,
320 enum ip_nat_manip_type maniptype)
321{
322 struct iphdr *iph;
323
324 (*pskb)->nfcache |= NFC_ALTERED;
325 if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph)))
326 return 0;
327
328 iph = (void *)(*pskb)->data + iphdroff;
329
 330	/* Manipulate protocol part. */
331 if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff,
332 target, maniptype))
333 return 0;
334
335 iph = (void *)(*pskb)->data + iphdroff;
336
337 if (maniptype == IP_NAT_MANIP_SRC) {
338 iph->check = ip_nat_cheat_check(~iph->saddr, target->src.ip,
339 iph->check);
340 iph->saddr = target->src.ip;
341 } else {
342 iph->check = ip_nat_cheat_check(~iph->daddr, target->dst.ip,
343 iph->check);
344 iph->daddr = target->dst.ip;
345 }
346 return 1;
347}
348
349/* Do packet manipulations according to ip_nat_setup_info. */
350unsigned int nat_packet(struct ip_conntrack *ct,
351 enum ip_conntrack_info ctinfo,
352 unsigned int hooknum,
353 struct sk_buff **pskb)
354{
355 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
356 unsigned long statusbit;
357 enum ip_nat_manip_type mtype = HOOK2MANIP(hooknum);
358
359 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)
360 && (hooknum == NF_IP_POST_ROUTING || hooknum == NF_IP_LOCAL_IN)) {
361 DEBUGP("ip_nat_core: adjusting sequence number\n");
 362		/* future: put this in an l4-proto specific function,
363 * and call this function here. */
364 if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
365 return NF_DROP;
366 }
367
368 if (mtype == IP_NAT_MANIP_SRC)
369 statusbit = IPS_SRC_NAT;
370 else
371 statusbit = IPS_DST_NAT;
372
373 /* Invert if this is reply dir. */
374 if (dir == IP_CT_DIR_REPLY)
375 statusbit ^= IPS_NAT_MASK;
376
377 /* Non-atomic: these bits don't change. */
378 if (ct->status & statusbit) {
379 struct ip_conntrack_tuple target;
380
381 /* We are aiming to look like inverse of other direction. */
382 invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
383
384 if (!manip_pkt(target.dst.protonum, pskb, 0, &target, mtype))
385 return NF_DROP;
386 }
387 return NF_ACCEPT;
388}
389
390/* Dir is direction ICMP is coming from (opposite to packet it contains) */
391int icmp_reply_translation(struct sk_buff **pskb,
392 struct ip_conntrack *ct,
393 enum ip_nat_manip_type manip,
394 enum ip_conntrack_dir dir)
395{
396 struct {
397 struct icmphdr icmp;
398 struct iphdr ip;
399 } *inside;
400 struct ip_conntrack_tuple inner, target;
401 int hdrlen = (*pskb)->nh.iph->ihl * 4;
402
403 if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside)))
404 return 0;
405
406 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
407
408 /* We're actually going to mangle it beyond trivial checksum
409 adjustment, so make sure the current checksum is correct. */
410 if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
411 hdrlen = (*pskb)->nh.iph->ihl * 4;
412 if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
413 (*pskb)->len - hdrlen, 0)))
414 return 0;
415 }
416
417 /* Must be RELATED */
418 IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED ||
419 (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);
420
421 /* Redirects on non-null nats must be dropped, else they'll
422 start talking to each other without our translation, and be
423 confused... --RR */
424 if (inside->icmp.type == ICMP_REDIRECT) {
 425		/* If NAT isn't finished yet, assume it will be non-null and drop. */
426 if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
427 return 0;
428
429 if (ct->status & IPS_NAT_MASK)
430 return 0;
431 }
432
433 DEBUGP("icmp_reply_translation: translating error %p manp %u dir %s\n",
434 *pskb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
435
436 if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
437 sizeof(struct icmphdr) + inside->ip.ihl*4,
438 &inner, ip_ct_find_proto(inside->ip.protocol)))
439 return 0;
440
441 /* Change inner back to look like incoming packet. We do the
442 opposite manip on this hook to normal, because it might not
443 pass all hooks (locally-generated ICMP). Consider incoming
444 packet: PREROUTING (DST manip), routing produces ICMP, goes
445 through POSTROUTING (which must correct the DST manip). */
446 if (!manip_pkt(inside->ip.protocol, pskb,
447 (*pskb)->nh.iph->ihl*4
448 + sizeof(inside->icmp),
449 &ct->tuplehash[!dir].tuple,
450 !manip))
451 return 0;
452
 453	/* Reload "inside": manip_pkt may have moved the packet data. */
454 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
455 inside->icmp.checksum = 0;
456 inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
457 (*pskb)->len - hdrlen,
458 0));
459
 460	/* Change outer to look like the reply to an incoming packet
461 * (proto 0 means don't invert per-proto part). */
462
463 /* Obviously, we need to NAT destination IP, but source IP
464 should be NAT'ed only if it is from a NAT'd host.
465
466 Explanation: some people use NAT for anonymizing. Also,
467 CERT recommends dropping all packets from private IP
468 addresses (although ICMP errors from internal links with
469 such addresses are not too uncommon, as Alan Cox points
470 out) */
471 if (manip != IP_NAT_MANIP_SRC
472 || ((*pskb)->nh.iph->saddr == ct->tuplehash[dir].tuple.src.ip)) {
473 invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
474 if (!manip_pkt(0, pskb, 0, &target, manip))
475 return 0;
476 }
477
478 return 1;
479}
480
481/* Protocol registration. */
482int ip_nat_protocol_register(struct ip_nat_protocol *proto)
483{
484 int ret = 0;
485
486 WRITE_LOCK(&ip_nat_lock);
487 if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
488 ret = -EBUSY;
489 goto out;
490 }
491 ip_nat_protos[proto->protonum] = proto;
492 out:
493 WRITE_UNLOCK(&ip_nat_lock);
494 return ret;
495}
496
 497/* No one stores the protocol anywhere; simply delete it. */
498void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
499{
500 WRITE_LOCK(&ip_nat_lock);
501 ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
502 WRITE_UNLOCK(&ip_nat_lock);
503
 504	/* Someone could still be looking at the proto in a bh. */
505 synchronize_net();
506}
507
508int __init ip_nat_init(void)
509{
510 size_t i;
511
512 /* Leave them the same for the moment. */
513 ip_nat_htable_size = ip_conntrack_htable_size;
514
 515	/* One vmalloc for the bysource hash table */
516 bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size);
517 if (!bysource)
518 return -ENOMEM;
519
520 /* Sew in builtin protocols. */
521 WRITE_LOCK(&ip_nat_lock);
522 for (i = 0; i < MAX_IP_NAT_PROTO; i++)
523 ip_nat_protos[i] = &ip_nat_unknown_protocol;
524 ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
525 ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
526 ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
527 WRITE_UNLOCK(&ip_nat_lock);
528
529 for (i = 0; i < ip_nat_htable_size; i++) {
530 INIT_LIST_HEAD(&bysource[i]);
531 }
532
533 /* FIXME: Man, this is a hack. <SIGH> */
534 IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
535 ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
536
537 /* Initialize fake conntrack so that NAT will skip it */
538 ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
539 return 0;
540}
541
542/* Clear NAT section of all conntracks, in case we're loaded again. */
543static int clean_nat(struct ip_conntrack *i, void *data)
544{
545 memset(&i->nat, 0, sizeof(i->nat));
546 i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
547 return 0;
548}
549
550/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
551void ip_nat_cleanup(void)
552{
553 ip_ct_iterate_cleanup(&clean_nat, NULL);
554 ip_conntrack_destroyed = NULL;
555 vfree(bysource);
556}
diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c
new file mode 100644
index 000000000000..c6000e794ad6
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_ftp.c
@@ -0,0 +1,183 @@
1/* FTP extension for TCP NAT alteration. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/netfilter_ipv4.h>
13#include <linux/ip.h>
14#include <linux/tcp.h>
15#include <linux/moduleparam.h>
16#include <net/tcp.h>
17#include <linux/netfilter_ipv4/ip_nat.h>
18#include <linux/netfilter_ipv4/ip_nat_helper.h>
19#include <linux/netfilter_ipv4/ip_nat_rule.h>
20#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
21#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
22
23MODULE_LICENSE("GPL");
24MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
25MODULE_DESCRIPTION("ftp NAT helper");
26
27#if 0
28#define DEBUGP printk
29#else
30#define DEBUGP(format, args...)
31#endif
32
33/* FIXME: Time out? --RR */
34
35static int
36mangle_rfc959_packet(struct sk_buff **pskb,
37 u_int32_t newip,
38 u_int16_t port,
39 unsigned int matchoff,
40 unsigned int matchlen,
41 struct ip_conntrack *ct,
42 enum ip_conntrack_info ctinfo,
43 u32 *seq)
44{
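	/* RFC 959 PORT/PASV argument format is "h1,h2,h3,h4,p1,p2":
	 * four address octets, then the port split into high/low bytes. */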
45 char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")];
46
47 sprintf(buffer, "%u,%u,%u,%u,%u,%u",
48 NIPQUAD(newip), port>>8, port&0xFF);
49
50 DEBUGP("calling ip_nat_mangle_tcp_packet\n");
51
52 *seq += strlen(buffer) - matchlen;
53 return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
54 matchlen, buffer, strlen(buffer));
55}
56
57/* |1|132.235.1.2|6275| */
58static int
59mangle_eprt_packet(struct sk_buff **pskb,
60 u_int32_t newip,
61 u_int16_t port,
62 unsigned int matchoff,
63 unsigned int matchlen,
64 struct ip_conntrack *ct,
65 enum ip_conntrack_info ctinfo,
66 u32 *seq)
67{
68 char buffer[sizeof("|1|255.255.255.255|65535|")];
69
70 sprintf(buffer, "|1|%u.%u.%u.%u|%u|", NIPQUAD(newip), port);
71
72 DEBUGP("calling ip_nat_mangle_tcp_packet\n");
73
74 *seq += strlen(buffer) - matchlen;
75 return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
76 matchlen, buffer, strlen(buffer));
77}
78
 79/* |||6275| */
80static int
81mangle_epsv_packet(struct sk_buff **pskb,
82 u_int32_t newip,
83 u_int16_t port,
84 unsigned int matchoff,
85 unsigned int matchlen,
86 struct ip_conntrack *ct,
87 enum ip_conntrack_info ctinfo,
88 u32 *seq)
89{
90 char buffer[sizeof("|||65535|")];
91
92 sprintf(buffer, "|||%u|", port);
93
94 DEBUGP("calling ip_nat_mangle_tcp_packet\n");
95
96 *seq += strlen(buffer) - matchlen;
97 return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
98 matchlen, buffer, strlen(buffer));
99}
100
101static int (*mangle[])(struct sk_buff **, u_int32_t, u_int16_t,
102 unsigned int,
103 unsigned int,
104 struct ip_conntrack *,
105 enum ip_conntrack_info,
106 u32 *seq)
107= { [IP_CT_FTP_PORT] = mangle_rfc959_packet,
108 [IP_CT_FTP_PASV] = mangle_rfc959_packet,
109 [IP_CT_FTP_EPRT] = mangle_eprt_packet,
110 [IP_CT_FTP_EPSV] = mangle_epsv_packet
111};
112
113/* So, this packet has hit the connection tracking matching code.
114 Mangle it, and change the expectation to match the new version. */
115static unsigned int ip_nat_ftp(struct sk_buff **pskb,
116 enum ip_conntrack_info ctinfo,
117 enum ip_ct_ftp_type type,
118 unsigned int matchoff,
119 unsigned int matchlen,
120 struct ip_conntrack_expect *exp,
121 u32 *seq)
122{
123 u_int32_t newip;
124 u_int16_t port;
125 int dir = CTINFO2DIR(ctinfo);
126 struct ip_conntrack *ct = exp->master;
127
128 DEBUGP("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
129
130 /* Connection will come from wherever this packet goes, hence !dir */
131 newip = ct->tuplehash[!dir].tuple.dst.ip;
132 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
133 exp->dir = !dir;
134
 135	/* When you see the packet, we need to NAT it the same as
 136	 * this one. */
137 exp->expectfn = ip_nat_follow_master;
138
139 /* Try to get same port: if not, try to change it. */
140 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
141 exp->tuple.dst.u.tcp.port = htons(port);
142 if (ip_conntrack_expect_related(exp) == 0)
143 break;
144 }
145
146 if (port == 0) {
147 ip_conntrack_expect_free(exp);
148 return NF_DROP;
149 }
150
151 if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo,
152 seq)) {
153 ip_conntrack_unexpect_related(exp);
154 return NF_DROP;
155 }
156 return NF_ACCEPT;
157}
158
159static void __exit fini(void)
160{
161 ip_nat_ftp_hook = NULL;
162 /* Make sure noone calls it, meanwhile. */
163 synchronize_net();
164}
165
166static int __init init(void)
167{
168 BUG_ON(ip_nat_ftp_hook);
169 ip_nat_ftp_hook = ip_nat_ftp;
170 return 0;
171}
172
173/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
174static int warn_set(const char *val, struct kernel_param *kp)
175{
176 printk(KERN_INFO __stringify(KBUILD_MODNAME)
177 ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
178 return 0;
179}
180module_param_call(ports, warn_set, NULL, NULL, 0);
181
182module_init(init);
183module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
new file mode 100644
index 000000000000..1637b96d8c01
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -0,0 +1,430 @@
1/* ip_nat_helper.c - generic support functions for NAT helpers
2 *
3 * (C) 2000-2002 Harald Welte <laforge@netfilter.org>
4 * (C) 2003-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * 14 Jan 2002 Harald Welte <laforge@gnumonks.org>:
11 * - add support for SACK adjustment
12 * 14 Mar 2002 Harald Welte <laforge@gnumonks.org>:
13 * - merge SACK support into newnat API
14 * 16 Aug 2002 Brian J. Murrell <netfilter@interlinx.bc.ca>:
15 * - make ip_nat_resize_packet more generic (TCP and UDP)
16 * - add ip_nat_mangle_udp_packet
17 */
18#include <linux/config.h>
19#include <linux/module.h>
20#include <linux/kmod.h>
21#include <linux/types.h>
22#include <linux/timer.h>
23#include <linux/skbuff.h>
24#include <linux/netfilter_ipv4.h>
25#include <net/checksum.h>
26#include <net/icmp.h>
27#include <net/ip.h>
28#include <net/tcp.h>
29#include <net/udp.h>
30
31#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
32#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
33
34#include <linux/netfilter_ipv4/ip_conntrack.h>
35#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
36#include <linux/netfilter_ipv4/ip_nat.h>
37#include <linux/netfilter_ipv4/ip_nat_protocol.h>
38#include <linux/netfilter_ipv4/ip_nat_core.h>
39#include <linux/netfilter_ipv4/ip_nat_helper.h>
40#include <linux/netfilter_ipv4/listhelp.h>
41
42#if 0
43#define DEBUGP printk
44#define DUMP_OFFSET(x) printk("offset_before=%d, offset_after=%d, correction_pos=%u\n", x->offset_before, x->offset_after, x->correction_pos);
45#else
46#define DEBUGP(format, args...)
47#define DUMP_OFFSET(x)
48#endif
49
50static DECLARE_LOCK(ip_nat_seqofs_lock);
51
52/* Setup TCP sequence correction given this change at this sequence */
53static inline void
54adjust_tcp_sequence(u32 seq,
55 int sizediff,
56 struct ip_conntrack *ct,
57 enum ip_conntrack_info ctinfo)
58{
59 int dir;
60 struct ip_nat_seq *this_way, *other_way;
61
 62	DEBUGP("adjust_tcp_sequence: seq = %u, sizediff = %d\n",
 63	       seq, sizediff);
64
65 dir = CTINFO2DIR(ctinfo);
66
67 this_way = &ct->nat.info.seq[dir];
68 other_way = &ct->nat.info.seq[!dir];
69
 70	DEBUGP("adjust_tcp_sequence: Seq_offset before: ");
71 DUMP_OFFSET(this_way);
72
73 LOCK_BH(&ip_nat_seqofs_lock);
74
75 /* SYN adjust. If it's uninitialized, or this is after last
76 * correction, record it: we don't handle more than one
77 * adjustment in the window, but do deal with common case of a
78 * retransmit */
79 if (this_way->offset_before == this_way->offset_after
80 || before(this_way->correction_pos, seq)) {
81 this_way->correction_pos = seq;
82 this_way->offset_before = this_way->offset_after;
83 this_way->offset_after += sizediff;
84 }
85 UNLOCK_BH(&ip_nat_seqofs_lock);
86
 87	DEBUGP("adjust_tcp_sequence: Seq_offset after: ");
88 DUMP_OFFSET(this_way);
89}
90
91/* Frobs data inside this packet, which is linear. */
92static void mangle_contents(struct sk_buff *skb,
93 unsigned int dataoff,
94 unsigned int match_offset,
95 unsigned int match_len,
96 const char *rep_buffer,
97 unsigned int rep_len)
98{
99 unsigned char *data;
100
101 BUG_ON(skb_is_nonlinear(skb));
102 data = (unsigned char *)skb->nh.iph + dataoff;
103
104 /* move post-replacement */
105 memmove(data + match_offset + rep_len,
106 data + match_offset + match_len,
107 skb->tail - (data + match_offset + match_len));
108
109 /* insert data from buffer */
110 memcpy(data + match_offset, rep_buffer, rep_len);
111
112 /* update skb info */
113 if (rep_len > match_len) {
114 DEBUGP("ip_nat_mangle_packet: Extending packet by "
115 "%u from %u bytes\n", rep_len - match_len,
116 skb->len);
117 skb_put(skb, rep_len - match_len);
118 } else {
 119		DEBUGP("ip_nat_mangle_packet: Shrinking packet by "
120 "%u from %u bytes\n", match_len - rep_len,
121 skb->len);
122 __skb_trim(skb, skb->len + rep_len - match_len);
123 }
124
125 /* fix IP hdr checksum information */
126 skb->nh.iph->tot_len = htons(skb->len);
127 ip_send_check(skb->nh.iph);
128}
129
130/* Unusual, but possible case. */
131static int enlarge_skb(struct sk_buff **pskb, unsigned int extra)
132{
133 struct sk_buff *nskb;
134
135 if ((*pskb)->len + extra > 65535)
136 return 0;
137
138 nskb = skb_copy_expand(*pskb, skb_headroom(*pskb), extra, GFP_ATOMIC);
139 if (!nskb)
140 return 0;
141
142 /* Transfer socket to new skb. */
143 if ((*pskb)->sk)
144 skb_set_owner_w(nskb, (*pskb)->sk);
145#ifdef CONFIG_NETFILTER_DEBUG
146 nskb->nf_debug = (*pskb)->nf_debug;
147#endif
148 kfree_skb(*pskb);
149 *pskb = nskb;
150 return 1;
151}
152
153/* Generic function for mangling variable-length address changes inside
154 * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
155 * command in FTP).
156 *
 157 * Takes care of all the nasty sequence number changes, checksumming,
 158 * skb enlargement, ...
 159 *
 160 */
161int
162ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
163 struct ip_conntrack *ct,
164 enum ip_conntrack_info ctinfo,
165 unsigned int match_offset,
166 unsigned int match_len,
167 const char *rep_buffer,
168 unsigned int rep_len)
169{
170 struct iphdr *iph;
171 struct tcphdr *tcph;
172 int datalen;
173
174 if (!skb_ip_make_writable(pskb, (*pskb)->len))
175 return 0;
176
177 if (rep_len > match_len
178 && rep_len - match_len > skb_tailroom(*pskb)
179 && !enlarge_skb(pskb, rep_len - match_len))
180 return 0;
181
182 SKB_LINEAR_ASSERT(*pskb);
183
184 iph = (*pskb)->nh.iph;
185 tcph = (void *)iph + iph->ihl*4;
186
187 mangle_contents(*pskb, iph->ihl*4 + tcph->doff*4,
188 match_offset, match_len, rep_buffer, rep_len);
189
190 datalen = (*pskb)->len - iph->ihl*4;
191 tcph->check = 0;
192 tcph->check = tcp_v4_check(tcph, datalen, iph->saddr, iph->daddr,
193 csum_partial((char *)tcph, datalen, 0));
194
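	/* If the replacement changed the payload length, all later
	 * sequence numbers in this direction are shifted; record the
	 * offset so ip_nat_seq_adjust() can rewrite subsequent packets. */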
195 if (rep_len != match_len) {
196 set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
197 adjust_tcp_sequence(ntohl(tcph->seq),
198 (int)rep_len - (int)match_len,
199 ct, ctinfo);
200 /* Tell TCP window tracking about seq change */
201 ip_conntrack_tcp_update(*pskb, ct, CTINFO2DIR(ctinfo));
202 }
203 return 1;
204}
205
206/* Generic function for mangling variable-length address changes inside
207 * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
208 * command in the Amanda protocol)
209 *
 210 * Takes care of all the nasty checksumming and skb enlargement
 211 * details; UDP has no sequence numbers to adjust.
212 *
213 * XXX - This function could be merged with ip_nat_mangle_tcp_packet which
214 * should be fairly easy to do.
215 */
216int
217ip_nat_mangle_udp_packet(struct sk_buff **pskb,
218 struct ip_conntrack *ct,
219 enum ip_conntrack_info ctinfo,
220 unsigned int match_offset,
221 unsigned int match_len,
222 const char *rep_buffer,
223 unsigned int rep_len)
224{
225 struct iphdr *iph;
226 struct udphdr *udph;
227
228 /* UDP helpers might accidentally mangle the wrong packet */
229 iph = (*pskb)->nh.iph;
230 if ((*pskb)->len < iph->ihl*4 + sizeof(*udph) +
231 match_offset + match_len)
232 return 0;
233
234 if (!skb_ip_make_writable(pskb, (*pskb)->len))
235 return 0;
236
237 if (rep_len > match_len
238 && rep_len - match_len > skb_tailroom(*pskb)
239 && !enlarge_skb(pskb, rep_len - match_len))
240 return 0;
241
242 iph = (*pskb)->nh.iph;
243 udph = (void *)iph + iph->ihl*4;
244 mangle_contents(*pskb, iph->ihl*4 + sizeof(*udph),
245 match_offset, match_len, rep_buffer, rep_len);
246
247 /* update the length of the UDP packet */
248 udph->len = htons((*pskb)->len - iph->ihl*4);
249
250 /* fix udp checksum if udp checksum was previously calculated */
251 if (udph->check) {
252 int datalen = (*pskb)->len - iph->ihl * 4;
253 udph->check = 0;
254 udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
255 datalen, IPPROTO_UDP,
256 csum_partial((char *)udph,
257 datalen, 0));
258 }
259
260 return 1;
261}
262
263/* Adjust one found SACK option including checksum correction */
264static void
265sack_adjust(struct sk_buff *skb,
266 struct tcphdr *tcph,
267 unsigned int sackoff,
268 unsigned int sackend,
269 struct ip_nat_seq *natseq)
270{
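	/* SACK blocks carry sequence numbers from the other direction's
	 * space; shift each edge by the old or new offset depending on
	 * whether it falls before or after the correction position. */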
271 while (sackoff < sackend) {
272 struct tcp_sack_block *sack;
273 u_int32_t new_start_seq, new_end_seq;
274
275 sack = (void *)skb->data + sackoff;
276 if (after(ntohl(sack->start_seq) - natseq->offset_before,
277 natseq->correction_pos))
278 new_start_seq = ntohl(sack->start_seq)
279 - natseq->offset_after;
280 else
281 new_start_seq = ntohl(sack->start_seq)
282 - natseq->offset_before;
283 new_start_seq = htonl(new_start_seq);
284
285 if (after(ntohl(sack->end_seq) - natseq->offset_before,
286 natseq->correction_pos))
287 new_end_seq = ntohl(sack->end_seq)
288 - natseq->offset_after;
289 else
290 new_end_seq = ntohl(sack->end_seq)
291 - natseq->offset_before;
292 new_end_seq = htonl(new_end_seq);
293
294 DEBUGP("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
295 ntohl(sack->start_seq), new_start_seq,
296 ntohl(sack->end_seq), new_end_seq);
297
298 tcph->check =
299 ip_nat_cheat_check(~sack->start_seq, new_start_seq,
300 ip_nat_cheat_check(~sack->end_seq,
301 new_end_seq,
302 tcph->check));
303 sack->start_seq = new_start_seq;
304 sack->end_seq = new_end_seq;
305 sackoff += sizeof(*sack);
306 }
307}
308
309/* TCP SACK sequence number adjustment */
310static inline unsigned int
311ip_nat_sack_adjust(struct sk_buff **pskb,
312 struct tcphdr *tcph,
313 struct ip_conntrack *ct,
314 enum ip_conntrack_info ctinfo)
315{
316 unsigned int dir, optoff, optend;
317
318 optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr);
319 optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4;
320
321 if (!skb_ip_make_writable(pskb, optend))
322 return 0;
323
324 dir = CTINFO2DIR(ctinfo);
325
326 while (optoff < optend) {
327 /* Usually: option, length. */
328 unsigned char *op = (*pskb)->data + optoff;
329
330 switch (op[0]) {
331 case TCPOPT_EOL:
332 return 1;
333 case TCPOPT_NOP:
334 optoff++;
335 continue;
336 default:
337 /* no partial options */
338 if (optoff + 1 == optend
339 || optoff + op[1] > optend
340 || op[1] < 2)
341 return 0;
342 if (op[0] == TCPOPT_SACK
343 && op[1] >= 2+TCPOLEN_SACK_PERBLOCK
344 && ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
345 sack_adjust(*pskb, tcph, optoff+2,
346 optoff+op[1],
347 &ct->nat.info.seq[!dir]);
348 optoff += op[1];
349 }
350 }
351 return 1;
352}
353
354/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */
355int
356ip_nat_seq_adjust(struct sk_buff **pskb,
357 struct ip_conntrack *ct,
358 enum ip_conntrack_info ctinfo)
359{
360 struct tcphdr *tcph;
361 int dir, newseq, newack;
362 struct ip_nat_seq *this_way, *other_way;
363
364 dir = CTINFO2DIR(ctinfo);
365
366 this_way = &ct->nat.info.seq[dir];
367 other_way = &ct->nat.info.seq[!dir];
368
369 if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
370 return 0;
371
372 tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
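	/* Sequence numbers up to the correction position get the old
	 * offset, later ones the new offset; the ack field tracks the
	 * peer's data, so the reverse direction's offset is subtracted. */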
373 if (after(ntohl(tcph->seq), this_way->correction_pos))
374 newseq = ntohl(tcph->seq) + this_way->offset_after;
375 else
376 newseq = ntohl(tcph->seq) + this_way->offset_before;
377 newseq = htonl(newseq);
378
379 if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
380 other_way->correction_pos))
381 newack = ntohl(tcph->ack_seq) - other_way->offset_after;
382 else
383 newack = ntohl(tcph->ack_seq) - other_way->offset_before;
384 newack = htonl(newack);
385
386 tcph->check = ip_nat_cheat_check(~tcph->seq, newseq,
387 ip_nat_cheat_check(~tcph->ack_seq,
388 newack,
389 tcph->check));
390
391 DEBUGP("Adjusting sequence number from %u->%u, ack from %u->%u\n",
392 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
393 ntohl(newack));
394
395 tcph->seq = newseq;
396 tcph->ack_seq = newack;
397
398 if (!ip_nat_sack_adjust(pskb, tcph, ct, ctinfo))
399 return 0;
400
401 ip_conntrack_tcp_update(*pskb, ct, dir);
402
403 return 1;
404}
405
406/* Setup NAT on this expected conntrack so it follows master. */
407/* If we fail to get a free NAT slot, we'll get dropped on confirm */
408void ip_nat_follow_master(struct ip_conntrack *ct,
409 struct ip_conntrack_expect *exp)
410{
411 struct ip_nat_range range;
412
413 /* This must be a fresh one. */
414 BUG_ON(ct->status & IPS_NAT_DONE_MASK);
415
416 /* Change src to where master sends to */
417 range.flags = IP_NAT_RANGE_MAP_IPS;
418 range.min_ip = range.max_ip
419 = ct->master->tuplehash[!exp->dir].tuple.dst.ip;
420 /* hook doesn't matter, but it has to do source manip */
421 ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
422
423 /* For DST manip, map port here to where it's expected. */
424 range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
425 range.min = range.max = exp->saved_proto;
426 range.min_ip = range.max_ip
427 = ct->master->tuplehash[!exp->dir].tuple.src.ip;
428 /* hook doesn't matter, but it has to do destination manip */
429 ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
430}
diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c
new file mode 100644
index 000000000000..9c1ca3381d56
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_irc.c
@@ -0,0 +1,125 @@
1/* IRC extension for TCP NAT alteration.
2 * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
3 * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
4 * based on a copy of RR's ip_nat_ftp.c
5 *
6 * ip_nat_irc.c,v 1.16 2001/12/06 07:42:10 laforge Exp
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 */
13
14#include <linux/module.h>
15#include <linux/netfilter_ipv4.h>
16#include <linux/ip.h>
17#include <linux/tcp.h>
18#include <linux/kernel.h>
19#include <net/tcp.h>
20#include <linux/netfilter_ipv4/ip_nat.h>
21#include <linux/netfilter_ipv4/ip_nat_helper.h>
22#include <linux/netfilter_ipv4/ip_nat_rule.h>
23#include <linux/netfilter_ipv4/ip_conntrack_irc.h>
24#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
25#include <linux/moduleparam.h>
26
27#if 0
28#define DEBUGP printk
29#else
30#define DEBUGP(format, args...)
31#endif
32
33MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
34MODULE_DESCRIPTION("IRC (DCC) NAT helper");
35MODULE_LICENSE("GPL");
36
37static unsigned int help(struct sk_buff **pskb,
38 enum ip_conntrack_info ctinfo,
39 unsigned int matchoff,
40 unsigned int matchlen,
41 struct ip_conntrack_expect *exp)
42{
43 u_int16_t port;
44 unsigned int ret;
45
 46	/* "4294967295 65535 " */
47 char buffer[18];
48
 49	DEBUGP("IRC_NAT: info: match at offset %u, "
 50	       "len %u\n",
 51	       matchoff, matchlen);
52
53 /* Reply comes from server. */
54 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
55 exp->dir = IP_CT_DIR_REPLY;
56
 57	/* When you see the packet, we need to NAT it the same as
 58	 * this one. */
59 exp->expectfn = ip_nat_follow_master;
60
61 /* Try to get same port: if not, try to change it. */
62 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
63 exp->tuple.dst.u.tcp.port = htons(port);
64 if (ip_conntrack_expect_related(exp) == 0)
65 break;
66 }
67
68 if (port == 0) {
69 ip_conntrack_expect_free(exp);
70 return NF_DROP;
71 }
72
73 /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27
74 * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28
75 * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26
76 * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26
77 * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27
78 * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits,
 79	 * 255.255.255.255==4294967295, 10 digits)
 80	 * P: bound port (min 1 d, max 5 d (65535))
81 * F: filename (min 1 d )
82 * S: size (min 1 d )
83 * 0x01, \n: terminators
84 */
85
86 /* AAA = "us", ie. where server normally talks to. */
87 sprintf(buffer, "%u %u",
88 ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip),
89 port);
90 DEBUGP("ip_nat_irc: Inserting '%s' == %u.%u.%u.%u, port %u\n",
91 buffer, NIPQUAD(exp->tuple.src.ip), port);
92
93 ret = ip_nat_mangle_tcp_packet(pskb, exp->master, ctinfo,
94 matchoff, matchlen, buffer,
95 strlen(buffer));
96 if (ret != NF_ACCEPT)
97 ip_conntrack_unexpect_related(exp);
98 return ret;
99}
100
101static void __exit fini(void)
102{
103 ip_nat_irc_hook = NULL;
104 /* Make sure noone calls it, meanwhile. */
105 synchronize_net();
106}
107
108static int __init init(void)
109{
110 BUG_ON(ip_nat_irc_hook);
111 ip_nat_irc_hook = help;
112 return 0;
113}
114
115/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
116static int warn_set(const char *val, struct kernel_param *kp)
117{
118 printk(KERN_INFO __stringify(KBUILD_MODNAME)
119 ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
120 return 0;
121}
122module_param_call(ports, warn_set, NULL, NULL, 0);
123
124module_init(init);
125module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
new file mode 100644
index 000000000000..a558cf0eee8a
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c
@@ -0,0 +1,115 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/init.h>
11#include <linux/netfilter.h>
12#include <linux/ip.h>
13#include <linux/icmp.h>
14#include <linux/if.h>
15
16#include <linux/netfilter_ipv4/ip_nat.h>
17#include <linux/netfilter_ipv4/ip_nat_core.h>
18#include <linux/netfilter_ipv4/ip_nat_rule.h>
19#include <linux/netfilter_ipv4/ip_nat_protocol.h>
20
21static int
22icmp_in_range(const struct ip_conntrack_tuple *tuple,
23 enum ip_nat_manip_type maniptype,
24 const union ip_conntrack_manip_proto *min,
25 const union ip_conntrack_manip_proto *max)
26{
27 return (tuple->src.u.icmp.id >= min->icmp.id
28 && tuple->src.u.icmp.id <= max->icmp.id);
29}
30
31static int
32icmp_unique_tuple(struct ip_conntrack_tuple *tuple,
33 const struct ip_nat_range *range,
34 enum ip_nat_manip_type maniptype,
35 const struct ip_conntrack *conntrack)
36{
37 static u_int16_t id;
38 unsigned int range_size
39 = (unsigned int)range->max.icmp.id - range->min.icmp.id + 1;
40 unsigned int i;
41
42 /* If no range specified... */
43 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
44 range_size = 0xFFFF;
45
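	/* Walk the id space from where the last search left off and take
	 * the first id that doesn't collide with an existing tuple. */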
46 for (i = 0; i < range_size; i++, id++) {
47 tuple->src.u.icmp.id = range->min.icmp.id + (id % range_size);
48 if (!ip_nat_used_tuple(tuple, conntrack))
49 return 1;
50 }
51 return 0;
52}
53
54static int
55icmp_manip_pkt(struct sk_buff **pskb,
56 unsigned int iphdroff,
57 const struct ip_conntrack_tuple *tuple,
58 enum ip_nat_manip_type maniptype)
59{
60 struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
61 struct icmphdr *hdr;
62 unsigned int hdroff = iphdroff + iph->ihl*4;
63
64 if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
65 return 0;
66
67 hdr = (struct icmphdr *)((*pskb)->data + hdroff);
68
69 hdr->checksum = ip_nat_cheat_check(hdr->un.echo.id ^ 0xFFFF,
70 tuple->src.u.icmp.id,
71 hdr->checksum);
72 hdr->un.echo.id = tuple->src.u.icmp.id;
73 return 1;
74}
75
76static unsigned int
77icmp_print(char *buffer,
78 const struct ip_conntrack_tuple *match,
79 const struct ip_conntrack_tuple *mask)
80{
81 unsigned int len = 0;
82
83 if (mask->src.u.icmp.id)
84 len += sprintf(buffer + len, "id=%u ",
85 ntohs(match->src.u.icmp.id));
86
87 if (mask->dst.u.icmp.type)
88 len += sprintf(buffer + len, "type=%u ",
 89			       match->dst.u.icmp.type);
90
91 if (mask->dst.u.icmp.code)
92 len += sprintf(buffer + len, "code=%u ",
 93			       match->dst.u.icmp.code);
94
95 return len;
96}
97
98static unsigned int
99icmp_print_range(char *buffer, const struct ip_nat_range *range)
100{
101 if (range->min.icmp.id != 0 || range->max.icmp.id != 0xFFFF)
102 return sprintf(buffer, "id %u-%u ",
103 ntohs(range->min.icmp.id),
104 ntohs(range->max.icmp.id));
105 else return 0;
106}
107
108struct ip_nat_protocol ip_nat_protocol_icmp
109= { "ICMP", IPPROTO_ICMP,
110 icmp_manip_pkt,
111 icmp_in_range,
112 icmp_unique_tuple,
113 icmp_print,
114 icmp_print_range
115};
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
new file mode 100644
index 000000000000..a91cfceff272
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c
@@ -0,0 +1,178 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/init.h>
11#include <linux/netfilter.h>
12#include <linux/ip.h>
13#include <linux/tcp.h>
14#include <linux/if.h>
15#include <linux/netfilter_ipv4/ip_nat.h>
16#include <linux/netfilter_ipv4/ip_nat_rule.h>
17#include <linux/netfilter_ipv4/ip_nat_protocol.h>
18#include <linux/netfilter_ipv4/ip_nat_core.h>
19
20static int
21tcp_in_range(const struct ip_conntrack_tuple *tuple,
22 enum ip_nat_manip_type maniptype,
23 const union ip_conntrack_manip_proto *min,
24 const union ip_conntrack_manip_proto *max)
25{
26 u_int16_t port;
27
28 if (maniptype == IP_NAT_MANIP_SRC)
29 port = tuple->src.u.tcp.port;
30 else
31 port = tuple->dst.u.tcp.port;
32
33 return ntohs(port) >= ntohs(min->tcp.port)
34 && ntohs(port) <= ntohs(max->tcp.port);
35}
36
37static int
38tcp_unique_tuple(struct ip_conntrack_tuple *tuple,
39 const struct ip_nat_range *range,
40 enum ip_nat_manip_type maniptype,
41 const struct ip_conntrack *conntrack)
42{
43 static u_int16_t port, *portptr;
44 unsigned int range_size, min, i;
45
46 if (maniptype == IP_NAT_MANIP_SRC)
47 portptr = &tuple->src.u.tcp.port;
48 else
49 portptr = &tuple->dst.u.tcp.port;
50
51 /* If no range specified... */
52 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
53 /* If it's dst rewrite, can't change port */
54 if (maniptype == IP_NAT_MANIP_DST)
55 return 0;
56
57 /* Map privileged onto privileged. */
58 if (ntohs(*portptr) < 1024) {
 59			/* Loose convention: ports 512-1023 are used for credential passing */
60 if (ntohs(*portptr)<512) {
61 min = 1;
62 range_size = 511 - min + 1;
63 } else {
64 min = 600;
65 range_size = 1023 - min + 1;
66 }
67 } else {
68 min = 1024;
69 range_size = 65535 - 1024 + 1;
70 }
71 } else {
72 min = ntohs(range->min.tcp.port);
73 range_size = ntohs(range->max.tcp.port) - min + 1;
74 }
75
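	/* Probe ports starting from where the last search left off until
	 * one produces a tuple that isn't already in use. */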
76 for (i = 0; i < range_size; i++, port++) {
77 *portptr = htons(min + port % range_size);
78 if (!ip_nat_used_tuple(tuple, conntrack)) {
79 return 1;
80 }
81 }
82 return 0;
83}
84
85static int
86tcp_manip_pkt(struct sk_buff **pskb,
87 unsigned int iphdroff,
88 const struct ip_conntrack_tuple *tuple,
89 enum ip_nat_manip_type maniptype)
90{
91 struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
92 struct tcphdr *hdr;
93 unsigned int hdroff = iphdroff + iph->ihl*4;
94 u32 oldip, newip;
95 u16 *portptr, newport, oldport;
96 int hdrsize = 8; /* TCP connection tracking guarantees this much */
97
 98	/* this could be an inner header returned in icmp packet; in such
99 cases we cannot update the checksum field since it is outside of
100 the 8 bytes of transport layer headers we are guaranteed */
101 if ((*pskb)->len >= hdroff + sizeof(struct tcphdr))
102 hdrsize = sizeof(struct tcphdr);
103
104 if (!skb_ip_make_writable(pskb, hdroff + hdrsize))
105 return 0;
106
107 iph = (struct iphdr *)((*pskb)->data + iphdroff);
108 hdr = (struct tcphdr *)((*pskb)->data + hdroff);
109
110 if (maniptype == IP_NAT_MANIP_SRC) {
111 /* Get rid of src ip and src pt */
112 oldip = iph->saddr;
113 newip = tuple->src.ip;
114 newport = tuple->src.u.tcp.port;
115 portptr = &hdr->source;
116 } else {
117 /* Get rid of dst ip and dst pt */
118 oldip = iph->daddr;
119 newip = tuple->dst.ip;
120 newport = tuple->dst.u.tcp.port;
121 portptr = &hdr->dest;
122 }
123
124 oldport = *portptr;
125 *portptr = newport;
126
127 if (hdrsize < sizeof(*hdr))
128 return 1;
129
130 hdr->check = ip_nat_cheat_check(~oldip, newip,
131 ip_nat_cheat_check(oldport ^ 0xFFFF,
132 newport,
133 hdr->check));
134 return 1;
135}
136
137static unsigned int
138tcp_print(char *buffer,
139 const struct ip_conntrack_tuple *match,
140 const struct ip_conntrack_tuple *mask)
141{
142 unsigned int len = 0;
143
144 if (mask->src.u.tcp.port)
145 len += sprintf(buffer + len, "srcpt=%u ",
146 ntohs(match->src.u.tcp.port));
147
148
149 if (mask->dst.u.tcp.port)
150 len += sprintf(buffer + len, "dstpt=%u ",
151 ntohs(match->dst.u.tcp.port));
152
153 return len;
154}
155
156static unsigned int
157tcp_print_range(char *buffer, const struct ip_nat_range *range)
158{
159 if (range->min.tcp.port != 0 || range->max.tcp.port != 0xFFFF) {
160 if (range->min.tcp.port == range->max.tcp.port)
161 return sprintf(buffer, "port %u ",
162 ntohs(range->min.tcp.port));
163 else
164 return sprintf(buffer, "ports %u-%u ",
165 ntohs(range->min.tcp.port),
166 ntohs(range->max.tcp.port));
167 }
168 else return 0;
169}
170
171struct ip_nat_protocol ip_nat_protocol_tcp
172= { "TCP", IPPROTO_TCP,
173 tcp_manip_pkt,
174 tcp_in_range,
175 tcp_unique_tuple,
176 tcp_print,
177 tcp_print_range
178};
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
new file mode 100644
index 000000000000..c669e3b5f5d0
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_udp.c
@@ -0,0 +1,165 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/init.h>
11#include <linux/netfilter.h>
12#include <linux/ip.h>
13#include <linux/udp.h>
14#include <linux/if.h>
15
16#include <linux/netfilter_ipv4/ip_nat.h>
17#include <linux/netfilter_ipv4/ip_nat_core.h>
18#include <linux/netfilter_ipv4/ip_nat_rule.h>
19#include <linux/netfilter_ipv4/ip_nat_protocol.h>
20
21static int
22udp_in_range(const struct ip_conntrack_tuple *tuple,
23 enum ip_nat_manip_type maniptype,
24 const union ip_conntrack_manip_proto *min,
25 const union ip_conntrack_manip_proto *max)
26{
27 u_int16_t port;
28
29 if (maniptype == IP_NAT_MANIP_SRC)
30 port = tuple->src.u.udp.port;
31 else
32 port = tuple->dst.u.udp.port;
33
34 return ntohs(port) >= ntohs(min->udp.port)
35 && ntohs(port) <= ntohs(max->udp.port);
36}
37
38static int
39udp_unique_tuple(struct ip_conntrack_tuple *tuple,
40 const struct ip_nat_range *range,
41 enum ip_nat_manip_type maniptype,
42 const struct ip_conntrack *conntrack)
43{
44 static u_int16_t port, *portptr;
45 unsigned int range_size, min, i;
46
47 if (maniptype == IP_NAT_MANIP_SRC)
48 portptr = &tuple->src.u.udp.port;
49 else
50 portptr = &tuple->dst.u.udp.port;
51
52 /* If no range specified... */
53 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
54 /* If it's dst rewrite, can't change port */
55 if (maniptype == IP_NAT_MANIP_DST)
56 return 0;
57
58 if (ntohs(*portptr) < 1024) {
 59			/* Loose convention: ports 512-1023 are used for credential passing */
60 if (ntohs(*portptr)<512) {
61 min = 1;
62 range_size = 511 - min + 1;
63 } else {
64 min = 600;
65 range_size = 1023 - min + 1;
66 }
67 } else {
68 min = 1024;
69 range_size = 65535 - 1024 + 1;
70 }
71 } else {
72 min = ntohs(range->min.udp.port);
73 range_size = ntohs(range->max.udp.port) - min + 1;
74 }
75
76 for (i = 0; i < range_size; i++, port++) {
77 *portptr = htons(min + port % range_size);
78 if (!ip_nat_used_tuple(tuple, conntrack))
79 return 1;
80 }
81 return 0;
82}
83
84static int
85udp_manip_pkt(struct sk_buff **pskb,
86 unsigned int iphdroff,
87 const struct ip_conntrack_tuple *tuple,
88 enum ip_nat_manip_type maniptype)
89{
90 struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
91 struct udphdr *hdr;
92 unsigned int hdroff = iphdroff + iph->ihl*4;
93 u32 oldip, newip;
94 u16 *portptr, newport;
95
96 if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
97 return 0;
98
99 iph = (struct iphdr *)((*pskb)->data + iphdroff);
100 hdr = (struct udphdr *)((*pskb)->data + hdroff);
101
102 if (maniptype == IP_NAT_MANIP_SRC) {
103 /* Get rid of src ip and src pt */
104 oldip = iph->saddr;
105 newip = tuple->src.ip;
106 newport = tuple->src.u.udp.port;
107 portptr = &hdr->source;
108 } else {
109 /* Get rid of dst ip and dst pt */
110 oldip = iph->daddr;
111 newip = tuple->dst.ip;
112 newport = tuple->dst.u.udp.port;
113 portptr = &hdr->dest;
114 }
115 if (hdr->check) /* 0 is a special case meaning no checksum */
116 hdr->check = ip_nat_cheat_check(~oldip, newip,
117 ip_nat_cheat_check(*portptr ^ 0xFFFF,
118 newport,
119 hdr->check));
120 *portptr = newport;
121 return 1;
122}
123
124static unsigned int
125udp_print(char *buffer,
126 const struct ip_conntrack_tuple *match,
127 const struct ip_conntrack_tuple *mask)
128{
129 unsigned int len = 0;
130
131 if (mask->src.u.udp.port)
132 len += sprintf(buffer + len, "srcpt=%u ",
133 ntohs(match->src.u.udp.port));
134
135
136 if (mask->dst.u.udp.port)
137 len += sprintf(buffer + len, "dstpt=%u ",
138 ntohs(match->dst.u.udp.port));
139
140 return len;
141}
142
143static unsigned int
144udp_print_range(char *buffer, const struct ip_nat_range *range)
145{
146 if (range->min.udp.port != 0 || range->max.udp.port != 0xFFFF) {
147 if (range->min.udp.port == range->max.udp.port)
148 return sprintf(buffer, "port %u ",
149 ntohs(range->min.udp.port));
150 else
151 return sprintf(buffer, "ports %u-%u ",
152 ntohs(range->min.udp.port),
153 ntohs(range->max.udp.port));
154 }
155 else return 0;
156}
157
158struct ip_nat_protocol ip_nat_protocol_udp
159= { "UDP", IPPROTO_UDP,
160 udp_manip_pkt,
161 udp_in_range,
162 udp_unique_tuple,
163 udp_print,
164 udp_print_range
165};
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c
new file mode 100644
index 000000000000..f5525bd58d16
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c
@@ -0,0 +1,70 @@
1/* The "unknown" protocol. This is what is used for protocols we
 2 * don't understand. It's returned by ip_nat_find_proto().
3 */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/types.h>
14#include <linux/init.h>
15#include <linux/netfilter.h>
16#include <linux/if.h>
17
18#include <linux/netfilter_ipv4/ip_nat.h>
19#include <linux/netfilter_ipv4/ip_nat_rule.h>
20#include <linux/netfilter_ipv4/ip_nat_protocol.h>
21
22static int unknown_in_range(const struct ip_conntrack_tuple *tuple,
23 enum ip_nat_manip_type manip_type,
24 const union ip_conntrack_manip_proto *min,
25 const union ip_conntrack_manip_proto *max)
26{
27 return 1;
28}
29
30static int unknown_unique_tuple(struct ip_conntrack_tuple *tuple,
31 const struct ip_nat_range *range,
32 enum ip_nat_manip_type maniptype,
33 const struct ip_conntrack *conntrack)
34{
35 /* Sorry: we can't help you; if it's not unique, we can't frob
36 anything. */
37 return 0;
38}
39
40static int
41unknown_manip_pkt(struct sk_buff **pskb,
42 unsigned int iphdroff,
43 const struct ip_conntrack_tuple *tuple,
44 enum ip_nat_manip_type maniptype)
45{
46 return 1;
47}
48
49static unsigned int
50unknown_print(char *buffer,
51 const struct ip_conntrack_tuple *match,
52 const struct ip_conntrack_tuple *mask)
53{
54 return 0;
55}
56
57static unsigned int
58unknown_print_range(char *buffer, const struct ip_nat_range *range)
59{
60 return 0;
61}
62
63struct ip_nat_protocol ip_nat_unknown_protocol = {
64 "unknown", 0,
65 unknown_manip_pkt,
66 unknown_in_range,
67 unknown_unique_tuple,
68 unknown_print,
69 unknown_print_range
70};
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
new file mode 100644
index 000000000000..581f097f5a24
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -0,0 +1,319 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9/* Everything about the rules for NAT. */
10#include <linux/types.h>
11#include <linux/ip.h>
12#include <linux/netfilter.h>
13#include <linux/netfilter_ipv4.h>
14#include <linux/module.h>
15#include <linux/kmod.h>
16#include <linux/skbuff.h>
17#include <linux/proc_fs.h>
18#include <net/checksum.h>
19#include <net/route.h>
20#include <linux/bitops.h>
21
22#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
23#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
24
25#include <linux/netfilter_ipv4/ip_tables.h>
26#include <linux/netfilter_ipv4/ip_nat.h>
27#include <linux/netfilter_ipv4/ip_nat_core.h>
28#include <linux/netfilter_ipv4/ip_nat_rule.h>
29#include <linux/netfilter_ipv4/listhelp.h>
30
31#if 0
32#define DEBUGP printk
33#else
34#define DEBUGP(format, args...)
35#endif
36
37#define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT))
38
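/* Bootstrap contents of the "nat" table: one ACCEPT policy entry for each
 * built-in chain (PREROUTING, POSTROUTING, OUTPUT), followed by the ERROR
 * entry that terminates the table. */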
39static struct
40{
41 struct ipt_replace repl;
42 struct ipt_standard entries[3];
43 struct ipt_error term;
44} nat_initial_table __initdata
45= { { "nat", NAT_VALID_HOOKS, 4,
46 sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
47 { [NF_IP_PRE_ROUTING] = 0,
48 [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard),
49 [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
50 { [NF_IP_PRE_ROUTING] = 0,
51 [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard),
52 [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
53 0, NULL, { } },
54 {
55 /* PRE_ROUTING */
56 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
57 0,
58 sizeof(struct ipt_entry),
59 sizeof(struct ipt_standard),
60 0, { 0, 0 }, { } },
61 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
62 -NF_ACCEPT - 1 } },
63 /* POST_ROUTING */
64 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
65 0,
66 sizeof(struct ipt_entry),
67 sizeof(struct ipt_standard),
68 0, { 0, 0 }, { } },
69 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
70 -NF_ACCEPT - 1 } },
71 /* LOCAL_OUT */
72 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
73 0,
74 sizeof(struct ipt_entry),
75 sizeof(struct ipt_standard),
76 0, { 0, 0 }, { } },
77 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
78 -NF_ACCEPT - 1 } }
79 },
80 /* ERROR */
81 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
82 0,
83 sizeof(struct ipt_entry),
84 sizeof(struct ipt_error),
85 0, { 0, 0 }, { } },
86 { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } },
87 { } },
88 "ERROR"
89 }
90 }
91};
92
93static struct ipt_table nat_table = {
94 .name = "nat",
95 .valid_hooks = NAT_VALID_HOOKS,
96 .lock = RW_LOCK_UNLOCKED,
97 .me = THIS_MODULE,
98};
99
100/* Source NAT */
101static unsigned int ipt_snat_target(struct sk_buff **pskb,
102 const struct net_device *in,
103 const struct net_device *out,
104 unsigned int hooknum,
105 const void *targinfo,
106 void *userinfo)
107{
108 struct ip_conntrack *ct;
109 enum ip_conntrack_info ctinfo;
110 const struct ip_nat_multi_range_compat *mr = targinfo;
111
112 IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
113
114 ct = ip_conntrack_get(*pskb, &ctinfo);
115
116 /* Connection must be valid and new. */
117 IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
118 || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
119 IP_NF_ASSERT(out);
120
121 return ip_nat_setup_info(ct, &mr->range[0], hooknum);
122}
123
124/* Before 2.6.11 we did implicit source NAT if required. Warn about change. */
125static void warn_if_extra_mangle(u32 dstip, u32 srcip)
126{
127 static int warned = 0;
128 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } };
129 struct rtable *rt;
130
131 if (ip_route_output_key(&rt, &fl) != 0)
132 return;
133
134 if (rt->rt_src != srcip && !warned) {
 135		printk("NAT: implicit source local NAT no longer supported\n");
136 printk("NAT: packet src %u.%u.%u.%u -> dst %u.%u.%u.%u\n",
137 NIPQUAD(srcip), NIPQUAD(dstip));
138 warned = 1;
139 }
140 ip_rt_put(rt);
141}
142
143static unsigned int ipt_dnat_target(struct sk_buff **pskb,
144 const struct net_device *in,
145 const struct net_device *out,
146 unsigned int hooknum,
147 const void *targinfo,
148 void *userinfo)
149{
150 struct ip_conntrack *ct;
151 enum ip_conntrack_info ctinfo;
152 const struct ip_nat_multi_range_compat *mr = targinfo;
153
154 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
155 || hooknum == NF_IP_LOCAL_OUT);
156
157 ct = ip_conntrack_get(*pskb, &ctinfo);
158
159 /* Connection must be valid and new. */
160 IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
161
162 if (hooknum == NF_IP_LOCAL_OUT
163 && mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
164 warn_if_extra_mangle((*pskb)->nh.iph->daddr,
165 mr->range[0].min_ip);
166
167 return ip_nat_setup_info(ct, &mr->range[0], hooknum);
168}
169
170static int ipt_snat_checkentry(const char *tablename,
171 const struct ipt_entry *e,
172 void *targinfo,
173 unsigned int targinfosize,
174 unsigned int hook_mask)
175{
176 struct ip_nat_multi_range_compat *mr = targinfo;
177
178 /* Must be a valid range */
179 if (mr->rangesize != 1) {
180 printk("SNAT: multiple ranges no longer supported\n");
181 return 0;
182 }
183
184 if (targinfosize != IPT_ALIGN(sizeof(struct ip_nat_multi_range_compat))) {
185 DEBUGP("SNAT: Target size %u wrong for %u ranges\n",
186 targinfosize, mr->rangesize);
187 return 0;
188 }
189
190 /* Only allow these for NAT. */
191 if (strcmp(tablename, "nat") != 0) {
192 DEBUGP("SNAT: wrong table %s\n", tablename);
193 return 0;
194 }
195
196 if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) {
197 DEBUGP("SNAT: hook mask 0x%x bad\n", hook_mask);
198 return 0;
199 }
200 return 1;
201}
202
203static int ipt_dnat_checkentry(const char *tablename,
204 const struct ipt_entry *e,
205 void *targinfo,
206 unsigned int targinfosize,
207 unsigned int hook_mask)
208{
209 struct ip_nat_multi_range_compat *mr = targinfo;
210
211 /* Must be a valid range */
212 if (mr->rangesize != 1) {
213 printk("DNAT: multiple ranges no longer supported\n");
214 return 0;
215 }
216
217 if (targinfosize != IPT_ALIGN(sizeof(struct ip_nat_multi_range_compat))) {
218 DEBUGP("DNAT: Target size %u wrong for %u ranges\n",
219 targinfosize, mr->rangesize);
220 return 0;
221 }
222
223 /* Only allow these for NAT. */
224 if (strcmp(tablename, "nat") != 0) {
225 DEBUGP("DNAT: wrong table %s\n", tablename);
226 return 0;
227 }
228
229 if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) {
230 DEBUGP("DNAT: hook mask 0x%x bad\n", hook_mask);
231 return 0;
232 }
233
234 return 1;
235}
236
237inline unsigned int
238alloc_null_binding(struct ip_conntrack *conntrack,
239 struct ip_nat_info *info,
240 unsigned int hooknum)
241{
242 /* Force range to this IP; let proto decide mapping for
243 per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
244 Use reply in case it's already been mangled (eg local packet).
245 */
246 u_int32_t ip
247 = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
248 ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip
249 : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip);
250 struct ip_nat_range range
251 = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } };
252
253 DEBUGP("Allocating NULL binding for %p (%u.%u.%u.%u)\n", conntrack,
254 NIPQUAD(ip));
255 return ip_nat_setup_info(conntrack, &range, hooknum);
256}
257
258int ip_nat_rule_find(struct sk_buff **pskb,
259 unsigned int hooknum,
260 const struct net_device *in,
261 const struct net_device *out,
262 struct ip_conntrack *ct,
263 struct ip_nat_info *info)
264{
265 int ret;
266
267 ret = ipt_do_table(pskb, hooknum, in, out, &nat_table, NULL);
268
269 if (ret == NF_ACCEPT) {
270 if (!ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
 271			/* NULL mapping */
272 ret = alloc_null_binding(ct, info, hooknum);
273 }
274 return ret;
275}
276
277static struct ipt_target ipt_snat_reg = {
278 .name = "SNAT",
279 .target = ipt_snat_target,
280 .checkentry = ipt_snat_checkentry,
281};
282
283static struct ipt_target ipt_dnat_reg = {
284 .name = "DNAT",
285 .target = ipt_dnat_target,
286 .checkentry = ipt_dnat_checkentry,
287};
288
289int __init ip_nat_rule_init(void)
290{
291 int ret;
292
293 ret = ipt_register_table(&nat_table, &nat_initial_table.repl);
294 if (ret != 0)
295 return ret;
296 ret = ipt_register_target(&ipt_snat_reg);
297 if (ret != 0)
298 goto unregister_table;
299
300 ret = ipt_register_target(&ipt_dnat_reg);
301 if (ret != 0)
302 goto unregister_snat;
303
304 return ret;
305
306 unregister_snat:
307 ipt_unregister_target(&ipt_snat_reg);
308 unregister_table:
309 ipt_unregister_table(&nat_table);
310
311 return ret;
312}
313
314void ip_nat_rule_cleanup(void)
315{
316 ipt_unregister_target(&ipt_dnat_reg);
317 ipt_unregister_target(&ipt_snat_reg);
318 ipt_unregister_table(&nat_table);
319}
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
new file mode 100644
index 000000000000..2a48b6e635ae
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -0,0 +1,1347 @@
1/*
2 * ip_nat_snmp_basic.c
3 *
4 * Basic SNMP Application Layer Gateway
5 *
6 * This IP NAT module is intended for use with SNMP network
7 * discovery and monitoring applications where target networks use
8 * conflicting private address realms.
9 *
10 * Static NAT is used to remap the networks from the view of the network
11 * management system at the IP layer, and this module remaps some application
12 * layer addresses to match.
13 *
14 * The simplest form of ALG is performed, where only tagged IP addresses
15 * are modified. The module does not need to be MIB aware and only scans
16 * messages at the ASN.1/BER level.
17 *
18 * Currently, only SNMPv1 and SNMPv2 are supported.
19 *
20 * More information on ALG and associated issues can be found in
21 * RFC 2962
22 *
 23 * The ASN.1/BER parsing code is derived from the gxsnmp package by Gregory
24 * McLean & Jochen Friedrich, stripped down for use in the kernel.
25 *
26 * Copyright (c) 2000 RP Internet (www.rpi.net.au).
27 *
28 * This program is free software; you can redistribute it and/or modify
29 * it under the terms of the GNU General Public License as published by
30 * the Free Software Foundation; either version 2 of the License, or
31 * (at your option) any later version.
32 * This program is distributed in the hope that it will be useful,
33 * but WITHOUT ANY WARRANTY; without even the implied warranty of
34 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
35 * GNU General Public License for more details.
36 * You should have received a copy of the GNU General Public License
37 * along with this program; if not, write to the Free Software
38 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
39 *
40 * Author: James Morris <jmorris@intercode.com.au>
41 *
42 * Updates:
43 * 2000-08-06: Convert to new helper API (Harald Welte).
44 *
45 */
46#include <linux/config.h>
47#include <linux/module.h>
48#include <linux/types.h>
49#include <linux/kernel.h>
50#include <linux/moduleparam.h>
51#include <linux/netfilter_ipv4.h>
52#include <linux/netfilter_ipv4/ip_nat.h>
53#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
54#include <linux/netfilter_ipv4/ip_nat_helper.h>
55#include <linux/ip.h>
56#include <net/checksum.h>
57#include <net/udp.h>
58#include <asm/uaccess.h>
59
60MODULE_LICENSE("GPL");
61MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
62MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway");
63
64#define SNMP_PORT 161
65#define SNMP_TRAP_PORT 162
66#define NOCT1(n) (u_int8_t )((n) & 0xff)
67
68static int debug;
69static DEFINE_SPINLOCK(snmp_lock);
70
71/*
72 * Application layer address mapping mimics the NAT mapping, but
73 * only for the first octet in this case (a more flexible system
74 * can be implemented if needed).
75 */
76struct oct1_map
77{
78 u_int8_t from;
79 u_int8_t to;
80};
81
82
83/*****************************************************************************
84 *
85 * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse)
86 *
87 *****************************************************************************/
88
89/* Class */
90#define ASN1_UNI 0 /* Universal */
91#define ASN1_APL 1 /* Application */
92#define ASN1_CTX 2 /* Context */
93#define ASN1_PRV 3 /* Private */
94
95/* Tag */
96#define ASN1_EOC 0 /* End Of Contents */
97#define ASN1_BOL 1 /* Boolean */
98#define ASN1_INT 2 /* Integer */
99#define ASN1_BTS 3 /* Bit String */
100#define ASN1_OTS 4 /* Octet String */
101#define ASN1_NUL 5 /* Null */
102#define ASN1_OJI 6 /* Object Identifier */
103#define ASN1_OJD 7 /* Object Description */
104#define ASN1_EXT 8 /* External */
105#define ASN1_SEQ 16 /* Sequence */
106#define ASN1_SET 17 /* Set */
107#define ASN1_NUMSTR 18 /* Numerical String */
108#define ASN1_PRNSTR 19 /* Printable String */
109#define ASN1_TEXSTR 20 /* Teletext String */
110#define ASN1_VIDSTR 21 /* Video String */
111#define ASN1_IA5STR 22 /* IA5 String */
112#define ASN1_UNITIM 23 /* Universal Time */
113#define ASN1_GENTIM 24 /* General Time */
114#define ASN1_GRASTR 25 /* Graphical String */
115#define ASN1_VISSTR 26 /* Visible String */
116#define ASN1_GENSTR 27 /* General String */
117
118/* Primitive / Constructed methods */
119#define ASN1_PRI 0 /* Primitive */
120#define ASN1_CON 1 /* Constructed */
121
122/*
123 * Error codes.
124 */
125#define ASN1_ERR_NOERROR 0
126#define ASN1_ERR_DEC_EMPTY 2
127#define ASN1_ERR_DEC_EOC_MISMATCH 3
128#define ASN1_ERR_DEC_LENGTH_MISMATCH 4
129#define ASN1_ERR_DEC_BADVALUE 5
130
131/*
132 * ASN.1 context.
133 */
134struct asn1_ctx
135{
136 int error; /* Error condition */
137 unsigned char *pointer; /* Octet just to be decoded */
138 unsigned char *begin; /* First octet */
139 unsigned char *end; /* Octet after last octet */
140};
141
142/*
143 * Octet string (not null terminated)
144 */
145struct asn1_octstr
146{
147 unsigned char *data;
148 unsigned int len;
149};
150
151static void asn1_open(struct asn1_ctx *ctx,
152 unsigned char *buf,
153 unsigned int len)
154{
155 ctx->begin = buf;
156 ctx->end = buf + len;
157 ctx->pointer = buf;
158 ctx->error = ASN1_ERR_NOERROR;
159}
160
161static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch)
162{
163 if (ctx->pointer >= ctx->end) {
164 ctx->error = ASN1_ERR_DEC_EMPTY;
165 return 0;
166 }
167 *ch = *(ctx->pointer)++;
168 return 1;
169}
170
171static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag)
172{
173 unsigned char ch;
174
175 *tag = 0;
176
177 do
178 {
179 if (!asn1_octet_decode(ctx, &ch))
180 return 0;
181 *tag <<= 7;
182 *tag |= ch & 0x7F;
183 } while ((ch & 0x80) == 0x80);
184 return 1;
185}
186
187static unsigned char asn1_id_decode(struct asn1_ctx *ctx,
188 unsigned int *cls,
189 unsigned int *con,
190 unsigned int *tag)
191{
192 unsigned char ch;
193
194 if (!asn1_octet_decode(ctx, &ch))
195 return 0;
196
197 *cls = (ch & 0xC0) >> 6;
198 *con = (ch & 0x20) >> 5;
199 *tag = (ch & 0x1F);
200
201 if (*tag == 0x1F) {
202 if (!asn1_tag_decode(ctx, tag))
203 return 0;
204 }
205 return 1;
206}
207
208static unsigned char asn1_length_decode(struct asn1_ctx *ctx,
209 unsigned int *def,
210 unsigned int *len)
211{
212 unsigned char ch, cnt;
213
214 if (!asn1_octet_decode(ctx, &ch))
215 return 0;
216
217 if (ch == 0x80)
218 *def = 0;
219 else {
220 *def = 1;
221
222 if (ch < 0x80)
223 *len = ch;
224 else {
225 cnt = (unsigned char) (ch & 0x7F);
226 *len = 0;
227
228 while (cnt > 0) {
229 if (!asn1_octet_decode(ctx, &ch))
230 return 0;
231 *len <<= 8;
232 *len |= ch;
233 cnt--;
234 }
235 }
236 }
237 return 1;
238}
239
240static unsigned char asn1_header_decode(struct asn1_ctx *ctx,
241 unsigned char **eoc,
242 unsigned int *cls,
243 unsigned int *con,
244 unsigned int *tag)
245{
246 unsigned int def, len;
247
248 if (!asn1_id_decode(ctx, cls, con, tag))
249 return 0;
250
251 if (!asn1_length_decode(ctx, &def, &len))
252 return 0;
253
254 if (def)
255 *eoc = ctx->pointer + len;
256 else
257 *eoc = NULL;
258 return 1;
259}
260
261static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc)
262{
263 unsigned char ch;
264
265 if (eoc == 0) {
266 if (!asn1_octet_decode(ctx, &ch))
267 return 0;
268
269 if (ch != 0x00) {
270 ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
271 return 0;
272 }
273
274 if (!asn1_octet_decode(ctx, &ch))
275 return 0;
276
277 if (ch != 0x00) {
278 ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
279 return 0;
280 }
281 return 1;
282 } else {
283 if (ctx->pointer != eoc) {
284 ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH;
285 return 0;
286 }
287 return 1;
288 }
289}
290
291static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc)
292{
293 ctx->pointer = eoc;
294 return 1;
295}
296
297static unsigned char asn1_long_decode(struct asn1_ctx *ctx,
298 unsigned char *eoc,
299 long *integer)
300{
301 unsigned char ch;
302 unsigned int len;
303
304 if (!asn1_octet_decode(ctx, &ch))
305 return 0;
306
307 *integer = (signed char) ch;
308 len = 1;
309
310 while (ctx->pointer < eoc) {
311 if (++len > sizeof (long)) {
312 ctx->error = ASN1_ERR_DEC_BADVALUE;
313 return 0;
314 }
315
316 if (!asn1_octet_decode(ctx, &ch))
317 return 0;
318
319 *integer <<= 8;
320 *integer |= ch;
321 }
322 return 1;
323}
324
325static unsigned char asn1_uint_decode(struct asn1_ctx *ctx,
326 unsigned char *eoc,
327 unsigned int *integer)
328{
329 unsigned char ch;
330 unsigned int len;
331
332 if (!asn1_octet_decode(ctx, &ch))
333 return 0;
334
335 *integer = ch;
336 if (ch == 0) len = 0;
337 else len = 1;
338
339 while (ctx->pointer < eoc) {
340 if (++len > sizeof (unsigned int)) {
341 ctx->error = ASN1_ERR_DEC_BADVALUE;
342 return 0;
343 }
344
345 if (!asn1_octet_decode(ctx, &ch))
346 return 0;
347
348 *integer <<= 8;
349 *integer |= ch;
350 }
351 return 1;
352}
353
354static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx,
355 unsigned char *eoc,
356 unsigned long *integer)
357{
358 unsigned char ch;
359 unsigned int len;
360
361 if (!asn1_octet_decode(ctx, &ch))
362 return 0;
363
364 *integer = ch;
365 if (ch == 0) len = 0;
366 else len = 1;
367
368 while (ctx->pointer < eoc) {
369 if (++len > sizeof (unsigned long)) {
370 ctx->error = ASN1_ERR_DEC_BADVALUE;
371 return 0;
372 }
373
374 if (!asn1_octet_decode(ctx, &ch))
375 return 0;
376
377 *integer <<= 8;
378 *integer |= ch;
379 }
380 return 1;
381}
382
383static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
384 unsigned char *eoc,
385 unsigned char **octets,
386 unsigned int *len)
387{
388 unsigned char *ptr;
389
390 *len = 0;
391
392 *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
393 if (*octets == NULL) {
394 if (net_ratelimit())
395 printk("OOM in bsalg (%d)\n", __LINE__);
396 return 0;
397 }
398
399 ptr = *octets;
400 while (ctx->pointer < eoc) {
401 if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) {
402 kfree(*octets);
403 *octets = NULL;
404 return 0;
405 }
406 (*len)++;
407 }
408 return 1;
409}
410
411static unsigned char asn1_subid_decode(struct asn1_ctx *ctx,
412 unsigned long *subid)
413{
414 unsigned char ch;
415
416 *subid = 0;
417
418 do {
419 if (!asn1_octet_decode(ctx, &ch))
420 return 0;
421
422 *subid <<= 7;
423 *subid |= ch & 0x7F;
424 } while ((ch & 0x80) == 0x80);
425 return 1;
426}
427
428static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
429 unsigned char *eoc,
430 unsigned long **oid,
431 unsigned int *len)
432{
433 unsigned long subid;
434 unsigned int size;
435 unsigned long *optr;
436
437 size = eoc - ctx->pointer + 1;
438 *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
439 if (*oid == NULL) {
440 if (net_ratelimit())
441 printk("OOM in bsalg (%d)\n", __LINE__);
442 return 0;
443 }
444
445 optr = *oid;
446
447 if (!asn1_subid_decode(ctx, &subid)) {
448 kfree(*oid);
449 *oid = NULL;
450 return 0;
451 }
452
453 if (subid < 40) {
454 optr [0] = 0;
455 optr [1] = subid;
456 } else if (subid < 80) {
457 optr [0] = 1;
458 optr [1] = subid - 40;
459 } else {
460 optr [0] = 2;
461 optr [1] = subid - 80;
462 }
463
464 *len = 2;
465 optr += 2;
466
467 while (ctx->pointer < eoc) {
468 if (++(*len) > size) {
469 ctx->error = ASN1_ERR_DEC_BADVALUE;
470 kfree(*oid);
471 *oid = NULL;
472 return 0;
473 }
474
475 if (!asn1_subid_decode(ctx, optr++)) {
476 kfree(*oid);
477 *oid = NULL;
478 return 0;
479 }
480 }
481 return 1;
482}
483
484/*****************************************************************************
485 *
486 * SNMP decoding routines (gxsnmp author Dirk Wisse)
487 *
488 *****************************************************************************/
489
490/* SNMP Versions */
491#define SNMP_V1 0
492#define SNMP_V2C 1
493#define SNMP_V2 2
494#define SNMP_V3 3
495
496/* Default Sizes */
497#define SNMP_SIZE_COMM 256
498#define SNMP_SIZE_OBJECTID 128
499#define SNMP_SIZE_BUFCHR 256
500#define SNMP_SIZE_BUFINT 128
501#define SNMP_SIZE_SMALLOBJECTID 16
502
503/* Requests */
504#define SNMP_PDU_GET 0
505#define SNMP_PDU_NEXT 1
506#define SNMP_PDU_RESPONSE 2
507#define SNMP_PDU_SET 3
508#define SNMP_PDU_TRAP1 4
509#define SNMP_PDU_BULK 5
510#define SNMP_PDU_INFORM 6
511#define SNMP_PDU_TRAP2 7
512
513/* Errors */
514#define SNMP_NOERROR 0
515#define SNMP_TOOBIG 1
516#define SNMP_NOSUCHNAME 2
517#define SNMP_BADVALUE 3
518#define SNMP_READONLY 4
519#define SNMP_GENERROR 5
520#define SNMP_NOACCESS 6
521#define SNMP_WRONGTYPE 7
522#define SNMP_WRONGLENGTH 8
523#define SNMP_WRONGENCODING 9
524#define SNMP_WRONGVALUE 10
525#define SNMP_NOCREATION 11
526#define SNMP_INCONSISTENTVALUE 12
527#define SNMP_RESOURCEUNAVAILABLE 13
528#define SNMP_COMMITFAILED 14
529#define SNMP_UNDOFAILED 15
530#define SNMP_AUTHORIZATIONERROR 16
531#define SNMP_NOTWRITABLE 17
532#define SNMP_INCONSISTENTNAME 18
533
534/* General SNMP V1 Traps */
535#define SNMP_TRAP_COLDSTART 0
536#define SNMP_TRAP_WARMSTART 1
537#define SNMP_TRAP_LINKDOWN 2
538#define SNMP_TRAP_LINKUP 3
539#define SNMP_TRAP_AUTFAILURE 4
540#define SNMP_TRAP_EQPNEIGHBORLOSS 5
541#define SNMP_TRAP_ENTSPECIFIC 6
542
543/* SNMPv1 Types */
544#define SNMP_NULL 0
545#define SNMP_INTEGER 1 /* l */
546#define SNMP_OCTETSTR 2 /* c */
547#define SNMP_DISPLAYSTR 2 /* c */
548#define SNMP_OBJECTID 3 /* ul */
549#define SNMP_IPADDR 4 /* uc */
550#define SNMP_COUNTER 5 /* ul */
551#define SNMP_GAUGE 6 /* ul */
552#define SNMP_TIMETICKS 7 /* ul */
553#define SNMP_OPAQUE 8 /* c */
554
555/* Additional SNMPv2 Types */
556#define SNMP_UINTEGER 5 /* ul */
557#define SNMP_BITSTR 9 /* uc */
558#define SNMP_NSAP 10 /* uc */
559#define SNMP_COUNTER64 11 /* ul */
560#define SNMP_NOSUCHOBJECT 12
561#define SNMP_NOSUCHINSTANCE 13
562#define SNMP_ENDOFMIBVIEW 14
563
564union snmp_syntax
565{
566 unsigned char uc[0]; /* 8 bit unsigned */
567 char c[0]; /* 8 bit signed */
568 unsigned long ul[0]; /* 32 bit unsigned */
569 long l[0]; /* 32 bit signed */
570};
571
572struct snmp_object
573{
574 unsigned long *id;
575 unsigned int id_len;
576 unsigned short type;
577 unsigned int syntax_len;
578 union snmp_syntax syntax;
579};
580
581struct snmp_request
582{
583 unsigned long id;
584 unsigned int error_status;
585 unsigned int error_index;
586};
587
588struct snmp_v1_trap
589{
590 unsigned long *id;
591 unsigned int id_len;
592 unsigned long ip_address; /* pointer */
593 unsigned int general;
594 unsigned int specific;
595 unsigned long time;
596};
597
598/* SNMP types */
599#define SNMP_IPA 0
600#define SNMP_CNT 1
601#define SNMP_GGE 2
602#define SNMP_TIT 3
603#define SNMP_OPQ 4
604#define SNMP_C64 6
605
606/* SNMP errors */
607#define SERR_NSO 0
608#define SERR_NSI 1
609#define SERR_EOM 2
610
611static inline void mangle_address(unsigned char *begin,
612 unsigned char *addr,
613 const struct oct1_map *map,
614 u_int16_t *check);
615struct snmp_cnv
616{
617 unsigned int class;
618 unsigned int tag;
619 int syntax;
620};
621
622static struct snmp_cnv snmp_conv [] =
623{
624 {ASN1_UNI, ASN1_NUL, SNMP_NULL},
625 {ASN1_UNI, ASN1_INT, SNMP_INTEGER},
626 {ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR},
627 {ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR},
628 {ASN1_UNI, ASN1_OJI, SNMP_OBJECTID},
629 {ASN1_APL, SNMP_IPA, SNMP_IPADDR},
630 {ASN1_APL, SNMP_CNT, SNMP_COUNTER}, /* Counter32 */
631 {ASN1_APL, SNMP_GGE, SNMP_GAUGE}, /* Gauge32 == Unsigned32 */
632 {ASN1_APL, SNMP_TIT, SNMP_TIMETICKS},
633 {ASN1_APL, SNMP_OPQ, SNMP_OPAQUE},
634
635 /* SNMPv2 data types and errors */
636 {ASN1_UNI, ASN1_BTS, SNMP_BITSTR},
637 {ASN1_APL, SNMP_C64, SNMP_COUNTER64},
638 {ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT},
639 {ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE},
640 {ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW},
641 {0, 0, -1}
642};
643
644static unsigned char snmp_tag_cls2syntax(unsigned int tag,
645 unsigned int cls,
646 unsigned short *syntax)
647{
648 struct snmp_cnv *cnv;
649
650 cnv = snmp_conv;
651
652 while (cnv->syntax != -1) {
653 if (cnv->tag == tag && cnv->class == cls) {
654 *syntax = cnv->syntax;
655 return 1;
656 }
657 cnv++;
658 }
659 return 0;
660}
661
662static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
663 struct snmp_object **obj)
664{
665 unsigned int cls, con, tag, len, idlen;
666 unsigned short type;
667 unsigned char *eoc, *end, *p;
668 unsigned long *lp, *id;
669 unsigned long ul;
670 long l;
671
672 *obj = NULL;
673 id = NULL;
674
675 if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag))
676 return 0;
677
678 if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
679 return 0;
680
681 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
682 return 0;
683
684 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
685 return 0;
686
687 if (!asn1_oid_decode(ctx, end, &id, &idlen))
688 return 0;
689
690 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) {
691 kfree(id);
692 return 0;
693 }
694
695 if (con != ASN1_PRI) {
696 kfree(id);
697 return 0;
698 }
699
700 if (!snmp_tag_cls2syntax(tag, cls, &type)) {
701 kfree(id);
702 return 0;
703 }
704
705 switch (type) {
706 case SNMP_INTEGER:
707 len = sizeof(long);
708 if (!asn1_long_decode(ctx, end, &l)) {
709 kfree(id);
710 return 0;
711 }
712 *obj = kmalloc(sizeof(struct snmp_object) + len,
713 GFP_ATOMIC);
714 if (*obj == NULL) {
715 kfree(id);
716 if (net_ratelimit())
717 printk("OOM in bsalg (%d)\n", __LINE__);
718 return 0;
719 }
720 (*obj)->syntax.l[0] = l;
721 break;
722 case SNMP_OCTETSTR:
723 case SNMP_OPAQUE:
724 if (!asn1_octets_decode(ctx, end, &p, &len)) {
725 kfree(id);
726 return 0;
727 }
728 *obj = kmalloc(sizeof(struct snmp_object) + len,
729 GFP_ATOMIC);
730 if (*obj == NULL) {
731 kfree(id);
732 if (net_ratelimit())
733 printk("OOM in bsalg (%d)\n", __LINE__);
734 return 0;
735 }
736 memcpy((*obj)->syntax.c, p, len);
737 kfree(p);
738 break;
739 case SNMP_NULL:
740 case SNMP_NOSUCHOBJECT:
741 case SNMP_NOSUCHINSTANCE:
742 case SNMP_ENDOFMIBVIEW:
743 len = 0;
744 *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
745 if (*obj == NULL) {
746 kfree(id);
747 if (net_ratelimit())
748 printk("OOM in bsalg (%d)\n", __LINE__);
749 return 0;
750 }
751 if (!asn1_null_decode(ctx, end)) {
752 kfree(id);
753 kfree(*obj);
754 *obj = NULL;
755 return 0;
756 }
757 break;
758 case SNMP_OBJECTID:
759 if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) {
760 kfree(id);
761 return 0;
762 }
763 len *= sizeof(unsigned long);
764 *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
765 if (*obj == NULL) {
766 kfree(id);
767 if (net_ratelimit())
768 printk("OOM in bsalg (%d)\n", __LINE__);
769 return 0;
770 }
771 memcpy((*obj)->syntax.ul, lp, len);
772 kfree(lp);
773 break;
774 case SNMP_IPADDR:
775 if (!asn1_octets_decode(ctx, end, &p, &len)) {
776 kfree(id);
777 return 0;
778 }
779 if (len != 4) {
780 kfree(p);
781 kfree(id);
782 return 0;
783 }
784 *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
785 if (*obj == NULL) {
786 kfree(p);
787 kfree(id);
788 if (net_ratelimit())
789 printk("OOM in bsalg (%d)\n", __LINE__);
790 return 0;
791 }
792 memcpy((*obj)->syntax.uc, p, len);
793 kfree(p);
794 break;
795 case SNMP_COUNTER:
796 case SNMP_GAUGE:
797 case SNMP_TIMETICKS:
798 len = sizeof(unsigned long);
799 if (!asn1_ulong_decode(ctx, end, &ul)) {
800 kfree(id);
801 return 0;
802 }
803 *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
804 if (*obj == NULL) {
805 kfree(id);
806 if (net_ratelimit())
807 printk("OOM in bsalg (%d)\n", __LINE__);
808 return 0;
809 }
810 (*obj)->syntax.ul[0] = ul;
811 break;
812 default:
813 kfree(id);
814 return 0;
815 }
816
817 (*obj)->syntax_len = len;
818 (*obj)->type = type;
819 (*obj)->id = id;
820 (*obj)->id_len = idlen;
821
822 if (!asn1_eoc_decode(ctx, eoc)) {
823 kfree(id);
824 kfree(*obj);
825 *obj = NULL;
826 return 0;
827 }
828 return 1;
829}
830
831static unsigned char snmp_request_decode(struct asn1_ctx *ctx,
832 struct snmp_request *request)
833{
834 unsigned int cls, con, tag;
835 unsigned char *end;
836
837 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
838 return 0;
839
840 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
841 return 0;
842
843 if (!asn1_ulong_decode(ctx, end, &request->id))
844 return 0;
845
846 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
847 return 0;
848
849 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
850 return 0;
851
852 if (!asn1_uint_decode(ctx, end, &request->error_status))
853 return 0;
854
855 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
856 return 0;
857
858 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
859 return 0;
860
861 if (!asn1_uint_decode(ctx, end, &request->error_index))
862 return 0;
863
864 return 1;
865}
866
867/*
868 * Fast checksum update for possibly oddly-aligned UDP byte, from the
869 * code example in the draft.
870 */
871static void fast_csum(unsigned char *csum,
872 const unsigned char *optr,
873 const unsigned char *nptr,
874 int odd)
875{
876 long x, old, new;
877
878 x = csum[0] * 256 + csum[1];
879
880 x = ~x & 0xFFFF;
881
882 if (odd) old = optr[0] * 256;
883 else old = optr[0];
884
885 x -= old & 0xFFFF;
886 if (x <= 0) {
887 x--;
888 x &= 0xFFFF;
889 }
890
891 if (odd) new = nptr[0] * 256;
892 else new = nptr[0];
893
894 x += new & 0xFFFF;
895 if (x & 0x10000) {
896 x++;
897 x &= 0xFFFF;
898 }
899
900 x = ~x & 0xFFFF;
901 csum[0] = x / 256;
902 csum[1] = x & 0xFF;
903}
904
905/*
906 * Mangle IP address.
907 * - begin points to the start of the snmp message
908 * - addr points to the start of the address
909 */
910static inline void mangle_address(unsigned char *begin,
911 unsigned char *addr,
912 const struct oct1_map *map,
913 u_int16_t *check)
914{
915 if (map->from == NOCT1(*addr)) {
916 u_int32_t old;
917
918 if (debug)
919 memcpy(&old, (unsigned char *)addr, sizeof(old));
920
921 *addr = map->to;
922
923 /* Update UDP checksum if being used */
924 if (*check) {
925 unsigned char odd = !((addr - begin) % 2);
926
927 fast_csum((unsigned char *)check,
928 &map->from, &map->to, odd);
929
930 }
931
932 if (debug)
933 printk(KERN_DEBUG "bsalg: mapped %u.%u.%u.%u to "
934 "%u.%u.%u.%u\n", NIPQUAD(old), NIPQUAD(*addr));
935 }
936}
937
938static unsigned char snmp_trap_decode(struct asn1_ctx *ctx,
939 struct snmp_v1_trap *trap,
940 const struct oct1_map *map,
941 u_int16_t *check)
942{
943 unsigned int cls, con, tag, len;
944 unsigned char *end;
945
946 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
947 return 0;
948
949 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
950 return 0;
951
952 if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len))
953 return 0;
954
955 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
956 goto err_id_free;
957
958 if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) ||
959 (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS)))
960 goto err_id_free;
961
962 if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len))
963 goto err_id_free;
964
965 /* IPv4 only */
966 if (len != 4)
967 goto err_addr_free;
968
969 mangle_address(ctx->begin, ctx->pointer - 4, map, check);
970
971 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
972 goto err_addr_free;
973
974 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
975 goto err_addr_free;
976
977 if (!asn1_uint_decode(ctx, end, &trap->general))
978 goto err_addr_free;
979
980 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
981 goto err_addr_free;
982
983 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
984 goto err_addr_free;
985
986 if (!asn1_uint_decode(ctx, end, &trap->specific))
987 goto err_addr_free;
988
989 if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
990 goto err_addr_free;
991
992 if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) ||
993 (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT)))
994 goto err_addr_free;
995
996 if (!asn1_ulong_decode(ctx, end, &trap->time))
997 goto err_addr_free;
998
999 return 1;
1000
1001err_id_free:
1002 kfree(trap->id);
1003
1004err_addr_free:
1005 kfree((unsigned long *)trap->ip_address);
1006
1007 return 0;
1008}
1009
1010/*****************************************************************************
1011 *
1012 * Misc. routines
1013 *
1014 *****************************************************************************/
1015
1016static void hex_dump(unsigned char *buf, size_t len)
1017{
1018 size_t i;
1019
1020 for (i = 0; i < len; i++) {
1021 if (i && !(i % 16))
1022 printk("\n");
1023 printk("%02x ", *(buf + i));
1024 }
1025 printk("\n");
1026}
1027
1028/*
1029 * Parse and mangle SNMP message according to mapping.
1030 * (And this is the 'basic' method).
1031 */
1032static int snmp_parse_mangle(unsigned char *msg,
1033 u_int16_t len,
1034 const struct oct1_map *map,
1035 u_int16_t *check)
1036{
1037 unsigned char *eoc, *end;
1038 unsigned int cls, con, tag, vers, pdutype;
1039 struct asn1_ctx ctx;
1040 struct asn1_octstr comm;
1041 struct snmp_object **obj;
1042
1043 if (debug > 1)
1044 hex_dump(msg, len);
1045
1046 asn1_open(&ctx, msg, len);
1047
1048 /*
1049 * Start of SNMP message.
1050 */
1051 if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
1052 return 0;
1053 if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
1054 return 0;
1055
1056 /*
1057 * Version 1 or 2 handled.
1058 */
1059 if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag))
1060 return 0;
1061 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
1062 return 0;
1063 if (!asn1_uint_decode (&ctx, end, &vers))
1064 return 0;
1065 if (debug > 1)
1066 printk(KERN_DEBUG "bsalg: snmp version: %u\n", vers + 1);
1067 if (vers > 1)
1068 return 1;
1069
1070 /*
1071 * Community.
1072 */
1073 if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag))
1074 return 0;
1075 if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS)
1076 return 0;
1077 if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len))
1078 return 0;
1079 if (debug > 1) {
1080 unsigned int i;
1081
1082 printk(KERN_DEBUG "bsalg: community: ");
1083 for (i = 0; i < comm.len; i++)
1084 printk("%c", comm.data[i]);
1085 printk("\n");
1086 }
1087 kfree(comm.data);
1088
1089 /*
1090 * PDU type
1091 */
1092 if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype))
1093 return 0;
1094 if (cls != ASN1_CTX || con != ASN1_CON)
1095 return 0;
1096 if (debug > 1) {
1097 unsigned char *pdus[] = {
1098 [SNMP_PDU_GET] = "get",
1099 [SNMP_PDU_NEXT] = "get-next",
1100 [SNMP_PDU_RESPONSE] = "response",
1101 [SNMP_PDU_SET] = "set",
1102 [SNMP_PDU_TRAP1] = "trapv1",
1103 [SNMP_PDU_BULK] = "bulk",
1104 [SNMP_PDU_INFORM] = "inform",
1105 [SNMP_PDU_TRAP2] = "trapv2"
1106 };
1107
1108 if (pdutype > SNMP_PDU_TRAP2)
1109 printk(KERN_DEBUG "bsalg: bad pdu type %u\n", pdutype);
1110 else
1111 printk(KERN_DEBUG "bsalg: pdu: %s\n", pdus[pdutype]);
1112 }
1113 if (pdutype != SNMP_PDU_RESPONSE &&
1114 pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2)
1115 return 1;
1116
1117 /*
1118 * Request header or v1 trap
1119 */
1120 if (pdutype == SNMP_PDU_TRAP1) {
1121 struct snmp_v1_trap trap;
1122 unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check);
1123
1124 /* Discard trap allocations regardless */
1125 kfree(trap.id);
1126 kfree((unsigned long *)trap.ip_address);
1127
1128 if (!ret)
1129 return ret;
1130
1131 } else {
1132 struct snmp_request req;
1133
1134 if (!snmp_request_decode(&ctx, &req))
1135 return 0;
1136
1137 if (debug > 1)
1138 printk(KERN_DEBUG "bsalg: request: id=0x%lx error_status=%u "
1139 "error_index=%u\n", req.id, req.error_status,
1140 req.error_index);
1141 }
1142
1143 /*
1144 * Loop through objects, look for IP addresses to mangle.
1145 */
1146 if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
1147 return 0;
1148
1149 if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
1150 return 0;
1151
1152 obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
1153 if (obj == NULL) {
1154 if (net_ratelimit())
1155 printk(KERN_WARNING "OOM in bsalg(%d)\n", __LINE__);
1156 return 0;
1157 }
1158
1159 while (!asn1_eoc_decode(&ctx, eoc)) {
1160 unsigned int i;
1161
1162 if (!snmp_object_decode(&ctx, obj)) {
1163 if (*obj) {
1164 if ((*obj)->id)
1165 kfree((*obj)->id);
1166 kfree(*obj);
1167 }
1168 kfree(obj);
1169 return 0;
1170 }
1171
1172 if (debug > 1) {
1173 printk(KERN_DEBUG "bsalg: object: ");
1174 for (i = 0; i < (*obj)->id_len; i++) {
1175 if (i > 0)
1176 printk(".");
1177 printk("%lu", (*obj)->id[i]);
1178 }
1179 printk(": type=%u\n", (*obj)->type);
1180
1181 }
1182
1183 if ((*obj)->type == SNMP_IPADDR)
1184 mangle_address(ctx.begin, ctx.pointer - 4 , map, check);
1185
1186 kfree((*obj)->id);
1187 kfree(*obj);
1188 }
1189 kfree(obj);
1190
1191 if (!asn1_eoc_decode(&ctx, eoc))
1192 return 0;
1193
1194 return 1;
1195}
1196
1197/*****************************************************************************
1198 *
1199 * NAT routines.
1200 *
1201 *****************************************************************************/
1202
1203/*
1204 * SNMP translation routine.
1205 */
1206static int snmp_translate(struct ip_conntrack *ct,
1207 enum ip_conntrack_info ctinfo,
1208 struct sk_buff **pskb)
1209{
1210 struct iphdr *iph = (*pskb)->nh.iph;
1211 struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl);
1212 u_int16_t udplen = ntohs(udph->len);
1213 u_int16_t paylen = udplen - sizeof(struct udphdr);
1214 int dir = CTINFO2DIR(ctinfo);
1215 struct oct1_map map;
1216
1217 /*
1218 * Determine mapping for application layer addresses based
1219 * on NAT manipulations for the packet.
1220 */
1221 if (dir == IP_CT_DIR_ORIGINAL) {
1222 /* SNAT traps */
1223 map.from = NOCT1(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip);
1224 map.to = NOCT1(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip);
1225 } else {
1226 /* DNAT replies */
1227 map.from = NOCT1(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip);
1228 map.to = NOCT1(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip);
1229 }
1230
1231 if (map.from == map.to)
1232 return NF_ACCEPT;
1233
1234 if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr),
1235 paylen, &map, &udph->check)) {
1236 if (net_ratelimit())
1237 printk(KERN_WARNING "bsalg: parser failed\n");
1238 return NF_DROP;
1239 }
1240 return NF_ACCEPT;
1241}
1242
1243/* We don't actually set up expectations, just adjust internal IP
1244 * addresses if this is being NATted */
1245static int help(struct sk_buff **pskb,
1246 struct ip_conntrack *ct,
1247 enum ip_conntrack_info ctinfo)
1248{
1249 int dir = CTINFO2DIR(ctinfo);
1250 unsigned int ret;
1251 struct iphdr *iph = (*pskb)->nh.iph;
1252 struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl);
1253
1254 /* SNMP replies and originating SNMP traps get mangled */
1255 if (udph->source == ntohs(SNMP_PORT) && dir != IP_CT_DIR_REPLY)
1256 return NF_ACCEPT;
1257 if (udph->dest == ntohs(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL)
1258 return NF_ACCEPT;
1259
1260 /* No NAT? */
1261 if (!(ct->status & IPS_NAT_MASK))
1262 return NF_ACCEPT;
1263
1264 /*
1265 * Make sure the packet length is ok. So far, we were only guaranteed
1266 * to have a valid length IP header plus 8 bytes, which means we have
1267 * enough room for a UDP header. Just verify the UDP length field so we
1268 * can mess around with the payload.
1269 */
1270 if (ntohs(udph->len) != (*pskb)->len - (iph->ihl << 2)) {
1271 if (net_ratelimit())
1272 printk(KERN_WARNING "SNMP: dropping malformed packet "
1273 "src=%u.%u.%u.%u dst=%u.%u.%u.%u\n",
1274 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
1275 return NF_DROP;
1276 }
1277
1278 if (!skb_ip_make_writable(pskb, (*pskb)->len))
1279 return NF_DROP;
1280
1281 spin_lock_bh(&snmp_lock);
1282 ret = snmp_translate(ct, ctinfo, pskb);
1283 spin_unlock_bh(&snmp_lock);
1284 return ret;
1285}
1286
1287static struct ip_conntrack_helper snmp_helper = {
1288 .max_expected = 0,
1289 .timeout = 180,
1290 .me = THIS_MODULE,
1291 .help = help,
1292 .name = "snmp",
1293
1294 .tuple = { .src = { .u = { __constant_htons(SNMP_PORT) } },
1295 .dst = { .protonum = IPPROTO_UDP },
1296 },
1297 .mask = { .src = { .u = { 0xFFFF } },
1298 .dst = { .protonum = 0xFF },
1299 },
1300};
1301
1302static struct ip_conntrack_helper snmp_trap_helper = {
1303 .max_expected = 0,
1304 .timeout = 180,
1305 .me = THIS_MODULE,
1306 .help = help,
1307 .name = "snmp_trap",
1308
1309 .tuple = { .src = { .u = { __constant_htons(SNMP_TRAP_PORT) } },
1310 .dst = { .protonum = IPPROTO_UDP },
1311 },
1312 .mask = { .src = { .u = { 0xFFFF } },
1313 .dst = { .protonum = 0xFF },
1314 },
1315};
1316
1317/*****************************************************************************
1318 *
1319 * Module stuff.
1320 *
1321 *****************************************************************************/
1322
1323static int __init init(void)
1324{
1325 int ret = 0;
1326
1327 ret = ip_conntrack_helper_register(&snmp_helper);
1328 if (ret < 0)
1329 return ret;
1330 ret = ip_conntrack_helper_register(&snmp_trap_helper);
1331 if (ret < 0) {
1332 ip_conntrack_helper_unregister(&snmp_helper);
1333 return ret;
1334 }
1335 return ret;
1336}
1337
1338static void __exit fini(void)
1339{
1340 ip_conntrack_helper_unregister(&snmp_helper);
1341 ip_conntrack_helper_unregister(&snmp_trap_helper);
1342}
1343
1344module_init(init);
1345module_exit(fini);
1346
1347module_param(debug, bool, 0600);
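The helper above rewrites a single payload octet and then patches the UDP checksum incrementally via fast_csum() rather than recomputing it over the whole datagram. The stand-alone user-space sketch below is an illustration added for clarity (it is not part of the kernel file): it performs the same incremental one's-complement update in the RFC 1624 form HC' = ~(~HC + ~m + m') and cross-checks the result against a full recomputation.

#include <stdio.h>
#include <stdint.h>

/* Full 16-bit one's-complement checksum of an even-length buffer. */
static uint16_t csum16(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i < len; i += 2)
		sum += (uint32_t)((buf[i] << 8) | buf[i + 1]);
	while (sum >> 16)
		sum = (sum & 0xFFFF) + (sum >> 16);
	return (uint16_t)~sum;
}

/* Incremental update when one 16-bit word changes (RFC 1624, eqn. 3):
 * HC' = ~(~HC + ~m + m'). */
static uint16_t csum16_update(uint16_t old_csum, uint16_t old_word,
			      uint16_t new_word)
{
	uint32_t sum = (uint16_t)~old_csum;

	sum += (uint16_t)~old_word;
	sum += new_word;
	while (sum >> 16)
		sum = (sum & 0xFFFF) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* Toy "payload": pretend byte 0 is the first octet of an embedded
	 * IP address that NAT remaps from 10.x.x.x to 192.x.x.x. */
	uint8_t msg[8] = { 0x0a, 0x01, 0x02, 0x03, 0x30, 0x82, 0x00, 0x04 };
	uint16_t before = csum16(msg, sizeof(msg));
	uint16_t old_word = (uint16_t)((msg[0] << 8) | msg[1]);

	msg[0] = 0xc0;	/* 10 -> 192, as an oct1_map would do */

	printf("incremental=%04x full=%04x\n",
	       csum16_update(before, old_word, (uint16_t)((msg[0] << 8) | msg[1])),
	       csum16(msg, sizeof(msg)));
	return 0;
}

Both printed values should match. In the kernel, fast_csum() additionally has to cope with the changed byte landing in either half of its 16-bit checksum word, which is what its odd parameter selects.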
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
new file mode 100644
index 000000000000..dec4a74212cd
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -0,0 +1,349 @@
1/* This file contains all the functions required for the standalone
2 ip_nat module.
3
4 These are not required by the compatibility layer.
5*/
6
7/* (C) 1999-2001 Paul `Rusty' Russell
8 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 */
14
15/*
16 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
17 * - new API and handling of conntrack/nat helpers
18 * - now capable of multiple expectations for one master
19 * */
20
21#include <linux/config.h>
22#include <linux/types.h>
23#include <linux/icmp.h>
24#include <linux/ip.h>
25#include <linux/netfilter.h>
26#include <linux/netfilter_ipv4.h>
27#include <linux/module.h>
28#include <linux/skbuff.h>
29#include <linux/proc_fs.h>
30#include <net/ip.h>
31#include <net/checksum.h>
32#include <linux/spinlock.h>
33
34#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
35#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
36
37#include <linux/netfilter_ipv4/ip_nat.h>
38#include <linux/netfilter_ipv4/ip_nat_rule.h>
39#include <linux/netfilter_ipv4/ip_nat_protocol.h>
40#include <linux/netfilter_ipv4/ip_nat_core.h>
41#include <linux/netfilter_ipv4/ip_nat_helper.h>
42#include <linux/netfilter_ipv4/ip_tables.h>
43#include <linux/netfilter_ipv4/ip_conntrack_core.h>
44#include <linux/netfilter_ipv4/listhelp.h>
45
46#if 0
47#define DEBUGP printk
48#else
49#define DEBUGP(format, args...)
50#endif
51
52#define HOOKNAME(hooknum) ((hooknum) == NF_IP_POST_ROUTING ? "POST_ROUTING" \
53 : ((hooknum) == NF_IP_PRE_ROUTING ? "PRE_ROUTING" \
54 : ((hooknum) == NF_IP_LOCAL_OUT ? "LOCAL_OUT" \
55 : ((hooknum) == NF_IP_LOCAL_IN ? "LOCAL_IN" \
56 : "*ERROR*")))
57
58static unsigned int
59ip_nat_fn(unsigned int hooknum,
60 struct sk_buff **pskb,
61 const struct net_device *in,
62 const struct net_device *out,
63 int (*okfn)(struct sk_buff *))
64{
65 struct ip_conntrack *ct;
66 enum ip_conntrack_info ctinfo;
67 struct ip_nat_info *info;
68 /* maniptype == SRC for postrouting. */
69 enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);
70
71 /* We never see fragments: conntrack defrags on pre-routing
72 and local-out, and ip_nat_out protects post-routing. */
73 IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
74 & htons(IP_MF|IP_OFFSET)));
75
76 (*pskb)->nfcache |= NFC_UNKNOWN;
77
78 /* If we had a hardware checksum before, it's now invalid */
79 if ((*pskb)->ip_summed == CHECKSUM_HW)
80 if (skb_checksum_help(*pskb, (out == NULL)))
81 return NF_DROP;
82
83 ct = ip_conntrack_get(*pskb, &ctinfo);
84 /* Can't track? It's not due to stress, or conntrack would
85 have dropped it. Hence it's the user's responsibility to
86 packet filter it out, or implement conntrack/NAT for that
87 protocol. 8) --RR */
88 if (!ct) {
89 /* Exception: ICMP redirect to new connection (not in
90 hash table yet). We must not let this through, in
91 case we're doing NAT to the same network. */
92 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) {
93 struct icmphdr _hdr, *hp;
94
95 hp = skb_header_pointer(*pskb,
96 (*pskb)->nh.iph->ihl*4,
97 sizeof(_hdr), &_hdr);
98 if (hp != NULL &&
99 hp->type == ICMP_REDIRECT)
100 return NF_DROP;
101 }
102 return NF_ACCEPT;
103 }
104
105 switch (ctinfo) {
106 case IP_CT_RELATED:
107 case IP_CT_RELATED+IP_CT_IS_REPLY:
108 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) {
109 if (!icmp_reply_translation(pskb, ct, maniptype,
110 CTINFO2DIR(ctinfo)))
111 return NF_DROP;
112 else
113 return NF_ACCEPT;
114 }
115 /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
116 case IP_CT_NEW:
117 info = &ct->nat.info;
118
119 /* Seen it before? This can happen for loopback, retrans,
120 or local packets.. */
121 if (!ip_nat_initialized(ct, maniptype)) {
122 unsigned int ret;
123
124 /* LOCAL_IN hook doesn't have a chain! */
125 if (hooknum == NF_IP_LOCAL_IN)
126 ret = alloc_null_binding(ct, info, hooknum);
127 else
128 ret = ip_nat_rule_find(pskb, hooknum,
129 in, out, ct,
130 info);
131
132 if (ret != NF_ACCEPT) {
133 return ret;
134 }
135 } else
136 DEBUGP("Already setup manip %s for ct %p\n",
137 maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
138 ct);
139 break;
140
141 default:
142 /* ESTABLISHED */
143 IP_NF_ASSERT(ctinfo == IP_CT_ESTABLISHED
144 || ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY));
145 info = &ct->nat.info;
146 }
147
148 IP_NF_ASSERT(info);
149 return nat_packet(ct, ctinfo, hooknum, pskb);
150}
151
152static unsigned int
153ip_nat_in(unsigned int hooknum,
154 struct sk_buff **pskb,
155 const struct net_device *in,
156 const struct net_device *out,
157 int (*okfn)(struct sk_buff *))
158{
159 u_int32_t saddr, daddr;
160 unsigned int ret;
161
162 saddr = (*pskb)->nh.iph->saddr;
163 daddr = (*pskb)->nh.iph->daddr;
164
165 ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
166 if (ret != NF_DROP && ret != NF_STOLEN
167 && ((*pskb)->nh.iph->saddr != saddr
168 || (*pskb)->nh.iph->daddr != daddr)) {
169 dst_release((*pskb)->dst);
170 (*pskb)->dst = NULL;
171 }
172 return ret;
173}
174
175static unsigned int
176ip_nat_out(unsigned int hooknum,
177 struct sk_buff **pskb,
178 const struct net_device *in,
179 const struct net_device *out,
180 int (*okfn)(struct sk_buff *))
181{
182 /* root is playing with raw sockets. */
183 if ((*pskb)->len < sizeof(struct iphdr)
184 || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr))
185 return NF_ACCEPT;
186
187 /* We can hit fragment here; forwarded packets get
188 defragmented by connection tracking coming in, then
189 fragmented (grr) by the forward code.
190
191 In future: If we have nfct != NULL, AND we have NAT
192 initialized, AND there is no helper, then we can do full
193 NAPT on the head, and IP-address-only NAT on the rest.
194
195 I'm starting to have nightmares about fragments. */
196
197 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
198 *pskb = ip_ct_gather_frags(*pskb, IP_DEFRAG_NAT_OUT);
199
200 if (!*pskb)
201 return NF_STOLEN;
202 }
203
204 return ip_nat_fn(hooknum, pskb, in, out, okfn);
205}
206
207static unsigned int
208ip_nat_local_fn(unsigned int hooknum,
209 struct sk_buff **pskb,
210 const struct net_device *in,
211 const struct net_device *out,
212 int (*okfn)(struct sk_buff *))
213{
214 u_int32_t saddr, daddr;
215 unsigned int ret;
216
217 /* root is playing with raw sockets. */
218 if ((*pskb)->len < sizeof(struct iphdr)
219 || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr))
220 return NF_ACCEPT;
221
222 saddr = (*pskb)->nh.iph->saddr;
223 daddr = (*pskb)->nh.iph->daddr;
224
225 ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
226 if (ret != NF_DROP && ret != NF_STOLEN
227 && ((*pskb)->nh.iph->saddr != saddr
228 || (*pskb)->nh.iph->daddr != daddr))
229 return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP;
230 return ret;
231}
232
233/* We must be after connection tracking and before packet filtering. */
234
235/* Before packet filtering, change destination */
236static struct nf_hook_ops ip_nat_in_ops = {
237 .hook = ip_nat_in,
238 .owner = THIS_MODULE,
239 .pf = PF_INET,
240 .hooknum = NF_IP_PRE_ROUTING,
241 .priority = NF_IP_PRI_NAT_DST,
242};
243
244/* After packet filtering, change source */
245static struct nf_hook_ops ip_nat_out_ops = {
246 .hook = ip_nat_out,
247 .owner = THIS_MODULE,
248 .pf = PF_INET,
249 .hooknum = NF_IP_POST_ROUTING,
250 .priority = NF_IP_PRI_NAT_SRC,
251};
252
253/* Before packet filtering, change destination */
254static struct nf_hook_ops ip_nat_local_out_ops = {
255 .hook = ip_nat_local_fn,
256 .owner = THIS_MODULE,
257 .pf = PF_INET,
258 .hooknum = NF_IP_LOCAL_OUT,
259 .priority = NF_IP_PRI_NAT_DST,
260};
261
262/* After packet filtering, change source for reply packets of LOCAL_OUT DNAT */
263static struct nf_hook_ops ip_nat_local_in_ops = {
264 .hook = ip_nat_fn,
265 .owner = THIS_MODULE,
266 .pf = PF_INET,
267 .hooknum = NF_IP_LOCAL_IN,
268 .priority = NF_IP_PRI_NAT_SRC,
269};
270
271static int init_or_cleanup(int init)
272{
273 int ret = 0;
274
275 need_ip_conntrack();
276
277 if (!init) goto cleanup;
278
279 ret = ip_nat_rule_init();
280 if (ret < 0) {
281 printk("ip_nat_init: can't setup rules.\n");
282 goto cleanup_nothing;
283 }
284 ret = ip_nat_init();
285 if (ret < 0) {
286 printk("ip_nat_init: can't setup nat.\n");
287 goto cleanup_rule_init;
288 }
289 ret = nf_register_hook(&ip_nat_in_ops);
290 if (ret < 0) {
291 printk("ip_nat_init: can't register in hook.\n");
292 goto cleanup_nat;
293 }
294 ret = nf_register_hook(&ip_nat_out_ops);
295 if (ret < 0) {
296 printk("ip_nat_init: can't register out hook.\n");
297 goto cleanup_inops;
298 }
299 ret = nf_register_hook(&ip_nat_local_out_ops);
300 if (ret < 0) {
301 printk("ip_nat_init: can't register local out hook.\n");
302 goto cleanup_outops;
303 }
304 ret = nf_register_hook(&ip_nat_local_in_ops);
305 if (ret < 0) {
306 printk("ip_nat_init: can't register local in hook.\n");
307 goto cleanup_localoutops;
308 }
309 return ret;
310
311 cleanup:
312 nf_unregister_hook(&ip_nat_local_in_ops);
313 cleanup_localoutops:
314 nf_unregister_hook(&ip_nat_local_out_ops);
315 cleanup_outops:
316 nf_unregister_hook(&ip_nat_out_ops);
317 cleanup_inops:
318 nf_unregister_hook(&ip_nat_in_ops);
319 cleanup_nat:
320 ip_nat_cleanup();
321 cleanup_rule_init:
322 ip_nat_rule_cleanup();
323 cleanup_nothing:
324 MUST_BE_READ_WRITE_UNLOCKED(&ip_nat_lock);
325 return ret;
326}
327
328static int __init init(void)
329{
330 return init_or_cleanup(1);
331}
332
333static void __exit fini(void)
334{
335 init_or_cleanup(0);
336}
337
338module_init(init);
339module_exit(fini);
340
341EXPORT_SYMBOL(ip_nat_setup_info);
342EXPORT_SYMBOL(ip_nat_protocol_register);
343EXPORT_SYMBOL(ip_nat_protocol_unregister);
344EXPORT_SYMBOL(ip_nat_cheat_check);
345EXPORT_SYMBOL(ip_nat_mangle_tcp_packet);
346EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
347EXPORT_SYMBOL(ip_nat_used_tuple);
348EXPORT_SYMBOL(ip_nat_follow_master);
349MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c
new file mode 100644
index 000000000000..0343e0d64674
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_tftp.c
@@ -0,0 +1,70 @@
1/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License version 2 as
5 * published by the Free Software Foundation.
6 *
7 * Version: 0.0.7
8 *
9 * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org>
10 * - Port to newnat API
11 *
12 * This module currently supports DNAT:
13 * iptables -t nat -A PREROUTING -d x.x.x.x -j DNAT --to-dest x.x.x.y
14 *
15 * and SNAT:
16 * iptables -t nat -A POSTROUTING { -j MASQUERADE , -j SNAT --to-source x.x.x.x }
17 *
18 * It has not been tested with
19 * -j SNAT --to-source x.x.x.x-x.x.x.y since I only have one external ip
20 * If you do test this please let me know if it works or not.
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/netfilter_ipv4.h>
26#include <linux/ip.h>
27#include <linux/udp.h>
28
29#include <linux/netfilter.h>
30#include <linux/netfilter_ipv4/ip_tables.h>
31#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
32#include <linux/netfilter_ipv4/ip_conntrack_tftp.h>
33#include <linux/netfilter_ipv4/ip_nat_helper.h>
34#include <linux/netfilter_ipv4/ip_nat_rule.h>
35#include <linux/moduleparam.h>
36
37MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
38MODULE_DESCRIPTION("tftp NAT helper");
39MODULE_LICENSE("GPL");
40
41static unsigned int help(struct sk_buff **pskb,
42 enum ip_conntrack_info ctinfo,
43 struct ip_conntrack_expect *exp)
44{
45 exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port;
46 exp->dir = IP_CT_DIR_REPLY;
47 exp->expectfn = ip_nat_follow_master;
48 if (ip_conntrack_expect_related(exp) != 0) {
49 ip_conntrack_expect_free(exp);
50 return NF_DROP;
51 }
52 return NF_ACCEPT;
53}
54
55static void __exit fini(void)
56{
57 ip_nat_tftp_hook = NULL;
58 /* Make sure no one calls it meanwhile. */
59 synchronize_net();
60}
61
62static int __init init(void)
63{
64 BUG_ON(ip_nat_tftp_hook);
65 ip_nat_tftp_hook = help;
66 return 0;
67}
68
69module_init(init);
70module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
new file mode 100644
index 000000000000..9e40dffc204f
--- /dev/null
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -0,0 +1,741 @@
1/*
2 * This is a module which is used for queueing IPv4 packets and
3 * communicating with userspace via netlink.
4 *
5 * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * 2000-03-27: Simplified code (thanks to Andi Kleen for clues).
12 * 2000-05-20: Fixed notifier problems (following Miguel Freitas' report).
13 * 2000-06-19: Fixed so nfmark is copied to metadata (reported by Sebastian
14 * Zander).
15 * 2000-08-01: Added Nick Williams' MAC support.
16 * 2002-06-25: Code cleanup.
17 * 2005-01-10: Added /proc counter for dropped packets; fixed so
18 * packets aren't delivered to user space if they're going
19 * to be dropped.
20 *
21 */
22#include <linux/module.h>
23#include <linux/skbuff.h>
24#include <linux/init.h>
25#include <linux/ip.h>
26#include <linux/notifier.h>
27#include <linux/netdevice.h>
28#include <linux/netfilter.h>
29#include <linux/netfilter_ipv4/ip_queue.h>
30#include <linux/netfilter_ipv4/ip_tables.h>
31#include <linux/netlink.h>
32#include <linux/spinlock.h>
33#include <linux/sysctl.h>
34#include <linux/proc_fs.h>
35#include <linux/security.h>
36#include <net/sock.h>
37#include <net/route.h>
38
39#define IPQ_QMAX_DEFAULT 1024
40#define IPQ_PROC_FS_NAME "ip_queue"
41#define NET_IPQ_QMAX 2088
42#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
43
44struct ipq_rt_info {
45 __u8 tos;
46 __u32 daddr;
47 __u32 saddr;
48};
49
50struct ipq_queue_entry {
51 struct list_head list;
52 struct nf_info *info;
53 struct sk_buff *skb;
54 struct ipq_rt_info rt_info;
55};
56
57typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
58
59static unsigned char copy_mode = IPQ_COPY_NONE;
60static unsigned int queue_maxlen = IPQ_QMAX_DEFAULT;
61static DEFINE_RWLOCK(queue_lock);
62static int peer_pid;
63static unsigned int copy_range;
64static unsigned int queue_total;
65static unsigned int queue_dropped = 0;
66static unsigned int queue_user_dropped = 0;
67static struct sock *ipqnl;
68static LIST_HEAD(queue_list);
69static DECLARE_MUTEX(ipqnl_sem);
70
71static void
72ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
73{
74 nf_reinject(entry->skb, entry->info, verdict);
75 kfree(entry);
76}
77
78static inline void
79__ipq_enqueue_entry(struct ipq_queue_entry *entry)
80{
81 list_add(&entry->list, &queue_list);
82 queue_total++;
83}
84
85/*
86 * Find and return a queued entry matched by cmpfn, or return the last
87 * entry if cmpfn is NULL.
88 */
89static inline struct ipq_queue_entry *
90__ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data)
91{
92 struct list_head *p;
93
94 list_for_each_prev(p, &queue_list) {
95 struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p;
96
97 if (!cmpfn || cmpfn(entry, data))
98 return entry;
99 }
100 return NULL;
101}
102
103static inline void
104__ipq_dequeue_entry(struct ipq_queue_entry *entry)
105{
106 list_del(&entry->list);
107 queue_total--;
108}
109
110static inline struct ipq_queue_entry *
111__ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
112{
113 struct ipq_queue_entry *entry;
114
115 entry = __ipq_find_entry(cmpfn, data);
116 if (entry == NULL)
117 return NULL;
118
119 __ipq_dequeue_entry(entry);
120 return entry;
121}
122
123
124static inline void
125__ipq_flush(int verdict)
126{
127 struct ipq_queue_entry *entry;
128
129 while ((entry = __ipq_find_dequeue_entry(NULL, 0)))
130 ipq_issue_verdict(entry, verdict);
131}
132
133static inline int
134__ipq_set_mode(unsigned char mode, unsigned int range)
135{
136 int status = 0;
137
138 switch(mode) {
139 case IPQ_COPY_NONE:
140 case IPQ_COPY_META:
141 copy_mode = mode;
142 copy_range = 0;
143 break;
144
145 case IPQ_COPY_PACKET:
146 copy_mode = mode;
147 copy_range = range;
148 if (copy_range > 0xFFFF)
149 copy_range = 0xFFFF;
150 break;
151
152 default:
153 status = -EINVAL;
154
155 }
156 return status;
157}
158
159static inline void
160__ipq_reset(void)
161{
162 peer_pid = 0;
163 net_disable_timestamp();
164 __ipq_set_mode(IPQ_COPY_NONE, 0);
165 __ipq_flush(NF_DROP);
166}
167
168static struct ipq_queue_entry *
169ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
170{
171 struct ipq_queue_entry *entry;
172
173 write_lock_bh(&queue_lock);
174 entry = __ipq_find_dequeue_entry(cmpfn, data);
175 write_unlock_bh(&queue_lock);
176 return entry;
177}
178
179static void
180ipq_flush(int verdict)
181{
182 write_lock_bh(&queue_lock);
183 __ipq_flush(verdict);
184 write_unlock_bh(&queue_lock);
185}
186
187static struct sk_buff *
188ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
189{
190 unsigned char *old_tail;
191 size_t size = 0;
192 size_t data_len = 0;
193 struct sk_buff *skb;
194 struct ipq_packet_msg *pmsg;
195 struct nlmsghdr *nlh;
196
197 read_lock_bh(&queue_lock);
198
199 switch (copy_mode) {
200 case IPQ_COPY_META:
201 case IPQ_COPY_NONE:
202 size = NLMSG_SPACE(sizeof(*pmsg));
203 data_len = 0;
204 break;
205
206 case IPQ_COPY_PACKET:
207 if (copy_range == 0 || copy_range > entry->skb->len)
208 data_len = entry->skb->len;
209 else
210 data_len = copy_range;
211
212 size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
213 break;
214
215 default:
216 *errp = -EINVAL;
217 read_unlock_bh(&queue_lock);
218 return NULL;
219 }
220
221 read_unlock_bh(&queue_lock);
222
223 skb = alloc_skb(size, GFP_ATOMIC);
224 if (!skb)
225 goto nlmsg_failure;
226
227 old_tail= skb->tail;
228 nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
229 pmsg = NLMSG_DATA(nlh);
230 memset(pmsg, 0, sizeof(*pmsg));
231
232 pmsg->packet_id = (unsigned long )entry;
233 pmsg->data_len = data_len;
234 pmsg->timestamp_sec = entry->skb->stamp.tv_sec;
235 pmsg->timestamp_usec = entry->skb->stamp.tv_usec;
236 pmsg->mark = entry->skb->nfmark;
237 pmsg->hook = entry->info->hook;
238 pmsg->hw_protocol = entry->skb->protocol;
239
240 if (entry->info->indev)
241 strcpy(pmsg->indev_name, entry->info->indev->name);
242 else
243 pmsg->indev_name[0] = '\0';
244
245 if (entry->info->outdev)
246 strcpy(pmsg->outdev_name, entry->info->outdev->name);
247 else
248 pmsg->outdev_name[0] = '\0';
249
250 if (entry->info->indev && entry->skb->dev) {
251 pmsg->hw_type = entry->skb->dev->type;
252 if (entry->skb->dev->hard_header_parse)
253 pmsg->hw_addrlen =
254 entry->skb->dev->hard_header_parse(entry->skb,
255 pmsg->hw_addr);
256 }
257
258 if (data_len)
259 if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len))
260 BUG();
261
262 nlh->nlmsg_len = skb->tail - old_tail;
263 return skb;
264
265nlmsg_failure:
266 if (skb)
267 kfree_skb(skb);
268 *errp = -EINVAL;
269 printk(KERN_ERR "ip_queue: error creating packet message\n");
270 return NULL;
271}
272
273static int
274ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
275{
276 int status = -EINVAL;
277 struct sk_buff *nskb;
278 struct ipq_queue_entry *entry;
279
280 if (copy_mode == IPQ_COPY_NONE)
281 return -EAGAIN;
282
283 entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
284 if (entry == NULL) {
285 printk(KERN_ERR "ip_queue: OOM in ipq_enqueue_packet()\n");
286 return -ENOMEM;
287 }
288
289 entry->info = info;
290 entry->skb = skb;
291
292 if (entry->info->hook == NF_IP_LOCAL_OUT) {
293 struct iphdr *iph = skb->nh.iph;
294
295 entry->rt_info.tos = iph->tos;
296 entry->rt_info.daddr = iph->daddr;
297 entry->rt_info.saddr = iph->saddr;
298 }
299
300 nskb = ipq_build_packet_message(entry, &status);
301 if (nskb == NULL)
302 goto err_out_free;
303
304 write_lock_bh(&queue_lock);
305
306 if (!peer_pid)
307 goto err_out_free_nskb;
308
309 if (queue_total >= queue_maxlen) {
310 queue_dropped++;
311 status = -ENOSPC;
312 if (net_ratelimit())
313 printk (KERN_WARNING "ip_queue: full at %d entries, "
314 "dropping packet(s). Dropped: %d\n", queue_total,
315 queue_dropped);
316 goto err_out_free_nskb;
317 }
318
319 /* netlink_unicast will either free the nskb or attach it to a socket */
320 status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
321 if (status < 0) {
322 queue_user_dropped++;
323 goto err_out_unlock;
324 }
325
326 __ipq_enqueue_entry(entry);
327
328 write_unlock_bh(&queue_lock);
329 return status;
330
331err_out_free_nskb:
332 kfree_skb(nskb);
333
334err_out_unlock:
335 write_unlock_bh(&queue_lock);
336
337err_out_free:
338 kfree(entry);
339 return status;
340}
341
342static int
343ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
344{
345 int diff;
346 struct iphdr *user_iph = (struct iphdr *)v->payload;
347
348 if (v->data_len < sizeof(*user_iph))
349 return 0;
350 diff = v->data_len - e->skb->len;
351 if (diff < 0)
352 skb_trim(e->skb, v->data_len);
353 else if (diff > 0) {
354 if (v->data_len > 0xFFFF)
355 return -EINVAL;
356 if (diff > skb_tailroom(e->skb)) {
357 struct sk_buff *newskb;
358
359 newskb = skb_copy_expand(e->skb,
360 skb_headroom(e->skb),
361 diff,
362 GFP_ATOMIC);
363 if (newskb == NULL) {
364 printk(KERN_WARNING "ip_queue: OOM "
365 "in mangle, dropping packet\n");
366 return -ENOMEM;
367 }
368 if (e->skb->sk)
369 skb_set_owner_w(newskb, e->skb->sk);
370 kfree_skb(e->skb);
371 e->skb = newskb;
372 }
373 skb_put(e->skb, diff);
374 }
375 if (!skb_ip_make_writable(&e->skb, v->data_len))
376 return -ENOMEM;
377 memcpy(e->skb->data, v->payload, v->data_len);
378 e->skb->nfcache |= NFC_ALTERED;
379
380 /*
381 * Extra routing may be needed on local out, as the QUEUE target never
382 * returns control to the table.
383 */
384 if (e->info->hook == NF_IP_LOCAL_OUT) {
385 struct iphdr *iph = e->skb->nh.iph;
386
387 if (!(iph->tos == e->rt_info.tos
388 && iph->daddr == e->rt_info.daddr
389 && iph->saddr == e->rt_info.saddr))
390 return ip_route_me_harder(&e->skb);
391 }
392 return 0;
393}
394
395static inline int
396id_cmp(struct ipq_queue_entry *e, unsigned long id)
397{
398 return (id == (unsigned long )e);
399}
400
401static int
402ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
403{
404 struct ipq_queue_entry *entry;
405
406 if (vmsg->value > NF_MAX_VERDICT)
407 return -EINVAL;
408
409 entry = ipq_find_dequeue_entry(id_cmp, vmsg->id);
410 if (entry == NULL)
411 return -ENOENT;
412 else {
413 int verdict = vmsg->value;
414
415 if (vmsg->data_len && vmsg->data_len == len)
416 if (ipq_mangle_ipv4(vmsg, entry) < 0)
417 verdict = NF_DROP;
418
419 ipq_issue_verdict(entry, verdict);
420 return 0;
421 }
422}
423
424static int
425ipq_set_mode(unsigned char mode, unsigned int range)
426{
427 int status;
428
429 write_lock_bh(&queue_lock);
430 status = __ipq_set_mode(mode, range);
431 write_unlock_bh(&queue_lock);
432 return status;
433}
434
435static int
436ipq_receive_peer(struct ipq_peer_msg *pmsg,
437 unsigned char type, unsigned int len)
438{
439 int status = 0;
440
441 if (len < sizeof(*pmsg))
442 return -EINVAL;
443
444 switch (type) {
445 case IPQM_MODE:
446 status = ipq_set_mode(pmsg->msg.mode.value,
447 pmsg->msg.mode.range);
448 break;
449
450 case IPQM_VERDICT:
451 if (pmsg->msg.verdict.value > NF_MAX_VERDICT)
452 status = -EINVAL;
453 else
454 status = ipq_set_verdict(&pmsg->msg.verdict,
455 len - sizeof(*pmsg));
456 break;
457 default:
458 status = -EINVAL;
459 }
460 return status;
461}
462
463static int
464dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex)
465{
466 if (entry->info->indev)
467 if (entry->info->indev->ifindex == ifindex)
468 return 1;
469
470 if (entry->info->outdev)
471 if (entry->info->outdev->ifindex == ifindex)
472 return 1;
473
474 return 0;
475}
476
477static void
478ipq_dev_drop(int ifindex)
479{
480 struct ipq_queue_entry *entry;
481
482 while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL)
483 ipq_issue_verdict(entry, NF_DROP);
484}
485
486#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
487
488static inline void
489ipq_rcv_skb(struct sk_buff *skb)
490{
491 int status, type, pid, flags, nlmsglen, skblen;
492 struct nlmsghdr *nlh;
493
494 skblen = skb->len;
495 if (skblen < sizeof(*nlh))
496 return;
497
498 nlh = (struct nlmsghdr *)skb->data;
499 nlmsglen = nlh->nlmsg_len;
500 if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
501 return;
502
503 pid = nlh->nlmsg_pid;
504 flags = nlh->nlmsg_flags;
505
506 if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI)
507 RCV_SKB_FAIL(-EINVAL);
508
509 if (flags & MSG_TRUNC)
510 RCV_SKB_FAIL(-ECOMM);
511
512 type = nlh->nlmsg_type;
513 if (type < NLMSG_NOOP || type >= IPQM_MAX)
514 RCV_SKB_FAIL(-EINVAL);
515
516 if (type <= IPQM_BASE)
517 return;
518
519 if (security_netlink_recv(skb))
520 RCV_SKB_FAIL(-EPERM);
521
522 write_lock_bh(&queue_lock);
523
524 if (peer_pid) {
525 if (peer_pid != pid) {
526 write_unlock_bh(&queue_lock);
527 RCV_SKB_FAIL(-EBUSY);
528 }
529 } else {
530 net_enable_timestamp();
531 peer_pid = pid;
532 }
533
534 write_unlock_bh(&queue_lock);
535
536 status = ipq_receive_peer(NLMSG_DATA(nlh), type,
537 skblen - NLMSG_LENGTH(0));
538 if (status < 0)
539 RCV_SKB_FAIL(status);
540
541 if (flags & NLM_F_ACK)
542 netlink_ack(skb, nlh, 0);
543 return;
544}
545
546static void
547ipq_rcv_sk(struct sock *sk, int len)
548{
549 do {
550 struct sk_buff *skb;
551
552 if (down_trylock(&ipqnl_sem))
553 return;
554
555 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
556 ipq_rcv_skb(skb);
557 kfree_skb(skb);
558 }
559
560 up(&ipqnl_sem);
561
562 } while (ipqnl && ipqnl->sk_receive_queue.qlen);
563}
564
565static int
566ipq_rcv_dev_event(struct notifier_block *this,
567 unsigned long event, void *ptr)
568{
569 struct net_device *dev = ptr;
570
571 /* Drop any packets associated with the downed device */
572 if (event == NETDEV_DOWN)
573 ipq_dev_drop(dev->ifindex);
574 return NOTIFY_DONE;
575}
576
577static struct notifier_block ipq_dev_notifier = {
578 .notifier_call = ipq_rcv_dev_event,
579};
580
581static int
582ipq_rcv_nl_event(struct notifier_block *this,
583 unsigned long event, void *ptr)
584{
585 struct netlink_notify *n = ptr;
586
587 if (event == NETLINK_URELEASE &&
588 n->protocol == NETLINK_FIREWALL && n->pid) {
589 write_lock_bh(&queue_lock);
590 if (n->pid == peer_pid)
591 __ipq_reset();
592 write_unlock_bh(&queue_lock);
593 }
594 return NOTIFY_DONE;
595}
596
597static struct notifier_block ipq_nl_notifier = {
598 .notifier_call = ipq_rcv_nl_event,
599};
600
601static struct ctl_table_header *ipq_sysctl_header;
602
603static ctl_table ipq_table[] = {
604 {
605 .ctl_name = NET_IPQ_QMAX,
606 .procname = NET_IPQ_QMAX_NAME,
607 .data = &queue_maxlen,
608 .maxlen = sizeof(queue_maxlen),
609 .mode = 0644,
610 .proc_handler = proc_dointvec
611 },
612 { .ctl_name = 0 }
613};
614
615static ctl_table ipq_dir_table[] = {
616 {
617 .ctl_name = NET_IPV4,
618 .procname = "ipv4",
619 .mode = 0555,
620 .child = ipq_table
621 },
622 { .ctl_name = 0 }
623};
624
625static ctl_table ipq_root_table[] = {
626 {
627 .ctl_name = CTL_NET,
628 .procname = "net",
629 .mode = 0555,
630 .child = ipq_dir_table
631 },
632 { .ctl_name = 0 }
633};
634
635#ifdef CONFIG_PROC_FS
636static int
637ipq_get_info(char *buffer, char **start, off_t offset, int length)
638{
639 int len;
640
641 read_lock_bh(&queue_lock);
642
643 len = sprintf(buffer,
644 "Peer PID : %d\n"
645 "Copy mode : %hu\n"
646 "Copy range : %u\n"
647 "Queue length : %u\n"
648 "Queue max. length : %u\n"
649 "Queue dropped : %u\n"
650 "Netlink dropped : %u\n",
651 peer_pid,
652 copy_mode,
653 copy_range,
654 queue_total,
655 queue_maxlen,
656 queue_dropped,
657 queue_user_dropped);
658
659 read_unlock_bh(&queue_lock);
660
661 *start = buffer + offset;
662 len -= offset;
663 if (len > length)
664 len = length;
665 else if (len < 0)
666 len = 0;
667 return len;
668}
669#endif /* CONFIG_PROC_FS */
670
671static int
672init_or_cleanup(int init)
673{
674 int status = -ENOMEM;
675 struct proc_dir_entry *proc;
676
677 if (!init)
678 goto cleanup;
679
680 netlink_register_notifier(&ipq_nl_notifier);
681 ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk);
682 if (ipqnl == NULL) {
683 printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
684 goto cleanup_netlink_notifier;
685 }
686
687 proc = proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info);
688 if (proc)
689 proc->owner = THIS_MODULE;
690 else {
691 printk(KERN_ERR "ip_queue: failed to create proc entry\n");
692 goto cleanup_ipqnl;
693 }
694
695 register_netdevice_notifier(&ipq_dev_notifier);
696 ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
697
698 status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL);
699 if (status < 0) {
700 printk(KERN_ERR "ip_queue: failed to register queue handler\n");
701 goto cleanup_sysctl;
702 }
703 return status;
704
705cleanup:
706 nf_unregister_queue_handler(PF_INET);
707 synchronize_net();
708 ipq_flush(NF_DROP);
709
710cleanup_sysctl:
711 unregister_sysctl_table(ipq_sysctl_header);
712 unregister_netdevice_notifier(&ipq_dev_notifier);
713 proc_net_remove(IPQ_PROC_FS_NAME);
714
715cleanup_ipqnl:
716 sock_release(ipqnl->sk_socket);
717 down(&ipqnl_sem);
718 up(&ipqnl_sem);
719
720cleanup_netlink_notifier:
721 netlink_unregister_notifier(&ipq_nl_notifier);
722 return status;
723}
724
725static int __init init(void)
726{
727
728 return init_or_cleanup(1);
729}
730
731static void __exit fini(void)
732{
733 init_or_cleanup(0);
734}
735
736MODULE_DESCRIPTION("IPv4 packet queue handler");
737MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
738MODULE_LICENSE("GPL");
739
740module_init(init);
741module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
new file mode 100644
index 000000000000..8a54f92b8496
--- /dev/null
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -0,0 +1,1964 @@
1/*
2 * Packet matching code.
3 *
4 * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
5 * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * 19 Jan 2002 Harald Welte <laforge@gnumonks.org>
12 * - increase module usage count as soon as we have rules inside
13 * a table
14 */
15#include <linux/config.h>
16#include <linux/cache.h>
17#include <linux/skbuff.h>
18#include <linux/kmod.h>
19#include <linux/vmalloc.h>
20#include <linux/netdevice.h>
21#include <linux/module.h>
22#include <linux/tcp.h>
23#include <linux/udp.h>
24#include <linux/icmp.h>
25#include <net/ip.h>
26#include <asm/uaccess.h>
27#include <asm/semaphore.h>
28#include <linux/proc_fs.h>
29#include <linux/err.h>
30
31#include <linux/netfilter_ipv4/ip_tables.h>
32
33MODULE_LICENSE("GPL");
34MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
35MODULE_DESCRIPTION("IPv4 packet filter");
36
37/*#define DEBUG_IP_FIREWALL*/
38/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
39/*#define DEBUG_IP_FIREWALL_USER*/
40
41#ifdef DEBUG_IP_FIREWALL
42#define dprintf(format, args...) printk(format , ## args)
43#else
44#define dprintf(format, args...)
45#endif
46
47#ifdef DEBUG_IP_FIREWALL_USER
48#define duprintf(format, args...) printk(format , ## args)
49#else
50#define duprintf(format, args...)
51#endif
52
53#ifdef CONFIG_NETFILTER_DEBUG
54#define IP_NF_ASSERT(x) \
55do { \
56 if (!(x)) \
57 printk("IP_NF_ASSERT: %s:%s:%u\n", \
58 __FUNCTION__, __FILE__, __LINE__); \
59} while(0)
60#else
61#define IP_NF_ASSERT(x)
62#endif
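/* Round a size up to the SMP cache-line size (assumes SMP_CACHE_BYTES is a
   power of two). */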
63#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
64
65static DECLARE_MUTEX(ipt_mutex);
66
67/* Must have mutex */
68#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
69#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
70#include <linux/netfilter_ipv4/lockhelp.h>
71#include <linux/netfilter_ipv4/listhelp.h>
72
73#if 0
74/* All the better to debug you with... */
75#define static
76#define inline
77#endif
78
79/*
80 We keep a set of rules for each CPU, so we can avoid write-locking
81 them in the softirq when updating the counters and therefore
82 only need to read-lock in the softirq; doing a write_lock_bh() in user
83 context stops packets coming through and allows user context to read
84 the counters or update the rules.
85
86 To be cache friendly on SMP, we arrange them like so:
87 [ n-entries ]
88 ... cache-align padding ...
89 [ n-entries ]
90
91 Hence the start of any table is given by get_table() below. */
92
93/* The table itself */
94struct ipt_table_info
95{
96 /* Size per table */
97 unsigned int size;
98 /* Number of entries: FIXME. --RR */
99 unsigned int number;
100 /* Initial number of entries. Needed for module usage count */
101 unsigned int initial_entries;
102
103 /* Entry points and underflows */
104 unsigned int hook_entry[NF_IP_NUMHOOKS];
105 unsigned int underflow[NF_IP_NUMHOOKS];
106
107 /* ipt_entry tables: one per CPU */
108 char entries[0] ____cacheline_aligned;
109};
110
111static LIST_HEAD(ipt_target);
112static LIST_HEAD(ipt_match);
113static LIST_HEAD(ipt_tables);
114#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
115
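/* Byte offset of CPU p's private copy of the rules within entries[]; on
   uniprocessor builds there is only one copy, so the offset is zero. */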
116#ifdef CONFIG_SMP
117#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
118#else
119#define TABLE_OFFSET(t,p) 0
120#endif
121
122#if 0
123#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
124#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
125#define up(x) do { printk("UP:%u:" #x "\n", __LINE__); up(x); } while(0)
126#endif
127
128/* Returns whether the packet matches the rule or not. */
129static inline int
130ip_packet_match(const struct iphdr *ip,
131 const char *indev,
132 const char *outdev,
133 const struct ipt_ip *ipinfo,
134 int isfrag)
135{
136 size_t i;
137 unsigned long ret;
138
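/* XOR the test result with the corresponding IPT_INV_* flag bit so that
   "inverted" rules flip the outcome of the comparison. */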
139#define FWINV(bool,invflg) ((bool) ^ !!(ipinfo->invflags & invflg))
140
141 if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
142 IPT_INV_SRCIP)
143 || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
144 IPT_INV_DSTIP)) {
145 dprintf("Source or dest mismatch.\n");
146
147 dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n",
148 NIPQUAD(ip->saddr),
149 NIPQUAD(ipinfo->smsk.s_addr),
150 NIPQUAD(ipinfo->src.s_addr),
151 ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : "");
152 dprintf("DST: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n",
153 NIPQUAD(ip->daddr),
154 NIPQUAD(ipinfo->dmsk.s_addr),
155 NIPQUAD(ipinfo->dst.s_addr),
156 ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : "");
157 return 0;
158 }
159
160 /* Look for ifname matches; this should unroll nicely. */
161 for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
162 ret |= (((const unsigned long *)indev)[i]
163 ^ ((const unsigned long *)ipinfo->iniface)[i])
164 & ((const unsigned long *)ipinfo->iniface_mask)[i];
165 }
166
167 if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
168 dprintf("VIA in mismatch (%s vs %s).%s\n",
169 indev, ipinfo->iniface,
170 ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":"");
171 return 0;
172 }
173
174 for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
175 ret |= (((const unsigned long *)outdev)[i]
176 ^ ((const unsigned long *)ipinfo->outiface)[i])
177 & ((const unsigned long *)ipinfo->outiface_mask)[i];
178 }
179
180 if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
181 dprintf("VIA out mismatch (%s vs %s).%s\n",
182 outdev, ipinfo->outiface,
183 ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":"");
184 return 0;
185 }
186
187 /* Check specific protocol */
188 if (ipinfo->proto
189 && FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
190 dprintf("Packet protocol %hi does not match %hi.%s\n",
191 ip->protocol, ipinfo->proto,
192 ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
193 return 0;
194 }
195
196 /* If we have a fragment rule but the packet is not a fragment
197 * then we return zero */
198 if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) {
199 dprintf("Fragment rule but not fragment.%s\n",
200 ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : "");
201 return 0;
202 }
203
204 return 1;
205}
206
207static inline int
208ip_checkentry(const struct ipt_ip *ip)
209{
210 if (ip->flags & ~IPT_F_MASK) {
211 duprintf("Unknown flag bits set: %08X\n",
212 ip->flags & ~IPT_F_MASK);
213 return 0;
214 }
215 if (ip->invflags & ~IPT_INV_MASK) {
216 duprintf("Unknown invflag bits set: %08X\n",
217 ip->invflags & ~IPT_INV_MASK);
218 return 0;
219 }
220 return 1;
221}
222
223static unsigned int
224ipt_error(struct sk_buff **pskb,
225 const struct net_device *in,
226 const struct net_device *out,
227 unsigned int hooknum,
228 const void *targinfo,
229 void *userinfo)
230{
231 if (net_ratelimit())
232 printk("ip_tables: error: `%s'\n", (char *)targinfo);
233
234 return NF_DROP;
235}
236
237static inline
238int do_match(struct ipt_entry_match *m,
239 const struct sk_buff *skb,
240 const struct net_device *in,
241 const struct net_device *out,
242 int offset,
243 int *hotdrop)
244{
245 /* Stop iteration if it doesn't match */
246 if (!m->u.kernel.match->match(skb, in, out, m->data, offset, hotdrop))
247 return 1;
248 else
249 return 0;
250}
251
252static inline struct ipt_entry *
253get_entry(void *base, unsigned int offset)
254{
255 return (struct ipt_entry *)(base + offset);
256}
257
258/* Returns one of the generic firewall policies, like NF_ACCEPT. */
259unsigned int
260ipt_do_table(struct sk_buff **pskb,
261 unsigned int hook,
262 const struct net_device *in,
263 const struct net_device *out,
264 struct ipt_table *table,
265 void *userdata)
266{
267 static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
268 u_int16_t offset;
269 struct iphdr *ip;
270 u_int16_t datalen;
271 int hotdrop = 0;
272 /* Initializing verdict to NF_DROP keeps gcc happy. */
273 unsigned int verdict = NF_DROP;
274 const char *indev, *outdev;
275 void *table_base;
276 struct ipt_entry *e, *back;
277
278 /* Initialization */
279 ip = (*pskb)->nh.iph;
280 datalen = (*pskb)->len - ip->ihl * 4;
281 indev = in ? in->name : nulldevname;
282 outdev = out ? out->name : nulldevname;
283 /* We handle fragments by dealing with the first fragment as
284 * if it was a normal packet. All other fragments are treated
285	 * normally, except that they will NEVER match rules that ask for
286	 * things we don't know (i.e., tcp syn flag or ports). If the
287 * rule is also a fragment-specific rule, non-fragments won't
288 * match it. */
289 offset = ntohs(ip->frag_off) & IP_OFFSET;
290
291 read_lock_bh(&table->lock);
292 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
293 table_base = (void *)table->private->entries
294 + TABLE_OFFSET(table->private, smp_processor_id());
295 e = get_entry(table_base, table->private->hook_entry[hook]);
296
297#ifdef CONFIG_NETFILTER_DEBUG
298	/* Check that no one else is using our table */
299 if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac
300 && ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) {
301 printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n",
302 smp_processor_id(),
303 table->name,
304 &((struct ipt_entry *)table_base)->comefrom,
305 ((struct ipt_entry *)table_base)->comefrom);
306 }
307 ((struct ipt_entry *)table_base)->comefrom = 0x57acc001;
308#endif
309
310 /* For return from builtin chain */
311 back = get_entry(table_base, table->private->underflow[hook]);
312
313 do {
314 IP_NF_ASSERT(e);
315 IP_NF_ASSERT(back);
316 (*pskb)->nfcache |= e->nfcache;
317 if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
318 struct ipt_entry_target *t;
319
320 if (IPT_MATCH_ITERATE(e, do_match,
321 *pskb, in, out,
322 offset, &hotdrop) != 0)
323 goto no_match;
324
325 ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
326
327 t = ipt_get_target(e);
328 IP_NF_ASSERT(t->u.kernel.target);
329 /* Standard target? */
330 if (!t->u.kernel.target->target) {
331 int v;
332
333 v = ((struct ipt_standard_target *)t)->verdict;
334 if (v < 0) {
335 /* Pop from stack? */
336 if (v != IPT_RETURN) {
337 verdict = (unsigned)(-v) - 1;
338 break;
339 }
340 e = back;
341 back = get_entry(table_base,
342 back->comefrom);
343 continue;
344 }
345 if (table_base + v
346 != (void *)e + e->next_offset) {
347 /* Save old back ptr in next entry */
348 struct ipt_entry *next
349 = (void *)e + e->next_offset;
350 next->comefrom
351 = (void *)back - table_base;
352 /* set back pointer to next entry */
353 back = next;
354 }
355
356 e = get_entry(table_base, v);
357 } else {
358 /* Targets which reenter must return
359 abs. verdicts */
360#ifdef CONFIG_NETFILTER_DEBUG
361 ((struct ipt_entry *)table_base)->comefrom
362 = 0xeeeeeeec;
363#endif
364 verdict = t->u.kernel.target->target(pskb,
365 in, out,
366 hook,
367 t->data,
368 userdata);
369
370#ifdef CONFIG_NETFILTER_DEBUG
371 if (((struct ipt_entry *)table_base)->comefrom
372 != 0xeeeeeeec
373 && verdict == IPT_CONTINUE) {
374 printk("Target %s reentered!\n",
375 t->u.kernel.target->name);
376 verdict = NF_DROP;
377 }
378 ((struct ipt_entry *)table_base)->comefrom
379 = 0x57acc001;
380#endif
381 /* Target might have changed stuff. */
382 ip = (*pskb)->nh.iph;
383 datalen = (*pskb)->len - ip->ihl * 4;
384
385 if (verdict == IPT_CONTINUE)
386 e = (void *)e + e->next_offset;
387 else
388 /* Verdict */
389 break;
390 }
391 } else {
392
393 no_match:
394 e = (void *)e + e->next_offset;
395 }
396 } while (!hotdrop);
397
398#ifdef CONFIG_NETFILTER_DEBUG
399 ((struct ipt_entry *)table_base)->comefrom = 0xdead57ac;
400#endif
401 read_unlock_bh(&table->lock);
402
403#ifdef DEBUG_ALLOW_ALL
404 return NF_ACCEPT;
405#else
406 if (hotdrop)
407 return NF_DROP;
408 else return verdict;
409#endif
410}
411
412/*
413 * These are weird, but module loading must not be done with the mutex
414 * held (since the modules will register themselves), and we need a single
415 * function to use try_then_request_module().
416 */
417
418/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */
419static inline struct ipt_table *find_table_lock(const char *name)
420{
421 struct ipt_table *t;
422
423 if (down_interruptible(&ipt_mutex) != 0)
424 return ERR_PTR(-EINTR);
425
426 list_for_each_entry(t, &ipt_tables, list)
427 if (strcmp(t->name, name) == 0 && try_module_get(t->me))
428 return t;
429 up(&ipt_mutex);
430 return NULL;
431}
432
433/* Find match, grabs ref. Returns ERR_PTR() on error. */
434static inline struct ipt_match *find_match(const char *name, u8 revision)
435{
436 struct ipt_match *m;
437 int err = 0;
438
439 if (down_interruptible(&ipt_mutex) != 0)
440 return ERR_PTR(-EINTR);
441
442 list_for_each_entry(m, &ipt_match, list) {
443 if (strcmp(m->name, name) == 0) {
444 if (m->revision == revision) {
445 if (try_module_get(m->me)) {
446 up(&ipt_mutex);
447 return m;
448 }
449 } else
450 err = -EPROTOTYPE; /* Found something. */
451 }
452 }
453 up(&ipt_mutex);
454 return ERR_PTR(err);
455}
456
457/* Find target, grabs ref. Returns ERR_PTR() on error. */
458static inline struct ipt_target *find_target(const char *name, u8 revision)
459{
460 struct ipt_target *t;
461 int err = 0;
462
463 if (down_interruptible(&ipt_mutex) != 0)
464 return ERR_PTR(-EINTR);
465
466 list_for_each_entry(t, &ipt_target, list) {
467 if (strcmp(t->name, name) == 0) {
468 if (t->revision == revision) {
469 if (try_module_get(t->me)) {
470 up(&ipt_mutex);
471 return t;
472 }
473 } else
474 err = -EPROTOTYPE; /* Found something. */
475 }
476 }
477 up(&ipt_mutex);
478 return ERR_PTR(err);
479}
480
481struct ipt_target *ipt_find_target(const char *name, u8 revision)
482{
483 struct ipt_target *target;
484
485 target = try_then_request_module(find_target(name, revision),
486 "ipt_%s", name);
487 if (IS_ERR(target) || !target)
488 return NULL;
489 return target;
490}
491
492static int match_revfn(const char *name, u8 revision, int *bestp)
493{
494 struct ipt_match *m;
495 int have_rev = 0;
496
497 list_for_each_entry(m, &ipt_match, list) {
498 if (strcmp(m->name, name) == 0) {
499 if (m->revision > *bestp)
500 *bestp = m->revision;
501 if (m->revision == revision)
502 have_rev = 1;
503 }
504 }
505 return have_rev;
506}
507
508static int target_revfn(const char *name, u8 revision, int *bestp)
509{
510 struct ipt_target *t;
511 int have_rev = 0;
512
513 list_for_each_entry(t, &ipt_target, list) {
514 if (strcmp(t->name, name) == 0) {
515 if (t->revision > *bestp)
516 *bestp = t->revision;
517 if (t->revision == revision)
518 have_rev = 1;
519 }
520 }
521 return have_rev;
522}
523
524/* Returns true, or false if there is no such extension at all */
525static inline int find_revision(const char *name, u8 revision,
526 int (*revfn)(const char *, u8, int *),
527 int *err)
528{
529 int have_rev, best = -1;
530
531 if (down_interruptible(&ipt_mutex) != 0) {
532 *err = -EINTR;
533 return 1;
534 }
535 have_rev = revfn(name, revision, &best);
536 up(&ipt_mutex);
537
538 /* Nothing at all? Return 0 to try loading module. */
539 if (best == -1) {
540 *err = -ENOENT;
541 return 0;
542 }
543
544 *err = best;
545 if (!have_rev)
546 *err = -EPROTONOSUPPORT;
547 return 1;
548}
549
550
551/* All zeroes == unconditional rule. */
552static inline int
553unconditional(const struct ipt_ip *ip)
554{
555 unsigned int i;
556
557 for (i = 0; i < sizeof(*ip)/sizeof(__u32); i++)
558 if (((__u32 *)ip)[i])
559 return 0;
560
561 return 1;
562}
563
564/* Figures out from what hook each rule can be called: returns 0 if
565 there are loops. Puts hook bitmask in comefrom. */
566static int
567mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
568{
569 unsigned int hook;
570
571 /* No recursion; use packet counter to save back ptrs (reset
572 to 0 as we leave), and comefrom to save source hook bitmask */
573 for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
574 unsigned int pos = newinfo->hook_entry[hook];
575 struct ipt_entry *e
576 = (struct ipt_entry *)(newinfo->entries + pos);
577
578 if (!(valid_hooks & (1 << hook)))
579 continue;
580
581 /* Set initial back pointer. */
582 e->counters.pcnt = pos;
583
584 for (;;) {
585 struct ipt_standard_target *t
586 = (void *)ipt_get_target(e);
587
588 if (e->comefrom & (1 << NF_IP_NUMHOOKS)) {
589 printk("iptables: loop hook %u pos %u %08X.\n",
590 hook, pos, e->comefrom);
591 return 0;
592 }
593 e->comefrom
594 |= ((1 << hook) | (1 << NF_IP_NUMHOOKS));
595
596 /* Unconditional return/END. */
597 if (e->target_offset == sizeof(struct ipt_entry)
598 && (strcmp(t->target.u.user.name,
599 IPT_STANDARD_TARGET) == 0)
600 && t->verdict < 0
601 && unconditional(&e->ip)) {
602 unsigned int oldpos, size;
603
604 /* Return: backtrack through the last
605 big jump. */
606 do {
607 e->comefrom ^= (1<<NF_IP_NUMHOOKS);
608#ifdef DEBUG_IP_FIREWALL_USER
609 if (e->comefrom
610 & (1 << NF_IP_NUMHOOKS)) {
611 duprintf("Back unset "
612 "on hook %u "
613 "rule %u\n",
614 hook, pos);
615 }
616#endif
617 oldpos = pos;
618 pos = e->counters.pcnt;
619 e->counters.pcnt = 0;
620
621 /* We're at the start. */
622 if (pos == oldpos)
623 goto next;
624
625 e = (struct ipt_entry *)
626 (newinfo->entries + pos);
627 } while (oldpos == pos + e->next_offset);
628
629 /* Move along one */
630 size = e->next_offset;
631 e = (struct ipt_entry *)
632 (newinfo->entries + pos + size);
633 e->counters.pcnt = pos;
634 pos += size;
635 } else {
636 int newpos = t->verdict;
637
638 if (strcmp(t->target.u.user.name,
639 IPT_STANDARD_TARGET) == 0
640 && newpos >= 0) {
641					/* This is a jump; chase it. */
642 duprintf("Jump rule %u -> %u\n",
643 pos, newpos);
644 } else {
645 /* ... this is a fallthru */
646 newpos = pos + e->next_offset;
647 }
648 e = (struct ipt_entry *)
649 (newinfo->entries + newpos);
650 e->counters.pcnt = pos;
651 pos = newpos;
652 }
653 }
654 next:
655 duprintf("Finished chain %u\n", hook);
656 }
657 return 1;
658}
659
660static inline int
661cleanup_match(struct ipt_entry_match *m, unsigned int *i)
662{
663 if (i && (*i)-- == 0)
664 return 1;
665
666 if (m->u.kernel.match->destroy)
667 m->u.kernel.match->destroy(m->data,
668 m->u.match_size - sizeof(*m));
669 module_put(m->u.kernel.match->me);
670 return 0;
671}
672
673static inline int
674standard_check(const struct ipt_entry_target *t,
675 unsigned int max_offset)
676{
677 struct ipt_standard_target *targ = (void *)t;
678
679 /* Check standard info. */
680 if (t->u.target_size
681 != IPT_ALIGN(sizeof(struct ipt_standard_target))) {
682 duprintf("standard_check: target size %u != %u\n",
683 t->u.target_size,
684 IPT_ALIGN(sizeof(struct ipt_standard_target)));
685 return 0;
686 }
687
688 if (targ->verdict >= 0
689 && targ->verdict > max_offset - sizeof(struct ipt_entry)) {
690 duprintf("ipt_standard_check: bad verdict (%i)\n",
691 targ->verdict);
692 return 0;
693 }
694
695 if (targ->verdict < -NF_MAX_VERDICT - 1) {
696 duprintf("ipt_standard_check: bad negative verdict (%i)\n",
697 targ->verdict);
698 return 0;
699 }
700 return 1;
701}
702
703static inline int
704check_match(struct ipt_entry_match *m,
705 const char *name,
706 const struct ipt_ip *ip,
707 unsigned int hookmask,
708 unsigned int *i)
709{
710 struct ipt_match *match;
711
712 match = try_then_request_module(find_match(m->u.user.name,
713 m->u.user.revision),
714 "ipt_%s", m->u.user.name);
715 if (IS_ERR(match) || !match) {
716 duprintf("check_match: `%s' not found\n", m->u.user.name);
717 return match ? PTR_ERR(match) : -ENOENT;
718 }
719 m->u.kernel.match = match;
720
721 if (m->u.kernel.match->checkentry
722 && !m->u.kernel.match->checkentry(name, ip, m->data,
723 m->u.match_size - sizeof(*m),
724 hookmask)) {
725 module_put(m->u.kernel.match->me);
726 duprintf("ip_tables: check failed for `%s'.\n",
727 m->u.kernel.match->name);
728 return -EINVAL;
729 }
730
731 (*i)++;
732 return 0;
733}
734
735static struct ipt_target ipt_standard_target;
736
737static inline int
738check_entry(struct ipt_entry *e, const char *name, unsigned int size,
739 unsigned int *i)
740{
741 struct ipt_entry_target *t;
742 struct ipt_target *target;
743 int ret;
744 unsigned int j;
745
746 if (!ip_checkentry(&e->ip)) {
747 duprintf("ip_tables: ip check failed %p %s.\n", e, name);
748 return -EINVAL;
749 }
750
751 j = 0;
752 ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom, &j);
753 if (ret != 0)
754 goto cleanup_matches;
755
756 t = ipt_get_target(e);
757 target = try_then_request_module(find_target(t->u.user.name,
758 t->u.user.revision),
759 "ipt_%s", t->u.user.name);
760 if (IS_ERR(target) || !target) {
761 duprintf("check_entry: `%s' not found\n", t->u.user.name);
762 ret = target ? PTR_ERR(target) : -ENOENT;
763 goto cleanup_matches;
764 }
765 t->u.kernel.target = target;
766
767 if (t->u.kernel.target == &ipt_standard_target) {
768 if (!standard_check(t, size)) {
769 ret = -EINVAL;
770 goto cleanup_matches;
771 }
772 } else if (t->u.kernel.target->checkentry
773 && !t->u.kernel.target->checkentry(name, e, t->data,
774 t->u.target_size
775 - sizeof(*t),
776 e->comefrom)) {
777 module_put(t->u.kernel.target->me);
778 duprintf("ip_tables: check failed for `%s'.\n",
779 t->u.kernel.target->name);
780 ret = -EINVAL;
781 goto cleanup_matches;
782 }
783
784 (*i)++;
785 return 0;
786
787 cleanup_matches:
788 IPT_MATCH_ITERATE(e, cleanup_match, &j);
789 return ret;
790}
791
792static inline int
793check_entry_size_and_hooks(struct ipt_entry *e,
794 struct ipt_table_info *newinfo,
795 unsigned char *base,
796 unsigned char *limit,
797 const unsigned int *hook_entries,
798 const unsigned int *underflows,
799 unsigned int *i)
800{
801 unsigned int h;
802
803 if ((unsigned long)e % __alignof__(struct ipt_entry) != 0
804 || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) {
805 duprintf("Bad offset %p\n", e);
806 return -EINVAL;
807 }
808
809 if (e->next_offset
810 < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) {
811 duprintf("checking: element %p size %u\n",
812 e, e->next_offset);
813 return -EINVAL;
814 }
815
816 /* Check hooks & underflows */
817 for (h = 0; h < NF_IP_NUMHOOKS; h++) {
818 if ((unsigned char *)e - base == hook_entries[h])
819 newinfo->hook_entry[h] = hook_entries[h];
820 if ((unsigned char *)e - base == underflows[h])
821 newinfo->underflow[h] = underflows[h];
822 }
823
824 /* FIXME: underflows must be unconditional, standard verdicts
825 < 0 (not IPT_RETURN). --RR */
826
827 /* Clear counters and comefrom */
828 e->counters = ((struct ipt_counters) { 0, 0 });
829 e->comefrom = 0;
830
831 (*i)++;
832 return 0;
833}
834
835static inline int
836cleanup_entry(struct ipt_entry *e, unsigned int *i)
837{
838 struct ipt_entry_target *t;
839
840 if (i && (*i)-- == 0)
841 return 1;
842
843 /* Cleanup all matches */
844 IPT_MATCH_ITERATE(e, cleanup_match, NULL);
845 t = ipt_get_target(e);
846 if (t->u.kernel.target->destroy)
847 t->u.kernel.target->destroy(t->data,
848 t->u.target_size - sizeof(*t));
849 module_put(t->u.kernel.target->me);
850 return 0;
851}
852
853/* Checks and translates the user-supplied table segment (held in
854 newinfo) */
855static int
856translate_table(const char *name,
857 unsigned int valid_hooks,
858 struct ipt_table_info *newinfo,
859 unsigned int size,
860 unsigned int number,
861 const unsigned int *hook_entries,
862 const unsigned int *underflows)
863{
864 unsigned int i;
865 int ret;
866
867 newinfo->size = size;
868 newinfo->number = number;
869
870 /* Init all hooks to impossible value. */
871 for (i = 0; i < NF_IP_NUMHOOKS; i++) {
872 newinfo->hook_entry[i] = 0xFFFFFFFF;
873 newinfo->underflow[i] = 0xFFFFFFFF;
874 }
875
876 duprintf("translate_table: size %u\n", newinfo->size);
877 i = 0;
878 /* Walk through entries, checking offsets. */
879 ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
880 check_entry_size_and_hooks,
881 newinfo,
882 newinfo->entries,
883 newinfo->entries + size,
884 hook_entries, underflows, &i);
885 if (ret != 0)
886 return ret;
887
888 if (i != number) {
889 duprintf("translate_table: %u not %u entries\n",
890 i, number);
891 return -EINVAL;
892 }
893
894 /* Check hooks all assigned */
895 for (i = 0; i < NF_IP_NUMHOOKS; i++) {
896 /* Only hooks which are valid */
897 if (!(valid_hooks & (1 << i)))
898 continue;
899 if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
900 duprintf("Invalid hook entry %u %u\n",
901 i, hook_entries[i]);
902 return -EINVAL;
903 }
904 if (newinfo->underflow[i] == 0xFFFFFFFF) {
905 duprintf("Invalid underflow %u %u\n",
906 i, underflows[i]);
907 return -EINVAL;
908 }
909 }
910
911 if (!mark_source_chains(newinfo, valid_hooks))
912 return -ELOOP;
913
914 /* Finally, each sanity check must pass */
915 i = 0;
916 ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
917 check_entry, name, size, &i);
918
919 if (ret != 0) {
920 IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
921 cleanup_entry, &i);
922 return ret;
923 }
924
925 /* And one copy for every other CPU */
926 for (i = 1; i < num_possible_cpus(); i++) {
927 memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
928 newinfo->entries,
929 SMP_ALIGN(newinfo->size));
930 }
931
932 return ret;
933}
934
935static struct ipt_table_info *
936replace_table(struct ipt_table *table,
937 unsigned int num_counters,
938 struct ipt_table_info *newinfo,
939 int *error)
940{
941 struct ipt_table_info *oldinfo;
942
943#ifdef CONFIG_NETFILTER_DEBUG
944 {
945 struct ipt_entry *table_base;
946 unsigned int i;
947
948 for (i = 0; i < num_possible_cpus(); i++) {
949 table_base =
950 (void *)newinfo->entries
951 + TABLE_OFFSET(newinfo, i);
952
953 table_base->comefrom = 0xdead57ac;
954 }
955 }
956#endif
957
958 /* Do the substitution. */
959 write_lock_bh(&table->lock);
960 /* Check inside lock: is the old number correct? */
961 if (num_counters != table->private->number) {
962 duprintf("num_counters != table->private->number (%u/%u)\n",
963 num_counters, table->private->number);
964 write_unlock_bh(&table->lock);
965 *error = -EAGAIN;
966 return NULL;
967 }
968 oldinfo = table->private;
969 table->private = newinfo;
970 newinfo->initial_entries = oldinfo->initial_entries;
971 write_unlock_bh(&table->lock);
972
973 return oldinfo;
974}
975
976/* Gets counters. */
977static inline int
978add_entry_to_counter(const struct ipt_entry *e,
979 struct ipt_counters total[],
980 unsigned int *i)
981{
982 ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
983
984 (*i)++;
985 return 0;
986}
987
988static void
989get_counters(const struct ipt_table_info *t,
990 struct ipt_counters counters[])
991{
992 unsigned int cpu;
993 unsigned int i;
994
995 for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
996 i = 0;
997 IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
998 t->size,
999 add_entry_to_counter,
1000 counters,
1001 &i);
1002 }
1003}
1004
1005static int
1006copy_entries_to_user(unsigned int total_size,
1007 struct ipt_table *table,
1008 void __user *userptr)
1009{
1010 unsigned int off, num, countersize;
1011 struct ipt_entry *e;
1012 struct ipt_counters *counters;
1013 int ret = 0;
1014
1015 /* We need atomic snapshot of counters: rest doesn't change
1016 (other than comefrom, which userspace doesn't care
1017 about). */
1018 countersize = sizeof(struct ipt_counters) * table->private->number;
1019 counters = vmalloc(countersize);
1020
1021 if (counters == NULL)
1022 return -ENOMEM;
1023
1024 /* First, sum counters... */
1025 memset(counters, 0, countersize);
1026 write_lock_bh(&table->lock);
1027 get_counters(table->private, counters);
1028 write_unlock_bh(&table->lock);
1029
1030 /* ... then copy entire thing from CPU 0... */
1031 if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
1032 ret = -EFAULT;
1033 goto free_counters;
1034 }
1035
1036 /* FIXME: use iterator macros --RR */
1037 /* ... then go back and fix counters and names */
1038 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
1039 unsigned int i;
1040 struct ipt_entry_match *m;
1041 struct ipt_entry_target *t;
1042
1043 e = (struct ipt_entry *)(table->private->entries + off);
1044 if (copy_to_user(userptr + off
1045 + offsetof(struct ipt_entry, counters),
1046 &counters[num],
1047 sizeof(counters[num])) != 0) {
1048 ret = -EFAULT;
1049 goto free_counters;
1050 }
1051
1052 for (i = sizeof(struct ipt_entry);
1053 i < e->target_offset;
1054 i += m->u.match_size) {
1055 m = (void *)e + i;
1056
1057 if (copy_to_user(userptr + off + i
1058 + offsetof(struct ipt_entry_match,
1059 u.user.name),
1060 m->u.kernel.match->name,
1061 strlen(m->u.kernel.match->name)+1)
1062 != 0) {
1063 ret = -EFAULT;
1064 goto free_counters;
1065 }
1066 }
1067
1068 t = ipt_get_target(e);
1069 if (copy_to_user(userptr + off + e->target_offset
1070 + offsetof(struct ipt_entry_target,
1071 u.user.name),
1072 t->u.kernel.target->name,
1073 strlen(t->u.kernel.target->name)+1) != 0) {
1074 ret = -EFAULT;
1075 goto free_counters;
1076 }
1077 }
1078
1079 free_counters:
1080 vfree(counters);
1081 return ret;
1082}
1083
1084static int
1085get_entries(const struct ipt_get_entries *entries,
1086 struct ipt_get_entries __user *uptr)
1087{
1088 int ret;
1089 struct ipt_table *t;
1090
1091 t = find_table_lock(entries->name);
1092 if (t && !IS_ERR(t)) {
1093 duprintf("t->private->number = %u\n",
1094 t->private->number);
1095 if (entries->size == t->private->size)
1096 ret = copy_entries_to_user(t->private->size,
1097 t, uptr->entrytable);
1098 else {
1099 duprintf("get_entries: I've got %u not %u!\n",
1100 t->private->size,
1101 entries->size);
1102 ret = -EINVAL;
1103 }
1104 module_put(t->me);
1105 up(&ipt_mutex);
1106 } else
1107 ret = t ? PTR_ERR(t) : -ENOENT;
1108
1109 return ret;
1110}
1111
1112static int
1113do_replace(void __user *user, unsigned int len)
1114{
1115 int ret;
1116 struct ipt_replace tmp;
1117 struct ipt_table *t;
1118 struct ipt_table_info *newinfo, *oldinfo;
1119 struct ipt_counters *counters;
1120
1121 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1122 return -EFAULT;
1123
1124 /* Hack: Causes ipchains to give correct error msg --RR */
1125 if (len != sizeof(tmp) + tmp.size)
1126 return -ENOPROTOOPT;
1127
1128 /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
1129 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
1130 return -ENOMEM;
1131
1132 newinfo = vmalloc(sizeof(struct ipt_table_info)
1133 + SMP_ALIGN(tmp.size) * num_possible_cpus());
1134 if (!newinfo)
1135 return -ENOMEM;
1136
1137 if (copy_from_user(newinfo->entries, user + sizeof(tmp),
1138 tmp.size) != 0) {
1139 ret = -EFAULT;
1140 goto free_newinfo;
1141 }
1142
1143 counters = vmalloc(tmp.num_counters * sizeof(struct ipt_counters));
1144 if (!counters) {
1145 ret = -ENOMEM;
1146 goto free_newinfo;
1147 }
1148 memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters));
1149
1150 ret = translate_table(tmp.name, tmp.valid_hooks,
1151 newinfo, tmp.size, tmp.num_entries,
1152 tmp.hook_entry, tmp.underflow);
1153 if (ret != 0)
1154 goto free_newinfo_counters;
1155
1156 duprintf("ip_tables: Translated table\n");
1157
1158 t = try_then_request_module(find_table_lock(tmp.name),
1159 "iptable_%s", tmp.name);
1160 if (!t || IS_ERR(t)) {
1161 ret = t ? PTR_ERR(t) : -ENOENT;
1162 goto free_newinfo_counters_untrans;
1163 }
1164
1165 /* You lied! */
1166 if (tmp.valid_hooks != t->valid_hooks) {
1167 duprintf("Valid hook crap: %08X vs %08X\n",
1168 tmp.valid_hooks, t->valid_hooks);
1169 ret = -EINVAL;
1170 goto put_module;
1171 }
1172
1173 oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret);
1174 if (!oldinfo)
1175 goto put_module;
1176
1177 /* Update module usage count based on number of rules */
1178 duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
1179 oldinfo->number, oldinfo->initial_entries, newinfo->number);
1180 if ((oldinfo->number > oldinfo->initial_entries) ||
1181 (newinfo->number <= oldinfo->initial_entries))
1182 module_put(t->me);
1183 if ((oldinfo->number > oldinfo->initial_entries) &&
1184 (newinfo->number <= oldinfo->initial_entries))
1185 module_put(t->me);
1186
1187 /* Get the old counters. */
1188 get_counters(oldinfo, counters);
1189 /* Decrease module usage counts and free resource */
1190 IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
1191 vfree(oldinfo);
1192 if (copy_to_user(tmp.counters, counters,
1193 sizeof(struct ipt_counters) * tmp.num_counters) != 0)
1194 ret = -EFAULT;
1195 vfree(counters);
1196 up(&ipt_mutex);
1197 return ret;
1198
1199 put_module:
1200 module_put(t->me);
1201 up(&ipt_mutex);
1202 free_newinfo_counters_untrans:
1203 IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
1204 free_newinfo_counters:
1205 vfree(counters);
1206 free_newinfo:
1207 vfree(newinfo);
1208 return ret;
1209}
1210
1211/* We're lazy, and add to the first CPU; overflow works its fey magic
1212 * and everything is OK. */
1213static inline int
1214add_counter_to_entry(struct ipt_entry *e,
1215 const struct ipt_counters addme[],
1216 unsigned int *i)
1217{
1218#if 0
1219 duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
1220 *i,
1221 (long unsigned int)e->counters.pcnt,
1222 (long unsigned int)e->counters.bcnt,
1223 (long unsigned int)addme[*i].pcnt,
1224 (long unsigned int)addme[*i].bcnt);
1225#endif
1226
1227 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1228
1229 (*i)++;
1230 return 0;
1231}
1232
1233static int
1234do_add_counters(void __user *user, unsigned int len)
1235{
1236 unsigned int i;
1237 struct ipt_counters_info tmp, *paddc;
1238 struct ipt_table *t;
1239 int ret = 0;
1240
1241 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1242 return -EFAULT;
1243
1244 if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters))
1245 return -EINVAL;
1246
1247 paddc = vmalloc(len);
1248 if (!paddc)
1249 return -ENOMEM;
1250
1251 if (copy_from_user(paddc, user, len) != 0) {
1252 ret = -EFAULT;
1253 goto free;
1254 }
1255
1256 t = find_table_lock(tmp.name);
1257 if (!t || IS_ERR(t)) {
1258 ret = t ? PTR_ERR(t) : -ENOENT;
1259 goto free;
1260 }
1261
1262 write_lock_bh(&t->lock);
1263 if (t->private->number != paddc->num_counters) {
1264 ret = -EINVAL;
1265 goto unlock_up_free;
1266 }
1267
1268 i = 0;
1269 IPT_ENTRY_ITERATE(t->private->entries,
1270 t->private->size,
1271 add_counter_to_entry,
1272 paddc->counters,
1273 &i);
1274 unlock_up_free:
1275 write_unlock_bh(&t->lock);
1276 up(&ipt_mutex);
1277 module_put(t->me);
1278 free:
1279 vfree(paddc);
1280
1281 return ret;
1282}
1283
1284static int
1285do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1286{
1287 int ret;
1288
1289 if (!capable(CAP_NET_ADMIN))
1290 return -EPERM;
1291
1292 switch (cmd) {
1293 case IPT_SO_SET_REPLACE:
1294 ret = do_replace(user, len);
1295 break;
1296
1297 case IPT_SO_SET_ADD_COUNTERS:
1298 ret = do_add_counters(user, len);
1299 break;
1300
1301 default:
1302 duprintf("do_ipt_set_ctl: unknown request %i\n", cmd);
1303 ret = -EINVAL;
1304 }
1305
1306 return ret;
1307}
1308
1309static int
1310do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
1311{
1312 int ret;
1313
1314 if (!capable(CAP_NET_ADMIN))
1315 return -EPERM;
1316
1317 switch (cmd) {
1318 case IPT_SO_GET_INFO: {
1319 char name[IPT_TABLE_MAXNAMELEN];
1320 struct ipt_table *t;
1321
1322 if (*len != sizeof(struct ipt_getinfo)) {
1323 duprintf("length %u != %u\n", *len,
1324 sizeof(struct ipt_getinfo));
1325 ret = -EINVAL;
1326 break;
1327 }
1328
1329 if (copy_from_user(name, user, sizeof(name)) != 0) {
1330 ret = -EFAULT;
1331 break;
1332 }
1333 name[IPT_TABLE_MAXNAMELEN-1] = '\0';
1334
1335 t = try_then_request_module(find_table_lock(name),
1336 "iptable_%s", name);
1337 if (t && !IS_ERR(t)) {
1338 struct ipt_getinfo info;
1339
1340 info.valid_hooks = t->valid_hooks;
1341 memcpy(info.hook_entry, t->private->hook_entry,
1342 sizeof(info.hook_entry));
1343 memcpy(info.underflow, t->private->underflow,
1344 sizeof(info.underflow));
1345 info.num_entries = t->private->number;
1346 info.size = t->private->size;
1347 memcpy(info.name, name, sizeof(info.name));
1348
1349 if (copy_to_user(user, &info, *len) != 0)
1350 ret = -EFAULT;
1351 else
1352 ret = 0;
1353 up(&ipt_mutex);
1354 module_put(t->me);
1355 } else
1356 ret = t ? PTR_ERR(t) : -ENOENT;
1357 }
1358 break;
1359
1360 case IPT_SO_GET_ENTRIES: {
1361 struct ipt_get_entries get;
1362
1363 if (*len < sizeof(get)) {
1364 duprintf("get_entries: %u < %u\n", *len, sizeof(get));
1365 ret = -EINVAL;
1366 } else if (copy_from_user(&get, user, sizeof(get)) != 0) {
1367 ret = -EFAULT;
1368 } else if (*len != sizeof(struct ipt_get_entries) + get.size) {
1369 duprintf("get_entries: %u != %u\n", *len,
1370 sizeof(struct ipt_get_entries) + get.size);
1371 ret = -EINVAL;
1372 } else
1373 ret = get_entries(&get, user);
1374 break;
1375 }
1376
1377 case IPT_SO_GET_REVISION_MATCH:
1378 case IPT_SO_GET_REVISION_TARGET: {
1379 struct ipt_get_revision rev;
1380 int (*revfn)(const char *, u8, int *);
1381
1382 if (*len != sizeof(rev)) {
1383 ret = -EINVAL;
1384 break;
1385 }
1386 if (copy_from_user(&rev, user, sizeof(rev)) != 0) {
1387 ret = -EFAULT;
1388 break;
1389 }
1390
1391 if (cmd == IPT_SO_GET_REVISION_TARGET)
1392 revfn = target_revfn;
1393 else
1394 revfn = match_revfn;
1395
1396 try_then_request_module(find_revision(rev.name, rev.revision,
1397 revfn, &ret),
1398 "ipt_%s", rev.name);
1399 break;
1400 }
1401
1402 default:
1403 duprintf("do_ipt_get_ctl: unknown request %i\n", cmd);
1404 ret = -EINVAL;
1405 }
1406
1407 return ret;
1408}
1409
1410/* Registration hooks for targets. */
1411int
1412ipt_register_target(struct ipt_target *target)
1413{
1414 int ret;
1415
1416 ret = down_interruptible(&ipt_mutex);
1417 if (ret != 0)
1418 return ret;
1419 list_add(&target->list, &ipt_target);
1420 up(&ipt_mutex);
1421 return ret;
1422}
1423
1424void
1425ipt_unregister_target(struct ipt_target *target)
1426{
1427 down(&ipt_mutex);
1428 LIST_DELETE(&ipt_target, target);
1429 up(&ipt_mutex);
1430}
1431
1432int
1433ipt_register_match(struct ipt_match *match)
1434{
1435 int ret;
1436
1437 ret = down_interruptible(&ipt_mutex);
1438 if (ret != 0)
1439 return ret;
1440
1441 list_add(&match->list, &ipt_match);
1442 up(&ipt_mutex);
1443
1444 return ret;
1445}
1446
1447void
1448ipt_unregister_match(struct ipt_match *match)
1449{
1450 down(&ipt_mutex);
1451 LIST_DELETE(&ipt_match, match);
1452 up(&ipt_mutex);
1453}
1454
1455int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
1456{
1457 int ret;
1458 struct ipt_table_info *newinfo;
1459 static struct ipt_table_info bootstrap
1460 = { 0, 0, 0, { 0 }, { 0 }, { } };
1461
1462 newinfo = vmalloc(sizeof(struct ipt_table_info)
1463 + SMP_ALIGN(repl->size) * num_possible_cpus());
1464 if (!newinfo)
1465 return -ENOMEM;
1466
1467 memcpy(newinfo->entries, repl->entries, repl->size);
1468
1469 ret = translate_table(table->name, table->valid_hooks,
1470 newinfo, repl->size,
1471 repl->num_entries,
1472 repl->hook_entry,
1473 repl->underflow);
1474 if (ret != 0) {
1475 vfree(newinfo);
1476 return ret;
1477 }
1478
1479 ret = down_interruptible(&ipt_mutex);
1480 if (ret != 0) {
1481 vfree(newinfo);
1482 return ret;
1483 }
1484
1485 /* Don't autoload: we'd eat our tail... */
1486 if (list_named_find(&ipt_tables, table->name)) {
1487 ret = -EEXIST;
1488 goto free_unlock;
1489 }
1490
1491 /* Simplifies replace_table code. */
1492 table->private = &bootstrap;
1493 if (!replace_table(table, 0, newinfo, &ret))
1494 goto free_unlock;
1495
1496 duprintf("table->private->number = %u\n",
1497 table->private->number);
1498
1499 /* save number of initial entries */
1500 table->private->initial_entries = table->private->number;
1501
1502 rwlock_init(&table->lock);
1503 list_prepend(&ipt_tables, table);
1504
1505 unlock:
1506 up(&ipt_mutex);
1507 return ret;
1508
1509 free_unlock:
1510 vfree(newinfo);
1511 goto unlock;
1512}
1513
1514void ipt_unregister_table(struct ipt_table *table)
1515{
1516 down(&ipt_mutex);
1517 LIST_DELETE(&ipt_tables, table);
1518 up(&ipt_mutex);
1519
1520 /* Decrease module usage counts and free resources */
1521 IPT_ENTRY_ITERATE(table->private->entries, table->private->size,
1522 cleanup_entry, NULL);
1523 vfree(table->private);
1524}
1525
1526/* Returns 1 if the port is matched by the range, 0 otherwise */
1527static inline int
1528port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert)
1529{
1530 int ret;
1531
1532 ret = (port >= min && port <= max) ^ invert;
1533 return ret;
1534}
1535
1536static int
1537tcp_find_option(u_int8_t option,
1538 const struct sk_buff *skb,
1539 unsigned int optlen,
1540 int invert,
1541 int *hotdrop)
1542{
1543 /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
1544 u_int8_t _opt[60 - sizeof(struct tcphdr)], *op;
1545 unsigned int i;
1546
1547 duprintf("tcp_match: finding option\n");
1548
1549 if (!optlen)
1550 return invert;
1551
1552 /* If we don't have the whole header, drop packet. */
1553 op = skb_header_pointer(skb,
1554 skb->nh.iph->ihl*4 + sizeof(struct tcphdr),
1555 optlen, _opt);
1556 if (op == NULL) {
1557 *hotdrop = 1;
1558 return 0;
1559 }
1560
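	/* Option kinds 0 (EOL) and 1 (NOP) are a single byte; every other
	 * option is followed by a length byte. Treat a zero length as 1 so a
	 * malformed packet cannot stall the loop. */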
1561 for (i = 0; i < optlen; ) {
1562 if (op[i] == option) return !invert;
1563 if (op[i] < 2) i++;
1564 else i += op[i+1]?:1;
1565 }
1566
1567 return invert;
1568}
1569
1570static int
1571tcp_match(const struct sk_buff *skb,
1572 const struct net_device *in,
1573 const struct net_device *out,
1574 const void *matchinfo,
1575 int offset,
1576 int *hotdrop)
1577{
1578 struct tcphdr _tcph, *th;
1579 const struct ipt_tcp *tcpinfo = matchinfo;
1580
1581 if (offset) {
1582 /* To quote Alan:
1583
1584 Don't allow a fragment of TCP 8 bytes in. Nobody normal
1585	   causes this. It's a cracker trying to break in by doing a
1586 flag overwrite to pass the direction checks.
1587 */
1588 if (offset == 1) {
1589 duprintf("Dropping evil TCP offset=1 frag.\n");
1590 *hotdrop = 1;
1591 }
1592 /* Must not be a fragment. */
1593 return 0;
1594 }
1595
1596#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg))
1597
1598 th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
1599 sizeof(_tcph), &_tcph);
1600 if (th == NULL) {
1601 /* We've been asked to examine this packet, and we
1602 can't. Hence, no choice but to drop. */
1603 duprintf("Dropping evil TCP offset=0 tinygram.\n");
1604 *hotdrop = 1;
1605 return 0;
1606 }
1607
1608 if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1],
1609 ntohs(th->source),
1610 !!(tcpinfo->invflags & IPT_TCP_INV_SRCPT)))
1611 return 0;
1612 if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1],
1613 ntohs(th->dest),
1614 !!(tcpinfo->invflags & IPT_TCP_INV_DSTPT)))
1615 return 0;
1616 if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask)
1617 == tcpinfo->flg_cmp,
1618 IPT_TCP_INV_FLAGS))
1619 return 0;
1620 if (tcpinfo->option) {
1621 if (th->doff * 4 < sizeof(_tcph)) {
1622 *hotdrop = 1;
1623 return 0;
1624 }
1625 if (!tcp_find_option(tcpinfo->option, skb,
1626 th->doff*4 - sizeof(_tcph),
1627 tcpinfo->invflags & IPT_TCP_INV_OPTION,
1628 hotdrop))
1629 return 0;
1630 }
1631 return 1;
1632}
1633
1634/* Called when user tries to insert an entry of this type. */
1635static int
1636tcp_checkentry(const char *tablename,
1637 const struct ipt_ip *ip,
1638 void *matchinfo,
1639 unsigned int matchsize,
1640 unsigned int hook_mask)
1641{
1642 const struct ipt_tcp *tcpinfo = matchinfo;
1643
1644 /* Must specify proto == TCP, and no unknown invflags */
1645 return ip->proto == IPPROTO_TCP
1646 && !(ip->invflags & IPT_INV_PROTO)
1647 && matchsize == IPT_ALIGN(sizeof(struct ipt_tcp))
1648 && !(tcpinfo->invflags & ~IPT_TCP_INV_MASK);
1649}
1650
1651static int
1652udp_match(const struct sk_buff *skb,
1653 const struct net_device *in,
1654 const struct net_device *out,
1655 const void *matchinfo,
1656 int offset,
1657 int *hotdrop)
1658{
1659 struct udphdr _udph, *uh;
1660 const struct ipt_udp *udpinfo = matchinfo;
1661
1662 /* Must not be a fragment. */
1663 if (offset)
1664 return 0;
1665
1666 uh = skb_header_pointer(skb, skb->nh.iph->ihl*4,
1667 sizeof(_udph), &_udph);
1668 if (uh == NULL) {
1669 /* We've been asked to examine this packet, and we
1670 can't. Hence, no choice but to drop. */
1671 duprintf("Dropping evil UDP tinygram.\n");
1672 *hotdrop = 1;
1673 return 0;
1674 }
1675
1676 return port_match(udpinfo->spts[0], udpinfo->spts[1],
1677 ntohs(uh->source),
1678 !!(udpinfo->invflags & IPT_UDP_INV_SRCPT))
1679 && port_match(udpinfo->dpts[0], udpinfo->dpts[1],
1680 ntohs(uh->dest),
1681 !!(udpinfo->invflags & IPT_UDP_INV_DSTPT));
1682}
1683
1684/* Called when user tries to insert an entry of this type. */
1685static int
1686udp_checkentry(const char *tablename,
1687 const struct ipt_ip *ip,
1688 void *matchinfo,
1689 unsigned int matchinfosize,
1690 unsigned int hook_mask)
1691{
1692 const struct ipt_udp *udpinfo = matchinfo;
1693
1694 /* Must specify proto == UDP, and no unknown invflags */
1695 if (ip->proto != IPPROTO_UDP || (ip->invflags & IPT_INV_PROTO)) {
1696 duprintf("ipt_udp: Protocol %u != %u\n", ip->proto,
1697 IPPROTO_UDP);
1698 return 0;
1699 }
1700 if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_udp))) {
1701 duprintf("ipt_udp: matchsize %u != %u\n",
1702 matchinfosize, IPT_ALIGN(sizeof(struct ipt_udp)));
1703 return 0;
1704 }
1705 if (udpinfo->invflags & ~IPT_UDP_INV_MASK) {
1706 duprintf("ipt_udp: unknown flags %X\n",
1707 udpinfo->invflags);
1708 return 0;
1709 }
1710
1711 return 1;
1712}
1713
1714/* Returns 1 if the type and code are matched by the range, 0 otherwise */
1715static inline int
1716icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
1717 u_int8_t type, u_int8_t code,
1718 int invert)
1719{
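	/* A test_type of 0xFF is a wildcard that matches any ICMP type. */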
1720 return ((test_type == 0xFF) || (type == test_type && code >= min_code && code <= max_code))
1721 ^ invert;
1722}
1723
1724static int
1725icmp_match(const struct sk_buff *skb,
1726 const struct net_device *in,
1727 const struct net_device *out,
1728 const void *matchinfo,
1729 int offset,
1730 int *hotdrop)
1731{
1732 struct icmphdr _icmph, *ic;
1733 const struct ipt_icmp *icmpinfo = matchinfo;
1734
1735 /* Must not be a fragment. */
1736 if (offset)
1737 return 0;
1738
1739 ic = skb_header_pointer(skb, skb->nh.iph->ihl*4,
1740 sizeof(_icmph), &_icmph);
1741 if (ic == NULL) {
1742 /* We've been asked to examine this packet, and we
1743 * can't. Hence, no choice but to drop.
1744 */
1745 duprintf("Dropping evil ICMP tinygram.\n");
1746 *hotdrop = 1;
1747 return 0;
1748 }
1749
1750 return icmp_type_code_match(icmpinfo->type,
1751 icmpinfo->code[0],
1752 icmpinfo->code[1],
1753 ic->type, ic->code,
1754 !!(icmpinfo->invflags&IPT_ICMP_INV));
1755}
1756
1757/* Called when user tries to insert an entry of this type. */
1758static int
1759icmp_checkentry(const char *tablename,
1760 const struct ipt_ip *ip,
1761 void *matchinfo,
1762 unsigned int matchsize,
1763 unsigned int hook_mask)
1764{
1765 const struct ipt_icmp *icmpinfo = matchinfo;
1766
1767 /* Must specify proto == ICMP, and no unknown invflags */
1768 return ip->proto == IPPROTO_ICMP
1769 && !(ip->invflags & IPT_INV_PROTO)
1770 && matchsize == IPT_ALIGN(sizeof(struct ipt_icmp))
1771 && !(icmpinfo->invflags & ~IPT_ICMP_INV);
1772}
1773
1774/* The built-in targets: standard (NULL) and error. */
1775static struct ipt_target ipt_standard_target = {
1776 .name = IPT_STANDARD_TARGET,
1777};
1778
1779static struct ipt_target ipt_error_target = {
1780 .name = IPT_ERROR_TARGET,
1781 .target = ipt_error,
1782};
1783
1784static struct nf_sockopt_ops ipt_sockopts = {
1785 .pf = PF_INET,
1786 .set_optmin = IPT_BASE_CTL,
1787 .set_optmax = IPT_SO_SET_MAX+1,
1788 .set = do_ipt_set_ctl,
1789 .get_optmin = IPT_BASE_CTL,
1790 .get_optmax = IPT_SO_GET_MAX+1,
1791 .get = do_ipt_get_ctl,
1792};
1793
1794static struct ipt_match tcp_matchstruct = {
1795 .name = "tcp",
1796 .match = &tcp_match,
1797 .checkentry = &tcp_checkentry,
1798};
1799
1800static struct ipt_match udp_matchstruct = {
1801 .name = "udp",
1802 .match = &udp_match,
1803 .checkentry = &udp_checkentry,
1804};
1805
1806static struct ipt_match icmp_matchstruct = {
1807 .name = "icmp",
1808 .match = &icmp_match,
1809 .checkentry = &icmp_checkentry,
1810};
1811
1812#ifdef CONFIG_PROC_FS
1813static inline int print_name(const char *i,
1814 off_t start_offset, char *buffer, int length,
1815 off_t *pos, unsigned int *count)
1816{
1817 if ((*count)++ >= start_offset) {
1818 unsigned int namelen;
1819
1820 namelen = sprintf(buffer + *pos, "%s\n",
1821 i + sizeof(struct list_head));
1822 if (*pos + namelen > length) {
1823 /* Stop iterating */
1824 return 1;
1825 }
1826 *pos += namelen;
1827 }
1828 return 0;
1829}
1830
1831static inline int print_target(const struct ipt_target *t,
1832 off_t start_offset, char *buffer, int length,
1833 off_t *pos, unsigned int *count)
1834{
1835 if (t == &ipt_standard_target || t == &ipt_error_target)
1836 return 0;
1837 return print_name((char *)t, start_offset, buffer, length, pos, count);
1838}
1839
1840static int ipt_get_tables(char *buffer, char **start, off_t offset, int length)
1841{
1842 off_t pos = 0;
1843 unsigned int count = 0;
1844
1845 if (down_interruptible(&ipt_mutex) != 0)
1846 return 0;
1847
1848 LIST_FIND(&ipt_tables, print_name, void *,
1849 offset, buffer, length, &pos, &count);
1850
1851 up(&ipt_mutex);
1852
1853 /* `start' hack - see fs/proc/generic.c line ~105 */
1854 *start=(char *)((unsigned long)count-offset);
1855 return pos;
1856}
1857
1858static int ipt_get_targets(char *buffer, char **start, off_t offset, int length)
1859{
1860 off_t pos = 0;
1861 unsigned int count = 0;
1862
1863 if (down_interruptible(&ipt_mutex) != 0)
1864 return 0;
1865
1866 LIST_FIND(&ipt_target, print_target, struct ipt_target *,
1867 offset, buffer, length, &pos, &count);
1868
1869 up(&ipt_mutex);
1870
1871 *start = (char *)((unsigned long)count - offset);
1872 return pos;
1873}
1874
1875static int ipt_get_matches(char *buffer, char **start, off_t offset, int length)
1876{
1877 off_t pos = 0;
1878 unsigned int count = 0;
1879
1880 if (down_interruptible(&ipt_mutex) != 0)
1881 return 0;
1882
1883 LIST_FIND(&ipt_match, print_name, void *,
1884 offset, buffer, length, &pos, &count);
1885
1886 up(&ipt_mutex);
1887
1888 *start = (char *)((unsigned long)count - offset);
1889 return pos;
1890}
1891
1892static struct { char *name; get_info_t *get_info; } ipt_proc_entry[] =
1893{ { "ip_tables_names", ipt_get_tables },
1894 { "ip_tables_targets", ipt_get_targets },
1895 { "ip_tables_matches", ipt_get_matches },
1896 { NULL, NULL} };
1897#endif /*CONFIG_PROC_FS*/
1898
1899static int __init init(void)
1900{
1901 int ret;
1902
1903	/* No one else will be downing the sem now, so we won't sleep */
1904 down(&ipt_mutex);
1905 list_append(&ipt_target, &ipt_standard_target);
1906 list_append(&ipt_target, &ipt_error_target);
1907 list_append(&ipt_match, &tcp_matchstruct);
1908 list_append(&ipt_match, &udp_matchstruct);
1909 list_append(&ipt_match, &icmp_matchstruct);
1910 up(&ipt_mutex);
1911
1912 /* Register setsockopt */
1913 ret = nf_register_sockopt(&ipt_sockopts);
1914 if (ret < 0) {
1915 duprintf("Unable to register sockopts.\n");
1916 return ret;
1917 }
1918
1919#ifdef CONFIG_PROC_FS
1920 {
1921 struct proc_dir_entry *proc;
1922 int i;
1923
1924 for (i = 0; ipt_proc_entry[i].name; i++) {
1925 proc = proc_net_create(ipt_proc_entry[i].name, 0,
1926 ipt_proc_entry[i].get_info);
1927 if (!proc) {
1928 while (--i >= 0)
1929 proc_net_remove(ipt_proc_entry[i].name);
1930 nf_unregister_sockopt(&ipt_sockopts);
1931 return -ENOMEM;
1932 }
1933 proc->owner = THIS_MODULE;
1934 }
1935 }
1936#endif
1937
1938 printk("ip_tables: (C) 2000-2002 Netfilter core team\n");
1939 return 0;
1940}
1941
1942static void __exit fini(void)
1943{
1944 nf_unregister_sockopt(&ipt_sockopts);
1945#ifdef CONFIG_PROC_FS
1946 {
1947 int i;
1948 for (i = 0; ipt_proc_entry[i].name; i++)
1949 proc_net_remove(ipt_proc_entry[i].name);
1950 }
1951#endif
1952}
1953
1954EXPORT_SYMBOL(ipt_register_table);
1955EXPORT_SYMBOL(ipt_unregister_table);
1956EXPORT_SYMBOL(ipt_register_match);
1957EXPORT_SYMBOL(ipt_unregister_match);
1958EXPORT_SYMBOL(ipt_do_table);
1959EXPORT_SYMBOL(ipt_register_target);
1960EXPORT_SYMBOL(ipt_unregister_target);
1961EXPORT_SYMBOL(ipt_find_target);
1962
1963module_init(init);
1964module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c
new file mode 100644
index 000000000000..9842e6e23184
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_CLASSIFY.c
@@ -0,0 +1,92 @@
1/*
2 * This is a module which is used for setting the skb->priority field
3 * of an skb for qdisc classification.
4 */
5
6/* (C) 2001-2002 Patrick McHardy <kaber@trash.net>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/module.h>
14#include <linux/skbuff.h>
15#include <linux/ip.h>
16#include <net/checksum.h>
17
18#include <linux/netfilter_ipv4/ip_tables.h>
19#include <linux/netfilter_ipv4/ipt_CLASSIFY.h>
20
21MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
22MODULE_LICENSE("GPL");
23MODULE_DESCRIPTION("iptables qdisc classification target module");
24
25static unsigned int
26target(struct sk_buff **pskb,
27 const struct net_device *in,
28 const struct net_device *out,
29 unsigned int hooknum,
30 const void *targinfo,
31 void *userinfo)
32{
33 const struct ipt_classify_target_info *clinfo = targinfo;
34
35 if((*pskb)->priority != clinfo->priority) {
36 (*pskb)->priority = clinfo->priority;
37 (*pskb)->nfcache |= NFC_ALTERED;
38 }
39
40 return IPT_CONTINUE;
41}
42
43static int
44checkentry(const char *tablename,
45 const struct ipt_entry *e,
46 void *targinfo,
47 unsigned int targinfosize,
48 unsigned int hook_mask)
49{
50 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_classify_target_info))){
51 printk(KERN_ERR "CLASSIFY: invalid size (%u != %Zu).\n",
52 targinfosize,
53 IPT_ALIGN(sizeof(struct ipt_classify_target_info)));
54 return 0;
55 }
56
57 if (hook_mask & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_FORWARD) |
58 (1 << NF_IP_POST_ROUTING))) {
59 printk(KERN_ERR "CLASSIFY: only valid in LOCAL_OUT, FORWARD "
60 "and POST_ROUTING.\n");
61 return 0;
62 }
63
64 if (strcmp(tablename, "mangle") != 0) {
65 printk(KERN_ERR "CLASSIFY: can only be called from "
66 "\"mangle\" table, not \"%s\".\n",
67 tablename);
68 return 0;
69 }
70
71 return 1;
72}
73
74static struct ipt_target ipt_classify_reg = {
75 .name = "CLASSIFY",
76 .target = target,
77 .checkentry = checkentry,
78 .me = THIS_MODULE,
79};
80
81static int __init init(void)
82{
83 return ipt_register_target(&ipt_classify_reg);
84}
85
86static void __exit fini(void)
87{
88 ipt_unregister_target(&ipt_classify_reg);
89}
90
91module_init(init);
92module_exit(fini);
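
The CLASSIFY target above copies a qdisc class handle from the rule into skb->priority. Assuming the conventional tc encoding of a class handle ("major:minor" packed into 32 bits, major in the upper half), a small sketch of the value such a rule would carry:

#include <stdint.h>
#include <stdio.h>

/* Pack a tc "major:minor" class handle the way tc conventionally encodes it:
 * major in the upper 16 bits, minor in the lower 16 bits. */
static uint32_t tc_classid(uint16_t major, uint16_t minor)
{
	return ((uint32_t)major << 16) | minor;
}

int main(void)
{
	/* class 1:10 -> 0x0001000a; this is the value a CLASSIFY rule
	 * would end up storing into skb->priority. */
	printf("0x%08x\n", tc_classid(1, 10));
	return 0;
}
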
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
new file mode 100644
index 000000000000..0f12e3a3dc73
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -0,0 +1,761 @@
1/* Cluster IP hashmark target
2 * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
3 * based on ideas of Fabio Olive Leite <olive@unixforge.org>
4 *
5 * Development of this code funded by SuSE Linux AG, http://www.suse.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 */
12#include <linux/module.h>
13#include <linux/config.h>
14#include <linux/proc_fs.h>
15#include <linux/jhash.h>
16#include <linux/skbuff.h>
17#include <linux/ip.h>
18#include <linux/tcp.h>
19#include <linux/udp.h>
20#include <linux/icmp.h>
21#include <linux/if_arp.h>
22#include <linux/proc_fs.h>
23#include <linux/seq_file.h>
24
25#include <net/checksum.h>
26
27#include <linux/netfilter_arp.h>
28
29#include <linux/netfilter_ipv4/ip_tables.h>
30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
31#include <linux/netfilter_ipv4/ip_conntrack.h>
32#include <linux/netfilter_ipv4/lockhelp.h>
33
34#define CLUSTERIP_VERSION "0.6"
35
36#define DEBUG_CLUSTERIP
37
38#ifdef DEBUG_CLUSTERIP
39#define DEBUGP printk
40#else
41#define DEBUGP
42#endif
43
44MODULE_LICENSE("GPL");
45MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
46MODULE_DESCRIPTION("iptables target for CLUSTERIP");
47
48struct clusterip_config {
49 struct list_head list; /* list of all configs */
50 atomic_t refcount; /* reference count */
51
52 u_int32_t clusterip; /* the IP address */
53 u_int8_t clustermac[ETH_ALEN]; /* the MAC address */
54 struct net_device *dev; /* device */
55 u_int16_t num_total_nodes; /* total number of nodes */
56 u_int16_t num_local_nodes; /* number of local nodes */
57 u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; /* node number array */
58
59#ifdef CONFIG_PROC_FS
60 struct proc_dir_entry *pde; /* proc dir entry */
61#endif
62 enum clusterip_hashmode hash_mode; /* which hashing mode */
63 u_int32_t hash_initval; /* hash initialization */
64};
65
66static LIST_HEAD(clusterip_configs);
67
68/* clusterip_lock protects the clusterip_configs list _AND_ the configurable
69 * data within all structures (num_local_nodes, local_nodes[]) */
70static DECLARE_RWLOCK(clusterip_lock);
71
72#ifdef CONFIG_PROC_FS
73static struct file_operations clusterip_proc_fops;
74static struct proc_dir_entry *clusterip_procdir;
75#endif
76
77static inline void
78clusterip_config_get(struct clusterip_config *c) {
79 atomic_inc(&c->refcount);
80}
81
82static inline void
83clusterip_config_put(struct clusterip_config *c) {
84 if (atomic_dec_and_test(&c->refcount)) {
85 WRITE_LOCK(&clusterip_lock);
86 list_del(&c->list);
87 WRITE_UNLOCK(&clusterip_lock);
88 dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0);
89 dev_put(c->dev);
90 kfree(c);
91 }
92}
93
94
95static struct clusterip_config *
96__clusterip_config_find(u_int32_t clusterip)
97{
98 struct list_head *pos;
99
100 MUST_BE_READ_LOCKED(&clusterip_lock);
101 list_for_each(pos, &clusterip_configs) {
102 struct clusterip_config *c = list_entry(pos,
103 struct clusterip_config, list);
104 if (c->clusterip == clusterip) {
105 return c;
106 }
107 }
108
109 return NULL;
110}
111
112static inline struct clusterip_config *
113clusterip_config_find_get(u_int32_t clusterip)
114{
115 struct clusterip_config *c;
116
117 READ_LOCK(&clusterip_lock);
118 c = __clusterip_config_find(clusterip);
119 if (!c) {
120 READ_UNLOCK(&clusterip_lock);
121 return NULL;
122 }
123 atomic_inc(&c->refcount);
124 READ_UNLOCK(&clusterip_lock);
125
126 return c;
127}
128
129static struct clusterip_config *
130clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
131 struct net_device *dev)
132{
133 struct clusterip_config *c;
134 char buffer[16];
135
136 c = kmalloc(sizeof(*c), GFP_ATOMIC);
137 if (!c)
138 return NULL;
139
140 memset(c, 0, sizeof(*c));
141 c->dev = dev;
142 c->clusterip = ip;
143 memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
144 c->num_total_nodes = i->num_total_nodes;
145 c->num_local_nodes = i->num_local_nodes;
146	memcpy(&c->local_nodes, &i->local_nodes, sizeof(c->local_nodes));
147 c->hash_mode = i->hash_mode;
148 c->hash_initval = i->hash_initval;
149 atomic_set(&c->refcount, 1);
150
151#ifdef CONFIG_PROC_FS
152 /* create proc dir entry */
153 sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(ip));
154 c->pde = create_proc_entry(buffer, S_IWUSR|S_IRUSR, clusterip_procdir);
155 if (!c->pde) {
156 kfree(c);
157 return NULL;
158 }
159 c->pde->proc_fops = &clusterip_proc_fops;
160 c->pde->data = c;
161#endif
162
163 WRITE_LOCK(&clusterip_lock);
164 list_add(&c->list, &clusterip_configs);
165 WRITE_UNLOCK(&clusterip_lock);
166
167 return c;
168}
169
170static int
171clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
172{
173 int i;
174
175 WRITE_LOCK(&clusterip_lock);
176
177 if (c->num_local_nodes >= CLUSTERIP_MAX_NODES
178 || nodenum > CLUSTERIP_MAX_NODES) {
179 WRITE_UNLOCK(&clusterip_lock);
180 return 1;
181 }
182
183	/* check if we already have this number in our array */
184 for (i = 0; i < c->num_local_nodes; i++) {
185 if (c->local_nodes[i] == nodenum) {
186 WRITE_UNLOCK(&clusterip_lock);
187 return 1;
188 }
189 }
190
191 c->local_nodes[c->num_local_nodes++] = nodenum;
192
193 WRITE_UNLOCK(&clusterip_lock);
194 return 0;
195}
196
197static int
198clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
199{
200 int i;
201
202 WRITE_LOCK(&clusterip_lock);
203
204 if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) {
205 WRITE_UNLOCK(&clusterip_lock);
206 return 1;
207 }
208
209 for (i = 0; i < c->num_local_nodes; i++) {
210 if (c->local_nodes[i] == nodenum) {
211 int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1));
212 memmove(&c->local_nodes[i], &c->local_nodes[i+1], size);
213 c->num_local_nodes--;
214 WRITE_UNLOCK(&clusterip_lock);
215 return 0;
216 }
217 }
218
219 WRITE_UNLOCK(&clusterip_lock);
220 return 1;
221}
222
223static inline u_int32_t
224clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config)
225{
226 struct iphdr *iph = skb->nh.iph;
227 unsigned long hashval;
228 u_int16_t sport, dport;
229 struct tcphdr *th;
230 struct udphdr *uh;
231 struct icmphdr *ih;
232
233 switch (iph->protocol) {
234 case IPPROTO_TCP:
235 th = (void *)iph+iph->ihl*4;
236 sport = ntohs(th->source);
237 dport = ntohs(th->dest);
238 break;
239 case IPPROTO_UDP:
240 uh = (void *)iph+iph->ihl*4;
241 sport = ntohs(uh->source);
242 dport = ntohs(uh->dest);
243 break;
244 case IPPROTO_ICMP:
245 ih = (void *)iph+iph->ihl*4;
246 sport = ntohs(ih->un.echo.id);
247 dport = (ih->type<<8)|ih->code;
248 break;
249 default:
250 if (net_ratelimit()) {
251 printk(KERN_NOTICE "CLUSTERIP: unknown protocol `%u'\n",
252 iph->protocol);
253 }
254 sport = dport = 0;
255 }
256
257 switch (config->hash_mode) {
258 case CLUSTERIP_HASHMODE_SIP:
259 hashval = jhash_1word(ntohl(iph->saddr),
260 config->hash_initval);
261 break;
262 case CLUSTERIP_HASHMODE_SIP_SPT:
263 hashval = jhash_2words(ntohl(iph->saddr), sport,
264 config->hash_initval);
265 break;
266 case CLUSTERIP_HASHMODE_SIP_SPT_DPT:
267 hashval = jhash_3words(ntohl(iph->saddr), sport, dport,
268 config->hash_initval);
269 break;
270 default:
271 /* to make gcc happy */
272 hashval = 0;
273 /* This cannot happen, unless the check function wasn't called
274 * at rule load time */
275 printk("CLUSTERIP: unknown mode `%u'\n", config->hash_mode);
276 BUG();
277 break;
278 }
279
280 /* node numbers are 1..n, not 0..n */
281 return ((hashval % config->num_total_nodes)+1);
282}
283
284static inline int
285clusterip_responsible(struct clusterip_config *config, u_int32_t hash)
286{
287 int i;
288
289 READ_LOCK(&clusterip_lock);
290
291 if (config->num_local_nodes == 0) {
292 READ_UNLOCK(&clusterip_lock);
293 return 0;
294 }
295
296 for (i = 0; i < config->num_local_nodes; i++) {
297 if (config->local_nodes[i] == hash) {
298 READ_UNLOCK(&clusterip_lock);
299 return 1;
300 }
301 }
302
303 READ_UNLOCK(&clusterip_lock);
304
305 return 0;
306}
307
308/***********************************************************************
309 * IPTABLES TARGET
310 ***********************************************************************/
311
312static unsigned int
313target(struct sk_buff **pskb,
314 const struct net_device *in,
315 const struct net_device *out,
316 unsigned int hooknum,
317 const void *targinfo,
318 void *userinfo)
319{
320 const struct ipt_clusterip_tgt_info *cipinfo = targinfo;
321 enum ip_conntrack_info ctinfo;
322 struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
323 u_int32_t hash;
324
325 /* don't need to clusterip_config_get() here, since refcount
326 * is only decremented by destroy() - and ip_tables guarantees
327 * that the ->target() function isn't called after ->destroy() */
328
329 if (!ct) {
330 printk(KERN_ERR "CLUSTERIP: no conntrack!\n");
331 /* FIXME: need to drop invalid ones, since replies
332 * to outgoing connections of other nodes will be
333 * marked as INVALID */
334 return NF_DROP;
335 }
336
337 /* special case: ICMP error handling. conntrack distinguishes between
338 * error messages (RELATED) and information requests (see below) */
339 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
340 && (ctinfo == IP_CT_RELATED
341	    || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY))
342 return IPT_CONTINUE;
343
344 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
345 * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here
346 * on, which all have an ID field [relevant for hashing]. */
347
348 hash = clusterip_hashfn(*pskb, cipinfo->config);
349
350 switch (ctinfo) {
351 case IP_CT_NEW:
352 ct->mark = hash;
353 break;
354 case IP_CT_RELATED:
355 case IP_CT_RELATED+IP_CT_IS_REPLY:
356 /* FIXME: we don't handle expectations at the
357 * moment. they can arrive on a different node than
358 * the master connection (e.g. FTP passive mode) */
359 case IP_CT_ESTABLISHED:
360 case IP_CT_ESTABLISHED+IP_CT_IS_REPLY:
361 break;
362 default:
363 break;
364 }
365
366#ifdef DEBUG_CLUSTERIP
367 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
368#endif
369 DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark);
370 if (!clusterip_responsible(cipinfo->config, hash)) {
371 DEBUGP("not responsible\n");
372 return NF_DROP;
373 }
374 DEBUGP("responsible\n");
375
376 /* despite being received via linklayer multicast, this is
377 * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */
378 (*pskb)->pkt_type = PACKET_HOST;
379
380 return IPT_CONTINUE;
381}
382
383static int
384checkentry(const char *tablename,
385 const struct ipt_entry *e,
386 void *targinfo,
387 unsigned int targinfosize,
388 unsigned int hook_mask)
389{
390 struct ipt_clusterip_tgt_info *cipinfo = targinfo;
391
392 struct clusterip_config *config;
393
394 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info))) {
395 printk(KERN_WARNING "CLUSTERIP: targinfosize %u != %Zu\n",
396 targinfosize,
397 IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info)));
398 return 0;
399 }
400
401 if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
402 cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
403 cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
404 printk(KERN_WARNING "CLUSTERIP: unknown mode `%u'\n",
405 cipinfo->hash_mode);
406 return 0;
407
408 }
409 if (e->ip.dmsk.s_addr != 0xffffffff
410 || e->ip.dst.s_addr == 0) {
411 printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n");
412 return 0;
413 }
414
415 /* FIXME: further sanity checks */
416
417 config = clusterip_config_find_get(e->ip.dst.s_addr);
418 if (!config) {
419 if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
420 printk(KERN_WARNING "CLUSTERIP: no config found for %u.%u.%u.%u, need 'new'\n", NIPQUAD(e->ip.dst.s_addr));
421 return 0;
422 } else {
423 struct net_device *dev;
424
425 if (e->ip.iniface[0] == '\0') {
426 printk(KERN_WARNING "CLUSTERIP: Please specify an interface name\n");
427 return 0;
428 }
429
430 dev = dev_get_by_name(e->ip.iniface);
431 if (!dev) {
432 printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface);
433 return 0;
434 }
435
436 config = clusterip_config_init(cipinfo,
437 e->ip.dst.s_addr, dev);
438 if (!config) {
439 printk(KERN_WARNING "CLUSTERIP: cannot allocate config\n");
440 dev_put(dev);
441 return 0;
442 }
443 dev_mc_add(config->dev,config->clustermac, ETH_ALEN, 0);
444 }
445 }
446
447 cipinfo->config = config;
448
449 return 1;
450}
451
452/* drop reference count of cluster config when rule is deleted */
453static void destroy(void *matchinfo, unsigned int matchinfosize)
454{
455 struct ipt_clusterip_tgt_info *cipinfo = matchinfo;
456
457 /* we first remove the proc entry and then drop the reference
458 * count. In case anyone still accesses the file, the open/close
459 * functions are also incrementing the refcount on their own */
460#ifdef CONFIG_PROC_FS
461 remove_proc_entry(cipinfo->config->pde->name,
462 cipinfo->config->pde->parent);
463#endif
464 clusterip_config_put(cipinfo->config);
465}
466
467static struct ipt_target clusterip_tgt = {
468 .name = "CLUSTERIP",
469 .target = &target,
470 .checkentry = &checkentry,
471 .destroy = &destroy,
472 .me = THIS_MODULE
473};
474
475
476/***********************************************************************
477 * ARP MANGLING CODE
478 ***********************************************************************/
479
480/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */
481struct arp_payload {
482 u_int8_t src_hw[ETH_ALEN];
483 u_int32_t src_ip;
484 u_int8_t dst_hw[ETH_ALEN];
485 u_int32_t dst_ip;
486} __attribute__ ((packed));
487
488#ifdef CLUSTERIP_DEBUG
489static void arp_print(struct arp_payload *payload)
490{
491#define HBUFFERLEN 30
492 char hbuffer[HBUFFERLEN];
493 int j,k;
494 const char hexbuf[]= "0123456789abcdef";
495
496 for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) {
497 hbuffer[k++]=hexbuf[(payload->src_hw[j]>>4)&15];
498 hbuffer[k++]=hexbuf[payload->src_hw[j]&15];
499 hbuffer[k++]=':';
500 }
501 hbuffer[--k]='\0';
502
503 printk("src %u.%u.%u.%u@%s, dst %u.%u.%u.%u\n",
504 NIPQUAD(payload->src_ip), hbuffer,
505 NIPQUAD(payload->dst_ip));
506}
507#endif
508
509static unsigned int
510arp_mangle(unsigned int hook,
511 struct sk_buff **pskb,
512 const struct net_device *in,
513 const struct net_device *out,
514 int (*okfn)(struct sk_buff *))
515{
516 struct arphdr *arp = (*pskb)->nh.arph;
517 struct arp_payload *payload;
518 struct clusterip_config *c;
519
520 /* we don't care about non-ethernet and non-ipv4 ARP */
521 if (arp->ar_hrd != htons(ARPHRD_ETHER)
522 || arp->ar_pro != htons(ETH_P_IP)
523 || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
524 return NF_ACCEPT;
525
526 /* we only want to mangle arp replies */
527 if (arp->ar_op != htons(ARPOP_REPLY))
528 return NF_ACCEPT;
529
530 payload = (void *)(arp+1);
531
532 /* if there is no clusterip configuration for the arp reply's
533 * source ip, we don't want to mangle it */
534 c = clusterip_config_find_get(payload->src_ip);
535 if (!c)
536 return NF_ACCEPT;
537
538 /* normally the linux kernel always replies to arp queries of
539 * addresses on different interfaces. However, in the CLUSTERIP case
540 * this wouldn't work, since we didn't subscribe the mcast group on
541 * other interfaces */
542 if (c->dev != out) {
543 DEBUGP("CLUSTERIP: not mangling arp reply on different "
544 "interface: cip'%s'-skb'%s'\n", c->dev->name, out->name);
545 clusterip_config_put(c);
546 return NF_ACCEPT;
547 }
548
549 /* mangle reply hardware address */
550 memcpy(payload->src_hw, c->clustermac, arp->ar_hln);
551
552#ifdef CLUSTERIP_DEBUG
553 DEBUGP(KERN_DEBUG "CLUSTERIP mangled arp reply: ");
554 arp_print(payload);
555#endif
556
557 clusterip_config_put(c);
558
559 return NF_ACCEPT;
560}
561
562static struct nf_hook_ops cip_arp_ops = {
563 .hook = arp_mangle,
564 .pf = NF_ARP,
565 .hooknum = NF_ARP_OUT,
566 .priority = -1
567};
568
569/***********************************************************************
570 * PROC DIR HANDLING
571 ***********************************************************************/
572
573#ifdef CONFIG_PROC_FS
574
575static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
576{
577 struct proc_dir_entry *pde = s->private;
578 struct clusterip_config *c = pde->data;
579 unsigned int *nodeidx;
580
581 READ_LOCK(&clusterip_lock);
582 if (*pos >= c->num_local_nodes)
583 return NULL;
584
585 nodeidx = kmalloc(sizeof(unsigned int), GFP_KERNEL);
586 if (!nodeidx)
587 return ERR_PTR(-ENOMEM);
588
589 *nodeidx = *pos;
590 return nodeidx;
591}
592
593static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos)
594{
595 struct proc_dir_entry *pde = s->private;
596 struct clusterip_config *c = pde->data;
597 unsigned int *nodeidx = (unsigned int *)v;
598
599 *pos = ++(*nodeidx);
600 if (*pos >= c->num_local_nodes) {
601 kfree(v);
602 return NULL;
603 }
604 return nodeidx;
605}
606
607static void clusterip_seq_stop(struct seq_file *s, void *v)
608{
609 kfree(v);
610
611 READ_UNLOCK(&clusterip_lock);
612}
613
614static int clusterip_seq_show(struct seq_file *s, void *v)
615{
616 struct proc_dir_entry *pde = s->private;
617 struct clusterip_config *c = pde->data;
618 unsigned int *nodeidx = (unsigned int *)v;
619
620 if (*nodeidx != 0)
621 seq_putc(s, ',');
622 seq_printf(s, "%u", c->local_nodes[*nodeidx]);
623
624 if (*nodeidx == c->num_local_nodes-1)
625 seq_putc(s, '\n');
626
627 return 0;
628}
629
630static struct seq_operations clusterip_seq_ops = {
631 .start = clusterip_seq_start,
632 .next = clusterip_seq_next,
633 .stop = clusterip_seq_stop,
634 .show = clusterip_seq_show,
635};
636
637static int clusterip_proc_open(struct inode *inode, struct file *file)
638{
639 int ret = seq_open(file, &clusterip_seq_ops);
640
641 if (!ret) {
642 struct seq_file *sf = file->private_data;
643 struct proc_dir_entry *pde = PDE(inode);
644 struct clusterip_config *c = pde->data;
645
646 sf->private = pde;
647
648 clusterip_config_get(c);
649 }
650
651 return ret;
652}
653
654static int clusterip_proc_release(struct inode *inode, struct file *file)
655{
656 struct proc_dir_entry *pde = PDE(inode);
657 struct clusterip_config *c = pde->data;
658 int ret;
659
660 ret = seq_release(inode, file);
661
662 if (!ret)
663 clusterip_config_put(c);
664
665 return ret;
666}
667
668static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
669 size_t size, loff_t *ofs)
670{
671#define PROC_WRITELEN 10
672 char buffer[PROC_WRITELEN+1];
673 struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode);
674 struct clusterip_config *c = pde->data;
675 unsigned long nodenum;
676
677 if (copy_from_user(buffer, input, PROC_WRITELEN))
678 return -EFAULT;
679
680 if (*buffer == '+') {
681 nodenum = simple_strtoul(buffer+1, NULL, 10);
682 if (clusterip_add_node(c, nodenum))
683 return -ENOMEM;
684 } else if (*buffer == '-') {
685 nodenum = simple_strtoul(buffer+1, NULL,10);
686 if (clusterip_del_node(c, nodenum))
687 return -ENOENT;
688 } else
689 return -EIO;
690
691 return size;
692}
693
694static struct file_operations clusterip_proc_fops = {
695 .owner = THIS_MODULE,
696 .open = clusterip_proc_open,
697 .read = seq_read,
698 .write = clusterip_proc_write,
699 .llseek = seq_lseek,
700 .release = clusterip_proc_release,
701};
702
703#endif /* CONFIG_PROC_FS */
704
705static int init_or_cleanup(int fini)
706{
707 int ret;
708
709 if (fini)
710 goto cleanup;
711
712 if (ipt_register_target(&clusterip_tgt)) {
713 ret = -EINVAL;
714 goto cleanup_none;
715 }
716
717 if (nf_register_hook(&cip_arp_ops) < 0) {
718 ret = -EINVAL;
719 goto cleanup_target;
720 }
721
722#ifdef CONFIG_PROC_FS
723 clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", proc_net);
724 if (!clusterip_procdir) {
725		printk(KERN_ERR "CLUSTERIP: Unable to create proc dir entry\n");
726 ret = -ENOMEM;
727 goto cleanup_hook;
728 }
729#endif /* CONFIG_PROC_FS */
730
731 printk(KERN_NOTICE "ClusterIP Version %s loaded successfully\n",
732 CLUSTERIP_VERSION);
733
734 return 0;
735
736cleanup:
737 printk(KERN_NOTICE "ClusterIP Version %s unloading\n",
738 CLUSTERIP_VERSION);
739#ifdef CONFIG_PROC_FS
740 remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent);
741#endif
742cleanup_hook:
743 nf_unregister_hook(&cip_arp_ops);
744cleanup_target:
745 ipt_unregister_target(&clusterip_tgt);
746cleanup_none:
747 return -EINVAL;
748}
749
750static int __init init(void)
751{
752 return init_or_cleanup(0);
753}
754
755static void __exit fini(void)
756{
757 init_or_cleanup(1);
758}
759
760module_init(init);
761module_exit(fini);
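
clusterip_hashfn() above reduces a jhash of the source address (and optionally the ports) modulo the total node count and shifts the result into the 1..n range; clusterip_responsible() then accepts the packet only if that node number appears in the local node list. A self-contained sketch of the same mapping follows, with a toy mixing function standing in for jhash: only the modulo-plus-one mapping and the membership test mirror the module, the hash itself is illustrative.

#include <stdint.h>
#include <stdio.h>

/* Placeholder mixer standing in for the kernel's jhash. */
static uint32_t toy_hash(uint32_t saddr, uint16_t sport, uint32_t initval)
{
	uint32_t h = saddr ^ initval;

	h ^= (uint32_t)sport * 0x9e3779b1u;
	h ^= h >> 16;
	return h;
}

/* Node numbers are 1..num_total_nodes, matching clusterip_hashfn(). */
static uint32_t pick_node(uint32_t saddr, uint16_t sport,
			  uint32_t initval, uint16_t num_total_nodes)
{
	return toy_hash(saddr, sport, initval) % num_total_nodes + 1;
}

/* Same idea as clusterip_responsible(): linear scan of the local node list. */
static int is_responsible(uint32_t node, const uint16_t *local, int n_local)
{
	int i;

	for (i = 0; i < n_local; i++)
		if (local[i] == node)
			return 1;
	return 0;
}

int main(void)
{
	uint16_t local_nodes[] = { 1, 3 };
	uint32_t node = pick_node(0xc0a80007 /* 192.168.0.7 */, 40000,
				  0x12345678, 4);

	printf("node %u, responsible=%d\n", node,
	       is_responsible(node, local_nodes, 2));
	return 0;
}
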
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c
new file mode 100644
index 000000000000..30ddd3e18eb7
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_CONNMARK.c
@@ -0,0 +1,118 @@
1/* This kernel module is used to modify the connection mark values, or
2 * to optionally restore the skb nfmark from the connection mark
3 *
4 * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
5 * by Henrik Nordstrom <hno@marasystems.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21#include <linux/module.h>
22#include <linux/skbuff.h>
23#include <linux/ip.h>
24#include <net/checksum.h>
25
26MODULE_AUTHOR("Henrik Nordstrom <hno@marasytems.com>");
27MODULE_DESCRIPTION("IP tables CONNMARK matching module");
28MODULE_LICENSE("GPL");
29
30#include <linux/netfilter_ipv4/ip_tables.h>
31#include <linux/netfilter_ipv4/ipt_CONNMARK.h>
32#include <linux/netfilter_ipv4/ip_conntrack.h>
33
34static unsigned int
35target(struct sk_buff **pskb,
36 const struct net_device *in,
37 const struct net_device *out,
38 unsigned int hooknum,
39 const void *targinfo,
40 void *userinfo)
41{
42 const struct ipt_connmark_target_info *markinfo = targinfo;
43 unsigned long diff;
44 unsigned long nfmark;
45 unsigned long newmark;
46
47 enum ip_conntrack_info ctinfo;
48 struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
49 if (ct) {
50 switch(markinfo->mode) {
51 case IPT_CONNMARK_SET:
52 newmark = (ct->mark & ~markinfo->mask) | markinfo->mark;
53 if (newmark != ct->mark)
54 ct->mark = newmark;
55 break;
56 case IPT_CONNMARK_SAVE:
57 newmark = (ct->mark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask);
58 if (ct->mark != newmark)
59 ct->mark = newmark;
60 break;
61 case IPT_CONNMARK_RESTORE:
62 nfmark = (*pskb)->nfmark;
63 diff = (ct->mark ^ nfmark) & markinfo->mask;
64 if (diff != 0) {
65 (*pskb)->nfmark = nfmark ^ diff;
66 (*pskb)->nfcache |= NFC_ALTERED;
67 }
68 break;
69 }
70 }
71
72 return IPT_CONTINUE;
73}
74
75static int
76checkentry(const char *tablename,
77 const struct ipt_entry *e,
78 void *targinfo,
79 unsigned int targinfosize,
80 unsigned int hook_mask)
81{
82 struct ipt_connmark_target_info *matchinfo = targinfo;
83 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_connmark_target_info))) {
84 printk(KERN_WARNING "CONNMARK: targinfosize %u != %Zu\n",
85 targinfosize,
86 IPT_ALIGN(sizeof(struct ipt_connmark_target_info)));
87 return 0;
88 }
89
90 if (matchinfo->mode == IPT_CONNMARK_RESTORE) {
91 if (strcmp(tablename, "mangle") != 0) {
92 printk(KERN_WARNING "CONNMARK: restore can only be called from \"mangle\" table, not \"%s\"\n", tablename);
93 return 0;
94 }
95 }
96
97 return 1;
98}
99
100static struct ipt_target ipt_connmark_reg = {
101 .name = "CONNMARK",
102 .target = &target,
103 .checkentry = &checkentry,
104 .me = THIS_MODULE
105};
106
107static int __init init(void)
108{
109 return ipt_register_target(&ipt_connmark_reg);
110}
111
112static void __exit fini(void)
113{
114 ipt_unregister_target(&ipt_connmark_reg);
115}
116
117module_init(init);
118module_exit(fini);
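
The three CONNMARK modes above are plain mask arithmetic on the connection mark and the packet mark. A short worked example, using made-up mark and mask values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t ctmark = 0x000000aa;	/* current connection mark */
	uint32_t nfmark = 0x00000050;	/* current packet mark */
	uint32_t mark   = 0x00000001;	/* --set-mark value */
	uint32_t mask   = 0x000000ff;	/* --mask value */

	/* SET: replace the masked bits of the conntrack mark. */
	uint32_t set = (ctmark & ~mask) | mark;

	/* SAVE: copy the masked bits of the packet mark into the conntrack mark. */
	uint32_t save = (ctmark & ~mask) | (nfmark & mask);

	/* RESTORE: copy the masked bits of the conntrack mark back to the packet,
	 * exactly as the xor-of-differences in the module does. */
	uint32_t diff    = (ctmark ^ nfmark) & mask;
	uint32_t restore = nfmark ^ diff;

	printf("set=0x%08x save=0x%08x restore=0x%08x\n", set, save, restore);
	return 0;
}
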
diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c
new file mode 100644
index 000000000000..3ea4509099f9
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_DSCP.c
@@ -0,0 +1,106 @@
1/* iptables module for setting the IPv4 DSCP field, Version 1.8
2 *
3 * (C) 2002 by Harald Welte <laforge@netfilter.org>
4 * based on ipt_FTOS.c (C) 2000 by Matthew G. Marsh <mgm@paktronix.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * See RFC2474 for a description of the DSCP field within the IP Header.
11 *
12 * ipt_DSCP.c,v 1.8 2002/08/06 18:41:57 laforge Exp
13*/
14
15#include <linux/module.h>
16#include <linux/skbuff.h>
17#include <linux/ip.h>
18#include <net/checksum.h>
19
20#include <linux/netfilter_ipv4/ip_tables.h>
21#include <linux/netfilter_ipv4/ipt_DSCP.h>
22
23MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
24MODULE_DESCRIPTION("iptables DSCP modification module");
25MODULE_LICENSE("GPL");
26
27static unsigned int
28target(struct sk_buff **pskb,
29 const struct net_device *in,
30 const struct net_device *out,
31 unsigned int hooknum,
32 const void *targinfo,
33 void *userinfo)
34{
35 const struct ipt_DSCP_info *dinfo = targinfo;
36 u_int8_t sh_dscp = ((dinfo->dscp << IPT_DSCP_SHIFT) & IPT_DSCP_MASK);
37
38
39 if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) {
40 u_int16_t diffs[2];
41
42 if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
43 return NF_DROP;
44
45 diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
46 (*pskb)->nh.iph->tos = ((*pskb)->nh.iph->tos & ~IPT_DSCP_MASK)
47 | sh_dscp;
48 diffs[1] = htons((*pskb)->nh.iph->tos);
49 (*pskb)->nh.iph->check
50 = csum_fold(csum_partial((char *)diffs,
51 sizeof(diffs),
52 (*pskb)->nh.iph->check
53 ^ 0xFFFF));
54 (*pskb)->nfcache |= NFC_ALTERED;
55 }
56 return IPT_CONTINUE;
57}
58
59static int
60checkentry(const char *tablename,
61 const struct ipt_entry *e,
62 void *targinfo,
63 unsigned int targinfosize,
64 unsigned int hook_mask)
65{
66 const u_int8_t dscp = ((struct ipt_DSCP_info *)targinfo)->dscp;
67
68 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_DSCP_info))) {
69 printk(KERN_WARNING "DSCP: targinfosize %u != %Zu\n",
70 targinfosize,
71 IPT_ALIGN(sizeof(struct ipt_DSCP_info)));
72 return 0;
73 }
74
75 if (strcmp(tablename, "mangle") != 0) {
76 printk(KERN_WARNING "DSCP: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
77 return 0;
78 }
79
80 if ((dscp > IPT_DSCP_MAX)) {
81 printk(KERN_WARNING "DSCP: dscp %x out of range\n", dscp);
82 return 0;
83 }
84
85 return 1;
86}
87
88static struct ipt_target ipt_dscp_reg = {
89 .name = "DSCP",
90 .target = target,
91 .checkentry = checkentry,
92 .me = THIS_MODULE,
93};
94
95static int __init init(void)
96{
97 return ipt_register_target(&ipt_dscp_reg);
98}
99
100static void __exit fini(void)
101{
102 ipt_unregister_target(&ipt_dscp_reg);
103}
104
105module_init(init);
106module_exit(fini);
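
Rather than recomputing the IP checksum from scratch, the DSCP target above folds the difference between the old and new TOS word into the existing checksum (the diffs[] array fed to csum_partial). The same incremental ones'-complement update in the RFC 1624 formulation, as a standalone sketch; the field and checksum values are illustrative and kept in host order for simplicity:

#include <stdint.h>
#include <stdio.h>

/* Incrementally update a 16-bit ones'-complement checksum after one
 * 16-bit field changes from old_val to new_val (RFC 1624, eqn. 3):
 *     HC' = ~(~HC + ~m + m')
 */
static uint16_t csum16_update(uint16_t check, uint16_t old_val, uint16_t new_val)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old_val;
	sum += new_val;
	sum = (sum & 0xffff) + (sum >> 16);	/* fold carries back in */
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint16_t check     = 0xb1e6;	/* illustrative original checksum */
	uint16_t old_field = 0x0000;	/* old TOS word */
	uint16_t new_field = 0x0028;	/* same word with new DSCP bits */

	printf("new checksum: 0x%04x\n",
	       csum16_update(check, old_field, new_field));
	return 0;
}
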
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
new file mode 100644
index 000000000000..ada9911118e9
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -0,0 +1,175 @@
1/* iptables module for the IPv4 and TCP ECN bits, Version 1.5
2 *
3 * (C) 2002 by Harald Welte <laforge@netfilter.org>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * ipt_ECN.c,v 1.5 2002/08/18 19:36:51 laforge Exp
10*/
11
12#include <linux/module.h>
13#include <linux/skbuff.h>
14#include <linux/ip.h>
15#include <linux/tcp.h>
16#include <net/checksum.h>
17
18#include <linux/netfilter_ipv4/ip_tables.h>
19#include <linux/netfilter_ipv4/ipt_ECN.h>
20
21MODULE_LICENSE("GPL");
22MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
23MODULE_DESCRIPTION("iptables ECN modification module");
24
25/* set ECT codepoint from IP header.
26 * return 0 if there was an error. */
27static inline int
28set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
29{
30 if (((*pskb)->nh.iph->tos & IPT_ECN_IP_MASK)
31 != (einfo->ip_ect & IPT_ECN_IP_MASK)) {
32 u_int16_t diffs[2];
33
34 if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
35 return 0;
36
37 diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
38 (*pskb)->nh.iph->tos &= ~IPT_ECN_IP_MASK;
39 (*pskb)->nh.iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK);
40 diffs[1] = htons((*pskb)->nh.iph->tos);
41 (*pskb)->nh.iph->check
42 = csum_fold(csum_partial((char *)diffs,
43 sizeof(diffs),
44 (*pskb)->nh.iph->check
45 ^0xFFFF));
46 (*pskb)->nfcache |= NFC_ALTERED;
47 }
48 return 1;
49}
50
51/* Return 0 if there was an error. */
52static inline int
53set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
54{
55 struct tcphdr _tcph, *tcph;
56 u_int16_t diffs[2];
57
58	/* Not enough header? */
59 tcph = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
60 sizeof(_tcph), &_tcph);
61 if (!tcph)
62 return 0;
63
64	if ((!(einfo->operation & IPT_ECN_OP_SET_ECE)
65	     || tcph->ece == einfo->proto.tcp.ece)
66	    && (!(einfo->operation & IPT_ECN_OP_SET_CWR)
67	        || tcph->cwr == einfo->proto.tcp.cwr))
68 return 1;
69
70 if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
71 return 0;
72 tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4;
73
74 diffs[0] = ((u_int16_t *)tcph)[6];
75 if (einfo->operation & IPT_ECN_OP_SET_ECE)
76 tcph->ece = einfo->proto.tcp.ece;
77 if (einfo->operation & IPT_ECN_OP_SET_CWR)
78 tcph->cwr = einfo->proto.tcp.cwr;
79 diffs[1] = ((u_int16_t *)tcph)[6];
80 diffs[0] = diffs[0] ^ 0xFFFF;
81
82 if ((*pskb)->ip_summed != CHECKSUM_HW)
83 tcph->check = csum_fold(csum_partial((char *)diffs,
84 sizeof(diffs),
85 tcph->check^0xFFFF));
86 else
87 if (skb_checksum_help(*pskb, inward))
88 return 0;
89 (*pskb)->nfcache |= NFC_ALTERED;
90 return 1;
91}
92
93static unsigned int
94target(struct sk_buff **pskb,
95 const struct net_device *in,
96 const struct net_device *out,
97 unsigned int hooknum,
98 const void *targinfo,
99 void *userinfo)
100{
101 const struct ipt_ECN_info *einfo = targinfo;
102
103 if (einfo->operation & IPT_ECN_OP_SET_IP)
104 if (!set_ect_ip(pskb, einfo))
105 return NF_DROP;
106
107 if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR)
108 && (*pskb)->nh.iph->protocol == IPPROTO_TCP)
109 if (!set_ect_tcp(pskb, einfo, (out == NULL)))
110 return NF_DROP;
111
112 return IPT_CONTINUE;
113}
114
115static int
116checkentry(const char *tablename,
117 const struct ipt_entry *e,
118 void *targinfo,
119 unsigned int targinfosize,
120 unsigned int hook_mask)
121{
122 const struct ipt_ECN_info *einfo = (struct ipt_ECN_info *)targinfo;
123
124 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_ECN_info))) {
125 printk(KERN_WARNING "ECN: targinfosize %u != %Zu\n",
126 targinfosize,
127 IPT_ALIGN(sizeof(struct ipt_ECN_info)));
128 return 0;
129 }
130
131 if (strcmp(tablename, "mangle") != 0) {
132 printk(KERN_WARNING "ECN: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
133 return 0;
134 }
135
136 if (einfo->operation & IPT_ECN_OP_MASK) {
137 printk(KERN_WARNING "ECN: unsupported ECN operation %x\n",
138 einfo->operation);
139 return 0;
140 }
141 if (einfo->ip_ect & ~IPT_ECN_IP_MASK) {
142 printk(KERN_WARNING "ECN: new ECT codepoint %x out of mask\n",
143 einfo->ip_ect);
144 return 0;
145 }
146
147 if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR))
148 && (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & IPT_INV_PROTO))) {
149 printk(KERN_WARNING "ECN: cannot use TCP operations on a "
150 "non-tcp rule\n");
151 return 0;
152 }
153
154 return 1;
155}
156
157static struct ipt_target ipt_ecn_reg = {
158 .name = "ECN",
159 .target = target,
160 .checkentry = checkentry,
161 .me = THIS_MODULE,
162};
163
164static int __init init(void)
165{
166 return ipt_register_target(&ipt_ecn_reg);
167}
168
169static void __exit fini(void)
170{
171 ipt_unregister_target(&ipt_ecn_reg);
172}
173
174module_init(init);
175module_exit(fini);
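
set_ect_ip() above rewrites only the ECN codepoint, i.e. the two low-order bits of the TOS byte (IPT_ECN_IP_MASK), and leaves the DSCP bits alone. A minimal sketch of that masking, assuming the standard RFC 3168 codepoint values:

#include <stdint.h>
#include <stdio.h>

#define ECN_MASK 0x03	/* low two bits of the TOS byte carry the ECN field */

enum { ECN_NOT_ECT = 0, ECN_ECT_1 = 1, ECN_ECT_0 = 2, ECN_CE = 3 };

/* Rewrite only the ECN codepoint, leaving the DSCP bits untouched,
 * mirroring what set_ect_ip() does with IPT_ECN_IP_MASK. */
static uint8_t set_ecn(uint8_t tos, uint8_t ect)
{
	return (tos & ~ECN_MASK) | (ect & ECN_MASK);
}

int main(void)
{
	uint8_t tos = 0xb8;	/* DSCP EF, ECN Not-ECT */

	printf("0x%02x -> 0x%02x\n", tos, set_ecn(tos, ECN_ECT_0));
	return 0;
}
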
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
new file mode 100644
index 000000000000..ef08733d26da
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -0,0 +1,485 @@
1/*
2 * This is a module which is used for logging packets.
3 */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/module.h>
14#include <linux/spinlock.h>
15#include <linux/skbuff.h>
16#include <linux/ip.h>
17#include <net/icmp.h>
18#include <net/udp.h>
19#include <net/tcp.h>
20#include <net/route.h>
21
22#include <linux/netfilter.h>
23#include <linux/netfilter_ipv4/ip_tables.h>
24#include <linux/netfilter_ipv4/ipt_LOG.h>
25
26MODULE_LICENSE("GPL");
27MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
28MODULE_DESCRIPTION("iptables syslog logging module");
29
30static unsigned int nflog = 1;
31module_param(nflog, int, 0400);
32MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
33
34#if 0
35#define DEBUGP printk
36#else
37#define DEBUGP(format, args...)
38#endif
39
40/* Use lock to serialize, so printks don't overlap */
41static DEFINE_SPINLOCK(log_lock);
42
43/* One level of recursion won't kill us */
44static void dump_packet(const struct ipt_log_info *info,
45 const struct sk_buff *skb,
46 unsigned int iphoff)
47{
48 struct iphdr _iph, *ih;
49
50 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
51 if (ih == NULL) {
52 printk("TRUNCATED");
53 return;
54 }
55
56 /* Important fields:
57 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
58 /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
59 printk("SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ",
60 NIPQUAD(ih->saddr), NIPQUAD(ih->daddr));
61
62 /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
63 printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
64 ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
65 ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
66
67 /* Max length: 6 "CE DF MF " */
68 if (ntohs(ih->frag_off) & IP_CE)
69 printk("CE ");
70 if (ntohs(ih->frag_off) & IP_DF)
71 printk("DF ");
72 if (ntohs(ih->frag_off) & IP_MF)
73 printk("MF ");
74
75 /* Max length: 11 "FRAG:65535 " */
76 if (ntohs(ih->frag_off) & IP_OFFSET)
77 printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
78
79 if ((info->logflags & IPT_LOG_IPOPT)
80 && ih->ihl * 4 > sizeof(struct iphdr)) {
81 unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op;
82 unsigned int i, optsize;
83
84 optsize = ih->ihl * 4 - sizeof(struct iphdr);
85 op = skb_header_pointer(skb, iphoff+sizeof(_iph),
86 optsize, _opt);
87 if (op == NULL) {
88 printk("TRUNCATED");
89 return;
90 }
91
92 /* Max length: 127 "OPT (" 15*4*2chars ") " */
93 printk("OPT (");
94 for (i = 0; i < optsize; i++)
95 printk("%02X", op[i]);
96 printk(") ");
97 }
98
99 switch (ih->protocol) {
100 case IPPROTO_TCP: {
101 struct tcphdr _tcph, *th;
102
103 /* Max length: 10 "PROTO=TCP " */
104 printk("PROTO=TCP ");
105
106 if (ntohs(ih->frag_off) & IP_OFFSET)
107 break;
108
109 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
110 th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
111 sizeof(_tcph), &_tcph);
112 if (th == NULL) {
113 printk("INCOMPLETE [%u bytes] ",
114 skb->len - iphoff - ih->ihl*4);
115 break;
116 }
117
118 /* Max length: 20 "SPT=65535 DPT=65535 " */
119 printk("SPT=%u DPT=%u ",
120 ntohs(th->source), ntohs(th->dest));
121 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
122 if (info->logflags & IPT_LOG_TCPSEQ)
123 printk("SEQ=%u ACK=%u ",
124 ntohl(th->seq), ntohl(th->ack_seq));
125 /* Max length: 13 "WINDOW=65535 " */
126 printk("WINDOW=%u ", ntohs(th->window));
127 /* Max length: 9 "RES=0x3F " */
128 printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
129 /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
130 if (th->cwr)
131 printk("CWR ");
132 if (th->ece)
133 printk("ECE ");
134 if (th->urg)
135 printk("URG ");
136 if (th->ack)
137 printk("ACK ");
138 if (th->psh)
139 printk("PSH ");
140 if (th->rst)
141 printk("RST ");
142 if (th->syn)
143 printk("SYN ");
144 if (th->fin)
145 printk("FIN ");
146 /* Max length: 11 "URGP=65535 " */
147 printk("URGP=%u ", ntohs(th->urg_ptr));
148
149 if ((info->logflags & IPT_LOG_TCPOPT)
150 && th->doff * 4 > sizeof(struct tcphdr)) {
151 unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
152 unsigned char *op;
153 unsigned int i, optsize;
154
155 optsize = th->doff * 4 - sizeof(struct tcphdr);
156 op = skb_header_pointer(skb,
157 iphoff+ih->ihl*4+sizeof(_tcph),
158 optsize, _opt);
159 if (op == NULL) {
160 printk("TRUNCATED");
161 return;
162 }
163
164 /* Max length: 127 "OPT (" 15*4*2chars ") " */
165 printk("OPT (");
166 for (i = 0; i < optsize; i++)
167 printk("%02X", op[i]);
168 printk(") ");
169 }
170 break;
171 }
172 case IPPROTO_UDP: {
173 struct udphdr _udph, *uh;
174
175 /* Max length: 10 "PROTO=UDP " */
176 printk("PROTO=UDP ");
177
178 if (ntohs(ih->frag_off) & IP_OFFSET)
179 break;
180
181 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
182 uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
183 sizeof(_udph), &_udph);
184 if (uh == NULL) {
185 printk("INCOMPLETE [%u bytes] ",
186 skb->len - iphoff - ih->ihl*4);
187 break;
188 }
189
190 /* Max length: 20 "SPT=65535 DPT=65535 " */
191 printk("SPT=%u DPT=%u LEN=%u ",
192 ntohs(uh->source), ntohs(uh->dest),
193 ntohs(uh->len));
194 break;
195 }
196 case IPPROTO_ICMP: {
197 struct icmphdr _icmph, *ich;
198 static size_t required_len[NR_ICMP_TYPES+1]
199 = { [ICMP_ECHOREPLY] = 4,
200 [ICMP_DEST_UNREACH]
201 = 8 + sizeof(struct iphdr),
202 [ICMP_SOURCE_QUENCH]
203 = 8 + sizeof(struct iphdr),
204 [ICMP_REDIRECT]
205 = 8 + sizeof(struct iphdr),
206 [ICMP_ECHO] = 4,
207 [ICMP_TIME_EXCEEDED]
208 = 8 + sizeof(struct iphdr),
209 [ICMP_PARAMETERPROB]
210 = 8 + sizeof(struct iphdr),
211 [ICMP_TIMESTAMP] = 20,
212 [ICMP_TIMESTAMPREPLY] = 20,
213 [ICMP_ADDRESS] = 12,
214 [ICMP_ADDRESSREPLY] = 12 };
215
216 /* Max length: 11 "PROTO=ICMP " */
217 printk("PROTO=ICMP ");
218
219 if (ntohs(ih->frag_off) & IP_OFFSET)
220 break;
221
222 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
223 ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
224 sizeof(_icmph), &_icmph);
225 if (ich == NULL) {
226 printk("INCOMPLETE [%u bytes] ",
227 skb->len - iphoff - ih->ihl*4);
228 break;
229 }
230
231 /* Max length: 18 "TYPE=255 CODE=255 " */
232 printk("TYPE=%u CODE=%u ", ich->type, ich->code);
233
234 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
235 if (ich->type <= NR_ICMP_TYPES
236 && required_len[ich->type]
237 && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
238 printk("INCOMPLETE [%u bytes] ",
239 skb->len - iphoff - ih->ihl*4);
240 break;
241 }
242
243 switch (ich->type) {
244 case ICMP_ECHOREPLY:
245 case ICMP_ECHO:
246 /* Max length: 19 "ID=65535 SEQ=65535 " */
247 printk("ID=%u SEQ=%u ",
248 ntohs(ich->un.echo.id),
249 ntohs(ich->un.echo.sequence));
250 break;
251
252 case ICMP_PARAMETERPROB:
253 /* Max length: 14 "PARAMETER=255 " */
254 printk("PARAMETER=%u ",
255 ntohl(ich->un.gateway) >> 24);
256 break;
257 case ICMP_REDIRECT:
258 /* Max length: 24 "GATEWAY=255.255.255.255 " */
259 printk("GATEWAY=%u.%u.%u.%u ",
260 NIPQUAD(ich->un.gateway));
261 /* Fall through */
262 case ICMP_DEST_UNREACH:
263 case ICMP_SOURCE_QUENCH:
264 case ICMP_TIME_EXCEEDED:
265 /* Max length: 3+maxlen */
266 if (!iphoff) { /* Only recurse once. */
267 printk("[");
268 dump_packet(info, skb,
269 iphoff + ih->ihl*4+sizeof(_icmph));
270 printk("] ");
271 }
272
273 /* Max length: 10 "MTU=65535 " */
274 if (ich->type == ICMP_DEST_UNREACH
275 && ich->code == ICMP_FRAG_NEEDED)
276 printk("MTU=%u ", ntohs(ich->un.frag.mtu));
277 }
278 break;
279 }
280 /* Max Length */
281 case IPPROTO_AH: {
282 struct ip_auth_hdr _ahdr, *ah;
283
284 if (ntohs(ih->frag_off) & IP_OFFSET)
285 break;
286
287 /* Max length: 9 "PROTO=AH " */
288 printk("PROTO=AH ");
289
290 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
291 ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
292 sizeof(_ahdr), &_ahdr);
293 if (ah == NULL) {
294 printk("INCOMPLETE [%u bytes] ",
295 skb->len - iphoff - ih->ihl*4);
296 break;
297 }
298
299 /* Length: 15 "SPI=0xF1234567 " */
300 printk("SPI=0x%x ", ntohl(ah->spi));
301 break;
302 }
303 case IPPROTO_ESP: {
304 struct ip_esp_hdr _esph, *eh;
305
306 /* Max length: 10 "PROTO=ESP " */
307 printk("PROTO=ESP ");
308
309 if (ntohs(ih->frag_off) & IP_OFFSET)
310 break;
311
312 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
313 eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
314 sizeof(_esph), &_esph);
315 if (eh == NULL) {
316 printk("INCOMPLETE [%u bytes] ",
317 skb->len - iphoff - ih->ihl*4);
318 break;
319 }
320
321 /* Length: 15 "SPI=0xF1234567 " */
322 printk("SPI=0x%x ", ntohl(eh->spi));
323 break;
324 }
325 /* Max length: 10 "PROTO 255 " */
326 default:
327 printk("PROTO=%u ", ih->protocol);
328 }
329
330 /* Max length: 15 "UID=4294967295 " */
331 if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
332 read_lock_bh(&skb->sk->sk_callback_lock);
333 if (skb->sk->sk_socket && skb->sk->sk_socket->file)
334 printk("UID=%u ", skb->sk->sk_socket->file->f_uid);
335 read_unlock_bh(&skb->sk->sk_callback_lock);
336 }
337
338 /* Proto Max log string length */
339 /* IP: 40+46+6+11+127 = 230 */
340 /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */
341 /* UDP: 10+max(25,20) = 35 */
342 /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
343 /* ESP: 10+max(25)+15 = 50 */
344 /* AH: 9+max(25)+15 = 49 */
345 /* unknown: 10 */
346
347 /* (ICMP allows recursion one level deep) */
348 /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */
349 /* maxlen = 230+ 91 + 230 + 252 = 803 */
350}
351
352static void
353ipt_log_packet(unsigned int hooknum,
354 const struct sk_buff *skb,
355 const struct net_device *in,
356 const struct net_device *out,
357 const struct ipt_log_info *loginfo,
358 const char *level_string,
359 const char *prefix)
360{
361 spin_lock_bh(&log_lock);
362 printk(level_string);
363 printk("%sIN=%s OUT=%s ",
364 prefix == NULL ? loginfo->prefix : prefix,
365 in ? in->name : "",
366 out ? out->name : "");
367#ifdef CONFIG_BRIDGE_NETFILTER
368 if (skb->nf_bridge) {
369 struct net_device *physindev = skb->nf_bridge->physindev;
370 struct net_device *physoutdev = skb->nf_bridge->physoutdev;
371
372 if (physindev && in != physindev)
373 printk("PHYSIN=%s ", physindev->name);
374 if (physoutdev && out != physoutdev)
375 printk("PHYSOUT=%s ", physoutdev->name);
376 }
377#endif
378
379 if (in && !out) {
380 /* MAC logging for input chain only. */
381 printk("MAC=");
382 if (skb->dev && skb->dev->hard_header_len
383 && skb->mac.raw != (void*)skb->nh.iph) {
384 int i;
385 unsigned char *p = skb->mac.raw;
386 for (i = 0; i < skb->dev->hard_header_len; i++,p++)
387 printk("%02x%c", *p,
388 i==skb->dev->hard_header_len - 1
389 ? ' ':':');
390 } else
391 printk(" ");
392 }
393
394 dump_packet(loginfo, skb, 0);
395 printk("\n");
396 spin_unlock_bh(&log_lock);
397}
398
399static unsigned int
400ipt_log_target(struct sk_buff **pskb,
401 const struct net_device *in,
402 const struct net_device *out,
403 unsigned int hooknum,
404 const void *targinfo,
405 void *userinfo)
406{
407 const struct ipt_log_info *loginfo = targinfo;
408 char level_string[4] = "< >";
409
410 level_string[1] = '0' + (loginfo->level % 8);
411 ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL);
412
413 return IPT_CONTINUE;
414}
415
416static void
417ipt_logfn(unsigned int hooknum,
418 const struct sk_buff *skb,
419 const struct net_device *in,
420 const struct net_device *out,
421 const char *prefix)
422{
423 struct ipt_log_info loginfo = {
424 .level = 0,
425 .logflags = IPT_LOG_MASK,
426 .prefix = ""
427 };
428
429 ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix);
430}
431
432static int ipt_log_checkentry(const char *tablename,
433 const struct ipt_entry *e,
434 void *targinfo,
435 unsigned int targinfosize,
436 unsigned int hook_mask)
437{
438 const struct ipt_log_info *loginfo = targinfo;
439
440 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_log_info))) {
441 DEBUGP("LOG: targinfosize %u != %u\n",
442 targinfosize, IPT_ALIGN(sizeof(struct ipt_log_info)));
443 return 0;
444 }
445
446 if (loginfo->level >= 8) {
447 DEBUGP("LOG: level %u >= 8\n", loginfo->level);
448 return 0;
449 }
450
451 if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
452 DEBUGP("LOG: prefix term %i\n",
453 loginfo->prefix[sizeof(loginfo->prefix)-1]);
454 return 0;
455 }
456
457 return 1;
458}
459
460static struct ipt_target ipt_log_reg = {
461 .name = "LOG",
462 .target = ipt_log_target,
463 .checkentry = ipt_log_checkentry,
464 .me = THIS_MODULE,
465};
466
467static int __init init(void)
468{
469 if (ipt_register_target(&ipt_log_reg))
470 return -EINVAL;
471 if (nflog)
472 nf_log_register(PF_INET, &ipt_logfn);
473
474 return 0;
475}
476
477static void __exit fini(void)
478{
479 if (nflog)
480 nf_log_unregister(PF_INET, &ipt_logfn);
481 ipt_unregister_target(&ipt_log_reg);
482}
483
484module_init(init);
485module_exit(fini);
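
ipt_log_target() above builds the printk level prefix at runtime: printk interprets a leading "<N>" in the format string as the message log level, so writing a digit into the middle of "< >" is equivalent to prefixing the line with KERN_WARNING ("<4>"), KERN_NOTICE, and so on. A tiny sketch of the same construction:

#include <stdio.h>

int main(void)
{
	unsigned int level = 4;		/* e.g. --log-level warning */
	char level_string[4] = "< >";

	/* printk() treats a leading "<N>" as the log level, so "<4>" here
	 * is equivalent to prefixing the message with KERN_WARNING. */
	level_string[1] = '0' + (level % 8);
	printf("%s prefix selects log level %u\n", level_string, level % 8);
	return 0;
}
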
diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c
new file mode 100644
index 000000000000..33c6f9b63b8d
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_MARK.c
@@ -0,0 +1,162 @@
1/* This is a module which is used for setting the NFMARK field of an skb. */
2
3/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/module.h>
11#include <linux/skbuff.h>
12#include <linux/ip.h>
13#include <net/checksum.h>
14
15#include <linux/netfilter_ipv4/ip_tables.h>
16#include <linux/netfilter_ipv4/ipt_MARK.h>
17
18MODULE_LICENSE("GPL");
19MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
20MODULE_DESCRIPTION("iptables MARK modification module");
21
22static unsigned int
23target_v0(struct sk_buff **pskb,
24 const struct net_device *in,
25 const struct net_device *out,
26 unsigned int hooknum,
27 const void *targinfo,
28 void *userinfo)
29{
30 const struct ipt_mark_target_info *markinfo = targinfo;
31
32 if((*pskb)->nfmark != markinfo->mark) {
33 (*pskb)->nfmark = markinfo->mark;
34 (*pskb)->nfcache |= NFC_ALTERED;
35 }
36 return IPT_CONTINUE;
37}
38
39static unsigned int
40target_v1(struct sk_buff **pskb,
41 const struct net_device *in,
42 const struct net_device *out,
43 unsigned int hooknum,
44 const void *targinfo,
45 void *userinfo)
46{
47 const struct ipt_mark_target_info_v1 *markinfo = targinfo;
48 int mark = 0;
49
50 switch (markinfo->mode) {
51 case IPT_MARK_SET:
52 mark = markinfo->mark;
53 break;
54
55 case IPT_MARK_AND:
56 mark = (*pskb)->nfmark & markinfo->mark;
57 break;
58
59 case IPT_MARK_OR:
60 mark = (*pskb)->nfmark | markinfo->mark;
61 break;
62 }
63
64 if((*pskb)->nfmark != mark) {
65 (*pskb)->nfmark = mark;
66 (*pskb)->nfcache |= NFC_ALTERED;
67 }
68 return IPT_CONTINUE;
69}
70
71
72static int
73checkentry_v0(const char *tablename,
74 const struct ipt_entry *e,
75 void *targinfo,
76 unsigned int targinfosize,
77 unsigned int hook_mask)
78{
79 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) {
80 printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
81 targinfosize,
82 IPT_ALIGN(sizeof(struct ipt_mark_target_info)));
83 return 0;
84 }
85
86 if (strcmp(tablename, "mangle") != 0) {
87 printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
88 return 0;
89 }
90
91 return 1;
92}
93
94static int
95checkentry_v1(const char *tablename,
96 const struct ipt_entry *e,
97 void *targinfo,
98 unsigned int targinfosize,
99 unsigned int hook_mask)
100{
101 struct ipt_mark_target_info_v1 *markinfo = targinfo;
102
103 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info_v1))){
104 printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
105 targinfosize,
106 IPT_ALIGN(sizeof(struct ipt_mark_target_info_v1)));
107 return 0;
108 }
109
110 if (strcmp(tablename, "mangle") != 0) {
111 printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
112 return 0;
113 }
114
115 if (markinfo->mode != IPT_MARK_SET
116 && markinfo->mode != IPT_MARK_AND
117 && markinfo->mode != IPT_MARK_OR) {
118 printk(KERN_WARNING "MARK: unknown mode %u\n",
119 markinfo->mode);
120 return 0;
121 }
122
123 return 1;
124}
125
126static struct ipt_target ipt_mark_reg_v0 = {
127 .name = "MARK",
128 .target = target_v0,
129 .checkentry = checkentry_v0,
130 .me = THIS_MODULE,
131 .revision = 0,
132};
133
134static struct ipt_target ipt_mark_reg_v1 = {
135 .name = "MARK",
136 .target = target_v1,
137 .checkentry = checkentry_v1,
138 .me = THIS_MODULE,
139 .revision = 1,
140};
141
142static int __init init(void)
143{
144 int err;
145
146 err = ipt_register_target(&ipt_mark_reg_v0);
147 if (!err) {
148 err = ipt_register_target(&ipt_mark_reg_v1);
149 if (err)
150 ipt_unregister_target(&ipt_mark_reg_v0);
151 }
152 return err;
153}
154
155static void __exit fini(void)
156{
157 ipt_unregister_target(&ipt_mark_reg_v0);
158 ipt_unregister_target(&ipt_mark_reg_v1);
159}
160
161module_init(init);
162module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
new file mode 100644
index 000000000000..57e9f6cf1c36
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -0,0 +1,207 @@
1/* Masquerade. Simple mapping which alters range to a local IP address
2 (depending on route). */
3
4/* (C) 1999-2001 Paul `Rusty' Russell
5 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/config.h>
13#include <linux/types.h>
14#include <linux/ip.h>
15#include <linux/timer.h>
16#include <linux/module.h>
17#include <linux/netfilter.h>
18#include <net/protocol.h>
19#include <net/ip.h>
20#include <net/checksum.h>
21#include <linux/netfilter_ipv4.h>
22#include <linux/netfilter_ipv4/ip_nat_rule.h>
23#include <linux/netfilter_ipv4/ip_tables.h>
24
25MODULE_LICENSE("GPL");
26MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
27MODULE_DESCRIPTION("iptables MASQUERADE target module");
28
29#if 0
30#define DEBUGP printk
31#else
32#define DEBUGP(format, args...)
33#endif
34
35/* Lock protects masq region inside conntrack */
36static DECLARE_RWLOCK(masq_lock);
37
38/* FIXME: Multiple targets. --RR */
39static int
40masquerade_check(const char *tablename,
41 const struct ipt_entry *e,
42 void *targinfo,
43 unsigned int targinfosize,
44 unsigned int hook_mask)
45{
46 const struct ip_nat_multi_range_compat *mr = targinfo;
47
48 if (strcmp(tablename, "nat") != 0) {
49 DEBUGP("masquerade_check: bad table `%s'.\n", tablename);
50 return 0;
51 }
52 if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
53 DEBUGP("masquerade_check: size %u != %u.\n",
54 targinfosize, sizeof(*mr));
55 return 0;
56 }
57 if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) {
58 DEBUGP("masquerade_check: bad hooks %x.\n", hook_mask);
59 return 0;
60 }
61 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
62 DEBUGP("masquerade_check: bad MAP_IPS.\n");
63 return 0;
64 }
65 if (mr->rangesize != 1) {
66 DEBUGP("masquerade_check: bad rangesize %u.\n", mr->rangesize);
67 return 0;
68 }
69 return 1;
70}
71
72static unsigned int
73masquerade_target(struct sk_buff **pskb,
74 const struct net_device *in,
75 const struct net_device *out,
76 unsigned int hooknum,
77 const void *targinfo,
78 void *userinfo)
79{
80 struct ip_conntrack *ct;
81 enum ip_conntrack_info ctinfo;
82 const struct ip_nat_multi_range_compat *mr;
83 struct ip_nat_range newrange;
84 struct rtable *rt;
85 u_int32_t newsrc;
86
87 IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
88
89 /* FIXME: For the moment, don't do local packets, breaks
90 testsuite for 2.3.49 --RR */
91 if ((*pskb)->sk)
92 return NF_ACCEPT;
93
94 ct = ip_conntrack_get(*pskb, &ctinfo);
95 IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
96 || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
97
98 mr = targinfo;
99 rt = (struct rtable *)(*pskb)->dst;
100 newsrc = inet_select_addr(out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
101 if (!newsrc) {
102 printk("MASQUERADE: %s ate my IP address\n", out->name);
103 return NF_DROP;
104 }
105
106 WRITE_LOCK(&masq_lock);
107 ct->nat.masq_index = out->ifindex;
108 WRITE_UNLOCK(&masq_lock);
109
110 /* Transfer from original range. */
111 newrange = ((struct ip_nat_range)
112 { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
113 newsrc, newsrc,
114 mr->range[0].min, mr->range[0].max });
115
116 /* Hand modified range to generic setup. */
117 return ip_nat_setup_info(ct, &newrange, hooknum);
118}
119
120static inline int
121device_cmp(struct ip_conntrack *i, void *ifindex)
122{
123 int ret;
124
125 READ_LOCK(&masq_lock);
126 ret = (i->nat.masq_index == (int)(long)ifindex);
127 READ_UNLOCK(&masq_lock);
128
129 return ret;
130}
131
132static int masq_device_event(struct notifier_block *this,
133 unsigned long event,
134 void *ptr)
135{
136 struct net_device *dev = ptr;
137
138 if (event == NETDEV_DOWN) {
139 /* Device was downed. Search entire table for
140 conntracks which were associated with that device,
141 and forget them. */
142 IP_NF_ASSERT(dev->ifindex != 0);
143
144 ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex);
145 }
146
147 return NOTIFY_DONE;
148}
149
150static int masq_inet_event(struct notifier_block *this,
151 unsigned long event,
152 void *ptr)
153{
154 struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
155
156 if (event == NETDEV_DOWN) {
157 /* IP address was deleted. Search entire table for
158 conntracks which were associated with that device,
159 and forget them. */
160 IP_NF_ASSERT(dev->ifindex != 0);
161
162 ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex);
163 }
164
165 return NOTIFY_DONE;
166}
167
168static struct notifier_block masq_dev_notifier = {
169 .notifier_call = masq_device_event,
170};
171
172static struct notifier_block masq_inet_notifier = {
173 .notifier_call = masq_inet_event,
174};
175
176static struct ipt_target masquerade = {
177 .name = "MASQUERADE",
178 .target = masquerade_target,
179 .checkentry = masquerade_check,
180 .me = THIS_MODULE,
181};
182
183static int __init init(void)
184{
185 int ret;
186
187 ret = ipt_register_target(&masquerade);
188
189 if (ret == 0) {
190 /* Register for device down reports */
191 register_netdevice_notifier(&masq_dev_notifier);
192 /* Register IP address change reports */
193 register_inetaddr_notifier(&masq_inet_notifier);
194 }
195
196 return ret;
197}
198
199static void __exit fini(void)
200{
201 ipt_unregister_target(&masquerade);
202 unregister_netdevice_notifier(&masq_dev_notifier);
203 unregister_inetaddr_notifier(&masq_inet_notifier);
204}
205
206module_init(init);
207module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
new file mode 100644
index 000000000000..06254b29d034
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -0,0 +1,117 @@
1/* NETMAP - static NAT mapping of IP network addresses (1:1).
2 * The mapping can be applied to source (POSTROUTING),
3 * destination (PREROUTING), or both (with separate rules).
4 */
5
6/* (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/config.h>
14#include <linux/ip.h>
15#include <linux/module.h>
16#include <linux/netdevice.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19#include <linux/netfilter_ipv4/ip_nat_rule.h>
20
21#define MODULENAME "NETMAP"
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
24MODULE_DESCRIPTION("iptables 1:1 NAT mapping of IP networks target");
25
26#if 0
27#define DEBUGP printk
28#else
29#define DEBUGP(format, args...)
30#endif
31
32static int
33check(const char *tablename,
34 const struct ipt_entry *e,
35 void *targinfo,
36 unsigned int targinfosize,
37 unsigned int hook_mask)
38{
39 const struct ip_nat_multi_range_compat *mr = targinfo;
40
41 if (strcmp(tablename, "nat") != 0) {
42 DEBUGP(MODULENAME":check: bad table `%s'.\n", tablename);
43 return 0;
44 }
45 if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
46 DEBUGP(MODULENAME":check: size %u.\n", targinfosize);
47 return 0;
48 }
49 if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) {
50 DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask);
51 return 0;
52 }
53 if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) {
54 DEBUGP(MODULENAME":check: bad MAP_IPS.\n");
55 return 0;
56 }
57 if (mr->rangesize != 1) {
58 DEBUGP(MODULENAME":check: bad rangesize %u.\n", mr->rangesize);
59 return 0;
60 }
61 return 1;
62}
63
64static unsigned int
65target(struct sk_buff **pskb,
66 const struct net_device *in,
67 const struct net_device *out,
68 unsigned int hooknum,
69 const void *targinfo,
70 void *userinfo)
71{
72 struct ip_conntrack *ct;
73 enum ip_conntrack_info ctinfo;
74 u_int32_t new_ip, netmask;
75 const struct ip_nat_multi_range_compat *mr = targinfo;
76 struct ip_nat_range newrange;
77
78 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
79 || hooknum == NF_IP_POST_ROUTING);
80 ct = ip_conntrack_get(*pskb, &ctinfo);
81
82 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
83
84 if (hooknum == NF_IP_PRE_ROUTING)
85 new_ip = (*pskb)->nh.iph->daddr & ~netmask;
86 else
87 new_ip = (*pskb)->nh.iph->saddr & ~netmask;
88 new_ip |= mr->range[0].min_ip & netmask;
89
90 newrange = ((struct ip_nat_range)
91 { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
92 new_ip, new_ip,
93 mr->range[0].min, mr->range[0].max });
94
95 /* Hand modified range to generic setup. */
96 return ip_nat_setup_info(ct, &newrange, hooknum);
97}
98
99static struct ipt_target target_module = {
100 .name = MODULENAME,
101 .target = target,
102 .checkentry = check,
103 .me = THIS_MODULE
104};
105
106static int __init init(void)
107{
108 return ipt_register_target(&target_module);
109}
110
111static void __exit fini(void)
112{
113 ipt_unregister_target(&target_module);
114}
115
116module_init(init);
117module_exit(fini);
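
A minimal user-space sketch (an editorial illustration, not part of the module) of the address arithmetic in target() above: the netmask is derived from the configured min/max addresses, the host bits of the original address are kept, and the network bits come from the mapping range. The 10.5.5.0/24 example addresses are assumptions chosen only for the demo.

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

/* Same computation as ipt_NETMAP's target(): keep host bits, swap network bits. */
static uint32_t netmap(uint32_t orig, uint32_t range_min, uint32_t range_max)
{
	uint32_t netmask = ~(range_min ^ range_max);	/* bits common to min and max */

	return (orig & ~netmask) | (range_min & netmask);
}

int main(void)
{
	/* Illustrative only: map 192.168.1.42 into the 10.5.5.0/24 range. */
	uint32_t min = ntohl(inet_addr("10.5.5.0"));
	uint32_t max = ntohl(inet_addr("10.5.5.255"));
	uint32_t mapped = netmap(ntohl(inet_addr("192.168.1.42")), min, max);

	printf("%u.%u.%u.%u\n", (mapped >> 24) & 0xff, (mapped >> 16) & 0xff,
	       (mapped >> 8) & 0xff, mapped & 0xff);	/* prints 10.5.5.42 */
	return 0;
}

The kernel code works on network-byte-order values directly; since only bitwise AND/XOR are involved, byte order does not change the result, and the sketch converts to host order merely to print the address.
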
diff --git a/net/ipv4/netfilter/ipt_NOTRACK.c b/net/ipv4/netfilter/ipt_NOTRACK.c
new file mode 100644
index 000000000000..a4bb9b3bc292
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_NOTRACK.c
@@ -0,0 +1,76 @@
1/* This is a module which is used for setting up fake conntracks
2 * on packets so that they are not seen by the conntrack/NAT code.
3 */
4#include <linux/module.h>
5#include <linux/skbuff.h>
6
7#include <linux/netfilter_ipv4/ip_tables.h>
8#include <linux/netfilter_ipv4/ip_conntrack.h>
9
10static unsigned int
11target(struct sk_buff **pskb,
12 const struct net_device *in,
13 const struct net_device *out,
14 unsigned int hooknum,
15 const void *targinfo,
16 void *userinfo)
17{
18 /* Previously seen (loopback)? Ignore. */
19 if ((*pskb)->nfct != NULL)
20 return IPT_CONTINUE;
21
22 /* Attach fake conntrack entry.
23	   If there is a real ct entry corresponding to this packet,
24	   it'll hang around until it times out. We don't deal with it
25 for performance reasons. JK */
26 (*pskb)->nfct = &ip_conntrack_untracked.ct_general;
27 (*pskb)->nfctinfo = IP_CT_NEW;
28 nf_conntrack_get((*pskb)->nfct);
29
30 return IPT_CONTINUE;
31}
32
33static int
34checkentry(const char *tablename,
35 const struct ipt_entry *e,
36 void *targinfo,
37 unsigned int targinfosize,
38 unsigned int hook_mask)
39{
40 if (targinfosize != 0) {
41 printk(KERN_WARNING "NOTRACK: targinfosize %u != 0\n",
42 targinfosize);
43 return 0;
44 }
45
46 if (strcmp(tablename, "raw") != 0) {
47 printk(KERN_WARNING "NOTRACK: can only be called from \"raw\" table, not \"%s\"\n", tablename);
48 return 0;
49 }
50
51 return 1;
52}
53
54static struct ipt_target ipt_notrack_reg = {
55 .name = "NOTRACK",
56 .target = target,
57 .checkentry = checkentry,
58 .me = THIS_MODULE
59};
60
61static int __init init(void)
62{
63 if (ipt_register_target(&ipt_notrack_reg))
64 return -EINVAL;
65
66 return 0;
67}
68
69static void __exit fini(void)
70{
71 ipt_unregister_target(&ipt_notrack_reg);
72}
73
74module_init(init);
75module_exit(fini);
76MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
new file mode 100644
index 000000000000..d2e13447678e
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -0,0 +1,129 @@
1/* Redirect. Simple mapping which alters dst to a local IP address. */
2/* (C) 1999-2001 Paul `Rusty' Russell
3 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/types.h>
11#include <linux/ip.h>
12#include <linux/timer.h>
13#include <linux/module.h>
14#include <linux/netfilter.h>
15#include <linux/netdevice.h>
16#include <linux/if.h>
17#include <linux/inetdevice.h>
18#include <net/protocol.h>
19#include <net/checksum.h>
20#include <linux/netfilter_ipv4.h>
21#include <linux/netfilter_ipv4/ip_nat_rule.h>
22
23MODULE_LICENSE("GPL");
24MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
25MODULE_DESCRIPTION("iptables REDIRECT target module");
26
27#if 0
28#define DEBUGP printk
29#else
30#define DEBUGP(format, args...)
31#endif
32
33/* FIXME: Take multiple ranges --RR */
34static int
35redirect_check(const char *tablename,
36 const struct ipt_entry *e,
37 void *targinfo,
38 unsigned int targinfosize,
39 unsigned int hook_mask)
40{
41 const struct ip_nat_multi_range_compat *mr = targinfo;
42
43 if (strcmp(tablename, "nat") != 0) {
44		DEBUGP("redirect_check: bad table `%s'.\n", tablename);
45 return 0;
46 }
47 if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
48 DEBUGP("redirect_check: size %u.\n", targinfosize);
49 return 0;
50 }
51 if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) {
52 DEBUGP("redirect_check: bad hooks %x.\n", hook_mask);
53 return 0;
54 }
55 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
56 DEBUGP("redirect_check: bad MAP_IPS.\n");
57 return 0;
58 }
59 if (mr->rangesize != 1) {
60 DEBUGP("redirect_check: bad rangesize %u.\n", mr->rangesize);
61 return 0;
62 }
63 return 1;
64}
65
66static unsigned int
67redirect_target(struct sk_buff **pskb,
68 const struct net_device *in,
69 const struct net_device *out,
70 unsigned int hooknum,
71 const void *targinfo,
72 void *userinfo)
73{
74 struct ip_conntrack *ct;
75 enum ip_conntrack_info ctinfo;
76 u_int32_t newdst;
77 const struct ip_nat_multi_range_compat *mr = targinfo;
78 struct ip_nat_range newrange;
79
80 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
81 || hooknum == NF_IP_LOCAL_OUT);
82
83 ct = ip_conntrack_get(*pskb, &ctinfo);
84 IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
85
86 /* Local packets: make them go to loopback */
87 if (hooknum == NF_IP_LOCAL_OUT)
88 newdst = htonl(0x7F000001);
89 else {
90 struct in_device *indev;
91
92 /* Device might not have an associated in_device. */
93 indev = (struct in_device *)(*pskb)->dev->ip_ptr;
94 if (indev == NULL || indev->ifa_list == NULL)
95 return NF_DROP;
96
97 /* Grab first address on interface. */
98 newdst = indev->ifa_list->ifa_local;
99 }
100
101 /* Transfer from original range. */
102 newrange = ((struct ip_nat_range)
103 { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
104 newdst, newdst,
105 mr->range[0].min, mr->range[0].max });
106
107 /* Hand modified range to generic setup. */
108 return ip_nat_setup_info(ct, &newrange, hooknum);
109}
110
111static struct ipt_target redirect_reg = {
112 .name = "REDIRECT",
113 .target = redirect_target,
114 .checkentry = redirect_check,
115 .me = THIS_MODULE,
116};
117
118static int __init init(void)
119{
120 return ipt_register_target(&redirect_reg);
121}
122
123static void __exit fini(void)
124{
125 ipt_unregister_target(&redirect_reg);
126}
127
128module_init(init);
129module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
new file mode 100644
index 000000000000..266d64979286
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -0,0 +1,335 @@
1/*
2 * This is a module which is used for rejecting packets.
3 * Added support for customized reject packets (Jozsef Kadlecsik).
4 * Added support for ICMP type-3-code-13 (Maciej Soltysiak). [RFC 1812]
5 */
6
7/* (C) 1999-2001 Paul `Rusty' Russell
8 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 */
14
15#include <linux/config.h>
16#include <linux/module.h>
17#include <linux/skbuff.h>
18#include <linux/ip.h>
19#include <linux/udp.h>
20#include <linux/icmp.h>
21#include <net/icmp.h>
22#include <net/ip.h>
23#include <net/tcp.h>
24#include <net/route.h>
25#include <net/dst.h>
26#include <linux/netfilter_ipv4/ip_tables.h>
27#include <linux/netfilter_ipv4/ipt_REJECT.h>
28#ifdef CONFIG_BRIDGE_NETFILTER
29#include <linux/netfilter_bridge.h>
30#endif
31
32MODULE_LICENSE("GPL");
33MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
34MODULE_DESCRIPTION("iptables REJECT target module");
35
36#if 0
37#define DEBUGP printk
38#else
39#define DEBUGP(format, args...)
40#endif
41
42static inline struct rtable *route_reverse(struct sk_buff *skb,
43 struct tcphdr *tcph, int hook)
44{
45 struct iphdr *iph = skb->nh.iph;
46 struct dst_entry *odst;
47 struct flowi fl = {};
48 struct rtable *rt;
49
50 /* We don't require ip forwarding to be enabled to be able to
51 * send a RST reply for bridged traffic. */
52 if (hook != NF_IP_FORWARD
53#ifdef CONFIG_BRIDGE_NETFILTER
54 || (skb->nf_bridge && skb->nf_bridge->mask & BRNF_BRIDGED)
55#endif
56 ) {
57 fl.nl_u.ip4_u.daddr = iph->saddr;
58 if (hook == NF_IP_LOCAL_IN)
59 fl.nl_u.ip4_u.saddr = iph->daddr;
60 fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
61
62 if (ip_route_output_key(&rt, &fl) != 0)
63 return NULL;
64 } else {
65 /* non-local src, find valid iif to satisfy
66 * rp-filter when calling ip_route_input. */
67 fl.nl_u.ip4_u.daddr = iph->daddr;
68 if (ip_route_output_key(&rt, &fl) != 0)
69 return NULL;
70
71 odst = skb->dst;
72 if (ip_route_input(skb, iph->saddr, iph->daddr,
73 RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
74 dst_release(&rt->u.dst);
75 return NULL;
76 }
77 dst_release(&rt->u.dst);
78 rt = (struct rtable *)skb->dst;
79 skb->dst = odst;
80
81 fl.nl_u.ip4_u.daddr = iph->saddr;
82 fl.nl_u.ip4_u.saddr = iph->daddr;
83 fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
84 }
85
86 if (rt->u.dst.error) {
87 dst_release(&rt->u.dst);
88 return NULL;
89 }
90
91 fl.proto = IPPROTO_TCP;
92 fl.fl_ip_sport = tcph->dest;
93 fl.fl_ip_dport = tcph->source;
94
95 if (xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0)) {
96 dst_release(&rt->u.dst);
97 rt = NULL;
98 }
99
100 return rt;
101}
102
103/* Send RST reply */
104static void send_reset(struct sk_buff *oldskb, int hook)
105{
106 struct sk_buff *nskb;
107 struct tcphdr _otcph, *oth, *tcph;
108 struct rtable *rt;
109 u_int16_t tmp_port;
110 u_int32_t tmp_addr;
111 int needs_ack;
112 int hh_len;
113
114 /* IP header checks: fragment. */
115 if (oldskb->nh.iph->frag_off & htons(IP_OFFSET))
116 return;
117
118 oth = skb_header_pointer(oldskb, oldskb->nh.iph->ihl * 4,
119 sizeof(_otcph), &_otcph);
120 if (oth == NULL)
121 return;
122
123 /* No RST for RST. */
124 if (oth->rst)
125 return;
126
127 /* FIXME: Check checksum --RR */
128 if ((rt = route_reverse(oldskb, oth, hook)) == NULL)
129 return;
130
131 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
132
133 /* We need a linear, writeable skb. We also need to expand
134 headroom in case hh_len of incoming interface < hh_len of
135 outgoing interface */
136 nskb = skb_copy_expand(oldskb, hh_len, skb_tailroom(oldskb),
137 GFP_ATOMIC);
138 if (!nskb) {
139 dst_release(&rt->u.dst);
140 return;
141 }
142
143 dst_release(nskb->dst);
144 nskb->dst = &rt->u.dst;
145
146 /* This packet will not be the same as the other: clear nf fields */
147 nf_reset(nskb);
148 nskb->nfcache = 0;
149 nskb->nfmark = 0;
150#ifdef CONFIG_BRIDGE_NETFILTER
151 nf_bridge_put(nskb->nf_bridge);
152 nskb->nf_bridge = NULL;
153#endif
154
155 tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl);
156
157 /* Swap source and dest */
158 tmp_addr = nskb->nh.iph->saddr;
159 nskb->nh.iph->saddr = nskb->nh.iph->daddr;
160 nskb->nh.iph->daddr = tmp_addr;
161 tmp_port = tcph->source;
162 tcph->source = tcph->dest;
163 tcph->dest = tmp_port;
164
165 /* Truncate to length (no data) */
166 tcph->doff = sizeof(struct tcphdr)/4;
167 skb_trim(nskb, nskb->nh.iph->ihl*4 + sizeof(struct tcphdr));
168 nskb->nh.iph->tot_len = htons(nskb->len);
169
170 if (tcph->ack) {
171 needs_ack = 0;
172 tcph->seq = oth->ack_seq;
173 tcph->ack_seq = 0;
174 } else {
175 needs_ack = 1;
176 tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin
177 + oldskb->len - oldskb->nh.iph->ihl*4
178 - (oth->doff<<2));
179 tcph->seq = 0;
180 }
181
182 /* Reset flags */
183 ((u_int8_t *)tcph)[13] = 0;
184 tcph->rst = 1;
185 tcph->ack = needs_ack;
186
187 tcph->window = 0;
188 tcph->urg_ptr = 0;
189
190 /* Adjust TCP checksum */
191 tcph->check = 0;
192 tcph->check = tcp_v4_check(tcph, sizeof(struct tcphdr),
193 nskb->nh.iph->saddr,
194 nskb->nh.iph->daddr,
195 csum_partial((char *)tcph,
196 sizeof(struct tcphdr), 0));
197
198 /* Adjust IP TTL, DF */
199 nskb->nh.iph->ttl = MAXTTL;
200 /* Set DF, id = 0 */
201 nskb->nh.iph->frag_off = htons(IP_DF);
202 nskb->nh.iph->id = 0;
203
204 /* Adjust IP checksum */
205 nskb->nh.iph->check = 0;
206 nskb->nh.iph->check = ip_fast_csum((unsigned char *)nskb->nh.iph,
207 nskb->nh.iph->ihl);
208
209 /* "Never happens" */
210 if (nskb->len > dst_mtu(nskb->dst))
211 goto free_nskb;
212
213 nf_ct_attach(nskb, oldskb);
214
215 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev,
216 dst_output);
217 return;
218
219 free_nskb:
220 kfree_skb(nskb);
221}
222
223static inline void send_unreach(struct sk_buff *skb_in, int code)
224{
225 icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
226}
227
228static unsigned int reject(struct sk_buff **pskb,
229 const struct net_device *in,
230 const struct net_device *out,
231 unsigned int hooknum,
232 const void *targinfo,
233 void *userinfo)
234{
235 const struct ipt_reject_info *reject = targinfo;
236
237 /* Our naive response construction doesn't deal with IP
238 options, and probably shouldn't try. */
239 if ((*pskb)->nh.iph->ihl<<2 != sizeof(struct iphdr))
240 return NF_DROP;
241
242 /* WARNING: This code causes reentry within iptables.
243 This means that the iptables jump stack is now crap. We
244 must return an absolute verdict. --RR */
245 switch (reject->with) {
246 case IPT_ICMP_NET_UNREACHABLE:
247 send_unreach(*pskb, ICMP_NET_UNREACH);
248 break;
249 case IPT_ICMP_HOST_UNREACHABLE:
250 send_unreach(*pskb, ICMP_HOST_UNREACH);
251 break;
252 case IPT_ICMP_PROT_UNREACHABLE:
253 send_unreach(*pskb, ICMP_PROT_UNREACH);
254 break;
255 case IPT_ICMP_PORT_UNREACHABLE:
256 send_unreach(*pskb, ICMP_PORT_UNREACH);
257 break;
258 case IPT_ICMP_NET_PROHIBITED:
259 send_unreach(*pskb, ICMP_NET_ANO);
260 break;
261 case IPT_ICMP_HOST_PROHIBITED:
262 send_unreach(*pskb, ICMP_HOST_ANO);
263 break;
264 case IPT_ICMP_ADMIN_PROHIBITED:
265 send_unreach(*pskb, ICMP_PKT_FILTERED);
266 break;
267 case IPT_TCP_RESET:
268 send_reset(*pskb, hooknum);
269 case IPT_ICMP_ECHOREPLY:
270 /* Doesn't happen. */
271 break;
272 }
273
274 return NF_DROP;
275}
276
277static int check(const char *tablename,
278 const struct ipt_entry *e,
279 void *targinfo,
280 unsigned int targinfosize,
281 unsigned int hook_mask)
282{
283 const struct ipt_reject_info *rejinfo = targinfo;
284
285 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_reject_info))) {
286		DEBUGP("REJECT: bad targinfosize %u\n", targinfosize);
287 return 0;
288 }
289
290 /* Only allow these for packet filtering. */
291 if (strcmp(tablename, "filter") != 0) {
292 DEBUGP("REJECT: bad table `%s'.\n", tablename);
293 return 0;
294 }
295 if ((hook_mask & ~((1 << NF_IP_LOCAL_IN)
296 | (1 << NF_IP_FORWARD)
297 | (1 << NF_IP_LOCAL_OUT))) != 0) {
298 DEBUGP("REJECT: bad hook mask %X\n", hook_mask);
299 return 0;
300 }
301
302 if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
303 printk("REJECT: ECHOREPLY no longer supported.\n");
304 return 0;
305 } else if (rejinfo->with == IPT_TCP_RESET) {
306 /* Must specify that it's a TCP packet */
307 if (e->ip.proto != IPPROTO_TCP
308 || (e->ip.invflags & IPT_INV_PROTO)) {
309 DEBUGP("REJECT: TCP_RESET invalid for non-tcp\n");
310 return 0;
311 }
312 }
313
314 return 1;
315}
316
317static struct ipt_target ipt_reject_reg = {
318 .name = "REJECT",
319 .target = reject,
320 .checkentry = check,
321 .me = THIS_MODULE,
322};
323
324static int __init init(void)
325{
326 return ipt_register_target(&ipt_reject_reg);
327}
328
329static void __exit fini(void)
330{
331 ipt_unregister_target(&ipt_reject_reg);
332}
333
334module_init(init);
335module_exit(fini);
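
The ack_seq computed in send_reset() when the offending segment carried no ACK is that segment's sequence number advanced by the SYN and FIN flags plus the payload length (total IP length minus IP and TCP header lengths). A small user-space sketch of that arithmetic, with made-up header sizes and sequence numbers, purely as an illustration:

#include <stdio.h>
#include <stdint.h>

/* Same arithmetic as send_reset(): the RST must acknowledge everything
 * the peer sent, i.e. seq advanced by SYN, FIN and data length. */
static uint32_t rst_ack_seq(uint32_t seq, unsigned int syn, unsigned int fin,
			    unsigned int ip_tot_len, unsigned int ip_hdr_len,
			    unsigned int tcp_hdr_len)
{
	unsigned int payload = ip_tot_len - ip_hdr_len - tcp_hdr_len;

	return seq + syn + fin + payload;
}

int main(void)
{
	/* Hypothetical SYN with no data: seq 1000, 20-byte IP and TCP headers. */
	printf("%u\n", rst_ack_seq(1000, 1, 0, 40, 20, 20));	/* prints 1001 */
	return 0;
}
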
diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c
new file mode 100644
index 000000000000..7a0536d864ac
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_SAME.c
@@ -0,0 +1,211 @@
1/* Same. Just like SNAT, only try to make the connections
2 * between client A and server B always have the same source ip.
3 *
4 * (C) 2000 Paul `Rusty' Russell
5 * (C) 2001 Martin Josefsson
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * 010320 Martin Josefsson <gandalf@wlug.westbo.se>
12 * * copied ipt_BALANCE.c to ipt_SAME.c and changed a few things.
13 * 010728 Martin Josefsson <gandalf@wlug.westbo.se>
14 * * added --nodst to not include destination-ip in new source
15 * calculations.
16 * * added some more sanity-checks.
17 * 010729 Martin Josefsson <gandalf@wlug.westbo.se>
18 * * fixed a buggy if-statement in same_check(), should have
19 * used ntohl() but didn't.
20 * * added support for multiple ranges. IPT_SAME_MAX_RANGE is
21 * defined in linux/include/linux/netfilter_ipv4/ipt_SAME.h
22 * and is currently set to 10.
23 * * added support for 1-address range, nice to have now that
24 * we have multiple ranges.
25 */
26#include <linux/types.h>
27#include <linux/ip.h>
28#include <linux/timer.h>
29#include <linux/module.h>
30#include <linux/netfilter.h>
31#include <linux/netdevice.h>
32#include <linux/if.h>
33#include <linux/inetdevice.h>
34#include <net/protocol.h>
35#include <net/checksum.h>
36#include <linux/netfilter_ipv4.h>
37#include <linux/netfilter_ipv4/ip_nat_rule.h>
38#include <linux/netfilter_ipv4/ipt_SAME.h>
39
40MODULE_LICENSE("GPL");
41MODULE_AUTHOR("Martin Josefsson <gandalf@wlug.westbo.se>");
42MODULE_DESCRIPTION("iptables special SNAT module for consistent sourceip");
43
44#if 0
45#define DEBUGP printk
46#else
47#define DEBUGP(format, args...)
48#endif
49
50static int
51same_check(const char *tablename,
52 const struct ipt_entry *e,
53 void *targinfo,
54 unsigned int targinfosize,
55 unsigned int hook_mask)
56{
57 unsigned int count, countess, rangeip, index = 0;
58 struct ipt_same_info *mr = targinfo;
59
60 mr->ipnum = 0;
61
62 if (strcmp(tablename, "nat") != 0) {
63 DEBUGP("same_check: bad table `%s'.\n", tablename);
64 return 0;
65 }
66 if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
67 DEBUGP("same_check: size %u.\n", targinfosize);
68 return 0;
69 }
70 if (hook_mask & ~(1 << NF_IP_PRE_ROUTING | 1 << NF_IP_POST_ROUTING)) {
71 DEBUGP("same_check: bad hooks %x.\n", hook_mask);
72 return 0;
73 }
74 if (mr->rangesize < 1) {
75 DEBUGP("same_check: need at least one dest range.\n");
76 return 0;
77 }
78 if (mr->rangesize > IPT_SAME_MAX_RANGE) {
79 DEBUGP("same_check: too many ranges specified, maximum "
80 "is %u ranges\n",
81 IPT_SAME_MAX_RANGE);
82 return 0;
83 }
84 for (count = 0; count < mr->rangesize; count++) {
85 if (ntohl(mr->range[count].min_ip) >
86 ntohl(mr->range[count].max_ip)) {
87 DEBUGP("same_check: min_ip is larger than max_ip in "
88 "range `%u.%u.%u.%u-%u.%u.%u.%u'.\n",
89 NIPQUAD(mr->range[count].min_ip),
90 NIPQUAD(mr->range[count].max_ip));
91 return 0;
92 }
93 if (!(mr->range[count].flags & IP_NAT_RANGE_MAP_IPS)) {
94 DEBUGP("same_check: bad MAP_IPS.\n");
95 return 0;
96 }
97 rangeip = (ntohl(mr->range[count].max_ip) -
98 ntohl(mr->range[count].min_ip) + 1);
99 mr->ipnum += rangeip;
100
101 DEBUGP("same_check: range %u, ipnum = %u\n", count, rangeip);
102 }
103 DEBUGP("same_check: total ipaddresses = %u\n", mr->ipnum);
104
105 mr->iparray = kmalloc((sizeof(u_int32_t) * mr->ipnum), GFP_KERNEL);
106 if (!mr->iparray) {
107 DEBUGP("same_check: Couldn't allocate %u bytes "
108 "for %u ipaddresses!\n",
109 (sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
110 return 0;
111 }
112 DEBUGP("same_check: Allocated %u bytes for %u ipaddresses.\n",
113 (sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
114
115 for (count = 0; count < mr->rangesize; count++) {
116 for (countess = ntohl(mr->range[count].min_ip);
117 countess <= ntohl(mr->range[count].max_ip);
118 countess++) {
119 mr->iparray[index] = countess;
120 DEBUGP("same_check: Added ipaddress `%u.%u.%u.%u' "
121 "in index %u.\n",
122 HIPQUAD(countess), index);
123 index++;
124 }
125 }
126 return 1;
127}
128
129static void
130same_destroy(void *targinfo,
131 unsigned int targinfosize)
132{
133 struct ipt_same_info *mr = targinfo;
134
135 kfree(mr->iparray);
136
137 DEBUGP("same_destroy: Deallocated %u bytes for %u ipaddresses.\n",
138 (sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
139}
140
141static unsigned int
142same_target(struct sk_buff **pskb,
143 const struct net_device *in,
144 const struct net_device *out,
145 unsigned int hooknum,
146 const void *targinfo,
147 void *userinfo)
148{
149 struct ip_conntrack *ct;
150 enum ip_conntrack_info ctinfo;
151 u_int32_t tmpip, aindex, new_ip;
152 const struct ipt_same_info *same = targinfo;
153 struct ip_nat_range newrange;
154 const struct ip_conntrack_tuple *t;
155
156 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING ||
157 hooknum == NF_IP_POST_ROUTING);
158 ct = ip_conntrack_get(*pskb, &ctinfo);
159
160 t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
161
162 /* Base new source on real src ip and optionally dst ip,
163 giving some hope for consistency across reboots.
164 Here we calculate the index in same->iparray which
165 holds the ipaddress we should use */
166
167 tmpip = ntohl(t->src.ip);
168
169 if (!(same->info & IPT_SAME_NODST))
170 tmpip += ntohl(t->dst.ip);
171
172 aindex = tmpip % same->ipnum;
173
174 new_ip = htonl(same->iparray[aindex]);
175
176 DEBUGP("ipt_SAME: src=%u.%u.%u.%u dst=%u.%u.%u.%u, "
177 "new src=%u.%u.%u.%u\n",
178 NIPQUAD(t->src.ip), NIPQUAD(t->dst.ip),
179 NIPQUAD(new_ip));
180
181 /* Transfer from original range. */
182 newrange = ((struct ip_nat_range)
183 { same->range[0].flags, new_ip, new_ip,
184 /* FIXME: Use ports from correct range! */
185 same->range[0].min, same->range[0].max });
186
187 /* Hand modified range to generic setup. */
188 return ip_nat_setup_info(ct, &newrange, hooknum);
189}
190
191static struct ipt_target same_reg = {
192 .name = "SAME",
193 .target = same_target,
194 .checkentry = same_check,
195 .destroy = same_destroy,
196 .me = THIS_MODULE,
197};
198
199static int __init init(void)
200{
201 return ipt_register_target(&same_reg);
202}
203
204static void __exit fini(void)
205{
206 ipt_unregister_target(&same_reg);
207}
208
209module_init(init);
210module_exit(fini);
211
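
same_check() above flattens every configured range into iparray, and same_target() picks an entry by taking the host-order source address (plus, unless --nodst was given, the destination address) modulo the number of pool addresses, so a given client/server pair keeps getting the same SNAT address. A user-space sketch of that selection; the pool contents and addresses are invented for the example:

#include <stdio.h>
#include <stdint.h>

/* Same index computation as same_target(): (src [+ dst]) % ipnum. */
static uint32_t same_pick(uint32_t src, uint32_t dst, int nodst,
			  const uint32_t *iparray, unsigned int ipnum)
{
	uint32_t key = src;

	if (!nodst)
		key += dst;

	return iparray[key % ipnum];
}

int main(void)
{
	/* Hypothetical pool of four SNAT addresses, stored in host order. */
	uint32_t pool[] = { 0x0A000001, 0x0A000002, 0x0A000003, 0x0A000004 };
	uint32_t src = 0xC0A80105;	/* 192.168.1.5 */
	uint32_t dst = 0x08080808;	/* 8.8.8.8 */

	/* The same pair always maps to the same pool entry. */
	printf("0x%08x\n", same_pick(src, dst, 0, pool, 4));
	printf("0x%08x\n", same_pick(src, dst, 0, pool, 4));
	return 0;
}
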
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
new file mode 100644
index 000000000000..1049050b2bfb
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -0,0 +1,262 @@
1/*
2 * This is a module which is used for setting the MSS option in TCP packets.
3 *
4 * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13
14#include <linux/ip.h>
15#include <net/tcp.h>
16
17#include <linux/netfilter_ipv4/ip_tables.h>
18#include <linux/netfilter_ipv4/ipt_TCPMSS.h>
19
20MODULE_LICENSE("GPL");
21MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
22MODULE_DESCRIPTION("iptables TCP MSS modification module");
23
24#if 0
25#define DEBUGP printk
26#else
27#define DEBUGP(format, args...)
28#endif
29
30static u_int16_t
31cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
32{
33 u_int32_t diffs[] = { oldvalinv, newval };
34 return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
35 oldcheck^0xFFFF));
36}
37
38static inline unsigned int
39optlen(const u_int8_t *opt, unsigned int offset)
40{
41 /* Beware zero-length options: make finite progress */
42 if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0) return 1;
43 else return opt[offset+1];
44}
45
46static unsigned int
47ipt_tcpmss_target(struct sk_buff **pskb,
48 const struct net_device *in,
49 const struct net_device *out,
50 unsigned int hooknum,
51 const void *targinfo,
52 void *userinfo)
53{
54 const struct ipt_tcpmss_info *tcpmssinfo = targinfo;
55 struct tcphdr *tcph;
56 struct iphdr *iph;
57 u_int16_t tcplen, newtotlen, oldval, newmss;
58 unsigned int i;
59 u_int8_t *opt;
60
61 if (!skb_ip_make_writable(pskb, (*pskb)->len))
62 return NF_DROP;
63
64 iph = (*pskb)->nh.iph;
65 tcplen = (*pskb)->len - iph->ihl*4;
66
67 tcph = (void *)iph + iph->ihl*4;
68
69	/* Since it passed flags test in tcp match, we know it is
70 not a fragment, and has data >= tcp header length. SYN
71 packets should not contain data: if they did, then we risk
72 running over MTU, sending Frag Needed and breaking things
73 badly. --RR */
74 if (tcplen != tcph->doff*4) {
75 if (net_ratelimit())
76 printk(KERN_ERR
77 "ipt_tcpmss_target: bad length (%d bytes)\n",
78 (*pskb)->len);
79 return NF_DROP;
80 }
81
82 if(tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) {
83 if(!(*pskb)->dst) {
84 if (net_ratelimit())
85 printk(KERN_ERR
86 "ipt_tcpmss_target: no dst?! can't determine path-MTU\n");
87 return NF_DROP; /* or IPT_CONTINUE ?? */
88 }
89
90 if(dst_mtu((*pskb)->dst) <= (sizeof(struct iphdr) + sizeof(struct tcphdr))) {
91 if (net_ratelimit())
92 printk(KERN_ERR
93 "ipt_tcpmss_target: unknown or invalid path-MTU (%d)\n", dst_mtu((*pskb)->dst));
94 return NF_DROP; /* or IPT_CONTINUE ?? */
95 }
96
97 newmss = dst_mtu((*pskb)->dst) - sizeof(struct iphdr) - sizeof(struct tcphdr);
98 } else
99 newmss = tcpmssinfo->mss;
100
101 opt = (u_int8_t *)tcph;
102 for (i = sizeof(struct tcphdr); i < tcph->doff*4; i += optlen(opt, i)){
103 if ((opt[i] == TCPOPT_MSS) &&
104 ((tcph->doff*4 - i) >= TCPOLEN_MSS) &&
105 (opt[i+1] == TCPOLEN_MSS)) {
106 u_int16_t oldmss;
107
108 oldmss = (opt[i+2] << 8) | opt[i+3];
109
110 if((tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) &&
111 (oldmss <= newmss))
112 return IPT_CONTINUE;
113
114 opt[i+2] = (newmss & 0xff00) >> 8;
115 opt[i+3] = (newmss & 0x00ff);
116
117 tcph->check = cheat_check(htons(oldmss)^0xFFFF,
118 htons(newmss),
119 tcph->check);
120
121 DEBUGP(KERN_INFO "ipt_tcpmss_target: %u.%u.%u.%u:%hu"
122 "->%u.%u.%u.%u:%hu changed TCP MSS option"
123 " (from %u to %u)\n",
124 NIPQUAD((*pskb)->nh.iph->saddr),
125 ntohs(tcph->source),
126 NIPQUAD((*pskb)->nh.iph->daddr),
127 ntohs(tcph->dest),
128 oldmss, newmss);
129 goto retmodified;
130 }
131 }
132
133 /*
134	 * MSS option not found?! Add it.
135 */
136 if (skb_tailroom((*pskb)) < TCPOLEN_MSS) {
137 struct sk_buff *newskb;
138
139 newskb = skb_copy_expand(*pskb, skb_headroom(*pskb),
140 TCPOLEN_MSS, GFP_ATOMIC);
141 if (!newskb) {
142 if (net_ratelimit())
143 printk(KERN_ERR "ipt_tcpmss_target:"
144 " unable to allocate larger skb\n");
145 return NF_DROP;
146 }
147
148 kfree_skb(*pskb);
149 *pskb = newskb;
150 iph = (*pskb)->nh.iph;
151 tcph = (void *)iph + iph->ihl*4;
152 }
153
154 skb_put((*pskb), TCPOLEN_MSS);
155
156 opt = (u_int8_t *)tcph + sizeof(struct tcphdr);
157 memmove(opt + TCPOLEN_MSS, opt, tcplen - sizeof(struct tcphdr));
158
159 tcph->check = cheat_check(htons(tcplen) ^ 0xFFFF,
160 htons(tcplen + TCPOLEN_MSS), tcph->check);
161 tcplen += TCPOLEN_MSS;
162
163 opt[0] = TCPOPT_MSS;
164 opt[1] = TCPOLEN_MSS;
165 opt[2] = (newmss & 0xff00) >> 8;
166 opt[3] = (newmss & 0x00ff);
167
168 tcph->check = cheat_check(~0, *((u_int32_t *)opt), tcph->check);
169
170 oldval = ((u_int16_t *)tcph)[6];
171 tcph->doff += TCPOLEN_MSS/4;
172 tcph->check = cheat_check(oldval ^ 0xFFFF,
173 ((u_int16_t *)tcph)[6], tcph->check);
174
175 newtotlen = htons(ntohs(iph->tot_len) + TCPOLEN_MSS);
176 iph->check = cheat_check(iph->tot_len ^ 0xFFFF,
177 newtotlen, iph->check);
178 iph->tot_len = newtotlen;
179
180 DEBUGP(KERN_INFO "ipt_tcpmss_target: %u.%u.%u.%u:%hu"
181 "->%u.%u.%u.%u:%hu added TCP MSS option (%u)\n",
182 NIPQUAD((*pskb)->nh.iph->saddr),
183 ntohs(tcph->source),
184 NIPQUAD((*pskb)->nh.iph->daddr),
185 ntohs(tcph->dest),
186 newmss);
187
188 retmodified:
189 /* We never hw checksum SYN packets. */
190 BUG_ON((*pskb)->ip_summed == CHECKSUM_HW);
191
192 (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED;
193 return IPT_CONTINUE;
194}
195
196#define TH_SYN 0x02
197
198static inline int find_syn_match(const struct ipt_entry_match *m)
199{
200 const struct ipt_tcp *tcpinfo = (const struct ipt_tcp *)m->data;
201
202 if (strcmp(m->u.kernel.match->name, "tcp") == 0
203 && (tcpinfo->flg_cmp & TH_SYN)
204 && !(tcpinfo->invflags & IPT_TCP_INV_FLAGS))
205 return 1;
206
207 return 0;
208}
209
210/* Must specify -p tcp --syn/--tcp-flags SYN */
211static int
212ipt_tcpmss_checkentry(const char *tablename,
213 const struct ipt_entry *e,
214 void *targinfo,
215 unsigned int targinfosize,
216 unsigned int hook_mask)
217{
218 const struct ipt_tcpmss_info *tcpmssinfo = targinfo;
219
220 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tcpmss_info))) {
221 DEBUGP("ipt_tcpmss_checkentry: targinfosize %u != %u\n",
222 targinfosize, IPT_ALIGN(sizeof(struct ipt_tcpmss_info)));
223 return 0;
224 }
225
226
227 if((tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) &&
228 ((hook_mask & ~((1 << NF_IP_FORWARD)
229 | (1 << NF_IP_LOCAL_OUT)
230 | (1 << NF_IP_POST_ROUTING))) != 0)) {
231 printk("TCPMSS: path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n");
232 return 0;
233 }
234
235 if (e->ip.proto == IPPROTO_TCP
236 && !(e->ip.invflags & IPT_INV_PROTO)
237 && IPT_MATCH_ITERATE(e, find_syn_match))
238 return 1;
239
240 printk("TCPMSS: Only works on TCP SYN packets\n");
241 return 0;
242}
243
244static struct ipt_target ipt_tcpmss_reg = {
245 .name = "TCPMSS",
246 .target = ipt_tcpmss_target,
247 .checkentry = ipt_tcpmss_checkentry,
248 .me = THIS_MODULE,
249};
250
251static int __init init(void)
252{
253 return ipt_register_target(&ipt_tcpmss_reg);
254}
255
256static void __exit fini(void)
257{
258 ipt_unregister_target(&ipt_tcpmss_reg);
259}
260
261module_init(init);
262module_exit(fini);
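
When IPT_TCPMSS_CLAMP_PMTU is set, ipt_tcpmss_target() above rewrites the MSS option to the path MTU minus the minimal IP and TCP header sizes, and only if the advertised MSS is larger. A user-space sketch of that clamping and of the option-byte rewrite; the incremental checksum fixups done by cheat_check() are deliberately omitted, and the 1492-byte MTU is just an example:

#include <stdio.h>
#include <stdint.h>

#define IP_HDR_MIN	20	/* sizeof(struct iphdr) without options */
#define TCP_HDR_MIN	20	/* sizeof(struct tcphdr) without options */

/* Clamp the two MSS option value bytes to what the path MTU allows. */
static int clamp_mss(uint8_t *opt_val, unsigned int path_mtu)
{
	uint16_t oldmss = (opt_val[0] << 8) | opt_val[1];
	uint16_t newmss = path_mtu - IP_HDR_MIN - TCP_HDR_MIN;

	if (oldmss <= newmss)
		return 0;			/* already small enough */

	opt_val[0] = newmss >> 8;
	opt_val[1] = newmss & 0xff;
	return 1;
}

int main(void)
{
	uint8_t mss_bytes[2] = { 0x05, 0xb4 };	/* 1460, the usual Ethernet MSS */

	/* Hypothetical PPPoE-sized path MTU of 1492 -> MSS becomes 1452. */
	clamp_mss(mss_bytes, 1492);
	printf("%u\n", (mss_bytes[0] << 8) | mss_bytes[1]);
	return 0;
}
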
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
new file mode 100644
index 000000000000..85c70d240f8b
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_TOS.c
@@ -0,0 +1,105 @@
1/* This is a module which is used for setting the TOS field of a packet. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/ip.h>
14#include <net/checksum.h>
15
16#include <linux/netfilter_ipv4/ip_tables.h>
17#include <linux/netfilter_ipv4/ipt_TOS.h>
18
19MODULE_LICENSE("GPL");
20MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
21MODULE_DESCRIPTION("iptables TOS mangling module");
22
23static unsigned int
24target(struct sk_buff **pskb,
25 const struct net_device *in,
26 const struct net_device *out,
27 unsigned int hooknum,
28 const void *targinfo,
29 void *userinfo)
30{
31 const struct ipt_tos_target_info *tosinfo = targinfo;
32
33 if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) {
34 u_int16_t diffs[2];
35
36 if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
37 return NF_DROP;
38
39 diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
40 (*pskb)->nh.iph->tos
41 = ((*pskb)->nh.iph->tos & IPTOS_PREC_MASK)
42 | tosinfo->tos;
43 diffs[1] = htons((*pskb)->nh.iph->tos);
44 (*pskb)->nh.iph->check
45 = csum_fold(csum_partial((char *)diffs,
46 sizeof(diffs),
47 (*pskb)->nh.iph->check
48 ^0xFFFF));
49 (*pskb)->nfcache |= NFC_ALTERED;
50 }
51 return IPT_CONTINUE;
52}
53
54static int
55checkentry(const char *tablename,
56 const struct ipt_entry *e,
57 void *targinfo,
58 unsigned int targinfosize,
59 unsigned int hook_mask)
60{
61 const u_int8_t tos = ((struct ipt_tos_target_info *)targinfo)->tos;
62
63 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tos_target_info))) {
64 printk(KERN_WARNING "TOS: targinfosize %u != %Zu\n",
65 targinfosize,
66 IPT_ALIGN(sizeof(struct ipt_tos_target_info)));
67 return 0;
68 }
69
70 if (strcmp(tablename, "mangle") != 0) {
71 printk(KERN_WARNING "TOS: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
72 return 0;
73 }
74
75 if (tos != IPTOS_LOWDELAY
76 && tos != IPTOS_THROUGHPUT
77 && tos != IPTOS_RELIABILITY
78 && tos != IPTOS_MINCOST
79 && tos != IPTOS_NORMALSVC) {
80 printk(KERN_WARNING "TOS: bad tos value %#x\n", tos);
81 return 0;
82 }
83
84 return 1;
85}
86
87static struct ipt_target ipt_tos_reg = {
88 .name = "TOS",
89 .target = target,
90 .checkentry = checkentry,
91 .me = THIS_MODULE,
92};
93
94static int __init init(void)
95{
96 return ipt_register_target(&ipt_tos_reg);
97}
98
99static void __exit fini(void)
100{
101 ipt_unregister_target(&ipt_tos_reg);
102}
103
104module_init(init);
105module_exit(fini);
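
target() above replaces only the TOS bits and keeps the precedence bits, then patches the header checksum incrementally. A user-space sketch of the bit manipulation alone (the checksum update is left out; the mask values mirror the standard IPTOS_* definitions, and the sample byte values are invented):

#include <stdio.h>
#include <stdint.h>

#define IPTOS_TOS_MASK	0x1E	/* the four TOS bits */
#define IPTOS_PREC_MASK	0xE0	/* the three precedence bits */

/* Keep the precedence bits, replace the TOS bits. */
static uint8_t set_tos(uint8_t old_tos, uint8_t new_tos)
{
	if ((old_tos & IPTOS_TOS_MASK) == new_tos)
		return old_tos;			/* already set, nothing to change */

	return (old_tos & IPTOS_PREC_MASK) | new_tos;
}

int main(void)
{
	/* Hypothetical packet: precedence 0xA0 with "throughput" (0x08),
	 * rewritten to "low delay" (0x10). */
	printf("0x%02x\n", set_tos(0xA8, 0x10));	/* prints 0xb0 */
	return 0;
}
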
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
new file mode 100644
index 000000000000..6f2cefbe16cd
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -0,0 +1,419 @@
1/*
2 * netfilter module for userspace packet logging daemons
3 *
4 * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
5 *
6 * 2000/09/22 ulog-cprange feature added
7 * 2001/01/04 in-kernel queue as proposed by Sebastian Zander
8 * <zander@fokus.gmd.de>
9 * 2001/01/30 per-rule nlgroup conflicts with global queue.
10 * nlgroup now global (sysctl)
11 * 2001/04/19 ulog-queue reworked, now fixed buffer size specified at
12 * module loadtime -HW
13 * 2002/07/07 remove broken nflog_rcv() function -HW
14 * 2002/08/29 fix shifted/unshifted nlgroup bug -HW
15 * 2002/10/30 fix uninitialized mac_len field - <Anders K. Pedersen>
16 * 2004/10/25 fix erroneous calculation of 'len' parameter to NLMSG_PUT
17 * resulting in bogus 'error during NLMSG_PUT' messages.
18 *
19 * (C) 1999-2001 Paul `Rusty' Russell
20 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
21 *
22 * This program is free software; you can redistribute it and/or modify
23 * it under the terms of the GNU General Public License version 2 as
24 * published by the Free Software Foundation.
25 *
26 * This module accepts two parameters:
27 *
28 * nlbufsiz:
29 * The parameter specifies how big the buffer for each netlink multicast
30 * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will
31 * get accumulated in the kernel until they are sent to userspace. It is
32 * NOT possible to allocate more than 128kB, and it is strongly discouraged,
33 * because atomically allocating 128kB inside the network rx softirq is not
34 * reliable. Please also keep in mind that this buffer size is allocated for
35 * each nlgroup you are using, so the total kernel memory usage increases
36 * by that factor.
37 *
38 * flushtimeout:
39 * Specifies after how many hundredths of a second the queue should be
40 * flushed even if it is not full yet.
41 *
42 * ipt_ULOG.c,v 1.22 2002/10/30 09:07:31 laforge Exp
43 */
44
45#include <linux/module.h>
46#include <linux/config.h>
47#include <linux/spinlock.h>
48#include <linux/socket.h>
49#include <linux/skbuff.h>
50#include <linux/kernel.h>
51#include <linux/timer.h>
52#include <linux/netlink.h>
53#include <linux/netdevice.h>
54#include <linux/mm.h>
55#include <linux/moduleparam.h>
56#include <linux/netfilter.h>
57#include <linux/netfilter_ipv4/ip_tables.h>
58#include <linux/netfilter_ipv4/ipt_ULOG.h>
59#include <linux/netfilter_ipv4/lockhelp.h>
60#include <net/sock.h>
61#include <linux/bitops.h>
62
63MODULE_LICENSE("GPL");
64MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
65MODULE_DESCRIPTION("iptables userspace logging module");
66
67#define ULOG_NL_EVENT 111 /* Harald's favorite number */
68#define ULOG_MAXNLGROUPS	32		/* number of nlgroups */
69
70#if 0
71#define DEBUGP(format, args...) printk("%s:%s:" format, \
72 __FILE__, __FUNCTION__ , ## args)
73#else
74#define DEBUGP(format, args...)
75#endif
76
77#define PRINTR(format, args...) do { if (net_ratelimit()) printk(format , ## args); } while (0)
78
79static unsigned int nlbufsiz = 4096;
80module_param(nlbufsiz, uint, 0600); /* FIXME: Check size < 128k --RR */
81MODULE_PARM_DESC(nlbufsiz, "netlink buffer size");
82
83static unsigned int flushtimeout = 10;
84module_param(flushtimeout, int, 0600);
85MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)");
86
87static unsigned int nflog = 1;
88module_param(nflog, int, 0400);
89MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
90
91/* global data structures */
92
93typedef struct {
94 unsigned int qlen; /* number of nlmsgs' in the skb */
95 struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */
96 struct sk_buff *skb; /* the pre-allocated skb */
97 struct timer_list timer; /* the timer function */
98} ulog_buff_t;
99
100static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */
101
102static struct sock *nflognl; /* our socket */
103static DECLARE_LOCK(ulog_lock); /* spinlock */
104
105/* send one ulog_buff_t to userspace */
106static void ulog_send(unsigned int nlgroupnum)
107{
108 ulog_buff_t *ub = &ulog_buffers[nlgroupnum];
109
110 if (timer_pending(&ub->timer)) {
111 DEBUGP("ipt_ULOG: ulog_send: timer was pending, deleting\n");
112 del_timer(&ub->timer);
113 }
114
115 /* last nlmsg needs NLMSG_DONE */
116 if (ub->qlen > 1)
117 ub->lastnlh->nlmsg_type = NLMSG_DONE;
118
119 NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum);
120 DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n",
121 ub->qlen, nlgroupnum);
122 netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC);
123
124 ub->qlen = 0;
125 ub->skb = NULL;
126 ub->lastnlh = NULL;
127
128}
129
130
131/* timer function to flush queue in flushtimeout time */
132static void ulog_timer(unsigned long data)
133{
134 DEBUGP("ipt_ULOG: timer function called, calling ulog_send\n");
135
136 /* lock to protect against somebody modifying our structure
137 * from ipt_ulog_target at the same time */
138 LOCK_BH(&ulog_lock);
139 ulog_send(data);
140 UNLOCK_BH(&ulog_lock);
141}
142
143static struct sk_buff *ulog_alloc_skb(unsigned int size)
144{
145 struct sk_buff *skb;
146
147 /* alloc skb which should be big enough for a whole
148 * multipart message. WARNING: has to be <= 131000
149 * due to slab allocator restrictions */
150
151 skb = alloc_skb(nlbufsiz, GFP_ATOMIC);
152 if (!skb) {
153 PRINTR("ipt_ULOG: can't alloc whole buffer %ub!\n",
154 nlbufsiz);
155
156 /* try to allocate only as much as we need for
157 * current packet */
158
159 skb = alloc_skb(size, GFP_ATOMIC);
160 if (!skb)
161 PRINTR("ipt_ULOG: can't even allocate %ub\n", size);
162 }
163
164 return skb;
165}
166
167static void ipt_ulog_packet(unsigned int hooknum,
168 const struct sk_buff *skb,
169 const struct net_device *in,
170 const struct net_device *out,
171 const struct ipt_ulog_info *loginfo,
172 const char *prefix)
173{
174 ulog_buff_t *ub;
175 ulog_packet_msg_t *pm;
176 size_t size, copy_len;
177 struct nlmsghdr *nlh;
178
179 /* ffs == find first bit set, necessary because userspace
180 * is already shifting groupnumber, but we need unshifted.
181 * ffs() returns [1..32], we need [0..31] */
182 unsigned int groupnum = ffs(loginfo->nl_group) - 1;
183
184 /* calculate the size of the skb needed */
185 if ((loginfo->copy_range == 0) ||
186 (loginfo->copy_range > skb->len)) {
187 copy_len = skb->len;
188 } else {
189 copy_len = loginfo->copy_range;
190 }
191
192 size = NLMSG_SPACE(sizeof(*pm) + copy_len);
193
194 ub = &ulog_buffers[groupnum];
195
196 LOCK_BH(&ulog_lock);
197
198 if (!ub->skb) {
199 if (!(ub->skb = ulog_alloc_skb(size)))
200 goto alloc_failure;
201 } else if (ub->qlen >= loginfo->qthreshold ||
202 size > skb_tailroom(ub->skb)) {
203 /* either the queue len is too high or we don't have
204 * enough room in nlskb left. send it to userspace. */
205
206 ulog_send(groupnum);
207
208 if (!(ub->skb = ulog_alloc_skb(size)))
209 goto alloc_failure;
210 }
211
212 DEBUGP("ipt_ULOG: qlen %d, qthreshold %d\n", ub->qlen,
213 loginfo->qthreshold);
214
215 /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */
216 nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
217 sizeof(*pm)+copy_len);
218 ub->qlen++;
219
220 pm = NLMSG_DATA(nlh);
221
222 /* We might not have a timestamp, get one */
223 if (skb->stamp.tv_sec == 0)
224 do_gettimeofday((struct timeval *)&skb->stamp);
225
226 /* copy hook, prefix, timestamp, payload, etc. */
227 pm->data_len = copy_len;
228 pm->timestamp_sec = skb->stamp.tv_sec;
229 pm->timestamp_usec = skb->stamp.tv_usec;
230 pm->mark = skb->nfmark;
231 pm->hook = hooknum;
232 if (prefix != NULL)
233 strncpy(pm->prefix, prefix, sizeof(pm->prefix));
234 else if (loginfo->prefix[0] != '\0')
235 strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix));
236 else
237 *(pm->prefix) = '\0';
238
239 if (in && in->hard_header_len > 0
240 && skb->mac.raw != (void *) skb->nh.iph
241 && in->hard_header_len <= ULOG_MAC_LEN) {
242 memcpy(pm->mac, skb->mac.raw, in->hard_header_len);
243 pm->mac_len = in->hard_header_len;
244 } else
245 pm->mac_len = 0;
246
247 if (in)
248 strncpy(pm->indev_name, in->name, sizeof(pm->indev_name));
249 else
250 pm->indev_name[0] = '\0';
251
252 if (out)
253 strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
254 else
255 pm->outdev_name[0] = '\0';
256
257 /* copy_len <= skb->len, so can't fail. */
258 if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0)
259 BUG();
260
261 /* check if we are building multi-part messages */
262 if (ub->qlen > 1) {
263 ub->lastnlh->nlmsg_flags |= NLM_F_MULTI;
264 }
265
266 ub->lastnlh = nlh;
267
268 /* if timer isn't already running, start it */
269 if (!timer_pending(&ub->timer)) {
270 ub->timer.expires = jiffies + flushtimeout * HZ / 100;
271 add_timer(&ub->timer);
272 }
273
274 /* if threshold is reached, send message to userspace */
275 if (ub->qlen >= loginfo->qthreshold) {
276 if (loginfo->qthreshold > 1)
277 nlh->nlmsg_type = NLMSG_DONE;
278 ulog_send(groupnum);
279 }
280
281 UNLOCK_BH(&ulog_lock);
282
283 return;
284
285nlmsg_failure:
286 PRINTR("ipt_ULOG: error during NLMSG_PUT\n");
287
288alloc_failure:
289 PRINTR("ipt_ULOG: Error building netlink message\n");
290
291 UNLOCK_BH(&ulog_lock);
292}
293
294static unsigned int ipt_ulog_target(struct sk_buff **pskb,
295 const struct net_device *in,
296 const struct net_device *out,
297 unsigned int hooknum,
298 const void *targinfo, void *userinfo)
299{
300 struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo;
301
302 ipt_ulog_packet(hooknum, *pskb, in, out, loginfo, NULL);
303
304 return IPT_CONTINUE;
305}
306
307static void ipt_logfn(unsigned int hooknum,
308 const struct sk_buff *skb,
309 const struct net_device *in,
310 const struct net_device *out,
311 const char *prefix)
312{
313 struct ipt_ulog_info loginfo = {
314 .nl_group = ULOG_DEFAULT_NLGROUP,
315 .copy_range = 0,
316 .qthreshold = ULOG_DEFAULT_QTHRESHOLD,
317 .prefix = ""
318 };
319
320 ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
321}
322
323static int ipt_ulog_checkentry(const char *tablename,
324 const struct ipt_entry *e,
325 void *targinfo,
326 unsigned int targinfosize,
327 unsigned int hookmask)
328{
329 struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo;
330
331 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_ulog_info))) {
332		DEBUGP("ipt_ULOG: bad targinfosize %u\n", targinfosize);
333 return 0;
334 }
335
336 if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
337 DEBUGP("ipt_ULOG: prefix term %i\n",
338 loginfo->prefix[sizeof(loginfo->prefix) - 1]);
339 return 0;
340 }
341
342 if (loginfo->qthreshold > ULOG_MAX_QLEN) {
343 DEBUGP("ipt_ULOG: queue threshold %i > MAX_QLEN\n",
344 loginfo->qthreshold);
345 return 0;
346 }
347
348 return 1;
349}
350
351static struct ipt_target ipt_ulog_reg = {
352 .name = "ULOG",
353 .target = ipt_ulog_target,
354 .checkentry = ipt_ulog_checkentry,
355 .me = THIS_MODULE,
356};
357
358static int __init init(void)
359{
360 int i;
361
362 DEBUGP("ipt_ULOG: init module\n");
363
364 if (nlbufsiz >= 128*1024) {
365 printk("Netlink buffer has to be <= 128kB\n");
366 return -EINVAL;
367 }
368
369 /* initialize ulog_buffers */
370 for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
371 init_timer(&ulog_buffers[i].timer);
372 ulog_buffers[i].timer.function = ulog_timer;
373 ulog_buffers[i].timer.data = i;
374 }
375
376 nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL);
377 if (!nflognl)
378 return -ENOMEM;
379
380 if (ipt_register_target(&ipt_ulog_reg) != 0) {
381 sock_release(nflognl->sk_socket);
382 return -EINVAL;
383 }
384 if (nflog)
385 nf_log_register(PF_INET, &ipt_logfn);
386
387 return 0;
388}
389
390static void __exit fini(void)
391{
392 ulog_buff_t *ub;
393 int i;
394
395 DEBUGP("ipt_ULOG: cleanup_module\n");
396
397 if (nflog)
398 nf_log_unregister(PF_INET, &ipt_logfn);
399 ipt_unregister_target(&ipt_ulog_reg);
400 sock_release(nflognl->sk_socket);
401
402 /* remove pending timers and free allocated skb's */
403 for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
404 ub = &ulog_buffers[i];
405 if (timer_pending(&ub->timer)) {
406 DEBUGP("timer was pending, deleting\n");
407 del_timer(&ub->timer);
408 }
409
410 if (ub->skb) {
411 kfree_skb(ub->skb);
412 ub->skb = NULL;
413 }
414 }
415
416}
417
418module_init(init);
419module_exit(fini);
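
Two small pieces of arithmetic above are easy to miss: the nlgroup coming from user space is a bitmask (group n is bit n), while the kernel buffers are indexed by the plain group number, hence the ffs() - 1 conversion in ipt_ulog_packet(); and the flushtimeout module parameter, given in hundredths of a second, is converted to jiffies for the flush timer. A user-space sketch of both conversions; the HZ value and the sample group bitmask are assumptions for the demo:

#include <stdio.h>
#include <strings.h>		/* ffs() */

#define HZ 1000			/* illustrative; the real value depends on kernel config */

int main(void)
{
	unsigned int nl_group = 0x20;		/* bitmask with bit 5 set */
	unsigned int flushtimeout = 10;		/* hundredths of a second */

	/* ffs() returns 1..32 for the lowest set bit; buffers are indexed 0..31. */
	unsigned int groupnum = ffs(nl_group) - 1;

	/* Same arithmetic as ub->timer.expires = jiffies + flushtimeout * HZ / 100. */
	unsigned long delay = (unsigned long)flushtimeout * HZ / 100;

	printf("group index %u, flush after %lu jiffies\n", groupnum, delay);
	return 0;
}
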
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
new file mode 100644
index 000000000000..f5909a4c3fc7
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -0,0 +1,77 @@
1/*
2 * iptables module to match inet_addr_type() of an ip.
3 *
4 * Copyright (c) 2004 Patrick McHardy <kaber@trash.net>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/skbuff.h>
14#include <linux/netdevice.h>
15#include <linux/ip.h>
16#include <net/route.h>
17
18#include <linux/netfilter_ipv4/ipt_addrtype.h>
19#include <linux/netfilter_ipv4/ip_tables.h>
20
21MODULE_LICENSE("GPL");
22MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
23MODULE_DESCRIPTION("iptables addrtype match");
24
25static inline int match_type(u_int32_t addr, u_int16_t mask)
26{
27 return !!(mask & (1 << inet_addr_type(addr)));
28}
29
30static int match(const struct sk_buff *skb, const struct net_device *in,
31 const struct net_device *out, const void *matchinfo,
32 int offset, int *hotdrop)
33{
34 const struct ipt_addrtype_info *info = matchinfo;
35 const struct iphdr *iph = skb->nh.iph;
36 int ret = 1;
37
38 if (info->source)
39 ret &= match_type(iph->saddr, info->source)^info->invert_source;
40 if (info->dest)
41 ret &= match_type(iph->daddr, info->dest)^info->invert_dest;
42
43 return ret;
44}
45
46static int checkentry(const char *tablename, const struct ipt_ip *ip,
47 void *matchinfo, unsigned int matchsize,
48 unsigned int hook_mask)
49{
50 if (matchsize != IPT_ALIGN(sizeof(struct ipt_addrtype_info))) {
51		printk(KERN_ERR "ipt_addrtype: invalid size (%u != %Zu).\n",
52 matchsize, IPT_ALIGN(sizeof(struct ipt_addrtype_info)));
53 return 0;
54 }
55
56 return 1;
57}
58
59static struct ipt_match addrtype_match = {
60 .name = "addrtype",
61 .match = match,
62 .checkentry = checkentry,
63 .me = THIS_MODULE
64};
65
66static int __init init(void)
67{
68 return ipt_register_match(&addrtype_match);
69}
70
71static void __exit fini(void)
72{
73 ipt_unregister_match(&addrtype_match);
74}
75
76module_init(init);
77module_exit(fini);
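
match_type() above treats the rule's address-type selection as a bitmask indexed by the kernel's route type code, so membership is a single shift and AND. A user-space sketch of that test; the two type codes are illustrative stand-ins for the RTN_* constants, not their guaranteed values:

#include <stdio.h>

#define EXAMPLE_TYPE_UNICAST	1	/* stand-in for a route type code */
#define EXAMPLE_TYPE_LOCAL	2	/* stand-in for another route type code */

/* One bit per route type; a single AND tests membership. */
static int match_type(unsigned int addr_type, unsigned short mask)
{
	return !!(mask & (1 << addr_type));
}

int main(void)
{
	/* Hypothetical rule that accepts LOCAL addresses only. */
	unsigned short mask = 1 << EXAMPLE_TYPE_LOCAL;

	printf("%d %d\n", match_type(EXAMPLE_TYPE_LOCAL, mask),
	       match_type(EXAMPLE_TYPE_UNICAST, mask));		/* prints 1 0 */
	return 0;
}
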
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
new file mode 100644
index 000000000000..a0fea847cb72
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -0,0 +1,117 @@
1/* Kernel module to match AH parameters. */
2/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <linux/ip.h>
12
13#include <linux/netfilter_ipv4/ipt_ah.h>
14#include <linux/netfilter_ipv4/ip_tables.h>
15
16MODULE_LICENSE("GPL");
17MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
18MODULE_DESCRIPTION("iptables AH SPI match module");
19
20#ifdef DEBUG_CONNTRACK
21#define duprintf(format, args...) printk(format , ## args)
22#else
23#define duprintf(format, args...)
24#endif
25
26/* Returns 1 if the spi is matched by the range, 0 otherwise */
27static inline int
28spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert)
29{
30 int r=0;
31 duprintf("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ',
32 min,spi,max);
33 r=(spi >= min && spi <= max) ^ invert;
34 duprintf(" result %s\n",r? "PASS" : "FAILED");
35 return r;
36}
37
38static int
39match(const struct sk_buff *skb,
40 const struct net_device *in,
41 const struct net_device *out,
42 const void *matchinfo,
43 int offset,
44 int *hotdrop)
45{
46 struct ip_auth_hdr _ahdr, *ah;
47 const struct ipt_ah *ahinfo = matchinfo;
48
49 /* Must not be a fragment. */
50 if (offset)
51 return 0;
52
53 ah = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
54 sizeof(_ahdr), &_ahdr);
55 if (ah == NULL) {
56 /* We've been asked to examine this packet, and we
57 * can't. Hence, no choice but to drop.
58 */
59 duprintf("Dropping evil AH tinygram.\n");
60 *hotdrop = 1;
61 return 0;
62 }
63
64 return spi_match(ahinfo->spis[0], ahinfo->spis[1],
65 ntohl(ah->spi),
66 !!(ahinfo->invflags & IPT_AH_INV_SPI));
67}
68
69/* Called when user tries to insert an entry of this type. */
70static int
71checkentry(const char *tablename,
72 const struct ipt_ip *ip,
73 void *matchinfo,
74 unsigned int matchinfosize,
75 unsigned int hook_mask)
76{
77 const struct ipt_ah *ahinfo = matchinfo;
78
79 /* Must specify proto == AH, and no unknown invflags */
80 if (ip->proto != IPPROTO_AH || (ip->invflags & IPT_INV_PROTO)) {
81 duprintf("ipt_ah: Protocol %u != %u\n", ip->proto,
82 IPPROTO_AH);
83 return 0;
84 }
85 if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_ah))) {
86 duprintf("ipt_ah: matchsize %u != %u\n",
87 matchinfosize, IPT_ALIGN(sizeof(struct ipt_ah)));
88 return 0;
89 }
90 if (ahinfo->invflags & ~IPT_AH_INV_MASK) {
91 duprintf("ipt_ah: unknown flags %X\n",
92 ahinfo->invflags);
93 return 0;
94 }
95
96 return 1;
97}
98
99static struct ipt_match ah_match = {
100 .name = "ah",
101 .match = &match,
102 .checkentry = &checkentry,
103 .me = THIS_MODULE,
104};
105
106static int __init init(void)
107{
108 return ipt_register_match(&ah_match);
109}
110
111static void __exit cleanup(void)
112{
113 ipt_unregister_match(&ah_match);
114}
115
116module_init(init);
117module_exit(cleanup);
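
spi_match() above is a plain inclusive range check XORed with the inversion flag, the pattern most of the simple matches in this directory share. A user-space sketch with a made-up SPI range:

#include <stdio.h>
#include <stdint.h>

/* Inclusive range test, optionally inverted. */
static int spi_match(uint32_t min, uint32_t max, uint32_t spi, int invert)
{
	return (spi >= min && spi <= max) ^ invert;
}

int main(void)
{
	/* Hypothetical rule: --ahspi 256:512, optionally negated. */
	printf("%d\n", spi_match(256, 512, 300, 0));	/* 1: inside the range */
	printf("%d\n", spi_match(256, 512, 300, 1));	/* 0: inverted */
	return 0;
}
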
diff --git a/net/ipv4/netfilter/ipt_comment.c b/net/ipv4/netfilter/ipt_comment.c
new file mode 100644
index 000000000000..6b76a1ea5245
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_comment.c
@@ -0,0 +1,59 @@
1/*
2 * Implements a dummy match to allow attaching comments to rules
3 *
4 * 2003-05-13 Brad Fisher (brad@info-link.net)
5 */
6
7#include <linux/module.h>
8#include <linux/skbuff.h>
9#include <linux/netfilter_ipv4/ip_tables.h>
10#include <linux/netfilter_ipv4/ipt_comment.h>
11
12MODULE_AUTHOR("Brad Fisher <brad@info-link.net>");
13MODULE_DESCRIPTION("iptables comment match module");
14MODULE_LICENSE("GPL");
15
16static int
17match(const struct sk_buff *skb,
18 const struct net_device *in,
19 const struct net_device *out,
20 const void *matchinfo,
21 int offset,
22 int *hotdrop)
23{
24 /* We always match */
25 return 1;
26}
27
28static int
29checkentry(const char *tablename,
30 const struct ipt_ip *ip,
31 void *matchinfo,
32 unsigned int matchsize,
33 unsigned int hook_mask)
34{
35 /* Check the size */
36 if (matchsize != IPT_ALIGN(sizeof(struct ipt_comment_info)))
37 return 0;
38 return 1;
39}
40
41static struct ipt_match comment_match = {
42 .name = "comment",
43 .match = match,
44 .checkentry = checkentry,
45 .me = THIS_MODULE
46};
47
48static int __init init(void)
49{
50 return ipt_register_match(&comment_match);
51}
52
53static void __exit fini(void)
54{
55 ipt_unregister_match(&comment_match);
56}
57
58module_init(init);
59module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c
new file mode 100644
index 000000000000..2706f96cea55
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_connmark.c
@@ -0,0 +1,81 @@
1/* This kernel module matches connection mark values set by the
2 * CONNMARK target
3 *
4 * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
5 * by Henrik Nordstrom <hno@marasystems.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/module.h>
23#include <linux/skbuff.h>
24
25MODULE_AUTHOR("Henrik Nordstrom <hno@marasystems.com>");
26MODULE_DESCRIPTION("IP tables connmark match module");
27MODULE_LICENSE("GPL");
28
29#include <linux/netfilter_ipv4/ip_tables.h>
30#include <linux/netfilter_ipv4/ipt_connmark.h>
31#include <linux/netfilter_ipv4/ip_conntrack.h>
32
33static int
34match(const struct sk_buff *skb,
35 const struct net_device *in,
36 const struct net_device *out,
37 const void *matchinfo,
38 int offset,
39 int *hotdrop)
40{
41 const struct ipt_connmark_info *info = matchinfo;
42 enum ip_conntrack_info ctinfo;
43 struct ip_conntrack *ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
44 if (!ct)
45 return 0;
46
47 return ((ct->mark & info->mask) == info->mark) ^ info->invert;
48}
49
50static int
51checkentry(const char *tablename,
52 const struct ipt_ip *ip,
53 void *matchinfo,
54 unsigned int matchsize,
55 unsigned int hook_mask)
56{
57 if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info)))
58 return 0;
59
60 return 1;
61}
62
63static struct ipt_match connmark_match = {
64 .name = "connmark",
65 .match = &match,
66 .checkentry = &checkentry,
67 .me = THIS_MODULE
68};
69
70static int __init init(void)
71{
72 return ipt_register_match(&connmark_match);
73}
74
75static void __exit fini(void)
76{
77 ipt_unregister_match(&connmark_match);
78}
79
80module_init(init);
81module_exit(fini);
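
The connmark test above reduces to a single expression: ((ct->mark & info->mask) == info->mark) ^ info->invert. A minimal standalone sketch (hypothetical mark/mask values, plain userspace C, no kernel headers) shows how mask, mark and inversion interact:

#include <stdio.h>

/* Stand-in for the kernel-side test in ipt_connmark.c: match when the
 * masked connection mark equals the configured mark, optionally inverted. */
static int connmark_test(unsigned long ctmark, unsigned long mark,
                         unsigned long mask, int invert)
{
        return ((ctmark & mask) == mark) ^ invert;
}

int main(void)
{
        /* ct->mark 0x1234 against a hypothetical "mark 0x34, mask 0xff" rule */
        printf("%d\n", connmark_test(0x1234, 0x34, 0xff, 0)); /* 1: low byte matches */
        printf("%d\n", connmark_test(0x1235, 0x34, 0xff, 0)); /* 0: low byte differs */
        printf("%d\n", connmark_test(0x1235, 0x34, 0xff, 1)); /* 1: same test, inverted */
        return 0;
}
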
diff --git a/net/ipv4/netfilter/ipt_conntrack.c b/net/ipv4/netfilter/ipt_conntrack.c
new file mode 100644
index 000000000000..c1d22801b7cf
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_conntrack.c
@@ -0,0 +1,136 @@
1/* Kernel module to match connection tracking information.
2 * Superset of Rusty's minimalistic state match.
3 *
4 * (C) 2001 Marc Boucher (marc@mbsi.ca).
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/netfilter_ipv4/ip_conntrack.h>
14#include <linux/netfilter_ipv4/ip_tables.h>
15#include <linux/netfilter_ipv4/ipt_conntrack.h>
16
17MODULE_LICENSE("GPL");
18MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
19MODULE_DESCRIPTION("iptables connection tracking match module");
20
21static int
22match(const struct sk_buff *skb,
23 const struct net_device *in,
24 const struct net_device *out,
25 const void *matchinfo,
26 int offset,
27 int *hotdrop)
28{
29 const struct ipt_conntrack_info *sinfo = matchinfo;
30 struct ip_conntrack *ct;
31 enum ip_conntrack_info ctinfo;
32 unsigned int statebit;
33
34 ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
35
36#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg))
37
38 if (ct == &ip_conntrack_untracked)
39 statebit = IPT_CONNTRACK_STATE_UNTRACKED;
40 else if (ct)
41 statebit = IPT_CONNTRACK_STATE_BIT(ctinfo);
42 else
43 statebit = IPT_CONNTRACK_STATE_INVALID;
44
45 if(sinfo->flags & IPT_CONNTRACK_STATE) {
46 if (ct) {
47 if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip !=
48 ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip)
49 statebit |= IPT_CONNTRACK_STATE_SNAT;
50
51 if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip !=
52 ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip)
53 statebit |= IPT_CONNTRACK_STATE_DNAT;
54 }
55
56 if (FWINV((statebit & sinfo->statemask) == 0, IPT_CONNTRACK_STATE))
57 return 0;
58 }
59
60 if(sinfo->flags & IPT_CONNTRACK_PROTO) {
61 if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, IPT_CONNTRACK_PROTO))
62 return 0;
63 }
64
65 if(sinfo->flags & IPT_CONNTRACK_ORIGSRC) {
66 if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, IPT_CONNTRACK_ORIGSRC))
67 return 0;
68 }
69
70 if(sinfo->flags & IPT_CONNTRACK_ORIGDST) {
71 if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, IPT_CONNTRACK_ORIGDST))
72 return 0;
73 }
74
75 if(sinfo->flags & IPT_CONNTRACK_REPLSRC) {
76 if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, IPT_CONNTRACK_REPLSRC))
77 return 0;
78 }
79
80 if(sinfo->flags & IPT_CONNTRACK_REPLDST) {
81 if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, IPT_CONNTRACK_REPLDST))
82 return 0;
83 }
84
85 if(sinfo->flags & IPT_CONNTRACK_STATUS) {
86 if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, IPT_CONNTRACK_STATUS))
87 return 0;
88 }
89
90 if(sinfo->flags & IPT_CONNTRACK_EXPIRES) {
91 unsigned long expires;
92
93 if(!ct)
94 return 0;
95
96 expires = timer_pending(&ct->timeout) ? (ct->timeout.expires - jiffies)/HZ : 0;
97
98 if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), IPT_CONNTRACK_EXPIRES))
99 return 0;
100 }
101
102 return 1;
103}
104
105static int check(const char *tablename,
106 const struct ipt_ip *ip,
107 void *matchinfo,
108 unsigned int matchsize,
109 unsigned int hook_mask)
110{
111 if (matchsize != IPT_ALIGN(sizeof(struct ipt_conntrack_info)))
112 return 0;
113
114 return 1;
115}
116
117static struct ipt_match conntrack_match = {
118 .name = "conntrack",
119 .match = &match,
120 .checkentry = &check,
121 .me = THIS_MODULE,
122};
123
124static int __init init(void)
125{
126 need_ip_conntrack();
127 return ipt_register_match(&conntrack_match);
128}
129
130static void __exit fini(void)
131{
132 ipt_unregister_match(&conntrack_match);
133}
134
135module_init(init);
136module_exit(fini);
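
Every per-flag test in match() goes through FWINV(), which XORs the raw comparison with whether the corresponding IPT_CONNTRACK_* bit is set in sinfo->invflags; a non-zero result makes the module return 0 (no match). A small sketch of the same idiom, with a hypothetical flag bit standing in for the real IPT_CONNTRACK_* constants:

#include <stdio.h>

/* Same shape as the FWINV() macro in ipt_conntrack.c: XOR the test result
 * with the boolean-ized inversion flag. */
#define FWINV(test, invflags, flag) ((test) ^ !!((invflags) & (flag)))

#define MY_FLAG_STATE 0x01  /* hypothetical stand-in for an IPT_CONNTRACK_* bit */

int main(void)
{
        int state_test_failed = 1;  /* pretend the state comparison failed */

        /* no inversion: a failed test yields 1, i.e. "reject this packet" */
        printf("%d\n", FWINV(state_test_failed, 0, MY_FLAG_STATE));
        /* inversion bit set: the same failed test yields 0, i.e. keep going */
        printf("%d\n", FWINV(state_test_failed, MY_FLAG_STATE, MY_FLAG_STATE));
        return 0;
}
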
diff --git a/net/ipv4/netfilter/ipt_dscp.c b/net/ipv4/netfilter/ipt_dscp.c
new file mode 100644
index 000000000000..5df52a64a5d4
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_dscp.c
@@ -0,0 +1,63 @@
1/* IP tables module for matching the value of the IPv4 DSCP field
2 *
3 * ipt_dscp.c,v 1.3 2002/08/05 19:00:21 laforge Exp
4 *
5 * (C) 2002 by Harald Welte <laforge@netfilter.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/module.h>
13#include <linux/skbuff.h>
14
15#include <linux/netfilter_ipv4/ipt_dscp.h>
16#include <linux/netfilter_ipv4/ip_tables.h>
17
18MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
19MODULE_DESCRIPTION("iptables DSCP matching module");
20MODULE_LICENSE("GPL");
21
22static int match(const struct sk_buff *skb, const struct net_device *in,
23 const struct net_device *out, const void *matchinfo,
24 int offset, int *hotdrop)
25{
26 const struct ipt_dscp_info *info = matchinfo;
27 const struct iphdr *iph = skb->nh.iph;
28
29 u_int8_t sh_dscp = ((info->dscp << IPT_DSCP_SHIFT) & IPT_DSCP_MASK);
30
31 return ((iph->tos&IPT_DSCP_MASK) == sh_dscp) ^ info->invert;
32}
33
34static int checkentry(const char *tablename, const struct ipt_ip *ip,
35 void *matchinfo, unsigned int matchsize,
36 unsigned int hook_mask)
37{
38 if (matchsize != IPT_ALIGN(sizeof(struct ipt_dscp_info)))
39 return 0;
40
41 return 1;
42}
43
44static struct ipt_match dscp_match = {
45 .name = "dscp",
46 .match = &match,
47 .checkentry = &checkentry,
48 .me = THIS_MODULE,
49};
50
51static int __init init(void)
52{
53 return ipt_register_match(&dscp_match);
54}
55
56static void __exit fini(void)
57{
58 ipt_unregister_match(&dscp_match);
59
60}
61
62module_init(init);
63module_exit(fini);
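
The DSCP comparison is pure bit arithmetic: the rule's code point is shifted into the position it occupies in the TOS byte and compared against the masked TOS. Assuming the usual header constants (IPT_DSCP_SHIFT of 2 and IPT_DSCP_MASK of 0xfc, i.e. the top six bits of TOS), the logic can be checked in isolation:

#include <stdio.h>

#define DSCP_SHIFT 2     /* assumed value of IPT_DSCP_SHIFT */
#define DSCP_MASK  0xfc  /* assumed value of IPT_DSCP_MASK: top 6 bits of TOS */

/* Mirrors the comparison in ipt_dscp.c's match(). */
static int dscp_test(unsigned char tos, unsigned char dscp, int invert)
{
        unsigned char sh_dscp = (dscp << DSCP_SHIFT) & DSCP_MASK;
        return ((tos & DSCP_MASK) == sh_dscp) ^ invert;
}

int main(void)
{
        /* DSCP 46 (EF) sits in TOS 0xb8; the low two ECN bits are ignored */
        printf("%d\n", dscp_test(0xb8, 46, 0)); /* 1: EF packet, EF rule */
        printf("%d\n", dscp_test(0xba, 46, 0)); /* 1: ECN bits don't matter */
        printf("%d\n", dscp_test(0x00, 46, 0)); /* 0: best-effort packet */
        return 0;
}
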
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
new file mode 100644
index 000000000000..b6f7181e89cc
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -0,0 +1,131 @@
1/* IP tables module for matching the value of the IPv4 and TCP ECN bits
2 *
3 * ipt_ecn.c,v 1.3 2002/05/29 15:09:00 laforge Exp
4 *
5 * (C) 2002 by Harald Welte <laforge@gnumonks.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/module.h>
13#include <linux/skbuff.h>
14#include <linux/tcp.h>
15
16#include <linux/netfilter_ipv4/ip_tables.h>
17#include <linux/netfilter_ipv4/ipt_ecn.h>
18
19MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
20MODULE_DESCRIPTION("iptables ECN matching module");
21MODULE_LICENSE("GPL");
22
23static inline int match_ip(const struct sk_buff *skb,
24 const struct ipt_ecn_info *einfo)
25{
26 return ((skb->nh.iph->tos&IPT_ECN_IP_MASK) == einfo->ip_ect);
27}
28
29static inline int match_tcp(const struct sk_buff *skb,
30 const struct ipt_ecn_info *einfo,
31 int *hotdrop)
32{
33 struct tcphdr _tcph, *th;
34
35 /* In practice, TCP match does this, so can't fail. But let's
36 * be good citizens.
37 */
38 th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
39 sizeof(_tcph), &_tcph);
40 if (th == NULL) {
41 *hotdrop = 0;
42 return 0;
43 }
44
45 if (einfo->operation & IPT_ECN_OP_MATCH_ECE) {
46 if (einfo->invert & IPT_ECN_OP_MATCH_ECE) {
47 if (th->ece == 1)
48 return 0;
49 } else {
50 if (th->ece == 0)
51 return 0;
52 }
53 }
54
55 if (einfo->operation & IPT_ECN_OP_MATCH_CWR) {
56 if (einfo->invert & IPT_ECN_OP_MATCH_CWR) {
57 if (th->cwr == 1)
58 return 0;
59 } else {
60 if (th->cwr == 0)
61 return 0;
62 }
63 }
64
65 return 1;
66}
67
68static int match(const struct sk_buff *skb, const struct net_device *in,
69 const struct net_device *out, const void *matchinfo,
70 int offset, int *hotdrop)
71{
72 const struct ipt_ecn_info *info = matchinfo;
73
74 if (info->operation & IPT_ECN_OP_MATCH_IP)
75 if (!match_ip(skb, info))
76 return 0;
77
78 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) {
79 if (skb->nh.iph->protocol != IPPROTO_TCP)
80 return 0;
81 if (!match_tcp(skb, info, hotdrop))
82 return 0;
83 }
84
85 return 1;
86}
87
88static int checkentry(const char *tablename, const struct ipt_ip *ip,
89 void *matchinfo, unsigned int matchsize,
90 unsigned int hook_mask)
91{
92 const struct ipt_ecn_info *info = matchinfo;
93
94 if (matchsize != IPT_ALIGN(sizeof(struct ipt_ecn_info)))
95 return 0;
96
97 if (info->operation & IPT_ECN_OP_MATCH_MASK)
98 return 0;
99
100 if (info->invert & IPT_ECN_OP_MATCH_MASK)
101 return 0;
102
103 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)
104 && ip->proto != IPPROTO_TCP) {
105 printk(KERN_WARNING "ipt_ecn: can't match TCP bits in rule for"
106 " non-tcp packets\n");
107 return 0;
108 }
109
110 return 1;
111}
112
113static struct ipt_match ecn_match = {
114 .name = "ecn",
115 .match = &match,
116 .checkentry = &checkentry,
117 .me = THIS_MODULE,
118};
119
120static int __init init(void)
121{
122 return ipt_register_match(&ecn_match);
123}
124
125static void __exit fini(void)
126{
127 ipt_unregister_match(&ecn_match);
128}
129
130module_init(init);
131module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_esp.c b/net/ipv4/netfilter/ipt_esp.c
new file mode 100644
index 000000000000..e1d0dd31e117
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_esp.c
@@ -0,0 +1,118 @@
1/* Kernel module to match ESP parameters. */
2
3/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/module.h>
11#include <linux/skbuff.h>
12#include <linux/ip.h>
13
14#include <linux/netfilter_ipv4/ipt_esp.h>
15#include <linux/netfilter_ipv4/ip_tables.h>
16
17MODULE_LICENSE("GPL");
18MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
19MODULE_DESCRIPTION("iptables ESP SPI match module");
20
21#ifdef DEBUG_CONNTRACK
22#define duprintf(format, args...) printk(format , ## args)
23#else
24#define duprintf(format, args...)
25#endif
26
27/* Returns 1 if the spi is matched by the range, 0 otherwise */
28static inline int
29spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert)
30{
31 int r=0;
32 duprintf("esp spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ',
33 min,spi,max);
34 r=(spi >= min && spi <= max) ^ invert;
35 duprintf(" result %s\n",r? "PASS" : "FAILED");
36 return r;
37}
38
39static int
40match(const struct sk_buff *skb,
41 const struct net_device *in,
42 const struct net_device *out,
43 const void *matchinfo,
44 int offset,
45 int *hotdrop)
46{
47 struct ip_esp_hdr _esp, *eh;
48 const struct ipt_esp *espinfo = matchinfo;
49
50 /* Must not be a fragment. */
51 if (offset)
52 return 0;
53
54 eh = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
55 sizeof(_esp), &_esp);
56 if (eh == NULL) {
57 /* We've been asked to examine this packet, and we
58 * can't. Hence, no choice but to drop.
59 */
60 duprintf("Dropping evil ESP tinygram.\n");
61 *hotdrop = 1;
62 return 0;
63 }
64
65 return spi_match(espinfo->spis[0], espinfo->spis[1],
66 ntohl(eh->spi),
67 !!(espinfo->invflags & IPT_ESP_INV_SPI));
68}
69
70/* Called when user tries to insert an entry of this type. */
71static int
72checkentry(const char *tablename,
73 const struct ipt_ip *ip,
74 void *matchinfo,
75 unsigned int matchinfosize,
76 unsigned int hook_mask)
77{
78 const struct ipt_esp *espinfo = matchinfo;
79
80 /* Must specify proto == ESP, and no unknown invflags */
81 if (ip->proto != IPPROTO_ESP || (ip->invflags & IPT_INV_PROTO)) {
82 duprintf("ipt_esp: Protocol %u != %u\n", ip->proto,
83 IPPROTO_ESP);
84 return 0;
85 }
86 if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_esp))) {
87 duprintf("ipt_esp: matchsize %u != %u\n",
88 matchinfosize, IPT_ALIGN(sizeof(struct ipt_esp)));
89 return 0;
90 }
91 if (espinfo->invflags & ~IPT_ESP_INV_MASK) {
92 duprintf("ipt_esp: unknown flags %X\n",
93 espinfo->invflags);
94 return 0;
95 }
96
97 return 1;
98}
99
100static struct ipt_match esp_match = {
101 .name = "esp",
102 .match = &match,
103 .checkentry = &checkentry,
104 .me = THIS_MODULE,
105};
106
107static int __init init(void)
108{
109 return ipt_register_match(&esp_match);
110}
111
112static void __exit cleanup(void)
113{
114 ipt_unregister_match(&esp_match);
115}
116
117module_init(init);
118module_exit(cleanup);
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
new file mode 100644
index 000000000000..f1937190cd77
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -0,0 +1,731 @@
1/* iptables match extension to limit the number of packets per second
2 * separately for each hashbucket (sourceip/sourceport/dstip/dstport)
3 *
4 * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
5 *
6 * $Id: ipt_hashlimit.c 3244 2004-10-20 16:24:29Z laforge@netfilter.org $
7 *
8 * Development of this code was funded by Astaro AG, http://www.astaro.com/
9 *
10 * based on ipt_limit.c by:
11 * Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr>
12 * Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr>
13 * Rusty Russell <rusty@rustcorp.com.au>
14 *
15 * The general idea is to create a hash table for every dstip and have a
16 * separate limit counter per tuple. This way you can do something like 'limit
17 * the number of syn packets for each of my internal addresses'.
18 *
19 * Ideally this would just be implemented as a general 'hash' match, which would
20 * allow us to attach any iptables target to its hash buckets. But this is
21 * not possible in the current iptables architecture. As always, pkttables for
22 * 2.7.x will help ;)
23 */
24#include <linux/module.h>
25#include <linux/skbuff.h>
26#include <linux/spinlock.h>
27#include <linux/random.h>
28#include <linux/jhash.h>
29#include <linux/slab.h>
30#include <linux/vmalloc.h>
31#include <linux/tcp.h>
32#include <linux/udp.h>
33#include <linux/sctp.h>
34#include <linux/proc_fs.h>
35#include <linux/seq_file.h>
36#include <linux/list.h>
37
38#include <linux/netfilter_ipv4/ip_tables.h>
39#include <linux/netfilter_ipv4/ipt_hashlimit.h>
40#include <linux/netfilter_ipv4/lockhelp.h>
41
42/* FIXME: this is just for IP_NF_ASSERT */
43#include <linux/netfilter_ipv4/ip_conntrack.h>
44
45MODULE_LICENSE("GPL");
46MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
47MODULE_DESCRIPTION("iptables match for limiting per hash-bucket");
48
49/* need to declare this at the top */
50static struct proc_dir_entry *hashlimit_procdir;
51static struct file_operations dl_file_ops;
52
53/* hash table crap */
54
55struct dsthash_dst {
56 u_int32_t src_ip;
57 u_int32_t dst_ip;
58 /* ports have to be consecutive !!! */
59 u_int16_t src_port;
60 u_int16_t dst_port;
61};
62
63struct dsthash_ent {
64 /* static / read-only parts in the beginning */
65 struct hlist_node node;
66 struct dsthash_dst dst;
67
68 /* modified structure members in the end */
69 unsigned long expires; /* precalculated expiry time */
70 struct {
71 unsigned long prev; /* last modification */
72 u_int32_t credit;
73 u_int32_t credit_cap, cost;
74 } rateinfo;
75};
76
77struct ipt_hashlimit_htable {
78 struct hlist_node node; /* global list of all htables */
79 atomic_t use;
80
81 struct hashlimit_cfg cfg; /* config */
82
83 /* used internally */
84 spinlock_t lock; /* lock for list_head */
85 u_int32_t rnd; /* random seed for hash */
86 struct timer_list timer; /* timer for gc */
87 atomic_t count; /* number entries in table */
88
89 /* seq_file stuff */
90 struct proc_dir_entry *pde;
91
92 struct hlist_head hash[0]; /* hashtable itself */
93};
94
95static DECLARE_LOCK(hashlimit_lock); /* protects htables list */
96static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */
97static HLIST_HEAD(hashlimit_htables);
98static kmem_cache_t *hashlimit_cachep;
99
100static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b)
101{
102 return (ent->dst.dst_ip == b->dst_ip
103 && ent->dst.dst_port == b->dst_port
104 && ent->dst.src_port == b->src_port
105 && ent->dst.src_ip == b->src_ip);
106}
107
108static inline u_int32_t
109hash_dst(const struct ipt_hashlimit_htable *ht, const struct dsthash_dst *dst)
110{
111 return (jhash_3words(dst->dst_ip, (dst->dst_port<<16 | dst->src_port),
112 dst->src_ip, ht->rnd) % ht->cfg.size);
113}
114
115static inline struct dsthash_ent *
116__dsthash_find(const struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst)
117{
118 struct dsthash_ent *ent;
119 struct hlist_node *pos;
120 u_int32_t hash = hash_dst(ht, dst);
121
122 if (!hlist_empty(&ht->hash[hash]))
123 hlist_for_each_entry(ent, pos, &ht->hash[hash], node) {
124 if (dst_cmp(ent, dst)) {
125 return ent;
126 }
127 }
128
129 return NULL;
130}
131
132/* allocate dsthash_ent, initialize dst, put in htable and lock it */
133static struct dsthash_ent *
134__dsthash_alloc_init(struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst)
135{
136 struct dsthash_ent *ent;
137
138 /* initialize hash with random val at the time we allocate
139 * the first hashtable entry */
140 if (!ht->rnd)
141 get_random_bytes(&ht->rnd, 4);
142
143 if (ht->cfg.max &&
144 atomic_read(&ht->count) >= ht->cfg.max) {
145 /* FIXME: do something. question is what.. */
146 if (net_ratelimit())
147 printk(KERN_WARNING
148 "ipt_hashlimit: max count of %u reached\n",
149 ht->cfg.max);
150 return NULL;
151 }
152
153 ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
154 if (!ent) {
155 if (net_ratelimit())
156 printk(KERN_ERR
157 "ipt_hashlimit: can't allocate dsthash_ent\n");
158 return NULL;
159 }
160
161 atomic_inc(&ht->count);
162
163 ent->dst.dst_ip = dst->dst_ip;
164 ent->dst.dst_port = dst->dst_port;
165 ent->dst.src_ip = dst->src_ip;
166 ent->dst.src_port = dst->src_port;
167
168 hlist_add_head(&ent->node, &ht->hash[hash_dst(ht, dst)]);
169
170 return ent;
171}
172
173static inline void
174__dsthash_free(struct ipt_hashlimit_htable *ht, struct dsthash_ent *ent)
175{
176 hlist_del(&ent->node);
177 kmem_cache_free(hashlimit_cachep, ent);
178 atomic_dec(&ht->count);
179}
180static void htable_gc(unsigned long htlong);
181
182static int htable_create(struct ipt_hashlimit_info *minfo)
183{
184 int i;
185 unsigned int size;
186 struct ipt_hashlimit_htable *hinfo;
187
188 if (minfo->cfg.size)
189 size = minfo->cfg.size;
190 else {
191 size = (((num_physpages << PAGE_SHIFT) / 16384)
192 / sizeof(struct list_head));
193 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
194 size = 8192;
195 if (size < 16)
196 size = 16;
197 }
198 /* FIXME: don't use vmalloc() here or anywhere else -HW */
199 hinfo = vmalloc(sizeof(struct ipt_hashlimit_htable)
200 + (sizeof(struct list_head) * size));
201 if (!hinfo) {
202 printk(KERN_ERR "ipt_hashlimit: Unable to create hashtable\n");
203 return -1;
204 }
205 minfo->hinfo = hinfo;
206
207 /* copy match config into hashtable config */
208 memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg));
209 hinfo->cfg.size = size;
210 if (!hinfo->cfg.max)
211 hinfo->cfg.max = 8 * hinfo->cfg.size;
212 else if (hinfo->cfg.max < hinfo->cfg.size)
213 hinfo->cfg.max = hinfo->cfg.size;
214
215 for (i = 0; i < hinfo->cfg.size; i++)
216 INIT_HLIST_HEAD(&hinfo->hash[i]);
217
218 atomic_set(&hinfo->count, 0);
219 atomic_set(&hinfo->use, 1);
220 hinfo->rnd = 0;
221 spin_lock_init(&hinfo->lock);
222 hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir);
223 if (!hinfo->pde) {
224 vfree(hinfo);
225 return -1;
226 }
227 hinfo->pde->proc_fops = &dl_file_ops;
228 hinfo->pde->data = hinfo;
229
230 init_timer(&hinfo->timer);
231 hinfo->timer.expires = jiffies + msecs_to_jiffies(hinfo->cfg.gc_interval);
232 hinfo->timer.data = (unsigned long )hinfo;
233 hinfo->timer.function = htable_gc;
234 add_timer(&hinfo->timer);
235
236 LOCK_BH(&hashlimit_lock);
237 hlist_add_head(&hinfo->node, &hashlimit_htables);
238 UNLOCK_BH(&hashlimit_lock);
239
240 return 0;
241}
242
243static int select_all(struct ipt_hashlimit_htable *ht, struct dsthash_ent *he)
244{
245 return 1;
246}
247
248static int select_gc(struct ipt_hashlimit_htable *ht, struct dsthash_ent *he)
249{
250 return (jiffies >= he->expires);
251}
252
253static void htable_selective_cleanup(struct ipt_hashlimit_htable *ht,
254 int (*select)(struct ipt_hashlimit_htable *ht,
255 struct dsthash_ent *he))
256{
257 int i;
258
259 IP_NF_ASSERT(ht->cfg.size && ht->cfg.max);
260
261 /* lock hash table and iterate over it */
262 spin_lock_bh(&ht->lock);
263 for (i = 0; i < ht->cfg.size; i++) {
264 struct dsthash_ent *dh;
265 struct hlist_node *pos, *n;
266 hlist_for_each_entry_safe(dh, pos, n, &ht->hash[i], node) {
267 if ((*select)(ht, dh))
268 __dsthash_free(ht, dh);
269 }
270 }
271 spin_unlock_bh(&ht->lock);
272}
273
274/* hash table garbage collector, run by timer */
275static void htable_gc(unsigned long htlong)
276{
277 struct ipt_hashlimit_htable *ht = (struct ipt_hashlimit_htable *)htlong;
278
279 htable_selective_cleanup(ht, select_gc);
280
281 /* re-add the timer accordingly */
282 ht->timer.expires = jiffies + msecs_to_jiffies(ht->cfg.gc_interval);
283 add_timer(&ht->timer);
284}
285
286static void htable_destroy(struct ipt_hashlimit_htable *hinfo)
287{
288 /* remove timer, if it is pending */
289 if (timer_pending(&hinfo->timer))
290 del_timer(&hinfo->timer);
291
292 /* remove proc entry */
293 remove_proc_entry(hinfo->pde->name, hashlimit_procdir);
294
295 htable_selective_cleanup(hinfo, select_all);
296 vfree(hinfo);
297}
298
299static struct ipt_hashlimit_htable *htable_find_get(char *name)
300{
301 struct ipt_hashlimit_htable *hinfo;
302 struct hlist_node *pos;
303
304 LOCK_BH(&hashlimit_lock);
305 hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) {
306 if (!strcmp(name, hinfo->pde->name)) {
307 atomic_inc(&hinfo->use);
308 UNLOCK_BH(&hashlimit_lock);
309 return hinfo;
310 }
311 }
312 UNLOCK_BH(&hashlimit_lock);
313
314 return NULL;
315}
316
317static void htable_put(struct ipt_hashlimit_htable *hinfo)
318{
319 if (atomic_dec_and_test(&hinfo->use)) {
320 LOCK_BH(&hashlimit_lock);
321 hlist_del(&hinfo->node);
322 UNLOCK_BH(&hashlimit_lock);
323 htable_destroy(hinfo);
324 }
325}
326
327
328/* The algorithm used is the Simple Token Bucket Filter (TBF)
329 * see net/sched/sch_tbf.c in the linux source tree
330 */
331
332/* Rusty: This is my (non-mathematically-inclined) understanding of
333 this algorithm. The `average rate' in jiffies becomes your initial
334 amount of credit `credit' and the most credit you can ever have
335 `credit_cap'. The `peak rate' becomes the cost of passing the
336 test, `cost'.
337
338 `prev' tracks the last packet hit: you gain one credit per jiffy.
339 If you get credit balance more than this, the extra credit is
340 discarded. Every time the match passes, you lose `cost' credits;
341 if you don't have that many, the test fails.
342
343 See Alexey's formal explanation in net/sched/sch_tbf.c.
344
345 To get the maximum range, we multiply by this factor (ie. you get N
346 credits per jiffy). We want to allow a rate as low as 1 per day
347 (slowest userspace tool allows), which means
348 CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
349*/
350#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
351
352/* Repeated shift and or gives us all 1s, final shift and add 1 gives
353 * us the power of 2 below the theoretical max, so GCC simply does a
354 * shift. */
355#define _POW2_BELOW2(x) ((x)|((x)>>1))
356#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
357#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
358#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
359#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
360#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
361
362#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
363
364/* Precision saver. */
365static inline u_int32_t
366user2credits(u_int32_t user)
367{
368 /* If multiplying would overflow... */
369 if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
370 /* Divide first. */
371 return (user / IPT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
372
373 return (user * HZ * CREDITS_PER_JIFFY) / IPT_HASHLIMIT_SCALE;
374}
375
376static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now)
377{
378 dh->rateinfo.credit += (now - xchg(&dh->rateinfo.prev, now))
379 * CREDITS_PER_JIFFY;
380 if (dh->rateinfo.credit > dh->rateinfo.credit_cap)
381 dh->rateinfo.credit = dh->rateinfo.credit_cap;
382}
383
384static inline int get_ports(const struct sk_buff *skb, int offset,
385 u16 ports[2])
386{
387 union {
388 struct tcphdr th;
389 struct udphdr uh;
390 sctp_sctphdr_t sctph;
391 } hdr_u, *ptr_u;
392
393 /* Must not be a fragment. */
394 if (offset)
395 return 1;
396
397 /* Must be big enough to read ports (both UDP and TCP have
398 them at the start). */
399 ptr_u = skb_header_pointer(skb, skb->nh.iph->ihl*4, 8, &hdr_u);
400 if (!ptr_u)
401 return 1;
402
403 switch (skb->nh.iph->protocol) {
404 case IPPROTO_TCP:
405 ports[0] = ptr_u->th.source;
406 ports[1] = ptr_u->th.dest;
407 break;
408 case IPPROTO_UDP:
409 ports[0] = ptr_u->uh.source;
410 ports[1] = ptr_u->uh.dest;
411 break;
412 case IPPROTO_SCTP:
413 ports[0] = ptr_u->sctph.source;
414 ports[1] = ptr_u->sctph.dest;
415 break;
416 default:
417 /* all other protocols don't support per-port hash
418 * buckets */
419 ports[0] = ports[1] = 0;
420 break;
421 }
422
423 return 0;
424}
425
426
427static int
428hashlimit_match(const struct sk_buff *skb,
429 const struct net_device *in,
430 const struct net_device *out,
431 const void *matchinfo,
432 int offset,
433 int *hotdrop)
434{
435 struct ipt_hashlimit_info *r =
436 ((struct ipt_hashlimit_info *)matchinfo)->u.master;
437 struct ipt_hashlimit_htable *hinfo = r->hinfo;
438 unsigned long now = jiffies;
439 struct dsthash_ent *dh;
440 struct dsthash_dst dst;
441
442 /* build 'dst' according to hinfo->cfg and current packet */
443 memset(&dst, 0, sizeof(dst));
444 if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DIP)
445 dst.dst_ip = skb->nh.iph->daddr;
446 if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SIP)
447 dst.src_ip = skb->nh.iph->saddr;
448 if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT
449 ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) {
450 u_int16_t ports[2];
451 if (get_ports(skb, offset, ports)) {
452 /* We've been asked to examine this packet, and we
453 can't. Hence, no choice but to drop. */
454 *hotdrop = 1;
455 return 0;
456 }
457 if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT)
458 dst.src_port = ports[0];
459 if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT)
460 dst.dst_port = ports[1];
461 }
462
463 spin_lock_bh(&hinfo->lock);
464 dh = __dsthash_find(hinfo, &dst);
465 if (!dh) {
466 dh = __dsthash_alloc_init(hinfo, &dst);
467
468 if (!dh) {
469 /* enomem... don't match == DROP */
470 if (net_ratelimit())
471 printk(KERN_ERR "%s: ENOMEM\n", __FUNCTION__);
472 spin_unlock_bh(&hinfo->lock);
473 return 0;
474 }
475
476 dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
477
478 dh->rateinfo.prev = jiffies;
479 dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
480 hinfo->cfg.burst);
481 dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg *
482 hinfo->cfg.burst);
483 dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
484
485 spin_unlock_bh(&hinfo->lock);
486 return 1;
487 }
488
489 /* update expiration timeout */
490 dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
491
492 rateinfo_recalc(dh, now);
493 if (dh->rateinfo.credit >= dh->rateinfo.cost) {
494 /* We're underlimit. */
495 dh->rateinfo.credit -= dh->rateinfo.cost;
496 spin_unlock_bh(&hinfo->lock);
497 return 1;
498 }
499
500 spin_unlock_bh(&hinfo->lock);
501
502 /* default case: we're overlimit, thus don't match */
503 return 0;
504}
505
506static int
507hashlimit_checkentry(const char *tablename,
508 const struct ipt_ip *ip,
509 void *matchinfo,
510 unsigned int matchsize,
511 unsigned int hook_mask)
512{
513 struct ipt_hashlimit_info *r = matchinfo;
514
515 if (matchsize != IPT_ALIGN(sizeof(struct ipt_hashlimit_info)))
516 return 0;
517
518 /* Check for overflow. */
519 if (r->cfg.burst == 0
520 || user2credits(r->cfg.avg * r->cfg.burst) <
521 user2credits(r->cfg.avg)) {
522 printk(KERN_ERR "ipt_hashlimit: Overflow, try lower: %u/%u\n",
523 r->cfg.avg, r->cfg.burst);
524 return 0;
525 }
526
527 if (r->cfg.mode == 0
528 || r->cfg.mode > (IPT_HASHLIMIT_HASH_DPT
529 |IPT_HASHLIMIT_HASH_DIP
530 |IPT_HASHLIMIT_HASH_SIP
531 |IPT_HASHLIMIT_HASH_SPT))
532 return 0;
533
534 if (!r->cfg.gc_interval)
535 return 0;
536
537 if (!r->cfg.expire)
538 return 0;
539
540 /* This is the best we've got: We cannot release and re-grab lock,
541 * since checkentry() is called before ip_tables.c grabs ipt_mutex.
542 * We also cannot grab the hashtable spinlock, since htable_create will
543 * call vmalloc, and that can sleep. And we cannot just re-search
544 * the list of htable's in htable_create(), since then we would
545 * create duplicate proc files. -HW */
546 down(&hlimit_mutex);
547 r->hinfo = htable_find_get(r->name);
548 if (!r->hinfo && (htable_create(r) != 0)) {
549 up(&hlimit_mutex);
550 return 0;
551 }
552 up(&hlimit_mutex);
553
554 /* Ugly hack: For SMP, we only want to use one set */
555 r->u.master = r;
556
557 return 1;
558}
559
560static void
561hashlimit_destroy(void *matchinfo, unsigned int matchsize)
562{
563 struct ipt_hashlimit_info *r = (struct ipt_hashlimit_info *) matchinfo;
564
565 htable_put(r->hinfo);
566}
567
568static struct ipt_match ipt_hashlimit = {
569 .name = "hashlimit",
570 .match = hashlimit_match,
571 .checkentry = hashlimit_checkentry,
572 .destroy = hashlimit_destroy,
573 .me = THIS_MODULE
574};
575
576/* PROC stuff */
577
578static void *dl_seq_start(struct seq_file *s, loff_t *pos)
579{
580 struct proc_dir_entry *pde = s->private;
581 struct ipt_hashlimit_htable *htable = pde->data;
582 unsigned int *bucket;
583
584 spin_lock_bh(&htable->lock);
585 if (*pos >= htable->cfg.size)
586 return NULL;
587
588 bucket = kmalloc(sizeof(unsigned int), GFP_ATOMIC);
589 if (!bucket)
590 return ERR_PTR(-ENOMEM);
591
592 *bucket = *pos;
593 return bucket;
594}
595
596static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
597{
598 struct proc_dir_entry *pde = s->private;
599 struct ipt_hashlimit_htable *htable = pde->data;
600 unsigned int *bucket = (unsigned int *)v;
601
602 *pos = ++(*bucket);
603 if (*pos >= htable->cfg.size) {
604 kfree(v);
605 return NULL;
606 }
607 return bucket;
608}
609
610static void dl_seq_stop(struct seq_file *s, void *v)
611{
612 struct proc_dir_entry *pde = s->private;
613 struct ipt_hashlimit_htable *htable = pde->data;
614 unsigned int *bucket = (unsigned int *)v;
615
616 kfree(bucket);
617
618 spin_unlock_bh(&htable->lock);
619}
620
621static inline int dl_seq_real_show(struct dsthash_ent *ent, struct seq_file *s)
622{
623 /* recalculate to show accurate numbers */
624 rateinfo_recalc(ent, jiffies);
625
626 return seq_printf(s, "%ld %u.%u.%u.%u:%u->%u.%u.%u.%u:%u %u %u %u\n",
627 (long)(ent->expires - jiffies)/HZ,
628 NIPQUAD(ent->dst.src_ip), ntohs(ent->dst.src_port),
629 NIPQUAD(ent->dst.dst_ip), ntohs(ent->dst.dst_port),
630 ent->rateinfo.credit, ent->rateinfo.credit_cap,
631 ent->rateinfo.cost);
632}
633
634static int dl_seq_show(struct seq_file *s, void *v)
635{
636 struct proc_dir_entry *pde = s->private;
637 struct ipt_hashlimit_htable *htable = pde->data;
638 unsigned int *bucket = (unsigned int *)v;
639 struct dsthash_ent *ent;
640 struct hlist_node *pos;
641
642 if (!hlist_empty(&htable->hash[*bucket]))
643 hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node) {
644 if (dl_seq_real_show(ent, s)) {
645 /* buffer was filled and unable to print that tuple */
646 return 1;
647 }
648 }
649
650 return 0;
651}
652
653static struct seq_operations dl_seq_ops = {
654 .start = dl_seq_start,
655 .next = dl_seq_next,
656 .stop = dl_seq_stop,
657 .show = dl_seq_show
658};
659
660static int dl_proc_open(struct inode *inode, struct file *file)
661{
662 int ret = seq_open(file, &dl_seq_ops);
663
664 if (!ret) {
665 struct seq_file *sf = file->private_data;
666 sf->private = PDE(inode);
667 }
668 return ret;
669}
670
671static struct file_operations dl_file_ops = {
672 .owner = THIS_MODULE,
673 .open = dl_proc_open,
674 .read = seq_read,
675 .llseek = seq_lseek,
676 .release = seq_release
677};
678
679static int init_or_fini(int fini)
680{
681 int ret = 0;
682
683 if (fini)
684 goto cleanup;
685
686 if (ipt_register_match(&ipt_hashlimit)) {
687 ret = -EINVAL;
688 goto cleanup_nothing;
689 }
690
691 hashlimit_cachep = kmem_cache_create("ipt_hashlimit",
692 sizeof(struct dsthash_ent), 0,
693 0, NULL, NULL);
694 if (!hashlimit_cachep) {
695 printk(KERN_ERR "Unable to create ipt_hashlimit slab cache\n");
696 ret = -ENOMEM;
697 goto cleanup_unreg_match;
698 }
699
700 hashlimit_procdir = proc_mkdir("ipt_hashlimit", proc_net);
701 if (!hashlimit_procdir) {
702 printk(KERN_ERR "Unable to create proc dir entry\n");
703 ret = -ENOMEM;
704 goto cleanup_free_slab;
705 }
706
707 return ret;
708
709cleanup:
710 remove_proc_entry("ipt_hashlimit", proc_net);
711cleanup_free_slab:
712 kmem_cache_destroy(hashlimit_cachep);
713cleanup_unreg_match:
714 ipt_unregister_match(&ipt_hashlimit);
715cleanup_nothing:
716 return ret;
717
718}
719
720static int __init init(void)
721{
722 return init_or_fini(0);
723}
724
725static void __exit fini(void)
726{
727 init_or_fini(1);
728}
729
730module_init(init);
731module_exit(fini);
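
The CREDITS_PER_JIFFY / user2credits() machinery above is plain integer arithmetic and can be exercised outside the kernel. The sketch below assumes HZ of 1000 and an IPT_HASHLIMIT_SCALE of 10000, and a hypothetical rule of 5 packets per second with burst 5, stored (as the userspace tool typically does) as avg = SCALE / rate; all of these numbers are illustrative only:

#include <stdio.h>

#define HZ 1000                /* assumed tick rate for the example */
#define HASHLIMIT_SCALE 10000  /* assumed value of IPT_HASHLIMIT_SCALE */

/* Same helpers as in ipt_hashlimit.c: the largest power of two such that
 * CREDITS_PER_JIFFY * HZ * 60 * 60 * 24 still fits in 32 bits. */
#define MAX_CPJ (0xFFFFFFFFu / (HZ * 60 * 60 * 24))
#define _POW2_BELOW2(x)  ((x) | ((x) >> 1))
#define _POW2_BELOW4(x)  (_POW2_BELOW2(x) | _POW2_BELOW2((x) >> 2))
#define _POW2_BELOW8(x)  (_POW2_BELOW4(x) | _POW2_BELOW4((x) >> 4))
#define _POW2_BELOW16(x) (_POW2_BELOW8(x) | _POW2_BELOW8((x) >> 8))
#define _POW2_BELOW32(x) (_POW2_BELOW16(x) | _POW2_BELOW16((x) >> 16))
#define POW2_BELOW32(x)  ((_POW2_BELOW32(x) >> 1) + 1)
#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)

/* user2credits() as in the module: "user" is a rate expressed in units of
 * 1/HASHLIMIT_SCALE packets per second. */
static unsigned int user2credits(unsigned int user)
{
        if (user > 0xFFFFFFFFu / (HZ * CREDITS_PER_JIFFY))
                return (user / HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
        return (user * HZ * CREDITS_PER_JIFFY) / HASHLIMIT_SCALE;
}

int main(void)
{
        unsigned int avg = HASHLIMIT_SCALE / 5;  /* hypothetical 5/second rule */
        unsigned int burst = 5;

        printf("credits per jiffy: %u\n", (unsigned int)CREDITS_PER_JIFFY);
        printf("cost per packet:   %u\n", user2credits(avg));
        printf("initial credit:    %u\n", user2credits(avg * burst));
        return 0;
}

With these assumptions the bucket starts at 32000 credits and each matched packet costs 6400, so a fresh tuple can pass its burst of five packets back to back and then refills one packet's worth of credit every 200 jiffies, i.e. five per second.
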
diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c
new file mode 100644
index 000000000000..33fdf364d3d3
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_helper.c
@@ -0,0 +1,113 @@
1/* iptables module to match on related connections */
2/*
3 * (C) 2001 Martin Josefsson <gandalf@wlug.westbo.se>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * 19 Mar 2002 Harald Welte <laforge@gnumonks.org>:
10 * - Port to newnat infrastructure
11 */
12
13#include <linux/module.h>
14#include <linux/skbuff.h>
15#include <linux/netfilter.h>
16#include <linux/netfilter_ipv4/ip_conntrack.h>
17#include <linux/netfilter_ipv4/ip_conntrack_core.h>
18#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
19#include <linux/netfilter_ipv4/ip_tables.h>
20#include <linux/netfilter_ipv4/ipt_helper.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>");
24MODULE_DESCRIPTION("iptables helper match module");
25
26#if 0
27#define DEBUGP printk
28#else
29#define DEBUGP(format, args...)
30#endif
31
32static int
33match(const struct sk_buff *skb,
34 const struct net_device *in,
35 const struct net_device *out,
36 const void *matchinfo,
37 int offset,
38 int *hotdrop)
39{
40 const struct ipt_helper_info *info = matchinfo;
41 struct ip_conntrack *ct;
42 enum ip_conntrack_info ctinfo;
43 int ret = info->invert;
44
45 ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
46 if (!ct) {
47 DEBUGP("ipt_helper: Eek! invalid conntrack?\n");
48 return ret;
49 }
50
51 if (!ct->master) {
52 DEBUGP("ipt_helper: conntrack %p has no master\n", ct);
53 return ret;
54 }
55
56 READ_LOCK(&ip_conntrack_lock);
57 if (!ct->master->helper) {
58 DEBUGP("ipt_helper: master ct %p has no helper\n",
59 ct->master);
60 goto out_unlock;
61 }
62
63 DEBUGP("master's name = %s , info->name = %s\n",
64 ct->master->helper->name, info->name);
65
66 if (info->name[0] == '\0')
67 ret ^= 1;
68 else
69 ret ^= !strncmp(ct->master->helper->name, info->name,
70 strlen(ct->master->helper->name));
71out_unlock:
72 READ_UNLOCK(&ip_conntrack_lock);
73 return ret;
74}
75
76static int check(const char *tablename,
77 const struct ipt_ip *ip,
78 void *matchinfo,
79 unsigned int matchsize,
80 unsigned int hook_mask)
81{
82 struct ipt_helper_info *info = matchinfo;
83
84 info->name[29] = '\0';
85
86 /* verify size */
87 if (matchsize != IPT_ALIGN(sizeof(struct ipt_helper_info)))
88 return 0;
89
90 return 1;
91}
92
93static struct ipt_match helper_match = {
94 .name = "helper",
95 .match = &match,
96 .checkentry = &check,
97 .me = THIS_MODULE,
98};
99
100static int __init init(void)
101{
102 need_ip_conntrack();
103 return ipt_register_match(&helper_match);
104}
105
106static void __exit fini(void)
107{
108 ipt_unregister_match(&helper_match);
109}
110
111module_init(init);
112module_exit(fini);
113
diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c
new file mode 100644
index 000000000000..b835b7b2e560
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_iprange.c
@@ -0,0 +1,99 @@
1/*
2 * iptables module to match IP address ranges
3 *
4 * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10#include <linux/module.h>
11#include <linux/skbuff.h>
12#include <linux/ip.h>
13#include <linux/netfilter_ipv4/ip_tables.h>
14#include <linux/netfilter_ipv4/ipt_iprange.h>
15
16MODULE_LICENSE("GPL");
17MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
18MODULE_DESCRIPTION("iptables arbitrary IP range match module");
19
20#if 0
21#define DEBUGP printk
22#else
23#define DEBUGP(format, args...)
24#endif
25
26static int
27match(const struct sk_buff *skb,
28 const struct net_device *in,
29 const struct net_device *out,
30 const void *matchinfo,
31 int offset, int *hotdrop)
32{
33 const struct ipt_iprange_info *info = matchinfo;
34 const struct iphdr *iph = skb->nh.iph;
35
36 if (info->flags & IPRANGE_SRC) {
37 if (((ntohl(iph->saddr) < ntohl(info->src.min_ip))
38 || (ntohl(iph->saddr) > ntohl(info->src.max_ip)))
39 ^ !!(info->flags & IPRANGE_SRC_INV)) {
40 DEBUGP("src IP %u.%u.%u.%u NOT in range %s"
41 "%u.%u.%u.%u-%u.%u.%u.%u\n",
42 NIPQUAD(iph->saddr),
43 info->flags & IPRANGE_SRC_INV ? "(INV) " : "",
44 NIPQUAD(info->src.min_ip),
45 NIPQUAD(info->src.max_ip));
46 return 0;
47 }
48 }
49 if (info->flags & IPRANGE_DST) {
50 if (((ntohl(iph->daddr) < ntohl(info->dst.min_ip))
51 || (ntohl(iph->daddr) > ntohl(info->dst.max_ip)))
52 ^ !!(info->flags & IPRANGE_DST_INV)) {
53 DEBUGP("dst IP %u.%u.%u.%u NOT in range %s"
54 "%u.%u.%u.%u-%u.%u.%u.%u\n",
55 NIPQUAD(iph->daddr),
56 info->flags & IPRANGE_DST_INV ? "(INV) " : "",
57 NIPQUAD(info->dst.min_ip),
58 NIPQUAD(info->dst.max_ip));
59 return 0;
60 }
61 }
62 return 1;
63}
64
65static int check(const char *tablename,
66 const struct ipt_ip *ip,
67 void *matchinfo,
68 unsigned int matchsize,
69 unsigned int hook_mask)
70{
71 /* verify size */
72 if (matchsize != IPT_ALIGN(sizeof(struct ipt_iprange_info)))
73 return 0;
74
75 return 1;
76}
77
78static struct ipt_match iprange_match =
79{
80 .list = { NULL, NULL },
81 .name = "iprange",
82 .match = &match,
83 .checkentry = &check,
84 .destroy = NULL,
85 .me = THIS_MODULE
86};
87
88static int __init init(void)
89{
90 return ipt_register_match(&iprange_match);
91}
92
93static void __exit fini(void)
94{
95 ipt_unregister_match(&iprange_match);
96}
97
98module_init(init);
99module_exit(fini);
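
The comparisons above go through ntohl() first because an IPv4 range check is only meaningful on host-order integers; comparing the raw network-order words goes wrong as soon as the range crosses an octet boundary. A tiny standalone check with hypothetical addresses (the raw compare prints 0 on a little-endian host, while the ntohl() compare always prints 1):

#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
        /* 10.0.0.200 should fall inside the range 10.0.0.100 - 10.0.1.50 */
        unsigned int min = inet_addr("10.0.0.100");  /* network byte order */
        unsigned int max = inet_addr("10.0.1.50");
        unsigned int ip  = inet_addr("10.0.0.200");

        printf("raw network-order compare: %d\n", ip >= min && ip <= max);
        printf("host-order compare:        %d\n",
               ntohl(ip) >= ntohl(min) && ntohl(ip) <= ntohl(max));
        return 0;
}
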
diff --git a/net/ipv4/netfilter/ipt_length.c b/net/ipv4/netfilter/ipt_length.c
new file mode 100644
index 000000000000..4eabcfbda9d1
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_length.c
@@ -0,0 +1,64 @@
1/* Kernel module to match packet length. */
2/* (C) 1999-2001 James Morris <jmorris@intercode.com.au>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/module.h>
10#include <linux/skbuff.h>
11
12#include <linux/netfilter_ipv4/ipt_length.h>
13#include <linux/netfilter_ipv4/ip_tables.h>
14
15MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
16MODULE_DESCRIPTION("IP tables packet length matching module");
17MODULE_LICENSE("GPL");
18
19static int
20match(const struct sk_buff *skb,
21 const struct net_device *in,
22 const struct net_device *out,
23 const void *matchinfo,
24 int offset,
25 int *hotdrop)
26{
27 const struct ipt_length_info *info = matchinfo;
28 u_int16_t pktlen = ntohs(skb->nh.iph->tot_len);
29
30 return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
31}
32
33static int
34checkentry(const char *tablename,
35 const struct ipt_ip *ip,
36 void *matchinfo,
37 unsigned int matchsize,
38 unsigned int hook_mask)
39{
40 if (matchsize != IPT_ALIGN(sizeof(struct ipt_length_info)))
41 return 0;
42
43 return 1;
44}
45
46static struct ipt_match length_match = {
47 .name = "length",
48 .match = &match,
49 .checkentry = &checkentry,
50 .me = THIS_MODULE,
51};
52
53static int __init init(void)
54{
55 return ipt_register_match(&length_match);
56}
57
58static void __exit fini(void)
59{
60 ipt_unregister_match(&length_match);
61}
62
63module_init(init);
64module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_limit.c b/net/ipv4/netfilter/ipt_limit.c
new file mode 100644
index 000000000000..0c24dcc703a5
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_limit.c
@@ -0,0 +1,157 @@
1/* Kernel module to control the rate
2 *
3 * 2 September 1999: Changed from the target RATE to the match
4 * `limit', removed logging. Did I mention that
5 * Alexey is a fucking genius?
6 * Rusty Russell (rusty@rustcorp.com.au). */
7
8/* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr>
9 * (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2 as
13 * published by the Free Software Foundation.
14 */
15
16#include <linux/module.h>
17#include <linux/skbuff.h>
18#include <linux/spinlock.h>
19#include <linux/interrupt.h>
20
21#include <linux/netfilter_ipv4/ip_tables.h>
22#include <linux/netfilter_ipv4/ipt_limit.h>
23
24MODULE_LICENSE("GPL");
25MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>");
26MODULE_DESCRIPTION("iptables rate limit match");
27
28/* The algorithm used is the Simple Token Bucket Filter (TBF)
29 * see net/sched/sch_tbf.c in the linux source tree
30 */
31
32static DEFINE_SPINLOCK(limit_lock);
33
34/* Rusty: This is my (non-mathematically-inclined) understanding of
35 this algorithm. The `average rate' in jiffies becomes your initial
36 amount of credit `credit' and the most credit you can ever have
37 `credit_cap'. The `peak rate' becomes the cost of passing the
38 test, `cost'.
39
40 `prev' tracks the last packet hit: you gain one credit per jiffy.
41 If you get credit balance more than this, the extra credit is
42 discarded. Every time the match passes, you lose `cost' credits;
43 if you don't have that many, the test fails.
44
45 See Alexey's formal explanation in net/sched/sch_tbf.c.
46
47 To get the maximum range, we multiply by this factor (ie. you get N
48 credits per jiffy). We want to allow a rate as low as 1 per day
49 (slowest userspace tool allows), which means
50 CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32. ie. */
51#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
52
53/* Repeated shift and or gives us all 1s, final shift and add 1 gives
54 * us the power of 2 below the theoretical max, so GCC simply does a
55 * shift. */
56#define _POW2_BELOW2(x) ((x)|((x)>>1))
57#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
58#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
59#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
60#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
61#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
62
63#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
64
65static int
66ipt_limit_match(const struct sk_buff *skb,
67 const struct net_device *in,
68 const struct net_device *out,
69 const void *matchinfo,
70 int offset,
71 int *hotdrop)
72{
73 struct ipt_rateinfo *r = ((struct ipt_rateinfo *)matchinfo)->master;
74 unsigned long now = jiffies;
75
76 spin_lock_bh(&limit_lock);
77 r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY;
78 if (r->credit > r->credit_cap)
79 r->credit = r->credit_cap;
80
81 if (r->credit >= r->cost) {
82 /* We're not limited. */
83 r->credit -= r->cost;
84 spin_unlock_bh(&limit_lock);
85 return 1;
86 }
87
88 spin_unlock_bh(&limit_lock);
89 return 0;
90}
91
92/* Precision saver. */
93static u_int32_t
94user2credits(u_int32_t user)
95{
96 /* If multiplying would overflow... */
97 if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
98 /* Divide first. */
99 return (user / IPT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
100
101 return (user * HZ * CREDITS_PER_JIFFY) / IPT_LIMIT_SCALE;
102}
103
104static int
105ipt_limit_checkentry(const char *tablename,
106 const struct ipt_ip *ip,
107 void *matchinfo,
108 unsigned int matchsize,
109 unsigned int hook_mask)
110{
111 struct ipt_rateinfo *r = matchinfo;
112
113 if (matchsize != IPT_ALIGN(sizeof(struct ipt_rateinfo)))
114 return 0;
115
116 /* Check for overflow. */
117 if (r->burst == 0
118 || user2credits(r->avg * r->burst) < user2credits(r->avg)) {
119 printk("Overflow in ipt_limit, try lower: %u/%u\n",
120 r->avg, r->burst);
121 return 0;
122 }
123
124 /* User avg in seconds * IPT_LIMIT_SCALE: convert to jiffies *
125 128. */
126 r->prev = jiffies;
127 r->credit = user2credits(r->avg * r->burst); /* Credits full. */
128 r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */
129 r->cost = user2credits(r->avg);
130
131 /* For SMP, we only want to use one set of counters. */
132 r->master = r;
133
134 return 1;
135}
136
137static struct ipt_match ipt_limit_reg = {
138 .name = "limit",
139 .match = ipt_limit_match,
140 .checkentry = ipt_limit_checkentry,
141 .me = THIS_MODULE,
142};
143
144static int __init init(void)
145{
146 if (ipt_register_match(&ipt_limit_reg))
147 return -EINVAL;
148 return 0;
149}
150
151static void __exit fini(void)
152{
153 ipt_unregister_match(&ipt_limit_reg);
154}
155
156module_init(init);
157module_exit(fini);
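
The token-bucket comment above becomes easier to follow with concrete numbers. Assuming HZ of 1000 and IPT_LIMIT_SCALE of 10000 (which makes CREDITS_PER_JIFFY work out to 32), a hypothetical "2 packets/second, burst 3" rule gives cost = 16000 and credit_cap = 48000; the sketch below replays the bookkeeping from ipt_limit_match() with those hand-computed constants:

#include <stdio.h>

/* Hand-computed for the illustration (HZ = 1000, scale = 10000):
 * hypothetical rule of 2 packets per second with burst 3. */
#define CREDITS_PER_JIFFY 32
#define COST       16000   /* user2credits(avg): credits per matched packet */
#define CREDIT_CAP 48000   /* user2credits(avg * burst): bucket size */

int main(void)
{
        unsigned int credit = CREDIT_CAP;  /* bucket starts full */
        unsigned long prev = 0, now = 0;   /* fake jiffies clock */
        int i;

        /* five back-to-back packets: only the first three pass */
        for (i = 0; i < 5; i++) {
                credit += (now - prev) * CREDITS_PER_JIFFY;
                if (credit > CREDIT_CAP)
                        credit = CREDIT_CAP;
                prev = now;
                if (credit >= COST) {
                        credit -= COST;
                        printf("packet %d: pass (credit now %u)\n", i, credit);
                } else {
                        printf("packet %d: over limit\n", i);
                }
        }

        /* 500 jiffies later one packet's worth of credit (500 * 32) is back */
        now += 500;
        credit += (now - prev) * CREDITS_PER_JIFFY;
        if (credit > CREDIT_CAP)
                credit = CREDIT_CAP;
        printf("after 500 jiffies: credit %u -> %s\n", credit,
               credit >= COST ? "pass" : "over limit");
        return 0;
}
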
diff --git a/net/ipv4/netfilter/ipt_mac.c b/net/ipv4/netfilter/ipt_mac.c
new file mode 100644
index 000000000000..11a459e33f25
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_mac.c
@@ -0,0 +1,79 @@
1/* Kernel module to match MAC address parameters. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/if_ether.h>
14
15#include <linux/netfilter_ipv4/ipt_mac.h>
16#include <linux/netfilter_ipv4/ip_tables.h>
17
18MODULE_LICENSE("GPL");
19MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
20MODULE_DESCRIPTION("iptables mac matching module");
21
22static int
23match(const struct sk_buff *skb,
24 const struct net_device *in,
25 const struct net_device *out,
26 const void *matchinfo,
27 int offset,
28 int *hotdrop)
29{
30 const struct ipt_mac_info *info = matchinfo;
31
32 /* Is mac pointer valid? */
33 return (skb->mac.raw >= skb->head
34 && (skb->mac.raw + ETH_HLEN) <= skb->data
35 /* If so, compare... */
36 && ((memcmp(eth_hdr(skb)->h_source, info->srcaddr, ETH_ALEN)
37 == 0) ^ info->invert));
38}
39
40static int
41ipt_mac_checkentry(const char *tablename,
42 const struct ipt_ip *ip,
43 void *matchinfo,
44 unsigned int matchsize,
45 unsigned int hook_mask)
46{
47 /* FORWARD isn't always valid, but it's nice to be able to do --RR */
48 if (hook_mask
49 & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN)
50 | (1 << NF_IP_FORWARD))) {
51 printk("ipt_mac: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n");
52 return 0;
53 }
54
55 if (matchsize != IPT_ALIGN(sizeof(struct ipt_mac_info)))
56 return 0;
57
58 return 1;
59}
60
61static struct ipt_match mac_match = {
62 .name = "mac",
63 .match = &match,
64 .checkentry = &ipt_mac_checkentry,
65 .me = THIS_MODULE,
66};
67
68static int __init init(void)
69{
70 return ipt_register_match(&mac_match);
71}
72
73static void __exit fini(void)
74{
75 ipt_unregister_match(&mac_match);
76}
77
78module_init(init);
79module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c
new file mode 100644
index 000000000000..8955728127b9
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_mark.c
@@ -0,0 +1,64 @@
1/* Kernel module to match NFMARK values. */
2
3/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/module.h>
11#include <linux/skbuff.h>
12
13#include <linux/netfilter_ipv4/ipt_mark.h>
14#include <linux/netfilter_ipv4/ip_tables.h>
15
16MODULE_LICENSE("GPL");
17MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
18MODULE_DESCRIPTION("iptables mark matching module");
19
20static int
21match(const struct sk_buff *skb,
22 const struct net_device *in,
23 const struct net_device *out,
24 const void *matchinfo,
25 int offset,
26 int *hotdrop)
27{
28 const struct ipt_mark_info *info = matchinfo;
29
30 return ((skb->nfmark & info->mask) == info->mark) ^ info->invert;
31}
32
33static int
34checkentry(const char *tablename,
35 const struct ipt_ip *ip,
36 void *matchinfo,
37 unsigned int matchsize,
38 unsigned int hook_mask)
39{
40 if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info)))
41 return 0;
42
43 return 1;
44}
45
46static struct ipt_match mark_match = {
47 .name = "mark",
48 .match = &match,
49 .checkentry = &checkentry,
50 .me = THIS_MODULE,
51};
52
53static int __init init(void)
54{
55 return ipt_register_match(&mark_match);
56}
57
58static void __exit fini(void)
59{
60 ipt_unregister_match(&mark_match);
61}
62
63module_init(init);
64module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_multiport.c b/net/ipv4/netfilter/ipt_multiport.c
new file mode 100644
index 000000000000..99e8188162e2
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_multiport.c
@@ -0,0 +1,212 @@
1/* Kernel module to match one of a list of TCP/UDP ports: ports are in
2 the same place so we can treat them as equal. */
3
4/* (C) 1999-2001 Paul `Rusty' Russell
5 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/module.h>
13#include <linux/types.h>
14#include <linux/udp.h>
15#include <linux/skbuff.h>
16
17#include <linux/netfilter_ipv4/ipt_multiport.h>
18#include <linux/netfilter_ipv4/ip_tables.h>
19
20MODULE_LICENSE("GPL");
21MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
22MODULE_DESCRIPTION("iptables multiple port match module");
23
24#if 0
25#define duprintf(format, args...) printk(format , ## args)
26#else
27#define duprintf(format, args...)
28#endif
29
30/* Returns 1 if the port is matched by the test, 0 otherwise. */
31static inline int
32ports_match(const u_int16_t *portlist, enum ipt_multiport_flags flags,
33 u_int8_t count, u_int16_t src, u_int16_t dst)
34{
35 unsigned int i;
36 for (i=0; i<count; i++) {
37 if (flags != IPT_MULTIPORT_DESTINATION
38 && portlist[i] == src)
39 return 1;
40
41 if (flags != IPT_MULTIPORT_SOURCE
42 && portlist[i] == dst)
43 return 1;
44 }
45
46 return 0;
47}
48
49/* Returns 1 if the port is matched by the test, 0 otherwise. */
50static inline int
51ports_match_v1(const struct ipt_multiport_v1 *minfo,
52 u_int16_t src, u_int16_t dst)
53{
54 unsigned int i;
55 u_int16_t s, e;
56
57 for (i=0; i < minfo->count; i++) {
58 s = minfo->ports[i];
59
60 if (minfo->pflags[i]) {
61 /* range port matching */
62 e = minfo->ports[++i];
63 duprintf("src or dst matches with %d-%d?\n", s, e);
64
65 if (minfo->flags == IPT_MULTIPORT_SOURCE
66 && src >= s && src <= e)
67 return 1 ^ minfo->invert;
68 if (minfo->flags == IPT_MULTIPORT_DESTINATION
69 && dst >= s && dst <= e)
70 return 1 ^ minfo->invert;
71 if (minfo->flags == IPT_MULTIPORT_EITHER
72 && ((dst >= s && dst <= e)
73 || (src >= s && src <= e)))
74 return 1 ^ minfo->invert;
75 } else {
76 /* exact port matching */
77 duprintf("src or dst matches with %d?\n", s);
78
79 if (minfo->flags == IPT_MULTIPORT_SOURCE
80 && src == s)
81 return 1 ^ minfo->invert;
82 if (minfo->flags == IPT_MULTIPORT_DESTINATION
83 && dst == s)
84 return 1 ^ minfo->invert;
85 if (minfo->flags == IPT_MULTIPORT_EITHER
86 && (src == s || dst == s))
87 return 1 ^ minfo->invert;
88 }
89 }
90
91 return minfo->invert;
92}
93
94static int
95match(const struct sk_buff *skb,
96 const struct net_device *in,
97 const struct net_device *out,
98 const void *matchinfo,
99 int offset,
100 int *hotdrop)
101{
102 u16 _ports[2], *pptr;
103 const struct ipt_multiport *multiinfo = matchinfo;
104
105 if (offset)
106 return 0;
107
108 pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
109 sizeof(_ports), _ports);
110 if (pptr == NULL) {
111 /* We've been asked to examine this packet, and we
112 * can't. Hence, no choice but to drop.
113 */
114 duprintf("ipt_multiport:"
115 " Dropping evil offset=0 tinygram.\n");
116 *hotdrop = 1;
117 return 0;
118 }
119
120 return ports_match(multiinfo->ports,
121 multiinfo->flags, multiinfo->count,
122 ntohs(pptr[0]), ntohs(pptr[1]));
123}
124
125static int
126match_v1(const struct sk_buff *skb,
127 const struct net_device *in,
128 const struct net_device *out,
129 const void *matchinfo,
130 int offset,
131 int *hotdrop)
132{
133 u16 _ports[2], *pptr;
134 const struct ipt_multiport_v1 *multiinfo = matchinfo;
135
136 if (offset)
137 return 0;
138
139 pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
140 sizeof(_ports), _ports);
141 if (pptr == NULL) {
142 /* We've been asked to examine this packet, and we
143 * can't. Hence, no choice but to drop.
144 */
145 duprintf("ipt_multiport:"
146 " Dropping evil offset=0 tinygram.\n");
147 *hotdrop = 1;
148 return 0;
149 }
150
151 return ports_match_v1(multiinfo, ntohs(pptr[0]), ntohs(pptr[1]));
152}
153
154/* Called when user tries to insert an entry of this type. */
155static int
156checkentry(const char *tablename,
157 const struct ipt_ip *ip,
158 void *matchinfo,
159 unsigned int matchsize,
160 unsigned int hook_mask)
161{
162 return (matchsize == IPT_ALIGN(sizeof(struct ipt_multiport)));
163}
164
165static int
166checkentry_v1(const char *tablename,
167 const struct ipt_ip *ip,
168 void *matchinfo,
169 unsigned int matchsize,
170 unsigned int hook_mask)
171{
172 return (matchsize == IPT_ALIGN(sizeof(struct ipt_multiport_v1)));
173}
174
175static struct ipt_match multiport_match = {
176 .name = "multiport",
177 .revision = 0,
178 .match = &match,
179 .checkentry = &checkentry,
180 .me = THIS_MODULE,
181};
182
183static struct ipt_match multiport_match_v1 = {
184 .name = "multiport",
185 .revision = 1,
186 .match = &match_v1,
187 .checkentry = &checkentry_v1,
188 .me = THIS_MODULE,
189};
190
191static int __init init(void)
192{
193 int err;
194
195 err = ipt_register_match(&multiport_match);
196 if (!err) {
197 err = ipt_register_match(&multiport_match_v1);
198 if (err)
199 ipt_unregister_match(&multiport_match);
200 }
201
202 return err;
203}
204
205static void __exit fini(void)
206{
207 ipt_unregister_match(&multiport_match);
208 ipt_unregister_match(&multiport_match_v1);
209}
210
211module_init(init);
212module_exit(fini);
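
In the v1 format the ports[] array mixes single ports and ranges: when pflags[i] is set, ports[i] and ports[i+1] form an inclusive range and consume two slots, which is why the loop bumps i a second time. A simplified standalone sketch of that walk (hypothetical port list, ignoring the source/destination/invert handling):

#include <stdio.h>

/* Simplified stand-in for the v1 multiport data: enough to show how ranges
 * and single ports share the ports[] array. */
struct mp {
        unsigned char count;
        unsigned short ports[8];
        unsigned char pflags[8];  /* non-zero: ports[i]..ports[i+1] is a range */
};

static int port_in_list(const struct mp *m, unsigned short p)
{
        unsigned int i;

        for (i = 0; i < m->count; i++) {
                unsigned short s = m->ports[i];

                if (m->pflags[i]) {           /* range entry uses two slots */
                        unsigned short e = m->ports[++i];
                        if (p >= s && p <= e)
                                return 1;
                } else if (p == s) {          /* exact entry */
                        return 1;
                }
        }
        return 0;
}

int main(void)
{
        /* hypothetical list "80,8000:8100,443": four slots, one range */
        struct mp m = {
                .count  = 4,
                .ports  = { 80, 8000, 8100, 443 },
                .pflags = { 0, 1, 0, 0 },
        };

        printf("%d %d %d %d\n",
               port_in_list(&m, 80),    /* 1 */
               port_in_list(&m, 8055),  /* 1: inside the range */
               port_in_list(&m, 8101),  /* 0: just past the range */
               port_in_list(&m, 443));  /* 1 */
        return 0;
}
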
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
new file mode 100644
index 000000000000..3b9065e06381
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -0,0 +1,217 @@
1/* Kernel module to match various things tied to sockets associated with
2 locally generated outgoing packets. */
3
4/* (C) 2000 Marc Boucher <marc@mbsi.ca>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/file.h>
14#include <net/sock.h>
15
16#include <linux/netfilter_ipv4/ipt_owner.h>
17#include <linux/netfilter_ipv4/ip_tables.h>
18
19MODULE_LICENSE("GPL");
20MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
21MODULE_DESCRIPTION("iptables owner match");
22
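/* Illustrative usage, not part of the original module: the owner match is
 * meant for locally generated traffic, e.g.
 *   iptables -A OUTPUT -m owner --uid-owner 1000 -j ACCEPT
 *   iptables -A OUTPUT -m owner --cmd-owner wget -j LOG
 * The uid and command name are made-up examples; see checkentry() below for
 * the hook and SMP restrictions.
 */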
23static int
24match_comm(const struct sk_buff *skb, const char *comm)
25{
26 struct task_struct *g, *p;
27 struct files_struct *files;
28 int i;
29
30 read_lock(&tasklist_lock);
31 do_each_thread(g, p) {
32 if(strncmp(p->comm, comm, sizeof(p->comm)))
33 continue;
34
35 task_lock(p);
36 files = p->files;
37 if(files) {
38 spin_lock(&files->file_lock);
39 for (i=0; i < files->max_fds; i++) {
40 if (fcheck_files(files, i) ==
41 skb->sk->sk_socket->file) {
42 spin_unlock(&files->file_lock);
43 task_unlock(p);
44 read_unlock(&tasklist_lock);
45 return 1;
46 }
47 }
48 spin_unlock(&files->file_lock);
49 }
50 task_unlock(p);
51 } while_each_thread(g, p);
52 read_unlock(&tasklist_lock);
53 return 0;
54}
55
56static int
57match_pid(const struct sk_buff *skb, pid_t pid)
58{
59 struct task_struct *p;
60 struct files_struct *files;
61 int i;
62
63 read_lock(&tasklist_lock);
64 p = find_task_by_pid(pid);
65 if (!p)
66 goto out;
67 task_lock(p);
68 files = p->files;
69 if(files) {
70 spin_lock(&files->file_lock);
71 for (i=0; i < files->max_fds; i++) {
72 if (fcheck_files(files, i) ==
73 skb->sk->sk_socket->file) {
74 spin_unlock(&files->file_lock);
75 task_unlock(p);
76 read_unlock(&tasklist_lock);
77 return 1;
78 }
79 }
80 spin_unlock(&files->file_lock);
81 }
82 task_unlock(p);
83out:
84 read_unlock(&tasklist_lock);
85 return 0;
86}
87
88static int
89match_sid(const struct sk_buff *skb, pid_t sid)
90{
91 struct task_struct *g, *p;
92 struct file *file = skb->sk->sk_socket->file;
93 int i, found=0;
94
95 read_lock(&tasklist_lock);
96 do_each_thread(g, p) {
97 struct files_struct *files;
98 if (p->signal->session != sid)
99 continue;
100
101 task_lock(p);
102 files = p->files;
103 if (files) {
104 spin_lock(&files->file_lock);
105 for (i=0; i < files->max_fds; i++) {
106 if (fcheck_files(files, i) == file) {
107 found = 1;
108 break;
109 }
110 }
111 spin_unlock(&files->file_lock);
112 }
113 task_unlock(p);
114 if (found)
115 goto out;
116 } while_each_thread(g, p);
117out:
118 read_unlock(&tasklist_lock);
119
120 return found;
121}
122
123static int
124match(const struct sk_buff *skb,
125 const struct net_device *in,
126 const struct net_device *out,
127 const void *matchinfo,
128 int offset,
129 int *hotdrop)
130{
131 const struct ipt_owner_info *info = matchinfo;
132
133 if (!skb->sk || !skb->sk->sk_socket || !skb->sk->sk_socket->file)
134 return 0;
135
136 if(info->match & IPT_OWNER_UID) {
137 if ((skb->sk->sk_socket->file->f_uid != info->uid) ^
138 !!(info->invert & IPT_OWNER_UID))
139 return 0;
140 }
141
142 if(info->match & IPT_OWNER_GID) {
143 if ((skb->sk->sk_socket->file->f_gid != info->gid) ^
144 !!(info->invert & IPT_OWNER_GID))
145 return 0;
146 }
147
148 if(info->match & IPT_OWNER_PID) {
149 if (!match_pid(skb, info->pid) ^
150 !!(info->invert & IPT_OWNER_PID))
151 return 0;
152 }
153
154 if(info->match & IPT_OWNER_SID) {
155 if (!match_sid(skb, info->sid) ^
156 !!(info->invert & IPT_OWNER_SID))
157 return 0;
158 }
159
160 if(info->match & IPT_OWNER_COMM) {
161 if (!match_comm(skb, info->comm) ^
162 !!(info->invert & IPT_OWNER_COMM))
163 return 0;
164 }
165
166 return 1;
167}
168
169static int
170checkentry(const char *tablename,
171 const struct ipt_ip *ip,
172 void *matchinfo,
173 unsigned int matchsize,
174 unsigned int hook_mask)
175{
176 if (hook_mask
177 & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) {
178 printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n");
179 return 0;
180 }
181
182 if (matchsize != IPT_ALIGN(sizeof(struct ipt_owner_info))) {
183 printk("Matchsize %u != %Zu\n", matchsize,
184 IPT_ALIGN(sizeof(struct ipt_owner_info)));
185 return 0;
186 }
187#ifdef CONFIG_SMP
188 /* files->file_lock can not be used in a BH */
189 if (((struct ipt_owner_info *)matchinfo)->match
190 & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
191 printk("ipt_owner: pid, sid and command matching is broken "
192 "on SMP.\n");
193 return 0;
194 }
195#endif
196 return 1;
197}
198
199static struct ipt_match owner_match = {
200 .name = "owner",
201 .match = &match,
202 .checkentry = &checkentry,
203 .me = THIS_MODULE,
204};
205
206static int __init init(void)
207{
208 return ipt_register_match(&owner_match);
209}
210
211static void __exit fini(void)
212{
213 ipt_unregister_match(&owner_match);
214}
215
216module_init(init);
217module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c
new file mode 100644
index 000000000000..1a53924041fc
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_physdev.c
@@ -0,0 +1,134 @@
1/* Kernel module to match the bridge port in and
2 * out device for IP packets coming into contact with a bridge. */
3
4/* (C) 2001-2003 Bart De Schuymer <bdschuym@pandora.be>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/netfilter_ipv4/ipt_physdev.h>
14#include <linux/netfilter_ipv4/ip_tables.h>
15#include <linux/netfilter_bridge.h>
16#define MATCH 1
17#define NOMATCH 0
18
19MODULE_LICENSE("GPL");
20MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
21MODULE_DESCRIPTION("iptables bridge physical device match module");
22
23static int
24match(const struct sk_buff *skb,
25 const struct net_device *in,
26 const struct net_device *out,
27 const void *matchinfo,
28 int offset,
29 int *hotdrop)
30{
31 int i;
32 static const char nulldevname[IFNAMSIZ];
33 const struct ipt_physdev_info *info = matchinfo;
34 unsigned int ret;
35 const char *indev, *outdev;
36 struct nf_bridge_info *nf_bridge;
37
38 /* Not a bridged IP packet or no info available yet:
39 * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if
40 * the destination device will be a bridge. */
41 if (!(nf_bridge = skb->nf_bridge)) {
42 /* Return MATCH if the invert flags of the used options are on */
43 if ((info->bitmask & IPT_PHYSDEV_OP_BRIDGED) &&
44 !(info->invert & IPT_PHYSDEV_OP_BRIDGED))
45 return NOMATCH;
46 if ((info->bitmask & IPT_PHYSDEV_OP_ISIN) &&
47 !(info->invert & IPT_PHYSDEV_OP_ISIN))
48 return NOMATCH;
49 if ((info->bitmask & IPT_PHYSDEV_OP_ISOUT) &&
50 !(info->invert & IPT_PHYSDEV_OP_ISOUT))
51 return NOMATCH;
52 if ((info->bitmask & IPT_PHYSDEV_OP_IN) &&
53 !(info->invert & IPT_PHYSDEV_OP_IN))
54 return NOMATCH;
55 if ((info->bitmask & IPT_PHYSDEV_OP_OUT) &&
56 !(info->invert & IPT_PHYSDEV_OP_OUT))
57 return NOMATCH;
58 return MATCH;
59 }
60
61 /* This only makes sense in the FORWARD and POSTROUTING chains */
62 if ((info->bitmask & IPT_PHYSDEV_OP_BRIDGED) &&
63 (!!(nf_bridge->mask & BRNF_BRIDGED) ^
64 !(info->invert & IPT_PHYSDEV_OP_BRIDGED)))
65 return NOMATCH;
66
67 if ((info->bitmask & IPT_PHYSDEV_OP_ISIN &&
68 (!nf_bridge->physindev ^ !!(info->invert & IPT_PHYSDEV_OP_ISIN))) ||
69 (info->bitmask & IPT_PHYSDEV_OP_ISOUT &&
70 (!nf_bridge->physoutdev ^ !!(info->invert & IPT_PHYSDEV_OP_ISOUT))))
71 return NOMATCH;
72
73 if (!(info->bitmask & IPT_PHYSDEV_OP_IN))
74 goto match_outdev;
75 indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname;
76 for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) {
77 ret |= (((const unsigned int *)indev)[i]
78 ^ ((const unsigned int *)info->physindev)[i])
79 & ((const unsigned int *)info->in_mask)[i];
80 }
81
82 if ((ret == 0) ^ !(info->invert & IPT_PHYSDEV_OP_IN))
83 return NOMATCH;
84
85match_outdev:
86 if (!(info->bitmask & IPT_PHYSDEV_OP_OUT))
87 return MATCH;
88 outdev = nf_bridge->physoutdev ?
89 nf_bridge->physoutdev->name : nulldevname;
90 for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) {
91 ret |= (((const unsigned int *)outdev)[i]
92 ^ ((const unsigned int *)info->physoutdev)[i])
93 & ((const unsigned int *)info->out_mask)[i];
94 }
95
96 return (ret != 0) ^ !(info->invert & IPT_PHYSDEV_OP_OUT);
97}
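/* Illustrative note (an assumption, not spelled out in this file): the
 * word-wise XOR/AND loops above implement a masked name compare, which is how
 * userspace expresses prefix wildcards, e.g.
 *   iptables -A FORWARD -m physdev --physdev-in eth+ -j ACCEPT
 * where only the bytes covered by in_mask (the "eth" prefix) are compared.
 */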
98
99static int
100checkentry(const char *tablename,
101 const struct ipt_ip *ip,
102 void *matchinfo,
103 unsigned int matchsize,
104 unsigned int hook_mask)
105{
106 const struct ipt_physdev_info *info = matchinfo;
107
108 if (matchsize != IPT_ALIGN(sizeof(struct ipt_physdev_info)))
109 return 0;
110 if (!(info->bitmask & IPT_PHYSDEV_OP_MASK) ||
111 info->bitmask & ~IPT_PHYSDEV_OP_MASK)
112 return 0;
113 return 1;
114}
115
116static struct ipt_match physdev_match = {
117 .name = "physdev",
118 .match = &match,
119 .checkentry = &checkentry,
120 .me = THIS_MODULE,
121};
122
123static int __init init(void)
124{
125 return ipt_register_match(&physdev_match);
126}
127
128static void __exit fini(void)
129{
130 ipt_unregister_match(&physdev_match);
131}
132
133module_init(init);
134module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_pkttype.c b/net/ipv4/netfilter/ipt_pkttype.c
new file mode 100644
index 000000000000..8ddb1dc5e5ae
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_pkttype.c
@@ -0,0 +1,70 @@
1/* (C) 1999-2001 Michal Ludvig <michal@logix.cz>
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License version 2 as
5 * published by the Free Software Foundation.
6 */
7
8#include <linux/module.h>
9#include <linux/skbuff.h>
10#include <linux/if_ether.h>
11#include <linux/if_packet.h>
12
13#include <linux/netfilter_ipv4/ipt_pkttype.h>
14#include <linux/netfilter_ipv4/ip_tables.h>
15
16MODULE_LICENSE("GPL");
17MODULE_AUTHOR("Michal Ludvig <michal@logix.cz>");
18MODULE_DESCRIPTION("IP tables match to match on linklayer packet type");
19
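/* Illustrative usage, not part of the original file:
 *   iptables -A INPUT -m pkttype --pkt-type broadcast -j DROP
 * Userspace maps unicast/broadcast/multicast onto the PACKET_* values
 * compared against skb->pkt_type below.
 */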
20static int match(const struct sk_buff *skb,
21 const struct net_device *in,
22 const struct net_device *out,
23 const void *matchinfo,
24 int offset,
25 int *hotdrop)
26{
27 const struct ipt_pkttype_info *info = matchinfo;
28
29 return (skb->pkt_type == info->pkttype) ^ info->invert;
30}
31
32static int checkentry(const char *tablename,
33 const struct ipt_ip *ip,
34 void *matchinfo,
35 unsigned int matchsize,
36 unsigned int hook_mask)
37{
38/*
39 if (hook_mask
40 & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN)
41 | (1 << NF_IP_FORWARD))) {
42 printk("ipt_pkttype: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n");
43 return 0;
44 }
45*/
46 if (matchsize != IPT_ALIGN(sizeof(struct ipt_pkttype_info)))
47 return 0;
48
49 return 1;
50}
51
52static struct ipt_match pkttype_match = {
53 .name = "pkttype",
54 .match = &match,
55 .checkentry = &checkentry,
56 .me = THIS_MODULE,
57};
58
59static int __init init(void)
60{
61 return ipt_register_match(&pkttype_match);
62}
63
64static void __exit fini(void)
65{
66 ipt_unregister_match(&pkttype_match);
67}
68
69module_init(init);
70module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_realm.c b/net/ipv4/netfilter/ipt_realm.c
new file mode 100644
index 000000000000..54a6897ebaa6
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_realm.c
@@ -0,0 +1,76 @@
1/* IP tables module for matching the routing realm
2 *
3 * $Id: ipt_realm.c,v 1.3 2004/03/05 13:25:40 laforge Exp $
4 *
5 * (C) 2003 by Sampsa Ranta <sampsa@netsonic.fi>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/module.h>
13#include <linux/skbuff.h>
14#include <linux/netdevice.h>
15#include <net/route.h>
16
17#include <linux/netfilter_ipv4/ipt_realm.h>
18#include <linux/netfilter_ipv4/ip_tables.h>
19
20MODULE_AUTHOR("Sampsa Ranta <sampsa@netsonic.fi>");
21MODULE_LICENSE("GPL");
22MODULE_DESCRIPTION("iptables realm match");
23
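/* Illustrative usage, not part of the original file: realms come from the
 * routing table, e.g. with iproute2
 *   ip route add 10.1.0.0/16 via 192.168.0.1 realm 3
 * and can then be matched with
 *   iptables -A FORWARD -m realm --realm 3 -j ACCEPT
 * The addresses and realm number are made up for illustration.
 */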
24static int
25match(const struct sk_buff *skb,
26 const struct net_device *in,
27 const struct net_device *out,
28 const void *matchinfo,
29 int offset,
30 int *hotdrop)
31{
32 const struct ipt_realm_info *info = matchinfo;
33 struct dst_entry *dst = skb->dst;
34
35 return (info->id == (dst->tclassid & info->mask)) ^ info->invert;
36}
37
38static int check(const char *tablename,
39 const struct ipt_ip *ip,
40 void *matchinfo,
41 unsigned int matchsize,
42 unsigned int hook_mask)
43{
44 if (hook_mask
45 & ~((1 << NF_IP_POST_ROUTING) | (1 << NF_IP_FORWARD) |
46 (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_LOCAL_IN))) {
47 printk("ipt_realm: only valid for POST_ROUTING, LOCAL_OUT, "
48 "LOCAL_IN or FORWARD.\n");
49 return 0;
50 }
51 if (matchsize != IPT_ALIGN(sizeof(struct ipt_realm_info))) {
52 printk("ipt_realm: invalid matchsize.\n");
53 return 0;
54 }
55 return 1;
56}
57
58static struct ipt_match realm_match = {
59 .name = "realm",
60 .match = match,
61 .checkentry = check,
62 .me = THIS_MODULE
63};
64
65static int __init init(void)
66{
67 return ipt_register_match(&realm_match);
68}
69
70static void __exit fini(void)
71{
72 ipt_unregister_match(&realm_match);
73}
74
75module_init(init);
76module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
new file mode 100644
index 000000000000..25ab9fabdcba
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -0,0 +1,1002 @@
1/* Kernel module to check if the source address has been seen recently. */
2/* Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org */
3/* Author: Stephen Frost <sfrost@snowman.net> */
4/* Project Page: http://snowman.net/projects/ipt_recent/ */
5/* This software is distributed under the terms of the GPL, Version 2 */
6/* This copyright does not cover user programs that use kernel services
7 * by normal system calls. */
8
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <linux/proc_fs.h>
12#include <linux/spinlock.h>
13#include <linux/interrupt.h>
14#include <asm/uaccess.h>
15#include <linux/ctype.h>
16#include <linux/ip.h>
17#include <linux/vmalloc.h>
18#include <linux/moduleparam.h>
19
20#include <linux/netfilter_ipv4/ip_tables.h>
21#include <linux/netfilter_ipv4/ipt_recent.h>
22
23#undef DEBUG
24#define HASH_LOG 9
25
26/* Defaults, these can be overridden on the module command-line. */
27static int ip_list_tot = 100;
28static int ip_pkt_list_tot = 20;
29static int ip_list_hash_size = 0;
30static int ip_list_perms = 0644;
31#ifdef DEBUG
32static int debug = 1;
33#endif
34
35static char version[] =
36KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost <sfrost@snowman.net>. http://snowman.net/projects/ipt_recent/\n";
37
38MODULE_AUTHOR("Stephen Frost <sfrost@snowman.net>");
39MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER);
40MODULE_LICENSE("GPL");
41module_param(ip_list_tot, int, 0400);
42module_param(ip_pkt_list_tot, int, 0400);
43module_param(ip_list_hash_size, int, 0400);
44module_param(ip_list_perms, int, 0400);
45#ifdef DEBUG
46module_param(debug, int, 0600);
47MODULE_PARM_DESC(debug,"debugging level, defaults to 1");
48#endif
49MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list");
50MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember");
51MODULE_PARM_DESC(ip_list_hash_size,"size of hash table used to look up IPs");
52MODULE_PARM_DESC(ip_list_perms,"permissions on /proc/net/ipt_recent/* files");
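/* Illustrative module load, not part of the original file: the parameters
 * above can be overridden at load time, e.g.
 *   modprobe ipt_recent ip_list_tot=200 ip_pkt_list_tot=40
 * which tracks up to 200 addresses with 40 packet timestamps each.
 */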
53
54/* Structure of our list of recently seen addresses. */
55struct recent_ip_list {
56 u_int32_t addr;
57 u_int8_t ttl;
58 unsigned long last_seen;
59 unsigned long *last_pkts;
60 u_int32_t oldest_pkt;
61 u_int32_t hash_entry;
62 u_int32_t time_pos;
63};
64
65struct time_info_list {
66 u_int32_t position;
67 u_int32_t time;
68};
69
70/* Structure of our linked list of tables of recent lists. */
71struct recent_ip_tables {
72 char name[IPT_RECENT_NAME_LEN];
73 int count;
74 int time_pos;
75 struct recent_ip_list *table;
76 struct recent_ip_tables *next;
77 spinlock_t list_lock;
78 int *hash_table;
79 struct time_info_list *time_info;
80#ifdef CONFIG_PROC_FS
81 struct proc_dir_entry *status_proc;
82#endif /* CONFIG_PROC_FS */
83};
84
85/* Our current list of addresses we have recently seen.
86 * Only added to on a --set, and only updated on --set || --update
87 */
88static struct recent_ip_tables *r_tables = NULL;
89
90/* We protect r_list with this spinlock so two processors are not modifying
91 * the list at the same time.
92 */
93static DEFINE_SPINLOCK(recent_lock);
94
95#ifdef CONFIG_PROC_FS
96/* Our /proc/net/ipt_recent entry */
97static struct proc_dir_entry *proc_net_ipt_recent = NULL;
98#endif
99
100/* Function declaration for later. */
101static int
102match(const struct sk_buff *skb,
103 const struct net_device *in,
104 const struct net_device *out,
105 const void *matchinfo,
106 int offset,
107 int *hotdrop);
108
109/* Function to hash a given address into the hash table of table_size size */
110static int hash_func(unsigned int addr, int table_size)
111{
112 int result = 0;
113 unsigned int value = addr;
114 do { result ^= value; } while((value >>= HASH_LOG));
115
116#ifdef DEBUG
117 if(debug) printk(KERN_INFO RECENT_NAME ": %d = hash_func(%u,%d)\n",
118 result & (table_size - 1),
119 addr,
120 table_size);
121#endif
122
123 return(result & (table_size - 1));
124}
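/* Worked example, added for illustration: with HASH_LOG = 9,
 * addr = 0x0a000001 and table_size = 512, the loop XOR-folds the address in
 * 9-bit steps,
 *   0x0a000001 ^ 0x00050000 ^ 0x00000280 ^ 0x00000001 = 0x0a050280,
 * and the function returns 0x0a050280 & 0x1ff = 0x80.  table_size is assumed
 * to be a power of two, as enforced in init() below.
 */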
125
126#ifdef CONFIG_PROC_FS
127/* This is the function which produces the output for our /proc output
128 * interface which lists each IP address, the last seen time and the
129 * other recent times the address was seen.
130 */
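/* Example of one output line, following the sprintf formats below (the values
 * are invented for illustration):
 *   src=192.168.0.5 ttl: 64 last_seen: 4323010 oldest_pkt: 2 last_pkts: 4322700, 4322950
 */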
131
132static int ip_recent_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data)
133{
134 int len = 0, count, last_len = 0, pkt_count;
135 off_t pos = 0;
136 off_t begin = 0;
137 struct recent_ip_tables *curr_table;
138
139 curr_table = (struct recent_ip_tables*) data;
140
141 spin_lock_bh(&curr_table->list_lock);
142 for(count = 0; count < ip_list_tot; count++) {
143 if(!curr_table->table[count].addr) continue;
144 last_len = len;
145 len += sprintf(buffer+len,"src=%u.%u.%u.%u ",NIPQUAD(curr_table->table[count].addr));
146 len += sprintf(buffer+len,"ttl: %u ",curr_table->table[count].ttl);
147 len += sprintf(buffer+len,"last_seen: %lu ",curr_table->table[count].last_seen);
148 len += sprintf(buffer+len,"oldest_pkt: %u ",curr_table->table[count].oldest_pkt);
149 len += sprintf(buffer+len,"last_pkts: %lu",curr_table->table[count].last_pkts[0]);
150 for(pkt_count = 1; pkt_count < ip_pkt_list_tot; pkt_count++) {
151 if(!curr_table->table[count].last_pkts[pkt_count]) break;
152 len += sprintf(buffer+len,", %lu",curr_table->table[count].last_pkts[pkt_count]);
153 }
154 len += sprintf(buffer+len,"\n");
155 pos = begin + len;
156 if(pos < offset) { len = 0; begin = pos; }
157 if(pos > offset + length) { len = last_len; break; }
158 }
159
160 *start = buffer + (offset - begin);
161 len -= (offset - begin);
162 if(len > length) len = length;
163
164 spin_unlock_bh(&curr_table->list_lock);
165 return len;
166}
167
168/* ip_recent_ctrl provides an interface for users to modify the table
169 * directly. This allows adding entries, removing entries, and
170 * flushing the entire table.
171 * This is done by opening up the appropriate table for writing and
172 * sending one of:
173 * xx.xx.xx.xx -- Add entry to table with current time
174 * +xx.xx.xx.xx -- Add entry to table with current time
175 * -xx.xx.xx.xx -- Remove entry from table
176 * clear -- Flush table, remove all entries
177 */
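/* Illustrative shell usage, not part of the original file, assuming a table
 * named "badguys" was created by a rule using --name badguys:
 *   echo +10.0.0.1 > /proc/net/ipt_recent/badguys   (add or update 10.0.0.1)
 *   echo -10.0.0.1 > /proc/net/ipt_recent/badguys   (remove 10.0.0.1)
 *   echo clear     > /proc/net/ipt_recent/badguys   (flush the table)
 */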
178
179static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned long size, void *data)
180{
181 static const u_int32_t max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff };
182 u_int32_t val;
183 int base, used = 0;
184 char c, *cp;
185 union iaddr {
186 uint8_t bytes[4];
187 uint32_t word;
188 } res;
189 uint8_t *pp = res.bytes;
190 int digit;
191
192 char buffer[20];
193 int len, check_set = 0, count;
194 u_int32_t addr = 0;
195 struct sk_buff *skb;
196 struct ipt_recent_info *info;
197 struct recent_ip_tables *curr_table;
198
199 curr_table = (struct recent_ip_tables*) data;
200
201 if(size > 20) len = 20; else len = size;
202
203 if(copy_from_user(buffer,input,len)) return -EFAULT;
204
205 if(len < 20) buffer[len] = '\0';
206
207#ifdef DEBUG
208 if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl len: %d, input: `%.20s'\n",len,buffer);
209#endif
210
211 cp = buffer;
212 while(isspace(*cp)) { cp++; used++; if(used >= len-5) return used; }
213
214 /* Check if we are asked to flush the entire table */
215 if(!memcmp(cp,"clear",5)) {
216 used += 5;
217 spin_lock_bh(&curr_table->list_lock);
218 curr_table->time_pos = 0;
219 for(count = 0; count < ip_list_hash_size; count++) {
220 curr_table->hash_table[count] = -1;
221 }
222 for(count = 0; count < ip_list_tot; count++) {
223 curr_table->table[count].last_seen = 0;
224 curr_table->table[count].addr = 0;
225 curr_table->table[count].ttl = 0;
226 memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t));
227 curr_table->table[count].oldest_pkt = 0;
228 curr_table->table[count].time_pos = 0;
229 curr_table->time_info[count].position = count;
230 curr_table->time_info[count].time = 0;
231 }
232 spin_unlock_bh(&curr_table->list_lock);
233 return used;
234 }
235
236 check_set = IPT_RECENT_SET;
237 switch(*cp) {
238 case '+': check_set = IPT_RECENT_SET; cp++; used++; break;
239 case '-': check_set = IPT_RECENT_REMOVE; cp++; used++; break;
240 default: if(!isdigit(*cp)) return (used+1); break;
241 }
242
243#ifdef DEBUG
244 if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl cp: `%c', check_set: %d\n",*cp,check_set);
245#endif
246 /* Get addr (effectively inet_aton()) */
247	/* Shamelessly stolen from libc; a function in the kernel for doing
248 * this would, of course, be greatly preferred, but our options appear
249 * to be rather limited, so we will just do it ourselves here.
250 */
251 res.word = 0;
252
253 c = *cp;
254 for(;;) {
255 if(!isdigit(c)) return used;
256 val = 0; base = 10; digit = 0;
257 if(c == '0') {
258 c = *++cp;
259 if(c == 'x' || c == 'X') base = 16, c = *++cp;
260 else { base = 8; digit = 1; }
261 }
262 for(;;) {
263 if(isascii(c) && isdigit(c)) {
264 if(base == 8 && (c == '8' || c == '0')) return used;
265 val = (val * base) + (c - '0');
266 c = *++cp;
267 digit = 1;
268 } else if(base == 16 && isascii(c) && isxdigit(c)) {
269 val = (val << 4) | (c + 10 - (islower(c) ? 'a' : 'A'));
270 c = *++cp;
271 digit = 1;
272 } else break;
273 }
274 if(c == '.') {
275 if(pp > res.bytes + 2 || val > 0xff) return used;
276 *pp++ = val;
277 c = *++cp;
278 } else break;
279 }
280 used = cp - buffer;
281 if(c != '\0' && (!isascii(c) || !isspace(c))) return used;
282 if(c == '\n') used++;
283 if(!digit) return used;
284
285 if(val > max[pp - res.bytes]) return used;
286 addr = res.word | htonl(val);
287
288 if(!addr && check_set == IPT_RECENT_SET) return used;
289
290#ifdef DEBUG
291 if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl c: %c, addr: %u used: %d\n",c,addr,used);
292#endif
293
294 /* Set up and just call match */
295 info = kmalloc(sizeof(struct ipt_recent_info),GFP_KERNEL);
296 if(!info) { return -ENOMEM; }
297 info->seconds = 0;
298 info->hit_count = 0;
299 info->check_set = check_set;
300 info->invert = 0;
301 info->side = IPT_RECENT_SOURCE;
302 strncpy(info->name,curr_table->name,IPT_RECENT_NAME_LEN);
303 info->name[IPT_RECENT_NAME_LEN-1] = '\0';
304
305 skb = kmalloc(sizeof(struct sk_buff),GFP_KERNEL);
306 if (!skb) {
307 used = -ENOMEM;
308 goto out_free_info;
309 }
310 skb->nh.iph = kmalloc(sizeof(struct iphdr),GFP_KERNEL);
311 if (!skb->nh.iph) {
312 used = -ENOMEM;
313 goto out_free_skb;
314 }
315
316 skb->nh.iph->saddr = addr;
317 skb->nh.iph->daddr = 0;
318 /* Clear ttl since we have no way of knowing it */
319 skb->nh.iph->ttl = 0;
320 match(skb,NULL,NULL,info,0,NULL);
321
322 kfree(skb->nh.iph);
323out_free_skb:
324 kfree(skb);
325out_free_info:
326 kfree(info);
327
328#ifdef DEBUG
329 if(debug) printk(KERN_INFO RECENT_NAME ": Leaving ip_recent_ctrl addr: %u used: %d\n",addr,used);
330#endif
331 return used;
332}
333
334#endif /* CONFIG_PROC_FS */
335
336/* 'match' is our primary function, called by the kernel whenever a rule is
337 * hit with our module as an option to it.
338 * What this function does depends on what was specifically asked of it by
339 * the user:
340 * --set -- Add or update last seen time of the source address of the packet
341 * -- matchinfo->check_set == IPT_RECENT_SET
342 * --rcheck -- Just check if the source address is in the list
343 * -- matchinfo->check_set == IPT_RECENT_CHECK
344 * --update -- If the source address is in the list, update last_seen
345 * -- matchinfo->check_set == IPT_RECENT_UPDATE
346 * --remove -- If the source address is in the list, remove it
347 * -- matchinfo->check_set == IPT_RECENT_REMOVE
348 * --seconds -- Option to --rcheck/--update, only match if last_seen within seconds
349 * -- matchinfo->seconds
350 * --hitcount -- Option to --rcheck/--update, only match if seen hitcount times
351 * -- matchinfo->hit_count
352 * --seconds and --hitcount can be combined
353 */
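/* Illustrative rule pair, not part of the original file, showing how the
 * options above are commonly combined (the list name and numbers are made up):
 *   iptables -A INPUT -p tcp --dport 22 -m recent --name ssh --set
 *   iptables -A INPUT -p tcp --dport 22 -m recent --name ssh \
 *            --update --seconds 60 --hitcount 4 -j DROP
 */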
354static int
355match(const struct sk_buff *skb,
356 const struct net_device *in,
357 const struct net_device *out,
358 const void *matchinfo,
359 int offset,
360 int *hotdrop)
361{
362 int pkt_count, hits_found, ans;
363 unsigned long now;
364 const struct ipt_recent_info *info = matchinfo;
365 u_int32_t addr = 0, time_temp;
366 u_int8_t ttl = skb->nh.iph->ttl;
367 int *hash_table;
368 int orig_hash_result, hash_result, temp, location = 0, time_loc, end_collision_chain = -1;
369 struct time_info_list *time_info;
370 struct recent_ip_tables *curr_table;
371 struct recent_ip_tables *last_table;
372 struct recent_ip_list *r_list;
373
374#ifdef DEBUG
375 if(debug) printk(KERN_INFO RECENT_NAME ": match() called\n");
376#endif
377
378 /* Default is false ^ info->invert */
379 ans = info->invert;
380
381#ifdef DEBUG
382 if(debug) printk(KERN_INFO RECENT_NAME ": match(): name = '%s'\n",info->name);
383#endif
384
385 /* if out != NULL then routing has been done and TTL changed.
386	 * We change it back here internally to match what came in before routing. */
387 if(out) ttl++;
388
389 /* Find the right table */
390 spin_lock_bh(&recent_lock);
391 curr_table = r_tables;
392 while( (last_table = curr_table) && strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (curr_table = curr_table->next) );
393
394#ifdef DEBUG
395 if(debug) printk(KERN_INFO RECENT_NAME ": match(): table found('%s')\n",info->name);
396#endif
397
398 spin_unlock_bh(&recent_lock);
399
400 /* Table with this name not found, match impossible */
401 if(!curr_table) { return ans; }
402
403 /* Make sure no one is changing the list while we work with it */
404 spin_lock_bh(&curr_table->list_lock);
405
406 r_list = curr_table->table;
407 if(info->side == IPT_RECENT_DEST) addr = skb->nh.iph->daddr; else addr = skb->nh.iph->saddr;
408
409 if(!addr) {
410#ifdef DEBUG
411 if(debug) printk(KERN_INFO RECENT_NAME ": match() address (%u) invalid, leaving.\n",addr);
412#endif
413 spin_unlock_bh(&curr_table->list_lock);
414 return ans;
415 }
416
417#ifdef DEBUG
418 if(debug) printk(KERN_INFO RECENT_NAME ": match(): checking table, addr: %u, ttl: %u, orig_ttl: %u\n",addr,ttl,skb->nh.iph->ttl);
419#endif
420
421 /* Get jiffies now in case they changed while we were waiting for a lock */
422 now = jiffies;
423 hash_table = curr_table->hash_table;
424 time_info = curr_table->time_info;
425
426 orig_hash_result = hash_result = hash_func(addr,ip_list_hash_size);
427 /* Hash entry at this result used */
428 /* Check for TTL match if requested. If TTL is zero then a match would never
429 * happen, so match regardless of existing TTL in that case. Zero means the
430 * entry was added via the /proc interface anyway, so we will just use the
431 * first TTL we get for that IP address. */
432 if(info->check_set & IPT_RECENT_TTL) {
433 while(hash_table[hash_result] != -1 && !(r_list[hash_table[hash_result]].addr == addr &&
434 (!r_list[hash_table[hash_result]].ttl || r_list[hash_table[hash_result]].ttl == ttl))) {
435 /* Collision in hash table */
436 hash_result = (hash_result + 1) % ip_list_hash_size;
437 }
438 } else {
439 while(hash_table[hash_result] != -1 && r_list[hash_table[hash_result]].addr != addr) {
440 /* Collision in hash table */
441 hash_result = (hash_result + 1) % ip_list_hash_size;
442 }
443 }
444
445 if(hash_table[hash_result] == -1 && !(info->check_set & IPT_RECENT_SET)) {
446 /* IP not in list and not asked to SET */
447 spin_unlock_bh(&curr_table->list_lock);
448 return ans;
449 }
450
451 /* Check if we need to handle the collision, do not need to on REMOVE */
452 if(orig_hash_result != hash_result && !(info->check_set & IPT_RECENT_REMOVE)) {
453#ifdef DEBUG
454 if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision in hash table. (or: %d,hr: %d,oa: %u,ha: %u)\n",
455 orig_hash_result,
456 hash_result,
457 r_list[hash_table[orig_hash_result]].addr,
458 addr);
459#endif
460
461 /* We had a collision.
462 * orig_hash_result is where we started, hash_result is where we ended up.
463 * So, swap them because we are likely to see the same guy again sooner */
464#ifdef DEBUG
465 if(debug) {
466 printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[orig_hash_result] = %d\n",hash_table[orig_hash_result]);
467 printk(KERN_INFO RECENT_NAME ": match(): Collision; r_list[hash_table[orig_hash_result]].hash_entry = %d\n",
468 r_list[hash_table[orig_hash_result]].hash_entry);
469 }
470#endif
471
472 r_list[hash_table[orig_hash_result]].hash_entry = hash_result;
473
474
475 temp = hash_table[orig_hash_result];
476#ifdef DEBUG
477 if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[hash_result] = %d\n",hash_table[hash_result]);
478#endif
479 hash_table[orig_hash_result] = hash_table[hash_result];
480 hash_table[hash_result] = temp;
481 temp = hash_result;
482 hash_result = orig_hash_result;
483 orig_hash_result = temp;
484 time_info[r_list[hash_table[orig_hash_result]].time_pos].position = hash_table[orig_hash_result];
485 if(hash_table[hash_result] != -1) {
486 r_list[hash_table[hash_result]].hash_entry = hash_result;
487 time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
488 }
489
490#ifdef DEBUG
491 if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision handled.\n");
492#endif
493 }
494
495 if(hash_table[hash_result] == -1) {
496#ifdef DEBUG
497 if(debug) printk(KERN_INFO RECENT_NAME ": match(): New table entry. (hr: %d,ha: %u)\n",
498 hash_result, addr);
499#endif
500
501 /* New item found and IPT_RECENT_SET, so we need to add it */
502 location = time_info[curr_table->time_pos].position;
503 hash_table[r_list[location].hash_entry] = -1;
504 hash_table[hash_result] = location;
505 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t));
506 r_list[location].time_pos = curr_table->time_pos;
507 r_list[location].addr = addr;
508 r_list[location].ttl = ttl;
509 r_list[location].last_seen = now;
510 r_list[location].oldest_pkt = 1;
511 r_list[location].last_pkts[0] = now;
512 r_list[location].hash_entry = hash_result;
513 time_info[curr_table->time_pos].time = r_list[location].last_seen;
514 curr_table->time_pos = (curr_table->time_pos + 1) % ip_list_tot;
515
516 ans = !info->invert;
517 } else {
518#ifdef DEBUG
519 if(debug) printk(KERN_INFO RECENT_NAME ": match(): Existing table entry. (hr: %d,ha: %u)\n",
520 hash_result,
521 addr);
522#endif
523
524 /* Existing item found */
525 location = hash_table[hash_result];
526		/* We have a match on the address; now make sure it meets all
527		 * requirements for a full match. */
528 if(info->check_set & IPT_RECENT_CHECK || info->check_set & IPT_RECENT_UPDATE) {
529 if(!info->seconds && !info->hit_count) ans = !info->invert; else ans = info->invert;
530 if(info->seconds && !info->hit_count) {
531 if(time_before_eq(now,r_list[location].last_seen+info->seconds*HZ)) ans = !info->invert; else ans = info->invert;
532 }
533 if(info->seconds && info->hit_count) {
534 for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
535 if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++;
536 }
537 if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
538 }
539 if(info->hit_count && !info->seconds) {
540 for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
541 if(r_list[location].last_pkts[pkt_count] == 0) break;
542 hits_found++;
543 }
544 if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
545 }
546 }
547#ifdef DEBUG
548 if(debug) {
549 if(ans)
550 printk(KERN_INFO RECENT_NAME ": match(): match addr: %u\n",addr);
551 else
552 printk(KERN_INFO RECENT_NAME ": match(): no match addr: %u\n",addr);
553 }
554#endif
555
556 /* If and only if we have been asked to SET, or to UPDATE (on match) do we add the
557 * current timestamp to the last_seen. */
558 if((info->check_set & IPT_RECENT_SET && (ans = !info->invert)) || (info->check_set & IPT_RECENT_UPDATE && ans)) {
559#ifdef DEBUG
560 if(debug) printk(KERN_INFO RECENT_NAME ": match(): SET or UPDATE; updating time info.\n");
561#endif
562 /* Have to update our time info */
563 time_loc = r_list[location].time_pos;
564 time_info[time_loc].time = now;
565 time_info[time_loc].position = location;
566 while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) {
567 time_temp = time_info[time_loc].time;
568 time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time;
569 time_info[(time_loc+1)%ip_list_tot].time = time_temp;
570 time_temp = time_info[time_loc].position;
571 time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position;
572 time_info[(time_loc+1)%ip_list_tot].position = time_temp;
573 r_list[time_info[time_loc].position].time_pos = time_loc;
574 r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot;
575 time_loc = (time_loc+1) % ip_list_tot;
576 }
577 r_list[location].time_pos = time_loc;
578 r_list[location].ttl = ttl;
579 r_list[location].last_pkts[r_list[location].oldest_pkt] = now;
580			r_list[location].oldest_pkt = (r_list[location].oldest_pkt + 1) % ip_pkt_list_tot;
581 r_list[location].last_seen = now;
582 }
583 /* If we have been asked to remove the entry from the list, just set it to 0 */
584 if(info->check_set & IPT_RECENT_REMOVE) {
585#ifdef DEBUG
586 if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; clearing entry (or: %d, hr: %d).\n",orig_hash_result,hash_result);
587#endif
588 /* Check if this is part of a collision chain */
589 while(hash_table[(orig_hash_result+1) % ip_list_hash_size] != -1) {
590 orig_hash_result++;
591 if(hash_func(r_list[hash_table[orig_hash_result]].addr,ip_list_hash_size) == hash_result) {
592 /* Found collision chain, how deep does this rabbit hole go? */
593#ifdef DEBUG
594 if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; found collision chain.\n");
595#endif
596 end_collision_chain = orig_hash_result;
597 }
598 }
599 if(end_collision_chain != -1) {
600#ifdef DEBUG
601 if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; part of collision chain, moving to end.\n");
602#endif
603 /* Part of a collision chain, swap it with the end of the chain
604 * before removing. */
605 r_list[hash_table[end_collision_chain]].hash_entry = hash_result;
606 temp = hash_table[end_collision_chain];
607 hash_table[end_collision_chain] = hash_table[hash_result];
608 hash_table[hash_result] = temp;
609 time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
610 hash_result = end_collision_chain;
611 r_list[hash_table[hash_result]].hash_entry = hash_result;
612 time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
613 }
614 location = hash_table[hash_result];
615 hash_table[r_list[location].hash_entry] = -1;
616 time_loc = r_list[location].time_pos;
617 time_info[time_loc].time = 0;
618 time_info[time_loc].position = location;
619 while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) {
620 time_temp = time_info[time_loc].time;
621 time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time;
622 time_info[(time_loc+1)%ip_list_tot].time = time_temp;
623 time_temp = time_info[time_loc].position;
624 time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position;
625 time_info[(time_loc+1)%ip_list_tot].position = time_temp;
626 r_list[time_info[time_loc].position].time_pos = time_loc;
627 r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot;
628 time_loc = (time_loc+1) % ip_list_tot;
629 }
630 r_list[location].time_pos = time_loc;
631 r_list[location].last_seen = 0;
632 r_list[location].addr = 0;
633 r_list[location].ttl = 0;
634 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t));
635 r_list[location].oldest_pkt = 0;
636 ans = !info->invert;
637 }
638 spin_unlock_bh(&curr_table->list_lock);
639 return ans;
640 }
641
642 spin_unlock_bh(&curr_table->list_lock);
643#ifdef DEBUG
644 if(debug) printk(KERN_INFO RECENT_NAME ": match() left.\n");
645#endif
646 return ans;
647}
648
649/* This function is to verify that the rule given during the userspace iptables
650 * command is correct.
651 * If the command is valid then we check if the table name referred to by the
652 * rule exists, if not it is created.
653 */
654static int
655checkentry(const char *tablename,
656 const struct ipt_ip *ip,
657 void *matchinfo,
658 unsigned int matchsize,
659 unsigned int hook_mask)
660{
661 int flag = 0, c;
662 unsigned long *hold;
663 const struct ipt_recent_info *info = matchinfo;
664 struct recent_ip_tables *curr_table, *find_table, *last_table;
665
666#ifdef DEBUG
667 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() entered.\n");
668#endif
669
670 if (matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return 0;
671
672 /* seconds and hit_count only valid for CHECK/UPDATE */
673 if(info->check_set & IPT_RECENT_SET) { flag++; if(info->seconds || info->hit_count) return 0; }
674 if(info->check_set & IPT_RECENT_REMOVE) { flag++; if(info->seconds || info->hit_count) return 0; }
675 if(info->check_set & IPT_RECENT_CHECK) flag++;
676 if(info->check_set & IPT_RECENT_UPDATE) flag++;
677
678 /* One and only one of these should ever be set */
679 if(flag != 1) return 0;
680
681 /* Name must be set to something */
682 if(!info->name || !info->name[0]) return 0;
683
684 /* Things look good, create a list for this if it does not exist */
685 /* Lock the linked list while we play with it */
686 spin_lock_bh(&recent_lock);
687
688 /* Look for an entry with this name already created */
689 /* Finds the end of the list and the entry before the end if current name does not exist */
690 find_table = r_tables;
691 while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) );
692
693 /* If a table already exists just increment the count on that table and return */
694 if(find_table) {
695#ifdef DEBUG
696 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), incrementing count.\n",info->name);
697#endif
698 find_table->count++;
699 spin_unlock_bh(&recent_lock);
700 return 1;
701 }
702
703 spin_unlock_bh(&recent_lock);
704
705 /* Table with this name not found */
706 /* Allocate memory for new linked list item */
707
708#ifdef DEBUG
709 if(debug) {
710 printk(KERN_INFO RECENT_NAME ": checkentry: no table found (%s)\n",info->name);
711		printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for linked-list entry.\n",sizeof(struct recent_ip_tables));
712 }
713#endif
714
715 curr_table = vmalloc(sizeof(struct recent_ip_tables));
716 if(curr_table == NULL) return 0;
717
718 spin_lock_init(&curr_table->list_lock);
719 curr_table->next = NULL;
720 curr_table->count = 1;
721 curr_table->time_pos = 0;
722 strncpy(curr_table->name,info->name,IPT_RECENT_NAME_LEN);
723 curr_table->name[IPT_RECENT_NAME_LEN-1] = '\0';
724
725 /* Allocate memory for this table and the list of packets in each entry. */
726#ifdef DEBUG
727 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for table (%s).\n",
728 sizeof(struct recent_ip_list)*ip_list_tot,
729 info->name);
730#endif
731
732 curr_table->table = vmalloc(sizeof(struct recent_ip_list)*ip_list_tot);
733 if(curr_table->table == NULL) { vfree(curr_table); return 0; }
734 memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot);
735#ifdef DEBUG
736 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n",
737 sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot);
738#endif
739
740 hold = vmalloc(sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot);
741#ifdef DEBUG
742 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n");
743#endif
744 if(hold == NULL) {
745 printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for pkt_list.\n");
746 vfree(curr_table->table);
747 vfree(curr_table);
748 return 0;
749 }
750 for(c = 0; c < ip_list_tot; c++) {
751 curr_table->table[c].last_pkts = hold + c*ip_pkt_list_tot;
752 }
753
754 /* Allocate memory for the hash table */
755#ifdef DEBUG
756 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for hash_table.\n",
757 sizeof(int)*ip_list_hash_size);
758#endif
759
760 curr_table->hash_table = vmalloc(sizeof(int)*ip_list_hash_size);
761 if(!curr_table->hash_table) {
762 printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for hash_table.\n");
763 vfree(hold);
764 vfree(curr_table->table);
765 vfree(curr_table);
766 return 0;
767 }
768
769 for(c = 0; c < ip_list_hash_size; c++) {
770 curr_table->hash_table[c] = -1;
771 }
772
773 /* Allocate memory for the time info */
774#ifdef DEBUG
775 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for time_info.\n",
776 sizeof(struct time_info_list)*ip_list_tot);
777#endif
778
779 curr_table->time_info = vmalloc(sizeof(struct time_info_list)*ip_list_tot);
780 if(!curr_table->time_info) {
781 printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for time_info.\n");
782 vfree(curr_table->hash_table);
783 vfree(hold);
784 vfree(curr_table->table);
785 vfree(curr_table);
786 return 0;
787 }
788 for(c = 0; c < ip_list_tot; c++) {
789 curr_table->time_info[c].position = c;
790 curr_table->time_info[c].time = 0;
791 }
792
793 /* Put the new table in place */
794 spin_lock_bh(&recent_lock);
795 find_table = r_tables;
796 while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) );
797
798 /* If a table already exists just increment the count on that table and return */
799 if(find_table) {
800 find_table->count++;
801 spin_unlock_bh(&recent_lock);
802#ifdef DEBUG
803 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), created by other process.\n",info->name);
804#endif
805 vfree(curr_table->time_info);
806 vfree(curr_table->hash_table);
807 vfree(hold);
808 vfree(curr_table->table);
809 vfree(curr_table);
810 return 1;
811 }
812 if(!last_table) r_tables = curr_table; else last_table->next = curr_table;
813
814 spin_unlock_bh(&recent_lock);
815
816#ifdef CONFIG_PROC_FS
817 /* Create our proc 'status' entry. */
818 curr_table->status_proc = create_proc_entry(curr_table->name, ip_list_perms, proc_net_ipt_recent);
819 if (!curr_table->status_proc) {
820 printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for /proc entry.\n");
821 /* Destroy the created table */
822 spin_lock_bh(&recent_lock);
823 last_table = NULL;
824 curr_table = r_tables;
825 if(!curr_table) {
826#ifdef DEBUG
827 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, no tables.\n");
828#endif
829 spin_unlock_bh(&recent_lock);
830 return 0;
831 }
832 while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) );
833 if(!curr_table) {
834#ifdef DEBUG
835 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, table already destroyed.\n");
836#endif
837 spin_unlock_bh(&recent_lock);
838 return 0;
839 }
840 if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next;
841 spin_unlock_bh(&recent_lock);
842 vfree(curr_table->time_info);
843 vfree(curr_table->hash_table);
844 vfree(hold);
845 vfree(curr_table->table);
846 vfree(curr_table);
847 return 0;
848 }
849
850 curr_table->status_proc->owner = THIS_MODULE;
851 curr_table->status_proc->data = curr_table;
852 wmb();
853 curr_table->status_proc->read_proc = ip_recent_get_info;
854 curr_table->status_proc->write_proc = ip_recent_ctrl;
855#endif /* CONFIG_PROC_FS */
856
857#ifdef DEBUG
858 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() left.\n");
859#endif
860
861 return 1;
862}
863
864/* This function is called in the event that a rule matching this module is
865 * removed.
866 * When this happens we need to check if there are no other rules matching
867 * the table given. If that is the case then we remove the table and clean
868 * up its memory.
869 */
870static void
871destroy(void *matchinfo, unsigned int matchsize)
872{
873 const struct ipt_recent_info *info = matchinfo;
874 struct recent_ip_tables *curr_table, *last_table;
875
876#ifdef DEBUG
877 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() entered.\n");
878#endif
879
880 if(matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return;
881
882 /* Lock the linked list while we play with it */
883 spin_lock_bh(&recent_lock);
884
885 /* Look for an entry with this name already created */
886 /* Finds the end of the list and the entry before the end if current name does not exist */
887 last_table = NULL;
888 curr_table = r_tables;
889 if(!curr_table) {
890#ifdef DEBUG
891 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() No tables found, leaving.\n");
892#endif
893 spin_unlock_bh(&recent_lock);
894 return;
895 }
896 while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) );
897
898 /* If a table does not exist then do nothing and return */
899 if(!curr_table) {
900#ifdef DEBUG
901 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table not found, leaving.\n");
902#endif
903 spin_unlock_bh(&recent_lock);
904 return;
905 }
906
907 curr_table->count--;
908
909	/* If count is still non-zero then there are still rules referencing it so we do nothing */
910 if(curr_table->count) {
911#ifdef DEBUG
912 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, non-zero count, leaving.\n");
913#endif
914 spin_unlock_bh(&recent_lock);
915 return;
916 }
917
918#ifdef DEBUG
919 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, zero count, removing.\n");
920#endif
921
922 /* Count must be zero so we remove this table from the list */
923 if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next;
924
925 spin_unlock_bh(&recent_lock);
926
927	/* Take and release the lock so any late-runners still using this table after
928	 * we removed it from the list have finished, then free everything. */
929 spin_lock_bh(&curr_table->list_lock);
930 spin_unlock_bh(&curr_table->list_lock);
931
932#ifdef CONFIG_PROC_FS
933 if(curr_table->status_proc) remove_proc_entry(curr_table->name,proc_net_ipt_recent);
934#endif /* CONFIG_PROC_FS */
935 vfree(curr_table->table[0].last_pkts);
936 vfree(curr_table->table);
937 vfree(curr_table->hash_table);
938 vfree(curr_table->time_info);
939 vfree(curr_table);
940
941#ifdef DEBUG
942 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() left.\n");
943#endif
944
945 return;
946}
947
948/* This is the structure we pass to ipt_register to register our
949 * module with iptables.
950 */
951static struct ipt_match recent_match = {
952 .name = "recent",
953 .match = &match,
954 .checkentry = &checkentry,
955 .destroy = &destroy,
956 .me = THIS_MODULE
957};
958
959/* Kernel module initialization. */
960static int __init init(void)
961{
962 int err, count;
963
964 printk(version);
965#ifdef CONFIG_PROC_FS
966 proc_net_ipt_recent = proc_mkdir("ipt_recent",proc_net);
967 if(!proc_net_ipt_recent) return -ENOMEM;
968#endif
969
970 if(ip_list_hash_size && ip_list_hash_size <= ip_list_tot) {
971 printk(KERN_WARNING RECENT_NAME ": ip_list_hash_size too small, resetting to default.\n");
972 ip_list_hash_size = 0;
973 }
974
975 if(!ip_list_hash_size) {
976 ip_list_hash_size = ip_list_tot*3;
977 count = 2*2;
978 while(ip_list_hash_size > count) count = count*2;
979 ip_list_hash_size = count;
980 }
981
982#ifdef DEBUG
983 if(debug) printk(KERN_INFO RECENT_NAME ": ip_list_hash_size: %d\n",ip_list_hash_size);
984#endif
985
986 err = ipt_register_match(&recent_match);
987 if (err)
988 remove_proc_entry("ipt_recent", proc_net);
989 return err;
990}
991
992/* Kernel module destruction. */
993static void __exit fini(void)
994{
995 ipt_unregister_match(&recent_match);
996
997 remove_proc_entry("ipt_recent",proc_net);
998}
999
1000/* Register our module with the kernel. */
1001module_init(init);
1002module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_sctp.c b/net/ipv4/netfilter/ipt_sctp.c
new file mode 100644
index 000000000000..fe2b327bcaa4
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_sctp.c
@@ -0,0 +1,203 @@
1#include <linux/module.h>
2#include <linux/skbuff.h>
3#include <net/ip.h>
4#include <linux/sctp.h>
5
6#include <linux/netfilter_ipv4/ip_tables.h>
7#include <linux/netfilter_ipv4/ipt_sctp.h>
8
9#ifdef DEBUG_SCTP
10#define duprintf(format, args...) printk(format , ## args)
11#else
12#define duprintf(format, args...)
13#endif
14
15#define SCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
16 || (!!((invflag) & (option)) ^ (cond)))
17
18static int
19match_flags(const struct ipt_sctp_flag_info *flag_info,
20 const int flag_count,
21 u_int8_t chunktype,
22 u_int8_t chunkflags)
23{
24 int i;
25
26 for (i = 0; i < flag_count; i++) {
27 if (flag_info[i].chunktype == chunktype) {
28 return (chunkflags & flag_info[i].flag_mask) == flag_info[i].flag;
29 }
30 }
31
32 return 1;
33}
34
35static int
36match_packet(const struct sk_buff *skb,
37 const u_int32_t *chunkmap,
38 int chunk_match_type,
39 const struct ipt_sctp_flag_info *flag_info,
40 const int flag_count,
41 int *hotdrop)
42{
43 int offset;
44 u_int32_t chunkmapcopy[256 / sizeof (u_int32_t)];
45 sctp_chunkhdr_t _sch, *sch;
46
47#ifdef DEBUG_SCTP
48 int i = 0;
49#endif
50
51 if (chunk_match_type == SCTP_CHUNK_MATCH_ALL) {
52 SCTP_CHUNKMAP_COPY(chunkmapcopy, chunkmap);
53 }
54
55 offset = skb->nh.iph->ihl * 4 + sizeof (sctp_sctphdr_t);
56 do {
57 sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch);
58 if (sch == NULL) {
59 duprintf("Dropping invalid SCTP packet.\n");
60 *hotdrop = 1;
61 return 0;
62 }
63
64 duprintf("Chunk num: %d\toffset: %d\ttype: %d\tlength: %d\tflags: %x\n",
65 ++i, offset, sch->type, htons(sch->length), sch->flags);
66
67 offset += (htons(sch->length) + 3) & ~3;
68
69 duprintf("skb->len: %d\toffset: %d\n", skb->len, offset);
70
71 if (SCTP_CHUNKMAP_IS_SET(chunkmap, sch->type)) {
72 switch (chunk_match_type) {
73 case SCTP_CHUNK_MATCH_ANY:
74 if (match_flags(flag_info, flag_count,
75 sch->type, sch->flags)) {
76 return 1;
77 }
78 break;
79
80 case SCTP_CHUNK_MATCH_ALL:
81 if (match_flags(flag_info, flag_count,
82 sch->type, sch->flags)) {
83 SCTP_CHUNKMAP_CLEAR(chunkmapcopy, sch->type);
84 }
85 break;
86
87 case SCTP_CHUNK_MATCH_ONLY:
88 if (!match_flags(flag_info, flag_count,
89 sch->type, sch->flags)) {
90 return 0;
91 }
92 break;
93 }
94 } else {
95 switch (chunk_match_type) {
96 case SCTP_CHUNK_MATCH_ONLY:
97 return 0;
98 }
99 }
100 } while (offset < skb->len);
101
102 switch (chunk_match_type) {
103 case SCTP_CHUNK_MATCH_ALL:
104		return SCTP_CHUNKMAP_IS_CLEAR(chunkmapcopy);
105 case SCTP_CHUNK_MATCH_ANY:
106 return 0;
107 case SCTP_CHUNK_MATCH_ONLY:
108 return 1;
109 }
110
111 /* This will never be reached, but required to stop compiler whine */
112 return 0;
113}
114
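/* Illustrative usage, not part of the original file: the chunk match types
 * above correspond to the all/any/only keywords in userspace, e.g.
 *   iptables -A INPUT -p sctp -m sctp --dport 2905 -j ACCEPT
 *   iptables -A INPUT -p sctp -m sctp --chunk-types any INIT,ABORT -j DROP
 */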
115static int
116match(const struct sk_buff *skb,
117 const struct net_device *in,
118 const struct net_device *out,
119 const void *matchinfo,
120 int offset,
121 int *hotdrop)
122{
123 const struct ipt_sctp_info *info;
124 sctp_sctphdr_t _sh, *sh;
125
126 info = (const struct ipt_sctp_info *)matchinfo;
127
128 if (offset) {
129 duprintf("Dropping non-first fragment.. FIXME\n");
130 return 0;
131 }
132
133 sh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_sh), &_sh);
134 if (sh == NULL) {
135		duprintf("Dropping evil SCTP offset=0 tinygram.\n");
136 *hotdrop = 1;
137 return 0;
138 }
139 duprintf("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest));
140
141 return SCCHECK(((ntohs(sh->source) >= info->spts[0])
142 && (ntohs(sh->source) <= info->spts[1])),
143 IPT_SCTP_SRC_PORTS, info->flags, info->invflags)
144 && SCCHECK(((ntohs(sh->dest) >= info->dpts[0])
145 && (ntohs(sh->dest) <= info->dpts[1])),
146 IPT_SCTP_DEST_PORTS, info->flags, info->invflags)
147 && SCCHECK(match_packet(skb, info->chunkmap, info->chunk_match_type,
148 info->flag_info, info->flag_count,
149 hotdrop),
150 IPT_SCTP_CHUNK_TYPES, info->flags, info->invflags);
151}
152
153static int
154checkentry(const char *tablename,
155 const struct ipt_ip *ip,
156 void *matchinfo,
157 unsigned int matchsize,
158 unsigned int hook_mask)
159{
160 const struct ipt_sctp_info *info;
161
162 info = (const struct ipt_sctp_info *)matchinfo;
163
164 return ip->proto == IPPROTO_SCTP
165 && !(ip->invflags & IPT_INV_PROTO)
166 && matchsize == IPT_ALIGN(sizeof(struct ipt_sctp_info))
167 && !(info->flags & ~IPT_SCTP_VALID_FLAGS)
168 && !(info->invflags & ~IPT_SCTP_VALID_FLAGS)
169 && !(info->invflags & ~info->flags)
170 && ((!(info->flags & IPT_SCTP_CHUNK_TYPES)) ||
171 (info->chunk_match_type &
172 (SCTP_CHUNK_MATCH_ALL
173 | SCTP_CHUNK_MATCH_ANY
174 | SCTP_CHUNK_MATCH_ONLY)));
175}
176
177static struct ipt_match sctp_match =
178{
179 .list = { NULL, NULL},
180 .name = "sctp",
181 .match = &match,
182 .checkentry = &checkentry,
183 .destroy = NULL,
184 .me = THIS_MODULE
185};
186
187static int __init init(void)
188{
189 return ipt_register_match(&sctp_match);
190}
191
192static void __exit fini(void)
193{
194 ipt_unregister_match(&sctp_match);
195}
196
197module_init(init);
198module_exit(fini);
199
200MODULE_LICENSE("GPL");
201MODULE_AUTHOR("Kiran Kumar Immidi");
202MODULE_DESCRIPTION("Match for SCTP protocol packets");
203
diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c
new file mode 100644
index 000000000000..b1511b97ea5f
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_state.c
@@ -0,0 +1,74 @@
1/* Kernel module to match connection tracking information. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/netfilter_ipv4/ip_conntrack.h>
14#include <linux/netfilter_ipv4/ip_tables.h>
15#include <linux/netfilter_ipv4/ipt_state.h>
16
17MODULE_LICENSE("GPL");
18MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
19MODULE_DESCRIPTION("iptables connection tracking state match module");
20
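/* Illustrative usage, not part of the original file:
 *   iptables -A INPUT -m state --state ESTABLISHED,RELATED -j ACCEPT
 *   iptables -A INPUT -m state --state NEW -p tcp --dport 22 -j ACCEPT
 * Userspace translates the state names into the IPT_STATE_* bits tested below.
 */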
21static int
22match(const struct sk_buff *skb,
23 const struct net_device *in,
24 const struct net_device *out,
25 const void *matchinfo,
26 int offset,
27 int *hotdrop)
28{
29 const struct ipt_state_info *sinfo = matchinfo;
30 enum ip_conntrack_info ctinfo;
31 unsigned int statebit;
32
33 if (skb->nfct == &ip_conntrack_untracked.ct_general)
34 statebit = IPT_STATE_UNTRACKED;
35 else if (!ip_conntrack_get(skb, &ctinfo))
36 statebit = IPT_STATE_INVALID;
37 else
38 statebit = IPT_STATE_BIT(ctinfo);
39
40 return (sinfo->statemask & statebit);
41}
42
43static int check(const char *tablename,
44 const struct ipt_ip *ip,
45 void *matchinfo,
46 unsigned int matchsize,
47 unsigned int hook_mask)
48{
49 if (matchsize != IPT_ALIGN(sizeof(struct ipt_state_info)))
50 return 0;
51
52 return 1;
53}
54
55static struct ipt_match state_match = {
56 .name = "state",
57 .match = &match,
58 .checkentry = &check,
59 .me = THIS_MODULE,
60};
61
62static int __init init(void)
63{
64 need_ip_conntrack();
65 return ipt_register_match(&state_match);
66}
67
68static void __exit fini(void)
69{
70 ipt_unregister_match(&state_match);
71}
72
73module_init(init);
74module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_tcpmss.c b/net/ipv4/netfilter/ipt_tcpmss.c
new file mode 100644
index 000000000000..4dc9b16ab4a3
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_tcpmss.c
@@ -0,0 +1,127 @@
1/* Kernel module to match TCP MSS values. */
2
3/* Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/module.h>
11#include <linux/skbuff.h>
12#include <net/tcp.h>
13
14#include <linux/netfilter_ipv4/ipt_tcpmss.h>
15#include <linux/netfilter_ipv4/ip_tables.h>
16
17#define TH_SYN 0x02
18
19MODULE_LICENSE("GPL");
20MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
21MODULE_DESCRIPTION("iptables TCP MSS match module");
22
23/* Returns 1 if the mss option is set and matched by the range, 0 otherwise */
24static inline int
25mssoption_match(u_int16_t min, u_int16_t max,
26 const struct sk_buff *skb,
27 int invert,
28 int *hotdrop)
29{
30 struct tcphdr _tcph, *th;
31	/* tcp.doff is only 4 bits, i.e. max 15 * 4 bytes */
32 u8 _opt[15 * 4 - sizeof(_tcph)], *op;
33 unsigned int i, optlen;
34
35 /* If we don't have the whole header, drop packet. */
36 th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
37 sizeof(_tcph), &_tcph);
38 if (th == NULL)
39 goto dropit;
40
41 /* Malformed. */
42 if (th->doff*4 < sizeof(*th))
43 goto dropit;
44
45 optlen = th->doff*4 - sizeof(*th);
46 if (!optlen)
47 goto out;
48
49 /* Truncated options. */
50 op = skb_header_pointer(skb, skb->nh.iph->ihl * 4 + sizeof(*th),
51 optlen, _opt);
52 if (op == NULL)
53 goto dropit;
54
55 for (i = 0; i < optlen; ) {
56 if (op[i] == TCPOPT_MSS
57 && (optlen - i) >= TCPOLEN_MSS
58 && op[i+1] == TCPOLEN_MSS) {
59 u_int16_t mssval;
60
61 mssval = (op[i+2] << 8) | op[i+3];
62
63 return (mssval >= min && mssval <= max) ^ invert;
64 }
65 if (op[i] < 2) i++;
66 else i += op[i+1]?:1;
67 }
68out:
69 return invert;
70
71 dropit:
72 *hotdrop = 1;
73 return 0;
74}
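/*
 * Editor's note, not part of the original patch: a self-contained sketch of
 * the TCP option walk performed by mssoption_match() above, operating on a
 * plain byte buffer instead of an skb.  Returns the MSS value, or -1 if no
 * MSS option is present.  Option kinds 0 (end of list) and 1 (NOP) are one
 * byte; every other option carries a length byte, and MSS (kind 2) is four
 * bytes total with a big-endian 16-bit value.
 */
#include <stdio.h>

static int tcp_opt_mss(const unsigned char *opt, unsigned int optlen)
{
	unsigned int i = 0;

	while (i < optlen) {
		if (opt[i] == 0)			/* end of option list */
			break;
		if (opt[i] == 1) {			/* NOP: single byte */
			i++;
			continue;
		}
		if (optlen - i < 2 || opt[i + 1] < 2)	/* malformed length */
			break;
		if (opt[i] == 2 && opt[i + 1] == 4 && optlen - i >= 4)
			return (opt[i + 2] << 8) | opt[i + 3];
		i += opt[i + 1];
	}
	return -1;
}

int main(void)
{
	/* NOP, NOP, MSS = 1460 */
	const unsigned char opts[] = { 1, 1, 2, 4, 0x05, 0xb4 };

	printf("mss = %d\n", tcp_opt_mss(opts, sizeof(opts)));
	return 0;
}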
75
76static int
77match(const struct sk_buff *skb,
78 const struct net_device *in,
79 const struct net_device *out,
80 const void *matchinfo,
81 int offset,
82 int *hotdrop)
83{
84 const struct ipt_tcpmss_match_info *info = matchinfo;
85
86 return mssoption_match(info->mss_min, info->mss_max, skb,
87 info->invert, hotdrop);
88}
89
90static int
91checkentry(const char *tablename,
92 const struct ipt_ip *ip,
93 void *matchinfo,
94 unsigned int matchsize,
95 unsigned int hook_mask)
96{
97 if (matchsize != IPT_ALIGN(sizeof(struct ipt_tcpmss_match_info)))
98 return 0;
99
100 /* Must specify -p tcp */
101 if (ip->proto != IPPROTO_TCP || (ip->invflags & IPT_INV_PROTO)) {
102 printk("tcpmss: Only works on TCP packets\n");
103 return 0;
104 }
105
106 return 1;
107}
108
109static struct ipt_match tcpmss_match = {
110 .name = "tcpmss",
111 .match = &match,
112 .checkentry = &checkentry,
113 .me = THIS_MODULE,
114};
115
116static int __init init(void)
117{
118 return ipt_register_match(&tcpmss_match);
119}
120
121static void __exit fini(void)
122{
123 ipt_unregister_match(&tcpmss_match);
124}
125
126module_init(init);
127module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c
new file mode 100644
index 000000000000..086a1bb61e3e
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_tos.c
@@ -0,0 +1,64 @@
1/* Kernel module to match TOS values. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13
14#include <linux/netfilter_ipv4/ipt_tos.h>
15#include <linux/netfilter_ipv4/ip_tables.h>
16
17MODULE_LICENSE("GPL");
18MODULE_DESCRIPTION("iptables TOS match module");
19
20static int
21match(const struct sk_buff *skb,
22 const struct net_device *in,
23 const struct net_device *out,
24 const void *matchinfo,
25 int offset,
26 int *hotdrop)
27{
28 const struct ipt_tos_info *info = matchinfo;
29
30 return (skb->nh.iph->tos == info->tos) ^ info->invert;
31}
32
33static int
34checkentry(const char *tablename,
35 const struct ipt_ip *ip,
36 void *matchinfo,
37 unsigned int matchsize,
38 unsigned int hook_mask)
39{
40 if (matchsize != IPT_ALIGN(sizeof(struct ipt_tos_info)))
41 return 0;
42
43 return 1;
44}
45
46static struct ipt_match tos_match = {
47 .name = "tos",
48 .match = &match,
49 .checkentry = &checkentry,
50 .me = THIS_MODULE,
51};
52
53static int __init init(void)
54{
55 return ipt_register_match(&tos_match);
56}
57
58static void __exit fini(void)
59{
60 ipt_unregister_match(&tos_match);
61}
62
63module_init(init);
64module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c
new file mode 100644
index 000000000000..219aa9de88cc
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ttl.c
@@ -0,0 +1,79 @@
1/* IP tables module for matching the value of the TTL
2 *
3 * ipt_ttl.c,v 1.5 2000/11/13 11:16:08 laforge Exp
4 *
5 * (C) 2000,2001 by Harald Welte <laforge@netfilter.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/module.h>
13#include <linux/skbuff.h>
14
15#include <linux/netfilter_ipv4/ipt_ttl.h>
16#include <linux/netfilter_ipv4/ip_tables.h>
17
18MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
19MODULE_DESCRIPTION("IP tables TTL matching module");
20MODULE_LICENSE("GPL");
21
22static int match(const struct sk_buff *skb, const struct net_device *in,
23 const struct net_device *out, const void *matchinfo,
24 int offset, int *hotdrop)
25{
26 const struct ipt_ttl_info *info = matchinfo;
27
28 switch (info->mode) {
29 case IPT_TTL_EQ:
30 return (skb->nh.iph->ttl == info->ttl);
31 break;
32 case IPT_TTL_NE:
33 return (!(skb->nh.iph->ttl == info->ttl));
34 break;
35 case IPT_TTL_LT:
36 return (skb->nh.iph->ttl < info->ttl);
37 break;
38 case IPT_TTL_GT:
39 return (skb->nh.iph->ttl > info->ttl);
40 break;
41 default:
42 printk(KERN_WARNING "ipt_ttl: unknown mode %d\n",
43 info->mode);
44 return 0;
45 }
46
47 return 0;
48}
49
50static int checkentry(const char *tablename, const struct ipt_ip *ip,
51 void *matchinfo, unsigned int matchsize,
52 unsigned int hook_mask)
53{
54 if (matchsize != IPT_ALIGN(sizeof(struct ipt_ttl_info)))
55 return 0;
56
57 return 1;
58}
59
60static struct ipt_match ttl_match = {
61 .name = "ttl",
62 .match = &match,
63 .checkentry = &checkentry,
64 .me = THIS_MODULE,
65};
66
67static int __init init(void)
68{
69 return ipt_register_match(&ttl_match);
70}
71
72static void __exit fini(void)
73{
74 ipt_unregister_match(&ttl_match);
75
76}
77
78module_init(init);
79module_exit(fini);
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
new file mode 100644
index 000000000000..260a4f0a2a90
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -0,0 +1,194 @@
1/*
2 * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
3 *
4 * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
5 * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 */
12
13#include <linux/module.h>
14#include <linux/moduleparam.h>
15#include <linux/netfilter_ipv4/ip_tables.h>
16
17MODULE_LICENSE("GPL");
18MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
19MODULE_DESCRIPTION("iptables filter table");
20
21#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT))
22
23static struct
24{
25 struct ipt_replace repl;
26 struct ipt_standard entries[3];
27 struct ipt_error term;
28} initial_table __initdata
29= { { "filter", FILTER_VALID_HOOKS, 4,
30 sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
31 { [NF_IP_LOCAL_IN] = 0,
32 [NF_IP_FORWARD] = sizeof(struct ipt_standard),
33 [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
34 { [NF_IP_LOCAL_IN] = 0,
35 [NF_IP_FORWARD] = sizeof(struct ipt_standard),
36 [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
37 0, NULL, { } },
38 {
39 /* LOCAL_IN */
40 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
41 0,
42 sizeof(struct ipt_entry),
43 sizeof(struct ipt_standard),
44 0, { 0, 0 }, { } },
45 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
46 -NF_ACCEPT - 1 } },
47 /* FORWARD */
48 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
49 0,
50 sizeof(struct ipt_entry),
51 sizeof(struct ipt_standard),
52 0, { 0, 0 }, { } },
53 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
54 -NF_ACCEPT - 1 } },
55 /* LOCAL_OUT */
56 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
57 0,
58 sizeof(struct ipt_entry),
59 sizeof(struct ipt_standard),
60 0, { 0, 0 }, { } },
61 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
62 -NF_ACCEPT - 1 } }
63 },
64 /* ERROR */
65 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
66 0,
67 sizeof(struct ipt_entry),
68 sizeof(struct ipt_error),
69 0, { 0, 0 }, { } },
70 { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } },
71 { } },
72 "ERROR"
73 }
74 }
75};
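/*
 * Editor's note, not part of the original patch: the initializer above lays
 * out one struct ipt_standard policy entry per valid hook followed by a
 * terminating struct ipt_error, and hook_entry[]/underflow[] are byte
 * offsets into that flat blob.  A tiny sketch of the offset arithmetic; the
 * entry size below is a hypothetical stand-in, not the real
 * sizeof(struct ipt_standard).
 */
#include <stdio.h>

int main(void)
{
	const unsigned int entry_size = 152;	/* stand-in entry size */
	const char *hooks[] = { "LOCAL_IN", "FORWARD", "LOCAL_OUT" };
	unsigned int i;

	for (i = 0; i < 3; i++)
		printf("%-10s entry at offset %u\n", hooks[i], i * entry_size);
	return 0;
}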
76
77static struct ipt_table packet_filter = {
78 .name = "filter",
79 .valid_hooks = FILTER_VALID_HOOKS,
80 .lock = RW_LOCK_UNLOCKED,
81 .me = THIS_MODULE
82};
83
84/* The work comes in here from netfilter.c. */
85static unsigned int
86ipt_hook(unsigned int hook,
87 struct sk_buff **pskb,
88 const struct net_device *in,
89 const struct net_device *out,
90 int (*okfn)(struct sk_buff *))
91{
92 return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL);
93}
94
95static unsigned int
96ipt_local_out_hook(unsigned int hook,
97 struct sk_buff **pskb,
98 const struct net_device *in,
99 const struct net_device *out,
100 int (*okfn)(struct sk_buff *))
101{
102 /* root is playing with raw sockets. */
103 if ((*pskb)->len < sizeof(struct iphdr)
104 || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
105 if (net_ratelimit())
106 printk("ipt_hook: happy cracking.\n");
107 return NF_ACCEPT;
108 }
109
110 return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL);
111}
112
113static struct nf_hook_ops ipt_ops[] = {
114 {
115 .hook = ipt_hook,
116 .owner = THIS_MODULE,
117 .pf = PF_INET,
118 .hooknum = NF_IP_LOCAL_IN,
119 .priority = NF_IP_PRI_FILTER,
120 },
121 {
122 .hook = ipt_hook,
123 .owner = THIS_MODULE,
124 .pf = PF_INET,
125 .hooknum = NF_IP_FORWARD,
126 .priority = NF_IP_PRI_FILTER,
127 },
128 {
129 .hook = ipt_local_out_hook,
130 .owner = THIS_MODULE,
131 .pf = PF_INET,
132 .hooknum = NF_IP_LOCAL_OUT,
133 .priority = NF_IP_PRI_FILTER,
134 },
135};
136
137/* Default to forward because I got too much mail already. */
138static int forward = NF_ACCEPT;
139module_param(forward, bool, 0000);
140
141static int __init init(void)
142{
143 int ret;
144
145 if (forward < 0 || forward > NF_MAX_VERDICT) {
146 printk("iptables forward must be 0 or 1\n");
147 return -EINVAL;
148 }
149
150 /* Entry 1 is the FORWARD hook */
151 initial_table.entries[1].target.verdict = -forward - 1;
152
153 /* Register table */
154 ret = ipt_register_table(&packet_filter, &initial_table.repl);
155 if (ret < 0)
156 return ret;
157
158 /* Register hooks */
159 ret = nf_register_hook(&ipt_ops[0]);
160 if (ret < 0)
161 goto cleanup_table;
162
163 ret = nf_register_hook(&ipt_ops[1]);
164 if (ret < 0)
165 goto cleanup_hook0;
166
167 ret = nf_register_hook(&ipt_ops[2]);
168 if (ret < 0)
169 goto cleanup_hook1;
170
171 return ret;
172
173 cleanup_hook1:
174 nf_unregister_hook(&ipt_ops[1]);
175 cleanup_hook0:
176 nf_unregister_hook(&ipt_ops[0]);
177 cleanup_table:
178 ipt_unregister_table(&packet_filter);
179
180 return ret;
181}
182
183static void __exit fini(void)
184{
185 unsigned int i;
186
187 for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++)
188 nf_unregister_hook(&ipt_ops[i]);
189
190 ipt_unregister_table(&packet_filter);
191}
192
193module_init(init);
194module_exit(fini);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
new file mode 100644
index 000000000000..160eb11b6e2f
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -0,0 +1,260 @@
1/*
2 * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
3 *
4 * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
5 * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Extended to all five netfilter hooks by Brad Chapman & Harald Welte
12 */
13#include <linux/config.h>
14#include <linux/module.h>
15#include <linux/netfilter_ipv4/ip_tables.h>
16#include <linux/netdevice.h>
17#include <linux/skbuff.h>
18#include <net/sock.h>
19#include <net/route.h>
20#include <linux/ip.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
24MODULE_DESCRIPTION("iptables mangle table");
25
26#define MANGLE_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | \
27 (1 << NF_IP_LOCAL_IN) | \
28 (1 << NF_IP_FORWARD) | \
29 (1 << NF_IP_LOCAL_OUT) | \
30 (1 << NF_IP_POST_ROUTING))
31
32/* Ouch - five different hooks? Maybe this should be a config option..... -- BC */
33static struct
34{
35 struct ipt_replace repl;
36 struct ipt_standard entries[5];
37 struct ipt_error term;
38} initial_table __initdata
39= { { "mangle", MANGLE_VALID_HOOKS, 6,
40 sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error),
41 { [NF_IP_PRE_ROUTING] = 0,
42 [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard),
43 [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2,
44 [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
45 [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4 },
46 { [NF_IP_PRE_ROUTING] = 0,
47 [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard),
48 [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2,
49 [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
50 [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4 },
51 0, NULL, { } },
52 {
53 /* PRE_ROUTING */
54 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
55 0,
56 sizeof(struct ipt_entry),
57 sizeof(struct ipt_standard),
58 0, { 0, 0 }, { } },
59 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
60 -NF_ACCEPT - 1 } },
61 /* LOCAL_IN */
62 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
63 0,
64 sizeof(struct ipt_entry),
65 sizeof(struct ipt_standard),
66 0, { 0, 0 }, { } },
67 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
68 -NF_ACCEPT - 1 } },
69 /* FORWARD */
70 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
71 0,
72 sizeof(struct ipt_entry),
73 sizeof(struct ipt_standard),
74 0, { 0, 0 }, { } },
75 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
76 -NF_ACCEPT - 1 } },
77 /* LOCAL_OUT */
78 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
79 0,
80 sizeof(struct ipt_entry),
81 sizeof(struct ipt_standard),
82 0, { 0, 0 }, { } },
83 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
84 -NF_ACCEPT - 1 } },
85 /* POST_ROUTING */
86 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
87 0,
88 sizeof(struct ipt_entry),
89 sizeof(struct ipt_standard),
90 0, { 0, 0 }, { } },
91 { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
92 -NF_ACCEPT - 1 } },
93 },
94 /* ERROR */
95 { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
96 0,
97 sizeof(struct ipt_entry),
98 sizeof(struct ipt_error),
99 0, { 0, 0 }, { } },
100 { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } },
101 { } },
102 "ERROR"
103 }
104 }
105};
106
107static struct ipt_table packet_mangler = {
108 .name = "mangle",
109 .valid_hooks = MANGLE_VALID_HOOKS,
110 .lock = RW_LOCK_UNLOCKED,
111 .me = THIS_MODULE,
112};
113
114/* The work comes in here from netfilter.c. */
115static unsigned int
116ipt_route_hook(unsigned int hook,
117 struct sk_buff **pskb,
118 const struct net_device *in,
119 const struct net_device *out,
120 int (*okfn)(struct sk_buff *))
121{
122 return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
123}
124
125static unsigned int
126ipt_local_hook(unsigned int hook,
127 struct sk_buff **pskb,
128 const struct net_device *in,
129 const struct net_device *out,
130 int (*okfn)(struct sk_buff *))
131{
132 unsigned int ret;
133 u_int8_t tos;
134 u_int32_t saddr, daddr;
135 unsigned long nfmark;
136
137 /* root is playing with raw sockets. */
138 if ((*pskb)->len < sizeof(struct iphdr)
139 || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
140 if (net_ratelimit())
141 printk("ipt_hook: happy cracking.\n");
142 return NF_ACCEPT;
143 }
144
145 /* Save things which could affect route */
146 nfmark = (*pskb)->nfmark;
147 saddr = (*pskb)->nh.iph->saddr;
148 daddr = (*pskb)->nh.iph->daddr;
149 tos = (*pskb)->nh.iph->tos;
150
151 ret = ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
152 /* Reroute for ANY change. */
153 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE
154 && ((*pskb)->nh.iph->saddr != saddr
155 || (*pskb)->nh.iph->daddr != daddr
156#ifdef CONFIG_IP_ROUTE_FWMARK
157 || (*pskb)->nfmark != nfmark
158#endif
159 || (*pskb)->nh.iph->tos != tos))
160 return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP;
161
162 return ret;
163}
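/*
 * Editor's note, not part of the original patch: ipt_local_hook() above
 * snapshots the fields that can influence routing (source/destination
 * address, TOS and, when fwmark routing is configured, the nfmark) and
 * re-routes the packet if the mangle table changed any of them.  The same
 * decision, isolated as a small pure function over hypothetical before/after
 * snapshots:
 */
#include <stdio.h>

struct route_keys {
	unsigned int saddr, daddr;
	unsigned char tos;
	unsigned long nfmark;
};

static int needs_reroute(const struct route_keys *before,
			 const struct route_keys *after,
			 int fwmark_routing)
{
	return before->saddr != after->saddr
	    || before->daddr != after->daddr
	    || before->tos != after->tos
	    || (fwmark_routing && before->nfmark != after->nfmark);
}

int main(void)
{
	struct route_keys a = { 0x0a000001, 0x0a000002, 0x10, 0 };
	struct route_keys b = a;

	b.tos = 0x00;	/* the mangle table rewrote the TOS field */
	printf("reroute: %d\n", needs_reroute(&a, &b, 0));	/* 1 */
	return 0;
}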
164
165static struct nf_hook_ops ipt_ops[] = {
166 {
167 .hook = ipt_route_hook,
168 .owner = THIS_MODULE,
169 .pf = PF_INET,
170 .hooknum = NF_IP_PRE_ROUTING,
171 .priority = NF_IP_PRI_MANGLE,
172 },
173 {
174 .hook = ipt_route_hook,
175 .owner = THIS_MODULE,
176 .pf = PF_INET,
177 .hooknum = NF_IP_LOCAL_IN,
178 .priority = NF_IP_PRI_MANGLE,
179 },
180 {
181 .hook = ipt_route_hook,
182 .owner = THIS_MODULE,
183 .pf = PF_INET,
184 .hooknum = NF_IP_FORWARD,
185 .priority = NF_IP_PRI_MANGLE,
186 },
187 {
188 .hook = ipt_local_hook,
189 .owner = THIS_MODULE,
190 .pf = PF_INET,
191 .hooknum = NF_IP_LOCAL_OUT,
192 .priority = NF_IP_PRI_MANGLE,
193 },
194 {
195 .hook = ipt_route_hook,
196 .owner = THIS_MODULE,
197 .pf = PF_INET,
198 .hooknum = NF_IP_POST_ROUTING,
199 .priority = NF_IP_PRI_MANGLE,
200 },
201};
202
203static int __init init(void)
204{
205 int ret;
206
207 /* Register table */
208 ret = ipt_register_table(&packet_mangler, &initial_table.repl);
209 if (ret < 0)
210 return ret;
211
212 /* Register hooks */
213 ret = nf_register_hook(&ipt_ops[0]);
214 if (ret < 0)
215 goto cleanup_table;
216
217 ret = nf_register_hook(&ipt_ops[1]);
218 if (ret < 0)
219 goto cleanup_hook0;
220
221 ret = nf_register_hook(&ipt_ops[2]);
222 if (ret < 0)
223 goto cleanup_hook1;
224
225 ret = nf_register_hook(&ipt_ops[3]);
226 if (ret < 0)
227 goto cleanup_hook2;
228
229 ret = nf_register_hook(&ipt_ops[4]);
230 if (ret < 0)
231 goto cleanup_hook3;
232
233 return ret;
234
235 cleanup_hook3:
236 nf_unregister_hook(&ipt_ops[3]);
237 cleanup_hook2:
238 nf_unregister_hook(&ipt_ops[2]);
239 cleanup_hook1:
240 nf_unregister_hook(&ipt_ops[1]);
241 cleanup_hook0:
242 nf_unregister_hook(&ipt_ops[0]);
243 cleanup_table:
244 ipt_unregister_table(&packet_mangler);
245
246 return ret;
247}
248
249static void __exit fini(void)
250{
251 unsigned int i;
252
253 for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++)
254 nf_unregister_hook(&ipt_ops[i]);
255
256 ipt_unregister_table(&packet_mangler);
257}
258
259module_init(init);
260module_exit(fini);
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
new file mode 100644
index 000000000000..01b4a3c814d3
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -0,0 +1,156 @@
1/*
2 * 'raw' table, which is the very first table hooked in at PRE_ROUTING and LOCAL_OUT.
3 *
4 * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
5 */
6#include <linux/module.h>
7#include <linux/netfilter_ipv4/ip_tables.h>
8
9#define RAW_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))
10
11static struct
12{
13 struct ipt_replace repl;
14 struct ipt_standard entries[2];
15 struct ipt_error term;
16} initial_table __initdata = {
17 .repl = {
18 .name = "raw",
19 .valid_hooks = RAW_VALID_HOOKS,
20 .num_entries = 3,
21 .size = sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error),
22 .hook_entry = {
23 [NF_IP_PRE_ROUTING] = 0,
24 [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) },
25 .underflow = {
26 [NF_IP_PRE_ROUTING] = 0,
27 [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) },
28 },
29 .entries = {
30 /* PRE_ROUTING */
31 {
32 .entry = {
33 .target_offset = sizeof(struct ipt_entry),
34 .next_offset = sizeof(struct ipt_standard),
35 },
36 .target = {
37 .target = {
38 .u = {
39 .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)),
40 },
41 },
42 .verdict = -NF_ACCEPT - 1,
43 },
44 },
45
46 /* LOCAL_OUT */
47 {
48 .entry = {
49 .target_offset = sizeof(struct ipt_entry),
50 .next_offset = sizeof(struct ipt_standard),
51 },
52 .target = {
53 .target = {
54 .u = {
55 .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)),
56 },
57 },
58 .verdict = -NF_ACCEPT - 1,
59 },
60 },
61 },
62 /* ERROR */
63 .term = {
64 .entry = {
65 .target_offset = sizeof(struct ipt_entry),
66 .next_offset = sizeof(struct ipt_error),
67 },
68 .target = {
69 .target = {
70 .u = {
71 .user = {
72 .target_size = IPT_ALIGN(sizeof(struct ipt_error_target)),
73 .name = IPT_ERROR_TARGET,
74 },
75 },
76 },
77 .errorname = "ERROR",
78 },
79 }
80};
81
82static struct ipt_table packet_raw = {
83 .name = "raw",
84 .valid_hooks = RAW_VALID_HOOKS,
85 .lock = RW_LOCK_UNLOCKED,
86 .me = THIS_MODULE
87};
88
89/* The work comes in here from netfilter.c. */
90static unsigned int
91ipt_hook(unsigned int hook,
92 struct sk_buff **pskb,
93 const struct net_device *in,
94 const struct net_device *out,
95 int (*okfn)(struct sk_buff *))
96{
97 return ipt_do_table(pskb, hook, in, out, &packet_raw, NULL);
98}
99
100/* 'raw' is the very first table. */
101static struct nf_hook_ops ipt_ops[] = {
102 {
103 .hook = ipt_hook,
104 .pf = PF_INET,
105 .hooknum = NF_IP_PRE_ROUTING,
106 .priority = NF_IP_PRI_RAW
107 },
108 {
109 .hook = ipt_hook,
110 .pf = PF_INET,
111 .hooknum = NF_IP_LOCAL_OUT,
112 .priority = NF_IP_PRI_RAW
113 },
114};
115
116static int __init init(void)
117{
118 int ret;
119
120 /* Register table */
121 ret = ipt_register_table(&packet_raw, &initial_table.repl);
122 if (ret < 0)
123 return ret;
124
125 /* Register hooks */
126 ret = nf_register_hook(&ipt_ops[0]);
127 if (ret < 0)
128 goto cleanup_table;
129
130 ret = nf_register_hook(&ipt_ops[1]);
131 if (ret < 0)
132 goto cleanup_hook0;
133
134 return ret;
135
136 cleanup_hook0:
137 nf_unregister_hook(&ipt_ops[0]);
138 cleanup_table:
139 ipt_unregister_table(&packet_raw);
140
141 return ret;
142}
143
144static void __exit fini(void)
145{
146 unsigned int i;
147
148 for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++)
149 nf_unregister_hook(&ipt_ops[i]);
150
151 ipt_unregister_table(&packet_raw);
152}
153
154module_init(init);
155module_exit(fini);
156MODULE_LICENSE("GPL");
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
new file mode 100644
index 000000000000..912bbcc7f415
--- /dev/null
+++ b/net/ipv4/proc.c
@@ -0,0 +1,382 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * This file implements the various access functions for the
7 * PROC file system. It is mainly used for debugging and
8 * statistics.
9 *
10 * Version: $Id: proc.c,v 1.45 2001/05/16 16:45:35 davem Exp $
11 *
12 * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
13 * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de>
14 * Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de>
15 * Erik Schoenfelder, <schoenfr@ibr.cs.tu-bs.de>
16 *
17 * Fixes:
18 * Alan Cox : UDP sockets show the rxqueue/txqueue
19 * using hint flag for the netinfo.
20 * Pauline Middelink : identd support
21 * Alan Cox : Make /proc safer.
22 * Erik Schoenfelder : /proc/net/snmp
23 * Alan Cox : Handle dead sockets properly.
24 * Gerhard Koerting : Show both timers
25 * Alan Cox : Allow inode to be NULL (kernel socket)
26 * Andi Kleen : Add support for open_requests and
27 *		split functions for more readability.
28 * Andi Kleen : Add support for /proc/net/netstat
29 * Arnaldo C. Melo : Convert to seq_file
30 *
31 * This program is free software; you can redistribute it and/or
32 * modify it under the terms of the GNU General Public License
33 * as published by the Free Software Foundation; either version
34 * 2 of the License, or (at your option) any later version.
35 */
36#include <linux/types.h>
37#include <net/icmp.h>
38#include <net/protocol.h>
39#include <net/tcp.h>
40#include <net/udp.h>
41#include <linux/proc_fs.h>
42#include <linux/seq_file.h>
43#include <net/sock.h>
44#include <net/raw.h>
45
46static int fold_prot_inuse(struct proto *proto)
47{
48 int res = 0;
49 int cpu;
50
51 for (cpu = 0; cpu < NR_CPUS; cpu++)
52 res += proto->stats[cpu].inuse;
53
54 return res;
55}
56
57/*
58 * Report socket allocation statistics [mea@utu.fi]
59 */
60static int sockstat_seq_show(struct seq_file *seq, void *v)
61{
62 /* From net/socket.c */
63 extern void socket_seq_show(struct seq_file *seq);
64
65 socket_seq_show(seq);
66 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
67 fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
68 tcp_tw_count, atomic_read(&tcp_sockets_allocated),
69 atomic_read(&tcp_memory_allocated));
70 seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot));
71 seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot));
72 seq_printf(seq, "FRAG: inuse %d memory %d\n", ip_frag_nqueues,
73 atomic_read(&ip_frag_mem));
74 return 0;
75}
76
77static int sockstat_seq_open(struct inode *inode, struct file *file)
78{
79 return single_open(file, sockstat_seq_show, NULL);
80}
81
82static struct file_operations sockstat_seq_fops = {
83 .owner = THIS_MODULE,
84 .open = sockstat_seq_open,
85 .read = seq_read,
86 .llseek = seq_lseek,
87 .release = single_release,
88};
89
90static unsigned long
91fold_field(void *mib[], int offt)
92{
93 unsigned long res = 0;
94 int i;
95
96 for (i = 0; i < NR_CPUS; i++) {
97 if (!cpu_possible(i))
98 continue;
99 res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt);
100 res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt);
101 }
102 return res;
103}
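/*
 * Editor's note, not part of the original patch: fold_field() above sums a
 * single MIB field across every possible CPU and across the two per-CPU MIB
 * blocks (mib[0] and mib[1]; roughly, one is bumped from softirq context and
 * one from process context, and only the sum matters here).  A userspace
 * sketch of the same folding, with a plain array standing in for the
 * per-CPU allocations:
 */
#include <stdio.h>

#define NCPUS	4
#define NFIELDS	3

static unsigned long mib[2][NCPUS][NFIELDS];	/* [block][cpu][field] */

static unsigned long fold(int field)
{
	unsigned long sum = 0;
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		sum += mib[0][cpu][field] + mib[1][cpu][field];
	return sum;
}

int main(void)
{
	mib[0][0][1] = 5;
	mib[1][3][1] = 7;
	printf("field 1 total = %lu\n", fold(1));	/* prints 12 */
	return 0;
}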
104
105/* snmp items */
106static struct snmp_mib snmp4_ipstats_list[] = {
107 SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INRECEIVES),
108 SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS),
109 SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS),
110 SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS),
111 SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS),
112 SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS),
113 SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS),
114 SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTREQUESTS),
115 SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS),
116 SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES),
117 SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT),
118 SNMP_MIB_ITEM("ReasmReqds", IPSTATS_MIB_REASMREQDS),
119 SNMP_MIB_ITEM("ReasmOKs", IPSTATS_MIB_REASMOKS),
120 SNMP_MIB_ITEM("ReasmFails", IPSTATS_MIB_REASMFAILS),
121 SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS),
122 SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS),
123 SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES),
124 SNMP_MIB_SENTINEL
125};
126
127static struct snmp_mib snmp4_icmp_list[] = {
128 SNMP_MIB_ITEM("InMsgs", ICMP_MIB_INMSGS),
129 SNMP_MIB_ITEM("InErrors", ICMP_MIB_INERRORS),
130 SNMP_MIB_ITEM("InDestUnreachs", ICMP_MIB_INDESTUNREACHS),
131 SNMP_MIB_ITEM("InTimeExcds", ICMP_MIB_INTIMEEXCDS),
132 SNMP_MIB_ITEM("InParmProbs", ICMP_MIB_INPARMPROBS),
133 SNMP_MIB_ITEM("InSrcQuenchs", ICMP_MIB_INSRCQUENCHS),
134 SNMP_MIB_ITEM("InRedirects", ICMP_MIB_INREDIRECTS),
135 SNMP_MIB_ITEM("InEchos", ICMP_MIB_INECHOS),
136 SNMP_MIB_ITEM("InEchoReps", ICMP_MIB_INECHOREPS),
137 SNMP_MIB_ITEM("InTimestamps", ICMP_MIB_INTIMESTAMPS),
138 SNMP_MIB_ITEM("InTimestampReps", ICMP_MIB_INTIMESTAMPREPS),
139 SNMP_MIB_ITEM("InAddrMasks", ICMP_MIB_INADDRMASKS),
140 SNMP_MIB_ITEM("InAddrMaskReps", ICMP_MIB_INADDRMASKREPS),
141 SNMP_MIB_ITEM("OutMsgs", ICMP_MIB_OUTMSGS),
142 SNMP_MIB_ITEM("OutErrors", ICMP_MIB_OUTERRORS),
143 SNMP_MIB_ITEM("OutDestUnreachs", ICMP_MIB_OUTDESTUNREACHS),
144 SNMP_MIB_ITEM("OutTimeExcds", ICMP_MIB_OUTTIMEEXCDS),
145 SNMP_MIB_ITEM("OutParmProbs", ICMP_MIB_OUTPARMPROBS),
146 SNMP_MIB_ITEM("OutSrcQuenchs", ICMP_MIB_OUTSRCQUENCHS),
147 SNMP_MIB_ITEM("OutRedirects", ICMP_MIB_OUTREDIRECTS),
148 SNMP_MIB_ITEM("OutEchos", ICMP_MIB_OUTECHOS),
149 SNMP_MIB_ITEM("OutEchoReps", ICMP_MIB_OUTECHOREPS),
150 SNMP_MIB_ITEM("OutTimestamps", ICMP_MIB_OUTTIMESTAMPS),
151 SNMP_MIB_ITEM("OutTimestampReps", ICMP_MIB_OUTTIMESTAMPREPS),
152 SNMP_MIB_ITEM("OutAddrMasks", ICMP_MIB_OUTADDRMASKS),
153 SNMP_MIB_ITEM("OutAddrMaskReps", ICMP_MIB_OUTADDRMASKREPS),
154 SNMP_MIB_SENTINEL
155};
156
157static struct snmp_mib snmp4_tcp_list[] = {
158 SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM),
159 SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN),
160 SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX),
161 SNMP_MIB_ITEM("MaxConn", TCP_MIB_MAXCONN),
162 SNMP_MIB_ITEM("ActiveOpens", TCP_MIB_ACTIVEOPENS),
163 SNMP_MIB_ITEM("PassiveOpens", TCP_MIB_PASSIVEOPENS),
164 SNMP_MIB_ITEM("AttemptFails", TCP_MIB_ATTEMPTFAILS),
165 SNMP_MIB_ITEM("EstabResets", TCP_MIB_ESTABRESETS),
166 SNMP_MIB_ITEM("CurrEstab", TCP_MIB_CURRESTAB),
167 SNMP_MIB_ITEM("InSegs", TCP_MIB_INSEGS),
168 SNMP_MIB_ITEM("OutSegs", TCP_MIB_OUTSEGS),
169 SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS),
170 SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS),
171 SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS),
172 SNMP_MIB_SENTINEL
173};
174
175static struct snmp_mib snmp4_udp_list[] = {
176 SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS),
177 SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS),
178 SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS),
179 SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS),
180 SNMP_MIB_SENTINEL
181};
182
183static struct snmp_mib snmp4_net_list[] = {
184 SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT),
185 SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV),
186 SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED),
187 SNMP_MIB_ITEM("EmbryonicRsts", LINUX_MIB_EMBRYONICRSTS),
188 SNMP_MIB_ITEM("PruneCalled", LINUX_MIB_PRUNECALLED),
189 SNMP_MIB_ITEM("RcvPruned", LINUX_MIB_RCVPRUNED),
190 SNMP_MIB_ITEM("OfoPruned", LINUX_MIB_OFOPRUNED),
191 SNMP_MIB_ITEM("OutOfWindowIcmps", LINUX_MIB_OUTOFWINDOWICMPS),
192 SNMP_MIB_ITEM("LockDroppedIcmps", LINUX_MIB_LOCKDROPPEDICMPS),
193 SNMP_MIB_ITEM("ArpFilter", LINUX_MIB_ARPFILTER),
194 SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED),
195 SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED),
196 SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED),
197 SNMP_MIB_ITEM("PAWSPassive", LINUX_MIB_PAWSPASSIVEREJECTED),
198 SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED),
199 SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
200 SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS),
201 SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED),
202 SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
203 SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
204 SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
205 SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED),
206 SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG),
207 SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE),
208 SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED),
209 SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
210 SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER),
211 SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
212 SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
213 SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
214 SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
215 SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
216 SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER),
217 SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER),
218 SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER),
219 SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER),
220 SNMP_MIB_ITEM("TCPFullUndo", LINUX_MIB_TCPFULLUNDO),
221 SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO),
222 SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO),
223 SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO),
224 SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS),
225 SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT),
226 SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES),
227 SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
228 SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES),
229 SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS),
230 SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
231 SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
232 SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
233 SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
234 SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
235 SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
236 SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
237 SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
238 SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
239 SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
240 SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV),
241 SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN),
242 SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),
243 SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),
244 SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY),
245 SNMP_MIB_ITEM("TCPAbortOnTimeout", LINUX_MIB_TCPABORTONTIMEOUT),
246 SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER),
247 SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED),
248 SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES),
249 SNMP_MIB_SENTINEL
250};
251
252/*
253 * Called from the PROCfs module. This outputs /proc/net/snmp.
254 */
255static int snmp_seq_show(struct seq_file *seq, void *v)
256{
257 int i;
258
259 seq_puts(seq, "Ip: Forwarding DefaultTTL");
260
261 for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
262 seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
263
264 seq_printf(seq, "\nIp: %d %d",
265 ipv4_devconf.forwarding ? 1 : 2, sysctl_ip_default_ttl);
266
267 for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
268 seq_printf(seq, " %lu",
269 fold_field((void **) ip_statistics,
270 snmp4_ipstats_list[i].entry));
271
272 seq_puts(seq, "\nIcmp:");
273 for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
274 seq_printf(seq, " %s", snmp4_icmp_list[i].name);
275
276 seq_puts(seq, "\nIcmp:");
277 for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
278 seq_printf(seq, " %lu",
279 fold_field((void **) icmp_statistics,
280 snmp4_icmp_list[i].entry));
281
282 seq_puts(seq, "\nTcp:");
283 for (i = 0; snmp4_tcp_list[i].name != NULL; i++)
284 seq_printf(seq, " %s", snmp4_tcp_list[i].name);
285
286 seq_puts(seq, "\nTcp:");
287 for (i = 0; snmp4_tcp_list[i].name != NULL; i++) {
288 /* MaxConn field is signed, RFC 2012 */
289 if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
290 seq_printf(seq, " %ld",
291 fold_field((void **) tcp_statistics,
292 snmp4_tcp_list[i].entry));
293 else
294 seq_printf(seq, " %lu",
295 fold_field((void **) tcp_statistics,
296 snmp4_tcp_list[i].entry));
297 }
298
299 seq_puts(seq, "\nUdp:");
300 for (i = 0; snmp4_udp_list[i].name != NULL; i++)
301 seq_printf(seq, " %s", snmp4_udp_list[i].name);
302
303 seq_puts(seq, "\nUdp:");
304 for (i = 0; snmp4_udp_list[i].name != NULL; i++)
305 seq_printf(seq, " %lu",
306 fold_field((void **) udp_statistics,
307 snmp4_udp_list[i].entry));
308
309 seq_putc(seq, '\n');
310 return 0;
311}
312
313static int snmp_seq_open(struct inode *inode, struct file *file)
314{
315 return single_open(file, snmp_seq_show, NULL);
316}
317
318static struct file_operations snmp_seq_fops = {
319 .owner = THIS_MODULE,
320 .open = snmp_seq_open,
321 .read = seq_read,
322 .llseek = seq_lseek,
323 .release = single_release,
324};
325
326/*
327 * Output /proc/net/netstat
328 */
329static int netstat_seq_show(struct seq_file *seq, void *v)
330{
331 int i;
332
333 seq_puts(seq, "TcpExt:");
334 for (i = 0; snmp4_net_list[i].name != NULL; i++)
335 seq_printf(seq, " %s", snmp4_net_list[i].name);
336
337 seq_puts(seq, "\nTcpExt:");
338 for (i = 0; snmp4_net_list[i].name != NULL; i++)
339 seq_printf(seq, " %lu",
340 fold_field((void **) net_statistics,
341 snmp4_net_list[i].entry));
342
343 seq_putc(seq, '\n');
344 return 0;
345}
346
347static int netstat_seq_open(struct inode *inode, struct file *file)
348{
349 return single_open(file, netstat_seq_show, NULL);
350}
351
352static struct file_operations netstat_seq_fops = {
353 .owner = THIS_MODULE,
354 .open = netstat_seq_open,
355 .read = seq_read,
356 .llseek = seq_lseek,
357 .release = single_release,
358};
359
360int __init ip_misc_proc_init(void)
361{
362 int rc = 0;
363
364 if (!proc_net_fops_create("netstat", S_IRUGO, &netstat_seq_fops))
365 goto out_netstat;
366
367 if (!proc_net_fops_create("snmp", S_IRUGO, &snmp_seq_fops))
368 goto out_snmp;
369
370 if (!proc_net_fops_create("sockstat", S_IRUGO, &sockstat_seq_fops))
371 goto out_sockstat;
372out:
373 return rc;
374out_sockstat:
375 proc_net_remove("snmp");
376out_snmp:
377 proc_net_remove("netstat");
378out_netstat:
379 rc = -ENOMEM;
380 goto out;
381}
382
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
new file mode 100644
index 000000000000..90a587cacaa4
--- /dev/null
+++ b/net/ipv4/protocol.c
@@ -0,0 +1,101 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * INET protocol dispatch tables.
7 *
8 * Version: $Id: protocol.c,v 1.14 2001/05/18 02:25:49 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *
13 * Fixes:
14 * Alan Cox : Ahah! udp icmp errors don't work because
15 * udp_err is never called!
16 * Alan Cox : Added new fields for init and ready for
17 * proper fragmentation (_NO_ 4K limits!)
18 * Richard Colella : Hang on hash collision
19 * Vince Laviano : Modified inet_del_protocol() to correctly
20 * maintain copy bit.
21 *
22 * This program is free software; you can redistribute it and/or
23 * modify it under the terms of the GNU General Public License
24 * as published by the Free Software Foundation; either version
25 * 2 of the License, or (at your option) any later version.
26 */
27
28#include <asm/uaccess.h>
29#include <asm/system.h>
30#include <linux/module.h>
31#include <linux/types.h>
32#include <linux/kernel.h>
33#include <linux/sched.h>
34#include <linux/string.h>
35#include <linux/config.h>
36#include <linux/socket.h>
37#include <linux/in.h>
38#include <linux/inet.h>
39#include <linux/netdevice.h>
40#include <linux/timer.h>
41#include <net/ip.h>
42#include <net/protocol.h>
43#include <net/tcp.h>
44#include <linux/skbuff.h>
45#include <net/sock.h>
46#include <net/icmp.h>
47#include <net/udp.h>
48#include <net/ipip.h>
49#include <linux/igmp.h>
50
51struct net_protocol *inet_protos[MAX_INET_PROTOS];
52static DEFINE_SPINLOCK(inet_proto_lock);
53
54/*
55 * Add a protocol handler to the hash tables
56 */
57
58int inet_add_protocol(struct net_protocol *prot, unsigned char protocol)
59{
60 int hash, ret;
61
62 hash = protocol & (MAX_INET_PROTOS - 1);
63
64 spin_lock_bh(&inet_proto_lock);
65 if (inet_protos[hash]) {
66 ret = -1;
67 } else {
68 inet_protos[hash] = prot;
69 ret = 0;
70 }
71 spin_unlock_bh(&inet_proto_lock);
72
73 return ret;
74}
75
76/*
77 * Remove a protocol from the hash tables.
78 */
79
80int inet_del_protocol(struct net_protocol *prot, unsigned char protocol)
81{
82 int hash, ret;
83
84 hash = protocol & (MAX_INET_PROTOS - 1);
85
86 spin_lock_bh(&inet_proto_lock);
87 if (inet_protos[hash] == prot) {
88 inet_protos[hash] = NULL;
89 ret = 0;
90 } else {
91 ret = -1;
92 }
93 spin_unlock_bh(&inet_proto_lock);
94
95 synchronize_net();
96
97 return ret;
98}
99
100EXPORT_SYMBOL(inet_add_protocol);
101EXPORT_SYMBOL(inet_del_protocol);
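/*
 * Editor's note, not part of the original patch: inet_add_protocol() and
 * inet_del_protocol() above implement a fixed-size, one-handler-per-slot
 * dispatch table indexed by (protocol & (MAX_INET_PROTOS - 1)).  A minimal
 * userspace model of that single-slot registration, without the locking;
 * MAX_PROTOS and the handler type below are stand-ins.
 */
#include <stdio.h>
#include <stddef.h>

#define MAX_PROTOS 256

typedef int (*proto_handler)(void);
static proto_handler protos[MAX_PROTOS];

static int add_protocol(proto_handler h, unsigned char protocol)
{
	unsigned int hash = protocol & (MAX_PROTOS - 1);

	if (protos[hash])
		return -1;		/* slot already taken */
	protos[hash] = h;
	return 0;
}

static int del_protocol(proto_handler h, unsigned char protocol)
{
	unsigned int hash = protocol & (MAX_PROTOS - 1);

	if (protos[hash] != h)
		return -1;		/* not the registered handler */
	protos[hash] = NULL;
	return 0;
}

static int dummy_handler(void) { return 0; }

int main(void)
{
	printf("add:       %d\n", add_protocol(dummy_handler, 6));	/* 0 */
	printf("add again: %d\n", add_protocol(dummy_handler, 6));	/* -1 */
	printf("del:       %d\n", del_protocol(dummy_handler, 6));	/* 0 */
	return 0;
}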
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
new file mode 100644
index 000000000000..93624a32eb9a
--- /dev/null
+++ b/net/ipv4/raw.c
@@ -0,0 +1,888 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * RAW - implementation of IP "raw" sockets.
7 *
8 * Version: $Id: raw.c,v 1.64 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *
13 * Fixes:
14 * Alan Cox : verify_area() fixed up
15 * Alan Cox : ICMP error handling
16 * Alan Cox : EMSGSIZE if you send too big a packet
17 * Alan Cox : Now uses generic datagrams and shared
18 * skbuff library. No more peek crashes,
19 * no more backlogs
20 * Alan Cox : Checks sk->broadcast.
21 * Alan Cox : Uses skb_free_datagram/skb_copy_datagram
22 * Alan Cox : Raw passes ip options too
23 * Alan Cox : Setsocketopt added
24 * Alan Cox : Fixed error return for broadcasts
25 * Alan Cox : Removed wake_up calls
26 * Alan Cox : Use ttl/tos
27 * Alan Cox : Cleaned up old debugging
28 * Alan Cox : Use new kernel side addresses
29 * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets.
30 * Alan Cox : BSD style RAW socket demultiplexing.
31 * Alan Cox : Beginnings of mrouted support.
32 * Alan Cox : Added IP_HDRINCL option.
33 * Alan Cox : Skip broadcast check if BSDism set.
34 * David S. Miller : New socket lookup architecture.
35 *
36 * This program is free software; you can redistribute it and/or
37 * modify it under the terms of the GNU General Public License
38 * as published by the Free Software Foundation; either version
39 * 2 of the License, or (at your option) any later version.
40 */
41
42#include <linux/config.h>
43#include <asm/atomic.h>
44#include <asm/byteorder.h>
45#include <asm/current.h>
46#include <asm/uaccess.h>
47#include <asm/ioctls.h>
48#include <linux/types.h>
49#include <linux/stddef.h>
50#include <linux/slab.h>
51#include <linux/errno.h>
52#include <linux/aio.h>
53#include <linux/kernel.h>
54#include <linux/spinlock.h>
55#include <linux/sockios.h>
56#include <linux/socket.h>
57#include <linux/in.h>
58#include <linux/mroute.h>
59#include <linux/netdevice.h>
60#include <linux/in_route.h>
61#include <linux/route.h>
62#include <linux/tcp.h>
63#include <linux/skbuff.h>
64#include <net/dst.h>
65#include <net/sock.h>
66#include <linux/gfp.h>
67#include <linux/ip.h>
68#include <linux/net.h>
69#include <net/ip.h>
70#include <net/icmp.h>
71#include <net/udp.h>
72#include <net/raw.h>
73#include <net/snmp.h>
74#include <net/inet_common.h>
75#include <net/checksum.h>
76#include <net/xfrm.h>
77#include <linux/rtnetlink.h>
78#include <linux/proc_fs.h>
79#include <linux/seq_file.h>
80#include <linux/netfilter.h>
81#include <linux/netfilter_ipv4.h>
82
83struct hlist_head raw_v4_htable[RAWV4_HTABLE_SIZE];
84DEFINE_RWLOCK(raw_v4_lock);
85
86static void raw_v4_hash(struct sock *sk)
87{
88 struct hlist_head *head = &raw_v4_htable[inet_sk(sk)->num &
89 (RAWV4_HTABLE_SIZE - 1)];
90
91 write_lock_bh(&raw_v4_lock);
92 sk_add_node(sk, head);
93 sock_prot_inc_use(sk->sk_prot);
94 write_unlock_bh(&raw_v4_lock);
95}
96
97static void raw_v4_unhash(struct sock *sk)
98{
99 write_lock_bh(&raw_v4_lock);
100 if (sk_del_node_init(sk))
101 sock_prot_dec_use(sk->sk_prot);
102 write_unlock_bh(&raw_v4_lock);
103}
104
105struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
106 unsigned long raddr, unsigned long laddr,
107 int dif)
108{
109 struct hlist_node *node;
110
111 sk_for_each_from(sk, node) {
112 struct inet_sock *inet = inet_sk(sk);
113
114 if (inet->num == num &&
115 !(inet->daddr && inet->daddr != raddr) &&
116 !(inet->rcv_saddr && inet->rcv_saddr != laddr) &&
117 !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
118 goto found; /* gotcha */
119 }
120 sk = NULL;
121found:
122 return sk;
123}
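/*
 * Editor's note, not part of the original patch: __raw_v4_lookup() above
 * treats an unset (zero) remote address, local address or bound device as a
 * wildcard.  The per-socket test, isolated over a hypothetical snapshot of
 * the fields it inspects:
 */
#include <stdio.h>

struct raw_key {
	unsigned short num;		/* protocol number the socket is bound to */
	unsigned int daddr;		/* 0 means "any remote address" */
	unsigned int rcv_saddr;		/* 0 means "any local address" */
	int bound_dev_if;		/* 0 means "any device" */
};

static int raw_key_matches(const struct raw_key *k, unsigned short num,
			   unsigned int raddr, unsigned int laddr, int dif)
{
	return k->num == num
	    && !(k->daddr && k->daddr != raddr)
	    && !(k->rcv_saddr && k->rcv_saddr != laddr)
	    && !(k->bound_dev_if && k->bound_dev_if != dif);
}

int main(void)
{
	struct raw_key k = { 1 /* ICMP */, 0, 0, 0 };	/* fully wildcarded */

	printf("matches: %d\n",
	       raw_key_matches(&k, 1, 0x0a000001, 0x0a000002, 3));	/* 1 */
	return 0;
}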
124
125/*
126 * 0 - deliver
127 * 1 - block
128 */
129static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
130{
131 int type;
132
133 if (!pskb_may_pull(skb, sizeof(struct icmphdr)))
134 return 1;
135
136 type = skb->h.icmph->type;
137 if (type < 32) {
138 __u32 data = raw_sk(sk)->filter.data;
139
140 return ((1 << type) & data) != 0;
141 }
142
143 /* Do not block unknown ICMP types */
144 return 0;
145}
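/*
 * Editor's note, not part of the original patch: icmp_filter() above blocks
 * an ICMP type iff the corresponding bit is set in the socket's 32-bit
 * filter word; types >= 32 are always delivered.  A self-contained sketch of
 * the same test:
 */
#include <stdio.h>

/* Returns 1 to block the packet, 0 to deliver it. */
static int icmp_type_blocked(unsigned int filter_data, unsigned int type)
{
	if (type >= 32)
		return 0;		/* unknown types are never filtered */
	return ((1u << type) & filter_data) != 0;
}

int main(void)
{
	unsigned int filter = 1u << 8;	/* filter out echo requests (type 8) */

	printf("type 8 blocked: %d\n", icmp_type_blocked(filter, 8));	/* 1 */
	printf("type 0 blocked: %d\n", icmp_type_blocked(filter, 0));	/* 0 */
	return 0;
}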
146
147/* IP input processing comes here for RAW socket delivery.
148 * Caller owns SKB, so we must make clones.
149 *
150 * RFC 1122: SHOULD pass TOS value up to the transport layer.
151 * -> It does. And not only TOS, but all IP header.
152 */
153void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
154{
155 struct sock *sk;
156 struct hlist_head *head;
157
158 read_lock(&raw_v4_lock);
159 head = &raw_v4_htable[hash];
160 if (hlist_empty(head))
161 goto out;
162 sk = __raw_v4_lookup(__sk_head(head), iph->protocol,
163 iph->saddr, iph->daddr,
164 skb->dev->ifindex);
165
166 while (sk) {
167 if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
168 struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
169
170 /* Not releasing hash table! */
171 if (clone)
172 raw_rcv(sk, clone);
173 }
174 sk = __raw_v4_lookup(sk_next(sk), iph->protocol,
175 iph->saddr, iph->daddr,
176 skb->dev->ifindex);
177 }
178out:
179 read_unlock(&raw_v4_lock);
180}
181
182void raw_err (struct sock *sk, struct sk_buff *skb, u32 info)
183{
184 struct inet_sock *inet = inet_sk(sk);
185 int type = skb->h.icmph->type;
186 int code = skb->h.icmph->code;
187 int err = 0;
188 int harderr = 0;
189
190 /* Report error on raw socket, if:
191 1. User requested ip_recverr.
192 2. Socket is connected (otherwise the error indication
193	   is useless without ip_recverr and the error is hard.)
194 */
195 if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED)
196 return;
197
198 switch (type) {
199 default:
200 case ICMP_TIME_EXCEEDED:
201 err = EHOSTUNREACH;
202 break;
203 case ICMP_SOURCE_QUENCH:
204 return;
205 case ICMP_PARAMETERPROB:
206 err = EPROTO;
207 harderr = 1;
208 break;
209 case ICMP_DEST_UNREACH:
210 err = EHOSTUNREACH;
211 if (code > NR_ICMP_UNREACH)
212 break;
213 err = icmp_err_convert[code].errno;
214 harderr = icmp_err_convert[code].fatal;
215 if (code == ICMP_FRAG_NEEDED) {
216 harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
217 err = EMSGSIZE;
218 }
219 }
220
221 if (inet->recverr) {
222 struct iphdr *iph = (struct iphdr*)skb->data;
223 u8 *payload = skb->data + (iph->ihl << 2);
224
225 if (inet->hdrincl)
226 payload = skb->data;
227 ip_icmp_error(sk, skb, err, 0, info, payload);
228 }
229
230 if (inet->recverr || harderr) {
231 sk->sk_err = err;
232 sk->sk_error_report(sk);
233 }
234}
235
236static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
237{
238 /* Charge it to the socket. */
239
240 if (sock_queue_rcv_skb(sk, skb) < 0) {
241 /* FIXME: increment a raw drops counter here */
242 kfree_skb(skb);
243 return NET_RX_DROP;
244 }
245
246 return NET_RX_SUCCESS;
247}
248
249int raw_rcv(struct sock *sk, struct sk_buff *skb)
250{
251 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
252 kfree_skb(skb);
253 return NET_RX_DROP;
254 }
255
256 skb_push(skb, skb->data - skb->nh.raw);
257
258 raw_rcv_skb(sk, skb);
259 return 0;
260}
261
262static int raw_send_hdrinc(struct sock *sk, void *from, int length,
263 struct rtable *rt,
264 unsigned int flags)
265{
266 struct inet_sock *inet = inet_sk(sk);
267 int hh_len;
268 struct iphdr *iph;
269 struct sk_buff *skb;
270 int err;
271
272 if (length > rt->u.dst.dev->mtu) {
273 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport,
274 rt->u.dst.dev->mtu);
275 return -EMSGSIZE;
276 }
277 if (flags&MSG_PROBE)
278 goto out;
279
280 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
281
282 skb = sock_alloc_send_skb(sk, length+hh_len+15,
283 flags&MSG_DONTWAIT, &err);
284 if (skb == NULL)
285 goto error;
286 skb_reserve(skb, hh_len);
287
288 skb->priority = sk->sk_priority;
289 skb->dst = dst_clone(&rt->u.dst);
290
291 skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
292
293 skb->ip_summed = CHECKSUM_NONE;
294
295 skb->h.raw = skb->nh.raw;
296 err = memcpy_fromiovecend((void *)iph, from, 0, length);
297 if (err)
298 goto error_fault;
299
300 /* We don't modify invalid header */
301 if (length >= sizeof(*iph) && iph->ihl * 4 <= length) {
302 if (!iph->saddr)
303 iph->saddr = rt->rt_src;
304 iph->check = 0;
305 iph->tot_len = htons(length);
306 if (!iph->id)
307 ip_select_ident(iph, &rt->u.dst, NULL);
308
309 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
310 }
311
312 err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
313 dst_output);
314 if (err > 0)
315 err = inet->recverr ? net_xmit_errno(err) : 0;
316 if (err)
317 goto error;
318out:
319 return 0;
320
321error_fault:
322 err = -EFAULT;
323 kfree_skb(skb);
324error:
325 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
326 return err;
327}
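/*
 * Editor's note, not part of the original patch: raw_send_hdrinc() above
 * recomputes the IP header checksum with ip_fast_csum().  A portable sketch
 * of the same ones'-complement sum over the header bytes (RFC 1071 style);
 * the checksum field itself must be zeroed before summing.  The sample
 * header in main() is illustrative only.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static uint16_t ip_checksum(const uint8_t *hdr, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)	/* sum 16-bit words, network order */
		sum += ((uint32_t)hdr[i] << 8) | hdr[i + 1];
	if (len & 1)				/* odd trailing byte */
		sum += (uint32_t)hdr[len - 1] << 8;
	while (sum >> 16)			/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* 20-byte IPv4 header with the checksum field (bytes 10-11) zeroed. */
	uint8_t hdr[20] = { 0x45, 0x00, 0x00, 0x54, 0x00, 0x00, 0x40, 0x00,
			    0x40, 0x01, 0x00, 0x00, 0xc0, 0xa8, 0x00, 0x01,
			    0xc0, 0xa8, 0x00, 0x02 };

	printf("checksum = 0x%04x\n", (unsigned)ip_checksum(hdr, sizeof(hdr)));
	return 0;
}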
328
329static void raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
330{
331 struct iovec *iov;
332 u8 __user *type = NULL;
333 u8 __user *code = NULL;
334 int probed = 0;
335 int i;
336
337 if (!msg->msg_iov)
338 return;
339
340 for (i = 0; i < msg->msg_iovlen; i++) {
341 iov = &msg->msg_iov[i];
342 if (!iov)
343 continue;
344
345 switch (fl->proto) {
346 case IPPROTO_ICMP:
347 /* check if one-byte field is readable or not. */
348 if (iov->iov_base && iov->iov_len < 1)
349 break;
350
351 if (!type) {
352 type = iov->iov_base;
353 /* check if code field is readable or not. */
354 if (iov->iov_len > 1)
355 code = type + 1;
356 } else if (!code)
357 code = iov->iov_base;
358
359 if (type && code) {
360 get_user(fl->fl_icmp_type, type);
361 __get_user(fl->fl_icmp_code, code);
362 probed = 1;
363 }
364 break;
365 default:
366 probed = 1;
367 break;
368 }
369 if (probed)
370 break;
371 }
372}
373
374static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
375 size_t len)
376{
377 struct inet_sock *inet = inet_sk(sk);
378 struct ipcm_cookie ipc;
379 struct rtable *rt = NULL;
380 int free = 0;
381 u32 daddr;
382 u32 saddr;
383 u8 tos;
384 int err;
385
386 err = -EMSGSIZE;
387 if (len < 0 || len > 0xFFFF)
388 goto out;
389
390 /*
391 * Check the flags.
392 */
393
394 err = -EOPNOTSUPP;
395 if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */
396 goto out; /* compatibility */
397
398 /*
399 * Get and verify the address.
400 */
401
402 if (msg->msg_namelen) {
403 struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name;
404 err = -EINVAL;
405 if (msg->msg_namelen < sizeof(*usin))
406 goto out;
407 if (usin->sin_family != AF_INET) {
408 static int complained;
409 if (!complained++)
410 printk(KERN_INFO "%s forgot to set AF_INET in "
411 "raw sendmsg. Fix it!\n",
412 current->comm);
413 err = -EAFNOSUPPORT;
414 if (usin->sin_family)
415 goto out;
416 }
417 daddr = usin->sin_addr.s_addr;
418 /* ANK: I did not forget to get protocol from port field.
419	 * I just do not know who uses this weirdness.
420 * IP_HDRINCL is much more convenient.
421 */
422 } else {
423 err = -EDESTADDRREQ;
424 if (sk->sk_state != TCP_ESTABLISHED)
425 goto out;
426 daddr = inet->daddr;
427 }
428
429 ipc.addr = inet->saddr;
430 ipc.opt = NULL;
431 ipc.oif = sk->sk_bound_dev_if;
432
433 if (msg->msg_controllen) {
434 err = ip_cmsg_send(msg, &ipc);
435 if (err)
436 goto out;
437 if (ipc.opt)
438 free = 1;
439 }
440
441 saddr = ipc.addr;
442 ipc.addr = daddr;
443
444 if (!ipc.opt)
445 ipc.opt = inet->opt;
446
447 if (ipc.opt) {
448 err = -EINVAL;
449 /* Linux does not mangle headers on raw sockets,
450		 * so IP options + IP_HDRINCL makes no sense.
451 */
452 if (inet->hdrincl)
453 goto done;
454 if (ipc.opt->srr) {
455 if (!daddr)
456 goto done;
457 daddr = ipc.opt->faddr;
458 }
459 }
460 tos = RT_CONN_FLAGS(sk);
461 if (msg->msg_flags & MSG_DONTROUTE)
462 tos |= RTO_ONLINK;
463
464 if (MULTICAST(daddr)) {
465 if (!ipc.oif)
466 ipc.oif = inet->mc_index;
467 if (!saddr)
468 saddr = inet->mc_addr;
469 }
470
471 {
472 struct flowi fl = { .oif = ipc.oif,
473 .nl_u = { .ip4_u =
474 { .daddr = daddr,
475 .saddr = saddr,
476 .tos = tos } },
477 .proto = inet->hdrincl ? IPPROTO_RAW :
478 sk->sk_protocol,
479 };
480 if (!inet->hdrincl)
481 raw_probe_proto_opt(&fl, msg);
482
483 err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT));
484 }
485 if (err)
486 goto done;
487
488 err = -EACCES;
489 if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
490 goto done;
491
492 if (msg->msg_flags & MSG_CONFIRM)
493 goto do_confirm;
494back_from_confirm:
495
496 if (inet->hdrincl)
497 err = raw_send_hdrinc(sk, msg->msg_iov, len,
498 rt, msg->msg_flags);
499
500 else {
501 if (!ipc.addr)
502 ipc.addr = rt->rt_dst;
503 lock_sock(sk);
504 err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
505 &ipc, rt, msg->msg_flags);
506 if (err)
507 ip_flush_pending_frames(sk);
508 else if (!(msg->msg_flags & MSG_MORE))
509 err = ip_push_pending_frames(sk);
510 release_sock(sk);
511 }
512done:
513 if (free)
514 kfree(ipc.opt);
515 ip_rt_put(rt);
516
517out: return err < 0 ? err : len;
518
519do_confirm:
520 dst_confirm(&rt->u.dst);
521 if (!(msg->msg_flags & MSG_PROBE) || len)
522 goto back_from_confirm;
523 err = 0;
524 goto done;
525}
526
527static void raw_close(struct sock *sk, long timeout)
528{
529 /*
530	 * Raw sockets may have direct kernel references. Kill them.
531 */
532 ip_ra_control(sk, 0, NULL);
533
534 sk_common_release(sk);
535}
536
537/* This gets rid of all the nasties in af_inet. -DaveM */
538static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
539{
540 struct inet_sock *inet = inet_sk(sk);
541 struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
542 int ret = -EINVAL;
543 int chk_addr_ret;
544
545 if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
546 goto out;
547 chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
548 ret = -EADDRNOTAVAIL;
549 if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
550 chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
551 goto out;
552 inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
553 if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
554 inet->saddr = 0; /* Use device */
555 sk_dst_reset(sk);
556 ret = 0;
557out: return ret;
558}
559
560/*
561 * This should be easy: if there is something there
562 * we return it; otherwise we block.
563 */
564
565static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
566 size_t len, int noblock, int flags, int *addr_len)
567{
568 struct inet_sock *inet = inet_sk(sk);
569 size_t copied = 0;
570 int err = -EOPNOTSUPP;
571 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
572 struct sk_buff *skb;
573
574 if (flags & MSG_OOB)
575 goto out;
576
577 if (addr_len)
578 *addr_len = sizeof(*sin);
579
580 if (flags & MSG_ERRQUEUE) {
581 err = ip_recv_error(sk, msg, len);
582 goto out;
583 }
584
585 skb = skb_recv_datagram(sk, flags, noblock, &err);
586 if (!skb)
587 goto out;
588
589 copied = skb->len;
590 if (len < copied) {
591 msg->msg_flags |= MSG_TRUNC;
592 copied = len;
593 }
594
595 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
596 if (err)
597 goto done;
598
599 sock_recv_timestamp(msg, sk, skb);
600
601 /* Copy the address. */
602 if (sin) {
603 sin->sin_family = AF_INET;
604 sin->sin_addr.s_addr = skb->nh.iph->saddr;
605 memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
606 }
607 if (inet->cmsg_flags)
608 ip_cmsg_recv(msg, skb);
609 if (flags & MSG_TRUNC)
610 copied = skb->len;
611done:
612 skb_free_datagram(sk, skb);
613out: return err ? err : copied;
614}
615
616static int raw_init(struct sock *sk)
617{
618 struct raw_sock *rp = raw_sk(sk);
619
620 if (inet_sk(sk)->num == IPPROTO_ICMP)
621 memset(&rp->filter, 0, sizeof(rp->filter));
622 return 0;
623}
624
625static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen)
626{
627 if (optlen > sizeof(struct icmp_filter))
628 optlen = sizeof(struct icmp_filter);
629 if (copy_from_user(&raw_sk(sk)->filter, optval, optlen))
630 return -EFAULT;
631 return 0;
632}
633
634static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen)
635{
636 int len, ret = -EFAULT;
637
638 if (get_user(len, optlen))
639 goto out;
640 ret = -EINVAL;
641 if (len < 0)
642 goto out;
643 if (len > sizeof(struct icmp_filter))
644 len = sizeof(struct icmp_filter);
645 ret = -EFAULT;
646 if (put_user(len, optlen) ||
647 copy_to_user(optval, &raw_sk(sk)->filter, len))
648 goto out;
649 ret = 0;
650out: return ret;
651}
652
653static int raw_setsockopt(struct sock *sk, int level, int optname,
654 char __user *optval, int optlen)
655{
656 if (level != SOL_RAW)
657 return ip_setsockopt(sk, level, optname, optval, optlen);
658
659 if (optname == ICMP_FILTER) {
660 if (inet_sk(sk)->num != IPPROTO_ICMP)
661 return -EOPNOTSUPP;
662 else
663 return raw_seticmpfilter(sk, optval, optlen);
664 }
665 return -ENOPROTOOPT;
666}
667
668static int raw_getsockopt(struct sock *sk, int level, int optname,
669 char __user *optval, int __user *optlen)
670{
671 if (level != SOL_RAW)
672 return ip_getsockopt(sk, level, optname, optval, optlen);
673
674 if (optname == ICMP_FILTER) {
675 if (inet_sk(sk)->num != IPPROTO_ICMP)
676 return -EOPNOTSUPP;
677 else
678 return raw_geticmpfilter(sk, optval, optlen);
679 }
680 return -ENOPROTOOPT;
681}
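From userspace, the ICMP_FILTER option handled above is set at level SOL_RAW and is only accepted on IPPROTO_ICMP sockets, matching the -EOPNOTSUPP check. A minimal sketch, assuming the raw(7) semantics where a set bit in icmp_filter.data suppresses delivery of that ICMP type (here everything except echo replies is dropped):

#include <linux/icmp.h>		/* struct icmp_filter, ICMP_FILTER */
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	struct icmp_filter filt;
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* A set bit means "drop this ICMP type" (assumed raw(7) semantics);
	 * only ICMP_ECHOREPLY is left deliverable. */
	filt.data = ~(1U << ICMP_ECHOREPLY);
	if (setsockopt(fd, SOL_RAW, ICMP_FILTER, &filt, sizeof(filt)) < 0)
		perror("setsockopt");
	return 0;
}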
682
683static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
684{
685 switch (cmd) {
686 case SIOCOUTQ: {
687 int amount = atomic_read(&sk->sk_wmem_alloc);
688 return put_user(amount, (int __user *)arg);
689 }
690 case SIOCINQ: {
691 struct sk_buff *skb;
692 int amount = 0;
693
694 spin_lock_irq(&sk->sk_receive_queue.lock);
695 skb = skb_peek(&sk->sk_receive_queue);
696 if (skb != NULL)
697 amount = skb->len;
698 spin_unlock_irq(&sk->sk_receive_queue.lock);
699 return put_user(amount, (int __user *)arg);
700 }
701
702 default:
703#ifdef CONFIG_IP_MROUTE
704 return ipmr_ioctl(sk, cmd, (void __user *)arg);
705#else
706 return -ENOIOCTLCMD;
707#endif
708 }
709}
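The two ioctls above map to SIOCOUTQ (send memory still charged to the socket) and SIOCINQ (the length of the next queued datagram via skb_peek(), not the whole receive queue). A small sketch of querying them from userspace; both values are simply 0 on an idle socket:

#include <linux/sockios.h>	/* SIOCINQ, SIOCOUTQ */
#include <netinet/in.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
	int inq = 0, outq = 0;

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (ioctl(fd, SIOCINQ, &inq) == 0 && ioctl(fd, SIOCOUTQ, &outq) == 0)
		printf("next datagram: %d bytes, unsent write memory: %d bytes\n",
		       inq, outq);
	return 0;
}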
710
711struct proto raw_prot = {
712 .name = "RAW",
713 .owner = THIS_MODULE,
714 .close = raw_close,
715 .connect = ip4_datagram_connect,
716 .disconnect = udp_disconnect,
717 .ioctl = raw_ioctl,
718 .init = raw_init,
719 .setsockopt = raw_setsockopt,
720 .getsockopt = raw_getsockopt,
721 .sendmsg = raw_sendmsg,
722 .recvmsg = raw_recvmsg,
723 .bind = raw_bind,
724 .backlog_rcv = raw_rcv_skb,
725 .hash = raw_v4_hash,
726 .unhash = raw_v4_unhash,
727 .obj_size = sizeof(struct raw_sock),
728};
729
730#ifdef CONFIG_PROC_FS
731struct raw_iter_state {
732 int bucket;
733};
734
735#define raw_seq_private(seq) ((struct raw_iter_state *)(seq)->private)
736
737static struct sock *raw_get_first(struct seq_file *seq)
738{
739 struct sock *sk;
740 struct raw_iter_state* state = raw_seq_private(seq);
741
742 for (state->bucket = 0; state->bucket < RAWV4_HTABLE_SIZE; ++state->bucket) {
743 struct hlist_node *node;
744
745 sk_for_each(sk, node, &raw_v4_htable[state->bucket])
746 if (sk->sk_family == PF_INET)
747 goto found;
748 }
749 sk = NULL;
750found:
751 return sk;
752}
753
754static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
755{
756 struct raw_iter_state* state = raw_seq_private(seq);
757
758 do {
759 sk = sk_next(sk);
760try_again:
761 ;
762 } while (sk && sk->sk_family != PF_INET);
763
764 if (!sk && ++state->bucket < RAWV4_HTABLE_SIZE) {
765 sk = sk_head(&raw_v4_htable[state->bucket]);
766 goto try_again;
767 }
768 return sk;
769}
770
771static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
772{
773 struct sock *sk = raw_get_first(seq);
774
775 if (sk)
776 while (pos && (sk = raw_get_next(seq, sk)) != NULL)
777 --pos;
778 return pos ? NULL : sk;
779}
780
781static void *raw_seq_start(struct seq_file *seq, loff_t *pos)
782{
783 read_lock(&raw_v4_lock);
784 return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
785}
786
787static void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
788{
789 struct sock *sk;
790
791 if (v == SEQ_START_TOKEN)
792 sk = raw_get_first(seq);
793 else
794 sk = raw_get_next(seq, v);
795 ++*pos;
796 return sk;
797}
798
799static void raw_seq_stop(struct seq_file *seq, void *v)
800{
801 read_unlock(&raw_v4_lock);
802}
803
804static __inline__ char *get_raw_sock(struct sock *sp, char *tmpbuf, int i)
805{
806 struct inet_sock *inet = inet_sk(sp);
807 unsigned int dest = inet->daddr,
808 src = inet->rcv_saddr;
809 __u16 destp = 0,
810 srcp = inet->num;
811
812 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
813 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p",
814 i, src, srcp, dest, destp, sp->sk_state,
815 atomic_read(&sp->sk_wmem_alloc),
816 atomic_read(&sp->sk_rmem_alloc),
817 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
818 atomic_read(&sp->sk_refcnt), sp);
819 return tmpbuf;
820}
821
822static int raw_seq_show(struct seq_file *seq, void *v)
823{
824 char tmpbuf[129];
825
826 if (v == SEQ_START_TOKEN)
827 seq_printf(seq, "%-127s\n",
828 " sl local_address rem_address st tx_queue "
829 "rx_queue tr tm->when retrnsmt uid timeout "
830 "inode");
831 else {
832 struct raw_iter_state *state = raw_seq_private(seq);
833
834 seq_printf(seq, "%-127s\n",
835 get_raw_sock(v, tmpbuf, state->bucket));
836 }
837 return 0;
838}
839
840static struct seq_operations raw_seq_ops = {
841 .start = raw_seq_start,
842 .next = raw_seq_next,
843 .stop = raw_seq_stop,
844 .show = raw_seq_show,
845};
846
847static int raw_seq_open(struct inode *inode, struct file *file)
848{
849 struct seq_file *seq;
850 int rc = -ENOMEM;
851 struct raw_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
852
853 if (!s)
854 goto out;
855 rc = seq_open(file, &raw_seq_ops);
856 if (rc)
857 goto out_kfree;
858
859 seq = file->private_data;
860 seq->private = s;
861 memset(s, 0, sizeof(*s));
862out:
863 return rc;
864out_kfree:
865 kfree(s);
866 goto out;
867}
868
869static struct file_operations raw_seq_fops = {
870 .owner = THIS_MODULE,
871 .open = raw_seq_open,
872 .read = seq_read,
873 .llseek = seq_lseek,
874 .release = seq_release_private,
875};
876
877int __init raw_proc_init(void)
878{
879 if (!proc_net_fops_create("raw", S_IRUGO, &raw_seq_fops))
880 return -ENOMEM;
881 return 0;
882}
883
884void __init raw_proc_exit(void)
885{
886 proc_net_remove("raw");
887}
888#endif /* CONFIG_PROC_FS */
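raw_proc_init() registers /proc/net/raw, whose records are produced by raw_seq_show()/get_raw_sock() in the column layout shown in the header string above. A trivial, hedged sketch that just echoes the file; parsing the individual columns is left out:

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/raw", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* First line is the header from raw_seq_show(); the rest are
	 * one 128-character record per bound IPv4 raw socket. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}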
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
new file mode 100644
index 000000000000..9f91a116d919
--- /dev/null
+++ b/net/ipv4/route.c
@@ -0,0 +1,3177 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
41 *
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 *
58 * This program is free software; you can redistribute it and/or
59 * modify it under the terms of the GNU General Public License
60 * as published by the Free Software Foundation; either version
61 * 2 of the License, or (at your option) any later version.
62 */
63
64#include <linux/config.h>
65#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
71#include <linux/sched.h>
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
83#include <linux/rtnetlink.h>
84#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
93#include <net/protocol.h>
94#include <net/ip.h>
95#include <net/route.h>
96#include <net/inetpeer.h>
97#include <net/sock.h>
98#include <net/ip_fib.h>
99#include <net/arp.h>
100#include <net/tcp.h>
101#include <net/icmp.h>
102#include <net/xfrm.h>
103#include <net/ip_mp_alg.h>
104#ifdef CONFIG_SYSCTL
105#include <linux/sysctl.h>
106#endif
107
108#define RT_FL_TOS(oldflp) \
109 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
110
111#define IP_MAX_MTU 0xFFF0
112
113#define RT_GC_TIMEOUT (300*HZ)
114
115static int ip_rt_min_delay = 2 * HZ;
116static int ip_rt_max_delay = 10 * HZ;
117static int ip_rt_max_size;
118static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
119static int ip_rt_gc_interval = 60 * HZ;
120static int ip_rt_gc_min_interval = HZ / 2;
121static int ip_rt_redirect_number = 9;
122static int ip_rt_redirect_load = HZ / 50;
123static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
124static int ip_rt_error_cost = HZ;
125static int ip_rt_error_burst = 5 * HZ;
126static int ip_rt_gc_elasticity = 8;
127static int ip_rt_mtu_expires = 10 * 60 * HZ;
128static int ip_rt_min_pmtu = 512 + 20 + 20;
129static int ip_rt_min_advmss = 256;
130static int ip_rt_secret_interval = 10 * 60 * HZ;
131static unsigned long rt_deadline;
132
133#define RTprint(a...) printk(KERN_DEBUG a)
134
135static struct timer_list rt_flush_timer;
136static struct timer_list rt_periodic_timer;
137static struct timer_list rt_secret_timer;
138
139/*
140 * Interface to generic destination cache.
141 */
142
143static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144static void ipv4_dst_destroy(struct dst_entry *dst);
145static void ipv4_dst_ifdown(struct dst_entry *dst,
146 struct net_device *dev, int how);
147static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148static void ipv4_link_failure(struct sk_buff *skb);
149static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
150static int rt_garbage_collect(void);
151
152
153static struct dst_ops ipv4_dst_ops = {
154 .family = AF_INET,
155 .protocol = __constant_htons(ETH_P_IP),
156 .gc = rt_garbage_collect,
157 .check = ipv4_dst_check,
158 .destroy = ipv4_dst_destroy,
159 .ifdown = ipv4_dst_ifdown,
160 .negative_advice = ipv4_negative_advice,
161 .link_failure = ipv4_link_failure,
162 .update_pmtu = ip_rt_update_pmtu,
163 .entry_size = sizeof(struct rtable),
164};
165
166#define ECN_OR_COST(class) TC_PRIO_##class
167
168__u8 ip_tos2prio[16] = {
169 TC_PRIO_BESTEFFORT,
170 ECN_OR_COST(FILLER),
171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(BESTEFFORT),
173 TC_PRIO_BULK,
174 ECN_OR_COST(BULK),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_INTERACTIVE,
178 ECN_OR_COST(INTERACTIVE),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE_BULK,
182 ECN_OR_COST(INTERACTIVE_BULK),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK)
185};
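The table above maps the four TOS bits of the IP header to a packet-scheduler priority band; elsewhere in this tree it is consulted as ip_tos2prio[IPTOS_TOS(tos) >> 1] (rt_tos2priority()). A userspace restatement with the ECN_OR_COST() entries expanded and that indexing assumed:

#include <linux/pkt_sched.h>	/* TC_PRIO_* values */
#include <netinet/ip.h>		/* IPTOS_TOS(), IPTOS_LOWDELAY, ... */
#include <stdio.h>

/* Same table as above, with ECN_OR_COST(x) expanded to TC_PRIO_x. */
static const unsigned char tos2prio[16] = {
	TC_PRIO_BESTEFFORT, TC_PRIO_FILLER,
	TC_PRIO_BESTEFFORT, TC_PRIO_BESTEFFORT,
	TC_PRIO_BULK, TC_PRIO_BULK,
	TC_PRIO_BULK, TC_PRIO_BULK,
	TC_PRIO_INTERACTIVE, TC_PRIO_INTERACTIVE,
	TC_PRIO_INTERACTIVE, TC_PRIO_INTERACTIVE,
	TC_PRIO_INTERACTIVE_BULK, TC_PRIO_INTERACTIVE_BULK,
	TC_PRIO_INTERACTIVE_BULK, TC_PRIO_INTERACTIVE_BULK,
};

int main(void)
{
	unsigned char tos[] = { 0x00, IPTOS_LOWDELAY, IPTOS_THROUGHPUT,
				IPTOS_RELIABILITY };
	unsigned int i;

	for (i = 0; i < sizeof(tos); i++)	/* assumed indexing: IPTOS_TOS() >> 1 */
		printf("tos 0x%02x -> prio %u\n",
		       tos[i], tos2prio[IPTOS_TOS(tos[i]) >> 1]);
	return 0;
}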
186
187
188/*
189 * Route cache.
190 */
191
192/* The locking scheme is rather straightforward:
193 *
194 * 1) Read-Copy Update protects the buckets of the central route hash.
195 * 2) Only writers remove entries, and they hold the lock
196 * as they look at rtable reference counts.
197 * 3) Only readers acquire references to rtable entries,
198 * they do so with atomic increments and with the
199 * lock held.
200 */
201
202struct rt_hash_bucket {
203 struct rtable *chain;
204 spinlock_t lock;
205} __attribute__((__aligned__(8)));
206
207static struct rt_hash_bucket *rt_hash_table;
208static unsigned rt_hash_mask;
209static int rt_hash_log;
210static unsigned int rt_hash_rnd;
211
212struct rt_cache_stat *rt_cache_stat;
213
214static int rt_intern_hash(unsigned hash, struct rtable *rth,
215 struct rtable **res);
216
217static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
218{
219 return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
220 & rt_hash_mask);
221}
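rt_hash_code() reduces (daddr, saddr, tos) plus the per-boot random key rt_hash_rnd to a bucket index by masking with rt_hash_mask, so the table size must be a power of two. The sketch below only illustrates that shape; its mixer is a stand-in, not the kernel's jhash_3words(), and the 1024-bucket size and addresses are made up:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Stand-in mixer for illustration only; the kernel uses jhash_3words(). */
static unsigned int mix3(uint32_t a, uint32_t b, uint32_t c, uint32_t rnd)
{
	uint32_t h = a ^ rnd;

	h = h * 0x9e3779b1u ^ b;
	h = h * 0x9e3779b1u ^ c;
	h ^= h >> 16;
	return h;
}

int main(void)
{
	unsigned int rt_hash_mask = 1024 - 1;	/* assumed power-of-two table */
	uint32_t daddr = 0xc0000201;		/* 192.0.2.1 (example) */
	uint32_t saddr = 0xc0000202;		/* 192.0.2.2 (example) */
	uint8_t tos = 0x10;
	uint32_t rnd;

	srandom((unsigned int)time(NULL));
	rnd = (uint32_t)random();		/* plays the role of rt_hash_rnd */
	printf("bucket = %u of %u\n",
	       mix3(daddr, saddr, tos, rnd) & rt_hash_mask, rt_hash_mask + 1);
	return 0;
}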
222
223#ifdef CONFIG_PROC_FS
224struct rt_cache_iter_state {
225 int bucket;
226};
227
228static struct rtable *rt_cache_get_first(struct seq_file *seq)
229{
230 struct rtable *r = NULL;
231 struct rt_cache_iter_state *st = seq->private;
232
233 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
234 rcu_read_lock_bh();
235 r = rt_hash_table[st->bucket].chain;
236 if (r)
237 break;
238 rcu_read_unlock_bh();
239 }
240 return r;
241}
242
243static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
244{
245 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
246
247 r = r->u.rt_next;
248 while (!r) {
249 rcu_read_unlock_bh();
250 if (--st->bucket < 0)
251 break;
252 rcu_read_lock_bh();
253 r = rt_hash_table[st->bucket].chain;
254 }
255 return r;
256}
257
258static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
259{
260 struct rtable *r = rt_cache_get_first(seq);
261
262 if (r)
263 while (pos && (r = rt_cache_get_next(seq, r)))
264 --pos;
265 return pos ? NULL : r;
266}
267
268static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
269{
270 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
271}
272
273static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
274{
275 struct rtable *r = NULL;
276
277 if (v == SEQ_START_TOKEN)
278 r = rt_cache_get_first(seq);
279 else
280 r = rt_cache_get_next(seq, v);
281 ++*pos;
282 return r;
283}
284
285static void rt_cache_seq_stop(struct seq_file *seq, void *v)
286{
287 if (v && v != SEQ_START_TOKEN)
288 rcu_read_unlock_bh();
289}
290
291static int rt_cache_seq_show(struct seq_file *seq, void *v)
292{
293 if (v == SEQ_START_TOKEN)
294 seq_printf(seq, "%-127s\n",
295 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
296 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
297 "HHUptod\tSpecDst");
298 else {
299 struct rtable *r = v;
300 char temp[256];
301
302 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
303 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
304 r->u.dst.dev ? r->u.dst.dev->name : "*",
305 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
306 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
307 r->u.dst.__use, 0, (unsigned long)r->rt_src,
308 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
309 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
310 dst_metric(&r->u.dst, RTAX_WINDOW),
311 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
312 dst_metric(&r->u.dst, RTAX_RTTVAR)),
313 r->fl.fl4_tos,
314 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
315 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
316 dev_queue_xmit) : 0,
317 r->rt_spec_dst);
318 seq_printf(seq, "%-127s\n", temp);
319 }
320 return 0;
321}
322
323static struct seq_operations rt_cache_seq_ops = {
324 .start = rt_cache_seq_start,
325 .next = rt_cache_seq_next,
326 .stop = rt_cache_seq_stop,
327 .show = rt_cache_seq_show,
328};
329
330static int rt_cache_seq_open(struct inode *inode, struct file *file)
331{
332 struct seq_file *seq;
333 int rc = -ENOMEM;
334 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
335
336 if (!s)
337 goto out;
338 rc = seq_open(file, &rt_cache_seq_ops);
339 if (rc)
340 goto out_kfree;
341 seq = file->private_data;
342 seq->private = s;
343 memset(s, 0, sizeof(*s));
344out:
345 return rc;
346out_kfree:
347 kfree(s);
348 goto out;
349}
350
351static struct file_operations rt_cache_seq_fops = {
352 .owner = THIS_MODULE,
353 .open = rt_cache_seq_open,
354 .read = seq_read,
355 .llseek = seq_lseek,
356 .release = seq_release_private,
357};
358
359
360static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
361{
362 int cpu;
363
364 if (*pos == 0)
365 return SEQ_START_TOKEN;
366
367 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
368 if (!cpu_possible(cpu))
369 continue;
370 *pos = cpu+1;
371 return per_cpu_ptr(rt_cache_stat, cpu);
372 }
373 return NULL;
374}
375
376static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
377{
378 int cpu;
379
380 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
381 if (!cpu_possible(cpu))
382 continue;
383 *pos = cpu+1;
384 return per_cpu_ptr(rt_cache_stat, cpu);
385 }
386 return NULL;
387
388}
389
390static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
391{
392
393}
394
395static int rt_cpu_seq_show(struct seq_file *seq, void *v)
396{
397 struct rt_cache_stat *st = v;
398
399 if (v == SEQ_START_TOKEN) {
400 		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
401 return 0;
402 }
403
404 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
405 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
406 atomic_read(&ipv4_dst_ops.entries),
407 st->in_hit,
408 st->in_slow_tot,
409 st->in_slow_mc,
410 st->in_no_route,
411 st->in_brd,
412 st->in_martian_dst,
413 st->in_martian_src,
414
415 st->out_hit,
416 st->out_slow_tot,
417 st->out_slow_mc,
418
419 st->gc_total,
420 st->gc_ignored,
421 st->gc_goal_miss,
422 st->gc_dst_overflow,
423 st->in_hlist_search,
424 st->out_hlist_search
425 );
426 return 0;
427}
428
429static struct seq_operations rt_cpu_seq_ops = {
430 .start = rt_cpu_seq_start,
431 .next = rt_cpu_seq_next,
432 .stop = rt_cpu_seq_stop,
433 .show = rt_cpu_seq_show,
434};
435
436
437static int rt_cpu_seq_open(struct inode *inode, struct file *file)
438{
439 return seq_open(file, &rt_cpu_seq_ops);
440}
441
442static struct file_operations rt_cpu_seq_fops = {
443 .owner = THIS_MODULE,
444 .open = rt_cpu_seq_open,
445 .read = seq_read,
446 .llseek = seq_lseek,
447 .release = seq_release,
448};
449
450#endif /* CONFIG_PROC_FS */
451
452static __inline__ void rt_free(struct rtable *rt)
453{
454 multipath_remove(rt);
455 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
456}
457
458static __inline__ void rt_drop(struct rtable *rt)
459{
460 multipath_remove(rt);
461 ip_rt_put(rt);
462 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
463}
464
465static __inline__ int rt_fast_clean(struct rtable *rth)
466{
467	/* Kill broadcast/multicast entries very aggressively if they
468	   collide in the hash table with more useful entries */
469 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
470 rth->fl.iif && rth->u.rt_next;
471}
472
473static __inline__ int rt_valuable(struct rtable *rth)
474{
475 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
476 rth->u.dst.expires;
477}
478
479static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
480{
481 unsigned long age;
482 int ret = 0;
483
484 if (atomic_read(&rth->u.dst.__refcnt))
485 goto out;
486
487 ret = 1;
488 if (rth->u.dst.expires &&
489 time_after_eq(jiffies, rth->u.dst.expires))
490 goto out;
491
492 age = jiffies - rth->u.dst.lastuse;
493 ret = 0;
494 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
495 (age <= tmo2 && rt_valuable(rth)))
496 goto out;
497 ret = 1;
498out: return ret;
499}
500
501/* Bits of score are:
502 * 31: very valuable
503 * 30: not quite useless
504 * 29..0: usage counter
505 */
506static inline u32 rt_score(struct rtable *rt)
507{
508 u32 score = jiffies - rt->u.dst.lastuse;
509
510 score = ~score & ~(3<<30);
511
512 if (rt_valuable(rt))
513 score |= (1<<31);
514
515 if (!rt->fl.iif ||
516 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
517 score |= (1<<30);
518
519 return score;
520}
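A userspace restatement of the scoring rule: bit 31 marks entries rt_valuable() would keep, bit 30 marks output or non-broadcast/multicast/local routes, and the low 30 bits hold the bitwise-inverted age, so fresher entries score higher. rt_intern_hash() below evicts the lowest-scoring unreferenced entry when a chain grows too long. Illustrative only; the ages are in jiffies and chosen arbitrarily:

#include <stdint.h>
#include <stdio.h>

/* Userspace copy of the rule above: bit 31 = valuable, bit 30 = output or
 * unicast, low 30 bits = inverted age. */
static uint32_t score(uint32_t age, int valuable, int out_or_unicast)
{
	uint32_t s = ~age & ~(3u << 30);

	if (valuable)
		s |= 1u << 31;
	if (out_or_unicast)
		s |= 1u << 30;
	return s;
}

int main(void)
{
	/* A fresh, valuable output route vs. an old input broadcast route. */
	printf("0x%08x vs 0x%08x\n",
	       (unsigned)score(1, 1, 1), (unsigned)score(100000, 0, 0));
	return 0;
}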
521
522static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
523{
524 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
525 fl1->oif == fl2->oif &&
526 fl1->iif == fl2->iif;
527}
528
529#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
530static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
531 struct rtable *expentry,
532 int *removed_count)
533{
534 int passedexpired = 0;
535 struct rtable **nextstep = NULL;
536 struct rtable **rthp = chain_head;
537 struct rtable *rth;
538
539 if (removed_count)
540 *removed_count = 0;
541
542 while ((rth = *rthp) != NULL) {
543 if (rth == expentry)
544 passedexpired = 1;
545
546 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
547 compare_keys(&(*rthp)->fl, &expentry->fl)) {
548 if (*rthp == expentry) {
549 *rthp = rth->u.rt_next;
550 continue;
551 } else {
552 *rthp = rth->u.rt_next;
553 rt_free(rth);
554 if (removed_count)
555 ++(*removed_count);
556 }
557 } else {
558 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
559 passedexpired && !nextstep)
560 nextstep = &rth->u.rt_next;
561
562 rthp = &rth->u.rt_next;
563 }
564 }
565
566 rt_free(expentry);
567 if (removed_count)
568 ++(*removed_count);
569
570 return nextstep;
571}
572#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
573
574
575/* This runs via a timer and thus is always in BH context. */
576static void rt_check_expire(unsigned long dummy)
577{
578 static int rover;
579 int i = rover, t;
580 struct rtable *rth, **rthp;
581 unsigned long now = jiffies;
582
583 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
584 t -= ip_rt_gc_timeout) {
585 unsigned long tmo = ip_rt_gc_timeout;
586
587 i = (i + 1) & rt_hash_mask;
588 rthp = &rt_hash_table[i].chain;
589
590 spin_lock(&rt_hash_table[i].lock);
591 while ((rth = *rthp) != NULL) {
592 if (rth->u.dst.expires) {
593 /* Entry is expired even if it is in use */
594 if (time_before_eq(now, rth->u.dst.expires)) {
595 tmo >>= 1;
596 rthp = &rth->u.rt_next;
597 continue;
598 }
599 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
600 tmo >>= 1;
601 rthp = &rth->u.rt_next;
602 continue;
603 }
604
605 /* Cleanup aged off entries. */
606#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
607 /* remove all related balanced entries if necessary */
608 if (rth->u.dst.flags & DST_BALANCED) {
609 rthp = rt_remove_balanced_route(
610 &rt_hash_table[i].chain,
611 rth, NULL);
612 if (!rthp)
613 break;
614 } else {
615 *rthp = rth->u.rt_next;
616 rt_free(rth);
617 }
618#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
619 *rthp = rth->u.rt_next;
620 rt_free(rth);
621#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622 }
623 spin_unlock(&rt_hash_table[i].lock);
624
625 /* Fallback loop breaker. */
626 if (time_after(jiffies, now))
627 break;
628 }
629 rover = i;
630 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
631}
632
633/* This can run from both BH and non-BH contexts, the latter
634 * in the case of a forced flush event.
635 */
636static void rt_run_flush(unsigned long dummy)
637{
638 int i;
639 struct rtable *rth, *next;
640
641 rt_deadline = 0;
642
643 get_random_bytes(&rt_hash_rnd, 4);
644
645 for (i = rt_hash_mask; i >= 0; i--) {
646 spin_lock_bh(&rt_hash_table[i].lock);
647 rth = rt_hash_table[i].chain;
648 if (rth)
649 rt_hash_table[i].chain = NULL;
650 spin_unlock_bh(&rt_hash_table[i].lock);
651
652 for (; rth; rth = next) {
653 next = rth->u.rt_next;
654 rt_free(rth);
655 }
656 }
657}
658
659static DEFINE_SPINLOCK(rt_flush_lock);
660
661void rt_cache_flush(int delay)
662{
663 unsigned long now = jiffies;
664 int user_mode = !in_softirq();
665
666 if (delay < 0)
667 delay = ip_rt_min_delay;
668
669 /* flush existing multipath state*/
670 multipath_flush();
671
672 spin_lock_bh(&rt_flush_lock);
673
674 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
675 long tmo = (long)(rt_deadline - now);
676
677		/* If the flush timer is already running
678		   and the flush request is not immediate (delay > 0):
679
680		   if the deadline has not been reached, prolong the timer to "delay",
681		   otherwise fire it at the deadline.
682 */
683
684 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
685 tmo = 0;
686
687 if (delay > tmo)
688 delay = tmo;
689 }
690
691 if (delay <= 0) {
692 spin_unlock_bh(&rt_flush_lock);
693 rt_run_flush(0);
694 return;
695 }
696
697 if (rt_deadline == 0)
698 rt_deadline = now + ip_rt_max_delay;
699
700 mod_timer(&rt_flush_timer, now+delay);
701 spin_unlock_bh(&rt_flush_lock);
702}
703
704static void rt_secret_rebuild(unsigned long dummy)
705{
706 unsigned long now = jiffies;
707
708 rt_cache_flush(0);
709 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
710}
711
712/*
713 Short description of GC goals.
714
715   We want to build an algorithm that keeps the routing cache
716   at an equilibrium point, where the number of aged-off entries
717   stays approximately equal to the number of newly generated ones.
718
719   The current expiration strength is the variable "expire".
720   We try to adjust it dynamically, so that when the network
721   is idle "expire" is large enough to keep enough warm entries,
722   and when load increases it shrinks to limit the cache size.
723 */
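As a worked example of the first step in rt_garbage_collect() below: the routine initially aims to expire everything beyond ip_rt_gc_elasticity entries per bucket on average, i.e. goal = entries - (ip_rt_gc_elasticity << rt_hash_log). The numbers in this sketch are made up purely to show the arithmetic:

#include <stdio.h>

int main(void)
{
	/* Illustrative numbers only: a 1024-bucket table (rt_hash_log = 10),
	 * ip_rt_gc_elasticity = 8, and 12000 cached entries. */
	int rt_hash_log = 10;
	int ip_rt_gc_elasticity = 8;
	int entries = 12000;
	int goal = entries - (ip_rt_gc_elasticity << rt_hash_log);

	/* 12000 - 8192 = 3808 entries should be expired this round. */
	printf("goal = %d\n", goal);
	return 0;
}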
724
725static int rt_garbage_collect(void)
726{
727 static unsigned long expire = RT_GC_TIMEOUT;
728 static unsigned long last_gc;
729 static int rover;
730 static int equilibrium;
731 struct rtable *rth, **rthp;
732 unsigned long now = jiffies;
733 int goal;
734
735 /*
736 * Garbage collection is pretty expensive,
737 * do not make it too frequently.
738 */
739
740 RT_CACHE_STAT_INC(gc_total);
741
742 if (now - last_gc < ip_rt_gc_min_interval &&
743 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
744 RT_CACHE_STAT_INC(gc_ignored);
745 goto out;
746 }
747
748 /* Calculate number of entries, which we want to expire now. */
749 goal = atomic_read(&ipv4_dst_ops.entries) -
750 (ip_rt_gc_elasticity << rt_hash_log);
751 if (goal <= 0) {
752 if (equilibrium < ipv4_dst_ops.gc_thresh)
753 equilibrium = ipv4_dst_ops.gc_thresh;
754 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
755 if (goal > 0) {
756 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
757 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
758 }
759 } else {
760		/* We are in a dangerous area. Try to reduce the cache really
761		 * aggressively.
762 */
763 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
764 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
765 }
766
767 if (now - last_gc >= ip_rt_gc_min_interval)
768 last_gc = now;
769
770 if (goal <= 0) {
771 equilibrium += goal;
772 goto work_done;
773 }
774
775 do {
776 int i, k;
777
778 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
779 unsigned long tmo = expire;
780
781 k = (k + 1) & rt_hash_mask;
782 rthp = &rt_hash_table[k].chain;
783 spin_lock_bh(&rt_hash_table[k].lock);
784 while ((rth = *rthp) != NULL) {
785 if (!rt_may_expire(rth, tmo, expire)) {
786 tmo >>= 1;
787 rthp = &rth->u.rt_next;
788 continue;
789 }
790#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
791 /* remove all related balanced entries
792 * if necessary
793 */
794 if (rth->u.dst.flags & DST_BALANCED) {
795 int r;
796
797 rthp = rt_remove_balanced_route(
798							&rt_hash_table[k].chain,
799 rth,
800 &r);
801 goal -= r;
802 if (!rthp)
803 break;
804 } else {
805 *rthp = rth->u.rt_next;
806 rt_free(rth);
807 goal--;
808 }
809#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
810 *rthp = rth->u.rt_next;
811 rt_free(rth);
812 goal--;
813#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
814 }
815 spin_unlock_bh(&rt_hash_table[k].lock);
816 if (goal <= 0)
817 break;
818 }
819 rover = k;
820
821 if (goal <= 0)
822 goto work_done;
823
824		/* The goal was not achieved. We stop the process if:
825
826		   - expire has been reduced to zero; otherwise, expire is halved.
827		   - the table is not full.
828		   - we are called from interrupt context.
829		   The jiffies check is just a fallback/debug loop breaker;
830		   we will not spin here for a long time in any case.
831 */
832
833 RT_CACHE_STAT_INC(gc_goal_miss);
834
835 if (expire == 0)
836 break;
837
838 expire >>= 1;
839#if RT_CACHE_DEBUG >= 2
840 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
841 atomic_read(&ipv4_dst_ops.entries), goal, i);
842#endif
843
844 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
845 goto out;
846 } while (!in_softirq() && time_before_eq(jiffies, now));
847
848 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
849 goto out;
850 if (net_ratelimit())
851 printk(KERN_WARNING "dst cache overflow\n");
852 RT_CACHE_STAT_INC(gc_dst_overflow);
853 return 1;
854
855work_done:
856 expire += ip_rt_gc_min_interval;
857 if (expire > ip_rt_gc_timeout ||
858 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
859 expire = ip_rt_gc_timeout;
860#if RT_CACHE_DEBUG >= 2
861 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
862 atomic_read(&ipv4_dst_ops.entries), goal, rover);
863#endif
864out: return 0;
865}
866
867static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
868{
869 struct rtable *rth, **rthp;
870 unsigned long now;
871 struct rtable *cand, **candp;
872 u32 min_score;
873 int chain_length;
874 int attempts = !in_softirq();
875
876restart:
877 chain_length = 0;
878 min_score = ~(u32)0;
879 cand = NULL;
880 candp = NULL;
881 now = jiffies;
882
883 rthp = &rt_hash_table[hash].chain;
884
885 spin_lock_bh(&rt_hash_table[hash].lock);
886 while ((rth = *rthp) != NULL) {
887#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
888 if (!(rth->u.dst.flags & DST_BALANCED) &&
889 compare_keys(&rth->fl, &rt->fl)) {
890#else
891 if (compare_keys(&rth->fl, &rt->fl)) {
892#endif
893 /* Put it first */
894 *rthp = rth->u.rt_next;
895 /*
896 * Since lookup is lockfree, the deletion
897 * must be visible to another weakly ordered CPU before
898 * the insertion at the start of the hash chain.
899 */
900 rcu_assign_pointer(rth->u.rt_next,
901 rt_hash_table[hash].chain);
902 /*
903 * Since lookup is lockfree, the update writes
904 * must be ordered for consistency on SMP.
905 */
906 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
907
908 rth->u.dst.__use++;
909 dst_hold(&rth->u.dst);
910 rth->u.dst.lastuse = now;
911 spin_unlock_bh(&rt_hash_table[hash].lock);
912
913 rt_drop(rt);
914 *rp = rth;
915 return 0;
916 }
917
918 if (!atomic_read(&rth->u.dst.__refcnt)) {
919 u32 score = rt_score(rth);
920
921 if (score <= min_score) {
922 cand = rth;
923 candp = rthp;
924 min_score = score;
925 }
926 }
927
928 chain_length++;
929
930 rthp = &rth->u.rt_next;
931 }
932
933 if (cand) {
934		/* ip_rt_gc_elasticity used to be the average chain length;
935		 * when exceeded, GC becomes really aggressive.
936 *
937 * The second limit is less certain. At the moment it allows
938 * only 2 entries per bucket. We will see.
939 */
940 if (chain_length > ip_rt_gc_elasticity) {
941 *candp = cand->u.rt_next;
942 rt_free(cand);
943 }
944 }
945
946	/* Try to bind the route to ARP only if it is an output
947	   route or a unicast forwarding path.
948 */
949 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
950 int err = arp_bind_neighbour(&rt->u.dst);
951 if (err) {
952 spin_unlock_bh(&rt_hash_table[hash].lock);
953
954 if (err != -ENOBUFS) {
955 rt_drop(rt);
956 return err;
957 }
958
959			/* Neighbour tables are full and nothing
960			   can be released. Try to shrink the route cache;
961			   it most likely holds some neighbour records.
962 */
963 if (attempts-- > 0) {
964 int saved_elasticity = ip_rt_gc_elasticity;
965 int saved_int = ip_rt_gc_min_interval;
966 ip_rt_gc_elasticity = 1;
967 ip_rt_gc_min_interval = 0;
968 rt_garbage_collect();
969 ip_rt_gc_min_interval = saved_int;
970 ip_rt_gc_elasticity = saved_elasticity;
971 goto restart;
972 }
973
974 if (net_ratelimit())
975 printk(KERN_WARNING "Neighbour table overflow.\n");
976 rt_drop(rt);
977 return -ENOBUFS;
978 }
979 }
980
981 rt->u.rt_next = rt_hash_table[hash].chain;
982#if RT_CACHE_DEBUG >= 2
983 if (rt->u.rt_next) {
984 struct rtable *trt;
985 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
986 NIPQUAD(rt->rt_dst));
987 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
988 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
989 printk("\n");
990 }
991#endif
992 rt_hash_table[hash].chain = rt;
993 spin_unlock_bh(&rt_hash_table[hash].lock);
994 *rp = rt;
995 return 0;
996}
997
998void rt_bind_peer(struct rtable *rt, int create)
999{
1000 static DEFINE_SPINLOCK(rt_peer_lock);
1001 struct inet_peer *peer;
1002
1003 peer = inet_getpeer(rt->rt_dst, create);
1004
1005 spin_lock_bh(&rt_peer_lock);
1006 if (rt->peer == NULL) {
1007 rt->peer = peer;
1008 peer = NULL;
1009 }
1010 spin_unlock_bh(&rt_peer_lock);
1011 if (peer)
1012 inet_putpeer(peer);
1013}
1014
1015/*
1016 * Peer allocation may fail only in serious out-of-memory conditions. However,
1017 * we can still generate some output.
1018 * Random ID selection looks a bit dangerous because we have no chance of
1019 * selecting an ID that is unique within a reasonable period of time.
1020 * But a broken packet identifier may be better than no packet at all.
1021 */
1022static void ip_select_fb_ident(struct iphdr *iph)
1023{
1024 static DEFINE_SPINLOCK(ip_fb_id_lock);
1025 static u32 ip_fallback_id;
1026 u32 salt;
1027
1028 spin_lock_bh(&ip_fb_id_lock);
1029 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1030 iph->id = htons(salt & 0xFFFF);
1031 ip_fallback_id = salt;
1032 spin_unlock_bh(&ip_fb_id_lock);
1033}
1034
1035void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1036{
1037 struct rtable *rt = (struct rtable *) dst;
1038
1039 if (rt) {
1040 if (rt->peer == NULL)
1041 rt_bind_peer(rt, 1);
1042
1043		/* If a peer is attached to the destination, it is never detached,
1044		   so we need not grab a lock to dereference it.
1045 */
1046 if (rt->peer) {
1047 iph->id = htons(inet_getid(rt->peer, more));
1048 return;
1049 }
1050 } else
1051 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
1052
1053 ip_select_fb_ident(iph);
1054}
1055
1056static void rt_del(unsigned hash, struct rtable *rt)
1057{
1058 struct rtable **rthp;
1059
1060 spin_lock_bh(&rt_hash_table[hash].lock);
1061 ip_rt_put(rt);
1062 for (rthp = &rt_hash_table[hash].chain; *rthp;
1063 rthp = &(*rthp)->u.rt_next)
1064 if (*rthp == rt) {
1065 *rthp = rt->u.rt_next;
1066 rt_free(rt);
1067 break;
1068 }
1069 spin_unlock_bh(&rt_hash_table[hash].lock);
1070}
1071
1072void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1073 u32 saddr, u8 tos, struct net_device *dev)
1074{
1075 int i, k;
1076 struct in_device *in_dev = in_dev_get(dev);
1077 struct rtable *rth, **rthp;
1078 u32 skeys[2] = { saddr, 0 };
1079 int ikeys[2] = { dev->ifindex, 0 };
1080
1081 tos &= IPTOS_RT_MASK;
1082
1083 if (!in_dev)
1084 return;
1085
1086 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1087 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1088 goto reject_redirect;
1089
1090 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1091 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1092 goto reject_redirect;
1093 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1094 goto reject_redirect;
1095 } else {
1096 if (inet_addr_type(new_gw) != RTN_UNICAST)
1097 goto reject_redirect;
1098 }
1099
1100 for (i = 0; i < 2; i++) {
1101 for (k = 0; k < 2; k++) {
1102 unsigned hash = rt_hash_code(daddr,
1103 skeys[i] ^ (ikeys[k] << 5),
1104 tos);
1105
1106 rthp=&rt_hash_table[hash].chain;
1107
1108 rcu_read_lock();
1109 while ((rth = rcu_dereference(*rthp)) != NULL) {
1110 struct rtable *rt;
1111
1112 if (rth->fl.fl4_dst != daddr ||
1113 rth->fl.fl4_src != skeys[i] ||
1114 rth->fl.fl4_tos != tos ||
1115 rth->fl.oif != ikeys[k] ||
1116 rth->fl.iif != 0) {
1117 rthp = &rth->u.rt_next;
1118 continue;
1119 }
1120
1121 if (rth->rt_dst != daddr ||
1122 rth->rt_src != saddr ||
1123 rth->u.dst.error ||
1124 rth->rt_gateway != old_gw ||
1125 rth->u.dst.dev != dev)
1126 break;
1127
1128 dst_hold(&rth->u.dst);
1129 rcu_read_unlock();
1130
1131 rt = dst_alloc(&ipv4_dst_ops);
1132 if (rt == NULL) {
1133 ip_rt_put(rth);
1134 in_dev_put(in_dev);
1135 return;
1136 }
1137
1138 /* Copy all the information. */
1139 *rt = *rth;
1140 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1141 rt->u.dst.__use = 1;
1142 atomic_set(&rt->u.dst.__refcnt, 1);
1143 rt->u.dst.child = NULL;
1144 if (rt->u.dst.dev)
1145 dev_hold(rt->u.dst.dev);
1146 if (rt->idev)
1147 in_dev_hold(rt->idev);
1148 rt->u.dst.obsolete = 0;
1149 rt->u.dst.lastuse = jiffies;
1150 rt->u.dst.path = &rt->u.dst;
1151 rt->u.dst.neighbour = NULL;
1152 rt->u.dst.hh = NULL;
1153 rt->u.dst.xfrm = NULL;
1154
1155 rt->rt_flags |= RTCF_REDIRECTED;
1156
1157 /* Gateway is different ... */
1158 rt->rt_gateway = new_gw;
1159
1160 /* Redirect received -> path was valid */
1161 dst_confirm(&rth->u.dst);
1162
1163 if (rt->peer)
1164 atomic_inc(&rt->peer->refcnt);
1165
1166 if (arp_bind_neighbour(&rt->u.dst) ||
1167 !(rt->u.dst.neighbour->nud_state &
1168 NUD_VALID)) {
1169 if (rt->u.dst.neighbour)
1170 neigh_event_send(rt->u.dst.neighbour, NULL);
1171 ip_rt_put(rth);
1172 rt_drop(rt);
1173 goto do_next;
1174 }
1175
1176 rt_del(hash, rth);
1177 if (!rt_intern_hash(hash, rt, &rt))
1178 ip_rt_put(rt);
1179 goto do_next;
1180 }
1181 rcu_read_unlock();
1182 do_next:
1183 ;
1184 }
1185 }
1186 in_dev_put(in_dev);
1187 return;
1188
1189reject_redirect:
1190#ifdef CONFIG_IP_ROUTE_VERBOSE
1191 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1192 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1193 "%u.%u.%u.%u ignored.\n"
1194 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1195 "tos %02x\n",
1196 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1197 NIPQUAD(saddr), NIPQUAD(daddr), tos);
1198#endif
1199 in_dev_put(in_dev);
1200}
1201
1202static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1203{
1204 struct rtable *rt = (struct rtable*)dst;
1205 struct dst_entry *ret = dst;
1206
1207 if (rt) {
1208 if (dst->obsolete) {
1209 ip_rt_put(rt);
1210 ret = NULL;
1211 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1212 rt->u.dst.expires) {
1213 unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1214 rt->fl.fl4_src ^
1215 (rt->fl.oif << 5),
1216 rt->fl.fl4_tos);
1217#if RT_CACHE_DEBUG >= 1
1218 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1219 "%u.%u.%u.%u/%02x dropped\n",
1220 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1221#endif
1222 rt_del(hash, rt);
1223 ret = NULL;
1224 }
1225 }
1226 return ret;
1227}
1228
1229/*
1230 * Algorithm:
1231 * 1. The first ip_rt_redirect_number redirects are sent
1232 * with exponential backoff, then we stop sending them at all,
1233 * assuming that the host ignores our redirects.
1234 * 2. If we did not see packets requiring redirects
1235 * during ip_rt_redirect_silence, we assume that the host
1236 * 	 forgot the redirected route and start sending redirects again.
1237 *
1238 * This algorithm is much cheaper and more intelligent than dumb load limiting
1239 * in icmp.c.
1240 *
1241 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1242 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1243 */
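A small sketch of the resulting send schedule, assuming the defaults above and HZ = 1000 (an assumption; HZ is configuration dependent): the n-th redirect is only sent once rate_last + (ip_rt_redirect_load << rate_tokens) has elapsed, and after ip_rt_redirect_number redirects nothing more is sent until the ip_rt_redirect_silence idle period resets rate_tokens.

#include <stdio.h>

int main(void)
{
	const int hz = 1000;			/* assumed HZ */
	int ip_rt_redirect_load = hz / 50;	/* default from above */
	int ip_rt_redirect_number = 9;		/* default from above */
	int tokens;

	for (tokens = 0; tokens < ip_rt_redirect_number; tokens++)
		printf("redirect %d allowed %d jiffies after the previous one\n",
		       tokens + 1, ip_rt_redirect_load << tokens);
	return 0;
}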
1244
1245void ip_rt_send_redirect(struct sk_buff *skb)
1246{
1247 struct rtable *rt = (struct rtable*)skb->dst;
1248 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1249
1250 if (!in_dev)
1251 return;
1252
1253 if (!IN_DEV_TX_REDIRECTS(in_dev))
1254 goto out;
1255
1256 /* No redirected packets during ip_rt_redirect_silence;
1257 * reset the algorithm.
1258 */
1259 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1260 rt->u.dst.rate_tokens = 0;
1261
1262	/* Too many ignored redirects; do not send anything.
1263	 * Set u.dst.rate_last to the last seen redirected packet.
1264 */
1265 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1266 rt->u.dst.rate_last = jiffies;
1267 goto out;
1268 }
1269
1270 /* Check for load limit; set rate_last to the latest sent
1271 * redirect.
1272 */
1273 if (time_after(jiffies,
1274 (rt->u.dst.rate_last +
1275 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1276 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1277 rt->u.dst.rate_last = jiffies;
1278 ++rt->u.dst.rate_tokens;
1279#ifdef CONFIG_IP_ROUTE_VERBOSE
1280 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1281 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1282 net_ratelimit())
1283 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1284 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1285 NIPQUAD(rt->rt_src), rt->rt_iif,
1286 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1287#endif
1288 }
1289out:
1290 in_dev_put(in_dev);
1291}
1292
1293static int ip_error(struct sk_buff *skb)
1294{
1295 struct rtable *rt = (struct rtable*)skb->dst;
1296 unsigned long now;
1297 int code;
1298
1299 switch (rt->u.dst.error) {
1300 case EINVAL:
1301 default:
1302 goto out;
1303 case EHOSTUNREACH:
1304 code = ICMP_HOST_UNREACH;
1305 break;
1306 case ENETUNREACH:
1307 code = ICMP_NET_UNREACH;
1308 break;
1309 case EACCES:
1310 code = ICMP_PKT_FILTERED;
1311 break;
1312 }
1313
1314 now = jiffies;
1315 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1316 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1317 rt->u.dst.rate_tokens = ip_rt_error_burst;
1318 rt->u.dst.rate_last = now;
1319 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1320 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1321 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1322 }
1323
1324out: kfree_skb(skb);
1325 return 0;
1326}
1327
1328/*
1329 * The last two values are not from the RFC but
1330 * are needed for AMPRnet AX.25 paths.
1331 */
1332
1333static unsigned short mtu_plateau[] =
1334{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1335
1336static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1337{
1338 int i;
1339
1340 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1341 if (old_mtu > mtu_plateau[i])
1342 return mtu_plateau[i];
1343 return 68;
1344}
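Since guess_mtu() is self-contained, it can be exercised directly; the sketch below copies the plateau table and search and walks it for two sample values, e.g. a bogus Fragmentation Needed carrying mtu 0 against a 1400-byte packet steps down to the 576 plateau. The sample inputs are arbitrary:

#include <stdio.h>

/* Same plateau table and search as above. */
static const unsigned short plateau[] =
	{ 32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static unsigned short guess(unsigned short old_mtu)
{
	unsigned int i;

	for (i = 0; i < sizeof(plateau) / sizeof(plateau[0]); i++)
		if (old_mtu > plateau[i])
			return plateau[i];
	return 68;
}

int main(void)
{
	/* guess(1400) steps down to 576; guess(200) falls to the 128 plateau. */
	printf("guess(1400) = %u, guess(200) = %u\n", guess(1400), guess(200));
	return 0;
}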
1345
1346unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1347{
1348 int i;
1349 unsigned short old_mtu = ntohs(iph->tot_len);
1350 struct rtable *rth;
1351 u32 skeys[2] = { iph->saddr, 0, };
1352 u32 daddr = iph->daddr;
1353 u8 tos = iph->tos & IPTOS_RT_MASK;
1354 unsigned short est_mtu = 0;
1355
1356 if (ipv4_config.no_pmtu_disc)
1357 return 0;
1358
1359 for (i = 0; i < 2; i++) {
1360 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1361
1362 rcu_read_lock();
1363 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1364 rth = rcu_dereference(rth->u.rt_next)) {
1365 if (rth->fl.fl4_dst == daddr &&
1366 rth->fl.fl4_src == skeys[i] &&
1367 rth->rt_dst == daddr &&
1368 rth->rt_src == iph->saddr &&
1369 rth->fl.fl4_tos == tos &&
1370 rth->fl.iif == 0 &&
1371 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1372 unsigned short mtu = new_mtu;
1373
1374 if (new_mtu < 68 || new_mtu >= old_mtu) {
1375
1376 /* BSD 4.2 compatibility hack :-( */
1377 if (mtu == 0 &&
1378 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1379 old_mtu >= 68 + (iph->ihl << 2))
1380 old_mtu -= iph->ihl << 2;
1381
1382 mtu = guess_mtu(old_mtu);
1383 }
1384 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1385 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1386 dst_confirm(&rth->u.dst);
1387 if (mtu < ip_rt_min_pmtu) {
1388 mtu = ip_rt_min_pmtu;
1389 rth->u.dst.metrics[RTAX_LOCK-1] |=
1390 (1 << RTAX_MTU);
1391 }
1392 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1393 dst_set_expires(&rth->u.dst,
1394 ip_rt_mtu_expires);
1395 }
1396 est_mtu = mtu;
1397 }
1398 }
1399 }
1400 rcu_read_unlock();
1401 }
1402 return est_mtu ? : new_mtu;
1403}
1404
1405static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1406{
1407 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1408 !(dst_metric_locked(dst, RTAX_MTU))) {
1409 if (mtu < ip_rt_min_pmtu) {
1410 mtu = ip_rt_min_pmtu;
1411 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1412 }
1413 dst->metrics[RTAX_MTU-1] = mtu;
1414 dst_set_expires(dst, ip_rt_mtu_expires);
1415 }
1416}
1417
1418static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1419{
1420 return NULL;
1421}
1422
1423static void ipv4_dst_destroy(struct dst_entry *dst)
1424{
1425 struct rtable *rt = (struct rtable *) dst;
1426 struct inet_peer *peer = rt->peer;
1427 struct in_device *idev = rt->idev;
1428
1429 if (peer) {
1430 rt->peer = NULL;
1431 inet_putpeer(peer);
1432 }
1433
1434 if (idev) {
1435 rt->idev = NULL;
1436 in_dev_put(idev);
1437 }
1438}
1439
1440static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1441 int how)
1442{
1443 struct rtable *rt = (struct rtable *) dst;
1444 struct in_device *idev = rt->idev;
1445 if (dev != &loopback_dev && idev && idev->dev == dev) {
1446 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1447 if (loopback_idev) {
1448 rt->idev = loopback_idev;
1449 in_dev_put(idev);
1450 }
1451 }
1452}
1453
1454static void ipv4_link_failure(struct sk_buff *skb)
1455{
1456 struct rtable *rt;
1457
1458 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1459
1460 rt = (struct rtable *) skb->dst;
1461 if (rt)
1462 dst_set_expires(&rt->u.dst, 0);
1463}
1464
1465static int ip_rt_bug(struct sk_buff *skb)
1466{
1467 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1468 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1469 skb->dev ? skb->dev->name : "?");
1470 kfree_skb(skb);
1471 return 0;
1472}
1473
1474/*
1475   We do not cache the source address of the outgoing interface,
1476   because it is used only by the IP RR, TS and SRR options,
1477   so it is out of the fast path.
1478
1479   BTW remember: "addr" is allowed to be unaligned
1480   in IP options!
1481 */
1482
1483void ip_rt_get_source(u8 *addr, struct rtable *rt)
1484{
1485 u32 src;
1486 struct fib_result res;
1487
1488 if (rt->fl.iif == 0)
1489 src = rt->rt_src;
1490 else if (fib_lookup(&rt->fl, &res) == 0) {
1491 src = FIB_RES_PREFSRC(res);
1492 fib_res_put(&res);
1493 } else
1494 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1495 RT_SCOPE_UNIVERSE);
1496 memcpy(addr, &src, 4);
1497}
1498
1499#ifdef CONFIG_NET_CLS_ROUTE
1500static void set_class_tag(struct rtable *rt, u32 tag)
1501{
1502 if (!(rt->u.dst.tclassid & 0xFFFF))
1503 rt->u.dst.tclassid |= tag & 0xFFFF;
1504 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1505 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1506}
1507#endif
1508
1509static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1510{
1511 struct fib_info *fi = res->fi;
1512
1513 if (fi) {
1514 if (FIB_RES_GW(*res) &&
1515 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1516 rt->rt_gateway = FIB_RES_GW(*res);
1517 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1518 sizeof(rt->u.dst.metrics));
1519 if (fi->fib_mtu == 0) {
1520 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1521 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1522 rt->rt_gateway != rt->rt_dst &&
1523 rt->u.dst.dev->mtu > 576)
1524 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1525 }
1526#ifdef CONFIG_NET_CLS_ROUTE
1527 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1528#endif
1529 } else
1530 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1531
1532 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1533 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1534 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1535 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1536 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1537 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1538 ip_rt_min_advmss);
1539 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1540 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1541
1542#ifdef CONFIG_NET_CLS_ROUTE
1543#ifdef CONFIG_IP_MULTIPLE_TABLES
1544 set_class_tag(rt, fib_rules_tclass(res));
1545#endif
1546 set_class_tag(rt, itag);
1547#endif
1548 rt->rt_type = res->type;
1549}
1550
1551static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1552 u8 tos, struct net_device *dev, int our)
1553{
1554 unsigned hash;
1555 struct rtable *rth;
1556 u32 spec_dst;
1557 struct in_device *in_dev = in_dev_get(dev);
1558 u32 itag = 0;
1559
1560 /* Primary sanity checks. */
1561
1562 if (in_dev == NULL)
1563 return -EINVAL;
1564
1565 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1566 skb->protocol != htons(ETH_P_IP))
1567 goto e_inval;
1568
1569 if (ZERONET(saddr)) {
1570 if (!LOCAL_MCAST(daddr))
1571 goto e_inval;
1572 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1573 } else if (fib_validate_source(saddr, 0, tos, 0,
1574 dev, &spec_dst, &itag) < 0)
1575 goto e_inval;
1576
1577 rth = dst_alloc(&ipv4_dst_ops);
1578 if (!rth)
1579 goto e_nobufs;
1580
1581 rth->u.dst.output= ip_rt_bug;
1582
1583 atomic_set(&rth->u.dst.__refcnt, 1);
1584 rth->u.dst.flags= DST_HOST;
1585 if (in_dev->cnf.no_policy)
1586 rth->u.dst.flags |= DST_NOPOLICY;
1587 rth->fl.fl4_dst = daddr;
1588 rth->rt_dst = daddr;
1589 rth->fl.fl4_tos = tos;
1590#ifdef CONFIG_IP_ROUTE_FWMARK
1591 rth->fl.fl4_fwmark= skb->nfmark;
1592#endif
1593 rth->fl.fl4_src = saddr;
1594 rth->rt_src = saddr;
1595#ifdef CONFIG_NET_CLS_ROUTE
1596 rth->u.dst.tclassid = itag;
1597#endif
1598 rth->rt_iif =
1599 rth->fl.iif = dev->ifindex;
1600 rth->u.dst.dev = &loopback_dev;
1601 dev_hold(rth->u.dst.dev);
1602 rth->idev = in_dev_get(rth->u.dst.dev);
1603 rth->fl.oif = 0;
1604 rth->rt_gateway = daddr;
1605 rth->rt_spec_dst= spec_dst;
1606 rth->rt_type = RTN_MULTICAST;
1607 rth->rt_flags = RTCF_MULTICAST;
1608 if (our) {
1609 rth->u.dst.input= ip_local_deliver;
1610 rth->rt_flags |= RTCF_LOCAL;
1611 }
1612
1613#ifdef CONFIG_IP_MROUTE
1614 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1615 rth->u.dst.input = ip_mr_input;
1616#endif
1617 RT_CACHE_STAT_INC(in_slow_mc);
1618
1619 in_dev_put(in_dev);
1620 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1621 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1622
1623e_nobufs:
1624 in_dev_put(in_dev);
1625 return -ENOBUFS;
1626
1627e_inval:
1628 in_dev_put(in_dev);
1629 return -EINVAL;
1630}
1631
1632
1633static void ip_handle_martian_source(struct net_device *dev,
1634 struct in_device *in_dev,
1635 struct sk_buff *skb,
1636 u32 daddr,
1637 u32 saddr)
1638{
1639 RT_CACHE_STAT_INC(in_martian_src);
1640#ifdef CONFIG_IP_ROUTE_VERBOSE
1641 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1642 /*
1643		 *	RFC1812 recommendation: if the source is martian,
1644		 *	the only hint is the MAC header.
1645 */
1646 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1647 "%u.%u.%u.%u, on dev %s\n",
1648 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1649 if (dev->hard_header_len) {
1650 int i;
1651 unsigned char *p = skb->mac.raw;
1652 printk(KERN_WARNING "ll header: ");
1653 for (i = 0; i < dev->hard_header_len; i++, p++) {
1654 printk("%02x", *p);
1655 if (i < (dev->hard_header_len - 1))
1656 printk(":");
1657 }
1658 printk("\n");
1659 }
1660 }
1661#endif
1662}
1663
1664static inline int __mkroute_input(struct sk_buff *skb,
1665 struct fib_result* res,
1666 struct in_device *in_dev,
1667 u32 daddr, u32 saddr, u32 tos,
1668 struct rtable **result)
1669{
1670
1671 struct rtable *rth;
1672 int err;
1673 struct in_device *out_dev;
1674 unsigned flags = 0;
1675 u32 spec_dst, itag;
1676
1677 /* get a working reference to the output device */
1678 out_dev = in_dev_get(FIB_RES_DEV(*res));
1679 if (out_dev == NULL) {
1680 if (net_ratelimit())
1681 printk(KERN_CRIT "Bug in ip_route_input" \
1682 "_slow(). Please, report\n");
1683 return -EINVAL;
1684 }
1685
1686
1687 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1688 in_dev->dev, &spec_dst, &itag);
1689 if (err < 0) {
1690 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1691 saddr);
1692
1693 err = -EINVAL;
1694 goto cleanup;
1695 }
1696
1697 if (err)
1698 flags |= RTCF_DIRECTSRC;
1699
1700 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1701 (IN_DEV_SHARED_MEDIA(out_dev) ||
1702 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1703 flags |= RTCF_DOREDIRECT;
1704
1705 if (skb->protocol != htons(ETH_P_IP)) {
1706		/* Not IP (i.e. ARP). Do not create a route if it is
1707		 * invalid for proxy ARP. DNAT routes are always valid.
1708 */
1709 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1710 err = -EINVAL;
1711 goto cleanup;
1712 }
1713 }
1714
1715
1716 rth = dst_alloc(&ipv4_dst_ops);
1717 if (!rth) {
1718 err = -ENOBUFS;
1719 goto cleanup;
1720 }
1721
1722 rth->u.dst.flags= DST_HOST;
1723#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1724 if (res->fi->fib_nhs > 1)
1725 rth->u.dst.flags |= DST_BALANCED;
1726#endif
1727 if (in_dev->cnf.no_policy)
1728 rth->u.dst.flags |= DST_NOPOLICY;
1729 if (in_dev->cnf.no_xfrm)
1730 rth->u.dst.flags |= DST_NOXFRM;
1731 rth->fl.fl4_dst = daddr;
1732 rth->rt_dst = daddr;
1733 rth->fl.fl4_tos = tos;
1734#ifdef CONFIG_IP_ROUTE_FWMARK
1735 rth->fl.fl4_fwmark= skb->nfmark;
1736#endif
1737 rth->fl.fl4_src = saddr;
1738 rth->rt_src = saddr;
1739 rth->rt_gateway = daddr;
1740 rth->rt_iif =
1741 rth->fl.iif = in_dev->dev->ifindex;
1742 rth->u.dst.dev = (out_dev)->dev;
1743 dev_hold(rth->u.dst.dev);
1744 rth->idev = in_dev_get(rth->u.dst.dev);
1745 rth->fl.oif = 0;
1746 rth->rt_spec_dst= spec_dst;
1747
1748 rth->u.dst.input = ip_forward;
1749 rth->u.dst.output = ip_output;
1750
1751 rt_set_nexthop(rth, res, itag);
1752
1753 rth->rt_flags = flags;
1754
1755 *result = rth;
1756 err = 0;
1757 cleanup:
1758 /* release the working reference to the output device */
1759 in_dev_put(out_dev);
1760 return err;
1761}
1762
1763static inline int ip_mkroute_input_def(struct sk_buff *skb,
1764 struct fib_result* res,
1765 const struct flowi *fl,
1766 struct in_device *in_dev,
1767 u32 daddr, u32 saddr, u32 tos)
1768{
1769 struct rtable* rth;
1770 int err;
1771 unsigned hash;
1772
1773#ifdef CONFIG_IP_ROUTE_MULTIPATH
1774 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1775 fib_select_multipath(fl, res);
1776#endif
1777
1778 /* create a routing cache entry */
1779 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1780 if (err)
1781 return err;
1782 atomic_set(&rth->u.dst.__refcnt, 1);
1783
1784 /* put it into the cache */
1785 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1786 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1787}
1788
1789static inline int ip_mkroute_input(struct sk_buff *skb,
1790 struct fib_result* res,
1791 const struct flowi *fl,
1792 struct in_device *in_dev,
1793 u32 daddr, u32 saddr, u32 tos)
1794{
1795#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1796 struct rtable* rth;
1797 unsigned char hop, hopcount, lasthop;
1798 int err = -EINVAL;
1799 unsigned int hash;
1800
1801 if (res->fi)
1802 hopcount = res->fi->fib_nhs;
1803 else
1804 hopcount = 1;
1805
1806 lasthop = hopcount - 1;
1807
1808 /* distinguish between multipath and singlepath */
1809 if (hopcount < 2)
1810 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1811 saddr, tos);
1812
1813 /* add all alternatives to the routing cache */
1814 for (hop = 0; hop < hopcount; hop++) {
1815 res->nh_sel = hop;
1816
1817 /* create a routing cache entry */
1818 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1819 &rth);
1820 if (err)
1821 return err;
1822
1823 /* put it into the cache */
1824 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1825 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1826 if (err)
1827 return err;
1828
1829 /* forward hop information to multipath impl. */
1830 multipath_set_nhinfo(rth,
1831 FIB_RES_NETWORK(*res),
1832 FIB_RES_NETMASK(*res),
1833 res->prefixlen,
1834 &FIB_RES_NH(*res));
1835
1836		/* the reference count is handled outside only for
1837		 * the last hop
1838 */
1839 if (hop == lasthop)
1840 atomic_set(&(skb->dst->__refcnt), 1);
1841 }
1842 return err;
1843#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1844 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1845#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1846}
1847
1848
1849/*
1850 *	NOTE. We drop all the packets that have local source
1851 *	addresses, because every properly looped back packet
1852 *	must already have the correct destination attached by the output routine.
1853 *
1854 *	This approach solves two big problems:
1855 *	1. Non-simplex devices are handled properly.
1856 *	2. IP spoofing attempts are filtered with a 100% guarantee.
1857 */
1858
1859static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1860 u8 tos, struct net_device *dev)
1861{
1862 struct fib_result res;
1863 struct in_device *in_dev = in_dev_get(dev);
1864 struct flowi fl = { .nl_u = { .ip4_u =
1865 { .daddr = daddr,
1866 .saddr = saddr,
1867 .tos = tos,
1868 .scope = RT_SCOPE_UNIVERSE,
1869#ifdef CONFIG_IP_ROUTE_FWMARK
1870 .fwmark = skb->nfmark
1871#endif
1872 } },
1873 .iif = dev->ifindex };
1874 unsigned flags = 0;
1875 u32 itag = 0;
1876 struct rtable * rth;
1877 unsigned hash;
1878 u32 spec_dst;
1879 int err = -EINVAL;
1880 int free_res = 0;
1881
1882 /* IP on this device is disabled. */
1883
1884 if (!in_dev)
1885 goto out;
1886
1887	/* Check for the most weird martians, which may not be detected
1888	   by fib_lookup.
1889	 */
1890
1891 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1892 goto martian_source;
1893
1894 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1895 goto brd_input;
1896
1897	/* Accept zero addresses only to the limited broadcast;
1898	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1899	 */
1900 if (ZERONET(saddr))
1901 goto martian_source;
1902
1903 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1904 goto martian_destination;
1905
1906 /*
1907 * Now we are ready to route packet.
1908 */
1909 if ((err = fib_lookup(&fl, &res)) != 0) {
1910 if (!IN_DEV_FORWARD(in_dev))
1911 goto e_inval;
1912 goto no_route;
1913 }
1914 free_res = 1;
1915
1916 RT_CACHE_STAT_INC(in_slow_tot);
1917
1918 if (res.type == RTN_BROADCAST)
1919 goto brd_input;
1920
1921 if (res.type == RTN_LOCAL) {
1922 int result;
1923 result = fib_validate_source(saddr, daddr, tos,
1924 loopback_dev.ifindex,
1925 dev, &spec_dst, &itag);
1926 if (result < 0)
1927 goto martian_source;
1928 if (result)
1929 flags |= RTCF_DIRECTSRC;
1930 spec_dst = daddr;
1931 goto local_input;
1932 }
1933
1934 if (!IN_DEV_FORWARD(in_dev))
1935 goto e_inval;
1936 if (res.type != RTN_UNICAST)
1937 goto martian_destination;
1938
1939 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1940 if (err == -ENOBUFS)
1941 goto e_nobufs;
1942 if (err == -EINVAL)
1943 goto e_inval;
1944
1945done:
1946 in_dev_put(in_dev);
1947 if (free_res)
1948 fib_res_put(&res);
1949out: return err;
1950
1951brd_input:
1952 if (skb->protocol != htons(ETH_P_IP))
1953 goto e_inval;
1954
1955 if (ZERONET(saddr))
1956 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1957 else {
1958 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1959 &itag);
1960 if (err < 0)
1961 goto martian_source;
1962 if (err)
1963 flags |= RTCF_DIRECTSRC;
1964 }
1965 flags |= RTCF_BROADCAST;
1966 res.type = RTN_BROADCAST;
1967 RT_CACHE_STAT_INC(in_brd);
1968
1969local_input:
1970 rth = dst_alloc(&ipv4_dst_ops);
1971 if (!rth)
1972 goto e_nobufs;
1973
1974 rth->u.dst.output= ip_rt_bug;
1975
1976 atomic_set(&rth->u.dst.__refcnt, 1);
1977 rth->u.dst.flags= DST_HOST;
1978 if (in_dev->cnf.no_policy)
1979 rth->u.dst.flags |= DST_NOPOLICY;
1980 rth->fl.fl4_dst = daddr;
1981 rth->rt_dst = daddr;
1982 rth->fl.fl4_tos = tos;
1983#ifdef CONFIG_IP_ROUTE_FWMARK
1984 rth->fl.fl4_fwmark= skb->nfmark;
1985#endif
1986 rth->fl.fl4_src = saddr;
1987 rth->rt_src = saddr;
1988#ifdef CONFIG_NET_CLS_ROUTE
1989 rth->u.dst.tclassid = itag;
1990#endif
1991 rth->rt_iif =
1992 rth->fl.iif = dev->ifindex;
1993 rth->u.dst.dev = &loopback_dev;
1994 dev_hold(rth->u.dst.dev);
1995 rth->idev = in_dev_get(rth->u.dst.dev);
1996 rth->rt_gateway = daddr;
1997 rth->rt_spec_dst= spec_dst;
1998 rth->u.dst.input= ip_local_deliver;
1999 rth->rt_flags = flags|RTCF_LOCAL;
2000 if (res.type == RTN_UNREACHABLE) {
2001 rth->u.dst.input= ip_error;
2002 rth->u.dst.error= -err;
2003 rth->rt_flags &= ~RTCF_LOCAL;
2004 }
2005 rth->rt_type = res.type;
2006 hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2007 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2008 goto done;
2009
2010no_route:
2011 RT_CACHE_STAT_INC(in_no_route);
2012 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2013 res.type = RTN_UNREACHABLE;
2014 goto local_input;
2015
2016 /*
2017 * Do not cache martian addresses: they should be logged (RFC1812)
2018 */
2019martian_destination:
2020 RT_CACHE_STAT_INC(in_martian_dst);
2021#ifdef CONFIG_IP_ROUTE_VERBOSE
2022 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2023 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2024 "%u.%u.%u.%u, dev %s\n",
2025 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2026#endif
2027e_inval:
2028 err = -EINVAL;
2029 goto done;
2030
2031e_nobufs:
2032 err = -ENOBUFS;
2033 goto done;
2034
2035martian_source:
2036 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2037 goto e_inval;
2038}
2039
2040int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2041 u8 tos, struct net_device *dev)
2042{
2043 struct rtable * rth;
2044 unsigned hash;
2045 int iif = dev->ifindex;
2046
2047 tos &= IPTOS_RT_MASK;
2048 hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2049
2050 rcu_read_lock();
2051 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2052 rth = rcu_dereference(rth->u.rt_next)) {
2053 if (rth->fl.fl4_dst == daddr &&
2054 rth->fl.fl4_src == saddr &&
2055 rth->fl.iif == iif &&
2056 rth->fl.oif == 0 &&
2057#ifdef CONFIG_IP_ROUTE_FWMARK
2058 rth->fl.fl4_fwmark == skb->nfmark &&
2059#endif
2060 rth->fl.fl4_tos == tos) {
2061 rth->u.dst.lastuse = jiffies;
2062 dst_hold(&rth->u.dst);
2063 rth->u.dst.__use++;
2064 RT_CACHE_STAT_INC(in_hit);
2065 rcu_read_unlock();
2066 skb->dst = (struct dst_entry*)rth;
2067 return 0;
2068 }
2069 RT_CACHE_STAT_INC(in_hlist_search);
2070 }
2071 rcu_read_unlock();
2072
2073	/* Multicast recognition logic has been moved from the route cache
2074	   to here.  The problem was that too many Ethernet cards have
2075	   broken/missing hardware multicast filters :-(  As a result, a host
2076	   on a multicast network acquires a lot of useless route cache
2077	   entries, e.g. for SDR messages from all over the world.  Now we
2078	   try to get rid of them.  Really, provided the software IP multicast
2079	   filter is organized reasonably (at least, hashed), this does not
2080	   result in a slowdown compared with route cache reject entries.
2081	   Note that multicast routers are not affected, because a route
2082	   cache entry is created eventually.
2083	 */
2084 if (MULTICAST(daddr)) {
2085 struct in_device *in_dev;
2086
2087 rcu_read_lock();
2088 if ((in_dev = __in_dev_get(dev)) != NULL) {
2089 int our = ip_check_mc(in_dev, daddr, saddr,
2090 skb->nh.iph->protocol);
2091 if (our
2092#ifdef CONFIG_IP_MROUTE
2093 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2094#endif
2095 ) {
2096 rcu_read_unlock();
2097 return ip_route_input_mc(skb, daddr, saddr,
2098 tos, dev, our);
2099 }
2100 }
2101 rcu_read_unlock();
2102 return -EINVAL;
2103 }
2104 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2105}
2106
2107static inline int __mkroute_output(struct rtable **result,
2108 struct fib_result* res,
2109 const struct flowi *fl,
2110 const struct flowi *oldflp,
2111 struct net_device *dev_out,
2112 unsigned flags)
2113{
2114 struct rtable *rth;
2115 struct in_device *in_dev;
2116 u32 tos = RT_FL_TOS(oldflp);
2117 int err = 0;
2118
2119 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2120 return -EINVAL;
2121
2122 if (fl->fl4_dst == 0xFFFFFFFF)
2123 res->type = RTN_BROADCAST;
2124 else if (MULTICAST(fl->fl4_dst))
2125 res->type = RTN_MULTICAST;
2126 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2127 return -EINVAL;
2128
2129 if (dev_out->flags & IFF_LOOPBACK)
2130 flags |= RTCF_LOCAL;
2131
2132 /* get work reference to inet device */
2133 in_dev = in_dev_get(dev_out);
2134 if (!in_dev)
2135 return -EINVAL;
2136
2137 if (res->type == RTN_BROADCAST) {
2138 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2139 if (res->fi) {
2140 fib_info_put(res->fi);
2141 res->fi = NULL;
2142 }
2143 } else if (res->type == RTN_MULTICAST) {
2144 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2145 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2146 oldflp->proto))
2147 flags &= ~RTCF_LOCAL;
2148		/* If a multicast route does not exist, use
2149		   the default one, but do not gateway in this case.
2150		   Yes, it is a hack.
2151		 */
2152 if (res->fi && res->prefixlen < 4) {
2153 fib_info_put(res->fi);
2154 res->fi = NULL;
2155 }
2156 }
2157
2158
2159 rth = dst_alloc(&ipv4_dst_ops);
2160 if (!rth) {
2161 err = -ENOBUFS;
2162 goto cleanup;
2163 }
2164
2165 rth->u.dst.flags= DST_HOST;
2166#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2167 if (res->fi) {
2168 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2169 if (res->fi->fib_nhs > 1)
2170 rth->u.dst.flags |= DST_BALANCED;
2171 }
2172#endif
2173 if (in_dev->cnf.no_xfrm)
2174 rth->u.dst.flags |= DST_NOXFRM;
2175 if (in_dev->cnf.no_policy)
2176 rth->u.dst.flags |= DST_NOPOLICY;
2177
2178 rth->fl.fl4_dst = oldflp->fl4_dst;
2179 rth->fl.fl4_tos = tos;
2180 rth->fl.fl4_src = oldflp->fl4_src;
2181 rth->fl.oif = oldflp->oif;
2182#ifdef CONFIG_IP_ROUTE_FWMARK
2183 rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2184#endif
2185 rth->rt_dst = fl->fl4_dst;
2186 rth->rt_src = fl->fl4_src;
2187 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2188	/* get references to the devices that are to be held by the routing
2189	   cache entry */
2190 rth->u.dst.dev = dev_out;
2191 dev_hold(dev_out);
2192 rth->idev = in_dev_get(dev_out);
2193 rth->rt_gateway = fl->fl4_dst;
2194 rth->rt_spec_dst= fl->fl4_src;
2195
2196 rth->u.dst.output=ip_output;
2197
2198 RT_CACHE_STAT_INC(out_slow_tot);
2199
2200 if (flags & RTCF_LOCAL) {
2201 rth->u.dst.input = ip_local_deliver;
2202 rth->rt_spec_dst = fl->fl4_dst;
2203 }
2204 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2205 rth->rt_spec_dst = fl->fl4_src;
2206 if (flags & RTCF_LOCAL &&
2207 !(dev_out->flags & IFF_LOOPBACK)) {
2208 rth->u.dst.output = ip_mc_output;
2209 RT_CACHE_STAT_INC(out_slow_mc);
2210 }
2211#ifdef CONFIG_IP_MROUTE
2212 if (res->type == RTN_MULTICAST) {
2213 if (IN_DEV_MFORWARD(in_dev) &&
2214 !LOCAL_MCAST(oldflp->fl4_dst)) {
2215 rth->u.dst.input = ip_mr_input;
2216 rth->u.dst.output = ip_mc_output;
2217 }
2218 }
2219#endif
2220 }
2221
2222 rt_set_nexthop(rth, res, 0);
2223
2224 rth->rt_flags = flags;
2225
2226 *result = rth;
2227 cleanup:
2228 /* release work reference to inet device */
2229 in_dev_put(in_dev);
2230
2231 return err;
2232}
2233
2234static inline int ip_mkroute_output_def(struct rtable **rp,
2235 struct fib_result* res,
2236 const struct flowi *fl,
2237 const struct flowi *oldflp,
2238 struct net_device *dev_out,
2239 unsigned flags)
2240{
2241 struct rtable *rth;
2242 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2243 unsigned hash;
2244 if (err == 0) {
2245 u32 tos = RT_FL_TOS(oldflp);
2246
2247 atomic_set(&rth->u.dst.__refcnt, 1);
2248
2249 hash = rt_hash_code(oldflp->fl4_dst,
2250 oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2251 err = rt_intern_hash(hash, rth, rp);
2252 }
2253
2254 return err;
2255}
2256
2257static inline int ip_mkroute_output(struct rtable** rp,
2258 struct fib_result* res,
2259 const struct flowi *fl,
2260 const struct flowi *oldflp,
2261 struct net_device *dev_out,
2262 unsigned flags)
2263{
2264#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2265 u32 tos = RT_FL_TOS(oldflp);
2266 unsigned char hop;
2267 unsigned hash;
2268 int err = -EINVAL;
2269 struct rtable *rth;
2270
2271 if (res->fi && res->fi->fib_nhs > 1) {
2272 unsigned char hopcount = res->fi->fib_nhs;
2273
2274 for (hop = 0; hop < hopcount; hop++) {
2275 struct net_device *dev2nexthop;
2276
2277 res->nh_sel = hop;
2278
2279 /* hold a work reference to the output device */
2280 dev2nexthop = FIB_RES_DEV(*res);
2281 dev_hold(dev2nexthop);
2282
2283 err = __mkroute_output(&rth, res, fl, oldflp,
2284 dev2nexthop, flags);
2285
2286 if (err != 0)
2287 goto cleanup;
2288
2289 hash = rt_hash_code(oldflp->fl4_dst,
2290 oldflp->fl4_src ^
2291 (oldflp->oif << 5), tos);
2292 err = rt_intern_hash(hash, rth, rp);
2293
2294 /* forward hop information to multipath impl. */
2295 multipath_set_nhinfo(rth,
2296 FIB_RES_NETWORK(*res),
2297 FIB_RES_NETMASK(*res),
2298 res->prefixlen,
2299 &FIB_RES_NH(*res));
2300 cleanup:
2301 /* release work reference to output device */
2302 dev_put(dev2nexthop);
2303
2304 if (err != 0)
2305 return err;
2306 }
2307 atomic_set(&(*rp)->u.dst.__refcnt, 1);
2308 return err;
2309 } else {
2310 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2311 flags);
2312 }
2313#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2314 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2315#endif
2316}
2317
2318/*
2319 * Major route resolver routine.
2320 */
2321
2322static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2323{
2324 u32 tos = RT_FL_TOS(oldflp);
2325 struct flowi fl = { .nl_u = { .ip4_u =
2326 { .daddr = oldflp->fl4_dst,
2327 .saddr = oldflp->fl4_src,
2328 .tos = tos & IPTOS_RT_MASK,
2329 .scope = ((tos & RTO_ONLINK) ?
2330 RT_SCOPE_LINK :
2331 RT_SCOPE_UNIVERSE),
2332#ifdef CONFIG_IP_ROUTE_FWMARK
2333 .fwmark = oldflp->fl4_fwmark
2334#endif
2335 } },
2336 .iif = loopback_dev.ifindex,
2337 .oif = oldflp->oif };
2338 struct fib_result res;
2339 unsigned flags = 0;
2340 struct net_device *dev_out = NULL;
2341 int free_res = 0;
2342 int err;
2343
2344
2345 res.fi = NULL;
2346#ifdef CONFIG_IP_MULTIPLE_TABLES
2347 res.r = NULL;
2348#endif
2349
2350 if (oldflp->fl4_src) {
2351 err = -EINVAL;
2352 if (MULTICAST(oldflp->fl4_src) ||
2353 BADCLASS(oldflp->fl4_src) ||
2354 ZERONET(oldflp->fl4_src))
2355 goto out;
2356
2357 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2358 dev_out = ip_dev_find(oldflp->fl4_src);
2359 if (dev_out == NULL)
2360 goto out;
2361
2362		/* I removed a check for oif == dev_out->oif here.
2363		   It was wrong for two reasons:
2364		   1. ip_dev_find(saddr) can return the wrong iface, if saddr
2365		      is assigned to multiple interfaces.
2366		   2. Moreover, we are allowed to send packets with a saddr
2367		      of another iface. --ANK
2368		 */
2369
2370 if (oldflp->oif == 0
2371 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2372		/* Special hack: the user can direct multicasts
2373		   and limited broadcast via the necessary interface
2374		   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2375		   This hack is not just for fun, it allows
2376		   vic, vat and friends to work.
2377		   They bind the socket to loopback, set ttl to zero
2378		   and expect that it will work.
2379		   From the viewpoint of the routing cache they are broken,
2380		   because we are not allowed to build a multicast path
2381		   with a loopback source addr (look, the routing cache
2382		   cannot know that ttl is zero, so that the packet
2383		   will not leave this host and the route is valid).
2384		   Luckily, this hack is a good workaround.
2385		 */
2386
2387 fl.oif = dev_out->ifindex;
2388 goto make_route;
2389 }
2390 if (dev_out)
2391 dev_put(dev_out);
2392 dev_out = NULL;
2393 }
2394
2395
2396 if (oldflp->oif) {
2397 dev_out = dev_get_by_index(oldflp->oif);
2398 err = -ENODEV;
2399 if (dev_out == NULL)
2400 goto out;
2401 if (__in_dev_get(dev_out) == NULL) {
2402 dev_put(dev_out);
2403 goto out; /* Wrong error code */
2404 }
2405
2406 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2407 if (!fl.fl4_src)
2408 fl.fl4_src = inet_select_addr(dev_out, 0,
2409 RT_SCOPE_LINK);
2410 goto make_route;
2411 }
2412 if (!fl.fl4_src) {
2413 if (MULTICAST(oldflp->fl4_dst))
2414 fl.fl4_src = inet_select_addr(dev_out, 0,
2415 fl.fl4_scope);
2416 else if (!oldflp->fl4_dst)
2417 fl.fl4_src = inet_select_addr(dev_out, 0,
2418 RT_SCOPE_HOST);
2419 }
2420 }
2421
2422 if (!fl.fl4_dst) {
2423 fl.fl4_dst = fl.fl4_src;
2424 if (!fl.fl4_dst)
2425 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2426 if (dev_out)
2427 dev_put(dev_out);
2428 dev_out = &loopback_dev;
2429 dev_hold(dev_out);
2430 fl.oif = loopback_dev.ifindex;
2431 res.type = RTN_LOCAL;
2432 flags |= RTCF_LOCAL;
2433 goto make_route;
2434 }
2435
2436 if (fib_lookup(&fl, &res)) {
2437 res.fi = NULL;
2438 if (oldflp->oif) {
2439		/* Apparently, the routing tables are wrong. Assume
2440		   that the destination is on-link.
2441
2442		   WHY? DW.
2443		   Because we are allowed to send to an iface
2444		   even if it has NO routes and NO assigned
2445		   addresses. When oif is specified, the routing
2446		   tables are looked up with only one purpose:
2447		   to detect whether the destination is gatewayed
2448		   rather than direct. Moreover, if MSG_DONTROUTE is set,
2449		   we send the packet, ignoring both the routing tables
2450		   and the ifaddr state. --ANK
2451
2452
2453		   We could do this even if oif is unknown,
2454		   as IPv6 likely does, but we do not.
2455		 */
2456
2457 if (fl.fl4_src == 0)
2458 fl.fl4_src = inet_select_addr(dev_out, 0,
2459 RT_SCOPE_LINK);
2460 res.type = RTN_UNICAST;
2461 goto make_route;
2462 }
2463 if (dev_out)
2464 dev_put(dev_out);
2465 err = -ENETUNREACH;
2466 goto out;
2467 }
2468 free_res = 1;
2469
2470 if (res.type == RTN_LOCAL) {
2471 if (!fl.fl4_src)
2472 fl.fl4_src = fl.fl4_dst;
2473 if (dev_out)
2474 dev_put(dev_out);
2475 dev_out = &loopback_dev;
2476 dev_hold(dev_out);
2477 fl.oif = dev_out->ifindex;
2478 if (res.fi)
2479 fib_info_put(res.fi);
2480 res.fi = NULL;
2481 flags |= RTCF_LOCAL;
2482 goto make_route;
2483 }
2484
2485#ifdef CONFIG_IP_ROUTE_MULTIPATH
2486 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2487 fib_select_multipath(&fl, &res);
2488 else
2489#endif
2490 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2491 fib_select_default(&fl, &res);
2492
2493 if (!fl.fl4_src)
2494 fl.fl4_src = FIB_RES_PREFSRC(res);
2495
2496 if (dev_out)
2497 dev_put(dev_out);
2498 dev_out = FIB_RES_DEV(res);
2499 dev_hold(dev_out);
2500 fl.oif = dev_out->ifindex;
2501
2502
2503make_route:
2504 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2505
2506
2507 if (free_res)
2508 fib_res_put(&res);
2509 if (dev_out)
2510 dev_put(dev_out);
2511out: return err;
2512}
2513
2514int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2515{
2516 unsigned hash;
2517 struct rtable *rth;
2518
2519 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2520
2521 rcu_read_lock_bh();
2522 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2523 rth = rcu_dereference(rth->u.rt_next)) {
2524 if (rth->fl.fl4_dst == flp->fl4_dst &&
2525 rth->fl.fl4_src == flp->fl4_src &&
2526 rth->fl.iif == 0 &&
2527 rth->fl.oif == flp->oif &&
2528#ifdef CONFIG_IP_ROUTE_FWMARK
2529 rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2530#endif
2531 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2532 (IPTOS_RT_MASK | RTO_ONLINK))) {
2533
2534 /* check for multipath routes and choose one if
2535 * necessary
2536 */
2537 if (multipath_select_route(flp, rth, rp)) {
2538 dst_hold(&(*rp)->u.dst);
2539 RT_CACHE_STAT_INC(out_hit);
2540 rcu_read_unlock_bh();
2541 return 0;
2542 }
2543
2544 rth->u.dst.lastuse = jiffies;
2545 dst_hold(&rth->u.dst);
2546 rth->u.dst.__use++;
2547 RT_CACHE_STAT_INC(out_hit);
2548 rcu_read_unlock_bh();
2549 *rp = rth;
2550 return 0;
2551 }
2552 RT_CACHE_STAT_INC(out_hlist_search);
2553 }
2554 rcu_read_unlock_bh();
2555
2556 return ip_route_output_slow(rp, flp);
2557}
2558
2559int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2560{
2561 int err;
2562
2563 if ((err = __ip_route_output_key(rp, flp)) != 0)
2564 return err;
2565
2566 if (flp->proto) {
2567 if (!flp->fl4_src)
2568 flp->fl4_src = (*rp)->rt_src;
2569 if (!flp->fl4_dst)
2570 flp->fl4_dst = (*rp)->rt_dst;
2571 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2572 }
2573
2574 return 0;
2575}
2576
2577int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2578{
2579 return ip_route_output_flow(rp, flp, NULL, 0);
2580}
2581
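/*
 * Build a netlink routing message (RTM_NEWROUTE) describing the route
 * cache entry currently attached to skb->dst; used by inet_rtm_getroute()
 * and ip_rt_dump() below.
 */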
2582static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2583 int nowait)
2584{
2585 struct rtable *rt = (struct rtable*)skb->dst;
2586 struct rtmsg *r;
2587 struct nlmsghdr *nlh;
2588 unsigned char *b = skb->tail;
2589 struct rta_cacheinfo ci;
2590#ifdef CONFIG_IP_MROUTE
2591 struct rtattr *eptr;
2592#endif
2593 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2594 r = NLMSG_DATA(nlh);
2595 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2596 r->rtm_family = AF_INET;
2597 r->rtm_dst_len = 32;
2598 r->rtm_src_len = 0;
2599 r->rtm_tos = rt->fl.fl4_tos;
2600 r->rtm_table = RT_TABLE_MAIN;
2601 r->rtm_type = rt->rt_type;
2602 r->rtm_scope = RT_SCOPE_UNIVERSE;
2603 r->rtm_protocol = RTPROT_UNSPEC;
2604 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2605 if (rt->rt_flags & RTCF_NOTIFY)
2606 r->rtm_flags |= RTM_F_NOTIFY;
2607 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2608 if (rt->fl.fl4_src) {
2609 r->rtm_src_len = 32;
2610 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2611 }
2612 if (rt->u.dst.dev)
2613 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2614#ifdef CONFIG_NET_CLS_ROUTE
2615 if (rt->u.dst.tclassid)
2616 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2617#endif
2618#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2619 if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2620 __u32 alg = rt->rt_multipath_alg;
2621
2622 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2623 }
2624#endif
2625 if (rt->fl.iif)
2626 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2627 else if (rt->rt_src != rt->fl.fl4_src)
2628 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2629 if (rt->rt_dst != rt->rt_gateway)
2630 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2631 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2632 goto rtattr_failure;
2633 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2634 ci.rta_used = rt->u.dst.__use;
2635 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2636 if (rt->u.dst.expires)
2637 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2638 else
2639 ci.rta_expires = 0;
2640 ci.rta_error = rt->u.dst.error;
2641 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2642 if (rt->peer) {
2643 ci.rta_id = rt->peer->ip_id_count;
2644 if (rt->peer->tcp_ts_stamp) {
2645 ci.rta_ts = rt->peer->tcp_ts;
2646 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2647 }
2648 }
2649#ifdef CONFIG_IP_MROUTE
2650 eptr = (struct rtattr*)skb->tail;
2651#endif
2652 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2653 if (rt->fl.iif) {
2654#ifdef CONFIG_IP_MROUTE
2655 u32 dst = rt->rt_dst;
2656
2657 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2658 ipv4_devconf.mc_forwarding) {
2659 int err = ipmr_get_route(skb, r, nowait);
2660 if (err <= 0) {
2661 if (!nowait) {
2662 if (err == 0)
2663 return 0;
2664 goto nlmsg_failure;
2665 } else {
2666 if (err == -EMSGSIZE)
2667 goto nlmsg_failure;
2668 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2669 }
2670 }
2671 } else
2672#endif
2673 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2674 }
2675
2676 nlh->nlmsg_len = skb->tail - b;
2677 return skb->len;
2678
2679nlmsg_failure:
2680rtattr_failure:
2681 skb_trim(skb, b - skb->data);
2682 return -1;
2683}
2684
2685int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2686{
2687 struct rtattr **rta = arg;
2688 struct rtmsg *rtm = NLMSG_DATA(nlh);
2689 struct rtable *rt = NULL;
2690 u32 dst = 0;
2691 u32 src = 0;
2692 int iif = 0;
2693 int err = -ENOBUFS;
2694 struct sk_buff *skb;
2695
2696 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2697 if (!skb)
2698 goto out;
2699
2700	/* Reserve room for dummy headers; this skb can pass
2701	   through a good chunk of the routing engine.
2702 */
2703 skb->mac.raw = skb->data;
2704 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2705
2706 if (rta[RTA_SRC - 1])
2707 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2708 if (rta[RTA_DST - 1])
2709 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2710 if (rta[RTA_IIF - 1])
2711 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2712
2713 if (iif) {
2714 struct net_device *dev = __dev_get_by_index(iif);
2715 err = -ENODEV;
2716 if (!dev)
2717 goto out_free;
2718 skb->protocol = htons(ETH_P_IP);
2719 skb->dev = dev;
2720 local_bh_disable();
2721 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2722 local_bh_enable();
2723 rt = (struct rtable*)skb->dst;
2724 if (!err && rt->u.dst.error)
2725 err = -rt->u.dst.error;
2726 } else {
2727 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2728 .saddr = src,
2729 .tos = rtm->rtm_tos } } };
2730 int oif = 0;
2731 if (rta[RTA_OIF - 1])
2732 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2733 fl.oif = oif;
2734 err = ip_route_output_key(&rt, &fl);
2735 }
2736 if (err)
2737 goto out_free;
2738
2739 skb->dst = &rt->u.dst;
2740 if (rtm->rtm_flags & RTM_F_NOTIFY)
2741 rt->rt_flags |= RTCF_NOTIFY;
2742
2743 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2744
2745 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2746 RTM_NEWROUTE, 0);
2747 if (!err)
2748 goto out_free;
2749 if (err < 0) {
2750 err = -EMSGSIZE;
2751 goto out_free;
2752 }
2753
2754 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2755 if (err > 0)
2756 err = 0;
2757out: return err;
2758
2759out_free:
2760 kfree_skb(skb);
2761 goto out;
2762}
2763
2764int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2765{
2766 struct rtable *rt;
2767 int h, s_h;
2768 int idx, s_idx;
2769
2770 s_h = cb->args[0];
2771 s_idx = idx = cb->args[1];
2772 for (h = 0; h <= rt_hash_mask; h++) {
2773 if (h < s_h) continue;
2774 if (h > s_h)
2775 s_idx = 0;
2776 rcu_read_lock_bh();
2777 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2778 rt = rcu_dereference(rt->u.rt_next), idx++) {
2779 if (idx < s_idx)
2780 continue;
2781 skb->dst = dst_clone(&rt->u.dst);
2782 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2783 cb->nlh->nlmsg_seq,
2784 RTM_NEWROUTE, 1) <= 0) {
2785 dst_release(xchg(&skb->dst, NULL));
2786 rcu_read_unlock_bh();
2787 goto done;
2788 }
2789 dst_release(xchg(&skb->dst, NULL));
2790 }
2791 rcu_read_unlock_bh();
2792 }
2793
2794done:
2795 cb->args[0] = h;
2796 cb->args[1] = idx;
2797 return skb->len;
2798}
2799
2800void ip_rt_multicast_event(struct in_device *in_dev)
2801{
2802 rt_cache_flush(0);
2803}
2804
2805#ifdef CONFIG_SYSCTL
2806static int flush_delay;
2807
2808static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2809 struct file *filp, void __user *buffer,
2810 size_t *lenp, loff_t *ppos)
2811{
2812 if (write) {
2813 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2814 rt_cache_flush(flush_delay);
2815 return 0;
2816 }
2817
2818 return -EINVAL;
2819}
2820
2821static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2822 int __user *name,
2823 int nlen,
2824 void __user *oldval,
2825 size_t __user *oldlenp,
2826 void __user *newval,
2827 size_t newlen,
2828 void **context)
2829{
2830 int delay;
2831 if (newlen != sizeof(int))
2832 return -EINVAL;
2833 if (get_user(delay, (int __user *)newval))
2834 return -EFAULT;
2835 rt_cache_flush(delay);
2836 return 0;
2837}
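/*
 * A usage sketch for the two flush handlers above: writing an integer to
 * /proc/sys/net/ipv4/route/flush (e.g. "echo 0 > /proc/sys/net/ipv4/route/flush")
 * stores it in flush_delay and flushes the routing cache with that delay,
 * while the strategy handler covers the binary sysctl(2) path; reading the
 * file simply returns -EINVAL.
 */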
2838
2839ctl_table ipv4_route_table[] = {
2840 {
2841 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2842 .procname = "flush",
2843 .data = &flush_delay,
2844 .maxlen = sizeof(int),
2845 .mode = 0644,
2846 .proc_handler = &ipv4_sysctl_rtcache_flush,
2847 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2848 },
2849 {
2850 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2851 .procname = "min_delay",
2852 .data = &ip_rt_min_delay,
2853 .maxlen = sizeof(int),
2854 .mode = 0644,
2855 .proc_handler = &proc_dointvec_jiffies,
2856 .strategy = &sysctl_jiffies,
2857 },
2858 {
2859 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2860 .procname = "max_delay",
2861 .data = &ip_rt_max_delay,
2862 .maxlen = sizeof(int),
2863 .mode = 0644,
2864 .proc_handler = &proc_dointvec_jiffies,
2865 .strategy = &sysctl_jiffies,
2866 },
2867 {
2868 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2869 .procname = "gc_thresh",
2870 .data = &ipv4_dst_ops.gc_thresh,
2871 .maxlen = sizeof(int),
2872 .mode = 0644,
2873 .proc_handler = &proc_dointvec,
2874 },
2875 {
2876 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2877 .procname = "max_size",
2878 .data = &ip_rt_max_size,
2879 .maxlen = sizeof(int),
2880 .mode = 0644,
2881 .proc_handler = &proc_dointvec,
2882 },
2883 {
2884 /* Deprecated. Use gc_min_interval_ms */
2885
2886 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2887 .procname = "gc_min_interval",
2888 .data = &ip_rt_gc_min_interval,
2889 .maxlen = sizeof(int),
2890 .mode = 0644,
2891 .proc_handler = &proc_dointvec_jiffies,
2892 .strategy = &sysctl_jiffies,
2893 },
2894 {
2895 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2896 .procname = "gc_min_interval_ms",
2897 .data = &ip_rt_gc_min_interval,
2898 .maxlen = sizeof(int),
2899 .mode = 0644,
2900 .proc_handler = &proc_dointvec_ms_jiffies,
2901 .strategy = &sysctl_ms_jiffies,
2902 },
2903 {
2904 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2905 .procname = "gc_timeout",
2906 .data = &ip_rt_gc_timeout,
2907 .maxlen = sizeof(int),
2908 .mode = 0644,
2909 .proc_handler = &proc_dointvec_jiffies,
2910 .strategy = &sysctl_jiffies,
2911 },
2912 {
2913 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2914 .procname = "gc_interval",
2915 .data = &ip_rt_gc_interval,
2916 .maxlen = sizeof(int),
2917 .mode = 0644,
2918 .proc_handler = &proc_dointvec_jiffies,
2919 .strategy = &sysctl_jiffies,
2920 },
2921 {
2922 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2923 .procname = "redirect_load",
2924 .data = &ip_rt_redirect_load,
2925 .maxlen = sizeof(int),
2926 .mode = 0644,
2927 .proc_handler = &proc_dointvec,
2928 },
2929 {
2930 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2931 .procname = "redirect_number",
2932 .data = &ip_rt_redirect_number,
2933 .maxlen = sizeof(int),
2934 .mode = 0644,
2935 .proc_handler = &proc_dointvec,
2936 },
2937 {
2938 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2939 .procname = "redirect_silence",
2940 .data = &ip_rt_redirect_silence,
2941 .maxlen = sizeof(int),
2942 .mode = 0644,
2943 .proc_handler = &proc_dointvec,
2944 },
2945 {
2946 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2947 .procname = "error_cost",
2948 .data = &ip_rt_error_cost,
2949 .maxlen = sizeof(int),
2950 .mode = 0644,
2951 .proc_handler = &proc_dointvec,
2952 },
2953 {
2954 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2955 .procname = "error_burst",
2956 .data = &ip_rt_error_burst,
2957 .maxlen = sizeof(int),
2958 .mode = 0644,
2959 .proc_handler = &proc_dointvec,
2960 },
2961 {
2962 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2963 .procname = "gc_elasticity",
2964 .data = &ip_rt_gc_elasticity,
2965 .maxlen = sizeof(int),
2966 .mode = 0644,
2967 .proc_handler = &proc_dointvec,
2968 },
2969 {
2970 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2971 .procname = "mtu_expires",
2972 .data = &ip_rt_mtu_expires,
2973 .maxlen = sizeof(int),
2974 .mode = 0644,
2975 .proc_handler = &proc_dointvec_jiffies,
2976 .strategy = &sysctl_jiffies,
2977 },
2978 {
2979 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2980 .procname = "min_pmtu",
2981 .data = &ip_rt_min_pmtu,
2982 .maxlen = sizeof(int),
2983 .mode = 0644,
2984 .proc_handler = &proc_dointvec,
2985 },
2986 {
2987 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2988 .procname = "min_adv_mss",
2989 .data = &ip_rt_min_advmss,
2990 .maxlen = sizeof(int),
2991 .mode = 0644,
2992 .proc_handler = &proc_dointvec,
2993 },
2994 {
2995 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
2996 .procname = "secret_interval",
2997 .data = &ip_rt_secret_interval,
2998 .maxlen = sizeof(int),
2999 .mode = 0644,
3000 .proc_handler = &proc_dointvec_jiffies,
3001 .strategy = &sysctl_jiffies,
3002 },
3003 { .ctl_name = 0 }
3004};
3005#endif
3006
3007#ifdef CONFIG_NET_CLS_ROUTE
3008struct ip_rt_acct *ip_rt_acct;
3009
3010/* This code sucks. But you should have seen it before! --RR */
3011
3012/* IP route accounting ptr for this logical cpu number. */
3013#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
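/*
 * ip_rt_acct is laid out as NR_CPUS consecutive arrays of 256 accounting
 * slots, so IP_RT_ACCT_CPU(i) yields the base of cpu i's array; the /proc
 * reader below adds the per-cpu arrays together one 32-bit word at a time.
 */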
3014
3015#ifdef CONFIG_PROC_FS
3016static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3017 int length, int *eof, void *data)
3018{
3019 unsigned int i;
3020
3021 if ((offset & 3) || (length & 3))
3022 return -EIO;
3023
3024 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3025 *eof = 1;
3026 return 0;
3027 }
3028
3029 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3030 length = sizeof(struct ip_rt_acct) * 256 - offset;
3031 *eof = 1;
3032 }
3033
3034 offset /= sizeof(u32);
3035
3036 if (length > 0) {
3037 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3038 u32 *dst = (u32 *) buffer;
3039
3040 /* Copy first cpu. */
3041 *start = buffer;
3042 memcpy(dst, src, length);
3043
3044 /* Add the other cpus in, one int at a time */
3045 for_each_cpu(i) {
3046 unsigned int j;
3047
3048 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3049
3050 for (j = 0; j < length/4; j++)
3051 dst[j] += src[j];
3052 }
3053 }
3054 return length;
3055}
3056#endif /* CONFIG_PROC_FS */
3057#endif /* CONFIG_NET_CLS_ROUTE */
3058
3059static __initdata unsigned long rhash_entries;
3060static int __init set_rhash_entries(char *str)
3061{
3062 if (!str)
3063 return 0;
3064 rhash_entries = simple_strtoul(str, &str, 0);
3065 return 1;
3066}
3067__setup("rhash_entries=", set_rhash_entries);
3068
3069int __init ip_rt_init(void)
3070{
3071 int i, order, goal, rc = 0;
3072
3073 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3074 (jiffies ^ (jiffies >> 7)));
3075
3076#ifdef CONFIG_NET_CLS_ROUTE
3077 for (order = 0;
3078 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3079 /* NOTHING */;
3080 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3081 if (!ip_rt_acct)
3082 panic("IP: failed to allocate ip_rt_acct\n");
3083 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3084#endif
3085
3086 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3087 sizeof(struct rtable),
3088 0, SLAB_HWCACHE_ALIGN,
3089 NULL, NULL);
3090
3091 if (!ipv4_dst_ops.kmem_cachep)
3092 panic("IP: failed to allocate ip_dst_cache\n");
3093
3094 goal = num_physpages >> (26 - PAGE_SHIFT);
3095 if (rhash_entries)
3096 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
3097 for (order = 0; (1UL << order) < goal; order++)
3098 /* NOTHING */;
3099
3100 do {
3101 rt_hash_mask = (1UL << order) * PAGE_SIZE /
3102 sizeof(struct rt_hash_bucket);
3103 while (rt_hash_mask & (rt_hash_mask - 1))
3104 rt_hash_mask--;
3105 rt_hash_table = (struct rt_hash_bucket *)
3106 __get_free_pages(GFP_ATOMIC, order);
3107 } while (rt_hash_table == NULL && --order > 0);
3108
3109 if (!rt_hash_table)
3110 panic("Failed to allocate IP route cache hash table\n");
3111
3112 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3113 rt_hash_mask,
3114 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3115
3116 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3117 /* NOTHING */;
3118
3119 rt_hash_mask--;
3120 for (i = 0; i <= rt_hash_mask; i++) {
3121 spin_lock_init(&rt_hash_table[i].lock);
3122 rt_hash_table[i].chain = NULL;
3123 }
3124
3125 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3126 ip_rt_max_size = (rt_hash_mask + 1) * 16;
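	/*
	 * A worked sizing example for the code above, assuming a box with
	 * 512 MB of RAM, 4 KB pages and (hypothetically) 8-byte hash buckets:
	 *   goal = num_physpages >> (26 - PAGE_SHIFT) = 131072 >> 14 = 8 pages,
	 *   so order = 3 and the table holds 8 * 4096 / 8 = 4096 buckets
	 *   (already a power of two).  rt_hash_log then becomes 12,
	 *   rt_hash_mask ends up as 4095, gc_thresh as 4096 and
	 *   ip_rt_max_size as 65536 cached routes.
	 */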
3127
3128 rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3129 if (!rt_cache_stat)
3130 return -ENOMEM;
3131
3132 devinet_init();
3133 ip_fib_init();
3134
3135 init_timer(&rt_flush_timer);
3136 rt_flush_timer.function = rt_run_flush;
3137 init_timer(&rt_periodic_timer);
3138 rt_periodic_timer.function = rt_check_expire;
3139 init_timer(&rt_secret_timer);
3140 rt_secret_timer.function = rt_secret_rebuild;
3141
3142	/* All the timers started at system startup tend
3143	   to synchronize. Perturb it a bit.
3144	 */
3145 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3146 ip_rt_gc_interval;
3147 add_timer(&rt_periodic_timer);
3148
3149 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3150 ip_rt_secret_interval;
3151 add_timer(&rt_secret_timer);
3152
3153#ifdef CONFIG_PROC_FS
3154 {
3155 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3156 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3157 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3158 proc_net_stat))) {
3159 free_percpu(rt_cache_stat);
3160 return -ENOMEM;
3161 }
3162 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3163 }
3164#ifdef CONFIG_NET_CLS_ROUTE
3165 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3166#endif
3167#endif
3168#ifdef CONFIG_XFRM
3169 xfrm_init();
3170 xfrm4_init();
3171#endif
3172 return rc;
3173}
3174
3175EXPORT_SYMBOL(__ip_select_ident);
3176EXPORT_SYMBOL(ip_route_input);
3177EXPORT_SYMBOL(ip_route_output_key);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
new file mode 100644
index 000000000000..e923d2f021aa
--- /dev/null
+++ b/net/ipv4/syncookies.c
@@ -0,0 +1,279 @@
1/*
2 * Syncookies implementation for the Linux kernel
3 *
4 * Copyright (C) 1997 Andi Kleen
5 * Based on ideas by D.J.Bernstein and Eric Schenk.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * $Id: syncookies.c,v 1.18 2002/02/01 22:01:04 davem Exp $
13 *
14 * Missing: IPv6 support.
15 */
16
17#include <linux/tcp.h>
18#include <linux/slab.h>
19#include <linux/random.h>
20#include <linux/cryptohash.h>
21#include <linux/kernel.h>
22#include <net/tcp.h>
23
24extern int sysctl_tcp_syncookies;
25
26static __u32 syncookie_secret[2][16-3+SHA_DIGEST_WORDS];
27
28static __init int init_syncookies(void)
29{
30 get_random_bytes(syncookie_secret, sizeof(syncookie_secret));
31 return 0;
32}
33module_init(init_syncookies);
34
35#define COOKIEBITS 24 /* Upper bits store count */
36#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
37
38static u32 cookie_hash(u32 saddr, u32 daddr, u32 sport, u32 dport,
39 u32 count, int c)
40{
41 __u32 tmp[16 + 5 + SHA_WORKSPACE_WORDS];
42
43 memcpy(tmp + 3, syncookie_secret[c], sizeof(syncookie_secret[c]));
44 tmp[0] = saddr;
45 tmp[1] = daddr;
46 tmp[2] = (sport << 16) + dport;
47 tmp[3] = count;
48 sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
49
50 return tmp[17];
51}
52
53static __u32 secure_tcp_syn_cookie(__u32 saddr, __u32 daddr, __u16 sport,
54 __u16 dport, __u32 sseq, __u32 count,
55 __u32 data)
56{
57 /*
58 * Compute the secure sequence number.
59 * The output should be:
60 * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24)
61 * + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24).
62 * Where sseq is their sequence number and count increases every
63 * minute by 1.
64 * As an extra hack, we add a small "data" value that encodes the
65 * MSS into the second hash value.
66 */
67
68 return (cookie_hash(saddr, daddr, sport, dport, 0, 0) +
69 sseq + (count << COOKIEBITS) +
70 ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data)
71 & COOKIEMASK));
72}
73
74/*
75 * This retrieves the small "data" value from the syncookie.
76 * If the syncookie is bad, the data returned will be out of
77 * range. This must be checked by the caller.
78 *
79 * The count value used to generate the cookie must be within
80 * "maxdiff" of the current (passed-in) "count". The return value
81 * is (__u32)-1 if this test fails.
82 */
83static __u32 check_tcp_syn_cookie(__u32 cookie, __u32 saddr, __u32 daddr,
84 __u16 sport, __u16 dport, __u32 sseq,
85 __u32 count, __u32 maxdiff)
86{
87 __u32 diff;
88
89 /* Strip away the layers from the cookie */
90 cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
91
92 /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */
93 diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS);
94 if (diff >= maxdiff)
95 return (__u32)-1;
96
97 return (cookie -
98 cookie_hash(saddr, daddr, sport, dport, count - diff, 1))
99 & COOKIEMASK; /* Leaving the data behind */
100}
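/*
 * A worked round trip through the two helpers above (illustrative only):
 * secure_tcp_syn_cookie() returns
 *     H1 + sseq + (count << COOKIEBITS) + ((H2(count) + data) & COOKIEMASK)
 * where H1/H2 are the two cookie_hash() values.  check_tcp_syn_cookie()
 * subtracts H1 + sseq, reads count (mod 256) back out of the top 8 bits,
 * and -- provided the current count is within maxdiff of it -- subtracts
 * H2(count) again, leaving the original "data" (the MSS table index)
 * modulo 2^COOKIEBITS.
 */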
101
102/*
103 * This table has to be sorted and terminated with (__u16)-1.
104 * XXX generate a better table.
105 * Unresolved Issues: HIPPI with a 64k MSS is not well supported.
106 */
107static __u16 const msstab[] = {
108 64 - 1,
109 256 - 1,
110 512 - 1,
111 536 - 1,
112 1024 - 1,
113 1440 - 1,
114 1460 - 1,
115 4312 - 1,
116 (__u16)-1
117};
118/* The number doesn't include the -1 terminator */
119#define NUM_MSS (ARRAY_SIZE(msstab) - 1)
120
121/*
122 * Generate a syncookie. mssp points to the mss, which is returned
123 * rounded down to the value encoded in the cookie.
124 */
125__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
126{
127 struct tcp_sock *tp = tcp_sk(sk);
128 int mssind;
129 const __u16 mss = *mssp;
130
131
132 tp->last_synq_overflow = jiffies;
133
134 /* XXX sort msstab[] by probability? Binary search? */
135 for (mssind = 0; mss > msstab[mssind + 1]; mssind++)
136 ;
137 *mssp = msstab[mssind] + 1;
138
139 NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESSENT);
140
141 return secure_tcp_syn_cookie(skb->nh.iph->saddr, skb->nh.iph->daddr,
142 skb->h.th->source, skb->h.th->dest,
143 ntohl(skb->h.th->seq),
144 jiffies / (HZ * 60), mssind);
145}
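/*
 * Worked example of the rounding above: for an incoming MSS of 1400 the
 * loop stops at mssind = 4 (1400 > msstab[4] = 1023 but 1400 <= msstab[5]
 * = 1439), so the cookie encodes index 4 and *mssp is rounded down to
 * msstab[4] + 1 = 1024; cookie_check() below recovers the same 1024.
 */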
146
147/*
148 * This (misnamed) value is the maximum permitted age of a syncookie.
149 * Its ideal value should depend on TCP_TIMEOUT_INIT and
150 * sysctl_tcp_retries1. It's a rather complicated formula (exponential
151 * backoff) to compute at runtime, so it's currently hardcoded here.
152 */
153#define COUNTER_TRIES 4
154/*
155 * Check if an ack sequence number is a valid syncookie.
156 * Return the decoded mss if it is, or 0 if not.
157 */
158static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
159{
160 __u32 seq;
161 __u32 mssind;
162
163 seq = ntohl(skb->h.th->seq)-1;
164 mssind = check_tcp_syn_cookie(cookie,
165 skb->nh.iph->saddr, skb->nh.iph->daddr,
166 skb->h.th->source, skb->h.th->dest,
167 seq, jiffies / (HZ * 60), COUNTER_TRIES);
168
169 return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
170}
171
172extern struct or_calltable or_ipv4;
173
174static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
175 struct open_request *req,
176 struct dst_entry *dst)
177{
178 struct tcp_sock *tp = tcp_sk(sk);
179 struct sock *child;
180
181 child = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
182 if (child)
183 tcp_acceptq_queue(sk, req, child);
184 else
185 tcp_openreq_free(req);
186
187 return child;
188}
189
190struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
191 struct ip_options *opt)
192{
193 struct tcp_sock *tp = tcp_sk(sk);
194 __u32 cookie = ntohl(skb->h.th->ack_seq) - 1;
195 struct sock *ret = sk;
196 struct open_request *req;
197 int mss;
198 struct rtable *rt;
199 __u8 rcv_wscale;
200
201 if (!sysctl_tcp_syncookies || !skb->h.th->ack)
202 goto out;
203
204 if (time_after(jiffies, tp->last_synq_overflow + TCP_TIMEOUT_INIT) ||
205 (mss = cookie_check(skb, cookie)) == 0) {
206 NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESFAILED);
207 goto out;
208 }
209
210 NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV);
211
212 req = tcp_openreq_alloc();
213 ret = NULL;
214 if (!req)
215 goto out;
216
217 req->rcv_isn = htonl(skb->h.th->seq) - 1;
218 req->snt_isn = cookie;
219 req->mss = mss;
220 req->rmt_port = skb->h.th->source;
221 req->af.v4_req.loc_addr = skb->nh.iph->daddr;
222 req->af.v4_req.rmt_addr = skb->nh.iph->saddr;
223	req->class = &or_ipv4; /* for safety */
224 req->af.v4_req.opt = NULL;
225
226	/* We threw the options of the initial SYN away, so we hope
227 * the ACK carries the same options again (see RFC1122 4.2.3.8)
228 */
229 if (opt && opt->optlen) {
230 int opt_size = sizeof(struct ip_options) + opt->optlen;
231
232 req->af.v4_req.opt = kmalloc(opt_size, GFP_ATOMIC);
233 if (req->af.v4_req.opt) {
234 if (ip_options_echo(req->af.v4_req.opt, skb)) {
235 kfree(req->af.v4_req.opt);
236 req->af.v4_req.opt = NULL;
237 }
238 }
239 }
240
241 req->snd_wscale = req->rcv_wscale = req->tstamp_ok = 0;
242 req->wscale_ok = req->sack_ok = 0;
243 req->expires = 0UL;
244 req->retrans = 0;
245
246 /*
247	 * We need to look up the route here to get at the correct
248	 * window size. Ideally we would make sure that the window size
249 * hasn't changed since we received the original syn, but I see
250 * no easy way to do this.
251 */
252 {
253 struct flowi fl = { .nl_u = { .ip4_u =
254 { .daddr = ((opt && opt->srr) ?
255 opt->faddr :
256 req->af.v4_req.rmt_addr),
257 .saddr = req->af.v4_req.loc_addr,
258 .tos = RT_CONN_FLAGS(sk) } },
259 .proto = IPPROTO_TCP,
260 .uli_u = { .ports =
261 { .sport = skb->h.th->dest,
262 .dport = skb->h.th->source } } };
263 if (ip_route_output_key(&rt, &fl)) {
264 tcp_openreq_free(req);
265 goto out;
266 }
267 }
268
269 /* Try to redo what tcp_v4_send_synack did. */
270 req->window_clamp = dst_metric(&rt->u.dst, RTAX_WINDOW);
271 tcp_select_initial_window(tcp_full_space(sk), req->mss,
272 &req->rcv_wnd, &req->window_clamp,
273 0, &rcv_wscale);
274 /* BTW win scale with syncookies is 0 by definition */
275 req->rcv_wscale = rcv_wscale;
276
277 ret = get_cookie_sock(sk, skb, req, &rt->u.dst);
278out: return ret;
279}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
new file mode 100644
index 000000000000..3aafb298c1c1
--- /dev/null
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -0,0 +1,698 @@
1/*
2 * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem.
3 *
4 * $Id: sysctl_net_ipv4.c,v 1.50 2001/10/20 00:00:11 davem Exp $
5 *
6 * Begun April 1, 1996, Mike Shaver.
7 * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS]
8 */
9
10#include <linux/mm.h>
11#include <linux/module.h>
12#include <linux/sysctl.h>
13#include <linux/config.h>
14#include <net/snmp.h>
15#include <net/ip.h>
16#include <net/route.h>
17#include <net/tcp.h>
18
19/* From af_inet.c */
20extern int sysctl_ip_nonlocal_bind;
21
22/* From icmp.c */
23extern int sysctl_icmp_echo_ignore_all;
24extern int sysctl_icmp_echo_ignore_broadcasts;
25extern int sysctl_icmp_ignore_bogus_error_responses;
26
27/* From ip_fragment.c */
28extern int sysctl_ipfrag_low_thresh;
29extern int sysctl_ipfrag_high_thresh;
30extern int sysctl_ipfrag_time;
31extern int sysctl_ipfrag_secret_interval;
32
33/* From ip_output.c */
34extern int sysctl_ip_dynaddr;
35
36/* From icmp.c */
37extern int sysctl_icmp_ratelimit;
38extern int sysctl_icmp_ratemask;
39
40/* From igmp.c */
41extern int sysctl_igmp_max_memberships;
42extern int sysctl_igmp_max_msf;
43
44/* From inetpeer.c */
45extern int inet_peer_threshold;
46extern int inet_peer_minttl;
47extern int inet_peer_maxttl;
48extern int inet_peer_gc_mintime;
49extern int inet_peer_gc_maxtime;
50
51#ifdef CONFIG_SYSCTL
52static int tcp_retr1_max = 255;
53static int ip_local_port_range_min[] = { 1, 1 };
54static int ip_local_port_range_max[] = { 65535, 65535 };
55#endif
56
57struct ipv4_config ipv4_config;
58
59extern ctl_table ipv4_route_table[];
60
61#ifdef CONFIG_SYSCTL
62
63static
64int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
65 void __user *buffer, size_t *lenp, loff_t *ppos)
66{
67 int val = ipv4_devconf.forwarding;
68 int ret;
69
70 ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
71
72 if (write && ipv4_devconf.forwarding != val)
73 inet_forward_change();
74
75 return ret;
76}
77
78static int ipv4_sysctl_forward_strategy(ctl_table *table,
79 int __user *name, int nlen,
80 void __user *oldval, size_t __user *oldlenp,
81 void __user *newval, size_t newlen,
82 void **context)
83{
84 int *valp = table->data;
85 int new;
86
87 if (!newval || !newlen)
88 return 0;
89
90 if (newlen != sizeof(int))
91 return -EINVAL;
92
93 if (get_user(new, (int __user *)newval))
94 return -EFAULT;
95
96 if (new == *valp)
97 return 0;
98
99 if (oldval && oldlenp) {
100 size_t len;
101
102 if (get_user(len, oldlenp))
103 return -EFAULT;
104
105 if (len) {
106 if (len > table->maxlen)
107 len = table->maxlen;
108 if (copy_to_user(oldval, valp, len))
109 return -EFAULT;
110 if (put_user(len, oldlenp))
111 return -EFAULT;
112 }
113 }
114
115 *valp = new;
116 inet_forward_change();
117 return 1;
118}
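/*
 * Usage sketch for the two forwarding handlers above: writing to
 * /proc/sys/net/ipv4/ip_forward (e.g. "echo 1 > /proc/sys/net/ipv4/ip_forward")
 * goes through proc_dointvec as usual, and whenever the stored value
 * actually changes, inet_forward_change() is called to propagate the new
 * setting to the per-device forwarding configuration; the strategy variant
 * does the same for the binary sysctl(2) interface.
 */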
119
120ctl_table ipv4_table[] = {
121 {
122 .ctl_name = NET_IPV4_TCP_TIMESTAMPS,
123 .procname = "tcp_timestamps",
124 .data = &sysctl_tcp_timestamps,
125 .maxlen = sizeof(int),
126 .mode = 0644,
127 .proc_handler = &proc_dointvec
128 },
129 {
130 .ctl_name = NET_IPV4_TCP_WINDOW_SCALING,
131 .procname = "tcp_window_scaling",
132 .data = &sysctl_tcp_window_scaling,
133 .maxlen = sizeof(int),
134 .mode = 0644,
135 .proc_handler = &proc_dointvec
136 },
137 {
138 .ctl_name = NET_IPV4_TCP_SACK,
139 .procname = "tcp_sack",
140 .data = &sysctl_tcp_sack,
141 .maxlen = sizeof(int),
142 .mode = 0644,
143 .proc_handler = &proc_dointvec
144 },
145 {
146 .ctl_name = NET_IPV4_TCP_RETRANS_COLLAPSE,
147 .procname = "tcp_retrans_collapse",
148 .data = &sysctl_tcp_retrans_collapse,
149 .maxlen = sizeof(int),
150 .mode = 0644,
151 .proc_handler = &proc_dointvec
152 },
153 {
154 .ctl_name = NET_IPV4_FORWARD,
155 .procname = "ip_forward",
156 .data = &ipv4_devconf.forwarding,
157 .maxlen = sizeof(int),
158 .mode = 0644,
159 .proc_handler = &ipv4_sysctl_forward,
160 .strategy = &ipv4_sysctl_forward_strategy
161 },
162 {
163 .ctl_name = NET_IPV4_DEFAULT_TTL,
164 .procname = "ip_default_ttl",
165 .data = &sysctl_ip_default_ttl,
166 .maxlen = sizeof(int),
167 .mode = 0644,
168 .proc_handler = &ipv4_doint_and_flush,
169 .strategy = &ipv4_doint_and_flush_strategy,
170 },
171 {
172 .ctl_name = NET_IPV4_AUTOCONFIG,
173 .procname = "ip_autoconfig",
174 .data = &ipv4_config.autoconfig,
175 .maxlen = sizeof(int),
176 .mode = 0644,
177 .proc_handler = &proc_dointvec
178 },
179 {
180 .ctl_name = NET_IPV4_NO_PMTU_DISC,
181 .procname = "ip_no_pmtu_disc",
182 .data = &ipv4_config.no_pmtu_disc,
183 .maxlen = sizeof(int),
184 .mode = 0644,
185 .proc_handler = &proc_dointvec
186 },
187 {
188 .ctl_name = NET_IPV4_NONLOCAL_BIND,
189 .procname = "ip_nonlocal_bind",
190 .data = &sysctl_ip_nonlocal_bind,
191 .maxlen = sizeof(int),
192 .mode = 0644,
193 .proc_handler = &proc_dointvec
194 },
195 {
196 .ctl_name = NET_IPV4_TCP_SYN_RETRIES,
197 .procname = "tcp_syn_retries",
198 .data = &sysctl_tcp_syn_retries,
199 .maxlen = sizeof(int),
200 .mode = 0644,
201 .proc_handler = &proc_dointvec
202 },
203 {
204 .ctl_name = NET_TCP_SYNACK_RETRIES,
205 .procname = "tcp_synack_retries",
206 .data = &sysctl_tcp_synack_retries,
207 .maxlen = sizeof(int),
208 .mode = 0644,
209 .proc_handler = &proc_dointvec
210 },
211 {
212 .ctl_name = NET_TCP_MAX_ORPHANS,
213 .procname = "tcp_max_orphans",
214 .data = &sysctl_tcp_max_orphans,
215 .maxlen = sizeof(int),
216 .mode = 0644,
217 .proc_handler = &proc_dointvec
218 },
219 {
220 .ctl_name = NET_TCP_MAX_TW_BUCKETS,
221 .procname = "tcp_max_tw_buckets",
222 .data = &sysctl_tcp_max_tw_buckets,
223 .maxlen = sizeof(int),
224 .mode = 0644,
225 .proc_handler = &proc_dointvec
226 },
227 {
228 .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH,
229 .procname = "ipfrag_high_thresh",
230 .data = &sysctl_ipfrag_high_thresh,
231 .maxlen = sizeof(int),
232 .mode = 0644,
233 .proc_handler = &proc_dointvec
234 },
235 {
236 .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH,
237 .procname = "ipfrag_low_thresh",
238 .data = &sysctl_ipfrag_low_thresh,
239 .maxlen = sizeof(int),
240 .mode = 0644,
241 .proc_handler = &proc_dointvec
242 },
243 {
244 .ctl_name = NET_IPV4_DYNADDR,
245 .procname = "ip_dynaddr",
246 .data = &sysctl_ip_dynaddr,
247 .maxlen = sizeof(int),
248 .mode = 0644,
249 .proc_handler = &proc_dointvec
250 },
251 {
252 .ctl_name = NET_IPV4_IPFRAG_TIME,
253 .procname = "ipfrag_time",
254 .data = &sysctl_ipfrag_time,
255 .maxlen = sizeof(int),
256 .mode = 0644,
257 .proc_handler = &proc_dointvec_jiffies,
258 .strategy = &sysctl_jiffies
259 },
260 {
261 .ctl_name = NET_IPV4_TCP_KEEPALIVE_TIME,
262 .procname = "tcp_keepalive_time",
263 .data = &sysctl_tcp_keepalive_time,
264 .maxlen = sizeof(int),
265 .mode = 0644,
266 .proc_handler = &proc_dointvec_jiffies,
267 .strategy = &sysctl_jiffies
268 },
269 {
270 .ctl_name = NET_IPV4_TCP_KEEPALIVE_PROBES,
271 .procname = "tcp_keepalive_probes",
272 .data = &sysctl_tcp_keepalive_probes,
273 .maxlen = sizeof(int),
274 .mode = 0644,
275 .proc_handler = &proc_dointvec
276 },
277 {
278 .ctl_name = NET_IPV4_TCP_KEEPALIVE_INTVL,
279 .procname = "tcp_keepalive_intvl",
280 .data = &sysctl_tcp_keepalive_intvl,
281 .maxlen = sizeof(int),
282 .mode = 0644,
283 .proc_handler = &proc_dointvec_jiffies,
284 .strategy = &sysctl_jiffies
285 },
286 {
287 .ctl_name = NET_IPV4_TCP_RETRIES1,
288 .procname = "tcp_retries1",
289 .data = &sysctl_tcp_retries1,
290 .maxlen = sizeof(int),
291 .mode = 0644,
292 .proc_handler = &proc_dointvec_minmax,
293 .strategy = &sysctl_intvec,
294 .extra2 = &tcp_retr1_max
295 },
296 {
297 .ctl_name = NET_IPV4_TCP_RETRIES2,
298 .procname = "tcp_retries2",
299 .data = &sysctl_tcp_retries2,
300 .maxlen = sizeof(int),
301 .mode = 0644,
302 .proc_handler = &proc_dointvec
303 },
304 {
305 .ctl_name = NET_IPV4_TCP_FIN_TIMEOUT,
306 .procname = "tcp_fin_timeout",
307 .data = &sysctl_tcp_fin_timeout,
308 .maxlen = sizeof(int),
309 .mode = 0644,
310 .proc_handler = &proc_dointvec_jiffies,
311 .strategy = &sysctl_jiffies
312 },
313#ifdef CONFIG_SYN_COOKIES
314 {
315 .ctl_name = NET_TCP_SYNCOOKIES,
316 .procname = "tcp_syncookies",
317 .data = &sysctl_tcp_syncookies,
318 .maxlen = sizeof(int),
319 .mode = 0644,
320 .proc_handler = &proc_dointvec
321 },
322#endif
323 {
324 .ctl_name = NET_TCP_TW_RECYCLE,
325 .procname = "tcp_tw_recycle",
326 .data = &sysctl_tcp_tw_recycle,
327 .maxlen = sizeof(int),
328 .mode = 0644,
329 .proc_handler = &proc_dointvec
330 },
331 {
332 .ctl_name = NET_TCP_ABORT_ON_OVERFLOW,
333 .procname = "tcp_abort_on_overflow",
334 .data = &sysctl_tcp_abort_on_overflow,
335 .maxlen = sizeof(int),
336 .mode = 0644,
337 .proc_handler = &proc_dointvec
338 },
339 {
340 .ctl_name = NET_TCP_STDURG,
341 .procname = "tcp_stdurg",
342 .data = &sysctl_tcp_stdurg,
343 .maxlen = sizeof(int),
344 .mode = 0644,
345 .proc_handler = &proc_dointvec
346 },
347 {
348 .ctl_name = NET_TCP_RFC1337,
349 .procname = "tcp_rfc1337",
350 .data = &sysctl_tcp_rfc1337,
351 .maxlen = sizeof(int),
352 .mode = 0644,
353 .proc_handler = &proc_dointvec
354 },
355 {
356 .ctl_name = NET_TCP_MAX_SYN_BACKLOG,
357 .procname = "tcp_max_syn_backlog",
358 .data = &sysctl_max_syn_backlog,
359 .maxlen = sizeof(int),
360 .mode = 0644,
361 .proc_handler = &proc_dointvec
362 },
363 {
364 .ctl_name = NET_IPV4_LOCAL_PORT_RANGE,
365 .procname = "ip_local_port_range",
366 .data = &sysctl_local_port_range,
367 .maxlen = sizeof(sysctl_local_port_range),
368 .mode = 0644,
369 .proc_handler = &proc_dointvec_minmax,
370 .strategy = &sysctl_intvec,
371 .extra1 = ip_local_port_range_min,
372 .extra2 = ip_local_port_range_max
373 },
374 {
375 .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_ALL,
376 .procname = "icmp_echo_ignore_all",
377 .data = &sysctl_icmp_echo_ignore_all,
378 .maxlen = sizeof(int),
379 .mode = 0644,
380 .proc_handler = &proc_dointvec
381 },
382 {
383 .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS,
384 .procname = "icmp_echo_ignore_broadcasts",
385 .data = &sysctl_icmp_echo_ignore_broadcasts,
386 .maxlen = sizeof(int),
387 .mode = 0644,
388 .proc_handler = &proc_dointvec
389 },
390 {
391 .ctl_name = NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES,
392 .procname = "icmp_ignore_bogus_error_responses",
393 .data = &sysctl_icmp_ignore_bogus_error_responses,
394 .maxlen = sizeof(int),
395 .mode = 0644,
396 .proc_handler = &proc_dointvec
397 },
398 {
399 .ctl_name = NET_IPV4_ROUTE,
400 .procname = "route",
401 .maxlen = 0,
402 .mode = 0555,
403 .child = ipv4_route_table
404 },
405#ifdef CONFIG_IP_MULTICAST
406 {
407 .ctl_name = NET_IPV4_IGMP_MAX_MEMBERSHIPS,
408 .procname = "igmp_max_memberships",
409 .data = &sysctl_igmp_max_memberships,
410 .maxlen = sizeof(int),
411 .mode = 0644,
412 .proc_handler = &proc_dointvec
413 },
414
415#endif
416 {
417 .ctl_name = NET_IPV4_IGMP_MAX_MSF,
418 .procname = "igmp_max_msf",
419 .data = &sysctl_igmp_max_msf,
420 .maxlen = sizeof(int),
421 .mode = 0644,
422 .proc_handler = &proc_dointvec
423 },
424 {
425 .ctl_name = NET_IPV4_INET_PEER_THRESHOLD,
426 .procname = "inet_peer_threshold",
427 .data = &inet_peer_threshold,
428 .maxlen = sizeof(int),
429 .mode = 0644,
430 .proc_handler = &proc_dointvec
431 },
432 {
433 .ctl_name = NET_IPV4_INET_PEER_MINTTL,
434 .procname = "inet_peer_minttl",
435 .data = &inet_peer_minttl,
436 .maxlen = sizeof(int),
437 .mode = 0644,
438 .proc_handler = &proc_dointvec_jiffies,
439 .strategy = &sysctl_jiffies
440 },
441 {
442 .ctl_name = NET_IPV4_INET_PEER_MAXTTL,
443 .procname = "inet_peer_maxttl",
444 .data = &inet_peer_maxttl,
445 .maxlen = sizeof(int),
446 .mode = 0644,
447 .proc_handler = &proc_dointvec_jiffies,
448 .strategy = &sysctl_jiffies
449 },
450 {
451 .ctl_name = NET_IPV4_INET_PEER_GC_MINTIME,
452 .procname = "inet_peer_gc_mintime",
453 .data = &inet_peer_gc_mintime,
454 .maxlen = sizeof(int),
455 .mode = 0644,
456 .proc_handler = &proc_dointvec_jiffies,
457 .strategy = &sysctl_jiffies
458 },
459 {
460 .ctl_name = NET_IPV4_INET_PEER_GC_MAXTIME,
461 .procname = "inet_peer_gc_maxtime",
462 .data = &inet_peer_gc_maxtime,
463 .maxlen = sizeof(int),
464 .mode = 0644,
465 .proc_handler = &proc_dointvec_jiffies,
466 .strategy = &sysctl_jiffies
467 },
468 {
469 .ctl_name = NET_TCP_ORPHAN_RETRIES,
470 .procname = "tcp_orphan_retries",
471 .data = &sysctl_tcp_orphan_retries,
472 .maxlen = sizeof(int),
473 .mode = 0644,
474 .proc_handler = &proc_dointvec
475 },
476 {
477 .ctl_name = NET_TCP_FACK,
478 .procname = "tcp_fack",
479 .data = &sysctl_tcp_fack,
480 .maxlen = sizeof(int),
481 .mode = 0644,
482 .proc_handler = &proc_dointvec
483 },
484 {
485 .ctl_name = NET_TCP_REORDERING,
486 .procname = "tcp_reordering",
487 .data = &sysctl_tcp_reordering,
488 .maxlen = sizeof(int),
489 .mode = 0644,
490 .proc_handler = &proc_dointvec
491 },
492 {
493 .ctl_name = NET_TCP_ECN,
494 .procname = "tcp_ecn",
495 .data = &sysctl_tcp_ecn,
496 .maxlen = sizeof(int),
497 .mode = 0644,
498 .proc_handler = &proc_dointvec
499 },
500 {
501 .ctl_name = NET_TCP_DSACK,
502 .procname = "tcp_dsack",
503 .data = &sysctl_tcp_dsack,
504 .maxlen = sizeof(int),
505 .mode = 0644,
506 .proc_handler = &proc_dointvec
507 },
508 {
509 .ctl_name = NET_TCP_MEM,
510 .procname = "tcp_mem",
511 .data = &sysctl_tcp_mem,
512 .maxlen = sizeof(sysctl_tcp_mem),
513 .mode = 0644,
514 .proc_handler = &proc_dointvec
515 },
516 {
517 .ctl_name = NET_TCP_WMEM,
518 .procname = "tcp_wmem",
519 .data = &sysctl_tcp_wmem,
520 .maxlen = sizeof(sysctl_tcp_wmem),
521 .mode = 0644,
522 .proc_handler = &proc_dointvec
523 },
524 {
525 .ctl_name = NET_TCP_RMEM,
526 .procname = "tcp_rmem",
527 .data = &sysctl_tcp_rmem,
528 .maxlen = sizeof(sysctl_tcp_rmem),
529 .mode = 0644,
530 .proc_handler = &proc_dointvec
531 },
532 {
533 .ctl_name = NET_TCP_APP_WIN,
534 .procname = "tcp_app_win",
535 .data = &sysctl_tcp_app_win,
536 .maxlen = sizeof(int),
537 .mode = 0644,
538 .proc_handler = &proc_dointvec
539 },
540 {
541 .ctl_name = NET_TCP_ADV_WIN_SCALE,
542 .procname = "tcp_adv_win_scale",
543 .data = &sysctl_tcp_adv_win_scale,
544 .maxlen = sizeof(int),
545 .mode = 0644,
546 .proc_handler = &proc_dointvec
547 },
548 {
549 .ctl_name = NET_IPV4_ICMP_RATELIMIT,
550 .procname = "icmp_ratelimit",
551 .data = &sysctl_icmp_ratelimit,
552 .maxlen = sizeof(int),
553 .mode = 0644,
554 .proc_handler = &proc_dointvec
555 },
556 {
557 .ctl_name = NET_IPV4_ICMP_RATEMASK,
558 .procname = "icmp_ratemask",
559 .data = &sysctl_icmp_ratemask,
560 .maxlen = sizeof(int),
561 .mode = 0644,
562 .proc_handler = &proc_dointvec
563 },
564 {
565 .ctl_name = NET_TCP_TW_REUSE,
566 .procname = "tcp_tw_reuse",
567 .data = &sysctl_tcp_tw_reuse,
568 .maxlen = sizeof(int),
569 .mode = 0644,
570 .proc_handler = &proc_dointvec
571 },
572 {
573 .ctl_name = NET_TCP_FRTO,
574 .procname = "tcp_frto",
575 .data = &sysctl_tcp_frto,
576 .maxlen = sizeof(int),
577 .mode = 0644,
578 .proc_handler = &proc_dointvec
579 },
580 {
581 .ctl_name = NET_TCP_LOW_LATENCY,
582 .procname = "tcp_low_latency",
583 .data = &sysctl_tcp_low_latency,
584 .maxlen = sizeof(int),
585 .mode = 0644,
586 .proc_handler = &proc_dointvec
587 },
588 {
589 .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL,
590 .procname = "ipfrag_secret_interval",
591 .data = &sysctl_ipfrag_secret_interval,
592 .maxlen = sizeof(int),
593 .mode = 0644,
594 .proc_handler = &proc_dointvec_jiffies,
595 .strategy = &sysctl_jiffies
596 },
597 {
598 .ctl_name = NET_TCP_NO_METRICS_SAVE,
599 .procname = "tcp_no_metrics_save",
600 .data = &sysctl_tcp_nometrics_save,
601 .maxlen = sizeof(int),
602 .mode = 0644,
603 .proc_handler = &proc_dointvec,
604 },
605 {
606 .ctl_name = NET_TCP_WESTWOOD,
607 .procname = "tcp_westwood",
608 .data = &sysctl_tcp_westwood,
609 .maxlen = sizeof(int),
610 .mode = 0644,
611 .proc_handler = &proc_dointvec,
612 },
613 {
614 .ctl_name = NET_TCP_VEGAS,
615 .procname = "tcp_vegas_cong_avoid",
616 .data = &sysctl_tcp_vegas_cong_avoid,
617 .maxlen = sizeof(int),
618 .mode = 0644,
619 .proc_handler = &proc_dointvec,
620 },
621 {
622 .ctl_name = NET_TCP_VEGAS_ALPHA,
623 .procname = "tcp_vegas_alpha",
624 .data = &sysctl_tcp_vegas_alpha,
625 .maxlen = sizeof(int),
626 .mode = 0644,
627 .proc_handler = &proc_dointvec,
628 },
629 {
630 .ctl_name = NET_TCP_VEGAS_BETA,
631 .procname = "tcp_vegas_beta",
632 .data = &sysctl_tcp_vegas_beta,
633 .maxlen = sizeof(int),
634 .mode = 0644,
635 .proc_handler = &proc_dointvec,
636 },
637 {
638 .ctl_name = NET_TCP_VEGAS_GAMMA,
639 .procname = "tcp_vegas_gamma",
640 .data = &sysctl_tcp_vegas_gamma,
641 .maxlen = sizeof(int),
642 .mode = 0644,
643 .proc_handler = &proc_dointvec,
644 },
645 {
646 .ctl_name = NET_TCP_BIC,
647 .procname = "tcp_bic",
648 .data = &sysctl_tcp_bic,
649 .maxlen = sizeof(int),
650 .mode = 0644,
651 .proc_handler = &proc_dointvec,
652 },
653 {
654 .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE,
655 .procname = "tcp_bic_fast_convergence",
656 .data = &sysctl_tcp_bic_fast_convergence,
657 .maxlen = sizeof(int),
658 .mode = 0644,
659 .proc_handler = &proc_dointvec,
660 },
661 {
662 .ctl_name = NET_TCP_BIC_LOW_WINDOW,
663 .procname = "tcp_bic_low_window",
664 .data = &sysctl_tcp_bic_low_window,
665 .maxlen = sizeof(int),
666 .mode = 0644,
667 .proc_handler = &proc_dointvec,
668 },
669 {
670 .ctl_name = NET_TCP_MODERATE_RCVBUF,
671 .procname = "tcp_moderate_rcvbuf",
672 .data = &sysctl_tcp_moderate_rcvbuf,
673 .maxlen = sizeof(int),
674 .mode = 0644,
675 .proc_handler = &proc_dointvec,
676 },
677 {
678 .ctl_name = NET_TCP_TSO_WIN_DIVISOR,
679 .procname = "tcp_tso_win_divisor",
680 .data = &sysctl_tcp_tso_win_divisor,
681 .maxlen = sizeof(int),
682 .mode = 0644,
683 .proc_handler = &proc_dointvec,
684 },
685 {
686 .ctl_name = NET_TCP_BIC_BETA,
687 .procname = "tcp_bic_beta",
688 .data = &sysctl_tcp_bic_beta,
689 .maxlen = sizeof(int),
690 .mode = 0644,
691 .proc_handler = &proc_dointvec,
692 },
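	/* An all-zero entry terminates the table for the sysctl registration code. */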
693 { .ctl_name = 0 }
694};
695
696#endif /* CONFIG_SYSCTL */
697
698EXPORT_SYMBOL(ipv4_config);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
new file mode 100644
index 000000000000..5cff56af7855
--- /dev/null
+++ b/net/ipv4/tcp.c
@@ -0,0 +1,2386 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * Fixes:
23 * Alan Cox : Numerous verify_area() calls
24 * Alan Cox : Set the ACK bit on a reset
25 * Alan Cox : Stopped it crashing if it closed while
26 * sk->inuse=1 and was trying to connect
27 * (tcp_err()).
28 * Alan Cox : All icmp error handling was broken
29 * pointers passed where wrong and the
30 * socket was looked up backwards. Nobody
31 * tested any icmp error code obviously.
32 * Alan Cox : tcp_err() now handled properly. It
33 * wakes people on errors. poll
34 * behaves and the icmp error race
35 * has gone by moving it into sock.c
36 * Alan Cox : tcp_send_reset() fixed to work for
37 * everything not just packets for
38 * unknown sockets.
39 * Alan Cox : tcp option processing.
40 * Alan Cox : Reset tweaked (still not 100%) [Had
41 * syn rule wrong]
42 * Herp Rosmanith : More reset fixes
43 * Alan Cox : No longer acks invalid rst frames.
44 * Acking any kind of RST is right out.
45 * Alan Cox : Sets an ignore me flag on an rst
46 * receive otherwise odd bits of prattle
47 * escape still
48 * Alan Cox : Fixed another acking RST frame bug.
49 * Should stop LAN workplace lockups.
50 * Alan Cox : Some tidyups using the new skb list
51 * facilities
52 * Alan Cox : sk->keepopen now seems to work
53 * Alan Cox : Pulls options out correctly on accepts
54 * Alan Cox : Fixed assorted sk->rqueue->next errors
55 * Alan Cox : PSH doesn't end a TCP read. Switched a
56 * bit to skb ops.
57 * Alan Cox : Tidied tcp_data to avoid a potential
58 * nasty.
59 * Alan Cox : Added some better commenting, as the
60 * tcp is hard to follow
61 * Alan Cox : Removed incorrect check for 20 * psh
62 * Michael O'Reilly : ack < copied bug fix.
63 * Johannes Stille : Misc tcp fixes (not all in yet).
64 * Alan Cox : FIN with no memory -> CRASH
65 * Alan Cox : Added socket option proto entries.
66 * Also added awareness of them to accept.
67 * Alan Cox : Added TCP options (SOL_TCP)
68 * Alan Cox : Switched wakeup calls to callbacks,
69 * so the kernel can layer network
70 * sockets.
71 * Alan Cox : Use ip_tos/ip_ttl settings.
72 * Alan Cox : Handle FIN (more) properly (we hope).
73 * Alan Cox : RST frames sent on unsynchronised
74 * state ack error.
75 * Alan Cox : Put in missing check for SYN bit.
76 * Alan Cox : Added tcp_select_window() aka NET2E
77 * window non shrink trick.
78 * Alan Cox : Added a couple of small NET2E timer
79 * fixes
80 * Charles Hedrick : TCP fixes
81 * Toomas Tamm : TCP window fixes
82 * Alan Cox : Small URG fix to rlogin ^C ack fight
83 * Charles Hedrick : Rewrote most of it to actually work
84 * Linus : Rewrote tcp_read() and URG handling
85 * completely
86 * Gerhard Koerting: Fixed some missing timer handling
87 * Matthew Dillon : Reworked TCP machine states as per RFC
88 * Gerhard Koerting: PC/TCP workarounds
89 * Adam Caldwell : Assorted timer/timing errors
90 * Matthew Dillon : Fixed another RST bug
91 * Alan Cox : Move to kernel side addressing changes.
92 * Alan Cox : Beginning work on TCP fastpathing
93 * (not yet usable)
94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
95 * Alan Cox : TCP fast path debugging
96 * Alan Cox : Window clamping
97 * Michael Riepe : Bug in tcp_check()
98 * Matt Dillon : More TCP improvements and RST bug fixes
 99 * Matt Dillon : Yet more small nasties removed from the
100 * TCP code (Be very nice to this man if
101 * tcp finally works 100%) 8)
102 * Alan Cox : BSD accept semantics.
103 * Alan Cox : Reset on closedown bug.
104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
105 * Michael Pall : Handle poll() after URG properly in
106 * all cases.
107 * Michael Pall : Undo the last fix in tcp_read_urg()
108 * (multi URG PUSH broke rlogin).
109 * Michael Pall : Fix the multi URG PUSH problem in
110 * tcp_readable(), poll() after URG
111 * works now.
112 * Michael Pall : recv(...,MSG_OOB) never blocks in the
113 * BSD api.
114 * Alan Cox : Changed the semantics of sk->socket to
115 * fix a race and a signal problem with
116 * accept() and async I/O.
117 * Alan Cox : Relaxed the rules on tcp_sendto().
118 * Yury Shevchuk : Really fixed accept() blocking problem.
119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
120 * clients/servers which listen in on
121 * fixed ports.
122 * Alan Cox : Cleaned the above up and shrank it to
123 * a sensible code size.
124 * Alan Cox : Self connect lockup fix.
125 * Alan Cox : No connect to multicast.
126 * Ross Biro : Close unaccepted children on master
127 * socket close.
128 * Alan Cox : Reset tracing code.
129 * Alan Cox : Spurious resets on shutdown.
130 * Alan Cox : Giant 15 minute/60 second timer error
131 * Alan Cox : Small whoops in polling before an
132 * accept.
133 * Alan Cox : Kept the state trace facility since
134 * it's handy for debugging.
135 * Alan Cox : More reset handler fixes.
136 * Alan Cox : Started rewriting the code based on
137 * the RFC's for other useful protocol
138 * references see: Comer, KA9Q NOS, and
139 * for a reference on the difference
140 * between specifications and how BSD
141 * works see the 4.4lite source.
142 * A.N.Kuznetsov : Don't time wait on completion of tidy
143 * close.
144 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
145 * Linus Torvalds : Fixed BSD port reuse to work first syn
146 * Alan Cox : Reimplemented timers as per the RFC
147 * and using multiple timers for sanity.
148 * Alan Cox : Small bug fixes, and a lot of new
149 * comments.
150 * Alan Cox : Fixed dual reader crash by locking
151 * the buffers (much like datagram.c)
152 * Alan Cox : Fixed stuck sockets in probe. A probe
153 * now gets fed up of retrying without
154 * (even a no space) answer.
155 * Alan Cox : Extracted closing code better
156 * Alan Cox : Fixed the closing state machine to
157 * resemble the RFC.
158 * Alan Cox : More 'per spec' fixes.
159 * Jorge Cwik : Even faster checksumming.
160 * Alan Cox : tcp_data() doesn't ack illegal PSH
161 * only frames. At least one pc tcp stack
162 * generates them.
163 * Alan Cox : Cache last socket.
164 * Alan Cox : Per route irtt.
165 * Matt Day : poll()->select() match BSD precisely on error
166 * Alan Cox : New buffers
167 * Marc Tamsky : Various sk->prot->retransmits and
168 * sk->retransmits misupdating fixed.
169 * Fixed tcp_write_timeout: stuck close,
170 * and TCP syn retries gets used now.
171 * Mark Yarvis : In tcp_read_wakeup(), don't send an
172 * ack if state is TCP_CLOSED.
173 * Alan Cox : Look up device on a retransmit - routes may
174 * change. Doesn't yet cope with MSS shrink right
175 * but it's a start!
176 * Marc Tamsky : Closing in closing fixes.
177 * Mike Shaver : RFC1122 verifications.
178 * Alan Cox : rcv_saddr errors.
179 * Alan Cox : Block double connect().
180 * Alan Cox : Small hooks for enSKIP.
181 * Alexey Kuznetsov: Path MTU discovery.
182 * Alan Cox : Support soft errors.
183 * Alan Cox : Fix MTU discovery pathological case
184 * when the remote claims no mtu!
185 * Marc Tamsky : TCP_CLOSE fix.
186 * Colin (G3TNE) : Send a reset on syn ack replies in
187 * window but wrong (fixes NT lpd problems)
188 * Pedro Roque : Better TCP window handling, delayed ack.
189 * Joerg Reuter : No modification of locked buffers in
190 * tcp_do_retransmit()
191 * Eric Schenk : Changed receiver side silly window
192 * avoidance algorithm to BSD style
193 * algorithm. This doubles throughput
194 * against machines running Solaris,
195 * and seems to result in general
196 * improvement.
197 * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
198 * Willy Konynenberg : Transparent proxying support.
199 * Mike McLagan : Routing by source
200 * Keith Owens : Do proper merging with partial SKB's in
201 * tcp_do_sendmsg to avoid burstiness.
202 * Eric Schenk : Fix fast close down bug with
203 * shutdown() followed by close().
204 * Andi Kleen : Make poll agree with SIGIO
205 * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
206 * lingertime == 0 (RFC 793 ABORT Call)
207 * Hirokazu Takahashi : Use copy_from_user() instead of
208 * csum_and_copy_from_user() if possible.
209 *
210 * This program is free software; you can redistribute it and/or
211 * modify it under the terms of the GNU General Public License
212 * as published by the Free Software Foundation; either version
 213 * 2 of the License, or (at your option) any later version.
214 *
215 * Description of States:
216 *
217 * TCP_SYN_SENT sent a connection request, waiting for ack
218 *
219 * TCP_SYN_RECV received a connection request, sent ack,
220 * waiting for final ack in three-way handshake.
221 *
222 * TCP_ESTABLISHED connection established
223 *
224 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
225 * transmission of remaining buffered data
226 *
227 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
228 * to shutdown
229 *
230 * TCP_CLOSING both sides have shutdown but we still have
231 * data we have to finish sending
232 *
233 * TCP_TIME_WAIT timeout to catch resent junk before entering
234 * closed, can only be entered from FIN_WAIT2
235 * or CLOSING. Required because the other end
236 * may not have gotten our last ACK causing it
237 * to retransmit the data packet (which we ignore)
238 *
239 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
240 * us to finish writing our data and to shutdown
241 * (we have to close() to move on to LAST_ACK)
242 *
 243 * TCP_LAST_ACK our side has shutdown after remote has
244 * shutdown. There may still be data in our
245 * buffer that we have to finish sending
246 *
247 * TCP_CLOSE socket is finished
248 */
249
250#include <linux/config.h>
251#include <linux/module.h>
252#include <linux/types.h>
253#include <linux/fcntl.h>
254#include <linux/poll.h>
255#include <linux/init.h>
256#include <linux/smp_lock.h>
257#include <linux/fs.h>
258#include <linux/random.h>
259#include <linux/bootmem.h>
260
261#include <net/icmp.h>
262#include <net/tcp.h>
263#include <net/xfrm.h>
264#include <net/ip.h>
265
266
267#include <asm/uaccess.h>
268#include <asm/ioctls.h>
269
270int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274kmem_cache_t *tcp_openreq_cachep;
275kmem_cache_t *tcp_bucket_cachep;
276kmem_cache_t *tcp_timewait_cachep;
277
278atomic_t tcp_orphan_count = ATOMIC_INIT(0);
279
280int sysctl_tcp_mem[3];
281int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
282int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
283
284EXPORT_SYMBOL(sysctl_tcp_mem);
285EXPORT_SYMBOL(sysctl_tcp_rmem);
286EXPORT_SYMBOL(sysctl_tcp_wmem);
287
288atomic_t tcp_memory_allocated; /* Current allocated memory. */
289atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
290
291EXPORT_SYMBOL(tcp_memory_allocated);
292EXPORT_SYMBOL(tcp_sockets_allocated);
293
294/*
295 * Pressure flag: try to collapse.
 296 * Technical note: it is used by multiple contexts non-atomically.
 297 * All of sk_stream_mem_schedule() is of this nature: accounting
 298 * is strict, actions are advisory and have some latency.
299 */
300int tcp_memory_pressure;
301
302EXPORT_SYMBOL(tcp_memory_pressure);
303
304void tcp_enter_memory_pressure(void)
305{
306 if (!tcp_memory_pressure) {
307 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
308 tcp_memory_pressure = 1;
309 }
310}
311
312EXPORT_SYMBOL(tcp_enter_memory_pressure);
313
314/*
315 * LISTEN is a special case for poll..
316 */
317static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
318 poll_table *wait)
319{
320 return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
321}
322
323/*
324 * Wait for a TCP event.
325 *
326 * Note that we don't need to lock the socket, as the upper poll layers
327 * take care of normal races (between the test and the event) and we don't
328 * go look at any of the socket buffers directly.
329 */
330unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
331{
332 unsigned int mask;
333 struct sock *sk = sock->sk;
334 struct tcp_sock *tp = tcp_sk(sk);
335
336 poll_wait(file, sk->sk_sleep, wait);
337 if (sk->sk_state == TCP_LISTEN)
338 return tcp_listen_poll(sk, wait);
339
340 /* Socket is not locked. We are protected from async events
341 by poll logic and correct handling of state changes
 342 made by other threads is impossible in any case.
343 */
344
345 mask = 0;
346 if (sk->sk_err)
347 mask = POLLERR;
348
349 /*
350 * POLLHUP is certainly not done right. But poll() doesn't
351 * have a notion of HUP in just one direction, and for a
352 * socket the read side is more interesting.
353 *
354 * Some poll() documentation says that POLLHUP is incompatible
 355 * with the POLLOUT/POLLWR flags, so somebody should check all of
 356 * this. But be careful: it tends to be safer to return too many
357 * bits than too few, and you can easily break real applications
358 * if you don't tell them that something has hung up!
359 *
360 * Check-me.
361 *
 362 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
 363 * our fs/select.c). It means that after we receive EOF,
 364 * poll always returns immediately, making it impossible to poll()
 365 * for write() in state CLOSE_WAIT. One solution is evident --- to set
 366 * POLLHUP if and only if shutdown has been made in both directions.
 367 * Actually, it is interesting to look at how Solaris and DUX
 368 * solve this dilemma. I would prefer it if POLLHUP were maskable;
 369 * then we could set it on SND_SHUTDOWN. BTW the examples given
 370 * in Stevens' books assume exactly this behaviour, which explains
 371 * why POLLHUP is incompatible with POLLOUT. --ANK
372 *
373 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
374 * blocking on fresh not-connected or disconnected socket. --ANK
375 */
376 if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
377 mask |= POLLHUP;
378 if (sk->sk_shutdown & RCV_SHUTDOWN)
379 mask |= POLLIN | POLLRDNORM;
380
381 /* Connected? */
382 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 383 /* Potential race condition. If the read of tp below is
 384 * reordered above the read of sk->sk_state, we can be spuriously
 385 * woken in SYN_* states. */
386 if ((tp->rcv_nxt != tp->copied_seq) &&
387 (tp->urg_seq != tp->copied_seq ||
388 tp->rcv_nxt != tp->copied_seq + 1 ||
389 sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
390 mask |= POLLIN | POLLRDNORM;
391
392 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
393 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
394 mask |= POLLOUT | POLLWRNORM;
395 } else { /* send SIGIO later */
396 set_bit(SOCK_ASYNC_NOSPACE,
397 &sk->sk_socket->flags);
398 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
399
400 /* Race breaker. If space is freed after
401 * wspace test but before the flags are set,
402 * IO signal will be lost.
403 */
404 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
405 mask |= POLLOUT | POLLWRNORM;
406 }
407 }
408
409 if (tp->urg_data & TCP_URG_VALID)
410 mask |= POLLPRI;
411 }
412 return mask;
413}
414
415int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
416{
417 struct tcp_sock *tp = tcp_sk(sk);
418 int answ;
419
420 switch (cmd) {
421 case SIOCINQ:
422 if (sk->sk_state == TCP_LISTEN)
423 return -EINVAL;
424
425 lock_sock(sk);
426 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
427 answ = 0;
428 else if (sock_flag(sk, SOCK_URGINLINE) ||
429 !tp->urg_data ||
430 before(tp->urg_seq, tp->copied_seq) ||
431 !before(tp->urg_seq, tp->rcv_nxt)) {
432 answ = tp->rcv_nxt - tp->copied_seq;
433
434 /* Subtract 1, if FIN is in queue. */
435 if (answ && !skb_queue_empty(&sk->sk_receive_queue))
436 answ -=
437 ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
438 } else
439 answ = tp->urg_seq - tp->copied_seq;
440 release_sock(sk);
441 break;
442 case SIOCATMARK:
443 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
444 break;
445 case SIOCOUTQ:
446 if (sk->sk_state == TCP_LISTEN)
447 return -EINVAL;
448
449 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
450 answ = 0;
451 else
452 answ = tp->write_seq - tp->snd_una;
453 break;
454 default:
455 return -ENOIOCTLCMD;
456 };
457
458 return put_user(answ, (int __user *)arg);
459}
460
461
462int tcp_listen_start(struct sock *sk)
463{
464 struct inet_sock *inet = inet_sk(sk);
465 struct tcp_sock *tp = tcp_sk(sk);
466 struct tcp_listen_opt *lopt;
467
468 sk->sk_max_ack_backlog = 0;
469 sk->sk_ack_backlog = 0;
470 tp->accept_queue = tp->accept_queue_tail = NULL;
471 rwlock_init(&tp->syn_wait_lock);
472 tcp_delack_init(tp);
473
474 lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
475 if (!lopt)
476 return -ENOMEM;
477
478 memset(lopt, 0, sizeof(struct tcp_listen_opt));
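	/* Size the SYN queue: pick the smallest power of two, no smaller
	 * than 64, that covers sysctl_max_syn_backlog. */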
479 for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
480 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
481 break;
482 get_random_bytes(&lopt->hash_rnd, 4);
483
484 write_lock_bh(&tp->syn_wait_lock);
485 tp->listen_opt = lopt;
486 write_unlock_bh(&tp->syn_wait_lock);
487
 488 /* There is a race window here: we announce ourselves listening,
 489 * but this transition is still not validated by get_port().
 490 * It is OK, because this socket enters the hash table only
 491 * after validation is complete.
 492 */
493 sk->sk_state = TCP_LISTEN;
494 if (!sk->sk_prot->get_port(sk, inet->num)) {
495 inet->sport = htons(inet->num);
496
497 sk_dst_reset(sk);
498 sk->sk_prot->hash(sk);
499
500 return 0;
501 }
502
503 sk->sk_state = TCP_CLOSE;
504 write_lock_bh(&tp->syn_wait_lock);
505 tp->listen_opt = NULL;
506 write_unlock_bh(&tp->syn_wait_lock);
507 kfree(lopt);
508 return -EADDRINUSE;
509}
510
511/*
512 * This routine closes sockets which have been at least partially
513 * opened, but not yet accepted.
514 */
515
516static void tcp_listen_stop (struct sock *sk)
517{
518 struct tcp_sock *tp = tcp_sk(sk);
519 struct tcp_listen_opt *lopt = tp->listen_opt;
520 struct open_request *acc_req = tp->accept_queue;
521 struct open_request *req;
522 int i;
523
524 tcp_delete_keepalive_timer(sk);
525
526 /* make all the listen_opt local to us */
527 write_lock_bh(&tp->syn_wait_lock);
528 tp->listen_opt = NULL;
529 write_unlock_bh(&tp->syn_wait_lock);
530 tp->accept_queue = tp->accept_queue_tail = NULL;
531
532 if (lopt->qlen) {
533 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
534 while ((req = lopt->syn_table[i]) != NULL) {
535 lopt->syn_table[i] = req->dl_next;
536 lopt->qlen--;
537 tcp_openreq_free(req);
538
 539 /* Following the specs, it would be better either to send a FIN
 540 * (and enter FIN-WAIT-1, i.e. a normal close)
 541 * or to send an active reset (abort).
 542 * Certainly, that is pretty dangerous during a synflood, but it is
 543 * a bad justification for our negligence 8)
 544 * To be honest, we are not able to implement either
 545 * of the variants now. --ANK
 546 */
547 }
548 }
549 }
550 BUG_TRAP(!lopt->qlen);
551
552 kfree(lopt);
553
554 while ((req = acc_req) != NULL) {
555 struct sock *child = req->sk;
556
557 acc_req = req->dl_next;
558
559 local_bh_disable();
560 bh_lock_sock(child);
561 BUG_TRAP(!sock_owned_by_user(child));
562 sock_hold(child);
563
564 tcp_disconnect(child, O_NONBLOCK);
565
566 sock_orphan(child);
567
568 atomic_inc(&tcp_orphan_count);
569
570 tcp_destroy_sock(child);
571
572 bh_unlock_sock(child);
573 local_bh_enable();
574 sock_put(child);
575
576 sk_acceptq_removed(sk);
577 tcp_openreq_fastfree(req);
578 }
579 BUG_TRAP(!sk->sk_ack_backlog);
580}
581
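/* Mark this skb with PSH and remember how far we have pushed, so that
 * forced_push() can tell when the next push is due.
 */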
582static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
583{
584 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
585 tp->pushed_seq = tp->write_seq;
586}
587
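/* Force a push once more than half of the peer's largest observed window
 * has been queued since the last pushed byte.
 */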
588static inline int forced_push(struct tcp_sock *tp)
589{
590 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
591}
592
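/* Append a freshly allocated skb to the write queue: initialise its control
 * block, charge it to the socket, and make it the send head if nothing was
 * pending.
 */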
593static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
594 struct sk_buff *skb)
595{
596 skb->csum = 0;
597 TCP_SKB_CB(skb)->seq = tp->write_seq;
598 TCP_SKB_CB(skb)->end_seq = tp->write_seq;
599 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
600 TCP_SKB_CB(skb)->sacked = 0;
601 skb_header_release(skb);
602 __skb_queue_tail(&sk->sk_write_queue, skb);
603 sk_charge_skb(sk, skb);
604 if (!sk->sk_send_head)
605 sk->sk_send_head = skb;
606 else if (tp->nonagle&TCP_NAGLE_PUSH)
607 tp->nonagle &= ~TCP_NAGLE_PUSH;
608}
609
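/* For MSG_OOB sends, enter urgent mode and advance the urgent pointer to the
 * end of the data queued so far.
 */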
610static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
611 struct sk_buff *skb)
612{
613 if (flags & MSG_OOB) {
614 tp->urg_mode = 1;
615 tp->snd_up = tp->write_seq;
616 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
617 }
618}
619
620static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
621 int mss_now, int nonagle)
622{
623 if (sk->sk_send_head) {
624 struct sk_buff *skb = sk->sk_write_queue.prev;
625 if (!(flags & MSG_MORE) || forced_push(tp))
626 tcp_mark_push(tp, skb);
627 tcp_mark_urg(tp, flags, skb);
628 __tcp_push_pending_frames(sk, tp, mss_now,
629 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
630 }
631}
632
633static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
634 size_t psize, int flags)
635{
636 struct tcp_sock *tp = tcp_sk(sk);
637 int mss_now;
638 int err;
639 ssize_t copied;
640 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
641
642 /* Wait for a connection to finish. */
643 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
644 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
645 goto out_err;
646
647 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
648
649 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
650 copied = 0;
651
652 err = -EPIPE;
653 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
654 goto do_error;
655
656 while (psize > 0) {
657 struct sk_buff *skb = sk->sk_write_queue.prev;
658 struct page *page = pages[poffset / PAGE_SIZE];
659 int copy, i, can_coalesce;
660 int offset = poffset % PAGE_SIZE;
661 int size = min_t(size_t, psize, PAGE_SIZE - offset);
662
663 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
664new_segment:
665 if (!sk_stream_memory_free(sk))
666 goto wait_for_sndbuf;
667
668 skb = sk_stream_alloc_pskb(sk, 0, 0,
669 sk->sk_allocation);
670 if (!skb)
671 goto wait_for_memory;
672
673 skb_entail(sk, tp, skb);
674 copy = mss_now;
675 }
676
677 if (copy > size)
678 copy = size;
679
680 i = skb_shinfo(skb)->nr_frags;
681 can_coalesce = skb_can_coalesce(skb, i, page, offset);
682 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
683 tcp_mark_push(tp, skb);
684 goto new_segment;
685 }
686 if (sk->sk_forward_alloc < copy &&
687 !sk_stream_mem_schedule(sk, copy, 0))
688 goto wait_for_memory;
689
690 if (can_coalesce) {
691 skb_shinfo(skb)->frags[i - 1].size += copy;
692 } else {
693 get_page(page);
694 skb_fill_page_desc(skb, i, page, offset, copy);
695 }
696
697 skb->len += copy;
698 skb->data_len += copy;
699 skb->truesize += copy;
700 sk->sk_wmem_queued += copy;
701 sk->sk_forward_alloc -= copy;
702 skb->ip_summed = CHECKSUM_HW;
703 tp->write_seq += copy;
704 TCP_SKB_CB(skb)->end_seq += copy;
705 skb_shinfo(skb)->tso_segs = 0;
706
707 if (!copied)
708 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
709
710 copied += copy;
711 poffset += copy;
712 if (!(psize -= copy))
713 goto out;
714
715 if (skb->len != mss_now || (flags & MSG_OOB))
716 continue;
717
718 if (forced_push(tp)) {
719 tcp_mark_push(tp, skb);
720 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
721 } else if (skb == sk->sk_send_head)
722 tcp_push_one(sk, mss_now);
723 continue;
724
725wait_for_sndbuf:
726 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
727wait_for_memory:
728 if (copied)
729 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
730
731 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
732 goto do_error;
733
734 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
735 }
736
737out:
738 if (copied)
739 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
740 return copied;
741
742do_error:
743 if (copied)
744 goto out;
745out_err:
746 return sk_stream_error(sk, flags, err);
747}
748
749ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
750 size_t size, int flags)
751{
752 ssize_t res;
753 struct sock *sk = sock->sk;
754
755#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
756
757 if (!(sk->sk_route_caps & NETIF_F_SG) ||
758 !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
759 return sock_no_sendpage(sock, page, offset, size, flags);
760
761#undef TCP_ZC_CSUM_FLAGS
762
763 lock_sock(sk);
764 TCP_CHECK_TIMER(sk);
765 res = do_tcp_sendpages(sk, &page, offset, size, flags);
766 TCP_CHECK_TIMER(sk);
767 release_sock(sk);
768 return res;
769}
770
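/* Per-socket cache of a partially filled page (and the offset into it) used
 * below to coalesce small sendmsg() writes.
 */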
771#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
772#define TCP_OFF(sk) (sk->sk_sndmsg_off)
773
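/* Choose how much linear head room to allocate for a new segment: the cached
 * MSS, clamped so that the head fits a single page when the route supports
 * scatter-gather.
 */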
774static inline int select_size(struct sock *sk, struct tcp_sock *tp)
775{
776 int tmp = tp->mss_cache_std;
777
778 if (sk->sk_route_caps & NETIF_F_SG) {
779 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
780
781 if (tmp >= pgbreak &&
782 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
783 tmp = pgbreak;
784 }
785 return tmp;
786}
787
788int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
789 size_t size)
790{
791 struct iovec *iov;
792 struct tcp_sock *tp = tcp_sk(sk);
793 struct sk_buff *skb;
794 int iovlen, flags;
795 int mss_now;
796 int err, copied;
797 long timeo;
798
799 lock_sock(sk);
800 TCP_CHECK_TIMER(sk);
801
802 flags = msg->msg_flags;
803 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
804
805 /* Wait for a connection to finish. */
806 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
807 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
808 goto out_err;
809
810 /* This should be in poll */
811 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
812
813 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
814
815 /* Ok commence sending. */
816 iovlen = msg->msg_iovlen;
817 iov = msg->msg_iov;
818 copied = 0;
819
820 err = -EPIPE;
821 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
822 goto do_error;
823
824 while (--iovlen >= 0) {
825 int seglen = iov->iov_len;
826 unsigned char __user *from = iov->iov_base;
827
828 iov++;
829
830 while (seglen > 0) {
831 int copy;
832
833 skb = sk->sk_write_queue.prev;
834
835 if (!sk->sk_send_head ||
836 (copy = mss_now - skb->len) <= 0) {
837
838new_segment:
 839 /* Allocate a new segment. If the interface is SG,
 840 * allocate an skb that fits into a single page.
 841 */
842 if (!sk_stream_memory_free(sk))
843 goto wait_for_sndbuf;
844
845 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
846 0, sk->sk_allocation);
847 if (!skb)
848 goto wait_for_memory;
849
850 /*
851 * Check whether we can use HW checksum.
852 */
853 if (sk->sk_route_caps &
854 (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
855 NETIF_F_HW_CSUM))
856 skb->ip_summed = CHECKSUM_HW;
857
858 skb_entail(sk, tp, skb);
859 copy = mss_now;
860 }
861
862 /* Try to append data to the end of skb. */
863 if (copy > seglen)
864 copy = seglen;
865
866 /* Where to copy to? */
867 if (skb_tailroom(skb) > 0) {
868 /* We have some space in skb head. Superb! */
869 if (copy > skb_tailroom(skb))
870 copy = skb_tailroom(skb);
871 if ((err = skb_add_data(skb, from, copy)) != 0)
872 goto do_fault;
873 } else {
874 int merge = 0;
875 int i = skb_shinfo(skb)->nr_frags;
876 struct page *page = TCP_PAGE(sk);
877 int off = TCP_OFF(sk);
878
879 if (skb_can_coalesce(skb, i, page, off) &&
880 off != PAGE_SIZE) {
881 /* We can extend the last page
882 * fragment. */
883 merge = 1;
884 } else if (i == MAX_SKB_FRAGS ||
885 (!i &&
886 !(sk->sk_route_caps & NETIF_F_SG))) {
887 /* Need to add new fragment and cannot
888 * do this because interface is non-SG,
889 * or because all the page slots are
890 * busy. */
891 tcp_mark_push(tp, skb);
892 goto new_segment;
893 } else if (page) {
894 /* If page is cached, align
895 * offset to L1 cache boundary
896 */
897 off = (off + L1_CACHE_BYTES - 1) &
898 ~(L1_CACHE_BYTES - 1);
899 if (off == PAGE_SIZE) {
900 put_page(page);
901 TCP_PAGE(sk) = page = NULL;
902 }
903 }
904
905 if (!page) {
906 /* Allocate new cache page. */
907 if (!(page = sk_stream_alloc_page(sk)))
908 goto wait_for_memory;
909 off = 0;
910 }
911
912 if (copy > PAGE_SIZE - off)
913 copy = PAGE_SIZE - off;
914
915 /* Time to copy data. We are close to
916 * the end! */
917 err = skb_copy_to_page(sk, from, skb, page,
918 off, copy);
919 if (err) {
920 /* If this page was new, give it to the
921 * socket so it does not get leaked.
922 */
923 if (!TCP_PAGE(sk)) {
924 TCP_PAGE(sk) = page;
925 TCP_OFF(sk) = 0;
926 }
927 goto do_error;
928 }
929
930 /* Update the skb. */
931 if (merge) {
932 skb_shinfo(skb)->frags[i - 1].size +=
933 copy;
934 } else {
935 skb_fill_page_desc(skb, i, page, off, copy);
936 if (TCP_PAGE(sk)) {
937 get_page(page);
938 } else if (off + copy < PAGE_SIZE) {
939 get_page(page);
940 TCP_PAGE(sk) = page;
941 }
942 }
943
944 TCP_OFF(sk) = off + copy;
945 }
946
947 if (!copied)
948 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
949
950 tp->write_seq += copy;
951 TCP_SKB_CB(skb)->end_seq += copy;
952 skb_shinfo(skb)->tso_segs = 0;
953
954 from += copy;
955 copied += copy;
956 if ((seglen -= copy) == 0 && iovlen == 0)
957 goto out;
958
959 if (skb->len != mss_now || (flags & MSG_OOB))
960 continue;
961
962 if (forced_push(tp)) {
963 tcp_mark_push(tp, skb);
964 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
965 } else if (skb == sk->sk_send_head)
966 tcp_push_one(sk, mss_now);
967 continue;
968
969wait_for_sndbuf:
970 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
971wait_for_memory:
972 if (copied)
973 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
974
975 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
976 goto do_error;
977
978 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
979 }
980 }
981
982out:
983 if (copied)
984 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
985 TCP_CHECK_TIMER(sk);
986 release_sock(sk);
987 return copied;
988
989do_fault:
990 if (!skb->len) {
991 if (sk->sk_send_head == skb)
992 sk->sk_send_head = NULL;
993 __skb_unlink(skb, skb->list);
994 sk_stream_free_skb(sk, skb);
995 }
996
997do_error:
998 if (copied)
999 goto out;
1000out_err:
1001 err = sk_stream_error(sk, flags, err);
1002 TCP_CHECK_TIMER(sk);
1003 release_sock(sk);
1004 return err;
1005}
1006
1007/*
1008 * Handle reading urgent data. BSD has very simple semantics for
1009 * this, no blocking and very strange errors 8)
1010 */
1011
1012static int tcp_recv_urg(struct sock *sk, long timeo,
1013 struct msghdr *msg, int len, int flags,
1014 int *addr_len)
1015{
1016 struct tcp_sock *tp = tcp_sk(sk);
1017
1018 /* No URG data to read. */
1019 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1020 tp->urg_data == TCP_URG_READ)
1021 return -EINVAL; /* Yes this is right ! */
1022
1023 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1024 return -ENOTCONN;
1025
1026 if (tp->urg_data & TCP_URG_VALID) {
1027 int err = 0;
1028 char c = tp->urg_data;
1029
1030 if (!(flags & MSG_PEEK))
1031 tp->urg_data = TCP_URG_READ;
1032
1033 /* Read urgent data. */
1034 msg->msg_flags |= MSG_OOB;
1035
1036 if (len > 0) {
1037 if (!(flags & MSG_TRUNC))
1038 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1039 len = 1;
1040 } else
1041 msg->msg_flags |= MSG_TRUNC;
1042
1043 return err ? -EFAULT : len;
1044 }
1045
1046 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1047 return 0;
1048
1049 /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1050 * the available implementations agree in this case:
1051 * this call should never block, independent of the
1052 * blocking state of the socket.
1053 * Mike <pall@rz.uni-karlsruhe.de>
1054 */
1055 return -EAGAIN;
1056}
1057
1058/* Clean up the receive buffer for full frames taken by the user,
1059 * then send an ACK if necessary. COPIED is the number of bytes
 1060 * tcp_recvmsg has given to the user so far; it speeds up the
1061 * calculation of whether or not we must ACK for the sake of
1062 * a window update.
1063 */
1064static void cleanup_rbuf(struct sock *sk, int copied)
1065{
1066 struct tcp_sock *tp = tcp_sk(sk);
1067 int time_to_ack = 0;
1068
1069#if TCP_DEBUG
1070 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1071
1072 BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1073#endif
1074
1075 if (tcp_ack_scheduled(tp)) {
1076 /* Delayed ACKs frequently hit locked sockets during bulk
1077 * receive. */
1078 if (tp->ack.blocked ||
1079 /* Once-per-two-segments ACK was not sent by tcp_input.c */
1080 tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1081 /*
 1082 * If this read emptied the read buffer, we send an ACK when
 1083 * the connection is not bidirectional, the user has drained
 1084 * the receive buffer, and there was a small segment
 1085 * in the queue.
1086 */
1087 (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1088 !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1089 time_to_ack = 1;
1090 }
1091
1092 /* We send an ACK if we can now advertise a non-zero window
1093 * which has been raised "significantly".
1094 *
1095 * Even if window raised up to infinity, do not send window open ACK
1096 * in states, where we will not receive more. It is useless.
1097 */
1098 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1099 __u32 rcv_window_now = tcp_receive_window(tp);
1100
1101 /* Optimize, __tcp_select_window() is not cheap. */
1102 if (2*rcv_window_now <= tp->window_clamp) {
1103 __u32 new_window = __tcp_select_window(sk);
1104
 1105 /* Send an ACK now if this read freed lots of space
 1106 * in our buffer. new_window is the window we could now advertise;
 1107 * we can do so if it is not less than the current one.
 1108 * "Lots" means "at least twice" here.
1109 */
1110 if (new_window && new_window >= 2 * rcv_window_now)
1111 time_to_ack = 1;
1112 }
1113 }
1114 if (time_to_ack)
1115 tcp_send_ack(sk);
1116}
1117
1118static void tcp_prequeue_process(struct sock *sk)
1119{
1120 struct sk_buff *skb;
1121 struct tcp_sock *tp = tcp_sk(sk);
1122
1123 NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
1124
 1125 /* The RX process wants to run with BHs disabled, though it is not
 1126 * strictly necessary */
1127 local_bh_disable();
1128 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1129 sk->sk_backlog_rcv(sk, skb);
1130 local_bh_enable();
1131
1132 /* Clear memory counter. */
1133 tp->ucopy.memory = 0;
1134}
1135
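/* Find the receive-queue skb that covers sequence number 'seq' and report the
 * offset of that byte within it.  A SYN consumes one sequence number, and an
 * skb carrying a FIN is returned even when 'seq' points just past its data.
 */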
1136static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1137{
1138 struct sk_buff *skb;
1139 u32 offset;
1140
1141 skb_queue_walk(&sk->sk_receive_queue, skb) {
1142 offset = seq - TCP_SKB_CB(skb)->seq;
1143 if (skb->h.th->syn)
1144 offset--;
1145 if (offset < skb->len || skb->h.th->fin) {
1146 *off = offset;
1147 return skb;
1148 }
1149 }
1150 return NULL;
1151}
1152
1153/*
1154 * This routine provides an alternative to tcp_recvmsg() for routines
1155 * that would like to handle copying from skbuffs directly in 'sendfile'
1156 * fashion.
1157 * Note:
1158 * - It is assumed that the socket was locked by the caller.
1159 * - The routine does not block.
1160 * - At present, there is no support for reading OOB data
1161 * or for 'peeking' the socket using this routine
1162 * (although both would be easy to implement).
1163 */
1164int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1165 sk_read_actor_t recv_actor)
1166{
1167 struct sk_buff *skb;
1168 struct tcp_sock *tp = tcp_sk(sk);
1169 u32 seq = tp->copied_seq;
1170 u32 offset;
1171 int copied = 0;
1172
1173 if (sk->sk_state == TCP_LISTEN)
1174 return -ENOTCONN;
1175 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1176 if (offset < skb->len) {
1177 size_t used, len;
1178
1179 len = skb->len - offset;
1180 /* Stop reading if we hit a patch of urgent data */
1181 if (tp->urg_data) {
1182 u32 urg_offset = tp->urg_seq - seq;
1183 if (urg_offset < len)
1184 len = urg_offset;
1185 if (!len)
1186 break;
1187 }
1188 used = recv_actor(desc, skb, offset, len);
1189 if (used <= len) {
1190 seq += used;
1191 copied += used;
1192 offset += used;
1193 }
1194 if (offset != skb->len)
1195 break;
1196 }
1197 if (skb->h.th->fin) {
1198 sk_eat_skb(sk, skb);
1199 ++seq;
1200 break;
1201 }
1202 sk_eat_skb(sk, skb);
1203 if (!desc->count)
1204 break;
1205 }
1206 tp->copied_seq = seq;
1207
1208 tcp_rcv_space_adjust(sk);
1209
1210 /* Clean up data we have read: This will do ACK frames. */
1211 if (copied)
1212 cleanup_rbuf(sk, copied);
1213 return copied;
1214}
1215
1216/*
1217 * This routine copies from a sock struct into the user buffer.
1218 *
 1219 * Technical note: in 2.3 we work on a _locked_ socket, so that
 1220 * tricks with *seq access order and skb->users are not required.
 1221 * Probably, the code can easily be improved even more.
1222 */
1223
1224int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1225 size_t len, int nonblock, int flags, int *addr_len)
1226{
1227 struct tcp_sock *tp = tcp_sk(sk);
1228 int copied = 0;
1229 u32 peek_seq;
1230 u32 *seq;
1231 unsigned long used;
1232 int err;
1233 int target; /* Read at least this many bytes */
1234 long timeo;
1235 struct task_struct *user_recv = NULL;
1236
1237 lock_sock(sk);
1238
1239 TCP_CHECK_TIMER(sk);
1240
1241 err = -ENOTCONN;
1242 if (sk->sk_state == TCP_LISTEN)
1243 goto out;
1244
1245 timeo = sock_rcvtimeo(sk, nonblock);
1246
1247 /* Urgent data needs to be handled specially. */
1248 if (flags & MSG_OOB)
1249 goto recv_urg;
1250
1251 seq = &tp->copied_seq;
1252 if (flags & MSG_PEEK) {
1253 peek_seq = tp->copied_seq;
1254 seq = &peek_seq;
1255 }
1256
1257 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1258
1259 do {
1260 struct sk_buff *skb;
1261 u32 offset;
1262
1263 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1264 if (tp->urg_data && tp->urg_seq == *seq) {
1265 if (copied)
1266 break;
1267 if (signal_pending(current)) {
1268 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1269 break;
1270 }
1271 }
1272
1273 /* Next get a buffer. */
1274
1275 skb = skb_peek(&sk->sk_receive_queue);
1276 do {
1277 if (!skb)
1278 break;
1279
1280 /* Now that we have two receive queues this
1281 * shouldn't happen.
1282 */
1283 if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1284 printk(KERN_INFO "recvmsg bug: copied %X "
1285 "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1286 break;
1287 }
1288 offset = *seq - TCP_SKB_CB(skb)->seq;
1289 if (skb->h.th->syn)
1290 offset--;
1291 if (offset < skb->len)
1292 goto found_ok_skb;
1293 if (skb->h.th->fin)
1294 goto found_fin_ok;
1295 BUG_TRAP(flags & MSG_PEEK);
1296 skb = skb->next;
1297 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1298
 1299 /* Well, if we have a backlog, try to process it now. */
1300
1301 if (copied >= target && !sk->sk_backlog.tail)
1302 break;
1303
1304 if (copied) {
1305 if (sk->sk_err ||
1306 sk->sk_state == TCP_CLOSE ||
1307 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1308 !timeo ||
1309 signal_pending(current) ||
1310 (flags & MSG_PEEK))
1311 break;
1312 } else {
1313 if (sock_flag(sk, SOCK_DONE))
1314 break;
1315
1316 if (sk->sk_err) {
1317 copied = sock_error(sk);
1318 break;
1319 }
1320
1321 if (sk->sk_shutdown & RCV_SHUTDOWN)
1322 break;
1323
1324 if (sk->sk_state == TCP_CLOSE) {
1325 if (!sock_flag(sk, SOCK_DONE)) {
 1326 /* This occurs when the user tries to read
 1327 * from a never-connected socket.
 1328 */
1329 copied = -ENOTCONN;
1330 break;
1331 }
1332 break;
1333 }
1334
1335 if (!timeo) {
1336 copied = -EAGAIN;
1337 break;
1338 }
1339
1340 if (signal_pending(current)) {
1341 copied = sock_intr_errno(timeo);
1342 break;
1343 }
1344 }
1345
1346 cleanup_rbuf(sk, copied);
1347
1348 if (tp->ucopy.task == user_recv) {
1349 /* Install new reader */
1350 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1351 user_recv = current;
1352 tp->ucopy.task = user_recv;
1353 tp->ucopy.iov = msg->msg_iov;
1354 }
1355
1356 tp->ucopy.len = len;
1357
1358 BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1359 (flags & (MSG_PEEK | MSG_TRUNC)));
1360
 1361 /* Ugly... If the prequeue is not empty, we have to
 1362 * process it before releasing the socket, otherwise
 1363 * ordering will be broken on the second iteration.
 1364 * A more elegant solution is required!!!
1365 *
1366 * Look: we have the following (pseudo)queues:
1367 *
1368 * 1. packets in flight
1369 * 2. backlog
1370 * 3. prequeue
1371 * 4. receive_queue
1372 *
1373 * Each queue can be processed only if the next ones
1374 * are empty. At this point we have empty receive_queue.
 1375 * But the prequeue _can_ be non-empty after the 2nd iteration,
1376 * when we jumped to start of loop because backlog
1377 * processing added something to receive_queue.
1378 * We cannot release_sock(), because backlog contains
1379 * packets arrived _after_ prequeued ones.
1380 *
 1381 * In short, the algorithm is clear --- process all
 1382 * the queues in order. We could do it more directly,
 1383 * requeueing packets from the backlog to the prequeue if it
 1384 * is not empty. That is more elegant, but eats cycles,
 1385 * unfortunately.
1386 */
1387 if (skb_queue_len(&tp->ucopy.prequeue))
1388 goto do_prequeue;
1389
1390 /* __ Set realtime policy in scheduler __ */
1391 }
1392
1393 if (copied >= target) {
1394 /* Do not sleep, just process backlog. */
1395 release_sock(sk);
1396 lock_sock(sk);
1397 } else
1398 sk_wait_data(sk, &timeo);
1399
1400 if (user_recv) {
1401 int chunk;
1402
1403 /* __ Restore normal policy in scheduler __ */
1404
1405 if ((chunk = len - tp->ucopy.len) != 0) {
1406 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1407 len -= chunk;
1408 copied += chunk;
1409 }
1410
1411 if (tp->rcv_nxt == tp->copied_seq &&
1412 skb_queue_len(&tp->ucopy.prequeue)) {
1413do_prequeue:
1414 tcp_prequeue_process(sk);
1415
1416 if ((chunk = len - tp->ucopy.len) != 0) {
1417 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1418 len -= chunk;
1419 copied += chunk;
1420 }
1421 }
1422 }
1423 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1424 if (net_ratelimit())
1425 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1426 current->comm, current->pid);
1427 peek_seq = tp->copied_seq;
1428 }
1429 continue;
1430
1431 found_ok_skb:
1432 /* Ok so how much can we use? */
1433 used = skb->len - offset;
1434 if (len < used)
1435 used = len;
1436
1437 /* Do we have urgent data here? */
1438 if (tp->urg_data) {
1439 u32 urg_offset = tp->urg_seq - *seq;
1440 if (urg_offset < used) {
1441 if (!urg_offset) {
1442 if (!sock_flag(sk, SOCK_URGINLINE)) {
1443 ++*seq;
1444 offset++;
1445 used--;
1446 if (!used)
1447 goto skip_copy;
1448 }
1449 } else
1450 used = urg_offset;
1451 }
1452 }
1453
1454 if (!(flags & MSG_TRUNC)) {
1455 err = skb_copy_datagram_iovec(skb, offset,
1456 msg->msg_iov, used);
1457 if (err) {
1458 /* Exception. Bailout! */
1459 if (!copied)
1460 copied = -EFAULT;
1461 break;
1462 }
1463 }
1464
1465 *seq += used;
1466 copied += used;
1467 len -= used;
1468
1469 tcp_rcv_space_adjust(sk);
1470
1471skip_copy:
1472 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1473 tp->urg_data = 0;
1474 tcp_fast_path_check(sk, tp);
1475 }
1476 if (used + offset < skb->len)
1477 continue;
1478
1479 if (skb->h.th->fin)
1480 goto found_fin_ok;
1481 if (!(flags & MSG_PEEK))
1482 sk_eat_skb(sk, skb);
1483 continue;
1484
1485 found_fin_ok:
1486 /* Process the FIN. */
1487 ++*seq;
1488 if (!(flags & MSG_PEEK))
1489 sk_eat_skb(sk, skb);
1490 break;
1491 } while (len > 0);
1492
1493 if (user_recv) {
1494 if (skb_queue_len(&tp->ucopy.prequeue)) {
1495 int chunk;
1496
1497 tp->ucopy.len = copied > 0 ? len : 0;
1498
1499 tcp_prequeue_process(sk);
1500
1501 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1502 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1503 len -= chunk;
1504 copied += chunk;
1505 }
1506 }
1507
1508 tp->ucopy.task = NULL;
1509 tp->ucopy.len = 0;
1510 }
1511
1512 /* According to UNIX98, msg_name/msg_namelen are ignored
 1513 * on a connected socket. I was just happy when I found this 8) --ANK
1514 */
1515
1516 /* Clean up data we have read: This will do ACK frames. */
1517 cleanup_rbuf(sk, copied);
1518
1519 TCP_CHECK_TIMER(sk);
1520 release_sock(sk);
1521 return copied;
1522
1523out:
1524 TCP_CHECK_TIMER(sk);
1525 release_sock(sk);
1526 return err;
1527
1528recv_urg:
1529 err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1530 goto out;
1531}
1532
1533/*
1534 * State processing on a close. This implements the state shift for
1535 * sending our FIN frame. Note that we only send a FIN for some
1536 * states. A shutdown() may have already sent the FIN, or we may be
1537 * closed.
1538 */
1539
1540static unsigned char new_state[16] = {
1541 /* current state: new state: action: */
1542 /* (Invalid) */ TCP_CLOSE,
1543 /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1544 /* TCP_SYN_SENT */ TCP_CLOSE,
1545 /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1546 /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
1547 /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
1548 /* TCP_TIME_WAIT */ TCP_CLOSE,
1549 /* TCP_CLOSE */ TCP_CLOSE,
1550 /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
1551 /* TCP_LAST_ACK */ TCP_LAST_ACK,
1552 /* TCP_LISTEN */ TCP_CLOSE,
1553 /* TCP_CLOSING */ TCP_CLOSING,
1554};
1555
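/* Each new_state[] entry packs the next state (TCP_STATE_MASK bits) with an
 * optional TCP_ACTION_FIN flag; tcp_close_state() applies the transition and
 * reports whether a FIN still needs to be sent.
 */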
1556static int tcp_close_state(struct sock *sk)
1557{
1558 int next = (int)new_state[sk->sk_state];
1559 int ns = next & TCP_STATE_MASK;
1560
1561 tcp_set_state(sk, ns);
1562
1563 return next & TCP_ACTION_FIN;
1564}
1565
1566/*
1567 * Shutdown the sending side of a connection. Much like close except
 1568 * that we don't receive shutdown or sock_set_flag(sk, SOCK_DEAD).
1569 */
1570
1571void tcp_shutdown(struct sock *sk, int how)
1572{
1573 /* We need to grab some memory, and put together a FIN,
1574 * and then put it into the queue to be sent.
1575 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1576 */
1577 if (!(how & SEND_SHUTDOWN))
1578 return;
1579
1580 /* If we've already sent a FIN, or it's a closed state, skip this. */
1581 if ((1 << sk->sk_state) &
1582 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1583 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1584 /* Clear out any half completed packets. FIN if needed. */
1585 if (tcp_close_state(sk))
1586 tcp_send_fin(sk);
1587 }
1588}
1589
1590/*
1591 * At this point, there should be no process reference to this
1592 * socket, and thus no user references at all. Therefore we
1593 * can assume the socket waitqueue is inactive and nobody will
1594 * try to jump onto it.
1595 */
1596void tcp_destroy_sock(struct sock *sk)
1597{
1598 BUG_TRAP(sk->sk_state == TCP_CLOSE);
1599 BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1600
1601 /* It cannot be in hash table! */
1602 BUG_TRAP(sk_unhashed(sk));
1603
 1604 /* If it has a non-zero inet_sk(sk)->num, it must be bound */
1605 BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1606
1607 sk->sk_prot->destroy(sk);
1608
1609 sk_stream_kill_queues(sk);
1610
1611 xfrm_sk_free_policy(sk);
1612
1613#ifdef INET_REFCNT_DEBUG
1614 if (atomic_read(&sk->sk_refcnt) != 1) {
1615 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1616 sk, atomic_read(&sk->sk_refcnt));
1617 }
1618#endif
1619
1620 atomic_dec(&tcp_orphan_count);
1621 sock_put(sk);
1622}
1623
1624void tcp_close(struct sock *sk, long timeout)
1625{
1626 struct sk_buff *skb;
1627 int data_was_unread = 0;
1628
1629 lock_sock(sk);
1630 sk->sk_shutdown = SHUTDOWN_MASK;
1631
1632 if (sk->sk_state == TCP_LISTEN) {
1633 tcp_set_state(sk, TCP_CLOSE);
1634
1635 /* Special case. */
1636 tcp_listen_stop(sk);
1637
1638 goto adjudge_to_death;
1639 }
1640
1641 /* We need to flush the recv. buffs. We do this only on the
1642 * descriptor close, not protocol-sourced closes, because the
1643 * reader process may not have drained the data yet!
1644 */
1645 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1646 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1647 skb->h.th->fin;
1648 data_was_unread += len;
1649 __kfree_skb(skb);
1650 }
1651
1652 sk_stream_mem_reclaim(sk);
1653
1654 /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1655 * 3.10, we send a RST here because data was lost. To
1656 * witness the awful effects of the old behavior of always
1657 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1658 * a bulk GET in an FTP client, suspend the process, wait
1659 * for the client to advertise a zero window, then kill -9
1660 * the FTP client, wheee... Note: timeout is always zero
1661 * in such a case.
1662 */
1663 if (data_was_unread) {
1664 /* Unread data was tossed, zap the connection. */
1665 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1666 tcp_set_state(sk, TCP_CLOSE);
1667 tcp_send_active_reset(sk, GFP_KERNEL);
1668 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1669 /* Check zero linger _after_ checking for unread data. */
1670 sk->sk_prot->disconnect(sk, 0);
1671 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1672 } else if (tcp_close_state(sk)) {
1673 /* We FIN if the application ate all the data before
1674 * zapping the connection.
1675 */
1676
1677 /* RED-PEN. Formally speaking, we have broken TCP state
1678 * machine. State transitions:
1679 *
1680 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1681 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1682 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1683 *
1684 * are legal only when FIN has been sent (i.e. in window),
1685 * rather than queued out of window. Purists blame.
1686 *
1687 * F.e. "RFC state" is ESTABLISHED,
1688 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1689 *
 1690 * The visible deviations are that sometimes
 1691 * we enter the time-wait state when it is not really required
 1692 * (harmless), and do not send active resets when they are
 1693 * required by the specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
 1694 * they look like CLOSING or LAST_ACK to Linux).
 1695 * Probably, I missed some more small holes.
1696 * --ANK
1697 */
1698 tcp_send_fin(sk);
1699 }
1700
1701 sk_stream_wait_close(sk, timeout);
1702
1703adjudge_to_death:
1704 /* It is the last release_sock in its life. It will remove backlog. */
1705 release_sock(sk);
1706
1707
1708 /* Now socket is owned by kernel and we acquire BH lock
1709 to finish close. No need to check for user refs.
1710 */
1711 local_bh_disable();
1712 bh_lock_sock(sk);
1713 BUG_TRAP(!sock_owned_by_user(sk));
1714
1715 sock_hold(sk);
1716 sock_orphan(sk);
1717
1718 /* This is a (useful) BSD violation of the RFC. There is a
1719 * problem with TCP as specified, in that the other end could
1720 * keep a socket open forever with no application left at this end.
1721 * We use a 3 minute timeout (about the same as BSD) and then kill
1722 * our end. If they send after that then tough - BUT: long enough
1723 * that we won't repeat the old "4*rto = almost no time - whoops,
1724 * reset" mistake.
1725 *
1726 * Nope, it was not a mistake. It is really the desired behaviour,
1727 * e.g. on HTTP servers, where such sockets are useless but
1728 * consume significant resources. Let's do it with the special
1729 * linger2 option. --ANK
1730 */
1731
1732 if (sk->sk_state == TCP_FIN_WAIT2) {
1733 struct tcp_sock *tp = tcp_sk(sk);
1734 if (tp->linger2 < 0) {
1735 tcp_set_state(sk, TCP_CLOSE);
1736 tcp_send_active_reset(sk, GFP_ATOMIC);
1737 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1738 } else {
1739 int tmo = tcp_fin_time(tp);
1740
1741 if (tmo > TCP_TIMEWAIT_LEN) {
1742 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1743 } else {
1744 atomic_inc(&tcp_orphan_count);
1745 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1746 goto out;
1747 }
1748 }
1749 }
1750 if (sk->sk_state != TCP_CLOSE) {
1751 sk_stream_mem_reclaim(sk);
1752 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1753 (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1754 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1755 if (net_ratelimit())
1756 printk(KERN_INFO "TCP: too many orphaned "
1757 "sockets\n");
1758 tcp_set_state(sk, TCP_CLOSE);
1759 tcp_send_active_reset(sk, GFP_ATOMIC);
1760 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1761 }
1762 }
1763 atomic_inc(&tcp_orphan_count);
1764
1765 if (sk->sk_state == TCP_CLOSE)
1766 tcp_destroy_sock(sk);
1767 /* Otherwise, socket is reprieved until protocol close. */
1768
1769out:
1770 bh_unlock_sock(sk);
1771 local_bh_enable();
1772 sock_put(sk);
1773}
1774
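For reference, the zero-linger branch near the top of tcp_close() (sock_flag(sk, SOCK_LINGER) with a zero sk_lingertime) is the path an application reaches by enabling SO_LINGER with a zero timeout before close(). A minimal userspace sketch, assuming fd is an already-connected TCP socket:

    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    /* Close a connected TCP socket abortively: with l_onoff=1 and
     * l_linger=0, close() takes the zero-linger path in tcp_close()
     * and the connection is reset instead of going through the normal
     * FIN handshake and TIME_WAIT. */
    static int abortive_close(int fd)
    {
            struct linger lin;

            memset(&lin, 0, sizeof(lin));
            lin.l_onoff = 1;          /* linger enabled ...      */
            lin.l_linger = 0;         /* ... with a zero timeout */
            if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin)) < 0)
                    return -1;
            return close(fd);
    }

Calling abortive_close() is also what the data_was_unread branch effectively does on the application's behalf when the receive queue was not drained.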
1775/* These states need RST on ABORT according to RFC793 */
1776
1777static inline int tcp_need_reset(int state)
1778{
1779 return (1 << state) &
1780 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1781 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1782}
1783
1784int tcp_disconnect(struct sock *sk, int flags)
1785{
1786 struct inet_sock *inet = inet_sk(sk);
1787 struct tcp_sock *tp = tcp_sk(sk);
1788 int err = 0;
1789 int old_state = sk->sk_state;
1790
1791 if (old_state != TCP_CLOSE)
1792 tcp_set_state(sk, TCP_CLOSE);
1793
1794 /* ABORT function of RFC793 */
1795 if (old_state == TCP_LISTEN) {
1796 tcp_listen_stop(sk);
1797 } else if (tcp_need_reset(old_state) ||
1798 (tp->snd_nxt != tp->write_seq &&
1799 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1800 /* The last check adjusts for the discrepancy between Linux and the
1801 * RFC states.
1802 */
1803 tcp_send_active_reset(sk, gfp_any());
1804 sk->sk_err = ECONNRESET;
1805 } else if (old_state == TCP_SYN_SENT)
1806 sk->sk_err = ECONNRESET;
1807
1808 tcp_clear_xmit_timers(sk);
1809 __skb_queue_purge(&sk->sk_receive_queue);
1810 sk_stream_writequeue_purge(sk);
1811 __skb_queue_purge(&tp->out_of_order_queue);
1812
1813 inet->dport = 0;
1814
1815 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1816 inet_reset_saddr(sk);
1817
1818 sk->sk_shutdown = 0;
1819 sock_reset_flag(sk, SOCK_DONE);
1820 tp->srtt = 0;
1821 if ((tp->write_seq += tp->max_window + 2) == 0)
1822 tp->write_seq = 1;
1823 tp->backoff = 0;
1824 tp->snd_cwnd = 2;
1825 tp->probes_out = 0;
1826 tp->packets_out = 0;
1827 tp->snd_ssthresh = 0x7fffffff;
1828 tp->snd_cwnd_cnt = 0;
1829 tcp_set_ca_state(tp, TCP_CA_Open);
1830 tcp_clear_retrans(tp);
1831 tcp_delack_init(tp);
1832 sk->sk_send_head = NULL;
1833 tp->rx_opt.saw_tstamp = 0;
1834 tcp_sack_reset(&tp->rx_opt);
1835 __sk_dst_reset(sk);
1836
1837 BUG_TRAP(!inet->num || tp->bind_hash);
1838
1839 sk->sk_error_report(sk);
1840 return err;
1841}
1842
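tcp_disconnect() is not called directly by applications; one way userspace reaches it is connect() with an address family of AF_UNSPEC, which inet_stream_connect() maps to the protocol's disconnect hook. A hedged sketch, assuming fd is a connected TCP socket:

    #include <string.h>
    #include <sys/socket.h>

    /* Dissolve the association on a connected TCP socket by
     * "connecting" to AF_UNSPEC; the protocol disconnect hook
     * (tcp_disconnect above) does the actual work. */
    static int tcp_unconnect(int fd)
    {
            struct sockaddr sa;

            memset(&sa, 0, sizeof(sa));
            sa.sa_family = AF_UNSPEC;
            return connect(fd, &sa, sizeof(sa));
    }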
1843/*
1844 * Wait for an incoming connection, avoid race
1845 * conditions. This must be called with the socket locked.
1846 */
1847static int wait_for_connect(struct sock *sk, long timeo)
1848{
1849 struct tcp_sock *tp = tcp_sk(sk);
1850 DEFINE_WAIT(wait);
1851 int err;
1852
1853 /*
1854 * True wake-one mechanism for incoming connections: only
1855 * one process gets woken up, not the 'whole herd'.
1856 * Since we do not 'race & poll' for established sockets
1857 * anymore, the common case will execute the loop only once.
1858 *
1859 * Subtle issue: "add_wait_queue_exclusive()" will be added
1860 * after any current non-exclusive waiters, and we know that
1861 * it will always _stay_ after any new non-exclusive waiters
1862 * because all non-exclusive waiters are added at the
1863 * beginning of the wait-queue. As such, it's ok to "drop"
1864 * our exclusiveness temporarily when we get woken up without
1865 * having to remove and re-insert us on the wait queue.
1866 */
1867 for (;;) {
1868 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1869 TASK_INTERRUPTIBLE);
1870 release_sock(sk);
1871 if (!tp->accept_queue)
1872 timeo = schedule_timeout(timeo);
1873 lock_sock(sk);
1874 err = 0;
1875 if (tp->accept_queue)
1876 break;
1877 err = -EINVAL;
1878 if (sk->sk_state != TCP_LISTEN)
1879 break;
1880 err = sock_intr_errno(timeo);
1881 if (signal_pending(current))
1882 break;
1883 err = -EAGAIN;
1884 if (!timeo)
1885 break;
1886 }
1887 finish_wait(sk->sk_sleep, &wait);
1888 return err;
1889}
1890
1891/*
1892 * This will accept the next outstanding connection.
1893 */
1894
1895struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1896{
1897 struct tcp_sock *tp = tcp_sk(sk);
1898 struct open_request *req;
1899 struct sock *newsk;
1900 int error;
1901
1902 lock_sock(sk);
1903
1904 /* We need to make sure that this socket is listening,
1905 * and that it has something pending.
1906 */
1907 error = -EINVAL;
1908 if (sk->sk_state != TCP_LISTEN)
1909 goto out;
1910
1911 /* Find already established connection */
1912 if (!tp->accept_queue) {
1913 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1914
1915 /* If this is a non-blocking socket, don't sleep */
1916 error = -EAGAIN;
1917 if (!timeo)
1918 goto out;
1919
1920 error = wait_for_connect(sk, timeo);
1921 if (error)
1922 goto out;
1923 }
1924
1925 req = tp->accept_queue;
1926 if ((tp->accept_queue = req->dl_next) == NULL)
1927 tp->accept_queue_tail = NULL;
1928
1929 newsk = req->sk;
1930 sk_acceptq_removed(sk);
1931 tcp_openreq_fastfree(req);
1932 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1933 release_sock(sk);
1934 return newsk;
1935
1936out:
1937 release_sock(sk);
1938 *err = error;
1939 return NULL;
1940}
1941
1942/*
1943 * Socket option code for TCP.
1944 */
1945int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1946 int optlen)
1947{
1948 struct tcp_sock *tp = tcp_sk(sk);
1949 int val;
1950 int err = 0;
1951
1952 if (level != SOL_TCP)
1953 return tp->af_specific->setsockopt(sk, level, optname,
1954 optval, optlen);
1955
1956 if (optlen < sizeof(int))
1957 return -EINVAL;
1958
1959 if (get_user(val, (int __user *)optval))
1960 return -EFAULT;
1961
1962 lock_sock(sk);
1963
1964 switch (optname) {
1965 case TCP_MAXSEG:
1966 /* Values greater than interface MTU won't take effect. However
1967 * at the point when this call is done we typically don't yet
1968 * know which interface is going to be used */
1969 if (val < 8 || val > MAX_TCP_WINDOW) {
1970 err = -EINVAL;
1971 break;
1972 }
1973 tp->rx_opt.user_mss = val;
1974 break;
1975
1976 case TCP_NODELAY:
1977 if (val) {
1978 /* TCP_NODELAY is weaker than TCP_CORK, so that
1979 * this option on corked socket is remembered, but
1980 * it is not activated until cork is cleared.
1981 *
1982 * However, when TCP_NODELAY is set we make
1983 * an explicit push, which overrides even TCP_CORK
1984 * for currently queued segments.
1985 */
1986 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1987 tcp_push_pending_frames(sk, tp);
1988 } else {
1989 tp->nonagle &= ~TCP_NAGLE_OFF;
1990 }
1991 break;
1992
1993 case TCP_CORK:
1994 /* When set indicates to always queue non-full frames.
1995 * Later the user clears this option and we transmit
1996 * any pending partial frames in the queue. This is
1997 * meant to be used alongside sendfile() to get properly
1998 * filled frames when the user (for example) must write
1999 * out headers with a write() call first and then use
2000 * sendfile to send out the data parts.
2001 *
2002 * TCP_CORK can be set together with TCP_NODELAY and it is
2003 * stronger than TCP_NODELAY.
2004 */
2005 if (val) {
2006 tp->nonagle |= TCP_NAGLE_CORK;
2007 } else {
2008 tp->nonagle &= ~TCP_NAGLE_CORK;
2009 if (tp->nonagle&TCP_NAGLE_OFF)
2010 tp->nonagle |= TCP_NAGLE_PUSH;
2011 tcp_push_pending_frames(sk, tp);
2012 }
2013 break;
2014
2015 case TCP_KEEPIDLE:
2016 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2017 err = -EINVAL;
2018 else {
2019 tp->keepalive_time = val * HZ;
2020 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2021 !((1 << sk->sk_state) &
2022 (TCPF_CLOSE | TCPF_LISTEN))) {
2023 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2024 if (tp->keepalive_time > elapsed)
2025 elapsed = tp->keepalive_time - elapsed;
2026 else
2027 elapsed = 0;
2028 tcp_reset_keepalive_timer(sk, elapsed);
2029 }
2030 }
2031 break;
2032 case TCP_KEEPINTVL:
2033 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2034 err = -EINVAL;
2035 else
2036 tp->keepalive_intvl = val * HZ;
2037 break;
2038 case TCP_KEEPCNT:
2039 if (val < 1 || val > MAX_TCP_KEEPCNT)
2040 err = -EINVAL;
2041 else
2042 tp->keepalive_probes = val;
2043 break;
2044 case TCP_SYNCNT:
2045 if (val < 1 || val > MAX_TCP_SYNCNT)
2046 err = -EINVAL;
2047 else
2048 tp->syn_retries = val;
2049 break;
2050
2051 case TCP_LINGER2:
2052 if (val < 0)
2053 tp->linger2 = -1;
2054 else if (val > sysctl_tcp_fin_timeout / HZ)
2055 tp->linger2 = 0;
2056 else
2057 tp->linger2 = val * HZ;
2058 break;
2059
2060 case TCP_DEFER_ACCEPT:
2061 tp->defer_accept = 0;
2062 if (val > 0) {
2063 /* Translate value in seconds to number of
2064 * retransmits */
2065 while (tp->defer_accept < 32 &&
2066 val > ((TCP_TIMEOUT_INIT / HZ) <<
2067 tp->defer_accept))
2068 tp->defer_accept++;
2069 tp->defer_accept++;
2070 }
2071 break;
2072
2073 case TCP_WINDOW_CLAMP:
2074 if (!val) {
2075 if (sk->sk_state != TCP_CLOSE) {
2076 err = -EINVAL;
2077 break;
2078 }
2079 tp->window_clamp = 0;
2080 } else
2081 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2082 SOCK_MIN_RCVBUF / 2 : val;
2083 break;
2084
2085 case TCP_QUICKACK:
2086 if (!val) {
2087 tp->ack.pingpong = 1;
2088 } else {
2089 tp->ack.pingpong = 0;
2090 if ((1 << sk->sk_state) &
2091 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2092 tcp_ack_scheduled(tp)) {
2093 tp->ack.pending |= TCP_ACK_PUSHED;
2094 cleanup_rbuf(sk, 1);
2095 if (!(val & 1))
2096 tp->ack.pingpong = 1;
2097 }
2098 }
2099 break;
2100
2101 default:
2102 err = -ENOPROTOOPT;
2103 break;
2104 };
2105 release_sock(sk);
2106 return err;
2107}
2108
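The TCP_DEFER_ACCEPT case above converts a timeout given in seconds into a count of SYN-ACK retransmission periods, each period being TCP_TIMEOUT_INIT/HZ (3 seconds in this tree) and doubling per step. A small standalone sketch of that mapping, assuming the 3-second base:

    #include <stdio.h>

    #define TIMEOUT_INIT_SECS 3   /* TCP_TIMEOUT_INIT / HZ in this tree */

    /* Mirror of the seconds -> retransmission-periods translation done
     * for TCP_DEFER_ACCEPT in tcp_setsockopt(). */
    static int defer_accept_periods(int secs)
    {
            int periods = 0;

            if (secs <= 0)
                    return 0;
            while (periods < 32 && secs > (TIMEOUT_INIT_SECS << periods))
                    periods++;
            return periods + 1;
    }

    int main(void)
    {
            int secs;

            /* 1..3s -> 1 period, 4..6s -> 2, 7..12s -> 3, 13..24s -> 4, ... */
            for (secs = 1; secs <= 30; secs++)
                    printf("%2d s -> %d period(s)\n",
                           secs, defer_accept_periods(secs));
            return 0;
    }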
2109/* Return information about state of tcp endpoint in API format. */
2110void tcp_get_info(struct sock *sk, struct tcp_info *info)
2111{
2112 struct tcp_sock *tp = tcp_sk(sk);
2113 u32 now = tcp_time_stamp;
2114
2115 memset(info, 0, sizeof(*info));
2116
2117 info->tcpi_state = sk->sk_state;
2118 info->tcpi_ca_state = tp->ca_state;
2119 info->tcpi_retransmits = tp->retransmits;
2120 info->tcpi_probes = tp->probes_out;
2121 info->tcpi_backoff = tp->backoff;
2122
2123 if (tp->rx_opt.tstamp_ok)
2124 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2125 if (tp->rx_opt.sack_ok)
2126 info->tcpi_options |= TCPI_OPT_SACK;
2127 if (tp->rx_opt.wscale_ok) {
2128 info->tcpi_options |= TCPI_OPT_WSCALE;
2129 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2130 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2131 }
2132
2133 if (tp->ecn_flags&TCP_ECN_OK)
2134 info->tcpi_options |= TCPI_OPT_ECN;
2135
2136 info->tcpi_rto = jiffies_to_usecs(tp->rto);
2137 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2138 info->tcpi_snd_mss = tp->mss_cache_std;
2139 info->tcpi_rcv_mss = tp->ack.rcv_mss;
2140
2141 info->tcpi_unacked = tp->packets_out;
2142 info->tcpi_sacked = tp->sacked_out;
2143 info->tcpi_lost = tp->lost_out;
2144 info->tcpi_retrans = tp->retrans_out;
2145 info->tcpi_fackets = tp->fackets_out;
2146
2147 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2148 info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2149 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2150
2151 info->tcpi_pmtu = tp->pmtu_cookie;
2152 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2153 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2154 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2155 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2156 info->tcpi_snd_cwnd = tp->snd_cwnd;
2157 info->tcpi_advmss = tp->advmss;
2158 info->tcpi_reordering = tp->reordering;
2159
2160 info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2161 info->tcpi_rcv_space = tp->rcvq_space.space;
2162
2163 info->tcpi_total_retrans = tp->total_retrans;
2164}
2165
2166EXPORT_SYMBOL_GPL(tcp_get_info);
2167
2168int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2169 int __user *optlen)
2170{
2171 struct tcp_sock *tp = tcp_sk(sk);
2172 int val, len;
2173
2174 if (level != SOL_TCP)
2175 return tp->af_specific->getsockopt(sk, level, optname,
2176 optval, optlen);
2177
2178 if (get_user(len, optlen))
2179 return -EFAULT;
2180
2181 len = min_t(unsigned int, len, sizeof(int));
2182
2183 if (len < 0)
2184 return -EINVAL;
2185
2186 switch (optname) {
2187 case TCP_MAXSEG:
2188 val = tp->mss_cache_std;
2189 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2190 val = tp->rx_opt.user_mss;
2191 break;
2192 case TCP_NODELAY:
2193 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2194 break;
2195 case TCP_CORK:
2196 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2197 break;
2198 case TCP_KEEPIDLE:
2199 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2200 break;
2201 case TCP_KEEPINTVL:
2202 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2203 break;
2204 case TCP_KEEPCNT:
2205 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2206 break;
2207 case TCP_SYNCNT:
2208 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2209 break;
2210 case TCP_LINGER2:
2211 val = tp->linger2;
2212 if (val >= 0)
2213 val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2214 break;
2215 case TCP_DEFER_ACCEPT:
2216 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2217 (tp->defer_accept - 1));
2218 break;
2219 case TCP_WINDOW_CLAMP:
2220 val = tp->window_clamp;
2221 break;
2222 case TCP_INFO: {
2223 struct tcp_info info;
2224
2225 if (get_user(len, optlen))
2226 return -EFAULT;
2227
2228 tcp_get_info(sk, &info);
2229
2230 len = min_t(unsigned int, len, sizeof(info));
2231 if (put_user(len, optlen))
2232 return -EFAULT;
2233 if (copy_to_user(optval, &info, len))
2234 return -EFAULT;
2235 return 0;
2236 }
2237 case TCP_QUICKACK:
2238 val = !tp->ack.pingpong;
2239 break;
2240 default:
2241 return -ENOPROTOOPT;
2242 };
2243
2244 if (put_user(len, optlen))
2245 return -EFAULT;
2246 if (copy_to_user(optval, &val, len))
2247 return -EFAULT;
2248 return 0;
2249}
2250
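The TCP_INFO branch above is how ss-style tools obtain the counters filled in by tcp_get_info(); note that tcpi_rtt and tcpi_rttvar are already converted to microseconds (internally srtt is kept shifted left by 3 and mdev by 2). A minimal userspace read, assuming a connected TCP socket fd and a netinet/tcp.h that exposes struct tcp_info:

    #include <stdio.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    /* Print a few fields of struct tcp_info for a connected socket.
     * The kernel copies back at most the length we pass in, so older
     * and newer struct layouts simply differ in how much gets filled. */
    static int print_tcp_info(int fd)
    {
            struct tcp_info info;
            socklen_t len = sizeof(info);

            if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0)
                    return -1;
            printf("state=%u rtt=%uus rttvar=%uus snd_cwnd=%u retrans=%u\n",
                   info.tcpi_state, info.tcpi_rtt, info.tcpi_rttvar,
                   info.tcpi_snd_cwnd, info.tcpi_total_retrans);
            return 0;
    }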
2251
2252extern void __skb_cb_too_small_for_tcp(int, int);
2253extern void tcpdiag_init(void);
2254
2255static __initdata unsigned long thash_entries;
2256static int __init set_thash_entries(char *str)
2257{
2258 if (!str)
2259 return 0;
2260 thash_entries = simple_strtoul(str, &str, 0);
2261 return 1;
2262}
2263__setup("thash_entries=", set_thash_entries);
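The __setup() hook above lets the size of the established hash table be forced from the boot command line instead of being auto-sized from available memory, for example (value purely illustrative):

    thash_entries=131072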
2264
2265void __init tcp_init(void)
2266{
2267 struct sk_buff *skb = NULL;
2268 int order, i;
2269
2270 if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2271 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2272 sizeof(skb->cb));
2273
2274 tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2275 sizeof(struct open_request),
2276 0, SLAB_HWCACHE_ALIGN,
2277 NULL, NULL);
2278 if (!tcp_openreq_cachep)
2279 panic("tcp_init: Cannot alloc open_request cache.");
2280
2281 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2282 sizeof(struct tcp_bind_bucket),
2283 0, SLAB_HWCACHE_ALIGN,
2284 NULL, NULL);
2285 if (!tcp_bucket_cachep)
2286 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2287
2288 tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2289 sizeof(struct tcp_tw_bucket),
2290 0, SLAB_HWCACHE_ALIGN,
2291 NULL, NULL);
2292 if (!tcp_timewait_cachep)
2293 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2294
2295 /* Size and allocate the main established and bind bucket
2296 * hash tables.
2297 *
2298 * The methodology is similar to that of the buffer cache.
2299 */
2300 tcp_ehash = (struct tcp_ehash_bucket *)
2301 alloc_large_system_hash("TCP established",
2302 sizeof(struct tcp_ehash_bucket),
2303 thash_entries,
2304 (num_physpages >= 128 * 1024) ?
2305 (25 - PAGE_SHIFT) :
2306 (27 - PAGE_SHIFT),
2307 HASH_HIGHMEM,
2308 &tcp_ehash_size,
2309 NULL,
2310 0);
2311 tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2312 for (i = 0; i < (tcp_ehash_size << 1); i++) {
2313 rwlock_init(&tcp_ehash[i].lock);
2314 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2315 }
2316
2317 tcp_bhash = (struct tcp_bind_hashbucket *)
2318 alloc_large_system_hash("TCP bind",
2319 sizeof(struct tcp_bind_hashbucket),
2320 tcp_ehash_size,
2321 (num_physpages >= 128 * 1024) ?
2322 (25 - PAGE_SHIFT) :
2323 (27 - PAGE_SHIFT),
2324 HASH_HIGHMEM,
2325 &tcp_bhash_size,
2326 NULL,
2327 64 * 1024);
2328 tcp_bhash_size = 1 << tcp_bhash_size;
2329 for (i = 0; i < tcp_bhash_size; i++) {
2330 spin_lock_init(&tcp_bhash[i].lock);
2331 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2332 }
2333
2334 /* Try to be a bit smarter and adjust defaults depending
2335 * on available memory.
2336 */
2337 for (order = 0; ((1 << order) << PAGE_SHIFT) <
2338 (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2339 order++)
2340 ;
2341 if (order > 4) {
2342 sysctl_local_port_range[0] = 32768;
2343 sysctl_local_port_range[1] = 61000;
2344 sysctl_tcp_max_tw_buckets = 180000;
2345 sysctl_tcp_max_orphans = 4096 << (order - 4);
2346 sysctl_max_syn_backlog = 1024;
2347 } else if (order < 3) {
2348 sysctl_local_port_range[0] = 1024 * (3 - order);
2349 sysctl_tcp_max_tw_buckets >>= (3 - order);
2350 sysctl_tcp_max_orphans >>= (3 - order);
2351 sysctl_max_syn_backlog = 128;
2352 }
2353 tcp_port_rover = sysctl_local_port_range[0] - 1;
2354
2355 sysctl_tcp_mem[0] = 768 << order;
2356 sysctl_tcp_mem[1] = 1024 << order;
2357 sysctl_tcp_mem[2] = 1536 << order;
2358
2359 if (order < 3) {
2360 sysctl_tcp_wmem[2] = 64 * 1024;
2361 sysctl_tcp_rmem[0] = PAGE_SIZE;
2362 sysctl_tcp_rmem[1] = 43689;
2363 sysctl_tcp_rmem[2] = 2 * 43689;
2364 }
2365
2366 printk(KERN_INFO "TCP: Hash tables configured "
2367 "(established %d bind %d)\n",
2368 tcp_ehash_size << 1, tcp_bhash_size);
2369}
2370
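The sizing at the end of tcp_init() keys everything off "order", the power-of-two number of pages spanned by the bind-bucket table, and derives the tcp_mem pressure thresholds from it. A rough standalone sketch of that arithmetic, assuming 4 KiB pages and a 32-byte bucket (both are assumptions; the real values depend on the architecture and struct layout):

    #include <stdio.h>

    /* Approximate the sizing logic in tcp_init(): find the smallest
     * 'order' such that (1 << order) pages cover the bind hash table,
     * then derive the tcp_mem thresholds from it. */
    int main(void)
    {
            const unsigned long page_size = 4096;     /* assumption */
            const unsigned long bucket_size = 32;     /* assumption */
            unsigned long bhash_size = 65536;         /* example table size */
            int order;

            for (order = 0; ((1UL << order) * page_size) <
                            bhash_size * bucket_size; order++)
                    ;

            printf("order=%d tcp_mem = { %d, %d, %d } pages\n",
                   order, 768 << order, 1024 << order, 1536 << order);
            return 0;
    }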
2371EXPORT_SYMBOL(tcp_accept);
2372EXPORT_SYMBOL(tcp_close);
2373EXPORT_SYMBOL(tcp_destroy_sock);
2374EXPORT_SYMBOL(tcp_disconnect);
2375EXPORT_SYMBOL(tcp_getsockopt);
2376EXPORT_SYMBOL(tcp_ioctl);
2377EXPORT_SYMBOL(tcp_openreq_cachep);
2378EXPORT_SYMBOL(tcp_poll);
2379EXPORT_SYMBOL(tcp_read_sock);
2380EXPORT_SYMBOL(tcp_recvmsg);
2381EXPORT_SYMBOL(tcp_sendmsg);
2382EXPORT_SYMBOL(tcp_sendpage);
2383EXPORT_SYMBOL(tcp_setsockopt);
2384EXPORT_SYMBOL(tcp_shutdown);
2385EXPORT_SYMBOL(tcp_statistics);
2386EXPORT_SYMBOL(tcp_timewait_cachep);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
new file mode 100644
index 000000000000..313c1408da33
--- /dev/null
+++ b/net/ipv4/tcp_diag.c
@@ -0,0 +1,802 @@
1/*
2 * tcp_diag.c Module for monitoring TCP sockets.
3 *
4 * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
5 *
6 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 */
13
14#include <linux/config.h>
15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/fcntl.h>
18#include <linux/random.h>
19#include <linux/cache.h>
20#include <linux/init.h>
21#include <linux/time.h>
22
23#include <net/icmp.h>
24#include <net/tcp.h>
25#include <net/ipv6.h>
26#include <net/inet_common.h>
27
28#include <linux/inet.h>
29#include <linux/stddef.h>
30
31#include <linux/tcp_diag.h>
32
33struct tcpdiag_entry
34{
35 u32 *saddr;
36 u32 *daddr;
37 u16 sport;
38 u16 dport;
39 u16 family;
40 u16 userlocks;
41};
42
43static struct sock *tcpnl;
44
45
46#define TCPDIAG_PUT(skb, attrtype, attrlen) \
47({ int rtalen = RTA_LENGTH(attrlen); \
48 struct rtattr *rta; \
49 if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \
50 rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \
51 rta->rta_type = attrtype; \
52 rta->rta_len = rtalen; \
53 RTA_DATA(rta); })
54
55static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
56 int ext, u32 pid, u32 seq, u16 nlmsg_flags)
57{
58 struct inet_sock *inet = inet_sk(sk);
59 struct tcp_sock *tp = tcp_sk(sk);
60 struct tcpdiagmsg *r;
61 struct nlmsghdr *nlh;
62 struct tcp_info *info = NULL;
63 struct tcpdiag_meminfo *minfo = NULL;
64 struct tcpvegas_info *vinfo = NULL;
65 unsigned char *b = skb->tail;
66
67 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
68 nlh->nlmsg_flags = nlmsg_flags;
69 r = NLMSG_DATA(nlh);
70 if (sk->sk_state != TCP_TIME_WAIT) {
71 if (ext & (1<<(TCPDIAG_MEMINFO-1)))
72 minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo));
73 if (ext & (1<<(TCPDIAG_INFO-1)))
74 info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
75
76 if ((tcp_is_westwood(tp) || tcp_is_vegas(tp))
77 && (ext & (1<<(TCPDIAG_VEGASINFO-1))))
78 vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo));
79 }
80 r->tcpdiag_family = sk->sk_family;
81 r->tcpdiag_state = sk->sk_state;
82 r->tcpdiag_timer = 0;
83 r->tcpdiag_retrans = 0;
84
85 r->id.tcpdiag_if = sk->sk_bound_dev_if;
86 r->id.tcpdiag_cookie[0] = (u32)(unsigned long)sk;
87 r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
88
89 if (r->tcpdiag_state == TCP_TIME_WAIT) {
90 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk;
91 long tmo = tw->tw_ttd - jiffies;
92 if (tmo < 0)
93 tmo = 0;
94
95 r->id.tcpdiag_sport = tw->tw_sport;
96 r->id.tcpdiag_dport = tw->tw_dport;
97 r->id.tcpdiag_src[0] = tw->tw_rcv_saddr;
98 r->id.tcpdiag_dst[0] = tw->tw_daddr;
99 r->tcpdiag_state = tw->tw_substate;
100 r->tcpdiag_timer = 3;
101 r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ;
102 r->tcpdiag_rqueue = 0;
103 r->tcpdiag_wqueue = 0;
104 r->tcpdiag_uid = 0;
105 r->tcpdiag_inode = 0;
106#ifdef CONFIG_IP_TCPDIAG_IPV6
107 if (r->tcpdiag_family == AF_INET6) {
108 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
109 &tw->tw_v6_rcv_saddr);
110 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
111 &tw->tw_v6_daddr);
112 }
113#endif
114 nlh->nlmsg_len = skb->tail - b;
115 return skb->len;
116 }
117
118 r->id.tcpdiag_sport = inet->sport;
119 r->id.tcpdiag_dport = inet->dport;
120 r->id.tcpdiag_src[0] = inet->rcv_saddr;
121 r->id.tcpdiag_dst[0] = inet->daddr;
122
123#ifdef CONFIG_IP_TCPDIAG_IPV6
124 if (r->tcpdiag_family == AF_INET6) {
125 struct ipv6_pinfo *np = inet6_sk(sk);
126
127 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
128 &np->rcv_saddr);
129 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
130 &np->daddr);
131 }
132#endif
133
134#define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ
135
136 if (tp->pending == TCP_TIME_RETRANS) {
137 r->tcpdiag_timer = 1;
138 r->tcpdiag_retrans = tp->retransmits;
139 r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
140 } else if (tp->pending == TCP_TIME_PROBE0) {
141 r->tcpdiag_timer = 4;
142 r->tcpdiag_retrans = tp->probes_out;
143 r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
144 } else if (timer_pending(&sk->sk_timer)) {
145 r->tcpdiag_timer = 2;
146 r->tcpdiag_retrans = tp->probes_out;
147 r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
148 } else {
149 r->tcpdiag_timer = 0;
150 r->tcpdiag_expires = 0;
151 }
152#undef EXPIRES_IN_MS
153
154 r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq;
155 r->tcpdiag_wqueue = tp->write_seq - tp->snd_una;
156 r->tcpdiag_uid = sock_i_uid(sk);
157 r->tcpdiag_inode = sock_i_ino(sk);
158
159 if (minfo) {
160 minfo->tcpdiag_rmem = atomic_read(&sk->sk_rmem_alloc);
161 minfo->tcpdiag_wmem = sk->sk_wmem_queued;
162 minfo->tcpdiag_fmem = sk->sk_forward_alloc;
163 minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc);
164 }
165
166 if (info)
167 tcp_get_info(sk, info);
168
169 if (vinfo) {
170 if (tcp_is_vegas(tp)) {
171 vinfo->tcpv_enabled = tp->vegas.doing_vegas_now;
172 vinfo->tcpv_rttcnt = tp->vegas.cntRTT;
173 vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT);
174 vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT);
175 } else {
176 vinfo->tcpv_enabled = 0;
177 vinfo->tcpv_rttcnt = 0;
178 vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt);
179 vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min);
180 }
181 }
182
183 nlh->nlmsg_len = skb->tail - b;
184 return skb->len;
185
186nlmsg_failure:
187 skb_trim(skb, b - skb->data);
188 return -1;
189}
190
191extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport,
192 int dif);
193#ifdef CONFIG_IP_TCPDIAG_IPV6
194extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
195 struct in6_addr *daddr, u16 dport,
196 int dif);
197#else
198static inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
199 struct in6_addr *daddr, u16 dport,
200 int dif)
201{
202 return NULL;
203}
204#endif
205
206static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
207{
208 int err;
209 struct sock *sk;
210 struct tcpdiagreq *req = NLMSG_DATA(nlh);
211 struct sk_buff *rep;
212
213 if (req->tcpdiag_family == AF_INET) {
214 sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport,
215 req->id.tcpdiag_src[0], req->id.tcpdiag_sport,
216 req->id.tcpdiag_if);
217 }
218#ifdef CONFIG_IP_TCPDIAG_IPV6
219 else if (req->tcpdiag_family == AF_INET6) {
220 sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport,
221 (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport,
222 req->id.tcpdiag_if);
223 }
224#endif
225 else {
226 return -EINVAL;
227 }
228
229 if (sk == NULL)
230 return -ENOENT;
231
232 err = -ESTALE;
233 if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE ||
234 req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) &&
235 ((u32)(unsigned long)sk != req->id.tcpdiag_cookie[0] ||
236 (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.tcpdiag_cookie[1]))
237 goto out;
238
239 err = -ENOMEM;
240 rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+
241 sizeof(struct tcpdiag_meminfo)+
242 sizeof(struct tcp_info)+64), GFP_KERNEL);
243 if (!rep)
244 goto out;
245
246 if (tcpdiag_fill(rep, sk, req->tcpdiag_ext,
247 NETLINK_CB(in_skb).pid,
248 nlh->nlmsg_seq, 0) <= 0)
249 BUG();
250
251 err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
252 if (err > 0)
253 err = 0;
254
255out:
256 if (sk) {
257 if (sk->sk_state == TCP_TIME_WAIT)
258 tcp_tw_put((struct tcp_tw_bucket*)sk);
259 else
260 sock_put(sk);
261 }
262 return err;
263}
264
265static int bitstring_match(const u32 *a1, const u32 *a2, int bits)
266{
267 int words = bits >> 5;
268
269 bits &= 0x1f;
270
271 if (words) {
272 if (memcmp(a1, a2, words << 2))
273 return 0;
274 }
275 if (bits) {
276 __u32 w1, w2;
277 __u32 mask;
278
279 w1 = a1[words];
280 w2 = a2[words];
281
282 mask = htonl((0xffffffff) << (32 - bits));
283
284 if ((w1 ^ w2) & mask)
285 return 0;
286 }
287
288 return 1;
289}
290
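bitstring_match() is a plain prefix comparison over 32-bit words in network byte order, used below by the S_COND/D_COND bytecode ops. A small self-contained check of the same idea, assuming IPv4-style single-word addresses:

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    /* Same prefix test as bitstring_match() for a one-word (IPv4)
     * address: compare only the top 'bits' bits, in network byte order. */
    static int prefix_match(const char *a, const char *b, int bits)
    {
            uint32_t w1, w2, mask;

            if (inet_pton(AF_INET, a, &w1) != 1 ||
                inet_pton(AF_INET, b, &w2) != 1)
                    return 0;
            if (bits == 0)
                    return 1;
            mask = htonl(0xffffffffUL << (32 - bits));
            return !((w1 ^ w2) & mask);
    }

    int main(void)
    {
            printf("%d\n", prefix_match("10.1.2.3", "10.1.0.0", 16));   /* 1 */
            printf("%d\n", prefix_match("10.2.2.3", "10.1.0.0", 16));   /* 0 */
            return 0;
    }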
291
292static int tcpdiag_bc_run(const void *bc, int len,
293 const struct tcpdiag_entry *entry)
294{
295 while (len > 0) {
296 int yes = 1;
297 const struct tcpdiag_bc_op *op = bc;
298
299 switch (op->code) {
300 case TCPDIAG_BC_NOP:
301 break;
302 case TCPDIAG_BC_JMP:
303 yes = 0;
304 break;
305 case TCPDIAG_BC_S_GE:
306 yes = entry->sport >= op[1].no;
307 break;
308 case TCPDIAG_BC_S_LE:
309 yes = entry->dport <= op[1].no;
310 break;
311 case TCPDIAG_BC_D_GE:
312 yes = entry->dport >= op[1].no;
313 break;
314 case TCPDIAG_BC_D_LE:
315 yes = entry->dport <= op[1].no;
316 break;
317 case TCPDIAG_BC_AUTO:
318 yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
319 break;
320 case TCPDIAG_BC_S_COND:
321 case TCPDIAG_BC_D_COND:
322 {
323 struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1);
324 u32 *addr;
325
326 if (cond->port != -1 &&
327 cond->port != (op->code == TCPDIAG_BC_S_COND ?
328 entry->sport : entry->dport)) {
329 yes = 0;
330 break;
331 }
332
333 if (cond->prefix_len == 0)
334 break;
335
336 if (op->code == TCPDIAG_BC_S_COND)
337 addr = entry->saddr;
338 else
339 addr = entry->daddr;
340
341 if (bitstring_match(addr, cond->addr, cond->prefix_len))
342 break;
343 if (entry->family == AF_INET6 &&
344 cond->family == AF_INET) {
345 if (addr[0] == 0 && addr[1] == 0 &&
346 addr[2] == htonl(0xffff) &&
347 bitstring_match(addr+3, cond->addr, cond->prefix_len))
348 break;
349 }
350 yes = 0;
351 break;
352 }
353 }
354
355 if (yes) {
356 len -= op->yes;
357 bc += op->yes;
358 } else {
359 len -= op->no;
360 bc += op->no;
361 }
362 }
363 return (len == 0);
364}
365
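The filter above is a tiny forward-only bytecode: every op carries a "yes" and a "no" byte offset, a passing condition advances by "yes" and a failing one by "no", landing exactly on the end of the program accepts, and jumping past it rejects. The following userspace sketch models only that control flow with a single made-up op code, assuming the conventional {u8 code; u8 yes; u16 no} layout of struct tcpdiag_bc_op where a condition's operand lives in the next 4-byte slot:

    #include <stdio.h>
    #include <stdint.h>

    struct bc_op {
            uint8_t  code;   /* 1 = NOP, 2 = "source port >= operand" */
            uint8_t  yes;    /* byte offset to take on success        */
            uint16_t no;     /* byte offset to take on failure        */
    };

    /* Simplified model of tcpdiag_bc_run(): walk the program until we
     * run off the end (len == 0, accept) or overshoot it (reject). */
    static int bc_run(const unsigned char *bc, int len, uint16_t sport)
    {
            while (len > 0) {
                    const struct bc_op *op = (const struct bc_op *)bc;
                    int yes = 1;

                    if (op->code == 2)
                            yes = sport >= op[1].no;  /* operand in next slot */
                    if (yes) {
                            bc  += op->yes;
                            len -= op->yes;
                    } else {
                            bc  += op->no;
                            len -= op->no;
                    }
            }
            return len == 0;
    }

    int main(void)
    {
            /* Program: "sport >= 1024", one 8-byte op (condition + operand).
             * yes = 8 lands on the end (accept); no = 12 jumps past it. */
            struct bc_op prog[2] = {
                    { .code = 2, .yes = 8, .no = 12 },
                    { .code = 0, .yes = 0, .no = 1024 },   /* operand slot */
            };

            printf("sport 80   -> %d\n",
                   bc_run((unsigned char *)prog, sizeof(prog), 80));
            printf("sport 5000 -> %d\n",
                   bc_run((unsigned char *)prog, sizeof(prog), 5000));
            return 0;
    }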
366static int valid_cc(const void *bc, int len, int cc)
367{
368 while (len >= 0) {
369 const struct tcpdiag_bc_op *op = bc;
370
371 if (cc > len)
372 return 0;
373 if (cc == len)
374 return 1;
375 if (op->yes < 4)
376 return 0;
377 len -= op->yes;
378 bc += op->yes;
379 }
380 return 0;
381}
382
383static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len)
384{
385 const unsigned char *bc = bytecode;
386 int len = bytecode_len;
387
388 while (len > 0) {
389 struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc;
390
391//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
392 switch (op->code) {
393 case TCPDIAG_BC_AUTO:
394 case TCPDIAG_BC_S_COND:
395 case TCPDIAG_BC_D_COND:
396 case TCPDIAG_BC_S_GE:
397 case TCPDIAG_BC_S_LE:
398 case TCPDIAG_BC_D_GE:
399 case TCPDIAG_BC_D_LE:
400 if (op->yes < 4 || op->yes > len+4)
401 return -EINVAL;
402 case TCPDIAG_BC_JMP:
403 if (op->no < 4 || op->no > len+4)
404 return -EINVAL;
405 if (op->no < len &&
406 !valid_cc(bytecode, bytecode_len, len-op->no))
407 return -EINVAL;
408 break;
409 case TCPDIAG_BC_NOP:
410 if (op->yes < 4 || op->yes > len+4)
411 return -EINVAL;
412 break;
413 default:
414 return -EINVAL;
415 }
416 bc += op->yes;
417 len -= op->yes;
418 }
419 return len == 0 ? 0 : -EINVAL;
420}
421
422static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk,
423 struct netlink_callback *cb)
424{
425 struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
426
427 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
428 struct tcpdiag_entry entry;
429 struct rtattr *bc = (struct rtattr *)(r + 1);
430 struct inet_sock *inet = inet_sk(sk);
431
432 entry.family = sk->sk_family;
433#ifdef CONFIG_IP_TCPDIAG_IPV6
434 if (entry.family == AF_INET6) {
435 struct ipv6_pinfo *np = inet6_sk(sk);
436
437 entry.saddr = np->rcv_saddr.s6_addr32;
438 entry.daddr = np->daddr.s6_addr32;
439 } else
440#endif
441 {
442 entry.saddr = &inet->rcv_saddr;
443 entry.daddr = &inet->daddr;
444 }
445 entry.sport = inet->num;
446 entry.dport = ntohs(inet->dport);
447 entry.userlocks = sk->sk_userlocks;
448
449 if (!tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
450 return 0;
451 }
452
453 return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid,
454 cb->nlh->nlmsg_seq, NLM_F_MULTI);
455}
456
457static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
458 struct open_request *req,
459 u32 pid, u32 seq)
460{
461 struct inet_sock *inet = inet_sk(sk);
462 unsigned char *b = skb->tail;
463 struct tcpdiagmsg *r;
464 struct nlmsghdr *nlh;
465 long tmo;
466
467 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
468 nlh->nlmsg_flags = NLM_F_MULTI;
469 r = NLMSG_DATA(nlh);
470
471 r->tcpdiag_family = sk->sk_family;
472 r->tcpdiag_state = TCP_SYN_RECV;
473 r->tcpdiag_timer = 1;
474 r->tcpdiag_retrans = req->retrans;
475
476 r->id.tcpdiag_if = sk->sk_bound_dev_if;
477 r->id.tcpdiag_cookie[0] = (u32)(unsigned long)req;
478 r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
479
480 tmo = req->expires - jiffies;
481 if (tmo < 0)
482 tmo = 0;
483
484 r->id.tcpdiag_sport = inet->sport;
485 r->id.tcpdiag_dport = req->rmt_port;
486 r->id.tcpdiag_src[0] = req->af.v4_req.loc_addr;
487 r->id.tcpdiag_dst[0] = req->af.v4_req.rmt_addr;
488 r->tcpdiag_expires = jiffies_to_msecs(tmo),
489 r->tcpdiag_rqueue = 0;
490 r->tcpdiag_wqueue = 0;
491 r->tcpdiag_uid = sock_i_uid(sk);
492 r->tcpdiag_inode = 0;
493#ifdef CONFIG_IP_TCPDIAG_IPV6
494 if (r->tcpdiag_family == AF_INET6) {
495 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
496 &req->af.v6_req.loc_addr);
497 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
498 &req->af.v6_req.rmt_addr);
499 }
500#endif
501 nlh->nlmsg_len = skb->tail - b;
502
503 return skb->len;
504
505nlmsg_failure:
506 skb_trim(skb, b - skb->data);
507 return -1;
508}
509
510static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
511 struct netlink_callback *cb)
512{
513 struct tcpdiag_entry entry;
514 struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
515 struct tcp_sock *tp = tcp_sk(sk);
516 struct tcp_listen_opt *lopt;
517 struct rtattr *bc = NULL;
518 struct inet_sock *inet = inet_sk(sk);
519 int j, s_j;
520 int reqnum, s_reqnum;
521 int err = 0;
522
523 s_j = cb->args[3];
524 s_reqnum = cb->args[4];
525
526 if (s_j > 0)
527 s_j--;
528
529 entry.family = sk->sk_family;
530
531 read_lock_bh(&tp->syn_wait_lock);
532
533 lopt = tp->listen_opt;
534 if (!lopt || !lopt->qlen)
535 goto out;
536
537 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
538 bc = (struct rtattr *)(r + 1);
539 entry.sport = inet->num;
540 entry.userlocks = sk->sk_userlocks;
541 }
542
543 for (j = s_j; j < TCP_SYNQ_HSIZE; j++) {
544 struct open_request *req, *head = lopt->syn_table[j];
545
546 reqnum = 0;
547 for (req = head; req; reqnum++, req = req->dl_next) {
548 if (reqnum < s_reqnum)
549 continue;
550 if (r->id.tcpdiag_dport != req->rmt_port &&
551 r->id.tcpdiag_dport)
552 continue;
553
554 if (bc) {
555 entry.saddr =
556#ifdef CONFIG_IP_TCPDIAG_IPV6
557 (entry.family == AF_INET6) ?
558 req->af.v6_req.loc_addr.s6_addr32 :
559#endif
560 &req->af.v4_req.loc_addr;
561 entry.daddr =
562#ifdef CONFIG_IP_TCPDIAG_IPV6
563 (entry.family == AF_INET6) ?
564 req->af.v6_req.rmt_addr.s6_addr32 :
565#endif
566 &req->af.v4_req.rmt_addr;
567 entry.dport = ntohs(req->rmt_port);
568
569 if (!tcpdiag_bc_run(RTA_DATA(bc),
570 RTA_PAYLOAD(bc), &entry))
571 continue;
572 }
573
574 err = tcpdiag_fill_req(skb, sk, req,
575 NETLINK_CB(cb->skb).pid,
576 cb->nlh->nlmsg_seq);
577 if (err < 0) {
578 cb->args[3] = j + 1;
579 cb->args[4] = reqnum;
580 goto out;
581 }
582 }
583
584 s_reqnum = 0;
585 }
586
587out:
588 read_unlock_bh(&tp->syn_wait_lock);
589
590 return err;
591}
592
593static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb)
594{
595 int i, num;
596 int s_i, s_num;
597 struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
598
599 s_i = cb->args[1];
600 s_num = num = cb->args[2];
601
602 if (cb->args[0] == 0) {
603 if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV)))
604 goto skip_listen_ht;
605 tcp_listen_lock();
606 for (i = s_i; i < TCP_LHTABLE_SIZE; i++) {
607 struct sock *sk;
608 struct hlist_node *node;
609
610 num = 0;
611 sk_for_each(sk, node, &tcp_listening_hash[i]) {
612 struct inet_sock *inet = inet_sk(sk);
613
614 if (num < s_num) {
615 num++;
616 continue;
617 }
618
619 if (r->id.tcpdiag_sport != inet->sport &&
620 r->id.tcpdiag_sport)
621 goto next_listen;
622
623 if (!(r->tcpdiag_states&TCPF_LISTEN) ||
624 r->id.tcpdiag_dport ||
625 cb->args[3] > 0)
626 goto syn_recv;
627
628 if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
629 tcp_listen_unlock();
630 goto done;
631 }
632
633syn_recv:
634 if (!(r->tcpdiag_states&TCPF_SYN_RECV))
635 goto next_listen;
636
637 if (tcpdiag_dump_reqs(skb, sk, cb) < 0) {
638 tcp_listen_unlock();
639 goto done;
640 }
641
642next_listen:
643 cb->args[3] = 0;
644 cb->args[4] = 0;
645 ++num;
646 }
647
648 s_num = 0;
649 cb->args[3] = 0;
650 cb->args[4] = 0;
651 }
652 tcp_listen_unlock();
653skip_listen_ht:
654 cb->args[0] = 1;
655 s_i = num = s_num = 0;
656 }
657
658 if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV)))
659 return skb->len;
660
661 for (i = s_i; i < tcp_ehash_size; i++) {
662 struct tcp_ehash_bucket *head = &tcp_ehash[i];
663 struct sock *sk;
664 struct hlist_node *node;
665
666 if (i > s_i)
667 s_num = 0;
668
669 read_lock_bh(&head->lock);
670
671 num = 0;
672 sk_for_each(sk, node, &head->chain) {
673 struct inet_sock *inet = inet_sk(sk);
674
675 if (num < s_num)
676 goto next_normal;
677 if (!(r->tcpdiag_states & (1 << sk->sk_state)))
678 goto next_normal;
679 if (r->id.tcpdiag_sport != inet->sport &&
680 r->id.tcpdiag_sport)
681 goto next_normal;
682 if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport)
683 goto next_normal;
684 if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
685 read_unlock_bh(&head->lock);
686 goto done;
687 }
688next_normal:
689 ++num;
690 }
691
692 if (r->tcpdiag_states&TCPF_TIME_WAIT) {
693 sk_for_each(sk, node,
694 &tcp_ehash[i + tcp_ehash_size].chain) {
695 struct inet_sock *inet = inet_sk(sk);
696
697 if (num < s_num)
698 goto next_dying;
699 if (r->id.tcpdiag_sport != inet->sport &&
700 r->id.tcpdiag_sport)
701 goto next_dying;
702 if (r->id.tcpdiag_dport != inet->dport &&
703 r->id.tcpdiag_dport)
704 goto next_dying;
705 if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
706 read_unlock_bh(&head->lock);
707 goto done;
708 }
709next_dying:
710 ++num;
711 }
712 }
713 read_unlock_bh(&head->lock);
714 }
715
716done:
717 cb->args[1] = i;
718 cb->args[2] = num;
719 return skb->len;
720}
721
722static int tcpdiag_dump_done(struct netlink_callback *cb)
723{
724 return 0;
725}
726
727
728static __inline__ int
729tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
730{
731 if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
732 return 0;
733
734 if (nlh->nlmsg_type != TCPDIAG_GETSOCK)
735 goto err_inval;
736
737 if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len)
738 goto err_inval;
739
740 if (nlh->nlmsg_flags&NLM_F_DUMP) {
741 if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) {
742 struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq));
743 if (rta->rta_type != TCPDIAG_REQ_BYTECODE ||
744 rta->rta_len < 8 ||
745 rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq)))
746 goto err_inval;
747 if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
748 goto err_inval;
749 }
750 return netlink_dump_start(tcpnl, skb, nlh,
751 tcpdiag_dump,
752 tcpdiag_dump_done);
753 } else {
754 return tcpdiag_get_exact(skb, nlh);
755 }
756
757err_inval:
758 return -EINVAL;
759}
760
761
762static inline void tcpdiag_rcv_skb(struct sk_buff *skb)
763{
764 int err;
765 struct nlmsghdr * nlh;
766
767 if (skb->len >= NLMSG_SPACE(0)) {
768 nlh = (struct nlmsghdr *)skb->data;
769 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
770 return;
771 err = tcpdiag_rcv_msg(skb, nlh);
772 if (err || nlh->nlmsg_flags & NLM_F_ACK)
773 netlink_ack(skb, nlh, err);
774 }
775}
776
777static void tcpdiag_rcv(struct sock *sk, int len)
778{
779 struct sk_buff *skb;
780
781 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
782 tcpdiag_rcv_skb(skb);
783 kfree_skb(skb);
784 }
785}
786
787static int __init tcpdiag_init(void)
788{
789 tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv);
790 if (tcpnl == NULL)
791 return -ENOMEM;
792 return 0;
793}
794
795static void __exit tcpdiag_exit(void)
796{
797 sock_release(tcpnl->sk_socket);
798}
799
800module_init(tcpdiag_init);
801module_exit(tcpdiag_exit);
802MODULE_LICENSE("GPL");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
new file mode 100644
index 000000000000..250492735902
--- /dev/null
+++ b/net/ipv4/tcp_input.c
@@ -0,0 +1,4959 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_input.c,v 1.243 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 */
22
23/*
24 * Changes:
25 * Pedro Roque : Fast Retransmit/Recovery.
26 * Two receive queues.
27 * Retransmit queue handled by TCP.
28 * Better retransmit timer handling.
29 * New congestion avoidance.
30 * Header prediction.
31 * Variable renaming.
32 *
33 * Eric : Fast Retransmit.
34 * Randy Scott : MSS option defines.
35 * Eric Schenk : Fixes to slow start algorithm.
36 * Eric Schenk : Yet another double ACK bug.
37 * Eric Schenk : Delayed ACK bug fixes.
38 * Eric Schenk : Floyd style fast retrans war avoidance.
39 * David S. Miller : Don't allow zero congestion window.
40 * Eric Schenk : Fix retransmitter so that it sends
41 * next packet on ack of previous packet.
42 * Andi Kleen : Moved open_request checking here
43 * and process RSTs for open_requests.
44 * Andi Kleen : Better prune_queue, and other fixes.
45 * Andrey Savochkin: Fix RTT measurements in the presence of
46 * timestamps.
47 * Andrey Savochkin: Check sequence numbers correctly when
48 * removing SACKs due to in sequence incoming
49 * data segments.
50 * Andi Kleen: Make sure we never ack data there is not
51 * enough room for. Also make this condition
52 * a fatal error if it might still happen.
53 * Andi Kleen: Add tcp_measure_rcv_mss to make
54 * connections with MSS<min(MTU,ann. MSS)
55 * work without delayed acks.
56 * Andi Kleen: Process packets with PSH set in the
57 * fast path.
58 * J Hadi Salim: ECN support
59 * Andrei Gurtov,
60 * Pasi Sarolahti,
61 * Panu Kuhlberg: Experimental audit of TCP (re)transmission
62 * engine. Lots of bugs are found.
63 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs
64 * Angelo Dell'Aera: TCP Westwood+ support
65 */
66
67#include <linux/config.h>
68#include <linux/mm.h>
69#include <linux/module.h>
70#include <linux/sysctl.h>
71#include <net/tcp.h>
72#include <net/inet_common.h>
73#include <linux/ipsec.h>
74#include <asm/unaligned.h>
75
76int sysctl_tcp_timestamps = 1;
77int sysctl_tcp_window_scaling = 1;
78int sysctl_tcp_sack = 1;
79int sysctl_tcp_fack = 1;
80int sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
81int sysctl_tcp_ecn;
82int sysctl_tcp_dsack = 1;
83int sysctl_tcp_app_win = 31;
84int sysctl_tcp_adv_win_scale = 2;
85
86int sysctl_tcp_stdurg;
87int sysctl_tcp_rfc1337;
88int sysctl_tcp_max_orphans = NR_FILE;
89int sysctl_tcp_frto;
90int sysctl_tcp_nometrics_save;
91int sysctl_tcp_westwood;
92int sysctl_tcp_vegas_cong_avoid;
93
94int sysctl_tcp_moderate_rcvbuf = 1;
95
96/* Default values of the Vegas variables, in fixed-point representation
97 * with V_PARAM_SHIFT bits to the right of the binary point.
98 */
99#define V_PARAM_SHIFT 1
100int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
101int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT;
102int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
103int sysctl_tcp_bic = 1;
104int sysctl_tcp_bic_fast_convergence = 1;
105int sysctl_tcp_bic_low_window = 14;
106int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
107
108#define FLAG_DATA 0x01 /* Incoming frame contained data. */
109#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
110#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
111#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
112#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
113#define FLAG_DATA_SACKED 0x20 /* New SACK. */
114#define FLAG_ECE 0x40 /* ECE in this ACK */
115#define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */
116#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
117
118#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
119#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
120#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
121#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
122
123#define IsReno(tp) ((tp)->rx_opt.sack_ok == 0)
124#define IsFack(tp) ((tp)->rx_opt.sack_ok & 2)
125#define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4)
126
127#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
128
129/* Adapt the MSS value used to make delayed ack decision to the
130 * real world.
131 */
132static inline void tcp_measure_rcv_mss(struct tcp_sock *tp,
133 struct sk_buff *skb)
134{
135 unsigned int len, lss;
136
137 lss = tp->ack.last_seg_size;
138 tp->ack.last_seg_size = 0;
139
140 /* skb->len may jitter because of SACKs, even if peer
141 * sends good full-sized frames.
142 */
143 len = skb->len;
144 if (len >= tp->ack.rcv_mss) {
145 tp->ack.rcv_mss = len;
146 } else {
147 /* Otherwise, we make a more careful check, taking into account
148 * that the SACK block size is variable.
149 *
150 * "len" is invariant segment length, including TCP header.
151 */
152 len += skb->data - skb->h.raw;
153 if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) ||
154 /* If PSH is not set, packet should be
155 * full sized, provided peer TCP is not badly broken.
156 * This observation (if it is correct 8)) allows us
157 * to handle super-low MTU links fairly.
158 */
159 (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
160 !(tcp_flag_word(skb->h.th)&TCP_REMNANT))) {
161 /* Subtract also invariant (if peer is RFC compliant),
162 * tcp header plus fixed timestamp option length.
163 * Resulting "len" is MSS free of SACK jitter.
164 */
165 len -= tp->tcp_header_len;
166 tp->ack.last_seg_size = len;
167 if (len == lss) {
168 tp->ack.rcv_mss = len;
169 return;
170 }
171 }
172 tp->ack.pending |= TCP_ACK_PUSHED;
173 }
174}
175
176static void tcp_incr_quickack(struct tcp_sock *tp)
177{
178 unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss);
179
180 if (quickacks==0)
181 quickacks=2;
182 if (quickacks > tp->ack.quick)
183 tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
184}
185
186void tcp_enter_quickack_mode(struct tcp_sock *tp)
187{
188 tcp_incr_quickack(tp);
189 tp->ack.pingpong = 0;
190 tp->ack.ato = TCP_ATO_MIN;
191}
192
193/* Send ACKs quickly, if "quick" count is not exhausted
194 * and the session is not interactive.
195 */
196
197static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp)
198{
199 return (tp->ack.quick && !tp->ack.pingpong);
200}
201
202/* Buffer size and advertised window tuning.
203 *
204 * 1. Tuning sk->sk_sndbuf, when connection enters established state.
205 */
206
207static void tcp_fixup_sndbuf(struct sock *sk)
208{
209 int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
210 sizeof(struct sk_buff);
211
212 if (sk->sk_sndbuf < 3 * sndmem)
213 sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]);
214}
215
216/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
217 *
218 * All of tcp_full_space() is split into two parts: the "network" buffer, allocated
219 * forward and advertised in receiver window (tp->rcv_wnd) and
220 * "application buffer", required to isolate scheduling/application
221 * latencies from network.
222 * window_clamp is maximal advertised window. It can be less than
223 * tcp_full_space(), in this case tcp_full_space() - window_clamp
224 * is reserved for "application" buffer. The smaller window_clamp is,
225 * the smoother our behaviour from the viewpoint of the network, but the
226 * lower the throughput and the higher the sensitivity of the connection to losses. 8)
227 *
228 * rcv_ssthresh is a stricter window_clamp, used in the "slow start"
229 * phase to predict further behaviour of this connection.
230 * It is used for two goals:
231 * - to enforce header prediction at sender, even when application
232 * requires some significant "application buffer". It is check #1.
233 * - to prevent pruning of receive queue because of misprediction
234 * of receiver window. Check #2.
235 *
236 * The scheme does not work when the sender sends good segments opening
237 * the window and then starts to feed us spaghetti. But it should work
238 * in common situations. Otherwise, we have to rely on queue collapsing.
239 */
240
241/* Slow part of check#2. */
242static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
243 struct sk_buff *skb)
244{
245 /* Optimize this! */
246 int truesize = tcp_win_from_space(skb->truesize)/2;
247 int window = tcp_full_space(sk)/2;
248
249 while (tp->rcv_ssthresh <= window) {
250 if (truesize <= skb->len)
251 return 2*tp->ack.rcv_mss;
252
253 truesize >>= 1;
254 window >>= 1;
255 }
256 return 0;
257}
258
259static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
260 struct sk_buff *skb)
261{
262 /* Check #1 */
263 if (tp->rcv_ssthresh < tp->window_clamp &&
264 (int)tp->rcv_ssthresh < tcp_space(sk) &&
265 !tcp_memory_pressure) {
266 int incr;
267
268 /* Check #2. Increase window, if skb with such overhead
269 * will fit to rcvbuf in future.
270 */
271 if (tcp_win_from_space(skb->truesize) <= skb->len)
272 incr = 2*tp->advmss;
273 else
274 incr = __tcp_grow_window(sk, tp, skb);
275
276 if (incr) {
277 tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp);
278 tp->ack.quick |= 1;
279 }
280 }
281}
282
283/* 3. Tuning rcvbuf, when connection enters established state. */
284
285static void tcp_fixup_rcvbuf(struct sock *sk)
286{
287 struct tcp_sock *tp = tcp_sk(sk);
288 int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
289
290 /* Try to select rcvbuf so that 4 mss-sized segments
291 * will fit to the window and the corresponding skbs will fit to our rcvbuf.
292 * (was 3; 4 is minimum to allow fast retransmit to work.)
293 */
294 while (tcp_win_from_space(rcvmem) < tp->advmss)
295 rcvmem += 128;
296 if (sk->sk_rcvbuf < 4 * rcvmem)
297 sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
298}
299
300/* 4. Try to fixup all. It is done immediately after the connection enters
301 * established state.
302 */
303static void tcp_init_buffer_space(struct sock *sk)
304{
305 struct tcp_sock *tp = tcp_sk(sk);
306 int maxwin;
307
308 if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
309 tcp_fixup_rcvbuf(sk);
310 if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
311 tcp_fixup_sndbuf(sk);
312
313 tp->rcvq_space.space = tp->rcv_wnd;
314
315 maxwin = tcp_full_space(sk);
316
317 if (tp->window_clamp >= maxwin) {
318 tp->window_clamp = maxwin;
319
320 if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
321 tp->window_clamp = max(maxwin -
322 (maxwin >> sysctl_tcp_app_win),
323 4 * tp->advmss);
324 }
325
326 /* Force reservation of one segment. */
327 if (sysctl_tcp_app_win &&
328 tp->window_clamp > 2 * tp->advmss &&
329 tp->window_clamp + tp->advmss > maxwin)
330 tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
331
332 tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
333 tp->snd_cwnd_stamp = tcp_time_stamp;
334}
335
336static void init_bictcp(struct tcp_sock *tp)
337{
338 tp->bictcp.cnt = 0;
339
340 tp->bictcp.last_max_cwnd = 0;
341 tp->bictcp.last_cwnd = 0;
342 tp->bictcp.last_stamp = 0;
343}
344
345/* 5. Recalculate window clamp after socket hit its memory bounds. */
346static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
347{
348 struct sk_buff *skb;
349 unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
350 int ofo_win = 0;
351
352 tp->ack.quick = 0;
353
354 skb_queue_walk(&tp->out_of_order_queue, skb) {
355 ofo_win += skb->len;
356 }
357
358 /* If overcommit is due to out of order segments,
359 * do not clamp window. Try to expand rcvbuf instead.
360 */
361 if (ofo_win) {
362 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
363 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
364 !tcp_memory_pressure &&
365 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
366 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
367 sysctl_tcp_rmem[2]);
368 }
369 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
370 app_win += ofo_win;
371 if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
372 app_win >>= 1;
373 if (app_win > tp->ack.rcv_mss)
374 app_win -= tp->ack.rcv_mss;
375 app_win = max(app_win, 2U*tp->advmss);
376
377 if (!ofo_win)
378 tp->window_clamp = min(tp->window_clamp, app_win);
379 tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
380 }
381}
382
383/* Receiver "autotuning" code.
384 *
385 * The algorithm for RTT estimation w/o timestamps is based on
386 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
387 * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps>
388 *
389 * More detail on this code can be found at
390 * <http://www.psc.edu/~jheffner/senior_thesis.ps>,
391 * though this reference is out of date. A new paper
392 * is pending.
393 */
394static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
395{
396 u32 new_sample = tp->rcv_rtt_est.rtt;
397 long m = sample;
398
399 if (m == 0)
400 m = 1;
401
402 if (new_sample != 0) {
403 /* If we took larger samples in the non-timestamp
404 * case, we could grossly overestimate the RTT, especially
405 * with chatty applications or bulk transfer apps which
406 * are stalled on filesystem I/O.
407 *
408 * Also, since we are only going for a minimum in the
409 * non-timestamp case, we do not smooth things out;
410 * otherwise, with timestamps disabled, convergence takes
411 * too long.
412 */
413 if (!win_dep) {
414 m -= (new_sample >> 3);
415 new_sample += m;
416 } else if (m < new_sample)
417 new_sample = m << 3;
418 } else {
419 /* No previous measurement. */
420 new_sample = m << 3;
421 }
422
423 if (tp->rcv_rtt_est.rtt != new_sample)
424 tp->rcv_rtt_est.rtt = new_sample;
425}
426
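The receiver-side estimate above is kept scaled by 8: the timestamp path (win_dep == 0) runs a gain-1/8 EWMA on the scaled value, while the per-window path only ever lowers the estimate toward small samples. A standalone sketch of the same arithmetic (the comparison in the windowed path is against the scaled value, exactly as in the code above):

    #include <stdio.h>

    /* Same fixed-point update as tcp_rcv_rtt_update(): 'est' holds the
     * RTT scaled by 8.  win_dep == 0 is the EWMA (timestamp) path,
     * win_dep != 0 is the per-window path. */
    static unsigned int rcv_rtt_update(unsigned int est, long sample, int win_dep)
    {
            long m = sample ? sample : 1;

            if (est == 0)
                    return m << 3;      /* first measurement, scale by 8 */
            if (!win_dep) {
                    m -= est >> 3;      /* error against the smoothed value */
                    return est + m;     /* est' = est - est/8 + sample      */
            }
            if (m < est)                /* windowed path: only move down    */
                    return m << 3;
            return est;
    }

    int main(void)
    {
            unsigned int est = 0;
            long samples[] = { 40, 48, 36, 44 };  /* RTT samples in jiffies */
            int i;

            for (i = 0; i < 4; i++) {
                    est = rcv_rtt_update(est, samples[i], 0);
                    printf("sample %ld -> est/8 = %u\n", samples[i], est >> 3);
            }
            return 0;
    }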
427static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
428{
429 if (tp->rcv_rtt_est.time == 0)
430 goto new_measure;
431 if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
432 return;
433 tcp_rcv_rtt_update(tp,
434 jiffies - tp->rcv_rtt_est.time,
435 1);
436
437new_measure:
438 tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
439 tp->rcv_rtt_est.time = tcp_time_stamp;
440}
441
442static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb)
443{
444 if (tp->rx_opt.rcv_tsecr &&
445 (TCP_SKB_CB(skb)->end_seq -
446 TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss))
447 tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
448}
449
450/*
451 * This function should be called every time data is copied to user space.
452 * It calculates the appropriate TCP receive buffer space.
453 */
454void tcp_rcv_space_adjust(struct sock *sk)
455{
456 struct tcp_sock *tp = tcp_sk(sk);
457 int time;
458 int space;
459
460 if (tp->rcvq_space.time == 0)
461 goto new_measure;
462
463 time = tcp_time_stamp - tp->rcvq_space.time;
464 if (time < (tp->rcv_rtt_est.rtt >> 3) ||
465 tp->rcv_rtt_est.rtt == 0)
466 return;
467
468 space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
469
470 space = max(tp->rcvq_space.space, space);
471
472 if (tp->rcvq_space.space != space) {
473 int rcvmem;
474
475 tp->rcvq_space.space = space;
476
477 if (sysctl_tcp_moderate_rcvbuf) {
478 int new_clamp = space;
479
480 /* Receive space grows, normalize in order to
481 * take into account packet headers and sk_buff
482 * structure overhead.
483 */
484 space /= tp->advmss;
485 if (!space)
486 space = 1;
487 rcvmem = (tp->advmss + MAX_TCP_HEADER +
488 16 + sizeof(struct sk_buff));
489 while (tcp_win_from_space(rcvmem) < tp->advmss)
490 rcvmem += 128;
491 space *= rcvmem;
492 space = min(space, sysctl_tcp_rmem[2]);
493 if (space > sk->sk_rcvbuf) {
494 sk->sk_rcvbuf = space;
495
496 /* Make the window clamp follow along. */
497 tp->window_clamp = new_clamp;
498 }
499 }
500 }
501
502new_measure:
503 tp->rcvq_space.seq = tp->copied_seq;
504 tp->rcvq_space.time = tcp_time_stamp;
505}
506
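tcp_rcv_space_adjust() implements the DRS idea referenced above: assume the sender may double its rate every RTT, so provision receive buffer for twice the data copied to userspace in the last RTT, then inflate by the per-segment overhead. A rough worked sketch of that provisioning, with assumed values standing in for tp->advmss and the real skb/header overhead:

    #include <stdio.h>

    int main(void)
    {
            /* Assumed values; the kernel uses tp->advmss and the real
             * MAX_TCP_HEADER plus sizeof(struct sk_buff) here. */
            const int advmss = 1460;
            const int per_skb_overhead = 1500;     /* headers + sk_buff, rough */
            const int copied_last_rtt = 64 * 1024; /* bytes read by the app    */

            int space = 2 * copied_last_rtt;       /* allow one doubling/RTT   */
            int segs = space / advmss;
            int rcvbuf = segs * (advmss + per_skb_overhead);

            printf("copied %d -> target window %d, rcvbuf about %d bytes\n",
                   copied_last_rtt, space, rcvbuf);
            return 0;
    }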
507/* There is something which you must keep in mind when you analyze the
508 * behavior of the tp->ato delayed ack timeout interval. When a
509 * connection starts up, we want to ack as quickly as possible. The
510 * problem is that "good" TCP's do slow start at the beginning of data
511 * transmission. The means that until we send the first few ACK's the
512 * sender will sit on his end and only queue most of his data, because
513 * he can only send snd_cwnd unacked packets at any given time. For
514 * each ACK we send, he increments snd_cwnd and transmits more of his
515 * queue. -DaveM
516 */
517static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
518{
519 u32 now;
520
521 tcp_schedule_ack(tp);
522
523 tcp_measure_rcv_mss(tp, skb);
524
525 tcp_rcv_rtt_measure(tp);
526
527 now = tcp_time_stamp;
528
529 if (!tp->ack.ato) {
530 /* The _first_ data packet received, initialize
531 * delayed ACK engine.
532 */
533 tcp_incr_quickack(tp);
534 tp->ack.ato = TCP_ATO_MIN;
535 } else {
536 int m = now - tp->ack.lrcvtime;
537
538 if (m <= TCP_ATO_MIN/2) {
539 /* The fastest case is the first. */
540 tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2;
541 } else if (m < tp->ack.ato) {
542 tp->ack.ato = (tp->ack.ato>>1) + m;
543 if (tp->ack.ato > tp->rto)
544 tp->ack.ato = tp->rto;
545 } else if (m > tp->rto) {
546			/* Too long gap. Apparently sender failed to
547 * restart window, so that we send ACKs quickly.
548 */
549 tcp_incr_quickack(tp);
550 sk_stream_mem_reclaim(sk);
551 }
552 }
553 tp->ack.lrcvtime = now;
554
555 TCP_ECN_check_ce(tp, skb);
556
557 if (skb->len >= 128)
558 tcp_grow_window(sk, tp, skb);
559}
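/* Illustration of the ato update above, assuming TCP_ATO_MIN corresponds to
 * roughly 40 jiffies (an assumption for the example): if ato is currently
 * ~200 and data starts arriving back to back (m near 0), successive packets
 * move ato to 100+20 = 120, then 80, 60, 50, ... converging toward
 * TCP_ATO_MIN. A gap larger than rto instead re-enters quickack mode,
 * since the sender has apparently restarted its window.
 */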
560
561/* When starting a new connection, pin down the current choice of
562 * congestion algorithm.
563 */
564void tcp_ca_init(struct tcp_sock *tp)
565{
566 if (sysctl_tcp_westwood)
567 tp->adv_cong = TCP_WESTWOOD;
568 else if (sysctl_tcp_bic)
569 tp->adv_cong = TCP_BIC;
570 else if (sysctl_tcp_vegas_cong_avoid) {
571 tp->adv_cong = TCP_VEGAS;
572 tp->vegas.baseRTT = 0x7fffffff;
573 tcp_vegas_enable(tp);
574 }
575}
576
577/* Do RTT sampling needed for Vegas.
578 * Basically we:
579 * o min-filter RTT samples from within an RTT to get the current
580 * propagation delay + queuing delay (we are min-filtering to try to
581 * avoid the effects of delayed ACKs)
582 * o min-filter RTT samples from a much longer window (forever for now)
583 * to find the propagation delay (baseRTT)
584 */
585static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
586{
587 __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */
588
589 /* Filter to find propagation delay: */
590 if (vrtt < tp->vegas.baseRTT)
591 tp->vegas.baseRTT = vrtt;
592
593 /* Find the min RTT during the last RTT to find
594 * the current prop. delay + queuing delay:
595 */
596 tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
597 tp->vegas.cntRTT++;
598}
599
600/* Called to compute a smoothed rtt estimate. The data fed to this
601 * routine either comes from timestamps, or from segments that were
602 * known _not_ to have been retransmitted [see Karn/Partridge
603 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
604 * piece by Van Jacobson.
605 * NOTE: the next three routines used to be one big routine.
606 * To save cycles in the RFC 1323 implementation it was better to break
607 * it up into three procedures. -- erics
608 */
609static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
610{
611 long m = mrtt; /* RTT */
612
613 if (tcp_vegas_enabled(tp))
614 vegas_rtt_calc(tp, mrtt);
615
616 /* The following amusing code comes from Jacobson's
617 * article in SIGCOMM '88. Note that rtt and mdev
618 * are scaled versions of rtt and mean deviation.
619	 * This is designed to be as fast as possible;
620 * m stands for "measurement".
621 *
622	 * In a 1990 paper the rto value is changed to:
623 * RTO = rtt + 4 * mdev
624 *
625 * Funny. This algorithm seems to be very broken.
626	 * These formulae increase RTO when it should be decreased, increase it
627	 * too slowly when it should be increased quickly, decrease it too quickly,
628	 * etc. I guess in BSD RTO takes ONE value, so it absolutely
629	 * does not matter how to _calculate_ it. Seems it was a trap
630	 * that VJ failed to avoid. 8)
631 */
632 if(m == 0)
633 m = 1;
634 if (tp->srtt != 0) {
635 m -= (tp->srtt >> 3); /* m is now error in rtt est */
636 tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
637 if (m < 0) {
638 m = -m; /* m is now abs(error) */
639 m -= (tp->mdev >> 2); /* similar update on mdev */
640 /* This is similar to one of Eifel findings.
641 * Eifel blocks mdev updates when rtt decreases.
642 * This solution is a bit different: we use finer gain
643 * for mdev in this case (alpha*beta).
644 * Like Eifel it also prevents growth of rto,
645 * but also it limits too fast rto decreases,
646 * happening in pure Eifel.
647 */
648 if (m > 0)
649 m >>= 3;
650 } else {
651 m -= (tp->mdev >> 2); /* similar update on mdev */
652 }
653 tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
654 if (tp->mdev > tp->mdev_max) {
655 tp->mdev_max = tp->mdev;
656 if (tp->mdev_max > tp->rttvar)
657 tp->rttvar = tp->mdev_max;
658 }
659 if (after(tp->snd_una, tp->rtt_seq)) {
660 if (tp->mdev_max < tp->rttvar)
661 tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2;
662 tp->rtt_seq = tp->snd_nxt;
663 tp->mdev_max = TCP_RTO_MIN;
664 }
665 } else {
666 /* no previous measure. */
667 tp->srtt = m<<3; /* take the measured time to be rtt */
668 tp->mdev = m<<1; /* make sure rto = 3*rtt */
669 tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
670 tp->rtt_seq = tp->snd_nxt;
671 }
672
673 tcp_westwood_update_rtt(tp, tp->srtt >> 3);
674}
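/* A worked example of the scaled arithmetic above (the numbers are assumed):
 * srtt is kept scaled by 8 and mdev/rttvar effectively by 4. Suppose
 * srtt = 800 (~100 ticks) and mdev = 80 (~20 ticks), and a measurement of
 * 140 ticks arrives. Then m = 140 - 100 = 40, srtt becomes 840 (~105 ticks),
 * mdev becomes 80 + (40 - 20) = 100 (~25 ticks), and if rttvar follows mdev
 * the resulting rto = srtt/8 + rttvar = 105 + 100 = 205 ticks, i.e. roughly
 * rtt + 4*mdev as in the 1990 formula quoted above.
 */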
675
676/* Calculate rto without backoff. This is the second half of Van Jacobson's
677 * routine referred to above.
678 */
679static inline void tcp_set_rto(struct tcp_sock *tp)
680{
681 /* Old crap is replaced with new one. 8)
682 *
683 * More seriously:
684	 * 1. If rtt variance happened to be less than 50 msec, it is a hallucination.
685 * It cannot be less due to utterly erratic ACK generation made
686 * at least by solaris and freebsd. "Erratic ACKs" has _nothing_
687 * to do with delayed acks, because at cwnd>2 true delack timeout
688 * is invisible. Actually, Linux-2.4 also generates erratic
689	 *    ACKs in some circumstances.
690 */
691 tp->rto = (tp->srtt >> 3) + tp->rttvar;
692
693 /* 2. Fixups made earlier cannot be right.
694 * If we do not estimate RTO correctly without them,
695 * all the algo is pure shit and should be replaced
696	 *    with a correct one, which is exactly what we pretend to do.
697 */
698}
699
700/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
701 * guarantees that rto is higher.
702 */
703static inline void tcp_bound_rto(struct tcp_sock *tp)
704{
705 if (tp->rto > TCP_RTO_MAX)
706 tp->rto = TCP_RTO_MAX;
707}
708
709/* Save metrics learned by this TCP session.
710   This function is called only when TCP finishes successfully,
711 i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
712 */
713void tcp_update_metrics(struct sock *sk)
714{
715 struct tcp_sock *tp = tcp_sk(sk);
716 struct dst_entry *dst = __sk_dst_get(sk);
717
718 if (sysctl_tcp_nometrics_save)
719 return;
720
721 dst_confirm(dst);
722
723 if (dst && (dst->flags&DST_HOST)) {
724 int m;
725
726 if (tp->backoff || !tp->srtt) {
727 /* This session failed to estimate rtt. Why?
728 * Probably, no packets returned in time.
729 * Reset our results.
730 */
731 if (!(dst_metric_locked(dst, RTAX_RTT)))
732 dst->metrics[RTAX_RTT-1] = 0;
733 return;
734 }
735
736 m = dst_metric(dst, RTAX_RTT) - tp->srtt;
737
738		/* If the newly calculated rtt is larger than the stored one,
739		 * store the new one. Otherwise, use EWMA. Remember,
740 * rtt overestimation is always better than underestimation.
741 */
742 if (!(dst_metric_locked(dst, RTAX_RTT))) {
743 if (m <= 0)
744 dst->metrics[RTAX_RTT-1] = tp->srtt;
745 else
746 dst->metrics[RTAX_RTT-1] -= (m>>3);
747 }
748
749 if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
750 if (m < 0)
751 m = -m;
752
753 /* Scale deviation to rttvar fixed point */
754 m >>= 1;
755 if (m < tp->mdev)
756 m = tp->mdev;
757
758 if (m >= dst_metric(dst, RTAX_RTTVAR))
759 dst->metrics[RTAX_RTTVAR-1] = m;
760 else
761 dst->metrics[RTAX_RTTVAR-1] -=
762 (dst->metrics[RTAX_RTTVAR-1] - m)>>2;
763 }
764
765 if (tp->snd_ssthresh >= 0xFFFF) {
766 /* Slow start still did not finish. */
767 if (dst_metric(dst, RTAX_SSTHRESH) &&
768 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
769 (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
770 dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1;
771 if (!dst_metric_locked(dst, RTAX_CWND) &&
772 tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
773 dst->metrics[RTAX_CWND-1] = tp->snd_cwnd;
774 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
775 tp->ca_state == TCP_CA_Open) {
776 /* Cong. avoidance phase, cwnd is reliable. */
777 if (!dst_metric_locked(dst, RTAX_SSTHRESH))
778 dst->metrics[RTAX_SSTHRESH-1] =
779 max(tp->snd_cwnd >> 1, tp->snd_ssthresh);
780 if (!dst_metric_locked(dst, RTAX_CWND))
781 dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_cwnd) >> 1;
782 } else {
783			/* Else slow start did not finish, cwnd is nonsense,
784			   ssthresh may also be invalid.
785 */
786 if (!dst_metric_locked(dst, RTAX_CWND))
787 dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_ssthresh) >> 1;
788 if (dst->metrics[RTAX_SSTHRESH-1] &&
789 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
790 tp->snd_ssthresh > dst->metrics[RTAX_SSTHRESH-1])
791 dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh;
792 }
793
794 if (!dst_metric_locked(dst, RTAX_REORDERING)) {
795 if (dst->metrics[RTAX_REORDERING-1] < tp->reordering &&
796 tp->reordering != sysctl_tcp_reordering)
797 dst->metrics[RTAX_REORDERING-1] = tp->reordering;
798 }
799 }
800}
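/* Rough example of the RTAX_RTT smoothing above (the values are assumed):
 * if the cached destination RTT is 960 and this session measured
 * srtt = 800, then m = 160 > 0 and the cache moves 1/8 of the way down,
 * to 960 - 160/8 = 940. If the session's srtt had been larger than the
 * cache, the cached value would simply be replaced, since RTT
 * overestimation is preferred to underestimation.
 */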
801
802/* Numbers are taken from RFC2414. */
803__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
804{
805 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
806
807 if (!cwnd) {
808 if (tp->mss_cache_std > 1460)
809 cwnd = 2;
810 else
811 cwnd = (tp->mss_cache_std > 1095) ? 3 : 4;
812 }
813 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
814}
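/* Examples of the RFC2414 table above (the MSS values are illustrative):
 * with no cached RTAX_INITCWND, mss_cache_std = 536 gives cwnd = 4,
 * 1460 gives cwnd = 3, and a jumbo 8960 gives cwnd = 2, always capped
 * by snd_cwnd_clamp.
 */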
815
816/* Initialize metrics on socket. */
817
818static void tcp_init_metrics(struct sock *sk)
819{
820 struct tcp_sock *tp = tcp_sk(sk);
821 struct dst_entry *dst = __sk_dst_get(sk);
822
823 if (dst == NULL)
824 goto reset;
825
826 dst_confirm(dst);
827
828 if (dst_metric_locked(dst, RTAX_CWND))
829 tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
830 if (dst_metric(dst, RTAX_SSTHRESH)) {
831 tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
832 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
833 tp->snd_ssthresh = tp->snd_cwnd_clamp;
834 }
835 if (dst_metric(dst, RTAX_REORDERING) &&
836 tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
837 tp->rx_opt.sack_ok &= ~2;
838 tp->reordering = dst_metric(dst, RTAX_REORDERING);
839 }
840
841 if (dst_metric(dst, RTAX_RTT) == 0)
842 goto reset;
843
844 if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3))
845 goto reset;
846
847 /* Initial rtt is determined from SYN,SYN-ACK.
848 * The segment is small and rtt may appear much
849	 * less than the real one. Use per-dst memory
850 * to make it more realistic.
851 *
852	 * A bit of theory. RTT is the time that passes after a "normal" sized
853	 * packet is sent until it is ACKed. In normal circumstances sending small
854	 * packets forces the peer to delay ACKs and the calculation is correct too.
855	 * The algorithm is adaptive and, provided we follow specs, it
856	 * NEVER underestimates RTT. BUT! If the peer tries some clever
857	 * tricks, sort of "quick acks", for long enough to decrease RTT
858	 * to a low value, and then abruptly stops doing it and starts to delay
859	 * ACKs, expect trouble.
860 */
861 if (dst_metric(dst, RTAX_RTT) > tp->srtt) {
862 tp->srtt = dst_metric(dst, RTAX_RTT);
863 tp->rtt_seq = tp->snd_nxt;
864 }
865 if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) {
866 tp->mdev = dst_metric(dst, RTAX_RTTVAR);
867 tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
868 }
869 tcp_set_rto(tp);
870 tcp_bound_rto(tp);
871 if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
872 goto reset;
873 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
874 tp->snd_cwnd_stamp = tcp_time_stamp;
875 return;
876
877reset:
878 /* Play conservative. If timestamps are not
879 * supported, TCP will fail to recalculate correct
880 * rtt, if initial rto is too small. FORGET ALL AND RESET!
881 */
882 if (!tp->rx_opt.saw_tstamp && tp->srtt) {
883 tp->srtt = 0;
884 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
885 tp->rto = TCP_TIMEOUT_INIT;
886 }
887}
888
889static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
890{
891 if (metric > tp->reordering) {
892 tp->reordering = min(TCP_MAX_REORDERING, metric);
893
894		/* This exciting event is worth remembering. 8) */
895 if (ts)
896 NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER);
897 else if (IsReno(tp))
898 NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER);
899 else if (IsFack(tp))
900 NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER);
901 else
902 NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER);
903#if FASTRETRANS_DEBUG > 1
904 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
905 tp->rx_opt.sack_ok, tp->ca_state,
906 tp->reordering,
907 tp->fackets_out,
908 tp->sacked_out,
909 tp->undo_marker ? tp->undo_retrans : 0);
910#endif
911		/* Disable FACK for now. */
912 tp->rx_opt.sack_ok &= ~2;
913 }
914}
915
916/* This procedure tags the retransmission queue when SACKs arrive.
917 *
918 * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
919 * Packets in queue with these bits set are counted in variables
920 * sacked_out, retrans_out and lost_out, correspondingly.
921 *
922 * Valid combinations are:
923 * Tag InFlight Description
924 * 0 1 - orig segment is in flight.
925 * S 0 - nothing flies, orig reached receiver.
926 * L 0 - nothing flies, orig lost by net.
927 * R 2 - both orig and retransmit are in flight.
928 * L|R 1 - orig is lost, retransmit is in flight.
929 * S|R 1 - orig reached receiver, retrans is still in flight.
930 * (L|S|R is logically valid, it could occur when L|R is sacked,
931 *    but it is equivalent to plain S and the code short-circuits it to S.
932 * L|S is logically invalid, it would mean -1 packet in flight 8))
933 *
934 * These 6 states form a finite state machine, controlled by the following events:
935 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
936 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
937 * 3. Loss detection event of one of three flavors:
938 * A. Scoreboard estimator decided the packet is lost.
939 * A'. Reno "three dupacks" marks head of queue lost.
940 *	A''. Its FACK modification, head until snd.fack is lost.
941 * B. SACK arrives sacking data transmitted after never retransmitted
942 * hole was sent out.
943 * C. SACK arrives sacking SND.NXT at the moment, when the
944 * segment was retransmitted.
945 * 4. D-SACK added new rule: D-SACK changes any tag to S.
946 *
947 * It is pleasant to note that the state diagram turns out to be commutative,
948 * so that we are allowed not to be bothered by order of our actions,
949 * when multiple events arrive simultaneously. (see the function below).
950 *
951 * Reordering detection.
952 * --------------------
953 * The reordering metric is the maximal distance by which a packet can be
954 * displaced in the packet stream. With SACKs we can estimate it:
955 *
956 * 1. SACK fills old hole and the corresponding segment was not
957 * ever retransmitted -> reordering. Alas, we cannot use it
958 * when segment was retransmitted.
959 * 2. The last flaw is solved with D-SACK. D-SACK arrives
960 *	   for a retransmitted and already SACKed segment -> reordering.
961 * Neither of these heuristics is used in the Loss state, when we cannot
962 * account for retransmits accurately.
963 */
964static int
965tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
966{
967 struct tcp_sock *tp = tcp_sk(sk);
968 unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked;
969 struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2);
970 int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
971 int reord = tp->packets_out;
972 int prior_fackets;
973 u32 lost_retrans = 0;
974 int flag = 0;
975 int i;
976
977 /* So, SACKs for already sent large segments will be lost.
978	 * Not good, but the alternative is to resegment the queue. */
979 if (sk->sk_route_caps & NETIF_F_TSO) {
980 sk->sk_route_caps &= ~NETIF_F_TSO;
981 sock_set_flag(sk, SOCK_NO_LARGESEND);
982 tp->mss_cache = tp->mss_cache_std;
983 }
984
985 if (!tp->sacked_out)
986 tp->fackets_out = 0;
987 prior_fackets = tp->fackets_out;
988
989 for (i=0; i<num_sacks; i++, sp++) {
990 struct sk_buff *skb;
991 __u32 start_seq = ntohl(sp->start_seq);
992 __u32 end_seq = ntohl(sp->end_seq);
993 int fack_count = 0;
994 int dup_sack = 0;
995
996 /* Check for D-SACK. */
997 if (i == 0) {
998 u32 ack = TCP_SKB_CB(ack_skb)->ack_seq;
999
1000 if (before(start_seq, ack)) {
1001 dup_sack = 1;
1002 tp->rx_opt.sack_ok |= 4;
1003 NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV);
1004 } else if (num_sacks > 1 &&
1005 !after(end_seq, ntohl(sp[1].end_seq)) &&
1006 !before(start_seq, ntohl(sp[1].start_seq))) {
1007 dup_sack = 1;
1008 tp->rx_opt.sack_ok |= 4;
1009 NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV);
1010 }
1011
1012 /* D-SACK for already forgotten data...
1013 * Do dumb counting. */
1014 if (dup_sack &&
1015 !after(end_seq, prior_snd_una) &&
1016 after(end_seq, tp->undo_marker))
1017 tp->undo_retrans--;
1018
1019 /* Eliminate too old ACKs, but take into
1020 * account more or less fresh ones, they can
1021 * contain valid SACK info.
1022 */
1023 if (before(ack, prior_snd_una - tp->max_window))
1024 return 0;
1025 }
1026
1027 /* Event "B" in the comment above. */
1028 if (after(end_seq, tp->high_seq))
1029 flag |= FLAG_DATA_LOST;
1030
1031 sk_stream_for_retrans_queue(skb, sk) {
1032 u8 sacked = TCP_SKB_CB(skb)->sacked;
1033 int in_sack;
1034
1035 /* The retransmission queue is always in order, so
1036 * we can short-circuit the walk early.
1037 */
1038 if(!before(TCP_SKB_CB(skb)->seq, end_seq))
1039 break;
1040
1041 fack_count += tcp_skb_pcount(skb);
1042
1043 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1044 !before(end_seq, TCP_SKB_CB(skb)->end_seq);
1045
1046 /* Account D-SACK for retransmitted packet. */
1047 if ((dup_sack && in_sack) &&
1048 (sacked & TCPCB_RETRANS) &&
1049 after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
1050 tp->undo_retrans--;
1051
1052 /* The frame is ACKed. */
1053 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) {
1054 if (sacked&TCPCB_RETRANS) {
1055 if ((dup_sack && in_sack) &&
1056 (sacked&TCPCB_SACKED_ACKED))
1057 reord = min(fack_count, reord);
1058 } else {
1059 /* If it was in a hole, we detected reordering. */
1060 if (fack_count < prior_fackets &&
1061 !(sacked&TCPCB_SACKED_ACKED))
1062 reord = min(fack_count, reord);
1063 }
1064
1065 /* Nothing to do; acked frame is about to be dropped. */
1066 continue;
1067 }
1068
1069 if ((sacked&TCPCB_SACKED_RETRANS) &&
1070 after(end_seq, TCP_SKB_CB(skb)->ack_seq) &&
1071 (!lost_retrans || after(end_seq, lost_retrans)))
1072 lost_retrans = end_seq;
1073
1074 if (!in_sack)
1075 continue;
1076
1077 if (!(sacked&TCPCB_SACKED_ACKED)) {
1078 if (sacked & TCPCB_SACKED_RETRANS) {
1079 /* If the segment is not tagged as lost,
1080 * we do not clear RETRANS, believing
1081 * that retransmission is still in flight.
1082 */
1083 if (sacked & TCPCB_LOST) {
1084 TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
1085 tp->lost_out -= tcp_skb_pcount(skb);
1086 tp->retrans_out -= tcp_skb_pcount(skb);
1087 }
1088 } else {
1089 /* New sack for not retransmitted frame,
1090 * which was in hole. It is reordering.
1091 */
1092 if (!(sacked & TCPCB_RETRANS) &&
1093 fack_count < prior_fackets)
1094 reord = min(fack_count, reord);
1095
1096 if (sacked & TCPCB_LOST) {
1097 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
1098 tp->lost_out -= tcp_skb_pcount(skb);
1099 }
1100 }
1101
1102 TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
1103 flag |= FLAG_DATA_SACKED;
1104 tp->sacked_out += tcp_skb_pcount(skb);
1105
1106 if (fack_count > tp->fackets_out)
1107 tp->fackets_out = fack_count;
1108 } else {
1109 if (dup_sack && (sacked&TCPCB_RETRANS))
1110 reord = min(fack_count, reord);
1111 }
1112
1113 /* D-SACK. We can detect redundant retransmission
1114 * in S|R and plain R frames and clear it.
1115 * undo_retrans is decreased above, L|R frames
1116 * are accounted above as well.
1117 */
1118 if (dup_sack &&
1119 (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
1120 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1121 tp->retrans_out -= tcp_skb_pcount(skb);
1122 }
1123 }
1124 }
1125
1126 /* Check for lost retransmit. This superb idea is
1127 * borrowed from "ratehalving". Event "C".
1128 * Later note: FACK people cheated me again 8),
1129 * we have to account for reordering! Ugly,
1130 * but should help.
1131 */
1132 if (lost_retrans && tp->ca_state == TCP_CA_Recovery) {
1133 struct sk_buff *skb;
1134
1135 sk_stream_for_retrans_queue(skb, sk) {
1136 if (after(TCP_SKB_CB(skb)->seq, lost_retrans))
1137 break;
1138 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1139 continue;
1140 if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) &&
1141 after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) &&
1142 (IsFack(tp) ||
1143 !before(lost_retrans,
1144 TCP_SKB_CB(skb)->ack_seq + tp->reordering *
1145 tp->mss_cache_std))) {
1146 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1147 tp->retrans_out -= tcp_skb_pcount(skb);
1148
1149 if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
1150 tp->lost_out += tcp_skb_pcount(skb);
1151 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1152 flag |= FLAG_DATA_SACKED;
1153 NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT);
1154 }
1155 }
1156 }
1157 }
1158
1159 tp->left_out = tp->sacked_out + tp->lost_out;
1160
1161 if ((reord < tp->fackets_out) && tp->ca_state != TCP_CA_Loss)
1162 tcp_update_reordering(tp, ((tp->fackets_out + 1) - reord), 0);
1163
1164#if FASTRETRANS_DEBUG > 0
1165 BUG_TRAP((int)tp->sacked_out >= 0);
1166 BUG_TRAP((int)tp->lost_out >= 0);
1167 BUG_TRAP((int)tp->retrans_out >= 0);
1168 BUG_TRAP((int)tcp_packets_in_flight(tp) >= 0);
1169#endif
1170 return flag;
1171}
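/* A rough reading of the reordering bookkeeping above (the numbers are
 * assumed): if the forward-most SACK so far already covered the 8th segment
 * of the queue (fackets_out = 8) and a new SACK fills a never-retransmitted
 * hole only 3 segments from the head, reord drops to 3 and, outside Loss
 * state, tcp_update_reordering() is fed (8 + 1) - 3 = 6, the distance by
 * which that segment was displaced.
 */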
1172
1173/* RTO occurred, but do not yet enter loss state. Instead, transmit two new
1174 * segments to see from the next ACKs whether any data was really missing.
1175 * If the RTO was spurious, new ACKs should arrive.
1176 */
1177void tcp_enter_frto(struct sock *sk)
1178{
1179 struct tcp_sock *tp = tcp_sk(sk);
1180 struct sk_buff *skb;
1181
1182 tp->frto_counter = 1;
1183
1184 if (tp->ca_state <= TCP_CA_Disorder ||
1185 tp->snd_una == tp->high_seq ||
1186 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
1187 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1188 if (!tcp_westwood_ssthresh(tp))
1189 tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
1190 }
1191
1192 /* Have to clear retransmission markers here to keep the bookkeeping
1193 * in shape, even though we are not yet in Loss state.
1194 * If something was really lost, it is eventually caught up
1195 * in tcp_enter_frto_loss.
1196 */
1197 tp->retrans_out = 0;
1198 tp->undo_marker = tp->snd_una;
1199 tp->undo_retrans = 0;
1200
1201 sk_stream_for_retrans_queue(skb, sk) {
1202 TCP_SKB_CB(skb)->sacked &= ~TCPCB_RETRANS;
1203 }
1204 tcp_sync_left_out(tp);
1205
1206 tcp_set_ca_state(tp, TCP_CA_Open);
1207 tp->frto_highmark = tp->snd_nxt;
1208}
1209
1210/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
1211 * which indicates that we should follow the traditional RTO recovery,
1212 * i.e. mark everything lost and do go-back-N retransmission.
1213 */
1214static void tcp_enter_frto_loss(struct sock *sk)
1215{
1216 struct tcp_sock *tp = tcp_sk(sk);
1217 struct sk_buff *skb;
1218 int cnt = 0;
1219
1220 tp->sacked_out = 0;
1221 tp->lost_out = 0;
1222 tp->fackets_out = 0;
1223
1224 sk_stream_for_retrans_queue(skb, sk) {
1225 cnt += tcp_skb_pcount(skb);
1226 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
1227 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
1228
1229 /* Do not mark those segments lost that were
1230 * forward transmitted after RTO
1231 */
1232 if (!after(TCP_SKB_CB(skb)->end_seq,
1233 tp->frto_highmark)) {
1234 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1235 tp->lost_out += tcp_skb_pcount(skb);
1236 }
1237 } else {
1238 tp->sacked_out += tcp_skb_pcount(skb);
1239 tp->fackets_out = cnt;
1240 }
1241 }
1242 tcp_sync_left_out(tp);
1243
1244 tp->snd_cwnd = tp->frto_counter + tcp_packets_in_flight(tp)+1;
1245 tp->snd_cwnd_cnt = 0;
1246 tp->snd_cwnd_stamp = tcp_time_stamp;
1247 tp->undo_marker = 0;
1248 tp->frto_counter = 0;
1249
1250 tp->reordering = min_t(unsigned int, tp->reordering,
1251 sysctl_tcp_reordering);
1252 tcp_set_ca_state(tp, TCP_CA_Loss);
1253 tp->high_seq = tp->frto_highmark;
1254 TCP_ECN_queue_cwr(tp);
1255
1256 init_bictcp(tp);
1257}
1258
1259void tcp_clear_retrans(struct tcp_sock *tp)
1260{
1261 tp->left_out = 0;
1262 tp->retrans_out = 0;
1263
1264 tp->fackets_out = 0;
1265 tp->sacked_out = 0;
1266 tp->lost_out = 0;
1267
1268 tp->undo_marker = 0;
1269 tp->undo_retrans = 0;
1270}
1271
1272/* Enter Loss state. If "how" is not zero, forget all SACK information
1273 * and reset tags completely, otherwise preserve SACKs. If receiver
1274 * dropped its ofo queue, we will know this due to reneging detection.
1275 */
1276void tcp_enter_loss(struct sock *sk, int how)
1277{
1278 struct tcp_sock *tp = tcp_sk(sk);
1279 struct sk_buff *skb;
1280 int cnt = 0;
1281
1282 /* Reduce ssthresh if it has not yet been made inside this window. */
1283 if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
1284 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
1285 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1286 tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
1287 }
1288 tp->snd_cwnd = 1;
1289 tp->snd_cwnd_cnt = 0;
1290 tp->snd_cwnd_stamp = tcp_time_stamp;
1291
1292 tcp_clear_retrans(tp);
1293
1294 /* Push undo marker, if it was plain RTO and nothing
1295 * was retransmitted. */
1296 if (!how)
1297 tp->undo_marker = tp->snd_una;
1298
1299 sk_stream_for_retrans_queue(skb, sk) {
1300 cnt += tcp_skb_pcount(skb);
1301 if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
1302 tp->undo_marker = 0;
1303 TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
1304 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
1305 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
1306 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1307 tp->lost_out += tcp_skb_pcount(skb);
1308 } else {
1309 tp->sacked_out += tcp_skb_pcount(skb);
1310 tp->fackets_out = cnt;
1311 }
1312 }
1313 tcp_sync_left_out(tp);
1314
1315 tp->reordering = min_t(unsigned int, tp->reordering,
1316 sysctl_tcp_reordering);
1317 tcp_set_ca_state(tp, TCP_CA_Loss);
1318 tp->high_seq = tp->snd_nxt;
1319 TCP_ECN_queue_cwr(tp);
1320}
1321
1322static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp)
1323{
1324 struct sk_buff *skb;
1325
1326 /* If ACK arrived pointing to a remembered SACK,
1327 * it means that our remembered SACKs do not reflect
1328	 * the real state of the receiver, i.e.
1329 * receiver _host_ is heavily congested (or buggy).
1330 * Do processing similar to RTO timeout.
1331 */
1332 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL &&
1333 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
1334 NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);
1335
1336 tcp_enter_loss(sk, 1);
1337 tp->retransmits++;
1338 tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
1339 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1340 return 1;
1341 }
1342 return 0;
1343}
1344
1345static inline int tcp_fackets_out(struct tcp_sock *tp)
1346{
1347 return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out;
1348}
1349
1350static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb)
1351{
1352 return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto);
1353}
1354
1355static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp)
1356{
1357 return tp->packets_out &&
1358 tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue));
1359}
1360
1361/* Linux NewReno/SACK/FACK/ECN state machine.
1362 * --------------------------------------
1363 *
1364 * "Open" Normal state, no dubious events, fast path.
1365 * "Disorder"   In all respects it is "Open",
1366 *		but requires a bit more attention. It is entered when
1367 *		we see some SACKs or dupacks. It is split off from "Open"
1368 *		mainly to move some processing from the fast path to the slow one.
1369 * "CWR" CWND was reduced due to some Congestion Notification event.
1370 * It can be ECN, ICMP source quench, local device congestion.
1371 * "Recovery" CWND was reduced, we are fast-retransmitting.
1372 * "Loss" CWND was reduced due to RTO timeout or SACK reneging.
1373 *
1374 * tcp_fastretrans_alert() is entered:
1375 * - each incoming ACK, if state is not "Open"
1376 * - when arrived ACK is unusual, namely:
1377 * * SACK
1378 * * Duplicate ACK.
1379 * * ECN ECE.
1380 *
1381 * Counting packets in flight is pretty simple.
1382 *
1383 * in_flight = packets_out - left_out + retrans_out
1384 *
1385 * packets_out is SND.NXT-SND.UNA counted in packets.
1386 *
1387 * retrans_out is number of retransmitted segments.
1388 *
1389 *	left_out is the number of segments that left the network, but are not ACKed yet.
1390 *
1391 * left_out = sacked_out + lost_out
1392 *
1393 *	sacked_out: Packets which arrived at the receiver out of order
1394 *		and hence not ACKed. With SACKs this number is simply the
1395 *		amount of SACKed data. Even without SACKs
1396 *		it is easy to give a pretty reliable estimate of this number
1397 *		by counting duplicate ACKs.
1398 *
1399 * lost_out: Packets lost by network. TCP has no explicit
1400 *		"loss notification" feedback from the network (for now).
1401 *		It means that this number can only be _guessed_.
1402 *		Actually, it is the heuristic used to predict losses that
1403 *		distinguishes the different algorithms.
1404 *
1405 * F.e. after RTO, when all the queue is considered as lost,
1406 * lost_out = packets_out and in_flight = retrans_out.
1407 *
1408 * Essentially, we have now two algorithms counting
1409 * lost packets.
1410 *
1411 *		FACK: It is the simplest heuristic. As soon as we decide
1412 *		that something is lost, we decide that _all_ not-SACKed
1413 *		packets up to the most forward SACK are lost. I.e.
1414 *		lost_out = fackets_out - sacked_out and left_out = fackets_out.
1415 *		It is an absolutely correct estimate, if the network does not reorder
1416 * packets. And it loses any connection to reality when reordering
1417 * takes place. We use FACK by default until reordering
1418 * is suspected on the path to this destination.
1419 *
1420 * NewReno: when Recovery is entered, we assume that one segment
1421 * is lost (classic Reno). While we are in Recovery and
1422 * a partial ACK arrives, we assume that one more packet
1423 *		is lost (NewReno). These heuristics are the same in NewReno
1424 * and SACK.
1425 *
1426 * Imagine, that's all! Forget about all this shamanism about CWND inflation,
1427 * deflation etc. CWND is the real congestion window, never inflated; it changes
1428 * only according to classic VJ rules.
1429 *
1430 * The really tricky (and carefully tuned) part of the algorithm
1431 * is hidden in the functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
1432 * The first determines the moment _when_ we should reduce CWND and,
1433 * hence, slow down forward transmission. In fact, it determines the moment
1434 * when we decide that hole is caused by loss, rather than by a reorder.
1435 *
1436 * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
1437 * holes, caused by lost packets.
1438 *
1439 * And the most logically complicated part of the algorithm is the undo
1440 * heuristics. We detect false retransmits due to both too early
1441 * fast retransmit (reordering) and underestimated RTO, analyzing
1442 * timestamps and D-SACKs. When we detect that some segments were
1443 * retransmitted by mistake and CWND reduction was wrong, we undo
1444 * window reduction and abort recovery phase. This logic is hidden
1445 * inside several functions named tcp_try_undo_<something>.
1446 */
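/* A small worked example of the accounting above (the numbers are assumed):
 * with packets_out = 10, of which 3 are SACKed (sacked_out = 3), 2 are
 * marked lost (lost_out = 2) and 1 has been retransmitted
 * (retrans_out = 1), left_out = 3 + 2 = 5 and
 * in_flight = 10 - 5 + 1 = 6 segments still presumed in the network.
 */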
1447
1448/* This function decides when we should leave the Disorder state
1449 * and enter Recovery phase, reducing congestion window.
1450 *
1451 * Main question: may we further continue forward transmission
1452 * with the same cwnd?
1453 */
1454static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp)
1455{
1456 __u32 packets_out;
1457
1458 /* Trick#1: The loss is proven. */
1459 if (tp->lost_out)
1460 return 1;
1461
1462 /* Not-A-Trick#2 : Classic rule... */
1463 if (tcp_fackets_out(tp) > tp->reordering)
1464 return 1;
1465
1466 /* Trick#3 : when we use RFC2988 timer restart, fast
1467 * retransmit can be triggered by timeout of queue head.
1468 */
1469 if (tcp_head_timedout(sk, tp))
1470 return 1;
1471
1472 /* Trick#4: It is still not OK... But will it be useful to delay
1473 * recovery more?
1474 */
1475 packets_out = tp->packets_out;
1476 if (packets_out <= tp->reordering &&
1477 tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
1478 !tcp_may_send_now(sk, tp)) {
1479 /* We have nothing to send. This connection is limited
1480 * either by receiver window or by application.
1481 */
1482 return 1;
1483 }
1484
1485 return 0;
1486}
1487
1488/* If we receive more dupacks than we expected while counting segments
1489 * under the assumption of no reordering, interpret this as reordering.
1490 * The only other reason could be a bug in the receiver TCP.
1491 */
1492static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend)
1493{
1494 u32 holes;
1495
1496 holes = max(tp->lost_out, 1U);
1497 holes = min(holes, tp->packets_out);
1498
1499 if ((tp->sacked_out + holes) > tp->packets_out) {
1500 tp->sacked_out = tp->packets_out - holes;
1501 tcp_update_reordering(tp, tp->packets_out+addend, 0);
1502 }
1503}
1504
1505/* Emulate SACKs for SACKless connection: account for a new dupack. */
1506
1507static void tcp_add_reno_sack(struct tcp_sock *tp)
1508{
1509 tp->sacked_out++;
1510 tcp_check_reno_reordering(tp, 0);
1511 tcp_sync_left_out(tp);
1512}
1513
1514/* Account for ACK, ACKing some data in Reno Recovery phase. */
1515
1516static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acked)
1517{
1518 if (acked > 0) {
1519 /* One ACK acked hole. The rest eat duplicate ACKs. */
1520 if (acked-1 >= tp->sacked_out)
1521 tp->sacked_out = 0;
1522 else
1523 tp->sacked_out -= acked-1;
1524 }
1525 tcp_check_reno_reordering(tp, acked);
1526 tcp_sync_left_out(tp);
1527}
1528
1529static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
1530{
1531 tp->sacked_out = 0;
1532 tp->left_out = tp->lost_out;
1533}
1534
1535/* Mark head of queue up as lost. */
1536static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
1537 int packets, u32 high_seq)
1538{
1539 struct sk_buff *skb;
1540 int cnt = packets;
1541
1542 BUG_TRAP(cnt <= tp->packets_out);
1543
1544 sk_stream_for_retrans_queue(skb, sk) {
1545 cnt -= tcp_skb_pcount(skb);
1546 if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq))
1547 break;
1548 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
1549 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1550 tp->lost_out += tcp_skb_pcount(skb);
1551 }
1552 }
1553 tcp_sync_left_out(tp);
1554}
1555
1556/* Account newly detected lost packet(s) */
1557
1558static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
1559{
1560 if (IsFack(tp)) {
1561 int lost = tp->fackets_out - tp->reordering;
1562 if (lost <= 0)
1563 lost = 1;
1564 tcp_mark_head_lost(sk, tp, lost, tp->high_seq);
1565 } else {
1566 tcp_mark_head_lost(sk, tp, 1, tp->high_seq);
1567 }
1568
1569	/* New heuristic: it became possible only after we switched
1570	 * to restarting the timer each time something is ACKed.
1571	 * Hence, we can detect timed-out packets during fast
1572	 * retransmit without falling back to slow start.
1573 */
1574 if (tcp_head_timedout(sk, tp)) {
1575 struct sk_buff *skb;
1576
1577 sk_stream_for_retrans_queue(skb, sk) {
1578 if (tcp_skb_timedout(tp, skb) &&
1579 !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
1580 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1581 tp->lost_out += tcp_skb_pcount(skb);
1582 }
1583 }
1584 tcp_sync_left_out(tp);
1585 }
1586}
1587
1588/* CWND moderation, preventing bursts due to too big ACKs
1589 * in dubious situations.
1590 */
1591static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
1592{
1593 tp->snd_cwnd = min(tp->snd_cwnd,
1594 tcp_packets_in_flight(tp)+tcp_max_burst(tp));
1595 tp->snd_cwnd_stamp = tcp_time_stamp;
1596}
1597
1598/* Decrease cwnd each second ack. */
1599
1600static void tcp_cwnd_down(struct tcp_sock *tp)
1601{
1602 int decr = tp->snd_cwnd_cnt + 1;
1603 __u32 limit;
1604
1605 /*
1606 * TCP Westwood
1607	 * Here the limit is evaluated as BWestimation*RTTmin (to obtain it
1608	 * in packets we use mss_cache). If sysctl_tcp_westwood is off,
1609	 * tcp_westwood_bw_rttmin() returns 0. In that case snd_ssthresh is
1610 * still used as usual. It prevents other strange cases in which
1611 * BWE*RTTmin could assume value 0. It should not happen but...
1612 */
1613
1614 if (!(limit = tcp_westwood_bw_rttmin(tp)))
1615 limit = tp->snd_ssthresh/2;
1616
1617 tp->snd_cwnd_cnt = decr&1;
1618 decr >>= 1;
1619
1620 if (decr && tp->snd_cwnd > limit)
1621 tp->snd_cwnd -= decr;
1622
1623 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
1624 tp->snd_cwnd_stamp = tcp_time_stamp;
1625}
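/* Rough example of the rate halving above (the numbers are assumed): suppose
 * we enter CWR/Recovery with snd_cwnd = 20 and snd_ssthresh = 10, so
 * limit = 5 (or BWE*RTTmin for Westwood). Every second ACK then takes one
 * segment off snd_cwnd while it stays above the limit, and snd_cwnd is also
 * clamped to packets_in_flight + 1, so the sending rate is roughly halved
 * over the course of one RTT.
 */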
1626
1627/* Nothing was retransmitted, or the returned timestamp is less
1628 * than the timestamp of the first retransmission.
1629 */
1630static inline int tcp_packet_delayed(struct tcp_sock *tp)
1631{
1632 return !tp->retrans_stamp ||
1633 (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
1634 (__s32)(tp->rx_opt.rcv_tsecr - tp->retrans_stamp) < 0);
1635}
1636
1637/* Undo procedures. */
1638
1639#if FASTRETRANS_DEBUG > 1
1640static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
1641{
1642 struct inet_sock *inet = inet_sk(sk);
1643 printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n",
1644 msg,
1645 NIPQUAD(inet->daddr), ntohs(inet->dport),
1646 tp->snd_cwnd, tp->left_out,
1647 tp->snd_ssthresh, tp->prior_ssthresh,
1648 tp->packets_out);
1649}
1650#else
1651#define DBGUNDO(x...) do { } while (0)
1652#endif
1653
1654static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
1655{
1656 if (tp->prior_ssthresh) {
1657 if (tcp_is_bic(tp))
1658 tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd);
1659 else
1660 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
1661
1662 if (undo && tp->prior_ssthresh > tp->snd_ssthresh) {
1663 tp->snd_ssthresh = tp->prior_ssthresh;
1664 TCP_ECN_withdraw_cwr(tp);
1665 }
1666 } else {
1667 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
1668 }
1669 tcp_moderate_cwnd(tp);
1670 tp->snd_cwnd_stamp = tcp_time_stamp;
1671}
1672
1673static inline int tcp_may_undo(struct tcp_sock *tp)
1674{
1675 return tp->undo_marker &&
1676 (!tp->undo_retrans || tcp_packet_delayed(tp));
1677}
1678
1679/* People celebrate: "We love our President!" */
1680static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
1681{
1682 if (tcp_may_undo(tp)) {
1683 /* Happy end! We did not retransmit anything
1684 * or our original transmission succeeded.
1685 */
1686 DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans");
1687 tcp_undo_cwr(tp, 1);
1688 if (tp->ca_state == TCP_CA_Loss)
1689 NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
1690 else
1691 NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO);
1692 tp->undo_marker = 0;
1693 }
1694 if (tp->snd_una == tp->high_seq && IsReno(tp)) {
1695 /* Hold old state until something *above* high_seq
1696		 * is ACKed. For Reno it is a MUST to prevent false
1697 * fast retransmits (RFC2582). SACK TCP is safe. */
1698 tcp_moderate_cwnd(tp);
1699 return 1;
1700 }
1701 tcp_set_ca_state(tp, TCP_CA_Open);
1702 return 0;
1703}
1704
1705/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
1706static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp)
1707{
1708 if (tp->undo_marker && !tp->undo_retrans) {
1709 DBGUNDO(sk, tp, "D-SACK");
1710 tcp_undo_cwr(tp, 1);
1711 tp->undo_marker = 0;
1712 NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO);
1713 }
1714}
1715
1716/* Undo during fast recovery after partial ACK. */
1717
1718static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp,
1719 int acked)
1720{
1721 /* Partial ACK arrived. Force Hoe's retransmit. */
1722 int failed = IsReno(tp) || tp->fackets_out>tp->reordering;
1723
1724 if (tcp_may_undo(tp)) {
1725		/* Plain luck! The hole is filled with a delayed
1726		 * packet, rather than with a retransmit.
1727 */
1728 if (tp->retrans_out == 0)
1729 tp->retrans_stamp = 0;
1730
1731 tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1);
1732
1733 DBGUNDO(sk, tp, "Hoe");
1734 tcp_undo_cwr(tp, 0);
1735 NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO);
1736
1737 /* So... Do not make Hoe's retransmit yet.
1738		 * If the first packet was delayed, the rest
1739		 * are most probably delayed as well.
1740 */
1741 failed = 0;
1742 }
1743 return failed;
1744}
1745
1746/* Undo during loss recovery after partial ACK. */
1747static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
1748{
1749 if (tcp_may_undo(tp)) {
1750 struct sk_buff *skb;
1751 sk_stream_for_retrans_queue(skb, sk) {
1752 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
1753 }
1754 DBGUNDO(sk, tp, "partial loss");
1755 tp->lost_out = 0;
1756 tp->left_out = tp->sacked_out;
1757 tcp_undo_cwr(tp, 1);
1758 NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
1759 tp->retransmits = 0;
1760 tp->undo_marker = 0;
1761 if (!IsReno(tp))
1762 tcp_set_ca_state(tp, TCP_CA_Open);
1763 return 1;
1764 }
1765 return 0;
1766}
1767
1768static inline void tcp_complete_cwr(struct tcp_sock *tp)
1769{
1770 if (tcp_westwood_cwnd(tp))
1771 tp->snd_ssthresh = tp->snd_cwnd;
1772 else
1773 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
1774 tp->snd_cwnd_stamp = tcp_time_stamp;
1775}
1776
1777static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
1778{
1779 tp->left_out = tp->sacked_out;
1780
1781 if (tp->retrans_out == 0)
1782 tp->retrans_stamp = 0;
1783
1784 if (flag&FLAG_ECE)
1785 tcp_enter_cwr(tp);
1786
1787 if (tp->ca_state != TCP_CA_CWR) {
1788 int state = TCP_CA_Open;
1789
1790 if (tp->left_out || tp->retrans_out || tp->undo_marker)
1791 state = TCP_CA_Disorder;
1792
1793 if (tp->ca_state != state) {
1794 tcp_set_ca_state(tp, state);
1795 tp->high_seq = tp->snd_nxt;
1796 }
1797 tcp_moderate_cwnd(tp);
1798 } else {
1799 tcp_cwnd_down(tp);
1800 }
1801}
1802
1803/* Process an event, which can update packets-in-flight not trivially.
1804 * Main goal of this function is to calculate new estimate for left_out,
1805 * taking into account both packets sitting in receiver's buffer and
1806 * packets lost by network.
1807 *
1808 * Besides that it does CWND reduction, when packet loss is detected
1809 * and changes state of machine.
1810 *
1811 * It does _not_ decide what to send, it is made in function
1812 * tcp_xmit_retransmit_queue().
1813 */
1814static void
1815tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1816 int prior_packets, int flag)
1817{
1818 struct tcp_sock *tp = tcp_sk(sk);
1819 int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP));
1820
1821 /* Some technical things:
1822 * 1. Reno does not count dupacks (sacked_out) automatically. */
1823 if (!tp->packets_out)
1824 tp->sacked_out = 0;
1825 /* 2. SACK counts snd_fack in packets inaccurately. */
1826 if (tp->sacked_out == 0)
1827 tp->fackets_out = 0;
1828
1829 /* Now state machine starts.
1830 * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
1831 if (flag&FLAG_ECE)
1832 tp->prior_ssthresh = 0;
1833
1834 /* B. In all the states check for reneging SACKs. */
1835 if (tp->sacked_out && tcp_check_sack_reneging(sk, tp))
1836 return;
1837
1838 /* C. Process data loss notification, provided it is valid. */
1839 if ((flag&FLAG_DATA_LOST) &&
1840 before(tp->snd_una, tp->high_seq) &&
1841 tp->ca_state != TCP_CA_Open &&
1842 tp->fackets_out > tp->reordering) {
1843 tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq);
1844 NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
1845 }
1846
1847 /* D. Synchronize left_out to current state. */
1848 tcp_sync_left_out(tp);
1849
1850 /* E. Check state exit conditions. State can be terminated
1851 * when high_seq is ACKed. */
1852 if (tp->ca_state == TCP_CA_Open) {
1853 if (!sysctl_tcp_frto)
1854 BUG_TRAP(tp->retrans_out == 0);
1855 tp->retrans_stamp = 0;
1856 } else if (!before(tp->snd_una, tp->high_seq)) {
1857 switch (tp->ca_state) {
1858 case TCP_CA_Loss:
1859 tp->retransmits = 0;
1860 if (tcp_try_undo_recovery(sk, tp))
1861 return;
1862 break;
1863
1864 case TCP_CA_CWR:
1865			/* CWR is to be held until something *above* high_seq
1866			 * is ACKed for the CWR bit to reach the receiver. */
1867 if (tp->snd_una != tp->high_seq) {
1868 tcp_complete_cwr(tp);
1869 tcp_set_ca_state(tp, TCP_CA_Open);
1870 }
1871 break;
1872
1873 case TCP_CA_Disorder:
1874 tcp_try_undo_dsack(sk, tp);
1875 if (!tp->undo_marker ||
1876			    /* In the SACK case do not enter Open, so that undo
1877			     * keeps catching all duplicate ACKs. */
1878 IsReno(tp) || tp->snd_una != tp->high_seq) {
1879 tp->undo_marker = 0;
1880 tcp_set_ca_state(tp, TCP_CA_Open);
1881 }
1882 break;
1883
1884 case TCP_CA_Recovery:
1885 if (IsReno(tp))
1886 tcp_reset_reno_sack(tp);
1887 if (tcp_try_undo_recovery(sk, tp))
1888 return;
1889 tcp_complete_cwr(tp);
1890 break;
1891 }
1892 }
1893
1894 /* F. Process state. */
1895 switch (tp->ca_state) {
1896 case TCP_CA_Recovery:
1897 if (prior_snd_una == tp->snd_una) {
1898 if (IsReno(tp) && is_dupack)
1899 tcp_add_reno_sack(tp);
1900 } else {
1901 int acked = prior_packets - tp->packets_out;
1902 if (IsReno(tp))
1903 tcp_remove_reno_sacks(sk, tp, acked);
1904 is_dupack = tcp_try_undo_partial(sk, tp, acked);
1905 }
1906 break;
1907 case TCP_CA_Loss:
1908 if (flag&FLAG_DATA_ACKED)
1909 tp->retransmits = 0;
1910 if (!tcp_try_undo_loss(sk, tp)) {
1911 tcp_moderate_cwnd(tp);
1912 tcp_xmit_retransmit_queue(sk);
1913 return;
1914 }
1915 if (tp->ca_state != TCP_CA_Open)
1916 return;
1917 /* Loss is undone; fall through to processing in Open state. */
1918 default:
1919 if (IsReno(tp)) {
1920 if (tp->snd_una != prior_snd_una)
1921 tcp_reset_reno_sack(tp);
1922 if (is_dupack)
1923 tcp_add_reno_sack(tp);
1924 }
1925
1926 if (tp->ca_state == TCP_CA_Disorder)
1927 tcp_try_undo_dsack(sk, tp);
1928
1929 if (!tcp_time_to_recover(sk, tp)) {
1930 tcp_try_to_open(sk, tp, flag);
1931 return;
1932 }
1933
1934 /* Otherwise enter Recovery state */
1935
1936 if (IsReno(tp))
1937 NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY);
1938 else
1939 NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY);
1940
1941 tp->high_seq = tp->snd_nxt;
1942 tp->prior_ssthresh = 0;
1943 tp->undo_marker = tp->snd_una;
1944 tp->undo_retrans = tp->retrans_out;
1945
1946 if (tp->ca_state < TCP_CA_CWR) {
1947 if (!(flag&FLAG_ECE))
1948 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1949 tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
1950 TCP_ECN_queue_cwr(tp);
1951 }
1952
1953 tp->snd_cwnd_cnt = 0;
1954 tcp_set_ca_state(tp, TCP_CA_Recovery);
1955 }
1956
1957 if (is_dupack || tcp_head_timedout(sk, tp))
1958 tcp_update_scoreboard(sk, tp);
1959 tcp_cwnd_down(tp);
1960 tcp_xmit_retransmit_queue(sk);
1961}
1962
1963/* Read draft-ietf-tcplw-high-performance before mucking
1964 * with this code. (Supersedes RFC1323)
1965 */
1966static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
1967{
1968 __u32 seq_rtt;
1969
1970 /* RTTM Rule: A TSecr value received in a segment is used to
1971 * update the averaged RTT measurement only if the segment
1972 * acknowledges some new data, i.e., only if it advances the
1973 * left edge of the send window.
1974 *
1975 * See draft-ietf-tcplw-high-performance-00, section 3.3.
1976 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
1977 *
1978 * Changed: reset backoff as soon as we see the first valid sample.
1979	 * If we do not, we get a strongly overestimated rto. With timestamps
1980	 * samples are accepted even from very old segments: f.e., when rtt=1
1981	 * increases to 8, we retransmit 5 times and after the 8-second delayed
1982	 * answer arrives, rto becomes 120 seconds! If at least one of the segments
1983 * in window is lost... Voila. --ANK (010210)
1984 */
1985 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
1986 tcp_rtt_estimator(tp, seq_rtt);
1987 tcp_set_rto(tp);
1988 tp->backoff = 0;
1989 tcp_bound_rto(tp);
1990}
1991
1992static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
1993{
1994 /* We don't have a timestamp. Can only use
1995 * packets that are not retransmitted to determine
1996 * rtt estimates. Also, we must not reset the
1997 * backoff for rto until we get a non-retransmitted
1998 * packet. This allows us to deal with a situation
1999 * where the network delay has increased suddenly.
2000 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
2001 */
2002
2003 if (flag & FLAG_RETRANS_DATA_ACKED)
2004 return;
2005
2006 tcp_rtt_estimator(tp, seq_rtt);
2007 tcp_set_rto(tp);
2008 tp->backoff = 0;
2009 tcp_bound_rto(tp);
2010}
2011
2012static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
2013 int flag, s32 seq_rtt)
2014{
2015 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
2016 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
2017 tcp_ack_saw_tstamp(tp, flag);
2018 else if (seq_rtt >= 0)
2019 tcp_ack_no_tstamp(tp, seq_rtt, flag);
2020}
2021
2022/*
2023 * Compute congestion window to use.
2024 *
2025 * This is from the implementation of BICTCP in
2026 * Lisong Xu, Khaled Harfoush, and Injong Rhee,
2027 * "Binary Increase Congestion Control for Fast, Long Distance
2028 *  Networks" in IEEE INFOCOM 2004.
2029 * Available from:
2030 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
2031 *
2032 * Unless BIC is enabled and the congestion window is large,
2033 * this behaves the same as the original Reno.
2034 */
2035static inline __u32 bictcp_cwnd(struct tcp_sock *tp)
2036{
2037	/* original Reno behaviour */
2038 if (!tcp_is_bic(tp))
2039 return tp->snd_cwnd;
2040
2041 if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
2042 (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
2043 return tp->bictcp.cnt;
2044
2045 tp->bictcp.last_cwnd = tp->snd_cwnd;
2046 tp->bictcp.last_stamp = tcp_time_stamp;
2047
2048 /* start off normal */
2049 if (tp->snd_cwnd <= sysctl_tcp_bic_low_window)
2050 tp->bictcp.cnt = tp->snd_cwnd;
2051
2052 /* binary increase */
2053 else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) {
2054 __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd)
2055 / BICTCP_B;
2056
2057 if (dist > BICTCP_MAX_INCREMENT)
2058 /* linear increase */
2059 tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
2060 else if (dist <= 1U)
2061 /* binary search increase */
2062 tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
2063 / BICTCP_B;
2064 else
2065 /* binary search increase */
2066 tp->bictcp.cnt = tp->snd_cwnd / dist;
2067 } else {
2068		/* slow start and linear increase */
2069 if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B)
2070 /* slow start */
2071 tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
2072 / BICTCP_B;
2073 else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd
2074 + BICTCP_MAX_INCREMENT*(BICTCP_B-1))
2075 /* slow start */
2076 tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1)
2077 / (tp->snd_cwnd-tp->bictcp.last_max_cwnd);
2078 else
2079 /* linear increase */
2080 tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
2081 }
2082 return tp->bictcp.cnt;
2083}
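/* Worked example of the binary increase above (the constants and numbers are
 * assumptions for illustration): the value returned here is used by
 * reno_cong_avoid() as the number of ACKs needed per one-segment cwnd
 * increase. If last_max_cwnd = 100, snd_cwnd = 80 and BICTCP_B were 4,
 * then dist = (100 - 80) / 4 = 5 and cnt = 80 / 5 = 16, i.e. one extra
 * segment per 16 ACKs; as snd_cwnd closes in on the old maximum, dist
 * shrinks and growth slows, which is the binary search converging.
 */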
2084
2085/* This is Jacobson's slow start and congestion avoidance.
2086 * SIGCOMM '88, p. 328.
2087 */
2088static inline void reno_cong_avoid(struct tcp_sock *tp)
2089{
2090 if (tp->snd_cwnd <= tp->snd_ssthresh) {
2091 /* In "safe" area, increase. */
2092 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2093 tp->snd_cwnd++;
2094 } else {
2095 /* In dangerous area, increase slowly.
2096 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
2097 */
2098 if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
2099 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2100 tp->snd_cwnd++;
2101 tp->snd_cwnd_cnt=0;
2102 } else
2103 tp->snd_cwnd_cnt++;
2104 }
2105 tp->snd_cwnd_stamp = tcp_time_stamp;
2106}
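/* For example (non-BIC case, where bictcp_cwnd() just returns snd_cwnd):
 * in slow start every ACK adds a segment, so cwnd roughly doubles per RTT;
 * in congestion avoidance with snd_cwnd = 10, ten ACKs must accumulate in
 * snd_cwnd_cnt before cwnd becomes 11, i.e. about one extra segment per RTT,
 * the classic additive increase.
 */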
2107
2108/* This is based on the congestion detection/avoidance scheme described in
2109 * Lawrence S. Brakmo and Larry L. Peterson.
2110 * "TCP Vegas: End to end congestion avoidance on a global internet."
2111 * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
2112 * October 1995. Available from:
2113 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
2114 *
2115 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
2116 * The main aspects that distinguish this implementation from the
2117 * Arizona Vegas implementation are:
2118 * o We do not change the loss detection or recovery mechanisms of
2119 * Linux in any way. Linux already recovers from losses quite well,
2120 * using fine-grained timers, NewReno, and FACK.
2121 * o To avoid the performance penalty imposed by increasing cwnd
2122 * only every-other RTT during slow start, we increase during
2123 * every RTT during slow start, just like Reno.
2124 * o Largely to allow continuous cwnd growth during slow start,
2125 * we use the rate at which ACKs come back as the "actual"
2126 * rate, rather than the rate at which data is sent.
2127 * o To speed convergence to the right rate, we set the cwnd
2128 * to achieve the right ("actual") rate when we exit slow start.
2129 * o To filter out the noise caused by delayed ACKs, we use the
2130 * minimum RTT sample observed during the last RTT to calculate
2131 * the actual rate.
2132 * o When the sender re-starts from idle, it waits until it has
2133 * received ACKs for an entire flight of new data before making
2134 * a cwnd adjustment decision. The original Vegas implementation
2135 * assumed senders never went idle.
2136 */
2137static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2138{
2139 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
2140 *
2141 * These are so named because they represent the approximate values
2142 * of snd_una and snd_nxt at the beginning of the current RTT. More
2143 * precisely, they represent the amount of data sent during the RTT.
2144 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
2145 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
2146 * bytes of data have been ACKed during the course of the RTT, giving
2147 * an "actual" rate of:
2148 *
2149 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
2150 *
2151 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
2152 * because delayed ACKs can cover more than one segment, so they
2153 * don't line up nicely with the boundaries of RTTs.
2154 *
2155 * Another unfortunate fact of life is that delayed ACKs delay the
2156 * advance of the left edge of our send window, so that the number
2157 * of bytes we send in an RTT is often less than our cwnd will allow.
2158 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
2159 */
2160
2161 if (after(ack, tp->vegas.beg_snd_nxt)) {
2162 /* Do the Vegas once-per-RTT cwnd adjustment. */
2163 u32 old_wnd, old_snd_cwnd;
2164
2165
2166 /* Here old_wnd is essentially the window of data that was
2167 * sent during the previous RTT, and has all
2168 * been acknowledged in the course of the RTT that ended
2169 * with the ACK we just received. Likewise, old_snd_cwnd
2170 * is the cwnd during the previous RTT.
2171 */
2172 old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
2173 tp->mss_cache_std;
2174 old_snd_cwnd = tp->vegas.beg_snd_cwnd;
2175
2176 /* Save the extent of the current window so we can use this
2177 * at the end of the next RTT.
2178 */
2179 tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt;
2180 tp->vegas.beg_snd_nxt = tp->snd_nxt;
2181 tp->vegas.beg_snd_cwnd = tp->snd_cwnd;
2182
2183 /* Take into account the current RTT sample too, to
2184 * decrease the impact of delayed acks. This double counts
2185 * this sample since we count it for the next window as well,
2186 * but that's not too awful, since we're taking the min,
2187 * rather than averaging.
2188 */
2189 vegas_rtt_calc(tp, seq_rtt);
2190
2191 /* We do the Vegas calculations only if we got enough RTT
2192 * samples that we can be reasonably sure that we got
2193 * at least one RTT sample that wasn't from a delayed ACK.
2194 * If we only had 2 samples total,
2195 * then that means we're getting only 1 ACK per RTT, which
2196 * means they're almost certainly delayed ACKs.
2197 * If we have 3 samples, we should be OK.
2198 */
2199
2200 if (tp->vegas.cntRTT <= 2) {
2201 /* We don't have enough RTT samples to do the Vegas
2202 * calculation, so we'll behave like Reno.
2203 */
2204 if (tp->snd_cwnd > tp->snd_ssthresh)
2205 tp->snd_cwnd++;
2206 } else {
2207 u32 rtt, target_cwnd, diff;
2208
2209 /* We have enough RTT samples, so, using the Vegas
2210 * algorithm, we determine if we should increase or
2211 * decrease cwnd, and by how much.
2212 */
2213
2214 /* Pluck out the RTT we are using for the Vegas
2215 * calculations. This is the min RTT seen during the
2216 * last RTT. Taking the min filters out the effects
2217 * of delayed ACKs, at the cost of noticing congestion
2218 * a bit later.
2219 */
2220 rtt = tp->vegas.minRTT;
2221
2222 /* Calculate the cwnd we should have, if we weren't
2223 * going too fast.
2224 *
2225 * This is:
2226 * (actual rate in segments) * baseRTT
2227 * We keep it as a fixed point number with
2228 * V_PARAM_SHIFT bits to the right of the binary point.
2229 */
2230 target_cwnd = ((old_wnd * tp->vegas.baseRTT)
2231 << V_PARAM_SHIFT) / rtt;
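 /* Worked example with hypothetical numbers: old_wnd = 10 segments,
  * baseRTT = 100 ms and rtt = 125 ms give a target of 10 * 100 / 125
  * = 8 segments, stored here as 8 << V_PARAM_SHIFT fixed-point units.
  */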
2232
2233 /* Calculate the difference between the window we had,
2234 * and the window we would like to have. This quantity
2235 * is the "Diff" from the Arizona Vegas papers.
2236 *
2237 * Again, this is a fixed point number with
2238 * V_PARAM_SHIFT bits to the right of the binary
2239 * point.
2240 */
2241 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
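 /* Continuing the hypothetical example above: diff = (10 - 8) segments
  * in fixed point, i.e. 2 << V_PARAM_SHIFT, which is then compared
  * against the alpha/beta/gamma thresholds below.
  */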
2242
2243 if (tp->snd_cwnd < tp->snd_ssthresh) {
2244 /* Slow start. */
2245 if (diff > sysctl_tcp_vegas_gamma) {
2246 /* Going too fast. Time to slow down
2247 * and switch to congestion avoidance.
2248 */
2249 tp->snd_ssthresh = 2;
2250
2251 /* Set cwnd to match the actual rate
2252 * exactly:
2253 * cwnd = (actual rate) * baseRTT
2254 * Then we add 1 because the integer
2255 * truncation robs us of full link
2256 * utilization.
2257 */
2258 tp->snd_cwnd = min(tp->snd_cwnd,
2259 (target_cwnd >>
2260 V_PARAM_SHIFT)+1);
2261
2262 }
2263 } else {
2264 /* Congestion avoidance. */
2265 u32 next_snd_cwnd;
2266
2267 /* Figure out where we would like cwnd
2268 * to be.
2269 */
2270 if (diff > sysctl_tcp_vegas_beta) {
2271 /* The old window was too fast, so
2272 * we slow down.
2273 */
2274 next_snd_cwnd = old_snd_cwnd - 1;
2275 } else if (diff < sysctl_tcp_vegas_alpha) {
2276 /* We don't have enough extra packets
2277 * in the network, so speed up.
2278 */
2279 next_snd_cwnd = old_snd_cwnd + 1;
2280 } else {
2281 /* Sending just as fast as we
2282 * should be.
2283 */
2284 next_snd_cwnd = old_snd_cwnd;
2285 }
2286
2287 /* Adjust cwnd upward or downward, toward the
2288 * desired value.
2289 */
2290 if (next_snd_cwnd > tp->snd_cwnd)
2291 tp->snd_cwnd++;
2292 else if (next_snd_cwnd < tp->snd_cwnd)
2293 tp->snd_cwnd--;
2294 }
2295 }
2296
2297 /* Wipe the slate clean for the next RTT. */
2298 tp->vegas.cntRTT = 0;
2299 tp->vegas.minRTT = 0x7fffffff;
2300 }
2301
2302 /* The following code is executed for every ack we receive,
2303 * except for conditions checked in should_advance_cwnd()
2304 * before the call to tcp_cong_avoid(). Mainly this means that
2305 * we only execute this code if the ack actually acked some
2306 * data.
2307 */
2308
2309 /* If we are in slow start, increase our cwnd in response to this ACK.
2310 * (If we are not in slow start then we are in congestion avoidance,
2311 * and adjust our congestion window only once per RTT. See the code
2312 * above.)
2313 */
2314 if (tp->snd_cwnd <= tp->snd_ssthresh)
2315 tp->snd_cwnd++;
2316
2317 /* to keep cwnd from growing without bound */
2318 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
2319
2320 /* Make sure that we are never so timid as to reduce our cwnd below
2321 * 2 MSS.
2322 *
2323 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
2324 */
2325 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
2326
2327 tp->snd_cwnd_stamp = tcp_time_stamp;
2328}
2329
2330static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2331{
2332 if (tcp_vegas_enabled(tp))
2333 vegas_cong_avoid(tp, ack, seq_rtt);
2334 else
2335 reno_cong_avoid(tp);
2336}
2337
2338/* Restart the timer after forward progress on the connection.
2339 * RFC2988 recommends restarting the timer to now+rto.
2340 */
2341
2342static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
2343{
2344 if (!tp->packets_out) {
2345 tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS);
2346 } else {
2347 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
2348 }
2349}
2350
2351/* There is one downside to this scheme. Although we keep the
2352 * ACK clock ticking, adjusting packet counters and advancing
2353 * congestion window, we do not liberate socket send buffer
2354 * space.
2355 *
2356 * Mucking with skb->truesize and sk->sk_wmem_alloc et al.
2357 * then making a write space wakeup callback is a possible
2358 * future enhancement. WARNING: it is not trivial to make.
2359 */
2360static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2361 __u32 now, __s32 *seq_rtt)
2362{
2363 struct tcp_sock *tp = tcp_sk(sk);
2364 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
2365 __u32 seq = tp->snd_una;
2366 __u32 packets_acked;
2367 int acked = 0;
2368
2369 /* If we get here, the whole TSO packet has not been
2370 * acked.
2371 */
2372 BUG_ON(!after(scb->end_seq, seq));
2373
2374 packets_acked = tcp_skb_pcount(skb);
2375 if (tcp_trim_head(sk, skb, seq - scb->seq))
2376 return 0;
2377 packets_acked -= tcp_skb_pcount(skb);
2378
2379 if (packets_acked) {
2380 __u8 sacked = scb->sacked;
2381
2382 acked |= FLAG_DATA_ACKED;
2383 if (sacked) {
2384 if (sacked & TCPCB_RETRANS) {
2385 if (sacked & TCPCB_SACKED_RETRANS)
2386 tp->retrans_out -= packets_acked;
2387 acked |= FLAG_RETRANS_DATA_ACKED;
2388 *seq_rtt = -1;
2389 } else if (*seq_rtt < 0)
2390 *seq_rtt = now - scb->when;
2391 if (sacked & TCPCB_SACKED_ACKED)
2392 tp->sacked_out -= packets_acked;
2393 if (sacked & TCPCB_LOST)
2394 tp->lost_out -= packets_acked;
2395 if (sacked & TCPCB_URG) {
2396 if (tp->urg_mode &&
2397 !before(seq, tp->snd_up))
2398 tp->urg_mode = 0;
2399 }
2400 } else if (*seq_rtt < 0)
2401 *seq_rtt = now - scb->when;
2402
2403 if (tp->fackets_out) {
2404 __u32 dval = min(tp->fackets_out, packets_acked);
2405 tp->fackets_out -= dval;
2406 }
2407 tp->packets_out -= packets_acked;
2408
2409 BUG_ON(tcp_skb_pcount(skb) == 0);
2410 BUG_ON(!before(scb->seq, scb->end_seq));
2411 }
2412
2413 return acked;
2414}
2415
2416
2417/* Remove acknowledged frames from the retransmission queue. */
2418static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2419{
2420 struct tcp_sock *tp = tcp_sk(sk);
2421 struct sk_buff *skb;
2422 __u32 now = tcp_time_stamp;
2423 int acked = 0;
2424 __s32 seq_rtt = -1;
2425
2426 while ((skb = skb_peek(&sk->sk_write_queue)) &&
2427 skb != sk->sk_send_head) {
2428 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
2429 __u8 sacked = scb->sacked;
2430
2431 /* If our packet is before the ack sequence we can
2432 * discard it as it's confirmed to have arrived at
2433 * the other end.
2434 */
2435 if (after(scb->end_seq, tp->snd_una)) {
2436 if (tcp_skb_pcount(skb) > 1)
2437 acked |= tcp_tso_acked(sk, skb,
2438 now, &seq_rtt);
2439 break;
2440 }
2441
2442 /* Initial outgoing SYNs get put onto the write_queue
2443 * just like anything else we transmit. They are not
2444 * true data, and if we misinform our callers that
2445 * this ACK acks real data, we will erroneously exit
2446 * connection startup slow start one packet too
2447 * quickly. This is severely frowned upon behavior.
2448 */
2449 if (!(scb->flags & TCPCB_FLAG_SYN)) {
2450 acked |= FLAG_DATA_ACKED;
2451 } else {
2452 acked |= FLAG_SYN_ACKED;
2453 tp->retrans_stamp = 0;
2454 }
2455
2456 if (sacked) {
2457 if (sacked & TCPCB_RETRANS) {
2458 if(sacked & TCPCB_SACKED_RETRANS)
2459 tp->retrans_out -= tcp_skb_pcount(skb);
2460 acked |= FLAG_RETRANS_DATA_ACKED;
2461 seq_rtt = -1;
2462 } else if (seq_rtt < 0)
2463 seq_rtt = now - scb->when;
2464 if (sacked & TCPCB_SACKED_ACKED)
2465 tp->sacked_out -= tcp_skb_pcount(skb);
2466 if (sacked & TCPCB_LOST)
2467 tp->lost_out -= tcp_skb_pcount(skb);
2468 if (sacked & TCPCB_URG) {
2469 if (tp->urg_mode &&
2470 !before(scb->end_seq, tp->snd_up))
2471 tp->urg_mode = 0;
2472 }
2473 } else if (seq_rtt < 0)
2474 seq_rtt = now - scb->when;
2475 tcp_dec_pcount_approx(&tp->fackets_out, skb);
2476 tcp_packets_out_dec(tp, skb);
2477 __skb_unlink(skb, skb->list);
2478 sk_stream_free_skb(sk, skb);
2479 }
2480
2481 if (acked&FLAG_ACKED) {
2482 tcp_ack_update_rtt(tp, acked, seq_rtt);
2483 tcp_ack_packets_out(sk, tp);
2484 }
2485
2486#if FASTRETRANS_DEBUG > 0
2487 BUG_TRAP((int)tp->sacked_out >= 0);
2488 BUG_TRAP((int)tp->lost_out >= 0);
2489 BUG_TRAP((int)tp->retrans_out >= 0);
2490 if (!tp->packets_out && tp->rx_opt.sack_ok) {
2491 if (tp->lost_out) {
2492 printk(KERN_DEBUG "Leak l=%u %d\n",
2493 tp->lost_out, tp->ca_state);
2494 tp->lost_out = 0;
2495 }
2496 if (tp->sacked_out) {
2497 printk(KERN_DEBUG "Leak s=%u %d\n",
2498 tp->sacked_out, tp->ca_state);
2499 tp->sacked_out = 0;
2500 }
2501 if (tp->retrans_out) {
2502 printk(KERN_DEBUG "Leak r=%u %d\n",
2503 tp->retrans_out, tp->ca_state);
2504 tp->retrans_out = 0;
2505 }
2506 }
2507#endif
2508 *seq_rtt_p = seq_rtt;
2509 return acked;
2510}
2511
2512static void tcp_ack_probe(struct sock *sk)
2513{
2514 struct tcp_sock *tp = tcp_sk(sk);
2515
2516 /* Did this ACK open up a usable window? */
2517
2518 if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
2519 tp->snd_una + tp->snd_wnd)) {
2520 tp->backoff = 0;
2521 tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0);
2522 /* The socket must be woken up by a subsequent tcp_data_snd_check().
2523 * This function is not meant to be called at random!
2524 */
2525 } else {
2526 tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
2527 min(tp->rto << tp->backoff, TCP_RTO_MAX));
2528 }
2529}
2530
2531static inline int tcp_ack_is_dubious(struct tcp_sock *tp, int flag)
2532{
2533 return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
2534 tp->ca_state != TCP_CA_Open);
2535}
2536
2537static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag)
2538{
2539 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
2540 !((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR));
2541}
2542
2543/* Check that window update is acceptable.
2544 * The function assumes that snd_una<=ack<=snd_next.
2545 */
2546static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack,
2547 u32 ack_seq, u32 nwin)
2548{
2549 return (after(ack, tp->snd_una) ||
2550 after(ack_seq, tp->snd_wl1) ||
2551 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd));
2552}
2553
2554/* Update our send window.
2555 *
2556 * The window update algorithm described in RFC793/RFC1122 (and used in
2557 * linux-2.2 and in FreeBSD; NetBSD's is even worse) is wrong.
2558 */
2559static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
2560 struct sk_buff *skb, u32 ack, u32 ack_seq)
2561{
2562 int flag = 0;
2563 u32 nwin = ntohs(skb->h.th->window);
2564
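 /* The advertised window is a raw 16-bit value; except on SYN segments
  * it is scaled up by the window-scale factor negotiated at connection
  * setup (RFC 1323). E.g. (illustrative numbers) a raw window of 5840
  * with snd_wscale 7 covers 5840 << 7 = 747520 bytes.
  */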
2565 if (likely(!skb->h.th->syn))
2566 nwin <<= tp->rx_opt.snd_wscale;
2567
2568 if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
2569 flag |= FLAG_WIN_UPDATE;
2570 tcp_update_wl(tp, ack, ack_seq);
2571
2572 if (tp->snd_wnd != nwin) {
2573 tp->snd_wnd = nwin;
2574
2575 /* Note, it is the only place, where
2576 * fast path is recovered for sending TCP.
2577 */
2578 tcp_fast_path_check(sk, tp);
2579
2580 if (nwin > tp->max_window) {
2581 tp->max_window = nwin;
2582 tcp_sync_mss(sk, tp->pmtu_cookie);
2583 }
2584 }
2585 }
2586
2587 tp->snd_una = ack;
2588
2589 return flag;
2590}
2591
2592static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
2593{
2594 struct tcp_sock *tp = tcp_sk(sk);
2595
2596 tcp_sync_left_out(tp);
2597
2598 if (tp->snd_una == prior_snd_una ||
2599 !before(tp->snd_una, tp->frto_highmark)) {
2600 /* RTO was caused by loss, start retransmitting in
2601 * go-back-N slow start
2602 */
2603 tcp_enter_frto_loss(sk);
2604 return;
2605 }
2606
2607 if (tp->frto_counter == 1) {
2608 /* First ACK after RTO advances the window: allow two new
2609 * segments out.
2610 */
2611 tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
2612 } else {
2613 /* Also the second ACK after RTO advances the window.
2614 * The RTO was likely spurious. Reduce cwnd and continue
2615 * in congestion avoidance
2616 */
2617 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
2618 tcp_moderate_cwnd(tp);
2619 }
2620
2621 /* F-RTO affects the two new ACKs following the RTO.
2622 * By the third ACK at the latest, TCP behavior is back to normal.
2623 */
2624 tp->frto_counter = (tp->frto_counter + 1) % 3;
2625}
2626
2627/*
2628 * TCP Westwood+
2629 */
2630
2631/*
2632 * @init_westwood
2633 * This function initializes fields used in TCP Westwood+. We have
2634 * no information about RTTmin at this time, so we simply set it to
2635 * TCP_WESTWOOD_INIT_RTT. This value was deliberately chosen to be
2636 * overly conservative, since that way we are sure it will be updated
2637 * in a consistent way as soon as possible. This should happen within
2638 * the first RTT period of the connection lifetime.
2639 */
2640
2641static void init_westwood(struct sock *sk)
2642{
2643 struct tcp_sock *tp = tcp_sk(sk);
2644
2645 tp->westwood.bw_ns_est = 0;
2646 tp->westwood.bw_est = 0;
2647 tp->westwood.accounted = 0;
2648 tp->westwood.cumul_ack = 0;
2649 tp->westwood.rtt_win_sx = tcp_time_stamp;
2650 tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT;
2651 tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT;
2652 tp->westwood.snd_una = tp->snd_una;
2653}
2654
2655/*
2656 * @westwood_do_filter
2657 * Low-pass filter. Implemented using constant coefficients.
2658 */
2659
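/* The filter below computes (7*a + b) / 8, i.e. an exponentially
 * weighted moving average that moves the old estimate 'a' one eighth
 * of the way toward the new sample 'b' on every update.
 */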
2660static inline __u32 westwood_do_filter(__u32 a, __u32 b)
2661{
2662 return (((7 * a) + b) >> 3);
2663}
2664
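/* bk is the number of bytes ACKed since the current window started and
 * delta is the elapsed time, so bk / delta is a raw bandwidth sample;
 * it is run through the low-pass filter twice (bw_ns_est, then bw_est)
 * to smooth it.
 */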
2665static void westwood_filter(struct sock *sk, __u32 delta)
2666{
2667 struct tcp_sock *tp = tcp_sk(sk);
2668
2669 tp->westwood.bw_ns_est =
2670 westwood_do_filter(tp->westwood.bw_ns_est,
2671 tp->westwood.bk / delta);
2672 tp->westwood.bw_est =
2673 westwood_do_filter(tp->westwood.bw_est,
2674 tp->westwood.bw_ns_est);
2675}
2676
2677/*
2678 * @westwood_update_rttmin
2679 * It is used to update RTTmin. In this case we MUST NOT use
2680 * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN!
2681 */
2682
2683static inline __u32 westwood_update_rttmin(const struct sock *sk)
2684{
2685 const struct tcp_sock *tp = tcp_sk(sk);
2686 __u32 rttmin = tp->westwood.rtt_min;
2687
2688 if (tp->westwood.rtt != 0 &&
2689 (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin))
2690 rttmin = tp->westwood.rtt;
2691
2692 return rttmin;
2693}
2694
2695/*
2696 * @westwood_acked
2697 * Evaluate increases for dk.
2698 */
2699
2700static inline __u32 westwood_acked(const struct sock *sk)
2701{
2702 const struct tcp_sock *tp = tcp_sk(sk);
2703
2704 return tp->snd_una - tp->westwood.snd_una;
2705}
2706
2707/*
2708 * @westwood_new_window
2709 * It evaluates if we are receiving data inside the same RTT window as
2710 * when we started.
2711 * Return value:
2712 * It returns 0 if we are still evaluating samples in the same RTT
2713 * window, 1 if the sample has to be considered in the next window.
2714 */
2715
2716static int westwood_new_window(const struct sock *sk)
2717{
2718 const struct tcp_sock *tp = tcp_sk(sk);
2719 __u32 left_bound;
2720 __u32 rtt;
2721 int ret = 0;
2722
2723 left_bound = tp->westwood.rtt_win_sx;
2724 rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN);
2725
2726 /*
2727 * An RTT window has passed. Be careful: if the RTT is less than
2728 * 50ms we don't filter but we continue 'building the sample'.
2729 * This minimum limit was chosen because estimating over such
2730 * small time intervals is best avoided.
2731 * Obviously, on a LAN we will reasonably always have
2732 * right_bound = left_bound + WESTWOOD_RTT_MIN
2733 */
2734
2735 if ((left_bound + rtt) < tcp_time_stamp)
2736 ret = 1;
2737
2738 return ret;
2739}
2740
2741/*
2742 * @westwood_update_window
2743 * It updates the RTT evaluation window if it is the right moment to
2744 * do so, and in that case calls the filter to evaluate the bandwidth.
2745 */
2746
2747static void __westwood_update_window(struct sock *sk, __u32 now)
2748{
2749 struct tcp_sock *tp = tcp_sk(sk);
2750 __u32 delta = now - tp->westwood.rtt_win_sx;
2751
2752 if (delta) {
2753 if (tp->westwood.rtt)
2754 westwood_filter(sk, delta);
2755
2756 tp->westwood.bk = 0;
2757 tp->westwood.rtt_win_sx = tcp_time_stamp;
2758 }
2759}
2760
2761
2762static void westwood_update_window(struct sock *sk, __u32 now)
2763{
2764 if (westwood_new_window(sk))
2765 __westwood_update_window(sk, now);
2766}
2767
2768/*
2769 * @__tcp_westwood_fast_bw
2770 * It is called when we are in the fast path. In particular, it is called
2771 * when header prediction is successful. In that case the update is
2772 * straightforward and doesn't need any particular care.
2773 */
2774
2775static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2776{
2777 struct tcp_sock *tp = tcp_sk(sk);
2778
2779 westwood_update_window(sk, tcp_time_stamp);
2780
2781 tp->westwood.bk += westwood_acked(sk);
2782 tp->westwood.snd_una = tp->snd_una;
2783 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2784}
2785
2786static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2787{
2788 if (tcp_is_westwood(tcp_sk(sk)))
2789 __tcp_westwood_fast_bw(sk, skb);
2790}
2791
2792
2793/*
2794 * @westwood_dupack_update
2795 * It updates accounted and cumul_ack when receiving a dupack.
2796 */
2797
2798static void westwood_dupack_update(struct sock *sk)
2799{
2800 struct tcp_sock *tp = tcp_sk(sk);
2801
2802 tp->westwood.accounted += tp->mss_cache_std;
2803 tp->westwood.cumul_ack = tp->mss_cache_std;
2804}
2805
2806static inline int westwood_may_change_cumul(struct tcp_sock *tp)
2807{
2808 return (tp->westwood.cumul_ack > tp->mss_cache_std);
2809}
2810
2811static inline void westwood_partial_update(struct tcp_sock *tp)
2812{
2813 tp->westwood.accounted -= tp->westwood.cumul_ack;
2814 tp->westwood.cumul_ack = tp->mss_cache_std;
2815}
2816
2817static inline void westwood_complete_update(struct tcp_sock *tp)
2818{
2819 tp->westwood.cumul_ack -= tp->westwood.accounted;
2820 tp->westwood.accounted = 0;
2821}
2822
2823/*
2824 * @westwood_acked_count
2825 * This function evaluates cumul_ack for evaluating dk in case of
2826 * delayed or partial acks.
2827 */
2828
2829static inline __u32 westwood_acked_count(struct sock *sk)
2830{
2831 struct tcp_sock *tp = tcp_sk(sk);
2832
2833 tp->westwood.cumul_ack = westwood_acked(sk);
2834
2835 /* If cumul_ack is 0 this is a dupack since it's not moving
2836 * tp->snd_una.
2837 */
2838 if (!(tp->westwood.cumul_ack))
2839 westwood_dupack_update(sk);
2840
2841 if (westwood_may_change_cumul(tp)) {
2842 /* Partial or delayed ack */
2843 if (tp->westwood.accounted >= tp->westwood.cumul_ack)
2844 westwood_partial_update(tp);
2845 else
2846 westwood_complete_update(tp);
2847 }
2848
2849 tp->westwood.snd_una = tp->snd_una;
2850
2851 return tp->westwood.cumul_ack;
2852}
2853
2854
2855/*
2856 * @__tcp_westwood_slow_bw
2857 * It is called when something may be going wrong, even if there is
2858 * no actual problem! In fact, a simple delayed packet may trigger a
2859 * dupack. But we need to be careful in such cases.
2860 */
2861
2862static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2863{
2864 struct tcp_sock *tp = tcp_sk(sk);
2865
2866 westwood_update_window(sk, tcp_time_stamp);
2867
2868 tp->westwood.bk += westwood_acked_count(sk);
2869 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2870}
2871
2872static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2873{
2874 if (tcp_is_westwood(tcp_sk(sk)))
2875 __tcp_westwood_slow_bw(sk, skb);
2876}
2877
2878/* This routine deals with incoming acks, but not outgoing ones. */
2879static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2880{
2881 struct tcp_sock *tp = tcp_sk(sk);
2882 u32 prior_snd_una = tp->snd_una;
2883 u32 ack_seq = TCP_SKB_CB(skb)->seq;
2884 u32 ack = TCP_SKB_CB(skb)->ack_seq;
2885 u32 prior_in_flight;
2886 s32 seq_rtt;
2887 int prior_packets;
2888
2889 /* If the ack is newer than sent or older than previous acks
2890 * then we can probably ignore it.
2891 */
2892 if (after(ack, tp->snd_nxt))
2893 goto uninteresting_ack;
2894
2895 if (before(ack, prior_snd_una))
2896 goto old_ack;
2897
2898 if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
2899 /* Window is constant, pure forward advance.
2900 * No more checks are required.
2901 * Note, we use the fact that SND.UNA>=SND.WL2.
2902 */
2903 tcp_update_wl(tp, ack, ack_seq);
2904 tp->snd_una = ack;
2905 tcp_westwood_fast_bw(sk, skb);
2906 flag |= FLAG_WIN_UPDATE;
2907
2908 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
2909 } else {
2910 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
2911 flag |= FLAG_DATA;
2912 else
2913 NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS);
2914
2915 flag |= tcp_ack_update_window(sk, tp, skb, ack, ack_seq);
2916
2917 if (TCP_SKB_CB(skb)->sacked)
2918 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
2919
2920 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
2921 flag |= FLAG_ECE;
2922
2923 tcp_westwood_slow_bw(sk, skb);
2924 }
2925
2926 /* We passed data and got it acked, remove any soft error
2927 * log. Something worked...
2928 */
2929 sk->sk_err_soft = 0;
2930 tp->rcv_tstamp = tcp_time_stamp;
2931 prior_packets = tp->packets_out;
2932 if (!prior_packets)
2933 goto no_queue;
2934
2935 prior_in_flight = tcp_packets_in_flight(tp);
2936
2937 /* See if we can take anything off of the retransmit queue. */
2938 flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
2939
2940 if (tp->frto_counter)
2941 tcp_process_frto(sk, prior_snd_una);
2942
2943 if (tcp_ack_is_dubious(tp, flag)) {
2944 /* Advance CWND, if state allows this. */
2945 if ((flag & FLAG_DATA_ACKED) &&
2946 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) &&
2947 tcp_may_raise_cwnd(tp, flag))
2948 tcp_cong_avoid(tp, ack, seq_rtt);
2949 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
2950 } else {
2951 if ((flag & FLAG_DATA_ACKED) &&
2952 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd))
2953 tcp_cong_avoid(tp, ack, seq_rtt);
2954 }
2955
2956 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
2957 dst_confirm(sk->sk_dst_cache);
2958
2959 return 1;
2960
2961no_queue:
2962 tp->probes_out = 0;
2963
2964 /* If this ack opened up a previously zero window, clear the backoff. It was
2965 * being used to time the probes, and is probably far higher than
2966 * it needs to be for normal retransmission.
2967 */
2968 if (sk->sk_send_head)
2969 tcp_ack_probe(sk);
2970 return 1;
2971
2972old_ack:
2973 if (TCP_SKB_CB(skb)->sacked)
2974 tcp_sacktag_write_queue(sk, skb, prior_snd_una);
2975
2976uninteresting_ack:
2977 SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
2978 return 0;
2979}
2980
2981
2982/* Look for tcp options. Normally only called on SYN and SYNACK packets.
2983 * But, this can also be called on packets in the established flow when
2984 * the fast version below fails.
2985 */
2986void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab)
2987{
2988 unsigned char *ptr;
2989 struct tcphdr *th = skb->h.th;
2990 int length=(th->doff*4)-sizeof(struct tcphdr);
2991
2992 ptr = (unsigned char *)(th + 1);
2993 opt_rx->saw_tstamp = 0;
2994
2995 while(length>0) {
2996 int opcode=*ptr++;
2997 int opsize;
2998
2999 switch (opcode) {
3000 case TCPOPT_EOL:
3001 return;
3002 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
3003 length--;
3004 continue;
3005 default:
3006 opsize=*ptr++;
3007 if (opsize < 2) /* "silly options" */
3008 return;
3009 if (opsize > length)
3010 return; /* don't parse partial options */
3011 switch(opcode) {
3012 case TCPOPT_MSS:
3013 if(opsize==TCPOLEN_MSS && th->syn && !estab) {
3014 u16 in_mss = ntohs(get_unaligned((__u16 *)ptr));
3015 if (in_mss) {
3016 if (opt_rx->user_mss && opt_rx->user_mss < in_mss)
3017 in_mss = opt_rx->user_mss;
3018 opt_rx->mss_clamp = in_mss;
3019 }
3020 }
3021 break;
3022 case TCPOPT_WINDOW:
3023 if(opsize==TCPOLEN_WINDOW && th->syn && !estab)
3024 if (sysctl_tcp_window_scaling) {
3025 __u8 snd_wscale = *(__u8 *) ptr;
3026 opt_rx->wscale_ok = 1;
3027 if (snd_wscale > 14) {
3028 if(net_ratelimit())
3029 printk(KERN_INFO "tcp_parse_options: Illegal window "
3030 "scaling value %d >14 received.\n",
3031 snd_wscale);
3032 snd_wscale = 14;
3033 }
3034 opt_rx->snd_wscale = snd_wscale;
3035 }
3036 break;
3037 case TCPOPT_TIMESTAMP:
3038 if(opsize==TCPOLEN_TIMESTAMP) {
3039 if ((estab && opt_rx->tstamp_ok) ||
3040 (!estab && sysctl_tcp_timestamps)) {
3041 opt_rx->saw_tstamp = 1;
3042 opt_rx->rcv_tsval = ntohl(get_unaligned((__u32 *)ptr));
3043 opt_rx->rcv_tsecr = ntohl(get_unaligned((__u32 *)(ptr+4)));
3044 }
3045 }
3046 break;
3047 case TCPOPT_SACK_PERM:
3048 if(opsize==TCPOLEN_SACK_PERM && th->syn && !estab) {
3049 if (sysctl_tcp_sack) {
3050 opt_rx->sack_ok = 1;
3051 tcp_sack_reset(opt_rx);
3052 }
3053 }
3054 break;
3055
3056 case TCPOPT_SACK:
3057 if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
3058 !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
3059 opt_rx->sack_ok) {
3060 TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
3061 }
3062 };
3063 ptr+=opsize-2;
3064 length-=opsize;
3065 };
3066 }
3067}
3068
3069/* Fast parse options. This hopes to only see timestamps.
3070 * If it is wrong it falls back on tcp_parse_options().
3071 */
3072static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
3073 struct tcp_sock *tp)
3074{
3075 if (th->doff == sizeof(struct tcphdr)>>2) {
3076 tp->rx_opt.saw_tstamp = 0;
3077 return 0;
3078 } else if (tp->rx_opt.tstamp_ok &&
3079 th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
3080 __u32 *ptr = (__u32 *)(th + 1);
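  /* The single 32-bit compare below matches the well-aligned timestamp
   * layout suggested by RFC 1323 Appendix A:
   * NOP, NOP, TIMESTAMP kind, TIMESTAMP length.
   */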
3081 if (*ptr == ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
3082 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
3083 tp->rx_opt.saw_tstamp = 1;
3084 ++ptr;
3085 tp->rx_opt.rcv_tsval = ntohl(*ptr);
3086 ++ptr;
3087 tp->rx_opt.rcv_tsecr = ntohl(*ptr);
3088 return 1;
3089 }
3090 }
3091 tcp_parse_options(skb, &tp->rx_opt, 1);
3092 return 1;
3093}
3094
3095static inline void tcp_store_ts_recent(struct tcp_sock *tp)
3096{
3097 tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
3098 tp->rx_opt.ts_recent_stamp = xtime.tv_sec;
3099}
3100
3101static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
3102{
3103 if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
3104 /* PAWS bug workaround wrt. ACK frames; the PAWS discard
3105 * extra check below makes sure this can only happen
3106 * for pure ACK frames. -DaveM
3107 *
3108 * Not only that: it also occurs for expired timestamps.
3109 */
3110
3111 if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 ||
3112 xtime.tv_sec >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
3113 tcp_store_ts_recent(tp);
3114 }
3115}
3116
3117/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
3118 *
3119 * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
3120 * it can pass through the stack. So, the following predicate verifies that
3121 * this segment is not used for anything but congestion avoidance or
3122 * fast retransmit. Moreover, we are even able to eliminate most of such
3123 * second order effects, if we apply some small "replay" window (~RTO)
3124 * to timestamp space.
3125 *
3126 * All these measures still do not guarantee that we reject wrapped ACKs
3127 * on networks with high bandwidth, where sequence space is recycled quickly,
3128 * but they guarantee that such events will be very rare and will not
3129 * seriously affect the connection. This doesn't look nice, but alas, PAWS
3130 * is a really buggy extension.
3131 *
3132 * [ Later note. Even worse! It is buggy for segments _with_ data. The RFC
3133 * states that events where a retransmit arrives after the original data are
3134 * rare. That is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
3135 * the biggest problem on large power networks even with minor reordering.
3136 * OK, let's give it a small replay window. If the peer clock is even 1 Hz,
3137 * it is safe up to a bandwidth of 18 Gbit/sec. 8) ]
3138 */
3139
3140static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb)
3141{
3142 struct tcphdr *th = skb->h.th;
3143 u32 seq = TCP_SKB_CB(skb)->seq;
3144 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3145
3146 return (/* 1. Pure ACK with correct sequence number. */
3147 (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
3148
3149 /* 2. ... and duplicate ACK. */
3150 ack == tp->snd_una &&
3151
3152 /* 3. ... and does not update window. */
3153 !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
3154
3155 /* 4. ... and sits in replay window. */
3156 (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ);
3157}
3158
3159static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb)
3160{
3161 return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
3162 xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
3163 !tcp_disordered_ack(tp, skb));
3164}
3165
3166/* Check segment sequence number for validity.
3167 *
3168 * Segment controls are considered valid, if the segment
3169 * fits to the window after truncation to the window. Acceptability
3170 * of data (and SYN, FIN, of course) is checked separately.
3171 * See tcp_data_queue(), for example.
3172 *
3173 * Also, controls (RST is the main one) are accepted using RCV.WUP instead
3174 * of RCV.NXT. The peer may still not have advanced its SND.UNA when we
3175 * delayed the ACK, so that his SND.UNA <= our RCV.WUP.
3176 * (borrowed from freebsd)
3177 */
3178
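/* In other words: accept the segment if it does not end before RCV.WUP
 * and does not start beyond RCV.NXT plus the current receive window.
 */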
3179static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq)
3180{
3181 return !before(end_seq, tp->rcv_wup) &&
3182 !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
3183}
3184
3185/* When we get a reset we do this. */
3186static void tcp_reset(struct sock *sk)
3187{
3188 /* We want the right error as BSD sees it (and indeed as we do). */
3189 switch (sk->sk_state) {
3190 case TCP_SYN_SENT:
3191 sk->sk_err = ECONNREFUSED;
3192 break;
3193 case TCP_CLOSE_WAIT:
3194 sk->sk_err = EPIPE;
3195 break;
3196 case TCP_CLOSE:
3197 return;
3198 default:
3199 sk->sk_err = ECONNRESET;
3200 }
3201
3202 if (!sock_flag(sk, SOCK_DEAD))
3203 sk->sk_error_report(sk);
3204
3205 tcp_done(sk);
3206}
3207
3208/*
3209 * Process the FIN bit. This now behaves as it is supposed to:
3210 * the FIN takes effect when it is validly part of the sequence
3211 * space, not before, while we still have holes.
3212 *
3213 * If we are ESTABLISHED, a received FIN moves us to CLOSE-WAIT
3214 * (and thence onto LAST-ACK and finally CLOSE; we never enter
3215 * TIME-WAIT)
3216 *
3217 * If we are in FINWAIT-1, a received FIN indicates simultaneous
3218 * close and we go into CLOSING (and later onto TIME-WAIT)
3219 *
3220 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3221 */
3222static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
3223{
3224 struct tcp_sock *tp = tcp_sk(sk);
3225
3226 tcp_schedule_ack(tp);
3227
3228 sk->sk_shutdown |= RCV_SHUTDOWN;
3229 sock_set_flag(sk, SOCK_DONE);
3230
3231 switch (sk->sk_state) {
3232 case TCP_SYN_RECV:
3233 case TCP_ESTABLISHED:
3234 /* Move to CLOSE_WAIT */
3235 tcp_set_state(sk, TCP_CLOSE_WAIT);
3236 tp->ack.pingpong = 1;
3237 break;
3238
3239 case TCP_CLOSE_WAIT:
3240 case TCP_CLOSING:
3241 /* Received a retransmission of the FIN, do
3242 * nothing.
3243 */
3244 break;
3245 case TCP_LAST_ACK:
3246 /* RFC793: Remain in the LAST-ACK state. */
3247 break;
3248
3249 case TCP_FIN_WAIT1:
3250 /* This case occurs when a simultaneous close
3251 * happens, we must ack the received FIN and
3252 * enter the CLOSING state.
3253 */
3254 tcp_send_ack(sk);
3255 tcp_set_state(sk, TCP_CLOSING);
3256 break;
3257 case TCP_FIN_WAIT2:
3258 /* Received a FIN -- send ACK and enter TIME_WAIT. */
3259 tcp_send_ack(sk);
3260 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
3261 break;
3262 default:
3263 /* Only TCP_LISTEN and TCP_CLOSE are left, in these
3264 * cases we should never reach this piece of code.
3265 */
3266 printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n",
3267 __FUNCTION__, sk->sk_state);
3268 break;
3269 };
3270
3271 /* It _is_ possible that we have something out-of-order _after_ the FIN.
3272 * Probably we should reset in this case. For now, drop it.
3273 */
3274 __skb_queue_purge(&tp->out_of_order_queue);
3275 if (tp->rx_opt.sack_ok)
3276 tcp_sack_reset(&tp->rx_opt);
3277 sk_stream_mem_reclaim(sk);
3278
3279 if (!sock_flag(sk, SOCK_DEAD)) {
3280 sk->sk_state_change(sk);
3281
3282 /* Do not send POLL_HUP for half duplex close. */
3283 if (sk->sk_shutdown == SHUTDOWN_MASK ||
3284 sk->sk_state == TCP_CLOSE)
3285 sk_wake_async(sk, 1, POLL_HUP);
3286 else
3287 sk_wake_async(sk, 1, POLL_IN);
3288 }
3289}
3290
3291static __inline__ int
3292tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
3293{
3294 if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
3295 if (before(seq, sp->start_seq))
3296 sp->start_seq = seq;
3297 if (after(end_seq, sp->end_seq))
3298 sp->end_seq = end_seq;
3299 return 1;
3300 }
3301 return 0;
3302}
3303
3304static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
3305{
3306 if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
3307 if (before(seq, tp->rcv_nxt))
3308 NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOLDSENT);
3309 else
3310 NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFOSENT);
3311
3312 tp->rx_opt.dsack = 1;
3313 tp->duplicate_sack[0].start_seq = seq;
3314 tp->duplicate_sack[0].end_seq = end_seq;
3315 tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + 1, 4 - tp->rx_opt.tstamp_ok);
3316 }
3317}
3318
3319static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq)
3320{
3321 if (!tp->rx_opt.dsack)
3322 tcp_dsack_set(tp, seq, end_seq);
3323 else
3324 tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
3325}
3326
3327static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
3328{
3329 struct tcp_sock *tp = tcp_sk(sk);
3330
3331 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
3332 before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
3333 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
3334 tcp_enter_quickack_mode(tp);
3335
3336 if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
3337 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
3338
3339 if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
3340 end_seq = tp->rcv_nxt;
3341 tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, end_seq);
3342 }
3343 }
3344
3345 tcp_send_ack(sk);
3346}
3347
3348/* These routines update the SACK block as out-of-order packets arrive or
3349 * in-order packets close up the sequence space.
3350 */
3351static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
3352{
3353 int this_sack;
3354 struct tcp_sack_block *sp = &tp->selective_acks[0];
3355 struct tcp_sack_block *swalk = sp+1;
3356
3357 /* See if the recent change to the first SACK eats into
3358 * or hits the sequence space of other SACK blocks; if so, coalesce.
3359 */
3360 for (this_sack = 1; this_sack < tp->rx_opt.num_sacks; ) {
3361 if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
3362 int i;
3363
3364 /* Zap SWALK, by moving every further SACK up by one slot.
3365 * Decrease num_sacks.
3366 */
3367 tp->rx_opt.num_sacks--;
3368 tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
3369 for(i=this_sack; i < tp->rx_opt.num_sacks; i++)
3370 sp[i] = sp[i+1];
3371 continue;
3372 }
3373 this_sack++, swalk++;
3374 }
3375}
3376
3377static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
3378{
3379 __u32 tmp;
3380
3381 tmp = sack1->start_seq;
3382 sack1->start_seq = sack2->start_seq;
3383 sack2->start_seq = tmp;
3384
3385 tmp = sack1->end_seq;
3386 sack1->end_seq = sack2->end_seq;
3387 sack2->end_seq = tmp;
3388}
3389
3390static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
3391{
3392 struct tcp_sock *tp = tcp_sk(sk);
3393 struct tcp_sack_block *sp = &tp->selective_acks[0];
3394 int cur_sacks = tp->rx_opt.num_sacks;
3395 int this_sack;
3396
3397 if (!cur_sacks)
3398 goto new_sack;
3399
3400 for (this_sack=0; this_sack<cur_sacks; this_sack++, sp++) {
3401 if (tcp_sack_extend(sp, seq, end_seq)) {
3402 /* Rotate this_sack to the first one. */
3403 for (; this_sack>0; this_sack--, sp--)
3404 tcp_sack_swap(sp, sp-1);
3405 if (cur_sacks > 1)
3406 tcp_sack_maybe_coalesce(tp);
3407 return;
3408 }
3409 }
3410
3411 /* Could not find an adjacent existing SACK, build a new one,
3412 * put it at the front, and shift everyone else down. We
3413 * always know there is at least one SACK present already here.
3414 *
3415 * If the sack array is full, forget about the last one.
3416 */
3417 if (this_sack >= 4) {
3418 this_sack--;
3419 tp->rx_opt.num_sacks--;
3420 sp--;
3421 }
3422 for(; this_sack > 0; this_sack--, sp--)
3423 *sp = *(sp-1);
3424
3425new_sack:
3426 /* Build the new head SACK, and we're done. */
3427 sp->start_seq = seq;
3428 sp->end_seq = end_seq;
3429 tp->rx_opt.num_sacks++;
3430 tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
3431}
3432
3433/* RCV.NXT advances, some SACKs should be eaten. */
3434
3435static void tcp_sack_remove(struct tcp_sock *tp)
3436{
3437 struct tcp_sack_block *sp = &tp->selective_acks[0];
3438 int num_sacks = tp->rx_opt.num_sacks;
3439 int this_sack;
3440
3441 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
3442 if (skb_queue_len(&tp->out_of_order_queue) == 0) {
3443 tp->rx_opt.num_sacks = 0;
3444 tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
3445 return;
3446 }
3447
3448 for(this_sack = 0; this_sack < num_sacks; ) {
3449 /* Check if the start of the sack is covered by RCV.NXT. */
3450 if (!before(tp->rcv_nxt, sp->start_seq)) {
3451 int i;
3452
3453 /* RCV.NXT must cover the whole block! */
3454 BUG_TRAP(!before(tp->rcv_nxt, sp->end_seq));
3455
3456 /* Zap this SACK, by moving forward any other SACKS. */
3457 for (i=this_sack+1; i < num_sacks; i++)
3458 tp->selective_acks[i-1] = tp->selective_acks[i];
3459 num_sacks--;
3460 continue;
3461 }
3462 this_sack++;
3463 sp++;
3464 }
3465 if (num_sacks != tp->rx_opt.num_sacks) {
3466 tp->rx_opt.num_sacks = num_sacks;
3467 tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
3468 }
3469}
3470
3471/* This one checks to see if we can put data from the
3472 * out_of_order queue into the receive_queue.
3473 */
3474static void tcp_ofo_queue(struct sock *sk)
3475{
3476 struct tcp_sock *tp = tcp_sk(sk);
3477 __u32 dsack_high = tp->rcv_nxt;
3478 struct sk_buff *skb;
3479
3480 while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
3481 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
3482 break;
3483
3484 if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
3485 __u32 dsack = dsack_high;
3486 if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
3487 dsack_high = TCP_SKB_CB(skb)->end_seq;
3488 tcp_dsack_extend(tp, TCP_SKB_CB(skb)->seq, dsack);
3489 }
3490
3491 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
3492 SOCK_DEBUG(sk, "ofo packet was already received\n");
3493 __skb_unlink(skb, skb->list);
3494 __kfree_skb(skb);
3495 continue;
3496 }
3497 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
3498 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
3499 TCP_SKB_CB(skb)->end_seq);
3500
3501 __skb_unlink(skb, skb->list);
3502 __skb_queue_tail(&sk->sk_receive_queue, skb);
3503 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
3504 if(skb->h.th->fin)
3505 tcp_fin(skb, sk, skb->h.th);
3506 }
3507}
3508
3509static int tcp_prune_queue(struct sock *sk);
3510
3511static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
3512{
3513 struct tcphdr *th = skb->h.th;
3514 struct tcp_sock *tp = tcp_sk(sk);
3515 int eaten = -1;
3516
3517 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
3518 goto drop;
3519
3520 th = skb->h.th;
3521 __skb_pull(skb, th->doff*4);
3522
3523 TCP_ECN_accept_cwr(tp, skb);
3524
3525 if (tp->rx_opt.dsack) {
3526 tp->rx_opt.dsack = 0;
3527 tp->rx_opt.eff_sacks = min_t(unsigned int, tp->rx_opt.num_sacks,
3528 4 - tp->rx_opt.tstamp_ok);
3529 }
3530
3531 /* Queue data for delivery to the user.
3532 * Packets in sequence go to the receive queue.
3533 * Out of sequence packets to the out_of_order_queue.
3534 */
3535 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
3536 if (tcp_receive_window(tp) == 0)
3537 goto out_of_window;
3538
3539 /* Ok. In sequence. In window. */
3540 if (tp->ucopy.task == current &&
3541 tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
3542 sock_owned_by_user(sk) && !tp->urg_data) {
3543 int chunk = min_t(unsigned int, skb->len,
3544 tp->ucopy.len);
3545
3546 __set_current_state(TASK_RUNNING);
3547
3548 local_bh_enable();
3549 if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
3550 tp->ucopy.len -= chunk;
3551 tp->copied_seq += chunk;
3552 eaten = (chunk == skb->len && !th->fin);
3553 tcp_rcv_space_adjust(sk);
3554 }
3555 local_bh_disable();
3556 }
3557
3558 if (eaten <= 0) {
3559queue_and_out:
3560 if (eaten < 0 &&
3561 (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
3562 !sk_stream_rmem_schedule(sk, skb))) {
3563 if (tcp_prune_queue(sk) < 0 ||
3564 !sk_stream_rmem_schedule(sk, skb))
3565 goto drop;
3566 }
3567 sk_stream_set_owner_r(skb, sk);
3568 __skb_queue_tail(&sk->sk_receive_queue, skb);
3569 }
3570 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
3571 if(skb->len)
3572 tcp_event_data_recv(sk, tp, skb);
3573 if(th->fin)
3574 tcp_fin(skb, sk, th);
3575
3576 if (skb_queue_len(&tp->out_of_order_queue)) {
3577 tcp_ofo_queue(sk);
3578
3579 /* RFC2581 4.2: we SHOULD send an immediate ACK when a
3580 * gap in the queue is filled.
3581 */
3582 if (!skb_queue_len(&tp->out_of_order_queue))
3583 tp->ack.pingpong = 0;
3584 }
3585
3586 if (tp->rx_opt.num_sacks)
3587 tcp_sack_remove(tp);
3588
3589 tcp_fast_path_check(sk, tp);
3590
3591 if (eaten > 0)
3592 __kfree_skb(skb);
3593 else if (!sock_flag(sk, SOCK_DEAD))
3594 sk->sk_data_ready(sk, 0);
3595 return;
3596 }
3597
3598 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
3599 /* A retransmit, 2nd most common case. Force an immediate ack. */
3600 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
3601 tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
3602
3603out_of_window:
3604 tcp_enter_quickack_mode(tp);
3605 tcp_schedule_ack(tp);
3606drop:
3607 __kfree_skb(skb);
3608 return;
3609 }
3610
3611 /* Out of window. F.e. zero window probe. */
3612 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
3613 goto out_of_window;
3614
3615 tcp_enter_quickack_mode(tp);
3616
3617 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
3618 /* Partial packet, seq < rcv_next < end_seq */
3619 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
3620 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
3621 TCP_SKB_CB(skb)->end_seq);
3622
3623 tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
3624
3625 /* If window is closed, drop tail of packet. But after
3626 * remembering D-SACK for its head made in previous line.
3627 */
3628 if (!tcp_receive_window(tp))
3629 goto out_of_window;
3630 goto queue_and_out;
3631 }
3632
3633 TCP_ECN_check_ce(tp, skb);
3634
3635 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
3636 !sk_stream_rmem_schedule(sk, skb)) {
3637 if (tcp_prune_queue(sk) < 0 ||
3638 !sk_stream_rmem_schedule(sk, skb))
3639 goto drop;
3640 }
3641
3642 /* Disable header prediction. */
3643 tp->pred_flags = 0;
3644 tcp_schedule_ack(tp);
3645
3646 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
3647 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
3648
3649 sk_stream_set_owner_r(skb, sk);
3650
3651 if (!skb_peek(&tp->out_of_order_queue)) {
3652 /* Initial out of order segment, build 1 SACK. */
3653 if (tp->rx_opt.sack_ok) {
3654 tp->rx_opt.num_sacks = 1;
3655 tp->rx_opt.dsack = 0;
3656 tp->rx_opt.eff_sacks = 1;
3657 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
3658 tp->selective_acks[0].end_seq =
3659 TCP_SKB_CB(skb)->end_seq;
3660 }
3661 __skb_queue_head(&tp->out_of_order_queue,skb);
3662 } else {
3663 struct sk_buff *skb1 = tp->out_of_order_queue.prev;
3664 u32 seq = TCP_SKB_CB(skb)->seq;
3665 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
3666
3667 if (seq == TCP_SKB_CB(skb1)->end_seq) {
3668 __skb_append(skb1, skb);
3669
3670 if (!tp->rx_opt.num_sacks ||
3671 tp->selective_acks[0].end_seq != seq)
3672 goto add_sack;
3673
3674 /* Common case: data arrive in order after hole. */
3675 tp->selective_acks[0].end_seq = end_seq;
3676 return;
3677 }
3678
3679 /* Find place to insert this segment. */
3680 do {
3681 if (!after(TCP_SKB_CB(skb1)->seq, seq))
3682 break;
3683 } while ((skb1 = skb1->prev) !=
3684 (struct sk_buff*)&tp->out_of_order_queue);
3685
3686 /* Does the skb overlap the previous one? */
3687 if (skb1 != (struct sk_buff*)&tp->out_of_order_queue &&
3688 before(seq, TCP_SKB_CB(skb1)->end_seq)) {
3689 if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
3690 /* All the bits are present. Drop. */
3691 __kfree_skb(skb);
3692 tcp_dsack_set(tp, seq, end_seq);
3693 goto add_sack;
3694 }
3695 if (after(seq, TCP_SKB_CB(skb1)->seq)) {
3696 /* Partial overlap. */
3697 tcp_dsack_set(tp, seq, TCP_SKB_CB(skb1)->end_seq);
3698 } else {
3699 skb1 = skb1->prev;
3700 }
3701 }
3702 __skb_insert(skb, skb1, skb1->next, &tp->out_of_order_queue);
3703
3704 /* And clean away segments wholly covered by the new one. */
3705 while ((skb1 = skb->next) !=
3706 (struct sk_buff*)&tp->out_of_order_queue &&
3707 after(end_seq, TCP_SKB_CB(skb1)->seq)) {
3708 if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
3709 tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq);
3710 break;
3711 }
3712 __skb_unlink(skb1, skb1->list);
3713 tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq);
3714 __kfree_skb(skb1);
3715 }
3716
3717add_sack:
3718 if (tp->rx_opt.sack_ok)
3719 tcp_sack_new_ofo_skb(sk, seq, end_seq);
3720 }
3721}
3722
3723/* Collapse contiguous sequence of skbs head..tail with
3724 * sequence numbers start..end.
3725 * Segments with FIN/SYN are not collapsed (only because this
3726 * simplifies code)
3727 */
3728static void
3729tcp_collapse(struct sock *sk, struct sk_buff *head,
3730 struct sk_buff *tail, u32 start, u32 end)
3731{
3732 struct sk_buff *skb;
3733
3734 /* First, check that the queue is collapsible and find
3735 * the point where collapsing can be useful. */
3736 for (skb = head; skb != tail; ) {
3737 /* No new bits? It is possible on ofo queue. */
3738 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
3739 struct sk_buff *next = skb->next;
3740 __skb_unlink(skb, skb->list);
3741 __kfree_skb(skb);
3742 NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
3743 skb = next;
3744 continue;
3745 }
3746
3747 /* The first skb to collapse is:
3748 * - not SYN/FIN and
3749 * - bloated or contains data before "start" or
3750 * overlaps the next one.
3751 */
3752 if (!skb->h.th->syn && !skb->h.th->fin &&
3753 (tcp_win_from_space(skb->truesize) > skb->len ||
3754 before(TCP_SKB_CB(skb)->seq, start) ||
3755 (skb->next != tail &&
3756 TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb->next)->seq)))
3757 break;
3758
3759 /* Decided to skip this, advance start seq. */
3760 start = TCP_SKB_CB(skb)->end_seq;
3761 skb = skb->next;
3762 }
3763 if (skb == tail || skb->h.th->syn || skb->h.th->fin)
3764 return;
3765
3766 while (before(start, end)) {
3767 struct sk_buff *nskb;
3768 int header = skb_headroom(skb);
3769 int copy = SKB_MAX_ORDER(header, 0);
3770
3771 /* Too big header? This can happen with IPv6. */
3772 if (copy < 0)
3773 return;
3774 if (end-start < copy)
3775 copy = end-start;
3776 nskb = alloc_skb(copy+header, GFP_ATOMIC);
3777 if (!nskb)
3778 return;
3779 skb_reserve(nskb, header);
3780 memcpy(nskb->head, skb->head, header);
3781 nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head);
3782 nskb->h.raw = nskb->head + (skb->h.raw-skb->head);
3783 nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head);
3784 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
3785 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
3786 __skb_insert(nskb, skb->prev, skb, skb->list);
3787 sk_stream_set_owner_r(nskb, sk);
3788
3789 /* Copy data, releasing collapsed skbs. */
3790 while (copy > 0) {
3791 int offset = start - TCP_SKB_CB(skb)->seq;
3792 int size = TCP_SKB_CB(skb)->end_seq - start;
3793
3794 if (offset < 0) BUG();
3795 if (size > 0) {
3796 size = min(copy, size);
3797 if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
3798 BUG();
3799 TCP_SKB_CB(nskb)->end_seq += size;
3800 copy -= size;
3801 start += size;
3802 }
3803 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
3804 struct sk_buff *next = skb->next;
3805 __skb_unlink(skb, skb->list);
3806 __kfree_skb(skb);
3807 NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
3808 skb = next;
3809 if (skb == tail || skb->h.th->syn || skb->h.th->fin)
3810 return;
3811 }
3812 }
3813 }
3814}
3815
3816/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
3817 * and tcp_collapse() them until all the queue is collapsed.
3818 */
3819static void tcp_collapse_ofo_queue(struct sock *sk)
3820{
3821 struct tcp_sock *tp = tcp_sk(sk);
3822 struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
3823 struct sk_buff *head;
3824 u32 start, end;
3825
3826 if (skb == NULL)
3827 return;
3828
3829 start = TCP_SKB_CB(skb)->seq;
3830 end = TCP_SKB_CB(skb)->end_seq;
3831 head = skb;
3832
3833 for (;;) {
3834 skb = skb->next;
3835
3836 /* The segment is terminated when we see a gap or when
3837 * we are at the end of the queue. */
3838 if (skb == (struct sk_buff *)&tp->out_of_order_queue ||
3839 after(TCP_SKB_CB(skb)->seq, end) ||
3840 before(TCP_SKB_CB(skb)->end_seq, start)) {
3841 tcp_collapse(sk, head, skb, start, end);
3842 head = skb;
3843 if (skb == (struct sk_buff *)&tp->out_of_order_queue)
3844 break;
3845 /* Start new segment */
3846 start = TCP_SKB_CB(skb)->seq;
3847 end = TCP_SKB_CB(skb)->end_seq;
3848 } else {
3849 if (before(TCP_SKB_CB(skb)->seq, start))
3850 start = TCP_SKB_CB(skb)->seq;
3851 if (after(TCP_SKB_CB(skb)->end_seq, end))
3852 end = TCP_SKB_CB(skb)->end_seq;
3853 }
3854 }
3855}
3856
3857/* Reduce allocated memory if we can, trying to get
3858 * the socket within its memory limits again.
3859 *
3860 * Return less than zero if we should start dropping frames
3861 * until the socket owning process reads some of the data
3862 * to stabilize the situation.
3863 */
3864static int tcp_prune_queue(struct sock *sk)
3865{
3866 struct tcp_sock *tp = tcp_sk(sk);
3867
3868 SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
3869
3870 NET_INC_STATS_BH(LINUX_MIB_PRUNECALLED);
3871
3872 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
3873 tcp_clamp_window(sk, tp);
3874 else if (tcp_memory_pressure)
3875 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
3876
3877 tcp_collapse_ofo_queue(sk);
3878 tcp_collapse(sk, sk->sk_receive_queue.next,
3879 (struct sk_buff*)&sk->sk_receive_queue,
3880 tp->copied_seq, tp->rcv_nxt);
3881 sk_stream_mem_reclaim(sk);
3882
3883 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
3884 return 0;
3885
3886 /* Collapsing did not help, destructive actions follow.
3887 * This must not ever occur. */
3888
3889 /* First, purge the out_of_order queue. */
3890 if (skb_queue_len(&tp->out_of_order_queue)) {
3891 NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED,
3892 skb_queue_len(&tp->out_of_order_queue));
3893 __skb_queue_purge(&tp->out_of_order_queue);
3894
3895 /* Reset SACK state. A conforming SACK implementation will
3896 * do the same at a timeout based retransmit. When a connection
3897 * is in a sad state like this, we care only about integrity
3898 * of the connection not performance.
3899 */
3900 if (tp->rx_opt.sack_ok)
3901 tcp_sack_reset(&tp->rx_opt);
3902 sk_stream_mem_reclaim(sk);
3903 }
3904
3905 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
3906 return 0;
3907
3908 /* If we are really being abused, tell the caller to silently
3909 * drop receive data on the floor. It will get retransmitted
3910 * and hopefully then we'll have sufficient space.
3911 */
3912 NET_INC_STATS_BH(LINUX_MIB_RCVPRUNED);
3913
3914 /* Massive buffer overcommit. */
3915 tp->pred_flags = 0;
3916 return -1;
3917}
3918
3919
3920/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
3921 * As additional protections, we do not touch cwnd in retransmission phases,
3922 * and if application hit its sndbuf limit recently.
3923 */
3924void tcp_cwnd_application_limited(struct sock *sk)
3925{
3926 struct tcp_sock *tp = tcp_sk(sk);
3927
3928 if (tp->ca_state == TCP_CA_Open &&
3929 sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
3930 /* Limited by application or receiver window. */
3931 u32 win_used = max(tp->snd_cwnd_used, 2U);
3932 if (win_used < tp->snd_cwnd) {
3933 tp->snd_ssthresh = tcp_current_ssthresh(tp);
3934 tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
3935 }
3936 tp->snd_cwnd_used = 0;
3937 }
3938 tp->snd_cwnd_stamp = tcp_time_stamp;
3939}
3940
3941
3942/* When an incoming ACK has allowed us to free some skb from the write_queue,
3943 * we remember this event in the flag SOCK_QUEUE_SHRUNK and wake up the socket
3944 * on exit from the tcp input handler.
3945 *
3946 * PROBLEM: sndbuf expansion does not work well with largesend.
3947 */
3948static void tcp_new_space(struct sock *sk)
3949{
3950 struct tcp_sock *tp = tcp_sk(sk);
3951
3952 if (tp->packets_out < tp->snd_cwnd &&
3953 !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
3954 !tcp_memory_pressure &&
3955 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
3956 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
3957 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
3958 demanded = max_t(unsigned int, tp->snd_cwnd,
3959 tp->reordering + 1);
3960 sndmem *= 2*demanded;
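 /* i.e. budget roughly 2 * demanded full-sized segments, each including
  * header and struct sk_buff overhead.
  */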
3961 if (sndmem > sk->sk_sndbuf)
3962 sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
3963 tp->snd_cwnd_stamp = tcp_time_stamp;
3964 }
3965
3966 sk->sk_write_space(sk);
3967}
3968
3969static inline void tcp_check_space(struct sock *sk)
3970{
3971 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
3972 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
3973 if (sk->sk_socket &&
3974 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
3975 tcp_new_space(sk);
3976 }
3977}
3978
3979static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
3980{
3981 struct tcp_sock *tp = tcp_sk(sk);
3982
3983 if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
3984 tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
3985 tcp_write_xmit(sk, tp->nonagle))
3986 tcp_check_probe_timer(sk, tp);
3987}
3988
3989static __inline__ void tcp_data_snd_check(struct sock *sk)
3990{
3991 struct sk_buff *skb = sk->sk_send_head;
3992
3993 if (skb != NULL)
3994 __tcp_data_snd_check(sk, skb);
3995 tcp_check_space(sk);
3996}
3997
3998/*
3999 * Check if sending an ack is needed.
4000 */
4001static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
4002{
4003 struct tcp_sock *tp = tcp_sk(sk);
4004
4005 /* More than one full frame received... */
4006 if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss
4007 /* ... and right edge of window advances far enough.
4008 * (tcp_recvmsg() will send ACK otherwise). Or...
4009 */
4010 && __tcp_select_window(sk) >= tp->rcv_wnd) ||
4011 /* We ACK each frame or... */
4012 tcp_in_quickack_mode(tp) ||
4013 /* We have out of order data. */
4014 (ofo_possible &&
4015 skb_peek(&tp->out_of_order_queue))) {
4016 /* Then ack it now */
4017 tcp_send_ack(sk);
4018 } else {
4019 /* Else, send delayed ack. */
4020 tcp_send_delayed_ack(sk);
4021 }
4022}
4023
4024static __inline__ void tcp_ack_snd_check(struct sock *sk)
4025{
4026 struct tcp_sock *tp = tcp_sk(sk);
4027 if (!tcp_ack_scheduled(tp)) {
4028 /* We sent a data segment already. */
4029 return;
4030 }
4031 __tcp_ack_snd_check(sk, 1);
4032}
4033
4034/*
4035 * This routine is only called when we have urgent data
4036 * signalled. It's the 'slow' part of tcp_urg. It could be
4037 * moved inline now, as tcp_urg is only called from one
4038 * place. We handle URGent data wrong. We have to - as
4039 * BSD still doesn't use the correction from RFC961.
4040 * For 1003.1g we should support a new option TCP_STDURG to permit
4041 * either form (or just set the sysctl tcp_stdurg).
4042 */
4043
4044static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
4045{
4046 struct tcp_sock *tp = tcp_sk(sk);
4047 u32 ptr = ntohs(th->urg_ptr);
4048
4049 if (ptr && !sysctl_tcp_stdurg)
4050 ptr--;
4051 ptr += ntohl(th->seq);
4052
4053 /* Ignore urgent data that we've already seen and read. */
4054 if (after(tp->copied_seq, ptr))
4055 return;
4056
4057 /* Do not replay urg ptr.
4058 *
4059 * NOTE: interesting situation not covered by specs.
4060 * A misbehaving sender may send an urg ptr pointing into a segment
4061 * which we already have in the ofo queue. We are not able to fetch
4062 * such data and will stay in TCP_URG_NOTYET until it is eaten
4063 * by recvmsg(). It seems we are not obliged to handle such wicked
4064 * situations, but it is worth thinking about the possibility of
4065 * DoSes using some hypothetical application level deadlock.
4066 */
4067 if (before(ptr, tp->rcv_nxt))
4068 return;
4069
4070 /* Do we already have a newer (or duplicate) urgent pointer? */
4071 if (tp->urg_data && !after(ptr, tp->urg_seq))
4072 return;
4073
4074 /* Tell the world about our new urgent pointer. */
4075 sk_send_sigurg(sk);
4076
4077 /* We may be adding urgent data when the last byte read was
4078 * urgent. To do this requires some care. We cannot just ignore
4079 * tp->copied_seq since we would read the last urgent byte again
4080 * as data, nor can we alter copied_seq until this data arrives
4081	 * or we break the semantics of SIOCATMARK (and thus sockatmark())
4082 *
4083	 * NOTE. Double Dutch. Rendered into plain English: the author of the
4084	 * comment above did something like	send("A", MSG_OOB); send("B", MSG_OOB);
4085	 * and expected both A and B to disappear from the stream. This is _wrong_.
4086	 * Though this happens in BSD with high probability, it is not guaranteed.
4087	 * Any application relying on it is buggy. Note also that the fix "works"
4088	 * only in this artificial test. Insert some normal data between A and B
4089	 * and we will diverge from BSD again. Verdict: it is better to drop the
4090	 * fix and trap buggy users.
4091 */
4092 if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
4093 !sock_flag(sk, SOCK_URGINLINE) &&
4094 tp->copied_seq != tp->rcv_nxt) {
4095 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
4096 tp->copied_seq++;
4097 if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
4098 __skb_unlink(skb, skb->list);
4099 __kfree_skb(skb);
4100 }
4101 }
4102
4103 tp->urg_data = TCP_URG_NOTYET;
4104 tp->urg_seq = ptr;
4105
4106 /* Disable header prediction. */
4107 tp->pred_flags = 0;
4108}
4109
4110/* This is the 'fast' part of urgent handling. */
4111static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
4112{
4113 struct tcp_sock *tp = tcp_sk(sk);
4114
4115 /* Check if we get a new urgent pointer - normally not. */
4116 if (th->urg)
4117 tcp_check_urg(sk,th);
4118
4119 /* Do we wait for any urgent data? - normally not... */
4120 if (tp->urg_data == TCP_URG_NOTYET) {
4121 u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
4122 th->syn;
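		/* Sketch with illustrative numbers: if tp->urg_seq is 10 bytes
		 * past th->seq and the header is 20 bytes (doff == 5, no SYN),
		 * then ptr == 10 + 20 == 30, i.e. the urgent byte sits at
		 * offset 30 of this skb, whose data still starts at the TCP
		 * header here.
		 */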
4123
4124 /* Is the urgent pointer pointing into this packet? */
4125 if (ptr < skb->len) {
4126 u8 tmp;
4127 if (skb_copy_bits(skb, ptr, &tmp, 1))
4128 BUG();
4129 tp->urg_data = TCP_URG_VALID | tmp;
4130 if (!sock_flag(sk, SOCK_DEAD))
4131 sk->sk_data_ready(sk, 0);
4132 }
4133 }
4134}
4135
4136static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
4137{
4138 struct tcp_sock *tp = tcp_sk(sk);
4139 int chunk = skb->len - hlen;
4140 int err;
4141
4142 local_bh_enable();
4143 if (skb->ip_summed==CHECKSUM_UNNECESSARY)
4144 err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk);
4145 else
4146 err = skb_copy_and_csum_datagram_iovec(skb, hlen,
4147 tp->ucopy.iov);
4148
4149 if (!err) {
4150 tp->ucopy.len -= chunk;
4151 tp->copied_seq += chunk;
4152 tcp_rcv_space_adjust(sk);
4153 }
4154
4155 local_bh_disable();
4156 return err;
4157}
4158
4159static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
4160{
4161 int result;
4162
4163 if (sock_owned_by_user(sk)) {
4164 local_bh_enable();
4165 result = __tcp_checksum_complete(skb);
4166 local_bh_disable();
4167 } else {
4168 result = __tcp_checksum_complete(skb);
4169 }
4170 return result;
4171}
4172
4173static __inline__ int
4174tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
4175{
4176 return skb->ip_summed != CHECKSUM_UNNECESSARY &&
4177 __tcp_checksum_complete_user(sk, skb);
4178}
4179
4180/*
4181 * TCP receive function for the ESTABLISHED state.
4182 *
4183 * It is split into a fast path and a slow path. The fast path is
4184 * disabled when:
4185 * - A zero window was announced from us - zero window probing
4186 * is only handled properly in the slow path.
4187 * - Out of order segments arrived.
4188 * - Urgent data is expected.
4189 * - There is no buffer space left
4190 * - Unexpected TCP flags/window values/header lengths are received
4191 * (detected by checking the TCP header against pred_flags)
4192 * - Data is sent in both directions. Fast path only supports pure senders
4193 * or pure receivers (this means either the sequence number or the ack
4194 * value must stay constant)
4195 * - Unexpected TCP option.
4196 *
4197 * When these conditions are not satisfied it drops into a standard
4198 * receive procedure patterned after RFC793 to handle all cases.
4199 * The first three cases are guaranteed by proper pred_flags setting,
4200 * the rest is checked inline. Fast processing is turned on in
4201 * tcp_data_queue when everything is OK.
4202 */
4203int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4204 struct tcphdr *th, unsigned len)
4205{
4206 struct tcp_sock *tp = tcp_sk(sk);
4207
4208 /*
4209 * Header prediction.
4210 * The code loosely follows the one in the famous
4211 * "30 instruction TCP receive" Van Jacobson mail.
4212 *
4213 * Van's trick is to deposit buffers into socket queue
4214 * on a device interrupt, to call tcp_recv function
4215 * on the receive process context and checksum and copy
4216 * the buffer to user space. smart...
4217 *
4218 * Our current scheme is not silly either but we take the
4219 * extra cost of the net_bh soft interrupt processing...
4220 * We do checksum and copy also but from device to kernel.
4221 */
4222
4223 tp->rx_opt.saw_tstamp = 0;
4224
4225 /* pred_flags is 0xS?10 << 16 + snd_wnd
4226	 * if header prediction is to be made
4227 * 'S' will always be tp->tcp_header_len >> 2
4228 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
4229 * turn it off (when there are holes in the receive
4230 * space for instance)
4231 * PSH flag is ignored.
4232 */
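	/* Concrete instance (assumed values): a bare 20-byte header gives
	 * S == 5, so with only ACK set, no window scaling, and a 5840-byte
	 * window from the peer, the masked flag word below must equal
	 *	htonl(0x5010 << 16 | 5840)
	 * for the fast path to be taken; extra flags, a different doff or a
	 * changed window all push us into the slow path.
	 */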
4233
4234 if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
4235 TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
4236 int tcp_header_len = tp->tcp_header_len;
4237
4238 /* Timestamp header prediction: tcp_header_len
4239 * is automatically equal to th->doff*4 due to pred_flags
4240 * match.
4241 */
4242
4243 /* Check timestamp */
4244 if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
4245 __u32 *ptr = (__u32 *)(th + 1);
4246
4247 /* No? Slow path! */
4248 if (*ptr != ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
4249 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
4250 goto slow_path;
4251
4252 tp->rx_opt.saw_tstamp = 1;
4253 ++ptr;
4254 tp->rx_opt.rcv_tsval = ntohl(*ptr);
4255 ++ptr;
4256 tp->rx_opt.rcv_tsecr = ntohl(*ptr);
4257
4258 /* If PAWS failed, check it more carefully in slow path */
4259 if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
4260 goto slow_path;
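			/* The signed 32-bit difference makes this check wrap-safe:
			 * e.g. (assumed values) rcv_tsval == 5 just after the peer's
			 * timestamp clock wrapped and ts_recent == 0xfffffff0 give a
			 * small positive difference, so the segment still passes here.
			 */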
4261
4262			 * DO NOT update ts_recent here: if the checksum fails
4263			 * and the timestamp was the corrupted part, it would result
4264			 * in a hung connection since we would drop all
4265			 * future packets due to the PAWS test.
4266 */
4267 }
4268
4269 if (len <= tcp_header_len) {
4270 /* Bulk data transfer: sender */
4271 if (len == tcp_header_len) {
4272 /* Predicted packet is in window by definition.
4273 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
4274 * Hence, check seq<=rcv_wup reduces to:
4275 */
4276 if (tcp_header_len ==
4277 (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
4278 tp->rcv_nxt == tp->rcv_wup)
4279 tcp_store_ts_recent(tp);
4280
4281 tcp_rcv_rtt_measure_ts(tp, skb);
4282
4283 /* We know that such packets are checksummed
4284 * on entry.
4285 */
4286 tcp_ack(sk, skb, 0);
4287 __kfree_skb(skb);
4288 tcp_data_snd_check(sk);
4289 return 0;
4290 } else { /* Header too small */
4291 TCP_INC_STATS_BH(TCP_MIB_INERRS);
4292 goto discard;
4293 }
4294 } else {
4295 int eaten = 0;
4296
4297 if (tp->ucopy.task == current &&
4298 tp->copied_seq == tp->rcv_nxt &&
4299 len - tcp_header_len <= tp->ucopy.len &&
4300 sock_owned_by_user(sk)) {
4301 __set_current_state(TASK_RUNNING);
4302
4303 if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
4304 /* Predicted packet is in window by definition.
4305 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
4306 * Hence, check seq<=rcv_wup reduces to:
4307 */
4308 if (tcp_header_len ==
4309 (sizeof(struct tcphdr) +
4310 TCPOLEN_TSTAMP_ALIGNED) &&
4311 tp->rcv_nxt == tp->rcv_wup)
4312 tcp_store_ts_recent(tp);
4313
4314 tcp_rcv_rtt_measure_ts(tp, skb);
4315
4316 __skb_pull(skb, tcp_header_len);
4317 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4318 NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER);
4319 eaten = 1;
4320 }
4321 }
4322 if (!eaten) {
4323 if (tcp_checksum_complete_user(sk, skb))
4324 goto csum_error;
4325
4326 /* Predicted packet is in window by definition.
4327 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
4328 * Hence, check seq<=rcv_wup reduces to:
4329 */
4330 if (tcp_header_len ==
4331 (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
4332 tp->rcv_nxt == tp->rcv_wup)
4333 tcp_store_ts_recent(tp);
4334
4335 tcp_rcv_rtt_measure_ts(tp, skb);
4336
4337 if ((int)skb->truesize > sk->sk_forward_alloc)
4338 goto step5;
4339
4340 NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS);
4341
4342 /* Bulk data transfer: receiver */
4343 __skb_pull(skb,tcp_header_len);
4344 __skb_queue_tail(&sk->sk_receive_queue, skb);
4345 sk_stream_set_owner_r(skb, sk);
4346 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4347 }
4348
4349 tcp_event_data_recv(sk, tp, skb);
4350
4351 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
4352 /* Well, only one small jumplet in fast path... */
4353 tcp_ack(sk, skb, FLAG_DATA);
4354 tcp_data_snd_check(sk);
4355 if (!tcp_ack_scheduled(tp))
4356 goto no_ack;
4357 }
4358
4359 if (eaten) {
4360 if (tcp_in_quickack_mode(tp)) {
4361 tcp_send_ack(sk);
4362 } else {
4363 tcp_send_delayed_ack(sk);
4364 }
4365 } else {
4366 __tcp_ack_snd_check(sk, 0);
4367 }
4368
4369no_ack:
4370 if (eaten)
4371 __kfree_skb(skb);
4372 else
4373 sk->sk_data_ready(sk, 0);
4374 return 0;
4375 }
4376 }
4377
4378slow_path:
4379 if (len < (th->doff<<2) || tcp_checksum_complete_user(sk, skb))
4380 goto csum_error;
4381
4382 /*
4383 * RFC1323: H1. Apply PAWS check first.
4384 */
4385 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
4386 tcp_paws_discard(tp, skb)) {
4387 if (!th->rst) {
4388 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
4389 tcp_send_dupack(sk, skb);
4390 goto discard;
4391 }
4392 /* Resets are accepted even if PAWS failed.
4393
4394 ts_recent update must be made after we are sure
4395 that the packet is in window.
4396 */
4397 }
4398
4399 /*
4400 * Standard slow path.
4401 */
4402
4403 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
4404 /* RFC793, page 37: "In all states except SYN-SENT, all reset
4405 * (RST) segments are validated by checking their SEQ-fields."
4406 * And page 69: "If an incoming segment is not acceptable,
4407 * an acknowledgment should be sent in reply (unless the RST bit
4408 * is set, if so drop the segment and return)".
4409 */
4410 if (!th->rst)
4411 tcp_send_dupack(sk, skb);
4412 goto discard;
4413 }
4414
4415 if(th->rst) {
4416 tcp_reset(sk);
4417 goto discard;
4418 }
4419
4420 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
4421
4422 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4423 TCP_INC_STATS_BH(TCP_MIB_INERRS);
4424 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);
4425 tcp_reset(sk);
4426 return 1;
4427 }
4428
4429step5:
4430 if(th->ack)
4431 tcp_ack(sk, skb, FLAG_SLOWPATH);
4432
4433 tcp_rcv_rtt_measure_ts(tp, skb);
4434
4435 /* Process urgent data. */
4436 tcp_urg(sk, skb, th);
4437
4438 /* step 7: process the segment text */
4439 tcp_data_queue(sk, skb);
4440
4441 tcp_data_snd_check(sk);
4442 tcp_ack_snd_check(sk);
4443 return 0;
4444
4445csum_error:
4446 TCP_INC_STATS_BH(TCP_MIB_INERRS);
4447
4448discard:
4449 __kfree_skb(skb);
4450 return 0;
4451}
4452
4453static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4454 struct tcphdr *th, unsigned len)
4455{
4456 struct tcp_sock *tp = tcp_sk(sk);
4457 int saved_clamp = tp->rx_opt.mss_clamp;
4458
4459 tcp_parse_options(skb, &tp->rx_opt, 0);
4460
4461 if (th->ack) {
4462 /* rfc793:
4463 * "If the state is SYN-SENT then
4464 * first check the ACK bit
4465 * If the ACK bit is set
4466 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
4467 * a reset (unless the RST bit is set, if so drop
4468 * the segment and return)"
4469 *
4470 * We do not send data with SYN, so that RFC-correct
4471 * test reduces to:
4472 */
4473 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
4474 goto reset_and_undo;
4475
4476 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
4477 !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
4478 tcp_time_stamp)) {
4479 NET_INC_STATS_BH(LINUX_MIB_PAWSACTIVEREJECTED);
4480 goto reset_and_undo;
4481 }
4482
4483 /* Now ACK is acceptable.
4484 *
4485 * "If the RST bit is set
4486 * If the ACK was acceptable then signal the user "error:
4487 * connection reset", drop the segment, enter CLOSED state,
4488 * delete TCB, and return."
4489 */
4490
4491 if (th->rst) {
4492 tcp_reset(sk);
4493 goto discard;
4494 }
4495
4496 /* rfc793:
4497 * "fifth, if neither of the SYN or RST bits is set then
4498 * drop the segment and return."
4499 *
4500 * See note below!
4501 * --ANK(990513)
4502 */
4503 if (!th->syn)
4504 goto discard_and_undo;
4505
4506 /* rfc793:
4507 * "If the SYN bit is on ...
4508 * are acceptable then ...
4509 * (our SYN has been ACKed), change the connection
4510 * state to ESTABLISHED..."
4511 */
4512
4513 TCP_ECN_rcv_synack(tp, th);
4514 if (tp->ecn_flags&TCP_ECN_OK)
4515 sock_set_flag(sk, SOCK_NO_LARGESEND);
4516
4517 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
4518 tcp_ack(sk, skb, FLAG_SLOWPATH);
4519
4520 /* Ok.. it's good. Set up sequence numbers and
4521 * move to established.
4522 */
4523 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
4524 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
4525
4526 /* RFC1323: The window in SYN & SYN/ACK segments is
4527 * never scaled.
4528 */
4529 tp->snd_wnd = ntohs(th->window);
4530 tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq);
4531
4532 if (!tp->rx_opt.wscale_ok) {
4533 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
4534 tp->window_clamp = min(tp->window_clamp, 65535U);
4535 }
4536
4537 if (tp->rx_opt.saw_tstamp) {
4538 tp->rx_opt.tstamp_ok = 1;
4539 tp->tcp_header_len =
4540 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
4541 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
4542 tcp_store_ts_recent(tp);
4543 } else {
4544 tp->tcp_header_len = sizeof(struct tcphdr);
4545 }
4546
4547 if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
4548 tp->rx_opt.sack_ok |= 2;
4549
4550 tcp_sync_mss(sk, tp->pmtu_cookie);
4551 tcp_initialize_rcv_mss(sk);
4552
4553 /* Remember, tcp_poll() does not lock socket!
4554 * Change state from SYN-SENT only after copied_seq
4555 * is initialized. */
4556 tp->copied_seq = tp->rcv_nxt;
4557 mb();
4558 tcp_set_state(sk, TCP_ESTABLISHED);
4559
4560 /* Make sure socket is routed, for correct metrics. */
4561 tp->af_specific->rebuild_header(sk);
4562
4563 tcp_init_metrics(sk);
4564
4565 /* Prevent spurious tcp_cwnd_restart() on first data
4566 * packet.
4567 */
4568 tp->lsndtime = tcp_time_stamp;
4569
4570 tcp_init_buffer_space(sk);
4571
4572 if (sock_flag(sk, SOCK_KEEPOPEN))
4573 tcp_reset_keepalive_timer(sk, keepalive_time_when(tp));
4574
4575 if (!tp->rx_opt.snd_wscale)
4576 __tcp_fast_path_on(tp, tp->snd_wnd);
4577 else
4578 tp->pred_flags = 0;
4579
4580 if (!sock_flag(sk, SOCK_DEAD)) {
4581 sk->sk_state_change(sk);
4582 sk_wake_async(sk, 0, POLL_OUT);
4583 }
4584
4585 if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) {
4586 /* Save one ACK. Data will be ready after
4587 * several ticks, if write_pending is set.
4588 *
4589 * It may be deleted, but with this feature tcpdumps
4590 * look so _wonderfully_ clever, that I was not able
4591 * to stand against the temptation 8) --ANK
4592 */
4593 tcp_schedule_ack(tp);
4594 tp->ack.lrcvtime = tcp_time_stamp;
4595 tp->ack.ato = TCP_ATO_MIN;
4596 tcp_incr_quickack(tp);
4597 tcp_enter_quickack_mode(tp);
4598 tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
4599
4600discard:
4601 __kfree_skb(skb);
4602 return 0;
4603 } else {
4604 tcp_send_ack(sk);
4605 }
4606 return -1;
4607 }
4608
4609 /* No ACK in the segment */
4610
4611 if (th->rst) {
4612 /* rfc793:
4613 * "If the RST bit is set
4614 *
4615 * Otherwise (no ACK) drop the segment and return."
4616 */
4617
4618 goto discard_and_undo;
4619 }
4620
4621 /* PAWS check. */
4622 if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && tcp_paws_check(&tp->rx_opt, 0))
4623 goto discard_and_undo;
4624
4625		/* We see a SYN without an ACK. It is an attempt at a
4626		 * simultaneous connect with crossed SYNs.
4627		 * In particular, it can be a connect to self.
4628 * Particularly, it can be connect to self.
4629 */
4630 tcp_set_state(sk, TCP_SYN_RECV);
4631
4632 if (tp->rx_opt.saw_tstamp) {
4633 tp->rx_opt.tstamp_ok = 1;
4634 tcp_store_ts_recent(tp);
4635 tp->tcp_header_len =
4636 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
4637 } else {
4638 tp->tcp_header_len = sizeof(struct tcphdr);
4639 }
4640
4641 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
4642 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
4643
4644 /* RFC1323: The window in SYN & SYN/ACK segments is
4645 * never scaled.
4646 */
4647 tp->snd_wnd = ntohs(th->window);
4648 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
4649 tp->max_window = tp->snd_wnd;
4650
4651 TCP_ECN_rcv_syn(tp, th);
4652 if (tp->ecn_flags&TCP_ECN_OK)
4653 sock_set_flag(sk, SOCK_NO_LARGESEND);
4654
4655 tcp_sync_mss(sk, tp->pmtu_cookie);
4656 tcp_initialize_rcv_mss(sk);
4657
4658
4659 tcp_send_synack(sk);
4660#if 0
4661 /* Note, we could accept data and URG from this segment.
4662	 * There is no obstacle to doing so.
4663	 *
4664	 * However, since we sometimes ignore data in ACKless segments,
4665	 * we have no reason to accept it at other times.
4666	 * Also, the code doing this in step6 of tcp_rcv_state_process
4667	 * does not seem flawless. So, discard the packet for sanity.
4668 * Uncomment this return to process the data.
4669 */
4670 return -1;
4671#else
4672 goto discard;
4673#endif
4674 }
4675 /* "fifth, if neither of the SYN or RST bits is set then
4676 * drop the segment and return."
4677 */
4678
4679discard_and_undo:
4680 tcp_clear_options(&tp->rx_opt);
4681 tp->rx_opt.mss_clamp = saved_clamp;
4682 goto discard;
4683
4684reset_and_undo:
4685 tcp_clear_options(&tp->rx_opt);
4686 tp->rx_opt.mss_clamp = saved_clamp;
4687 return 1;
4688}
4689
4690
4691/*
4692 * This function implements the receiving procedure of RFC 793 for
4693 * all states except ESTABLISHED and TIME_WAIT.
4694 * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
4695 * address independent.
4696 */
4697
4698int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4699 struct tcphdr *th, unsigned len)
4700{
4701 struct tcp_sock *tp = tcp_sk(sk);
4702 int queued = 0;
4703
4704 tp->rx_opt.saw_tstamp = 0;
4705
4706 switch (sk->sk_state) {
4707 case TCP_CLOSE:
4708 goto discard;
4709
4710 case TCP_LISTEN:
4711 if(th->ack)
4712 return 1;
4713
4714 if(th->rst)
4715 goto discard;
4716
4717 if(th->syn) {
4718 if(tp->af_specific->conn_request(sk, skb) < 0)
4719 return 1;
4720
4721 init_westwood(sk);
4722 init_bictcp(tp);
4723
4724 /* Now we have several options: In theory there is
4725 * nothing else in the frame. KA9Q has an option to
4726 * send data with the syn, BSD accepts data with the
4727 * syn up to the [to be] advertised window and
4728 * Solaris 2.1 gives you a protocol error. For now
4729 * we just ignore it, that fits the spec precisely
4730 * and avoids incompatibilities. It would be nice in
4731 * future to drop through and process the data.
4732 *
4733 * Now that TTCP is starting to be used we ought to
4734 * queue this data.
4735 * But, this leaves one open to an easy denial of
4736 * service attack, and SYN cookies can't defend
4737 * against this problem. So, we drop the data
4738 * in the interest of security over speed.
4739 */
4740 goto discard;
4741 }
4742 goto discard;
4743
4744 case TCP_SYN_SENT:
4745 init_westwood(sk);
4746 init_bictcp(tp);
4747
4748 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
4749 if (queued >= 0)
4750 return queued;
4751
4752 /* Do step6 onward by hand. */
4753 tcp_urg(sk, skb, th);
4754 __kfree_skb(skb);
4755 tcp_data_snd_check(sk);
4756 return 0;
4757 }
4758
4759 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
4760 tcp_paws_discard(tp, skb)) {
4761 if (!th->rst) {
4762 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
4763 tcp_send_dupack(sk, skb);
4764 goto discard;
4765 }
4766 /* Reset is accepted even if it did not pass PAWS. */
4767 }
4768
4769 /* step 1: check sequence number */
4770 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
4771 if (!th->rst)
4772 tcp_send_dupack(sk, skb);
4773 goto discard;
4774 }
4775
4776 /* step 2: check RST bit */
4777 if(th->rst) {
4778 tcp_reset(sk);
4779 goto discard;
4780 }
4781
4782 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
4783
4784 /* step 3: check security and precedence [ignored] */
4785
4786 /* step 4:
4787 *
4788 * Check for a SYN in window.
4789 */
4790 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4791 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);
4792 tcp_reset(sk);
4793 return 1;
4794 }
4795
4796 /* step 5: check the ACK field */
4797 if (th->ack) {
4798 int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH);
4799
4800 switch(sk->sk_state) {
4801 case TCP_SYN_RECV:
4802 if (acceptable) {
4803 tp->copied_seq = tp->rcv_nxt;
4804 mb();
4805 tcp_set_state(sk, TCP_ESTABLISHED);
4806 sk->sk_state_change(sk);
4807
4808 /* Note, that this wakeup is only for marginal
4809 * crossed SYN case. Passively open sockets
4810			 * are not woken up, because sk->sk_sleep ==
4811 * NULL and sk->sk_socket == NULL.
4812 */
4813 if (sk->sk_socket) {
4814 sk_wake_async(sk,0,POLL_OUT);
4815 }
4816
4817 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
4818 tp->snd_wnd = ntohs(th->window) <<
4819 tp->rx_opt.snd_wscale;
4820 tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq,
4821 TCP_SKB_CB(skb)->seq);
4822
4823 /* tcp_ack considers this ACK as duplicate
4824 * and does not calculate rtt.
4825 * Fix it at least with timestamps.
4826 */
4827 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
4828 !tp->srtt)
4829 tcp_ack_saw_tstamp(tp, 0);
4830
4831 if (tp->rx_opt.tstamp_ok)
4832 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
4833
4834 /* Make sure socket is routed, for
4835 * correct metrics.
4836 */
4837 tp->af_specific->rebuild_header(sk);
4838
4839 tcp_init_metrics(sk);
4840
4841 /* Prevent spurious tcp_cwnd_restart() on
4842 * first data packet.
4843 */
4844 tp->lsndtime = tcp_time_stamp;
4845
4846 tcp_initialize_rcv_mss(sk);
4847 tcp_init_buffer_space(sk);
4848 tcp_fast_path_on(tp);
4849 } else {
4850 return 1;
4851 }
4852 break;
4853
4854 case TCP_FIN_WAIT1:
4855 if (tp->snd_una == tp->write_seq) {
4856 tcp_set_state(sk, TCP_FIN_WAIT2);
4857 sk->sk_shutdown |= SEND_SHUTDOWN;
4858 dst_confirm(sk->sk_dst_cache);
4859
4860 if (!sock_flag(sk, SOCK_DEAD))
4861 /* Wake up lingering close() */
4862 sk->sk_state_change(sk);
4863 else {
4864 int tmo;
4865
4866 if (tp->linger2 < 0 ||
4867 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
4868 after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
4869 tcp_done(sk);
4870 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA);
4871 return 1;
4872 }
4873
4874 tmo = tcp_fin_time(tp);
4875 if (tmo > TCP_TIMEWAIT_LEN) {
4876 tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
4877 } else if (th->fin || sock_owned_by_user(sk)) {
4878				/* Bad case. We could lose such a FIN otherwise.
4879				 * It is not a big problem, but it looks confusing
4880				 * and is not so rare an event. We can still lose it now,
4881 * if it spins in bh_lock_sock(), but it is really
4882 * marginal case.
4883 */
4884 tcp_reset_keepalive_timer(sk, tmo);
4885 } else {
4886 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
4887 goto discard;
4888 }
4889 }
4890 }
4891 break;
4892
4893 case TCP_CLOSING:
4894 if (tp->snd_una == tp->write_seq) {
4895 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
4896 goto discard;
4897 }
4898 break;
4899
4900 case TCP_LAST_ACK:
4901 if (tp->snd_una == tp->write_seq) {
4902 tcp_update_metrics(sk);
4903 tcp_done(sk);
4904 goto discard;
4905 }
4906 break;
4907 }
4908 } else
4909 goto discard;
4910
4911 /* step 6: check the URG bit */
4912 tcp_urg(sk, skb, th);
4913
4914 /* step 7: process the segment text */
4915 switch (sk->sk_state) {
4916 case TCP_CLOSE_WAIT:
4917 case TCP_CLOSING:
4918 case TCP_LAST_ACK:
4919 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
4920 break;
4921 case TCP_FIN_WAIT1:
4922 case TCP_FIN_WAIT2:
4923 /* RFC 793 says to queue data in these states,
4924 * RFC 1122 says we MUST send a reset.
4925 * BSD 4.4 also does reset.
4926 */
4927 if (sk->sk_shutdown & RCV_SHUTDOWN) {
4928 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
4929 after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
4930 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA);
4931 tcp_reset(sk);
4932 return 1;
4933 }
4934 }
4935 /* Fall through */
4936 case TCP_ESTABLISHED:
4937 tcp_data_queue(sk, skb);
4938 queued = 1;
4939 break;
4940 }
4941
4942 /* tcp_data could move socket to TIME-WAIT */
4943 if (sk->sk_state != TCP_CLOSE) {
4944 tcp_data_snd_check(sk);
4945 tcp_ack_snd_check(sk);
4946 }
4947
4948 if (!queued) {
4949discard:
4950 __kfree_skb(skb);
4951 }
4952 return 0;
4953}
4954
4955EXPORT_SYMBOL(sysctl_tcp_ecn);
4956EXPORT_SYMBOL(sysctl_tcp_reordering);
4957EXPORT_SYMBOL(tcp_parse_options);
4958EXPORT_SYMBOL(tcp_rcv_established);
4959EXPORT_SYMBOL(tcp_rcv_state_process);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
new file mode 100644
index 000000000000..3ac6659869c4
--- /dev/null
+++ b/net/ipv4/tcp_ipv4.c
@@ -0,0 +1,2663 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * open_request handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 *					Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
53 */
54
55#include <linux/config.h>
56
57#include <linux/types.h>
58#include <linux/fcntl.h>
59#include <linux/module.h>
60#include <linux/random.h>
61#include <linux/cache.h>
62#include <linux/jhash.h>
63#include <linux/init.h>
64#include <linux/times.h>
65
66#include <net/icmp.h>
67#include <net/tcp.h>
68#include <net/ipv6.h>
69#include <net/inet_common.h>
70#include <net/xfrm.h>
71
72#include <linux/inet.h>
73#include <linux/ipv6.h>
74#include <linux/stddef.h>
75#include <linux/proc_fs.h>
76#include <linux/seq_file.h>
77
78extern int sysctl_ip_dynaddr;
79int sysctl_tcp_tw_reuse;
80int sysctl_tcp_low_latency;
81
82/* Check TCP sequence numbers in ICMP packets. */
83#define ICMP_MIN_LENGTH 8
84
85/* Socket used for sending RSTs */
86static struct socket *tcp_socket;
87
88void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
89 struct sk_buff *skb);
90
91struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
92 .__tcp_lhash_lock = RW_LOCK_UNLOCKED,
93 .__tcp_lhash_users = ATOMIC_INIT(0),
94 .__tcp_lhash_wait
95 = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
96 .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
97};
98
99/*
100 * This array holds the first and last local port number.
101 * For high-usage systems, use sysctl to change this to
102 * 32768-61000
103 */
104int sysctl_local_port_range[2] = { 1024, 4999 };
105int tcp_port_rover = 1024 - 1;
106
107static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
108 __u32 faddr, __u16 fport)
109{
110 int h = (laddr ^ lport) ^ (faddr ^ fport);
111 h ^= h >> 16;
112 h ^= h >> 8;
113 return h & (tcp_ehash_size - 1);
114}
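/* The two shift-and-xor folds above spread entropy from the upper bytes of the
 * address/port mix into the low bits before the mask is applied. Illustrative
 * trace: h == 0x12340000 (all the variation in the top half) becomes
 * 0x12341234 after the first fold and 0x12262626 after the second, so even a
 * small power-of-two table sees the difference.
 */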
115
116static __inline__ int tcp_sk_hashfn(struct sock *sk)
117{
118 struct inet_sock *inet = inet_sk(sk);
119 __u32 laddr = inet->rcv_saddr;
120 __u16 lport = inet->num;
121 __u32 faddr = inet->daddr;
122 __u16 fport = inet->dport;
123
124 return tcp_hashfn(laddr, lport, faddr, fport);
125}
126
127/* Allocate and initialize a new TCP local port bind bucket.
128 * The bindhash mutex for snum's hash chain must be held here.
129 */
130struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
131 unsigned short snum)
132{
133 struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
134 SLAB_ATOMIC);
135 if (tb) {
136 tb->port = snum;
137 tb->fastreuse = 0;
138 INIT_HLIST_HEAD(&tb->owners);
139 hlist_add_head(&tb->node, &head->chain);
140 }
141 return tb;
142}
143
144/* Caller must hold hashbucket lock for this tb with local BH disabled */
145void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
146{
147 if (hlist_empty(&tb->owners)) {
148 __hlist_del(&tb->node);
149 kmem_cache_free(tcp_bucket_cachep, tb);
150 }
151}
152
153/* Caller must disable local BH processing. */
154static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
155{
156 struct tcp_bind_hashbucket *head =
157 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
158 struct tcp_bind_bucket *tb;
159
160 spin_lock(&head->lock);
161 tb = tcp_sk(sk)->bind_hash;
162 sk_add_bind_node(child, &tb->owners);
163 tcp_sk(child)->bind_hash = tb;
164 spin_unlock(&head->lock);
165}
166
167inline void tcp_inherit_port(struct sock *sk, struct sock *child)
168{
169 local_bh_disable();
170 __tcp_inherit_port(sk, child);
171 local_bh_enable();
172}
173
174void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
175 unsigned short snum)
176{
177 inet_sk(sk)->num = snum;
178 sk_add_bind_node(sk, &tb->owners);
179 tcp_sk(sk)->bind_hash = tb;
180}
181
182static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
183{
184 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
185 struct sock *sk2;
186 struct hlist_node *node;
187 int reuse = sk->sk_reuse;
188
189 sk_for_each_bound(sk2, node, &tb->owners) {
190 if (sk != sk2 &&
191 !tcp_v6_ipv6only(sk2) &&
192 (!sk->sk_bound_dev_if ||
193 !sk2->sk_bound_dev_if ||
194 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
195 if (!reuse || !sk2->sk_reuse ||
196 sk2->sk_state == TCP_LISTEN) {
197 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
198 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
199 sk2_rcv_saddr == sk_rcv_saddr)
200 break;
201 }
202 }
203 }
204 return node != NULL;
205}
206
207/* Obtain a reference to a local port for the given sock,
208 * if snum is zero it means select any available local port.
209 */
210static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
211{
212 struct tcp_bind_hashbucket *head;
213 struct hlist_node *node;
214 struct tcp_bind_bucket *tb;
215 int ret;
216
217 local_bh_disable();
218 if (!snum) {
219 int low = sysctl_local_port_range[0];
220 int high = sysctl_local_port_range[1];
221 int remaining = (high - low) + 1;
222 int rover;
223
224 spin_lock(&tcp_portalloc_lock);
225 rover = tcp_port_rover;
226 do {
227 rover++;
228 if (rover < low || rover > high)
229 rover = low;
230 head = &tcp_bhash[tcp_bhashfn(rover)];
231 spin_lock(&head->lock);
232 tb_for_each(tb, node, &head->chain)
233 if (tb->port == rover)
234 goto next;
235 break;
236 next:
237 spin_unlock(&head->lock);
238 } while (--remaining > 0);
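		/* E.g. with the default 1024-4999 range, a rover that has
		 * reached 4999 wraps back to 1024 on the next allocation; the
		 * walk gives up only after every port in the range has been
		 * inspected once.
		 */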
239 tcp_port_rover = rover;
240 spin_unlock(&tcp_portalloc_lock);
241
242 /* Exhausted local port range during search? */
243 ret = 1;
244 if (remaining <= 0)
245 goto fail;
246
247 /* OK, here is the one we will use. HEAD is
248 * non-NULL and we hold it's mutex.
249 */
250 snum = rover;
251 } else {
252 head = &tcp_bhash[tcp_bhashfn(snum)];
253 spin_lock(&head->lock);
254 tb_for_each(tb, node, &head->chain)
255 if (tb->port == snum)
256 goto tb_found;
257 }
258 tb = NULL;
259 goto tb_not_found;
260tb_found:
261 if (!hlist_empty(&tb->owners)) {
262 if (sk->sk_reuse > 1)
263 goto success;
264 if (tb->fastreuse > 0 &&
265 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
266 goto success;
267 } else {
268 ret = 1;
269 if (tcp_bind_conflict(sk, tb))
270 goto fail_unlock;
271 }
272 }
273tb_not_found:
274 ret = 1;
275 if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
276 goto fail_unlock;
277 if (hlist_empty(&tb->owners)) {
278 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
279 tb->fastreuse = 1;
280 else
281 tb->fastreuse = 0;
282 } else if (tb->fastreuse &&
283 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
284 tb->fastreuse = 0;
285success:
286 if (!tcp_sk(sk)->bind_hash)
287 tcp_bind_hash(sk, tb, snum);
288 BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
289 ret = 0;
290
291fail_unlock:
292 spin_unlock(&head->lock);
293fail:
294 local_bh_enable();
295 return ret;
296}
297
298/* Get rid of any references to a local port held by the
299 * given sock.
300 */
301static void __tcp_put_port(struct sock *sk)
302{
303 struct inet_sock *inet = inet_sk(sk);
304 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
305 struct tcp_bind_bucket *tb;
306
307 spin_lock(&head->lock);
308 tb = tcp_sk(sk)->bind_hash;
309 __sk_del_bind_node(sk);
310 tcp_sk(sk)->bind_hash = NULL;
311 inet->num = 0;
312 tcp_bucket_destroy(tb);
313 spin_unlock(&head->lock);
314}
315
316void tcp_put_port(struct sock *sk)
317{
318 local_bh_disable();
319 __tcp_put_port(sk);
320 local_bh_enable();
321}
322
323/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
324 * Look, when several writers sleep and a reader wakes them up, all but one
325 * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
326 * this, _but_ remember, it adds useless work on UP machines (a wake up on each
327 * exclusive lock release). It should really be ifdefed.
328 */
329
330void tcp_listen_wlock(void)
331{
332 write_lock(&tcp_lhash_lock);
333
334 if (atomic_read(&tcp_lhash_users)) {
335 DEFINE_WAIT(wait);
336
337 for (;;) {
338 prepare_to_wait_exclusive(&tcp_lhash_wait,
339 &wait, TASK_UNINTERRUPTIBLE);
340 if (!atomic_read(&tcp_lhash_users))
341 break;
342 write_unlock_bh(&tcp_lhash_lock);
343 schedule();
344 write_lock_bh(&tcp_lhash_lock);
345 }
346
347 finish_wait(&tcp_lhash_wait, &wait);
348 }
349}
350
351static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
352{
353 struct hlist_head *list;
354 rwlock_t *lock;
355
356 BUG_TRAP(sk_unhashed(sk));
357 if (listen_possible && sk->sk_state == TCP_LISTEN) {
358 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
359 lock = &tcp_lhash_lock;
360 tcp_listen_wlock();
361 } else {
362 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
363 lock = &tcp_ehash[sk->sk_hashent].lock;
364 write_lock(lock);
365 }
366 __sk_add_node(sk, list);
367 sock_prot_inc_use(sk->sk_prot);
368 write_unlock(lock);
369 if (listen_possible && sk->sk_state == TCP_LISTEN)
370 wake_up(&tcp_lhash_wait);
371}
372
373static void tcp_v4_hash(struct sock *sk)
374{
375 if (sk->sk_state != TCP_CLOSE) {
376 local_bh_disable();
377 __tcp_v4_hash(sk, 1);
378 local_bh_enable();
379 }
380}
381
382void tcp_unhash(struct sock *sk)
383{
384 rwlock_t *lock;
385
386 if (sk_unhashed(sk))
387 goto ende;
388
389 if (sk->sk_state == TCP_LISTEN) {
390 local_bh_disable();
391 tcp_listen_wlock();
392 lock = &tcp_lhash_lock;
393 } else {
394 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
395 lock = &head->lock;
396 write_lock_bh(&head->lock);
397 }
398
399 if (__sk_del_node_init(sk))
400 sock_prot_dec_use(sk->sk_prot);
401 write_unlock_bh(lock);
402
403 ende:
404 if (sk->sk_state == TCP_LISTEN)
405 wake_up(&tcp_lhash_wait);
406}
407
408/* Don't inline this cruft. There are some nice properties to
409 * exploit here. The BSD API does not allow a listening TCP
410 * to specify the remote port nor the remote address for the
411 * connection. So always assume those are both wildcarded
412 * during the search since they can never be otherwise.
413 */
414static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
415 unsigned short hnum, int dif)
416{
417 struct sock *result = NULL, *sk;
418 struct hlist_node *node;
419 int score, hiscore;
420
421 hiscore=-1;
422 sk_for_each(sk, node, head) {
423 struct inet_sock *inet = inet_sk(sk);
424
425 if (inet->num == hnum && !ipv6_only_sock(sk)) {
426 __u32 rcv_saddr = inet->rcv_saddr;
427
428 score = (sk->sk_family == PF_INET ? 1 : 0);
429 if (rcv_saddr) {
430 if (rcv_saddr != daddr)
431 continue;
432 score+=2;
433 }
434 if (sk->sk_bound_dev_if) {
435 if (sk->sk_bound_dev_if != dif)
436 continue;
437 score+=2;
438 }
439 if (score == 5)
440 return sk;
441 if (score > hiscore) {
442 hiscore = score;
443 result = sk;
444 }
445 }
446 }
447 return result;
448}
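/* Scoring sketch: an AF_INET listener bound to the exact destination address
 * and the incoming interface scores 1 + 2 + 2 == 5 and is returned on the
 * spot; a wildcard listener scores only 1 and wins merely when nothing more
 * specific sits on the chain.
 */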
449
450/* Optimize the common listener case. */
451static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
452 unsigned short hnum, int dif)
453{
454 struct sock *sk = NULL;
455 struct hlist_head *head;
456
457 read_lock(&tcp_lhash_lock);
458 head = &tcp_listening_hash[tcp_lhashfn(hnum)];
459 if (!hlist_empty(head)) {
460 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
461
462 if (inet->num == hnum && !sk->sk_node.next &&
463 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
464 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
465 !sk->sk_bound_dev_if)
466 goto sherry_cache;
467 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
468 }
469 if (sk) {
470sherry_cache:
471 sock_hold(sk);
472 }
473 read_unlock(&tcp_lhash_lock);
474 return sk;
475}
476
477/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
478 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
479 *
480 * Local BH must be disabled here.
481 */
482
483static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
484 u32 daddr, u16 hnum,
485 int dif)
486{
487 struct tcp_ehash_bucket *head;
488 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
489 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
490 struct sock *sk;
491 struct hlist_node *node;
492 /* Optimize here for direct hit, only listening connections can
493 * have wildcards anyways.
494 */
495 int hash = tcp_hashfn(daddr, hnum, saddr, sport);
496 head = &tcp_ehash[hash];
497 read_lock(&head->lock);
498 sk_for_each(sk, node, &head->chain) {
499 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
500 goto hit; /* You sunk my battleship! */
501 }
502
503 /* Must check for a TIME_WAIT'er before going to listener hash. */
504 sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
505 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
506 goto hit;
507 }
508 sk = NULL;
509out:
510 read_unlock(&head->lock);
511 return sk;
512hit:
513 sock_hold(sk);
514 goto out;
515}
516
517static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
518 u32 daddr, u16 hnum, int dif)
519{
520 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
521 daddr, hnum, dif);
522
523 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
524}
525
526inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
527 u16 dport, int dif)
528{
529 struct sock *sk;
530
531 local_bh_disable();
532 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
533 local_bh_enable();
534
535 return sk;
536}
537
538EXPORT_SYMBOL_GPL(tcp_v4_lookup);
539
540static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
541{
542 return secure_tcp_sequence_number(skb->nh.iph->daddr,
543 skb->nh.iph->saddr,
544 skb->h.th->dest,
545 skb->h.th->source);
546}
547
548/* called with local bh disabled */
549static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
550 struct tcp_tw_bucket **twp)
551{
552 struct inet_sock *inet = inet_sk(sk);
553 u32 daddr = inet->rcv_saddr;
554 u32 saddr = inet->daddr;
555 int dif = sk->sk_bound_dev_if;
556 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
557 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
558 int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
559 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
560 struct sock *sk2;
561 struct hlist_node *node;
562 struct tcp_tw_bucket *tw;
563
564 write_lock(&head->lock);
565
566 /* Check TIME-WAIT sockets first. */
567 sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
568 tw = (struct tcp_tw_bucket *)sk2;
569
570 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
571 struct tcp_sock *tp = tcp_sk(sk);
572
573 /* With PAWS, it is safe from the viewpoint
574 of data integrity. Even without PAWS it
575 is safe provided sequence spaces do not
576 overlap i.e. at data rates <= 80Mbit/sec.
577
578 Actually, the idea is close to VJ's one,
579 only timestamp cache is held not per host,
580 but per port pair and TW bucket is used
581 as state holder.
582
583 If TW bucket has been already destroyed we
584 fall back to VJ's scheme and use initial
585 timestamp retrieved from peer table.
586 */
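			/* Illustration: the new initial send sequence below is pushed
			 * past anything the old incarnation could still have in flight,
			 * its final snd_nxt plus a maximal 65535-byte window plus 2, so
			 * old duplicates cannot be mistaken for new data.
			 */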
587 if (tw->tw_ts_recent_stamp &&
588 (!twp || (sysctl_tcp_tw_reuse &&
589 xtime.tv_sec -
590 tw->tw_ts_recent_stamp > 1))) {
591 if ((tp->write_seq =
592 tw->tw_snd_nxt + 65535 + 2) == 0)
593 tp->write_seq = 1;
594 tp->rx_opt.ts_recent = tw->tw_ts_recent;
595 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
596 sock_hold(sk2);
597 goto unique;
598 } else
599 goto not_unique;
600 }
601 }
602 tw = NULL;
603
604 /* And established part... */
605 sk_for_each(sk2, node, &head->chain) {
606 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
607 goto not_unique;
608 }
609
610unique:
611	/* Must record num and sport now. Otherwise we will see
612	 * a socket with a funny identity in the hash table. */
613 inet->num = lport;
614 inet->sport = htons(lport);
615 sk->sk_hashent = hash;
616 BUG_TRAP(sk_unhashed(sk));
617 __sk_add_node(sk, &head->chain);
618 sock_prot_inc_use(sk->sk_prot);
619 write_unlock(&head->lock);
620
621 if (twp) {
622 *twp = tw;
623 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
624 } else if (tw) {
625 /* Silly. Should hash-dance instead... */
626 tcp_tw_deschedule(tw);
627 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
628
629 tcp_tw_put(tw);
630 }
631
632 return 0;
633
634not_unique:
635 write_unlock(&head->lock);
636 return -EADDRNOTAVAIL;
637}
638
639static inline u32 connect_port_offset(const struct sock *sk)
640{
641 const struct inet_sock *inet = inet_sk(sk);
642
643 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
644 inet->dport);
645}
646
647/*
648 * Bind a port for a connect operation and hash it.
649 */
650static inline int tcp_v4_hash_connect(struct sock *sk)
651{
652 unsigned short snum = inet_sk(sk)->num;
653 struct tcp_bind_hashbucket *head;
654 struct tcp_bind_bucket *tb;
655 int ret;
656
657 if (!snum) {
658 int low = sysctl_local_port_range[0];
659 int high = sysctl_local_port_range[1];
660 int range = high - low;
661 int i;
662 int port;
663 static u32 hint;
664 u32 offset = hint + connect_port_offset(sk);
665 struct hlist_node *node;
666 struct tcp_tw_bucket *tw = NULL;
667
668 local_bh_disable();
669 for (i = 1; i <= range; i++) {
670 port = low + (i + offset) % range;
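			/* The per-destination offset from connect_port_offset()
			 * spreads different peers across the local range; e.g.
			 * (assumed values) low == 1024, range == 3975 and
			 * offset == 37041 make the first candidate
			 * 1024 + 37042 % 3975 == 2291, and i then walks the
			 * remaining candidates from there.
			 */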
671 head = &tcp_bhash[tcp_bhashfn(port)];
672 spin_lock(&head->lock);
673
674 /* Does not bother with rcv_saddr checks,
675 * because the established check is already
676 * unique enough.
677 */
678 tb_for_each(tb, node, &head->chain) {
679 if (tb->port == port) {
680 BUG_TRAP(!hlist_empty(&tb->owners));
681 if (tb->fastreuse >= 0)
682 goto next_port;
683 if (!__tcp_v4_check_established(sk,
684 port,
685 &tw))
686 goto ok;
687 goto next_port;
688 }
689 }
690
691 tb = tcp_bucket_create(head, port);
692 if (!tb) {
693 spin_unlock(&head->lock);
694 break;
695 }
696 tb->fastreuse = -1;
697 goto ok;
698
699 next_port:
700 spin_unlock(&head->lock);
701 }
702 local_bh_enable();
703
704 return -EADDRNOTAVAIL;
705
706ok:
707 hint += i;
708
709 /* Head lock still held and bh's disabled */
710 tcp_bind_hash(sk, tb, port);
711 if (sk_unhashed(sk)) {
712 inet_sk(sk)->sport = htons(port);
713 __tcp_v4_hash(sk, 0);
714 }
715 spin_unlock(&head->lock);
716
717 if (tw) {
718 tcp_tw_deschedule(tw);
719 tcp_tw_put(tw);
720 }
721
722 ret = 0;
723 goto out;
724 }
725
726 head = &tcp_bhash[tcp_bhashfn(snum)];
727 tb = tcp_sk(sk)->bind_hash;
728 spin_lock_bh(&head->lock);
729 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
730 __tcp_v4_hash(sk, 0);
731 spin_unlock_bh(&head->lock);
732 return 0;
733 } else {
734 spin_unlock(&head->lock);
735 /* No definite answer... Walk to established hash table */
736 ret = __tcp_v4_check_established(sk, snum, NULL);
737out:
738 local_bh_enable();
739 return ret;
740 }
741}
742
743/* This will initiate an outgoing connection. */
744int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
745{
746 struct inet_sock *inet = inet_sk(sk);
747 struct tcp_sock *tp = tcp_sk(sk);
748 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
749 struct rtable *rt;
750 u32 daddr, nexthop;
751 int tmp;
752 int err;
753
754 if (addr_len < sizeof(struct sockaddr_in))
755 return -EINVAL;
756
757 if (usin->sin_family != AF_INET)
758 return -EAFNOSUPPORT;
759
760 nexthop = daddr = usin->sin_addr.s_addr;
761 if (inet->opt && inet->opt->srr) {
762 if (!daddr)
763 return -EINVAL;
764 nexthop = inet->opt->faddr;
765 }
766
767 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
768 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
769 IPPROTO_TCP,
770 inet->sport, usin->sin_port, sk);
771 if (tmp < 0)
772 return tmp;
773
774 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
775 ip_rt_put(rt);
776 return -ENETUNREACH;
777 }
778
779 if (!inet->opt || !inet->opt->srr)
780 daddr = rt->rt_dst;
781
782 if (!inet->saddr)
783 inet->saddr = rt->rt_src;
784 inet->rcv_saddr = inet->saddr;
785
786 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
787 /* Reset inherited state */
788 tp->rx_opt.ts_recent = 0;
789 tp->rx_opt.ts_recent_stamp = 0;
790 tp->write_seq = 0;
791 }
792
793 if (sysctl_tcp_tw_recycle &&
794 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
795 struct inet_peer *peer = rt_get_peer(rt);
796
797		/* VJ's idea. We save the last timestamp seen from
798		 * the destination in the peer table when entering TIME-WAIT
799		 * and initialize rx_opt.ts_recent from it when trying a new connection.
800 */
801
802 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
803 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
804 tp->rx_opt.ts_recent = peer->tcp_ts;
805 }
806 }
807
808 inet->dport = usin->sin_port;
809 inet->daddr = daddr;
810
811 tp->ext_header_len = 0;
812 if (inet->opt)
813 tp->ext_header_len = inet->opt->optlen;
814
815 tp->rx_opt.mss_clamp = 536;
816
817	/* Socket identity is still unknown (sport may be zero).
818	 * However we set the state to SYN-SENT and, without releasing the
819	 * socket lock, select a source port, enter ourselves into the hash
820	 * tables and complete initialization after this.
821 */
822 tcp_set_state(sk, TCP_SYN_SENT);
823 err = tcp_v4_hash_connect(sk);
824 if (err)
825 goto failure;
826
827 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
828 if (err)
829 goto failure;
830
831 /* OK, now commit destination to socket. */
832 __sk_dst_set(sk, &rt->u.dst);
833 tcp_v4_setup_caps(sk, &rt->u.dst);
834
835 if (!tp->write_seq)
836 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
837 inet->daddr,
838 inet->sport,
839 usin->sin_port);
840
841 inet->id = tp->write_seq ^ jiffies;
842
843 err = tcp_connect(sk);
844 rt = NULL;
845 if (err)
846 goto failure;
847
848 return 0;
849
850failure:
851 /* This unhashes the socket and releases the local port, if necessary. */
852 tcp_set_state(sk, TCP_CLOSE);
853 ip_rt_put(rt);
854 sk->sk_route_caps = 0;
855 inet->dport = 0;
856 return err;
857}
858
859static __inline__ int tcp_v4_iif(struct sk_buff *skb)
860{
861 return ((struct rtable *)skb->dst)->rt_iif;
862}
863
864static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
865{
866 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
867}
868
869static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
870 struct open_request ***prevp,
871 __u16 rport,
872 __u32 raddr, __u32 laddr)
873{
874 struct tcp_listen_opt *lopt = tp->listen_opt;
875 struct open_request *req, **prev;
876
877 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
878 (req = *prev) != NULL;
879 prev = &req->dl_next) {
880 if (req->rmt_port == rport &&
881 req->af.v4_req.rmt_addr == raddr &&
882 req->af.v4_req.loc_addr == laddr &&
883 TCP_INET_FAMILY(req->class->family)) {
884 BUG_TRAP(!req->sk);
885 *prevp = prev;
886 break;
887 }
888 }
889
890 return req;
891}
892
893static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
894{
895 struct tcp_sock *tp = tcp_sk(sk);
896 struct tcp_listen_opt *lopt = tp->listen_opt;
897 u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
898
899 req->expires = jiffies + TCP_TIMEOUT_INIT;
900 req->retrans = 0;
901 req->sk = NULL;
902 req->dl_next = lopt->syn_table[h];
903
904 write_lock(&tp->syn_wait_lock);
905 lopt->syn_table[h] = req;
906 write_unlock(&tp->syn_wait_lock);
907
908 tcp_synq_added(sk);
909}
910
911
912/*
913 * This routine does path mtu discovery as defined in RFC1191.
914 */
915static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
916 u32 mtu)
917{
918 struct dst_entry *dst;
919 struct inet_sock *inet = inet_sk(sk);
920 struct tcp_sock *tp = tcp_sk(sk);
921
922 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
923	 * sent out by Linux are always < 576 bytes, so they should go through
924 * unfragmented).
925 */
926 if (sk->sk_state == TCP_LISTEN)
927 return;
928
929	/* We don't check in the dst entry whether pmtu discovery is forbidden
930	 * on this route. We just assume that no packet-too-big packets
931	 * are sent back when pmtu discovery is not active.
932 * There is a small race when the user changes this flag in the
933 * route, but I think that's acceptable.
934 */
935 if ((dst = __sk_dst_check(sk, 0)) == NULL)
936 return;
937
938 dst->ops->update_pmtu(dst, mtu);
939
940	/* Something is about to go wrong... Remember the soft error
941	 * for the case that this connection is not able to recover.
942 */
943 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
944 sk->sk_err_soft = EMSGSIZE;
945
946 mtu = dst_mtu(dst);
947
948 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
949 tp->pmtu_cookie > mtu) {
950 tcp_sync_mss(sk, mtu);
951
952 /* Resend the TCP packet because it's
953 * clear that the old packet has been
954 * dropped. This is the new "fast" path mtu
955 * discovery.
956 */
957 tcp_simple_retransmit(sk);
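		/* Rough example (assumed numbers): a cached pmtu_cookie of 1500
		 * and an ICMP_FRAG_NEEDED carrying mtu == 1400 re-syncs the MSS
		 * to roughly 1360 for an option-less connection and retransmits
		 * the too-big segment straight away instead of waiting for the
		 * retransmit timer.
		 */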
958 } /* else let the usual retransmit timer handle it */
959}
960
961/*
962 * This routine is called by the ICMP module when it gets some
963 * sort of error condition. If err < 0 then the socket should
964 * be closed and the error returned to the user. If err > 0
965 * it's just the icmp type << 8 | icmp code. After adjustment
966 * header points to the first 8 bytes of the tcp header. We need
967 * to find the appropriate port.
968 *
969 * The locking strategy used here is very "optimistic". When
970 * someone else accesses the socket the ICMP is just dropped
971 * and for some paths there is no check at all.
972 * A more general error queue to queue errors for later handling
973 * is probably better.
974 *
975 */
976
977void tcp_v4_err(struct sk_buff *skb, u32 info)
978{
979 struct iphdr *iph = (struct iphdr *)skb->data;
980 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
981 struct tcp_sock *tp;
982 struct inet_sock *inet;
983 int type = skb->h.icmph->type;
984 int code = skb->h.icmph->code;
985 struct sock *sk;
986 __u32 seq;
987 int err;
988
989 if (skb->len < (iph->ihl << 2) + 8) {
990 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
991 return;
992 }
993
994 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
995 th->source, tcp_v4_iif(skb));
996 if (!sk) {
997 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
998 return;
999 }
1000 if (sk->sk_state == TCP_TIME_WAIT) {
1001 tcp_tw_put((struct tcp_tw_bucket *)sk);
1002 return;
1003 }
1004
1005 bh_lock_sock(sk);
1006 /* If too many ICMPs get dropped on busy
1007 * servers this needs to be solved differently.
1008 */
1009 if (sock_owned_by_user(sk))
1010 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1011
1012 if (sk->sk_state == TCP_CLOSE)
1013 goto out;
1014
1015 tp = tcp_sk(sk);
1016 seq = ntohl(th->seq);
1017 if (sk->sk_state != TCP_LISTEN &&
1018 !between(seq, tp->snd_una, tp->snd_nxt)) {
1019 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1020 goto out;
1021 }
1022
1023 switch (type) {
1024 case ICMP_SOURCE_QUENCH:
1025 /* Just silently ignore these. */
1026 goto out;
1027 case ICMP_PARAMETERPROB:
1028 err = EPROTO;
1029 break;
1030 case ICMP_DEST_UNREACH:
1031 if (code > NR_ICMP_UNREACH)
1032 goto out;
1033
1034 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1035 if (!sock_owned_by_user(sk))
1036 do_pmtu_discovery(sk, iph, info);
1037 goto out;
1038 }
1039
1040 err = icmp_err_convert[code].errno;
1041 break;
1042 case ICMP_TIME_EXCEEDED:
1043 err = EHOSTUNREACH;
1044 break;
1045 default:
1046 goto out;
1047 }
1048
1049 switch (sk->sk_state) {
1050 struct open_request *req, **prev;
1051 case TCP_LISTEN:
1052 if (sock_owned_by_user(sk))
1053 goto out;
1054
1055 req = tcp_v4_search_req(tp, &prev, th->dest,
1056 iph->daddr, iph->saddr);
1057 if (!req)
1058 goto out;
1059
1060 /* ICMPs are not backlogged, hence we cannot get
1061 an established socket here.
1062 */
1063 BUG_TRAP(!req->sk);
1064
1065 if (seq != req->snt_isn) {
1066 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1067 goto out;
1068 }
1069
1070 /*
1071 * Still in SYN_RECV, just remove it silently.
1072 * There is no good way to pass the error to the newly
1073 * created socket, and POSIX does not want network
1074 * errors returned from accept().
1075 */
1076 tcp_synq_drop(sk, req, prev);
1077 goto out;
1078
1079 case TCP_SYN_SENT:
1080	case TCP_SYN_RECV:  /* Cannot happen normally.
1081			       It can, e.g., if SYNs crossed.
1082 */
1083 if (!sock_owned_by_user(sk)) {
1084 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1085 sk->sk_err = err;
1086
1087 sk->sk_error_report(sk);
1088
1089 tcp_done(sk);
1090 } else {
1091 sk->sk_err_soft = err;
1092 }
1093 goto out;
1094 }
1095
1096 /* If we've already connected we will keep trying
1097 * until we time out, or the user gives up.
1098 *
1099 * rfc1122 4.2.3.9 allows to consider as hard errors
1100 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1101 * but it is obsoleted by pmtu discovery).
1102 *
1103	 * Note that in the modern internet, where routing is unreliable
1104	 * and broken firewalls sit in every dark corner sending random
1105	 * errors ordered by their masters, even these two messages finally lose
1106	 * their original sense (even Linux sends invalid PORT_UNREACHs).
1107 *
1108 * Now we are in compliance with RFCs.
1109 * --ANK (980905)
1110 */
1111
1112 inet = inet_sk(sk);
1113 if (!sock_owned_by_user(sk) && inet->recverr) {
1114 sk->sk_err = err;
1115 sk->sk_error_report(sk);
1116 } else { /* Only an error on timeout */
1117 sk->sk_err_soft = err;
1118 }
1119
1120out:
1121 bh_unlock_sock(sk);
1122 sock_put(sk);
1123}
1124
1125/* This routine computes an IPv4 TCP checksum. */
1126void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1127 struct sk_buff *skb)
1128{
1129 struct inet_sock *inet = inet_sk(sk);
1130
1131 if (skb->ip_summed == CHECKSUM_HW) {
1132 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1133 skb->csum = offsetof(struct tcphdr, check);
1134 } else {
1135 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1136 csum_partial((char *)th,
1137 th->doff << 2,
1138 skb->csum));
1139 }
1140}
1141
1142/*
1143 * This routine will send an RST to the other tcp.
1144 *
1145 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1146 * for the reset.
1147 * Answer: if a packet caused the RST, it is not for a socket
1148 * existing in our system; if it is matched to a socket,
1149 * it is just a duplicate segment or a bug in the other side's TCP.
1150 * So we build the reply based only on the parameters that
1151 * arrived with the segment.
1152 * Exception: precedence violation. We do not implement it in any case.
1153 */
1154
1155static void tcp_v4_send_reset(struct sk_buff *skb)
1156{
1157 struct tcphdr *th = skb->h.th;
1158 struct tcphdr rth;
1159 struct ip_reply_arg arg;
1160
1161 /* Never send a reset in response to a reset. */
1162 if (th->rst)
1163 return;
1164
1165 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1166 return;
1167
1168 /* Swap the send and the receive. */
1169 memset(&rth, 0, sizeof(struct tcphdr));
1170 rth.dest = th->source;
1171 rth.source = th->dest;
1172 rth.doff = sizeof(struct tcphdr) / 4;
1173 rth.rst = 1;
1174
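	/* If the offending segment carried an ACK, use its ack_seq as our
	 * sequence number; otherwise ACK everything the segment consumed
	 * (its payload plus one each for SYN and FIN).
	 */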
1175 if (th->ack) {
1176 rth.seq = th->ack_seq;
1177 } else {
1178 rth.ack = 1;
1179 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1180 skb->len - (th->doff << 2));
1181 }
1182
1183 memset(&arg, 0, sizeof arg);
1184 arg.iov[0].iov_base = (unsigned char *)&rth;
1185 arg.iov[0].iov_len = sizeof rth;
1186 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1187 skb->nh.iph->saddr, /*XXX*/
1188 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1189 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1190
1191 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1192
1193 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1194 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1195}
1196
1197 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1198 outside socket context, is certainly ugly. What can I do?
1199 */
1200
1201static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1202 u32 win, u32 ts)
1203{
1204 struct tcphdr *th = skb->h.th;
1205 struct {
1206 struct tcphdr th;
1207 u32 tsopt[3];
1208 } rep;
1209 struct ip_reply_arg arg;
1210
1211 memset(&rep.th, 0, sizeof(struct tcphdr));
1212 memset(&arg, 0, sizeof arg);
1213
1214 arg.iov[0].iov_base = (unsigned char *)&rep;
1215 arg.iov[0].iov_len = sizeof(rep.th);
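	/* If the peer used timestamps, echo them back: two NOPs pad the
	 * option to a 32-bit boundary, then kind/length, our current
	 * timestamp, and the peer's most recent value.
	 */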
1216 if (ts) {
1217 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1218 (TCPOPT_TIMESTAMP << 8) |
1219 TCPOLEN_TIMESTAMP);
1220 rep.tsopt[1] = htonl(tcp_time_stamp);
1221 rep.tsopt[2] = htonl(ts);
1222 arg.iov[0].iov_len = sizeof(rep);
1223 }
1224
1225 /* Swap the send and the receive. */
1226 rep.th.dest = th->source;
1227 rep.th.source = th->dest;
1228 rep.th.doff = arg.iov[0].iov_len / 4;
1229 rep.th.seq = htonl(seq);
1230 rep.th.ack_seq = htonl(ack);
1231 rep.th.ack = 1;
1232 rep.th.window = htons(win);
1233
1234 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1235 skb->nh.iph->saddr, /*XXX*/
1236 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1237 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1238
1239 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1240
1241 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1242}
1243
1244static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1245{
1246 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1247
1248 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1249 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1250
1251 tcp_tw_put(tw);
1252}
1253
1254static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1255{
1256 tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
1257 req->ts_recent);
1258}
1259
1260static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1261 struct open_request *req)
1262{
1263 struct rtable *rt;
1264 struct ip_options *opt = req->af.v4_req.opt;
1265 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1266 .nl_u = { .ip4_u =
1267 { .daddr = ((opt && opt->srr) ?
1268 opt->faddr :
1269 req->af.v4_req.rmt_addr),
1270 .saddr = req->af.v4_req.loc_addr,
1271 .tos = RT_CONN_FLAGS(sk) } },
1272 .proto = IPPROTO_TCP,
1273 .uli_u = { .ports =
1274 { .sport = inet_sk(sk)->sport,
1275 .dport = req->rmt_port } } };
1276
1277 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1278 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1279 return NULL;
1280 }
1281 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1282 ip_rt_put(rt);
1283 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1284 return NULL;
1285 }
1286 return &rt->u.dst;
1287}
1288
1289/*
1290 * Send a SYN-ACK in response to a connection request.
1291 * This still operates on an open_request only, not on a big
1292 * socket.
1293 */
1294static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1295 struct dst_entry *dst)
1296{
1297 int err = -1;
1298 struct sk_buff * skb;
1299
1300 /* First, grab a route. */
1301 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1302 goto out;
1303
1304 skb = tcp_make_synack(sk, dst, req);
1305
1306 if (skb) {
1307 struct tcphdr *th = skb->h.th;
1308
1309 th->check = tcp_v4_check(th, skb->len,
1310 req->af.v4_req.loc_addr,
1311 req->af.v4_req.rmt_addr,
1312 csum_partial((char *)th, skb->len,
1313 skb->csum));
1314
1315 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1316 req->af.v4_req.rmt_addr,
1317 req->af.v4_req.opt);
1318 if (err == NET_XMIT_CN)
1319 err = 0;
1320 }
1321
1322out:
1323 dst_release(dst);
1324 return err;
1325}
1326
1327/*
1328 * IPv4 open_request destructor.
1329 */
1330static void tcp_v4_or_free(struct open_request *req)
1331{
1332 if (req->af.v4_req.opt)
1333 kfree(req->af.v4_req.opt);
1334}
1335
1336static inline void syn_flood_warning(struct sk_buff *skb)
1337{
1338 static unsigned long warntime;
1339
1340 if (time_after(jiffies, (warntime + HZ * 60))) {
1341 warntime = jiffies;
1342 printk(KERN_INFO
1343 "possible SYN flooding on port %d. Sending cookies.\n",
1344 ntohs(skb->h.th->dest));
1345 }
1346}
1347
1348/*
1349 * Save and compile IPv4 options into the open_request if needed.
1350 */
1351static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1352 struct sk_buff *skb)
1353{
1354 struct ip_options *opt = &(IPCB(skb)->opt);
1355 struct ip_options *dopt = NULL;
1356
1357 if (opt && opt->optlen) {
1358 int opt_size = optlength(opt);
1359 dopt = kmalloc(opt_size, GFP_ATOMIC);
1360 if (dopt) {
1361 if (ip_options_echo(dopt, skb)) {
1362 kfree(dopt);
1363 dopt = NULL;
1364 }
1365 }
1366 }
1367 return dopt;
1368}
1369
1370/*
1371 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1372 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1373 * It would be better to replace it with a global counter for all sockets,
1374 * but then some measure against one socket starving all other sockets
1375 * would be needed.
1376 *
1377 * It was 128 by default. Experiments with real servers show that
1378 * it is absolutely not enough even at 100 conn/sec. 256 cures most
1379 * of the problems. This value is adjusted to 128 for very small machines
1380 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1381 * Further increases require changing the hash table size.
1382 */
1383int sysctl_max_syn_backlog = 256;
1384
1385struct or_calltable or_ipv4 = {
1386 .family = PF_INET,
1387 .rtx_syn_ack = tcp_v4_send_synack,
1388 .send_ack = tcp_v4_or_send_ack,
1389 .destructor = tcp_v4_or_free,
1390 .send_reset = tcp_v4_send_reset,
1391};
1392
1393int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1394{
1395 struct tcp_options_received tmp_opt;
1396 struct open_request *req;
1397 __u32 saddr = skb->nh.iph->saddr;
1398 __u32 daddr = skb->nh.iph->daddr;
1399 __u32 isn = TCP_SKB_CB(skb)->when;
1400 struct dst_entry *dst = NULL;
1401#ifdef CONFIG_SYN_COOKIES
1402 int want_cookie = 0;
1403#else
1404#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1405#endif
1406
1407 /* Never answer SYNs sent to broadcast or multicast addresses */
1408 if (((struct rtable *)skb->dst)->rt_flags &
1409 (RTCF_BROADCAST | RTCF_MULTICAST))
1410 goto drop;
1411
1412 /* TW buckets are converted to open requests without
1413 * limitation; they conserve resources and the peer is
1414 * evidently a real one.
1415 */
1416 if (tcp_synq_is_full(sk) && !isn) {
1417#ifdef CONFIG_SYN_COOKIES
1418 if (sysctl_tcp_syncookies) {
1419 want_cookie = 1;
1420 } else
1421#endif
1422 goto drop;
1423 }
1424
1425 /* The accept backlog is full. If we have already queued enough
1426 * warm entries in the syn queue, drop the request. It is better than
1427 * clogging the syn queue with openreqs with exponentially increasing
1428 * timeouts.
1429 */
1430 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1431 goto drop;
1432
1433 req = tcp_openreq_alloc();
1434 if (!req)
1435 goto drop;
1436
1437 tcp_clear_options(&tmp_opt);
1438 tmp_opt.mss_clamp = 536;
1439 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1440
1441 tcp_parse_options(skb, &tmp_opt, 0);
1442
1443 if (want_cookie) {
1444 tcp_clear_options(&tmp_opt);
1445 tmp_opt.saw_tstamp = 0;
1446 }
1447
1448 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1449 /* Some OSes (unknown ones, but I see them on a web server which
1450 * contains information interesting only to Windows
1451 * users) do not send their timestamp in the SYN. It is an easy case:
1452 * we simply do not advertise TS support.
1453 */
1454 tmp_opt.saw_tstamp = 0;
1455 tmp_opt.tstamp_ok = 0;
1456 }
1457 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1458
1459 tcp_openreq_init(req, &tmp_opt, skb);
1460
1461 req->af.v4_req.loc_addr = daddr;
1462 req->af.v4_req.rmt_addr = saddr;
1463 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1464 req->class = &or_ipv4;
1465 if (!want_cookie)
1466 TCP_ECN_create_request(req, skb->h.th);
1467
1468 if (want_cookie) {
1469#ifdef CONFIG_SYN_COOKIES
1470 syn_flood_warning(skb);
1471#endif
1472 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1473 } else if (!isn) {
1474 struct inet_peer *peer = NULL;
1475
1476 /* VJ's idea. We save the last timestamp seen
1477 * from the destination in the peer table when entering
1478 * TIME-WAIT state, and check against it before
1479 * accepting a new connection request.
1480 *
1481 * If "isn" is not zero, this request hit an alive
1482 * timewait bucket, so all the necessary checks
1483 * are made in the function processing the timewait state.
1484 */
1485 if (tmp_opt.saw_tstamp &&
1486 sysctl_tcp_tw_recycle &&
1487 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1488 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1489 peer->v4daddr == saddr) {
1490 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1491 (s32)(peer->tcp_ts - req->ts_recent) >
1492 TCP_PAWS_WINDOW) {
1493 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1494 dst_release(dst);
1495 goto drop_and_free;
1496 }
1497 }
1498 /* Kill the following clause if you dislike this approach. */
1499 else if (!sysctl_tcp_syncookies &&
1500 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1501 (sysctl_max_syn_backlog >> 2)) &&
1502 (!peer || !peer->tcp_ts_stamp) &&
1503 (!dst || !dst_metric(dst, RTAX_RTT))) {
1504 /* Without syncookies the last quarter of the
1505 * backlog is filled with destinations
1506 * proven to be alive.
1507 * It means that we continue to communicate
1508 * with destinations already remembered
1509 * at the moment of the synflood.
1510 */
1511 NETDEBUG(if (net_ratelimit()) \
1512 printk(KERN_DEBUG "TCP: drop open "
1513 "request from %u.%u."
1514 "%u.%u/%u\n", \
1515 NIPQUAD(saddr),
1516 ntohs(skb->h.th->source)));
1517 dst_release(dst);
1518 goto drop_and_free;
1519 }
1520
1521 isn = tcp_v4_init_sequence(sk, skb);
1522 }
1523 req->snt_isn = isn;
1524
1525 if (tcp_v4_send_synack(sk, req, dst))
1526 goto drop_and_free;
1527
1528 if (want_cookie) {
1529 tcp_openreq_free(req);
1530 } else {
1531 tcp_v4_synq_add(sk, req);
1532 }
1533 return 0;
1534
1535drop_and_free:
1536 tcp_openreq_free(req);
1537drop:
1538 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1539 return 0;
1540}
1541
1542
1543/*
1544 * The three-way handshake has completed - we got a valid ACK -
1545 * now create the new socket.
1546 */
1547struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1548 struct open_request *req,
1549 struct dst_entry *dst)
1550{
1551 struct inet_sock *newinet;
1552 struct tcp_sock *newtp;
1553 struct sock *newsk;
1554
1555 if (sk_acceptq_is_full(sk))
1556 goto exit_overflow;
1557
1558 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1559 goto exit;
1560
1561 newsk = tcp_create_openreq_child(sk, req, skb);
1562 if (!newsk)
1563 goto exit;
1564
1565 newsk->sk_dst_cache = dst;
1566 tcp_v4_setup_caps(newsk, dst);
1567
1568 newtp = tcp_sk(newsk);
1569 newinet = inet_sk(newsk);
1570 newinet->daddr = req->af.v4_req.rmt_addr;
1571 newinet->rcv_saddr = req->af.v4_req.loc_addr;
1572 newinet->saddr = req->af.v4_req.loc_addr;
1573 newinet->opt = req->af.v4_req.opt;
1574 req->af.v4_req.opt = NULL;
1575 newinet->mc_index = tcp_v4_iif(skb);
1576 newinet->mc_ttl = skb->nh.iph->ttl;
1577 newtp->ext_header_len = 0;
1578 if (newinet->opt)
1579 newtp->ext_header_len = newinet->opt->optlen;
1580 newinet->id = newtp->write_seq ^ jiffies;
1581
1582 tcp_sync_mss(newsk, dst_mtu(dst));
1583 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1584 tcp_initialize_rcv_mss(newsk);
1585
1586 __tcp_v4_hash(newsk, 0);
1587 __tcp_inherit_port(sk, newsk);
1588
1589 return newsk;
1590
1591exit_overflow:
1592 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1593exit:
1594 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1595 dst_release(dst);
1596 return NULL;
1597}
1598
1599static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1600{
1601 struct tcphdr *th = skb->h.th;
1602 struct iphdr *iph = skb->nh.iph;
1603 struct tcp_sock *tp = tcp_sk(sk);
1604 struct sock *nsk;
1605 struct open_request **prev;
1606 /* Find possible connection requests. */
1607 struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
1608 iph->saddr, iph->daddr);
1609 if (req)
1610 return tcp_check_req(sk, skb, req, prev);
1611
1612 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1613 th->source,
1614 skb->nh.iph->daddr,
1615 ntohs(th->dest),
1616 tcp_v4_iif(skb));
1617
1618 if (nsk) {
1619 if (nsk->sk_state != TCP_TIME_WAIT) {
1620 bh_lock_sock(nsk);
1621 return nsk;
1622 }
1623 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1624 return NULL;
1625 }
1626
1627#ifdef CONFIG_SYN_COOKIES
1628 if (!th->rst && !th->syn && th->ack)
1629 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1630#endif
1631 return sk;
1632}
1633
1634static int tcp_v4_checksum_init(struct sk_buff *skb)
1635{
1636 if (skb->ip_summed == CHECKSUM_HW) {
1637 skb->ip_summed = CHECKSUM_UNNECESSARY;
1638 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1639 skb->nh.iph->daddr, skb->csum))
1640 return 0;
1641
1642 NETDEBUG(if (net_ratelimit())
1643 printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1644 skb->ip_summed = CHECKSUM_NONE;
1645 }
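	/* Short packets are cheap to verify immediately; for longer ones we
	 * only seed skb->csum with the pseudo-header sum and let the full
	 * checksum be completed later (e.g. during the copy to user space).
	 */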
1646 if (skb->len <= 76) {
1647 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1648 skb->nh.iph->daddr,
1649 skb_checksum(skb, 0, skb->len, 0)))
1650 return -1;
1651 skb->ip_summed = CHECKSUM_UNNECESSARY;
1652 } else {
1653 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1654 skb->nh.iph->saddr,
1655 skb->nh.iph->daddr, 0);
1656 }
1657 return 0;
1658}
1659
1660
1661 /* The socket must have its spinlock held when we get
1662 * here.
1663 *
1664 * We have a potential double-lock case here, so even when
1665 * doing backlog processing we use the BH locking scheme.
1666 * This is because we cannot sleep with the original spinlock
1667 * held.
1668 */
1669int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1670{
1671 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1672 TCP_CHECK_TIMER(sk);
1673 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1674 goto reset;
1675 TCP_CHECK_TIMER(sk);
1676 return 0;
1677 }
1678
1679 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1680 goto csum_err;
1681
1682 if (sk->sk_state == TCP_LISTEN) {
1683 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1684 if (!nsk)
1685 goto discard;
1686
1687 if (nsk != sk) {
1688 if (tcp_child_process(sk, nsk, skb))
1689 goto reset;
1690 return 0;
1691 }
1692 }
1693
1694 TCP_CHECK_TIMER(sk);
1695 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1696 goto reset;
1697 TCP_CHECK_TIMER(sk);
1698 return 0;
1699
1700reset:
1701 tcp_v4_send_reset(skb);
1702discard:
1703 kfree_skb(skb);
1704 /* Be careful here. If this function gets more complicated and
1705 * gcc suffers from register pressure on the x86, sk (in %ebx)
1706 * might be destroyed here. This current version compiles correctly,
1707 * but you have been warned.
1708 */
1709 return 0;
1710
1711csum_err:
1712 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1713 goto discard;
1714}
1715
1716/*
1717 * From tcp_input.c
1718 */
1719
1720int tcp_v4_rcv(struct sk_buff *skb)
1721{
1722 struct tcphdr *th;
1723 struct sock *sk;
1724 int ret;
1725
1726 if (skb->pkt_type != PACKET_HOST)
1727 goto discard_it;
1728
1729 /* Count it even if it's bad */
1730 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1731
1732 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1733 goto discard_it;
1734
1735 th = skb->h.th;
1736
1737 if (th->doff < sizeof(struct tcphdr) / 4)
1738 goto bad_packet;
1739 if (!pskb_may_pull(skb, th->doff * 4))
1740 goto discard_it;
1741
1742 /* An explanation is required here, I think.
1743 * Packet length and doff are validated by header prediction,
1744 * provided the case of th->doff == 0 is eliminated.
1745 * So, we defer the checks. */
1746 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1747 tcp_v4_checksum_init(skb) < 0))
1748 goto bad_packet;
1749
1750 th = skb->h.th;
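	/* end_seq covers the payload plus one unit each for SYN and FIN,
	 * since both flags consume sequence space.
	 */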
1751 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1752 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1753 skb->len - th->doff * 4);
1754 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1755 TCP_SKB_CB(skb)->when = 0;
1756 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1757 TCP_SKB_CB(skb)->sacked = 0;
1758
1759 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1760 skb->nh.iph->daddr, ntohs(th->dest),
1761 tcp_v4_iif(skb));
1762
1763 if (!sk)
1764 goto no_tcp_socket;
1765
1766process:
1767 if (sk->sk_state == TCP_TIME_WAIT)
1768 goto do_time_wait;
1769
1770 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1771 goto discard_and_relse;
1772
1773 if (sk_filter(sk, skb, 0))
1774 goto discard_and_relse;
1775
1776 skb->dev = NULL;
1777
1778 bh_lock_sock(sk);
1779 ret = 0;
1780 if (!sock_owned_by_user(sk)) {
1781 if (!tcp_prequeue(sk, skb))
1782 ret = tcp_v4_do_rcv(sk, skb);
1783 } else
1784 sk_add_backlog(sk, skb);
1785 bh_unlock_sock(sk);
1786
1787 sock_put(sk);
1788
1789 return ret;
1790
1791no_tcp_socket:
1792 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1793 goto discard_it;
1794
1795 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1796bad_packet:
1797 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1798 } else {
1799 tcp_v4_send_reset(skb);
1800 }
1801
1802discard_it:
1803 /* Discard frame. */
1804 kfree_skb(skb);
1805 return 0;
1806
1807discard_and_relse:
1808 sock_put(sk);
1809 goto discard_it;
1810
1811do_time_wait:
1812 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1813 tcp_tw_put((struct tcp_tw_bucket *) sk);
1814 goto discard_it;
1815 }
1816
1817 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1818 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1819 tcp_tw_put((struct tcp_tw_bucket *) sk);
1820 goto discard_it;
1821 }
1822 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1823 skb, th, skb->len)) {
1824 case TCP_TW_SYN: {
1825 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1826 ntohs(th->dest),
1827 tcp_v4_iif(skb));
1828 if (sk2) {
1829 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1830 tcp_tw_put((struct tcp_tw_bucket *)sk);
1831 sk = sk2;
1832 goto process;
1833 }
1834 /* Fall through to ACK */
1835 }
1836 case TCP_TW_ACK:
1837 tcp_v4_timewait_ack(sk, skb);
1838 break;
1839 case TCP_TW_RST:
1840 goto no_tcp_socket;
1841 case TCP_TW_SUCCESS:;
1842 }
1843 goto discard_it;
1844}
1845
1846 /* With per-bucket locks this operation is not atomic, so this
1847 * version is no worse.
1848 */
1849static void __tcp_v4_rehash(struct sock *sk)
1850{
1851 sk->sk_prot->unhash(sk);
1852 sk->sk_prot->hash(sk);
1853}
1854
1855static int tcp_v4_reselect_saddr(struct sock *sk)
1856{
1857 struct inet_sock *inet = inet_sk(sk);
1858 int err;
1859 struct rtable *rt;
1860 __u32 old_saddr = inet->saddr;
1861 __u32 new_saddr;
1862 __u32 daddr = inet->daddr;
1863
1864 if (inet->opt && inet->opt->srr)
1865 daddr = inet->opt->faddr;
1866
1867 /* Query new route. */
1868 err = ip_route_connect(&rt, daddr, 0,
1869 RT_CONN_FLAGS(sk),
1870 sk->sk_bound_dev_if,
1871 IPPROTO_TCP,
1872 inet->sport, inet->dport, sk);
1873 if (err)
1874 return err;
1875
1876 __sk_dst_set(sk, &rt->u.dst);
1877 tcp_v4_setup_caps(sk, &rt->u.dst);
1878
1879 new_saddr = rt->rt_src;
1880
1881 if (new_saddr == old_saddr)
1882 return 0;
1883
1884 if (sysctl_ip_dynaddr > 1) {
1885 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1886 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1887 NIPQUAD(old_saddr),
1888 NIPQUAD(new_saddr));
1889 }
1890
1891 inet->saddr = new_saddr;
1892 inet->rcv_saddr = new_saddr;
1893
1894 /* XXX The only ugly spot where we need to
1895 * XXX really change the socket's identity after
1896 * XXX it has entered the hashes. -DaveM
1897 *
1898 * Besides that, it does not check for connection
1899 * uniqueness. Expect trouble.
1900 */
1901 __tcp_v4_rehash(sk);
1902 return 0;
1903}
1904
1905int tcp_v4_rebuild_header(struct sock *sk)
1906{
1907 struct inet_sock *inet = inet_sk(sk);
1908 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1909 u32 daddr;
1910 int err;
1911
1912 /* Route is OK, nothing to do. */
1913 if (rt)
1914 return 0;
1915
1916 /* Reroute. */
1917 daddr = inet->daddr;
1918 if (inet->opt && inet->opt->srr)
1919 daddr = inet->opt->faddr;
1920
1921 {
1922 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1923 .nl_u = { .ip4_u =
1924 { .daddr = daddr,
1925 .saddr = inet->saddr,
1926 .tos = RT_CONN_FLAGS(sk) } },
1927 .proto = IPPROTO_TCP,
1928 .uli_u = { .ports =
1929 { .sport = inet->sport,
1930 .dport = inet->dport } } };
1931
1932 err = ip_route_output_flow(&rt, &fl, sk, 0);
1933 }
1934 if (!err) {
1935 __sk_dst_set(sk, &rt->u.dst);
1936 tcp_v4_setup_caps(sk, &rt->u.dst);
1937 return 0;
1938 }
1939
1940 /* Routing failed... */
1941 sk->sk_route_caps = 0;
1942
1943 if (!sysctl_ip_dynaddr ||
1944 sk->sk_state != TCP_SYN_SENT ||
1945 (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1946 (err = tcp_v4_reselect_saddr(sk)) != 0)
1947 sk->sk_err_soft = -err;
1948
1949 return err;
1950}
1951
1952static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1953{
1954 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1955 struct inet_sock *inet = inet_sk(sk);
1956
1957 sin->sin_family = AF_INET;
1958 sin->sin_addr.s_addr = inet->daddr;
1959 sin->sin_port = inet->dport;
1960}
1961
1962 /* VJ's idea. Save the last timestamp seen from this destination
1963 * and hold it at least for the normal timewait interval, to use for
1964 * duplicate segment detection in subsequent connections before they
1965 * enter the synchronized state.
1966 */
1967
1968int tcp_v4_remember_stamp(struct sock *sk)
1969{
1970 struct inet_sock *inet = inet_sk(sk);
1971 struct tcp_sock *tp = tcp_sk(sk);
1972 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1973 struct inet_peer *peer = NULL;
1974 int release_it = 0;
1975
1976 if (!rt || rt->rt_dst != inet->daddr) {
1977 peer = inet_getpeer(inet->daddr, 1);
1978 release_it = 1;
1979 } else {
1980 if (!rt->peer)
1981 rt_bind_peer(rt, 1);
1982 peer = rt->peer;
1983 }
1984
1985 if (peer) {
1986 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1987 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1988 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1989 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1990 peer->tcp_ts = tp->rx_opt.ts_recent;
1991 }
1992 if (release_it)
1993 inet_putpeer(peer);
1994 return 1;
1995 }
1996
1997 return 0;
1998}
1999
2000int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2001{
2002 struct inet_peer *peer = NULL;
2003
2004 peer = inet_getpeer(tw->tw_daddr, 1);
2005
2006 if (peer) {
2007 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
2008 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2009 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2010 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2011 peer->tcp_ts = tw->tw_ts_recent;
2012 }
2013 inet_putpeer(peer);
2014 return 1;
2015 }
2016
2017 return 0;
2018}
2019
2020struct tcp_func ipv4_specific = {
2021 .queue_xmit = ip_queue_xmit,
2022 .send_check = tcp_v4_send_check,
2023 .rebuild_header = tcp_v4_rebuild_header,
2024 .conn_request = tcp_v4_conn_request,
2025 .syn_recv_sock = tcp_v4_syn_recv_sock,
2026 .remember_stamp = tcp_v4_remember_stamp,
2027 .net_header_len = sizeof(struct iphdr),
2028 .setsockopt = ip_setsockopt,
2029 .getsockopt = ip_getsockopt,
2030 .addr2sockaddr = v4_addr2sockaddr,
2031 .sockaddr_len = sizeof(struct sockaddr_in),
2032};
2033
2034 /* NOTE: A lot of things are set to zero explicitly by the call to
2035 * sk_alloc(), so they need not be done here.
2036 */
2037static int tcp_v4_init_sock(struct sock *sk)
2038{
2039 struct tcp_sock *tp = tcp_sk(sk);
2040
2041 skb_queue_head_init(&tp->out_of_order_queue);
2042 tcp_init_xmit_timers(sk);
2043 tcp_prequeue_init(tp);
2044
2045 tp->rto = TCP_TIMEOUT_INIT;
2046 tp->mdev = TCP_TIMEOUT_INIT;
2047
2048 /* So many TCP implementations out there (incorrectly) count the
2049 * initial SYN frame in their delayed-ACK and congestion control
2050 * algorithms that we must have the following bandaid to talk
2051 * efficiently to them. -DaveM
2052 */
2053 tp->snd_cwnd = 2;
2054
2055 /* See draft-stevens-tcpca-spec-01 for discussion of the
2056 * initialization of these values.
2057 */
2058 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2059 tp->snd_cwnd_clamp = ~0;
2060 tp->mss_cache_std = tp->mss_cache = 536;
2061
2062 tp->reordering = sysctl_tcp_reordering;
2063
2064 sk->sk_state = TCP_CLOSE;
2065
2066 sk->sk_write_space = sk_stream_write_space;
2067 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2068
2069 tp->af_specific = &ipv4_specific;
2070
2071 sk->sk_sndbuf = sysctl_tcp_wmem[1];
2072 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2073
2074 atomic_inc(&tcp_sockets_allocated);
2075
2076 return 0;
2077}
2078
2079int tcp_v4_destroy_sock(struct sock *sk)
2080{
2081 struct tcp_sock *tp = tcp_sk(sk);
2082
2083 tcp_clear_xmit_timers(sk);
2084
2085 /* Clean up the write buffer. */
2086 sk_stream_writequeue_purge(sk);
2087
2088 /* Cleans up our, hopefully empty, out_of_order_queue. */
2089 __skb_queue_purge(&tp->out_of_order_queue);
2090
2091 /* Clean up the prequeue; it really must be empty. */
2092 __skb_queue_purge(&tp->ucopy.prequeue);
2093
2094 /* Clean up a referenced TCP bind bucket. */
2095 if (tp->bind_hash)
2096 tcp_put_port(sk);
2097
2098 /*
2099 * If a cached sendmsg page exists, toss it.
2100 */
2101 if (sk->sk_sndmsg_page) {
2102 __free_page(sk->sk_sndmsg_page);
2103 sk->sk_sndmsg_page = NULL;
2104 }
2105
2106 atomic_dec(&tcp_sockets_allocated);
2107
2108 return 0;
2109}
2110
2111EXPORT_SYMBOL(tcp_v4_destroy_sock);
2112
2113#ifdef CONFIG_PROC_FS
2114/* Proc filesystem TCP sock list dumping. */
2115
2116static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2117{
2118 return hlist_empty(head) ? NULL :
2119 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2120}
2121
2122static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2123{
2124 return tw->tw_node.next ?
2125 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2126}
2127
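/* Iterate the listening hash; for each listener that has pending open
 * requests we also walk its per-socket SYN table (OPENREQ state).
 */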
2128static void *listening_get_next(struct seq_file *seq, void *cur)
2129{
2130 struct tcp_sock *tp;
2131 struct hlist_node *node;
2132 struct sock *sk = cur;
2133 struct tcp_iter_state* st = seq->private;
2134
2135 if (!sk) {
2136 st->bucket = 0;
2137 sk = sk_head(&tcp_listening_hash[0]);
2138 goto get_sk;
2139 }
2140
2141 ++st->num;
2142
2143 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2144 struct open_request *req = cur;
2145
2146 tp = tcp_sk(st->syn_wait_sk);
2147 req = req->dl_next;
2148 while (1) {
2149 while (req) {
2150 if (req->class->family == st->family) {
2151 cur = req;
2152 goto out;
2153 }
2154 req = req->dl_next;
2155 }
2156 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2157 break;
2158get_req:
2159 req = tp->listen_opt->syn_table[st->sbucket];
2160 }
2161 sk = sk_next(st->syn_wait_sk);
2162 st->state = TCP_SEQ_STATE_LISTENING;
2163 read_unlock_bh(&tp->syn_wait_lock);
2164 } else {
2165 tp = tcp_sk(sk);
2166 read_lock_bh(&tp->syn_wait_lock);
2167 if (tp->listen_opt && tp->listen_opt->qlen)
2168 goto start_req;
2169 read_unlock_bh(&tp->syn_wait_lock);
2170 sk = sk_next(sk);
2171 }
2172get_sk:
2173 sk_for_each_from(sk, node) {
2174 if (sk->sk_family == st->family) {
2175 cur = sk;
2176 goto out;
2177 }
2178 tp = tcp_sk(sk);
2179 read_lock_bh(&tp->syn_wait_lock);
2180 if (tp->listen_opt && tp->listen_opt->qlen) {
2181start_req:
2182 st->uid = sock_i_uid(sk);
2183 st->syn_wait_sk = sk;
2184 st->state = TCP_SEQ_STATE_OPENREQ;
2185 st->sbucket = 0;
2186 goto get_req;
2187 }
2188 read_unlock_bh(&tp->syn_wait_lock);
2189 }
2190 if (++st->bucket < TCP_LHTABLE_SIZE) {
2191 sk = sk_head(&tcp_listening_hash[st->bucket]);
2192 goto get_sk;
2193 }
2194 cur = NULL;
2195out:
2196 return cur;
2197}
2198
2199static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2200{
2201 void *rc = listening_get_next(seq, NULL);
2202
2203 while (rc && *pos) {
2204 rc = listening_get_next(seq, rc);
2205 --*pos;
2206 }
2207 return rc;
2208}
2209
2210static void *established_get_first(struct seq_file *seq)
2211{
2212 struct tcp_iter_state* st = seq->private;
2213 void *rc = NULL;
2214
2215 for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2216 struct sock *sk;
2217 struct hlist_node *node;
2218 struct tcp_tw_bucket *tw;
2219
2220 /* We can reschedule _before_ having picked the target: */
2221 cond_resched_softirq();
2222
2223 read_lock(&tcp_ehash[st->bucket].lock);
2224 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2225 if (sk->sk_family != st->family) {
2226 continue;
2227 }
2228 rc = sk;
2229 goto out;
2230 }
2231 st->state = TCP_SEQ_STATE_TIME_WAIT;
2232 tw_for_each(tw, node,
2233 &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2234 if (tw->tw_family != st->family) {
2235 continue;
2236 }
2237 rc = tw;
2238 goto out;
2239 }
2240 read_unlock(&tcp_ehash[st->bucket].lock);
2241 st->state = TCP_SEQ_STATE_ESTABLISHED;
2242 }
2243out:
2244 return rc;
2245}
2246
2247static void *established_get_next(struct seq_file *seq, void *cur)
2248{
2249 struct sock *sk = cur;
2250 struct tcp_tw_bucket *tw;
2251 struct hlist_node *node;
2252 struct tcp_iter_state* st = seq->private;
2253
2254 ++st->num;
2255
2256 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2257 tw = cur;
2258 tw = tw_next(tw);
2259get_tw:
2260 while (tw && tw->tw_family != st->family) {
2261 tw = tw_next(tw);
2262 }
2263 if (tw) {
2264 cur = tw;
2265 goto out;
2266 }
2267 read_unlock(&tcp_ehash[st->bucket].lock);
2268 st->state = TCP_SEQ_STATE_ESTABLISHED;
2269
2270 /* We can reschedule between buckets: */
2271 cond_resched_softirq();
2272
2273 if (++st->bucket < tcp_ehash_size) {
2274 read_lock(&tcp_ehash[st->bucket].lock);
2275 sk = sk_head(&tcp_ehash[st->bucket].chain);
2276 } else {
2277 cur = NULL;
2278 goto out;
2279 }
2280 } else
2281 sk = sk_next(sk);
2282
2283 sk_for_each_from(sk, node) {
2284 if (sk->sk_family == st->family)
2285 goto found;
2286 }
2287
2288 st->state = TCP_SEQ_STATE_TIME_WAIT;
2289 tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2290 goto get_tw;
2291found:
2292 cur = sk;
2293out:
2294 return cur;
2295}
2296
2297static void *established_get_idx(struct seq_file *seq, loff_t pos)
2298{
2299 void *rc = established_get_first(seq);
2300
2301 while (rc && pos) {
2302 rc = established_get_next(seq, rc);
2303 --pos;
2304 }
2305 return rc;
2306}
2307
2308static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2309{
2310 void *rc;
2311 struct tcp_iter_state* st = seq->private;
2312
2313 tcp_listen_lock();
2314 st->state = TCP_SEQ_STATE_LISTENING;
2315 rc = listening_get_idx(seq, &pos);
2316
2317 if (!rc) {
2318 tcp_listen_unlock();
2319 local_bh_disable();
2320 st->state = TCP_SEQ_STATE_ESTABLISHED;
2321 rc = established_get_idx(seq, pos);
2322 }
2323
2324 return rc;
2325}
2326
2327static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2328{
2329 struct tcp_iter_state* st = seq->private;
2330 st->state = TCP_SEQ_STATE_LISTENING;
2331 st->num = 0;
2332 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2333}
2334
2335static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2336{
2337 void *rc = NULL;
2338 struct tcp_iter_state* st;
2339
2340 if (v == SEQ_START_TOKEN) {
2341 rc = tcp_get_idx(seq, 0);
2342 goto out;
2343 }
2344 st = seq->private;
2345
2346 switch (st->state) {
2347 case TCP_SEQ_STATE_OPENREQ:
2348 case TCP_SEQ_STATE_LISTENING:
2349 rc = listening_get_next(seq, v);
2350 if (!rc) {
2351 tcp_listen_unlock();
2352 local_bh_disable();
2353 st->state = TCP_SEQ_STATE_ESTABLISHED;
2354 rc = established_get_first(seq);
2355 }
2356 break;
2357 case TCP_SEQ_STATE_ESTABLISHED:
2358 case TCP_SEQ_STATE_TIME_WAIT:
2359 rc = established_get_next(seq, v);
2360 break;
2361 }
2362out:
2363 ++*pos;
2364 return rc;
2365}
2366
2367static void tcp_seq_stop(struct seq_file *seq, void *v)
2368{
2369 struct tcp_iter_state* st = seq->private;
2370
2371 switch (st->state) {
2372 case TCP_SEQ_STATE_OPENREQ:
2373 if (v) {
2374 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2375 read_unlock_bh(&tp->syn_wait_lock);
2376 }
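		/* Fall through: the listen lock is also held in OPENREQ state. */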
2377 case TCP_SEQ_STATE_LISTENING:
2378 if (v != SEQ_START_TOKEN)
2379 tcp_listen_unlock();
2380 break;
2381 case TCP_SEQ_STATE_TIME_WAIT:
2382 case TCP_SEQ_STATE_ESTABLISHED:
2383 if (v)
2384 read_unlock(&tcp_ehash[st->bucket].lock);
2385 local_bh_enable();
2386 break;
2387 }
2388}
2389
2390static int tcp_seq_open(struct inode *inode, struct file *file)
2391{
2392 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2393 struct seq_file *seq;
2394 struct tcp_iter_state *s;
2395 int rc;
2396
2397 if (unlikely(afinfo == NULL))
2398 return -EINVAL;
2399
2400 s = kmalloc(sizeof(*s), GFP_KERNEL);
2401 if (!s)
2402 return -ENOMEM;
2403 memset(s, 0, sizeof(*s));
2404 s->family = afinfo->family;
2405 s->seq_ops.start = tcp_seq_start;
2406 s->seq_ops.next = tcp_seq_next;
2407 s->seq_ops.show = afinfo->seq_show;
2408 s->seq_ops.stop = tcp_seq_stop;
2409
2410 rc = seq_open(file, &s->seq_ops);
2411 if (rc)
2412 goto out_kfree;
2413 seq = file->private_data;
2414 seq->private = s;
2415out:
2416 return rc;
2417out_kfree:
2418 kfree(s);
2419 goto out;
2420}
2421
2422int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2423{
2424 int rc = 0;
2425 struct proc_dir_entry *p;
2426
2427 if (!afinfo)
2428 return -EINVAL;
2429 afinfo->seq_fops->owner = afinfo->owner;
2430 afinfo->seq_fops->open = tcp_seq_open;
2431 afinfo->seq_fops->read = seq_read;
2432 afinfo->seq_fops->llseek = seq_lseek;
2433 afinfo->seq_fops->release = seq_release_private;
2434
2435 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2436 if (p)
2437 p->data = afinfo;
2438 else
2439 rc = -ENOMEM;
2440 return rc;
2441}
2442
2443void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2444{
2445 if (!afinfo)
2446 return;
2447 proc_net_remove(afinfo->name);
2448 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2449}
2450
2451static void get_openreq4(struct sock *sk, struct open_request *req,
2452 char *tmpbuf, int i, int uid)
2453{
2454 int ttd = req->expires - jiffies;
2455
2456 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2457 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2458 i,
2459 req->af.v4_req.loc_addr,
2460 ntohs(inet_sk(sk)->sport),
2461 req->af.v4_req.rmt_addr,
2462 ntohs(req->rmt_port),
2463 TCP_SYN_RECV,
2464 0, 0, /* could print option size, but that is af dependent. */
2465 1, /* timers active (only the expire timer) */
2466 jiffies_to_clock_t(ttd),
2467 req->retrans,
2468 uid,
2469 0, /* non standard timer */
2470 0, /* open_requests have no inode */
2471 atomic_read(&sk->sk_refcnt),
2472 req);
2473}
2474
2475static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2476{
2477 int timer_active;
2478 unsigned long timer_expires;
2479 struct tcp_sock *tp = tcp_sk(sp);
2480 struct inet_sock *inet = inet_sk(sp);
2481 unsigned int dest = inet->daddr;
2482 unsigned int src = inet->rcv_saddr;
2483 __u16 destp = ntohs(inet->dport);
2484 __u16 srcp = ntohs(inet->sport);
2485
2486 if (tp->pending == TCP_TIME_RETRANS) {
2487 timer_active = 1;
2488 timer_expires = tp->timeout;
2489 } else if (tp->pending == TCP_TIME_PROBE0) {
2490 timer_active = 4;
2491 timer_expires = tp->timeout;
2492 } else if (timer_pending(&sp->sk_timer)) {
2493 timer_active = 2;
2494 timer_expires = sp->sk_timer.expires;
2495 } else {
2496 timer_active = 0;
2497 timer_expires = jiffies;
2498 }
2499
2500 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2501 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2502 i, src, srcp, dest, destp, sp->sk_state,
2503 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2504 timer_active,
2505 jiffies_to_clock_t(timer_expires - jiffies),
2506 tp->retransmits,
2507 sock_i_uid(sp),
2508 tp->probes_out,
2509 sock_i_ino(sp),
2510 atomic_read(&sp->sk_refcnt), sp,
2511 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2512 tp->snd_cwnd,
2513 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2514}
2515
2516static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2517{
2518 unsigned int dest, src;
2519 __u16 destp, srcp;
2520 int ttd = tw->tw_ttd - jiffies;
2521
2522 if (ttd < 0)
2523 ttd = 0;
2524
2525 dest = tw->tw_daddr;
2526 src = tw->tw_rcv_saddr;
2527 destp = ntohs(tw->tw_dport);
2528 srcp = ntohs(tw->tw_sport);
2529
2530 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2531 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2532 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2533 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2534 atomic_read(&tw->tw_refcnt), tw);
2535}
2536
2537#define TMPSZ 150
2538
2539static int tcp4_seq_show(struct seq_file *seq, void *v)
2540{
2541 struct tcp_iter_state* st;
2542 char tmpbuf[TMPSZ + 1];
2543
2544 if (v == SEQ_START_TOKEN) {
2545 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2546 " sl local_address rem_address st tx_queue "
2547 "rx_queue tr tm->when retrnsmt uid timeout "
2548 "inode");
2549 goto out;
2550 }
2551 st = seq->private;
2552
2553 switch (st->state) {
2554 case TCP_SEQ_STATE_LISTENING:
2555 case TCP_SEQ_STATE_ESTABLISHED:
2556 get_tcp4_sock(v, tmpbuf, st->num);
2557 break;
2558 case TCP_SEQ_STATE_OPENREQ:
2559 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2560 break;
2561 case TCP_SEQ_STATE_TIME_WAIT:
2562 get_timewait4_sock(v, tmpbuf, st->num);
2563 break;
2564 }
2565 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2566out:
2567 return 0;
2568}
2569
2570static struct file_operations tcp4_seq_fops;
2571static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2572 .owner = THIS_MODULE,
2573 .name = "tcp",
2574 .family = AF_INET,
2575 .seq_show = tcp4_seq_show,
2576 .seq_fops = &tcp4_seq_fops,
2577};
2578
2579int __init tcp4_proc_init(void)
2580{
2581 return tcp_proc_register(&tcp4_seq_afinfo);
2582}
2583
2584void tcp4_proc_exit(void)
2585{
2586 tcp_proc_unregister(&tcp4_seq_afinfo);
2587}
2588#endif /* CONFIG_PROC_FS */
2589
2590struct proto tcp_prot = {
2591 .name = "TCP",
2592 .owner = THIS_MODULE,
2593 .close = tcp_close,
2594 .connect = tcp_v4_connect,
2595 .disconnect = tcp_disconnect,
2596 .accept = tcp_accept,
2597 .ioctl = tcp_ioctl,
2598 .init = tcp_v4_init_sock,
2599 .destroy = tcp_v4_destroy_sock,
2600 .shutdown = tcp_shutdown,
2601 .setsockopt = tcp_setsockopt,
2602 .getsockopt = tcp_getsockopt,
2603 .sendmsg = tcp_sendmsg,
2604 .recvmsg = tcp_recvmsg,
2605 .backlog_rcv = tcp_v4_do_rcv,
2606 .hash = tcp_v4_hash,
2607 .unhash = tcp_unhash,
2608 .get_port = tcp_v4_get_port,
2609 .enter_memory_pressure = tcp_enter_memory_pressure,
2610 .sockets_allocated = &tcp_sockets_allocated,
2611 .memory_allocated = &tcp_memory_allocated,
2612 .memory_pressure = &tcp_memory_pressure,
2613 .sysctl_mem = sysctl_tcp_mem,
2614 .sysctl_wmem = sysctl_tcp_wmem,
2615 .sysctl_rmem = sysctl_tcp_rmem,
2616 .max_header = MAX_TCP_HEADER,
2617 .obj_size = sizeof(struct tcp_sock),
2618};
2619
2620
2621
2622void __init tcp_v4_init(struct net_proto_family *ops)
2623{
2624 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2625 if (err < 0)
2626 panic("Failed to create the TCP control socket.\n");
2627 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2628 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2629
2630 /* Unhash it so that IP input processing does not even
2631 * see it, we do not wish this socket to see incoming
2632 * packets.
2633 */
2634 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2635}
2636
2637EXPORT_SYMBOL(ipv4_specific);
2638EXPORT_SYMBOL(tcp_bind_hash);
2639EXPORT_SYMBOL(tcp_bucket_create);
2640EXPORT_SYMBOL(tcp_hashinfo);
2641EXPORT_SYMBOL(tcp_inherit_port);
2642EXPORT_SYMBOL(tcp_listen_wlock);
2643EXPORT_SYMBOL(tcp_port_rover);
2644EXPORT_SYMBOL(tcp_prot);
2645EXPORT_SYMBOL(tcp_put_port);
2646EXPORT_SYMBOL(tcp_unhash);
2647EXPORT_SYMBOL(tcp_v4_conn_request);
2648EXPORT_SYMBOL(tcp_v4_connect);
2649EXPORT_SYMBOL(tcp_v4_do_rcv);
2650EXPORT_SYMBOL(tcp_v4_rebuild_header);
2651EXPORT_SYMBOL(tcp_v4_remember_stamp);
2652EXPORT_SYMBOL(tcp_v4_send_check);
2653EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2654
2655#ifdef CONFIG_PROC_FS
2656EXPORT_SYMBOL(tcp_proc_register);
2657EXPORT_SYMBOL(tcp_proc_unregister);
2658#endif
2659EXPORT_SYMBOL(sysctl_local_port_range);
2660EXPORT_SYMBOL(sysctl_max_syn_backlog);
2661EXPORT_SYMBOL(sysctl_tcp_low_latency);
2662EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2663
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
new file mode 100644
index 000000000000..fd70509f0d53
--- /dev/null
+++ b/net/ipv4/tcp_minisocks.c
@@ -0,0 +1,1077 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 */
22
23#include <linux/config.h>
24#include <linux/mm.h>
25#include <linux/module.h>
26#include <linux/sysctl.h>
27#include <linux/workqueue.h>
28#include <net/tcp.h>
29#include <net/inet_common.h>
30#include <net/xfrm.h>
31
32#ifdef CONFIG_SYSCTL
33#define SYNC_INIT 0 /* let the user enable it */
34#else
35#define SYNC_INIT 1
36#endif
37
38int sysctl_tcp_tw_recycle;
39int sysctl_tcp_max_tw_buckets = NR_FILE*2;
40
41int sysctl_tcp_syncookies = SYNC_INIT;
42int sysctl_tcp_abort_on_overflow;
43
44static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo);
45
46static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
47{
48 if (seq == s_win)
49 return 1;
50 if (after(end_seq, s_win) && before(seq, e_win))
51 return 1;
52 return (seq == e_win && seq == end_seq);
53}
54
55/* New-style handling of TIME_WAIT sockets. */
56
57int tcp_tw_count;
58
59
60/* Must be called with locally disabled BHs. */
61static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
62{
63 struct tcp_ehash_bucket *ehead;
64 struct tcp_bind_hashbucket *bhead;
65 struct tcp_bind_bucket *tb;
66
67 /* Unlink from established hashes. */
68 ehead = &tcp_ehash[tw->tw_hashent];
69 write_lock(&ehead->lock);
70 if (hlist_unhashed(&tw->tw_node)) {
71 write_unlock(&ehead->lock);
72 return;
73 }
74 __hlist_del(&tw->tw_node);
75 sk_node_init(&tw->tw_node);
76 write_unlock(&ehead->lock);
77
78 /* Disassociate with bind bucket. */
79 bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
80 spin_lock(&bhead->lock);
81 tb = tw->tw_tb;
82 __hlist_del(&tw->tw_bind_node);
83 tw->tw_tb = NULL;
84 tcp_bucket_destroy(tb);
85 spin_unlock(&bhead->lock);
86
87#ifdef INET_REFCNT_DEBUG
88 if (atomic_read(&tw->tw_refcnt) != 1) {
89 printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw,
90 atomic_read(&tw->tw_refcnt));
91 }
92#endif
93 tcp_tw_put(tw);
94}
95
96/*
97 * * The main purpose of TIME-WAIT state is to close the connection gracefully
98 * when one of the ends sits in LAST-ACK or CLOSING retransmitting FIN
99 * (and, probably, a tail of data) and one or more of our ACKs are lost.
100 * * What is the TIME-WAIT timeout? It is associated with the maximal packet
101 * lifetime in the internet, which leads to the wrong conclusion that
102 * it is set to catch "old duplicate segments" wandering out of their path.
103 * It is not quite correct. This timeout is calculated so that it exceeds the
104 * maximal retransmission timeout by enough to allow losing one (or more)
105 * segments sent by the peer and our ACKs. This time may be calculated from the RTO.
106 * * When a TIME-WAIT socket receives an RST, it means that the other end
107 * finally closed and we are allowed to kill TIME-WAIT too.
108 * * The second purpose of TIME-WAIT is catching old duplicate segments.
109 * Well, certainly it is pure paranoia, but if we load TIME-WAIT
110 * with these semantics, we MUST NOT kill TIME-WAIT state with RSTs.
111 * * If we invented some more clever way to catch duplicates
112 * (e.g. based on PAWS), we could truncate TIME-WAIT to several RTOs.
113 *
114 * The algorithm below is based on a FORMAL INTERPRETATION of the RFCs.
115 * When you compare it to the RFCs, please read section SEGMENT ARRIVES
116 * from the very beginning.
117 *
118 * NOTE. With recycling (and later with fin-wait-2) the TW bucket
119 * is _not_ stateless. It means that, strictly speaking, we must
120 * spinlock it. I do not want to! Well, the probability of misbehaviour
121 * is ridiculously low and, it seems, we could use some mb() tricks
122 * to avoid misreading sequence numbers, states etc. --ANK
123 */
124enum tcp_tw_status
125tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
126 struct tcphdr *th, unsigned len)
127{
128 struct tcp_options_received tmp_opt;
129 int paws_reject = 0;
130
131 tmp_opt.saw_tstamp = 0;
132 if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) {
133 tcp_parse_options(skb, &tmp_opt, 0);
134
135 if (tmp_opt.saw_tstamp) {
136 tmp_opt.ts_recent = tw->tw_ts_recent;
137 tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
138 paws_reject = tcp_paws_check(&tmp_opt, th->rst);
139 }
140 }
141
142 if (tw->tw_substate == TCP_FIN_WAIT2) {
143 /* Just repeat all the checks of tcp_rcv_state_process() */
144
145 /* Out of window, send ACK */
146 if (paws_reject ||
147 !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
148 tw->tw_rcv_nxt,
149 tw->tw_rcv_nxt + tw->tw_rcv_wnd))
150 return TCP_TW_ACK;
151
152 if (th->rst)
153 goto kill;
154
155 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt))
156 goto kill_with_rst;
157
158 /* Dup ACK? */
159 if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) ||
160 TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
161 tcp_tw_put(tw);
162 return TCP_TW_SUCCESS;
163 }
164
165 /* New data or FIN. If new data arrives after a half-duplex close,
166 * reset.
167 */
168 if (!th->fin ||
169 TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) {
170kill_with_rst:
171 tcp_tw_deschedule(tw);
172 tcp_tw_put(tw);
173 return TCP_TW_RST;
174 }
175
176 /* FIN arrived, enter true time-wait state. */
177 tw->tw_substate = TCP_TIME_WAIT;
178 tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
179 if (tmp_opt.saw_tstamp) {
180 tw->tw_ts_recent_stamp = xtime.tv_sec;
181 tw->tw_ts_recent = tmp_opt.rcv_tsval;
182 }
183
184 /* I am ashamed, but I failed to make it more elegant.
185 * Yes, it is a direct reference to IP, which is impossible
186 * to generalize to IPv6. Taking into account that IPv6
187 * does not understand recycling in any case, it is not
188 * a big problem in practice. --ANK */
189 if (tw->tw_family == AF_INET &&
190 sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp &&
191 tcp_v4_tw_remember_stamp(tw))
192 tcp_tw_schedule(tw, tw->tw_timeout);
193 else
194 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
195 return TCP_TW_ACK;
196 }
197
198 /*
199 * Now real TIME-WAIT state.
200 *
201 * RFC 1122:
202 * "When a connection is [...] on TIME-WAIT state [...]
203 * [a TCP] MAY accept a new SYN from the remote TCP to
204 * reopen the connection directly, if it:
205 *
206 * (1) assigns its initial sequence number for the new
207 * connection to be larger than the largest sequence
208 * number it used on the previous connection incarnation,
209 * and
210 *
211 * (2) returns to TIME-WAIT state if the SYN turns out
212 * to be an old duplicate".
213 */
214
215 if (!paws_reject &&
216 (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt &&
217 (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
218 /* In-window segment; it may only be a reset or a bare ack. */
219
220 if (th->rst) {
221 /* This is TIME_WAIT assassination, in two flavors.
222 * Oh well... nobody has a sufficient solution to this
223 * protocol bug yet.
224 */
225 if (sysctl_tcp_rfc1337 == 0) {
226kill:
227 tcp_tw_deschedule(tw);
228 tcp_tw_put(tw);
229 return TCP_TW_SUCCESS;
230 }
231 }
232 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
233
234 if (tmp_opt.saw_tstamp) {
235 tw->tw_ts_recent = tmp_opt.rcv_tsval;
236 tw->tw_ts_recent_stamp = xtime.tv_sec;
237 }
238
239 tcp_tw_put(tw);
240 return TCP_TW_SUCCESS;
241 }
242
243 /* Out-of-window segment.
244
245 All such segments are ACKed immediately.
246
247 The only exception is a new SYN. We accept it if it is
248 not an old duplicate and we are not in danger of being killed
249 by delayed old duplicates. The RFC check, that it has a
250 newer sequence number, works at rates <40Mbit/sec.
251 However, if PAWS works, it is reliable AND, even more,
252 we may even relax the silly seq space cutoff.
253
254 RED-PEN: we violate the main RFC requirement; if this SYN turns out
255 to be an old duplicate (i.e. we receive an RST in reply to the SYN-ACK),
256 we must return the socket to time-wait state. It is not good,
257 but not fatal yet.
258 */
259
260 if (th->syn && !th->rst && !th->ack && !paws_reject &&
261 (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) ||
262 (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
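		/* Pick an ISN well above anything used by the old incarnation;
		 * avoid 0, since 0 means "no override" to the caller.
		 */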
263 u32 isn = tw->tw_snd_nxt + 65535 + 2;
264 if (isn == 0)
265 isn++;
266 TCP_SKB_CB(skb)->when = isn;
267 return TCP_TW_SYN;
268 }
269
270 if (paws_reject)
271 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
272
273 if(!th->rst) {
274 /* In this case we must reset the TIMEWAIT timer.
275 *
276 * If it is an ACKless SYN, it may be either an old duplicate
277 * or a new good SYN with a random sequence number < rcv_nxt.
278 * Do not reschedule in the latter case.
279 */
280 if (paws_reject || th->ack)
281 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
282
283 /* Send ACK. Note that we do not put the bucket;
284 * it will be released by the caller.
285 */
286 return TCP_TW_ACK;
287 }
288 tcp_tw_put(tw);
289 return TCP_TW_SUCCESS;
290}
291
292/* Enter the time wait state. This is called with locally disabled BH.
293 * Essentially we whip up a timewait bucket, copy the
294 * relevant info into it from the SK, and mess with hash chains
295 * and list linkage.
296 */
297static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
298{
299 struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
300 struct tcp_bind_hashbucket *bhead;
301
302 /* Step 1: Put TW into the bind hash. The original socket stays there too.
303 Note that any socket with inet_sk(sk)->num != 0 MUST be bound in the
304 binding cache, even if it is closed.
305 */
306 bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
307 spin_lock(&bhead->lock);
308 tw->tw_tb = tcp_sk(sk)->bind_hash;
309 BUG_TRAP(tcp_sk(sk)->bind_hash);
310 tw_add_bind_node(tw, &tw->tw_tb->owners);
311 spin_unlock(&bhead->lock);
312
313 write_lock(&ehead->lock);
314
315 /* Step 2: Remove SK from established hash. */
316 if (__sk_del_node_init(sk))
317 sock_prot_dec_use(sk->sk_prot);
318
319 /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
320 tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
321 atomic_inc(&tw->tw_refcnt);
322
323 write_unlock(&ehead->lock);
324}
325
326/*
327 * Move a socket to time-wait or dead fin-wait-2 state.
328 */
329void tcp_time_wait(struct sock *sk, int state, int timeo)
330{
331 struct tcp_tw_bucket *tw = NULL;
332 struct tcp_sock *tp = tcp_sk(sk);
333 int recycle_ok = 0;
334
335 if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp)
336 recycle_ok = tp->af_specific->remember_stamp(sk);
337
338 if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
339 tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
340
341 if(tw != NULL) {
342 struct inet_sock *inet = inet_sk(sk);
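		/* 3.5 * RTO, computed with shifts: (RTO << 2) - (RTO >> 1). */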
343 int rto = (tp->rto<<2) - (tp->rto>>1);
344
345 /* Give us an identity. */
346 tw->tw_daddr = inet->daddr;
347 tw->tw_rcv_saddr = inet->rcv_saddr;
348 tw->tw_bound_dev_if = sk->sk_bound_dev_if;
349 tw->tw_num = inet->num;
350 tw->tw_state = TCP_TIME_WAIT;
351 tw->tw_substate = state;
352 tw->tw_sport = inet->sport;
353 tw->tw_dport = inet->dport;
354 tw->tw_family = sk->sk_family;
355 tw->tw_reuse = sk->sk_reuse;
356 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
357 atomic_set(&tw->tw_refcnt, 1);
358
359 tw->tw_hashent = sk->sk_hashent;
360 tw->tw_rcv_nxt = tp->rcv_nxt;
361 tw->tw_snd_nxt = tp->snd_nxt;
362 tw->tw_rcv_wnd = tcp_receive_window(tp);
363 tw->tw_ts_recent = tp->rx_opt.ts_recent;
364 tw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
365 tw_dead_node_init(tw);
366
367#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
368 if (tw->tw_family == PF_INET6) {
369 struct ipv6_pinfo *np = inet6_sk(sk);
370
371 ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr);
372 ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr);
373 tw->tw_v6_ipv6only = np->ipv6only;
374 } else {
375 memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr));
376 memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr));
377 tw->tw_v6_ipv6only = 0;
378 }
379#endif
380 /* Linkage updates. */
381 __tcp_tw_hashdance(sk, tw);
382
383 /* Get the TIME_WAIT timeout firing. */
384 if (timeo < rto)
385 timeo = rto;
386
387 if (recycle_ok) {
388 tw->tw_timeout = rto;
389 } else {
390 tw->tw_timeout = TCP_TIMEWAIT_LEN;
391 if (state == TCP_TIME_WAIT)
392 timeo = TCP_TIMEWAIT_LEN;
393 }
394
395 tcp_tw_schedule(tw, timeo);
396 tcp_tw_put(tw);
397 } else {
398 /* Sorry, if we're out of memory, just CLOSE this
399 * socket up. We've got bigger problems than
400 * non-graceful socket closings.
401 */
402 if (net_ratelimit())
403 printk(KERN_INFO "TCP: time wait bucket table overflow\n");
404 }
405
406 tcp_update_metrics(sk);
407 tcp_done(sk);
408}
409
410/* Kill off TIME_WAIT sockets once their lifetime has expired. */
411static int tcp_tw_death_row_slot;
412
413static void tcp_twkill(unsigned long);
414
415/* TIME_WAIT reaping mechanism. */
416#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
417#define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
418
419#define TCP_TWKILL_QUOTA 100
420
421static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
422static DEFINE_SPINLOCK(tw_death_lock);
423static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
424static void twkill_work(void *);
425static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
426static u32 twkill_thread_slots;
427
428/* Returns non-zero if quota exceeded. */
429static int tcp_do_twkill_work(int slot, unsigned int quota)
430{
431 struct tcp_tw_bucket *tw;
432 struct hlist_node *node;
433 unsigned int killed;
434 int ret;
435
436 /* NOTE: compare this to the previous version, where the lock
437 * was released after detaching the chain. It was racy,
438 * because tw buckets are scheduled in a non-serialized context
439 * in 2.3 (with netfilter), and with softnet it is common, because
440 * soft irqs are not sequenced.
441 */
441 */
442 killed = 0;
443 ret = 0;
444rescan:
445 tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
446 __tw_del_dead_node(tw);
447 spin_unlock(&tw_death_lock);
448 tcp_timewait_kill(tw);
449 tcp_tw_put(tw);
450 killed++;
451 spin_lock(&tw_death_lock);
452 if (killed > quota) {
453 ret = 1;
454 break;
455 }
456
457 /* While we dropped tw_death_lock, another cpu may have
458 * killed off the next TW bucket in the list, therefore
459 * do a fresh re-read of the hlist head node with the
460 * lock reacquired. We still use the hlist traversal
461 * macro in order to get the prefetches.
462 */
463 goto rescan;
464 }
465
466 tcp_tw_count -= killed;
467 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
468
469 return ret;
470}
471
472static void tcp_twkill(unsigned long dummy)
473{
474 int need_timer, ret;
475
476 spin_lock(&tw_death_lock);
477
478 if (tcp_tw_count == 0)
479 goto out;
480
481 need_timer = 0;
482 ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
483 if (ret) {
484 twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
485 mb();
486 schedule_work(&tcp_twkill_work);
487 need_timer = 1;
488 } else {
489 /* We purged the entire slot, anything left? */
490 if (tcp_tw_count)
491 need_timer = 1;
492 }
493 tcp_tw_death_row_slot =
494 ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
495 if (need_timer)
496 mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
497out:
498 spin_unlock(&tw_death_lock);
499}
500
501extern void twkill_slots_invalid(void);
502
503static void twkill_work(void *dummy)
504{
505 int i;
506
507 if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
508 twkill_slots_invalid();
509
510 while (twkill_thread_slots) {
511 spin_lock_bh(&tw_death_lock);
512 for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
513 if (!(twkill_thread_slots & (1 << i)))
514 continue;
515
516 while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
517 if (need_resched()) {
518 spin_unlock_bh(&tw_death_lock);
519 schedule();
520 spin_lock_bh(&tw_death_lock);
521 }
522 }
523
524 twkill_thread_slots &= ~(1 << i);
525 }
526 spin_unlock_bh(&tw_death_lock);
527 }
528}
529
530/* These are always called from BH context. See callers in
531 * tcp_input.c to verify this.
532 */
533
534/* This is for handling early-kills of TIME_WAIT sockets. */
535void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
536{
537 spin_lock(&tw_death_lock);
538 if (tw_del_dead_node(tw)) {
539 tcp_tw_put(tw);
540 if (--tcp_tw_count == 0)
541 del_timer(&tcp_tw_timer);
542 }
543 spin_unlock(&tw_death_lock);
544 tcp_timewait_kill(tw);
545}
546
547/* Short-time timewait calendar */
548
549static int tcp_twcal_hand = -1;
550static int tcp_twcal_jiffie;
551static void tcp_twcal_tick(unsigned long);
552static struct timer_list tcp_twcal_timer =
553 TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
554static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
555
556static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
557{
558 struct hlist_head *list;
559 int slot;
560
561 /* timeout := RTO * 3.5
562 *
563 * 3.5 = 1+2+0.5 to wait for two retransmits.
564 *
565 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
566	 * our ACK acking that FIN can be lost. N subsequent retransmitted
567	 * FINs (or earlier segments) are lost with probability p^(N+1),
568	 * where p is the probability of losing a single packet, and the
569	 * time to detect the loss is about RTO*(2^N - 1) with exponential
570	 * backoff. The normal timewait length is chosen so that we wait
571	 * at least for one retransmitted FIN (maximal RTO is 120sec).
572	 * [ BTW Linux, following BSD, violates this requirement, waiting
573	 * only for 60sec; we should wait at least 240 secs.
574	 * Well, 240 consumes too many resources 8)
575	 * ]
576	 * This interval is not reduced to catch old duplicates and
577	 * responses to our wandering segments living for two MSLs.
578	 * However, if we use PAWS to detect
579	 * old duplicates, we can reduce the interval to the bounds required
580	 * by RTO, rather than MSL. So, if the peer understands PAWS, we
581	 * kill the tw bucket after 3.5*RTO (it is important that this number
582	 * is greater than the TS tick!) and detect old duplicates with the
583	 * help of PAWS.
584 */
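	/* The shift below is a ceiling division: slot becomes
	 * ceil(timeo / (1 << TCP_TW_RECYCLE_TICK)), i.e. the timeout
	 * rounded up to whole recycle ticks (a timeout of 5 ticks plus
	 * any fraction maps to slot 6).
	 */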
585 slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
586
587 spin_lock(&tw_death_lock);
588
589 /* Unlink it, if it was scheduled */
590 if (tw_del_dead_node(tw))
591 tcp_tw_count--;
592 else
593 atomic_inc(&tw->tw_refcnt);
594
595 if (slot >= TCP_TW_RECYCLE_SLOTS) {
596 /* Schedule to slow timer */
597 if (timeo >= TCP_TIMEWAIT_LEN) {
598 slot = TCP_TWKILL_SLOTS-1;
599 } else {
600 slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
601 if (slot >= TCP_TWKILL_SLOTS)
602 slot = TCP_TWKILL_SLOTS-1;
603 }
604 tw->tw_ttd = jiffies + timeo;
605 slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
606 list = &tcp_tw_death_row[slot];
607 } else {
608 tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);
609
610 if (tcp_twcal_hand < 0) {
611 tcp_twcal_hand = 0;
612 tcp_twcal_jiffie = jiffies;
613 tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
614 add_timer(&tcp_twcal_timer);
615 } else {
616 if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
617 mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
618 slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
619 }
620 list = &tcp_twcal_row[slot];
621 }
622
623 hlist_add_head(&tw->tw_death_node, list);
624
625 if (tcp_tw_count++ == 0)
626 mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
627 spin_unlock(&tw_death_lock);
628}
629
630void tcp_twcal_tick(unsigned long dummy)
631{
632 int n, slot;
633 unsigned long j;
634 unsigned long now = jiffies;
635 int killed = 0;
636 int adv = 0;
637
638 spin_lock(&tw_death_lock);
639 if (tcp_twcal_hand < 0)
640 goto out;
641
642 slot = tcp_twcal_hand;
643 j = tcp_twcal_jiffie;
644
645 for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
646 if (time_before_eq(j, now)) {
647 struct hlist_node *node, *safe;
648 struct tcp_tw_bucket *tw;
649
650 tw_for_each_inmate_safe(tw, node, safe,
651 &tcp_twcal_row[slot]) {
652 __tw_del_dead_node(tw);
653 tcp_timewait_kill(tw);
654 tcp_tw_put(tw);
655 killed++;
656 }
657 } else {
658 if (!adv) {
659 adv = 1;
660 tcp_twcal_jiffie = j;
661 tcp_twcal_hand = slot;
662 }
663
664 if (!hlist_empty(&tcp_twcal_row[slot])) {
665 mod_timer(&tcp_twcal_timer, j);
666 goto out;
667 }
668 }
669 j += (1<<TCP_TW_RECYCLE_TICK);
670 slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
671 }
672 tcp_twcal_hand = -1;
673
674out:
675 if ((tcp_tw_count -= killed) == 0)
676 del_timer(&tcp_tw_timer);
677 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
678 spin_unlock(&tw_death_lock);
679}
680
681/* This is not only more efficient than what we used to do, it eliminates
682 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
683 *
684	 * Actually, we could avoid lots of memory writes here. tp of the listening
685	 * socket contains all necessary default parameters.
686 */
687struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
688{
689	/* allocate the newsk from the same slab as the master sock; if not,
690	 * at sk_free time we'd try to free it from the wrong slabcache
691	 * (i.e. is it TCPv4 or v6?). This is handled thru sk->sk_prot -acme */
692 struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);
693
694 if(newsk != NULL) {
695 struct tcp_sock *newtp;
696 struct sk_filter *filter;
697
698 memcpy(newsk, sk, sizeof(struct tcp_sock));
699 newsk->sk_state = TCP_SYN_RECV;
700
701 /* SANITY */
702 sk_node_init(&newsk->sk_node);
703 tcp_sk(newsk)->bind_hash = NULL;
704
705 /* Clone the TCP header template */
706 inet_sk(newsk)->dport = req->rmt_port;
707
708 sock_lock_init(newsk);
709 bh_lock_sock(newsk);
710
711 rwlock_init(&newsk->sk_dst_lock);
712 atomic_set(&newsk->sk_rmem_alloc, 0);
713 skb_queue_head_init(&newsk->sk_receive_queue);
714 atomic_set(&newsk->sk_wmem_alloc, 0);
715 skb_queue_head_init(&newsk->sk_write_queue);
716 atomic_set(&newsk->sk_omem_alloc, 0);
717 newsk->sk_wmem_queued = 0;
718 newsk->sk_forward_alloc = 0;
719
720 sock_reset_flag(newsk, SOCK_DONE);
721 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
722 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
723 newsk->sk_send_head = NULL;
724 rwlock_init(&newsk->sk_callback_lock);
725 skb_queue_head_init(&newsk->sk_error_queue);
726 newsk->sk_write_space = sk_stream_write_space;
727
728 if ((filter = newsk->sk_filter) != NULL)
729 sk_filter_charge(newsk, filter);
730
731 if (unlikely(xfrm_sk_clone_policy(newsk))) {
732 /* It is still raw copy of parent, so invalidate
733 * destructor and make plain sk_free() */
734 newsk->sk_destruct = NULL;
735 sk_free(newsk);
736 return NULL;
737 }
738
739 /* Now setup tcp_sock */
740 newtp = tcp_sk(newsk);
741 newtp->pred_flags = 0;
742 newtp->rcv_nxt = req->rcv_isn + 1;
743 newtp->snd_nxt = req->snt_isn + 1;
744 newtp->snd_una = req->snt_isn + 1;
745 newtp->snd_sml = req->snt_isn + 1;
746
747 tcp_prequeue_init(newtp);
748
749 tcp_init_wl(newtp, req->snt_isn, req->rcv_isn);
750
751 newtp->retransmits = 0;
752 newtp->backoff = 0;
753 newtp->srtt = 0;
754 newtp->mdev = TCP_TIMEOUT_INIT;
755 newtp->rto = TCP_TIMEOUT_INIT;
756
757 newtp->packets_out = 0;
758 newtp->left_out = 0;
759 newtp->retrans_out = 0;
760 newtp->sacked_out = 0;
761 newtp->fackets_out = 0;
762 newtp->snd_ssthresh = 0x7fffffff;
763
764 /* So many TCP implementations out there (incorrectly) count the
765 * initial SYN frame in their delayed-ACK and congestion control
766 * algorithms that we must have the following bandaid to talk
767 * efficiently to them. -DaveM
768 */
769 newtp->snd_cwnd = 2;
770 newtp->snd_cwnd_cnt = 0;
771
772 newtp->frto_counter = 0;
773 newtp->frto_highmark = 0;
774
775 tcp_set_ca_state(newtp, TCP_CA_Open);
776 tcp_init_xmit_timers(newsk);
777 skb_queue_head_init(&newtp->out_of_order_queue);
778 newtp->rcv_wup = req->rcv_isn + 1;
779 newtp->write_seq = req->snt_isn + 1;
780 newtp->pushed_seq = newtp->write_seq;
781 newtp->copied_seq = req->rcv_isn + 1;
782
783 newtp->rx_opt.saw_tstamp = 0;
784
785 newtp->rx_opt.dsack = 0;
786 newtp->rx_opt.eff_sacks = 0;
787
788 newtp->probes_out = 0;
789 newtp->rx_opt.num_sacks = 0;
790 newtp->urg_data = 0;
791 newtp->listen_opt = NULL;
792 newtp->accept_queue = newtp->accept_queue_tail = NULL;
793 /* Deinitialize syn_wait_lock to trap illegal accesses. */
794 memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
795
796 /* Back to base struct sock members. */
797 newsk->sk_err = 0;
798 newsk->sk_priority = 0;
799 atomic_set(&newsk->sk_refcnt, 2);
800#ifdef INET_REFCNT_DEBUG
801 atomic_inc(&inet_sock_nr);
802#endif
803 atomic_inc(&tcp_sockets_allocated);
804
805 if (sock_flag(newsk, SOCK_KEEPOPEN))
806 tcp_reset_keepalive_timer(newsk,
807 keepalive_time_when(newtp));
808 newsk->sk_socket = NULL;
809 newsk->sk_sleep = NULL;
810
811 newtp->rx_opt.tstamp_ok = req->tstamp_ok;
812 if((newtp->rx_opt.sack_ok = req->sack_ok) != 0) {
813 if (sysctl_tcp_fack)
814 newtp->rx_opt.sack_ok |= 2;
815 }
816 newtp->window_clamp = req->window_clamp;
817 newtp->rcv_ssthresh = req->rcv_wnd;
818 newtp->rcv_wnd = req->rcv_wnd;
819 newtp->rx_opt.wscale_ok = req->wscale_ok;
820 if (newtp->rx_opt.wscale_ok) {
821 newtp->rx_opt.snd_wscale = req->snd_wscale;
822 newtp->rx_opt.rcv_wscale = req->rcv_wscale;
823 } else {
824 newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
825 newtp->window_clamp = min(newtp->window_clamp, 65535U);
826 }
827 newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale;
828 newtp->max_window = newtp->snd_wnd;
829
830 if (newtp->rx_opt.tstamp_ok) {
831 newtp->rx_opt.ts_recent = req->ts_recent;
832 newtp->rx_opt.ts_recent_stamp = xtime.tv_sec;
833 newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
834 } else {
835 newtp->rx_opt.ts_recent_stamp = 0;
836 newtp->tcp_header_len = sizeof(struct tcphdr);
837 }
838 if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
839 newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len;
840 newtp->rx_opt.mss_clamp = req->mss;
841 TCP_ECN_openreq_child(newtp, req);
842 if (newtp->ecn_flags&TCP_ECN_OK)
843 sock_set_flag(newsk, SOCK_NO_LARGESEND);
844
845 tcp_ca_init(newtp);
846
847 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
848 }
849 return newsk;
850}
851
852/*
853 * Process an incoming packet for SYN_RECV sockets represented
854 * as an open_request.
855 */
856
857struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
858 struct open_request *req,
859 struct open_request **prev)
860{
861 struct tcphdr *th = skb->h.th;
862 struct tcp_sock *tp = tcp_sk(sk);
863 u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
864 int paws_reject = 0;
865 struct tcp_options_received tmp_opt;
866 struct sock *child;
867
868 tmp_opt.saw_tstamp = 0;
869 if (th->doff > (sizeof(struct tcphdr)>>2)) {
870 tcp_parse_options(skb, &tmp_opt, 0);
871
872 if (tmp_opt.saw_tstamp) {
873 tmp_opt.ts_recent = req->ts_recent;
874 /* We do not store true stamp, but it is not required,
875 * it can be estimated (approximately)
876			 * from other data.
877 */
878 tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
879 paws_reject = tcp_paws_check(&tmp_opt, th->rst);
880 }
881 }
882
883 /* Check for pure retransmitted SYN. */
884 if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
885 flg == TCP_FLAG_SYN &&
886 !paws_reject) {
887 /*
888 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
889 * this case on figure 6 and figure 8, but formal
890 * protocol description says NOTHING.
891 * To be more exact, it says that we should send ACK,
892 * because this segment (at least, if it has no data)
893 * is out of window.
894 *
895 * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
896 * describe SYN-RECV state. All the description
897	 * is wrong; we cannot trust it and should
898 * rely only on common sense and implementation
899 * experience.
900 *
901 * Enforce "SYN-ACK" according to figure 8, figure 6
902 * of RFC793, fixed by RFC1122.
903 */
904 req->class->rtx_syn_ack(sk, req, NULL);
905 return NULL;
906 }
907
908 /* Further reproduces section "SEGMENT ARRIVES"
909 for state SYN-RECEIVED of RFC793.
910	   It is broken; however, it fails only
911	   when SYNs are crossed.
912
913 You would think that SYN crossing is impossible here, since
914 we should have a SYN_SENT socket (from connect()) on our end,
915 but this is not true if the crossed SYNs were sent to both
916 ends by a malicious third party. We must defend against this,
917 and to do that we first verify the ACK (as per RFC793, page
918 36) and reset if it is invalid. Is this a true full defense?
919 To convince ourselves, let us consider a way in which the ACK
920 test can still pass in this 'malicious crossed SYNs' case.
921 Malicious sender sends identical SYNs (and thus identical sequence
922 numbers) to both A and B:
923
924 A: gets SYN, seq=7
925 B: gets SYN, seq=7
926
927 By our good fortune, both A and B select the same initial
928 send sequence number of seven :-)
929
930 A: sends SYN|ACK, seq=7, ack_seq=8
931 B: sends SYN|ACK, seq=7, ack_seq=8
932
933 So we are now A eating this SYN|ACK, ACK test passes. So
934 does sequence test, SYN is truncated, and thus we consider
935 it a bare ACK.
936
937 If tp->defer_accept, we silently drop this bare ACK. Otherwise,
938 we create an established connection. Both ends (listening sockets)
939 accept the new incoming connection and try to talk to each other. 8-)
940
941	   Note: This case is both harmless and rare. The possibility is about the
942	   same as us discovering intelligent life on another planet tomorrow.
943
944	   But generally, we should (RFC lies!) accept the ACK
945	   of our SYNACK both here and in tcp_rcv_state_process().
946	   tcp_rcv_state_process() does not, hence we do not either.
947
948 Note that the case is absolutely generic:
949 we cannot optimize anything here without
950 violating protocol. All the checks must be made
951 before attempt to create socket.
952 */
953
954 /* RFC793 page 36: "If the connection is in any non-synchronized state ...
955 * and the incoming segment acknowledges something not yet
956	 * sent (the segment carries an unacceptable ACK) ...
957 * a reset is sent."
958 *
959 * Invalid ACK: reset will be sent by listening socket
960 */
961 if ((flg & TCP_FLAG_ACK) &&
962 (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1))
963 return sk;
964
965	/* Also, it would not be a bad idea to check rcv_tsecr, which
966 * is essentially ACK extension and too early or too late values
967 * should cause reset in unsynchronized states.
968 */
969
970 /* RFC793: "first check sequence number". */
971
972 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
973 req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
974 /* Out of window: send ACK and drop. */
975 if (!(flg & TCP_FLAG_RST))
976 req->class->send_ack(skb, req);
977 if (paws_reject)
978 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
979 return NULL;
980 }
981
982 /* In sequence, PAWS is OK. */
983
984 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
985 req->ts_recent = tmp_opt.rcv_tsval;
986
987 if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
988 /* Truncate SYN, it is out of window starting
989 at req->rcv_isn+1. */
990 flg &= ~TCP_FLAG_SYN;
991 }
992
993 /* RFC793: "second check the RST bit" and
994 * "fourth, check the SYN bit"
995 */
996 if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
997 goto embryonic_reset;
998
999 /* ACK sequence verified above, just make sure ACK is
1000 * set. If ACK not set, just silently drop the packet.
1001 */
1002 if (!(flg & TCP_FLAG_ACK))
1003 return NULL;
1004
1005 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
1006 if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) {
1007 req->acked = 1;
1008 return NULL;
1009 }
1010
1011 /* OK, ACK is valid, create big socket and
1012 * feed this segment to it. It will repeat all
1013 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
1014 * ESTABLISHED STATE. If it will be dropped after
1015 * socket is created, wait for troubles.
1016 */
1017 child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
1018 if (child == NULL)
1019 goto listen_overflow;
1020
1021 tcp_synq_unlink(tp, req, prev);
1022 tcp_synq_removed(sk, req);
1023
1024 tcp_acceptq_queue(sk, req, child);
1025 return child;
1026
1027 listen_overflow:
1028 if (!sysctl_tcp_abort_on_overflow) {
1029 req->acked = 1;
1030 return NULL;
1031 }
1032
1033 embryonic_reset:
1034 NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
1035 if (!(flg & TCP_FLAG_RST))
1036 req->class->send_reset(skb);
1037
1038 tcp_synq_drop(sk, req, prev);
1039 return NULL;
1040}
1041
1042/*
1043 * Queue segment on the new socket if the new socket is active,
1044	 * otherwise we just short-circuit this and continue with
1045 * the new socket.
1046 */
1047
1048int tcp_child_process(struct sock *parent, struct sock *child,
1049 struct sk_buff *skb)
1050{
1051 int ret = 0;
1052 int state = child->sk_state;
1053
1054 if (!sock_owned_by_user(child)) {
1055 ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);
1056
1057 /* Wakeup parent, send SIGIO */
1058 if (state == TCP_SYN_RECV && child->sk_state != state)
1059 parent->sk_data_ready(parent, 0);
1060 } else {
1061		/* Alas, it is possible again, because we do the lookup
1062		 * in the main socket hash table and the lock on the listening
1063		 * socket no longer protects us.
1064 */
1065 sk_add_backlog(child, skb);
1066 }
1067
1068 bh_unlock_sock(child);
1069 sock_put(child);
1070 return ret;
1071}
1072
1073EXPORT_SYMBOL(tcp_check_req);
1074EXPORT_SYMBOL(tcp_child_process);
1075EXPORT_SYMBOL(tcp_create_openreq_child);
1076EXPORT_SYMBOL(tcp_timewait_state_process);
1077EXPORT_SYMBOL(tcp_tw_deschedule);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
new file mode 100644
index 000000000000..13c14cb6dee4
--- /dev/null
+++ b/net/ipv4/tcp_output.c
@@ -0,0 +1,1739 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 */
22
23/*
24 * Changes: Pedro Roque : Retransmit queue handled by TCP.
25 * : Fragmentation on mtu decrease
26 * : Segment collapse on retransmit
27 * : AF independence
28 *
29 * Linus Torvalds : send_delayed_ack
30 * David S. Miller : Charge memory using the right skb
31 * during syn/ack processing.
32 * David S. Miller : Output engine completely rewritten.
33 * Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
34 * Cacophonix Gaul : draft-minshall-nagle-01
35 * J Hadi Salim : ECN support
36 *
37 */
38
39#include <net/tcp.h>
40
41#include <linux/compiler.h>
42#include <linux/module.h>
43#include <linux/smp_lock.h>
44
45/* People can turn this off for buggy TCPs found in printers etc. */
46int sysctl_tcp_retrans_collapse = 1;
47
48/* This limits the percentage of the congestion window which we
49 * will allow a single TSO frame to consume. Building TSO frames
50 * which are too large can cause TCP streams to be bursty.
51 */
52int sysctl_tcp_tso_win_divisor = 8;
53
54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 struct sk_buff *skb)
56{
57 sk->sk_send_head = skb->next;
58 if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
59 sk->sk_send_head = NULL;
60 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
61 tcp_packets_out_inc(sk, tp, skb);
62}
63
64/* SND.NXT, if window was not shrunk.
65 * If the window has been shrunk, what should we do? It is not clear at all.
66 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
67 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
68 * invalid. OK, let's make this for now:
69 */
70static inline __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_sock *tp)
71{
72 if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
73 return tp->snd_nxt;
74 else
75 return tp->snd_una+tp->snd_wnd;
76}
77
78/* Calculate mss to advertise in SYN segment.
79 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
80 *
81 * 1. It is independent of path mtu.
82 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
83 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
84 * attached devices, because some buggy hosts are confused by
85 * large MSS.
86 *    4. We do not do 3; we advertise an MSS calculated from the first
87 *       hop device mtu, but allow it to be raised to ip_rt_min_advmss.
88 * This may be overridden via information stored in routing table.
89 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
90 * probably even Jumbo".
91 */
92static __u16 tcp_advertise_mss(struct sock *sk)
93{
94 struct tcp_sock *tp = tcp_sk(sk);
95 struct dst_entry *dst = __sk_dst_get(sk);
96 int mss = tp->advmss;
97
98 if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
99 mss = dst_metric(dst, RTAX_ADVMSS);
100 tp->advmss = mss;
101 }
102
103 return (__u16)mss;
104}
105
106/* RFC2861. Reset CWND after an idle period longer than RTO to the "restart window".
107 * This is the first part of cwnd validation mechanism. */
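/* Rough illustration of the decay below: after sitting idle for a bit
 * more than three RTOs with snd_cwnd == 32 and a restart window of 4,
 * the loop halves cwnd three times (32 -> 16 -> 8 -> 4); the result is
 * never pushed below restart_cwnd.
 */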
108static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
109{
110 s32 delta = tcp_time_stamp - tp->lsndtime;
111 u32 restart_cwnd = tcp_init_cwnd(tp, dst);
112 u32 cwnd = tp->snd_cwnd;
113
114 if (tcp_is_vegas(tp))
115 tcp_vegas_enable(tp);
116
117 tp->snd_ssthresh = tcp_current_ssthresh(tp);
118 restart_cwnd = min(restart_cwnd, cwnd);
119
120 while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
121 cwnd >>= 1;
122 tp->snd_cwnd = max(cwnd, restart_cwnd);
123 tp->snd_cwnd_stamp = tcp_time_stamp;
124 tp->snd_cwnd_used = 0;
125}
126
127static inline void tcp_event_data_sent(struct tcp_sock *tp,
128 struct sk_buff *skb, struct sock *sk)
129{
130 u32 now = tcp_time_stamp;
131
132 if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
133 tcp_cwnd_restart(tp, __sk_dst_get(sk));
134
135 tp->lsndtime = now;
136
137	/* If this is a reply sent within ATO of the last received
138	 * packet, enter pingpong mode.
139 */
140 if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
141 tp->ack.pingpong = 1;
142}
143
144static __inline__ void tcp_event_ack_sent(struct sock *sk)
145{
146 struct tcp_sock *tp = tcp_sk(sk);
147
148 tcp_dec_quickack_mode(tp);
149 tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
150}
151
152/* Determine a window scaling and initial window to offer.
153 * Based on the assumption that the given amount of space
154 * will be offered. Store the results in the tp structure.
155 * NOTE: for smooth operation initial space offering should
156 * be a multiple of mss if possible. We assume here that mss >= 1.
157 * This MUST be enforced by all callers.
158 */
159void tcp_select_initial_window(int __space, __u32 mss,
160 __u32 *rcv_wnd, __u32 *window_clamp,
161 int wscale_ok, __u8 *rcv_wscale)
162{
163 unsigned int space = (__space < 0 ? 0 : __space);
164
165 /* If no clamp set the clamp to the max possible scaled window */
166 if (*window_clamp == 0)
167 (*window_clamp) = (65535 << 14);
168 space = min(*window_clamp, space);
169
170 /* Quantize space offering to a multiple of mss if possible. */
171 if (space > mss)
172 space = (space / mss) * mss;
173
174 /* NOTE: offering an initial window larger than 32767
175 * will break some buggy TCP stacks. We try to be nice.
176 * If we are not window scaling, then this truncates
177 * our initial window offering to 32k. There should also
178 * be a sysctl option to stop being nice.
179 */
180 (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
181 (*rcv_wscale) = 0;
182 if (wscale_ok) {
183 /* Set window scaling on max possible window
184 * See RFC1323 for an explanation of the limit to 14
185 */
186 space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
187 while (space > 65535 && (*rcv_wscale) < 14) {
188 space >>= 1;
189 (*rcv_wscale)++;
190 }
191 }
192
193	/* Set the initial window to a value large enough for senders
194	 * following RFC2414 (increased initial windows); senders not
195	 * following this RFC will be satisfied with 2.
196 */
197 if (mss > (1<<*rcv_wscale)) {
198 int init_cwnd = 4;
199 if (mss > 1460*3)
200 init_cwnd = 2;
201 else if (mss > 1460)
202 init_cwnd = 3;
203 if (*rcv_wnd > init_cwnd*mss)
204 *rcv_wnd = init_cwnd*mss;
205 }
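	/* e.g. a 1460-byte mss caps the initial offer at 4*1460 = 5840
	 * bytes, while an mss above 3*1460 gets only 2*mss.
	 */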
206
207 /* Set the clamp no higher than max representable value */
208 (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
209}
210
211/* Chose a new window to advertise, update state in tcp_sock for the
212 * socket, and return result with RFC1323 scaling applied. The return
213 * value can be stuffed directly into th->window for an outgoing
214 * frame.
215 */
216static __inline__ u16 tcp_select_window(struct sock *sk)
217{
218 struct tcp_sock *tp = tcp_sk(sk);
219 u32 cur_win = tcp_receive_window(tp);
220 u32 new_win = __tcp_select_window(sk);
221
222 /* Never shrink the offered window */
223 if(new_win < cur_win) {
224 /* Danger Will Robinson!
225 * Don't update rcv_wup/rcv_wnd here or else
226 * we will not be able to advertise a zero
227 * window in time. --DaveM
228 *
229 * Relax Will Robinson.
230 */
231 new_win = cur_win;
232 }
233 tp->rcv_wnd = new_win;
234 tp->rcv_wup = tp->rcv_nxt;
235
236 /* Make sure we do not exceed the maximum possible
237 * scaled window.
238 */
239 if (!tp->rx_opt.rcv_wscale)
240 new_win = min(new_win, MAX_TCP_WINDOW);
241 else
242 new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
243
244 /* RFC1323 scaling applied */
245 new_win >>= tp->rx_opt.rcv_wscale;
246
247 /* If we advertise zero window, disable fast path. */
248 if (new_win == 0)
249 tp->pred_flags = 0;
250
251 return new_win;
252}
253
254
255/* This routine actually transmits TCP packets queued in by
256 * tcp_do_sendmsg(). This is used by both the initial
257 * transmission and possible later retransmissions.
258 * All SKB's seen here are completely headerless. It is our
259 * job to build the TCP header, and pass the packet down to
260 * IP so it can do the same plus pass the packet off to the
261 * device.
262 *
263 * We are working here with either a clone of the original
264 * SKB, or a fresh unique copy made by the retransmit engine.
265 */
266static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
267{
268 if (skb != NULL) {
269 struct inet_sock *inet = inet_sk(sk);
270 struct tcp_sock *tp = tcp_sk(sk);
271 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
272 int tcp_header_size = tp->tcp_header_len;
273 struct tcphdr *th;
274 int sysctl_flags;
275 int err;
276
277 BUG_ON(!tcp_skb_pcount(skb));
278
279#define SYSCTL_FLAG_TSTAMPS 0x1
280#define SYSCTL_FLAG_WSCALE 0x2
281#define SYSCTL_FLAG_SACK 0x4
282
283 sysctl_flags = 0;
284 if (tcb->flags & TCPCB_FLAG_SYN) {
285 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
286 if(sysctl_tcp_timestamps) {
287 tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
288 sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
289 }
290 if(sysctl_tcp_window_scaling) {
291 tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
292 sysctl_flags |= SYSCTL_FLAG_WSCALE;
293 }
294 if(sysctl_tcp_sack) {
295 sysctl_flags |= SYSCTL_FLAG_SACK;
296 if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
297 tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
298 }
299 } else if (tp->rx_opt.eff_sacks) {
300 /* A SACK is 2 pad bytes, a 2 byte header, plus
301 * 2 32-bit sequence numbers for each SACK block.
302 */
303 tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
304 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
305 }
306
307 /*
308 * If the connection is idle and we are restarting,
309 * then we don't want to do any Vegas calculations
310 * until we get fresh RTT samples. So when we
311 * restart, we reset our Vegas state to a clean
312 * slate. After we get acks for this flight of
313 * packets, _then_ we can make Vegas calculations
314 * again.
315 */
316 if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
317 tcp_vegas_enable(tp);
318
319 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
320 skb->h.th = th;
321 skb_set_owner_w(skb, sk);
322
323 /* Build TCP header and checksum it. */
324 th->source = inet->sport;
325 th->dest = inet->dport;
326 th->seq = htonl(tcb->seq);
327 th->ack_seq = htonl(tp->rcv_nxt);
328 *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
329 if (tcb->flags & TCPCB_FLAG_SYN) {
330 /* RFC1323: The window in SYN & SYN/ACK segments
331 * is never scaled.
332 */
333 th->window = htons(tp->rcv_wnd);
334 } else {
335 th->window = htons(tcp_select_window(sk));
336 }
337 th->check = 0;
338 th->urg_ptr = 0;
339
340 if (tp->urg_mode &&
341 between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
342 th->urg_ptr = htons(tp->snd_up-tcb->seq);
343 th->urg = 1;
344 }
345
346 if (tcb->flags & TCPCB_FLAG_SYN) {
347 tcp_syn_build_options((__u32 *)(th + 1),
348 tcp_advertise_mss(sk),
349 (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
350 (sysctl_flags & SYSCTL_FLAG_SACK),
351 (sysctl_flags & SYSCTL_FLAG_WSCALE),
352 tp->rx_opt.rcv_wscale,
353 tcb->when,
354 tp->rx_opt.ts_recent);
355 } else {
356 tcp_build_and_update_options((__u32 *)(th + 1),
357 tp, tcb->when);
358
359 TCP_ECN_send(sk, tp, skb, tcp_header_size);
360 }
361 tp->af_specific->send_check(sk, th, skb->len, skb);
362
363 if (tcb->flags & TCPCB_FLAG_ACK)
364 tcp_event_ack_sent(sk);
365
366 if (skb->len != tcp_header_size)
367 tcp_event_data_sent(tp, skb, sk);
368
369 TCP_INC_STATS(TCP_MIB_OUTSEGS);
370
371 err = tp->af_specific->queue_xmit(skb, 0);
372 if (err <= 0)
373 return err;
374
375 tcp_enter_cwr(tp);
376
377 /* NET_XMIT_CN is special. It does not guarantee,
378 * that this packet is lost. It tells that device
379 * is about to start to drop packets or already
380 * drops some packets of the same priority and
381 * invokes us to send less aggressively.
382 */
383 return err == NET_XMIT_CN ? 0 : err;
384 }
385 return -ENOBUFS;
386#undef SYSCTL_FLAG_TSTAMPS
387#undef SYSCTL_FLAG_WSCALE
388#undef SYSCTL_FLAG_SACK
389}
390
391
392/* This routine just queues the buffer
393 *
394 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
395 * otherwise socket can stall.
396 */
397static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
398{
399 struct tcp_sock *tp = tcp_sk(sk);
400
401 /* Advance write_seq and place onto the write_queue. */
402 tp->write_seq = TCP_SKB_CB(skb)->end_seq;
403 skb_header_release(skb);
404 __skb_queue_tail(&sk->sk_write_queue, skb);
405 sk_charge_skb(sk, skb);
406
407 /* Queue it, remembering where we must start sending. */
408 if (sk->sk_send_head == NULL)
409 sk->sk_send_head = skb;
410}
411
412static inline void tcp_tso_set_push(struct sk_buff *skb)
413{
414 /* Force push to be on for any TSO frames to workaround
415 * problems with busted implementations like Mac OS-X that
416 * hold off socket receive wakeups until push is seen.
417 */
418 if (tcp_skb_pcount(skb) > 1)
419 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
420}
421
422/* Send _single_ skb sitting at the send head. This function requires
423 * true push pending frames to setup probe timer etc.
424 */
425void tcp_push_one(struct sock *sk, unsigned cur_mss)
426{
427 struct tcp_sock *tp = tcp_sk(sk);
428 struct sk_buff *skb = sk->sk_send_head;
429
430 if (tcp_snd_test(tp, skb, cur_mss, TCP_NAGLE_PUSH)) {
431 /* Send it out now. */
432 TCP_SKB_CB(skb)->when = tcp_time_stamp;
433 tcp_tso_set_push(skb);
434 if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
435 sk->sk_send_head = NULL;
436 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
437 tcp_packets_out_inc(sk, tp, skb);
438 return;
439 }
440 }
441}
442
443void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_std)
444{
445 if (skb->len <= mss_std) {
446 /* Avoid the costly divide in the normal
447 * non-TSO case.
448 */
449 skb_shinfo(skb)->tso_segs = 1;
450 skb_shinfo(skb)->tso_size = 0;
451 } else {
452 unsigned int factor;
453
454 factor = skb->len + (mss_std - 1);
455 factor /= mss_std;
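		/* e.g. a 4096-byte skb with a 1460-byte mss_std yields
		 * factor = (4096 + 1459) / 1460 = 3 segments.
		 */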
456 skb_shinfo(skb)->tso_segs = factor;
457 skb_shinfo(skb)->tso_size = mss_std;
458 }
459}
460
461/* Function to create two new TCP segments. Shrinks the given segment
462 * to the specified size and appends a new segment with the rest of the
463 * packet to the list. This won't be called frequently, I hope.
464 * Remember, these are still headerless SKBs at this point.
465 */
466static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
467{
468 struct tcp_sock *tp = tcp_sk(sk);
469 struct sk_buff *buff;
470 int nsize;
471 u16 flags;
472
473 nsize = skb_headlen(skb) - len;
474 if (nsize < 0)
475 nsize = 0;
476
477 if (skb_cloned(skb) &&
478 skb_is_nonlinear(skb) &&
479 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
480 return -ENOMEM;
481
482 /* Get a new skb... force flag on. */
483 buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
484 if (buff == NULL)
485 return -ENOMEM; /* We'll just try again later. */
486 sk_charge_skb(sk, buff);
487
488 /* Correct the sequence numbers. */
489 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
490 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
491 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
492
493 /* PSH and FIN should only be set in the second packet. */
494 flags = TCP_SKB_CB(skb)->flags;
495 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
496 TCP_SKB_CB(buff)->flags = flags;
497 TCP_SKB_CB(buff)->sacked =
498 (TCP_SKB_CB(skb)->sacked &
499 (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL));
500 TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
501
502 if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
503 /* Copy and checksum data tail into the new buffer. */
504 buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
505 nsize, 0);
506
507 skb_trim(skb, len);
508
509 skb->csum = csum_block_sub(skb->csum, buff->csum, len);
510 } else {
511 skb->ip_summed = CHECKSUM_HW;
512 skb_split(skb, buff, len);
513 }
514
515 buff->ip_summed = skb->ip_summed;
516
517	/* Looks stupid, but our code really uses the 'when' field of
518	 * skbs which it has never sent before. --ANK
519 */
520 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
521
522 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
523 tp->lost_out -= tcp_skb_pcount(skb);
524 tp->left_out -= tcp_skb_pcount(skb);
525 }
526
527 /* Fix up tso_factor for both original and new SKB. */
528 tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
529 tcp_set_skb_tso_segs(buff, tp->mss_cache_std);
530
531 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
532 tp->lost_out += tcp_skb_pcount(skb);
533 tp->left_out += tcp_skb_pcount(skb);
534 }
535
536 if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
537 tp->lost_out += tcp_skb_pcount(buff);
538 tp->left_out += tcp_skb_pcount(buff);
539 }
540
541 /* Link BUFF into the send queue. */
542 __skb_append(skb, buff);
543
544 return 0;
545}
546
547/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
548 * eventually). The difference is that the pulled data is not copied, but
549 * immediately discarded.
550 */
551static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
552{
553 int i, k, eat;
554
555 eat = len;
556 k = 0;
557 for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
558 if (skb_shinfo(skb)->frags[i].size <= eat) {
559 put_page(skb_shinfo(skb)->frags[i].page);
560 eat -= skb_shinfo(skb)->frags[i].size;
561 } else {
562 skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
563 if (eat) {
564 skb_shinfo(skb)->frags[k].page_offset += eat;
565 skb_shinfo(skb)->frags[k].size -= eat;
566 eat = 0;
567 }
568 k++;
569 }
570 }
571 skb_shinfo(skb)->nr_frags = k;
572
573 skb->tail = skb->data;
574 skb->data_len -= len;
575 skb->len = skb->data_len;
576 return skb->tail;
577}
578
579int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
580{
581 if (skb_cloned(skb) &&
582 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
583 return -ENOMEM;
584
585 if (len <= skb_headlen(skb)) {
586 __skb_pull(skb, len);
587 } else {
588 if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
589 return -ENOMEM;
590 }
591
592 TCP_SKB_CB(skb)->seq += len;
593 skb->ip_summed = CHECKSUM_HW;
594
595 skb->truesize -= len;
596 sk->sk_wmem_queued -= len;
597 sk->sk_forward_alloc += len;
598 sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
599
600 /* Any change of skb->len requires recalculation of tso
601 * factor and mss.
602 */
603 if (tcp_skb_pcount(skb) > 1)
604 tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
605
606 return 0;
607}
608
609/* This function synchronizes the snd mss to the current pmtu/exthdr set.
610
611   tp->rx_opt.user_mss is the mss set by the user via TCP_MAXSEG. It does
612   NOT account for TCP options, but includes only the bare TCP header.
613
614   tp->rx_opt.mss_clamp is the mss negotiated at connection setup.
615   It is the minimum of user_mss and the mss received with the SYN.
616 It also does not include TCP options.
617
618 tp->pmtu_cookie is last pmtu, seen by this function.
619
620 tp->mss_cache is current effective sending mss, including
621 all tcp options except for SACKs. It is evaluated,
622 taking into account current pmtu, but never exceeds
623 tp->rx_opt.mss_clamp.
624
625 NOTE1. rfc1122 clearly states that advertised MSS
626 DOES NOT include either tcp or ip options.
627
628 NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
629 this function. --ANK (980731)
630 */
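/* A worked example, assuming plain IPv4 over Ethernet with timestamps
 * enabled: pmtu 1500 - 20 (IP) - 20 (TCP) = 1460; after subtracting the
 * 12 bytes of aligned timestamp option space already counted in
 * tcp_header_len, the effective sending mss becomes 1448.
 */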
631
632unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
633{
634 struct tcp_sock *tp = tcp_sk(sk);
635 int mss_now;
636
637 /* Calculate base mss without TCP options:
638 It is MMS_S - sizeof(tcphdr) of rfc1122
639 */
640 mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
641
642 /* Clamp it (mss_clamp does not include tcp options) */
643 if (mss_now > tp->rx_opt.mss_clamp)
644 mss_now = tp->rx_opt.mss_clamp;
645
646 /* Now subtract optional transport overhead */
647 mss_now -= tp->ext_header_len;
648
649 /* Then reserve room for full set of TCP options and 8 bytes of data */
650 if (mss_now < 48)
651 mss_now = 48;
652
653 /* Now subtract TCP options size, not including SACKs */
654 mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
655
656 /* Bound mss with half of window */
657 if (tp->max_window && mss_now > (tp->max_window>>1))
658 mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
659
660 /* And store cached results */
661 tp->pmtu_cookie = pmtu;
662 tp->mss_cache = tp->mss_cache_std = mss_now;
663
664 return mss_now;
665}
666
667/* Compute the current effective MSS, taking SACKs and IP options,
668 * and even PMTU discovery events into account.
669 *
670 * LARGESEND note: !urg_mode is overkill, only frames up to snd_up
671 * cannot be large. However, taking into account rare use of URG, this
672 * is not a big flaw.
673 */
674
675unsigned int tcp_current_mss(struct sock *sk, int large)
676{
677 struct tcp_sock *tp = tcp_sk(sk);
678 struct dst_entry *dst = __sk_dst_get(sk);
679 unsigned int do_large, mss_now;
680
681 mss_now = tp->mss_cache_std;
682 if (dst) {
683 u32 mtu = dst_mtu(dst);
684 if (mtu != tp->pmtu_cookie)
685 mss_now = tcp_sync_mss(sk, mtu);
686 }
687
688 do_large = (large &&
689 (sk->sk_route_caps & NETIF_F_TSO) &&
690 !tp->urg_mode);
691
692 if (do_large) {
693 unsigned int large_mss, factor, limit;
694
695 large_mss = 65535 - tp->af_specific->net_header_len -
696 tp->ext_header_len - tp->tcp_header_len;
697
698 if (tp->max_window && large_mss > (tp->max_window>>1))
699 large_mss = max((tp->max_window>>1),
700 68U - tp->tcp_header_len);
701
702 factor = large_mss / mss_now;
703
704 /* Always keep large mss multiple of real mss, but
705 * do not exceed 1/tso_win_divisor of the congestion window
706 * so we can keep the ACK clock ticking and minimize
707 * bursting.
708 */
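		/* e.g. with snd_cwnd == 20 and the default divisor of 8,
		 * limit becomes 20/8 = 2, so the TSO super-frame is capped
		 * at two real segments.
		 */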
709 limit = tp->snd_cwnd;
710 if (sysctl_tcp_tso_win_divisor)
711 limit /= sysctl_tcp_tso_win_divisor;
712 limit = max(1U, limit);
713 if (factor > limit)
714 factor = limit;
715
716 tp->mss_cache = mss_now * factor;
717
718 mss_now = tp->mss_cache;
719 }
720
721 if (tp->rx_opt.eff_sacks)
722 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
723 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
724 return mss_now;
725}
726
727/* This routine writes packets to the network. It advances the
728 * send_head. This happens as incoming acks open up the remote
729 * window for us.
730 *
731 * Returns 1, if no segments are in flight and we have queued segments, but
732 * cannot send anything now because of SWS or another problem.
733 */
734int tcp_write_xmit(struct sock *sk, int nonagle)
735{
736 struct tcp_sock *tp = tcp_sk(sk);
737 unsigned int mss_now;
738
739 /* If we are closed, the bytes will have to remain here.
740 * In time closedown will finish, we empty the write queue and all
741 * will be happy.
742 */
743 if (sk->sk_state != TCP_CLOSE) {
744 struct sk_buff *skb;
745 int sent_pkts = 0;
746
747		/* Account for SACKs; we may need to fragment due to this.
748 * It is just like the real MSS changing on us midstream.
749 * We also handle things correctly when the user adds some
750 * IP options mid-stream. Silly to do, but cover it.
751 */
752 mss_now = tcp_current_mss(sk, 1);
753
754 while ((skb = sk->sk_send_head) &&
755 tcp_snd_test(tp, skb, mss_now,
756 tcp_skb_is_last(sk, skb) ? nonagle :
757 TCP_NAGLE_PUSH)) {
758 if (skb->len > mss_now) {
759 if (tcp_fragment(sk, skb, mss_now))
760 break;
761 }
762
763 TCP_SKB_CB(skb)->when = tcp_time_stamp;
764 tcp_tso_set_push(skb);
765 if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
766 break;
767
768 /* Advance the send_head. This one is sent out.
769 * This call will increment packets_out.
770 */
771 update_send_head(sk, tp, skb);
772
773 tcp_minshall_update(tp, mss_now, skb);
774 sent_pkts = 1;
775 }
776
777 if (sent_pkts) {
778 tcp_cwnd_validate(sk, tp);
779 return 0;
780 }
781
782 return !tp->packets_out && sk->sk_send_head;
783 }
784 return 0;
785}
786
787/* This function returns the amount that we can raise the
788 * usable window based on the following constraints
789 *
790 * 1. The window can never be shrunk once it is offered (RFC 793)
791 * 2. We limit memory per socket
792 *
793 * RFC 1122:
794 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
795 * RECV.NEXT + RCV.WIN fixed until:
796 * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
797 *
798 * i.e. don't raise the right edge of the window until you can raise
799 * it at least MSS bytes.
800 *
801 * Unfortunately, the recommended algorithm breaks header prediction,
802 * since header prediction assumes th->window stays fixed.
803 *
804 * Strictly speaking, keeping th->window fixed violates the receiver
805 * side SWS prevention criteria. The problem is that under this rule
806 * a stream of single byte packets will cause the right side of the
807 * window to always advance by a single byte.
808 *
809 * Of course, if the sender implements sender side SWS prevention
810 * then this will not be a problem.
811 *
812 * BSD seems to make the following compromise:
813 *
814 * If the free space is less than the 1/4 of the maximum
815 * space available and the free space is less than 1/2 mss,
816 * then set the window to 0.
817 * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
818 * Otherwise, just prevent the window from shrinking
819 * and from being larger than the largest representable value.
820 *
821 * This prevents incremental opening of the window in the regime
822 * where TCP is limited by the speed of the reader side taking
823 * data out of the TCP receive queue. It does nothing about
824 * those cases where the window is constrained on the sender side
825 * because the pipeline is full.
826 *
827 * BSD also seems to "accidentally" limit itself to windows that are a
828 * multiple of MSS, at least until the free space gets quite small.
829 * This would appear to be a side effect of the mbuf implementation.
830 * Combining these two algorithms results in the observed behavior
831 * of having a fixed window size at almost all times.
832 *
833 * Below we obtain similar behavior by forcing the offered window to
834 * a multiple of the mss when it is feasible to do so.
835 *
836 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
837 * Regular options like TIMESTAMP are taken into account.
838 */
839u32 __tcp_select_window(struct sock *sk)
840{
841 struct tcp_sock *tp = tcp_sk(sk);
842	/* MSS for the peer's data. Previous versions used mss_clamp
843	 * here. I don't know if the value based on our guesses
844	 * of the peer's MSS is better for performance. It's more correct
845	 * but may be worse for performance because of rcv_mss
846 * fluctuations. --SAW 1998/11/1
847 */
848 int mss = tp->ack.rcv_mss;
849 int free_space = tcp_space(sk);
850 int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
851 int window;
852
853 if (mss > full_space)
854 mss = full_space;
855
856 if (free_space < full_space/2) {
857 tp->ack.quick = 0;
858
859 if (tcp_memory_pressure)
860 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
861
862 if (free_space < mss)
863 return 0;
864 }
865
866 if (free_space > tp->rcv_ssthresh)
867 free_space = tp->rcv_ssthresh;
868
869 /* Don't do rounding if we are using window scaling, since the
870 * scaled window will not line up with the MSS boundary anyway.
871 */
872 window = tp->rcv_wnd;
873 if (tp->rx_opt.rcv_wscale) {
874 window = free_space;
875
876 /* Advertise enough space so that it won't get scaled away.
877		 * Important case: prevent zero window announcement if
878 * 1<<rcv_wscale > mss.
879 */
880 if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
881 window = (((window >> tp->rx_opt.rcv_wscale) + 1)
882 << tp->rx_opt.rcv_wscale);
883 } else {
884 /* Get the largest window that is a nice multiple of mss.
885 * Window clamp already applied above.
886 * If our current window offering is within 1 mss of the
887 * free space we just keep it. This prevents the divide
888 * and multiply from happening most of the time.
889 * We also don't do any window rounding when the free space
890 * is too small.
891 */
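		/* e.g. when recomputed below, a free_space of 10000 bytes
		 * with a 1460-byte mss rounds down to 6*1460 = 8760 bytes.
		 */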
892 if (window <= free_space - mss || window > free_space)
893 window = (free_space/mss)*mss;
894 }
895
896 return window;
897}
898
899/* Attempt to collapse two adjacent SKB's during retransmission. */
900static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
901{
902 struct tcp_sock *tp = tcp_sk(sk);
903 struct sk_buff *next_skb = skb->next;
904
905 /* The first test we must make is that neither of these two
906 * SKB's are still referenced by someone else.
907 */
908 if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
909 int skb_size = skb->len, next_skb_size = next_skb->len;
910 u16 flags = TCP_SKB_CB(skb)->flags;
911
912 /* Also punt if next skb has been SACK'd. */
913 if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
914 return;
915
916 /* Next skb is out of window. */
917 if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
918 return;
919
920 /* Punt if not enough space exists in the first SKB for
921 * the data in the second, or the total combined payload
922 * would exceed the MSS.
923 */
924 if ((next_skb_size > skb_tailroom(skb)) ||
925 ((skb_size + next_skb_size) > mss_now))
926 return;
927
928 BUG_ON(tcp_skb_pcount(skb) != 1 ||
929 tcp_skb_pcount(next_skb) != 1);
930
931 /* Ok. We will be able to collapse the packet. */
932 __skb_unlink(next_skb, next_skb->list);
933
934 memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
935
936 if (next_skb->ip_summed == CHECKSUM_HW)
937 skb->ip_summed = CHECKSUM_HW;
938
939 if (skb->ip_summed != CHECKSUM_HW)
940 skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
941
942 /* Update sequence range on original skb. */
943 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
944
945 /* Merge over control information. */
946 flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
947 TCP_SKB_CB(skb)->flags = flags;
948
949 /* All done, get rid of second SKB and account for it so
950 * packet counting does not break.
951 */
952 TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
953 if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
954 tp->retrans_out -= tcp_skb_pcount(next_skb);
955 if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
956 tp->lost_out -= tcp_skb_pcount(next_skb);
957 tp->left_out -= tcp_skb_pcount(next_skb);
958 }
959 /* Reno case is special. Sigh... */
960 if (!tp->rx_opt.sack_ok && tp->sacked_out) {
961 tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
962 tp->left_out -= tcp_skb_pcount(next_skb);
963 }
964
965 /* Not quite right: it can be > snd.fack, but
966 * it is better to underestimate fackets.
967 */
968 tcp_dec_pcount_approx(&tp->fackets_out, next_skb);
969 tcp_packets_out_dec(tp, next_skb);
970 sk_stream_free_skb(sk, next_skb);
971 }
972}
973
974/* Do a simple retransmit without using the backoff mechanisms in
975 * tcp_timer. This is used for path mtu discovery.
976 * The socket is already locked here.
977 */
978void tcp_simple_retransmit(struct sock *sk)
979{
980 struct tcp_sock *tp = tcp_sk(sk);
981 struct sk_buff *skb;
982 unsigned int mss = tcp_current_mss(sk, 0);
983 int lost = 0;
984
985 sk_stream_for_retrans_queue(skb, sk) {
986 if (skb->len > mss &&
987 !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
988 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
989 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
990 tp->retrans_out -= tcp_skb_pcount(skb);
991 }
992 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
993 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
994 tp->lost_out += tcp_skb_pcount(skb);
995 lost = 1;
996 }
997 }
998 }
999
1000 if (!lost)
1001 return;
1002
1003 tcp_sync_left_out(tp);
1004
1005 /* Don't muck with the congestion window here.
1006 * Reason is that we do not increase amount of _data_
1007 * in network, but units changed and effective
1008 * cwnd/ssthresh really reduced now.
1009 */
1010 if (tp->ca_state != TCP_CA_Loss) {
1011 tp->high_seq = tp->snd_nxt;
1012 tp->snd_ssthresh = tcp_current_ssthresh(tp);
1013 tp->prior_ssthresh = 0;
1014 tp->undo_marker = 0;
1015 tcp_set_ca_state(tp, TCP_CA_Loss);
1016 }
1017 tcp_xmit_retransmit_queue(sk);
1018}
1019
1020/* This retransmits one SKB. Policy decisions and retransmit queue
1021 * state updates are done by the caller. Returns non-zero if an
1022 * error occurred which prevented the send.
1023 */
1024int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1025{
1026 struct tcp_sock *tp = tcp_sk(sk);
1027 unsigned int cur_mss = tcp_current_mss(sk, 0);
1028 int err;
1029
1030	/* Do not send more than we queued. 1/4 is reserved for possible
1031	 * copying overhead: fragmentation, tunneling, mangling etc.
1032 */
1033 if (atomic_read(&sk->sk_wmem_alloc) >
1034 min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
1035 return -EAGAIN;
1036
1037 if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
1038 if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1039 BUG();
1040
1041 if (sk->sk_route_caps & NETIF_F_TSO) {
1042 sk->sk_route_caps &= ~NETIF_F_TSO;
1043 sock_set_flag(sk, SOCK_NO_LARGESEND);
1044 tp->mss_cache = tp->mss_cache_std;
1045 }
1046
1047 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
1048 return -ENOMEM;
1049 }
1050
1051 /* If receiver has shrunk his window, and skb is out of
1052 * new window, do not retransmit it. The exception is the
1053 * case, when window is shrunk to zero. In this case
1054 * our retransmit serves as a zero window probe.
1055 */
1056 if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
1057 && TCP_SKB_CB(skb)->seq != tp->snd_una)
1058 return -EAGAIN;
1059
1060 if (skb->len > cur_mss) {
1061 int old_factor = tcp_skb_pcount(skb);
1062 int new_factor;
1063
1064 if (tcp_fragment(sk, skb, cur_mss))
1065 return -ENOMEM; /* We'll try again later. */
1066
1067 /* New SKB created, account for it. */
1068 new_factor = tcp_skb_pcount(skb);
1069 tp->packets_out -= old_factor - new_factor;
1070 tp->packets_out += tcp_skb_pcount(skb->next);
1071 }
1072
1073 /* Collapse two adjacent packets if worthwhile and we can. */
1074 if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
1075 (skb->len < (cur_mss >> 1)) &&
1076 (skb->next != sk->sk_send_head) &&
1077 (skb->next != (struct sk_buff *)&sk->sk_write_queue) &&
1078 (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
1079 (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(skb->next) == 1) &&
1080 (sysctl_tcp_retrans_collapse != 0))
1081 tcp_retrans_try_collapse(sk, skb, cur_mss);
1082
1083 if(tp->af_specific->rebuild_header(sk))
1084 return -EHOSTUNREACH; /* Routing failure or similar. */
1085
1086 /* Some Solaris stacks overoptimize and ignore the FIN on a
1087 * retransmit when old data is attached. So strip it off
1088 * since it is cheap to do so and saves bytes on the network.
1089 */
1090 if(skb->len > 0 &&
1091 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
1092 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
1093 if (!pskb_trim(skb, 0)) {
1094 TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
1095 skb_shinfo(skb)->tso_segs = 1;
1096 skb_shinfo(skb)->tso_size = 0;
1097 skb->ip_summed = CHECKSUM_NONE;
1098 skb->csum = 0;
1099 }
1100 }
1101
1102 /* Make a copy, if the first transmission SKB clone we made
1103 * is still in somebody's hands, else make a clone.
1104 */
1105 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1106 tcp_tso_set_push(skb);
1107
1108 err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
1109 pskb_copy(skb, GFP_ATOMIC):
1110 skb_clone(skb, GFP_ATOMIC)));
1111
1112 if (err == 0) {
1113 /* Update global TCP statistics. */
1114 TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
1115
1116 tp->total_retrans++;
1117
1118#if FASTRETRANS_DEBUG > 0
1119 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
1120 if (net_ratelimit())
1121 printk(KERN_DEBUG "retrans_out leaked.\n");
1122 }
1123#endif
1124 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
1125 tp->retrans_out += tcp_skb_pcount(skb);
1126
1127 /* Save stamp of the first retransmit. */
1128 if (!tp->retrans_stamp)
1129 tp->retrans_stamp = TCP_SKB_CB(skb)->when;
1130
1131 tp->undo_retrans++;
1132
1133 /* snd_nxt is stored to detect loss of retransmitted segment,
1134 * see tcp_input.c tcp_sacktag_write_queue().
1135 */
1136 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
1137 }
1138 return err;
1139}
1140
1141/* This gets called after a retransmit timeout, and the initially
1142 * retransmitted data is acknowledged. It tries to continue
1143 * resending the rest of the retransmit queue, until either
1144 * we've sent it all or the congestion window limit is reached.
1145 * If doing SACK, the first ACK which comes back for a timeout
1146 * based retransmit packet might feed us FACK information again.
1147 * If so, we use it to avoid unnecessary retransmissions.
1148 */
1149void tcp_xmit_retransmit_queue(struct sock *sk)
1150{
1151 struct tcp_sock *tp = tcp_sk(sk);
1152 struct sk_buff *skb;
1153 int packet_cnt = tp->lost_out;
1154
1155 /* First pass: retransmit lost packets. */
1156 if (packet_cnt) {
1157 sk_stream_for_retrans_queue(skb, sk) {
1158 __u8 sacked = TCP_SKB_CB(skb)->sacked;
1159
1160 /* Assume this retransmit will generate
1161 * only one packet for congestion window
1162 * calculation purposes. This works because
1163 * tcp_retransmit_skb() will chop up the
1164 * packet to be MSS sized and all the
1165 * packet counting works out.
1166 */
1167 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
1168 return;
1169
1170 if (sacked&TCPCB_LOST) {
1171 if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
1172 if (tcp_retransmit_skb(sk, skb))
1173 return;
1174 if (tp->ca_state != TCP_CA_Loss)
1175 NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
1176 else
1177 NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
1178
1179 if (skb ==
1180 skb_peek(&sk->sk_write_queue))
1181 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1182 }
1183
1184 packet_cnt -= tcp_skb_pcount(skb);
1185 if (packet_cnt <= 0)
1186 break;
1187 }
1188 }
1189 }
1190
1191 /* OK, demanded retransmission is finished. */
1192
1193 /* Forward retransmissions are possible only during Recovery. */
1194 if (tp->ca_state != TCP_CA_Recovery)
1195 return;
1196
1197 /* No forward retransmissions in Reno are possible. */
1198 if (!tp->rx_opt.sack_ok)
1199 return;
1200
1201	/* Yeah, we have to make a difficult choice between forward transmission
1202 * and retransmission... Both ways have their merits...
1203 *
1204 * For now we do not retransmit anything, while we have some new
1205 * segments to send.
1206 */
1207
1208 if (tcp_may_send_now(sk, tp))
1209 return;
1210
1211 packet_cnt = 0;
1212
1213 sk_stream_for_retrans_queue(skb, sk) {
1214 /* Similar to the retransmit loop above we
1215 * can pretend that the retransmitted SKB
1216 * we send out here will be composed of one
1217 * real MSS sized packet because tcp_retransmit_skb()
1218 * will fragment it if necessary.
1219 */
1220 if (++packet_cnt > tp->fackets_out)
1221 break;
1222
1223 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
1224 break;
1225
1226 if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
1227 continue;
1228
1229 /* Ok, retransmit it. */
1230 if (tcp_retransmit_skb(sk, skb))
1231 break;
1232
1233 if (skb == skb_peek(&sk->sk_write_queue))
1234 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1235
1236 NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
1237 }
1238}
1239
1240
1241/* Send a fin. The caller locks the socket for us. This cannot be
1242 * allowed to fail queueing a FIN frame under any circumstances.
1243 */
1244void tcp_send_fin(struct sock *sk)
1245{
1246 struct tcp_sock *tp = tcp_sk(sk);
1247 struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue);
1248 int mss_now;
1249
1250 /* Optimization, tack on the FIN if we have a queue of
1251 * unsent frames. But be careful about outgoing SACKS
1252 * and IP options.
1253 */
1254 mss_now = tcp_current_mss(sk, 1);
1255
1256 if (sk->sk_send_head != NULL) {
1257 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
1258 TCP_SKB_CB(skb)->end_seq++;
1259 tp->write_seq++;
1260 } else {
1261 /* Socket is locked, keep trying until memory is available. */
1262 for (;;) {
1263 skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
1264 if (skb)
1265 break;
1266 yield();
1267 }
1268
1269 /* Reserve space for headers and prepare control bits. */
1270 skb_reserve(skb, MAX_TCP_HEADER);
1271 skb->csum = 0;
1272 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
1273 TCP_SKB_CB(skb)->sacked = 0;
1274 skb_shinfo(skb)->tso_segs = 1;
1275 skb_shinfo(skb)->tso_size = 0;
1276
1277 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
1278 TCP_SKB_CB(skb)->seq = tp->write_seq;
1279 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1280 tcp_queue_skb(sk, skb);
1281 }
1282 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_OFF);
1283}
1284
1285/* We get here when a process closes a file descriptor (either due to
1286 * an explicit close() or as a byproduct of exit()'ing) and there
1287 * was unread data in the receive queue. This behavior is recommended
1288 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
1289 */
1290void tcp_send_active_reset(struct sock *sk, int priority)
1291{
1292 struct tcp_sock *tp = tcp_sk(sk);
1293 struct sk_buff *skb;
1294
1295 /* NOTE: No TCP options attached and we never retransmit this. */
1296 skb = alloc_skb(MAX_TCP_HEADER, priority);
1297 if (!skb) {
1298 NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
1299 return;
1300 }
1301
1302 /* Reserve space for headers and prepare control bits. */
1303 skb_reserve(skb, MAX_TCP_HEADER);
1304 skb->csum = 0;
1305 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
1306 TCP_SKB_CB(skb)->sacked = 0;
1307 skb_shinfo(skb)->tso_segs = 1;
1308 skb_shinfo(skb)->tso_size = 0;
1309
1310 /* Send it off. */
1311 TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
1312 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1313 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1314 if (tcp_transmit_skb(sk, skb))
1315 NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
1316}
1317
1318/* WARNING: This routine must only be called when we have already sent
1319 * a SYN packet that crossed the incoming SYN that caused this routine
1320 * to get called. If this assumption fails then the initial rcv_wnd
1321 * and rcv_wscale values will not be correct.
1322 */
1323int tcp_send_synack(struct sock *sk)
1324{
1325 struct sk_buff* skb;
1326
1327 skb = skb_peek(&sk->sk_write_queue);
1328 if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
1329 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
1330 return -EFAULT;
1331 }
1332 if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
1333 if (skb_cloned(skb)) {
1334 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
1335 if (nskb == NULL)
1336 return -ENOMEM;
1337 __skb_unlink(skb, &sk->sk_write_queue);
1338 skb_header_release(nskb);
1339 __skb_queue_head(&sk->sk_write_queue, nskb);
1340 sk_stream_free_skb(sk, skb);
1341 sk_charge_skb(sk, nskb);
1342 skb = nskb;
1343 }
1344
1345 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
1346 TCP_ECN_send_synack(tcp_sk(sk), skb);
1347 }
1348 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1349 return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1350}
1351
1352/*
1353 * Prepare a SYN-ACK.
1354 */
1355struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1356 struct open_request *req)
1357{
1358 struct tcp_sock *tp = tcp_sk(sk);
1359 struct tcphdr *th;
1360 int tcp_header_size;
1361 struct sk_buff *skb;
1362
1363 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
1364 if (skb == NULL)
1365 return NULL;
1366
1367 /* Reserve space for headers. */
1368 skb_reserve(skb, MAX_TCP_HEADER);
1369
1370 skb->dst = dst_clone(dst);
1371
1372 tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
1373 (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
1374 (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
1375 /* SACK_PERM is in the place of NOP NOP of TS */
1376 ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
1377 skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
1378
1379 memset(th, 0, sizeof(struct tcphdr));
1380 th->syn = 1;
1381 th->ack = 1;
1382 if (dst->dev->features&NETIF_F_TSO)
1383 req->ecn_ok = 0;
1384 TCP_ECN_make_synack(req, th);
1385 th->source = inet_sk(sk)->sport;
1386 th->dest = req->rmt_port;
1387 TCP_SKB_CB(skb)->seq = req->snt_isn;
1388 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1389 TCP_SKB_CB(skb)->sacked = 0;
1390 skb_shinfo(skb)->tso_segs = 1;
1391 skb_shinfo(skb)->tso_size = 0;
1392 th->seq = htonl(TCP_SKB_CB(skb)->seq);
1393 th->ack_seq = htonl(req->rcv_isn + 1);
1394 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
1395 __u8 rcv_wscale;
1396 /* Set this up on the first call only */
1397 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
1398 /* tcp_full_space because it is guaranteed to be the first packet */
1399 tcp_select_initial_window(tcp_full_space(sk),
1400 dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
1401 &req->rcv_wnd,
1402 &req->window_clamp,
1403 req->wscale_ok,
1404 &rcv_wscale);
1405 req->rcv_wscale = rcv_wscale;
1406 }
1407
1408 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
1409 th->window = htons(req->rcv_wnd);
1410
1411 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1412 tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok,
1413 req->sack_ok, req->wscale_ok, req->rcv_wscale,
1414 TCP_SKB_CB(skb)->when,
1415 req->ts_recent);
1416
1417 skb->csum = 0;
1418 th->doff = (tcp_header_size >> 2);
1419 TCP_INC_STATS(TCP_MIB_OUTSEGS);
1420 return skb;
1421}
1422
1423/*
1424 * Do all connect socket setups that can be done AF independent.
1425 */
1426static inline void tcp_connect_init(struct sock *sk)
1427{
1428 struct dst_entry *dst = __sk_dst_get(sk);
1429 struct tcp_sock *tp = tcp_sk(sk);
1430 __u8 rcv_wscale;
1431
1432 /* We'll fix this up when we get a response from the other end.
1433 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
1434 */
1435 tp->tcp_header_len = sizeof(struct tcphdr) +
1436 (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
1437
1438 /* If user gave his TCP_MAXSEG, record it to clamp */
1439 if (tp->rx_opt.user_mss)
1440 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
1441 tp->max_window = 0;
1442 tcp_sync_mss(sk, dst_mtu(dst));
1443
1444 if (!tp->window_clamp)
1445 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
1446 tp->advmss = dst_metric(dst, RTAX_ADVMSS);
1447 tcp_initialize_rcv_mss(sk);
1448 tcp_ca_init(tp);
1449
1450 tcp_select_initial_window(tcp_full_space(sk),
1451 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
1452 &tp->rcv_wnd,
1453 &tp->window_clamp,
1454 sysctl_tcp_window_scaling,
1455 &rcv_wscale);
1456
1457 tp->rx_opt.rcv_wscale = rcv_wscale;
1458 tp->rcv_ssthresh = tp->rcv_wnd;
1459
1460 sk->sk_err = 0;
1461 sock_reset_flag(sk, SOCK_DONE);
1462 tp->snd_wnd = 0;
1463 tcp_init_wl(tp, tp->write_seq, 0);
1464 tp->snd_una = tp->write_seq;
1465 tp->snd_sml = tp->write_seq;
1466 tp->rcv_nxt = 0;
1467 tp->rcv_wup = 0;
1468 tp->copied_seq = 0;
1469
1470 tp->rto = TCP_TIMEOUT_INIT;
1471 tp->retransmits = 0;
1472 tcp_clear_retrans(tp);
1473}
1474
1475/*
1476 * Build a SYN and send it off.
1477 */
1478int tcp_connect(struct sock *sk)
1479{
1480 struct tcp_sock *tp = tcp_sk(sk);
1481 struct sk_buff *buff;
1482
1483 tcp_connect_init(sk);
1484
1485 buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
1486 if (unlikely(buff == NULL))
1487 return -ENOBUFS;
1488
1489 /* Reserve space for headers. */
1490 skb_reserve(buff, MAX_TCP_HEADER);
1491
1492 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
1493 TCP_ECN_send_syn(sk, tp, buff);
1494 TCP_SKB_CB(buff)->sacked = 0;
1495 skb_shinfo(buff)->tso_segs = 1;
1496 skb_shinfo(buff)->tso_size = 0;
1497 buff->csum = 0;
1498 TCP_SKB_CB(buff)->seq = tp->write_seq++;
1499 TCP_SKB_CB(buff)->end_seq = tp->write_seq;
1500 tp->snd_nxt = tp->write_seq;
1501 tp->pushed_seq = tp->write_seq;
1502 tcp_ca_init(tp);
1503
1504 /* Send it off. */
1505 TCP_SKB_CB(buff)->when = tcp_time_stamp;
1506 tp->retrans_stamp = TCP_SKB_CB(buff)->when;
1507 skb_header_release(buff);
1508 __skb_queue_tail(&sk->sk_write_queue, buff);
1509 sk_charge_skb(sk, buff);
1510 tp->packets_out += tcp_skb_pcount(buff);
1511 tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
1512 TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
1513
1514 /* Timer for repeating the SYN until an answer. */
1515 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1516 return 0;
1517}
1518
1519/* Send out a delayed ack, the caller does the policy checking
1520 * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
1521 * for details.
1522 */
1523void tcp_send_delayed_ack(struct sock *sk)
1524{
1525 struct tcp_sock *tp = tcp_sk(sk);
1526 int ato = tp->ack.ato;
1527 unsigned long timeout;
1528
1529 if (ato > TCP_DELACK_MIN) {
1530 int max_ato = HZ/2;
1531
1532 if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
1533 max_ato = TCP_DELACK_MAX;
1534
1535 /* Slow path, intersegment interval is "high". */
1536
1537 /* If some rtt estimate is known, use it to bound delayed ack.
1538 * Do not use tp->rto here, use results of rtt measurements
1539 * directly.
1540 */
1541 if (tp->srtt) {
1542 int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);
1543
1544 if (rtt < max_ato)
1545 max_ato = rtt;
1546 }
1547
1548 ato = min(ato, max_ato);
1549 }
1550
1551 /* Stay within the limit we were given */
1552 timeout = jiffies + ato;
1553
1554	/* Use the new timeout only if there wasn't an older one already. */
1555 if (tp->ack.pending&TCP_ACK_TIMER) {
1556 /* If delack timer was blocked or is about to expire,
1557 * send ACK now.
1558 */
1559 if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
1560 tcp_send_ack(sk);
1561 return;
1562 }
1563
1564 if (!time_before(timeout, tp->ack.timeout))
1565 timeout = tp->ack.timeout;
1566 }
1567 tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
1568 tp->ack.timeout = timeout;
1569 sk_reset_timer(sk, &tp->delack_timer, timeout);
1570}
1571
1572/* This routine sends an ack and also updates the window. */
1573void tcp_send_ack(struct sock *sk)
1574{
1575 /* If we have been reset, we may not send again. */
1576 if (sk->sk_state != TCP_CLOSE) {
1577 struct tcp_sock *tp = tcp_sk(sk);
1578 struct sk_buff *buff;
1579
1580 /* We are not putting this on the write queue, so
1581 * tcp_transmit_skb() will set the ownership to this
1582 * sock.
1583 */
1584 buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1585 if (buff == NULL) {
1586 tcp_schedule_ack(tp);
1587 tp->ack.ato = TCP_ATO_MIN;
1588 tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
1589 return;
1590 }
1591
1592 /* Reserve space for headers and prepare control bits. */
1593 skb_reserve(buff, MAX_TCP_HEADER);
1594 buff->csum = 0;
1595 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
1596 TCP_SKB_CB(buff)->sacked = 0;
1597 skb_shinfo(buff)->tso_segs = 1;
1598 skb_shinfo(buff)->tso_size = 0;
1599
1600 /* Send it off, this clears delayed acks for us. */
1601 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
1602 TCP_SKB_CB(buff)->when = tcp_time_stamp;
1603 tcp_transmit_skb(sk, buff);
1604 }
1605}
1606
1607/* This routine sends a packet with an out of date sequence
1608 * number. It assumes the other end will try to ack it.
1609 *
1610 * Question: what should we do while in urgent mode?
1611 * 4.4BSD forces sending a single byte of data. We cannot send
1612 * out-of-window data, because we have SND.NXT==SND.MAX...
1613 *
1614 * Current solution: send TWO zero-length segments in urgent mode:
1615 * one with SEG.SEQ=SND.UNA to deliver the urgent pointer, and another,
1616 * out-of-date one with SND.UNA-1 to probe the window.
1617 */
1618static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
1619{
1620 struct tcp_sock *tp = tcp_sk(sk);
1621 struct sk_buff *skb;
1622
1623 /* We don't queue it, tcp_transmit_skb() sets ownership. */
1624 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1625 if (skb == NULL)
1626 return -1;
1627
1628 /* Reserve space for headers and set control bits. */
1629 skb_reserve(skb, MAX_TCP_HEADER);
1630 skb->csum = 0;
1631 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
1632 TCP_SKB_CB(skb)->sacked = urgent;
1633 skb_shinfo(skb)->tso_segs = 1;
1634 skb_shinfo(skb)->tso_size = 0;
1635
1636 /* Use a previous sequence. This should cause the other
1637 * end to send an ack. Don't queue or clone SKB, just
1638 * send it.
1639 */
1640 TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
1641 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1642 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1643 return tcp_transmit_skb(sk, skb);
1644}
1645
1646int tcp_write_wakeup(struct sock *sk)
1647{
1648 if (sk->sk_state != TCP_CLOSE) {
1649 struct tcp_sock *tp = tcp_sk(sk);
1650 struct sk_buff *skb;
1651
1652 if ((skb = sk->sk_send_head) != NULL &&
1653 before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
1654 int err;
1655 unsigned int mss = tcp_current_mss(sk, 0);
1656 unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;
1657
1658 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
1659 tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
1660
1661			/* We are probing the opening of a window
1662			 * but the window size is != 0; this must
1663			 * have been the result of sender-side SWS avoidance.
1664			 */
1665 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
1666 skb->len > mss) {
1667 seg_size = min(seg_size, mss);
1668 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1669 if (tcp_fragment(sk, skb, seg_size))
1670 return -1;
1671 /* SWS override triggered forced fragmentation.
1672 * Disable TSO, the connection is too sick. */
1673 if (sk->sk_route_caps & NETIF_F_TSO) {
1674 sock_set_flag(sk, SOCK_NO_LARGESEND);
1675 sk->sk_route_caps &= ~NETIF_F_TSO;
1676 tp->mss_cache = tp->mss_cache_std;
1677 }
1678 } else if (!tcp_skb_pcount(skb))
1679 tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
1680
1681 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1682 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1683 tcp_tso_set_push(skb);
1684 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1685 if (!err) {
1686 update_send_head(sk, tp, skb);
1687 }
1688 return err;
1689 } else {
1690 if (tp->urg_mode &&
1691 between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
1692 tcp_xmit_probe_skb(sk, TCPCB_URG);
1693 return tcp_xmit_probe_skb(sk, 0);
1694 }
1695 }
1696 return -1;
1697}
1698
1699/* A window probe timeout has occurred. If the window is not closed,
1700 * send a partial packet, else send a zero-window probe.
1701 */
1702void tcp_send_probe0(struct sock *sk)
1703{
1704 struct tcp_sock *tp = tcp_sk(sk);
1705 int err;
1706
1707 err = tcp_write_wakeup(sk);
1708
1709 if (tp->packets_out || !sk->sk_send_head) {
1710 /* Cancel probe timer, if it is not required. */
1711 tp->probes_out = 0;
1712 tp->backoff = 0;
1713 return;
1714 }
1715
1716 if (err <= 0) {
1717 if (tp->backoff < sysctl_tcp_retries2)
1718 tp->backoff++;
1719 tp->probes_out++;
1720 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
1721 min(tp->rto << tp->backoff, TCP_RTO_MAX));
1722 } else {
1723 /* If packet was not sent due to local congestion,
1724 * do not backoff and do not remember probes_out.
1725		 * Let local senders fight for local resources.
1726		 *
1727		 * Still use the accumulated backoff, though.
1728 */
1729 if (!tp->probes_out)
1730 tp->probes_out=1;
1731 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
1732 min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
1733 }
1734}
1735
1736EXPORT_SYMBOL(tcp_connect);
1737EXPORT_SYMBOL(tcp_make_synack);
1738EXPORT_SYMBOL(tcp_simple_retransmit);
1739EXPORT_SYMBOL(tcp_sync_mss);
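/* Illustrative sketch, not part of tcp_output.c: the zero-window probe
 * schedule armed by tcp_send_probe0() above is min(rto << backoff, TCP_RTO_MAX),
 * with backoff capped by sysctl_tcp_retries2. The constants below (HZ = 1000,
 * TCP_RTO_MAX = 120 s, a 3 s starting RTO, a retries2 cap of 15) are assumed
 * typical values, not taken from this file.
 */
#include <stdio.h>

#define HZ          1000
#define TCP_RTO_MAX (120 * HZ)

int main(void)
{
	unsigned long rto = 3 * HZ;	/* assumed current smoothed RTO */
	int backoff;

	for (backoff = 0; backoff <= 15; backoff++) {
		unsigned long timeout = rto << backoff;

		if (timeout > TCP_RTO_MAX)
			timeout = TCP_RTO_MAX;	/* same clamp as tcp_send_probe0() */
		printf("backoff %2d: next zero-window probe in %lu.%03lu s\n",
		       backoff, timeout / HZ, timeout % HZ);
	}
	return 0;
}
/* With these assumptions the interval doubles from 3 s and saturates at the
 * 120 s ceiling once backoff reaches 6.
 */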
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
new file mode 100644
index 000000000000..85b279f1e935
--- /dev/null
+++ b/net/ipv4/tcp_timer.c
@@ -0,0 +1,656 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 */
22
23#include <linux/module.h>
24#include <net/tcp.h>
25
26int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
27int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
28int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
29int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
30int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
31int sysctl_tcp_retries1 = TCP_RETR1;
32int sysctl_tcp_retries2 = TCP_RETR2;
33int sysctl_tcp_orphan_retries;
34
35static void tcp_write_timer(unsigned long);
36static void tcp_delack_timer(unsigned long);
37static void tcp_keepalive_timer (unsigned long data);
38
39#ifdef TCP_DEBUG
40const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
41EXPORT_SYMBOL(tcp_timer_bug_msg);
42#endif
43
44/*
45 * Using different timers for retransmit, delayed acks and probes
46 * We may wish to use just one timer maintaining a list of expire jiffies
47 * to optimize.
48 */
49
50void tcp_init_xmit_timers(struct sock *sk)
51{
52 struct tcp_sock *tp = tcp_sk(sk);
53
54 init_timer(&tp->retransmit_timer);
55 tp->retransmit_timer.function=&tcp_write_timer;
56 tp->retransmit_timer.data = (unsigned long) sk;
57 tp->pending = 0;
58
59 init_timer(&tp->delack_timer);
60 tp->delack_timer.function=&tcp_delack_timer;
61 tp->delack_timer.data = (unsigned long) sk;
62 tp->ack.pending = 0;
63
64 init_timer(&sk->sk_timer);
65 sk->sk_timer.function = &tcp_keepalive_timer;
66 sk->sk_timer.data = (unsigned long)sk;
67}
68
69void tcp_clear_xmit_timers(struct sock *sk)
70{
71 struct tcp_sock *tp = tcp_sk(sk);
72
73 tp->pending = 0;
74 sk_stop_timer(sk, &tp->retransmit_timer);
75
76 tp->ack.pending = 0;
77 tp->ack.blocked = 0;
78 sk_stop_timer(sk, &tp->delack_timer);
79
80 sk_stop_timer(sk, &sk->sk_timer);
81}
82
83static void tcp_write_err(struct sock *sk)
84{
85 sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
86 sk->sk_error_report(sk);
87
88 tcp_done(sk);
89 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
90}
91
92/* Do not allow orphaned sockets to eat all our resources.
93 * This is a direct violation of the TCP specs, but it is required
94 * to prevent DoS attacks. It is called when a retransmission timeout
95 * or zero window probe timeout occurs on an orphaned socket.
96 *
97 * The criterion is still not confirmed experimentally and may change.
98 * We kill the socket if:
99 * 1. the number of orphaned sockets exceeds an administratively
100 *    configured limit, or
101 * 2. we are under strong memory pressure.
102 */
103static int tcp_out_of_resources(struct sock *sk, int do_reset)
104{
105 struct tcp_sock *tp = tcp_sk(sk);
106 int orphans = atomic_read(&tcp_orphan_count);
107
108	/* If the peer does not open its window for a long time, or has not
109	 * transmitted anything for a long time, penalize it. */
110 if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
111 orphans <<= 1;
112
113 /* If some dubious ICMP arrived, penalize even more. */
114 if (sk->sk_err_soft)
115 orphans <<= 1;
116
117 if (orphans >= sysctl_tcp_max_orphans ||
118 (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
119 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
120 if (net_ratelimit())
121 printk(KERN_INFO "Out of socket memory\n");
122
123 /* Catch exceptional cases, when connection requires reset.
124 * 1. Last segment was sent recently. */
125 if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
126 /* 2. Window is closed. */
127 (!tp->snd_wnd && !tp->packets_out))
128 do_reset = 1;
129 if (do_reset)
130 tcp_send_active_reset(sk, GFP_ATOMIC);
131 tcp_done(sk);
132 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
133 return 1;
134 }
135 return 0;
136}
137
138/* Calculate the maximal number of retries on an orphaned socket. */
139static int tcp_orphan_retries(struct sock *sk, int alive)
140{
141 int retries = sysctl_tcp_orphan_retries; /* May be zero. */
142
143 /* We know from an ICMP that something is wrong. */
144 if (sk->sk_err_soft && !alive)
145 retries = 0;
146
147	/* However, if the socket sent something recently, select some safe
148	 * number of retries. 8 corresponds to >100 seconds with a minimal
149	 * RTO of 200 msec. */
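	/* Assuming plain exponential doubling from that 200 ms floor, the
	 * initial timeout plus 8 backed-off retries sum to
	 * 0.2 + 0.4 + ... + 51.2 = 0.2 * (2^9 - 1), about 102 seconds,
	 * which is where the ">100 seconds" figure above comes from.
	 */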
150 if (retries == 0 && alive)
151 retries = 8;
152 return retries;
153}
154
155/* A write timeout has occurred. Process the after effects. */
156static int tcp_write_timeout(struct sock *sk)
157{
158 struct tcp_sock *tp = tcp_sk(sk);
159 int retry_until;
160
161 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
162 if (tp->retransmits)
163 dst_negative_advice(&sk->sk_dst_cache);
164 retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
165 } else {
166 if (tp->retransmits >= sysctl_tcp_retries1) {
167 /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
168 hole detection. :-(
169
170		   This is the place to implement it. It is not implemented. I do not
171		   want to implement it. It is disgusting. It does not work in any
172		   case. Let me cite the same draft, which requires us
173		   to implement this:
174
175 "The one security concern raised by this memo is that ICMP black holes
176 are often caused by over-zealous security administrators who block
177 all ICMP messages. It is vitally important that those who design and
178 deploy security systems understand the impact of strict filtering on
179 upper-layer protocols. The safest web site in the world is worthless
180 if most TCP implementations cannot transfer data from it. It would
181 be far nicer to have all of the black holes fixed rather than fixing
182 all of the TCP implementations."
183
184 Golden words :-).
185 */
186
187 dst_negative_advice(&sk->sk_dst_cache);
188 }
189
190 retry_until = sysctl_tcp_retries2;
191 if (sock_flag(sk, SOCK_DEAD)) {
192 int alive = (tp->rto < TCP_RTO_MAX);
193
194 retry_until = tcp_orphan_retries(sk, alive);
195
196 if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
197 return 1;
198 }
199 }
200
201 if (tp->retransmits >= retry_until) {
202 /* Has it gone just too far? */
203 tcp_write_err(sk);
204 return 1;
205 }
206 return 0;
207}
208
209static void tcp_delack_timer(unsigned long data)
210{
211 struct sock *sk = (struct sock*)data;
212 struct tcp_sock *tp = tcp_sk(sk);
213
214 bh_lock_sock(sk);
215 if (sock_owned_by_user(sk)) {
216 /* Try again later. */
217 tp->ack.blocked = 1;
218 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
219 sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN);
220 goto out_unlock;
221 }
222
223 sk_stream_mem_reclaim(sk);
224
225 if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER))
226 goto out;
227
228 if (time_after(tp->ack.timeout, jiffies)) {
229 sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
230 goto out;
231 }
232 tp->ack.pending &= ~TCP_ACK_TIMER;
233
234 if (skb_queue_len(&tp->ucopy.prequeue)) {
235 struct sk_buff *skb;
236
237 NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED,
238 skb_queue_len(&tp->ucopy.prequeue));
239
240 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
241 sk->sk_backlog_rcv(sk, skb);
242
243 tp->ucopy.memory = 0;
244 }
245
246 if (tcp_ack_scheduled(tp)) {
247 if (!tp->ack.pingpong) {
248 /* Delayed ACK missed: inflate ATO. */
249 tp->ack.ato = min(tp->ack.ato << 1, tp->rto);
250 } else {
251 /* Delayed ACK missed: leave pingpong mode and
252 * deflate ATO.
253 */
254 tp->ack.pingpong = 0;
255 tp->ack.ato = TCP_ATO_MIN;
256 }
257 tcp_send_ack(sk);
258 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
259 }
260 TCP_CHECK_TIMER(sk);
261
262out:
263 if (tcp_memory_pressure)
264 sk_stream_mem_reclaim(sk);
265out_unlock:
266 bh_unlock_sock(sk);
267 sock_put(sk);
268}
269
270static void tcp_probe_timer(struct sock *sk)
271{
272 struct tcp_sock *tp = tcp_sk(sk);
273 int max_probes;
274
275 if (tp->packets_out || !sk->sk_send_head) {
276 tp->probes_out = 0;
277 return;
278 }
279
280 /* *WARNING* RFC 1122 forbids this
281 *
282 * It doesn't AFAIK, because we kill the retransmit timer -AK
283 *
284 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
285 * this behaviour in Solaris down as a bug fix. [AC]
286 *
287	 * Let me explain. probes_out is zeroed by incoming ACKs
288	 * even if they advertise a zero window. Hence, the connection is killed
289	 * only if we received no ACKs for the normal connection timeout. It is
290	 * not killed merely because the window stays zero for some time; the
291	 * window may stay zero until armageddon and even later. We are in full
292	 * accordance with the RFCs; only our probe timer combines both the
293	 * retransmission timeout and the probe timeout in one bottle. --ANK
294 */
295 max_probes = sysctl_tcp_retries2;
296
297 if (sock_flag(sk, SOCK_DEAD)) {
298 int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);
299
300 max_probes = tcp_orphan_retries(sk, alive);
301
302 if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
303 return;
304 }
305
306 if (tp->probes_out > max_probes) {
307 tcp_write_err(sk);
308 } else {
309 /* Only send another probe if we didn't close things up. */
310 tcp_send_probe0(sk);
311 }
312}
313
314/*
315 * The TCP retransmit timer.
316 */
317
318static void tcp_retransmit_timer(struct sock *sk)
319{
320 struct tcp_sock *tp = tcp_sk(sk);
321
322 if (!tp->packets_out)
323 goto out;
324
325 BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));
326
327 if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
328 !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
329		/* The receiver dastardly shrinks its window. Our retransmits
330		 * become zero-window probes, but we should not time out this
331		 * connection. If the socket is an orphan, though, time it out;
332		 * we cannot allow such beasts to hang around indefinitely.
333 */
334#ifdef TCP_DEBUG
335 if (net_ratelimit()) {
336 struct inet_sock *inet = inet_sk(sk);
337 printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
338 NIPQUAD(inet->daddr), htons(inet->dport),
339 inet->num, tp->snd_una, tp->snd_nxt);
340 }
341#endif
342 if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
343 tcp_write_err(sk);
344 goto out;
345 }
346 tcp_enter_loss(sk, 0);
347 tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
348 __sk_dst_reset(sk);
349 goto out_reset_timer;
350 }
351
352 if (tcp_write_timeout(sk))
353 goto out;
354
355 if (tp->retransmits == 0) {
356 if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
357 if (tp->rx_opt.sack_ok) {
358 if (tp->ca_state == TCP_CA_Recovery)
359 NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
360 else
361 NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
362 } else {
363 if (tp->ca_state == TCP_CA_Recovery)
364 NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
365 else
366 NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
367 }
368 } else if (tp->ca_state == TCP_CA_Loss) {
369 NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
370 } else {
371 NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
372 }
373 }
374
375 if (tcp_use_frto(sk)) {
376 tcp_enter_frto(sk);
377 } else {
378 tcp_enter_loss(sk, 0);
379 }
380
381 if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
382 /* Retransmission failed because of local congestion,
383 * do not backoff.
384 */
385 if (!tp->retransmits)
386 tp->retransmits=1;
387 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
388 min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
389 goto out;
390 }
391
392 /* Increase the timeout each time we retransmit. Note that
393 * we do not increase the rtt estimate. rto is initialized
394 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
395 * that doubling rto each time is the least we can get away with.
396 * In KA9Q, Karn uses this for the first few times, and then
397 * goes to quadratic. netBSD doubles, but only goes up to *64,
398 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
399 * defined in the protocol as the maximum possible RTT. I guess
400 * we'll have to use something other than TCP to talk to the
401 * University of Mars.
402 *
403 * PAWS allows us longer timeouts and large windows, so once
404 * implemented ftp to mars will work nicely. We will have to fix
405 * the 120 second clamps though!
406 */
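	/* Concretely, assuming TCP_TIMEOUT_INIT of 3 seconds and the 120
	 * second TCP_RTO_MAX clamp, successive retransmissions of the same
	 * segment fire roughly 3, 6, 12, 24, 48, 96, 120, 120, ... seconds
	 * apart (ignoring RTT updates in between).
	 */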
407 tp->backoff++;
408 tp->retransmits++;
409
410out_reset_timer:
411 tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
412 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
413 if (tp->retransmits > sysctl_tcp_retries1)
414 __sk_dst_reset(sk);
415
416out:;
417}
418
419static void tcp_write_timer(unsigned long data)
420{
421 struct sock *sk = (struct sock*)data;
422 struct tcp_sock *tp = tcp_sk(sk);
423 int event;
424
425 bh_lock_sock(sk);
426 if (sock_owned_by_user(sk)) {
427 /* Try again later */
428 sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20));
429 goto out_unlock;
430 }
431
432 if (sk->sk_state == TCP_CLOSE || !tp->pending)
433 goto out;
434
435 if (time_after(tp->timeout, jiffies)) {
436 sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
437 goto out;
438 }
439
440 event = tp->pending;
441 tp->pending = 0;
442
443 switch (event) {
444 case TCP_TIME_RETRANS:
445 tcp_retransmit_timer(sk);
446 break;
447 case TCP_TIME_PROBE0:
448 tcp_probe_timer(sk);
449 break;
450 }
451 TCP_CHECK_TIMER(sk);
452
453out:
454 sk_stream_mem_reclaim(sk);
455out_unlock:
456 bh_unlock_sock(sk);
457 sock_put(sk);
458}
459
460/*
461 * Timer for listening sockets
462 */
463
464static void tcp_synack_timer(struct sock *sk)
465{
466 struct tcp_sock *tp = tcp_sk(sk);
467 struct tcp_listen_opt *lopt = tp->listen_opt;
468 int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
469 int thresh = max_retries;
470 unsigned long now = jiffies;
471 struct open_request **reqp, *req;
472 int i, budget;
473
474 if (lopt == NULL || lopt->qlen == 0)
475 return;
476
477	/* Normally all the openreqs are young and become mature
478	 * (i.e. converted to an established socket) by the first timeout.
479	 * If the synack was not acknowledged within 3 seconds, it means
480	 * one of the following things: the synack was lost, the ack was lost,
481	 * the rtt is high, or nobody planned to ack (i.e. a synflood).
482	 * When the server is a bit loaded, the queue is populated with old
483	 * open requests, reducing the effective size of the queue.
484	 * When the server is heavily loaded, the queue size drops to zero
485	 * after several minutes of work. That is not a synflood,
486	 * it is normal operation. The solution is to prune entries that
487	 * are too old, overriding the normal timeout, when the
488	 * situation becomes dangerous.
489	 *
490	 * Essentially, we reserve half of the room for young
491	 * embryos and abort old ones without pity, if the old
492	 * ones are about to clog our table.
493	 */
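	/* Concretely, assuming the queue is more than half full with
	 * qlen = 256 and qlen_young = 64, and thresh starts at the default
	 * of 5 SYN-ACK retries, the loop below lowers thresh to 3, so
	 * unanswered requests are dropped after three SYN-ACK
	 * retransmissions instead of five, freeing room for new ones.
	 */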
494 if (lopt->qlen>>(lopt->max_qlen_log-1)) {
495 int young = (lopt->qlen_young<<1);
496
497 while (thresh > 2) {
498 if (lopt->qlen < young)
499 break;
500 thresh--;
501 young <<= 1;
502 }
503 }
504
505 if (tp->defer_accept)
506 max_retries = tp->defer_accept;
507
508 budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
509 i = lopt->clock_hand;
510
511 do {
512 reqp=&lopt->syn_table[i];
513 while ((req = *reqp) != NULL) {
514 if (time_after_eq(now, req->expires)) {
515 if ((req->retrans < thresh ||
516 (req->acked && req->retrans < max_retries))
517 && !req->class->rtx_syn_ack(sk, req, NULL)) {
518 unsigned long timeo;
519
520 if (req->retrans++ == 0)
521 lopt->qlen_young--;
522 timeo = min((TCP_TIMEOUT_INIT << req->retrans),
523 TCP_RTO_MAX);
524 req->expires = now + timeo;
525 reqp = &req->dl_next;
526 continue;
527 }
528
529 /* Drop this request */
530 write_lock(&tp->syn_wait_lock);
531 *reqp = req->dl_next;
532 write_unlock(&tp->syn_wait_lock);
533 lopt->qlen--;
534 if (req->retrans == 0)
535 lopt->qlen_young--;
536 tcp_openreq_free(req);
537 continue;
538 }
539 reqp = &req->dl_next;
540 }
541
542 i = (i+1)&(TCP_SYNQ_HSIZE-1);
543
544 } while (--budget > 0);
545
546 lopt->clock_hand = i;
547
548 if (lopt->qlen)
549 tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
550}
551
552void tcp_delete_keepalive_timer (struct sock *sk)
553{
554 sk_stop_timer(sk, &sk->sk_timer);
555}
556
557void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
558{
559 sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
560}
561
562void tcp_set_keepalive(struct sock *sk, int val)
563{
564 if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
565 return;
566
567 if (val && !sock_flag(sk, SOCK_KEEPOPEN))
568 tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
569 else if (!val)
570 tcp_delete_keepalive_timer(sk);
571}
572
573
574static void tcp_keepalive_timer (unsigned long data)
575{
576 struct sock *sk = (struct sock *) data;
577 struct tcp_sock *tp = tcp_sk(sk);
578 __u32 elapsed;
579
580 /* Only process if socket is not in use. */
581 bh_lock_sock(sk);
582 if (sock_owned_by_user(sk)) {
583 /* Try again later. */
584 tcp_reset_keepalive_timer (sk, HZ/20);
585 goto out;
586 }
587
588 if (sk->sk_state == TCP_LISTEN) {
589 tcp_synack_timer(sk);
590 goto out;
591 }
592
593 if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
594 if (tp->linger2 >= 0) {
595 int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;
596
597 if (tmo > 0) {
598 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
599 goto out;
600 }
601 }
602 tcp_send_active_reset(sk, GFP_ATOMIC);
603 goto death;
604 }
605
606 if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
607 goto out;
608
609 elapsed = keepalive_time_when(tp);
610
611 /* It is alive without keepalive 8) */
612 if (tp->packets_out || sk->sk_send_head)
613 goto resched;
614
615 elapsed = tcp_time_stamp - tp->rcv_tstamp;
616
617 if (elapsed >= keepalive_time_when(tp)) {
618 if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
619 (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
620 tcp_send_active_reset(sk, GFP_ATOMIC);
621 tcp_write_err(sk);
622 goto out;
623 }
624 if (tcp_write_wakeup(sk) <= 0) {
625 tp->probes_out++;
626 elapsed = keepalive_intvl_when(tp);
627 } else {
628 /* If keepalive was lost due to local congestion,
629 * try harder.
630 */
631 elapsed = TCP_RESOURCE_PROBE_INTERVAL;
632 }
633 } else {
634 /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
635 elapsed = keepalive_time_when(tp) - elapsed;
636 }
637
638 TCP_CHECK_TIMER(sk);
639 sk_stream_mem_reclaim(sk);
640
641resched:
642 tcp_reset_keepalive_timer (sk, elapsed);
643 goto out;
644
645death:
646 tcp_done(sk);
647
648out:
649 bh_unlock_sock(sk);
650 sock_put(sk);
651}
652
653EXPORT_SYMBOL(tcp_clear_xmit_timers);
654EXPORT_SYMBOL(tcp_delete_keepalive_timer);
655EXPORT_SYMBOL(tcp_init_xmit_timers);
656EXPORT_SYMBOL(tcp_reset_keepalive_timer);
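/* Illustrative sketch, not part of tcp_timer.c: the keepalive path above
 * probes after keepalive_time_when() of idleness, then every
 * keepalive_intvl_when(), and gives up after the configured number of
 * unanswered probes. The defaults assumed below (2 hours, 75 seconds,
 * 9 probes) are the usual values behind the sysctl_tcp_keepalive_* knobs
 * declared at the top of this file.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int keepalive_time   = 2 * 60 * 60; /* idle seconds before the first probe */
	const unsigned int keepalive_intvl  = 75;           /* seconds between probes */
	const unsigned int keepalive_probes = 9;             /* unanswered probes tolerated */
	unsigned int dead_after = keepalive_time + keepalive_probes * keepalive_intvl;

	printf("a silent peer is declared dead roughly %u seconds (%u minutes) after its last ACK\n",
	       dead_after, dead_after / 60);
	return 0;
}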
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
new file mode 100644
index 000000000000..6baddfbedca3
--- /dev/null
+++ b/net/ipv4/udp.c
@@ -0,0 +1,1575 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The User Datagram Protocol (UDP).
7 *
8 * Version: $Id: udp.c,v 1.102 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
13 * Alan Cox, <Alan.Cox@linux.org>
14 * Hirokazu Takahashi, <taka@valinux.co.jp>
15 *
16 * Fixes:
17 * Alan Cox : verify_area() calls
18 * Alan Cox : stopped close while in use off icmp
19 * messages. Not a fix but a botch that
20 * for udp at least is 'valid'.
21 * Alan Cox : Fixed icmp handling properly
22 * Alan Cox : Correct error for oversized datagrams
23 * Alan Cox : Tidied select() semantics.
24 * Alan Cox : udp_err() fixed properly, also now
25 * select and read wake correctly on errors
26 * Alan Cox : udp_send verify_area moved to avoid mem leak
27 * Alan Cox : UDP can count its memory
28 * Alan Cox : send to an unknown connection causes
29 * an ECONNREFUSED off the icmp, but
30 * does NOT close.
31 * Alan Cox : Switched to new sk_buff handlers. No more backlog!
32 * Alan Cox : Using generic datagram code. Even smaller and the PEEK
33 * bug no longer crashes it.
34 * Fred Van Kempen : Net2e support for sk->broadcast.
35 * Alan Cox : Uses skb_free_datagram
36 * Alan Cox : Added get/set sockopt support.
37 * Alan Cox : Broadcasting without option set returns EACCES.
38 * Alan Cox : No wakeup calls. Instead we now use the callbacks.
39 * Alan Cox : Use ip_tos and ip_ttl
40 * Alan Cox : SNMP Mibs
41 * Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support.
42 * Matt Dillon : UDP length checks.
43 * Alan Cox : Smarter af_inet used properly.
44 * Alan Cox : Use new kernel side addressing.
45 * Alan Cox : Incorrect return on truncated datagram receive.
46 * Arnt Gulbrandsen : New udp_send and stuff
47 * Alan Cox : Cache last socket
48 * Alan Cox : Route cache
49 * Jon Peatfield : Minor efficiency fix to sendto().
50 * Mike Shaver : RFC1122 checks.
51 * Alan Cox : Nonblocking error fix.
52 * Willy Konynenberg : Transparent proxying support.
53 * Mike McLagan : Routing by source
54 * David S. Miller : New socket lookup architecture.
55 * Last socket cache retained as it
56 * does have a high hit rate.
57 * Olaf Kirch : Don't linearise iovec on sendmsg.
58 * Andi Kleen : Some cleanups, cache destination entry
59 * for connect.
60 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
61 * Melvin Smith : Check msg_name not msg_namelen in sendto(),
62 * return ENOTCONN for unconnected sockets (POSIX)
63 * Janos Farkas : don't deliver multi/broadcasts to a different
64 * bound-to-device socket
65 * Hirokazu Takahashi : HW checksumming for outgoing UDP
66 * datagrams.
67 * Hirokazu Takahashi : sendfile() on UDP works now.
68 * Arnaldo C. Melo : convert /proc/net/udp to seq_file
69 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
70 * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind
71 * a single port at the same time.
72 *	Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
73 *
74 *
75 * This program is free software; you can redistribute it and/or
76 * modify it under the terms of the GNU General Public License
77 * as published by the Free Software Foundation; either version
78 * 2 of the License, or (at your option) any later version.
79 */
80
81#include <asm/system.h>
82#include <asm/uaccess.h>
83#include <asm/ioctls.h>
84#include <linux/types.h>
85#include <linux/fcntl.h>
86#include <linux/module.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/in.h>
90#include <linux/errno.h>
91#include <linux/timer.h>
92#include <linux/mm.h>
93#include <linux/config.h>
94#include <linux/inet.h>
95#include <linux/ipv6.h>
96#include <linux/netdevice.h>
97#include <net/snmp.h>
98#include <net/tcp.h>
99#include <net/protocol.h>
100#include <linux/skbuff.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <net/sock.h>
104#include <net/udp.h>
105#include <net/icmp.h>
106#include <net/route.h>
107#include <net/inet_common.h>
108#include <net/checksum.h>
109#include <net/xfrm.h>
110
111/*
112 * Snmp MIB for the UDP layer
113 */
114
115DEFINE_SNMP_STAT(struct udp_mib, udp_statistics);
116
117struct hlist_head udp_hash[UDP_HTABLE_SIZE];
118DEFINE_RWLOCK(udp_hash_lock);
119
120/* Shared by v4/v6 udp. */
121int udp_port_rover;
122
123static int udp_v4_get_port(struct sock *sk, unsigned short snum)
124{
125 struct hlist_node *node;
126 struct sock *sk2;
127 struct inet_sock *inet = inet_sk(sk);
128
129 write_lock_bh(&udp_hash_lock);
130 if (snum == 0) {
131 int best_size_so_far, best, result, i;
132
133 if (udp_port_rover > sysctl_local_port_range[1] ||
134 udp_port_rover < sysctl_local_port_range[0])
135 udp_port_rover = sysctl_local_port_range[0];
136 best_size_so_far = 32767;
137 best = result = udp_port_rover;
138 for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
139 struct hlist_head *list;
140 int size;
141
142 list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)];
143 if (hlist_empty(list)) {
144 if (result > sysctl_local_port_range[1])
145 result = sysctl_local_port_range[0] +
146 ((result - sysctl_local_port_range[0]) &
147 (UDP_HTABLE_SIZE - 1));
148 goto gotit;
149 }
150 size = 0;
151 sk_for_each(sk2, node, list)
152 if (++size >= best_size_so_far)
153 goto next;
154 best_size_so_far = size;
155 best = result;
156 next:;
157 }
158 result = best;
159 for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) {
160 if (result > sysctl_local_port_range[1])
161 result = sysctl_local_port_range[0]
162 + ((result - sysctl_local_port_range[0]) &
163 (UDP_HTABLE_SIZE - 1));
164 if (!udp_lport_inuse(result))
165 break;
166 }
167 if (i >= (1 << 16) / UDP_HTABLE_SIZE)
168 goto fail;
169gotit:
170 udp_port_rover = snum = result;
171 } else {
172 sk_for_each(sk2, node,
173 &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) {
174 struct inet_sock *inet2 = inet_sk(sk2);
175
176 if (inet2->num == snum &&
177 sk2 != sk &&
178 !ipv6_only_sock(sk2) &&
179 (!sk2->sk_bound_dev_if ||
180 !sk->sk_bound_dev_if ||
181 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
182 (!inet2->rcv_saddr ||
183 !inet->rcv_saddr ||
184 inet2->rcv_saddr == inet->rcv_saddr) &&
185 (!sk2->sk_reuse || !sk->sk_reuse))
186 goto fail;
187 }
188 }
189 inet->num = snum;
190 if (sk_unhashed(sk)) {
191 struct hlist_head *h = &udp_hash[snum & (UDP_HTABLE_SIZE - 1)];
192
193 sk_add_node(sk, h);
194 sock_prot_inc_use(sk->sk_prot);
195 }
196 write_unlock_bh(&udp_hash_lock);
197 return 0;
198
199fail:
200 write_unlock_bh(&udp_hash_lock);
201 return 1;
202}
203
204static void udp_v4_hash(struct sock *sk)
205{
206 BUG();
207}
208
209static void udp_v4_unhash(struct sock *sk)
210{
211 write_lock_bh(&udp_hash_lock);
212 if (sk_del_node_init(sk)) {
213 inet_sk(sk)->num = 0;
214 sock_prot_dec_use(sk->sk_prot);
215 }
216 write_unlock_bh(&udp_hash_lock);
217}
218
219/* UDP nearly always has wildcards out the wazoo, so it makes no sense to
220 * try harder than this. -DaveM
221 */
222static struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport,
223 u32 daddr, u16 dport, int dif)
224{
225 struct sock *sk, *result = NULL;
226 struct hlist_node *node;
227 unsigned short hnum = ntohs(dport);
228 int badness = -1;
229
230 sk_for_each(sk, node, &udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]) {
231 struct inet_sock *inet = inet_sk(sk);
232
233 if (inet->num == hnum && !ipv6_only_sock(sk)) {
234 int score = (sk->sk_family == PF_INET ? 1 : 0);
235 if (inet->rcv_saddr) {
236 if (inet->rcv_saddr != daddr)
237 continue;
238 score+=2;
239 }
240 if (inet->daddr) {
241 if (inet->daddr != saddr)
242 continue;
243 score+=2;
244 }
245 if (inet->dport) {
246 if (inet->dport != sport)
247 continue;
248 score+=2;
249 }
250 if (sk->sk_bound_dev_if) {
251 if (sk->sk_bound_dev_if != dif)
252 continue;
253 score+=2;
254 }
255 if(score == 9) {
256 result = sk;
257 break;
258 } else if(score > badness) {
259 result = sk;
260 badness = score;
261 }
262 }
263 }
264 return result;
265}
266
267static __inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport,
268 u32 daddr, u16 dport, int dif)
269{
270 struct sock *sk;
271
272 read_lock(&udp_hash_lock);
273 sk = udp_v4_lookup_longway(saddr, sport, daddr, dport, dif);
274 if (sk)
275 sock_hold(sk);
276 read_unlock(&udp_hash_lock);
277 return sk;
278}
279
280static inline struct sock *udp_v4_mcast_next(struct sock *sk,
281 u16 loc_port, u32 loc_addr,
282 u16 rmt_port, u32 rmt_addr,
283 int dif)
284{
285 struct hlist_node *node;
286 struct sock *s = sk;
287 unsigned short hnum = ntohs(loc_port);
288
289 sk_for_each_from(s, node) {
290 struct inet_sock *inet = inet_sk(s);
291
292 if (inet->num != hnum ||
293 (inet->daddr && inet->daddr != rmt_addr) ||
294 (inet->dport != rmt_port && inet->dport) ||
295 (inet->rcv_saddr && inet->rcv_saddr != loc_addr) ||
296 ipv6_only_sock(s) ||
297 (s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
298 continue;
299 if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
300 continue;
301 goto found;
302 }
303 s = NULL;
304found:
305 return s;
306}
307
308/*
309 * This routine is called by the ICMP module when it gets some
310 * sort of error condition. If err < 0 then the socket should
311 * be closed and the error returned to the user. If err > 0
312 * it's just the icmp type << 8 | icmp code.
313 * Header points to the ip header of the error packet. We move
314 * on past this. Then (as it used to claim before adjustment)
315 * header points to the first 8 bytes of the udp header. We need
316 * to find the appropriate port.
317 */
318
319void udp_err(struct sk_buff *skb, u32 info)
320{
321 struct inet_sock *inet;
322 struct iphdr *iph = (struct iphdr*)skb->data;
323 struct udphdr *uh = (struct udphdr*)(skb->data+(iph->ihl<<2));
324 int type = skb->h.icmph->type;
325 int code = skb->h.icmph->code;
326 struct sock *sk;
327 int harderr;
328 int err;
329
330 sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex);
331 if (sk == NULL) {
332 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
333 return; /* No socket for error */
334 }
335
336 err = 0;
337 harderr = 0;
338 inet = inet_sk(sk);
339
340 switch (type) {
341 default:
342 case ICMP_TIME_EXCEEDED:
343 err = EHOSTUNREACH;
344 break;
345 case ICMP_SOURCE_QUENCH:
346 goto out;
347 case ICMP_PARAMETERPROB:
348 err = EPROTO;
349 harderr = 1;
350 break;
351 case ICMP_DEST_UNREACH:
352 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
353 if (inet->pmtudisc != IP_PMTUDISC_DONT) {
354 err = EMSGSIZE;
355 harderr = 1;
356 break;
357 }
358 goto out;
359 }
360 err = EHOSTUNREACH;
361 if (code <= NR_ICMP_UNREACH) {
362 harderr = icmp_err_convert[code].fatal;
363 err = icmp_err_convert[code].errno;
364 }
365 break;
366 }
367
368 /*
369 * RFC1122: OK. Passes ICMP errors back to application, as per
370 * 4.1.3.3.
371 */
372 if (!inet->recverr) {
373 if (!harderr || sk->sk_state != TCP_ESTABLISHED)
374 goto out;
375 } else {
376 ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1));
377 }
378 sk->sk_err = err;
379 sk->sk_error_report(sk);
380out:
381 sock_put(sk);
382}
383
384/*
385 * Throw away all pending data and cancel the corking. Socket is locked.
386 */
387static void udp_flush_pending_frames(struct sock *sk)
388{
389 struct udp_sock *up = udp_sk(sk);
390
391 if (up->pending) {
392 up->len = 0;
393 up->pending = 0;
394 ip_flush_pending_frames(sk);
395 }
396}
397
398/*
399 * Push out all pending data as one UDP datagram. Socket is locked.
400 */
401static int udp_push_pending_frames(struct sock *sk, struct udp_sock *up)
402{
403 struct inet_sock *inet = inet_sk(sk);
404 struct flowi *fl = &inet->cork.fl;
405 struct sk_buff *skb;
406 struct udphdr *uh;
407 int err = 0;
408
409 /* Grab the skbuff where UDP header space exists. */
410 if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
411 goto out;
412
413 /*
414 * Create a UDP header
415 */
416 uh = skb->h.uh;
417 uh->source = fl->fl_ip_sport;
418 uh->dest = fl->fl_ip_dport;
419 uh->len = htons(up->len);
420 uh->check = 0;
421
422 if (sk->sk_no_check == UDP_CSUM_NOXMIT) {
423 skb->ip_summed = CHECKSUM_NONE;
424 goto send;
425 }
426
427 if (skb_queue_len(&sk->sk_write_queue) == 1) {
428 /*
429 * Only one fragment on the socket.
430 */
431 if (skb->ip_summed == CHECKSUM_HW) {
432 skb->csum = offsetof(struct udphdr, check);
433 uh->check = ~csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst,
434 up->len, IPPROTO_UDP, 0);
435 } else {
436 skb->csum = csum_partial((char *)uh,
437 sizeof(struct udphdr), skb->csum);
438 uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst,
439 up->len, IPPROTO_UDP, skb->csum);
440 if (uh->check == 0)
441 uh->check = -1;
442 }
443 } else {
444 unsigned int csum = 0;
445 /*
446		 * HW checksumming won't work here: there are two or more
447		 * fragments on the socket, so the csums of all the sk_buffs
448		 * have to be combined by software.
449 */
450 if (skb->ip_summed == CHECKSUM_HW) {
451 int offset = (unsigned char *)uh - skb->data;
452 skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
453
454 skb->ip_summed = CHECKSUM_NONE;
455 } else {
456 skb->csum = csum_partial((char *)uh,
457 sizeof(struct udphdr), skb->csum);
458 }
459
460 skb_queue_walk(&sk->sk_write_queue, skb) {
461 csum = csum_add(csum, skb->csum);
462 }
463 uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst,
464 up->len, IPPROTO_UDP, csum);
465 if (uh->check == 0)
466 uh->check = -1;
467 }
468send:
469 err = ip_push_pending_frames(sk);
470out:
471 up->len = 0;
472 up->pending = 0;
473 return err;
474}
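/* Sketch of the arithmetic above, with made-up numbers (this is not the
 * kernel csum API itself): each queued fragment contributes a 32-bit
 * partial sum from csum_partial() or the hardware, and csum_add() keeps
 * one running total, e.g. 0x00012345 + 0x0000ffff = 0x00022344.
 * csum_tcpudp_magic() then mixes in the pseudo-header (saddr, daddr,
 * length, IPPROTO_UDP), folds the carries back into 16 bits
 * (0x00022344 -> 0x2344 + 0x2 = 0x2346) and one's-complements the result
 * to produce the value written to uh->check.
 */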
475
476
477static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base)
478{
479 return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base));
480}
481
482int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
483 size_t len)
484{
485 struct inet_sock *inet = inet_sk(sk);
486 struct udp_sock *up = udp_sk(sk);
487 int ulen = len;
488 struct ipcm_cookie ipc;
489 struct rtable *rt = NULL;
490 int free = 0;
491 int connected = 0;
492 u32 daddr, faddr, saddr;
493 u16 dport;
494 u8 tos;
495 int err;
496 int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
497
498 if (len > 0xFFFF)
499 return -EMSGSIZE;
500
501 /*
502 * Check the flags.
503 */
504
505 if (msg->msg_flags&MSG_OOB) /* Mirror BSD error message compatibility */
506 return -EOPNOTSUPP;
507
508 ipc.opt = NULL;
509
510 if (up->pending) {
511 /*
512 * There are pending frames.
513 * The socket lock must be held while it's corked.
514 */
515 lock_sock(sk);
516 if (likely(up->pending)) {
517 if (unlikely(up->pending != AF_INET)) {
518 release_sock(sk);
519 return -EINVAL;
520 }
521 goto do_append_data;
522 }
523 release_sock(sk);
524 }
525 ulen += sizeof(struct udphdr);
526
527 /*
528 * Get and verify the address.
529 */
530 if (msg->msg_name) {
531 struct sockaddr_in * usin = (struct sockaddr_in*)msg->msg_name;
532 if (msg->msg_namelen < sizeof(*usin))
533 return -EINVAL;
534 if (usin->sin_family != AF_INET) {
535 if (usin->sin_family != AF_UNSPEC)
536 return -EAFNOSUPPORT;
537 }
538
539 daddr = usin->sin_addr.s_addr;
540 dport = usin->sin_port;
541 if (dport == 0)
542 return -EINVAL;
543 } else {
544 if (sk->sk_state != TCP_ESTABLISHED)
545 return -EDESTADDRREQ;
546 daddr = inet->daddr;
547 dport = inet->dport;
548 /* Open fast path for connected socket.
549 Route will not be used, if at least one option is set.
550 */
551 connected = 1;
552 }
553 ipc.addr = inet->saddr;
554
555 ipc.oif = sk->sk_bound_dev_if;
556 if (msg->msg_controllen) {
557 err = ip_cmsg_send(msg, &ipc);
558 if (err)
559 return err;
560 if (ipc.opt)
561 free = 1;
562 connected = 0;
563 }
564 if (!ipc.opt)
565 ipc.opt = inet->opt;
566
567 saddr = ipc.addr;
568 ipc.addr = faddr = daddr;
569
570 if (ipc.opt && ipc.opt->srr) {
571 if (!daddr)
572 return -EINVAL;
573 faddr = ipc.opt->faddr;
574 connected = 0;
575 }
576 tos = RT_TOS(inet->tos);
577 if (sock_flag(sk, SOCK_LOCALROUTE) ||
578 (msg->msg_flags & MSG_DONTROUTE) ||
579 (ipc.opt && ipc.opt->is_strictroute)) {
580 tos |= RTO_ONLINK;
581 connected = 0;
582 }
583
584 if (MULTICAST(daddr)) {
585 if (!ipc.oif)
586 ipc.oif = inet->mc_index;
587 if (!saddr)
588 saddr = inet->mc_addr;
589 connected = 0;
590 }
591
592 if (connected)
593 rt = (struct rtable*)sk_dst_check(sk, 0);
594
595 if (rt == NULL) {
596 struct flowi fl = { .oif = ipc.oif,
597 .nl_u = { .ip4_u =
598 { .daddr = faddr,
599 .saddr = saddr,
600 .tos = tos } },
601 .proto = IPPROTO_UDP,
602 .uli_u = { .ports =
603 { .sport = inet->sport,
604 .dport = dport } } };
605 err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT));
606 if (err)
607 goto out;
608
609 err = -EACCES;
610 if ((rt->rt_flags & RTCF_BROADCAST) &&
611 !sock_flag(sk, SOCK_BROADCAST))
612 goto out;
613 if (connected)
614 sk_dst_set(sk, dst_clone(&rt->u.dst));
615 }
616
617 if (msg->msg_flags&MSG_CONFIRM)
618 goto do_confirm;
619back_from_confirm:
620
621 saddr = rt->rt_src;
622 if (!ipc.addr)
623 daddr = ipc.addr = rt->rt_dst;
624
625 lock_sock(sk);
626 if (unlikely(up->pending)) {
627 /* The socket is already corked while preparing it. */
628 /* ... which is an evident application bug. --ANK */
629 release_sock(sk);
630
631 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n"));
632 err = -EINVAL;
633 goto out;
634 }
635 /*
636 * Now cork the socket to pend data.
637 */
638 inet->cork.fl.fl4_dst = daddr;
639 inet->cork.fl.fl_ip_dport = dport;
640 inet->cork.fl.fl4_src = saddr;
641 inet->cork.fl.fl_ip_sport = inet->sport;
642 up->pending = AF_INET;
643
644do_append_data:
645 up->len += ulen;
646 err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen,
647 sizeof(struct udphdr), &ipc, rt,
648 corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
649 if (err)
650 udp_flush_pending_frames(sk);
651 else if (!corkreq)
652 err = udp_push_pending_frames(sk, up);
653 release_sock(sk);
654
655out:
656 ip_rt_put(rt);
657 if (free)
658 kfree(ipc.opt);
659 if (!err) {
660 UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS);
661 return len;
662 }
663 return err;
664
665do_confirm:
666 dst_confirm(&rt->u.dst);
667 if (!(msg->msg_flags&MSG_PROBE) || len)
668 goto back_from_confirm;
669 err = 0;
670 goto out;
671}
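/* Userspace illustration (not part of this file, and only a sketch): the
 * corked/append path above is reached when the application holds the
 * datagram open, either with the UDP_CORK socket option or, as below,
 * with the MSG_MORE flag. The address and payload are made up.
 */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int udp_cork_example(void)
{
	struct sockaddr_in dst;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9999);		/* example port */
	inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);

	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
		close(fd);
		return -1;
	}

	/* First write: up->pending is set and the bytes are appended,
	 * but MSG_MORE keeps the datagram from being pushed out. */
	send(fd, "header,", 7, MSG_MORE);

	/* Second write, without MSG_MORE: the data is appended to the
	 * same pending datagram and udp_push_pending_frames() emits a
	 * single UDP packet carrying "header,payload". */
	send(fd, "payload", 7, 0);

	close(fd);
	return 0;
}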
672
673static int udp_sendpage(struct sock *sk, struct page *page, int offset,
674 size_t size, int flags)
675{
676 struct udp_sock *up = udp_sk(sk);
677 int ret;
678
679 if (!up->pending) {
680 struct msghdr msg = { .msg_flags = flags|MSG_MORE };
681
682		/* Call udp_sendmsg to specify the destination address, which
683		 * the sendpage interface cannot pass.
684 * This will succeed only when the socket is connected.
685 */
686 ret = udp_sendmsg(NULL, sk, &msg, 0);
687 if (ret < 0)
688 return ret;
689 }
690
691 lock_sock(sk);
692
693 if (unlikely(!up->pending)) {
694 release_sock(sk);
695
696 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 3\n"));
697 return -EINVAL;
698 }
699
700 ret = ip_append_page(sk, page, offset, size, flags);
701 if (ret == -EOPNOTSUPP) {
702 release_sock(sk);
703 return sock_no_sendpage(sk->sk_socket, page, offset,
704 size, flags);
705 }
706 if (ret < 0) {
707 udp_flush_pending_frames(sk);
708 goto out;
709 }
710
711 up->len += size;
712 if (!(up->corkflag || (flags&MSG_MORE)))
713 ret = udp_push_pending_frames(sk, up);
714 if (!ret)
715 ret = size;
716out:
717 release_sock(sk);
718 return ret;
719}
720
721/*
722 * IOCTL requests applicable to the UDP protocol
723 */
724
725int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
726{
727 switch(cmd)
728 {
729 case SIOCOUTQ:
730 {
731 int amount = atomic_read(&sk->sk_wmem_alloc);
732 return put_user(amount, (int __user *)arg);
733 }
734
735 case SIOCINQ:
736 {
737 struct sk_buff *skb;
738 unsigned long amount;
739
740 amount = 0;
741 spin_lock_irq(&sk->sk_receive_queue.lock);
742 skb = skb_peek(&sk->sk_receive_queue);
743 if (skb != NULL) {
744 /*
745 * We will only return the amount
746 * of this packet since that is all
747 * that will be read.
748 */
749 amount = skb->len - sizeof(struct udphdr);
750 }
751 spin_unlock_irq(&sk->sk_receive_queue.lock);
752 return put_user(amount, (int __user *)arg);
753 }
754
755 default:
756 return -ENOIOCTLCMD;
757 }
758 return(0);
759}
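As the SIOCINQ branch notes, the value returned is the payload size of the first queued datagram only, not the total receive backlog. A small userspace sketch of reading both counters (error handling elided; the constants come from linux/sockios.h):

#include <linux/sockios.h>   /* SIOCINQ, SIOCOUTQ */
#include <stdio.h>
#include <sys/ioctl.h>

static void report_udp_queues(int udp_fd)
{
	int inq = 0, outq = 0;

	ioctl(udp_fd, SIOCINQ, &inq);    /* payload bytes of the next datagram */
	ioctl(udp_fd, SIOCOUTQ, &outq);  /* bytes still charged to the send buffer */
	printf("next datagram: %d bytes, unsent: %d bytes\n", inq, outq);
}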
760
761static __inline__ int __udp_checksum_complete(struct sk_buff *skb)
762{
763 return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
764}
765
766static __inline__ int udp_checksum_complete(struct sk_buff *skb)
767{
768 return skb->ip_summed != CHECKSUM_UNNECESSARY &&
769 __udp_checksum_complete(skb);
770}
771
772/*
773 * This should be easy, if there is something there we
774 * return it, otherwise we block.
775 */
776
777static int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
778 size_t len, int noblock, int flags, int *addr_len)
779{
780 struct inet_sock *inet = inet_sk(sk);
781 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
782 struct sk_buff *skb;
783 int copied, err;
784
785 /*
786 * Check any passed addresses
787 */
788 if (addr_len)
789 *addr_len=sizeof(*sin);
790
791 if (flags & MSG_ERRQUEUE)
792 return ip_recv_error(sk, msg, len);
793
794try_again:
795 skb = skb_recv_datagram(sk, flags, noblock, &err);
796 if (!skb)
797 goto out;
798
799 copied = skb->len - sizeof(struct udphdr);
800 if (copied > len) {
801 copied = len;
802 msg->msg_flags |= MSG_TRUNC;
803 }
804
805 if (skb->ip_summed==CHECKSUM_UNNECESSARY) {
806 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
807 copied);
808 } else if (msg->msg_flags&MSG_TRUNC) {
809 if (__udp_checksum_complete(skb))
810 goto csum_copy_err;
811 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
812 copied);
813 } else {
814 err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov);
815
816 if (err == -EINVAL)
817 goto csum_copy_err;
818 }
819
820 if (err)
821 goto out_free;
822
823 sock_recv_timestamp(msg, sk, skb);
824
825 /* Copy the address. */
826 if (sin)
827 {
828 sin->sin_family = AF_INET;
829 sin->sin_port = skb->h.uh->source;
830 sin->sin_addr.s_addr = skb->nh.iph->saddr;
831 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
832 }
833 if (inet->cmsg_flags)
834 ip_cmsg_recv(msg, skb);
835
836 err = copied;
837 if (flags & MSG_TRUNC)
838 err = skb->len - sizeof(struct udphdr);
839
840out_free:
841 skb_free_datagram(sk, skb);
842out:
843 return err;
844
845csum_copy_err:
846 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
847
848 /* Clear queue. */
849 if (flags&MSG_PEEK) {
850 int clear = 0;
851 spin_lock_irq(&sk->sk_receive_queue.lock);
852 if (skb == skb_peek(&sk->sk_receive_queue)) {
853 __skb_unlink(skb, &sk->sk_receive_queue);
854 clear = 1;
855 }
856 spin_unlock_irq(&sk->sk_receive_queue.lock);
857 if (clear)
858 kfree_skb(skb);
859 }
860
861 skb_free_datagram(sk, skb);
862
863 if (noblock)
864 return -EAGAIN;
865 goto try_again;
866}
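Because the MSG_TRUNC branch above returns skb->len minus the UDP header rather than the number of bytes copied, userspace can learn the size of the next datagram without consuming it. A hedged sketch:

#include <sys/socket.h>

/* Peek at the next datagram and return its full payload length without
 * dequeueing it; returns -1 (EAGAIN) if nothing is queued. */
static long next_udp_datagram_len(int udp_fd)
{
	char dummy;

	return recv(udp_fd, &dummy, 0, MSG_PEEK | MSG_TRUNC | MSG_DONTWAIT);
}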
867
868
869int udp_disconnect(struct sock *sk, int flags)
870{
871 struct inet_sock *inet = inet_sk(sk);
872 /*
873 * 1003.1g - break association.
874 */
875
876 sk->sk_state = TCP_CLOSE;
877 inet->daddr = 0;
878 inet->dport = 0;
879 sk->sk_bound_dev_if = 0;
880 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
881 inet_reset_saddr(sk);
882
883 if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
884 sk->sk_prot->unhash(sk);
885 inet->sport = 0;
886 }
887 sk_dst_reset(sk);
888 return 0;
889}
890
891static void udp_close(struct sock *sk, long timeout)
892{
893 sk_common_release(sk);
894}
895
896/* return:
 897 * 1 if the UDP system should process it
898 * 0 if we should drop this packet
899 * -1 if it should get processed by xfrm4_rcv_encap
900 */
901static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb)
902{
903#ifndef CONFIG_XFRM
904 return 1;
905#else
906 struct udp_sock *up = udp_sk(sk);
907 struct udphdr *uh = skb->h.uh;
908 struct iphdr *iph;
909 int iphlen, len;
910
911 __u8 *udpdata = (__u8 *)uh + sizeof(struct udphdr);
912 __u32 *udpdata32 = (__u32 *)udpdata;
913 __u16 encap_type = up->encap_type;
914
 915	/* if the packet is too short, let UDP handle it */
916 if (udpdata > skb->tail)
917 return 1;
918
 919	/* if this is not an encapsulation socket, just return now */
920 if (!encap_type)
921 return 1;
922
923 len = skb->tail - udpdata;
924
925 switch (encap_type) {
926 default:
927 case UDP_ENCAP_ESPINUDP:
928 /* Check if this is a keepalive packet. If so, eat it. */
929 if (len == 1 && udpdata[0] == 0xff) {
930 return 0;
931 } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0 ) {
932 /* ESP Packet without Non-ESP header */
933 len = sizeof(struct udphdr);
934 } else
935 /* Must be an IKE packet.. pass it through */
936 return 1;
937 break;
938 case UDP_ENCAP_ESPINUDP_NON_IKE:
939 /* Check if this is a keepalive packet. If so, eat it. */
940 if (len == 1 && udpdata[0] == 0xff) {
941 return 0;
942 } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) &&
943 udpdata32[0] == 0 && udpdata32[1] == 0) {
944
945 /* ESP Packet with Non-IKE marker */
946 len = sizeof(struct udphdr) + 2 * sizeof(u32);
947 } else
948 /* Must be an IKE packet.. pass it through */
949 return 1;
950 break;
951 }
952
953 /* At this point we are sure that this is an ESPinUDP packet,
954 * so we need to remove 'len' bytes from the packet (the UDP
955 * header and optional ESP marker bytes) and then modify the
956 * protocol to ESP, and then call into the transform receiver.
957 */
958
959 /* Now we can update and verify the packet length... */
960 iph = skb->nh.iph;
961 iphlen = iph->ihl << 2;
962 iph->tot_len = htons(ntohs(iph->tot_len) - len);
963 if (skb->len < iphlen + len) {
964 /* packet is too small!?! */
965 return 0;
966 }
967
968 /* pull the data buffer up to the ESP header and set the
969 * transport header to point to ESP. Keep UDP on the stack
970 * for later.
971 */
972 skb->h.raw = skb_pull(skb, len);
973
974 /* modify the protocol (it's ESP!) */
975 iph->protocol = IPPROTO_ESP;
976
977 /* and let the caller know to send this into the ESP processor... */
978 return -1;
979#endif
980}
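udp_encap_rcv() only runs once userspace has flagged the socket via the UDP_ENCAP option handled further down in udp_setsockopt(). A hedged sketch of how an IKE daemon might mark its NAT-T socket, assuming the constants exported by linux/udp.h:

#include <linux/udp.h>     /* UDP_ENCAP, UDP_ENCAP_ESPINUDP */
#include <netinet/in.h>
#include <sys/socket.h>

static int mark_nat_t_socket(int fd)
{
	int type = UDP_ENCAP_ESPINUDP;

	/* Afterwards the kernel eats one-byte 0xff keepalives, feeds ESP
	 * packets to xfrm4_rcv_encap(), and still delivers IKE to us. */
	return setsockopt(fd, IPPROTO_UDP, UDP_ENCAP, &type, sizeof(type));
}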
981
982/* returns:
983 * -1: error
984 * 0: success
985 * >0: "udp encap" protocol resubmission
986 *
987 * Note that in the success and error cases, the skb is assumed to
988 * have either been requeued or freed.
989 */
990static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
991{
992 struct udp_sock *up = udp_sk(sk);
993
994 /*
995 * Charge it to the socket, dropping if the queue is full.
996 */
997 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
998 kfree_skb(skb);
999 return -1;
1000 }
1001
1002 if (up->encap_type) {
1003 /*
1004 * This is an encapsulation socket, so let's see if this is
1005 * an encapsulated packet.
1006 * If it's a keepalive packet, then just eat it.
1007		 * If it's an encapsulated packet, then pass it to the
1008 * IPsec xfrm input and return the response
1009 * appropriately. Otherwise, just fall through and
1010 * pass this up the UDP socket.
1011 */
1012 int ret;
1013
1014 ret = udp_encap_rcv(sk, skb);
1015 if (ret == 0) {
1016 /* Eat the packet .. */
1017 kfree_skb(skb);
1018 return 0;
1019 }
1020 if (ret < 0) {
1021 /* process the ESP packet */
1022 ret = xfrm4_rcv_encap(skb, up->encap_type);
1023 UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS);
1024 return -ret;
1025 }
1026 /* FALLTHROUGH -- it's a UDP Packet */
1027 }
1028
1029 if (sk->sk_filter && skb->ip_summed != CHECKSUM_UNNECESSARY) {
1030 if (__udp_checksum_complete(skb)) {
1031 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
1032 kfree_skb(skb);
1033 return -1;
1034 }
1035 skb->ip_summed = CHECKSUM_UNNECESSARY;
1036 }
1037
1038 if (sock_queue_rcv_skb(sk,skb)<0) {
1039 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
1040 kfree_skb(skb);
1041 return -1;
1042 }
1043 UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS);
1044 return 0;
1045}
1046
1047/*
1048 * Multicasts and broadcasts go to each listener.
1049 *
1050 * Note: called only from the BH handler context,
1051 * so we don't need to lock the hashes.
1052 */
1053static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
1054 u32 saddr, u32 daddr)
1055{
1056 struct sock *sk;
1057 int dif;
1058
1059 read_lock(&udp_hash_lock);
1060 sk = sk_head(&udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]);
1061 dif = skb->dev->ifindex;
1062 sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
1063 if (sk) {
1064 struct sock *sknext = NULL;
1065
1066 do {
1067 struct sk_buff *skb1 = skb;
1068
1069 sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr,
1070 uh->source, saddr, dif);
1071 if(sknext)
1072 skb1 = skb_clone(skb, GFP_ATOMIC);
1073
1074 if(skb1) {
1075 int ret = udp_queue_rcv_skb(sk, skb1);
1076 if (ret > 0)
1077 /* we should probably re-process instead
1078 * of dropping packets here. */
1079 kfree_skb(skb1);
1080 }
1081 sk = sknext;
1082 } while(sknext);
1083 } else
1084 kfree_skb(skb);
1085 read_unlock(&udp_hash_lock);
1086 return 0;
1087}
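Each matching socket gets its own clone of the datagram, so several independent processes can listen on the same multicast group and port. A hedged userspace sketch of one such listener (group 239.0.0.1 and port 5000 are placeholders; error checks omitted):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int join_group_listener(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int one = 1;
	struct sockaddr_in local;
	struct ip_mreq mreq;

	/* allow several listeners to bind the same port */
	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

	memset(&local, 0, sizeof(local));
	local.sin_family = AF_INET;
	local.sin_port = htons(5000);
	local.sin_addr.s_addr = htonl(INADDR_ANY);
	bind(fd, (struct sockaddr *)&local, sizeof(local));

	/* join the group; udp_v4_mcast_deliver() will then clone each
	 * matching datagram to this socket as well */
	mreq.imr_multiaddr.s_addr = inet_addr("239.0.0.1");
	mreq.imr_interface.s_addr = htonl(INADDR_ANY);
	setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq));

	return fd;
}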
1088
1089/* Initialize the UDP checksum state. If this returns zero (success),
1090 * CHECKSUM_UNNECESSARY means that no more checks are required.
1091 * Otherwise, checksum completion requires checksumming the packet body,
1092 * including the UDP header, and folding it into skb->csum.
1093 */
1094static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
1095 unsigned short ulen, u32 saddr, u32 daddr)
1096{
1097 if (uh->check == 0) {
1098 skb->ip_summed = CHECKSUM_UNNECESSARY;
1099 } else if (skb->ip_summed == CHECKSUM_HW) {
1100 skb->ip_summed = CHECKSUM_UNNECESSARY;
1101 if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
1102 return 0;
1103 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp v4 hw csum failure.\n"));
1104 skb->ip_summed = CHECKSUM_NONE;
1105 }
1106 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
1107 skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
1108 /* Probably, we should checksum udp header (it should be in cache
1109 * in any case) and data in tiny packets (< rx copybreak).
1110 */
1111 return 0;
1112}
1113
1114/*
1115 * All we need to do is get the socket, and then do a checksum.
1116 */
1117
1118int udp_rcv(struct sk_buff *skb)
1119{
1120 struct sock *sk;
1121 struct udphdr *uh;
1122 unsigned short ulen;
1123 struct rtable *rt = (struct rtable*)skb->dst;
1124 u32 saddr = skb->nh.iph->saddr;
1125 u32 daddr = skb->nh.iph->daddr;
1126 int len = skb->len;
1127
1128 /*
1129 * Validate the packet and the UDP length.
1130 */
1131 if (!pskb_may_pull(skb, sizeof(struct udphdr)))
1132 goto no_header;
1133
1134 uh = skb->h.uh;
1135
1136 ulen = ntohs(uh->len);
1137
1138 if (ulen > len || ulen < sizeof(*uh))
1139 goto short_packet;
1140
1141 if (pskb_trim(skb, ulen))
1142 goto short_packet;
1143
1144 if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0)
1145 goto csum_error;
1146
1147 if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
1148 return udp_v4_mcast_deliver(skb, uh, saddr, daddr);
1149
1150 sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex);
1151
1152 if (sk != NULL) {
1153 int ret = udp_queue_rcv_skb(sk, skb);
1154 sock_put(sk);
1155
1156 /* a return value > 0 means to resubmit the input, but
1157	 * it wants the return to be -protocol, or 0
1158 */
1159 if (ret > 0)
1160 return -ret;
1161 return 0;
1162 }
1163
1164 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1165 goto drop;
1166
1167 /* No socket. Drop packet silently, if checksum is wrong */
1168 if (udp_checksum_complete(skb))
1169 goto csum_error;
1170
1171 UDP_INC_STATS_BH(UDP_MIB_NOPORTS);
1172 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
1173
1174 /*
1175	 * Hmm. We got a UDP packet to a port on which we
1176	 * are not listening. Ignore it.
1177 */
1178 kfree_skb(skb);
1179 return(0);
1180
1181short_packet:
1182 NETDEBUG(if (net_ratelimit())
1183 printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
1184 NIPQUAD(saddr),
1185 ntohs(uh->source),
1186 ulen,
1187 len,
1188 NIPQUAD(daddr),
1189 ntohs(uh->dest)));
1190no_header:
1191 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
1192 kfree_skb(skb);
1193 return(0);
1194
1195csum_error:
1196 /*
1197 * RFC1122: OK. Discards the bad packet silently (as far as
1198 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
1199 */
1200 NETDEBUG(if (net_ratelimit())
1201 printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
1202 NIPQUAD(saddr),
1203 ntohs(uh->source),
1204 NIPQUAD(daddr),
1205 ntohs(uh->dest),
1206 ulen));
1207drop:
1208 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
1209 kfree_skb(skb);
1210 return(0);
1211}
1212
1213static int udp_destroy_sock(struct sock *sk)
1214{
1215 lock_sock(sk);
1216 udp_flush_pending_frames(sk);
1217 release_sock(sk);
1218 return 0;
1219}
1220
1221/*
1222 * Socket option code for UDP
1223 */
1224static int udp_setsockopt(struct sock *sk, int level, int optname,
1225 char __user *optval, int optlen)
1226{
1227 struct udp_sock *up = udp_sk(sk);
1228 int val;
1229 int err = 0;
1230
1231 if (level != SOL_UDP)
1232 return ip_setsockopt(sk, level, optname, optval, optlen);
1233
1234 if(optlen<sizeof(int))
1235 return -EINVAL;
1236
1237 if (get_user(val, (int __user *)optval))
1238 return -EFAULT;
1239
1240 switch(optname) {
1241 case UDP_CORK:
1242 if (val != 0) {
1243 up->corkflag = 1;
1244 } else {
1245 up->corkflag = 0;
1246 lock_sock(sk);
1247 udp_push_pending_frames(sk, up);
1248 release_sock(sk);
1249 }
1250 break;
1251
1252 case UDP_ENCAP:
1253 switch (val) {
1254 case 0:
1255 case UDP_ENCAP_ESPINUDP:
1256 case UDP_ENCAP_ESPINUDP_NON_IKE:
1257 up->encap_type = val;
1258 break;
1259 default:
1260 err = -ENOPROTOOPT;
1261 break;
1262 }
1263 break;
1264
1265 default:
1266 err = -ENOPROTOOPT;
1267 break;
1268 };
1269
1270 return err;
1271}
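The UDP_CORK case mirrors the MSG_MORE handling in udp_sendmsg(): while the cork is set, ip_append_data() keeps accumulating the pieces, and a single datagram goes out when the cork is removed. A minimal userspace sketch, assuming UDP_CORK from linux/udp.h and an already connected UDP socket:

#include <linux/udp.h>     /* UDP_CORK */
#include <netinet/in.h>
#include <sys/socket.h>

static void send_as_one_datagram(int fd, const void *hdr, size_t hlen,
				 const void *body, size_t blen)
{
	int on = 1, off = 0;

	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on));
	send(fd, hdr, hlen, 0);    /* appended, not yet transmitted */
	send(fd, body, blen, 0);   /* appended to the same pending datagram */
	/* uncorking triggers udp_push_pending_frames() */
	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off));
}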
1272
1273static int udp_getsockopt(struct sock *sk, int level, int optname,
1274 char __user *optval, int __user *optlen)
1275{
1276 struct udp_sock *up = udp_sk(sk);
1277 int val, len;
1278
1279 if (level != SOL_UDP)
1280 return ip_getsockopt(sk, level, optname, optval, optlen);
1281
1282 if(get_user(len,optlen))
1283 return -EFAULT;
1284
1285 len = min_t(unsigned int, len, sizeof(int));
1286
1287 if(len < 0)
1288 return -EINVAL;
1289
1290 switch(optname) {
1291 case UDP_CORK:
1292 val = up->corkflag;
1293 break;
1294
1295 case UDP_ENCAP:
1296 val = up->encap_type;
1297 break;
1298
1299 default:
1300 return -ENOPROTOOPT;
1301 };
1302
1303 if(put_user(len, optlen))
1304 return -EFAULT;
1305 if(copy_to_user(optval, &val,len))
1306 return -EFAULT;
1307 return 0;
1308}
1309
1310/**
1311 * udp_poll - wait for a UDP event.
1312 * @file: file struct
1313 * @sock: socket
1314 * @wait: poll table
1315 *
1316 * This is the same as datagram_poll(), except for the special case of
1317 * blocking sockets. If an application is using a blocking fd
1318 * and a packet with a checksum error is in the queue,
1319 * it could get a return from select() indicating data is available,
1320 * but then block when reading it. Add special-case code
1321 * to work around these arguably broken applications.
1322 */
1323unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
1324{
1325 unsigned int mask = datagram_poll(file, sock, wait);
1326 struct sock *sk = sock->sk;
1327
1328 /* Check for false positives due to checksum errors */
1329 if ( (mask & POLLRDNORM) &&
1330 !(file->f_flags & O_NONBLOCK) &&
1331 !(sk->sk_shutdown & RCV_SHUTDOWN)){
1332 struct sk_buff_head *rcvq = &sk->sk_receive_queue;
1333 struct sk_buff *skb;
1334
1335 spin_lock_irq(&rcvq->lock);
1336 while ((skb = skb_peek(rcvq)) != NULL) {
1337 if (udp_checksum_complete(skb)) {
1338 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
1339 __skb_unlink(skb, rcvq);
1340 kfree_skb(skb);
1341 } else {
1342 skb->ip_summed = CHECKSUM_UNNECESSARY;
1343 break;
1344 }
1345 }
1346 spin_unlock_irq(&rcvq->lock);
1347
1348 /* nothing to see, move along */
1349 if (skb == NULL)
1350 mask &= ~(POLLIN | POLLRDNORM);
1351 }
1352
1353 return mask;
1354
1355}
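The workaround above discards bad-checksum datagrams at poll() time precisely because a blocking reader could otherwise sleep after select() reported data ready. The usual defensive pattern on the userspace side is to read non-blocking after a wakeup and treat EAGAIN as a spurious event; a hedged sketch:

#include <errno.h>
#include <poll.h>
#include <sys/socket.h>

/* Returns bytes read, 0 on a spurious wakeup, -1 on error. */
static long read_one_datagram(int fd, void *buf, size_t len)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	long n;

	if (poll(&pfd, 1, -1) <= 0)
		return -1;

	n = recv(fd, buf, len, MSG_DONTWAIT);
	if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
		return 0;   /* readiness was a false positive */
	return n;
}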
1356
1357struct proto udp_prot = {
1358 .name = "UDP",
1359 .owner = THIS_MODULE,
1360 .close = udp_close,
1361 .connect = ip4_datagram_connect,
1362 .disconnect = udp_disconnect,
1363 .ioctl = udp_ioctl,
1364 .destroy = udp_destroy_sock,
1365 .setsockopt = udp_setsockopt,
1366 .getsockopt = udp_getsockopt,
1367 .sendmsg = udp_sendmsg,
1368 .recvmsg = udp_recvmsg,
1369 .sendpage = udp_sendpage,
1370 .backlog_rcv = udp_queue_rcv_skb,
1371 .hash = udp_v4_hash,
1372 .unhash = udp_v4_unhash,
1373 .get_port = udp_v4_get_port,
1374 .obj_size = sizeof(struct udp_sock),
1375};
1376
1377/* ------------------------------------------------------------------------ */
1378#ifdef CONFIG_PROC_FS
1379
1380static struct sock *udp_get_first(struct seq_file *seq)
1381{
1382 struct sock *sk;
1383 struct udp_iter_state *state = seq->private;
1384
1385 for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
1386 struct hlist_node *node;
1387 sk_for_each(sk, node, &udp_hash[state->bucket]) {
1388 if (sk->sk_family == state->family)
1389 goto found;
1390 }
1391 }
1392 sk = NULL;
1393found:
1394 return sk;
1395}
1396
1397static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
1398{
1399 struct udp_iter_state *state = seq->private;
1400
1401 do {
1402 sk = sk_next(sk);
1403try_again:
1404 ;
1405 } while (sk && sk->sk_family != state->family);
1406
1407 if (!sk && ++state->bucket < UDP_HTABLE_SIZE) {
1408 sk = sk_head(&udp_hash[state->bucket]);
1409 goto try_again;
1410 }
1411 return sk;
1412}
1413
1414static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
1415{
1416 struct sock *sk = udp_get_first(seq);
1417
1418 if (sk)
1419 while(pos && (sk = udp_get_next(seq, sk)) != NULL)
1420 --pos;
1421 return pos ? NULL : sk;
1422}
1423
1424static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
1425{
1426 read_lock(&udp_hash_lock);
1427 return *pos ? udp_get_idx(seq, *pos-1) : (void *)1;
1428}
1429
1430static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1431{
1432 struct sock *sk;
1433
1434 if (v == (void *)1)
1435 sk = udp_get_idx(seq, 0);
1436 else
1437 sk = udp_get_next(seq, v);
1438
1439 ++*pos;
1440 return sk;
1441}
1442
1443static void udp_seq_stop(struct seq_file *seq, void *v)
1444{
1445 read_unlock(&udp_hash_lock);
1446}
1447
1448static int udp_seq_open(struct inode *inode, struct file *file)
1449{
1450 struct udp_seq_afinfo *afinfo = PDE(inode)->data;
1451 struct seq_file *seq;
1452 int rc = -ENOMEM;
1453 struct udp_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
1454
1455 if (!s)
1456 goto out;
1457 memset(s, 0, sizeof(*s));
1458 s->family = afinfo->family;
1459 s->seq_ops.start = udp_seq_start;
1460 s->seq_ops.next = udp_seq_next;
1461 s->seq_ops.show = afinfo->seq_show;
1462 s->seq_ops.stop = udp_seq_stop;
1463
1464 rc = seq_open(file, &s->seq_ops);
1465 if (rc)
1466 goto out_kfree;
1467
1468 seq = file->private_data;
1469 seq->private = s;
1470out:
1471 return rc;
1472out_kfree:
1473 kfree(s);
1474 goto out;
1475}
1476
1477/* ------------------------------------------------------------------------ */
1478int udp_proc_register(struct udp_seq_afinfo *afinfo)
1479{
1480 struct proc_dir_entry *p;
1481 int rc = 0;
1482
1483 if (!afinfo)
1484 return -EINVAL;
1485 afinfo->seq_fops->owner = afinfo->owner;
1486 afinfo->seq_fops->open = udp_seq_open;
1487 afinfo->seq_fops->read = seq_read;
1488 afinfo->seq_fops->llseek = seq_lseek;
1489 afinfo->seq_fops->release = seq_release_private;
1490
1491 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
1492 if (p)
1493 p->data = afinfo;
1494 else
1495 rc = -ENOMEM;
1496 return rc;
1497}
1498
1499void udp_proc_unregister(struct udp_seq_afinfo *afinfo)
1500{
1501 if (!afinfo)
1502 return;
1503 proc_net_remove(afinfo->name);
1504 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
1505}
1506
1507/* ------------------------------------------------------------------------ */
1508static void udp4_format_sock(struct sock *sp, char *tmpbuf, int bucket)
1509{
1510 struct inet_sock *inet = inet_sk(sp);
1511 unsigned int dest = inet->daddr;
1512 unsigned int src = inet->rcv_saddr;
1513 __u16 destp = ntohs(inet->dport);
1514 __u16 srcp = ntohs(inet->sport);
1515
1516 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1517 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p",
1518 bucket, src, srcp, dest, destp, sp->sk_state,
1519 atomic_read(&sp->sk_wmem_alloc),
1520 atomic_read(&sp->sk_rmem_alloc),
1521 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
1522 atomic_read(&sp->sk_refcnt), sp);
1523}
1524
1525static int udp4_seq_show(struct seq_file *seq, void *v)
1526{
1527 if (v == SEQ_START_TOKEN)
1528 seq_printf(seq, "%-127s\n",
1529 " sl local_address rem_address st tx_queue "
1530 "rx_queue tr tm->when retrnsmt uid timeout "
1531 "inode");
1532 else {
1533 char tmpbuf[129];
1534 struct udp_iter_state *state = seq->private;
1535
1536 udp4_format_sock(v, tmpbuf, state->bucket);
1537 seq_printf(seq, "%-127s\n", tmpbuf);
1538 }
1539 return 0;
1540}
1541
1542/* ------------------------------------------------------------------------ */
1543static struct file_operations udp4_seq_fops;
1544static struct udp_seq_afinfo udp4_seq_afinfo = {
1545 .owner = THIS_MODULE,
1546 .name = "udp",
1547 .family = AF_INET,
1548 .seq_show = udp4_seq_show,
1549 .seq_fops = &udp4_seq_fops,
1550};
1551
1552int __init udp4_proc_init(void)
1553{
1554 return udp_proc_register(&udp4_seq_afinfo);
1555}
1556
1557void udp4_proc_exit(void)
1558{
1559 udp_proc_unregister(&udp4_seq_afinfo);
1560}
1561#endif /* CONFIG_PROC_FS */
1562
1563EXPORT_SYMBOL(udp_disconnect);
1564EXPORT_SYMBOL(udp_hash);
1565EXPORT_SYMBOL(udp_hash_lock);
1566EXPORT_SYMBOL(udp_ioctl);
1567EXPORT_SYMBOL(udp_port_rover);
1568EXPORT_SYMBOL(udp_prot);
1569EXPORT_SYMBOL(udp_sendmsg);
1570EXPORT_SYMBOL(udp_poll);
1571
1572#ifdef CONFIG_PROC_FS
1573EXPORT_SYMBOL(udp_proc_register);
1574EXPORT_SYMBOL(udp_proc_unregister);
1575#endif
diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c
new file mode 100644
index 000000000000..6aecd7a43534
--- /dev/null
+++ b/net/ipv4/utils.c
@@ -0,0 +1,59 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Various kernel-resident INET utility functions; mainly
7 * for format conversion and debugging output.
8 *
9 * Version: $Id: utils.c,v 1.8 2000/10/03 07:29:01 anton Exp $
10 *
11 * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *
13 * Fixes:
14 * Alan Cox : verify_area check.
15 * Alan Cox : removed old debugging.
16 * Andi Kleen : add net_ratelimit()
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24#include <linux/module.h>
25#include <linux/types.h>
26#include <asm/byteorder.h>
27
28/*
29 * Convert an ASCII string to binary IP.
30 */
31
32__u32 in_aton(const char *str)
33{
34 unsigned long l;
35 unsigned int val;
36 int i;
37
38 l = 0;
39 for (i = 0; i < 4; i++)
40 {
41 l <<= 8;
42 if (*str != '\0')
43 {
44 val = 0;
45 while (*str != '\0' && *str != '.')
46 {
47 val *= 10;
48 val += *str - '0';
49 str++;
50 }
51 l |= val;
52 if (*str != '\0')
53 str++;
54 }
55 }
56 return(htonl(l));
57}
58
59EXPORT_SYMBOL(in_aton);
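in_aton() is the kernel's own minimal dotted-quad parser (used, for example, for the ip= boot parameter) and performs no validation. A hedged userspace restatement of the same conversion, shown only to illustrate what it computes:

#include <arpa/inet.h>   /* htonl() */
#include <stdint.h>

/* Userspace restatement of the conversion above: each dot-separated
 * decimal field becomes one byte, result in network byte order.
 * Like the kernel helper, it does not reject malformed input. */
static uint32_t my_in_aton(const char *str)
{
	unsigned long l = 0;
	int i;

	for (i = 0; i < 4; i++) {
		unsigned int val = 0;

		l <<= 8;
		while (*str != '\0' && *str != '.')
			val = val * 10 + (*str++ - '0');
		l |= val;
		if (*str != '\0')
			str++;
	}
	return htonl(l);
}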
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
new file mode 100644
index 000000000000..2d3849c38a0f
--- /dev/null
+++ b/net/ipv4/xfrm4_input.c
@@ -0,0 +1,160 @@
1/*
2 * xfrm4_input.c
3 *
4 * Changes:
5 * YOSHIFUJI Hideaki @USAGI
6 * Split up af-specific portion
7 * Derek Atkins <derek@ihtfp.com>
8 * Add Encapsulation support
9 *
10 */
11
12#include <linux/module.h>
13#include <linux/string.h>
14#include <net/inet_ecn.h>
15#include <net/ip.h>
16#include <net/xfrm.h>
17
18int xfrm4_rcv(struct sk_buff *skb)
19{
20 return xfrm4_rcv_encap(skb, 0);
21}
22
23EXPORT_SYMBOL(xfrm4_rcv);
24
25static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
26{
27 struct iphdr *outer_iph = skb->nh.iph;
28 struct iphdr *inner_iph = skb->h.ipiph;
29
30 if (INET_ECN_is_ce(outer_iph->tos))
31 IP_ECN_set_ce(inner_iph);
32}
33
34static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq)
35{
36 switch (nexthdr) {
37 case IPPROTO_IPIP:
38 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
39 return -EINVAL;
40 *spi = skb->nh.iph->saddr;
41 *seq = 0;
42 return 0;
43 }
44
45 return xfrm_parse_spi(skb, nexthdr, spi, seq);
46}
47
48int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
49{
50 int err;
51 u32 spi, seq;
52 struct sec_decap_state xfrm_vec[XFRM_MAX_DEPTH];
53 struct xfrm_state *x;
54 int xfrm_nr = 0;
55 int decaps = 0;
56
57 if ((err = xfrm4_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) != 0)
58 goto drop;
59
60 do {
61 struct iphdr *iph = skb->nh.iph;
62
63 if (xfrm_nr == XFRM_MAX_DEPTH)
64 goto drop;
65
66 x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, iph->protocol, AF_INET);
67 if (x == NULL)
68 goto drop;
69
70 spin_lock(&x->lock);
71 if (unlikely(x->km.state != XFRM_STATE_VALID))
72 goto drop_unlock;
73
74 if (x->props.replay_window && xfrm_replay_check(x, seq))
75 goto drop_unlock;
76
77 if (xfrm_state_check_expire(x))
78 goto drop_unlock;
79
80 xfrm_vec[xfrm_nr].decap.decap_type = encap_type;
81 if (x->type->input(x, &(xfrm_vec[xfrm_nr].decap), skb))
82 goto drop_unlock;
83
84 /* only the first xfrm gets the encap type */
85 encap_type = 0;
86
87 if (x->props.replay_window)
88 xfrm_replay_advance(x, seq);
89
90 x->curlft.bytes += skb->len;
91 x->curlft.packets++;
92
93 spin_unlock(&x->lock);
94
95 xfrm_vec[xfrm_nr++].xvec = x;
96
97 iph = skb->nh.iph;
98
99 if (x->props.mode) {
100 if (iph->protocol != IPPROTO_IPIP)
101 goto drop;
102 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
103 goto drop;
104 if (skb_cloned(skb) &&
105 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
106 goto drop;
107 if (x->props.flags & XFRM_STATE_DECAP_DSCP)
108 ipv4_copy_dscp(iph, skb->h.ipiph);
109 if (!(x->props.flags & XFRM_STATE_NOECN))
110 ipip_ecn_decapsulate(skb);
111 skb->mac.raw = memmove(skb->data - skb->mac_len,
112 skb->mac.raw, skb->mac_len);
113 skb->nh.raw = skb->data;
114 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
115 decaps = 1;
116 break;
117 }
118
119 if ((err = xfrm_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) < 0)
120 goto drop;
121 } while (!err);
122
123 /* Allocate new secpath or COW existing one. */
124
125 if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
126 struct sec_path *sp;
127 sp = secpath_dup(skb->sp);
128 if (!sp)
129 goto drop;
130 if (skb->sp)
131 secpath_put(skb->sp);
132 skb->sp = sp;
133 }
134 if (xfrm_nr + skb->sp->len > XFRM_MAX_DEPTH)
135 goto drop;
136
137 memcpy(skb->sp->x+skb->sp->len, xfrm_vec, xfrm_nr*sizeof(struct sec_decap_state));
138 skb->sp->len += xfrm_nr;
139
140 if (decaps) {
141 if (!(skb->dev->flags&IFF_LOOPBACK)) {
142 dst_release(skb->dst);
143 skb->dst = NULL;
144 }
145 netif_rx(skb);
146 return 0;
147 } else {
148 return -skb->nh.iph->protocol;
149 }
150
151drop_unlock:
152 spin_unlock(&x->lock);
153 xfrm_state_put(x);
154drop:
155 while (--xfrm_nr >= 0)
156 xfrm_state_put(xfrm_vec[xfrm_nr].xvec);
157
158 kfree_skb(skb);
159 return 0;
160}
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
new file mode 100644
index 000000000000..af2392ae5769
--- /dev/null
+++ b/net/ipv4/xfrm4_output.c
@@ -0,0 +1,141 @@
1/*
2 * xfrm4_output.c - Common IPsec encapsulation code for IPv4.
3 * Copyright (c) 2004 Herbert Xu <herbert@gondor.apana.org.au>
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version
8 * 2 of the License, or (at your option) any later version.
9 */
10
11#include <linux/skbuff.h>
12#include <linux/spinlock.h>
13#include <net/inet_ecn.h>
14#include <net/ip.h>
15#include <net/xfrm.h>
16#include <net/icmp.h>
17
18/* Add encapsulation header.
19 *
20 * In transport mode, the IP header will be moved forward to make space
21 * for the encapsulation header.
22 *
23 * In tunnel mode, the top IP header will be constructed per RFC 2401.
24 * The following fields in it shall be filled in by x->type->output:
25 * tot_len
26 * check
27 *
28 * On exit, skb->h will be set to the start of the payload to be processed
29 * by x->type->output and skb->nh will be set to the top IP header.
30 */
31static void xfrm4_encap(struct sk_buff *skb)
32{
33 struct dst_entry *dst = skb->dst;
34 struct xfrm_state *x = dst->xfrm;
35 struct iphdr *iph, *top_iph;
36
37 iph = skb->nh.iph;
38 skb->h.ipiph = iph;
39
40 skb->nh.raw = skb_push(skb, x->props.header_len);
41 top_iph = skb->nh.iph;
42
43 if (!x->props.mode) {
44 skb->h.raw += iph->ihl*4;
45 memmove(top_iph, iph, iph->ihl*4);
46 return;
47 }
48
49 top_iph->ihl = 5;
50 top_iph->version = 4;
51
52 /* DS disclosed */
53 top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
54 if (x->props.flags & XFRM_STATE_NOECN)
55 IP_ECN_clear(top_iph);
56
57 top_iph->frag_off = iph->frag_off & htons(IP_DF);
58 if (!top_iph->frag_off)
59 __ip_select_ident(top_iph, dst, 0);
60
61 top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
62
63 top_iph->saddr = x->props.saddr.a4;
64 top_iph->daddr = x->id.daddr.a4;
65 top_iph->protocol = IPPROTO_IPIP;
66
67 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
68}
69
70static int xfrm4_tunnel_check_size(struct sk_buff *skb)
71{
72 int mtu, ret = 0;
73 struct dst_entry *dst;
74 struct iphdr *iph = skb->nh.iph;
75
76 if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
77 goto out;
78
79 IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
80
81 if (!(iph->frag_off & htons(IP_DF)) || skb->local_df)
82 goto out;
83
84 dst = skb->dst;
85 mtu = dst_mtu(dst);
86 if (skb->len > mtu) {
87 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
88 ret = -EMSGSIZE;
89 }
90out:
91 return ret;
92}
93
94int xfrm4_output(struct sk_buff *skb)
95{
96 struct dst_entry *dst = skb->dst;
97 struct xfrm_state *x = dst->xfrm;
98 int err;
99
100 if (skb->ip_summed == CHECKSUM_HW) {
101 err = skb_checksum_help(skb, 0);
102 if (err)
103 goto error_nolock;
104 }
105
106 if (x->props.mode) {
107 err = xfrm4_tunnel_check_size(skb);
108 if (err)
109 goto error_nolock;
110 }
111
112 spin_lock_bh(&x->lock);
113 err = xfrm_state_check(x, skb);
114 if (err)
115 goto error;
116
117 xfrm4_encap(skb);
118
119 err = x->type->output(x, skb);
120 if (err)
121 goto error;
122
123 x->curlft.bytes += skb->len;
124 x->curlft.packets++;
125
126 spin_unlock_bh(&x->lock);
127
128 if (!(skb->dst = dst_pop(dst))) {
129 err = -EHOSTUNREACH;
130 goto error_nolock;
131 }
132 err = NET_XMIT_BYPASS;
133
134out_exit:
135 return err;
136error:
137 spin_unlock_bh(&x->lock);
138error_nolock:
139 kfree_skb(skb);
140 goto out_exit;
141}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
new file mode 100644
index 000000000000..7fe2afd2e669
--- /dev/null
+++ b/net/ipv4/xfrm4_policy.c
@@ -0,0 +1,281 @@
1/*
2 * xfrm4_policy.c
3 *
4 * Changes:
5 * Kazunori MIYAZAWA @USAGI
6 * YOSHIFUJI Hideaki @USAGI
7 * Split up af-specific portion
8 *
9 */
10
11#include <linux/config.h>
12#include <net/xfrm.h>
13#include <net/ip.h>
14
15static struct dst_ops xfrm4_dst_ops;
16static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
17
18static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED };
19
20static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
21{
22 return __ip_route_output_key((struct rtable**)dst, fl);
23}
24
25static struct dst_entry *
26__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
27{
28 struct dst_entry *dst;
29
30 read_lock_bh(&policy->lock);
31 for (dst = policy->bundles; dst; dst = dst->next) {
32 struct xfrm_dst *xdst = (struct xfrm_dst*)dst;
33 if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/
34 xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
35 xdst->u.rt.fl.fl4_src == fl->fl4_src &&
36 xfrm_bundle_ok(xdst, fl, AF_INET)) {
37 dst_clone(dst);
38 break;
39 }
40 }
41 read_unlock_bh(&policy->lock);
42 return dst;
43}
44
45/* Allocate chain of dst_entry's, attach known xfrm's, calculate
 46 * all the metrics... In short, bundle a bundle.
47 */
48
49static int
50__xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
51 struct flowi *fl, struct dst_entry **dst_p)
52{
53 struct dst_entry *dst, *dst_prev;
54 struct rtable *rt0 = (struct rtable*)(*dst_p);
55 struct rtable *rt = rt0;
56 u32 remote = fl->fl4_dst;
57 u32 local = fl->fl4_src;
58 struct flowi fl_tunnel = {
59 .nl_u = {
60 .ip4_u = {
61 .saddr = local,
62 .daddr = remote
63 }
64 }
65 };
66 int i;
67 int err;
68 int header_len = 0;
69 int trailer_len = 0;
70
71 dst = dst_prev = NULL;
72 dst_hold(&rt->u.dst);
73
74 for (i = 0; i < nx; i++) {
75 struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops);
76 struct xfrm_dst *xdst;
77 int tunnel = 0;
78
79 if (unlikely(dst1 == NULL)) {
80 err = -ENOBUFS;
81 dst_release(&rt->u.dst);
82 goto error;
83 }
84
85 if (!dst)
86 dst = dst1;
87 else {
88 dst_prev->child = dst1;
89 dst1->flags |= DST_NOHASH;
90 dst_clone(dst1);
91 }
92
93 xdst = (struct xfrm_dst *)dst1;
94 xdst->route = &rt->u.dst;
95
96 dst1->next = dst_prev;
97 dst_prev = dst1;
98 if (xfrm[i]->props.mode) {
99 remote = xfrm[i]->id.daddr.a4;
100 local = xfrm[i]->props.saddr.a4;
101 tunnel = 1;
102 }
103 header_len += xfrm[i]->props.header_len;
104 trailer_len += xfrm[i]->props.trailer_len;
105
106 if (tunnel) {
107 fl_tunnel.fl4_src = local;
108 fl_tunnel.fl4_dst = remote;
109 err = xfrm_dst_lookup((struct xfrm_dst **)&rt,
110 &fl_tunnel, AF_INET);
111 if (err)
112 goto error;
113 } else
114 dst_hold(&rt->u.dst);
115 }
116
117 dst_prev->child = &rt->u.dst;
118 dst->path = &rt->u.dst;
119
120 *dst_p = dst;
121 dst = dst_prev;
122
123 dst_prev = *dst_p;
124 i = 0;
125 for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) {
126 struct xfrm_dst *x = (struct xfrm_dst*)dst_prev;
127 x->u.rt.fl = *fl;
128
129 dst_prev->xfrm = xfrm[i++];
130 dst_prev->dev = rt->u.dst.dev;
131 if (rt->u.dst.dev)
132 dev_hold(rt->u.dst.dev);
133 dst_prev->obsolete = -1;
134 dst_prev->flags |= DST_HOST;
135 dst_prev->lastuse = jiffies;
136 dst_prev->header_len = header_len;
137 dst_prev->trailer_len = trailer_len;
138 memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics));
139
140		/* Copy neighbour for reachability confirmation */
141 dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour);
142 dst_prev->input = rt->u.dst.input;
143 dst_prev->output = xfrm4_output;
144 if (rt->peer)
145 atomic_inc(&rt->peer->refcnt);
146 x->u.rt.peer = rt->peer;
147 /* Sheit... I remember I did this right. Apparently,
148 * it was magically lost, so this code needs audit */
149 x->u.rt.rt_flags = rt0->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL);
150 x->u.rt.rt_type = rt->rt_type;
151 x->u.rt.rt_src = rt0->rt_src;
152 x->u.rt.rt_dst = rt0->rt_dst;
153 x->u.rt.rt_gateway = rt->rt_gateway;
154 x->u.rt.rt_spec_dst = rt0->rt_spec_dst;
155 header_len -= x->u.dst.xfrm->props.header_len;
156 trailer_len -= x->u.dst.xfrm->props.trailer_len;
157 }
158
159 xfrm_init_pmtu(dst);
160 return 0;
161
162error:
163 if (dst)
164 dst_free(dst);
165 return err;
166}
167
168static void
169_decode_session4(struct sk_buff *skb, struct flowi *fl)
170{
171 struct iphdr *iph = skb->nh.iph;
172 u8 *xprth = skb->nh.raw + iph->ihl*4;
173
174 memset(fl, 0, sizeof(struct flowi));
175 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
176 switch (iph->protocol) {
177 case IPPROTO_UDP:
178 case IPPROTO_TCP:
179 case IPPROTO_SCTP:
180 if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
181 u16 *ports = (u16 *)xprth;
182
183 fl->fl_ip_sport = ports[0];
184 fl->fl_ip_dport = ports[1];
185 }
186 break;
187
188 case IPPROTO_ICMP:
189 if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
190 u8 *icmp = xprth;
191
192 fl->fl_icmp_type = icmp[0];
193 fl->fl_icmp_code = icmp[1];
194 }
195 break;
196
197 case IPPROTO_ESP:
198 if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
199 u32 *ehdr = (u32 *)xprth;
200
201 fl->fl_ipsec_spi = ehdr[0];
202 }
203 break;
204
205 case IPPROTO_AH:
206 if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
207 u32 *ah_hdr = (u32*)xprth;
208
209 fl->fl_ipsec_spi = ah_hdr[1];
210 }
211 break;
212
213 case IPPROTO_COMP:
214 if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
215 u16 *ipcomp_hdr = (u16 *)xprth;
216
217 fl->fl_ipsec_spi = ntohl(ntohs(ipcomp_hdr[1]));
218 }
219 break;
220 default:
221 fl->fl_ipsec_spi = 0;
222 break;
223 };
224 }
225 fl->proto = iph->protocol;
226 fl->fl4_dst = iph->daddr;
227 fl->fl4_src = iph->saddr;
228}
229
230static inline int xfrm4_garbage_collect(void)
231{
232 read_lock(&xfrm4_policy_afinfo.lock);
233 xfrm4_policy_afinfo.garbage_collect();
234 read_unlock(&xfrm4_policy_afinfo.lock);
235 return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2);
236}
237
238static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
239{
240 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
241 struct dst_entry *path = xdst->route;
242
243 path->ops->update_pmtu(path, mtu);
244}
245
246static struct dst_ops xfrm4_dst_ops = {
247 .family = AF_INET,
248 .protocol = __constant_htons(ETH_P_IP),
249 .gc = xfrm4_garbage_collect,
250 .update_pmtu = xfrm4_update_pmtu,
251 .gc_thresh = 1024,
252 .entry_size = sizeof(struct xfrm_dst),
253};
254
255static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
256 .family = AF_INET,
257 .lock = RW_LOCK_UNLOCKED,
258 .type_map = &xfrm4_type_map,
259 .dst_ops = &xfrm4_dst_ops,
260 .dst_lookup = xfrm4_dst_lookup,
261 .find_bundle = __xfrm4_find_bundle,
262 .bundle_create = __xfrm4_bundle_create,
263 .decode_session = _decode_session4,
264};
265
266static void __init xfrm4_policy_init(void)
267{
268 xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
269}
270
271static void __exit xfrm4_policy_fini(void)
272{
273 xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo);
274}
275
276void __init xfrm4_init(void)
277{
278 xfrm4_state_init();
279 xfrm4_policy_init();
280}
281
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
new file mode 100644
index 000000000000..223a2e83853f
--- /dev/null
+++ b/net/ipv4/xfrm4_state.c
@@ -0,0 +1,126 @@
1/*
2 * xfrm4_state.c
3 *
4 * Changes:
5 * YOSHIFUJI Hideaki @USAGI
6 * Split up af-specific portion
7 *
8 */
9
10#include <net/xfrm.h>
11#include <linux/pfkeyv2.h>
12#include <linux/ipsec.h>
13
14static struct xfrm_state_afinfo xfrm4_state_afinfo;
15
16static void
17__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
18 struct xfrm_tmpl *tmpl,
19 xfrm_address_t *daddr, xfrm_address_t *saddr)
20{
21 x->sel.daddr.a4 = fl->fl4_dst;
22 x->sel.saddr.a4 = fl->fl4_src;
23 x->sel.dport = xfrm_flowi_dport(fl);
24 x->sel.dport_mask = ~0;
25 x->sel.sport = xfrm_flowi_sport(fl);
26 x->sel.sport_mask = ~0;
27 x->sel.prefixlen_d = 32;
28 x->sel.prefixlen_s = 32;
29 x->sel.proto = fl->proto;
30 x->sel.ifindex = fl->oif;
31 x->id = tmpl->id;
32 if (x->id.daddr.a4 == 0)
33 x->id.daddr.a4 = daddr->a4;
34 x->props.saddr = tmpl->saddr;
35 if (x->props.saddr.a4 == 0)
36 x->props.saddr.a4 = saddr->a4;
37 x->props.mode = tmpl->mode;
38 x->props.reqid = tmpl->reqid;
39 x->props.family = AF_INET;
40}
41
42static struct xfrm_state *
43__xfrm4_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto)
44{
45 unsigned h = __xfrm4_spi_hash(daddr, spi, proto);
46 struct xfrm_state *x;
47
48 list_for_each_entry(x, xfrm4_state_afinfo.state_byspi+h, byspi) {
49 if (x->props.family == AF_INET &&
50 spi == x->id.spi &&
51 daddr->a4 == x->id.daddr.a4 &&
52 proto == x->id.proto) {
53 xfrm_state_hold(x);
54 return x;
55 }
56 }
57 return NULL;
58}
59
60static struct xfrm_state *
61__xfrm4_find_acq(u8 mode, u32 reqid, u8 proto,
62 xfrm_address_t *daddr, xfrm_address_t *saddr,
63 int create)
64{
65 struct xfrm_state *x, *x0;
66 unsigned h = __xfrm4_dst_hash(daddr);
67
68 x0 = NULL;
69
70 list_for_each_entry(x, xfrm4_state_afinfo.state_bydst+h, bydst) {
71 if (x->props.family == AF_INET &&
72 daddr->a4 == x->id.daddr.a4 &&
73 mode == x->props.mode &&
74 proto == x->id.proto &&
75 saddr->a4 == x->props.saddr.a4 &&
76 reqid == x->props.reqid &&
77 x->km.state == XFRM_STATE_ACQ &&
78 !x->id.spi) {
79 x0 = x;
80 break;
81 }
82 }
83 if (!x0 && create && (x0 = xfrm_state_alloc()) != NULL) {
84 x0->sel.daddr.a4 = daddr->a4;
85 x0->sel.saddr.a4 = saddr->a4;
86 x0->sel.prefixlen_d = 32;
87 x0->sel.prefixlen_s = 32;
88 x0->props.saddr.a4 = saddr->a4;
89 x0->km.state = XFRM_STATE_ACQ;
90 x0->id.daddr.a4 = daddr->a4;
91 x0->id.proto = proto;
92 x0->props.family = AF_INET;
93 x0->props.mode = mode;
94 x0->props.reqid = reqid;
95 x0->props.family = AF_INET;
96 x0->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES;
97 xfrm_state_hold(x0);
98 x0->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ;
99 add_timer(&x0->timer);
100 xfrm_state_hold(x0);
101 list_add_tail(&x0->bydst, xfrm4_state_afinfo.state_bydst+h);
102 wake_up(&km_waitq);
103 }
104 if (x0)
105 xfrm_state_hold(x0);
106 return x0;
107}
108
109static struct xfrm_state_afinfo xfrm4_state_afinfo = {
110 .family = AF_INET,
111 .lock = RW_LOCK_UNLOCKED,
112 .init_tempsel = __xfrm4_init_tempsel,
113 .state_lookup = __xfrm4_state_lookup,
114 .find_acq = __xfrm4_find_acq,
115};
116
117void __init xfrm4_state_init(void)
118{
119 xfrm_state_register_afinfo(&xfrm4_state_afinfo);
120}
121
122void __exit xfrm4_state_fini(void)
123{
124 xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
125}
126
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
new file mode 100644
index 000000000000..413191f585f6
--- /dev/null
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -0,0 +1,144 @@
1/* xfrm4_tunnel.c: Generic IP tunnel transformer.
2 *
3 * Copyright (C) 2003 David S. Miller (davem@redhat.com)
4 */
5
6#include <linux/skbuff.h>
7#include <linux/module.h>
8#include <net/xfrm.h>
9#include <net/ip.h>
10#include <net/protocol.h>
11
12static int ipip_output(struct xfrm_state *x, struct sk_buff *skb)
13{
14 struct iphdr *iph;
15
16 iph = skb->nh.iph;
17 iph->tot_len = htons(skb->len);
18 ip_send_check(iph);
19
20 return 0;
21}
22
23static int ipip_xfrm_rcv(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
24{
25 return 0;
26}
27
28static struct xfrm_tunnel *ipip_handler;
29static DECLARE_MUTEX(xfrm4_tunnel_sem);
30
31int xfrm4_tunnel_register(struct xfrm_tunnel *handler)
32{
33 int ret;
34
35 down(&xfrm4_tunnel_sem);
36 ret = 0;
37 if (ipip_handler != NULL)
38 ret = -EINVAL;
39 if (!ret)
40 ipip_handler = handler;
41 up(&xfrm4_tunnel_sem);
42
43 return ret;
44}
45
46EXPORT_SYMBOL(xfrm4_tunnel_register);
47
48int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler)
49{
50 int ret;
51
52 down(&xfrm4_tunnel_sem);
53 ret = 0;
54 if (ipip_handler != handler)
55 ret = -EINVAL;
56 if (!ret)
57 ipip_handler = NULL;
58 up(&xfrm4_tunnel_sem);
59
60 synchronize_net();
61
62 return ret;
63}
64
65EXPORT_SYMBOL(xfrm4_tunnel_deregister);
66
67static int ipip_rcv(struct sk_buff *skb)
68{
69 struct xfrm_tunnel *handler = ipip_handler;
70
71 /* Tunnel devices take precedence. */
72 if (handler && handler->handler(skb) == 0)
73 return 0;
74
75 return xfrm4_rcv(skb);
76}
77
78static void ipip_err(struct sk_buff *skb, u32 info)
79{
80 struct xfrm_tunnel *handler = ipip_handler;
81 u32 arg = info;
82
83 if (handler)
84 handler->err_handler(skb, &arg);
85}
86
87static int ipip_init_state(struct xfrm_state *x, void *args)
88{
89 if (!x->props.mode)
90 return -EINVAL;
91
92 if (x->encap)
93 return -EINVAL;
94
95 x->props.header_len = sizeof(struct iphdr);
96
97 return 0;
98}
99
100static void ipip_destroy(struct xfrm_state *x)
101{
102}
103
104static struct xfrm_type ipip_type = {
105 .description = "IPIP",
106 .owner = THIS_MODULE,
107 .proto = IPPROTO_IPIP,
108 .init_state = ipip_init_state,
109 .destructor = ipip_destroy,
110 .input = ipip_xfrm_rcv,
111 .output = ipip_output
112};
113
114static struct net_protocol ipip_protocol = {
115 .handler = ipip_rcv,
116 .err_handler = ipip_err,
117 .no_policy = 1,
118};
119
120static int __init ipip_init(void)
121{
122 if (xfrm_register_type(&ipip_type, AF_INET) < 0) {
123 printk(KERN_INFO "ipip init: can't add xfrm type\n");
124 return -EAGAIN;
125 }
126 if (inet_add_protocol(&ipip_protocol, IPPROTO_IPIP) < 0) {
127 printk(KERN_INFO "ipip init: can't add protocol\n");
128 xfrm_unregister_type(&ipip_type, AF_INET);
129 return -EAGAIN;
130 }
131 return 0;
132}
133
134static void __exit ipip_fini(void)
135{
136 if (inet_del_protocol(&ipip_protocol, IPPROTO_IPIP) < 0)
137 printk(KERN_INFO "ipip close: can't remove protocol\n");
138 if (xfrm_unregister_type(&ipip_type, AF_INET) < 0)
139 printk(KERN_INFO "ipip close: can't remove xfrm type\n");
140}
141
142module_init(ipip_init);
143module_exit(ipip_fini);
144MODULE_LICENSE("GPL");