aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPablo Neira Ayuso <pablo@netfilter.org>2009-03-16 12:10:36 -0400
committerPatrick McHardy <kaber@trash.net>2009-03-16 12:10:36 -0400
commit0269ea4937343536ec7e85649932bc8c9686ea78 (patch)
tree5c0cae40918bf9a130d1eec3bcb82341592a509a
parent1546000fe8db0d3f47b0ef1dd487ec23fbd95313 (diff)
netfilter: xtables: add cluster match
This patch adds the iptables cluster match. This match can be used to deploy gateway and back-end load-sharing clusters. The cluster can be composed of 32 nodes maximum (although I have only tested this with two nodes, so I cannot tell what is the real scalability limit of this solution in terms of cluster nodes). Assuming that all the nodes see all packets (see below for an example on how to do that if your switch does not allow this), the cluster match decides if this node has to handle a packet given: (jhash(source IP) % total_nodes) & node_mask For related connections, the master conntrack is used. The following is an example of its use to deploy a gateway cluster composed of two nodes (where this is the node 1): iptables -I PREROUTING -t mangle -i eth1 -m cluster \ --cluster-total-nodes 2 --cluster-local-node 1 \ --cluster-proc-name eth1 -j MARK --set-mark 0xffff iptables -A PREROUTING -t mangle -i eth1 \ -m mark ! --mark 0xffff -j DROP iptables -A PREROUTING -t mangle -i eth2 -m cluster \ --cluster-total-nodes 2 --cluster-local-node 1 \ --cluster-proc-name eth2 -j MARK --set-mark 0xffff iptables -A PREROUTING -t mangle -i eth2 \ -m mark ! --mark 0xffff -j DROP And the following commands to make all nodes see the same packets: ip maddr add 01:00:5e:00:01:01 dev eth1 ip maddr add 01:00:5e:00:01:02 dev eth2 arptables -I OUTPUT -o eth1 --h-length 6 \ -j mangle --mangle-mac-s 01:00:5e:00:01:01 arptables -I INPUT -i eth1 --h-length 6 \ --destination-mac 01:00:5e:00:01:01 \ -j mangle --mangle-mac-d 00:zz:yy:xx:5a:27 arptables -I OUTPUT -o eth2 --h-length 6 \ -j mangle --mangle-mac-s 01:00:5e:00:01:02 arptables -I INPUT -i eth2 --h-length 6 \ --destination-mac 01:00:5e:00:01:02 \ -j mangle --mangle-mac-d 00:zz:yy:xx:5a:27 In the case of TCP connections, pickup facility has to be disabled to avoid marking TCP ACK packets coming in the reply direction as valid. 
echo 0 > /proc/sys/net/netfilter/nf_conntrack_tcp_loose BTW, some final notes: * This match mangles the skbuff pkt_type in case that it detects PACKET_MULTICAST for a non-multicast address. This may be done in a PKTTYPE target for this sole purpose. * This match supersedes the CLUSTERIP target. Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> Signed-off-by: Patrick McHardy <kaber@trash.net>
-rw-r--r--include/linux/netfilter/Kbuild1
-rw-r--r--include/linux/netfilter/xt_cluster.h15
-rw-r--r--net/netfilter/Kconfig16
-rw-r--r--net/netfilter/Makefile1
-rw-r--r--net/netfilter/xt_cluster.c164
5 files changed, 197 insertions, 0 deletions
diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild
index 947b47d7f6c..af9d2fb9721 100644
--- a/include/linux/netfilter/Kbuild
+++ b/include/linux/netfilter/Kbuild
@@ -21,6 +21,7 @@ header-y += xt_connbytes.h
21header-y += xt_connlimit.h 21header-y += xt_connlimit.h
22header-y += xt_connmark.h 22header-y += xt_connmark.h
23header-y += xt_conntrack.h 23header-y += xt_conntrack.h
24header-y += xt_cluster.h
24header-y += xt_dccp.h 25header-y += xt_dccp.h
25header-y += xt_dscp.h 26header-y += xt_dscp.h
26header-y += xt_esp.h 27header-y += xt_esp.h
diff --git a/include/linux/netfilter/xt_cluster.h b/include/linux/netfilter/xt_cluster.h
new file mode 100644
index 00000000000..5e0a0d07b52
--- /dev/null
+++ b/include/linux/netfilter/xt_cluster.h
@@ -0,0 +1,15 @@
1#ifndef _XT_CLUSTER_MATCH_H
2#define _XT_CLUSTER_MATCH_H
3
/* Flag bits carried in the flags field of struct xt_cluster_match_info. */
4enum xt_cluster_flags {
/* Invert the match: the match function XORs its verdict with this flag. */
5 XT_CLUSTER_F_INV = (1 << 0)
6};
7
/*
 * Per-rule configuration of the "cluster" match; its size is what the match
 * registers as matchsize.
 */
8struct xt_cluster_match_info {
/* Number of nodes in the cluster; the source-address hash is scaled into
 * the range [0, total_nodes). */
9 u_int32_t total_nodes;
/* Bitmask of the node indexes handled locally: bit N set means packets
 * whose scaled hash equals N match on this node. */
10 u_int32_t node_mask;
/* Initial value handed to the jhash functions. */
11 u_int32_t hash_seed;
/* Bitwise OR of enum xt_cluster_flags values. */
12 u_int32_t flags;
13};
14
15#endif /* _XT_CLUSTER_MATCH_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index cdbaaff6d0d..2562d05dbaf 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -527,6 +527,22 @@ config NETFILTER_XT_TARGET_TCPOPTSTRIP
527 This option adds a "TCPOPTSTRIP" target, which allows you to strip 527 This option adds a "TCPOPTSTRIP" target, which allows you to strip
528 TCP options from TCP packets. 528 TCP options from TCP packets.
529 529
530config NETFILTER_XT_MATCH_CLUSTER
531 tristate '"cluster" match support'
532 depends on NF_CONNTRACK
533 depends on NETFILTER_ADVANCED
534 ---help---
535 This option allows you to build work-load-sharing clusters of
536 network servers/stateful firewalls without having a dedicated
537 load-balancing router/server/switch. Basically, this match returns
538 true when the packet must be handled by this cluster node. Thus,
539 all nodes see all packets and this match decides which node handles
540 what packets. The work-load sharing algorithm is based on source
541 address hashing.
542
543 If you say Y or M here, try `iptables -m cluster --help` for
544 more information.
545
530config NETFILTER_XT_MATCH_COMMENT 546config NETFILTER_XT_MATCH_COMMENT
531 tristate '"comment" match support' 547 tristate '"comment" match support'
532 depends on NETFILTER_ADVANCED 548 depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 7a9b8397573..6282060fbda 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -59,6 +59,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o
59obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o 59obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o
60 60
61# matches 61# matches
62obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o
62obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o 63obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o
63obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o 64obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o
64obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o 65obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o
diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c
new file mode 100644
index 00000000000..ad5bd890e4e
--- /dev/null
+++ b/net/netfilter/xt_cluster.c
@@ -0,0 +1,164 @@
1/*
2 * (C) 2008-2009 Pablo Neira Ayuso <pablo@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8#include <linux/module.h>
9#include <linux/skbuff.h>
10#include <linux/jhash.h>
11#include <linux/ip.h>
12#include <net/ipv6.h>
13
14#include <linux/netfilter/x_tables.h>
15#include <net/netfilter/nf_conntrack.h>
16#include <linux/netfilter/xt_cluster.h>
17
18static inline u_int32_t nf_ct_orig_ipv4_src(const struct nf_conn *ct)
19{
20 return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip;
21}
22
23static inline const void *nf_ct_orig_ipv6_src(const struct nf_conn *ct)
24{
25 return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip6;
26}
27
28static inline u_int32_t
29xt_cluster_hash_ipv4(u_int32_t ip, const struct xt_cluster_match_info *info)
30{
31 return jhash_1word(ip, info->hash_seed);
32}
33
34static inline u_int32_t
35xt_cluster_hash_ipv6(const void *ip, const struct xt_cluster_match_info *info)
36{
37 return jhash2(ip, NF_CT_TUPLE_L3SIZE / sizeof(__u32), info->hash_seed);
38}
39
40static inline u_int32_t
41xt_cluster_hash(const struct nf_conn *ct,
42 const struct xt_cluster_match_info *info)
43{
44 u_int32_t hash = 0;
45
46 switch(nf_ct_l3num(ct)) {
47 case AF_INET:
48 hash = xt_cluster_hash_ipv4(nf_ct_orig_ipv4_src(ct), info);
49 break;
50 case AF_INET6:
51 hash = xt_cluster_hash_ipv6(nf_ct_orig_ipv6_src(ct), info);
52 break;
53 default:
54 WARN_ON(1);
55 break;
56 }
57 return (((u64)hash * info->total_nodes) >> 32);
58}
59
60static inline bool
61xt_cluster_is_multicast_addr(const struct sk_buff *skb, u_int8_t family)
62{
63 bool is_multicast = false;
64
65 switch(family) {
66 case NFPROTO_IPV4:
67 is_multicast = ipv4_is_multicast(ip_hdr(skb)->daddr);
68 break;
69 case NFPROTO_IPV6:
70 is_multicast = ipv6_addr_type(&ipv6_hdr(skb)->daddr) &
71 IPV6_ADDR_MULTICAST;
72 break;
73 default:
74 WARN_ON(1);
75 break;
76 }
77 return is_multicast;
78}
79
80static bool
81xt_cluster_mt(const struct sk_buff *skb, const struct xt_match_param *par)
82{
83 struct sk_buff *pskb = (struct sk_buff *)skb;
84 const struct xt_cluster_match_info *info = par->matchinfo;
85 const struct nf_conn *ct;
86 enum ip_conntrack_info ctinfo;
87 unsigned long hash;
88
89 /* This match assumes that all nodes see the same packets. This can be
90 * achieved if the switch that connects the cluster nodes support some
91 * sort of 'port mirroring'. However, if your switch does not support
92 * this, your cluster nodes can reply ARP request using a multicast MAC
93 * address. Thus, your switch will flood the same packets to the
94 * cluster nodes with the same multicast MAC address. Using a multicast
95 * link address is a RFC 1812 (section 3.3.2) violation, but this works
96 * fine in practise.
97 *
98 * Unfortunately, if you use the multicast MAC address, the link layer
99 * sets skbuff's pkt_type to PACKET_MULTICAST, which is not accepted
100 * by TCP and others for packets coming to this node. For that reason,
101 * this match mangles skbuff's pkt_type if it detects a packet
102 * addressed to a unicast address but using PACKET_MULTICAST. Yes, I
103 * know, matches should not alter packets, but we are doing this here
104 * because we would need to add a PKTTYPE target for this sole purpose.
105 */
106 if (!xt_cluster_is_multicast_addr(skb, par->family) &&
107 skb->pkt_type == PACKET_MULTICAST) {
108 pskb->pkt_type = PACKET_HOST;
109 }
110
111 ct = nf_ct_get(skb, &ctinfo);
112 if (ct == NULL)
113 return false;
114
115 if (ct == &nf_conntrack_untracked)
116 return false;
117
118 if (ct->master)
119 hash = xt_cluster_hash(ct->master, info);
120 else
121 hash = xt_cluster_hash(ct, info);
122
123 return !!((1 << hash) & info->node_mask) ^
124 !!(info->flags & XT_CLUSTER_F_INV);
125}
126
127static bool xt_cluster_mt_checkentry(const struct xt_mtchk_param *par)
128{
129 struct xt_cluster_match_info *info = par->matchinfo;
130
131 if (info->node_mask >= (1 << info->total_nodes)) {
132 printk(KERN_ERR "xt_cluster: this node mask cannot be "
133 "higher than the total number of nodes\n");
134 return false;
135 }
136 return true;
137}
138
139static struct xt_match xt_cluster_match __read_mostly = {
140 .name = "cluster",
141 .family = NFPROTO_UNSPEC,
142 .match = xt_cluster_mt,
143 .checkentry = xt_cluster_mt_checkentry,
144 .matchsize = sizeof(struct xt_cluster_match_info),
145 .me = THIS_MODULE,
146};
147
148static int __init xt_cluster_mt_init(void)
149{
150 return xt_register_match(&xt_cluster_match);
151}
152
153static void __exit xt_cluster_mt_fini(void)
154{
155 xt_unregister_match(&xt_cluster_match);
156}
157
158MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
159MODULE_LICENSE("GPL");
160MODULE_DESCRIPTION("Xtables: hash-based cluster match");
161MODULE_ALIAS("ipt_cluster");
162MODULE_ALIAS("ip6t_cluster");
163module_init(xt_cluster_mt_init);
164module_exit(xt_cluster_mt_fini);