author	David S. Miller <davem@davemloft.net>	2014-03-17 15:06:24 -0400
committer	David S. Miller <davem@davemloft.net>	2014-03-17 15:06:24 -0400
commit	e86e180b824e00733bd0e499d412a595078f9b51 (patch)
tree	ebda350b99785b4d0dd0188dd28fa17ec8135474
parent	e7ef085d0a9dc1cc72e7d8108ed3b4e1a5e8d938 (diff)
parent	7d08487777c8b30dea34790734d708470faaf1e5 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next
Pablo Neira Ayuso says:

====================
Netfilter/IPVS updates for net-next

The following patchset contains Netfilter/IPVS updates for net-next,
most relevantly they are:

* cleanup to remove double semicolon, from Stephen Hemminger.

* calm down sparse warning in xt_ipcomp, from Fan Du.

* nf_ct_labels support for nf_tables, from Florian Westphal.

* new macros to simplify rcu dereferences in the scope of nfnetlink
  and nf_tables, from Patrick McHardy.

* Accept queue and drop (including reason for drop) to verdict parsing
  in nf_tables, also from Patrick.

* Remove unused random seed initialization in nfnetlink_log, from
  Florian Westphal.

* Allow to attach user-specific information to nf_tables rules, useful
  to attach user comments to rules, from me.

* Return errors in ipset according to the manpage documentation, from
  Jozsef Kadlecsik.

* Fix coccinelle warnings related to incorrect bool type usage in
  ipset, from Fengguang Wu.

* Add hash:ip,mark set type to ipset, from Vytas Dauksa.

* Fix the message printed by ipset for each netns that is created,
  from Ilia Mirkin.

* Add forceadd option to ipset, which evicts a random entry from the
  set if it becomes full, from Josh Hunt.

* Minor IPVS cleanups and fixes from Andi Kleen and Tingwei Liu.

* Improve conntrack scalability by removing a central spinlock,
  original work from Eric Dumazet. Jesper Dangaard Brouer took it over
  to address the remaining issues. Several patches to prepare this
  change come in first place. (A sketch of the lock-array idea follows
  this message.)

* Rework nft_hash to resolve bugs (leaking chain, missing rcu
  synchronization on element removal, etc.), from Patrick McHardy.

* Restore context in the rule deletion path, as we now release rule
  objects synchronously, from Patrick McHardy. This gets back event
  notification for anonymous sets.

* Fix NAT family validation in nft_nat, also from Patrick.

* Improve scalability of xt_connlimit by using an array of spinlocks
  and by introducing an rb-tree of hashtables for faster lookup of
  accounted objects per network. This patch was preceded by several
  patches and refactorings to accommodate this change, including the
  use of kmem_cache, from Florian Westphal.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
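The headline change is the conntrack locking rework: the single global
nf_conntrack_lock becomes an array of hashed spinlocks (see the
nf_conntrack_core.h hunk below). A minimal sketch of that lock-striping
pattern, with illustrative names rather than the kernel's actual helpers:

	/* Lock striping: one lock per group of hash buckets instead of a
	 * single global lock. Names here are illustrative only. */
	#include <linux/spinlock.h>

	#define BUCKET_LOCKS	1024	/* the kernel uses CONNTRACK_LOCKS */

	static spinlock_t bucket_locks[BUCKET_LOCKS];

	static inline spinlock_t *lock_for_hash(unsigned int hash)
	{
		/* entries hashing to different buckets can now be inserted
		 * or removed in parallel unless they share a lock slot */
		return &bucket_locks[hash % BUCKET_LOCKS];
	}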
 include/linux/netfilter/ipset/ip_set.h       |  15
 include/linux/netfilter/nfnetlink.h          |  21
 include/net/netfilter/nf_conntrack.h         |  11
 include/net/netfilter/nf_conntrack_core.h    |   9
 include/net/netfilter/nf_conntrack_labels.h  |   4
 include/net/netfilter/nf_tables.h            |  28
 include/net/netns/conntrack.h                |  13
 include/uapi/linux/netfilter/ipset/ip_set.h  |  12
 include/uapi/linux/netfilter/nf_tables.h     |   6
 net/ipv4/netfilter.c                         |   2
 net/netfilter/ipset/Kconfig                  |   9
 net/netfilter/ipset/Makefile                 |   1
 net/netfilter/ipset/ip_set_core.c            |  54
 net/netfilter/ipset/ip_set_hash_gen.h        |  43
 net/netfilter/ipset/ip_set_hash_ip.c         |   3
 net/netfilter/ipset/ip_set_hash_ipmark.c     | 321
 net/netfilter/ipset/ip_set_hash_ipport.c     |   3
 net/netfilter/ipset/ip_set_hash_ipportip.c   |   3
 net/netfilter/ipset/ip_set_hash_ipportnet.c  |   3
 net/netfilter/ipset/ip_set_hash_net.c        |   3
 net/netfilter/ipset/ip_set_hash_netiface.c   |   3
 net/netfilter/ipset/ip_set_hash_netnet.c     |  10
 net/netfilter/ipset/ip_set_hash_netport.c    |   3
 net/netfilter/ipset/ip_set_hash_netportnet.c |   3
 net/netfilter/ipset/pfxlen.c                 |   4
 net/netfilter/ipvs/ip_vs_ctl.c               |   2
 net/netfilter/ipvs/ip_vs_lblc.c              |  13
 net/netfilter/nf_conntrack_core.c            | 432
 net/netfilter/nf_conntrack_expect.c          |  36
 net/netfilter/nf_conntrack_h323_main.c       |   4
 net/netfilter/nf_conntrack_helper.c          |  41
 net/netfilter/nf_conntrack_netlink.c         | 133
 net/netfilter/nf_conntrack_sip.c             |   8
 net/netfilter/nf_tables_api.c                |  80
 net/netfilter/nfnetlink.c                    |   8
 net/netfilter/nfnetlink_log.c                |   8
 net/netfilter/nft_compat.c                   |   4
 net/netfilter/nft_ct.c                       |  36
 net/netfilter/nft_hash.c                     | 260
 net/netfilter/nft_immediate.c                |   3
 net/netfilter/nft_log.c                      |   3
 net/netfilter/nft_lookup.c                   |   5
 net/netfilter/nft_nat.c                      |  22
 net/netfilter/xt_AUDIT.c                     |   4
 net/netfilter/xt_connlimit.c                 | 311
 net/netfilter/xt_ipcomp.c                    |   2
 46 files changed, 1527 insertions(+), 475 deletions(-)
diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 0c7d01eae56c..96afc29184be 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -39,11 +39,13 @@ enum ip_set_feature {
 	IPSET_TYPE_NAME = (1 << IPSET_TYPE_NAME_FLAG),
 	IPSET_TYPE_IFACE_FLAG = 5,
 	IPSET_TYPE_IFACE = (1 << IPSET_TYPE_IFACE_FLAG),
-	IPSET_TYPE_NOMATCH_FLAG = 6,
+	IPSET_TYPE_MARK_FLAG = 6,
+	IPSET_TYPE_MARK = (1 << IPSET_TYPE_MARK_FLAG),
+	IPSET_TYPE_NOMATCH_FLAG = 7,
 	IPSET_TYPE_NOMATCH = (1 << IPSET_TYPE_NOMATCH_FLAG),
 	/* Strictly speaking not a feature, but a flag for dumping:
 	 * this settype must be dumped last */
-	IPSET_DUMP_LAST_FLAG = 7,
+	IPSET_DUMP_LAST_FLAG = 8,
 	IPSET_DUMP_LAST = (1 << IPSET_DUMP_LAST_FLAG),
 };
 
@@ -63,6 +65,7 @@ enum ip_set_extension {
 #define SET_WITH_TIMEOUT(s)	((s)->extensions & IPSET_EXT_TIMEOUT)
 #define SET_WITH_COUNTER(s)	((s)->extensions & IPSET_EXT_COUNTER)
 #define SET_WITH_COMMENT(s)	((s)->extensions & IPSET_EXT_COMMENT)
+#define SET_WITH_FORCEADD(s)	((s)->flags & IPSET_CREATE_FLAG_FORCEADD)
 
 /* Extension id, in size order */
 enum ip_set_ext_id {
@@ -171,8 +174,6 @@ struct ip_set_type {
 	char name[IPSET_MAXNAMELEN];
 	/* Protocol version */
 	u8 protocol;
-	/* Set features to control swapping */
-	u8 features;
 	/* Set type dimension */
 	u8 dimension;
 	/*
@@ -182,6 +183,8 @@ struct ip_set_type {
 	u8 family;
 	/* Type revisions */
 	u8 revision_min, revision_max;
+	/* Set features to control swapping */
+	u16 features;
 
 	/* Create set */
 	int (*create)(struct net *net, struct ip_set *set,
@@ -217,6 +220,8 @@ struct ip_set {
 	u8 revision;
 	/* Extensions */
 	u8 extensions;
+	/* Create flags */
+	u8 flags;
 	/* Default timeout value, if enabled */
 	u32 timeout;
 	/* Element data size */
@@ -251,6 +256,8 @@ ip_set_put_flags(struct sk_buff *skb, struct ip_set *set)
 		cadt_flags |= IPSET_FLAG_WITH_COUNTERS;
 	if (SET_WITH_COMMENT(set))
 		cadt_flags |= IPSET_FLAG_WITH_COMMENT;
+	if (SET_WITH_FORCEADD(set))
+		cadt_flags |= IPSET_FLAG_WITH_FORCEADD;
 
 	if (!cadt_flags)
 		return 0;
diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 28c74367e900..e955d4730625 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -44,6 +44,27 @@ int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid,
 
 void nfnl_lock(__u8 subsys_id);
 void nfnl_unlock(__u8 subsys_id);
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_nfnl_is_held(__u8 subsys_id);
+#else
+static inline int lockdep_nfnl_is_held(__u8 subsys_id)
+{
+	return 1;
+}
+#endif /* CONFIG_PROVE_LOCKING */
+
+/*
+ * nfnl_dereference - fetch RCU pointer when updates are prevented by subsys mutex
+ *
+ * @p: The pointer to read, prior to dereferencing
+ * @ss: The nfnetlink subsystem ID
+ *
+ * Return the value of the specified RCU-protected pointer, but omit
+ * both the smp_read_barrier_depends() and the ACCESS_ONCE(), because
+ * caller holds the NFNL subsystem mutex.
+ */
+#define nfnl_dereference(p, ss)					\
+	rcu_dereference_protected(p, lockdep_nfnl_is_held(ss))
 
 #define MODULE_ALIAS_NFNL_SUBSYS(subsys) \
 	MODULE_ALIAS("nfnetlink-subsys-" __stringify(subsys))
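The new nfnl_dereference() pairs with the lockdep helper above. A
hypothetical subsystem that replaces an RCU-protected pointer while
holding its nfnetlink mutex would use it roughly like this (struct foo,
foo_ptr and foo_replace() are invented for illustration; the helper calls
are real kernel APIs):

	struct foo {
		struct rcu_head rcu;
		int val;
	};

	static struct foo __rcu *foo_ptr;

	static void foo_replace(struct foo *new)
	{
		struct foo *old;

		nfnl_lock(NFNL_SUBSYS_NONE);
		/* safe: updates are excluded by the subsystem mutex, so no
		 * read barrier is needed -- exactly what the macro omits */
		old = nfnl_dereference(foo_ptr, NFNL_SUBSYS_NONE);
		rcu_assign_pointer(foo_ptr, new);
		nfnl_unlock(NFNL_SUBSYS_NONE);
		if (old)
			kfree_rcu(old, rcu);
	}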
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index b2ac6246b7e0..37252f71a380 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -73,10 +73,17 @@ struct nf_conn_help {
 
 struct nf_conn {
 	/* Usage count in here is 1 for hash table/destruct timer, 1 per skb,
-	   plus 1 for any connection(s) we are `master' for */
+	 * plus 1 for any connection(s) we are `master' for
+	 *
+	 * Hint, SKB address this struct and refcnt via skb->nfct and
+	 * helpers nf_conntrack_get() and nf_conntrack_put().
+	 * Helper nf_ct_put() equals nf_conntrack_put() by dec refcnt,
+	 * beware nf_ct_get() is different and don't inc refcnt.
+	 */
 	struct nf_conntrack ct_general;
 
 	spinlock_t lock;
+	u16 cpu;
 
 	/* XXX should I move this to the tail ? - Y.K */
 	/* These are my tuples; original and reply */
diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 15308b8eb5b5..cc0c18827602 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -77,6 +77,13 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
 		 const struct nf_conntrack_l3proto *l3proto,
 		 const struct nf_conntrack_l4proto *proto);
 
-extern spinlock_t nf_conntrack_lock ;
+#ifdef CONFIG_LOCKDEP
+# define CONNTRACK_LOCKS 8
+#else
+# define CONNTRACK_LOCKS 1024
+#endif
+extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
+
+extern spinlock_t nf_conntrack_expect_lock;
 
 #endif /* _NF_CONNTRACK_CORE_H */
diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h
index c985695283b3..dec6336bf850 100644
--- a/include/net/netfilter/nf_conntrack_labels.h
+++ b/include/net/netfilter/nf_conntrack_labels.h
@@ -7,6 +7,8 @@
 
 #include <uapi/linux/netfilter/xt_connlabel.h>
 
+#define NF_CT_LABELS_MAX_SIZE ((XT_CONNLABEL_MAXBIT + 1) / BITS_PER_BYTE)
+
 struct nf_conn_labels {
 	u8 words;
 	unsigned long bits[];
@@ -29,7 +31,7 @@ static inline struct nf_conn_labels *nf_ct_labels_ext_add(struct nf_conn *ct)
 	u8 words;
 
 	words = ACCESS_ONCE(net->ct.label_words);
-	if (words == 0 || WARN_ON_ONCE(words > 8))
+	if (words == 0)
 		return NULL;
 
 	cl_ext = nf_ct_ext_add_length(ct, NF_CT_EXT_LABELS,
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index e7e14ffe0f6a..e6bc14d8fa9a 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -3,6 +3,7 @@
 
 #include <linux/list.h>
 #include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netlink.h>
@@ -288,7 +289,8 @@ struct nft_expr_ops {
 	int				(*init)(const struct nft_ctx *ctx,
 						const struct nft_expr *expr,
 						const struct nlattr * const tb[]);
-	void				(*destroy)(const struct nft_expr *expr);
+	void				(*destroy)(const struct nft_ctx *ctx,
+						   const struct nft_expr *expr);
 	int				(*dump)(struct sk_buff *skb,
 						const struct nft_expr *expr);
 	int				(*validate)(const struct nft_ctx *ctx,
@@ -325,13 +327,15 @@ static inline void *nft_expr_priv(const struct nft_expr *expr)
 * @handle: rule handle
 * @genmask: generation mask
 * @dlen: length of expression data
+ * @ulen: length of user data (used for comments)
 * @data: expression data
 */
 struct nft_rule {
 	struct list_head		list;
-	u64				handle:46,
+	u64				handle:42,
 					genmask:2,
-					dlen:16;
+					dlen:12,
+					ulen:8;
 	unsigned char			data[]
 		__attribute__((aligned(__alignof__(struct nft_expr))));
 };
@@ -340,19 +344,13 @@ struct nft_rule {
 * struct nft_rule_trans - nf_tables rule update in transaction
 *
 * @list: used internally
+ * @ctx: rule context
 * @rule: rule that needs to be updated
- * @chain: chain that this rule belongs to
- * @table: table for which this chain applies
- * @nlh: netlink header of the message that contain this update
- * @family: family expressesed as AF_*
 */
 struct nft_rule_trans {
 	struct list_head		list;
+	struct nft_ctx			ctx;
 	struct nft_rule			*rule;
-	const struct nft_chain		*chain;
-	const struct nft_table		*table;
-	const struct nlmsghdr		*nlh;
-	u8				family;
 };
 
 static inline struct nft_expr *nft_expr_first(const struct nft_rule *rule)
@@ -370,6 +368,11 @@ static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule)
 	return (struct nft_expr *)&rule->data[rule->dlen];
 }
 
+static inline void *nft_userdata(const struct nft_rule *rule)
+{
+	return (void *)&rule->data[rule->dlen];
+}
+
 /*
 * The last pointer isn't really necessary, but the compiler isn't able to
 * determine that the result of nft_expr_last() is always the same since it
@@ -521,6 +524,9 @@ void nft_unregister_chain_type(const struct nf_chain_type *);
 int nft_register_expr(struct nft_expr_type *);
 void nft_unregister_expr(struct nft_expr_type *);
 
+#define nft_dereference(p)					\
+	nfnl_dereference(p, NFNL_SUBSYS_NFTABLES)
+
 #define MODULE_ALIAS_NFT_FAMILY(family)	\
 	MODULE_ALIAS("nft-afinfo-" __stringify(family))
 
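With the new ulen field, a rule is allocated as a single blob: the
expressions come first and the optional user data sits directly behind
them, which is what nft_userdata() returns. A hypothetical size helper
makes the layout explicit (illustrative only; the real allocation lives
in nf_tables_api.c):

	static size_t nft_rule_blob_size(size_t expr_len, size_t user_len)
	{
		/* data[0 .. dlen)            expressions
		 * data[dlen .. dlen + ulen)  user data, see nft_userdata() */
		return sizeof(struct nft_rule) + expr_len + user_len;
	}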
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index fbcc7fa536dc..773cce308bc6 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -5,6 +5,7 @@
 #include <linux/list_nulls.h>
 #include <linux/atomic.h>
 #include <linux/netfilter/nf_conntrack_tcp.h>
+#include <linux/seqlock.h>
 
 struct ctl_table_header;
 struct nf_conntrack_ecache;
@@ -62,6 +63,13 @@ struct nf_ip_net {
 #endif
 };
 
+struct ct_pcpu {
+	spinlock_t		lock;
+	struct hlist_nulls_head unconfirmed;
+	struct hlist_nulls_head dying;
+	struct hlist_nulls_head tmpl;
+};
+
 struct netns_ct {
 	atomic_t		count;
 	unsigned int		expect_count;
@@ -83,12 +91,11 @@ struct netns_ct {
 	int			sysctl_checksum;
 
 	unsigned int		htable_size;
+	seqcount_t		generation;
 	struct kmem_cache	*nf_conntrack_cachep;
 	struct hlist_nulls_head	*hash;
 	struct hlist_head	*expect_hash;
-	struct hlist_nulls_head	unconfirmed;
-	struct hlist_nulls_head	dying;
-	struct hlist_nulls_head	tmpl;
+	struct ct_pcpu __percpu *pcpu_lists;
 	struct ip_conntrack_stat __percpu *stat;
 	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
 	struct nf_exp_event_notifier __rcu *nf_expect_event_cb;
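The per-netns unconfirmed/dying/tmpl lists move into the new ct_pcpu
structure, one instance per CPU, each guarded by its own lock. A
simplified sketch of how an entry lands on the local CPU's unconfirmed
list (modelled on nf_conntrack_core.c, names abbreviated and locking
details elided):

	static void ct_add_unconfirmed(struct net *net, struct nf_conn *ct)
	{
		struct ct_pcpu *pcpu;

		/* record the owner CPU in the new nf_conn.cpu field so the
		 * entry can later be removed under the same per-cpu lock */
		ct->cpu = smp_processor_id();
		pcpu = per_cpu_ptr(net->ct.pcpu_lists, ct->cpu);

		spin_lock(&pcpu->lock);
		hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				     &pcpu->unconfirmed);
		spin_unlock(&pcpu->lock);
	}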
diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h
index 25d3b2f79c02..78c2f2e79920 100644
--- a/include/uapi/linux/netfilter/ipset/ip_set.h
+++ b/include/uapi/linux/netfilter/ipset/ip_set.h
@@ -82,6 +82,8 @@ enum {
 	IPSET_ATTR_PROTO,	/* 7 */
 	IPSET_ATTR_CADT_FLAGS,	/* 8 */
 	IPSET_ATTR_CADT_LINENO = IPSET_ATTR_LINENO,	/* 9 */
+	IPSET_ATTR_MARK,	/* 10 */
+	IPSET_ATTR_MARKMASK,	/* 11 */
 	/* Reserve empty slots */
 	IPSET_ATTR_CADT_MAX = 16,
 	/* Create-only specific attributes */
@@ -144,6 +146,7 @@ enum ipset_errno {
 	IPSET_ERR_IPADDR_IPV6,
 	IPSET_ERR_COUNTER,
 	IPSET_ERR_COMMENT,
+	IPSET_ERR_INVALID_MARKMASK,
 
 	/* Type specific error codes */
 	IPSET_ERR_TYPE_SPECIFIC = 4352,
@@ -182,9 +185,18 @@ enum ipset_cadt_flags {
 	IPSET_FLAG_WITH_COUNTERS = (1 << IPSET_FLAG_BIT_WITH_COUNTERS),
 	IPSET_FLAG_BIT_WITH_COMMENT = 4,
 	IPSET_FLAG_WITH_COMMENT = (1 << IPSET_FLAG_BIT_WITH_COMMENT),
+	IPSET_FLAG_BIT_WITH_FORCEADD = 5,
+	IPSET_FLAG_WITH_FORCEADD = (1 << IPSET_FLAG_BIT_WITH_FORCEADD),
 	IPSET_FLAG_CADT_MAX = 15,
 };
 
+/* The flag bits which correspond to the non-extension create flags */
+enum ipset_create_flags {
+	IPSET_CREATE_FLAG_BIT_FORCEADD = 0,
+	IPSET_CREATE_FLAG_FORCEADD = (1 << IPSET_CREATE_FLAG_BIT_FORCEADD),
+	IPSET_CREATE_FLAG_BIT_MAX = 7,
+};
+
 /* Commands with settype-specific attributes */
 enum ipset_adt {
 	IPSET_ADD,
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 83c985a6170b..c88ccbfda5f1 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1,7 +1,8 @@
 #ifndef _LINUX_NF_TABLES_H
 #define _LINUX_NF_TABLES_H
 
 #define NFT_CHAIN_MAXNAMELEN 32
+#define NFT_USERDATA_MAXLEN 256
 
 enum nft_registers {
 	NFT_REG_VERDICT,
@@ -156,6 +157,7 @@ enum nft_chain_attributes {
 * @NFTA_RULE_EXPRESSIONS: list of expressions (NLA_NESTED: nft_expr_attributes)
 * @NFTA_RULE_COMPAT: compatibility specifications of the rule (NLA_NESTED: nft_rule_compat_attributes)
 * @NFTA_RULE_POSITION: numeric handle of the previous rule (NLA_U64)
+ * @NFTA_RULE_USERDATA: user data (NLA_BINARY, NFT_USERDATA_MAXLEN)
 */
 enum nft_rule_attributes {
 	NFTA_RULE_UNSPEC,
@@ -165,6 +167,7 @@ enum nft_rule_attributes {
 	NFTA_RULE_EXPRESSIONS,
 	NFTA_RULE_COMPAT,
 	NFTA_RULE_POSITION,
+	NFTA_RULE_USERDATA,
 	__NFTA_RULE_MAX
 };
 #define NFTA_RULE_MAX		(__NFTA_RULE_MAX - 1)
@@ -601,6 +604,7 @@ enum nft_ct_keys {
 	NFT_CT_PROTOCOL,
 	NFT_CT_PROTO_SRC,
 	NFT_CT_PROTO_DST,
+	NFT_CT_LABELS,
 };
 
 /**
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index c3e0adea9c27..7ebd6e37875c 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -61,7 +61,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
 		skb_dst_set(skb, NULL);
 		dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
 		if (IS_ERR(dst))
-			return PTR_ERR(dst);;
+			return PTR_ERR(dst);
 		skb_dst_set(skb, dst);
 	}
 #endif
diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig
index 44cd4f58adf0..2f7f5c32c6f9 100644
--- a/net/netfilter/ipset/Kconfig
+++ b/net/netfilter/ipset/Kconfig
@@ -61,6 +61,15 @@ config IP_SET_HASH_IP
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP_SET_HASH_IPMARK
+	tristate "hash:ip,mark set support"
+	depends on IP_SET
+	help
+	  This option adds the hash:ip,mark set type support, by which one
+	  can store IPv4/IPv6 address and mark pairs.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config IP_SET_HASH_IPPORT
 	tristate "hash:ip,port set support"
 	depends on IP_SET
diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile
index 44b2d38476fa..231f10196cb9 100644
--- a/net/netfilter/ipset/Makefile
+++ b/net/netfilter/ipset/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o
 
 # hash types
 obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o
+obj-$(CONFIG_IP_SET_HASH_IPMARK) += ip_set_hash_ipmark.o
 obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o
 obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o
 obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index de770ec39e51..117208321f16 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -54,10 +54,10 @@ MODULE_DESCRIPTION("core IP set support");
 MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
 
 /* When the nfnl mutex is held: */
-#define nfnl_dereference(p)		\
+#define ip_set_dereference(p)		\
 	rcu_dereference_protected(p, 1)
-#define nfnl_set(inst, id)		\
-	nfnl_dereference((inst)->ip_set_list)[id]
+#define ip_set(inst, id)		\
+	ip_set_dereference((inst)->ip_set_list)[id]
 
 /*
 * The set types are implemented in modules and registered set types
@@ -368,6 +368,8 @@ ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len)
 
 	if (tb[IPSET_ATTR_CADT_FLAGS])
 		cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+	if (cadt_flags & IPSET_FLAG_WITH_FORCEADD)
+		set->flags |= IPSET_CREATE_FLAG_FORCEADD;
 	for (id = 0; id < IPSET_EXT_ID_MAX; id++) {
 		if (!add_extension(id, cadt_flags, tb))
 			continue;
@@ -510,7 +512,7 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
 
 	if (opt->dim < set->type->dimension ||
 	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
-		return 0;
+		return -IPSET_ERR_TYPE_MISMATCH;
 
 	write_lock_bh(&set->lock);
 	ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
@@ -533,7 +535,7 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
 
 	if (opt->dim < set->type->dimension ||
 	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
-		return 0;
+		return -IPSET_ERR_TYPE_MISMATCH;
 
 	write_lock_bh(&set->lock);
 	ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
@@ -640,7 +642,7 @@ ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index)
 		return IPSET_INVALID_ID;
 
 	nfnl_lock(NFNL_SUBSYS_IPSET);
-	set = nfnl_set(inst, index);
+	set = ip_set(inst, index);
 	if (set)
 		__ip_set_get(set);
 	else
@@ -666,7 +668,7 @@ ip_set_nfnl_put(struct net *net, ip_set_id_t index)
 
 	nfnl_lock(NFNL_SUBSYS_IPSET);
 	if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */
-		set = nfnl_set(inst, index);
+		set = ip_set(inst, index);
 		if (set != NULL)
 			__ip_set_put(set);
 	}
@@ -734,7 +736,7 @@ find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id)
 
 	*id = IPSET_INVALID_ID;
 	for (i = 0; i < inst->ip_set_max; i++) {
-		set = nfnl_set(inst, i);
+		set = ip_set(inst, i);
 		if (set != NULL && STREQ(set->name, name)) {
 			*id = i;
 			break;
@@ -760,7 +762,7 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index,
 
 	*index = IPSET_INVALID_ID;
 	for (i = 0; i < inst->ip_set_max; i++) {
-		s = nfnl_set(inst, i);
+		s = ip_set(inst, i);
 		if (s == NULL) {
 			if (*index == IPSET_INVALID_ID)
 				*index = i;
@@ -883,7 +885,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 		if (!list)
 			goto cleanup;
 		/* nfnl mutex is held, both lists are valid */
-		tmp = nfnl_dereference(inst->ip_set_list);
+		tmp = ip_set_dereference(inst->ip_set_list);
 		memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max);
 		rcu_assign_pointer(inst->ip_set_list, list);
 		/* Make sure all current packets have passed through */
@@ -900,7 +902,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 	 * Finally! Add our shiny new set to the list, and be done.
 	 */
 	pr_debug("create: '%s' created with index %u!\n", set->name, index);
-	nfnl_set(inst, index) = set;
+	ip_set(inst, index) = set;
 
 	return ret;
 
@@ -925,10 +927,10 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {
 static void
 ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index)
 {
-	struct ip_set *set = nfnl_set(inst, index);
+	struct ip_set *set = ip_set(inst, index);
 
 	pr_debug("set: %s\n", set->name);
-	nfnl_set(inst, index) = NULL;
+	ip_set(inst, index) = NULL;
 
 	/* Must call it without holding any lock */
 	set->variant->destroy(set);
@@ -962,7 +964,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
 	read_lock_bh(&ip_set_ref_lock);
 	if (!attr[IPSET_ATTR_SETNAME]) {
 		for (i = 0; i < inst->ip_set_max; i++) {
-			s = nfnl_set(inst, i);
+			s = ip_set(inst, i);
 			if (s != NULL && s->ref) {
 				ret = -IPSET_ERR_BUSY;
 				goto out;
@@ -970,7 +972,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
 		}
 		read_unlock_bh(&ip_set_ref_lock);
 		for (i = 0; i < inst->ip_set_max; i++) {
-			s = nfnl_set(inst, i);
+			s = ip_set(inst, i);
 			if (s != NULL)
 				ip_set_destroy_set(inst, i);
 		}
@@ -1020,7 +1022,7 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
 
 	if (!attr[IPSET_ATTR_SETNAME]) {
 		for (i = 0; i < inst->ip_set_max; i++) {
-			s = nfnl_set(inst, i);
+			s = ip_set(inst, i);
 			if (s != NULL)
 				ip_set_flush_set(s);
 		}
@@ -1074,7 +1076,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
 
 	name2 = nla_data(attr[IPSET_ATTR_SETNAME2]);
 	for (i = 0; i < inst->ip_set_max; i++) {
-		s = nfnl_set(inst, i);
+		s = ip_set(inst, i);
 		if (s != NULL && STREQ(s->name, name2)) {
 			ret = -IPSET_ERR_EXIST_SETNAME2;
 			goto out;
@@ -1134,8 +1136,8 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
 
 	write_lock_bh(&ip_set_ref_lock);
 	swap(from->ref, to->ref);
-	nfnl_set(inst, from_id) = to;
-	nfnl_set(inst, to_id) = from;
+	ip_set(inst, from_id) = to;
+	ip_set(inst, to_id) = from;
 	write_unlock_bh(&ip_set_ref_lock);
 
 	return 0;
@@ -1157,7 +1159,7 @@ ip_set_dump_done(struct netlink_callback *cb)
 	struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET];
 	if (cb->args[IPSET_CB_ARG0]) {
 		pr_debug("release set %s\n",
-			 nfnl_set(inst, cb->args[IPSET_CB_INDEX])->name);
+			 ip_set(inst, cb->args[IPSET_CB_INDEX])->name);
 		__ip_set_put_byindex(inst,
 			(ip_set_id_t) cb->args[IPSET_CB_INDEX]);
 	}
@@ -1254,7 +1256,7 @@ dump_last:
 		 dump_type, dump_flags, cb->args[IPSET_CB_INDEX]);
 	for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) {
 		index = (ip_set_id_t) cb->args[IPSET_CB_INDEX];
-		set = nfnl_set(inst, index);
+		set = ip_set(inst, index);
 		if (set == NULL) {
 			if (dump_type == DUMP_ONE) {
 				ret = -ENOENT;
@@ -1332,7 +1334,7 @@ next_set:
 release_refcount:
 	/* If there was an error or set is done, release set */
 	if (ret || !cb->args[IPSET_CB_ARG0]) {
-		pr_debug("release set %s\n", nfnl_set(inst, index)->name);
+		pr_debug("release set %s\n", ip_set(inst, index)->name);
 		__ip_set_put_byindex(inst, index);
 		cb->args[IPSET_CB_ARG0] = 0;
 	}
@@ -1887,7 +1889,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
 		find_set_and_id(inst, req_get->set.name, &id);
 		req_get->set.index = id;
 		if (id != IPSET_INVALID_ID)
-			req_get->family = nfnl_set(inst, id)->family;
+			req_get->family = ip_set(inst, id)->family;
 		nfnl_unlock(NFNL_SUBSYS_IPSET);
 		goto copy;
 	}
@@ -1901,7 +1903,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
 			goto done;
 		}
 		nfnl_lock(NFNL_SUBSYS_IPSET);
-		set = nfnl_set(inst, req_get->set.index);
+		set = ip_set(inst, req_get->set.index);
 		strncpy(req_get->set.name, set ? set->name : "",
 			IPSET_MAXNAMELEN);
 		nfnl_unlock(NFNL_SUBSYS_IPSET);
@@ -1945,7 +1947,6 @@ ip_set_net_init(struct net *net)
 		return -ENOMEM;
 	inst->is_deleted = 0;
 	rcu_assign_pointer(inst->ip_set_list, list);
-	pr_notice("ip_set: protocol %u\n", IPSET_PROTOCOL);
 	return 0;
 }
 
@@ -1960,7 +1961,7 @@ ip_set_net_exit(struct net *net)
 	inst->is_deleted = 1; /* flag for ip_set_nfnl_put */
 
 	for (i = 0; i < inst->ip_set_max; i++) {
-		set = nfnl_set(inst, i);
+		set = ip_set(inst, i);
 		if (set != NULL)
 			ip_set_destroy_set(inst, i);
 	}
@@ -1996,6 +1997,7 @@ ip_set_init(void)
 		nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
 		return ret;
 	}
+	pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL);
 	return 0;
 }
 
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index be6932ad3a86..61c7fb052802 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -263,6 +263,9 @@ struct htype {
 	u32 maxelem;		/* max elements in the hash */
 	u32 elements;		/* current element (vs timeout) */
 	u32 initval;		/* random jhash init value */
+#ifdef IP_SET_HASH_WITH_MARKMASK
+	u32 markmask;		/* markmask value for mark mask to store */
+#endif
 	struct timer_list gc;	/* garbage collection when timeout enabled */
 	struct mtype_elem next; /* temporary storage for uadd */
 #ifdef IP_SET_HASH_WITH_MULTI
@@ -454,6 +457,9 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b)
 #ifdef IP_SET_HASH_WITH_NETMASK
 	       x->netmask == y->netmask &&
 #endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+	       x->markmask == y->markmask &&
+#endif
 	       a->extensions == b->extensions;
 }
 
@@ -627,6 +633,18 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	bool flag_exist = flags & IPSET_FLAG_EXIST;
 	u32 key, multi = 0;
 
+	if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set)) {
+		rcu_read_lock_bh();
+		t = rcu_dereference_bh(h->table);
+		key = HKEY(value, h->initval, t->htable_bits);
+		n = hbucket(t,key);
+		if (n->pos) {
+			/* Choosing the first entry in the array to replace */
+			j = 0;
+			goto reuse_slot;
+		}
+		rcu_read_unlock_bh();
+	}
 	if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem)
 		/* FIXME: when set is full, we slow down here */
 		mtype_expire(set, h, NLEN(set->family), set->dsize);
@@ -908,6 +926,10 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
 	    nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask))
 		goto nla_put_failure;
 #endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+	if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask))
+		goto nla_put_failure;
+#endif
 	if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
 	    nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
 		goto nla_put_failure;
@@ -1016,6 +1038,9 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
 			    struct nlattr *tb[], u32 flags)
 {
 	u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+#ifdef IP_SET_HASH_WITH_MARKMASK
+	u32 markmask;
+#endif
 	u8 hbits;
 #ifdef IP_SET_HASH_WITH_NETMASK
 	u8 netmask;
@@ -1026,6 +1051,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
 
 	if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))
 		return -IPSET_ERR_INVALID_FAMILY;
+
+#ifdef IP_SET_HASH_WITH_MARKMASK
+	markmask = 0xffffffff;
+#endif
 #ifdef IP_SET_HASH_WITH_NETMASK
 	netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
 	pr_debug("Create set %s with family %s\n",
@@ -1034,6 +1063,9 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
 
 	if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+#ifdef IP_SET_HASH_WITH_MARKMASK
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK) ||
+#endif
 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
 		return -IPSET_ERR_PROTOCOL;
@@ -1057,6 +1089,14 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
 			return -IPSET_ERR_INVALID_NETMASK;
 	}
 #endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+	if (tb[IPSET_ATTR_MARKMASK]) {
+		markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK]));
+
+		if ((markmask > 4294967295u) || markmask == 0)
+			return -IPSET_ERR_INVALID_MARKMASK;
+	}
+#endif
 
 	hsize = sizeof(*h);
 #ifdef IP_SET_HASH_WITH_NETS
@@ -1071,6 +1111,9 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
 #ifdef IP_SET_HASH_WITH_NETMASK
 	h->netmask = netmask;
 #endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+	h->markmask = markmask;
+#endif
 	get_random_bytes(&h->initval, sizeof(h->initval));
 	set->timeout = IPSET_NO_TIMEOUT;
 
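The forceadd path above reuses slot 0 of the bucket the new element
hashes into once the set is full, instead of failing the add. A
self-contained toy model of that eviction policy (not the kernel data
structures):

	#include <stdint.h>

	#define NBUCKETS	16
	#define BUCKET_SLOTS	4

	struct toy_bucket {
		uint32_t slot[BUCKET_SLOTS];
		unsigned int pos;		/* slots in use */
	};

	struct toy_set {
		struct toy_bucket buckets[NBUCKETS];
		unsigned int elements, maxelem;
		int forceadd;
	};

	static int toy_add(struct toy_set *s, uint32_t elem)
	{
		struct toy_bucket *b = &s->buckets[elem % NBUCKETS];

		if (s->elements >= s->maxelem) {
			if (s->forceadd && b->pos) {
				b->slot[0] = elem;	/* overwrite first entry */
				return 0;
			}
			return -1;			/* set full */
		}
		if (b->pos == BUCKET_SLOTS)
			return -1;			/* bucket full */
		b->slot[b->pos++] = elem;
		s->elements++;
		return 0;
	}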
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index e65fc2423d56..dd40607f878e 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -25,7 +25,8 @@
 
 #define IPSET_TYPE_REV_MIN	0
 /*				1    Counters support */
-#define IPSET_TYPE_REV_MAX	2 /* Comments support */
+/*				2    Comments support */
+#define IPSET_TYPE_REV_MAX	3 /* Forceadd support */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
new file mode 100644
index 000000000000..4eff0a297254
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -0,0 +1,321 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2013 Smoothwall Ltd. <vytas.dauksa@smoothwall.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,mark type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN	0
+#define IPSET_TYPE_REV_MAX	1 /* Forceadd support */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>");
+IP_SET_MODULE_DESC("hash:ip,mark", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip,mark");
+
+/* Type specific function prefix */
+#define HTYPE		hash_ipmark
+#define IP_SET_HASH_WITH_MARKMASK
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_ipmark4_elem {
+	__be32 ip;
+	__u32 mark;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1,
+			const struct hash_ipmark4_elem *ip2,
+			u32 *multi)
+{
+	return ip1->ip == ip2->ip &&
+	       ip1->mark == ip2->mark;
+}
+
+static bool
+hash_ipmark4_data_list(struct sk_buff *skb,
+		       const struct hash_ipmark4_elem *data)
+{
+	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+	    nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return 1;
+}
+
+static inline void
+hash_ipmark4_data_next(struct hash_ipmark4_elem *next,
+		       const struct hash_ipmark4_elem *d)
+{
+	next->ip = d->ip;
+}
+
+#define MTYPE		hash_ipmark4
+#define PF		4
+#define HOST_MASK	32
+#define HKEY_DATALEN	sizeof(struct hash_ipmark4_elem)
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipmark4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		  const struct xt_action_param *par,
+		  enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	const struct hash_ipmark *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ipmark4_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+	e.mark = skb->mark;
+	e.mark &= h->markmask;
+
+	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
+		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	const struct hash_ipmark *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ipmark4_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	u32 ip, ip_to = 0;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) ||
+	      ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK]));
+	e.mark &= h->markmask;
+
+	if (adt == IPSET_TEST ||
+	    !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) {
+		ret = adtfn(set, &e, &ext, &ext, flags);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	ip_to = ip = ntohl(e.ip);
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip > ip_to)
+			swap(ip, ip_to);
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (!cidr || cidr > 32)
+			return -IPSET_ERR_INVALID_CIDR;
+		ip_set_mask_from_to(ip, ip_to, cidr);
+	}
+
+	if (retried)
+		ip = ntohl(h->next.ip);
+	for (; !before(ip_to, ip); ip++) {
+		e.ip = htonl(ip);
+		ret = adtfn(set, &e, &ext, &ext, flags);
+
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		else
+			ret = 0;
+	}
+	return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_ipmark6_elem {
+	union nf_inet_addr ip;
+	__u32 mark;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1,
+			const struct hash_ipmark6_elem *ip2,
+			u32 *multi)
+{
+	return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) &&
+	       ip1->mark == ip2->mark;
+}
+
+static bool
+hash_ipmark6_data_list(struct sk_buff *skb,
+		       const struct hash_ipmark6_elem *data)
+{
+	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+	    nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return 1;
+}
+
+static inline void
+hash_ipmark6_data_next(struct hash_ipmark4_elem *next,
+		       const struct hash_ipmark6_elem *d)
+{
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+#undef HKEY_DATALEN
+
+#define MTYPE		hash_ipmark6
+#define PF		6
+#define HOST_MASK	128
+#define HKEY_DATALEN	sizeof(struct hash_ipmark6_elem)
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+
+static int
+hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		  const struct xt_action_param *par,
+		  enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	const struct hash_ipmark *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ipmark6_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+	e.mark = skb->mark;
+	e.mark &= h->markmask;
+
+	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[],
+		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	const struct hash_ipmark *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ipmark6_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+		     tb[IPSET_ATTR_IP_TO] ||
+		     tb[IPSET_ATTR_CIDR]))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+	      ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK]));
+	e.mark &= h->markmask;
+
+	if (adt == IPSET_TEST) {
+		ret = adtfn(set, &e, &ext, &ext, flags);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	ret = adtfn(set, &e, &ext, &ext, flags);
+	if (ret && !ip_set_eexist(ret, flags))
+		return ret;
+	else
+		ret = 0;
+
+	return ret;
+}
+
+static struct ip_set_type hash_ipmark_type __read_mostly = {
+	.name		= "hash:ip,mark",
+	.protocol	= IPSET_PROTOCOL,
+	.features	= IPSET_TYPE_IP | IPSET_TYPE_MARK,
+	.dimension	= IPSET_DIM_TWO,
+	.family		= NFPROTO_UNSPEC,
+	.revision_min	= IPSET_TYPE_REV_MIN,
+	.revision_max	= IPSET_TYPE_REV_MAX,
+	.create		= hash_ipmark_create,
+	.create_policy	= {
+		[IPSET_ATTR_MARKMASK]	= { .type = NLA_U32 },
+		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
+		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 },
+		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 },
+		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8 },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 },
+	},
+	.adt_policy	= {
+		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
+		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED },
+		[IPSET_ATTR_MARK]	= { .type = NLA_U32 },
+		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },
+		[IPSET_ATTR_BYTES]	= { .type = NLA_U64 },
+		[IPSET_ATTR_PACKETS]	= { .type = NLA_U64 },
+		[IPSET_ATTR_COMMENT]	= { .type = NLA_NUL_STRING },
+	},
+	.me		= THIS_MODULE,
+};
+
+static int __init
+hash_ipmark_init(void)
+{
+	return ip_set_type_register(&hash_ipmark_type);
+}
+
+static void __exit
+hash_ipmark_fini(void)
+{
+	ip_set_type_unregister(&hash_ipmark_type);
+}
+
+module_init(hash_ipmark_init);
+module_exit(hash_ipmark_fini);
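Both the kadt and uadt paths above fold the mark through the set's
markmask before hashing, so two packets whose marks differ only outside
the mask match the same entry. The same folding in isolation (0xff00 is
just an example mask value):

	#include <stdint.h>
	#include <assert.h>

	static uint32_t fold_mark(uint32_t mark, uint32_t markmask)
	{
		return mark & markmask;
	}

	int main(void)
	{
		/* 0x1234 and 0x12ff collapse to the same element for 0xff00 */
		assert(fold_mark(0x1234, 0xff00) == fold_mark(0x12ff, 0xff00));
		return 0;
	}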
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index 525a595dd1fe..7597b82a8b03 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -27,7 +27,8 @@
 #define IPSET_TYPE_REV_MIN	0
 /*				1    SCTP and UDPLITE support added */
 /*				2    Counters support added */
-#define IPSET_TYPE_REV_MAX	3 /* Comments support added */
+/*				3    Comments support added */
+#define IPSET_TYPE_REV_MAX	4 /* Forceadd support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index f5636631466e..672655ffd573 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -27,7 +27,8 @@
 #define IPSET_TYPE_REV_MIN	0
 /*				1    SCTP and UDPLITE support added */
 /*				2    Counters support added */
-#define IPSET_TYPE_REV_MAX	3 /* Comments support added */
+/*				3    Comments support added */
+#define IPSET_TYPE_REV_MAX	4 /* Forceadd support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 5d87fe8a41ff..7308d84f9277 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -29,7 +29,8 @@
 /*				2    Range as input support for IPv4 added */
 /*				3    nomatch flag support added */
 /*				4    Counters support added */
-#define IPSET_TYPE_REV_MAX	5 /* Comments support added */
+/*				5    Comments support added */
+#define IPSET_TYPE_REV_MAX	6 /* Forceadd support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 8295cf4f9fdc..4c7d495783a3 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -26,7 +26,8 @@
 /*				1    Range as input support for IPv4 added */
 /*				2    nomatch flag support added */
 /*				3    Counters support added */
-#define IPSET_TYPE_REV_MAX	4 /* Comments support added */
+/*				4    Comments support added */
+#define IPSET_TYPE_REV_MAX	5 /* Forceadd support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index b827a0f1f351..db2606805b35 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -27,7 +27,8 @@
27/* 1 nomatch flag support added */ 27/* 1 nomatch flag support added */
28/* 2 /0 support added */ 28/* 2 /0 support added */
29/* 3 Counters support added */ 29/* 3 Counters support added */
30#define IPSET_TYPE_REV_MAX 4 /* Comments support added */ 30/* 4 Comments support added */
31#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */
31 32
32MODULE_LICENSE("GPL"); 33MODULE_LICENSE("GPL");
33MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); 34MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index 6226803fc490..3e99987e4bf2 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -24,7 +24,7 @@
24#include <linux/netfilter/ipset/ip_set_hash.h> 24#include <linux/netfilter/ipset/ip_set_hash.h>
25 25
26#define IPSET_TYPE_REV_MIN 0 26#define IPSET_TYPE_REV_MIN 0
27#define IPSET_TYPE_REV_MAX 0 27#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */
28 28
29MODULE_LICENSE("GPL"); 29MODULE_LICENSE("GPL");
30MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); 30MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
@@ -112,10 +112,10 @@ hash_netnet4_data_list(struct sk_buff *skb,
112 (flags && 112 (flags &&
113 nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) 113 nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
114 goto nla_put_failure; 114 goto nla_put_failure;
115 return 0; 115 return false;
116 116
117nla_put_failure: 117nla_put_failure:
118 return 1; 118 return true;
119} 119}
120 120
121static inline void 121static inline void
@@ -334,10 +334,10 @@ hash_netnet6_data_list(struct sk_buff *skb,
334 (flags && 334 (flags &&
335 nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) 335 nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
336 goto nla_put_failure; 336 goto nla_put_failure;
337 return 0; 337 return false;
338 338
339nla_put_failure: 339nla_put_failure:
340 return 1; 340 return true;
341} 341}
342 342
343static inline void 343static inline void
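
Besides the revision bump, the netnet hunks correct the data_list callbacks to return bool, matching how the hash type template interprets the result. A sketch of the idiom, with hypothetical example_* names standing in for a real element type:

	/* true means "netlink attribute did not fit"; false means success */
	static bool
	example_data_list(struct sk_buff *skb, const struct example_elem *e)
	{
		if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip))
			goto nla_put_failure;
		return false;

	nla_put_failure:
		return true;
	}
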
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 7097fb0141bf..1c645fbd09c7 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -28,7 +28,8 @@
28/* 2 Range as input support for IPv4 added */ 28/* 2 Range as input support for IPv4 added */
29/* 3 nomatch flag support added */ 29/* 3 nomatch flag support added */
30/* 4 Counters support added */ 30/* 4 Counters support added */
31#define IPSET_TYPE_REV_MAX 5 /* Comments support added */ 31/* 5 Comments support added */
32#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */
32 33
33MODULE_LICENSE("GPL"); 34MODULE_LICENSE("GPL");
34MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); 35MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index 703d1192a6a2..c0d2ba73f8b2 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -25,7 +25,8 @@
25#include <linux/netfilter/ipset/ip_set_hash.h> 25#include <linux/netfilter/ipset/ip_set_hash.h>
26 26
27#define IPSET_TYPE_REV_MIN 0 27#define IPSET_TYPE_REV_MIN 0
28#define IPSET_TYPE_REV_MAX 0 /* Comments support added */ 28/* 0 Comments support added */
29#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */
29 30
30MODULE_LICENSE("GPL"); 31MODULE_LICENSE("GPL");
31MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); 32MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c
index 4f29fa97044b..04d15fdc99ee 100644
--- a/net/netfilter/ipset/pfxlen.c
+++ b/net/netfilter/ipset/pfxlen.c
@@ -7,8 +7,8 @@
7 7
8#define E(a, b, c, d) \ 8#define E(a, b, c, d) \
9 {.ip6 = { \ 9 {.ip6 = { \
10 __constant_htonl(a), __constant_htonl(b), \ 10 htonl(a), htonl(b), \
11 __constant_htonl(c), __constant_htonl(d), \ 11 htonl(c), htonl(d), \
12 } } 12 } }
13 13
14/* 14/*
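
The pfxlen cleanup is safe because htonl() already folds to a constant when its argument is known at compile time, so the __constant_ variant adds nothing. The prefix-mask table entries are still built entirely at compile time, e.g. a /64 mask:

	E(0xffffffff, 0xffffffff, 0x00000000, 0x00000000)
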
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index d6d75841352a..c42e83d2751c 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3580,7 +3580,7 @@ out:
3580} 3580}
3581 3581
3582 3582
3583static const struct genl_ops ip_vs_genl_ops[] __read_mostly = { 3583static const struct genl_ops ip_vs_genl_ops[] = {
3584 { 3584 {
3585 .cmd = IPVS_CMD_NEW_SERVICE, 3585 .cmd = IPVS_CMD_NEW_SERVICE,
3586 .flags = GENL_ADMIN_PERM, 3586 .flags = GENL_ADMIN_PERM,
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index ca056a331e60..547ff33c1efd 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -238,7 +238,7 @@ static void ip_vs_lblc_flush(struct ip_vs_service *svc)
238 238
239 spin_lock_bh(&svc->sched_lock); 239 spin_lock_bh(&svc->sched_lock);
240 tbl->dead = 1; 240 tbl->dead = 1;
241 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { 241 for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
242 hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { 242 hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
243 ip_vs_lblc_del(en); 243 ip_vs_lblc_del(en);
244 atomic_dec(&tbl->entries); 244 atomic_dec(&tbl->entries);
@@ -265,7 +265,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
265 unsigned long now = jiffies; 265 unsigned long now = jiffies;
266 int i, j; 266 int i, j;
267 267
268 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { 268 for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
269 j = (j + 1) & IP_VS_LBLC_TAB_MASK; 269 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
270 270
271 spin_lock(&svc->sched_lock); 271 spin_lock(&svc->sched_lock);
@@ -321,7 +321,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
321 if (goal > tbl->max_size/2) 321 if (goal > tbl->max_size/2)
322 goal = tbl->max_size/2; 322 goal = tbl->max_size/2;
323 323
324 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { 324 for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
325 j = (j + 1) & IP_VS_LBLC_TAB_MASK; 325 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
326 326
327 spin_lock(&svc->sched_lock); 327 spin_lock(&svc->sched_lock);
@@ -340,7 +340,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
340 tbl->rover = j; 340 tbl->rover = j;
341 341
342 out: 342 out:
343 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); 343 mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
344} 344}
345 345
346 346
@@ -363,7 +363,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
363 /* 363 /*
364 * Initialize the hash buckets 364 * Initialize the hash buckets
365 */ 365 */
366 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { 366 for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
367 INIT_HLIST_HEAD(&tbl->bucket[i]); 367 INIT_HLIST_HEAD(&tbl->bucket[i]);
368 } 368 }
369 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; 369 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
@@ -536,8 +536,7 @@ out:
536/* 536/*
537 * IPVS LBLC Scheduler structure 537 * IPVS LBLC Scheduler structure
538 */ 538 */
539static struct ip_vs_scheduler ip_vs_lblc_scheduler = 539static struct ip_vs_scheduler ip_vs_lblc_scheduler = {
540{
541 .name = "lblc", 540 .name = "lblc",
542 .refcnt = ATOMIC_INIT(0), 541 .refcnt = ATOMIC_INIT(0),
543 .module = THIS_MODULE, 542 .module = THIS_MODULE,
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 356bef519fe5..5d1e7d126ebd 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -60,8 +60,59 @@ int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
60 const struct nlattr *attr) __read_mostly; 60 const struct nlattr *attr) __read_mostly;
61EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook); 61EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
62 62
63DEFINE_SPINLOCK(nf_conntrack_lock); 63__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
64EXPORT_SYMBOL_GPL(nf_conntrack_lock); 64EXPORT_SYMBOL_GPL(nf_conntrack_locks);
65
66__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
67EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
68
69static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
70{
71 h1 %= CONNTRACK_LOCKS;
72 h2 %= CONNTRACK_LOCKS;
73 spin_unlock(&nf_conntrack_locks[h1]);
74 if (h1 != h2)
75 spin_unlock(&nf_conntrack_locks[h2]);
76}
77
78/* return true if we need to recompute hashes (in case hash table was resized) */
79static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
80 unsigned int h2, unsigned int sequence)
81{
82 h1 %= CONNTRACK_LOCKS;
83 h2 %= CONNTRACK_LOCKS;
84 if (h1 <= h2) {
85 spin_lock(&nf_conntrack_locks[h1]);
86 if (h1 != h2)
87 spin_lock_nested(&nf_conntrack_locks[h2],
88 SINGLE_DEPTH_NESTING);
89 } else {
90 spin_lock(&nf_conntrack_locks[h2]);
91 spin_lock_nested(&nf_conntrack_locks[h1],
92 SINGLE_DEPTH_NESTING);
93 }
94 if (read_seqcount_retry(&net->ct.generation, sequence)) {
95 nf_conntrack_double_unlock(h1, h2);
96 return true;
97 }
98 return false;
99}
100
101static void nf_conntrack_all_lock(void)
102{
103 int i;
104
105 for (i = 0; i < CONNTRACK_LOCKS; i++)
106 spin_lock_nested(&nf_conntrack_locks[i], i);
107}
108
109static void nf_conntrack_all_unlock(void)
110{
111 int i;
112
113 for (i = 0; i < CONNTRACK_LOCKS; i++)
114 spin_unlock(&nf_conntrack_locks[i]);
115}
65 116
66unsigned int nf_conntrack_htable_size __read_mostly; 117unsigned int nf_conntrack_htable_size __read_mostly;
67EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); 118EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
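
nf_conntrack_double_lock() always takes the lower-indexed lock first (using spin_lock_nested() to keep lockdep happy), so two CPUs locking the same pair of buckets in opposite order cannot deadlock, and the seqcount check makes the caller retry if a table resize ran in between. The caller-side pattern, as used throughout the hunks below (orig_tuple and reply_tuple stand in for the conntrack's two tuples):

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&net->ct.generation);
		hash = hash_conntrack(net, zone, &orig_tuple);
		reply_hash = hash_conntrack(net, zone, &reply_tuple);
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
	/* both buckets locked; hashes are valid for the current table */
	...
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
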
@@ -192,6 +243,50 @@ clean_from_lists(struct nf_conn *ct)
192 nf_ct_remove_expectations(ct); 243 nf_ct_remove_expectations(ct);
193} 244}
194 245
246/* must be called with local_bh_disable */
247static void nf_ct_add_to_dying_list(struct nf_conn *ct)
248{
249 struct ct_pcpu *pcpu;
250
251 /* add this conntrack to the (per cpu) dying list */
252 ct->cpu = smp_processor_id();
253 pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
254
255 spin_lock(&pcpu->lock);
256 hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
257 &pcpu->dying);
258 spin_unlock(&pcpu->lock);
259}
260
261/* must be called with local_bh_disable */
262static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
263{
264 struct ct_pcpu *pcpu;
265
266 /* add this conntrack to the (per cpu) unconfirmed list */
267 ct->cpu = smp_processor_id();
268 pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
269
270 spin_lock(&pcpu->lock);
271 hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
272 &pcpu->unconfirmed);
273 spin_unlock(&pcpu->lock);
274}
275
276/* must be called with local_bh_disable */
277static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
278{
279 struct ct_pcpu *pcpu;
280
 281 /* We overload first tuple to link into unconfirmed or dying list. */
282 pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
283
284 spin_lock(&pcpu->lock);
285 BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
286 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
287 spin_unlock(&pcpu->lock);
288}
289
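
These helpers hang unconfirmed, dying and template conntracks off per-cpu lists instead of the old global ones; ct->cpu records which CPU's list an entry lives on, so deletion finds the right lock again later. The per-netns structure they rely on, as added elsewhere in this series (shown here for reference):

	struct ct_pcpu {
		spinlock_t		lock;
		struct hlist_nulls_head unconfirmed;
		struct hlist_nulls_head dying;
		struct hlist_nulls_head tmpl;
	};
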
195static void 290static void
196destroy_conntrack(struct nf_conntrack *nfct) 291destroy_conntrack(struct nf_conntrack *nfct)
197{ 292{
@@ -203,9 +298,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
203 NF_CT_ASSERT(atomic_read(&nfct->use) == 0); 298 NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
204 NF_CT_ASSERT(!timer_pending(&ct->timeout)); 299 NF_CT_ASSERT(!timer_pending(&ct->timeout));
205 300
206 /* To make sure we don't get any weird locking issues here:
207 * destroy_conntrack() MUST NOT be called with a write lock
208 * to nf_conntrack_lock!!! -HW */
209 rcu_read_lock(); 301 rcu_read_lock();
210 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); 302 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
211 if (l4proto && l4proto->destroy) 303 if (l4proto && l4proto->destroy)
@@ -213,19 +305,18 @@ destroy_conntrack(struct nf_conntrack *nfct)
213 305
214 rcu_read_unlock(); 306 rcu_read_unlock();
215 307
216 spin_lock_bh(&nf_conntrack_lock); 308 local_bh_disable();
217 /* Expectations will have been removed in clean_from_lists, 309 /* Expectations will have been removed in clean_from_lists,
218 * except TFTP can create an expectation on the first packet, 310 * except TFTP can create an expectation on the first packet,
219 * before connection is in the list, so we need to clean here, 311 * before connection is in the list, so we need to clean here,
220 * too. */ 312 * too.
313 */
221 nf_ct_remove_expectations(ct); 314 nf_ct_remove_expectations(ct);
222 315
223 /* We overload first tuple to link into unconfirmed or dying list.*/ 316 nf_ct_del_from_dying_or_unconfirmed_list(ct);
224 BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
225 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
226 317
227 NF_CT_STAT_INC(net, delete); 318 NF_CT_STAT_INC(net, delete);
228 spin_unlock_bh(&nf_conntrack_lock); 319 local_bh_enable();
229 320
230 if (ct->master) 321 if (ct->master)
231 nf_ct_put(ct->master); 322 nf_ct_put(ct->master);
@@ -237,17 +328,28 @@ destroy_conntrack(struct nf_conntrack *nfct)
237static void nf_ct_delete_from_lists(struct nf_conn *ct) 328static void nf_ct_delete_from_lists(struct nf_conn *ct)
238{ 329{
239 struct net *net = nf_ct_net(ct); 330 struct net *net = nf_ct_net(ct);
331 unsigned int hash, reply_hash;
332 u16 zone = nf_ct_zone(ct);
333 unsigned int sequence;
240 334
241 nf_ct_helper_destroy(ct); 335 nf_ct_helper_destroy(ct);
242 spin_lock_bh(&nf_conntrack_lock); 336
243 /* Inside lock so preempt is disabled on module removal path. 337 local_bh_disable();
244 * Otherwise we can get spurious warnings. */ 338 do {
245 NF_CT_STAT_INC(net, delete_list); 339 sequence = read_seqcount_begin(&net->ct.generation);
340 hash = hash_conntrack(net, zone,
341 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
342 reply_hash = hash_conntrack(net, zone,
343 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
344 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
345
246 clean_from_lists(ct); 346 clean_from_lists(ct);
247 /* add this conntrack to the dying list */ 347 nf_conntrack_double_unlock(hash, reply_hash);
248 hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 348
249 &net->ct.dying); 349 nf_ct_add_to_dying_list(ct);
250 spin_unlock_bh(&nf_conntrack_lock); 350
351 NF_CT_STAT_INC(net, delete_list);
352 local_bh_enable();
251} 353}
252 354
253static void death_by_event(unsigned long ul_conntrack) 355static void death_by_event(unsigned long ul_conntrack)
@@ -331,8 +433,6 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
331 * Warning : 433 * Warning :
332 * - Caller must take a reference on returned object 434 * - Caller must take a reference on returned object
333 * and recheck nf_ct_tuple_equal(tuple, &h->tuple) 435 * and recheck nf_ct_tuple_equal(tuple, &h->tuple)
334 * OR
335 * - Caller must lock nf_conntrack_lock before calling this function
336 */ 436 */
337static struct nf_conntrack_tuple_hash * 437static struct nf_conntrack_tuple_hash *
338____nf_conntrack_find(struct net *net, u16 zone, 438____nf_conntrack_find(struct net *net, u16 zone,
@@ -408,32 +508,36 @@ EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
408 508
409static void __nf_conntrack_hash_insert(struct nf_conn *ct, 509static void __nf_conntrack_hash_insert(struct nf_conn *ct,
410 unsigned int hash, 510 unsigned int hash,
411 unsigned int repl_hash) 511 unsigned int reply_hash)
412{ 512{
413 struct net *net = nf_ct_net(ct); 513 struct net *net = nf_ct_net(ct);
414 514
415 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 515 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
416 &net->ct.hash[hash]); 516 &net->ct.hash[hash]);
417 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, 517 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
418 &net->ct.hash[repl_hash]); 518 &net->ct.hash[reply_hash]);
419} 519}
420 520
421int 521int
422nf_conntrack_hash_check_insert(struct nf_conn *ct) 522nf_conntrack_hash_check_insert(struct nf_conn *ct)
423{ 523{
424 struct net *net = nf_ct_net(ct); 524 struct net *net = nf_ct_net(ct);
425 unsigned int hash, repl_hash; 525 unsigned int hash, reply_hash;
426 struct nf_conntrack_tuple_hash *h; 526 struct nf_conntrack_tuple_hash *h;
427 struct hlist_nulls_node *n; 527 struct hlist_nulls_node *n;
428 u16 zone; 528 u16 zone;
529 unsigned int sequence;
429 530
430 zone = nf_ct_zone(ct); 531 zone = nf_ct_zone(ct);
431 hash = hash_conntrack(net, zone,
432 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
433 repl_hash = hash_conntrack(net, zone,
434 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
435 532
436 spin_lock_bh(&nf_conntrack_lock); 533 local_bh_disable();
534 do {
535 sequence = read_seqcount_begin(&net->ct.generation);
536 hash = hash_conntrack(net, zone,
537 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
538 reply_hash = hash_conntrack(net, zone,
539 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
540 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
437 541
438 /* See if there's one in the list already, including reverse */ 542 /* See if there's one in the list already, including reverse */
439 hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) 543 hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
@@ -441,7 +545,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
441 &h->tuple) && 545 &h->tuple) &&
442 zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) 546 zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
443 goto out; 547 goto out;
444 hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) 548 hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
445 if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, 549 if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
446 &h->tuple) && 550 &h->tuple) &&
447 zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) 551 zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
@@ -451,15 +555,16 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
451 smp_wmb(); 555 smp_wmb();
452 /* The caller holds a reference to this object */ 556 /* The caller holds a reference to this object */
453 atomic_set(&ct->ct_general.use, 2); 557 atomic_set(&ct->ct_general.use, 2);
454 __nf_conntrack_hash_insert(ct, hash, repl_hash); 558 __nf_conntrack_hash_insert(ct, hash, reply_hash);
559 nf_conntrack_double_unlock(hash, reply_hash);
455 NF_CT_STAT_INC(net, insert); 560 NF_CT_STAT_INC(net, insert);
456 spin_unlock_bh(&nf_conntrack_lock); 561 local_bh_enable();
457
458 return 0; 562 return 0;
459 563
460out: 564out:
565 nf_conntrack_double_unlock(hash, reply_hash);
461 NF_CT_STAT_INC(net, insert_failed); 566 NF_CT_STAT_INC(net, insert_failed);
462 spin_unlock_bh(&nf_conntrack_lock); 567 local_bh_enable();
463 return -EEXIST; 568 return -EEXIST;
464} 569}
465EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); 570EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
@@ -467,15 +572,22 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
467/* deletion from this larval template list happens via nf_ct_put() */ 572/* deletion from this larval template list happens via nf_ct_put() */
468void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl) 573void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl)
469{ 574{
575 struct ct_pcpu *pcpu;
576
470 __set_bit(IPS_TEMPLATE_BIT, &tmpl->status); 577 __set_bit(IPS_TEMPLATE_BIT, &tmpl->status);
471 __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); 578 __set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
472 nf_conntrack_get(&tmpl->ct_general); 579 nf_conntrack_get(&tmpl->ct_general);
473 580
474 spin_lock_bh(&nf_conntrack_lock); 581 /* add this conntrack to the (per cpu) tmpl list */
582 local_bh_disable();
583 tmpl->cpu = smp_processor_id();
584 pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu);
585
586 spin_lock(&pcpu->lock);
475 /* Overload tuple linked list to put us in template list. */ 587 /* Overload tuple linked list to put us in template list. */
476 hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 588 hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
477 &net->ct.tmpl); 589 &pcpu->tmpl);
478 spin_unlock_bh(&nf_conntrack_lock); 590 spin_unlock_bh(&pcpu->lock);
479} 591}
480EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert); 592EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert);
481 593
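
Note the deliberately asymmetric pairing in nf_conntrack_tmpl_insert(): BH is disabled before the per-cpu lock is taken, and the single spin_unlock_bh() both drops the lock and re-enables BH, so the sequence balances out:

	local_bh_disable();		/* BH off */
	spin_lock(&pcpu->lock);
	/* ... add tmpl to pcpu->tmpl ... */
	spin_unlock_bh(&pcpu->lock);	/* unlock + BH back on */
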
@@ -483,7 +595,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert);
483int 595int
484__nf_conntrack_confirm(struct sk_buff *skb) 596__nf_conntrack_confirm(struct sk_buff *skb)
485{ 597{
486 unsigned int hash, repl_hash; 598 unsigned int hash, reply_hash;
487 struct nf_conntrack_tuple_hash *h; 599 struct nf_conntrack_tuple_hash *h;
488 struct nf_conn *ct; 600 struct nf_conn *ct;
489 struct nf_conn_help *help; 601 struct nf_conn_help *help;
@@ -492,6 +604,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
492 enum ip_conntrack_info ctinfo; 604 enum ip_conntrack_info ctinfo;
493 struct net *net; 605 struct net *net;
494 u16 zone; 606 u16 zone;
607 unsigned int sequence;
495 608
496 ct = nf_ct_get(skb, &ctinfo); 609 ct = nf_ct_get(skb, &ctinfo);
497 net = nf_ct_net(ct); 610 net = nf_ct_net(ct);
@@ -504,31 +617,37 @@ __nf_conntrack_confirm(struct sk_buff *skb)
504 return NF_ACCEPT; 617 return NF_ACCEPT;
505 618
506 zone = nf_ct_zone(ct); 619 zone = nf_ct_zone(ct);
507 /* reuse the hash saved before */ 620 local_bh_disable();
508 hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; 621
509 hash = hash_bucket(hash, net); 622 do {
510 repl_hash = hash_conntrack(net, zone, 623 sequence = read_seqcount_begin(&net->ct.generation);
511 &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 624 /* reuse the hash saved before */
625 hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
626 hash = hash_bucket(hash, net);
627 reply_hash = hash_conntrack(net, zone,
628 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
629
630 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
512 631
513 /* We're not in hash table, and we refuse to set up related 632 /* We're not in hash table, and we refuse to set up related
514 connections for unconfirmed conns. But packet copies and 633 * connections for unconfirmed conns. But packet copies and
515 REJECT will give spurious warnings here. */ 634 * REJECT will give spurious warnings here.
635 */
516 /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ 636 /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
517 637
518 /* No external references means no one else could have 638 /* No external references means no one else could have
519 confirmed us. */ 639 * confirmed us.
640 */
520 NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); 641 NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
521 pr_debug("Confirming conntrack %p\n", ct); 642 pr_debug("Confirming conntrack %p\n", ct);
522
523 spin_lock_bh(&nf_conntrack_lock);
524
525 /* We have to check the DYING flag inside the lock to prevent 643 /* We have to check the DYING flag inside the lock to prevent
526 a race against nf_ct_get_next_corpse() possibly called from 644 a race against nf_ct_get_next_corpse() possibly called from
527 user context, else we insert an already 'dead' hash, blocking 645 user context, else we insert an already 'dead' hash, blocking
528 further use of that particular connection -JM */ 646 further use of that particular connection -JM */
529 647
530 if (unlikely(nf_ct_is_dying(ct))) { 648 if (unlikely(nf_ct_is_dying(ct))) {
531 spin_unlock_bh(&nf_conntrack_lock); 649 nf_conntrack_double_unlock(hash, reply_hash);
650 local_bh_enable();
532 return NF_ACCEPT; 651 return NF_ACCEPT;
533 } 652 }
534 653
@@ -540,14 +659,13 @@ __nf_conntrack_confirm(struct sk_buff *skb)
540 &h->tuple) && 659 &h->tuple) &&
541 zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) 660 zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
542 goto out; 661 goto out;
543 hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) 662 hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
544 if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, 663 if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
545 &h->tuple) && 664 &h->tuple) &&
546 zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) 665 zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
547 goto out; 666 goto out;
548 667
549 /* Remove from unconfirmed list */ 668 nf_ct_del_from_dying_or_unconfirmed_list(ct);
550 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
551 669
552 /* Timer relative to confirmation time, not original 670 /* Timer relative to confirmation time, not original
553 setting time, otherwise we'd get timer wrap in 671 setting time, otherwise we'd get timer wrap in
@@ -570,9 +688,10 @@ __nf_conntrack_confirm(struct sk_buff *skb)
570 * guarantee that no other CPU can find the conntrack before the above 688 * guarantee that no other CPU can find the conntrack before the above
571 * stores are visible. 689 * stores are visible.
572 */ 690 */
573 __nf_conntrack_hash_insert(ct, hash, repl_hash); 691 __nf_conntrack_hash_insert(ct, hash, reply_hash);
692 nf_conntrack_double_unlock(hash, reply_hash);
574 NF_CT_STAT_INC(net, insert); 693 NF_CT_STAT_INC(net, insert);
575 spin_unlock_bh(&nf_conntrack_lock); 694 local_bh_enable();
576 695
577 help = nfct_help(ct); 696 help = nfct_help(ct);
578 if (help && help->helper) 697 if (help && help->helper)
@@ -583,8 +702,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
583 return NF_ACCEPT; 702 return NF_ACCEPT;
584 703
585out: 704out:
705 nf_conntrack_double_unlock(hash, reply_hash);
586 NF_CT_STAT_INC(net, insert_failed); 706 NF_CT_STAT_INC(net, insert_failed);
587 spin_unlock_bh(&nf_conntrack_lock); 707 local_bh_enable();
588 return NF_DROP; 708 return NF_DROP;
589} 709}
590EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); 710EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
@@ -627,39 +747,48 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
627 747
628/* There's a small race here where we may free a just-assured 748/* There's a small race here where we may free a just-assured
629 connection. Too bad: we're in trouble anyway. */ 749 connection. Too bad: we're in trouble anyway. */
630static noinline int early_drop(struct net *net, unsigned int hash) 750static noinline int early_drop(struct net *net, unsigned int _hash)
631{ 751{
632 /* Use oldest entry, which is roughly LRU */ 752 /* Use oldest entry, which is roughly LRU */
633 struct nf_conntrack_tuple_hash *h; 753 struct nf_conntrack_tuple_hash *h;
634 struct nf_conn *ct = NULL, *tmp; 754 struct nf_conn *ct = NULL, *tmp;
635 struct hlist_nulls_node *n; 755 struct hlist_nulls_node *n;
636 unsigned int i, cnt = 0; 756 unsigned int i = 0, cnt = 0;
637 int dropped = 0; 757 int dropped = 0;
758 unsigned int hash, sequence;
759 spinlock_t *lockp;
638 760
639 rcu_read_lock(); 761 local_bh_disable();
640 for (i = 0; i < net->ct.htable_size; i++) { 762restart:
763 sequence = read_seqcount_begin(&net->ct.generation);
764 hash = hash_bucket(_hash, net);
765 for (; i < net->ct.htable_size; i++) {
766 lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
767 spin_lock(lockp);
768 if (read_seqcount_retry(&net->ct.generation, sequence)) {
769 spin_unlock(lockp);
770 goto restart;
771 }
641 hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], 772 hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
642 hnnode) { 773 hnnode) {
643 tmp = nf_ct_tuplehash_to_ctrack(h); 774 tmp = nf_ct_tuplehash_to_ctrack(h);
644 if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) 775 if (!test_bit(IPS_ASSURED_BIT, &tmp->status) &&
776 !nf_ct_is_dying(tmp) &&
777 atomic_inc_not_zero(&tmp->ct_general.use)) {
645 ct = tmp; 778 ct = tmp;
779 break;
780 }
646 cnt++; 781 cnt++;
647 } 782 }
648 783
649 if (ct != NULL) { 784 hash = (hash + 1) % net->ct.htable_size;
650 if (likely(!nf_ct_is_dying(ct) && 785 spin_unlock(lockp);
651 atomic_inc_not_zero(&ct->ct_general.use)))
652 break;
653 else
654 ct = NULL;
655 }
656 786
657 if (cnt >= NF_CT_EVICTION_RANGE) 787 if (ct || cnt >= NF_CT_EVICTION_RANGE)
658 break; 788 break;
659 789
660 hash = (hash + 1) % net->ct.htable_size;
661 } 790 }
662 rcu_read_unlock(); 791 local_bh_enable();
663 792
664 if (!ct) 793 if (!ct)
665 return dropped; 794 return dropped;
@@ -708,7 +837,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
708 837
709 if (nf_conntrack_max && 838 if (nf_conntrack_max &&
710 unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { 839 unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
711 if (!early_drop(net, hash_bucket(hash, net))) { 840 if (!early_drop(net, hash)) {
712 atomic_dec(&net->ct.count); 841 atomic_dec(&net->ct.count);
713 net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); 842 net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
714 return ERR_PTR(-ENOMEM); 843 return ERR_PTR(-ENOMEM);
@@ -805,7 +934,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
805 struct nf_conn_help *help; 934 struct nf_conn_help *help;
806 struct nf_conntrack_tuple repl_tuple; 935 struct nf_conntrack_tuple repl_tuple;
807 struct nf_conntrack_ecache *ecache; 936 struct nf_conntrack_ecache *ecache;
808 struct nf_conntrack_expect *exp; 937 struct nf_conntrack_expect *exp = NULL;
809 u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; 938 u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
810 struct nf_conn_timeout *timeout_ext; 939 struct nf_conn_timeout *timeout_ext;
811 unsigned int *timeouts; 940 unsigned int *timeouts;
@@ -849,42 +978,44 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
849 ecache ? ecache->expmask : 0, 978 ecache ? ecache->expmask : 0,
850 GFP_ATOMIC); 979 GFP_ATOMIC);
851 980
852 spin_lock_bh(&nf_conntrack_lock); 981 local_bh_disable();
853 exp = nf_ct_find_expectation(net, zone, tuple); 982 if (net->ct.expect_count) {
854 if (exp) { 983 spin_lock(&nf_conntrack_expect_lock);
855 pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", 984 exp = nf_ct_find_expectation(net, zone, tuple);
856 ct, exp); 985 if (exp) {
857 /* Welcome, Mr. Bond. We've been expecting you... */ 986 pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
858 __set_bit(IPS_EXPECTED_BIT, &ct->status); 987 ct, exp);
859 ct->master = exp->master; 988 /* Welcome, Mr. Bond. We've been expecting you... */
860 if (exp->helper) { 989 __set_bit(IPS_EXPECTED_BIT, &ct->status);
861 help = nf_ct_helper_ext_add(ct, exp->helper, 990 /* exp->master safe, refcnt bumped in nf_ct_find_expectation */
862 GFP_ATOMIC); 991 ct->master = exp->master;
863 if (help) 992 if (exp->helper) {
864 rcu_assign_pointer(help->helper, exp->helper); 993 help = nf_ct_helper_ext_add(ct, exp->helper,
865 } 994 GFP_ATOMIC);
995 if (help)
996 rcu_assign_pointer(help->helper, exp->helper);
997 }
866 998
867#ifdef CONFIG_NF_CONNTRACK_MARK 999#ifdef CONFIG_NF_CONNTRACK_MARK
868 ct->mark = exp->master->mark; 1000 ct->mark = exp->master->mark;
869#endif 1001#endif
870#ifdef CONFIG_NF_CONNTRACK_SECMARK 1002#ifdef CONFIG_NF_CONNTRACK_SECMARK
871 ct->secmark = exp->master->secmark; 1003 ct->secmark = exp->master->secmark;
872#endif 1004#endif
873 nf_conntrack_get(&ct->master->ct_general); 1005 NF_CT_STAT_INC(net, expect_new);
874 NF_CT_STAT_INC(net, expect_new); 1006 }
875 } else { 1007 spin_unlock(&nf_conntrack_expect_lock);
1008 }
1009 if (!exp) {
876 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); 1010 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
877 NF_CT_STAT_INC(net, new); 1011 NF_CT_STAT_INC(net, new);
878 } 1012 }
879 1013
880 /* Now it is inserted into the unconfirmed list, bump refcount */ 1014 /* Now it is inserted into the unconfirmed list, bump refcount */
881 nf_conntrack_get(&ct->ct_general); 1015 nf_conntrack_get(&ct->ct_general);
1016 nf_ct_add_to_unconfirmed_list(ct);
882 1017
883 /* Overload tuple linked list to put us in unconfirmed list. */ 1018 local_bh_enable();
884 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
885 &net->ct.unconfirmed);
886
887 spin_unlock_bh(&nf_conntrack_lock);
888 1019
889 if (exp) { 1020 if (exp) {
890 if (exp->expectfn) 1021 if (exp->expectfn)
@@ -1254,27 +1385,42 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
1254 struct nf_conntrack_tuple_hash *h; 1385 struct nf_conntrack_tuple_hash *h;
1255 struct nf_conn *ct; 1386 struct nf_conn *ct;
1256 struct hlist_nulls_node *n; 1387 struct hlist_nulls_node *n;
1388 int cpu;
1389 spinlock_t *lockp;
1257 1390
1258 spin_lock_bh(&nf_conntrack_lock);
1259 for (; *bucket < net->ct.htable_size; (*bucket)++) { 1391 for (; *bucket < net->ct.htable_size; (*bucket)++) {
1260 hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { 1392 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
1261 if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) 1393 local_bh_disable();
1262 continue; 1394 spin_lock(lockp);
1395 if (*bucket < net->ct.htable_size) {
1396 hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
1397 if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
1398 continue;
1399 ct = nf_ct_tuplehash_to_ctrack(h);
1400 if (iter(ct, data))
1401 goto found;
1402 }
1403 }
1404 spin_unlock(lockp);
1405 local_bh_enable();
1406 }
1407
1408 for_each_possible_cpu(cpu) {
1409 struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
1410
1411 spin_lock_bh(&pcpu->lock);
1412 hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
1263 ct = nf_ct_tuplehash_to_ctrack(h); 1413 ct = nf_ct_tuplehash_to_ctrack(h);
1264 if (iter(ct, data)) 1414 if (iter(ct, data))
1265 goto found; 1415 set_bit(IPS_DYING_BIT, &ct->status);
1266 } 1416 }
1417 spin_unlock_bh(&pcpu->lock);
1267 } 1418 }
1268 hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) {
1269 ct = nf_ct_tuplehash_to_ctrack(h);
1270 if (iter(ct, data))
1271 set_bit(IPS_DYING_BIT, &ct->status);
1272 }
1273 spin_unlock_bh(&nf_conntrack_lock);
1274 return NULL; 1419 return NULL;
1275found: 1420found:
1276 atomic_inc(&ct->ct_general.use); 1421 atomic_inc(&ct->ct_general.use);
1277 spin_unlock_bh(&nf_conntrack_lock); 1422 spin_unlock(lockp);
1423 local_bh_enable();
1278 return ct; 1424 return ct;
1279} 1425}
1280 1426
@@ -1323,14 +1469,19 @@ static void nf_ct_release_dying_list(struct net *net)
1323 struct nf_conntrack_tuple_hash *h; 1469 struct nf_conntrack_tuple_hash *h;
1324 struct nf_conn *ct; 1470 struct nf_conn *ct;
1325 struct hlist_nulls_node *n; 1471 struct hlist_nulls_node *n;
1472 int cpu;
1326 1473
1327 spin_lock_bh(&nf_conntrack_lock); 1474 for_each_possible_cpu(cpu) {
1328 hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) { 1475 struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
1329 ct = nf_ct_tuplehash_to_ctrack(h); 1476
1330 /* never fails to remove them, no listeners at this point */ 1477 spin_lock_bh(&pcpu->lock);
1331 nf_ct_kill(ct); 1478 hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
1479 ct = nf_ct_tuplehash_to_ctrack(h);
1480 /* never fails to remove them, no listeners at this point */
1481 nf_ct_kill(ct);
1482 }
1483 spin_unlock_bh(&pcpu->lock);
1332 } 1484 }
1333 spin_unlock_bh(&nf_conntrack_lock);
1334} 1485}
1335 1486
1336static int untrack_refs(void) 1487static int untrack_refs(void)
@@ -1417,6 +1568,7 @@ i_see_dead_people:
1417 kmem_cache_destroy(net->ct.nf_conntrack_cachep); 1568 kmem_cache_destroy(net->ct.nf_conntrack_cachep);
1418 kfree(net->ct.slabname); 1569 kfree(net->ct.slabname);
1419 free_percpu(net->ct.stat); 1570 free_percpu(net->ct.stat);
1571 free_percpu(net->ct.pcpu_lists);
1420 } 1572 }
1421} 1573}
1422 1574
@@ -1469,12 +1621,16 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
1469 if (!hash) 1621 if (!hash)
1470 return -ENOMEM; 1622 return -ENOMEM;
1471 1623
1624 local_bh_disable();
1625 nf_conntrack_all_lock();
1626 write_seqcount_begin(&init_net.ct.generation);
1627
1472 /* Lookups in the old hash might happen in parallel, which means we 1628 /* Lookups in the old hash might happen in parallel, which means we
1473 * might get false negatives during connection lookup. New connections 1629 * might get false negatives during connection lookup. New connections
1474 * created because of a false negative won't make it into the hash 1630 * created because of a false negative won't make it into the hash
1475 * though since that required taking the lock. 1631 * though since that required taking the locks.
1476 */ 1632 */
1477 spin_lock_bh(&nf_conntrack_lock); 1633
1478 for (i = 0; i < init_net.ct.htable_size; i++) { 1634 for (i = 0; i < init_net.ct.htable_size; i++) {
1479 while (!hlist_nulls_empty(&init_net.ct.hash[i])) { 1635 while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
1480 h = hlist_nulls_entry(init_net.ct.hash[i].first, 1636 h = hlist_nulls_entry(init_net.ct.hash[i].first,
@@ -1491,7 +1647,10 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
1491 1647
1492 init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; 1648 init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
1493 init_net.ct.hash = hash; 1649 init_net.ct.hash = hash;
1494 spin_unlock_bh(&nf_conntrack_lock); 1650
1651 write_seqcount_end(&init_net.ct.generation);
1652 nf_conntrack_all_unlock();
1653 local_bh_enable();
1495 1654
1496 nf_ct_free_hashtable(old_hash, old_size); 1655 nf_ct_free_hashtable(old_hash, old_size);
1497 return 0; 1656 return 0;
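
The resize path is the lone writer against all those per-bucket readers: it quiesces every bucket lock, then bumps the generation seqcount so in-flight double-lock attempts notice and recompute their hashes. Condensed from the hunk above:

	local_bh_disable();
	nf_conntrack_all_lock();
	write_seqcount_begin(&init_net.ct.generation);
	/* ... move every entry into the new table, publish it ... */
	write_seqcount_end(&init_net.ct.generation);
	nf_conntrack_all_unlock();
	local_bh_enable();
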
@@ -1513,7 +1672,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
1513int nf_conntrack_init_start(void) 1672int nf_conntrack_init_start(void)
1514{ 1673{
1515 int max_factor = 8; 1674 int max_factor = 8;
1516 int ret, cpu; 1675 int i, ret, cpu;
1676
1677 for (i = 0; i < ARRAY_SIZE(nf_conntrack_locks); i++)
1678 spin_lock_init(&nf_conntrack_locks[i]);
1517 1679
1518 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB 1680 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1519 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ 1681 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
@@ -1629,37 +1791,43 @@ void nf_conntrack_init_end(void)
1629 1791
1630int nf_conntrack_init_net(struct net *net) 1792int nf_conntrack_init_net(struct net *net)
1631{ 1793{
1632 int ret; 1794 int ret = -ENOMEM;
1795 int cpu;
1633 1796
1634 atomic_set(&net->ct.count, 0); 1797 atomic_set(&net->ct.count, 0);
1635 INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL); 1798
1636 INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL); 1799 net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
1637 INIT_HLIST_NULLS_HEAD(&net->ct.tmpl, TEMPLATE_NULLS_VAL); 1800 if (!net->ct.pcpu_lists)
1638 net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
1639 if (!net->ct.stat) {
1640 ret = -ENOMEM;
1641 goto err_stat; 1801 goto err_stat;
1802
1803 for_each_possible_cpu(cpu) {
1804 struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
1805
1806 spin_lock_init(&pcpu->lock);
1807 INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
1808 INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
1809 INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL);
1642 } 1810 }
1643 1811
1812 net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
1813 if (!net->ct.stat)
1814 goto err_pcpu_lists;
1815
1644 net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net); 1816 net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
1645 if (!net->ct.slabname) { 1817 if (!net->ct.slabname)
1646 ret = -ENOMEM;
1647 goto err_slabname; 1818 goto err_slabname;
1648 }
1649 1819
1650 net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname, 1820 net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
1651 sizeof(struct nf_conn), 0, 1821 sizeof(struct nf_conn), 0,
1652 SLAB_DESTROY_BY_RCU, NULL); 1822 SLAB_DESTROY_BY_RCU, NULL);
1653 if (!net->ct.nf_conntrack_cachep) { 1823 if (!net->ct.nf_conntrack_cachep) {
1654 printk(KERN_ERR "Unable to create nf_conn slab cache\n"); 1824 printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1655 ret = -ENOMEM;
1656 goto err_cache; 1825 goto err_cache;
1657 } 1826 }
1658 1827
1659 net->ct.htable_size = nf_conntrack_htable_size; 1828 net->ct.htable_size = nf_conntrack_htable_size;
1660 net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1); 1829 net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
1661 if (!net->ct.hash) { 1830 if (!net->ct.hash) {
1662 ret = -ENOMEM;
1663 printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); 1831 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1664 goto err_hash; 1832 goto err_hash;
1665 } 1833 }
@@ -1701,6 +1869,8 @@ err_cache:
1701 kfree(net->ct.slabname); 1869 kfree(net->ct.slabname);
1702err_slabname: 1870err_slabname:
1703 free_percpu(net->ct.stat); 1871 free_percpu(net->ct.stat);
1872err_pcpu_lists:
1873 free_percpu(net->ct.pcpu_lists);
1704err_stat: 1874err_stat:
1705 return ret; 1875 return ret;
1706} 1876}
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 4fd1ca94fd4a..f87e8f68ad45 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -66,9 +66,9 @@ static void nf_ct_expectation_timed_out(unsigned long ul_expect)
66{ 66{
67 struct nf_conntrack_expect *exp = (void *)ul_expect; 67 struct nf_conntrack_expect *exp = (void *)ul_expect;
68 68
69 spin_lock_bh(&nf_conntrack_lock); 69 spin_lock_bh(&nf_conntrack_expect_lock);
70 nf_ct_unlink_expect(exp); 70 nf_ct_unlink_expect(exp);
71 spin_unlock_bh(&nf_conntrack_lock); 71 spin_unlock_bh(&nf_conntrack_expect_lock);
72 nf_ct_expect_put(exp); 72 nf_ct_expect_put(exp);
73} 73}
74 74
@@ -155,6 +155,18 @@ nf_ct_find_expectation(struct net *net, u16 zone,
155 if (!nf_ct_is_confirmed(exp->master)) 155 if (!nf_ct_is_confirmed(exp->master))
156 return NULL; 156 return NULL;
157 157
 158 /* Avoid a race with other CPUs that, for the exp->master ct, are
 159 * about to invoke ->destroy(), or nf_ct_delete() via timeout
 160 * or early_drop().
 161 *
 162 * The atomic_inc_not_zero() check tells us: if it fails, the
 163 * ct is already being destroyed; if it succeeds, the ct
 164 * cannot disappear underneath us.
165 */
166 if (unlikely(nf_ct_is_dying(exp->master) ||
167 !atomic_inc_not_zero(&exp->master->ct_general.use)))
168 return NULL;
169
158 if (exp->flags & NF_CT_EXPECT_PERMANENT) { 170 if (exp->flags & NF_CT_EXPECT_PERMANENT) {
159 atomic_inc(&exp->use); 171 atomic_inc(&exp->use);
160 return exp; 172 return exp;
@@ -162,6 +174,8 @@ nf_ct_find_expectation(struct net *net, u16 zone,
162 nf_ct_unlink_expect(exp); 174 nf_ct_unlink_expect(exp);
163 return exp; 175 return exp;
164 } 176 }
177 /* Undo exp->master refcnt increase, if del_timer() failed */
178 nf_ct_put(exp->master);
165 179
166 return NULL; 180 return NULL;
167} 181}
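
With the central lock gone, nf_ct_find_expectation() must itself pin exp->master before handing the expectation out; atomic_inc_not_zero() is the standard way to take a reference only while the object is provably alive. The general shape of the idiom (a generic sketch, with obj as a placeholder):

	if (atomic_inc_not_zero(&obj->use)) {
		/* success: obj cannot be freed until we drop this ref */
	} else {
		/* refcount already hit zero: destruction is in
		 * progress, treat the object as gone */
	}
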
@@ -177,12 +191,14 @@ void nf_ct_remove_expectations(struct nf_conn *ct)
177 if (!help) 191 if (!help)
178 return; 192 return;
179 193
194 spin_lock_bh(&nf_conntrack_expect_lock);
180 hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { 195 hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
181 if (del_timer(&exp->timeout)) { 196 if (del_timer(&exp->timeout)) {
182 nf_ct_unlink_expect(exp); 197 nf_ct_unlink_expect(exp);
183 nf_ct_expect_put(exp); 198 nf_ct_expect_put(exp);
184 } 199 }
185 } 200 }
201 spin_unlock_bh(&nf_conntrack_expect_lock);
186} 202}
187EXPORT_SYMBOL_GPL(nf_ct_remove_expectations); 203EXPORT_SYMBOL_GPL(nf_ct_remove_expectations);
188 204
@@ -217,12 +233,12 @@ static inline int expect_matches(const struct nf_conntrack_expect *a,
217/* Generally a bad idea to call this: could have matched already. */ 233/* Generally a bad idea to call this: could have matched already. */
218void nf_ct_unexpect_related(struct nf_conntrack_expect *exp) 234void nf_ct_unexpect_related(struct nf_conntrack_expect *exp)
219{ 235{
220 spin_lock_bh(&nf_conntrack_lock); 236 spin_lock_bh(&nf_conntrack_expect_lock);
221 if (del_timer(&exp->timeout)) { 237 if (del_timer(&exp->timeout)) {
222 nf_ct_unlink_expect(exp); 238 nf_ct_unlink_expect(exp);
223 nf_ct_expect_put(exp); 239 nf_ct_expect_put(exp);
224 } 240 }
225 spin_unlock_bh(&nf_conntrack_lock); 241 spin_unlock_bh(&nf_conntrack_expect_lock);
226} 242}
227EXPORT_SYMBOL_GPL(nf_ct_unexpect_related); 243EXPORT_SYMBOL_GPL(nf_ct_unexpect_related);
228 244
@@ -335,7 +351,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
335 setup_timer(&exp->timeout, nf_ct_expectation_timed_out, 351 setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
336 (unsigned long)exp); 352 (unsigned long)exp);
337 helper = rcu_dereference_protected(master_help->helper, 353 helper = rcu_dereference_protected(master_help->helper,
338 lockdep_is_held(&nf_conntrack_lock)); 354 lockdep_is_held(&nf_conntrack_expect_lock));
339 if (helper) { 355 if (helper) {
340 exp->timeout.expires = jiffies + 356 exp->timeout.expires = jiffies +
341 helper->expect_policy[exp->class].timeout * HZ; 357 helper->expect_policy[exp->class].timeout * HZ;
@@ -395,7 +411,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
395 } 411 }
396 /* Will be over limit? */ 412 /* Will be over limit? */
397 helper = rcu_dereference_protected(master_help->helper, 413 helper = rcu_dereference_protected(master_help->helper,
398 lockdep_is_held(&nf_conntrack_lock)); 414 lockdep_is_held(&nf_conntrack_expect_lock));
399 if (helper) { 415 if (helper) {
400 p = &helper->expect_policy[expect->class]; 416 p = &helper->expect_policy[expect->class];
401 if (p->max_expected && 417 if (p->max_expected &&
@@ -417,12 +433,12 @@ out:
417 return ret; 433 return ret;
418} 434}
419 435
420int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, 436int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
421 u32 portid, int report) 437 u32 portid, int report)
422{ 438{
423 int ret; 439 int ret;
424 440
425 spin_lock_bh(&nf_conntrack_lock); 441 spin_lock_bh(&nf_conntrack_expect_lock);
426 ret = __nf_ct_expect_check(expect); 442 ret = __nf_ct_expect_check(expect);
427 if (ret <= 0) 443 if (ret <= 0)
428 goto out; 444 goto out;
@@ -430,11 +446,11 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
430 ret = nf_ct_expect_insert(expect); 446 ret = nf_ct_expect_insert(expect);
431 if (ret < 0) 447 if (ret < 0)
432 goto out; 448 goto out;
433 spin_unlock_bh(&nf_conntrack_lock); 449 spin_unlock_bh(&nf_conntrack_expect_lock);
434 nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); 450 nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
435 return ret; 451 return ret;
436out: 452out:
437 spin_unlock_bh(&nf_conntrack_lock); 453 spin_unlock_bh(&nf_conntrack_expect_lock);
438 return ret; 454 return ret;
439} 455}
440EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); 456EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);
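
All expectation bookkeeping now serializes on nf_conntrack_expect_lock instead of the old global lock, and the lockdep annotations move with it. Code dereferencing a helper pointer under that lock follows this pattern:

	helper = rcu_dereference_protected(master_help->helper,
			lockdep_is_held(&nf_conntrack_expect_lock));
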
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 70866d192efc..3a3a60b126e0 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -1476,7 +1476,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
1476 nf_ct_refresh(ct, skb, info->timeout * HZ); 1476 nf_ct_refresh(ct, skb, info->timeout * HZ);
1477 1477
1478 /* Set expect timeout */ 1478 /* Set expect timeout */
1479 spin_lock_bh(&nf_conntrack_lock); 1479 spin_lock_bh(&nf_conntrack_expect_lock);
1480 exp = find_expect(ct, &ct->tuplehash[dir].tuple.dst.u3, 1480 exp = find_expect(ct, &ct->tuplehash[dir].tuple.dst.u3,
1481 info->sig_port[!dir]); 1481 info->sig_port[!dir]);
1482 if (exp) { 1482 if (exp) {
@@ -1486,7 +1486,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
1486 nf_ct_dump_tuple(&exp->tuple); 1486 nf_ct_dump_tuple(&exp->tuple);
1487 set_expect_timeout(exp, info->timeout); 1487 set_expect_timeout(exp, info->timeout);
1488 } 1488 }
1489 spin_unlock_bh(&nf_conntrack_lock); 1489 spin_unlock_bh(&nf_conntrack_expect_lock);
1490 } 1490 }
1491 1491
1492 return 0; 1492 return 0;
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 974a2a4adefa..5b3eae7d4c9a 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -250,16 +250,14 @@ out:
250} 250}
251EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); 251EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper);
252 252
253/* appropriate ct lock must be taken by the caller */
253static inline int unhelp(struct nf_conntrack_tuple_hash *i, 254static inline int unhelp(struct nf_conntrack_tuple_hash *i,
254 const struct nf_conntrack_helper *me) 255 const struct nf_conntrack_helper *me)
255{ 256{
256 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i); 257 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
257 struct nf_conn_help *help = nfct_help(ct); 258 struct nf_conn_help *help = nfct_help(ct);
258 259
259 if (help && rcu_dereference_protected( 260 if (help && rcu_dereference_raw(help->helper) == me) {
260 help->helper,
261 lockdep_is_held(&nf_conntrack_lock)
262 ) == me) {
263 nf_conntrack_event(IPCT_HELPER, ct); 261 nf_conntrack_event(IPCT_HELPER, ct);
264 RCU_INIT_POINTER(help->helper, NULL); 262 RCU_INIT_POINTER(help->helper, NULL);
265 } 263 }
@@ -284,17 +282,17 @@ static LIST_HEAD(nf_ct_helper_expectfn_list);
284 282
285void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n) 283void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n)
286{ 284{
287 spin_lock_bh(&nf_conntrack_lock); 285 spin_lock_bh(&nf_conntrack_expect_lock);
288 list_add_rcu(&n->head, &nf_ct_helper_expectfn_list); 286 list_add_rcu(&n->head, &nf_ct_helper_expectfn_list);
289 spin_unlock_bh(&nf_conntrack_lock); 287 spin_unlock_bh(&nf_conntrack_expect_lock);
290} 288}
291EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_register); 289EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_register);
292 290
293void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n) 291void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n)
294{ 292{
295 spin_lock_bh(&nf_conntrack_lock); 293 spin_lock_bh(&nf_conntrack_expect_lock);
296 list_del_rcu(&n->head); 294 list_del_rcu(&n->head);
297 spin_unlock_bh(&nf_conntrack_lock); 295 spin_unlock_bh(&nf_conntrack_expect_lock);
298} 296}
299EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister); 297EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister);
300 298
@@ -396,15 +394,17 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
396 const struct hlist_node *next; 394 const struct hlist_node *next;
397 const struct hlist_nulls_node *nn; 395 const struct hlist_nulls_node *nn;
398 unsigned int i; 396 unsigned int i;
397 int cpu;
399 398
400 /* Get rid of expectations */ 399 /* Get rid of expectations */
400 spin_lock_bh(&nf_conntrack_expect_lock);
401 for (i = 0; i < nf_ct_expect_hsize; i++) { 401 for (i = 0; i < nf_ct_expect_hsize; i++) {
402 hlist_for_each_entry_safe(exp, next, 402 hlist_for_each_entry_safe(exp, next,
403 &net->ct.expect_hash[i], hnode) { 403 &net->ct.expect_hash[i], hnode) {
404 struct nf_conn_help *help = nfct_help(exp->master); 404 struct nf_conn_help *help = nfct_help(exp->master);
405 if ((rcu_dereference_protected( 405 if ((rcu_dereference_protected(
406 help->helper, 406 help->helper,
407 lockdep_is_held(&nf_conntrack_lock) 407 lockdep_is_held(&nf_conntrack_expect_lock)
408 ) == me || exp->helper == me) && 408 ) == me || exp->helper == me) &&
409 del_timer(&exp->timeout)) { 409 del_timer(&exp->timeout)) {
410 nf_ct_unlink_expect(exp); 410 nf_ct_unlink_expect(exp);
@@ -412,14 +412,27 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
412 } 412 }
413 } 413 }
414 } 414 }
415 spin_unlock_bh(&nf_conntrack_expect_lock);
415 416
416 /* Get rid of expecteds, set helpers to NULL. */ 417 /* Get rid of expecteds, set helpers to NULL. */
417 hlist_nulls_for_each_entry(h, nn, &net->ct.unconfirmed, hnnode) 418 for_each_possible_cpu(cpu) {
418 unhelp(h, me); 419 struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
419 for (i = 0; i < net->ct.htable_size; i++) { 420
420 hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) 421 spin_lock_bh(&pcpu->lock);
422 hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode)
421 unhelp(h, me); 423 unhelp(h, me);
424 spin_unlock_bh(&pcpu->lock);
425 }
426 local_bh_disable();
427 for (i = 0; i < net->ct.htable_size; i++) {
428 spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
429 if (i < net->ct.htable_size) {
430 hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
431 unhelp(h, me);
432 }
433 spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
422 } 434 }
435 local_bh_enable();
423} 436}
424 437
425void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) 438void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
@@ -437,10 +450,8 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
437 synchronize_rcu(); 450 synchronize_rcu();
438 451
439 rtnl_lock(); 452 rtnl_lock();
440 spin_lock_bh(&nf_conntrack_lock);
441 for_each_net(net) 453 for_each_net(net)
442 __nf_conntrack_helper_unregister(me, net); 454 __nf_conntrack_helper_unregister(me, net);
443 spin_unlock_bh(&nf_conntrack_lock);
444 rtnl_unlock(); 455 rtnl_unlock();
445} 456}
446EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); 457EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister);
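
The hash walk in __nf_conntrack_helper_unregister() shows the price of per-bucket locking: htable_size can shrink while we wait for a bucket lock, so it is re-checked once the lock is held. A sketch of the pattern, with walk_bucket() as a hypothetical stand-in for the per-bucket work:

	local_bh_disable();
	for (i = 0; i < net->ct.htable_size; i++) {
		spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
		if (i < net->ct.htable_size)	/* recheck: a resize may have run */
			walk_bucket(&net->ct.hash[i]);
		spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
	}
	local_bh_enable();
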
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index b9f0e0374322..ccc46fa5edbc 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -764,14 +764,23 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
764 struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); 764 struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
765 u_int8_t l3proto = nfmsg->nfgen_family; 765 u_int8_t l3proto = nfmsg->nfgen_family;
766 int res; 766 int res;
767 spinlock_t *lockp;
768
767#ifdef CONFIG_NF_CONNTRACK_MARK 769#ifdef CONFIG_NF_CONNTRACK_MARK
768 const struct ctnetlink_dump_filter *filter = cb->data; 770 const struct ctnetlink_dump_filter *filter = cb->data;
769#endif 771#endif
770 772
771 spin_lock_bh(&nf_conntrack_lock);
772 last = (struct nf_conn *)cb->args[1]; 773 last = (struct nf_conn *)cb->args[1];
774
775 local_bh_disable();
773 for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) { 776 for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
774restart: 777restart:
778 lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
779 spin_lock(lockp);
780 if (cb->args[0] >= net->ct.htable_size) {
781 spin_unlock(lockp);
782 goto out;
783 }
775 hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]], 784 hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
776 hnnode) { 785 hnnode) {
777 if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) 786 if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
@@ -803,16 +812,18 @@ restart:
803 if (res < 0) { 812 if (res < 0) {
804 nf_conntrack_get(&ct->ct_general); 813 nf_conntrack_get(&ct->ct_general);
805 cb->args[1] = (unsigned long)ct; 814 cb->args[1] = (unsigned long)ct;
815 spin_unlock(lockp);
806 goto out; 816 goto out;
807 } 817 }
808 } 818 }
819 spin_unlock(lockp);
809 if (cb->args[1]) { 820 if (cb->args[1]) {
810 cb->args[1] = 0; 821 cb->args[1] = 0;
811 goto restart; 822 goto restart;
812 } 823 }
813 } 824 }
814out: 825out:
815 spin_unlock_bh(&nf_conntrack_lock); 826 local_bh_enable();
816 if (last) 827 if (last)
817 nf_ct_put(last); 828 nf_ct_put(last);
818 829
@@ -966,7 +977,6 @@ ctnetlink_parse_help(const struct nlattr *attr, char **helper_name,
966 return 0; 977 return 0;
967} 978}
968 979
969#define __CTA_LABELS_MAX_LENGTH ((XT_CONNLABEL_MAXBIT + 1) / BITS_PER_BYTE)
970static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { 980static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
971 [CTA_TUPLE_ORIG] = { .type = NLA_NESTED }, 981 [CTA_TUPLE_ORIG] = { .type = NLA_NESTED },
972 [CTA_TUPLE_REPLY] = { .type = NLA_NESTED }, 982 [CTA_TUPLE_REPLY] = { .type = NLA_NESTED },
@@ -984,9 +994,9 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
 	[CTA_ZONE]		= { .type = NLA_U16 },
 	[CTA_MARK_MASK]		= { .type = NLA_U32 },
 	[CTA_LABELS]		= { .type = NLA_BINARY,
-				    .len = __CTA_LABELS_MAX_LENGTH },
+				    .len = NF_CT_LABELS_MAX_SIZE },
 	[CTA_LABELS_MASK]	= { .type = NLA_BINARY,
-				    .len = __CTA_LABELS_MAX_LENGTH },
+				    .len = NF_CT_LABELS_MAX_SIZE },
 };
 
 static int
@@ -1138,50 +1148,65 @@ static int ctnetlink_done_list(struct netlink_callback *cb)
 }
 
 static int
-ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb,
-		    struct hlist_nulls_head *list)
+ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying)
 {
-	struct nf_conn *ct, *last;
+	struct nf_conn *ct, *last = NULL;
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_nulls_node *n;
 	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
 	u_int8_t l3proto = nfmsg->nfgen_family;
 	int res;
+	int cpu;
+	struct hlist_nulls_head *list;
+	struct net *net = sock_net(skb->sk);
 
 	if (cb->args[2])
 		return 0;
 
-	spin_lock_bh(&nf_conntrack_lock);
-	last = (struct nf_conn *)cb->args[1];
-restart:
-	hlist_nulls_for_each_entry(h, n, list, hnnode) {
-		ct = nf_ct_tuplehash_to_ctrack(h);
-		if (l3proto && nf_ct_l3num(ct) != l3proto)
+	if (cb->args[0] == nr_cpu_ids)
+		return 0;
+
+	for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
+		struct ct_pcpu *pcpu;
+
+		if (!cpu_possible(cpu))
 			continue;
-		if (cb->args[1]) {
-			if (ct != last)
+
+		pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+		spin_lock_bh(&pcpu->lock);
+		last = (struct nf_conn *)cb->args[1];
+		list = dying ? &pcpu->dying : &pcpu->unconfirmed;
+restart:
+		hlist_nulls_for_each_entry(h, n, list, hnnode) {
+			ct = nf_ct_tuplehash_to_ctrack(h);
+			if (l3proto && nf_ct_l3num(ct) != l3proto)
 				continue;
-			cb->args[1] = 0;
-		}
-		rcu_read_lock();
-		res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
-					  cb->nlh->nlmsg_seq,
-					  NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
-					  ct);
-		rcu_read_unlock();
-		if (res < 0) {
-			nf_conntrack_get(&ct->ct_general);
-			cb->args[1] = (unsigned long)ct;
-			goto out;
+			if (cb->args[1]) {
+				if (ct != last)
+					continue;
+				cb->args[1] = 0;
+			}
+			rcu_read_lock();
+			res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
+						  cb->nlh->nlmsg_seq,
+						  NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+						  ct);
+			rcu_read_unlock();
+			if (res < 0) {
+				nf_conntrack_get(&ct->ct_general);
+				cb->args[1] = (unsigned long)ct;
+				spin_unlock_bh(&pcpu->lock);
+				goto out;
+			}
 		}
+		if (cb->args[1]) {
+			cb->args[1] = 0;
+			goto restart;
+		} else
+			cb->args[2] = 1;
+		spin_unlock_bh(&pcpu->lock);
 	}
-	if (cb->args[1]) {
-		cb->args[1] = 0;
-		goto restart;
-	} else
-		cb->args[2] = 1;
 out:
-	spin_unlock_bh(&nf_conntrack_lock);
 	if (last)
 		nf_ct_put(last);
 
@@ -1191,9 +1216,7 @@ out:
 static int
 ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct net *net = sock_net(skb->sk);
-
-	return ctnetlink_dump_list(skb, cb, &net->ct.dying);
+	return ctnetlink_dump_list(skb, cb, true);
 }
 
 static int
@@ -1215,9 +1238,7 @@ ctnetlink_get_ct_dying(struct sock *ctnl, struct sk_buff *skb,
 static int
 ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct net *net = sock_net(skb->sk);
-
-	return ctnetlink_dump_list(skb, cb, &net->ct.unconfirmed);
+	return ctnetlink_dump_list(skb, cb, false);
 }
 
 static int
@@ -1361,14 +1382,14 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
 					    nf_ct_protonum(ct));
 	if (helper == NULL) {
 #ifdef CONFIG_MODULES
-		spin_unlock_bh(&nf_conntrack_lock);
+		spin_unlock_bh(&nf_conntrack_expect_lock);
 
 		if (request_module("nfct-helper-%s", helpname) < 0) {
-			spin_lock_bh(&nf_conntrack_lock);
+			spin_lock_bh(&nf_conntrack_expect_lock);
 			return -EOPNOTSUPP;
 		}
 
-		spin_lock_bh(&nf_conntrack_lock);
+		spin_lock_bh(&nf_conntrack_expect_lock);
 		helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
 						    nf_ct_protonum(ct));
 		if (helper)
@@ -1804,9 +1825,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 	err = -EEXIST;
 	ct = nf_ct_tuplehash_to_ctrack(h);
 	if (!(nlh->nlmsg_flags & NLM_F_EXCL)) {
-		spin_lock_bh(&nf_conntrack_lock);
+		spin_lock_bh(&nf_conntrack_expect_lock);
 		err = ctnetlink_change_conntrack(ct, cda);
-		spin_unlock_bh(&nf_conntrack_lock);
+		spin_unlock_bh(&nf_conntrack_expect_lock);
 		if (err == 0) {
 			nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
 						      (1 << IPCT_ASSURED) |
@@ -2135,9 +2156,9 @@ ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct)
 	if (ret < 0)
 		return ret;
 
-	spin_lock_bh(&nf_conntrack_lock);
+	spin_lock_bh(&nf_conntrack_expect_lock);
 	ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct);
-	spin_unlock_bh(&nf_conntrack_lock);
+	spin_unlock_bh(&nf_conntrack_expect_lock);
 
 	return ret;
 }
@@ -2692,13 +2713,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 	}
 
 	/* after list removal, usage count == 1 */
-	spin_lock_bh(&nf_conntrack_lock);
+	spin_lock_bh(&nf_conntrack_expect_lock);
 	if (del_timer(&exp->timeout)) {
 		nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid,
 					   nlmsg_report(nlh));
 		nf_ct_expect_put(exp);
 	}
-	spin_unlock_bh(&nf_conntrack_lock);
+	spin_unlock_bh(&nf_conntrack_expect_lock);
 	/* have to put what we 'get' above.
 	 * after this line usage count == 0 */
 	nf_ct_expect_put(exp);
@@ -2707,7 +2728,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 		struct nf_conn_help *m_help;
 
 		/* delete all expectations for this helper */
-		spin_lock_bh(&nf_conntrack_lock);
+		spin_lock_bh(&nf_conntrack_expect_lock);
 		for (i = 0; i < nf_ct_expect_hsize; i++) {
 			hlist_for_each_entry_safe(exp, next,
 						  &net->ct.expect_hash[i],
@@ -2722,10 +2743,10 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 				}
 			}
 		}
-		spin_unlock_bh(&nf_conntrack_lock);
+		spin_unlock_bh(&nf_conntrack_expect_lock);
 	} else {
 		/* This basically means we have to flush everything*/
-		spin_lock_bh(&nf_conntrack_lock);
+		spin_lock_bh(&nf_conntrack_expect_lock);
 		for (i = 0; i < nf_ct_expect_hsize; i++) {
 			hlist_for_each_entry_safe(exp, next,
 						  &net->ct.expect_hash[i],
@@ -2738,7 +2759,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 				}
 			}
 		}
-		spin_unlock_bh(&nf_conntrack_lock);
+		spin_unlock_bh(&nf_conntrack_expect_lock);
 	}
 
 	return 0;
@@ -2964,11 +2985,11 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
 	if (err < 0)
 		return err;
 
-	spin_lock_bh(&nf_conntrack_lock);
+	spin_lock_bh(&nf_conntrack_expect_lock);
 	exp = __nf_ct_expect_find(net, zone, &tuple);
 
 	if (!exp) {
-		spin_unlock_bh(&nf_conntrack_lock);
+		spin_unlock_bh(&nf_conntrack_expect_lock);
 		err = -ENOENT;
 		if (nlh->nlmsg_flags & NLM_F_CREATE) {
 			err = ctnetlink_create_expect(net, zone, cda,
@@ -2982,7 +3003,7 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
 	err = -EEXIST;
 	if (!(nlh->nlmsg_flags & NLM_F_EXCL))
 		err = ctnetlink_change_expect(exp, cda);
-	spin_unlock_bh(&nf_conntrack_lock);
+	spin_unlock_bh(&nf_conntrack_expect_lock);
 
 	return err;
 }
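
The ctnetlink changes above come from the central-spinlock removal: table dumps now take only the lock covering the bucket being walked (nf_conntrack_locks[bucket % CONNTRACK_LOCKS]), and expectation handling moves under the dedicated nf_conntrack_expect_lock. A minimal sketch of the lock-striping pattern, with illustrative names rather than the exact kernel definitions:

#define STRIPES 1024			/* like CONNTRACK_LOCKS */

static spinlock_t stripe_locks[STRIPES];

/* one lock guards many buckets; walkers serialize only on their stripe */
static spinlock_t *lock_for_bucket(unsigned int bucket)
{
	return &stripe_locks[bucket % STRIPES];
}

static void walk_bucket(unsigned int bucket)
{
	spinlock_t *lockp = lock_for_bucket(bucket);

	spin_lock(lockp);	/* BHs disabled by the caller, as above */
	/* ... traverse hash[bucket], possibly bailing out early ... */
	spin_unlock(lockp);
}

The same series replaces the global unconfirmed/dying lists with per-cpu lists, each with its own spinlock, which is what the rewritten ctnetlink_dump_list() iterates.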
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 466410eaa482..4c3ba1c8d682 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -800,7 +800,7 @@ static int refresh_signalling_expectation(struct nf_conn *ct,
 	struct hlist_node *next;
 	int found = 0;
 
-	spin_lock_bh(&nf_conntrack_lock);
+	spin_lock_bh(&nf_conntrack_expect_lock);
 	hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
 		if (exp->class != SIP_EXPECT_SIGNALLING ||
 		    !nf_inet_addr_cmp(&exp->tuple.dst.u3, addr) ||
@@ -815,7 +815,7 @@ static int refresh_signalling_expectation(struct nf_conn *ct,
 		found = 1;
 		break;
 	}
-	spin_unlock_bh(&nf_conntrack_lock);
+	spin_unlock_bh(&nf_conntrack_expect_lock);
 	return found;
 }
 
@@ -825,7 +825,7 @@ static void flush_expectations(struct nf_conn *ct, bool media)
 	struct nf_conntrack_expect *exp;
 	struct hlist_node *next;
 
-	spin_lock_bh(&nf_conntrack_lock);
+	spin_lock_bh(&nf_conntrack_expect_lock);
 	hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
 		if ((exp->class != SIP_EXPECT_SIGNALLING) ^ media)
 			continue;
@@ -836,7 +836,7 @@ static void flush_expectations(struct nf_conn *ct, bool media)
 		if (!media)
 			break;
 	}
-	spin_unlock_bh(&nf_conntrack_lock);
+	spin_unlock_bh(&nf_conntrack_expect_lock);
 }
 
 static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index adce01e8bb57..33045a562297 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -794,9 +794,8 @@ nf_tables_counters(struct nft_base_chain *chain, const struct nlattr *attr)
 	stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
 
 	if (chain->stats) {
-		/* nfnl_lock is held, add some nfnl function for this, later */
 		struct nft_stats __percpu *oldstats =
-			rcu_dereference_protected(chain->stats, 1);
+			nft_dereference(chain->stats);
 
 		rcu_assign_pointer(chain->stats, newstats);
 		synchronize_rcu();
@@ -1254,10 +1253,11 @@ err1:
 	return err;
 }
 
-static void nf_tables_expr_destroy(struct nft_expr *expr)
+static void nf_tables_expr_destroy(const struct nft_ctx *ctx,
+				   struct nft_expr *expr)
 {
 	if (expr->ops->destroy)
-		expr->ops->destroy(expr);
+		expr->ops->destroy(ctx, expr);
 	module_put(expr->ops->type->owner);
 }
 
@@ -1296,6 +1296,8 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
 	[NFTA_RULE_EXPRESSIONS]	= { .type = NLA_NESTED },
 	[NFTA_RULE_COMPAT]	= { .type = NLA_NESTED },
 	[NFTA_RULE_POSITION]	= { .type = NLA_U64 },
+	[NFTA_RULE_USERDATA]	= { .type = NLA_BINARY,
+				    .len = NFT_USERDATA_MAXLEN },
 };
 
 static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq,
@@ -1348,6 +1350,10 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq,
 	}
 	nla_nest_end(skb, list);
 
+	if (rule->ulen &&
+	    nla_put(skb, NFTA_RULE_USERDATA, rule->ulen, nft_userdata(rule)))
+		goto nla_put_failure;
+
 	return nlmsg_end(skb, nlh);
 
 nla_put_failure:
@@ -1531,7 +1537,8 @@ err:
 	return err;
 }
 
-static void nf_tables_rule_destroy(struct nft_rule *rule)
+static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
+				   struct nft_rule *rule)
 {
 	struct nft_expr *expr;
 
@@ -1541,7 +1548,7 @@ static void nf_tables_rule_destroy(struct nft_rule *rule)
 	 */
 	expr = nft_expr_first(rule);
 	while (expr->ops && expr != nft_expr_last(rule)) {
-		nf_tables_expr_destroy(expr);
+		nf_tables_expr_destroy(ctx, expr);
 		expr = nft_expr_next(expr);
 	}
 	kfree(rule);
@@ -1552,7 +1559,7 @@ static void nf_tables_rule_destroy(struct nft_rule *rule)
 static struct nft_expr_info *info;
 
 static struct nft_rule_trans *
-nf_tables_trans_add(struct nft_rule *rule, const struct nft_ctx *ctx)
+nf_tables_trans_add(struct nft_ctx *ctx, struct nft_rule *rule)
 {
 	struct nft_rule_trans *rupd;
 
@@ -1560,11 +1567,8 @@ nf_tables_trans_add(struct nft_rule *rule, const struct nft_ctx *ctx)
 	if (rupd == NULL)
 		return NULL;
 
-	rupd->chain = ctx->chain;
-	rupd->table = ctx->table;
+	rupd->ctx = *ctx;
 	rupd->rule = rule;
-	rupd->family = ctx->afi->family;
-	rupd->nlh = ctx->nlh;
 	list_add_tail(&rupd->list, &ctx->net->nft.commit_list);
 
 	return rupd;
@@ -1584,7 +1588,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 	struct nft_expr *expr;
 	struct nft_ctx ctx;
 	struct nlattr *tmp;
-	unsigned int size, i, n;
+	unsigned int size, i, n, ulen = 0;
 	int err, rem;
 	bool create;
 	u64 handle, pos_handle;
@@ -1650,8 +1654,11 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 		}
 	}
 
+	if (nla[NFTA_RULE_USERDATA])
+		ulen = nla_len(nla[NFTA_RULE_USERDATA]);
+
 	err = -ENOMEM;
-	rule = kzalloc(sizeof(*rule) + size, GFP_KERNEL);
+	rule = kzalloc(sizeof(*rule) + size + ulen, GFP_KERNEL);
 	if (rule == NULL)
 		goto err1;
 
@@ -1659,6 +1666,10 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 
 	rule->handle = handle;
 	rule->dlen = size;
+	rule->ulen = ulen;
+
+	if (ulen)
+		nla_memcpy(nft_userdata(rule), nla[NFTA_RULE_USERDATA], ulen);
 
 	expr = nft_expr_first(rule);
 	for (i = 0; i < n; i++) {
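
The userdata support above keeps the user blob in the rule's own allocation, directly behind the dlen bytes of expression data; that is why the allocation grows to sizeof(*rule) + size + ulen and the dump path can emit it with a single nla_put(). A layout sketch (field names abridged; the in-tree accessor is nft_userdata()):

struct rule_sketch {
	unsigned int	dlen;	/* bytes of expression data */
	unsigned int	ulen;	/* bytes of user data */
	unsigned char	data[];	/* dlen expr bytes, then ulen user bytes */
};

static inline void *rule_userdata(const struct rule_sketch *rule)
{
	return (void *)&rule->data[rule->dlen];
}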
@@ -1671,7 +1682,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 
 	if (nlh->nlmsg_flags & NLM_F_REPLACE) {
 		if (nft_rule_is_active_next(net, old_rule)) {
-			repl = nf_tables_trans_add(old_rule, &ctx);
+			repl = nf_tables_trans_add(&ctx, old_rule);
 			if (repl == NULL) {
 				err = -ENOMEM;
 				goto err2;
@@ -1694,7 +1705,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 		list_add_rcu(&rule->list, &chain->rules);
 	}
 
-	if (nf_tables_trans_add(rule, &ctx) == NULL) {
+	if (nf_tables_trans_add(&ctx, rule) == NULL) {
 		err = -ENOMEM;
 		goto err3;
 	}
@@ -1709,7 +1720,7 @@ err3:
 		kfree(repl);
 	}
 err2:
-	nf_tables_rule_destroy(rule);
+	nf_tables_rule_destroy(&ctx, rule);
 err1:
 	for (i = 0; i < n; i++) {
 		if (info[i].ops != NULL)
@@ -1723,7 +1734,7 @@ nf_tables_delrule_one(struct nft_ctx *ctx, struct nft_rule *rule)
 {
 	/* You cannot delete the same rule twice */
 	if (nft_rule_is_active_next(ctx->net, rule)) {
-		if (nf_tables_trans_add(rule, ctx) == NULL)
+		if (nf_tables_trans_add(ctx, rule) == NULL)
 			return -ENOMEM;
 		nft_rule_disactivate_next(ctx->net, rule);
 		return 0;
@@ -1819,10 +1830,10 @@ static int nf_tables_commit(struct sk_buff *skb)
 		 */
 		if (nft_rule_is_active(net, rupd->rule)) {
 			nft_rule_clear(net, rupd->rule);
-			nf_tables_rule_notify(skb, rupd->nlh, rupd->table,
-					      rupd->chain, rupd->rule,
-					      NFT_MSG_NEWRULE, 0,
-					      rupd->family);
+			nf_tables_rule_notify(skb, rupd->ctx.nlh,
+					      rupd->ctx.table, rupd->ctx.chain,
+					      rupd->rule, NFT_MSG_NEWRULE, 0,
+					      rupd->ctx.afi->family);
 			list_del(&rupd->list);
 			kfree(rupd);
 			continue;
@@ -1830,9 +1841,10 @@ static int nf_tables_commit(struct sk_buff *skb)
 
 		/* This rule is in the past, get rid of it */
 		list_del_rcu(&rupd->rule->list);
-		nf_tables_rule_notify(skb, rupd->nlh, rupd->table, rupd->chain,
+		nf_tables_rule_notify(skb, rupd->ctx.nlh,
+				      rupd->ctx.table, rupd->ctx.chain,
 				      rupd->rule, NFT_MSG_DELRULE, 0,
-				      rupd->family);
+				      rupd->ctx.afi->family);
 	}
 
 	/* Make sure we don't see any packet traversing old rules */
@@ -1840,7 +1852,7 @@ static int nf_tables_commit(struct sk_buff *skb)
 
 	/* Now we can safely release unused old rules */
 	list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) {
-		nf_tables_rule_destroy(rupd->rule);
+		nf_tables_rule_destroy(&rupd->ctx, rupd->rule);
 		list_del(&rupd->list);
 		kfree(rupd);
 	}
@@ -1869,7 +1881,7 @@ static int nf_tables_abort(struct sk_buff *skb)
 	synchronize_rcu();
 
 	list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) {
-		nf_tables_rule_destroy(rupd->rule);
+		nf_tables_rule_destroy(&rupd->ctx, rupd->rule);
 		list_del(&rupd->list);
 		kfree(rupd);
 	}
@@ -2430,8 +2442,7 @@ err1:
 static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
 {
 	list_del(&set->list);
-	if (!(set->flags & NFT_SET_ANONYMOUS))
-		nf_tables_set_notify(ctx, set, NFT_MSG_DELSET);
+	nf_tables_set_notify(ctx, set, NFT_MSG_DELSET);
 
 	set->ops->destroy(set);
 	module_put(set->ops->owner);
@@ -3175,9 +3186,16 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
 	data->verdict = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE]));
 
 	switch (data->verdict) {
-	case NF_ACCEPT:
-	case NF_DROP:
-	case NF_QUEUE:
+	default:
+		switch (data->verdict & NF_VERDICT_MASK) {
+		case NF_ACCEPT:
+		case NF_DROP:
+		case NF_QUEUE:
+			break;
+		default:
+			return -EINVAL;
+		}
+		/* fall through */
 	case NFT_CONTINUE:
 	case NFT_BREAK:
 	case NFT_RETURN:
@@ -3198,8 +3216,6 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
 		data->chain = chain;
 		desc->len = sizeof(data);
 		break;
-	default:
-		return -EINVAL;
 	}
 
 	desc->type = NFT_DATA_VERDICT;
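
The reworked nft_verdict_init() above accepts any verdict whose low byte (NF_VERDICT_MASK) is NF_ACCEPT, NF_DROP or NF_QUEUE, so queue numbers and drop errno values carried in the upper bits now pass validation instead of hitting the old catch-all -EINVAL. The check in isolation (a sketch using the standard verdict codes):

static int base_verdict_ok(unsigned int verdict)
{
	switch (verdict & 0xff) {	/* NF_VERDICT_MASK */
	case 0:				/* NF_DROP */
	case 1:				/* NF_ACCEPT */
	case 3:				/* NF_QUEUE */
		return 1;
	default:
		return 0;
	}
}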
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 046aa13b4fea..e8138da4c14f 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -61,6 +61,14 @@ void nfnl_unlock(__u8 subsys_id)
 }
 EXPORT_SYMBOL_GPL(nfnl_unlock);
 
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_nfnl_is_held(u8 subsys_id)
+{
+	return lockdep_is_held(&table[subsys_id].mutex);
+}
+EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held);
+#endif
+
 int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
 {
 	nfnl_lock(n->subsys_id);
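
lockdep_nfnl_is_held() exists so RCU-protected nfnetlink state can be dereferenced under the per-subsystem mutex with lockdep verification. The nfnl_dereference()/nft_dereference() macros introduced by this series are built on it roughly along these lines (sketch of the header definitions):

#define nfnl_dereference(p, ss)					\
	rcu_dereference_protected(p, lockdep_nfnl_is_held(ss))

#define nft_dereference(p)					\
	nfnl_dereference(p, NFNL_SUBSYS_NFTABLES)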
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index a155d19a225e..d292c8d286eb 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -28,8 +28,6 @@
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/list.h>
-#include <linux/jhash.h>
-#include <linux/random.h>
 #include <linux/slab.h>
 #include <net/sock.h>
 #include <net/netfilter/nf_log.h>
@@ -75,7 +73,6 @@ struct nfulnl_instance {
 };
 
 #define INSTANCE_BUCKETS	16
-static unsigned int hash_init;
 
 static int nfnl_log_net_id __read_mostly;
 
@@ -1067,11 +1064,6 @@ static int __init nfnetlink_log_init(void)
 {
 	int status = -ENOMEM;
 
-	/* it's not really all that important to have a random value, so
-	 * we can do this from the init function, even if there hasn't
-	 * been that much entropy yet */
-	get_random_bytes(&hash_init, sizeof(hash_init));
-
 	netlink_register_notifier(&nfulnl_rtnl_notifier);
 	status = nfnetlink_subsys_register(&nfulnl_subsys);
 	if (status < 0) {
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 82cb8236f8a1..8a779be832fb 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -192,7 +192,7 @@ err:
 }
 
 static void
-nft_target_destroy(const struct nft_expr *expr)
+nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 {
 	struct xt_target *target = expr->ops->data;
 
@@ -379,7 +379,7 @@ err:
 }
 
 static void
-nft_match_destroy(const struct nft_expr *expr)
+nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 {
 	struct xt_match *match = expr->ops->data;
 
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 46e275403838..bd0d41e69341 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -19,15 +19,15 @@
 #include <net/netfilter/nf_conntrack_tuple.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_labels.h>
 
 struct nft_ct {
 	enum nft_ct_keys	key:8;
 	enum ip_conntrack_dir	dir:8;
-	union{
+	union {
 		enum nft_registers	dreg:8;
 		enum nft_registers	sreg:8;
 	};
-	uint8_t			family;
 };
 
 static void nft_ct_get_eval(const struct nft_expr *expr,
@@ -97,6 +97,26 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
 			goto err;
 		strncpy((char *)dest->data, helper->name, sizeof(dest->data));
 		return;
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+	case NFT_CT_LABELS: {
+		struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+		unsigned int size;
+
+		if (!labels) {
+			memset(dest->data, 0, sizeof(dest->data));
+			return;
+		}
+
+		BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > sizeof(dest->data));
+		size = labels->words * sizeof(long);
+
+		memcpy(dest->data, labels->bits, size);
+		if (size < sizeof(dest->data))
+			memset(((char *) dest->data) + size, 0,
+			       sizeof(dest->data) - size);
+		return;
+	}
+#endif
 	}
 
 	tuple = &ct->tuplehash[priv->dir].tuple;
@@ -221,6 +241,9 @@ static int nft_ct_init_validate_get(const struct nft_expr *expr,
 #ifdef CONFIG_NF_CONNTRACK_SECMARK
 	case NFT_CT_SECMARK:
 #endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+	case NFT_CT_LABELS:
+#endif
 	case NFT_CT_EXPIRATION:
 	case NFT_CT_HELPER:
 		if (tb[NFTA_CT_DIRECTION] != NULL)
@@ -292,16 +315,13 @@ static int nft_ct_init(const struct nft_ctx *ctx,
 	if (err < 0)
 		return err;
 
-	priv->family = ctx->afi->family;
-
 	return 0;
 }
 
-static void nft_ct_destroy(const struct nft_expr *expr)
+static void nft_ct_destroy(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr)
 {
-	struct nft_ct *priv = nft_expr_priv(expr);
-
-	nft_ct_l3proto_module_put(priv->family);
+	nft_ct_l3proto_module_put(ctx->afi->family);
 }
 
 static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
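
The NFT_CT_LABELS getter copies only the label words the conntrack actually carries and zero-fills the rest of the destination register, so comparisons against full-width register data treat absent labels as all-zero bits. The copy-and-pad step in isolation (illustrative helper):

#include <string.h>

static void copy_zero_pad(void *dst, size_t dst_len,
			  const void *src, size_t src_len)
{
	size_t n = src_len < dst_len ? src_len : dst_len;

	memcpy(dst, src, n);
	memset((char *)dst + n, 0, dst_len - n);
}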
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 3d3f8fce10a5..6a1acde16c60 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -18,17 +18,29 @@
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables.h>
 
+#define NFT_HASH_MIN_SIZE	4
+
 struct nft_hash {
-	struct hlist_head	*hash;
-	unsigned int		hsize;
+	struct nft_hash_table __rcu	*tbl;
+};
+
+struct nft_hash_table {
+	unsigned int			size;
+	unsigned int			elements;
+	struct nft_hash_elem __rcu	*buckets[];
 };
 
 struct nft_hash_elem {
-	struct hlist_node	hnode;
+	struct nft_hash_elem __rcu	*next;
 	struct nft_data		key;
 	struct nft_data		data[];
 };
 
+#define nft_hash_for_each_entry(i, head) \
+	for (i = nft_dereference(head); i != NULL; i = nft_dereference(i->next))
+#define nft_hash_for_each_entry_rcu(i, head) \
+	for (i = rcu_dereference(head); i != NULL; i = rcu_dereference(i->next))
+
 static u32 nft_hash_rnd __read_mostly;
 static bool nft_hash_rnd_initted __read_mostly;
 
@@ -38,7 +50,7 @@ static unsigned int nft_hash_data(const struct nft_data *data,
 	unsigned int h;
 
 	h = jhash(data->data, len, nft_hash_rnd);
-	return ((u64)h * hsize) >> 32;
+	return h & (hsize - 1);
 }
 
 static bool nft_hash_lookup(const struct nft_set *set,
@@ -46,11 +58,12 @@ static bool nft_hash_lookup(const struct nft_set *set,
 			    struct nft_data *data)
 {
 	const struct nft_hash *priv = nft_set_priv(set);
+	const struct nft_hash_table *tbl = rcu_dereference(priv->tbl);
 	const struct nft_hash_elem *he;
 	unsigned int h;
 
-	h = nft_hash_data(key, priv->hsize, set->klen);
-	hlist_for_each_entry(he, &priv->hash[h], hnode) {
+	h = nft_hash_data(key, tbl->size, set->klen);
+	nft_hash_for_each_entry_rcu(he, tbl->buckets[h]) {
 		if (nft_data_cmp(&he->key, key, set->klen))
 			continue;
 		if (set->flags & NFT_SET_MAP)
@@ -60,19 +73,148 @@ static bool nft_hash_lookup(const struct nft_set *set,
 	return false;
 }
 
-static void nft_hash_elem_destroy(const struct nft_set *set,
-				  struct nft_hash_elem *he)
+static void nft_hash_tbl_free(const struct nft_hash_table *tbl)
 {
-	nft_data_uninit(&he->key, NFT_DATA_VALUE);
-	if (set->flags & NFT_SET_MAP)
-		nft_data_uninit(he->data, set->dtype);
-	kfree(he);
+	if (is_vmalloc_addr(tbl))
+		vfree(tbl);
+	else
+		kfree(tbl);
+}
+
+static struct nft_hash_table *nft_hash_tbl_alloc(unsigned int nbuckets)
+{
+	struct nft_hash_table *tbl;
+	size_t size;
+
+	size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
+	tbl = kzalloc(size, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN);
+	if (tbl == NULL)
+		tbl = vzalloc(size);
+	if (tbl == NULL)
+		return NULL;
+	tbl->size = nbuckets;
+
+	return tbl;
+}
+
+static void nft_hash_chain_unzip(const struct nft_set *set,
+				 const struct nft_hash_table *ntbl,
+				 struct nft_hash_table *tbl, unsigned int n)
+{
+	struct nft_hash_elem *he, *last, *next;
+	unsigned int h;
+
+	he = nft_dereference(tbl->buckets[n]);
+	if (he == NULL)
+		return;
+	h = nft_hash_data(&he->key, ntbl->size, set->klen);
+
+	/* Find last element of first chain hashing to bucket h */
+	last = he;
+	nft_hash_for_each_entry(he, he->next) {
+		if (nft_hash_data(&he->key, ntbl->size, set->klen) != h)
+			break;
+		last = he;
+	}
+
+	/* Unlink first chain from the old table */
+	RCU_INIT_POINTER(tbl->buckets[n], last->next);
+
+	/* If end of chain reached, done */
+	if (he == NULL)
+		return;
+
+	/* Find first element of second chain hashing to bucket h */
+	next = NULL;
+	nft_hash_for_each_entry(he, he->next) {
+		if (nft_hash_data(&he->key, ntbl->size, set->klen) != h)
+			continue;
+		next = he;
+		break;
+	}
+
+	/* Link the two chains */
+	RCU_INIT_POINTER(last->next, next);
+}
+
+static int nft_hash_tbl_expand(const struct nft_set *set, struct nft_hash *priv)
+{
+	struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl;
+	struct nft_hash_elem *he;
+	unsigned int i, h;
+	bool complete;
+
+	ntbl = nft_hash_tbl_alloc(tbl->size * 2);
+	if (ntbl == NULL)
+		return -ENOMEM;
+
+	/* Link new table's buckets to first element in the old table
+	 * hashing to the new bucket.
+	 */
+	for (i = 0; i < ntbl->size; i++) {
+		h = i < tbl->size ? i : i - tbl->size;
+		nft_hash_for_each_entry(he, tbl->buckets[h]) {
+			if (nft_hash_data(&he->key, ntbl->size, set->klen) != i)
+				continue;
+			RCU_INIT_POINTER(ntbl->buckets[i], he);
+			break;
+		}
+	}
+	ntbl->elements = tbl->elements;
+
+	/* Publish new table */
+	rcu_assign_pointer(priv->tbl, ntbl);
+
+	/* Unzip interleaved hash chains */
+	do {
+		/* Wait for readers to use new table/unzipped chains */
+		synchronize_rcu();
+
+		complete = true;
+		for (i = 0; i < tbl->size; i++) {
+			nft_hash_chain_unzip(set, ntbl, tbl, i);
+			if (tbl->buckets[i] != NULL)
+				complete = false;
+		}
+	} while (!complete);
+
+	nft_hash_tbl_free(tbl);
+	return 0;
+}
+
+static int nft_hash_tbl_shrink(const struct nft_set *set, struct nft_hash *priv)
+{
+	struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl;
+	struct nft_hash_elem __rcu **pprev;
+	unsigned int i;
+
+	ntbl = nft_hash_tbl_alloc(tbl->size / 2);
+	if (ntbl == NULL)
+		return -ENOMEM;
+
+	for (i = 0; i < ntbl->size; i++) {
+		ntbl->buckets[i] = tbl->buckets[i];
+
+		for (pprev = &ntbl->buckets[i]; *pprev != NULL;
+		     pprev = &nft_dereference(*pprev)->next)
+			;
+		RCU_INIT_POINTER(*pprev, tbl->buckets[i + ntbl->size]);
+	}
+	ntbl->elements = tbl->elements;
+
+	/* Publish new table */
+	rcu_assign_pointer(priv->tbl, ntbl);
+	synchronize_rcu();
+
+	nft_hash_tbl_free(tbl);
+	return 0;
 }
 
 static int nft_hash_insert(const struct nft_set *set,
 			   const struct nft_set_elem *elem)
 {
 	struct nft_hash *priv = nft_set_priv(set);
+	struct nft_hash_table *tbl = nft_dereference(priv->tbl);
 	struct nft_hash_elem *he;
 	unsigned int size, h;
 
@@ -91,33 +233,66 @@ static int nft_hash_insert(const struct nft_set *set,
 	if (set->flags & NFT_SET_MAP)
 		nft_data_copy(he->data, &elem->data);
 
-	h = nft_hash_data(&he->key, priv->hsize, set->klen);
-	hlist_add_head_rcu(&he->hnode, &priv->hash[h]);
+	h = nft_hash_data(&he->key, tbl->size, set->klen);
+	RCU_INIT_POINTER(he->next, tbl->buckets[h]);
+	rcu_assign_pointer(tbl->buckets[h], he);
+	tbl->elements++;
+
+	/* Expand table when exceeding 75% load */
+	if (tbl->elements > tbl->size / 4 * 3)
+		nft_hash_tbl_expand(set, priv);
+
 	return 0;
 }
 
+static void nft_hash_elem_destroy(const struct nft_set *set,
+				  struct nft_hash_elem *he)
+{
+	nft_data_uninit(&he->key, NFT_DATA_VALUE);
+	if (set->flags & NFT_SET_MAP)
+		nft_data_uninit(he->data, set->dtype);
+	kfree(he);
+}
+
 static void nft_hash_remove(const struct nft_set *set,
 			    const struct nft_set_elem *elem)
 {
-	struct nft_hash_elem *he = elem->cookie;
+	struct nft_hash *priv = nft_set_priv(set);
+	struct nft_hash_table *tbl = nft_dereference(priv->tbl);
+	struct nft_hash_elem *he, __rcu **pprev;
 
-	hlist_del_rcu(&he->hnode);
+	pprev = elem->cookie;
+	he = nft_dereference((*pprev));
+
+	RCU_INIT_POINTER(*pprev, he->next);
+	synchronize_rcu();
 	kfree(he);
+	tbl->elements--;
+
+	/* Shrink table beneath 30% load */
+	if (tbl->elements < tbl->size * 3 / 10 &&
+	    tbl->size > NFT_HASH_MIN_SIZE)
+		nft_hash_tbl_shrink(set, priv);
 }
 
 static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem)
 {
 	const struct nft_hash *priv = nft_set_priv(set);
+	const struct nft_hash_table *tbl = nft_dereference(priv->tbl);
+	struct nft_hash_elem __rcu * const *pprev;
 	struct nft_hash_elem *he;
 	unsigned int h;
 
-	h = nft_hash_data(&elem->key, priv->hsize, set->klen);
-	hlist_for_each_entry(he, &priv->hash[h], hnode) {
-		if (nft_data_cmp(&he->key, &elem->key, set->klen))
+	h = nft_hash_data(&elem->key, tbl->size, set->klen);
+	pprev = &tbl->buckets[h];
+	nft_hash_for_each_entry(he, tbl->buckets[h]) {
+		if (nft_data_cmp(&he->key, &elem->key, set->klen)) {
+			pprev = &he->next;
 			continue;
+		}
 
-		elem->cookie = he;
+		elem->cookie = (void *)pprev;
 		elem->flags = 0;
 		if (set->flags & NFT_SET_MAP)
 			nft_data_copy(&elem->data, he->data);
 		return 0;
@@ -129,12 +304,13 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
 			  struct nft_set_iter *iter)
 {
 	const struct nft_hash *priv = nft_set_priv(set);
+	const struct nft_hash_table *tbl = nft_dereference(priv->tbl);
 	const struct nft_hash_elem *he;
 	struct nft_set_elem elem;
 	unsigned int i;
 
-	for (i = 0; i < priv->hsize; i++) {
-		hlist_for_each_entry(he, &priv->hash[i], hnode) {
+	for (i = 0; i < tbl->size; i++) {
+		nft_hash_for_each_entry(he, tbl->buckets[i]) {
 			if (iter->count < iter->skip)
 				goto cont;
 
@@ -161,43 +337,35 @@ static int nft_hash_init(const struct nft_set *set,
 			 const struct nlattr * const tb[])
 {
 	struct nft_hash *priv = nft_set_priv(set);
-	unsigned int cnt, i;
+	struct nft_hash_table *tbl;
 
 	if (unlikely(!nft_hash_rnd_initted)) {
 		get_random_bytes(&nft_hash_rnd, 4);
 		nft_hash_rnd_initted = true;
 	}
 
-	/* Aim for a load factor of 0.75 */
-	// FIXME: temporarily broken until we have set descriptions
-	cnt = 100;
-	cnt = cnt * 4 / 3;
-
-	priv->hash = kcalloc(cnt, sizeof(struct hlist_head), GFP_KERNEL);
-	if (priv->hash == NULL)
+	tbl = nft_hash_tbl_alloc(NFT_HASH_MIN_SIZE);
+	if (tbl == NULL)
 		return -ENOMEM;
-	priv->hsize = cnt;
-
-	for (i = 0; i < cnt; i++)
-		INIT_HLIST_HEAD(&priv->hash[i]);
-
+	RCU_INIT_POINTER(priv->tbl, tbl);
 	return 0;
 }
 
 static void nft_hash_destroy(const struct nft_set *set)
 {
 	const struct nft_hash *priv = nft_set_priv(set);
-	const struct hlist_node *next;
-	struct nft_hash_elem *elem;
+	const struct nft_hash_table *tbl = nft_dereference(priv->tbl);
+	struct nft_hash_elem *he, *next;
 	unsigned int i;
 
-	for (i = 0; i < priv->hsize; i++) {
-		hlist_for_each_entry_safe(elem, next, &priv->hash[i], hnode) {
-			hlist_del(&elem->hnode);
-			nft_hash_elem_destroy(set, elem);
+	for (i = 0; i < tbl->size; i++) {
+		for (he = nft_dereference(tbl->buckets[i]); he != NULL;
+		     he = next) {
+			next = nft_dereference(he->next);
+			nft_hash_elem_destroy(set, he);
 		}
 	}
-	kfree(priv->hash);
+	kfree(tbl);
 }
 
 static struct nft_set_ops nft_hash_ops __read_mostly = {
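
The nft_hash rework above replaces the fixed hlist table with a resizable, power-of-two array of singly linked buckets. Since nft_hash_data() now masks with (size - 1), an element in old bucket i can only end up in new bucket i or i + old_size after doubling; nft_hash_tbl_expand() exploits this by publishing the new table first and then unzipping the interleaved chains between synchronize_rcu() rounds, so readers never observe a broken chain. The bucket-split invariant in isolation (userspace-style sketch, no RCU):

#include <stdint.h>
#include <stddef.h>

struct elem {
	struct elem *next;
	uint32_t hash;			/* full hash of the element's key */
};

/* Split old bucket i into new buckets i (*lo) and i + old_size (*hi):
 * with power-of-two sizing, bit old_size of the hash decides the target.
 */
static void split_bucket(struct elem **old_head, struct elem **lo,
			 struct elem **hi, size_t old_size)
{
	struct elem *e, *next;

	for (e = *old_head; e != NULL; e = next) {
		next = e->next;
		if (e->hash & old_size) {
			e->next = *hi;
			*hi = e;
		} else {
			e->next = *lo;
			*lo = e;
		}
	}
	*old_head = NULL;
}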
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index f169501f1ad4..810385eb7249 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -70,7 +70,8 @@ err1:
 	return err;
 }
 
-static void nft_immediate_destroy(const struct nft_expr *expr)
+static void nft_immediate_destroy(const struct nft_ctx *ctx,
+				  const struct nft_expr *expr)
 {
 	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
 	return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg));
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 26c5154e05f3..10cfb156cdf4 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -74,7 +74,8 @@ static int nft_log_init(const struct nft_ctx *ctx,
 	return 0;
 }
 
-static void nft_log_destroy(const struct nft_expr *expr)
+static void nft_log_destroy(const struct nft_ctx *ctx,
+			    const struct nft_expr *expr)
 {
 	struct nft_log *priv = nft_expr_priv(expr);
 
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index bb4ef4cccb6e..7fd2bea8aa23 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -89,11 +89,12 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
 	return 0;
 }
 
-static void nft_lookup_destroy(const struct nft_expr *expr)
+static void nft_lookup_destroy(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr)
 {
 	struct nft_lookup *priv = nft_expr_priv(expr);
 
-	nf_tables_unbind_set(NULL, priv->set, &priv->binding);
+	nf_tables_unbind_set(ctx, priv->set, &priv->binding);
 }
 
 static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index d3b1ffe26181..a0195d28bcfc 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -31,8 +31,8 @@ struct nft_nat {
 	enum nft_registers	sreg_addr_max:8;
 	enum nft_registers	sreg_proto_min:8;
 	enum nft_registers	sreg_proto_max:8;
-	int			family;
-	enum nf_nat_manip_type	type;
+	enum nf_nat_manip_type	type:8;
+	u8			family;
 };
 
 static void nft_nat_eval(const struct nft_expr *expr,
@@ -88,6 +88,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 			const struct nlattr * const tb[])
 {
 	struct nft_nat *priv = nft_expr_priv(expr);
+	u32 family;
 	int err;
 
 	if (tb[NFTA_NAT_TYPE] == NULL)
@@ -107,9 +108,12 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 	if (tb[NFTA_NAT_FAMILY] == NULL)
 		return -EINVAL;
 
-	priv->family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY]));
-	if (priv->family != AF_INET && priv->family != AF_INET6)
-		return -EINVAL;
+	family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY]));
+	if (family != AF_INET && family != AF_INET6)
+		return -EAFNOSUPPORT;
+	if (family != ctx->afi->family)
+		return -EOPNOTSUPP;
+	priv->family = family;
 
 	if (tb[NFTA_NAT_REG_ADDR_MIN]) {
 		priv->sreg_addr_min = ntohl(nla_get_be32(
@@ -202,13 +206,7 @@ static struct nft_expr_type nft_nat_type __read_mostly = {
 
 static int __init nft_nat_module_init(void)
 {
-	int err;
-
-	err = nft_register_expr(&nft_nat_type);
-	if (err < 0)
-		return err;
-
-	return 0;
+	return nft_register_expr(&nft_nat_type);
 }
 
 static void __exit nft_nat_module_exit(void)
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
index 3228d7f24eb4..4973cbddc446 100644
--- a/net/netfilter/xt_AUDIT.c
+++ b/net/netfilter/xt_AUDIT.c
@@ -146,11 +146,11 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
 
 	if (par->family == NFPROTO_BRIDGE) {
 		switch (eth_hdr(skb)->h_proto) {
-		case __constant_htons(ETH_P_IP):
+		case htons(ETH_P_IP):
 			audit_ip4(ab, skb);
 			break;
 
-		case __constant_htons(ETH_P_IPV6):
+		case htons(ETH_P_IPV6):
 			audit_ip6(ab, skb);
 			break;
 		}
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index c40b2695633b..458464e7bd7a 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -19,6 +19,7 @@
 #include <linux/jhash.h>
 #include <linux/slab.h>
 #include <linux/list.h>
+#include <linux/rbtree.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/skbuff.h>
@@ -31,6 +32,10 @@
 #include <net/netfilter/nf_conntrack_tuple.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 
+#define CONNLIMIT_SLOTS		32
+#define CONNLIMIT_LOCK_SLOTS	32
+#define CONNLIMIT_GC_MAX_NODES	8
+
 /* we will save the tuples of all connections we care about */
 struct xt_connlimit_conn {
 	struct hlist_node		node;
@@ -38,16 +43,26 @@ struct xt_connlimit_conn {
 	union nf_inet_addr		addr;
 };
 
+struct xt_connlimit_rb {
+	struct rb_node node;
+	struct hlist_head hhead; /* connections/hosts in same subnet */
+	union nf_inet_addr addr; /* search key */
+};
+
 struct xt_connlimit_data {
-	struct hlist_head	iphash[256];
-	spinlock_t		lock;
+	struct rb_root		climit_root4[CONNLIMIT_SLOTS];
+	struct rb_root		climit_root6[CONNLIMIT_SLOTS];
+	spinlock_t		locks[CONNLIMIT_LOCK_SLOTS];
 };
 
 static u_int32_t connlimit_rnd __read_mostly;
+static struct kmem_cache *connlimit_rb_cachep __read_mostly;
+static struct kmem_cache *connlimit_conn_cachep __read_mostly;
 
 static inline unsigned int connlimit_iphash(__be32 addr)
 {
-	return jhash_1word((__force __u32)addr, connlimit_rnd) & 0xFF;
+	return jhash_1word((__force __u32)addr,
+			   connlimit_rnd) % CONNLIMIT_SLOTS;
 }
 
 static inline unsigned int
@@ -60,7 +75,8 @@ connlimit_iphash6(const union nf_inet_addr *addr,
 	for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i)
 		res.ip6[i] = addr->ip6[i] & mask->ip6[i];
 
-	return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), connlimit_rnd) & 0xFF;
+	return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6),
+		      connlimit_rnd) % CONNLIMIT_SLOTS;
 }
 
 static inline bool already_closed(const struct nf_conn *conn)
66static inline bool already_closed(const struct nf_conn *conn) 82static inline bool already_closed(const struct nf_conn *conn)
@@ -72,13 +88,14 @@ static inline bool already_closed(const struct nf_conn *conn)
72 return 0; 88 return 0;
73} 89}
74 90
75static inline unsigned int 91static int
76same_source_net(const union nf_inet_addr *addr, 92same_source_net(const union nf_inet_addr *addr,
77 const union nf_inet_addr *mask, 93 const union nf_inet_addr *mask,
78 const union nf_inet_addr *u3, u_int8_t family) 94 const union nf_inet_addr *u3, u_int8_t family)
79{ 95{
80 if (family == NFPROTO_IPV4) { 96 if (family == NFPROTO_IPV4) {
81 return (addr->ip & mask->ip) == (u3->ip & mask->ip); 97 return ntohl(addr->ip & mask->ip) -
98 ntohl(u3->ip & mask->ip);
82 } else { 99 } else {
83 union nf_inet_addr lh, rh; 100 union nf_inet_addr lh, rh;
84 unsigned int i; 101 unsigned int i;
@@ -88,89 +105,205 @@ same_source_net(const union nf_inet_addr *addr,
 			rh.ip6[i] = u3->ip6[i] & mask->ip6[i];
 		}
 
-		return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6)) == 0;
+		return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6));
 	}
 }
 
-static int count_them(struct net *net,
-		      struct xt_connlimit_data *data,
+static bool add_hlist(struct hlist_head *head,
 		      const struct nf_conntrack_tuple *tuple,
-		      const union nf_inet_addr *addr,
-		      const union nf_inet_addr *mask,
-		      u_int8_t family)
+		      const union nf_inet_addr *addr)
+{
+	struct xt_connlimit_conn *conn;
+
+	conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC);
+	if (conn == NULL)
+		return false;
+	conn->tuple = *tuple;
+	conn->addr = *addr;
+	hlist_add_head(&conn->node, head);
+	return true;
+}
+
+static unsigned int check_hlist(struct net *net,
+				struct hlist_head *head,
+				const struct nf_conntrack_tuple *tuple,
+				bool *addit)
 {
 	const struct nf_conntrack_tuple_hash *found;
 	struct xt_connlimit_conn *conn;
 	struct hlist_node *n;
 	struct nf_conn *found_ct;
-	struct hlist_head *hash;
-	bool addit = true;
-	int matches = 0;
-
-	if (family == NFPROTO_IPV6)
-		hash = &data->iphash[connlimit_iphash6(addr, mask)];
-	else
-		hash = &data->iphash[connlimit_iphash(addr->ip & mask->ip)];
+	unsigned int length = 0;
 
+	*addit = true;
 	rcu_read_lock();
 
 	/* check the saved connections */
-	hlist_for_each_entry_safe(conn, n, hash, node) {
+	hlist_for_each_entry_safe(conn, n, head, node) {
 		found = nf_conntrack_find_get(net, NF_CT_DEFAULT_ZONE,
 					      &conn->tuple);
-		found_ct = NULL;
+		if (found == NULL) {
+			hlist_del(&conn->node);
+			kmem_cache_free(connlimit_conn_cachep, conn);
+			continue;
+		}
 
-		if (found != NULL)
-			found_ct = nf_ct_tuplehash_to_ctrack(found);
+		found_ct = nf_ct_tuplehash_to_ctrack(found);
 
-		if (found_ct != NULL &&
-		    nf_ct_tuple_equal(&conn->tuple, tuple) &&
-		    !already_closed(found_ct))
+		if (nf_ct_tuple_equal(&conn->tuple, tuple)) {
 			/*
 			 * Just to be sure we have it only once in the list.
 			 * We should not see tuples twice unless someone hooks
 			 * this into a table without "-p tcp --syn".
 			 */
-			addit = false;
-
-		if (found == NULL) {
-			/* this one is gone */
-			hlist_del(&conn->node);
-			kfree(conn);
-			continue;
-		}
-
-		if (already_closed(found_ct)) {
+			*addit = false;
+		} else if (already_closed(found_ct)) {
 			/*
 			 * we do not care about connections which are
 			 * closed already -> ditch it
 			 */
 			nf_ct_put(found_ct);
 			hlist_del(&conn->node);
-			kfree(conn);
+			kmem_cache_free(connlimit_conn_cachep, conn);
 			continue;
 		}
 
-		if (same_source_net(addr, mask, &conn->addr, family))
-			/* same source network -> be counted! */
-			++matches;
 		nf_ct_put(found_ct);
+		length++;
 	}
 
 	rcu_read_unlock();
 
-	if (addit) {
-		/* save the new connection in our list */
-		conn = kmalloc(sizeof(*conn), GFP_ATOMIC);
-		if (conn == NULL)
-			return -ENOMEM;
-		conn->tuple = *tuple;
-		conn->addr = *addr;
-		hlist_add_head(&conn->node, hash);
-		++matches;
+	return length;
+}
+
+static void tree_nodes_free(struct rb_root *root,
+			    struct xt_connlimit_rb *gc_nodes[],
+			    unsigned int gc_count)
+{
+	struct xt_connlimit_rb *rbconn;
+
+	while (gc_count) {
+		rbconn = gc_nodes[--gc_count];
+		rb_erase(&rbconn->node, root);
+		kmem_cache_free(connlimit_rb_cachep, rbconn);
+	}
+}
+
+static unsigned int
+count_tree(struct net *net, struct rb_root *root,
+	   const struct nf_conntrack_tuple *tuple,
+	   const union nf_inet_addr *addr, const union nf_inet_addr *mask,
+	   u8 family)
+{
+	struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES];
+	struct rb_node **rbnode, *parent;
+	struct xt_connlimit_rb *rbconn;
+	struct xt_connlimit_conn *conn;
+	unsigned int gc_count;
+	bool no_gc = false;
+
+ restart:
+	gc_count = 0;
+	parent = NULL;
+	rbnode = &(root->rb_node);
+	while (*rbnode) {
+		int diff;
+		bool addit;
+
+		rbconn = container_of(*rbnode, struct xt_connlimit_rb, node);
+
+		parent = *rbnode;
+		diff = same_source_net(addr, mask, &rbconn->addr, family);
+		if (diff < 0) {
+			rbnode = &((*rbnode)->rb_left);
+		} else if (diff > 0) {
+			rbnode = &((*rbnode)->rb_right);
+		} else {
+			/* same source network -> be counted! */
+			unsigned int count;
+			count = check_hlist(net, &rbconn->hhead, tuple, &addit);
+
+			tree_nodes_free(root, gc_nodes, gc_count);
+			if (!addit)
+				return count;
+
+			if (!add_hlist(&rbconn->hhead, tuple, addr))
+				return 0; /* hotdrop */
+
+			return count + 1;
+		}
+
+		if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes))
+			continue;
+
+		/* only used for GC on hhead, retval and 'addit' ignored */
+		check_hlist(net, &rbconn->hhead, tuple, &addit);
+		if (hlist_empty(&rbconn->hhead))
+			gc_nodes[gc_count++] = rbconn;
+	}
+
+	if (gc_count) {
+		no_gc = true;
+		tree_nodes_free(root, gc_nodes, gc_count);
+		/* tree_node_free before new allocation permits
+		 * allocator to re-use newly free'd object.
+		 *
+		 * This is a rare event; in most cases we will find
+		 * existing node to re-use. (or gc_count is 0).
+		 */
+		goto restart;
+	}
+
+	/* no match, need to insert new node */
+	rbconn = kmem_cache_alloc(connlimit_rb_cachep, GFP_ATOMIC);
+	if (rbconn == NULL)
+		return 0;
+
+	conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC);
+	if (conn == NULL) {
+		kmem_cache_free(connlimit_rb_cachep, rbconn);
+		return 0;
+	}
+
+	conn->tuple = *tuple;
+	conn->addr = *addr;
+	rbconn->addr = *addr;
+
+	INIT_HLIST_HEAD(&rbconn->hhead);
+	hlist_add_head(&conn->node, &rbconn->hhead);
+
+	rb_link_node(&rbconn->node, parent, rbnode);
+	rb_insert_color(&rbconn->node, root);
+	return 1;
+}
+
+static int count_them(struct net *net,
+		      struct xt_connlimit_data *data,
+		      const struct nf_conntrack_tuple *tuple,
+		      const union nf_inet_addr *addr,
+		      const union nf_inet_addr *mask,
+		      u_int8_t family)
+{
+	struct rb_root *root;
+	int count;
+	u32 hash;
+
+	if (family == NFPROTO_IPV6) {
+		hash = connlimit_iphash6(addr, mask);
+		root = &data->climit_root6[hash];
+	} else {
+		hash = connlimit_iphash(addr->ip & mask->ip);
+		root = &data->climit_root4[hash];
 	}
 
-	return matches;
+	spin_lock_bh(&data->locks[hash % CONNLIMIT_LOCK_SLOTS]);
+
+	count = count_tree(net, root, tuple, addr, mask, family);
+
+	spin_unlock_bh(&data->locks[hash % CONNLIMIT_LOCK_SLOTS]);
+
+	return count;
 }
 
 static bool
@@ -183,7 +316,7 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	const struct nf_conntrack_tuple *tuple_ptr = &tuple;
 	enum ip_conntrack_info ctinfo;
 	const struct nf_conn *ct;
-	int connections;
+	unsigned int connections;
 
 	ct = nf_ct_get(skb, &ctinfo);
 	if (ct != NULL)
@@ -202,12 +335,9 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 			iph->daddr : iph->saddr;
 	}
 
-	spin_lock_bh(&info->data->lock);
 	connections = count_them(net, info->data, tuple_ptr, &addr,
 				 &info->mask, par->family);
-	spin_unlock_bh(&info->data->lock);
-
-	if (connections < 0)
+	if (connections == 0)
 		/* kmalloc failed, drop it entirely */
 		goto hotdrop;
 
@@ -247,29 +377,47 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
 		return -ENOMEM;
 	}
 
-	spin_lock_init(&info->data->lock);
-	for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i)
-		INIT_HLIST_HEAD(&info->data->iphash[i]);
+	for (i = 0; i < ARRAY_SIZE(info->data->locks); ++i)
+		spin_lock_init(&info->data->locks[i]);
+
+	for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i)
+		info->data->climit_root4[i] = RB_ROOT;
+	for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i)
+		info->data->climit_root6[i] = RB_ROOT;
 
 	return 0;
 }
 
-static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
+static void destroy_tree(struct rb_root *r)
 {
-	const struct xt_connlimit_info *info = par->matchinfo;
 	struct xt_connlimit_conn *conn;
+	struct xt_connlimit_rb *rbconn;
 	struct hlist_node *n;
-	struct hlist_head *hash = info->data->iphash;
+	struct rb_node *node;
+
+	while ((node = rb_first(r)) != NULL) {
+		rbconn = container_of(node, struct xt_connlimit_rb, node);
+
+		rb_erase(node, r);
+
+		hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node)
+			kmem_cache_free(connlimit_conn_cachep, conn);
+
+		kmem_cache_free(connlimit_rb_cachep, rbconn);
+	}
+}
+
+static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
+{
+	const struct xt_connlimit_info *info = par->matchinfo;
 	unsigned int i;
 
 	nf_ct_l3proto_module_put(par->family);
 
-	for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) {
-		hlist_for_each_entry_safe(conn, n, &hash[i], node) {
-			hlist_del(&conn->node);
-			kfree(conn);
-		}
-	}
+	for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i)
+		destroy_tree(&info->data->climit_root4[i]);
+	for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i)
+		destroy_tree(&info->data->climit_root6[i]);
 
 	kfree(info->data);
 }
@@ -287,12 +435,37 @@ static struct xt_match connlimit_mt_reg __read_mostly = {
 
 static int __init connlimit_mt_init(void)
 {
-	return xt_register_match(&connlimit_mt_reg);
+	int ret;
+
+	BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS);
+	BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0);
+
+	connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn",
+					   sizeof(struct xt_connlimit_conn),
+					   0, 0, NULL);
+	if (!connlimit_conn_cachep)
+		return -ENOMEM;
+
+	connlimit_rb_cachep = kmem_cache_create("xt_connlimit_rb",
+					   sizeof(struct xt_connlimit_rb),
+					   0, 0, NULL);
+	if (!connlimit_rb_cachep) {
+		kmem_cache_destroy(connlimit_conn_cachep);
+		return -ENOMEM;
+	}
+	ret = xt_register_match(&connlimit_mt_reg);
+	if (ret != 0) {
+		kmem_cache_destroy(connlimit_conn_cachep);
+		kmem_cache_destroy(connlimit_rb_cachep);
+	}
+	return ret;
 }
 
 static void __exit connlimit_mt_exit(void)
 {
 	xt_unregister_match(&connlimit_mt_reg);
+	kmem_cache_destroy(connlimit_conn_cachep);
+	kmem_cache_destroy(connlimit_rb_cachep);
 }
 
 module_init(connlimit_mt_init);
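
The xt_connlimit rework above replaces the single per-match spinlock with an array of keyed locks, one per group of hash buckets, so packets from unrelated source networks rarely serialize on the same lock. A minimal user-space sketch of that keyed-lock idea follows; the SLOTS/LOCK_SLOTS values and the bucket_lock() helper are illustrative assumptions, not the kernel's definitions (the kernel uses CONNLIMIT_SLOTS and CONNLIMIT_LOCK_SLOTS in xt_connlimit.c):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define SLOTS      256	/* hash buckets (one rb-tree root each); illustrative */
#define LOCK_SLOTS  32	/* fewer locks than buckets; must evenly divide SLOTS */

static pthread_mutex_t locks[LOCK_SLOTS];

/* Buckets whose index differs by a multiple of LOCK_SLOTS share a lock,
 * so two distinct source networks only occasionally contend. */
static pthread_mutex_t *bucket_lock(uint32_t hash)
{
	return &locks[(hash % SLOTS) % LOCK_SLOTS];
}

int main(void)
{
	uint32_t hash = 0x9e3779b9 % SLOTS;	/* pretend jhash() output */
	int i;

	for (i = 0; i < LOCK_SLOTS; i++)
		pthread_mutex_init(&locks[i], NULL);

	pthread_mutex_lock(bucket_lock(hash));
	/* ... count or insert in this bucket's rb-tree, as count_them()
	 * does under spin_lock_bh() in the patch ... */
	pthread_mutex_unlock(bucket_lock(hash));

	printf("bucket %u -> lock slot %u\n", hash, hash % LOCK_SLOTS);
	return 0;
}

The two BUILD_BUG_ONs in connlimit_mt_init() enforce the same divisibility constraint this sketch assumes. Within each bucket, the patch then keeps one rb-tree node per source network with an hlist of tracked connections hanging off it, so a lookup walks a short tree instead of one long shared list.
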
diff --git a/net/netfilter/xt_ipcomp.c b/net/netfilter/xt_ipcomp.c
index a4c7561698c5..89d53104c6b3 100644
--- a/net/netfilter/xt_ipcomp.c
+++ b/net/netfilter/xt_ipcomp.c
@@ -60,7 +60,7 @@ static bool comp_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	}
 
 	return spi_match(compinfo->spis[0], compinfo->spis[1],
-			 ntohl(chdr->cpi << 16),
+			 ntohs(chdr->cpi),
 			 !!(compinfo->invflags & XT_IPCOMP_INV_SPI));
 }
 
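
The xt_ipcomp hunk swaps ntohl(chdr->cpi << 16) for ntohs(chdr->cpi) when extracting the 16-bit IPComp CPI for spi_match(). On little-endian hosts the two expressions yield the same value, but shifting a __be16 mixes byte orders in exactly the way sparse warns about; ntohs() states the intent directly. A stand-alone sketch, assuming a plain uint16_t stand-in for the kernel's struct ip_comp_hdr field:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t cpi_be = htons(0x1234);	/* CPI as it sits on the wire */

	/* the type-clean form the patch uses: */
	uint16_t cpi = ntohs(cpi_be);
	printf("ntohs form: 0x%04x\n", cpi);	/* 0x1234 */

	/* the old expression: value-equivalent on little-endian hosts,
	 * but it shifts a big-endian quantity, which sparse flags */
	uint32_t old = ntohl((uint32_t)cpi_be << 16);
	printf("shift form: 0x%08x\n", old);
	return 0;
}
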