aboutsummaryrefslogtreecommitdiffstats
path: root/net/netfilter
diff options
context:
space:
mode:
authorPatrick McHardy <kaber@trash.net>2014-03-04 10:21:51 -0500
committerPablo Neira Ayuso <pablo@netfilter.org>2014-03-07 05:42:07 -0500
commitce6eb0d7c8ec16753f817054e2a566504327e274 (patch)
treef9cda6bea6427ec66867481232416649929c622a /net/netfilter
parent93bb0ceb75be2fdfa9fc0dd1fb522d9ada515d9c (diff)
netfilter: nft_hash: bug fixes and resizing
The hash set type is very broken and was never meant to be merged in this state. Missing RCU synchronization on element removal, leaking chain refcounts when used as a verdict map, races during lookups, a fixed table size are probably just some of the problems. Luckily it is currently never chosen by the kernel when the rbtree type is also available. Rewrite it to be usable. The new implementation supports automatic hash table resizing using RCU, based on Paul McKenney's and Josh Triplett's algorithm "Optimized Resizing For RCU-Protected Hash Tables" described in [1]. Resizing doesn't require a second list head in the elements, it works by chosing a hash function that remaps elements to a predictable set of buckets, only resizing by integral factors and - during expansion: linking new buckets to the old bucket that contains elements for any of the new buckets, thereby creating imprecise chains, then incrementally seperating the elements until the new buckets only contain elements that hash directly to them. - during shrinking: linking the hash chains of all old buckets that hash to the same new bucket to form a single chain. Expansion requires at most the number of elements in the longest hash chain grace periods, shrinking requires a single grace period. Due to the requirement of having hash chains/elements linked to multiple buckets during resizing, homemade single linked lists are used instead of the existing list helpers, that don't support this in a clean fashion. As a side effect, the amount of memory required per element is reduced by one pointer. Expansion is triggered when the load factors exceeds 75%, shrinking when the load factor goes below 30%. Both operations are allowed to fail and will be retried on the next insertion or removal if their respective conditions still hold. [1] http://dl.acm.org/citation.cfm?id=2002181.2002192 Reviewed-by: Josh Triplett <josh@joshtriplett.org> Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Diffstat (limited to 'net/netfilter')
-rw-r--r--net/netfilter/nft_hash.c260
1 files changed, 214 insertions, 46 deletions
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 3d3f8fce10a5..6a1acde16c60 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> 2 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 5 * it under the terms of the GNU General Public License version 2 as
@@ -18,17 +18,29 @@
18#include <linux/netfilter/nf_tables.h> 18#include <linux/netfilter/nf_tables.h>
19#include <net/netfilter/nf_tables.h> 19#include <net/netfilter/nf_tables.h>
20 20
21#define NFT_HASH_MIN_SIZE 4
22
21struct nft_hash { 23struct nft_hash {
22 struct hlist_head *hash; 24 struct nft_hash_table __rcu *tbl;
23 unsigned int hsize; 25};
26
27struct nft_hash_table {
28 unsigned int size;
29 unsigned int elements;
30 struct nft_hash_elem __rcu *buckets[];
24}; 31};
25 32
26struct nft_hash_elem { 33struct nft_hash_elem {
27 struct hlist_node hnode; 34 struct nft_hash_elem __rcu *next;
28 struct nft_data key; 35 struct nft_data key;
29 struct nft_data data[]; 36 struct nft_data data[];
30}; 37};
31 38
39#define nft_hash_for_each_entry(i, head) \
40 for (i = nft_dereference(head); i != NULL; i = nft_dereference(i->next))
41#define nft_hash_for_each_entry_rcu(i, head) \
42 for (i = rcu_dereference(head); i != NULL; i = rcu_dereference(i->next))
43
32static u32 nft_hash_rnd __read_mostly; 44static u32 nft_hash_rnd __read_mostly;
33static bool nft_hash_rnd_initted __read_mostly; 45static bool nft_hash_rnd_initted __read_mostly;
34 46
@@ -38,7 +50,7 @@ static unsigned int nft_hash_data(const struct nft_data *data,
38 unsigned int h; 50 unsigned int h;
39 51
40 h = jhash(data->data, len, nft_hash_rnd); 52 h = jhash(data->data, len, nft_hash_rnd);
41 return ((u64)h * hsize) >> 32; 53 return h & (hsize - 1);
42} 54}
43 55
44static bool nft_hash_lookup(const struct nft_set *set, 56static bool nft_hash_lookup(const struct nft_set *set,
@@ -46,11 +58,12 @@ static bool nft_hash_lookup(const struct nft_set *set,
46 struct nft_data *data) 58 struct nft_data *data)
47{ 59{
48 const struct nft_hash *priv = nft_set_priv(set); 60 const struct nft_hash *priv = nft_set_priv(set);
61 const struct nft_hash_table *tbl = rcu_dereference(priv->tbl);
49 const struct nft_hash_elem *he; 62 const struct nft_hash_elem *he;
50 unsigned int h; 63 unsigned int h;
51 64
52 h = nft_hash_data(key, priv->hsize, set->klen); 65 h = nft_hash_data(key, tbl->size, set->klen);
53 hlist_for_each_entry(he, &priv->hash[h], hnode) { 66 nft_hash_for_each_entry_rcu(he, tbl->buckets[h]) {
54 if (nft_data_cmp(&he->key, key, set->klen)) 67 if (nft_data_cmp(&he->key, key, set->klen))
55 continue; 68 continue;
56 if (set->flags & NFT_SET_MAP) 69 if (set->flags & NFT_SET_MAP)
@@ -60,19 +73,148 @@ static bool nft_hash_lookup(const struct nft_set *set,
60 return false; 73 return false;
61} 74}
62 75
63static void nft_hash_elem_destroy(const struct nft_set *set, 76static void nft_hash_tbl_free(const struct nft_hash_table *tbl)
64 struct nft_hash_elem *he)
65{ 77{
66 nft_data_uninit(&he->key, NFT_DATA_VALUE); 78 if (is_vmalloc_addr(tbl))
67 if (set->flags & NFT_SET_MAP) 79 vfree(tbl);
68 nft_data_uninit(he->data, set->dtype); 80 else
69 kfree(he); 81 kfree(tbl);
82}
83
84static struct nft_hash_table *nft_hash_tbl_alloc(unsigned int nbuckets)
85{
86 struct nft_hash_table *tbl;
87 size_t size;
88
89 size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
90 tbl = kzalloc(size, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN);
91 if (tbl == NULL)
92 tbl = vzalloc(size);
93 if (tbl == NULL)
94 return NULL;
95 tbl->size = nbuckets;
96
97 return tbl;
98}
99
100static void nft_hash_chain_unzip(const struct nft_set *set,
101 const struct nft_hash_table *ntbl,
102 struct nft_hash_table *tbl, unsigned int n)
103{
104 struct nft_hash_elem *he, *last, *next;
105 unsigned int h;
106
107 he = nft_dereference(tbl->buckets[n]);
108 if (he == NULL)
109 return;
110 h = nft_hash_data(&he->key, ntbl->size, set->klen);
111
112 /* Find last element of first chain hashing to bucket h */
113 last = he;
114 nft_hash_for_each_entry(he, he->next) {
115 if (nft_hash_data(&he->key, ntbl->size, set->klen) != h)
116 break;
117 last = he;
118 }
119
120 /* Unlink first chain from the old table */
121 RCU_INIT_POINTER(tbl->buckets[n], last->next);
122
123 /* If end of chain reached, done */
124 if (he == NULL)
125 return;
126
127 /* Find first element of second chain hashing to bucket h */
128 next = NULL;
129 nft_hash_for_each_entry(he, he->next) {
130 if (nft_hash_data(&he->key, ntbl->size, set->klen) != h)
131 continue;
132 next = he;
133 break;
134 }
135
136 /* Link the two chains */
137 RCU_INIT_POINTER(last->next, next);
138}
139
140static int nft_hash_tbl_expand(const struct nft_set *set, struct nft_hash *priv)
141{
142 struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl;
143 struct nft_hash_elem *he;
144 unsigned int i, h;
145 bool complete;
146
147 ntbl = nft_hash_tbl_alloc(tbl->size * 2);
148 if (ntbl == NULL)
149 return -ENOMEM;
150
151 /* Link new table's buckets to first element in the old table
152 * hashing to the new bucket.
153 */
154 for (i = 0; i < ntbl->size; i++) {
155 h = i < tbl->size ? i : i - tbl->size;
156 nft_hash_for_each_entry(he, tbl->buckets[h]) {
157 if (nft_hash_data(&he->key, ntbl->size, set->klen) != i)
158 continue;
159 RCU_INIT_POINTER(ntbl->buckets[i], he);
160 break;
161 }
162 }
163 ntbl->elements = tbl->elements;
164
165 /* Publish new table */
166 rcu_assign_pointer(priv->tbl, ntbl);
167
168 /* Unzip interleaved hash chains */
169 do {
170 /* Wait for readers to use new table/unzipped chains */
171 synchronize_rcu();
172
173 complete = true;
174 for (i = 0; i < tbl->size; i++) {
175 nft_hash_chain_unzip(set, ntbl, tbl, i);
176 if (tbl->buckets[i] != NULL)
177 complete = false;
178 }
179 } while (!complete);
180
181 nft_hash_tbl_free(tbl);
182 return 0;
183}
184
185static int nft_hash_tbl_shrink(const struct nft_set *set, struct nft_hash *priv)
186{
187 struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl;
188 struct nft_hash_elem __rcu **pprev;
189 unsigned int i;
190
191 ntbl = nft_hash_tbl_alloc(tbl->size / 2);
192 if (ntbl == NULL)
193 return -ENOMEM;
194
195 for (i = 0; i < ntbl->size; i++) {
196 ntbl->buckets[i] = tbl->buckets[i];
197
198 for (pprev = &ntbl->buckets[i]; *pprev != NULL;
199 pprev = &nft_dereference(*pprev)->next)
200 ;
201 RCU_INIT_POINTER(*pprev, tbl->buckets[i + ntbl->size]);
202 }
203 ntbl->elements = tbl->elements;
204
205 /* Publish new table */
206 rcu_assign_pointer(priv->tbl, ntbl);
207 synchronize_rcu();
208
209 nft_hash_tbl_free(tbl);
210 return 0;
70} 211}
71 212
72static int nft_hash_insert(const struct nft_set *set, 213static int nft_hash_insert(const struct nft_set *set,
73 const struct nft_set_elem *elem) 214 const struct nft_set_elem *elem)
74{ 215{
75 struct nft_hash *priv = nft_set_priv(set); 216 struct nft_hash *priv = nft_set_priv(set);
217 struct nft_hash_table *tbl = nft_dereference(priv->tbl);
76 struct nft_hash_elem *he; 218 struct nft_hash_elem *he;
77 unsigned int size, h; 219 unsigned int size, h;
78 220
@@ -91,33 +233,66 @@ static int nft_hash_insert(const struct nft_set *set,
91 if (set->flags & NFT_SET_MAP) 233 if (set->flags & NFT_SET_MAP)
92 nft_data_copy(he->data, &elem->data); 234 nft_data_copy(he->data, &elem->data);
93 235
94 h = nft_hash_data(&he->key, priv->hsize, set->klen); 236 h = nft_hash_data(&he->key, tbl->size, set->klen);
95 hlist_add_head_rcu(&he->hnode, &priv->hash[h]); 237 RCU_INIT_POINTER(he->next, tbl->buckets[h]);
238 rcu_assign_pointer(tbl->buckets[h], he);
239 tbl->elements++;
240
241 /* Expand table when exceeding 75% load */
242 if (tbl->elements > tbl->size / 4 * 3)
243 nft_hash_tbl_expand(set, priv);
244
96 return 0; 245 return 0;
97} 246}
98 247
248static void nft_hash_elem_destroy(const struct nft_set *set,
249 struct nft_hash_elem *he)
250{
251 nft_data_uninit(&he->key, NFT_DATA_VALUE);
252 if (set->flags & NFT_SET_MAP)
253 nft_data_uninit(he->data, set->dtype);
254 kfree(he);
255}
256
99static void nft_hash_remove(const struct nft_set *set, 257static void nft_hash_remove(const struct nft_set *set,
100 const struct nft_set_elem *elem) 258 const struct nft_set_elem *elem)
101{ 259{
102 struct nft_hash_elem *he = elem->cookie; 260 struct nft_hash *priv = nft_set_priv(set);
261 struct nft_hash_table *tbl = nft_dereference(priv->tbl);
262 struct nft_hash_elem *he, __rcu **pprev;
103 263
104 hlist_del_rcu(&he->hnode); 264 pprev = elem->cookie;
265 he = nft_dereference((*pprev));
266
267 RCU_INIT_POINTER(*pprev, he->next);
268 synchronize_rcu();
105 kfree(he); 269 kfree(he);
270 tbl->elements--;
271
272 /* Shrink table beneath 30% load */
273 if (tbl->elements < tbl->size * 3 / 10 &&
274 tbl->size > NFT_HASH_MIN_SIZE)
275 nft_hash_tbl_shrink(set, priv);
106} 276}
107 277
108static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem) 278static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem)
109{ 279{
110 const struct nft_hash *priv = nft_set_priv(set); 280 const struct nft_hash *priv = nft_set_priv(set);
281 const struct nft_hash_table *tbl = nft_dereference(priv->tbl);
282 struct nft_hash_elem __rcu * const *pprev;
111 struct nft_hash_elem *he; 283 struct nft_hash_elem *he;
112 unsigned int h; 284 unsigned int h;
113 285
114 h = nft_hash_data(&elem->key, priv->hsize, set->klen); 286 h = nft_hash_data(&elem->key, tbl->size, set->klen);
115 hlist_for_each_entry(he, &priv->hash[h], hnode) { 287 pprev = &tbl->buckets[h];
116 if (nft_data_cmp(&he->key, &elem->key, set->klen)) 288 nft_hash_for_each_entry(he, tbl->buckets[h]) {
289 if (nft_data_cmp(&he->key, &elem->key, set->klen)) {
290 pprev = &he->next;
117 continue; 291 continue;
292 }
118 293
119 elem->cookie = he; 294 elem->cookie = (void *)pprev;
120 elem->flags = 0; 295 elem->flags = 0;
121 if (set->flags & NFT_SET_MAP) 296 if (set->flags & NFT_SET_MAP)
122 nft_data_copy(&elem->data, he->data); 297 nft_data_copy(&elem->data, he->data);
123 return 0; 298 return 0;
@@ -129,12 +304,13 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
129 struct nft_set_iter *iter) 304 struct nft_set_iter *iter)
130{ 305{
131 const struct nft_hash *priv = nft_set_priv(set); 306 const struct nft_hash *priv = nft_set_priv(set);
307 const struct nft_hash_table *tbl = nft_dereference(priv->tbl);
132 const struct nft_hash_elem *he; 308 const struct nft_hash_elem *he;
133 struct nft_set_elem elem; 309 struct nft_set_elem elem;
134 unsigned int i; 310 unsigned int i;
135 311
136 for (i = 0; i < priv->hsize; i++) { 312 for (i = 0; i < tbl->size; i++) {
137 hlist_for_each_entry(he, &priv->hash[i], hnode) { 313 nft_hash_for_each_entry(he, tbl->buckets[i]) {
138 if (iter->count < iter->skip) 314 if (iter->count < iter->skip)
139 goto cont; 315 goto cont;
140 316
@@ -161,43 +337,35 @@ static int nft_hash_init(const struct nft_set *set,
161 const struct nlattr * const tb[]) 337 const struct nlattr * const tb[])
162{ 338{
163 struct nft_hash *priv = nft_set_priv(set); 339 struct nft_hash *priv = nft_set_priv(set);
164 unsigned int cnt, i; 340 struct nft_hash_table *tbl;
165 341
166 if (unlikely(!nft_hash_rnd_initted)) { 342 if (unlikely(!nft_hash_rnd_initted)) {
167 get_random_bytes(&nft_hash_rnd, 4); 343 get_random_bytes(&nft_hash_rnd, 4);
168 nft_hash_rnd_initted = true; 344 nft_hash_rnd_initted = true;
169 } 345 }
170 346
171 /* Aim for a load factor of 0.75 */ 347 tbl = nft_hash_tbl_alloc(NFT_HASH_MIN_SIZE);
172 // FIXME: temporarily broken until we have set descriptions 348 if (tbl == NULL)
173 cnt = 100;
174 cnt = cnt * 4 / 3;
175
176 priv->hash = kcalloc(cnt, sizeof(struct hlist_head), GFP_KERNEL);
177 if (priv->hash == NULL)
178 return -ENOMEM; 349 return -ENOMEM;
179 priv->hsize = cnt; 350 RCU_INIT_POINTER(priv->tbl, tbl);
180
181 for (i = 0; i < cnt; i++)
182 INIT_HLIST_HEAD(&priv->hash[i]);
183
184 return 0; 351 return 0;
185} 352}
186 353
187static void nft_hash_destroy(const struct nft_set *set) 354static void nft_hash_destroy(const struct nft_set *set)
188{ 355{
189 const struct nft_hash *priv = nft_set_priv(set); 356 const struct nft_hash *priv = nft_set_priv(set);
190 const struct hlist_node *next; 357 const struct nft_hash_table *tbl = nft_dereference(priv->tbl);
191 struct nft_hash_elem *elem; 358 struct nft_hash_elem *he, *next;
192 unsigned int i; 359 unsigned int i;
193 360
194 for (i = 0; i < priv->hsize; i++) { 361 for (i = 0; i < tbl->size; i++) {
195 hlist_for_each_entry_safe(elem, next, &priv->hash[i], hnode) { 362 for (he = nft_dereference(tbl->buckets[i]); he != NULL;
196 hlist_del(&elem->hnode); 363 he = next) {
197 nft_hash_elem_destroy(set, elem); 364 next = nft_dereference(he->next);
365 nft_hash_elem_destroy(set, he);
198 } 366 }
199 } 367 }
200 kfree(priv->hash); 368 kfree(tbl);
201} 369}
202 370
203static struct nft_set_ops nft_hash_ops __read_mostly = { 371static struct nft_set_ops nft_hash_ops __read_mostly = {