aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/fib_trie.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/fib_trie.c')
-rw-r--r--net/ipv4/fib_trie.c1960
1 files changed, 934 insertions, 1026 deletions
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 18bcaf2ff2fd..3daf0224ff2e 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -83,28 +83,33 @@
83 83
84#define MAX_STAT_DEPTH 32 84#define MAX_STAT_DEPTH 32
85 85
86#define KEYLENGTH (8*sizeof(t_key)) 86#define KEYLENGTH (8*sizeof(t_key))
87#define KEY_MAX ((t_key)~0)
87 88
88typedef unsigned int t_key; 89typedef unsigned int t_key;
89 90
90#define T_TNODE 0 91#define IS_TNODE(n) ((n)->bits)
91#define T_LEAF 1 92#define IS_LEAF(n) (!(n)->bits)
92#define NODE_TYPE_MASK 0x1UL
93#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
94 93
95#define IS_TNODE(n) (!(n->parent & T_LEAF)) 94#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos)
96#define IS_LEAF(n) (n->parent & T_LEAF)
97 95
98struct rt_trie_node { 96struct tnode {
99 unsigned long parent;
100 t_key key;
101};
102
103struct leaf {
104 unsigned long parent;
105 t_key key; 97 t_key key;
106 struct hlist_head list; 98 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
99 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
100 unsigned char slen;
101 struct tnode __rcu *parent;
107 struct rcu_head rcu; 102 struct rcu_head rcu;
103 union {
104 /* The fields in this struct are valid if bits > 0 (TNODE) */
105 struct {
106 t_key empty_children; /* KEYLENGTH bits needed */
107 t_key full_children; /* KEYLENGTH bits needed */
108 struct tnode __rcu *child[0];
109 };
110 /* This list pointer if valid if bits == 0 (LEAF) */
111 struct hlist_head list;
112 };
108}; 113};
109 114
110struct leaf_info { 115struct leaf_info {
@@ -115,20 +120,6 @@ struct leaf_info {
115 struct rcu_head rcu; 120 struct rcu_head rcu;
116}; 121};
117 122
118struct tnode {
119 unsigned long parent;
120 t_key key;
121 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
122 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
123 unsigned int full_children; /* KEYLENGTH bits needed */
124 unsigned int empty_children; /* KEYLENGTH bits needed */
125 union {
126 struct rcu_head rcu;
127 struct tnode *tnode_free;
128 };
129 struct rt_trie_node __rcu *child[0];
130};
131
132#ifdef CONFIG_IP_FIB_TRIE_STATS 123#ifdef CONFIG_IP_FIB_TRIE_STATS
133struct trie_use_stats { 124struct trie_use_stats {
134 unsigned int gets; 125 unsigned int gets;
@@ -151,19 +142,13 @@ struct trie_stat {
151}; 142};
152 143
153struct trie { 144struct trie {
154 struct rt_trie_node __rcu *trie; 145 struct tnode __rcu *trie;
155#ifdef CONFIG_IP_FIB_TRIE_STATS 146#ifdef CONFIG_IP_FIB_TRIE_STATS
156 struct trie_use_stats stats; 147 struct trie_use_stats __percpu *stats;
157#endif 148#endif
158}; 149};
159 150
160static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, 151static void resize(struct trie *t, struct tnode *tn);
161 int wasfull);
162static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
163static struct tnode *inflate(struct trie *t, struct tnode *tn);
164static struct tnode *halve(struct trie *t, struct tnode *tn);
165/* tnodes to free after resize(); protected by RTNL */
166static struct tnode *tnode_free_head;
167static size_t tnode_free_size; 152static size_t tnode_free_size;
168 153
169/* 154/*
@@ -176,170 +161,101 @@ static const int sync_pages = 128;
176static struct kmem_cache *fn_alias_kmem __read_mostly; 161static struct kmem_cache *fn_alias_kmem __read_mostly;
177static struct kmem_cache *trie_leaf_kmem __read_mostly; 162static struct kmem_cache *trie_leaf_kmem __read_mostly;
178 163
179/* 164/* caller must hold RTNL */
180 * caller must hold RTNL 165#define node_parent(n) rtnl_dereference((n)->parent)
181 */
182static inline struct tnode *node_parent(const struct rt_trie_node *node)
183{
184 unsigned long parent;
185
186 parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
187 166
188 return (struct tnode *)(parent & ~NODE_TYPE_MASK); 167/* caller must hold RCU read lock or RTNL */
189} 168#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
190 169
191/* 170/* wrapper for rcu_assign_pointer */
192 * caller must hold RCU read lock or RTNL 171static inline void node_set_parent(struct tnode *n, struct tnode *tp)
193 */
194static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
195{ 172{
196 unsigned long parent; 173 if (n)
197 174 rcu_assign_pointer(n->parent, tp);
198 parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
199 lockdep_rtnl_is_held());
200
201 return (struct tnode *)(parent & ~NODE_TYPE_MASK);
202} 175}
203 176
204/* Same as rcu_assign_pointer 177#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
205 * but that macro() assumes that value is a pointer. 178
179/* This provides us with the number of children in this node, in the case of a
180 * leaf this will return 0 meaning none of the children are accessible.
206 */ 181 */
207static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr) 182static inline unsigned long tnode_child_length(const struct tnode *tn)
208{ 183{
209 smp_wmb(); 184 return (1ul << tn->bits) & ~(1ul);
210 node->parent = (unsigned long)ptr | NODE_TYPE(node);
211} 185}
212 186
213/* 187/* caller must hold RTNL */
214 * caller must hold RTNL 188static inline struct tnode *tnode_get_child(const struct tnode *tn,
215 */ 189 unsigned long i)
216static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
217{ 190{
218 BUG_ON(i >= 1U << tn->bits);
219
220 return rtnl_dereference(tn->child[i]); 191 return rtnl_dereference(tn->child[i]);
221} 192}
222 193
223/* 194/* caller must hold RCU read lock or RTNL */
224 * caller must hold RCU read lock or RTNL 195static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn,
225 */ 196 unsigned long i)
226static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
227{ 197{
228 BUG_ON(i >= 1U << tn->bits);
229
230 return rcu_dereference_rtnl(tn->child[i]); 198 return rcu_dereference_rtnl(tn->child[i]);
231} 199}
232 200
233static inline int tnode_child_length(const struct tnode *tn) 201/* To understand this stuff, an understanding of keys and all their bits is
234{ 202 * necessary. Every node in the trie has a key associated with it, but not
235 return 1 << tn->bits; 203 * all of the bits in that key are significant.
236} 204 *
237 205 * Consider a node 'n' and its parent 'tp'.
238static inline t_key mask_pfx(t_key k, unsigned int l) 206 *
239{ 207 * If n is a leaf, every bit in its key is significant. Its presence is
240 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); 208 * necessitated by path compression, since during a tree traversal (when
241} 209 * searching for a leaf - unless we are doing an insertion) we will completely
242 210 * ignore all skipped bits we encounter. Thus we need to verify, at the end of
243static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits) 211 * a potentially successful search, that we have indeed been walking the
244{ 212 * correct key path.
245 if (offset < KEYLENGTH) 213 *
246 return ((t_key)(a << offset)) >> (KEYLENGTH - bits); 214 * Note that we can never "miss" the correct key in the tree if present by
247 else 215 * following the wrong path. Path compression ensures that segments of the key
248 return 0; 216 * that are the same for all keys with a given prefix are skipped, but the
249} 217 * skipped part *is* identical for each node in the subtrie below the skipped
250 218 * bit! trie_insert() in this implementation takes care of that.
251static inline int tkey_equals(t_key a, t_key b) 219 *
252{ 220 * if n is an internal node - a 'tnode' here, the various parts of its key
253 return a == b; 221 * have many different meanings.
254} 222 *
255 223 * Example:
256static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b) 224 * _________________________________________________________________
257{ 225 * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
258 if (bits == 0 || offset >= KEYLENGTH) 226 * -----------------------------------------------------------------
259 return 1; 227 * 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16
260 bits = bits > KEYLENGTH ? KEYLENGTH : bits; 228 *
261 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; 229 * _________________________________________________________________
262} 230 * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
263 231 * -----------------------------------------------------------------
264static inline int tkey_mismatch(t_key a, int offset, t_key b) 232 * 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
265{ 233 *
266 t_key diff = a ^ b; 234 * tp->pos = 22
267 int i = offset; 235 * tp->bits = 3
268 236 * n->pos = 13
269 if (!diff) 237 * n->bits = 4
270 return 0; 238 *
271 while ((diff << i) >> (KEYLENGTH-1) == 0) 239 * First, let's just ignore the bits that come before the parent tp, that is
272 i++; 240 * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
273 return i; 241 * point we do not use them for anything.
274} 242 *
275 243 * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
276/* 244 * index into the parent's child array. That is, they will be used to find
277 To understand this stuff, an understanding of keys and all their bits is 245 * 'n' among tp's children.
278 necessary. Every node in the trie has a key associated with it, but not 246 *
279 all of the bits in that key are significant. 247 * The bits from (n->pos + n->bits) to (tn->pos - 1) - "S" - are skipped bits
280 248 * for the node n.
281 Consider a node 'n' and its parent 'tp'. 249 *
282 250 * All the bits we have seen so far are significant to the node n. The rest
283 If n is a leaf, every bit in its key is significant. Its presence is 251 * of the bits are really not needed or indeed known in n->key.
284 necessitated by path compression, since during a tree traversal (when 252 *
285 searching for a leaf - unless we are doing an insertion) we will completely 253 * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
286 ignore all skipped bits we encounter. Thus we need to verify, at the end of 254 * n's child array, and will of course be different for each child.
287 a potentially successful search, that we have indeed been walking the 255 *
288 correct key path. 256 * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown
289 257 * at this point.
290 Note that we can never "miss" the correct key in the tree if present by 258 */
291 following the wrong path. Path compression ensures that segments of the key
292 that are the same for all keys with a given prefix are skipped, but the
293 skipped part *is* identical for each node in the subtrie below the skipped
294 bit! trie_insert() in this implementation takes care of that - note the
295 call to tkey_sub_equals() in trie_insert().
296
297 if n is an internal node - a 'tnode' here, the various parts of its key
298 have many different meanings.
299
300 Example:
301 _________________________________________________________________
302 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
303 -----------------------------------------------------------------
304 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
305
306 _________________________________________________________________
307 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
308 -----------------------------------------------------------------
309 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
310
311 tp->pos = 7
312 tp->bits = 3
313 n->pos = 15
314 n->bits = 4
315
316 First, let's just ignore the bits that come before the parent tp, that is
317 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
318 not use them for anything.
319
320 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
321 index into the parent's child array. That is, they will be used to find
322 'n' among tp's children.
323
324 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
325 for the node n.
326
327 All the bits we have seen so far are significant to the node n. The rest
328 of the bits are really not needed or indeed known in n->key.
329
330 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
331 n's child array, and will of course be different for each child.
332
333
334 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
335 at this point.
336
337*/
338
339static inline void check_tnode(const struct tnode *tn)
340{
341 WARN_ON(tn && tn->pos+tn->bits > 32);
342}
343 259
344static const int halve_threshold = 25; 260static const int halve_threshold = 25;
345static const int inflate_threshold = 50; 261static const int inflate_threshold = 50;
@@ -357,17 +273,23 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa)
357 call_rcu(&fa->rcu, __alias_free_mem); 273 call_rcu(&fa->rcu, __alias_free_mem);
358} 274}
359 275
360static void __leaf_free_rcu(struct rcu_head *head) 276#define TNODE_KMALLOC_MAX \
361{ 277 ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
362 struct leaf *l = container_of(head, struct leaf, rcu);
363 kmem_cache_free(trie_leaf_kmem, l);
364}
365 278
366static inline void free_leaf(struct leaf *l) 279static void __node_free_rcu(struct rcu_head *head)
367{ 280{
368 call_rcu(&l->rcu, __leaf_free_rcu); 281 struct tnode *n = container_of(head, struct tnode, rcu);
282
283 if (IS_LEAF(n))
284 kmem_cache_free(trie_leaf_kmem, n);
285 else if (n->bits <= TNODE_KMALLOC_MAX)
286 kfree(n);
287 else
288 vfree(n);
369} 289}
370 290
291#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
292
371static inline void free_leaf_info(struct leaf_info *leaf) 293static inline void free_leaf_info(struct leaf_info *leaf)
372{ 294{
373 kfree_rcu(leaf, rcu); 295 kfree_rcu(leaf, rcu);
@@ -381,56 +303,31 @@ static struct tnode *tnode_alloc(size_t size)
381 return vzalloc(size); 303 return vzalloc(size);
382} 304}
383 305
384static void __tnode_free_rcu(struct rcu_head *head) 306static inline void empty_child_inc(struct tnode *n)
385{
386 struct tnode *tn = container_of(head, struct tnode, rcu);
387 size_t size = sizeof(struct tnode) +
388 (sizeof(struct rt_trie_node *) << tn->bits);
389
390 if (size <= PAGE_SIZE)
391 kfree(tn);
392 else
393 vfree(tn);
394}
395
396static inline void tnode_free(struct tnode *tn)
397{
398 if (IS_LEAF(tn))
399 free_leaf((struct leaf *) tn);
400 else
401 call_rcu(&tn->rcu, __tnode_free_rcu);
402}
403
404static void tnode_free_safe(struct tnode *tn)
405{ 307{
406 BUG_ON(IS_LEAF(tn)); 308 ++n->empty_children ? : ++n->full_children;
407 tn->tnode_free = tnode_free_head;
408 tnode_free_head = tn;
409 tnode_free_size += sizeof(struct tnode) +
410 (sizeof(struct rt_trie_node *) << tn->bits);
411} 309}
412 310
413static void tnode_free_flush(void) 311static inline void empty_child_dec(struct tnode *n)
414{ 312{
415 struct tnode *tn; 313 n->empty_children-- ? : n->full_children--;
416
417 while ((tn = tnode_free_head)) {
418 tnode_free_head = tn->tnode_free;
419 tn->tnode_free = NULL;
420 tnode_free(tn);
421 }
422
423 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
424 tnode_free_size = 0;
425 synchronize_rcu();
426 }
427} 314}
428 315
429static struct leaf *leaf_new(void) 316static struct tnode *leaf_new(t_key key)
430{ 317{
431 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); 318 struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
432 if (l) { 319 if (l) {
433 l->parent = T_LEAF; 320 l->parent = NULL;
321 /* set key and pos to reflect full key value
322 * any trailing zeros in the key should be ignored
323 * as the nodes are searched
324 */
325 l->key = key;
326 l->slen = 0;
327 l->pos = 0;
328 /* set bits to 0 indicating we are not a tnode */
329 l->bits = 0;
330
434 INIT_HLIST_HEAD(&l->list); 331 INIT_HLIST_HEAD(&l->list);
435 } 332 }
436 return l; 333 return l;
@@ -449,462 +346,530 @@ static struct leaf_info *leaf_info_new(int plen)
449 346
450static struct tnode *tnode_new(t_key key, int pos, int bits) 347static struct tnode *tnode_new(t_key key, int pos, int bits)
451{ 348{
452 size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits); 349 size_t sz = offsetof(struct tnode, child[1ul << bits]);
453 struct tnode *tn = tnode_alloc(sz); 350 struct tnode *tn = tnode_alloc(sz);
351 unsigned int shift = pos + bits;
352
353 /* verify bits and pos their msb bits clear and values are valid */
354 BUG_ON(!bits || (shift > KEYLENGTH));
454 355
455 if (tn) { 356 if (tn) {
456 tn->parent = T_TNODE; 357 tn->parent = NULL;
358 tn->slen = pos;
457 tn->pos = pos; 359 tn->pos = pos;
458 tn->bits = bits; 360 tn->bits = bits;
459 tn->key = key; 361 tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
460 tn->full_children = 0; 362 if (bits == KEYLENGTH)
461 tn->empty_children = 1<<bits; 363 tn->full_children = 1;
364 else
365 tn->empty_children = 1ul << bits;
462 } 366 }
463 367
464 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), 368 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
465 sizeof(struct rt_trie_node *) << bits); 369 sizeof(struct tnode *) << bits);
466 return tn; 370 return tn;
467} 371}
468 372
469/* 373/* Check whether a tnode 'n' is "full", i.e. it is an internal node
470 * Check whether a tnode 'n' is "full", i.e. it is an internal node
471 * and no bits are skipped. See discussion in dyntree paper p. 6 374 * and no bits are skipped. See discussion in dyntree paper p. 6
472 */ 375 */
473 376static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
474static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
475{ 377{
476 if (n == NULL || IS_LEAF(n)) 378 return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
477 return 0;
478
479 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
480} 379}
481 380
482static inline void put_child(struct tnode *tn, int i, 381/* Add a child at position i overwriting the old value.
483 struct rt_trie_node *n) 382 * Update the value of full_children and empty_children.
484{ 383 */
485 tnode_put_child_reorg(tn, i, n, -1); 384static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
486}
487
488 /*
489 * Add a child at position i overwriting the old value.
490 * Update the value of full_children and empty_children.
491 */
492
493static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
494 int wasfull)
495{ 385{
496 struct rt_trie_node *chi = rtnl_dereference(tn->child[i]); 386 struct tnode *chi = tnode_get_child(tn, i);
497 int isfull; 387 int isfull, wasfull;
498 388
499 BUG_ON(i >= 1<<tn->bits); 389 BUG_ON(i >= tnode_child_length(tn));
500 390
501 /* update emptyChildren */ 391 /* update emptyChildren, overflow into fullChildren */
502 if (n == NULL && chi != NULL) 392 if (n == NULL && chi != NULL)
503 tn->empty_children++; 393 empty_child_inc(tn);
504 else if (n != NULL && chi == NULL) 394 if (n != NULL && chi == NULL)
505 tn->empty_children--; 395 empty_child_dec(tn);
506 396
507 /* update fullChildren */ 397 /* update fullChildren */
508 if (wasfull == -1) 398 wasfull = tnode_full(tn, chi);
509 wasfull = tnode_full(tn, chi);
510
511 isfull = tnode_full(tn, n); 399 isfull = tnode_full(tn, n);
400
512 if (wasfull && !isfull) 401 if (wasfull && !isfull)
513 tn->full_children--; 402 tn->full_children--;
514 else if (!wasfull && isfull) 403 else if (!wasfull && isfull)
515 tn->full_children++; 404 tn->full_children++;
516 405
517 if (n) 406 if (n && (tn->slen < n->slen))
518 node_set_parent(n, tn); 407 tn->slen = n->slen;
519 408
520 rcu_assign_pointer(tn->child[i], n); 409 rcu_assign_pointer(tn->child[i], n);
521} 410}
522 411
523#define MAX_WORK 10 412static void update_children(struct tnode *tn)
524static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
525{ 413{
526 int i; 414 unsigned long i;
527 struct tnode *old_tn;
528 int inflate_threshold_use;
529 int halve_threshold_use;
530 int max_work;
531 415
532 if (!tn) 416 /* update all of the child parent pointers */
533 return NULL; 417 for (i = tnode_child_length(tn); i;) {
418 struct tnode *inode = tnode_get_child(tn, --i);
534 419
535 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", 420 if (!inode)
536 tn, inflate_threshold, halve_threshold); 421 continue;
537 422
538 /* No children */ 423 /* Either update the children of a tnode that
539 if (tn->empty_children == tnode_child_length(tn)) { 424 * already belongs to us or update the child
540 tnode_free_safe(tn); 425 * to point to ourselves.
541 return NULL; 426 */
427 if (node_parent(inode) == tn)
428 update_children(inode);
429 else
430 node_set_parent(inode, tn);
542 } 431 }
543 /* One child */ 432}
544 if (tn->empty_children == tnode_child_length(tn) - 1)
545 goto one_child;
546 /*
547 * Double as long as the resulting node has a number of
548 * nonempty nodes that are above the threshold.
549 */
550
551 /*
552 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
553 * the Helsinki University of Technology and Matti Tikkanen of Nokia
554 * Telecommunications, page 6:
555 * "A node is doubled if the ratio of non-empty children to all
556 * children in the *doubled* node is at least 'high'."
557 *
558 * 'high' in this instance is the variable 'inflate_threshold'. It
559 * is expressed as a percentage, so we multiply it with
560 * tnode_child_length() and instead of multiplying by 2 (since the
561 * child array will be doubled by inflate()) and multiplying
562 * the left-hand side by 100 (to handle the percentage thing) we
563 * multiply the left-hand side by 50.
564 *
565 * The left-hand side may look a bit weird: tnode_child_length(tn)
566 * - tn->empty_children is of course the number of non-null children
567 * in the current node. tn->full_children is the number of "full"
568 * children, that is non-null tnodes with a skip value of 0.
569 * All of those will be doubled in the resulting inflated tnode, so
570 * we just count them one extra time here.
571 *
572 * A clearer way to write this would be:
573 *
574 * to_be_doubled = tn->full_children;
575 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
576 * tn->full_children;
577 *
578 * new_child_length = tnode_child_length(tn) * 2;
579 *
580 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
581 * new_child_length;
582 * if (new_fill_factor >= inflate_threshold)
583 *
584 * ...and so on, tho it would mess up the while () loop.
585 *
586 * anyway,
587 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
588 * inflate_threshold
589 *
590 * avoid a division:
591 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
592 * inflate_threshold * new_child_length
593 *
594 * expand not_to_be_doubled and to_be_doubled, and shorten:
595 * 100 * (tnode_child_length(tn) - tn->empty_children +
596 * tn->full_children) >= inflate_threshold * new_child_length
597 *
598 * expand new_child_length:
599 * 100 * (tnode_child_length(tn) - tn->empty_children +
600 * tn->full_children) >=
601 * inflate_threshold * tnode_child_length(tn) * 2
602 *
603 * shorten again:
604 * 50 * (tn->full_children + tnode_child_length(tn) -
605 * tn->empty_children) >= inflate_threshold *
606 * tnode_child_length(tn)
607 *
608 */
609 433
610 check_tnode(tn); 434static inline void put_child_root(struct tnode *tp, struct trie *t,
435 t_key key, struct tnode *n)
436{
437 if (tp)
438 put_child(tp, get_index(key, tp), n);
439 else
440 rcu_assign_pointer(t->trie, n);
441}
611 442
612 /* Keep root node larger */ 443static inline void tnode_free_init(struct tnode *tn)
444{
445 tn->rcu.next = NULL;
446}
613 447
614 if (!node_parent((struct rt_trie_node *)tn)) { 448static inline void tnode_free_append(struct tnode *tn, struct tnode *n)
615 inflate_threshold_use = inflate_threshold_root; 449{
616 halve_threshold_use = halve_threshold_root; 450 n->rcu.next = tn->rcu.next;
617 } else { 451 tn->rcu.next = &n->rcu;
618 inflate_threshold_use = inflate_threshold; 452}
619 halve_threshold_use = halve_threshold;
620 }
621 453
622 max_work = MAX_WORK; 454static void tnode_free(struct tnode *tn)
623 while ((tn->full_children > 0 && max_work-- && 455{
624 50 * (tn->full_children + tnode_child_length(tn) 456 struct callback_head *head = &tn->rcu;
625 - tn->empty_children)
626 >= inflate_threshold_use * tnode_child_length(tn))) {
627 457
628 old_tn = tn; 458 while (head) {
629 tn = inflate(t, tn); 459 head = head->next;
460 tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
461 node_free(tn);
630 462
631 if (IS_ERR(tn)) { 463 tn = container_of(head, struct tnode, rcu);
632 tn = old_tn;
633#ifdef CONFIG_IP_FIB_TRIE_STATS
634 t->stats.resize_node_skipped++;
635#endif
636 break;
637 }
638 } 464 }
639 465
640 check_tnode(tn); 466 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
641 467 tnode_free_size = 0;
642 /* Return if at least one inflate is run */ 468 synchronize_rcu();
643 if (max_work != MAX_WORK)
644 return (struct rt_trie_node *) tn;
645
646 /*
647 * Halve as long as the number of empty children in this
648 * node is above threshold.
649 */
650
651 max_work = MAX_WORK;
652 while (tn->bits > 1 && max_work-- &&
653 100 * (tnode_child_length(tn) - tn->empty_children) <
654 halve_threshold_use * tnode_child_length(tn)) {
655
656 old_tn = tn;
657 tn = halve(t, tn);
658 if (IS_ERR(tn)) {
659 tn = old_tn;
660#ifdef CONFIG_IP_FIB_TRIE_STATS
661 t->stats.resize_node_skipped++;
662#endif
663 break;
664 }
665 } 469 }
470}
666 471
472static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn)
473{
474 struct tnode *tp = node_parent(oldtnode);
475 unsigned long i;
667 476
668 /* Only one child remains */ 477 /* setup the parent pointer out of and back into this node */
669 if (tn->empty_children == tnode_child_length(tn) - 1) { 478 NODE_INIT_PARENT(tn, tp);
670one_child: 479 put_child_root(tp, t, tn->key, tn);
671 for (i = 0; i < tnode_child_length(tn); i++) {
672 struct rt_trie_node *n;
673
674 n = rtnl_dereference(tn->child[i]);
675 if (!n)
676 continue;
677
678 /* compress one level */
679 480
680 node_set_parent(n, NULL); 481 /* update all of the child parent pointers */
681 tnode_free_safe(tn); 482 update_children(tn);
682 return n;
683 }
684 }
685 return (struct rt_trie_node *) tn;
686}
687 483
484 /* all pointers should be clean so we are done */
485 tnode_free(oldtnode);
688 486
689static void tnode_clean_free(struct tnode *tn) 487 /* resize children now that oldtnode is freed */
690{ 488 for (i = tnode_child_length(tn); i;) {
691 int i; 489 struct tnode *inode = tnode_get_child(tn, --i);
692 struct tnode *tofree;
693 490
694 for (i = 0; i < tnode_child_length(tn); i++) { 491 /* resize child node */
695 tofree = (struct tnode *)rtnl_dereference(tn->child[i]); 492 if (tnode_full(tn, inode))
696 if (tofree) 493 resize(t, inode);
697 tnode_free(tofree);
698 } 494 }
699 tnode_free(tn);
700} 495}
701 496
702static struct tnode *inflate(struct trie *t, struct tnode *tn) 497static int inflate(struct trie *t, struct tnode *oldtnode)
703{ 498{
704 struct tnode *oldtnode = tn; 499 struct tnode *tn;
705 int olen = tnode_child_length(tn); 500 unsigned long i;
706 int i; 501 t_key m;
707 502
708 pr_debug("In inflate\n"); 503 pr_debug("In inflate\n");
709 504
710 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); 505 tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
711
712 if (!tn) 506 if (!tn)
713 return ERR_PTR(-ENOMEM); 507 return -ENOMEM;
714
715 /*
716 * Preallocate and store tnodes before the actual work so we
717 * don't get into an inconsistent state if memory allocation
718 * fails. In case of failure we return the oldnode and inflate
719 * of tnode is ignored.
720 */
721
722 for (i = 0; i < olen; i++) {
723 struct tnode *inode;
724
725 inode = (struct tnode *) tnode_get_child(oldtnode, i);
726 if (inode &&
727 IS_TNODE(inode) &&
728 inode->pos == oldtnode->pos + oldtnode->bits &&
729 inode->bits > 1) {
730 struct tnode *left, *right;
731 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
732
733 left = tnode_new(inode->key&(~m), inode->pos + 1,
734 inode->bits - 1);
735 if (!left)
736 goto nomem;
737
738 right = tnode_new(inode->key|m, inode->pos + 1,
739 inode->bits - 1);
740
741 if (!right) {
742 tnode_free(left);
743 goto nomem;
744 }
745 508
746 put_child(tn, 2*i, (struct rt_trie_node *) left); 509 /* prepare oldtnode to be freed */
747 put_child(tn, 2*i+1, (struct rt_trie_node *) right); 510 tnode_free_init(oldtnode);
748 }
749 }
750 511
751 for (i = 0; i < olen; i++) { 512 /* Assemble all of the pointers in our cluster, in this case that
752 struct tnode *inode; 513 * represents all of the pointers out of our allocated nodes that
753 struct rt_trie_node *node = tnode_get_child(oldtnode, i); 514 * point to existing tnodes and the links between our allocated
754 struct tnode *left, *right; 515 * nodes.
755 int size, j; 516 */
517 for (i = tnode_child_length(oldtnode), m = 1u << tn->pos; i;) {
518 struct tnode *inode = tnode_get_child(oldtnode, --i);
519 struct tnode *node0, *node1;
520 unsigned long j, k;
756 521
757 /* An empty child */ 522 /* An empty child */
758 if (node == NULL) 523 if (inode == NULL)
759 continue; 524 continue;
760 525
761 /* A leaf or an internal node with skipped bits */ 526 /* A leaf or an internal node with skipped bits */
762 527 if (!tnode_full(oldtnode, inode)) {
763 if (IS_LEAF(node) || ((struct tnode *) node)->pos > 528 put_child(tn, get_index(inode->key, tn), inode);
764 tn->pos + tn->bits - 1) {
765 put_child(tn,
766 tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
767 node);
768 continue; 529 continue;
769 } 530 }
770 531
771 /* An internal node with two children */ 532 /* drop the node in the old tnode free list */
772 inode = (struct tnode *) node; 533 tnode_free_append(oldtnode, inode);
773 534
535 /* An internal node with two children */
774 if (inode->bits == 1) { 536 if (inode->bits == 1) {
775 put_child(tn, 2*i, rtnl_dereference(inode->child[0])); 537 put_child(tn, 2 * i + 1, tnode_get_child(inode, 1));
776 put_child(tn, 2*i+1, rtnl_dereference(inode->child[1])); 538 put_child(tn, 2 * i, tnode_get_child(inode, 0));
777
778 tnode_free_safe(inode);
779 continue; 539 continue;
780 } 540 }
781 541
782 /* An internal node with more than two children */
783
784 /* We will replace this node 'inode' with two new 542 /* We will replace this node 'inode' with two new
785 * ones, 'left' and 'right', each with half of the 543 * ones, 'node0' and 'node1', each with half of the
786 * original children. The two new nodes will have 544 * original children. The two new nodes will have
787 * a position one bit further down the key and this 545 * a position one bit further down the key and this
788 * means that the "significant" part of their keys 546 * means that the "significant" part of their keys
789 * (see the discussion near the top of this file) 547 * (see the discussion near the top of this file)
790 * will differ by one bit, which will be "0" in 548 * will differ by one bit, which will be "0" in
791 * left's key and "1" in right's key. Since we are 549 * node0's key and "1" in node1's key. Since we are
792 * moving the key position by one step, the bit that 550 * moving the key position by one step, the bit that
793 * we are moving away from - the bit at position 551 * we are moving away from - the bit at position
794 * (inode->pos) - is the one that will differ between 552 * (tn->pos) - is the one that will differ between
795 * left and right. So... we synthesize that bit in the 553 * node0 and node1. So... we synthesize that bit in the
796 * two new keys. 554 * two new keys.
797 * The mask 'm' below will be a single "one" bit at
798 * the position (inode->pos)
799 */ 555 */
556 node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
557 if (!node1)
558 goto nomem;
559 node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);
560
561 tnode_free_append(tn, node1);
562 if (!node0)
563 goto nomem;
564 tnode_free_append(tn, node0);
565
566 /* populate child pointers in new nodes */
567 for (k = tnode_child_length(inode), j = k / 2; j;) {
568 put_child(node1, --j, tnode_get_child(inode, --k));
569 put_child(node0, j, tnode_get_child(inode, j));
570 put_child(node1, --j, tnode_get_child(inode, --k));
571 put_child(node0, j, tnode_get_child(inode, j));
572 }
800 573
801 /* Use the old key, but set the new significant 574 /* link new nodes to parent */
802 * bit to zero. 575 NODE_INIT_PARENT(node1, tn);
803 */ 576 NODE_INIT_PARENT(node0, tn);
577
578 /* link parent to nodes */
579 put_child(tn, 2 * i + 1, node1);
580 put_child(tn, 2 * i, node0);
581 }
582
583 /* setup the parent pointers into and out of this node */
584 replace(t, oldtnode, tn);
585
586 return 0;
587nomem:
588 /* all pointers should be clean so we are done */
589 tnode_free(tn);
590 return -ENOMEM;
591}
592
593static int halve(struct trie *t, struct tnode *oldtnode)
594{
595 struct tnode *tn;
596 unsigned long i;
597
598 pr_debug("In halve\n");
804 599
805 left = (struct tnode *) tnode_get_child(tn, 2*i); 600 tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
806 put_child(tn, 2*i, NULL); 601 if (!tn)
602 return -ENOMEM;
807 603
808 BUG_ON(!left); 604 /* prepare oldtnode to be freed */
605 tnode_free_init(oldtnode);
809 606
810 right = (struct tnode *) tnode_get_child(tn, 2*i+1); 607 /* Assemble all of the pointers in our cluster, in this case that
811 put_child(tn, 2*i+1, NULL); 608 * represents all of the pointers out of our allocated nodes that
609 * point to existing tnodes and the links between our allocated
610 * nodes.
611 */
612 for (i = tnode_child_length(oldtnode); i;) {
613 struct tnode *node1 = tnode_get_child(oldtnode, --i);
614 struct tnode *node0 = tnode_get_child(oldtnode, --i);
615 struct tnode *inode;
812 616
813 BUG_ON(!right); 617 /* At least one of the children is empty */
618 if (!node1 || !node0) {
619 put_child(tn, i / 2, node1 ? : node0);
620 continue;
621 }
814 622
815 size = tnode_child_length(left); 623 /* Two nonempty children */
816 for (j = 0; j < size; j++) { 624 inode = tnode_new(node0->key, oldtnode->pos, 1);
817 put_child(left, j, rtnl_dereference(inode->child[j])); 625 if (!inode) {
818 put_child(right, j, rtnl_dereference(inode->child[j + size])); 626 tnode_free(tn);
627 return -ENOMEM;
819 } 628 }
820 put_child(tn, 2*i, resize(t, left)); 629 tnode_free_append(tn, inode);
821 put_child(tn, 2*i+1, resize(t, right)); 630
631 /* initialize pointers out of node */
632 put_child(inode, 1, node1);
633 put_child(inode, 0, node0);
634 NODE_INIT_PARENT(inode, tn);
822 635
823 tnode_free_safe(inode); 636 /* link parent to node */
637 put_child(tn, i / 2, inode);
824 } 638 }
825 tnode_free_safe(oldtnode); 639
826 return tn; 640 /* setup the parent pointers into and out of this node */
827nomem: 641 replace(t, oldtnode, tn);
828 tnode_clean_free(tn); 642
829 return ERR_PTR(-ENOMEM); 643 return 0;
830} 644}
831 645
832static struct tnode *halve(struct trie *t, struct tnode *tn) 646static void collapse(struct trie *t, struct tnode *oldtnode)
833{ 647{
834 struct tnode *oldtnode = tn; 648 struct tnode *n, *tp;
835 struct rt_trie_node *left, *right; 649 unsigned long i;
836 int i;
837 int olen = tnode_child_length(tn);
838 650
839 pr_debug("In halve\n"); 651 /* scan the tnode looking for that one child that might still exist */
652 for (n = NULL, i = tnode_child_length(oldtnode); !n && i;)
653 n = tnode_get_child(oldtnode, --i);
840 654
841 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); 655 /* compress one level */
656 tp = node_parent(oldtnode);
657 put_child_root(tp, t, oldtnode->key, n);
658 node_set_parent(n, tp);
842 659
843 if (!tn) 660 /* drop dead node */
844 return ERR_PTR(-ENOMEM); 661 node_free(oldtnode);
662}
845 663
846 /* 664static unsigned char update_suffix(struct tnode *tn)
847 * Preallocate and store tnodes before the actual work so we 665{
848 * don't get into an inconsistent state if memory allocation 666 unsigned char slen = tn->pos;
849 * fails. In case of failure we return the oldnode and halve 667 unsigned long stride, i;
850 * of tnode is ignored. 668
669 /* search though the list of children looking for nodes that might
670 * have a suffix greater than the one we currently have. This is
671 * why we start with a stride of 2 since a stride of 1 would
672 * represent the nodes with suffix length equal to tn->pos
851 */ 673 */
674 for (i = 0, stride = 0x2ul ; i < tnode_child_length(tn); i += stride) {
675 struct tnode *n = tnode_get_child(tn, i);
852 676
853 for (i = 0; i < olen; i += 2) { 677 if (!n || (n->slen <= slen))
854 left = tnode_get_child(oldtnode, i); 678 continue;
855 right = tnode_get_child(oldtnode, i+1);
856 679
857 /* Two nonempty children */ 680 /* update stride and slen based on new value */
858 if (left && right) { 681 stride <<= (n->slen - slen);
859 struct tnode *newn; 682 slen = n->slen;
683 i &= ~(stride - 1);
860 684
861 newn = tnode_new(left->key, tn->pos + tn->bits, 1); 685 /* if slen covers all but the last bit we can stop here
686 * there will be nothing longer than that since only node
687 * 0 and 1 << (bits - 1) could have that as their suffix
688 * length.
689 */
690 if ((slen + 1) >= (tn->pos + tn->bits))
691 break;
692 }
862 693
863 if (!newn) 694 tn->slen = slen;
864 goto nomem;
865 695
866 put_child(tn, i/2, (struct rt_trie_node *)newn); 696 return slen;
867 } 697}
868 698
869 } 699/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
700 * the Helsinki University of Technology and Matti Tikkanen of Nokia
701 * Telecommunications, page 6:
702 * "A node is doubled if the ratio of non-empty children to all
703 * children in the *doubled* node is at least 'high'."
704 *
705 * 'high' in this instance is the variable 'inflate_threshold'. It
706 * is expressed as a percentage, so we multiply it with
707 * tnode_child_length() and instead of multiplying by 2 (since the
708 * child array will be doubled by inflate()) and multiplying
709 * the left-hand side by 100 (to handle the percentage thing) we
710 * multiply the left-hand side by 50.
711 *
712 * The left-hand side may look a bit weird: tnode_child_length(tn)
713 * - tn->empty_children is of course the number of non-null children
714 * in the current node. tn->full_children is the number of "full"
715 * children, that is non-null tnodes with a skip value of 0.
716 * All of those will be doubled in the resulting inflated tnode, so
717 * we just count them one extra time here.
718 *
719 * A clearer way to write this would be:
720 *
721 * to_be_doubled = tn->full_children;
722 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
723 * tn->full_children;
724 *
725 * new_child_length = tnode_child_length(tn) * 2;
726 *
727 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
728 * new_child_length;
729 * if (new_fill_factor >= inflate_threshold)
730 *
731 * ...and so on, tho it would mess up the while () loop.
732 *
733 * anyway,
734 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
735 * inflate_threshold
736 *
737 * avoid a division:
738 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
739 * inflate_threshold * new_child_length
740 *
741 * expand not_to_be_doubled and to_be_doubled, and shorten:
742 * 100 * (tnode_child_length(tn) - tn->empty_children +
743 * tn->full_children) >= inflate_threshold * new_child_length
744 *
745 * expand new_child_length:
746 * 100 * (tnode_child_length(tn) - tn->empty_children +
747 * tn->full_children) >=
748 * inflate_threshold * tnode_child_length(tn) * 2
749 *
750 * shorten again:
751 * 50 * (tn->full_children + tnode_child_length(tn) -
752 * tn->empty_children) >= inflate_threshold *
753 * tnode_child_length(tn)
754 *
755 */
756static bool should_inflate(const struct tnode *tp, const struct tnode *tn)
757{
758 unsigned long used = tnode_child_length(tn);
759 unsigned long threshold = used;
870 760
871 for (i = 0; i < olen; i += 2) { 761 /* Keep root node larger */
872 struct tnode *newBinNode; 762 threshold *= tp ? inflate_threshold : inflate_threshold_root;
763 used -= tn->empty_children;
764 used += tn->full_children;
873 765
874 left = tnode_get_child(oldtnode, i); 766 /* if bits == KEYLENGTH then pos = 0, and will fail below */
875 right = tnode_get_child(oldtnode, i+1);
876 767
877 /* At least one of the children is empty */ 768 return (used > 1) && tn->pos && ((50 * used) >= threshold);
878 if (left == NULL) { 769}
879 if (right == NULL) /* Both are empty */ 770
880 continue; 771static bool should_halve(const struct tnode *tp, const struct tnode *tn)
881 put_child(tn, i/2, right); 772{
882 continue; 773 unsigned long used = tnode_child_length(tn);
774 unsigned long threshold = used;
775
776 /* Keep root node larger */
777 threshold *= tp ? halve_threshold : halve_threshold_root;
778 used -= tn->empty_children;
779
780 /* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
781
782 return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
783}
784
785static bool should_collapse(const struct tnode *tn)
786{
787 unsigned long used = tnode_child_length(tn);
788
789 used -= tn->empty_children;
790
791 /* account for bits == KEYLENGTH case */
792 if ((tn->bits == KEYLENGTH) && tn->full_children)
793 used -= KEY_MAX;
794
795 /* One child or none, time to drop us from the trie */
796 return used < 2;
797}
798
799#define MAX_WORK 10
800static void resize(struct trie *t, struct tnode *tn)
801{
802 struct tnode *tp = node_parent(tn);
803 struct tnode __rcu **cptr;
804 int max_work = MAX_WORK;
805
806 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
807 tn, inflate_threshold, halve_threshold);
808
809 /* track the tnode via the pointer from the parent instead of
810 * doing it ourselves. This way we can let RCU fully do its
811 * thing without us interfering
812 */
813 cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie;
814 BUG_ON(tn != rtnl_dereference(*cptr));
815
816 /* Double as long as the resulting node has a number of
817 * nonempty nodes that are above the threshold.
818 */
819 while (should_inflate(tp, tn) && max_work) {
820 if (inflate(t, tn)) {
821#ifdef CONFIG_IP_FIB_TRIE_STATS
822 this_cpu_inc(t->stats->resize_node_skipped);
823#endif
824 break;
883 } 825 }
884 826
885 if (right == NULL) { 827 max_work--;
886 put_child(tn, i/2, left); 828 tn = rtnl_dereference(*cptr);
887 continue; 829 }
830
831 /* Return if at least one inflate is run */
832 if (max_work != MAX_WORK)
833 return;
834
835 /* Halve as long as the number of empty children in this
836 * node is above threshold.
837 */
838 while (should_halve(tp, tn) && max_work) {
839 if (halve(t, tn)) {
840#ifdef CONFIG_IP_FIB_TRIE_STATS
841 this_cpu_inc(t->stats->resize_node_skipped);
842#endif
843 break;
888 } 844 }
889 845
890 /* Two nonempty children */ 846 max_work--;
891 newBinNode = (struct tnode *) tnode_get_child(tn, i/2); 847 tn = rtnl_dereference(*cptr);
892 put_child(tn, i/2, NULL); 848 }
893 put_child(newBinNode, 0, left); 849
894 put_child(newBinNode, 1, right); 850 /* Only one child remains */
895 put_child(tn, i/2, resize(t, newBinNode)); 851 if (should_collapse(tn)) {
852 collapse(t, tn);
853 return;
854 }
855
856 /* Return if at least one deflate was run */
857 if (max_work != MAX_WORK)
858 return;
859
860 /* push the suffix length to the parent node */
861 if (tn->slen > tn->pos) {
862 unsigned char slen = update_suffix(tn);
863
864 if (tp && (slen > tp->slen))
865 tp->slen = slen;
896 } 866 }
897 tnode_free_safe(oldtnode);
898 return tn;
899nomem:
900 tnode_clean_free(tn);
901 return ERR_PTR(-ENOMEM);
902} 867}
903 868
904/* readside must use rcu_read_lock currently dump routines 869/* readside must use rcu_read_lock currently dump routines
905 via get_fa_head and dump */ 870 via get_fa_head and dump */
906 871
907static struct leaf_info *find_leaf_info(struct leaf *l, int plen) 872static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
908{ 873{
909 struct hlist_head *head = &l->list; 874 struct hlist_head *head = &l->list;
910 struct leaf_info *li; 875 struct leaf_info *li;
@@ -916,7 +881,7 @@ static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
916 return NULL; 881 return NULL;
917} 882}
918 883
919static inline struct list_head *get_fa_head(struct leaf *l, int plen) 884static inline struct list_head *get_fa_head(struct tnode *l, int plen)
920{ 885{
921 struct leaf_info *li = find_leaf_info(l, plen); 886 struct leaf_info *li = find_leaf_info(l, plen);
922 887
@@ -926,8 +891,51 @@ static inline struct list_head *get_fa_head(struct leaf *l, int plen)
926 return &li->falh; 891 return &li->falh;
927} 892}
928 893
929static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) 894static void leaf_pull_suffix(struct tnode *l)
895{
896 struct tnode *tp = node_parent(l);
897
898 while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) {
899 if (update_suffix(tp) > l->slen)
900 break;
901 tp = node_parent(tp);
902 }
903}
904
905static void leaf_push_suffix(struct tnode *l)
906{
907 struct tnode *tn = node_parent(l);
908
909 /* if this is a new leaf then tn will be NULL and we can sort
910 * out parent suffix lengths as a part of trie_rebalance
911 */
912 while (tn && (tn->slen < l->slen)) {
913 tn->slen = l->slen;
914 tn = node_parent(tn);
915 }
916}
917
918static void remove_leaf_info(struct tnode *l, struct leaf_info *old)
930{ 919{
920 /* record the location of the previous list_info entry */
921 struct hlist_node **pprev = old->hlist.pprev;
922 struct leaf_info *li = hlist_entry(pprev, typeof(*li), hlist.next);
923
924 /* remove the leaf info from the list */
925 hlist_del_rcu(&old->hlist);
926
927 /* only access li if it is pointing at the last valid hlist_node */
928 if (hlist_empty(&l->list) || (*pprev))
929 return;
930
931 /* update the trie with the latest suffix length */
932 l->slen = KEYLENGTH - li->plen;
933 leaf_pull_suffix(l);
934}
935
936static void insert_leaf_info(struct tnode *l, struct leaf_info *new)
937{
938 struct hlist_head *head = &l->list;
931 struct leaf_info *li = NULL, *last = NULL; 939 struct leaf_info *li = NULL, *last = NULL;
932 940
933 if (hlist_empty(head)) { 941 if (hlist_empty(head)) {
@@ -944,218 +952,174 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
944 else 952 else
945 hlist_add_before_rcu(&new->hlist, &li->hlist); 953 hlist_add_before_rcu(&new->hlist, &li->hlist);
946 } 954 }
955
956 /* if we added to the tail node then we need to update slen */
957 if (l->slen < (KEYLENGTH - new->plen)) {
958 l->slen = KEYLENGTH - new->plen;
959 leaf_push_suffix(l);
960 }
947} 961}
948 962
949/* rcu_read_lock needs to be hold by caller from readside */ 963/* rcu_read_lock needs to be hold by caller from readside */
964static struct tnode *fib_find_node(struct trie *t, u32 key)
965{
966 struct tnode *n = rcu_dereference_rtnl(t->trie);
967
968 while (n) {
969 unsigned long index = get_index(key, n);
970
971 /* This bit of code is a bit tricky but it combines multiple
972 * checks into a single check. The prefix consists of the
973 * prefix plus zeros for the bits in the cindex. The index
974 * is the difference between the key and this value. From
975 * this we can actually derive several pieces of data.
976 * if (index & (~0ul << bits))
977 * we have a mismatch in skip bits and failed
978 * else
979 * we know the value is cindex
980 */
981 if (index & (~0ul << n->bits))
982 return NULL;
950 983
951static struct leaf * 984 /* we have found a leaf. Prefixes have already been compared */
952fib_find_node(struct trie *t, u32 key) 985 if (IS_LEAF(n))
953{ 986 break;
954 int pos;
955 struct tnode *tn;
956 struct rt_trie_node *n;
957 987
958 pos = 0; 988 n = tnode_get_child_rcu(n, index);
959 n = rcu_dereference_rtnl(t->trie); 989 }
960 990
961 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 991 return n;
962 tn = (struct tnode *) n; 992}
963 993
964 check_tnode(tn); 994/* Return the first fib alias matching TOS with
995 * priority less than or equal to PRIO.
996 */
997static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
998{
999 struct fib_alias *fa;
965 1000
966 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { 1001 if (!fah)
967 pos = tn->pos + tn->bits; 1002 return NULL;
968 n = tnode_get_child_rcu(tn,
969 tkey_extract_bits(key,
970 tn->pos,
971 tn->bits));
972 } else
973 break;
974 }
975 /* Case we have found a leaf. Compare prefixes */
976 1003
977 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) 1004 list_for_each_entry(fa, fah, fa_list) {
978 return (struct leaf *)n; 1005 if (fa->fa_tos > tos)
1006 continue;
1007 if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
1008 return fa;
1009 }
979 1010
980 return NULL; 1011 return NULL;
981} 1012}
982 1013
983static void trie_rebalance(struct trie *t, struct tnode *tn) 1014static void trie_rebalance(struct trie *t, struct tnode *tn)
984{ 1015{
985 int wasfull;
986 t_key cindex, key;
987 struct tnode *tp; 1016 struct tnode *tp;
988 1017
989 key = tn->key; 1018 while ((tp = node_parent(tn)) != NULL) {
990 1019 resize(t, tn);
991 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
992 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
993 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
994 tn = (struct tnode *)resize(t, tn);
995
996 tnode_put_child_reorg(tp, cindex,
997 (struct rt_trie_node *)tn, wasfull);
998
999 tp = node_parent((struct rt_trie_node *) tn);
1000 if (!tp)
1001 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1002
1003 tnode_free_flush();
1004 if (!tp)
1005 break;
1006 tn = tp; 1020 tn = tp;
1007 } 1021 }
1008 1022
1009 /* Handle last (top) tnode */ 1023 /* Handle last (top) tnode */
1010 if (IS_TNODE(tn)) 1024 if (IS_TNODE(tn))
1011 tn = (struct tnode *)resize(t, tn); 1025 resize(t, tn);
1012
1013 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1014 tnode_free_flush();
1015} 1026}
1016 1027
1017/* only used from updater-side */ 1028/* only used from updater-side */
1018 1029
1019static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) 1030static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1020{ 1031{
1021 int pos, newpos;
1022 struct tnode *tp = NULL, *tn = NULL;
1023 struct rt_trie_node *n;
1024 struct leaf *l;
1025 int missbit;
1026 struct list_head *fa_head = NULL; 1032 struct list_head *fa_head = NULL;
1033 struct tnode *l, *n, *tp = NULL;
1027 struct leaf_info *li; 1034 struct leaf_info *li;
1028 t_key cindex;
1029 1035
1030 pos = 0; 1036 li = leaf_info_new(plen);
1037 if (!li)
1038 return NULL;
1039 fa_head = &li->falh;
1040
1031 n = rtnl_dereference(t->trie); 1041 n = rtnl_dereference(t->trie);
1032 1042
1033 /* If we point to NULL, stop. Either the tree is empty and we should 1043 /* If we point to NULL, stop. Either the tree is empty and we should
1034 * just put a new leaf in if, or we have reached an empty child slot, 1044 * just put a new leaf in if, or we have reached an empty child slot,
1035 * and we should just put our new leaf in that. 1045 * and we should just put our new leaf in that.
1036 * If we point to a T_TNODE, check if it matches our key. Note that
1037 * a T_TNODE might be skipping any number of bits - its 'pos' need
1038 * not be the parent's 'pos'+'bits'!
1039 *
1040 * If it does match the current key, get pos/bits from it, extract
1041 * the index from our key, push the T_TNODE and walk the tree.
1042 *
1043 * If it doesn't, we have to replace it with a new T_TNODE.
1044 * 1046 *
1045 * If we point to a T_LEAF, it might or might not have the same key 1047 * If we hit a node with a key that does't match then we should stop
1046 * as we do. If it does, just change the value, update the T_LEAF's 1048 * and create a new tnode to replace that node and insert ourselves
1047 * value, and return it. 1049 * and the other node into the new tnode.
1048 * If it doesn't, we need to replace it with a T_TNODE.
1049 */ 1050 */
1050 1051 while (n) {
1051 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 1052 unsigned long index = get_index(key, n);
1052 tn = (struct tnode *) n; 1053
1053 1054 /* This bit of code is a bit tricky but it combines multiple
1054 check_tnode(tn); 1055 * checks into a single check. The prefix consists of the
1055 1056 * prefix plus zeros for the "bits" in the prefix. The index
1056 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { 1057 * is the difference between the key and this value. From
1057 tp = tn; 1058 * this we can actually derive several pieces of data.
1058 pos = tn->pos + tn->bits; 1059 * if !(index >> bits)
1059 n = tnode_get_child(tn, 1060 * we know the value is child index
1060 tkey_extract_bits(key, 1061 * else
1061 tn->pos, 1062 * we have a mismatch in skip bits and failed
1062 tn->bits)); 1063 */
1063 1064 if (index >> n->bits)
1064 BUG_ON(n && node_parent(n) != tn);
1065 } else
1066 break; 1065 break;
1067 }
1068 1066
1069 /* 1067 /* we have found a leaf. Prefixes have already been compared */
1070 * n ----> NULL, LEAF or TNODE 1068 if (IS_LEAF(n)) {
1071 * 1069 /* Case 1: n is a leaf, and prefixes match*/
1072 * tp is n's (parent) ----> NULL or TNODE 1070 insert_leaf_info(n, li);
1073 */ 1071 return fa_head;
1074 1072 }
1075 BUG_ON(tp && IS_LEAF(tp));
1076
1077 /* Case 1: n is a leaf. Compare prefixes */
1078
1079 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1080 l = (struct leaf *) n;
1081 li = leaf_info_new(plen);
1082
1083 if (!li)
1084 return NULL;
1085 1073
1086 fa_head = &li->falh; 1074 tp = n;
1087 insert_leaf_info(&l->list, li); 1075 n = tnode_get_child_rcu(n, index);
1088 goto done;
1089 } 1076 }
1090 l = leaf_new();
1091 1077
1092 if (!l) 1078 l = leaf_new(key);
1093 return NULL; 1079 if (!l) {
1094 1080 free_leaf_info(li);
1095 l->key = key;
1096 li = leaf_info_new(plen);
1097
1098 if (!li) {
1099 free_leaf(l);
1100 return NULL; 1081 return NULL;
1101 } 1082 }
1102 1083
1103 fa_head = &li->falh; 1084 insert_leaf_info(l, li);
1104 insert_leaf_info(&l->list, li);
1105
1106 if (t->trie && n == NULL) {
1107 /* Case 2: n is NULL, and will just insert a new leaf */
1108 1085
1109 node_set_parent((struct rt_trie_node *)l, tp); 1086 /* Case 2: n is a LEAF or a TNODE and the key doesn't match.
1110 1087 *
1111 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1088 * Add a new tnode here
1112 put_child(tp, cindex, (struct rt_trie_node *)l); 1089 * first tnode need some special handling
1113 } else { 1090 * leaves us in position for handling as case 3
1114 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1091 */
1115 /* 1092 if (n) {
1116 * Add a new tnode here 1093 struct tnode *tn;
1117 * first tnode need some special handling
1118 */
1119
1120 if (n) {
1121 pos = tp ? tp->pos+tp->bits : 0;
1122 newpos = tkey_mismatch(key, pos, n->key);
1123 tn = tnode_new(n->key, newpos, 1);
1124 } else {
1125 newpos = 0;
1126 tn = tnode_new(key, newpos, 1); /* First tnode */
1127 }
1128 1094
1095 tn = tnode_new(key, __fls(key ^ n->key), 1);
1129 if (!tn) { 1096 if (!tn) {
1130 free_leaf_info(li); 1097 free_leaf_info(li);
1131 free_leaf(l); 1098 node_free(l);
1132 return NULL; 1099 return NULL;
1133 } 1100 }
1134 1101
1135 node_set_parent((struct rt_trie_node *)tn, tp); 1102 /* initialize routes out of node */
1103 NODE_INIT_PARENT(tn, tp);
1104 put_child(tn, get_index(key, tn) ^ 1, n);
1136 1105
1137 missbit = tkey_extract_bits(key, newpos, 1); 1106 /* start adding routes into the node */
1138 put_child(tn, missbit, (struct rt_trie_node *)l); 1107 put_child_root(tp, t, key, tn);
1139 put_child(tn, 1-missbit, n); 1108 node_set_parent(n, tn);
1140
1141 if (tp) {
1142 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1143 put_child(tp, cindex, (struct rt_trie_node *)tn);
1144 } else {
1145 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1146 }
1147 1109
1110 /* parent now has a NULL spot where the leaf can go */
1148 tp = tn; 1111 tp = tn;
1149 } 1112 }
1150 1113
1151 if (tp && tp->pos + tp->bits > 32) 1114 /* Case 3: n is NULL, and will just insert a new leaf */
1152 pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", 1115 if (tp) {
1153 tp, tp->pos, tp->bits, key, plen); 1116 NODE_INIT_PARENT(l, tp);
1154 1117 put_child(tp, get_index(key, tp), l);
1155 /* Rebalance the trie */ 1118 trie_rebalance(t, tp);
1119 } else {
1120 rcu_assign_pointer(t->trie, l);
1121 }
1156 1122
1157 trie_rebalance(t, tp);
1158done:
1159 return fa_head; 1123 return fa_head;
1160} 1124}
1161 1125
@@ -1172,7 +1136,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1172 u8 tos = cfg->fc_tos; 1136 u8 tos = cfg->fc_tos;
1173 u32 key, mask; 1137 u32 key, mask;
1174 int err; 1138 int err;
1175 struct leaf *l; 1139 struct tnode *l;
1176 1140
1177 if (plen > 32) 1141 if (plen > 32)
1178 return -EINVAL; 1142 return -EINVAL;
@@ -1329,18 +1293,130 @@ err:
1329 return err; 1293 return err;
1330} 1294}
1331 1295
1296static inline t_key prefix_mismatch(t_key key, struct tnode *n)
1297{
1298 t_key prefix = n->key;
1299
1300 return (key ^ prefix) & (prefix | -prefix);
1301}
1302
1332/* should be called with rcu_read_lock */ 1303/* should be called with rcu_read_lock */
1333static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, 1304int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1334 t_key key, const struct flowi4 *flp, 1305 struct fib_result *res, int fib_flags)
1335 struct fib_result *res, int fib_flags)
1336{ 1306{
1307 struct trie *t = (struct trie *)tb->tb_data;
1308#ifdef CONFIG_IP_FIB_TRIE_STATS
1309 struct trie_use_stats __percpu *stats = t->stats;
1310#endif
1311 const t_key key = ntohl(flp->daddr);
1312 struct tnode *n, *pn;
1337 struct leaf_info *li; 1313 struct leaf_info *li;
1338 struct hlist_head *hhead = &l->list; 1314 t_key cindex;
1315
1316 n = rcu_dereference(t->trie);
1317 if (!n)
1318 return -EAGAIN;
1319
1320#ifdef CONFIG_IP_FIB_TRIE_STATS
1321 this_cpu_inc(stats->gets);
1322#endif
1323
1324 pn = n;
1325 cindex = 0;
1326
1327 /* Step 1: Travel to the longest prefix match in the trie */
1328 for (;;) {
1329 unsigned long index = get_index(key, n);
1330
1331 /* This bit of code is a bit tricky but it combines multiple
1332 * checks into a single check. The prefix consists of the
1333 * prefix plus zeros for the "bits" in the prefix. The index
1334 * is the difference between the key and this value. From
1335 * this we can actually derive several pieces of data.
1336 * if (index & (~0ul << bits))
1337 * we have a mismatch in skip bits and failed
1338 * else
1339 * we know the value is cindex
1340 */
1341 if (index & (~0ul << n->bits))
1342 break;
1343
1344 /* we have found a leaf. Prefixes have already been compared */
1345 if (IS_LEAF(n))
1346 goto found;
1347
1348 /* only record pn and cindex if we are going to be chopping
1349 * bits later. Otherwise we are just wasting cycles.
1350 */
1351 if (n->slen > n->pos) {
1352 pn = n;
1353 cindex = index;
1354 }
1355
1356 n = tnode_get_child_rcu(n, index);
1357 if (unlikely(!n))
1358 goto backtrace;
1359 }
1360
1361 /* Step 2: Sort out leaves and begin backtracing for longest prefix */
1362 for (;;) {
1363 /* record the pointer where our next node pointer is stored */
1364 struct tnode __rcu **cptr = n->child;
1365
1366 /* This test verifies that none of the bits that differ
1367 * between the key and the prefix exist in the region of
1368 * the lsb and higher in the prefix.
1369 */
1370 if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
1371 goto backtrace;
1372
1373 /* exit out and process leaf */
1374 if (unlikely(IS_LEAF(n)))
1375 break;
1376
1377 /* Don't bother recording parent info. Since we are in
1378 * prefix match mode we will have to come back to wherever
1379 * we started this traversal anyway
1380 */
1381
1382 while ((n = rcu_dereference(*cptr)) == NULL) {
1383backtrace:
1384#ifdef CONFIG_IP_FIB_TRIE_STATS
1385 if (!n)
1386 this_cpu_inc(stats->null_node_hit);
1387#endif
1388 /* If we are at cindex 0 there are no more bits for
1389 * us to strip at this level so we must ascend back
1390 * up one level to see if there are any more bits to
1391 * be stripped there.
1392 */
1393 while (!cindex) {
1394 t_key pkey = pn->key;
1395
1396 pn = node_parent_rcu(pn);
1397 if (unlikely(!pn))
1398 return -EAGAIN;
1399#ifdef CONFIG_IP_FIB_TRIE_STATS
1400 this_cpu_inc(stats->backtrack);
1401#endif
1402 /* Get Child's index */
1403 cindex = get_index(pkey, pn);
1404 }
1405
1406 /* strip the least significant bit from the cindex */
1407 cindex &= cindex - 1;
1408
1409 /* grab pointer for next child node */
1410 cptr = &pn->child[cindex];
1411 }
1412 }
1339 1413
1340 hlist_for_each_entry_rcu(li, hhead, hlist) { 1414found:
1415 /* Step 3: Process the leaf, if that fails fall back to backtracing */
1416 hlist_for_each_entry_rcu(li, &n->list, hlist) {
1341 struct fib_alias *fa; 1417 struct fib_alias *fa;
1342 1418
1343 if (l->key != (key & li->mask_plen)) 1419 if ((key ^ n->key) & li->mask_plen)
1344 continue; 1420 continue;
1345 1421
1346 list_for_each_entry_rcu(fa, &li->falh, fa_list) { 1422 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
@@ -1355,9 +1431,9 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1355 continue; 1431 continue;
1356 fib_alias_accessed(fa); 1432 fib_alias_accessed(fa);
1357 err = fib_props[fa->fa_type].error; 1433 err = fib_props[fa->fa_type].error;
1358 if (err) { 1434 if (unlikely(err < 0)) {
1359#ifdef CONFIG_IP_FIB_TRIE_STATS 1435#ifdef CONFIG_IP_FIB_TRIE_STATS
1360 t->stats.semantic_match_passed++; 1436 this_cpu_inc(stats->semantic_match_passed);
1361#endif 1437#endif
1362 return err; 1438 return err;
1363 } 1439 }
@@ -1371,241 +1447,48 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1371 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) 1447 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1372 continue; 1448 continue;
1373 1449
1374#ifdef CONFIG_IP_FIB_TRIE_STATS 1450 if (!(fib_flags & FIB_LOOKUP_NOREF))
1375 t->stats.semantic_match_passed++; 1451 atomic_inc(&fi->fib_clntref);
1376#endif 1452
1377 res->prefixlen = li->plen; 1453 res->prefixlen = li->plen;
1378 res->nh_sel = nhsel; 1454 res->nh_sel = nhsel;
1379 res->type = fa->fa_type; 1455 res->type = fa->fa_type;
1380 res->scope = fa->fa_info->fib_scope; 1456 res->scope = fi->fib_scope;
1381 res->fi = fi; 1457 res->fi = fi;
1382 res->table = tb; 1458 res->table = tb;
1383 res->fa_head = &li->falh; 1459 res->fa_head = &li->falh;
1384 if (!(fib_flags & FIB_LOOKUP_NOREF))
1385 atomic_inc(&fi->fib_clntref);
1386 return 0;
1387 }
1388 }
1389
1390#ifdef CONFIG_IP_FIB_TRIE_STATS
1391 t->stats.semantic_match_miss++;
1392#endif
1393 }
1394
1395 return 1;
1396}
1397
1398int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1399 struct fib_result *res, int fib_flags)
1400{
1401 struct trie *t = (struct trie *) tb->tb_data;
1402 int ret;
1403 struct rt_trie_node *n;
1404 struct tnode *pn;
1405 unsigned int pos, bits;
1406 t_key key = ntohl(flp->daddr);
1407 unsigned int chopped_off;
1408 t_key cindex = 0;
1409 unsigned int current_prefix_length = KEYLENGTH;
1410 struct tnode *cn;
1411 t_key pref_mismatch;
1412
1413 rcu_read_lock();
1414
1415 n = rcu_dereference(t->trie);
1416 if (!n)
1417 goto failed;
1418
1419#ifdef CONFIG_IP_FIB_TRIE_STATS 1460#ifdef CONFIG_IP_FIB_TRIE_STATS
1420 t->stats.gets++; 1461 this_cpu_inc(stats->semantic_match_passed);
1421#endif 1462#endif
1422 1463 return err;
1423 /* Just a leaf? */ 1464 }
1424 if (IS_LEAF(n)) {
1425 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1426 goto found;
1427 }
1428
1429 pn = (struct tnode *) n;
1430 chopped_off = 0;
1431
1432 while (pn) {
1433 pos = pn->pos;
1434 bits = pn->bits;
1435
1436 if (!chopped_off)
1437 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1438 pos, bits);
1439
1440 n = tnode_get_child_rcu(pn, cindex);
1441
1442 if (n == NULL) {
1443#ifdef CONFIG_IP_FIB_TRIE_STATS
1444 t->stats.null_node_hit++;
1445#endif
1446 goto backtrace;
1447 }
1448
1449 if (IS_LEAF(n)) {
1450 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1451 if (ret > 0)
1452 goto backtrace;
1453 goto found;
1454 }
1455
1456 cn = (struct tnode *)n;
1457
1458 /*
1459 * It's a tnode, and we can do some extra checks here if we
1460 * like, to avoid descending into a dead-end branch.
1461 * This tnode is in the parent's child array at index
1462 * key[p_pos..p_pos+p_bits] but potentially with some bits
1463 * chopped off, so in reality the index may be just a
1464 * subprefix, padded with zero at the end.
1465 * We can also take a look at any skipped bits in this
1466 * tnode - everything up to p_pos is supposed to be ok,
1467 * and the non-chopped bits of the index (se previous
1468 * paragraph) are also guaranteed ok, but the rest is
1469 * considered unknown.
1470 *
1471 * The skipped bits are key[pos+bits..cn->pos].
1472 */
1473
1474 /* If current_prefix_length < pos+bits, we are already doing
1475 * actual prefix matching, which means everything from
1476 * pos+(bits-chopped_off) onward must be zero along some
1477 * branch of this subtree - otherwise there is *no* valid
1478 * prefix present. Here we can only check the skipped
1479 * bits. Remember, since we have already indexed into the
1480 * parent's child array, we know that the bits we chopped of
1481 * *are* zero.
1482 */
1483
1484 /* NOTA BENE: Checking only skipped bits
1485 for the new node here */
1486
1487 if (current_prefix_length < pos+bits) {
1488 if (tkey_extract_bits(cn->key, current_prefix_length,
1489 cn->pos - current_prefix_length)
1490 || !(cn->child[0]))
1491 goto backtrace;
1492 }
1493
1494 /*
1495 * If chopped_off=0, the index is fully validated and we
1496 * only need to look at the skipped bits for this, the new,
1497 * tnode. What we actually want to do is to find out if
1498 * these skipped bits match our key perfectly, or if we will
1499 * have to count on finding a matching prefix further down,
1500 * because if we do, we would like to have some way of
1501 * verifying the existence of such a prefix at this point.
1502 */
1503
1504 /* The only thing we can do at this point is to verify that
1505 * any such matching prefix can indeed be a prefix to our
1506 * key, and if the bits in the node we are inspecting that
1507 * do not match our key are not ZERO, this cannot be true.
1508 * Thus, find out where there is a mismatch (before cn->pos)
1509 * and verify that all the mismatching bits are zero in the
1510 * new tnode's key.
1511 */
1512
1513 /*
1514 * Note: We aren't very concerned about the piece of
1515 * the key that precede pn->pos+pn->bits, since these
1516 * have already been checked. The bits after cn->pos
1517 * aren't checked since these are by definition
1518 * "unknown" at this point. Thus, what we want to see
1519 * is if we are about to enter the "prefix matching"
1520 * state, and in that case verify that the skipped
1521 * bits that will prevail throughout this subtree are
1522 * zero, as they have to be if we are to find a
1523 * matching prefix.
1524 */
1525
1526 pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
1527
1528 /*
1529 * In short: If skipped bits in this node do not match
1530 * the search key, enter the "prefix matching"
1531 * state.directly.
1532 */
1533 if (pref_mismatch) {
1534 /* fls(x) = __fls(x) + 1 */
1535 int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
1536
1537 if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1538 goto backtrace;
1539
1540 if (current_prefix_length >= cn->pos)
1541 current_prefix_length = mp;
1542 } 1465 }
1543 1466
1544 pn = (struct tnode *)n; /* Descend */
1545 chopped_off = 0;
1546 continue;
1547
1548backtrace:
1549 chopped_off++;
1550
1551 /* As zero don't change the child key (cindex) */
1552 while ((chopped_off <= pn->bits)
1553 && !(cindex & (1<<(chopped_off-1))))
1554 chopped_off++;
1555
1556 /* Decrease current_... with bits chopped off */
1557 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1558 current_prefix_length = pn->pos + pn->bits
1559 - chopped_off;
1560
1561 /*
1562 * Either we do the actual chop off according or if we have
1563 * chopped off all bits in this tnode walk up to our parent.
1564 */
1565
1566 if (chopped_off <= pn->bits) {
1567 cindex &= ~(1 << (chopped_off-1));
1568 } else {
1569 struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
1570 if (!parent)
1571 goto failed;
1572
1573 /* Get Child's index */
1574 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1575 pn = parent;
1576 chopped_off = 0;
1577
1578#ifdef CONFIG_IP_FIB_TRIE_STATS 1467#ifdef CONFIG_IP_FIB_TRIE_STATS
1579 t->stats.backtrack++; 1468 this_cpu_inc(stats->semantic_match_miss);
1580#endif 1469#endif
1581 goto backtrace;
1582 }
1583 } 1470 }
1584failed: 1471 goto backtrace;
1585 ret = 1;
1586found:
1587 rcu_read_unlock();
1588 return ret;
1589} 1472}
1590EXPORT_SYMBOL_GPL(fib_table_lookup); 1473EXPORT_SYMBOL_GPL(fib_table_lookup);
1591 1474
1592/* 1475/*
1593 * Remove the leaf and return parent. 1476 * Remove the leaf and return parent.
1594 */ 1477 */
1595static void trie_leaf_remove(struct trie *t, struct leaf *l) 1478static void trie_leaf_remove(struct trie *t, struct tnode *l)
1596{ 1479{
1597 struct tnode *tp = node_parent((struct rt_trie_node *) l); 1480 struct tnode *tp = node_parent(l);
1598 1481
1599 pr_debug("entering trie_leaf_remove(%p)\n", l); 1482 pr_debug("entering trie_leaf_remove(%p)\n", l);
1600 1483
1601 if (tp) { 1484 if (tp) {
1602 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); 1485 put_child(tp, get_index(l->key, tp), NULL);
1603 put_child(tp, cindex, NULL);
1604 trie_rebalance(t, tp); 1486 trie_rebalance(t, tp);
1605 } else 1487 } else {
1606 RCU_INIT_POINTER(t->trie, NULL); 1488 RCU_INIT_POINTER(t->trie, NULL);
1489 }
1607 1490
1608 free_leaf(l); 1491 node_free(l);
1609} 1492}
1610 1493
1611/* 1494/*
@@ -1619,7 +1502,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1619 u8 tos = cfg->fc_tos; 1502 u8 tos = cfg->fc_tos;
1620 struct fib_alias *fa, *fa_to_delete; 1503 struct fib_alias *fa, *fa_to_delete;
1621 struct list_head *fa_head; 1504 struct list_head *fa_head;
1622 struct leaf *l; 1505 struct tnode *l;
1623 struct leaf_info *li; 1506 struct leaf_info *li;
1624 1507
1625 if (plen > 32) 1508 if (plen > 32)
@@ -1684,7 +1567,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1684 tb->tb_num_default--; 1567 tb->tb_num_default--;
1685 1568
1686 if (list_empty(fa_head)) { 1569 if (list_empty(fa_head)) {
1687 hlist_del_rcu(&li->hlist); 1570 remove_leaf_info(l, li);
1688 free_leaf_info(li); 1571 free_leaf_info(li);
1689 } 1572 }
1690 1573
@@ -1717,12 +1600,13 @@ static int trie_flush_list(struct list_head *head)
1717 return found; 1600 return found;
1718} 1601}
1719 1602
1720static int trie_flush_leaf(struct leaf *l) 1603static int trie_flush_leaf(struct tnode *l)
1721{ 1604{
1722 int found = 0; 1605 int found = 0;
1723 struct hlist_head *lih = &l->list; 1606 struct hlist_head *lih = &l->list;
1724 struct hlist_node *tmp; 1607 struct hlist_node *tmp;
1725 struct leaf_info *li = NULL; 1608 struct leaf_info *li = NULL;
1609 unsigned char plen = KEYLENGTH;
1726 1610
1727 hlist_for_each_entry_safe(li, tmp, lih, hlist) { 1611 hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1728 found += trie_flush_list(&li->falh); 1612 found += trie_flush_list(&li->falh);
@@ -1730,8 +1614,14 @@ static int trie_flush_leaf(struct leaf *l)
1730 if (list_empty(&li->falh)) { 1614 if (list_empty(&li->falh)) {
1731 hlist_del_rcu(&li->hlist); 1615 hlist_del_rcu(&li->hlist);
1732 free_leaf_info(li); 1616 free_leaf_info(li);
1617 continue;
1733 } 1618 }
1619
1620 plen = li->plen;
1734 } 1621 }
1622
1623 l->slen = KEYLENGTH - plen;
1624
1735 return found; 1625 return found;
1736} 1626}
1737 1627
@@ -1739,63 +1629,57 @@ static int trie_flush_leaf(struct leaf *l)
1739 * Scan for the next right leaf starting at node p->child[idx] 1629 * Scan for the next right leaf starting at node p->child[idx]
1740 * Since we have back pointer, no recursion necessary. 1630 * Since we have back pointer, no recursion necessary.
1741 */ 1631 */
1742static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) 1632static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
1743{ 1633{
1744 do { 1634 do {
1745 t_key idx; 1635 unsigned long idx = c ? idx = get_index(c->key, p) + 1 : 0;
1746 1636
1747 if (c) 1637 while (idx < tnode_child_length(p)) {
1748 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1749 else
1750 idx = 0;
1751
1752 while (idx < 1u << p->bits) {
1753 c = tnode_get_child_rcu(p, idx++); 1638 c = tnode_get_child_rcu(p, idx++);
1754 if (!c) 1639 if (!c)
1755 continue; 1640 continue;
1756 1641
1757 if (IS_LEAF(c)) 1642 if (IS_LEAF(c))
1758 return (struct leaf *) c; 1643 return c;
1759 1644
1760 /* Rescan start scanning in new node */ 1645 /* Rescan start scanning in new node */
1761 p = (struct tnode *) c; 1646 p = c;
1762 idx = 0; 1647 idx = 0;
1763 } 1648 }
1764 1649
1765 /* Node empty, walk back up to parent */ 1650 /* Node empty, walk back up to parent */
1766 c = (struct rt_trie_node *) p; 1651 c = p;
1767 } while ((p = node_parent_rcu(c)) != NULL); 1652 } while ((p = node_parent_rcu(c)) != NULL);
1768 1653
1769 return NULL; /* Root of trie */ 1654 return NULL; /* Root of trie */
1770} 1655}
1771 1656
1772static struct leaf *trie_firstleaf(struct trie *t) 1657static struct tnode *trie_firstleaf(struct trie *t)
1773{ 1658{
1774 struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie); 1659 struct tnode *n = rcu_dereference_rtnl(t->trie);
1775 1660
1776 if (!n) 1661 if (!n)
1777 return NULL; 1662 return NULL;
1778 1663
1779 if (IS_LEAF(n)) /* trie is just a leaf */ 1664 if (IS_LEAF(n)) /* trie is just a leaf */
1780 return (struct leaf *) n; 1665 return n;
1781 1666
1782 return leaf_walk_rcu(n, NULL); 1667 return leaf_walk_rcu(n, NULL);
1783} 1668}
1784 1669
1785static struct leaf *trie_nextleaf(struct leaf *l) 1670static struct tnode *trie_nextleaf(struct tnode *l)
1786{ 1671{
1787 struct rt_trie_node *c = (struct rt_trie_node *) l; 1672 struct tnode *p = node_parent_rcu(l);
1788 struct tnode *p = node_parent_rcu(c);
1789 1673
1790 if (!p) 1674 if (!p)
1791 return NULL; /* trie with just one leaf */ 1675 return NULL; /* trie with just one leaf */
1792 1676
1793 return leaf_walk_rcu(p, c); 1677 return leaf_walk_rcu(p, l);
1794} 1678}
1795 1679
1796static struct leaf *trie_leafindex(struct trie *t, int index) 1680static struct tnode *trie_leafindex(struct trie *t, int index)
1797{ 1681{
1798 struct leaf *l = trie_firstleaf(t); 1682 struct tnode *l = trie_firstleaf(t);
1799 1683
1800 while (l && index-- > 0) 1684 while (l && index-- > 0)
1801 l = trie_nextleaf(l); 1685 l = trie_nextleaf(l);
@@ -1810,19 +1694,28 @@ static struct leaf *trie_leafindex(struct trie *t, int index)
1810int fib_table_flush(struct fib_table *tb) 1694int fib_table_flush(struct fib_table *tb)
1811{ 1695{
1812 struct trie *t = (struct trie *) tb->tb_data; 1696 struct trie *t = (struct trie *) tb->tb_data;
1813 struct leaf *l, *ll = NULL; 1697 struct tnode *l, *ll = NULL;
1814 int found = 0; 1698 int found = 0;
1815 1699
1816 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) { 1700 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1817 found += trie_flush_leaf(l); 1701 found += trie_flush_leaf(l);
1818 1702
1819 if (ll && hlist_empty(&ll->list)) 1703 if (ll) {
1820 trie_leaf_remove(t, ll); 1704 if (hlist_empty(&ll->list))
1705 trie_leaf_remove(t, ll);
1706 else
1707 leaf_pull_suffix(ll);
1708 }
1709
1821 ll = l; 1710 ll = l;
1822 } 1711 }
1823 1712
1824 if (ll && hlist_empty(&ll->list)) 1713 if (ll) {
1825 trie_leaf_remove(t, ll); 1714 if (hlist_empty(&ll->list))
1715 trie_leaf_remove(t, ll);
1716 else
1717 leaf_pull_suffix(ll);
1718 }
1826 1719
1827 pr_debug("trie_flush found=%d\n", found); 1720 pr_debug("trie_flush found=%d\n", found);
1828 return found; 1721 return found;
@@ -1830,6 +1723,11 @@ int fib_table_flush(struct fib_table *tb)
1830 1723
1831void fib_free_table(struct fib_table *tb) 1724void fib_free_table(struct fib_table *tb)
1832{ 1725{
1726#ifdef CONFIG_IP_FIB_TRIE_STATS
1727 struct trie *t = (struct trie *)tb->tb_data;
1728
1729 free_percpu(t->stats);
1730#endif /* CONFIG_IP_FIB_TRIE_STATS */
1833 kfree(tb); 1731 kfree(tb);
1834} 1732}
1835 1733
@@ -1870,7 +1768,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1870 return skb->len; 1768 return skb->len;
1871} 1769}
1872 1770
1873static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb, 1771static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
1874 struct sk_buff *skb, struct netlink_callback *cb) 1772 struct sk_buff *skb, struct netlink_callback *cb)
1875{ 1773{
1876 struct leaf_info *li; 1774 struct leaf_info *li;
@@ -1906,7 +1804,7 @@ static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1906int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, 1804int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1907 struct netlink_callback *cb) 1805 struct netlink_callback *cb)
1908{ 1806{
1909 struct leaf *l; 1807 struct tnode *l;
1910 struct trie *t = (struct trie *) tb->tb_data; 1808 struct trie *t = (struct trie *) tb->tb_data;
1911 t_key key = cb->args[2]; 1809 t_key key = cb->args[2];
1912 int count = cb->args[3]; 1810 int count = cb->args[3];
@@ -1952,7 +1850,7 @@ void __init fib_trie_init(void)
1952 0, SLAB_PANIC, NULL); 1850 0, SLAB_PANIC, NULL);
1953 1851
1954 trie_leaf_kmem = kmem_cache_create("ip_fib_trie", 1852 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1955 max(sizeof(struct leaf), 1853 max(sizeof(struct tnode),
1956 sizeof(struct leaf_info)), 1854 sizeof(struct leaf_info)),
1957 0, SLAB_PANIC, NULL); 1855 0, SLAB_PANIC, NULL);
1958} 1856}
@@ -1973,7 +1871,14 @@ struct fib_table *fib_trie_table(u32 id)
1973 tb->tb_num_default = 0; 1871 tb->tb_num_default = 0;
1974 1872
1975 t = (struct trie *) tb->tb_data; 1873 t = (struct trie *) tb->tb_data;
1976 memset(t, 0, sizeof(*t)); 1874 RCU_INIT_POINTER(t->trie, NULL);
1875#ifdef CONFIG_IP_FIB_TRIE_STATS
1876 t->stats = alloc_percpu(struct trie_use_stats);
1877 if (!t->stats) {
1878 kfree(tb);
1879 tb = NULL;
1880 }
1881#endif
1977 1882
1978 return tb; 1883 return tb;
1979} 1884}
@@ -1988,10 +1893,10 @@ struct fib_trie_iter {
1988 unsigned int depth; 1893 unsigned int depth;
1989}; 1894};
1990 1895
1991static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter) 1896static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
1992{ 1897{
1898 unsigned long cindex = iter->index;
1993 struct tnode *tn = iter->tnode; 1899 struct tnode *tn = iter->tnode;
1994 unsigned int cindex = iter->index;
1995 struct tnode *p; 1900 struct tnode *p;
1996 1901
1997 /* A single entry routing table */ 1902 /* A single entry routing table */
@@ -2001,8 +1906,8 @@ static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
2001 pr_debug("get_next iter={node=%p index=%d depth=%d}\n", 1906 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2002 iter->tnode, iter->index, iter->depth); 1907 iter->tnode, iter->index, iter->depth);
2003rescan: 1908rescan:
2004 while (cindex < (1<<tn->bits)) { 1909 while (cindex < tnode_child_length(tn)) {
2005 struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex); 1910 struct tnode *n = tnode_get_child_rcu(tn, cindex);
2006 1911
2007 if (n) { 1912 if (n) {
2008 if (IS_LEAF(n)) { 1913 if (IS_LEAF(n)) {
@@ -2010,7 +1915,7 @@ rescan:
2010 iter->index = cindex + 1; 1915 iter->index = cindex + 1;
2011 } else { 1916 } else {
2012 /* push down one level */ 1917 /* push down one level */
2013 iter->tnode = (struct tnode *) n; 1918 iter->tnode = n;
2014 iter->index = 0; 1919 iter->index = 0;
2015 ++iter->depth; 1920 ++iter->depth;
2016 } 1921 }
@@ -2021,9 +1926,9 @@ rescan:
2021 } 1926 }
2022 1927
2023 /* Current node exhausted, pop back up */ 1928 /* Current node exhausted, pop back up */
2024 p = node_parent_rcu((struct rt_trie_node *)tn); 1929 p = node_parent_rcu(tn);
2025 if (p) { 1930 if (p) {
2026 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; 1931 cindex = get_index(tn->key, p) + 1;
2027 tn = p; 1932 tn = p;
2028 --iter->depth; 1933 --iter->depth;
2029 goto rescan; 1934 goto rescan;
@@ -2033,10 +1938,10 @@ rescan:
2033 return NULL; 1938 return NULL;
2034} 1939}
2035 1940
2036static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter, 1941static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
2037 struct trie *t) 1942 struct trie *t)
2038{ 1943{
2039 struct rt_trie_node *n; 1944 struct tnode *n;
2040 1945
2041 if (!t) 1946 if (!t)
2042 return NULL; 1947 return NULL;
@@ -2046,7 +1951,7 @@ static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2046 return NULL; 1951 return NULL;
2047 1952
2048 if (IS_TNODE(n)) { 1953 if (IS_TNODE(n)) {
2049 iter->tnode = (struct tnode *) n; 1954 iter->tnode = n;
2050 iter->index = 0; 1955 iter->index = 0;
2051 iter->depth = 1; 1956 iter->depth = 1;
2052 } else { 1957 } else {
@@ -2060,7 +1965,7 @@ static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2060 1965
2061static void trie_collect_stats(struct trie *t, struct trie_stat *s) 1966static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2062{ 1967{
2063 struct rt_trie_node *n; 1968 struct tnode *n;
2064 struct fib_trie_iter iter; 1969 struct fib_trie_iter iter;
2065 1970
2066 memset(s, 0, sizeof(*s)); 1971 memset(s, 0, sizeof(*s));
@@ -2068,7 +1973,6 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2068 rcu_read_lock(); 1973 rcu_read_lock();
2069 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { 1974 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2070 if (IS_LEAF(n)) { 1975 if (IS_LEAF(n)) {
2071 struct leaf *l = (struct leaf *)n;
2072 struct leaf_info *li; 1976 struct leaf_info *li;
2073 1977
2074 s->leaves++; 1978 s->leaves++;
@@ -2076,19 +1980,13 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2076 if (iter.depth > s->maxdepth) 1980 if (iter.depth > s->maxdepth)
2077 s->maxdepth = iter.depth; 1981 s->maxdepth = iter.depth;
2078 1982
2079 hlist_for_each_entry_rcu(li, &l->list, hlist) 1983 hlist_for_each_entry_rcu(li, &n->list, hlist)
2080 ++s->prefixes; 1984 ++s->prefixes;
2081 } else { 1985 } else {
2082 const struct tnode *tn = (const struct tnode *) n;
2083 int i;
2084
2085 s->tnodes++; 1986 s->tnodes++;
2086 if (tn->bits < MAX_STAT_DEPTH) 1987 if (n->bits < MAX_STAT_DEPTH)
2087 s->nodesizes[tn->bits]++; 1988 s->nodesizes[n->bits]++;
2088 1989 s->nullpointers += n->empty_children;
2089 for (i = 0; i < (1<<tn->bits); i++)
2090 if (!tn->child[i])
2091 s->nullpointers++;
2092 } 1990 }
2093 } 1991 }
2094 rcu_read_unlock(); 1992 rcu_read_unlock();
@@ -2111,7 +2009,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2111 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); 2009 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
2112 2010
2113 seq_printf(seq, "\tLeaves: %u\n", stat->leaves); 2011 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
2114 bytes = sizeof(struct leaf) * stat->leaves; 2012 bytes = sizeof(struct tnode) * stat->leaves;
2115 2013
2116 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); 2014 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2117 bytes += sizeof(struct leaf_info) * stat->prefixes; 2015 bytes += sizeof(struct leaf_info) * stat->prefixes;
@@ -2132,25 +2030,38 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2132 seq_putc(seq, '\n'); 2030 seq_putc(seq, '\n');
2133 seq_printf(seq, "\tPointers: %u\n", pointers); 2031 seq_printf(seq, "\tPointers: %u\n", pointers);
2134 2032
2135 bytes += sizeof(struct rt_trie_node *) * pointers; 2033 bytes += sizeof(struct tnode *) * pointers;
2136 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); 2034 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2137 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); 2035 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2138} 2036}
2139 2037
2140#ifdef CONFIG_IP_FIB_TRIE_STATS 2038#ifdef CONFIG_IP_FIB_TRIE_STATS
2141static void trie_show_usage(struct seq_file *seq, 2039static void trie_show_usage(struct seq_file *seq,
2142 const struct trie_use_stats *stats) 2040 const struct trie_use_stats __percpu *stats)
2143{ 2041{
2042 struct trie_use_stats s = { 0 };
2043 int cpu;
2044
2045 /* loop through all of the CPUs and gather up the stats */
2046 for_each_possible_cpu(cpu) {
2047 const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
2048
2049 s.gets += pcpu->gets;
2050 s.backtrack += pcpu->backtrack;
2051 s.semantic_match_passed += pcpu->semantic_match_passed;
2052 s.semantic_match_miss += pcpu->semantic_match_miss;
2053 s.null_node_hit += pcpu->null_node_hit;
2054 s.resize_node_skipped += pcpu->resize_node_skipped;
2055 }
2056
2144 seq_printf(seq, "\nCounters:\n---------\n"); 2057 seq_printf(seq, "\nCounters:\n---------\n");
2145 seq_printf(seq, "gets = %u\n", stats->gets); 2058 seq_printf(seq, "gets = %u\n", s.gets);
2146 seq_printf(seq, "backtracks = %u\n", stats->backtrack); 2059 seq_printf(seq, "backtracks = %u\n", s.backtrack);
2147 seq_printf(seq, "semantic match passed = %u\n", 2060 seq_printf(seq, "semantic match passed = %u\n",
2148 stats->semantic_match_passed); 2061 s.semantic_match_passed);
2149 seq_printf(seq, "semantic match miss = %u\n", 2062 seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
2150 stats->semantic_match_miss); 2063 seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
2151 seq_printf(seq, "null node hit= %u\n", stats->null_node_hit); 2064 seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
2152 seq_printf(seq, "skipped node resize = %u\n\n",
2153 stats->resize_node_skipped);
2154} 2065}
2155#endif /* CONFIG_IP_FIB_TRIE_STATS */ 2066#endif /* CONFIG_IP_FIB_TRIE_STATS */
2156 2067
@@ -2173,7 +2084,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2173 seq_printf(seq, 2084 seq_printf(seq,
2174 "Basic info: size of leaf:" 2085 "Basic info: size of leaf:"
2175 " %Zd bytes, size of tnode: %Zd bytes.\n", 2086 " %Zd bytes, size of tnode: %Zd bytes.\n",
2176 sizeof(struct leaf), sizeof(struct tnode)); 2087 sizeof(struct tnode), sizeof(struct tnode));
2177 2088
2178 for (h = 0; h < FIB_TABLE_HASHSZ; h++) { 2089 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2179 struct hlist_head *head = &net->ipv4.fib_table_hash[h]; 2090 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
@@ -2191,7 +2102,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2191 trie_collect_stats(t, &stat); 2102 trie_collect_stats(t, &stat);
2192 trie_show_stats(seq, &stat); 2103 trie_show_stats(seq, &stat);
2193#ifdef CONFIG_IP_FIB_TRIE_STATS 2104#ifdef CONFIG_IP_FIB_TRIE_STATS
2194 trie_show_usage(seq, &t->stats); 2105 trie_show_usage(seq, t->stats);
2195#endif 2106#endif
2196 } 2107 }
2197 } 2108 }
@@ -2212,7 +2123,7 @@ static const struct file_operations fib_triestat_fops = {
2212 .release = single_release_net, 2123 .release = single_release_net,
2213}; 2124};
2214 2125
2215static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) 2126static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2216{ 2127{
2217 struct fib_trie_iter *iter = seq->private; 2128 struct fib_trie_iter *iter = seq->private;
2218 struct net *net = seq_file_net(seq); 2129 struct net *net = seq_file_net(seq);
@@ -2224,7 +2135,7 @@ static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2224 struct fib_table *tb; 2135 struct fib_table *tb;
2225 2136
2226 hlist_for_each_entry_rcu(tb, head, tb_hlist) { 2137 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2227 struct rt_trie_node *n; 2138 struct tnode *n;
2228 2139
2229 for (n = fib_trie_get_first(iter, 2140 for (n = fib_trie_get_first(iter,
2230 (struct trie *) tb->tb_data); 2141 (struct trie *) tb->tb_data);
@@ -2253,7 +2164,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2253 struct fib_table *tb = iter->tb; 2164 struct fib_table *tb = iter->tb;
2254 struct hlist_node *tb_node; 2165 struct hlist_node *tb_node;
2255 unsigned int h; 2166 unsigned int h;
2256 struct rt_trie_node *n; 2167 struct tnode *n;
2257 2168
2258 ++*pos; 2169 ++*pos;
2259 /* next node in same table */ 2170 /* next node in same table */
@@ -2339,29 +2250,26 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2339static int fib_trie_seq_show(struct seq_file *seq, void *v) 2250static int fib_trie_seq_show(struct seq_file *seq, void *v)
2340{ 2251{
2341 const struct fib_trie_iter *iter = seq->private; 2252 const struct fib_trie_iter *iter = seq->private;
2342 struct rt_trie_node *n = v; 2253 struct tnode *n = v;
2343 2254
2344 if (!node_parent_rcu(n)) 2255 if (!node_parent_rcu(n))
2345 fib_table_print(seq, iter->tb); 2256 fib_table_print(seq, iter->tb);
2346 2257
2347 if (IS_TNODE(n)) { 2258 if (IS_TNODE(n)) {
2348 struct tnode *tn = (struct tnode *) n; 2259 __be32 prf = htonl(n->key);
2349 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2350 2260
2351 seq_indent(seq, iter->depth-1); 2261 seq_indent(seq, iter->depth-1);
2352 seq_printf(seq, " +-- %pI4/%d %d %d %d\n", 2262 seq_printf(seq, " +-- %pI4/%zu %u %u %u\n",
2353 &prf, tn->pos, tn->bits, tn->full_children, 2263 &prf, KEYLENGTH - n->pos - n->bits, n->bits,
2354 tn->empty_children); 2264 n->full_children, n->empty_children);
2355
2356 } else { 2265 } else {
2357 struct leaf *l = (struct leaf *) n;
2358 struct leaf_info *li; 2266 struct leaf_info *li;
2359 __be32 val = htonl(l->key); 2267 __be32 val = htonl(n->key);
2360 2268
2361 seq_indent(seq, iter->depth); 2269 seq_indent(seq, iter->depth);
2362 seq_printf(seq, " |-- %pI4\n", &val); 2270 seq_printf(seq, " |-- %pI4\n", &val);
2363 2271
2364 hlist_for_each_entry_rcu(li, &l->list, hlist) { 2272 hlist_for_each_entry_rcu(li, &n->list, hlist) {
2365 struct fib_alias *fa; 2273 struct fib_alias *fa;
2366 2274
2367 list_for_each_entry_rcu(fa, &li->falh, fa_list) { 2275 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
@@ -2411,9 +2319,9 @@ struct fib_route_iter {
2411 t_key key; 2319 t_key key;
2412}; 2320};
2413 2321
2414static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos) 2322static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2415{ 2323{
2416 struct leaf *l = NULL; 2324 struct tnode *l = NULL;
2417 struct trie *t = iter->main_trie; 2325 struct trie *t = iter->main_trie;
2418 2326
2419 /* use cache location of last found key */ 2327 /* use cache location of last found key */
@@ -2458,7 +2366,7 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2458static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2366static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2459{ 2367{
2460 struct fib_route_iter *iter = seq->private; 2368 struct fib_route_iter *iter = seq->private;
2461 struct leaf *l = v; 2369 struct tnode *l = v;
2462 2370
2463 ++*pos; 2371 ++*pos;
2464 if (v == SEQ_START_TOKEN) { 2372 if (v == SEQ_START_TOKEN) {
@@ -2504,7 +2412,7 @@ static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info
2504 */ 2412 */
2505static int fib_route_seq_show(struct seq_file *seq, void *v) 2413static int fib_route_seq_show(struct seq_file *seq, void *v)
2506{ 2414{
2507 struct leaf *l = v; 2415 struct tnode *l = v;
2508 struct leaf_info *li; 2416 struct leaf_info *li;
2509 2417
2510 if (v == SEQ_START_TOKEN) { 2418 if (v == SEQ_START_TOKEN) {