aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorRobert Olsson <Robert.Olsson@data.slu.se>2005-08-25 16:01:29 -0400
committerDavid S. Miller <davem@sunset.davemloft.net>2005-08-29 19:09:03 -0400
commit2373ce1ca04dd46bf2b8b0f9a799eb2a90da92fb (patch)
treead517ad6e5b45f52ea05d97a78b85f8c114183d9 /net
parente5b4376074e02b783e56a8f7c42d544e18112c4e (diff)
[IPV4]: Convert FIB Trie to RCU.
* Removes RW-lock * Proteced read functions uses rcu_dereference proteced with rcu_read_lock() * writing of procted pointer w. rcu_assigen_pointer * Insert/Replace atomic list_replace_rcu * A BUG_ON condition removed.in trie_rebalance With help from Paul E. McKenney. Signed-off-by: Robert Olsson <Robert.Olsson@data.slu.se> Signed-off-by: Stephen Hemminger <shemminger@osdl.org> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/ipv4/fib_trie.c389
1 files changed, 202 insertions, 187 deletions
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 9c4c7f0367b0..ff21748248e4 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
43 * 2 of the License, or (at your option) any later version. 43 * 2 of the License, or (at your option) any later version.
44 */ 44 */
45 45
46#define VERSION "0.325" 46#define VERSION "0.402"
47 47
48#include <linux/config.h> 48#include <linux/config.h>
49#include <asm/uaccess.h> 49#include <asm/uaccess.h>
@@ -62,6 +62,7 @@
62#include <linux/netdevice.h> 62#include <linux/netdevice.h>
63#include <linux/if_arp.h> 63#include <linux/if_arp.h>
64#include <linux/proc_fs.h> 64#include <linux/proc_fs.h>
65#include <linux/rcupdate.h>
65#include <linux/skbuff.h> 66#include <linux/skbuff.h>
66#include <linux/netlink.h> 67#include <linux/netlink.h>
67#include <linux/init.h> 68#include <linux/init.h>
@@ -81,22 +82,19 @@
81#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l)) 82#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
82#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset)) 83#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
83 84
84static DEFINE_RWLOCK(fib_lock);
85
86typedef unsigned int t_key; 85typedef unsigned int t_key;
87 86
88#define T_TNODE 0 87#define T_TNODE 0
89#define T_LEAF 1 88#define T_LEAF 1
90#define NODE_TYPE_MASK 0x1UL 89#define NODE_TYPE_MASK 0x1UL
91#define NODE_PARENT(node) \ 90#define NODE_PARENT(node) \
92 ((struct tnode *)((node)->parent & ~NODE_TYPE_MASK)) 91 ((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
93#define NODE_SET_PARENT(node, ptr) \ 92
94 ((node)->parent = (((unsigned long)(ptr)) | \ 93#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
95 ((node)->parent & NODE_TYPE_MASK))) 94
96#define NODE_INIT_PARENT(node, type) \ 95#define NODE_SET_PARENT(node, ptr) \
97 ((node)->parent = (type)) 96 rcu_assign_pointer((node)->parent, \
98#define NODE_TYPE(node) \ 97 ((unsigned long)(ptr)) | NODE_TYPE(node))
99 ((node)->parent & NODE_TYPE_MASK)
100 98
101#define IS_TNODE(n) (!(n->parent & T_LEAF)) 99#define IS_TNODE(n) (!(n->parent & T_LEAF))
102#define IS_LEAF(n) (n->parent & T_LEAF) 100#define IS_LEAF(n) (n->parent & T_LEAF)
@@ -110,10 +108,12 @@ struct leaf {
110 t_key key; 108 t_key key;
111 unsigned long parent; 109 unsigned long parent;
112 struct hlist_head list; 110 struct hlist_head list;
111 struct rcu_head rcu;
113}; 112};
114 113
115struct leaf_info { 114struct leaf_info {
116 struct hlist_node hlist; 115 struct hlist_node hlist;
116 struct rcu_head rcu;
117 int plen; 117 int plen;
118 struct list_head falh; 118 struct list_head falh;
119}; 119};
@@ -125,6 +125,7 @@ struct tnode {
125 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */ 125 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short full_children; /* KEYLENGTH bits needed */ 126 unsigned short full_children; /* KEYLENGTH bits needed */
127 unsigned short empty_children; /* KEYLENGTH bits needed */ 127 unsigned short empty_children; /* KEYLENGTH bits needed */
128 struct rcu_head rcu;
128 struct node *child[0]; 129 struct node *child[0];
129}; 130};
130 131
@@ -168,11 +169,14 @@ static void trie_dump_seq(struct seq_file *seq, struct trie *t);
168static kmem_cache_t *fn_alias_kmem; 169static kmem_cache_t *fn_alias_kmem;
169static struct trie *trie_local = NULL, *trie_main = NULL; 170static struct trie *trie_local = NULL, *trie_main = NULL;
170 171
172
173/* rcu_read_lock needs to be hold by caller from readside */
174
171static inline struct node *tnode_get_child(struct tnode *tn, int i) 175static inline struct node *tnode_get_child(struct tnode *tn, int i)
172{ 176{
173 BUG_ON(i >= 1 << tn->bits); 177 BUG_ON(i >= 1 << tn->bits);
174 178
175 return tn->child[i]; 179 return rcu_dereference(tn->child[i]);
176} 180}
177 181
178static inline int tnode_child_length(const struct tnode *tn) 182static inline int tnode_child_length(const struct tnode *tn)
@@ -213,14 +217,6 @@ static inline int tkey_mismatch(t_key a, int offset, t_key b)
213 return i; 217 return i;
214} 218}
215 219
216/* Candidate for fib_semantics */
217
218static void fn_free_alias(struct fib_alias *fa)
219{
220 fib_release_info(fa->fa_info);
221 kmem_cache_free(fn_alias_kmem, fa);
222}
223
224/* 220/*
225 To understand this stuff, an understanding of keys and all their bits is 221 To understand this stuff, an understanding of keys and all their bits is
226 necessary. Every node in the trie has a key associated with it, but not 222 necessary. Every node in the trie has a key associated with it, but not
@@ -292,53 +288,57 @@ static inline void check_tnode(const struct tnode *tn)
292static int halve_threshold = 25; 288static int halve_threshold = 25;
293static int inflate_threshold = 50; 289static int inflate_threshold = 50;
294 290
295static struct leaf *leaf_new(void) 291
292static void __alias_free_mem(struct rcu_head *head)
296{ 293{
297 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); 294 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
298 if (l) { 295 kmem_cache_free(fn_alias_kmem, fa);
299 NODE_INIT_PARENT(l, T_LEAF);
300 INIT_HLIST_HEAD(&l->list);
301 }
302 return l;
303} 296}
304 297
305static struct leaf_info *leaf_info_new(int plen) 298static inline void alias_free_mem_rcu(struct fib_alias *fa)
306{ 299{
307 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); 300 call_rcu(&fa->rcu, __alias_free_mem);
308 301}
309 if (!li)
310 return NULL;
311 302
312 li->plen = plen; 303static void __leaf_free_rcu(struct rcu_head *head)
313 INIT_LIST_HEAD(&li->falh); 304{
305 kfree(container_of(head, struct leaf, rcu));
306}
314 307
315 return li; 308static inline void free_leaf(struct leaf *leaf)
309{
310 call_rcu(&leaf->rcu, __leaf_free_rcu);
316} 311}
317 312
318static inline void free_leaf(struct leaf *l) 313static void __leaf_info_free_rcu(struct rcu_head *head)
319{ 314{
320 kfree(l); 315 kfree(container_of(head, struct leaf_info, rcu));
321} 316}
322 317
323static inline void free_leaf_info(struct leaf_info *li) 318static inline void free_leaf_info(struct leaf_info *leaf)
324{ 319{
325 kfree(li); 320 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
326} 321}
327 322
328static struct tnode *tnode_alloc(unsigned int size) 323static struct tnode *tnode_alloc(unsigned int size)
329{ 324{
330 if (size <= PAGE_SIZE) { 325 struct page *pages;
331 return kmalloc(size, GFP_KERNEL); 326
332 } else { 327 if (size <= PAGE_SIZE)
333 return (struct tnode *) 328 return kcalloc(size, 1, GFP_KERNEL);
334 __get_free_pages(GFP_KERNEL, get_order(size)); 329
335 } 330 pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
331 if (!pages)
332 return NULL;
333
334 return page_address(pages);
336} 335}
337 336
338static void __tnode_free(struct tnode *tn) 337static void __tnode_free_rcu(struct rcu_head *head)
339{ 338{
339 struct tnode *tn = container_of(head, struct tnode, rcu);
340 unsigned int size = sizeof(struct tnode) + 340 unsigned int size = sizeof(struct tnode) +
341 (1 << tn->bits) * sizeof(struct node *); 341 (1 << tn->bits) * sizeof(struct node *);
342 342
343 if (size <= PAGE_SIZE) 343 if (size <= PAGE_SIZE)
344 kfree(tn); 344 kfree(tn);
@@ -346,6 +346,31 @@ static void __tnode_free(struct tnode *tn)
346 free_pages((unsigned long)tn, get_order(size)); 346 free_pages((unsigned long)tn, get_order(size));
347} 347}
348 348
349static inline void tnode_free(struct tnode *tn)
350{
351 call_rcu(&tn->rcu, __tnode_free_rcu);
352}
353
354static struct leaf *leaf_new(void)
355{
356 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
357 if (l) {
358 l->parent = T_LEAF;
359 INIT_HLIST_HEAD(&l->list);
360 }
361 return l;
362}
363
364static struct leaf_info *leaf_info_new(int plen)
365{
366 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
367 if (li) {
368 li->plen = plen;
369 INIT_LIST_HEAD(&li->falh);
370 }
371 return li;
372}
373
349static struct tnode* tnode_new(t_key key, int pos, int bits) 374static struct tnode* tnode_new(t_key key, int pos, int bits)
350{ 375{
351 int nchildren = 1<<bits; 376 int nchildren = 1<<bits;
@@ -354,7 +379,7 @@ static struct tnode* tnode_new(t_key key, int pos, int bits)
354 379
355 if (tn) { 380 if (tn) {
356 memset(tn, 0, sz); 381 memset(tn, 0, sz);
357 NODE_INIT_PARENT(tn, T_TNODE); 382 tn->parent = T_TNODE;
358 tn->pos = pos; 383 tn->pos = pos;
359 tn->bits = bits; 384 tn->bits = bits;
360 tn->key = key; 385 tn->key = key;
@@ -367,17 +392,6 @@ static struct tnode* tnode_new(t_key key, int pos, int bits)
367 return tn; 392 return tn;
368} 393}
369 394
370static void tnode_free(struct tnode *tn)
371{
372 if (IS_LEAF(tn)) {
373 free_leaf((struct leaf *)tn);
374 pr_debug("FL %p \n", tn);
375 } else {
376 __tnode_free(tn);
377 pr_debug("FT %p \n", tn);
378 }
379}
380
381/* 395/*
382 * Check whether a tnode 'n' is "full", i.e. it is an internal node 396 * Check whether a tnode 'n' is "full", i.e. it is an internal node
383 * and no bits are skipped. See discussion in dyntree paper p. 6 397 * and no bits are skipped. See discussion in dyntree paper p. 6
@@ -403,13 +417,11 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, struct nod
403 417
404static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) 418static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
405{ 419{
406 struct node *chi; 420 struct node *chi = tn->child[i];
407 int isfull; 421 int isfull;
408 422
409 BUG_ON(i >= 1<<tn->bits); 423 BUG_ON(i >= 1<<tn->bits);
410 424
411 write_lock_bh(&fib_lock);
412 chi = tn->child[i];
413 425
414 /* update emptyChildren */ 426 /* update emptyChildren */
415 if (n == NULL && chi != NULL) 427 if (n == NULL && chi != NULL)
@@ -430,8 +442,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w
430 if (n) 442 if (n)
431 NODE_SET_PARENT(n, tn); 443 NODE_SET_PARENT(n, tn);
432 444
433 tn->child[i] = n; 445 rcu_assign_pointer(tn->child[i], n);
434 write_unlock_bh(&fib_lock);
435} 446}
436 447
437static struct node *resize(struct trie *t, struct tnode *tn) 448static struct node *resize(struct trie *t, struct tnode *tn)
@@ -456,17 +467,12 @@ static struct node *resize(struct trie *t, struct tnode *tn)
456 for (i = 0; i < tnode_child_length(tn); i++) { 467 for (i = 0; i < tnode_child_length(tn); i++) {
457 struct node *n; 468 struct node *n;
458 469
459 write_lock_bh(&fib_lock);
460 n = tn->child[i]; 470 n = tn->child[i];
461 if (!n) { 471 if (!n)
462 write_unlock_bh(&fib_lock);
463 continue; 472 continue;
464 }
465 473
466 /* compress one level */ 474 /* compress one level */
467 NODE_INIT_PARENT(n, NODE_TYPE(n)); 475 NODE_SET_PARENT(n, NULL);
468
469 write_unlock_bh(&fib_lock);
470 tnode_free(tn); 476 tnode_free(tn);
471 return n; 477 return n;
472 } 478 }
@@ -577,24 +583,17 @@ static struct node *resize(struct trie *t, struct tnode *tn)
577 583
578 584
579 /* Only one child remains */ 585 /* Only one child remains */
580
581 if (tn->empty_children == tnode_child_length(tn) - 1) 586 if (tn->empty_children == tnode_child_length(tn) - 1)
582 for (i = 0; i < tnode_child_length(tn); i++) { 587 for (i = 0; i < tnode_child_length(tn); i++) {
583 struct node *n; 588 struct node *n;
584 589
585 write_lock_bh(&fib_lock);
586
587 n = tn->child[i]; 590 n = tn->child[i];
588 if (!n) { 591 if (!n)
589 write_unlock_bh(&fib_lock);
590 continue; 592 continue;
591 }
592 593
593 /* compress one level */ 594 /* compress one level */
594 595
595 NODE_INIT_PARENT(n, NODE_TYPE(n)); 596 NODE_SET_PARENT(n, NULL);
596
597 write_unlock_bh(&fib_lock);
598 tnode_free(tn); 597 tnode_free(tn);
599 return n; 598 return n;
600 } 599 }
@@ -831,19 +830,22 @@ static void trie_init(struct trie *t)
831 return; 830 return;
832 831
833 t->size = 0; 832 t->size = 0;
834 t->trie = NULL; 833 rcu_assign_pointer(t->trie, NULL);
835 t->revision = 0; 834 t->revision = 0;
836#ifdef CONFIG_IP_FIB_TRIE_STATS 835#ifdef CONFIG_IP_FIB_TRIE_STATS
837 memset(&t->stats, 0, sizeof(struct trie_use_stats)); 836 memset(&t->stats, 0, sizeof(struct trie_use_stats));
838#endif 837#endif
839} 838}
840 839
840/* readside most use rcu_read_lock currently dump routines
841 via get_fa_head and dump */
842
841static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen) 843static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
842{ 844{
843 struct hlist_node *node; 845 struct hlist_node *node;
844 struct leaf_info *li; 846 struct leaf_info *li;
845 847
846 hlist_for_each_entry(li, node, head, hlist) 848 hlist_for_each_entry_rcu(li, node, head, hlist)
847 if (li->plen == plen) 849 if (li->plen == plen)
848 return li; 850 return li;
849 851
@@ -862,28 +864,27 @@ static inline struct list_head * get_fa_head(struct leaf *l, int plen)
862 864
863static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) 865static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
864{ 866{
865 struct leaf_info *li = NULL, *last = NULL; 867 struct leaf_info *li = NULL, *last = NULL;
866 struct hlist_node *node; 868 struct hlist_node *node;
867 869
868 write_lock_bh(&fib_lock); 870 if (hlist_empty(head)) {
871 hlist_add_head_rcu(&new->hlist, head);
872 } else {
873 hlist_for_each_entry(li, node, head, hlist) {
874 if (new->plen > li->plen)
875 break;
869 876
870 if (hlist_empty(head)) { 877 last = li;
871 hlist_add_head(&new->hlist, head); 878 }
872 } else { 879 if (last)
873 hlist_for_each_entry(li, node, head, hlist) { 880 hlist_add_after_rcu(&last->hlist, &new->hlist);
874 if (new->plen > li->plen) 881 else
875 break; 882 hlist_add_before_rcu(&new->hlist, &li->hlist);
876 883 }
877 last = li;
878 }
879 if (last)
880 hlist_add_after(&last->hlist, &new->hlist);
881 else
882 hlist_add_before(&new->hlist, &li->hlist);
883 }
884 write_unlock_bh(&fib_lock);
885} 884}
886 885
886/* rcu_read_lock needs to be hold by caller from readside */
887
887static struct leaf * 888static struct leaf *
888fib_find_node(struct trie *t, u32 key) 889fib_find_node(struct trie *t, u32 key)
889{ 890{
@@ -892,7 +893,7 @@ fib_find_node(struct trie *t, u32 key)
892 struct node *n; 893 struct node *n;
893 894
894 pos = 0; 895 pos = 0;
895 n = t->trie; 896 n = rcu_dereference(t->trie);
896 897
897 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 898 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
898 tn = (struct tnode *) n; 899 tn = (struct tnode *) n;
@@ -915,17 +916,13 @@ fib_find_node(struct trie *t, u32 key)
915 916
916static struct node *trie_rebalance(struct trie *t, struct tnode *tn) 917static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
917{ 918{
918 int i;
919 int wasfull; 919 int wasfull;
920 t_key cindex, key; 920 t_key cindex, key;
921 struct tnode *tp = NULL; 921 struct tnode *tp = NULL;
922 922
923 key = tn->key; 923 key = tn->key;
924 i = 0;
925 924
926 while (tn != NULL && NODE_PARENT(tn) != NULL) { 925 while (tn != NULL && NODE_PARENT(tn) != NULL) {
927 BUG_ON(i > 12); /* Why is this a bug? -ojn */
928 i++;
929 926
930 tp = NODE_PARENT(tn); 927 tp = NODE_PARENT(tn);
931 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 928 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
@@ -945,6 +942,8 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
945 return (struct node*) tn; 942 return (struct node*) tn;
946} 943}
947 944
945/* only used from updater-side */
946
948static struct list_head * 947static struct list_head *
949fib_insert_node(struct trie *t, int *err, u32 key, int plen) 948fib_insert_node(struct trie *t, int *err, u32 key, int plen)
950{ 949{
@@ -1081,7 +1080,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1081 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1080 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1082 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); 1081 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1083 } else { 1082 } else {
1084 t->trie = (struct node*) tn; /* First tnode */ 1083 rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1085 tp = tn; 1084 tp = tn;
1086 } 1085 }
1087 } 1086 }
@@ -1091,7 +1090,8 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1091 tp, tp->pos, tp->bits, key, plen); 1090 tp, tp->pos, tp->bits, key, plen);
1092 1091
1093 /* Rebalance the trie */ 1092 /* Rebalance the trie */
1094 t->trie = trie_rebalance(t, tp); 1093
1094 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1095done: 1095done:
1096 t->revision++; 1096 t->revision++;
1097err: 1097err:
@@ -1166,16 +1166,21 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1166 struct fib_info *fi_drop; 1166 struct fib_info *fi_drop;
1167 u8 state; 1167 u8 state;
1168 1168
1169 write_lock_bh(&fib_lock); 1169 err = -ENOBUFS;
1170 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1171 if (new_fa == NULL)
1172 goto out;
1170 1173
1171 fi_drop = fa->fa_info; 1174 fi_drop = fa->fa_info;
1172 fa->fa_info = fi; 1175 new_fa->fa_tos = fa->fa_tos;
1173 fa->fa_type = type; 1176 new_fa->fa_info = fi;
1174 fa->fa_scope = r->rtm_scope; 1177 new_fa->fa_type = type;
1178 new_fa->fa_scope = r->rtm_scope;
1175 state = fa->fa_state; 1179 state = fa->fa_state;
1176 fa->fa_state &= ~FA_S_ACCESSED; 1180 new_fa->fa_state &= ~FA_S_ACCESSED;
1177 1181
1178 write_unlock_bh(&fib_lock); 1182 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1183 alias_free_mem_rcu(fa);
1179 1184
1180 fib_release_info(fi_drop); 1185 fib_release_info(fi_drop);
1181 if (state & FA_S_ACCESSED) 1186 if (state & FA_S_ACCESSED)
@@ -1227,11 +1232,8 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1227 goto out_free_new_fa; 1232 goto out_free_new_fa;
1228 } 1233 }
1229 1234
1230 write_lock_bh(&fib_lock); 1235 list_add_tail_rcu(&new_fa->fa_list,
1231 1236 (fa ? &fa->fa_list : fa_head));
1232 list_add_tail(&new_fa->fa_list, (fa ? &fa->fa_list : fa_head));
1233
1234 write_unlock_bh(&fib_lock);
1235 1237
1236 rt_cache_flush(-1); 1238 rt_cache_flush(-1);
1237 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); 1239 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
@@ -1246,6 +1248,8 @@ err:
1246 return err; 1248 return err;
1247} 1249}
1248 1250
1251
1252/* should be clalled with rcu_read_lock */
1249static inline int check_leaf(struct trie *t, struct leaf *l, 1253static inline int check_leaf(struct trie *t, struct leaf *l,
1250 t_key key, int *plen, const struct flowi *flp, 1254 t_key key, int *plen, const struct flowi *flp,
1251 struct fib_result *res) 1255 struct fib_result *res)
@@ -1256,7 +1260,7 @@ static inline int check_leaf(struct trie *t, struct leaf *l,
1256 struct hlist_head *hhead = &l->list; 1260 struct hlist_head *hhead = &l->list;
1257 struct hlist_node *node; 1261 struct hlist_node *node;
1258 1262
1259 hlist_for_each_entry(li, node, hhead, hlist) { 1263 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1260 i = li->plen; 1264 i = li->plen;
1261 mask = ntohl(inet_make_mask(i)); 1265 mask = ntohl(inet_make_mask(i));
1262 if (l->key != (key & mask)) 1266 if (l->key != (key & mask))
@@ -1292,10 +1296,9 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1292 t_key node_prefix, key_prefix, pref_mismatch; 1296 t_key node_prefix, key_prefix, pref_mismatch;
1293 int mp; 1297 int mp;
1294 1298
1295 n = t->trie; 1299 rcu_read_lock();
1296
1297 read_lock(&fib_lock);
1298 1300
1301 n = rcu_dereference(t->trie);
1299 if (!n) 1302 if (!n)
1300 goto failed; 1303 goto failed;
1301 1304
@@ -1465,10 +1468,11 @@ backtrace:
1465failed: 1468failed:
1466 ret = 1; 1469 ret = 1;
1467found: 1470found:
1468 read_unlock(&fib_lock); 1471 rcu_read_unlock();
1469 return ret; 1472 return ret;
1470} 1473}
1471 1474
1475/* only called from updater side */
1472static int trie_leaf_remove(struct trie *t, t_key key) 1476static int trie_leaf_remove(struct trie *t, t_key key)
1473{ 1477{
1474 t_key cindex; 1478 t_key cindex;
@@ -1503,15 +1507,17 @@ static int trie_leaf_remove(struct trie *t, t_key key)
1503 t->revision++; 1507 t->revision++;
1504 t->size--; 1508 t->size--;
1505 1509
1510 preempt_disable();
1506 tp = NODE_PARENT(n); 1511 tp = NODE_PARENT(n);
1507 tnode_free((struct tnode *) n); 1512 tnode_free((struct tnode *) n);
1508 1513
1509 if (tp) { 1514 if (tp) {
1510 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1515 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1511 put_child(t, (struct tnode *)tp, cindex, NULL); 1516 put_child(t, (struct tnode *)tp, cindex, NULL);
1512 t->trie = trie_rebalance(t, tp); 1517 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1513 } else 1518 } else
1514 t->trie = NULL; 1519 rcu_assign_pointer(t->trie, NULL);
1520 preempt_enable();
1515 1521
1516 return 1; 1522 return 1;
1517} 1523}
@@ -1527,7 +1533,6 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1527 struct fib_alias *fa, *fa_to_delete; 1533 struct fib_alias *fa, *fa_to_delete;
1528 struct list_head *fa_head; 1534 struct list_head *fa_head;
1529 struct leaf *l; 1535 struct leaf *l;
1530 int kill_li = 0;
1531 struct leaf_info *li; 1536 struct leaf_info *li;
1532 1537
1533 1538
@@ -1560,6 +1565,7 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1560 1565
1561 fa_to_delete = NULL; 1566 fa_to_delete = NULL;
1562 fa_head = fa->fa_list.prev; 1567 fa_head = fa->fa_list.prev;
1568
1563 list_for_each_entry(fa, fa_head, fa_list) { 1569 list_for_each_entry(fa, fa_head, fa_list) {
1564 struct fib_info *fi = fa->fa_info; 1570 struct fib_info *fi = fa->fa_info;
1565 1571
@@ -1587,18 +1593,12 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1587 l = fib_find_node(t, key); 1593 l = fib_find_node(t, key);
1588 li = find_leaf_info(&l->list, plen); 1594 li = find_leaf_info(&l->list, plen);
1589 1595
1590 write_lock_bh(&fib_lock); 1596 list_del_rcu(&fa->fa_list);
1591
1592 list_del(&fa->fa_list);
1593 1597
1594 if (list_empty(fa_head)) { 1598 if (list_empty(fa_head)) {
1595 hlist_del(&li->hlist); 1599 hlist_del_rcu(&li->hlist);
1596 kill_li = 1;
1597 }
1598 write_unlock_bh(&fib_lock);
1599
1600 if (kill_li)
1601 free_leaf_info(li); 1600 free_leaf_info(li);
1601 }
1602 1602
1603 if (hlist_empty(&l->list)) 1603 if (hlist_empty(&l->list))
1604 trie_leaf_remove(t, key); 1604 trie_leaf_remove(t, key);
@@ -1606,7 +1606,8 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1606 if (fa->fa_state & FA_S_ACCESSED) 1606 if (fa->fa_state & FA_S_ACCESSED)
1607 rt_cache_flush(-1); 1607 rt_cache_flush(-1);
1608 1608
1609 fn_free_alias(fa); 1609 fib_release_info(fa->fa_info);
1610 alias_free_mem_rcu(fa);
1610 return 0; 1611 return 0;
1611} 1612}
1612 1613
@@ -1618,12 +1619,10 @@ static int trie_flush_list(struct trie *t, struct list_head *head)
1618 list_for_each_entry_safe(fa, fa_node, head, fa_list) { 1619 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1619 struct fib_info *fi = fa->fa_info; 1620 struct fib_info *fi = fa->fa_info;
1620 1621
1621 if (fi && (fi->fib_flags&RTNH_F_DEAD)) { 1622 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1622 write_lock_bh(&fib_lock); 1623 list_del_rcu(&fa->fa_list);
1623 list_del(&fa->fa_list); 1624 fib_release_info(fa->fa_info);
1624 write_unlock_bh(&fib_lock); 1625 alias_free_mem_rcu(fa);
1625
1626 fn_free_alias(fa);
1627 found++; 1626 found++;
1628 } 1627 }
1629 } 1628 }
@@ -1641,30 +1640,30 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l)
1641 found += trie_flush_list(t, &li->falh); 1640 found += trie_flush_list(t, &li->falh);
1642 1641
1643 if (list_empty(&li->falh)) { 1642 if (list_empty(&li->falh)) {
1644 write_lock_bh(&fib_lock); 1643 hlist_del_rcu(&li->hlist);
1645 hlist_del(&li->hlist);
1646 write_unlock_bh(&fib_lock);
1647
1648 free_leaf_info(li); 1644 free_leaf_info(li);
1649 } 1645 }
1650 } 1646 }
1651 return found; 1647 return found;
1652} 1648}
1653 1649
1650/* rcu_read_lock needs to be hold by caller from readside */
1651
1654static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) 1652static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1655{ 1653{
1656 struct node *c = (struct node *) thisleaf; 1654 struct node *c = (struct node *) thisleaf;
1657 struct tnode *p; 1655 struct tnode *p;
1658 int idx; 1656 int idx;
1657 struct node *trie = rcu_dereference(t->trie);
1659 1658
1660 if (c == NULL) { 1659 if (c == NULL) {
1661 if (t->trie == NULL) 1660 if (trie == NULL)
1662 return NULL; 1661 return NULL;
1663 1662
1664 if (IS_LEAF(t->trie)) /* trie w. just a leaf */ 1663 if (IS_LEAF(trie)) /* trie w. just a leaf */
1665 return (struct leaf *) t->trie; 1664 return (struct leaf *) trie;
1666 1665
1667 p = (struct tnode*) t->trie; /* Start */ 1666 p = (struct tnode*) trie; /* Start */
1668 } else 1667 } else
1669 p = (struct tnode *) NODE_PARENT(c); 1668 p = (struct tnode *) NODE_PARENT(c);
1670 1669
@@ -1679,23 +1678,26 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1679 1678
1680 last = 1 << p->bits; 1679 last = 1 << p->bits;
1681 for (idx = pos; idx < last ; idx++) { 1680 for (idx = pos; idx < last ; idx++) {
1682 if (!p->child[idx]) 1681 c = rcu_dereference(p->child[idx]);
1682
1683 if (!c)
1683 continue; 1684 continue;
1684 1685
1685 /* Decend if tnode */ 1686 /* Decend if tnode */
1686 while (IS_TNODE(p->child[idx])) { 1687 while (IS_TNODE(c)) {
1687 p = (struct tnode*) p->child[idx]; 1688 p = (struct tnode *) c;
1688 idx = 0; 1689 idx = 0;
1689 1690
1690 /* Rightmost non-NULL branch */ 1691 /* Rightmost non-NULL branch */
1691 if (p && IS_TNODE(p)) 1692 if (p && IS_TNODE(p))
1692 while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++; 1693 while (!(c = rcu_dereference(p->child[idx]))
1694 && idx < (1<<p->bits)) idx++;
1693 1695
1694 /* Done with this tnode? */ 1696 /* Done with this tnode? */
1695 if (idx >= (1 << p->bits) || p->child[idx] == NULL) 1697 if (idx >= (1 << p->bits) || !c)
1696 goto up; 1698 goto up;
1697 } 1699 }
1698 return (struct leaf*) p->child[idx]; 1700 return (struct leaf *) c;
1699 } 1701 }
1700up: 1702up:
1701 /* No more children go up one step */ 1703 /* No more children go up one step */
@@ -1713,6 +1715,7 @@ static int fn_trie_flush(struct fib_table *tb)
1713 1715
1714 t->revision++; 1716 t->revision++;
1715 1717
1718 rcu_read_lock();
1716 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { 1719 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1717 found += trie_flush_leaf(t, l); 1720 found += trie_flush_leaf(t, l);
1718 1721
@@ -1720,6 +1723,7 @@ static int fn_trie_flush(struct fib_table *tb)
1720 trie_leaf_remove(t, ll->key); 1723 trie_leaf_remove(t, ll->key);
1721 ll = l; 1724 ll = l;
1722 } 1725 }
1726 rcu_read_unlock();
1723 1727
1724 if (ll && hlist_empty(&ll->list)) 1728 if (ll && hlist_empty(&ll->list))
1725 trie_leaf_remove(t, ll->key); 1729 trie_leaf_remove(t, ll->key);
@@ -1745,7 +1749,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
1745 last_resort = NULL; 1749 last_resort = NULL;
1746 order = -1; 1750 order = -1;
1747 1751
1748 read_lock(&fib_lock); 1752 rcu_read_lock();
1749 1753
1750 l = fib_find_node(t, 0); 1754 l = fib_find_node(t, 0);
1751 if (!l) 1755 if (!l)
@@ -1758,7 +1762,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
1758 if (list_empty(fa_head)) 1762 if (list_empty(fa_head))
1759 goto out; 1763 goto out;
1760 1764
1761 list_for_each_entry(fa, fa_head, fa_list) { 1765 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1762 struct fib_info *next_fi = fa->fa_info; 1766 struct fib_info *next_fi = fa->fa_info;
1763 1767
1764 if (fa->fa_scope != res->scope || 1768 if (fa->fa_scope != res->scope ||
@@ -1809,7 +1813,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
1809 } 1813 }
1810 trie_last_dflt = last_idx; 1814 trie_last_dflt = last_idx;
1811 out:; 1815 out:;
1812 read_unlock(&fib_lock); 1816 rcu_read_unlock();
1813} 1817}
1814 1818
1815static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, 1819static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
@@ -1823,7 +1827,9 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
1823 s_i = cb->args[3]; 1827 s_i = cb->args[3];
1824 i = 0; 1828 i = 0;
1825 1829
1826 list_for_each_entry(fa, fah, fa_list) { 1830 /* rcu_read_lock is hold by caller */
1831
1832 list_for_each_entry_rcu(fa, fah, fa_list) {
1827 if (i < s_i) { 1833 if (i < s_i) {
1828 i++; 1834 i++;
1829 continue; 1835 continue;
@@ -1898,7 +1904,7 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
1898 1904
1899 s_m = cb->args[1]; 1905 s_m = cb->args[1];
1900 1906
1901 read_lock(&fib_lock); 1907 rcu_read_lock();
1902 for (m = 0; m <= 32; m++) { 1908 for (m = 0; m <= 32; m++) {
1903 if (m < s_m) 1909 if (m < s_m)
1904 continue; 1910 continue;
@@ -1911,11 +1917,11 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
1911 goto out; 1917 goto out;
1912 } 1918 }
1913 } 1919 }
1914 read_unlock(&fib_lock); 1920 rcu_read_unlock();
1915 cb->args[1] = m; 1921 cb->args[1] = m;
1916 return skb->len; 1922 return skb->len;
1917out: 1923out:
1918 read_unlock(&fib_lock); 1924 rcu_read_unlock();
1919 return -1; 1925 return -1;
1920} 1926}
1921 1927
@@ -2016,7 +2022,7 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2016 putspace_seq(seq, indent+2); 2022 putspace_seq(seq, indent+2);
2017 seq_printf(seq, "{/%d...dumping}\n", i); 2023 seq_printf(seq, "{/%d...dumping}\n", i);
2018 2024
2019 list_for_each_entry(fa, fa_head, fa_list) { 2025 list_for_each_entry_rcu(fa, fa_head, fa_list) {
2020 putspace_seq(seq, indent+2); 2026 putspace_seq(seq, indent+2);
2021 if (fa->fa_info == NULL) { 2027 if (fa->fa_info == NULL) {
2022 seq_printf(seq, "Error fa_info=NULL\n"); 2028 seq_printf(seq, "Error fa_info=NULL\n");
@@ -2056,28 +2062,28 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2056 2062
2057static void trie_dump_seq(struct seq_file *seq, struct trie *t) 2063static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2058{ 2064{
2059 struct node *n = t->trie; 2065 struct node *n;
2060 int cindex = 0; 2066 int cindex = 0;
2061 int indent = 1; 2067 int indent = 1;
2062 int pend = 0; 2068 int pend = 0;
2063 int depth = 0; 2069 int depth = 0;
2064 struct tnode *tn; 2070 struct tnode *tn;
2065 2071
2066 read_lock(&fib_lock); 2072 rcu_read_lock();
2067 2073 n = rcu_dereference(t->trie);
2068 seq_printf(seq, "------ trie_dump of t=%p ------\n", t); 2074 seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2069 2075
2070 if (!n) { 2076 if (!n) {
2071 seq_printf(seq, "------ trie is empty\n"); 2077 seq_printf(seq, "------ trie is empty\n");
2072 2078
2073 read_unlock(&fib_lock); 2079 rcu_read_unlock();
2074 return; 2080 return;
2075 } 2081 }
2076 2082
2077 printnode_seq(seq, indent, n, pend, cindex, 0); 2083 printnode_seq(seq, indent, n, pend, cindex, 0);
2078 2084
2079 if (!IS_TNODE(n)) { 2085 if (!IS_TNODE(n)) {
2080 read_unlock(&fib_lock); 2086 rcu_read_unlock();
2081 return; 2087 return;
2082 } 2088 }
2083 2089
@@ -2088,26 +2094,32 @@ static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2088 depth++; 2094 depth++;
2089 2095
2090 while (tn && cindex < (1 << tn->bits)) { 2096 while (tn && cindex < (1 << tn->bits)) {
2091 if (tn->child[cindex]) { 2097 struct node *child = rcu_dereference(tn->child[cindex]);
2098 if (!child)
2099 cindex++;
2100 else {
2092 /* Got a child */ 2101 /* Got a child */
2102 printnode_seq(seq, indent, child, pend,
2103 cindex, tn->bits);
2093 2104
2094 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits); 2105 if (IS_LEAF(child))
2095 if (IS_LEAF(tn->child[cindex])) {
2096 cindex++; 2106 cindex++;
2097 } else { 2107
2108 else {
2098 /* 2109 /*
2099 * New tnode. Decend one level 2110 * New tnode. Decend one level
2100 */ 2111 */
2101 2112
2102 depth++; 2113 depth++;
2103 tn = (struct tnode *)tn->child[cindex]; 2114 n = child;
2104 pend = tn->pos + tn->bits; 2115 tn = (struct tnode *)n;
2105 putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); 2116 pend = tn->pos+tn->bits;
2117 putspace_seq(seq, indent);
2118 seq_printf(seq, "\\--\n");
2106 indent += 3; 2119 indent += 3;
2107 cindex = 0; 2120 cindex = 0;
2108 } 2121 }
2109 } else 2122 }
2110 cindex++;
2111 2123
2112 /* 2124 /*
2113 * Test if we are done 2125 * Test if we are done
@@ -2132,8 +2144,7 @@ static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2132 depth--; 2144 depth--;
2133 } 2145 }
2134 } 2146 }
2135 2147 rcu_read_unlock();
2136 read_unlock(&fib_lock);
2137} 2148}
2138 2149
2139static struct trie_stat *trie_stat_new(void) 2150static struct trie_stat *trie_stat_new(void)
@@ -2159,7 +2170,7 @@ static struct trie_stat *trie_stat_new(void)
2159 2170
2160static struct trie_stat *trie_collect_stats(struct trie *t) 2171static struct trie_stat *trie_collect_stats(struct trie *t)
2161{ 2172{
2162 struct node *n = t->trie; 2173 struct node *n;
2163 struct trie_stat *s = trie_stat_new(); 2174 struct trie_stat *s = trie_stat_new();
2164 int cindex = 0; 2175 int cindex = 0;
2165 int pend = 0; 2176 int pend = 0;
@@ -2167,11 +2178,13 @@ static struct trie_stat *trie_collect_stats(struct trie *t)
2167 2178
2168 if (!s) 2179 if (!s)
2169 return NULL; 2180 return NULL;
2181
2182 rcu_read_lock();
2183 n = rcu_dereference(t->trie);
2184
2170 if (!n) 2185 if (!n)
2171 return s; 2186 return s;
2172 2187
2173 read_lock(&fib_lock);
2174
2175 if (IS_TNODE(n)) { 2188 if (IS_TNODE(n)) {
2176 struct tnode *tn = (struct tnode *)n; 2189 struct tnode *tn = (struct tnode *)n;
2177 pend = tn->pos+tn->bits; 2190 pend = tn->pos+tn->bits;
@@ -2179,7 +2192,9 @@ static struct trie_stat *trie_collect_stats(struct trie *t)
2179 depth++; 2192 depth++;
2180 2193
2181 while (tn && cindex < (1 << tn->bits)) { 2194 while (tn && cindex < (1 << tn->bits)) {
2182 if (tn->child[cindex]) { 2195 struct node *ch = rcu_dereference(tn->child[cindex]);
2196 if (ch) {
2197
2183 /* Got a child */ 2198 /* Got a child */
2184 2199
2185 if (IS_LEAF(tn->child[cindex])) { 2200 if (IS_LEAF(tn->child[cindex])) {
@@ -2199,7 +2214,7 @@ static struct trie_stat *trie_collect_stats(struct trie *t)
2199 s->nodesizes[tn->bits]++; 2214 s->nodesizes[tn->bits]++;
2200 depth++; 2215 depth++;
2201 2216
2202 n = tn->child[cindex]; 2217 n = ch;
2203 tn = (struct tnode *)n; 2218 tn = (struct tnode *)n;
2204 pend = tn->pos+tn->bits; 2219 pend = tn->pos+tn->bits;
2205 2220
@@ -2236,7 +2251,7 @@ static struct trie_stat *trie_collect_stats(struct trie *t)
2236 } 2251 }
2237 } 2252 }
2238 2253
2239 read_unlock(&fib_lock); 2254 rcu_read_unlock();
2240 return s; 2255 return s;
2241} 2256}
2242 2257