aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2011-02-01 18:15:39 -0500
committerDavid S. Miller <davem@davemloft.net>2011-02-01 18:35:25 -0500
commit3630b7c050d9c3564f143d595339fc06b888d6f3 (patch)
tree6c6144205d92e745303d79d136b0baed4b385193
parent2ba451421b23636c45fabfa522858c5c124b3673 (diff)
ipv4: Remove fib_hash.
The time has finally come to remove the hash based routing table implementation in ipv4. FIB Trie is mature, well tested, and I've done an audit of it's code to confirm that it implements insert, delete, and lookup with the same identical semantics as fib_hash did. If there are any semantic differences found in fib_trie, we should simply fix them. I've placed the trie statistic config option under advanced router configuration. Signed-off-by: David S. Miller <davem@davemloft.net> Acked-by: Stephen Hemminger <shemminger@vyatta.com>
-rw-r--r--net/ipv4/Kconfig38
-rw-r--r--net/ipv4/Makefile4
-rw-r--r--net/ipv4/fib_hash.c1061
3 files changed, 2 insertions, 1101 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 8949a05ac307..cbb505ba9324 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -55,45 +55,9 @@ config IP_ADVANCED_ROUTER
55 55
56 If unsure, say N here. 56 If unsure, say N here.
57 57
58choice
59 prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
60 depends on IP_ADVANCED_ROUTER
61 default ASK_IP_FIB_HASH
62
63config ASK_IP_FIB_HASH
64 bool "FIB_HASH"
65 ---help---
66 Current FIB is very proven and good enough for most users.
67
68config IP_FIB_TRIE
69 bool "FIB_TRIE"
70 ---help---
71 Use new experimental LC-trie as FIB lookup algorithm.
72 This improves lookup performance if you have a large
73 number of routes.
74
75 LC-trie is a longest matching prefix lookup algorithm which
76 performs better than FIB_HASH for large routing tables.
77 But, it consumes more memory and is more complex.
78
79 LC-trie is described in:
80
81 IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
82 IEEE Journal on Selected Areas in Communications, 17(6):1083-1092,
83 June 1999
84
85 An experimental study of compression methods for dynamic tries
86 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
87 <http://www.csc.kth.se/~snilsson/software/dyntrie2/>
88
89endchoice
90
91config IP_FIB_HASH
92 def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
93
94config IP_FIB_TRIE_STATS 58config IP_FIB_TRIE_STATS
95 bool "FIB TRIE statistics" 59 bool "FIB TRIE statistics"
96 depends on IP_FIB_TRIE 60 depends on IP_ADVANCED_ROUTER
97 ---help--- 61 ---help---
98 Keep track of statistics on structure of FIB TRIE table. 62 Keep track of statistics on structure of FIB TRIE table.
99 Useful for testing and measuring TRIE performance. 63 Useful for testing and measuring TRIE performance.
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 4978d22f9a75..0dc772d0d125 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -10,12 +10,10 @@ obj-y := route.o inetpeer.o protocol.o \
10 tcp_minisocks.o tcp_cong.o \ 10 tcp_minisocks.o tcp_cong.o \
11 datagram.o raw.o udp.o udplite.o \ 11 datagram.o raw.o udp.o udplite.o \
12 arp.o icmp.o devinet.o af_inet.o igmp.o \ 12 arp.o icmp.o devinet.o af_inet.o igmp.o \
13 fib_frontend.o fib_semantics.o \ 13 fib_frontend.o fib_semantics.o fib_trie.o \
14 inet_fragment.o 14 inet_fragment.o
15 15
16obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o 16obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
17obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
18obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
19obj-$(CONFIG_PROC_FS) += proc.o 17obj-$(CONFIG_PROC_FS) += proc.o
20obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o 18obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
21obj-$(CONFIG_IP_MROUTE) += ipmr.o 19obj-$(CONFIG_IP_MROUTE) += ipmr.o
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
deleted file mode 100644
index fadb6024de27..000000000000
--- a/net/ipv4/fib_hash.c
+++ /dev/null
@@ -1,1061 +0,0 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * IPv4 FIB: lookup engine and maintenance routines.
7 *
8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
16#include <asm/uaccess.h>
17#include <asm/system.h>
18#include <linux/bitops.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/mm.h>
22#include <linux/string.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/errno.h>
26#include <linux/in.h>
27#include <linux/inet.h>
28#include <linux/inetdevice.h>
29#include <linux/netdevice.h>
30#include <linux/if_arp.h>
31#include <linux/proc_fs.h>
32#include <linux/skbuff.h>
33#include <linux/netlink.h>
34#include <linux/init.h>
35#include <linux/slab.h>
36
37#include <net/net_namespace.h>
38#include <net/ip.h>
39#include <net/protocol.h>
40#include <net/route.h>
41#include <net/tcp.h>
42#include <net/sock.h>
43#include <net/ip_fib.h>
44
45#include "fib_lookup.h"
46
47static struct kmem_cache *fn_hash_kmem __read_mostly;
48static struct kmem_cache *fn_alias_kmem __read_mostly;
49
50struct fib_node {
51 struct hlist_node fn_hash;
52 struct list_head fn_alias;
53 __be32 fn_key;
54 struct fib_alias fn_embedded_alias;
55};
56
57#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
58
59struct fn_zone {
60 struct fn_zone __rcu *fz_next; /* Next not empty zone */
61 struct hlist_head __rcu *fz_hash; /* Hash table pointer */
62 seqlock_t fz_lock;
63 u32 fz_hashmask; /* (fz_divisor - 1) */
64
65 u8 fz_order; /* Zone order (0..32) */
66 u8 fz_revorder; /* 32 - fz_order */
67 __be32 fz_mask; /* inet_make_mask(order) */
68#define FZ_MASK(fz) ((fz)->fz_mask)
69
70 struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE];
71
72 int fz_nent; /* Number of entries */
73 int fz_divisor; /* Hash size (mask+1) */
74};
75
76struct fn_hash {
77 struct fn_zone *fn_zones[33];
78 struct fn_zone __rcu *fn_zone_list;
79};
80
81static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
82{
83 u32 h = ntohl(key) >> fz->fz_revorder;
84 h ^= (h>>20);
85 h ^= (h>>10);
86 h ^= (h>>5);
87 h &= fz->fz_hashmask;
88 return h;
89}
90
91static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
92{
93 return dst & FZ_MASK(fz);
94}
95
96static unsigned int fib_hash_genid;
97
98#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
99
100static struct hlist_head *fz_hash_alloc(int divisor)
101{
102 unsigned long size = divisor * sizeof(struct hlist_head);
103
104 if (size <= PAGE_SIZE)
105 return kzalloc(size, GFP_KERNEL);
106
107 return (struct hlist_head *)
108 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
109}
110
111/* The fib hash lock must be held when this is called. */
112static inline void fn_rebuild_zone(struct fn_zone *fz,
113 struct hlist_head *old_ht,
114 int old_divisor)
115{
116 int i;
117
118 for (i = 0; i < old_divisor; i++) {
119 struct hlist_node *node, *n;
120 struct fib_node *f;
121
122 hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
123 struct hlist_head *new_head;
124
125 hlist_del_rcu(&f->fn_hash);
126
127 new_head = rcu_dereference_protected(fz->fz_hash, 1) +
128 fn_hash(f->fn_key, fz);
129 hlist_add_head_rcu(&f->fn_hash, new_head);
130 }
131 }
132}
133
134static void fz_hash_free(struct hlist_head *hash, int divisor)
135{
136 unsigned long size = divisor * sizeof(struct hlist_head);
137
138 if (size <= PAGE_SIZE)
139 kfree(hash);
140 else
141 free_pages((unsigned long)hash, get_order(size));
142}
143
144static void fn_rehash_zone(struct fn_zone *fz)
145{
146 struct hlist_head *ht, *old_ht;
147 int old_divisor, new_divisor;
148 u32 new_hashmask;
149
150 new_divisor = old_divisor = fz->fz_divisor;
151
152 switch (old_divisor) {
153 case EMBEDDED_HASH_SIZE:
154 new_divisor *= EMBEDDED_HASH_SIZE;
155 break;
156 case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
157 new_divisor *= (EMBEDDED_HASH_SIZE/2);
158 break;
159 default:
160 if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
161 printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
162 return;
163 }
164 new_divisor = (old_divisor << 1);
165 break;
166 }
167
168 new_hashmask = (new_divisor - 1);
169
170#if RT_CACHE_DEBUG >= 2
171 printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n",
172 fz->fz_order, old_divisor);
173#endif
174
175 ht = fz_hash_alloc(new_divisor);
176
177 if (ht) {
178 struct fn_zone nfz;
179
180 memcpy(&nfz, fz, sizeof(nfz));
181
182 write_seqlock_bh(&fz->fz_lock);
183 old_ht = rcu_dereference_protected(fz->fz_hash, 1);
184 RCU_INIT_POINTER(nfz.fz_hash, ht);
185 nfz.fz_hashmask = new_hashmask;
186 nfz.fz_divisor = new_divisor;
187 fn_rebuild_zone(&nfz, old_ht, old_divisor);
188 fib_hash_genid++;
189 rcu_assign_pointer(fz->fz_hash, ht);
190 fz->fz_hashmask = new_hashmask;
191 fz->fz_divisor = new_divisor;
192 write_sequnlock_bh(&fz->fz_lock);
193
194 if (old_ht != fz->fz_embedded_hash) {
195 synchronize_rcu();
196 fz_hash_free(old_ht, old_divisor);
197 }
198 }
199}
200
201static void fn_free_node_rcu(struct rcu_head *head)
202{
203 struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
204
205 kmem_cache_free(fn_hash_kmem, f);
206}
207
208static inline void fn_free_node(struct fib_node *f)
209{
210 call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
211}
212
213static void fn_free_alias_rcu(struct rcu_head *head)
214{
215 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
216
217 kmem_cache_free(fn_alias_kmem, fa);
218}
219
220static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
221{
222 fib_release_info(fa->fa_info);
223 if (fa == &f->fn_embedded_alias)
224 fa->fa_info = NULL;
225 else
226 call_rcu(&fa->rcu, fn_free_alias_rcu);
227}
228
229static struct fn_zone *
230fn_new_zone(struct fn_hash *table, int z)
231{
232 int i;
233 struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL);
234 if (!fz)
235 return NULL;
236
237 seqlock_init(&fz->fz_lock);
238 fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
239 fz->fz_hashmask = fz->fz_divisor - 1;
240 RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash);
241 fz->fz_order = z;
242 fz->fz_revorder = 32 - z;
243 fz->fz_mask = inet_make_mask(z);
244
245 /* Find the first not empty zone with more specific mask */
246 for (i = z + 1; i <= 32; i++)
247 if (table->fn_zones[i])
248 break;
249 if (i > 32) {
250 /* No more specific masks, we are the first. */
251 rcu_assign_pointer(fz->fz_next,
252 rtnl_dereference(table->fn_zone_list));
253 rcu_assign_pointer(table->fn_zone_list, fz);
254 } else {
255 rcu_assign_pointer(fz->fz_next,
256 rtnl_dereference(table->fn_zones[i]->fz_next));
257 rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
258 }
259 table->fn_zones[z] = fz;
260 fib_hash_genid++;
261 return fz;
262}
263
264int fib_table_lookup(struct fib_table *tb,
265 const struct flowi *flp, struct fib_result *res,
266 int fib_flags)
267{
268 int err;
269 struct fn_zone *fz;
270 struct fn_hash *t = (struct fn_hash *)tb->tb_data;
271
272 rcu_read_lock();
273 for (fz = rcu_dereference(t->fn_zone_list);
274 fz != NULL;
275 fz = rcu_dereference(fz->fz_next)) {
276 struct hlist_head *head;
277 struct hlist_node *node;
278 struct fib_node *f;
279 __be32 k;
280 unsigned int seq;
281
282 do {
283 seq = read_seqbegin(&fz->fz_lock);
284 k = fz_key(flp->fl4_dst, fz);
285
286 head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz);
287 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
288 if (f->fn_key != k)
289 continue;
290
291 err = fib_semantic_match(tb, &f->fn_alias,
292 flp, res,
293 fz->fz_order, fib_flags);
294 if (err <= 0)
295 goto out;
296 }
297 } while (read_seqretry(&fz->fz_lock, seq));
298 }
299 err = 1;
300out:
301 rcu_read_unlock();
302 return err;
303}
304
305/* Insert node F to FZ. */
306static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
307{
308 struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz);
309
310 hlist_add_head_rcu(&f->fn_hash, head);
311}
312
313/* Return the node in FZ matching KEY. */
314static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
315{
316 struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz);
317 struct hlist_node *node;
318 struct fib_node *f;
319
320 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
321 if (f->fn_key == key)
322 return f;
323 }
324
325 return NULL;
326}
327
328
329static struct fib_alias *fib_fast_alloc(struct fib_node *f)
330{
331 struct fib_alias *fa = &f->fn_embedded_alias;
332
333 if (fa->fa_info != NULL)
334 fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
335 return fa;
336}
337
338/* Caller must hold RTNL. */
339int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
340{
341 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
342 struct fib_node *new_f = NULL;
343 struct fib_node *f;
344 struct fib_alias *fa, *new_fa;
345 struct fn_zone *fz;
346 struct fib_info *fi;
347 u8 tos = cfg->fc_tos;
348 __be32 key;
349 int err;
350
351 if (cfg->fc_dst_len > 32)
352 return -EINVAL;
353
354 fz = table->fn_zones[cfg->fc_dst_len];
355 if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len)))
356 return -ENOBUFS;
357
358 key = 0;
359 if (cfg->fc_dst) {
360 if (cfg->fc_dst & ~FZ_MASK(fz))
361 return -EINVAL;
362 key = fz_key(cfg->fc_dst, fz);
363 }
364
365 fi = fib_create_info(cfg);
366 if (IS_ERR(fi))
367 return PTR_ERR(fi);
368
369 if (fz->fz_nent > (fz->fz_divisor<<1) &&
370 fz->fz_divisor < FZ_MAX_DIVISOR &&
371 (cfg->fc_dst_len == 32 ||
372 (1 << cfg->fc_dst_len) > fz->fz_divisor))
373 fn_rehash_zone(fz);
374
375 f = fib_find_node(fz, key);
376
377 if (!f)
378 fa = NULL;
379 else
380 fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
381
382 /* Now fa, if non-NULL, points to the first fib alias
383 * with the same keys [prefix,tos,priority], if such key already
384 * exists or to the node before which we will insert new one.
385 *
386 * If fa is NULL, we will need to allocate a new one and
387 * insert to the head of f.
388 *
389 * If f is NULL, no fib node matched the destination key
390 * and we need to allocate a new one of those as well.
391 */
392
393 if (fa && fa->fa_tos == tos &&
394 fa->fa_info->fib_priority == fi->fib_priority) {
395 struct fib_alias *fa_first, *fa_match;
396
397 err = -EEXIST;
398 if (cfg->fc_nlflags & NLM_F_EXCL)
399 goto out;
400
401 /* We have 2 goals:
402 * 1. Find exact match for type, scope, fib_info to avoid
403 * duplicate routes
404 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
405 */
406 fa_match = NULL;
407 fa_first = fa;
408 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
409 list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
410 if (fa->fa_tos != tos)
411 break;
412 if (fa->fa_info->fib_priority != fi->fib_priority)
413 break;
414 if (fa->fa_type == cfg->fc_type &&
415 fa->fa_scope == cfg->fc_scope &&
416 fa->fa_info == fi) {
417 fa_match = fa;
418 break;
419 }
420 }
421
422 if (cfg->fc_nlflags & NLM_F_REPLACE) {
423 u8 state;
424
425 fa = fa_first;
426 if (fa_match) {
427 if (fa == fa_match)
428 err = 0;
429 goto out;
430 }
431 err = -ENOBUFS;
432 new_fa = fib_fast_alloc(f);
433 if (new_fa == NULL)
434 goto out;
435
436 new_fa->fa_tos = fa->fa_tos;
437 new_fa->fa_info = fi;
438 new_fa->fa_type = cfg->fc_type;
439 new_fa->fa_scope = cfg->fc_scope;
440 state = fa->fa_state;
441 new_fa->fa_state = state & ~FA_S_ACCESSED;
442 fib_hash_genid++;
443 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
444
445 fn_free_alias(fa, f);
446 if (state & FA_S_ACCESSED)
447 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
448 rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
449 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
450 return 0;
451 }
452
453 /* Error if we find a perfect match which
454 * uses the same scope, type, and nexthop
455 * information.
456 */
457 if (fa_match)
458 goto out;
459
460 if (!(cfg->fc_nlflags & NLM_F_APPEND))
461 fa = fa_first;
462 }
463
464 err = -ENOENT;
465 if (!(cfg->fc_nlflags & NLM_F_CREATE))
466 goto out;
467
468 err = -ENOBUFS;
469
470 if (!f) {
471 new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL);
472 if (new_f == NULL)
473 goto out;
474
475 INIT_HLIST_NODE(&new_f->fn_hash);
476 INIT_LIST_HEAD(&new_f->fn_alias);
477 new_f->fn_key = key;
478 f = new_f;
479 }
480
481 new_fa = fib_fast_alloc(f);
482 if (new_fa == NULL)
483 goto out;
484
485 new_fa->fa_info = fi;
486 new_fa->fa_tos = tos;
487 new_fa->fa_type = cfg->fc_type;
488 new_fa->fa_scope = cfg->fc_scope;
489 new_fa->fa_state = 0;
490
491 /*
492 * Insert new entry to the list.
493 */
494
495 if (new_f)
496 fib_insert_node(fz, new_f);
497 list_add_tail_rcu(&new_fa->fa_list,
498 (fa ? &fa->fa_list : &f->fn_alias));
499 fib_hash_genid++;
500
501 if (new_f)
502 fz->fz_nent++;
503 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
504
505 rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
506 &cfg->fc_nlinfo, 0);
507 return 0;
508
509out:
510 if (new_f)
511 kmem_cache_free(fn_hash_kmem, new_f);
512 fib_release_info(fi);
513 return err;
514}
515
516int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
517{
518 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
519 struct fib_node *f;
520 struct fib_alias *fa, *fa_to_delete;
521 struct fn_zone *fz;
522 __be32 key;
523
524 if (cfg->fc_dst_len > 32)
525 return -EINVAL;
526
527 if ((fz = table->fn_zones[cfg->fc_dst_len]) == NULL)
528 return -ESRCH;
529
530 key = 0;
531 if (cfg->fc_dst) {
532 if (cfg->fc_dst & ~FZ_MASK(fz))
533 return -EINVAL;
534 key = fz_key(cfg->fc_dst, fz);
535 }
536
537 f = fib_find_node(fz, key);
538
539 if (!f)
540 fa = NULL;
541 else
542 fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0);
543 if (!fa)
544 return -ESRCH;
545
546 fa_to_delete = NULL;
547 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
548 list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
549 struct fib_info *fi = fa->fa_info;
550
551 if (fa->fa_tos != cfg->fc_tos)
552 break;
553
554 if ((!cfg->fc_type ||
555 fa->fa_type == cfg->fc_type) &&
556 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
557 fa->fa_scope == cfg->fc_scope) &&
558 (!cfg->fc_protocol ||
559 fi->fib_protocol == cfg->fc_protocol) &&
560 fib_nh_match(cfg, fi) == 0) {
561 fa_to_delete = fa;
562 break;
563 }
564 }
565
566 if (fa_to_delete) {
567 int kill_fn;
568
569 fa = fa_to_delete;
570 rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len,
571 tb->tb_id, &cfg->fc_nlinfo, 0);
572
573 kill_fn = 0;
574 list_del_rcu(&fa->fa_list);
575 if (list_empty(&f->fn_alias)) {
576 hlist_del_rcu(&f->fn_hash);
577 kill_fn = 1;
578 }
579 fib_hash_genid++;
580
581 if (fa->fa_state & FA_S_ACCESSED)
582 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
583 fn_free_alias(fa, f);
584 if (kill_fn) {
585 fn_free_node(f);
586 fz->fz_nent--;
587 }
588
589 return 0;
590 }
591 return -ESRCH;
592}
593
594static int fn_flush_list(struct fn_zone *fz, int idx)
595{
596 struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx;
597 struct hlist_node *node, *n;
598 struct fib_node *f;
599 int found = 0;
600
601 hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
602 struct fib_alias *fa, *fa_node;
603 int kill_f;
604
605 kill_f = 0;
606 list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
607 struct fib_info *fi = fa->fa_info;
608
609 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
610 list_del_rcu(&fa->fa_list);
611 if (list_empty(&f->fn_alias)) {
612 hlist_del_rcu(&f->fn_hash);
613 kill_f = 1;
614 }
615 fib_hash_genid++;
616
617 fn_free_alias(fa, f);
618 found++;
619 }
620 }
621 if (kill_f) {
622 fn_free_node(f);
623 fz->fz_nent--;
624 }
625 }
626 return found;
627}
628
629/* caller must hold RTNL. */
630int fib_table_flush(struct fib_table *tb)
631{
632 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
633 struct fn_zone *fz;
634 int found = 0;
635
636 for (fz = rtnl_dereference(table->fn_zone_list);
637 fz != NULL;
638 fz = rtnl_dereference(fz->fz_next)) {
639 int i;
640
641 for (i = fz->fz_divisor - 1; i >= 0; i--)
642 found += fn_flush_list(fz, i);
643 }
644 return found;
645}
646
647void fib_free_table(struct fib_table *tb)
648{
649 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
650 struct fn_zone *fz, *next;
651
652 next = table->fn_zone_list;
653 while (next != NULL) {
654 fz = next;
655 next = fz->fz_next;
656
657 if (fz->fz_hash != fz->fz_embedded_hash)
658 fz_hash_free(fz->fz_hash, fz->fz_divisor);
659
660 kfree(fz);
661 }
662
663 kfree(tb);
664}
665
666static inline int
667fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
668 struct fib_table *tb,
669 struct fn_zone *fz,
670 struct hlist_head *head)
671{
672 struct hlist_node *node;
673 struct fib_node *f;
674 int i, s_i;
675
676 s_i = cb->args[4];
677 i = 0;
678 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
679 struct fib_alias *fa;
680
681 list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
682 if (i < s_i)
683 goto next;
684
685 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
686 cb->nlh->nlmsg_seq,
687 RTM_NEWROUTE,
688 tb->tb_id,
689 fa->fa_type,
690 fa->fa_scope,
691 f->fn_key,
692 fz->fz_order,
693 fa->fa_tos,
694 fa->fa_info,
695 NLM_F_MULTI) < 0) {
696 cb->args[4] = i;
697 return -1;
698 }
699next:
700 i++;
701 }
702 }
703 cb->args[4] = i;
704 return skb->len;
705}
706
707static inline int
708fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
709 struct fib_table *tb,
710 struct fn_zone *fz)
711{
712 int h, s_h;
713 struct hlist_head *head = rcu_dereference(fz->fz_hash);
714
715 if (head == NULL)
716 return skb->len;
717 s_h = cb->args[3];
718 for (h = s_h; h < fz->fz_divisor; h++) {
719 if (hlist_empty(head + h))
720 continue;
721 if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) {
722 cb->args[3] = h;
723 return -1;
724 }
725 memset(&cb->args[4], 0,
726 sizeof(cb->args) - 4*sizeof(cb->args[0]));
727 }
728 cb->args[3] = h;
729 return skb->len;
730}
731
732int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
733 struct netlink_callback *cb)
734{
735 int m = 0, s_m;
736 struct fn_zone *fz;
737 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
738
739 s_m = cb->args[2];
740 rcu_read_lock();
741 for (fz = rcu_dereference(table->fn_zone_list);
742 fz != NULL;
743 fz = rcu_dereference(fz->fz_next), m++) {
744 if (m < s_m)
745 continue;
746 if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
747 cb->args[2] = m;
748 rcu_read_unlock();
749 return -1;
750 }
751 memset(&cb->args[3], 0,
752 sizeof(cb->args) - 3*sizeof(cb->args[0]));
753 }
754 rcu_read_unlock();
755 cb->args[2] = m;
756 return skb->len;
757}
758
759void __init fib_hash_init(void)
760{
761 fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node),
762 0, SLAB_PANIC, NULL);
763
764 fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
765 0, SLAB_PANIC, NULL);
766
767}
768
769struct fib_table *fib_hash_table(u32 id)
770{
771 struct fib_table *tb;
772
773 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
774 GFP_KERNEL);
775 if (tb == NULL)
776 return NULL;
777
778 tb->tb_id = id;
779 tb->tb_default = -1;
780
781 memset(tb->tb_data, 0, sizeof(struct fn_hash));
782 return tb;
783}
784
785/* ------------------------------------------------------------------------ */
786#ifdef CONFIG_PROC_FS
787
788struct fib_iter_state {
789 struct seq_net_private p;
790 struct fn_zone *zone;
791 int bucket;
792 struct hlist_head *hash_head;
793 struct fib_node *fn;
794 struct fib_alias *fa;
795 loff_t pos;
796 unsigned int genid;
797 int valid;
798};
799
800static struct fib_alias *fib_get_first(struct seq_file *seq)
801{
802 struct fib_iter_state *iter = seq->private;
803 struct fib_table *main_table;
804 struct fn_hash *table;
805
806 main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
807 table = (struct fn_hash *)main_table->tb_data;
808
809 iter->bucket = 0;
810 iter->hash_head = NULL;
811 iter->fn = NULL;
812 iter->fa = NULL;
813 iter->pos = 0;
814 iter->genid = fib_hash_genid;
815 iter->valid = 1;
816
817 for (iter->zone = rcu_dereference(table->fn_zone_list);
818 iter->zone != NULL;
819 iter->zone = rcu_dereference(iter->zone->fz_next)) {
820 int maxslot;
821
822 if (!iter->zone->fz_nent)
823 continue;
824
825 iter->hash_head = rcu_dereference(iter->zone->fz_hash);
826 maxslot = iter->zone->fz_divisor;
827
828 for (iter->bucket = 0; iter->bucket < maxslot;
829 ++iter->bucket, ++iter->hash_head) {
830 struct hlist_node *node;
831 struct fib_node *fn;
832
833 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
834 struct fib_alias *fa;
835
836 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
837 iter->fn = fn;
838 iter->fa = fa;
839 goto out;
840 }
841 }
842 }
843 }
844out:
845 return iter->fa;
846}
847
848static struct fib_alias *fib_get_next(struct seq_file *seq)
849{
850 struct fib_iter_state *iter = seq->private;
851 struct fib_node *fn;
852 struct fib_alias *fa;
853
854 /* Advance FA, if any. */
855 fn = iter->fn;
856 fa = iter->fa;
857 if (fa) {
858 BUG_ON(!fn);
859 list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
860 iter->fa = fa;
861 goto out;
862 }
863 }
864
865 fa = iter->fa = NULL;
866
867 /* Advance FN. */
868 if (fn) {
869 struct hlist_node *node = &fn->fn_hash;
870 hlist_for_each_entry_continue(fn, node, fn_hash) {
871 iter->fn = fn;
872
873 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
874 iter->fa = fa;
875 goto out;
876 }
877 }
878 }
879
880 fn = iter->fn = NULL;
881
882 /* Advance hash chain. */
883 if (!iter->zone)
884 goto out;
885
886 for (;;) {
887 struct hlist_node *node;
888 int maxslot;
889
890 maxslot = iter->zone->fz_divisor;
891
892 while (++iter->bucket < maxslot) {
893 iter->hash_head++;
894
895 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
896 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
897 iter->fn = fn;
898 iter->fa = fa;
899 goto out;
900 }
901 }
902 }
903
904 iter->zone = rcu_dereference(iter->zone->fz_next);
905
906 if (!iter->zone)
907 goto out;
908
909 iter->bucket = 0;
910 iter->hash_head = rcu_dereference(iter->zone->fz_hash);
911
912 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
913 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
914 iter->fn = fn;
915 iter->fa = fa;
916 goto out;
917 }
918 }
919 }
920out:
921 iter->pos++;
922 return fa;
923}
924
925static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
926{
927 struct fib_iter_state *iter = seq->private;
928 struct fib_alias *fa;
929
930 if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
931 fa = iter->fa;
932 pos -= iter->pos;
933 } else
934 fa = fib_get_first(seq);
935
936 if (fa)
937 while (pos && (fa = fib_get_next(seq)))
938 --pos;
939 return pos ? NULL : fa;
940}
941
942static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
943 __acquires(RCU)
944{
945 void *v = NULL;
946
947 rcu_read_lock();
948 if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
949 v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
950 return v;
951}
952
953static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
954{
955 ++*pos;
956 return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
957}
958
959static void fib_seq_stop(struct seq_file *seq, void *v)
960 __releases(RCU)
961{
962 rcu_read_unlock();
963}
964
965static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
966{
967 static const unsigned type2flags[RTN_MAX + 1] = {
968 [7] = RTF_REJECT,
969 [8] = RTF_REJECT,
970 };
971 unsigned flags = type2flags[type];
972
973 if (fi && fi->fib_nh->nh_gw)
974 flags |= RTF_GATEWAY;
975 if (mask == htonl(0xFFFFFFFF))
976 flags |= RTF_HOST;
977 flags |= RTF_UP;
978 return flags;
979}
980
981/*
982 * This outputs /proc/net/route.
983 *
984 * It always works in backward compatibility mode.
985 * The format of the file is not supposed to be changed.
986 */
987static int fib_seq_show(struct seq_file *seq, void *v)
988{
989 struct fib_iter_state *iter;
990 int len;
991 __be32 prefix, mask;
992 unsigned flags;
993 struct fib_node *f;
994 struct fib_alias *fa;
995 struct fib_info *fi;
996
997 if (v == SEQ_START_TOKEN) {
998 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
999 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
1000 "\tWindow\tIRTT");
1001 goto out;
1002 }
1003
1004 iter = seq->private;
1005 f = iter->fn;
1006 fa = iter->fa;
1007 fi = fa->fa_info;
1008 prefix = f->fn_key;
1009 mask = FZ_MASK(iter->zone);
1010 flags = fib_flag_trans(fa->fa_type, mask, fi);
1011 if (fi)
1012 seq_printf(seq,
1013 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
1014 fi->fib_dev ? fi->fib_dev->name : "*", prefix,
1015 fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
1016 mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
1017 fi->fib_window,
1018 fi->fib_rtt >> 3, &len);
1019 else
1020 seq_printf(seq,
1021 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
1022 prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len);
1023
1024 seq_printf(seq, "%*s\n", 127 - len, "");
1025out:
1026 return 0;
1027}
1028
1029static const struct seq_operations fib_seq_ops = {
1030 .start = fib_seq_start,
1031 .next = fib_seq_next,
1032 .stop = fib_seq_stop,
1033 .show = fib_seq_show,
1034};
1035
1036static int fib_seq_open(struct inode *inode, struct file *file)
1037{
1038 return seq_open_net(inode, file, &fib_seq_ops,
1039 sizeof(struct fib_iter_state));
1040}
1041
1042static const struct file_operations fib_seq_fops = {
1043 .owner = THIS_MODULE,
1044 .open = fib_seq_open,
1045 .read = seq_read,
1046 .llseek = seq_lseek,
1047 .release = seq_release_net,
1048};
1049
1050int __net_init fib_proc_init(struct net *net)
1051{
1052 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops))
1053 return -ENOMEM;
1054 return 0;
1055}
1056
1057void __net_exit fib_proc_exit(struct net *net)
1058{
1059 proc_net_remove(net, "route");
1060}
1061#endif /* CONFIG_PROC_FS */