/*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
* & Swedish University of Agricultural Sciences.
*
* Jens Laas <jens.laas@data.slu.se> Swedish University of
* Agricultural Sciences.
*
* Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
*
* This work is based on the LPC-trie which is originally described in:
*
* An experimental study of compression methods for dynamic tries
* Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
* http://www.csc.kth.se/~snilsson/software/dyntrie2/
*
*
* IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
* IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
*
*
* Code from fib_hash has been reused which includes the following header:
*
*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* IPv4 FIB: lookup engine and maintenance routines.
*
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Substantial contributions to this work comes from:
*
* David S. Miller, <davem@davemloft.net>
* Stephen Hemminger <shemminger@osdl.org>
* Paul E. McKenney <paulmck@us.ibm.com>
* Patrick McHardy <kaber@trash.net>
*/
#define VERSION "0.409"
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"
#define MAX_STAT_DEPTH 32
#define KEYLENGTH (8*sizeof(t_key))
typedef unsigned int t_key;
#define T_TNODE 0
#define T_LEAF 1
#define NODE_TYPE_MASK 0x1UL
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
struct rt_trie_node {
unsigned long parent;
t_key key;
};
struct leaf {
unsigned long parent;
t_key key;
struct hlist_head list;
struct rcu_head rcu;
};
struct leaf_info {
struct hlist_node hlist;
int plen;
u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
struct list_head falh;
struct rcu_head rcu;
};
struct tnode {
unsigned long parent;
t_key key;
unsigned char pos; /* 2log(KEYLENGTH) bits needed */
unsigned char bits; /* 2log(KEYLENGTH) bits needed */
unsigned int full_children; /* KEYLENGTH bits needed */
unsigned int empty_children; /* KEYLENGTH bits needed */
union {
struct rcu_head rcu;
struct work_struct work;
struct tnode *tnode_free;
};
struct rt_trie_node __rcu *child[0];
};
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
unsigned int gets;
unsigned int backtrack;
unsigned int semantic_match_passed;
unsigned int semantic_match_miss;
unsigned int null_node_hit;
unsigned int resize_node_skipped;
};
#endif
struct trie_stat {
unsigned int totdepth;
unsigned int maxdepth;
unsigned int tnodes;
unsigned int leaves;
unsigned int nullpointers;
unsigned int prefixes;
unsigned int nodesizes[MAX_STAT_DEPTH];
};
struct trie {
struct rt_trie_node __rcu *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats stats;
#endif
};
static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
int wasfull);
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
/* tnodes to free after resize(); protected by RTNL */
static struct tnode *tnode_free_head;
static size_t tnode_free_size;
/*
* synchronize_rcu after call_rcu for that many pages; it should be especially
* useful before resizing the root node with PREEMPT_NONE configs; the value was
* obtained experimentally, aiming to avoid visible slowdown.
*/
static const int sync_pages = 128;
static struct kmem_cache *fn_alias_kmem __read_mostly;
static struct kmem_cache *trie_leaf_kmem __read_mostly;
/*
* caller must hold RTNL
*/
static inline struct tnode *node_parent(const struct rt_trie_node *node)
{
unsigned long parent;
parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
return (struct tnode *)(parent & ~NODE_TYPE_MASK);
}
/*
* caller must hold RCU read lock or RTNL
*/
static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
{
unsigned long parent;
parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
lockdep_rtnl_is_held());
return (struct tnode *)(parent & ~NODE_TYPE_MASK);
}
/* Same as rcu_assign_pointer
* but that macro() assumes that value is a pointer.
*/
static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
{
smp_wmb();
node->parent = (unsigned long)ptr | NODE_TYPE(node);
}
/*
* caller must hold RTNL
*/
static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
{
BUG_ON(i >= 1U << tn->bits);
return rtnl_dereference(tn->child[i]);
}
/*
* caller must hold RCU read lock or RTNL
*/
static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
{
BUG_ON(i >= 1U << tn->bits);
return rcu_dereference_rtnl(tn->child[i]);
}
static inline int tnode_child_length(const struct tnode *tn)
{
return 1 << tn->bits;
}
static inline t_key mask_pfx(t_key k, unsigned int l)
{
return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
}
static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
{
if (offset < KEYLENGTH)
return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
else
return 0;
}
static inline int tkey_equals(t_key a, t_key b)
{
return a == b;
}
static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
if (bits == 0 || offset >= KEYLENGTH)
return 1;
bits = bits > KEYLENGTH ? KEYLENGTH : bits;
return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
}
static inline int tkey_mismatch(t_key a, int offset, t_key b)
{
t_key diff = a ^ b;
int i = offset;
if (!diff)
return 0;
while ((diff << i) >> (KEYLENGTH-1) == 0)
i++;
return i;
}
/*
To understand this stuff, an understanding of keys and all their bits is
necessary. Every node in the trie has a key associated with it, but not
all of the bits in that key are significant.
Consider a node 'n' and its parent 'tp'.
If n is a leaf, every bit in its key is significant. Its presence is
necessitated by path compression, since during a tree traversal (when
searching for a leaf - unless we are doing an insertion) we will completely
ignore all skipped bits we encounter. Thus we need to verify, at the end of
a potentially successful search, that we have indeed been walking the
correct key path.
Note that we can never "miss" the correct key in the tree if present by
following the wrong path. Path compression ensures that segments of the key
that are the same for all keys with a given prefix are skipped, but the
skipped part *is* identical for each node in the subtrie below the skipped
bit! trie_insert() in this implementation takes care of that - note the
call to tkey_sub_equals() in trie_insert().
if n is an internal node - a 'tnode' here, the various parts of its key
have many different meanings.
Example:
_________________________________________________________________
| i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
-----------------------------------------------------------------
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
_________________________________________________________________
| C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
-----------------------------------------------------------------
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
tp->pos = 7
tp->bits = 3
n->pos = 15
n->bits = 4
First, let's just ignore the bits that come before the parent tp, that is
the bits from 0 to (tp->pos-1). They are *known* but at this point we do
not use them for anything.
The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
index into the parent's child array. That is, they will be used to find
'n' among tp's children.
The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
for the node n.
All the bits we have seen so far are significant to the node n. The rest
of the bits are really not needed or indeed known in n->key.
The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
n's child array, and will of course be different for each child.
The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
at this point.
*/
static inline void check_tnode(const struct tnode *tn)
{
WARN_ON(tn && tn->pos+tn->bits > 32);
}
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 15;
static const int inflate_threshold_root = 30;
static void __alias_free_mem(struct rcu_head *head)
{
struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
kmem_cache_free(fn_alias_kmem, fa);
}
static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
call_rcu(&fa->rcu, __alias_free_mem);
}
static void __leaf_free_rcu(struct rcu_head *head)
{
struct leaf *l = container_of(head, struct leaf, rcu);
kmem_cache_free(trie_leaf_kmem, l);
}
static inline void free_leaf(struct leaf *l)
{
call_rcu(&l->rcu, __leaf_free_rcu);
}
static inline void free_leaf_info(struct leaf_info *leaf)
{
kfree_rcu(leaf, rcu);
}
static struct tnode *tnode_alloc(size_t size)
{
if (size <= PAGE_SIZE)
return kzalloc(size, GFP_KERNEL);
else
return vzalloc(size);
}
static void __tnode_vfree(struct work_struct *arg)
{
struct tnode *tn = container_of(arg, struct tnode, work);
vfree(tn);
}
static void __tnode_free_rcu(struct rcu_head *head)
{
struct tnode *tn = container_of(head, struct tnode, rcu);
size_t size = sizeof(struct tnode) +
(sizeof(struct rt_trie_node *) << tn->bits);
if (size <= PAGE_SIZE)
kfree(tn);
else {
INIT_WORK(&tn->work, __tnode_vfree);
schedule_work(&tn->work);
}
}
static inline void tnode_free(struct tnode *tn)
{
if (IS_LEAF(tn))
free_leaf((struct leaf *) tn);
else
call_rcu(&tn->rcu, __tnode_free_rcu);
}
static void tnode_free_safe(struct tnode *tn)
{
BUG_ON(IS_LEAF(tn));
tn->tnode_free = tnode_free_head;
tnode_free_head = tn;
tnode_free_size += sizeof(struct tnode) +
(sizeof(struct rt_trie_node *) << tn->bits);
}
static void tnode_free_flush(void)
{
struct tnode *tn;
while ((tn = tnode_free_head)) {
tnode_free_head = tn->tnode_free;
tn->tnode_free = NULL;
tnode_free(tn);
}
if (tnode_free_size >= PAGE_SIZE * sync_pages) {
tnode_free_size = 0;
synchronize_rcu();
}
}
static struct leaf *leaf_new(void)
{
struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
if (l) {
l->parent = T_LEAF;
INIT_HLIST_HEAD(&l->list);
}
return l;
}
static struct leaf_info *leaf_info_new(int plen)
{
struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
if (li) {
li->plen = plen;
li->mask_plen = ntohl(inet_make_mask(plen));
INIT_LIST_HEAD(&li->falh);
}
return li;
}
static struct tnode *tnode_new(t_key key, int pos, int bits)
{
size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
struct tnode *tn = tnode_alloc(sz);
if (tn) {
tn->parent = T_TNODE;
tn->pos = pos;
tn->bits = bits;
tn->key = key;
tn->full_children = 0;
tn->empty_children = 1<<bits;
}
pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
sizeof(struct rt_trie_node *) << bits);
return tn;
}
/*
* Check whether a tnode 'n' is "full", i.e. it is an internal node
* and no bits are skipped. See discussion in dyntree paper p. 6
*/
static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
{
if (n == NULL || IS_LEAF(n))
return 0;
return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}
static inline void put_child(struct tnode *tn, int i,
struct rt_trie_node *n)
{
tnode_put_child_reorg(tn, i, n, -1);
}
/*
* Add a child at position i overwriting the old value.
* Update the value of full_children and empty_children.
*/
static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
int wasfull)
{
struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
int isfull;
BUG_ON(i >= 1<<tn->bits);
/* update emptyChildren */
if (n == NULL && chi != NULL)
tn->empty_children++;
else if (n != NULL && chi == NULL)
tn->empty_children--;
/* update fullChildren */
if (wasfull == -1)
wasfull = tnode_full(tn, chi);
isfull = tnode_full(tn, n);
if (wasfull && !isfull)
tn->full_children--;
else if (!wasfull && isfull)
tn->full_children++;
if (n)
node_set_parent(n, tn);
rcu_assign_pointer(tn->child[i], n);
}
#define MAX_WORK 10
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
{
int i;
struct tnode *old_tn;
int inflate_threshold_use;
int halve_threshold_use;
int max_work;
if (!tn)
return NULL;
pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
tn, inflate_threshold, halve_threshold);
/* No children */
if (tn->empty_children == tnode_child_length(tn)) {
tnode_free_safe(tn);
return NULL;
}
/* One child */
if (tn->empty_children == tnode_child_length(tn) - 1)
goto one_child;
/*
* Double as long as the resulting node has a number of
* nonempty nodes that are above the threshold.
*/
/*
* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
* the Helsinki University of Technology and Matti Tikkanen of Nokia
* Telecommunications, page 6:
* "A node is doubled if the ratio of non-empty children to all
* children in the *doubled* node is at least 'high'."
*
* 'high' in this instance is the variable 'inflate_threshold'. It
* is expressed as a percentage, so we multiply it with
* tnode_child_length() and instead of multiplying by 2 (since the
* child array will be doubled by inflate()) and multiplying
* the left-hand side by 100 (to handle the percentage thing) we
* multiply the left-hand side by 50.
*
* The left-hand side may look a bit weird: tnode_child_length(tn)
* - tn->empty_children is of course the number of non-null children
* in the current node. tn->full_children is the number of "full"
* children, that is non-null tnodes with a skip value of 0.
* All of those will be doubled in the resulting inflated tnode, so
* we just count them one extra time here.
*
* A clearer way to write this would be:
*
* to_be_doubled = tn->full_children;
* not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
* tn->full_children;
*
* new_child_length = tnode_child_length(tn) * 2;
*
* new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
* new_child_length;
* if (new_fill_factor >= inflate_threshold)
*
* ...and so on, tho it would mess up the while () loop.
*
* anyway,
* 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
* inflate_threshold
*
* avoid a division:
* 100 * (not_to_be_doubled + 2*to_be_doubled) >=
* inflate_threshold * new_child_length
*
* expand not_to_be_doubled and to_be_doubled, and shorten:
* 100 * (tnode_child_length(tn) - tn->empty_children +
* tn->full_children) >= inflate_threshold * new_child_length
*
* expand new_child_length:
* 100 * (tnode_child_length(tn) - tn->empty_children +
* tn->full_children) >=
* inflate_threshold * tnode_child_length(tn) * 2
*
* shorten again:
* 50 * (tn->full_children + tnode_child_length(tn) -
* tn->empty_children) >= inflate_threshold *
* tnode_child_length(tn)
*
*/
check_tnode(tn);
/* Keep root node larger */
if (!node_parent((struct rt_trie_node *)tn)) {
inflate_threshold_use = inflate_threshold_root;
halve_threshold_use = halve_threshold_root;
} else {
inflate_threshold_use = inflate_threshold;
halve_threshold_use = halve_threshold;
}
max_work = MAX_WORK;
while ((tn->full_children > 0 && max_work-- &&
50 * (tn->full_children + tnode_child_length(tn)
- tn->empty_children)
>= inflate_threshold_use * tnode_child_length(tn))) {
old_tn = tn;
tn = inflate(t, tn);
if (IS_ERR(tn)) {
tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats.resize_node_skipped++;
#endif
break;
}
}
check_tnode(tn);
/* Return if at least one inflate is run */
if (max_work != MAX_WORK)
return (struct rt_trie_node *) tn;
/*
* Halve as long as the number of empty children in this
* node is above threshold.
*/
max_work = MAX_WORK;
while (tn->bits > 1 && max_work-- &&
100 * (tnode_child_length(tn) - tn->empty_children) <
halve_threshold_use * tnode_child_length(tn)) {
old_tn = tn;
tn = halve(t, tn);
if (IS_ERR(tn)) {
tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats.resize_node_skipped++;
#endif
break;
}
}
/* Only one child remains */
if (tn->empty_children == tnode_child_length(tn) - 1) {
one_child:
for (i = 0; i < tnode_child_length(tn); i++) {
struct rt_trie_node *n;
n = rtnl_dereference(tn->child[i]);
if (!n)
continue;
/* compress one level */
node_set_parent(n, NULL);
tnode_free_safe(tn);
return n;
}
}
return (struct rt_trie_node *) tn;
}
static void tnode_clean_free(struct tnode *tn)
{
int i;
struct tnode *tofree;
for (i = 0; i < tnode_child_length(tn); i++) {
tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
if (tofree)
tnode_free(tofree);
}
tnode_free(tn);
}
static struct tnode *inflate(struct trie *t, struct tnode *tn)
{
struct tnode *oldtnode = tn;
int olen = tnode_child_length(tn);
int i;
pr_debug("In inflate\n");
tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
if (!tn)
return ERR_PTR(-ENOMEM);
/*
* Preallocate and store tnodes before the actual work so we
* don't get into an inconsistent state if memory allocation
* fails. In case of failure we return the oldnode and inflate
* of tnode is ignored.
*/
for (i = 0; i < olen; i++) {
struct tnode *inode;
inode = (struct tnode *) tnode_get_child(oldtnode, i);
if (inode &&
IS_TNODE(inode) &&
inode->pos == oldtnode->pos + oldtnode->bits &&
inode->bits > 1) {
struct tnode *left, *right;
t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
left = tnode_new(inode->key&(~m), inode->pos + 1,
inode->bits - 1);
if (!left)
goto nomem;
right = tnode_new(inode->key|m, inode->pos + 1,
inode->bits - 1);
if (!right) {
tnode_free(left);
goto nomem;
}
put_child(tn, 2*i, (struct rt_trie_node *) left);
put_child(tn, 2*i+1, (struct rt_trie_node *) right);
}
}
for (i = 0; i < olen; i++) {
struct tnode *inode;
struct rt_trie_node *node = tnode_get_child(oldtnode, i);
struct tnode *left, *right;
int size, j;
/* An empty child */
if (node == NULL)
continue;
/* A leaf or an internal node with skipped bits */
if (IS_LEAF(node) || ((struct tnode *) node)->pos >
tn->pos + tn->bits - 1) {
if (tkey_extract_bits(node->key,
oldtnode->pos + oldtnode->bits,
1) == 0)
put_child(tn, 2*i, node);
else
put_child(tn, 2*i+1, node);
continue;
}
/* An internal node with two children */
inode = (struct tnode *) node;
if (inode->bits == 1) {
put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
tnode_free_safe(inode);
continue;
}
/* An internal node with more than two children */
/* We will replace this node 'inode' with two new
* ones, 'left' and 'right', each with half of the
* original children. The two new nodes will have
* a position one bit further down the key and this
* means that the "significant" part of their keys
* (see the discussion near the top of this file)
* will differ by one bit, which will be "0" in
* left's key and "1" in right's key. Since we are
* moving the key position by one step, the bit that
* we are moving away from - the bit at position
* (inode->pos) - is the one that will differ between
* left and right. So... we synthesize that bit in the
* two new keys.
* The mask 'm' below will be a single "one" bit at
* the position (inode->pos)
*/
/* Use the old key, but set the new significant
* bit to zero.
*/
left = (struct tnode *) tnode_get_child(tn, 2*i);
put_child(tn, 2*i, NULL);
BUG_ON(!left);
right = (struct tnode *) tnode_get_child(tn, 2*i+1);
put_child(tn, 2*i+1, NULL);
BUG_ON(!right);
size = tnode_child_length(left);
for (j = 0; j < size; j++) {
put_child(left, j, rtnl_dereference(inode->child[j]));
put_child(right, j, rtnl_dereference(inode->child[j + size]));
}
put_child(tn, 2*i, resize(t, left));
put_child(tn, 2*i+1, resize(t, right));
tnode_free_safe(inode);
}
tnode_free_safe(oldtnode);
return tn;
nomem:
tnode_clean_free(tn);
return ERR_PTR(-ENOMEM);
}
static struct tnode *halve(struct trie *t, struct tnode *tn)
{
struct tnode *oldtnode = tn;
struct rt_trie_node *left, *right;
int i;
int olen = tnode_child_length(tn);
pr_debug("In halve\n");
tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
if (!tn)
return ERR_PTR(-ENOMEM);
/*
* Preallocate and store tnodes before the actual work so we
* don't get into an inconsistent state if memory allocation
* fails. In case of failure we return the oldnode and halve
* of tnode is ignored.
*/
for (i = 0; i < olen; i += 2) {
left = tnode_get_child(oldtnode, i);
right = tnode_get_child(oldtnode, i+1);
/* Two nonempty children */
if (left && right) {
struct tnode *newn;
newn = tnode_new(left->key, tn->pos + tn->bits, 1);
if (!newn)
goto nomem;
put_child(tn, i/2, (struct rt_trie_node *)newn);
}
}
for (i = 0; i < olen; i += 2) {
struct tnode *newBinNode;
left = tnode_get_child(oldtnode, i);
right = tnode_get_child(oldtnode, i+1);
/* At least one of the children is empty */
if (left == NULL) {
if (right == NULL) /* Both are empty */
continue;
put_child(tn, i/2, right);
continue;
}
if (right == NULL) {
put_child(tn, i/2, left);
continue;
}
/* Two nonempty children */
newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
put_child(tn, i/2, NULL);
put_child(newBinNode, 0, left);
put_child(newBinNode, 1, right);
put_child(tn, i/2, resize(t, newBinNode));
}
tnode_free_safe(oldtnode);
return tn;
nomem:
tnode_clean_free(tn);
return ERR_PTR(-ENOMEM);
}
/* readside must use rcu_read_lock currently dump routines
via get_fa_head and dump */
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
{
struct hlist_head *head = &l->list;
struct leaf_info *li;
hlist_for_each_entry_rcu(li, head, hlist)
if (li->plen == plen)
return li;
return NULL;
}
static inline struct list_head *get_fa_head(struct leaf *l, int plen)
{
struct leaf_info *li = find_leaf_info(l, plen);
if (!li)
return NULL;
return &li->falh;
}
static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
struct leaf_info *li = NULL, *last = NULL;
if (hlist_empty(head)) {
hlist_add_head_rcu(&new->hlist, head);
} else {
hlist_for_each_entry(li, head, hlist) {
if (new->plen > li->plen)
break;
last = li;
}
if (last)
hlist_add_after_rcu(&last->hlist, &new->hlist);
else
hlist_add_before_rcu(&new->hlist, &li->hlist);
}
}
/* rcu_read_lock needs to be hold by caller from readside */
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
int pos;
struct tnode *tn;
struct rt_trie_node *n;
pos = 0;
n = rcu_dereference_rtnl(t->trie);
while (n != NULL && NODE_TYPE(n) == T_TNODE) {
tn = (struct tnode *) n;
check_tnode(tn);
if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
pos = tn->pos + tn->bits;
n = tnode_get_child_rcu(tn,
tkey_extract_bits(key,
tn->pos,
tn->bits));
} else
break;
}
/* Case we have found a leaf. Compare prefixes */
if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
return (struct leaf *)n;
return NULL;
}
static void trie_rebalance(struct trie *t, struct tnode *tn)
{
int wasfull;
t_key cindex, key;
struct tnode *tp;
key = tn->key;
while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
tn = (struct tnode *)resize(t, tn);
tnode_put_child_reorg(tp, cindex,
(struct rt_trie_node *)tn, wasfull);
tp = node_parent((struct rt_trie_node *) tn);
if (!tp)
rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tnode_free_flush();
if (!tp)
break;
tn = tp;
}
/* Handle last (top) tnode */
if (IS_TNODE(tn))
tn = (struct tnode *)resize(t, tn);
rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tnode_free_flush();
}
/* only used from updater-side */
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
int pos, newpos;
struct tnode *tp = NULL, *tn = NULL;
struct rt_trie_node *n;
struct leaf *l;
int missbit;
struct list_head *fa_head = NULL;
struct leaf_info *li;
t_key cindex;
pos = 0;
n = rtnl_dereference(t->trie);
/* If we point to NULL, stop. Either the tree is empty and we should
* just put a new leaf in if, or we have reached an empty child slot,
* and we should just put our new leaf in that.
* If we point to a T_TNODE, check if it matches our key. Note that
* a T_TNODE might be skipping any number of bits - its 'pos' need
* not be the parent's 'pos'+'bits'!
*
* If it does match the current key, get pos/bits from it, extract
* the index from our key, push the T_TNODE and walk the tree.
*
* If it doesn't, we have to replace it with a new T_TNODE.
*
* If we point to a T_LEAF, it might or might not have the same key
* as we do. If it does, just change the value, update the T_LEAF's
* value, and return it.
* If it doesn't, we need to replace it with a T_TNODE.
*/
while (n != NULL && NODE_TYPE(n) == T_TNODE) {
tn = (struct tnode *) n;
check_tnode(tn);
if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
tp = tn;
pos = tn->pos + tn->bits;
n = tnode_get_child(tn,
tkey_extract_bits(key,
tn->pos,
tn->bits));
BUG_ON(n && node_parent(n) != tn);
} else
break;
}
/*
* n ----> NULL, LEAF or TNODE
*
* tp is n's (parent) ----> NULL or TNODE
*/
BUG_ON(tp && IS_LEAF(tp));
/* Case 1: n is a leaf. Compare prefixes */
if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
l = (struct leaf *) n;
li = leaf_info_new(plen);
if (!li)
return NULL;
fa_head = &li->falh;
insert_leaf_info(&l->list, li);
goto done;
}
l = leaf_new();
if (!l)
return NULL;
l->key = key;
li = leaf_info_new(plen);
if (!li) {
free_leaf(l);
return NULL;
}
fa_head = &li->falh;
insert_leaf_info(&l->list, li);
if (t->trie && n == NULL) {
/* Case 2: n is NULL, and will just insert a new leaf */
node_set_parent((struct rt_trie_node *)l, tp);
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
put_child(tp, cindex, (struct rt_trie_node *)l);
} else {
/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
/*
* Add a new tnode here
* first tnode need some special handling
*/
if (tp)
pos = tp->pos+tp->bits;
else
pos = 0;
if (n) {
newpos = tkey_mismatch(key, pos, n->key);
tn = tnode_new(n->key, newpos, 1);
} else {
newpos = 0;
tn = tnode_new(key, newpos, 1); /* First tnode */
}
if (!tn) {
free_leaf_info(li);
free_leaf(l);
return NULL;
}
node_set_parent((struct rt_trie_node *)tn, tp);
missbit = tkey_extract_bits(key, newpos, 1);
put_child(tn, missbit, (struct rt_trie_node *)l);
put_child(tn, 1-missbit, n);
if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
put_child(tp, cindex, (struct rt_trie_node *)tn);
} else {
rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tp = tn;
}
}
if (tp && tp->pos + tp->bits > 32)
pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
tp, tp->pos, tp->bits, key, plen);
/* Rebalance the trie */
trie_rebalance(t, tp);
done:
return fa_head;
}
/*
* Caller must hold RTNL.
*/
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
struct trie *t = (struct trie *) tb->tb_data;
struct fib_alias *fa, *new_fa;
struct list_head *fa_head = NULL;
struct fib_info *fi;
int plen = cfg->fc_dst_len;
u8 tos = cfg->fc_tos;
u32 key, mask;
int err;
struct leaf *l;
if (plen > 32)
return -EINVAL;
key = ntohl(cfg->fc_dst);
pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
mask = ntohl(inet_make_mask(plen));
if (key & ~mask)
return -EINVAL;
key = key & mask;
fi = fib_create_info(cfg);
if (IS_ERR(fi)) {
err = PTR_ERR(fi);
goto err;
}
l = fib_find_node(t, key);
fa = NULL;
if (l) {
fa_head = get_fa_head(l, plen);
fa = fib_find_alias(fa_head, tos, fi->fib_priority);
}
/* Now fa, if non-NULL, points to the first fib alias
* with the same keys [prefix,tos,priority], if such key already
* exists or to the node before which we will insert new one.
*
* If fa is NULL, we will need to allocate a new one and
* insert to the head of f.
*
* If f is NULL, no fib node matched the destination key
* and we need to allocate a new one of those as well.
*/
if (fa && fa->fa_tos == tos &&
fa->fa_info->fib_priority == fi->fib_priority) {
struct fib_alias *fa_first, *fa_match;
err = -EEXIST;
if (cfg->fc_nlflags & NLM_F_EXCL)
goto out;
/* We have 2 goals:
* 1. Find exact match for type, scope, fib_info to avoid
* duplicate routes
* 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
*/
fa_match = NULL;
fa_first = fa;
fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
list_for_each_entry_continue(fa, fa_head, fa_list) {
if (fa->fa_tos != tos)
break;
if (fa->fa_info->fib_priority != fi->fib_priority)
break;
if (fa->fa_type == cfg->fc_type &&
fa->fa_info == fi) {
fa_match = fa;
break;
}
}
if (cfg->fc_nlflags & NLM_F_REPLACE) {
struct fib_info *fi_drop;
u8 state;
fa = fa_first;
if (fa_match) {
if (fa == fa_match)
err = 0;
goto out;
}
err = -ENOBUFS;
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
if (new_fa == NULL)
goto out;
fi_drop = fa->fa_info;
new_fa->fa_tos = fa->fa_tos;
new_fa->fa_info = fi;
new_fa->fa_type = cfg->fc_type;
state = fa->fa_state;
new_fa->fa_state = state & ~FA_S_ACCESSED;
list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
alias_free_mem_rcu(fa);
fib_release_info(fi_drop);
if (state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
goto succeeded;
}
/* Error if we find a perfect match which
* uses the same scope, type, and nexthop
* information.
*/
if (fa_match)
goto out;
if (!(cfg->fc_nlflags & NLM_F_APPEND))
fa = fa_first;
}
err = -ENOENT;
if (!(cfg->fc_nlflags & NLM_F_CREATE))
goto out;
err = -ENOBUFS;
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
if (new_fa == NULL)
goto out;
new_fa->fa_info = fi;
new_fa->fa_tos = tos;
new_fa->fa_type = cfg->fc_type;
new_fa->fa_state = 0;
/*
* Insert new entry to the list.
*/
if (!fa_head) {
fa_head = fib_insert_node(t, key, plen);
if (unlikely(!fa_head)) {
err = -ENOMEM;
goto out_free_new_fa;
}
}
if (!plen)
tb->tb_num_default++;
list_add_tail_rcu(&new_fa->fa_list,
(fa ? &fa->fa_list : fa_head));
rt_cache_flush(cfg->fc_nlinfo.nl_net);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
succeeded:
return 0;
out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
fib_release_info(fi);
err:
return err;