aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/route.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r--net/ipv4/route.c126
1 files changed, 75 insertions, 51 deletions
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 12a1cf306f67..726ea5e8180a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -54,6 +54,7 @@
54 * Marc Boucher : routing by fwmark 54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics 55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file 56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * 58 *
58 * This program is free software; you can redistribute it and/or 59 * This program is free software; you can redistribute it and/or
59 * modify it under the terms of the GNU General Public License 60 * modify it under the terms of the GNU General Public License
@@ -70,6 +71,7 @@
70#include <linux/kernel.h> 71#include <linux/kernel.h>
71#include <linux/sched.h> 72#include <linux/sched.h>
72#include <linux/mm.h> 73#include <linux/mm.h>
74#include <linux/bootmem.h>
73#include <linux/string.h> 75#include <linux/string.h>
74#include <linux/socket.h> 76#include <linux/socket.h>
75#include <linux/sockios.h> 77#include <linux/sockios.h>
@@ -201,8 +203,37 @@ __u8 ip_tos2prio[16] = {
201 203
202struct rt_hash_bucket { 204struct rt_hash_bucket {
203 struct rtable *chain; 205 struct rtable *chain;
204 spinlock_t lock; 206};
205} __attribute__((__aligned__(8))); 207#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
208/*
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
210 * The size of this table is a power of two and depends on the number of CPUS.
211 */
212#if NR_CPUS >= 32
213#define RT_HASH_LOCK_SZ 4096
214#elif NR_CPUS >= 16
215#define RT_HASH_LOCK_SZ 2048
216#elif NR_CPUS >= 8
217#define RT_HASH_LOCK_SZ 1024
218#elif NR_CPUS >= 4
219#define RT_HASH_LOCK_SZ 512
220#else
221#define RT_HASH_LOCK_SZ 256
222#endif
223
224static spinlock_t *rt_hash_locks;
225# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
226# define rt_hash_lock_init() { \
227 int i; \
228 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
229 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
230 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
231 spin_lock_init(&rt_hash_locks[i]); \
232 }
233#else
234# define rt_hash_lock_addr(slot) NULL
235# define rt_hash_lock_init()
236#endif
206 237
207static struct rt_hash_bucket *rt_hash_table; 238static struct rt_hash_bucket *rt_hash_table;
208static unsigned rt_hash_mask; 239static unsigned rt_hash_mask;
@@ -575,19 +606,26 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
575/* This runs via a timer and thus is always in BH context. */ 606/* This runs via a timer and thus is always in BH context. */
576static void rt_check_expire(unsigned long dummy) 607static void rt_check_expire(unsigned long dummy)
577{ 608{
578 static int rover; 609 static unsigned int rover;
579 int i = rover, t; 610 unsigned int i = rover, goal;
580 struct rtable *rth, **rthp; 611 struct rtable *rth, **rthp;
581 unsigned long now = jiffies; 612 unsigned long now = jiffies;
582 613 u64 mult;
583 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; 614
584 t -= ip_rt_gc_timeout) { 615 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
616 if (ip_rt_gc_timeout > 1)
617 do_div(mult, ip_rt_gc_timeout);
618 goal = (unsigned int)mult;
619 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
620 for (; goal > 0; goal--) {
585 unsigned long tmo = ip_rt_gc_timeout; 621 unsigned long tmo = ip_rt_gc_timeout;
586 622
587 i = (i + 1) & rt_hash_mask; 623 i = (i + 1) & rt_hash_mask;
588 rthp = &rt_hash_table[i].chain; 624 rthp = &rt_hash_table[i].chain;
589 625
590 spin_lock(&rt_hash_table[i].lock); 626 if (*rthp == 0)
627 continue;
628 spin_lock(rt_hash_lock_addr(i));
591 while ((rth = *rthp) != NULL) { 629 while ((rth = *rthp) != NULL) {
592 if (rth->u.dst.expires) { 630 if (rth->u.dst.expires) {
593 /* Entry is expired even if it is in use */ 631 /* Entry is expired even if it is in use */
@@ -620,14 +658,14 @@ static void rt_check_expire(unsigned long dummy)
620 rt_free(rth); 658 rt_free(rth);
621#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 659#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622 } 660 }
623 spin_unlock(&rt_hash_table[i].lock); 661 spin_unlock(rt_hash_lock_addr(i));
624 662
625 /* Fallback loop breaker. */ 663 /* Fallback loop breaker. */
626 if (time_after(jiffies, now)) 664 if (time_after(jiffies, now))
627 break; 665 break;
628 } 666 }
629 rover = i; 667 rover = i;
630 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); 668 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
631} 669}
632 670
633/* This can run from both BH and non-BH contexts, the latter 671/* This can run from both BH and non-BH contexts, the latter
@@ -643,11 +681,11 @@ static void rt_run_flush(unsigned long dummy)
643 get_random_bytes(&rt_hash_rnd, 4); 681 get_random_bytes(&rt_hash_rnd, 4);
644 682
645 for (i = rt_hash_mask; i >= 0; i--) { 683 for (i = rt_hash_mask; i >= 0; i--) {
646 spin_lock_bh(&rt_hash_table[i].lock); 684 spin_lock_bh(rt_hash_lock_addr(i));
647 rth = rt_hash_table[i].chain; 685 rth = rt_hash_table[i].chain;
648 if (rth) 686 if (rth)
649 rt_hash_table[i].chain = NULL; 687 rt_hash_table[i].chain = NULL;
650 spin_unlock_bh(&rt_hash_table[i].lock); 688 spin_unlock_bh(rt_hash_lock_addr(i));
651 689
652 for (; rth; rth = next) { 690 for (; rth; rth = next) {
653 next = rth->u.rt_next; 691 next = rth->u.rt_next;
@@ -780,7 +818,7 @@ static int rt_garbage_collect(void)
780 818
781 k = (k + 1) & rt_hash_mask; 819 k = (k + 1) & rt_hash_mask;
782 rthp = &rt_hash_table[k].chain; 820 rthp = &rt_hash_table[k].chain;
783 spin_lock_bh(&rt_hash_table[k].lock); 821 spin_lock_bh(rt_hash_lock_addr(k));
784 while ((rth = *rthp) != NULL) { 822 while ((rth = *rthp) != NULL) {
785 if (!rt_may_expire(rth, tmo, expire)) { 823 if (!rt_may_expire(rth, tmo, expire)) {
786 tmo >>= 1; 824 tmo >>= 1;
@@ -812,7 +850,7 @@ static int rt_garbage_collect(void)
812 goal--; 850 goal--;
813#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 851#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
814 } 852 }
815 spin_unlock_bh(&rt_hash_table[k].lock); 853 spin_unlock_bh(rt_hash_lock_addr(k));
816 if (goal <= 0) 854 if (goal <= 0)
817 break; 855 break;
818 } 856 }
@@ -882,7 +920,7 @@ restart:
882 920
883 rthp = &rt_hash_table[hash].chain; 921 rthp = &rt_hash_table[hash].chain;
884 922
885 spin_lock_bh(&rt_hash_table[hash].lock); 923 spin_lock_bh(rt_hash_lock_addr(hash));
886 while ((rth = *rthp) != NULL) { 924 while ((rth = *rthp) != NULL) {
887#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 925#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
888 if (!(rth->u.dst.flags & DST_BALANCED) && 926 if (!(rth->u.dst.flags & DST_BALANCED) &&
@@ -908,7 +946,7 @@ restart:
908 rth->u.dst.__use++; 946 rth->u.dst.__use++;
909 dst_hold(&rth->u.dst); 947 dst_hold(&rth->u.dst);
910 rth->u.dst.lastuse = now; 948 rth->u.dst.lastuse = now;
911 spin_unlock_bh(&rt_hash_table[hash].lock); 949 spin_unlock_bh(rt_hash_lock_addr(hash));
912 950
913 rt_drop(rt); 951 rt_drop(rt);
914 *rp = rth; 952 *rp = rth;
@@ -949,7 +987,7 @@ restart:
949 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 987 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
950 int err = arp_bind_neighbour(&rt->u.dst); 988 int err = arp_bind_neighbour(&rt->u.dst);
951 if (err) { 989 if (err) {
952 spin_unlock_bh(&rt_hash_table[hash].lock); 990 spin_unlock_bh(rt_hash_lock_addr(hash));
953 991
954 if (err != -ENOBUFS) { 992 if (err != -ENOBUFS) {
955 rt_drop(rt); 993 rt_drop(rt);
@@ -990,7 +1028,7 @@ restart:
990 } 1028 }
991#endif 1029#endif
992 rt_hash_table[hash].chain = rt; 1030 rt_hash_table[hash].chain = rt;
993 spin_unlock_bh(&rt_hash_table[hash].lock); 1031 spin_unlock_bh(rt_hash_lock_addr(hash));
994 *rp = rt; 1032 *rp = rt;
995 return 0; 1033 return 0;
996} 1034}
@@ -1058,7 +1096,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1058{ 1096{
1059 struct rtable **rthp; 1097 struct rtable **rthp;
1060 1098
1061 spin_lock_bh(&rt_hash_table[hash].lock); 1099 spin_lock_bh(rt_hash_lock_addr(hash));
1062 ip_rt_put(rt); 1100 ip_rt_put(rt);
1063 for (rthp = &rt_hash_table[hash].chain; *rthp; 1101 for (rthp = &rt_hash_table[hash].chain; *rthp;
1064 rthp = &(*rthp)->u.rt_next) 1102 rthp = &(*rthp)->u.rt_next)
@@ -1067,7 +1105,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1067 rt_free(rt); 1105 rt_free(rt);
1068 break; 1106 break;
1069 } 1107 }
1070 spin_unlock_bh(&rt_hash_table[hash].lock); 1108 spin_unlock_bh(rt_hash_lock_addr(hash));
1071} 1109}
1072 1110
1073void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, 1111void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -3073,12 +3111,14 @@ __setup("rhash_entries=", set_rhash_entries);
3073 3111
3074int __init ip_rt_init(void) 3112int __init ip_rt_init(void)
3075{ 3113{
3076 int i, order, goal, rc = 0; 3114 int rc = 0;
3077 3115
3078 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ 3116 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3079 (jiffies ^ (jiffies >> 7))); 3117 (jiffies ^ (jiffies >> 7)));
3080 3118
3081#ifdef CONFIG_NET_CLS_ROUTE 3119#ifdef CONFIG_NET_CLS_ROUTE
3120 {
3121 int order;
3082 for (order = 0; 3122 for (order = 0;
3083 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) 3123 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3084 /* NOTHING */; 3124 /* NOTHING */;
@@ -3086,6 +3126,7 @@ int __init ip_rt_init(void)
3086 if (!ip_rt_acct) 3126 if (!ip_rt_acct)
3087 panic("IP: failed to allocate ip_rt_acct\n"); 3127 panic("IP: failed to allocate ip_rt_acct\n");
3088 memset(ip_rt_acct, 0, PAGE_SIZE << order); 3128 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3129 }
3089#endif 3130#endif
3090 3131
3091 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", 3132 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
@@ -3096,36 +3137,19 @@ int __init ip_rt_init(void)
3096 if (!ipv4_dst_ops.kmem_cachep) 3137 if (!ipv4_dst_ops.kmem_cachep)
3097 panic("IP: failed to allocate ip_dst_cache\n"); 3138 panic("IP: failed to allocate ip_dst_cache\n");
3098 3139
3099 goal = num_physpages >> (26 - PAGE_SHIFT); 3140 rt_hash_table = (struct rt_hash_bucket *)
3100 if (rhash_entries) 3141 alloc_large_system_hash("IP route cache",
3101 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; 3142 sizeof(struct rt_hash_bucket),
3102 for (order = 0; (1UL << order) < goal; order++) 3143 rhash_entries,
3103 /* NOTHING */; 3144 (num_physpages >= 128 * 1024) ?
3104 3145 (27 - PAGE_SHIFT) :
3105 do { 3146 (29 - PAGE_SHIFT),
3106 rt_hash_mask = (1UL << order) * PAGE_SIZE / 3147 HASH_HIGHMEM,
3107 sizeof(struct rt_hash_bucket); 3148 &rt_hash_log,
3108 while (rt_hash_mask & (rt_hash_mask - 1)) 3149 &rt_hash_mask,
3109 rt_hash_mask--; 3150 0);
3110 rt_hash_table = (struct rt_hash_bucket *) 3151 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3111 __get_free_pages(GFP_ATOMIC, order); 3152 rt_hash_lock_init();
3112 } while (rt_hash_table == NULL && --order > 0);
3113
3114 if (!rt_hash_table)
3115 panic("Failed to allocate IP route cache hash table\n");
3116
3117 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3118 rt_hash_mask,
3119 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3120
3121 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3122 /* NOTHING */;
3123
3124 rt_hash_mask--;
3125 for (i = 0; i <= rt_hash_mask; i++) {
3126 spin_lock_init(&rt_hash_table[i].lock);
3127 rt_hash_table[i].chain = NULL;
3128 }
3129 3153
3130 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); 3154 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3131 ip_rt_max_size = (rt_hash_mask + 1) * 16; 3155 ip_rt_max_size = (rt_hash_mask + 1) * 16;