Diffstat (limited to 'net/ipv4/route.c')
 -rw-r--r--  net/ipv4/route.c | 128
 1 file changed, 76 insertions(+), 52 deletions(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 12a1cf306f67..d675ff80b04d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -54,6 +54,7 @@
  *	Marc Boucher	:	routing by fwmark
  *	Robert Olsson	:	Added rt_cache statistics
  *	Arnaldo C. Melo	:	Convert proc stuff to seq_file
+ *	Eric Dumazet	:	hashed spinlocks and rt_check_expire() fixes.
  *
  *	This program is free software; you can redistribute it and/or
  *	modify it under the terms of the GNU General Public License
@@ -70,6 +71,7 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/bootmem.h>
 #include <linux/string.h>
 #include <linux/socket.h>
 #include <linux/sockios.h>
@@ -201,8 +203,37 @@ __u8 ip_tos2prio[16] = {
 
 struct rt_hash_bucket {
 	struct rtable	*chain;
-	spinlock_t	lock;
-} __attribute__((__aligned__(8)));
+};
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+/*
+ * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
+ * The size of this table is a power of two and depends on the number of CPUS.
+ */
+#if NR_CPUS >= 32
+#define RT_HASH_LOCK_SZ	4096
+#elif NR_CPUS >= 16
+#define RT_HASH_LOCK_SZ	2048
+#elif NR_CPUS >= 8
+#define RT_HASH_LOCK_SZ	1024
+#elif NR_CPUS >= 4
+#define RT_HASH_LOCK_SZ	512
+#else
+#define RT_HASH_LOCK_SZ	256
+#endif
+
+static spinlock_t	*rt_hash_locks;
+# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
+# define rt_hash_lock_init()	{ \
+		int i; \
+		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
+		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
+		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
+			spin_lock_init(&rt_hash_locks[i]); \
+	}
+#else
+# define rt_hash_lock_addr(slot) NULL
+# define rt_hash_lock_init()
+#endif
 
 static struct rt_hash_bucket	*rt_hash_table;
 static unsigned			rt_hash_mask;
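The hunk above removes the per-bucket spinlock and replaces it with a fixed, power-of-two pool of locks shared by all buckets, so locking memory no longer grows with the table. A minimal userspace sketch of the same lock-striping pattern — names are hypothetical and pthread mutexes stand in for kernel spinlocks:

#include <pthread.h>

#define LOCK_POOL_SZ 256	/* power of two, like RT_HASH_LOCK_SZ */

static pthread_mutex_t lock_pool[LOCK_POOL_SZ];

/* Map a bucket index to its shared lock, as rt_hash_lock_addr() does.
 * Masking with (size - 1) works because the pool size is a power of two. */
static pthread_mutex_t *bucket_lock(unsigned int slot)
{
	return &lock_pool[slot & (LOCK_POOL_SZ - 1)];
}

static void lock_pool_init(void)
{
	for (int i = 0; i < LOCK_POOL_SZ; i++)
		pthread_mutex_init(&lock_pool[i], NULL);
}

Two chains that hash to the same pool slot contend on one lock, but with the pool sized to the CPU count (4096 locks for NR_CPUS >= 32, so a 2^17-bucket table shares each lock among only 32 chains) the collision cost stays small, while each bucket shrinks to a single pointer.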
@@ -575,19 +606,26 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
 /* This runs via a timer and thus is always in BH context. */
 static void rt_check_expire(unsigned long dummy)
 {
-	static int rover;
-	int i = rover, t;
+	static unsigned int rover;
+	unsigned int i = rover, goal;
 	struct rtable *rth, **rthp;
 	unsigned long now = jiffies;
-
-	for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
-	     t -= ip_rt_gc_timeout) {
+	u64 mult;
+
+	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
+	if (ip_rt_gc_timeout > 1)
+		do_div(mult, ip_rt_gc_timeout);
+	goal = (unsigned int)mult;
+	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
+	for (; goal > 0; goal--) {
 		unsigned long tmo = ip_rt_gc_timeout;
 
 		i = (i + 1) & rt_hash_mask;
 		rthp = &rt_hash_table[i].chain;
 
-		spin_lock(&rt_hash_table[i].lock);
+		if (*rthp == 0)
+			continue;
+		spin_lock(rt_hash_lock_addr(i));
 		while ((rth = *rthp) != NULL) {
 			if (rth->u.dst.expires) {
 				/* Entry is expired even if it is in use */
@@ -620,14 +658,14 @@ static void rt_check_expire(unsigned long dummy)
 			rt_free(rth);
 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 		}
-		spin_unlock(&rt_hash_table[i].lock);
+		spin_unlock(rt_hash_lock_addr(i));
 
 		/* Fallback loop breaker. */
 		if (time_after(jiffies, now))
 			break;
 	}
 	rover = i;
-	mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
+	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
 }
 
 /* This can run from both BH and non-BH contexts, the latter
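Several fixes ride along in rt_check_expire() above. The scan budget goal = (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout is now computed in 64 bits (u64 plus do_div) so the shift cannot overflow an int on large tables, and is clamped to the table size; empty chains are skipped before taking the lock; and mod_timer() reschedules relative to the current jiffies rather than the now captured on entry, so a long scan cannot leave the next expiry in the past. A standalone check of the goal arithmetic — the figures are illustrative, not kernel defaults:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int rt_hash_log = 17;	/* 2^17 = 131072 buckets */
	unsigned int rt_hash_mask = (1u << rt_hash_log) - 1;
	uint64_t gc_interval = 60;	/* timer period, in ticks */
	uint64_t gc_timeout = 300;	/* full-sweep deadline, in ticks */

	/* Same computation as the patch; do_div() is just 64-bit division. */
	uint64_t mult = gc_interval << rt_hash_log;
	unsigned int goal = (unsigned int)(mult / gc_timeout);
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;

	/* 131072 * 60 / 300 = 26214: about a fifth of the table per run,
	 * so five consecutive runs cover every bucket once per gc_timeout. */
	printf("scan %u of %u buckets per run\n", goal, rt_hash_mask + 1);
	return 0;
}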
@@ -643,11 +681,11 @@ static void rt_run_flush(unsigned long dummy)
 	get_random_bytes(&rt_hash_rnd, 4);
 
 	for (i = rt_hash_mask; i >= 0; i--) {
-		spin_lock_bh(&rt_hash_table[i].lock);
+		spin_lock_bh(rt_hash_lock_addr(i));
 		rth = rt_hash_table[i].chain;
 		if (rth)
 			rt_hash_table[i].chain = NULL;
-		spin_unlock_bh(&rt_hash_table[i].lock);
+		spin_unlock_bh(rt_hash_lock_addr(i));
 
 		for (; rth; rth = next) {
 			next = rth->u.rt_next;
@@ -780,7 +818,7 @@ static int rt_garbage_collect(void)
 
 			k = (k + 1) & rt_hash_mask;
 			rthp = &rt_hash_table[k].chain;
-			spin_lock_bh(&rt_hash_table[k].lock);
+			spin_lock_bh(rt_hash_lock_addr(k));
 			while ((rth = *rthp) != NULL) {
 				if (!rt_may_expire(rth, tmo, expire)) {
 					tmo >>= 1;
@@ -812,7 +850,7 @@ static int rt_garbage_collect(void)
 					goal--;
 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 			}
-			spin_unlock_bh(&rt_hash_table[k].lock);
+			spin_unlock_bh(rt_hash_lock_addr(k));
 			if (goal <= 0)
 				break;
 		}
@@ -882,7 +920,7 @@ restart:
 
 	rthp = &rt_hash_table[hash].chain;
 
-	spin_lock_bh(&rt_hash_table[hash].lock);
+	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 		if (!(rth->u.dst.flags & DST_BALANCED) &&
@@ -908,7 +946,7 @@ restart:
 			rth->u.dst.__use++;
 			dst_hold(&rth->u.dst);
 			rth->u.dst.lastuse = now;
-			spin_unlock_bh(&rt_hash_table[hash].lock);
+			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			rt_drop(rt);
 			*rp = rth;
@@ -949,7 +987,7 @@ restart:
 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 		int err = arp_bind_neighbour(&rt->u.dst);
 		if (err) {
-			spin_unlock_bh(&rt_hash_table[hash].lock);
+			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			if (err != -ENOBUFS) {
 				rt_drop(rt);
@@ -990,7 +1028,7 @@ restart:
 	}
 #endif
 	rt_hash_table[hash].chain = rt;
-	spin_unlock_bh(&rt_hash_table[hash].lock);
+	spin_unlock_bh(rt_hash_lock_addr(hash));
 	*rp = rt;
 	return 0;
 }
@@ -1058,7 +1096,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
 {
 	struct rtable **rthp;
 
-	spin_lock_bh(&rt_hash_table[hash].lock);
+	spin_lock_bh(rt_hash_lock_addr(hash));
 	ip_rt_put(rt);
 	for (rthp = &rt_hash_table[hash].chain; *rthp;
 	     rthp = &(*rthp)->u.rt_next)
@@ -1067,7 +1105,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
 			rt_free(rt);
 			break;
 		}
-	spin_unlock_bh(&rt_hash_table[hash].lock);
+	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -1647,7 +1685,7 @@ static void ip_handle_martian_source(struct net_device *dev,
 	printk(KERN_WARNING "martian source %u.%u.%u.%u from "
 		"%u.%u.%u.%u, on dev %s\n",
 		NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
-	if (dev->hard_header_len) {
+	if (dev->hard_header_len && skb->mac.raw) {
 		int i;
 		unsigned char *p = skb->mac.raw;
 		printk(KERN_WARNING "ll header: ");
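The one-line change above is a NULL guard: skb->mac.raw is only set once the link-layer header has been recorded, presumably because some input paths reach this warning without one, so the dump loop must check the pointer as well as the device's claimed header length. The shape of the fix in isolation, using hypothetical stand-in types rather than struct sk_buff:

#include <stdio.h>

struct pkt {
	unsigned char *ll_header;	/* may be NULL if never recorded */
	int ll_header_len;		/* device's claimed header length */
};

static void dump_ll_header(const struct pkt *p)
{
	/* Both tests are needed: a nonzero length from the device does not
	 * guarantee the header pointer was filled in for this packet. */
	if (p->ll_header_len && p->ll_header) {
		for (int i = 0; i < p->ll_header_len; i++)
			printf("%02x%c", p->ll_header[i],
			       i < p->ll_header_len - 1 ? ':' : '\n');
	}
}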
@@ -3073,12 +3111,14 @@ __setup("rhash_entries=", set_rhash_entries);
 
 int __init ip_rt_init(void)
 {
-	int i, order, goal, rc = 0;
+	int rc = 0;
 
 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
 			     (jiffies ^ (jiffies >> 7)));
 
 #ifdef CONFIG_NET_CLS_ROUTE
+	{
+	int order;
 	for (order = 0;
 	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
 		/* NOTHING */;
@@ -3086,6 +3126,7 @@ int __init ip_rt_init(void)
 	if (!ip_rt_acct)
 		panic("IP: failed to allocate ip_rt_acct\n");
 	memset(ip_rt_acct, 0, PAGE_SIZE << order);
+	}
 #endif
 
 	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
@@ -3096,36 +3137,19 @@ int __init ip_rt_init(void)
 	if (!ipv4_dst_ops.kmem_cachep)
 		panic("IP: failed to allocate ip_dst_cache\n");
 
-	goal = num_physpages >> (26 - PAGE_SHIFT);
-	if (rhash_entries)
-		goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
-	for (order = 0; (1UL << order) < goal; order++)
-		/* NOTHING */;
-
-	do {
-		rt_hash_mask = (1UL << order) * PAGE_SIZE /
-			sizeof(struct rt_hash_bucket);
-		while (rt_hash_mask & (rt_hash_mask - 1))
-			rt_hash_mask--;
-		rt_hash_table = (struct rt_hash_bucket *)
-			__get_free_pages(GFP_ATOMIC, order);
-	} while (rt_hash_table == NULL && --order > 0);
-
-	if (!rt_hash_table)
-		panic("Failed to allocate IP route cache hash table\n");
-
-	printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
-	       rt_hash_mask,
-	       (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
-
-	for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
-		/* NOTHING */;
-
-	rt_hash_mask--;
-	for (i = 0; i <= rt_hash_mask; i++) {
-		spin_lock_init(&rt_hash_table[i].lock);
-		rt_hash_table[i].chain = NULL;
-	}
+	rt_hash_table = (struct rt_hash_bucket *)
+		alloc_large_system_hash("IP route cache",
+					sizeof(struct rt_hash_bucket),
+					rhash_entries,
+					(num_physpages >= 128 * 1024) ?
+						(27 - PAGE_SHIFT) :
+						(29 - PAGE_SHIFT),
+					HASH_HIGHMEM,
+					&rt_hash_log,
+					&rt_hash_mask,
+					0);
+	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
+	rt_hash_lock_init();
 
 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
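With the lock pool decoupled from the buckets, the init path can hand table sizing to alloc_large_system_hash(), which scales the table to available memory (the fourth argument is, to my reading, log2 of the bytes of memory per bucket: 27 - PAGE_SHIFT is 15 on 4 KB-page systems, roughly one bucket per 32 KB on machines with at least 128 * 1024 pages, one per 128 KB below that), and rt_hash_lock_init() allocates the fixed pool. Rough memory arithmetic for the locking change, under assumed structure sizes rather than measured kernel values:

#include <stdio.h>

int main(void)
{
	unsigned long buckets = 1ul << 17;	/* e.g. a 131072-bucket cache */
	unsigned long ptr_sz = 8;		/* 64-bit chain pointer */
	unsigned long lock_sz = 4;		/* assumed spinlock_t size */

	/* Old layout: pointer + spinlock per bucket, padded out to 16 bytes
	 * by __attribute__((__aligned__(8))). */
	unsigned long old_bytes = buckets * (ptr_sz + lock_sz + 4 /* pad */);
	/* New layout: pointer-only buckets plus a fixed pool of
	 * RT_HASH_LOCK_SZ = 4096 locks (the NR_CPUS >= 32 case). */
	unsigned long new_bytes = buckets * ptr_sz + 4096 * lock_sz;

	printf("old: %lu KB, new: %lu KB\n", old_bytes >> 10, new_bytes >> 10);
	return 0;
}

Under those assumptions the table shrinks from about 2 MB to about 1 MB, halving the cache footprint of a chain walk, while the 16 KB lock pool is small enough to stay resident in cache — which is the point of separating it from the buckets.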
