diff options
author | Neil Horman <nhorman@tuxdriver.com> | 2008-10-27 15:28:25 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2008-10-27 20:06:14 -0400 |
commit | 1080d709fb9d8cd4392f93476ee46a9d6ea05a5b (patch) | |
tree | b87a162b10d98f7d44657d7c9a98bbc7385159fd /net | |
parent | 1d63e726408dfdb3e10ed8f00c383b30ebb333d3 (diff) |
net: implement emergency route cache rebulds when gc_elasticity is exceeded
This is a patch to provide on demand route cache rebuilding. Currently, our
route cache is rebulid periodically regardless of need. This introduced
unneeded periodic latency. This patch offers a better approach. Using code
provided by Eric Dumazet, we compute the standard deviation of the average hash
bucket chain length while running rt_check_expire. Should any given chain
length grow to larger that average plus 4 standard deviations, we trigger an
emergency hash table rebuild for that net namespace. This allows for the common
case in which chains are well behaved and do not grow unevenly to not incur any
latency at all, while those systems (which may be being maliciously attacked),
only rebuild when the attack is detected. This patch take 2 other factors into
account:
1) chains with multiple entries that differ by attributes that do not affect the
hash value are only counted once, so as not to unduly bias system to rebuilding
if features like QOS are heavily used
2) if rebuilding crosses a certain threshold (which is adjustable via the added
sysctl in this patch), route caching is disabled entirely for that net
namespace, since constant rebuilding is less efficient that no caching at all
Tested successfully by me.
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r-- | net/ipv4/route.c | 132 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 12 |
2 files changed, 142 insertions, 2 deletions
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2ea6dcc3e2cc..21ce7e1b2284 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -129,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; | |||
129 | static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; | 129 | static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; |
130 | static int ip_rt_min_advmss __read_mostly = 256; | 130 | static int ip_rt_min_advmss __read_mostly = 256; |
131 | static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ; | 131 | static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ; |
132 | static int rt_chain_length_max __read_mostly = 20; | ||
132 | 133 | ||
133 | static void rt_worker_func(struct work_struct *work); | 134 | static void rt_worker_func(struct work_struct *work); |
134 | static DECLARE_DELAYED_WORK(expires_work, rt_worker_func); | 135 | static DECLARE_DELAYED_WORK(expires_work, rt_worker_func); |
@@ -145,6 +146,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); | |||
145 | static void ipv4_link_failure(struct sk_buff *skb); | 146 | static void ipv4_link_failure(struct sk_buff *skb); |
146 | static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); | 147 | static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); |
147 | static int rt_garbage_collect(struct dst_ops *ops); | 148 | static int rt_garbage_collect(struct dst_ops *ops); |
149 | static void rt_emergency_hash_rebuild(struct net *net); | ||
148 | 150 | ||
149 | 151 | ||
150 | static struct dst_ops ipv4_dst_ops = { | 152 | static struct dst_ops ipv4_dst_ops = { |
@@ -201,6 +203,7 @@ const __u8 ip_tos2prio[16] = { | |||
201 | struct rt_hash_bucket { | 203 | struct rt_hash_bucket { |
202 | struct rtable *chain; | 204 | struct rtable *chain; |
203 | }; | 205 | }; |
206 | |||
204 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ | 207 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ |
205 | defined(CONFIG_PROVE_LOCKING) | 208 | defined(CONFIG_PROVE_LOCKING) |
206 | /* | 209 | /* |
@@ -674,6 +677,20 @@ static inline u32 rt_score(struct rtable *rt) | |||
674 | return score; | 677 | return score; |
675 | } | 678 | } |
676 | 679 | ||
680 | static inline bool rt_caching(const struct net *net) | ||
681 | { | ||
682 | return net->ipv4.current_rt_cache_rebuild_count <= | ||
683 | net->ipv4.sysctl_rt_cache_rebuild_count; | ||
684 | } | ||
685 | |||
686 | static inline bool compare_hash_inputs(const struct flowi *fl1, | ||
687 | const struct flowi *fl2) | ||
688 | { | ||
689 | return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | | ||
690 | (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) | | ||
691 | (fl1->iif ^ fl2->iif)) == 0); | ||
692 | } | ||
693 | |||
677 | static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) | 694 | static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) |
678 | { | 695 | { |
679 | return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | | 696 | return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | |
@@ -753,11 +770,24 @@ static void rt_do_flush(int process_context) | |||
753 | } | 770 | } |
754 | } | 771 | } |
755 | 772 | ||
773 | /* | ||
774 | * While freeing expired entries, we compute average chain length | ||
775 | * and standard deviation, using fixed-point arithmetic. | ||
776 | * This to have an estimation of rt_chain_length_max | ||
777 | * rt_chain_length_max = max(elasticity, AVG + 4*SD) | ||
778 | * We use 3 bits for frational part, and 29 (or 61) for magnitude. | ||
779 | */ | ||
780 | |||
781 | #define FRACT_BITS 3 | ||
782 | #define ONE (1UL << FRACT_BITS) | ||
783 | |||
756 | static void rt_check_expire(void) | 784 | static void rt_check_expire(void) |
757 | { | 785 | { |
758 | static unsigned int rover; | 786 | static unsigned int rover; |
759 | unsigned int i = rover, goal; | 787 | unsigned int i = rover, goal; |
760 | struct rtable *rth, **rthp; | 788 | struct rtable *rth, **rthp; |
789 | unsigned long length = 0, samples = 0; | ||
790 | unsigned long sum = 0, sum2 = 0; | ||
761 | u64 mult; | 791 | u64 mult; |
762 | 792 | ||
763 | mult = ((u64)ip_rt_gc_interval) << rt_hash_log; | 793 | mult = ((u64)ip_rt_gc_interval) << rt_hash_log; |
@@ -766,6 +796,7 @@ static void rt_check_expire(void) | |||
766 | goal = (unsigned int)mult; | 796 | goal = (unsigned int)mult; |
767 | if (goal > rt_hash_mask) | 797 | if (goal > rt_hash_mask) |
768 | goal = rt_hash_mask + 1; | 798 | goal = rt_hash_mask + 1; |
799 | length = 0; | ||
769 | for (; goal > 0; goal--) { | 800 | for (; goal > 0; goal--) { |
770 | unsigned long tmo = ip_rt_gc_timeout; | 801 | unsigned long tmo = ip_rt_gc_timeout; |
771 | 802 | ||
@@ -775,6 +806,8 @@ static void rt_check_expire(void) | |||
775 | if (need_resched()) | 806 | if (need_resched()) |
776 | cond_resched(); | 807 | cond_resched(); |
777 | 808 | ||
809 | samples++; | ||
810 | |||
778 | if (*rthp == NULL) | 811 | if (*rthp == NULL) |
779 | continue; | 812 | continue; |
780 | spin_lock_bh(rt_hash_lock_addr(i)); | 813 | spin_lock_bh(rt_hash_lock_addr(i)); |
@@ -789,11 +822,29 @@ static void rt_check_expire(void) | |||
789 | if (time_before_eq(jiffies, rth->u.dst.expires)) { | 822 | if (time_before_eq(jiffies, rth->u.dst.expires)) { |
790 | tmo >>= 1; | 823 | tmo >>= 1; |
791 | rthp = &rth->u.dst.rt_next; | 824 | rthp = &rth->u.dst.rt_next; |
825 | /* | ||
826 | * Only bump our length if the hash | ||
827 | * inputs on entries n and n+1 are not | ||
828 | * the same, we only count entries on | ||
829 | * a chain with equal hash inputs once | ||
830 | * so that entries for different QOS | ||
831 | * levels, and other non-hash input | ||
832 | * attributes don't unfairly skew | ||
833 | * the length computation | ||
834 | */ | ||
835 | if ((*rthp == NULL) || | ||
836 | !compare_hash_inputs(&(*rthp)->fl, | ||
837 | &rth->fl)) | ||
838 | length += ONE; | ||
792 | continue; | 839 | continue; |
793 | } | 840 | } |
794 | } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { | 841 | } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { |
795 | tmo >>= 1; | 842 | tmo >>= 1; |
796 | rthp = &rth->u.dst.rt_next; | 843 | rthp = &rth->u.dst.rt_next; |
844 | if ((*rthp == NULL) || | ||
845 | !compare_hash_inputs(&(*rthp)->fl, | ||
846 | &rth->fl)) | ||
847 | length += ONE; | ||
797 | continue; | 848 | continue; |
798 | } | 849 | } |
799 | 850 | ||
@@ -802,6 +853,15 @@ static void rt_check_expire(void) | |||
802 | rt_free(rth); | 853 | rt_free(rth); |
803 | } | 854 | } |
804 | spin_unlock_bh(rt_hash_lock_addr(i)); | 855 | spin_unlock_bh(rt_hash_lock_addr(i)); |
856 | sum += length; | ||
857 | sum2 += length*length; | ||
858 | } | ||
859 | if (samples) { | ||
860 | unsigned long avg = sum / samples; | ||
861 | unsigned long sd = int_sqrt(sum2 / samples - avg*avg); | ||
862 | rt_chain_length_max = max_t(unsigned long, | ||
863 | ip_rt_gc_elasticity, | ||
864 | (avg + 4*sd) >> FRACT_BITS); | ||
805 | } | 865 | } |
806 | rover = i; | 866 | rover = i; |
807 | } | 867 | } |
@@ -851,6 +911,26 @@ static void rt_secret_rebuild(unsigned long __net) | |||
851 | mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); | 911 | mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); |
852 | } | 912 | } |
853 | 913 | ||
914 | static void rt_secret_rebuild_oneshot(struct net *net) | ||
915 | { | ||
916 | del_timer_sync(&net->ipv4.rt_secret_timer); | ||
917 | rt_cache_invalidate(net); | ||
918 | if (ip_rt_secret_interval) { | ||
919 | net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval; | ||
920 | add_timer(&net->ipv4.rt_secret_timer); | ||
921 | } | ||
922 | } | ||
923 | |||
924 | static void rt_emergency_hash_rebuild(struct net *net) | ||
925 | { | ||
926 | if (net_ratelimit()) { | ||
927 | printk(KERN_WARNING "Route hash chain too long!\n"); | ||
928 | printk(KERN_WARNING "Adjust your secret_interval!\n"); | ||
929 | } | ||
930 | |||
931 | rt_secret_rebuild_oneshot(net); | ||
932 | } | ||
933 | |||
854 | /* | 934 | /* |
855 | Short description of GC goals. | 935 | Short description of GC goals. |
856 | 936 | ||
@@ -989,6 +1069,7 @@ out: return 0; | |||
989 | static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) | 1069 | static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) |
990 | { | 1070 | { |
991 | struct rtable *rth, **rthp; | 1071 | struct rtable *rth, **rthp; |
1072 | struct rtable *rthi; | ||
992 | unsigned long now; | 1073 | unsigned long now; |
993 | struct rtable *cand, **candp; | 1074 | struct rtable *cand, **candp; |
994 | u32 min_score; | 1075 | u32 min_score; |
@@ -1002,7 +1083,13 @@ restart: | |||
1002 | candp = NULL; | 1083 | candp = NULL; |
1003 | now = jiffies; | 1084 | now = jiffies; |
1004 | 1085 | ||
1086 | if (!rt_caching(dev_net(rt->u.dst.dev))) { | ||
1087 | rt_drop(rt); | ||
1088 | return 0; | ||
1089 | } | ||
1090 | |||
1005 | rthp = &rt_hash_table[hash].chain; | 1091 | rthp = &rt_hash_table[hash].chain; |
1092 | rthi = NULL; | ||
1006 | 1093 | ||
1007 | spin_lock_bh(rt_hash_lock_addr(hash)); | 1094 | spin_lock_bh(rt_hash_lock_addr(hash)); |
1008 | while ((rth = *rthp) != NULL) { | 1095 | while ((rth = *rthp) != NULL) { |
@@ -1048,6 +1135,17 @@ restart: | |||
1048 | chain_length++; | 1135 | chain_length++; |
1049 | 1136 | ||
1050 | rthp = &rth->u.dst.rt_next; | 1137 | rthp = &rth->u.dst.rt_next; |
1138 | |||
1139 | /* | ||
1140 | * check to see if the next entry in the chain | ||
1141 | * contains the same hash input values as rt. If it does | ||
1142 | * This is where we will insert into the list, instead of | ||
1143 | * at the head. This groups entries that differ by aspects not | ||
1144 | * relvant to the hash function together, which we use to adjust | ||
1145 | * our chain length | ||
1146 | */ | ||
1147 | if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl)) | ||
1148 | rthi = rth; | ||
1051 | } | 1149 | } |
1052 | 1150 | ||
1053 | if (cand) { | 1151 | if (cand) { |
@@ -1061,6 +1159,16 @@ restart: | |||
1061 | *candp = cand->u.dst.rt_next; | 1159 | *candp = cand->u.dst.rt_next; |
1062 | rt_free(cand); | 1160 | rt_free(cand); |
1063 | } | 1161 | } |
1162 | } else { | ||
1163 | if (chain_length > rt_chain_length_max) { | ||
1164 | struct net *net = dev_net(rt->u.dst.dev); | ||
1165 | int num = ++net->ipv4.current_rt_cache_rebuild_count; | ||
1166 | if (!rt_caching(dev_net(rt->u.dst.dev))) { | ||
1167 | printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", | ||
1168 | rt->u.dst.dev->name, num); | ||
1169 | } | ||
1170 | rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev)); | ||
1171 | } | ||
1064 | } | 1172 | } |
1065 | 1173 | ||
1066 | /* Try to bind route to arp only if it is output | 1174 | /* Try to bind route to arp only if it is output |
@@ -1098,7 +1206,11 @@ restart: | |||
1098 | } | 1206 | } |
1099 | } | 1207 | } |
1100 | 1208 | ||
1101 | rt->u.dst.rt_next = rt_hash_table[hash].chain; | 1209 | if (rthi) |
1210 | rt->u.dst.rt_next = rthi->u.dst.rt_next; | ||
1211 | else | ||
1212 | rt->u.dst.rt_next = rt_hash_table[hash].chain; | ||
1213 | |||
1102 | #if RT_CACHE_DEBUG >= 2 | 1214 | #if RT_CACHE_DEBUG >= 2 |
1103 | if (rt->u.dst.rt_next) { | 1215 | if (rt->u.dst.rt_next) { |
1104 | struct rtable *trt; | 1216 | struct rtable *trt; |
@@ -1114,7 +1226,11 @@ restart: | |||
1114 | * previous writes to rt are comitted to memory | 1226 | * previous writes to rt are comitted to memory |
1115 | * before making rt visible to other CPUS. | 1227 | * before making rt visible to other CPUS. |
1116 | */ | 1228 | */ |
1117 | rcu_assign_pointer(rt_hash_table[hash].chain, rt); | 1229 | if (rthi) |
1230 | rcu_assign_pointer(rthi->u.dst.rt_next, rt); | ||
1231 | else | ||
1232 | rcu_assign_pointer(rt_hash_table[hash].chain, rt); | ||
1233 | |||
1118 | spin_unlock_bh(rt_hash_lock_addr(hash)); | 1234 | spin_unlock_bh(rt_hash_lock_addr(hash)); |
1119 | *rp = rt; | 1235 | *rp = rt; |
1120 | return 0; | 1236 | return 0; |
@@ -1217,6 +1333,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, | |||
1217 | || ipv4_is_zeronet(new_gw)) | 1333 | || ipv4_is_zeronet(new_gw)) |
1218 | goto reject_redirect; | 1334 | goto reject_redirect; |
1219 | 1335 | ||
1336 | if (!rt_caching(net)) | ||
1337 | goto reject_redirect; | ||
1338 | |||
1220 | if (!IN_DEV_SHARED_MEDIA(in_dev)) { | 1339 | if (!IN_DEV_SHARED_MEDIA(in_dev)) { |
1221 | if (!inet_addr_onlink(in_dev, new_gw, old_gw)) | 1340 | if (!inet_addr_onlink(in_dev, new_gw, old_gw)) |
1222 | goto reject_redirect; | 1341 | goto reject_redirect; |
@@ -2130,6 +2249,10 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2130 | struct net *net; | 2249 | struct net *net; |
2131 | 2250 | ||
2132 | net = dev_net(dev); | 2251 | net = dev_net(dev); |
2252 | |||
2253 | if (!rt_caching(net)) | ||
2254 | goto skip_cache; | ||
2255 | |||
2133 | tos &= IPTOS_RT_MASK; | 2256 | tos &= IPTOS_RT_MASK; |
2134 | hash = rt_hash(daddr, saddr, iif, rt_genid(net)); | 2257 | hash = rt_hash(daddr, saddr, iif, rt_genid(net)); |
2135 | 2258 | ||
@@ -2154,6 +2277,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2154 | } | 2277 | } |
2155 | rcu_read_unlock(); | 2278 | rcu_read_unlock(); |
2156 | 2279 | ||
2280 | skip_cache: | ||
2157 | /* Multicast recognition logic is moved from route cache to here. | 2281 | /* Multicast recognition logic is moved from route cache to here. |
2158 | The problem was that too many Ethernet cards have broken/missing | 2282 | The problem was that too many Ethernet cards have broken/missing |
2159 | hardware multicast filters :-( As result the host on multicasting | 2283 | hardware multicast filters :-( As result the host on multicasting |
@@ -2539,6 +2663,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, | |||
2539 | unsigned hash; | 2663 | unsigned hash; |
2540 | struct rtable *rth; | 2664 | struct rtable *rth; |
2541 | 2665 | ||
2666 | if (!rt_caching(net)) | ||
2667 | goto slow_output; | ||
2668 | |||
2542 | hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); | 2669 | hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); |
2543 | 2670 | ||
2544 | rcu_read_lock_bh(); | 2671 | rcu_read_lock_bh(); |
@@ -2563,6 +2690,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, | |||
2563 | } | 2690 | } |
2564 | rcu_read_unlock_bh(); | 2691 | rcu_read_unlock_bh(); |
2565 | 2692 | ||
2693 | slow_output: | ||
2566 | return ip_route_output_slow(net, rp, flp); | 2694 | return ip_route_output_slow(net, rp, flp); |
2567 | } | 2695 | } |
2568 | 2696 | ||
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 1bb10df8ce7d..0cc8d31f9ac0 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -795,6 +795,14 @@ static struct ctl_table ipv4_net_table[] = { | |||
795 | .mode = 0644, | 795 | .mode = 0644, |
796 | .proc_handler = &proc_dointvec | 796 | .proc_handler = &proc_dointvec |
797 | }, | 797 | }, |
798 | { | ||
799 | .ctl_name = CTL_UNNUMBERED, | ||
800 | .procname = "rt_cache_rebuild_count", | ||
801 | .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count, | ||
802 | .maxlen = sizeof(int), | ||
803 | .mode = 0644, | ||
804 | .proc_handler = &proc_dointvec | ||
805 | }, | ||
798 | { } | 806 | { } |
799 | }; | 807 | }; |
800 | 808 | ||
@@ -827,8 +835,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) | |||
827 | &net->ipv4.sysctl_icmp_ratelimit; | 835 | &net->ipv4.sysctl_icmp_ratelimit; |
828 | table[5].data = | 836 | table[5].data = |
829 | &net->ipv4.sysctl_icmp_ratemask; | 837 | &net->ipv4.sysctl_icmp_ratemask; |
838 | table[6].data = | ||
839 | &net->ipv4.sysctl_rt_cache_rebuild_count; | ||
830 | } | 840 | } |
831 | 841 | ||
842 | net->ipv4.sysctl_rt_cache_rebuild_count = 4; | ||
843 | |||
832 | net->ipv4.ipv4_hdr = register_net_sysctl_table(net, | 844 | net->ipv4.ipv4_hdr = register_net_sysctl_table(net, |
833 | net_ipv4_ctl_path, table); | 845 | net_ipv4_ctl_path, table); |
834 | if (net->ipv4.ipv4_hdr == NULL) | 846 | if (net->ipv4.ipv4_hdr == NULL) |