diff options
-rw-r--r-- | Documentation/networking/ip-sysctl.txt | 6 | ||||
-rw-r--r-- | include/net/netns/ipv4.h | 2 | ||||
-rw-r--r-- | net/ipv4/route.c | 132 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 12 |
4 files changed, 150 insertions, 2 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index d84932650fd3..c7712787933c 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt | |||
@@ -27,6 +27,12 @@ min_adv_mss - INTEGER | |||
27 | The advertised MSS depends on the first hop route MTU, but will | 27 | The advertised MSS depends on the first hop route MTU, but will |
28 | never be lower than this setting. | 28 | never be lower than this setting. |
29 | 29 | ||
30 | rt_cache_rebuild_count - INTEGER | ||
31 | The per net-namespace route cache emergency rebuild threshold. | ||
32 | Any net-namespace having its route cache rebuilt due to | ||
33 | a hash bucket chain being too long more than this many times | ||
34 | will have its route caching disabled. | ||
35 | |||
30 | IP Fragmentation: | 36 | IP Fragmentation: |
31 | 37 | ||
32 | ipfrag_high_thresh - INTEGER | 38 | ipfrag_high_thresh - INTEGER |
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index ece1c926b5d1..977f482d97a9 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h | |||
@@ -49,6 +49,8 @@ struct netns_ipv4 { | |||
49 | int sysctl_icmp_ratelimit; | 49 | int sysctl_icmp_ratelimit; |
50 | int sysctl_icmp_ratemask; | 50 | int sysctl_icmp_ratemask; |
51 | int sysctl_icmp_errors_use_inbound_ifaddr; | 51 | int sysctl_icmp_errors_use_inbound_ifaddr; |
52 | int sysctl_rt_cache_rebuild_count; | ||
53 | int current_rt_cache_rebuild_count; | ||
52 | 54 | ||
53 | struct timer_list rt_secret_timer; | 55 | struct timer_list rt_secret_timer; |
54 | atomic_t rt_genid; | 56 | atomic_t rt_genid; |
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2ea6dcc3e2cc..21ce7e1b2284 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -129,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; | |||
129 | static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; | 129 | static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; |
130 | static int ip_rt_min_advmss __read_mostly = 256; | 130 | static int ip_rt_min_advmss __read_mostly = 256; |
131 | static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ; | 131 | static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ; |
132 | static int rt_chain_length_max __read_mostly = 20; | ||
132 | 133 | ||
133 | static void rt_worker_func(struct work_struct *work); | 134 | static void rt_worker_func(struct work_struct *work); |
134 | static DECLARE_DELAYED_WORK(expires_work, rt_worker_func); | 135 | static DECLARE_DELAYED_WORK(expires_work, rt_worker_func); |
@@ -145,6 +146,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); | |||
145 | static void ipv4_link_failure(struct sk_buff *skb); | 146 | static void ipv4_link_failure(struct sk_buff *skb); |
146 | static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); | 147 | static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); |
147 | static int rt_garbage_collect(struct dst_ops *ops); | 148 | static int rt_garbage_collect(struct dst_ops *ops); |
149 | static void rt_emergency_hash_rebuild(struct net *net); | ||
148 | 150 | ||
149 | 151 | ||
150 | static struct dst_ops ipv4_dst_ops = { | 152 | static struct dst_ops ipv4_dst_ops = { |
@@ -201,6 +203,7 @@ const __u8 ip_tos2prio[16] = { | |||
201 | struct rt_hash_bucket { | 203 | struct rt_hash_bucket { |
202 | struct rtable *chain; | 204 | struct rtable *chain; |
203 | }; | 205 | }; |
206 | |||
204 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ | 207 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ |
205 | defined(CONFIG_PROVE_LOCKING) | 208 | defined(CONFIG_PROVE_LOCKING) |
206 | /* | 209 | /* |
@@ -674,6 +677,20 @@ static inline u32 rt_score(struct rtable *rt) | |||
674 | return score; | 677 | return score; |
675 | } | 678 | } |
676 | 679 | ||
680 | static inline bool rt_caching(const struct net *net) | ||
681 | { | ||
682 | return net->ipv4.current_rt_cache_rebuild_count <= | ||
683 | net->ipv4.sysctl_rt_cache_rebuild_count; | ||
684 | } | ||
685 | |||
686 | static inline bool compare_hash_inputs(const struct flowi *fl1, | ||
687 | const struct flowi *fl2) | ||
688 | { | ||
689 | return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | | ||
690 | (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) | | ||
691 | (fl1->iif ^ fl2->iif)) == 0); | ||
692 | } | ||
693 | |||
677 | static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) | 694 | static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) |
678 | { | 695 | { |
679 | return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | | 696 | return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | |
@@ -753,11 +770,24 @@ static void rt_do_flush(int process_context) | |||
753 | } | 770 | } |
754 | } | 771 | } |
755 | 772 | ||
773 | /* | ||
774 | * While freeing expired entries, we compute average chain length | ||
775 | * and standard deviation, using fixed-point arithmetic. | ||
776 | * This is to have an estimation of rt_chain_length_max | ||
777 | * rt_chain_length_max = max(elasticity, AVG + 4*SD) | ||
778 | * We use 3 bits for fractional part, and 29 (or 61) for magnitude. | ||
779 | */ | ||
780 | |||
781 | #define FRACT_BITS 3 | ||
782 | #define ONE (1UL << FRACT_BITS) | ||
783 | |||
756 | static void rt_check_expire(void) | 784 | static void rt_check_expire(void) |
757 | { | 785 | { |
758 | static unsigned int rover; | 786 | static unsigned int rover; |
759 | unsigned int i = rover, goal; | 787 | unsigned int i = rover, goal; |
760 | struct rtable *rth, **rthp; | 788 | struct rtable *rth, **rthp; |
789 | unsigned long length = 0, samples = 0; | ||
790 | unsigned long sum = 0, sum2 = 0; | ||
761 | u64 mult; | 791 | u64 mult; |
762 | 792 | ||
763 | mult = ((u64)ip_rt_gc_interval) << rt_hash_log; | 793 | mult = ((u64)ip_rt_gc_interval) << rt_hash_log; |
@@ -766,6 +796,7 @@ static void rt_check_expire(void) | |||
766 | goal = (unsigned int)mult; | 796 | goal = (unsigned int)mult; |
767 | if (goal > rt_hash_mask) | 797 | if (goal > rt_hash_mask) |
768 | goal = rt_hash_mask + 1; | 798 | goal = rt_hash_mask + 1; |
799 | length = 0; | ||
769 | for (; goal > 0; goal--) { | 800 | for (; goal > 0; goal--) { |
770 | unsigned long tmo = ip_rt_gc_timeout; | 801 | unsigned long tmo = ip_rt_gc_timeout; |
771 | 802 | ||
@@ -775,6 +806,8 @@ static void rt_check_expire(void) | |||
775 | if (need_resched()) | 806 | if (need_resched()) |
776 | cond_resched(); | 807 | cond_resched(); |
777 | 808 | ||
809 | samples++; | ||
810 | |||
778 | if (*rthp == NULL) | 811 | if (*rthp == NULL) |
779 | continue; | 812 | continue; |
780 | spin_lock_bh(rt_hash_lock_addr(i)); | 813 | spin_lock_bh(rt_hash_lock_addr(i)); |
@@ -789,11 +822,29 @@ static void rt_check_expire(void) | |||
789 | if (time_before_eq(jiffies, rth->u.dst.expires)) { | 822 | if (time_before_eq(jiffies, rth->u.dst.expires)) { |
790 | tmo >>= 1; | 823 | tmo >>= 1; |
791 | rthp = &rth->u.dst.rt_next; | 824 | rthp = &rth->u.dst.rt_next; |
825 | /* | ||
826 | * Only bump our length if the hash | ||
827 | * inputs on entries n and n+1 are not | ||
828 | * the same; we only count entries on | ||
829 | * a chain with equal hash inputs once | ||
830 | * so that entries for different QOS | ||
831 | * levels, and other non-hash input | ||
832 | * attributes don't unfairly skew | ||
833 | * the length computation | ||
834 | */ | ||
835 | if ((*rthp == NULL) || | ||
836 | !compare_hash_inputs(&(*rthp)->fl, | ||
837 | &rth->fl)) | ||
838 | length += ONE; | ||
792 | continue; | 839 | continue; |
793 | } | 840 | } |
794 | } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { | 841 | } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { |
795 | tmo >>= 1; | 842 | tmo >>= 1; |
796 | rthp = &rth->u.dst.rt_next; | 843 | rthp = &rth->u.dst.rt_next; |
844 | if ((*rthp == NULL) || | ||
845 | !compare_hash_inputs(&(*rthp)->fl, | ||
846 | &rth->fl)) | ||
847 | length += ONE; | ||
797 | continue; | 848 | continue; |
798 | } | 849 | } |
799 | 850 | ||
@@ -802,6 +853,15 @@ static void rt_check_expire(void) | |||
802 | rt_free(rth); | 853 | rt_free(rth); |
803 | } | 854 | } |
804 | spin_unlock_bh(rt_hash_lock_addr(i)); | 855 | spin_unlock_bh(rt_hash_lock_addr(i)); |
856 | sum += length; | ||
857 | sum2 += length*length; | ||
858 | } | ||
859 | if (samples) { | ||
860 | unsigned long avg = sum / samples; | ||
861 | unsigned long sd = int_sqrt(sum2 / samples - avg*avg); | ||
862 | rt_chain_length_max = max_t(unsigned long, | ||
863 | ip_rt_gc_elasticity, | ||
864 | (avg + 4*sd) >> FRACT_BITS); | ||
805 | } | 865 | } |
806 | rover = i; | 866 | rover = i; |
807 | } | 867 | } |
@@ -851,6 +911,26 @@ static void rt_secret_rebuild(unsigned long __net) | |||
851 | mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); | 911 | mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); |
852 | } | 912 | } |
853 | 913 | ||
914 | static void rt_secret_rebuild_oneshot(struct net *net) | ||
915 | { | ||
916 | del_timer_sync(&net->ipv4.rt_secret_timer); | ||
917 | rt_cache_invalidate(net); | ||
918 | if (ip_rt_secret_interval) { | ||
919 | net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval; | ||
920 | add_timer(&net->ipv4.rt_secret_timer); | ||
921 | } | ||
922 | } | ||
923 | |||
924 | static void rt_emergency_hash_rebuild(struct net *net) | ||
925 | { | ||
926 | if (net_ratelimit()) { | ||
927 | printk(KERN_WARNING "Route hash chain too long!\n"); | ||
928 | printk(KERN_WARNING "Adjust your secret_interval!\n"); | ||
929 | } | ||
930 | |||
931 | rt_secret_rebuild_oneshot(net); | ||
932 | } | ||
933 | |||
854 | /* | 934 | /* |
855 | Short description of GC goals. | 935 | Short description of GC goals. |
856 | 936 | ||
@@ -989,6 +1069,7 @@ out: return 0; | |||
989 | static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) | 1069 | static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) |
990 | { | 1070 | { |
991 | struct rtable *rth, **rthp; | 1071 | struct rtable *rth, **rthp; |
1072 | struct rtable *rthi; | ||
992 | unsigned long now; | 1073 | unsigned long now; |
993 | struct rtable *cand, **candp; | 1074 | struct rtable *cand, **candp; |
994 | u32 min_score; | 1075 | u32 min_score; |
@@ -1002,7 +1083,13 @@ restart: | |||
1002 | candp = NULL; | 1083 | candp = NULL; |
1003 | now = jiffies; | 1084 | now = jiffies; |
1004 | 1085 | ||
1086 | if (!rt_caching(dev_net(rt->u.dst.dev))) { | ||
1087 | rt_drop(rt); | ||
1088 | return 0; | ||
1089 | } | ||
1090 | |||
1005 | rthp = &rt_hash_table[hash].chain; | 1091 | rthp = &rt_hash_table[hash].chain; |
1092 | rthi = NULL; | ||
1006 | 1093 | ||
1007 | spin_lock_bh(rt_hash_lock_addr(hash)); | 1094 | spin_lock_bh(rt_hash_lock_addr(hash)); |
1008 | while ((rth = *rthp) != NULL) { | 1095 | while ((rth = *rthp) != NULL) { |
@@ -1048,6 +1135,17 @@ restart: | |||
1048 | chain_length++; | 1135 | chain_length++; |
1049 | 1136 | ||
1050 | rthp = &rth->u.dst.rt_next; | 1137 | rthp = &rth->u.dst.rt_next; |
1138 | |||
1139 | /* | ||
1140 | * check to see if the next entry in the chain | ||
1141 | * contains the same hash input values as rt. If it does | ||
1142 | * This is where we will insert into the list, instead of | ||
1143 | * at the head. This groups entries that differ by aspects not | ||
1144 | * relevant to the hash function together, which we use to adjust | ||
1145 | * our chain length | ||
1146 | */ | ||
1147 | if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl)) | ||
1148 | rthi = rth; | ||
1051 | } | 1149 | } |
1052 | 1150 | ||
1053 | if (cand) { | 1151 | if (cand) { |
@@ -1061,6 +1159,16 @@ restart: | |||
1061 | *candp = cand->u.dst.rt_next; | 1159 | *candp = cand->u.dst.rt_next; |
1062 | rt_free(cand); | 1160 | rt_free(cand); |
1063 | } | 1161 | } |
1162 | } else { | ||
1163 | if (chain_length > rt_chain_length_max) { | ||
1164 | struct net *net = dev_net(rt->u.dst.dev); | ||
1165 | int num = ++net->ipv4.current_rt_cache_rebuild_count; | ||
1166 | if (!rt_caching(dev_net(rt->u.dst.dev))) { | ||
1167 | printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", | ||
1168 | rt->u.dst.dev->name, num); | ||
1169 | } | ||
1170 | rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev)); | ||
1171 | } | ||
1064 | } | 1172 | } |
1065 | 1173 | ||
1066 | /* Try to bind route to arp only if it is output | 1174 | /* Try to bind route to arp only if it is output |
@@ -1098,7 +1206,11 @@ restart: | |||
1098 | } | 1206 | } |
1099 | } | 1207 | } |
1100 | 1208 | ||
1101 | rt->u.dst.rt_next = rt_hash_table[hash].chain; | 1209 | if (rthi) |
1210 | rt->u.dst.rt_next = rthi->u.dst.rt_next; | ||
1211 | else | ||
1212 | rt->u.dst.rt_next = rt_hash_table[hash].chain; | ||
1213 | |||
1102 | #if RT_CACHE_DEBUG >= 2 | 1214 | #if RT_CACHE_DEBUG >= 2 |
1103 | if (rt->u.dst.rt_next) { | 1215 | if (rt->u.dst.rt_next) { |
1104 | struct rtable *trt; | 1216 | struct rtable *trt; |
@@ -1114,7 +1226,11 @@ restart: | |||
1114 | * previous writes to rt are committed to memory | 1226 | * previous writes to rt are committed to memory |
1115 | * before making rt visible to other CPUS. | 1227 | * before making rt visible to other CPUS. |
1116 | */ | 1228 | */ |
1117 | rcu_assign_pointer(rt_hash_table[hash].chain, rt); | 1229 | if (rthi) |
1230 | rcu_assign_pointer(rthi->u.dst.rt_next, rt); | ||
1231 | else | ||
1232 | rcu_assign_pointer(rt_hash_table[hash].chain, rt); | ||
1233 | |||
1118 | spin_unlock_bh(rt_hash_lock_addr(hash)); | 1234 | spin_unlock_bh(rt_hash_lock_addr(hash)); |
1119 | *rp = rt; | 1235 | *rp = rt; |
1120 | return 0; | 1236 | return 0; |
@@ -1217,6 +1333,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, | |||
1217 | || ipv4_is_zeronet(new_gw)) | 1333 | || ipv4_is_zeronet(new_gw)) |
1218 | goto reject_redirect; | 1334 | goto reject_redirect; |
1219 | 1335 | ||
1336 | if (!rt_caching(net)) | ||
1337 | goto reject_redirect; | ||
1338 | |||
1220 | if (!IN_DEV_SHARED_MEDIA(in_dev)) { | 1339 | if (!IN_DEV_SHARED_MEDIA(in_dev)) { |
1221 | if (!inet_addr_onlink(in_dev, new_gw, old_gw)) | 1340 | if (!inet_addr_onlink(in_dev, new_gw, old_gw)) |
1222 | goto reject_redirect; | 1341 | goto reject_redirect; |
@@ -2130,6 +2249,10 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2130 | struct net *net; | 2249 | struct net *net; |
2131 | 2250 | ||
2132 | net = dev_net(dev); | 2251 | net = dev_net(dev); |
2252 | |||
2253 | if (!rt_caching(net)) | ||
2254 | goto skip_cache; | ||
2255 | |||
2133 | tos &= IPTOS_RT_MASK; | 2256 | tos &= IPTOS_RT_MASK; |
2134 | hash = rt_hash(daddr, saddr, iif, rt_genid(net)); | 2257 | hash = rt_hash(daddr, saddr, iif, rt_genid(net)); |
2135 | 2258 | ||
@@ -2154,6 +2277,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2154 | } | 2277 | } |
2155 | rcu_read_unlock(); | 2278 | rcu_read_unlock(); |
2156 | 2279 | ||
2280 | skip_cache: | ||
2157 | /* Multicast recognition logic is moved from route cache to here. | 2281 | /* Multicast recognition logic is moved from route cache to here. |
2158 | The problem was that too many Ethernet cards have broken/missing | 2282 | The problem was that too many Ethernet cards have broken/missing |
2159 | hardware multicast filters :-( As result the host on multicasting | 2283 | hardware multicast filters :-( As result the host on multicasting |
@@ -2539,6 +2663,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, | |||
2539 | unsigned hash; | 2663 | unsigned hash; |
2540 | struct rtable *rth; | 2664 | struct rtable *rth; |
2541 | 2665 | ||
2666 | if (!rt_caching(net)) | ||
2667 | goto slow_output; | ||
2668 | |||
2542 | hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); | 2669 | hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); |
2543 | 2670 | ||
2544 | rcu_read_lock_bh(); | 2671 | rcu_read_lock_bh(); |
@@ -2563,6 +2690,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, | |||
2563 | } | 2690 | } |
2564 | rcu_read_unlock_bh(); | 2691 | rcu_read_unlock_bh(); |
2565 | 2692 | ||
2693 | slow_output: | ||
2566 | return ip_route_output_slow(net, rp, flp); | 2694 | return ip_route_output_slow(net, rp, flp); |
2567 | } | 2695 | } |
2568 | 2696 | ||
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 1bb10df8ce7d..0cc8d31f9ac0 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -795,6 +795,14 @@ static struct ctl_table ipv4_net_table[] = { | |||
795 | .mode = 0644, | 795 | .mode = 0644, |
796 | .proc_handler = &proc_dointvec | 796 | .proc_handler = &proc_dointvec |
797 | }, | 797 | }, |
798 | { | ||
799 | .ctl_name = CTL_UNNUMBERED, | ||
800 | .procname = "rt_cache_rebuild_count", | ||
801 | .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count, | ||
802 | .maxlen = sizeof(int), | ||
803 | .mode = 0644, | ||
804 | .proc_handler = &proc_dointvec | ||
805 | }, | ||
798 | { } | 806 | { } |
799 | }; | 807 | }; |
800 | 808 | ||
@@ -827,8 +835,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) | |||
827 | &net->ipv4.sysctl_icmp_ratelimit; | 835 | &net->ipv4.sysctl_icmp_ratelimit; |
828 | table[5].data = | 836 | table[5].data = |
829 | &net->ipv4.sysctl_icmp_ratemask; | 837 | &net->ipv4.sysctl_icmp_ratemask; |
838 | table[6].data = | ||
839 | &net->ipv4.sysctl_rt_cache_rebuild_count; | ||
830 | } | 840 | } |
831 | 841 | ||
842 | net->ipv4.sysctl_rt_cache_rebuild_count = 4; | ||
843 | |||
832 | net->ipv4.ipv4_hdr = register_net_sysctl_table(net, | 844 | net->ipv4.ipv4_hdr = register_net_sysctl_table(net, |
833 | net_ipv4_ctl_path, table); | 845 | net_ipv4_ctl_path, table); |
834 | if (net->ipv4.ipv4_hdr == NULL) | 846 | if (net->ipv4.ipv4_hdr == NULL) |