 Documentation/networking/ip-sysctl.txt |   6 +
 include/net/netns/ipv4.h               |   2 +
 net/ipv4/route.c                       | 132 +-
 net/ipv4/sysctl_net_ipv4.c             |  12 +
 4 files changed, 150 insertions(+), 2 deletions(-)
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index d84932650fd3..c7712787933c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -27,6 +27,12 @@ min_adv_mss - INTEGER
 	The advertised MSS depends on the first hop route MTU, but will
 	never be lower than this setting.
 
+rt_cache_rebuild_count - INTEGER
+	The per net-namespace route cache emergency rebuild threshold.
+	Any net-namespace having its route cache rebuilt due to
+	a hash bucket chain being too long more than this many times
+	will have its route caching disabled
+
 IP Fragmentation:
 
 ipfrag_high_thresh - INTEGER
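
The threshold semantics documented above boil down to a single comparison. Below is a minimal standalone sketch: the toy_ names and the harness are invented for illustration, while the two counters and the comparison mirror what this patch adds to struct netns_ipv4 and route.c. It shows that caching stays enabled until the per-namespace rebuild count exceeds the sysctl value (default 4, set in sysctl_net_ipv4.c further down).

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the two per-namespace counters this patch adds to struct netns_ipv4. */
struct toy_netns {
	int sysctl_rt_cache_rebuild_count;	/* the tunable threshold */
	int current_rt_cache_rebuild_count;	/* emergency rebuilds seen so far */
};

/* Same comparison as the rt_caching() helper added to net/ipv4/route.c. */
static bool toy_rt_caching(const struct toy_netns *ns)
{
	return ns->current_rt_cache_rebuild_count <=
	       ns->sysctl_rt_cache_rebuild_count;
}

int main(void)
{
	struct toy_netns ns = { .sysctl_rt_cache_rebuild_count = 4 };

	/* Caching stays on until the rebuild count exceeds the threshold. */
	for (int rebuilds = 0; rebuilds <= 6; rebuilds++) {
		ns.current_rt_cache_rebuild_count = rebuilds;
		printf("rebuilds=%d caching=%s\n", rebuilds,
		       toy_rt_caching(&ns) ? "on" : "off");
	}
	return 0;
}
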
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index ece1c926b5d1..977f482d97a9 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -49,6 +49,8 @@ struct netns_ipv4 {
 	int sysctl_icmp_ratelimit;
 	int sysctl_icmp_ratemask;
 	int sysctl_icmp_errors_use_inbound_ifaddr;
+	int sysctl_rt_cache_rebuild_count;
+	int current_rt_cache_rebuild_count;
 
 	struct timer_list rt_secret_timer;
 	atomic_t rt_genid;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2ea6dcc3e2cc..21ce7e1b2284 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -129,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly	= 256;
 static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
+static int rt_chain_length_max __read_mostly	= 20;
 
 static void rt_worker_func(struct work_struct *work);
 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
@@ -145,6 +146,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 static void		 ipv4_link_failure(struct sk_buff *skb);
 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 static int rt_garbage_collect(struct dst_ops *ops);
+static void rt_emergency_hash_rebuild(struct net *net);
 
 
 static struct dst_ops ipv4_dst_ops = {
@@ -201,6 +203,7 @@ const __u8 ip_tos2prio[16] = {
 struct rt_hash_bucket {
 	struct rtable	*chain;
 };
+
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 	defined(CONFIG_PROVE_LOCKING)
 /*
@@ -674,6 +677,20 @@ static inline u32 rt_score(struct rtable *rt)
 	return score;
 }
 
+static inline bool rt_caching(const struct net *net)
+{
+	return net->ipv4.current_rt_cache_rebuild_count <=
+		net->ipv4.sysctl_rt_cache_rebuild_count;
+}
+
+static inline bool compare_hash_inputs(const struct flowi *fl1,
+					const struct flowi *fl2)
+{
+	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
+		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
+		(fl1->iif ^ fl2->iif)) == 0);
+}
+
 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 {
 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
@@ -753,11 +770,24 @@ static void rt_do_flush(int process_context)
 	}
 }
 
+/*
+ * While freeing expired entries, we compute average chain length
+ * and standard deviation, using fixed-point arithmetic.
+ * This to have an estimation of rt_chain_length_max
+ *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
+ * We use 3 bits for frational part, and 29 (or 61) for magnitude.
+ */
+
+#define FRACT_BITS 3
+#define ONE (1UL << FRACT_BITS)
+
 static void rt_check_expire(void)
 {
 	static unsigned int rover;
 	unsigned int i = rover, goal;
 	struct rtable *rth, **rthp;
+	unsigned long length = 0, samples = 0;
+	unsigned long sum = 0, sum2 = 0;
 	u64 mult;
 
 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
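
To make the fixed-point bookkeeping described in the comment above concrete, here is a hedged, standalone sketch of the same arithmetic with invented sample data. isqrt() stands in for the kernel's int_sqrt(), and a local constant stands in for ip_rt_gc_elasticity; only the scaling by ONE and the (avg + 4*sd) >> FRACT_BITS step correspond to what rt_check_expire() does.

#include <stdio.h>

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

/* Integer square root, standing in for the kernel's int_sqrt(). */
static unsigned long isqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	/* Invented per-bucket chain lengths from one GC pass. */
	unsigned long lengths[] = { 2, 3, 5, 4, 3, 20, 2, 1 };
	unsigned long samples = sizeof(lengths) / sizeof(lengths[0]);
	unsigned long elasticity = 8;	/* stands in for ip_rt_gc_elasticity */
	unsigned long sum = 0, sum2 = 0;

	for (unsigned long i = 0; i < samples; i++) {
		unsigned long length = lengths[i] * ONE;	/* fixed point, 3 fractional bits */

		sum += length;
		sum2 += length * length;
	}

	unsigned long avg = sum / samples;
	unsigned long sd = isqrt(sum2 / samples - avg * avg);
	unsigned long chain_length_max = (avg + 4 * sd) >> FRACT_BITS;

	if (chain_length_max < elasticity)
		chain_length_max = elasticity;	/* max(elasticity, AVG + 4*SD) */

	printf("avg = %.3f, sd = %.3f, rt_chain_length_max = %lu\n",
	       avg / (double)ONE, sd / (double)ONE, chain_length_max);
	return 0;
}

With 3 fractional bits a chain of 5 distinct flows is accumulated as 40, so the average and standard deviation keep eighth-unit precision until the final right shift back to whole entries.
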
@@ -766,6 +796,7 @@ static void rt_check_expire(void)
 	goal = (unsigned int)mult;
 	if (goal > rt_hash_mask)
 		goal = rt_hash_mask + 1;
+	length = 0;
 	for (; goal > 0; goal--) {
 		unsigned long tmo = ip_rt_gc_timeout;
 
@@ -775,6 +806,8 @@ static void rt_check_expire(void)
 		if (need_resched())
 			cond_resched();
 
+		samples++;
+
 		if (*rthp == NULL)
 			continue;
 		spin_lock_bh(rt_hash_lock_addr(i));
@@ -789,11 +822,29 @@ static void rt_check_expire(void)
 			if (time_before_eq(jiffies, rth->u.dst.expires)) {
 				tmo >>= 1;
 				rthp = &rth->u.dst.rt_next;
+				/*
+				 * Only bump our length if the hash
+				 * inputs on entries n and n+1 are not
+				 * the same, we only count entries on
+				 * a chain with equal hash inputs once
+				 * so that entries for different QOS
+				 * levels, and other non-hash input
+				 * attributes don't unfairly skew
+				 * the length computation
+				 */
+				if ((*rthp == NULL) ||
+				    !compare_hash_inputs(&(*rthp)->fl,
+							 &rth->fl))
+					length += ONE;
 				continue;
 			}
 		} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 			tmo >>= 1;
 			rthp = &rth->u.dst.rt_next;
+			if ((*rthp == NULL) ||
+			    !compare_hash_inputs(&(*rthp)->fl,
+						 &rth->fl))
+				length += ONE;
 			continue;
 		}
 
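
The counting rule in the comment above is easier to see with a toy chain. The sketch below is illustrative only (toy_ types, not the kernel's): a run of entries that share the same (daddr, saddr, iif) hash inputs contributes one unit to the measured length, so three TOS variants of one flow plus one entry for another flow count as 2, not 4.

#include <stdbool.h>
#include <stdio.h>

struct toy_flow { unsigned int daddr, saddr, iif; };

struct toy_rtable {
	struct toy_flow fl;
	struct toy_rtable *next;
};

/* Same idea as compare_hash_inputs(): true when all hash inputs match. */
static bool same_hash_inputs(const struct toy_flow *a, const struct toy_flow *b)
{
	return ((a->daddr ^ b->daddr) | (a->saddr ^ b->saddr) |
		(a->iif ^ b->iif)) == 0;
}

/* Count an entry only when its successor has different hash inputs,
 * the way rt_check_expire() accumulates `length` (scaling by ONE omitted). */
static unsigned int effective_chain_length(const struct toy_rtable *chain)
{
	unsigned int length = 0;

	for (const struct toy_rtable *rth = chain; rth; rth = rth->next)
		if (!rth->next || !same_hash_inputs(&rth->next->fl, &rth->fl))
			length++;
	return length;
}

int main(void)
{
	/* Three entries for one flow (e.g. different TOS values) plus one
	 * entry for another flow: raw chain length 4, effective length 2. */
	struct toy_rtable b  = { { 2, 1, 1 }, NULL };
	struct toy_rtable a3 = { { 3, 1, 1 }, &b };
	struct toy_rtable a2 = { { 3, 1, 1 }, &a3 };
	struct toy_rtable a1 = { { 3, 1, 1 }, &a2 };

	printf("effective length = %u\n", effective_chain_length(&a1));
	return 0;
}

This only works because same-flow entries sit next to each other in the chain, which is exactly what the rthi insertion change later in this patch arranges.
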
@@ -802,6 +853,15 @@ static void rt_check_expire(void)
 			rt_free(rth);
 		}
 		spin_unlock_bh(rt_hash_lock_addr(i));
+		sum += length;
+		sum2 += length*length;
+	}
+	if (samples) {
+		unsigned long avg = sum / samples;
+		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
+		rt_chain_length_max = max_t(unsigned long,
+					ip_rt_gc_elasticity,
+					(avg + 4*sd) >> FRACT_BITS);
 	}
 	rover = i;
 }
@@ -851,6 +911,26 @@ static void rt_secret_rebuild(unsigned long __net)
 	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
 }
 
+static void rt_secret_rebuild_oneshot(struct net *net)
+{
+	del_timer_sync(&net->ipv4.rt_secret_timer);
+	rt_cache_invalidate(net);
+	if (ip_rt_secret_interval) {
+		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
+		add_timer(&net->ipv4.rt_secret_timer);
+	}
+}
+
+static void rt_emergency_hash_rebuild(struct net *net)
+{
+	if (net_ratelimit()) {
+		printk(KERN_WARNING "Route hash chain too long!\n");
+		printk(KERN_WARNING "Adjust your secret_interval!\n");
+	}
+
+	rt_secret_rebuild_oneshot(net);
+}
+
 /*
    Short description of GC goals.
 
@@ -989,6 +1069,7 @@ out: return 0;
 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 {
 	struct rtable	*rth, **rthp;
+	struct rtable	*rthi;
 	unsigned long	now;
 	struct rtable *cand, **candp;
 	u32 		min_score;
@@ -1002,7 +1083,13 @@ restart:
 	candp = NULL;
 	now = jiffies;
 
+	if (!rt_caching(dev_net(rt->u.dst.dev))) {
+		rt_drop(rt);
+		return 0;
+	}
+
 	rthp = &rt_hash_table[hash].chain;
+	rthi = NULL;
 
 	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
@@ -1048,6 +1135,17 @@ restart:
 		chain_length++;
 
 		rthp = &rth->u.dst.rt_next;
+
+		/*
+		 * check to see if the next entry in the chain
+		 * contains the same hash input values as rt.  If it does
+		 * This is where we will insert into the list, instead of
+		 * at the head.  This groups entries that differ by aspects not
+		 * relvant to the hash function together, which we use to adjust
+		 * our chain length
+		 */
+		if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl))
+			rthi = rth;
 	}
 
 	if (cand) {
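
The insertion-point logic described in the comment above amounts to remembering the last position whose successor has the same hash inputs as the new route, then linking the new entry there instead of at the chain head. A self-contained sketch follows, using the same illustrative toy_ types as the previous example (plain pointers instead of RCU, names invented for this sketch).

#include <stdbool.h>
#include <stdio.h>

struct toy_flow { unsigned int daddr, saddr, iif; };

struct toy_rtable {
	struct toy_flow fl;
	struct toy_rtable *next;
};

static bool same_hash_inputs(const struct toy_flow *a, const struct toy_flow *b)
{
	return ((a->daddr ^ b->daddr) | (a->saddr ^ b->saddr) |
		(a->iif ^ b->iif)) == 0;
}

/* Walk the chain as rt_intern_hash() does: remember the entry whose
 * successor shares rt's hash inputs (rthi) and insert after it, so the
 * new entry ends up adjacent to its same-flow relatives. Fall back to
 * head insertion when there is no matching group. */
static void toy_insert(struct toy_rtable **head, struct toy_rtable *rt)
{
	struct toy_rtable *rthi = NULL;

	for (struct toy_rtable *rth = *head; rth; rth = rth->next)
		if (rth->next && same_hash_inputs(&rth->next->fl, &rt->fl))
			rthi = rth;

	if (rthi) {
		rt->next = rthi->next;
		rthi->next = rt;
	} else {
		rt->next = *head;
		*head = rt;
	}
}

int main(void)
{
	struct toy_rtable b  = { { 9, 9, 2 }, NULL };
	struct toy_rtable a1 = { { 3, 1, 1 }, &b };
	struct toy_rtable c  = { { 7, 7, 2 }, &a1 };
	struct toy_rtable *head = &c;

	/* A new entry for flow (3,1,1) is linked next to a1 rather than
	 * at the head, keeping the group contiguous. */
	struct toy_rtable a2 = { { 3, 1, 1 }, NULL };
	toy_insert(&head, &a2);

	for (struct toy_rtable *rth = head; rth; rth = rth->next)
		printf("(%u,%u,%u)\n", rth->fl.daddr, rth->fl.saddr, rth->fl.iif);
	return 0;
}

The kernel version keeps the usual RCU publication order: the new entry's rt_next is set first, and only then is it published with rcu_assign_pointer(), either into rthi->u.dst.rt_next or at the bucket head, as the hunks further down show.
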
@@ -1061,6 +1159,16 @@ restart:
 			*candp = cand->u.dst.rt_next;
 			rt_free(cand);
 		}
+	} else {
+		if (chain_length > rt_chain_length_max) {
+			struct net *net = dev_net(rt->u.dst.dev);
+			int num = ++net->ipv4.current_rt_cache_rebuild_count;
+			if (!rt_caching(dev_net(rt->u.dst.dev))) {
+				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
+					rt->u.dst.dev->name, num);
+			}
+			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
+		}
 	}
 
 	/* Try to bind route to arp only if it is output
@@ -1098,7 +1206,11 @@ restart:
 		}
 	}
 
-	rt->u.dst.rt_next = rt_hash_table[hash].chain;
+	if (rthi)
+		rt->u.dst.rt_next = rthi->u.dst.rt_next;
+	else
+		rt->u.dst.rt_next = rt_hash_table[hash].chain;
+
 #if RT_CACHE_DEBUG >= 2
 	if (rt->u.dst.rt_next) {
 		struct rtable *trt;
@@ -1114,7 +1226,11 @@ restart:
 	 * previous writes to rt are comitted to memory
 	 * before making rt visible to other CPUS.
 	 */
-	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
+	if (rthi)
+		rcu_assign_pointer(rthi->u.dst.rt_next, rt);
+	else
+		rcu_assign_pointer(rt_hash_table[hash].chain, rt);
+
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 	*rp = rt;
 	return 0;
@@ -1217,6 +1333,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 	    || ipv4_is_zeronet(new_gw))
 		goto reject_redirect;
 
+	if (!rt_caching(net))
+		goto reject_redirect;
+
 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 			goto reject_redirect;
@@ -2130,6 +2249,10 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	struct net *net;
 
 	net = dev_net(dev);
+
+	if (!rt_caching(net))
+		goto skip_cache;
+
 	tos &= IPTOS_RT_MASK;
 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
 
@@ -2154,6 +2277,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	}
 	rcu_read_unlock();
 
+skip_cache:
 	/* Multicast recognition logic is moved from route cache to here.
 	   The problem was that too many Ethernet cards have broken/missing
 	   hardware multicast filters :-( As result the host on multicasting
@@ -2539,6 +2663,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
 	unsigned hash;
 	struct rtable *rth;
 
+	if (!rt_caching(net))
+		goto slow_output;
+
 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
 
 	rcu_read_lock_bh();
@@ -2563,6 +2690,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
 	}
 	rcu_read_unlock_bh();
 
+slow_output:
 	return ip_route_output_slow(net, rp, flp);
 }
 
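
The skip_cache and slow_output labels added above implement a simple fast-path bypass: when caching is disabled for the namespace, the lookup never touches the hash table and falls straight through to the slow path. A hedged, standalone sketch of that control-flow shape follows; the toy_ helpers are hypothetical and only the "if (!caching) goto slow path" structure corresponds to the patch.

#include <stdbool.h>
#include <stdio.h>

static bool caching_enabled;		/* what rt_caching(net) would report */

/* Hypothetical cache lookup: always a miss in this sketch. */
static int lookup_in_cache(int key, int *result)
{
	(void)key;
	(void)result;
	return -1;
}

/* Hypothetical slow path that always succeeds. */
static int slow_route_lookup(int key, int *result)
{
	*result = key;
	return 0;
}

/* Mirrors the control flow added to __ip_route_output_key() and
 * ip_route_input(): with caching disabled, skip the hash lookup
 * entirely and fall through to the slow path. */
static int toy_route_output(int key, int *result)
{
	if (!caching_enabled)
		goto slow_output;

	if (lookup_in_cache(key, result) == 0)
		return 0;

slow_output:
	return slow_route_lookup(key, result);
}

int main(void)
{
	int res;

	caching_enabled = false;
	toy_route_output(7, &res);
	printf("route for key 7 -> %d (via slow path)\n", res);
	return 0;
}
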
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 1bb10df8ce7d..0cc8d31f9ac0 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -795,6 +795,14 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "rt_cache_rebuild_count",
+		.data		= &init_net.ipv4.sysctl_rt_cache_rebuild_count,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
 	{ }
 };
 
@@ -827,8 +835,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 			&net->ipv4.sysctl_icmp_ratelimit;
 		table[5].data =
 			&net->ipv4.sysctl_icmp_ratemask;
+		table[6].data =
+			&net->ipv4.sysctl_rt_cache_rebuild_count;
 	}
 
+	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
+
 	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
 			net_ipv4_ctl_path, table);
 	if (net->ipv4.ipv4_hdr == NULL)