Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/route.c           | 132
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c |  12
2 files changed, 142 insertions(+), 2 deletions(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2ea6dcc3e2cc..21ce7e1b2284 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -129,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly = 256;
 static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
+static int rt_chain_length_max __read_mostly = 20;
 
 static void rt_worker_func(struct work_struct *work);
 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
@@ -145,6 +146,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 static void ipv4_link_failure(struct sk_buff *skb);
 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 static int rt_garbage_collect(struct dst_ops *ops);
+static void rt_emergency_hash_rebuild(struct net *net);
 
 
 static struct dst_ops ipv4_dst_ops = {
@@ -201,6 +203,7 @@ const __u8 ip_tos2prio[16] = {
 struct rt_hash_bucket {
 	struct rtable *chain;
 };
+
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 	defined(CONFIG_PROVE_LOCKING)
 /*
@@ -674,6 +677,20 @@ static inline u32 rt_score(struct rtable *rt)
 	return score;
 }
 
+static inline bool rt_caching(const struct net *net)
+{
+	return net->ipv4.current_rt_cache_rebuild_count <=
+		net->ipv4.sysctl_rt_cache_rebuild_count;
+}
+
+static inline bool compare_hash_inputs(const struct flowi *fl1,
+					const struct flowi *fl2)
+{
+	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
+		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
+		(fl1->iif ^ fl2->iif)) == 0);
+}
+
 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 {
 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
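
The helper pair above is the heart of the length accounting: compare_hash_inputs() looks only at the fields the route hash itself consumes (daddr, saddr, iif), so cached routes that differ only in non-hashed attributes such as TOS still compare equal, while compare_keys() remains the full-match test used for lookups. A stand-alone sketch of the counting idea (plain C; the struct and helper names are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct flow_key {
	uint32_t daddr;		/* destination address: hashed */
	uint32_t saddr;		/* source address: hashed */
	int	 iif;		/* input interface: hashed */
	uint8_t	 tos;		/* type of service: NOT hashed */
};

static bool same_hash_inputs(const struct flow_key *a, const struct flow_key *b)
{
	/* XOR-and-OR trick from the patch: zero iff all hashed fields match */
	return ((a->daddr ^ b->daddr) |
		(a->saddr ^ b->saddr) |
		(uint32_t)(a->iif ^ b->iif)) == 0;
}

int main(void)
{
	/* three chain entries; the first two differ only in TOS */
	struct flow_key chain[] = {
		{ 0x0a000001, 0x0a000002, 1, 0x00 },
		{ 0x0a000001, 0x0a000002, 1, 0x10 },
		{ 0x0a000003, 0x0a000002, 1, 0x00 },
	};
	unsigned int n = sizeof(chain) / sizeof(chain[0]);
	unsigned int length = 0;

	for (unsigned int i = 0; i < n; i++)
		if (i + 1 == n || !same_hash_inputs(&chain[i], &chain[i + 1]))
			length++;	/* count each run of equal inputs once */

	printf("raw entries: %u, effective length: %u\n", n, length); /* 3, 2 */
	return 0;
}

This mirrors how rt_check_expire() below bumps length only when entries n and n+1 disagree on their hash inputs.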
@@ -753,11 +770,24 @@ static void rt_do_flush(int process_context)
 	}
 }
 
+/*
+ * While freeing expired entries, we compute the average chain length
+ * and standard deviation, using fixed-point arithmetic.
+ * This gives an estimate for rt_chain_length_max:
+ *	rt_chain_length_max = max(elasticity, AVG + 4*SD)
+ * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
+ */
+
+#define FRACT_BITS 3
+#define ONE (1UL << FRACT_BITS)
+
 static void rt_check_expire(void)
 {
 	static unsigned int rover;
 	unsigned int i = rover, goal;
 	struct rtable *rth, **rthp;
+	unsigned long length = 0, samples = 0;
+	unsigned long sum = 0, sum2 = 0;
 	u64 mult;
 
 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
@@ -766,6 +796,7 @@ static void rt_check_expire(void)
 	goal = (unsigned int)mult;
 	if (goal > rt_hash_mask)
 		goal = rt_hash_mask + 1;
+	length = 0;
 	for (; goal > 0; goal--) {
 		unsigned long tmo = ip_rt_gc_timeout;
 
@@ -775,6 +806,8 @@ static void rt_check_expire(void)
 		if (need_resched())
 			cond_resched();
 
+		samples++;
+
 		if (*rthp == NULL)
 			continue;
 		spin_lock_bh(rt_hash_lock_addr(i));
@@ -789,11 +822,29 @@ static void rt_check_expire(void)
 			if (time_before_eq(jiffies, rth->u.dst.expires)) {
 				tmo >>= 1;
 				rthp = &rth->u.dst.rt_next;
+				/*
+				 * Only bump our length if the hash
+				 * inputs on entries n and n+1 are not
+				 * the same; we only count entries on
+				 * a chain with equal hash inputs once,
+				 * so that entries for different QoS
+				 * levels and other non-hash-input
+				 * attributes don't unfairly skew
+				 * the length computation
+				 */
+				if ((*rthp == NULL) ||
+				    !compare_hash_inputs(&(*rthp)->fl,
+							 &rth->fl))
+					length += ONE;
 				continue;
 			}
 		} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 			tmo >>= 1;
 			rthp = &rth->u.dst.rt_next;
+			if ((*rthp == NULL) ||
+			    !compare_hash_inputs(&(*rthp)->fl,
+						 &rth->fl))
+				length += ONE;
 			continue;
 		}
 
@@ -802,6 +853,15 @@ static void rt_check_expire(void)
 			rt_free(rth);
 		}
 		spin_unlock_bh(rt_hash_lock_addr(i));
+		sum += length;
+		sum2 += length*length;
+	}
+	if (samples) {
+		unsigned long avg = sum / samples;
+		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
+		rt_chain_length_max = max_t(unsigned long,
+					ip_rt_gc_elasticity,
+					(avg + 4*sd) >> FRACT_BITS);
 	}
 	rover = i;
 }
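
The statistics above are kept in fixed point: each counted entry adds ONE = 1 << FRACT_BITS to length, so avg and sd carry three fractional bits, and the final right shift converts max(ip_rt_gc_elasticity, avg + 4*sd) back to whole entries. A stand-alone sketch of the same arithmetic (plain C; int_sqrt() is reimplemented naively here, and the sampled lengths are made up):

#include <stdio.h>

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

/* naive integer square root, exact enough for this demo */
static unsigned long int_sqrt(unsigned long x)
{
	unsigned long r = 0;
	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	/* pretend we scanned 4 buckets with these whole-entry chain lengths */
	unsigned long lengths[] = { 2, 3, 5, 2 };
	unsigned long samples = 4, sum = 0, sum2 = 0;
	unsigned long elasticity = 8;	/* stand-in for ip_rt_gc_elasticity */

	for (unsigned long i = 0; i < samples; i++) {
		unsigned long len = lengths[i] * ONE;	/* to fixed point */
		sum += len;
		sum2 += len * len;
	}

	unsigned long avg = sum / samples;	/* 24 here, i.e. 3.0 entries */
	unsigned long sd = int_sqrt(sum2 / samples - avg * avg);
	unsigned long max = (avg + 4 * sd) >> FRACT_BITS;

	if (max < elasticity)		/* max(elasticity, AVG + 4*SD) */
		max = elasticity;

	printf("rt_chain_length_max would be %lu\n", max);	/* 8 here */
	return 0;
}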
@@ -851,6 +911,26 @@ static void rt_secret_rebuild(unsigned long __net)
 	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
 }
 
+static void rt_secret_rebuild_oneshot(struct net *net)
+{
+	del_timer_sync(&net->ipv4.rt_secret_timer);
+	rt_cache_invalidate(net);
+	if (ip_rt_secret_interval) {
+		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
+		add_timer(&net->ipv4.rt_secret_timer);
+	}
+}
+
+static void rt_emergency_hash_rebuild(struct net *net)
+{
+	if (net_ratelimit()) {
+		printk(KERN_WARNING "Route hash chain too long!\n");
+		printk(KERN_WARNING "Adjust your secret_interval!\n");
+	}
+
+	rt_secret_rebuild_oneshot(net);
+}
+
 /*
    Short description of GC goals.
 
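
rt_secret_rebuild_oneshot() reuses the periodic secret-rotation machinery: it cancels the pending timer, invalidates the cache immediately (rt_cache_invalidate() bumps the namespace's route generation id, so existing entries stop matching), then re-arms the timer so the regular interval resumes. rt_emergency_hash_rebuild() is just that plus a rate-limited warning.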
@@ -989,6 +1069,7 @@ out: return 0;
 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 {
 	struct rtable	*rth, **rthp;
+	struct rtable	*rthi;
 	unsigned long	now;
 	struct rtable *cand, **candp;
 	u32 		min_score;
@@ -1002,7 +1083,13 @@ restart:
 	candp = NULL;
 	now = jiffies;
 
+	if (!rt_caching(dev_net(rt->u.dst.dev))) {
+		rt_drop(rt);
+		return 0;
+	}
+
 	rthp = &rt_hash_table[hash].chain;
+	rthi = NULL;
 
 	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
@@ -1048,6 +1135,17 @@ restart:
 		chain_length++;
 
 		rthp = &rth->u.dst.rt_next;
+
+		/*
+		 * Check to see if the next entry in the chain
+		 * contains the same hash input values as rt. If it does,
+		 * this is where we will insert into the list, instead of
+		 * at the head. This groups entries that differ by aspects not
+		 * relevant to the hash function together, which we use to adjust
+		 * our chain length
+		 */
+		if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl))
+			rthi = rth;
 	}
 
 	if (cand) {
@@ -1061,6 +1159,16 @@ restart:
 			*candp = cand->u.dst.rt_next;
 			rt_free(cand);
 		}
+	} else {
+		if (chain_length > rt_chain_length_max) {
+			struct net *net = dev_net(rt->u.dst.dev);
+			int num = ++net->ipv4.current_rt_cache_rebuild_count;
+			if (!rt_caching(dev_net(rt->u.dst.dev))) {
+				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
+					rt->u.dst.dev->name, num);
+			}
+			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
+		}
 	}
 
 	/* Try to bind route to arp only if it is output
@@ -1098,7 +1206,11 @@ restart:
 		}
 	}
 
-	rt->u.dst.rt_next = rt_hash_table[hash].chain;
+	if (rthi)
+		rt->u.dst.rt_next = rthi->u.dst.rt_next;
+	else
+		rt->u.dst.rt_next = rt_hash_table[hash].chain;
+
 #if RT_CACHE_DEBUG >= 2
 	if (rt->u.dst.rt_next) {
 		struct rtable *trt;
@@ -1114,7 +1226,11 @@ restart:
 	 * previous writes to rt are committed to memory
 	 * before making rt visible to other CPUs.
 	 */
-	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
+	if (rthi)
+		rcu_assign_pointer(rthi->u.dst.rt_next, rt);
+	else
+		rcu_assign_pointer(rt_hash_table[hash].chain, rt);
+
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 	*rp = rt;
 	return 0;
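
Taken together, the last two hunks insert the new route after rthi when the walk found a run with equal hash inputs, and otherwise at the chain head; either way the pointer is published with rcu_assign_pointer(), whose write barrier keeps a lockless reader from seeing rt before its fields are initialized. A minimal user-space sketch of that publish step, using a C11 release store in place of rcu_assign_pointer() (types and names are illustrative, not the kernel's):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct entry {
	unsigned int daddr, saddr;	/* the "hash inputs" */
	struct entry *_Atomic next;
};

/* Insert e after run_tail when the caller found a matching run,
 * otherwise at the head. The release store plays the role of
 * rcu_assign_pointer(): the writes that initialized *e cannot be
 * reordered past the pointer publication. */
static void chain_insert(struct entry *_Atomic *head,
			 struct entry *run_tail, struct entry *e)
{
	struct entry *_Atomic *slot = run_tail ? &run_tail->next : head;

	e->next = atomic_load_explicit(slot, memory_order_relaxed);
	atomic_store_explicit(slot, e, memory_order_release);
}

int main(void)
{
	static struct entry a = { 1, 2, NULL }, b = { 1, 2, NULL };
	struct entry *_Atomic head = NULL;

	chain_insert(&head, NULL, &a);	/* no matching run: head insert */
	chain_insert(&head, &a, &b);	/* same inputs as a: insert after it */
	printf("a.next == &b? %d\n",
	       atomic_load_explicit(&a.next, memory_order_relaxed) == &b);
	return 0;
}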
@@ -1217,6 +1333,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 	    || ipv4_is_zeronet(new_gw))
 		goto reject_redirect;
 
+	if (!rt_caching(net))
+		goto reject_redirect;
+
 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 			goto reject_redirect;
@@ -2130,6 +2249,10 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	struct net *net;
 
 	net = dev_net(dev);
+
+	if (!rt_caching(net))
+		goto skip_cache;
+
 	tos &= IPTOS_RT_MASK;
 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
 
@@ -2154,6 +2277,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	}
 	rcu_read_unlock();
 
+skip_cache:
 	/* Multicast recognition logic is moved from route cache to here.
 	   The problem was that too many Ethernet cards have broken/missing
 	   hardware multicast filters :-( As a result the host on multicasting
@@ -2539,6 +2663,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
 	unsigned hash;
 	struct rtable *rth;
 
+	if (!rt_caching(net))
+		goto slow_output;
+
 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
 
 	rcu_read_lock_bh();
@@ -2563,6 +2690,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
 	}
 	rcu_read_unlock_bh();
 
+slow_output:
 	return ip_route_output_slow(net, rp, flp);
 }
 
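
Both lookup paths follow the same shape: when rt_caching() reports the cache disabled, the hash computation and chain walk are skipped entirely and control jumps straight to the slow path that consults the FIB (skip_cache on input, slow_output on output).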
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 1bb10df8ce7d..0cc8d31f9ac0 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -795,6 +795,14 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "rt_cache_rebuild_count",
+		.data		= &init_net.ipv4.sysctl_rt_cache_rebuild_count,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
 	{ }
 };
 
@@ -827,8 +835,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 			&net->ipv4.sysctl_icmp_ratelimit;
 		table[5].data =
 			&net->ipv4.sysctl_icmp_ratemask;
+		table[6].data =
+			&net->ipv4.sysctl_rt_cache_rebuild_count;
 	}
 
+	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
+
 	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
 			net_ipv4_ctl_path, table);
 	if (net->ipv4.ipv4_hdr == NULL)
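
With both pieces in place, each network namespace starts out with sysctl_rt_cache_rebuild_count = 4 and exposes it for tuning as net.ipv4.rt_cache_rebuild_count; once a namespace's current_rt_cache_rebuild_count climbs past that limit, rt_caching() turns false and every cache path patched above (input, output, redirects, rt_intern_hash) falls through to the uncached slow path.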