aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric Dumazet <dada1@cosmosbay.com>2009-05-19 16:14:28 -0400
committerDavid S. Miller <davem@davemloft.net>2009-05-20 20:18:02 -0400
commit1ddbcb005c395518c2cd0df504cff3d4b5c85853 (patch)
tree03567b8b50d3094ae13c64b44890f9c0d53361b7
parentcf8da764fc6959b7efb482f375dfef9830e98205 (diff)
net: fix rtable leak in net/ipv4/route.c
Alexander V. Lukyanov found a regression in 2.6.29 and made a complete analysis found in http://bugzilla.kernel.org/show_bug.cgi?id=13339 Quoted here because its a perfect one : begin_of_quotation 2.6.29 patch has introduced flexible route cache rebuilding. Unfortunately the patch has at least one critical flaw, and another problem. rt_intern_hash calculates rthi pointer, which is later used for new entry insertion. The same loop calculates cand pointer which is used to clean the list. If the pointers are the same, rtable leak occurs, as first the cand is removed then the new entry is appended to it. This leak leads to unregister_netdevice problem (usage count > 0). Another problem of the patch is that it tries to insert the entries in certain order, to facilitate counting of entries distinct by all but QoS parameters. Unfortunately, referencing an existing rtable entry moves it to list beginning, to speed up further lookups, so the carefully built order is destroyed. For the first problem the simplest patch it to set rthi=0 when rthi==cand, but it will also destroy the ordering. end_of_quotation Problematic commit is 1080d709fb9d8cd4392f93476ee46a9d6ea05a5b (net: implement emergency route cache rebulds when gc_elasticity is exceeded) Trying to keep dst_entries ordered is too complex and breaks the fact that order should depend on the frequency of use for garbage collection. A possible fix is to make rt_intern_hash() simpler, and only makes rt_check_expire() a litle bit smarter, being able to cope with an arbitrary entries order. The added loop is running on cache hot data, while cpu is prefetching next object, so should be unnoticied. Reported-and-analyzed-by: Alexander V. Lukyanov <lav@yar.ru> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Acked-by: Neil Horman <nhorman@tuxdriver.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--net/ipv4/route.c55
1 files changed, 17 insertions, 38 deletions
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 869cf1c44b78..28205e5bfa9b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -784,7 +784,7 @@ static void rt_check_expire(void)
784{ 784{
785 static unsigned int rover; 785 static unsigned int rover;
786 unsigned int i = rover, goal; 786 unsigned int i = rover, goal;
787 struct rtable *rth, **rthp; 787 struct rtable *rth, *aux, **rthp;
788 unsigned long samples = 0; 788 unsigned long samples = 0;
789 unsigned long sum = 0, sum2 = 0; 789 unsigned long sum = 0, sum2 = 0;
790 u64 mult; 790 u64 mult;
@@ -812,6 +812,7 @@ static void rt_check_expire(void)
812 length = 0; 812 length = 0;
813 spin_lock_bh(rt_hash_lock_addr(i)); 813 spin_lock_bh(rt_hash_lock_addr(i));
814 while ((rth = *rthp) != NULL) { 814 while ((rth = *rthp) != NULL) {
815 prefetch(rth->u.dst.rt_next);
815 if (rt_is_expired(rth)) { 816 if (rt_is_expired(rth)) {
816 *rthp = rth->u.dst.rt_next; 817 *rthp = rth->u.dst.rt_next;
817 rt_free(rth); 818 rt_free(rth);
@@ -820,33 +821,30 @@ static void rt_check_expire(void)
820 if (rth->u.dst.expires) { 821 if (rth->u.dst.expires) {
821 /* Entry is expired even if it is in use */ 822 /* Entry is expired even if it is in use */
822 if (time_before_eq(jiffies, rth->u.dst.expires)) { 823 if (time_before_eq(jiffies, rth->u.dst.expires)) {
824nofree:
823 tmo >>= 1; 825 tmo >>= 1;
824 rthp = &rth->u.dst.rt_next; 826 rthp = &rth->u.dst.rt_next;
825 /* 827 /*
826 * Only bump our length if the hash 828 * We only count entries on
827 * inputs on entries n and n+1 are not
828 * the same, we only count entries on
829 * a chain with equal hash inputs once 829 * a chain with equal hash inputs once
830 * so that entries for different QOS 830 * so that entries for different QOS
831 * levels, and other non-hash input 831 * levels, and other non-hash input
832 * attributes don't unfairly skew 832 * attributes don't unfairly skew
833 * the length computation 833 * the length computation
834 */ 834 */
835 if ((*rthp == NULL) || 835 for (aux = rt_hash_table[i].chain;;) {
836 !compare_hash_inputs(&(*rthp)->fl, 836 if (aux == rth) {
837 &rth->fl)) 837 length += ONE;
838 length += ONE; 838 break;
839 }
840 if (compare_hash_inputs(&aux->fl, &rth->fl))
841 break;
842 aux = aux->u.dst.rt_next;
843 }
839 continue; 844 continue;
840 } 845 }
841 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { 846 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
842 tmo >>= 1; 847 goto nofree;
843 rthp = &rth->u.dst.rt_next;
844 if ((*rthp == NULL) ||
845 !compare_hash_inputs(&(*rthp)->fl,
846 &rth->fl))
847 length += ONE;
848 continue;
849 }
850 848
851 /* Cleanup aged off entries. */ 849 /* Cleanup aged off entries. */
852 *rthp = rth->u.dst.rt_next; 850 *rthp = rth->u.dst.rt_next;
@@ -1069,7 +1067,6 @@ out: return 0;
1069static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) 1067static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
1070{ 1068{
1071 struct rtable *rth, **rthp; 1069 struct rtable *rth, **rthp;
1072 struct rtable *rthi;
1073 unsigned long now; 1070 unsigned long now;
1074 struct rtable *cand, **candp; 1071 struct rtable *cand, **candp;
1075 u32 min_score; 1072 u32 min_score;
@@ -1089,7 +1086,6 @@ restart:
1089 } 1086 }
1090 1087
1091 rthp = &rt_hash_table[hash].chain; 1088 rthp = &rt_hash_table[hash].chain;
1092 rthi = NULL;
1093 1089
1094 spin_lock_bh(rt_hash_lock_addr(hash)); 1090 spin_lock_bh(rt_hash_lock_addr(hash));
1095 while ((rth = *rthp) != NULL) { 1091 while ((rth = *rthp) != NULL) {
@@ -1135,17 +1131,6 @@ restart:
1135 chain_length++; 1131 chain_length++;
1136 1132
1137 rthp = &rth->u.dst.rt_next; 1133 rthp = &rth->u.dst.rt_next;
1138
1139 /*
1140 * check to see if the next entry in the chain
1141 * contains the same hash input values as rt. If it does
1142 * This is where we will insert into the list, instead of
1143 * at the head. This groups entries that differ by aspects not
1144 * relvant to the hash function together, which we use to adjust
1145 * our chain length
1146 */
1147 if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl))
1148 rthi = rth;
1149 } 1134 }
1150 1135
1151 if (cand) { 1136 if (cand) {
@@ -1206,10 +1191,7 @@ restart:
1206 } 1191 }
1207 } 1192 }
1208 1193
1209 if (rthi) 1194 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1210 rt->u.dst.rt_next = rthi->u.dst.rt_next;
1211 else
1212 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1213 1195
1214#if RT_CACHE_DEBUG >= 2 1196#if RT_CACHE_DEBUG >= 2
1215 if (rt->u.dst.rt_next) { 1197 if (rt->u.dst.rt_next) {
@@ -1225,10 +1207,7 @@ restart:
1225 * previous writes to rt are comitted to memory 1207 * previous writes to rt are comitted to memory
1226 * before making rt visible to other CPUS. 1208 * before making rt visible to other CPUS.
1227 */ 1209 */
1228 if (rthi) 1210 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1229 rcu_assign_pointer(rthi->u.dst.rt_next, rt);
1230 else
1231 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1232 1211
1233 spin_unlock_bh(rt_hash_lock_addr(hash)); 1212 spin_unlock_bh(rt_hash_lock_addr(hash));
1234 *rp = rt; 1213 *rp = rt;