aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/fib_frontend.c1
-rw-r--r--net/ipv4/fib_semantics.c42
-rw-r--r--net/ipv4/fib_trie.c53
-rw-r--r--net/ipv4/ip_input.c2
-rw-r--r--net/ipv4/route.c183
-rw-r--r--net/ipv4/sysctl_net_ipv4.c11
-rw-r--r--net/ipv4/tcp_input.c3
-rw-r--r--net/ipv4/tcp_ipv4.c12
-rw-r--r--net/ipv4/tcp_minisocks.c3
-rw-r--r--net/ipv4/xfrm4_policy.c1
10 files changed, 205 insertions, 106 deletions
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 8732cc7920ed..c43ae3fba792 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1046,6 +1046,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
1046 1046
1047 if (event == NETDEV_UNREGISTER) { 1047 if (event == NETDEV_UNREGISTER) {
1048 fib_disable_ip(dev, 2, -1); 1048 fib_disable_ip(dev, 2, -1);
1049 rt_flush_dev(dev);
1049 return NOTIFY_DONE; 1050 return NOTIFY_DONE;
1050 } 1051 }
1051 1052
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index da0cc2e6b250..da80dc14cc76 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -140,6 +140,21 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
140 }, 140 },
141}; 141};
142 142
143static void rt_fibinfo_free(struct rtable __rcu **rtp)
144{
145 struct rtable *rt = rcu_dereference_protected(*rtp, 1);
146
147 if (!rt)
148 return;
149
150 /* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
151 * because we waited an RCU grace period before calling
152 * free_fib_info_rcu()
153 */
154
155 dst_free(&rt->dst);
156}
157
143static void free_nh_exceptions(struct fib_nh *nh) 158static void free_nh_exceptions(struct fib_nh *nh)
144{ 159{
145 struct fnhe_hash_bucket *hash = nh->nh_exceptions; 160 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
@@ -153,6 +168,9 @@ static void free_nh_exceptions(struct fib_nh *nh)
153 struct fib_nh_exception *next; 168 struct fib_nh_exception *next;
154 169
155 next = rcu_dereference_protected(fnhe->fnhe_next, 1); 170 next = rcu_dereference_protected(fnhe->fnhe_next, 1);
171
172 rt_fibinfo_free(&fnhe->fnhe_rth);
173
156 kfree(fnhe); 174 kfree(fnhe);
157 175
158 fnhe = next; 176 fnhe = next;
@@ -161,6 +179,23 @@ static void free_nh_exceptions(struct fib_nh *nh)
161 kfree(hash); 179 kfree(hash);
162} 180}
163 181
182static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
183{
184 int cpu;
185
186 if (!rtp)
187 return;
188
189 for_each_possible_cpu(cpu) {
190 struct rtable *rt;
191
192 rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
193 if (rt)
194 dst_free(&rt->dst);
195 }
196 free_percpu(rtp);
197}
198
164/* Release a nexthop info record */ 199/* Release a nexthop info record */
165static void free_fib_info_rcu(struct rcu_head *head) 200static void free_fib_info_rcu(struct rcu_head *head)
166{ 201{
@@ -171,10 +206,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
171 dev_put(nexthop_nh->nh_dev); 206 dev_put(nexthop_nh->nh_dev);
172 if (nexthop_nh->nh_exceptions) 207 if (nexthop_nh->nh_exceptions)
173 free_nh_exceptions(nexthop_nh); 208 free_nh_exceptions(nexthop_nh);
174 if (nexthop_nh->nh_rth_output) 209 rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
175 dst_free(&nexthop_nh->nh_rth_output->dst); 210 rt_fibinfo_free(&nexthop_nh->nh_rth_input);
176 if (nexthop_nh->nh_rth_input)
177 dst_free(&nexthop_nh->nh_rth_input->dst);
178 } endfor_nexthops(fi); 211 } endfor_nexthops(fi);
179 212
180 release_net(fi->fib_net); 213 release_net(fi->fib_net);
@@ -804,6 +837,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
804 fi->fib_nhs = nhs; 837 fi->fib_nhs = nhs;
805 change_nexthops(fi) { 838 change_nexthops(fi) {
806 nexthop_nh->nh_parent = fi; 839 nexthop_nh->nh_parent = fi;
840 nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
807 } endfor_nexthops(fi) 841 } endfor_nexthops(fi)
808 842
809 if (cfg->fc_mx) { 843 if (cfg->fc_mx) {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 18cbc15b20d5..f0cdb30921c0 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -159,7 +159,6 @@ struct trie {
159#endif 159#endif
160}; 160};
161 161
162static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
163static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, 162static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
164 int wasfull); 163 int wasfull);
165static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); 164static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
@@ -473,7 +472,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
473 } 472 }
474 473
475 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), 474 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
476 sizeof(struct rt_trie_node) << bits); 475 sizeof(struct rt_trie_node *) << bits);
477 return tn; 476 return tn;
478} 477}
479 478
@@ -490,7 +489,7 @@ static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *
490 return ((struct tnode *) n)->pos == tn->pos + tn->bits; 489 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
491} 490}
492 491
493static inline void put_child(struct trie *t, struct tnode *tn, int i, 492static inline void put_child(struct tnode *tn, int i,
494 struct rt_trie_node *n) 493 struct rt_trie_node *n)
495{ 494{
496 tnode_put_child_reorg(tn, i, n, -1); 495 tnode_put_child_reorg(tn, i, n, -1);
@@ -754,8 +753,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
754 goto nomem; 753 goto nomem;
755 } 754 }
756 755
757 put_child(t, tn, 2*i, (struct rt_trie_node *) left); 756 put_child(tn, 2*i, (struct rt_trie_node *) left);
758 put_child(t, tn, 2*i+1, (struct rt_trie_node *) right); 757 put_child(tn, 2*i+1, (struct rt_trie_node *) right);
759 } 758 }
760 } 759 }
761 760
@@ -776,9 +775,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
776 if (tkey_extract_bits(node->key, 775 if (tkey_extract_bits(node->key,
777 oldtnode->pos + oldtnode->bits, 776 oldtnode->pos + oldtnode->bits,
778 1) == 0) 777 1) == 0)
779 put_child(t, tn, 2*i, node); 778 put_child(tn, 2*i, node);
780 else 779 else
781 put_child(t, tn, 2*i+1, node); 780 put_child(tn, 2*i+1, node);
782 continue; 781 continue;
783 } 782 }
784 783
@@ -786,8 +785,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
786 inode = (struct tnode *) node; 785 inode = (struct tnode *) node;
787 786
788 if (inode->bits == 1) { 787 if (inode->bits == 1) {
789 put_child(t, tn, 2*i, rtnl_dereference(inode->child[0])); 788 put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
790 put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1])); 789 put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
791 790
792 tnode_free_safe(inode); 791 tnode_free_safe(inode);
793 continue; 792 continue;
@@ -817,22 +816,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
817 */ 816 */
818 817
819 left = (struct tnode *) tnode_get_child(tn, 2*i); 818 left = (struct tnode *) tnode_get_child(tn, 2*i);
820 put_child(t, tn, 2*i, NULL); 819 put_child(tn, 2*i, NULL);
821 820
822 BUG_ON(!left); 821 BUG_ON(!left);
823 822
824 right = (struct tnode *) tnode_get_child(tn, 2*i+1); 823 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
825 put_child(t, tn, 2*i+1, NULL); 824 put_child(tn, 2*i+1, NULL);
826 825
827 BUG_ON(!right); 826 BUG_ON(!right);
828 827
829 size = tnode_child_length(left); 828 size = tnode_child_length(left);
830 for (j = 0; j < size; j++) { 829 for (j = 0; j < size; j++) {
831 put_child(t, left, j, rtnl_dereference(inode->child[j])); 830 put_child(left, j, rtnl_dereference(inode->child[j]));
832 put_child(t, right, j, rtnl_dereference(inode->child[j + size])); 831 put_child(right, j, rtnl_dereference(inode->child[j + size]));
833 } 832 }
834 put_child(t, tn, 2*i, resize(t, left)); 833 put_child(tn, 2*i, resize(t, left));
835 put_child(t, tn, 2*i+1, resize(t, right)); 834 put_child(tn, 2*i+1, resize(t, right));
836 835
837 tnode_free_safe(inode); 836 tnode_free_safe(inode);
838 } 837 }
@@ -877,7 +876,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
877 if (!newn) 876 if (!newn)
878 goto nomem; 877 goto nomem;
879 878
880 put_child(t, tn, i/2, (struct rt_trie_node *)newn); 879 put_child(tn, i/2, (struct rt_trie_node *)newn);
881 } 880 }
882 881
883 } 882 }
@@ -892,21 +891,21 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
892 if (left == NULL) { 891 if (left == NULL) {
893 if (right == NULL) /* Both are empty */ 892 if (right == NULL) /* Both are empty */
894 continue; 893 continue;
895 put_child(t, tn, i/2, right); 894 put_child(tn, i/2, right);
896 continue; 895 continue;
897 } 896 }
898 897
899 if (right == NULL) { 898 if (right == NULL) {
900 put_child(t, tn, i/2, left); 899 put_child(tn, i/2, left);
901 continue; 900 continue;
902 } 901 }
903 902
904 /* Two nonempty children */ 903 /* Two nonempty children */
905 newBinNode = (struct tnode *) tnode_get_child(tn, i/2); 904 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
906 put_child(t, tn, i/2, NULL); 905 put_child(tn, i/2, NULL);
907 put_child(t, newBinNode, 0, left); 906 put_child(newBinNode, 0, left);
908 put_child(t, newBinNode, 1, right); 907 put_child(newBinNode, 1, right);
909 put_child(t, tn, i/2, resize(t, newBinNode)); 908 put_child(tn, i/2, resize(t, newBinNode));
910 } 909 }
911 tnode_free_safe(oldtnode); 910 tnode_free_safe(oldtnode);
912 return tn; 911 return tn;
@@ -1125,7 +1124,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1125 node_set_parent((struct rt_trie_node *)l, tp); 1124 node_set_parent((struct rt_trie_node *)l, tp);
1126 1125
1127 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1126 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1128 put_child(t, tp, cindex, (struct rt_trie_node *)l); 1127 put_child(tp, cindex, (struct rt_trie_node *)l);
1129 } else { 1128 } else {
1130 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1129 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1131 /* 1130 /*
@@ -1155,12 +1154,12 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1155 node_set_parent((struct rt_trie_node *)tn, tp); 1154 node_set_parent((struct rt_trie_node *)tn, tp);
1156 1155
1157 missbit = tkey_extract_bits(key, newpos, 1); 1156 missbit = tkey_extract_bits(key, newpos, 1);
1158 put_child(t, tn, missbit, (struct rt_trie_node *)l); 1157 put_child(tn, missbit, (struct rt_trie_node *)l);
1159 put_child(t, tn, 1-missbit, n); 1158 put_child(tn, 1-missbit, n);
1160 1159
1161 if (tp) { 1160 if (tp) {
1162 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1161 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1163 put_child(t, tp, cindex, (struct rt_trie_node *)tn); 1162 put_child(tp, cindex, (struct rt_trie_node *)tn);
1164 } else { 1163 } else {
1165 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); 1164 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1166 tp = tn; 1165 tp = tn;
@@ -1619,7 +1618,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
1619 1618
1620 if (tp) { 1619 if (tp) {
1621 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); 1620 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1622 put_child(t, tp, cindex, NULL); 1621 put_child(tp, cindex, NULL);
1623 trie_rebalance(t, tp); 1622 trie_rebalance(t, tp);
1624 } else 1623 } else
1625 RCU_INIT_POINTER(t->trie, NULL); 1624 RCU_INIT_POINTER(t->trie, NULL);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 981ff1eef28c..f1395a6fb35f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -325,14 +325,12 @@ static int ip_rcv_finish(struct sk_buff *skb)
325 const struct net_protocol *ipprot; 325 const struct net_protocol *ipprot;
326 int protocol = iph->protocol; 326 int protocol = iph->protocol;
327 327
328 rcu_read_lock();
329 ipprot = rcu_dereference(inet_protos[protocol]); 328 ipprot = rcu_dereference(inet_protos[protocol]);
330 if (ipprot && ipprot->early_demux) { 329 if (ipprot && ipprot->early_demux) {
331 ipprot->early_demux(skb); 330 ipprot->early_demux(skb);
332 /* must reload iph, skb->head might have changed */ 331 /* must reload iph, skb->head might have changed */
333 iph = ip_hdr(skb); 332 iph = ip_hdr(skb);
334 } 333 }
335 rcu_read_unlock();
336 } 334 }
337 335
338 /* 336 /*
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fc1a81ca79a7..c035251beb07 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -147,6 +147,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
147 struct sk_buff *skb, u32 mtu); 147 struct sk_buff *skb, u32 mtu);
148static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, 148static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
149 struct sk_buff *skb); 149 struct sk_buff *skb);
150static void ipv4_dst_destroy(struct dst_entry *dst);
150 151
151static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 152static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
152 int how) 153 int how)
@@ -170,6 +171,7 @@ static struct dst_ops ipv4_dst_ops = {
170 .default_advmss = ipv4_default_advmss, 171 .default_advmss = ipv4_default_advmss,
171 .mtu = ipv4_mtu, 172 .mtu = ipv4_mtu,
172 .cow_metrics = ipv4_cow_metrics, 173 .cow_metrics = ipv4_cow_metrics,
174 .destroy = ipv4_dst_destroy,
173 .ifdown = ipv4_dst_ifdown, 175 .ifdown = ipv4_dst_ifdown,
174 .negative_advice = ipv4_negative_advice, 176 .negative_advice = ipv4_negative_advice,
175 .link_failure = ipv4_link_failure, 177 .link_failure = ipv4_link_failure,
@@ -587,11 +589,17 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
587 build_sk_flow_key(fl4, sk); 589 build_sk_flow_key(fl4, sk);
588} 590}
589 591
590static DEFINE_SEQLOCK(fnhe_seqlock); 592static inline void rt_free(struct rtable *rt)
593{
594 call_rcu(&rt->dst.rcu_head, dst_rcu_free);
595}
596
597static DEFINE_SPINLOCK(fnhe_lock);
591 598
592static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) 599static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
593{ 600{
594 struct fib_nh_exception *fnhe, *oldest; 601 struct fib_nh_exception *fnhe, *oldest;
602 struct rtable *orig;
595 603
596 oldest = rcu_dereference(hash->chain); 604 oldest = rcu_dereference(hash->chain);
597 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; 605 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
@@ -599,6 +607,11 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
599 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) 607 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
600 oldest = fnhe; 608 oldest = fnhe;
601 } 609 }
610 orig = rcu_dereference(oldest->fnhe_rth);
611 if (orig) {
612 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
613 rt_free(orig);
614 }
602 return oldest; 615 return oldest;
603} 616}
604 617
@@ -620,7 +633,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
620 int depth; 633 int depth;
621 u32 hval = fnhe_hashfun(daddr); 634 u32 hval = fnhe_hashfun(daddr);
622 635
623 write_seqlock_bh(&fnhe_seqlock); 636 spin_lock_bh(&fnhe_lock);
624 637
625 hash = nh->nh_exceptions; 638 hash = nh->nh_exceptions;
626 if (!hash) { 639 if (!hash) {
@@ -667,7 +680,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
667 fnhe->fnhe_stamp = jiffies; 680 fnhe->fnhe_stamp = jiffies;
668 681
669out_unlock: 682out_unlock:
670 write_sequnlock_bh(&fnhe_seqlock); 683 spin_unlock_bh(&fnhe_lock);
671 return; 684 return;
672} 685}
673 686
@@ -1164,53 +1177,62 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1164 return NULL; 1177 return NULL;
1165} 1178}
1166 1179
1167static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, 1180static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1168 __be32 daddr) 1181 __be32 daddr)
1169{ 1182{
1170 __be32 fnhe_daddr, gw; 1183 bool ret = false;
1171 unsigned long expires; 1184
1172 unsigned int seq; 1185 spin_lock_bh(&fnhe_lock);
1173 u32 pmtu;
1174
1175restart:
1176 seq = read_seqbegin(&fnhe_seqlock);
1177 fnhe_daddr = fnhe->fnhe_daddr;
1178 gw = fnhe->fnhe_gw;
1179 pmtu = fnhe->fnhe_pmtu;
1180 expires = fnhe->fnhe_expires;
1181 if (read_seqretry(&fnhe_seqlock, seq))
1182 goto restart;
1183
1184 if (daddr != fnhe_daddr)
1185 return;
1186 1186
1187 if (pmtu) { 1187 if (daddr == fnhe->fnhe_daddr) {
1188 unsigned long diff = expires - jiffies; 1188 struct rtable *orig;
1189 1189
1190 if (time_before(jiffies, expires)) { 1190 if (fnhe->fnhe_pmtu) {
1191 rt->rt_pmtu = pmtu; 1191 unsigned long expires = fnhe->fnhe_expires;
1192 dst_set_expires(&rt->dst, diff); 1192 unsigned long diff = expires - jiffies;
1193
1194 if (time_before(jiffies, expires)) {
1195 rt->rt_pmtu = fnhe->fnhe_pmtu;
1196 dst_set_expires(&rt->dst, diff);
1197 }
1193 } 1198 }
1199 if (fnhe->fnhe_gw) {
1200 rt->rt_flags |= RTCF_REDIRECTED;
1201 rt->rt_gateway = fnhe->fnhe_gw;
1202 }
1203
1204 orig = rcu_dereference(fnhe->fnhe_rth);
1205 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1206 if (orig)
1207 rt_free(orig);
1208
1209 fnhe->fnhe_stamp = jiffies;
1210 ret = true;
1211 } else {
1212 /* Routes we intend to cache in nexthop exception have
1213 * the DST_NOCACHE bit clear. However, if we are
1214 * unsuccessful at storing this route into the cache
1215 * we really need to set it.
1216 */
1217 rt->dst.flags |= DST_NOCACHE;
1194 } 1218 }
1195 if (gw) { 1219 spin_unlock_bh(&fnhe_lock);
1196 rt->rt_flags |= RTCF_REDIRECTED;
1197 rt->rt_gateway = gw;
1198 }
1199 fnhe->fnhe_stamp = jiffies;
1200}
1201 1220
1202static inline void rt_free(struct rtable *rt) 1221 return ret;
1203{
1204 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
1205} 1222}
1206 1223
1207static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) 1224static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1208{ 1225{
1209 struct rtable *orig, *prev, **p = &nh->nh_rth_output; 1226 struct rtable *orig, *prev, **p;
1210 1227 bool ret = true;
1211 if (rt_is_input_route(rt))
1212 p = &nh->nh_rth_input;
1213 1228
1229 if (rt_is_input_route(rt)) {
1230 p = (struct rtable **)&nh->nh_rth_input;
1231 } else {
1232 if (!nh->nh_pcpu_rth_output)
1233 goto nocache;
1234 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1235 }
1214 orig = *p; 1236 orig = *p;
1215 1237
1216 prev = cmpxchg(p, orig, rt); 1238 prev = cmpxchg(p, orig, rt);
@@ -1223,7 +1245,50 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1223 * unsuccessful at storing this route into the cache 1245 * unsuccessful at storing this route into the cache
1224 * we really need to set it. 1246 * we really need to set it.
1225 */ 1247 */
1248nocache:
1226 rt->dst.flags |= DST_NOCACHE; 1249 rt->dst.flags |= DST_NOCACHE;
1250 ret = false;
1251 }
1252
1253 return ret;
1254}
1255
1256static DEFINE_SPINLOCK(rt_uncached_lock);
1257static LIST_HEAD(rt_uncached_list);
1258
1259static void rt_add_uncached_list(struct rtable *rt)
1260{
1261 spin_lock_bh(&rt_uncached_lock);
1262 list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1263 spin_unlock_bh(&rt_uncached_lock);
1264}
1265
1266static void ipv4_dst_destroy(struct dst_entry *dst)
1267{
1268 struct rtable *rt = (struct rtable *) dst;
1269
1270 if (dst->flags & DST_NOCACHE) {
1271 spin_lock_bh(&rt_uncached_lock);
1272 list_del(&rt->rt_uncached);
1273 spin_unlock_bh(&rt_uncached_lock);
1274 }
1275}
1276
1277void rt_flush_dev(struct net_device *dev)
1278{
1279 if (!list_empty(&rt_uncached_list)) {
1280 struct net *net = dev_net(dev);
1281 struct rtable *rt;
1282
1283 spin_lock_bh(&rt_uncached_lock);
1284 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1285 if (rt->dst.dev != dev)
1286 continue;
1287 rt->dst.dev = net->loopback_dev;
1288 dev_hold(rt->dst.dev);
1289 dev_put(dev);
1290 }
1291 spin_unlock_bh(&rt_uncached_lock);
1227 } 1292 }
1228} 1293}
1229 1294
@@ -1239,20 +1304,24 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1239 struct fib_nh_exception *fnhe, 1304 struct fib_nh_exception *fnhe,
1240 struct fib_info *fi, u16 type, u32 itag) 1305 struct fib_info *fi, u16 type, u32 itag)
1241{ 1306{
1307 bool cached = false;
1308
1242 if (fi) { 1309 if (fi) {
1243 struct fib_nh *nh = &FIB_RES_NH(*res); 1310 struct fib_nh *nh = &FIB_RES_NH(*res);
1244 1311
1245 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) 1312 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1246 rt->rt_gateway = nh->nh_gw; 1313 rt->rt_gateway = nh->nh_gw;
1247 if (unlikely(fnhe))
1248 rt_bind_exception(rt, fnhe, daddr);
1249 dst_init_metrics(&rt->dst, fi->fib_metrics, true); 1314 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1250#ifdef CONFIG_IP_ROUTE_CLASSID 1315#ifdef CONFIG_IP_ROUTE_CLASSID
1251 rt->dst.tclassid = nh->nh_tclassid; 1316 rt->dst.tclassid = nh->nh_tclassid;
1252#endif 1317#endif
1253 if (!(rt->dst.flags & DST_NOCACHE)) 1318 if (unlikely(fnhe))
1254 rt_cache_route(nh, rt); 1319 cached = rt_bind_exception(rt, fnhe, daddr);
1320 else if (!(rt->dst.flags & DST_NOCACHE))
1321 cached = rt_cache_route(nh, rt);
1255 } 1322 }
1323 if (unlikely(!cached))
1324 rt_add_uncached_list(rt);
1256 1325
1257#ifdef CONFIG_IP_ROUTE_CLASSID 1326#ifdef CONFIG_IP_ROUTE_CLASSID
1258#ifdef CONFIG_IP_MULTIPLE_TABLES 1327#ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -1319,6 +1388,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1319 rth->rt_iif = 0; 1388 rth->rt_iif = 0;
1320 rth->rt_pmtu = 0; 1389 rth->rt_pmtu = 0;
1321 rth->rt_gateway = 0; 1390 rth->rt_gateway = 0;
1391 INIT_LIST_HEAD(&rth->rt_uncached);
1322 if (our) { 1392 if (our) {
1323 rth->dst.input= ip_local_deliver; 1393 rth->dst.input= ip_local_deliver;
1324 rth->rt_flags |= RTCF_LOCAL; 1394 rth->rt_flags |= RTCF_LOCAL;
@@ -1420,7 +1490,7 @@ static int __mkroute_input(struct sk_buff *skb,
1420 do_cache = false; 1490 do_cache = false;
1421 if (res->fi) { 1491 if (res->fi) {
1422 if (!itag) { 1492 if (!itag) {
1423 rth = FIB_RES_NH(*res).nh_rth_input; 1493 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1424 if (rt_cache_valid(rth)) { 1494 if (rt_cache_valid(rth)) {
1425 skb_dst_set_noref(skb, &rth->dst); 1495 skb_dst_set_noref(skb, &rth->dst);
1426 goto out; 1496 goto out;
@@ -1444,6 +1514,7 @@ static int __mkroute_input(struct sk_buff *skb,
1444 rth->rt_iif = 0; 1514 rth->rt_iif = 0;
1445 rth->rt_pmtu = 0; 1515 rth->rt_pmtu = 0;
1446 rth->rt_gateway = 0; 1516 rth->rt_gateway = 0;
1517 INIT_LIST_HEAD(&rth->rt_uncached);
1447 1518
1448 rth->dst.input = ip_forward; 1519 rth->dst.input = ip_forward;
1449 rth->dst.output = ip_output; 1520 rth->dst.output = ip_output;
@@ -1582,7 +1653,7 @@ local_input:
1582 do_cache = false; 1653 do_cache = false;
1583 if (res.fi) { 1654 if (res.fi) {
1584 if (!itag) { 1655 if (!itag) {
1585 rth = FIB_RES_NH(res).nh_rth_input; 1656 rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1586 if (rt_cache_valid(rth)) { 1657 if (rt_cache_valid(rth)) {
1587 skb_dst_set_noref(skb, &rth->dst); 1658 skb_dst_set_noref(skb, &rth->dst);
1588 err = 0; 1659 err = 0;
@@ -1610,6 +1681,7 @@ local_input:
1610 rth->rt_iif = 0; 1681 rth->rt_iif = 0;
1611 rth->rt_pmtu = 0; 1682 rth->rt_pmtu = 0;
1612 rth->rt_gateway = 0; 1683 rth->rt_gateway = 0;
1684 INIT_LIST_HEAD(&rth->rt_uncached);
1613 if (res.type == RTN_UNREACHABLE) { 1685 if (res.type == RTN_UNREACHABLE) {
1614 rth->dst.input= ip_error; 1686 rth->dst.input= ip_error;
1615 rth->dst.error= -err; 1687 rth->dst.error= -err;
@@ -1748,19 +1820,23 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
1748 1820
1749 fnhe = NULL; 1821 fnhe = NULL;
1750 if (fi) { 1822 if (fi) {
1823 struct rtable __rcu **prth;
1824
1751 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); 1825 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1752 if (!fnhe) { 1826 if (fnhe)
1753 rth = FIB_RES_NH(*res).nh_rth_output; 1827 prth = &fnhe->fnhe_rth;
1754 if (rt_cache_valid(rth)) { 1828 else
1755 dst_hold(&rth->dst); 1829 prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
1756 return rth; 1830 rth = rcu_dereference(*prth);
1757 } 1831 if (rt_cache_valid(rth)) {
1832 dst_hold(&rth->dst);
1833 return rth;
1758 } 1834 }
1759 } 1835 }
1760 rth = rt_dst_alloc(dev_out, 1836 rth = rt_dst_alloc(dev_out,
1761 IN_DEV_CONF_GET(in_dev, NOPOLICY), 1837 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1762 IN_DEV_CONF_GET(in_dev, NOXFRM), 1838 IN_DEV_CONF_GET(in_dev, NOXFRM),
1763 fi && !fnhe); 1839 fi);
1764 if (!rth) 1840 if (!rth)
1765 return ERR_PTR(-ENOBUFS); 1841 return ERR_PTR(-ENOBUFS);
1766 1842
@@ -1773,6 +1849,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
1773 rth->rt_iif = orig_oif ? : 0; 1849 rth->rt_iif = orig_oif ? : 0;
1774 rth->rt_pmtu = 0; 1850 rth->rt_pmtu = 0;
1775 rth->rt_gateway = 0; 1851 rth->rt_gateway = 0;
1852 INIT_LIST_HEAD(&rth->rt_uncached);
1776 1853
1777 RT_CACHE_STAT_INC(out_slow_tot); 1854 RT_CACHE_STAT_INC(out_slow_tot);
1778 1855
@@ -2052,6 +2129,8 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
2052 rt->rt_type = ort->rt_type; 2129 rt->rt_type = ort->rt_type;
2053 rt->rt_gateway = ort->rt_gateway; 2130 rt->rt_gateway = ort->rt_gateway;
2054 2131
2132 INIT_LIST_HEAD(&rt->rt_uncached);
2133
2055 dst_free(new); 2134 dst_free(new);
2056 } 2135 }
2057 2136
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 5840c3255721..4b6487a68279 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -784,13 +784,6 @@ static struct ctl_table ipv4_net_table[] = {
784 .proc_handler = proc_dointvec 784 .proc_handler = proc_dointvec
785 }, 785 },
786 { 786 {
787 .procname = "rt_cache_rebuild_count",
788 .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count,
789 .maxlen = sizeof(int),
790 .mode = 0644,
791 .proc_handler = proc_dointvec
792 },
793 {
794 .procname = "ping_group_range", 787 .procname = "ping_group_range",
795 .data = &init_net.ipv4.sysctl_ping_group_range, 788 .data = &init_net.ipv4.sysctl_ping_group_range,
796 .maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range), 789 .maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range),
@@ -829,8 +822,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
829 table[5].data = 822 table[5].data =
830 &net->ipv4.sysctl_icmp_ratemask; 823 &net->ipv4.sysctl_icmp_ratemask;
831 table[6].data = 824 table[6].data =
832 &net->ipv4.sysctl_rt_cache_rebuild_count;
833 table[7].data =
834 &net->ipv4.sysctl_ping_group_range; 825 &net->ipv4.sysctl_ping_group_range;
835 826
836 } 827 }
@@ -842,8 +833,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
842 net->ipv4.sysctl_ping_group_range[0] = 1; 833 net->ipv4.sysctl_ping_group_range[0] = 1;
843 net->ipv4.sysctl_ping_group_range[1] = 0; 834 net->ipv4.sysctl_ping_group_range[1] = 0;
844 835
845 net->ipv4.sysctl_rt_cache_rebuild_count = 4;
846
847 tcp_init_mem(net); 836 tcp_init_mem(net);
848 837
849 net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); 838 net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a356e1fecf9a..9be30b039ae3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5604,8 +5604,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5604 tcp_set_state(sk, TCP_ESTABLISHED); 5604 tcp_set_state(sk, TCP_ESTABLISHED);
5605 5605
5606 if (skb != NULL) { 5606 if (skb != NULL) {
5607 sk->sk_rx_dst = dst_clone(skb_dst(skb)); 5607 inet_sk_rx_dst_set(sk, skb);
5608 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
5609 security_inet_conn_established(sk, skb); 5608 security_inet_conn_established(sk, skb);
5610 } 5609 }
5611 5610
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2fbd9921253f..7f91e5ac8277 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1617,19 +1617,19 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1617#endif 1617#endif
1618 1618
1619 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1619 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1620 struct dst_entry *dst = sk->sk_rx_dst;
1621
1620 sock_rps_save_rxhash(sk, skb); 1622 sock_rps_save_rxhash(sk, skb);
1621 if (sk->sk_rx_dst) { 1623 if (dst) {
1622 struct dst_entry *dst = sk->sk_rx_dst;
1623 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || 1624 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1624 dst->ops->check(dst, 0) == NULL) { 1625 dst->ops->check(dst, 0) == NULL) {
1625 dst_release(dst); 1626 dst_release(dst);
1626 sk->sk_rx_dst = NULL; 1627 sk->sk_rx_dst = NULL;
1627 } 1628 }
1628 } 1629 }
1629 if (unlikely(sk->sk_rx_dst == NULL)) { 1630 if (unlikely(sk->sk_rx_dst == NULL))
1630 sk->sk_rx_dst = dst_clone(skb_dst(skb)); 1631 inet_sk_rx_dst_set(sk, skb);
1631 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; 1632
1632 }
1633 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { 1633 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1634 rsk = sk; 1634 rsk = sk;
1635 goto reset; 1635 goto reset;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 3f1cc2028edd..232a90c3ec86 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -387,8 +387,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
387 struct tcp_sock *oldtp = tcp_sk(sk); 387 struct tcp_sock *oldtp = tcp_sk(sk);
388 struct tcp_cookie_values *oldcvp = oldtp->cookie_values; 388 struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
389 389
390 newsk->sk_rx_dst = dst_clone(skb_dst(skb)); 390 inet_sk_rx_dst_set(newsk, skb);
391 inet_sk(newsk)->rx_dst_ifindex = skb->skb_iif;
392 391
393 /* TCP Cookie Transactions require space for the cookie pair, 392 /* TCP Cookie Transactions require space for the cookie pair,
394 * as it differs for each connection. There is no need to 393 * as it differs for each connection. There is no need to
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index c6281847f16a..681ea2f413e2 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -92,6 +92,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
92 xdst->u.rt.rt_type = rt->rt_type; 92 xdst->u.rt.rt_type = rt->rt_type;
93 xdst->u.rt.rt_gateway = rt->rt_gateway; 93 xdst->u.rt.rt_gateway = rt->rt_gateway;
94 xdst->u.rt.rt_pmtu = rt->rt_pmtu; 94 xdst->u.rt.rt_pmtu = rt->rt_pmtu;
95 INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
95 96
96 return 0; 97 return 0;
97} 98}