aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv6/route.c
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2011-01-26 23:51:05 -0500
committerDavid S. Miller <davem@davemloft.net>2011-01-26 23:51:05 -0500
commit62fa8a846d7de4b299232e330c74b7783539df76 (patch)
treee401dbdbf4b11cbd27bdc3a47d9dc8b512173c9f /net/ipv6/route.c
parentb4e69ac670d71b5748dc81e536b2cb103489badd (diff)
net: Implement read-only protection and COW'ing of metrics.
Routing metrics are now copy-on-write. Initially a route entry points its metrics at a read-only location. If a routing table entry exists, it will point there. Else it will point at the all-zero metric place-holder called 'dst_default_metrics'. The writeability state of the metrics is stored in the low bits of the metrics pointer; we have two bits left to spare if we want to store more states. For the initial implementation, COW is implemented simply via kmalloc. However future enhancements will change this to place the writable metrics somewhere else, in order to increase sharing. Very likely this "somewhere else" will be the inetpeer cache. Note also that this means that metrics updates may transiently fail if we cannot COW the metrics successfully. But even by itself, this patch should decrease memory usage and increase cache locality especially for routing workloads. In those cases the read-only metric copies stay in place and never get written to. TCP workloads where metrics get updated, and those rare cases where PMTU triggers occur, will take a very slight performance hit. But that hit will be alleviated when the long-term writable metrics move to a more sharable location. Since the metrics storage went from a u32 array of RTAX_MAX entries to what is essentially a pointer, some retooling of the dst_entry layout was necessary. Most importantly, we need to preserve the alignment of the reference count so that it doesn't share cache lines with the read-mostly state, as per Eric Dumazet's alignment assertion checks. The only non-trivial bit here is the move of the 'flags' member into the writeable cacheline. This is OK since we are always accessing the flags around the same moment when we made a modification to the reference count. Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv6/route.c')
-rw-r--r--net/ipv6/route.c15
1 files changed, 12 insertions, 3 deletions
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1534508f6c68..45fafa018f12 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -105,6 +105,7 @@ static struct dst_ops ip6_dst_ops_template = {
105 .check = ip6_dst_check, 105 .check = ip6_dst_check,
106 .default_advmss = ip6_default_advmss, 106 .default_advmss = ip6_default_advmss,
107 .default_mtu = ip6_default_mtu, 107 .default_mtu = ip6_default_mtu,
108 .cow_metrics = dst_cow_metrics_generic,
108 .destroy = ip6_dst_destroy, 109 .destroy = ip6_dst_destroy,
109 .ifdown = ip6_dst_ifdown, 110 .ifdown = ip6_dst_ifdown,
110 .negative_advice = ip6_negative_advice, 111 .negative_advice = ip6_negative_advice,
@@ -125,6 +126,10 @@ static struct dst_ops ip6_dst_blackhole_ops = {
125 .update_pmtu = ip6_rt_blackhole_update_pmtu, 126 .update_pmtu = ip6_rt_blackhole_update_pmtu,
126}; 127};
127 128
129static const u32 ip6_template_metrics[RTAX_MAX] = {
130 [RTAX_HOPLIMIT - 1] = 255,
131};
132
128static struct rt6_info ip6_null_entry_template = { 133static struct rt6_info ip6_null_entry_template = {
129 .dst = { 134 .dst = {
130 .__refcnt = ATOMIC_INIT(1), 135 .__refcnt = ATOMIC_INIT(1),
@@ -193,6 +198,7 @@ static void ip6_dst_destroy(struct dst_entry *dst)
193 rt->rt6i_idev = NULL; 198 rt->rt6i_idev = NULL;
194 in6_dev_put(idev); 199 in6_dev_put(idev);
195 } 200 }
201 dst_destroy_metrics_generic(dst);
196 if (peer) { 202 if (peer) {
197 BUG_ON(!(rt->rt6i_flags & RTF_CACHE)); 203 BUG_ON(!(rt->rt6i_flags & RTF_CACHE));
198 rt->rt6i_peer = NULL; 204 rt->rt6i_peer = NULL;
@@ -2681,7 +2687,8 @@ static int __net_init ip6_route_net_init(struct net *net)
2681 net->ipv6.ip6_null_entry->dst.path = 2687 net->ipv6.ip6_null_entry->dst.path =
2682 (struct dst_entry *)net->ipv6.ip6_null_entry; 2688 (struct dst_entry *)net->ipv6.ip6_null_entry;
2683 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 2689 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2684 dst_metric_set(&net->ipv6.ip6_null_entry->dst, RTAX_HOPLIMIT, 255); 2690 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2691 ip6_template_metrics, true);
2685 2692
2686#ifdef CONFIG_IPV6_MULTIPLE_TABLES 2693#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2687 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 2694 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
@@ -2692,7 +2699,8 @@ static int __net_init ip6_route_net_init(struct net *net)
2692 net->ipv6.ip6_prohibit_entry->dst.path = 2699 net->ipv6.ip6_prohibit_entry->dst.path =
2693 (struct dst_entry *)net->ipv6.ip6_prohibit_entry; 2700 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2694 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 2701 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2695 dst_metric_set(&net->ipv6.ip6_prohibit_entry->dst, RTAX_HOPLIMIT, 255); 2702 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2703 ip6_template_metrics, true);
2696 2704
2697 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 2705 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2698 sizeof(*net->ipv6.ip6_blk_hole_entry), 2706 sizeof(*net->ipv6.ip6_blk_hole_entry),
@@ -2702,7 +2710,8 @@ static int __net_init ip6_route_net_init(struct net *net)
2702 net->ipv6.ip6_blk_hole_entry->dst.path = 2710 net->ipv6.ip6_blk_hole_entry->dst.path =
2703 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry; 2711 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2704 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 2712 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2705 dst_metric_set(&net->ipv6.ip6_blk_hole_entry->dst, RTAX_HOPLIMIT, 255); 2713 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2714 ip6_template_metrics, true);
2706#endif 2715#endif
2707 2716
2708 net->ipv6.sysctl.flush_delay = 0; 2717 net->ipv6.sysctl.flush_delay = 0;