diff options
author | David S. Miller <davem@davemloft.net> | 2011-01-26 23:51:05 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2011-01-26 23:51:05 -0500 |
commit | 62fa8a846d7de4b299232e330c74b7783539df76 (patch) | |
tree | e401dbdbf4b11cbd27bdc3a47d9dc8b512173c9f /net/ipv4/route.c | |
parent | b4e69ac670d71b5748dc81e536b2cb103489badd (diff) |
net: Implement read-only protection and COW'ing of metrics.
Routing metrics are now copy-on-write.
Initially a route entry points its metrics at a read-only location.
If a routing table entry exists, it will point at that entry's metrics. Else it will
point at the all-zero metric place-holder called 'dst_default_metrics'.
The writeability state of the metrics is stored in the low bits of the
metrics pointer; we have two bits left to spare if we want to store
more states.
For the initial implementation, COW is implemented simply via kmalloc.
However future enhancements will change this to place the writable
metrics somewhere else, in order to increase sharing. Very likely
this "somewhere else" will be the inetpeer cache.
Note also that this means that metrics updates may transiently fail
if we cannot COW the metrics successfully.
But even by itself, this patch should decrease memory usage and
increase cache locality especially for routing workloads. In those
cases the read-only metric copies stay in place and never get written
to.
TCP workloads where metrics get updated, and those rare cases where
PMTU triggers occur, will take a very slight performance hit. But
that hit will be alleviated when the long-term writable metrics
move to a more sharable location.
Since the metrics storage went from a u32 array of RTAX_MAX entries to
what is essentially a pointer, some retooling of the dst_entry layout
was necessary.
Most importantly, we need to preserve the alignment of the reference
count so that it doesn't share cache lines with the read-mostly state,
as per Eric Dumazet's alignment assertion checks.
The only non-trivial bit here is the move of the 'flags' member into
the writeable cacheline. This is OK since we are always accessing the
flags around the same moment when we make a modification to the
reference count.
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r-- | net/ipv4/route.c | 45 |
1 files changed, 44 insertions, 1 deletions
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3e5b7cc2db4f..980030d4e4ae 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -152,6 +152,36 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, | |||
152 | { | 152 | { |
153 | } | 153 | } |
154 | 154 | ||
155 | static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) | ||
156 | { | ||
157 | u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); | ||
158 | |||
159 | if (p) { | ||
160 | u32 *old_p = __DST_METRICS_PTR(old); | ||
161 | unsigned long prev, new; | ||
162 | |||
163 | memcpy(p, old_p, sizeof(u32) * RTAX_MAX); | ||
164 | |||
165 | new = (unsigned long) p; | ||
166 | prev = cmpxchg(&dst->_metrics, old, new); | ||
167 | |||
168 | if (prev != old) { | ||
169 | kfree(p); | ||
170 | p = __DST_METRICS_PTR(prev); | ||
171 | if (prev & DST_METRICS_READ_ONLY) | ||
172 | p = NULL; | ||
173 | } else { | ||
174 | struct rtable *rt = (struct rtable *) dst; | ||
175 | |||
176 | if (rt->fi) { | ||
177 | fib_info_put(rt->fi); | ||
178 | rt->fi = NULL; | ||
179 | } | ||
180 | } | ||
181 | } | ||
182 | return p; | ||
183 | } | ||
184 | |||
155 | static struct dst_ops ipv4_dst_ops = { | 185 | static struct dst_ops ipv4_dst_ops = { |
156 | .family = AF_INET, | 186 | .family = AF_INET, |
157 | .protocol = cpu_to_be16(ETH_P_IP), | 187 | .protocol = cpu_to_be16(ETH_P_IP), |
@@ -159,6 +189,7 @@ static struct dst_ops ipv4_dst_ops = { | |||
159 | .check = ipv4_dst_check, | 189 | .check = ipv4_dst_check, |
160 | .default_advmss = ipv4_default_advmss, | 190 | .default_advmss = ipv4_default_advmss, |
161 | .default_mtu = ipv4_default_mtu, | 191 | .default_mtu = ipv4_default_mtu, |
192 | .cow_metrics = ipv4_cow_metrics, | ||
162 | .destroy = ipv4_dst_destroy, | 193 | .destroy = ipv4_dst_destroy, |
163 | .ifdown = ipv4_dst_ifdown, | 194 | .ifdown = ipv4_dst_ifdown, |
164 | .negative_advice = ipv4_negative_advice, | 195 | .negative_advice = ipv4_negative_advice, |
@@ -1441,6 +1472,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, | |||
1441 | 1472 | ||
1442 | if (rt->peer) | 1473 | if (rt->peer) |
1443 | atomic_inc(&rt->peer->refcnt); | 1474 | atomic_inc(&rt->peer->refcnt); |
1475 | if (rt->fi) | ||
1476 | atomic_inc(&rt->fi->fib_clntref); | ||
1444 | 1477 | ||
1445 | if (arp_bind_neighbour(&rt->dst) || | 1478 | if (arp_bind_neighbour(&rt->dst) || |
1446 | !(rt->dst.neighbour->nud_state & | 1479 | !(rt->dst.neighbour->nud_state & |
@@ -1720,6 +1753,11 @@ static void ipv4_dst_destroy(struct dst_entry *dst) | |||
1720 | struct rtable *rt = (struct rtable *) dst; | 1753 | struct rtable *rt = (struct rtable *) dst; |
1721 | struct inet_peer *peer = rt->peer; | 1754 | struct inet_peer *peer = rt->peer; |
1722 | 1755 | ||
1756 | dst_destroy_metrics_generic(dst); | ||
1757 | if (rt->fi) { | ||
1758 | fib_info_put(rt->fi); | ||
1759 | rt->fi = NULL; | ||
1760 | } | ||
1723 | if (peer) { | 1761 | if (peer) { |
1724 | rt->peer = NULL; | 1762 | rt->peer = NULL; |
1725 | inet_putpeer(peer); | 1763 | inet_putpeer(peer); |
@@ -1824,7 +1862,9 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) | |||
1824 | if (FIB_RES_GW(*res) && | 1862 | if (FIB_RES_GW(*res) && |
1825 | FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) | 1863 | FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) |
1826 | rt->rt_gateway = FIB_RES_GW(*res); | 1864 | rt->rt_gateway = FIB_RES_GW(*res); |
1827 | dst_import_metrics(dst, fi->fib_metrics); | 1865 | rt->fi = fi; |
1866 | atomic_inc(&fi->fib_clntref); | ||
1867 | dst_init_metrics(dst, fi->fib_metrics, true); | ||
1828 | #ifdef CONFIG_IP_ROUTE_CLASSID | 1868 | #ifdef CONFIG_IP_ROUTE_CLASSID |
1829 | dst->tclassid = FIB_RES_NH(*res).nh_tclassid; | 1869 | dst->tclassid = FIB_RES_NH(*res).nh_tclassid; |
1830 | #endif | 1870 | #endif |
@@ -2752,6 +2792,9 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi | |||
2752 | rt->peer = ort->peer; | 2792 | rt->peer = ort->peer; |
2753 | if (rt->peer) | 2793 | if (rt->peer) |
2754 | atomic_inc(&rt->peer->refcnt); | 2794 | atomic_inc(&rt->peer->refcnt); |
2795 | rt->fi = ort->fi; | ||
2796 | if (rt->fi) | ||
2797 | atomic_inc(&rt->fi->fib_clntref); | ||
2755 | 2798 | ||
2756 | dst_free(new); | 2799 | dst_free(new); |
2757 | } | 2800 | } |