aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/route.c
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2011-01-26 23:51:05 -0500
committerDavid S. Miller <davem@davemloft.net>2011-01-26 23:51:05 -0500
commit62fa8a846d7de4b299232e330c74b7783539df76 (patch)
treee401dbdbf4b11cbd27bdc3a47d9dc8b512173c9f /net/ipv4/route.c
parentb4e69ac670d71b5748dc81e536b2cb103489badd (diff)
net: Implement read-only protection and COW'ing of metrics.
Routing metrics are now copy-on-write. Initially a route entry points its metrics at a read-only location. If a routing table entry exists, it will point there. Else it will point at the all zero metric place-holder called 'dst_default_metrics'. The writeability state of the metrics is stored in the low bits of the metrics pointer, we have two bits left to spare if we want to store more states. For the initial implementation, COW is implemented simply via kmalloc. However future enhancements will change this to place the writable metrics somewhere else, in order to increase sharing. Very likely this "somewhere else" will be the inetpeer cache. Note also that this means that metrics updates may transiently fail if we cannot COW the metrics successfully. But even by itself, this patch should decrease memory usage and increase cache locality especially for routing workloads. In those cases the read-only metric copies stay in place and never get written to. TCP workloads where metrics get updated, and those rare cases where PMTU triggers occur, will take a very slight performance hit. But that hit will be alleviated when the long-term writable metrics move to a more sharable location. Since the metrics storage went from a u32 array of RTAX_MAX entries to what is essentially a pointer, some retooling of the dst_entry layout was necessary. Most importantly, we need to preserve the alignment of the reference count so that it doesn't share cache lines with the read-mostly state, as per Eric Dumazet's alignment assertion checks. The only non-trivial bit here is the move of the 'flags' member into the writeable cacheline. This is OK since we are always accessing the flags around the same moment when we made a modification to the reference count. Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r--net/ipv4/route.c45
1 files changed, 44 insertions, 1 deletions
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3e5b7cc2db4f..980030d4e4ae 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -152,6 +152,36 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
152{ 152{
153} 153}
154 154
/*
 * ipv4_cow_metrics - give @dst a private copy of its metrics array.
 * @dst: route entry whose ->_metrics word is to be replaced
 * @old: the ->_metrics value the caller observed; it is decoded with
 *       __DST_METRICS_PTR() to find the array to copy from
 *
 * Allocates RTAX_MAX u32 slots, copies the current metrics into them and
 * tries to install the copy into dst->_metrics with cmpxchg().
 *
 * Returns:
 *  - the new array when this call installed it;
 *  - the array decoded from whatever value was found in dst->_metrics when
 *    the cmpxchg() lost, provided that value is not flagged
 *    DST_METRICS_READ_ONLY;
 *  - NULL when the allocation failed, or when the cmpxchg() lost and the
 *    value found is flagged DST_METRICS_READ_ONLY.
 */
155static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
156{
157 u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
158
159 if (p) {
160 u32 *old_p = __DST_METRICS_PTR(old);
161 unsigned long prev, new;
162
163 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
164
165 new = (unsigned long) p;
/* Install the copy only if dst->_metrics still holds @old. */
166 prev = cmpxchg(&dst->_metrics, old, new);
167
168 if (prev != old) {
/*
 * Lost the exchange: dst->_metrics held @prev, not @old. Drop our copy
 * and report the array @prev refers to, unless it is flagged read-only.
 */
169 kfree(p);
170 p = __DST_METRICS_PTR(prev);
171 if (prev & DST_METRICS_READ_ONLY)
172 p = NULL;
173 } else {
174 struct rtable *rt = (struct rtable *) dst;
175
/*
 * Our copy is installed. Release the fib_info reference held in rt->fi
 * and clear the pointer.
 * NOTE(review): presumably this is the reference taken where rt->fi is
 * set next to dst_init_metrics(dst, fi->fib_metrics, true), and it is no
 * longer needed once the dst stops pointing at fi->fib_metrics - confirm.
 */
176 if (rt->fi) {
177 fib_info_put(rt->fi);
178 rt->fi = NULL;
179 }
180 }
181 }
182 return p;
183}
184
155static struct dst_ops ipv4_dst_ops = { 185static struct dst_ops ipv4_dst_ops = {
156 .family = AF_INET, 186 .family = AF_INET,
157 .protocol = cpu_to_be16(ETH_P_IP), 187 .protocol = cpu_to_be16(ETH_P_IP),
@@ -159,6 +189,7 @@ static struct dst_ops ipv4_dst_ops = {
159 .check = ipv4_dst_check, 189 .check = ipv4_dst_check,
160 .default_advmss = ipv4_default_advmss, 190 .default_advmss = ipv4_default_advmss,
161 .default_mtu = ipv4_default_mtu, 191 .default_mtu = ipv4_default_mtu,
192 .cow_metrics = ipv4_cow_metrics,
162 .destroy = ipv4_dst_destroy, 193 .destroy = ipv4_dst_destroy,
163 .ifdown = ipv4_dst_ifdown, 194 .ifdown = ipv4_dst_ifdown,
164 .negative_advice = ipv4_negative_advice, 195 .negative_advice = ipv4_negative_advice,
@@ -1441,6 +1472,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1441 1472
1442 if (rt->peer) 1473 if (rt->peer)
1443 atomic_inc(&rt->peer->refcnt); 1474 atomic_inc(&rt->peer->refcnt);
1475 if (rt->fi)
1476 atomic_inc(&rt->fi->fib_clntref);
1444 1477
1445 if (arp_bind_neighbour(&rt->dst) || 1478 if (arp_bind_neighbour(&rt->dst) ||
1446 !(rt->dst.neighbour->nud_state & 1479 !(rt->dst.neighbour->nud_state &
@@ -1720,6 +1753,11 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
1720 struct rtable *rt = (struct rtable *) dst; 1753 struct rtable *rt = (struct rtable *) dst;
1721 struct inet_peer *peer = rt->peer; 1754 struct inet_peer *peer = rt->peer;
1722 1755
1756 dst_destroy_metrics_generic(dst);
1757 if (rt->fi) {
1758 fib_info_put(rt->fi);
1759 rt->fi = NULL;
1760 }
1723 if (peer) { 1761 if (peer) {
1724 rt->peer = NULL; 1762 rt->peer = NULL;
1725 inet_putpeer(peer); 1763 inet_putpeer(peer);
@@ -1824,7 +1862,9 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1824 if (FIB_RES_GW(*res) && 1862 if (FIB_RES_GW(*res) &&
1825 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1863 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1826 rt->rt_gateway = FIB_RES_GW(*res); 1864 rt->rt_gateway = FIB_RES_GW(*res);
1827 dst_import_metrics(dst, fi->fib_metrics); 1865 rt->fi = fi;
1866 atomic_inc(&fi->fib_clntref);
1867 dst_init_metrics(dst, fi->fib_metrics, true);
1828#ifdef CONFIG_IP_ROUTE_CLASSID 1868#ifdef CONFIG_IP_ROUTE_CLASSID
1829 dst->tclassid = FIB_RES_NH(*res).nh_tclassid; 1869 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1830#endif 1870#endif
@@ -2752,6 +2792,9 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
2752 rt->peer = ort->peer; 2792 rt->peer = ort->peer;
2753 if (rt->peer) 2793 if (rt->peer)
2754 atomic_inc(&rt->peer->refcnt); 2794 atomic_inc(&rt->peer->refcnt);
2795 rt->fi = ort->fi;
2796 if (rt->fi)
2797 atomic_inc(&rt->fi->fib_clntref);
2755 2798
2756 dst_free(new); 2799 dst_free(new);
2757 } 2800 }