aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/netdevice.h15
-rw-r--r--include/net/dst.h18
-rw-r--r--include/net/neighbour.h2
-rw-r--r--net/bridge/br_netfilter.c6
-rw-r--r--net/core/dst.c7
-rw-r--r--net/core/neighbour.c81
-rw-r--r--net/ipv4/ip_output.c14
-rw-r--r--net/ipv4/route.c7
-rw-r--r--net/ipv6/ip6_output.c14
9 files changed, 59 insertions, 105 deletions
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 75382378a1ba..5ccc0cb8352b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -252,14 +252,7 @@ struct netdev_hw_addr_list {
252 netdev_hw_addr_list_for_each(ha, &(dev)->mc) 252 netdev_hw_addr_list_for_each(ha, &(dev)->mc)
253 253
254struct hh_cache { 254struct hh_cache {
255 atomic_t hh_refcnt; /* number of users */ 255 u16 hh_len;
256/*
257 * We want hh_output, hh_len, hh_lock and hh_data be a in a separate
258 * cache line on SMP.
259 * They are mostly read, but hh_refcnt may be changed quite frequently,
260 * incurring cache line ping pongs.
261 */
262 u16 hh_len ____cacheline_aligned_in_smp;
263 u16 __pad; 256 u16 __pad;
264 int (*hh_output)(struct sk_buff *skb); 257 int (*hh_output)(struct sk_buff *skb);
265 seqlock_t hh_lock; 258 seqlock_t hh_lock;
@@ -273,12 +266,6 @@ struct hh_cache {
273 unsigned long hh_data[HH_DATA_ALIGN(LL_MAX_HEADER) / sizeof(long)]; 266 unsigned long hh_data[HH_DATA_ALIGN(LL_MAX_HEADER) / sizeof(long)];
274}; 267};
275 268
276static inline void hh_cache_put(struct hh_cache *hh)
277{
278 if (atomic_dec_and_test(&hh->hh_refcnt))
279 kfree(hh);
280}
281
282/* Reserve HH_DATA_MOD byte aligned hard_header_len, but at least that much. 269/* Reserve HH_DATA_MOD byte aligned hard_header_len, but at least that much.
283 * Alternative is: 270 * Alternative is:
284 * dev->hard_header_len ? (dev->hard_header_len + 271 * dev->hard_header_len ? (dev->hard_header_len +
diff --git a/include/net/dst.h b/include/net/dst.h
index e12ddfb9eb16..0dd7ccbc0dd5 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -38,7 +38,6 @@ struct dst_entry {
38 unsigned long expires; 38 unsigned long expires;
39 struct dst_entry *path; 39 struct dst_entry *path;
40 struct neighbour *neighbour; 40 struct neighbour *neighbour;
41 struct hh_cache *hh;
42#ifdef CONFIG_XFRM 41#ifdef CONFIG_XFRM
43 struct xfrm_state *xfrm; 42 struct xfrm_state *xfrm;
44#else 43#else
@@ -47,6 +46,14 @@ struct dst_entry {
47 int (*input)(struct sk_buff*); 46 int (*input)(struct sk_buff*);
48 int (*output)(struct sk_buff*); 47 int (*output)(struct sk_buff*);
49 48
49 int flags;
50#define DST_HOST 0x0001
51#define DST_NOXFRM 0x0002
52#define DST_NOPOLICY 0x0004
53#define DST_NOHASH 0x0008
54#define DST_NOCACHE 0x0010
55#define DST_NOCOUNT 0x0020
56
50 short error; 57 short error;
51 short obsolete; 58 short obsolete;
52 unsigned short header_len; /* more space at head required */ 59 unsigned short header_len; /* more space at head required */
@@ -62,7 +69,7 @@ struct dst_entry {
62 * (L1_CACHE_SIZE would be too much) 69 * (L1_CACHE_SIZE would be too much)
63 */ 70 */
64#ifdef CONFIG_64BIT 71#ifdef CONFIG_64BIT
65 long __pad_to_align_refcnt[1]; 72 long __pad_to_align_refcnt[2];
66#endif 73#endif
67 /* 74 /*
68 * __refcnt wants to be on a different cache line from 75 * __refcnt wants to be on a different cache line from
@@ -71,13 +78,6 @@ struct dst_entry {
71 atomic_t __refcnt; /* client references */ 78 atomic_t __refcnt; /* client references */
72 int __use; 79 int __use;
73 unsigned long lastuse; 80 unsigned long lastuse;
74 int flags;
75#define DST_HOST 0x0001
76#define DST_NOXFRM 0x0002
77#define DST_NOPOLICY 0x0004
78#define DST_NOHASH 0x0008
79#define DST_NOCACHE 0x0010
80#define DST_NOCOUNT 0x0020
81 union { 81 union {
82 struct dst_entry *next; 82 struct dst_entry *next;
83 struct rtable __rcu *rt_next; 83 struct rtable __rcu *rt_next;
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 6fe8c2cd5acb..bd8f9f09ab5c 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -108,7 +108,7 @@ struct neighbour {
108 __u8 dead; 108 __u8 dead;
109 seqlock_t ha_lock; 109 seqlock_t ha_lock;
110 unsigned char ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))]; 110 unsigned char ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))];
111 struct hh_cache *hh; 111 struct hh_cache hh;
112 int (*output)(struct sk_buff *skb); 112 int (*output)(struct sk_buff *skb);
113 const struct neigh_ops *ops; 113 const struct neigh_ops *ops;
114 struct rcu_head rcu; 114 struct rcu_head rcu;
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 56149ec36d7f..75ee421917c7 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -343,14 +343,16 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
343static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb) 343static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb)
344{ 344{
345 struct nf_bridge_info *nf_bridge = skb->nf_bridge; 345 struct nf_bridge_info *nf_bridge = skb->nf_bridge;
346 struct neighbour *neigh;
346 struct dst_entry *dst; 347 struct dst_entry *dst;
347 348
348 skb->dev = bridge_parent(skb->dev); 349 skb->dev = bridge_parent(skb->dev);
349 if (!skb->dev) 350 if (!skb->dev)
350 goto free_skb; 351 goto free_skb;
351 dst = skb_dst(skb); 352 dst = skb_dst(skb);
352 if (dst->hh) { 353 neigh = dst->neighbour;
353 neigh_hh_bridge(dst->hh, skb); 354 if (neigh->hh.hh_len) {
355 neigh_hh_bridge(&neigh->hh, skb);
354 skb->dev = nf_bridge->physindev; 356 skb->dev = nf_bridge->physindev;
355 return br_handle_frame_finish(skb); 357 return br_handle_frame_finish(skb);
356 } else if (dst->neighbour) { 358 } else if (dst->neighbour) {
diff --git a/net/core/dst.c b/net/core/dst.c
index 6135f3671692..4aacc14936a0 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -172,7 +172,6 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
172 dst->expires = 0UL; 172 dst->expires = 0UL;
173 dst->path = dst; 173 dst->path = dst;
174 dst->neighbour = NULL; 174 dst->neighbour = NULL;
175 dst->hh = NULL;
176#ifdef CONFIG_XFRM 175#ifdef CONFIG_XFRM
177 dst->xfrm = NULL; 176 dst->xfrm = NULL;
178#endif 177#endif
@@ -226,19 +225,13 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
226{ 225{
227 struct dst_entry *child; 226 struct dst_entry *child;
228 struct neighbour *neigh; 227 struct neighbour *neigh;
229 struct hh_cache *hh;
230 228
231 smp_rmb(); 229 smp_rmb();
232 230
233again: 231again:
234 neigh = dst->neighbour; 232 neigh = dst->neighbour;
235 hh = dst->hh;
236 child = dst->child; 233 child = dst->child;
237 234
238 dst->hh = NULL;
239 if (hh)
240 hh_cache_put(hh);
241
242 if (neigh) { 235 if (neigh) {
243 dst->neighbour = NULL; 236 dst->neighbour = NULL;
244 neigh_release(neigh); 237 neigh_release(neigh);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index f879bb552994..77a399f2ad03 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -297,6 +297,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
297 n->updated = n->used = now; 297 n->updated = n->used = now;
298 n->nud_state = NUD_NONE; 298 n->nud_state = NUD_NONE;
299 n->output = neigh_blackhole; 299 n->output = neigh_blackhole;
300 seqlock_init(&n->hh.hh_lock);
300 n->parms = neigh_parms_clone(&tbl->parms); 301 n->parms = neigh_parms_clone(&tbl->parms);
301 setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n); 302 setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n);
302 303
@@ -702,14 +703,11 @@ void neigh_destroy(struct neighbour *neigh)
702 if (neigh_del_timer(neigh)) 703 if (neigh_del_timer(neigh))
703 printk(KERN_WARNING "Impossible event.\n"); 704 printk(KERN_WARNING "Impossible event.\n");
704 705
705 hh = neigh->hh; 706 hh = &neigh->hh;
706 if (hh) { 707 if (hh->hh_len) {
707 neigh->hh = NULL;
708
709 write_seqlock_bh(&hh->hh_lock); 708 write_seqlock_bh(&hh->hh_lock);
710 hh->hh_output = neigh_blackhole; 709 hh->hh_output = neigh_blackhole;
711 write_sequnlock_bh(&hh->hh_lock); 710 write_sequnlock_bh(&hh->hh_lock);
712 hh_cache_put(hh);
713 } 711 }
714 712
715 skb_queue_purge(&neigh->arp_queue); 713 skb_queue_purge(&neigh->arp_queue);
@@ -737,8 +735,8 @@ static void neigh_suspect(struct neighbour *neigh)
737 735
738 neigh->output = neigh->ops->output; 736 neigh->output = neigh->ops->output;
739 737
740 hh = neigh->hh; 738 hh = &neigh->hh;
741 if (hh) 739 if (hh->hh_len)
742 hh->hh_output = neigh->ops->output; 740 hh->hh_output = neigh->ops->output;
743} 741}
744 742
@@ -755,8 +753,8 @@ static void neigh_connect(struct neighbour *neigh)
755 753
756 neigh->output = neigh->ops->connected_output; 754 neigh->output = neigh->ops->connected_output;
757 755
758 hh = neigh->hh; 756 hh = &neigh->hh;
759 if (hh) 757 if (hh->hh_len)
760 hh->hh_output = neigh->ops->hh_output; 758 hh->hh_output = neigh->ops->hh_output;
761} 759}
762 760
@@ -1017,7 +1015,7 @@ out_unlock_bh:
1017} 1015}
1018EXPORT_SYMBOL(__neigh_event_send); 1016EXPORT_SYMBOL(__neigh_event_send);
1019 1017
1020static void neigh_update_hhs(const struct neighbour *neigh) 1018static void neigh_update_hhs(struct neighbour *neigh)
1021{ 1019{
1022 struct hh_cache *hh; 1020 struct hh_cache *hh;
1023 void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) 1021 void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *)
@@ -1027,8 +1025,8 @@ static void neigh_update_hhs(const struct neighbour *neigh)
1027 update = neigh->dev->header_ops->cache_update; 1025 update = neigh->dev->header_ops->cache_update;
1028 1026
1029 if (update) { 1027 if (update) {
1030 hh = neigh->hh; 1028 hh = &neigh->hh;
1031 if (hh) { 1029 if (hh->hh_len) {
1032 write_seqlock_bh(&hh->hh_lock); 1030 write_seqlock_bh(&hh->hh_lock);
1033 update(hh, neigh->dev, neigh->ha); 1031 update(hh, neigh->dev, neigh->ha);
1034 write_sequnlock_bh(&hh->hh_lock); 1032 write_sequnlock_bh(&hh->hh_lock);
@@ -1214,62 +1212,29 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl,
1214} 1212}
1215EXPORT_SYMBOL(neigh_event_ns); 1213EXPORT_SYMBOL(neigh_event_ns);
1216 1214
1217static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst)
1218{
1219 struct hh_cache *hh;
1220
1221 smp_rmb(); /* paired with smp_wmb() in neigh_hh_init() */
1222 hh = n->hh;
1223 if (hh) {
1224 atomic_inc(&hh->hh_refcnt);
1225 if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
1226 hh_cache_put(hh);
1227 return true;
1228 }
1229 return false;
1230}
1231
1232/* called with read_lock_bh(&n->lock); */ 1215/* called with read_lock_bh(&n->lock); */
1233static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, 1216static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst)
1234 __be16 protocol)
1235{ 1217{
1236 struct hh_cache *hh;
1237 struct net_device *dev = dst->dev; 1218 struct net_device *dev = dst->dev;
1238 1219 __be16 prot = dst->ops->protocol;
1239 if (likely(neigh_hh_lookup(n, dst))) 1220 struct hh_cache *hh = &n->hh;
1240 return;
1241
1242 /* slow path */
1243 hh = kzalloc(sizeof(*hh), GFP_ATOMIC);
1244 if (!hh)
1245 return;
1246
1247 seqlock_init(&hh->hh_lock);
1248 atomic_set(&hh->hh_refcnt, 2);
1249
1250 if (dev->header_ops->cache(n, hh, protocol)) {
1251 kfree(hh);
1252 return;
1253 }
1254 1221
1255 write_lock_bh(&n->lock); 1222 write_lock_bh(&n->lock);
1256 1223
1257 /* must check if another thread already did the insert */ 1224 /* Only one thread can come in here and initialize the
1258 if (neigh_hh_lookup(n, dst)) { 1225 * hh_cache entry.
1259 kfree(hh); 1226 */
1227 if (hh->hh_len)
1228 goto end;
1229
1230 if (dev->header_ops->cache(n, hh, prot))
1260 goto end; 1231 goto end;
1261 }
1262 1232
1263 if (n->nud_state & NUD_CONNECTED) 1233 if (n->nud_state & NUD_CONNECTED)
1264 hh->hh_output = n->ops->hh_output; 1234 hh->hh_output = n->ops->hh_output;
1265 else 1235 else
1266 hh->hh_output = n->ops->output; 1236 hh->hh_output = n->ops->output;
1267 1237
1268 smp_wmb(); /* paired with smp_rmb() in neigh_hh_lookup() */
1269 n->hh = hh;
1270
1271 if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
1272 hh_cache_put(hh);
1273end: 1238end:
1274 write_unlock_bh(&n->lock); 1239 write_unlock_bh(&n->lock);
1275} 1240}
@@ -1312,10 +1277,8 @@ int neigh_resolve_output(struct sk_buff *skb)
1312 struct net_device *dev = neigh->dev; 1277 struct net_device *dev = neigh->dev;
1313 unsigned int seq; 1278 unsigned int seq;
1314 1279
1315 if (dev->header_ops->cache && 1280 if (dev->header_ops->cache && !neigh->hh.hh_len)
1316 !dst->hh && 1281 neigh_hh_init(neigh, dst);
1317 !(dst->flags & DST_NOCACHE))
1318 neigh_hh_init(neigh, dst, dst->ops->protocol);
1319 1282
1320 do { 1283 do {
1321 seq = read_seqbegin(&neigh->ha_lock); 1284 seq = read_seqbegin(&neigh->ha_lock);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 54119d5aae8f..a621b96aed15 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -182,6 +182,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
182 struct rtable *rt = (struct rtable *)dst; 182 struct rtable *rt = (struct rtable *)dst;
183 struct net_device *dev = dst->dev; 183 struct net_device *dev = dst->dev;
184 unsigned int hh_len = LL_RESERVED_SPACE(dev); 184 unsigned int hh_len = LL_RESERVED_SPACE(dev);
185 struct neighbour *neigh;
185 186
186 if (rt->rt_type == RTN_MULTICAST) { 187 if (rt->rt_type == RTN_MULTICAST) {
187 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); 188 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
@@ -203,11 +204,14 @@ static inline int ip_finish_output2(struct sk_buff *skb)
203 skb = skb2; 204 skb = skb2;
204 } 205 }
205 206
206 if (dst->hh) 207 neigh = dst->neighbour;
207 return neigh_hh_output(dst->hh, skb); 208 if (neigh) {
208 else if (dst->neighbour) 209 struct hh_cache *hh = &neigh->hh;
209 return dst->neighbour->output(skb); 210 if (hh->hh_len)
210 211 return neigh_hh_output(hh, skb);
212 else
213 return dst->neighbour->output(skb);
214 }
211 if (net_ratelimit()) 215 if (net_ratelimit())
212 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n"); 216 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
213 kfree_skb(skb); 217 kfree_skb(skb);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c6388e825ed3..a52bb74d2612 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -426,9 +426,10 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
426 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + 426 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
427 dst_metric(&r->dst, RTAX_RTTVAR)), 427 dst_metric(&r->dst, RTAX_RTTVAR)),
428 r->rt_key_tos, 428 r->rt_key_tos,
429 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, 429 -1,
430 r->dst.hh ? (r->dst.hh->hh_output == 430 (r->dst.neighbour ?
431 dev_queue_xmit) : 0, 431 (r->dst.neighbour->hh.hh_output ==
432 dev_queue_xmit) : 0),
432 r->rt_spec_dst, &len); 433 r->rt_spec_dst, &len);
433 434
434 seq_printf(seq, "%*s\n", 127 - len, ""); 435 seq_printf(seq, "%*s\n", 127 - len, "");
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 9d4b165837d6..f0f144cac0bd 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -100,6 +100,7 @@ static int ip6_finish_output2(struct sk_buff *skb)
100{ 100{
101 struct dst_entry *dst = skb_dst(skb); 101 struct dst_entry *dst = skb_dst(skb);
102 struct net_device *dev = dst->dev; 102 struct net_device *dev = dst->dev;
103 struct neighbour *neigh;
103 104
104 skb->protocol = htons(ETH_P_IPV6); 105 skb->protocol = htons(ETH_P_IPV6);
105 skb->dev = dev; 106 skb->dev = dev;
@@ -134,11 +135,14 @@ static int ip6_finish_output2(struct sk_buff *skb)
134 skb->len); 135 skb->len);
135 } 136 }
136 137
137 if (dst->hh) 138 neigh = dst->neighbour;
138 return neigh_hh_output(dst->hh, skb); 139 if (neigh) {
139 else if (dst->neighbour) 140 struct hh_cache *hh = &neigh->hh;
140 return dst->neighbour->output(skb); 141 if (hh->hh_len)
141 142 return neigh_hh_output(hh, skb);
143 else
144 return dst->neighbour->output(skb);
145 }
142 IP6_INC_STATS_BH(dev_net(dst->dev), 146 IP6_INC_STATS_BH(dev_net(dst->dev),
143 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); 147 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
144 kfree_skb(skb); 148 kfree_skb(skb);