aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Eric Dumazet <eric.dumazet@gmail.com> 2010-05-11 19:19:48 -0400
committer David S. Miller <davem@davemloft.net> 2010-05-17 20:18:50 -0400
commit 7fee226ad2397b635e2fd565a59ca3ae08a164cd (patch)
tree 0bcd26150ad74ec1a237109de87a3d214a07fc22
parent ebda37c27d0c768947e9b058332d7ea798210cf8 (diff)
net: add a noref bit on skb dst
Use the low order bit of skb->_skb_dst to tell that the dst is not refcounted. Change _skb_dst to _skb_refdst to make sure all uses are caught. skb_dst() returns the dst, regardless of whether the noref bit is set or not, but with a lockdep check to make sure a noref dst is not given if the current user is not RCU protected. New skb_dst_set_noref() helper to set a non-refcounted dst on a skb (with lockdep check). skb_dst_drop() drops a reference only if the skb dst was refcounted. skb_dst_force() helper is used to force a refcount on dst, when the skb is queued and no longer RCU protected. Use skb_dst_force() in __sk_add_backlog(), __dev_xmit_skb() if !IFF_XMIT_DST_RELEASE or skb enqueued on qdisc queue, in sock_queue_rcv_skb(), in __nf_queue(). Use skb_dst_force() in dev_requeue_skb(). Note: dst_use_noref() still dirties dst, we might transform it later to do one dirtying per jiffies. Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/linux/skbuff.h58
-rw-r--r--include/net/dst.h48
-rw-r--r--include/net/sock.h13
-rw-r--r--net/core/dev.c3
-rw-r--r--net/core/skbuff.c2
-rw-r--r--net/core/sock.c6
-rw-r--r--net/ipv4/icmp.c6
-rw-r--r--net/ipv4/ip_options.c9
-rw-r--r--net/ipv4/netfilter.c6
-rw-r--r--net/ipv4/route.c2
-rw-r--r--net/netfilter/nf_queue.c2
-rw-r--r--net/sched/sch_generic.c4
12 files changed, 134 insertions, 25 deletions
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c9525bce80f6..7cdfb4d52847 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -264,7 +264,7 @@ typedef unsigned char *sk_buff_data_t;
264 * @transport_header: Transport layer header 264 * @transport_header: Transport layer header
265 * @network_header: Network layer header 265 * @network_header: Network layer header
266 * @mac_header: Link layer header 266 * @mac_header: Link layer header
267 * @_skb_dst: destination entry 267 * @_skb_refdst: destination entry (with norefcount bit)
268 * @sp: the security path, used for xfrm 268 * @sp: the security path, used for xfrm
269 * @cb: Control buffer. Free for use by every layer. Put private vars here 269 * @cb: Control buffer. Free for use by every layer. Put private vars here
270 * @len: Length of actual data 270 * @len: Length of actual data
@@ -328,7 +328,7 @@ struct sk_buff {
328 */ 328 */
329 char cb[48] __aligned(8); 329 char cb[48] __aligned(8);
330 330
331 unsigned long _skb_dst; 331 unsigned long _skb_refdst;
332#ifdef CONFIG_XFRM 332#ifdef CONFIG_XFRM
333 struct sec_path *sp; 333 struct sec_path *sp;
334#endif 334#endif
@@ -419,14 +419,64 @@ struct sk_buff {
419 419
420#include <asm/system.h> 420#include <asm/system.h>
421 421
422/*
423 * skb might have a dst pointer attached, refcounted or not.
424 * _skb_refdst low order bit is set if refcount was _not_ taken
425 */
426#define SKB_DST_NOREF 1UL
427#define SKB_DST_PTRMASK ~(SKB_DST_NOREF)
428
429/**
430 * skb_dst - returns skb dst_entry
431 * @skb: buffer
432 *
433 * Returns skb dst_entry, regardless of reference taken or not.
434 */
422static inline struct dst_entry *skb_dst(const struct sk_buff *skb) 435static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
423{ 436{
424 return (struct dst_entry *)skb->_skb_dst; 437 /* If refdst was not refcounted, check we still are in a
438 * rcu_read_lock section
439 */
440 WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) &&
441 !rcu_read_lock_held() &&
442 !rcu_read_lock_bh_held());
443 return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK);
425} 444}
426 445
446/**
447 * skb_dst_set - sets skb dst
448 * @skb: buffer
449 * @dst: dst entry
450 *
451 * Sets skb dst, assuming a reference was taken on dst and should
452 * be released by skb_dst_drop()
453 */
427static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) 454static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
428{ 455{
429 skb->_skb_dst = (unsigned long)dst; 456 skb->_skb_refdst = (unsigned long)dst;
457}
458
459/**
460 * skb_dst_set_noref - sets skb dst, without a reference
461 * @skb: buffer
462 * @dst: dst entry
463 *
464 * Sets skb dst, assuming a reference was not taken on dst
465 * skb_dst_drop() should not dst_release() this dst
466 */
467static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
468{
469 WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
470 skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
471}
472
473/**
474 * skb_dst_is_noref - Test if skb dst isnt refcounted
475 * @skb: buffer
476 */
477static inline bool skb_dst_is_noref(const struct sk_buff *skb)
478{
479 return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb);
430} 480}
431 481
432static inline struct rtable *skb_rtable(const struct sk_buff *skb) 482static inline struct rtable *skb_rtable(const struct sk_buff *skb)
diff --git a/include/net/dst.h b/include/net/dst.h
index aac5a5fcfda9..27207a13f2a6 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -168,6 +168,12 @@ static inline void dst_use(struct dst_entry *dst, unsigned long time)
168 dst->lastuse = time; 168 dst->lastuse = time;
169} 169}
170 170
171static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
172{
173 dst->__use++;
174 dst->lastuse = time;
175}
176
171static inline 177static inline
172struct dst_entry * dst_clone(struct dst_entry * dst) 178struct dst_entry * dst_clone(struct dst_entry * dst)
173{ 179{
@@ -177,11 +183,47 @@ struct dst_entry * dst_clone(struct dst_entry * dst)
177} 183}
178 184
179extern void dst_release(struct dst_entry *dst); 185extern void dst_release(struct dst_entry *dst);
186
187static inline void refdst_drop(unsigned long refdst)
188{
189 if (!(refdst & SKB_DST_NOREF))
190 dst_release((struct dst_entry *)(refdst & SKB_DST_PTRMASK));
191}
192
193/**
194 * skb_dst_drop - drops skb dst
195 * @skb: buffer
196 *
197 * Drops dst reference count if a reference was taken.
198 */
180static inline void skb_dst_drop(struct sk_buff *skb) 199static inline void skb_dst_drop(struct sk_buff *skb)
181{ 200{
182 if (skb->_skb_dst) 201 if (skb->_skb_refdst) {
183 dst_release(skb_dst(skb)); 202 refdst_drop(skb->_skb_refdst);
184 skb->_skb_dst = 0UL; 203 skb->_skb_refdst = 0UL;
204 }
205}
206
207static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb)
208{
209 nskb->_skb_refdst = oskb->_skb_refdst;
210 if (!(nskb->_skb_refdst & SKB_DST_NOREF))
211 dst_clone(skb_dst(nskb));
212}
213
214/**
215 * skb_dst_force - makes sure skb dst is refcounted
216 * @skb: buffer
217 *
218 * If dst is not yet refcounted, let's do it
219 */
220static inline void skb_dst_force(struct sk_buff *skb)
221{
222 if (skb_dst_is_noref(skb)) {
223 WARN_ON(!rcu_read_lock_held());
224 skb->_skb_refdst &= ~SKB_DST_NOREF;
225 dst_clone(skb_dst(skb));
226 }
185} 227}
186 228
187/* Children define the path of the packet through the 229/* Children define the path of the packet through the
diff --git a/include/net/sock.h b/include/net/sock.h
index aed16eb9db4b..5697caf8cc76 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -600,12 +600,15 @@ static inline int sk_stream_memory_free(struct sock *sk)
600/* OOB backlog add */ 600/* OOB backlog add */
601static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) 601static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
602{ 602{
603 if (!sk->sk_backlog.tail) { 603 /* dont let skb dst not refcounted, we are going to leave rcu lock */
604 sk->sk_backlog.head = sk->sk_backlog.tail = skb; 604 skb_dst_force(skb);
605 } else { 605
606 if (!sk->sk_backlog.tail)
607 sk->sk_backlog.head = skb;
608 else
606 sk->sk_backlog.tail->next = skb; 609 sk->sk_backlog.tail->next = skb;
607 sk->sk_backlog.tail = skb; 610
608 } 611 sk->sk_backlog.tail = skb;
609 skb->next = NULL; 612 skb->next = NULL;
610} 613}
611 614
diff --git a/net/core/dev.c b/net/core/dev.c
index cdcb9cbedf41..6c820650b80f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2052,6 +2052,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2052 * waiting to be sent out; and the qdisc is not running - 2052 * waiting to be sent out; and the qdisc is not running -
2053 * xmit the skb directly. 2053 * xmit the skb directly.
2054 */ 2054 */
2055 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2056 skb_dst_force(skb);
2055 __qdisc_update_bstats(q, skb->len); 2057 __qdisc_update_bstats(q, skb->len);
2056 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) 2058 if (sch_direct_xmit(skb, q, dev, txq, root_lock))
2057 __qdisc_run(q); 2059 __qdisc_run(q);
@@ -2060,6 +2062,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2060 2062
2061 rc = NET_XMIT_SUCCESS; 2063 rc = NET_XMIT_SUCCESS;
2062 } else { 2064 } else {
2065 skb_dst_force(skb);
2063 rc = qdisc_enqueue_root(skb, q); 2066 rc = qdisc_enqueue_root(skb, q);
2064 qdisc_run(q); 2067 qdisc_run(q);
2065 } 2068 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a9b0e1f77806..c543dd252433 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -520,7 +520,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
520 new->transport_header = old->transport_header; 520 new->transport_header = old->transport_header;
521 new->network_header = old->network_header; 521 new->network_header = old->network_header;
522 new->mac_header = old->mac_header; 522 new->mac_header = old->mac_header;
523 skb_dst_set(new, dst_clone(skb_dst(old))); 523 skb_dst_copy(new, old);
524 new->rxhash = old->rxhash; 524 new->rxhash = old->rxhash;
525#ifdef CONFIG_XFRM 525#ifdef CONFIG_XFRM
526 new->sp = secpath_get(old->sp); 526 new->sp = secpath_get(old->sp);
diff --git a/net/core/sock.c b/net/core/sock.c
index 63530a03b8c2..bf88a167c8f2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -307,6 +307,11 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
307 */ 307 */
308 skb_len = skb->len; 308 skb_len = skb->len;
309 309
310 /* we escape from rcu protected region, make sure we dont leak
311 * a norefcounted dst
312 */
313 skb_dst_force(skb);
314
310 spin_lock_irqsave(&list->lock, flags); 315 spin_lock_irqsave(&list->lock, flags);
311 skb->dropcount = atomic_read(&sk->sk_drops); 316 skb->dropcount = atomic_read(&sk->sk_drops);
312 __skb_queue_tail(list, skb); 317 __skb_queue_tail(list, skb);
@@ -1536,6 +1541,7 @@ static void __release_sock(struct sock *sk)
1536 do { 1541 do {
1537 struct sk_buff *next = skb->next; 1542 struct sk_buff *next = skb->next;
1538 1543
1544 WARN_ON_ONCE(skb_dst_is_noref(skb));
1539 skb->next = NULL; 1545 skb->next = NULL;
1540 sk_backlog_rcv(sk, skb); 1546 sk_backlog_rcv(sk, skb);
1541 1547
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index f3d339f728b0..d65e9215bcd7 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -587,20 +587,20 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
587 err = __ip_route_output_key(net, &rt2, &fl); 587 err = __ip_route_output_key(net, &rt2, &fl);
588 else { 588 else {
589 struct flowi fl2 = {}; 589 struct flowi fl2 = {};
590 struct dst_entry *odst; 590 unsigned long orefdst;
591 591
592 fl2.fl4_dst = fl.fl4_src; 592 fl2.fl4_dst = fl.fl4_src;
593 if (ip_route_output_key(net, &rt2, &fl2)) 593 if (ip_route_output_key(net, &rt2, &fl2))
594 goto relookup_failed; 594 goto relookup_failed;
595 595
596 /* Ugh! */ 596 /* Ugh! */
597 odst = skb_dst(skb_in); 597 orefdst = skb_in->_skb_refdst; /* save old refdst */
598 err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, 598 err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
599 RT_TOS(tos), rt2->u.dst.dev); 599 RT_TOS(tos), rt2->u.dst.dev);
600 600
601 dst_release(&rt2->u.dst); 601 dst_release(&rt2->u.dst);
602 rt2 = skb_rtable(skb_in); 602 rt2 = skb_rtable(skb_in);
603 skb_dst_set(skb_in, odst); 603 skb_in->_skb_refdst = orefdst; /* restore old refdst */
604 } 604 }
605 605
606 if (err) 606 if (err)
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 4c09a31fd140..3244133c24f6 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -601,6 +601,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
601 unsigned char *optptr = skb_network_header(skb) + opt->srr; 601 unsigned char *optptr = skb_network_header(skb) + opt->srr;
602 struct rtable *rt = skb_rtable(skb); 602 struct rtable *rt = skb_rtable(skb);
603 struct rtable *rt2; 603 struct rtable *rt2;
604 unsigned long orefdst;
604 int err; 605 int err;
605 606
606 if (!opt->srr) 607 if (!opt->srr)
@@ -624,16 +625,16 @@ int ip_options_rcv_srr(struct sk_buff *skb)
624 } 625 }
625 memcpy(&nexthop, &optptr[srrptr-1], 4); 626 memcpy(&nexthop, &optptr[srrptr-1], 4);
626 627
627 rt = skb_rtable(skb); 628 orefdst = skb->_skb_refdst;
628 skb_dst_set(skb, NULL); 629 skb_dst_set(skb, NULL);
629 err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); 630 err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
630 rt2 = skb_rtable(skb); 631 rt2 = skb_rtable(skb);
631 if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { 632 if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
632 ip_rt_put(rt2); 633 skb_dst_drop(skb);
633 skb_dst_set(skb, &rt->u.dst); 634 skb->_skb_refdst = orefdst;
634 return -EINVAL; 635 return -EINVAL;
635 } 636 }
636 ip_rt_put(rt); 637 refdst_drop(orefdst);
637 if (rt2->rt_type != RTN_LOCAL) 638 if (rt2->rt_type != RTN_LOCAL)
638 break; 639 break;
639 /* Superfast 8) loopback forward */ 640 /* Superfast 8) loopback forward */
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 82fb43c5c59e..07de855e2175 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -17,7 +17,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
17 const struct iphdr *iph = ip_hdr(skb); 17 const struct iphdr *iph = ip_hdr(skb);
18 struct rtable *rt; 18 struct rtable *rt;
19 struct flowi fl = {}; 19 struct flowi fl = {};
20 struct dst_entry *odst; 20 unsigned long orefdst;
21 unsigned int hh_len; 21 unsigned int hh_len;
22 unsigned int type; 22 unsigned int type;
23 23
@@ -51,14 +51,14 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
51 if (ip_route_output_key(net, &rt, &fl) != 0) 51 if (ip_route_output_key(net, &rt, &fl) != 0)
52 return -1; 52 return -1;
53 53
54 odst = skb_dst(skb); 54 orefdst = skb->_skb_refdst;
55 if (ip_route_input(skb, iph->daddr, iph->saddr, 55 if (ip_route_input(skb, iph->daddr, iph->saddr,
56 RT_TOS(iph->tos), rt->u.dst.dev) != 0) { 56 RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
57 dst_release(&rt->u.dst); 57 dst_release(&rt->u.dst);
58 return -1; 58 return -1;
59 } 59 }
60 dst_release(&rt->u.dst); 60 dst_release(&rt->u.dst);
61 dst_release(odst); 61 refdst_drop(orefdst);
62 } 62 }
63 63
64 if (skb_dst(skb)->error) 64 if (skb_dst(skb)->error)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index dea3f9264250..705eccfb4769 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3033,7 +3033,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3033 continue; 3033 continue;
3034 if (rt_is_expired(rt)) 3034 if (rt_is_expired(rt))
3035 continue; 3035 continue;
3036 skb_dst_set(skb, dst_clone(&rt->u.dst)); 3036 skb_dst_set_noref(skb, &rt->u.dst);
3037 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, 3037 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3038 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 3038 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3039 1, NLM_F_MULTI) <= 0) { 3039 1, NLM_F_MULTI) <= 0) {
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 0b1103c0b1f3..78b3cf9c519c 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -9,6 +9,7 @@
9#include <linux/rcupdate.h> 9#include <linux/rcupdate.h>
10#include <net/protocol.h> 10#include <net/protocol.h>
11#include <net/netfilter/nf_queue.h> 11#include <net/netfilter/nf_queue.h>
12#include <net/dst.h>
12 13
13#include "nf_internals.h" 14#include "nf_internals.h"
14 15
@@ -170,6 +171,7 @@ static int __nf_queue(struct sk_buff *skb,
170 dev_hold(physoutdev); 171 dev_hold(physoutdev);
171 } 172 }
172#endif 173#endif
174 skb_dst_force(skb);
173 afinfo->saveroute(skb, entry); 175 afinfo->saveroute(skb, entry);
174 status = qh->outfn(entry, queuenum); 176 status = qh->outfn(entry, queuenum);
175 177
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index a969b111bd76..a63029ef3edd 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -26,6 +26,7 @@
26#include <linux/list.h> 26#include <linux/list.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <net/pkt_sched.h> 28#include <net/pkt_sched.h>
29#include <net/dst.h>
29 30
30/* Main transmission queue. */ 31/* Main transmission queue. */
31 32
@@ -40,6 +41,7 @@
40 41
41static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) 42static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
42{ 43{
44 skb_dst_force(skb);
43 q->gso_skb = skb; 45 q->gso_skb = skb;
44 q->qstats.requeues++; 46 q->qstats.requeues++;
45 q->q.qlen++; /* it's still part of the queue */ 47 q->q.qlen++; /* it's still part of the queue */
@@ -179,7 +181,7 @@ static inline int qdisc_restart(struct Qdisc *q)
179 skb = dequeue_skb(q); 181 skb = dequeue_skb(q);
180 if (unlikely(!skb)) 182 if (unlikely(!skb))
181 return 0; 183 return 0;
182 184 WARN_ON_ONCE(skb_dst_is_noref(skb));
183 root_lock = qdisc_lock(q); 185 root_lock = qdisc_lock(q);
184 dev = qdisc_dev(q); 186 dev = qdisc_dev(q);
185 txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); 187 txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));