diff options
author | Eric Dumazet <eric.dumazet@gmail.com> | 2010-05-11 19:19:48 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2010-05-17 20:18:50 -0400 |
commit | 7fee226ad2397b635e2fd565a59ca3ae08a164cd (patch) | |
tree | 0bcd26150ad74ec1a237109de87a3d214a07fc22 /net | |
parent | ebda37c27d0c768947e9b058332d7ea798210cf8 (diff) |
net: add a noref bit on skb dst
Use low order bit of skb->_skb_dst to tell dst is not refcounted.
Change _skb_dst to _skb_refdst to make sure all uses are caught.
skb_dst() returns the dst, regardless of noref bit set or not, but
with a lockdep check to make sure a noref dst is not given if current
user is not rcu protected.
New skb_dst_set_noref() helper to set a non-refcounted dst on a skb.
(with lockdep check)
skb_dst_drop() drops a reference only if skb dst was refcounted.
skb_dst_force() helper is used to force a refcount on dst, when skb
is queued and no longer RCU protected.
Use skb_dst_force() in __sk_add_backlog(), __dev_xmit_skb() if
!IFF_XMIT_DST_RELEASE or skb enqueued on qdisc queue, in
sock_queue_rcv_skb(), in __nf_queue().
Use skb_dst_force() in dev_requeue_skb().
Note: dst_use_noref() still dirties dst, we might transform it
later to do one dirtying per jiffies.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r-- | net/core/dev.c | 3 | ||||
-rw-r--r-- | net/core/skbuff.c | 2 | ||||
-rw-r--r-- | net/core/sock.c | 6 | ||||
-rw-r--r-- | net/ipv4/icmp.c | 6 | ||||
-rw-r--r-- | net/ipv4/ip_options.c | 9 | ||||
-rw-r--r-- | net/ipv4/netfilter.c | 6 | ||||
-rw-r--r-- | net/ipv4/route.c | 2 | ||||
-rw-r--r-- | net/netfilter/nf_queue.c | 2 | ||||
-rw-r--r-- | net/sched/sch_generic.c | 4 |
9 files changed, 27 insertions, 13 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index cdcb9cbedf41..6c820650b80f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c | |||
@@ -2052,6 +2052,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, | |||
2052 | * waiting to be sent out; and the qdisc is not running - | 2052 | * waiting to be sent out; and the qdisc is not running - |
2053 | * xmit the skb directly. | 2053 | * xmit the skb directly. |
2054 | */ | 2054 | */ |
2055 | if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) | ||
2056 | skb_dst_force(skb); | ||
2055 | __qdisc_update_bstats(q, skb->len); | 2057 | __qdisc_update_bstats(q, skb->len); |
2056 | if (sch_direct_xmit(skb, q, dev, txq, root_lock)) | 2058 | if (sch_direct_xmit(skb, q, dev, txq, root_lock)) |
2057 | __qdisc_run(q); | 2059 | __qdisc_run(q); |
@@ -2060,6 +2062,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, | |||
2060 | 2062 | ||
2061 | rc = NET_XMIT_SUCCESS; | 2063 | rc = NET_XMIT_SUCCESS; |
2062 | } else { | 2064 | } else { |
2065 | skb_dst_force(skb); | ||
2063 | rc = qdisc_enqueue_root(skb, q); | 2066 | rc = qdisc_enqueue_root(skb, q); |
2064 | qdisc_run(q); | 2067 | qdisc_run(q); |
2065 | } | 2068 | } |
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a9b0e1f77806..c543dd252433 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c | |||
@@ -520,7 +520,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) | |||
520 | new->transport_header = old->transport_header; | 520 | new->transport_header = old->transport_header; |
521 | new->network_header = old->network_header; | 521 | new->network_header = old->network_header; |
522 | new->mac_header = old->mac_header; | 522 | new->mac_header = old->mac_header; |
523 | skb_dst_set(new, dst_clone(skb_dst(old))); | 523 | skb_dst_copy(new, old); |
524 | new->rxhash = old->rxhash; | 524 | new->rxhash = old->rxhash; |
525 | #ifdef CONFIG_XFRM | 525 | #ifdef CONFIG_XFRM |
526 | new->sp = secpath_get(old->sp); | 526 | new->sp = secpath_get(old->sp); |
diff --git a/net/core/sock.c b/net/core/sock.c index 63530a03b8c2..bf88a167c8f2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c | |||
@@ -307,6 +307,11 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) | |||
307 | */ | 307 | */ |
308 | skb_len = skb->len; | 308 | skb_len = skb->len; |
309 | 309 | ||
310 | /* we escape from rcu protected region, make sure we dont leak | ||
311 | * a norefcounted dst | ||
312 | */ | ||
313 | skb_dst_force(skb); | ||
314 | |||
310 | spin_lock_irqsave(&list->lock, flags); | 315 | spin_lock_irqsave(&list->lock, flags); |
311 | skb->dropcount = atomic_read(&sk->sk_drops); | 316 | skb->dropcount = atomic_read(&sk->sk_drops); |
312 | __skb_queue_tail(list, skb); | 317 | __skb_queue_tail(list, skb); |
@@ -1536,6 +1541,7 @@ static void __release_sock(struct sock *sk) | |||
1536 | do { | 1541 | do { |
1537 | struct sk_buff *next = skb->next; | 1542 | struct sk_buff *next = skb->next; |
1538 | 1543 | ||
1544 | WARN_ON_ONCE(skb_dst_is_noref(skb)); | ||
1539 | skb->next = NULL; | 1545 | skb->next = NULL; |
1540 | sk_backlog_rcv(sk, skb); | 1546 | sk_backlog_rcv(sk, skb); |
1541 | 1547 | ||
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index f3d339f728b0..d65e9215bcd7 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c | |||
@@ -587,20 +587,20 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) | |||
587 | err = __ip_route_output_key(net, &rt2, &fl); | 587 | err = __ip_route_output_key(net, &rt2, &fl); |
588 | else { | 588 | else { |
589 | struct flowi fl2 = {}; | 589 | struct flowi fl2 = {}; |
590 | struct dst_entry *odst; | 590 | unsigned long orefdst; |
591 | 591 | ||
592 | fl2.fl4_dst = fl.fl4_src; | 592 | fl2.fl4_dst = fl.fl4_src; |
593 | if (ip_route_output_key(net, &rt2, &fl2)) | 593 | if (ip_route_output_key(net, &rt2, &fl2)) |
594 | goto relookup_failed; | 594 | goto relookup_failed; |
595 | 595 | ||
596 | /* Ugh! */ | 596 | /* Ugh! */ |
597 | odst = skb_dst(skb_in); | 597 | orefdst = skb_in->_skb_refdst; /* save old refdst */ |
598 | err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, | 598 | err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, |
599 | RT_TOS(tos), rt2->u.dst.dev); | 599 | RT_TOS(tos), rt2->u.dst.dev); |
600 | 600 | ||
601 | dst_release(&rt2->u.dst); | 601 | dst_release(&rt2->u.dst); |
602 | rt2 = skb_rtable(skb_in); | 602 | rt2 = skb_rtable(skb_in); |
603 | skb_dst_set(skb_in, odst); | 603 | skb_in->_skb_refdst = orefdst; /* restore old refdst */ |
604 | } | 604 | } |
605 | 605 | ||
606 | if (err) | 606 | if (err) |
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 4c09a31fd140..3244133c24f6 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c | |||
@@ -601,6 +601,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) | |||
601 | unsigned char *optptr = skb_network_header(skb) + opt->srr; | 601 | unsigned char *optptr = skb_network_header(skb) + opt->srr; |
602 | struct rtable *rt = skb_rtable(skb); | 602 | struct rtable *rt = skb_rtable(skb); |
603 | struct rtable *rt2; | 603 | struct rtable *rt2; |
604 | unsigned long orefdst; | ||
604 | int err; | 605 | int err; |
605 | 606 | ||
606 | if (!opt->srr) | 607 | if (!opt->srr) |
@@ -624,16 +625,16 @@ int ip_options_rcv_srr(struct sk_buff *skb) | |||
624 | } | 625 | } |
625 | memcpy(&nexthop, &optptr[srrptr-1], 4); | 626 | memcpy(&nexthop, &optptr[srrptr-1], 4); |
626 | 627 | ||
627 | rt = skb_rtable(skb); | 628 | orefdst = skb->_skb_refdst; |
628 | skb_dst_set(skb, NULL); | 629 | skb_dst_set(skb, NULL); |
629 | err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); | 630 | err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); |
630 | rt2 = skb_rtable(skb); | 631 | rt2 = skb_rtable(skb); |
631 | if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { | 632 | if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { |
632 | ip_rt_put(rt2); | 633 | skb_dst_drop(skb); |
633 | skb_dst_set(skb, &rt->u.dst); | 634 | skb->_skb_refdst = orefdst; |
634 | return -EINVAL; | 635 | return -EINVAL; |
635 | } | 636 | } |
636 | ip_rt_put(rt); | 637 | refdst_drop(orefdst); |
637 | if (rt2->rt_type != RTN_LOCAL) | 638 | if (rt2->rt_type != RTN_LOCAL) |
638 | break; | 639 | break; |
639 | /* Superfast 8) loopback forward */ | 640 | /* Superfast 8) loopback forward */ |
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 82fb43c5c59e..07de855e2175 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c | |||
@@ -17,7 +17,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) | |||
17 | const struct iphdr *iph = ip_hdr(skb); | 17 | const struct iphdr *iph = ip_hdr(skb); |
18 | struct rtable *rt; | 18 | struct rtable *rt; |
19 | struct flowi fl = {}; | 19 | struct flowi fl = {}; |
20 | struct dst_entry *odst; | 20 | unsigned long orefdst; |
21 | unsigned int hh_len; | 21 | unsigned int hh_len; |
22 | unsigned int type; | 22 | unsigned int type; |
23 | 23 | ||
@@ -51,14 +51,14 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) | |||
51 | if (ip_route_output_key(net, &rt, &fl) != 0) | 51 | if (ip_route_output_key(net, &rt, &fl) != 0) |
52 | return -1; | 52 | return -1; |
53 | 53 | ||
54 | odst = skb_dst(skb); | 54 | orefdst = skb->_skb_refdst; |
55 | if (ip_route_input(skb, iph->daddr, iph->saddr, | 55 | if (ip_route_input(skb, iph->daddr, iph->saddr, |
56 | RT_TOS(iph->tos), rt->u.dst.dev) != 0) { | 56 | RT_TOS(iph->tos), rt->u.dst.dev) != 0) { |
57 | dst_release(&rt->u.dst); | 57 | dst_release(&rt->u.dst); |
58 | return -1; | 58 | return -1; |
59 | } | 59 | } |
60 | dst_release(&rt->u.dst); | 60 | dst_release(&rt->u.dst); |
61 | dst_release(odst); | 61 | refdst_drop(orefdst); |
62 | } | 62 | } |
63 | 63 | ||
64 | if (skb_dst(skb)->error) | 64 | if (skb_dst(skb)->error) |
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index dea3f9264250..705eccfb4769 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -3033,7 +3033,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) | |||
3033 | continue; | 3033 | continue; |
3034 | if (rt_is_expired(rt)) | 3034 | if (rt_is_expired(rt)) |
3035 | continue; | 3035 | continue; |
3036 | skb_dst_set(skb, dst_clone(&rt->u.dst)); | 3036 | skb_dst_set_noref(skb, &rt->u.dst); |
3037 | if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, | 3037 | if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, |
3038 | cb->nlh->nlmsg_seq, RTM_NEWROUTE, | 3038 | cb->nlh->nlmsg_seq, RTM_NEWROUTE, |
3039 | 1, NLM_F_MULTI) <= 0) { | 3039 | 1, NLM_F_MULTI) <= 0) { |
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 0b1103c0b1f3..78b3cf9c519c 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/rcupdate.h> | 9 | #include <linux/rcupdate.h> |
10 | #include <net/protocol.h> | 10 | #include <net/protocol.h> |
11 | #include <net/netfilter/nf_queue.h> | 11 | #include <net/netfilter/nf_queue.h> |
12 | #include <net/dst.h> | ||
12 | 13 | ||
13 | #include "nf_internals.h" | 14 | #include "nf_internals.h" |
14 | 15 | ||
@@ -170,6 +171,7 @@ static int __nf_queue(struct sk_buff *skb, | |||
170 | dev_hold(physoutdev); | 171 | dev_hold(physoutdev); |
171 | } | 172 | } |
172 | #endif | 173 | #endif |
174 | skb_dst_force(skb); | ||
173 | afinfo->saveroute(skb, entry); | 175 | afinfo->saveroute(skb, entry); |
174 | status = qh->outfn(entry, queuenum); | 176 | status = qh->outfn(entry, queuenum); |
175 | 177 | ||
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index a969b111bd76..a63029ef3edd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/list.h> | 26 | #include <linux/list.h> |
27 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
28 | #include <net/pkt_sched.h> | 28 | #include <net/pkt_sched.h> |
29 | #include <net/dst.h> | ||
29 | 30 | ||
30 | /* Main transmission queue. */ | 31 | /* Main transmission queue. */ |
31 | 32 | ||
@@ -40,6 +41,7 @@ | |||
40 | 41 | ||
41 | static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) | 42 | static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) |
42 | { | 43 | { |
44 | skb_dst_force(skb); | ||
43 | q->gso_skb = skb; | 45 | q->gso_skb = skb; |
44 | q->qstats.requeues++; | 46 | q->qstats.requeues++; |
45 | q->q.qlen++; /* it's still part of the queue */ | 47 | q->q.qlen++; /* it's still part of the queue */ |
@@ -179,7 +181,7 @@ static inline int qdisc_restart(struct Qdisc *q) | |||
179 | skb = dequeue_skb(q); | 181 | skb = dequeue_skb(q); |
180 | if (unlikely(!skb)) | 182 | if (unlikely(!skb)) |
181 | return 0; | 183 | return 0; |
182 | 184 | WARN_ON_ONCE(skb_dst_is_noref(skb)); | |
183 | root_lock = qdisc_lock(q); | 185 | root_lock = qdisc_lock(q); |
184 | dev = qdisc_dev(q); | 186 | dev = qdisc_dev(q); |
185 | txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); | 187 | txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); |