diff options
author | Eric Dumazet <dada1@cosmosbay.com> | 2008-11-24 18:52:46 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2008-11-24 18:52:46 -0500 |
commit | 2e77d89b2fa8e3f8325b8ce7893ec3645f41aff5 (patch) | |
tree | ae40aa75449f705bd166630f9bcb5f41373d8248 /net | |
parent | 4db0acf3c0afbbbb2ae35a65f8896ca6655a47ec (diff) |
net: avoid a pair of dst_hold()/dst_release() in ip_append_data()
We can reduce pressure on the dst entry refcount that slows down the UDP
transmit path on SMP machines. This pressure is visible on RTP servers when
delivering content to media gateways, especially big ones handling
thousands of streams. Several CPUs send UDP frames to the same
destination, hence use the same dst entry.
This patch makes ip_append_data() eventually steal the refcount its
callers had to take on the dst entry.
This doesn't avoid all refcounting, but still gives speedups on SMP,
on the UDP/RAW transmit path.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r-- | net/ipv4/icmp.c | 8 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 11 | ||||
-rw-r--r-- | net/ipv4/raw.c | 2 | ||||
-rw-r--r-- | net/ipv4/udp.c | 2 |
4 files changed, 14 insertions, 9 deletions
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 21e497efbd7f..7b88be9803b1 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c | |||
@@ -321,12 +321,12 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, | |||
321 | } | 321 | } |
322 | 322 | ||
323 | static void icmp_push_reply(struct icmp_bxm *icmp_param, | 323 | static void icmp_push_reply(struct icmp_bxm *icmp_param, |
324 | struct ipcm_cookie *ipc, struct rtable *rt) | 324 | struct ipcm_cookie *ipc, struct rtable **rt) |
325 | { | 325 | { |
326 | struct sock *sk; | 326 | struct sock *sk; |
327 | struct sk_buff *skb; | 327 | struct sk_buff *skb; |
328 | 328 | ||
329 | sk = icmp_sk(dev_net(rt->u.dst.dev)); | 329 | sk = icmp_sk(dev_net((*rt)->u.dst.dev)); |
330 | if (ip_append_data(sk, icmp_glue_bits, icmp_param, | 330 | if (ip_append_data(sk, icmp_glue_bits, icmp_param, |
331 | icmp_param->data_len+icmp_param->head_len, | 331 | icmp_param->data_len+icmp_param->head_len, |
332 | icmp_param->head_len, | 332 | icmp_param->head_len, |
@@ -392,7 +392,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) | |||
392 | } | 392 | } |
393 | if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type, | 393 | if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type, |
394 | icmp_param->data.icmph.code)) | 394 | icmp_param->data.icmph.code)) |
395 | icmp_push_reply(icmp_param, &ipc, rt); | 395 | icmp_push_reply(icmp_param, &ipc, &rt); |
396 | ip_rt_put(rt); | 396 | ip_rt_put(rt); |
397 | out_unlock: | 397 | out_unlock: |
398 | icmp_xmit_unlock(sk); | 398 | icmp_xmit_unlock(sk); |
@@ -635,7 +635,7 @@ route_done: | |||
635 | icmp_param.data_len = room; | 635 | icmp_param.data_len = room; |
636 | icmp_param.head_len = sizeof(struct icmphdr); | 636 | icmp_param.head_len = sizeof(struct icmphdr); |
637 | 637 | ||
638 | icmp_push_reply(&icmp_param, &ipc, rt); | 638 | icmp_push_reply(&icmp_param, &ipc, &rt); |
639 | ende: | 639 | ende: |
640 | ip_rt_put(rt); | 640 | ip_rt_put(rt); |
641 | out_unlock: | 641 | out_unlock: |
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 46d7be233eac..5516825a0751 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c | |||
@@ -778,7 +778,7 @@ int ip_append_data(struct sock *sk, | |||
778 | int getfrag(void *from, char *to, int offset, int len, | 778 | int getfrag(void *from, char *to, int offset, int len, |
779 | int odd, struct sk_buff *skb), | 779 | int odd, struct sk_buff *skb), |
780 | void *from, int length, int transhdrlen, | 780 | void *from, int length, int transhdrlen, |
781 | struct ipcm_cookie *ipc, struct rtable *rt, | 781 | struct ipcm_cookie *ipc, struct rtable **rtp, |
782 | unsigned int flags) | 782 | unsigned int flags) |
783 | { | 783 | { |
784 | struct inet_sock *inet = inet_sk(sk); | 784 | struct inet_sock *inet = inet_sk(sk); |
@@ -793,6 +793,7 @@ int ip_append_data(struct sock *sk, | |||
793 | int offset = 0; | 793 | int offset = 0; |
794 | unsigned int maxfraglen, fragheaderlen; | 794 | unsigned int maxfraglen, fragheaderlen; |
795 | int csummode = CHECKSUM_NONE; | 795 | int csummode = CHECKSUM_NONE; |
796 | struct rtable *rt; | ||
796 | 797 | ||
797 | if (flags&MSG_PROBE) | 798 | if (flags&MSG_PROBE) |
798 | return 0; | 799 | return 0; |
@@ -812,7 +813,11 @@ int ip_append_data(struct sock *sk, | |||
812 | inet->cork.flags |= IPCORK_OPT; | 813 | inet->cork.flags |= IPCORK_OPT; |
813 | inet->cork.addr = ipc->addr; | 814 | inet->cork.addr = ipc->addr; |
814 | } | 815 | } |
815 | dst_hold(&rt->u.dst); | 816 | rt = *rtp; |
817 | /* | ||
818 | * We steal reference to this route, caller should not release it | ||
819 | */ | ||
820 | *rtp = NULL; | ||
816 | inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? | 821 | inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? |
817 | rt->u.dst.dev->mtu : | 822 | rt->u.dst.dev->mtu : |
818 | dst_mtu(rt->u.dst.path); | 823 | dst_mtu(rt->u.dst.path); |
@@ -1391,7 +1396,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar | |||
1391 | sk->sk_protocol = ip_hdr(skb)->protocol; | 1396 | sk->sk_protocol = ip_hdr(skb)->protocol; |
1392 | sk->sk_bound_dev_if = arg->bound_dev_if; | 1397 | sk->sk_bound_dev_if = arg->bound_dev_if; |
1393 | ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, | 1398 | ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, |
1394 | &ipc, rt, MSG_DONTWAIT); | 1399 | &ipc, &rt, MSG_DONTWAIT); |
1395 | if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { | 1400 | if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { |
1396 | if (arg->csumoffset >= 0) | 1401 | if (arg->csumoffset >= 0) |
1397 | *((__sum16 *)skb_transport_header(skb) + | 1402 | *((__sum16 *)skb_transport_header(skb) + |
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 998fcffc9e15..dff8bc4e0fac 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c | |||
@@ -572,7 +572,7 @@ back_from_confirm: | |||
572 | ipc.addr = rt->rt_dst; | 572 | ipc.addr = rt->rt_dst; |
573 | lock_sock(sk); | 573 | lock_sock(sk); |
574 | err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, | 574 | err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, |
575 | &ipc, rt, msg->msg_flags); | 575 | &ipc, &rt, msg->msg_flags); |
576 | if (err) | 576 | if (err) |
577 | ip_flush_pending_frames(sk); | 577 | ip_flush_pending_frames(sk); |
578 | else if (!(msg->msg_flags & MSG_MORE)) | 578 | else if (!(msg->msg_flags & MSG_MORE)) |
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index da869ce041d9..549114472db3 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -719,7 +719,7 @@ do_append_data: | |||
719 | up->len += ulen; | 719 | up->len += ulen; |
720 | getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; | 720 | getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; |
721 | err = ip_append_data(sk, getfrag, msg->msg_iov, ulen, | 721 | err = ip_append_data(sk, getfrag, msg->msg_iov, ulen, |
722 | sizeof(struct udphdr), &ipc, rt, | 722 | sizeof(struct udphdr), &ipc, &rt, |
723 | corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); | 723 | corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); |
724 | if (err) | 724 | if (err) |
725 | udp_flush_pending_frames(sk); | 725 | udp_flush_pending_frames(sk); |