author     David Held <drheld@google.com>          2014-07-15 23:28:32 -0400
committer  David S. Miller <davem@davemloft.net>   2014-07-17 02:29:52 -0400
commit     2dc41cff7545d55c6294525c811594576f8e119c
tree       cab09b28d188606139b1b50b661f42da157c52ba
parent     5cf3d46192fccf68b4a4759e4d7346e41c669a76
udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver.
Many multicast sources can have the same port, which can result in a very
large list when hashing by port only. Hash by address and port instead
if this is the case. This makes multicast more similar to unicast.

On a 24-core machine receiving from 500 multicast sockets on the same
port, before this patch 80% of system CPU was used up by spin locking
and only ~25% of packets were successfully delivered.

With this patch, all packets are delivered and kernel overhead drops to
~8% system CPU spent on spinlocks.
Signed-off-by: David Held <drheld@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
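For context, the contended workload those numbers describe can be reproduced from user space by binding many UDP sockets to one shared port and joining a common group. The following standalone sketch is illustrative only; the group address 239.1.1.1, port 5000, and socket count are assumptions, not taken from the patch:

/* Hypothetical repro of the workload above: many UDP sockets sharing
 * one port and joining the same multicast group. Group address, port,
 * and socket count are illustrative only. */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	enum { NSOCKS = 500, PORT = 5000 };
	struct sockaddr_in addr;
	struct ip_mreq mreq;
	int i, one = 1;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(PORT);

	memset(&mreq, 0, sizeof(mreq));
	mreq.imr_multiaddr.s_addr = inet_addr("239.1.1.1");
	mreq.imr_interface.s_addr = htonl(INADDR_ANY);

	for (i = 0; i < NSOCKS; i++) {
		int fd = socket(AF_INET, SOCK_DGRAM, 0);

		/* SO_REUSEADDR lets every socket bind the same port. */
		if (fd < 0 ||
		    setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
			       &one, sizeof(one)) ||
		    bind(fd, (struct sockaddr *)&addr, sizeof(addr)) ||
		    setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP,
			       &mreq, sizeof(mreq))) {
			perror("socket setup");
			exit(1);
		}
	}
	/* Without this patch, every datagram to the group walks one
	 * 500-entry port-only (hash1) chain under the slot spinlock. */
	pause();
	return 0;
}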
 include/net/sock.h | 14
 net/ipv4/udp.c     | 31
 net/ipv6/udp.c     | 30
 3 files changed, 55 insertions, 20 deletions
diff --git a/include/net/sock.h b/include/net/sock.h
index 29e48a6d1ded..28f734601b50 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -660,6 +660,20 @@ static inline void sk_add_bind_node(struct sock *sk,
 #define sk_for_each_bound(__sk, list) \
 	hlist_for_each_entry(__sk, list, sk_bind_node)
 
+/**
+ * sk_nulls_for_each_entry_offset - iterate over a list at a given struct offset
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @offset:	offset of hlist_node within the struct.
+ *
+ */
+#define sk_nulls_for_each_entry_offset(tpos, pos, head, offset)	\
+	for (pos = (head)->first;					\
+	     (!is_a_nulls(pos)) &&					\
+	     ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1; });	\
+	     pos = pos->next)
+
 static inline struct user_namespace *sk_user_ns(struct sock *sk)
 {
 	/* Careful only use this in a context where these parameters
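To see what the new iterator does, here is a minimal user-space analogue: it recovers the enclosing struct from a node pointer by subtracting a caller-supplied offsetof() value, so one loop body can walk either of two lists embedded in the same object. The names (toy_sock, port_node, addr_node) are invented for illustration, nulls-marker handling is omitted, and, like the kernel macro, it relies on GCC statement expressions:

/* Toy illustration of the offset trick: the same object sits on two
 * lists, and the iterator picks the node member via offsetof(). */
#include <stddef.h>
#include <stdio.h>

struct node { struct node *next; };

struct toy_sock {
	int id;
	struct node port_node;	/* stand-in for sk_nulls_node */
	struct node addr_node;	/* stand-in for skc_portaddr_node */
};

#define for_each_entry_offset(tpos, pos, head, offset)			\
	for (pos = (head); pos &&					\
	     ({ tpos = (void *)((char *)pos - (offset)); 1; });		\
	     pos = pos->next)

int main(void)
{
	struct toy_sock a = { .id = 1 }, b = { .id = 2 };
	struct node *head = &a.addr_node, *pos;
	struct toy_sock *sk;

	a.addr_node.next = &b.addr_node;
	b.addr_node.next = NULL;

	/* The same loop works for either list; only the offset changes. */
	for_each_entry_offset(sk, pos, head,
			      offsetof(struct toy_sock, addr_node))
		printf("id=%d\n", sk->id);
	return 0;
}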
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index bbcc33737ef1..f31053b90ee0 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1619,6 +1619,8 @@ static void flush_stack(struct sock **stack, unsigned int count,
 
 		if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
 			skb1 = NULL;
+
+		sock_put(sk);
 	}
 	if (unlikely(skb1))
 		kfree_skb(skb1);
@@ -1651,10 +1653,20 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	unsigned short hnum = ntohs(uh->dest);
 	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
 	int dif = skb->dev->ifindex;
-	unsigned int i, count = 0;
+	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+
+	if (use_hash2) {
+		hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
+			    udp_table.mask;
+		hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+start_lookup:
+		hslot = &udp_table.hash2[hash2];
+		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+	}
 
 	spin_lock(&hslot->lock);
-	sk_nulls_for_each(sk, node, &hslot->head) {
+	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
 		if (__udp_is_mcast_sock(net, sk,
 					uh->dest, daddr,
 					uh->source, saddr,
@@ -1664,24 +1676,23 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 			count = 0;
 		}
 		stack[count++] = sk;
+		sock_hold(sk);
 		}
 	}
-	/*
-	 * before releasing chain lock, we must take a reference on sockets
-	 */
-	for (i = 0; i < count; i++)
-		sock_hold(stack[i]);
 
 	spin_unlock(&hslot->lock);
 
+	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
+	if (use_hash2 && hash2 != hash2_any) {
+		hash2 = hash2_any;
+		goto start_lookup;
+	}
+
 	/*
 	 * do the slow work with no lock held
 	 */
 	if (count) {
 		flush_stack(stack, count, skb, count - 1);
-
-		for (i = 0; i < count; i++)
-			sock_put(stack[i]);
 	} else {
 		kfree_skb(skb);
 	}
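The control flow above deserves a note: sockets bound to a specific address hash under (daddr, port) while wildcard-bound sockets hash under (INADDR_ANY, port), so delivery must visit up to two hash2 slots, and start_lookup re-enters the same scan for the second slot. A simplified, self-contained sketch of that two-pass shape follows; the hash function and slot-scan here are toy stand-ins, not the kernel's:

/* Two-pass hash2 lookup: scan the (daddr, port) slot, then the
 * (INADDR_ANY, port) slot, reusing one labelled block for both. */
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>

#define TABLE_MASK 255u

/* Toy stand-in for udp4_portaddr_hash(): mix address and port. */
static uint32_t portaddr_hash(uint32_t addr, uint16_t port)
{
	return (addr * 2654435761u) ^ port;
}

static void scan_slot(uint32_t slot)
{
	printf("scanning hash2 slot %u\n", slot);
}

static void mcast_deliver(uint32_t daddr, uint16_t port)
{
	uint32_t hash2_any = portaddr_hash(INADDR_ANY, port) & TABLE_MASK;
	uint32_t hash2 = portaddr_hash(daddr, port) & TABLE_MASK;

start_lookup:
	scan_slot(hash2);

	/* Wildcard-bound sockets live in the (INADDR_ANY, port) slot;
	 * visit it unless the first pass already covered that slot. */
	if (hash2 != hash2_any) {
		hash2 = hash2_any;
		goto start_lookup;
	}
}

int main(void)
{
	mcast_deliver(0xef010101 /* 239.1.1.1 */, 5000);
	return 0;
}

The hash2 != hash2_any guard matters: when daddr happens to hash into the same bucket as the wildcard address, a second pass would rescan the slot just visited and deliver duplicates.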
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 7d3bd80085be..f9d8800bb72f 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -745,6 +745,7 @@ static void flush_stack(struct sock **stack, unsigned int count,
 
 		if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0)
 			skb1 = NULL;
+		sock_put(sk);
 	}
 	if (unlikely(skb1))
 		kfree_skb(skb1);
@@ -774,10 +775,20 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	unsigned short hnum = ntohs(uh->dest);
 	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
 	int dif = inet6_iif(skb);
-	unsigned int i, count = 0;
+	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+
+	if (use_hash2) {
+		hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
+			    udp_table.mask;
+		hash2 = udp6_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+start_lookup:
+		hslot = &udp_table.hash2[hash2];
+		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+	}
 
 	spin_lock(&hslot->lock);
-	sk_nulls_for_each(sk, node, &hslot->head) {
+	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
 		if (__udp_v6_is_mcast_sock(net, sk,
 					   uh->dest, daddr,
 					   uh->source, saddr,
@@ -791,21 +802,20 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 			count = 0;
 		}
 		stack[count++] = sk;
+		sock_hold(sk);
 		}
 	}
-	/*
-	 * before releasing the lock, we must take reference on sockets
-	 */
-	for (i = 0; i < count; i++)
-		sock_hold(stack[i]);
 
 	spin_unlock(&hslot->lock);
 
+	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
+	if (use_hash2 && hash2 != hash2_any) {
+		hash2 = hash2_any;
+		goto start_lookup;
+	}
+
 	if (count) {
 		flush_stack(stack, count, skb, count - 1);
-
-		for (i = 0; i < count; i++)
-			sock_put(stack[i]);
 	} else {
 		kfree_skb(skb);
 	}