author		Daniel Borkmann <dborkman@redhat.com>	2014-01-15 10:25:36 -0500
committer	David S. Miller <davem@davemloft.net>	2014-01-16 19:17:12 -0500
commit		b013840810c221f2b0cf641d01531526052dc1fb (patch)
tree		c1fc18fcd8c5b011fe29dc4ba984fd8d519642e8 /net/packet
parent		87a2fd286adf35a87cf6cb30fa80a0726eb74f76 (diff)
packet: use percpu mmap tx frame pending refcount
In PF_PACKET's packet mmap(), we can avoid using one atomic_inc() and one atomic_dec() call in the skb destructor and use a percpu reference count instead in order to determine if packets are still pending to be sent out. Micro-benchmark with [1], slightly modified (that is, protocol = 0 in socket(2) and bind(2)), on a rather crappy testing machine; I expect it to scale and have even better results on bigger machines:

./packet_mm_tx -s7000 -m7200 -z700000 em1, avg over 2500 runs:

With patch:    4,022,015 cyc
Without patch: 4,812,994 cyc

time ./packet_mm_tx -s64 -c10000000 em1 > /dev/null, stable:

With patch:
  real    1m32.241s
  user    0m0.287s
  sys     1m29.316s

Without patch:
  real    1m38.386s
  user    0m0.265s
  sys     1m35.572s

In tpacket_snd(), it is okay to use packet_read_pending(): in the fast path we short-circuit the loop condition with ph != NULL anyway, since we still have frames to process. With MSG_DONTWAIT we never take this path either, as need_wait is false. Without MSG_DONTWAIT, calling packet_read_pending() is fine because once we reach that path we are done processing outgoing frames and only check whether skbs are still outstanding to be orphaned. We can stay lockless with this percpu counter since it is acceptable for the sum to be momentarily imprecise on that path; it levels out at 0 once all pending frames have eventually reached the skb destructor through tx reclaim.

When a tx process is pinned to particular CPUs, we expect the reference counter to overflow: one CPU sees heavy increments, while the decrements are distributed across all CPUs, e.g. through ksoftirqd. As David Laight points out, the C language does not define the result of signed int overflow (rather than wrap, it is allowed to saturate, for instance), so we have to use unsigned int as the reference count. The sum over all CPUs, once tx is complete, then results in 0 again.

The BUG_ON() in tpacket_destruct_skb() can be removed as well: the destructor can _only_ be set from inside the tpacket_snd() path, and we make sure to increase tx_ring.pending before calling po->xmit(skb) in any case, so testing for tx_ring.pending == 0 is not too useful. It would rather have been useful to test whether lower layers failed to orphan the skb, leaving us with ring slots that are never put back to TP_STATUS_AVAILABLE; but such a bug is caught in user space anyway, as we end up realizing that no TP_STATUS_AVAILABLE slots are left. Therefore, we're all set.

Btw, the RX_RING path does not make use of the pending member, so we don't need to use up any percpu memory there either. Also note that __alloc_percpu() already returns a zero-filled percpu area, so no further initialization is needed.

[1] http://wiki.ipxwarzone.com/index.php5?title=Linux_packet_mmap

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
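To make the overflow argument above concrete, here is a minimal user-space sketch of the same bookkeeping (plain C, not part of the patch; the two-element array, read_pending() and the simulated counts are purely illustrative stand-ins for the percpu machinery):

#include <limits.h>
#include <stdio.h>

static unsigned int pending[2];	/* illustrative stand-ins for the percpu counters */

static unsigned int read_pending(void)
{
	unsigned int sum = 0;
	int cpu;

	for (cpu = 0; cpu < 2; cpu++)
		sum += pending[cpu];	/* unsigned: wraps modulo 2^32 */
	return sum;
}

int main(void)
{
	/* UINT_MAX + 2 increments, all pinned to "CPU" 0: the counter
	 * overflows and wraps around to 1. */
	pending[0] = 1;
	/* The matching UINT_MAX + 2 decrements all ran on "CPU" 1:
	 * starting from 0, that counter underflows to UINT_MAX. */
	pending[1] = UINT_MAX;

	printf("pending = %u\n", read_pending());	/* 1 + UINT_MAX wraps to 0 */

	pending[0]++;					/* one frame still in flight */
	printf("pending = %u\n", read_pending());	/* now 1 */
	return 0;
}

Compiled with e.g. gcc -O2, it prints pending = 0 and then pending = 1: each counter has wrapped on its own, but the modular sum stays exact as long as every increment is eventually matched by a decrement somewhere.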
Diffstat (limited to 'net/packet')
-rw-r--r--	net/packet/af_packet.c	66
-rw-r--r--	net/packet/diag.c	1
-rw-r--r--	net/packet/internal.h	2
3 files changed, 62 insertions(+), 7 deletions(-)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index d5495d87f399..12f2f725a945 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -89,6 +89,7 @@
 #include <linux/errqueue.h>
 #include <linux/net_tstamp.h>
 #include <linux/reciprocal_div.h>
+#include <linux/percpu.h>
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
 #endif
@@ -1168,6 +1169,47 @@ static void packet_increment_head(struct packet_ring_buffer *buff)
 	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
 }
 
+static void packet_inc_pending(struct packet_ring_buffer *rb)
+{
+	this_cpu_inc(*rb->pending_refcnt);
+}
+
+static void packet_dec_pending(struct packet_ring_buffer *rb)
+{
+	this_cpu_dec(*rb->pending_refcnt);
+}
+
+static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
+{
+	unsigned int refcnt = 0;
+	int cpu;
+
+	/* We don't use pending refcount in rx_ring. */
+	if (rb->pending_refcnt == NULL)
+		return 0;
+
+	for_each_possible_cpu(cpu)
+		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
+
+	return refcnt;
+}
+
+static int packet_alloc_pending(struct packet_sock *po)
+{
+	po->rx_ring.pending_refcnt = NULL;
+
+	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
+	if (unlikely(po->tx_ring.pending_refcnt == NULL))
+		return -ENOBUFS;
+
+	return 0;
+}
+
+static void packet_free_pending(struct packet_sock *po)
+{
+	free_percpu(po->tx_ring.pending_refcnt);
+}
+
 static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
 {
 	struct sock *sk = &po->sk;
@@ -2014,8 +2056,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
 	__u32 ts;
 
 	ph = skb_shinfo(skb)->destructor_arg;
-	BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
-	atomic_dec(&po->tx_ring.pending);
+	packet_dec_pending(&po->tx_ring);
 
 	ts = __packet_set_timestamp(po, ph, skb);
 	__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
@@ -2236,7 +2277,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
 	skb->destructor = tpacket_destruct_skb;
 	__packet_set_status(po, ph, TP_STATUS_SENDING);
-	atomic_inc(&po->tx_ring.pending);
+	packet_inc_pending(&po->tx_ring);
 
 	status = TP_STATUS_SEND_REQUEST;
 	err = po->xmit(skb);
@@ -2256,8 +2297,14 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 		}
 		packet_increment_head(&po->tx_ring);
 		len_sum += tp_len;
-	} while (likely((ph != NULL) || (need_wait &&
-		 atomic_read(&po->tx_ring.pending))));
+	} while (likely((ph != NULL) ||
+		/* Note: packet_read_pending() might be slow if we have
+		 * to call it as it's per_cpu variable, but in fast-path
+		 * we already short-circuit the loop with the first
+		 * condition, and luckily don't have to go that path
+		 * anyway.
+		 */
+		 (need_wait && packet_read_pending(&po->tx_ring))));
 
 	err = len_sum;
 	goto out_put;
@@ -2556,6 +2603,7 @@ static int packet_release(struct socket *sock)
 	/* Purge queues */
 
 	skb_queue_purge(&sk->sk_receive_queue);
+	packet_free_pending(po);
 	sk_refcnt_debug_release(sk);
 
 	sock_put(sk);
@@ -2717,6 +2765,10 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	po->num = proto;
 	po->xmit = dev_queue_xmit;
 
+	err = packet_alloc_pending(po);
+	if (err)
+		goto out2;
+
 	packet_cached_dev_reset(po);
 
 	sk->sk_destruct = packet_sock_destruct;
@@ -2749,6 +2801,8 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	preempt_enable();
 
 	return 0;
+out2:
+	sk_free(sk);
 out:
 	return err;
 }
@@ -3676,7 +3730,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 	if (!closing) {
 		if (atomic_read(&po->mapped))
 			goto out;
-		if (atomic_read(&rb->pending))
+		if (packet_read_pending(rb))
 			goto out;
 	}
 
diff --git a/net/packet/diag.c b/net/packet/diag.c
index a9584a2f6d69..533ce4ff108a 100644
--- a/net/packet/diag.c
+++ b/net/packet/diag.c
@@ -3,6 +3,7 @@
 #include <linux/net.h>
 #include <linux/netdevice.h>
 #include <linux/packet_diag.h>
+#include <linux/percpu.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
 
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 0a87d7b36c9e..eb9580a6b25f 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -64,7 +64,7 @@ struct packet_ring_buffer {
 	unsigned int		pg_vec_pages;
 	unsigned int		pg_vec_len;
 
-	atomic_t		pending;
+	unsigned int __percpu	*pending_refcnt;
 
 	struct tpacket_kbdq_core	prb_bdqc;
 };