diff options
author | Daniel Borkmann <dborkman@redhat.com> | 2013-12-06 05:36:17 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2013-12-09 20:23:33 -0500 |
commit | d346a3fae3ff1d99f5d0c819bf86edf9094a26a1 (patch) | |
tree | c1a83b58fc8904403063195bc77d5abe4a0c0d79 /net | |
parent | 4262e5ccbbb5171abd2921eed16ed339633d6478 (diff) |
packet: introduce PACKET_QDISC_BYPASS socket option
This patch introduces a PACKET_QDISC_BYPASS socket option, that
allows for using a similar xmit() function as in pktgen instead
of taking the dev_queue_xmit() path. This can be very useful when
PF_PACKET applications are required to be used in a similar
scenario as pktgen, but with full, flexible packet payload that
needs to be provided, for example.
By default, nothing changes in behaviour for normal PF_PACKET
TX users, so everything stays as is for applications. New users,
however, can now set PACKET_QDISC_BYPASS if needed to i) prevent
their own packets from reentering packet_rcv() and ii) directly
push the frame to the driver.
In doing so we can increase pps (here 64 byte packets) for
PF_PACKET a bit:
# CPUs -- QDISC_BYPASS -- qdisc path -- qdisc path[**]
1 CPU == 1,509,628 pps -- 1,208,708 -- 1,247,436
2 CPUs == 3,198,659 pps -- 2,536,012 -- 1,605,779
3 CPUs == 4,787,992 pps -- 3,788,740 -- 1,735,610
4 CPUs == 6,173,956 pps -- 4,907,799 -- 1,909,114
5 CPUs == 7,495,676 pps -- 5,956,499 -- 2,014,422
6 CPUs == 9,001,496 pps -- 7,145,064 -- 2,155,261
7 CPUs == 10,229,776 pps -- 8,190,596 -- 2,220,619
8 CPUs == 11,040,732 pps -- 9,188,544 -- 2,241,879
9 CPUs == 12,009,076 pps -- 10,275,936 -- 2,068,447
10 CPUs == 11,380,052 pps -- 11,265,337 -- 1,578,689
11 CPUs == 11,672,676 pps -- 11,845,344 -- 1,297,412
[...]
20 CPUs == 11,363,192 pps -- 11,014,933 -- 1,245,081
[**]: qdisc path with packet_rcv(), which is probably how most
people use it (hopefully not anymore if not needed)
The test was done using a modified trafgen, sending a simple
static 64 bytes packet, on all CPUs. The trick in the fast
"qdisc path" case, is to avoid reentering packet_rcv() by
setting the RAW socket protocol to zero, like:
socket(PF_PACKET, SOCK_RAW, 0);
Tradeoffs are documented in this patch as well: clearly, if
queues are busy, we will drop more packets, tc disciplines are
ignored, and these packets are not visible to taps anymore. For
a pktgen-like scenario, we argue that this is acceptable.
The pointer to the xmit function has been placed in packet
socket structure hole between cached_dev and prot_hook that
is hot anyway as we're working on cached_dev in each send path.
Done in joint work together with Jesper Dangaard Brouer.
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r-- | net/packet/af_packet.c | 91 | ||||
-rw-r--r-- | net/packet/internal.h | 1 |
2 files changed, 80 insertions, 12 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e4171dd98590..9d70f1349926 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c | |||
@@ -237,6 +237,48 @@ struct packet_skb_cb { | |||
237 | static void __fanout_unlink(struct sock *sk, struct packet_sock *po); | 237 | static void __fanout_unlink(struct sock *sk, struct packet_sock *po); |
238 | static void __fanout_link(struct sock *sk, struct packet_sock *po); | 238 | static void __fanout_link(struct sock *sk, struct packet_sock *po); |
239 | 239 | ||
240 | static int packet_direct_xmit(struct sk_buff *skb) | ||
241 | { | ||
242 | struct net_device *dev = skb->dev; | ||
243 | const struct net_device_ops *ops = dev->netdev_ops; | ||
244 | netdev_features_t features; | ||
245 | struct netdev_queue *txq; | ||
246 | u16 queue_map; | ||
247 | int ret; | ||
248 | |||
249 | if (unlikely(!netif_running(dev) || | ||
250 | !netif_carrier_ok(dev))) { | ||
251 | kfree_skb(skb); | ||
252 | return NET_XMIT_DROP; | ||
253 | } | ||
254 | |||
255 | features = netif_skb_features(skb); | ||
256 | if (skb_needs_linearize(skb, features) && | ||
257 | __skb_linearize(skb)) { | ||
258 | kfree_skb(skb); | ||
259 | return NET_XMIT_DROP; | ||
260 | } | ||
261 | |||
262 | queue_map = skb_get_queue_mapping(skb); | ||
263 | txq = netdev_get_tx_queue(dev, queue_map); | ||
264 | |||
265 | __netif_tx_lock_bh(txq); | ||
266 | if (unlikely(netif_xmit_frozen_or_stopped(txq))) { | ||
267 | ret = NETDEV_TX_BUSY; | ||
268 | kfree_skb(skb); | ||
269 | goto out; | ||
270 | } | ||
271 | |||
272 | ret = ops->ndo_start_xmit(skb, dev); | ||
273 | if (likely(dev_xmit_complete(ret))) | ||
274 | txq_trans_update(txq); | ||
275 | else | ||
276 | kfree_skb(skb); | ||
277 | out: | ||
278 | __netif_tx_unlock_bh(txq); | ||
279 | return ret; | ||
280 | } | ||
281 | |||
240 | static struct net_device *packet_cached_dev_get(struct packet_sock *po) | 282 | static struct net_device *packet_cached_dev_get(struct packet_sock *po) |
241 | { | 283 | { |
242 | struct net_device *dev; | 284 | struct net_device *dev; |
@@ -261,6 +303,16 @@ static void packet_cached_dev_reset(struct packet_sock *po) | |||
261 | RCU_INIT_POINTER(po->cached_dev, NULL); | 303 | RCU_INIT_POINTER(po->cached_dev, NULL); |
262 | } | 304 | } |
263 | 305 | ||
306 | static bool packet_use_direct_xmit(const struct packet_sock *po) | ||
307 | { | ||
308 | return po->xmit == packet_direct_xmit; | ||
309 | } | ||
310 | |||
311 | static u16 packet_pick_tx_queue(struct net_device *dev) | ||
312 | { | ||
313 | return (u16) smp_processor_id() % dev->real_num_tx_queues; | ||
314 | } | ||
315 | |||
264 | /* register_prot_hook must be invoked with the po->bind_lock held, | 316 | /* register_prot_hook must be invoked with the po->bind_lock held, |
265 | * or from a context in which asynchronous accesses to the packet | 317 | * or from a context in which asynchronous accesses to the packet |
266 | * socket is not possible (packet_create()). | 318 | * socket is not possible (packet_create()). |
@@ -1994,9 +2046,10 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, | |||
1994 | 2046 | ||
1995 | skb_reserve(skb, hlen); | 2047 | skb_reserve(skb, hlen); |
1996 | skb_reset_network_header(skb); | 2048 | skb_reset_network_header(skb); |
1997 | skb_probe_transport_header(skb, 0); | ||
1998 | 2049 | ||
1999 | if (po->tp_tx_has_off) { | 2050 | if (!packet_use_direct_xmit(po)) |
2051 | skb_probe_transport_header(skb, 0); | ||
2052 | if (unlikely(po->tp_tx_has_off)) { | ||
2000 | int off_min, off_max, off; | 2053 | int off_min, off_max, off; |
2001 | off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); | 2054 | off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); |
2002 | off_max = po->tx_ring.frame_size - tp_len; | 2055 | off_max = po->tx_ring.frame_size - tp_len; |
@@ -2166,12 +2219,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) | |||
2166 | } | 2219 | } |
2167 | } | 2220 | } |
2168 | 2221 | ||
2222 | skb_set_queue_mapping(skb, packet_pick_tx_queue(dev)); | ||
2169 | skb->destructor = tpacket_destruct_skb; | 2223 | skb->destructor = tpacket_destruct_skb; |
2170 | __packet_set_status(po, ph, TP_STATUS_SENDING); | 2224 | __packet_set_status(po, ph, TP_STATUS_SENDING); |
2171 | atomic_inc(&po->tx_ring.pending); | 2225 | atomic_inc(&po->tx_ring.pending); |
2172 | 2226 | ||
2173 | status = TP_STATUS_SEND_REQUEST; | 2227 | status = TP_STATUS_SEND_REQUEST; |
2174 | err = dev_queue_xmit(skb); | 2228 | err = po->xmit(skb); |
2175 | if (unlikely(err > 0)) { | 2229 | if (unlikely(err > 0)) { |
2176 | err = net_xmit_errno(err); | 2230 | err = net_xmit_errno(err); |
2177 | if (err && __packet_get_status(po, ph) == | 2231 | if (err && __packet_get_status(po, ph) == |
@@ -2230,8 +2284,7 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, | |||
2230 | return skb; | 2284 | return skb; |
2231 | } | 2285 | } |
2232 | 2286 | ||
2233 | static int packet_snd(struct socket *sock, | 2287 | static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) |
2234 | struct msghdr *msg, size_t len) | ||
2235 | { | 2288 | { |
2236 | struct sock *sk = sock->sk; | 2289 | struct sock *sk = sock->sk; |
2237 | struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name; | 2290 | struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name; |
@@ -2376,6 +2429,7 @@ static int packet_snd(struct socket *sock, | |||
2376 | skb->dev = dev; | 2429 | skb->dev = dev; |
2377 | skb->priority = sk->sk_priority; | 2430 | skb->priority = sk->sk_priority; |
2378 | skb->mark = sk->sk_mark; | 2431 | skb->mark = sk->sk_mark; |
2432 | skb_set_queue_mapping(skb, packet_pick_tx_queue(dev)); | ||
2379 | 2433 | ||
2380 | if (po->has_vnet_hdr) { | 2434 | if (po->has_vnet_hdr) { |
2381 | if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { | 2435 | if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { |
@@ -2396,16 +2450,12 @@ static int packet_snd(struct socket *sock, | |||
2396 | len += vnet_hdr_len; | 2450 | len += vnet_hdr_len; |
2397 | } | 2451 | } |
2398 | 2452 | ||
2399 | skb_probe_transport_header(skb, reserve); | 2453 | if (!packet_use_direct_xmit(po)) |
2400 | 2454 | skb_probe_transport_header(skb, reserve); | |
2401 | if (unlikely(extra_len == 4)) | 2455 | if (unlikely(extra_len == 4)) |
2402 | skb->no_fcs = 1; | 2456 | skb->no_fcs = 1; |
2403 | 2457 | ||
2404 | /* | 2458 | err = po->xmit(skb); |
2405 | * Now send it | ||
2406 | */ | ||
2407 | |||
2408 | err = dev_queue_xmit(skb); | ||
2409 | if (err > 0 && (err = net_xmit_errno(err)) != 0) | 2459 | if (err > 0 && (err = net_xmit_errno(err)) != 0) |
2410 | goto out_unlock; | 2460 | goto out_unlock; |
2411 | 2461 | ||
@@ -2427,6 +2477,7 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, | |||
2427 | { | 2477 | { |
2428 | struct sock *sk = sock->sk; | 2478 | struct sock *sk = sock->sk; |
2429 | struct packet_sock *po = pkt_sk(sk); | 2479 | struct packet_sock *po = pkt_sk(sk); |
2480 | |||
2430 | if (po->tx_ring.pg_vec) | 2481 | if (po->tx_ring.pg_vec) |
2431 | return tpacket_snd(po, msg); | 2482 | return tpacket_snd(po, msg); |
2432 | else | 2483 | else |
@@ -2641,6 +2692,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, | |||
2641 | po = pkt_sk(sk); | 2692 | po = pkt_sk(sk); |
2642 | sk->sk_family = PF_PACKET; | 2693 | sk->sk_family = PF_PACKET; |
2643 | po->num = proto; | 2694 | po->num = proto; |
2695 | po->xmit = dev_queue_xmit; | ||
2644 | 2696 | ||
2645 | packet_cached_dev_reset(po); | 2697 | packet_cached_dev_reset(po); |
2646 | 2698 | ||
@@ -3220,6 +3272,18 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv | |||
3220 | po->tp_tx_has_off = !!val; | 3272 | po->tp_tx_has_off = !!val; |
3221 | return 0; | 3273 | return 0; |
3222 | } | 3274 | } |
3275 | case PACKET_QDISC_BYPASS: | ||
3276 | { | ||
3277 | int val; | ||
3278 | |||
3279 | if (optlen != sizeof(val)) | ||
3280 | return -EINVAL; | ||
3281 | if (copy_from_user(&val, optval, sizeof(val))) | ||
3282 | return -EFAULT; | ||
3283 | |||
3284 | po->xmit = val ? packet_direct_xmit : dev_queue_xmit; | ||
3285 | return 0; | ||
3286 | } | ||
3223 | default: | 3287 | default: |
3224 | return -ENOPROTOOPT; | 3288 | return -ENOPROTOOPT; |
3225 | } | 3289 | } |
@@ -3312,6 +3376,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, | |||
3312 | case PACKET_TX_HAS_OFF: | 3376 | case PACKET_TX_HAS_OFF: |
3313 | val = po->tp_tx_has_off; | 3377 | val = po->tp_tx_has_off; |
3314 | break; | 3378 | break; |
3379 | case PACKET_QDISC_BYPASS: | ||
3380 | val = packet_use_direct_xmit(po); | ||
3381 | break; | ||
3315 | default: | 3382 | default: |
3316 | return -ENOPROTOOPT; | 3383 | return -ENOPROTOOPT; |
3317 | } | 3384 | } |
diff --git a/net/packet/internal.h b/net/packet/internal.h index 1035fa2d909c..0a87d7b36c9e 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h | |||
@@ -114,6 +114,7 @@ struct packet_sock { | |||
114 | unsigned int tp_tx_has_off:1; | 114 | unsigned int tp_tx_has_off:1; |
115 | unsigned int tp_tstamp; | 115 | unsigned int tp_tstamp; |
116 | struct net_device __rcu *cached_dev; | 116 | struct net_device __rcu *cached_dev; |
117 | int (*xmit)(struct sk_buff *skb); | ||
117 | struct packet_type prot_hook ____cacheline_aligned_in_smp; | 118 | struct packet_type prot_hook ____cacheline_aligned_in_smp; |
118 | }; | 119 | }; |
119 | 120 | ||