author     Daniel Borkmann <dborkman@redhat.com>  2013-12-06 05:36:17 -0500
committer  David S. Miller <davem@davemloft.net>  2013-12-09 20:23:33 -0500
commit     d346a3fae3ff1d99f5d0c819bf86edf9094a26a1 (patch)
tree       c1a83b58fc8904403063195bc77d5abe4a0c0d79
parent     4262e5ccbbb5171abd2921eed16ed339633d6478 (diff)
packet: introduce PACKET_QDISC_BYPASS socket option
This patch introduces a PACKET_QDISC_BYPASS socket option that allows for
using a similar xmit() function as in pktgen instead of taking the
dev_queue_xmit() path. This can be very useful when PF_PACKET applications
are required to be used in a similar scenario as pktgen, but with full,
flexible packet payload that needs to be provided, for example.

By default, nothing changes in behaviour for normal PF_PACKET TX users, so
everything stays as is for applications. New users, however, can now set
PACKET_QDISC_BYPASS if needed to i) prevent their own packets from
reentering packet_rcv() and ii) push the frame directly to the driver.

In doing so we can increase pps (here 64 byte packets) for PF_PACKET a bit:

   # CPUs --  QDISC_BYPASS -- qdisc path -- qdisc path[**]
   1 CPU  ==  1,509,628 pps --  1,208,708 --  1,247,436
   2 CPUs ==  3,198,659 pps --  2,536,012 --  1,605,779
   3 CPUs ==  4,787,992 pps --  3,788,740 --  1,735,610
   4 CPUs ==  6,173,956 pps --  4,907,799 --  1,909,114
   5 CPUs ==  7,495,676 pps --  5,956,499 --  2,014,422
   6 CPUs ==  9,001,496 pps --  7,145,064 --  2,155,261
   7 CPUs == 10,229,776 pps --  8,190,596 --  2,220,619
   8 CPUs == 11,040,732 pps --  9,188,544 --  2,241,879
   9 CPUs == 12,009,076 pps -- 10,275,936 --  2,068,447
  10 CPUs == 11,380,052 pps -- 11,265,337 --  1,578,689
  11 CPUs == 11,672,676 pps -- 11,845,344 --  1,297,412
  [...]
  20 CPUs == 11,363,192 pps -- 11,014,933 --  1,245,081

  [**]: qdisc path with packet_rcv(), which is probably how most people
        use it (hopefully not anymore if not needed)

The test was done using a modified trafgen, sending a simple static 64 byte
packet, on all CPUs. The trick in the fast "qdisc path" case is to avoid
reentering packet_rcv() by setting the RAW socket protocol to zero, like:
socket(PF_PACKET, SOCK_RAW, 0);

Trade-offs are documented in this patch as well: clearly, if queues are
busy, we will drop more packets, tc disciplines are ignored, and these
packets are no longer visible to taps. For a pktgen-like scenario, we argue
that this is acceptable.

The pointer to the xmit function has been placed in a packet socket
structure hole between cached_dev and prot_hook, which is hot anyway as
we're working on cached_dev in each send path.

Done in joint work together with Jesper Dangaard Brouer.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
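For illustration only (not part of the patch): a minimal userspace sketch
that combines the two knobs described above, i.e. a TX-only PF_PACKET
socket created with protocol 0 so that its own frames do not reenter
packet_rcv(), plus PACKET_QDISC_BYPASS enabled via setsockopt() as
documented in packet_mmap.txt below. The device name "eth0" and the dummy
64 byte frame are placeholders; the #define fallback is only needed with
uapi headers that predate this patch:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <net/if.h>
    #include <sys/socket.h>
    #include <linux/if_packet.h>
    #include <linux/if_ether.h>

    #ifndef PACKET_QDISC_BYPASS
    #define PACKET_QDISC_BYPASS 20  /* value added by this patch */
    #endif

    int main(void)
    {
            /* Protocol 0: TX-only, own frames do not reenter packet_rcv(). */
            int fd = socket(PF_PACKET, SOCK_RAW, 0);
            if (fd < 0) {
                    perror("socket");
                    return 1;
            }

            int one = 1;
            /* Fails with ENOPROTOOPT on kernels without this patch. */
            if (setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS,
                           &one, sizeof(one)) < 0)
                    perror("setsockopt(PACKET_QDISC_BYPASS)");

            struct sockaddr_ll sll;
            memset(&sll, 0, sizeof(sll));
            sll.sll_family  = AF_PACKET;
            sll.sll_ifindex = if_nametoindex("eth0");  /* placeholder device */

            /* Simple static 64 byte frame: broadcast dst, zero src, IPv4 type. */
            unsigned char frame[64] = { 0 };
            memset(frame, 0xff, ETH_ALEN);
            frame[2 * ETH_ALEN]     = 0x08;
            frame[2 * ETH_ALEN + 1] = 0x00;

            /* With bypass enabled, this lands in packet_direct_xmit()
             * instead of dev_queue_xmit(): no qdisc, no taps, more drops
             * possible when the TX queue is busy. */
            if (sendto(fd, frame, sizeof(frame), 0,
                       (struct sockaddr *)&sll, sizeof(sll)) < 0)
                    perror("sendto");

            close(fd);
            return 0;
    }

On older kernels the setsockopt() call simply fails with ENOPROTOOPT, in
which case transmission falls back to the usual dev_queue_xmit() path.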
-rw-r--r--  Documentation/networking/packet_mmap.txt  21
-rw-r--r--  include/uapi/linux/if_packet.h              1
-rw-r--r--  net/packet/af_packet.c                     91
-rw-r--r--  net/packet/internal.h                       1
4 files changed, 102 insertions(+), 12 deletions(-)
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index 8e48e3b14227..4288ffafba9f 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -953,6 +953,27 @@ int main(int argc, char **argp)
 }
 
 -------------------------------------------------------------------------------
++ PACKET_QDISC_BYPASS
+-------------------------------------------------------------------------------
+
+If there is a requirement to load the network with many packets in a similar
+fashion as pktgen does, you might set the following option after socket
+creation:
+
+    int one = 1;
+    setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
+
+This has the side effect that packets sent through PF_PACKET will bypass the
+kernel's qdisc layer and are forcibly pushed to the driver directly. Meaning,
+packets are not buffered, tc disciplines are ignored, increased loss can occur
+and such packets are also not visible to other PF_PACKET sockets anymore. So,
+you have been warned; generally, this can be useful for stress testing various
+components of a system.
+
+By default, PACKET_QDISC_BYPASS is disabled and needs to be explicitly enabled
+on PF_PACKET sockets.
+
+-------------------------------------------------------------------------------
 + PACKET_TIMESTAMP
 -------------------------------------------------------------------------------
 
diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index dbf06667394b..1e24aa701cbd 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -51,6 +51,7 @@ struct sockaddr_ll {
 #define PACKET_TIMESTAMP		17
 #define PACKET_FANOUT			18
 #define PACKET_TX_HAS_OFF		19
+#define PACKET_QDISC_BYPASS		20
 
 #define PACKET_FANOUT_HASH		0
 #define PACKET_FANOUT_LB		1
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index e4171dd98590..9d70f1349926 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -237,6 +237,48 @@ struct packet_skb_cb {
 static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
 static void __fanout_link(struct sock *sk, struct packet_sock *po);
 
+static int packet_direct_xmit(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	const struct net_device_ops *ops = dev->netdev_ops;
+	netdev_features_t features;
+	struct netdev_queue *txq;
+	u16 queue_map;
+	int ret;
+
+	if (unlikely(!netif_running(dev) ||
+		     !netif_carrier_ok(dev))) {
+		kfree_skb(skb);
+		return NET_XMIT_DROP;
+	}
+
+	features = netif_skb_features(skb);
+	if (skb_needs_linearize(skb, features) &&
+	    __skb_linearize(skb)) {
+		kfree_skb(skb);
+		return NET_XMIT_DROP;
+	}
+
+	queue_map = skb_get_queue_mapping(skb);
+	txq = netdev_get_tx_queue(dev, queue_map);
+
+	__netif_tx_lock_bh(txq);
+	if (unlikely(netif_xmit_frozen_or_stopped(txq))) {
+		ret = NETDEV_TX_BUSY;
+		kfree_skb(skb);
+		goto out;
+	}
+
+	ret = ops->ndo_start_xmit(skb, dev);
+	if (likely(dev_xmit_complete(ret)))
+		txq_trans_update(txq);
+	else
+		kfree_skb(skb);
+out:
+	__netif_tx_unlock_bh(txq);
+	return ret;
+}
+
 static struct net_device *packet_cached_dev_get(struct packet_sock *po)
 {
 	struct net_device *dev;
@@ -261,6 +303,16 @@ static void packet_cached_dev_reset(struct packet_sock *po)
 	RCU_INIT_POINTER(po->cached_dev, NULL);
 }
 
+static bool packet_use_direct_xmit(const struct packet_sock *po)
+{
+	return po->xmit == packet_direct_xmit;
+}
+
+static u16 packet_pick_tx_queue(struct net_device *dev)
+{
+	return (u16) smp_processor_id() % dev->real_num_tx_queues;
+}
+
 /* register_prot_hook must be invoked with the po->bind_lock held,
  * or from a context in which asynchronous accesses to the packet
  * socket is not possible (packet_create()).
@@ -1994,9 +2046,10 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 
 	skb_reserve(skb, hlen);
 	skb_reset_network_header(skb);
-	skb_probe_transport_header(skb, 0);
 
-	if (po->tp_tx_has_off) {
+	if (!packet_use_direct_xmit(po))
+		skb_probe_transport_header(skb, 0);
+	if (unlikely(po->tp_tx_has_off)) {
 		int off_min, off_max, off;
 		off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
 		off_max = po->tx_ring.frame_size - tp_len;
@@ -2166,12 +2219,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 		}
 	}
 
+	skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
 	skb->destructor = tpacket_destruct_skb;
 	__packet_set_status(po, ph, TP_STATUS_SENDING);
 	atomic_inc(&po->tx_ring.pending);
 
 	status = TP_STATUS_SEND_REQUEST;
-	err = dev_queue_xmit(skb);
+	err = po->xmit(skb);
 	if (unlikely(err > 0)) {
 		err = net_xmit_errno(err);
 		if (err && __packet_get_status(po, ph) ==
@@ -2230,8 +2284,7 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
 	return skb;
 }
 
-static int packet_snd(struct socket *sock,
-		      struct msghdr *msg, size_t len)
+static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 {
 	struct sock *sk = sock->sk;
 	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
@@ -2376,6 +2429,7 @@ static int packet_snd(struct socket *sock,
 	skb->dev = dev;
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
+	skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
 
 	if (po->has_vnet_hdr) {
 		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
@@ -2396,16 +2450,12 @@ static int packet_snd(struct socket *sock,
 		len += vnet_hdr_len;
 	}
 
-	skb_probe_transport_header(skb, reserve);
-
+	if (!packet_use_direct_xmit(po))
+		skb_probe_transport_header(skb, reserve);
 	if (unlikely(extra_len == 4))
 		skb->no_fcs = 1;
 
-	/*
-	 *	Now send it
-	 */
-
-	err = dev_queue_xmit(skb);
+	err = po->xmit(skb);
 	if (err > 0 && (err = net_xmit_errno(err)) != 0)
 		goto out_unlock;
 
@@ -2427,6 +2477,7 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
 {
 	struct sock *sk = sock->sk;
 	struct packet_sock *po = pkt_sk(sk);
+
 	if (po->tx_ring.pg_vec)
 		return tpacket_snd(po, msg);
 	else
@@ -2641,6 +2692,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	po = pkt_sk(sk);
 	sk->sk_family = PF_PACKET;
 	po->num = proto;
+	po->xmit = dev_queue_xmit;
 
 	packet_cached_dev_reset(po);
 
@@ -3220,6 +3272,18 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		po->tp_tx_has_off = !!val;
 		return 0;
 	}
+	case PACKET_QDISC_BYPASS:
+	{
+		int val;
+
+		if (optlen != sizeof(val))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+
+		po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
+		return 0;
+	}
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -3312,6 +3376,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 	case PACKET_TX_HAS_OFF:
 		val = po->tp_tx_has_off;
 		break;
+	case PACKET_QDISC_BYPASS:
+		val = packet_use_direct_xmit(po);
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 1035fa2d909c..0a87d7b36c9e 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -114,6 +114,7 @@ struct packet_sock {
 	unsigned int		tp_tx_has_off:1;
 	unsigned int		tp_tstamp;
 	struct net_device __rcu	*cached_dev;
+	int			(*xmit)(struct sk_buff *skb);
 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
 };
 
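Also for illustration: the PACKET_QDISC_BYPASS case added to
packet_getsockopt() above means userspace can read the setting back. A small
sketch, assuming fd is a PF_PACKET socket as in the earlier example and using
the hypothetical helper name qdisc_bypass_enabled():

    #include <sys/socket.h>
    #include <linux/if_packet.h>

    /* Returns 1 if PACKET_QDISC_BYPASS is enabled on fd, 0 if disabled,
     * -1 on error (e.g. ENOPROTOOPT on kernels without this patch). */
    static int qdisc_bypass_enabled(int fd)
    {
            int val = 0;
            socklen_t len = sizeof(val);

            if (getsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &val, &len) < 0)
                    return -1;
            return val ? 1 : 0;
    }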