-rw-r--r--  Documentation/networking/packet_mmap.txt | 21
-rw-r--r--  include/uapi/linux/if_packet.h            |  1
-rw-r--r--  net/packet/af_packet.c                    | 91
-rw-r--r--  net/packet/internal.h                     |  1
4 files changed, 102 insertions, 12 deletions
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index 8e48e3b14227..4288ffafba9f 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -953,6 +953,27 @@ int main(int argc, char **argp)
 }
 
 -------------------------------------------------------------------------------
++ PACKET_QDISC_BYPASS
+-------------------------------------------------------------------------------
+
+If there is a requirement to load the network with many packets in a similar
+fashion as pktgen does, you might set the following option after socket
+creation:
+
+    int one = 1;
+    setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
+
+This has the side effect that packets sent through PF_PACKET bypass the
+kernel's qdisc layer and are pushed to the driver directly. This means that
+packets are not buffered, tc disciplines are ignored, increased loss can occur
+and such packets are no longer visible to other PF_PACKET sockets. So,
+you have been warned; generally, this can be useful for stress testing various
+components of a system.
+
+By default, PACKET_QDISC_BYPASS is disabled and needs to be explicitly enabled
+on PF_PACKET sockets.
+
+-------------------------------------------------------------------------------
 + PACKET_TIMESTAMP
 -------------------------------------------------------------------------------
 
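As an aside, a minimal userspace sketch of the usage the documentation above
describes might look as follows. It is not part of the patch: the interface
name "eth0", the all-0xff dummy frame, and the fallback #define of
PACKET_QDISC_BYPASS to 20 are assumptions for illustration only, and the
program needs CAP_NET_RAW (typically root) plus a kernel carrying this patch.

/*
 * Minimal sketch (not part of the patch): enable PACKET_QDISC_BYPASS on a
 * raw PF_PACKET socket and push one frame straight to the driver, skipping
 * the qdisc layer. "eth0" and the all-0xff frame are illustrative only.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef PACKET_QDISC_BYPASS
#define PACKET_QDISC_BYPASS 20	/* value introduced by this patch */
#endif

int main(void)
{
	struct sockaddr_ll ll;
	unsigned char frame[ETH_ZLEN];
	int one = 1;
	int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Everything sent on this socket now bypasses the qdisc layer. */
	if (setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS,
		       &one, sizeof(one)) < 0)
		perror("setsockopt(PACKET_QDISC_BYPASS)");

	memset(&ll, 0, sizeof(ll));
	ll.sll_family  = AF_PACKET;
	ll.sll_ifindex = if_nametoindex("eth0");   /* assumed test interface */
	ll.sll_halen   = ETH_ALEN;
	memset(ll.sll_addr, 0xff, ETH_ALEN);       /* broadcast destination */

	memset(frame, 0xff, sizeof(frame));        /* dummy 60-byte frame */
	if (sendto(fd, frame, sizeof(frame), 0,
		   (struct sockaddr *)&ll, sizeof(ll)) < 0)
		perror("sendto");

	close(fd);
	return 0;
}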
diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index dbf06667394b..1e24aa701cbd 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -51,6 +51,7 @@ struct sockaddr_ll {
 #define PACKET_TIMESTAMP		17
 #define PACKET_FANOUT			18
 #define PACKET_TX_HAS_OFF		19
+#define PACKET_QDISC_BYPASS		20
 
 #define PACKET_FANOUT_HASH		0
 #define PACKET_FANOUT_LB		1
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index e4171dd98590..9d70f1349926 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -237,6 +237,48 @@ struct packet_skb_cb {
 static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
 static void __fanout_link(struct sock *sk, struct packet_sock *po);
 
+static int packet_direct_xmit(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	const struct net_device_ops *ops = dev->netdev_ops;
+	netdev_features_t features;
+	struct netdev_queue *txq;
+	u16 queue_map;
+	int ret;
+
+	if (unlikely(!netif_running(dev) ||
+		     !netif_carrier_ok(dev))) {
+		kfree_skb(skb);
+		return NET_XMIT_DROP;
+	}
+
+	features = netif_skb_features(skb);
+	if (skb_needs_linearize(skb, features) &&
+	    __skb_linearize(skb)) {
+		kfree_skb(skb);
+		return NET_XMIT_DROP;
+	}
+
+	queue_map = skb_get_queue_mapping(skb);
+	txq = netdev_get_tx_queue(dev, queue_map);
+
+	__netif_tx_lock_bh(txq);
+	if (unlikely(netif_xmit_frozen_or_stopped(txq))) {
+		ret = NETDEV_TX_BUSY;
+		kfree_skb(skb);
+		goto out;
+	}
+
+	ret = ops->ndo_start_xmit(skb, dev);
+	if (likely(dev_xmit_complete(ret)))
+		txq_trans_update(txq);
+	else
+		kfree_skb(skb);
+out:
+	__netif_tx_unlock_bh(txq);
+	return ret;
+}
+
 static struct net_device *packet_cached_dev_get(struct packet_sock *po)
 {
 	struct net_device *dev;
@@ -261,6 +303,16 @@ static void packet_cached_dev_reset(struct packet_sock *po)
 	RCU_INIT_POINTER(po->cached_dev, NULL);
 }
 
+static bool packet_use_direct_xmit(const struct packet_sock *po)
+{
+	return po->xmit == packet_direct_xmit;
+}
+
+static u16 packet_pick_tx_queue(struct net_device *dev)
+{
+	return (u16) smp_processor_id() % dev->real_num_tx_queues;
+}
+
 /* register_prot_hook must be invoked with the po->bind_lock held,
  * or from a context in which asynchronous accesses to the packet
  * socket is not possible (packet_create()).
@@ -1994,9 +2046,10 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 
 	skb_reserve(skb, hlen);
 	skb_reset_network_header(skb);
-	skb_probe_transport_header(skb, 0);
 
-	if (po->tp_tx_has_off) {
+	if (!packet_use_direct_xmit(po))
+		skb_probe_transport_header(skb, 0);
+	if (unlikely(po->tp_tx_has_off)) {
 		int off_min, off_max, off;
 		off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
 		off_max = po->tx_ring.frame_size - tp_len;
@@ -2166,12 +2219,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 			}
 		}
 
+		skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
 		skb->destructor = tpacket_destruct_skb;
 		__packet_set_status(po, ph, TP_STATUS_SENDING);
 		atomic_inc(&po->tx_ring.pending);
 
 		status = TP_STATUS_SEND_REQUEST;
-		err = dev_queue_xmit(skb);
+		err = po->xmit(skb);
 		if (unlikely(err > 0)) {
 			err = net_xmit_errno(err);
 			if (err && __packet_get_status(po, ph) ==
@@ -2230,8 +2284,7 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
 	return skb;
 }
 
-static int packet_snd(struct socket *sock,
-		      struct msghdr *msg, size_t len)
+static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 {
 	struct sock *sk = sock->sk;
 	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
@@ -2376,6 +2429,7 @@ static int packet_snd(struct socket *sock,
 	skb->dev = dev;
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
+	skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
 
 	if (po->has_vnet_hdr) {
 		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
@@ -2396,16 +2450,12 @@ static int packet_snd(struct socket *sock,
 		len += vnet_hdr_len;
 	}
 
-	skb_probe_transport_header(skb, reserve);
-
+	if (!packet_use_direct_xmit(po))
+		skb_probe_transport_header(skb, reserve);
 	if (unlikely(extra_len == 4))
 		skb->no_fcs = 1;
 
-	/*
-	 *	Now send it
-	 */
-
-	err = dev_queue_xmit(skb);
+	err = po->xmit(skb);
 	if (err > 0 && (err = net_xmit_errno(err)) != 0)
 		goto out_unlock;
 
@@ -2427,6 +2477,7 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
 {
 	struct sock *sk = sock->sk;
 	struct packet_sock *po = pkt_sk(sk);
+
 	if (po->tx_ring.pg_vec)
 		return tpacket_snd(po, msg);
 	else
@@ -2641,6 +2692,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	po = pkt_sk(sk);
 	sk->sk_family = PF_PACKET;
 	po->num = proto;
+	po->xmit = dev_queue_xmit;
 
 	packet_cached_dev_reset(po);
 
@@ -3220,6 +3272,18 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		po->tp_tx_has_off = !!val;
 		return 0;
 	}
+	case PACKET_QDISC_BYPASS:
+	{
+		int val;
+
+		if (optlen != sizeof(val))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+
+		po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
+		return 0;
+	}
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -3312,6 +3376,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 	case PACKET_TX_HAS_OFF:
 		val = po->tp_tx_has_off;
 		break;
+	case PACKET_QDISC_BYPASS:
+		val = packet_use_direct_xmit(po);
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
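The packet_getsockopt() hunk above also makes the setting readable from
userspace. A small sketch of querying it follows; the helper name
qdisc_bypass_enabled() is made up for illustration, and on kernels without
this patch getsockopt() simply fails with ENOPROTOOPT.

#include <linux/if_packet.h>
#include <sys/socket.h>

#ifndef PACKET_QDISC_BYPASS
#define PACKET_QDISC_BYPASS 20	/* value introduced by this patch */
#endif

/* Returns 1 if qdisc bypass is enabled on the PF_PACKET socket fd,
 * 0 if it is disabled, and -1 on error (errno set by getsockopt()). */
static int qdisc_bypass_enabled(int fd)
{
	int val = 0;
	socklen_t len = sizeof(val);

	if (getsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &val, &len) < 0)
		return -1;

	return val != 0;
}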
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 1035fa2d909c..0a87d7b36c9e 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -114,6 +114,7 @@ struct packet_sock {
 	unsigned int		tp_tx_has_off:1;
 	unsigned int		tp_tstamp;
 	struct net_device __rcu	*cached_dev;
+	int			(*xmit)(struct sk_buff *skb);
 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
 };
 