diff options
Diffstat (limited to 'net/packet/af_packet.c')
-rw-r--r-- | net/packet/af_packet.c | 299 |
1 files changed, 225 insertions, 74 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 88cfbc189558..6a2bb37506c5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c | |||
@@ -88,7 +88,7 @@ | |||
88 | #include <linux/virtio_net.h> | 88 | #include <linux/virtio_net.h> |
89 | #include <linux/errqueue.h> | 89 | #include <linux/errqueue.h> |
90 | #include <linux/net_tstamp.h> | 90 | #include <linux/net_tstamp.h> |
91 | #include <linux/reciprocal_div.h> | 91 | #include <linux/percpu.h> |
92 | #ifdef CONFIG_INET | 92 | #ifdef CONFIG_INET |
93 | #include <net/inet_common.h> | 93 | #include <net/inet_common.h> |
94 | #endif | 94 | #endif |
@@ -237,6 +237,48 @@ struct packet_skb_cb { | |||
237 | static void __fanout_unlink(struct sock *sk, struct packet_sock *po); | 237 | static void __fanout_unlink(struct sock *sk, struct packet_sock *po); |
238 | static void __fanout_link(struct sock *sk, struct packet_sock *po); | 238 | static void __fanout_link(struct sock *sk, struct packet_sock *po); |
239 | 239 | ||
240 | static int packet_direct_xmit(struct sk_buff *skb) | ||
241 | { | ||
242 | struct net_device *dev = skb->dev; | ||
243 | const struct net_device_ops *ops = dev->netdev_ops; | ||
244 | netdev_features_t features; | ||
245 | struct netdev_queue *txq; | ||
246 | u16 queue_map; | ||
247 | int ret; | ||
248 | |||
249 | if (unlikely(!netif_running(dev) || | ||
250 | !netif_carrier_ok(dev))) { | ||
251 | kfree_skb(skb); | ||
252 | return NET_XMIT_DROP; | ||
253 | } | ||
254 | |||
255 | features = netif_skb_features(skb); | ||
256 | if (skb_needs_linearize(skb, features) && | ||
257 | __skb_linearize(skb)) { | ||
258 | kfree_skb(skb); | ||
259 | return NET_XMIT_DROP; | ||
260 | } | ||
261 | |||
262 | queue_map = skb_get_queue_mapping(skb); | ||
263 | txq = netdev_get_tx_queue(dev, queue_map); | ||
264 | |||
265 | __netif_tx_lock_bh(txq); | ||
266 | if (unlikely(netif_xmit_frozen_or_stopped(txq))) { | ||
267 | ret = NETDEV_TX_BUSY; | ||
268 | kfree_skb(skb); | ||
269 | goto out; | ||
270 | } | ||
271 | |||
272 | ret = ops->ndo_start_xmit(skb, dev); | ||
273 | if (likely(dev_xmit_complete(ret))) | ||
274 | txq_trans_update(txq); | ||
275 | else | ||
276 | kfree_skb(skb); | ||
277 | out: | ||
278 | __netif_tx_unlock_bh(txq); | ||
279 | return ret; | ||
280 | } | ||
281 | |||
240 | static struct net_device *packet_cached_dev_get(struct packet_sock *po) | 282 | static struct net_device *packet_cached_dev_get(struct packet_sock *po) |
241 | { | 283 | { |
242 | struct net_device *dev; | 284 | struct net_device *dev; |
@@ -261,6 +303,16 @@ static void packet_cached_dev_reset(struct packet_sock *po) | |||
261 | RCU_INIT_POINTER(po->cached_dev, NULL); | 303 | RCU_INIT_POINTER(po->cached_dev, NULL); |
262 | } | 304 | } |
263 | 305 | ||
306 | static bool packet_use_direct_xmit(const struct packet_sock *po) | ||
307 | { | ||
308 | return po->xmit == packet_direct_xmit; | ||
309 | } | ||
310 | |||
311 | static u16 packet_pick_tx_queue(struct net_device *dev) | ||
312 | { | ||
313 | return (u16) raw_smp_processor_id() % dev->real_num_tx_queues; | ||
314 | } | ||
315 | |||
264 | /* register_prot_hook must be invoked with the po->bind_lock held, | 316 | /* register_prot_hook must be invoked with the po->bind_lock held, |
265 | * or from a context in which asynchronous accesses to the packet | 317 | * or from a context in which asynchronous accesses to the packet |
266 | * socket is not possible (packet_create()). | 318 | * socket is not possible (packet_create()). |
@@ -458,7 +510,8 @@ static void prb_shutdown_retire_blk_timer(struct packet_sock *po, | |||
458 | { | 510 | { |
459 | struct tpacket_kbdq_core *pkc; | 511 | struct tpacket_kbdq_core *pkc; |
460 | 512 | ||
461 | pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc; | 513 | pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) : |
514 | GET_PBDQC_FROM_RB(&po->rx_ring); | ||
462 | 515 | ||
463 | spin_lock_bh(&rb_queue->lock); | 516 | spin_lock_bh(&rb_queue->lock); |
464 | pkc->delete_blk_timer = 1; | 517 | pkc->delete_blk_timer = 1; |
@@ -484,7 +537,8 @@ static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring) | |||
484 | if (tx_ring) | 537 | if (tx_ring) |
485 | BUG(); | 538 | BUG(); |
486 | 539 | ||
487 | pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc; | 540 | pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) : |
541 | GET_PBDQC_FROM_RB(&po->rx_ring); | ||
488 | prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired); | 542 | prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired); |
489 | } | 543 | } |
490 | 544 | ||
@@ -542,7 +596,7 @@ static void init_prb_bdqc(struct packet_sock *po, | |||
542 | struct pgv *pg_vec, | 596 | struct pgv *pg_vec, |
543 | union tpacket_req_u *req_u, int tx_ring) | 597 | union tpacket_req_u *req_u, int tx_ring) |
544 | { | 598 | { |
545 | struct tpacket_kbdq_core *p1 = &rb->prb_bdqc; | 599 | struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb); |
546 | struct tpacket_block_desc *pbd; | 600 | struct tpacket_block_desc *pbd; |
547 | 601 | ||
548 | memset(p1, 0x0, sizeof(*p1)); | 602 | memset(p1, 0x0, sizeof(*p1)); |
@@ -606,7 +660,7 @@ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) | |||
606 | static void prb_retire_rx_blk_timer_expired(unsigned long data) | 660 | static void prb_retire_rx_blk_timer_expired(unsigned long data) |
607 | { | 661 | { |
608 | struct packet_sock *po = (struct packet_sock *)data; | 662 | struct packet_sock *po = (struct packet_sock *)data; |
609 | struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc; | 663 | struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring); |
610 | unsigned int frozen; | 664 | unsigned int frozen; |
611 | struct tpacket_block_desc *pbd; | 665 | struct tpacket_block_desc *pbd; |
612 | 666 | ||
@@ -909,7 +963,7 @@ static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) | |||
909 | static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, | 963 | static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, |
910 | struct tpacket3_hdr *ppd) | 964 | struct tpacket3_hdr *ppd) |
911 | { | 965 | { |
912 | ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb); | 966 | ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb); |
913 | } | 967 | } |
914 | 968 | ||
915 | static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, | 969 | static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, |
@@ -923,9 +977,11 @@ static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, | |||
923 | { | 977 | { |
924 | if (vlan_tx_tag_present(pkc->skb)) { | 978 | if (vlan_tx_tag_present(pkc->skb)) { |
925 | ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb); | 979 | ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb); |
926 | ppd->tp_status = TP_STATUS_VLAN_VALID; | 980 | ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto); |
981 | ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; | ||
927 | } else { | 982 | } else { |
928 | ppd->hv1.tp_vlan_tci = 0; | 983 | ppd->hv1.tp_vlan_tci = 0; |
984 | ppd->hv1.tp_vlan_tpid = 0; | ||
929 | ppd->tp_status = TP_STATUS_AVAILABLE; | 985 | ppd->tp_status = TP_STATUS_AVAILABLE; |
930 | } | 986 | } |
931 | } | 987 | } |
@@ -933,6 +989,7 @@ static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, | |||
933 | static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc, | 989 | static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc, |
934 | struct tpacket3_hdr *ppd) | 990 | struct tpacket3_hdr *ppd) |
935 | { | 991 | { |
992 | ppd->hv1.tp_padding = 0; | ||
936 | prb_fill_vlan_info(pkc, ppd); | 993 | prb_fill_vlan_info(pkc, ppd); |
937 | 994 | ||
938 | if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH) | 995 | if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH) |
@@ -1111,6 +1168,47 @@ static void packet_increment_head(struct packet_ring_buffer *buff) | |||
1111 | buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; | 1168 | buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; |
1112 | } | 1169 | } |
1113 | 1170 | ||
1171 | static void packet_inc_pending(struct packet_ring_buffer *rb) | ||
1172 | { | ||
1173 | this_cpu_inc(*rb->pending_refcnt); | ||
1174 | } | ||
1175 | |||
1176 | static void packet_dec_pending(struct packet_ring_buffer *rb) | ||
1177 | { | ||
1178 | this_cpu_dec(*rb->pending_refcnt); | ||
1179 | } | ||
1180 | |||
1181 | static unsigned int packet_read_pending(const struct packet_ring_buffer *rb) | ||
1182 | { | ||
1183 | unsigned int refcnt = 0; | ||
1184 | int cpu; | ||
1185 | |||
1186 | /* We don't use pending refcount in rx_ring. */ | ||
1187 | if (rb->pending_refcnt == NULL) | ||
1188 | return 0; | ||
1189 | |||
1190 | for_each_possible_cpu(cpu) | ||
1191 | refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu); | ||
1192 | |||
1193 | return refcnt; | ||
1194 | } | ||
1195 | |||
1196 | static int packet_alloc_pending(struct packet_sock *po) | ||
1197 | { | ||
1198 | po->rx_ring.pending_refcnt = NULL; | ||
1199 | |||
1200 | po->tx_ring.pending_refcnt = alloc_percpu(unsigned int); | ||
1201 | if (unlikely(po->tx_ring.pending_refcnt == NULL)) | ||
1202 | return -ENOBUFS; | ||
1203 | |||
1204 | return 0; | ||
1205 | } | ||
1206 | |||
1207 | static void packet_free_pending(struct packet_sock *po) | ||
1208 | { | ||
1209 | free_percpu(po->tx_ring.pending_refcnt); | ||
1210 | } | ||
1211 | |||
1114 | static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) | 1212 | static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) |
1115 | { | 1213 | { |
1116 | struct sock *sk = &po->sk; | 1214 | struct sock *sk = &po->sk; |
@@ -1163,7 +1261,7 @@ static unsigned int fanout_demux_hash(struct packet_fanout *f, | |||
1163 | struct sk_buff *skb, | 1261 | struct sk_buff *skb, |
1164 | unsigned int num) | 1262 | unsigned int num) |
1165 | { | 1263 | { |
1166 | return reciprocal_divide(skb->rxhash, num); | 1264 | return reciprocal_scale(skb->rxhash, num); |
1167 | } | 1265 | } |
1168 | 1266 | ||
1169 | static unsigned int fanout_demux_lb(struct packet_fanout *f, | 1267 | static unsigned int fanout_demux_lb(struct packet_fanout *f, |
@@ -1190,7 +1288,7 @@ static unsigned int fanout_demux_rnd(struct packet_fanout *f, | |||
1190 | struct sk_buff *skb, | 1288 | struct sk_buff *skb, |
1191 | unsigned int num) | 1289 | unsigned int num) |
1192 | { | 1290 | { |
1193 | return reciprocal_divide(prandom_u32(), num); | 1291 | return prandom_u32_max(num); |
1194 | } | 1292 | } |
1195 | 1293 | ||
1196 | static unsigned int fanout_demux_rollover(struct packet_fanout *f, | 1294 | static unsigned int fanout_demux_rollover(struct packet_fanout *f, |
@@ -1214,6 +1312,13 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, | |||
1214 | return idx; | 1312 | return idx; |
1215 | } | 1313 | } |
1216 | 1314 | ||
1315 | static unsigned int fanout_demux_qm(struct packet_fanout *f, | ||
1316 | struct sk_buff *skb, | ||
1317 | unsigned int num) | ||
1318 | { | ||
1319 | return skb_get_queue_mapping(skb) % num; | ||
1320 | } | ||
1321 | |||
1217 | static bool fanout_has_flag(struct packet_fanout *f, u16 flag) | 1322 | static bool fanout_has_flag(struct packet_fanout *f, u16 flag) |
1218 | { | 1323 | { |
1219 | return f->flags & (flag >> 8); | 1324 | return f->flags & (flag >> 8); |
@@ -1241,7 +1346,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, | |||
1241 | if (!skb) | 1346 | if (!skb) |
1242 | return 0; | 1347 | return 0; |
1243 | } | 1348 | } |
1244 | skb_get_rxhash(skb); | 1349 | skb_get_hash(skb); |
1245 | idx = fanout_demux_hash(f, skb, num); | 1350 | idx = fanout_demux_hash(f, skb, num); |
1246 | break; | 1351 | break; |
1247 | case PACKET_FANOUT_LB: | 1352 | case PACKET_FANOUT_LB: |
@@ -1253,6 +1358,9 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, | |||
1253 | case PACKET_FANOUT_RND: | 1358 | case PACKET_FANOUT_RND: |
1254 | idx = fanout_demux_rnd(f, skb, num); | 1359 | idx = fanout_demux_rnd(f, skb, num); |
1255 | break; | 1360 | break; |
1361 | case PACKET_FANOUT_QM: | ||
1362 | idx = fanout_demux_qm(f, skb, num); | ||
1363 | break; | ||
1256 | case PACKET_FANOUT_ROLLOVER: | 1364 | case PACKET_FANOUT_ROLLOVER: |
1257 | idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num); | 1365 | idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num); |
1258 | break; | 1366 | break; |
@@ -1299,9 +1407,9 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po) | |||
1299 | spin_unlock(&f->lock); | 1407 | spin_unlock(&f->lock); |
1300 | } | 1408 | } |
1301 | 1409 | ||
1302 | static bool match_fanout_group(struct packet_type *ptype, struct sock * sk) | 1410 | static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) |
1303 | { | 1411 | { |
1304 | if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout) | 1412 | if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout) |
1305 | return true; | 1413 | return true; |
1306 | 1414 | ||
1307 | return false; | 1415 | return false; |
@@ -1323,6 +1431,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) | |||
1323 | case PACKET_FANOUT_LB: | 1431 | case PACKET_FANOUT_LB: |
1324 | case PACKET_FANOUT_CPU: | 1432 | case PACKET_FANOUT_CPU: |
1325 | case PACKET_FANOUT_RND: | 1433 | case PACKET_FANOUT_RND: |
1434 | case PACKET_FANOUT_QM: | ||
1326 | break; | 1435 | break; |
1327 | default: | 1436 | default: |
1328 | return -EINVAL; | 1437 | return -EINVAL; |
@@ -1485,7 +1594,7 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, | |||
1485 | struct msghdr *msg, size_t len) | 1594 | struct msghdr *msg, size_t len) |
1486 | { | 1595 | { |
1487 | struct sock *sk = sock->sk; | 1596 | struct sock *sk = sock->sk; |
1488 | struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name; | 1597 | DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name); |
1489 | struct sk_buff *skb = NULL; | 1598 | struct sk_buff *skb = NULL; |
1490 | struct net_device *dev; | 1599 | struct net_device *dev; |
1491 | __be16 proto = 0; | 1600 | __be16 proto = 0; |
@@ -1758,6 +1867,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, | |||
1758 | struct timespec ts; | 1867 | struct timespec ts; |
1759 | __u32 ts_status; | 1868 | __u32 ts_status; |
1760 | 1869 | ||
1870 | /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. | ||
1871 | * We may add members to them until current aligned size without forcing | ||
1872 | * userspace to call getsockopt(..., PACKET_HDRLEN, ...). | ||
1873 | */ | ||
1874 | BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32); | ||
1875 | BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48); | ||
1876 | |||
1761 | if (skb->pkt_type == PACKET_LOOPBACK) | 1877 | if (skb->pkt_type == PACKET_LOOPBACK) |
1762 | goto drop; | 1878 | goto drop; |
1763 | 1879 | ||
@@ -1864,11 +1980,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, | |||
1864 | h.h2->tp_nsec = ts.tv_nsec; | 1980 | h.h2->tp_nsec = ts.tv_nsec; |
1865 | if (vlan_tx_tag_present(skb)) { | 1981 | if (vlan_tx_tag_present(skb)) { |
1866 | h.h2->tp_vlan_tci = vlan_tx_tag_get(skb); | 1982 | h.h2->tp_vlan_tci = vlan_tx_tag_get(skb); |
1867 | status |= TP_STATUS_VLAN_VALID; | 1983 | h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto); |
1984 | status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; | ||
1868 | } else { | 1985 | } else { |
1869 | h.h2->tp_vlan_tci = 0; | 1986 | h.h2->tp_vlan_tci = 0; |
1987 | h.h2->tp_vlan_tpid = 0; | ||
1870 | } | 1988 | } |
1871 | h.h2->tp_padding = 0; | 1989 | memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding)); |
1872 | hdrlen = sizeof(*h.h2); | 1990 | hdrlen = sizeof(*h.h2); |
1873 | break; | 1991 | break; |
1874 | case TPACKET_V3: | 1992 | case TPACKET_V3: |
@@ -1882,6 +2000,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, | |||
1882 | h.h3->tp_net = netoff; | 2000 | h.h3->tp_net = netoff; |
1883 | h.h3->tp_sec = ts.tv_sec; | 2001 | h.h3->tp_sec = ts.tv_sec; |
1884 | h.h3->tp_nsec = ts.tv_nsec; | 2002 | h.h3->tp_nsec = ts.tv_nsec; |
2003 | memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding)); | ||
1885 | hdrlen = sizeof(*h.h3); | 2004 | hdrlen = sizeof(*h.h3); |
1886 | break; | 2005 | break; |
1887 | default: | 2006 | default: |
@@ -1900,19 +2019,20 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, | |||
1900 | sll->sll_ifindex = dev->ifindex; | 2019 | sll->sll_ifindex = dev->ifindex; |
1901 | 2020 | ||
1902 | smp_mb(); | 2021 | smp_mb(); |
2022 | |||
1903 | #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 | 2023 | #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 |
1904 | { | 2024 | if (po->tp_version <= TPACKET_V2) { |
1905 | u8 *start, *end; | 2025 | u8 *start, *end; |
1906 | 2026 | ||
1907 | if (po->tp_version <= TPACKET_V2) { | 2027 | end = (u8 *) PAGE_ALIGN((unsigned long) h.raw + |
1908 | end = (u8 *)PAGE_ALIGN((unsigned long)h.raw | 2028 | macoff + snaplen); |
1909 | + macoff + snaplen); | 2029 | |
1910 | for (start = h.raw; start < end; start += PAGE_SIZE) | 2030 | for (start = h.raw; start < end; start += PAGE_SIZE) |
1911 | flush_dcache_page(pgv_to_page(start)); | 2031 | flush_dcache_page(pgv_to_page(start)); |
1912 | } | ||
1913 | smp_wmb(); | ||
1914 | } | 2032 | } |
2033 | smp_wmb(); | ||
1915 | #endif | 2034 | #endif |
2035 | |||
1916 | if (po->tp_version <= TPACKET_V2) | 2036 | if (po->tp_version <= TPACKET_V2) |
1917 | __packet_set_status(po, h.raw, status); | 2037 | __packet_set_status(po, h.raw, status); |
1918 | else | 2038 | else |
@@ -1941,14 +2061,13 @@ ring_is_full: | |||
1941 | static void tpacket_destruct_skb(struct sk_buff *skb) | 2061 | static void tpacket_destruct_skb(struct sk_buff *skb) |
1942 | { | 2062 | { |
1943 | struct packet_sock *po = pkt_sk(skb->sk); | 2063 | struct packet_sock *po = pkt_sk(skb->sk); |
1944 | void *ph; | ||
1945 | 2064 | ||
1946 | if (likely(po->tx_ring.pg_vec)) { | 2065 | if (likely(po->tx_ring.pg_vec)) { |
2066 | void *ph; | ||
1947 | __u32 ts; | 2067 | __u32 ts; |
1948 | 2068 | ||
1949 | ph = skb_shinfo(skb)->destructor_arg; | 2069 | ph = skb_shinfo(skb)->destructor_arg; |
1950 | BUG_ON(atomic_read(&po->tx_ring.pending) == 0); | 2070 | packet_dec_pending(&po->tx_ring); |
1951 | atomic_dec(&po->tx_ring.pending); | ||
1952 | 2071 | ||
1953 | ts = __packet_set_timestamp(po, ph, skb); | 2072 | ts = __packet_set_timestamp(po, ph, skb); |
1954 | __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); | 2073 | __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); |
@@ -1992,9 +2111,10 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, | |||
1992 | 2111 | ||
1993 | skb_reserve(skb, hlen); | 2112 | skb_reserve(skb, hlen); |
1994 | skb_reset_network_header(skb); | 2113 | skb_reset_network_header(skb); |
1995 | skb_probe_transport_header(skb, 0); | ||
1996 | 2114 | ||
1997 | if (po->tp_tx_has_off) { | 2115 | if (!packet_use_direct_xmit(po)) |
2116 | skb_probe_transport_header(skb, 0); | ||
2117 | if (unlikely(po->tp_tx_has_off)) { | ||
1998 | int off_min, off_max, off; | 2118 | int off_min, off_max, off; |
1999 | off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); | 2119 | off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); |
2000 | off_max = po->tx_ring.frame_size - tp_len; | 2120 | off_max = po->tx_ring.frame_size - tp_len; |
@@ -2087,7 +2207,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) | |||
2087 | __be16 proto; | 2207 | __be16 proto; |
2088 | int err, reserve = 0; | 2208 | int err, reserve = 0; |
2089 | void *ph; | 2209 | void *ph; |
2090 | struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name; | 2210 | DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); |
2211 | bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); | ||
2091 | int tp_len, size_max; | 2212 | int tp_len, size_max; |
2092 | unsigned char *addr; | 2213 | unsigned char *addr; |
2093 | int len_sum = 0; | 2214 | int len_sum = 0; |
@@ -2130,10 +2251,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) | |||
2130 | 2251 | ||
2131 | do { | 2252 | do { |
2132 | ph = packet_current_frame(po, &po->tx_ring, | 2253 | ph = packet_current_frame(po, &po->tx_ring, |
2133 | TP_STATUS_SEND_REQUEST); | 2254 | TP_STATUS_SEND_REQUEST); |
2134 | |||
2135 | if (unlikely(ph == NULL)) { | 2255 | if (unlikely(ph == NULL)) { |
2136 | schedule(); | 2256 | if (need_wait && need_resched()) |
2257 | schedule(); | ||
2137 | continue; | 2258 | continue; |
2138 | } | 2259 | } |
2139 | 2260 | ||
@@ -2164,12 +2285,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) | |||
2164 | } | 2285 | } |
2165 | } | 2286 | } |
2166 | 2287 | ||
2288 | skb_set_queue_mapping(skb, packet_pick_tx_queue(dev)); | ||
2167 | skb->destructor = tpacket_destruct_skb; | 2289 | skb->destructor = tpacket_destruct_skb; |
2168 | __packet_set_status(po, ph, TP_STATUS_SENDING); | 2290 | __packet_set_status(po, ph, TP_STATUS_SENDING); |
2169 | atomic_inc(&po->tx_ring.pending); | 2291 | packet_inc_pending(&po->tx_ring); |
2170 | 2292 | ||
2171 | status = TP_STATUS_SEND_REQUEST; | 2293 | status = TP_STATUS_SEND_REQUEST; |
2172 | err = dev_queue_xmit(skb); | 2294 | err = po->xmit(skb); |
2173 | if (unlikely(err > 0)) { | 2295 | if (unlikely(err > 0)) { |
2174 | err = net_xmit_errno(err); | 2296 | err = net_xmit_errno(err); |
2175 | if (err && __packet_get_status(po, ph) == | 2297 | if (err && __packet_get_status(po, ph) == |
@@ -2187,9 +2309,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) | |||
2187 | packet_increment_head(&po->tx_ring); | 2309 | packet_increment_head(&po->tx_ring); |
2188 | len_sum += tp_len; | 2310 | len_sum += tp_len; |
2189 | } while (likely((ph != NULL) || | 2311 | } while (likely((ph != NULL) || |
2190 | ((!(msg->msg_flags & MSG_DONTWAIT)) && | 2312 | /* Note: packet_read_pending() might be slow if we have |
2191 | (atomic_read(&po->tx_ring.pending)))) | 2313 | * to call it as it's per_cpu variable, but in fast-path |
2192 | ); | 2314 | * we already short-circuit the loop with the first |
2315 | * condition, and luckily don't have to go that path | ||
2316 | * anyway. | ||
2317 | */ | ||
2318 | (need_wait && packet_read_pending(&po->tx_ring)))); | ||
2193 | 2319 | ||
2194 | err = len_sum; | 2320 | err = len_sum; |
2195 | goto out_put; | 2321 | goto out_put; |
@@ -2228,11 +2354,10 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, | |||
2228 | return skb; | 2354 | return skb; |
2229 | } | 2355 | } |
2230 | 2356 | ||
2231 | static int packet_snd(struct socket *sock, | 2357 | static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) |
2232 | struct msghdr *msg, size_t len) | ||
2233 | { | 2358 | { |
2234 | struct sock *sk = sock->sk; | 2359 | struct sock *sk = sock->sk; |
2235 | struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name; | 2360 | DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); |
2236 | struct sk_buff *skb; | 2361 | struct sk_buff *skb; |
2237 | struct net_device *dev; | 2362 | struct net_device *dev; |
2238 | __be16 proto; | 2363 | __be16 proto; |
@@ -2374,6 +2499,7 @@ static int packet_snd(struct socket *sock, | |||
2374 | skb->dev = dev; | 2499 | skb->dev = dev; |
2375 | skb->priority = sk->sk_priority; | 2500 | skb->priority = sk->sk_priority; |
2376 | skb->mark = sk->sk_mark; | 2501 | skb->mark = sk->sk_mark; |
2502 | skb_set_queue_mapping(skb, packet_pick_tx_queue(dev)); | ||
2377 | 2503 | ||
2378 | if (po->has_vnet_hdr) { | 2504 | if (po->has_vnet_hdr) { |
2379 | if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { | 2505 | if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { |
@@ -2394,16 +2520,12 @@ static int packet_snd(struct socket *sock, | |||
2394 | len += vnet_hdr_len; | 2520 | len += vnet_hdr_len; |
2395 | } | 2521 | } |
2396 | 2522 | ||
2397 | skb_probe_transport_header(skb, reserve); | 2523 | if (!packet_use_direct_xmit(po)) |
2398 | 2524 | skb_probe_transport_header(skb, reserve); | |
2399 | if (unlikely(extra_len == 4)) | 2525 | if (unlikely(extra_len == 4)) |
2400 | skb->no_fcs = 1; | 2526 | skb->no_fcs = 1; |
2401 | 2527 | ||
2402 | /* | 2528 | err = po->xmit(skb); |
2403 | * Now send it | ||
2404 | */ | ||
2405 | |||
2406 | err = dev_queue_xmit(skb); | ||
2407 | if (err > 0 && (err = net_xmit_errno(err)) != 0) | 2529 | if (err > 0 && (err = net_xmit_errno(err)) != 0) |
2408 | goto out_unlock; | 2530 | goto out_unlock; |
2409 | 2531 | ||
@@ -2425,6 +2547,7 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, | |||
2425 | { | 2547 | { |
2426 | struct sock *sk = sock->sk; | 2548 | struct sock *sk = sock->sk; |
2427 | struct packet_sock *po = pkt_sk(sk); | 2549 | struct packet_sock *po = pkt_sk(sk); |
2550 | |||
2428 | if (po->tx_ring.pg_vec) | 2551 | if (po->tx_ring.pg_vec) |
2429 | return tpacket_snd(po, msg); | 2552 | return tpacket_snd(po, msg); |
2430 | else | 2553 | else |
@@ -2491,6 +2614,7 @@ static int packet_release(struct socket *sock) | |||
2491 | /* Purge queues */ | 2614 | /* Purge queues */ |
2492 | 2615 | ||
2493 | skb_queue_purge(&sk->sk_receive_queue); | 2616 | skb_queue_purge(&sk->sk_receive_queue); |
2617 | packet_free_pending(po); | ||
2494 | sk_refcnt_debug_release(sk); | 2618 | sk_refcnt_debug_release(sk); |
2495 | 2619 | ||
2496 | sock_put(sk); | 2620 | sock_put(sk); |
@@ -2501,9 +2625,12 @@ static int packet_release(struct socket *sock) | |||
2501 | * Attach a packet hook. | 2625 | * Attach a packet hook. |
2502 | */ | 2626 | */ |
2503 | 2627 | ||
2504 | static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol) | 2628 | static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto) |
2505 | { | 2629 | { |
2506 | struct packet_sock *po = pkt_sk(sk); | 2630 | struct packet_sock *po = pkt_sk(sk); |
2631 | const struct net_device *dev_curr; | ||
2632 | __be16 proto_curr; | ||
2633 | bool need_rehook; | ||
2507 | 2634 | ||
2508 | if (po->fanout) { | 2635 | if (po->fanout) { |
2509 | if (dev) | 2636 | if (dev) |
@@ -2513,21 +2640,29 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc | |||
2513 | } | 2640 | } |
2514 | 2641 | ||
2515 | lock_sock(sk); | 2642 | lock_sock(sk); |
2516 | |||
2517 | spin_lock(&po->bind_lock); | 2643 | spin_lock(&po->bind_lock); |
2518 | unregister_prot_hook(sk, true); | ||
2519 | 2644 | ||
2520 | po->num = protocol; | 2645 | proto_curr = po->prot_hook.type; |
2521 | po->prot_hook.type = protocol; | 2646 | dev_curr = po->prot_hook.dev; |
2522 | if (po->prot_hook.dev) | 2647 | |
2523 | dev_put(po->prot_hook.dev); | 2648 | need_rehook = proto_curr != proto || dev_curr != dev; |
2649 | |||
2650 | if (need_rehook) { | ||
2651 | unregister_prot_hook(sk, true); | ||
2652 | |||
2653 | po->num = proto; | ||
2654 | po->prot_hook.type = proto; | ||
2655 | |||
2656 | if (po->prot_hook.dev) | ||
2657 | dev_put(po->prot_hook.dev); | ||
2524 | 2658 | ||
2525 | po->prot_hook.dev = dev; | 2659 | po->prot_hook.dev = dev; |
2526 | po->ifindex = dev ? dev->ifindex : 0; | ||
2527 | 2660 | ||
2528 | packet_cached_dev_assign(po, dev); | 2661 | po->ifindex = dev ? dev->ifindex : 0; |
2662 | packet_cached_dev_assign(po, dev); | ||
2663 | } | ||
2529 | 2664 | ||
2530 | if (protocol == 0) | 2665 | if (proto == 0 || !need_rehook) |
2531 | goto out_unlock; | 2666 | goto out_unlock; |
2532 | 2667 | ||
2533 | if (!dev || (dev->flags & IFF_UP)) { | 2668 | if (!dev || (dev->flags & IFF_UP)) { |
@@ -2639,6 +2774,11 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, | |||
2639 | po = pkt_sk(sk); | 2774 | po = pkt_sk(sk); |
2640 | sk->sk_family = PF_PACKET; | 2775 | sk->sk_family = PF_PACKET; |
2641 | po->num = proto; | 2776 | po->num = proto; |
2777 | po->xmit = dev_queue_xmit; | ||
2778 | |||
2779 | err = packet_alloc_pending(po); | ||
2780 | if (err) | ||
2781 | goto out2; | ||
2642 | 2782 | ||
2643 | packet_cached_dev_reset(po); | 2783 | packet_cached_dev_reset(po); |
2644 | 2784 | ||
@@ -2672,6 +2812,8 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, | |||
2672 | preempt_enable(); | 2812 | preempt_enable(); |
2673 | 2813 | ||
2674 | return 0; | 2814 | return 0; |
2815 | out2: | ||
2816 | sk_free(sk); | ||
2675 | out: | 2817 | out: |
2676 | return err; | 2818 | return err; |
2677 | } | 2819 | } |
@@ -2791,6 +2933,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, | |||
2791 | * in, we fill it in now. | 2933 | * in, we fill it in now. |
2792 | */ | 2934 | */ |
2793 | if (sock->type == SOCK_PACKET) { | 2935 | if (sock->type == SOCK_PACKET) { |
2936 | __sockaddr_check_size(sizeof(struct sockaddr_pkt)); | ||
2794 | msg->msg_namelen = sizeof(struct sockaddr_pkt); | 2937 | msg->msg_namelen = sizeof(struct sockaddr_pkt); |
2795 | } else { | 2938 | } else { |
2796 | struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; | 2939 | struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; |
@@ -2813,11 +2956,12 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, | |||
2813 | aux.tp_net = skb_network_offset(skb); | 2956 | aux.tp_net = skb_network_offset(skb); |
2814 | if (vlan_tx_tag_present(skb)) { | 2957 | if (vlan_tx_tag_present(skb)) { |
2815 | aux.tp_vlan_tci = vlan_tx_tag_get(skb); | 2958 | aux.tp_vlan_tci = vlan_tx_tag_get(skb); |
2816 | aux.tp_status |= TP_STATUS_VLAN_VALID; | 2959 | aux.tp_vlan_tpid = ntohs(skb->vlan_proto); |
2960 | aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; | ||
2817 | } else { | 2961 | } else { |
2818 | aux.tp_vlan_tci = 0; | 2962 | aux.tp_vlan_tci = 0; |
2963 | aux.tp_vlan_tpid = 0; | ||
2819 | } | 2964 | } |
2820 | aux.tp_padding = 0; | ||
2821 | put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); | 2965 | put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); |
2822 | } | 2966 | } |
2823 | 2967 | ||
@@ -3218,6 +3362,18 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv | |||
3218 | po->tp_tx_has_off = !!val; | 3362 | po->tp_tx_has_off = !!val; |
3219 | return 0; | 3363 | return 0; |
3220 | } | 3364 | } |
3365 | case PACKET_QDISC_BYPASS: | ||
3366 | { | ||
3367 | int val; | ||
3368 | |||
3369 | if (optlen != sizeof(val)) | ||
3370 | return -EINVAL; | ||
3371 | if (copy_from_user(&val, optval, sizeof(val))) | ||
3372 | return -EFAULT; | ||
3373 | |||
3374 | po->xmit = val ? packet_direct_xmit : dev_queue_xmit; | ||
3375 | return 0; | ||
3376 | } | ||
3221 | default: | 3377 | default: |
3222 | return -ENOPROTOOPT; | 3378 | return -ENOPROTOOPT; |
3223 | } | 3379 | } |
@@ -3310,6 +3466,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, | |||
3310 | case PACKET_TX_HAS_OFF: | 3466 | case PACKET_TX_HAS_OFF: |
3311 | val = po->tp_tx_has_off; | 3467 | val = po->tp_tx_has_off; |
3312 | break; | 3468 | break; |
3469 | case PACKET_QDISC_BYPASS: | ||
3470 | val = packet_use_direct_xmit(po); | ||
3471 | break; | ||
3313 | default: | 3472 | default: |
3314 | return -ENOPROTOOPT; | 3473 | return -ENOPROTOOPT; |
3315 | } | 3474 | } |
@@ -3501,34 +3660,26 @@ static void free_pg_vec(struct pgv *pg_vec, unsigned int order, | |||
3501 | 3660 | ||
3502 | static char *alloc_one_pg_vec_page(unsigned long order) | 3661 | static char *alloc_one_pg_vec_page(unsigned long order) |
3503 | { | 3662 | { |
3504 | char *buffer = NULL; | 3663 | char *buffer; |
3505 | gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | | 3664 | gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | |
3506 | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; | 3665 | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; |
3507 | 3666 | ||
3508 | buffer = (char *) __get_free_pages(gfp_flags, order); | 3667 | buffer = (char *) __get_free_pages(gfp_flags, order); |
3509 | |||
3510 | if (buffer) | 3668 | if (buffer) |
3511 | return buffer; | 3669 | return buffer; |
3512 | 3670 | ||
3513 | /* | 3671 | /* __get_free_pages failed, fall back to vmalloc */ |
3514 | * __get_free_pages failed, fall back to vmalloc | ||
3515 | */ | ||
3516 | buffer = vzalloc((1 << order) * PAGE_SIZE); | 3672 | buffer = vzalloc((1 << order) * PAGE_SIZE); |
3517 | |||
3518 | if (buffer) | 3673 | if (buffer) |
3519 | return buffer; | 3674 | return buffer; |
3520 | 3675 | ||
3521 | /* | 3676 | /* vmalloc failed, lets dig into swap here */ |
3522 | * vmalloc failed, lets dig into swap here | ||
3523 | */ | ||
3524 | gfp_flags &= ~__GFP_NORETRY; | 3677 | gfp_flags &= ~__GFP_NORETRY; |
3525 | buffer = (char *)__get_free_pages(gfp_flags, order); | 3678 | buffer = (char *) __get_free_pages(gfp_flags, order); |
3526 | if (buffer) | 3679 | if (buffer) |
3527 | return buffer; | 3680 | return buffer; |
3528 | 3681 | ||
3529 | /* | 3682 | /* complete and utter failure */ |
3530 | * complete and utter failure | ||
3531 | */ | ||
3532 | return NULL; | 3683 | return NULL; |
3533 | } | 3684 | } |
3534 | 3685 | ||
@@ -3583,7 +3734,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, | |||
3583 | if (!closing) { | 3734 | if (!closing) { |
3584 | if (atomic_read(&po->mapped)) | 3735 | if (atomic_read(&po->mapped)) |
3585 | goto out; | 3736 | goto out; |
3586 | if (atomic_read(&rb->pending)) | 3737 | if (packet_read_pending(rb)) |
3587 | goto out; | 3738 | goto out; |
3588 | } | 3739 | } |
3589 | 3740 | ||