Diffstat (limited to 'net/packet/af_packet.c')
 net/packet/af_packet.c | 299 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 225 insertions(+), 74 deletions(-)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 88cfbc189558..6a2bb37506c5 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -88,7 +88,7 @@
 #include <linux/virtio_net.h>
 #include <linux/errqueue.h>
 #include <linux/net_tstamp.h>
-#include <linux/reciprocal_div.h>
+#include <linux/percpu.h>
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
 #endif
@@ -237,6 +237,48 @@ struct packet_skb_cb {
 static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
 static void __fanout_link(struct sock *sk, struct packet_sock *po);
 
+static int packet_direct_xmit(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	const struct net_device_ops *ops = dev->netdev_ops;
+	netdev_features_t features;
+	struct netdev_queue *txq;
+	u16 queue_map;
+	int ret;
+
+	if (unlikely(!netif_running(dev) ||
+		     !netif_carrier_ok(dev))) {
+		kfree_skb(skb);
+		return NET_XMIT_DROP;
+	}
+
+	features = netif_skb_features(skb);
+	if (skb_needs_linearize(skb, features) &&
+	    __skb_linearize(skb)) {
+		kfree_skb(skb);
+		return NET_XMIT_DROP;
+	}
+
+	queue_map = skb_get_queue_mapping(skb);
+	txq = netdev_get_tx_queue(dev, queue_map);
+
+	__netif_tx_lock_bh(txq);
+	if (unlikely(netif_xmit_frozen_or_stopped(txq))) {
+		ret = NETDEV_TX_BUSY;
+		kfree_skb(skb);
+		goto out;
+	}
+
+	ret = ops->ndo_start_xmit(skb, dev);
+	if (likely(dev_xmit_complete(ret)))
+		txq_trans_update(txq);
+	else
+		kfree_skb(skb);
+out:
+	__netif_tx_unlock_bh(txq);
+	return ret;
+}
+
 static struct net_device *packet_cached_dev_get(struct packet_sock *po)
 {
 	struct net_device *dev;
@@ -261,6 +303,16 @@ static void packet_cached_dev_reset(struct packet_sock *po)
 	RCU_INIT_POINTER(po->cached_dev, NULL);
 }
 
+static bool packet_use_direct_xmit(const struct packet_sock *po)
+{
+	return po->xmit == packet_direct_xmit;
+}
+
+static u16 packet_pick_tx_queue(struct net_device *dev)
+{
+	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
+}
+
 /* register_prot_hook must be invoked with the po->bind_lock held,
  * or from a context in which asynchronous accesses to the packet
  * socket is not possible (packet_create()).
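
For context: with qdisc bypass, packet_pick_tx_queue() above selects the hardware TX queue purely from the sending CPU, modulo the device's real queue count. A standalone illustration of the same mapping (not part of the patch):

	/* Illustration only: the CPU-to-queue mapping used by direct xmit.
	 * E.g. CPU 6 on a NIC with 4 real TX queues lands on queue 6 % 4 = 2. */
	static unsigned short pick_tx_queue(unsigned int cpu,
					    unsigned int real_num_tx_queues)
	{
		return (unsigned short)(cpu % real_num_tx_queues);
	}
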
@@ -458,7 +510,8 @@ static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
 {
 	struct tpacket_kbdq_core *pkc;
 
-	pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
+	pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
+			GET_PBDQC_FROM_RB(&po->rx_ring);
 
 	spin_lock_bh(&rb_queue->lock);
 	pkc->delete_blk_timer = 1;
@@ -484,7 +537,8 @@ static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
 	if (tx_ring)
 		BUG();
 
-	pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
+	pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
+			GET_PBDQC_FROM_RB(&po->rx_ring);
 	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
 }
 
@@ -542,7 +596,7 @@ static void init_prb_bdqc(struct packet_sock *po,
 			struct pgv *pg_vec,
 			union tpacket_req_u *req_u, int tx_ring)
 {
-	struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
+	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
 	struct tpacket_block_desc *pbd;
 
 	memset(p1, 0x0, sizeof(*p1));
@@ -606,7 +660,7 @@ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
 static void prb_retire_rx_blk_timer_expired(unsigned long data)
 {
 	struct packet_sock *po = (struct packet_sock *)data;
-	struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
+	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
 	unsigned int frozen;
 	struct tpacket_block_desc *pbd;
 
@@ -909,7 +963,7 @@ static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
 static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
 			struct tpacket3_hdr *ppd)
 {
-	ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
+	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
 }
 
 static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
@@ -923,9 +977,11 @@ static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
 {
 	if (vlan_tx_tag_present(pkc->skb)) {
 		ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
-		ppd->tp_status = TP_STATUS_VLAN_VALID;
+		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
+		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
 	} else {
 		ppd->hv1.tp_vlan_tci = 0;
+		ppd->hv1.tp_vlan_tpid = 0;
 		ppd->tp_status = TP_STATUS_AVAILABLE;
 	}
 }
@@ -933,6 +989,7 @@ static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
 static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
 			struct tpacket3_hdr *ppd)
 {
+	ppd->hv1.tp_padding = 0;
 	prb_fill_vlan_info(pkc, ppd);
 
 	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
@@ -1111,6 +1168,47 @@ static void packet_increment_head(struct packet_ring_buffer *buff)
 	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
 }
 
+static void packet_inc_pending(struct packet_ring_buffer *rb)
+{
+	this_cpu_inc(*rb->pending_refcnt);
+}
+
+static void packet_dec_pending(struct packet_ring_buffer *rb)
+{
+	this_cpu_dec(*rb->pending_refcnt);
+}
+
+static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
+{
+	unsigned int refcnt = 0;
+	int cpu;
+
+	/* We don't use pending refcount in rx_ring. */
+	if (rb->pending_refcnt == NULL)
+		return 0;
+
+	for_each_possible_cpu(cpu)
+		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
+
+	return refcnt;
+}
+
+static int packet_alloc_pending(struct packet_sock *po)
+{
+	po->rx_ring.pending_refcnt = NULL;
+
+	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
+	if (unlikely(po->tx_ring.pending_refcnt == NULL))
+		return -ENOBUFS;
+
+	return 0;
+}
+
+static void packet_free_pending(struct packet_sock *po)
+{
+	free_percpu(po->tx_ring.pending_refcnt);
+}
+
 static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
 {
 	struct sock *sk = &po->sk;
@@ -1163,7 +1261,7 @@ static unsigned int fanout_demux_hash(struct packet_fanout *f,
 				      struct sk_buff *skb,
 				      unsigned int num)
 {
-	return reciprocal_divide(skb->rxhash, num);
+	return reciprocal_scale(skb->rxhash, num);
 }
 
 static unsigned int fanout_demux_lb(struct packet_fanout *f,
@@ -1190,7 +1288,7 @@ static unsigned int fanout_demux_rnd(struct packet_fanout *f,
 				    struct sk_buff *skb,
 				    unsigned int num)
 {
-	return reciprocal_divide(prandom_u32(), num);
+	return prandom_u32_max(num);
 }
 
 static unsigned int fanout_demux_rollover(struct packet_fanout *f,
@@ -1214,6 +1312,13 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
 	return idx;
 }
 
+static unsigned int fanout_demux_qm(struct packet_fanout *f,
+				    struct sk_buff *skb,
+				    unsigned int num)
+{
+	return skb_get_queue_mapping(skb) % num;
+}
+
 static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
 {
 	return f->flags & (flag >> 8);
@@ -1241,7 +1346,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
 			if (!skb)
 				return 0;
 		}
-		skb_get_rxhash(skb);
+		skb_get_hash(skb);
 		idx = fanout_demux_hash(f, skb, num);
 		break;
 	case PACKET_FANOUT_LB:
@@ -1253,6 +1358,9 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
 	case PACKET_FANOUT_RND:
 		idx = fanout_demux_rnd(f, skb, num);
 		break;
+	case PACKET_FANOUT_QM:
+		idx = fanout_demux_qm(f, skb, num);
+		break;
 	case PACKET_FANOUT_ROLLOVER:
 		idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
 		break;
@@ -1299,9 +1407,9 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
 	spin_unlock(&f->lock);
 }
 
-static bool match_fanout_group(struct packet_type *ptype, struct sock * sk)
+static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
 {
-	if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout)
+	if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
 		return true;
 
 	return false;
@@ -1323,6 +1431,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 	case PACKET_FANOUT_LB:
 	case PACKET_FANOUT_CPU:
 	case PACKET_FANOUT_RND:
+	case PACKET_FANOUT_QM:
 		break;
 	default:
 		return -EINVAL;
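
For context, PACKET_FANOUT_QM spreads packets across group members by the skb's recorded queue mapping. A minimal userspace sketch of joining such a group (not part of this patch; it assumes a uapi linux/if_packet.h that exports PACKET_FANOUT_QM):

	#include <stdio.h>
	#include <sys/socket.h>
	#include <arpa/inet.h>
	#include <linux/if_ether.h>
	#include <linux/if_packet.h>

	int main(void)
	{
		int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
		/* low 16 bits carry the group id, high 16 bits the fanout mode */
		int fanout_arg = 42 | (PACKET_FANOUT_QM << 16);

		if (fd < 0 || setsockopt(fd, SOL_PACKET, PACKET_FANOUT,
					 &fanout_arg, sizeof(fanout_arg)) < 0) {
			perror("PACKET_FANOUT");
			return 1;
		}
		/* this socket now receives the share of traffic that
		 * fanout_demux_qm() maps onto it by queue index */
		return 0;
	}
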
@@ -1485,7 +1594,7 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
 			       struct msghdr *msg, size_t len)
 {
 	struct sock *sk = sock->sk;
-	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
+	DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
 	struct sk_buff *skb = NULL;
 	struct net_device *dev;
 	__be16 proto = 0;
@@ -1758,6 +1867,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	struct timespec ts;
 	__u32 ts_status;
 
+	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
+	 * We may add members to them until current aligned size without forcing
+	 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
+	 */
+	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
+	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
+
 	if (skb->pkt_type == PACKET_LOOPBACK)
 		goto drop;
 
@@ -1864,11 +1980,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		h.h2->tp_nsec = ts.tv_nsec;
 		if (vlan_tx_tag_present(skb)) {
 			h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
-			status |= TP_STATUS_VLAN_VALID;
+			h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
+			status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
 		} else {
 			h.h2->tp_vlan_tci = 0;
+			h.h2->tp_vlan_tpid = 0;
 		}
-		h.h2->tp_padding = 0;
+		memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
 		hdrlen = sizeof(*h.h2);
 		break;
 	case TPACKET_V3:
@@ -1882,6 +2000,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		h.h3->tp_net = netoff;
 		h.h3->tp_sec = ts.tv_sec;
 		h.h3->tp_nsec = ts.tv_nsec;
+		memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
 		hdrlen = sizeof(*h.h3);
 		break;
 	default:
@@ -1900,19 +2019,20 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	sll->sll_ifindex = dev->ifindex;
 
 	smp_mb();
+
 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
-	{
+	if (po->tp_version <= TPACKET_V2) {
 		u8 *start, *end;
 
-		if (po->tp_version <= TPACKET_V2) {
-			end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
-				+ macoff + snaplen);
+		end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
+					macoff + snaplen);
+
-			for (start = h.raw; start < end; start += PAGE_SIZE)
-				flush_dcache_page(pgv_to_page(start));
-		}
-		smp_wmb();
+		for (start = h.raw; start < end; start += PAGE_SIZE)
+			flush_dcache_page(pgv_to_page(start));
 	}
+	smp_wmb();
 #endif
+
 	if (po->tp_version <= TPACKET_V2)
 		__packet_set_status(po, h.raw, status);
 	else
@@ -1941,14 +2061,13 @@ ring_is_full:
 static void tpacket_destruct_skb(struct sk_buff *skb)
 {
 	struct packet_sock *po = pkt_sk(skb->sk);
-	void *ph;
 
 	if (likely(po->tx_ring.pg_vec)) {
+		void *ph;
 		__u32 ts;
 
 		ph = skb_shinfo(skb)->destructor_arg;
-		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
-		atomic_dec(&po->tx_ring.pending);
+		packet_dec_pending(&po->tx_ring);
 
 		ts = __packet_set_timestamp(po, ph, skb);
 		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
@@ -1992,9 +2111,10 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 
 	skb_reserve(skb, hlen);
 	skb_reset_network_header(skb);
-	skb_probe_transport_header(skb, 0);
 
-	if (po->tp_tx_has_off) {
+	if (!packet_use_direct_xmit(po))
+		skb_probe_transport_header(skb, 0);
+	if (unlikely(po->tp_tx_has_off)) {
 		int off_min, off_max, off;
 		off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
 		off_max = po->tx_ring.frame_size - tp_len;
@@ -2087,7 +2207,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	__be16 proto;
 	int err, reserve = 0;
 	void *ph;
-	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
+	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
+	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
 	int tp_len, size_max;
 	unsigned char *addr;
 	int len_sum = 0;
@@ -2130,10 +2251,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 
 	do {
 		ph = packet_current_frame(po, &po->tx_ring,
-				TP_STATUS_SEND_REQUEST);
-
+					  TP_STATUS_SEND_REQUEST);
 		if (unlikely(ph == NULL)) {
-			schedule();
+			if (need_wait && need_resched())
+				schedule();
 			continue;
 		}
 
@@ -2164,12 +2285,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 			}
 		}
 
+		skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
 		skb->destructor = tpacket_destruct_skb;
 		__packet_set_status(po, ph, TP_STATUS_SENDING);
-		atomic_inc(&po->tx_ring.pending);
+		packet_inc_pending(&po->tx_ring);
 
 		status = TP_STATUS_SEND_REQUEST;
-		err = dev_queue_xmit(skb);
+		err = po->xmit(skb);
 		if (unlikely(err > 0)) {
 			err = net_xmit_errno(err);
 			if (err && __packet_get_status(po, ph) ==
@@ -2187,9 +2309,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 		packet_increment_head(&po->tx_ring);
 		len_sum += tp_len;
 	} while (likely((ph != NULL) ||
-		((!(msg->msg_flags & MSG_DONTWAIT)) &&
-		 (atomic_read(&po->tx_ring.pending))))
-		);
+		/* Note: packet_read_pending() might be slow if we have
+		 * to call it as it's per_cpu variable, but in fast-path
+		 * we already short-circuit the loop with the first
+		 * condition, and luckily don't have to go that path
+		 * anyway.
+		 */
+		 (need_wait && packet_read_pending(&po->tx_ring))));
 
 	err = len_sum;
 	goto out_put;
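
For context, the loop above is driven from userspace by marking mmap()ed TX_RING frames and kicking the socket with a (possibly non-blocking) send. A minimal sketch of that side, assuming a TPACKET_V2 PACKET_TX_RING has already been set up and mapped, with error handling elided:

	#include <string.h>
	#include <sys/socket.h>
	#include <linux/if_packet.h>

	static void send_one_frame(int fd, void *frame,
				   const void *pkt, unsigned int pkt_len)
	{
		struct tpacket2_hdr *hdr = frame;	/* one TX_RING frame */

		/* packet data begins at the documented TPACKET_V2 TX offset */
		memcpy((char *)frame + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll),
		       pkt, pkt_len);
		hdr->tp_len = pkt_len;
		__sync_synchronize();		/* publish data before status */
		hdr->tp_status = TP_STATUS_SEND_REQUEST;

		/* with MSG_DONTWAIT, tpacket_snd() above neither schedule()s
		 * nor waits for pending frames to complete */
		send(fd, NULL, 0, MSG_DONTWAIT);
	}
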
@@ -2228,11 +2354,10 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
 	return skb;
 }
 
-static int packet_snd(struct socket *sock,
-		struct msghdr *msg, size_t len)
+static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 {
 	struct sock *sk = sock->sk;
-	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
+	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
 	struct sk_buff *skb;
 	struct net_device *dev;
 	__be16 proto;
@@ -2374,6 +2499,7 @@ static int packet_snd(struct socket *sock,
 	skb->dev = dev;
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
+	skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
 
 	if (po->has_vnet_hdr) {
 		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
@@ -2394,16 +2520,12 @@ static int packet_snd(struct socket *sock,
 		len += vnet_hdr_len;
 	}
 
-	skb_probe_transport_header(skb, reserve);
-
+	if (!packet_use_direct_xmit(po))
+		skb_probe_transport_header(skb, reserve);
 	if (unlikely(extra_len == 4))
 		skb->no_fcs = 1;
 
-	/*
-	 *	Now send it
-	 */
-
-	err = dev_queue_xmit(skb);
+	err = po->xmit(skb);
 	if (err > 0 && (err = net_xmit_errno(err)) != 0)
 		goto out_unlock;
 
@@ -2425,6 +2547,7 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
 {
 	struct sock *sk = sock->sk;
 	struct packet_sock *po = pkt_sk(sk);
+
 	if (po->tx_ring.pg_vec)
 		return tpacket_snd(po, msg);
 	else
@@ -2491,6 +2614,7 @@ static int packet_release(struct socket *sock)
 	/* Purge queues */
 
 	skb_queue_purge(&sk->sk_receive_queue);
+	packet_free_pending(po);
 	sk_refcnt_debug_release(sk);
 
 	sock_put(sk);
@@ -2501,9 +2625,12 @@ static int packet_release(struct socket *sock)
  *	Attach a packet hook.
  */
 
-static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
+static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
 {
 	struct packet_sock *po = pkt_sk(sk);
+	const struct net_device *dev_curr;
+	__be16 proto_curr;
+	bool need_rehook;
 
 	if (po->fanout) {
 		if (dev)
@@ -2513,21 +2640,29 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc
 	}
 
 	lock_sock(sk);
-
 	spin_lock(&po->bind_lock);
-	unregister_prot_hook(sk, true);
 
-	po->num = protocol;
-	po->prot_hook.type = protocol;
-	if (po->prot_hook.dev)
-		dev_put(po->prot_hook.dev);
+	proto_curr = po->prot_hook.type;
+	dev_curr = po->prot_hook.dev;
+
+	need_rehook = proto_curr != proto || dev_curr != dev;
+
+	if (need_rehook) {
+		unregister_prot_hook(sk, true);
+
+		po->num = proto;
+		po->prot_hook.type = proto;
+
+		if (po->prot_hook.dev)
+			dev_put(po->prot_hook.dev);
 
-	po->prot_hook.dev = dev;
-	po->ifindex = dev ? dev->ifindex : 0;
+		po->prot_hook.dev = dev;
 
-	packet_cached_dev_assign(po, dev);
+		po->ifindex = dev ? dev->ifindex : 0;
+		packet_cached_dev_assign(po, dev);
+	}
 
-	if (protocol == 0)
+	if (proto == 0 || !need_rehook)
 		goto out_unlock;
 
 	if (!dev || (dev->flags & IFF_UP)) {
@@ -2639,6 +2774,11 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	po = pkt_sk(sk);
 	sk->sk_family = PF_PACKET;
 	po->num = proto;
+	po->xmit = dev_queue_xmit;
+
+	err = packet_alloc_pending(po);
+	if (err)
+		goto out2;
 
 	packet_cached_dev_reset(po);
 
@@ -2672,6 +2812,8 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	preempt_enable();
 
 	return 0;
+out2:
+	sk_free(sk);
 out:
 	return err;
 }
@@ -2791,6 +2933,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 	 * in, we fill it in now.
 	 */
 	if (sock->type == SOCK_PACKET) {
+		__sockaddr_check_size(sizeof(struct sockaddr_pkt));
 		msg->msg_namelen = sizeof(struct sockaddr_pkt);
 	} else {
 		struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
@@ -2813,11 +2956,12 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 		aux.tp_net = skb_network_offset(skb);
 		if (vlan_tx_tag_present(skb)) {
 			aux.tp_vlan_tci = vlan_tx_tag_get(skb);
-			aux.tp_status |= TP_STATUS_VLAN_VALID;
+			aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
+			aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
 		} else {
 			aux.tp_vlan_tci = 0;
+			aux.tp_vlan_tpid = 0;
 		}
-		aux.tp_padding = 0;
 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
 	}
 
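
For context, a receiver sees the new TPID through the PACKET_AUXDATA ancillary message; TP_STATUS_VLAN_TPID_VALID distinguishes a real TPID from the zero that older kernels leave in the (former) padding field. A minimal consumer sketch (not part of this patch), assuming PACKET_AUXDATA was enabled via setsockopt() and recvmsg() was given control-message space:

	#include <stdio.h>
	#include <sys/socket.h>
	#include <linux/if_packet.h>

	static void print_vlan_info(struct msghdr *msg)
	{
		struct cmsghdr *cmsg;

		for (cmsg = CMSG_FIRSTHDR(msg); cmsg;
		     cmsg = CMSG_NXTHDR(msg, cmsg)) {
			struct tpacket_auxdata *aux;

			if (cmsg->cmsg_level != SOL_PACKET ||
			    cmsg->cmsg_type != PACKET_AUXDATA)
				continue;

			aux = (struct tpacket_auxdata *)CMSG_DATA(cmsg);
			if (aux->tp_status & TP_STATUS_VLAN_VALID)
				printf("vlan tci: 0x%04x\n", aux->tp_vlan_tci);
			if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID)
				printf("vlan tpid: 0x%04x\n", aux->tp_vlan_tpid);
		}
	}
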
@@ -3218,6 +3362,18 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		po->tp_tx_has_off = !!val;
 		return 0;
 	}
+	case PACKET_QDISC_BYPASS:
+	{
+		int val;
+
+		if (optlen != sizeof(val))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+
+		po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
+		return 0;
+	}
 	default:
 		return -ENOPROTOOPT;
 	}
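
For context, enabling the new option swaps po->xmit from dev_queue_xmit() to packet_direct_xmit(), so frames skip the qdisc layer and go straight to the driver. A minimal sketch of flipping it from userspace (not part of this patch; it assumes a uapi linux/if_packet.h that exports PACKET_QDISC_BYPASS):

	#include <stdio.h>
	#include <sys/socket.h>
	#include <arpa/inet.h>
	#include <linux/if_ether.h>
	#include <linux/if_packet.h>

	int main(void)
	{
		int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
		int one = 1, val = 0;
		socklen_t len = sizeof(val);

		if (fd < 0 || setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS,
					 &one, sizeof(one)) < 0) {
			perror("PACKET_QDISC_BYPASS");
			return 1;
		}
		/* the readback mirrors packet_use_direct_xmit() */
		getsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &val, &len);
		printf("qdisc bypass enabled: %d\n", val);
		return 0;
	}
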
@@ -3310,6 +3466,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 	case PACKET_TX_HAS_OFF:
 		val = po->tp_tx_has_off;
 		break;
+	case PACKET_QDISC_BYPASS:
+		val = packet_use_direct_xmit(po);
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -3501,34 +3660,26 @@ static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
 
 static char *alloc_one_pg_vec_page(unsigned long order)
 {
-	char *buffer = NULL;
+	char *buffer;
 	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
 			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
 
 	buffer = (char *) __get_free_pages(gfp_flags, order);
-
 	if (buffer)
 		return buffer;
 
-	/*
-	 * __get_free_pages failed, fall back to vmalloc
-	 */
+	/* __get_free_pages failed, fall back to vmalloc */
 	buffer = vzalloc((1 << order) * PAGE_SIZE);
-
 	if (buffer)
 		return buffer;
 
-	/*
-	 * vmalloc failed, lets dig into swap here
-	 */
+	/* vmalloc failed, lets dig into swap here */
 	gfp_flags &= ~__GFP_NORETRY;
-	buffer = (char *)__get_free_pages(gfp_flags, order);
+	buffer = (char *) __get_free_pages(gfp_flags, order);
 	if (buffer)
 		return buffer;
 
-	/*
-	 * complete and utter failure
-	 */
+	/* complete and utter failure */
 	return NULL;
 }
 
@@ -3583,7 +3734,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 	if (!closing) {
 		if (atomic_read(&po->mapped))
 			goto out;
-		if (atomic_read(&rb->pending))
+		if (packet_read_pending(rb))
 			goto out;
 	}
 