diff options
author | Patrick McHardy <kaber@trash.net> | 2008-07-15 01:50:15 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2008-07-15 01:50:15 -0400 |
commit | bbd6ef87c544d88c30e4b762b1b61ef267a7d279 (patch) | |
tree | dd9fdfbde65332d3212290d1b8783666475bd861 /net | |
parent | bc1d0411b804ad190cdadabac48a10067f17b9e6 (diff) |
packet: support extensible, 64-bit clean mmapped ring structure
struct tpacket_hdr is not 64-bit clean because it uses an unsigned long,
and it can't be extended because the struct sockaddr_ll that follows it
needs to be at a fixed offset.
Add support for a version 2 tpacket protocol that removes these
limitations.
Userspace can query the header size through a new getsockopt option
(PACKET_HDRLEN) and change the protocol version through a new setsockopt
option (PACKET_VERSION). The changes needed to switch to the new protocol version are:
1. replace struct tpacket_hdr by struct tpacket2_hdr
2. query the header length (PACKET_HDRLEN) and save it
3. set the protocol version to TPACKET_V2 (PACKET_VERSION)
- set up ring as usual
4. for getting the sockaddr_ll, use (void *)hdr + TPACKET_ALIGN(hdrlen)
instead of (void *)hdr + TPACKET_ALIGN(sizeof(struct tpacket_hdr))
Steps 2 and 4 can be omitted if the struct sockaddr_ll isn't needed.
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r-- | net/packet/af_packet.c | 179 |
1 files changed, 146 insertions, 33 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9f2269166687..4f059775d48f 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c | |||
@@ -186,6 +186,8 @@ struct packet_sock { | |||
186 | unsigned int pg_vec_order; | 186 | unsigned int pg_vec_order; |
187 | unsigned int pg_vec_pages; | 187 | unsigned int pg_vec_pages; |
188 | unsigned int pg_vec_len; | 188 | unsigned int pg_vec_len; |
189 | enum tpacket_versions tp_version; | ||
190 | unsigned int tp_hdrlen; | ||
189 | #endif | 191 | #endif |
190 | }; | 192 | }; |
191 | 193 | ||
@@ -201,14 +203,52 @@ struct packet_skb_cb { | |||
201 | 203 | ||
202 | #ifdef CONFIG_PACKET_MMAP | 204 | #ifdef CONFIG_PACKET_MMAP |
203 | 205 | ||
204 | static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position) | 206 | static void *packet_lookup_frame(struct packet_sock *po, unsigned int position, |
207 | int status) | ||
205 | { | 208 | { |
206 | unsigned int pg_vec_pos, frame_offset; | 209 | unsigned int pg_vec_pos, frame_offset; |
210 | union { | ||
211 | struct tpacket_hdr *h1; | ||
212 | struct tpacket2_hdr *h2; | ||
213 | void *raw; | ||
214 | } h; | ||
207 | 215 | ||
208 | pg_vec_pos = position / po->frames_per_block; | 216 | pg_vec_pos = position / po->frames_per_block; |
209 | frame_offset = position % po->frames_per_block; | 217 | frame_offset = position % po->frames_per_block; |
210 | 218 | ||
211 | return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size)); | 219 | h.raw = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size); |
220 | switch (po->tp_version) { | ||
221 | case TPACKET_V1: | ||
222 | if (status != h.h1->tp_status ? TP_STATUS_USER : | ||
223 | TP_STATUS_KERNEL) | ||
224 | return NULL; | ||
225 | break; | ||
226 | case TPACKET_V2: | ||
227 | if (status != h.h2->tp_status ? TP_STATUS_USER : | ||
228 | TP_STATUS_KERNEL) | ||
229 | return NULL; | ||
230 | break; | ||
231 | } | ||
232 | return h.raw; | ||
233 | } | ||
234 | |||
235 | static void __packet_set_status(struct packet_sock *po, void *frame, int status) | ||
236 | { | ||
237 | union { | ||
238 | struct tpacket_hdr *h1; | ||
239 | struct tpacket2_hdr *h2; | ||
240 | void *raw; | ||
241 | } h; | ||
242 | |||
243 | h.raw = frame; | ||
244 | switch (po->tp_version) { | ||
245 | case TPACKET_V1: | ||
246 | h.h1->tp_status = status; | ||
247 | break; | ||
248 | case TPACKET_V2: | ||
249 | h.h2->tp_status = status; | ||
250 | break; | ||
251 | } | ||
212 | } | 252 | } |
213 | #endif | 253 | #endif |
214 | 254 | ||
@@ -551,14 +591,19 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe | |||
551 | struct sock *sk; | 591 | struct sock *sk; |
552 | struct packet_sock *po; | 592 | struct packet_sock *po; |
553 | struct sockaddr_ll *sll; | 593 | struct sockaddr_ll *sll; |
554 | struct tpacket_hdr *h; | 594 | union { |
595 | struct tpacket_hdr *h1; | ||
596 | struct tpacket2_hdr *h2; | ||
597 | void *raw; | ||
598 | } h; | ||
555 | u8 * skb_head = skb->data; | 599 | u8 * skb_head = skb->data; |
556 | int skb_len = skb->len; | 600 | int skb_len = skb->len; |
557 | unsigned int snaplen, res; | 601 | unsigned int snaplen, res; |
558 | unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER; | 602 | unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER; |
559 | unsigned short macoff, netoff; | 603 | unsigned short macoff, netoff, hdrlen; |
560 | struct sk_buff *copy_skb = NULL; | 604 | struct sk_buff *copy_skb = NULL; |
561 | struct timeval tv; | 605 | struct timeval tv; |
606 | struct timespec ts; | ||
562 | 607 | ||
563 | if (skb->pkt_type == PACKET_LOOPBACK) | 608 | if (skb->pkt_type == PACKET_LOOPBACK) |
564 | goto drop; | 609 | goto drop; |
@@ -590,10 +635,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe | |||
590 | snaplen = res; | 635 | snaplen = res; |
591 | 636 | ||
592 | if (sk->sk_type == SOCK_DGRAM) { | 637 | if (sk->sk_type == SOCK_DGRAM) { |
593 | macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16; | 638 | macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16; |
594 | } else { | 639 | } else { |
595 | unsigned maclen = skb_network_offset(skb); | 640 | unsigned maclen = skb_network_offset(skb); |
596 | netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen)); | 641 | netoff = TPACKET_ALIGN(po->tp_hdrlen + |
642 | (maclen < 16 ? 16 : maclen)); | ||
597 | macoff = netoff - maclen; | 643 | macoff = netoff - maclen; |
598 | } | 644 | } |
599 | 645 | ||
@@ -616,9 +662,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe | |||
616 | } | 662 | } |
617 | 663 | ||
618 | spin_lock(&sk->sk_receive_queue.lock); | 664 | spin_lock(&sk->sk_receive_queue.lock); |
619 | h = packet_lookup_frame(po, po->head); | 665 | h.raw = packet_lookup_frame(po, po->head, TP_STATUS_KERNEL); |
620 | 666 | if (!h.raw) | |
621 | if (h->tp_status) | ||
622 | goto ring_is_full; | 667 | goto ring_is_full; |
623 | po->head = po->head != po->frame_max ? po->head+1 : 0; | 668 | po->head = po->head != po->frame_max ? po->head+1 : 0; |
624 | po->stats.tp_packets++; | 669 | po->stats.tp_packets++; |
@@ -630,20 +675,40 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe | |||
630 | status &= ~TP_STATUS_LOSING; | 675 | status &= ~TP_STATUS_LOSING; |
631 | spin_unlock(&sk->sk_receive_queue.lock); | 676 | spin_unlock(&sk->sk_receive_queue.lock); |
632 | 677 | ||
633 | skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen); | 678 | skb_copy_bits(skb, 0, h.raw + macoff, snaplen); |
634 | 679 | ||
635 | h->tp_len = skb->len; | 680 | switch (po->tp_version) { |
636 | h->tp_snaplen = snaplen; | 681 | case TPACKET_V1: |
637 | h->tp_mac = macoff; | 682 | h.h1->tp_len = skb->len; |
638 | h->tp_net = netoff; | 683 | h.h1->tp_snaplen = snaplen; |
639 | if (skb->tstamp.tv64) | 684 | h.h1->tp_mac = macoff; |
640 | tv = ktime_to_timeval(skb->tstamp); | 685 | h.h1->tp_net = netoff; |
641 | else | 686 | if (skb->tstamp.tv64) |
642 | do_gettimeofday(&tv); | 687 | tv = ktime_to_timeval(skb->tstamp); |
643 | h->tp_sec = tv.tv_sec; | 688 | else |
644 | h->tp_usec = tv.tv_usec; | 689 | do_gettimeofday(&tv); |
690 | h.h1->tp_sec = tv.tv_sec; | ||
691 | h.h1->tp_usec = tv.tv_usec; | ||
692 | hdrlen = sizeof(*h.h1); | ||
693 | break; | ||
694 | case TPACKET_V2: | ||
695 | h.h2->tp_len = skb->len; | ||
696 | h.h2->tp_snaplen = snaplen; | ||
697 | h.h2->tp_mac = macoff; | ||
698 | h.h2->tp_net = netoff; | ||
699 | if (skb->tstamp.tv64) | ||
700 | ts = ktime_to_timespec(skb->tstamp); | ||
701 | else | ||
702 | getnstimeofday(&ts); | ||
703 | h.h2->tp_sec = ts.tv_sec; | ||
704 | h.h2->tp_nsec = ts.tv_nsec; | ||
705 | hdrlen = sizeof(*h.h2); | ||
706 | break; | ||
707 | default: | ||
708 | BUG(); | ||
709 | } | ||
645 | 710 | ||
646 | sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h))); | 711 | sll = h.raw + TPACKET_ALIGN(hdrlen); |
647 | sll->sll_halen = dev_parse_header(skb, sll->sll_addr); | 712 | sll->sll_halen = dev_parse_header(skb, sll->sll_addr); |
648 | sll->sll_family = AF_PACKET; | 713 | sll->sll_family = AF_PACKET; |
649 | sll->sll_hatype = dev->type; | 714 | sll->sll_hatype = dev->type; |
@@ -654,14 +719,14 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe | |||
654 | else | 719 | else |
655 | sll->sll_ifindex = dev->ifindex; | 720 | sll->sll_ifindex = dev->ifindex; |
656 | 721 | ||
657 | h->tp_status = status; | 722 | __packet_set_status(po, h.raw, status); |
658 | smp_mb(); | 723 | smp_mb(); |
659 | 724 | ||
660 | { | 725 | { |
661 | struct page *p_start, *p_end; | 726 | struct page *p_start, *p_end; |
662 | u8 *h_end = (u8 *)h + macoff + snaplen - 1; | 727 | u8 *h_end = h.raw + macoff + snaplen - 1; |
663 | 728 | ||
664 | p_start = virt_to_page(h); | 729 | p_start = virt_to_page(h.raw); |
665 | p_end = virt_to_page(h_end); | 730 | p_end = virt_to_page(h_end); |
666 | while (p_start <= p_end) { | 731 | while (p_start <= p_end) { |
667 | flush_dcache_page(p_start); | 732 | flush_dcache_page(p_start); |
@@ -1362,6 +1427,25 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv | |||
1362 | pkt_sk(sk)->copy_thresh = val; | 1427 | pkt_sk(sk)->copy_thresh = val; |
1363 | return 0; | 1428 | return 0; |
1364 | } | 1429 | } |
1430 | case PACKET_VERSION: | ||
1431 | { | ||
1432 | int val; | ||
1433 | |||
1434 | if (optlen != sizeof(val)) | ||
1435 | return -EINVAL; | ||
1436 | if (po->pg_vec) | ||
1437 | return -EBUSY; | ||
1438 | if (copy_from_user(&val, optval, sizeof(val))) | ||
1439 | return -EFAULT; | ||
1440 | switch (val) { | ||
1441 | case TPACKET_V1: | ||
1442 | case TPACKET_V2: | ||
1443 | po->tp_version = val; | ||
1444 | return 0; | ||
1445 | default: | ||
1446 | return -EINVAL; | ||
1447 | } | ||
1448 | } | ||
1365 | #endif | 1449 | #endif |
1366 | case PACKET_AUXDATA: | 1450 | case PACKET_AUXDATA: |
1367 | { | 1451 | { |
@@ -1437,6 +1521,31 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, | |||
1437 | 1521 | ||
1438 | data = &val; | 1522 | data = &val; |
1439 | break; | 1523 | break; |
1524 | #ifdef CONFIG_PACKET_MMAP | ||
1525 | case PACKET_VERSION: | ||
1526 | if (len > sizeof(int)) | ||
1527 | len = sizeof(int); | ||
1528 | val = po->tp_version; | ||
1529 | data = &val; | ||
1530 | break; | ||
1531 | case PACKET_HDRLEN: | ||
1532 | if (len > sizeof(int)) | ||
1533 | len = sizeof(int); | ||
1534 | if (copy_from_user(&val, optval, len)) | ||
1535 | return -EFAULT; | ||
1536 | switch (val) { | ||
1537 | case TPACKET_V1: | ||
1538 | val = sizeof(struct tpacket_hdr); | ||
1539 | break; | ||
1540 | case TPACKET_V2: | ||
1541 | val = sizeof(struct tpacket2_hdr); | ||
1542 | break; | ||
1543 | default: | ||
1544 | return -EINVAL; | ||
1545 | } | ||
1546 | data = &val; | ||
1547 | break; | ||
1548 | #endif | ||
1440 | default: | 1549 | default: |
1441 | return -ENOPROTOOPT; | 1550 | return -ENOPROTOOPT; |
1442 | } | 1551 | } |
@@ -1570,11 +1679,8 @@ static unsigned int packet_poll(struct file * file, struct socket *sock, | |||
1570 | spin_lock_bh(&sk->sk_receive_queue.lock); | 1679 | spin_lock_bh(&sk->sk_receive_queue.lock); |
1571 | if (po->pg_vec) { | 1680 | if (po->pg_vec) { |
1572 | unsigned last = po->head ? po->head-1 : po->frame_max; | 1681 | unsigned last = po->head ? po->head-1 : po->frame_max; |
1573 | struct tpacket_hdr *h; | ||
1574 | |||
1575 | h = packet_lookup_frame(po, last); | ||
1576 | 1682 | ||
1577 | if (h->tp_status) | 1683 | if (packet_lookup_frame(po, last, TP_STATUS_USER)) |
1578 | mask |= POLLIN | POLLRDNORM; | 1684 | mask |= POLLIN | POLLRDNORM; |
1579 | } | 1685 | } |
1580 | spin_unlock_bh(&sk->sk_receive_queue.lock); | 1686 | spin_unlock_bh(&sk->sk_receive_queue.lock); |
@@ -1669,11 +1775,20 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing | |||
1669 | if (unlikely(po->pg_vec)) | 1775 | if (unlikely(po->pg_vec)) |
1670 | return -EBUSY; | 1776 | return -EBUSY; |
1671 | 1777 | ||
1778 | switch (po->tp_version) { | ||
1779 | case TPACKET_V1: | ||
1780 | po->tp_hdrlen = TPACKET_HDRLEN; | ||
1781 | break; | ||
1782 | case TPACKET_V2: | ||
1783 | po->tp_hdrlen = TPACKET2_HDRLEN; | ||
1784 | break; | ||
1785 | } | ||
1786 | |||
1672 | if (unlikely((int)req->tp_block_size <= 0)) | 1787 | if (unlikely((int)req->tp_block_size <= 0)) |
1673 | return -EINVAL; | 1788 | return -EINVAL; |
1674 | if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) | 1789 | if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) |
1675 | return -EINVAL; | 1790 | return -EINVAL; |
1676 | if (unlikely(req->tp_frame_size < TPACKET_HDRLEN)) | 1791 | if (unlikely(req->tp_frame_size < po->tp_hdrlen)) |
1677 | return -EINVAL; | 1792 | return -EINVAL; |
1678 | if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) | 1793 | if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) |
1679 | return -EINVAL; | 1794 | return -EINVAL; |
@@ -1692,13 +1807,11 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing | |||
1692 | goto out; | 1807 | goto out; |
1693 | 1808 | ||
1694 | for (i = 0; i < req->tp_block_nr; i++) { | 1809 | for (i = 0; i < req->tp_block_nr; i++) { |
1695 | char *ptr = pg_vec[i]; | 1810 | void *ptr = pg_vec[i]; |
1696 | struct tpacket_hdr *header; | ||
1697 | int k; | 1811 | int k; |
1698 | 1812 | ||
1699 | for (k = 0; k < po->frames_per_block; k++) { | 1813 | for (k = 0; k < po->frames_per_block; k++) { |
1700 | header = (struct tpacket_hdr *) ptr; | 1814 | __packet_set_status(po, ptr, TP_STATUS_KERNEL); |
1701 | header->tp_status = TP_STATUS_KERNEL; | ||
1702 | ptr += req->tp_frame_size; | 1815 | ptr += req->tp_frame_size; |
1703 | } | 1816 | } |
1704 | } | 1817 | } |