diff options
| author | Patrick McHardy <kaber@trash.net> | 2008-07-15 01:50:15 -0400 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2008-07-15 01:50:15 -0400 |
| commit | bbd6ef87c544d88c30e4b762b1b61ef267a7d279 (patch) | |
| tree | dd9fdfbde65332d3212290d1b8783666475bd861 | |
| parent | bc1d0411b804ad190cdadabac48a10067f17b9e6 (diff) | |
packet: support extensible, 64-bit clean mmapped ring structure
struct tpacket_hdr is not 64-bit clean because it uses an unsigned long,
and it can't be extended because the struct sockaddr_ll that follows it
needs to be at a fixed offset.
Add support for a version 2 tpacket protocol that removes these
limitations.
Userspace can query the header size through a new getsockopt option
(PACKET_HDRLEN) and change the protocol version through a new setsockopt
option (PACKET_VERSION). The changes needed to switch to the new protocol
version are:
1. replace struct tpacket_hdr with struct tpacket2_hdr
2. query the header length (getsockopt PACKET_HDRLEN, passing TPACKET_V2
in the option value) and save it
3. set the protocol version to TPACKET_V2 (setsockopt PACKET_VERSION)
- set up the ring as usual
4. to get the sockaddr_ll, use (void *)hdr + TPACKET_ALIGN(hdrlen)
instead of (void *)hdr + TPACKET_ALIGN(sizeof(struct tpacket_hdr))
Steps 2 and 4 can be omitted if the struct sockaddr_ll isn't needed.
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
| -rw-r--r-- | include/linux/if_packet.h | 21 | ||||
| -rw-r--r-- | net/packet/af_packet.c | 179 |
2 files changed, 167 insertions, 33 deletions
diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h index ad09609227ff..d4d3c82448f5 100644 --- a/include/linux/if_packet.h +++ b/include/linux/if_packet.h | |||
| @@ -43,6 +43,8 @@ struct sockaddr_ll | |||
| 43 | #define PACKET_COPY_THRESH 7 | 43 | #define PACKET_COPY_THRESH 7 |
| 44 | #define PACKET_AUXDATA 8 | 44 | #define PACKET_AUXDATA 8 |
| 45 | #define PACKET_ORIGDEV 9 | 45 | #define PACKET_ORIGDEV 9 |
| 46 | #define PACKET_VERSION 10 | ||
| 47 | #define PACKET_HDRLEN 11 | ||
| 46 | 48 | ||
| 47 | struct tpacket_stats | 49 | struct tpacket_stats |
| 48 | { | 50 | { |
| @@ -79,6 +81,25 @@ struct tpacket_hdr | |||
| 79 | #define TPACKET_ALIGN(x) (((x)+TPACKET_ALIGNMENT-1)&~(TPACKET_ALIGNMENT-1)) | 81 | #define TPACKET_ALIGN(x) (((x)+TPACKET_ALIGNMENT-1)&~(TPACKET_ALIGNMENT-1)) |
| 80 | #define TPACKET_HDRLEN (TPACKET_ALIGN(sizeof(struct tpacket_hdr)) + sizeof(struct sockaddr_ll)) | 82 | #define TPACKET_HDRLEN (TPACKET_ALIGN(sizeof(struct tpacket_hdr)) + sizeof(struct sockaddr_ll)) |
| 81 | 83 | ||
| 84 | struct tpacket2_hdr | ||
| 85 | { | ||
| 86 | __u32 tp_status; | ||
| 87 | __u32 tp_len; | ||
| 88 | __u32 tp_snaplen; | ||
| 89 | __u16 tp_mac; | ||
| 90 | __u16 tp_net; | ||
| 91 | __u32 tp_sec; | ||
| 92 | __u32 tp_nsec; | ||
| 93 | }; | ||
| 94 | |||
| 95 | #define TPACKET2_HDRLEN (TPACKET_ALIGN(sizeof(struct tpacket2_hdr)) + sizeof(struct sockaddr_ll)) | ||
| 96 | |||
| 97 | enum tpacket_versions | ||
| 98 | { | ||
| 99 | TPACKET_V1, | ||
| 100 | TPACKET_V2, | ||
| 101 | }; | ||
| 102 | |||
| 82 | /* | 103 | /* |
| 83 | Frame structure: | 104 | Frame structure: |
| 84 | 105 | ||
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9f2269166687..4f059775d48f 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c | |||
| @@ -186,6 +186,8 @@ struct packet_sock { | |||
| 186 | unsigned int pg_vec_order; | 186 | unsigned int pg_vec_order; |
| 187 | unsigned int pg_vec_pages; | 187 | unsigned int pg_vec_pages; |
| 188 | unsigned int pg_vec_len; | 188 | unsigned int pg_vec_len; |
| 189 | enum tpacket_versions tp_version; | ||
| 190 | unsigned int tp_hdrlen; | ||
| 189 | #endif | 191 | #endif |
| 190 | }; | 192 | }; |
| 191 | 193 | ||
| @@ -201,14 +203,52 @@ struct packet_skb_cb { | |||
| 201 | 203 | ||
| 202 | #ifdef CONFIG_PACKET_MMAP | 204 | #ifdef CONFIG_PACKET_MMAP |
| 203 | 205 | ||
| 204 | static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position) | 206 | static void *packet_lookup_frame(struct packet_sock *po, unsigned int position, |
| 207 | int status) | ||
| 205 | { | 208 | { |
| 206 | unsigned int pg_vec_pos, frame_offset; | 209 | unsigned int pg_vec_pos, frame_offset; |
| 210 | union { | ||
| 211 | struct tpacket_hdr *h1; | ||
| 212 | struct tpacket2_hdr *h2; | ||
| 213 | void *raw; | ||
| 214 | } h; | ||
| 207 | 215 | ||
| 208 | pg_vec_pos = position / po->frames_per_block; | 216 | pg_vec_pos = position / po->frames_per_block; |
| 209 | frame_offset = position % po->frames_per_block; | 217 | frame_offset = position % po->frames_per_block; |
| 210 | 218 | ||
| 211 | return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size)); | 219 | h.raw = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size); |
| 220 | switch (po->tp_version) { | ||
| 221 | case TPACKET_V1: | ||
| 222 | if (status != h.h1->tp_status ? TP_STATUS_USER : | ||
| 223 | TP_STATUS_KERNEL) | ||
| 224 | return NULL; | ||
| 225 | break; | ||
| 226 | case TPACKET_V2: | ||
| 227 | if (status != h.h2->tp_status ? TP_STATUS_USER : | ||
| 228 | TP_STATUS_KERNEL) | ||
| 229 | return NULL; | ||
| 230 | break; | ||
| 231 | } | ||
| 232 | return h.raw; | ||
| 233 | } | ||
| 234 | |||
| 235 | static void __packet_set_status(struct packet_sock *po, void *frame, int status) | ||
| 236 | { | ||
| 237 | union { | ||
| 238 | struct tpacket_hdr *h1; | ||
| 239 | struct tpacket2_hdr *h2; | ||
| 240 | void *raw; | ||
| 241 | } h; | ||
| 242 | |||
| 243 | h.raw = frame; | ||
| 244 | switch (po->tp_version) { | ||
| 245 | case TPACKET_V1: | ||
| 246 | h.h1->tp_status = status; | ||
| 247 | break; | ||
| 248 | case TPACKET_V2: | ||
| 249 | h.h2->tp_status = status; | ||
| 250 | break; | ||
| 251 | } | ||
| 212 | } | 252 | } |
| 213 | #endif | 253 | #endif |
| 214 | 254 | ||
| @@ -551,14 +591,19 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe | |||
| 551 | struct sock *sk; | 591 | struct sock *sk; |
| 552 | struct packet_sock *po; | 592 | struct packet_sock *po; |
| 553 | struct sockaddr_ll *sll; | 593 | struct sockaddr_ll *sll; |
| 554 | struct tpacket_hdr *h; | 594 | union { |
| 595 | struct tpacket_hdr *h1; | ||
| 596 | struct tpacket2_hdr *h2; | ||
| 597 | void *raw; | ||
| 598 | } h; | ||
| 555 | u8 * skb_head = skb->data; | 599 | u8 * skb_head = skb->data; |
| 556 | int skb_len = skb->len; | 600 | int skb_len = skb->len; |
| 557 | unsigned int snaplen, res; | 601 | unsigned int snaplen, res; |
| 558 | unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER; | 602 | unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER; |
| 559 | unsigned short macoff, netoff; | 603 | unsigned short macoff, netoff, hdrlen; |
| 560 | struct sk_buff *copy_skb = NULL; | 604 | struct sk_buff *copy_skb = NULL; |
| 561 | struct timeval tv; | 605 | struct timeval tv; |
| 606 | struct timespec ts; | ||
| 562 | 607 | ||
| 563 | if (skb->pkt_type == PACKET_LOOPBACK) | 608 | if (skb->pkt_type == PACKET_LOOPBACK) |
| 564 | goto drop; | 609 | goto drop; |
| @@ -590,10 +635,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe | |||
| 590 | snaplen = res; | 635 | snaplen = res; |
| 591 | 636 | ||
| 592 | if (sk->sk_type == SOCK_DGRAM) { | 637 | if (sk->sk_type == SOCK_DGRAM) { |
| 593 | macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16; | 638 | macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16; |
| 594 | } else { | 639 | } else { |
| 595 | unsigned maclen = skb_network_offset(skb); | 640 | unsigned maclen = skb_network_offset(skb); |
| 596 | netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen)); | 641 | netoff = TPACKET_ALIGN(po->tp_hdrlen + |
| 642 | (maclen < 16 ? 16 : maclen)); | ||
| 597 | macoff = netoff - maclen; | 643 | macoff = netoff - maclen; |
| 598 | } | 644 | } |
| 599 | 645 | ||
| @@ -616,9 +662,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe | |||
| 616 | } | 662 | } |
| 617 | 663 | ||
| 618 | spin_lock(&sk->sk_receive_queue.lock); | 664 | spin_lock(&sk->sk_receive_queue.lock); |
| 619 | h = packet_lookup_frame(po, po->head); | 665 | h.raw = packet_lookup_frame(po, po->head, TP_STATUS_KERNEL); |
| 620 | 666 | if (!h.raw) | |
| 621 | if (h->tp_status) | ||
| 622 | goto ring_is_full; | 667 | goto ring_is_full; |
| 623 | po->head = po->head != po->frame_max ? po->head+1 : 0; | 668 | po->head = po->head != po->frame_max ? po->head+1 : 0; |
| 624 | po->stats.tp_packets++; | 669 | po->stats.tp_packets++; |
| @@ -630,20 +675,40 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe | |||
| 630 | status &= ~TP_STATUS_LOSING; | 675 | status &= ~TP_STATUS_LOSING; |
| 631 | spin_unlock(&sk->sk_receive_queue.lock); | 676 | spin_unlock(&sk->sk_receive_queue.lock); |
| 632 | 677 | ||
| 633 | skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen); | 678 | skb_copy_bits(skb, 0, h.raw + macoff, snaplen); |
| 634 | 679 | ||
| 635 | h->tp_len = skb->len; | 680 | switch (po->tp_version) { |
| 636 | h->tp_snaplen = snaplen; | 681 | case TPACKET_V1: |
| 637 | h->tp_mac = macoff; | 682 | h.h1->tp_len = skb->len; |
| 638 | h->tp_net = netoff; | 683 | h.h1->tp_snaplen = snaplen; |
| 639 | if (skb->tstamp.tv64) | 684 | h.h1->tp_mac = macoff; |
| 640 | tv = ktime_to_timeval(skb->tstamp); | 685 | h.h1->tp_net = netoff; |
| 641 | else | 686 | if (skb->tstamp.tv64) |
| 642 | do_gettimeofday(&tv); | 687 | tv = ktime_to_timeval(skb->tstamp); |
| 643 | h->tp_sec = tv.tv_sec; | 688 | else |
| 644 | h->tp_usec = tv.tv_usec; | 689 | do_gettimeofday(&tv); |
| 690 | h.h1->tp_sec = tv.tv_sec; | ||
| 691 | h.h1->tp_usec = tv.tv_usec; | ||
| 692 | hdrlen = sizeof(*h.h1); | ||
| 693 | break; | ||
| 694 | case TPACKET_V2: | ||
| 695 | h.h2->tp_len = skb->len; | ||
| 696 | h.h2->tp_snaplen = snaplen; | ||
| 697 | h.h2->tp_mac = macoff; | ||
| 698 | h.h2->tp_net = netoff; | ||
| 699 | if (skb->tstamp.tv64) | ||
| 700 | ts = ktime_to_timespec(skb->tstamp); | ||
| 701 | else | ||
| 702 | getnstimeofday(&ts); | ||
| 703 | h.h2->tp_sec = ts.tv_sec; | ||
| 704 | h.h2->tp_nsec = ts.tv_nsec; | ||
| 705 | hdrlen = sizeof(*h.h2); | ||
| 706 | break; | ||
| 707 | default: | ||
| 708 | BUG(); | ||
| 709 | } | ||
| 645 | 710 | ||
| 646 | sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h))); | 711 | sll = h.raw + TPACKET_ALIGN(hdrlen); |
| 647 | sll->sll_halen = dev_parse_header(skb, sll->sll_addr); | 712 | sll->sll_halen = dev_parse_header(skb, sll->sll_addr); |
| 648 | sll->sll_family = AF_PACKET; | 713 | sll->sll_family = AF_PACKET; |
| 649 | sll->sll_hatype = dev->type; | 714 | sll->sll_hatype = dev->type; |
| @@ -654,14 +719,14 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe | |||
| 654 | else | 719 | else |
| 655 | sll->sll_ifindex = dev->ifindex; | 720 | sll->sll_ifindex = dev->ifindex; |
| 656 | 721 | ||
| 657 | h->tp_status = status; | 722 | __packet_set_status(po, h.raw, status); |
| 658 | smp_mb(); | 723 | smp_mb(); |
| 659 | 724 | ||
| 660 | { | 725 | { |
| 661 | struct page *p_start, *p_end; | 726 | struct page *p_start, *p_end; |
| 662 | u8 *h_end = (u8 *)h + macoff + snaplen - 1; | 727 | u8 *h_end = h.raw + macoff + snaplen - 1; |
| 663 | 728 | ||
| 664 | p_start = virt_to_page(h); | 729 | p_start = virt_to_page(h.raw); |
| 665 | p_end = virt_to_page(h_end); | 730 | p_end = virt_to_page(h_end); |
| 666 | while (p_start <= p_end) { | 731 | while (p_start <= p_end) { |
| 667 | flush_dcache_page(p_start); | 732 | flush_dcache_page(p_start); |
| @@ -1362,6 +1427,25 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv | |||
| 1362 | pkt_sk(sk)->copy_thresh = val; | 1427 | pkt_sk(sk)->copy_thresh = val; |
| 1363 | return 0; | 1428 | return 0; |
| 1364 | } | 1429 | } |
| 1430 | case PACKET_VERSION: | ||
| 1431 | { | ||
| 1432 | int val; | ||
| 1433 | |||
| 1434 | if (optlen != sizeof(val)) | ||
| 1435 | return -EINVAL; | ||
| 1436 | if (po->pg_vec) | ||
| 1437 | return -EBUSY; | ||
| 1438 | if (copy_from_user(&val, optval, sizeof(val))) | ||
| 1439 | return -EFAULT; | ||
| 1440 | switch (val) { | ||
| 1441 | case TPACKET_V1: | ||
| 1442 | case TPACKET_V2: | ||
| 1443 | po->tp_version = val; | ||
| 1444 | return 0; | ||
| 1445 | default: | ||
| 1446 | return -EINVAL; | ||
| 1447 | } | ||
| 1448 | } | ||
| 1365 | #endif | 1449 | #endif |
| 1366 | case PACKET_AUXDATA: | 1450 | case PACKET_AUXDATA: |
| 1367 | { | 1451 | { |
| @@ -1437,6 +1521,31 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, | |||
| 1437 | 1521 | ||
| 1438 | data = &val; | 1522 | data = &val; |
| 1439 | break; | 1523 | break; |
| 1524 | #ifdef CONFIG_PACKET_MMAP | ||
| 1525 | case PACKET_VERSION: | ||
| 1526 | if (len > sizeof(int)) | ||
| 1527 | len = sizeof(int); | ||
| 1528 | val = po->tp_version; | ||
| 1529 | data = &val; | ||
| 1530 | break; | ||
| 1531 | case PACKET_HDRLEN: | ||
| 1532 | if (len > sizeof(int)) | ||
| 1533 | len = sizeof(int); | ||
| 1534 | if (copy_from_user(&val, optval, len)) | ||
| 1535 | return -EFAULT; | ||
| 1536 | switch (val) { | ||
| 1537 | case TPACKET_V1: | ||
| 1538 | val = sizeof(struct tpacket_hdr); | ||
| 1539 | break; | ||
| 1540 | case TPACKET_V2: | ||
| 1541 | val = sizeof(struct tpacket2_hdr); | ||
| 1542 | break; | ||
| 1543 | default: | ||
| 1544 | return -EINVAL; | ||
| 1545 | } | ||
| 1546 | data = &val; | ||
| 1547 | break; | ||
| 1548 | #endif | ||
| 1440 | default: | 1549 | default: |
| 1441 | return -ENOPROTOOPT; | 1550 | return -ENOPROTOOPT; |
| 1442 | } | 1551 | } |
| @@ -1570,11 +1679,8 @@ static unsigned int packet_poll(struct file * file, struct socket *sock, | |||
| 1570 | spin_lock_bh(&sk->sk_receive_queue.lock); | 1679 | spin_lock_bh(&sk->sk_receive_queue.lock); |
| 1571 | if (po->pg_vec) { | 1680 | if (po->pg_vec) { |
| 1572 | unsigned last = po->head ? po->head-1 : po->frame_max; | 1681 | unsigned last = po->head ? po->head-1 : po->frame_max; |
| 1573 | struct tpacket_hdr *h; | ||
| 1574 | |||
| 1575 | h = packet_lookup_frame(po, last); | ||
| 1576 | 1682 | ||
| 1577 | if (h->tp_status) | 1683 | if (packet_lookup_frame(po, last, TP_STATUS_USER)) |
| 1578 | mask |= POLLIN | POLLRDNORM; | 1684 | mask |= POLLIN | POLLRDNORM; |
| 1579 | } | 1685 | } |
| 1580 | spin_unlock_bh(&sk->sk_receive_queue.lock); | 1686 | spin_unlock_bh(&sk->sk_receive_queue.lock); |
| @@ -1669,11 +1775,20 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing | |||
| 1669 | if (unlikely(po->pg_vec)) | 1775 | if (unlikely(po->pg_vec)) |
| 1670 | return -EBUSY; | 1776 | return -EBUSY; |
| 1671 | 1777 | ||
| 1778 | switch (po->tp_version) { | ||
| 1779 | case TPACKET_V1: | ||
| 1780 | po->tp_hdrlen = TPACKET_HDRLEN; | ||
| 1781 | break; | ||
| 1782 | case TPACKET_V2: | ||
| 1783 | po->tp_hdrlen = TPACKET2_HDRLEN; | ||
| 1784 | break; | ||
| 1785 | } | ||
| 1786 | |||
| 1672 | if (unlikely((int)req->tp_block_size <= 0)) | 1787 | if (unlikely((int)req->tp_block_size <= 0)) |
| 1673 | return -EINVAL; | 1788 | return -EINVAL; |
| 1674 | if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) | 1789 | if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) |
| 1675 | return -EINVAL; | 1790 | return -EINVAL; |
| 1676 | if (unlikely(req->tp_frame_size < TPACKET_HDRLEN)) | 1791 | if (unlikely(req->tp_frame_size < po->tp_hdrlen)) |
| 1677 | return -EINVAL; | 1792 | return -EINVAL; |
| 1678 | if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) | 1793 | if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) |
| 1679 | return -EINVAL; | 1794 | return -EINVAL; |
| @@ -1692,13 +1807,11 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing | |||
| 1692 | goto out; | 1807 | goto out; |
| 1693 | 1808 | ||
| 1694 | for (i = 0; i < req->tp_block_nr; i++) { | 1809 | for (i = 0; i < req->tp_block_nr; i++) { |
| 1695 | char *ptr = pg_vec[i]; | 1810 | void *ptr = pg_vec[i]; |
| 1696 | struct tpacket_hdr *header; | ||
| 1697 | int k; | 1811 | int k; |
| 1698 | 1812 | ||
| 1699 | for (k = 0; k < po->frames_per_block; k++) { | 1813 | for (k = 0; k < po->frames_per_block; k++) { |
| 1700 | header = (struct tpacket_hdr *) ptr; | 1814 | __packet_set_status(po, ptr, TP_STATUS_KERNEL); |
| 1701 | header->tp_status = TP_STATUS_KERNEL; | ||
| 1702 | ptr += req->tp_frame_size; | 1815 | ptr += req->tp_frame_size; |
| 1703 | } | 1816 | } |
| 1704 | } | 1817 | } |
