Diffstat (limited to 'net/packet/af_packet.c')

-rw-r--r--	net/packet/af_packet.c	163
1 file changed, 105 insertions(+), 58 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3616f27b9d46..91cb1d71f018 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -61,6 +61,7 @@
 #include <linux/kernel.h>
 #include <linux/kmod.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 #include <net/net_namespace.h>
 #include <net/ip.h>
 #include <net/protocol.h>
@@ -163,8 +164,13 @@ struct packet_mreq_max {
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		int closing, int tx_ring);
 
+#define PGV_FROM_VMALLOC 1
+struct pgv {
+	char *buffer;
+};
+
 struct packet_ring_buffer {
-	char			**pg_vec;
+	struct pgv		*pg_vec;
 	unsigned int		head;
 	unsigned int		frames_per_block;
 	unsigned int		frame_size;
@@ -217,6 +223,13 @@ struct packet_skb_cb {
 
 #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
 
+static inline __pure struct page *pgv_to_page(void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		return vmalloc_to_page(addr);
+	return virt_to_page(addr);
+}
+
 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 {
 	union {
@@ -229,11 +242,11 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 	switch (po->tp_version) {
 	case TPACKET_V1:
 		h.h1->tp_status = status;
-		flush_dcache_page(virt_to_page(&h.h1->tp_status));
+		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
 		break;
 	case TPACKET_V2:
 		h.h2->tp_status = status;
-		flush_dcache_page(virt_to_page(&h.h2->tp_status));
+		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		break;
 	default:
 		pr_err("TPACKET version not supported\n");
@@ -256,10 +269,10 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
 	h.raw = frame;
 	switch (po->tp_version) {
 	case TPACKET_V1:
-		flush_dcache_page(virt_to_page(&h.h1->tp_status));
+		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
 		return h.h1->tp_status;
 	case TPACKET_V2:
-		flush_dcache_page(virt_to_page(&h.h2->tp_status));
+		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		return h.h2->tp_status;
 	default:
 		pr_err("TPACKET version not supported\n");
@@ -283,7 +296,8 @@ static void *packet_lookup_frame(struct packet_sock *po,
 	pg_vec_pos = position / rb->frames_per_block;
 	frame_offset = position % rb->frames_per_block;
 
-	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
+	h.raw = rb->pg_vec[pg_vec_pos].buffer +
+		(frame_offset * rb->frame_size);
 
 	if (status != __packet_get_status(po, h.raw))
 		return NULL;
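
The lookup above splits a flat frame index into a block number and a slot within that block, then scales the slot by the frame size; after the patch, the byte offset is applied to pg_vec[pg_vec_pos].buffer instead of the bare pointer. A minimal user-space sketch of the arithmetic, with invented ring geometry:

#include <stdio.h>

int main(void)
{
	/* Hypothetical ring geometry, not taken from the patch. */
	unsigned int frames_per_block = 8;
	unsigned int frame_size = 2048;
	unsigned int position = 21;	/* flat frame index */

	unsigned int pg_vec_pos = position / frames_per_block;	/* block 2 */
	unsigned int frame_offset = position % frames_per_block;	/* slot 5 */

	printf("frame %u -> block %u, byte offset %u\n",
	       position, pg_vec_pos, frame_offset * frame_size);
	return 0;
}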
@@ -503,7 +517,8 @@ out_free:
 	return err;
 }
 
-static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
+static inline unsigned int run_filter(const struct sk_buff *skb,
+				      const struct sock *sk,
 				      unsigned int res)
 {
 	struct sk_filter *filter;
@@ -511,22 +526,22 @@ static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
 	rcu_read_lock_bh();
 	filter = rcu_dereference_bh(sk->sk_filter);
 	if (filter != NULL)
-		res = sk_run_filter(skb, filter->insns, filter->len);
+		res = sk_run_filter(skb, filter->insns);
 	rcu_read_unlock_bh();
 
 	return res;
 }
 
 /*
-   This function makes lazy skb cloning in hope that most of packets
-   are discarded by BPF.
-
-   Note tricky part: we DO mangle shared skb! skb->data, skb->len
-   and skb->cb are mangled. It works because (and until) packets
-   falling here are owned by current CPU. Output packets are cloned
-   by dev_queue_xmit_nit(), input packets are processed by net_bh
-   sequencially, so that if we return skb to original state on exit,
-   we will not harm anyone.
+ * This function makes lazy skb cloning in hope that most of packets
+ * are discarded by BPF.
+ *
+ * Note tricky part: we DO mangle shared skb! skb->data, skb->len
+ * and skb->cb are mangled. It works because (and until) packets
+ * falling here are owned by current CPU. Output packets are cloned
+ * by dev_queue_xmit_nit(), input packets are processed by net_bh
+ * sequencially, so that if we return skb to original state on exit,
+ * we will not harm anyone.
  */
 
 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
@@ -552,11 +567,11 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	if (dev->header_ops) {
 		/* The device has an explicit notion of ll header,
-		   exported to higher levels.
-
-		   Otherwise, the device hides datails of it frame
-		   structure, so that corresponding packet head
-		   never delivered to user.
+		 * exported to higher levels.
+		 *
+		 * Otherwise, the device hides details of its frame
+		 * structure, so that corresponding packet head is
+		 * never delivered to user.
 		 */
 		if (sk->sk_type != SOCK_DGRAM)
 			skb_push(skb, skb->data - skb_mac_header(skb));
@@ -791,17 +806,15 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	__packet_set_status(po, h.raw, status);
 	smp_mb();
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
 	{
-		struct page *p_start, *p_end;
-		u8 *h_end = h.raw + macoff + snaplen - 1;
-
-		p_start = virt_to_page(h.raw);
-		p_end = virt_to_page(h_end);
-		while (p_start <= p_end) {
-			flush_dcache_page(p_start);
-			p_start++;
-		}
+		u8 *start, *end;
+
+		end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen);
+		for (start = h.raw; start < end; start += PAGE_SIZE)
+			flush_dcache_page(pgv_to_page(start));
 	}
+#endif
 
 	sk->sk_data_ready(sk, 0);
 
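
The rewritten flush walks the frame one page at a time, rounding the end address up with PAGE_ALIGN so the last partially covered page is included, and translating each virtual page through pgv_to_page() since the block may now be vmalloc-backed. A user-space sketch of the loop arithmetic, assuming 4 KiB pages and an invented frame address:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long raw = 0x10000800;	/* frame starts mid-page */
	unsigned long macoff = 66, snaplen = 9000;
	unsigned long start;
	unsigned long end = PAGE_ALIGN(raw + macoff + snaplen);

	/* One iteration per page the frame touches; in the kernel each
	 * iteration would be flush_dcache_page(pgv_to_page(start)). */
	for (start = raw; start < end; start += PAGE_SIZE)
		printf("flush page at 0x%lx\n", start & ~(PAGE_SIZE - 1));
	return 0;
}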
@@ -907,7 +920,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 	}
 
 	err = -EFAULT;
-	page = virt_to_page(data);
 	offset = offset_in_page(data);
 	len_max = PAGE_SIZE - offset;
 	len = ((to_write > len_max) ? len_max : to_write);
@@ -926,11 +938,11 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 			return -EFAULT;
 		}
 
+		page = pgv_to_page(data);
+		data += len;
 		flush_dcache_page(page);
 		get_page(page);
-		skb_fill_page_desc(skb,
-				nr_frags,
-				page++, offset, len);
+		skb_fill_page_desc(skb, nr_frags, page, offset, len);
 		to_write -= len;
 		offset = 0;
 		len_max = PAGE_SIZE;
@@ -1610,9 +1622,11 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 		err = -EINVAL;
 		vnet_hdr_len = sizeof(vnet_hdr);
-		if ((len -= vnet_hdr_len) < 0)
+		if (len < vnet_hdr_len)
 			goto out_free;
 
+		len -= vnet_hdr_len;
+
 		if (skb_is_gso(skb)) {
 			struct skb_shared_info *sinfo = skb_shinfo(skb);
 
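
This hunk fixes an unsigned-underflow pitfall: len is a size_t here, so the old `(len -= vnet_hdr_len) < 0` test could never fire, and a too-short buffer just wrapped len around to a huge value. A user-space sketch of the pitfall and the fixed pattern:

#include <stdio.h>

int main(void)
{
	size_t len = 5;			/* buffer shorter than the header */
	const size_t vnet_hdr_len = 10;

	/* Old pattern: an unsigned value is never < 0, so this check is
	 * dead code and len silently wraps around. */
	if ((len -= vnet_hdr_len) < 0)
		printf("never reached\n");
	printf("len wrapped to %zu\n", len);

	/* Fixed pattern: compare first, subtract only when safe. */
	len = 5;
	if (len < vnet_hdr_len)
		printf("short buffer caught before subtracting\n");
	else
		len -= vnet_hdr_len;
	return 0;
}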
@@ -1636,8 +1650,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
-			vnet_hdr.csum_start = skb->csum_start -
-					      skb_headroom(skb);
+			vnet_hdr.csum_start = skb_checksum_start_offset(skb);
 			vnet_hdr.csum_offset = skb->csum_offset;
 		} /* else everything is zero */
 
@@ -1719,7 +1732,7 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
 	rcu_read_lock();
 	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
 	if (dev)
-		strlcpy(uaddr->sa_data, dev->name, 15);
+		strncpy(uaddr->sa_data, dev->name, 14);
 	else
 		memset(uaddr->sa_data, 0, 14);
 	rcu_read_unlock();
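
Context for the strlcpy-to-strncpy switch: sa_data is a 14-byte field that is copied verbatim to user space. strncpy() with a bound of 14 stays inside the field and zero-pads the tail, so no uninitialized kernel bytes leak, whereas strlcpy() leaves the bytes after the NUL untouched (the same concern the added sll_pkttype = 0 initialization in the next hunk addresses). A sketch of the padding behaviour with a stand-in struct:

#include <stdio.h>
#include <string.h>

/* Stand-in for struct sockaddr's 14-byte sa_data field. */
struct fake_sockaddr {
	unsigned short	sa_family;
	char		sa_data[14];
};

int main(void)
{
	struct fake_sockaddr sa;
	size_t i;

	memset(&sa, 0xAA, sizeof(sa));	/* pretend stack garbage */

	/* strncpy() zero-fills the remainder of the 14-byte bound, so
	 * none of the old contents survives a copy to user space;
	 * strlcpy() would have left the tail bytes untouched. */
	strncpy(sa.sa_data, "eth0", sizeof(sa.sa_data));

	for (i = 0; i < sizeof(sa.sa_data); i++)
		printf("%02x ", (unsigned char)sa.sa_data[i]);
	printf("\n");
	return 0;
}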
@@ -1742,6 +1755,7 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
 	sll->sll_family = AF_PACKET;
 	sll->sll_ifindex = po->ifindex;
 	sll->sll_protocol = po->num;
+	sll->sll_pkttype = 0;
 	rcu_read_lock();
 	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
 	if (dev) {
@@ -2322,37 +2336,70 @@ static const struct vm_operations_struct packet_mmap_ops = {
 	.close	=		packet_mm_close,
 };
 
-static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
+static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
+			unsigned int len)
 {
 	int i;
 
 	for (i = 0; i < len; i++) {
-		if (likely(pg_vec[i]))
-			free_pages((unsigned long) pg_vec[i], order);
+		if (likely(pg_vec[i].buffer)) {
+			if (is_vmalloc_addr(pg_vec[i].buffer))
+				vfree(pg_vec[i].buffer);
+			else
+				free_pages((unsigned long)pg_vec[i].buffer,
+					   order);
+			pg_vec[i].buffer = NULL;
+		}
 	}
 	kfree(pg_vec);
 }
 
 static inline char *alloc_one_pg_vec_page(unsigned long order)
 {
-	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
+	char *buffer = NULL;
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
+			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+
+	buffer = (char *) __get_free_pages(gfp_flags, order);
+
+	if (buffer)
+		return buffer;
+
+	/*
+	 * __get_free_pages failed, fall back to vmalloc
+	 */
+	buffer = vzalloc((1 << order) * PAGE_SIZE);
 
-	return (char *) __get_free_pages(gfp_flags, order);
+	if (buffer)
+		return buffer;
+
+	/*
+	 * vmalloc failed, lets dig into swap here
+	 */
+	gfp_flags &= ~__GFP_NORETRY;
+	buffer = (char *)__get_free_pages(gfp_flags, order);
+	if (buffer)
+		return buffer;
+
+	/*
+	 * complete and utter failure
+	 */
+	return NULL;
 }
 
-static char **alloc_pg_vec(struct tpacket_req *req, int order)
+static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
 {
 	unsigned int block_nr = req->tp_block_nr;
-	char **pg_vec;
+	struct pgv *pg_vec;
 	int i;
 
-	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
+	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
 	if (unlikely(!pg_vec))
 		goto out;
 
 	for (i = 0; i < block_nr; i++) {
-		pg_vec[i] = alloc_one_pg_vec_page(order);
-		if (unlikely(!pg_vec[i]))
+		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
+		if (unlikely(!pg_vec[i].buffer))
 			goto out_free_pgvec;
 	}
 
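
The new allocator tries three tiers: a cheap physically contiguous allocation with __GFP_NORETRY (give up quickly under fragmentation), then vzalloc() for virtually contiguous memory, then one last physically contiguous attempt that is allowed to reclaim aggressively. Each block is `(1 << order) * PAGE_SIZE` bytes; a small sketch of what each order costs, assuming 4 KiB pages, showing why high orders are the ones that fail on fragmented systems:

#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumed page size for illustration */

/* Bytes backing one ring block at a given allocation order, matching
 * the (1 << order) * PAGE_SIZE sizing passed to vzalloc() above. */
static unsigned long block_bytes(unsigned int order)
{
	return (1UL << order) * PAGE_SIZE;
}

int main(void)
{
	unsigned int order;

	/* Higher orders need longer runs of physically contiguous pages,
	 * which is exactly what the vmalloc fallback avoids depending on. */
	for (order = 0; order <= 4; order++)
		printf("order %u -> %lu KiB\n",
		       order, block_bytes(order) >> 10);
	return 0;
}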
@@ -2368,7 +2415,7 @@ out_free_pgvec:
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		int closing, int tx_ring)
 {
-	char **pg_vec = NULL;
+	struct pgv *pg_vec = NULL;
 	struct packet_sock *po = pkt_sk(sk);
 	int was_running, order = 0;
 	struct packet_ring_buffer *rb;
@@ -2453,22 +2500,20 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 	mutex_lock(&po->pg_vec_lock);
 	if (closing || atomic_read(&po->mapped) == 0) {
 		err = 0;
-#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
 		spin_lock_bh(&rb_queue->lock);
-		pg_vec = XC(rb->pg_vec, pg_vec);
+		swap(rb->pg_vec, pg_vec);
 		rb->frame_max = (req->tp_frame_nr - 1);
 		rb->head = 0;
 		rb->frame_size = req->tp_frame_size;
 		spin_unlock_bh(&rb_queue->lock);
 
-		order = XC(rb->pg_vec_order, order);
-		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
+		swap(rb->pg_vec_order, order);
+		swap(rb->pg_vec_len, req->tp_block_nr);
 
 		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
 		po->prot_hook.func = (po->rx_ring.pg_vec) ?
 						tpacket_rcv : packet_rcv;
 		skb_queue_purge(rb_queue);
-#undef XC
 		if (atomic_read(&po->mapped))
 			pr_err("packet_mmap: vma is busy: %d\n",
 			       atomic_read(&po->mapped));
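
The open-coded XC() exchange macro is replaced by the kernel's generic swap() from linux/kernel.h. For this usage the two idioms leave the variables in the same state; a user-space rendition (GCC statement-expression syntax, as in the kernel):

#include <stdio.h>

/* The removed macro: store b into a, return a's old value. */
#define XC(a, b) ({ __typeof__((a)) __t; __t = (a); (a) = (b); __t; })

/* User-space copy of the kernel's swap() from linux/kernel.h. */
#define swap(a, b) \
	do { __typeof__(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

int main(void)
{
	int ring = 1, staging = 2;

	staging = XC(ring, staging);	/* ring = 2, staging = 1 */
	printf("XC:   ring=%d staging=%d\n", ring, staging);

	swap(ring, staging);		/* back to ring = 1, staging = 2 */
	printf("swap: ring=%d staging=%d\n", ring, staging);
	return 0;
}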
@@ -2530,15 +2575,17 @@ static int packet_mmap(struct file *file, struct socket *sock,
 			continue;
 
 		for (i = 0; i < rb->pg_vec_len; i++) {
-			struct page *page = virt_to_page(rb->pg_vec[i]);
+			struct page *page;
+			void *kaddr = rb->pg_vec[i].buffer;
 			int pg_num;
 
-			for (pg_num = 0; pg_num < rb->pg_vec_pages;
-					pg_num++, page++) {
+			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
+				page = pgv_to_page(kaddr);
 				err = vm_insert_page(vma, start, page);
 				if (unlikely(err))
 					goto out;
 				start += PAGE_SIZE;
+				kaddr += PAGE_SIZE;
 			}
 		}
 	}
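
The mmap loop now advances a virtual address and translates every page separately, because a vmalloc block's pages need not be physically contiguous: the old `page++` walk from the first struct page would map the wrong memory. A toy model with an invented virtual-to-physical mapping:

#include <stdio.h>

int main(void)
{
	/* Invented mapping: virtual page i of a vmalloc block -> some
	 * scattered physical page. */
	int phys[4] = { 7, 2, 9, 4 };
	int pg_num;

	/* The page++ walk assumes phys[0] + pg_num; the per-page lookup
	 * (pgv_to_page() on each kaddr) follows the real mapping. */
	for (pg_num = 0; pg_num < 4; pg_num++)
		printf("vpage %d: page++ walk gives phys %d, "
		       "per-page lookup gives phys %d\n",
		       pg_num, phys[0] + pg_num, phys[pg_num]);
	return 0;
}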