Diffstat (limited to 'net/packet/af_packet.c')
-rw-r--r--	net/packet/af_packet.c	163
1 file changed, 105 insertions, 58 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3616f27b9d46..91cb1d71f018 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -61,6 +61,7 @@
 #include <linux/kernel.h>
 #include <linux/kmod.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 #include <net/net_namespace.h>
 #include <net/ip.h>
 #include <net/protocol.h>
@@ -163,8 +164,13 @@ struct packet_mreq_max {
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		int closing, int tx_ring);
 
+#define PGV_FROM_VMALLOC 1
+struct pgv {
+	char *buffer;
+};
+
 struct packet_ring_buffer {
-	char			**pg_vec;
+	struct pgv		*pg_vec;
 	unsigned int		head;
 	unsigned int		frames_per_block;
 	unsigned int		frame_size;
@@ -217,6 +223,13 @@ struct packet_skb_cb {
 
 #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
 
+static inline __pure struct page *pgv_to_page(void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		return vmalloc_to_page(addr);
+	return virt_to_page(addr);
+}
+
 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 {
 	union {
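
The pgv_to_page() helper above is the pivot of the whole change: a ring block may now come either from the page allocator, where the buffer lives in the kernel linear mapping and virt_to_page() is simple address arithmetic, or from vmalloc, where only a page-table walk can recover the backing struct page. Handing a vmalloc address to virt_to_page() would yield a bogus struct page, so every translation in the hunks below is routed through this helper. A standalone sketch of the same decision, with the reasoning spelled out in comments (illustrative only, not part of the patch):

	#include <linux/mm.h>
	#include <linux/vmalloc.h>

	static struct page *addr_to_page_sketch(void *addr)
	{
		/* vmalloc memory is only virtually contiguous; resolve
		 * the current page by walking the page tables. */
		if (is_vmalloc_addr(addr))
			return vmalloc_to_page(addr);
		/* linear-mapped memory: virt_to_page() is pure address
		 * arithmetic and is valid for the whole allocation. */
		return virt_to_page(addr);
	}
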
@@ -229,11 +242,11 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 	switch (po->tp_version) {
 	case TPACKET_V1:
 		h.h1->tp_status = status;
-		flush_dcache_page(virt_to_page(&h.h1->tp_status));
+		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
 		break;
 	case TPACKET_V2:
 		h.h2->tp_status = status;
-		flush_dcache_page(virt_to_page(&h.h2->tp_status));
+		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		break;
 	default:
 		pr_err("TPACKET version not supported\n");
@@ -256,10 +269,10 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
 	h.raw = frame;
 	switch (po->tp_version) {
 	case TPACKET_V1:
-		flush_dcache_page(virt_to_page(&h.h1->tp_status));
+		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
 		return h.h1->tp_status;
 	case TPACKET_V2:
-		flush_dcache_page(virt_to_page(&h.h2->tp_status));
+		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		return h.h2->tp_status;
 	default:
 		pr_err("TPACKET version not supported\n");
@@ -283,7 +296,8 @@ static void *packet_lookup_frame(struct packet_sock *po,
 	pg_vec_pos = position / rb->frames_per_block;
 	frame_offset = position % rb->frames_per_block;
 
-	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
+	h.raw = rb->pg_vec[pg_vec_pos].buffer +
+		(frame_offset * rb->frame_size);
 
 	if (status != __packet_get_status(po, h.raw))
 		return NULL;
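
Frame addressing is unchanged in spirit: a frame index splits into a block number and an offset within that block; only the block pointer now lives behind the .buffer member. A worked sketch of the arithmetic, with hypothetical numbers (the frame and block sizes are illustrative, not from the patch):

	/* Example: 16 KiB blocks with frame_size = 2048
	 *   => frames_per_block = 8. Looking up frame 11:
	 *   pg_vec_pos   = 11 / 8 = 1   (second block)
	 *   frame_offset = 11 % 8 = 3   (fourth frame in that block)
	 *   frame = pg_vec[1].buffer + 3 * 2048
	 */
	static void *lookup_frame_sketch(struct pgv *pg_vec,
					 unsigned int frames_per_block,
					 unsigned int frame_size,
					 unsigned int position)
	{
		unsigned int pg_vec_pos = position / frames_per_block;
		unsigned int frame_offset = position % frames_per_block;

		return pg_vec[pg_vec_pos].buffer +
			(frame_offset * frame_size);
	}
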
@@ -503,7 +517,8 @@ out_free:
 	return err;
 }
 
-static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
+static inline unsigned int run_filter(const struct sk_buff *skb,
+				      const struct sock *sk,
 				      unsigned int res)
 {
 	struct sk_filter *filter;
@@ -511,22 +526,22 @@ static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
 	rcu_read_lock_bh();
 	filter = rcu_dereference_bh(sk->sk_filter);
 	if (filter != NULL)
-		res = sk_run_filter(skb, filter->insns, filter->len);
+		res = sk_run_filter(skb, filter->insns);
 	rcu_read_unlock_bh();
 
 	return res;
 }
 
 /*
-   This function makes lazy skb cloning in hope that most of packets
-   are discarded by BPF.
-
-   Note tricky part: we DO mangle shared skb! skb->data, skb->len
-   and skb->cb are mangled. It works because (and until) packets
-   falling here are owned by current CPU. Output packets are cloned
-   by dev_queue_xmit_nit(), input packets are processed by net_bh
-   sequencially, so that if we return skb to original state on exit,
-   we will not harm anyone.
+ * This function makes lazy skb cloning in hope that most of packets
+ * are discarded by BPF.
+ *
+ * Note tricky part: we DO mangle shared skb! skb->data, skb->len
+ * and skb->cb are mangled. It works because (and until) packets
+ * falling here are owned by current CPU. Output packets are cloned
+ * by dev_queue_xmit_nit(), input packets are processed by net_bh
+ * sequencially, so that if we return skb to original state on exit,
+ * we will not harm anyone.
  */
 
 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
@@ -552,11 +567,11 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	if (dev->header_ops) {
 		/* The device has an explicit notion of ll header,
-		   exported to higher levels.
-
-		   Otherwise, the device hides datails of it frame
-		   structure, so that corresponding packet head
-		   never delivered to user.
+		 * exported to higher levels.
+		 *
+		 * Otherwise, the device hides details of its frame
+		 * structure, so that corresponding packet head is
+		 * never delivered to user.
 		 */
 		if (sk->sk_type != SOCK_DGRAM)
 			skb_push(skb, skb->data - skb_mac_header(skb));
@@ -791,17 +806,15 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	__packet_set_status(po, h.raw, status);
 	smp_mb();
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
 	{
-		struct page *p_start, *p_end;
-		u8 *h_end = h.raw + macoff + snaplen - 1;
-
-		p_start = virt_to_page(h.raw);
-		p_end = virt_to_page(h_end);
-		while (p_start <= p_end) {
-			flush_dcache_page(p_start);
-			p_start++;
-		}
+		u8 *start, *end;
+
+		end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen);
+		for (start = h.raw; start < end; start += PAGE_SIZE)
+			flush_dcache_page(pgv_to_page(start));
 	}
+#endif
 
 	sk->sk_data_ready(sk, 0);
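
The old flush loop incremented a struct page pointer (p_start++), which silently assumed the block's pages were physically consecutive; with vmalloc-backed blocks that no longer holds, so the new loop walks the virtual range one PAGE_SIZE stride at a time and translates each page individually. The new #if guard also compiles the whole block away on architectures where flush_dcache_page() is a no-op. A sketch of the same pattern, assuming a hypothetical buf/len pair and placement in af_packet.c below pgv_to_page():

	/* Flush every page touched by [buf, buf + len). PAGE_ALIGN rounds
	 * the end up so the final, partially covered page is flushed too;
	 * each iteration translates its own page, because consecutive
	 * virtual pages of a vmalloc block need not be adjacent in RAM.
	 */
	static void flush_range_sketch(u8 *buf, size_t len)
	{
		u8 *end = (u8 *)PAGE_ALIGN((unsigned long)(buf + len));
		u8 *start;

		for (start = buf; start < end; start += PAGE_SIZE)
			flush_dcache_page(pgv_to_page(start));
	}
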
@@ -907,7 +920,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 	}
 
 	err = -EFAULT;
-	page = virt_to_page(data);
 	offset = offset_in_page(data);
 	len_max = PAGE_SIZE - offset;
 	len = ((to_write > len_max) ? len_max : to_write);
@@ -926,11 +938,11 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 			return -EFAULT;
 		}
 
+		page = pgv_to_page(data);
+		data += len;
 		flush_dcache_page(page);
 		get_page(page);
-		skb_fill_page_desc(skb,
-				nr_frags,
-				page++, offset, len);
+		skb_fill_page_desc(skb, nr_frags, page, offset, len);
 		to_write -= len;
 		offset = 0;
 		len_max = PAGE_SIZE;
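
The transmit path gets the matching fix: page translation moves inside the copy loop, and data now advances by len each pass, because with a possibly vmalloc'd frame each PAGE_SIZE chunk can map to an unrelated struct page, making the old page++ shortcut invalid. Only the first chunk may start mid-page (offset_in_page(data)); afterwards offset resets to 0. A condensed sketch of that decomposition, where process_chunk() is a hypothetical stand-in for the flush_dcache_page/get_page/skb_fill_page_desc work:

	void process_chunk(struct page *page, int offset, int len);
			/* hypothetical, not part of the patch */

	static void fill_chunks_sketch(char *data, int to_write)
	{
		int offset = offset_in_page(data);
		int len_max = PAGE_SIZE - offset;
		int len;

		while (to_write) {
			len = (to_write > len_max) ? len_max : to_write;
			/* translate THIS chunk's page, not page++ */
			process_chunk(pgv_to_page(data), offset, len);
			data += len;
			to_write -= len;
			offset = 0;	/* later chunks are page-aligned */
			len_max = PAGE_SIZE;
		}
	}
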
@@ -1610,9 +1622,11 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 		err = -EINVAL;
 		vnet_hdr_len = sizeof(vnet_hdr);
-		if ((len -= vnet_hdr_len) < 0)
+		if (len < vnet_hdr_len)
 			goto out_free;
 
+		len -= vnet_hdr_len;
+
 		if (skb_is_gso(skb)) {
 			struct skb_shared_info *sinfo = skb_shinfo(skb);
 
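
This hunk is a real bugfix, not restyling: len is a size_t in packet_recvmsg(), so the old (len -= vnet_hdr_len) < 0 test could never be true; a buffer shorter than the virtio header would wrap len around to a huge value instead of failing. Comparing before subtracting is the standard pattern for unsigned arithmetic. A condensed illustration, with types assumed to mirror the kernel code:

	/* BROKEN: with an unsigned len this branch is dead code and a
	 * short buffer wraps around instead of erroring out:
	 *
	 *	if ((len -= hdr_len) < 0)
	 *		return -EINVAL;
	 */
	static int consume_header_sketch(size_t len, size_t hdr_len)
	{
		if (len < hdr_len)	/* check first... */
			return -EINVAL;
		len -= hdr_len;		/* ...subtract after */
		return 0;
	}
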
@@ -1636,8 +1650,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
-			vnet_hdr.csum_start = skb->csum_start -
-							skb_headroom(skb);
+			vnet_hdr.csum_start = skb_checksum_start_offset(skb);
 			vnet_hdr.csum_offset = skb->csum_offset;
 		} /* else everything is zero */
 
@@ -1719,7 +1732,7 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
 	rcu_read_lock();
 	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
 	if (dev)
-		strlcpy(uaddr->sa_data, dev->name, 15);
+		strncpy(uaddr->sa_data, dev->name, 14);
 	else
 		memset(uaddr->sa_data, 0, 14);
 	rcu_read_unlock();
@@ -1742,6 +1755,7 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
 	sll->sll_family = AF_PACKET;
 	sll->sll_ifindex = po->ifindex;
 	sll->sll_protocol = po->num;
+	sll->sll_pkttype = 0;
 	rcu_read_lock();
 	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
 	if (dev) {
@@ -2322,37 +2336,70 @@ static const struct vm_operations_struct packet_mmap_ops = {
 	.close	= packet_mm_close,
 };
 
-static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
+static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
+			unsigned int len)
 {
 	int i;
 
 	for (i = 0; i < len; i++) {
-		if (likely(pg_vec[i]))
-			free_pages((unsigned long) pg_vec[i], order);
+		if (likely(pg_vec[i].buffer)) {
+			if (is_vmalloc_addr(pg_vec[i].buffer))
+				vfree(pg_vec[i].buffer);
+			else
+				free_pages((unsigned long)pg_vec[i].buffer,
+					   order);
+			pg_vec[i].buffer = NULL;
+		}
 	}
 	kfree(pg_vec);
 }
 
 static inline char *alloc_one_pg_vec_page(unsigned long order)
 {
-	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
+	char *buffer = NULL;
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
+			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+
+	buffer = (char *) __get_free_pages(gfp_flags, order);
+
+	if (buffer)
+		return buffer;
+
+	/*
+	 * __get_free_pages failed, fall back to vmalloc
+	 */
+	buffer = vzalloc((1 << order) * PAGE_SIZE);
 
-	return (char *) __get_free_pages(gfp_flags, order);
+	if (buffer)
+		return buffer;
+
+	/*
+	 * vmalloc failed, lets dig into swap here
+	 */
+	gfp_flags &= ~__GFP_NORETRY;
+	buffer = (char *)__get_free_pages(gfp_flags, order);
+	if (buffer)
+		return buffer;
+
+	/*
+	 * complete and utter failure
+	 */
+	return NULL;
 }
 
-static char **alloc_pg_vec(struct tpacket_req *req, int order)
+static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
 {
 	unsigned int block_nr = req->tp_block_nr;
-	char **pg_vec;
+	struct pgv *pg_vec;
 	int i;
 
-	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
+	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
 	if (unlikely(!pg_vec))
 		goto out;
 
 	for (i = 0; i < block_nr; i++) {
-		pg_vec[i] = alloc_one_pg_vec_page(order);
-		if (unlikely(!pg_vec[i]))
+		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
+		if (unlikely(!pg_vec[i].buffer))
 			goto out_free_pgvec;
 	}
 
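
The allocator now degrades gracefully through three tiers: a cheap __GFP_NORETRY page allocation that refuses to enter heavy reclaim, then vzalloc() for memory that is only virtually contiguous, and finally a retrying page allocation that is allowed to dig into swap. High-order contiguous allocations failing on long-running, fragmented systems is the motivation for the whole patch. The order itself comes from the caller; a paraphrased caller-side sketch (not the literal packet_set_ring() body):

	/* Each pg_vec entry holds one tp_block_size-byte block, so the
	 * page order follows from the block size; the vzalloc fallback
	 * of (1 << order) * PAGE_SIZE bytes is the same total size.
	 */
	static struct pgv *ring_alloc_sketch(struct tpacket_req *req)
	{
		int order = get_order(req->tp_block_size);

		return alloc_pg_vec(req, order);
	}
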
@@ -2368,7 +2415,7 @@ out_free_pgvec:
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		int closing, int tx_ring)
 {
-	char **pg_vec = NULL;
+	struct pgv *pg_vec = NULL;
 	struct packet_sock *po = pkt_sk(sk);
 	int was_running, order = 0;
 	struct packet_ring_buffer *rb;
@@ -2453,22 +2500,20 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 	mutex_lock(&po->pg_vec_lock);
 	if (closing || atomic_read(&po->mapped) == 0) {
 		err = 0;
-#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
 		spin_lock_bh(&rb_queue->lock);
-		pg_vec = XC(rb->pg_vec, pg_vec);
+		swap(rb->pg_vec, pg_vec);
 		rb->frame_max = (req->tp_frame_nr - 1);
 		rb->head = 0;
 		rb->frame_size = req->tp_frame_size;
 		spin_unlock_bh(&rb_queue->lock);
 
-		order = XC(rb->pg_vec_order, order);
-		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
+		swap(rb->pg_vec_order, order);
+		swap(rb->pg_vec_len, req->tp_block_nr);
 
 		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
 		po->prot_hook.func = (po->rx_ring.pg_vec) ?
 						tpacket_rcv : packet_rcv;
 		skb_queue_purge(rb_queue);
-#undef XC
 		if (atomic_read(&po->mapped))
 			pr_err("packet_mmap: vma is busy: %d\n",
 			       atomic_read(&po->mapped));
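
The homegrown XC() exchange macro, and its #undef, is replaced by the generic swap() from <linux/kernel.h>, which performs the same typed three-step exchange without the statement-expression trick. The net effect is identical: after the swaps, the local pg_vec/order/tp_block_nr hold the old ring's values, ready for the free_pg_vec() call on the way out. A tiny usage sketch under the standard kernel definition:

	/* swap() expands to roughly:
	 *	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
	 * After the call below, *old holds the previous ring so the
	 * caller can hand it to free_pg_vec().
	 */
	static void install_ring_sketch(struct packet_ring_buffer *rb,
					struct pgv **old)
	{
		swap(rb->pg_vec, *old);
	}
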
@@ -2530,15 +2575,17 @@ static int packet_mmap(struct file *file, struct socket *sock,
 			continue;
 
 		for (i = 0; i < rb->pg_vec_len; i++) {
-			struct page *page = virt_to_page(rb->pg_vec[i]);
+			struct page *page;
+			void *kaddr = rb->pg_vec[i].buffer;
 			int pg_num;
 
-			for (pg_num = 0; pg_num < rb->pg_vec_pages;
-					pg_num++, page++) {
+			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
+				page = pgv_to_page(kaddr);
 				err = vm_insert_page(vma, start, page);
 				if (unlikely(err))
 					goto out;
 				start += PAGE_SIZE;
+				kaddr += PAGE_SIZE;
 			}
 		}
 	}
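
packet_mmap() closes the loop: instead of computing one struct page and trusting page++ across the block, it keeps a kernel virtual cursor (kaddr) and translates each page on its own before vm_insert_page(), so both linear-mapped and vmalloc'd blocks map correctly into the user VMA. A self-contained sketch of the per-block walk, where kaddr and pages stand in for rb->pg_vec[i].buffer and rb->pg_vec_pages:

	/* Map one ring block into the user VMA a page at a time.
	 * vm_insert_page() installs exactly one page, so each iteration
	 * must translate the page under the current virtual cursor.
	 */
	static int map_block_sketch(struct vm_area_struct *vma,
				    unsigned long start,
				    char *kaddr, unsigned int pages)
	{
		unsigned int pg_num;
		int err;

		for (pg_num = 0; pg_num < pages; pg_num++) {
			err = vm_insert_page(vma, start, pgv_to_page(kaddr));
			if (unlikely(err))
				return err;
			start += PAGE_SIZE;
			kaddr += PAGE_SIZE;
		}
		return 0;
	}
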