path: root/net/packet/af_packet.c
Diffstat (limited to 'net/packet/af_packet.c')
-rw-r--r--	net/packet/af_packet.c	156
1 file changed, 100 insertions(+), 56 deletions(-)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 8298e676f5a0..91cb1d71f018 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -61,6 +61,7 @@
 #include <linux/kernel.h>
 #include <linux/kmod.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 #include <net/net_namespace.h>
 #include <net/ip.h>
 #include <net/protocol.h>
@@ -163,8 +164,13 @@ struct packet_mreq_max {
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		int closing, int tx_ring);
 
+#define PGV_FROM_VMALLOC 1
+struct pgv {
+	char *buffer;
+};
+
 struct packet_ring_buffer {
-	char			**pg_vec;
+	struct pgv		*pg_vec;
 	unsigned int		head;
 	unsigned int		frames_per_block;
 	unsigned int		frame_size;
@@ -217,6 +223,13 @@ struct packet_skb_cb {
 
 #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
 
+static inline __pure struct page *pgv_to_page(void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		return vmalloc_to_page(addr);
+	return virt_to_page(addr);
+}
+
 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 {
 	union {
@@ -229,11 +242,11 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 	switch (po->tp_version) {
 	case TPACKET_V1:
 		h.h1->tp_status = status;
-		flush_dcache_page(virt_to_page(&h.h1->tp_status));
+		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
 		break;
 	case TPACKET_V2:
 		h.h2->tp_status = status;
-		flush_dcache_page(virt_to_page(&h.h2->tp_status));
+		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		break;
 	default:
 		pr_err("TPACKET version not supported\n");
@@ -256,10 +269,10 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
 	h.raw = frame;
 	switch (po->tp_version) {
 	case TPACKET_V1:
-		flush_dcache_page(virt_to_page(&h.h1->tp_status));
+		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
 		return h.h1->tp_status;
 	case TPACKET_V2:
-		flush_dcache_page(virt_to_page(&h.h2->tp_status));
+		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		return h.h2->tp_status;
 	default:
 		pr_err("TPACKET version not supported\n");
@@ -283,7 +296,8 @@ static void *packet_lookup_frame(struct packet_sock *po,
 	pg_vec_pos = position / rb->frames_per_block;
 	frame_offset = position % rb->frames_per_block;
 
-	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
+	h.raw = rb->pg_vec[pg_vec_pos].buffer +
+		(frame_offset * rb->frame_size);
 
 	if (status != __packet_get_status(po, h.raw))
 		return NULL;
@@ -503,7 +517,8 @@ out_free:
 	return err;
 }
 
-static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
+static inline unsigned int run_filter(const struct sk_buff *skb,
+				      const struct sock *sk,
 				      unsigned int res)
 {
 	struct sk_filter *filter;
@@ -511,22 +526,22 @@ static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
 	rcu_read_lock_bh();
 	filter = rcu_dereference_bh(sk->sk_filter);
 	if (filter != NULL)
-		res = sk_run_filter(skb, filter->insns, filter->len);
+		res = sk_run_filter(skb, filter->insns);
 	rcu_read_unlock_bh();
 
 	return res;
 }
 
 /*
- This function makes lazy skb cloning in hope that most of packets
- are discarded by BPF.
-
- Note tricky part: we DO mangle shared skb! skb->data, skb->len
- and skb->cb are mangled. It works because (and until) packets
- falling here are owned by current CPU. Output packets are cloned
- by dev_queue_xmit_nit(), input packets are processed by net_bh
- sequencially, so that if we return skb to original state on exit,
- we will not harm anyone.
+ * This function makes lazy skb cloning in hope that most of packets
+ * are discarded by BPF.
+ *
+ * Note tricky part: we DO mangle shared skb! skb->data, skb->len
+ * and skb->cb are mangled. It works because (and until) packets
+ * falling here are owned by current CPU. Output packets are cloned
+ * by dev_queue_xmit_nit(), input packets are processed by net_bh
+ * sequencially, so that if we return skb to original state on exit,
+ * we will not harm anyone.
  */
 
 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
@@ -552,11 +567,11 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	if (dev->header_ops) {
 		/* The device has an explicit notion of ll header,
-		   exported to higher levels.
-
-		   Otherwise, the device hides datails of it frame
-		   structure, so that corresponding packet head
-		   never delivered to user.
+		 * exported to higher levels.
+		 *
+		 * Otherwise, the device hides details of its frame
+		 * structure, so that corresponding packet head is
+		 * never delivered to user.
 		 */
 		if (sk->sk_type != SOCK_DGRAM)
 			skb_push(skb, skb->data - skb_mac_header(skb));
@@ -791,17 +806,15 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	__packet_set_status(po, h.raw, status);
 	smp_mb();
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
 	{
-		struct page *p_start, *p_end;
-		u8 *h_end = h.raw + macoff + snaplen - 1;
-
-		p_start = virt_to_page(h.raw);
-		p_end = virt_to_page(h_end);
-		while (p_start <= p_end) {
-			flush_dcache_page(p_start);
-			p_start++;
-		}
+		u8 *start, *end;
+
+		end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen);
+		for (start = h.raw; start < end; start += PAGE_SIZE)
+			flush_dcache_page(pgv_to_page(start));
 	}
+#endif
 
 	sk->sk_data_ready(sk, 0);
 
@@ -907,7 +920,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 	}
 
 	err = -EFAULT;
-	page = virt_to_page(data);
 	offset = offset_in_page(data);
 	len_max = PAGE_SIZE - offset;
 	len = ((to_write > len_max) ? len_max : to_write);
@@ -926,11 +938,11 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 			return -EFAULT;
 		}
 
+		page = pgv_to_page(data);
+		data += len;
 		flush_dcache_page(page);
 		get_page(page);
-		skb_fill_page_desc(skb,
-				nr_frags,
-				page++, offset, len);
+		skb_fill_page_desc(skb, nr_frags, page, offset, len);
 		to_write -= len;
 		offset = 0;
 		len_max = PAGE_SIZE;
@@ -1638,8 +1650,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
-			vnet_hdr.csum_start = skb->csum_start -
-					      skb_headroom(skb);
+			vnet_hdr.csum_start = skb_checksum_start_offset(skb);
 			vnet_hdr.csum_offset = skb->csum_offset;
 		} /* else everything is zero */
 
@@ -2325,37 +2336,70 @@ static const struct vm_operations_struct packet_mmap_ops = {
 	.close	=	packet_mm_close,
 };
 
-static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
+static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
+			unsigned int len)
 {
 	int i;
 
 	for (i = 0; i < len; i++) {
-		if (likely(pg_vec[i]))
-			free_pages((unsigned long) pg_vec[i], order);
+		if (likely(pg_vec[i].buffer)) {
+			if (is_vmalloc_addr(pg_vec[i].buffer))
+				vfree(pg_vec[i].buffer);
+			else
+				free_pages((unsigned long)pg_vec[i].buffer,
+					   order);
+			pg_vec[i].buffer = NULL;
+		}
 	}
 	kfree(pg_vec);
 }
 
 static inline char *alloc_one_pg_vec_page(unsigned long order)
 {
-	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
+	char *buffer = NULL;
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
+			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
 
-	return (char *) __get_free_pages(gfp_flags, order);
+	buffer = (char *) __get_free_pages(gfp_flags, order);
+
+	if (buffer)
+		return buffer;
+
+	/*
+	 * __get_free_pages failed, fall back to vmalloc
+	 */
+	buffer = vzalloc((1 << order) * PAGE_SIZE);
+
+	if (buffer)
+		return buffer;
+
+	/*
+	 * vmalloc failed, lets dig into swap here
+	 */
+	gfp_flags &= ~__GFP_NORETRY;
+	buffer = (char *)__get_free_pages(gfp_flags, order);
+	if (buffer)
+		return buffer;
+
+	/*
+	 * complete and utter failure
+	 */
+	return NULL;
 }
 
-static char **alloc_pg_vec(struct tpacket_req *req, int order)
+static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
 {
 	unsigned int block_nr = req->tp_block_nr;
-	char **pg_vec;
+	struct pgv *pg_vec;
 	int i;
 
-	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
+	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
 	if (unlikely(!pg_vec))
 		goto out;
 
 	for (i = 0; i < block_nr; i++) {
-		pg_vec[i] = alloc_one_pg_vec_page(order);
-		if (unlikely(!pg_vec[i]))
+		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
+		if (unlikely(!pg_vec[i].buffer))
 			goto out_free_pgvec;
 	}
 
@@ -2371,7 +2415,7 @@ out_free_pgvec:
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		int closing, int tx_ring)
 {
-	char **pg_vec = NULL;
+	struct pgv *pg_vec = NULL;
 	struct packet_sock *po = pkt_sk(sk);
 	int was_running, order = 0;
 	struct packet_ring_buffer *rb;
@@ -2456,22 +2500,20 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 	mutex_lock(&po->pg_vec_lock);
 	if (closing || atomic_read(&po->mapped) == 0) {
 		err = 0;
-#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
 		spin_lock_bh(&rb_queue->lock);
-		pg_vec = XC(rb->pg_vec, pg_vec);
+		swap(rb->pg_vec, pg_vec);
 		rb->frame_max = (req->tp_frame_nr - 1);
 		rb->head = 0;
 		rb->frame_size = req->tp_frame_size;
 		spin_unlock_bh(&rb_queue->lock);
 
-		order = XC(rb->pg_vec_order, order);
-		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
+		swap(rb->pg_vec_order, order);
+		swap(rb->pg_vec_len, req->tp_block_nr);
 
 		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
 		po->prot_hook.func = (po->rx_ring.pg_vec) ?
 						tpacket_rcv : packet_rcv;
 		skb_queue_purge(rb_queue);
-#undef XC
 		if (atomic_read(&po->mapped))
 			pr_err("packet_mmap: vma is busy: %d\n",
 			       atomic_read(&po->mapped));
@@ -2533,15 +2575,17 @@ static int packet_mmap(struct file *file, struct socket *sock,
 			continue;
 
 		for (i = 0; i < rb->pg_vec_len; i++) {
-			struct page *page = virt_to_page(rb->pg_vec[i]);
+			struct page *page;
+			void *kaddr = rb->pg_vec[i].buffer;
 			int pg_num;
 
-			for (pg_num = 0; pg_num < rb->pg_vec_pages;
-					pg_num++, page++) {
+			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
+				page = pgv_to_page(kaddr);
 				err = vm_insert_page(vma, start, page);
 				if (unlikely(err))
 					goto out;
 				start += PAGE_SIZE;
+				kaddr += PAGE_SIZE;
 			}
 		}
 	}
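
For context, the ring that packet_set_ring()/alloc_pg_vec() build is the one userspace requests with the PACKET_RX_RING socket option and then mmap()s. The sketch below is not part of the patch; it is a minimal, hedged userspace example of that setup path, using only the long-standing AF_PACKET API (struct tpacket_req, PACKET_VERSION, PACKET_RX_RING). Each of the tp_block_nr blocks corresponds to one pg_vec entry in the kernel, which after this change may be vmalloc-backed when high-order contiguous pages are unavailable.

/* Hedged sketch: exercise the PACKET_RX_RING path served by packet_set_ring(). */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

int main(void)
{
	struct tpacket_req req;
	void *ring;
	int fd, ver = TPACKET_V2;

	/* Raw packet socket; needs CAP_NET_RAW (typically root) to run. */
	fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) < 0) {
		perror("PACKET_VERSION");
		return 1;
	}

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 4096;	/* one page per block: order-0 pg_vec entries */
	req.tp_frame_size = 2048;	/* two frames per block */
	req.tp_block_nr   = 64;		/* 64 pg_vec entries allocated by alloc_pg_vec() */
	req.tp_frame_nr   = req.tp_block_nr *
			    (req.tp_block_size / req.tp_frame_size);

	/* This setsockopt() ends up in packet_set_ring() in the kernel. */
	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0) {
		perror("PACKET_RX_RING");
		return 1;
	}

	/* Map the whole ring; packet_mmap() inserts the pg_vec pages here. */
	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	printf("ring mapped at %p\n", ring);
	munmap(ring, (size_t)req.tp_block_size * req.tp_block_nr);
	close(fd);
	return 0;
}

With the block and frame sizes chosen above, each block is a single page, so the allocation fallback in alloc_one_pg_vec_page() is rarely needed; larger tp_block_size values (higher order) are where the vzalloc fallback introduced by this patch matters.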