path: root/net/core
author     Linus Torvalds <torvalds@linux-foundation.org>   2014-12-11 17:27:06 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2014-12-11 17:27:06 -0500
commit     70e71ca0af244f48a5dcf56dc435243792e3a495 (patch)
tree       f7d9c4c4d9a857a00043e9bf6aa2d6f533a34778 /net/core
parent     bae41e45b7400496b9bf0c70c6004419d9987819 (diff)
parent     00c83b01d58068dfeb2e1351cca6fccf2a83fa8f (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:

 1) New offloading infrastructure and example 'rocker' driver for offloading
    of switching and routing to hardware.

    This work was done by a large group of dedicated individuals, not limited
    to: Scott Feldman, Jiri Pirko, Thomas Graf, John Fastabend, Jamal Hadi
    Salim, Andy Gospodarek, Florian Fainelli, Roopa Prabhu

 2) Start making the networking operate on IOV iterators instead of modifying
    iov objects in-situ during transfers.  Thanks to Al Viro and Herbert Xu.

 3) A set of new netlink interfaces for the TIPC stack, from Richard Alpe.

 4) Remove unnecessary looping during ipv6 routing lookups, from Martin
    KaFai Lau.

 5) Add PAUSE frame generation support to gianfar driver, from Matei
    Pavaluca.

 6) Allow for larger reordering levels in TCP, which are easily achievable
    in the real world right now, from Eric Dumazet.

 7) Add a variant of napi_schedule that doesn't need to disable cpu
    interrupts, from Eric Dumazet.

 8) Use a doubly linked list to optimize neigh_parms_release(), from
    Nicolas Dichtel.

 9) Various enhancements to the kernel BPF verifier, and allow eBPF programs
    to actually be attached to sockets.  From Alexei Starovoitov.

10) Support TSO/LSO in sunvnet driver, from David L Stevens.

11) Allow controlling ECN usage via routing metrics, from Florian Westphal.

12) Remote checksum offload, from Tom Herbert.

13) Add split-header receive, BQL, and xmit_more support to amd-xgbe driver,
    from Thomas Lendacky.

14) Add MPLS support to openvswitch, from Simon Horman.

15) Support wildcard tunnel endpoints in ipv6 tunnels, from Steffen Klassert.

16) Do gro flushes on a per-device basis using a timer, from Eric Dumazet.
    This tries to resolve the conflicting goals between the desired handling
    of bulk vs. RPC-like traffic.

17) Allow userspace to ask for the CPU upon which a packet was
    received/steered, via SO_INCOMING_CPU.  From Eric Dumazet.

18) Limit GSO packets to half the current congestion window, from Eric
    Dumazet.

19) Add a generic helper so that all drivers set their RSS keys in a
    consistent way, from Eric Dumazet.

20) Add xmit_more support to enic driver, from Govindarajulu Varadarajan.

21) Add VLAN packet scheduler action, from Jiri Pirko.

22) Support configurable RSS hash functions via ethtool, from Eyal Perry.

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1820 commits)
  Fix race condition between vxlan_sock_add and vxlan_sock_release
  net/macb: fix compilation warning for print_hex_dump() called with skb->mac_header
  net/mlx4: Add support for A0 steering
  net/mlx4: Refactor QUERY_PORT
  net/mlx4_core: Add explicit error message when rule doesn't meet configuration
  net/mlx4: Add A0 hybrid steering
  net/mlx4: Add mlx4_bitmap zone allocator
  net/mlx4: Add a check if there are too many reserved QPs
  net/mlx4: Change QP allocation scheme
  net/mlx4_core: Use tasklet for user-space CQ completion events
  net/mlx4_core: Mask out host side virtualization features for guests
  net/mlx4_en: Set csum level for encapsulated packets
  be2net: Export tunnel offloads only when a VxLAN tunnel is created
  gianfar: Fix dma check map error when DMA_API_DEBUG is enabled
  cxgb4/csiostor: Don't use MASTER_MUST for fw_hello call
  net: fec: only enable mdio interrupt before phy device link up
  net: fec: clear all interrupt events to support i.MX6SX
  net: fec: reset fep link status in suspend function
  net: sock: fix access via invalid file descriptor
  net: introduce helper macro for_each_cmsghdr
  ...
Diffstat (limited to 'net/core')
 -rw-r--r--  net/core/datagram.c         | 301
 -rw-r--r--  net/core/dev.c              | 172
 -rw-r--r--  net/core/dev_addr_lists.c   |   4
 -rw-r--r--  net/core/dev_ioctl.c        |   9
 -rw-r--r--  net/core/dst.c              |  24
 -rw-r--r--  net/core/ethtool.c          |  81
 -rw-r--r--  net/core/filter.c           |  97
 -rw-r--r--  net/core/iovec.c            |  47
 -rw-r--r--  net/core/link_watch.c       |   2
 -rw-r--r--  net/core/neighbour.c        | 279
 -rw-r--r--  net/core/net-sysfs.c        |  44
 -rw-r--r--  net/core/netpoll.c          |   4
 -rw-r--r--  net/core/pktgen.c           |   3
 -rw-r--r--  net/core/rtnetlink.c        | 154
 -rw-r--r--  net/core/scm.c              |   3
 -rw-r--r--  net/core/skbuff.c           | 347
 -rw-r--r--  net/core/sock.c             |  20
 -rw-r--r--  net/core/sysctl_net_core.c  |  21
 -rw-r--r--  net/core/utils.c            |   3
19 files changed, 985 insertions, 630 deletions
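
Before the per-file diffs, a brief illustration of one userspace-visible change called out in the pull message (item 17, implemented under net/core/sock.c in this merge): reading SO_INCOMING_CPU. This is a hedged sketch rather than code from the merge; the fallback #define value is taken from include/uapi/asm-generic/socket.h and the surrounding error handling is an assumption.

/* Hedged userspace sketch: query the CPU on which packets for this socket
 * were last received/steered (SO_INCOMING_CPU). The fallback #define and the
 * error handling are illustrative assumptions. */
#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_INCOMING_CPU
#define SO_INCOMING_CPU 49	/* value from include/uapi/asm-generic/socket.h */
#endif

static void report_incoming_cpu(int fd)
{
	int cpu = -1;
	socklen_t len = sizeof(cpu);

	if (getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len) == 0)
		printf("fd %d: last packet handled on CPU %d\n", fd, cpu);
	else
		perror("getsockopt(SO_INCOMING_CPU)");
}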
diff --git a/net/core/datagram.c b/net/core/datagram.c
index fdbc9a81d4c2..df493d68330c 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -49,6 +49,7 @@
49#include <linux/spinlock.h> 49#include <linux/spinlock.h>
50#include <linux/slab.h> 50#include <linux/slab.h>
51#include <linux/pagemap.h> 51#include <linux/pagemap.h>
52#include <linux/uio.h>
52 53
53#include <net/protocol.h> 54#include <net/protocol.h>
54#include <linux/skbuff.h> 55#include <linux/skbuff.h>
@@ -309,16 +310,14 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
309EXPORT_SYMBOL(skb_kill_datagram); 310EXPORT_SYMBOL(skb_kill_datagram);
310 311
311/** 312/**
312 * skb_copy_datagram_iovec - Copy a datagram to an iovec. 313 * skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
313 * @skb: buffer to copy 314 * @skb: buffer to copy
314 * @offset: offset in the buffer to start copying from 315 * @offset: offset in the buffer to start copying from
315 * @to: io vector to copy to 316 * @to: iovec iterator to copy to
316 * @len: amount of data to copy from buffer to iovec 317 * @len: amount of data to copy from buffer to iovec
317 *
318 * Note: the iovec is modified during the copy.
319 */ 318 */
320int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, 319int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
321 struct iovec *to, int len) 320 struct iov_iter *to, int len)
322{ 321{
323 int start = skb_headlen(skb); 322 int start = skb_headlen(skb);
324 int i, copy = start - offset; 323 int i, copy = start - offset;
@@ -330,8 +329,8 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
330 if (copy > 0) { 329 if (copy > 0) {
331 if (copy > len) 330 if (copy > len)
332 copy = len; 331 copy = len;
333 if (memcpy_toiovec(to, skb->data + offset, copy)) 332 if (copy_to_iter(skb->data + offset, copy, to) != copy)
334 goto fault; 333 goto short_copy;
335 if ((len -= copy) == 0) 334 if ((len -= copy) == 0)
336 return 0; 335 return 0;
337 offset += copy; 336 offset += copy;
@@ -346,18 +345,12 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
346 345
347 end = start + skb_frag_size(frag); 346 end = start + skb_frag_size(frag);
348 if ((copy = end - offset) > 0) { 347 if ((copy = end - offset) > 0) {
349 int err;
350 u8 *vaddr;
351 struct page *page = skb_frag_page(frag);
352
353 if (copy > len) 348 if (copy > len)
354 copy = len; 349 copy = len;
355 vaddr = kmap(page); 350 if (copy_page_to_iter(skb_frag_page(frag),
356 err = memcpy_toiovec(to, vaddr + frag->page_offset + 351 frag->page_offset + offset -
357 offset - start, copy); 352 start, copy, to) != copy)
358 kunmap(page); 353 goto short_copy;
359 if (err)
360 goto fault;
361 if (!(len -= copy)) 354 if (!(len -= copy))
362 return 0; 355 return 0;
363 offset += copy; 356 offset += copy;
@@ -374,9 +367,8 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
374 if ((copy = end - offset) > 0) { 367 if ((copy = end - offset) > 0) {
375 if (copy > len) 368 if (copy > len)
376 copy = len; 369 copy = len;
377 if (skb_copy_datagram_iovec(frag_iter, 370 if (skb_copy_datagram_iter(frag_iter, offset - start,
378 offset - start, 371 to, copy))
379 to, copy))
380 goto fault; 372 goto fault;
381 if ((len -= copy) == 0) 373 if ((len -= copy) == 0)
382 return 0; 374 return 0;
@@ -387,113 +379,33 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
387 if (!len) 379 if (!len)
388 return 0; 380 return 0;
389 381
382 /* This is not really a user copy fault, but rather someone
383 * gave us a bogus length on the skb. We should probably
384 * print a warning here as it may indicate a kernel bug.
385 */
386
390fault: 387fault:
391 return -EFAULT; 388 return -EFAULT;
392}
393EXPORT_SYMBOL(skb_copy_datagram_iovec);
394 389
395/** 390short_copy:
396 * skb_copy_datagram_const_iovec - Copy a datagram to an iovec. 391 if (iov_iter_count(to))
397 * @skb: buffer to copy 392 goto fault;
398 * @offset: offset in the buffer to start copying from
399 * @to: io vector to copy to
400 * @to_offset: offset in the io vector to start copying to
401 * @len: amount of data to copy from buffer to iovec
402 *
403 * Returns 0 or -EFAULT.
404 * Note: the iovec is not modified during the copy.
405 */
406int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
407 const struct iovec *to, int to_offset,
408 int len)
409{
410 int start = skb_headlen(skb);
411 int i, copy = start - offset;
412 struct sk_buff *frag_iter;
413 393
414 /* Copy header. */ 394 return 0;
415 if (copy > 0) {
416 if (copy > len)
417 copy = len;
418 if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
419 goto fault;
420 if ((len -= copy) == 0)
421 return 0;
422 offset += copy;
423 to_offset += copy;
424 }
425
426 /* Copy paged appendix. Hmm... why does this look so complicated? */
427 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
428 int end;
429 const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
430
431 WARN_ON(start > offset + len);
432
433 end = start + skb_frag_size(frag);
434 if ((copy = end - offset) > 0) {
435 int err;
436 u8 *vaddr;
437 struct page *page = skb_frag_page(frag);
438
439 if (copy > len)
440 copy = len;
441 vaddr = kmap(page);
442 err = memcpy_toiovecend(to, vaddr + frag->page_offset +
443 offset - start, to_offset, copy);
444 kunmap(page);
445 if (err)
446 goto fault;
447 if (!(len -= copy))
448 return 0;
449 offset += copy;
450 to_offset += copy;
451 }
452 start = end;
453 }
454
455 skb_walk_frags(skb, frag_iter) {
456 int end;
457
458 WARN_ON(start > offset + len);
459
460 end = start + frag_iter->len;
461 if ((copy = end - offset) > 0) {
462 if (copy > len)
463 copy = len;
464 if (skb_copy_datagram_const_iovec(frag_iter,
465 offset - start,
466 to, to_offset,
467 copy))
468 goto fault;
469 if ((len -= copy) == 0)
470 return 0;
471 offset += copy;
472 to_offset += copy;
473 }
474 start = end;
475 }
476 if (!len)
477 return 0;
478
479fault:
480 return -EFAULT;
481} 395}
482EXPORT_SYMBOL(skb_copy_datagram_const_iovec); 396EXPORT_SYMBOL(skb_copy_datagram_iter);
483 397
484/** 398/**
485 * skb_copy_datagram_from_iovec - Copy a datagram from an iovec. 399 * skb_copy_datagram_from_iter - Copy a datagram from an iov_iter.
486 * @skb: buffer to copy 400 * @skb: buffer to copy
487 * @offset: offset in the buffer to start copying to 401 * @offset: offset in the buffer to start copying to
488 * @from: io vector to copy to 402 * @from: the copy source
489 * @from_offset: offset in the io vector to start copying from
490 * @len: amount of data to copy to buffer from iovec 403 * @len: amount of data to copy to buffer from iovec
491 * 404 *
492 * Returns 0 or -EFAULT. 405 * Returns 0 or -EFAULT.
493 * Note: the iovec is not modified during the copy.
494 */ 406 */
495int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, 407int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
496 const struct iovec *from, int from_offset, 408 struct iov_iter *from,
497 int len) 409 int len)
498{ 410{
499 int start = skb_headlen(skb); 411 int start = skb_headlen(skb);
@@ -504,13 +416,11 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
504 if (copy > 0) { 416 if (copy > 0) {
505 if (copy > len) 417 if (copy > len)
506 copy = len; 418 copy = len;
507 if (memcpy_fromiovecend(skb->data + offset, from, from_offset, 419 if (copy_from_iter(skb->data + offset, copy, from) != copy)
508 copy))
509 goto fault; 420 goto fault;
510 if ((len -= copy) == 0) 421 if ((len -= copy) == 0)
511 return 0; 422 return 0;
512 offset += copy; 423 offset += copy;
513 from_offset += copy;
514 } 424 }
515 425
516 /* Copy paged appendix. Hmm... why does this look so complicated? */ 426 /* Copy paged appendix. Hmm... why does this look so complicated? */
@@ -522,24 +432,19 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
522 432
523 end = start + skb_frag_size(frag); 433 end = start + skb_frag_size(frag);
524 if ((copy = end - offset) > 0) { 434 if ((copy = end - offset) > 0) {
525 int err; 435 size_t copied;
526 u8 *vaddr;
527 struct page *page = skb_frag_page(frag);
528 436
529 if (copy > len) 437 if (copy > len)
530 copy = len; 438 copy = len;
531 vaddr = kmap(page); 439 copied = copy_page_from_iter(skb_frag_page(frag),
532 err = memcpy_fromiovecend(vaddr + frag->page_offset + 440 frag->page_offset + offset - start,
533 offset - start, 441 copy, from);
534 from, from_offset, copy); 442 if (copied != copy)
535 kunmap(page);
536 if (err)
537 goto fault; 443 goto fault;
538 444
539 if (!(len -= copy)) 445 if (!(len -= copy))
540 return 0; 446 return 0;
541 offset += copy; 447 offset += copy;
542 from_offset += copy;
543 } 448 }
544 start = end; 449 start = end;
545 } 450 }
@@ -553,16 +458,13 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
553 if ((copy = end - offset) > 0) { 458 if ((copy = end - offset) > 0) {
554 if (copy > len) 459 if (copy > len)
555 copy = len; 460 copy = len;
556 if (skb_copy_datagram_from_iovec(frag_iter, 461 if (skb_copy_datagram_from_iter(frag_iter,
557 offset - start, 462 offset - start,
558 from, 463 from, copy))
559 from_offset,
560 copy))
561 goto fault; 464 goto fault;
562 if ((len -= copy) == 0) 465 if ((len -= copy) == 0)
563 return 0; 466 return 0;
564 offset += copy; 467 offset += copy;
565 from_offset += copy;
566 } 468 }
567 start = end; 469 start = end;
568 } 470 }
@@ -572,101 +474,82 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
572fault: 474fault:
573 return -EFAULT; 475 return -EFAULT;
574} 476}
575EXPORT_SYMBOL(skb_copy_datagram_from_iovec); 477EXPORT_SYMBOL(skb_copy_datagram_from_iter);
576 478
577/** 479/**
578 * zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec 480 * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
579 * @skb: buffer to copy 481 * @skb: buffer to copy
580 * @from: io vector to copy from 482 * @from: the source to copy from
581 * @offset: offset in the io vector to start copying from
582 * @count: amount of vectors to copy to buffer from
583 * 483 *
584 * The function will first copy up to headlen, and then pin the userspace 484 * The function will first copy up to headlen, and then pin the userspace
585 * pages and build frags through them. 485 * pages and build frags through them.
586 * 486 *
587 * Returns 0, -EFAULT or -EMSGSIZE. 487 * Returns 0, -EFAULT or -EMSGSIZE.
588 * Note: the iovec is not modified during the copy
589 */ 488 */
590int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, 489int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
591 int offset, size_t count)
592{ 490{
593 int len = iov_length(from, count) - offset; 491 int len = iov_iter_count(from);
594 int copy = min_t(int, skb_headlen(skb), len); 492 int copy = min_t(int, skb_headlen(skb), len);
595 int size; 493 int frag = 0;
596 int i = 0;
597 494
598 /* copy up to skb headlen */ 495 /* copy up to skb headlen */
599 if (skb_copy_datagram_from_iovec(skb, 0, from, offset, copy)) 496 if (skb_copy_datagram_from_iter(skb, 0, from, copy))
600 return -EFAULT; 497 return -EFAULT;
601 498
602 if (len == copy) 499 while (iov_iter_count(from)) {
603 return 0; 500 struct page *pages[MAX_SKB_FRAGS];
604 501 size_t start;
605 offset += copy; 502 ssize_t copied;
606 while (count--) {
607 struct page *page[MAX_SKB_FRAGS];
608 int num_pages;
609 unsigned long base;
610 unsigned long truesize; 503 unsigned long truesize;
504 int n = 0;
611 505
612 /* Skip over from offset and copied */ 506 if (frag == MAX_SKB_FRAGS)
613 if (offset >= from->iov_len) {
614 offset -= from->iov_len;
615 ++from;
616 continue;
617 }
618 len = from->iov_len - offset;
619 base = (unsigned long)from->iov_base + offset;
620 size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
621 if (i + size > MAX_SKB_FRAGS)
622 return -EMSGSIZE; 507 return -EMSGSIZE;
623 num_pages = get_user_pages_fast(base, size, 0, &page[i]); 508
624 if (num_pages != size) { 509 copied = iov_iter_get_pages(from, pages, ~0U,
625 release_pages(&page[i], num_pages, 0); 510 MAX_SKB_FRAGS - frag, &start);
511 if (copied < 0)
626 return -EFAULT; 512 return -EFAULT;
627 } 513
628 truesize = size * PAGE_SIZE; 514 iov_iter_advance(from, copied);
629 skb->data_len += len; 515
630 skb->len += len; 516 truesize = PAGE_ALIGN(copied + start);
517 skb->data_len += copied;
518 skb->len += copied;
631 skb->truesize += truesize; 519 skb->truesize += truesize;
632 atomic_add(truesize, &skb->sk->sk_wmem_alloc); 520 atomic_add(truesize, &skb->sk->sk_wmem_alloc);
633 while (len) { 521 while (copied) {
634 int off = base & ~PAGE_MASK; 522 int size = min_t(int, copied, PAGE_SIZE - start);
635 int size = min_t(int, len, PAGE_SIZE - off); 523 skb_fill_page_desc(skb, frag++, pages[n], start, size);
636 skb_fill_page_desc(skb, i, page[i], off, size); 524 start = 0;
637 base += size; 525 copied -= size;
638 len -= size; 526 n++;
639 i++;
640 } 527 }
641 offset = 0;
642 ++from;
643 } 528 }
644 return 0; 529 return 0;
645} 530}
646EXPORT_SYMBOL(zerocopy_sg_from_iovec); 531EXPORT_SYMBOL(zerocopy_sg_from_iter);
647 532
648static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, 533static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
649 u8 __user *to, int len, 534 struct iov_iter *to, int len,
650 __wsum *csump) 535 __wsum *csump)
651{ 536{
652 int start = skb_headlen(skb); 537 int start = skb_headlen(skb);
653 int i, copy = start - offset; 538 int i, copy = start - offset;
654 struct sk_buff *frag_iter; 539 struct sk_buff *frag_iter;
655 int pos = 0; 540 int pos = 0;
541 int n;
656 542
657 /* Copy header. */ 543 /* Copy header. */
658 if (copy > 0) { 544 if (copy > 0) {
659 int err = 0;
660 if (copy > len) 545 if (copy > len)
661 copy = len; 546 copy = len;
662 *csump = csum_and_copy_to_user(skb->data + offset, to, copy, 547 n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to);
663 *csump, &err); 548 if (n != copy)
664 if (err)
665 goto fault; 549 goto fault;
666 if ((len -= copy) == 0) 550 if ((len -= copy) == 0)
667 return 0; 551 return 0;
668 offset += copy; 552 offset += copy;
669 to += copy;
670 pos = copy; 553 pos = copy;
671 } 554 }
672 555
@@ -678,26 +561,22 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
678 561
679 end = start + skb_frag_size(frag); 562 end = start + skb_frag_size(frag);
680 if ((copy = end - offset) > 0) { 563 if ((copy = end - offset) > 0) {
681 __wsum csum2; 564 __wsum csum2 = 0;
682 int err = 0;
683 u8 *vaddr;
684 struct page *page = skb_frag_page(frag); 565 struct page *page = skb_frag_page(frag);
566 u8 *vaddr = kmap(page);
685 567
686 if (copy > len) 568 if (copy > len)
687 copy = len; 569 copy = len;
688 vaddr = kmap(page); 570 n = csum_and_copy_to_iter(vaddr + frag->page_offset +
689 csum2 = csum_and_copy_to_user(vaddr + 571 offset - start, copy,
690 frag->page_offset + 572 &csum2, to);
691 offset - start,
692 to, copy, 0, &err);
693 kunmap(page); 573 kunmap(page);
694 if (err) 574 if (n != copy)
695 goto fault; 575 goto fault;
696 *csump = csum_block_add(*csump, csum2, pos); 576 *csump = csum_block_add(*csump, csum2, pos);
697 if (!(len -= copy)) 577 if (!(len -= copy))
698 return 0; 578 return 0;
699 offset += copy; 579 offset += copy;
700 to += copy;
701 pos += copy; 580 pos += copy;
702 } 581 }
703 start = end; 582 start = end;
@@ -722,7 +601,6 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
722 if ((len -= copy) == 0) 601 if ((len -= copy) == 0)
723 return 0; 602 return 0;
724 offset += copy; 603 offset += copy;
725 to += copy;
726 pos += copy; 604 pos += copy;
727 } 605 }
728 start = end; 606 start = end;
@@ -775,20 +653,19 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb)
775EXPORT_SYMBOL(__skb_checksum_complete); 653EXPORT_SYMBOL(__skb_checksum_complete);
776 654
777/** 655/**
778 * skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec. 656 * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec.
779 * @skb: skbuff 657 * @skb: skbuff
780 * @hlen: hardware length 658 * @hlen: hardware length
781 * @iov: io vector 659 * @msg: destination
782 * 660 *
783 * Caller _must_ check that skb will fit to this iovec. 661 * Caller _must_ check that skb will fit to this iovec.
784 * 662 *
785 * Returns: 0 - success. 663 * Returns: 0 - success.
786 * -EINVAL - checksum failure. 664 * -EINVAL - checksum failure.
787 * -EFAULT - fault during copy. Beware, in this case iovec 665 * -EFAULT - fault during copy.
788 * can be modified!
789 */ 666 */
790int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, 667int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
791 int hlen, struct iovec *iov) 668 int hlen, struct msghdr *msg)
792{ 669{
793 __wsum csum; 670 __wsum csum;
794 int chunk = skb->len - hlen; 671 int chunk = skb->len - hlen;
@@ -796,28 +673,20 @@ int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
796 if (!chunk) 673 if (!chunk)
797 return 0; 674 return 0;
798 675
799 /* Skip filled elements. 676 if (iov_iter_count(&msg->msg_iter) < chunk) {
800 * Pretty silly, look at memcpy_toiovec, though 8)
801 */
802 while (!iov->iov_len)
803 iov++;
804
805 if (iov->iov_len < chunk) {
806 if (__skb_checksum_complete(skb)) 677 if (__skb_checksum_complete(skb))
807 goto csum_error; 678 goto csum_error;
808 if (skb_copy_datagram_iovec(skb, hlen, iov, chunk)) 679 if (skb_copy_datagram_msg(skb, hlen, msg, chunk))
809 goto fault; 680 goto fault;
810 } else { 681 } else {
811 csum = csum_partial(skb->data, hlen, skb->csum); 682 csum = csum_partial(skb->data, hlen, skb->csum);
812 if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base, 683 if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter,
813 chunk, &csum)) 684 chunk, &csum))
814 goto fault; 685 goto fault;
815 if (csum_fold(csum)) 686 if (csum_fold(csum))
816 goto csum_error; 687 goto csum_error;
817 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) 688 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
818 netdev_rx_csum_fault(skb->dev); 689 netdev_rx_csum_fault(skb->dev);
819 iov->iov_len -= chunk;
820 iov->iov_base += chunk;
821 } 690 }
822 return 0; 691 return 0;
823csum_error: 692csum_error:
@@ -825,7 +694,7 @@ csum_error:
825fault: 694fault:
826 return -EFAULT; 695 return -EFAULT;
827} 696}
828EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec); 697EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
829 698
830/** 699/**
831 * datagram_poll - generic datagram poll 700 * datagram_poll - generic datagram poll
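
The datagram.c conversion above replaces the iovec-walking helpers with iov_iter based ones. As a hedged sketch of how a protocol's recvmsg path is expected to call them, using the msghdr wrapper skb_copy_datagram_msg() that the copy-and-checksum hunk above already references; everything outside the copy helper (function name, truncation handling) is simplified for illustration and not part of the diff.

/* Hedged kernel-side sketch: a recvmsg-style copy using the iov_iter based
 * helpers introduced above. skb_copy_datagram_msg() is the msghdr wrapper
 * around skb_copy_datagram_iter(skb, offset, &msg->msg_iter, len); the
 * iterator inside msg is advanced as data is copied. */
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/socket.h>

static int example_copy_to_user(struct sk_buff *skb, struct msghdr *msg,
				size_t len)
{
	size_t copied = min_t(size_t, len, skb->len);
	int err;

	if (copied < skb->len)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	return err ? err : copied;
}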
diff --git a/net/core/dev.c b/net/core/dev.c
index 3acff0974560..f411c28d0a66 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -118,6 +118,7 @@
118#include <linux/if_vlan.h> 118#include <linux/if_vlan.h>
119#include <linux/ip.h> 119#include <linux/ip.h>
120#include <net/ip.h> 120#include <net/ip.h>
121#include <net/mpls.h>
121#include <linux/ipv6.h> 122#include <linux/ipv6.h>
122#include <linux/in.h> 123#include <linux/in.h>
123#include <linux/jhash.h> 124#include <linux/jhash.h>
@@ -133,6 +134,7 @@
133#include <linux/vmalloc.h> 134#include <linux/vmalloc.h>
134#include <linux/if_macvlan.h> 135#include <linux/if_macvlan.h>
135#include <linux/errqueue.h> 136#include <linux/errqueue.h>
137#include <linux/hrtimer.h>
136 138
137#include "net-sysfs.h" 139#include "net-sysfs.h"
138 140
@@ -1435,22 +1437,17 @@ EXPORT_SYMBOL(dev_close);
1435 */ 1437 */
1436void dev_disable_lro(struct net_device *dev) 1438void dev_disable_lro(struct net_device *dev)
1437{ 1439{
1438 /* 1440 struct net_device *lower_dev;
1439 * If we're trying to disable lro on a vlan device 1441 struct list_head *iter;
1440 * use the underlying physical device instead
1441 */
1442 if (is_vlan_dev(dev))
1443 dev = vlan_dev_real_dev(dev);
1444
1445 /* the same for macvlan devices */
1446 if (netif_is_macvlan(dev))
1447 dev = macvlan_dev_real_dev(dev);
1448 1442
1449 dev->wanted_features &= ~NETIF_F_LRO; 1443 dev->wanted_features &= ~NETIF_F_LRO;
1450 netdev_update_features(dev); 1444 netdev_update_features(dev);
1451 1445
1452 if (unlikely(dev->features & NETIF_F_LRO)) 1446 if (unlikely(dev->features & NETIF_F_LRO))
1453 netdev_WARN(dev, "failed to disable LRO!\n"); 1447 netdev_WARN(dev, "failed to disable LRO!\n");
1448
1449 netdev_for_each_lower_dev(dev, lower_dev, iter)
1450 dev_disable_lro(lower_dev);
1454} 1451}
1455EXPORT_SYMBOL(dev_disable_lro); 1452EXPORT_SYMBOL(dev_disable_lro);
1456 1453
@@ -2530,7 +2527,7 @@ static netdev_features_t net_mpls_features(struct sk_buff *skb,
2530 netdev_features_t features, 2527 netdev_features_t features,
2531 __be16 type) 2528 __be16 type)
2532{ 2529{
2533 if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC)) 2530 if (eth_p_mpls(type))
2534 features &= skb->dev->mpls_features; 2531 features &= skb->dev->mpls_features;
2535 2532
2536 return features; 2533 return features;
@@ -2647,12 +2644,8 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2647 netdev_features_t features) 2644 netdev_features_t features)
2648{ 2645{
2649 if (vlan_tx_tag_present(skb) && 2646 if (vlan_tx_tag_present(skb) &&
2650 !vlan_hw_offload_capable(features, skb->vlan_proto)) { 2647 !vlan_hw_offload_capable(features, skb->vlan_proto))
2651 skb = __vlan_put_tag(skb, skb->vlan_proto, 2648 skb = __vlan_hwaccel_push_inside(skb);
2652 vlan_tx_tag_get(skb));
2653 if (skb)
2654 skb->vlan_tci = 0;
2655 }
2656 return skb; 2649 return skb;
2657} 2650}
2658 2651
@@ -3304,7 +3297,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3304 rps_lock(sd); 3297 rps_lock(sd);
3305 qlen = skb_queue_len(&sd->input_pkt_queue); 3298 qlen = skb_queue_len(&sd->input_pkt_queue);
3306 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { 3299 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3307 if (skb_queue_len(&sd->input_pkt_queue)) { 3300 if (qlen) {
3308enqueue: 3301enqueue:
3309 __skb_queue_tail(&sd->input_pkt_queue, skb); 3302 __skb_queue_tail(&sd->input_pkt_queue, skb);
3310 input_queue_tail_incr_save(sd, qtail); 3303 input_queue_tail_incr_save(sd, qtail);
@@ -4179,7 +4172,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
4179 struct sk_buff *skb = napi->skb; 4172 struct sk_buff *skb = napi->skb;
4180 4173
4181 if (!skb) { 4174 if (!skb) {
4182 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); 4175 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4183 napi->skb = skb; 4176 napi->skb = skb;
4184 } 4177 }
4185 return skb; 4178 return skb;
@@ -4316,20 +4309,28 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4316 local_irq_enable(); 4309 local_irq_enable();
4317} 4310}
4318 4311
4312static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4313{
4314#ifdef CONFIG_RPS
4315 return sd->rps_ipi_list != NULL;
4316#else
4317 return false;
4318#endif
4319}
4320
4319static int process_backlog(struct napi_struct *napi, int quota) 4321static int process_backlog(struct napi_struct *napi, int quota)
4320{ 4322{
4321 int work = 0; 4323 int work = 0;
4322 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); 4324 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4323 4325
4324#ifdef CONFIG_RPS
4325 /* Check if we have pending ipi, its better to send them now, 4326 /* Check if we have pending ipi, its better to send them now,
4326 * not waiting net_rx_action() end. 4327 * not waiting net_rx_action() end.
4327 */ 4328 */
4328 if (sd->rps_ipi_list) { 4329 if (sd_has_rps_ipi_waiting(sd)) {
4329 local_irq_disable(); 4330 local_irq_disable();
4330 net_rps_action_and_irq_enable(sd); 4331 net_rps_action_and_irq_enable(sd);
4331 } 4332 }
4332#endif 4333
4333 napi->weight = weight_p; 4334 napi->weight = weight_p;
4334 local_irq_disable(); 4335 local_irq_disable();
4335 while (1) { 4336 while (1) {
@@ -4356,7 +4357,6 @@ static int process_backlog(struct napi_struct *napi, int quota)
4356 * We can use a plain write instead of clear_bit(), 4357 * We can use a plain write instead of clear_bit(),
4357 * and we dont need an smp_mb() memory barrier. 4358 * and we dont need an smp_mb() memory barrier.
4358 */ 4359 */
4359 list_del(&napi->poll_list);
4360 napi->state = 0; 4360 napi->state = 0;
4361 rps_unlock(sd); 4361 rps_unlock(sd);
4362 4362
@@ -4376,7 +4376,8 @@ static int process_backlog(struct napi_struct *napi, int quota)
4376 * __napi_schedule - schedule for receive 4376 * __napi_schedule - schedule for receive
4377 * @n: entry to schedule 4377 * @n: entry to schedule
4378 * 4378 *
4379 * The entry's receive function will be scheduled to run 4379 * The entry's receive function will be scheduled to run.
4380 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4380 */ 4381 */
4381void __napi_schedule(struct napi_struct *n) 4382void __napi_schedule(struct napi_struct *n)
4382{ 4383{
@@ -4388,18 +4389,29 @@ void __napi_schedule(struct napi_struct *n)
4388} 4389}
4389EXPORT_SYMBOL(__napi_schedule); 4390EXPORT_SYMBOL(__napi_schedule);
4390 4391
4392/**
4393 * __napi_schedule_irqoff - schedule for receive
4394 * @n: entry to schedule
4395 *
4396 * Variant of __napi_schedule() assuming hard irqs are masked
4397 */
4398void __napi_schedule_irqoff(struct napi_struct *n)
4399{
4400 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4401}
4402EXPORT_SYMBOL(__napi_schedule_irqoff);
4403
4391void __napi_complete(struct napi_struct *n) 4404void __napi_complete(struct napi_struct *n)
4392{ 4405{
4393 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); 4406 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4394 BUG_ON(n->gro_list);
4395 4407
4396 list_del(&n->poll_list); 4408 list_del_init(&n->poll_list);
4397 smp_mb__before_atomic(); 4409 smp_mb__before_atomic();
4398 clear_bit(NAPI_STATE_SCHED, &n->state); 4410 clear_bit(NAPI_STATE_SCHED, &n->state);
4399} 4411}
4400EXPORT_SYMBOL(__napi_complete); 4412EXPORT_SYMBOL(__napi_complete);
4401 4413
4402void napi_complete(struct napi_struct *n) 4414void napi_complete_done(struct napi_struct *n, int work_done)
4403{ 4415{
4404 unsigned long flags; 4416 unsigned long flags;
4405 4417
@@ -4410,12 +4422,28 @@ void napi_complete(struct napi_struct *n)
4410 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) 4422 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4411 return; 4423 return;
4412 4424
4413 napi_gro_flush(n, false); 4425 if (n->gro_list) {
4414 local_irq_save(flags); 4426 unsigned long timeout = 0;
4415 __napi_complete(n); 4427
4416 local_irq_restore(flags); 4428 if (work_done)
4429 timeout = n->dev->gro_flush_timeout;
4430
4431 if (timeout)
4432 hrtimer_start(&n->timer, ns_to_ktime(timeout),
4433 HRTIMER_MODE_REL_PINNED);
4434 else
4435 napi_gro_flush(n, false);
4436 }
4437 if (likely(list_empty(&n->poll_list))) {
4438 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4439 } else {
4440 /* If n->poll_list is not empty, we need to mask irqs */
4441 local_irq_save(flags);
4442 __napi_complete(n);
4443 local_irq_restore(flags);
4444 }
4417} 4445}
4418EXPORT_SYMBOL(napi_complete); 4446EXPORT_SYMBOL(napi_complete_done);
4419 4447
4420/* must be called under rcu_read_lock(), as we dont take a reference */ 4448/* must be called under rcu_read_lock(), as we dont take a reference */
4421struct napi_struct *napi_by_id(unsigned int napi_id) 4449struct napi_struct *napi_by_id(unsigned int napi_id)
@@ -4469,10 +4497,23 @@ void napi_hash_del(struct napi_struct *napi)
4469} 4497}
4470EXPORT_SYMBOL_GPL(napi_hash_del); 4498EXPORT_SYMBOL_GPL(napi_hash_del);
4471 4499
4500static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4501{
4502 struct napi_struct *napi;
4503
4504 napi = container_of(timer, struct napi_struct, timer);
4505 if (napi->gro_list)
4506 napi_schedule(napi);
4507
4508 return HRTIMER_NORESTART;
4509}
4510
4472void netif_napi_add(struct net_device *dev, struct napi_struct *napi, 4511void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4473 int (*poll)(struct napi_struct *, int), int weight) 4512 int (*poll)(struct napi_struct *, int), int weight)
4474{ 4513{
4475 INIT_LIST_HEAD(&napi->poll_list); 4514 INIT_LIST_HEAD(&napi->poll_list);
4515 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4516 napi->timer.function = napi_watchdog;
4476 napi->gro_count = 0; 4517 napi->gro_count = 0;
4477 napi->gro_list = NULL; 4518 napi->gro_list = NULL;
4478 napi->skb = NULL; 4519 napi->skb = NULL;
@@ -4491,6 +4532,20 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4491} 4532}
4492EXPORT_SYMBOL(netif_napi_add); 4533EXPORT_SYMBOL(netif_napi_add);
4493 4534
4535void napi_disable(struct napi_struct *n)
4536{
4537 might_sleep();
4538 set_bit(NAPI_STATE_DISABLE, &n->state);
4539
4540 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4541 msleep(1);
4542
4543 hrtimer_cancel(&n->timer);
4544
4545 clear_bit(NAPI_STATE_DISABLE, &n->state);
4546}
4547EXPORT_SYMBOL(napi_disable);
4548
4494void netif_napi_del(struct napi_struct *napi) 4549void netif_napi_del(struct napi_struct *napi)
4495{ 4550{
4496 list_del_init(&napi->dev_list); 4551 list_del_init(&napi->dev_list);
@@ -4507,29 +4562,28 @@ static void net_rx_action(struct softirq_action *h)
4507 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 4562 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4508 unsigned long time_limit = jiffies + 2; 4563 unsigned long time_limit = jiffies + 2;
4509 int budget = netdev_budget; 4564 int budget = netdev_budget;
4565 LIST_HEAD(list);
4566 LIST_HEAD(repoll);
4510 void *have; 4567 void *have;
4511 4568
4512 local_irq_disable(); 4569 local_irq_disable();
4570 list_splice_init(&sd->poll_list, &list);
4571 local_irq_enable();
4513 4572
4514 while (!list_empty(&sd->poll_list)) { 4573 while (!list_empty(&list)) {
4515 struct napi_struct *n; 4574 struct napi_struct *n;
4516 int work, weight; 4575 int work, weight;
4517 4576
4518 /* If softirq window is exhuasted then punt. 4577 /* If softirq window is exhausted then punt.
4519 * Allow this to run for 2 jiffies since which will allow 4578 * Allow this to run for 2 jiffies since which will allow
4520 * an average latency of 1.5/HZ. 4579 * an average latency of 1.5/HZ.
4521 */ 4580 */
4522 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) 4581 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4523 goto softnet_break; 4582 goto softnet_break;
4524 4583
4525 local_irq_enable();
4526 4584
4527 /* Even though interrupts have been re-enabled, this 4585 n = list_first_entry(&list, struct napi_struct, poll_list);
4528 * access is safe because interrupts can only add new 4586 list_del_init(&n->poll_list);
4529 * entries to the tail of this list, and only ->poll()
4530 * calls can remove this head entry from the list.
4531 */
4532 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4533 4587
4534 have = netpoll_poll_lock(n); 4588 have = netpoll_poll_lock(n);
4535 4589
@@ -4551,8 +4605,6 @@ static void net_rx_action(struct softirq_action *h)
4551 4605
4552 budget -= work; 4606 budget -= work;
4553 4607
4554 local_irq_disable();
4555
4556 /* Drivers must not modify the NAPI state if they 4608 /* Drivers must not modify the NAPI state if they
4557 * consume the entire weight. In such cases this code 4609 * consume the entire weight. In such cases this code
4558 * still "owns" the NAPI instance and therefore can 4610 * still "owns" the NAPI instance and therefore can
@@ -4560,32 +4612,40 @@ static void net_rx_action(struct softirq_action *h)
4560 */ 4612 */
4561 if (unlikely(work == weight)) { 4613 if (unlikely(work == weight)) {
4562 if (unlikely(napi_disable_pending(n))) { 4614 if (unlikely(napi_disable_pending(n))) {
4563 local_irq_enable();
4564 napi_complete(n); 4615 napi_complete(n);
4565 local_irq_disable();
4566 } else { 4616 } else {
4567 if (n->gro_list) { 4617 if (n->gro_list) {
4568 /* flush too old packets 4618 /* flush too old packets
4569 * If HZ < 1000, flush all packets. 4619 * If HZ < 1000, flush all packets.
4570 */ 4620 */
4571 local_irq_enable();
4572 napi_gro_flush(n, HZ >= 1000); 4621 napi_gro_flush(n, HZ >= 1000);
4573 local_irq_disable();
4574 } 4622 }
4575 list_move_tail(&n->poll_list, &sd->poll_list); 4623 list_add_tail(&n->poll_list, &repoll);
4576 } 4624 }
4577 } 4625 }
4578 4626
4579 netpoll_poll_unlock(have); 4627 netpoll_poll_unlock(have);
4580 } 4628 }
4629
4630 if (!sd_has_rps_ipi_waiting(sd) &&
4631 list_empty(&list) &&
4632 list_empty(&repoll))
4633 return;
4581out: 4634out:
4635 local_irq_disable();
4636
4637 list_splice_tail_init(&sd->poll_list, &list);
4638 list_splice_tail(&repoll, &list);
4639 list_splice(&list, &sd->poll_list);
4640 if (!list_empty(&sd->poll_list))
4641 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4642
4582 net_rps_action_and_irq_enable(sd); 4643 net_rps_action_and_irq_enable(sd);
4583 4644
4584 return; 4645 return;
4585 4646
4586softnet_break: 4647softnet_break:
4587 sd->time_squeeze++; 4648 sd->time_squeeze++;
4588 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4589 goto out; 4649 goto out;
4590} 4650}
4591 4651
@@ -5786,7 +5846,7 @@ EXPORT_SYMBOL(dev_change_carrier);
5786 * Get device physical port ID 5846 * Get device physical port ID
5787 */ 5847 */
5788int dev_get_phys_port_id(struct net_device *dev, 5848int dev_get_phys_port_id(struct net_device *dev,
5789 struct netdev_phys_port_id *ppid) 5849 struct netdev_phys_item_id *ppid)
5790{ 5850{
5791 const struct net_device_ops *ops = dev->netdev_ops; 5851 const struct net_device_ops *ops = dev->netdev_ops;
5792 5852
@@ -5865,6 +5925,8 @@ static void rollback_registered_many(struct list_head *head)
5865 synchronize_net(); 5925 synchronize_net();
5866 5926
5867 list_for_each_entry(dev, head, unreg_list) { 5927 list_for_each_entry(dev, head, unreg_list) {
5928 struct sk_buff *skb = NULL;
5929
5868 /* Shutdown queueing discipline. */ 5930 /* Shutdown queueing discipline. */
5869 dev_shutdown(dev); 5931 dev_shutdown(dev);
5870 5932
@@ -5874,6 +5936,11 @@ static void rollback_registered_many(struct list_head *head)
5874 */ 5936 */
5875 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 5937 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5876 5938
5939 if (!dev->rtnl_link_ops ||
5940 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5941 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
5942 GFP_KERNEL);
5943
5877 /* 5944 /*
5878 * Flush the unicast and multicast chains 5945 * Flush the unicast and multicast chains
5879 */ 5946 */
@@ -5883,9 +5950,8 @@ static void rollback_registered_many(struct list_head *head)
5883 if (dev->netdev_ops->ndo_uninit) 5950 if (dev->netdev_ops->ndo_uninit)
5884 dev->netdev_ops->ndo_uninit(dev); 5951 dev->netdev_ops->ndo_uninit(dev);
5885 5952
5886 if (!dev->rtnl_link_ops || 5953 if (skb)
5887 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 5954 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
5888 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5889 5955
5890 /* Notifier chain MUST detach us all upper devices. */ 5956 /* Notifier chain MUST detach us all upper devices. */
5891 WARN_ON(netdev_has_any_upper_dev(dev)); 5957 WARN_ON(netdev_has_any_upper_dev(dev));
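
The dev.c changes above add __napi_schedule_irqoff(), napi_complete_done() and the per-device gro_flush_timeout watchdog. A hedged sketch of how a driver is expected to use them; the example_* helpers are assumptions standing in for driver-specific RX cleanup and interrupt masking, not real kernel APIs.

/* Hedged driver-side sketch for the new NAPI helpers added above. */
#include <linux/interrupt.h>
#include <linux/netdevice.h>

int example_clean_rx(struct napi_struct *napi, int budget);	/* assumed */
void example_mask_irq(struct napi_struct *napi);		/* assumed */
void example_unmask_irq(struct napi_struct *napi);		/* assumed */

static int example_poll(struct napi_struct *napi, int budget)
{
	int work_done = example_clean_rx(napi, budget);

	if (work_done < budget) {
		/* Reporting work_done lets the core defer the GRO flush via
		 * the per-device gro_flush_timeout hrtimer added above. */
		napi_complete_done(napi, work_done);
		example_unmask_irq(napi);
	}
	return work_done;
}

static irqreturn_t example_isr(int irq, void *dev_id)
{
	struct napi_struct *napi = dev_id;

	example_mask_irq(napi);
	/* Hard irqs are already disabled in the handler, so the cheaper
	 * variant of __napi_schedule() can be used. */
	__napi_schedule_irqoff(napi);
	return IRQ_HANDLED;
}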
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index b6b230600b97..c0548d268e1a 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -278,8 +278,8 @@ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
278EXPORT_SYMBOL(__hw_addr_sync_dev); 278EXPORT_SYMBOL(__hw_addr_sync_dev);
279 279
280/** 280/**
281 * __hw_addr_unsync_dev - Remove synchonized addresses from device 281 * __hw_addr_unsync_dev - Remove synchronized addresses from device
282 * @list: address list to remove syncronized addresses from 282 * @list: address list to remove synchronized addresses from
283 * @dev: device to sync 283 * @dev: device to sync
284 * @unsync: function to call if address should be removed 284 * @unsync: function to call if address should be removed
285 * 285 *
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 72e899a3efda..b94b1d293506 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -142,10 +142,12 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
142 142
143 case SIOCGIFHWADDR: 143 case SIOCGIFHWADDR:
144 if (!dev->addr_len) 144 if (!dev->addr_len)
145 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); 145 memset(ifr->ifr_hwaddr.sa_data, 0,
146 sizeof(ifr->ifr_hwaddr.sa_data));
146 else 147 else
147 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, 148 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
148 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); 149 min(sizeof(ifr->ifr_hwaddr.sa_data),
150 (size_t)dev->addr_len));
149 ifr->ifr_hwaddr.sa_family = dev->type; 151 ifr->ifr_hwaddr.sa_family = dev->type;
150 return 0; 152 return 0;
151 153
@@ -265,7 +267,8 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
265 if (ifr->ifr_hwaddr.sa_family != dev->type) 267 if (ifr->ifr_hwaddr.sa_family != dev->type)
266 return -EINVAL; 268 return -EINVAL;
267 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, 269 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
268 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); 270 min(sizeof(ifr->ifr_hwaddr.sa_data),
271 (size_t)dev->addr_len));
269 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 272 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
270 return 0; 273 return 0;
271 274
diff --git a/net/core/dst.c b/net/core/dst.c
index a028409ee438..e956ce6d1378 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -327,30 +327,6 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
327} 327}
328EXPORT_SYMBOL(__dst_destroy_metrics_generic); 328EXPORT_SYMBOL(__dst_destroy_metrics_generic);
329 329
330/**
331 * __skb_dst_set_noref - sets skb dst, without a reference
332 * @skb: buffer
333 * @dst: dst entry
334 * @force: if force is set, use noref version even for DST_NOCACHE entries
335 *
336 * Sets skb dst, assuming a reference was not taken on dst
337 * skb_dst_drop() should not dst_release() this dst
338 */
339void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force)
340{
341 WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
342 /* If dst not in cache, we must take a reference, because
343 * dst_release() will destroy dst as soon as its refcount becomes zero
344 */
345 if (unlikely((dst->flags & DST_NOCACHE) && !force)) {
346 dst_hold(dst);
347 skb_dst_set(skb, dst);
348 } else {
349 skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
350 }
351}
352EXPORT_SYMBOL(__skb_dst_set_noref);
353
354/* Dirty hack. We did it in 2.2 (in __dst_free), 330/* Dirty hack. We did it in 2.2 (in __dst_free),
355 * we have _very_ good reasons not to repeat 331 * we have _very_ good reasons not to repeat
356 * this mistake in 2.3, but we have no choice 332 * this mistake in 2.3, but we have no choice
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 06dfb293e5aa..550892cd6b3f 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -25,6 +25,7 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/rtnetlink.h> 26#include <linux/rtnetlink.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/net.h>
28 29
29/* 30/*
30 * Some useful ethtool_ops methods that're device independent. 31 * Some useful ethtool_ops methods that're device independent.
@@ -84,7 +85,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
84 [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation", 85 [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation",
85 [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation", 86 [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation",
86 [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", 87 [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
87 [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation",
88 88
89 [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", 89 [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
90 [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", 90 [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp",
@@ -100,6 +100,12 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
100 [NETIF_F_BUSY_POLL_BIT] = "busy-poll", 100 [NETIF_F_BUSY_POLL_BIT] = "busy-poll",
101}; 101};
102 102
103static const char
104rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = {
105 [ETH_RSS_HASH_TOP_BIT] = "toeplitz",
106 [ETH_RSS_HASH_XOR_BIT] = "xor",
107};
108
103static int ethtool_get_features(struct net_device *dev, void __user *useraddr) 109static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
104{ 110{
105 struct ethtool_gfeatures cmd = { 111 struct ethtool_gfeatures cmd = {
@@ -185,6 +191,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
185 if (sset == ETH_SS_FEATURES) 191 if (sset == ETH_SS_FEATURES)
186 return ARRAY_SIZE(netdev_features_strings); 192 return ARRAY_SIZE(netdev_features_strings);
187 193
194 if (sset == ETH_SS_RSS_HASH_FUNCS)
195 return ARRAY_SIZE(rss_hash_func_strings);
196
188 if (ops->get_sset_count && ops->get_strings) 197 if (ops->get_sset_count && ops->get_strings)
189 return ops->get_sset_count(dev, sset); 198 return ops->get_sset_count(dev, sset);
190 else 199 else
@@ -199,6 +208,9 @@ static void __ethtool_get_strings(struct net_device *dev,
199 if (stringset == ETH_SS_FEATURES) 208 if (stringset == ETH_SS_FEATURES)
200 memcpy(data, netdev_features_strings, 209 memcpy(data, netdev_features_strings,
201 sizeof(netdev_features_strings)); 210 sizeof(netdev_features_strings));
211 else if (stringset == ETH_SS_RSS_HASH_FUNCS)
212 memcpy(data, rss_hash_func_strings,
213 sizeof(rss_hash_func_strings));
202 else 214 else
203 /* ops->get_strings is valid because checked earlier */ 215 /* ops->get_strings is valid because checked earlier */
204 ops->get_strings(dev, stringset, data); 216 ops->get_strings(dev, stringset, data);
@@ -574,6 +586,16 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
574 return 0; 586 return 0;
575} 587}
576 588
589u8 netdev_rss_key[NETDEV_RSS_KEY_LEN];
590
591void netdev_rss_key_fill(void *buffer, size_t len)
592{
593 BUG_ON(len > sizeof(netdev_rss_key));
594 net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key));
595 memcpy(buffer, netdev_rss_key, len);
596}
597EXPORT_SYMBOL(netdev_rss_key_fill);
598
577static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, 599static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
578 void __user *useraddr) 600 void __user *useraddr)
579{ 601{
@@ -608,7 +630,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
608 if (!indir) 630 if (!indir)
609 return -ENOMEM; 631 return -ENOMEM;
610 632
611 ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL); 633 ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL);
612 if (ret) 634 if (ret)
613 goto out; 635 goto out;
614 636
@@ -669,7 +691,7 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
669 goto out; 691 goto out;
670 } 692 }
671 693
672 ret = ops->set_rxfh(dev, indir, NULL); 694 ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE);
673 695
674out: 696out:
675 kfree(indir); 697 kfree(indir);
@@ -687,12 +709,11 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
687 u32 total_size; 709 u32 total_size;
688 u32 indir_bytes; 710 u32 indir_bytes;
689 u32 *indir = NULL; 711 u32 *indir = NULL;
712 u8 dev_hfunc = 0;
690 u8 *hkey = NULL; 713 u8 *hkey = NULL;
691 u8 *rss_config; 714 u8 *rss_config;
692 715
693 if (!(dev->ethtool_ops->get_rxfh_indir_size || 716 if (!ops->get_rxfh)
694 dev->ethtool_ops->get_rxfh_key_size) ||
695 !dev->ethtool_ops->get_rxfh)
696 return -EOPNOTSUPP; 717 return -EOPNOTSUPP;
697 718
698 if (ops->get_rxfh_indir_size) 719 if (ops->get_rxfh_indir_size)
@@ -700,16 +721,14 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
700 if (ops->get_rxfh_key_size) 721 if (ops->get_rxfh_key_size)
701 dev_key_size = ops->get_rxfh_key_size(dev); 722 dev_key_size = ops->get_rxfh_key_size(dev);
702 723
703 if ((dev_key_size + dev_indir_size) == 0)
704 return -EOPNOTSUPP;
705
706 if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) 724 if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
707 return -EFAULT; 725 return -EFAULT;
708 user_indir_size = rxfh.indir_size; 726 user_indir_size = rxfh.indir_size;
709 user_key_size = rxfh.key_size; 727 user_key_size = rxfh.key_size;
710 728
711 /* Check that reserved fields are 0 for now */ 729 /* Check that reserved fields are 0 for now */
712 if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) 730 if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] ||
731 rxfh.rsvd8[2] || rxfh.rsvd32)
713 return -EINVAL; 732 return -EINVAL;
714 733
715 rxfh.indir_size = dev_indir_size; 734 rxfh.indir_size = dev_indir_size;
@@ -717,13 +736,6 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
717 if (copy_to_user(useraddr, &rxfh, sizeof(rxfh))) 736 if (copy_to_user(useraddr, &rxfh, sizeof(rxfh)))
718 return -EFAULT; 737 return -EFAULT;
719 738
720 /* If the user buffer size is 0, this is just a query for the
721 * device table size and key size. Otherwise, if the User size is
722 * not equal to device table size or key size it's an error.
723 */
724 if (!user_indir_size && !user_key_size)
725 return 0;
726
727 if ((user_indir_size && (user_indir_size != dev_indir_size)) || 739 if ((user_indir_size && (user_indir_size != dev_indir_size)) ||
728 (user_key_size && (user_key_size != dev_key_size))) 740 (user_key_size && (user_key_size != dev_key_size)))
729 return -EINVAL; 741 return -EINVAL;
@@ -740,14 +752,19 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
740 if (user_key_size) 752 if (user_key_size)
741 hkey = rss_config + indir_bytes; 753 hkey = rss_config + indir_bytes;
742 754
743 ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey); 755 ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc);
744 if (!ret) { 756 if (ret)
745 if (copy_to_user(useraddr + 757 goto out;
746 offsetof(struct ethtool_rxfh, rss_config[0]),
747 rss_config, total_size))
748 ret = -EFAULT;
749 }
750 758
759 if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc),
760 &dev_hfunc, sizeof(rxfh.hfunc))) {
761 ret = -EFAULT;
762 } else if (copy_to_user(useraddr +
763 offsetof(struct ethtool_rxfh, rss_config[0]),
764 rss_config, total_size)) {
765 ret = -EFAULT;
766 }
767out:
751 kfree(rss_config); 768 kfree(rss_config);
752 769
753 return ret; 770 return ret;
@@ -766,33 +783,31 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
766 u8 *rss_config; 783 u8 *rss_config;
767 u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); 784 u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]);
768 785
769 if (!(ops->get_rxfh_indir_size || ops->get_rxfh_key_size) || 786 if (!ops->get_rxnfc || !ops->set_rxfh)
770 !ops->get_rxnfc || !ops->set_rxfh)
771 return -EOPNOTSUPP; 787 return -EOPNOTSUPP;
772 788
773 if (ops->get_rxfh_indir_size) 789 if (ops->get_rxfh_indir_size)
774 dev_indir_size = ops->get_rxfh_indir_size(dev); 790 dev_indir_size = ops->get_rxfh_indir_size(dev);
775 if (ops->get_rxfh_key_size) 791 if (ops->get_rxfh_key_size)
776 dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev); 792 dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev);
777 if ((dev_key_size + dev_indir_size) == 0)
778 return -EOPNOTSUPP;
779 793
780 if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) 794 if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
781 return -EFAULT; 795 return -EFAULT;
782 796
783 /* Check that reserved fields are 0 for now */ 797 /* Check that reserved fields are 0 for now */
784 if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) 798 if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] ||
799 rxfh.rsvd8[2] || rxfh.rsvd32)
785 return -EINVAL; 800 return -EINVAL;
786 801
787 /* If either indir or hash key is valid, proceed further. 802 /* If either indir, hash key or function is valid, proceed further.
788 * It is not valid to request that both be unchanged. 803 * Must request at least one change: indir size, hash key or function.
789 */ 804 */
790 if ((rxfh.indir_size && 805 if ((rxfh.indir_size &&
791 rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE && 806 rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE &&
792 rxfh.indir_size != dev_indir_size) || 807 rxfh.indir_size != dev_indir_size) ||
793 (rxfh.key_size && (rxfh.key_size != dev_key_size)) || 808 (rxfh.key_size && (rxfh.key_size != dev_key_size)) ||
794 (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE && 809 (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE &&
795 rxfh.key_size == 0)) 810 rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE))
796 return -EINVAL; 811 return -EINVAL;
797 812
798 if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) 813 if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
@@ -835,7 +850,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
835 } 850 }
836 } 851 }
837 852
838 ret = ops->set_rxfh(dev, indir, hkey); 853 ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
839 854
840out: 855out:
841 kfree(rss_config); 856 kfree(rss_config);
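
The ethtool.c changes above extend get_rxfh()/set_rxfh() with a hash-function argument and add the shared netdev_rss_key_fill() helper (items 19 and 22 in the pull message). A hedged sketch of the driver side; the table sizes, the private-struct layout and the example_* names are assumptions for illustration.

/* Hedged driver-side sketch for the extended get_rxfh() signature and the
 * netdev_rss_key_fill() helper added above. */
#include <linux/ethtool.h>
#include <linux/netdevice.h>
#include <linux/string.h>

#define EXAMPLE_RSS_INDIR_SIZE	128
#define EXAMPLE_RSS_KEY_SIZE	40

struct example_priv {
	u32 rss_indir[EXAMPLE_RSS_INDIR_SIZE];
	u8  rss_key[EXAMPLE_RSS_KEY_SIZE];
};

/* Called once at probe time so every driver seeds its key the same way. */
static void example_init_rss_key(struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	netdev_rss_key_fill(priv->rss_key, sizeof(priv->rss_key));
}

static int example_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
			    u8 *hfunc)
{
	struct example_priv *priv = netdev_priv(dev);

	if (hfunc)
		*hfunc = ETH_RSS_HASH_TOP;	/* Toeplitz */
	if (indir)
		memcpy(indir, priv->rss_indir, sizeof(priv->rss_indir));
	if (key)
		memcpy(key, priv->rss_key, sizeof(priv->rss_key));
	return 0;
}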
diff --git a/net/core/filter.c b/net/core/filter.c
index 647b12265e18..ec9baea10c16 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -44,6 +44,7 @@
44#include <linux/ratelimit.h> 44#include <linux/ratelimit.h>
45#include <linux/seccomp.h> 45#include <linux/seccomp.h>
46#include <linux/if_vlan.h> 46#include <linux/if_vlan.h>
47#include <linux/bpf.h>
47 48
48/** 49/**
49 * sk_filter - run a packet through a socket filter 50 * sk_filter - run a packet through a socket filter
@@ -813,8 +814,12 @@ static void bpf_release_orig_filter(struct bpf_prog *fp)
813 814
814static void __bpf_prog_release(struct bpf_prog *prog) 815static void __bpf_prog_release(struct bpf_prog *prog)
815{ 816{
816 bpf_release_orig_filter(prog); 817 if (prog->aux->prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
817 bpf_prog_free(prog); 818 bpf_prog_put(prog);
819 } else {
820 bpf_release_orig_filter(prog);
821 bpf_prog_free(prog);
822 }
818} 823}
819 824
820static void __sk_filter_release(struct sk_filter *fp) 825static void __sk_filter_release(struct sk_filter *fp)
@@ -1088,6 +1093,94 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1088} 1093}
1089EXPORT_SYMBOL_GPL(sk_attach_filter); 1094EXPORT_SYMBOL_GPL(sk_attach_filter);
1090 1095
1096#ifdef CONFIG_BPF_SYSCALL
1097int sk_attach_bpf(u32 ufd, struct sock *sk)
1098{
1099 struct sk_filter *fp, *old_fp;
1100 struct bpf_prog *prog;
1101
1102 if (sock_flag(sk, SOCK_FILTER_LOCKED))
1103 return -EPERM;
1104
1105 prog = bpf_prog_get(ufd);
1106 if (IS_ERR(prog))
1107 return PTR_ERR(prog);
1108
1109 if (prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
1110 /* valid fd, but invalid program type */
1111 bpf_prog_put(prog);
1112 return -EINVAL;
1113 }
1114
1115 fp = kmalloc(sizeof(*fp), GFP_KERNEL);
1116 if (!fp) {
1117 bpf_prog_put(prog);
1118 return -ENOMEM;
1119 }
1120 fp->prog = prog;
1121
1122 atomic_set(&fp->refcnt, 0);
1123
1124 if (!sk_filter_charge(sk, fp)) {
1125 __sk_filter_release(fp);
1126 return -ENOMEM;
1127 }
1128
1129 old_fp = rcu_dereference_protected(sk->sk_filter,
1130 sock_owned_by_user(sk));
1131 rcu_assign_pointer(sk->sk_filter, fp);
1132
1133 if (old_fp)
1134 sk_filter_uncharge(sk, old_fp);
1135
1136 return 0;
1137}
1138
1139/* allow socket filters to call
1140 * bpf_map_lookup_elem(), bpf_map_update_elem(), bpf_map_delete_elem()
1141 */
1142static const struct bpf_func_proto *sock_filter_func_proto(enum bpf_func_id func_id)
1143{
1144 switch (func_id) {
1145 case BPF_FUNC_map_lookup_elem:
1146 return &bpf_map_lookup_elem_proto;
1147 case BPF_FUNC_map_update_elem:
1148 return &bpf_map_update_elem_proto;
1149 case BPF_FUNC_map_delete_elem:
1150 return &bpf_map_delete_elem_proto;
1151 default:
1152 return NULL;
1153 }
1154}
1155
1156static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type)
1157{
1158 /* skb fields cannot be accessed yet */
1159 return false;
1160}
1161
1162static struct bpf_verifier_ops sock_filter_ops = {
1163 .get_func_proto = sock_filter_func_proto,
1164 .is_valid_access = sock_filter_is_valid_access,
1165};
1166
1167static struct bpf_prog_type_list tl = {
1168 .ops = &sock_filter_ops,
1169 .type = BPF_PROG_TYPE_SOCKET_FILTER,
1170};
1171
1172static int __init register_sock_filter_ops(void)
1173{
1174 bpf_register_prog_type(&tl);
1175 return 0;
1176}
1177late_initcall(register_sock_filter_ops);
1178#else
1179int sk_attach_bpf(u32 ufd, struct sock *sk)
1180{
1181 return -EOPNOTSUPP;
1182}
1183#endif
1091int sk_detach_filter(struct sock *sk) 1184int sk_detach_filter(struct sock *sk)
1092{ 1185{
1093 int ret = -ENOENT; 1186 int ret = -ENOENT;
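
sk_attach_bpf() takes a program fd returned by the bpf(2) syscall and attaches it in place of a classic filter, so the userspace flow is: load a BPF_PROG_TYPE_SOCKET_FILTER program, then setsockopt(SO_ATTACH_BPF) with the fd. A hedged userspace sketch; the SO_ATTACH_BPF fallback value and the hand-rolled loader assume headers of this era without libc wrappers, and the two-instruction program (return 0, i.e. drop everything) is purely illustrative:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <arpa/inet.h>
    #include <sys/socket.h>
    #include <sys/syscall.h>        /* assumes __NR_bpf is defined */
    #include <linux/bpf.h>
    #include <linux/if_ether.h>

    #ifndef SO_ATTACH_BPF
    #define SO_ATTACH_BPF 50        /* value on most architectures at the time */
    #endif

    static int bpf_prog_load_fd(const struct bpf_insn *insns, unsigned int cnt)
    {
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
            attr.insns     = (__u64)(unsigned long)insns;
            attr.insn_cnt  = cnt;
            attr.license   = (__u64)(unsigned long)"GPL";

            return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
    }

    int main(void)
    {
            /* r0 = 0; exit  ->  truncate every packet to 0 bytes (drop). */
            struct bpf_insn prog[] = {
                    { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0 },
                    { .code = BPF_JMP | BPF_EXIT },
            };
            int prog_fd = bpf_prog_load_fd(prog, 2);
            int sock = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); /* needs root */

            if (prog_fd < 0 || sock < 0)
                    return 1;
            if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF,
                           &prog_fd, sizeof(prog_fd)) < 0)
                    perror("SO_ATTACH_BPF");

            close(sock);
            close(prog_fd);
            return 0;
    }
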
diff --git a/net/core/iovec.c b/net/core/iovec.c
index e1ec45ab1e63..dcbe98b3726a 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -28,53 +28,6 @@
28#include <net/sock.h> 28#include <net/sock.h>
29 29
30/* 30/*
31 * Verify iovec. The caller must ensure that the iovec is big enough
32 * to hold the message iovec.
33 *
34 * Save time not doing access_ok. copy_*_user will make this work
35 * in any case.
36 */
37
38int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode)
39{
40 int size, ct, err;
41
42 if (m->msg_name && m->msg_namelen) {
43 if (mode == VERIFY_READ) {
44 void __user *namep;
45 namep = (void __user __force *) m->msg_name;
46 err = move_addr_to_kernel(namep, m->msg_namelen,
47 address);
48 if (err < 0)
49 return err;
50 }
51 m->msg_name = address;
52 } else {
53 m->msg_name = NULL;
54 m->msg_namelen = 0;
55 }
56
57 size = m->msg_iovlen * sizeof(struct iovec);
58 if (copy_from_user(iov, (void __user __force *) m->msg_iov, size))
59 return -EFAULT;
60
61 m->msg_iov = iov;
62 err = 0;
63
64 for (ct = 0; ct < m->msg_iovlen; ct++) {
65 size_t len = iov[ct].iov_len;
66
67 if (len > INT_MAX - err) {
68 len = INT_MAX - err;
69 iov[ct].iov_len = len;
70 }
71 err += len;
72 }
73
74 return err;
75}
76
77/*
78 * And now for the all-in-one: copy and checksum from a user iovec 31 * And now for the all-in-one: copy and checksum from a user iovec
79 * directly to a datagram 32 * directly to a datagram
80 * Calls to csum_partial but the last must be in 32 bit chunks 33 * Calls to csum_partial but the last must be in 32 bit chunks
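
verify_iovec() goes away because the socket layer now describes user buffers with an iov_iter and lets copy_from_iter()/copy_to_iter() do the per-segment validation during the copy itself. A rough sketch of the replacement pattern on a send-style path, assuming the iovec has already been copied into the kernel; the function and its buf/len parameters are illustrative, and the direction convention follows the fs read/write split:

    #include <linux/uio.h>

    static int example_copy_in(const struct iovec *iov, unsigned long nr_segs,
                               size_t total, void *buf, size_t len)
    {
            struct iov_iter from;

            /* WRITE: the iovec is the source of outgoing data. */
            iov_iter_init(&from, WRITE, iov, nr_segs, total);

            if (copy_from_iter(buf, len, &from) != len)
                    return -EFAULT;
            return 0;
    }
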
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index bd0767e6b2b3..49a9e3e06c08 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -21,7 +21,7 @@
21#include <linux/spinlock.h> 21#include <linux/spinlock.h>
22#include <linux/workqueue.h> 22#include <linux/workqueue.h>
23#include <linux/bitops.h> 23#include <linux/bitops.h>
24#include <asm/types.h> 24#include <linux/types.h>
25 25
26 26
27enum lw_bits { 27enum lw_bits {
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index ef31fef25e5a..8e38f17288d3 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -56,7 +56,6 @@ static void __neigh_notify(struct neighbour *n, int type, int flags);
56static void neigh_update_notify(struct neighbour *neigh); 56static void neigh_update_notify(struct neighbour *neigh);
57static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev); 57static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
58 58
59static struct neigh_table *neigh_tables;
60#ifdef CONFIG_PROC_FS 59#ifdef CONFIG_PROC_FS
61static const struct file_operations neigh_stat_seq_fops; 60static const struct file_operations neigh_stat_seq_fops;
62#endif 61#endif
@@ -87,13 +86,8 @@ static const struct file_operations neigh_stat_seq_fops;
87 the most complicated procedure, which we allow is dev->hard_header. 86 the most complicated procedure, which we allow is dev->hard_header.
88 It is supposed, that dev->hard_header is simplistic and does 87 It is supposed, that dev->hard_header is simplistic and does
89 not make callbacks to neighbour tables. 88 not make callbacks to neighbour tables.
90
91 The last lock is neigh_tbl_lock. It is pure SMP lock, protecting
92 list of neighbour tables. This list is used only in process context,
93 */ 89 */
94 90
95static DEFINE_RWLOCK(neigh_tbl_lock);
96
97static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) 91static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)
98{ 92{
99 kfree_skb(skb); 93 kfree_skb(skb);
@@ -773,7 +767,7 @@ static void neigh_periodic_work(struct work_struct *work)
773 if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { 767 if (time_after(jiffies, tbl->last_rand + 300 * HZ)) {
774 struct neigh_parms *p; 768 struct neigh_parms *p;
775 tbl->last_rand = jiffies; 769 tbl->last_rand = jiffies;
776 for (p = &tbl->parms; p; p = p->next) 770 list_for_each_entry(p, &tbl->parms_list, list)
777 p->reachable_time = 771 p->reachable_time =
778 neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); 772 neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
779 } 773 }
@@ -1446,7 +1440,7 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl,
1446{ 1440{
1447 struct neigh_parms *p; 1441 struct neigh_parms *p;
1448 1442
1449 for (p = &tbl->parms; p; p = p->next) { 1443 list_for_each_entry(p, &tbl->parms_list, list) {
1450 if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || 1444 if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) ||
1451 (!p->dev && !ifindex && net_eq(net, &init_net))) 1445 (!p->dev && !ifindex && net_eq(net, &init_net)))
1452 return p; 1446 return p;
@@ -1481,8 +1475,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
1481 } 1475 }
1482 1476
1483 write_lock_bh(&tbl->lock); 1477 write_lock_bh(&tbl->lock);
1484 p->next = tbl->parms.next; 1478 list_add(&p->list, &tbl->parms.list);
1485 tbl->parms.next = p;
1486 write_unlock_bh(&tbl->lock); 1479 write_unlock_bh(&tbl->lock);
1487 1480
1488 neigh_parms_data_state_cleanall(p); 1481 neigh_parms_data_state_cleanall(p);
@@ -1501,24 +1494,15 @@ static void neigh_rcu_free_parms(struct rcu_head *head)
1501 1494
1502void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) 1495void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
1503{ 1496{
1504 struct neigh_parms **p;
1505
1506 if (!parms || parms == &tbl->parms) 1497 if (!parms || parms == &tbl->parms)
1507 return; 1498 return;
1508 write_lock_bh(&tbl->lock); 1499 write_lock_bh(&tbl->lock);
1509 for (p = &tbl->parms.next; *p; p = &(*p)->next) { 1500 list_del(&parms->list);
1510 if (*p == parms) { 1501 parms->dead = 1;
1511 *p = parms->next;
1512 parms->dead = 1;
1513 write_unlock_bh(&tbl->lock);
1514 if (parms->dev)
1515 dev_put(parms->dev);
1516 call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
1517 return;
1518 }
1519 }
1520 write_unlock_bh(&tbl->lock); 1502 write_unlock_bh(&tbl->lock);
1521 neigh_dbg(1, "%s: not found\n", __func__); 1503 if (parms->dev)
1504 dev_put(parms->dev);
1505 call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
1522} 1506}
1523EXPORT_SYMBOL(neigh_parms_release); 1507EXPORT_SYMBOL(neigh_parms_release);
1524 1508
@@ -1530,11 +1514,15 @@ static void neigh_parms_destroy(struct neigh_parms *parms)
1530 1514
1531static struct lock_class_key neigh_table_proxy_queue_class; 1515static struct lock_class_key neigh_table_proxy_queue_class;
1532 1516
1533static void neigh_table_init_no_netlink(struct neigh_table *tbl) 1517static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly;
1518
1519void neigh_table_init(int index, struct neigh_table *tbl)
1534{ 1520{
1535 unsigned long now = jiffies; 1521 unsigned long now = jiffies;
1536 unsigned long phsize; 1522 unsigned long phsize;
1537 1523
1524 INIT_LIST_HEAD(&tbl->parms_list);
1525 list_add(&tbl->parms.list, &tbl->parms_list);
1538 write_pnet(&tbl->parms.net, &init_net); 1526 write_pnet(&tbl->parms.net, &init_net);
1539 atomic_set(&tbl->parms.refcnt, 1); 1527 atomic_set(&tbl->parms.refcnt, 1);
1540 tbl->parms.reachable_time = 1528 tbl->parms.reachable_time =
@@ -1574,34 +1562,14 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl)
1574 1562
1575 tbl->last_flush = now; 1563 tbl->last_flush = now;
1576 tbl->last_rand = now + tbl->parms.reachable_time * 20; 1564 tbl->last_rand = now + tbl->parms.reachable_time * 20;
1577}
1578
1579void neigh_table_init(struct neigh_table *tbl)
1580{
1581 struct neigh_table *tmp;
1582
1583 neigh_table_init_no_netlink(tbl);
1584 write_lock(&neigh_tbl_lock);
1585 for (tmp = neigh_tables; tmp; tmp = tmp->next) {
1586 if (tmp->family == tbl->family)
1587 break;
1588 }
1589 tbl->next = neigh_tables;
1590 neigh_tables = tbl;
1591 write_unlock(&neigh_tbl_lock);
1592 1565
1593 if (unlikely(tmp)) { 1566 neigh_tables[index] = tbl;
1594 pr_err("Registering multiple tables for family %d\n",
1595 tbl->family);
1596 dump_stack();
1597 }
1598} 1567}
1599EXPORT_SYMBOL(neigh_table_init); 1568EXPORT_SYMBOL(neigh_table_init);
1600 1569
1601int neigh_table_clear(struct neigh_table *tbl) 1570int neigh_table_clear(int index, struct neigh_table *tbl)
1602{ 1571{
1603 struct neigh_table **tp; 1572 neigh_tables[index] = NULL;
1604
1605 /* It is not clean... Fix it to unload IPv6 module safely */ 1573 /* It is not clean... Fix it to unload IPv6 module safely */
1606 cancel_delayed_work_sync(&tbl->gc_work); 1574 cancel_delayed_work_sync(&tbl->gc_work);
1607 del_timer_sync(&tbl->proxy_timer); 1575 del_timer_sync(&tbl->proxy_timer);
@@ -1609,14 +1577,6 @@ int neigh_table_clear(struct neigh_table *tbl)
1609 neigh_ifdown(tbl, NULL); 1577 neigh_ifdown(tbl, NULL);
1610 if (atomic_read(&tbl->entries)) 1578 if (atomic_read(&tbl->entries))
1611 pr_crit("neighbour leakage\n"); 1579 pr_crit("neighbour leakage\n");
1612 write_lock(&neigh_tbl_lock);
1613 for (tp = &neigh_tables; *tp; tp = &(*tp)->next) {
1614 if (*tp == tbl) {
1615 *tp = tbl->next;
1616 break;
1617 }
1618 }
1619 write_unlock(&neigh_tbl_lock);
1620 1580
1621 call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, 1581 call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu,
1622 neigh_hash_free_rcu); 1582 neigh_hash_free_rcu);
@@ -1634,12 +1594,32 @@ int neigh_table_clear(struct neigh_table *tbl)
1634} 1594}
1635EXPORT_SYMBOL(neigh_table_clear); 1595EXPORT_SYMBOL(neigh_table_clear);
1636 1596
1597static struct neigh_table *neigh_find_table(int family)
1598{
1599 struct neigh_table *tbl = NULL;
1600
1601 switch (family) {
1602 case AF_INET:
1603 tbl = neigh_tables[NEIGH_ARP_TABLE];
1604 break;
1605 case AF_INET6:
1606 tbl = neigh_tables[NEIGH_ND_TABLE];
1607 break;
1608 case AF_DECnet:
1609 tbl = neigh_tables[NEIGH_DN_TABLE];
1610 break;
1611 }
1612
1613 return tbl;
1614}
1615
1637static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) 1616static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh)
1638{ 1617{
1639 struct net *net = sock_net(skb->sk); 1618 struct net *net = sock_net(skb->sk);
1640 struct ndmsg *ndm; 1619 struct ndmsg *ndm;
1641 struct nlattr *dst_attr; 1620 struct nlattr *dst_attr;
1642 struct neigh_table *tbl; 1621 struct neigh_table *tbl;
1622 struct neighbour *neigh;
1643 struct net_device *dev = NULL; 1623 struct net_device *dev = NULL;
1644 int err = -EINVAL; 1624 int err = -EINVAL;
1645 1625
@@ -1660,39 +1640,31 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh)
1660 } 1640 }
1661 } 1641 }
1662 1642
1663 read_lock(&neigh_tbl_lock); 1643 tbl = neigh_find_table(ndm->ndm_family);
1664 for (tbl = neigh_tables; tbl; tbl = tbl->next) { 1644 if (tbl == NULL)
1665 struct neighbour *neigh; 1645 return -EAFNOSUPPORT;
1666 1646
1667 if (tbl->family != ndm->ndm_family) 1647 if (nla_len(dst_attr) < tbl->key_len)
1668 continue; 1648 goto out;
1669 read_unlock(&neigh_tbl_lock);
1670
1671 if (nla_len(dst_attr) < tbl->key_len)
1672 goto out;
1673
1674 if (ndm->ndm_flags & NTF_PROXY) {
1675 err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
1676 goto out;
1677 }
1678 1649
1679 if (dev == NULL) 1650 if (ndm->ndm_flags & NTF_PROXY) {
1680 goto out; 1651 err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
1652 goto out;
1653 }
1681 1654
1682 neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); 1655 if (dev == NULL)
1683 if (neigh == NULL) { 1656 goto out;
1684 err = -ENOENT;
1685 goto out;
1686 }
1687 1657
1688 err = neigh_update(neigh, NULL, NUD_FAILED, 1658 neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
1689 NEIGH_UPDATE_F_OVERRIDE | 1659 if (neigh == NULL) {
1690 NEIGH_UPDATE_F_ADMIN); 1660 err = -ENOENT;
1691 neigh_release(neigh);
1692 goto out; 1661 goto out;
1693 } 1662 }
1694 read_unlock(&neigh_tbl_lock); 1663
1695 err = -EAFNOSUPPORT; 1664 err = neigh_update(neigh, NULL, NUD_FAILED,
1665 NEIGH_UPDATE_F_OVERRIDE |
1666 NEIGH_UPDATE_F_ADMIN);
1667 neigh_release(neigh);
1696 1668
1697out: 1669out:
1698 return err; 1670 return err;
@@ -1700,11 +1672,14 @@ out:
1700 1672
1701static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) 1673static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh)
1702{ 1674{
1675 int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE;
1703 struct net *net = sock_net(skb->sk); 1676 struct net *net = sock_net(skb->sk);
1704 struct ndmsg *ndm; 1677 struct ndmsg *ndm;
1705 struct nlattr *tb[NDA_MAX+1]; 1678 struct nlattr *tb[NDA_MAX+1];
1706 struct neigh_table *tbl; 1679 struct neigh_table *tbl;
1707 struct net_device *dev = NULL; 1680 struct net_device *dev = NULL;
1681 struct neighbour *neigh;
1682 void *dst, *lladdr;
1708 int err; 1683 int err;
1709 1684
1710 ASSERT_RTNL(); 1685 ASSERT_RTNL();
@@ -1728,70 +1703,60 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh)
1728 goto out; 1703 goto out;
1729 } 1704 }
1730 1705
1731 read_lock(&neigh_tbl_lock); 1706 tbl = neigh_find_table(ndm->ndm_family);
1732 for (tbl = neigh_tables; tbl; tbl = tbl->next) { 1707 if (tbl == NULL)
1733 int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; 1708 return -EAFNOSUPPORT;
1734 struct neighbour *neigh;
1735 void *dst, *lladdr;
1736 1709
1737 if (tbl->family != ndm->ndm_family) 1710 if (nla_len(tb[NDA_DST]) < tbl->key_len)
1738 continue; 1711 goto out;
1739 read_unlock(&neigh_tbl_lock); 1712 dst = nla_data(tb[NDA_DST]);
1713 lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
1740 1714
1741 if (nla_len(tb[NDA_DST]) < tbl->key_len) 1715 if (ndm->ndm_flags & NTF_PROXY) {
1742 goto out; 1716 struct pneigh_entry *pn;
1743 dst = nla_data(tb[NDA_DST]);
1744 lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
1745 1717
1746 if (ndm->ndm_flags & NTF_PROXY) { 1718 err = -ENOBUFS;
1747 struct pneigh_entry *pn; 1719 pn = pneigh_lookup(tbl, net, dst, dev, 1);
1720 if (pn) {
1721 pn->flags = ndm->ndm_flags;
1722 err = 0;
1723 }
1724 goto out;
1725 }
1748 1726
1749 err = -ENOBUFS; 1727 if (dev == NULL)
1750 pn = pneigh_lookup(tbl, net, dst, dev, 1); 1728 goto out;
1751 if (pn) { 1729
1752 pn->flags = ndm->ndm_flags; 1730 neigh = neigh_lookup(tbl, dst, dev);
1753 err = 0; 1731 if (neigh == NULL) {
1754 } 1732 if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
1733 err = -ENOENT;
1755 goto out; 1734 goto out;
1756 } 1735 }
1757 1736
1758 if (dev == NULL) 1737 neigh = __neigh_lookup_errno(tbl, dst, dev);
1738 if (IS_ERR(neigh)) {
1739 err = PTR_ERR(neigh);
1740 goto out;
1741 }
1742 } else {
1743 if (nlh->nlmsg_flags & NLM_F_EXCL) {
1744 err = -EEXIST;
1745 neigh_release(neigh);
1759 goto out; 1746 goto out;
1760
1761 neigh = neigh_lookup(tbl, dst, dev);
1762 if (neigh == NULL) {
1763 if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
1764 err = -ENOENT;
1765 goto out;
1766 }
1767
1768 neigh = __neigh_lookup_errno(tbl, dst, dev);
1769 if (IS_ERR(neigh)) {
1770 err = PTR_ERR(neigh);
1771 goto out;
1772 }
1773 } else {
1774 if (nlh->nlmsg_flags & NLM_F_EXCL) {
1775 err = -EEXIST;
1776 neigh_release(neigh);
1777 goto out;
1778 }
1779
1780 if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
1781 flags &= ~NEIGH_UPDATE_F_OVERRIDE;
1782 } 1747 }
1783 1748
1784 if (ndm->ndm_flags & NTF_USE) { 1749 if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
1785 neigh_event_send(neigh, NULL); 1750 flags &= ~NEIGH_UPDATE_F_OVERRIDE;
1786 err = 0;
1787 } else
1788 err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
1789 neigh_release(neigh);
1790 goto out;
1791 } 1751 }
1792 1752
1793 read_unlock(&neigh_tbl_lock); 1753 if (ndm->ndm_flags & NTF_USE) {
1794 err = -EAFNOSUPPORT; 1754 neigh_event_send(neigh, NULL);
1755 err = 0;
1756 } else
1757 err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
1758 neigh_release(neigh);
1759
1795out: 1760out:
1796 return err; 1761 return err;
1797} 1762}
@@ -1990,7 +1955,8 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
1990 struct neigh_table *tbl; 1955 struct neigh_table *tbl;
1991 struct ndtmsg *ndtmsg; 1956 struct ndtmsg *ndtmsg;
1992 struct nlattr *tb[NDTA_MAX+1]; 1957 struct nlattr *tb[NDTA_MAX+1];
1993 int err; 1958 bool found = false;
1959 int err, tidx;
1994 1960
1995 err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, 1961 err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX,
1996 nl_neightbl_policy); 1962 nl_neightbl_policy);
@@ -2003,19 +1969,21 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
2003 } 1969 }
2004 1970
2005 ndtmsg = nlmsg_data(nlh); 1971 ndtmsg = nlmsg_data(nlh);
2006 read_lock(&neigh_tbl_lock); 1972
2007 for (tbl = neigh_tables; tbl; tbl = tbl->next) { 1973 for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
1974 tbl = neigh_tables[tidx];
1975 if (!tbl)
1976 continue;
2008 if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) 1977 if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
2009 continue; 1978 continue;
2010 1979 if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) {
2011 if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) 1980 found = true;
2012 break; 1981 break;
1982 }
2013 } 1983 }
2014 1984
2015 if (tbl == NULL) { 1985 if (!found)
2016 err = -ENOENT; 1986 return -ENOENT;
2017 goto errout_locked;
2018 }
2019 1987
2020 /* 1988 /*
2021 * We acquire tbl->lock to be nice to the periodic timers and 1989 * We acquire tbl->lock to be nice to the periodic timers and
@@ -2126,8 +2094,6 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
2126 2094
2127errout_tbl_lock: 2095errout_tbl_lock:
2128 write_unlock_bh(&tbl->lock); 2096 write_unlock_bh(&tbl->lock);
2129errout_locked:
2130 read_unlock(&neigh_tbl_lock);
2131errout: 2097errout:
2132 return err; 2098 return err;
2133} 2099}
@@ -2142,10 +2108,13 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
2142 2108
2143 family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; 2109 family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
2144 2110
2145 read_lock(&neigh_tbl_lock); 2111 for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
2146 for (tbl = neigh_tables, tidx = 0; tbl; tbl = tbl->next, tidx++) {
2147 struct neigh_parms *p; 2112 struct neigh_parms *p;
2148 2113
2114 tbl = neigh_tables[tidx];
2115 if (!tbl)
2116 continue;
2117
2149 if (tidx < tbl_skip || (family && tbl->family != family)) 2118 if (tidx < tbl_skip || (family && tbl->family != family))
2150 continue; 2119 continue;
2151 2120
@@ -2154,7 +2123,9 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
2154 NLM_F_MULTI) <= 0) 2123 NLM_F_MULTI) <= 0)
2155 break; 2124 break;
2156 2125
2157 for (nidx = 0, p = tbl->parms.next; p; p = p->next) { 2126 nidx = 0;
2127 p = list_next_entry(&tbl->parms, list);
2128 list_for_each_entry_from(p, &tbl->parms_list, list) {
2158 if (!net_eq(neigh_parms_net(p), net)) 2129 if (!net_eq(neigh_parms_net(p), net))
2159 continue; 2130 continue;
2160 2131
@@ -2174,7 +2145,6 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
2174 neigh_skip = 0; 2145 neigh_skip = 0;
2175 } 2146 }
2176out: 2147out:
2177 read_unlock(&neigh_tbl_lock);
2178 cb->args[0] = tidx; 2148 cb->args[0] = tidx;
2179 cb->args[1] = nidx; 2149 cb->args[1] = nidx;
2180 2150
@@ -2357,7 +2327,6 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
2357 int proxy = 0; 2327 int proxy = 0;
2358 int err; 2328 int err;
2359 2329
2360 read_lock(&neigh_tbl_lock);
2361 family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; 2330 family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
2362 2331
2363 /* check for full ndmsg structure presence, family member is 2332 /* check for full ndmsg structure presence, family member is
@@ -2369,8 +2338,11 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
2369 2338
2370 s_t = cb->args[0]; 2339 s_t = cb->args[0];
2371 2340
2372 for (tbl = neigh_tables, t = 0; tbl; 2341 for (t = 0; t < NEIGH_NR_TABLES; t++) {
2373 tbl = tbl->next, t++) { 2342 tbl = neigh_tables[t];
2343
2344 if (!tbl)
2345 continue;
2374 if (t < s_t || (family && tbl->family != family)) 2346 if (t < s_t || (family && tbl->family != family))
2375 continue; 2347 continue;
2376 if (t > s_t) 2348 if (t > s_t)
@@ -2383,7 +2355,6 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
2383 if (err < 0) 2355 if (err < 0)
2384 break; 2356 break;
2385 } 2357 }
2386 read_unlock(&neigh_tbl_lock);
2387 2358
2388 cb->args[0] = t; 2359 cb->args[0] = t;
2389 return skb->len; 2360 return skb->len;
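
The neighbour code above drops the global linked list and rwlock in favour of a small neigh_tables[] array indexed by NEIGH_ARP_TABLE, NEIGH_ND_TABLE and NEIGH_DN_TABLE, so protocols now register and unregister by index. A sketch of the caller side (mirroring, from memory, what ARP does after this series; arp_tbl stands in for the protocol's own table and the wrapper names are invented):

    void __init my_neigh_init(void)
    {
            neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl);
    }

    void my_neigh_cleanup(void)
    {
            neigh_table_clear(NEIGH_ARP_TABLE, &arp_tbl);
    }
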
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 9dd06699b09c..999341244434 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -12,6 +12,7 @@
12#include <linux/capability.h> 12#include <linux/capability.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/netdevice.h> 14#include <linux/netdevice.h>
15#include <net/switchdev.h>
15#include <linux/if_arp.h> 16#include <linux/if_arp.h>
16#include <linux/slab.h> 17#include <linux/slab.h>
17#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
@@ -325,6 +326,23 @@ static ssize_t tx_queue_len_store(struct device *dev,
325} 326}
326NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong); 327NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong);
327 328
329static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
330{
331 dev->gro_flush_timeout = val;
332 return 0;
333}
334
335static ssize_t gro_flush_timeout_store(struct device *dev,
336 struct device_attribute *attr,
337 const char *buf, size_t len)
338{
339 if (!capable(CAP_NET_ADMIN))
340 return -EPERM;
341
342 return netdev_store(dev, attr, buf, len, change_gro_flush_timeout);
343}
344NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
345
328static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, 346static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
329 const char *buf, size_t len) 347 const char *buf, size_t len)
330{ 348{
@@ -387,7 +405,7 @@ static ssize_t phys_port_id_show(struct device *dev,
387 return restart_syscall(); 405 return restart_syscall();
388 406
389 if (dev_isalive(netdev)) { 407 if (dev_isalive(netdev)) {
390 struct netdev_phys_port_id ppid; 408 struct netdev_phys_item_id ppid;
391 409
392 ret = dev_get_phys_port_id(netdev, &ppid); 410 ret = dev_get_phys_port_id(netdev, &ppid);
393 if (!ret) 411 if (!ret)
@@ -399,6 +417,28 @@ static ssize_t phys_port_id_show(struct device *dev,
399} 417}
400static DEVICE_ATTR_RO(phys_port_id); 418static DEVICE_ATTR_RO(phys_port_id);
401 419
420static ssize_t phys_switch_id_show(struct device *dev,
421 struct device_attribute *attr, char *buf)
422{
423 struct net_device *netdev = to_net_dev(dev);
424 ssize_t ret = -EINVAL;
425
426 if (!rtnl_trylock())
427 return restart_syscall();
428
429 if (dev_isalive(netdev)) {
430 struct netdev_phys_item_id ppid;
431
432 ret = netdev_switch_parent_id_get(netdev, &ppid);
433 if (!ret)
434 ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
435 }
436 rtnl_unlock();
437
438 return ret;
439}
440static DEVICE_ATTR_RO(phys_switch_id);
441
402static struct attribute *net_class_attrs[] = { 442static struct attribute *net_class_attrs[] = {
403 &dev_attr_netdev_group.attr, 443 &dev_attr_netdev_group.attr,
404 &dev_attr_type.attr, 444 &dev_attr_type.attr,
@@ -422,7 +462,9 @@ static struct attribute *net_class_attrs[] = {
422 &dev_attr_mtu.attr, 462 &dev_attr_mtu.attr,
423 &dev_attr_flags.attr, 463 &dev_attr_flags.attr,
424 &dev_attr_tx_queue_len.attr, 464 &dev_attr_tx_queue_len.attr,
465 &dev_attr_gro_flush_timeout.attr,
425 &dev_attr_phys_port_id.attr, 466 &dev_attr_phys_port_id.attr,
467 &dev_attr_phys_switch_id.attr,
426 NULL, 468 NULL,
427}; 469};
428ATTRIBUTE_GROUPS(net_class); 470ATTRIBUTE_GROUPS(net_class);
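
These two attributes show up as /sys/class/net/<dev>/gro_flush_timeout (writable, nanoseconds, CAP_NET_ADMIN required) and /sys/class/net/<dev>/phys_switch_id (read-only hex dump; devices without switchdev support just report an error). A hedged userspace snippet poking both; the eth0 path and the 20000 ns value are only examples:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char buf[64];
            ssize_t n;
            int fd;

            fd = open("/sys/class/net/eth0/gro_flush_timeout", O_WRONLY);
            if (fd >= 0) {
                    if (write(fd, "20000", 5) < 0)  /* hold GRO flushes for 20us */
                            perror("gro_flush_timeout");
                    close(fd);
            }

            fd = open("/sys/class/net/eth0/phys_switch_id", O_RDONLY);
            if (fd >= 0) {
                    n = read(fd, buf, sizeof(buf) - 1);
                    if (n > 0) {
                            buf[n] = '\0';
                            printf("switch id: %s", buf);
                    }
                    close(fd);
            }
            return 0;
    }
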
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index e6645b4f330a..e0ad5d16c9c5 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -79,8 +79,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
79 79
80 if (vlan_tx_tag_present(skb) && 80 if (vlan_tx_tag_present(skb) &&
81 !vlan_hw_offload_capable(features, skb->vlan_proto)) { 81 !vlan_hw_offload_capable(features, skb->vlan_proto)) {
82 skb = __vlan_put_tag(skb, skb->vlan_proto, 82 skb = __vlan_hwaccel_push_inside(skb);
83 vlan_tx_tag_get(skb));
84 if (unlikely(!skb)) { 83 if (unlikely(!skb)) {
85 /* This is actually a packet drop, but we 84 /* This is actually a packet drop, but we
86 * don't want the code that calls this 85 * don't want the code that calls this
@@ -88,7 +87,6 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
88 */ 87 */
89 goto out; 88 goto out;
90 } 89 }
91 skb->vlan_tci = 0;
92 } 90 }
93 91
94 status = netdev_start_xmit(skb, dev, txq, false); 92 status = netdev_start_xmit(skb, dev, txq, false);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 443256bdcddc..da934fc3faa8 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3728,8 +3728,7 @@ static int pktgen_remove_device(struct pktgen_thread *t,
3728 /* Remove proc before if_list entry, because add_device uses 3728 /* Remove proc before if_list entry, because add_device uses
3729 * list to determine if interface already exist, avoid race 3729 * list to determine if interface already exist, avoid race
3730 * with proc_create_data() */ 3730 * with proc_create_data() */
3731 if (pkt_dev->entry) 3731 proc_remove(pkt_dev->entry);
3732 proc_remove(pkt_dev->entry);
3733 3732
3734 /* And update the thread if_list */ 3733 /* And update the thread if_list */
3735 _rem_dev_from_if_list(t, pkt_dev); 3734 _rem_dev_from_if_list(t, pkt_dev);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 88e8de3b59b0..d06107d36ec8 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -36,6 +36,7 @@
36#include <linux/mutex.h> 36#include <linux/mutex.h>
37#include <linux/if_addr.h> 37#include <linux/if_addr.h>
38#include <linux/if_bridge.h> 38#include <linux/if_bridge.h>
39#include <linux/if_vlan.h>
39#include <linux/pci.h> 40#include <linux/pci.h>
40#include <linux/etherdevice.h> 41#include <linux/etherdevice.h>
41 42
@@ -43,6 +44,7 @@
43 44
44#include <linux/inet.h> 45#include <linux/inet.h>
45#include <linux/netdevice.h> 46#include <linux/netdevice.h>
47#include <net/switchdev.h>
46#include <net/ip.h> 48#include <net/ip.h>
47#include <net/protocol.h> 49#include <net/protocol.h>
48#include <net/arp.h> 50#include <net/arp.h>
@@ -868,7 +870,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
868 + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ 870 + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
869 + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ 871 + rtnl_link_get_size(dev) /* IFLA_LINKINFO */
870 + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ 872 + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */
871 + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */ 873 + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */
874 + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_SWITCH_ID */
872} 875}
873 876
874static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) 877static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
@@ -952,7 +955,7 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev,
952static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) 955static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev)
953{ 956{
954 int err; 957 int err;
955 struct netdev_phys_port_id ppid; 958 struct netdev_phys_item_id ppid;
956 959
957 err = dev_get_phys_port_id(dev, &ppid); 960 err = dev_get_phys_port_id(dev, &ppid);
958 if (err) { 961 if (err) {
@@ -967,6 +970,24 @@ static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev)
967 return 0; 970 return 0;
968} 971}
969 972
973static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
974{
975 int err;
976 struct netdev_phys_item_id psid;
977
978 err = netdev_switch_parent_id_get(dev, &psid);
979 if (err) {
980 if (err == -EOPNOTSUPP)
981 return 0;
982 return err;
983 }
984
985 if (nla_put(skb, IFLA_PHYS_SWITCH_ID, psid.id_len, psid.id))
986 return -EMSGSIZE;
987
988 return 0;
989}
990
970static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, 991static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
971 int type, u32 pid, u32 seq, u32 change, 992 int type, u32 pid, u32 seq, u32 change,
972 unsigned int flags, u32 ext_filter_mask) 993 unsigned int flags, u32 ext_filter_mask)
@@ -1039,6 +1060,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
1039 if (rtnl_phys_port_id_fill(skb, dev)) 1060 if (rtnl_phys_port_id_fill(skb, dev))
1040 goto nla_put_failure; 1061 goto nla_put_failure;
1041 1062
1063 if (rtnl_phys_switch_id_fill(skb, dev))
1064 goto nla_put_failure;
1065
1042 attr = nla_reserve(skb, IFLA_STATS, 1066 attr = nla_reserve(skb, IFLA_STATS,
1043 sizeof(struct rtnl_link_stats)); 1067 sizeof(struct rtnl_link_stats));
1044 if (attr == NULL) 1068 if (attr == NULL)
@@ -1196,8 +1220,9 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
1196 [IFLA_PROMISCUITY] = { .type = NLA_U32 }, 1220 [IFLA_PROMISCUITY] = { .type = NLA_U32 },
1197 [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, 1221 [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 },
1198 [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, 1222 [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 },
1199 [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN }, 1223 [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
1200 [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ 1224 [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
1225 [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
1201}; 1226};
1202 1227
1203static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { 1228static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -2221,8 +2246,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
2221 return skb->len; 2246 return skb->len;
2222} 2247}
2223 2248
2224void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, 2249struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
2225 gfp_t flags) 2250 unsigned int change, gfp_t flags)
2226{ 2251{
2227 struct net *net = dev_net(dev); 2252 struct net *net = dev_net(dev);
2228 struct sk_buff *skb; 2253 struct sk_buff *skb;
@@ -2240,11 +2265,28 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
2240 kfree_skb(skb); 2265 kfree_skb(skb);
2241 goto errout; 2266 goto errout;
2242 } 2267 }
2243 rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); 2268 return skb;
2244 return;
2245errout: 2269errout:
2246 if (err < 0) 2270 if (err < 0)
2247 rtnl_set_sk_err(net, RTNLGRP_LINK, err); 2271 rtnl_set_sk_err(net, RTNLGRP_LINK, err);
2272 return NULL;
2273}
2274
2275void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags)
2276{
2277 struct net *net = dev_net(dev);
2278
2279 rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags);
2280}
2281
2282void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
2283 gfp_t flags)
2284{
2285 struct sk_buff *skb;
2286
2287 skb = rtmsg_ifinfo_build_skb(type, dev, change, flags);
2288 if (skb)
2289 rtmsg_ifinfo_send(skb, dev, flags);
2248} 2290}
2249EXPORT_SYMBOL(rtmsg_ifinfo); 2291EXPORT_SYMBOL(rtmsg_ifinfo);
2250 2292
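
Splitting rtmsg_ifinfo() into a build half and a send half lets a caller construct the RTM_DELLINK message while the device is still fully set up and defer the actual notification; the combined helper keeps its old behaviour. A hedged fragment of a caller using the two halves (the teardown step is an invented placeholder):

    struct sk_buff *skb;

    /* Build the netlink message while dev state is still valid... */
    skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, 0, GFP_KERNEL);

    /* ...tear the device down... */
    my_teardown(dev);                       /* placeholder */

    /* ...and only then tell RTNLGRP_LINK listeners about it. */
    if (skb)
            rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
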
@@ -2313,7 +2355,7 @@ errout:
2313int ndo_dflt_fdb_add(struct ndmsg *ndm, 2355int ndo_dflt_fdb_add(struct ndmsg *ndm,
2314 struct nlattr *tb[], 2356 struct nlattr *tb[],
2315 struct net_device *dev, 2357 struct net_device *dev,
2316 const unsigned char *addr, 2358 const unsigned char *addr, u16 vid,
2317 u16 flags) 2359 u16 flags)
2318{ 2360{
2319 int err = -EINVAL; 2361 int err = -EINVAL;
@@ -2339,6 +2381,28 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm,
2339} 2381}
2340EXPORT_SYMBOL(ndo_dflt_fdb_add); 2382EXPORT_SYMBOL(ndo_dflt_fdb_add);
2341 2383
2384static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid)
2385{
2386 u16 vid = 0;
2387
2388 if (vlan_attr) {
2389 if (nla_len(vlan_attr) != sizeof(u16)) {
2390 pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan\n");
2391 return -EINVAL;
2392 }
2393
2394 vid = nla_get_u16(vlan_attr);
2395
2396 if (!vid || vid >= VLAN_VID_MASK) {
2397 pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan id %d\n",
2398 vid);
2399 return -EINVAL;
2400 }
2401 }
2402 *p_vid = vid;
2403 return 0;
2404}
2405
2342static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) 2406static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
2343{ 2407{
2344 struct net *net = sock_net(skb->sk); 2408 struct net *net = sock_net(skb->sk);
@@ -2346,6 +2410,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
2346 struct nlattr *tb[NDA_MAX+1]; 2410 struct nlattr *tb[NDA_MAX+1];
2347 struct net_device *dev; 2411 struct net_device *dev;
2348 u8 *addr; 2412 u8 *addr;
2413 u16 vid;
2349 int err; 2414 int err;
2350 2415
2351 err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); 2416 err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
@@ -2371,6 +2436,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
2371 2436
2372 addr = nla_data(tb[NDA_LLADDR]); 2437 addr = nla_data(tb[NDA_LLADDR]);
2373 2438
2439 err = fdb_vid_parse(tb[NDA_VLAN], &vid);
2440 if (err)
2441 return err;
2442
2374 err = -EOPNOTSUPP; 2443 err = -EOPNOTSUPP;
2375 2444
2376 /* Support fdb on master device the net/bridge default case */ 2445 /* Support fdb on master device the net/bridge default case */
@@ -2379,7 +2448,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
2379 struct net_device *br_dev = netdev_master_upper_dev_get(dev); 2448 struct net_device *br_dev = netdev_master_upper_dev_get(dev);
2380 const struct net_device_ops *ops = br_dev->netdev_ops; 2449 const struct net_device_ops *ops = br_dev->netdev_ops;
2381 2450
2382 err = ops->ndo_fdb_add(ndm, tb, dev, addr, nlh->nlmsg_flags); 2451 err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid,
2452 nlh->nlmsg_flags);
2383 if (err) 2453 if (err)
2384 goto out; 2454 goto out;
2385 else 2455 else
@@ -2390,9 +2460,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
2390 if ((ndm->ndm_flags & NTF_SELF)) { 2460 if ((ndm->ndm_flags & NTF_SELF)) {
2391 if (dev->netdev_ops->ndo_fdb_add) 2461 if (dev->netdev_ops->ndo_fdb_add)
2392 err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, 2462 err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr,
2463 vid,
2393 nlh->nlmsg_flags); 2464 nlh->nlmsg_flags);
2394 else 2465 else
2395 err = ndo_dflt_fdb_add(ndm, tb, dev, addr, 2466 err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid,
2396 nlh->nlmsg_flags); 2467 nlh->nlmsg_flags);
2397 2468
2398 if (!err) { 2469 if (!err) {
@@ -2410,7 +2481,7 @@ out:
2410int ndo_dflt_fdb_del(struct ndmsg *ndm, 2481int ndo_dflt_fdb_del(struct ndmsg *ndm,
2411 struct nlattr *tb[], 2482 struct nlattr *tb[],
2412 struct net_device *dev, 2483 struct net_device *dev,
2413 const unsigned char *addr) 2484 const unsigned char *addr, u16 vid)
2414{ 2485{
2415 int err = -EINVAL; 2486 int err = -EINVAL;
2416 2487
@@ -2439,6 +2510,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
2439 struct net_device *dev; 2510 struct net_device *dev;
2440 int err = -EINVAL; 2511 int err = -EINVAL;
2441 __u8 *addr; 2512 __u8 *addr;
2513 u16 vid;
2442 2514
2443 if (!netlink_capable(skb, CAP_NET_ADMIN)) 2515 if (!netlink_capable(skb, CAP_NET_ADMIN))
2444 return -EPERM; 2516 return -EPERM;
@@ -2466,6 +2538,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
2466 2538
2467 addr = nla_data(tb[NDA_LLADDR]); 2539 addr = nla_data(tb[NDA_LLADDR]);
2468 2540
2541 err = fdb_vid_parse(tb[NDA_VLAN], &vid);
2542 if (err)
2543 return err;
2544
2469 err = -EOPNOTSUPP; 2545 err = -EOPNOTSUPP;
2470 2546
2471 /* Support fdb on master device the net/bridge default case */ 2547 /* Support fdb on master device the net/bridge default case */
@@ -2475,7 +2551,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
2475 const struct net_device_ops *ops = br_dev->netdev_ops; 2551 const struct net_device_ops *ops = br_dev->netdev_ops;
2476 2552
2477 if (ops->ndo_fdb_del) 2553 if (ops->ndo_fdb_del)
2478 err = ops->ndo_fdb_del(ndm, tb, dev, addr); 2554 err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid);
2479 2555
2480 if (err) 2556 if (err)
2481 goto out; 2557 goto out;
@@ -2486,9 +2562,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
2486 /* Embedded bridge, macvlan, and any other device support */ 2562 /* Embedded bridge, macvlan, and any other device support */
2487 if (ndm->ndm_flags & NTF_SELF) { 2563 if (ndm->ndm_flags & NTF_SELF) {
2488 if (dev->netdev_ops->ndo_fdb_del) 2564 if (dev->netdev_ops->ndo_fdb_del)
2489 err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr); 2565 err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr,
2566 vid);
2490 else 2567 else
2491 err = ndo_dflt_fdb_del(ndm, tb, dev, addr); 2568 err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
2492 2569
2493 if (!err) { 2570 if (!err) {
2494 rtnl_fdb_notify(dev, addr, RTM_DELNEIGH); 2571 rtnl_fdb_notify(dev, addr, RTM_DELNEIGH);
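
fdb_vid_parse() centralizes NDA_VLAN validation (exactly a u16, non-zero and below VLAN_VID_MASK) and the parsed vid is handed to every ndo_fdb_add/ndo_fdb_del implementation, with vid == 0 meaning no VLAN was given. A hedged sketch of a driver add op under the new signature; the priv type and hardware helper are placeholders:

    static int mydrv_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
                             struct net_device *dev,
                             const unsigned char *addr, u16 vid, u16 flags)
    {
            struct mydrv_priv *priv = netdev_priv(dev);     /* placeholder type */

            if (!is_valid_ether_addr(addr))
                    return -EINVAL;

            /* Program a unicast FDB entry for (addr, vid) in hardware. */
            return mydrv_hw_fdb_write(priv, addr, vid, true);  /* placeholder */
    }
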
@@ -2628,12 +2705,22 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
2628 return skb->len; 2705 return skb->len;
2629} 2706}
2630 2707
2708static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask,
2709 unsigned int attrnum, unsigned int flag)
2710{
2711 if (mask & flag)
2712 return nla_put_u8(skb, attrnum, !!(flags & flag));
2713 return 0;
2714}
2715
2631int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, 2716int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
2632 struct net_device *dev, u16 mode) 2717 struct net_device *dev, u16 mode,
2718 u32 flags, u32 mask)
2633{ 2719{
2634 struct nlmsghdr *nlh; 2720 struct nlmsghdr *nlh;
2635 struct ifinfomsg *ifm; 2721 struct ifinfomsg *ifm;
2636 struct nlattr *br_afspec; 2722 struct nlattr *br_afspec;
2723 struct nlattr *protinfo;
2637 u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; 2724 u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
2638 struct net_device *br_dev = netdev_master_upper_dev_get(dev); 2725 struct net_device *br_dev = netdev_master_upper_dev_get(dev);
2639 2726
@@ -2665,13 +2752,46 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
2665 if (!br_afspec) 2752 if (!br_afspec)
2666 goto nla_put_failure; 2753 goto nla_put_failure;
2667 2754
2668 if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF) || 2755 if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) {
2669 nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) {
2670 nla_nest_cancel(skb, br_afspec); 2756 nla_nest_cancel(skb, br_afspec);
2671 goto nla_put_failure; 2757 goto nla_put_failure;
2672 } 2758 }
2759
2760 if (mode != BRIDGE_MODE_UNDEF) {
2761 if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) {
2762 nla_nest_cancel(skb, br_afspec);
2763 goto nla_put_failure;
2764 }
2765 }
2673 nla_nest_end(skb, br_afspec); 2766 nla_nest_end(skb, br_afspec);
2674 2767
2768 protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED);
2769 if (!protinfo)
2770 goto nla_put_failure;
2771
2772 if (brport_nla_put_flag(skb, flags, mask,
2773 IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) ||
2774 brport_nla_put_flag(skb, flags, mask,
2775 IFLA_BRPORT_GUARD, BR_BPDU_GUARD) ||
2776 brport_nla_put_flag(skb, flags, mask,
2777 IFLA_BRPORT_FAST_LEAVE,
2778 BR_MULTICAST_FAST_LEAVE) ||
2779 brport_nla_put_flag(skb, flags, mask,
2780 IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) ||
2781 brport_nla_put_flag(skb, flags, mask,
2782 IFLA_BRPORT_LEARNING, BR_LEARNING) ||
2783 brport_nla_put_flag(skb, flags, mask,
2784 IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) ||
2785 brport_nla_put_flag(skb, flags, mask,
2786 IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) ||
2787 brport_nla_put_flag(skb, flags, mask,
2788 IFLA_BRPORT_PROXYARP, BR_PROXYARP)) {
2789 nla_nest_cancel(skb, protinfo);
2790 goto nla_put_failure;
2791 }
2792
2793 nla_nest_end(skb, protinfo);
2794
2675 return nlmsg_end(skb, nlh); 2795 return nlmsg_end(skb, nlh);
2676nla_put_failure: 2796nla_put_failure:
2677 nlmsg_cancel(skb, nlh); 2797 nlmsg_cancel(skb, nlh);
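
ndo_dflt_bridge_getlink() now takes the port's bridge flags plus a mask of which ones to report, and emits them as an IFLA_PROTINFO nest of u8 attributes via brport_nla_put_flag(); IFLA_BRIDGE_MODE is only included when the mode is meaningful. A hedged sketch of a driver getlink op feeding it a fixed flag set; the chosen flags are just an example and the op signature is quoted from memory for this kernel era:

    static int mydrv_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
                                    struct net_device *dev, u32 filter_mask)
    {
            u32 mask  = BR_LEARNING | BR_LEARNING_SYNC;
            u32 flags = BR_LEARNING;        /* example: learning on, sync off */

            return ndo_dflt_bridge_getlink(skb, pid, seq, dev,
                                           BRIDGE_MODE_UNDEF, flags, mask);
    }
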
diff --git a/net/core/scm.c b/net/core/scm.c
index b442e7e25e60..3b6899b7d810 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -129,8 +129,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
129 struct cmsghdr *cmsg; 129 struct cmsghdr *cmsg;
130 int err; 130 int err;
131 131
132 for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) 132 for_each_cmsghdr(cmsg, msg) {
133 {
134 err = -EINVAL; 133 err = -EINVAL;
135 134
136 /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */ 135 /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */
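
for_each_cmsghdr() is just the CMSG_FIRSTHDR()/CMSG_NXTHDR() walk wrapped in a macro, so any kernel control-message loop can now read as below (fragment only; the SCM_RIGHTS check is an arbitrary example):

    struct cmsghdr *cmsg;

    for_each_cmsghdr(cmsg, msg) {
            if (!CMSG_OK(msg, cmsg))
                    return -EINVAL;
            if (cmsg->cmsg_level == SOL_SOCKET &&
                cmsg->cmsg_type == SCM_RIGHTS)
                    pr_debug("fd-passing cmsg, len %zu\n",
                             (size_t)cmsg->cmsg_len);
    }
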
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 32e31c299631..ae13ef6b3ea7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -265,7 +265,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
265 skb->fclone = SKB_FCLONE_ORIG; 265 skb->fclone = SKB_FCLONE_ORIG;
266 atomic_set(&fclones->fclone_ref, 1); 266 atomic_set(&fclones->fclone_ref, 1);
267 267
268 fclones->skb2.fclone = SKB_FCLONE_FREE; 268 fclones->skb2.fclone = SKB_FCLONE_CLONE;
269 fclones->skb2.pfmemalloc = pfmemalloc; 269 fclones->skb2.pfmemalloc = pfmemalloc;
270 } 270 }
271out: 271out:
@@ -336,59 +336,85 @@ struct netdev_alloc_cache {
336 unsigned int pagecnt_bias; 336 unsigned int pagecnt_bias;
337}; 337};
338static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); 338static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
339static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache);
339 340
340static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) 341static struct page *__page_frag_refill(struct netdev_alloc_cache *nc,
342 gfp_t gfp_mask)
341{ 343{
342 struct netdev_alloc_cache *nc; 344 const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER;
343 void *data = NULL; 345 struct page *page = NULL;
344 int order; 346 gfp_t gfp = gfp_mask;
345 unsigned long flags; 347
348 if (order) {
349 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
350 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
351 nc->frag.size = PAGE_SIZE << (page ? order : 0);
352 }
346 353
347 local_irq_save(flags); 354 if (unlikely(!page))
348 nc = this_cpu_ptr(&netdev_alloc_cache); 355 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
349 if (unlikely(!nc->frag.page)) { 356
357 nc->frag.page = page;
358
359 return page;
360}
361
362static void *__alloc_page_frag(struct netdev_alloc_cache __percpu *cache,
363 unsigned int fragsz, gfp_t gfp_mask)
364{
365 struct netdev_alloc_cache *nc = this_cpu_ptr(cache);
366 struct page *page = nc->frag.page;
367 unsigned int size;
368 int offset;
369
370 if (unlikely(!page)) {
350refill: 371refill:
351 for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) { 372 page = __page_frag_refill(nc, gfp_mask);
352 gfp_t gfp = gfp_mask; 373 if (!page)
374 return NULL;
375
376 /* if size can vary use frag.size else just use PAGE_SIZE */
377 size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
353 378
354 if (order)
355 gfp |= __GFP_COMP | __GFP_NOWARN;
356 nc->frag.page = alloc_pages(gfp, order);
357 if (likely(nc->frag.page))
358 break;
359 if (--order < 0)
360 goto end;
361 }
362 nc->frag.size = PAGE_SIZE << order;
363 /* Even if we own the page, we do not use atomic_set(). 379 /* Even if we own the page, we do not use atomic_set().
364 * This would break get_page_unless_zero() users. 380 * This would break get_page_unless_zero() users.
365 */ 381 */
366 atomic_add(NETDEV_PAGECNT_MAX_BIAS - 1, 382 atomic_add(size - 1, &page->_count);
367 &nc->frag.page->_count); 383
368 nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS; 384 /* reset page count bias and offset to start of new frag */
369 nc->frag.offset = 0; 385 nc->pagecnt_bias = size;
386 nc->frag.offset = size;
370 } 387 }
371 388
372 if (nc->frag.offset + fragsz > nc->frag.size) { 389 offset = nc->frag.offset - fragsz;
373 if (atomic_read(&nc->frag.page->_count) != nc->pagecnt_bias) { 390 if (unlikely(offset < 0)) {
374 if (!atomic_sub_and_test(nc->pagecnt_bias, 391 if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
375 &nc->frag.page->_count)) 392 goto refill;
376 goto refill; 393
377 /* OK, page count is 0, we can safely set it */ 394 /* if size can vary use frag.size else just use PAGE_SIZE */
378 atomic_set(&nc->frag.page->_count, 395 size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
379 NETDEV_PAGECNT_MAX_BIAS); 396
380 } else { 397 /* OK, page count is 0, we can safely set it */
381 atomic_add(NETDEV_PAGECNT_MAX_BIAS - nc->pagecnt_bias, 398 atomic_set(&page->_count, size);
382 &nc->frag.page->_count); 399
383 } 400 /* reset page count bias and offset to start of new frag */
384 nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS; 401 nc->pagecnt_bias = size;
385 nc->frag.offset = 0; 402 offset = size - fragsz;
386 } 403 }
387 404
388 data = page_address(nc->frag.page) + nc->frag.offset;
389 nc->frag.offset += fragsz;
390 nc->pagecnt_bias--; 405 nc->pagecnt_bias--;
391end: 406 nc->frag.offset = offset;
407
408 return page_address(page) + offset;
409}
410
411static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
412{
413 unsigned long flags;
414 void *data;
415
416 local_irq_save(flags);
417 data = __alloc_page_frag(&netdev_alloc_cache, fragsz, gfp_mask);
392 local_irq_restore(flags); 418 local_irq_restore(flags);
393 return data; 419 return data;
394} 420}
@@ -406,11 +432,25 @@ void *netdev_alloc_frag(unsigned int fragsz)
406} 432}
407EXPORT_SYMBOL(netdev_alloc_frag); 433EXPORT_SYMBOL(netdev_alloc_frag);
408 434
435static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
436{
437 return __alloc_page_frag(&napi_alloc_cache, fragsz, gfp_mask);
438}
439
440void *napi_alloc_frag(unsigned int fragsz)
441{
442 return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
443}
444EXPORT_SYMBOL(napi_alloc_frag);
445
409/** 446/**
410 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device 447 * __alloc_rx_skb - allocate an skbuff for rx
411 * @dev: network device to receive on
412 * @length: length to allocate 448 * @length: length to allocate
413 * @gfp_mask: get_free_pages mask, passed to alloc_skb 449 * @gfp_mask: get_free_pages mask, passed to alloc_skb
450 * @flags: If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
451 * allocations in case we have to fallback to __alloc_skb()
452 * If SKB_ALLOC_NAPI is set, page fragment will be allocated
453 * from napi_cache instead of netdev_cache.
414 * 454 *
415 * Allocate a new &sk_buff and assign it a usage count of one. The 455 * Allocate a new &sk_buff and assign it a usage count of one. The
416 * buffer has unspecified headroom built in. Users should allocate 456 * buffer has unspecified headroom built in. Users should allocate
@@ -419,11 +459,11 @@ EXPORT_SYMBOL(netdev_alloc_frag);
419 * 459 *
420 * %NULL is returned if there is no free memory. 460 * %NULL is returned if there is no free memory.
421 */ 461 */
422struct sk_buff *__netdev_alloc_skb(struct net_device *dev, 462static struct sk_buff *__alloc_rx_skb(unsigned int length, gfp_t gfp_mask,
423 unsigned int length, gfp_t gfp_mask) 463 int flags)
424{ 464{
425 struct sk_buff *skb = NULL; 465 struct sk_buff *skb = NULL;
426 unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) + 466 unsigned int fragsz = SKB_DATA_ALIGN(length) +
427 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 467 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
428 468
429 if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { 469 if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
@@ -432,7 +472,9 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
432 if (sk_memalloc_socks()) 472 if (sk_memalloc_socks())
433 gfp_mask |= __GFP_MEMALLOC; 473 gfp_mask |= __GFP_MEMALLOC;
434 474
435 data = __netdev_alloc_frag(fragsz, gfp_mask); 475 data = (flags & SKB_ALLOC_NAPI) ?
476 __napi_alloc_frag(fragsz, gfp_mask) :
477 __netdev_alloc_frag(fragsz, gfp_mask);
436 478
437 if (likely(data)) { 479 if (likely(data)) {
438 skb = build_skb(data, fragsz); 480 skb = build_skb(data, fragsz);
@@ -440,17 +482,72 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
440 put_page(virt_to_head_page(data)); 482 put_page(virt_to_head_page(data));
441 } 483 }
442 } else { 484 } else {
443 skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 485 skb = __alloc_skb(length, gfp_mask,
444 SKB_ALLOC_RX, NUMA_NO_NODE); 486 SKB_ALLOC_RX, NUMA_NO_NODE);
445 } 487 }
488 return skb;
489}
490
491/**
492 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
493 * @dev: network device to receive on
494 * @length: length to allocate
495 * @gfp_mask: get_free_pages mask, passed to alloc_skb
496 *
497 * Allocate a new &sk_buff and assign it a usage count of one. The
498 * buffer has NET_SKB_PAD headroom built in. Users should allocate
499 * the headroom they think they need without accounting for the
500 * built in space. The built in space is used for optimisations.
501 *
502 * %NULL is returned if there is no free memory.
503 */
504struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
505 unsigned int length, gfp_t gfp_mask)
506{
507 struct sk_buff *skb;
508
509 length += NET_SKB_PAD;
510 skb = __alloc_rx_skb(length, gfp_mask, 0);
511
446 if (likely(skb)) { 512 if (likely(skb)) {
447 skb_reserve(skb, NET_SKB_PAD); 513 skb_reserve(skb, NET_SKB_PAD);
448 skb->dev = dev; 514 skb->dev = dev;
449 } 515 }
516
450 return skb; 517 return skb;
451} 518}
452EXPORT_SYMBOL(__netdev_alloc_skb); 519EXPORT_SYMBOL(__netdev_alloc_skb);
453 520
521/**
522 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
523 * @napi: napi instance this buffer was allocated for
524 * @length: length to allocate
525 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
526 *
527 * Allocate a new sk_buff for use in NAPI receive. This buffer will
528 * attempt to allocate the head from a special reserved region used
529 * only for NAPI Rx allocation. By doing this we can save several
530 * CPU cycles by avoiding having to disable and re-enable IRQs.
531 *
532 * %NULL is returned if there is no free memory.
533 */
534struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
535 unsigned int length, gfp_t gfp_mask)
536{
537 struct sk_buff *skb;
538
539 length += NET_SKB_PAD + NET_IP_ALIGN;
540 skb = __alloc_rx_skb(length, gfp_mask, SKB_ALLOC_NAPI);
541
542 if (likely(skb)) {
543 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
544 skb->dev = napi->dev;
545 }
546
547 return skb;
548}
549EXPORT_SYMBOL(__napi_alloc_skb);
550
454void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, 551void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
455 int size, unsigned int truesize) 552 int size, unsigned int truesize)
456{ 553{
@@ -541,26 +638,27 @@ static void kfree_skbmem(struct sk_buff *skb)
541 switch (skb->fclone) { 638 switch (skb->fclone) {
542 case SKB_FCLONE_UNAVAILABLE: 639 case SKB_FCLONE_UNAVAILABLE:
543 kmem_cache_free(skbuff_head_cache, skb); 640 kmem_cache_free(skbuff_head_cache, skb);
544 break; 641 return;
545 642
546 case SKB_FCLONE_ORIG: 643 case SKB_FCLONE_ORIG:
547 fclones = container_of(skb, struct sk_buff_fclones, skb1); 644 fclones = container_of(skb, struct sk_buff_fclones, skb1);
548 if (atomic_dec_and_test(&fclones->fclone_ref))
549 kmem_cache_free(skbuff_fclone_cache, fclones);
550 break;
551 645
552 case SKB_FCLONE_CLONE: 646 /* We usually free the clone (TX completion) before original skb
553 fclones = container_of(skb, struct sk_buff_fclones, skb2); 647 * This test would have no chance to be true for the clone,
554 648 * while here, branch prediction will be good.
555 /* The clone portion is available for
556 * fast-cloning again.
557 */ 649 */
558 skb->fclone = SKB_FCLONE_FREE; 650 if (atomic_read(&fclones->fclone_ref) == 1)
651 goto fastpath;
652 break;
559 653
560 if (atomic_dec_and_test(&fclones->fclone_ref)) 654 default: /* SKB_FCLONE_CLONE */
561 kmem_cache_free(skbuff_fclone_cache, fclones); 655 fclones = container_of(skb, struct sk_buff_fclones, skb2);
562 break; 656 break;
563 } 657 }
658 if (!atomic_dec_and_test(&fclones->fclone_ref))
659 return;
660fastpath:
661 kmem_cache_free(skbuff_fclone_cache, fclones);
564} 662}
565 663
566static void skb_release_head_state(struct sk_buff *skb) 664static void skb_release_head_state(struct sk_buff *skb)
@@ -872,15 +970,15 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
872 struct sk_buff_fclones *fclones = container_of(skb, 970 struct sk_buff_fclones *fclones = container_of(skb,
873 struct sk_buff_fclones, 971 struct sk_buff_fclones,
874 skb1); 972 skb1);
875 struct sk_buff *n = &fclones->skb2; 973 struct sk_buff *n;
876 974
877 if (skb_orphan_frags(skb, gfp_mask)) 975 if (skb_orphan_frags(skb, gfp_mask))
878 return NULL; 976 return NULL;
879 977
880 if (skb->fclone == SKB_FCLONE_ORIG && 978 if (skb->fclone == SKB_FCLONE_ORIG &&
881 n->fclone == SKB_FCLONE_FREE) { 979 atomic_read(&fclones->fclone_ref) == 1) {
882 n->fclone = SKB_FCLONE_CLONE; 980 n = &fclones->skb2;
883 atomic_inc(&fclones->fclone_ref); 981 atomic_set(&fclones->fclone_ref, 2);
884 } else { 982 } else {
885 if (skb_pfmemalloc(skb)) 983 if (skb_pfmemalloc(skb))
886 gfp_mask |= __GFP_MEMALLOC; 984 gfp_mask |= __GFP_MEMALLOC;
@@ -3002,7 +3100,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
3002 if (nskb->len == len + doffset) 3100 if (nskb->len == len + doffset)
3003 goto perform_csum_check; 3101 goto perform_csum_check;
3004 3102
3005 if (!sg) { 3103 if (!sg && !nskb->remcsum_offload) {
3006 nskb->ip_summed = CHECKSUM_NONE; 3104 nskb->ip_summed = CHECKSUM_NONE;
3007 nskb->csum = skb_copy_and_csum_bits(head_skb, offset, 3105 nskb->csum = skb_copy_and_csum_bits(head_skb, offset,
3008 skb_put(nskb, len), 3106 skb_put(nskb, len),
@@ -3074,7 +3172,7 @@ skip_fraglist:
3074 nskb->truesize += nskb->data_len; 3172 nskb->truesize += nskb->data_len;
3075 3173
3076perform_csum_check: 3174perform_csum_check:
3077 if (!csum) { 3175 if (!csum && !nskb->remcsum_offload) {
3078 nskb->csum = skb_checksum(nskb, doffset, 3176 nskb->csum = skb_checksum(nskb, doffset,
3079 nskb->len - doffset, 0); 3177 nskb->len - doffset, 0);
3080 nskb->ip_summed = CHECKSUM_NONE; 3178 nskb->ip_summed = CHECKSUM_NONE;
@@ -3088,6 +3186,16 @@ perform_csum_check:
3088 * (see validate_xmit_skb_list() for example) 3186 * (see validate_xmit_skb_list() for example)
3089 */ 3187 */
3090 segs->prev = tail; 3188 segs->prev = tail;
3189
3190 /* Following permits correct backpressure, for protocols
3191 * using skb_set_owner_w().
 3192 * Idea is to transfer ownership from head_skb to last segment.
3193 */
3194 if (head_skb->destructor == sock_wfree) {
3195 swap(tail->truesize, head_skb->truesize);
3196 swap(tail->destructor, head_skb->destructor);
3197 swap(tail->sk, head_skb->sk);
3198 }
3091 return segs; 3199 return segs;
3092 3200
3093err: 3201err:
@@ -4130,6 +4238,113 @@ err_free:
4130} 4238}
4131EXPORT_SYMBOL(skb_vlan_untag); 4239EXPORT_SYMBOL(skb_vlan_untag);
4132 4240
+int skb_ensure_writable(struct sk_buff *skb, int write_len)
+{
+       if (!pskb_may_pull(skb, write_len))
+               return -ENOMEM;
+
+       if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
+               return 0;
+
+       return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(skb_ensure_writable);
+
+/* remove VLAN header from packet and update csum accordingly. */
+static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
+{
+       struct vlan_hdr *vhdr;
+       unsigned int offset = skb->data - skb_mac_header(skb);
+       int err;
+
+       __skb_push(skb, offset);
+       err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
+       if (unlikely(err))
+               goto pull;
+
+       skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
+
+       vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
+       *vlan_tci = ntohs(vhdr->h_vlan_TCI);
+
+       memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
+       __skb_pull(skb, VLAN_HLEN);
+
+       vlan_set_encap_proto(skb, vhdr);
+       skb->mac_header += VLAN_HLEN;
+
+       if (skb_network_offset(skb) < ETH_HLEN)
+               skb_set_network_header(skb, ETH_HLEN);
+
+       skb_reset_mac_len(skb);
+pull:
+       __skb_pull(skb, offset);
+
+       return err;
+}
+
+int skb_vlan_pop(struct sk_buff *skb)
+{
+       u16 vlan_tci;
+       __be16 vlan_proto;
+       int err;
+
+       if (likely(vlan_tx_tag_present(skb))) {
+               skb->vlan_tci = 0;
+       } else {
+               if (unlikely((skb->protocol != htons(ETH_P_8021Q) &&
+                             skb->protocol != htons(ETH_P_8021AD)) ||
+                            skb->len < VLAN_ETH_HLEN))
+                       return 0;
+
+               err = __skb_vlan_pop(skb, &vlan_tci);
+               if (err)
+                       return err;
+       }
+       /* move next vlan tag to hw accel tag */
+       if (likely((skb->protocol != htons(ETH_P_8021Q) &&
+                   skb->protocol != htons(ETH_P_8021AD)) ||
+                  skb->len < VLAN_ETH_HLEN))
+               return 0;
+
+       vlan_proto = skb->protocol;
+       err = __skb_vlan_pop(skb, &vlan_tci);
+       if (unlikely(err))
+               return err;
+
+       __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+       return 0;
+}
+EXPORT_SYMBOL(skb_vlan_pop);
+
+int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
+{
+       if (vlan_tx_tag_present(skb)) {
+               unsigned int offset = skb->data - skb_mac_header(skb);
+               int err;
+
+               /* __vlan_insert_tag expects skb->data pointing to mac header.
+                * So change skb->data before calling it and change back to
+                * original position later
+                */
+               __skb_push(skb, offset);
+               err = __vlan_insert_tag(skb, skb->vlan_proto,
+                                       vlan_tx_tag_get(skb));
+               if (err)
+                       return err;
+               skb->protocol = skb->vlan_proto;
+               skb->mac_len += VLAN_HLEN;
+               __skb_pull(skb, offset);
+
+               if (skb->ip_summed == CHECKSUM_COMPLETE)
+                       skb->csum = csum_add(skb->csum, csum_partial(skb->data
+                                       + (2 * ETH_ALEN), VLAN_HLEN, 0));
+       }
+       __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+       return 0;
+}
+EXPORT_SYMBOL(skb_vlan_push);
+
 /**
  * alloc_skb_with_frags - allocate skb with page frags
  *
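
The new skb_ensure_writable() and skb_vlan_pop()/skb_vlan_push() helpers are exported for callers outside skbuff.c that need to manipulate VLAN tags on the fly. A minimal, hypothetical in-kernel caller might look like the sketch below; example_retag() and its parameters are illustrative only and are not part of this patch:

/* Hypothetical sketch only (not from this patch): replace the outer VLAN
 * tag with a new one, the way a VLAN-rewriting action might.
 */
static int example_retag(struct sk_buff *skb, __be16 new_proto, u16 new_tci)
{
        int err;

        err = skb_vlan_pop(skb);        /* strip (or clear) the outer tag */
        if (err)
                return err;

        return skb_vlan_push(skb, new_proto, new_tci);
}
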
diff --git a/net/core/sock.c b/net/core/sock.c
index 15e0c67b1069..9a56b2000c3f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -888,6 +888,19 @@ set_rcvbuf:
                 }
                 break;
 
+        case SO_ATTACH_BPF:
+                ret = -EINVAL;
+                if (optlen == sizeof(u32)) {
+                        u32 ufd;
+
+                        ret = -EFAULT;
+                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
+                                break;
+
+                        ret = sk_attach_bpf(ufd, sk);
+                }
+                break;
+
         case SO_DETACH_FILTER:
                 ret = sk_detach_filter(sk);
                 break;
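
From user space, SO_ATTACH_BPF takes a single u32: the file descriptor of an already-loaded eBPF program. A hedged sketch (the program fd is assumed to come from a prior bpf(BPF_PROG_LOAD, ...) call, which is not shown):

/* Hypothetical user-space sketch: attach a previously loaded eBPF program
 * to a socket as its filter.
 */
#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_ATTACH_BPF
#define SO_ATTACH_BPF 50        /* asm-generic value; may differ per arch */
#endif

static int attach_bpf_filter(int sock_fd, int prog_fd)
{
        /* the kernel expects exactly sizeof(u32) bytes holding the prog fd */
        if (setsockopt(sock_fd, SOL_SOCKET, SO_ATTACH_BPF,
                       &prog_fd, sizeof(prog_fd)) < 0) {
                perror("setsockopt(SO_ATTACH_BPF)");
                return -1;
        }
        return 0;
}
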
@@ -1213,6 +1226,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
                 v.val = sk->sk_max_pacing_rate;
                 break;
 
+        case SO_INCOMING_CPU:
+                v.val = sk->sk_incoming_cpu;
+                break;
+
         default:
                 return -ENOPROTOOPT;
         }
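
The read side simply reports sk->sk_incoming_cpu as an int. A hedged user-space sketch of querying it:

/* Hypothetical user-space sketch: ask on which CPU packets for this socket
 * were last received/steered (SO_INCOMING_CPU).
 */
#include <sys/socket.h>

#ifndef SO_INCOMING_CPU
#define SO_INCOMING_CPU 49      /* asm-generic value; may differ per arch */
#endif

static int get_incoming_cpu(int sock_fd)
{
        int cpu = -1;
        socklen_t len = sizeof(cpu);

        if (getsockopt(sock_fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len) < 0)
                return -1;
        return cpu;
}
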
@@ -1517,6 +1534,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 
                 newsk->sk_err      = 0;
                 newsk->sk_priority = 0;
+                newsk->sk_incoming_cpu = raw_smp_processor_id();
                 /*
                  * Before updating sk_refcnt, we must commit prior changes to memory
                  * (Documentation/RCU/rculist_nulls.txt for details)
@@ -2457,7 +2475,7 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
                 msg->msg_flags |= MSG_TRUNC;
                 copied = len;
         }
-        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+        err = skb_copy_datagram_msg(skb, 0, msg, copied);
         if (err)
                 goto out_free_skb;
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cf9cd13509a7..31baba2a71ce 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -26,6 +26,8 @@ static int zero = 0;
 static int one = 1;
 static int ushort_max = USHRT_MAX;
 
+static int net_msg_warn;        /* Unused, but still a sysctl */
+
 #ifdef CONFIG_RPS
 static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -215,6 +217,18 @@ static int set_default_qdisc(struct ctl_table *table, int write,
 }
 #endif
 
+static int proc_do_rss_key(struct ctl_table *table, int write,
+                           void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+        struct ctl_table fake_table;
+        char buf[NETDEV_RSS_KEY_LEN * 3];
+
+        snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key);
+        fake_table.data = buf;
+        fake_table.maxlen = sizeof(buf);
+        return proc_dostring(&fake_table, write, buffer, lenp, ppos);
+}
+
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
         {
@@ -263,6 +277,13 @@ static struct ctl_table net_core_table[] = {
                 .mode = 0644,
                 .proc_handler = proc_dointvec
         },
+        {
+                .procname = "netdev_rss_key",
+                .data = &netdev_rss_key,
+                .maxlen = sizeof(int),
+                .mode = 0444,
+                .proc_handler = proc_do_rss_key,
+        },
 #ifdef CONFIG_BPF_JIT
         {
                 .procname = "bpf_jit_enable",
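
proc_do_rss_key() formats the NETDEV_RSS_KEY_LEN-byte key as colon-separated hex ("%*phC"), and the table entry exposes it read-only (mode 0444). A hedged user-space sketch that dumps it:

/* Hypothetical user-space sketch: print the host RSS key exposed read-only
 * through the new net.core.netdev_rss_key sysctl.
 */
#include <stdio.h>

int main(void)
{
        char key[256];  /* NETDEV_RSS_KEY_LEN (52) bytes * 3 chars fits easily */
        FILE *f = fopen("/proc/sys/net/core/netdev_rss_key", "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        if (fgets(key, sizeof(key), f))
                printf("host RSS key: %s", key);
        fclose(f);
        return 0;
}
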
diff --git a/net/core/utils.c b/net/core/utils.c
index efc76dd9dcd1..7b803884c162 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -33,9 +33,6 @@
 #include <asm/byteorder.h>
 #include <asm/uaccess.h>
 
-int net_msg_warn __read_mostly = 1;
-EXPORT_SYMBOL(net_msg_warn);
-
 DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10);
 /*
  * All net warning printk()s should be guarded by this function.