aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/networking
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/networking')
-rw-r--r--Documentation/networking/dccp.txt3
-rw-r--r--Documentation/networking/ip-sysctl.txt148
-rw-r--r--Documentation/networking/ixgbe.txt199
-rw-r--r--Documentation/networking/rds.txt356
-rw-r--r--Documentation/networking/timestamping.txt180
-rw-r--r--Documentation/networking/timestamping/.gitignore1
-rw-r--r--Documentation/networking/timestamping/Makefile6
-rw-r--r--Documentation/networking/timestamping/timestamping.c533
8 files changed, 1358 insertions, 68 deletions
diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt
index 7a3bb1abb830..b132e4a3cf0f 100644
--- a/Documentation/networking/dccp.txt
+++ b/Documentation/networking/dccp.txt
@@ -141,7 +141,8 @@ rx_ccid = 2
141 Default CCID for the receiver-sender half-connection; see tx_ccid. 141 Default CCID for the receiver-sender half-connection; see tx_ccid.
142 142
143seq_window = 100 143seq_window = 100
144 The initial sequence window (sec. 7.5.2). 144 The initial sequence window (sec. 7.5.2) of the sender. This influences
145 the local ackno validity and the remote seqno validity windows (7.5.1).
145 146
146tx_qlen = 5 147tx_qlen = 5
147 The size of the transmit buffer in packets. A value of 0 corresponds 148 The size of the transmit buffer in packets. A value of 0 corresponds
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index c7712787933c..ec5de02f543f 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -2,7 +2,7 @@
2 2
3ip_forward - BOOLEAN 3ip_forward - BOOLEAN
4 0 - disabled (default) 4 0 - disabled (default)
5 not 0 - enabled 5 not 0 - enabled
6 6
7 Forward Packets between interfaces. 7 Forward Packets between interfaces.
8 8
@@ -36,49 +36,49 @@ rt_cache_rebuild_count - INTEGER
36IP Fragmentation: 36IP Fragmentation:
37 37
38ipfrag_high_thresh - INTEGER 38ipfrag_high_thresh - INTEGER
39 Maximum memory used to reassemble IP fragments. When 39 Maximum memory used to reassemble IP fragments. When
40 ipfrag_high_thresh bytes of memory is allocated for this purpose, 40 ipfrag_high_thresh bytes of memory is allocated for this purpose,
41 the fragment handler will toss packets until ipfrag_low_thresh 41 the fragment handler will toss packets until ipfrag_low_thresh
42 is reached. 42 is reached.
43 43
44ipfrag_low_thresh - INTEGER 44ipfrag_low_thresh - INTEGER
45 See ipfrag_high_thresh 45 See ipfrag_high_thresh
46 46
47ipfrag_time - INTEGER 47ipfrag_time - INTEGER
48 Time in seconds to keep an IP fragment in memory. 48 Time in seconds to keep an IP fragment in memory.
49 49
50ipfrag_secret_interval - INTEGER 50ipfrag_secret_interval - INTEGER
51 Regeneration interval (in seconds) of the hash secret (or lifetime 51 Regeneration interval (in seconds) of the hash secret (or lifetime
52 for the hash secret) for IP fragments. 52 for the hash secret) for IP fragments.
53 Default: 600 53 Default: 600
54 54
55ipfrag_max_dist - INTEGER 55ipfrag_max_dist - INTEGER
56 ipfrag_max_dist is a non-negative integer value which defines the 56 ipfrag_max_dist is a non-negative integer value which defines the
57 maximum "disorder" which is allowed among fragments which share a 57 maximum "disorder" which is allowed among fragments which share a
58 common IP source address. Note that reordering of packets is 58 common IP source address. Note that reordering of packets is
59 not unusual, but if a large number of fragments arrive from a source 59 not unusual, but if a large number of fragments arrive from a source
60 IP address while a particular fragment queue remains incomplete, it 60 IP address while a particular fragment queue remains incomplete, it
61 probably indicates that one or more fragments belonging to that queue 61 probably indicates that one or more fragments belonging to that queue
62 have been lost. When ipfrag_max_dist is positive, an additional check 62 have been lost. When ipfrag_max_dist is positive, an additional check
63 is done on fragments before they are added to a reassembly queue - if 63 is done on fragments before they are added to a reassembly queue - if
64 ipfrag_max_dist (or more) fragments have arrived from a particular IP 64 ipfrag_max_dist (or more) fragments have arrived from a particular IP
65 address between additions to any IP fragment queue using that source 65 address between additions to any IP fragment queue using that source
66 address, it's presumed that one or more fragments in the queue are 66 address, it's presumed that one or more fragments in the queue are
67 lost. The existing fragment queue will be dropped, and a new one 67 lost. The existing fragment queue will be dropped, and a new one
68 started. An ipfrag_max_dist value of zero disables this check. 68 started. An ipfrag_max_dist value of zero disables this check.
69 69
70 Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can 70 Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can
71 result in unnecessarily dropping fragment queues when normal 71 result in unnecessarily dropping fragment queues when normal
72 reordering of packets occurs, which could lead to poor application 72 reordering of packets occurs, which could lead to poor application
73 performance. Using a very large value, e.g. 50000, increases the 73 performance. Using a very large value, e.g. 50000, increases the
74 likelihood of incorrectly reassembling IP fragments that originate 74 likelihood of incorrectly reassembling IP fragments that originate
75 from different IP datagrams, which could result in data corruption. 75 from different IP datagrams, which could result in data corruption.
76 Default: 64 76 Default: 64
77 77
78INET peer storage: 78INET peer storage:
79 79
80inet_peer_threshold - INTEGER 80inet_peer_threshold - INTEGER
81 The approximate size of the storage. Starting from this threshold 81 The approximate size of the storage. Starting from this threshold
82 entries will be thrown aggressively. This threshold also determines 82 entries will be thrown aggressively. This threshold also determines
83 entries' time-to-live and time intervals between garbage collection 83 entries' time-to-live and time intervals between garbage collection
84 passes. More entries, less time-to-live, less GC interval. 84 passes. More entries, less time-to-live, less GC interval.
@@ -105,7 +105,7 @@ inet_peer_gc_maxtime - INTEGER
105 in effect under low (or absent) memory pressure on the pool. 105 in effect under low (or absent) memory pressure on the pool.
106 Measured in seconds. 106 Measured in seconds.
107 107
108TCP variables: 108TCP variables:
109 109
110somaxconn - INTEGER 110somaxconn - INTEGER
111 Limit of socket listen() backlog, known in userspace as SOMAXCONN. 111 Limit of socket listen() backlog, known in userspace as SOMAXCONN.
@@ -310,7 +310,7 @@ tcp_orphan_retries - INTEGER
310 310
311tcp_reordering - INTEGER 311tcp_reordering - INTEGER
312 Maximal reordering of packets in a TCP stream. 312 Maximal reordering of packets in a TCP stream.
313 Default: 3 313 Default: 3
314 314
315tcp_retrans_collapse - BOOLEAN 315tcp_retrans_collapse - BOOLEAN
316 Bug-to-bug compatibility with some broken printers. 316 Bug-to-bug compatibility with some broken printers.
@@ -521,7 +521,7 @@ IP Variables:
521 521
522ip_local_port_range - 2 INTEGERS 522ip_local_port_range - 2 INTEGERS
523 Defines the local port range that is used by TCP and UDP to 523 Defines the local port range that is used by TCP and UDP to
524 choose the local port. The first number is the first, the 524 choose the local port. The first number is the first, the
525 second the last local port number. Default value depends on 525 second the last local port number. Default value depends on
526 amount of memory available on the system: 526 amount of memory available on the system:
527 > 128Mb 32768-61000 527 > 128Mb 32768-61000
@@ -594,12 +594,12 @@ icmp_errors_use_inbound_ifaddr - BOOLEAN
594 594
595 If zero, icmp error messages are sent with the primary address of 595 If zero, icmp error messages are sent with the primary address of
596 the exiting interface. 596 the exiting interface.
597 597
598 If non-zero, the message will be sent with the primary address of 598 If non-zero, the message will be sent with the primary address of
599 the interface that received the packet that caused the icmp error. 599 the interface that received the packet that caused the icmp error.
600 This is the behaviour many network administrators will expect from 600 This is the behaviour many network administrators will expect from
601 a router. And it can make debugging complicated network layouts 601 a router. And it can make debugging complicated network layouts
602 much easier. 602 much easier.
603 603
604 Note that if no primary address exists for the interface selected, 604 Note that if no primary address exists for the interface selected,
605 then the primary address of the first non-loopback interface that 605 then the primary address of the first non-loopback interface that
@@ -611,7 +611,7 @@ igmp_max_memberships - INTEGER
611 Change the maximum number of multicast groups we can subscribe to. 611 Change the maximum number of multicast groups we can subscribe to.
612 Default: 20 612 Default: 20
613 613
614conf/interface/* changes special settings per interface (where "interface" is 614conf/interface/* changes special settings per interface (where "interface" is
615 the name of your network interface) 615 the name of your network interface)
616conf/all/* is special, changes the settings for all interfaces 616conf/all/* is special, changes the settings for all interfaces
617 617
@@ -625,11 +625,11 @@ log_martians - BOOLEAN
625accept_redirects - BOOLEAN 625accept_redirects - BOOLEAN
626 Accept ICMP redirect messages. 626 Accept ICMP redirect messages.
627 accept_redirects for the interface will be enabled if: 627 accept_redirects for the interface will be enabled if:
628 - both conf/{all,interface}/accept_redirects are TRUE in the case forwarding 628 - both conf/{all,interface}/accept_redirects are TRUE in the case
629 for the interface is enabled 629 forwarding for the interface is enabled
630 or 630 or
631 - at least one of conf/{all,interface}/accept_redirects is TRUE in the case 631 - at least one of conf/{all,interface}/accept_redirects is TRUE in the
632 forwarding for the interface is disabled 632 case forwarding for the interface is disabled
633 accept_redirects for the interface will be disabled otherwise 633 accept_redirects for the interface will be disabled otherwise
634 default TRUE (host) 634 default TRUE (host)
635 FALSE (router) 635 FALSE (router)
@@ -640,8 +640,8 @@ forwarding - BOOLEAN
640mc_forwarding - BOOLEAN 640mc_forwarding - BOOLEAN
641 Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE 641 Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE
642 and a multicast routing daemon is required. 642 and a multicast routing daemon is required.
643 conf/all/mc_forwarding must also be set to TRUE to enable multicast routing 643 conf/all/mc_forwarding must also be set to TRUE to enable multicast
644 for the interface 644 routing for the interface
645 645
646medium_id - INTEGER 646medium_id - INTEGER
647 Integer value used to differentiate the devices by the medium they 647 Integer value used to differentiate the devices by the medium they
@@ -649,7 +649,7 @@ medium_id - INTEGER
649 the broadcast packets are received only on one of them. 649 the broadcast packets are received only on one of them.
650 The default value 0 means that the device is the only interface 650 The default value 0 means that the device is the only interface
651 to its medium, value of -1 means that medium is not known. 651 to its medium, value of -1 means that medium is not known.
652 652
653 Currently, it is used to change the proxy_arp behavior: 653 Currently, it is used to change the proxy_arp behavior:
654 the proxy_arp feature is enabled for packets forwarded between 654 the proxy_arp feature is enabled for packets forwarded between
655 two devices attached to different media. 655 two devices attached to different media.
@@ -699,16 +699,22 @@ accept_source_route - BOOLEAN
699 default TRUE (router) 699 default TRUE (router)
700 FALSE (host) 700 FALSE (host)
701 701
702rp_filter - BOOLEAN 702rp_filter - INTEGER
703 1 - do source validation by reversed path, as specified in RFC1812
704 Recommended option for single homed hosts and stub network
705 routers. Could cause troubles for complicated (not loop free)
706 networks running a slow unreliable protocol (sort of RIP),
707 or using static routes.
708
709 0 - No source validation. 703 0 - No source validation.
710 704 1 - Strict mode as defined in RFC3704 Strict Reverse Path
711 conf/all/rp_filter must also be set to TRUE to do source validation 705 Each incoming packet is tested against the FIB and if the interface
706 is not the best reverse path the packet check will fail.
707 By default failed packets are discarded.
708 2 - Loose mode as defined in RFC3704 Loose Reverse Path
709 Each incoming packet's source address is also tested against the FIB
710 and if the source address is not reachable via any interface
711 the packet check will fail.
712
713 Current recommended practice in RFC3704 is to enable strict mode
714 to prevent IP spoofing from DDoS attacks. If using asymmetric routing
715 or other complicated routing, then loose mode is recommended.
716
717 conf/all/rp_filter must also be set to non-zero to do source validation
712 on the interface 718 on the interface
713 719
714 Default value is 0. Note that some distributions enable it 720 Default value is 0. Note that some distributions enable it
@@ -782,6 +788,12 @@ arp_ignore - INTEGER
782 The max value from conf/{all,interface}/arp_ignore is used 788 The max value from conf/{all,interface}/arp_ignore is used
783 when ARP request is received on the {interface} 789 when ARP request is received on the {interface}
784 790
791arp_notify - BOOLEAN
792 Define mode for notification of address and device changes.
793 0 - (default): do nothing
794 1 - Generate gratuitous arp replies when device is brought up
795 or hardware address changes.
796
785arp_accept - BOOLEAN 797arp_accept - BOOLEAN
786 Define behavior when gratuitous arp replies are received: 798 Define behavior when gratuitous arp replies are received:
787 0 - drop gratuitous arp frames 799 0 - drop gratuitous arp frames
@@ -823,7 +835,7 @@ apply to IPv6 [XXX?].
823 835
824bindv6only - BOOLEAN 836bindv6only - BOOLEAN
825 Default value for IPV6_V6ONLY socket option, 837 Default value for IPV6_V6ONLY socket option,
826 which restricts use of the IPv6 socket to IPv6 communication 838 which restricts use of the IPv6 socket to IPv6 communication
827 only. 839 only.
828 TRUE: disable IPv4-mapped address feature 840 TRUE: disable IPv4-mapped address feature
829 FALSE: enable IPv4-mapped address feature 841 FALSE: enable IPv4-mapped address feature
@@ -833,19 +845,19 @@ bindv6only - BOOLEAN
833IPv6 Fragmentation: 845IPv6 Fragmentation:
834 846
835ip6frag_high_thresh - INTEGER 847ip6frag_high_thresh - INTEGER
836 Maximum memory used to reassemble IPv6 fragments. When 848 Maximum memory used to reassemble IPv6 fragments. When
837 ip6frag_high_thresh bytes of memory is allocated for this purpose, 849 ip6frag_high_thresh bytes of memory is allocated for this purpose,
838 the fragment handler will toss packets until ip6frag_low_thresh 850 the fragment handler will toss packets until ip6frag_low_thresh
839 is reached. 851 is reached.
840 852
841ip6frag_low_thresh - INTEGER 853ip6frag_low_thresh - INTEGER
842 See ip6frag_high_thresh 854 See ip6frag_high_thresh
843 855
844ip6frag_time - INTEGER 856ip6frag_time - INTEGER
845 Time in seconds to keep an IPv6 fragment in memory. 857 Time in seconds to keep an IPv6 fragment in memory.
846 858
847ip6frag_secret_interval - INTEGER 859ip6frag_secret_interval - INTEGER
848 Regeneration interval (in seconds) of the hash secret (or lifetime 860 Regeneration interval (in seconds) of the hash secret (or lifetime
849 for the hash secret) for IPv6 fragments. 861 for the hash secret) for IPv6 fragments.
850 Default: 600 862 Default: 600
851 863
@@ -854,17 +866,17 @@ conf/default/*:
854 866
855 867
856conf/all/*: 868conf/all/*:
857 Change all the interface-specific settings. 869 Change all the interface-specific settings.
858 870
859 [XXX: Other special features than forwarding?] 871 [XXX: Other special features than forwarding?]
860 872
861conf/all/forwarding - BOOLEAN 873conf/all/forwarding - BOOLEAN
862 Enable global IPv6 forwarding between all interfaces. 874 Enable global IPv6 forwarding between all interfaces.
863 875
864 IPv4 and IPv6 work differently here; e.g. netfilter must be used 876 IPv4 and IPv6 work differently here; e.g. netfilter must be used
865 to control which interfaces may forward packets and which not. 877 to control which interfaces may forward packets and which not.
866 878
867 This also sets all interfaces' Host/Router setting 879 This also sets all interfaces' Host/Router setting
868 'forwarding' to the specified value. See below for details. 880 'forwarding' to the specified value. See below for details.
869 881
870 This is referred to as global forwarding. 882 This is referred to as global forwarding.
@@ -875,12 +887,12 @@ proxy_ndp - BOOLEAN
875conf/interface/*: 887conf/interface/*:
876 Change special settings per interface. 888 Change special settings per interface.
877 889
878 The functional behaviour for certain settings is different 890 The functional behaviour for certain settings is different
879 depending on whether local forwarding is enabled or not. 891 depending on whether local forwarding is enabled or not.
880 892
881accept_ra - BOOLEAN 893accept_ra - BOOLEAN
882 Accept Router Advertisements; autoconfigure using them. 894 Accept Router Advertisements; autoconfigure using them.
883 895
884 Functional default: enabled if local forwarding is disabled. 896 Functional default: enabled if local forwarding is disabled.
885 disabled if local forwarding is enabled. 897 disabled if local forwarding is enabled.
886 898
@@ -926,7 +938,7 @@ accept_source_route - INTEGER
926 Default: 0 938 Default: 0
927 939
928autoconf - BOOLEAN 940autoconf - BOOLEAN
929 Autoconfigure addresses using Prefix Information in Router 941 Autoconfigure addresses using Prefix Information in Router
930 Advertisements. 942 Advertisements.
931 943
932 Functional default: enabled if accept_ra_pinfo is enabled. 944 Functional default: enabled if accept_ra_pinfo is enabled.
@@ -935,11 +947,11 @@ autoconf - BOOLEAN
935dad_transmits - INTEGER 947dad_transmits - INTEGER
936 The amount of Duplicate Address Detection probes to send. 948 The amount of Duplicate Address Detection probes to send.
937 Default: 1 949 Default: 1
938 950
939forwarding - BOOLEAN 951forwarding - BOOLEAN
940 Configure interface-specific Host/Router behaviour. 952 Configure interface-specific Host/Router behaviour.
941 953
942 Note: It is recommended to have the same setting on all 954 Note: It is recommended to have the same setting on all
943 interfaces; mixed router/host scenarios are rather uncommon. 955 interfaces; mixed router/host scenarios are rather uncommon.
944 956
945 FALSE: 957 FALSE:
@@ -948,13 +960,13 @@ forwarding - BOOLEAN
948 960
949 1. IsRouter flag is not set in Neighbour Advertisements. 961 1. IsRouter flag is not set in Neighbour Advertisements.
950 2. Router Solicitations are being sent when necessary. 962 2. Router Solicitations are being sent when necessary.
951 3. If accept_ra is TRUE (default), accept Router 963 3. If accept_ra is TRUE (default), accept Router
952 Advertisements (and do autoconfiguration). 964 Advertisements (and do autoconfiguration).
953 4. If accept_redirects is TRUE (default), accept Redirects. 965 4. If accept_redirects is TRUE (default), accept Redirects.
954 966
955 TRUE: 967 TRUE:
956 968
957 If local forwarding is enabled, Router behaviour is assumed. 969 If local forwarding is enabled, Router behaviour is assumed.
958 This means exactly the reverse from the above: 970 This means exactly the reverse from the above:
959 971
960 1. IsRouter flag is set in Neighbour Advertisements. 972 1. IsRouter flag is set in Neighbour Advertisements.
@@ -989,7 +1001,7 @@ router_solicitation_interval - INTEGER
989 Default: 4 1001 Default: 4
990 1002
991router_solicitations - INTEGER 1003router_solicitations - INTEGER
992 Number of Router Solicitations to send until assuming no 1004 Number of Router Solicitations to send until assuming no
993 routers are present. 1005 routers are present.
994 Default: 3 1006 Default: 3
995 1007
@@ -1013,11 +1025,11 @@ temp_prefered_lft - INTEGER
1013 1025
1014max_desync_factor - INTEGER 1026max_desync_factor - INTEGER
1015 Maximum value for DESYNC_FACTOR, which is a random value 1027 Maximum value for DESYNC_FACTOR, which is a random value
1016 that ensures that clients don't synchronize with each 1028 that ensures that clients don't synchronize with each
1017 other and generate new addresses at exactly the same time. 1029 other and generate new addresses at exactly the same time.
1018 Value is in seconds. 1030 Value is in seconds.
1019 Default: 600 1031 Default: 600
1020 1032
1021regen_max_retry - INTEGER 1033regen_max_retry - INTEGER
1022 Number of attempts before giving up attempting to generate 1034 Number of attempts before giving up attempting to generate
1023 valid temporary addresses. 1035 valid temporary addresses.
@@ -1025,13 +1037,15 @@ regen_max_retry - INTEGER
1025 1037
1026max_addresses - INTEGER 1038max_addresses - INTEGER
1027 Number of maximum addresses per interface. 0 disables limitation. 1039 Number of maximum addresses per interface. 0 disables limitation.
1028 It is recommended not to set too large a value (or 0) because it would 1040 It is recommended not to set too large a value (or 0) because it would
1029 be too easy a way to crash the kernel by allowing the creation of too many 1041 be too easy a way to crash the kernel by allowing the creation of too many
1030 autoconfigured addresses. 1042 autoconfigured addresses.
1031 Default: 16 1043 Default: 16
1032 1044
1033disable_ipv6 - BOOLEAN 1045disable_ipv6 - BOOLEAN
1034 Disable IPv6 operation. 1046 Disable IPv6 operation. If accept_dad is set to 2, this value
1047 will be dynamically set to TRUE if DAD fails for the link-local
1048 address.
1035 Default: FALSE (enable IPv6 operation) 1049 Default: FALSE (enable IPv6 operation)
1036 1050
1037accept_dad - INTEGER 1051accept_dad - INTEGER
diff --git a/Documentation/networking/ixgbe.txt b/Documentation/networking/ixgbe.txt
new file mode 100644
index 000000000000..eeb68685c788
--- /dev/null
+++ b/Documentation/networking/ixgbe.txt
@@ -0,0 +1,199 @@
1Linux Base Driver for 10 Gigabit PCI Express Intel(R) Network Connection
2========================================================================
3
4March 10, 2009
5
6
7Contents
8========
9
10- In This Release
11- Identifying Your Adapter
12- Building and Installation
13- Additional Configurations
14- Support
15
16
17
18In This Release
19===============
20
21This file describes the ixgbe Linux Base Driver for the 10 Gigabit PCI
22Express Intel(R) Network Connection. This driver includes support for
23Itanium(R)2-based systems.
24
25For questions related to hardware requirements, refer to the documentation
26supplied with your 10 Gigabit adapter. All hardware requirements listed apply
27to use with Linux.
28
29The following features are available in this kernel:
30 - Native VLANs
31 - Channel Bonding (teaming)
32 - SNMP
33 - Generic Receive Offload
34 - Data Center Bridging
35
36Channel Bonding documentation can be found in the Linux kernel source:
37/Documentation/networking/bonding.txt
38
39Ethtool, lspci, and ifconfig can be used to display device and driver
40specific information.
41
42
43Identifying Your Adapter
44========================
45
46This driver supports devices based on the 82598 controller and the 82599
47controller.
48
49For specific information on identifying which adapter you have, please visit:
50
51 http://support.intel.com/support/network/sb/CS-008441.htm
52
53
54Building and Installation
55=========================
56
57select m for "Intel(R) 10GbE PCI Express adapters support" located at:
58 Location:
59 -> Device Drivers
60 -> Network device support (NETDEVICES [=y])
61 -> Ethernet (10000 Mbit) (NETDEV_10000 [=y])
62
631. make modules && make modules_install
64
652. Load the module:
66
67# modprobe ixgbe
68
69 The insmod command can be used if the full
70 path to the driver module is specified. For example:
71
72 insmod /lib/modules/<KERNEL VERSION>/kernel/drivers/net/ixgbe/ixgbe.ko
73
74 With 2.6 based kernels also make sure that older ixgbe drivers are
75 removed from the kernel, before loading the new module:
76
77 rmmod ixgbe; modprobe ixgbe
78
793. Assign an IP address to the interface by entering the following, where
80 x is the interface number:
81
82 ifconfig ethx <IP_address>
83
844. Verify that the interface works. Enter the following, where <IP_address>
85 is the IP address for another machine on the same subnet as the interface
86 that is being tested:
87
88 ping <IP_address>
89
90
91Additional Configurations
92=========================
93
94 Viewing Link Messages
95 ---------------------
96 Link messages will not be displayed to the console if the distribution is
97 restricting system messages. In order to see network driver link messages on
98 your console, set dmesg to eight by entering the following:
99
100 dmesg -n 8
101
102 NOTE: This setting is not saved across reboots.
103
104
105 Jumbo Frames
106 ------------
107 The driver supports Jumbo Frames for all adapters. Jumbo Frames support is
108 enabled by changing the MTU to a value larger than the default of 1500.
109 The maximum value for the MTU is 16110. Use the ifconfig command to
110 increase the MTU size. For example:
111
112 ifconfig ethx mtu 9000 up
113
114 The maximum MTU setting for Jumbo Frames is 16110. This value coincides
115 with the maximum Jumbo Frames size of 16128.
116
117 Generic Receive Offload, aka GRO
118 --------------------------------
119 The driver supports the in-kernel software implementation of GRO. GRO has
120 shown that by coalescing Rx traffic into larger chunks of data, CPU
121 utilization can be significantly reduced when under large Rx load. GRO is an
122 evolution of the previously-used LRO interface. GRO is able to coalesce
123 other protocols besides TCP. It's also safe to use with configurations that
124 are problematic for LRO, namely bridging and iSCSI.
125
126 GRO is enabled by default in the driver. Future versions of ethtool will
127 support disabling and re-enabling GRO on the fly.
128
129
130 Data Center Bridging, aka DCB
131 -----------------------------
132
133 DCB is a configurable Quality of Service implementation in hardware.
134 It uses the VLAN priority tag (802.1p) to filter traffic. That means
135 that there are 8 different priorities that traffic can be filtered into.
136 It also enables priority flow control which can limit or eliminate the
137 number of dropped packets during network stress. Bandwidth can be
138 allocated to each of these priorities, which is enforced at the hardware
139 level.
140
141 To enable DCB support in ixgbe, you must enable the DCB netlink layer to
142 allow the userspace tools (see below) to communicate with the driver.
143 This can be found in the kernel configuration here:
144
145 -> Networking support
146 -> Networking options
147 -> Data Center Bridging support
148
149 Once this is selected, DCB support must be selected for ixgbe. This can
150 be found here:
151
152 -> Device Drivers
153 -> Network device support (NETDEVICES [=y])
154 -> Ethernet (10000 Mbit) (NETDEV_10000 [=y])
155 -> Intel(R) 10GbE PCI Express adapters support
156 -> Data Center Bridging (DCB) Support
157
158 After these options are selected, you must rebuild your kernel and your
159 modules.
160
161 In order to use DCB, userspace tools must be downloaded and installed.
162 The dcbd tools can be found at:
163
164 http://e1000.sf.net
165
166
167 Ethtool
168 -------
169 The driver utilizes the ethtool interface for driver configuration and
170 diagnostics, as well as displaying statistical information. Ethtool
171 version 3.0 or later is required for this functionality.
172
173 The latest release of ethtool can be found from
174 http://sourceforge.net/projects/gkernel.
175
176
177 NAPI
178 ----
179
180 NAPI (Rx polling mode) is supported in the ixgbe driver. NAPI is enabled
181 by default in the driver.
182
183 See www.cyberus.ca/~hadi/usenix-paper.tgz for more information on NAPI.
184
185
186Support
187=======
188
189For general information, go to the Intel support website at:
190
191 http://support.intel.com
192
193or the Intel Wired Networking project hosted by Sourceforge at:
194
195 http://e1000.sourceforge.net
196
197If an issue is identified with the released source code on the supported
198kernel with a supported adapter, email the specific information related
199to the issue to e1000-devel@lists.sf.net
diff --git a/Documentation/networking/rds.txt b/Documentation/networking/rds.txt
new file mode 100644
index 000000000000..c67077cbeb80
--- /dev/null
+++ b/Documentation/networking/rds.txt
@@ -0,0 +1,356 @@
1
2Overview
3========
4
5This readme tries to provide some background on the hows and whys of RDS,
6and will hopefully help you find your way around the code.
7
8In addition, please see this email about RDS origins:
9http://oss.oracle.com/pipermail/rds-devel/2007-November/000228.html
10
11RDS Architecture
12================
13
14RDS provides reliable, ordered datagram delivery by using a single
15reliable connection between any two nodes in the cluster. This allows
16applications to use a single socket to talk to any other process in the
17cluster - so in a cluster with N processes you need N sockets, in contrast
18to N*N if you use a connection-oriented socket transport like TCP.
19
20RDS is not Infiniband-specific; it was designed to support different
21transports. The current implementation used to support RDS over TCP as well
22as IB. Work is in progress to support RDS over iWARP, and using DCE to
23guarantee no dropped packets on Ethernet, it may be possible to use RDS over
24UDP in the future.
25
26The high-level semantics of RDS from the application's point of view are
27
28 * Addressing
29 RDS uses IPv4 addresses and 16bit port numbers to identify
30 the end point of a connection. All socket operations that involve
31 passing addresses between kernel and user space generally
32 use a struct sockaddr_in.
33
34 The fact that IPv4 addresses are used does not mean the underlying
35 transport has to be IP-based. In fact, RDS over IB uses a
36 reliable IB connection; the IP address is used exclusively to
37 locate the remote node's GID (by ARPing for the given IP).
38
39 The port space is entirely independent of UDP, TCP or any other
40 protocol.
41
42 * Socket interface
43 RDS sockets work *mostly* as you would expect from a BSD
44 socket. The next section will cover the details. At any rate,
45 all I/O is performed through the standard BSD socket API.
46 Some additions like zerocopy support are implemented through
47 control messages, while other extensions use the getsockopt/
48 setsockopt calls.
49
50 Sockets must be bound before you can send or receive data.
51 This is needed because binding also selects a transport and
52 attaches it to the socket. Once bound, the transport assignment
53 does not change. RDS will tolerate IPs moving around (e.g. in
54 an active-active HA scenario), but only as long as the address
55 doesn't move to a different transport.
56
57 * sysctls
58 RDS supports a number of sysctls in /proc/sys/net/rds
59
60
61Socket Interface
62================
63
64 AF_RDS, PF_RDS, SOL_RDS
65 These constants haven't been assigned yet, because RDS isn't in
66 mainline yet. Currently, the kernel module assigns some constant
67 and publishes it to user space through two sysctl files
68 /proc/sys/net/rds/pf_rds
69 /proc/sys/net/rds/sol_rds
70
71 fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
72 This creates a new, unbound RDS socket.
73
74 setsockopt(SOL_SOCKET): send and receive buffer size
75 RDS honors the send and receive buffer size socket options.
76 You are not allowed to queue more than SO_SNDSIZE bytes to
77 a socket. A message is queued when sendmsg is called, and
78 it leaves the queue when the remote system acknowledges
79 its arrival.
80
81 The SO_RCVSIZE option controls the maximum receive queue length.
82 This is a soft limit rather than a hard limit - RDS will
83 continue to accept and queue incoming messages, even if that
84 takes the queue length over the limit. However, it will also
85 mark the port as "congested" and send a congestion update to
86 the source node. The source node is supposed to throttle any
87 processes sending to this congested port.
88
89 bind(fd, &sockaddr_in, ...)
90 This binds the socket to a local IP address and port, and a
91 transport.
92
93 sendmsg(fd, ...)
94 Sends a message to the indicated recipient. The kernel will
95 transparently establish the underlying reliable connection
96 if it isn't up yet.
97
98 An attempt to send a message that exceeds SO_SNDSIZE will
99 return with -EMSGSIZE
100
101 An attempt to send a message that would take the total number
102 of queued bytes over the SO_SNDSIZE threshold will return
103 EAGAIN.
104
105 An attempt to send a message to a destination that is marked
106 as "congested" will return ENOBUFS.
107
108 recvmsg(fd, ...)
109 Receives a message that was queued to this socket. The socket's
110 recv queue accounting is adjusted, and if the queue length
111 drops below SO_RCVSIZE, the port is marked uncongested, and
112 a congestion update is sent to all peers.
113
114 Applications can ask the RDS kernel module to receive
115 notifications via control messages (for instance, there is a
116 notification when a congestion update arrived, or when an RDMA
117 operation completes). These notifications are received through
118 the msg.msg_control buffer of struct msghdr. The format of the
119 messages is described in manpages.
120
121 poll(fd)
122 RDS supports the poll interface to allow the application
123 to implement async I/O.
124
125 POLLIN handling is pretty straightforward. When there's an
126 incoming message queued to the socket, or a pending notification,
127 we signal POLLIN.
128
129 POLLOUT is a little harder. Since you can essentially send
130 to any destination, RDS will always signal POLLOUT as long as
131 there's room on the send queue (ie the number of bytes queued
132 is less than the sendbuf size).
133
134 However, the kernel will refuse to accept messages to
135 a destination marked congested - in this case you will loop
136 forever if you rely on poll to tell you what to do.
137 This isn't a trivial problem, but applications can deal with
138 this - by using congestion notifications, and by checking for
139 ENOBUFS errors returned by sendmsg.
140
141 setsockopt(SOL_RDS, RDS_CANCEL_SENT_TO, &sockaddr_in)
142 This allows the application to discard all messages queued to a
143 specific destination on this particular socket.
144
145 This allows the application to cancel outstanding messages if
146 it detects a timeout. For instance, if it tried to send a message,
147 and the remote host is unreachable, RDS will keep trying forever.
148 The application may decide it's not worth it, and cancel the
149 operation. In this case, it would use RDS_CANCEL_SENT_TO to
150 nuke any pending messages.
151
152
153RDMA for RDS
154============
155
156 see rds-rdma(7) manpage (available in rds-tools)
157
158
159Congestion Notifications
160========================
161
162 see rds(7) manpage
163
164
165RDS Protocol
166============
167
168 Message header
169
170 The message header is a 'struct rds_header' (see rds.h):
171 Fields:
172 h_sequence:
173 per-packet sequence number
174 h_ack:
175 piggybacked acknowledgment of last packet received
176 h_len:
177 length of data, not including header
178 h_sport:
179 source port
180 h_dport:
181 destination port
182 h_flags:
183 CONG_BITMAP - this is a congestion update bitmap
184 ACK_REQUIRED - receiver must ack this packet
185 RETRANSMITTED - packet has previously been sent
186 h_credit:
187 indicate to other end of connection that
188 it has more credits available (i.e. there is
189 more send room)
190 h_padding[4]:
191 unused, for future use
192 h_csum:
193 header checksum
194 h_exthdr:
195 optional data can be passed here. This is currently used for
196 passing RDMA-related information.
197
198 ACK and retransmit handling
199
200 One might think that with reliable IB connections you wouldn't need
201 to ack messages that have been received. The problem is that IB
202 hardware generates an ack message before it has DMAed the message
203 into memory. This creates a potential message loss if the HCA is
204 disabled for any reason between when it sends the ack and before
205 the message is DMAed and processed. This is only a potential issue
206 if another HCA is available for fail-over.
207
208 Sending an ack immediately would allow the sender to free the sent
209 message from their send queue quickly, but could cause excessive
210 traffic to be used for acks. RDS piggybacks acks on sent data
211 packets. Ack-only packets are reduced by only allowing one to be
212 in flight at a time, and by the sender only asking for acks when
213 its send buffers start to fill up. All retransmissions are also
214 acked.
215
216 Flow Control
217
218 RDS's IB transport uses a credit-based mechanism to verify that
219 there is space in the peer's receive buffers for more data. This
220 eliminates the need for hardware retries on the connection.
221
222 Congestion
223
224 Messages waiting in the receive queue on the receiving socket
225 are accounted against the socket's SO_RCVBUF option value. Only
226 the payload bytes in the message are accounted for. If the
227 number of bytes queued equals or exceeds rcvbuf then the socket
228 is congested. All sends attempted to this socket's address
229 should either block or return -EWOULDBLOCK.
230
231 Applications are expected to be reasonably tuned such that this
232 situation very rarely occurs. An application encountering this
233 "back-pressure" is considered a bug.
234
235 This is implemented by having each node maintain bitmaps which
236 indicate which ports on bound addresses are congested. As the
237 bitmap changes it is sent through all the connections which
238 terminate in the local address of the bitmap which changed.
239
240 The bitmaps are allocated as connections are brought up. This
241 avoids allocation in the interrupt handling path which queues
242 messages on sockets. The dense bitmaps let transports send the
243 entire bitmap on any bitmap change reasonably efficiently. This
244 is much easier to implement than some finer-grained
245 communication of per-port congestion. The sender does a very
246 inexpensive bit test to test if the port it's about to send to
247 is congested or not.
248
249
250RDS Transport Layer
251==================
252
253 As mentioned above, RDS is not IB-specific. Its code is divided
254 into a general RDS layer and a transport layer.
255
256 The general layer handles the socket API, congestion handling,
257 loopback, stats, usermem pinning, and the connection state machine.
258
259 The transport layer handles the details of the transport. The IB
260 transport, for example, handles all the queue pairs, work requests,
261 CM event handlers, and other Infiniband details.
262
263
264RDS Kernel Structures
265=====================
266
267 struct rds_message
268 aka possibly "rds_outgoing", the generic RDS layer copies data to
269 be sent and sets header fields as needed, based on the socket API.
270 This is then queued for the individual connection and sent by the
271 connection's transport.
272 struct rds_incoming
273 a generic struct referring to incoming data that can be handed from
274 the transport to the general code and queued by the general code
275 while the socket is awoken. It is then passed back to the transport
276 code to handle the actual copy-to-user.
277 struct rds_socket
278 per-socket information
279 struct rds_connection
280 per-connection information
281 struct rds_transport
282 pointers to transport-specific functions
283 struct rds_statistics
284 non-transport-specific statistics
285 struct rds_cong_map
286 wraps the raw congestion bitmap, contains rbnode, waitq, etc.
287
288Connection management
289=====================
290
291 Connections may be in UP, DOWN, CONNECTING, DISCONNECTING, and
292 ERROR states.
293
294 The first time an attempt is made by an RDS socket to send data to
295 a node, a connection is allocated and connected. That connection is
296 then maintained forever -- if there are transport errors, the
297 connection will be dropped and re-established.
298
299 Dropping a connection while packets are queued will cause queued or
300 partially-sent datagrams to be retransmitted when the connection is
301 re-established.
302
303
304The send path
305=============
306
307 rds_sendmsg()
308 struct rds_message built from incoming data
309 CMSGs parsed (e.g. RDMA ops)
310 transport connection alloced and connected if not already
311 rds_message placed on send queue
312 send worker awoken
313 rds_send_worker()
314 calls rds_send_xmit() until queue is empty
315 rds_send_xmit()
316 transmits congestion map if one is pending
317 may set ACK_REQUIRED
318 calls transport to send either non-RDMA or RDMA message
319 (RDMA ops never retransmitted)
320 rds_ib_xmit()
321 allocs work requests from send ring
322 adds any new send credits available to peer (h_credits)
323 maps the rds_message's sg list
324 piggybacks ack
325 populates work requests
326 post send to connection's queue pair
327
328The recv path
329=============
330
331 rds_ib_recv_cq_comp_handler()
332 looks at write completions
333 unmaps recv buffer from device
334 no errors, call rds_ib_process_recv()
335 refill recv ring
336 rds_ib_process_recv()
337 validate header checksum
338 copy header to rds_ib_incoming struct if start of a new datagram
339 add to ibinc's fraglist
340 if completed datagram:
341 update cong map if datagram was cong update
342 call rds_recv_incoming() otherwise
343 note if ack is required
344 rds_recv_incoming()
345 drop duplicate packets
346 respond to pings
347 find the sock associated with this datagram
348 add to sock queue
349 wake up sock
350 do some congestion calculations
351 rds_recvmsg
352 copy data into user iovec
353 handle CMSGs
354 return to application
355
356
diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt
new file mode 100644
index 000000000000..0e58b4539176
--- /dev/null
+++ b/Documentation/networking/timestamping.txt
@@ -0,0 +1,180 @@
1The existing interfaces for getting network packets time stamped are:
2
3* SO_TIMESTAMP
4 Generate time stamp for each incoming packet using the (not necessarily
5 monotonic!) system time. Result is returned via recvmsg() in a
6 control message as timeval (usec resolution).
7
8* SO_TIMESTAMPNS
9 Same time stamping mechanism as SO_TIMESTAMP, but returns result as
10 timespec (nsec resolution).
11
12* IP_MULTICAST_LOOP + SO_TIMESTAMP[NS]
13 Only for multicasts: approximate send time stamp by receiving the looped
14 packet and using its receive time stamp.
15
16The following interface complements the existing ones: receive time
17stamps can be generated and returned for arbitrary packets and much
18closer to the point where the packet is really sent. Time stamps can
19be generated in software (as before) or in hardware (if the hardware
20has such a feature).
21
22SO_TIMESTAMPING:
23
24Instructs the socket layer which kind of information is wanted. The
25parameter is an integer with some of the following bits set. Setting
26other bits is an error and doesn't change the current state.
27
28SOF_TIMESTAMPING_TX_HARDWARE: try to obtain send time stamp in hardware
29SOF_TIMESTAMPING_TX_SOFTWARE: if SOF_TIMESTAMPING_TX_HARDWARE is off or
30 fails, then do it in software
31SOF_TIMESTAMPING_RX_HARDWARE: return the original, unmodified time stamp
32 as generated by the hardware
33SOF_TIMESTAMPING_RX_SOFTWARE: if SOF_TIMESTAMPING_RX_HARDWARE is off or
34 fails, then do it in software
35SOF_TIMESTAMPING_RAW_HARDWARE: return original raw hardware time stamp
36SOF_TIMESTAMPING_SYS_HARDWARE: return hardware time stamp transformed to
37 the system time base
38SOF_TIMESTAMPING_SOFTWARE: return system time stamp generated in
39 software
40
41SOF_TIMESTAMPING_TX/RX determine how time stamps are generated.
42SOF_TIMESTAMPING_RAW/SYS determine how they are reported in the
43following control message:
44 struct scm_timestamping {
45 struct timespec systime;
46 struct timespec hwtimetrans;
47 struct timespec hwtimeraw;
48 };
49
50recvmsg() can be used to get this control message for regular incoming
51packets. For send time stamps the outgoing packet is looped back to
52the socket's error queue with the send time stamp(s) attached. It can
53be received with recvmsg(flags=MSG_ERRQUEUE). The call returns the
54original outgoing packet data including all headers prepended down to
55and including the link layer, the scm_timestamping control message and
56a sock_extended_err control message with ee_errno==ENOMSG and
57ee_origin==SO_EE_ORIGIN_TIMESTAMPING. A socket with such a pending
58bounced packet is ready for reading as far as select() is concerned.
59If the outgoing packet has to be fragmented, then only the first
60fragment is time stamped and returned to the sending socket.
61
62All three values correspond to the same event in time, but were
63generated in different ways. Each of these values may be empty (= all
64zero), in which case no such value was available. If the application
65is not interested in some of these values, they can be left blank to
66avoid the potential overhead of calculating them.
67
68systime is the value of the system time at that moment. This
69corresponds to the value also returned via SO_TIMESTAMP[NS]. If the
70time stamp was generated by hardware, then this field is
71empty. Otherwise it is filled in if SOF_TIMESTAMPING_SOFTWARE is
72set.
73
74hwtimeraw is the original hardware time stamp. Filled in if
75SOF_TIMESTAMPING_RAW_HARDWARE is set. No assumptions about its
76relation to system time should be made.
77
78hwtimetrans is the hardware time stamp transformed so that it
79corresponds as closely as possible to system time. This correlation is
80not perfect; as a consequence, sorting packets received via different
81NICs by their hwtimetrans may differ from the order in which they were
82received. hwtimetrans may be non-monotonic even for the same NIC.
83Filled in if SOF_TIMESTAMPING_SYS_HARDWARE is set. Requires support
84by the network device and will be empty without that support.
85
86
87SIOCSHWTSTAMP:
88
89Hardware time stamping must also be initialized for each device driver
90that is expected to do hardware time stamping. The parameter is:
91
92struct hwtstamp_config {
93 int flags; /* no flags defined right now, must be zero */
94 int tx_type; /* HWTSTAMP_TX_* */
95 int rx_filter; /* HWTSTAMP_FILTER_* */
96};
97
98Desired behavior is passed into the kernel and to a specific device by
99calling ioctl(SIOCSHWTSTAMP) with a pointer to a struct ifreq whose
100ifr_data points to a struct hwtstamp_config. The tx_type and
101rx_filter are hints to the driver what it is expected to do. If
102the requested fine-grained filtering for incoming packets is not
103supported, the driver may time stamp more than just the requested types
104of packets.
105
106A driver which supports hardware time stamping shall update the struct
107with the actual, possibly more permissive configuration. If the
108requested packets cannot be time stamped, then nothing should be
109changed and ERANGE shall be returned (in contrast to EINVAL, which
110indicates that SIOCSHWTSTAMP is not supported at all).
111
112Only a process with admin rights may change the configuration. User
113space is responsible to ensure that multiple processes don't interfere
114with each other and that the settings are reset.
115
116/* possible values for hwtstamp_config->tx_type */
117enum {
118 /*
119 * no outgoing packet will need hardware time stamping;
120 * should a packet arrive which asks for it, no hardware
121 * time stamping will be done
122 */
123 HWTSTAMP_TX_OFF,
124
125 /*
126 * enables hardware time stamping for outgoing packets;
127 * the sender of the packet decides which are to be
128 * time stamped by setting SOF_TIMESTAMPING_TX_SOFTWARE
129 * before sending the packet
130 */
131 HWTSTAMP_TX_ON,
132};
133
134/* possible values for hwtstamp_config->rx_filter */
135enum {
136 /* time stamp no incoming packet at all */
137 HWTSTAMP_FILTER_NONE,
138
139 /* time stamp any incoming packet */
140 HWTSTAMP_FILTER_ALL,
141
142 /* return value: time stamp all packets requested plus some others */
143 HWTSTAMP_FILTER_SOME,
144
145 /* PTP v1, UDP, any kind of event packet */
146 HWTSTAMP_FILTER_PTP_V1_L4_EVENT,
147
148 ...
149};
150
151
152DEVICE IMPLEMENTATION
153
154A driver which supports hardware time stamping must support the
155SIOCSHWTSTAMP ioctl. Time stamps for received packets must be stored
156in the skb with skb_hwtstamp_set().
157
158Time stamps for outgoing packets are to be generated as follows:
159- In hard_start_xmit(), check if skb_hwtstamp_check_tx_hardware()
160 returns non-zero. If yes, then the driver is expected
161 to do hardware time stamping.
162- If this is possible for the skb and requested, then declare
163 that the driver is doing the time stamping by calling
164 skb_hwtstamp_tx_in_progress(). A driver not supporting
165 hardware time stamping doesn't do that. A driver must never
166 touch sk_buff::tstamp! It is used to store how time stamping
167 for an outgoing packet is to be done.
168- As soon as the driver has sent the packet and/or obtained a
169 hardware time stamp for it, it passes the time stamp back by
170 calling skb_hwtstamp_tx() with the original skb, the raw
171 hardware time stamp and a handle to the device (necessary
172 to convert the hardware time stamp to system time). If obtaining
173 the hardware time stamp somehow fails, then the driver should
174 not fall back to software time stamping. The rationale is that
175 this would occur at a later time in the processing pipeline
176 than other software time stamping and therefore could lead
177 to unexpected deltas between time stamps.
178- If the driver did not call skb_hwtstamp_tx_in_progress(), then
179 dev_hard_start_xmit() checks whether software time stamping
180 is wanted as fallback and potentially generates the time stamp.
diff --git a/Documentation/networking/timestamping/.gitignore b/Documentation/networking/timestamping/.gitignore
new file mode 100644
index 000000000000..71e81eb2e22f
--- /dev/null
+++ b/Documentation/networking/timestamping/.gitignore
@@ -0,0 +1 @@
timestamping
diff --git a/Documentation/networking/timestamping/Makefile b/Documentation/networking/timestamping/Makefile
new file mode 100644
index 000000000000..2a1489fdc036
--- /dev/null
+++ b/Documentation/networking/timestamping/Makefile
@@ -0,0 +1,6 @@
1CPPFLAGS = -I../../../include
2
3timestamping: timestamping.c
4
5clean:
6 rm -f timestamping
diff --git a/Documentation/networking/timestamping/timestamping.c b/Documentation/networking/timestamping/timestamping.c
new file mode 100644
index 000000000000..43d143104210
--- /dev/null
+++ b/Documentation/networking/timestamping/timestamping.c
@@ -0,0 +1,533 @@
1/*
2 * This program demonstrates how the various time stamping features in
3 * the Linux kernel work. It emulates the behavior of a PTP
4 * implementation in stand-alone master mode by sending PTPv1 Sync
5 * multicasts once every second. It looks for similar packets, but
6 * beyond that doesn't actually implement PTP.
7 *
8 * Outgoing packets are time stamped with SO_TIMESTAMPING with or
9 * without hardware support.
10 *
11 * Incoming packets are time stamped with SO_TIMESTAMPING with or
12 * without hardware support, SIOCGSTAMP[NS] (per-socket time stamp) and
13 * SO_TIMESTAMP[NS].
14 *
15 * Copyright (C) 2009 Intel Corporation.
16 * Author: Patrick Ohly <patrick.ohly@intel.com>
17 *
18 * This program is free software; you can redistribute it and/or modify it
19 * under the terms and conditions of the GNU General Public License,
20 * version 2, as published by the Free Software Foundation.
21 *
22 * This program is distributed in the hope it will be useful, but WITHOUT
23 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
24 * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for
25 * more details.
26 *
27 * You should have received a copy of the GNU General Public License along with
28 * this program; if not, write to the Free Software Foundation, Inc.,
29 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
30 */
31
32#include <stdio.h>
33#include <stdlib.h>
34#include <errno.h>
35#include <string.h>
36
37#include <sys/time.h>
38#include <sys/socket.h>
39#include <sys/select.h>
40#include <sys/ioctl.h>
41#include <arpa/inet.h>
42#include <net/if.h>
43
44#include "asm/types.h"
45#include "linux/net_tstamp.h"
46#include "linux/errqueue.h"
47
48#ifndef SO_TIMESTAMPING
49# define SO_TIMESTAMPING 37
50# define SCM_TIMESTAMPING SO_TIMESTAMPING
51#endif
52
53#ifndef SO_TIMESTAMPNS
54# define SO_TIMESTAMPNS 35
55#endif
56
57#ifndef SIOCGSTAMPNS
58# define SIOCGSTAMPNS 0x8907
59#endif
60
61#ifndef SIOCSHWTSTAMP
62# define SIOCSHWTSTAMP 0x89b0
63#endif
64
65static void usage(const char *error)
66{
67 if (error)
68 printf("invalid option: %s\n", error);
69 printf("timestamping interface option*\n\n"
70 "Options:\n"
71 " IP_MULTICAST_LOOP - looping outgoing multicasts\n"
72 " SO_TIMESTAMP - normal software time stamping, ms resolution\n"
73 " SO_TIMESTAMPNS - more accurate software time stamping\n"
74 " SOF_TIMESTAMPING_TX_HARDWARE - hardware time stamping of outgoing packets\n"
75 " SOF_TIMESTAMPING_TX_SOFTWARE - software fallback for outgoing packets\n"
76 " SOF_TIMESTAMPING_RX_HARDWARE - hardware time stamping of incoming packets\n"
77 " SOF_TIMESTAMPING_RX_SOFTWARE - software fallback for incoming packets\n"
78 " SOF_TIMESTAMPING_SOFTWARE - request reporting of software time stamps\n"
79 " SOF_TIMESTAMPING_SYS_HARDWARE - request reporting of transformed HW time stamps\n"
80 " SOF_TIMESTAMPING_RAW_HARDWARE - request reporting of raw HW time stamps\n"
81 " SIOCGSTAMP - check last socket time stamp\n"
82 " SIOCGSTAMPNS - more accurate socket time stamp\n");
83 exit(1);
84}
85
86static void bail(const char *error)
87{
88 printf("%s: %s\n", error, strerror(errno));
89 exit(1);
90}
91
92static const unsigned char sync[] = {
93 0x00, 0x01, 0x00, 0x01,
94 0x5f, 0x44, 0x46, 0x4c,
95 0x54, 0x00, 0x00, 0x00,
96 0x00, 0x00, 0x00, 0x00,
97 0x00, 0x00, 0x00, 0x00,
98 0x01, 0x01,
99
100 /* fake uuid */
101 0x00, 0x01,
102 0x02, 0x03, 0x04, 0x05,
103
104 0x00, 0x01, 0x00, 0x37,
105 0x00, 0x00, 0x00, 0x08,
106 0x00, 0x00, 0x00, 0x00,
107 0x49, 0x05, 0xcd, 0x01,
108 0x29, 0xb1, 0x8d, 0xb0,
109 0x00, 0x00, 0x00, 0x00,
110 0x00, 0x01,
111
112 /* fake uuid */
113 0x00, 0x01,
114 0x02, 0x03, 0x04, 0x05,
115
116 0x00, 0x00, 0x00, 0x37,
117 0x00, 0x00, 0x00, 0x04,
118 0x44, 0x46, 0x4c, 0x54,
119 0x00, 0x00, 0xf0, 0x60,
120 0x00, 0x01, 0x00, 0x00,
121 0x00, 0x00, 0x00, 0x01,
122 0x00, 0x00, 0xf0, 0x60,
123 0x00, 0x00, 0x00, 0x00,
124 0x00, 0x00, 0x00, 0x04,
125 0x44, 0x46, 0x4c, 0x54,
126 0x00, 0x01,
127
128 /* fake uuid */
129 0x00, 0x01,
130 0x02, 0x03, 0x04, 0x05,
131
132 0x00, 0x00, 0x00, 0x00,
133 0x00, 0x00, 0x00, 0x00,
134 0x00, 0x00, 0x00, 0x00,
135 0x00, 0x00, 0x00, 0x00
136};
137
138static void sendpacket(int sock, struct sockaddr *addr, socklen_t addr_len)
139{
140 struct timeval now;
141 int res;
142
143 res = sendto(sock, sync, sizeof(sync), 0,
144 addr, addr_len);
145 gettimeofday(&now, 0);
146 if (res < 0)
147 printf("%s: %s\n", "send", strerror(errno));
148 else
149 printf("%ld.%06ld: sent %d bytes\n",
150 (long)now.tv_sec, (long)now.tv_usec,
151 res);
152}
153
154static void printpacket(struct msghdr *msg, int res,
155 char *data,
156 int sock, int recvmsg_flags,
157 int siocgstamp, int siocgstampns)
158{
159 struct sockaddr_in *from_addr = (struct sockaddr_in *)msg->msg_name;
160 struct cmsghdr *cmsg;
161 struct timeval tv;
162 struct timespec ts;
163 struct timeval now;
164
165 gettimeofday(&now, 0);
166
167 printf("%ld.%06ld: received %s data, %d bytes from %s, %d bytes control messages\n",
168 (long)now.tv_sec, (long)now.tv_usec,
169 (recvmsg_flags & MSG_ERRQUEUE) ? "error" : "regular",
170 res,
171 inet_ntoa(from_addr->sin_addr),
172 msg->msg_controllen);
173 for (cmsg = CMSG_FIRSTHDR(msg);
174 cmsg;
175 cmsg = CMSG_NXTHDR(msg, cmsg)) {
176 printf(" cmsg len %d: ", cmsg->cmsg_len);
177 switch (cmsg->cmsg_level) {
178 case SOL_SOCKET:
179 printf("SOL_SOCKET ");
180 switch (cmsg->cmsg_type) {
181 case SO_TIMESTAMP: {
182 struct timeval *stamp =
183 (struct timeval *)CMSG_DATA(cmsg);
184 printf("SO_TIMESTAMP %ld.%06ld",
185 (long)stamp->tv_sec,
186 (long)stamp->tv_usec);
187 break;
188 }
189 case SO_TIMESTAMPNS: {
190 struct timespec *stamp =
191 (struct timespec *)CMSG_DATA(cmsg);
192 printf("SO_TIMESTAMPNS %ld.%09ld",
193 (long)stamp->tv_sec,
194 (long)stamp->tv_nsec);
195 break;
196 }
197 case SO_TIMESTAMPING: {
198 struct timespec *stamp =
199 (struct timespec *)CMSG_DATA(cmsg);
200 printf("SO_TIMESTAMPING ");
201 printf("SW %ld.%09ld ",
202 (long)stamp->tv_sec,
203 (long)stamp->tv_nsec);
204 stamp++;
205 printf("HW transformed %ld.%09ld ",
206 (long)stamp->tv_sec,
207 (long)stamp->tv_nsec);
208 stamp++;
209 printf("HW raw %ld.%09ld",
210 (long)stamp->tv_sec,
211 (long)stamp->tv_nsec);
212 break;
213 }
214 default:
215 printf("type %d", cmsg->cmsg_type);
216 break;
217 }
218 break;
219 case IPPROTO_IP:
220 printf("IPPROTO_IP ");
221 switch (cmsg->cmsg_type) {
222 case IP_RECVERR: {
223 struct sock_extended_err *err =
224 (struct sock_extended_err *)CMSG_DATA(cmsg);
225 printf("IP_RECVERR ee_errno '%s' ee_origin %d => %s",
226 strerror(err->ee_errno),
227 err->ee_origin,
228#ifdef SO_EE_ORIGIN_TIMESTAMPING
229 err->ee_origin == SO_EE_ORIGIN_TIMESTAMPING ?
230 "bounced packet" : "unexpected origin"
231#else
232 "probably SO_EE_ORIGIN_TIMESTAMPING"
233#endif
234 );
235 if (res < sizeof(sync))
236 printf(" => truncated data?!");
237 else if (!memcmp(sync, data + res - sizeof(sync),
238 sizeof(sync)))
239 printf(" => GOT OUR DATA BACK (HURRAY!)");
240 break;
241 }
242 case IP_PKTINFO: {
243 struct in_pktinfo *pktinfo =
244 (struct in_pktinfo *)CMSG_DATA(cmsg);
245 printf("IP_PKTINFO interface index %u",
246 pktinfo->ipi_ifindex);
247 break;
248 }
249 default:
250 printf("type %d", cmsg->cmsg_type);
251 break;
252 }
253 break;
254 default:
255 printf("level %d type %d",
256 cmsg->cmsg_level,
257 cmsg->cmsg_type);
258 break;
259 }
260 printf("\n");
261 }
262
263 if (siocgstamp) {
264 if (ioctl(sock, SIOCGSTAMP, &tv))
265 printf(" %s: %s\n", "SIOCGSTAMP", strerror(errno));
266 else
267 printf("SIOCGSTAMP %ld.%06ld\n",
268 (long)tv.tv_sec,
269 (long)tv.tv_usec);
270 }
271 if (siocgstampns) {
272 if (ioctl(sock, SIOCGSTAMPNS, &ts))
273 printf(" %s: %s\n", "SIOCGSTAMPNS", strerror(errno));
274 else
275 printf("SIOCGSTAMPNS %ld.%09ld\n",
276 (long)ts.tv_sec,
277 (long)ts.tv_nsec);
278 }
279}
280
281static void recvpacket(int sock, int recvmsg_flags,
282 int siocgstamp, int siocgstampns)
283{
284 char data[256];
285 struct msghdr msg;
286 struct iovec entry;
287 struct sockaddr_in from_addr;
288 struct {
289 struct cmsghdr cm;
290 char control[512];
291 } control;
292 int res;
293
294 memset(&msg, 0, sizeof(msg));
295 msg.msg_iov = &entry;
296 msg.msg_iovlen = 1;
297 entry.iov_base = data;
298 entry.iov_len = sizeof(data);
299 msg.msg_name = (caddr_t)&from_addr;
300 msg.msg_namelen = sizeof(from_addr);
301 msg.msg_control = &control;
302 msg.msg_controllen = sizeof(control);
303
304 res = recvmsg(sock, &msg, recvmsg_flags|MSG_DONTWAIT);
305 if (res < 0) {
306 printf("%s %s: %s\n",
307 "recvmsg",
308 (recvmsg_flags & MSG_ERRQUEUE) ? "error" : "regular",
309 strerror(errno));
310 } else {
311 printpacket(&msg, res, data,
312 sock, recvmsg_flags,
313 siocgstamp, siocgstampns);
314 }
315}
316
317int main(int argc, char **argv)
318{
319 int so_timestamping_flags = 0;
320 int so_timestamp = 0;
321 int so_timestampns = 0;
322 int siocgstamp = 0;
323 int siocgstampns = 0;
324 int ip_multicast_loop = 0;
325 char *interface;
326 int i;
327 int enabled = 1;
328 int sock;
329 struct ifreq device;
330 struct ifreq hwtstamp;
331 struct hwtstamp_config hwconfig, hwconfig_requested;
332 struct sockaddr_in addr;
333 struct ip_mreq imr;
334 struct in_addr iaddr;
335 int val;
336 socklen_t len;
337 struct timeval next;
338
339 if (argc < 2)
340 usage(0);
341 interface = argv[1];
342
343 for (i = 2; i < argc; i++) {
344 if (!strcasecmp(argv[i], "SO_TIMESTAMP"))
345 so_timestamp = 1;
346 else if (!strcasecmp(argv[i], "SO_TIMESTAMPNS"))
347 so_timestampns = 1;
348 else if (!strcasecmp(argv[i], "SIOCGSTAMP"))
349 siocgstamp = 1;
350 else if (!strcasecmp(argv[i], "SIOCGSTAMPNS"))
351 siocgstampns = 1;
352 else if (!strcasecmp(argv[i], "IP_MULTICAST_LOOP"))
353 ip_multicast_loop = 1;
354 else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_TX_HARDWARE"))
355 so_timestamping_flags |= SOF_TIMESTAMPING_TX_HARDWARE;
356 else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_TX_SOFTWARE"))
357 so_timestamping_flags |= SOF_TIMESTAMPING_TX_SOFTWARE;
358 else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RX_HARDWARE"))
359 so_timestamping_flags |= SOF_TIMESTAMPING_RX_HARDWARE;
360 else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RX_SOFTWARE"))
361 so_timestamping_flags |= SOF_TIMESTAMPING_RX_SOFTWARE;
362 else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_SOFTWARE"))
363 so_timestamping_flags |= SOF_TIMESTAMPING_SOFTWARE;
364 else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_SYS_HARDWARE"))
365 so_timestamping_flags |= SOF_TIMESTAMPING_SYS_HARDWARE;
366 else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RAW_HARDWARE"))
367 so_timestamping_flags |= SOF_TIMESTAMPING_RAW_HARDWARE;
368 else
369 usage(argv[i]);
370 }
371
372 sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);
373 if (socket < 0)
374 bail("socket");
375
376 memset(&device, 0, sizeof(device));
377 strncpy(device.ifr_name, interface, sizeof(device.ifr_name));
378 if (ioctl(sock, SIOCGIFADDR, &device) < 0)
379 bail("getting interface IP address");
380
381 memset(&hwtstamp, 0, sizeof(hwtstamp));
382 strncpy(hwtstamp.ifr_name, interface, sizeof(hwtstamp.ifr_name));
383 hwtstamp.ifr_data = (void *)&hwconfig;
384 memset(&hwconfig, 0, sizeof(&hwconfig));
385 hwconfig.tx_type =
386 (so_timestamping_flags & SOF_TIMESTAMPING_TX_HARDWARE) ?
387 HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF;
388 hwconfig.rx_filter =
389 (so_timestamping_flags & SOF_TIMESTAMPING_RX_HARDWARE) ?
390 HWTSTAMP_FILTER_PTP_V1_L4_SYNC : HWTSTAMP_FILTER_NONE;
391 hwconfig_requested = hwconfig;
392 if (ioctl(sock, SIOCSHWTSTAMP, &hwtstamp) < 0) {
393 if ((errno == EINVAL || errno == ENOTSUP) &&
394 hwconfig_requested.tx_type == HWTSTAMP_TX_OFF &&
395 hwconfig_requested.rx_filter == HWTSTAMP_FILTER_NONE)
396 printf("SIOCSHWTSTAMP: disabling hardware time stamping not possible\n");
397 else
398 bail("SIOCSHWTSTAMP");
399 }
400 printf("SIOCSHWTSTAMP: tx_type %d requested, got %d; rx_filter %d requested, got %d\n",
401 hwconfig_requested.tx_type, hwconfig.tx_type,
402 hwconfig_requested.rx_filter, hwconfig.rx_filter);
403
404 /* bind to PTP port */
405 addr.sin_family = AF_INET;
406 addr.sin_addr.s_addr = htonl(INADDR_ANY);
407 addr.sin_port = htons(319 /* PTP event port */);
408 if (bind(sock,
409 (struct sockaddr *)&addr,
410 sizeof(struct sockaddr_in)) < 0)
411 bail("bind");
412
413 /* set multicast group for outgoing packets */
414 inet_aton("224.0.1.130", &iaddr); /* alternate PTP domain 1 */
415 addr.sin_addr = iaddr;
416 imr.imr_multiaddr.s_addr = iaddr.s_addr;
417 imr.imr_interface.s_addr =
418 ((struct sockaddr_in *)&device.ifr_addr)->sin_addr.s_addr;
419 if (setsockopt(sock, IPPROTO_IP, IP_MULTICAST_IF,
420 &imr.imr_interface.s_addr, sizeof(struct in_addr)) < 0)
421 bail("set multicast");
422
423 /* join multicast group, loop our own packet */
424 if (setsockopt(sock, IPPROTO_IP, IP_ADD_MEMBERSHIP,
425 &imr, sizeof(struct ip_mreq)) < 0)
426 bail("join multicast group");
427
428 if (setsockopt(sock, IPPROTO_IP, IP_MULTICAST_LOOP,
429 &ip_multicast_loop, sizeof(enabled)) < 0) {
430 bail("loop multicast");
431 }
432
433 /* set socket options for time stamping */
434 if (so_timestamp &&
435 setsockopt(sock, SOL_SOCKET, SO_TIMESTAMP,
436 &enabled, sizeof(enabled)) < 0)
437 bail("setsockopt SO_TIMESTAMP");
438
439 if (so_timestampns &&
440 setsockopt(sock, SOL_SOCKET, SO_TIMESTAMPNS,
441 &enabled, sizeof(enabled)) < 0)
442 bail("setsockopt SO_TIMESTAMPNS");
443
444 if (so_timestamping_flags &&
445 setsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING,
446 &so_timestamping_flags,
447 sizeof(so_timestamping_flags)) < 0)
448 bail("setsockopt SO_TIMESTAMPING");
449
450 /* request IP_PKTINFO for debugging purposes */
451 if (setsockopt(sock, SOL_IP, IP_PKTINFO,
452 &enabled, sizeof(enabled)) < 0)
453 printf("%s: %s\n", "setsockopt IP_PKTINFO", strerror(errno));
454
455 /* verify socket options */
456 len = sizeof(val);
457 if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMP, &val, &len) < 0)
458 printf("%s: %s\n", "getsockopt SO_TIMESTAMP", strerror(errno));
459 else
460 printf("SO_TIMESTAMP %d\n", val);
461
462 if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMPNS, &val, &len) < 0)
463 printf("%s: %s\n", "getsockopt SO_TIMESTAMPNS",
464 strerror(errno));
465 else
466 printf("SO_TIMESTAMPNS %d\n", val);
467
468 if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING, &val, &len) < 0) {
469 printf("%s: %s\n", "getsockopt SO_TIMESTAMPING",
470 strerror(errno));
471 } else {
472 printf("SO_TIMESTAMPING %d\n", val);
473 if (val != so_timestamping_flags)
474 printf(" not the expected value %d\n",
475 so_timestamping_flags);
476 }
477
478 /* send packets forever every five seconds */
479 gettimeofday(&next, 0);
480 next.tv_sec = (next.tv_sec + 1) / 5 * 5;
481 next.tv_usec = 0;
482 while (1) {
483 struct timeval now;
484 struct timeval delta;
485 long delta_us;
486 int res;
487 fd_set readfs, errorfs;
488
489 gettimeofday(&now, 0);
490 delta_us = (long)(next.tv_sec - now.tv_sec) * 1000000 +
491 (long)(next.tv_usec - now.tv_usec);
492 if (delta_us > 0) {
493 /* continue waiting for timeout or data */
494 delta.tv_sec = delta_us / 1000000;
495 delta.tv_usec = delta_us % 1000000;
496
497 FD_ZERO(&readfs);
498 FD_ZERO(&errorfs);
499 FD_SET(sock, &readfs);
500 FD_SET(sock, &errorfs);
501 printf("%ld.%06ld: select %ldus\n",
502 (long)now.tv_sec, (long)now.tv_usec,
503 delta_us);
504 res = select(sock + 1, &readfs, 0, &errorfs, &delta);
505 gettimeofday(&now, 0);
506 printf("%ld.%06ld: select returned: %d, %s\n",
507 (long)now.tv_sec, (long)now.tv_usec,
508 res,
509 res < 0 ? strerror(errno) : "success");
510 if (res > 0) {
511 if (FD_ISSET(sock, &readfs))
512 printf("ready for reading\n");
513 if (FD_ISSET(sock, &errorfs))
514 printf("has error\n");
515 recvpacket(sock, 0,
516 siocgstamp,
517 siocgstampns);
518 recvpacket(sock, MSG_ERRQUEUE,
519 siocgstamp,
520 siocgstampns);
521 }
522 } else {
523 /* write one packet */
524 sendpacket(sock,
525 (struct sockaddr *)&addr,
526 sizeof(addr));
527 next.tv_sec += 5;
528 continue;
529 }
530 }
531
532 return 0;
533}