diff options
Diffstat (limited to 'Documentation/networking')
-rw-r--r-- | Documentation/networking/00-INDEX | 2 | ||||
-rw-r--r-- | Documentation/networking/NAPI_HOWTO.txt | 26 | ||||
-rw-r--r-- | Documentation/networking/cs89x0.txt | 6 | ||||
-rw-r--r-- | Documentation/networking/dccp.txt | 84 | ||||
-rw-r--r-- | Documentation/networking/e1000.txt | 451 | ||||
-rw-r--r-- | Documentation/networking/generic_netlink.txt | 3 | ||||
-rw-r--r-- | Documentation/networking/ip-sysctl.txt | 347 | ||||
-rw-r--r-- | Documentation/networking/iphase.txt | 2 | ||||
-rw-r--r-- | Documentation/networking/packet_mmap.txt | 2 | ||||
-rw-r--r-- | Documentation/networking/phy.txt | 13 | ||||
-rw-r--r-- | Documentation/networking/pktgen.txt | 6 | ||||
-rw-r--r-- | Documentation/networking/proc_net_tcp.txt | 2 | ||||
-rw-r--r-- | Documentation/networking/sk98lin.txt | 2 | ||||
-rw-r--r-- | Documentation/networking/slicecom.txt | 2 | ||||
-rw-r--r-- | Documentation/networking/udplite.txt | 281 | ||||
-rw-r--r-- | Documentation/networking/wan-router.txt | 8 | ||||
-rw-r--r-- | Documentation/networking/xfrm_sync.txt | 5 |
17 files changed, 841 insertions, 401 deletions
diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX index b1181ce232d9..e06b6e3c1db5 100644 --- a/Documentation/networking/00-INDEX +++ b/Documentation/networking/00-INDEX | |||
@@ -58,6 +58,8 @@ fore200e.txt | |||
58 | - FORE Systems PCA-200E/SBA-200E ATM NIC driver info. | 58 | - FORE Systems PCA-200E/SBA-200E ATM NIC driver info. |
59 | framerelay.txt | 59 | framerelay.txt |
60 | - info on using Frame Relay/Data Link Connection Identifier (DLCI). | 60 | - info on using Frame Relay/Data Link Connection Identifier (DLCI). |
61 | generic_netlink.txt | ||
62 | - info on Generic Netlink | ||
61 | ip-sysctl.txt | 63 | ip-sysctl.txt |
62 | - /proc/sys/net/ipv4/* variables | 64 | - /proc/sys/net/ipv4/* variables |
63 | ip_dynaddr.txt | 65 | ip_dynaddr.txt |
diff --git a/Documentation/networking/NAPI_HOWTO.txt b/Documentation/networking/NAPI_HOWTO.txt index 93af3e87c65b..fb8dc6422a52 100644 --- a/Documentation/networking/NAPI_HOWTO.txt +++ b/Documentation/networking/NAPI_HOWTO.txt | |||
@@ -95,8 +95,8 @@ There are two types of event register ACK mechanisms. | |||
95 | Move all to dev->poll() | 95 | Move all to dev->poll() |
96 | 96 | ||
97 | C) Ability to detect new work correctly. | 97 | C) Ability to detect new work correctly. |
98 | NAPI works by shutting down event interrupts when theres work and | 98 | NAPI works by shutting down event interrupts when there's work and |
99 | turning them on when theres none. | 99 | turning them on when there's none. |
100 | New packets might show up in the small window while interrupts were being | 100 | New packets might show up in the small window while interrupts were being |
101 | re-enabled (refer to appendix 2). A packet might sneak in during the period | 101 | re-enabled (refer to appendix 2). A packet might sneak in during the period |
102 | we are enabling interrupts. We only get to know about such a packet when the | 102 | we are enabling interrupts. We only get to know about such a packet when the |
@@ -114,7 +114,7 @@ Locking rules and environmental guarantees | |||
114 | only one CPU can pick the initial interrupt and hence the initial | 114 | only one CPU can pick the initial interrupt and hence the initial |
115 | netif_rx_schedule(dev); | 115 | netif_rx_schedule(dev); |
116 | - The core layer invokes devices to send packets in a round robin format. | 116 | - The core layer invokes devices to send packets in a round robin format. |
117 | This implies receive is totaly lockless because of the guarantee only that | 117 | This implies receive is totally lockless because of the guarantee that only |
118 | one CPU is executing it. | 118 | one CPU is executing it. |
119 | - contention can only be the result of some other CPU accessing the rx | 119 | - contention can only be the result of some other CPU accessing the rx |
120 | ring. This happens only in close() and suspend() (when these methods | 120 | ring. This happens only in close() and suspend() (when these methods |
@@ -510,7 +510,7 @@ static int my_poll (struct net_device *dev, int *budget) | |||
510 | an interrupt will be generated */ | 510 | an interrupt will be generated */ |
511 | goto done; | 511 | goto done; |
512 | } | 512 | } |
513 | /* done! at least thats what it looks like ;-> | 513 | /* done! at least that's what it looks like ;-> |
514 | if new packets came in after our last check on status bits | 514 | if new packets came in after our last check on status bits |
515 | they'll be caught by the while check and we go back and clear them | 515 | they'll be caught by the while check and we go back and clear them |
516 | since we haven't exceeded our quota */ | 516 | since we haven't exceeded our quota */ |
@@ -535,11 +535,11 @@ done: | |||
535 | * 1. it can race with disabling irqs in irq handler (which are done to | 535 | * 1. it can race with disabling irqs in irq handler (which are done to |
536 | * schedule polls) | 536 | * schedule polls) |
537 | * 2. it can race with dis/enabling irqs in other poll threads | 537 | * 2. it can race with dis/enabling irqs in other poll threads |
538 | * 3. if an irq raised after the begining of the outer beginning | 538 | * 3. if an irq raised after the beginning of the outer beginning |
539 | * loop(marked in the code above), it will be immediately | 539 | * loop (marked in the code above), it will be immediately |
540 | * triggered here. | 540 | * triggered here. |
541 | * | 541 | * |
542 | * Summarizing: the logic may results in some redundant irqs both | 542 | * Summarizing: the logic may result in some redundant irqs both |
543 | * due to races in masking and due to too late acking of already | 543 | * due to races in masking and due to too late acking of already |
544 | * processed irqs. The good news: no events are ever lost. | 544 | * processed irqs. The good news: no events are ever lost. |
545 | */ | 545 | */ |
@@ -601,7 +601,7 @@ a) | |||
601 | 601 | ||
602 | 5) dev->close() and dev->suspend() issues | 602 | 5) dev->close() and dev->suspend() issues |
603 | ========================================== | 603 | ========================================== |
604 | The driver writter neednt worry about this. The top net layer takes | 604 | The driver writer needn't worry about this; the top net layer takes |
605 | care of it. | 605 | care of it. |
606 | 606 | ||
607 | 6) Adding new Stats to /proc | 607 | 6) Adding new Stats to /proc |
@@ -622,9 +622,9 @@ FC should be programmed to apply in the case when the system cant pull out | |||
622 | packets fast enough i.e send a pause only when you run out of rx buffers. | 622 | packets fast enough i.e send a pause only when you run out of rx buffers. |
623 | Note FC in itself is a good solution but we have found it to not be | 623 | Note FC in itself is a good solution but we have found it to not be |
624 | much of a commodity feature (both in NICs and switches) and hence falls | 624 | much of a commodity feature (both in NICs and switches) and hence falls |
625 | under the same category as using NIC based mitigation. Also experiments | 625 | under the same category as using NIC based mitigation. Also, experiments |
626 | indicate that its much harder to resolve the resource allocation | 626 | indicate that it's much harder to resolve the resource allocation |
627 | issue (aka lazy receiving that NAPI offers) and hence quantify its usefullness | 627 | issue (aka lazy receiving that NAPI offers) and hence quantify its usefulness |
628 | proved harder. In any case, FC works even better with NAPI but is not | 628 | proved harder. In any case, FC works even better with NAPI but is not |
629 | necessary. | 629 | necessary. |
630 | 630 | ||
@@ -678,10 +678,10 @@ routine: | |||
678 | CSR5 bit of interest is only the rx status. | 678 | CSR5 bit of interest is only the rx status. |
679 | If you look at the last if statement: | 679 | If you look at the last if statement: |
680 | you just finished grabbing all the packets from the rx ring .. you check if | 680 | you just finished grabbing all the packets from the rx ring .. you check if |
681 | status bit says theres more packets just in ... it says none; you then | 681 | status bit says there are more packets just in ... it says none; you then |
682 | enable rx interrupts again; if a new packet just came in during this check, | 682 | enable rx interrupts again; if a new packet just came in during this check, |
683 | we are counting that CSR5 will be set in that small window of opportunity | 683 | we are counting that CSR5 will be set in that small window of opportunity |
684 | and that by re-enabling interrupts, we would actually triger an interrupt | 684 | and that by re-enabling interrupts, we would actually trigger an interrupt |
685 | to register the new packet for processing. | 685 | to register the new packet for processing. |
686 | 686 | ||
687 | [The above description may be very verbose, if you have better wording | 687 | [The above description may be very verbose, if you have better wording |
diff --git a/Documentation/networking/cs89x0.txt b/Documentation/networking/cs89x0.txt index 64896470e279..6387d3decf85 100644 --- a/Documentation/networking/cs89x0.txt +++ b/Documentation/networking/cs89x0.txt | |||
@@ -248,7 +248,7 @@ c) The driver's hardware probe routine is designed to avoid | |||
248 | with device probing. To avoid this behaviour, add one | 248 | with device probing. To avoid this behaviour, add one |
249 | to the `io=' module parameter. This doesn't actually change | 249 | to the `io=' module parameter. This doesn't actually change |
250 | the I/O address, but it is a flag to tell the driver | 250 | the I/O address, but it is a flag to tell the driver |
251 | topartially initialise the hardware before trying to | 251 | to partially initialise the hardware before trying to |
252 | identify the card. This could be dangerous if you are | 252 | identify the card. This could be dangerous if you are |
253 | not sure that there is a cs89x0 card at the provided address. | 253 | not sure that there is a cs89x0 card at the provided address. |
254 | 254 | ||
@@ -620,8 +620,8 @@ I/O Address Device IRQ Device | |||
620 | 12 Mouse (PS/2) | 620 | 12 Mouse (PS/2) |
621 | Memory Address Device 13 Math Coprocessor | 621 | Memory Address Device 13 Math Coprocessor |
622 | -------------- --------------------- 14 Hard Disk controller | 622 | -------------- --------------------- 14 Hard Disk controller |
623 | A000-BFFF EGA Graphics Adpater | 623 | A000-BFFF EGA Graphics Adapter |
624 | A000-C7FF VGA Graphics Adpater | 624 | A000-C7FF VGA Graphics Adapter |
625 | B000-BFFF Mono Graphics Adapter | 625 | B000-BFFF Mono Graphics Adapter |
626 | B800-BFFF Color Graphics Adapter | 626 | B800-BFFF Color Graphics Adapter |
627 | E000-FFFF AT BIOS | 627 | E000-FFFF AT BIOS |
diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 74563b38ffd9..dda15886bcb5 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt | |||
@@ -19,21 +19,17 @@ for real time and multimedia traffic. | |||
19 | 19 | ||
20 | It has a base protocol and pluggable congestion control IDs (CCIDs). | 20 | It has a base protocol and pluggable congestion control IDs (CCIDs). |
21 | 21 | ||
22 | It is at draft RFC status and the homepage for DCCP as a protocol is at: | 22 | It is at experimental RFC status and the homepage for DCCP as a protocol is at: |
23 | http://www.icir.org/kohler/dcp/ | 23 | http://www.read.cs.ucla.edu/dccp/ |
24 | 24 | ||
25 | Missing features | 25 | Missing features |
26 | ================ | 26 | ================ |
27 | 27 | ||
28 | The DCCP implementation does not currently have all the features that are in | 28 | The DCCP implementation does not currently have all the features that are in |
29 | the draft RFC. | 29 | the RFC. |
30 | 30 | ||
31 | In particular the following are missing: | 31 | The known bugs are at: |
32 | - CCID2 support | 32 | http://linux-net.osdl.org/index.php/TODO#DCCP |
33 | - feature negotiation | ||
34 | |||
35 | When testing against other implementations it appears that elapsed time | ||
36 | options are not coded compliant to the specification. | ||
37 | 33 | ||
38 | Socket options | 34 | Socket options |
39 | ============== | 35 | ============== |
@@ -47,12 +43,70 @@ the socket will fall back to 0 (which means that no meaningful service code | |||
47 | is present). Connecting sockets set at most one service option; for | 43 | is present). Connecting sockets set at most one service option; for |
48 | listening sockets, multiple service codes can be specified. | 44 | listening sockets, multiple service codes can be specified. |
49 | 45 | ||
46 | DCCP_SOCKOPT_SEND_CSCOV and DCCP_SOCKOPT_RECV_CSCOV are used for setting the | ||
47 | partial checksum coverage (RFC 4340, sec. 9.2). The default is that checksums | ||
48 | always cover the entire packet and that only fully covered application data is | ||
49 | accepted by the receiver. Hence, when using this feature on the sender, it must | ||
50 | be enabled at the receiver, too, with a suitable choice of CsCov. | ||
51 | |||
52 | DCCP_SOCKOPT_SEND_CSCOV sets the sender checksum coverage. Values in the | ||
53 | range 0..15 are acceptable. The default setting is 0 (full coverage), | ||
54 | values between 1..15 indicate partial coverage. | ||
55 | DCCP_SOCKOPT_RECV_CSCOV is for the receiver and has a different meaning: it | ||
56 | sets a threshold, where again values 0..15 are acceptable. The default | ||
57 | of 0 means that all packets with a partial coverage will be discarded. | ||
58 | Values in the range 1..15 indicate that packets with minimally such a | ||
59 | coverage value are also acceptable. The higher the number, the more | ||
60 | restrictive this setting (see [RFC 4340, sec. 9.2.1]). | ||
61 | |||
62 | Sysctl variables | ||
63 | ================ | ||
64 | Several DCCP default parameters can be managed by the following sysctls | ||
65 | (sysctl net.dccp.default or /proc/sys/net/dccp/default): | ||
66 | |||
67 | request_retries | ||
68 | The number of active connection initiation retries (the number of | ||
69 | Requests minus one) before timing out. In addition, it also governs | ||
70 | the behaviour of the other, passive side: this variable also sets | ||
71 | the number of times DCCP repeats sending a Response when the initial | ||
72 | handshake does not progress from RESPOND to OPEN (i.e. when no Ack | ||
73 | is received after the initial Request). This value should be greater | ||
74 | than 0; a value less than 10 is suggested. Analogue of tcp_syn_retries. | ||
75 | |||
76 | retries1 | ||
77 | How often a DCCP Response is retransmitted until the listening DCCP | ||
78 | side considers its connecting peer dead. Analogue of tcp_retries1. | ||
79 | |||
80 | retries2 | ||
81 | The number of times a general DCCP packet is retransmitted. This has | ||
82 | importance for retransmitted acknowledgments and feature negotiation; | ||
83 | data packets are never retransmitted. Analogue of tcp_retries2. | ||
84 | |||
85 | send_ndp = 1 | ||
86 | Whether or not to send NDP count options (sec. 7.7.2). | ||
87 | |||
88 | send_ackvec = 1 | ||
89 | Whether or not to send Ack Vector options (sec. 11.5). | ||
90 | |||
91 | ack_ratio = 2 | ||
92 | The default Ack Ratio (sec. 11.3) to use. | ||
93 | |||
94 | tx_ccid = 2 | ||
95 | Default CCID for the sender-receiver half-connection. | ||
96 | |||
97 | rx_ccid = 2 | ||
98 | Default CCID for the receiver-sender half-connection. | ||
99 | |||
100 | seq_window = 100 | ||
101 | The initial sequence window (sec. 7.5.2). | ||
102 | |||
103 | tx_qlen = 5 | ||
104 | The size of the transmit buffer in packets. A value of 0 corresponds | ||
105 | to an unbounded transmit buffer. | ||
106 | |||
50 | Notes | 107 | Notes |
51 | ===== | 108 | ===== |
52 | 109 | ||
53 | SELinux does not yet have support for DCCP. You will need to turn it off or | 110 | DCCP does not travel through NAT successfully at present on many boxes. This is |
54 | else you will get EACCES. | 111 | because the checksum covers the pseudo-header as per TCP and UDP. Linux NAT |
55 | 112 | support for DCCP has been added. | |
56 | DCCP does not travel through NAT successfully at present. This is because | ||
57 | the checksum covers the psuedo-header as per TCP and UDP. It should be | ||
58 | relatively trivial to add Linux NAT support for DCCP. | ||
diff --git a/Documentation/networking/e1000.txt b/Documentation/networking/e1000.txt index 5c0a5cc03998..61b171cf5313 100644 --- a/Documentation/networking/e1000.txt +++ b/Documentation/networking/e1000.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | Linux* Base Driver for the Intel(R) PRO/1000 Family of Adapters | 1 | Linux* Base Driver for the Intel(R) PRO/1000 Family of Adapters |
2 | =============================================================== | 2 | =============================================================== |
3 | 3 | ||
4 | November 15, 2005 | 4 | September 26, 2006 |
5 | 5 | ||
6 | 6 | ||
7 | Contents | 7 | Contents |
@@ -9,6 +9,7 @@ Contents | |||
9 | 9 | ||
10 | - In This Release | 10 | - In This Release |
11 | - Identifying Your Adapter | 11 | - Identifying Your Adapter |
12 | - Building and Installation | ||
12 | - Command Line Parameters | 13 | - Command Line Parameters |
13 | - Speed and Duplex Configuration | 14 | - Speed and Duplex Configuration |
14 | - Additional Configurations | 15 | - Additional Configurations |
@@ -41,6 +42,9 @@ or later), lspci, and ifconfig to obtain the same information. | |||
41 | Instructions on updating ethtool can be found in the section "Additional | 42 | Instructions on updating ethtool can be found in the section "Additional |
42 | Configurations" later in this document. | 43 | Configurations" later in this document. |
43 | 44 | ||
45 | NOTE: The Intel(R) 82562v 10/100 Network Connection only provides 10/100 | ||
46 | support. | ||
47 | |||
44 | 48 | ||
45 | Identifying Your Adapter | 49 | Identifying Your Adapter |
46 | ======================== | 50 | ======================== |
@@ -51,28 +55,27 @@ Driver ID Guide at: | |||
51 | http://support.intel.com/support/network/adapter/pro100/21397.htm | 55 | http://support.intel.com/support/network/adapter/pro100/21397.htm |
52 | 56 | ||
53 | For the latest Intel network drivers for Linux, refer to the following | 57 | For the latest Intel network drivers for Linux, refer to the following |
54 | website. In the search field, enter your adapter name or type, or use the | 58 | website. In the search field, enter your adapter name or type, or use the |
55 | networking link on the left to search for your adapter: | 59 | networking link on the left to search for your adapter: |
56 | 60 | ||
57 | http://downloadfinder.intel.com/scripts-df/support_intel.asp | 61 | http://downloadfinder.intel.com/scripts-df/support_intel.asp |
58 | 62 | ||
59 | 63 | ||
60 | Command Line Parameters ======================= | 64 | Command Line Parameters |
65 | ======================= | ||
61 | 66 | ||
62 | If the driver is built as a module, the following optional parameters | 67 | If the driver is built as a module, the following optional parameters |
63 | are used by entering them on the command line with the modprobe or insmod | 68 | are used by entering them on the command line with the modprobe command |
64 | command using this syntax: | 69 | using this syntax: |
65 | 70 | ||
66 | modprobe e1000 [<option>=<VAL1>,<VAL2>,...] | 71 | modprobe e1000 [<option>=<VAL1>,<VAL2>,...] |
67 | 72 | ||
68 | insmod e1000 [<option>=<VAL1>,<VAL2>,...] | ||
69 | |||
70 | For example, with two PRO/1000 PCI adapters, entering: | 73 | For example, with two PRO/1000 PCI adapters, entering: |
71 | 74 | ||
72 | insmod e1000 TxDescriptors=80,128 | 75 | modprobe e1000 TxDescriptors=80,128 |
73 | 76 | ||
74 | loads the e1000 driver with 80 TX descriptors for the first adapter and 128 | 77 | loads the e1000 driver with 80 TX descriptors for the first adapter and |
75 | TX descriptors for the second adapter. | 78 | 128 TX descriptors for the second adapter. |
76 | 79 | ||
77 | The default value for each parameter is generally the recommended setting, | 80 | The default value for each parameter is generally the recommended setting, |
78 | unless otherwise noted. | 81 | unless otherwise noted. |
@@ -87,7 +90,7 @@ NOTES: For more information about the AutoNeg, Duplex, and Speed | |||
87 | http://www.intel.com/design/network/applnots/ap450.htm | 90 | http://www.intel.com/design/network/applnots/ap450.htm |
88 | 91 | ||
89 | A descriptor describes a data buffer and attributes related to | 92 | A descriptor describes a data buffer and attributes related to |
90 | the data buffer. This information is accessed by the hardware. | 93 | the data buffer. This information is accessed by the hardware. |
91 | 94 | ||
92 | 95 | ||
93 | AutoNeg | 96 | AutoNeg |
@@ -96,9 +99,9 @@ AutoNeg | |||
96 | Valid Range: 0x01-0x0F, 0x20-0x2F | 99 | Valid Range: 0x01-0x0F, 0x20-0x2F |
97 | Default Value: 0x2F | 100 | Default Value: 0x2F |
98 | 101 | ||
99 | This parameter is a bit mask that specifies which speed and duplex | 102 | This parameter is a bit-mask that specifies the speed and duplex settings |
100 | settings the board advertises. When this parameter is used, the Speed | 103 | advertised by the adapter. When this parameter is used, the Speed and |
101 | and Duplex parameters must not be specified. | 104 | Duplex parameters must not be specified. |
102 | 105 | ||
103 | NOTE: Refer to the Speed and Duplex section of this readme for more | 106 | NOTE: Refer to the Speed and Duplex section of this readme for more |
104 | information on the AutoNeg parameter. | 107 | information on the AutoNeg parameter. |
@@ -110,14 +113,15 @@ Duplex | |||
110 | Valid Range: 0-2 (0=auto-negotiate, 1=half, 2=full) | 113 | Valid Range: 0-2 (0=auto-negotiate, 1=half, 2=full) |
111 | Default Value: 0 | 114 | Default Value: 0 |
112 | 115 | ||
113 | Defines the direction in which data is allowed to flow. Can be either | 116 | This defines the direction in which data is allowed to flow. Can be |
114 | one or two-directional. If both Duplex and the link partner are set to | 117 | either one or two-directional. If both Duplex and the link partner are |
115 | auto-negotiate, the board auto-detects the correct duplex. If the link | 118 | set to auto-negotiate, the board auto-detects the correct duplex. If the |
116 | partner is forced (either full or half), Duplex defaults to half-duplex. | 119 | link partner is forced (either full or half), Duplex defaults to half- |
120 | duplex. | ||
117 | 121 | ||
118 | 122 | ||
119 | FlowControl | 123 | FlowControl |
120 | ---------- | 124 | ----------- |
121 | Valid Range: 0-3 (0=none, 1=Rx only, 2=Tx only, 3=Rx&Tx) | 125 | Valid Range: 0-3 (0=none, 1=Rx only, 2=Tx only, 3=Rx&Tx) |
122 | Default Value: Reads flow control settings from the EEPROM | 126 | Default Value: Reads flow control settings from the EEPROM |
123 | 127 | ||
@@ -127,57 +131,107 @@ to Ethernet PAUSE frames. | |||
127 | 131 | ||
128 | InterruptThrottleRate | 132 | InterruptThrottleRate |
129 | --------------------- | 133 | --------------------- |
130 | (not supported on Intel 82542, 82543 or 82544-based adapters) | 134 | (not supported on Intel(R) 82542, 82543 or 82544-based adapters) |
131 | Valid Range: 100-100000 (0=off, 1=dynamic) | 135 | Valid Range: 0,1,3,100-100000 (0=off, 1=dynamic, 3=dynamic conservative) |
132 | Default Value: 8000 | 136 | Default Value: 3 |
133 | 137 | ||
134 | This value represents the maximum number of interrupts per second the | 138 | The driver can limit the number of interrupts per second that the adapter |
135 | controller generates. InterruptThrottleRate is another setting used in | 139 | will generate for incoming packets. It does this by writing a value to the |
136 | interrupt moderation. Dynamic mode uses a heuristic algorithm to adjust | 140 | adapter that is based on the maximum number of interrupts that the adapter |
137 | InterruptThrottleRate based on the current traffic load. | 141 | will generate per second. |
142 | |||
143 | Setting InterruptThrottleRate to a value greater than or equal to 100 | ||
144 | will program the adapter to send out a maximum of that many interrupts | ||
145 | per second, even if more packets have come in. This reduces interrupt | ||
146 | load on the system and can lower CPU utilization under heavy load, | ||
147 | but will increase latency as packets are not processed as quickly. | ||
148 | |||
149 | The default behaviour of the driver previously assumed a static | ||
150 | InterruptThrottleRate value of 8000, providing a good fallback value for | ||
151 | all traffic types, but lacking in small packet performance and latency. | ||
152 | The hardware can handle many more small packets per second however, and | ||
153 | for this reason an adaptive interrupt moderation algorithm was implemented. | ||
154 | |||
155 | Since 7.3.x, the driver has two adaptive modes (setting 1 or 3) in which | ||
156 | it dynamically adjusts the InterruptThrottleRate value based on the traffic | ||
157 | that it receives. After determining the type of incoming traffic in the last | ||
158 | timeframe, it will adjust the InterruptThrottleRate to an appropriate value | ||
159 | for that traffic. | ||
160 | |||
161 | The algorithm classifies the incoming traffic every interval into | ||
162 | classes. Once the class is determined, the InterruptThrottleRate value is | ||
163 | adjusted to suit that traffic type the best. There are three classes defined: | ||
164 | "Bulk traffic", for large amounts of packets of normal size; "Low latency", | ||
165 | for small amounts of traffic and/or a significant percentage of small | ||
166 | packets; and "Lowest latency", for almost completely small packets or | ||
167 | minimal traffic. | ||
168 | |||
169 | In dynamic conservative mode, the InterruptThrottleRate value is set to 4000 | ||
170 | for traffic that falls in class "Bulk traffic". If traffic falls in the "Low | ||
171 | latency" or "Lowest latency" class, the InterruptThrottleRate is increased | ||
172 | stepwise to 20000. This default mode is suitable for most applications. | ||
173 | |||
174 | For situations where low latency is vital such as cluster or | ||
175 | grid computing, the algorithm can reduce latency even more when | ||
176 | InterruptThrottleRate is set to mode 1. In this mode, which operates | ||
177 | the same as mode 3, the InterruptThrottleRate will be increased stepwise to | ||
178 | 70000 for traffic in class "Lowest latency". | ||
179 | |||
180 | Setting InterruptThrottleRate to 0 turns off any interrupt moderation | ||
181 | and may improve small packet latency, but is generally not suitable | ||
182 | for bulk throughput traffic. | ||
138 | 183 | ||
139 | NOTE: InterruptThrottleRate takes precedence over the TxAbsIntDelay and | 184 | NOTE: InterruptThrottleRate takes precedence over the TxAbsIntDelay and |
140 | RxAbsIntDelay parameters. In other words, minimizing the receive | 185 | RxAbsIntDelay parameters. In other words, minimizing the receive |
141 | and/or transmit absolute delays does not force the controller to | 186 | and/or transmit absolute delays does not force the controller to |
142 | generate more interrupts than what the Interrupt Throttle Rate | 187 | generate more interrupts than what the Interrupt Throttle Rate |
143 | allows. | 188 | allows. |
144 | 189 | ||
145 | CAUTION: If you are using the Intel PRO/1000 CT Network Connection | 190 | CAUTION: If you are using the Intel(R) PRO/1000 CT Network Connection |
146 | (controller 82547), setting InterruptThrottleRate to a value | 191 | (controller 82547), setting InterruptThrottleRate to a value |
147 | greater than 75,000, may hang (stop transmitting) adapters | 192 | greater than 75,000, may hang (stop transmitting) adapters |
148 | under certain network conditions. If this occurs a NETDEV | 193 | under certain network conditions. If this occurs a NETDEV |
149 | WATCHDOG message is logged in the system event log. In | 194 | WATCHDOG message is logged in the system event log. In |
150 | addition, the controller is automatically reset, restoring | 195 | addition, the controller is automatically reset, restoring |
151 | the network connection. To eliminate the potential for the | 196 | the network connection. To eliminate the potential for the |
152 | hang, ensure that InterruptThrottleRate is set no greater | 197 | hang, ensure that InterruptThrottleRate is set no greater |
153 | than 75,000 and is not set to 0. | 198 | than 75,000 and is not set to 0. |
154 | 199 | ||
155 | NOTE: When e1000 is loaded with default settings and multiple adapters | 200 | NOTE: When e1000 is loaded with default settings and multiple adapters |
156 | are in use simultaneously, the CPU utilization may increase non- | 201 | are in use simultaneously, the CPU utilization may increase non- |
157 | linearly. In order to limit the CPU utilization without impacting | 202 | linearly. In order to limit the CPU utilization without impacting |
158 | the overall throughput, we recommend that you load the driver as | 203 | the overall throughput, we recommend that you load the driver as |
159 | follows: | 204 | follows: |
160 | 205 | ||
161 | insmod e1000.o InterruptThrottleRate=3000,3000,3000 | 206 | modprobe e1000 InterruptThrottleRate=3000,3000,3000 |
162 | 207 | ||
163 | This sets the InterruptThrottleRate to 3000 interrupts/sec for | 208 | This sets the InterruptThrottleRate to 3000 interrupts/sec for |
164 | the first, second, and third instances of the driver. The range | 209 | the first, second, and third instances of the driver. The range |
165 | of 2000 to 3000 interrupts per second works on a majority of | 210 | of 2000 to 3000 interrupts per second works on a majority of |
166 | systems and is a good starting point, but the optimal value will | 211 | systems and is a good starting point, but the optimal value will |
167 | be platform-specific. If CPU utilization is not a concern, use | 212 | be platform-specific. If CPU utilization is not a concern, use |
168 | RX_POLLING (NAPI) and default driver settings. | 213 | RX_POLLING (NAPI) and default driver settings. |
169 | 214 | ||
170 | 215 | ||
216 | |||
171 | RxDescriptors | 217 | RxDescriptors |
172 | ------------- | 218 | ------------- |
173 | Valid Range: 80-256 for 82542 and 82543-based adapters | 219 | Valid Range: 80-256 for 82542 and 82543-based adapters |
174 | 80-4096 for all other supported adapters | 220 | 80-4096 for all other supported adapters |
175 | Default Value: 256 | 221 | Default Value: 256 |
176 | 222 | ||
177 | This value specifies the number of receive descriptors allocated by the | 223 | This value specifies the number of receive buffer descriptors allocated |
178 | driver. Increasing this value allows the driver to buffer more incoming | 224 | by the driver. Increasing this value allows the driver to buffer more |
179 | packets. Each descriptor is 16 bytes. A receive buffer is also | 225 | incoming packets, at the expense of increased system memory utilization. |
180 | allocated for each descriptor and is 2048. | 226 | |
227 | Each descriptor is 16 bytes. A receive buffer is also allocated for each | ||
228 | descriptor and can be either 2048, 4096, 8192, or 16384 bytes, depending | ||
229 | on the MTU setting. The maximum MTU size is 16110. | ||
230 | |||
231 | NOTE: MTU designates the frame size. It only needs to be set for Jumbo | ||
232 | Frames. Depending on the available system resources, the request | ||
233 | for a higher number of receive descriptors may be denied. In this | ||
234 | case, use a lower number. | ||
181 | 235 | ||
182 | 236 | ||
183 | RxIntDelay | 237 | RxIntDelay |
@@ -187,17 +241,17 @@ Default Value: 0 | |||
187 | 241 | ||
188 | This value delays the generation of receive interrupts in units of 1.024 | 242 | This value delays the generation of receive interrupts in units of 1.024 |
189 | microseconds. Receive interrupt reduction can improve CPU efficiency if | 243 | microseconds. Receive interrupt reduction can improve CPU efficiency if |
190 | properly tuned for specific network traffic. Increasing this value adds | 244 | properly tuned for specific network traffic. Increasing this value adds |
191 | extra latency to frame reception and can end up decreasing the throughput | 245 | extra latency to frame reception and can end up decreasing the throughput |
192 | of TCP traffic. If the system is reporting dropped receives, this value | 246 | of TCP traffic. If the system is reporting dropped receives, this value |
193 | may be set too high, causing the driver to run out of available receive | 247 | may be set too high, causing the driver to run out of available receive |
194 | descriptors. | 248 | descriptors. |
195 | 249 | ||
196 | CAUTION: When setting RxIntDelay to a value other than 0, adapters may | 250 | CAUTION: When setting RxIntDelay to a value other than 0, adapters may |
197 | hang (stop transmitting) under certain network conditions. If | 251 | hang (stop transmitting) under certain network conditions. If |
198 | this occurs a NETDEV WATCHDOG message is logged in the system | 252 | this occurs a NETDEV WATCHDOG message is logged in the system |
199 | event log. In addition, the controller is automatically reset, | 253 | event log. In addition, the controller is automatically reset, |
200 | restoring the network connection. To eliminate the potential | 254 | restoring the network connection. To eliminate the potential |
201 | for the hang ensure that RxIntDelay is set to 0. | 255 | for the hang ensure that RxIntDelay is set to 0. |
202 | 256 | ||
203 | 257 | ||
@@ -208,7 +262,7 @@ Valid Range: 0-65535 (0=off) | |||
208 | Default Value: 128 | 262 | Default Value: 128 |
209 | 263 | ||
210 | This value, in units of 1.024 microseconds, limits the delay in which a | 264 | This value, in units of 1.024 microseconds, limits the delay in which a |
211 | receive interrupt is generated. Useful only if RxIntDelay is non-zero, | 265 | receive interrupt is generated. Useful only if RxIntDelay is non-zero, |
212 | this value ensures that an interrupt is generated after the initial | 266 | this value ensures that an interrupt is generated after the initial |
213 | packet is received within the set amount of time. Proper tuning, | 267 | packet is received within the set amount of time. Proper tuning, |
214 | along with RxIntDelay, may improve traffic throughput in specific network | 268 | along with RxIntDelay, may improve traffic throughput in specific network |
@@ -222,9 +276,9 @@ Valid Settings: 0, 10, 100, 1000 | |||
222 | Default Value: 0 (auto-negotiate at all supported speeds) | 276 | Default Value: 0 (auto-negotiate at all supported speeds) |
223 | 277 | ||
224 | Speed forces the line speed to the specified value in megabits per second | 278 | Speed forces the line speed to the specified value in megabits per second |
225 | (Mbps). If this parameter is not specified or is set to 0 and the link | 279 | (Mbps). If this parameter is not specified or is set to 0 and the link |
226 | partner is set to auto-negotiate, the board will auto-detect the correct | 280 | partner is set to auto-negotiate, the board will auto-detect the correct |
227 | speed. Duplex should also be set when Speed is set to either 10 or 100. | 281 | speed. Duplex should also be set when Speed is set to either 10 or 100. |
228 | 282 | ||
229 | 283 | ||
230 | TxDescriptors | 284 | TxDescriptors |
@@ -234,7 +288,7 @@ Valid Range: 80-256 for 82542 and 82543-based adapters | |||
234 | Default Value: 256 | 288 | Default Value: 256 |
235 | 289 | ||
236 | This value is the number of transmit descriptors allocated by the driver. | 290 | This value is the number of transmit descriptors allocated by the driver. |
237 | Increasing this value allows the driver to queue more transmits. Each | 291 | Increasing this value allows the driver to queue more transmits. Each |
238 | descriptor is 16 bytes. | 292 | descriptor is 16 bytes. |
239 | 293 | ||
240 | NOTE: Depending on the available system resources, the request for a | 294 | NOTE: Depending on the available system resources, the request for a |
@@ -248,8 +302,8 @@ Valid Range: 0-65535 (0=off) | |||
248 | Default Value: 64 | 302 | Default Value: 64 |
249 | 303 | ||
250 | This value delays the generation of transmit interrupts in units of | 304 | This value delays the generation of transmit interrupts in units of |
251 | 1.024 microseconds. Transmit interrupt reduction can improve CPU | 305 | 1.024 microseconds. Transmit interrupt reduction can improve CPU |
252 | efficiency if properly tuned for specific network traffic. If the | 306 | efficiency if properly tuned for specific network traffic. If the |
253 | system is reporting dropped transmits, this value may be set too high | 307 | system is reporting dropped transmits, this value may be set too high |
254 | causing the driver to run out of available transmit descriptors. | 308 | causing the driver to run out of available transmit descriptors. |
255 | 309 | ||
@@ -261,7 +315,7 @@ Valid Range: 0-65535 (0=off) | |||
261 | Default Value: 64 | 315 | Default Value: 64 |
262 | 316 | ||
263 | This value, in units of 1.024 microseconds, limits the delay in which a | 317 | This value, in units of 1.024 microseconds, limits the delay in which a |
264 | transmit interrupt is generated. Useful only if TxIntDelay is non-zero, | 318 | transmit interrupt is generated. Useful only if TxIntDelay is non-zero, |
265 | this value ensures that an interrupt is generated after the initial | 319 | this value ensures that an interrupt is generated after the initial |
266 | packet is sent on the wire within the set amount of time. Proper tuning, | 320 | packet is sent on the wire within the set amount of time. Proper tuning, |
267 | along with TxIntDelay, may improve traffic throughput in specific | 321 | along with TxIntDelay, may improve traffic throughput in specific |
@@ -288,15 +342,15 @@ fiber interface board only links at 1000 Mbps full-duplex. | |||
288 | 342 | ||
289 | For copper-based boards, the keywords interact as follows: | 343 | For copper-based boards, the keywords interact as follows: |
290 | 344 | ||
291 | The default operation is auto-negotiate. The board advertises all | 345 | The default operation is auto-negotiate. The board advertises all |
292 | supported speed and duplex combinations, and it links at the highest | 346 | supported speed and duplex combinations, and it links at the highest |
293 | common speed and duplex mode IF the link partner is set to auto-negotiate. | 347 | common speed and duplex mode IF the link partner is set to auto-negotiate. |
294 | 348 | ||
295 | If Speed = 1000, limited auto-negotiation is enabled and only 1000 Mbps | 349 | If Speed = 1000, limited auto-negotiation is enabled and only 1000 Mbps |
296 | is advertised (The 1000BaseT spec requires auto-negotiation.) | 350 | is advertised (The 1000BaseT spec requires auto-negotiation.) |
297 | 351 | ||
298 | If Speed = 10 or 100, then both Speed and Duplex should be set. Auto- | 352 | If Speed = 10 or 100, then both Speed and Duplex should be set. Auto- |
299 | negotiation is disabled, and the AutoNeg parameter is ignored. Partner | 353 | negotiation is disabled, and the AutoNeg parameter is ignored. Partner |
300 | SHOULD also be forced. | 354 | SHOULD also be forced. |
301 | 355 | ||
302 | The AutoNeg parameter is used when more control is required over the | 356 | The AutoNeg parameter is used when more control is required over the |
@@ -304,7 +358,7 @@ auto-negotiation process. It should be used when you wish to control which | |||
304 | speed and duplex combinations are advertised during the auto-negotiation | 358 | speed and duplex combinations are advertised during the auto-negotiation |
305 | process. | 359 | process. |
306 | 360 | ||
307 | The parameter may be specified as either a decimal or hexidecimal value as | 361 | The parameter may be specified as either a decimal or hexadecimal value as |
308 | determined by the bitmap below. | 362 | determined by the bitmap below. |
309 | 363 | ||
310 | Bit position 7 6 5 4 3 2 1 0 | 364 | Bit position 7 6 5 4 3 2 1 0 |
@@ -337,20 +391,19 @@ Additional Configurations | |||
337 | 391 | ||
338 | Configuring the Driver on Different Distributions | 392 | Configuring the Driver on Different Distributions |
339 | ------------------------------------------------- | 393 | ------------------------------------------------- |
340 | |||
341 | Configuring a network driver to load properly when the system is started | 394 | Configuring a network driver to load properly when the system is started |
342 | is distribution dependent. Typically, the configuration process involves | 395 | is distribution dependent. Typically, the configuration process involves |
343 | adding an alias line to /etc/modules.conf or /etc/modprobe.conf as well | 396 | adding an alias line to /etc/modules.conf or /etc/modprobe.conf as well |
344 | as editing other system startup scripts and/or configuration files. Many | 397 | as editing other system startup scripts and/or configuration files. Many |
345 | popular Linux distributions ship with tools to make these changes for you. | 398 | popular Linux distributions ship with tools to make these changes for you. |
346 | To learn the proper way to configure a network device for your system, | 399 | To learn the proper way to configure a network device for your system, |
347 | refer to your distribution documentation. If during this process you are | 400 | refer to your distribution documentation. If during this process you are |
348 | asked for the driver or module name, the name for the Linux Base Driver | 401 | asked for the driver or module name, the name for the Linux Base Driver |
349 | for the Intel PRO/1000 Family of Adapters is e1000. | 402 | for the Intel(R) PRO/1000 Family of Adapters is e1000. |
350 | 403 | ||
351 | As an example, if you install the e1000 driver for two PRO/1000 adapters | 404 | As an example, if you install the e1000 driver for two PRO/1000 adapters |
352 | (eth0 and eth1) and set the speed and duplex to 10full and 100half, add | 405 | (eth0 and eth1) and set the speed and duplex to 10full and 100half, add |
353 | the following to modules.conf or modprobe.conf: | 406 | the following to modules.conf or modprobe.conf: |
354 | 407 | ||
355 | alias eth0 e1000 | 408 | alias eth0 e1000 |
356 | alias eth1 e1000 | 409 | alias eth1 e1000 |
@@ -358,9 +411,8 @@ Additional Configurations | |||
358 | 411 | ||
359 | Viewing Link Messages | 412 | Viewing Link Messages |
360 | --------------------- | 413 | --------------------- |
361 | |||
362 | Link messages will not be displayed to the console if the distribution is | 414 | Link messages will not be displayed to the console if the distribution is |
363 | restricting system messages. In order to see network driver link messages | 415 | restricting system messages. In order to see network driver link messages |
364 | on your console, set dmesg to eight by entering the following: | 416 | on your console, set dmesg to eight by entering the following: |
365 | 417 | ||
366 | dmesg -n 8 | 418 | dmesg -n 8 |
@@ -369,11 +421,9 @@ Additional Configurations | |||
369 | 421 | ||
370 | Jumbo Frames | 422 | Jumbo Frames |
371 | ------------ | 423 | ------------ |
372 | 424 | Jumbo Frames support is enabled by changing the MTU to a value larger than | |
373 | The driver supports Jumbo Frames for all adapters except 82542 and | 425 | the default of 1500. Use the ifconfig command to increase the MTU size. |
374 | 82573-based adapters. Jumbo Frames support is enabled by changing the | 426 | For example: |
375 | MTU to a value larger than the default of 1500. Use the ifconfig command | ||
376 | to increase the MTU size. For example: | ||
377 | 427 | ||
378 | ifconfig eth<x> mtu 9000 up | 428 | ifconfig eth<x> mtu 9000 up |
379 | 429 | ||
@@ -390,26 +440,49 @@ Additional Configurations | |||
390 | 440 | ||
391 | - To enable Jumbo Frames, increase the MTU size on the interface beyond | 441 | - To enable Jumbo Frames, increase the MTU size on the interface beyond |
392 | 1500. | 442 | 1500. |
393 | - The maximum MTU setting for Jumbo Frames is 16110. This value coincides | 443 | |
444 | - The maximum MTU setting for Jumbo Frames is 16110. This value coincides | ||
394 | with the maximum Jumbo Frames size of 16128. | 445 | with the maximum Jumbo Frames size of 16128. |
446 | |||
395 | - Using Jumbo Frames at 10 or 100 Mbps may result in poor performance or | 447 | - Using Jumbo Frames at 10 or 100 Mbps may result in poor performance or |
396 | loss of link. | 448 | loss of link. |
449 | |||
397 | - Some Intel gigabit adapters that support Jumbo Frames have a frame size | 450 | - Some Intel gigabit adapters that support Jumbo Frames have a frame size |
398 | limit of 9238 bytes, with a corresponding MTU size limit of 9216 bytes. | 451 | limit of 9238 bytes, with a corresponding MTU size limit of 9216 bytes. |
399 | The adapters with this limitation are based on the Intel 82571EB and | 452 | The adapters with this limitation are based on the Intel(R) 82571EB, |
400 | 82572EI controllers, which correspond to these product names: | 453 | 82572EI, 82573L and 80003ES2LAN controllers. These correspond to the |
401 | Intel® PRO/1000 PT Dual Port Server Adapter | 454 | following product names: |
402 | Intel® PRO/1000 PF Dual Port Server Adapter | 455 | Intel(R) PRO/1000 PT Server Adapter |
403 | Intel® PRO/1000 PT Server Adapter | 456 | Intel(R) PRO/1000 PT Desktop Adapter |
404 | Intel® PRO/1000 PT Desktop Adapter | 457 | Intel(R) PRO/1000 PT Network Connection |
405 | Intel® PRO/1000 PF Server Adapter | 458 | Intel(R) PRO/1000 PT Dual Port Server Adapter |
406 | 459 | Intel(R) PRO/1000 PT Dual Port Network Connection | |
407 | - The Intel PRO/1000 PM Network Connection does not support jumbo frames. | 460 | Intel(R) PRO/1000 PF Server Adapter |
461 | Intel(R) PRO/1000 PF Network Connection | ||
462 | Intel(R) PRO/1000 PF Dual Port Server Adapter | ||
463 | Intel(R) PRO/1000 PB Server Connection | ||
464 | Intel(R) PRO/1000 PL Network Connection | ||
465 | Intel(R) PRO/1000 EB Network Connection with I/O Acceleration | ||
466 | Intel(R) PRO/1000 EB Backplane Connection with I/O Acceleration | ||
467 | Intel(R) PRO/1000 PT Quad Port Server Adapter | ||
468 | |||
469 | - Adapters based on the Intel(R) 82542 and 82573V/E controllers do not | |
470 | support Jumbo Frames. These correspond to the following product names: | ||
471 | Intel(R) PRO/1000 Gigabit Server Adapter | ||
472 | Intel(R) PRO/1000 PM Network Connection | ||
473 | |||
474 | - The following adapters do not support Jumbo Frames: | ||
475 | Intel(R) 82562V 10/100 Network Connection | ||
476 | Intel(R) 82566DM Gigabit Network Connection | ||
477 | Intel(R) 82566DC Gigabit Network Connection | ||
478 | Intel(R) 82566MM Gigabit Network Connection | ||
479 | Intel(R) 82566MC Gigabit Network Connection | ||
480 | Intel(R) 82562GT 10/100 Network Connection | ||
481 | Intel(R) 82562G 10/100 Network Connection | ||
408 | 482 | ||
409 | 483 | ||
410 | Ethtool | 484 | Ethtool |
411 | ------- | 485 | ------- |
412 | |||
413 | The driver utilizes the ethtool interface for driver configuration and | 486 | The driver utilizes the ethtool interface for driver configuration and |
414 | diagnostics, as well as displaying statistical information. Ethtool | 487 | diagnostics, as well as displaying statistical information. Ethtool |
415 | version 1.6 or later is required for this functionality. | 488 | version 1.6 or later is required for this functionality. |
@@ -417,15 +490,14 @@ Additional Configurations | |||
417 | The latest release of ethtool can be found from | 490 | The latest release of ethtool can be found from |
418 | http://sourceforge.net/projects/gkernel. | 491 | http://sourceforge.net/projects/gkernel. |
419 | 492 | ||
420 | NOTE: Ethtool 1.6 only supports a limited set of ethtool options. Support | 493 | NOTE: Ethtool 1.6 only supports a limited set of ethtool options. Support |
421 | for a more complete ethtool feature set can be enabled by upgrading | 494 | for a more complete ethtool feature set can be enabled by upgrading |
422 | ethtool to ethtool-1.8.1. | 495 | ethtool to ethtool-1.8.1. |
423 | 496 | ||
424 | Enabling Wake on LAN* (WoL) | 497 | Enabling Wake on LAN* (WoL) |
425 | --------------------------- | 498 | --------------------------- |
426 | 499 | WoL is configured through the Ethtool* utility. Ethtool is included with | |
427 | WoL is configured through the Ethtool* utility. Ethtool is included with | 500 | all versions of Red Hat after Red Hat 7.2. For other Linux distributions, |
428 | all versions of Red Hat after Red Hat 7.2. For other Linux distributions, | ||
429 | download and install Ethtool from the following website: | 501 | download and install Ethtool from the following website: |
430 | http://sourceforge.net/projects/gkernel. | 502 | http://sourceforge.net/projects/gkernel. |
431 | 503 | ||
@@ -436,11 +508,17 @@ Additional Configurations | |||
436 | For this driver version, in order to enable WoL, the e1000 driver must be | 508 | For this driver version, in order to enable WoL, the e1000 driver must be |
437 | loaded when shutting down or rebooting the system. | 509 | loaded when shutting down or rebooting the system. |
438 | 510 | ||
511 | Wake On LAN is only supported on port A for the following devices: | ||
512 | Intel(R) PRO/1000 PT Dual Port Network Connection | ||
513 | Intel(R) PRO/1000 PT Dual Port Server Connection | ||
514 | Intel(R) PRO/1000 PT Dual Port Server Adapter | ||
515 | Intel(R) PRO/1000 PF Dual Port Server Adapter | ||
516 | Intel(R) PRO/1000 PT Quad Port Server Adapter | ||
517 | |||
439 | NAPI | 518 | NAPI |
440 | ---- | 519 | ---- |
441 | 520 | NAPI (Rx polling mode) is supported in the e1000 driver. NAPI is enabled | |
442 | NAPI (Rx polling mode) is supported in the e1000 driver. NAPI is enabled | 521 | or disabled based on the configuration of the kernel. To override |
443 | or disabled based on the configuration of the kernel. To override | ||
444 | the default, use the following compile-time flags. | 522 | the default, use the following compile-time flags. |
445 | 523 | ||
446 | To enable NAPI, compile the driver module, passing in a configuration option: | 524 | To enable NAPI, compile the driver module, passing in a configuration option: |
@@ -457,88 +535,105 @@ Additional Configurations | |||
457 | Known Issues | 535 | Known Issues |
458 | ============ | 536 | ============ |
459 | 537 | ||
460 | Jumbo Frames System Requirement | 538 | Dropped Receive Packets on Half-duplex 10/100 Networks |
461 | ------------------------------- | 539 | ------------------------------------------------------ |
462 | 540 | If you have an Intel PCI Express adapter running at 10 Mbps or 100 Mbps, half- |
463 | Memory allocation failures have been observed on Linux systems with 64 MB | 541 | duplex, you may observe occasional dropped receive packets. There are no |
464 | of RAM or less that are running Jumbo Frames. If you are using Jumbo | 542 | workarounds for this problem in this network configuration. The network must |
465 | Frames, your system may require more than the advertised minimum | 543 | be updated to operate in full-duplex, and/or 1000 Mbps only. |
466 | requirement of 64 MB of system memory. | 544 | |
467 | 545 | Jumbo Frames System Requirement | |
468 | Performance Degradation with Jumbo Frames | 546 | ------------------------------- |
469 | ----------------------------------------- | 547 | Memory allocation failures have been observed on Linux systems with 64 MB |
470 | 548 | of RAM or less that are running Jumbo Frames. If you are using Jumbo | |
471 | Degradation in throughput performance may be observed in some Jumbo frames | 549 | Frames, your system may require more than the advertised minimum |
472 | environments. If this is observed, increasing the application's socket | 550 | requirement of 64 MB of system memory. |
473 | buffer size and/or increasing the /proc/sys/net/ipv4/tcp_*mem entry values | 551 | |
474 | may help. See the specific application manual and | 552 | Performance Degradation with Jumbo Frames |
475 | /usr/src/linux*/Documentation/ | 553 | ----------------------------------------- |
476 | networking/ip-sysctl.txt for more details. | 554 | Degradation in throughput performance may be observed in some Jumbo frames |
477 | 555 | environments. If this is observed, increasing the application's socket | |
478 | Jumbo frames on Foundry BigIron 8000 switch | 556 | buffer size and/or increasing the /proc/sys/net/ipv4/tcp_*mem entry values |
479 | ------------------------------------------- | 557 | may help. See the specific application manual and |
480 | There is a known issue using Jumbo frames when connected to a Foundry | 558 | /usr/src/linux*/Documentation/ |
481 | BigIron 8000 switch. This is a 3rd party limitation. If you experience | 559 | networking/ip-sysctl.txt for more details. |
482 | loss of packets, lower the MTU size. | 560 | |
483 | 561 | Jumbo Frames on Foundry BigIron 8000 switch | |
484 | Multiple Interfaces on Same Ethernet Broadcast Network | 562 | ------------------------------------------- |
485 | ------------------------------------------------------ | 563 | There is a known issue using Jumbo frames when connected to a Foundry |
486 | 564 | BigIron 8000 switch. This is a 3rd party limitation. If you experience | |
487 | Due to the default ARP behavior on Linux, it is not possible to have | 565 | loss of packets, lower the MTU size. |
488 | one system on two IP networks in the same Ethernet broadcast domain | 566 | |
489 | (non-partitioned switch) behave as expected. All Ethernet interfaces | 567 | Allocating Rx Buffers when Using Jumbo Frames |
490 | will respond to IP traffic for any IP address assigned to the system. | 568 | --------------------------------------------- |
491 | This results in unbalanced receive traffic. | 569 | Allocating Rx buffers when using Jumbo Frames on 2.6.x kernels may fail if |
492 | 570 | the available memory is heavily fragmented. This issue may be seen with PCI-X | |
493 | If you have multiple interfaces in a server, either turn on ARP | 571 | adapters or with packet split disabled. This can be reduced or eliminated |
494 | filtering by entering: | 572 | by changing the amount of available memory for receive buffer allocation, by |
495 | 573 | increasing /proc/sys/vm/min_free_kbytes. | |
496 | echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter | 574 | |
497 | (this only works if your kernel's version is higher than 2.4.5), | 575 | Multiple Interfaces on Same Ethernet Broadcast Network |
498 | 576 | ------------------------------------------------------ | |
499 | NOTE: This setting is not saved across reboots. The configuration | 577 | Due to the default ARP behavior on Linux, it is not possible to have |
500 | change can be made permanent by adding the line: | 578 | one system on two IP networks in the same Ethernet broadcast domain |
501 | net.ipv4.conf.all.arp_filter = 1 | 579 | (non-partitioned switch) behave as expected. All Ethernet interfaces |
502 | to the file /etc/sysctl.conf | 580 | will respond to IP traffic for any IP address assigned to the system. |
503 | 581 | This results in unbalanced receive traffic. | |
504 | or, | 582 | |
505 | 583 | If you have multiple interfaces in a server, either turn on ARP | |
506 | install the interfaces in separate broadcast domains (either in | 584 | filtering by entering: |
507 | different switches or in a switch partitioned to VLANs). | 585 | |
508 | 586 | echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter | |
509 | 82541/82547 can't link or are slow to link with some link partners | 587 | (this only works if your kernel's version is higher than 2.4.5), |
510 | ----------------------------------------------------------------- | 588 | |
511 | 589 | NOTE: This setting is not saved across reboots. The configuration | |
512 | There is a known compatibility issue with 82541/82547 and some | 590 | change can be made permanent by adding the line: |
513 | low-end switches where the link will not be established, or will | 591 | net.ipv4.conf.all.arp_filter = 1 |
514 | be slow to establish. In particular, these switches are known to | 592 | to the file /etc/sysctl.conf |
515 | be incompatible with 82541/82547: | 593 | |
516 | 594 | or, | |
517 | Planex FXG-08TE | 595 | |
518 | I-O Data ETG-SH8 | 596 | install the interfaces in separate broadcast domains (either in |
519 | 597 | different switches or in a switch partitioned to VLANs). | |
520 | To workaround this issue, the driver can be compiled with an override | 598 | |
521 | of the PHY's master/slave setting. Forcing master or forcing slave | 599 | 82541/82547 can't link or are slow to link with some link partners |
522 | mode will improve time-to-link. | 600 | ----------------------------------------------------------------- |
523 | 601 | There is a known compatibility issue with 82541/82547 and some | |
524 | # make EXTRA_CFLAGS=-DE1000_MASTER_SLAVE=<n> | 602 | low-end switches where the link will not be established, or will |
525 | 603 | be slow to establish. In particular, these switches are known to | |
526 | Where <n> is: | 604 | be incompatible with 82541/82547: |
527 | 605 | ||
528 | 0 = Hardware default | 606 | Planex FXG-08TE |
529 | 1 = Master mode | 607 | I-O Data ETG-SH8 |
530 | 2 = Slave mode | 608 | |
531 | 3 = Auto master/slave | 609 | To workaround this issue, the driver can be compiled with an override |
532 | 610 | of the PHY's master/slave setting. Forcing master or forcing slave | |
533 | Disable rx flow control with ethtool | 611 | mode will improve time-to-link. |
534 | ------------------------------------ | 612 | |
535 | 613 | # make CFLAGS_EXTRA=-DE1000_MASTER_SLAVE=<n> | |
536 | In order to disable receive flow control using ethtool, you must turn | 614 | |
537 | off auto-negotiation on the same command line. | 615 | Where <n> is: |
538 | 616 | ||
539 | For example: | 617 | 0 = Hardware default |
540 | 618 | 1 = Master mode | |
541 | ethtool -A eth? autoneg off rx off | 619 | 2 = Slave mode |
620 | 3 = Auto master/slave | ||
621 | |||
622 | Disable rx flow control with ethtool | ||
623 | ------------------------------------ | ||
624 | In order to disable receive flow control using ethtool, you must turn | ||
625 | off auto-negotiation on the same command line. | ||
626 | |||
627 | For example: | ||
628 | |||
629 | ethtool -A eth? autoneg off rx off | ||
630 | |||
631 | Unplugging network cable while ethtool -p is running | ||
632 | ---------------------------------------------------- | ||
633 | In kernel versions 2.5.50 and later (including 2.6 kernel), unplugging | ||
634 | the network cable while ethtool -p is running will cause the system to | ||
635 | become unresponsive to keyboard commands, except for control-alt-delete. | ||
636 | Restarting the system appears to be the only remedy. | ||
542 | 637 | ||
543 | 638 | ||
544 | Support | 639 | Support |
@@ -548,24 +643,10 @@ For general information, go to the Intel support website at: | |||
548 | 643 | ||
549 | http://support.intel.com | 644 | http://support.intel.com |
550 | 645 | ||
551 | or the Intel Wired Networking project hosted by Sourceforge at: | 646 | or the Intel Wired Networking project hosted by Sourceforge at: |
552 | 647 | ||
553 | http://sourceforge.net/projects/e1000 | 648 | http://sourceforge.net/projects/e1000 |
554 | 649 | ||
555 | If an issue is identified with the released source code on the supported | 650 | If an issue is identified with the released source code on the supported |
556 | kernel with a supported adapter, email the specific information related | 651 | kernel with a supported adapter, email the specific information related |
557 | to the issue to e1000-devel@lists.sourceforge.net | 652 | to the issue to e1000-devel@lists.sf.net |
558 | |||
559 | |||
560 | License | ||
561 | ======= | ||
562 | |||
563 | This software program is released under the terms of a license agreement | ||
564 | between you ('Licensee') and Intel. Do not use or load this software or any | ||
565 | associated materials (collectively, the 'Software') until you have carefully | ||
566 | read the full terms and conditions of the file COPYING located in this software | ||
567 | package. By loading or using the Software, you agree to the terms of this | ||
568 | Agreement. If you do not agree with the terms of this Agreement, do not | ||
569 | install or use the Software. | ||
570 | |||
571 | * Other names and brands may be claimed as the property of others. | ||
diff --git a/Documentation/networking/generic_netlink.txt b/Documentation/networking/generic_netlink.txt new file mode 100644 index 000000000000..d4f8b8b9b53c --- /dev/null +++ b/Documentation/networking/generic_netlink.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | A wiki document on how to use Generic Netlink can be found here: | ||
2 | |||
3 | * http://linux-net.osdl.org/index.php/Generic_Netlink_HOWTO | ||
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index fd3c0c012351..a0f6842368c3 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt | |||
@@ -101,6 +101,11 @@ inet_peer_gc_maxtime - INTEGER | |||
101 | 101 | ||
102 | TCP variables: | 102 | TCP variables: |
103 | 103 | ||
104 | somaxconn - INTEGER | ||
105 | Limit of socket listen() backlog, known in userspace as SOMAXCONN. | ||
106 | Defaults to 128. See also tcp_max_syn_backlog for additional tuning | ||
107 | for TCP sockets. | ||
108 | |||
104 | tcp_abc - INTEGER | 109 | tcp_abc - INTEGER |
105 | Controls Appropriate Byte Count (ABC) defined in RFC3465. | 110 | Controls Appropriate Byte Count (ABC) defined in RFC3465. |
106 | ABC is a way of increasing congestion window (cwnd) more slowly | 111 | ABC is a way of increasing congestion window (cwnd) more slowly |
@@ -112,48 +117,51 @@ tcp_abc - INTEGER | |||
112 | of two segments to compensate for delayed acknowledgments. | 117 | of two segments to compensate for delayed acknowledgments. |
113 | Default: 0 (off) | 118 | Default: 0 (off) |
114 | 119 | ||
115 | tcp_syn_retries - INTEGER | 120 | tcp_abort_on_overflow - BOOLEAN |
116 | Number of times initial SYNs for an active TCP connection attempt | 121 | If listening service is too slow to accept new connections, |
117 | will be retransmitted. Should not be higher than 255. Default value | 122 | reset them. Default state is FALSE. It means that if overflow |
118 | is 5, which corresponds to ~180seconds. | 123 | occurred due to a burst, connection will recover. Enable this |
124 | option _only_ if you are really sure that listening daemon | ||
125 | cannot be tuned to accept connections faster. Enabling this | ||
126 | option can harm clients of your server. | ||
119 | 127 | ||
120 | tcp_synack_retries - INTEGER | 128 | tcp_adv_win_scale - INTEGER |
121 | Number of times SYNACKs for a passive TCP connection attempt will | 129 | Count buffering overhead as bytes/2^tcp_adv_win_scale |
122 | be retransmitted. Should not be higher than 255. Default value | 130 | (if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale), |
123 | is 5, which corresponds to ~180seconds. | 131 | if it is <= 0. |
132 | Default: 2 | ||
124 | 133 | ||
125 | tcp_keepalive_time - INTEGER | 134 | tcp_allowed_congestion_control - STRING |
126 | How often TCP sends out keepalive messages when keepalive is enabled. | 135 | Show/set the congestion control choices available to non-privileged |
127 | Default: 2hours. | 136 | processes. The list is a subset of those listed in |
137 | tcp_available_congestion_control. | ||
138 | Default is "reno" and the default setting (tcp_congestion_control). | ||
128 | 139 | ||
129 | tcp_keepalive_probes - INTEGER | 140 | tcp_app_win - INTEGER |
130 | How many keepalive probes TCP sends out, until it decides that the | 141 | Reserve max(window/2^tcp_app_win, mss) of window for application |
131 | connection is broken. Default value: 9. | 142 | buffer. Value 0 is special, it means that nothing is reserved. |
143 | Default: 31 | ||
132 | 144 | ||
133 | tcp_keepalive_intvl - INTEGER | 145 | tcp_available_congestion_control - STRING |
134 | How frequently the probes are send out. Multiplied by | 146 | Shows the available congestion control choices that are registered. |
135 | tcp_keepalive_probes it is time to kill not responding connection, | 147 | More congestion control algorithms may be available as modules, |
136 | after probes started. Default value: 75sec i.e. connection | 148 | but not loaded. |
137 | will be aborted after ~11 minutes of retries. | ||
138 | 149 | ||
139 | tcp_retries1 - INTEGER | 150 | tcp_congestion_control - STRING |
140 | How many times to retry before deciding that something is wrong | 151 | Set the congestion control algorithm to be used for new |
141 | and it is necessary to report this suspicion to network layer. | 152 | connections. The algorithm "reno" is always available, but |
142 | Minimal RFC value is 3, it is default, which corresponds | 153 | additional choices may be available based on kernel configuration. |
143 | to ~3sec-8min depending on RTO. | 154 | Default is set as part of kernel configuration. |
144 | 155 | ||
145 | tcp_retries2 - INTEGER | 156 | tcp_dsack - BOOLEAN |
146 | How may times to retry before killing alive TCP connection. | 157 | Allows TCP to send "duplicate" SACKs. |
147 | RFC1122 says that the limit should be longer than 100 sec. | ||
148 | It is too small number. Default value 15 corresponds to ~13-30min | ||
149 | depending on RTO. | ||
150 | 158 | ||
151 | tcp_orphan_retries - INTEGER | 159 | tcp_ecn - BOOLEAN |
152 | How may times to retry before killing TCP connection, closed | 160 | Enable Explicit Congestion Notification in TCP. |
153 | by our side. Default value 7 corresponds to ~50sec-16min | 161 | |
154 | depending on RTO. If you machine is loaded WEB server, | 162 | tcp_fack - BOOLEAN |
155 | you should think about lowering this value, such sockets | 163 | Enable FACK congestion avoidance and fast retransmission. |
156 | may consume significant resources. Cf. tcp_max_orphans. | 164 | The value is not used, if tcp_sack is not enabled. |
157 | 165 | ||
158 | tcp_fin_timeout - INTEGER | 166 | tcp_fin_timeout - INTEGER |
159 | Time to hold socket in state FIN-WAIT-2, if it was closed | 167 | Time to hold socket in state FIN-WAIT-2, if it was closed |
@@ -166,24 +174,33 @@ tcp_fin_timeout - INTEGER | |||
166 | because they eat maximum 1.5K of memory, but they tend | 174 | because they eat maximum 1.5K of memory, but they tend |
167 | to live longer. Cf. tcp_max_orphans. | 175 | to live longer. Cf. tcp_max_orphans. |
168 | 176 | ||
169 | tcp_max_tw_buckets - INTEGER | 177 | tcp_frto - BOOLEAN |
170 | Maximal number of timewait sockets held by system simultaneously. | 178 | Enables F-RTO, an enhanced recovery algorithm for TCP retransmission |
171 | If this number is exceeded time-wait socket is immediately destroyed | 179 | timeouts. It is particularly beneficial in wireless environments |
172 | and warning is printed. This limit exists only to prevent | 180 | where packet loss is typically due to random radio interference |
173 | simple DoS attacks, you _must_ not lower the limit artificially, | 181 | rather than intermediate router congestion. |
174 | but rather increase it (probably, after increasing installed memory), | ||
175 | if network conditions require more than default value. | ||
176 | 182 | ||
177 | tcp_tw_recycle - BOOLEAN | 183 | tcp_keepalive_time - INTEGER |
178 | Enable fast recycling TIME-WAIT sockets. Default value is 0. | 184 | How often TCP sends out keepalive messages when keepalive is enabled. |
179 | It should not be changed without advice/request of technical | 185 | Default: 2hours. |
180 | experts. | ||
181 | 186 | ||
182 | tcp_tw_reuse - BOOLEAN | 187 | tcp_keepalive_probes - INTEGER |
183 | Allow to reuse TIME-WAIT sockets for new connections when it is | 188 | How many keepalive probes TCP sends out, until it decides that the |
184 | safe from protocol viewpoint. Default value is 0. | 189 | connection is broken. Default value: 9. |
185 | It should not be changed without advice/request of technical | 190 | |
186 | experts. | 191 | tcp_keepalive_intvl - INTEGER |
192 | How frequently the probes are sent out. Multiplied by | ||
193 | tcp_keepalive_probes it is time to kill not responding connection, | ||
194 | after probes started. Default value: 75sec i.e. connection | ||
195 | will be aborted after ~11 minutes of retries. | ||
196 | |||
197 | tcp_low_latency - BOOLEAN | ||
198 | If set, the TCP stack makes decisions that prefer lower | ||
199 | latency as opposed to higher throughput. By default, this | ||
200 | option is not set meaning that higher throughput is preferred. | ||
201 | An example of an application where this default should be | ||
202 | changed would be a Beowulf compute cluster. | ||
203 | Default: 0 | ||
187 | 204 | ||
188 | tcp_max_orphans - INTEGER | 205 | tcp_max_orphans - INTEGER |
189 | Maximal number of TCP sockets not attached to any user file handle, | 206 | Maximal number of TCP sockets not attached to any user file handle, |
@@ -197,41 +214,6 @@ tcp_max_orphans - INTEGER | |||
197 | more aggressively. Let me to remind again: each orphan eats | 214 | more aggressively. Let me to remind again: each orphan eats |
198 | up to ~64K of unswappable memory. | 215 | up to ~64K of unswappable memory. |
199 | 216 | ||
200 | tcp_abort_on_overflow - BOOLEAN | ||
201 | If listening service is too slow to accept new connections, | ||
202 | reset them. Default state is FALSE. It means that if overflow | ||
203 | occurred due to a burst, connection will recover. Enable this | ||
204 | option _only_ if you are really sure that listening daemon | ||
205 | cannot be tuned to accept connections faster. Enabling this | ||
206 | option can harm clients of your server. | ||
207 | |||
208 | tcp_syncookies - BOOLEAN | ||
209 | Only valid when the kernel was compiled with CONFIG_SYNCOOKIES | ||
210 | Send out syncookies when the syn backlog queue of a socket | ||
211 | overflows. This is to prevent against the common 'syn flood attack' | ||
212 | Default: FALSE | ||
213 | |||
214 | Note, that syncookies is fallback facility. | ||
215 | It MUST NOT be used to help highly loaded servers to stand | ||
216 | against legal connection rate. If you see synflood warnings | ||
217 | in your logs, but investigation shows that they occur | ||
218 | because of overload with legal connections, you should tune | ||
219 | another parameters until this warning disappear. | ||
220 | See: tcp_max_syn_backlog, tcp_synack_retries, tcp_abort_on_overflow. | ||
221 | |||
222 | syncookies seriously violate TCP protocol, do not allow | ||
223 | to use TCP extensions, can result in serious degradation | ||
224 | of some services (f.e. SMTP relaying), visible not by you, | ||
225 | but your clients and relays, contacting you. While you see | ||
226 | synflood warnings in logs not being really flooded, your server | ||
227 | is seriously misconfigured. | ||
228 | |||
229 | tcp_stdurg - BOOLEAN | ||
230 | Use the Host requirements interpretation of the TCP urg pointer field. | ||
231 | Most hosts use the older BSD interpretation, so if you turn this on | ||
232 | Linux might not communicate correctly with them. | ||
233 | Default: FALSE | ||
234 | |||
235 | tcp_max_syn_backlog - INTEGER | 217 | tcp_max_syn_backlog - INTEGER |
236 | Maximal number of remembered connection requests, which are | 218 | Maximal number of remembered connection requests, which are |
237 | still did not receive an acknowledgment from connecting client. | 219 | still did not receive an acknowledgment from connecting client. |
@@ -239,24 +221,34 @@ tcp_max_syn_backlog - INTEGER | |||
239 | and 128 for low memory machines. If server suffers of overload, | 221 | and 128 for low memory machines. If server suffers of overload, |
240 | try to increase this number. | 222 | try to increase this number. |
241 | 223 | ||
242 | tcp_window_scaling - BOOLEAN | 224 | tcp_max_tw_buckets - INTEGER |
243 | Enable window scaling as defined in RFC1323. | 225 | Maximal number of timewait sockets held by system simultaneously. |
226 | If this number is exceeded time-wait socket is immediately destroyed | ||
227 | and warning is printed. This limit exists only to prevent | ||
228 | simple DoS attacks, you _must_ not lower the limit artificially, | ||
229 | but rather increase it (probably, after increasing installed memory), | ||
230 | if network conditions require more than default value. | ||
244 | 231 | ||
245 | tcp_timestamps - BOOLEAN | 232 | tcp_mem - vector of 3 INTEGERs: min, pressure, max |
246 | Enable timestamps as defined in RFC1323. | 233 | min: below this number of pages TCP is not bothered about its |
234 | memory appetite. | ||
247 | 235 | ||
248 | tcp_sack - BOOLEAN | 236 | pressure: when amount of memory allocated by TCP exceeds this number |
249 | Enable select acknowledgments (SACKS). | 237 | of pages, TCP moderates its memory consumption and enters memory |
238 | pressure mode, which is exited when memory consumption falls | ||
239 | under "min". | ||
250 | 240 | ||
251 | tcp_fack - BOOLEAN | 241 | max: number of pages allowed for queueing by all TCP sockets. |
252 | Enable FACK congestion avoidance and fast retransmission. | ||
253 | The value is not used, if tcp_sack is not enabled. | ||
254 | 242 | ||
255 | tcp_dsack - BOOLEAN | 243 | Defaults are calculated at boot time from amount of available |
256 | Allows TCP to send "duplicate" SACKs. | 244 | memory. |
257 | 245 | ||
258 | tcp_ecn - BOOLEAN | 246 | tcp_orphan_retries - INTEGER |
259 | Enable Explicit Congestion Notification in TCP. | 247 | How may times to retry before killing TCP connection, closed |
248 | by our side. Default value 7 corresponds to ~50sec-16min | ||
249 | depending on RTO. If you machine is loaded WEB server, | ||
250 | you should think about lowering this value, such sockets | ||
251 | may consume significant resources. Cf. tcp_max_orphans. | ||
260 | 252 | ||
261 | tcp_reordering - INTEGER | 253 | tcp_reordering - INTEGER |
262 | Maximal reordering of packets in a TCP stream. | 254 | Maximal reordering of packets in a TCP stream. |
@@ -267,20 +259,23 @@ tcp_retrans_collapse - BOOLEAN | |||
267 | On retransmit try to send bigger packets to work around bugs in | 259 | On retransmit try to send bigger packets to work around bugs in |
268 | certain TCP stacks. | 260 | certain TCP stacks. |
269 | 261 | ||
270 | tcp_wmem - vector of 3 INTEGERs: min, default, max | 262 | tcp_retries1 - INTEGER |
271 | min: Amount of memory reserved for send buffers for TCP socket. | 263 | How many times to retry before deciding that something is wrong |
272 | Each TCP socket has rights to use it due to fact of its birth. | 264 | and it is necessary to report this suspicion to network layer. |
273 | Default: 4K | 265 | Minimal RFC value is 3, it is default, which corresponds |
266 | to ~3sec-8min depending on RTO. | ||
274 | 267 | ||
275 | default: Amount of memory allowed for send buffers for TCP socket | 268 | tcp_retries2 - INTEGER |
276 | by default. This value overrides net.core.wmem_default used | 269 | How may times to retry before killing alive TCP connection. |
277 | by other protocols, it is usually lower than net.core.wmem_default. | 270 | RFC1122 says that the limit should be longer than 100 sec. |
278 | Default: 16K | 271 | It is too small number. Default value 15 corresponds to ~13-30min |
272 | depending on RTO. | ||
279 | 273 | ||
280 | max: Maximal amount of memory allowed for automatically selected | 274 | tcp_rfc1337 - BOOLEAN |
281 | send buffers for TCP socket. This value does not override | 275 | If set, the TCP stack behaves conforming to RFC1337. If unset, |
282 | net.core.wmem_max, "static" selection via SO_SNDBUF does not use this. | 276 | we are not conforming to RFC, but prevent TCP TIME_WAIT |
283 | Default: 128K | 277 | assassination. |
278 | Default: 0 | ||
284 | 279 | ||
285 | tcp_rmem - vector of 3 INTEGERs: min, default, max | 280 | tcp_rmem - vector of 3 INTEGERs: min, default, max |
286 | min: Minimal size of receive buffer used by TCP sockets. | 281 | min: Minimal size of receive buffer used by TCP sockets. |
@@ -299,67 +294,91 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max | |||
299 | net.core.rmem_max, "static" selection via SO_RCVBUF does not use this. | 294 | net.core.rmem_max, "static" selection via SO_RCVBUF does not use this. |
300 | Default: 87380*2 bytes. | 295 | Default: 87380*2 bytes. |
301 | 296 | ||
302 | tcp_mem - vector of 3 INTEGERs: min, pressure, max | 297 | tcp_sack - BOOLEAN |
303 | min: below this number of pages TCP is not bothered about its | 298 | Enable select acknowledgments (SACKS). |
304 | memory appetite. | ||
305 | 299 | ||
306 | pressure: when amount of memory allocated by TCP exceeds this number | 300 | tcp_slow_start_after_idle - BOOLEAN |
307 | of pages, TCP moderates its memory consumption and enters memory | 301 | If set, provide RFC2861 behavior and time out the congestion |
308 | pressure mode, which is exited when memory consumption falls | 302 | window after an idle period. An idle period is defined at |
309 | under "min". | 303 | the current RTO. If unset, the congestion window will not |
304 | be timed out after an idle period. | ||
305 | Default: 1 | ||
310 | 306 | ||
311 | max: number of pages allowed for queueing by all TCP sockets. | 307 | tcp_stdurg - BOOLEAN |
308 | Use the Host requirements interpretation of the TCP urg pointer field. | ||
309 | Most hosts use the older BSD interpretation, so if you turn this on | ||
310 | Linux might not communicate correctly with them. | ||
311 | Default: FALSE | ||
312 | 312 | ||
313 | Defaults are calculated at boot time from amount of available | 313 | tcp_synack_retries - INTEGER |
314 | memory. | 314 | Number of times SYNACKs for a passive TCP connection attempt will |
315 | be retransmitted. Should not be higher than 255. Default value | ||
316 | is 5, which corresponds to ~180seconds. | ||
315 | 317 | ||
316 | tcp_app_win - INTEGER | 318 | tcp_syncookies - BOOLEAN |
317 | Reserve max(window/2^tcp_app_win, mss) of window for application | 319 | Only valid when the kernel was compiled with CONFIG_SYNCOOKIES |
318 | buffer. Value 0 is special, it means that nothing is reserved. | 320 | Send out syncookies when the syn backlog queue of a socket |
319 | Default: 31 | 321 | overflows. This is to prevent against the common 'syn flood attack' |
322 | Default: FALSE | ||
320 | 323 | ||
321 | tcp_adv_win_scale - INTEGER | 324 | Note, that syncookies is fallback facility. |
322 | Count buffering overhead as bytes/2^tcp_adv_win_scale | 325 | It MUST NOT be used to help highly loaded servers to stand |
323 | (if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale), | 326 | against legal connection rate. If you see synflood warnings |
324 | if it is <= 0. | 327 | in your logs, but investigation shows that they occur |
325 | Default: 2 | 328 | because of overload with legal connections, you should tune |
329 | other parameters until this warning disappears. | ||
330 | See: tcp_max_syn_backlog, tcp_synack_retries, tcp_abort_on_overflow. | ||
326 | 331 | ||
327 | tcp_rfc1337 - BOOLEAN | 332 | syncookies seriously violate TCP protocol, do not allow |
328 | If set, the TCP stack behaves conforming to RFC1337. If unset, | 333 | to use TCP extensions, can result in serious degradation |
329 | we are not conforming to RFC, but prevent TCP TIME_WAIT | 334 | of some services (f.e. SMTP relaying), visible not by you, |
330 | assassination. | 335 | but your clients and relays, contacting you. While you see |
331 | Default: 0 | 336 | synflood warnings in logs not being really flooded, your server |
337 | is seriously misconfigured. | ||
332 | 338 | ||
333 | tcp_low_latency - BOOLEAN | 339 | tcp_syn_retries - INTEGER |
334 | If set, the TCP stack makes decisions that prefer lower | 340 | Number of times initial SYNs for an active TCP connection attempt |
335 | latency as opposed to higher throughput. By default, this | 341 | will be retransmitted. Should not be higher than 255. Default value |
336 | option is not set meaning that higher throughput is preferred. | 342 | is 5, which corresponds to ~180seconds. |
337 | An example of an application where this default should be | 343 | |
338 | changed would be a Beowulf compute cluster. | 344 | tcp_timestamps - BOOLEAN |
339 | Default: 0 | 345 | Enable timestamps as defined in RFC1323. |
340 | 346 | ||
341 | tcp_tso_win_divisor - INTEGER | 347 | tcp_tso_win_divisor - INTEGER |
342 | This allows control over what percentage of the congestion window | 348 | This allows control over what percentage of the congestion window |
343 | can be consumed by a single TSO frame. | 349 | can be consumed by a single TSO frame. |
344 | The setting of this parameter is a choice between burstiness and | 350 | The setting of this parameter is a choice between burstiness and |
345 | building larger TSO frames. | 351 | building larger TSO frames. |
346 | Default: 3 | 352 | Default: 3 |
347 | 353 | ||
348 | tcp_frto - BOOLEAN | 354 | tcp_tw_recycle - BOOLEAN |
349 | Enables F-RTO, an enhanced recovery algorithm for TCP retransmission | 355 | Enable fast recycling TIME-WAIT sockets. Default value is 0. |
350 | timeouts. It is particularly beneficial in wireless environments | 356 | It should not be changed without advice/request of technical |
351 | where packet loss is typically due to random radio interference | 357 | experts. |
352 | rather than intermediate router congestion. | ||
353 | 358 | ||
354 | tcp_congestion_control - STRING | 359 | tcp_tw_reuse - BOOLEAN |
355 | Set the congestion control algorithm to be used for new | 360 | Allow to reuse TIME-WAIT sockets for new connections when it is |
356 | connections. The algorithm "reno" is always available, but | 361 | safe from protocol viewpoint. Default value is 0. |
357 | additional choices may be available based on kernel configuration. | 362 | It should not be changed without advice/request of technical |
363 | experts. | ||
358 | 364 | ||
359 | somaxconn - INTEGER | 365 | tcp_window_scaling - BOOLEAN |
360 | Limit of socket listen() backlog, known in userspace as SOMAXCONN. | 366 | Enable window scaling as defined in RFC1323. |
361 | Defaults to 128. See also tcp_max_syn_backlog for additional tuning | 367 | |
362 | for TCP sockets. | 368 | tcp_wmem - vector of 3 INTEGERs: min, default, max |
369 | min: Amount of memory reserved for send buffers for TCP socket. | ||
370 | Each TCP socket has rights to use it due to fact of its birth. | ||
371 | Default: 4K | ||
372 | |||
373 | default: Amount of memory allowed for send buffers for TCP socket | ||
374 | by default. This value overrides net.core.wmem_default used | ||
375 | by other protocols, it is usually lower than net.core.wmem_default. | ||
376 | Default: 16K | ||
377 | |||
378 | max: Maximal amount of memory allowed for automatically selected | ||
379 | send buffers for TCP socket. This value does not override | ||
380 | net.core.wmem_max, "static" selection via SO_SNDBUF does not use this. | ||
381 | Default: 128K | ||
363 | 382 | ||
364 | tcp_workaround_signed_windows - BOOLEAN | 383 | tcp_workaround_signed_windows - BOOLEAN |
365 | If set, assume no receipt of a window scaling option means the | 384 | If set, assume no receipt of a window scaling option means the |
@@ -368,13 +387,6 @@ tcp_workaround_signed_windows - BOOLEAN | |||
368 | not receive a window scaling option from them. | 387 | not receive a window scaling option from them. |
369 | Default: 0 | 388 | Default: 0 |
370 | 389 | ||
371 | tcp_slow_start_after_idle - BOOLEAN | ||
372 | If set, provide RFC2861 behavior and time out the congestion | ||
373 | window after an idle period. An idle period is defined at | ||
374 | the current RTO. If unset, the congestion window will not | ||
375 | be timed out after an idle period. | ||
376 | Default: 1 | ||
377 | |||
378 | CIPSOv4 Variables: | 390 | CIPSOv4 Variables: |
379 | 391 | ||
380 | cipso_cache_enable - BOOLEAN | 392 | cipso_cache_enable - BOOLEAN |
@@ -974,4 +986,3 @@ no_cong_thresh FIXME | |||
974 | slot_timeout FIXME | 986 | slot_timeout FIXME |
975 | warn_noreply_time FIXME | 987 | warn_noreply_time FIXME |
976 | 988 | ||
977 | $Id: ip-sysctl.txt,v 1.20 2001/12/13 09:00:18 davem Exp $ | ||
diff --git a/Documentation/networking/iphase.txt b/Documentation/networking/iphase.txt index 493203a080a8..55eac4a784e2 100644 --- a/Documentation/networking/iphase.txt +++ b/Documentation/networking/iphase.txt | |||
@@ -81,7 +81,7 @@ Installation | |||
81 | 1M. The RAM size decides the number of buffers and buffer size. The default | 81 | 1M. The RAM size decides the number of buffers and buffer size. The default |
82 | size and number of buffers are set as following: | 82 | size and number of buffers are set as following: |
83 | 83 | ||
84 | Totol Rx RAM Tx RAM Rx Buf Tx Buf Rx buf Tx buf | 84 | Total Rx RAM Tx RAM Rx Buf Tx Buf Rx buf Tx buf |
85 | RAM size size size size size cnt cnt | 85 | RAM size size size size size cnt cnt |
86 | -------- ------ ------ ------ ------ ------ ------ | 86 | -------- ------ ------ ------ ------ ------ ------ |
87 | 128K 64K 64K 10K 10K 6 6 | 87 | 128K 64K 64K 10K 10K 6 6 |
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt index 12a008a5c221..5a232d946be3 100644 --- a/Documentation/networking/packet_mmap.txt +++ b/Documentation/networking/packet_mmap.txt | |||
@@ -284,7 +284,7 @@ the necessary memory, so normally limits can be reached. | |||
284 | ------------------- | 284 | ------------------- |
285 | 285 | ||
286 | If you check the source code you will see that what I draw here as a frame | 286 | If you check the source code you will see that what I draw here as a frame |
287 | is not only the link level frame. At the begining of each frame there is a | 287 | is not only the link level frame. At the beginning of each frame there is a |
288 | header called struct tpacket_hdr used in PACKET_MMAP to hold link level's frame | 288 | header called struct tpacket_hdr used in PACKET_MMAP to hold link level's frame |
289 | meta information like timestamp. So what we draw here a frame it's really | 289 | meta information like timestamp. So what we draw here a frame it's really |
290 | the following (from include/linux/if_packet.h): | 290 | the following (from include/linux/if_packet.h): |
diff --git a/Documentation/networking/phy.txt b/Documentation/networking/phy.txt index 29ccae409031..0bc95eab1512 100644 --- a/Documentation/networking/phy.txt +++ b/Documentation/networking/phy.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | 1 | ||
2 | ------- | 2 | ------- |
3 | PHY Abstraction Layer | 3 | PHY Abstraction Layer |
4 | (Updated 2005-07-21) | 4 | (Updated 2006-11-30) |
5 | 5 | ||
6 | Purpose | 6 | Purpose |
7 | 7 | ||
@@ -97,11 +97,12 @@ Letting the PHY Abstraction Layer do Everything | |||
97 | 97 | ||
98 | Next, you need to know the device name of the PHY connected to this device. | 98 | Next, you need to know the device name of the PHY connected to this device. |
99 | The name will look something like, "phy0:0", where the first number is the | 99 | The name will look something like, "phy0:0", where the first number is the |
100 | bus id, and the second is the PHY's address on that bus. | 100 | bus id, and the second is the PHY's address on that bus. Typically, |
101 | the bus is responsible for making its ID unique. | ||
101 | 102 | ||
102 | Now, to connect, just call this function: | 103 | Now, to connect, just call this function: |
103 | 104 | ||
104 | phydev = phy_connect(dev, phy_name, &adjust_link, flags); | 105 | phydev = phy_connect(dev, phy_name, &adjust_link, flags, interface); |
105 | 106 | ||
106 | phydev is a pointer to the phy_device structure which represents the PHY. If | 107 | phydev is a pointer to the phy_device structure which represents the PHY. If |
107 | phy_connect is successful, it will return the pointer. dev, here, is the | 108 | phy_connect is successful, it will return the pointer. dev, here, is the |
@@ -115,6 +116,10 @@ Letting the PHY Abstraction Layer do Everything | |||
115 | This is useful if the system has put hardware restrictions on | 116 | This is useful if the system has put hardware restrictions on |
116 | the PHY/controller, of which the PHY needs to be aware. | 117 | the PHY/controller, of which the PHY needs to be aware. |
117 | 118 | ||
119 | interface is a u32 which specifies the connection type used | ||
120 | between the controller and the PHY. Examples are GMII, MII, | ||
121 | RGMII, and SGMII. For a full list, see include/linux/phy.h. | ||
122 | |||
118 | Now just make sure that phydev->supported and phydev->advertising have any | 123 | Now just make sure that phydev->supported and phydev->advertising have any |
119 | values pruned from them which don't make sense for your controller (a 10/100 | 124 | values pruned from them which don't make sense for your controller (a 10/100 |
120 | controller may be connected to a gigabit capable PHY, so you would need to | 125 | controller may be connected to a gigabit capable PHY, so you would need to |
@@ -191,7 +196,7 @@ Doing it all yourself | |||
191 | start, or disables then frees them for stop. | 196 | start, or disables then frees them for stop. |
192 | 197 | ||
193 | struct phy_device * phy_attach(struct net_device *dev, const char *phy_id, | 198 | struct phy_device * phy_attach(struct net_device *dev, const char *phy_id, |
194 | u32 flags); | 199 | u32 flags, phy_interface_t interface); |
195 | 200 | ||
196 | Attaches a network device to a particular PHY, binding the PHY to a generic | 201 | Attaches a network device to a particular PHY, binding the PHY to a generic |
197 | driver if none was found during bus initialization. Passes in | 202 | driver if none was found during bus initialization. Passes in |
diff --git a/Documentation/networking/pktgen.txt b/Documentation/networking/pktgen.txt index c8eee23be8c0..c6cf4a3c16e0 100644 --- a/Documentation/networking/pktgen.txt +++ b/Documentation/networking/pktgen.txt | |||
@@ -63,8 +63,8 @@ Current: | |||
63 | Result: OK: 13101142(c12220741+d880401) usec, 10000000 (60byte,0frags) | 63 | Result: OK: 13101142(c12220741+d880401) usec, 10000000 (60byte,0frags) |
64 | 763292pps 390Mb/sec (390805504bps) errors: 39664 | 64 | 763292pps 390Mb/sec (390805504bps) errors: 39664 |
65 | 65 | ||
66 | Confguring threads and devices | 66 | Configuring threads and devices |
67 | ============================== | 67 | ================================ |
68 | This is done via the /proc interface easiest done via pgset in the scripts | 68 | This is done via the /proc interface easiest done via pgset in the scripts |
69 | 69 | ||
70 | Examples: | 70 | Examples: |
@@ -116,7 +116,7 @@ Examples: | |||
116 | there must be no spaces between the | 116 | there must be no spaces between the |
117 | arguments. Leading zeros are required. | 117 | arguments. Leading zeros are required. |
118 | Do not set the bottom of stack bit, | 118 | Do not set the bottom of stack bit, |
119 | thats done automatically. If you do | 119 | that's done automatically. If you do |
120 | set the bottom of stack bit, that | 120 | set the bottom of stack bit, that |
121 | indicates that you want to randomly | 121 | indicates that you want to randomly |
122 | generate that address and the flag | 122 | generate that address and the flag |
diff --git a/Documentation/networking/proc_net_tcp.txt b/Documentation/networking/proc_net_tcp.txt index 59cb915c3713..5e21f7cb6383 100644 --- a/Documentation/networking/proc_net_tcp.txt +++ b/Documentation/networking/proc_net_tcp.txt | |||
@@ -25,7 +25,7 @@ up into 3 parts because of the length of the line): | |||
25 | 25 | ||
26 | 1000 0 54165785 4 cd1e6040 25 4 27 3 -1 | 26 | 1000 0 54165785 4 cd1e6040 25 4 27 3 -1 |
27 | | | | | | | | | | |--> slow start size threshold, | 27 | | | | | | | | | | |--> slow start size threshold, |
28 | | | | | | | | | | or -1 if the treshold | 28 | | | | | | | | | | or -1 if the threshold |
29 | | | | | | | | | | is >= 0xFFFF | 29 | | | | | | | | | | is >= 0xFFFF |
30 | | | | | | | | | |----> sending congestion window | 30 | | | | | | | | | |----> sending congestion window |
31 | | | | | | | | |-------> (ack.quick<<1)|ack.pingpong | 31 | | | | | | | | |-------> (ack.quick<<1)|ack.pingpong |
diff --git a/Documentation/networking/sk98lin.txt b/Documentation/networking/sk98lin.txt index 4e1cc745ec63..8590a954df1d 100644 --- a/Documentation/networking/sk98lin.txt +++ b/Documentation/networking/sk98lin.txt | |||
@@ -346,7 +346,7 @@ Possible modes: | |||
346 | depending on the load of the system. If the driver detects that the | 346 | depending on the load of the system. If the driver detects that the |
347 | system load is too high, the driver tries to shield the system against | 347 | system load is too high, the driver tries to shield the system against |
348 | too much network load by enabling interrupt moderation. If - at a later | 348 | too much network load by enabling interrupt moderation. If - at a later |
349 | time - the CPU utilizaton decreases again (or if the network load is | 349 | time - the CPU utilization decreases again (or if the network load is |
350 | negligible) the interrupt moderation will automatically be disabled. | 350 | negligible) the interrupt moderation will automatically be disabled. |
351 | 351 | ||
352 | Interrupt moderation should be used when the driver has to handle one or more | 352 | Interrupt moderation should be used when the driver has to handle one or more |
diff --git a/Documentation/networking/slicecom.txt b/Documentation/networking/slicecom.txt index 2f04c9267f89..32d3b916afad 100644 --- a/Documentation/networking/slicecom.txt +++ b/Documentation/networking/slicecom.txt | |||
@@ -126,7 +126,7 @@ comx0/boardnum - board number of the SliceCom in the PC (using the 'natural' | |||
126 | 126 | ||
127 | Though the options below are to be set on a single interface, they apply to the | 127 | Though the options below are to be set on a single interface, they apply to the |
128 | whole board. The restriction, to use them on 'UP' interfaces, is because the | 128 | whole board. The restriction, to use them on 'UP' interfaces, is because the |
129 | command sequence below could lead to unpredicable results. | 129 | command sequence below could lead to unpredictable results. |
130 | 130 | ||
131 | # echo 0 >boardnum | 131 | # echo 0 >boardnum |
132 | # echo internal >clock_source | 132 | # echo internal >clock_source |
diff --git a/Documentation/networking/udplite.txt b/Documentation/networking/udplite.txt new file mode 100644 index 000000000000..dd6f46b83dab --- /dev/null +++ b/Documentation/networking/udplite.txt | |||
@@ -0,0 +1,281 @@ | |||
1 | =========================================================================== | ||
2 | The UDP-Lite protocol (RFC 3828) | ||
3 | =========================================================================== | ||
4 | |||
5 | |||
6 | UDP-Lite is a Standards-Track IETF transport protocol whose characteristic | ||
7 | is a variable-length checksum. This has advantages for transport of multimedia | ||
8 | (video, VoIP) over wireless networks, as partly damaged packets can still be | ||
9 | fed into the codec instead of being discarded due to a failed checksum test. | ||
10 | |||
11 | This file briefly describes the existing kernel support and the socket API. | ||
12 | For in-depth information, you can consult: | ||
13 | |||
14 | o The UDP-Lite Homepage: http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/ | ||
15 | From here you can also download some example application source code. | ||
16 | |||
17 | o The UDP-Lite HOWTO on | ||
18 | http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/files/UDP-Lite-HOWTO.txt | ||
19 | |||
20 | o The Wireshark UDP-Lite WiKi (with capture files): | ||
21 | http://wiki.wireshark.org/Lightweight_User_Datagram_Protocol | ||
22 | |||
23 | o The Protocol Spec, RFC 3828, http://www.ietf.org/rfc/rfc3828.txt | ||
24 | |||
25 | |||
26 | I) APPLICATIONS | ||
27 | |||
28 | Several applications have been ported successfully to UDP-Lite. Ethereal | ||
29 | (now called wireshark) has UDP-Litev4/v6 support by default. The tarball on | ||
30 | |||
31 | http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/files/udplite_linux.tar.gz | ||
32 | |||
33 | has source code for several v4/v6 client-server and network testing examples. | ||
34 | |||
35 | Porting applications to UDP-Lite is straightforward: only socket level and | ||
36 | IPPROTO need to be changed; senders additionally set the checksum coverage | ||
37 | length (default = header length = 8). Details are in the next section. | ||
38 | |||
39 | |||
40 | II) PROGRAMMING API | ||
41 | |||
42 | UDP-Lite provides a connectionless, unreliable datagram service and hence | ||
43 | uses the same socket type as UDP. In fact, porting from UDP to UDP-Lite is | ||
44 | very easy: simply add `IPPROTO_UDPLITE' as the last argument of the socket(2) | ||
45 | call so that the statement looks like: | ||
46 | |||
47 | s = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDPLITE); | ||
48 | |||
49 | or, respectively, | ||
50 | |||
51 | s = socket(PF_INET6, SOCK_DGRAM, IPPROTO_UDPLITE); | ||
52 | |||
53 | With just the above change you are able to run UDP-Lite services or connect | ||
54 | to UDP-Lite servers. The kernel will assume that you are not interested in | ||
55 | using partial checksum coverage and so emulate UDP mode (full coverage). | ||
56 | |||
57 | To make use of the partial checksum coverage facilities requires setting a | ||
58 | single socket option, which takes an integer specifying the coverage length: | ||
59 | |||
60 | * Sender checksum coverage: UDPLITE_SEND_CSCOV | ||
61 | |||
62 | For example, | ||
63 | |||
64 | int val = 20; | ||
65 | setsockopt(s, SOL_UDPLITE, UDPLITE_SEND_CSCOV, &val, sizeof(int)); | ||
66 | |||
67 | sets the checksum coverage length to 20 bytes (12b data + 8b header). | ||
68 | Of each packet only the first 20 bytes (plus the pseudo-header) will be | ||
69 | checksummed. This is useful for RTP applications which have a 12-byte | ||
70 | base header. | ||
71 | |||
72 | |||
73 | * Receiver checksum coverage: UDPLITE_RECV_CSCOV | ||
74 | |||
75 | This option is the receiver-side analogue. It is truly optional, i.e. not | ||
76 | required to enable traffic with partial checksum coverage. Its function is | ||
77 | that of a traffic filter: when enabled, it instructs the kernel to drop | ||
78 | all packets which have a coverage _less_ than this value. For example, if | ||
79 | RTP and UDP headers are to be protected, a receiver can enforce that only | ||
80 | packets with a minimum coverage of 20 are admitted: | ||
81 | |||
82 | int min = 20; | ||
83 | setsockopt(s, SOL_UDPLITE, UDPLITE_RECV_CSCOV, &min, sizeof(int)); | ||
84 | |||
85 | The calls to getsockopt(2) are analogous. Being an extension and not a stand- | ||
86 | alone protocol, all socket options known from UDP can be used in exactly the | ||
87 | same manner as before, e.g. UDP_CORK or UDP_ENCAP. | ||
88 | |||
89 | A detailed discussion of UDP-Lite checksum coverage options is in section IV. | ||
90 | |||
91 | |||
92 | III) HEADER FILES | ||
93 | |||
94 | The socket API requires support through header files in /usr/include: | ||
95 | |||
96 | * /usr/include/netinet/in.h | ||
97 | to define IPPROTO_UDPLITE | ||
98 | |||
99 | * /usr/include/netinet/udplite.h | ||
100 | for UDP-Lite header fields and protocol constants | ||
101 | |||
102 | For testing purposes, the following can serve as a `mini' header file: | ||
103 | |||
104 | #define IPPROTO_UDPLITE 136 | ||
105 | #define SOL_UDPLITE 136 | ||
106 | #define UDPLITE_SEND_CSCOV 10 | ||
107 | #define UDPLITE_RECV_CSCOV 11 | ||
108 | |||
109 | Ready-made header files for various distros are in the UDP-Lite tarball. | ||
110 | |||
111 | |||
112 | IV) KERNEL BEHAVIOUR WITH REGARD TO THE VARIOUS SOCKET OPTIONS | ||
113 | |||
114 | To enable debugging messages, the log level needs to be set to 8, as most | ||
115 | messages use the KERN_DEBUG level (7). | ||
116 | |||
117 | 1) Sender Socket Options | ||
118 | |||
119 | If the sender specifies a value of 0 as coverage length, the module | ||
120 | assumes full coverage, transmits a packet with coverage length of 0 | ||
121 | and according checksum. If the sender specifies a coverage < 8 and | ||
122 | different from 0, the kernel assumes 8 as default value. Finally, | ||
123 | if the specified coverage length exceeds the packet length, the packet | ||
124 | length is used instead as coverage length. | ||
125 | |||
126 | 2) Receiver Socket Options | ||
127 | |||
128 | The receiver specifies the minimum value of the coverage length it | ||
129 | is willing to accept. A value of 0 here indicates that the receiver | ||
130 | always wants the whole of the packet covered. In this case, all | ||
131 | partially covered packets are dropped and an error is logged. | ||
132 | |||
133 | It is not possible to specify illegal values (<0 and <8); in these | ||
134 | cases the default of 8 is assumed. | ||
135 | |||
136 | All packets arriving with a coverage value less than the specified | ||
137 | threshold are discarded; these events are also logged. | ||
138 | |||
139 | 3) Disabling the Checksum Computation | ||
140 | |||
141 | On both sender and receiver, checksumming will always be performed | ||
142 | and can not be disabled using SO_NO_CHECK. Thus | ||
143 | |||
144 | setsockopt(sockfd, SOL_SOCKET, SO_NO_CHECK, ... ); | ||
145 | |||
146 | will always be ignored, while the value of | ||
147 | |||
148 | getsockopt(sockfd, SOL_SOCKET, SO_NO_CHECK, &value, ...); | ||
149 | |||
150 | is meaningless (as in TCP). Packets with a zero checksum field are | ||
151 | illegal (cf. RFC 3828, sec. 3.1) and will be silently discarded. | ||
152 | |||
153 | 4) Fragmentation | ||
154 | |||
155 | The checksum computation respects both buffersize and MTU. The size | ||
156 | of UDP-Lite packets is determined by the size of the send buffer. The | ||
157 | minimum size of the send buffer is 2048 (defined as SOCK_MIN_SNDBUF | ||
158 | in include/net/sock.h), the default value is configurable as | ||
159 | net.core.wmem_default or via setting the SO_SNDBUF socket(7) | ||
160 | option. The maximum upper bound for the send buffer is determined | ||
161 | by net.core.wmem_max. | ||
162 | |||
163 | Given a payload size larger than the send buffer size, UDP-Lite will | ||
164 | split the payload into several individual packets, filling up the | ||
165 | send buffer size in each case. | ||
166 | |||
167 | The precise value also depends on the interface MTU. The interface MTU, | ||
168 | in turn, may trigger IP fragmentation. In this case, the generated | ||
169 | UDP-Lite packet is split into several IP packets, of which only the | ||
170 | first one contains the L4 header. | ||
171 | |||
172 | The send buffer size has implications on the checksum coverage length. | ||
173 | Consider the following example: | ||
174 | |||
175 | Payload: 1536 bytes Send Buffer: 1024 bytes | ||
176 | MTU: 1500 bytes Coverage Length: 856 bytes | ||
177 | |||
178 | UDP-Lite will ship the 1536 bytes in two separate packets: | ||
179 | |||
180 | Packet 1: 1024 payload + 8 byte header + 20 byte IP header = 1052 bytes | ||
181 | Packet 2: 512 payload + 8 byte header + 20 byte IP header = 540 bytes | ||
182 | |||
183 | The checksum coverage spans the UDP-Lite header and 848 bytes of the | ||
184 | payload in the first packet, the second packet is fully covered. Note | ||
185 | that for the second packet, the coverage length exceeds the packet | ||
186 | length. The kernel always re-adjusts the coverage length to the packet | ||
187 | length in such cases. | ||
188 | |||
189 | As an example of what happens when one UDP-Lite packet is split into | ||
190 | several tiny fragments, consider the following example. | ||
191 | |||
192 | Payload: 1024 bytes Send buffer size: 1024 bytes | ||
193 | MTU: 300 bytes Coverage length: 575 bytes | ||
194 | |||
195 | +-+-----------+--------------+--------------+--------------+ | ||
196 | |8| 272 | 280 | 280 | 280 | | ||
197 | +-+-----------+--------------+--------------+--------------+ | ||
198 | 280 560 840 1032 | ||
199 | ^ | ||
200 | *****checksum coverage************* | ||
201 | |||
202 | The UDP-Lite module generates one 1032 byte packet (1024 + 8 byte | ||
203 | header). According to the interface MTU, these are split into 4 IP | ||
204 | packets (280 byte IP payload + 20 byte IP header). The kernel module | ||
205 | sums the contents of the entire first two packets, plus 15 bytes of | ||
206 | the third packet before releasing the fragments to the IP module. | ||
207 | |||
208 | To see the analogous case for IPv6 fragmentation, consider a link | ||
209 | MTU of 1280 bytes and a write buffer of 3356 bytes. If the checksum | ||
210 | coverage is less than 1232 bytes (MTU minus IPv6/fragment header | ||
211 | lengths), only the first fragment needs to be considered. When using | ||
212 | larger checksum coverage lengths, each eligible fragment needs to be | ||
213 | checksummed. Suppose we have a checksum coverage of 3062. The buffer | ||
214 | of 3356 bytes will be split into the following fragments: | ||
215 | |||
216 | Fragment 1: 1280 bytes carrying 1232 bytes of UDP-Lite data | ||
217 | Fragment 2: 1280 bytes carrying 1232 bytes of UDP-Lite data | ||
218 | Fragment 3: 948 bytes carrying 900 bytes of UDP-Lite data | ||
219 | |||
220 | The first two fragments have to be checksummed in full, of the last | ||
221 | fragment only 598 (= 3062 - 2*1232) bytes are checksummed. | ||
222 | |||
223 | While it is important that such cases are dealt with correctly, they | ||
224 | are (annoyingly) rare: UDP-Lite is designed for optimising multimedia | ||
225 | performance over wireless (or generally noisy) links and thus smaller | ||
226 | coverage lengths are likely to be expected. | ||
227 | |||
228 | |||
229 | V) UDP-LITE RUNTIME STATISTICS AND THEIR MEANING | ||
230 | |||
231 | Exceptional and error conditions are logged to syslog at the KERN_DEBUG | ||
232 | level. Live statistics about UDP-Lite are available in /proc/net/snmp | ||
233 | and can (with newer versions of netstat) be viewed using | ||
234 | |||
235 | netstat -svu | ||
236 | |||
237 | This displays UDP-Lite statistics variables, whose meaning is as follows. | ||
238 | |||
239 | InDatagrams: Total number of received datagrams. | ||
240 | |||
241 | NoPorts: Number of packets received to an unknown port. | ||
242 | These cases are counted separately (not as InErrors). | ||
243 | |||
244 | InErrors: Number of erroneous UDP-Lite packets. Errors include: | ||
245 | * internal socket queue receive errors | ||
246 | * packet too short (less than 8 bytes or stated | ||
247 | coverage length exceeds received length) | ||
248 | * xfrm4_policy_check() returned with error | ||
249 | * application has specified larger min. coverage | ||
250 | length than that of incoming packet | ||
251 | * checksum coverage violated | ||
252 | * bad checksum | ||
253 | |||
254 | OutDatagrams: Total number of sent datagrams. | ||
255 | |||
256 | These statistics derive from the UDP MIB (RFC 2013). | ||
257 | |||
258 | |||
259 | VI) IPTABLES | ||
260 | |||
261 | There is packet match support for UDP-Lite as well as support for the LOG target. | ||
262 | If you copy and paste the following line into /etc/protocols, | ||
263 | |||
264 | udplite 136 UDP-Lite # UDP-Lite [RFC 3828] | ||
265 | |||
266 | then | ||
267 | iptables -A INPUT -p udplite -j LOG | ||
268 | |||
269 | will produce logging output to syslog. Dropping and rejecting packets also works. | ||
270 | |||
271 | |||
272 | VII) MAINTAINER ADDRESS | ||
273 | |||
274 | The UDP-Lite patch was developed at | ||
275 | University of Aberdeen | ||
276 | Electronics Research Group | ||
277 | Department of Engineering | ||
278 | Fraser Noble Building | ||
279 | Aberdeen AB24 3UE; UK | ||
280 | The current maintainer is Gerrit Renker, <gerrit@erg.abdn.ac.uk>. Initial | ||
281 | code was developed by William Stanislaus, <william@erg.abdn.ac.uk>. | ||
diff --git a/Documentation/networking/wan-router.txt b/Documentation/networking/wan-router.txt index 0cf654147634..653978dcea7f 100644 --- a/Documentation/networking/wan-router.txt +++ b/Documentation/networking/wan-router.txt | |||
@@ -412,7 +412,7 @@ beta-2.1.4 Jul 2000 o Dynamic interface configuration: | |||
412 | 412 | ||
413 | beta3-2.1.4 Jul 2000 o X25 M_BIT Problem fix. | 413 | beta3-2.1.4 Jul 2000 o X25 M_BIT Problem fix. |
414 | o Added the Multi-Port PPP | 414 | o Added the Multi-Port PPP |
415 | Updated utilites for the Multi-Port PPP. | 415 | Updated utilities for the Multi-Port PPP. |
416 | 416 | ||
417 | 2.1.4 Aut 2000 | 417 | 2.1.4 Aut 2000 |
418 | o In X25API: | 418 | o In X25API: |
@@ -444,13 +444,13 @@ beta1-2.1.5 Nov 15 2000 | |||
444 | 444 | ||
445 | o Cpipemon | 445 | o Cpipemon |
446 | - Added set FT1 commands to the cpipemon. Thus CSU/DSU | 446 | - Added set FT1 commands to the cpipemon. Thus CSU/DSU |
447 | configuraiton can be performed using cpipemon. | 447 | configuration can be performed using cpipemon. |
448 | All systems that cannot run cfgft1 GUI utility should | 448 | All systems that cannot run cfgft1 GUI utility should |
449 | use cpipemon to configure the on board CSU/DSU. | 449 | use cpipemon to configure the on board CSU/DSU. |
450 | 450 | ||
451 | 451 | ||
452 | o Keyboard Led Monitor/Debugger | 452 | o Keyboard Led Monitor/Debugger |
453 | - A new utilty /usr/sbin/wpkbdmon uses keyboard leds | 453 | - A new utility /usr/sbin/wpkbdmon uses keyboard leds |
454 | to convey operational statistic information of the | 454 | to convey operational statistic information of the |
455 | Sangoma WANPIPE cards. | 455 | Sangoma WANPIPE cards. |
456 | NUM_LOCK = Line State (On=connected, Off=disconnected) | 456 | NUM_LOCK = Line State (On=connected, Off=disconnected) |
@@ -464,7 +464,7 @@ beta1-2.1.5 Nov 15 2000 | |||
464 | - Appropriate number of devices are dynamically loaded | 464 | - Appropriate number of devices are dynamically loaded |
465 | based on the number of Sangoma cards found. | 465 | based on the number of Sangoma cards found. |
466 | 466 | ||
467 | Note: The kernel configuraiton option | 467 | Note: The kernel configuration option |
468 | CONFIG_WANPIPE_CARDS has been taken out. | 468 | CONFIG_WANPIPE_CARDS has been taken out. |
469 | 469 | ||
470 | o Fixed the Frame Relay and Chdlc network interfaces so they are | 470 | o Fixed the Frame Relay and Chdlc network interfaces so they are |
diff --git a/Documentation/networking/xfrm_sync.txt b/Documentation/networking/xfrm_sync.txt index 8be626f7c0b8..d7aac9dedeb4 100644 --- a/Documentation/networking/xfrm_sync.txt +++ b/Documentation/networking/xfrm_sync.txt | |||
@@ -47,10 +47,13 @@ aevent_id structure looks like: | |||
47 | 47 | ||
48 | struct xfrm_aevent_id { | 48 | struct xfrm_aevent_id { |
49 | struct xfrm_usersa_id sa_id; | 49 | struct xfrm_usersa_id sa_id; |
50 | xfrm_address_t saddr; | ||
50 | __u32 flags; | 51 | __u32 flags; |
52 | __u32 reqid; | ||
51 | }; | 53 | }; |
52 | 54 | ||
53 | xfrm_usersa_id in this message layout identifies the SA. | 55 | The unique SA is identified by the combination of xfrm_usersa_id, |
56 | reqid and saddr. | ||
54 | 57 | ||
55 | flags are used to indicate different things. The possible | 58 | flags are used to indicate different things. The possible |
56 | flags are: | 59 | flags are: |