diff options
Diffstat (limited to 'Documentation/networking')
-rw-r--r-- | Documentation/networking/can.txt | 94 | ||||
-rw-r--r-- | Documentation/networking/filter.txt | 608 | ||||
-rw-r--r-- | Documentation/networking/ip-sysctl.txt | 26 | ||||
-rw-r--r-- | Documentation/networking/packet_mmap.txt | 31 | ||||
-rw-r--r-- | Documentation/networking/phy.txt | 3 | ||||
-rw-r--r-- | Documentation/networking/regulatory.txt | 4 |
6 files changed, 656 insertions, 110 deletions
diff --git a/Documentation/networking/can.txt b/Documentation/networking/can.txt index 4c072414eadb..f3089d423515 100644 --- a/Documentation/networking/can.txt +++ b/Documentation/networking/can.txt | |||
@@ -2,21 +2,20 @@ | |||
2 | 2 | ||
3 | can.txt | 3 | can.txt |
4 | 4 | ||
5 | Readme file for the Controller Area Network Protocol Family (aka Socket CAN) | 5 | Readme file for the Controller Area Network Protocol Family (aka SocketCAN) |
6 | 6 | ||
7 | This file contains | 7 | This file contains |
8 | 8 | ||
9 | 1 Overview / What is Socket CAN | 9 | 1 Overview / What is SocketCAN |
10 | 10 | ||
11 | 2 Motivation / Why using the socket API | 11 | 2 Motivation / Why using the socket API |
12 | 12 | ||
13 | 3 Socket CAN concept | 13 | 3 SocketCAN concept |
14 | 3.1 receive lists | 14 | 3.1 receive lists |
15 | 3.2 local loopback of sent frames | 15 | 3.2 local loopback of sent frames |
16 | 3.3 network security issues (capabilities) | 16 | 3.3 network problem notifications |
17 | 3.4 network problem notifications | ||
18 | 17 | ||
19 | 4 How to use Socket CAN | 18 | 4 How to use SocketCAN |
20 | 4.1 RAW protocol sockets with can_filters (SOCK_RAW) | 19 | 4.1 RAW protocol sockets with can_filters (SOCK_RAW) |
21 | 4.1.1 RAW socket option CAN_RAW_FILTER | 20 | 4.1.1 RAW socket option CAN_RAW_FILTER |
22 | 4.1.2 RAW socket option CAN_RAW_ERR_FILTER | 21 | 4.1.2 RAW socket option CAN_RAW_ERR_FILTER |
@@ -34,7 +33,7 @@ This file contains | |||
34 | 4.3 connected transport protocols (SOCK_SEQPACKET) | 33 | 4.3 connected transport protocols (SOCK_SEQPACKET) |
35 | 4.4 unconnected transport protocols (SOCK_DGRAM) | 34 | 4.4 unconnected transport protocols (SOCK_DGRAM) |
36 | 35 | ||
37 | 5 Socket CAN core module | 36 | 5 SocketCAN core module |
38 | 5.1 can.ko module params | 37 | 5.1 can.ko module params |
39 | 5.2 procfs content | 38 | 5.2 procfs content |
40 | 5.3 writing own CAN protocol modules | 39 | 5.3 writing own CAN protocol modules |
@@ -51,20 +50,20 @@ This file contains | |||
51 | 6.6 CAN FD (flexible data rate) driver support | 50 | 6.6 CAN FD (flexible data rate) driver support |
52 | 6.7 supported CAN hardware | 51 | 6.7 supported CAN hardware |
53 | 52 | ||
54 | 7 Socket CAN resources | 53 | 7 SocketCAN resources |
55 | 54 | ||
56 | 8 Credits | 55 | 8 Credits |
57 | 56 | ||
58 | ============================================================================ | 57 | ============================================================================ |
59 | 58 | ||
60 | 1. Overview / What is Socket CAN | 59 | 1. Overview / What is SocketCAN |
61 | -------------------------------- | 60 | -------------------------------- |
62 | 61 | ||
63 | The socketcan package is an implementation of CAN protocols | 62 | The socketcan package is an implementation of CAN protocols |
64 | (Controller Area Network) for Linux. CAN is a networking technology | 63 | (Controller Area Network) for Linux. CAN is a networking technology |
65 | which has widespread use in automation, embedded devices, and | 64 | which has widespread use in automation, embedded devices, and |
66 | automotive fields. While there have been other CAN implementations | 65 | automotive fields. While there have been other CAN implementations |
67 | for Linux based on character devices, Socket CAN uses the Berkeley | 66 | for Linux based on character devices, SocketCAN uses the Berkeley |
68 | socket API, the Linux network stack and implements the CAN device | 67 | socket API, the Linux network stack and implements the CAN device |
69 | drivers as network interfaces. The CAN socket API has been designed | 68 | drivers as network interfaces. The CAN socket API has been designed |
70 | as similar as possible to the TCP/IP protocols to allow programmers, | 69 | as similar as possible to the TCP/IP protocols to allow programmers, |
@@ -74,7 +73,7 @@ sockets. | |||
74 | 2. Motivation / Why using the socket API | 73 | 2. Motivation / Why using the socket API |
75 | ---------------------------------------- | 74 | ---------------------------------------- |
76 | 75 | ||
77 | There have been CAN implementations for Linux before Socket CAN so the | 76 | There have been CAN implementations for Linux before SocketCAN so the |
78 | question arises, why we have started another project. Most existing | 77 | question arises, why we have started another project. Most existing |
79 | implementations come as a device driver for some CAN hardware, they | 78 | implementations come as a device driver for some CAN hardware, they |
80 | are based on character devices and provide comparatively little | 79 | are based on character devices and provide comparatively little |
@@ -89,10 +88,10 @@ the CAN controller requires employment of another device driver and | |||
89 | often the need for adaption of large parts of the application to the | 88 | often the need for adaption of large parts of the application to the |
90 | new driver's API. | 89 | new driver's API. |
91 | 90 | ||
92 | Socket CAN was designed to overcome all of these limitations. A new | 91 | SocketCAN was designed to overcome all of these limitations. A new |
93 | protocol family has been implemented which provides a socket interface | 92 | protocol family has been implemented which provides a socket interface |
94 | to user space applications and which builds upon the Linux network | 93 | to user space applications and which builds upon the Linux network |
95 | layer, so to use all of the provided queueing functionality. A device | 94 | layer, enabling use of all of the provided queueing functionality. A device |
96 | driver for CAN controller hardware registers itself with the Linux | 95 | driver for CAN controller hardware registers itself with the Linux |
97 | network layer as a network device, so that CAN frames from the | 96 | network layer as a network device, so that CAN frames from the |
98 | controller can be passed up to the network layer and on to the CAN | 97 | controller can be passed up to the network layer and on to the CAN |
@@ -146,15 +145,15 @@ solution for a couple of reasons: | |||
146 | providing an API for device drivers to register with. However, then | 145 | providing an API for device drivers to register with. However, then |
147 | it would be no more difficult, or may be even easier, to use the | 146 | it would be no more difficult, or may be even easier, to use the |
148 | networking framework provided by the Linux kernel, and this is what | 147 | networking framework provided by the Linux kernel, and this is what |
149 | Socket CAN does. | 148 | SocketCAN does. |
150 | 149 | ||
151 | The use of the networking framework of the Linux kernel is just the | 150 | The use of the networking framework of the Linux kernel is just the |
152 | natural and most appropriate way to implement CAN for Linux. | 151 | natural and most appropriate way to implement CAN for Linux. |
153 | 152 | ||
154 | 3. Socket CAN concept | 153 | 3. SocketCAN concept |
155 | --------------------- | 154 | --------------------- |
156 | 155 | ||
157 | As described in chapter 2 it is the main goal of Socket CAN to | 156 | As described in chapter 2 it is the main goal of SocketCAN to |
158 | provide a socket interface to user space applications which builds | 157 | provide a socket interface to user space applications which builds |
159 | upon the Linux network layer. In contrast to the commonly known | 158 | upon the Linux network layer. In contrast to the commonly known |
160 | TCP/IP and ethernet networking, the CAN bus is a broadcast-only(!) | 159 | TCP/IP and ethernet networking, the CAN bus is a broadcast-only(!) |
@@ -168,11 +167,11 @@ solution for a couple of reasons: | |||
168 | 167 | ||
169 | The network transparent access of multiple applications leads to the | 168 | The network transparent access of multiple applications leads to the |
170 | problem that different applications may be interested in the same | 169 | problem that different applications may be interested in the same |
171 | CAN-IDs from the same CAN network interface. The Socket CAN core | 170 | CAN-IDs from the same CAN network interface. The SocketCAN core |
172 | module - which implements the protocol family CAN - provides several | 171 | module - which implements the protocol family CAN - provides several |
173 | high efficient receive lists for this reason. If e.g. a user space | 172 | high efficient receive lists for this reason. If e.g. a user space |
174 | application opens a CAN RAW socket, the raw protocol module itself | 173 | application opens a CAN RAW socket, the raw protocol module itself |
175 | requests the (range of) CAN-IDs from the Socket CAN core that are | 174 | requests the (range of) CAN-IDs from the SocketCAN core that are |
176 | requested by the user. The subscription and unsubscription of | 175 | requested by the user. The subscription and unsubscription of |
177 | CAN-IDs can be done for specific CAN interfaces or for all(!) known | 176 | CAN-IDs can be done for specific CAN interfaces or for all(!) known |
178 | CAN interfaces with the can_rx_(un)register() functions provided to | 177 | CAN interfaces with the can_rx_(un)register() functions provided to |
@@ -217,21 +216,7 @@ solution for a couple of reasons: | |||
217 | * = you really like to have this when you're running analyser tools | 216 | * = you really like to have this when you're running analyser tools |
218 | like 'candump' or 'cansniffer' on the (same) node. | 217 | like 'candump' or 'cansniffer' on the (same) node. |
219 | 218 | ||
220 | 3.3 network security issues (capabilities) | 219 | 3.3 network problem notifications |
221 | |||
222 | The Controller Area Network is a local field bus transmitting only | ||
223 | broadcast messages without any routing and security concepts. | ||
224 | In the majority of cases the user application has to deal with | ||
225 | raw CAN frames. Therefore it might be reasonable NOT to restrict | ||
226 | the CAN access only to the user root, as known from other networks. | ||
227 | Since the currently implemented CAN_RAW and CAN_BCM sockets can only | ||
228 | send and receive frames to/from CAN interfaces it does not affect | ||
229 | security of others networks to allow all users to access the CAN. | ||
230 | To enable non-root users to access CAN_RAW and CAN_BCM protocol | ||
231 | sockets the Kconfig options CAN_RAW_USER and/or CAN_BCM_USER may be | ||
232 | selected at kernel compile time. | ||
233 | |||
234 | 3.4 network problem notifications | ||
235 | 220 | ||
236 | The use of the CAN bus may lead to several problems on the physical | 221 | The use of the CAN bus may lead to several problems on the physical |
237 | and media access control layer. Detecting and logging of these lower | 222 | and media access control layer. Detecting and logging of these lower |
@@ -251,11 +236,11 @@ solution for a couple of reasons: | |||
251 | by default. The format of the CAN error message frame is briefly | 236 | by default. The format of the CAN error message frame is briefly |
252 | described in the Linux header file "include/linux/can/error.h". | 237 | described in the Linux header file "include/linux/can/error.h". |
253 | 238 | ||
254 | 4. How to use Socket CAN | 239 | 4. How to use SocketCAN |
255 | ------------------------ | 240 | ------------------------ |
256 | 241 | ||
257 | Like TCP/IP, you first need to open a socket for communicating over a | 242 | Like TCP/IP, you first need to open a socket for communicating over a |
258 | CAN network. Since Socket CAN implements a new protocol family, you | 243 | CAN network. Since SocketCAN implements a new protocol family, you |
259 | need to pass PF_CAN as the first argument to the socket(2) system | 244 | need to pass PF_CAN as the first argument to the socket(2) system |
260 | call. Currently, there are two CAN protocols to choose from, the raw | 245 | call. Currently, there are two CAN protocols to choose from, the raw |
261 | socket protocol and the broadcast manager (BCM). So to open a socket, | 246 | socket protocol and the broadcast manager (BCM). So to open a socket, |
@@ -286,8 +271,8 @@ solution for a couple of reasons: | |||
286 | }; | 271 | }; |
287 | 272 | ||
288 | The alignment of the (linear) payload data[] to a 64bit boundary | 273 | The alignment of the (linear) payload data[] to a 64bit boundary |
289 | allows the user to define own structs and unions to easily access the | 274 | allows the user to define their own structs and unions to easily access |
290 | CAN payload. There is no given byteorder on the CAN bus by | 275 | the CAN payload. There is no given byteorder on the CAN bus by |
291 | default. A read(2) system call on a CAN_RAW socket transfers a | 276 | default. A read(2) system call on a CAN_RAW socket transfers a |
292 | struct can_frame to the user space. | 277 | struct can_frame to the user space. |
293 | 278 | ||
@@ -479,7 +464,7 @@ solution for a couple of reasons: | |||
479 | 464 | ||
480 | setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, NULL, 0); | 465 | setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, NULL, 0); |
481 | 466 | ||
482 | To set the filters to zero filters is quite obsolete as not read | 467 | To set the filters to zero filters is quite obsolete as to not read |
483 | data causes the raw socket to discard the received CAN frames. But | 468 | data causes the raw socket to discard the received CAN frames. But |
484 | having this 'send only' use-case we may remove the receive list in the | 469 | having this 'send only' use-case we may remove the receive list in the |
485 | Kernel to save a little (really a very little!) CPU usage. | 470 | Kernel to save a little (really a very little!) CPU usage. |
@@ -814,17 +799,17 @@ solution for a couple of reasons: | |||
814 | 4.4 unconnected transport protocols (SOCK_DGRAM) | 799 | 4.4 unconnected transport protocols (SOCK_DGRAM) |
815 | 800 | ||
816 | 801 | ||
817 | 5. Socket CAN core module | 802 | 5. SocketCAN core module |
818 | ------------------------- | 803 | ------------------------- |
819 | 804 | ||
820 | The Socket CAN core module implements the protocol family | 805 | The SocketCAN core module implements the protocol family |
821 | PF_CAN. CAN protocol modules are loaded by the core module at | 806 | PF_CAN. CAN protocol modules are loaded by the core module at |
822 | runtime. The core module provides an interface for CAN protocol | 807 | runtime. The core module provides an interface for CAN protocol |
823 | modules to subscribe needed CAN IDs (see chapter 3.1). | 808 | modules to subscribe needed CAN IDs (see chapter 3.1). |
824 | 809 | ||
825 | 5.1 can.ko module params | 810 | 5.1 can.ko module params |
826 | 811 | ||
827 | - stats_timer: To calculate the Socket CAN core statistics | 812 | - stats_timer: To calculate the SocketCAN core statistics |
828 | (e.g. current/maximum frames per second) this 1 second timer is | 813 | (e.g. current/maximum frames per second) this 1 second timer is |
829 | invoked at can.ko module start time by default. This timer can be | 814 | invoked at can.ko module start time by default. This timer can be |
829 | disabled by using stats_timer=0 on the module commandline. | 814 | disabled by using stats_timer=0 on the module commandline. |
@@ -833,7 +818,7 @@ solution for a couple of reasons: | |||
833 | 818 | ||
834 | 5.2 procfs content | 819 | 5.2 procfs content |
835 | 820 | ||
836 | As described in chapter 3.1 the Socket CAN core uses several filter | 821 | As described in chapter 3.1 the SocketCAN core uses several filter |
837 | lists to deliver received CAN frames to CAN protocol modules. These | 822 | lists to deliver received CAN frames to CAN protocol modules. These |
838 | receive lists, their filters and the count of filter matches can be | 823 | receive lists, their filters and the count of filter matches can be |
839 | checked in the appropriate receive list. All entries contain the | 824 | checked in the appropriate receive list. All entries contain the |
@@ -860,15 +845,15 @@ solution for a couple of reasons: | |||
860 | 845 | ||
861 | Additional procfs files in /proc/net/can | 846 | Additional procfs files in /proc/net/can |
862 | 847 | ||
863 | stats - Socket CAN core statistics (rx/tx frames, match ratios, ...) | 848 | stats - SocketCAN core statistics (rx/tx frames, match ratios, ...) |
864 | reset_stats - manual statistic reset | 849 | reset_stats - manual statistic reset |
865 | version - prints the Socket CAN core version and the ABI version | 850 | version - prints the SocketCAN core version and the ABI version |
866 | 851 | ||
867 | 5.3 writing own CAN protocol modules | 852 | 5.3 writing own CAN protocol modules |
868 | 853 | ||
869 | To implement a new protocol in the protocol family PF_CAN a new | 854 | To implement a new protocol in the protocol family PF_CAN a new |
870 | protocol has to be defined in include/linux/can.h . | 855 | protocol has to be defined in include/linux/can.h . |
871 | The prototypes and definitions to use the Socket CAN core can be | 856 | The prototypes and definitions to use the SocketCAN core can be |
872 | accessed by including include/linux/can/core.h . | 857 | accessed by including include/linux/can/core.h . |
873 | In addition to functions that register the CAN protocol and the | 858 | In addition to functions that register the CAN protocol and the |
874 | CAN device notifier chain there are functions to subscribe CAN | 859 | CAN device notifier chain there are functions to subscribe CAN |
@@ -1105,7 +1090,7 @@ solution for a couple of reasons: | |||
1105 | 1090 | ||
1106 | $ ip link set canX up type can bitrate 125000 | 1091 | $ ip link set canX up type can bitrate 125000 |
1107 | 1092 | ||
1108 | A device may enter the "bus-off" state if too much errors occurred on | 1093 | A device may enter the "bus-off" state if too many errors occurred on |
1109 | the CAN bus. Then no more messages are received or sent. An automatic | 1094 | the CAN bus. Then no more messages are received or sent. An automatic |
1110 | bus-off recovery can be enabled by setting the "restart-ms" to a | 1095 | bus-off recovery can be enabled by setting the "restart-ms" to a |
1111 | non-zero value, e.g.: | 1096 | non-zero value, e.g.: |
@@ -1125,7 +1110,7 @@ solution for a couple of reasons: | |||
1125 | 1110 | ||
1126 | CAN FD capable CAN controllers support two different bitrates for the | 1111 | CAN FD capable CAN controllers support two different bitrates for the |
1127 | arbitration phase and the payload phase of the CAN FD frame. Therefore a | 1112 | arbitration phase and the payload phase of the CAN FD frame. Therefore a |
1128 | second bittiming has to be specified in order to enable the CAN FD bitrate. | 1113 | second bit timing has to be specified in order to enable the CAN FD bitrate. |
1129 | 1114 | ||
1130 | Additionally CAN FD capable CAN controllers support up to 64 bytes of | 1115 | Additionally CAN FD capable CAN controllers support up to 64 bytes of |
1131 | payload. The representation of this length in can_frame.can_dlc and | 1116 | payload. The representation of this length in can_frame.can_dlc and |
@@ -1150,21 +1135,16 @@ solution for a couple of reasons: | |||
1150 | 6.7 Supported CAN hardware | 1135 | 6.7 Supported CAN hardware |
1151 | 1136 | ||
1152 | Please check the "Kconfig" file in "drivers/net/can" to get an actual | 1137 | Please check the "Kconfig" file in "drivers/net/can" to get an actual |
1153 | list of the support CAN hardware. On the Socket CAN project website | 1138 | list of the supported CAN hardware. On the SocketCAN project website |
1154 | (see chapter 7) there might be further drivers available, also for | 1139 | (see chapter 7) there might be further drivers available, also for |
1155 | older kernel versions. | 1140 | older kernel versions. |
1156 | 1141 | ||
1157 | 7. Socket CAN resources | 1142 | 7. SocketCAN resources |
1158 | ----------------------- | 1143 | ----------------------- |
1159 | 1144 | ||
1160 | You can find further resources for Socket CAN like user space tools, | 1145 | The Linux CAN / SocketCAN project resources (project site / mailing list) |
1161 | support for old kernel versions, more drivers, mailing lists, etc. | 1146 | are referenced in the MAINTAINERS file in the Linux source tree. |
1162 | at the BerliOS OSS project website for Socket CAN: | 1147 | Search for CAN NETWORK [LAYERS|DRIVERS]. |
1163 | |||
1164 | http://developer.berlios.de/projects/socketcan | ||
1165 | |||
1166 | If you have questions, bug fixes, etc., don't hesitate to post them to | ||
1167 | the Socketcan-Users mailing list. But please search the archives first. | ||
1168 | 1148 | ||
1169 | 8. Credits | 1149 | 8. Credits |
1170 | ---------- | 1150 | ---------- |
diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index cdb3e40b9d14..a06b48d2f5cc 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt | |||
@@ -1,49 +1,563 @@ | |||
1 | filter.txt: Linux Socket Filtering | 1 | Linux Socket Filtering aka Berkeley Packet Filter (BPF) |
2 | Written by: Jay Schulist <jschlst@samba.org> | 2 | ======================================================= |
3 | 3 | ||
4 | Introduction | 4 | Introduction |
5 | ============ | 5 | ------------ |
6 | 6 | ||
7 | Linux Socket Filtering is derived from the Berkeley | 7 | Linux Socket Filtering (LSF) is derived from the Berkeley Packet Filter. |
8 | Packet Filter. There are some distinct differences between | 8 | Though there are some distinct differences between the BSD and Linux |
9 | the BSD and Linux Kernel Filtering. | 9 | Kernel filtering, when we speak of BPF or LSF in Linux context, we |
10 | 10 | mean the very same mechanism of filtering in the Linux kernel. | |
11 | Linux Socket Filtering (LSF) allows a user-space program to | 11 | |
12 | attach a filter onto any socket and allow or disallow certain | 12 | BPF allows a user-space program to attach a filter onto any socket and |
13 | types of data to come through the socket. LSF follows exactly | 13 | allow or disallow certain types of data to come through the socket. LSF |
14 | the same filter code structure as the BSD Berkeley Packet Filter | 14 | follows exactly the same filter code structure as BSD's BPF, so referring |
15 | (BPF), so referring to the BSD bpf.4 manpage is very helpful in | 15 | to the BSD bpf.4 manpage is very helpful in creating filters. |
16 | creating filters. | 16 | |
17 | 17 | On Linux, BPF is much simpler than on BSD. One does not have to worry | |
18 | LSF is much simpler than BPF. One does not have to worry about | 18 | about devices or anything like that. You simply create your filter code, |
19 | devices or anything like that. You simply create your filter | 19 | send it to the kernel via the SO_ATTACH_FILTER option and if your filter |
20 | code, send it to the kernel via the SO_ATTACH_FILTER option and | 20 | code passes the kernel check on it, you then immediately begin filtering |
21 | if your filter code passes the kernel check on it, you then | 21 | data on that socket. |
22 | immediately begin filtering data on that socket. | 22 | |
23 | 23 | You can also detach filters from your socket via the SO_DETACH_FILTER | |
24 | You can also detach filters from your socket via the | 24 | option. This will probably not be used much since when you close a socket |
25 | SO_DETACH_FILTER option. This will probably not be used much | 25 | that has a filter on it the filter is automagically removed. The other |
26 | since when you close a socket that has a filter on it the | 26 | less common case may be adding a different filter on the same socket where |
27 | filter is automagically removed. The other less common case | 27 | you had another filter that is still running: the kernel takes care of |
28 | may be adding a different filter on the same socket where you had another | 28 | removing the old one and placing your new one in its place, assuming your |
29 | filter that is still running: the kernel takes care of removing | 29 | filter has passed the checks, otherwise if it fails the old filter will |
30 | the old one and placing your new one in its place, assuming your | 30 | remain on that socket. |
31 | filter has passed the checks, otherwise if it fails the old filter | 31 | |
32 | will remain on that socket. | 32 | The SO_LOCK_FILTER option allows locking the filter attached to a socket. Once |
33 | 33 | set, a filter cannot be removed or changed. This allows one process to | |
34 | SO_LOCK_FILTER option allows to lock the filter attached to a | 34 | set up a socket, attach a filter, lock it, then drop privileges and be |
35 | socket. Once set, a filter cannot be removed or changed. This allows | 35 | assured that the filter will be kept until the socket is closed. |
36 | one process to setup a socket, attach a filter, lock it then drop | 36 | |
37 | privileges and be assured that the filter will be kept until the | 37 | The biggest user of this construct might be libpcap. Issuing a high-level |
38 | socket is closed. | 38 | filter command like `tcpdump -i em1 port 22` passes through the libpcap |
39 | 39 | internal compiler that generates a structure that can eventually be loaded | |
40 | Examples | 40 | via SO_ATTACH_FILTER to the kernel. `tcpdump -i em1 port 22 -ddd` |
41 | ======== | 41 | displays what is being placed into this structure. |
42 | 42 | ||
43 | Ioctls- | 43 | Although we were only speaking about sockets here, BPF in Linux is used |
44 | setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_FILTER, &Filter, sizeof(Filter)); | 44 | in many more places. There's xt_bpf for netfilter, cls_bpf in the kernel |
45 | setsockopt(sockfd, SOL_SOCKET, SO_DETACH_FILTER, &value, sizeof(value)); | 45 | qdisc layer, SECCOMP-BPF (SECure COMPuting [1]), and lots of other places |
46 | setsockopt(sockfd, SOL_SOCKET, SO_LOCK_FILTER, &value, sizeof(value)); | 46 | such as team driver, PTP code, etc. where BPF is being used. |
47 | 47 | ||
48 | See the BSD bpf.4 manpage and the BSD Packet Filter paper written by | 48 | [1] Documentation/prctl/seccomp_filter.txt |
49 | Steven McCanne and Van Jacobson of Lawrence Berkeley Laboratory. | 49 | |
50 | Original BPF paper: | ||
51 | |||
52 | Steven McCanne and Van Jacobson. 1993. The BSD packet filter: a new | ||
53 | architecture for user-level packet capture. In Proceedings of the | ||
54 | USENIX Winter 1993 Conference Proceedings on USENIX Winter 1993 | ||
55 | Conference Proceedings (USENIX'93). USENIX Association, Berkeley, | ||
56 | CA, USA, 2-2. [http://www.tcpdump.org/papers/bpf-usenix93.pdf] | ||
57 | |||
58 | Structure | ||
59 | --------- | ||
60 | |||
61 | User space applications include <linux/filter.h> which contains the | ||
62 | following relevant structures: | ||
63 | |||
64 | struct sock_filter { /* Filter block */ | ||
65 | __u16 code; /* Actual filter code */ | ||
66 | __u8 jt; /* Jump true */ | ||
67 | __u8 jf; /* Jump false */ | ||
68 | __u32 k; /* Generic multiuse field */ | ||
69 | }; | ||
70 | |||
71 | Such a structure is assembled as an array of 4-tuples, that contains | ||
72 | a code, jt, jf and k value. jt and jf are jump offsets and k a generic | ||
73 | value to be used for a provided code. | ||
74 | |||
75 | struct sock_fprog { /* Required for SO_ATTACH_FILTER. */ | ||
76 | unsigned short len; /* Number of filter blocks */ | ||
77 | struct sock_filter __user *filter; | ||
78 | }; | ||
79 | |||
80 | For socket filtering, a pointer to this structure (as shown in | ||
81 | follow-up example) is being passed to the kernel through setsockopt(2). | ||
82 | |||
83 | Example | ||
84 | ------- | ||
85 | |||
86 | #include <sys/socket.h> | ||
87 | #include <sys/types.h> | ||
88 | #include <arpa/inet.h> | ||
89 | #include <linux/if_ether.h> | ||
90 | /* ... */ | ||
91 | |||
92 | /* From the example above: tcpdump -i em1 port 22 -dd */ | ||
93 | struct sock_filter code[] = { | ||
94 | { 0x28, 0, 0, 0x0000000c }, | ||
95 | { 0x15, 0, 8, 0x000086dd }, | ||
96 | { 0x30, 0, 0, 0x00000014 }, | ||
97 | { 0x15, 2, 0, 0x00000084 }, | ||
98 | { 0x15, 1, 0, 0x00000006 }, | ||
99 | { 0x15, 0, 17, 0x00000011 }, | ||
100 | { 0x28, 0, 0, 0x00000036 }, | ||
101 | { 0x15, 14, 0, 0x00000016 }, | ||
102 | { 0x28, 0, 0, 0x00000038 }, | ||
103 | { 0x15, 12, 13, 0x00000016 }, | ||
104 | { 0x15, 0, 12, 0x00000800 }, | ||
105 | { 0x30, 0, 0, 0x00000017 }, | ||
106 | { 0x15, 2, 0, 0x00000084 }, | ||
107 | { 0x15, 1, 0, 0x00000006 }, | ||
108 | { 0x15, 0, 8, 0x00000011 }, | ||
109 | { 0x28, 0, 0, 0x00000014 }, | ||
110 | { 0x45, 6, 0, 0x00001fff }, | ||
111 | { 0xb1, 0, 0, 0x0000000e }, | ||
112 | { 0x48, 0, 0, 0x0000000e }, | ||
113 | { 0x15, 2, 0, 0x00000016 }, | ||
114 | { 0x48, 0, 0, 0x00000010 }, | ||
115 | { 0x15, 0, 1, 0x00000016 }, | ||
116 | { 0x06, 0, 0, 0x0000ffff }, | ||
117 | { 0x06, 0, 0, 0x00000000 }, | ||
118 | }; | ||
119 | |||
120 | struct sock_fprog bpf = { | ||
121 | .len = ARRAY_SIZE(code), | ||
122 | .filter = code, | ||
123 | }; | ||
124 | |||
125 | sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); | ||
126 | if (sock < 0) | ||
127 | /* ... bail out ... */ | ||
128 | |||
129 | ret = setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf)); | ||
130 | if (ret < 0) | ||
131 | /* ... bail out ... */ | ||
132 | |||
133 | /* ... */ | ||
134 | close(sock); | ||
135 | |||
136 | The above example code attaches a socket filter for a PF_PACKET socket | ||
137 | in order to let all IPv4/IPv6 packets with port 22 pass. The rest will | ||
138 | be dropped for this socket. | ||
139 | |||
140 | The setsockopt(2) call to SO_DETACH_FILTER doesn't need any arguments | ||
141 | and SO_LOCK_FILTER, for preventing the filter from being detached, takes an | ||
142 | integer value with 0 or 1. | ||
143 | |||
144 | Note that socket filters are not restricted to PF_PACKET sockets only, | ||
145 | but can also be used on other socket families. | ||
146 | |||
147 | Summary of system calls: | ||
148 | |||
149 | * setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_FILTER, &val, sizeof(val)); | ||
150 | * setsockopt(sockfd, SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)); | ||
151 | * setsockopt(sockfd, SOL_SOCKET, SO_LOCK_FILTER, &val, sizeof(val)); | ||
152 | |||
153 | Normally, most use cases for socket filtering on packet sockets will be | ||
154 | covered by libpcap in high-level syntax, so as an application developer | ||
155 | you should stick to that. libpcap wraps its own layer around all that. | ||
156 | |||
157 | Unless i) using/linking to libpcap is not an option, ii) the required BPF | ||
158 | filters use Linux extensions that are not supported by libpcap's compiler, | ||
159 | iii) a filter might be more complex and not cleanly implementable with | ||
160 | libpcap's compiler, or iv) particular filter codes should be optimized | ||
161 | differently than libpcap's internal compiler does; then in such cases | ||
162 | writing such a filter "by hand" can be an alternative. For example, | ||
163 | xt_bpf and cls_bpf users might have requirements that could result in | ||
164 | more complex filter code, or one that cannot be expressed with libpcap | ||
165 | (e.g. different return codes for various code paths). Moreover, BPF JIT | ||
166 | implementors may wish to manually write test cases and thus need low-level | ||
167 | access to BPF code as well. | ||
168 | |||
169 | BPF engine and instruction set | ||
170 | ------------------------------ | ||
171 | |||
172 | Under tools/net/ there's a small helper tool called bpf_asm which can | ||
173 | be used to write low-level filters for example scenarios mentioned in the | ||
174 | previous section. Asm-like syntax mentioned here has been implemented in | ||
175 | bpf_asm and will be used for further explanations (instead of dealing with | ||
176 | less readable opcodes directly, principles are the same). The syntax is | ||
177 | closely modelled after Steven McCanne's and Van Jacobson's BPF paper. | ||
178 | |||
179 | The BPF architecture consists of the following basic elements: | ||
180 | |||
181 | Element Description | ||
182 | |||
183 | A 32 bit wide accumulator | ||
184 | X 32 bit wide X register | ||
185 | M[] 16 x 32 bit wide misc registers aka "scratch memory | ||
186 | store", addressable from 0 to 15 | ||
187 | |||
188 | A program that is translated by bpf_asm into "opcodes" is an array that | ||
189 | consists of the following elements (as already mentioned): | ||
190 | |||
191 | op:16, jt:8, jf:8, k:32 | ||
192 | |||
193 | The element op is a 16 bit wide opcode that has a particular instruction | ||
194 | encoded. jt and jf are two 8 bit wide jump targets, one for condition | ||
195 | "jump if true", the other one "jump if false". Finally, element k | ||
196 | contains a miscellaneous argument that can be interpreted in different | ||
197 | ways depending on the given instruction in op. | ||
198 | |||
199 | The instruction set consists of load, store, branch, alu, miscellaneous | ||
200 | and return instructions that are also represented in bpf_asm syntax. This | ||
201 | table lists all bpf_asm instructions available resp. what their underlying | ||
202 | opcodes as defined in linux/filter.h stand for: | ||
203 | |||
204 | Instruction Addressing mode Description | ||
205 | |||
206 | ld 1, 2, 3, 4, 10 Load word into A | ||
207 | ldi 4 Load word into A | ||
208 | ldh 1, 2 Load half-word into A | ||
209 | ldb 1, 2 Load byte into A | ||
210 | ldx 3, 4, 5, 10 Load word into X | ||
211 | ldxi 4 Load word into X | ||
212 | ldxb 5 Load byte into X | ||
213 | |||
214 | st 3 Store A into M[] | ||
215 | stx 3 Store X into M[] | ||
216 | |||
217 | jmp 6 Jump to label | ||
218 | ja 6 Jump to label | ||
219 | jeq 7, 8 Jump on A == k | ||
220 | jneq 8 Jump on A != k | ||
221 | jne 8 Jump on A != k | ||
222 | jlt 8 Jump on A < k | ||
223 | jle 8 Jump on A <= k | ||
224 | jgt 7, 8 Jump on A > k | ||
225 | jge 7, 8 Jump on A >= k | ||
226 | jset 7, 8 Jump on A & k | ||
227 | |||
228 | add 0, 4 A + <x> | ||
229 | sub 0, 4 A - <x> | ||
230 | mul 0, 4 A * <x> | ||
231 | div 0, 4 A / <x> | ||
232 | mod 0, 4 A % <x> | ||
233 | neg -A | ||
234 | and 0, 4 A & <x> | ||
235 | or 0, 4 A | <x> | ||
236 | xor 0, 4 A ^ <x> | ||
237 | lsh 0, 4 A << <x> | ||
238 | rsh 0, 4 A >> <x> | ||
239 | |||
240 | tax Copy A into X | ||
241 | txa Copy X into A | ||
242 | |||
243 | ret 4, 9 Return | ||
244 | |||
245 | The next table shows addressing formats from the 2nd column: | ||
246 | |||
247 | Addressing mode Syntax Description | ||
248 | |||
249 | 0 x/%x Register X | ||
250 | 1 [k] BHW at byte offset k in the packet | ||
251 | 2 [x + k] BHW at the offset X + k in the packet | ||
252 | 3 M[k] Word at offset k in M[] | ||
253 | 4 #k Literal value stored in k | ||
254 | 5 4*([k]&0xf) Lower nibble * 4 at byte offset k in the packet | ||
255 | 6 L Jump label L | ||
256 | 7 #k,Lt,Lf Jump to Lt if true, otherwise jump to Lf | ||
257 | 8 #k,Lt Jump to Lt if predicate is true | ||
258 | 9 a/%a Accumulator A | ||
259 | 10 extension BPF extension | ||
260 | |||
261 | The Linux kernel also has a couple of BPF extensions that are used along | ||
262 | with the class of load instructions by "overloading" the k argument with | ||
263 | a negative offset + a particular extension offset. The result of such BPF | ||
264 | extensions is loaded into A. | ||
265 | |||
266 | Possible BPF extensions are shown in the following table: | ||
267 | |||
268 | Extension Description | ||
269 | |||
270 | len skb->len | ||
271 | proto skb->protocol | ||
272 | type skb->pkt_type | ||
273 | poff Payload start offset | ||
274 | ifidx skb->dev->ifindex | ||
275 | nla Netlink attribute of type X with offset A | ||
276 | nlan Nested Netlink attribute of type X with offset A | ||
277 | mark skb->mark | ||
278 | queue skb->queue_mapping | ||
279 | hatype skb->dev->type | ||
280 | rxhash skb->rxhash | ||
281 | cpu raw_smp_processor_id() | ||
282 | vlan_tci vlan_tx_tag_get(skb) | ||
283 | vlan_pr vlan_tx_tag_present(skb) | ||
284 | |||
285 | These extensions can also be prefixed with '#'. | ||
286 | Examples for low-level BPF: | ||
287 | |||
288 | ** ARP packets: | ||
289 | |||
290 | ldh [12] | ||
291 | jne #0x806, drop | ||
292 | ret #-1 | ||
293 | drop: ret #0 | ||
294 | |||
295 | ** IPv4 TCP packets: | ||
296 | |||
297 | ldh [12] | ||
298 | jne #0x800, drop | ||
299 | ldb [23] | ||
300 | jneq #6, drop | ||
301 | ret #-1 | ||
302 | drop: ret #0 | ||
303 | |||
304 | ** (Accelerated) VLAN w/ id 10: | ||
305 | |||
306 | ld vlan_tci | ||
307 | jneq #10, drop | ||
308 | ret #-1 | ||
309 | drop: ret #0 | ||
310 | |||
311 | ** SECCOMP filter example: | ||
312 | |||
313 | ld [4] /* offsetof(struct seccomp_data, arch) */ | ||
314 | jne #0xc000003e, bad /* AUDIT_ARCH_X86_64 */ | ||
315 | ld [0] /* offsetof(struct seccomp_data, nr) */ | ||
316 | jeq #15, good /* __NR_rt_sigreturn */ | ||
317 | jeq #231, good /* __NR_exit_group */ | ||
318 | jeq #60, good /* __NR_exit */ | ||
319 | jeq #0, good /* __NR_read */ | ||
320 | jeq #1, good /* __NR_write */ | ||
321 | jeq #5, good /* __NR_fstat */ | ||
322 | jeq #9, good /* __NR_mmap */ | ||
323 | jeq #14, good /* __NR_rt_sigprocmask */ | ||
324 | jeq #13, good /* __NR_rt_sigaction */ | ||
325 | jeq #35, good /* __NR_nanosleep */ | ||
326 | bad: ret #0 /* SECCOMP_RET_KILL */ | ||
327 | good: ret #0x7fff0000 /* SECCOMP_RET_ALLOW */ | ||
328 | |||
329 | The above example code can be placed into a file (here called "foo"), and | ||
330 | then be passed to the bpf_asm tool for generating opcodes, output that xt_bpf | ||
331 | and cls_bpf understand and can directly be loaded with. Example with above | ||
332 | ARP code: | ||
333 | |||
334 | $ ./bpf_asm foo | ||
335 | 4,40 0 0 12,21 0 1 2054,6 0 0 4294967295,6 0 0 0, | ||
336 | |||
337 | In copy and paste C-like output: | ||
338 | |||
339 | $ ./bpf_asm -c foo | ||
340 | { 0x28, 0, 0, 0x0000000c }, | ||
341 | { 0x15, 0, 1, 0x00000806 }, | ||
342 | { 0x06, 0, 0, 0xffffffff }, | ||
343 | { 0x06, 0, 0, 0000000000 }, | ||
344 | |||
345 | In particular, as usage with xt_bpf or cls_bpf can result in more complex BPF | ||
346 | filters that might not be obvious at first, it's good to test filters before | ||
347 | attaching to a live system. For that purpose, there's a small tool called | ||
348 | bpf_dbg under tools/net/ in the kernel source directory. This debugger allows | ||
349 | for testing BPF filters against given pcap files, single stepping through the | ||
350 | BPF code on the pcap's packets and to do BPF machine register dumps. | ||
351 | |||
352 | Starting bpf_dbg is trivial and just requires issuing: | ||
353 | |||
354 | # ./bpf_dbg | ||
355 | |||
356 | In case input and output do not equal stdin/stdout, bpf_dbg takes an | ||
357 | alternative stdin source as a first argument, and an alternative stdout | ||
358 | sink as a second one, e.g. `./bpf_dbg test_in.txt test_out.txt`. | ||
359 | |||
360 | Other than that, a particular libreadline configuration can be set via | ||
361 | file "~/.bpf_dbg_init" and the command history is stored in the file | ||
362 | "~/.bpf_dbg_history". | ||
363 | |||
364 | Interaction in bpf_dbg happens through a shell that also has auto-completion | ||
365 | support (follow-up example commands starting with '>' denote bpf_dbg shell). | ||
366 | The usual workflow would be to ... | ||
367 | |||
368 | > load bpf 6,40 0 0 12,21 0 3 2048,48 0 0 23,21 0 1 1,6 0 0 65535,6 0 0 0 | ||
369 | Loads a BPF filter from standard output of bpf_asm, or transformed via | ||
370 | e.g. `tcpdump -iem1 -ddd port 22 | tr '\n' ','`. Note that for JIT | ||
371 | debugging (next section), this command creates a temporary socket and | ||
372 | loads the BPF code into the kernel. Thus, this will also be useful for | ||
373 | JIT developers. | ||
374 | |||
375 | > load pcap foo.pcap | ||
376 | Loads standard tcpdump pcap file. | ||
377 | |||
378 | > run [<n>] | ||
379 | bpf passes:1 fails:9 | ||
380 | Runs through all packets from a pcap to account how many passes and fails | ||
381 | the filter will generate. A limit of packets to traverse can be given. | ||
382 | |||
383 | > disassemble | ||
384 | l0: ldh [12] | ||
385 | l1: jeq #0x800, l2, l5 | ||
386 | l2: ldb [23] | ||
387 | l3: jeq #0x1, l4, l5 | ||
388 | l4: ret #0xffff | ||
389 | l5: ret #0 | ||
390 | Prints out BPF code disassembly. | ||
391 | |||
392 | > dump | ||
393 | /* { op, jt, jf, k }, */ | ||
394 | { 0x28, 0, 0, 0x0000000c }, | ||
395 | { 0x15, 0, 3, 0x00000800 }, | ||
396 | { 0x30, 0, 0, 0x00000017 }, | ||
397 | { 0x15, 0, 1, 0x00000001 }, | ||
398 | { 0x06, 0, 0, 0x0000ffff }, | ||
399 | { 0x06, 0, 0, 0000000000 }, | ||
400 | Prints out C-style BPF code dump. | ||
401 | |||
402 | > breakpoint 0 | ||
403 | breakpoint at: l0: ldh [12] | ||
404 | > breakpoint 1 | ||
405 | breakpoint at: l1: jeq #0x800, l2, l5 | ||
406 | ... | ||
407 | Sets breakpoints at particular BPF instructions. Issuing a `run` command | ||
408 | will walk through the pcap file continuing from the current packet and | ||
409 | break when a breakpoint is being hit (another `run` will continue from | ||
410 | the currently active breakpoint executing next instructions): | ||
411 | |||
412 | > run | ||
413 | -- register dump -- | ||
414 | pc: [0] <-- program counter | ||
415 | code: [40] jt[0] jf[0] k[12] <-- plain BPF code of current instruction | ||
416 | curr: l0: ldh [12] <-- disassembly of current instruction | ||
417 | A: [00000000][0] <-- content of A (hex, decimal) | ||
418 | X: [00000000][0] <-- content of X (hex, decimal) | ||
419 | M[0,15]: [00000000][0] <-- folded content of M (hex, decimal) | ||
420 | -- packet dump -- <-- Current packet from pcap (hex) | ||
421 | len: 42 | ||
422 | 0: 00 19 cb 55 55 a4 00 14 a4 43 78 69 08 06 00 01 | ||
423 | 16: 08 00 06 04 00 01 00 14 a4 43 78 69 0a 3b 01 26 | ||
424 | 32: 00 00 00 00 00 00 0a 3b 01 01 | ||
425 | (breakpoint) | ||
426 | > | ||
427 | |||
428 | > breakpoint | ||
429 | breakpoints: 0 1 | ||
430 | Prints currently set breakpoints. | ||
431 | |||
432 | > step [-<n>, +<n>] | ||
433 | Performs single stepping through the BPF program from the current pc | ||
434 | offset. Thus, on each step invocation, above register dump is issued. | ||
435 | This can go forwards and backwards in time, a plain `step` will break | ||
436 | on the next BPF instruction, thus +1. (No `run` needs to be issued here.) | ||
437 | |||
438 | > select <n> | ||
439 | Selects a given packet from the pcap file to continue from. Thus, on | ||
440 | the next `run` or `step`, the BPF program is being evaluated against | ||
441 | the user pre-selected packet. Numbering starts just as in Wireshark | ||
442 | with index 1. | ||
443 | |||
444 | > quit | ||
445 | # | ||
446 | Exits bpf_dbg. | ||
447 | |||
448 | JIT compiler | ||
449 | ------------ | ||
450 | |||
451 | The Linux kernel has a built-in BPF JIT compiler for x86_64, SPARC, PowerPC, | ||
452 | ARM and s390, which can be enabled through CONFIG_BPF_JIT. The JIT compiler is | ||
453 | transparently invoked for each attached filter from user space or for internal | ||
454 | kernel users if it has been previously enabled by root: | ||
455 | |||
456 | echo 1 > /proc/sys/net/core/bpf_jit_enable | ||
457 | |||
458 | For JIT developers, doing audits etc, each compile run can output the generated | ||
459 | opcode image into the kernel log via: | ||
460 | |||
461 | echo 2 > /proc/sys/net/core/bpf_jit_enable | ||
462 | |||
463 | Example output from dmesg: | ||
464 | |||
465 | [ 3389.935842] flen=6 proglen=70 pass=3 image=ffffffffa0069c8f | ||
466 | [ 3389.935847] JIT code: 00000000: 55 48 89 e5 48 83 ec 60 48 89 5d f8 44 8b 4f 68 | ||
467 | [ 3389.935849] JIT code: 00000010: 44 2b 4f 6c 4c 8b 87 d8 00 00 00 be 0c 00 00 00 | ||
468 | [ 3389.935850] JIT code: 00000020: e8 1d 94 ff e0 3d 00 08 00 00 75 16 be 17 00 00 | ||
469 | [ 3389.935851] JIT code: 00000030: 00 e8 28 94 ff e0 83 f8 01 75 07 b8 ff ff 00 00 | ||
470 | [ 3389.935852] JIT code: 00000040: eb 02 31 c0 c9 c3 | ||
471 | |||
472 | In the kernel source tree under tools/net/, there's bpf_jit_disasm for | ||
473 | generating disassembly out of the kernel log's hexdump: | ||
474 | |||
475 | # ./bpf_jit_disasm | ||
476 | 70 bytes emitted from JIT compiler (pass:3, flen:6) | ||
477 | ffffffffa0069c8f + <x>: | ||
478 | 0: push %rbp | ||
479 | 1: mov %rsp,%rbp | ||
480 | 4: sub $0x60,%rsp | ||
481 | 8: mov %rbx,-0x8(%rbp) | ||
482 | c: mov 0x68(%rdi),%r9d | ||
483 | 10: sub 0x6c(%rdi),%r9d | ||
484 | 14: mov 0xd8(%rdi),%r8 | ||
485 | 1b: mov $0xc,%esi | ||
486 | 20: callq 0xffffffffe0ff9442 | ||
487 | 25: cmp $0x800,%eax | ||
488 | 2a: jne 0x0000000000000042 | ||
489 | 2c: mov $0x17,%esi | ||
490 | 31: callq 0xffffffffe0ff945e | ||
491 | 36: cmp $0x1,%eax | ||
492 | 39: jne 0x0000000000000042 | ||
493 | 3b: mov $0xffff,%eax | ||
494 | 40: jmp 0x0000000000000044 | ||
495 | 42: xor %eax,%eax | ||
496 | 44: leaveq | ||
497 | 45: retq | ||
498 | |||
499 | Issuing option `-o` will "annotate" opcodes to resulting assembler | ||
500 | instructions, which can be very useful for JIT developers: | ||
501 | |||
502 | # ./bpf_jit_disasm -o | ||
503 | 70 bytes emitted from JIT compiler (pass:3, flen:6) | ||
504 | ffffffffa0069c8f + <x>: | ||
505 | 0: push %rbp | ||
506 | 55 | ||
507 | 1: mov %rsp,%rbp | ||
508 | 48 89 e5 | ||
509 | 4: sub $0x60,%rsp | ||
510 | 48 83 ec 60 | ||
511 | 8: mov %rbx,-0x8(%rbp) | ||
512 | 48 89 5d f8 | ||
513 | c: mov 0x68(%rdi),%r9d | ||
514 | 44 8b 4f 68 | ||
515 | 10: sub 0x6c(%rdi),%r9d | ||
516 | 44 2b 4f 6c | ||
517 | 14: mov 0xd8(%rdi),%r8 | ||
518 | 4c 8b 87 d8 00 00 00 | ||
519 | 1b: mov $0xc,%esi | ||
520 | be 0c 00 00 00 | ||
521 | 20: callq 0xffffffffe0ff9442 | ||
522 | e8 1d 94 ff e0 | ||
523 | 25: cmp $0x800,%eax | ||
524 | 3d 00 08 00 00 | ||
525 | 2a: jne 0x0000000000000042 | ||
526 | 75 16 | ||
527 | 2c: mov $0x17,%esi | ||
528 | be 17 00 00 00 | ||
529 | 31: callq 0xffffffffe0ff945e | ||
530 | e8 28 94 ff e0 | ||
531 | 36: cmp $0x1,%eax | ||
532 | 83 f8 01 | ||
533 | 39: jne 0x0000000000000042 | ||
534 | 75 07 | ||
535 | 3b: mov $0xffff,%eax | ||
536 | b8 ff ff 00 00 | ||
537 | 40: jmp 0x0000000000000044 | ||
538 | eb 02 | ||
539 | 42: xor %eax,%eax | ||
540 | 31 c0 | ||
541 | 44: leaveq | ||
542 | c9 | ||
543 | 45: retq | ||
544 | c3 | ||
545 | |||
546 | For BPF JIT developers, bpf_jit_disasm, bpf_asm and bpf_dbg provide a useful | ||
547 | toolchain for developing and testing the kernel's JIT compiler. | ||
548 | |||
549 | Misc | ||
550 | ---- | ||
551 | |||
552 | Also trinity, the Linux syscall fuzzer, has built-in support for BPF and | ||
553 | SECCOMP-BPF kernel fuzzing. | ||
554 | |||
555 | Written by | ||
556 | ---------- | ||
557 | |||
558 | The document was written in the hope that it is found useful and in order | ||
559 | to give potential BPF hackers or security auditors a better overview of | ||
560 | the underlying architecture. | ||
561 | |||
562 | Jay Schulist <jschlst@samba.org> | ||
563 | Daniel Borkmann <dborkman@redhat.com> | ||
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 3c12d9a7ed00..d71afa8bd828 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt | |||
@@ -15,9 +15,19 @@ ip_default_ttl - INTEGER | |||
15 | forwarded) IP packets. Should be between 1 and 255 inclusive. | 15 | forwarded) IP packets. Should be between 1 and 255 inclusive. |
16 | Default: 64 (as recommended by RFC1700) | 16 | Default: 64 (as recommended by RFC1700) |
17 | 17 | ||
18 | ip_no_pmtu_disc - BOOLEAN | 18 | ip_no_pmtu_disc - INTEGER |
19 | Disable Path MTU Discovery. | 19 | Disable Path MTU Discovery. If enabled in mode 1 and a |
20 | default FALSE | 20 | fragmentation-required ICMP is received, the PMTU to this |
21 | destination will be set to min_pmtu (see below). You will need | ||
22 | to raise min_pmtu to the smallest interface MTU on your system | ||
23 | manually if you want to avoid locally generated fragments. | ||
24 | |||
25 | In mode 2 incoming Path MTU Discovery messages will be | ||
26 | discarded. Outgoing frames are handled the same as in mode 1, | ||
27 | implicitly setting IP_PMTUDISC_DONT on every created socket. | ||
28 | |||
29 | Possible values: 0-2 | ||
30 | Default: 0 | ||
21 | 31 | ||
22 | min_pmtu - INTEGER | 32 | min_pmtu - INTEGER |
23 | default 552 - minimum discovered Path MTU | 33 | default 552 - minimum discovered Path MTU |
@@ -156,6 +166,16 @@ tcp_app_win - INTEGER | |||
156 | buffer. Value 0 is special, it means that nothing is reserved. | 166 | buffer. Value 0 is special, it means that nothing is reserved. |
157 | Default: 31 | 167 | Default: 31 |
158 | 168 | ||
169 | tcp_autocorking - BOOLEAN | ||
170 | Enable TCP auto corking : | ||
171 | When applications do consecutive small write()/sendmsg() system calls, | ||
172 | we try to coalesce these small writes as much as possible, to lower | ||
173 | total amount of sent packets. This is done if at least one prior | ||
174 | packet for the flow is waiting in Qdisc queues or device transmit | ||
175 | queue. Applications can still use TCP_CORK for optimal behavior | ||
176 | when they know how/when to uncork their sockets. | ||
177 | Default : 1 | ||
178 | |||
159 | tcp_available_congestion_control - STRING | 179 | tcp_available_congestion_control - STRING |
160 | Shows the available congestion control choices that are registered. | 180 | Shows the available congestion control choices that are registered. |
161 | More congestion control algorithms may be available as modules, | 181 | More congestion control algorithms may be available as modules, |
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt index c01223628a87..4288ffafba9f 100644 --- a/Documentation/networking/packet_mmap.txt +++ b/Documentation/networking/packet_mmap.txt | |||
@@ -123,6 +123,16 @@ Transmission process is similar to capture as shown below. | |||
123 | [shutdown] close() --------> destruction of the transmission socket and | 123 | [shutdown] close() --------> destruction of the transmission socket and |
124 | deallocation of all associated resources. | 124 | deallocation of all associated resources. |
125 | 125 | ||
126 | Socket creation and destruction is also straightforward, and is done | ||
127 | the same way as in capturing described in the previous paragraph: | ||
128 | |||
129 | int fd = socket(PF_PACKET, mode, 0); | ||
130 | |||
131 | The protocol can optionally be 0 in case we only want to transmit | ||
132 | via this socket, which avoids an expensive call to packet_rcv(). | ||
133 | In this case, you also need to bind(2) the TX_RING with sll_protocol = 0 | ||
134 | set. Otherwise, htons(ETH_P_ALL) or any other protocol, for example. | ||
135 | |||
126 | Binding the socket to your network interface is mandatory (with zero copy) to | 136 | Binding the socket to your network interface is mandatory (with zero copy) to |
127 | know the header size of frames used in the circular buffer. | 137 | know the header size of frames used in the circular buffer. |
128 | 138 | ||
@@ -943,6 +953,27 @@ int main(int argc, char **argp) | |||
943 | } | 953 | } |
944 | 954 | ||
945 | ------------------------------------------------------------------------------- | 955 | ------------------------------------------------------------------------------- |
956 | + PACKET_QDISC_BYPASS | ||
957 | ------------------------------------------------------------------------------- | ||
958 | |||
959 | If there is a requirement to load the network with many packets in a similar | ||
960 | fashion as pktgen does, you might set the following option after socket | ||
961 | creation: | ||
962 | |||
963 | int one = 1; | ||
964 | setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one)); | ||
965 | |||
966 | This has the side-effect, that packets sent through PF_PACKET will bypass the | ||
967 | kernel's qdisc layer and are forcedly pushed to the driver directly. Meaning, | ||
968 | packets are not buffered, tc disciplines are ignored, increased loss can occur | ||
969 | and such packets are also not visible to other PF_PACKET sockets anymore. So, | ||
970 | you have been warned; generally, this can be useful for stress testing various | ||
971 | components of a system. | ||
972 | |||
973 | By default, PACKET_QDISC_BYPASS is disabled and needs to be explicitly enabled | ||
974 | on PF_PACKET sockets. | ||
975 | |||
976 | ------------------------------------------------------------------------------- | ||
946 | + PACKET_TIMESTAMP | 977 | + PACKET_TIMESTAMP |
947 | ------------------------------------------------------------------------------- | 978 | ------------------------------------------------------------------------------- |
948 | 979 | ||
diff --git a/Documentation/networking/phy.txt b/Documentation/networking/phy.txt index d5b1a3935245..ebf270719402 100644 --- a/Documentation/networking/phy.txt +++ b/Documentation/networking/phy.txt | |||
@@ -255,7 +255,8 @@ Writing a PHY driver | |||
255 | 255 | ||
256 | config_init: configures PHY into a sane state after a reset. | 256 | config_init: configures PHY into a sane state after a reset. |
257 | For instance, a Davicom PHY requires descrambling disabled. | 257 | For instance, a Davicom PHY requires descrambling disabled. |
258 | probe: Does any setup needed by the driver | 258 | probe: Allocate phy->priv, optionally refuse to bind. |
259 | PHY may not have been reset or had fixups run yet. | ||
259 | suspend/resume: power management | 260 | suspend/resume: power management |
260 | config_aneg: Changes the speed/duplex/negotiation settings | 261 | config_aneg: Changes the speed/duplex/negotiation settings |
261 | read_status: Reads the current speed/duplex/negotiation settings | 262 | read_status: Reads the current speed/duplex/negotiation settings |
diff --git a/Documentation/networking/regulatory.txt b/Documentation/networking/regulatory.txt index 9551622d0a7b..356f791af574 100644 --- a/Documentation/networking/regulatory.txt +++ b/Documentation/networking/regulatory.txt | |||
@@ -159,10 +159,10 @@ struct ieee80211_regdomain mydriver_jp_regdom = { | |||
159 | REG_RULE(2412-20, 2484+20, 40, 6, 20, 0), | 159 | REG_RULE(2412-20, 2484+20, 40, 6, 20, 0), |
160 | /* IEEE 802.11a, channels 34..48 */ | 160 | /* IEEE 802.11a, channels 34..48 */ |
161 | REG_RULE(5170-20, 5240+20, 40, 6, 20, | 161 | REG_RULE(5170-20, 5240+20, 40, 6, 20, |
162 | NL80211_RRF_PASSIVE_SCAN), | 162 | NL80211_RRF_NO_IR), |
163 | /* IEEE 802.11a, channels 52..64 */ | 163 | /* IEEE 802.11a, channels 52..64 */ |
164 | REG_RULE(5260-20, 5320+20, 40, 6, 20, | 164 | REG_RULE(5260-20, 5320+20, 40, 6, 20, |
165 | NL80211_RRF_NO_IBSS | | 165 | NL80211_RRF_NO_IR| |
166 | NL80211_RRF_DFS), | 166 | NL80211_RRF_DFS), |
167 | } | 167 | } |
168 | }; | 168 | }; |