-rw-r--r-- | Documentation/networking/netlink_mmap.txt | 339
-rw-r--r-- | include/linux/netfilter/nfnetlink.h | 11
-rw-r--r-- | include/linux/netlink.h | 11
-rw-r--r-- | include/linux/skbuff.h | 6
-rw-r--r-- | include/net/netfilter/nf_conntrack.h | 2
-rw-r--r-- | include/net/netfilter/nf_conntrack_expect.h | 4
-rw-r--r-- | include/uapi/linux/netlink.h | 32
-rw-r--r-- | include/uapi/linux/netlink_diag.h | 10
-rw-r--r-- | net/Kconfig | 9
-rw-r--r-- | net/core/skbuff.c | 30
-rw-r--r-- | net/ipv4/inet_diag.c | 6
-rw-r--r-- | net/ipv4/udp_diag.c | 4
-rw-r--r-- | net/netfilter/nf_conntrack_core.c | 8
-rw-r--r-- | net/netfilter/nf_conntrack_expect.c | 8
-rw-r--r-- | net/netfilter/nfnetlink.c | 20
-rw-r--r-- | net/netfilter/nfnetlink_log.c | 12
-rw-r--r-- | net/netfilter/nfnetlink_queue_core.c | 3
-rw-r--r-- | net/netlink/af_netlink.c | 836
-rw-r--r-- | net/netlink/af_netlink.h | 20
-rw-r--r-- | net/netlink/diag.c | 32
-rw-r--r-- | net/sched/cls_flow.c | 2

21 files changed, 1331 insertions, 74 deletions
diff --git a/Documentation/networking/netlink_mmap.txt b/Documentation/networking/netlink_mmap.txt
new file mode 100644
index 000000000000..1c2dab409625
--- /dev/null
+++ b/Documentation/networking/netlink_mmap.txt
@@ -0,0 +1,339 @@
This file documents how to use memory mapped I/O with netlink.

Author: Patrick McHardy <kaber@trash.net>

Overview
--------

Memory mapped netlink I/O can be used to increase throughput and decrease
overhead of unicast receive and transmit operations. Some netlink subsystems
require high throughput, mainly the netfilter subsystems nfnetlink_queue and
nfnetlink_log, but memory mapped I/O can also help speed up large dump
operations of, for example, the routing database.

Memory mapped netlink I/O uses two circular ring buffers for RX and TX which
are mapped into the process's address space.

The RX ring is used by the kernel to construct netlink messages directly in
user-space memory without copying them as done with regular socket I/O.
Additionally, as long as the ring contains messages, no recvmsg() or poll()
syscalls have to be issued by user-space to get more messages.

The TX ring is used to process messages directly from user-space memory; the
kernel processes all messages contained in the ring using a single sendmsg()
call.

Usage overview
--------------

In order to use memory mapped netlink I/O, user-space needs three main changes:

- ring setup
- conversion of the RX path to get messages from the ring instead of recvmsg()
- conversion of the TX path to construct messages into the ring

Ring setup is done using setsockopt() to provide the ring parameters to the
kernel, then a call to mmap() to map the ring into the process's address space:

- setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &params, sizeof(params));
- setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &params, sizeof(params));
- ring = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)

Usage of either ring is optional, but even if only the RX ring is used, the
mapping still needs to be writable in order to update the frame status after
processing.

Conversion of the reception path involves calling poll() on the file
descriptor; once the socket is readable, the frames from the ring are
processed in order until no more messages are available, as indicated by
a status word in the frame header.

On the kernel side, in order to make use of memory mapped I/O on receive, the
originating netlink subsystem needs to support memory mapped I/O; otherwise
it will use an allocated socket buffer as usual and the contents will be
copied to the ring on transmission, nullifying most of the performance gains.
Dumps of kernel databases automatically support memory mapped I/O.

Conversion of the transmit path involves changing message construction to
use memory from the TX ring instead of (usually) a buffer declared on the
stack and setting up the frame header appropriately. Optionally, poll() can
be used to wait for free frames in the TX ring, as in the sketch below.
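
A minimal sketch of such a wait, assuming the socket descriptor fd used in
the examples further down; the kernel signals POLLOUT once the frame at the
current TX ring head is unused:

    #include <errno.h>
    #include <poll.h>
    #include <stdlib.h>

    static void wait_for_tx_frame(int fd)
    {
        struct pollfd pfd = { .fd = fd, .events = POLLOUT };

        /* Restart when interrupted by a signal, give up on real errors */
        while (poll(&pfd, 1, -1) < 0) {
            if (errno != EINTR)
                exit(1);
        }
    }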

Structures and definitions for using memory mapped I/O are contained in
<linux/netlink.h>.

RX and TX rings
---------------

Each ring contains a number of contiguous memory blocks, containing frames of
a fixed size that depends on the parameters used for ring setup.

Ring:   [ block 0 ]
                [ frame 0 ]
                [ frame 1 ]
        [ block 1 ]
                [ frame 2 ]
                [ frame 3 ]
        ...
        [ block n ]
                [ frame 2 * n ]
                [ frame 2 * n + 1 ]

The blocks are only visible to the kernel; from the point of view of
user-space, the ring just contains the frames in a contiguous memory zone.

The ring parameters used for setting up the ring are defined as follows:

struct nl_mmap_req {
        unsigned int    nm_block_size;
        unsigned int    nm_block_nr;
        unsigned int    nm_frame_size;
        unsigned int    nm_frame_nr;
};

Frames are grouped into blocks, where each block is a contiguous region of
memory and holds nm_block_size / nm_frame_size frames. The total number of
frames in the ring is nm_frame_nr. The following invariants hold:

- frames_per_block = nm_block_size / nm_frame_size

- nm_frame_nr = frames_per_block * nm_block_nr

Some parameters are constrained, specifically:

- nm_block_size must be a multiple of the architecture's memory page size.
  The getpagesize() function can be used to get the page size.

- nm_frame_size must be equal to or larger than NL_MMAP_HDRLEN; in other
  words, a frame must be able to hold at least the frame header.

- nm_frame_size must be smaller than or equal to nm_block_size.

- nm_frame_size must be a multiple of NL_MMAP_MSG_ALIGNMENT.

- nm_frame_nr must equal the actual number of frames as specified above.

When the kernel can't allocate physically contiguous memory for a ring block,
it will fall back to using physically discontiguous memory. This might affect
performance negatively; in order to avoid this, the nm_frame_size parameter
should be chosen to be as small as possible for the required frame size and
the number of blocks should be increased instead.
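
Based on these invariants, a small helper can derive nm_frame_nr from the
other parameters instead of computing it by hand; this is a sketch, not part
of the kernel interface:

    #include <linux/netlink.h>

    /* block_size must be a multiple of the page size; frame_size must be
     * a multiple of NL_MMAP_MSG_ALIGNMENT and at least NL_MMAP_HDRLEN. */
    static void init_ring_req(struct nl_mmap_req *req,
                              unsigned int block_size,
                              unsigned int block_nr,
                              unsigned int frame_size)
    {
        req->nm_block_size = block_size;
        req->nm_block_nr   = block_nr;
        req->nm_frame_size = frame_size;
        /* frames_per_block * nm_block_nr, as required above */
        req->nm_frame_nr   = block_size / frame_size * block_nr;
    }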

Ring frames
-----------

Each frame contains a frame header, consisting of a synchronization word and
some meta-data, followed by the message itself.

Frame:  [ header message ]

The frame header is defined as follows:

struct nl_mmap_hdr {
        unsigned int    nm_status;
        unsigned int    nm_len;
        __u32           nm_group;
        /* credentials */
        __u32           nm_pid;
        __u32           nm_uid;
        __u32           nm_gid;
};

- nm_status is used for synchronizing processing between the kernel and
  user-space and specifies ownership of the frame as well as the operation
  to perform.

- nm_len contains the length of the message contained in the data area.

- nm_group specifies the destination multicast group of the message.

- nm_pid, nm_uid and nm_gid contain the netlink pid, UID and GID of the sending
  process. These values correspond to the data available using SOCK_PASSCRED in
  the SCM_CREDENTIALS cmsg.

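For illustration, a small hypothetical debugging helper that prints all
meta-data fields of a frame header:

    #include <stdio.h>
    #include <linux/netlink.h>

    static void dump_frame_hdr(const struct nl_mmap_hdr *hdr)
    {
        printf("status %u len %u group %u pid %u uid %u gid %u\n",
               hdr->nm_status, hdr->nm_len, hdr->nm_group,
               hdr->nm_pid, hdr->nm_uid, hdr->nm_gid);
    }
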
The possible values in the status word are:

- NL_MMAP_STATUS_UNUSED:
        RX ring:        frame belongs to the kernel and contains no message
                        for user-space. Appropriate action is to invoke poll()
                        to wait for new messages.

        TX ring:        frame belongs to user-space and can be used for
                        message construction.

- NL_MMAP_STATUS_RESERVED:
        RX ring only:   frame is currently used by the kernel for message
                        construction and contains no valid message yet.
                        Appropriate action is to invoke poll() to wait for
                        new messages.

- NL_MMAP_STATUS_VALID:
        RX ring:        frame contains a valid message. Appropriate action is
                        to process the message and release the frame back to
                        the kernel by setting the status to
                        NL_MMAP_STATUS_UNUSED or to queue the frame by setting
                        the status to NL_MMAP_STATUS_SKIP.

        TX ring:        the frame contains a valid message from user-space to
                        be processed by the kernel. After completing processing
                        the kernel will release the frame back to user-space by
                        setting the status to NL_MMAP_STATUS_UNUSED.

- NL_MMAP_STATUS_COPY:
        RX ring only:   a message is ready to be processed but could not be
                        stored in the ring, either because it exceeded the
                        frame size or because the originating subsystem does
                        not support memory mapped I/O. Appropriate action is
                        to invoke recvmsg() to receive the message and release
                        the frame back to the kernel by setting the status to
                        NL_MMAP_STATUS_UNUSED.

- NL_MMAP_STATUS_SKIP:
        RX ring only:   user-space queued the message for later processing, but
                        processed some messages following it in the ring. The
                        kernel should skip this frame when looking for unused
                        frames. A short sketch of this scheme follows below.

The data area of a frame begins at an offset of NL_MMAP_HDRLEN relative to the
frame header.

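As a sketch of the queueing scheme, assuming the variables of the reception
example below: a frame is kept for later processing by marking it
NL_MMAP_STATUS_SKIP and is released only once it has been handled:

    struct nl_mmap_hdr *deferred = hdr;

    /* Keep the frame; the kernel skips it when scanning for unused frames */
    deferred->nm_status = NL_MMAP_STATUS_SKIP;

    /* ... process other frames in the ring ... */

    /* Process the deferred message, then release its frame */
    process_msg((struct nlmsghdr *)((char *)deferred + NL_MMAP_HDRLEN));
    deferred->nm_status = NL_MMAP_STATUS_UNUSED;
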
TX limitations
--------------

Kernel processing usually involves validation of the message received from
user-space, then processing its contents. The kernel must ensure that
userspace is not able to modify the message contents after they have been
validated. In order to do so, the message is copied from the ring frame
to an allocated buffer if either of these conditions is false:

- only a single mapping of the ring exists
- the file descriptor is not shared between processes

This means that for threaded programs, the kernel will fall back to copying.

Example
-------

Ring setup:

    unsigned int block_size = 16 * getpagesize();
    struct nl_mmap_req req = {
        .nm_block_size  = block_size,
        .nm_block_nr    = 64,
        .nm_frame_size  = 16384,
        .nm_frame_nr    = 64 * block_size / 16384,
    };
    unsigned int ring_size;
    void *rx_ring, *tx_ring;

    /* Configure ring parameters */
    if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0)
        exit(1);
    if (setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &req, sizeof(req)) < 0)
        exit(1);

    /* Calculate size of each individual ring */
    ring_size = req.nm_block_nr * req.nm_block_size;

    /* Map RX/TX rings. The TX ring is located after the RX ring */
    rx_ring = mmap(NULL, 2 * ring_size, PROT_READ | PROT_WRITE,
                   MAP_SHARED, fd, 0);
    if ((long)rx_ring == -1L)
        exit(1);
    tx_ring = rx_ring + ring_size;

Message reception:

This example assumes some ring parameters of the ring setup are available.

    unsigned int frame_offset = 0;
    struct nl_mmap_hdr *hdr;
    struct nlmsghdr *nlh;
    unsigned char buf[16384];
    ssize_t len;

    while (1) {
        struct pollfd pfds[1];

        pfds[0].fd = fd;
        pfds[0].events = POLLIN | POLLERR;
        pfds[0].revents = 0;

        if (poll(pfds, 1, -1) < 0 && errno != EINTR)
            exit(1);

        /* Check for errors. Error handling omitted */
        if (pfds[0].revents & POLLERR)
            <handle error>

        /* If no new messages, poll again */
        if (!(pfds[0].revents & POLLIN))
            continue;

        /* Process all frames */
        while (1) {
            /* Get next frame header */
            hdr = rx_ring + frame_offset;

            if (hdr->nm_status == NL_MMAP_STATUS_VALID) {
                /* Regular memory mapped frame */
                nlh = (void *)hdr + NL_MMAP_HDRLEN;
                len = hdr->nm_len;

                /* Release empty message immediately. May happen
                 * on error during message construction.
                 */
                if (len == 0)
                    goto release;
            } else if (hdr->nm_status == NL_MMAP_STATUS_COPY) {
                /* Frame queued to socket receive queue */
                len = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);
                if (len <= 0)
                    break;
                nlh = (struct nlmsghdr *)buf;
            } else
                /* No more messages to process, continue polling */
                break;

            process_msg(nlh);
release:
            /* Release frame back to the kernel */
            hdr->nm_status = NL_MMAP_STATUS_UNUSED;

            /* Advance frame offset to next frame */
            frame_offset = (frame_offset + frame_size) % ring_size;
        }
    }

Message transmission:

This example assumes some ring parameters of the ring setup are available.
A single message is constructed and transmitted; to send multiple messages
at once, they would be constructed in consecutive frames before a final call
to sendto(), as in the sketch following this example.

    unsigned int frame_offset = 0;
    struct nl_mmap_hdr *hdr;
    struct nlmsghdr *nlh;
    struct sockaddr_nl addr = {
        .nl_family      = AF_NETLINK,
    };

    hdr = tx_ring + frame_offset;
    if (hdr->nm_status != NL_MMAP_STATUS_UNUSED)
        /* No frame available. Use poll() to avoid. */
        exit(1);

    nlh = (void *)hdr + NL_MMAP_HDRLEN;

    /* Build message */
    build_message(nlh);

    /* Fill frame header: length and status need to be set */
    hdr->nm_len    = nlh->nlmsg_len;
    hdr->nm_status = NL_MMAP_STATUS_VALID;

    if (sendto(fd, NULL, 0, 0, (struct sockaddr *)&addr, sizeof(addr)) < 0)
        exit(1);

    /* Advance frame offset to next frame */
    frame_offset = (frame_offset + frame_size) % ring_size;
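
To illustrate the multi-message case mentioned above, a minimal sketch
reusing the variables and the build_message() routine from the example; the
batch size of four frames is chosen arbitrarily:

    unsigned int i;

    for (i = 0; i < 4; i++) {
        hdr = tx_ring + frame_offset;
        if (hdr->nm_status != NL_MMAP_STATUS_UNUSED)
            break;

        nlh = (void *)hdr + NL_MMAP_HDRLEN;
        build_message(nlh);

        hdr->nm_len    = nlh->nlmsg_len;
        hdr->nm_status = NL_MMAP_STATUS_VALID;

        frame_offset = (frame_offset + frame_size) % ring_size;
    }

    /* A single syscall hands all frames marked valid to the kernel */
    if (sendto(fd, NULL, 0, 0, (struct sockaddr *)&addr, sizeof(addr)) < 0)
        exit(1);
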
diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index ecbb8e495912..cadb7402d7a7 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -29,10 +29,13 @@ extern int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n); | |||
29 | extern int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n); | 29 | extern int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n); |
30 | 30 | ||
31 | extern int nfnetlink_has_listeners(struct net *net, unsigned int group); | 31 | extern int nfnetlink_has_listeners(struct net *net, unsigned int group); |
32 | extern int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, | 32 | extern struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size, |
33 | int echo, gfp_t flags); | 33 | u32 dst_portid, gfp_t gfp_mask); |
34 | extern int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error); | 34 | extern int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, |
35 | extern int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags); | 35 | unsigned int group, int echo, gfp_t flags); |
36 | extern int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error); | ||
37 | extern int nfnetlink_unicast(struct sk_buff *skb, struct net *net, | ||
38 | u32 portid, int flags); | ||
36 | 39 | ||
37 | extern void nfnl_lock(__u8 subsys_id); | 40 | extern void nfnl_lock(__u8 subsys_id); |
38 | extern void nfnl_unlock(__u8 subsys_id); | 41 | extern void nfnl_unlock(__u8 subsys_id); |
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index e0f746b7b95c..6358da5eeee8 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -15,11 +15,18 @@ static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb) | |||
15 | return (struct nlmsghdr *)skb->data; | 15 | return (struct nlmsghdr *)skb->data; |
16 | } | 16 | } |
17 | 17 | ||
18 | enum netlink_skb_flags { | ||
19 | NETLINK_SKB_MMAPED = 0x1, /* Packet data is mmaped */ | ||
20 | NETLINK_SKB_TX = 0x2, /* Packet was sent by userspace */ | ||
21 | NETLINK_SKB_DELIVERED = 0x4, /* Packet was delivered */ | ||
22 | }; | ||
23 | |||
18 | struct netlink_skb_parms { | 24 | struct netlink_skb_parms { |
19 | struct scm_creds creds; /* Skb credentials */ | 25 | struct scm_creds creds; /* Skb credentials */ |
20 | __u32 portid; | 26 | __u32 portid; |
21 | __u32 dst_group; | 27 | __u32 dst_group; |
22 | struct sock *ssk; | 28 | __u32 flags; |
29 | struct sock *sk; | ||
23 | }; | 30 | }; |
24 | 31 | ||
25 | #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) | 32 | #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) |
@@ -57,6 +64,8 @@ extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group) | |||
57 | extern void netlink_clear_multicast_users(struct sock *sk, unsigned int group); | 64 | extern void netlink_clear_multicast_users(struct sock *sk, unsigned int group); |
58 | extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err); | 65 | extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err); |
59 | extern int netlink_has_listeners(struct sock *sk, unsigned int group); | 66 | extern int netlink_has_listeners(struct sock *sk, unsigned int group); |
67 | extern struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, | ||
68 | u32 dst_portid, gfp_t gfp_mask); | ||
60 | extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); | 69 | extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); |
61 | extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, | 70 | extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, |
62 | __u32 group, gfp_t allocation); | 71 | __u32 group, gfp_t allocation); |
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f5bed7b31954..2e0ced1af3b1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -651,6 +651,12 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, | |||
651 | return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE); | 651 | return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE); |
652 | } | 652 | } |
653 | 653 | ||
654 | extern struct sk_buff *__alloc_skb_head(gfp_t priority, int node); | ||
655 | static inline struct sk_buff *alloc_skb_head(gfp_t priority) | ||
656 | { | ||
657 | return __alloc_skb_head(priority, -1); | ||
658 | } | ||
659 | |||
654 | extern struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); | 660 | extern struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); |
655 | extern int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); | 661 | extern int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); |
656 | extern struct sk_buff *skb_clone(struct sk_buff *skb, | 662 | extern struct sk_buff *skb_clone(struct sk_buff *skb, |
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index caca0c4d6b4b..644d9c223d24 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -184,7 +184,7 @@ extern int nf_conntrack_hash_check_insert(struct nf_conn *ct); | |||
184 | extern void nf_ct_delete_from_lists(struct nf_conn *ct); | 184 | extern void nf_ct_delete_from_lists(struct nf_conn *ct); |
185 | extern void nf_ct_dying_timeout(struct nf_conn *ct); | 185 | extern void nf_ct_dying_timeout(struct nf_conn *ct); |
186 | 186 | ||
187 | extern void nf_conntrack_flush_report(struct net *net, u32 pid, int report); | 187 | extern void nf_conntrack_flush_report(struct net *net, u32 portid, int report); |
188 | 188 | ||
189 | extern bool nf_ct_get_tuplepr(const struct sk_buff *skb, | 189 | extern bool nf_ct_get_tuplepr(const struct sk_buff *skb, |
190 | unsigned int nhoff, u_int16_t l3num, | 190 | unsigned int nhoff, u_int16_t l3num, |
diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h
index cbbae7621e22..3f3aecbc8632 100644
--- a/include/net/netfilter/nf_conntrack_expect.h
+++ b/include/net/netfilter/nf_conntrack_expect.h
@@ -88,7 +88,7 @@ nf_ct_find_expectation(struct net *net, u16 zone, | |||
88 | const struct nf_conntrack_tuple *tuple); | 88 | const struct nf_conntrack_tuple *tuple); |
89 | 89 | ||
90 | void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, | 90 | void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, |
91 | u32 pid, int report); | 91 | u32 portid, int report); |
92 | static inline void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) | 92 | static inline void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) |
93 | { | 93 | { |
94 | nf_ct_unlink_expect_report(exp, 0, 0); | 94 | nf_ct_unlink_expect_report(exp, 0, 0); |
@@ -106,7 +106,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *, unsigned int, u_int8_t, | |||
106 | u_int8_t, const __be16 *, const __be16 *); | 106 | u_int8_t, const __be16 *, const __be16 *); |
107 | void nf_ct_expect_put(struct nf_conntrack_expect *exp); | 107 | void nf_ct_expect_put(struct nf_conntrack_expect *exp); |
108 | int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, | 108 | int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, |
109 | u32 pid, int report); | 109 | u32 portid, int report); |
110 | static inline int nf_ct_expect_related(struct nf_conntrack_expect *expect) | 110 | static inline int nf_ct_expect_related(struct nf_conntrack_expect *expect) |
111 | { | 111 | { |
112 | return nf_ct_expect_related_report(expect, 0, 0); | 112 | return nf_ct_expect_related_report(expect, 0, 0); |
diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index 32a354f67ba4..1a85940f8ab7 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -1,6 +1,7 @@ | |||
1 | #ifndef _UAPI__LINUX_NETLINK_H | 1 | #ifndef _UAPI__LINUX_NETLINK_H |
2 | #define _UAPI__LINUX_NETLINK_H | 2 | #define _UAPI__LINUX_NETLINK_H |
3 | 3 | ||
4 | #include <linux/kernel.h> | ||
4 | #include <linux/socket.h> /* for __kernel_sa_family_t */ | 5 | #include <linux/socket.h> /* for __kernel_sa_family_t */ |
5 | #include <linux/types.h> | 6 | #include <linux/types.h> |
6 | 7 | ||
@@ -105,11 +106,42 @@ struct nlmsgerr { | |||
105 | #define NETLINK_PKTINFO 3 | 106 | #define NETLINK_PKTINFO 3 |
106 | #define NETLINK_BROADCAST_ERROR 4 | 107 | #define NETLINK_BROADCAST_ERROR 4 |
107 | #define NETLINK_NO_ENOBUFS 5 | 108 | #define NETLINK_NO_ENOBUFS 5 |
109 | #define NETLINK_RX_RING 6 | ||
110 | #define NETLINK_TX_RING 7 | ||
108 | 111 | ||
109 | struct nl_pktinfo { | 112 | struct nl_pktinfo { |
110 | __u32 group; | 113 | __u32 group; |
111 | }; | 114 | }; |
112 | 115 | ||
116 | struct nl_mmap_req { | ||
117 | unsigned int nm_block_size; | ||
118 | unsigned int nm_block_nr; | ||
119 | unsigned int nm_frame_size; | ||
120 | unsigned int nm_frame_nr; | ||
121 | }; | ||
122 | |||
123 | struct nl_mmap_hdr { | ||
124 | unsigned int nm_status; | ||
125 | unsigned int nm_len; | ||
126 | __u32 nm_group; | ||
127 | /* credentials */ | ||
128 | __u32 nm_pid; | ||
129 | __u32 nm_uid; | ||
130 | __u32 nm_gid; | ||
131 | }; | ||
132 | |||
133 | enum nl_mmap_status { | ||
134 | NL_MMAP_STATUS_UNUSED, | ||
135 | NL_MMAP_STATUS_RESERVED, | ||
136 | NL_MMAP_STATUS_VALID, | ||
137 | NL_MMAP_STATUS_COPY, | ||
138 | NL_MMAP_STATUS_SKIP, | ||
139 | }; | ||
140 | |||
141 | #define NL_MMAP_MSG_ALIGNMENT NLMSG_ALIGNTO | ||
142 | #define NL_MMAP_MSG_ALIGN(sz) __ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT) | ||
143 | #define NL_MMAP_HDRLEN NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr)) | ||
144 | |||
113 | #define NET_MAJOR 36 /* Major 36 is reserved for networking */ | 145 | #define NET_MAJOR 36 /* Major 36 is reserved for networking */ |
114 | 146 | ||
115 | enum { | 147 | enum { |
diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h
index 88009a31cd06..4e31db4eea41 100644
--- a/include/uapi/linux/netlink_diag.h
+++ b/include/uapi/linux/netlink_diag.h
@@ -25,9 +25,18 @@ struct netlink_diag_msg { | |||
25 | __u32 ndiag_cookie[2]; | 25 | __u32 ndiag_cookie[2]; |
26 | }; | 26 | }; |
27 | 27 | ||
28 | struct netlink_diag_ring { | ||
29 | __u32 ndr_block_size; | ||
30 | __u32 ndr_block_nr; | ||
31 | __u32 ndr_frame_size; | ||
32 | __u32 ndr_frame_nr; | ||
33 | }; | ||
34 | |||
28 | enum { | 35 | enum { |
29 | NETLINK_DIAG_MEMINFO, | 36 | NETLINK_DIAG_MEMINFO, |
30 | NETLINK_DIAG_GROUPS, | 37 | NETLINK_DIAG_GROUPS, |
38 | NETLINK_DIAG_RX_RING, | ||
39 | NETLINK_DIAG_TX_RING, | ||
31 | 40 | ||
32 | __NETLINK_DIAG_MAX, | 41 | __NETLINK_DIAG_MAX, |
33 | }; | 42 | }; |
@@ -38,5 +47,6 @@ enum { | |||
38 | 47 | ||
39 | #define NDIAG_SHOW_MEMINFO 0x00000001 /* show memory info of a socket */ | 48 | #define NDIAG_SHOW_MEMINFO 0x00000001 /* show memory info of a socket */ |
40 | #define NDIAG_SHOW_GROUPS 0x00000002 /* show groups of a netlink socket */ | 49 | #define NDIAG_SHOW_GROUPS 0x00000002 /* show groups of a netlink socket */ |
50 | #define NDIAG_SHOW_RING_CFG 0x00000004 /* show ring configuration */ | ||
41 | 51 | ||
42 | #endif | 52 | #endif |
diff --git a/net/Kconfig b/net/Kconfig
index 2ddc9046868e..1a2221630e6a 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -23,6 +23,15 @@ menuconfig NET | |||
23 | 23 | ||
24 | if NET | 24 | if NET |
25 | 25 | ||
26 | config NETLINK_MMAP | ||
27 | bool "Netlink: mmaped IO" | ||
28 | help | ||
29 | This option enables support for memory mapped netlink IO. This | ||
30 | reduces overhead by avoiding copying data between kernel- and | ||
31 | userspace. | ||
32 | |||
33 | If unsure, say N. | ||
34 | |||
26 | config WANT_COMPAT_NETLINK_MESSAGES | 35 | config WANT_COMPAT_NETLINK_MESSAGES |
27 | bool | 36 | bool |
28 | help | 37 | help |
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a92d9e7d10f7..898cf5c566f9 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -179,6 +179,33 @@ out: | |||
179 | * | 179 | * |
180 | */ | 180 | */ |
181 | 181 | ||
182 | struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node) | ||
183 | { | ||
184 | struct sk_buff *skb; | ||
185 | |||
186 | /* Get the HEAD */ | ||
187 | skb = kmem_cache_alloc_node(skbuff_head_cache, | ||
188 | gfp_mask & ~__GFP_DMA, node); | ||
189 | if (!skb) | ||
190 | goto out; | ||
191 | |||
192 | /* | ||
193 | * Only clear those fields we need to clear, not those that we will | ||
194 | * actually initialise below. Hence, don't put any more fields after | ||
195 | * the tail pointer in struct sk_buff! | ||
196 | */ | ||
197 | memset(skb, 0, offsetof(struct sk_buff, tail)); | ||
198 | skb->data = NULL; | ||
199 | skb->truesize = sizeof(struct sk_buff); | ||
200 | atomic_set(&skb->users, 1); | ||
201 | |||
202 | #ifdef NET_SKBUFF_DATA_USES_OFFSET | ||
203 | skb->mac_header = ~0U; | ||
204 | #endif | ||
205 | out: | ||
206 | return skb; | ||
207 | } | ||
208 | |||
182 | /** | 209 | /** |
183 | * __alloc_skb - allocate a network buffer | 210 | * __alloc_skb - allocate a network buffer |
184 | * @size: size to allocate | 211 | * @size: size to allocate |
@@ -584,7 +611,8 @@ static void skb_release_head_state(struct sk_buff *skb) | |||
584 | static void skb_release_all(struct sk_buff *skb) | 611 | static void skb_release_all(struct sk_buff *skb) |
585 | { | 612 | { |
586 | skb_release_head_state(skb); | 613 | skb_release_head_state(skb); |
587 | skb_release_data(skb); | 614 | if (likely(skb->data)) |
615 | skb_release_data(skb); | ||
588 | } | 616 | } |
589 | 617 | ||
590 | /** | 618 | /** |
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 8620408af574..5f648751fce2 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -324,7 +324,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s | |||
324 | } | 324 | } |
325 | 325 | ||
326 | err = sk_diag_fill(sk, rep, req, | 326 | err = sk_diag_fill(sk, rep, req, |
327 | sk_user_ns(NETLINK_CB(in_skb).ssk), | 327 | sk_user_ns(NETLINK_CB(in_skb).sk), |
328 | NETLINK_CB(in_skb).portid, | 328 | NETLINK_CB(in_skb).portid, |
329 | nlh->nlmsg_seq, 0, nlh); | 329 | nlh->nlmsg_seq, 0, nlh); |
330 | if (err < 0) { | 330 | if (err < 0) { |
@@ -630,7 +630,7 @@ static int inet_csk_diag_dump(struct sock *sk, | |||
630 | return 0; | 630 | return 0; |
631 | 631 | ||
632 | return inet_csk_diag_fill(sk, skb, r, | 632 | return inet_csk_diag_fill(sk, skb, r, |
633 | sk_user_ns(NETLINK_CB(cb->skb).ssk), | 633 | sk_user_ns(NETLINK_CB(cb->skb).sk), |
634 | NETLINK_CB(cb->skb).portid, | 634 | NETLINK_CB(cb->skb).portid, |
635 | cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); | 635 | cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); |
636 | } | 636 | } |
@@ -805,7 +805,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, | |||
805 | } | 805 | } |
806 | 806 | ||
807 | err = inet_diag_fill_req(skb, sk, req, | 807 | err = inet_diag_fill_req(skb, sk, req, |
808 | sk_user_ns(NETLINK_CB(cb->skb).ssk), | 808 | sk_user_ns(NETLINK_CB(cb->skb).sk), |
809 | NETLINK_CB(cb->skb).portid, | 809 | NETLINK_CB(cb->skb).portid, |
810 | cb->nlh->nlmsg_seq, cb->nlh); | 810 | cb->nlh->nlmsg_seq, cb->nlh); |
811 | if (err < 0) { | 811 | if (err < 0) { |
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 369a781851ad..7927db0a9279 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -25,7 +25,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, | |||
25 | return 0; | 25 | return 0; |
26 | 26 | ||
27 | return inet_sk_diag_fill(sk, NULL, skb, req, | 27 | return inet_sk_diag_fill(sk, NULL, skb, req, |
28 | sk_user_ns(NETLINK_CB(cb->skb).ssk), | 28 | sk_user_ns(NETLINK_CB(cb->skb).sk), |
29 | NETLINK_CB(cb->skb).portid, | 29 | NETLINK_CB(cb->skb).portid, |
30 | cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); | 30 | cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); |
31 | } | 31 | } |
@@ -71,7 +71,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, | |||
71 | goto out; | 71 | goto out; |
72 | 72 | ||
73 | err = inet_sk_diag_fill(sk, NULL, rep, req, | 73 | err = inet_sk_diag_fill(sk, NULL, rep, req, |
74 | sk_user_ns(NETLINK_CB(in_skb).ssk), | 74 | sk_user_ns(NETLINK_CB(in_skb).sk), |
75 | NETLINK_CB(in_skb).portid, | 75 | NETLINK_CB(in_skb).portid, |
76 | nlh->nlmsg_seq, 0, nlh); | 76 | nlh->nlmsg_seq, 0, nlh); |
77 | if (err < 0) { | 77 | if (err < 0) { |
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 007e8c43d19a..54ddc2f8e7c9 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1260,7 +1260,7 @@ void nf_ct_iterate_cleanup(struct net *net, | |||
1260 | EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup); | 1260 | EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup); |
1261 | 1261 | ||
1262 | struct __nf_ct_flush_report { | 1262 | struct __nf_ct_flush_report { |
1263 | u32 pid; | 1263 | u32 portid; |
1264 | int report; | 1264 | int report; |
1265 | }; | 1265 | }; |
1266 | 1266 | ||
@@ -1275,7 +1275,7 @@ static int kill_report(struct nf_conn *i, void *data) | |||
1275 | 1275 | ||
1276 | /* If we fail to deliver the event, death_by_timeout() will retry */ | 1276 | /* If we fail to deliver the event, death_by_timeout() will retry */ |
1277 | if (nf_conntrack_event_report(IPCT_DESTROY, i, | 1277 | if (nf_conntrack_event_report(IPCT_DESTROY, i, |
1278 | fr->pid, fr->report) < 0) | 1278 | fr->portid, fr->report) < 0) |
1279 | return 1; | 1279 | return 1; |
1280 | 1280 | ||
1281 | /* Avoid the delivery of the destroy event in death_by_timeout(). */ | 1281 | /* Avoid the delivery of the destroy event in death_by_timeout(). */ |
@@ -1298,10 +1298,10 @@ void nf_ct_free_hashtable(void *hash, unsigned int size) | |||
1298 | } | 1298 | } |
1299 | EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); | 1299 | EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); |
1300 | 1300 | ||
1301 | void nf_conntrack_flush_report(struct net *net, u32 pid, int report) | 1301 | void nf_conntrack_flush_report(struct net *net, u32 portid, int report) |
1302 | { | 1302 | { |
1303 | struct __nf_ct_flush_report fr = { | 1303 | struct __nf_ct_flush_report fr = { |
1304 | .pid = pid, | 1304 | .portid = portid, |
1305 | .report = report, | 1305 | .report = report, |
1306 | }; | 1306 | }; |
1307 | nf_ct_iterate_cleanup(net, kill_report, &fr); | 1307 | nf_ct_iterate_cleanup(net, kill_report, &fr); |
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 8c10e3db3d9b..0adfdcc68bae 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -40,7 +40,7 @@ static struct kmem_cache *nf_ct_expect_cachep __read_mostly; | |||
40 | 40 | ||
41 | /* nf_conntrack_expect helper functions */ | 41 | /* nf_conntrack_expect helper functions */ |
42 | void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, | 42 | void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, |
43 | u32 pid, int report) | 43 | u32 portid, int report) |
44 | { | 44 | { |
45 | struct nf_conn_help *master_help = nfct_help(exp->master); | 45 | struct nf_conn_help *master_help = nfct_help(exp->master); |
46 | struct net *net = nf_ct_exp_net(exp); | 46 | struct net *net = nf_ct_exp_net(exp); |
@@ -54,7 +54,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, | |||
54 | hlist_del(&exp->lnode); | 54 | hlist_del(&exp->lnode); |
55 | master_help->expecting[exp->class]--; | 55 | master_help->expecting[exp->class]--; |
56 | 56 | ||
57 | nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report); | 57 | nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report); |
58 | nf_ct_expect_put(exp); | 58 | nf_ct_expect_put(exp); |
59 | 59 | ||
60 | NF_CT_STAT_INC(net, expect_delete); | 60 | NF_CT_STAT_INC(net, expect_delete); |
@@ -412,7 +412,7 @@ out: | |||
412 | } | 412 | } |
413 | 413 | ||
414 | int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, | 414 | int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, |
415 | u32 pid, int report) | 415 | u32 portid, int report) |
416 | { | 416 | { |
417 | int ret; | 417 | int ret; |
418 | 418 | ||
@@ -425,7 +425,7 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, | |||
425 | if (ret < 0) | 425 | if (ret < 0) |
426 | goto out; | 426 | goto out; |
427 | spin_unlock_bh(&nf_conntrack_lock); | 427 | spin_unlock_bh(&nf_conntrack_lock); |
428 | nf_ct_expect_event_report(IPEXP_NEW, expect, pid, report); | 428 | nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); |
429 | return ret; | 429 | return ret; |
430 | out: | 430 | out: |
431 | spin_unlock_bh(&nf_conntrack_lock); | 431 | spin_unlock_bh(&nf_conntrack_lock); |
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index bc4c499adb13..572d87dc116f 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -112,22 +112,30 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group) | |||
112 | } | 112 | } |
113 | EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); | 113 | EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); |
114 | 114 | ||
115 | int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, | 115 | struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size, |
116 | u32 dst_portid, gfp_t gfp_mask) | ||
117 | { | ||
118 | return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask); | ||
119 | } | ||
120 | EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb); | ||
121 | |||
122 | int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, | ||
116 | unsigned int group, int echo, gfp_t flags) | 123 | unsigned int group, int echo, gfp_t flags) |
117 | { | 124 | { |
118 | return nlmsg_notify(net->nfnl, skb, pid, group, echo, flags); | 125 | return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags); |
119 | } | 126 | } |
120 | EXPORT_SYMBOL_GPL(nfnetlink_send); | 127 | EXPORT_SYMBOL_GPL(nfnetlink_send); |
121 | 128 | ||
122 | int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error) | 129 | int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error) |
123 | { | 130 | { |
124 | return netlink_set_err(net->nfnl, pid, group, error); | 131 | return netlink_set_err(net->nfnl, portid, group, error); |
125 | } | 132 | } |
126 | EXPORT_SYMBOL_GPL(nfnetlink_set_err); | 133 | EXPORT_SYMBOL_GPL(nfnetlink_set_err); |
127 | 134 | ||
128 | int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags) | 135 | int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, |
136 | int flags) | ||
129 | { | 137 | { |
130 | return netlink_unicast(net->nfnl, skb, pid, flags); | 138 | return netlink_unicast(net->nfnl, skb, portid, flags); |
131 | } | 139 | } |
132 | EXPORT_SYMBOL_GPL(nfnetlink_unicast); | 140 | EXPORT_SYMBOL_GPL(nfnetlink_unicast); |
133 | 141 | ||
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 1a0be2af1dd8..d4199eb9b338 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -318,7 +318,7 @@ nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags) | |||
318 | } | 318 | } |
319 | 319 | ||
320 | static struct sk_buff * | 320 | static struct sk_buff * |
321 | nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size) | 321 | nfulnl_alloc_skb(u32 peer_portid, unsigned int inst_size, unsigned int pkt_size) |
322 | { | 322 | { |
323 | struct sk_buff *skb; | 323 | struct sk_buff *skb; |
324 | unsigned int n; | 324 | unsigned int n; |
@@ -327,13 +327,14 @@ nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size) | |||
327 | * message. WARNING: has to be <= 128k due to slab restrictions */ | 327 | * message. WARNING: has to be <= 128k due to slab restrictions */ |
328 | 328 | ||
329 | n = max(inst_size, pkt_size); | 329 | n = max(inst_size, pkt_size); |
330 | skb = alloc_skb(n, GFP_ATOMIC); | 330 | skb = nfnetlink_alloc_skb(&init_net, n, peer_portid, GFP_ATOMIC); |
331 | if (!skb) { | 331 | if (!skb) { |
332 | if (n > pkt_size) { | 332 | if (n > pkt_size) { |
333 | /* try to allocate only as much as we need for current | 333 | /* try to allocate only as much as we need for current |
334 | * packet */ | 334 | * packet */ |
335 | 335 | ||
336 | skb = alloc_skb(pkt_size, GFP_ATOMIC); | 336 | skb = nfnetlink_alloc_skb(&init_net, pkt_size, |
337 | peer_portid, GFP_ATOMIC); | ||
337 | if (!skb) | 338 | if (!skb) |
338 | pr_err("nfnetlink_log: can't even alloc %u bytes\n", | 339 | pr_err("nfnetlink_log: can't even alloc %u bytes\n", |
339 | pkt_size); | 340 | pkt_size); |
@@ -696,7 +697,8 @@ nfulnl_log_packet(u_int8_t pf, | |||
696 | } | 697 | } |
697 | 698 | ||
698 | if (!inst->skb) { | 699 | if (!inst->skb) { |
699 | inst->skb = nfulnl_alloc_skb(inst->nlbufsiz, size); | 700 | inst->skb = nfulnl_alloc_skb(inst->peer_portid, inst->nlbufsiz, |
701 | size); | ||
700 | if (!inst->skb) | 702 | if (!inst->skb) |
701 | goto alloc_failure; | 703 | goto alloc_failure; |
702 | } | 704 | } |
@@ -824,7 +826,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, | |||
824 | 826 | ||
825 | inst = instance_create(net, group_num, | 827 | inst = instance_create(net, group_num, |
826 | NETLINK_CB(skb).portid, | 828 | NETLINK_CB(skb).portid, |
827 | sk_user_ns(NETLINK_CB(skb).ssk)); | 829 | sk_user_ns(NETLINK_CB(skb).sk)); |
828 | if (IS_ERR(inst)) { | 830 | if (IS_ERR(inst)) { |
829 | ret = PTR_ERR(inst); | 831 | ret = PTR_ERR(inst); |
830 | goto out; | 832 | goto out; |
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index 5e280b3e154f..ef3cdb4bfeea 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -339,7 +339,8 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, | |||
339 | if (queue->flags & NFQA_CFG_F_CONNTRACK) | 339 | if (queue->flags & NFQA_CFG_F_CONNTRACK) |
340 | ct = nfqnl_ct_get(entskb, &size, &ctinfo); | 340 | ct = nfqnl_ct_get(entskb, &size, &ctinfo); |
341 | 341 | ||
342 | skb = alloc_skb(size, GFP_ATOMIC); | 342 | skb = nfnetlink_alloc_skb(&init_net, size, queue->peer_portid, |
343 | GFP_ATOMIC); | ||
343 | if (!skb) | 344 | if (!skb) |
344 | return NULL; | 345 | return NULL; |
345 | 346 | ||
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index ce2e0064e7f6..2a3e9ba814c4 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -3,6 +3,7 @@ | |||
3 | * | 3 | * |
4 | * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> | 4 | * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> |
5 | * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> | 5 | * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> |
6 | * Patrick McHardy <kaber@trash.net> | ||
6 | * | 7 | * |
7 | * This program is free software; you can redistribute it and/or | 8 | * This program is free software; you can redistribute it and/or |
8 | * modify it under the terms of the GNU General Public License | 9 | * modify it under the terms of the GNU General Public License |
@@ -55,6 +56,8 @@ | |||
55 | #include <linux/types.h> | 56 | #include <linux/types.h> |
56 | #include <linux/audit.h> | 57 | #include <linux/audit.h> |
57 | #include <linux/mutex.h> | 58 | #include <linux/mutex.h> |
59 | #include <linux/vmalloc.h> | ||
60 | #include <asm/cacheflush.h> | ||
58 | 61 | ||
59 | #include <net/net_namespace.h> | 62 | #include <net/net_namespace.h> |
60 | #include <net/sock.h> | 63 | #include <net/sock.h> |
@@ -68,6 +71,10 @@ struct listeners { | |||
68 | unsigned long masks[0]; | 71 | unsigned long masks[0]; |
69 | }; | 72 | }; |
70 | 73 | ||
74 | /* state bits */ | ||
75 | #define NETLINK_CONGESTED 0x0 | ||
76 | |||
77 | /* flags */ | ||
71 | #define NETLINK_KERNEL_SOCKET 0x1 | 78 | #define NETLINK_KERNEL_SOCKET 0x1 |
72 | #define NETLINK_RECV_PKTINFO 0x2 | 79 | #define NETLINK_RECV_PKTINFO 0x2 |
73 | #define NETLINK_BROADCAST_SEND_ERROR 0x4 | 80 | #define NETLINK_BROADCAST_SEND_ERROR 0x4 |
@@ -84,6 +91,7 @@ EXPORT_SYMBOL_GPL(nl_table); | |||
84 | static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); | 91 | static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); |
85 | 92 | ||
86 | static int netlink_dump(struct sock *sk); | 93 | static int netlink_dump(struct sock *sk); |
94 | static void netlink_skb_destructor(struct sk_buff *skb); | ||
87 | 95 | ||
88 | DEFINE_RWLOCK(nl_table_lock); | 96 | DEFINE_RWLOCK(nl_table_lock); |
89 | EXPORT_SYMBOL_GPL(nl_table_lock); | 97 | EXPORT_SYMBOL_GPL(nl_table_lock); |
@@ -103,6 +111,599 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u | |||
103 | return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask]; | 111 | return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask]; |
104 | } | 112 | } |
105 | 113 | ||
114 | static void netlink_overrun(struct sock *sk) | ||
115 | { | ||
116 | struct netlink_sock *nlk = nlk_sk(sk); | ||
117 | |||
118 | if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { | ||
119 | if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) { | ||
120 | sk->sk_err = ENOBUFS; | ||
121 | sk->sk_error_report(sk); | ||
122 | } | ||
123 | } | ||
124 | atomic_inc(&sk->sk_drops); | ||
125 | } | ||
126 | |||
127 | static void netlink_rcv_wake(struct sock *sk) | ||
128 | { | ||
129 | struct netlink_sock *nlk = nlk_sk(sk); | ||
130 | |||
131 | if (skb_queue_empty(&sk->sk_receive_queue)) | ||
132 | clear_bit(NETLINK_CONGESTED, &nlk->state); | ||
133 | if (!test_bit(NETLINK_CONGESTED, &nlk->state)) | ||
134 | wake_up_interruptible(&nlk->wait); | ||
135 | } | ||
136 | |||
137 | #ifdef CONFIG_NETLINK_MMAP | ||
138 | static bool netlink_skb_is_mmaped(const struct sk_buff *skb) | ||
139 | { | ||
140 | return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; | ||
141 | } | ||
142 | |||
143 | static bool netlink_rx_is_mmaped(struct sock *sk) | ||
144 | { | ||
145 | return nlk_sk(sk)->rx_ring.pg_vec != NULL; | ||
146 | } | ||
147 | |||
148 | static bool netlink_tx_is_mmaped(struct sock *sk) | ||
149 | { | ||
150 | return nlk_sk(sk)->tx_ring.pg_vec != NULL; | ||
151 | } | ||
152 | |||
153 | static __pure struct page *pgvec_to_page(const void *addr) | ||
154 | { | ||
155 | if (is_vmalloc_addr(addr)) | ||
156 | return vmalloc_to_page(addr); | ||
157 | else | ||
158 | return virt_to_page(addr); | ||
159 | } | ||
160 | |||
161 | static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len) | ||
162 | { | ||
163 | unsigned int i; | ||
164 | |||
165 | for (i = 0; i < len; i++) { | ||
166 | if (pg_vec[i] != NULL) { | ||
167 | if (is_vmalloc_addr(pg_vec[i])) | ||
168 | vfree(pg_vec[i]); | ||
169 | else | ||
170 | free_pages((unsigned long)pg_vec[i], order); | ||
171 | } | ||
172 | } | ||
173 | kfree(pg_vec); | ||
174 | } | ||
175 | |||
176 | static void *alloc_one_pg_vec_page(unsigned long order) | ||
177 | { | ||
178 | void *buffer; | ||
179 | gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | | ||
180 | __GFP_NOWARN | __GFP_NORETRY; | ||
181 | |||
182 | buffer = (void *)__get_free_pages(gfp_flags, order); | ||
183 | if (buffer != NULL) | ||
184 | return buffer; | ||
185 | |||
186 | buffer = vzalloc((1 << order) * PAGE_SIZE); | ||
187 | if (buffer != NULL) | ||
188 | return buffer; | ||
189 | |||
190 | gfp_flags &= ~__GFP_NORETRY; | ||
191 | return (void *)__get_free_pages(gfp_flags, order); | ||
192 | } | ||
193 | |||
194 | static void **alloc_pg_vec(struct netlink_sock *nlk, | ||
195 | struct nl_mmap_req *req, unsigned int order) | ||
196 | { | ||
197 | unsigned int block_nr = req->nm_block_nr; | ||
198 | unsigned int i; | ||
199 | void **pg_vec, *ptr; | ||
200 | |||
201 | pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL); | ||
202 | if (pg_vec == NULL) | ||
203 | return NULL; | ||
204 | |||
205 | for (i = 0; i < block_nr; i++) { | ||
206 | pg_vec[i] = ptr = alloc_one_pg_vec_page(order); | ||
207 | if (pg_vec[i] == NULL) | ||
208 | goto err1; | ||
209 | } | ||
210 | |||
211 | return pg_vec; | ||
212 | err1: | ||
213 | free_pg_vec(pg_vec, order, block_nr); | ||
214 | return NULL; | ||
215 | } | ||
216 | |||
217 | static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, | ||
218 | bool closing, bool tx_ring) | ||
219 | { | ||
220 | struct netlink_sock *nlk = nlk_sk(sk); | ||
221 | struct netlink_ring *ring; | ||
222 | struct sk_buff_head *queue; | ||
223 | void **pg_vec = NULL; | ||
224 | unsigned int order = 0; | ||
225 | int err; | ||
226 | |||
227 | ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; | ||
228 | queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; | ||
229 | |||
230 | if (!closing) { | ||
231 | if (atomic_read(&nlk->mapped)) | ||
232 | return -EBUSY; | ||
233 | if (atomic_read(&ring->pending)) | ||
234 | return -EBUSY; | ||
235 | } | ||
236 | |||
237 | if (req->nm_block_nr) { | ||
238 | if (ring->pg_vec != NULL) | ||
239 | return -EBUSY; | ||
240 | |||
241 | if ((int)req->nm_block_size <= 0) | ||
242 | return -EINVAL; | ||
243 | if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE)) | ||
244 | return -EINVAL; | ||
245 | if (req->nm_frame_size < NL_MMAP_HDRLEN) | ||
246 | return -EINVAL; | ||
247 | if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT)) | ||
248 | return -EINVAL; | ||
249 | |||
250 | ring->frames_per_block = req->nm_block_size / | ||
251 | req->nm_frame_size; | ||
252 | if (ring->frames_per_block == 0) | ||
253 | return -EINVAL; | ||
254 | if (ring->frames_per_block * req->nm_block_nr != | ||
255 | req->nm_frame_nr) | ||
256 | return -EINVAL; | ||
257 | |||
258 | order = get_order(req->nm_block_size); | ||
259 | pg_vec = alloc_pg_vec(nlk, req, order); | ||
260 | if (pg_vec == NULL) | ||
261 | return -ENOMEM; | ||
262 | } else { | ||
263 | if (req->nm_frame_nr) | ||
264 | return -EINVAL; | ||
265 | } | ||
266 | |||
267 | err = -EBUSY; | ||
268 | mutex_lock(&nlk->pg_vec_lock); | ||
269 | if (closing || atomic_read(&nlk->mapped) == 0) { | ||
270 | err = 0; | ||
271 | spin_lock_bh(&queue->lock); | ||
272 | |||
273 | ring->frame_max = req->nm_frame_nr - 1; | ||
274 | ring->head = 0; | ||
275 | ring->frame_size = req->nm_frame_size; | ||
276 | ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE; | ||
277 | |||
278 | swap(ring->pg_vec_len, req->nm_block_nr); | ||
279 | swap(ring->pg_vec_order, order); | ||
280 | swap(ring->pg_vec, pg_vec); | ||
281 | |||
282 | __skb_queue_purge(queue); | ||
283 | spin_unlock_bh(&queue->lock); | ||
284 | |||
285 | WARN_ON(atomic_read(&nlk->mapped)); | ||
286 | } | ||
287 | mutex_unlock(&nlk->pg_vec_lock); | ||
288 | |||
289 | if (pg_vec) | ||
290 | free_pg_vec(pg_vec, order, req->nm_block_nr); | ||
291 | return err; | ||
292 | } | ||
293 | |||
294 | static void netlink_mm_open(struct vm_area_struct *vma) | ||
295 | { | ||
296 | struct file *file = vma->vm_file; | ||
297 | struct socket *sock = file->private_data; | ||
298 | struct sock *sk = sock->sk; | ||
299 | |||
300 | if (sk) | ||
301 | atomic_inc(&nlk_sk(sk)->mapped); | ||
302 | } | ||
303 | |||
304 | static void netlink_mm_close(struct vm_area_struct *vma) | ||
305 | { | ||
306 | struct file *file = vma->vm_file; | ||
307 | struct socket *sock = file->private_data; | ||
308 | struct sock *sk = sock->sk; | ||
309 | |||
310 | if (sk) | ||
311 | atomic_dec(&nlk_sk(sk)->mapped); | ||
312 | } | ||
313 | |||
314 | static const struct vm_operations_struct netlink_mmap_ops = { | ||
315 | .open = netlink_mm_open, | ||
316 | .close = netlink_mm_close, | ||
317 | }; | ||
318 | |||
319 | static int netlink_mmap(struct file *file, struct socket *sock, | ||
320 | struct vm_area_struct *vma) | ||
321 | { | ||
322 | struct sock *sk = sock->sk; | ||
323 | struct netlink_sock *nlk = nlk_sk(sk); | ||
324 | struct netlink_ring *ring; | ||
325 | unsigned long start, size, expected; | ||
326 | unsigned int i; | ||
327 | int err = -EINVAL; | ||
328 | |||
329 | if (vma->vm_pgoff) | ||
330 | return -EINVAL; | ||
331 | |||
332 | mutex_lock(&nlk->pg_vec_lock); | ||
333 | |||
334 | expected = 0; | ||
335 | for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { | ||
336 | if (ring->pg_vec == NULL) | ||
337 | continue; | ||
338 | expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE; | ||
339 | } | ||
340 | |||
341 | if (expected == 0) | ||
342 | goto out; | ||
343 | |||
344 | size = vma->vm_end - vma->vm_start; | ||
345 | if (size != expected) | ||
346 | goto out; | ||
347 | |||
348 | start = vma->vm_start; | ||
349 | for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { | ||
350 | if (ring->pg_vec == NULL) | ||
351 | continue; | ||
352 | |||
353 | for (i = 0; i < ring->pg_vec_len; i++) { | ||
354 | struct page *page; | ||
355 | void *kaddr = ring->pg_vec[i]; | ||
356 | unsigned int pg_num; | ||
357 | |||
358 | for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) { | ||
359 | page = pgvec_to_page(kaddr); | ||
360 | err = vm_insert_page(vma, start, page); | ||
361 | if (err < 0) | ||
362 | goto out; | ||
363 | start += PAGE_SIZE; | ||
364 | kaddr += PAGE_SIZE; | ||
365 | } | ||
366 | } | ||
367 | } | ||
368 | |||
369 | atomic_inc(&nlk->mapped); | ||
370 | vma->vm_ops = &netlink_mmap_ops; | ||
371 | err = 0; | ||
372 | out: | ||
373 | mutex_unlock(&nlk->pg_vec_lock); | ||
374 | return err; | ||
375 | } | ||
376 | |||
377 | static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr) | ||
378 | { | ||
379 | #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 | ||
380 | struct page *p_start, *p_end; | ||
381 | |||
382 | /* First page is flushed through netlink_{get,set}_status */ | ||
383 | p_start = pgvec_to_page((void *)hdr + PAGE_SIZE); | ||
384 | p_end = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + hdr->nm_len - 1); | ||
385 | while (p_start <= p_end) { | ||
386 | flush_dcache_page(p_start); | ||
387 | p_start++; | ||
388 | } | ||
389 | #endif | ||
390 | } | ||
391 | |||
392 | static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr) | ||
393 | { | ||
394 | smp_rmb(); | ||
395 | flush_dcache_page(pgvec_to_page(hdr)); | ||
396 | return hdr->nm_status; | ||
397 | } | ||
398 | |||
399 | static void netlink_set_status(struct nl_mmap_hdr *hdr, | ||
400 | enum nl_mmap_status status) | ||
401 | { | ||
402 | hdr->nm_status = status; | ||
403 | flush_dcache_page(pgvec_to_page(hdr)); | ||
404 | smp_wmb(); | ||
405 | } | ||
406 | |||
407 | static struct nl_mmap_hdr * | ||
408 | __netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos) | ||
409 | { | ||
410 | unsigned int pg_vec_pos, frame_off; | ||
411 | |||
412 | pg_vec_pos = pos / ring->frames_per_block; | ||
413 | frame_off = pos % ring->frames_per_block; | ||
414 | |||
415 | return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size); | ||
416 | } | ||
417 | |||
418 | static struct nl_mmap_hdr * | ||
419 | netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos, | ||
420 | enum nl_mmap_status status) | ||
421 | { | ||
422 | struct nl_mmap_hdr *hdr; | ||
423 | |||
424 | hdr = __netlink_lookup_frame(ring, pos); | ||
425 | if (netlink_get_status(hdr) != status) | ||
426 | return NULL; | ||
427 | |||
428 | return hdr; | ||
429 | } | ||
430 | |||
431 | static struct nl_mmap_hdr * | ||
432 | netlink_current_frame(const struct netlink_ring *ring, | ||
433 | enum nl_mmap_status status) | ||
434 | { | ||
435 | return netlink_lookup_frame(ring, ring->head, status); | ||
436 | } | ||
437 | |||
438 | static struct nl_mmap_hdr * | ||
439 | netlink_previous_frame(const struct netlink_ring *ring, | ||
440 | enum nl_mmap_status status) | ||
441 | { | ||
442 | unsigned int prev; | ||
443 | |||
444 | prev = ring->head ? ring->head - 1 : ring->frame_max; | ||
445 | return netlink_lookup_frame(ring, prev, status); | ||
446 | } | ||
447 | |||
448 | static void netlink_increment_head(struct netlink_ring *ring) | ||
449 | { | ||
450 | ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0; | ||
451 | } | ||
452 | |||
453 | static void netlink_forward_ring(struct netlink_ring *ring) | ||
454 | { | ||
455 | unsigned int head = ring->head, pos = head; | ||
456 | const struct nl_mmap_hdr *hdr; | ||
457 | |||
458 | do { | ||
459 | hdr = __netlink_lookup_frame(ring, pos); | ||
460 | if (hdr->nm_status == NL_MMAP_STATUS_UNUSED) | ||
461 | break; | ||
462 | if (hdr->nm_status != NL_MMAP_STATUS_SKIP) | ||
463 | break; | ||
464 | netlink_increment_head(ring); | ||
465 | } while (ring->head != head); | ||
466 | } | ||
467 | |||
468 | static bool netlink_dump_space(struct netlink_sock *nlk) | ||
469 | { | ||
470 | struct netlink_ring *ring = &nlk->rx_ring; | ||
471 | struct nl_mmap_hdr *hdr; | ||
472 | unsigned int n; | ||
473 | |||
474 | hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); | ||
475 | if (hdr == NULL) | ||
476 | return false; | ||
477 | |||
478 | n = ring->head + ring->frame_max / 2; | ||
479 | if (n > ring->frame_max) | ||
480 | n -= ring->frame_max; | ||
481 | |||
482 | hdr = __netlink_lookup_frame(ring, n); | ||
483 | |||
484 | return hdr->nm_status == NL_MMAP_STATUS_UNUSED; | ||
485 | } | ||
486 | |||
487 | static unsigned int netlink_poll(struct file *file, struct socket *sock, | ||
488 | poll_table *wait) | ||
489 | { | ||
490 | struct sock *sk = sock->sk; | ||
491 | struct netlink_sock *nlk = nlk_sk(sk); | ||
492 | unsigned int mask; | ||
493 | int err; | ||
494 | |||
495 | if (nlk->rx_ring.pg_vec != NULL) { | ||
496 | /* Memory mapped sockets don't call recvmsg(), so flow control | ||
497 | * for dumps is performed here. A dump is allowed to continue | ||
498 | * if at least half the ring is unused. | ||
499 | */ | ||
500 | while (nlk->cb != NULL && netlink_dump_space(nlk)) { | ||
501 | err = netlink_dump(sk); | ||
502 | if (err < 0) { | ||
503 | sk->sk_err = err; | ||
504 | sk->sk_error_report(sk); | ||
505 | break; | ||
506 | } | ||
507 | } | ||
508 | netlink_rcv_wake(sk); | ||
509 | } | ||
510 | |||
511 | mask = datagram_poll(file, sock, wait); | ||
512 | |||
513 | spin_lock_bh(&sk->sk_receive_queue.lock); | ||
514 | if (nlk->rx_ring.pg_vec) { | ||
515 | netlink_forward_ring(&nlk->rx_ring); | ||
516 | if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED)) | ||
517 | mask |= POLLIN | POLLRDNORM; | ||
518 | } | ||
519 | spin_unlock_bh(&sk->sk_receive_queue.lock); | ||
520 | |||
521 | spin_lock_bh(&sk->sk_write_queue.lock); | ||
522 | if (nlk->tx_ring.pg_vec) { | ||
523 | if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED)) | ||
524 | mask |= POLLOUT | POLLWRNORM; | ||
525 | } | ||
526 | spin_unlock_bh(&sk->sk_write_queue.lock); | ||
527 | |||
528 | return mask; | ||
529 | } | ||
530 | |||
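The POLLIN bit set above means at least one frame has left the unused state. The
poll()-driven consumer this is designed for would look roughly as follows; this is
a sketch, where handle_msg(), FRAME_SIZE and FRAME_NR are placeholders that must
match the application's ring setup, not kernel definitions:

    #include <poll.h>
    #include <linux/netlink.h>

    #define FRAME_SIZE 16384        /* must match nm_frame_size from setup */
    #define FRAME_NR   64           /* must match nm_frame_nr from setup   */

    extern void handle_msg(const void *data, unsigned int len);  /* placeholder */

    static void rx_loop(int fd, void *ring)
    {
            struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLERR };
            unsigned int head = 0;

            for (;;) {
                    struct nl_mmap_hdr *hdr = ring + head * FRAME_SIZE;

                    switch (hdr->nm_status) {
                    case NL_MMAP_STATUS_UNUSED:
                            poll(&pfd, 1, -1);      /* wait for the kernel */
                            continue;
                    case NL_MMAP_STATUS_VALID:
                            handle_msg((void *)hdr + NL_MMAP_HDRLEN, hdr->nm_len);
                            break;
                    default:        /* e.g. NL_MMAP_STATUS_COPY, see below */
                            break;
                    }
                    hdr->nm_status = NL_MMAP_STATUS_UNUSED; /* return the frame */
                    head = (head + 1) % FRAME_NR;
            }
    }
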
531 | static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb) | ||
532 | { | ||
533 | return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN); | ||
534 | } | ||
535 | |||
536 | static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk, | ||
537 | struct netlink_ring *ring, | ||
538 | struct nl_mmap_hdr *hdr) | ||
539 | { | ||
540 | unsigned int size; | ||
541 | void *data; | ||
542 | |||
543 | size = ring->frame_size - NL_MMAP_HDRLEN; | ||
544 | data = (void *)hdr + NL_MMAP_HDRLEN; | ||
545 | |||
546 | skb->head = data; | ||
547 | skb->data = data; | ||
548 | skb_reset_tail_pointer(skb); | ||
549 | skb->end = skb->tail + size; | ||
550 | skb->len = 0; | ||
551 | |||
552 | skb->destructor = netlink_skb_destructor; | ||
553 | NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED; | ||
554 | NETLINK_CB(skb).sk = sk; | ||
555 | } | ||
556 | |||
557 | static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg, | ||
558 | u32 dst_portid, u32 dst_group, | ||
559 | struct sock_iocb *siocb) | ||
560 | { | ||
561 | struct netlink_sock *nlk = nlk_sk(sk); | ||
562 | struct netlink_ring *ring; | ||
563 | struct nl_mmap_hdr *hdr; | ||
564 | struct sk_buff *skb; | ||
565 | unsigned int maxlen; | ||
566 | bool excl = true; | ||
567 | int err = 0, len = 0; | ||
568 | |||
569 | /* Netlink messages are validated by the receiver before processing. | ||
570 | * In order to avoid userspace changing the contents of the message | ||
571 | * after validation, the socket and the ring may only be used by a | ||

572 | * single process; otherwise we fall back to copying. | ||
573 | */ | ||
574 | if (atomic_long_read(&sk->sk_socket->file->f_count) > 2 || | ||
575 | atomic_read(&nlk->mapped) > 1) | ||
576 | excl = false; | ||
577 | |||
578 | mutex_lock(&nlk->pg_vec_lock); | ||
579 | |||
580 | ring = &nlk->tx_ring; | ||
581 | maxlen = ring->frame_size - NL_MMAP_HDRLEN; | ||
582 | |||
583 | do { | ||
584 | hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID); | ||
585 | if (hdr == NULL) { | ||
586 | if (!(msg->msg_flags & MSG_DONTWAIT) && | ||
587 | atomic_read(&nlk->tx_ring.pending)) | ||
588 | schedule(); | ||
589 | continue; | ||
590 | } | ||
591 | if (hdr->nm_len > maxlen) { | ||
592 | err = -EINVAL; | ||
593 | goto out; | ||
594 | } | ||
595 | |||
596 | netlink_frame_flush_dcache(hdr); | ||
597 | |||
598 | if (likely(dst_portid == 0 && dst_group == 0 && excl)) { | ||
599 | skb = alloc_skb_head(GFP_KERNEL); | ||
600 | if (skb == NULL) { | ||
601 | err = -ENOBUFS; | ||
602 | goto out; | ||
603 | } | ||
604 | sock_hold(sk); | ||
605 | netlink_ring_setup_skb(skb, sk, ring, hdr); | ||
606 | NETLINK_CB(skb).flags |= NETLINK_SKB_TX; | ||
607 | __skb_put(skb, hdr->nm_len); | ||
608 | netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); | ||
609 | atomic_inc(&ring->pending); | ||
610 | } else { | ||
611 | skb = alloc_skb(hdr->nm_len, GFP_KERNEL); | ||
612 | if (skb == NULL) { | ||
613 | err = -ENOBUFS; | ||
614 | goto out; | ||
615 | } | ||
616 | __skb_put(skb, hdr->nm_len); | ||
617 | memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, hdr->nm_len); | ||
618 | netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); | ||
619 | } | ||
620 | |||
621 | netlink_increment_head(ring); | ||
622 | |||
623 | NETLINK_CB(skb).portid = nlk->portid; | ||
624 | NETLINK_CB(skb).dst_group = dst_group; | ||
625 | NETLINK_CB(skb).creds = siocb->scm->creds; | ||
626 | |||
627 | err = security_netlink_send(sk, skb); | ||
628 | if (err) { | ||
629 | kfree_skb(skb); | ||
630 | goto out; | ||
631 | } | ||
632 | |||
633 | if (unlikely(dst_group)) { | ||
634 | atomic_inc(&skb->users); | ||
635 | netlink_broadcast(sk, skb, dst_portid, dst_group, | ||
636 | GFP_KERNEL); | ||
637 | } | ||
638 | err = netlink_unicast(sk, skb, dst_portid, | ||
639 | msg->msg_flags & MSG_DONTWAIT); | ||
640 | if (err < 0) | ||
641 | goto out; | ||
642 | len += err; | ||
643 | |||
644 | } while (hdr != NULL || | ||
645 | (!(msg->msg_flags & MSG_DONTWAIT) && | ||
646 | atomic_read(&nlk->tx_ring.pending))); | ||
647 | |||
648 | if (len > 0) | ||
649 | err = len; | ||
650 | out: | ||
651 | mutex_unlock(&nlk->pg_vec_lock); | ||
652 | return err; | ||
653 | } | ||
654 | |||
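As netlink_sendmsg() further down shows, this loop is entered when sendmsg() is
called with a NULL iov base. The matching userspace transmit step, sketched under
the same placeholder names as the RX example above:

    #include <string.h>
    #include <sys/socket.h>
    #include <linux/netlink.h>

    /* Sketch: place one message into the current TX frame and kick the kernel. */
    static int tx_one(int fd, void *tx_ring, unsigned int head,
                      const void *msg, unsigned int len)
    {
            struct nl_mmap_hdr *hdr = tx_ring + head * FRAME_SIZE;
            struct sockaddr_nl addr = { .nl_family = AF_NETLINK };

            if (hdr->nm_status != NL_MMAP_STATUS_UNUSED)
                    return -1;      /* ring full, caller may poll() for POLLOUT */

            memcpy((void *)hdr + NL_MMAP_HDRLEN, msg, len);
            hdr->nm_len = len;
            hdr->nm_status = NL_MMAP_STATUS_VALID;  /* hand the frame over */

            /* a NULL buffer tells the kernel to process the TX ring */
            return sendto(fd, NULL, 0, 0, (struct sockaddr *)&addr, sizeof(addr));
    }
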
655 | static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb) | ||
656 | { | ||
657 | struct nl_mmap_hdr *hdr; | ||
658 | |||
659 | hdr = netlink_mmap_hdr(skb); | ||
660 | hdr->nm_len = skb->len; | ||
661 | hdr->nm_group = NETLINK_CB(skb).dst_group; | ||
662 | hdr->nm_pid = NETLINK_CB(skb).creds.pid; | ||
663 | hdr->nm_uid = NETLINK_CB(skb).creds.uid; | ||
664 | hdr->nm_gid = NETLINK_CB(skb).creds.gid; | ||
665 | netlink_frame_flush_dcache(hdr); | ||
666 | netlink_set_status(hdr, NL_MMAP_STATUS_VALID); | ||
667 | |||
668 | NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED; | ||
669 | kfree_skb(skb); | ||
670 | } | ||
671 | |||
672 | static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb) | ||
673 | { | ||
674 | struct netlink_sock *nlk = nlk_sk(sk); | ||
675 | struct netlink_ring *ring = &nlk->rx_ring; | ||
676 | struct nl_mmap_hdr *hdr; | ||
677 | |||
678 | spin_lock_bh(&sk->sk_receive_queue.lock); | ||
679 | hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); | ||
680 | if (hdr == NULL) { | ||
681 | spin_unlock_bh(&sk->sk_receive_queue.lock); | ||
682 | kfree_skb(skb); | ||
683 | netlink_overrun(sk); | ||
684 | return; | ||
685 | } | ||
686 | netlink_increment_head(ring); | ||
687 | __skb_queue_tail(&sk->sk_receive_queue, skb); | ||
688 | spin_unlock_bh(&sk->sk_receive_queue.lock); | ||
689 | |||
690 | hdr->nm_len = skb->len; | ||
691 | hdr->nm_group = NETLINK_CB(skb).dst_group; | ||
692 | hdr->nm_pid = NETLINK_CB(skb).creds.pid; | ||
693 | hdr->nm_uid = NETLINK_CB(skb).creds.uid; | ||
694 | hdr->nm_gid = NETLINK_CB(skb).creds.gid; | ||
695 | netlink_set_status(hdr, NL_MMAP_STATUS_COPY); | ||
696 | } | ||
697 | |||
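netlink_ring_set_copied() is what the RX loop's NL_MMAP_STATUS_COPY case corresponds
to: the frame carries only the metadata, while the message itself sits in the regular
receive queue. A hedged sketch of the userspace fallback (handle_msg() again a
placeholder):

    #include <sys/socket.h>
    #include <linux/netlink.h>

    extern void handle_msg(const void *data, unsigned int len);  /* placeholder */

    /* Sketch: drain the copied message via the regular receive queue. */
    static void rx_copy_fallback(int fd, struct nl_mmap_hdr *hdr)
    {
            char buf[65536];
            ssize_t len = recv(fd, buf, sizeof(buf), 0);

            if (len > 0)
                    handle_msg(buf, len);
            hdr->nm_status = NL_MMAP_STATUS_UNUSED; /* release the frame */
    }
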
698 | #else /* CONFIG_NETLINK_MMAP */ | ||
699 | #define netlink_skb_is_mmaped(skb) false | ||
700 | #define netlink_rx_is_mmaped(sk) false | ||
701 | #define netlink_tx_is_mmaped(sk) false | ||
702 | #define netlink_mmap sock_no_mmap | ||
703 | #define netlink_poll datagram_poll | ||
704 | #define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, siocb) 0 | ||
705 | #endif /* CONFIG_NETLINK_MMAP */ | ||
706 | |||
106 | static void netlink_destroy_callback(struct netlink_callback *cb) | 707 | static void netlink_destroy_callback(struct netlink_callback *cb) |
107 | { | 708 | { |
108 | kfree_skb(cb->skb); | 709 | kfree_skb(cb->skb); |
@@ -115,6 +716,53 @@ static void netlink_consume_callback(struct netlink_callback *cb) | |||
115 | kfree(cb); | 716 | kfree(cb); |
116 | } | 717 | } |
117 | 718 | ||
719 | static void netlink_skb_destructor(struct sk_buff *skb) | ||
720 | { | ||
721 | #ifdef CONFIG_NETLINK_MMAP | ||
722 | struct nl_mmap_hdr *hdr; | ||
723 | struct netlink_ring *ring; | ||
724 | struct sock *sk; | ||
725 | |||
726 | /* If a packet from the kernel to userspace was freed because of an | ||
727 | * error without being delivered to userspace, the kernel must reset | ||
728 | * the status. For packets from userspace to the kernel, the status | ||
729 | * is always reset here after the packet has been processed and freed. | ||
730 | */ | ||
731 | if (netlink_skb_is_mmaped(skb)) { | ||
732 | hdr = netlink_mmap_hdr(skb); | ||
733 | sk = NETLINK_CB(skb).sk; | ||
734 | |||
735 | if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) { | ||
736 | netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); | ||
737 | ring = &nlk_sk(sk)->tx_ring; | ||
738 | } else { | ||
739 | if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) { | ||
740 | hdr->nm_len = 0; | ||
741 | netlink_set_status(hdr, NL_MMAP_STATUS_VALID); | ||
742 | } | ||
743 | ring = &nlk_sk(sk)->rx_ring; | ||
744 | } | ||
745 | |||
746 | WARN_ON(atomic_read(&ring->pending) == 0); | ||
747 | atomic_dec(&ring->pending); | ||
748 | sock_put(sk); | ||
749 | |||
750 | skb->data = NULL; | ||
751 | } | ||
752 | #endif | ||
753 | if (skb->sk != NULL) | ||
754 | sock_rfree(skb); | ||
755 | } | ||
756 | |||
757 | static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) | ||
758 | { | ||
759 | WARN_ON(skb->sk != NULL); | ||
760 | skb->sk = sk; | ||
761 | skb->destructor = netlink_skb_destructor; | ||
762 | atomic_add(skb->truesize, &sk->sk_rmem_alloc); | ||
763 | sk_mem_charge(sk, skb->truesize); | ||
764 | } | ||
765 | |||
118 | static void netlink_sock_destruct(struct sock *sk) | 766 | static void netlink_sock_destruct(struct sock *sk) |
119 | { | 767 | { |
120 | struct netlink_sock *nlk = nlk_sk(sk); | 768 | struct netlink_sock *nlk = nlk_sk(sk); |
@@ -128,6 +776,18 @@ static void netlink_sock_destruct(struct sock *sk) | |||
128 | } | 776 | } |
129 | 777 | ||
130 | skb_queue_purge(&sk->sk_receive_queue); | 778 | skb_queue_purge(&sk->sk_receive_queue); |
779 | #ifdef CONFIG_NETLINK_MMAP | ||
780 | if (1) { | ||
781 | struct nl_mmap_req req; | ||
782 | |||
783 | memset(&req, 0, sizeof(req)); | ||
784 | if (nlk->rx_ring.pg_vec) | ||
785 | netlink_set_ring(sk, &req, true, false); | ||
786 | memset(&req, 0, sizeof(req)); | ||
787 | if (nlk->tx_ring.pg_vec) | ||
788 | netlink_set_ring(sk, &req, true, true); | ||
789 | } | ||
790 | #endif /* CONFIG_NETLINK_MMAP */ | ||
131 | 791 | ||
132 | if (!sock_flag(sk, SOCK_DEAD)) { | 792 | if (!sock_flag(sk, SOCK_DEAD)) { |
133 | printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); | 793 | printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); |
@@ -391,6 +1051,9 @@ static int __netlink_create(struct net *net, struct socket *sock, | |||
391 | mutex_init(nlk->cb_mutex); | 1051 | mutex_init(nlk->cb_mutex); |
392 | } | 1052 | } |
393 | init_waitqueue_head(&nlk->wait); | 1053 | init_waitqueue_head(&nlk->wait); |
1054 | #ifdef CONFIG_NETLINK_MMAP | ||
1055 | mutex_init(&nlk->pg_vec_lock); | ||
1056 | #endif | ||
394 | 1057 | ||
395 | sk->sk_destruct = netlink_sock_destruct; | 1058 | sk->sk_destruct = netlink_sock_destruct; |
396 | sk->sk_protocol = protocol; | 1059 | sk->sk_protocol = protocol; |
@@ -722,19 +1385,6 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, | |||
722 | return 0; | 1385 | return 0; |
723 | } | 1386 | } |
724 | 1387 | ||
725 | static void netlink_overrun(struct sock *sk) | ||
726 | { | ||
727 | struct netlink_sock *nlk = nlk_sk(sk); | ||
728 | |||
729 | if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { | ||
730 | if (!test_and_set_bit(0, &nlk_sk(sk)->state)) { | ||
731 | sk->sk_err = ENOBUFS; | ||
732 | sk->sk_error_report(sk); | ||
733 | } | ||
734 | } | ||
735 | atomic_inc(&sk->sk_drops); | ||
736 | } | ||
737 | |||
738 | static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) | 1388 | static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) |
739 | { | 1389 | { |
740 | struct sock *sock; | 1390 | struct sock *sock; |
@@ -787,8 +1437,9 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, | |||
787 | 1437 | ||
788 | nlk = nlk_sk(sk); | 1438 | nlk = nlk_sk(sk); |
789 | 1439 | ||
790 | if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || | 1440 | if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || |
791 | test_bit(0, &nlk->state)) { | 1441 | test_bit(NETLINK_CONGESTED, &nlk->state)) && |
1442 | !netlink_skb_is_mmaped(skb)) { | ||
792 | DECLARE_WAITQUEUE(wait, current); | 1443 | DECLARE_WAITQUEUE(wait, current); |
793 | if (!*timeo) { | 1444 | if (!*timeo) { |
794 | if (!ssk || netlink_is_kernel(ssk)) | 1445 | if (!ssk || netlink_is_kernel(ssk)) |
@@ -802,7 +1453,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, | |||
802 | add_wait_queue(&nlk->wait, &wait); | 1453 | add_wait_queue(&nlk->wait, &wait); |
803 | 1454 | ||
804 | if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || | 1455 | if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || |
805 | test_bit(0, &nlk->state)) && | 1456 | test_bit(NETLINK_CONGESTED, &nlk->state)) && |
806 | !sock_flag(sk, SOCK_DEAD)) | 1457 | !sock_flag(sk, SOCK_DEAD)) |
807 | *timeo = schedule_timeout(*timeo); | 1458 | *timeo = schedule_timeout(*timeo); |
808 | 1459 | ||
@@ -816,7 +1467,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, | |||
816 | } | 1467 | } |
817 | return 1; | 1468 | return 1; |
818 | } | 1469 | } |
819 | skb_set_owner_r(skb, sk); | 1470 | netlink_skb_set_owner_r(skb, sk); |
820 | return 0; | 1471 | return 0; |
821 | } | 1472 | } |
822 | 1473 | ||
@@ -824,7 +1475,14 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) | |||
824 | { | 1475 | { |
825 | int len = skb->len; | 1476 | int len = skb->len; |
826 | 1477 | ||
827 | skb_queue_tail(&sk->sk_receive_queue, skb); | 1478 | #ifdef CONFIG_NETLINK_MMAP |
1479 | if (netlink_skb_is_mmaped(skb)) | ||
1480 | netlink_queue_mmaped_skb(sk, skb); | ||
1481 | else if (netlink_rx_is_mmaped(sk)) | ||
1482 | netlink_ring_set_copied(sk, skb); | ||
1483 | else | ||
1484 | #endif /* CONFIG_NETLINK_MMAP */ | ||
1485 | skb_queue_tail(&sk->sk_receive_queue, skb); | ||
828 | sk->sk_data_ready(sk, len); | 1486 | sk->sk_data_ready(sk, len); |
829 | return len; | 1487 | return len; |
830 | } | 1488 | } |
@@ -847,7 +1505,9 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) | |||
847 | { | 1505 | { |
848 | int delta; | 1506 | int delta; |
849 | 1507 | ||
850 | skb_orphan(skb); | 1508 | WARN_ON(skb->sk != NULL); |
1509 | if (netlink_skb_is_mmaped(skb)) | ||
1510 | return skb; | ||
851 | 1511 | ||
852 | delta = skb->end - skb->tail; | 1512 | delta = skb->end - skb->tail; |
853 | if (delta * 2 < skb->truesize) | 1513 | if (delta * 2 < skb->truesize) |
@@ -867,16 +1527,6 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) | |||
867 | return skb; | 1527 | return skb; |
868 | } | 1528 | } |
869 | 1529 | ||
870 | static void netlink_rcv_wake(struct sock *sk) | ||
871 | { | ||
872 | struct netlink_sock *nlk = nlk_sk(sk); | ||
873 | |||
874 | if (skb_queue_empty(&sk->sk_receive_queue)) | ||
875 | clear_bit(0, &nlk->state); | ||
876 | if (!test_bit(0, &nlk->state)) | ||
877 | wake_up_interruptible(&nlk->wait); | ||
878 | } | ||
879 | |||
880 | static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, | 1530 | static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, |
881 | struct sock *ssk) | 1531 | struct sock *ssk) |
882 | { | 1532 | { |
@@ -886,8 +1536,8 @@ static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, | |||
886 | ret = -ECONNREFUSED; | 1536 | ret = -ECONNREFUSED; |
887 | if (nlk->netlink_rcv != NULL) { | 1537 | if (nlk->netlink_rcv != NULL) { |
888 | ret = skb->len; | 1538 | ret = skb->len; |
889 | skb_set_owner_r(skb, sk); | 1539 | netlink_skb_set_owner_r(skb, sk); |
890 | NETLINK_CB(skb).ssk = ssk; | 1540 | NETLINK_CB(skb).sk = ssk; |
891 | nlk->netlink_rcv(skb); | 1541 | nlk->netlink_rcv(skb); |
892 | consume_skb(skb); | 1542 | consume_skb(skb); |
893 | } else { | 1543 | } else { |
@@ -933,6 +1583,69 @@ retry: | |||
933 | } | 1583 | } |
934 | EXPORT_SYMBOL(netlink_unicast); | 1584 | EXPORT_SYMBOL(netlink_unicast); |
935 | 1585 | ||
1586 | struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, | ||
1587 | u32 dst_portid, gfp_t gfp_mask) | ||
1588 | { | ||
1589 | #ifdef CONFIG_NETLINK_MMAP | ||
1590 | struct sock *sk = NULL; | ||
1591 | struct sk_buff *skb; | ||
1592 | struct netlink_ring *ring; | ||
1593 | struct nl_mmap_hdr *hdr; | ||
1594 | unsigned int maxlen; | ||
1595 | |||
1596 | sk = netlink_getsockbyportid(ssk, dst_portid); | ||
1597 | if (IS_ERR(sk)) | ||
1598 | goto out; | ||
1599 | |||
1600 | ring = &nlk_sk(sk)->rx_ring; | ||
1601 | /* fast path without atomic ops for the common case: non-mmaped receiver */ | ||
1602 | if (ring->pg_vec == NULL) | ||
1603 | goto out_put; | ||
1604 | |||
1605 | skb = alloc_skb_head(gfp_mask); | ||
1606 | if (skb == NULL) | ||
1607 | goto err1; | ||
1608 | |||
1609 | spin_lock_bh(&sk->sk_receive_queue.lock); | ||
1610 | /* check again under lock */ | ||
1611 | if (ring->pg_vec == NULL) | ||
1612 | goto out_free; | ||
1613 | |||
1614 | maxlen = ring->frame_size - NL_MMAP_HDRLEN; | ||
1615 | if (maxlen < size) | ||
1616 | goto out_free; | ||
1617 | |||
1618 | netlink_forward_ring(ring); | ||
1619 | hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); | ||
1620 | if (hdr == NULL) | ||
1621 | goto err2; | ||
1622 | netlink_ring_setup_skb(skb, sk, ring, hdr); | ||
1623 | netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); | ||
1624 | atomic_inc(&ring->pending); | ||
1625 | netlink_increment_head(ring); | ||
1626 | |||
1627 | spin_unlock_bh(&sk->sk_receive_queue.lock); | ||
1628 | return skb; | ||
1629 | |||
1630 | err2: | ||
1631 | kfree_skb(skb); | ||
1632 | spin_unlock_bh(&sk->sk_receive_queue.lock); | ||
1633 | netlink_overrun(sk); | ||
1634 | err1: | ||
1635 | sock_put(sk); | ||
1636 | return NULL; | ||
1637 | |||
1638 | out_free: | ||
1639 | kfree_skb(skb); | ||
1640 | spin_unlock_bh(&sk->sk_receive_queue.lock); | ||
1641 | out_put: | ||
1642 | sock_put(sk); | ||
1643 | out: | ||
1644 | #endif | ||
1645 | return alloc_skb(size, gfp_mask); | ||
1646 | } | ||
1647 | EXPORT_SYMBOL_GPL(netlink_alloc_skb); | ||
1648 | |||
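For kernel subsystems the conversion is essentially a one-line substitution. A
hypothetical sketch (the surrounding function and its names are this example's,
not from the patch):

    #include <net/netlink.h>

    /* netlink_alloc_skb() transparently uses an RX ring frame when the
     * receiver has one mapped and falls back to alloc_skb() otherwise.
     */
    static int example_reply(struct sock *nl_sk, u32 portid,
                             const void *payload, int len)
    {
            struct sk_buff *skb;
            struct nlmsghdr *nlh;

            skb = netlink_alloc_skb(nl_sk, nlmsg_total_size(len), portid,
                                    GFP_KERNEL);
            if (!skb)
                    return -ENOBUFS;

            nlh = nlmsg_put(skb, portid, 0, NLMSG_DONE, len, 0);
            if (!nlh) {
                    kfree_skb(skb);
                    return -EMSGSIZE;
            }
            memcpy(nlmsg_data(nlh), payload, len);
            return netlink_unicast(nl_sk, skb, portid, MSG_DONTWAIT);
    }
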
936 | int netlink_has_listeners(struct sock *sk, unsigned int group) | 1649 | int netlink_has_listeners(struct sock *sk, unsigned int group) |
937 | { | 1650 | { |
938 | int res = 0; | 1651 | int res = 0; |
@@ -957,8 +1670,8 @@ static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) | |||
957 | struct netlink_sock *nlk = nlk_sk(sk); | 1670 | struct netlink_sock *nlk = nlk_sk(sk); |
958 | 1671 | ||
959 | if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && | 1672 | if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && |
960 | !test_bit(0, &nlk->state)) { | 1673 | !test_bit(NETLINK_CONGESTED, &nlk->state)) { |
961 | skb_set_owner_r(skb, sk); | 1674 | netlink_skb_set_owner_r(skb, sk); |
962 | __netlink_sendskb(sk, skb); | 1675 | __netlink_sendskb(sk, skb); |
963 | return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); | 1676 | return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); |
964 | } | 1677 | } |
@@ -1193,7 +1906,8 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, | |||
1193 | if (level != SOL_NETLINK) | 1906 | if (level != SOL_NETLINK) |
1194 | return -ENOPROTOOPT; | 1907 | return -ENOPROTOOPT; |
1195 | 1908 | ||
1196 | if (optlen >= sizeof(int) && | 1909 | if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING && |
1910 | optlen >= sizeof(int) && | ||
1197 | get_user(val, (unsigned int __user *)optval)) | 1911 | get_user(val, (unsigned int __user *)optval)) |
1198 | return -EFAULT; | 1912 | return -EFAULT; |
1199 | 1913 | ||
@@ -1235,13 +1949,32 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, | |||
1235 | case NETLINK_NO_ENOBUFS: | 1949 | case NETLINK_NO_ENOBUFS: |
1236 | if (val) { | 1950 | if (val) { |
1237 | nlk->flags |= NETLINK_RECV_NO_ENOBUFS; | 1951 | nlk->flags |= NETLINK_RECV_NO_ENOBUFS; |
1238 | clear_bit(0, &nlk->state); | 1952 | clear_bit(NETLINK_CONGESTED, &nlk->state); |
1239 | wake_up_interruptible(&nlk->wait); | 1953 | wake_up_interruptible(&nlk->wait); |
1240 | } else { | 1954 | } else { |
1241 | nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS; | 1955 | nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS; |
1242 | } | 1956 | } |
1243 | err = 0; | 1957 | err = 0; |
1244 | break; | 1958 | break; |
1959 | #ifdef CONFIG_NETLINK_MMAP | ||
1960 | case NETLINK_RX_RING: | ||
1961 | case NETLINK_TX_RING: { | ||
1962 | struct nl_mmap_req req; | ||
1963 | |||
1964 | /* Rings might consume more memory than queue limits allow, so | ||
1965 | * require CAP_NET_ADMIN. | ||
1966 | */ | ||
1967 | if (!capable(CAP_NET_ADMIN)) | ||
1968 | return -EPERM; | ||
1969 | if (optlen < sizeof(req)) | ||
1970 | return -EINVAL; | ||
1971 | if (copy_from_user(&req, optval, sizeof(req))) | ||
1972 | return -EFAULT; | ||
1973 | err = netlink_set_ring(sk, &req, false, | ||
1974 | optname == NETLINK_TX_RING); | ||
1975 | break; | ||
1976 | } | ||
1977 | #endif /* CONFIG_NETLINK_MMAP */ | ||
1245 | default: | 1978 | default: |
1246 | err = -ENOPROTOOPT; | 1979 | err = -ENOPROTOOPT; |
1247 | } | 1980 | } |
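The nl_mmap_req copied in above carries four parameters whose consistency
netlink_set_ring() (not shown in this hunk) enforces: roughly, page-aligned blocks,
NL_MMAP_MSG_ALIGNMENT-aligned frames that do not straddle block boundaries, and
matching totals (frames per block times nm_block_nr must equal nm_frame_nr). A
plausible geometry, with sizes chosen for illustration only:

    #include <unistd.h>
    #include <linux/netlink.h>

    /* Illustrative only: 16 blocks of 16 pages each, 16 KB frames. */
    static void fill_ring_req(struct nl_mmap_req *req)
    {
            unsigned int block_size = 16 * getpagesize();

            req->nm_block_size = block_size;
            req->nm_block_nr   = 16;
            req->nm_frame_size = 16384;
            req->nm_frame_nr   = 16 * block_size / 16384;
    }
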
@@ -1352,6 +2085,13 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, | |||
1352 | goto out; | 2085 | goto out; |
1353 | } | 2086 | } |
1354 | 2087 | ||
2088 | if (netlink_tx_is_mmaped(sk) && | ||
2089 | msg->msg_iov->iov_base == NULL) { | ||
2090 | err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, | ||
2091 | siocb); | ||
2092 | goto out; | ||
2093 | } | ||
2094 | |||
1355 | err = -EMSGSIZE; | 2095 | err = -EMSGSIZE; |
1356 | if (len > sk->sk_sndbuf - 32) | 2096 | if (len > sk->sk_sndbuf - 32) |
1357 | goto out; | 2097 | goto out; |
@@ -1684,9 +2424,13 @@ static int netlink_dump(struct sock *sk) | |||
1684 | 2424 | ||
1685 | alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); | 2425 | alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); |
1686 | 2426 | ||
1687 | skb = sock_rmalloc(sk, alloc_size, 0, GFP_KERNEL); | 2427 | if (!netlink_rx_is_mmaped(sk) && |
2428 | atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) | ||
2429 | goto errout_skb; | ||
2430 | skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, GFP_KERNEL); | ||
1688 | if (!skb) | 2431 | if (!skb) |
1689 | goto errout_skb; | 2432 | goto errout_skb; |
2433 | netlink_skb_set_owner_r(skb, sk); | ||
1690 | 2434 | ||
1691 | len = cb->dump(skb, cb); | 2435 | len = cb->dump(skb, cb); |
1692 | 2436 | ||
@@ -1741,6 +2485,19 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, | |||
1741 | if (cb == NULL) | 2485 | if (cb == NULL) |
1742 | return -ENOBUFS; | 2486 | return -ENOBUFS; |
1743 | 2487 | ||
2488 | /* Memory mapped dump requests need to be copied to avoid looping | ||
2489 | * on the pending state in netlink_mmap_sendmsg() while the CB holds | ||
2490 | * a reference to the skb. | ||
2491 | */ | ||
2492 | if (netlink_skb_is_mmaped(skb)) { | ||
2493 | skb = skb_copy(skb, GFP_KERNEL); | ||
2494 | if (skb == NULL) { | ||
2495 | kfree(cb); | ||
2496 | return -ENOBUFS; | ||
2497 | } | ||
2498 | } else | ||
2499 | atomic_inc(&skb->users); | ||
2500 | |||
1744 | cb->dump = control->dump; | 2501 | cb->dump = control->dump; |
1745 | cb->done = control->done; | 2502 | cb->done = control->done; |
1746 | cb->nlh = nlh; | 2503 | cb->nlh = nlh; |
@@ -1801,7 +2558,8 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) | |||
1801 | if (err) | 2558 | if (err) |
1802 | payload += nlmsg_len(nlh); | 2559 | payload += nlmsg_len(nlh); |
1803 | 2560 | ||
1804 | skb = nlmsg_new(payload, GFP_KERNEL); | 2561 | skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload), |
2562 | NETLINK_CB(in_skb).portid, GFP_KERNEL); | ||
1805 | if (!skb) { | 2563 | if (!skb) { |
1806 | struct sock *sk; | 2564 | struct sock *sk; |
1807 | 2565 | ||
@@ -2067,7 +2825,7 @@ static const struct proto_ops netlink_ops = { | |||
2067 | .socketpair = sock_no_socketpair, | 2825 | .socketpair = sock_no_socketpair, |
2068 | .accept = sock_no_accept, | 2826 | .accept = sock_no_accept, |
2069 | .getname = netlink_getname, | 2827 | .getname = netlink_getname, |
2070 | .poll = datagram_poll, | 2828 | .poll = netlink_poll, |
2071 | .ioctl = sock_no_ioctl, | 2829 | .ioctl = sock_no_ioctl, |
2072 | .listen = sock_no_listen, | 2830 | .listen = sock_no_listen, |
2073 | .shutdown = sock_no_shutdown, | 2831 | .shutdown = sock_no_shutdown, |
@@ -2075,7 +2833,7 @@ static const struct proto_ops netlink_ops = { | |||
2075 | .getsockopt = netlink_getsockopt, | 2833 | .getsockopt = netlink_getsockopt, |
2076 | .sendmsg = netlink_sendmsg, | 2834 | .sendmsg = netlink_sendmsg, |
2077 | .recvmsg = netlink_recvmsg, | 2835 | .recvmsg = netlink_recvmsg, |
2078 | .mmap = sock_no_mmap, | 2836 | .mmap = netlink_mmap, |
2079 | .sendpage = sock_no_sendpage, | 2837 | .sendpage = sock_no_sendpage, |
2080 | }; | 2838 | }; |
2081 | 2839 | ||
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index d9acb2a1d855..ed8522265f4e 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h | |||
@@ -6,6 +6,20 @@ | |||
6 | #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) | 6 | #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) |
7 | #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) | 7 | #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) |
8 | 8 | ||
9 | struct netlink_ring { | ||
10 | void **pg_vec; | ||
11 | unsigned int head; | ||
12 | unsigned int frames_per_block; | ||
13 | unsigned int frame_size; | ||
14 | unsigned int frame_max; | ||
15 | |||
16 | unsigned int pg_vec_order; | ||
17 | unsigned int pg_vec_pages; | ||
18 | unsigned int pg_vec_len; | ||
19 | |||
20 | atomic_t pending; | ||
21 | }; | ||
22 | |||
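Note that frame_max holds the highest valid frame index rather than the frame
count, which is why netlink_increment_head() compares against it directly and why
the diag code below reports ndr_frame_nr as frame_max + 1.
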
9 | struct netlink_sock { | 23 | struct netlink_sock { |
10 | /* struct sock has to be the first member of netlink_sock */ | 24 | /* struct sock has to be the first member of netlink_sock */ |
11 | struct sock sk; | 25 | struct sock sk; |
@@ -24,6 +38,12 @@ struct netlink_sock { | |||
24 | void (*netlink_rcv)(struct sk_buff *skb); | 38 | void (*netlink_rcv)(struct sk_buff *skb); |
25 | void (*netlink_bind)(int group); | 39 | void (*netlink_bind)(int group); |
26 | struct module *module; | 40 | struct module *module; |
41 | #ifdef CONFIG_NETLINK_MMAP | ||
42 | struct mutex pg_vec_lock; | ||
43 | struct netlink_ring rx_ring; | ||
44 | struct netlink_ring tx_ring; | ||
45 | atomic_t mapped; | ||
46 | #endif /* CONFIG_NETLINK_MMAP */ | ||
27 | }; | 47 | }; |
28 | 48 | ||
29 | static inline struct netlink_sock *nlk_sk(struct sock *sk) | 49 | static inline struct netlink_sock *nlk_sk(struct sock *sk) |
diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 5ffb1d1cf402..4e4aa471cd05 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c | |||
@@ -7,6 +7,34 @@ | |||
7 | 7 | ||
8 | #include "af_netlink.h" | 8 | #include "af_netlink.h" |
9 | 9 | ||
10 | static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type, | ||
11 | struct sk_buff *nlskb) | ||
12 | { | ||
13 | struct netlink_diag_ring ndr; | ||
14 | |||
15 | ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT; | ||
16 | ndr.ndr_block_nr = ring->pg_vec_len; | ||
17 | ndr.ndr_frame_size = ring->frame_size; | ||
18 | ndr.ndr_frame_nr = ring->frame_max + 1; | ||
19 | |||
20 | return nla_put(nlskb, nl_type, sizeof(ndr), &ndr); | ||
21 | } | ||
22 | |||
23 | static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb) | ||
24 | { | ||
25 | struct netlink_sock *nlk = nlk_sk(sk); | ||
26 | int ret; | ||
27 | |||
28 | mutex_lock(&nlk->pg_vec_lock); | ||
29 | ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb); | ||
30 | if (!ret) | ||
31 | ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING, | ||
32 | nlskb); | ||
33 | mutex_unlock(&nlk->pg_vec_lock); | ||
34 | |||
35 | return ret; | ||
36 | } | ||
37 | |||
10 | static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb) | 38 | static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb) |
11 | { | 39 | { |
12 | struct netlink_sock *nlk = nlk_sk(sk); | 40 | struct netlink_sock *nlk = nlk_sk(sk); |
@@ -51,6 +79,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, | |||
51 | sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO)) | 79 | sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO)) |
52 | goto out_nlmsg_trim; | 80 | goto out_nlmsg_trim; |
53 | 81 | ||
82 | if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) && | ||
83 | sk_diag_put_rings_cfg(sk, skb)) | ||
84 | goto out_nlmsg_trim; | ||
85 | |||
54 | return nlmsg_end(skb, nlh); | 86 | return nlmsg_end(skb, nlh); |
55 | 87 | ||
56 | out_nlmsg_trim: | 88 | out_nlmsg_trim: |
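
With NDIAG_SHOW_RING_CFG set in ndiag_show, the dump above reports both rings'
geometry. A hedged sketch of the request, sent over a NETLINK_SOCK_DIAG socket
(error handling elided):

    #include <sys/socket.h>
    #include <linux/netlink.h>
    #include <linux/netlink_diag.h>
    #include <linux/sock_diag.h>

    /* Sketch: dump all netlink sockets, including ring configuration. */
    struct {
            struct nlmsghdr nlh;
            struct netlink_diag_req r;
    } req = {
            .nlh = {
                    .nlmsg_len   = sizeof(req),
                    .nlmsg_type  = SOCK_DIAG_BY_FAMILY,
                    .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
            },
            .r = {
                    .sdiag_family   = AF_NETLINK,
                    .sdiag_protocol = NDIAG_PROTO_ALL,
                    .ndiag_show     = NDIAG_SHOW_RING_CFG,
            },
    };

Each matching socket then carries NETLINK_DIAG_RX_RING/NETLINK_DIAG_TX_RING
attributes filled by sk_diag_put_ring() above.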
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index aa36a8c8b33b..7881e2fccbc2 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c | |||
@@ -393,7 +393,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, | |||
393 | return -EOPNOTSUPP; | 393 | return -EOPNOTSUPP; |
394 | 394 | ||
395 | if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) && | 395 | if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) && |
396 | sk_user_ns(NETLINK_CB(in_skb).ssk) != &init_user_ns) | 396 | sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns) |
397 | return -EOPNOTSUPP; | 397 | return -EOPNOTSUPP; |
398 | } | 398 | } |
399 | 399 | ||