-rw-r--r--  Documentation/networking/netlink_mmap.txt   | 339
-rw-r--r--  include/linux/netfilter/nfnetlink.h          |  11
-rw-r--r--  include/linux/netlink.h                      |  11
-rw-r--r--  include/linux/skbuff.h                       |   6
-rw-r--r--  include/net/netfilter/nf_conntrack.h         |   2
-rw-r--r--  include/net/netfilter/nf_conntrack_expect.h  |   4
-rw-r--r--  include/uapi/linux/netlink.h                 |  32
-rw-r--r--  include/uapi/linux/netlink_diag.h            |  10
-rw-r--r--  net/Kconfig                                  |   9
-rw-r--r--  net/core/skbuff.c                            |  30
-rw-r--r--  net/ipv4/inet_diag.c                         |   6
-rw-r--r--  net/ipv4/udp_diag.c                          |   4
-rw-r--r--  net/netfilter/nf_conntrack_core.c            |   8
-rw-r--r--  net/netfilter/nf_conntrack_expect.c          |   8
-rw-r--r--  net/netfilter/nfnetlink.c                    |  20
-rw-r--r--  net/netfilter/nfnetlink_log.c                |  12
-rw-r--r--  net/netfilter/nfnetlink_queue_core.c         |   3
-rw-r--r--  net/netlink/af_netlink.c                     | 836
-rw-r--r--  net/netlink/af_netlink.h                     |  20
-rw-r--r--  net/netlink/diag.c                           |  32
-rw-r--r--  net/sched/cls_flow.c                         |   2
21 files changed, 1331 insertions, 74 deletions
diff --git a/Documentation/networking/netlink_mmap.txt b/Documentation/networking/netlink_mmap.txt
new file mode 100644
index 000000000000..1c2dab409625
--- /dev/null
+++ b/Documentation/networking/netlink_mmap.txt
@@ -0,0 +1,339 @@
1This file documents how to use memory mapped I/O with netlink.
2
3Author: Patrick McHardy <kaber@trash.net>
4
5Overview
6--------
7
8Memory mapped netlink I/O can be used to increase throughput and decrease
9overhead of unicast receive and transmit operations. Some netlink subsystems
10require high throughput; these are mainly the netfilter subsystems
11nfnetlink_queue and nfnetlink_log, but it can also help speed up large
12dump operations of, for example, the routing database.
13
14Memory mapped netlink I/O uses two circular ring buffers for RX and TX which
15are mapped into the process's address space.
16
17The RX ring is used by the kernel to directly construct netlink messages into
18user-space memory without copying them as done with regular socket I/O.
19Additionally, as long as the ring contains messages, no recvmsg() or poll()
20syscalls have to be issued by user-space to get more messages.
21
22The TX ring is used to process messages directly from user-space memory; the
23kernel processes all messages contained in the ring using a single sendmsg()
24call.
25
26Usage overview
27--------------
28
29In order to use memory mapped netlink I/O, user-space needs three main changes:
30
31- ring setup
32- conversion of the RX path to get messages from the ring instead of recvmsg()
33- conversion of the TX path to construct messages into the ring
34
35Ring setup is done using setsockopt() to provide the ring parameters to the
36kernel, then a call to mmap() to map the ring into the process's address space:
37
38- setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &params, sizeof(params));
39- setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &params, sizeof(params));
40- ring = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)
41
42Usage of either ring is optional, but even if only the RX ring is used the
43mapping still needs to be writable in order to update the frame status after
44processing.
45
46Conversion of the reception path involves calling poll() on the file
47descriptor; once the socket is readable, the frames from the ring are
48processed in order until no more messages are available, as indicated by
49a status word in the frame header.
50
51On the kernel side, in order to make use of memory mapped I/O on receive, the
52originating netlink subsystem needs to support memory mapped I/O; otherwise it
53will use an allocated socket buffer as usual and the contents will be
54copied to the ring on transmission, nullifying most of the performance gains.
55Dumps of kernel databases automatically support memory mapped I/O.
56
57Conversion of the transmit path involves changing message construction to
58use memory from the TX ring instead of (usually) a buffer declared on the
59stack and setting up the frame header appropriately. Optionally, poll() can
60be used to wait for free frames in the TX ring.
61
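As an illustration (not part of the original text), a minimal sketch of waiting
for a free TX frame with poll(); it assumes the fd, tx_ring and frame_offset
variables used in the examples further below:

        struct pollfd pfd = { .fd = fd, .events = POLLOUT };
        struct nl_mmap_hdr *hdr;

        /* Wait until the frame at the current offset is released */
        for (;;) {
                hdr = tx_ring + frame_offset;
                if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
                        break;
                if (poll(&pfd, 1, -1) < 0 && errno != EINTR)
                        exit(1);
        }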
62Structures and definitions for using memory mapped I/O are contained in
63<linux/netlink.h>.
64
65RX and TX rings
66----------------
67
68Each ring contains a number of contiguous memory blocks, containing frames of
69fixed size dependent on the parameters used for ring setup.
70
71Ring: [ block 0 ]
72 [ frame 0 ]
73 [ frame 1 ]
74 [ block 1 ]
75 [ frame 2 ]
76 [ frame 3 ]
77 ...
78 [ block n ]
79 [ frame 2 * n ]
80 [ frame 2 * n + 1 ]
81
82The blocks are only visible to the kernel; from the point of view of user-space,
83the ring just contains the frames in a contiguous memory zone.
84
85The ring parameters used for setting up the ring are defined as follows:
86
87struct nl_mmap_req {
88 unsigned int nm_block_size;
89 unsigned int nm_block_nr;
90 unsigned int nm_frame_size;
91 unsigned int nm_frame_nr;
92};
93
94Frames are grouped into blocks, where each block is a contiguous region of memory
95and holds nm_block_size / nm_frame_size frames. The total number of frames in
96the ring is nm_frame_nr. The following invariants hold:
97
98- frames_per_block = nm_block_size / nm_frame_size
99
100- nm_frame_nr = frames_per_block * nm_block_nr
101
102Some parameters are constrained, specifically:
103
104- nm_block_size must be a multiple of the architecture's memory page size.
105 The getpagesize() function can be used to get the page size.
106
107- nm_frame_size must be equal to or larger than NL_MMAP_HDRLEN; in other
108 words, a frame must be able to hold at least the frame header
109
110- nm_frame_size must be smaller than or equal to nm_block_size
111
112- nm_frame_size must be a multiple of NL_MMAP_MSG_ALIGNMENT
113
114- nm_frame_nr must equal the actual number of frames as specified above.
115
116When the kernel can't allocate physically contiguous memory for a ring block,
117it will fall back to using physically discontiguous memory. This might affect
118performance negatively; in order to avoid this, the nm_frame_size parameter
119should be chosen to be as small as possible for the required frame size and
120the number of blocks should be increased instead.
121
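As a hedged illustration (not part of the original text), a helper that derives
a parameter set satisfying these invariants from a desired frame size and frame
count might look as follows; the function name is hypothetical, and frame_size
is assumed to already be a multiple of NL_MMAP_MSG_ALIGNMENT and at least
NL_MMAP_HDRLEN:

        static void setup_ring_req(struct nl_mmap_req *req,
                                   unsigned int frame_size,
                                   unsigned int nr_frames)
        {
                unsigned int block_size = getpagesize();
                unsigned int frames_per_block;

                /* Double the block size until one frame fits; the result
                 * stays a multiple of the page size.
                 */
                while (block_size < frame_size)
                        block_size *= 2;
                frames_per_block = block_size / frame_size;

                req->nm_block_size = block_size;
                req->nm_frame_size = frame_size;
                req->nm_block_nr   = nr_frames / frames_per_block;
                req->nm_frame_nr   = req->nm_block_nr * frames_per_block;
        }
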
122Ring frames
123------------
124
125Each frame contains a frame header, consisting of a synchronization word and some
126meta-data, and the message itself.
127
128Frame: [ header message ]
129
130The frame header is defined as follows:
131
132struct nl_mmap_hdr {
133 unsigned int nm_status;
134 unsigned int nm_len;
135 __u32 nm_group;
136 /* credentials */
137 __u32 nm_pid;
138 __u32 nm_uid;
139 __u32 nm_gid;
140};
141
142- nm_status is used for synchronizing processing between the kernel and user-
143 space and specifies ownership of the frame as well as the operation to perform
144
145- nm_len contains the length of the message contained in the data area
146
147- nm_group specifies the destination multicast group of the message
148
149- nm_pid, nm_uid and nm_gid contain the netlink pid, UID and GID of the sending
150 process. These values correspond to the data available using SO_PASSCRED in
151 the SCM_CREDENTIALS cmsg.
152
153The possible values in the status word are:
154
155- NL_MMAP_STATUS_UNUSED:
156 RX ring: frame belongs to the kernel and contains no message
157 for user-space. Appropriate action is to invoke poll()
158 to wait for new messages.
159
160 TX ring: frame belongs to user-space and can be used for
161 message construction.
162
163- NL_MMAP_STATUS_RESERVED:
164 RX ring only: frame is currently used by the kernel for message
165 construction and contains no valid message yet.
166 Appropriate action is to invoke poll() to wait for
167 new messages.
168
169- NL_MMAP_STATUS_VALID:
170 RX ring: frame contains a valid message. Appropriate action is
171 to process the message and release the frame back to
172 the kernel by setting the status to
173 NL_MMAP_STATUS_UNUSED or queue the frame by setting the
174 status to NL_MMAP_STATUS_SKIP.
175
176 TX ring: the frame contains a valid message from user-space to
177 be processed by the kernel. After completing processing
178 the kernel will release the frame back to user-space by
179 setting the status to NL_MMAP_STATUS_UNUSED.
180
181- NL_MMAP_STATUS_COPY:
182 RX ring only: a message is ready to be processed but could not be
183 stored in the ring, either because it exceeded the
184 frame size or because the originating subsystem does
185 not support memory mapped I/O. Appropriate action is
186 to invoke recvmsg() to receive the message and release
187 the frame back to the kernel by setting the status to
188 NL_MMAP_STATUS_UNUSED.
189
190- NL_MMAP_STATUS_SKIP:
191 RX ring only: user-space queued the message for later processing, but
192 processed some messages following it in the ring. The
193 kernel should skip this frame when looking for unused
194 frames.
195
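For illustration only (not part of the original text), deferring a frame with
NL_MMAP_STATUS_SKIP could look like this:

        /* Keep the frame for later; the kernel will skip over it */
        hdr->nm_status = NL_MMAP_STATUS_SKIP;

        /* ... process other frames in the ring ... */

        /* Once the deferred message has been handled, release the frame */
        hdr->nm_status = NL_MMAP_STATUS_UNUSED;
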
196The data area of a frame begins at an offset of NL_MMAP_HDRLEN relative to the
197frame header.
198
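As a sketch (not part of the original text), since a frame may carry more than
one netlink message, e.g. during dumps, the data area can be walked with the
standard NLMSG_OK()/NLMSG_NEXT() macros; process_msg() is a hypothetical
handler:

        struct nlmsghdr *nlh = (void *)hdr + NL_MMAP_HDRLEN;
        int len = hdr->nm_len;

        while (NLMSG_OK(nlh, len)) {
                process_msg(nlh);
                nlh = NLMSG_NEXT(nlh, len);
        }
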
199TX limitations
200--------------
201
202Kernel processing usually involves validation of the message received from
203user-space, then processing its contents. The kernel must ensure that
204user-space is not able to modify the message contents after they have been
205validated. In order to do so, the message is copied from the ring frame
206to an allocated buffer if either of these conditions is false:
207
208- only a single mapping of the ring exists
209- the file descriptor is not shared between processes
210
211This means that for threaded programs, the kernel will fall back to copying.
212
213Example
214-------
215
216Ring setup:
217
218 unsigned int block_size = 16 * getpagesize();
219 struct nl_mmap_req req = {
220 .nm_block_size = block_size,
221 .nm_block_nr = 64,
222 .nm_frame_size = 16384,
223 .nm_frame_nr = 64 * block_size / 16384,
224 };
225 unsigned int ring_size;
226 void *rx_ring, *tx_ring;
227
228 /* Configure ring parameters */
229 if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0)
230 exit(1);
231 if (setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &req, sizeof(req)) < 0)
232 exit(1);
233
234 /* Calculate size of each individual ring */
235 ring_size = req.nm_block_nr * req.nm_block_size;
236
237 /* Map RX/TX rings. The TX ring is located after the RX ring */
238 rx_ring = mmap(NULL, 2 * ring_size, PROT_READ | PROT_WRITE,
239 MAP_SHARED, fd, 0);
240 if ((long)rx_ring == -1L)
241 exit(1);
242 tx_ring = rx_ring + ring_size;
243
244Message reception:
245
246This example assumes some ring parameters of the ring setup are available.
247
248 unsigned int frame_offset = 0;
249 struct nl_mmap_hdr *hdr;
250 struct nlmsghdr *nlh;
251 unsigned char buf[16384];
252 ssize_t len;
253
254 while (1) {
255 struct pollfd pfds[1];
256
257 pfds[0].fd = fd;
258 pfds[0].events = POLLIN | POLLERR;
259 pfds[0].revents = 0;
260
261 if (poll(pfds, 1, -1) < 0 && errno != EINTR)
262 exit(1);
263
264 /* Check for errors. Error handling omitted */
265 if (pfds[0].revents & POLLERR)
266 <handle error>
267
268 /* If no new messages, poll again */
269 if (!(pfds[0].revents & POLLIN))
270 continue;
271
272 /* Process all frames */
273 while (1) {
274 /* Get next frame header */
275 hdr = rx_ring + frame_offset;
276
277 if (hdr->nm_status == NL_MMAP_STATUS_VALID) {
278 /* Regular memory mapped frame */
279 nlh = (void *)hdr + NL_MMAP_HDRLEN;
280 len = hdr->nm_len;
281
282 /* Release empty message immediately. May happen
283 * on error during message construction.
284 */
285 if (len == 0)
286 goto release;
287 } else if (hdr->nm_status == NL_MMAP_STATUS_COPY) {
288 /* Frame queued to socket receive queue */
289 len = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);
290 if (len <= 0)
291 break;
292 nlh = (struct nlmsghdr *)buf;
293 } else
294 /* No more messages to process, continue polling */
295 break;
296
297 process_msg(nlh);
298release:
299 /* Release frame back to the kernel */
300 hdr->nm_status = NL_MMAP_STATUS_UNUSED;
301
302 /* Advance frame offset to next frame */
303 frame_offset = (frame_offset + frame_size) % ring_size;
304 }
305 }
306
307Message transmission:
308
309This example assumes some ring parameters of the ring setup are available.
310A single message is constructed and transmitted; to send multiple messages
311at once, they would be constructed in consecutive frames before a final call
312to sendto().
313
314 unsigned int frame_offset = 0;
315 struct nl_mmap_hdr *hdr;
316 struct nlmsghdr *nlh;
317 struct sockaddr_nl addr = {
318 .nl_family = AF_NETLINK,
319 };
320
321 hdr = tx_ring + frame_offset;
322 if (hdr->nm_status != NL_MMAP_STATUS_UNUSED)
323 /* No frame available. Use poll() to avoid. */
324 exit(1);
325
326 nlh = (void *)hdr + NL_MMAP_HDRLEN;
327
328 /* Build message */
329 build_message(nlh);
330
331 /* Fill frame header: length and status need to be set */
332 hdr->nm_len = nlh->nlmsg_len;
333 hdr->nm_status = NL_MMAP_STATUS_VALID;
334
335 if (sendto(fd, NULL, 0, 0, (struct sockaddr *)&addr, sizeof(addr)) < 0)
336 exit(1);
337
338 /* Advance frame offset to next frame */
339 frame_offset = (frame_offset + frame_size) % ring_size;
diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index ecbb8e495912..cadb7402d7a7 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -29,10 +29,13 @@ extern int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n);
29extern int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n); 29extern int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n);
30 30
31extern int nfnetlink_has_listeners(struct net *net, unsigned int group); 31extern int nfnetlink_has_listeners(struct net *net, unsigned int group);
32extern int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, 32extern struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size,
33 int echo, gfp_t flags); 33 u32 dst_portid, gfp_t gfp_mask);
34extern int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error); 34extern int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
35extern int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags); 35 unsigned int group, int echo, gfp_t flags);
36extern int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error);
37extern int nfnetlink_unicast(struct sk_buff *skb, struct net *net,
38 u32 portid, int flags);
36 39
37extern void nfnl_lock(__u8 subsys_id); 40extern void nfnl_lock(__u8 subsys_id);
38extern void nfnl_unlock(__u8 subsys_id); 41extern void nfnl_unlock(__u8 subsys_id);
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index e0f746b7b95c..6358da5eeee8 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -15,11 +15,18 @@ static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb)
15 return (struct nlmsghdr *)skb->data; 15 return (struct nlmsghdr *)skb->data;
16} 16}
17 17
18enum netlink_skb_flags {
19 NETLINK_SKB_MMAPED = 0x1, /* Packet data is mmaped */
20 NETLINK_SKB_TX = 0x2, /* Packet was sent by userspace */
21 NETLINK_SKB_DELIVERED = 0x4, /* Packet was delivered */
22};
23
18struct netlink_skb_parms { 24struct netlink_skb_parms {
19 struct scm_creds creds; /* Skb credentials */ 25 struct scm_creds creds; /* Skb credentials */
20 __u32 portid; 26 __u32 portid;
21 __u32 dst_group; 27 __u32 dst_group;
22 struct sock *ssk; 28 __u32 flags;
29 struct sock *sk;
23}; 30};
24 31
25#define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) 32#define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb))
@@ -57,6 +64,8 @@ extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group)
57extern void netlink_clear_multicast_users(struct sock *sk, unsigned int group); 64extern void netlink_clear_multicast_users(struct sock *sk, unsigned int group);
58extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err); 65extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);
59extern int netlink_has_listeners(struct sock *sk, unsigned int group); 66extern int netlink_has_listeners(struct sock *sk, unsigned int group);
67extern struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
68 u32 dst_portid, gfp_t gfp_mask);
60extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); 69extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock);
61extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, 70extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid,
62 __u32 group, gfp_t allocation); 71 __u32 group, gfp_t allocation);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f5bed7b31954..2e0ced1af3b1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -651,6 +651,12 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
651 return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE); 651 return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE);
652} 652}
653 653
654extern struct sk_buff *__alloc_skb_head(gfp_t priority, int node);
655static inline struct sk_buff *alloc_skb_head(gfp_t priority)
656{
657 return __alloc_skb_head(priority, -1);
658}
659
654extern struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); 660extern struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
655extern int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); 661extern int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
656extern struct sk_buff *skb_clone(struct sk_buff *skb, 662extern struct sk_buff *skb_clone(struct sk_buff *skb,
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index caca0c4d6b4b..644d9c223d24 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -184,7 +184,7 @@ extern int nf_conntrack_hash_check_insert(struct nf_conn *ct);
184extern void nf_ct_delete_from_lists(struct nf_conn *ct); 184extern void nf_ct_delete_from_lists(struct nf_conn *ct);
185extern void nf_ct_dying_timeout(struct nf_conn *ct); 185extern void nf_ct_dying_timeout(struct nf_conn *ct);
186 186
187extern void nf_conntrack_flush_report(struct net *net, u32 pid, int report); 187extern void nf_conntrack_flush_report(struct net *net, u32 portid, int report);
188 188
189extern bool nf_ct_get_tuplepr(const struct sk_buff *skb, 189extern bool nf_ct_get_tuplepr(const struct sk_buff *skb,
190 unsigned int nhoff, u_int16_t l3num, 190 unsigned int nhoff, u_int16_t l3num,
diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h
index cbbae7621e22..3f3aecbc8632 100644
--- a/include/net/netfilter/nf_conntrack_expect.h
+++ b/include/net/netfilter/nf_conntrack_expect.h
@@ -88,7 +88,7 @@ nf_ct_find_expectation(struct net *net, u16 zone,
88 const struct nf_conntrack_tuple *tuple); 88 const struct nf_conntrack_tuple *tuple);
89 89
90void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, 90void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
91 u32 pid, int report); 91 u32 portid, int report);
92static inline void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) 92static inline void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
93{ 93{
94 nf_ct_unlink_expect_report(exp, 0, 0); 94 nf_ct_unlink_expect_report(exp, 0, 0);
@@ -106,7 +106,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *, unsigned int, u_int8_t,
106 u_int8_t, const __be16 *, const __be16 *); 106 u_int8_t, const __be16 *, const __be16 *);
107void nf_ct_expect_put(struct nf_conntrack_expect *exp); 107void nf_ct_expect_put(struct nf_conntrack_expect *exp);
108int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, 108int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
109 u32 pid, int report); 109 u32 portid, int report);
110static inline int nf_ct_expect_related(struct nf_conntrack_expect *expect) 110static inline int nf_ct_expect_related(struct nf_conntrack_expect *expect)
111{ 111{
112 return nf_ct_expect_related_report(expect, 0, 0); 112 return nf_ct_expect_related_report(expect, 0, 0);
diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index 32a354f67ba4..1a85940f8ab7 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -1,6 +1,7 @@
1#ifndef _UAPI__LINUX_NETLINK_H 1#ifndef _UAPI__LINUX_NETLINK_H
2#define _UAPI__LINUX_NETLINK_H 2#define _UAPI__LINUX_NETLINK_H
3 3
4#include <linux/kernel.h>
4#include <linux/socket.h> /* for __kernel_sa_family_t */ 5#include <linux/socket.h> /* for __kernel_sa_family_t */
5#include <linux/types.h> 6#include <linux/types.h>
6 7
@@ -105,11 +106,42 @@ struct nlmsgerr {
105#define NETLINK_PKTINFO 3 106#define NETLINK_PKTINFO 3
106#define NETLINK_BROADCAST_ERROR 4 107#define NETLINK_BROADCAST_ERROR 4
107#define NETLINK_NO_ENOBUFS 5 108#define NETLINK_NO_ENOBUFS 5
109#define NETLINK_RX_RING 6
110#define NETLINK_TX_RING 7
108 111
109struct nl_pktinfo { 112struct nl_pktinfo {
110 __u32 group; 113 __u32 group;
111}; 114};
112 115
116struct nl_mmap_req {
117 unsigned int nm_block_size;
118 unsigned int nm_block_nr;
119 unsigned int nm_frame_size;
120 unsigned int nm_frame_nr;
121};
122
123struct nl_mmap_hdr {
124 unsigned int nm_status;
125 unsigned int nm_len;
126 __u32 nm_group;
127 /* credentials */
128 __u32 nm_pid;
129 __u32 nm_uid;
130 __u32 nm_gid;
131};
132
133enum nl_mmap_status {
134 NL_MMAP_STATUS_UNUSED,
135 NL_MMAP_STATUS_RESERVED,
136 NL_MMAP_STATUS_VALID,
137 NL_MMAP_STATUS_COPY,
138 NL_MMAP_STATUS_SKIP,
139};
140
141#define NL_MMAP_MSG_ALIGNMENT NLMSG_ALIGNTO
142#define NL_MMAP_MSG_ALIGN(sz) __ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT)
143#define NL_MMAP_HDRLEN NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr))
144
113#define NET_MAJOR 36 /* Major 36 is reserved for networking */ 145#define NET_MAJOR 36 /* Major 36 is reserved for networking */
114 146
115enum { 147enum {
diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h
index 88009a31cd06..4e31db4eea41 100644
--- a/include/uapi/linux/netlink_diag.h
+++ b/include/uapi/linux/netlink_diag.h
@@ -25,9 +25,18 @@ struct netlink_diag_msg {
25 __u32 ndiag_cookie[2]; 25 __u32 ndiag_cookie[2];
26}; 26};
27 27
28struct netlink_diag_ring {
29 __u32 ndr_block_size;
30 __u32 ndr_block_nr;
31 __u32 ndr_frame_size;
32 __u32 ndr_frame_nr;
33};
34
28enum { 35enum {
29 NETLINK_DIAG_MEMINFO, 36 NETLINK_DIAG_MEMINFO,
30 NETLINK_DIAG_GROUPS, 37 NETLINK_DIAG_GROUPS,
38 NETLINK_DIAG_RX_RING,
39 NETLINK_DIAG_TX_RING,
31 40
32 __NETLINK_DIAG_MAX, 41 __NETLINK_DIAG_MAX,
33}; 42};
@@ -38,5 +47,6 @@ enum {
38 47
39#define NDIAG_SHOW_MEMINFO 0x00000001 /* show memory info of a socket */ 48#define NDIAG_SHOW_MEMINFO 0x00000001 /* show memory info of a socket */
40#define NDIAG_SHOW_GROUPS 0x00000002 /* show groups of a netlink socket */ 49#define NDIAG_SHOW_GROUPS 0x00000002 /* show groups of a netlink socket */
50#define NDIAG_SHOW_RING_CFG 0x00000004 /* show ring configuration */
41 51
42#endif 52#endif
diff --git a/net/Kconfig b/net/Kconfig
index 2ddc9046868e..1a2221630e6a 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -23,6 +23,15 @@ menuconfig NET
23 23
24if NET 24if NET
25 25
26config NETLINK_MMAP
27 bool "Netlink: mmaped IO"
28 help
29 This option enables support for memory mapped netlink IO. This
30 reduces overhead by avoiding copying data between kernel- and
31 userspace.
32
33 If unsure, say N.
34
26config WANT_COMPAT_NETLINK_MESSAGES 35config WANT_COMPAT_NETLINK_MESSAGES
27 bool 36 bool
28 help 37 help
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a92d9e7d10f7..898cf5c566f9 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -179,6 +179,33 @@ out:
179 * 179 *
180 */ 180 */
181 181
182struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node)
183{
184 struct sk_buff *skb;
185
186 /* Get the HEAD */
187 skb = kmem_cache_alloc_node(skbuff_head_cache,
188 gfp_mask & ~__GFP_DMA, node);
189 if (!skb)
190 goto out;
191
192 /*
193 * Only clear those fields we need to clear, not those that we will
194 * actually initialise below. Hence, don't put any more fields after
195 * the tail pointer in struct sk_buff!
196 */
197 memset(skb, 0, offsetof(struct sk_buff, tail));
198 skb->data = NULL;
199 skb->truesize = sizeof(struct sk_buff);
200 atomic_set(&skb->users, 1);
201
202#ifdef NET_SKBUFF_DATA_USES_OFFSET
203 skb->mac_header = ~0U;
204#endif
205out:
206 return skb;
207}
208
182/** 209/**
183 * __alloc_skb - allocate a network buffer 210 * __alloc_skb - allocate a network buffer
184 * @size: size to allocate 211 * @size: size to allocate
@@ -584,7 +611,8 @@ static void skb_release_head_state(struct sk_buff *skb)
584static void skb_release_all(struct sk_buff *skb) 611static void skb_release_all(struct sk_buff *skb)
585{ 612{
586 skb_release_head_state(skb); 613 skb_release_head_state(skb);
587 skb_release_data(skb); 614 if (likely(skb->data))
615 skb_release_data(skb);
588} 616}
589 617
590/** 618/**
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 8620408af574..5f648751fce2 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -324,7 +324,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
324 } 324 }
325 325
326 err = sk_diag_fill(sk, rep, req, 326 err = sk_diag_fill(sk, rep, req,
327 sk_user_ns(NETLINK_CB(in_skb).ssk), 327 sk_user_ns(NETLINK_CB(in_skb).sk),
328 NETLINK_CB(in_skb).portid, 328 NETLINK_CB(in_skb).portid,
329 nlh->nlmsg_seq, 0, nlh); 329 nlh->nlmsg_seq, 0, nlh);
330 if (err < 0) { 330 if (err < 0) {
@@ -630,7 +630,7 @@ static int inet_csk_diag_dump(struct sock *sk,
630 return 0; 630 return 0;
631 631
632 return inet_csk_diag_fill(sk, skb, r, 632 return inet_csk_diag_fill(sk, skb, r,
633 sk_user_ns(NETLINK_CB(cb->skb).ssk), 633 sk_user_ns(NETLINK_CB(cb->skb).sk),
634 NETLINK_CB(cb->skb).portid, 634 NETLINK_CB(cb->skb).portid,
635 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); 635 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
636} 636}
@@ -805,7 +805,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
805 } 805 }
806 806
807 err = inet_diag_fill_req(skb, sk, req, 807 err = inet_diag_fill_req(skb, sk, req,
808 sk_user_ns(NETLINK_CB(cb->skb).ssk), 808 sk_user_ns(NETLINK_CB(cb->skb).sk),
809 NETLINK_CB(cb->skb).portid, 809 NETLINK_CB(cb->skb).portid,
810 cb->nlh->nlmsg_seq, cb->nlh); 810 cb->nlh->nlmsg_seq, cb->nlh);
811 if (err < 0) { 811 if (err < 0) {
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 369a781851ad..7927db0a9279 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -25,7 +25,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
25 return 0; 25 return 0;
26 26
27 return inet_sk_diag_fill(sk, NULL, skb, req, 27 return inet_sk_diag_fill(sk, NULL, skb, req,
28 sk_user_ns(NETLINK_CB(cb->skb).ssk), 28 sk_user_ns(NETLINK_CB(cb->skb).sk),
29 NETLINK_CB(cb->skb).portid, 29 NETLINK_CB(cb->skb).portid,
30 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); 30 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
31} 31}
@@ -71,7 +71,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
71 goto out; 71 goto out;
72 72
73 err = inet_sk_diag_fill(sk, NULL, rep, req, 73 err = inet_sk_diag_fill(sk, NULL, rep, req,
74 sk_user_ns(NETLINK_CB(in_skb).ssk), 74 sk_user_ns(NETLINK_CB(in_skb).sk),
75 NETLINK_CB(in_skb).portid, 75 NETLINK_CB(in_skb).portid,
76 nlh->nlmsg_seq, 0, nlh); 76 nlh->nlmsg_seq, 0, nlh);
77 if (err < 0) { 77 if (err < 0) {
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 007e8c43d19a..54ddc2f8e7c9 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1260,7 +1260,7 @@ void nf_ct_iterate_cleanup(struct net *net,
1260EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup); 1260EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
1261 1261
1262struct __nf_ct_flush_report { 1262struct __nf_ct_flush_report {
1263 u32 pid; 1263 u32 portid;
1264 int report; 1264 int report;
1265}; 1265};
1266 1266
@@ -1275,7 +1275,7 @@ static int kill_report(struct nf_conn *i, void *data)
1275 1275
1276 /* If we fail to deliver the event, death_by_timeout() will retry */ 1276 /* If we fail to deliver the event, death_by_timeout() will retry */
1277 if (nf_conntrack_event_report(IPCT_DESTROY, i, 1277 if (nf_conntrack_event_report(IPCT_DESTROY, i,
1278 fr->pid, fr->report) < 0) 1278 fr->portid, fr->report) < 0)
1279 return 1; 1279 return 1;
1280 1280
1281 /* Avoid the delivery of the destroy event in death_by_timeout(). */ 1281 /* Avoid the delivery of the destroy event in death_by_timeout(). */
@@ -1298,10 +1298,10 @@ void nf_ct_free_hashtable(void *hash, unsigned int size)
1298} 1298}
1299EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); 1299EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
1300 1300
1301void nf_conntrack_flush_report(struct net *net, u32 pid, int report) 1301void nf_conntrack_flush_report(struct net *net, u32 portid, int report)
1302{ 1302{
1303 struct __nf_ct_flush_report fr = { 1303 struct __nf_ct_flush_report fr = {
1304 .pid = pid, 1304 .portid = portid,
1305 .report = report, 1305 .report = report,
1306 }; 1306 };
1307 nf_ct_iterate_cleanup(net, kill_report, &fr); 1307 nf_ct_iterate_cleanup(net, kill_report, &fr);
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 8c10e3db3d9b..0adfdcc68bae 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -40,7 +40,7 @@ static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
40 40
41/* nf_conntrack_expect helper functions */ 41/* nf_conntrack_expect helper functions */
42void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, 42void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
43 u32 pid, int report) 43 u32 portid, int report)
44{ 44{
45 struct nf_conn_help *master_help = nfct_help(exp->master); 45 struct nf_conn_help *master_help = nfct_help(exp->master);
46 struct net *net = nf_ct_exp_net(exp); 46 struct net *net = nf_ct_exp_net(exp);
@@ -54,7 +54,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
54 hlist_del(&exp->lnode); 54 hlist_del(&exp->lnode);
55 master_help->expecting[exp->class]--; 55 master_help->expecting[exp->class]--;
56 56
57 nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report); 57 nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report);
58 nf_ct_expect_put(exp); 58 nf_ct_expect_put(exp);
59 59
60 NF_CT_STAT_INC(net, expect_delete); 60 NF_CT_STAT_INC(net, expect_delete);
@@ -412,7 +412,7 @@ out:
412} 412}
413 413
414int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, 414int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
415 u32 pid, int report) 415 u32 portid, int report)
416{ 416{
417 int ret; 417 int ret;
418 418
@@ -425,7 +425,7 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
425 if (ret < 0) 425 if (ret < 0)
426 goto out; 426 goto out;
427 spin_unlock_bh(&nf_conntrack_lock); 427 spin_unlock_bh(&nf_conntrack_lock);
428 nf_ct_expect_event_report(IPEXP_NEW, expect, pid, report); 428 nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
429 return ret; 429 return ret;
430out: 430out:
431 spin_unlock_bh(&nf_conntrack_lock); 431 spin_unlock_bh(&nf_conntrack_lock);
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index bc4c499adb13..572d87dc116f 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -112,22 +112,30 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group)
112} 112}
113EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); 113EXPORT_SYMBOL_GPL(nfnetlink_has_listeners);
114 114
115int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, 115struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size,
116 u32 dst_portid, gfp_t gfp_mask)
117{
118 return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask);
119}
120EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb);
121
122int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
116 unsigned int group, int echo, gfp_t flags) 123 unsigned int group, int echo, gfp_t flags)
117{ 124{
118 return nlmsg_notify(net->nfnl, skb, pid, group, echo, flags); 125 return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags);
119} 126}
120EXPORT_SYMBOL_GPL(nfnetlink_send); 127EXPORT_SYMBOL_GPL(nfnetlink_send);
121 128
122int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error) 129int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error)
123{ 130{
124 return netlink_set_err(net->nfnl, pid, group, error); 131 return netlink_set_err(net->nfnl, portid, group, error);
125} 132}
126EXPORT_SYMBOL_GPL(nfnetlink_set_err); 133EXPORT_SYMBOL_GPL(nfnetlink_set_err);
127 134
128int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags) 135int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid,
136 int flags)
129{ 137{
130 return netlink_unicast(net->nfnl, skb, pid, flags); 138 return netlink_unicast(net->nfnl, skb, portid, flags);
131} 139}
132EXPORT_SYMBOL_GPL(nfnetlink_unicast); 140EXPORT_SYMBOL_GPL(nfnetlink_unicast);
133 141
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 1a0be2af1dd8..d4199eb9b338 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -318,7 +318,7 @@ nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags)
318} 318}
319 319
320static struct sk_buff * 320static struct sk_buff *
321nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size) 321nfulnl_alloc_skb(u32 peer_portid, unsigned int inst_size, unsigned int pkt_size)
322{ 322{
323 struct sk_buff *skb; 323 struct sk_buff *skb;
324 unsigned int n; 324 unsigned int n;
@@ -327,13 +327,14 @@ nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size)
327 * message. WARNING: has to be <= 128k due to slab restrictions */ 327 * message. WARNING: has to be <= 128k due to slab restrictions */
328 328
329 n = max(inst_size, pkt_size); 329 n = max(inst_size, pkt_size);
330 skb = alloc_skb(n, GFP_ATOMIC); 330 skb = nfnetlink_alloc_skb(&init_net, n, peer_portid, GFP_ATOMIC);
331 if (!skb) { 331 if (!skb) {
332 if (n > pkt_size) { 332 if (n > pkt_size) {
333 /* try to allocate only as much as we need for current 333 /* try to allocate only as much as we need for current
334 * packet */ 334 * packet */
335 335
336 skb = alloc_skb(pkt_size, GFP_ATOMIC); 336 skb = nfnetlink_alloc_skb(&init_net, pkt_size,
337 peer_portid, GFP_ATOMIC);
337 if (!skb) 338 if (!skb)
338 pr_err("nfnetlink_log: can't even alloc %u bytes\n", 339 pr_err("nfnetlink_log: can't even alloc %u bytes\n",
339 pkt_size); 340 pkt_size);
@@ -696,7 +697,8 @@ nfulnl_log_packet(u_int8_t pf,
696 } 697 }
697 698
698 if (!inst->skb) { 699 if (!inst->skb) {
699 inst->skb = nfulnl_alloc_skb(inst->nlbufsiz, size); 700 inst->skb = nfulnl_alloc_skb(inst->peer_portid, inst->nlbufsiz,
701 size);
700 if (!inst->skb) 702 if (!inst->skb)
701 goto alloc_failure; 703 goto alloc_failure;
702 } 704 }
@@ -824,7 +826,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
824 826
825 inst = instance_create(net, group_num, 827 inst = instance_create(net, group_num,
826 NETLINK_CB(skb).portid, 828 NETLINK_CB(skb).portid,
827 sk_user_ns(NETLINK_CB(skb).ssk)); 829 sk_user_ns(NETLINK_CB(skb).sk));
828 if (IS_ERR(inst)) { 830 if (IS_ERR(inst)) {
829 ret = PTR_ERR(inst); 831 ret = PTR_ERR(inst);
830 goto out; 832 goto out;
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index 5e280b3e154f..ef3cdb4bfeea 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -339,7 +339,8 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
339 if (queue->flags & NFQA_CFG_F_CONNTRACK) 339 if (queue->flags & NFQA_CFG_F_CONNTRACK)
340 ct = nfqnl_ct_get(entskb, &size, &ctinfo); 340 ct = nfqnl_ct_get(entskb, &size, &ctinfo);
341 341
342 skb = alloc_skb(size, GFP_ATOMIC); 342 skb = nfnetlink_alloc_skb(&init_net, size, queue->peer_portid,
343 GFP_ATOMIC);
343 if (!skb) 344 if (!skb)
344 return NULL; 345 return NULL;
345 346
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index ce2e0064e7f6..2a3e9ba814c4 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> 4 * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>
5 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> 5 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
6 * Patrick McHardy <kaber@trash.net>
6 * 7 *
7 * This program is free software; you can redistribute it and/or 8 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License 9 * modify it under the terms of the GNU General Public License
@@ -55,6 +56,8 @@
55#include <linux/types.h> 56#include <linux/types.h>
56#include <linux/audit.h> 57#include <linux/audit.h>
57#include <linux/mutex.h> 58#include <linux/mutex.h>
59#include <linux/vmalloc.h>
60#include <asm/cacheflush.h>
58 61
59#include <net/net_namespace.h> 62#include <net/net_namespace.h>
60#include <net/sock.h> 63#include <net/sock.h>
@@ -68,6 +71,10 @@ struct listeners {
68 unsigned long masks[0]; 71 unsigned long masks[0];
69}; 72};
70 73
74/* state bits */
75#define NETLINK_CONGESTED 0x0
76
77/* flags */
71#define NETLINK_KERNEL_SOCKET 0x1 78#define NETLINK_KERNEL_SOCKET 0x1
72#define NETLINK_RECV_PKTINFO 0x2 79#define NETLINK_RECV_PKTINFO 0x2
73#define NETLINK_BROADCAST_SEND_ERROR 0x4 80#define NETLINK_BROADCAST_SEND_ERROR 0x4
@@ -84,6 +91,7 @@ EXPORT_SYMBOL_GPL(nl_table);
84static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); 91static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
85 92
86static int netlink_dump(struct sock *sk); 93static int netlink_dump(struct sock *sk);
94static void netlink_skb_destructor(struct sk_buff *skb);
87 95
88DEFINE_RWLOCK(nl_table_lock); 96DEFINE_RWLOCK(nl_table_lock);
89EXPORT_SYMBOL_GPL(nl_table_lock); 97EXPORT_SYMBOL_GPL(nl_table_lock);
@@ -103,6 +111,599 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u
103 return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask]; 111 return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask];
104} 112}
105 113
114static void netlink_overrun(struct sock *sk)
115{
116 struct netlink_sock *nlk = nlk_sk(sk);
117
118 if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) {
119 if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) {
120 sk->sk_err = ENOBUFS;
121 sk->sk_error_report(sk);
122 }
123 }
124 atomic_inc(&sk->sk_drops);
125}
126
127static void netlink_rcv_wake(struct sock *sk)
128{
129 struct netlink_sock *nlk = nlk_sk(sk);
130
131 if (skb_queue_empty(&sk->sk_receive_queue))
132 clear_bit(NETLINK_CONGESTED, &nlk->state);
133 if (!test_bit(NETLINK_CONGESTED, &nlk->state))
134 wake_up_interruptible(&nlk->wait);
135}
136
137#ifdef CONFIG_NETLINK_MMAP
138static bool netlink_skb_is_mmaped(const struct sk_buff *skb)
139{
140 return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED;
141}
142
143static bool netlink_rx_is_mmaped(struct sock *sk)
144{
145 return nlk_sk(sk)->rx_ring.pg_vec != NULL;
146}
147
148static bool netlink_tx_is_mmaped(struct sock *sk)
149{
150 return nlk_sk(sk)->tx_ring.pg_vec != NULL;
151}
152
153static __pure struct page *pgvec_to_page(const void *addr)
154{
155 if (is_vmalloc_addr(addr))
156 return vmalloc_to_page(addr);
157 else
158 return virt_to_page(addr);
159}
160
161static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
162{
163 unsigned int i;
164
165 for (i = 0; i < len; i++) {
166 if (pg_vec[i] != NULL) {
167 if (is_vmalloc_addr(pg_vec[i]))
168 vfree(pg_vec[i]);
169 else
170 free_pages((unsigned long)pg_vec[i], order);
171 }
172 }
173 kfree(pg_vec);
174}
175
176static void *alloc_one_pg_vec_page(unsigned long order)
177{
178 void *buffer;
179 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
180 __GFP_NOWARN | __GFP_NORETRY;
181
182 buffer = (void *)__get_free_pages(gfp_flags, order);
183 if (buffer != NULL)
184 return buffer;
185
186 buffer = vzalloc((1 << order) * PAGE_SIZE);
187 if (buffer != NULL)
188 return buffer;
189
190 gfp_flags &= ~__GFP_NORETRY;
191 return (void *)__get_free_pages(gfp_flags, order);
192}
193
194static void **alloc_pg_vec(struct netlink_sock *nlk,
195 struct nl_mmap_req *req, unsigned int order)
196{
197 unsigned int block_nr = req->nm_block_nr;
198 unsigned int i;
199 void **pg_vec, *ptr;
200
201 pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
202 if (pg_vec == NULL)
203 return NULL;
204
205 for (i = 0; i < block_nr; i++) {
206 pg_vec[i] = ptr = alloc_one_pg_vec_page(order);
207 if (pg_vec[i] == NULL)
208 goto err1;
209 }
210
211 return pg_vec;
212err1:
213 free_pg_vec(pg_vec, order, block_nr);
214 return NULL;
215}
216
217static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
218 bool closing, bool tx_ring)
219{
220 struct netlink_sock *nlk = nlk_sk(sk);
221 struct netlink_ring *ring;
222 struct sk_buff_head *queue;
223 void **pg_vec = NULL;
224 unsigned int order = 0;
225 int err;
226
227 ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
228 queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
229
230 if (!closing) {
231 if (atomic_read(&nlk->mapped))
232 return -EBUSY;
233 if (atomic_read(&ring->pending))
234 return -EBUSY;
235 }
236
237 if (req->nm_block_nr) {
238 if (ring->pg_vec != NULL)
239 return -EBUSY;
240
241 if ((int)req->nm_block_size <= 0)
242 return -EINVAL;
243 if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE))
244 return -EINVAL;
245 if (req->nm_frame_size < NL_MMAP_HDRLEN)
246 return -EINVAL;
247 if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
248 return -EINVAL;
249
250 ring->frames_per_block = req->nm_block_size /
251 req->nm_frame_size;
252 if (ring->frames_per_block == 0)
253 return -EINVAL;
254 if (ring->frames_per_block * req->nm_block_nr !=
255 req->nm_frame_nr)
256 return -EINVAL;
257
258 order = get_order(req->nm_block_size);
259 pg_vec = alloc_pg_vec(nlk, req, order);
260 if (pg_vec == NULL)
261 return -ENOMEM;
262 } else {
263 if (req->nm_frame_nr)
264 return -EINVAL;
265 }
266
267 err = -EBUSY;
268 mutex_lock(&nlk->pg_vec_lock);
269 if (closing || atomic_read(&nlk->mapped) == 0) {
270 err = 0;
271 spin_lock_bh(&queue->lock);
272
273 ring->frame_max = req->nm_frame_nr - 1;
274 ring->head = 0;
275 ring->frame_size = req->nm_frame_size;
276 ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE;
277
278 swap(ring->pg_vec_len, req->nm_block_nr);
279 swap(ring->pg_vec_order, order);
280 swap(ring->pg_vec, pg_vec);
281
282 __skb_queue_purge(queue);
283 spin_unlock_bh(&queue->lock);
284
285 WARN_ON(atomic_read(&nlk->mapped));
286 }
287 mutex_unlock(&nlk->pg_vec_lock);
288
289 if (pg_vec)
290 free_pg_vec(pg_vec, order, req->nm_block_nr);
291 return err;
292}
293
294static void netlink_mm_open(struct vm_area_struct *vma)
295{
296 struct file *file = vma->vm_file;
297 struct socket *sock = file->private_data;
298 struct sock *sk = sock->sk;
299
300 if (sk)
301 atomic_inc(&nlk_sk(sk)->mapped);
302}
303
304static void netlink_mm_close(struct vm_area_struct *vma)
305{
306 struct file *file = vma->vm_file;
307 struct socket *sock = file->private_data;
308 struct sock *sk = sock->sk;
309
310 if (sk)
311 atomic_dec(&nlk_sk(sk)->mapped);
312}
313
314static const struct vm_operations_struct netlink_mmap_ops = {
315 .open = netlink_mm_open,
316 .close = netlink_mm_close,
317};
318
319static int netlink_mmap(struct file *file, struct socket *sock,
320 struct vm_area_struct *vma)
321{
322 struct sock *sk = sock->sk;
323 struct netlink_sock *nlk = nlk_sk(sk);
324 struct netlink_ring *ring;
325 unsigned long start, size, expected;
326 unsigned int i;
327 int err = -EINVAL;
328
329 if (vma->vm_pgoff)
330 return -EINVAL;
331
332 mutex_lock(&nlk->pg_vec_lock);
333
334 expected = 0;
335 for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
336 if (ring->pg_vec == NULL)
337 continue;
338 expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
339 }
340
341 if (expected == 0)
342 goto out;
343
344 size = vma->vm_end - vma->vm_start;
345 if (size != expected)
346 goto out;
347
348 start = vma->vm_start;
349 for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
350 if (ring->pg_vec == NULL)
351 continue;
352
353 for (i = 0; i < ring->pg_vec_len; i++) {
354 struct page *page;
355 void *kaddr = ring->pg_vec[i];
356 unsigned int pg_num;
357
358 for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
359 page = pgvec_to_page(kaddr);
360 err = vm_insert_page(vma, start, page);
361 if (err < 0)
362 goto out;
363 start += PAGE_SIZE;
364 kaddr += PAGE_SIZE;
365 }
366 }
367 }
368
369 atomic_inc(&nlk->mapped);
370 vma->vm_ops = &netlink_mmap_ops;
371 err = 0;
372out:
373 mutex_unlock(&nlk->pg_vec_lock);
374 return 0;
375}
376
377static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr)
378{
379#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
380 struct page *p_start, *p_end;
381
382 /* First page is flushed through netlink_{get,set}_status */
383 p_start = pgvec_to_page(hdr + PAGE_SIZE);
384 p_end = pgvec_to_page((void *)hdr + NL_MMAP_MSG_HDRLEN + hdr->nm_len - 1);
385 while (p_start <= p_end) {
386 flush_dcache_page(p_start);
387 p_start++;
388 }
389#endif
390}
391
392static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr)
393{
394 smp_rmb();
395 flush_dcache_page(pgvec_to_page(hdr));
396 return hdr->nm_status;
397}
398
399static void netlink_set_status(struct nl_mmap_hdr *hdr,
400 enum nl_mmap_status status)
401{
402 hdr->nm_status = status;
403 flush_dcache_page(pgvec_to_page(hdr));
404 smp_wmb();
405}
406
407static struct nl_mmap_hdr *
408__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos)
409{
410 unsigned int pg_vec_pos, frame_off;
411
412 pg_vec_pos = pos / ring->frames_per_block;
413 frame_off = pos % ring->frames_per_block;
414
415 return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size);
416}
417
418static struct nl_mmap_hdr *
419netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos,
420 enum nl_mmap_status status)
421{
422 struct nl_mmap_hdr *hdr;
423
424 hdr = __netlink_lookup_frame(ring, pos);
425 if (netlink_get_status(hdr) != status)
426 return NULL;
427
428 return hdr;
429}
430
431static struct nl_mmap_hdr *
432netlink_current_frame(const struct netlink_ring *ring,
433 enum nl_mmap_status status)
434{
435 return netlink_lookup_frame(ring, ring->head, status);
436}
437
438static struct nl_mmap_hdr *
439netlink_previous_frame(const struct netlink_ring *ring,
440 enum nl_mmap_status status)
441{
442 unsigned int prev;
443
444 prev = ring->head ? ring->head - 1 : ring->frame_max;
445 return netlink_lookup_frame(ring, prev, status);
446}
447
448static void netlink_increment_head(struct netlink_ring *ring)
449{
450 ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0;
451}
452
453static void netlink_forward_ring(struct netlink_ring *ring)
454{
455 unsigned int head = ring->head, pos = head;
456 const struct nl_mmap_hdr *hdr;
457
458 do {
459 hdr = __netlink_lookup_frame(ring, pos);
460 if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
461 break;
462 if (hdr->nm_status != NL_MMAP_STATUS_SKIP)
463 break;
464 netlink_increment_head(ring);
465 } while (ring->head != head);
466}
467
468static bool netlink_dump_space(struct netlink_sock *nlk)
469{
470 struct netlink_ring *ring = &nlk->rx_ring;
471 struct nl_mmap_hdr *hdr;
472 unsigned int n;
473
474 hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
475 if (hdr == NULL)
476 return false;
477
478 n = ring->head + ring->frame_max / 2;
479 if (n > ring->frame_max)
480 n -= ring->frame_max;
481
482 hdr = __netlink_lookup_frame(ring, n);
483
484 return hdr->nm_status == NL_MMAP_STATUS_UNUSED;
485}
486
487static unsigned int netlink_poll(struct file *file, struct socket *sock,
488 poll_table *wait)
489{
490 struct sock *sk = sock->sk;
491 struct netlink_sock *nlk = nlk_sk(sk);
492 unsigned int mask;
493 int err;
494
495 if (nlk->rx_ring.pg_vec != NULL) {
496 /* Memory mapped sockets don't call recvmsg(), so flow control
497 * for dumps is performed here. A dump is allowed to continue
498 * if at least half the ring is unused.
499 */
500 while (nlk->cb != NULL && netlink_dump_space(nlk)) {
501 err = netlink_dump(sk);
502 if (err < 0) {
503 sk->sk_err = err;
504 sk->sk_error_report(sk);
505 break;
506 }
507 }
508 netlink_rcv_wake(sk);
509 }
510
511 mask = datagram_poll(file, sock, wait);
512
513 spin_lock_bh(&sk->sk_receive_queue.lock);
514 if (nlk->rx_ring.pg_vec) {
515 netlink_forward_ring(&nlk->rx_ring);
516 if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED))
517 mask |= POLLIN | POLLRDNORM;
518 }
519 spin_unlock_bh(&sk->sk_receive_queue.lock);
520
521 spin_lock_bh(&sk->sk_write_queue.lock);
522 if (nlk->tx_ring.pg_vec) {
523 if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
524 mask |= POLLOUT | POLLWRNORM;
525 }
526 spin_unlock_bh(&sk->sk_write_queue.lock);
527
528 return mask;
529}
530
531static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb)
532{
533 return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN);
534}
535
536static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
537 struct netlink_ring *ring,
538 struct nl_mmap_hdr *hdr)
539{
540 unsigned int size;
541 void *data;
542
543 size = ring->frame_size - NL_MMAP_HDRLEN;
544 data = (void *)hdr + NL_MMAP_HDRLEN;
545
546 skb->head = data;
547 skb->data = data;
548 skb_reset_tail_pointer(skb);
549 skb->end = skb->tail + size;
550 skb->len = 0;
551
552 skb->destructor = netlink_skb_destructor;
553 NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
554 NETLINK_CB(skb).sk = sk;
555}
556
557static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
558 u32 dst_portid, u32 dst_group,
559 struct sock_iocb *siocb)
560{
561 struct netlink_sock *nlk = nlk_sk(sk);
562 struct netlink_ring *ring;
563 struct nl_mmap_hdr *hdr;
564 struct sk_buff *skb;
565 unsigned int maxlen;
566 bool excl = true;
567 int err = 0, len = 0;
568
569 /* Netlink messages are validated by the receiver before processing.
570 * In order to avoid userspace changing the contents of the message
571 * after validation, the socket and the ring may only be used by a
572 * single process, otherwise we fall back to copying.
573 */
574 if (atomic_long_read(&sk->sk_socket->file->f_count) > 2 ||
575 atomic_read(&nlk->mapped) > 1)
576 excl = false;
577
578 mutex_lock(&nlk->pg_vec_lock);
579
580 ring = &nlk->tx_ring;
581 maxlen = ring->frame_size - NL_MMAP_HDRLEN;
582
583 do {
584 hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
585 if (hdr == NULL) {
586 if (!(msg->msg_flags & MSG_DONTWAIT) &&
587 atomic_read(&nlk->tx_ring.pending))
588 schedule();
589 continue;
590 }
591 if (hdr->nm_len > maxlen) {
592 err = -EINVAL;
593 goto out;
594 }
595
596 netlink_frame_flush_dcache(hdr);
597
598 if (likely(dst_portid == 0 && dst_group == 0 && excl)) {
599 skb = alloc_skb_head(GFP_KERNEL);
600 if (skb == NULL) {
601 err = -ENOBUFS;
602 goto out;
603 }
604 sock_hold(sk);
605 netlink_ring_setup_skb(skb, sk, ring, hdr);
606 NETLINK_CB(skb).flags |= NETLINK_SKB_TX;
607 __skb_put(skb, hdr->nm_len);
608 netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
609 atomic_inc(&ring->pending);
610 } else {
611 skb = alloc_skb(hdr->nm_len, GFP_KERNEL);
612 if (skb == NULL) {
613 err = -ENOBUFS;
614 goto out;
615 }
616 __skb_put(skb, hdr->nm_len);
617 memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, hdr->nm_len);
618 netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
619 }
620
621 netlink_increment_head(ring);
622
623 NETLINK_CB(skb).portid = nlk->portid;
624 NETLINK_CB(skb).dst_group = dst_group;
625 NETLINK_CB(skb).creds = siocb->scm->creds;
626
627 err = security_netlink_send(sk, skb);
628 if (err) {
629 kfree_skb(skb);
630 goto out;
631 }
632
633 if (unlikely(dst_group)) {
634 atomic_inc(&skb->users);
635 netlink_broadcast(sk, skb, dst_portid, dst_group,
636 GFP_KERNEL);
637 }
638 err = netlink_unicast(sk, skb, dst_portid,
639 msg->msg_flags & MSG_DONTWAIT);
640 if (err < 0)
641 goto out;
642 len += err;
643
644 } while (hdr != NULL ||
645 (!(msg->msg_flags & MSG_DONTWAIT) &&
646 atomic_read(&nlk->tx_ring.pending)));
647
648 if (len > 0)
649 err = len;
650out:
651 mutex_unlock(&nlk->pg_vec_lock);
652 return err;
653}
654
655static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
656{
657 struct nl_mmap_hdr *hdr;
658
659 hdr = netlink_mmap_hdr(skb);
660 hdr->nm_len = skb->len;
661 hdr->nm_group = NETLINK_CB(skb).dst_group;
662 hdr->nm_pid = NETLINK_CB(skb).creds.pid;
663 hdr->nm_uid = NETLINK_CB(skb).creds.uid;
664 hdr->nm_gid = NETLINK_CB(skb).creds.gid;
665 netlink_frame_flush_dcache(hdr);
666 netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
667
668 NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED;
669 kfree_skb(skb);
670}
671
672static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
673{
674 struct netlink_sock *nlk = nlk_sk(sk);
675 struct netlink_ring *ring = &nlk->rx_ring;
676 struct nl_mmap_hdr *hdr;
677
678 spin_lock_bh(&sk->sk_receive_queue.lock);
679 hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
680 if (hdr == NULL) {
681 spin_unlock_bh(&sk->sk_receive_queue.lock);
682 kfree_skb(skb);
683 netlink_overrun(sk);
684 return;
685 }
686 netlink_increment_head(ring);
687 __skb_queue_tail(&sk->sk_receive_queue, skb);
688 spin_unlock_bh(&sk->sk_receive_queue.lock);
689
690 hdr->nm_len = skb->len;
691 hdr->nm_group = NETLINK_CB(skb).dst_group;
692 hdr->nm_pid = NETLINK_CB(skb).creds.pid;
693 hdr->nm_uid = NETLINK_CB(skb).creds.uid;
694 hdr->nm_gid = NETLINK_CB(skb).creds.gid;
695 netlink_set_status(hdr, NL_MMAP_STATUS_COPY);
696}
697
698#else /* CONFIG_NETLINK_MMAP */
699#define netlink_skb_is_mmaped(skb) false
700#define netlink_rx_is_mmaped(sk) false
701#define netlink_tx_is_mmaped(sk) false
702#define netlink_mmap sock_no_mmap
703#define netlink_poll datagram_poll
704#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, siocb) 0
705#endif /* CONFIG_NETLINK_MMAP */
706
106static void netlink_destroy_callback(struct netlink_callback *cb) 707static void netlink_destroy_callback(struct netlink_callback *cb)
107{ 708{
108 kfree_skb(cb->skb); 709 kfree_skb(cb->skb);
@@ -115,6 +716,53 @@ static void netlink_consume_callback(struct netlink_callback *cb)
115 kfree(cb); 716 kfree(cb);
116} 717}
117 718
719static void netlink_skb_destructor(struct sk_buff *skb)
720{
721#ifdef CONFIG_NETLINK_MMAP
722 struct nl_mmap_hdr *hdr;
723 struct netlink_ring *ring;
724 struct sock *sk;
725
726 /* If a packet from the kernel to userspace was freed because of an
727 * error without being delivered to userspace, the kernel must reset
728 * the status. In the direction userspace to kernel, the status is
729 * always reset here after the packet was processed and freed.
730 */
731 if (netlink_skb_is_mmaped(skb)) {
732 hdr = netlink_mmap_hdr(skb);
733 sk = NETLINK_CB(skb).sk;
734
735 if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
736 netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
737 ring = &nlk_sk(sk)->tx_ring;
738 } else {
739 if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
740 hdr->nm_len = 0;
741 netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
742 }
743 ring = &nlk_sk(sk)->rx_ring;
744 }
745
746 WARN_ON(atomic_read(&ring->pending) == 0);
747 atomic_dec(&ring->pending);
748 sock_put(sk);
749
750 skb->data = NULL;
751 }
752#endif
753 if (skb->sk != NULL)
754 sock_rfree(skb);
755}
756
757static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
758{
759 WARN_ON(skb->sk != NULL);
760 skb->sk = sk;
761 skb->destructor = netlink_skb_destructor;
762 atomic_add(skb->truesize, &sk->sk_rmem_alloc);
763 sk_mem_charge(sk, skb->truesize);
764}
765
118static void netlink_sock_destruct(struct sock *sk) 766static void netlink_sock_destruct(struct sock *sk)
119{ 767{
120 struct netlink_sock *nlk = nlk_sk(sk); 768 struct netlink_sock *nlk = nlk_sk(sk);
@@ -128,6 +776,18 @@ static void netlink_sock_destruct(struct sock *sk)
128 } 776 }
129 777
130 skb_queue_purge(&sk->sk_receive_queue); 778 skb_queue_purge(&sk->sk_receive_queue);
779#ifdef CONFIG_NETLINK_MMAP
780 if (1) {
781 struct nl_mmap_req req;
782
783 memset(&req, 0, sizeof(req));
784 if (nlk->rx_ring.pg_vec)
785 netlink_set_ring(sk, &req, true, false);
786 memset(&req, 0, sizeof(req));
787 if (nlk->tx_ring.pg_vec)
788 netlink_set_ring(sk, &req, true, true);
789 }
790#endif /* CONFIG_NETLINK_MMAP */
131 791
132 if (!sock_flag(sk, SOCK_DEAD)) { 792 if (!sock_flag(sk, SOCK_DEAD)) {
133 printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); 793 printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
@@ -391,6 +1051,9 @@ static int __netlink_create(struct net *net, struct socket *sock,
391 mutex_init(nlk->cb_mutex); 1051 mutex_init(nlk->cb_mutex);
392 } 1052 }
393 init_waitqueue_head(&nlk->wait); 1053 init_waitqueue_head(&nlk->wait);
1054#ifdef CONFIG_NETLINK_MMAP
1055 mutex_init(&nlk->pg_vec_lock);
1056#endif
394 1057
395 sk->sk_destruct = netlink_sock_destruct; 1058 sk->sk_destruct = netlink_sock_destruct;
396 sk->sk_protocol = protocol; 1059 sk->sk_protocol = protocol;
@@ -722,19 +1385,6 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr,
722 return 0; 1385 return 0;
723} 1386}
724 1387
725static void netlink_overrun(struct sock *sk)
726{
727 struct netlink_sock *nlk = nlk_sk(sk);
728
729 if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) {
730 if (!test_and_set_bit(0, &nlk_sk(sk)->state)) {
731 sk->sk_err = ENOBUFS;
732 sk->sk_error_report(sk);
733 }
734 }
735 atomic_inc(&sk->sk_drops);
736}
737
738static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) 1388static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)
739{ 1389{
740 struct sock *sock; 1390 struct sock *sock;
@@ -787,8 +1437,9 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
787 1437
788 nlk = nlk_sk(sk); 1438 nlk = nlk_sk(sk);
789 1439
790	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
791	    test_bit(0, &nlk->state)) {
1440	if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
1441	     test_bit(NETLINK_CONGESTED, &nlk->state)) &&
1442	    !netlink_skb_is_mmaped(skb)) {
792 DECLARE_WAITQUEUE(wait, current); 1443 DECLARE_WAITQUEUE(wait, current);
793 if (!*timeo) { 1444 if (!*timeo) {
794 if (!ssk || netlink_is_kernel(ssk)) 1445 if (!ssk || netlink_is_kernel(ssk))
@@ -802,7 +1453,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
802 add_wait_queue(&nlk->wait, &wait); 1453 add_wait_queue(&nlk->wait, &wait);
803 1454
804 if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || 1455 if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
805	    test_bit(0, &nlk->state)) &&
1456	    test_bit(NETLINK_CONGESTED, &nlk->state)) &&
806 !sock_flag(sk, SOCK_DEAD)) 1457 !sock_flag(sk, SOCK_DEAD))
807 *timeo = schedule_timeout(*timeo); 1458 *timeo = schedule_timeout(*timeo);
808 1459
@@ -816,7 +1467,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
816 } 1467 }
817 return 1; 1468 return 1;
818 } 1469 }
819	skb_set_owner_r(skb, sk);
1470	netlink_skb_set_owner_r(skb, sk);
820 return 0; 1471 return 0;
821} 1472}
822 1473
@@ -824,7 +1475,14 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
824{ 1475{
825 int len = skb->len; 1476 int len = skb->len;
826 1477
827	skb_queue_tail(&sk->sk_receive_queue, skb);
1478#ifdef CONFIG_NETLINK_MMAP
1479 if (netlink_skb_is_mmaped(skb))
1480 netlink_queue_mmaped_skb(sk, skb);
1481 else if (netlink_rx_is_mmaped(sk))
1482 netlink_ring_set_copied(sk, skb);
1483 else
1484#endif /* CONFIG_NETLINK_MMAP */
1485 skb_queue_tail(&sk->sk_receive_queue, skb);
828 sk->sk_data_ready(sk, len); 1486 sk->sk_data_ready(sk, len);
829 return len; 1487 return len;
830} 1488}
@@ -847,7 +1505,9 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
847{ 1505{
848 int delta; 1506 int delta;
849 1507
850	skb_orphan(skb);
1508	WARN_ON(skb->sk != NULL);
1509 if (netlink_skb_is_mmaped(skb))
1510 return skb;
851 1511
852 delta = skb->end - skb->tail; 1512 delta = skb->end - skb->tail;
853 if (delta * 2 < skb->truesize) 1513 if (delta * 2 < skb->truesize)
@@ -867,16 +1527,6 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
867 return skb; 1527 return skb;
868} 1528}
869 1529
870static void netlink_rcv_wake(struct sock *sk)
871{
872 struct netlink_sock *nlk = nlk_sk(sk);
873
874 if (skb_queue_empty(&sk->sk_receive_queue))
875 clear_bit(0, &nlk->state);
876 if (!test_bit(0, &nlk->state))
877 wake_up_interruptible(&nlk->wait);
878}
879
880static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, 1530static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
881 struct sock *ssk) 1531 struct sock *ssk)
882{ 1532{
@@ -886,8 +1536,8 @@ static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
886 ret = -ECONNREFUSED; 1536 ret = -ECONNREFUSED;
887 if (nlk->netlink_rcv != NULL) { 1537 if (nlk->netlink_rcv != NULL) {
888 ret = skb->len; 1538 ret = skb->len;
889		skb_set_owner_r(skb, sk);
890		NETLINK_CB(skb).ssk = ssk;
1539		netlink_skb_set_owner_r(skb, sk);
1540		NETLINK_CB(skb).sk = ssk;
891 nlk->netlink_rcv(skb); 1541 nlk->netlink_rcv(skb);
892 consume_skb(skb); 1542 consume_skb(skb);
893 } else { 1543 } else {
@@ -933,6 +1583,69 @@ retry:
933} 1583}
934EXPORT_SYMBOL(netlink_unicast); 1584EXPORT_SYMBOL(netlink_unicast);
935 1585
1586struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
1587 u32 dst_portid, gfp_t gfp_mask)
1588{
1589#ifdef CONFIG_NETLINK_MMAP
1590 struct sock *sk = NULL;
1591 struct sk_buff *skb;
1592 struct netlink_ring *ring;
1593 struct nl_mmap_hdr *hdr;
1594 unsigned int maxlen;
1595
1596 sk = netlink_getsockbyportid(ssk, dst_portid);
1597 if (IS_ERR(sk))
1598 goto out;
1599
1600 ring = &nlk_sk(sk)->rx_ring;
1601 /* fast-path without atomic ops for common case: non-mmaped receiver */
1602 if (ring->pg_vec == NULL)
1603 goto out_put;
1604
1605 skb = alloc_skb_head(gfp_mask);
1606 if (skb == NULL)
1607 goto err1;
1608
1609 spin_lock_bh(&sk->sk_receive_queue.lock);
1610 /* check again under lock */
1611 if (ring->pg_vec == NULL)
1612 goto out_free;
1613
1614 maxlen = ring->frame_size - NL_MMAP_HDRLEN;
1615 if (maxlen < size)
1616 goto out_free;
1617
1618 netlink_forward_ring(ring);
1619 hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
1620 if (hdr == NULL)
1621 goto err2;
1622 netlink_ring_setup_skb(skb, sk, ring, hdr);
1623 netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
1624 atomic_inc(&ring->pending);
1625 netlink_increment_head(ring);
1626
1627 spin_unlock_bh(&sk->sk_receive_queue.lock);
1628 return skb;
1629
1630err2:
1631 kfree_skb(skb);
1632 spin_unlock_bh(&sk->sk_receive_queue.lock);
1633 netlink_overrun(sk);
1634err1:
1635 sock_put(sk);
1636 return NULL;
1637
1638out_free:
1639 kfree_skb(skb);
1640 spin_unlock_bh(&sk->sk_receive_queue.lock);
1641out_put:
1642 sock_put(sk);
1643out:
1644#endif
1645 return alloc_skb(size, gfp_mask);
1646}
1647EXPORT_SYMBOL_GPL(netlink_alloc_skb);
1648
936int netlink_has_listeners(struct sock *sk, unsigned int group) 1649int netlink_has_listeners(struct sock *sk, unsigned int group)
937{ 1650{
938 int res = 0; 1651 int res = 0;
@@ -957,8 +1670,8 @@ static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
957 struct netlink_sock *nlk = nlk_sk(sk); 1670 struct netlink_sock *nlk = nlk_sk(sk);
958 1671
959 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && 1672 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
960	    !test_bit(0, &nlk->state)) {
961		skb_set_owner_r(skb, sk);
1673	    !test_bit(NETLINK_CONGESTED, &nlk->state)) {
1674		netlink_skb_set_owner_r(skb, sk);
962 __netlink_sendskb(sk, skb); 1675 __netlink_sendskb(sk, skb);
963 return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); 1676 return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);
964 } 1677 }
@@ -1193,7 +1906,8 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
1193 if (level != SOL_NETLINK) 1906 if (level != SOL_NETLINK)
1194 return -ENOPROTOOPT; 1907 return -ENOPROTOOPT;
1195 1908
1196	if (optlen >= sizeof(int) &&
1909	if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
1910	    optlen >= sizeof(int) &&
1197 get_user(val, (unsigned int __user *)optval)) 1911 get_user(val, (unsigned int __user *)optval))
1198 return -EFAULT; 1912 return -EFAULT;
1199 1913
@@ -1235,13 +1949,32 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
1235 case NETLINK_NO_ENOBUFS: 1949 case NETLINK_NO_ENOBUFS:
1236 if (val) { 1950 if (val) {
1237 nlk->flags |= NETLINK_RECV_NO_ENOBUFS; 1951 nlk->flags |= NETLINK_RECV_NO_ENOBUFS;
1238			clear_bit(0, &nlk->state);
1952			clear_bit(NETLINK_CONGESTED, &nlk->state);
1239 wake_up_interruptible(&nlk->wait); 1953 wake_up_interruptible(&nlk->wait);
1240 } else { 1954 } else {
1241 nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS; 1955 nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS;
1242 } 1956 }
1243 err = 0; 1957 err = 0;
1244 break; 1958 break;
1959#ifdef CONFIG_NETLINK_MMAP
1960 case NETLINK_RX_RING:
1961 case NETLINK_TX_RING: {
1962 struct nl_mmap_req req;
1963
1964 /* Rings might consume more memory than queue limits, require
1965 * CAP_NET_ADMIN.
1966 */
1967 if (!capable(CAP_NET_ADMIN))
1968 return -EPERM;
1969 if (optlen < sizeof(req))
1970 return -EINVAL;
1971 if (copy_from_user(&req, optval, sizeof(req)))
1972 return -EFAULT;
1973 err = netlink_set_ring(sk, &req, false,
1974 optname == NETLINK_TX_RING);
1975 break;
1976 }
1977#endif /* CONFIG_NETLINK_MMAP */
1245 default: 1978 default:
1246 err = -ENOPROTOOPT; 1979 err = -ENOPROTOOPT;
1247 } 1980 }
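From user space these two options are used together with mmap() roughly as follows; a sketch assuming the nl_mmap_req layout from the uapi header extended by this patch, a caller with CAP_NET_ADMIN (see the check above), and both rings placed back to back in a single mapping, RX first and TX behind it. The geometry values are examples only.

	#include <sys/mman.h>
	#include <sys/socket.h>
	#include <linux/netlink.h>

	#ifndef SOL_NETLINK
	#define SOL_NETLINK	270
	#endif

	/* Example geometry: 8 blocks of 16 KiB per ring, 2 KiB frames,
	 * i.e. 64 frames per ring. */
	static void *setup_rings(int fd, struct nl_mmap_req *req)
	{
		size_t ring_size;
		void *rings;

		req->nm_block_size = 16384;
		req->nm_block_nr   = 8;
		req->nm_frame_size = 2048;
		req->nm_frame_nr   = 64;
		ring_size = (size_t)req->nm_block_size * req->nm_block_nr;

		if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, req, sizeof(*req)) < 0)
			return NULL;
		if (setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, req, sizeof(*req)) < 0)
			return NULL;

		/* one mapping covering the RX ring followed by the TX ring */
		rings = mmap(NULL, 2 * ring_size, PROT_READ | PROT_WRITE,
			     MAP_SHARED, fd, 0);
		return rings == MAP_FAILED ? NULL : rings;
	}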
@@ -1352,6 +2085,13 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
1352 goto out; 2085 goto out;
1353 } 2086 }
1354 2087
2088 if (netlink_tx_is_mmaped(sk) &&
2089 msg->msg_iov->iov_base == NULL) {
2090 err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
2091 siocb);
2092 goto out;
2093 }
2094
1355 err = -EMSGSIZE; 2095 err = -EMSGSIZE;
1356 if (len > sk->sk_sndbuf - 32) 2096 if (len > sk->sk_sndbuf - 32)
1357 goto out; 2097 goto out;
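The new branch above is what a ring transmitter triggers: the message is placed in a TX-ring frame rather than passed through the iovec, and sendmsg() is called with iov_base == NULL so the kernel walks the ring instead of copying from user memory; frames are returned to NL_MMAP_STATUS_UNUSED by the skb destructor once processed. A user-space sketch, with tx_ring/frame_size/frame_nr taken from the setup shown earlier, the socket addressed to the kernel, and simplified error handling:

	#include <string.h>
	#include <sys/socket.h>
	#include <sys/uio.h>
	#include <linux/netlink.h>

	static unsigned int tx_frame;	/* current position in the TX ring */

	static int tx_ring_send(int fd, char *tx_ring, unsigned int frame_size,
				unsigned int frame_nr, const void *msg,
				unsigned int len)
	{
		struct nl_mmap_hdr *hdr =
			(struct nl_mmap_hdr *)(tx_ring + tx_frame * frame_size);
		struct sockaddr_nl addr = { .nl_family = AF_NETLINK };
		struct iovec iov = { .iov_base = NULL, .iov_len = 0 };
		struct msghdr mh = {
			.msg_name	= &addr,
			.msg_namelen	= sizeof(addr),
			.msg_iov	= &iov,
			.msg_iovlen	= 1,
		};

		if (len > frame_size - NL_MMAP_HDRLEN)
			return -1;	/* message does not fit into one frame */
		if (hdr->nm_status != NL_MMAP_STATUS_UNUSED)
			return -1;	/* frame not yet released by the kernel */

		memcpy((char *)hdr + NL_MMAP_HDRLEN, msg, len);
		hdr->nm_len = len;
		hdr->nm_status = NL_MMAP_STATUS_VALID;
		tx_frame = (tx_frame + 1) % frame_nr;

		return sendmsg(fd, &mh, 0);
	}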
@@ -1684,9 +2424,13 @@ static int netlink_dump(struct sock *sk)
1684 2424
1685 alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); 2425 alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);
1686 2426
1687	skb = sock_rmalloc(sk, alloc_size, 0, GFP_KERNEL);
2427	if (!netlink_rx_is_mmaped(sk) &&
2428 atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
2429 goto errout_skb;
2430 skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, GFP_KERNEL);
1688 if (!skb) 2431 if (!skb)
1689 goto errout_skb; 2432 goto errout_skb;
2433 netlink_skb_set_owner_r(skb, sk);
1690 2434
1691 len = cb->dump(skb, cb); 2435 len = cb->dump(skb, cb);
1692 2436
@@ -1741,6 +2485,19 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
1741 if (cb == NULL) 2485 if (cb == NULL)
1742 return -ENOBUFS; 2486 return -ENOBUFS;
1743 2487
2488 /* Memory mapped dump requests need to be copied to avoid looping
2489	 * on the pending state in netlink_mmap_sendmsg() while the CB holds
2490 * a reference to the skb.
2491 */
2492 if (netlink_skb_is_mmaped(skb)) {
2493 skb = skb_copy(skb, GFP_KERNEL);
2494 if (skb == NULL) {
2495 kfree(cb);
2496 return -ENOBUFS;
2497 }
2498 } else
2499 atomic_inc(&skb->users);
2500
1744 cb->dump = control->dump; 2501 cb->dump = control->dump;
1745 cb->done = control->done; 2502 cb->done = control->done;
1746 cb->nlh = nlh; 2503 cb->nlh = nlh;
@@ -1801,7 +2558,8 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
1801 if (err) 2558 if (err)
1802 payload += nlmsg_len(nlh); 2559 payload += nlmsg_len(nlh);
1803 2560
1804	skb = nlmsg_new(payload, GFP_KERNEL);
2561	skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload),
2562 NETLINK_CB(in_skb).portid, GFP_KERNEL);
1805 if (!skb) { 2563 if (!skb) {
1806 struct sock *sk; 2564 struct sock *sk;
1807 2565
@@ -2067,7 +2825,7 @@ static const struct proto_ops netlink_ops = {
2067 .socketpair = sock_no_socketpair, 2825 .socketpair = sock_no_socketpair,
2068 .accept = sock_no_accept, 2826 .accept = sock_no_accept,
2069 .getname = netlink_getname, 2827 .getname = netlink_getname,
2070	.poll =		datagram_poll,
2828	.poll =		netlink_poll,
2071 .ioctl = sock_no_ioctl, 2829 .ioctl = sock_no_ioctl,
2072 .listen = sock_no_listen, 2830 .listen = sock_no_listen,
2073 .shutdown = sock_no_shutdown, 2831 .shutdown = sock_no_shutdown,
@@ -2075,7 +2833,7 @@ static const struct proto_ops netlink_ops = {
2075 .getsockopt = netlink_getsockopt, 2833 .getsockopt = netlink_getsockopt,
2076 .sendmsg = netlink_sendmsg, 2834 .sendmsg = netlink_sendmsg,
2077 .recvmsg = netlink_recvmsg, 2835 .recvmsg = netlink_recvmsg,
2078	.mmap =		sock_no_mmap,
2836	.mmap =		netlink_mmap,
2079 .sendpage = sock_no_sendpage, 2837 .sendpage = sock_no_sendpage,
2080}; 2838};
2081 2839
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index d9acb2a1d855..ed8522265f4e 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -6,6 +6,20 @@
6#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) 6#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
7#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) 7#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long))
8 8
9struct netlink_ring {
10 void **pg_vec;
11 unsigned int head;
12 unsigned int frames_per_block;
13 unsigned int frame_size;
14 unsigned int frame_max;
15
16 unsigned int pg_vec_order;
17 unsigned int pg_vec_pages;
18 unsigned int pg_vec_len;
19
20 atomic_t pending;
21};
22
9struct netlink_sock { 23struct netlink_sock {
10 /* struct sock has to be the first member of netlink_sock */ 24 /* struct sock has to be the first member of netlink_sock */
11 struct sock sk; 25 struct sock sk;
@@ -24,6 +38,12 @@ struct netlink_sock {
24 void (*netlink_rcv)(struct sk_buff *skb); 38 void (*netlink_rcv)(struct sk_buff *skb);
25 void (*netlink_bind)(int group); 39 void (*netlink_bind)(int group);
26 struct module *module; 40 struct module *module;
41#ifdef CONFIG_NETLINK_MMAP
42 struct mutex pg_vec_lock;
43 struct netlink_ring rx_ring;
44 struct netlink_ring tx_ring;
45 atomic_t mapped;
46#endif /* CONFIG_NETLINK_MMAP */
27}; 47};
28 48
29static inline struct netlink_sock *nlk_sk(struct sock *sk) 49static inline struct netlink_sock *nlk_sk(struct sock *sk)
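netlink_set_ring() (not part of this hunk) is what translates the user-supplied nl_mmap_req into the netlink_ring fields above; assuming the obvious mapping, the quantities a ring user cares about can be derived as in the sketch below. The helper names are illustrative, and the frame_max relation matches the diag code further down, which reports ndr_frame_nr as frame_max + 1.

	#include <linux/netlink.h>

	/* Illustrative only: per-ring sizes as seen from the nl_mmap_req.
	 * Assumes each block holds nm_block_size / nm_frame_size frames and
	 * that frame indices run from 0 to nm_frame_nr - 1, so the highest
	 * valid index (frame_max above) is nm_frame_nr - 1. */
	static inline unsigned int ring_bytes(const struct nl_mmap_req *req)
	{
		return req->nm_block_size * req->nm_block_nr;
	}

	static inline unsigned int ring_frames_per_block(const struct nl_mmap_req *req)
	{
		return req->nm_block_size / req->nm_frame_size;
	}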
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 5ffb1d1cf402..4e4aa471cd05 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -7,6 +7,34 @@
7 7
8#include "af_netlink.h" 8#include "af_netlink.h"
9 9
10static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type,
11 struct sk_buff *nlskb)
12{
13 struct netlink_diag_ring ndr;
14
15 ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
16 ndr.ndr_block_nr = ring->pg_vec_len;
17 ndr.ndr_frame_size = ring->frame_size;
18 ndr.ndr_frame_nr = ring->frame_max + 1;
19
20 return nla_put(nlskb, nl_type, sizeof(ndr), &ndr);
21}
22
23static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb)
24{
25 struct netlink_sock *nlk = nlk_sk(sk);
26 int ret;
27
28 mutex_lock(&nlk->pg_vec_lock);
29 ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb);
30 if (!ret)
31 ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING,
32 nlskb);
33 mutex_unlock(&nlk->pg_vec_lock);
34
35 return ret;
36}
37
10static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb) 38static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb)
11{ 39{
12 struct netlink_sock *nlk = nlk_sk(sk); 40 struct netlink_sock *nlk = nlk_sk(sk);
@@ -51,6 +79,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
51 sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO)) 79 sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO))
52 goto out_nlmsg_trim; 80 goto out_nlmsg_trim;
53 81
82 if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) &&
83 sk_diag_put_rings_cfg(sk, skb))
84 goto out_nlmsg_trim;
85
54 return nlmsg_end(skb, nlh); 86 return nlmsg_end(skb, nlh);
55 87
56out_nlmsg_trim: 88out_nlmsg_trim:
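On the consumer side, NDIAG_SHOW_RING_CFG simply adds two more attributes to each netlink_diag reply. A user-space sketch of the attribute walk follows; the surrounding sock_diag request/response handling is omitted, the constants and struct netlink_diag_ring come from the netlink_diag uapi header extended by this patch, and the rtattr macros are used on the assumption that the attributes follow the usual netlink attribute layout directly after the fixed message.

	#include <stdio.h>
	#include <linux/netlink.h>
	#include <linux/netlink_diag.h>
	#include <linux/rtnetlink.h>

	/* Print the RX/TX ring geometry attributes of one netlink_diag reply.
	 * "nlh" is a single, already validated SOCK_DIAG_BY_FAMILY message. */
	static void print_ring_cfg(struct nlmsghdr *nlh)
	{
		struct netlink_diag_msg *ndm = NLMSG_DATA(nlh);
		int len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*ndm));
		struct rtattr *attr = (struct rtattr *)(ndm + 1);

		for (; RTA_OK(attr, len); attr = RTA_NEXT(attr, len)) {
			struct netlink_diag_ring *ndr;

			if (attr->rta_type != NETLINK_DIAG_RX_RING &&
			    attr->rta_type != NETLINK_DIAG_TX_RING)
				continue;

			ndr = RTA_DATA(attr);
			printf("%s ring: %u blocks of %u bytes, %u frames of %u bytes\n",
			       attr->rta_type == NETLINK_DIAG_RX_RING ? "RX" : "TX",
			       ndr->ndr_block_nr, ndr->ndr_block_size,
			       ndr->ndr_frame_nr, ndr->ndr_frame_size);
		}
	}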
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index aa36a8c8b33b..7881e2fccbc2 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -393,7 +393,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
393 return -EOPNOTSUPP; 393 return -EOPNOTSUPP;
394 394
395 if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) && 395 if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) &&
396	    sk_user_ns(NETLINK_CB(in_skb).ssk) != &init_user_ns)
396	    sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns)
397 return -EOPNOTSUPP; 397 return -EOPNOTSUPP;
398 } 398 }
399 399