-rw-r--r-- | include/linux/socket.h | 6
-rw-r--r-- | include/net/kcm.h | 125
-rw-r--r-- | include/uapi/linux/kcm.h | 40
-rw-r--r-- | net/Kconfig | 1
-rw-r--r-- | net/Makefile | 1
-rw-r--r-- | net/kcm/Kconfig | 10
-rw-r--r-- | net/kcm/Makefile | 3
-rw-r--r-- | net/kcm/kcmsock.c | 2016
8 files changed, 2201 insertions, 1 deletion
diff --git a/include/linux/socket.h b/include/linux/socket.h
index d834af22a460..73bf6c6a833b 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -200,7 +200,9 @@ struct ucred { | |||
200 | #define AF_ALG 38 /* Algorithm sockets */ | 200 | #define AF_ALG 38 /* Algorithm sockets */ |
201 | #define AF_NFC 39 /* NFC sockets */ | 201 | #define AF_NFC 39 /* NFC sockets */ |
202 | #define AF_VSOCK 40 /* vSockets */ | 202 | #define AF_VSOCK 40 /* vSockets */ |
203 | #define AF_MAX 41 /* For now.. */ | 203 | #define AF_KCM 41 /* Kernel Connection Multiplexor */
204 | |||
205 | #define AF_MAX 42 /* For now.. */ | ||
204 | 206 | ||
205 | /* Protocol families, same as address families. */ | 207 | /* Protocol families, same as address families. */ |
206 | #define PF_UNSPEC AF_UNSPEC | 208 | #define PF_UNSPEC AF_UNSPEC |
@@ -246,6 +248,7 @@ struct ucred { | |||
246 | #define PF_ALG AF_ALG | 248 | #define PF_ALG AF_ALG |
247 | #define PF_NFC AF_NFC | 249 | #define PF_NFC AF_NFC |
248 | #define PF_VSOCK AF_VSOCK | 250 | #define PF_VSOCK AF_VSOCK |
251 | #define PF_KCM AF_KCM | ||
249 | #define PF_MAX AF_MAX | 252 | #define PF_MAX AF_MAX |
250 | 253 | ||
251 | /* Maximum queue length specifiable by listen. */ | 254 | /* Maximum queue length specifiable by listen. */ |
@@ -323,6 +326,7 @@ struct ucred { | |||
323 | #define SOL_CAIF 278 | 326 | #define SOL_CAIF 278 |
324 | #define SOL_ALG 279 | 327 | #define SOL_ALG 279 |
325 | #define SOL_NFC 280 | 328 | #define SOL_NFC 280 |
329 | #define SOL_KCM 281 | ||
326 | 330 | ||
327 | /* IPX options */ | 331 | /* IPX options */ |
328 | #define IPX_TYPE 1 | 332 | #define IPX_TYPE 1 |
diff --git a/include/net/kcm.h b/include/net/kcm.h
new file mode 100644
index 000000000000..1bcae39070ec
--- /dev/null
+++ b/include/net/kcm.h
@@ -0,0 +1,125 @@ | |||
1 | /* | ||
2 | * Kernel Connection Multiplexor | ||
3 | * | ||
4 | * Copyright (c) 2016 Tom Herbert <tom@herbertland.com> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 | ||
8 | * as published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #ifndef __NET_KCM_H_ | ||
12 | #define __NET_KCM_H_ | ||
13 | |||
14 | #include <linux/skbuff.h> | ||
15 | #include <net/sock.h> | ||
16 | #include <uapi/linux/kcm.h> | ||
17 | |||
18 | extern unsigned int kcm_net_id; | ||
19 | |||
20 | struct kcm_tx_msg { | ||
21 | unsigned int sent; | ||
22 | unsigned int fragidx; | ||
23 | unsigned int frag_offset; | ||
24 | unsigned int msg_flags; | ||
25 | struct sk_buff *frag_skb; | ||
26 | struct sk_buff *last_skb; | ||
27 | }; | ||
28 | |||
29 | struct kcm_rx_msg { | ||
30 | int full_len; | ||
31 | int accum_len; | ||
32 | int offset; | ||
33 | }; | ||
34 | |||
35 | /* Socket structure for KCM client sockets */ | ||
36 | struct kcm_sock { | ||
37 | struct sock sk; | ||
38 | struct kcm_mux *mux; | ||
39 | struct list_head kcm_sock_list; | ||
40 | int index; | ||
41 | u32 done : 1; | ||
42 | struct work_struct done_work; | ||
43 | |||
44 | /* Transmit */ | ||
45 | struct kcm_psock *tx_psock; | ||
46 | struct work_struct tx_work; | ||
47 | struct list_head wait_psock_list; | ||
48 | struct sk_buff *seq_skb; | ||
49 | |||
50 | /* Don't use bit fields here; these are set under different locks */ | ||
51 | bool tx_wait; | ||
52 | bool tx_wait_more; | ||
53 | |||
54 | /* Receive */ | ||
55 | struct kcm_psock *rx_psock; | ||
56 | struct list_head wait_rx_list; /* KCMs waiting to receive */ | ||
57 | bool rx_wait; | ||
58 | u32 rx_disabled : 1; | ||
59 | }; | ||
60 | |||
61 | struct bpf_prog; | ||
62 | |||
63 | /* Structure for an attached lower socket */ | ||
64 | struct kcm_psock { | ||
65 | struct sock *sk; | ||
66 | struct kcm_mux *mux; | ||
67 | int index; | ||
68 | |||
69 | u32 tx_stopped : 1; | ||
70 | u32 rx_stopped : 1; | ||
71 | u32 done : 1; | ||
72 | u32 unattaching : 1; | ||
73 | |||
74 | void (*save_state_change)(struct sock *sk); | ||
75 | void (*save_data_ready)(struct sock *sk); | ||
76 | void (*save_write_space)(struct sock *sk); | ||
77 | |||
78 | struct list_head psock_list; | ||
79 | |||
80 | /* Receive */ | ||
81 | struct sk_buff *rx_skb_head; | ||
82 | struct sk_buff **rx_skb_nextp; | ||
83 | struct sk_buff *ready_rx_msg; | ||
84 | struct list_head psock_ready_list; | ||
85 | struct work_struct rx_work; | ||
86 | struct delayed_work rx_delayed_work; | ||
87 | struct bpf_prog *bpf_prog; | ||
88 | struct kcm_sock *rx_kcm; | ||
89 | |||
90 | /* Transmit */ | ||
91 | struct kcm_sock *tx_kcm; | ||
92 | struct list_head psock_avail_list; | ||
93 | }; | ||
94 | |||
95 | /* Per net MUX list */ | ||
96 | struct kcm_net { | ||
97 | struct mutex mutex; | ||
98 | struct list_head mux_list; | ||
99 | int count; | ||
100 | }; | ||
101 | |||
102 | /* Structure for a MUX */ | ||
103 | struct kcm_mux { | ||
104 | struct list_head kcm_mux_list; | ||
105 | struct rcu_head rcu; | ||
106 | struct kcm_net *knet; | ||
107 | |||
108 | struct list_head kcm_socks; /* All KCM sockets on MUX */ | ||
109 | int kcm_socks_cnt; /* Total KCM socket count for MUX */ | ||
110 | struct list_head psocks; /* List of all psocks on MUX */ | ||
111 | int psocks_cnt; /* Total attached sockets */ | ||
112 | |||
113 | /* Receive */ | ||
114 | spinlock_t rx_lock ____cacheline_aligned_in_smp; | ||
115 | struct list_head kcm_rx_waiters; /* KCMs waiting to receive */ | ||
116 | struct list_head psocks_ready; /* List of psocks with a msg ready */ | ||
117 | struct sk_buff_head rx_hold_queue; | ||
118 | |||
119 | /* Transmit */ | ||
120 | spinlock_t lock ____cacheline_aligned_in_smp; /* TX and mux locking */ | ||
121 | struct list_head psocks_avail; /* List of available psocks */ | ||
122 | struct list_head kcm_tx_waiters; /* KCMs waiting for a TX psock */ | ||
123 | }; | ||
124 | |||
125 | #endif /* __NET_KCM_H_ */ | ||
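The header wires up a three-level object model: a per-namespace struct kcm_net holds the muxes, and each mux fans out to its KCM sockets and attached psocks. Purely as an illustration (this helper is not in the patch), walking the muxes of a namespace would follow the usual net_generic() pattern:

#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/kcm.h>

/* Hypothetical walk, not part of the patch: list every mux in a netns */
static void kcm_dump_muxes(struct net *net)
{
	struct kcm_net *knet = net_generic(net, kcm_net_id);
	struct kcm_mux *mux;

	mutex_lock(&knet->mutex);
	list_for_each_entry(mux, &knet->mux_list, kcm_mux_list)
		pr_info("mux: %d kcm socks, %d psocks\n",
			mux->kcm_socks_cnt, mux->psocks_cnt);
	mutex_unlock(&knet->mutex);
}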
diff --git a/include/uapi/linux/kcm.h b/include/uapi/linux/kcm.h
new file mode 100644
index 000000000000..a5a530940b99
--- /dev/null
+++ b/include/uapi/linux/kcm.h
@@ -0,0 +1,40 @@ | |||
1 | /* | ||
2 | * Kernel Connection Multiplexor | ||
3 | * | ||
4 | * Copyright (c) 2016 Tom Herbert <tom@herbertland.com> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 | ||
8 | * as published by the Free Software Foundation. | ||
9 | * | ||
10 | * User API to clone KCM sockets and attach a transport socket to a KCM | ||
11 | * multiplexor. | ||
12 | */ | ||
13 | |||
14 | #ifndef KCM_KERNEL_H | ||
15 | #define KCM_KERNEL_H | ||
16 | |||
17 | struct kcm_attach { | ||
18 | int fd; | ||
19 | int bpf_fd; | ||
20 | }; | ||
21 | |||
22 | struct kcm_unattach { | ||
23 | int fd; | ||
24 | }; | ||
25 | |||
26 | struct kcm_clone { | ||
27 | int fd; | ||
28 | }; | ||
29 | |||
30 | #define SIOCKCMATTACH (SIOCPROTOPRIVATE + 0) | ||
31 | #define SIOCKCMUNATTACH (SIOCPROTOPRIVATE + 1) | ||
32 | #define SIOCKCMCLONE (SIOCPROTOPRIVATE + 2) | ||
33 | |||
34 | #define KCMPROTO_CONNECTED 0 | ||
35 | |||
36 | /* Socket options */ | ||
37 | #define KCM_RECV_DISABLE 1 | ||
38 | |||
39 | #endif | ||
40 | |||
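That header is the entire userspace control surface: create an AF_KCM socket and use the ioctls above to attach established transport sockets. A hypothetical attach helper (not part of this patch; it assumes tcp_fd is an established TCP socket and bpf_fd is a BPF_PROG_TYPE_SOCKET_FILTER program, already loaded via bpf(2), that returns a message's total length from its header):

#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>
#include <linux/kcm.h>

#ifndef AF_KCM
#define AF_KCM 41		/* per the socket.h hunk above */
#endif

static int kcm_attach_tcp(int tcp_fd, int bpf_fd)
{
	struct kcm_attach attach = {
		.fd = tcp_fd,		/* established transport socket */
		.bpf_fd = bpf_fd,	/* message length parser */
	};
	int kcm_fd = socket(AF_KCM, SOCK_DGRAM, KCMPROTO_CONNECTED);

	if (kcm_fd < 0)
		return -1;
	if (ioctl(kcm_fd, SIOCKCMATTACH, &attach) < 0) {
		close(kcm_fd);
		return -1;
	}
	return kcm_fd;	/* sendmsg()/recvmsg() now carry whole messages */
}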
diff --git a/net/Kconfig b/net/Kconfig
index 2760825e53fa..10640d5f8bee 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -360,6 +360,7 @@ source "net/can/Kconfig" | |||
360 | source "net/irda/Kconfig" | 360 | source "net/irda/Kconfig" |
361 | source "net/bluetooth/Kconfig" | 361 | source "net/bluetooth/Kconfig" |
362 | source "net/rxrpc/Kconfig" | 362 | source "net/rxrpc/Kconfig" |
363 | source "net/kcm/Kconfig" | ||
363 | 364 | ||
364 | config FIB_RULES | 365 | config FIB_RULES |
365 | bool | 366 | bool |
diff --git a/net/Makefile b/net/Makefile
index a5d04098dfce..81d14119eab5 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_IRDA) += irda/ | |||
34 | obj-$(CONFIG_BT) += bluetooth/ | 34 | obj-$(CONFIG_BT) += bluetooth/ |
35 | obj-$(CONFIG_SUNRPC) += sunrpc/ | 35 | obj-$(CONFIG_SUNRPC) += sunrpc/ |
36 | obj-$(CONFIG_AF_RXRPC) += rxrpc/ | 36 | obj-$(CONFIG_AF_RXRPC) += rxrpc/ |
37 | obj-$(CONFIG_AF_KCM) += kcm/ | ||
37 | obj-$(CONFIG_ATM) += atm/ | 38 | obj-$(CONFIG_ATM) += atm/ |
38 | obj-$(CONFIG_L2TP) += l2tp/ | 39 | obj-$(CONFIG_L2TP) += l2tp/ |
39 | obj-$(CONFIG_DECNET) += decnet/ | 40 | obj-$(CONFIG_DECNET) += decnet/ |
diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig
new file mode 100644
index 000000000000..5db94d940ecc
--- /dev/null
+++ b/net/kcm/Kconfig
@@ -0,0 +1,10 @@ | |||
1 | |||
2 | config AF_KCM | ||
3 | tristate "KCM sockets" | ||
4 | depends on INET | ||
5 | select BPF_SYSCALL | ||
6 | ---help--- | ||
7 | KCM (Kernel Connection Multiplexor) sockets provide a method | ||
8 | for multiplexing messages of a message-based application | ||
9 | protocol over kernel connections (e.g. TCP connections). | ||
10 | |||
diff --git a/net/kcm/Makefile b/net/kcm/Makefile
new file mode 100644
index 000000000000..cb525f7c5a13
--- /dev/null
+++ b/net/kcm/Makefile
@@ -0,0 +1,3 @@ | |||
1 | obj-$(CONFIG_AF_KCM) += kcm.o | ||
2 | |||
3 | kcm-y := kcmsock.o | ||
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
new file mode 100644
index 000000000000..30ef69ac6b81
--- /dev/null
+++ b/net/kcm/kcmsock.c
@@ -0,0 +1,2016 @@ | |||
1 | #include <linux/bpf.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/errqueue.h> | ||
4 | #include <linux/file.h> | ||
5 | #include <linux/in.h> | ||
6 | #include <linux/kernel.h> | ||
7 | #include <linux/module.h> | ||
8 | #include <linux/net.h> | ||
9 | #include <linux/netdevice.h> | ||
10 | #include <linux/poll.h> | ||
11 | #include <linux/rculist.h> | ||
12 | #include <linux/skbuff.h> | ||
13 | #include <linux/socket.h> | ||
14 | #include <linux/uaccess.h> | ||
15 | #include <linux/workqueue.h> | ||
16 | #include <net/kcm.h> | ||
17 | #include <net/netns/generic.h> | ||
18 | #include <net/sock.h> | ||
19 | #include <net/tcp.h> | ||
20 | #include <uapi/linux/kcm.h> | ||
21 | |||
22 | unsigned int kcm_net_id; | ||
23 | |||
24 | static struct kmem_cache *kcm_psockp __read_mostly; | ||
25 | static struct kmem_cache *kcm_muxp __read_mostly; | ||
26 | static struct workqueue_struct *kcm_wq; | ||
27 | |||
28 | static inline struct kcm_sock *kcm_sk(const struct sock *sk) | ||
29 | { | ||
30 | return (struct kcm_sock *)sk; | ||
31 | } | ||
32 | |||
33 | static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb) | ||
34 | { | ||
35 | return (struct kcm_tx_msg *)skb->cb; | ||
36 | } | ||
37 | |||
38 | static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb) | ||
39 | { | ||
40 | return (struct kcm_rx_msg *)((void *)skb->cb + | ||
41 | offsetof(struct qdisc_skb_cb, data)); | ||
42 | } | ||
43 | |||
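One note on the accessors above: kcm_tx_msg() uses skb->cb directly, since TX skbs stay on KCM's own write queue, while kcm_rx_msg() places its metadata at the data area of struct qdisc_skb_cb so it can coexist with the qdisc fields that lower layers may still use. A hypothetical build-time guard (not in this patch; it would live inside some init function) expresses the size constraint:

	/* Hypothetical, not part of the patch: RX metadata stored after the
	 * qdisc cb fields must still fit within skb->cb[].
	 */
	BUILD_BUG_ON(offsetof(struct qdisc_skb_cb, data) +
		     sizeof(struct kcm_rx_msg) >
		     sizeof(((struct sk_buff *)0)->cb));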
44 | static void report_csk_error(struct sock *csk, int err) | ||
45 | { | ||
46 | csk->sk_err = EPIPE; | ||
47 | csk->sk_error_report(csk); | ||
48 | } | ||
49 | |||
50 | /* Callback lock held */ | ||
51 | static void kcm_abort_rx_psock(struct kcm_psock *psock, int err, | ||
52 | struct sk_buff *skb) | ||
53 | { | ||
54 | struct sock *csk = psock->sk; | ||
55 | |||
56 | /* Unrecoverable error in receive */ | ||
57 | |||
58 | if (psock->rx_stopped) | ||
59 | return; | ||
60 | |||
61 | psock->rx_stopped = 1; | ||
62 | |||
63 | /* Report an error on the lower socket */ | ||
64 | report_csk_error(csk, err); | ||
65 | } | ||
66 | |||
67 | static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, | ||
68 | bool wakeup_kcm) | ||
69 | { | ||
70 | struct sock *csk = psock->sk; | ||
71 | struct kcm_mux *mux = psock->mux; | ||
72 | |||
73 | /* Unrecoverable error in transmit */ | ||
74 | |||
75 | spin_lock_bh(&mux->lock); | ||
76 | |||
77 | if (psock->tx_stopped) { | ||
78 | spin_unlock_bh(&mux->lock); | ||
79 | return; | ||
80 | } | ||
81 | |||
82 | psock->tx_stopped = 1; | ||
83 | |||
84 | if (!psock->tx_kcm) { | ||
85 | /* Take off psocks_avail list */ | ||
86 | list_del(&psock->psock_avail_list); | ||
87 | } else if (wakeup_kcm) { | ||
88 | /* In this case psock is being aborted while outside of | ||
89 | * write_msgs and psock is reserved. Schedule tx_work | ||
90 | * to handle the failure there. Need to commit tx_stopped | ||
91 | * before queuing work. | ||
92 | */ | ||
93 | smp_mb(); | ||
94 | |||
95 | queue_work(kcm_wq, &psock->tx_kcm->tx_work); | ||
96 | } | ||
97 | |||
98 | spin_unlock_bh(&mux->lock); | ||
99 | |||
100 | /* Report error on lower socket */ | ||
101 | report_csk_error(csk, err); | ||
102 | } | ||
103 | |||
104 | static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); | ||
105 | |||
106 | /* KCM is ready to receive messages on its queue: either the KCM is new or | ||
107 | * has become unblocked after being blocked on a full socket buffer. Queue any | ||
108 | * pending ready messages on a psock. RX mux lock held. | ||
109 | */ | ||
110 | static void kcm_rcv_ready(struct kcm_sock *kcm) | ||
111 | { | ||
112 | struct kcm_mux *mux = kcm->mux; | ||
113 | struct kcm_psock *psock; | ||
114 | struct sk_buff *skb; | ||
115 | |||
116 | if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled)) | ||
117 | return; | ||
118 | |||
119 | while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) { | ||
120 | if (kcm_queue_rcv_skb(&kcm->sk, skb)) { | ||
121 | /* Assuming buffer limit has been reached */ | ||
122 | skb_queue_head(&mux->rx_hold_queue, skb); | ||
123 | WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); | ||
124 | return; | ||
125 | } | ||
126 | } | ||
127 | |||
128 | while (!list_empty(&mux->psocks_ready)) { | ||
129 | psock = list_first_entry(&mux->psocks_ready, struct kcm_psock, | ||
130 | psock_ready_list); | ||
131 | |||
132 | if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) { | ||
133 | /* Assuming buffer limit has been reached */ | ||
134 | WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); | ||
135 | return; | ||
136 | } | ||
137 | |||
138 | /* Consumed the ready message on the psock. Schedule rx_work to | ||
139 | * get more messages. | ||
140 | */ | ||
141 | list_del(&psock->psock_ready_list); | ||
142 | psock->ready_rx_msg = NULL; | ||
143 | |||
144 | /* Commit clearing of ready_rx_msg for queuing work */ | ||
145 | smp_mb(); | ||
146 | |||
147 | queue_work(kcm_wq, &psock->rx_work); | ||
148 | } | ||
149 | |||
150 | /* Buffer limit is okay now, add to ready list */ | ||
151 | list_add_tail(&kcm->wait_rx_list, | ||
152 | &kcm->mux->kcm_rx_waiters); | ||
153 | kcm->rx_wait = true; | ||
154 | } | ||
155 | |||
156 | static void kcm_rfree(struct sk_buff *skb) | ||
157 | { | ||
158 | struct sock *sk = skb->sk; | ||
159 | struct kcm_sock *kcm = kcm_sk(sk); | ||
160 | struct kcm_mux *mux = kcm->mux; | ||
161 | unsigned int len = skb->truesize; | ||
162 | |||
163 | sk_mem_uncharge(sk, len); | ||
164 | atomic_sub(len, &sk->sk_rmem_alloc); | ||
165 | |||
166 | /* For reading rx_wait and rx_psock without holding lock */ | ||
167 | smp_mb__after_atomic(); | ||
168 | |||
169 | if (!kcm->rx_wait && !kcm->rx_psock && | ||
170 | sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) { | ||
171 | spin_lock_bh(&mux->rx_lock); | ||
172 | kcm_rcv_ready(kcm); | ||
173 | spin_unlock_bh(&mux->rx_lock); | ||
174 | } | ||
175 | } | ||
176 | |||
177 | static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) | ||
178 | { | ||
179 | struct sk_buff_head *list = &sk->sk_receive_queue; | ||
180 | |||
181 | if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) | ||
182 | return -ENOMEM; | ||
183 | |||
184 | if (!sk_rmem_schedule(sk, skb, skb->truesize)) | ||
185 | return -ENOBUFS; | ||
186 | |||
187 | skb->dev = NULL; | ||
188 | |||
189 | skb_orphan(skb); | ||
190 | skb->sk = sk; | ||
191 | skb->destructor = kcm_rfree; | ||
192 | atomic_add(skb->truesize, &sk->sk_rmem_alloc); | ||
193 | sk_mem_charge(sk, skb->truesize); | ||
194 | |||
195 | skb_queue_tail(list, skb); | ||
196 | |||
197 | if (!sock_flag(sk, SOCK_DEAD)) | ||
198 | sk->sk_data_ready(sk); | ||
199 | |||
200 | return 0; | ||
201 | } | ||
202 | |||
203 | /* Requeue received messages for a kcm socket to other kcm sockets. This is | ||
204 | * called when a kcm socket is receive disabled. | ||
205 | * RX mux lock held. | ||
206 | */ | ||
207 | static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head) | ||
208 | { | ||
209 | struct sk_buff *skb; | ||
210 | struct kcm_sock *kcm; | ||
211 | |||
212 | while ((skb = __skb_dequeue(head))) { | ||
213 | /* Reset destructor to avoid calling kcm_rcv_ready */ | ||
214 | skb->destructor = sock_rfree; | ||
215 | skb_orphan(skb); | ||
216 | try_again: | ||
217 | if (list_empty(&mux->kcm_rx_waiters)) { | ||
218 | skb_queue_tail(&mux->rx_hold_queue, skb); | ||
219 | continue; | ||
220 | } | ||
221 | |||
222 | kcm = list_first_entry(&mux->kcm_rx_waiters, | ||
223 | struct kcm_sock, wait_rx_list); | ||
224 | |||
225 | if (kcm_queue_rcv_skb(&kcm->sk, skb)) { | ||
226 | /* Should mean socket buffer full */ | ||
227 | list_del(&kcm->wait_rx_list); | ||
228 | kcm->rx_wait = false; | ||
229 | |||
230 | /* Commit rx_wait to read in kcm_free */ | ||
231 | smp_wmb(); | ||
232 | |||
233 | goto try_again; | ||
234 | } | ||
235 | } | ||
236 | } | ||
237 | |||
238 | /* Lower sock lock held */ | ||
239 | static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock, | ||
240 | struct sk_buff *head) | ||
241 | { | ||
242 | struct kcm_mux *mux = psock->mux; | ||
243 | struct kcm_sock *kcm; | ||
244 | |||
245 | WARN_ON(psock->ready_rx_msg); | ||
246 | |||
247 | if (psock->rx_kcm) | ||
248 | return psock->rx_kcm; | ||
249 | |||
250 | spin_lock_bh(&mux->rx_lock); | ||
251 | |||
252 | if (psock->rx_kcm) { | ||
253 | spin_unlock_bh(&mux->rx_lock); | ||
254 | return psock->rx_kcm; | ||
255 | } | ||
256 | |||
257 | if (list_empty(&mux->kcm_rx_waiters)) { | ||
258 | psock->ready_rx_msg = head; | ||
259 | list_add_tail(&psock->psock_ready_list, | ||
260 | &mux->psocks_ready); | ||
261 | spin_unlock_bh(&mux->rx_lock); | ||
262 | return NULL; | ||
263 | } | ||
264 | |||
265 | kcm = list_first_entry(&mux->kcm_rx_waiters, | ||
266 | struct kcm_sock, wait_rx_list); | ||
267 | list_del(&kcm->wait_rx_list); | ||
268 | kcm->rx_wait = false; | ||
269 | |||
270 | psock->rx_kcm = kcm; | ||
271 | kcm->rx_psock = psock; | ||
272 | |||
273 | spin_unlock_bh(&mux->rx_lock); | ||
274 | |||
275 | return kcm; | ||
276 | } | ||
277 | |||
278 | static void kcm_done(struct kcm_sock *kcm); | ||
279 | |||
280 | static void kcm_done_work(struct work_struct *w) | ||
281 | { | ||
282 | kcm_done(container_of(w, struct kcm_sock, done_work)); | ||
283 | } | ||
284 | |||
285 | /* Lower sock held */ | ||
286 | static void unreserve_rx_kcm(struct kcm_psock *psock, | ||
287 | bool rcv_ready) | ||
288 | { | ||
289 | struct kcm_sock *kcm = psock->rx_kcm; | ||
290 | struct kcm_mux *mux = psock->mux; | ||
291 | |||
292 | if (!kcm) | ||
293 | return; | ||
294 | |||
295 | spin_lock_bh(&mux->rx_lock); | ||
296 | |||
297 | psock->rx_kcm = NULL; | ||
298 | kcm->rx_psock = NULL; | ||
299 | |||
300 | /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with | ||
301 | * kcm_rfree | ||
302 | */ | ||
303 | smp_mb(); | ||
304 | |||
305 | if (unlikely(kcm->done)) { | ||
306 | spin_unlock_bh(&mux->rx_lock); | ||
307 | |||
308 | /* Need to run kcm_done in a task since we need to acquire | ||
309 | * callback locks which may already be held here. | ||
310 | */ | ||
311 | INIT_WORK(&kcm->done_work, kcm_done_work); | ||
312 | schedule_work(&kcm->done_work); | ||
313 | return; | ||
314 | } | ||
315 | |||
316 | if (unlikely(kcm->rx_disabled)) { | ||
317 | requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); | ||
318 | } else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) { | ||
319 | /* Check for a degenerate race with rx_wait where all | ||
320 | * data was dequeued (accounted for in kcm_rfree). | ||
321 | */ | ||
322 | kcm_rcv_ready(kcm); | ||
323 | } | ||
324 | spin_unlock_bh(&mux->rx_lock); | ||
325 | } | ||
326 | |||
327 | /* Macro to invoke filter function. */ | ||
328 | #define KCM_RUN_FILTER(prog, ctx) \ | ||
329 | (*prog->bpf_func)(ctx, prog->insnsi) | ||
330 | |||
331 | /* Lower socket lock held */ | ||
332 | static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, | ||
333 | unsigned int orig_offset, size_t orig_len) | ||
334 | { | ||
335 | struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data; | ||
336 | struct kcm_rx_msg *rxm; | ||
337 | struct kcm_sock *kcm; | ||
338 | struct sk_buff *head, *skb; | ||
339 | size_t eaten = 0, cand_len; | ||
340 | ssize_t extra; | ||
341 | int err; | ||
342 | bool cloned_orig = false; | ||
343 | |||
344 | if (psock->ready_rx_msg) | ||
345 | return 0; | ||
346 | |||
347 | head = psock->rx_skb_head; | ||
348 | if (head) { | ||
349 | /* Message already in progress */ | ||
350 | |||
351 | if (unlikely(orig_offset)) { | ||
352 | /* Getting data with a non-zero offset when a message is | ||
353 | * in progress is not expected. If it does happen, we | ||
354 | * need to clone and pull since we can't deal with | ||
355 | * offsets in the skbs for a message except in the head. | ||
356 | */ | ||
357 | orig_skb = skb_clone(orig_skb, GFP_ATOMIC); | ||
358 | if (!orig_skb) { | ||
359 | desc->error = -ENOMEM; | ||
360 | return 0; | ||
361 | } | ||
362 | if (!pskb_pull(orig_skb, orig_offset)) { | ||
363 | kfree_skb(orig_skb); | ||
364 | desc->error = -ENOMEM; | ||
365 | return 0; | ||
366 | } | ||
367 | cloned_orig = true; | ||
368 | orig_offset = 0; | ||
369 | } | ||
370 | |||
371 | if (!psock->rx_skb_nextp) { | ||
372 | /* We are going to append to the frags_list of head. | ||
373 | * Need to unshare the frag_list. | ||
374 | */ | ||
375 | err = skb_unclone(head, GFP_ATOMIC); | ||
376 | if (err) { | ||
377 | desc->error = err; | ||
378 | return 0; | ||
379 | } | ||
380 | |||
381 | if (unlikely(skb_shinfo(head)->frag_list)) { | ||
382 | /* We can't append to an sk_buff that already | ||
383 | * has a frag_list. We create a new head, point | ||
384 | * the frag_list of that to the old head, and | ||
385 | * then are able to use the old head->next for | ||
386 | * appending to the message. | ||
387 | */ | ||
388 | if (WARN_ON(head->next)) { | ||
389 | desc->error = -EINVAL; | ||
390 | return 0; | ||
391 | } | ||
392 | |||
393 | skb = alloc_skb(0, GFP_ATOMIC); | ||
394 | if (!skb) { | ||
395 | desc->error = -ENOMEM; | ||
396 | return 0; | ||
397 | } | ||
398 | skb->len = head->len; | ||
399 | skb->data_len = head->len; | ||
400 | skb->truesize = head->truesize; | ||
401 | *kcm_rx_msg(skb) = *kcm_rx_msg(head); | ||
402 | psock->rx_skb_nextp = &head->next; | ||
403 | skb_shinfo(skb)->frag_list = head; | ||
404 | psock->rx_skb_head = skb; | ||
405 | head = skb; | ||
406 | } else { | ||
407 | psock->rx_skb_nextp = | ||
408 | &skb_shinfo(head)->frag_list; | ||
409 | } | ||
410 | } | ||
411 | } | ||
412 | |||
413 | while (eaten < orig_len) { | ||
414 | /* Always clone since we will consume something */ | ||
415 | skb = skb_clone(orig_skb, GFP_ATOMIC); | ||
416 | if (!skb) { | ||
417 | desc->error = -ENOMEM; | ||
418 | break; | ||
419 | } | ||
420 | |||
421 | cand_len = orig_len - eaten; | ||
422 | |||
423 | head = psock->rx_skb_head; | ||
424 | if (!head) { | ||
425 | head = skb; | ||
426 | psock->rx_skb_head = head; | ||
427 | /* Will set rx_skb_nextp on next packet if needed */ | ||
428 | psock->rx_skb_nextp = NULL; | ||
429 | rxm = kcm_rx_msg(head); | ||
430 | memset(rxm, 0, sizeof(*rxm)); | ||
431 | rxm->offset = orig_offset + eaten; | ||
432 | } else { | ||
433 | /* Unclone since we may be appending to an skb that we | ||
434 | * already share a frag_list with. | ||
435 | */ | ||
436 | err = skb_unclone(skb, GFP_ATOMIC); | ||
437 | if (err) { | ||
438 | desc->error = err; | ||
439 | break; | ||
440 | } | ||
441 | |||
442 | rxm = kcm_rx_msg(head); | ||
443 | *psock->rx_skb_nextp = skb; | ||
444 | psock->rx_skb_nextp = &skb->next; | ||
445 | head->data_len += skb->len; | ||
446 | head->len += skb->len; | ||
447 | head->truesize += skb->truesize; | ||
448 | } | ||
449 | |||
450 | if (!rxm->full_len) { | ||
451 | ssize_t len; | ||
452 | |||
453 | len = KCM_RUN_FILTER(psock->bpf_prog, head); | ||
454 | |||
455 | if (!len) { | ||
456 | /* Need more header to determine length */ | ||
457 | rxm->accum_len += cand_len; | ||
458 | eaten += cand_len; | ||
459 | WARN_ON(eaten != orig_len); | ||
460 | break; | ||
461 | } else if (len <= (ssize_t)head->len - | ||
462 | skb->len - rxm->offset) { | ||
463 | /* Length must be into new skb (and also | ||
464 | * greater than zero) | ||
465 | */ | ||
466 | desc->error = -EPROTO; | ||
467 | psock->rx_skb_head = NULL; | ||
468 | kcm_abort_rx_psock(psock, EPROTO, head); | ||
469 | break; | ||
470 | } | ||
471 | |||
472 | rxm->full_len = len; | ||
473 | } | ||
474 | |||
475 | extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len; | ||
476 | |||
477 | if (extra < 0) { | ||
478 | /* Message not complete yet. */ | ||
479 | rxm->accum_len += cand_len; | ||
480 | eaten += cand_len; | ||
481 | WARN_ON(eaten != orig_len); | ||
482 | break; | ||
483 | } | ||
484 | |||
485 | /* Positive extra indicates more bytes than needed for the | ||
486 | * message | ||
487 | */ | ||
488 | |||
489 | WARN_ON(extra > cand_len); | ||
490 | |||
491 | eaten += (cand_len - extra); | ||
492 | |||
493 | /* Hurray, we have a new message! */ | ||
494 | psock->rx_skb_head = NULL; | ||
495 | |||
496 | try_queue: | ||
497 | kcm = reserve_rx_kcm(psock, head); | ||
498 | if (!kcm) { | ||
499 | /* Unable to reserve a KCM, message is held in psock. */ | ||
500 | break; | ||
501 | } | ||
502 | |||
503 | if (kcm_queue_rcv_skb(&kcm->sk, head)) { | ||
504 | /* Should mean socket buffer full */ | ||
505 | unreserve_rx_kcm(psock, false); | ||
506 | goto try_queue; | ||
507 | } | ||
508 | } | ||
509 | |||
510 | if (cloned_orig) | ||
511 | kfree_skb(orig_skb); | ||
512 | |||
513 | return eaten; | ||
514 | } | ||
515 | |||
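The parse loop above delegates all framing to the attached BPF program: KCM_RUN_FILTER returning 0 means more header bytes are needed, and a positive return is the total message length. Purely as an illustration (not shipped with this patch), a socket filter for a framing with a 2-byte big-endian length prefix covering the whole message could look like this; the SEC() macro and bpf_skb_load_bytes() wrapper are assumed to come from a samples/bpf-style bpf_helpers.h:

#include <linux/bpf.h>
#include "bpf_helpers.h"	/* assumed: SEC(), bpf_skb_load_bytes() */

SEC("socket")
int kcm_parse_msg_len(struct __sk_buff *skb)
{
	__u8 hi, lo;

	/* Return 0 until the 2-byte length header is fully available */
	if (bpf_skb_load_bytes(skb, 0, &hi, 1) ||
	    bpf_skb_load_bytes(skb, 1, &lo, 1))
		return 0;

	/* Total message length; big-endian on the wire */
	return (hi << 8) | lo;
}

char _license[] SEC("license") = "GPL";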
516 | /* Called with lock held on lower socket */ | ||
517 | static int psock_tcp_read_sock(struct kcm_psock *psock) | ||
518 | { | ||
519 | read_descriptor_t desc; | ||
520 | |||
521 | desc.arg.data = psock; | ||
522 | desc.error = 0; | ||
523 | desc.count = 1; /* give more than one skb per call */ | ||
524 | |||
525 | /* sk should be locked here, so okay to do tcp_read_sock */ | ||
526 | tcp_read_sock(psock->sk, &desc, kcm_tcp_recv); | ||
527 | |||
528 | unreserve_rx_kcm(psock, true); | ||
529 | |||
530 | return desc.error; | ||
531 | } | ||
532 | |||
533 | /* Lower sock lock held */ | ||
534 | static void psock_tcp_data_ready(struct sock *sk) | ||
535 | { | ||
536 | struct kcm_psock *psock; | ||
537 | |||
538 | read_lock_bh(&sk->sk_callback_lock); | ||
539 | |||
540 | psock = (struct kcm_psock *)sk->sk_user_data; | ||
541 | if (unlikely(!psock || psock->rx_stopped)) | ||
542 | goto out; | ||
543 | |||
544 | if (psock->ready_rx_msg) | ||
545 | goto out; | ||
546 | |||
547 | if (psock_tcp_read_sock(psock) == -ENOMEM) | ||
548 | queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0); | ||
549 | |||
550 | out: | ||
551 | read_unlock_bh(&sk->sk_callback_lock); | ||
552 | } | ||
553 | |||
554 | static void do_psock_rx_work(struct kcm_psock *psock) | ||
555 | { | ||
556 | read_descriptor_t rd_desc; | ||
557 | struct sock *csk = psock->sk; | ||
558 | |||
559 | /* We need the read lock to synchronize with psock_tcp_data_ready. We | ||
560 | * need the socket lock for calling tcp_read_sock. | ||
561 | */ | ||
562 | lock_sock(csk); | ||
563 | read_lock_bh(&csk->sk_callback_lock); | ||
564 | |||
565 | if (unlikely(csk->sk_user_data != psock)) | ||
566 | goto out; | ||
567 | |||
568 | if (unlikely(psock->rx_stopped)) | ||
569 | goto out; | ||
570 | |||
571 | if (psock->ready_rx_msg) | ||
572 | goto out; | ||
573 | |||
574 | rd_desc.arg.data = psock; | ||
575 | |||
576 | if (psock_tcp_read_sock(psock) == -ENOMEM) | ||
577 | queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0); | ||
578 | |||
579 | out: | ||
580 | read_unlock_bh(&csk->sk_callback_lock); | ||
581 | release_sock(csk); | ||
582 | } | ||
583 | |||
584 | static void psock_rx_work(struct work_struct *w) | ||
585 | { | ||
586 | do_psock_rx_work(container_of(w, struct kcm_psock, rx_work)); | ||
587 | } | ||
588 | |||
589 | static void psock_rx_delayed_work(struct work_struct *w) | ||
590 | { | ||
591 | do_psock_rx_work(container_of(w, struct kcm_psock, | ||
592 | rx_delayed_work.work)); | ||
593 | } | ||
594 | |||
595 | static void psock_tcp_state_change(struct sock *sk) | ||
596 | { | ||
597 | /* TCP only does a POLLIN for a half close. Do a POLLHUP here | ||
598 | * since the application will normally not poll with POLLIN | ||
599 | * on the TCP sockets. | ||
600 | */ | ||
601 | |||
602 | report_csk_error(sk, EPIPE); | ||
603 | } | ||
604 | |||
605 | static void psock_tcp_write_space(struct sock *sk) | ||
606 | { | ||
607 | struct kcm_psock *psock; | ||
608 | struct kcm_mux *mux; | ||
609 | struct kcm_sock *kcm; | ||
610 | |||
611 | read_lock_bh(&sk->sk_callback_lock); | ||
612 | |||
613 | psock = (struct kcm_psock *)sk->sk_user_data; | ||
614 | if (unlikely(!psock)) | ||
615 | goto out; | ||
616 | |||
617 | mux = psock->mux; | ||
618 | |||
619 | spin_lock_bh(&mux->lock); | ||
620 | |||
621 | /* Check if the socket is reserved; if so, someone is waiting to send. */ | ||
622 | kcm = psock->tx_kcm; | ||
623 | if (kcm) | ||
624 | queue_work(kcm_wq, &kcm->tx_work); | ||
625 | |||
626 | spin_unlock_bh(&mux->lock); | ||
627 | out: | ||
628 | read_unlock_bh(&sk->sk_callback_lock); | ||
629 | } | ||
630 | |||
631 | static void unreserve_psock(struct kcm_sock *kcm); | ||
632 | |||
633 | /* kcm sock is locked. */ | ||
634 | static struct kcm_psock *reserve_psock(struct kcm_sock *kcm) | ||
635 | { | ||
636 | struct kcm_mux *mux = kcm->mux; | ||
637 | struct kcm_psock *psock; | ||
638 | |||
639 | psock = kcm->tx_psock; | ||
640 | |||
641 | smp_rmb(); /* Must read tx_psock before tx_wait */ | ||
642 | |||
643 | if (psock) { | ||
644 | WARN_ON(kcm->tx_wait); | ||
645 | if (unlikely(psock->tx_stopped)) | ||
646 | unreserve_psock(kcm); | ||
647 | else | ||
648 | return kcm->tx_psock; | ||
649 | } | ||
650 | |||
651 | spin_lock_bh(&mux->lock); | ||
652 | |||
653 | /* Check again under lock to see if a psock was reserved for this | ||
654 | * kcm via unreserve_psock. | ||
655 | */ | ||
656 | psock = kcm->tx_psock; | ||
657 | if (unlikely(psock)) { | ||
658 | WARN_ON(kcm->tx_wait); | ||
659 | spin_unlock_bh(&mux->lock); | ||
660 | return kcm->tx_psock; | ||
661 | } | ||
662 | |||
663 | if (!list_empty(&mux->psocks_avail)) { | ||
664 | psock = list_first_entry(&mux->psocks_avail, | ||
665 | struct kcm_psock, | ||
666 | psock_avail_list); | ||
667 | list_del(&psock->psock_avail_list); | ||
668 | if (kcm->tx_wait) { | ||
669 | list_del(&kcm->wait_psock_list); | ||
670 | kcm->tx_wait = false; | ||
671 | } | ||
672 | kcm->tx_psock = psock; | ||
673 | psock->tx_kcm = kcm; | ||
674 | } else if (!kcm->tx_wait) { | ||
675 | list_add_tail(&kcm->wait_psock_list, | ||
676 | &mux->kcm_tx_waiters); | ||
677 | kcm->tx_wait = true; | ||
678 | } | ||
679 | |||
680 | spin_unlock_bh(&mux->lock); | ||
681 | |||
682 | return psock; | ||
683 | } | ||
684 | |||
685 | /* mux lock held */ | ||
686 | static void psock_now_avail(struct kcm_psock *psock) | ||
687 | { | ||
688 | struct kcm_mux *mux = psock->mux; | ||
689 | struct kcm_sock *kcm; | ||
690 | |||
691 | if (list_empty(&mux->kcm_tx_waiters)) { | ||
692 | list_add_tail(&psock->psock_avail_list, | ||
693 | &mux->psocks_avail); | ||
694 | } else { | ||
695 | kcm = list_first_entry(&mux->kcm_tx_waiters, | ||
696 | struct kcm_sock, | ||
697 | wait_psock_list); | ||
698 | list_del(&kcm->wait_psock_list); | ||
699 | kcm->tx_wait = false; | ||
700 | psock->tx_kcm = kcm; | ||
701 | |||
702 | /* Commit before changing tx_psock since that is read in | ||
703 | * reserve_psock before queuing work. | ||
704 | */ | ||
705 | smp_mb(); | ||
706 | |||
707 | kcm->tx_psock = psock; | ||
708 | queue_work(kcm_wq, &kcm->tx_work); | ||
709 | } | ||
710 | } | ||
711 | |||
712 | /* kcm sock is locked. */ | ||
713 | static void unreserve_psock(struct kcm_sock *kcm) | ||
714 | { | ||
715 | struct kcm_psock *psock; | ||
716 | struct kcm_mux *mux = kcm->mux; | ||
717 | |||
718 | spin_lock_bh(&mux->lock); | ||
719 | |||
720 | psock = kcm->tx_psock; | ||
721 | |||
722 | if (WARN_ON(!psock)) { | ||
723 | spin_unlock_bh(&mux->lock); | ||
724 | return; | ||
725 | } | ||
726 | |||
727 | smp_rmb(); /* Read tx_psock before tx_wait */ | ||
728 | |||
729 | WARN_ON(kcm->tx_wait); | ||
730 | |||
731 | kcm->tx_psock = NULL; | ||
732 | psock->tx_kcm = NULL; | ||
733 | |||
734 | if (unlikely(psock->tx_stopped)) { | ||
735 | if (psock->done) { | ||
736 | /* Deferred free */ | ||
737 | list_del(&psock->psock_list); | ||
738 | mux->psocks_cnt--; | ||
739 | sock_put(psock->sk); | ||
740 | fput(psock->sk->sk_socket->file); | ||
741 | kmem_cache_free(kcm_psockp, psock); | ||
742 | } | ||
743 | |||
744 | /* Don't put back on available list */ | ||
745 | |||
746 | spin_unlock_bh(&mux->lock); | ||
747 | |||
748 | return; | ||
749 | } | ||
750 | |||
751 | psock_now_avail(psock); | ||
752 | |||
753 | spin_unlock_bh(&mux->lock); | ||
754 | } | ||
755 | |||
756 | /* Write any messages ready on the kcm socket. Called with kcm sock lock | ||
757 | * held. Return bytes actually sent or error. | ||
758 | */ | ||
759 | static int kcm_write_msgs(struct kcm_sock *kcm) | ||
760 | { | ||
761 | struct sock *sk = &kcm->sk; | ||
762 | struct kcm_psock *psock; | ||
763 | struct sk_buff *skb, *head; | ||
764 | struct kcm_tx_msg *txm; | ||
765 | unsigned short fragidx, frag_offset; | ||
766 | unsigned int sent, total_sent = 0; | ||
767 | int ret = 0; | ||
768 | |||
769 | kcm->tx_wait_more = false; | ||
770 | psock = kcm->tx_psock; | ||
771 | if (unlikely(psock && psock->tx_stopped)) { | ||
772 | /* A reserved psock was aborted asynchronously. Unreserve | ||
773 | * it and we'll retry the message. | ||
774 | */ | ||
775 | unreserve_psock(kcm); | ||
776 | if (skb_queue_empty(&sk->sk_write_queue)) | ||
777 | return 0; | ||
778 | |||
779 | kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0; | ||
780 | |||
781 | } else if (skb_queue_empty(&sk->sk_write_queue)) { | ||
782 | return 0; | ||
783 | } | ||
784 | |||
785 | head = skb_peek(&sk->sk_write_queue); | ||
786 | txm = kcm_tx_msg(head); | ||
787 | |||
788 | if (txm->sent) { | ||
789 | /* Send of first skbuff in queue already in progress */ | ||
790 | if (WARN_ON(!psock)) { | ||
791 | ret = -EINVAL; | ||
792 | goto out; | ||
793 | } | ||
794 | sent = txm->sent; | ||
795 | frag_offset = txm->frag_offset; | ||
796 | fragidx = txm->fragidx; | ||
797 | skb = txm->frag_skb; | ||
798 | |||
799 | goto do_frag; | ||
800 | } | ||
801 | |||
802 | try_again: | ||
803 | psock = reserve_psock(kcm); | ||
804 | if (!psock) | ||
805 | goto out; | ||
806 | |||
807 | do { | ||
808 | skb = head; | ||
809 | txm = kcm_tx_msg(head); | ||
810 | sent = 0; | ||
811 | |||
812 | do_frag_list: | ||
813 | if (WARN_ON(!skb_shinfo(skb)->nr_frags)) { | ||
814 | ret = -EINVAL; | ||
815 | goto out; | ||
816 | } | ||
817 | |||
818 | for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; | ||
819 | fragidx++) { | ||
820 | skb_frag_t *frag; | ||
821 | |||
822 | frag_offset = 0; | ||
823 | do_frag: | ||
824 | frag = &skb_shinfo(skb)->frags[fragidx]; | ||
825 | if (WARN_ON(!frag->size)) { | ||
826 | ret = -EINVAL; | ||
827 | goto out; | ||
828 | } | ||
829 | |||
830 | ret = kernel_sendpage(psock->sk->sk_socket, | ||
831 | frag->page.p, | ||
832 | frag->page_offset + frag_offset, | ||
833 | frag->size - frag_offset, | ||
834 | MSG_DONTWAIT); | ||
835 | if (ret <= 0) { | ||
836 | if (ret == -EAGAIN) { | ||
837 | /* Save state to try again when there's | ||
838 | * write space on the socket | ||
839 | */ | ||
840 | txm->sent = sent; | ||
841 | txm->frag_offset = frag_offset; | ||
842 | txm->fragidx = fragidx; | ||
843 | txm->frag_skb = skb; | ||
844 | |||
845 | ret = 0; | ||
846 | goto out; | ||
847 | } | ||
848 | |||
849 | /* Hard failure in sending message, abort this | ||
850 | * psock since it has lost framing | ||
851 | * synchronization and retry sending the | ||
852 | * message from the beginning. | ||
853 | */ | ||
854 | kcm_abort_tx_psock(psock, ret ? -ret : EPIPE, | ||
855 | true); | ||
856 | unreserve_psock(kcm); | ||
857 | |||
858 | txm->sent = 0; | ||
859 | ret = 0; | ||
860 | |||
861 | goto try_again; | ||
862 | } | ||
863 | |||
864 | sent += ret; | ||
865 | frag_offset += ret; | ||
866 | if (frag_offset < frag->size) { | ||
867 | /* Not finished with this frag */ | ||
868 | goto do_frag; | ||
869 | } | ||
870 | } | ||
871 | |||
872 | if (skb == head) { | ||
873 | if (skb_has_frag_list(skb)) { | ||
874 | skb = skb_shinfo(skb)->frag_list; | ||
875 | goto do_frag_list; | ||
876 | } | ||
877 | } else if (skb->next) { | ||
878 | skb = skb->next; | ||
879 | goto do_frag_list; | ||
880 | } | ||
881 | |||
882 | /* Successfully sent the whole packet, account for it. */ | ||
883 | skb_dequeue(&sk->sk_write_queue); | ||
884 | kfree_skb(head); | ||
885 | sk->sk_wmem_queued -= sent; | ||
886 | total_sent += sent; | ||
887 | } while ((head = skb_peek(&sk->sk_write_queue))); | ||
888 | out: | ||
889 | if (!head) { | ||
890 | /* Done with all queued messages. */ | ||
891 | WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); | ||
892 | unreserve_psock(kcm); | ||
893 | } | ||
894 | |||
895 | /* Check if write space is available */ | ||
896 | sk->sk_write_space(sk); | ||
897 | |||
898 | return total_sent ? : ret; | ||
899 | } | ||
900 | |||
901 | static void kcm_tx_work(struct work_struct *w) | ||
902 | { | ||
903 | struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work); | ||
904 | struct sock *sk = &kcm->sk; | ||
905 | int err; | ||
906 | |||
907 | lock_sock(sk); | ||
908 | |||
909 | /* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx | ||
910 | * aborts | ||
911 | */ | ||
912 | err = kcm_write_msgs(kcm); | ||
913 | if (err < 0) { | ||
914 | /* Hard failure in write, report error on KCM socket */ | ||
915 | pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err); | ||
916 | report_csk_error(&kcm->sk, -err); | ||
917 | goto out; | ||
918 | } | ||
919 | |||
920 | /* Primarily for SOCK_SEQPACKET sockets */ | ||
921 | if (likely(sk->sk_socket) && | ||
922 | test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { | ||
923 | clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | ||
924 | sk->sk_write_space(sk); | ||
925 | } | ||
926 | |||
927 | out: | ||
928 | release_sock(sk); | ||
929 | } | ||
930 | |||
931 | static void kcm_push(struct kcm_sock *kcm) | ||
932 | { | ||
933 | if (kcm->tx_wait_more) | ||
934 | kcm_write_msgs(kcm); | ||
935 | } | ||
936 | |||
937 | static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) | ||
938 | { | ||
939 | struct sock *sk = sock->sk; | ||
940 | struct kcm_sock *kcm = kcm_sk(sk); | ||
941 | struct sk_buff *skb = NULL, *head = NULL; | ||
942 | size_t copy, copied = 0; | ||
943 | long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); | ||
944 | int eor = (sock->type == SOCK_DGRAM) ? | ||
945 | !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR); | ||
946 | int err = -EPIPE; | ||
947 | |||
948 | lock_sock(sk); | ||
949 | |||
950 | /* Per tcp_sendmsg this should be in poll */ | ||
951 | sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); | ||
952 | |||
953 | if (sk->sk_err) | ||
954 | goto out_error; | ||
955 | |||
956 | if (kcm->seq_skb) { | ||
957 | /* Previously opened message */ | ||
958 | head = kcm->seq_skb; | ||
959 | skb = kcm_tx_msg(head)->last_skb; | ||
960 | goto start; | ||
961 | } | ||
962 | |||
963 | /* Call the sk_stream functions to manage the sndbuf mem. */ | ||
964 | if (!sk_stream_memory_free(sk)) { | ||
965 | kcm_push(kcm); | ||
966 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | ||
967 | err = sk_stream_wait_memory(sk, &timeo); | ||
968 | if (err) | ||
969 | goto out_error; | ||
970 | } | ||
971 | |||
972 | /* New message, alloc head skb */ | ||
973 | head = alloc_skb(0, sk->sk_allocation); | ||
974 | while (!head) { | ||
975 | kcm_push(kcm); | ||
976 | err = sk_stream_wait_memory(sk, &timeo); | ||
977 | if (err) | ||
978 | goto out_error; | ||
979 | |||
980 | head = alloc_skb(0, sk->sk_allocation); | ||
981 | } | ||
982 | |||
983 | skb = head; | ||
984 | |||
985 | /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling | ||
986 | * csum_and_copy_from_iter from skb_do_copy_data_nocache. | ||
987 | */ | ||
988 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
989 | |||
990 | start: | ||
991 | while (msg_data_left(msg)) { | ||
992 | bool merge = true; | ||
993 | int i = skb_shinfo(skb)->nr_frags; | ||
994 | struct page_frag *pfrag = sk_page_frag(sk); | ||
995 | |||
996 | if (!sk_page_frag_refill(sk, pfrag)) | ||
997 | goto wait_for_memory; | ||
998 | |||
999 | if (!skb_can_coalesce(skb, i, pfrag->page, | ||
1000 | pfrag->offset)) { | ||
1001 | if (i == MAX_SKB_FRAGS) { | ||
1002 | struct sk_buff *tskb; | ||
1003 | |||
1004 | tskb = alloc_skb(0, sk->sk_allocation); | ||
1005 | if (!tskb) | ||
1006 | goto wait_for_memory; | ||
1007 | |||
1008 | if (head == skb) | ||
1009 | skb_shinfo(head)->frag_list = tskb; | ||
1010 | else | ||
1011 | skb->next = tskb; | ||
1012 | |||
1013 | skb = tskb; | ||
1014 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
1015 | continue; | ||
1016 | } | ||
1017 | merge = false; | ||
1018 | } | ||
1019 | |||
1020 | copy = min_t(int, msg_data_left(msg), | ||
1021 | pfrag->size - pfrag->offset); | ||
1022 | |||
1023 | if (!sk_wmem_schedule(sk, copy)) | ||
1024 | goto wait_for_memory; | ||
1025 | |||
1026 | err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, | ||
1027 | pfrag->page, | ||
1028 | pfrag->offset, | ||
1029 | copy); | ||
1030 | if (err) | ||
1031 | goto out_error; | ||
1032 | |||
1033 | /* Update the skb. */ | ||
1034 | if (merge) { | ||
1035 | skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); | ||
1036 | } else { | ||
1037 | skb_fill_page_desc(skb, i, pfrag->page, | ||
1038 | pfrag->offset, copy); | ||
1039 | get_page(pfrag->page); | ||
1040 | } | ||
1041 | |||
1042 | pfrag->offset += copy; | ||
1043 | copied += copy; | ||
1044 | if (head != skb) { | ||
1045 | head->len += copy; | ||
1046 | head->data_len += copy; | ||
1047 | } | ||
1048 | |||
1049 | continue; | ||
1050 | |||
1051 | wait_for_memory: | ||
1052 | kcm_push(kcm); | ||
1053 | err = sk_stream_wait_memory(sk, &timeo); | ||
1054 | if (err) | ||
1055 | goto out_error; | ||
1056 | } | ||
1057 | |||
1058 | if (eor) { | ||
1059 | bool not_busy = skb_queue_empty(&sk->sk_write_queue); | ||
1060 | |||
1061 | /* Message complete, queue it on send buffer */ | ||
1062 | __skb_queue_tail(&sk->sk_write_queue, head); | ||
1063 | kcm->seq_skb = NULL; | ||
1064 | |||
1065 | if (msg->msg_flags & MSG_BATCH) { | ||
1066 | kcm->tx_wait_more = true; | ||
1067 | } else if (kcm->tx_wait_more || not_busy) { | ||
1068 | err = kcm_write_msgs(kcm); | ||
1069 | if (err < 0) { | ||
1070 | /* We got a hard error in write_msgs but have | ||
1071 | * already queued this message. Report an error | ||
1072 | * in the socket, but don't affect return value | ||
1073 | * from sendmsg | ||
1074 | */ | ||
1075 | pr_warn("KCM: Hard failure on kcm_write_msgs\n"); | ||
1076 | report_csk_error(&kcm->sk, -err); | ||
1077 | } | ||
1078 | } | ||
1079 | } else { | ||
1080 | /* Message not complete, save state */ | ||
1081 | partial_message: | ||
1082 | kcm->seq_skb = head; | ||
1083 | kcm_tx_msg(head)->last_skb = skb; | ||
1084 | } | ||
1085 | |||
1086 | release_sock(sk); | ||
1087 | return copied; | ||
1088 | |||
1089 | out_error: | ||
1090 | kcm_push(kcm); | ||
1091 | |||
1092 | if (copied && sock->type == SOCK_SEQPACKET) { | ||
1093 | /* Wrote some bytes before encountering an | ||
1094 | * error, return partial success. | ||
1095 | */ | ||
1096 | goto partial_message; | ||
1097 | } | ||
1098 | |||
1099 | if (head != kcm->seq_skb) | ||
1100 | kfree_skb(head); | ||
1101 | |||
1102 | err = sk_stream_error(sk, msg->msg_flags, err); | ||
1103 | |||
1104 | /* make sure we wake any epoll edge trigger waiter */ | ||
1105 | if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) | ||
1106 | sk->sk_write_space(sk); | ||
1107 | |||
1108 | release_sock(sk); | ||
1109 | return err; | ||
1110 | } | ||
1111 | |||
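Seen from userspace, the flag handling above means a SOCK_SEQPACKET message can be assembled across several sendmsg() calls and is only committed by MSG_EOR, while MSG_BATCH sets tx_wait_more so transmission is deferred until a later send or kcm_push. A hypothetical two-part send (not part of the patch):

#include <sys/socket.h>
#include <sys/uio.h>

/* Hypothetical: build one message from two sendmsg() calls on a
 * SOCK_SEQPACKET KCM socket, completing it with MSG_EOR.
 */
static ssize_t kcm_send_two_part(int kcm_fd,
				 const void *hdr, size_t hdr_len,
				 const void *body, size_t body_len)
{
	struct iovec iov = { .iov_base = (void *)hdr, .iov_len = hdr_len };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };

	/* No MSG_EOR: the message stays open, saved in kcm->seq_skb */
	if (sendmsg(kcm_fd, &msg, 0) < 0)
		return -1;

	iov.iov_base = (void *)body;
	iov.iov_len = body_len;
	/* MSG_EOR completes the message and queues it for transmission */
	return sendmsg(kcm_fd, &msg, MSG_EOR);
}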
1112 | static struct sk_buff *kcm_wait_data(struct sock *sk, int flags, | ||
1113 | long timeo, int *err) | ||
1114 | { | ||
1115 | struct sk_buff *skb; | ||
1116 | |||
1117 | while (!(skb = skb_peek(&sk->sk_receive_queue))) { | ||
1118 | if (sk->sk_err) { | ||
1119 | *err = sock_error(sk); | ||
1120 | return NULL; | ||
1121 | } | ||
1122 | |||
1123 | if (sock_flag(sk, SOCK_DONE)) | ||
1124 | return NULL; | ||
1125 | |||
1126 | if ((flags & MSG_DONTWAIT) || !timeo) { | ||
1127 | *err = -EAGAIN; | ||
1128 | return NULL; | ||
1129 | } | ||
1130 | |||
1131 | sk_wait_data(sk, &timeo, NULL); | ||
1132 | |||
1133 | /* Handle signals */ | ||
1134 | if (signal_pending(current)) { | ||
1135 | *err = sock_intr_errno(timeo); | ||
1136 | return NULL; | ||
1137 | } | ||
1138 | } | ||
1139 | |||
1140 | return skb; | ||
1141 | } | ||
1142 | |||
1143 | static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, | ||
1144 | size_t len, int flags) | ||
1145 | { | ||
1146 | struct sock *sk = sock->sk; | ||
1147 | int err = 0; | ||
1148 | long timeo; | ||
1149 | struct kcm_rx_msg *rxm; | ||
1150 | int copied = 0; | ||
1151 | struct sk_buff *skb; | ||
1152 | |||
1153 | timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); | ||
1154 | |||
1155 | lock_sock(sk); | ||
1156 | |||
1157 | skb = kcm_wait_data(sk, flags, timeo, &err); | ||
1158 | if (!skb) | ||
1159 | goto out; | ||
1160 | |||
1161 | /* Okay, have a message on the receive queue */ | ||
1162 | |||
1163 | rxm = kcm_rx_msg(skb); | ||
1164 | |||
1165 | if (len > rxm->full_len) | ||
1166 | len = rxm->full_len; | ||
1167 | |||
1168 | err = skb_copy_datagram_msg(skb, rxm->offset, msg, len); | ||
1169 | if (err < 0) | ||
1170 | goto out; | ||
1171 | |||
1172 | copied = len; | ||
1173 | if (likely(!(flags & MSG_PEEK))) { | ||
1174 | if (copied < rxm->full_len) { | ||
1175 | if (sock->type == SOCK_DGRAM) { | ||
1176 | /* Truncated message */ | ||
1177 | msg->msg_flags |= MSG_TRUNC; | ||
1178 | goto msg_finished; | ||
1179 | } | ||
1180 | rxm->offset += copied; | ||
1181 | rxm->full_len -= copied; | ||
1182 | } else { | ||
1183 | msg_finished: | ||
1184 | /* Finished with message */ | ||
1185 | msg->msg_flags |= MSG_EOR; | ||
1186 | skb_unlink(skb, &sk->sk_receive_queue); | ||
1187 | kfree_skb(skb); | ||
1188 | } | ||
1189 | } | ||
1190 | |||
1191 | out: | ||
1192 | release_sock(sk); | ||
1193 | |||
1194 | return copied ? : err; | ||
1195 | } | ||
1196 | |||
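The receive side is message-at-a-time: a successful recvmsg() never crosses a message boundary, MSG_EOR signals that the message was fully consumed, and on SOCK_DGRAM an undersized buffer sets MSG_TRUNC and the tail is discarded. A hypothetical receive loop (not part of the patch):

#include <stdio.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Hypothetical: pull whole parsed messages off a KCM socket */
static void kcm_recv_loop(int kcm_fd)
{
	char buf[4096];

	for (;;) {
		struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
		struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
		ssize_t n = recvmsg(kcm_fd, &msg, 0);

		if (n <= 0)
			break;
		if (msg.msg_flags & MSG_TRUNC)
			fprintf(stderr, "message larger than buffer\n");
		if (msg.msg_flags & MSG_EOR)
			printf("complete message, %zd bytes\n", n);
	}
}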
1197 | /* kcm sock lock held */ | ||
1198 | static void kcm_recv_disable(struct kcm_sock *kcm) | ||
1199 | { | ||
1200 | struct kcm_mux *mux = kcm->mux; | ||
1201 | |||
1202 | if (kcm->rx_disabled) | ||
1203 | return; | ||
1204 | |||
1205 | spin_lock_bh(&mux->rx_lock); | ||
1206 | |||
1207 | kcm->rx_disabled = 1; | ||
1208 | |||
1209 | /* If a psock is reserved we'll do cleanup in unreserve */ | ||
1210 | if (!kcm->rx_psock) { | ||
1211 | if (kcm->rx_wait) { | ||
1212 | list_del(&kcm->wait_rx_list); | ||
1213 | kcm->rx_wait = false; | ||
1214 | } | ||
1215 | |||
1216 | requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); | ||
1217 | } | ||
1218 | |||
1219 | spin_unlock_bh(&mux->rx_lock); | ||
1220 | } | ||
1221 | |||
1222 | /* kcm sock lock held */ | ||
1223 | static void kcm_recv_enable(struct kcm_sock *kcm) | ||
1224 | { | ||
1225 | struct kcm_mux *mux = kcm->mux; | ||
1226 | |||
1227 | if (!kcm->rx_disabled) | ||
1228 | return; | ||
1229 | |||
1230 | spin_lock_bh(&mux->rx_lock); | ||
1231 | |||
1232 | kcm->rx_disabled = 0; | ||
1233 | kcm_rcv_ready(kcm); | ||
1234 | |||
1235 | spin_unlock_bh(&mux->rx_lock); | ||
1236 | } | ||
1237 | |||
1238 | static int kcm_setsockopt(struct socket *sock, int level, int optname, | ||
1239 | char __user *optval, unsigned int optlen) | ||
1240 | { | ||
1241 | struct kcm_sock *kcm = kcm_sk(sock->sk); | ||
1242 | int val, valbool; | ||
1243 | int err = 0; | ||
1244 | |||
1245 | if (level != SOL_KCM) | ||
1246 | return -ENOPROTOOPT; | ||
1247 | |||
1248 | if (optlen < sizeof(int)) | ||
1249 | return -EINVAL; | ||
1250 | |||
1251 | if (get_user(val, (int __user *)optval)) | ||
1252 | return -EINVAL; | ||
1253 | |||
1254 | valbool = val ? 1 : 0; | ||
1255 | |||
1256 | switch (optname) { | ||
1257 | case KCM_RECV_DISABLE: | ||
1258 | lock_sock(&kcm->sk); | ||
1259 | if (valbool) | ||
1260 | kcm_recv_disable(kcm); | ||
1261 | else | ||
1262 | kcm_recv_enable(kcm); | ||
1263 | release_sock(&kcm->sk); | ||
1264 | break; | ||
1265 | default: | ||
1266 | err = -ENOPROTOOPT; | ||
1267 | } | ||
1268 | |||
1269 | return err; | ||
1270 | } | ||
1271 | |||
1272 | static int kcm_getsockopt(struct socket *sock, int level, int optname, | ||
1273 | char __user *optval, int __user *optlen) | ||
1274 | { | ||
1275 | struct kcm_sock *kcm = kcm_sk(sock->sk); | ||
1276 | int val, len; | ||
1277 | |||
1278 | if (level != SOL_KCM) | ||
1279 | return -ENOPROTOOPT; | ||
1280 | |||
1281 | if (get_user(len, optlen)) | ||
1282 | return -EFAULT; | ||
1283 | |||
1284 | len = min_t(unsigned int, len, sizeof(int)); | ||
1285 | if (len < 0) | ||
1286 | return -EINVAL; | ||
1287 | |||
1288 | switch (optname) { | ||
1289 | case KCM_RECV_DISABLE: | ||
1290 | val = kcm->rx_disabled; | ||
1291 | break; | ||
1292 | default: | ||
1293 | return -ENOPROTOOPT; | ||
1294 | } | ||
1295 | |||
1296 | if (put_user(len, optlen)) | ||
1297 | return -EFAULT; | ||
1298 | if (copy_to_user(optval, &val, len)) | ||
1299 | return -EFAULT; | ||
1300 | return 0; | ||
1301 | } | ||
1302 | |||
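Together these handlers expose a single knob, KCM_RECV_DISABLE, which pulls the socket out of the receive rotation: new messages are routed to other KCM sockets on the mux or parked on rx_hold_queue, and already-queued messages are requeued. A hypothetical call (not part of the patch):

#include <sys/socket.h>
#include <linux/kcm.h>

/* Hypothetical: take kcm_fd out of the receive rotation */
static int kcm_recv_disable(int kcm_fd)
{
	int one = 1;

	return setsockopt(kcm_fd, SOL_KCM, KCM_RECV_DISABLE,
			  &one, sizeof(one));
}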
1303 | static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) | ||
1304 | { | ||
1305 | struct kcm_sock *tkcm; | ||
1306 | struct list_head *head; | ||
1307 | int index = 0; | ||
1308 | |||
1309 | /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so | ||
1310 | * we set sk_state; otherwise epoll_wait always returns right away with | ||
1311 | * POLLHUP | ||
1312 | */ | ||
1313 | kcm->sk.sk_state = TCP_ESTABLISHED; | ||
1314 | |||
1315 | /* Add to mux's kcm sockets list */ | ||
1316 | kcm->mux = mux; | ||
1317 | spin_lock_bh(&mux->lock); | ||
1318 | |||
1319 | head = &mux->kcm_socks; | ||
1320 | list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) { | ||
1321 | if (tkcm->index != index) | ||
1322 | break; | ||
1323 | head = &tkcm->kcm_sock_list; | ||
1324 | index++; | ||
1325 | } | ||
1326 | |||
1327 | list_add(&kcm->kcm_sock_list, head); | ||
1328 | kcm->index = index; | ||
1329 | |||
1330 | mux->kcm_socks_cnt++; | ||
1331 | spin_unlock_bh(&mux->lock); | ||
1332 | |||
1333 | INIT_WORK(&kcm->tx_work, kcm_tx_work); | ||
1334 | |||
1335 | spin_lock_bh(&mux->rx_lock); | ||
1336 | kcm_rcv_ready(kcm); | ||
1337 | spin_unlock_bh(&mux->rx_lock); | ||
1338 | } | ||
1339 | |||
1340 | static int kcm_attach(struct socket *sock, struct socket *csock, | ||
1341 | struct bpf_prog *prog) | ||
1342 | { | ||
1343 | struct kcm_sock *kcm = kcm_sk(sock->sk); | ||
1344 | struct kcm_mux *mux = kcm->mux; | ||
1345 | struct sock *csk; | ||
1346 | struct kcm_psock *psock = NULL, *tpsock; | ||
1347 | struct list_head *head; | ||
1348 | int index = 0; | ||
1349 | |||
1350 | if (csock->ops->family != PF_INET && | ||
1351 | csock->ops->family != PF_INET6) | ||
1352 | return -EINVAL; | ||
1353 | |||
1354 | csk = csock->sk; | ||
1355 | if (!csk) | ||
1356 | return -EINVAL; | ||
1357 | |||
1358 | /* Only support TCP for now */ | ||
1359 | if (csk->sk_protocol != IPPROTO_TCP) | ||
1360 | return -EINVAL; | ||
1361 | |||
1362 | psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); | ||
1363 | if (!psock) | ||
1364 | return -ENOMEM; | ||
1365 | |||
1366 | psock->mux = mux; | ||
1367 | psock->sk = csk; | ||
1368 | psock->bpf_prog = prog; | ||
1369 | INIT_WORK(&psock->rx_work, psock_rx_work); | ||
1370 | INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work); | ||
1371 | |||
1372 | sock_hold(csk); | ||
1373 | |||
1374 | write_lock_bh(&csk->sk_callback_lock); | ||
1375 | psock->save_data_ready = csk->sk_data_ready; | ||
1376 | psock->save_write_space = csk->sk_write_space; | ||
1377 | psock->save_state_change = csk->sk_state_change; | ||
1378 | csk->sk_user_data = psock; | ||
1379 | csk->sk_data_ready = psock_tcp_data_ready; | ||
1380 | csk->sk_write_space = psock_tcp_write_space; | ||
1381 | csk->sk_state_change = psock_tcp_state_change; | ||
1382 | write_unlock_bh(&csk->sk_callback_lock); | ||
1383 | |||
1384 | /* Finished initialization, now add the psock to the MUX. */ | ||
1385 | spin_lock_bh(&mux->lock); | ||
1386 | head = &mux->psocks; | ||
1387 | list_for_each_entry(tpsock, &mux->psocks, psock_list) { | ||
1388 | if (tpsock->index != index) | ||
1389 | break; | ||
1390 | head = &tpsock->psock_list; | ||
1391 | index++; | ||
1392 | } | ||
1393 | |||
1394 | list_add(&psock->psock_list, head); | ||
1395 | psock->index = index; | ||
1396 | |||
1397 | mux->psocks_cnt++; | ||
1398 | psock_now_avail(psock); | ||
1399 | spin_unlock_bh(&mux->lock); | ||
1400 | |||
1401 | /* Schedule RX work in case there are already bytes queued */ | ||
1402 | queue_work(kcm_wq, &psock->rx_work); | ||
1403 | |||
1404 | return 0; | ||
1405 | } | ||
1406 | |||
1407 | static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info) | ||
1408 | { | ||
1409 | struct socket *csock; | ||
1410 | struct bpf_prog *prog; | ||
1411 | int err; | ||
1412 | |||
1413 | csock = sockfd_lookup(info->fd, &err); | ||
1414 | if (!csock) | ||
1415 | return -ENOENT; | ||
1416 | |||
1417 | prog = bpf_prog_get(info->bpf_fd); | ||
1418 | if (IS_ERR(prog)) { | ||
1419 | err = PTR_ERR(prog); | ||
1420 | goto out; | ||
1421 | } | ||
1422 | |||
1423 | if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { | ||
1424 | bpf_prog_put(prog); | ||
1425 | err = -EINVAL; | ||
1426 | goto out; | ||
1427 | } | ||
1428 | |||
1429 | err = kcm_attach(sock, csock, prog); | ||
1430 | if (err) { | ||
1431 | bpf_prog_put(prog); | ||
1432 | goto out; | ||
1433 | } | ||
1434 | |||
1435 | /* Keep reference on file also */ | ||
1436 | |||
1437 | return 0; | ||
1438 | out: | ||
1439 | fput(csock->file); | ||
1440 | return err; | ||
1441 | } | ||
1442 | |||
1443 | static void kcm_unattach(struct kcm_psock *psock) | ||
1444 | { | ||
1445 | struct sock *csk = psock->sk; | ||
1446 | struct kcm_mux *mux = psock->mux; | ||
1447 | |||
1448 | /* Stop getting callbacks from the TCP socket. After this there should | ||
1449 | * be no way to reserve a kcm for this psock. | ||
1450 | */ | ||
1451 | write_lock_bh(&csk->sk_callback_lock); | ||
1452 | csk->sk_user_data = NULL; | ||
1453 | csk->sk_data_ready = psock->save_data_ready; | ||
1454 | csk->sk_write_space = psock->save_write_space; | ||
1455 | csk->sk_state_change = psock->save_state_change; | ||
1456 | psock->rx_stopped = 1; | ||
1457 | |||
1458 | if (WARN_ON(psock->rx_kcm)) { | ||
1459 | write_unlock_bh(&csk->sk_callback_lock); | ||
1460 | return; | ||
1461 | } | ||
1462 | |||
1463 | spin_lock_bh(&mux->rx_lock); | ||
1464 | |||
1465 | /* Stop receiver activities. After this point psock should not be | ||
1466 | * able to get onto ready list either through callbacks or work. | ||
1467 | */ | ||
1468 | if (psock->ready_rx_msg) { | ||
1469 | list_del(&psock->psock_ready_list); | ||
1470 | kfree_skb(psock->ready_rx_msg); | ||
1471 | psock->ready_rx_msg = NULL; | ||
1472 | } | ||
1473 | |||
1474 | spin_unlock_bh(&mux->rx_lock); | ||
1475 | |||
1476 | write_unlock_bh(&csk->sk_callback_lock); | ||
1477 | |||
1478 | cancel_work_sync(&psock->rx_work); | ||
1479 | cancel_delayed_work_sync(&psock->rx_delayed_work); | ||
1480 | |||
1481 | bpf_prog_put(psock->bpf_prog); | ||
1482 | |||
1483 | kfree_skb(psock->rx_skb_head); | ||
1484 | psock->rx_skb_head = NULL; | ||
1485 | |||
1486 | spin_lock_bh(&mux->lock); | ||
1487 | |||
1488 | if (psock->tx_kcm) { | ||
1489 | /* psock was reserved. Just mark it finished and we will clean | ||
1490 | * up in the kcm paths; we need the kcm lock, which cannot be | ||
1491 | * acquired here. | ||
1492 | */ | ||
1493 | spin_unlock_bh(&mux->lock); | ||
1494 | |||
1495 | /* We are unattaching a socket that is reserved. Abort the | ||
1496 | * socket since we may be out of sync in sending on it. We need | ||
1497 | * to do this without the mux lock. | ||
1498 | */ | ||
1499 | kcm_abort_tx_psock(psock, EPIPE, false); | ||
1500 | |||
1501 | spin_lock_bh(&mux->lock); | ||
1502 | if (!psock->tx_kcm) { | ||
1503 | /* psock now unreserved in window mux was unlocked */ | ||
1504 | goto no_reserved; | ||
1505 | } | ||
1506 | psock->done = 1; | ||
1507 | |||
1508 | /* Commit done before queuing work to process it */ | ||
1509 | smp_mb(); | ||
1510 | |||
1511 | /* Queue tx work to make sure psock->done is handled */ | ||
1512 | queue_work(kcm_wq, &psock->tx_kcm->tx_work); | ||
1513 | spin_unlock_bh(&mux->lock); | ||
1514 | } else { | ||
1515 | no_reserved: | ||
1516 | if (!psock->tx_stopped) | ||
1517 | list_del(&psock->psock_avail_list); | ||
1518 | list_del(&psock->psock_list); | ||
1519 | mux->psocks_cnt--; | ||
1520 | spin_unlock_bh(&mux->lock); | ||
1521 | |||
1522 | sock_put(csk); | ||
1523 | fput(csk->sk_socket->file); | ||
1524 | kmem_cache_free(kcm_psockp, psock); | ||
1525 | } | ||
1526 | } | ||

static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info)
{
	struct kcm_sock *kcm = kcm_sk(sock->sk);
	struct kcm_mux *mux = kcm->mux;
	struct kcm_psock *psock;
	struct socket *csock;
	struct sock *csk;
	int err;

	csock = sockfd_lookup(info->fd, &err);
	if (!csock)
		return -ENOENT;

	csk = csock->sk;
	if (!csk) {
		err = -EINVAL;
		goto out;
	}

	err = -ENOENT;

	spin_lock_bh(&mux->lock);

	list_for_each_entry(psock, &mux->psocks, psock_list) {
		if (psock->sk != csk)
			continue;

		/* Found the matching psock */

		if (psock->unattaching || WARN_ON(psock->done)) {
			err = -EALREADY;
			break;
		}

		psock->unattaching = 1;

		spin_unlock_bh(&mux->lock);

		kcm_unattach(psock);

		err = 0;
		goto out;
	}

	spin_unlock_bh(&mux->lock);

out:
	fput(csock->file);
	return err;
}
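
The userspace counterpart is symmetric with attach; a sketch under the
same assumptions (kcm_fd and tcp_fd are illustrative):

	#include <sys/ioctl.h>
	#include <linux/kcm.h>

	/* Detach tcp_fd from the multiplexor behind kcm_fd. */
	static int kcm_do_unattach(int kcm_fd, int tcp_fd)
	{
		struct kcm_unattach unattach = {
			.fd = tcp_fd,
		};

		return ioctl(kcm_fd, SIOCKCMUNATTACH, &unattach);
	}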

static struct proto kcm_proto = {
	.name = "KCM",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct kcm_sock),
};

/* Clone a kcm socket. */
static int kcm_clone(struct socket *osock, struct kcm_clone *info,
		     struct socket **newsockp)
{
	struct socket *newsock;
	struct sock *newsk;
	struct file *newfile;
	int err, newfd;

	err = -ENFILE;
	newsock = sock_alloc();
	if (!newsock)
		goto out;

	newsock->type = osock->type;
	newsock->ops = osock->ops;

	__module_get(newsock->ops->owner);

	newfd = get_unused_fd_flags(0);
	if (unlikely(newfd < 0)) {
		err = newfd;
		goto out_fd_fail;
	}

	newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
	if (unlikely(IS_ERR(newfile))) {
		err = PTR_ERR(newfile);
		goto out_sock_alloc_fail;
	}

	newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL,
			 &kcm_proto, true);
	if (!newsk) {
		err = -ENOMEM;
		goto out_sk_alloc_fail;
	}

	sock_init_data(newsock, newsk);
	init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux);

	fd_install(newfd, newfile);
	*newsockp = newsock;
	info->fd = newfd;

	return 0;

out_sk_alloc_fail:
	/* newfile owns newsock at this point; fput() releases the socket
	 * through the file's release path, so do not fall through to
	 * sock_release() below, which would release it a second time.
	 */
	fput(newfile);
	put_unused_fd(newfd);
	return err;

out_sock_alloc_fail:
	put_unused_fd(newfd);
out_fd_fail:
	sock_release(newsock);
out:
	return err;
}
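
From userspace, cloning yields a second KCM socket bound to the same
multiplexor, e.g. to spread message processing across threads. A sketch
(the helper name is illustrative):

	#include <sys/ioctl.h>
	#include <linux/kcm.h>

	/* Returns a new fd sharing kcm_fd's multiplexor, or -1 on error. */
	static int kcm_do_clone(int kcm_fd)
	{
		struct kcm_clone clone = { 0 };

		if (ioctl(kcm_fd, SIOCKCMCLONE, &clone) < 0)
			return -1;

		return clone.fd;
	}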

static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	int err;

	switch (cmd) {
	case SIOCKCMATTACH: {
		struct kcm_attach info;

		if (copy_from_user(&info, (void __user *)arg, sizeof(info))) {
			err = -EFAULT;
			break;
		}

		err = kcm_attach_ioctl(sock, &info);

		break;
	}
	case SIOCKCMUNATTACH: {
		struct kcm_unattach info;

		if (copy_from_user(&info, (void __user *)arg, sizeof(info))) {
			err = -EFAULT;
			break;
		}

		err = kcm_unattach_ioctl(sock, &info);

		break;
	}
	case SIOCKCMCLONE: {
		struct kcm_clone info;
		struct socket *newsock = NULL;

		if (copy_from_user(&info, (void __user *)arg, sizeof(info))) {
			err = -EFAULT;
			break;
		}

		err = kcm_clone(sock, &info, &newsock);

		if (!err && copy_to_user((void __user *)arg, &info,
					 sizeof(info))) {
			/* The new fd is already installed and its file owns
			 * newsock, so the socket cannot safely be released
			 * here; the descriptor is simply left in place.
			 */
			err = -EFAULT;
		}

		break;
	}
	default:
		err = -ENOIOCTLCMD;
		break;
	}

	return err;
}

static void free_mux(struct rcu_head *rcu)
{
	struct kcm_mux *mux = container_of(rcu,
					   struct kcm_mux, rcu);

	kmem_cache_free(kcm_muxp, mux);
}

static void release_mux(struct kcm_mux *mux)
{
	struct kcm_net *knet = mux->knet;
	struct kcm_psock *psock, *tmp_psock;

	/* Release psocks */
	list_for_each_entry_safe(psock, tmp_psock,
				 &mux->psocks, psock_list) {
		if (!WARN_ON(psock->unattaching))
			kcm_unattach(psock);
	}

	if (WARN_ON(mux->psocks_cnt))
		return;

	__skb_queue_purge(&mux->rx_hold_queue);

	mutex_lock(&knet->mutex);
	list_del_rcu(&mux->kcm_mux_list);
	knet->count--;
	mutex_unlock(&knet->mutex);

	call_rcu(&mux->rcu, free_mux);
}
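
The mux is unlinked with list_del_rcu() and freed through call_rcu() so
that lockless readers of knet->mux_list stay safe. Any such reader would
follow the usual RCU list pattern; a hypothetical walker (not part of
this patch) would look like:

	/* Count the muxes in this namespace without taking knet->mutex. */
	static unsigned int kcm_count_muxes(struct kcm_net *knet)
	{
		struct kcm_mux *mux;
		unsigned int n = 0;

		rcu_read_lock();
		list_for_each_entry_rcu(mux, &knet->mux_list, kcm_mux_list)
			n++;
		rcu_read_unlock();

		return n;
	}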

static void kcm_done(struct kcm_sock *kcm)
{
	struct kcm_mux *mux = kcm->mux;
	struct sock *sk = &kcm->sk;
	int socks_cnt;

	spin_lock_bh(&mux->rx_lock);
	if (kcm->rx_psock) {
		/* Cleanup in unreserve_rx_kcm */
		WARN_ON(kcm->done);
		kcm->rx_disabled = 1;
		kcm->done = 1;
		spin_unlock_bh(&mux->rx_lock);
		return;
	}

	if (kcm->rx_wait) {
		list_del(&kcm->wait_rx_list);
		kcm->rx_wait = false;
	}
	/* Move any pending receive messages to other kcm sockets */
	requeue_rx_msgs(mux, &sk->sk_receive_queue);

	spin_unlock_bh(&mux->rx_lock);

	if (WARN_ON(sk_rmem_alloc_get(sk)))
		return;

	/* Detach from MUX */
	spin_lock_bh(&mux->lock);

	list_del(&kcm->kcm_sock_list);
	mux->kcm_socks_cnt--;
	socks_cnt = mux->kcm_socks_cnt;

	spin_unlock_bh(&mux->lock);

	if (!socks_cnt) {
		/* We are done with the mux now. */
		release_mux(mux);
	}

	WARN_ON(kcm->rx_wait);

	sock_put(&kcm->sk);
}

/* Close a KCM socket. If this is the last KCM socket on the MUX,
 * destroy the MUX.
 */
static int kcm_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct kcm_sock *kcm;
	struct kcm_mux *mux;
	struct kcm_psock *psock;

	if (!sk)
		return 0;

	kcm = kcm_sk(sk);
	mux = kcm->mux;

	sock_orphan(sk);
	kfree_skb(kcm->seq_skb);

	lock_sock(sk);
	/* Purge queue under lock to avoid race condition with tx_work
	 * trying to act when queue is nonempty. If tx_work runs after this
	 * point it will just return.
	 */
	__skb_queue_purge(&sk->sk_write_queue);
	release_sock(sk);

	spin_lock_bh(&mux->lock);
	if (kcm->tx_wait) {
		/* Take off the tx_wait list; after this point there should
		 * be no way that a psock will be assigned to this kcm.
		 */
		list_del(&kcm->wait_psock_list);
		kcm->tx_wait = false;
	}
	spin_unlock_bh(&mux->lock);

	/* Cancel work. After this point there should be no outside
	 * references to the kcm socket.
	 */
	cancel_work_sync(&kcm->tx_work);

	lock_sock(sk);
	psock = kcm->tx_psock;
	if (psock) {
		/* A psock was reserved, so we need to kill it since it
		 * may already have some bytes queued from a message. We
		 * need to do this after removing kcm from the tx_wait list.
		 */
		kcm_abort_tx_psock(psock, EPIPE, false);
		unreserve_psock(kcm);
	}
	release_sock(sk);

	WARN_ON(kcm->tx_wait);
	WARN_ON(kcm->tx_psock);

	sock->sk = NULL;

	kcm_done(kcm);

	return 0;
}

static const struct proto_ops kcm_ops = {
	.family =	PF_KCM,
	.owner =	THIS_MODULE,
	.release =	kcm_release,
	.bind =		sock_no_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	sock_no_getname,
	.poll =		datagram_poll,
	.ioctl =	kcm_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	kcm_setsockopt,
	.getsockopt =	kcm_getsockopt,
	.sendmsg =	kcm_sendmsg,
	.recvmsg =	kcm_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
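
With these ops wired up, userspace gets datagram-like, message-at-a-time
semantics over the attached TCP streams: each send() queues one complete
framed message and each recv() returns one. A sketch, assuming an
already-attached kcm_fd and an application framing convention:

	#include <string.h>
	#include <sys/socket.h>

	/* Send one message, then receive one whole message as delimited
	 * by the attached BPF framing program.
	 */
	static ssize_t kcm_echo_once(int kcm_fd, const char *msg,
				     char *buf, size_t buflen)
	{
		if (send(kcm_fd, msg, strlen(msg), 0) < 0)
			return -1;

		return recv(kcm_fd, buf, buflen, 0);
	}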

/* Create a new KCM socket (the net_proto_family create operation) */
static int kcm_create(struct net *net, struct socket *sock,
		      int protocol, int kern)
{
	struct kcm_net *knet = net_generic(net, kcm_net_id);
	struct sock *sk;
	struct kcm_mux *mux;

	switch (sock->type) {
	case SOCK_DGRAM:
	case SOCK_SEQPACKET:
		sock->ops = &kcm_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	if (protocol != KCMPROTO_CONNECTED)
		return -EPROTONOSUPPORT;

	sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern);
	if (!sk)
		return -ENOMEM;

	/* Allocate a new mux; KCM sockets cloned from this one share it */
	mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL);
	if (!mux) {
		sk_free(sk);
		return -ENOMEM;
	}

	spin_lock_init(&mux->lock);
	spin_lock_init(&mux->rx_lock);
	INIT_LIST_HEAD(&mux->kcm_socks);
	INIT_LIST_HEAD(&mux->kcm_rx_waiters);
	INIT_LIST_HEAD(&mux->kcm_tx_waiters);

	INIT_LIST_HEAD(&mux->psocks);
	INIT_LIST_HEAD(&mux->psocks_ready);
	INIT_LIST_HEAD(&mux->psocks_avail);

	mux->knet = knet;

	/* Add new MUX to list */
	mutex_lock(&knet->mutex);
	list_add_rcu(&mux->kcm_mux_list, &knet->mux_list);
	knet->count++;
	mutex_unlock(&knet->mutex);

	skb_queue_head_init(&mux->rx_hold_queue);

	/* Init KCM socket */
	sock_init_data(sock, sk);
	init_kcm_sock(kcm_sk(sk), mux);

	return 0;
}
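
The corresponding userspace entry point is an ordinary socket() call;
per the checks above, the type must be SOCK_DGRAM or SOCK_SEQPACKET and
the protocol must be KCMPROTO_CONNECTED:

	#include <sys/socket.h>
	#include <linux/kcm.h>

	/* Open a KCM socket; each socket created this way gets its own
	 * multiplexor (use SIOCKCMCLONE to share one).
	 */
	static int kcm_open(void)
	{
		return socket(AF_KCM, SOCK_DGRAM, KCMPROTO_CONNECTED);
	}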

static struct net_proto_family kcm_family_ops = {
	.family = PF_KCM,
	.create = kcm_create,
	.owner  = THIS_MODULE,
};

static __net_init int kcm_init_net(struct net *net)
{
	struct kcm_net *knet = net_generic(net, kcm_net_id);

	INIT_LIST_HEAD_RCU(&knet->mux_list);
	mutex_init(&knet->mutex);

	return 0;
}

static __net_exit void kcm_exit_net(struct net *net)
{
	struct kcm_net *knet = net_generic(net, kcm_net_id);

	/* All KCM sockets should be closed at this point, which should mean
	 * that all multiplexors and psocks have been destroyed.
	 */
	WARN_ON(!list_empty(&knet->mux_list));
}

static struct pernet_operations kcm_net_ops = {
	.init = kcm_init_net,
	.exit = kcm_exit_net,
	.id   = &kcm_net_id,
	.size = sizeof(struct kcm_net),
};

static int __init kcm_init(void)
{
	int err = -ENOMEM;

	kcm_muxp = kmem_cache_create("kcm_mux_cache",
				     sizeof(struct kcm_mux), 0,
				     SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
	if (!kcm_muxp)
		goto fail;

	kcm_psockp = kmem_cache_create("kcm_psock_cache",
				       sizeof(struct kcm_psock), 0,
				       SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
	if (!kcm_psockp)
		goto fail;

	kcm_wq = create_singlethread_workqueue("kkcmd");
	if (!kcm_wq)
		goto fail;

	err = proto_register(&kcm_proto, 1);
	if (err)
		goto fail;

	err = sock_register(&kcm_family_ops);
	if (err)
		goto sock_register_fail;

	err = register_pernet_device(&kcm_net_ops);
	if (err)
		goto net_ops_fail;

	return 0;

net_ops_fail:
	sock_unregister(PF_KCM);

sock_register_fail:
	proto_unregister(&kcm_proto);

fail:
	kmem_cache_destroy(kcm_muxp);
	kmem_cache_destroy(kcm_psockp);

	if (kcm_wq)
		destroy_workqueue(kcm_wq);

	return err;
}

static void __exit kcm_exit(void)
{
	unregister_pernet_device(&kcm_net_ops);
	sock_unregister(PF_KCM);
	proto_unregister(&kcm_proto);
	destroy_workqueue(kcm_wq);

	kmem_cache_destroy(kcm_muxp);
	kmem_cache_destroy(kcm_psockp);
}

module_init(kcm_init);
module_exit(kcm_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_KCM);