path: root/net/kcm/kcmsock.c
author    Tom Herbert <tom@herbertland.com>    2016-03-07 17:11:06 -0500
committer David S. Miller <davem@davemloft.net>    2016-03-09 16:36:14 -0500
commit    ab7ac4eb9832e32a09f4e8042705484d2fb0aad3 (patch)
tree      386057aeaceeaa0e0365814f544989e42c484a09 /net/kcm/kcmsock.c
parent    473bd239b808a8af5241ce9996a16d283d88ddff (diff)
kcm: Kernel Connection Multiplexor module
This module implements the Kernel Connection Multiplexor.

Kernel Connection Multiplexor (KCM) is a facility that provides a message-based
interface over TCP for generic application protocols. With KCM an application can
efficiently send and receive application protocol messages over TCP using datagram
sockets.

For more information see the included Documentation/networking/kcm.txt.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
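As a rough illustration of the usage the commit message describes (a sketch only, assuming the UAPI definitions from include/uapi/linux/kcm.h introduced with this series, an AF_KCM address family, an already-connected TCP socket tcp_fd, and a loaded BPF_PROG_TYPE_SOCKET_FILTER program bpf_prog_fd that returns the length of each message), a userspace program might attach the TCP socket and send one framed message like this:

#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>
#include <linux/kcm.h>

/* Hypothetical helper: attach a connected TCP socket plus a framing BPF
 * program to a fresh KCM socket and send a single message over it.
 */
static ssize_t kcm_send_one(int tcp_fd, int bpf_prog_fd,
			    const void *msg, size_t len)
{
	struct kcm_attach attach = {
		.fd = tcp_fd,		/* connected TCP socket (assumed) */
		.bpf_fd = bpf_prog_fd,	/* message-length BPF program (assumed) */
	};
	int kcm_fd;
	ssize_t n;

	kcm_fd = socket(AF_KCM, SOCK_DGRAM, KCMPROTO_CONNECTED);
	if (kcm_fd < 0)
		return -1;

	if (ioctl(kcm_fd, SIOCKCMATTACH, &attach) < 0) {
		close(kcm_fd);
		return -1;
	}

	/* One datagram send corresponds to one application-protocol
	 * message on the underlying TCP stream.
	 */
	n = send(kcm_fd, msg, len, 0);
	close(kcm_fd);
	return n;
}

Receiving works symmetrically: each recv()/recvmsg() on the KCM socket returns one parsed message, with the framing done in the kernel by the attached BPF program.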
Diffstat (limited to 'net/kcm/kcmsock.c')
-rw-r--r--  net/kcm/kcmsock.c  2016
1 file changed, 2016 insertions(+), 0 deletions(-)
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
new file mode 100644
index 000000000000..30ef69ac6b81
--- /dev/null
+++ b/net/kcm/kcmsock.c
@@ -0,0 +1,2016 @@
1#include <linux/bpf.h>
2#include <linux/errno.h>
3#include <linux/errqueue.h>
4#include <linux/file.h>
5#include <linux/in.h>
6#include <linux/kernel.h>
7#include <linux/module.h>
8#include <linux/net.h>
9#include <linux/netdevice.h>
10#include <linux/poll.h>
11#include <linux/rculist.h>
12#include <linux/skbuff.h>
13#include <linux/socket.h>
14#include <linux/uaccess.h>
15#include <linux/workqueue.h>
16#include <net/kcm.h>
17#include <net/netns/generic.h>
18#include <net/sock.h>
19#include <net/tcp.h>
20#include <uapi/linux/kcm.h>
21
22unsigned int kcm_net_id;
23
24static struct kmem_cache *kcm_psockp __read_mostly;
25static struct kmem_cache *kcm_muxp __read_mostly;
26static struct workqueue_struct *kcm_wq;
27
28static inline struct kcm_sock *kcm_sk(const struct sock *sk)
29{
30 return (struct kcm_sock *)sk;
31}
32
33static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb)
34{
35 return (struct kcm_tx_msg *)skb->cb;
36}
37
38static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb)
39{
40 return (struct kcm_rx_msg *)((void *)skb->cb +
41 offsetof(struct qdisc_skb_cb, data));
42}
43
44static void report_csk_error(struct sock *csk, int err)
45{
46 csk->sk_err = EPIPE;
47 csk->sk_error_report(csk);
48}
49
50/* Callback lock held */
51static void kcm_abort_rx_psock(struct kcm_psock *psock, int err,
52 struct sk_buff *skb)
53{
54 struct sock *csk = psock->sk;
55
56 /* Unrecoverable error in receive */
57
58 if (psock->rx_stopped)
59 return;
60
61 psock->rx_stopped = 1;
62
63 /* Report an error on the lower socket */
64 report_csk_error(csk, err);
65}
66
67static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
68 bool wakeup_kcm)
69{
70 struct sock *csk = psock->sk;
71 struct kcm_mux *mux = psock->mux;
72
73 /* Unrecoverable error in transmit */
74
75 spin_lock_bh(&mux->lock);
76
77 if (psock->tx_stopped) {
78 spin_unlock_bh(&mux->lock);
79 return;
80 }
81
82 psock->tx_stopped = 1;
83
84 if (!psock->tx_kcm) {
85 /* Take off psocks_avail list */
86 list_del(&psock->psock_avail_list);
87 } else if (wakeup_kcm) {
88 /* In this case psock is being aborted while outside of
89 * write_msgs and psock is reserved. Schedule tx_work
90 * to handle the failure there. Need to commit tx_stopped
91 * before queuing work.
92 */
93 smp_mb();
94
95 queue_work(kcm_wq, &psock->tx_kcm->tx_work);
96 }
97
98 spin_unlock_bh(&mux->lock);
99
100 /* Report error on lower socket */
101 report_csk_error(csk, err);
102}
103
104static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
105
106/* KCM is ready to receive messages on its queue -- either the KCM is new or
107 * has become unblocked after being blocked on a full socket buffer. Queue any
108 * pending ready messages on a psock. RX mux lock held.
109 */
110static void kcm_rcv_ready(struct kcm_sock *kcm)
111{
112 struct kcm_mux *mux = kcm->mux;
113 struct kcm_psock *psock;
114 struct sk_buff *skb;
115
116 if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled))
117 return;
118
119 while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) {
120 if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
121 /* Assuming buffer limit has been reached */
122 skb_queue_head(&mux->rx_hold_queue, skb);
123 WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
124 return;
125 }
126 }
127
128 while (!list_empty(&mux->psocks_ready)) {
129 psock = list_first_entry(&mux->psocks_ready, struct kcm_psock,
130 psock_ready_list);
131
132 if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) {
133 /* Assuming buffer limit has been reached */
134 WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
135 return;
136 }
137
138 /* Consumed the ready message on the psock. Schedule rx_work to
139 * get more messages.
140 */
141 list_del(&psock->psock_ready_list);
142 psock->ready_rx_msg = NULL;
143
144 /* Commit clearing of ready_rx_msg for queuing work */
145 smp_mb();
146
147 queue_work(kcm_wq, &psock->rx_work);
148 }
149
150 /* Buffer limit is okay now, add to ready list */
151 list_add_tail(&kcm->wait_rx_list,
152 &kcm->mux->kcm_rx_waiters);
153 kcm->rx_wait = true;
154}
155
156static void kcm_rfree(struct sk_buff *skb)
157{
158 struct sock *sk = skb->sk;
159 struct kcm_sock *kcm = kcm_sk(sk);
160 struct kcm_mux *mux = kcm->mux;
161 unsigned int len = skb->truesize;
162
163 sk_mem_uncharge(sk, len);
164 atomic_sub(len, &sk->sk_rmem_alloc);
165
166 /* For reading rx_wait and rx_psock without holding lock */
167 smp_mb__after_atomic();
168
169 if (!kcm->rx_wait && !kcm->rx_psock &&
170 sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) {
171 spin_lock_bh(&mux->rx_lock);
172 kcm_rcv_ready(kcm);
173 spin_unlock_bh(&mux->rx_lock);
174 }
175}
176
177static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
178{
179 struct sk_buff_head *list = &sk->sk_receive_queue;
180
181 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
182 return -ENOMEM;
183
184 if (!sk_rmem_schedule(sk, skb, skb->truesize))
185 return -ENOBUFS;
186
187 skb->dev = NULL;
188
189 skb_orphan(skb);
190 skb->sk = sk;
191 skb->destructor = kcm_rfree;
192 atomic_add(skb->truesize, &sk->sk_rmem_alloc);
193 sk_mem_charge(sk, skb->truesize);
194
195 skb_queue_tail(list, skb);
196
197 if (!sock_flag(sk, SOCK_DEAD))
198 sk->sk_data_ready(sk);
199
200 return 0;
201}
202
203/* Requeue received messages for a kcm socket to other kcm sockets. This is
204 * called when a kcm socket is receive disabled.
205 * RX mux lock held.
206 */
207static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head)
208{
209 struct sk_buff *skb;
210 struct kcm_sock *kcm;
211
212 while ((skb = __skb_dequeue(head))) {
213 /* Reset destructor to avoid calling kcm_rcv_ready */
214 skb->destructor = sock_rfree;
215 skb_orphan(skb);
216try_again:
217 if (list_empty(&mux->kcm_rx_waiters)) {
218 skb_queue_tail(&mux->rx_hold_queue, skb);
219 continue;
220 }
221
222 kcm = list_first_entry(&mux->kcm_rx_waiters,
223 struct kcm_sock, wait_rx_list);
224
225 if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
226 /* Should mean socket buffer full */
227 list_del(&kcm->wait_rx_list);
228 kcm->rx_wait = false;
229
230 /* Commit rx_wait to read in kcm_free */
231 smp_wmb();
232
233 goto try_again;
234 }
235 }
236}
237
238/* Lower sock lock held */
239static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
240 struct sk_buff *head)
241{
242 struct kcm_mux *mux = psock->mux;
243 struct kcm_sock *kcm;
244
245 WARN_ON(psock->ready_rx_msg);
246
247 if (psock->rx_kcm)
248 return psock->rx_kcm;
249
250 spin_lock_bh(&mux->rx_lock);
251
252 if (psock->rx_kcm) {
253 spin_unlock_bh(&mux->rx_lock);
254 return psock->rx_kcm;
255 }
256
257 if (list_empty(&mux->kcm_rx_waiters)) {
258 psock->ready_rx_msg = head;
259 list_add_tail(&psock->psock_ready_list,
260 &mux->psocks_ready);
261 spin_unlock_bh(&mux->rx_lock);
262 return NULL;
263 }
264
265 kcm = list_first_entry(&mux->kcm_rx_waiters,
266 struct kcm_sock, wait_rx_list);
267 list_del(&kcm->wait_rx_list);
268 kcm->rx_wait = false;
269
270 psock->rx_kcm = kcm;
271 kcm->rx_psock = psock;
272
273 spin_unlock_bh(&mux->rx_lock);
274
275 return kcm;
276}
277
278static void kcm_done(struct kcm_sock *kcm);
279
280static void kcm_done_work(struct work_struct *w)
281{
282 kcm_done(container_of(w, struct kcm_sock, done_work));
283}
284
285/* Lower sock held */
286static void unreserve_rx_kcm(struct kcm_psock *psock,
287 bool rcv_ready)
288{
289 struct kcm_sock *kcm = psock->rx_kcm;
290 struct kcm_mux *mux = psock->mux;
291
292 if (!kcm)
293 return;
294
295 spin_lock_bh(&mux->rx_lock);
296
297 psock->rx_kcm = NULL;
298 kcm->rx_psock = NULL;
299
300 /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with
301 * kcm_rfree
302 */
303 smp_mb();
304
305 if (unlikely(kcm->done)) {
306 spin_unlock_bh(&mux->rx_lock);
307
308 /* Need to run kcm_done in a task since we need to acquire
309 * callback locks which may already be held here.
310 */
311 INIT_WORK(&kcm->done_work, kcm_done_work);
312 schedule_work(&kcm->done_work);
313 return;
314 }
315
316 if (unlikely(kcm->rx_disabled)) {
317 requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
318 } else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) {
319 /* Check for degenerate race with rx_wait that all
320 * data was dequeued (accounted for in kcm_rfree).
321 */
322 kcm_rcv_ready(kcm);
323 }
324 spin_unlock_bh(&mux->rx_lock);
325}
326
327/* Macro to invoke filter function. */
328#define KCM_RUN_FILTER(prog, ctx) \
329 (*prog->bpf_func)(ctx, prog->insnsi)
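/* Illustration (not part of this file): the program run by KCM_RUN_FILTER is a
 * BPF_PROG_TYPE_SOCKET_FILTER program (see kcm_attach_ioctl below) whose return
 * value kcm_tcp_recv treats as the total length of the message starting at the
 * head skb, with 0 meaning "need more header bytes". For a hypothetical protocol
 * whose first two bytes carry the payload length in network byte order, a filter
 * written against a samples/bpf-style load_half() helper might look like:
 *
 *	int kcm_parse_msg(struct __sk_buff *skb)
 *	{
 *		// total message length = 2-byte header + payload
 *		return load_half(skb, 0) + 2;
 *	}
 */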
330
331/* Lower socket lock held */
332static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
333 unsigned int orig_offset, size_t orig_len)
334{
335 struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data;
336 struct kcm_rx_msg *rxm;
337 struct kcm_sock *kcm;
338 struct sk_buff *head, *skb;
339 size_t eaten = 0, cand_len;
340 ssize_t extra;
341 int err;
342 bool cloned_orig = false;
343
344 if (psock->ready_rx_msg)
345 return 0;
346
347 head = psock->rx_skb_head;
348 if (head) {
349 /* Message already in progress */
350
351 if (unlikely(orig_offset)) {
352 /* Getting data with a non-zero offset when a message is
353 * in progress is not expected. If it does happen, we
354 * need to clone and pull since we can't deal with
355 * offsets in the skbs for a message except in the head.
356 */
357 orig_skb = skb_clone(orig_skb, GFP_ATOMIC);
358 if (!orig_skb) {
359 desc->error = -ENOMEM;
360 return 0;
361 }
362 if (!pskb_pull(orig_skb, orig_offset)) {
363 kfree_skb(orig_skb);
364 desc->error = -ENOMEM;
365 return 0;
366 }
367 cloned_orig = true;
368 orig_offset = 0;
369 }
370
371 if (!psock->rx_skb_nextp) {
372 /* We are going to append to the frags_list of head.
373 * Need to unshare the frag_list.
374 */
375 err = skb_unclone(head, GFP_ATOMIC);
376 if (err) {
377 desc->error = err;
378 return 0;
379 }
380
381 if (unlikely(skb_shinfo(head)->frag_list)) {
382 /* We can't append to an sk_buff that already
383 * has a frag_list. We create a new head, point
384 * the frag_list of that to the old head, and
385 * then are able to use the old head->next for
386 * appending to the message.
387 */
388 if (WARN_ON(head->next)) {
389 desc->error = -EINVAL;
390 return 0;
391 }
392
393 skb = alloc_skb(0, GFP_ATOMIC);
394 if (!skb) {
395 desc->error = -ENOMEM;
396 return 0;
397 }
398 skb->len = head->len;
399 skb->data_len = head->len;
400 skb->truesize = head->truesize;
401 *kcm_rx_msg(skb) = *kcm_rx_msg(head);
402 psock->rx_skb_nextp = &head->next;
403 skb_shinfo(skb)->frag_list = head;
404 psock->rx_skb_head = skb;
405 head = skb;
406 } else {
407 psock->rx_skb_nextp =
408 &skb_shinfo(head)->frag_list;
409 }
410 }
411 }
412
413 while (eaten < orig_len) {
414 /* Always clone since we will consume something */
415 skb = skb_clone(orig_skb, GFP_ATOMIC);
416 if (!skb) {
417 desc->error = -ENOMEM;
418 break;
419 }
420
421 cand_len = orig_len - eaten;
422
423 head = psock->rx_skb_head;
424 if (!head) {
425 head = skb;
426 psock->rx_skb_head = head;
427 /* Will set rx_skb_nextp on next packet if needed */
428 psock->rx_skb_nextp = NULL;
429 rxm = kcm_rx_msg(head);
430 memset(rxm, 0, sizeof(*rxm));
431 rxm->offset = orig_offset + eaten;
432 } else {
433 /* Unclone since we may be appending to an skb that we
434 * already share a frag_list with.
435 */
436 err = skb_unclone(skb, GFP_ATOMIC);
437 if (err) {
438 desc->error = err;
439 break;
440 }
441
442 rxm = kcm_rx_msg(head);
443 *psock->rx_skb_nextp = skb;
444 psock->rx_skb_nextp = &skb->next;
445 head->data_len += skb->len;
446 head->len += skb->len;
447 head->truesize += skb->truesize;
448 }
449
450 if (!rxm->full_len) {
451 ssize_t len;
452
453 len = KCM_RUN_FILTER(psock->bpf_prog, head);
454
455 if (!len) {
456 /* Need more header to determine length */
457 rxm->accum_len += cand_len;
458 eaten += cand_len;
459 WARN_ON(eaten != orig_len);
460 break;
461 } else if (len <= (ssize_t)head->len -
462 skb->len - rxm->offset) {
463 /* Length must be into new skb (and also
464 * greater than zero)
465 */
466 desc->error = -EPROTO;
467 psock->rx_skb_head = NULL;
468 kcm_abort_rx_psock(psock, EPROTO, head);
469 break;
470 }
471
472 rxm->full_len = len;
473 }
474
475 extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len;
476
477 if (extra < 0) {
478 /* Message not complete yet. */
479 rxm->accum_len += cand_len;
480 eaten += cand_len;
481 WARN_ON(eaten != orig_len);
482 break;
483 }
484
485 /* Positive extra indicates more bytes than needed for the
486 * message
487 */
488
489 WARN_ON(extra > cand_len);
490
491 eaten += (cand_len - extra);
492
493 /* Hurray, we have a new message! */
494 psock->rx_skb_head = NULL;
495
496try_queue:
497 kcm = reserve_rx_kcm(psock, head);
498 if (!kcm) {
499 /* Unable to reserve a KCM, message is held in psock. */
500 break;
501 }
502
503 if (kcm_queue_rcv_skb(&kcm->sk, head)) {
504 /* Should mean socket buffer full */
505 unreserve_rx_kcm(psock, false);
506 goto try_queue;
507 }
508 }
509
510 if (cloned_orig)
511 kfree_skb(orig_skb);
512
513 return eaten;
514}
515
516/* Called with lock held on lower socket */
517static int psock_tcp_read_sock(struct kcm_psock *psock)
518{
519 read_descriptor_t desc;
520
521 desc.arg.data = psock;
522 desc.error = 0;
523 desc.count = 1; /* give more than one skb per call */
524
525 /* sk should be locked here, so okay to do tcp_read_sock */
526 tcp_read_sock(psock->sk, &desc, kcm_tcp_recv);
527
528 unreserve_rx_kcm(psock, true);
529
530 return desc.error;
531}
532
533/* Lower sock lock held */
534static void psock_tcp_data_ready(struct sock *sk)
535{
536 struct kcm_psock *psock;
537
538 read_lock_bh(&sk->sk_callback_lock);
539
540 psock = (struct kcm_psock *)sk->sk_user_data;
541 if (unlikely(!psock || psock->rx_stopped))
542 goto out;
543
544 if (psock->ready_rx_msg)
545 goto out;
546
547 if (psock_tcp_read_sock(psock) == -ENOMEM)
548 queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
549
550out:
551 read_unlock_bh(&sk->sk_callback_lock);
552}
553
554static void do_psock_rx_work(struct kcm_psock *psock)
555{
556 read_descriptor_t rd_desc;
557 struct sock *csk = psock->sk;
558
559 /* We need the read lock to synchronize with psock_tcp_data_ready. We
560 * need the socket lock for calling tcp_read_sock.
561 */
562 lock_sock(csk);
563 read_lock_bh(&csk->sk_callback_lock);
564
565 if (unlikely(csk->sk_user_data != psock))
566 goto out;
567
568 if (unlikely(psock->rx_stopped))
569 goto out;
570
571 if (psock->ready_rx_msg)
572 goto out;
573
574 rd_desc.arg.data = psock;
575
576 if (psock_tcp_read_sock(psock) == -ENOMEM)
577 queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
578
579out:
580 read_unlock_bh(&csk->sk_callback_lock);
581 release_sock(csk);
582}
583
584static void psock_rx_work(struct work_struct *w)
585{
586 do_psock_rx_work(container_of(w, struct kcm_psock, rx_work));
587}
588
589static void psock_rx_delayed_work(struct work_struct *w)
590{
591 do_psock_rx_work(container_of(w, struct kcm_psock,
592 rx_delayed_work.work));
593}
594
595static void psock_tcp_state_change(struct sock *sk)
596{
597 /* TCP only does a POLLIN for a half close. Do a POLLHUP here
598 * since application will normally not poll with POLLIN
599 * on the TCP sockets.
600 */
601
602 report_csk_error(sk, EPIPE);
603}
604
605static void psock_tcp_write_space(struct sock *sk)
606{
607 struct kcm_psock *psock;
608 struct kcm_mux *mux;
609 struct kcm_sock *kcm;
610
611 read_lock_bh(&sk->sk_callback_lock);
612
613 psock = (struct kcm_psock *)sk->sk_user_data;
614 if (unlikely(!psock))
615 goto out;
616
617 mux = psock->mux;
618
619 spin_lock_bh(&mux->lock);
620
621 /* Check if the socket is reserved; if so, a KCM socket is waiting to send. */
622 kcm = psock->tx_kcm;
623 if (kcm)
624 queue_work(kcm_wq, &kcm->tx_work);
625
626 spin_unlock_bh(&mux->lock);
627out:
628 read_unlock_bh(&sk->sk_callback_lock);
629}
630
631static void unreserve_psock(struct kcm_sock *kcm);
632
633/* kcm sock is locked. */
634static struct kcm_psock *reserve_psock(struct kcm_sock *kcm)
635{
636 struct kcm_mux *mux = kcm->mux;
637 struct kcm_psock *psock;
638
639 psock = kcm->tx_psock;
640
641 smp_rmb(); /* Must read tx_psock before tx_wait */
642
643 if (psock) {
644 WARN_ON(kcm->tx_wait);
645 if (unlikely(psock->tx_stopped))
646 unreserve_psock(kcm);
647 else
648 return kcm->tx_psock;
649 }
650
651 spin_lock_bh(&mux->lock);
652
653 /* Check again under lock to see if a psock was reserved for this
654 * kcm via unreserve_psock/psock_now_avail.
655 */
656 psock = kcm->tx_psock;
657 if (unlikely(psock)) {
658 WARN_ON(kcm->tx_wait);
659 spin_unlock_bh(&mux->lock);
660 return kcm->tx_psock;
661 }
662
663 if (!list_empty(&mux->psocks_avail)) {
664 psock = list_first_entry(&mux->psocks_avail,
665 struct kcm_psock,
666 psock_avail_list);
667 list_del(&psock->psock_avail_list);
668 if (kcm->tx_wait) {
669 list_del(&kcm->wait_psock_list);
670 kcm->tx_wait = false;
671 }
672 kcm->tx_psock = psock;
673 psock->tx_kcm = kcm;
674 } else if (!kcm->tx_wait) {
675 list_add_tail(&kcm->wait_psock_list,
676 &mux->kcm_tx_waiters);
677 kcm->tx_wait = true;
678 }
679
680 spin_unlock_bh(&mux->lock);
681
682 return psock;
683}
684
685/* mux lock held */
686static void psock_now_avail(struct kcm_psock *psock)
687{
688 struct kcm_mux *mux = psock->mux;
689 struct kcm_sock *kcm;
690
691 if (list_empty(&mux->kcm_tx_waiters)) {
692 list_add_tail(&psock->psock_avail_list,
693 &mux->psocks_avail);
694 } else {
695 kcm = list_first_entry(&mux->kcm_tx_waiters,
696 struct kcm_sock,
697 wait_psock_list);
698 list_del(&kcm->wait_psock_list);
699 kcm->tx_wait = false;
700 psock->tx_kcm = kcm;
701
702 /* Commit before changing tx_psock since that is read in
703 * reserve_psock before queuing work.
704 */
705 smp_mb();
706
707 kcm->tx_psock = psock;
708 queue_work(kcm_wq, &kcm->tx_work);
709 }
710}
711
712/* kcm sock is locked. */
713static void unreserve_psock(struct kcm_sock *kcm)
714{
715 struct kcm_psock *psock;
716 struct kcm_mux *mux = kcm->mux;
717
718 spin_lock_bh(&mux->lock);
719
720 psock = kcm->tx_psock;
721
722 if (WARN_ON(!psock)) {
723 spin_unlock_bh(&mux->lock);
724 return;
725 }
726
727 smp_rmb(); /* Read tx_psock before tx_wait */
728
729 WARN_ON(kcm->tx_wait);
730
731 kcm->tx_psock = NULL;
732 psock->tx_kcm = NULL;
733
734 if (unlikely(psock->tx_stopped)) {
735 if (psock->done) {
736 /* Deferred free */
737 list_del(&psock->psock_list);
738 mux->psocks_cnt--;
739 sock_put(psock->sk);
740 fput(psock->sk->sk_socket->file);
741 kmem_cache_free(kcm_psockp, psock);
742 }
743
744 /* Don't put back on available list */
745
746 spin_unlock_bh(&mux->lock);
747
748 return;
749 }
750
751 psock_now_avail(psock);
752
753 spin_unlock_bh(&mux->lock);
754}
755
756/* Write any messages ready on the kcm socket. Called with kcm sock lock
757 * held. Return bytes actually sent or error.
758 */
759static int kcm_write_msgs(struct kcm_sock *kcm)
760{
761 struct sock *sk = &kcm->sk;
762 struct kcm_psock *psock;
763 struct sk_buff *skb, *head;
764 struct kcm_tx_msg *txm;
765 unsigned short fragidx, frag_offset;
766 unsigned int sent, total_sent = 0;
767 int ret = 0;
768
769 kcm->tx_wait_more = false;
770 psock = kcm->tx_psock;
771 if (unlikely(psock && psock->tx_stopped)) {
772 /* A reserved psock was aborted asynchronously. Unreserve
773 * it and we'll retry the message.
774 */
775 unreserve_psock(kcm);
776 if (skb_queue_empty(&sk->sk_write_queue))
777 return 0;
778
779 kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0;
780
781 } else if (skb_queue_empty(&sk->sk_write_queue)) {
782 return 0;
783 }
784
785 head = skb_peek(&sk->sk_write_queue);
786 txm = kcm_tx_msg(head);
787
788 if (txm->sent) {
789 /* Send of first skbuff in queue already in progress */
790 if (WARN_ON(!psock)) {
791 ret = -EINVAL;
792 goto out;
793 }
794 sent = txm->sent;
795 frag_offset = txm->frag_offset;
796 fragidx = txm->fragidx;
797 skb = txm->frag_skb;
798
799 goto do_frag;
800 }
801
802try_again:
803 psock = reserve_psock(kcm);
804 if (!psock)
805 goto out;
806
807 do {
808 skb = head;
809 txm = kcm_tx_msg(head);
810 sent = 0;
811
812do_frag_list:
813 if (WARN_ON(!skb_shinfo(skb)->nr_frags)) {
814 ret = -EINVAL;
815 goto out;
816 }
817
818 for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags;
819 fragidx++) {
820 skb_frag_t *frag;
821
822 frag_offset = 0;
823do_frag:
824 frag = &skb_shinfo(skb)->frags[fragidx];
825 if (WARN_ON(!frag->size)) {
826 ret = -EINVAL;
827 goto out;
828 }
829
830 ret = kernel_sendpage(psock->sk->sk_socket,
831 frag->page.p,
832 frag->page_offset + frag_offset,
833 frag->size - frag_offset,
834 MSG_DONTWAIT);
835 if (ret <= 0) {
836 if (ret == -EAGAIN) {
837 /* Save state to try again when there's
838 * write space on the socket
839 */
840 txm->sent = sent;
841 txm->frag_offset = frag_offset;
842 txm->fragidx = fragidx;
843 txm->frag_skb = skb;
844
845 ret = 0;
846 goto out;
847 }
848
849 /* Hard failure in sending message, abort this
850 * psock since it has lost framing
851 * synchronization and retry sending the
852 * message from the beginning.
853 */
854 kcm_abort_tx_psock(psock, ret ? -ret : EPIPE,
855 true);
856 unreserve_psock(kcm);
857
858 txm->sent = 0;
859 ret = 0;
860
861 goto try_again;
862 }
863
864 sent += ret;
865 frag_offset += ret;
866 if (frag_offset < frag->size) {
867 /* Not finished with this frag */
868 goto do_frag;
869 }
870 }
871
872 if (skb == head) {
873 if (skb_has_frag_list(skb)) {
874 skb = skb_shinfo(skb)->frag_list;
875 goto do_frag_list;
876 }
877 } else if (skb->next) {
878 skb = skb->next;
879 goto do_frag_list;
880 }
881
882 /* Successfully sent the whole packet, account for it. */
883 skb_dequeue(&sk->sk_write_queue);
884 kfree_skb(head);
885 sk->sk_wmem_queued -= sent;
886 total_sent += sent;
887 } while ((head = skb_peek(&sk->sk_write_queue)));
888out:
889 if (!head) {
890 /* Done with all queued messages. */
891 WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
892 unreserve_psock(kcm);
893 }
894
895 /* Check if write space is available */
896 sk->sk_write_space(sk);
897
898 return total_sent ? : ret;
899}
900
901static void kcm_tx_work(struct work_struct *w)
902{
903 struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work);
904 struct sock *sk = &kcm->sk;
905 int err;
906
907 lock_sock(sk);
908
909 /* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx
910 * aborts
911 */
912 err = kcm_write_msgs(kcm);
913 if (err < 0) {
914 /* Hard failure in write, report error on KCM socket */
915 pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err);
916 report_csk_error(&kcm->sk, -err);
917 goto out;
918 }
919
920 /* Primarily for SOCK_SEQPACKET sockets */
921 if (likely(sk->sk_socket) &&
922 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
923 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
924 sk->sk_write_space(sk);
925 }
926
927out:
928 release_sock(sk);
929}
930
931static void kcm_push(struct kcm_sock *kcm)
932{
933 if (kcm->tx_wait_more)
934 kcm_write_msgs(kcm);
935}
936
937static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
938{
939 struct sock *sk = sock->sk;
940 struct kcm_sock *kcm = kcm_sk(sk);
941 struct sk_buff *skb = NULL, *head = NULL;
942 size_t copy, copied = 0;
943 long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
944 int eor = (sock->type == SOCK_DGRAM) ?
945 !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR);
946 int err = -EPIPE;
947
948 lock_sock(sk);
949
950 /* Per tcp_sendmsg this should be in poll */
951 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
952
953 if (sk->sk_err)
954 goto out_error;
955
956 if (kcm->seq_skb) {
957 /* Previously opened message */
958 head = kcm->seq_skb;
959 skb = kcm_tx_msg(head)->last_skb;
960 goto start;
961 }
962
963 /* Call the sk_stream functions to manage the sndbuf mem. */
964 if (!sk_stream_memory_free(sk)) {
965 kcm_push(kcm);
966 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
967 err = sk_stream_wait_memory(sk, &timeo);
968 if (err)
969 goto out_error;
970 }
971
972 /* New message, alloc head skb */
973 head = alloc_skb(0, sk->sk_allocation);
974 while (!head) {
975 kcm_push(kcm);
976 err = sk_stream_wait_memory(sk, &timeo);
977 if (err)
978 goto out_error;
979
980 head = alloc_skb(0, sk->sk_allocation);
981 }
982
983 skb = head;
984
985 /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
986 * csum_and_copy_from_iter from skb_do_copy_data_nocache.
987 */
988 skb->ip_summed = CHECKSUM_UNNECESSARY;
989
990start:
991 while (msg_data_left(msg)) {
992 bool merge = true;
993 int i = skb_shinfo(skb)->nr_frags;
994 struct page_frag *pfrag = sk_page_frag(sk);
995
996 if (!sk_page_frag_refill(sk, pfrag))
997 goto wait_for_memory;
998
999 if (!skb_can_coalesce(skb, i, pfrag->page,
1000 pfrag->offset)) {
1001 if (i == MAX_SKB_FRAGS) {
1002 struct sk_buff *tskb;
1003
1004 tskb = alloc_skb(0, sk->sk_allocation);
1005 if (!tskb)
1006 goto wait_for_memory;
1007
1008 if (head == skb)
1009 skb_shinfo(head)->frag_list = tskb;
1010 else
1011 skb->next = tskb;
1012
1013 skb = tskb;
1014 skb->ip_summed = CHECKSUM_UNNECESSARY;
1015 continue;
1016 }
1017 merge = false;
1018 }
1019
1020 copy = min_t(int, msg_data_left(msg),
1021 pfrag->size - pfrag->offset);
1022
1023 if (!sk_wmem_schedule(sk, copy))
1024 goto wait_for_memory;
1025
1026 err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
1027 pfrag->page,
1028 pfrag->offset,
1029 copy);
1030 if (err)
1031 goto out_error;
1032
1033 /* Update the skb. */
1034 if (merge) {
1035 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1036 } else {
1037 skb_fill_page_desc(skb, i, pfrag->page,
1038 pfrag->offset, copy);
1039 get_page(pfrag->page);
1040 }
1041
1042 pfrag->offset += copy;
1043 copied += copy;
1044 if (head != skb) {
1045 head->len += copy;
1046 head->data_len += copy;
1047 }
1048
1049 continue;
1050
1051wait_for_memory:
1052 kcm_push(kcm);
1053 err = sk_stream_wait_memory(sk, &timeo);
1054 if (err)
1055 goto out_error;
1056 }
1057
1058 if (eor) {
1059 bool not_busy = skb_queue_empty(&sk->sk_write_queue);
1060
1061 /* Message complete, queue it on send buffer */
1062 __skb_queue_tail(&sk->sk_write_queue, head);
1063 kcm->seq_skb = NULL;
1064
1065 if (msg->msg_flags & MSG_BATCH) {
1066 kcm->tx_wait_more = true;
1067 } else if (kcm->tx_wait_more || not_busy) {
1068 err = kcm_write_msgs(kcm);
1069 if (err < 0) {
1070 /* We got a hard error in write_msgs but have
1071 * already queued this message. Report an error
1072 * in the socket, but don't affect return value
1073 * from sendmsg
1074 */
1075 pr_warn("KCM: Hard failure on kcm_write_msgs\n");
1076 report_csk_error(&kcm->sk, -err);
1077 }
1078 }
1079 } else {
1080 /* Message not complete, save state */
1081partial_message:
1082 kcm->seq_skb = head;
1083 kcm_tx_msg(head)->last_skb = skb;
1084 }
1085
1086 release_sock(sk);
1087 return copied;
1088
1089out_error:
1090 kcm_push(kcm);
1091
1092 if (copied && sock->type == SOCK_SEQPACKET) {
1093 /* Wrote some bytes before encountering an
1094 * error, return partial success.
1095 */
1096 goto partial_message;
1097 }
1098
1099 if (head != kcm->seq_skb)
1100 kfree_skb(head);
1101
1102 err = sk_stream_error(sk, msg->msg_flags, err);
1103
1104 /* make sure we wake any epoll edge trigger waiter */
1105 if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
1106 sk->sk_write_space(sk);
1107
1108 release_sock(sk);
1109 return err;
1110}
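/* Illustration (not part of this file): a hypothetical userspace view of the
 * flags handled in kcm_sendmsg above. On a SOCK_DGRAM KCM socket, MSG_MORE keeps
 * a message open across calls and a send without it completes the message;
 * MSG_BATCH queues the completed message but holds off transmission until a later
 * send that does not set MSG_BATCH (tx_wait_more above). For example:
 *
 *	// build one message from two buffers
 *	send(kcm_fd, hdr, hdr_len, MSG_MORE);
 *	send(kcm_fd, payload, payload_len, 0);
 *
 *	// batch two small messages, flushing on the second send
 *	send(kcm_fd, msg_a, len_a, MSG_BATCH);
 *	send(kcm_fd, msg_b, len_b, 0);
 *
 * On SOCK_SEQPACKET sockets the end of a message is marked with MSG_EOR instead.
 */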
1111
1112static struct sk_buff *kcm_wait_data(struct sock *sk, int flags,
1113 long timeo, int *err)
1114{
1115 struct sk_buff *skb;
1116
1117 while (!(skb = skb_peek(&sk->sk_receive_queue))) {
1118 if (sk->sk_err) {
1119 *err = sock_error(sk);
1120 return NULL;
1121 }
1122
1123 if (sock_flag(sk, SOCK_DONE))
1124 return NULL;
1125
1126 if ((flags & MSG_DONTWAIT) || !timeo) {
1127 *err = -EAGAIN;
1128 return NULL;
1129 }
1130
1131 sk_wait_data(sk, &timeo, NULL);
1132
1133 /* Handle signals */
1134 if (signal_pending(current)) {
1135 *err = sock_intr_errno(timeo);
1136 return NULL;
1137 }
1138 }
1139
1140 return skb;
1141}
1142
1143static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
1144 size_t len, int flags)
1145{
1146 struct sock *sk = sock->sk;
1147 int err = 0;
1148 long timeo;
1149 struct kcm_rx_msg *rxm;
1150 int copied = 0;
1151 struct sk_buff *skb;
1152
1153 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
1154
1155 lock_sock(sk);
1156
1157 skb = kcm_wait_data(sk, flags, timeo, &err);
1158 if (!skb)
1159 goto out;
1160
1161 /* Okay, have a message on the receive queue */
1162
1163 rxm = kcm_rx_msg(skb);
1164
1165 if (len > rxm->full_len)
1166 len = rxm->full_len;
1167
1168 err = skb_copy_datagram_msg(skb, rxm->offset, msg, len);
1169 if (err < 0)
1170 goto out;
1171
1172 copied = len;
1173 if (likely(!(flags & MSG_PEEK))) {
1174 if (copied < rxm->full_len) {
1175 if (sock->type == SOCK_DGRAM) {
1176 /* Truncated message */
1177 msg->msg_flags |= MSG_TRUNC;
1178 goto msg_finished;
1179 }
1180 rxm->offset += copied;
1181 rxm->full_len -= copied;
1182 } else {
1183msg_finished:
1184 /* Finished with message */
1185 msg->msg_flags |= MSG_EOR;
1186 skb_unlink(skb, &sk->sk_receive_queue);
1187 kfree_skb(skb);
1188 }
1189 }
1190
1191out:
1192 release_sock(sk);
1193
1194 return copied ? : err;
1195}
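/* Illustration (not part of this file): on the receive side each recvmsg()
 * returns data from a single parsed message. A hypothetical SOCK_SEQPACKET
 * reader can consume one large message in pieces until MSG_EOR is set, while on
 * SOCK_DGRAM a short read truncates the message and sets MSG_TRUNC:
 *
 *	char buf[1024];
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	struct msghdr mh = { .msg_iov = &iov, .msg_iovlen = 1 };
 *	ssize_t n;
 *
 *	do {
 *		n = recvmsg(kcm_fd, &mh, 0);
 *		// process the n bytes in buf; the next call continues
 *		// with the remainder of the same message
 *	} while (n > 0 && !(mh.msg_flags & MSG_EOR));
 */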
1196
1197/* kcm sock lock held */
1198static void kcm_recv_disable(struct kcm_sock *kcm)
1199{
1200 struct kcm_mux *mux = kcm->mux;
1201
1202 if (kcm->rx_disabled)
1203 return;
1204
1205 spin_lock_bh(&mux->rx_lock);
1206
1207 kcm->rx_disabled = 1;
1208
1209 /* If a psock is reserved we'll do cleanup in unreserve */
1210 if (!kcm->rx_psock) {
1211 if (kcm->rx_wait) {
1212 list_del(&kcm->wait_rx_list);
1213 kcm->rx_wait = false;
1214 }
1215
1216 requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
1217 }
1218
1219 spin_unlock_bh(&mux->rx_lock);
1220}
1221
1222/* kcm sock lock held */
1223static void kcm_recv_enable(struct kcm_sock *kcm)
1224{
1225 struct kcm_mux *mux = kcm->mux;
1226
1227 if (!kcm->rx_disabled)
1228 return;
1229
1230 spin_lock_bh(&mux->rx_lock);
1231
1232 kcm->rx_disabled = 0;
1233 kcm_rcv_ready(kcm);
1234
1235 spin_unlock_bh(&mux->rx_lock);
1236}
1237
1238static int kcm_setsockopt(struct socket *sock, int level, int optname,
1239 char __user *optval, unsigned int optlen)
1240{
1241 struct kcm_sock *kcm = kcm_sk(sock->sk);
1242 int val, valbool;
1243 int err = 0;
1244
1245 if (level != SOL_KCM)
1246 return -ENOPROTOOPT;
1247
1248 if (optlen < sizeof(int))
1249 return -EINVAL;
1250
1251 if (get_user(val, (int __user *)optval))
1252 return -EINVAL;
1253
1254 valbool = val ? 1 : 0;
1255
1256 switch (optname) {
1257 case KCM_RECV_DISABLE:
1258 lock_sock(&kcm->sk);
1259 if (valbool)
1260 kcm_recv_disable(kcm);
1261 else
1262 kcm_recv_enable(kcm);
1263 release_sock(&kcm->sk);
1264 break;
1265 default:
1266 err = -ENOPROTOOPT;
1267 }
1268
1269 return err;
1270}
1271
1272static int kcm_getsockopt(struct socket *sock, int level, int optname,
1273 char __user *optval, int __user *optlen)
1274{
1275 struct kcm_sock *kcm = kcm_sk(sock->sk);
1276 int val, len;
1277
1278 if (level != SOL_KCM)
1279 return -ENOPROTOOPT;
1280
1281 if (get_user(len, optlen))
1282 return -EFAULT;
1283
1284 len = min_t(unsigned int, len, sizeof(int));
1285 if (len < 0)
1286 return -EINVAL;
1287
1288 switch (optname) {
1289 case KCM_RECV_DISABLE:
1290 val = kcm->rx_disabled;
1291 break;
1292 default:
1293 return -ENOPROTOOPT;
1294 }
1295
1296 if (put_user(len, optlen))
1297 return -EFAULT;
1298 if (copy_to_user(optval, &val, len))
1299 return -EFAULT;
1300 return 0;
1301}
1302
1303static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
1304{
1305 struct kcm_sock *tkcm;
1306 struct list_head *head;
1307 int index = 0;
1308
1309 /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
1310 * we set sk_state, otherwise epoll_wait always returns right away with
1311 * POLLHUP
1312 */
1313 kcm->sk.sk_state = TCP_ESTABLISHED;
1314
1315 /* Add to mux's kcm sockets list */
1316 kcm->mux = mux;
1317 spin_lock_bh(&mux->lock);
1318
1319 head = &mux->kcm_socks;
1320 list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) {
1321 if (tkcm->index != index)
1322 break;
1323 head = &tkcm->kcm_sock_list;
1324 index++;
1325 }
1326
1327 list_add(&kcm->kcm_sock_list, head);
1328 kcm->index = index;
1329
1330 mux->kcm_socks_cnt++;
1331 spin_unlock_bh(&mux->lock);
1332
1333 INIT_WORK(&kcm->tx_work, kcm_tx_work);
1334
1335 spin_lock_bh(&mux->rx_lock);
1336 kcm_rcv_ready(kcm);
1337 spin_unlock_bh(&mux->rx_lock);
1338}
1339
1340static int kcm_attach(struct socket *sock, struct socket *csock,
1341 struct bpf_prog *prog)
1342{
1343 struct kcm_sock *kcm = kcm_sk(sock->sk);
1344 struct kcm_mux *mux = kcm->mux;
1345 struct sock *csk;
1346 struct kcm_psock *psock = NULL, *tpsock;
1347 struct list_head *head;
1348 int index = 0;
1349
1350 if (csock->ops->family != PF_INET &&
1351 csock->ops->family != PF_INET6)
1352 return -EINVAL;
1353
1354 csk = csock->sk;
1355 if (!csk)
1356 return -EINVAL;
1357
1358 /* Only support TCP for now */
1359 if (csk->sk_protocol != IPPROTO_TCP)
1360 return -EINVAL;
1361
1362 psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
1363 if (!psock)
1364 return -ENOMEM;
1365
1366 psock->mux = mux;
1367 psock->sk = csk;
1368 psock->bpf_prog = prog;
1369 INIT_WORK(&psock->rx_work, psock_rx_work);
1370 INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work);
1371
1372 sock_hold(csk);
1373
1374 write_lock_bh(&csk->sk_callback_lock);
1375 psock->save_data_ready = csk->sk_data_ready;
1376 psock->save_write_space = csk->sk_write_space;
1377 psock->save_state_change = csk->sk_state_change;
1378 csk->sk_user_data = psock;
1379 csk->sk_data_ready = psock_tcp_data_ready;
1380 csk->sk_write_space = psock_tcp_write_space;
1381 csk->sk_state_change = psock_tcp_state_change;
1382 write_unlock_bh(&csk->sk_callback_lock);
1383
1384 /* Finished initialization, now add the psock to the MUX. */
1385 spin_lock_bh(&mux->lock);
1386 head = &mux->psocks;
1387 list_for_each_entry(tpsock, &mux->psocks, psock_list) {
1388 if (tpsock->index != index)
1389 break;
1390 head = &tpsock->psock_list;
1391 index++;
1392 }
1393
1394 list_add(&psock->psock_list, head);
1395 psock->index = index;
1396
1397 mux->psocks_cnt++;
1398 psock_now_avail(psock);
1399 spin_unlock_bh(&mux->lock);
1400
1401 /* Schedule RX work in case there are already bytes queued */
1402 queue_work(kcm_wq, &psock->rx_work);
1403
1404 return 0;
1405}
1406
1407static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info)
1408{
1409 struct socket *csock;
1410 struct bpf_prog *prog;
1411 int err;
1412
1413 csock = sockfd_lookup(info->fd, &err);
1414 if (!csock)
1415 return -ENOENT;
1416
1417 prog = bpf_prog_get(info->bpf_fd);
1418 if (IS_ERR(prog)) {
1419 err = PTR_ERR(prog);
1420 goto out;
1421 }
1422
1423 if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
1424 bpf_prog_put(prog);
1425 err = -EINVAL;
1426 goto out;
1427 }
1428
1429 err = kcm_attach(sock, csock, prog);
1430 if (err) {
1431 bpf_prog_put(prog);
1432 goto out;
1433 }
1434
1435 /* Keep reference on file also */
1436
1437 return 0;
1438out:
1439 fput(csock->file);
1440 return err;
1441}
1442
1443static void kcm_unattach(struct kcm_psock *psock)
1444{
1445 struct sock *csk = psock->sk;
1446 struct kcm_mux *mux = psock->mux;
1447
1448 /* Stop getting callbacks from TCP socket. After this there should
1449 * be no way to reserve a kcm for this psock.
1450 */
1451 write_lock_bh(&csk->sk_callback_lock);
1452 csk->sk_user_data = NULL;
1453 csk->sk_data_ready = psock->save_data_ready;
1454 csk->sk_write_space = psock->save_write_space;
1455 csk->sk_state_change = psock->save_state_change;
1456 psock->rx_stopped = 1;
1457
1458 if (WARN_ON(psock->rx_kcm)) {
1459 write_unlock_bh(&csk->sk_callback_lock);
1460 return;
1461 }
1462
1463 spin_lock_bh(&mux->rx_lock);
1464
1465 /* Stop receiver activities. After this point psock should not be
1466 * able to get onto ready list either through callbacks or work.
1467 */
1468 if (psock->ready_rx_msg) {
1469 list_del(&psock->psock_ready_list);
1470 kfree_skb(psock->ready_rx_msg);
1471 psock->ready_rx_msg = NULL;
1472 }
1473
1474 spin_unlock_bh(&mux->rx_lock);
1475
1476 write_unlock_bh(&csk->sk_callback_lock);
1477
1478 cancel_work_sync(&psock->rx_work);
1479 cancel_delayed_work_sync(&psock->rx_delayed_work);
1480
1481 bpf_prog_put(psock->bpf_prog);
1482
1483 kfree_skb(psock->rx_skb_head);
1484 psock->rx_skb_head = NULL;
1485
1486 spin_lock_bh(&mux->lock);
1487
1488 if (psock->tx_kcm) {
1489 /* psock was reserved. Just mark it finished and we will clean
1490 * up in the kcm paths; we need the kcm lock, which cannot be
1491 * acquired here.
1492 */
1493 spin_unlock_bh(&mux->lock);
1494
1495 /* We are unattaching a socket that is reserved. Abort the
1496 * socket since we may be out of sync in sending on it. We need
1497 * to do this without the mux lock.
1498 */
1499 kcm_abort_tx_psock(psock, EPIPE, false);
1500
1501 spin_lock_bh(&mux->lock);
1502 if (!psock->tx_kcm) {
1503 /* psock was unreserved in the window where the mux was unlocked */
1504 goto no_reserved;
1505 }
1506 psock->done = 1;
1507
1508 /* Commit done before queuing work to process it */
1509 smp_mb();
1510
1511 /* Queue tx work to make sure psock->done is handled */
1512 queue_work(kcm_wq, &psock->tx_kcm->tx_work);
1513 spin_unlock_bh(&mux->lock);
1514 } else {
1515no_reserved:
1516 if (!psock->tx_stopped)
1517 list_del(&psock->psock_avail_list);
1518 list_del(&psock->psock_list);
1519 mux->psocks_cnt--;
1520 spin_unlock_bh(&mux->lock);
1521
1522 sock_put(csk);
1523 fput(csk->sk_socket->file);
1524 kmem_cache_free(kcm_psockp, psock);
1525 }
1526}
1527
1528static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info)
1529{
1530 struct kcm_sock *kcm = kcm_sk(sock->sk);
1531 struct kcm_mux *mux = kcm->mux;
1532 struct kcm_psock *psock;
1533 struct socket *csock;
1534 struct sock *csk;
1535 int err;
1536
1537 csock = sockfd_lookup(info->fd, &err);
1538 if (!csock)
1539 return -ENOENT;
1540
1541 csk = csock->sk;
1542 if (!csk) {
1543 err = -EINVAL;
1544 goto out;
1545 }
1546
1547 err = -ENOENT;
1548
1549 spin_lock_bh(&mux->lock);
1550
1551 list_for_each_entry(psock, &mux->psocks, psock_list) {
1552 if (psock->sk != csk)
1553 continue;
1554
1555 /* Found the matching psock */
1556
1557 if (psock->unattaching || WARN_ON(psock->done)) {
1558 err = -EALREADY;
1559 break;
1560 }
1561
1562 psock->unattaching = 1;
1563
1564 spin_unlock_bh(&mux->lock);
1565
1566 kcm_unattach(psock);
1567
1568 err = 0;
1569 goto out;
1570 }
1571
1572 spin_unlock_bh(&mux->lock);
1573
1574out:
1575 fput(csock->file);
1576 return err;
1577}
1578
1579static struct proto kcm_proto = {
1580 .name = "KCM",
1581 .owner = THIS_MODULE,
1582 .obj_size = sizeof(struct kcm_sock),
1583};
1584
1585/* Clone a kcm socket. */
1586static int kcm_clone(struct socket *osock, struct kcm_clone *info,
1587 struct socket **newsockp)
1588{
1589 struct socket *newsock;
1590 struct sock *newsk;
1591 struct file *newfile;
1592 int err, newfd;
1593
1594 err = -ENFILE;
1595 newsock = sock_alloc();
1596 if (!newsock)
1597 goto out;
1598
1599 newsock->type = osock->type;
1600 newsock->ops = osock->ops;
1601
1602 __module_get(newsock->ops->owner);
1603
1604 newfd = get_unused_fd_flags(0);
1605 if (unlikely(newfd < 0)) {
1606 err = newfd;
1607 goto out_fd_fail;
1608 }
1609
1610 newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
1611 if (unlikely(IS_ERR(newfile))) {
1612 err = PTR_ERR(newfile);
1613 goto out_sock_alloc_fail;
1614 }
1615
1616 newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL,
1617 &kcm_proto, true);
1618 if (!newsk) {
1619 err = -ENOMEM;
1620 goto out_sk_alloc_fail;
1621 }
1622
1623 sock_init_data(newsock, newsk);
1624 init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux);
1625
1626 fd_install(newfd, newfile);
1627 *newsockp = newsock;
1628 info->fd = newfd;
1629
1630 return 0;
1631
1632out_sk_alloc_fail:
1633 fput(newfile);
1634out_sock_alloc_fail:
1635 put_unused_fd(newfd);
1636out_fd_fail:
1637 sock_release(newsock);
1638out:
1639 return err;
1640}
1641
1642static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1643{
1644 int err;
1645
1646 switch (cmd) {
1647 case SIOCKCMATTACH: {
1648 struct kcm_attach info;
1649
1650 if (copy_from_user(&info, (void __user *)arg, sizeof(info))) {
1651 err = -EFAULT;
1652 break;
1653 }
1654 err = kcm_attach_ioctl(sock, &info);
1655 break;
1656 }
1657 case SIOCKCMUNATTACH: {
1658 struct kcm_unattach info;
1659
1660 if (copy_from_user(&info, (void __user *)arg, sizeof(info))) {
1661 err = -EFAULT;
1662 break;
1663 }
1664 err = kcm_unattach_ioctl(sock, &info);
1665 break;
1666 }
1667 case SIOCKCMCLONE: {
1668 struct kcm_clone info;
1669 struct socket *newsock = NULL;
1670
1671 if (copy_from_user(&info, (void __user *)arg, sizeof(info))) {
1672 err = -EFAULT;
1673 break;
1674 }
1675 err = kcm_clone(sock, &info, &newsock);
1676 if (!err) {
1677 if (copy_to_user((void __user *)arg, &info,
1678 sizeof(info))) {
1679 err = -EFAULT;
1680 sock_release(newsock);
1681 }
1682 }
1683
1684 break;
1685 }
1686 default:
1687 err = -ENOIOCTLCMD;
1688 break;
1689 }
1690
1691 return err;
1692}
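/* Illustration (not part of this file): hypothetical userspace use of the ioctls
 * dispatched above. SIOCKCMCLONE hands back a new KCM socket on the same mux,
 * and SIOCKCMUNATTACH detaches a previously attached TCP socket:
 *
 *	struct kcm_clone clone_info = { 0 };
 *	struct kcm_unattach unattach_info = { .fd = tcp_fd };
 *
 *	if (ioctl(kcm_fd, SIOCKCMCLONE, &clone_info) == 0)
 *		kcm_fd2 = clone_info.fd;	// second socket, same mux
 *
 *	ioctl(kcm_fd, SIOCKCMUNATTACH, &unattach_info);
 */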
1693
1694static void free_mux(struct rcu_head *rcu)
1695{
1696 struct kcm_mux *mux = container_of(rcu,
1697 struct kcm_mux, rcu);
1698
1699 kmem_cache_free(kcm_muxp, mux);
1700}
1701
1702static void release_mux(struct kcm_mux *mux)
1703{
1704 struct kcm_net *knet = mux->knet;
1705 struct kcm_psock *psock, *tmp_psock;
1706
1707 /* Release psocks */
1708 list_for_each_entry_safe(psock, tmp_psock,
1709 &mux->psocks, psock_list) {
1710 if (!WARN_ON(psock->unattaching))
1711 kcm_unattach(psock);
1712 }
1713
1714 if (WARN_ON(mux->psocks_cnt))
1715 return;
1716
1717 __skb_queue_purge(&mux->rx_hold_queue);
1718
1719 mutex_lock(&knet->mutex);
1720 list_del_rcu(&mux->kcm_mux_list);
1721 knet->count--;
1722 mutex_unlock(&knet->mutex);
1723
1724 call_rcu(&mux->rcu, free_mux);
1725}
1726
1727static void kcm_done(struct kcm_sock *kcm)
1728{
1729 struct kcm_mux *mux = kcm->mux;
1730 struct sock *sk = &kcm->sk;
1731 int socks_cnt;
1732
1733 spin_lock_bh(&mux->rx_lock);
1734 if (kcm->rx_psock) {
1735 /* Cleanup in unreserve_rx_kcm */
1736 WARN_ON(kcm->done);
1737 kcm->rx_disabled = 1;
1738 kcm->done = 1;
1739 spin_unlock_bh(&mux->rx_lock);
1740 return;
1741 }
1742
1743 if (kcm->rx_wait) {
1744 list_del(&kcm->wait_rx_list);
1745 kcm->rx_wait = false;
1746 }
1747 /* Move any pending receive messages to other kcm sockets */
1748 requeue_rx_msgs(mux, &sk->sk_receive_queue);
1749
1750 spin_unlock_bh(&mux->rx_lock);
1751
1752 if (WARN_ON(sk_rmem_alloc_get(sk)))
1753 return;
1754
1755 /* Detach from MUX */
1756 spin_lock_bh(&mux->lock);
1757
1758 list_del(&kcm->kcm_sock_list);
1759 mux->kcm_socks_cnt--;
1760 socks_cnt = mux->kcm_socks_cnt;
1761
1762 spin_unlock_bh(&mux->lock);
1763
1764 if (!socks_cnt) {
1765 /* We are done with the mux now. */
1766 release_mux(mux);
1767 }
1768
1769 WARN_ON(kcm->rx_wait);
1770
1771 sock_put(&kcm->sk);
1772}
1773
1774/* Called when a KCM socket is closed (released).
1775 * If this is the last KCM socket on the MUX, destroy the MUX.
1776 */
1777static int kcm_release(struct socket *sock)
1778{
1779 struct sock *sk = sock->sk;
1780 struct kcm_sock *kcm;
1781 struct kcm_mux *mux;
1782 struct kcm_psock *psock;
1783
1784 if (!sk)
1785 return 0;
1786
1787 kcm = kcm_sk(sk);
1788 mux = kcm->mux;
1789
1790 sock_orphan(sk);
1791 kfree_skb(kcm->seq_skb);
1792
1793 lock_sock(sk);
1794 /* Purge queue under lock to avoid race condition with tx_work trying
1795 * to act when queue is nonempty. If tx_work runs after this point
1796 * it will just return.
1797 */
1798 __skb_queue_purge(&sk->sk_write_queue);
1799 release_sock(sk);
1800
1801 spin_lock_bh(&mux->lock);
1802 if (kcm->tx_wait) {
1803 /* Take off the tx_wait list; after this point there should be no way
1804 * that a psock will be assigned to this kcm.
1805 */
1806 list_del(&kcm->wait_psock_list);
1807 kcm->tx_wait = false;
1808 }
1809 spin_unlock_bh(&mux->lock);
1810
1811 /* Cancel work. After this point there should be no outside references
1812 * to the kcm socket.
1813 */
1814 cancel_work_sync(&kcm->tx_work);
1815
1816 lock_sock(sk);
1817 psock = kcm->tx_psock;
1818 if (psock) {
1819 /* A psock was reserved, so we need to kill it since it
1820 * may already have some bytes queued from a message. We
1821 * need to do this after removing kcm from tx_wait list.
1822 */
1823 kcm_abort_tx_psock(psock, EPIPE, false);
1824 unreserve_psock(kcm);
1825 }
1826 release_sock(sk);
1827
1828 WARN_ON(kcm->tx_wait);
1829 WARN_ON(kcm->tx_psock);
1830
1831 sock->sk = NULL;
1832
1833 kcm_done(kcm);
1834
1835 return 0;
1836}
1837
1838static const struct proto_ops kcm_ops = {
1839 .family = PF_KCM,
1840 .owner = THIS_MODULE,
1841 .release = kcm_release,
1842 .bind = sock_no_bind,
1843 .connect = sock_no_connect,
1844 .socketpair = sock_no_socketpair,
1845 .accept = sock_no_accept,
1846 .getname = sock_no_getname,
1847 .poll = datagram_poll,
1848 .ioctl = kcm_ioctl,
1849 .listen = sock_no_listen,
1850 .shutdown = sock_no_shutdown,
1851 .setsockopt = kcm_setsockopt,
1852 .getsockopt = kcm_getsockopt,
1853 .sendmsg = kcm_sendmsg,
1854 .recvmsg = kcm_recvmsg,
1855 .mmap = sock_no_mmap,
1856 .sendpage = sock_no_sendpage,
1857};
1858
1859/* Socket create operation for kcm sockets */
1860static int kcm_create(struct net *net, struct socket *sock,
1861 int protocol, int kern)
1862{
1863 struct kcm_net *knet = net_generic(net, kcm_net_id);
1864 struct sock *sk;
1865 struct kcm_mux *mux;
1866
1867 switch (sock->type) {
1868 case SOCK_DGRAM:
1869 case SOCK_SEQPACKET:
1870 sock->ops = &kcm_ops;
1871 break;
1872 default:
1873 return -ESOCKTNOSUPPORT;
1874 }
1875
1876 if (protocol != KCMPROTO_CONNECTED)
1877 return -EPROTONOSUPPORT;
1878
1879 sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern);
1880 if (!sk)
1881 return -ENOMEM;
1882
1883 /* Allocate a kcm mux, shared between KCM sockets */
1884 mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL);
1885 if (!mux) {
1886 sk_free(sk);
1887 return -ENOMEM;
1888 }
1889
1890 spin_lock_init(&mux->lock);
1891 spin_lock_init(&mux->rx_lock);
1892 INIT_LIST_HEAD(&mux->kcm_socks);
1893 INIT_LIST_HEAD(&mux->kcm_rx_waiters);
1894 INIT_LIST_HEAD(&mux->kcm_tx_waiters);
1895
1896 INIT_LIST_HEAD(&mux->psocks);
1897 INIT_LIST_HEAD(&mux->psocks_ready);
1898 INIT_LIST_HEAD(&mux->psocks_avail);
1899
1900 mux->knet = knet;
1901
1902 /* Add new MUX to list */
1903 mutex_lock(&knet->mutex);
1904 list_add_rcu(&mux->kcm_mux_list, &knet->mux_list);
1905 knet->count++;
1906 mutex_unlock(&knet->mutex);
1907
1908 skb_queue_head_init(&mux->rx_hold_queue);
1909
1910 /* Init KCM socket */
1911 sock_init_data(sock, sk);
1912 init_kcm_sock(kcm_sk(sk), mux);
1913
1914 return 0;
1915}
1916
1917static struct net_proto_family kcm_family_ops = {
1918 .family = PF_KCM,
1919 .create = kcm_create,
1920 .owner = THIS_MODULE,
1921};
1922
1923static __net_init int kcm_init_net(struct net *net)
1924{
1925 struct kcm_net *knet = net_generic(net, kcm_net_id);
1926
1927 INIT_LIST_HEAD_RCU(&knet->mux_list);
1928 mutex_init(&knet->mutex);
1929
1930 return 0;
1931}
1932
1933static __net_exit void kcm_exit_net(struct net *net)
1934{
1935 struct kcm_net *knet = net_generic(net, kcm_net_id);
1936
1937 /* All KCM sockets should be closed at this point, which should mean
1938 * that all multiplexors and psocks have been destroyed.
1939 */
1940 WARN_ON(!list_empty(&knet->mux_list));
1941}
1942
1943static struct pernet_operations kcm_net_ops = {
1944 .init = kcm_init_net,
1945 .exit = kcm_exit_net,
1946 .id = &kcm_net_id,
1947 .size = sizeof(struct kcm_net),
1948};
1949
1950static int __init kcm_init(void)
1951{
1952 int err = -ENOMEM;
1953
1954 kcm_muxp = kmem_cache_create("kcm_mux_cache",
1955 sizeof(struct kcm_mux), 0,
1956 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
1957 if (!kcm_muxp)
1958 goto fail;
1959
1960 kcm_psockp = kmem_cache_create("kcm_psock_cache",
1961 sizeof(struct kcm_psock), 0,
1962 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
1963 if (!kcm_psockp)
1964 goto fail;
1965
1966 kcm_wq = create_singlethread_workqueue("kkcmd");
1967 if (!kcm_wq)
1968 goto fail;
1969
1970 err = proto_register(&kcm_proto, 1);
1971 if (err)
1972 goto fail;
1973
1974 err = sock_register(&kcm_family_ops);
1975 if (err)
1976 goto sock_register_fail;
1977
1978 err = register_pernet_device(&kcm_net_ops);
1979 if (err)
1980 goto net_ops_fail;
1981
1982 return 0;
1983
1984net_ops_fail:
1985 sock_unregister(PF_KCM);
1986
1987sock_register_fail:
1988 proto_unregister(&kcm_proto);
1989
1990fail:
1991 kmem_cache_destroy(kcm_muxp);
1992 kmem_cache_destroy(kcm_psockp);
1993
1994 if (kcm_wq)
1995 destroy_workqueue(kcm_wq);
1996
1997 return err;
1998}
1999
2000static void __exit kcm_exit(void)
2001{
2002 unregister_pernet_device(&kcm_net_ops);
2003 sock_unregister(PF_KCM);
2004 proto_unregister(&kcm_proto);
2005 destroy_workqueue(kcm_wq);
2006
2007 kmem_cache_destroy(kcm_muxp);
2008 kmem_cache_destroy(kcm_psockp);
2009}
2010
2011module_init(kcm_init);
2012module_exit(kcm_exit);
2013
2014MODULE_LICENSE("GPL");
2015MODULE_ALIAS_NETPROTO(PF_KCM);
2016