author	Andy King <acking@vmware.com>	2013-02-06 09:23:56 -0500
committer	David S. Miller <davem@davemloft.net>	2013-02-10 19:41:08 -0500
commit	d021c344051af91f42c5ba9fdedc176740cbd238 (patch)
tree	8c02cd94a59556da4b74823816e670dd007db72f /net/vmw_vsock/af_vsock.c
parent	fd5023111cf720db890ef34f305ac5d427e690a0 (diff)
VSOCK: Introduce VM Sockets
VM Sockets allows communication between virtual machines and the hypervisor. User-level applications both in a virtual machine and on the host can use the VM Sockets API, which facilitates fast and efficient communication between guest virtual machines and their host. A socket address family, designed to be compatible with UDP and TCP at the interface level, is provided.

Today, VM Sockets is used by various VMware Tools components inside the guest for zero-config, network-less access to VMware host services. In addition, VMware's users are using VM Sockets for various applications where network access of the virtual machine is restricted or non-existent. Examples of this are VMs communicating with device proxies for proprietary hardware running as host applications, and automated testing of applications running within virtual machines.

VM Sockets is similar to other socket interfaces, such as the Berkeley UNIX socket interface. The VM Sockets module supports both connection-oriented stream sockets, like TCP, and connectionless datagram sockets, like UDP. The VM Sockets protocol family is defined as "AF_VSOCK", and the socket operations are split for SOCK_DGRAM and SOCK_STREAM.

For additional information about the use of VM Sockets, please refer to the VM Sockets Programming Guide available at: https://www.vmware.com/support/developer/vmci-sdk/

Signed-off-by: George Zhang <georgezhang@vmware.com>
Signed-off-by: Dmitry Torokhov <dtor@vmware.com>
Signed-off-by: Andy king <acking@vmware.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
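As a sketch of what "compatible with TCP at the interface level" means in practice, here is a minimal user-space stream client for this address family. This is an editor's illustration, not part of the patch: it assumes the <linux/vm_sockets.h> uapi header added elsewhere in this series, the port number is made up, and on libc headers that predate this series AF_VSOCK (40) may need to be defined by hand.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>

#ifndef AF_VSOCK
#define AF_VSOCK 40	/* not yet known to older libc headers */
#endif

int main(void)
{
	struct sockaddr_vm addr;
	char buf[64];
	ssize_t n;
	int fd;

	fd = socket(AF_VSOCK, SOCK_STREAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.svm_family = AF_VSOCK;
	addr.svm_cid = VMADDR_CID_HOST;	/* CID 2: talk to the host */
	addr.svm_port = 1234;		/* example service port */

	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("connect");
		return 1;
	}

	if (send(fd, "ping", 4, 0) < 0)
		perror("send");
	else if ((n = recv(fd, buf, sizeof(buf), 0)) >= 0)
		printf("received %zd bytes\n", n);

	close(fd);
	return 0;
}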
Diffstat (limited to 'net/vmw_vsock/af_vsock.c')
-rw-r--r--	net/vmw_vsock/af_vsock.c	2015
1 file changed, 2015 insertions(+), 0 deletions(-)
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
new file mode 100644
index 000000000000..54bb7bdf92d3
--- /dev/null
+++ b/net/vmw_vsock/af_vsock.c
@@ -0,0 +1,2015 @@
1/*
2 * VMware vSockets Driver
3 *
4 * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation version 2 and no later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 */
15
16/* Implementation notes:
17 *
18 * - There are two kinds of sockets: those created by user action (such as
19 * calling socket(2)) and those created by incoming connection request packets.
20 *
21 * - There are two "global" tables, one for bound sockets (sockets that have
22 * specified an address that they are responsible for) and one for connected
23 * sockets (sockets that have established a connection with another socket).
24 * These tables are "global" in that all sockets on the system are placed
25 * within them. Note, though, that the bound table contains an extra entry
26 * for a list of unbound sockets, and SOCK_DGRAM sockets will always remain in
27 * that list. The bound table is used solely for lookup of sockets when packets
28 * are received and that's not necessary for SOCK_DGRAM sockets since we create
29 * a datagram handle for each and need not perform a lookup. Keeping SOCK_DGRAM
30 * sockets out of the bound hash buckets will reduce the chance of collisions
31 * when looking for SOCK_STREAM sockets and prevent us from having to check the
32 * socket type in the hash table lookups.
33 *
34 * - Sockets created by user action will either be "client" sockets that
35 * initiate a connection or "server" sockets that listen for connections; we do
36 * not support simultaneous connects (two "client" sockets connecting).
37 *
38 * - "Server" sockets are referred to as listener sockets throughout this
39 * implementation because they are in the SS_LISTEN state. When a connection
40 * request is received (the second kind of socket mentioned above), we create a
41 * new socket and refer to it as a pending socket. These pending sockets are
42 * placed on the pending connection list of the listener socket. When future
43 * packets are received for the address the listener socket is bound to, we
44 * check if the source of the packet is from one that has an existing pending
45 * connection. If it does, we process the packet for the pending socket. When
46 * that socket reaches the connected state, it is removed from the listener
47 * socket's pending list and enqueued in the listener socket's accept queue.
48 * Callers of accept(2) will accept connected sockets from the listener socket's
49 * accept queue. If the socket cannot be accepted for some reason then it is
50 * marked rejected. Once the connection is accepted, it is owned by the user
51 * process and the responsibility for cleanup falls with that user process.
52 *
53 * - It is possible that these pending sockets will never reach the connected
54 * state; in fact, we may never receive another packet after the connection
55 * request. Because of this, we must schedule a cleanup function to run in the
56 * future, after some amount of time passes where a connection should have been
57 * established. This function ensures that the socket is off all lists so it
58 * cannot be retrieved, then drops all references to the socket so it is cleaned
59 * up (sock_put() -> sk_free() -> our sk_destruct implementation). Note this
60 * function will also cleanup rejected sockets, those that reach the connected
61 * state but leave it before they have been accepted.
62 *
63 * - Sockets created by user action will be cleaned up when the user process
64 * calls close(2), causing our release implementation to be called. Our release
65 * implementation will perform some cleanup then drop the last reference so our
66 * sk_destruct implementation is invoked. Our sk_destruct implementation will
67 * perform additional cleanup that's common for both types of sockets.
68 *
69 * - A socket's reference count is what ensures that the structure won't be
70 * freed. Each entry in a list (such as the "global" bound and connected tables
71 * and the listener socket's pending list and connected queue) ensures a
72 * reference. When we defer work until process context and pass a socket as our
73 * argument, we must ensure the reference count is increased to ensure the
74 * socket isn't freed before the function is run; the deferred function will
75 * then drop the reference.
76 */
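To make the listener/pending/accept-queue flow described above concrete, a minimal user-space server looks just like its AF_INET counterpart. An editor's sketch, not part of the patch; the port number is made up and error handling is abridged:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>

int main(void)
{
	struct sockaddr_vm addr, peer;
	socklen_t peer_len = sizeof(peer);
	int fd, client;

	fd = socket(AF_VSOCK, SOCK_STREAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.svm_family = AF_VSOCK;
	addr.svm_cid = VMADDR_CID_ANY;	/* only the local CID is allowed */
	addr.svm_port = 1234;		/* example service port */

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 8) < 0) {	/* socket enters SS_LISTEN */
		perror("bind/listen");
		return 1;
	}

	/* accept() dequeues a connected child from the accept queue */
	client = accept(fd, (struct sockaddr *)&peer, &peer_len);
	if (client < 0) {
		perror("accept");
		return 1;
	}
	printf("connection from cid %u port %u\n",
	       peer.svm_cid, peer.svm_port);

	close(client);
	close(fd);
	return 0;
}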
77
78#include <linux/types.h>
79
80#define EXPORT_SYMTAB
81#include <linux/bitops.h>
82#include <linux/cred.h>
83#include <linux/init.h>
84#include <linux/io.h>
85#include <linux/kernel.h>
86#include <linux/kmod.h>
87#include <linux/list.h>
88#include <linux/miscdevice.h>
89#include <linux/module.h>
90#include <linux/mutex.h>
91#include <linux/net.h>
92#include <linux/poll.h>
93#include <linux/skbuff.h>
94#include <linux/smp.h>
95#include <linux/socket.h>
96#include <linux/stddef.h>
97#include <linux/unistd.h>
98#include <linux/wait.h>
99#include <linux/workqueue.h>
100#include <net/sock.h>
101
102#include "af_vsock.h"
103#include "vsock_version.h"
104
105static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
106static void vsock_sk_destruct(struct sock *sk);
107static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
108
109/* Protocol family. */
110static struct proto vsock_proto = {
111 .name = "AF_VSOCK",
112 .owner = THIS_MODULE,
113 .obj_size = sizeof(struct vsock_sock),
114};
115
116/* The default peer timeout indicates how long we will wait for a peer response
117 * to a control message.
118 */
119#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)
120
121#define SS_LISTEN 255
122
123static const struct vsock_transport *transport;
124static DEFINE_MUTEX(vsock_register_mutex);
125
126/**** EXPORTS ****/
127
128/* Get the ID of the local context. This is transport dependent. */
129
130int vm_sockets_get_local_cid(void)
131{
132 return transport->get_local_cid();
133}
134EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid);
135
136/**** UTILS ****/
137
138/* Each bound VSocket is stored in the bind hash table and each connected
139 * VSocket is stored in the connected hash table.
140 *
141 * Unbound sockets are all put on the same list attached to the end of the hash
142 * table (vsock_unbound_sockets). Bound sockets are added to the hash table in
143 * the bucket that their local address hashes to (vsock_bound_sockets(addr)
144 * represents the list that addr hashes to).
145 *
146 * Specifically, we initialize the vsock_bind_table array to a size of
147 * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through
148 * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and
149 * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function
150 * mods with VSOCK_HASH_SIZE - 1 to ensure this.
151 */
152#define VSOCK_HASH_SIZE 251
153#define MAX_PORT_RETRIES 24
154
155#define VSOCK_HASH(addr) ((addr)->svm_port % (VSOCK_HASH_SIZE - 1))
156#define vsock_bound_sockets(addr) (&vsock_bind_table[VSOCK_HASH(addr)])
157#define vsock_unbound_sockets (&vsock_bind_table[VSOCK_HASH_SIZE])
158
159/* XXX This can probably be implemented in a better way. */
160#define VSOCK_CONN_HASH(src, dst) \
161 (((src)->svm_cid ^ (dst)->svm_port) % (VSOCK_HASH_SIZE - 1))
162#define vsock_connected_sockets(src, dst) \
163 (&vsock_connected_table[VSOCK_CONN_HASH(src, dst)])
164#define vsock_connected_sockets_vsk(vsk) \
165 vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr)
166
167static struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1];
168static struct list_head vsock_connected_table[VSOCK_HASH_SIZE];
169static DEFINE_SPINLOCK(vsock_table_lock);
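A toy stand-alone check of the bucket arithmetic above (editor's sketch): the hash maps ports into buckets 0 through VSOCK_HASH_SIZE - 2, so index VSOCK_HASH_SIZE is free to hold the unbound list, which is why the bind table is declared with VSOCK_HASH_SIZE + 1 entries.

#include <stdio.h>

#define VSOCK_HASH_SIZE 251
#define VSOCK_HASH(port) ((port) % (VSOCK_HASH_SIZE - 1))

int main(void)
{
	unsigned int port;

	for (port = 248; port <= 252; port++)
		printf("port %4u -> bucket %u\n", port, VSOCK_HASH(port));
	printf("unbound sockets -> bucket %u\n", VSOCK_HASH_SIZE);
	return 0;
}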
170
171static __init void vsock_init_tables(void)
172{
173 int i;
174
175 for (i = 0; i < ARRAY_SIZE(vsock_bind_table); i++)
176 INIT_LIST_HEAD(&vsock_bind_table[i]);
177
178 for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++)
179 INIT_LIST_HEAD(&vsock_connected_table[i]);
180}
181
182static void __vsock_insert_bound(struct list_head *list,
183 struct vsock_sock *vsk)
184{
185 sock_hold(&vsk->sk);
186 list_add(&vsk->bound_table, list);
187}
188
189static void __vsock_insert_connected(struct list_head *list,
190 struct vsock_sock *vsk)
191{
192 sock_hold(&vsk->sk);
193 list_add(&vsk->connected_table, list);
194}
195
196static void __vsock_remove_bound(struct vsock_sock *vsk)
197{
198 list_del_init(&vsk->bound_table);
199 sock_put(&vsk->sk);
200}
201
202static void __vsock_remove_connected(struct vsock_sock *vsk)
203{
204 list_del_init(&vsk->connected_table);
205 sock_put(&vsk->sk);
206}
207
208static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
209{
210 struct vsock_sock *vsk;
211
212 list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table)
213 if (vsock_addr_equals_addr_any(addr, &vsk->local_addr))
214 return sk_vsock(vsk);
215
216 return NULL;
217}
218
219static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src,
220 struct sockaddr_vm *dst)
221{
222 struct vsock_sock *vsk;
223
224 list_for_each_entry(vsk, vsock_connected_sockets(src, dst),
225 connected_table) {
226 if (vsock_addr_equals_addr(src, &vsk->remote_addr)
227 && vsock_addr_equals_addr(dst, &vsk->local_addr)) {
228 return sk_vsock(vsk);
229 }
230 }
231
232 return NULL;
233}
234
235static bool __vsock_in_bound_table(struct vsock_sock *vsk)
236{
237 return !list_empty(&vsk->bound_table);
238}
239
240static bool __vsock_in_connected_table(struct vsock_sock *vsk)
241{
242 return !list_empty(&vsk->connected_table);
243}
244
245static void vsock_insert_unbound(struct vsock_sock *vsk)
246{
247 spin_lock_bh(&vsock_table_lock);
248 __vsock_insert_bound(vsock_unbound_sockets, vsk);
249 spin_unlock_bh(&vsock_table_lock);
250}
251
252void vsock_insert_connected(struct vsock_sock *vsk)
253{
254 struct list_head *list = vsock_connected_sockets(
255 &vsk->remote_addr, &vsk->local_addr);
256
257 spin_lock_bh(&vsock_table_lock);
258 __vsock_insert_connected(list, vsk);
259 spin_unlock_bh(&vsock_table_lock);
260}
261EXPORT_SYMBOL_GPL(vsock_insert_connected);
262
263void vsock_remove_bound(struct vsock_sock *vsk)
264{
265 spin_lock_bh(&vsock_table_lock);
266 __vsock_remove_bound(vsk);
267 spin_unlock_bh(&vsock_table_lock);
268}
269EXPORT_SYMBOL_GPL(vsock_remove_bound);
270
271void vsock_remove_connected(struct vsock_sock *vsk)
272{
273 spin_lock_bh(&vsock_table_lock);
274 __vsock_remove_connected(vsk);
275 spin_unlock_bh(&vsock_table_lock);
276}
277EXPORT_SYMBOL_GPL(vsock_remove_connected);
278
279struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
280{
281 struct sock *sk;
282
283 spin_lock_bh(&vsock_table_lock);
284 sk = __vsock_find_bound_socket(addr);
285 if (sk)
286 sock_hold(sk);
287
288 spin_unlock_bh(&vsock_table_lock);
289
290 return sk;
291}
292EXPORT_SYMBOL_GPL(vsock_find_bound_socket);
293
294struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
295 struct sockaddr_vm *dst)
296{
297 struct sock *sk;
298
299 spin_lock_bh(&vsock_table_lock);
300 sk = __vsock_find_connected_socket(src, dst);
301 if (sk)
302 sock_hold(sk);
303
304 spin_unlock_bh(&vsock_table_lock);
305
306 return sk;
307}
308EXPORT_SYMBOL_GPL(vsock_find_connected_socket);
309
310static bool vsock_in_bound_table(struct vsock_sock *vsk)
311{
312 bool ret;
313
314 spin_lock_bh(&vsock_table_lock);
315 ret = __vsock_in_bound_table(vsk);
316 spin_unlock_bh(&vsock_table_lock);
317
318 return ret;
319}
320
321static bool vsock_in_connected_table(struct vsock_sock *vsk)
322{
323 bool ret;
324
325 spin_lock_bh(&vsock_table_lock);
326 ret = __vsock_in_connected_table(vsk);
327 spin_unlock_bh(&vsock_table_lock);
328
329 return ret;
330}
331
332void vsock_for_each_connected_socket(void (*fn)(struct sock *sk))
333{
334 int i;
335
336 spin_lock_bh(&vsock_table_lock);
337
338 for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) {
339 struct vsock_sock *vsk;
340 list_for_each_entry(vsk, &vsock_connected_table[i],
341 connected_table)
342 fn(sk_vsock(vsk));
343 }
344
345 spin_unlock_bh(&vsock_table_lock);
346}
347EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket);
348
349void vsock_add_pending(struct sock *listener, struct sock *pending)
350{
351 struct vsock_sock *vlistener;
352 struct vsock_sock *vpending;
353
354 vlistener = vsock_sk(listener);
355 vpending = vsock_sk(pending);
356
357 sock_hold(pending);
358 sock_hold(listener);
359 list_add_tail(&vpending->pending_links, &vlistener->pending_links);
360}
361EXPORT_SYMBOL_GPL(vsock_add_pending);
362
363void vsock_remove_pending(struct sock *listener, struct sock *pending)
364{
365 struct vsock_sock *vpending = vsock_sk(pending);
366
367 list_del_init(&vpending->pending_links);
368 sock_put(listener);
369 sock_put(pending);
370}
371EXPORT_SYMBOL_GPL(vsock_remove_pending);
372
373void vsock_enqueue_accept(struct sock *listener, struct sock *connected)
374{
375 struct vsock_sock *vlistener;
376 struct vsock_sock *vconnected;
377
378 vlistener = vsock_sk(listener);
379 vconnected = vsock_sk(connected);
380
381 sock_hold(connected);
382 sock_hold(listener);
383 list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue);
384}
385EXPORT_SYMBOL_GPL(vsock_enqueue_accept);
386
387static struct sock *vsock_dequeue_accept(struct sock *listener)
388{
389 struct vsock_sock *vlistener;
390 struct vsock_sock *vconnected;
391
392 vlistener = vsock_sk(listener);
393
394 if (list_empty(&vlistener->accept_queue))
395 return NULL;
396
397 vconnected = list_entry(vlistener->accept_queue.next,
398 struct vsock_sock, accept_queue);
399
400 list_del_init(&vconnected->accept_queue);
401 sock_put(listener);
402 /* The caller will need a reference on the connected socket so we let
403 * it call sock_put().
404 */
405
406 return sk_vsock(vconnected);
407}
408
409static bool vsock_is_accept_queue_empty(struct sock *sk)
410{
411 struct vsock_sock *vsk = vsock_sk(sk);
412 return list_empty(&vsk->accept_queue);
413}
414
415static bool vsock_is_pending(struct sock *sk)
416{
417 struct vsock_sock *vsk = vsock_sk(sk);
418 return !list_empty(&vsk->pending_links);
419}
420
421static int vsock_send_shutdown(struct sock *sk, int mode)
422{
423 return transport->shutdown(vsock_sk(sk), mode);
424}
425
426void vsock_pending_work(struct work_struct *work)
427{
428 struct sock *sk;
429 struct sock *listener;
430 struct vsock_sock *vsk;
431 bool cleanup;
432
433 vsk = container_of(work, struct vsock_sock, dwork.work);
434 sk = sk_vsock(vsk);
435 listener = vsk->listener;
436 cleanup = true;
437
438 lock_sock(listener);
439 lock_sock(sk);
440
441 if (vsock_is_pending(sk)) {
442 vsock_remove_pending(listener, sk);
443 } else if (!vsk->rejected) {
444 /* We are not on the pending list and accept() did not reject
445 * us, so we must have been accepted by our user process. We
446 * just need to drop our references to the sockets and be on
447 * our way.
448 */
449 cleanup = false;
450 goto out;
451 }
452
453 listener->sk_ack_backlog--;
454
455 /* We need to remove ourselves from the global connected sockets list so
456 * incoming packets can't find this socket, and to reduce the reference
457 * count.
458 */
459 if (vsock_in_connected_table(vsk))
460 vsock_remove_connected(vsk);
461
462 sk->sk_state = SS_FREE;
463
464out:
465 release_sock(sk);
466 release_sock(listener);
467 if (cleanup)
468 sock_put(sk);
469
470 sock_put(sk);
471 sock_put(listener);
472}
473EXPORT_SYMBOL_GPL(vsock_pending_work);
474
475/**** SOCKET OPERATIONS ****/
476
477static int __vsock_bind_stream(struct vsock_sock *vsk,
478 struct sockaddr_vm *addr)
479{
480 static u32 port = LAST_RESERVED_PORT + 1;
481 struct sockaddr_vm new_addr;
482
483 vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port);
484
485 if (addr->svm_port == VMADDR_PORT_ANY) {
486 bool found = false;
487 unsigned int i;
488
489 for (i = 0; i < MAX_PORT_RETRIES; i++) {
490 if (port <= LAST_RESERVED_PORT)
491 port = LAST_RESERVED_PORT + 1;
492
493 new_addr.svm_port = port++;
494
495 if (!__vsock_find_bound_socket(&new_addr)) {
496 found = true;
497 break;
498 }
499 }
500
501 if (!found)
502 return -EADDRNOTAVAIL;
503 } else {
504 /* If port is in reserved range, ensure caller
505 * has necessary privileges.
506 */
507 if (addr->svm_port <= LAST_RESERVED_PORT &&
508 !capable(CAP_NET_BIND_SERVICE)) {
509 return -EACCES;
510 }
511
512 if (__vsock_find_bound_socket(&new_addr))
513 return -EADDRINUSE;
514 }
515
516 vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port);
517
518 /* Remove stream sockets from the unbound list and add them to the hash
519 * table for easy lookup by its address. The unbound list is simply an
520 * extra entry at the end of the hash table, a trick used by AF_UNIX.
521 */
522 __vsock_remove_bound(vsk);
523 __vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk);
524
525 return 0;
526}
527
528static int __vsock_bind_dgram(struct vsock_sock *vsk,
529 struct sockaddr_vm *addr)
530{
531 return transport->dgram_bind(vsk, addr);
532}
533
534static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
535{
536 struct vsock_sock *vsk = vsock_sk(sk);
537 u32 cid;
538 int retval;
539
540 /* First ensure this socket isn't already bound. */
541 if (vsock_addr_bound(&vsk->local_addr))
542 return -EINVAL;
543
544 /* Now bind to the provided address or select appropriate values if
545 * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that
546 * like AF_INET prevents binding to a non-local IP address (in most
547 * cases), we only allow binding to the local CID.
548 */
549 cid = transport->get_local_cid();
550 if (addr->svm_cid != cid && addr->svm_cid != VMADDR_CID_ANY)
551 return -EADDRNOTAVAIL;
552
553 switch (sk->sk_socket->type) {
554 case SOCK_STREAM:
555 spin_lock_bh(&vsock_table_lock);
556 retval = __vsock_bind_stream(vsk, addr);
557 spin_unlock_bh(&vsock_table_lock);
558 break;
559
560 case SOCK_DGRAM:
561 retval = __vsock_bind_dgram(vsk, addr);
562 break;
563
564 default:
565 retval = -EINVAL;
566 break;
567 }
568
569 return retval;
570}
571
572struct sock *__vsock_create(struct net *net,
573 struct socket *sock,
574 struct sock *parent,
575 gfp_t priority,
576 unsigned short type)
577{
578 struct sock *sk;
579 struct vsock_sock *psk;
580 struct vsock_sock *vsk;
581
582 sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto);
583 if (!sk)
584 return NULL;
585
586 sock_init_data(sock, sk);
587
588 /* sk->sk_type is normally set in sock_init_data, but only if sock is
589 * non-NULL. We make sure that our sockets always have a type by
590 * setting it here if needed.
591 */
592 if (!sock)
593 sk->sk_type = type;
594
595 vsk = vsock_sk(sk);
596 vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
597 vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
598
599 sk->sk_destruct = vsock_sk_destruct;
600 sk->sk_backlog_rcv = vsock_queue_rcv_skb;
601 sk->sk_state = 0;
602 sock_reset_flag(sk, SOCK_DONE);
603
604 INIT_LIST_HEAD(&vsk->bound_table);
605 INIT_LIST_HEAD(&vsk->connected_table);
606 vsk->listener = NULL;
607 INIT_LIST_HEAD(&vsk->pending_links);
608 INIT_LIST_HEAD(&vsk->accept_queue);
609 vsk->rejected = false;
610 vsk->sent_request = false;
611 vsk->ignore_connecting_rst = false;
612 vsk->peer_shutdown = 0;
613
614 psk = parent ? vsock_sk(parent) : NULL;
615 if (parent) {
616 vsk->trusted = psk->trusted;
617 vsk->owner = get_cred(psk->owner);
618 vsk->connect_timeout = psk->connect_timeout;
619 } else {
620 vsk->trusted = capable(CAP_NET_ADMIN);
621 vsk->owner = get_current_cred();
622 vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT;
623 }
624
625 if (transport->init(vsk, psk) < 0) {
626 sk_free(sk);
627 return NULL;
628 }
629
630 if (sock)
631 vsock_insert_unbound(vsk);
632
633 return sk;
634}
635EXPORT_SYMBOL_GPL(__vsock_create);
636
637static void __vsock_release(struct sock *sk)
638{
639 if (sk) {
640 struct sk_buff *skb;
641 struct sock *pending;
642 struct vsock_sock *vsk;
643
644 vsk = vsock_sk(sk);
645 pending = NULL; /* Compiler warning. */
646
647 if (vsock_in_bound_table(vsk))
648 vsock_remove_bound(vsk);
649
650 if (vsock_in_connected_table(vsk))
651 vsock_remove_connected(vsk);
652
653 transport->release(vsk);
654
655 lock_sock(sk);
656 sock_orphan(sk);
657 sk->sk_shutdown = SHUTDOWN_MASK;
658
659 while ((skb = skb_dequeue(&sk->sk_receive_queue)))
660 kfree_skb(skb);
661
662 /* Clean up any sockets that never were accepted. */
663 while ((pending = vsock_dequeue_accept(sk)) != NULL) {
664 __vsock_release(pending);
665 sock_put(pending);
666 }
667
668 release_sock(sk);
669 sock_put(sk);
670 }
671}
672
673static void vsock_sk_destruct(struct sock *sk)
674{
675 struct vsock_sock *vsk = vsock_sk(sk);
676
677 transport->destruct(vsk);
678
679 /* When clearing these addresses, there's no need to set the family and
680 * possibly register the address family with the kernel.
681 */
682 vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
683 vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
684
685 put_cred(vsk->owner);
686}
687
688static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
689{
690 int err;
691
692 err = sock_queue_rcv_skb(sk, skb);
693 if (err)
694 kfree_skb(skb);
695
696 return err;
697}
698
699s64 vsock_stream_has_data(struct vsock_sock *vsk)
700{
701 return transport->stream_has_data(vsk);
702}
703EXPORT_SYMBOL_GPL(vsock_stream_has_data);
704
705s64 vsock_stream_has_space(struct vsock_sock *vsk)
706{
707 return transport->stream_has_space(vsk);
708}
709EXPORT_SYMBOL_GPL(vsock_stream_has_space);
710
711static int vsock_release(struct socket *sock)
712{
713 __vsock_release(sock->sk);
714 sock->sk = NULL;
715 sock->state = SS_FREE;
716
717 return 0;
718}
719
720static int
721vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
722{
723 int err;
724 struct sock *sk;
725 struct sockaddr_vm *vm_addr;
726
727 sk = sock->sk;
728
729 if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0)
730 return -EINVAL;
731
732 lock_sock(sk);
733 err = __vsock_bind(sk, vm_addr);
734 release_sock(sk);
735
736 return err;
737}
738
739static int vsock_getname(struct socket *sock,
740 struct sockaddr *addr, int *addr_len, int peer)
741{
742 int err;
743 struct sock *sk;
744 struct vsock_sock *vsk;
745 struct sockaddr_vm *vm_addr;
746
747 sk = sock->sk;
748 vsk = vsock_sk(sk);
749 err = 0;
750
751 lock_sock(sk);
752
753 if (peer) {
754 if (sock->state != SS_CONNECTED) {
755 err = -ENOTCONN;
756 goto out;
757 }
758 vm_addr = &vsk->remote_addr;
759 } else {
760 vm_addr = &vsk->local_addr;
761 }
762
763 if (!vm_addr) {
764 err = -EINVAL;
765 goto out;
766 }
767
768 /* sys_getsockname() and sys_getpeername() pass us a
769 * MAX_SOCK_ADDR-sized buffer and don't set addr_len. Unfortunately
770 * that macro is defined in socket.c instead of .h, so we hardcode its
771 * value here.
772 */
773 BUILD_BUG_ON(sizeof(*vm_addr) > 128);
774 memcpy(addr, vm_addr, sizeof(*vm_addr));
775 *addr_len = sizeof(*vm_addr);
776
777out:
778 release_sock(sk);
779 return err;
780}
781
782static int vsock_shutdown(struct socket *sock, int mode)
783{
784 int err;
785 struct sock *sk;
786
787 /* User level uses SHUT_RD (0) and SHUT_WR (1), but the kernel uses
788 * RCV_SHUTDOWN (1) and SEND_SHUTDOWN (2), so we must increment mode
789 * here like the other address families do. Note also that the
790 * increment makes SHUT_RDWR (2) into RCV_SHUTDOWN | SEND_SHUTDOWN (3),
791 * which is what we want.
792 */
793 mode++;
794
795 if ((mode & ~SHUTDOWN_MASK) || !mode)
796 return -EINVAL;
797
798 /* If this is a STREAM socket and it is not connected then bail out
799 * immediately. If it is a DGRAM socket then we must first kick the
800 * socket so that it wakes up from any sleeping calls, for example
801 * recv(), and then afterwards return the error.
802 */
803
804 sk = sock->sk;
805 if (sock->state == SS_UNCONNECTED) {
806 err = -ENOTCONN;
807 if (sk->sk_type == SOCK_STREAM)
808 return err;
809 } else {
810 sock->state = SS_DISCONNECTING;
811 err = 0;
812 }
813
814 /* Receive and send shutdowns are treated alike. */
815 mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN);
816 if (mode) {
817 lock_sock(sk);
818 sk->sk_shutdown |= mode;
819 sk->sk_state_change(sk);
820 release_sock(sk);
821
822 if (sk->sk_type == SOCK_STREAM) {
823 sock_reset_flag(sk, SOCK_DONE);
824 vsock_send_shutdown(sk, mode);
825 }
826 }
827
828 return err;
829}
830
831static unsigned int vsock_poll(struct file *file, struct socket *sock,
832 poll_table *wait)
833{
834 struct sock *sk;
835 unsigned int mask;
836 struct vsock_sock *vsk;
837
838 sk = sock->sk;
839 vsk = vsock_sk(sk);
840
841 poll_wait(file, sk_sleep(sk), wait);
842 mask = 0;
843
844 if (sk->sk_err)
845 /* Signify that there has been an error on this socket. */
846 mask |= POLLERR;
847
848 /* INET sockets treat local write shutdown and peer write shutdown as a
849 * case of POLLHUP set.
850 */
851 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
852 ((sk->sk_shutdown & SEND_SHUTDOWN) &&
853 (vsk->peer_shutdown & SEND_SHUTDOWN))) {
854 mask |= POLLHUP;
855 }
856
857 if (sk->sk_shutdown & RCV_SHUTDOWN ||
858 vsk->peer_shutdown & SEND_SHUTDOWN) {
859 mask |= POLLRDHUP;
860 }
861
862 if (sock->type == SOCK_DGRAM) {
863 /* For datagram sockets we can read if there is something in
864 * the queue and write as long as the socket isn't shutdown for
865 * sending.
866 */
867 if (!skb_queue_empty(&sk->sk_receive_queue) ||
868 (sk->sk_shutdown & RCV_SHUTDOWN)) {
869 mask |= POLLIN | POLLRDNORM;
870 }
871
872 if (!(sk->sk_shutdown & SEND_SHUTDOWN))
873 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
874
875 } else if (sock->type == SOCK_STREAM) {
876 lock_sock(sk);
877
878 /* Listening sockets that have connections in their accept
879 * queue can be read.
880 */
881 if (sk->sk_state == SS_LISTEN
882 && !vsock_is_accept_queue_empty(sk))
883 mask |= POLLIN | POLLRDNORM;
884
885 /* If there is something in the queue then we can read. */
886 if (transport->stream_is_active(vsk) &&
887 !(sk->sk_shutdown & RCV_SHUTDOWN)) {
888 bool data_ready_now = false;
889 int ret = transport->notify_poll_in(
890 vsk, 1, &data_ready_now);
891 if (ret < 0) {
892 mask |= POLLERR;
893 } else {
894 if (data_ready_now)
895 mask |= POLLIN | POLLRDNORM;
896
897 }
898 }
899
900 /* Sockets whose connections have been closed, reset, or
901 * terminated should also be considered read, and we check the
902 * shutdown flag for that.
903 */
904 if (sk->sk_shutdown & RCV_SHUTDOWN ||
905 vsk->peer_shutdown & SEND_SHUTDOWN) {
906 mask |= POLLIN | POLLRDNORM;
907 }
908
909 /* Connected sockets that can produce data can be written. */
910 if (sk->sk_state == SS_CONNECTED) {
911 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
912 bool space_avail_now = false;
913 int ret = transport->notify_poll_out(
914 vsk, 1, &space_avail_now);
915 if (ret < 0) {
916 mask |= POLLERR;
917 } else {
918 if (space_avail_now)
919 /* Remove POLLWRBAND since INET
920 * sockets are not setting it.
921 */
922 mask |= POLLOUT | POLLWRNORM;
923
924 }
925 }
926 }
927
928 /* Simulate INET socket poll behaviors, which sets
929 * POLLOUT|POLLWRNORM when peer is closed and nothing to read,
930 * but local send is not shutdown.
931 */
932 if (sk->sk_state == SS_UNCONNECTED) {
933 if (!(sk->sk_shutdown & SEND_SHUTDOWN))
934 mask |= POLLOUT | POLLWRNORM;
935
936 }
937
938 release_sock(sk);
939 }
940
941 return mask;
942}
943
944static int vsock_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
945 struct msghdr *msg, size_t len)
946{
947 int err;
948 struct sock *sk;
949 struct vsock_sock *vsk;
950 struct sockaddr_vm *remote_addr;
951
952 if (msg->msg_flags & MSG_OOB)
953 return -EOPNOTSUPP;
954
955 /* For now, MSG_DONTWAIT is always assumed... */
956 err = 0;
957 sk = sock->sk;
958 vsk = vsock_sk(sk);
959
960 lock_sock(sk);
961
962 if (!vsock_addr_bound(&vsk->local_addr)) {
963 struct sockaddr_vm local_addr;
964
965 vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
966 err = __vsock_bind(sk, &local_addr);
967 if (err != 0)
968 goto out;
969
970 }
971
972 /* If the provided message contains an address, use that. Otherwise
973 * fall back on the socket's remote handle (if it has been connected).
974 */
975 if (msg->msg_name &&
976 vsock_addr_cast(msg->msg_name, msg->msg_namelen,
977 &remote_addr) == 0) {
978 /* Ensure this address is of the right type and is a valid
979 * destination.
980 */
981
982 if (remote_addr->svm_cid == VMADDR_CID_ANY)
983 remote_addr->svm_cid = transport->get_local_cid();
984
985 if (!vsock_addr_bound(remote_addr)) {
986 err = -EINVAL;
987 goto out;
988 }
989 } else if (sock->state == SS_CONNECTED) {
990 remote_addr = &vsk->remote_addr;
991
992 if (remote_addr->svm_cid == VMADDR_CID_ANY)
993 remote_addr->svm_cid = transport->get_local_cid();
994
995 /* XXX Should connect() or this function ensure remote_addr is
996 * bound?
997 */
998 if (!vsock_addr_bound(&vsk->remote_addr)) {
999 err = -EINVAL;
1000 goto out;
1001 }
1002 } else {
1003 err = -EINVAL;
1004 goto out;
1005 }
1006
1007 if (!transport->dgram_allow(remote_addr->svm_cid,
1008 remote_addr->svm_port)) {
1009 err = -EINVAL;
1010 goto out;
1011 }
1012
1013 err = transport->dgram_enqueue(vsk, remote_addr, msg->msg_iov, len);
1014
1015out:
1016 release_sock(sk);
1017 return err;
1018}
1019
1020static int vsock_dgram_connect(struct socket *sock,
1021 struct sockaddr *addr, int addr_len, int flags)
1022{
1023 int err;
1024 struct sock *sk;
1025 struct vsock_sock *vsk;
1026 struct sockaddr_vm *remote_addr;
1027
1028 sk = sock->sk;
1029 vsk = vsock_sk(sk);
1030
1031 err = vsock_addr_cast(addr, addr_len, &remote_addr);
1032 if (err == -EAFNOSUPPORT && remote_addr->svm_family == AF_UNSPEC) {
1033 lock_sock(sk);
1034 vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY,
1035 VMADDR_PORT_ANY);
1036 sock->state = SS_UNCONNECTED;
1037 release_sock(sk);
1038 return 0;
1039 } else if (err != 0)
1040 return -EINVAL;
1041
1042 lock_sock(sk);
1043
1044 if (!vsock_addr_bound(&vsk->local_addr)) {
1045 struct sockaddr_vm local_addr;
1046
1047 vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
1048 err = __vsock_bind(sk, &local_addr);
1049 if (err != 0)
1050 goto out;
1051
1052 }
1053
1054 if (!transport->dgram_allow(remote_addr->svm_cid,
1055 remote_addr->svm_port)) {
1056 err = -EINVAL;
1057 goto out;
1058 }
1059
1060 memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
1061 sock->state = SS_CONNECTED;
1062
1063out:
1064 release_sock(sk);
1065 return err;
1066}
1067
1068static int vsock_dgram_recvmsg(struct kiocb *kiocb, struct socket *sock,
1069 struct msghdr *msg, size_t len, int flags)
1070{
1071 return transport->dgram_dequeue(kiocb, vsock_sk(sock->sk), msg, len,
1072 flags);
1073}
1074
1075static const struct proto_ops vsock_dgram_ops = {
1076 .family = PF_VSOCK,
1077 .owner = THIS_MODULE,
1078 .release = vsock_release,
1079 .bind = vsock_bind,
1080 .connect = vsock_dgram_connect,
1081 .socketpair = sock_no_socketpair,
1082 .accept = sock_no_accept,
1083 .getname = vsock_getname,
1084 .poll = vsock_poll,
1085 .ioctl = sock_no_ioctl,
1086 .listen = sock_no_listen,
1087 .shutdown = vsock_shutdown,
1088 .setsockopt = sock_no_setsockopt,
1089 .getsockopt = sock_no_getsockopt,
1090 .sendmsg = vsock_dgram_sendmsg,
1091 .recvmsg = vsock_dgram_recvmsg,
1092 .mmap = sock_no_mmap,
1093 .sendpage = sock_no_sendpage,
1094};
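Put together, the datagram half of this ops table gives a familiar UDP-style flow. A user-space sketch (editor's illustration, not part of the patch; the destination CID and port are made up), which relies on the autobind in vsock_dgram_sendmsg() so the sending side needs no explicit bind():

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>

int main(void)
{
	struct sockaddr_vm dst;
	int fd;

	fd = socket(AF_VSOCK, SOCK_DGRAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&dst, 0, sizeof(dst));
	dst.svm_family = AF_VSOCK;
	dst.svm_cid = VMADDR_CID_HOST;	/* example destination */
	dst.svm_port = 5678;		/* example port */

	/* the socket is bound to a local address on first send */
	if (sendto(fd, "hello", 5, 0,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto");

	close(fd);
	return 0;
}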
1095
1096static void vsock_connect_timeout(struct work_struct *work)
1097{
1098 struct sock *sk;
1099 struct vsock_sock *vsk;
1100
1101 vsk = container_of(work, struct vsock_sock, dwork.work);
1102 sk = sk_vsock(vsk);
1103
1104 lock_sock(sk);
1105 if (sk->sk_state == SS_CONNECTING &&
1106 (sk->sk_shutdown != SHUTDOWN_MASK)) {
1107 sk->sk_state = SS_UNCONNECTED;
1108 sk->sk_err = ETIMEDOUT;
1109 sk->sk_error_report(sk);
1110 }
1111 release_sock(sk);
1112
1113 sock_put(sk);
1114}
1115
1116static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
1117 int addr_len, int flags)
1118{
1119 int err;
1120 struct sock *sk;
1121 struct vsock_sock *vsk;
1122 struct sockaddr_vm *remote_addr;
1123 long timeout;
1124 DEFINE_WAIT(wait);
1125
1126 err = 0;
1127 sk = sock->sk;
1128 vsk = vsock_sk(sk);
1129
1130 lock_sock(sk);
1131
1132 /* XXX AF_UNSPEC should make us disconnect like AF_INET. */
1133 switch (sock->state) {
1134 case SS_CONNECTED:
1135 err = -EISCONN;
1136 goto out;
1137 case SS_DISCONNECTING:
1138 err = -EINVAL;
1139 goto out;
1140 case SS_CONNECTING:
1141 /* This continues on so we can move sock into the SS_CONNECTED
1142 * state once the connection has completed (at which point err
1143 * will be set to zero also). Otherwise, we will either wait
1144 * for the connection or return -EALREADY should this be a
1145 * non-blocking call.
1146 */
1147 err = -EALREADY;
1148 break;
1149 default:
1150 if ((sk->sk_state == SS_LISTEN) ||
1151 vsock_addr_cast(addr, addr_len, &remote_addr) != 0) {
1152 err = -EINVAL;
1153 goto out;
1154 }
1155
1156 /* The hypervisor and well-known contexts do not have socket
1157 * endpoints.
1158 */
1159 if (!transport->stream_allow(remote_addr->svm_cid,
1160 remote_addr->svm_port)) {
1161 err = -ENETUNREACH;
1162 goto out;
1163 }
1164
1165 /* Set the remote address that we are connecting to. */
1166 memcpy(&vsk->remote_addr, remote_addr,
1167 sizeof(vsk->remote_addr));
1168
1169 /* Autobind this socket to the local address if necessary. */
1170 if (!vsock_addr_bound(&vsk->local_addr)) {
1171 struct sockaddr_vm local_addr;
1172
1173 vsock_addr_init(&local_addr, VMADDR_CID_ANY,
1174 VMADDR_PORT_ANY);
1175 err = __vsock_bind(sk, &local_addr);
1176 if (err != 0)
1177 goto out;
1178
1179 }
1180
1181 sk->sk_state = SS_CONNECTING;
1182
1183 err = transport->connect(vsk);
1184 if (err < 0)
1185 goto out;
1186
1187 /* Mark sock as connecting and set the error code to in
1188 * progress in case this is a non-blocking connect.
1189 */
1190 sock->state = SS_CONNECTING;
1191 err = -EINPROGRESS;
1192 }
1193
1194 /* The receive path will handle all communication until we are able to
1195 * enter the connected state. Here we wait for the connection to be
1196 * completed or a notification of an error.
1197 */
1198 timeout = vsk->connect_timeout;
1199 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1200
1201 while (sk->sk_state != SS_CONNECTED && sk->sk_err == 0) {
1202 if (flags & O_NONBLOCK) {
1203 /* If we're not going to block, we schedule a timeout
1204 * function to generate a timeout on the connection
1205 * attempt, in case the peer doesn't respond in a
1206 * timely manner. We hold on to the socket until the
1207 * timeout fires.
1208 */
1209 sock_hold(sk);
1210 INIT_DELAYED_WORK(&vsk->dwork,
1211 vsock_connect_timeout);
1212 schedule_delayed_work(&vsk->dwork, timeout);
1213
1214 /* Skip ahead to preserve error code set above. */
1215 goto out_wait;
1216 }
1217
1218 release_sock(sk);
1219 timeout = schedule_timeout(timeout);
1220 lock_sock(sk);
1221
1222 if (signal_pending(current)) {
1223 err = sock_intr_errno(timeout);
1224 goto out_wait_error;
1225 } else if (timeout == 0) {
1226 err = -ETIMEDOUT;
1227 goto out_wait_error;
1228 }
1229
1230 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1231 }
1232
1233 if (sk->sk_err) {
1234 err = -sk->sk_err;
1235 goto out_wait_error;
1236 } else
1237 err = 0;
1238
1239out_wait:
1240 finish_wait(sk_sleep(sk), &wait);
1241out:
1242 release_sock(sk);
1243 return err;
1244
1245out_wait_error:
1246 sk->sk_state = SS_UNCONNECTED;
1247 sock->state = SS_UNCONNECTED;
1248 goto out_wait;
1249}
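Because the non-blocking path above returns -EINPROGRESS and arms a timeout, user space can use the same completion idiom as AF_INET: poll for writability, then read SO_ERROR. A sketch under that assumption (editor's illustration; fd is an AF_VSOCK stream socket and addr a prepared destination):

#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <sys/socket.h>

/* returns 0 once the connection completed, -1 on error or timeout */
static int connect_nonblock(int fd, const struct sockaddr *addr,
			    socklen_t len, int timeout_ms)
{
	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
	socklen_t errlen = sizeof(int);
	int err = 0;

	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);

	if (connect(fd, addr, len) == 0)
		return 0;		/* completed immediately */
	if (errno != EINPROGRESS)
		return -1;

	if (poll(&pfd, 1, timeout_ms) != 1)
		return -1;		/* timeout or poll failure */

	/* ETIMEDOUT set by vsock_connect_timeout() shows up here */
	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &errlen) < 0 || err)
		return -1;
	return 0;
}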
1250
1251static int vsock_accept(struct socket *sock, struct socket *newsock, int flags)
1252{
1253 struct sock *listener;
1254 int err;
1255 struct sock *connected;
1256 struct vsock_sock *vconnected;
1257 long timeout;
1258 DEFINE_WAIT(wait);
1259
1260 err = 0;
1261 listener = sock->sk;
1262
1263 lock_sock(listener);
1264
1265 if (sock->type != SOCK_STREAM) {
1266 err = -EOPNOTSUPP;
1267 goto out;
1268 }
1269
1270 if (listener->sk_state != SS_LISTEN) {
1271 err = -EINVAL;
1272 goto out;
1273 }
1274
1275 /* Wait for child sockets to appear; these are the new sockets
1276 * created upon connection establishment.
1277 */
1278 timeout = sock_sndtimeo(listener, flags & O_NONBLOCK);
1279 prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
1280
1281 while ((connected = vsock_dequeue_accept(listener)) == NULL &&
1282 listener->sk_err == 0) {
1283 release_sock(listener);
1284 timeout = schedule_timeout(timeout);
1285 lock_sock(listener);
1286
1287 if (signal_pending(current)) {
1288 err = sock_intr_errno(timeout);
1289 goto out_wait;
1290 } else if (timeout == 0) {
1291 err = -EAGAIN;
1292 goto out_wait;
1293 }
1294
1295 prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
1296 }
1297
1298 if (listener->sk_err)
1299 err = -listener->sk_err;
1300
1301 if (connected) {
1302 listener->sk_ack_backlog--;
1303
1304 lock_sock(connected);
1305 vconnected = vsock_sk(connected);
1306
1307 /* If the listener socket has received an error, then we should
1308 * reject this socket and return. Note that we simply mark the
1309 * socket rejected, drop our reference, and let the cleanup
1310 * function handle the cleanup; the fact that we found it in
1311 * the listener's accept queue guarantees that the cleanup
1312 * function hasn't run yet.
1313 */
1314 if (err) {
1315 vconnected->rejected = true;
1316 release_sock(connected);
1317 sock_put(connected);
1318 goto out_wait;
1319 }
1320
1321 newsock->state = SS_CONNECTED;
1322 sock_graft(connected, newsock);
1323 release_sock(connected);
1324 sock_put(connected);
1325 }
1326
1327out_wait:
1328 finish_wait(sk_sleep(listener), &wait);
1329out:
1330 release_sock(listener);
1331 return err;
1332}
1333
1334static int vsock_listen(struct socket *sock, int backlog)
1335{
1336 int err;
1337 struct sock *sk;
1338 struct vsock_sock *vsk;
1339
1340 sk = sock->sk;
1341
1342 lock_sock(sk);
1343
1344 if (sock->type != SOCK_STREAM) {
1345 err = -EOPNOTSUPP;
1346 goto out;
1347 }
1348
1349 if (sock->state != SS_UNCONNECTED) {
1350 err = -EINVAL;
1351 goto out;
1352 }
1353
1354 vsk = vsock_sk(sk);
1355
1356 if (!vsock_addr_bound(&vsk->local_addr)) {
1357 err = -EINVAL;
1358 goto out;
1359 }
1360
1361 sk->sk_max_ack_backlog = backlog;
1362 sk->sk_state = SS_LISTEN;
1363
1364 err = 0;
1365
1366out:
1367 release_sock(sk);
1368 return err;
1369}
1370
1371static int vsock_stream_setsockopt(struct socket *sock,
1372 int level,
1373 int optname,
1374 char __user *optval,
1375 unsigned int optlen)
1376{
1377 int err;
1378 struct sock *sk;
1379 struct vsock_sock *vsk;
1380 u64 val;
1381
1382 if (level != AF_VSOCK)
1383 return -ENOPROTOOPT;
1384
1385#define COPY_IN(_v) \
1386 do { \
1387 if (optlen < sizeof(_v)) { \
1388 err = -EINVAL; \
1389 goto exit; \
1390 } \
1391 if (copy_from_user(&_v, optval, sizeof(_v)) != 0) { \
1392 err = -EFAULT; \
1393 goto exit; \
1394 } \
1395 } while (0)
1396
1397 err = 0;
1398 sk = sock->sk;
1399 vsk = vsock_sk(sk);
1400
1401 lock_sock(sk);
1402
1403 switch (optname) {
1404 case SO_VM_SOCKETS_BUFFER_SIZE:
1405 COPY_IN(val);
1406 transport->set_buffer_size(vsk, val);
1407 break;
1408
1409 case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
1410 COPY_IN(val);
1411 transport->set_max_buffer_size(vsk, val);
1412 break;
1413
1414 case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
1415 COPY_IN(val);
1416 transport->set_min_buffer_size(vsk, val);
1417 break;
1418
1419 case SO_VM_SOCKETS_CONNECT_TIMEOUT: {
1420 struct timeval tv;
1421 COPY_IN(tv);
1422 if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC &&
1423 tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) {
1424 vsk->connect_timeout = tv.tv_sec * HZ +
1425 DIV_ROUND_UP(tv.tv_usec, (1000000 / HZ));
1426 if (vsk->connect_timeout == 0)
1427 vsk->connect_timeout =
1428 VSOCK_DEFAULT_CONNECT_TIMEOUT;
1429
1430 } else {
1431 err = -ERANGE;
1432 }
1433 break;
1434 }
1435
1436 default:
1437 err = -ENOPROTOOPT;
1438 break;
1439 }
1440
1441#undef COPY_IN
1442
1443exit:
1444 release_sock(sk);
1445 return err;
1446}
1447
1448static int vsock_stream_getsockopt(struct socket *sock,
1449 int level, int optname,
1450 char __user *optval,
1451 int __user *optlen)
1452{
1453 int err;
1454 int len;
1455 struct sock *sk;
1456 struct vsock_sock *vsk;
1457 u64 val;
1458
1459 if (level != AF_VSOCK)
1460 return -ENOPROTOOPT;
1461
1462 err = get_user(len, optlen);
1463 if (err != 0)
1464 return err;
1465
1466#define COPY_OUT(_v) \
1467 do { \
1468 if (len < sizeof(_v)) \
1469 return -EINVAL; \
1470 \
1471 len = sizeof(_v); \
1472 if (copy_to_user(optval, &_v, len) != 0) \
1473 return -EFAULT; \
1474 \
1475 } while (0)
1476
1477 err = 0;
1478 sk = sock->sk;
1479 vsk = vsock_sk(sk);
1480
1481 switch (optname) {
1482 case SO_VM_SOCKETS_BUFFER_SIZE:
1483 val = transport->get_buffer_size(vsk);
1484 COPY_OUT(val);
1485 break;
1486
1487 case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
1488 val = transport->get_max_buffer_size(vsk);
1489 COPY_OUT(val);
1490 break;
1491
1492 case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
1493 val = transport->get_min_buffer_size(vsk);
1494 COPY_OUT(val);
1495 break;
1496
1497 case SO_VM_SOCKETS_CONNECT_TIMEOUT: {
1498 struct timeval tv;
1499 tv.tv_sec = vsk->connect_timeout / HZ;
1500 tv.tv_usec =
1501 (vsk->connect_timeout -
1502 tv.tv_sec * HZ) * (1000000 / HZ);
1503 COPY_OUT(tv);
1504 break;
1505 }
1506 default:
1507 return -ENOPROTOOPT;
1508 }
1509
1510 err = put_user(len, optlen);
1511 if (err != 0)
1512 return -EFAULT;
1513
1514#undef COPY_OUT
1515
1516 return 0;
1517}
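Note that both option handlers take AF_VSOCK as the option level, not SOL_SOCKET. A user-space sketch (editor's illustration; the values are made up, and the buffer-size options carry a 64-bit quantity to match the u64 handling above):

#include <stdio.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <linux/vm_sockets.h>

static void tune_vsock_socket(int fd)
{
	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
	unsigned long long size;
	socklen_t len = sizeof(size);

	/* give connect() five seconds instead of the 2 * HZ default */
	if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_CONNECT_TIMEOUT,
		       &tv, sizeof(tv)) < 0)
		perror("setsockopt");

	if (getsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
		       &size, &len) == 0)
		printf("buffer size: %llu\n", size);
}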
1518
1519static int vsock_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1520 struct msghdr *msg, size_t len)
1521{
1522 struct sock *sk;
1523 struct vsock_sock *vsk;
1524 ssize_t total_written;
1525 long timeout;
1526 int err;
1527 struct vsock_transport_send_notify_data send_data;
1528
1529 DEFINE_WAIT(wait);
1530
1531 sk = sock->sk;
1532 vsk = vsock_sk(sk);
1533 total_written = 0;
1534 err = 0;
1535
1536 if (msg->msg_flags & MSG_OOB)
1537 return -EOPNOTSUPP;
1538
1539 lock_sock(sk);
1540
1541 /* Callers should not provide a destination with stream sockets. */
1542 if (msg->msg_namelen) {
1543 err = sk->sk_state == SS_CONNECTED ? -EISCONN : -EOPNOTSUPP;
1544 goto out;
1545 }
1546
1547 /* Send data only if both sides are not shutdown in the direction. */
1548 if (sk->sk_shutdown & SEND_SHUTDOWN ||
1549 vsk->peer_shutdown & RCV_SHUTDOWN) {
1550 err = -EPIPE;
1551 goto out;
1552 }
1553
1554 if (sk->sk_state != SS_CONNECTED ||
1555 !vsock_addr_bound(&vsk->local_addr)) {
1556 err = -ENOTCONN;
1557 goto out;
1558 }
1559
1560 if (!vsock_addr_bound(&vsk->remote_addr)) {
1561 err = -EDESTADDRREQ;
1562 goto out;
1563 }
1564
1565 /* Wait for room in the produce queue to enqueue our user's data. */
1566 timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1567
1568 err = transport->notify_send_init(vsk, &send_data);
1569 if (err < 0)
1570 goto out;
1571
1572 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1573
1574 while (total_written < len) {
1575 ssize_t written;
1576
1577 while (vsock_stream_has_space(vsk) == 0 &&
1578 sk->sk_err == 0 &&
1579 !(sk->sk_shutdown & SEND_SHUTDOWN) &&
1580 !(vsk->peer_shutdown & RCV_SHUTDOWN)) {
1581
1582 /* Don't wait for non-blocking sockets. */
1583 if (timeout == 0) {
1584 err = -EAGAIN;
1585 goto out_wait;
1586 }
1587
1588 err = transport->notify_send_pre_block(vsk, &send_data);
1589 if (err < 0)
1590 goto out_wait;
1591
1592 release_sock(sk);
1593 timeout = schedule_timeout(timeout);
1594 lock_sock(sk);
1595 if (signal_pending(current)) {
1596 err = sock_intr_errno(timeout);
1597 goto out_wait;
1598 } else if (timeout == 0) {
1599 err = -EAGAIN;
1600 goto out_wait;
1601 }
1602
1603 prepare_to_wait(sk_sleep(sk), &wait,
1604 TASK_INTERRUPTIBLE);
1605 }
1606
1607 /* These checks occur both as part of and after the loop
1608 * conditional since we need to check before and after
1609 * sleeping.
1610 */
1611 if (sk->sk_err) {
1612 err = -sk->sk_err;
1613 goto out_wait;
1614 } else if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
1615 (vsk->peer_shutdown & RCV_SHUTDOWN)) {
1616 err = -EPIPE;
1617 goto out_wait;
1618 }
1619
1620 err = transport->notify_send_pre_enqueue(vsk, &send_data);
1621 if (err < 0)
1622 goto out_wait;
1623
1624 /* Note that enqueue will only write as many bytes as are free
1625 * in the produce queue, so we don't need to ensure len is
1626 * smaller than the queue size. It is the caller's
1627 * responsibility to check how many bytes we were able to send.
1628 */
1629
1630 written = transport->stream_enqueue(
1631 vsk, msg->msg_iov,
1632 len - total_written);
1633 if (written < 0) {
1634 err = -ENOMEM;
1635 goto out_wait;
1636 }
1637
1638 total_written += written;
1639
1640 err = transport->notify_send_post_enqueue(
1641 vsk, written, &send_data);
1642 if (err < 0)
1643 goto out_wait;
1644
1645 }
1646
1647out_wait:
1648 if (total_written > 0)
1649 err = total_written;
1650 finish_wait(sk_sleep(sk), &wait);
1651out:
1652 release_sock(sk);
1653 return err;
1654}
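As the comment above notes, stream_enqueue() writes only as much as currently fits, and this function returns the partial count once anything has been written (for example when a non-blocking send runs out of space midway). User-space callers should therefore loop; a conventional sketch, assuming plain POSIX send():

#include <errno.h>
#include <sys/socket.h>

/* send all of buf, retrying short writes; returns 0 or -1 with errno set */
static int send_all(int fd, const char *buf, size_t len)
{
	while (len > 0) {
		ssize_t n = send(fd, buf, len, 0);

		if (n < 0) {
			if (errno == EINTR)
				continue;
			return -1;	/* EAGAIN etc.: caller decides */
		}
		buf += n;
		len -= n;
	}
	return 0;
}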
1655
1656
1657static int
1658vsock_stream_recvmsg(struct kiocb *kiocb,
1659 struct socket *sock,
1660 struct msghdr *msg, size_t len, int flags)
1661{
1662 struct sock *sk;
1663 struct vsock_sock *vsk;
1664 int err;
1665 size_t target;
1666 ssize_t copied;
1667 long timeout;
1668 struct vsock_transport_recv_notify_data recv_data;
1669
1670 DEFINE_WAIT(wait);
1671
1672 sk = sock->sk;
1673 vsk = vsock_sk(sk);
1674 err = 0;
1675
1676 lock_sock(sk);
1677
1678 if (sk->sk_state != SS_CONNECTED) {
1679 /* Recvmsg is supposed to return 0 if a peer performs an
1680 * orderly shutdown. Differentiate between that case and when a
1681 * peer has not connected or a local shutdown occurred with the
1682 * SOCK_DONE flag.
1683 */
1684 if (sock_flag(sk, SOCK_DONE))
1685 err = 0;
1686 else
1687 err = -ENOTCONN;
1688
1689 goto out;
1690 }
1691
1692 if (flags & MSG_OOB) {
1693 err = -EOPNOTSUPP;
1694 goto out;
1695 }
1696
1697 /* We don't check peer_shutdown flag here since peer may actually shut
1698 * down, but there can be data in the queue that a local socket can
1699 * receive.
1700 */
1701 if (sk->sk_shutdown & RCV_SHUTDOWN) {
1702 err = 0;
1703 goto out;
1704 }
1705
1706 /* It is valid on Linux to pass in a zero-length receive buffer. This
1707 * is not an error. We may as well bail out now.
1708 */
1709 if (!len) {
1710 err = 0;
1711 goto out;
1712 }
1713
1714 /* We must not copy less than target bytes into the user's buffer
1715 * before returning successfully, so we wait for the consume queue to
1716 * have that much data to consume before dequeueing. Note that this
1717 * makes it impossible to handle cases where target is greater than the
1718 * queue size.
1719 */
1720 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1721 if (target >= transport->stream_rcvhiwat(vsk)) {
1722 err = -ENOMEM;
1723 goto out;
1724 }
1725 timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
1726 copied = 0;
1727
1728 err = transport->notify_recv_init(vsk, target, &recv_data);
1729 if (err < 0)
1730 goto out;
1731
1732 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1733
1734 while (1) {
1735 s64 ready = vsock_stream_has_data(vsk);
1736
1737 if (ready < 0) {
1738 /* Invalid queue pair content. XXX This should be
1739 * changed to a connection reset in a later change.
1740 */
1741
1742 err = -ENOMEM;
1743 goto out_wait;
1744 } else if (ready > 0) {
1745 ssize_t read;
1746
1747 err = transport->notify_recv_pre_dequeue(
1748 vsk, target, &recv_data);
1749 if (err < 0)
1750 break;
1751
1752 read = transport->stream_dequeue(
1753 vsk, msg->msg_iov,
1754 len - copied, flags);
1755 if (read < 0) {
1756 err = -ENOMEM;
1757 break;
1758 }
1759
1760 copied += read;
1761
1762 err = transport->notify_recv_post_dequeue(
1763 vsk, target, read,
1764 !(flags & MSG_PEEK), &recv_data);
1765 if (err < 0)
1766 goto out_wait;
1767
1768 if (read >= target || flags & MSG_PEEK)
1769 break;
1770
1771 target -= read;
1772 } else {
1773 if (sk->sk_err != 0 || (sk->sk_shutdown & RCV_SHUTDOWN)
1774 || (vsk->peer_shutdown & SEND_SHUTDOWN)) {
1775 break;
1776 }
1777 /* Don't wait for non-blocking sockets. */
1778 if (timeout == 0) {
1779 err = -EAGAIN;
1780 break;
1781 }
1782
1783 err = transport->notify_recv_pre_block(
1784 vsk, target, &recv_data);
1785 if (err < 0)
1786 break;
1787
1788 release_sock(sk);
1789 timeout = schedule_timeout(timeout);
1790 lock_sock(sk);
1791
1792 if (signal_pending(current)) {
1793 err = sock_intr_errno(timeout);
1794 break;
1795 } else if (timeout == 0) {
1796 err = -EAGAIN;
1797 break;
1798 }
1799
1800 prepare_to_wait(sk_sleep(sk), &wait,
1801 TASK_INTERRUPTIBLE);
1802 }
1803 }
1804
1805 if (sk->sk_err)
1806 err = -sk->sk_err;
1807 else if (sk->sk_shutdown & RCV_SHUTDOWN)
1808 err = 0;
1809
1810 if (copied > 0) {
1811 /* We only do these additional bookkeeping/notification steps
1812 * if we actually copied something out of the queue pair
1813 * instead of just peeking ahead.
1814 */
1815
1816 if (!(flags & MSG_PEEK)) {
1817 /* If the other side has shutdown for sending and there
1818 * is nothing more to read, then modify the socket
1819 * state.
1820 */
1821 if (vsk->peer_shutdown & SEND_SHUTDOWN) {
1822 if (vsock_stream_has_data(vsk) <= 0) {
1823 sk->sk_state = SS_UNCONNECTED;
1824 sock_set_flag(sk, SOCK_DONE);
1825 sk->sk_state_change(sk);
1826 }
1827 }
1828 }
1829 err = copied;
1830 }
1831
1832out_wait:
1833 finish_wait(sk_sleep(sk), &wait);
1834out:
1835 release_sock(sk);
1836 return err;
1837}
1838
1839static const struct proto_ops vsock_stream_ops = {
1840 .family = PF_VSOCK,
1841 .owner = THIS_MODULE,
1842 .release = vsock_release,
1843 .bind = vsock_bind,
1844 .connect = vsock_stream_connect,
1845 .socketpair = sock_no_socketpair,
1846 .accept = vsock_accept,
1847 .getname = vsock_getname,
1848 .poll = vsock_poll,
1849 .ioctl = sock_no_ioctl,
1850 .listen = vsock_listen,
1851 .shutdown = vsock_shutdown,
1852 .setsockopt = vsock_stream_setsockopt,
1853 .getsockopt = vsock_stream_getsockopt,
1854 .sendmsg = vsock_stream_sendmsg,
1855 .recvmsg = vsock_stream_recvmsg,
1856 .mmap = sock_no_mmap,
1857 .sendpage = sock_no_sendpage,
1858};
1859
1860static int vsock_create(struct net *net, struct socket *sock,
1861 int protocol, int kern)
1862{
1863 if (!sock)
1864 return -EINVAL;
1865
1866 if (protocol)
1867 return -EPROTONOSUPPORT;
1868
1869 switch (sock->type) {
1870 case SOCK_DGRAM:
1871 sock->ops = &vsock_dgram_ops;
1872 break;
1873 case SOCK_STREAM:
1874 sock->ops = &vsock_stream_ops;
1875 break;
1876 default:
1877 return -ESOCKTNOSUPPORT;
1878 }
1879
1880 sock->state = SS_UNCONNECTED;
1881
1882 return __vsock_create(net, sock, NULL, GFP_KERNEL, 0) ? 0 : -ENOMEM;
1883}
1884
1885static const struct net_proto_family vsock_family_ops = {
1886 .family = AF_VSOCK,
1887 .create = vsock_create,
1888 .owner = THIS_MODULE,
1889};
1890
1891static long vsock_dev_do_ioctl(struct file *filp,
1892 unsigned int cmd, void __user *ptr)
1893{
1894 u32 __user *p = ptr;
1895 int retval = 0;
1896
1897 switch (cmd) {
1898 case IOCTL_VM_SOCKETS_GET_LOCAL_CID:
1899 if (put_user(transport->get_local_cid(), p) != 0)
1900 retval = -EFAULT;
1901 break;
1902
1903 default:
1904 pr_err("Unknown ioctl %d\n", cmd);
1905 retval = -EINVAL;
1906 }
1907
1908 return retval;
1909}
1910
1911static long vsock_dev_ioctl(struct file *filp,
1912 unsigned int cmd, unsigned long arg)
1913{
1914 return vsock_dev_do_ioctl(filp, cmd, (void __user *)arg);
1915}
1916
1917#ifdef CONFIG_COMPAT
1918static long vsock_dev_compat_ioctl(struct file *filp,
1919 unsigned int cmd, unsigned long arg)
1920{
1921 return vsock_dev_do_ioctl(filp, cmd, compat_ptr(arg));
1922}
1923#endif
1924
1925static const struct file_operations vsock_device_ops = {
1926 .owner = THIS_MODULE,
1927 .unlocked_ioctl = vsock_dev_ioctl,
1928#ifdef CONFIG_COMPAT
1929 .compat_ioctl = vsock_dev_compat_ioctl,
1930#endif
1931 .open = nonseekable_open,
1932};
1933
1934static struct miscdevice vsock_device = {
1935 .name = "vsock",
1936 .minor = MISC_DYNAMIC_MINOR,
1937 .fops = &vsock_device_ops,
1938};
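The misc device exists so that user space can discover its own context ID without creating a socket. A sketch of the one supported ioctl (editor's illustration; IOCTL_VM_SOCKETS_GET_LOCAL_CID comes from the uapi header added in this series):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vm_sockets.h>

int main(void)
{
	unsigned int cid;
	int fd = open("/dev/vsock", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/vsock");
		return 1;
	}
	if (ioctl(fd, IOCTL_VM_SOCKETS_GET_LOCAL_CID, &cid) < 0) {
		perror("ioctl");
		close(fd);
		return 1;
	}
	printf("local CID: %u\n", cid);
	close(fd);
	return 0;
}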
1939
1940static int __vsock_core_init(void)
1941{
1942 int err;
1943
1944 vsock_init_tables();
1945
1946 err = misc_register(&vsock_device);
1947 if (err) {
1948 pr_err("Failed to register misc device\n");
1949 return -ENOENT;
1950 }
1951
1952 err = proto_register(&vsock_proto, 1); /* we want our slab */
1953 if (err) {
1954 pr_err("Cannot register vsock protocol\n");
1955 goto err_misc_deregister;
1956 }
1957
1958 err = sock_register(&vsock_family_ops);
1959 if (err) {
1960 pr_err("could not register af_vsock (%d) address family: %d\n",
1961 AF_VSOCK, err);
1962 goto err_unregister_proto;
1963 }
1964
1965 return 0;
1966
1967err_unregister_proto:
1968 proto_unregister(&vsock_proto);
1969err_misc_deregister:
1970 misc_deregister(&vsock_device);
1971 return err;
1972}
1973
1974int vsock_core_init(const struct vsock_transport *t)
1975{
1976 int retval = mutex_lock_interruptible(&vsock_register_mutex);
1977 if (retval)
1978 return retval;
1979
1980 if (transport) {
1981 retval = -EBUSY;
1982 goto out;
1983 }
1984
1985 transport = t;
1986 retval = __vsock_core_init();
1987 if (retval)
1988 transport = NULL;
1989
1990out:
1991 mutex_unlock(&vsock_register_mutex);
1992 return retval;
1993}
1994EXPORT_SYMBOL_GPL(vsock_core_init);
1995
1996void vsock_core_exit(void)
1997{
1998 mutex_lock(&vsock_register_mutex);
1999
2000 misc_deregister(&vsock_device);
2001 sock_unregister(AF_VSOCK);
2002 proto_unregister(&vsock_proto);
2003
2004 /* We do not want the assignment below re-ordered. */
2005 mb();
2006 transport = NULL;
2007
2008 mutex_unlock(&vsock_register_mutex);
2009}
2010EXPORT_SYMBOL_GPL(vsock_core_exit);
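vsock_core_init()/vsock_core_exit() are the whole registration surface for a transport, and exactly one transport may be registered at a time (a second caller gets -EBUSY). A heavily abridged sketch of a transport module follows; this is an editor's illustration, my_transport and the CID value are made up, and a real transport must fill in every callback this file dereferences (init, connect, stream_enqueue, the notify_* hooks, and so on).

#include <linux/module.h>

#include "af_vsock.h"

static u32 my_get_local_cid(void)
{
	return 42;	/* made-up local context ID */
}

static const struct vsock_transport my_transport = {
	.get_local_cid = my_get_local_cid,
	/* ... all remaining callbacks must be provided before real use ... */
};

static int __init my_transport_init(void)
{
	return vsock_core_init(&my_transport);
}

static void __exit my_transport_exit(void)
{
	vsock_core_exit();
}

module_init(my_transport_init);
module_exit(my_transport_exit);
MODULE_LICENSE("GPL");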
2011
2012MODULE_AUTHOR("VMware, Inc.");
2013MODULE_DESCRIPTION("VMware Virtual Socket Family");
2014MODULE_VERSION(VSOCK_DRIVER_VERSION_STRING);
2015MODULE_LICENSE("GPL v2");