author     Andy King <acking@vmware.com>          2013-02-06 09:23:56 -0500
committer  David S. Miller <davem@davemloft.net>  2013-02-10 19:41:08 -0500
commit     d021c344051af91f42c5ba9fdedc176740cbd238
tree       8c02cd94a59556da4b74823816e670dd007db72f
parent     fd5023111cf720db890ef34f305ac5d427e690a0
VSOCK: Introduce VM Sockets
VM Sockets allows communication between virtual machines and the hypervisor. User-level applications both in a virtual machine and on the host can use the VM Sockets API, which facilitates fast and efficient communication between guest virtual machines and their host. A socket address family, designed to be compatible with UDP and TCP at the interface level, is provided.

Today, VM Sockets is used by various VMware Tools components inside the guest for zero-config, network-less access to VMware host services. In addition, VMware's users use VM Sockets for various applications where network access to the virtual machine is restricted or non-existent. Examples are VMs communicating with device proxies for proprietary hardware running as host applications, and automated testing of applications running within virtual machines.

VM Sockets is similar to other socket types, such as the Berkeley UNIX socket interface. The VM Sockets module supports both connection-oriented stream sockets, like TCP, and connectionless datagram sockets, like UDP. The VM Sockets protocol family is defined as "AF_VSOCK", and the socket operations are split between SOCK_DGRAM and SOCK_STREAM.

For additional information about the use of VM Sockets, please refer to the VM Sockets Programming Guide available at:

https://www.vmware.com/support/developer/vmci-sdk/

Signed-off-by: George Zhang <georgezhang@vmware.com>
Signed-off-by: Dmitry Torokhov <dtor@vmware.com>
Signed-off-by: Andy King <acking@vmware.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
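[Editor's note] To make the interface-level compatibility with TCP concrete, here is a minimal, hypothetical guest-side stream client against the new family. The CID and port values are illustrative placeholders, not part of this patch; error handling is abbreviated:

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <linux/vm_sockets.h>

	int main(void)
	{
		struct sockaddr_vm addr;
		int fd;

		/* AF_VSOCK sockets are created like any other family. */
		fd = socket(AF_VSOCK, SOCK_STREAM, 0);
		if (fd < 0) {
			perror("socket");
			return 1;
		}

		/* Addresses are (context ID, port) pairs rather than (IP, port). */
		memset(&addr, 0, sizeof(addr));
		addr.svm_family = AF_VSOCK;
		addr.svm_cid = 2;	/* placeholder: CID 2 conventionally addresses the host */
		addr.svm_port = 1234;	/* placeholder service port */

		if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
			perror("connect");
			close(fd);
			return 1;
		}

		/* From here on, read()/write() behave as on a TCP socket. */
		write(fd, "ping", 4);
		close(fd);
		return 0;
	}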
Diffstat (limited to 'net/vmw_vsock')
-rw-r--r--  net/vmw_vsock/Kconfig                           28
-rw-r--r--  net/vmw_vsock/Makefile                           7
-rw-r--r--  net/vmw_vsock/af_vsock.c                      2015
-rw-r--r--  net/vmw_vsock/af_vsock.h                       175
-rw-r--r--  net/vmw_vsock/vmci_transport.c                2157
-rw-r--r--  net/vmw_vsock/vmci_transport.h                 139
-rw-r--r--  net/vmw_vsock/vmci_transport_notify.c          680
-rw-r--r--  net/vmw_vsock/vmci_transport_notify.h           83
-rw-r--r--  net/vmw_vsock/vmci_transport_notify_qstate.c   438
-rw-r--r--  net/vmw_vsock/vsock_addr.c                      86
-rw-r--r--  net/vmw_vsock/vsock_addr.h                      32
-rw-r--r--  net/vmw_vsock/vsock_version.h                   22
12 files changed, 5862 insertions, 0 deletions
diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig
new file mode 100644
index 000000000000..b5fa7e40cdcb
--- /dev/null
+++ b/net/vmw_vsock/Kconfig
@@ -0,0 +1,28 @@
1#
2# Vsock protocol
3#
4
5config VSOCKETS
6 tristate "Virtual Socket protocol"
7 help
8 Virtual Socket Protocol is a socket protocol similar to TCP/IP
9	  allowing communication between Virtual Machines and the hypervisor
10 or host.
11
12 You should also select one or more hypervisor-specific transports
13 below.
14
15 To compile this driver as a module, choose M here: the module
16 will be called vsock. If unsure, say N.
17
18config VMWARE_VMCI_VSOCKETS
19 tristate "VMware VMCI transport for Virtual Sockets"
20 depends on VSOCKETS && VMWARE_VMCI
21 help
22 This module implements a VMCI transport for Virtual Sockets.
23
24 Enable this transport if your Virtual Machine runs on a VMware
25 hypervisor.
26
27 To compile this driver as a module, choose M here: the module
28 will be called vmw_vsock_vmci_transport. If unsure, say N.
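[Editor's note] For reference, a guest running on a VMware hypervisor would typically enable both options as modules; a minimal configuration fragment (assuming CONFIG_VMWARE_VMCI is already enabled) might look like:

	CONFIG_VSOCKETS=m
	CONFIG_VMWARE_VMCI_VSOCKETS=m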
diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
new file mode 100644
index 000000000000..2ce52d70f224
--- /dev/null
+++ b/net/vmw_vsock/Makefile
@@ -0,0 +1,7 @@
1obj-$(CONFIG_VSOCKETS) += vsock.o
2obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o
3
4vsock-y += af_vsock.o vsock_addr.o
5
6vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \
7 vmci_transport_notify_qstate.o
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
new file mode 100644
index 000000000000..54bb7bdf92d3
--- /dev/null
+++ b/net/vmw_vsock/af_vsock.c
@@ -0,0 +1,2015 @@
1/*
2 * VMware vSockets Driver
3 *
4 * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation version 2 and no later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 */
15
16/* Implementation notes:
17 *
18 * - There are two kinds of sockets: those created by user action (such as
19 * calling socket(2)) and those created by incoming connection request packets.
20 *
21 * - There are two "global" tables, one for bound sockets (sockets that have
22 * specified an address that they are responsible for) and one for connected
23 * sockets (sockets that have established a connection with another socket).
24 * These tables are "global" in that all sockets on the system are placed
25 * within them. - Note, though, that the bound table contains an extra entry
26 * for a list of unbound sockets and SOCK_DGRAM sockets will always remain in
27 * that list. The bound table is used solely for lookup of sockets when packets
28 * are received and that's not necessary for SOCK_DGRAM sockets since we create
29 * a datagram handle for each and need not perform a lookup. Keeping SOCK_DGRAM
30 * sockets out of the bound hash buckets will reduce the chance of collisions
31 * when looking for SOCK_STREAM sockets and prevent us from having to check the
32 * socket type in the hash table lookups.
33 *
34 * - Sockets created by user action will either be "client" sockets that
35 * initiate a connection or "server" sockets that listen for connections; we do
36 * not support simultaneous connects (two "client" sockets connecting).
37 *
38 * - "Server" sockets are referred to as listener sockets throughout this
39 * implementation because they are in the SS_LISTEN state. When a connection
40 * request is received (the second kind of socket mentioned above), we create a
41 * new socket and refer to it as a pending socket. These pending sockets are
42 * placed on the pending connection list of the listener socket. When future
43 * packets are received for the address the listener socket is bound to, we
44 * check if the source of the packet is from one that has an existing pending
45 * connection. If it does, we process the packet for the pending socket. When
46 * that socket reaches the connected state, it is removed from the listener
47 * socket's pending list and enqueued in the listener socket's accept queue.
48 * Callers of accept(2) will accept connected sockets from the listener socket's
49 * accept queue. If the socket cannot be accepted for some reason then it is
50 * marked rejected. Once the connection is accepted, it is owned by the user
51 * process and the responsibility for cleanup falls with that user process.
52 *
53 * - It is possible that these pending sockets will never reach the connected
54 * state; in fact, we may never receive another packet after the connection
55 * request. Because of this, we must schedule a cleanup function to run in the
56 * future, after some amount of time passes where a connection should have been
57 * established. This function ensures that the socket is off all lists so it
58 * cannot be retrieved, then drops all references to the socket so it is cleaned
59 * up (sock_put() -> sk_free() -> our sk_destruct implementation). Note this
60 * function will also clean up rejected sockets, those that reach the connected
61 * state but leave it before they have been accepted.
62 *
63 * - Sockets created by user action will be cleaned up when the user process
64 * calls close(2), causing our release implementation to be called. Our release
65 * implementation will perform some cleanup then drop the last reference so our
66 * sk_destruct implementation is invoked. Our sk_destruct implementation will
67 * perform additional cleanup that's common for both types of sockets.
68 *
69 * - A socket's reference count is what ensures that the structure won't be
70 * freed. Each entry in a list (such as the "global" bound and connected tables
71 * and the listener socket's pending list and connected queue) ensures a
72 * reference. When we defer work until process context and pass a socket as our
73 * argument, we must ensure the reference count is increased to ensure the
74 * socket isn't freed before the function is run; the deferred function will
75 * then drop the reference.
76 */
77
78#include <linux/types.h>
79
80#define EXPORT_SYMTAB
81#include <linux/bitops.h>
82#include <linux/cred.h>
83#include <linux/init.h>
84#include <linux/io.h>
85#include <linux/kernel.h>
86#include <linux/kmod.h>
87#include <linux/list.h>
88#include <linux/miscdevice.h>
89#include <linux/module.h>
90#include <linux/mutex.h>
91#include <linux/net.h>
92#include <linux/poll.h>
93#include <linux/skbuff.h>
94#include <linux/smp.h>
95#include <linux/socket.h>
96#include <linux/stddef.h>
97#include <linux/unistd.h>
98#include <linux/wait.h>
99#include <linux/workqueue.h>
100#include <net/sock.h>
101
102#include "af_vsock.h"
103#include "vsock_version.h"
104
105static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
106static void vsock_sk_destruct(struct sock *sk);
107static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
108
109/* Protocol family. */
110static struct proto vsock_proto = {
111 .name = "AF_VSOCK",
112 .owner = THIS_MODULE,
113 .obj_size = sizeof(struct vsock_sock),
114};
115
116/* The default peer timeout indicates how long we will wait for a peer response
117 * to a control message.
118 */
119#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)
120
121#define SS_LISTEN 255
122
123static const struct vsock_transport *transport;
124static DEFINE_MUTEX(vsock_register_mutex);
125
126/**** EXPORTS ****/
127
128/* Get the ID of the local context. This is transport dependent. */
129
130int vm_sockets_get_local_cid(void)
131{
132 return transport->get_local_cid();
133}
134EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid);
135
136/**** UTILS ****/
137
138/* Each bound VSocket is stored in the bind hash table and each connected
139 * VSocket is stored in the connected hash table.
140 *
141 * Unbound sockets are all put on the same list attached to the end of the hash
142 * table (vsock_unbound_sockets). Bound sockets are added to the hash table in
143 * the bucket that their local address hashes to (vsock_bound_sockets(addr)
144 * represents the list that addr hashes to).
145 *
146 * Specifically, we initialize the vsock_bind_table array to a size of
147 * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through
148 * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and
149 * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function
150 * mods with VSOCK_HASH_SIZE - 1 to ensure this.
151 */
152#define VSOCK_HASH_SIZE 251
153#define MAX_PORT_RETRIES 24
154
155#define VSOCK_HASH(addr) ((addr)->svm_port % (VSOCK_HASH_SIZE - 1))
156#define vsock_bound_sockets(addr) (&vsock_bind_table[VSOCK_HASH(addr)])
157#define vsock_unbound_sockets (&vsock_bind_table[VSOCK_HASH_SIZE])
158
159/* XXX This can probably be implemented in a better way. */
160#define VSOCK_CONN_HASH(src, dst) \
161 (((src)->svm_cid ^ (dst)->svm_port) % (VSOCK_HASH_SIZE - 1))
162#define vsock_connected_sockets(src, dst) \
163 (&vsock_connected_table[VSOCK_CONN_HASH(src, dst)])
164#define vsock_connected_sockets_vsk(vsk) \
165 vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr)
166
167static struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1];
168static struct list_head vsock_connected_table[VSOCK_HASH_SIZE];
169static DEFINE_SPINLOCK(vsock_table_lock);
170
171static __init void vsock_init_tables(void)
172{
173 int i;
174
175 for (i = 0; i < ARRAY_SIZE(vsock_bind_table); i++)
176 INIT_LIST_HEAD(&vsock_bind_table[i]);
177
178 for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++)
179 INIT_LIST_HEAD(&vsock_connected_table[i]);
180}
181
182static void __vsock_insert_bound(struct list_head *list,
183 struct vsock_sock *vsk)
184{
185 sock_hold(&vsk->sk);
186 list_add(&vsk->bound_table, list);
187}
188
189static void __vsock_insert_connected(struct list_head *list,
190 struct vsock_sock *vsk)
191{
192 sock_hold(&vsk->sk);
193 list_add(&vsk->connected_table, list);
194}
195
196static void __vsock_remove_bound(struct vsock_sock *vsk)
197{
198 list_del_init(&vsk->bound_table);
199 sock_put(&vsk->sk);
200}
201
202static void __vsock_remove_connected(struct vsock_sock *vsk)
203{
204 list_del_init(&vsk->connected_table);
205 sock_put(&vsk->sk);
206}
207
208static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
209{
210 struct vsock_sock *vsk;
211
212 list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table)
213 if (vsock_addr_equals_addr_any(addr, &vsk->local_addr))
214 return sk_vsock(vsk);
215
216 return NULL;
217}
218
219static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src,
220 struct sockaddr_vm *dst)
221{
222 struct vsock_sock *vsk;
223
224 list_for_each_entry(vsk, vsock_connected_sockets(src, dst),
225 connected_table) {
226 if (vsock_addr_equals_addr(src, &vsk->remote_addr)
227 && vsock_addr_equals_addr(dst, &vsk->local_addr)) {
228 return sk_vsock(vsk);
229 }
230 }
231
232 return NULL;
233}
234
235static bool __vsock_in_bound_table(struct vsock_sock *vsk)
236{
237 return !list_empty(&vsk->bound_table);
238}
239
240static bool __vsock_in_connected_table(struct vsock_sock *vsk)
241{
242 return !list_empty(&vsk->connected_table);
243}
244
245static void vsock_insert_unbound(struct vsock_sock *vsk)
246{
247 spin_lock_bh(&vsock_table_lock);
248 __vsock_insert_bound(vsock_unbound_sockets, vsk);
249 spin_unlock_bh(&vsock_table_lock);
250}
251
252void vsock_insert_connected(struct vsock_sock *vsk)
253{
254 struct list_head *list = vsock_connected_sockets(
255 &vsk->remote_addr, &vsk->local_addr);
256
257 spin_lock_bh(&vsock_table_lock);
258 __vsock_insert_connected(list, vsk);
259 spin_unlock_bh(&vsock_table_lock);
260}
261EXPORT_SYMBOL_GPL(vsock_insert_connected);
262
263void vsock_remove_bound(struct vsock_sock *vsk)
264{
265 spin_lock_bh(&vsock_table_lock);
266 __vsock_remove_bound(vsk);
267 spin_unlock_bh(&vsock_table_lock);
268}
269EXPORT_SYMBOL_GPL(vsock_remove_bound);
270
271void vsock_remove_connected(struct vsock_sock *vsk)
272{
273 spin_lock_bh(&vsock_table_lock);
274 __vsock_remove_connected(vsk);
275 spin_unlock_bh(&vsock_table_lock);
276}
277EXPORT_SYMBOL_GPL(vsock_remove_connected);
278
279struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
280{
281 struct sock *sk;
282
283 spin_lock_bh(&vsock_table_lock);
284 sk = __vsock_find_bound_socket(addr);
285 if (sk)
286 sock_hold(sk);
287
288 spin_unlock_bh(&vsock_table_lock);
289
290 return sk;
291}
292EXPORT_SYMBOL_GPL(vsock_find_bound_socket);
293
294struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
295 struct sockaddr_vm *dst)
296{
297 struct sock *sk;
298
299 spin_lock_bh(&vsock_table_lock);
300 sk = __vsock_find_connected_socket(src, dst);
301 if (sk)
302 sock_hold(sk);
303
304 spin_unlock_bh(&vsock_table_lock);
305
306 return sk;
307}
308EXPORT_SYMBOL_GPL(vsock_find_connected_socket);
309
310static bool vsock_in_bound_table(struct vsock_sock *vsk)
311{
312 bool ret;
313
314 spin_lock_bh(&vsock_table_lock);
315 ret = __vsock_in_bound_table(vsk);
316 spin_unlock_bh(&vsock_table_lock);
317
318 return ret;
319}
320
321static bool vsock_in_connected_table(struct vsock_sock *vsk)
322{
323 bool ret;
324
325 spin_lock_bh(&vsock_table_lock);
326 ret = __vsock_in_connected_table(vsk);
327 spin_unlock_bh(&vsock_table_lock);
328
329 return ret;
330}
331
332void vsock_for_each_connected_socket(void (*fn)(struct sock *sk))
333{
334 int i;
335
336 spin_lock_bh(&vsock_table_lock);
337
338 for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) {
339 struct vsock_sock *vsk;
340		list_for_each_entry(vsk, &vsock_connected_table[i],
341				    connected_table)
342			fn(sk_vsock(vsk));
343 }
344
345 spin_unlock_bh(&vsock_table_lock);
346}
347EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket);
348
349void vsock_add_pending(struct sock *listener, struct sock *pending)
350{
351 struct vsock_sock *vlistener;
352 struct vsock_sock *vpending;
353
354 vlistener = vsock_sk(listener);
355 vpending = vsock_sk(pending);
356
357 sock_hold(pending);
358 sock_hold(listener);
359 list_add_tail(&vpending->pending_links, &vlistener->pending_links);
360}
361EXPORT_SYMBOL_GPL(vsock_add_pending);
362
363void vsock_remove_pending(struct sock *listener, struct sock *pending)
364{
365 struct vsock_sock *vpending = vsock_sk(pending);
366
367 list_del_init(&vpending->pending_links);
368 sock_put(listener);
369 sock_put(pending);
370}
371EXPORT_SYMBOL_GPL(vsock_remove_pending);
372
373void vsock_enqueue_accept(struct sock *listener, struct sock *connected)
374{
375 struct vsock_sock *vlistener;
376 struct vsock_sock *vconnected;
377
378 vlistener = vsock_sk(listener);
379 vconnected = vsock_sk(connected);
380
381 sock_hold(connected);
382 sock_hold(listener);
383 list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue);
384}
385EXPORT_SYMBOL_GPL(vsock_enqueue_accept);
386
387static struct sock *vsock_dequeue_accept(struct sock *listener)
388{
389 struct vsock_sock *vlistener;
390 struct vsock_sock *vconnected;
391
392 vlistener = vsock_sk(listener);
393
394 if (list_empty(&vlistener->accept_queue))
395 return NULL;
396
397 vconnected = list_entry(vlistener->accept_queue.next,
398 struct vsock_sock, accept_queue);
399
400 list_del_init(&vconnected->accept_queue);
401 sock_put(listener);
402 /* The caller will need a reference on the connected socket so we let
403 * it call sock_put().
404 */
405
406 return sk_vsock(vconnected);
407}
408
409static bool vsock_is_accept_queue_empty(struct sock *sk)
410{
411 struct vsock_sock *vsk = vsock_sk(sk);
412 return list_empty(&vsk->accept_queue);
413}
414
415static bool vsock_is_pending(struct sock *sk)
416{
417 struct vsock_sock *vsk = vsock_sk(sk);
418 return !list_empty(&vsk->pending_links);
419}
420
421static int vsock_send_shutdown(struct sock *sk, int mode)
422{
423 return transport->shutdown(vsock_sk(sk), mode);
424}
425
426void vsock_pending_work(struct work_struct *work)
427{
428 struct sock *sk;
429 struct sock *listener;
430 struct vsock_sock *vsk;
431 bool cleanup;
432
433 vsk = container_of(work, struct vsock_sock, dwork.work);
434 sk = sk_vsock(vsk);
435 listener = vsk->listener;
436 cleanup = true;
437
438 lock_sock(listener);
439 lock_sock(sk);
440
441 if (vsock_is_pending(sk)) {
442 vsock_remove_pending(listener, sk);
443 } else if (!vsk->rejected) {
444 /* We are not on the pending list and accept() did not reject
445 * us, so we must have been accepted by our user process. We
446 * just need to drop our references to the sockets and be on
447 * our way.
448 */
449 cleanup = false;
450 goto out;
451 }
452
453 listener->sk_ack_backlog--;
454
455 /* We need to remove ourself from the global connected sockets list so
456 * incoming packets can't find this socket, and to reduce the reference
457 * count.
458 */
459 if (vsock_in_connected_table(vsk))
460 vsock_remove_connected(vsk);
461
462 sk->sk_state = SS_FREE;
463
464out:
465 release_sock(sk);
466 release_sock(listener);
467 if (cleanup)
468 sock_put(sk);
469
470 sock_put(sk);
471 sock_put(listener);
472}
473EXPORT_SYMBOL_GPL(vsock_pending_work);
474
475/**** SOCKET OPERATIONS ****/
476
477static int __vsock_bind_stream(struct vsock_sock *vsk,
478 struct sockaddr_vm *addr)
479{
480 static u32 port = LAST_RESERVED_PORT + 1;
481 struct sockaddr_vm new_addr;
482
483 vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port);
484
485 if (addr->svm_port == VMADDR_PORT_ANY) {
486 bool found = false;
487 unsigned int i;
488
489 for (i = 0; i < MAX_PORT_RETRIES; i++) {
490 if (port <= LAST_RESERVED_PORT)
491 port = LAST_RESERVED_PORT + 1;
492
493 new_addr.svm_port = port++;
494
495 if (!__vsock_find_bound_socket(&new_addr)) {
496 found = true;
497 break;
498 }
499 }
500
501 if (!found)
502 return -EADDRNOTAVAIL;
503 } else {
504 /* If port is in reserved range, ensure caller
505 * has necessary privileges.
506 */
507 if (addr->svm_port <= LAST_RESERVED_PORT &&
508 !capable(CAP_NET_BIND_SERVICE)) {
509 return -EACCES;
510 }
511
512 if (__vsock_find_bound_socket(&new_addr))
513 return -EADDRINUSE;
514 }
515
516 vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port);
517
518 /* Remove stream sockets from the unbound list and add them to the hash
519 * table for easy lookup by its address. The unbound list is simply an
520 * extra entry at the end of the hash table, a trick used by AF_UNIX.
521 */
522 __vsock_remove_bound(vsk);
523 __vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk);
524
525 return 0;
526}
527
528static int __vsock_bind_dgram(struct vsock_sock *vsk,
529 struct sockaddr_vm *addr)
530{
531 return transport->dgram_bind(vsk, addr);
532}
533
534static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
535{
536 struct vsock_sock *vsk = vsock_sk(sk);
537 u32 cid;
538 int retval;
539
540 /* First ensure this socket isn't already bound. */
541 if (vsock_addr_bound(&vsk->local_addr))
542 return -EINVAL;
543
544 /* Now bind to the provided address or select appropriate values if
545 * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that
546 * like AF_INET prevents binding to a non-local IP address (in most
547 * cases), we only allow binding to the local CID.
548 */
549 cid = transport->get_local_cid();
550 if (addr->svm_cid != cid && addr->svm_cid != VMADDR_CID_ANY)
551 return -EADDRNOTAVAIL;
552
553 switch (sk->sk_socket->type) {
554 case SOCK_STREAM:
555 spin_lock_bh(&vsock_table_lock);
556 retval = __vsock_bind_stream(vsk, addr);
557 spin_unlock_bh(&vsock_table_lock);
558 break;
559
560 case SOCK_DGRAM:
561 retval = __vsock_bind_dgram(vsk, addr);
562 break;
563
564 default:
565 retval = -EINVAL;
566 break;
567 }
568
569 return retval;
570}
571
572struct sock *__vsock_create(struct net *net,
573 struct socket *sock,
574 struct sock *parent,
575 gfp_t priority,
576 unsigned short type)
577{
578 struct sock *sk;
579 struct vsock_sock *psk;
580 struct vsock_sock *vsk;
581
582 sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto);
583 if (!sk)
584 return NULL;
585
586 sock_init_data(sock, sk);
587
588 /* sk->sk_type is normally set in sock_init_data, but only if sock is
589 * non-NULL. We make sure that our sockets always have a type by
590 * setting it here if needed.
591 */
592 if (!sock)
593 sk->sk_type = type;
594
595 vsk = vsock_sk(sk);
596 vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
597 vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
598
599 sk->sk_destruct = vsock_sk_destruct;
600 sk->sk_backlog_rcv = vsock_queue_rcv_skb;
601 sk->sk_state = 0;
602 sock_reset_flag(sk, SOCK_DONE);
603
604 INIT_LIST_HEAD(&vsk->bound_table);
605 INIT_LIST_HEAD(&vsk->connected_table);
606 vsk->listener = NULL;
607 INIT_LIST_HEAD(&vsk->pending_links);
608 INIT_LIST_HEAD(&vsk->accept_queue);
609 vsk->rejected = false;
610 vsk->sent_request = false;
611 vsk->ignore_connecting_rst = false;
612 vsk->peer_shutdown = 0;
613
614 psk = parent ? vsock_sk(parent) : NULL;
615 if (parent) {
616 vsk->trusted = psk->trusted;
617 vsk->owner = get_cred(psk->owner);
618 vsk->connect_timeout = psk->connect_timeout;
619 } else {
620 vsk->trusted = capable(CAP_NET_ADMIN);
621 vsk->owner = get_current_cred();
622 vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT;
623 }
624
625 if (transport->init(vsk, psk) < 0) {
626 sk_free(sk);
627 return NULL;
628 }
629
630 if (sock)
631 vsock_insert_unbound(vsk);
632
633 return sk;
634}
635EXPORT_SYMBOL_GPL(__vsock_create);
636
637static void __vsock_release(struct sock *sk)
638{
639 if (sk) {
640 struct sk_buff *skb;
641 struct sock *pending;
642 struct vsock_sock *vsk;
643
644 vsk = vsock_sk(sk);
645 pending = NULL; /* Compiler warning. */
646
647 if (vsock_in_bound_table(vsk))
648 vsock_remove_bound(vsk);
649
650 if (vsock_in_connected_table(vsk))
651 vsock_remove_connected(vsk);
652
653 transport->release(vsk);
654
655 lock_sock(sk);
656 sock_orphan(sk);
657 sk->sk_shutdown = SHUTDOWN_MASK;
658
659 while ((skb = skb_dequeue(&sk->sk_receive_queue)))
660 kfree_skb(skb);
661
662 /* Clean up any sockets that never were accepted. */
663 while ((pending = vsock_dequeue_accept(sk)) != NULL) {
664 __vsock_release(pending);
665 sock_put(pending);
666 }
667
668 release_sock(sk);
669 sock_put(sk);
670 }
671}
672
673static void vsock_sk_destruct(struct sock *sk)
674{
675 struct vsock_sock *vsk = vsock_sk(sk);
676
677 transport->destruct(vsk);
678
679 /* When clearing these addresses, there's no need to set the family and
680 * possibly register the address family with the kernel.
681 */
682 vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
683 vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
684
685 put_cred(vsk->owner);
686}
687
688static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
689{
690 int err;
691
692 err = sock_queue_rcv_skb(sk, skb);
693 if (err)
694 kfree_skb(skb);
695
696 return err;
697}
698
699s64 vsock_stream_has_data(struct vsock_sock *vsk)
700{
701 return transport->stream_has_data(vsk);
702}
703EXPORT_SYMBOL_GPL(vsock_stream_has_data);
704
705s64 vsock_stream_has_space(struct vsock_sock *vsk)
706{
707 return transport->stream_has_space(vsk);
708}
709EXPORT_SYMBOL_GPL(vsock_stream_has_space);
710
711static int vsock_release(struct socket *sock)
712{
713 __vsock_release(sock->sk);
714 sock->sk = NULL;
715 sock->state = SS_FREE;
716
717 return 0;
718}
719
720static int
721vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
722{
723 int err;
724 struct sock *sk;
725 struct sockaddr_vm *vm_addr;
726
727 sk = sock->sk;
728
729 if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0)
730 return -EINVAL;
731
732 lock_sock(sk);
733 err = __vsock_bind(sk, vm_addr);
734 release_sock(sk);
735
736 return err;
737}
738
739static int vsock_getname(struct socket *sock,
740 struct sockaddr *addr, int *addr_len, int peer)
741{
742 int err;
743 struct sock *sk;
744 struct vsock_sock *vsk;
745 struct sockaddr_vm *vm_addr;
746
747 sk = sock->sk;
748 vsk = vsock_sk(sk);
749 err = 0;
750
751 lock_sock(sk);
752
753 if (peer) {
754 if (sock->state != SS_CONNECTED) {
755 err = -ENOTCONN;
756 goto out;
757 }
758 vm_addr = &vsk->remote_addr;
759 } else {
760 vm_addr = &vsk->local_addr;
761 }
762
763 if (!vm_addr) {
764 err = -EINVAL;
765 goto out;
766 }
767
768 /* sys_getsockname() and sys_getpeername() pass us a
769 * MAX_SOCK_ADDR-sized buffer and don't set addr_len. Unfortunately
770 * that macro is defined in socket.c instead of .h, so we hardcode its
771 * value here.
772 */
773 BUILD_BUG_ON(sizeof(*vm_addr) > 128);
774 memcpy(addr, vm_addr, sizeof(*vm_addr));
775 *addr_len = sizeof(*vm_addr);
776
777out:
778 release_sock(sk);
779 return err;
780}
781
782static int vsock_shutdown(struct socket *sock, int mode)
783{
784 int err;
785 struct sock *sk;
786
787 /* User level uses SHUT_RD (0) and SHUT_WR (1), but the kernel uses
788 * RCV_SHUTDOWN (1) and SEND_SHUTDOWN (2), so we must increment mode
789 * here like the other address families do. Note also that the
790 * increment makes SHUT_RDWR (2) into RCV_SHUTDOWN | SEND_SHUTDOWN (3),
791 * which is what we want.
792 */
793 mode++;
794
795 if ((mode & ~SHUTDOWN_MASK) || !mode)
796 return -EINVAL;
797
798 /* If this is a STREAM socket and it is not connected then bail out
799 * immediately. If it is a DGRAM socket then we must first kick the
800 * socket so that it wakes up from any sleeping calls, for example
801 * recv(), and then afterwards return the error.
802 */
803
804 sk = sock->sk;
805 if (sock->state == SS_UNCONNECTED) {
806 err = -ENOTCONN;
807 if (sk->sk_type == SOCK_STREAM)
808 return err;
809 } else {
810 sock->state = SS_DISCONNECTING;
811 err = 0;
812 }
813
814 /* Receive and send shutdowns are treated alike. */
815 mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN);
816 if (mode) {
817 lock_sock(sk);
818 sk->sk_shutdown |= mode;
819 sk->sk_state_change(sk);
820 release_sock(sk);
821
822 if (sk->sk_type == SOCK_STREAM) {
823 sock_reset_flag(sk, SOCK_DONE);
824 vsock_send_shutdown(sk, mode);
825 }
826 }
827
828 return err;
829}
830
831static unsigned int vsock_poll(struct file *file, struct socket *sock,
832 poll_table *wait)
833{
834 struct sock *sk;
835 unsigned int mask;
836 struct vsock_sock *vsk;
837
838 sk = sock->sk;
839 vsk = vsock_sk(sk);
840
841 poll_wait(file, sk_sleep(sk), wait);
842 mask = 0;
843
844 if (sk->sk_err)
845 /* Signify that there has been an error on this socket. */
846 mask |= POLLERR;
847
848 /* INET sockets treat local write shutdown and peer write shutdown as a
849 * case of POLLHUP set.
850 */
851 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
852 ((sk->sk_shutdown & SEND_SHUTDOWN) &&
853 (vsk->peer_shutdown & SEND_SHUTDOWN))) {
854 mask |= POLLHUP;
855 }
856
857 if (sk->sk_shutdown & RCV_SHUTDOWN ||
858 vsk->peer_shutdown & SEND_SHUTDOWN) {
859 mask |= POLLRDHUP;
860 }
861
862 if (sock->type == SOCK_DGRAM) {
863 /* For datagram sockets we can read if there is something in
864 * the queue and write as long as the socket isn't shutdown for
865 * sending.
866 */
867 if (!skb_queue_empty(&sk->sk_receive_queue) ||
868 (sk->sk_shutdown & RCV_SHUTDOWN)) {
869 mask |= POLLIN | POLLRDNORM;
870 }
871
872 if (!(sk->sk_shutdown & SEND_SHUTDOWN))
873 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
874
875 } else if (sock->type == SOCK_STREAM) {
876 lock_sock(sk);
877
878 /* Listening sockets that have connections in their accept
879 * queue can be read.
880 */
881 if (sk->sk_state == SS_LISTEN
882 && !vsock_is_accept_queue_empty(sk))
883 mask |= POLLIN | POLLRDNORM;
884
885 /* If there is something in the queue then we can read. */
886 if (transport->stream_is_active(vsk) &&
887 !(sk->sk_shutdown & RCV_SHUTDOWN)) {
888 bool data_ready_now = false;
889 int ret = transport->notify_poll_in(
890 vsk, 1, &data_ready_now);
891 if (ret < 0) {
892 mask |= POLLERR;
893 } else {
894 if (data_ready_now)
895 mask |= POLLIN | POLLRDNORM;
896
897 }
898 }
899
900 /* Sockets whose connections have been closed, reset, or
901 * terminated should also be considered read, and we check the
902 * shutdown flag for that.
903 */
904 if (sk->sk_shutdown & RCV_SHUTDOWN ||
905 vsk->peer_shutdown & SEND_SHUTDOWN) {
906 mask |= POLLIN | POLLRDNORM;
907 }
908
909 /* Connected sockets that can produce data can be written. */
910 if (sk->sk_state == SS_CONNECTED) {
911 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
912 bool space_avail_now = false;
913 int ret = transport->notify_poll_out(
914 vsk, 1, &space_avail_now);
915 if (ret < 0) {
916 mask |= POLLERR;
917 } else {
918 if (space_avail_now)
919 /* Remove POLLWRBAND since INET
920 * sockets are not setting it.
921 */
922 mask |= POLLOUT | POLLWRNORM;
923
924 }
925 }
926 }
927
928 /* Simulate INET socket poll behaviors, which sets
929 * POLLOUT|POLLWRNORM when peer is closed and nothing to read,
930 * but local send is not shutdown.
931 */
932 if (sk->sk_state == SS_UNCONNECTED) {
933 if (!(sk->sk_shutdown & SEND_SHUTDOWN))
934 mask |= POLLOUT | POLLWRNORM;
935
936 }
937
938 release_sock(sk);
939 }
940
941 return mask;
942}
943
944static int vsock_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
945 struct msghdr *msg, size_t len)
946{
947 int err;
948 struct sock *sk;
949 struct vsock_sock *vsk;
950 struct sockaddr_vm *remote_addr;
951
952 if (msg->msg_flags & MSG_OOB)
953 return -EOPNOTSUPP;
954
955 /* For now, MSG_DONTWAIT is always assumed... */
956 err = 0;
957 sk = sock->sk;
958 vsk = vsock_sk(sk);
959
960 lock_sock(sk);
961
962 if (!vsock_addr_bound(&vsk->local_addr)) {
963 struct sockaddr_vm local_addr;
964
965 vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
966 err = __vsock_bind(sk, &local_addr);
967 if (err != 0)
968 goto out;
969
970 }
971
972 /* If the provided message contains an address, use that. Otherwise
973 * fall back on the socket's remote handle (if it has been connected).
974 */
975 if (msg->msg_name &&
976 vsock_addr_cast(msg->msg_name, msg->msg_namelen,
977 &remote_addr) == 0) {
978 /* Ensure this address is of the right type and is a valid
979 * destination.
980 */
981
982 if (remote_addr->svm_cid == VMADDR_CID_ANY)
983 remote_addr->svm_cid = transport->get_local_cid();
984
985 if (!vsock_addr_bound(remote_addr)) {
986 err = -EINVAL;
987 goto out;
988 }
989 } else if (sock->state == SS_CONNECTED) {
990 remote_addr = &vsk->remote_addr;
991
992 if (remote_addr->svm_cid == VMADDR_CID_ANY)
993 remote_addr->svm_cid = transport->get_local_cid();
994
995 /* XXX Should connect() or this function ensure remote_addr is
996 * bound?
997 */
998 if (!vsock_addr_bound(&vsk->remote_addr)) {
999 err = -EINVAL;
1000 goto out;
1001 }
1002 } else {
1003 err = -EINVAL;
1004 goto out;
1005 }
1006
1007 if (!transport->dgram_allow(remote_addr->svm_cid,
1008 remote_addr->svm_port)) {
1009 err = -EINVAL;
1010 goto out;
1011 }
1012
1013 err = transport->dgram_enqueue(vsk, remote_addr, msg->msg_iov, len);
1014
1015out:
1016 release_sock(sk);
1017 return err;
1018}
1019
1020static int vsock_dgram_connect(struct socket *sock,
1021 struct sockaddr *addr, int addr_len, int flags)
1022{
1023 int err;
1024 struct sock *sk;
1025 struct vsock_sock *vsk;
1026 struct sockaddr_vm *remote_addr;
1027
1028 sk = sock->sk;
1029 vsk = vsock_sk(sk);
1030
1031 err = vsock_addr_cast(addr, addr_len, &remote_addr);
1032 if (err == -EAFNOSUPPORT && remote_addr->svm_family == AF_UNSPEC) {
1033 lock_sock(sk);
1034 vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY,
1035 VMADDR_PORT_ANY);
1036 sock->state = SS_UNCONNECTED;
1037 release_sock(sk);
1038 return 0;
1039 } else if (err != 0)
1040 return -EINVAL;
1041
1042 lock_sock(sk);
1043
1044 if (!vsock_addr_bound(&vsk->local_addr)) {
1045 struct sockaddr_vm local_addr;
1046
1047 vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
1048 err = __vsock_bind(sk, &local_addr);
1049 if (err != 0)
1050 goto out;
1051
1052 }
1053
1054 if (!transport->dgram_allow(remote_addr->svm_cid,
1055 remote_addr->svm_port)) {
1056 err = -EINVAL;
1057 goto out;
1058 }
1059
1060 memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
1061 sock->state = SS_CONNECTED;
1062
1063out:
1064 release_sock(sk);
1065 return err;
1066}
1067
1068static int vsock_dgram_recvmsg(struct kiocb *kiocb, struct socket *sock,
1069 struct msghdr *msg, size_t len, int flags)
1070{
1071 return transport->dgram_dequeue(kiocb, vsock_sk(sock->sk), msg, len,
1072 flags);
1073}
1074
1075static const struct proto_ops vsock_dgram_ops = {
1076 .family = PF_VSOCK,
1077 .owner = THIS_MODULE,
1078 .release = vsock_release,
1079 .bind = vsock_bind,
1080 .connect = vsock_dgram_connect,
1081 .socketpair = sock_no_socketpair,
1082 .accept = sock_no_accept,
1083 .getname = vsock_getname,
1084 .poll = vsock_poll,
1085 .ioctl = sock_no_ioctl,
1086 .listen = sock_no_listen,
1087 .shutdown = vsock_shutdown,
1088 .setsockopt = sock_no_setsockopt,
1089 .getsockopt = sock_no_getsockopt,
1090 .sendmsg = vsock_dgram_sendmsg,
1091 .recvmsg = vsock_dgram_recvmsg,
1092 .mmap = sock_no_mmap,
1093 .sendpage = sock_no_sendpage,
1094};
1095
1096static void vsock_connect_timeout(struct work_struct *work)
1097{
1098 struct sock *sk;
1099 struct vsock_sock *vsk;
1100
1101 vsk = container_of(work, struct vsock_sock, dwork.work);
1102 sk = sk_vsock(vsk);
1103
1104 lock_sock(sk);
1105 if (sk->sk_state == SS_CONNECTING &&
1106 (sk->sk_shutdown != SHUTDOWN_MASK)) {
1107 sk->sk_state = SS_UNCONNECTED;
1108 sk->sk_err = ETIMEDOUT;
1109 sk->sk_error_report(sk);
1110 }
1111 release_sock(sk);
1112
1113 sock_put(sk);
1114}
1115
1116static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
1117 int addr_len, int flags)
1118{
1119 int err;
1120 struct sock *sk;
1121 struct vsock_sock *vsk;
1122 struct sockaddr_vm *remote_addr;
1123 long timeout;
1124 DEFINE_WAIT(wait);
1125
1126 err = 0;
1127 sk = sock->sk;
1128 vsk = vsock_sk(sk);
1129
1130 lock_sock(sk);
1131
1132 /* XXX AF_UNSPEC should make us disconnect like AF_INET. */
1133 switch (sock->state) {
1134 case SS_CONNECTED:
1135 err = -EISCONN;
1136 goto out;
1137 case SS_DISCONNECTING:
1138 err = -EINVAL;
1139 goto out;
1140 case SS_CONNECTING:
1141 /* This continues on so we can move sock into the SS_CONNECTED
1142 * state once the connection has completed (at which point err
1143 * will be set to zero also). Otherwise, we will either wait
1144 * for the connection or return -EALREADY should this be a
1145 * non-blocking call.
1146 */
1147 err = -EALREADY;
1148 break;
1149 default:
1150 if ((sk->sk_state == SS_LISTEN) ||
1151 vsock_addr_cast(addr, addr_len, &remote_addr) != 0) {
1152 err = -EINVAL;
1153 goto out;
1154 }
1155
1156 /* The hypervisor and well-known contexts do not have socket
1157 * endpoints.
1158 */
1159 if (!transport->stream_allow(remote_addr->svm_cid,
1160 remote_addr->svm_port)) {
1161 err = -ENETUNREACH;
1162 goto out;
1163 }
1164
1165 /* Set the remote address that we are connecting to. */
1166 memcpy(&vsk->remote_addr, remote_addr,
1167 sizeof(vsk->remote_addr));
1168
1169 /* Autobind this socket to the local address if necessary. */
1170 if (!vsock_addr_bound(&vsk->local_addr)) {
1171 struct sockaddr_vm local_addr;
1172
1173 vsock_addr_init(&local_addr, VMADDR_CID_ANY,
1174 VMADDR_PORT_ANY);
1175 err = __vsock_bind(sk, &local_addr);
1176 if (err != 0)
1177 goto out;
1178
1179 }
1180
1181 sk->sk_state = SS_CONNECTING;
1182
1183 err = transport->connect(vsk);
1184 if (err < 0)
1185 goto out;
1186
1187 /* Mark sock as connecting and set the error code to in
1188 * progress in case this is a non-blocking connect.
1189 */
1190 sock->state = SS_CONNECTING;
1191 err = -EINPROGRESS;
1192 }
1193
1194 /* The receive path will handle all communication until we are able to
1195 * enter the connected state. Here we wait for the connection to be
1196 * completed or a notification of an error.
1197 */
1198 timeout = vsk->connect_timeout;
1199 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1200
1201 while (sk->sk_state != SS_CONNECTED && sk->sk_err == 0) {
1202 if (flags & O_NONBLOCK) {
1203 /* If we're not going to block, we schedule a timeout
1204 * function to generate a timeout on the connection
1205 * attempt, in case the peer doesn't respond in a
1206 * timely manner. We hold on to the socket until the
1207 * timeout fires.
1208 */
1209 sock_hold(sk);
1210 INIT_DELAYED_WORK(&vsk->dwork,
1211 vsock_connect_timeout);
1212 schedule_delayed_work(&vsk->dwork, timeout);
1213
1214 /* Skip ahead to preserve error code set above. */
1215 goto out_wait;
1216 }
1217
1218 release_sock(sk);
1219 timeout = schedule_timeout(timeout);
1220 lock_sock(sk);
1221
1222 if (signal_pending(current)) {
1223 err = sock_intr_errno(timeout);
1224 goto out_wait_error;
1225 } else if (timeout == 0) {
1226 err = -ETIMEDOUT;
1227 goto out_wait_error;
1228 }
1229
1230 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1231 }
1232
1233 if (sk->sk_err) {
1234 err = -sk->sk_err;
1235 goto out_wait_error;
1236 } else
1237 err = 0;
1238
1239out_wait:
1240 finish_wait(sk_sleep(sk), &wait);
1241out:
1242 release_sock(sk);
1243 return err;
1244
1245out_wait_error:
1246 sk->sk_state = SS_UNCONNECTED;
1247 sock->state = SS_UNCONNECTED;
1248 goto out_wait;
1249}
1250
1251static int vsock_accept(struct socket *sock, struct socket *newsock, int flags)
1252{
1253 struct sock *listener;
1254 int err;
1255 struct sock *connected;
1256 struct vsock_sock *vconnected;
1257 long timeout;
1258 DEFINE_WAIT(wait);
1259
1260 err = 0;
1261 listener = sock->sk;
1262
1263 lock_sock(listener);
1264
1265 if (sock->type != SOCK_STREAM) {
1266 err = -EOPNOTSUPP;
1267 goto out;
1268 }
1269
1270 if (listener->sk_state != SS_LISTEN) {
1271 err = -EINVAL;
1272 goto out;
1273 }
1274
1275	/* Wait for child sockets to appear; these are the new sockets
1276 * created upon connection establishment.
1277 */
1278 timeout = sock_sndtimeo(listener, flags & O_NONBLOCK);
1279 prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
1280
1281 while ((connected = vsock_dequeue_accept(listener)) == NULL &&
1282 listener->sk_err == 0) {
1283 release_sock(listener);
1284 timeout = schedule_timeout(timeout);
1285 lock_sock(listener);
1286
1287 if (signal_pending(current)) {
1288 err = sock_intr_errno(timeout);
1289 goto out_wait;
1290 } else if (timeout == 0) {
1291 err = -EAGAIN;
1292 goto out_wait;
1293 }
1294
1295 prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
1296 }
1297
1298 if (listener->sk_err)
1299 err = -listener->sk_err;
1300
1301 if (connected) {
1302 listener->sk_ack_backlog--;
1303
1304 lock_sock(connected);
1305 vconnected = vsock_sk(connected);
1306
1307 /* If the listener socket has received an error, then we should
1308 * reject this socket and return. Note that we simply mark the
1309 * socket rejected, drop our reference, and let the cleanup
1310 * function handle the cleanup; the fact that we found it in
1311 * the listener's accept queue guarantees that the cleanup
1312 * function hasn't run yet.
1313 */
1314 if (err) {
1315 vconnected->rejected = true;
1316 release_sock(connected);
1317 sock_put(connected);
1318 goto out_wait;
1319 }
1320
1321 newsock->state = SS_CONNECTED;
1322 sock_graft(connected, newsock);
1323 release_sock(connected);
1324 sock_put(connected);
1325 }
1326
1327out_wait:
1328 finish_wait(sk_sleep(listener), &wait);
1329out:
1330 release_sock(listener);
1331 return err;
1332}
1333
1334static int vsock_listen(struct socket *sock, int backlog)
1335{
1336 int err;
1337 struct sock *sk;
1338 struct vsock_sock *vsk;
1339
1340 sk = sock->sk;
1341
1342 lock_sock(sk);
1343
1344 if (sock->type != SOCK_STREAM) {
1345 err = -EOPNOTSUPP;
1346 goto out;
1347 }
1348
1349 if (sock->state != SS_UNCONNECTED) {
1350 err = -EINVAL;
1351 goto out;
1352 }
1353
1354 vsk = vsock_sk(sk);
1355
1356 if (!vsock_addr_bound(&vsk->local_addr)) {
1357 err = -EINVAL;
1358 goto out;
1359 }
1360
1361 sk->sk_max_ack_backlog = backlog;
1362 sk->sk_state = SS_LISTEN;
1363
1364 err = 0;
1365
1366out:
1367 release_sock(sk);
1368 return err;
1369}
1370
1371static int vsock_stream_setsockopt(struct socket *sock,
1372 int level,
1373 int optname,
1374 char __user *optval,
1375 unsigned int optlen)
1376{
1377 int err;
1378 struct sock *sk;
1379 struct vsock_sock *vsk;
1380 u64 val;
1381
1382 if (level != AF_VSOCK)
1383 return -ENOPROTOOPT;
1384
1385#define COPY_IN(_v) \
1386 do { \
1387 if (optlen < sizeof(_v)) { \
1388 err = -EINVAL; \
1389 goto exit; \
1390 } \
1391 if (copy_from_user(&_v, optval, sizeof(_v)) != 0) { \
1392 err = -EFAULT; \
1393 goto exit; \
1394 } \
1395 } while (0)
1396
1397 err = 0;
1398 sk = sock->sk;
1399 vsk = vsock_sk(sk);
1400
1401 lock_sock(sk);
1402
1403 switch (optname) {
1404 case SO_VM_SOCKETS_BUFFER_SIZE:
1405 COPY_IN(val);
1406 transport->set_buffer_size(vsk, val);
1407 break;
1408
1409 case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
1410 COPY_IN(val);
1411 transport->set_max_buffer_size(vsk, val);
1412 break;
1413
1414 case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
1415 COPY_IN(val);
1416 transport->set_min_buffer_size(vsk, val);
1417 break;
1418
1419 case SO_VM_SOCKETS_CONNECT_TIMEOUT: {
1420 struct timeval tv;
1421 COPY_IN(tv);
1422 if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC &&
1423 tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) {
1424 vsk->connect_timeout = tv.tv_sec * HZ +
1425 DIV_ROUND_UP(tv.tv_usec, (1000000 / HZ));
1426 if (vsk->connect_timeout == 0)
1427 vsk->connect_timeout =
1428 VSOCK_DEFAULT_CONNECT_TIMEOUT;
1429
1430 } else {
1431 err = -ERANGE;
1432 }
1433 break;
1434 }
1435
1436 default:
1437 err = -ENOPROTOOPT;
1438 break;
1439 }
1440
1441#undef COPY_IN
1442
1443exit:
1444 release_sock(sk);
1445 return err;
1446}
1447
1448static int vsock_stream_getsockopt(struct socket *sock,
1449 int level, int optname,
1450 char __user *optval,
1451 int __user *optlen)
1452{
1453 int err;
1454 int len;
1455 struct sock *sk;
1456 struct vsock_sock *vsk;
1457 u64 val;
1458
1459 if (level != AF_VSOCK)
1460 return -ENOPROTOOPT;
1461
1462 err = get_user(len, optlen);
1463 if (err != 0)
1464 return err;
1465
1466#define COPY_OUT(_v) \
1467 do { \
1468 if (len < sizeof(_v)) \
1469 return -EINVAL; \
1470 \
1471 len = sizeof(_v); \
1472 if (copy_to_user(optval, &_v, len) != 0) \
1473 return -EFAULT; \
1474 \
1475 } while (0)
1476
1477 err = 0;
1478 sk = sock->sk;
1479 vsk = vsock_sk(sk);
1480
1481 switch (optname) {
1482 case SO_VM_SOCKETS_BUFFER_SIZE:
1483 val = transport->get_buffer_size(vsk);
1484 COPY_OUT(val);
1485 break;
1486
1487 case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
1488 val = transport->get_max_buffer_size(vsk);
1489 COPY_OUT(val);
1490 break;
1491
1492 case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
1493 val = transport->get_min_buffer_size(vsk);
1494 COPY_OUT(val);
1495 break;
1496
1497 case SO_VM_SOCKETS_CONNECT_TIMEOUT: {
1498 struct timeval tv;
1499 tv.tv_sec = vsk->connect_timeout / HZ;
1500 tv.tv_usec =
1501 (vsk->connect_timeout -
1502 tv.tv_sec * HZ) * (1000000 / HZ);
1503 COPY_OUT(tv);
1504 break;
1505 }
1506 default:
1507 return -ENOPROTOOPT;
1508 }
1509
1510 err = put_user(len, optlen);
1511 if (err != 0)
1512 return -EFAULT;
1513
1514#undef COPY_OUT
1515
1516 return 0;
1517}
1518
1519static int vsock_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1520 struct msghdr *msg, size_t len)
1521{
1522 struct sock *sk;
1523 struct vsock_sock *vsk;
1524 ssize_t total_written;
1525 long timeout;
1526 int err;
1527 struct vsock_transport_send_notify_data send_data;
1528
1529 DEFINE_WAIT(wait);
1530
1531 sk = sock->sk;
1532 vsk = vsock_sk(sk);
1533 total_written = 0;
1534 err = 0;
1535
1536 if (msg->msg_flags & MSG_OOB)
1537 return -EOPNOTSUPP;
1538
1539 lock_sock(sk);
1540
1541 /* Callers should not provide a destination with stream sockets. */
1542 if (msg->msg_namelen) {
1543 err = sk->sk_state == SS_CONNECTED ? -EISCONN : -EOPNOTSUPP;
1544 goto out;
1545 }
1546
1547 /* Send data only if both sides are not shutdown in the direction. */
1548 if (sk->sk_shutdown & SEND_SHUTDOWN ||
1549 vsk->peer_shutdown & RCV_SHUTDOWN) {
1550 err = -EPIPE;
1551 goto out;
1552 }
1553
1554 if (sk->sk_state != SS_CONNECTED ||
1555 !vsock_addr_bound(&vsk->local_addr)) {
1556 err = -ENOTCONN;
1557 goto out;
1558 }
1559
1560 if (!vsock_addr_bound(&vsk->remote_addr)) {
1561 err = -EDESTADDRREQ;
1562 goto out;
1563 }
1564
1565 /* Wait for room in the produce queue to enqueue our user's data. */
1566 timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1567
1568 err = transport->notify_send_init(vsk, &send_data);
1569 if (err < 0)
1570 goto out;
1571
1572 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1573
1574 while (total_written < len) {
1575 ssize_t written;
1576
1577 while (vsock_stream_has_space(vsk) == 0 &&
1578 sk->sk_err == 0 &&
1579 !(sk->sk_shutdown & SEND_SHUTDOWN) &&
1580 !(vsk->peer_shutdown & RCV_SHUTDOWN)) {
1581
1582 /* Don't wait for non-blocking sockets. */
1583 if (timeout == 0) {
1584 err = -EAGAIN;
1585 goto out_wait;
1586 }
1587
1588 err = transport->notify_send_pre_block(vsk, &send_data);
1589 if (err < 0)
1590 goto out_wait;
1591
1592 release_sock(sk);
1593 timeout = schedule_timeout(timeout);
1594 lock_sock(sk);
1595 if (signal_pending(current)) {
1596 err = sock_intr_errno(timeout);
1597 goto out_wait;
1598 } else if (timeout == 0) {
1599 err = -EAGAIN;
1600 goto out_wait;
1601 }
1602
1603 prepare_to_wait(sk_sleep(sk), &wait,
1604 TASK_INTERRUPTIBLE);
1605 }
1606
1607 /* These checks occur both as part of and after the loop
1608 * conditional since we need to check before and after
1609 * sleeping.
1610 */
1611 if (sk->sk_err) {
1612 err = -sk->sk_err;
1613 goto out_wait;
1614 } else if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
1615 (vsk->peer_shutdown & RCV_SHUTDOWN)) {
1616 err = -EPIPE;
1617 goto out_wait;
1618 }
1619
1620 err = transport->notify_send_pre_enqueue(vsk, &send_data);
1621 if (err < 0)
1622 goto out_wait;
1623
1624 /* Note that enqueue will only write as many bytes as are free
1625 * in the produce queue, so we don't need to ensure len is
1626 * smaller than the queue size. It is the caller's
1627 * responsibility to check how many bytes we were able to send.
1628 */
1629
1630 written = transport->stream_enqueue(
1631 vsk, msg->msg_iov,
1632 len - total_written);
1633 if (written < 0) {
1634 err = -ENOMEM;
1635 goto out_wait;
1636 }
1637
1638 total_written += written;
1639
1640 err = transport->notify_send_post_enqueue(
1641 vsk, written, &send_data);
1642 if (err < 0)
1643 goto out_wait;
1644
1645 }
1646
1647out_wait:
1648 if (total_written > 0)
1649 err = total_written;
1650 finish_wait(sk_sleep(sk), &wait);
1651out:
1652 release_sock(sk);
1653 return err;
1654}
1655
1656
1657static int
1658vsock_stream_recvmsg(struct kiocb *kiocb,
1659 struct socket *sock,
1660 struct msghdr *msg, size_t len, int flags)
1661{
1662 struct sock *sk;
1663 struct vsock_sock *vsk;
1664 int err;
1665 size_t target;
1666 ssize_t copied;
1667 long timeout;
1668 struct vsock_transport_recv_notify_data recv_data;
1669
1670 DEFINE_WAIT(wait);
1671
1672 sk = sock->sk;
1673 vsk = vsock_sk(sk);
1674 err = 0;
1675
1676 lock_sock(sk);
1677
1678 if (sk->sk_state != SS_CONNECTED) {
1679 /* Recvmsg is supposed to return 0 if a peer performs an
1680 * orderly shutdown. Differentiate between that case and when a
1681	 * peer has not connected or a local shutdown occurred with the
1682 * SOCK_DONE flag.
1683 */
1684 if (sock_flag(sk, SOCK_DONE))
1685 err = 0;
1686 else
1687 err = -ENOTCONN;
1688
1689 goto out;
1690 }
1691
1692 if (flags & MSG_OOB) {
1693 err = -EOPNOTSUPP;
1694 goto out;
1695 }
1696
1697 /* We don't check peer_shutdown flag here since peer may actually shut
1698 * down, but there can be data in the queue that a local socket can
1699 * receive.
1700 */
1701 if (sk->sk_shutdown & RCV_SHUTDOWN) {
1702 err = 0;
1703 goto out;
1704 }
1705
1706 /* It is valid on Linux to pass in a zero-length receive buffer. This
1707 * is not an error. We may as well bail out now.
1708 */
1709 if (!len) {
1710 err = 0;
1711 goto out;
1712 }
1713
1714 /* We must not copy less than target bytes into the user's buffer
1715 * before returning successfully, so we wait for the consume queue to
1716 * have that much data to consume before dequeueing. Note that this
1717 * makes it impossible to handle cases where target is greater than the
1718 * queue size.
1719 */
1720 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1721 if (target >= transport->stream_rcvhiwat(vsk)) {
1722 err = -ENOMEM;
1723 goto out;
1724 }
1725 timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
1726 copied = 0;
1727
1728 err = transport->notify_recv_init(vsk, target, &recv_data);
1729 if (err < 0)
1730 goto out;
1731
1732 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1733
1734 while (1) {
1735 s64 ready = vsock_stream_has_data(vsk);
1736
1737 if (ready < 0) {
1738 /* Invalid queue pair content. XXX This should be
1739 * changed to a connection reset in a later change.
1740 */
1741
1742 err = -ENOMEM;
1743 goto out_wait;
1744 } else if (ready > 0) {
1745 ssize_t read;
1746
1747 err = transport->notify_recv_pre_dequeue(
1748 vsk, target, &recv_data);
1749 if (err < 0)
1750 break;
1751
1752 read = transport->stream_dequeue(
1753 vsk, msg->msg_iov,
1754 len - copied, flags);
1755 if (read < 0) {
1756 err = -ENOMEM;
1757 break;
1758 }
1759
1760 copied += read;
1761
1762 err = transport->notify_recv_post_dequeue(
1763 vsk, target, read,
1764 !(flags & MSG_PEEK), &recv_data);
1765 if (err < 0)
1766 goto out_wait;
1767
1768 if (read >= target || flags & MSG_PEEK)
1769 break;
1770
1771 target -= read;
1772 } else {
1773 if (sk->sk_err != 0 || (sk->sk_shutdown & RCV_SHUTDOWN)
1774 || (vsk->peer_shutdown & SEND_SHUTDOWN)) {
1775 break;
1776 }
1777 /* Don't wait for non-blocking sockets. */
1778 if (timeout == 0) {
1779 err = -EAGAIN;
1780 break;
1781 }
1782
1783 err = transport->notify_recv_pre_block(
1784 vsk, target, &recv_data);
1785 if (err < 0)
1786 break;
1787
1788 release_sock(sk);
1789 timeout = schedule_timeout(timeout);
1790 lock_sock(sk);
1791
1792 if (signal_pending(current)) {
1793 err = sock_intr_errno(timeout);
1794 break;
1795 } else if (timeout == 0) {
1796 err = -EAGAIN;
1797 break;
1798 }
1799
1800 prepare_to_wait(sk_sleep(sk), &wait,
1801 TASK_INTERRUPTIBLE);
1802 }
1803 }
1804
1805 if (sk->sk_err)
1806 err = -sk->sk_err;
1807 else if (sk->sk_shutdown & RCV_SHUTDOWN)
1808 err = 0;
1809
1810 if (copied > 0) {
1811 /* We only do these additional bookkeeping/notification steps
1812 * if we actually copied something out of the queue pair
1813 * instead of just peeking ahead.
1814 */
1815
1816 if (!(flags & MSG_PEEK)) {
1817 /* If the other side has shutdown for sending and there
1818 * is nothing more to read, then modify the socket
1819 * state.
1820 */
1821 if (vsk->peer_shutdown & SEND_SHUTDOWN) {
1822 if (vsock_stream_has_data(vsk) <= 0) {
1823 sk->sk_state = SS_UNCONNECTED;
1824 sock_set_flag(sk, SOCK_DONE);
1825 sk->sk_state_change(sk);
1826 }
1827 }
1828 }
1829 err = copied;
1830 }
1831
1832out_wait:
1833 finish_wait(sk_sleep(sk), &wait);
1834out:
1835 release_sock(sk);
1836 return err;
1837}
1838
1839static const struct proto_ops vsock_stream_ops = {
1840 .family = PF_VSOCK,
1841 .owner = THIS_MODULE,
1842 .release = vsock_release,
1843 .bind = vsock_bind,
1844 .connect = vsock_stream_connect,
1845 .socketpair = sock_no_socketpair,
1846 .accept = vsock_accept,
1847 .getname = vsock_getname,
1848 .poll = vsock_poll,
1849 .ioctl = sock_no_ioctl,
1850 .listen = vsock_listen,
1851 .shutdown = vsock_shutdown,
1852 .setsockopt = vsock_stream_setsockopt,
1853 .getsockopt = vsock_stream_getsockopt,
1854 .sendmsg = vsock_stream_sendmsg,
1855 .recvmsg = vsock_stream_recvmsg,
1856 .mmap = sock_no_mmap,
1857 .sendpage = sock_no_sendpage,
1858};
1859
1860static int vsock_create(struct net *net, struct socket *sock,
1861 int protocol, int kern)
1862{
1863 if (!sock)
1864 return -EINVAL;
1865
1866 if (protocol)
1867 return -EPROTONOSUPPORT;
1868
1869 switch (sock->type) {
1870 case SOCK_DGRAM:
1871 sock->ops = &vsock_dgram_ops;
1872 break;
1873 case SOCK_STREAM:
1874 sock->ops = &vsock_stream_ops;
1875 break;
1876 default:
1877 return -ESOCKTNOSUPPORT;
1878 }
1879
1880 sock->state = SS_UNCONNECTED;
1881
1882 return __vsock_create(net, sock, NULL, GFP_KERNEL, 0) ? 0 : -ENOMEM;
1883}
1884
1885static const struct net_proto_family vsock_family_ops = {
1886 .family = AF_VSOCK,
1887 .create = vsock_create,
1888 .owner = THIS_MODULE,
1889};
1890
1891static long vsock_dev_do_ioctl(struct file *filp,
1892 unsigned int cmd, void __user *ptr)
1893{
1894 u32 __user *p = ptr;
1895 int retval = 0;
1896
1897 switch (cmd) {
1898 case IOCTL_VM_SOCKETS_GET_LOCAL_CID:
1899 if (put_user(transport->get_local_cid(), p) != 0)
1900 retval = -EFAULT;
1901 break;
1902
1903 default:
1904 pr_err("Unknown ioctl %d\n", cmd);
1905 retval = -EINVAL;
1906 }
1907
1908 return retval;
1909}
1910
1911static long vsock_dev_ioctl(struct file *filp,
1912 unsigned int cmd, unsigned long arg)
1913{
1914 return vsock_dev_do_ioctl(filp, cmd, (void __user *)arg);
1915}
1916
1917#ifdef CONFIG_COMPAT
1918static long vsock_dev_compat_ioctl(struct file *filp,
1919 unsigned int cmd, unsigned long arg)
1920{
1921 return vsock_dev_do_ioctl(filp, cmd, compat_ptr(arg));
1922}
1923#endif
1924
1925static const struct file_operations vsock_device_ops = {
1926 .owner = THIS_MODULE,
1927 .unlocked_ioctl = vsock_dev_ioctl,
1928#ifdef CONFIG_COMPAT
1929 .compat_ioctl = vsock_dev_compat_ioctl,
1930#endif
1931 .open = nonseekable_open,
1932};
1933
1934static struct miscdevice vsock_device = {
1935 .name = "vsock",
1936 .minor = MISC_DYNAMIC_MINOR,
1937 .fops = &vsock_device_ops,
1938};
1939
1940static int __vsock_core_init(void)
1941{
1942 int err;
1943
1944 vsock_init_tables();
1945
1946 err = misc_register(&vsock_device);
1947 if (err) {
1948 pr_err("Failed to register misc device\n");
1949 return -ENOENT;
1950 }
1951
1952 err = proto_register(&vsock_proto, 1); /* we want our slab */
1953 if (err) {
1954 pr_err("Cannot register vsock protocol\n");
1955 goto err_misc_deregister;
1956 }
1957
1958 err = sock_register(&vsock_family_ops);
1959 if (err) {
1960 pr_err("could not register af_vsock (%d) address family: %d\n",
1961 AF_VSOCK, err);
1962 goto err_unregister_proto;
1963 }
1964
1965 return 0;
1966
1967err_unregister_proto:
1968 proto_unregister(&vsock_proto);
1969err_misc_deregister:
1970 misc_deregister(&vsock_device);
1971 return err;
1972}
1973
1974int vsock_core_init(const struct vsock_transport *t)
1975{
1976 int retval = mutex_lock_interruptible(&vsock_register_mutex);
1977 if (retval)
1978 return retval;
1979
1980 if (transport) {
1981 retval = -EBUSY;
1982 goto out;
1983 }
1984
1985 transport = t;
1986 retval = __vsock_core_init();
1987 if (retval)
1988 transport = NULL;
1989
1990out:
1991 mutex_unlock(&vsock_register_mutex);
1992 return retval;
1993}
1994EXPORT_SYMBOL_GPL(vsock_core_init);
1995
1996void vsock_core_exit(void)
1997{
1998 mutex_lock(&vsock_register_mutex);
1999
2000 misc_deregister(&vsock_device);
2001 sock_unregister(AF_VSOCK);
2002 proto_unregister(&vsock_proto);
2003
2004 /* We do not want the assignment below re-ordered. */
2005 mb();
2006 transport = NULL;
2007
2008 mutex_unlock(&vsock_register_mutex);
2009}
2010EXPORT_SYMBOL_GPL(vsock_core_exit);
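vsock_core_init() and vsock_core_exit() are the entire registration surface a transport sees, and only one transport may be registered at a time (the -EBUSY above). A skeleton of how a hypothetical transport module would use them; the foo_* names are invented for illustration, and a real transport must populate the remaining struct vsock_transport ops before carrying any traffic:

#include <linux/module.h>

#include "af_vsock.h"

static u32 foo_transport_get_local_cid(void)
{
	return 42;	/* placeholder CID for the sketch */
}

static const struct vsock_transport foo_transport = {
	.get_local_cid	= foo_transport_get_local_cid,
	/* init, destruct, release, connect, the dgram/stream ops and
	 * the notify_* callbacks are omitted here for brevity.
	 */
};

static int __init foo_vsock_init(void)
{
	return vsock_core_init(&foo_transport);
}

static void __exit foo_vsock_exit(void)
{
	vsock_core_exit();
}

module_init(foo_vsock_init);
module_exit(foo_vsock_exit);
MODULE_LICENSE("GPL v2");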
2011
2012MODULE_AUTHOR("VMware, Inc.");
2013MODULE_DESCRIPTION("VMware Virtual Socket Family");
2014MODULE_VERSION(VSOCK_DRIVER_VERSION_STRING);
2015MODULE_LICENSE("GPL v2");
diff --git a/net/vmw_vsock/af_vsock.h b/net/vmw_vsock/af_vsock.h
new file mode 100644
index 000000000000..7d64d3609ec9
--- /dev/null
+++ b/net/vmw_vsock/af_vsock.h
@@ -0,0 +1,175 @@
1/*
2 * VMware vSockets Driver
3 *
4 * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation version 2 and no later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 */
15
16#ifndef __AF_VSOCK_H__
17#define __AF_VSOCK_H__
18
19#include <linux/kernel.h>
20#include <linux/workqueue.h>
21#include <linux/vm_sockets.h>
22
23#include "vsock_addr.h"
24
25#define LAST_RESERVED_PORT 1023
26
27#define vsock_sk(__sk) ((struct vsock_sock *)__sk)
28#define sk_vsock(__vsk) (&(__vsk)->sk)
29
30struct vsock_sock {
31 /* sk must be the first member. */
32 struct sock sk;
33 struct sockaddr_vm local_addr;
34 struct sockaddr_vm remote_addr;
35 /* Links for the global tables of bound and connected sockets. */
36 struct list_head bound_table;
37 struct list_head connected_table;
38 /* Accessed without the socket lock held. This means it can never be
39	 * modified outside of socket create or destruct.
40 */
41 bool trusted;
42 bool cached_peer_allow_dgram; /* Dgram communication allowed to
43 * cached peer?
44 */
45 u32 cached_peer; /* Context ID of last dgram destination check. */
46 const struct cred *owner;
47 /* Rest are SOCK_STREAM only. */
48 long connect_timeout;
49 /* Listening socket that this came from. */
50 struct sock *listener;
51 /* Used for pending list and accept queue during connection handshake.
52 * The listening socket is the head for both lists. Sockets created
53 * for connection requests are placed in the pending list until they
54 * are connected, at which point they are put in the accept queue list
55 * so they can be accepted in accept(). If accept() cannot accept the
56 * connection, it is marked as rejected so the cleanup function knows
57 * to clean up the socket.
58 */
59 struct list_head pending_links;
60 struct list_head accept_queue;
61 bool rejected;
62 struct delayed_work dwork;
63 u32 peer_shutdown;
64 bool sent_request;
65 bool ignore_connecting_rst;
66
67 /* Private to transport. */
68 void *trans;
69};
70
71s64 vsock_stream_has_data(struct vsock_sock *vsk);
72s64 vsock_stream_has_space(struct vsock_sock *vsk);
73void vsock_pending_work(struct work_struct *work);
74struct sock *__vsock_create(struct net *net,
75 struct socket *sock,
76 struct sock *parent,
77 gfp_t priority, unsigned short type);
78
79/**** TRANSPORT ****/
80
81struct vsock_transport_recv_notify_data {
82 u64 data1; /* Transport-defined. */
83 u64 data2; /* Transport-defined. */
84 bool notify_on_block;
85};
86
87struct vsock_transport_send_notify_data {
88 u64 data1; /* Transport-defined. */
89 u64 data2; /* Transport-defined. */
90};
91
92struct vsock_transport {
93 /* Initialize/tear-down socket. */
94 int (*init)(struct vsock_sock *, struct vsock_sock *);
95 void (*destruct)(struct vsock_sock *);
96 void (*release)(struct vsock_sock *);
97
98 /* Connections. */
99 int (*connect)(struct vsock_sock *);
100
101 /* DGRAM. */
102 int (*dgram_bind)(struct vsock_sock *, struct sockaddr_vm *);
103 int (*dgram_dequeue)(struct kiocb *kiocb, struct vsock_sock *vsk,
104 struct msghdr *msg, size_t len, int flags);
105 int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *,
106 struct iovec *, size_t len);
107 bool (*dgram_allow)(u32 cid, u32 port);
108
109 /* STREAM. */
110 /* TODO: stream_bind() */
111 ssize_t (*stream_dequeue)(struct vsock_sock *, struct iovec *,
112 size_t len, int flags);
113 ssize_t (*stream_enqueue)(struct vsock_sock *, struct iovec *,
114 size_t len);
115 s64 (*stream_has_data)(struct vsock_sock *);
116 s64 (*stream_has_space)(struct vsock_sock *);
117 u64 (*stream_rcvhiwat)(struct vsock_sock *);
118 bool (*stream_is_active)(struct vsock_sock *);
119 bool (*stream_allow)(u32 cid, u32 port);
120
121 /* Notification. */
122 int (*notify_poll_in)(struct vsock_sock *, size_t, bool *);
123 int (*notify_poll_out)(struct vsock_sock *, size_t, bool *);
124 int (*notify_recv_init)(struct vsock_sock *, size_t,
125 struct vsock_transport_recv_notify_data *);
126 int (*notify_recv_pre_block)(struct vsock_sock *, size_t,
127 struct vsock_transport_recv_notify_data *);
128 int (*notify_recv_pre_dequeue)(struct vsock_sock *, size_t,
129 struct vsock_transport_recv_notify_data *);
130 int (*notify_recv_post_dequeue)(struct vsock_sock *, size_t,
131 ssize_t, bool, struct vsock_transport_recv_notify_data *);
132 int (*notify_send_init)(struct vsock_sock *,
133 struct vsock_transport_send_notify_data *);
134 int (*notify_send_pre_block)(struct vsock_sock *,
135 struct vsock_transport_send_notify_data *);
136 int (*notify_send_pre_enqueue)(struct vsock_sock *,
137 struct vsock_transport_send_notify_data *);
138 int (*notify_send_post_enqueue)(struct vsock_sock *, ssize_t,
139 struct vsock_transport_send_notify_data *);
140
141 /* Shutdown. */
142 int (*shutdown)(struct vsock_sock *, int);
143
144 /* Buffer sizes. */
145 void (*set_buffer_size)(struct vsock_sock *, u64);
146 void (*set_min_buffer_size)(struct vsock_sock *, u64);
147 void (*set_max_buffer_size)(struct vsock_sock *, u64);
148 u64 (*get_buffer_size)(struct vsock_sock *);
149 u64 (*get_min_buffer_size)(struct vsock_sock *);
150 u64 (*get_max_buffer_size)(struct vsock_sock *);
151
152 /* Addressing. */
153 u32 (*get_local_cid)(void);
154};
155
156/**** CORE ****/
157
158int vsock_core_init(const struct vsock_transport *t);
159void vsock_core_exit(void);
160
161/**** UTILS ****/
162
163void vsock_release_pending(struct sock *pending);
164void vsock_add_pending(struct sock *listener, struct sock *pending);
165void vsock_remove_pending(struct sock *listener, struct sock *pending);
166void vsock_enqueue_accept(struct sock *listener, struct sock *connected);
167void vsock_insert_connected(struct vsock_sock *vsk);
168void vsock_remove_bound(struct vsock_sock *vsk);
169void vsock_remove_connected(struct vsock_sock *vsk);
170struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr);
171struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
172 struct sockaddr_vm *dst);
173void vsock_for_each_connected_socket(void (*fn)(struct sock *sk));
174
175#endif /* __AF_VSOCK_H__ */
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
new file mode 100644
index 000000000000..e8a87cf37072
--- /dev/null
+++ b/net/vmw_vsock/vmci_transport.c
@@ -0,0 +1,2157 @@
1/*
2 * VMware vSockets Driver
3 *
4 * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation version 2 and no later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 */
15
16#include <linux/types.h>
17
18#define EXPORT_SYMTAB
19#include <linux/bitops.h>
20#include <linux/cred.h>
21#include <linux/init.h>
22#include <linux/io.h>
23#include <linux/kernel.h>
24#include <linux/kmod.h>
25#include <linux/list.h>
26#include <linux/miscdevice.h>
27#include <linux/module.h>
28#include <linux/mutex.h>
29#include <linux/net.h>
30#include <linux/poll.h>
31#include <linux/skbuff.h>
32#include <linux/smp.h>
33#include <linux/socket.h>
34#include <linux/stddef.h>
35#include <linux/unistd.h>
36#include <linux/wait.h>
37#include <linux/workqueue.h>
38#include <net/sock.h>
39
40#include "af_vsock.h"
41#include "vmci_transport_notify.h"
42
43static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg);
44static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg);
45static void vmci_transport_peer_attach_cb(u32 sub_id,
46 const struct vmci_event_data *ed,
47 void *client_data);
48static void vmci_transport_peer_detach_cb(u32 sub_id,
49 const struct vmci_event_data *ed,
50 void *client_data);
51static void vmci_transport_recv_pkt_work(struct work_struct *work);
52static int vmci_transport_recv_listen(struct sock *sk,
53 struct vmci_transport_packet *pkt);
54static int vmci_transport_recv_connecting_server(
55 struct sock *sk,
56 struct sock *pending,
57 struct vmci_transport_packet *pkt);
58static int vmci_transport_recv_connecting_client(
59 struct sock *sk,
60 struct vmci_transport_packet *pkt);
61static int vmci_transport_recv_connecting_client_negotiate(
62 struct sock *sk,
63 struct vmci_transport_packet *pkt);
64static int vmci_transport_recv_connecting_client_invalid(
65 struct sock *sk,
66 struct vmci_transport_packet *pkt);
67static int vmci_transport_recv_connected(struct sock *sk,
68 struct vmci_transport_packet *pkt);
69static bool vmci_transport_old_proto_override(bool *old_pkt_proto);
70static u16 vmci_transport_new_proto_supported_versions(void);
71static bool vmci_transport_proto_to_notify_struct(struct sock *sk, u16 *proto,
72 bool old_pkt_proto);
73
74struct vmci_transport_recv_pkt_info {
75 struct work_struct work;
76 struct sock *sk;
77 struct vmci_transport_packet pkt;
78};
79
80static struct vmci_handle vmci_transport_stream_handle = { VMCI_INVALID_ID,
81 VMCI_INVALID_ID };
82static u32 vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID;
83
84static int PROTOCOL_OVERRIDE = -1;
85
86#define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN 128
87#define VMCI_TRANSPORT_DEFAULT_QP_SIZE 262144
88#define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX 262144
89
90/* The default peer timeout indicates how long we will wait for a peer response
91 * to a control message.
92 */
93#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)
94
95#define SS_LISTEN 255
96
97/* Helper function to convert from a VMCI error code to a VSock error code. */
98
99static s32 vmci_transport_error_to_vsock_error(s32 vmci_error)
100{
101 int err;
102
103 switch (vmci_error) {
104 case VMCI_ERROR_NO_MEM:
105 err = ENOMEM;
106 break;
107 case VMCI_ERROR_DUPLICATE_ENTRY:
108 case VMCI_ERROR_ALREADY_EXISTS:
109 err = EADDRINUSE;
110 break;
111 case VMCI_ERROR_NO_ACCESS:
112 err = EPERM;
113 break;
114 case VMCI_ERROR_NO_RESOURCES:
115 err = ENOBUFS;
116 break;
117 case VMCI_ERROR_INVALID_RESOURCE:
118 err = EHOSTUNREACH;
119 break;
120 case VMCI_ERROR_INVALID_ARGS:
121 default:
122 err = EINVAL;
123 }
124
125 return err > 0 ? -err : err;
126}
127
128static inline void
129vmci_transport_packet_init(struct vmci_transport_packet *pkt,
130 struct sockaddr_vm *src,
131 struct sockaddr_vm *dst,
132 u8 type,
133 u64 size,
134 u64 mode,
135 struct vmci_transport_waiting_info *wait,
136 u16 proto,
137 struct vmci_handle handle)
138{
139	/* We register the stream control handler as an any-cid handle so we
140	 * must always send from a source address of VMADDR_CID_ANY.
141 */
142 pkt->dg.src = vmci_make_handle(VMADDR_CID_ANY,
143 VMCI_TRANSPORT_PACKET_RID);
144 pkt->dg.dst = vmci_make_handle(dst->svm_cid,
145 VMCI_TRANSPORT_PACKET_RID);
146 pkt->dg.payload_size = sizeof(*pkt) - sizeof(pkt->dg);
147 pkt->version = VMCI_TRANSPORT_PACKET_VERSION;
148 pkt->type = type;
149 pkt->src_port = src->svm_port;
150 pkt->dst_port = dst->svm_port;
151 memset(&pkt->proto, 0, sizeof(pkt->proto));
152 memset(&pkt->_reserved2, 0, sizeof(pkt->_reserved2));
153
154 switch (pkt->type) {
155 case VMCI_TRANSPORT_PACKET_TYPE_INVALID:
156 pkt->u.size = 0;
157 break;
158
159 case VMCI_TRANSPORT_PACKET_TYPE_REQUEST:
160 case VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE:
161 pkt->u.size = size;
162 break;
163
164 case VMCI_TRANSPORT_PACKET_TYPE_OFFER:
165 case VMCI_TRANSPORT_PACKET_TYPE_ATTACH:
166 pkt->u.handle = handle;
167 break;
168
169 case VMCI_TRANSPORT_PACKET_TYPE_WROTE:
170 case VMCI_TRANSPORT_PACKET_TYPE_READ:
171 case VMCI_TRANSPORT_PACKET_TYPE_RST:
172 pkt->u.size = 0;
173 break;
174
175 case VMCI_TRANSPORT_PACKET_TYPE_SHUTDOWN:
176 pkt->u.mode = mode;
177 break;
178
179 case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ:
180 case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE:
181 memcpy(&pkt->u.wait, wait, sizeof(pkt->u.wait));
182 break;
183
184 case VMCI_TRANSPORT_PACKET_TYPE_REQUEST2:
185 case VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2:
186 pkt->u.size = size;
187 pkt->proto = proto;
188 break;
189 }
190}
191
192static inline void
193vmci_transport_packet_get_addresses(struct vmci_transport_packet *pkt,
194 struct sockaddr_vm *local,
195 struct sockaddr_vm *remote)
196{
197 vsock_addr_init(local, pkt->dg.dst.context, pkt->dst_port);
198 vsock_addr_init(remote, pkt->dg.src.context, pkt->src_port);
199}
200
201static int
202__vmci_transport_send_control_pkt(struct vmci_transport_packet *pkt,
203 struct sockaddr_vm *src,
204 struct sockaddr_vm *dst,
205 enum vmci_transport_packet_type type,
206 u64 size,
207 u64 mode,
208 struct vmci_transport_waiting_info *wait,
209 u16 proto,
210 struct vmci_handle handle,
211 bool convert_error)
212{
213 int err;
214
215 vmci_transport_packet_init(pkt, src, dst, type, size, mode, wait,
216 proto, handle);
217 err = vmci_datagram_send(&pkt->dg);
218 if (convert_error && (err < 0))
219 return vmci_transport_error_to_vsock_error(err);
220
221 return err;
222}
223
224static int
225vmci_transport_reply_control_pkt_fast(struct vmci_transport_packet *pkt,
226 enum vmci_transport_packet_type type,
227 u64 size,
228 u64 mode,
229 struct vmci_transport_waiting_info *wait,
230 struct vmci_handle handle)
231{
232 struct vmci_transport_packet reply;
233 struct sockaddr_vm src, dst;
234
235 if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST) {
236 return 0;
237 } else {
238 vmci_transport_packet_get_addresses(pkt, &src, &dst);
239 return __vmci_transport_send_control_pkt(&reply, &src, &dst,
240 type,
241 size, mode, wait,
242 VSOCK_PROTO_INVALID,
243 handle, true);
244 }
245}
246
247static int
248vmci_transport_send_control_pkt_bh(struct sockaddr_vm *src,
249 struct sockaddr_vm *dst,
250 enum vmci_transport_packet_type type,
251 u64 size,
252 u64 mode,
253 struct vmci_transport_waiting_info *wait,
254 struct vmci_handle handle)
255{
256 /* Note that it is safe to use a single packet across all CPUs since
257	 * two tasklets of the same type are guaranteed never to run
258 * simultaneously. If that ever changes, or VMCI stops using tasklets,
259 * we can use per-cpu packets.
260 */
261 static struct vmci_transport_packet pkt;
262
263 return __vmci_transport_send_control_pkt(&pkt, src, dst, type,
264 size, mode, wait,
265 VSOCK_PROTO_INVALID, handle,
266 false);
267}
268
269static int
270vmci_transport_send_control_pkt(struct sock *sk,
271 enum vmci_transport_packet_type type,
272 u64 size,
273 u64 mode,
274 struct vmci_transport_waiting_info *wait,
275 u16 proto,
276 struct vmci_handle handle)
277{
278 struct vmci_transport_packet *pkt;
279 struct vsock_sock *vsk;
280 int err;
281
282 vsk = vsock_sk(sk);
283
284 if (!vsock_addr_bound(&vsk->local_addr))
285 return -EINVAL;
286
287 if (!vsock_addr_bound(&vsk->remote_addr))
288 return -EINVAL;
289
290 pkt = kmalloc(sizeof(*pkt), GFP_KERNEL);
291 if (!pkt)
292 return -ENOMEM;
293
294 err = __vmci_transport_send_control_pkt(pkt, &vsk->local_addr,
295 &vsk->remote_addr, type, size,
296 mode, wait, proto, handle,
297 true);
298 kfree(pkt);
299
300 return err;
301}
302
303static int vmci_transport_send_reset_bh(struct sockaddr_vm *dst,
304 struct sockaddr_vm *src,
305 struct vmci_transport_packet *pkt)
306{
307 if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST)
308 return 0;
309 return vmci_transport_send_control_pkt_bh(
310 dst, src,
311 VMCI_TRANSPORT_PACKET_TYPE_RST, 0,
312 0, NULL, VMCI_INVALID_HANDLE);
313}
314
315static int vmci_transport_send_reset(struct sock *sk,
316 struct vmci_transport_packet *pkt)
317{
318 if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST)
319 return 0;
320 return vmci_transport_send_control_pkt(sk,
321 VMCI_TRANSPORT_PACKET_TYPE_RST,
322 0, 0, NULL, VSOCK_PROTO_INVALID,
323 VMCI_INVALID_HANDLE);
324}
325
326static int vmci_transport_send_negotiate(struct sock *sk, size_t size)
327{
328 return vmci_transport_send_control_pkt(
329 sk,
330 VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE,
331 size, 0, NULL,
332 VSOCK_PROTO_INVALID,
333 VMCI_INVALID_HANDLE);
334}
335
336static int vmci_transport_send_negotiate2(struct sock *sk, size_t size,
337 u16 version)
338{
339 return vmci_transport_send_control_pkt(
340 sk,
341 VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2,
342 size, 0, NULL, version,
343 VMCI_INVALID_HANDLE);
344}
345
346static int vmci_transport_send_qp_offer(struct sock *sk,
347 struct vmci_handle handle)
348{
349 return vmci_transport_send_control_pkt(
350 sk, VMCI_TRANSPORT_PACKET_TYPE_OFFER, 0,
351 0, NULL,
352 VSOCK_PROTO_INVALID, handle);
353}
354
355static int vmci_transport_send_attach(struct sock *sk,
356 struct vmci_handle handle)
357{
358 return vmci_transport_send_control_pkt(
359 sk, VMCI_TRANSPORT_PACKET_TYPE_ATTACH,
360 0, 0, NULL, VSOCK_PROTO_INVALID,
361 handle);
362}
363
364static int vmci_transport_reply_reset(struct vmci_transport_packet *pkt)
365{
366 return vmci_transport_reply_control_pkt_fast(
367 pkt,
368 VMCI_TRANSPORT_PACKET_TYPE_RST,
369 0, 0, NULL,
370 VMCI_INVALID_HANDLE);
371}
372
373static int vmci_transport_send_invalid_bh(struct sockaddr_vm *dst,
374 struct sockaddr_vm *src)
375{
376 return vmci_transport_send_control_pkt_bh(
377 dst, src,
378 VMCI_TRANSPORT_PACKET_TYPE_INVALID,
379 0, 0, NULL, VMCI_INVALID_HANDLE);
380}
381
382int vmci_transport_send_wrote_bh(struct sockaddr_vm *dst,
383 struct sockaddr_vm *src)
384{
385 return vmci_transport_send_control_pkt_bh(
386 dst, src,
387 VMCI_TRANSPORT_PACKET_TYPE_WROTE, 0,
388 0, NULL, VMCI_INVALID_HANDLE);
389}
390
391int vmci_transport_send_read_bh(struct sockaddr_vm *dst,
392 struct sockaddr_vm *src)
393{
394 return vmci_transport_send_control_pkt_bh(
395 dst, src,
396 VMCI_TRANSPORT_PACKET_TYPE_READ, 0,
397 0, NULL, VMCI_INVALID_HANDLE);
398}
399
400int vmci_transport_send_wrote(struct sock *sk)
401{
402 return vmci_transport_send_control_pkt(
403 sk, VMCI_TRANSPORT_PACKET_TYPE_WROTE, 0,
404 0, NULL, VSOCK_PROTO_INVALID,
405 VMCI_INVALID_HANDLE);
406}
407
408int vmci_transport_send_read(struct sock *sk)
409{
410 return vmci_transport_send_control_pkt(
411 sk, VMCI_TRANSPORT_PACKET_TYPE_READ, 0,
412 0, NULL, VSOCK_PROTO_INVALID,
413 VMCI_INVALID_HANDLE);
414}
415
416int vmci_transport_send_waiting_write(struct sock *sk,
417 struct vmci_transport_waiting_info *wait)
418{
419 return vmci_transport_send_control_pkt(
420 sk, VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE,
421 0, 0, wait, VSOCK_PROTO_INVALID,
422 VMCI_INVALID_HANDLE);
423}
424
425int vmci_transport_send_waiting_read(struct sock *sk,
426 struct vmci_transport_waiting_info *wait)
427{
428 return vmci_transport_send_control_pkt(
429 sk, VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ,
430 0, 0, wait, VSOCK_PROTO_INVALID,
431 VMCI_INVALID_HANDLE);
432}
433
434static int vmci_transport_shutdown(struct vsock_sock *vsk, int mode)
435{
436 return vmci_transport_send_control_pkt(
437 &vsk->sk,
438 VMCI_TRANSPORT_PACKET_TYPE_SHUTDOWN,
439 0, mode, NULL,
440 VSOCK_PROTO_INVALID,
441 VMCI_INVALID_HANDLE);
442}
443
444static int vmci_transport_send_conn_request(struct sock *sk, size_t size)
445{
446 return vmci_transport_send_control_pkt(sk,
447 VMCI_TRANSPORT_PACKET_TYPE_REQUEST,
448 size, 0, NULL,
449 VSOCK_PROTO_INVALID,
450 VMCI_INVALID_HANDLE);
451}
452
453static int vmci_transport_send_conn_request2(struct sock *sk, size_t size,
454 u16 version)
455{
456 return vmci_transport_send_control_pkt(
457 sk, VMCI_TRANSPORT_PACKET_TYPE_REQUEST2,
458 size, 0, NULL, version,
459 VMCI_INVALID_HANDLE);
460}
461
462static struct sock *vmci_transport_get_pending(
463 struct sock *listener,
464 struct vmci_transport_packet *pkt)
465{
466 struct vsock_sock *vlistener;
467 struct vsock_sock *vpending;
468 struct sock *pending;
469
470 vlistener = vsock_sk(listener);
471
472 list_for_each_entry(vpending, &vlistener->pending_links,
473 pending_links) {
474 struct sockaddr_vm src;
475 struct sockaddr_vm dst;
476
477 vsock_addr_init(&src, pkt->dg.src.context, pkt->src_port);
478 vsock_addr_init(&dst, pkt->dg.dst.context, pkt->dst_port);
479
480 if (vsock_addr_equals_addr(&src, &vpending->remote_addr) &&
481 vsock_addr_equals_addr(&dst, &vpending->local_addr)) {
482 pending = sk_vsock(vpending);
483 sock_hold(pending);
484 goto found;
485 }
486 }
487
488 pending = NULL;
489found:
490 return pending;
491
492}
493
494static void vmci_transport_release_pending(struct sock *pending)
495{
496 sock_put(pending);
497}
498
499/* We allow two kinds of sockets to communicate with a restricted VM: 1)
500 * trusted sockets, and 2) sockets from applications running as the same user as the
501 * VM (this is only true for the host side and only when using hosted products)
502 */
503
504static bool vmci_transport_is_trusted(struct vsock_sock *vsock, u32 peer_cid)
505{
506 return vsock->trusted ||
507 vmci_is_context_owner(peer_cid, vsock->owner->uid);
508}
509
510/* We allow sending datagrams to and receiving datagrams from a restricted VM
511 * only if it is trusted as described in vmci_transport_is_trusted.
512 */
513
514static bool vmci_transport_allow_dgram(struct vsock_sock *vsock, u32 peer_cid)
515{
516 if (vsock->cached_peer != peer_cid) {
517 vsock->cached_peer = peer_cid;
518 if (!vmci_transport_is_trusted(vsock, peer_cid) &&
519 (vmci_context_get_priv_flags(peer_cid) &
520 VMCI_PRIVILEGE_FLAG_RESTRICTED)) {
521 vsock->cached_peer_allow_dgram = false;
522 } else {
523 vsock->cached_peer_allow_dgram = true;
524 }
525 }
526
527 return vsock->cached_peer_allow_dgram;
528}
529
530static int
531vmci_transport_queue_pair_alloc(struct vmci_qp **qpair,
532 struct vmci_handle *handle,
533 u64 produce_size,
534 u64 consume_size,
535 u32 peer, u32 flags, bool trusted)
536{
537 int err = 0;
538
539 if (trusted) {
540 /* Try to allocate our queue pair as trusted. This will only
541 * work if vsock is running in the host.
542 */
543
544 err = vmci_qpair_alloc(qpair, handle, produce_size,
545 consume_size,
546 peer, flags,
547 VMCI_PRIVILEGE_FLAG_TRUSTED);
548 if (err != VMCI_ERROR_NO_ACCESS)
549 goto out;
550
551 }
552
553 err = vmci_qpair_alloc(qpair, handle, produce_size, consume_size,
554 peer, flags, VMCI_NO_PRIVILEGE_FLAGS);
555out:
556 if (err < 0) {
557 pr_err("Could not attach to queue pair with %d\n",
558 err);
559 err = vmci_transport_error_to_vsock_error(err);
560 }
561
562 return err;
563}
564
565static int
566vmci_transport_datagram_create_hnd(u32 resource_id,
567 u32 flags,
568 vmci_datagram_recv_cb recv_cb,
569 void *client_data,
570 struct vmci_handle *out_handle)
571{
572 int err = 0;
573
574 /* Try to allocate our datagram handler as trusted. This will only work
575 * if vsock is running in the host.
576 */
577
578 err = vmci_datagram_create_handle_priv(resource_id, flags,
579 VMCI_PRIVILEGE_FLAG_TRUSTED,
580 recv_cb,
581 client_data, out_handle);
582
583 if (err == VMCI_ERROR_NO_ACCESS)
584 err = vmci_datagram_create_handle(resource_id, flags,
585 recv_cb, client_data,
586 out_handle);
587
588 return err;
589}
590
591/* This is invoked as part of a tasklet that's scheduled when the VMCI
592 * interrupt fires. This is run in bottom-half context and if it ever needs to
593 * sleep it should defer that work to a work queue.
594 */
595
596static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg)
597{
598 struct sock *sk;
599 size_t size;
600 struct sk_buff *skb;
601 struct vsock_sock *vsk;
602
603 sk = (struct sock *)data;
604
605 /* This handler is privileged when this module is running on the host.
606 * We will get datagrams from all endpoints (even VMs that are in a
607 * restricted context). If we get one from a restricted context then
608 * the destination socket must be trusted.
609 *
610 * NOTE: We access the socket struct without holding the lock here.
611	 * This is ok because the field we are interested in is never modified
612 * outside of the create and destruct socket functions.
613 */
614 vsk = vsock_sk(sk);
615 if (!vmci_transport_allow_dgram(vsk, dg->src.context))
616 return VMCI_ERROR_NO_ACCESS;
617
618 size = VMCI_DG_SIZE(dg);
619
620 /* Attach the packet to the socket's receive queue as an sk_buff. */
621 skb = alloc_skb(size, GFP_ATOMIC);
622 if (skb) {
623 /* sk_receive_skb() will do a sock_put(), so hold here. */
624 sock_hold(sk);
625 skb_put(skb, size);
626 memcpy(skb->data, dg, size);
627 sk_receive_skb(sk, skb, 0);
628 }
629
630 return VMCI_SUCCESS;
631}
632
633static bool vmci_transport_stream_allow(u32 cid, u32 port)
634{
635 static const u32 non_socket_contexts[] = {
636 VMADDR_CID_HYPERVISOR,
637 VMADDR_CID_RESERVED,
638 };
639 int i;
640
641 BUILD_BUG_ON(sizeof(cid) != sizeof(*non_socket_contexts));
642
643 for (i = 0; i < ARRAY_SIZE(non_socket_contexts); i++) {
644 if (cid == non_socket_contexts[i])
645 return false;
646 }
647
648 return true;
649}
650
651/* This is invoked as part of a tasklet that's scheduled when the VMCI
652 * interrupt fires. This is run in bottom-half context but it defers most of
653 * its work to the packet handling work queue.
654 */
655
656static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg)
657{
658 struct sock *sk;
659 struct sockaddr_vm dst;
660 struct sockaddr_vm src;
661 struct vmci_transport_packet *pkt;
662 struct vsock_sock *vsk;
663 bool bh_process_pkt;
664 int err;
665
666 sk = NULL;
667 err = VMCI_SUCCESS;
668 bh_process_pkt = false;
669
670 /* Ignore incoming packets from contexts without sockets, or resources
671 * that aren't vsock implementations.
672 */
673
674 if (!vmci_transport_stream_allow(dg->src.context, -1)
675 || VMCI_TRANSPORT_PACKET_RID != dg->src.resource)
676 return VMCI_ERROR_NO_ACCESS;
677
678 if (VMCI_DG_SIZE(dg) < sizeof(*pkt))
679 /* Drop datagrams that do not contain full VSock packets. */
680 return VMCI_ERROR_INVALID_ARGS;
681
682 pkt = (struct vmci_transport_packet *)dg;
683
684 /* Find the socket that should handle this packet. First we look for a
685 * connected socket and if there is none we look for a socket bound to
686	 * the destination address.
687 */
688 vsock_addr_init(&src, pkt->dg.src.context, pkt->src_port);
689 vsock_addr_init(&dst, pkt->dg.dst.context, pkt->dst_port);
690
691 sk = vsock_find_connected_socket(&src, &dst);
692 if (!sk) {
693 sk = vsock_find_bound_socket(&dst);
694 if (!sk) {
695 /* We could not find a socket for this specified
696 * address. If this packet is a RST, we just drop it.
697 * If it is another packet, we send a RST. Note that
698 * we do not send a RST reply to RSTs so that we do not
699 * continually send RSTs between two endpoints.
700 *
701 * Note that since this is a reply, dst is src and src
702 * is dst.
703 */
704 if (vmci_transport_send_reset_bh(&dst, &src, pkt) < 0)
705 pr_err("unable to send reset\n");
706
707 err = VMCI_ERROR_NOT_FOUND;
708 goto out;
709 }
710 }
711
712 /* If the received packet type is beyond all types known to this
713 * implementation, reply with an invalid message. Hopefully this will
714 * help when implementing backwards compatibility in the future.
715 */
716 if (pkt->type >= VMCI_TRANSPORT_PACKET_TYPE_MAX) {
717 vmci_transport_send_invalid_bh(&dst, &src);
718 err = VMCI_ERROR_INVALID_ARGS;
719 goto out;
720 }
721
722 /* This handler is privileged when this module is running on the host.
723 * We will get datagram connect requests from all endpoints (even VMs
724 * that are in a restricted context). If we get one from a restricted
725 * context then the destination socket must be trusted.
726 *
727 * NOTE: We access the socket struct without holding the lock here.
728	 * This is ok because the field we are interested in is never modified
729 * outside of the create and destruct socket functions.
730 */
731 vsk = vsock_sk(sk);
732 if (!vmci_transport_allow_dgram(vsk, pkt->dg.src.context)) {
733 err = VMCI_ERROR_NO_ACCESS;
734 goto out;
735 }
736
737	/* We do almost everything in a work queue, but we fast-path the
738 * notification of reads and writes to help data transfer performance.
739 * We can only do this if there is no process context code executing
740 * for this socket since that may change the state.
741 */
742 bh_lock_sock(sk);
743
744 if (!sock_owned_by_user(sk) && sk->sk_state == SS_CONNECTED)
745 vmci_trans(vsk)->notify_ops->handle_notify_pkt(
746 sk, pkt, true, &dst, &src,
747 &bh_process_pkt);
748
749 bh_unlock_sock(sk);
750
751 if (!bh_process_pkt) {
752 struct vmci_transport_recv_pkt_info *recv_pkt_info;
753
754 recv_pkt_info = kmalloc(sizeof(*recv_pkt_info), GFP_ATOMIC);
755 if (!recv_pkt_info) {
756 if (vmci_transport_send_reset_bh(&dst, &src, pkt) < 0)
757 pr_err("unable to send reset\n");
758
759 err = VMCI_ERROR_NO_MEM;
760 goto out;
761 }
762
763 recv_pkt_info->sk = sk;
764 memcpy(&recv_pkt_info->pkt, pkt, sizeof(recv_pkt_info->pkt));
765 INIT_WORK(&recv_pkt_info->work, vmci_transport_recv_pkt_work);
766
767 schedule_work(&recv_pkt_info->work);
768 /* Clear sk so that the reference count incremented by one of
769 * the Find functions above is not decremented below. We need
770 * that reference count for the packet handler we've scheduled
771 * to run.
772 */
773 sk = NULL;
774 }
775
776out:
777 if (sk)
778 sock_put(sk);
779
780 return err;
781}
782
783static void vmci_transport_peer_attach_cb(u32 sub_id,
784 const struct vmci_event_data *e_data,
785 void *client_data)
786{
787 struct sock *sk = client_data;
788 const struct vmci_event_payload_qp *e_payload;
789 struct vsock_sock *vsk;
790
791 e_payload = vmci_event_data_const_payload(e_data);
792
793 vsk = vsock_sk(sk);
794
795 /* We don't ask for delayed CBs when we subscribe to this event (we
796 * pass 0 as flags to vmci_event_subscribe()). VMCI makes no
797 * guarantees in that case about what context we might be running in,
798 * so it could be BH or process, blockable or non-blockable. So we
799 * need to account for all possible contexts here.
800 */
801 local_bh_disable();
802 bh_lock_sock(sk);
803
804	/* XXX This is lame, we should provide a way to look up sockets by
805 * qp_handle.
806 */
807 if (vmci_handle_is_equal(vmci_trans(vsk)->qp_handle,
808 e_payload->handle)) {
809 /* XXX This doesn't do anything, but in the future we may want
810 * to set a flag here to verify the attach really did occur and
811 * we weren't just sent a datagram claiming it was.
812 */
813 goto out;
814 }
815
816out:
817 bh_unlock_sock(sk);
818 local_bh_enable();
819}
820
821static void vmci_transport_handle_detach(struct sock *sk)
822{
823 struct vsock_sock *vsk;
824
825 vsk = vsock_sk(sk);
826 if (!vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle)) {
827 sock_set_flag(sk, SOCK_DONE);
828
829 /* On a detach the peer will not be sending or receiving
830 * anymore.
831 */
832 vsk->peer_shutdown = SHUTDOWN_MASK;
833
834 /* We should not be sending anymore since the peer won't be
835 * there to receive, but we can still receive if there is data
836 * left in our consume queue.
837 */
838 if (vsock_stream_has_data(vsk) <= 0) {
839 if (sk->sk_state == SS_CONNECTING) {
840 /* The peer may detach from a queue pair while
841 * we are still in the connecting state, i.e.,
842 * if the peer VM is killed after attaching to
843 * a queue pair, but before we complete the
844 * handshake. In that case, we treat the detach
845 * event like a reset.
846 */
847
848 sk->sk_state = SS_UNCONNECTED;
849 sk->sk_err = ECONNRESET;
850 sk->sk_error_report(sk);
851 return;
852 }
853 sk->sk_state = SS_UNCONNECTED;
854 }
855 sk->sk_state_change(sk);
856 }
857}
858
859static void vmci_transport_peer_detach_cb(u32 sub_id,
860 const struct vmci_event_data *e_data,
861 void *client_data)
862{
863 struct sock *sk = client_data;
864 const struct vmci_event_payload_qp *e_payload;
865 struct vsock_sock *vsk;
866
867 e_payload = vmci_event_data_const_payload(e_data);
868 vsk = vsock_sk(sk);
869 if (vmci_handle_is_invalid(e_payload->handle))
870 return;
871
872 /* Same rules for locking as for peer_attach_cb(). */
873 local_bh_disable();
874 bh_lock_sock(sk);
875
876	/* XXX This is lame, we should provide a way to look up sockets by
877 * qp_handle.
878 */
879 if (vmci_handle_is_equal(vmci_trans(vsk)->qp_handle,
880 e_payload->handle))
881 vmci_transport_handle_detach(sk);
882
883 bh_unlock_sock(sk);
884 local_bh_enable();
885}
886
887static void vmci_transport_qp_resumed_cb(u32 sub_id,
888 const struct vmci_event_data *e_data,
889 void *client_data)
890{
891 vsock_for_each_connected_socket(vmci_transport_handle_detach);
892}
893
894static void vmci_transport_recv_pkt_work(struct work_struct *work)
895{
896 struct vmci_transport_recv_pkt_info *recv_pkt_info;
897 struct vmci_transport_packet *pkt;
898 struct sock *sk;
899
900 recv_pkt_info =
901 container_of(work, struct vmci_transport_recv_pkt_info, work);
902 sk = recv_pkt_info->sk;
903 pkt = &recv_pkt_info->pkt;
904
905 lock_sock(sk);
906
907 switch (sk->sk_state) {
908 case SS_LISTEN:
909 vmci_transport_recv_listen(sk, pkt);
910 break;
911 case SS_CONNECTING:
912 /* Processing of pending connections for servers goes through
913 * the listening socket, so see vmci_transport_recv_listen()
914 * for that path.
915 */
916 vmci_transport_recv_connecting_client(sk, pkt);
917 break;
918 case SS_CONNECTED:
919 vmci_transport_recv_connected(sk, pkt);
920 break;
921 default:
922 /* Because this function does not run in the same context as
923 * vmci_transport_recv_stream_cb it is possible that the
924 * socket has closed. We need to let the other side know or it
925 * could be sitting in a connect and hang forever. Send a
926 * reset to prevent that.
927 */
928 vmci_transport_send_reset(sk, pkt);
929 goto out;
930 }
931
932out:
933 release_sock(sk);
934 kfree(recv_pkt_info);
935 /* Release reference obtained in the stream callback when we fetched
936 * this socket out of the bound or connected list.
937 */
938 sock_put(sk);
939}
940
941static int vmci_transport_recv_listen(struct sock *sk,
942 struct vmci_transport_packet *pkt)
943{
944 struct sock *pending;
945 struct vsock_sock *vpending;
946 int err;
947 u64 qp_size;
948 bool old_request = false;
949 bool old_pkt_proto = false;
950
951 err = 0;
952
953 /* Because we are in the listen state, we could be receiving a packet
954	 * for ourselves or any previous connection requests that we received.
955 * If it's the latter, we try to find a socket in our list of pending
956 * connections and, if we do, call the appropriate handler for the
957	 * state that socket is in. Otherwise we try to service the
958 * connection request.
959 */
960 pending = vmci_transport_get_pending(sk, pkt);
961 if (pending) {
962 lock_sock(pending);
963 switch (pending->sk_state) {
964 case SS_CONNECTING:
965 err = vmci_transport_recv_connecting_server(sk,
966 pending,
967 pkt);
968 break;
969 default:
970 vmci_transport_send_reset(pending, pkt);
971 err = -EINVAL;
972 }
973
974 if (err < 0)
975 vsock_remove_pending(sk, pending);
976
977 release_sock(pending);
978 vmci_transport_release_pending(pending);
979
980 return err;
981 }
982
983 /* The listen state only accepts connection requests. Reply with a
984 * reset unless we received a reset.
985 */
986
987 if (!(pkt->type == VMCI_TRANSPORT_PACKET_TYPE_REQUEST ||
988 pkt->type == VMCI_TRANSPORT_PACKET_TYPE_REQUEST2)) {
989 vmci_transport_reply_reset(pkt);
990 return -EINVAL;
991 }
992
993 if (pkt->u.size == 0) {
994 vmci_transport_reply_reset(pkt);
995 return -EINVAL;
996 }
997
998 /* If this socket can't accommodate this connection request, we send a
999 * reset. Otherwise we create and initialize a child socket and reply
1000 * with a connection negotiation.
1001 */
1002 if (sk->sk_ack_backlog >= sk->sk_max_ack_backlog) {
1003 vmci_transport_reply_reset(pkt);
1004 return -ECONNREFUSED;
1005 }
1006
1007 pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
1008 sk->sk_type);
1009 if (!pending) {
1010 vmci_transport_send_reset(sk, pkt);
1011 return -ENOMEM;
1012 }
1013
1014 vpending = vsock_sk(pending);
1015
1016 vsock_addr_init(&vpending->local_addr, pkt->dg.dst.context,
1017 pkt->dst_port);
1018 vsock_addr_init(&vpending->remote_addr, pkt->dg.src.context,
1019 pkt->src_port);
1020
1021 /* If the proposed size fits within our min/max, accept it. Otherwise
1022 * propose our own size.
1023 */
1024 if (pkt->u.size >= vmci_trans(vpending)->queue_pair_min_size &&
1025 pkt->u.size <= vmci_trans(vpending)->queue_pair_max_size) {
1026 qp_size = pkt->u.size;
1027 } else {
1028 qp_size = vmci_trans(vpending)->queue_pair_size;
1029 }
1030
1031	/* Figure out if we are using old or new requests based on the
1032	 * override, if any, and the packet type sent by our peer.
1033 */
1034 if (vmci_transport_old_proto_override(&old_pkt_proto)) {
1035 old_request = old_pkt_proto;
1036 } else {
1037 if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_REQUEST)
1038 old_request = true;
1039 else if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_REQUEST2)
1040 old_request = false;
1041
1042 }
1043
1044 if (old_request) {
1045 /* Handle a REQUEST (or override) */
1046 u16 version = VSOCK_PROTO_INVALID;
1047 if (vmci_transport_proto_to_notify_struct(
1048 pending, &version, true))
1049 err = vmci_transport_send_negotiate(pending, qp_size);
1050 else
1051 err = -EINVAL;
1052
1053 } else {
1054 /* Handle a REQUEST2 (or override) */
1055 int proto_int = pkt->proto;
1056 int pos;
1057 u16 active_proto_version = 0;
1058
1059		/* The list of possible protocols is the intersection of the
1060		 * protocols the client supports and the protocols we
1061		 * support.
1062 */
1063 proto_int &= vmci_transport_new_proto_supported_versions();
1064
1065 /* We choose the highest possible protocol version and use that
1066 * one.
1067 */
1068 pos = fls(proto_int);
1069 if (pos) {
1070 active_proto_version = (1 << (pos - 1));
1071 if (vmci_transport_proto_to_notify_struct(
1072 pending, &active_proto_version, false))
1073 err = vmci_transport_send_negotiate2(pending,
1074 qp_size,
1075 active_proto_version);
1076 else
1077 err = -EINVAL;
1078
1079 } else {
1080 err = -EINVAL;
1081 }
1082 }
1083
1084 if (err < 0) {
1085 vmci_transport_send_reset(sk, pkt);
1086 sock_put(pending);
1087 err = vmci_transport_error_to_vsock_error(err);
1088 goto out;
1089 }
1090
1091 vsock_add_pending(sk, pending);
1092 sk->sk_ack_backlog++;
1093
1094 pending->sk_state = SS_CONNECTING;
1095 vmci_trans(vpending)->produce_size =
1096 vmci_trans(vpending)->consume_size = qp_size;
1097 vmci_trans(vpending)->queue_pair_size = qp_size;
1098
1099 vmci_trans(vpending)->notify_ops->process_request(pending);
1100
1101 /* We might never receive another message for this socket and it's not
1102 * connected to any process, so we have to ensure it gets cleaned up
1103	 * ourselves. Our delayed work function will take care of that. Note
1104 * that we do not ever cancel this function since we have few
1105 * guarantees about its state when calling cancel_delayed_work().
1106 * Instead we hold a reference on the socket for that function and make
1107 * it capable of handling cases where it needs to do nothing but
1108 * release that reference.
1109 */
1110 vpending->listener = sk;
1111 sock_hold(sk);
1112 sock_hold(pending);
1113 INIT_DELAYED_WORK(&vpending->dwork, vsock_pending_work);
1114 schedule_delayed_work(&vpending->dwork, HZ);
1115
1116out:
1117 return err;
1118}
1119
1120static int
1121vmci_transport_recv_connecting_server(struct sock *listener,
1122 struct sock *pending,
1123 struct vmci_transport_packet *pkt)
1124{
1125 struct vsock_sock *vpending;
1126 struct vmci_handle handle;
1127 struct vmci_qp *qpair;
1128 bool is_local;
1129 u32 flags;
1130 u32 detach_sub_id;
1131 int err;
1132 int skerr;
1133
1134 vpending = vsock_sk(pending);
1135 detach_sub_id = VMCI_INVALID_ID;
1136
1137 switch (pkt->type) {
1138 case VMCI_TRANSPORT_PACKET_TYPE_OFFER:
1139 if (vmci_handle_is_invalid(pkt->u.handle)) {
1140 vmci_transport_send_reset(pending, pkt);
1141 skerr = EPROTO;
1142 err = -EINVAL;
1143 goto destroy;
1144 }
1145 break;
1146 default:
1147 /* Close and cleanup the connection. */
1148 vmci_transport_send_reset(pending, pkt);
1149 skerr = EPROTO;
1150 err = pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST ? 0 : -EINVAL;
1151 goto destroy;
1152 }
1153
1154 /* In order to complete the connection we need to attach to the offered
1155 * queue pair and send an attach notification. We also subscribe to the
1156 * detach event so we know when our peer goes away, and we do that
1157 * before attaching so we don't miss an event. If all this succeeds,
1158	 * we update our state and wake up anything waiting in accept() for a
1159 * connection.
1160 */
1161
1162 /* We don't care about attach since we ensure the other side has
1163 * attached by specifying the ATTACH_ONLY flag below.
1164 */
1165 err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_DETACH,
1166 vmci_transport_peer_detach_cb,
1167 pending, &detach_sub_id);
1168 if (err < VMCI_SUCCESS) {
1169 vmci_transport_send_reset(pending, pkt);
1170 err = vmci_transport_error_to_vsock_error(err);
1171 skerr = -err;
1172 goto destroy;
1173 }
1174
1175 vmci_trans(vpending)->detach_sub_id = detach_sub_id;
1176
1177 /* Now attach to the queue pair the client created. */
1178 handle = pkt->u.handle;
1179
1180 /* vpending->local_addr always has a context id so we do not need to
1181 * worry about VMADDR_CID_ANY in this case.
1182 */
1183 is_local =
1184 vpending->remote_addr.svm_cid == vpending->local_addr.svm_cid;
1185 flags = VMCI_QPFLAG_ATTACH_ONLY;
1186 flags |= is_local ? VMCI_QPFLAG_LOCAL : 0;
1187
1188 err = vmci_transport_queue_pair_alloc(
1189 &qpair,
1190 &handle,
1191 vmci_trans(vpending)->produce_size,
1192 vmci_trans(vpending)->consume_size,
1193 pkt->dg.src.context,
1194 flags,
1195 vmci_transport_is_trusted(
1196 vpending,
1197 vpending->remote_addr.svm_cid));
1198 if (err < 0) {
1199 vmci_transport_send_reset(pending, pkt);
1200 skerr = -err;
1201 goto destroy;
1202 }
1203
1204 vmci_trans(vpending)->qp_handle = handle;
1205 vmci_trans(vpending)->qpair = qpair;
1206
1207 /* When we send the attach message, we must be ready to handle incoming
1208 * control messages on the newly connected socket. So we move the
1209 * pending socket to the connected state before sending the attach
1210 * message. Otherwise, an incoming packet triggered by the attach being
1211 * received by the peer may be processed concurrently with what happens
1212 * below after sending the attach message, and that incoming packet
1213 * will find the listening socket instead of the (currently) pending
1214 * socket. Note that enqueueing the socket increments the reference
1215 * count, so even if a reset comes before the connection is accepted,
1216 * the socket will be valid until it is removed from the queue.
1217 *
1218 * If we fail sending the attach below, we remove the socket from the
1219 * connected list and move the socket to SS_UNCONNECTED before
1220 * releasing the lock, so a pending slow path processing of an incoming
1221 * packet will not see the socket in the connected state in that case.
1222 */
1223 pending->sk_state = SS_CONNECTED;
1224
1225 vsock_insert_connected(vpending);
1226
1227 /* Notify our peer of our attach. */
1228 err = vmci_transport_send_attach(pending, handle);
1229 if (err < 0) {
1230 vsock_remove_connected(vpending);
1231 pr_err("Could not send attach\n");
1232 vmci_transport_send_reset(pending, pkt);
1233 err = vmci_transport_error_to_vsock_error(err);
1234 skerr = -err;
1235 goto destroy;
1236 }
1237
1238 /* We have a connection. Move the now connected socket from the
1239 * listener's pending list to the accept queue so callers of accept()
1240 * can find it.
1241 */
1242 vsock_remove_pending(listener, pending);
1243 vsock_enqueue_accept(listener, pending);
1244
1245	/* Callers of accept() will be waiting on the listening socket, not
1246 * the pending socket.
1247 */
1248 listener->sk_state_change(listener);
1249
1250 return 0;
1251
1252destroy:
1253 pending->sk_err = skerr;
1254 pending->sk_state = SS_UNCONNECTED;
1255	/* As long as we drop our reference, all necessary cleanup will happen
1256 * when the cleanup function drops its reference and our destruct
1257 * implementation is called. Note that since the listen handler will
1258 * remove pending from the pending list upon our failure, the cleanup
1259 * function won't drop the additional reference, which is why we do it
1260 * here.
1261 */
1262 sock_put(pending);
1263
1264 return err;
1265}
1266
1267static int
1268vmci_transport_recv_connecting_client(struct sock *sk,
1269 struct vmci_transport_packet *pkt)
1270{
1271 struct vsock_sock *vsk;
1272 int err;
1273 int skerr;
1274
1275 vsk = vsock_sk(sk);
1276
1277 switch (pkt->type) {
1278 case VMCI_TRANSPORT_PACKET_TYPE_ATTACH:
1279 if (vmci_handle_is_invalid(pkt->u.handle) ||
1280 !vmci_handle_is_equal(pkt->u.handle,
1281 vmci_trans(vsk)->qp_handle)) {
1282 skerr = EPROTO;
1283 err = -EINVAL;
1284 goto destroy;
1285 }
1286
1287 /* Signify the socket is connected and wakeup the waiter in
1288 * connect(). Also place the socket in the connected table for
1289 * accounting (it can already be found since it's in the bound
1290 * table).
1291 */
1292 sk->sk_state = SS_CONNECTED;
1293 sk->sk_socket->state = SS_CONNECTED;
1294 vsock_insert_connected(vsk);
1295 sk->sk_state_change(sk);
1296
1297 break;
1298 case VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE:
1299 case VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2:
1300 if (pkt->u.size == 0
1301 || pkt->dg.src.context != vsk->remote_addr.svm_cid
1302 || pkt->src_port != vsk->remote_addr.svm_port
1303 || !vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle)
1304 || vmci_trans(vsk)->qpair
1305 || vmci_trans(vsk)->produce_size != 0
1306 || vmci_trans(vsk)->consume_size != 0
1307 || vmci_trans(vsk)->attach_sub_id != VMCI_INVALID_ID
1308 || vmci_trans(vsk)->detach_sub_id != VMCI_INVALID_ID) {
1309 skerr = EPROTO;
1310 err = -EINVAL;
1311
1312 goto destroy;
1313 }
1314
1315 err = vmci_transport_recv_connecting_client_negotiate(sk, pkt);
1316 if (err) {
1317 skerr = -err;
1318 goto destroy;
1319 }
1320
1321 break;
1322 case VMCI_TRANSPORT_PACKET_TYPE_INVALID:
1323 err = vmci_transport_recv_connecting_client_invalid(sk, pkt);
1324 if (err) {
1325 skerr = -err;
1326 goto destroy;
1327 }
1328
1329 break;
1330 case VMCI_TRANSPORT_PACKET_TYPE_RST:
1331	/* Older versions of the Linux code (WS 6.5 / ESX 4.0) used to
1332 * continue processing here after they sent an INVALID packet.
1333 * This meant that we got a RST after the INVALID. We ignore a
1334 * RST after an INVALID. The common code doesn't send the RST
1335 * ... so we can hang if an old version of the common code
1336 * fails between getting a REQUEST and sending an OFFER back.
1337 * Not much we can do about it... except hope that it doesn't
1338 * happen.
1339 */
1340 if (vsk->ignore_connecting_rst) {
1341 vsk->ignore_connecting_rst = false;
1342 } else {
1343 skerr = ECONNRESET;
1344 err = 0;
1345 goto destroy;
1346 }
1347
1348 break;
1349 default:
1350 /* Close and cleanup the connection. */
1351 skerr = EPROTO;
1352 err = -EINVAL;
1353 goto destroy;
1354 }
1355
1356 return 0;
1357
1358destroy:
1359 vmci_transport_send_reset(sk, pkt);
1360
1361 sk->sk_state = SS_UNCONNECTED;
1362 sk->sk_err = skerr;
1363 sk->sk_error_report(sk);
1364 return err;
1365}
1366
1367static int vmci_transport_recv_connecting_client_negotiate(
1368 struct sock *sk,
1369 struct vmci_transport_packet *pkt)
1370{
1371 int err;
1372 struct vsock_sock *vsk;
1373 struct vmci_handle handle;
1374 struct vmci_qp *qpair;
1375 u32 attach_sub_id;
1376 u32 detach_sub_id;
1377 bool is_local;
1378 u32 flags;
1379 bool old_proto = true;
1380 bool old_pkt_proto;
1381 u16 version;
1382
1383 vsk = vsock_sk(sk);
1384 handle = VMCI_INVALID_HANDLE;
1385 attach_sub_id = VMCI_INVALID_ID;
1386 detach_sub_id = VMCI_INVALID_ID;
1387
1388 /* If we have gotten here then we should be past the point where old
1389	 * Linux vsock could have sent the bogus RST.
1390 */
1391 vsk->sent_request = false;
1392 vsk->ignore_connecting_rst = false;
1393
1394 /* Verify that we're OK with the proposed queue pair size */
1395 if (pkt->u.size < vmci_trans(vsk)->queue_pair_min_size ||
1396 pkt->u.size > vmci_trans(vsk)->queue_pair_max_size) {
1397 err = -EINVAL;
1398 goto destroy;
1399 }
1400
1401 /* At this point we know the CID the peer is using to talk to us. */
1402
1403 if (vsk->local_addr.svm_cid == VMADDR_CID_ANY)
1404 vsk->local_addr.svm_cid = pkt->dg.dst.context;
1405
1406 /* Setup the notify ops to be the highest supported version that both
1407 * the server and the client support.
1408 */
1409
1410 if (vmci_transport_old_proto_override(&old_pkt_proto)) {
1411 old_proto = old_pkt_proto;
1412 } else {
1413 if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE)
1414 old_proto = true;
1415 else if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2)
1416 old_proto = false;
1417
1418 }
1419
1420 if (old_proto)
1421 version = VSOCK_PROTO_INVALID;
1422 else
1423 version = pkt->proto;
1424
1425 if (!vmci_transport_proto_to_notify_struct(sk, &version, old_proto)) {
1426 err = -EINVAL;
1427 goto destroy;
1428 }
1429
1430 /* Subscribe to attach and detach events first.
1431 *
1432 * XXX We attach once for each queue pair created for now so it is easy
1433 * to find the socket (it's provided), but later we should only
1434	 * subscribe once and add a way to look up sockets by queue pair handle.
1435 */
1436 err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_ATTACH,
1437 vmci_transport_peer_attach_cb,
1438 sk, &attach_sub_id);
1439 if (err < VMCI_SUCCESS) {
1440 err = vmci_transport_error_to_vsock_error(err);
1441 goto destroy;
1442 }
1443
1444 err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_DETACH,
1445 vmci_transport_peer_detach_cb,
1446 sk, &detach_sub_id);
1447 if (err < VMCI_SUCCESS) {
1448 err = vmci_transport_error_to_vsock_error(err);
1449 goto destroy;
1450 }
1451
1452 /* Make VMCI select the handle for us. */
1453 handle = VMCI_INVALID_HANDLE;
1454 is_local = vsk->remote_addr.svm_cid == vsk->local_addr.svm_cid;
1455 flags = is_local ? VMCI_QPFLAG_LOCAL : 0;
1456
1457 err = vmci_transport_queue_pair_alloc(&qpair,
1458 &handle,
1459 pkt->u.size,
1460 pkt->u.size,
1461 vsk->remote_addr.svm_cid,
1462 flags,
1463 vmci_transport_is_trusted(
1464 vsk,
1465 vsk->
1466 remote_addr.svm_cid));
1467 if (err < 0)
1468 goto destroy;
1469
1470 err = vmci_transport_send_qp_offer(sk, handle);
1471 if (err < 0) {
1472 err = vmci_transport_error_to_vsock_error(err);
1473 goto destroy;
1474 }
1475
1476 vmci_trans(vsk)->qp_handle = handle;
1477 vmci_trans(vsk)->qpair = qpair;
1478
1479 vmci_trans(vsk)->produce_size = vmci_trans(vsk)->consume_size =
1480 pkt->u.size;
1481
1482 vmci_trans(vsk)->attach_sub_id = attach_sub_id;
1483 vmci_trans(vsk)->detach_sub_id = detach_sub_id;
1484
1485 vmci_trans(vsk)->notify_ops->process_negotiate(sk);
1486
1487 return 0;
1488
1489destroy:
1490 if (attach_sub_id != VMCI_INVALID_ID)
1491 vmci_event_unsubscribe(attach_sub_id);
1492
1493 if (detach_sub_id != VMCI_INVALID_ID)
1494 vmci_event_unsubscribe(detach_sub_id);
1495
1496 if (!vmci_handle_is_invalid(handle))
1497 vmci_qpair_detach(&qpair);
1498
1499 return err;
1500}
1501
1502static int
1503vmci_transport_recv_connecting_client_invalid(struct sock *sk,
1504 struct vmci_transport_packet *pkt)
1505{
1506 int err = 0;
1507 struct vsock_sock *vsk = vsock_sk(sk);
1508
1509 if (vsk->sent_request) {
1510 vsk->sent_request = false;
1511 vsk->ignore_connecting_rst = true;
1512
1513 err = vmci_transport_send_conn_request(
1514 sk, vmci_trans(vsk)->queue_pair_size);
1515 if (err < 0)
1516 err = vmci_transport_error_to_vsock_error(err);
1517 else
1518 err = 0;
1519
1520 }
1521
1522 return err;
1523}
1524
1525static int vmci_transport_recv_connected(struct sock *sk,
1526 struct vmci_transport_packet *pkt)
1527{
1528 struct vsock_sock *vsk;
1529 bool pkt_processed = false;
1530
1531 /* In cases where we are closing the connection, it's sufficient to
1532 * mark the state change (and maybe error) and wake up any waiting
1533 * threads. Since this is a connected socket, it's owned by a user
1534 * process and will be cleaned up when the failure is passed back on
1535 * the current or next system call. Our system call implementations
1536 * must therefore check for error and state changes on entry and when
1537 * being awoken.
1538 */
1539 switch (pkt->type) {
1540 case VMCI_TRANSPORT_PACKET_TYPE_SHUTDOWN:
1541 if (pkt->u.mode) {
1542 vsk = vsock_sk(sk);
1543
1544 vsk->peer_shutdown |= pkt->u.mode;
1545 sk->sk_state_change(sk);
1546 }
1547 break;
1548
1549 case VMCI_TRANSPORT_PACKET_TYPE_RST:
1550 vsk = vsock_sk(sk);
1551	/* It is possible that we sent our peer a message (e.g. a
1552 * WAITING_READ) right before we got notified that the peer had
1553 * detached. If that happens then we can get a RST pkt back
1554 * from our peer even though there is data available for us to
1555 * read. In that case, don't shutdown the socket completely but
1556 * instead allow the local client to finish reading data off
1557 * the queuepair. Always treat a RST pkt in connected mode like
1558 * a clean shutdown.
1559 */
1560 sock_set_flag(sk, SOCK_DONE);
1561 vsk->peer_shutdown = SHUTDOWN_MASK;
1562 if (vsock_stream_has_data(vsk) <= 0)
1563 sk->sk_state = SS_DISCONNECTING;
1564
1565 sk->sk_state_change(sk);
1566 break;
1567
1568 default:
1569 vsk = vsock_sk(sk);
1570 vmci_trans(vsk)->notify_ops->handle_notify_pkt(
1571 sk, pkt, false, NULL, NULL,
1572 &pkt_processed);
1573 if (!pkt_processed)
1574 return -EINVAL;
1575
1576 break;
1577 }
1578
1579 return 0;
1580}
1581
1582static int vmci_transport_socket_init(struct vsock_sock *vsk,
1583 struct vsock_sock *psk)
1584{
1585 vsk->trans = kmalloc(sizeof(struct vmci_transport), GFP_KERNEL);
1586 if (!vsk->trans)
1587 return -ENOMEM;
1588
1589 vmci_trans(vsk)->dg_handle = VMCI_INVALID_HANDLE;
1590 vmci_trans(vsk)->qp_handle = VMCI_INVALID_HANDLE;
1591 vmci_trans(vsk)->qpair = NULL;
1592 vmci_trans(vsk)->produce_size = vmci_trans(vsk)->consume_size = 0;
1593 vmci_trans(vsk)->attach_sub_id = vmci_trans(vsk)->detach_sub_id =
1594 VMCI_INVALID_ID;
1595 vmci_trans(vsk)->notify_ops = NULL;
1596 if (psk) {
1597 vmci_trans(vsk)->queue_pair_size =
1598 vmci_trans(psk)->queue_pair_size;
1599 vmci_trans(vsk)->queue_pair_min_size =
1600 vmci_trans(psk)->queue_pair_min_size;
1601 vmci_trans(vsk)->queue_pair_max_size =
1602 vmci_trans(psk)->queue_pair_max_size;
1603 } else {
1604 vmci_trans(vsk)->queue_pair_size =
1605 VMCI_TRANSPORT_DEFAULT_QP_SIZE;
1606 vmci_trans(vsk)->queue_pair_min_size =
1607 VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN;
1608 vmci_trans(vsk)->queue_pair_max_size =
1609 VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX;
1610 }
1611
1612 return 0;
1613}
1614
1615static void vmci_transport_destruct(struct vsock_sock *vsk)
1616{
1617 if (vmci_trans(vsk)->attach_sub_id != VMCI_INVALID_ID) {
1618 vmci_event_unsubscribe(vmci_trans(vsk)->attach_sub_id);
1619 vmci_trans(vsk)->attach_sub_id = VMCI_INVALID_ID;
1620 }
1621
1622 if (vmci_trans(vsk)->detach_sub_id != VMCI_INVALID_ID) {
1623 vmci_event_unsubscribe(vmci_trans(vsk)->detach_sub_id);
1624 vmci_trans(vsk)->detach_sub_id = VMCI_INVALID_ID;
1625 }
1626
1627 if (!vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle)) {
1628 vmci_qpair_detach(&vmci_trans(vsk)->qpair);
1629 vmci_trans(vsk)->qp_handle = VMCI_INVALID_HANDLE;
1630 vmci_trans(vsk)->produce_size = 0;
1631 vmci_trans(vsk)->consume_size = 0;
1632 }
1633
1634 if (vmci_trans(vsk)->notify_ops)
1635 vmci_trans(vsk)->notify_ops->socket_destruct(vsk);
1636
1637 kfree(vsk->trans);
1638 vsk->trans = NULL;
1639}
1640
1641static void vmci_transport_release(struct vsock_sock *vsk)
1642{
1643 if (!vmci_handle_is_invalid(vmci_trans(vsk)->dg_handle)) {
1644 vmci_datagram_destroy_handle(vmci_trans(vsk)->dg_handle);
1645 vmci_trans(vsk)->dg_handle = VMCI_INVALID_HANDLE;
1646 }
1647}
1648
1649static int vmci_transport_dgram_bind(struct vsock_sock *vsk,
1650 struct sockaddr_vm *addr)
1651{
1652 u32 port;
1653 u32 flags;
1654 int err;
1655
1656 /* VMCI will select a resource ID for us if we provide
1657 * VMCI_INVALID_ID.
1658 */
1659 port = addr->svm_port == VMADDR_PORT_ANY ?
1660 VMCI_INVALID_ID : addr->svm_port;
1661
1662 if (port <= LAST_RESERVED_PORT && !capable(CAP_NET_BIND_SERVICE))
1663 return -EACCES;
1664
1665 flags = addr->svm_cid == VMADDR_CID_ANY ?
1666 VMCI_FLAG_ANYCID_DG_HND : 0;
1667
1668 err = vmci_transport_datagram_create_hnd(port, flags,
1669 vmci_transport_recv_dgram_cb,
1670 &vsk->sk,
1671 &vmci_trans(vsk)->dg_handle);
1672 if (err < VMCI_SUCCESS)
1673 return vmci_transport_error_to_vsock_error(err);
1674 vsock_addr_init(&vsk->local_addr, addr->svm_cid,
1675 vmci_trans(vsk)->dg_handle.resource);
1676
1677 return 0;
1678}
1679
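As a companion to the bind path above, here is a minimal user-space sketch of what it serves. It assumes only AF_VSOCK, struct sockaddr_vm and the VMADDR_* constants from the uapi headers accompanying this series; the port value is illustrative:

/* Sketch, not part of the patch: bind an AF_VSOCK datagram socket.
 * With VMADDR_PORT_ANY the transport above passes VMCI_INVALID_ID and
 * lets VMCI pick the resource ID; ports at or below LAST_RESERVED_PORT
 * require CAP_NET_BIND_SERVICE.
 */
#include <unistd.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>

int bind_vsock_dgram(unsigned int port)
{
	struct sockaddr_vm addr = {
		.svm_family = AF_VSOCK,
		.svm_cid = VMADDR_CID_ANY,
		.svm_port = port,	/* or VMADDR_PORT_ANY */
	};
	int fd = socket(AF_VSOCK, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}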
1680static int vmci_transport_dgram_enqueue(
1681 struct vsock_sock *vsk,
1682 struct sockaddr_vm *remote_addr,
1683 struct iovec *iov,
1684 size_t len)
1685{
1686 int err;
1687 struct vmci_datagram *dg;
1688
1689 if (len > VMCI_MAX_DG_PAYLOAD_SIZE)
1690 return -EMSGSIZE;
1691
1692 if (!vmci_transport_allow_dgram(vsk, remote_addr->svm_cid))
1693 return -EPERM;
1694
1695 /* Allocate a buffer for the user's message and our packet header. */
1696 dg = kmalloc(len + sizeof(*dg), GFP_KERNEL);
1697 if (!dg)
1698 return -ENOMEM;
1699
1700 memcpy_fromiovec(VMCI_DG_PAYLOAD(dg), iov, len);
1701
1702 dg->dst = vmci_make_handle(remote_addr->svm_cid,
1703 remote_addr->svm_port);
1704 dg->src = vmci_make_handle(vsk->local_addr.svm_cid,
1705 vsk->local_addr.svm_port);
1706 dg->payload_size = len;
1707
1708 err = vmci_datagram_send(dg);
1709 kfree(dg);
1710 if (err < 0)
1711 return vmci_transport_error_to_vsock_error(err);
1712
1713 return err - sizeof(*dg);
1714}
1715
1716static int vmci_transport_dgram_dequeue(struct kiocb *kiocb,
1717 struct vsock_sock *vsk,
1718 struct msghdr *msg, size_t len,
1719 int flags)
1720{
1721 int err;
1722 int noblock;
1723 struct vmci_datagram *dg;
1724 size_t payload_len;
1725 struct sk_buff *skb;
1726
1727 noblock = flags & MSG_DONTWAIT;
1728
1729 if (flags & MSG_OOB || flags & MSG_ERRQUEUE)
1730 return -EOPNOTSUPP;
1731
1732 /* Retrieve the head sk_buff from the socket's receive queue. */
1733 err = 0;
1734 skb = skb_recv_datagram(&vsk->sk, flags, noblock, &err);
1735 if (err)
1736 return err;
1737
1738 if (!skb)
1739 return -EAGAIN;
1740
1741 dg = (struct vmci_datagram *)skb->data;
1742 if (!dg)
1743 /* err is 0, meaning we read zero bytes. */
1744 goto out;
1745
1746 payload_len = dg->payload_size;
1747 /* Ensure the sk_buff matches the payload size claimed in the packet. */
1748 if (payload_len != skb->len - sizeof(*dg)) {
1749 err = -EINVAL;
1750 goto out;
1751 }
1752
1753 if (payload_len > len) {
1754 payload_len = len;
1755 msg->msg_flags |= MSG_TRUNC;
1756 }
1757
1758 /* Place the datagram payload in the user's iovec. */
1759 err = skb_copy_datagram_iovec(skb, sizeof(*dg), msg->msg_iov,
1760 payload_len);
1761 if (err)
1762 goto out;
1763
1764 msg->msg_namelen = 0;
1765 if (msg->msg_name) {
1766 struct sockaddr_vm *vm_addr;
1767
1768 /* Provide the address of the sender. */
1769 vm_addr = (struct sockaddr_vm *)msg->msg_name;
1770 vsock_addr_init(vm_addr, dg->src.context, dg->src.resource);
1771 msg->msg_namelen = sizeof(*vm_addr);
1772 }
1773 err = payload_len;
1774
1775out:
1776 skb_free_datagram(&vsk->sk, skb);
1777 return err;
1778}
1779
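The enqueue/dequeue pair above maps directly onto sendto()/recvfrom(). A hedged sketch follows; the buffer size and peer address are illustrative, and MSG_TRUNC is what the dequeue path sets when the payload outgrows the caller's buffer:

/* Sketch, not part of the patch: one datagram round trip. Payloads
 * larger than VMCI_MAX_DG_PAYLOAD_SIZE are rejected with -EMSGSIZE by
 * the enqueue path above.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>

void dgram_round_trip(int fd, unsigned int peer_cid, unsigned int peer_port)
{
	struct sockaddr_vm peer = {
		.svm_family = AF_VSOCK,
		.svm_cid = peer_cid,
		.svm_port = peer_port,
	};
	char buf[128];
	ssize_t n;

	if (sendto(fd, "ping", 4, 0,
		   (struct sockaddr *)&peer, sizeof(peer)) < 0)
		return;

	n = recvfrom(fd, buf, sizeof(buf), 0, NULL, NULL);
	if (n >= 0)
		printf("received %zd bytes\n", n);
}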
1780static bool vmci_transport_dgram_allow(u32 cid, u32 port)
1781{
1782 if (cid == VMADDR_CID_HYPERVISOR) {
1783 /* Registrations of PBRPC Servers do not modify VMX/Hypervisor
1784 * state and are allowed.
1785 */
1786 return port == VMCI_UNITY_PBRPC_REGISTER;
1787 }
1788
1789 return true;
1790}
1791
1792static int vmci_transport_connect(struct vsock_sock *vsk)
1793{
1794 int err;
1795 bool old_pkt_proto = false;
1796 struct sock *sk = &vsk->sk;
1797
1798 if (vmci_transport_old_proto_override(&old_pkt_proto) &&
1799 old_pkt_proto) {
1800 err = vmci_transport_send_conn_request(
1801 sk, vmci_trans(vsk)->queue_pair_size);
1802 if (err < 0) {
1803 sk->sk_state = SS_UNCONNECTED;
1804 return err;
1805 }
1806 } else {
1807 int supported_proto_versions =
1808 vmci_transport_new_proto_supported_versions();
1809 err = vmci_transport_send_conn_request2(
1810 sk, vmci_trans(vsk)->queue_pair_size,
1811 supported_proto_versions);
1812 if (err < 0) {
1813 sk->sk_state = SS_UNCONNECTED;
1814 return err;
1815 }
1816
1817 vsk->sent_request = true;
1818 }
1819
1820 return err;
1821}
1822
1823static ssize_t vmci_transport_stream_dequeue(
1824 struct vsock_sock *vsk,
1825 struct iovec *iov,
1826 size_t len,
1827 int flags)
1828{
1829 if (flags & MSG_PEEK)
1830 return vmci_qpair_peekv(vmci_trans(vsk)->qpair, iov, len, 0);
1831 else
1832 return vmci_qpair_dequev(vmci_trans(vsk)->qpair, iov, len, 0);
1833}
1834
1835static ssize_t vmci_transport_stream_enqueue(
1836 struct vsock_sock *vsk,
1837 struct iovec *iov,
1838 size_t len)
1839{
1840 return vmci_qpair_enquev(vmci_trans(vsk)->qpair, iov, len, 0);
1841}
1842
1843static s64 vmci_transport_stream_has_data(struct vsock_sock *vsk)
1844{
1845 return vmci_qpair_consume_buf_ready(vmci_trans(vsk)->qpair);
1846}
1847
1848static s64 vmci_transport_stream_has_space(struct vsock_sock *vsk)
1849{
1850 return vmci_qpair_produce_free_space(vmci_trans(vsk)->qpair);
1851}
1852
1853static u64 vmci_transport_stream_rcvhiwat(struct vsock_sock *vsk)
1854{
1855 return vmci_trans(vsk)->consume_size;
1856}
1857
1858static bool vmci_transport_stream_is_active(struct vsock_sock *vsk)
1859{
1860 return !vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle);
1861}
1862
1863static u64 vmci_transport_get_buffer_size(struct vsock_sock *vsk)
1864{
1865 return vmci_trans(vsk)->queue_pair_size;
1866}
1867
1868static u64 vmci_transport_get_min_buffer_size(struct vsock_sock *vsk)
1869{
1870 return vmci_trans(vsk)->queue_pair_min_size;
1871}
1872
1873static u64 vmci_transport_get_max_buffer_size(struct vsock_sock *vsk)
1874{
1875 return vmci_trans(vsk)->queue_pair_max_size;
1876}
1877
1878static void vmci_transport_set_buffer_size(struct vsock_sock *vsk, u64 val)
1879{
1880 if (val < vmci_trans(vsk)->queue_pair_min_size)
1881 vmci_trans(vsk)->queue_pair_min_size = val;
1882 if (val > vmci_trans(vsk)->queue_pair_max_size)
1883 vmci_trans(vsk)->queue_pair_max_size = val;
1884 vmci_trans(vsk)->queue_pair_size = val;
1885}
1886
1887static void vmci_transport_set_min_buffer_size(struct vsock_sock *vsk,
1888 u64 val)
1889{
1890 if (val > vmci_trans(vsk)->queue_pair_size)
1891 vmci_trans(vsk)->queue_pair_size = val;
1892 vmci_trans(vsk)->queue_pair_min_size = val;
1893}
1894
1895static void vmci_transport_set_max_buffer_size(struct vsock_sock *vsk,
1896 u64 val)
1897{
1898 if (val < vmci_trans(vsk)->queue_pair_size)
1899 vmci_trans(vsk)->queue_pair_size = val;
1900 vmci_trans(vsk)->queue_pair_max_size = val;
1901}
1902
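These six accessors keep the invariant min <= size <= max by clamping whichever bound a new value crosses. A hedged user-space sketch; it assumes the SO_VM_SOCKETS_BUFFER_SIZE option name and the AF_VSOCK option level from the uapi header accompanying this series:

/* Sketch, not part of the patch: tune the queue pair size. Setting the
 * size pulls the min bound down or the max bound up as needed, so
 * min <= size <= max always holds afterwards.
 */
#include <sys/socket.h>
#include <linux/vm_sockets.h>

int set_vsock_buffer(int fd, unsigned long long bytes)
{
	return setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
			  &bytes, sizeof(bytes));
}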
1903static int vmci_transport_notify_poll_in(
1904 struct vsock_sock *vsk,
1905 size_t target,
1906 bool *data_ready_now)
1907{
1908 return vmci_trans(vsk)->notify_ops->poll_in(
1909 &vsk->sk, target, data_ready_now);
1910}
1911
1912static int vmci_transport_notify_poll_out(
1913 struct vsock_sock *vsk,
1914 size_t target,
1915 bool *space_available_now)
1916{
1917 return vmci_trans(vsk)->notify_ops->poll_out(
1918 &vsk->sk, target, space_available_now);
1919}
1920
1921static int vmci_transport_notify_recv_init(
1922 struct vsock_sock *vsk,
1923 size_t target,
1924 struct vsock_transport_recv_notify_data *data)
1925{
1926 return vmci_trans(vsk)->notify_ops->recv_init(
1927 &vsk->sk, target,
1928 (struct vmci_transport_recv_notify_data *)data);
1929}
1930
1931static int vmci_transport_notify_recv_pre_block(
1932 struct vsock_sock *vsk,
1933 size_t target,
1934 struct vsock_transport_recv_notify_data *data)
1935{
1936 return vmci_trans(vsk)->notify_ops->recv_pre_block(
1937 &vsk->sk, target,
1938 (struct vmci_transport_recv_notify_data *)data);
1939}
1940
1941static int vmci_transport_notify_recv_pre_dequeue(
1942 struct vsock_sock *vsk,
1943 size_t target,
1944 struct vsock_transport_recv_notify_data *data)
1945{
1946 return vmci_trans(vsk)->notify_ops->recv_pre_dequeue(
1947 &vsk->sk, target,
1948 (struct vmci_transport_recv_notify_data *)data);
1949}
1950
1951static int vmci_transport_notify_recv_post_dequeue(
1952 struct vsock_sock *vsk,
1953 size_t target,
1954 ssize_t copied,
1955 bool data_read,
1956 struct vsock_transport_recv_notify_data *data)
1957{
1958 return vmci_trans(vsk)->notify_ops->recv_post_dequeue(
1959 &vsk->sk, target, copied, data_read,
1960 (struct vmci_transport_recv_notify_data *)data);
1961}
1962
1963static int vmci_transport_notify_send_init(
1964 struct vsock_sock *vsk,
1965 struct vsock_transport_send_notify_data *data)
1966{
1967 return vmci_trans(vsk)->notify_ops->send_init(
1968 &vsk->sk,
1969 (struct vmci_transport_send_notify_data *)data);
1970}
1971
1972static int vmci_transport_notify_send_pre_block(
1973 struct vsock_sock *vsk,
1974 struct vsock_transport_send_notify_data *data)
1975{
1976 return vmci_trans(vsk)->notify_ops->send_pre_block(
1977 &vsk->sk,
1978 (struct vmci_transport_send_notify_data *)data);
1979}
1980
1981static int vmci_transport_notify_send_pre_enqueue(
1982 struct vsock_sock *vsk,
1983 struct vsock_transport_send_notify_data *data)
1984{
1985 return vmci_trans(vsk)->notify_ops->send_pre_enqueue(
1986 &vsk->sk,
1987 (struct vmci_transport_send_notify_data *)data);
1988}
1989
1990static int vmci_transport_notify_send_post_enqueue(
1991 struct vsock_sock *vsk,
1992 ssize_t written,
1993 struct vsock_transport_send_notify_data *data)
1994{
1995 return vmci_trans(vsk)->notify_ops->send_post_enqueue(
1996 &vsk->sk, written,
1997 (struct vmci_transport_send_notify_data *)data);
1998}
1999
2000static bool vmci_transport_old_proto_override(bool *old_pkt_proto)
2001{
2002 if (PROTOCOL_OVERRIDE != -1) {
2003 if (PROTOCOL_OVERRIDE == 0)
2004 *old_pkt_proto = true;
2005 else
2006 *old_pkt_proto = false;
2007
2008 pr_info("Proto override in use\n");
2009 return true;
2010 }
2011
2012 return false;
2013}
2014
2015static bool vmci_transport_proto_to_notify_struct(struct sock *sk,
2016 u16 *proto,
2017 bool old_pkt_proto)
2018{
2019 struct vsock_sock *vsk = vsock_sk(sk);
2020
2021 if (old_pkt_proto) {
2022 if (*proto != VSOCK_PROTO_INVALID) {
2023 pr_err("Can't set both an old and new protocol\n");
2024 return false;
2025 }
2026 vmci_trans(vsk)->notify_ops = &vmci_transport_notify_pkt_ops;
2027 goto exit;
2028 }
2029
2030 switch (*proto) {
2031 case VSOCK_PROTO_PKT_ON_NOTIFY:
2032 vmci_trans(vsk)->notify_ops =
2033 &vmci_transport_notify_pkt_q_state_ops;
2034 break;
2035 default:
2036 pr_err("Unknown notify protocol version\n");
2037 return false;
2038 }
2039
2040exit:
2041 vmci_trans(vsk)->notify_ops->socket_init(sk);
2042 return true;
2043}
2044
2045static u16 vmci_transport_new_proto_supported_versions(void)
2046{
2047 if (PROTOCOL_OVERRIDE != -1)
2048 return PROTOCOL_OVERRIDE;
2049
2050 return VSOCK_PROTO_ALL_SUPPORTED;
2051}
2052
2053static u32 vmci_transport_get_local_cid(void)
2054{
2055 return vmci_get_context_id();
2056}
2057
2058static struct vsock_transport vmci_transport = {
2059 .init = vmci_transport_socket_init,
2060 .destruct = vmci_transport_destruct,
2061 .release = vmci_transport_release,
2062 .connect = vmci_transport_connect,
2063 .dgram_bind = vmci_transport_dgram_bind,
2064 .dgram_dequeue = vmci_transport_dgram_dequeue,
2065 .dgram_enqueue = vmci_transport_dgram_enqueue,
2066 .dgram_allow = vmci_transport_dgram_allow,
2067 .stream_dequeue = vmci_transport_stream_dequeue,
2068 .stream_enqueue = vmci_transport_stream_enqueue,
2069 .stream_has_data = vmci_transport_stream_has_data,
2070 .stream_has_space = vmci_transport_stream_has_space,
2071 .stream_rcvhiwat = vmci_transport_stream_rcvhiwat,
2072 .stream_is_active = vmci_transport_stream_is_active,
2073 .stream_allow = vmci_transport_stream_allow,
2074 .notify_poll_in = vmci_transport_notify_poll_in,
2075 .notify_poll_out = vmci_transport_notify_poll_out,
2076 .notify_recv_init = vmci_transport_notify_recv_init,
2077 .notify_recv_pre_block = vmci_transport_notify_recv_pre_block,
2078 .notify_recv_pre_dequeue = vmci_transport_notify_recv_pre_dequeue,
2079 .notify_recv_post_dequeue = vmci_transport_notify_recv_post_dequeue,
2080 .notify_send_init = vmci_transport_notify_send_init,
2081 .notify_send_pre_block = vmci_transport_notify_send_pre_block,
2082 .notify_send_pre_enqueue = vmci_transport_notify_send_pre_enqueue,
2083 .notify_send_post_enqueue = vmci_transport_notify_send_post_enqueue,
2084 .shutdown = vmci_transport_shutdown,
2085 .set_buffer_size = vmci_transport_set_buffer_size,
2086 .set_min_buffer_size = vmci_transport_set_min_buffer_size,
2087 .set_max_buffer_size = vmci_transport_set_max_buffer_size,
2088 .get_buffer_size = vmci_transport_get_buffer_size,
2089 .get_min_buffer_size = vmci_transport_get_min_buffer_size,
2090 .get_max_buffer_size = vmci_transport_get_max_buffer_size,
2091 .get_local_cid = vmci_transport_get_local_cid,
2092};
2093
2094static int __init vmci_transport_init(void)
2095{
2096 int err;
2097
2098 /* Create the datagram handle that we will use to send and receive all
2099 * VSocket control messages for this context.
2100 */
2101 err = vmci_transport_datagram_create_hnd(VMCI_TRANSPORT_PACKET_RID,
2102 VMCI_FLAG_ANYCID_DG_HND,
2103 vmci_transport_recv_stream_cb,
2104 NULL,
2105 &vmci_transport_stream_handle);
2106 if (err < VMCI_SUCCESS) {
2107 pr_err("Unable to create datagram handle. (%d)\n", err);
2108 return vmci_transport_error_to_vsock_error(err);
2109 }
2110
2111 err = vmci_event_subscribe(VMCI_EVENT_QP_RESUMED,
2112 vmci_transport_qp_resumed_cb,
2113 NULL, &vmci_transport_qp_resumed_sub_id);
2114 if (err < VMCI_SUCCESS) {
2115 pr_err("Unable to subscribe to resumed event. (%d)\n", err);
2116 err = vmci_transport_error_to_vsock_error(err);
2117 vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID;
2118 goto err_destroy_stream_handle;
2119 }
2120
2121 err = vsock_core_init(&vmci_transport);
2122 if (err < 0)
2123 goto err_unsubscribe;
2124
2125 return 0;
2126
2127err_unsubscribe:
2128 vmci_event_unsubscribe(vmci_transport_qp_resumed_sub_id);
2129err_destroy_stream_handle:
2130 vmci_datagram_destroy_handle(vmci_transport_stream_handle);
2131 return err;
2132}
2133module_init(vmci_transport_init);
2134
2135static void __exit vmci_transport_exit(void)
2136{
2137 if (!vmci_handle_is_invalid(vmci_transport_stream_handle)) {
2138 if (vmci_datagram_destroy_handle(
2139 vmci_transport_stream_handle) != VMCI_SUCCESS)
2140 pr_err("Couldn't destroy datagram handle\n");
2141 vmci_transport_stream_handle = VMCI_INVALID_HANDLE;
2142 }
2143
2144 if (vmci_transport_qp_resumed_sub_id != VMCI_INVALID_ID) {
2145 vmci_event_unsubscribe(vmci_transport_qp_resumed_sub_id);
2146 vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID;
2147 }
2148
2149 vsock_core_exit();
2150}
2151module_exit(vmci_transport_exit);
2152
2153MODULE_AUTHOR("VMware, Inc.");
2154MODULE_DESCRIPTION("VMCI transport for Virtual Sockets");
2155MODULE_LICENSE("GPL v2");
2156MODULE_ALIAS("vmware_vsock");
2157MODULE_ALIAS_NETPROTO(PF_VSOCK);
diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h
new file mode 100644
index 000000000000..1bf991803ec0
--- /dev/null
+++ b/net/vmw_vsock/vmci_transport.h
@@ -0,0 +1,139 @@
1/*
2 * VMware vSockets Driver
3 *
4 * Copyright (C) 2013 VMware, Inc. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation version 2 and no later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 */
15
16#ifndef _VMCI_TRANSPORT_H_
17#define _VMCI_TRANSPORT_H_
18
19#include <linux/vmw_vmci_defs.h>
20#include <linux/vmw_vmci_api.h>
21
22#include "vsock_addr.h"
23#include "af_vsock.h"
24
25/* If the packet format changes in a release then this should change too. */
26#define VMCI_TRANSPORT_PACKET_VERSION 1
27
28/* The resource ID on which control packets are sent. */
29#define VMCI_TRANSPORT_PACKET_RID 1
30
31#define VSOCK_PROTO_INVALID 0
32#define VSOCK_PROTO_PKT_ON_NOTIFY (1 << 0)
33#define VSOCK_PROTO_ALL_SUPPORTED (VSOCK_PROTO_PKT_ON_NOTIFY)
34
35#define vmci_trans(_vsk) ((struct vmci_transport *)((_vsk)->trans))
36
37enum vmci_transport_packet_type {
38 VMCI_TRANSPORT_PACKET_TYPE_INVALID = 0,
39 VMCI_TRANSPORT_PACKET_TYPE_REQUEST,
40 VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE,
41 VMCI_TRANSPORT_PACKET_TYPE_OFFER,
42 VMCI_TRANSPORT_PACKET_TYPE_ATTACH,
43 VMCI_TRANSPORT_PACKET_TYPE_WROTE,
44 VMCI_TRANSPORT_PACKET_TYPE_READ,
45 VMCI_TRANSPORT_PACKET_TYPE_RST,
46 VMCI_TRANSPORT_PACKET_TYPE_SHUTDOWN,
47 VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE,
48 VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ,
49 VMCI_TRANSPORT_PACKET_TYPE_REQUEST2,
50 VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2,
51 VMCI_TRANSPORT_PACKET_TYPE_MAX
52};
53
54struct vmci_transport_waiting_info {
55 u64 generation;
56 u64 offset;
57};
58
59/* Control packet type for STREAM sockets. DGRAMs have no control packets nor
60 * special packet header for data packets; they are just raw VMCI DGRAM
61 * messages. For STREAMs, control packets are sent over the control channel
62 * while data is written and read directly from queue pairs with no packet
63 * format.
64 */
65struct vmci_transport_packet {
66 struct vmci_datagram dg;
67 u8 version;
68 u8 type;
69 u16 proto;
70 u32 src_port;
71 u32 dst_port;
72 u32 _reserved2;
73 union {
74 u64 size;
75 u64 mode;
76 struct vmci_handle handle;
77 struct vmci_transport_waiting_info wait;
78 } u;
79};
80
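To make the header layout concrete, here is a hypothetical helper (not in this patch) that fills the fixed fields for a SHUTDOWN control packet; the embedded struct vmci_datagram (dst/src handles and payload_size) would still have to be initialized before the packet is handed to vmci_datagram_send():

/* Sketch with a hypothetical helper: populate a control packet header.
 * Stream data itself never carries this header; it moves through the
 * queue pair directly.
 */
static void example_fill_shutdown(struct vmci_transport_packet *pkt,
				  u32 src_port, u32 dst_port, u64 mode)
{
	pkt->version = VMCI_TRANSPORT_PACKET_VERSION;
	pkt->type = VMCI_TRANSPORT_PACKET_TYPE_SHUTDOWN;
	pkt->proto = VSOCK_PROTO_INVALID;
	pkt->src_port = src_port;
	pkt->dst_port = dst_port;
	pkt->u.mode = mode;	/* RCV_SHUTDOWN and/or SEND_SHUTDOWN */
}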
81struct vmci_transport_notify_pkt {
82 u64 write_notify_window;
83 u64 write_notify_min_window;
84 bool peer_waiting_read;
85 bool peer_waiting_write;
86 bool peer_waiting_write_detected;
87 bool sent_waiting_read;
88 bool sent_waiting_write;
89 struct vmci_transport_waiting_info peer_waiting_read_info;
90 struct vmci_transport_waiting_info peer_waiting_write_info;
91 u64 produce_q_generation;
92 u64 consume_q_generation;
93};
94
95struct vmci_transport_notify_pkt_q_state {
96 u64 write_notify_window;
97 u64 write_notify_min_window;
98 bool peer_waiting_write;
99 bool peer_waiting_write_detected;
100};
101
102union vmci_transport_notify {
103 struct vmci_transport_notify_pkt pkt;
104 struct vmci_transport_notify_pkt_q_state pkt_q_state;
105};
106
107/* Our transport-specific data. */
108struct vmci_transport {
109 /* For DGRAMs. */
110 struct vmci_handle dg_handle;
111 /* For STREAMs. */
112 struct vmci_handle qp_handle;
113 struct vmci_qp *qpair;
114 u64 produce_size;
115 u64 consume_size;
116 u64 queue_pair_size;
117 u64 queue_pair_min_size;
118 u64 queue_pair_max_size;
119 u32 attach_sub_id;
120 u32 detach_sub_id;
121 union vmci_transport_notify notify;
122 struct vmci_transport_notify_ops *notify_ops;
123};
124
125int vmci_transport_register(void);
126void vmci_transport_unregister(void);
127
128int vmci_transport_send_wrote_bh(struct sockaddr_vm *dst,
129 struct sockaddr_vm *src);
130int vmci_transport_send_read_bh(struct sockaddr_vm *dst,
131 struct sockaddr_vm *src);
132int vmci_transport_send_wrote(struct sock *sk);
133int vmci_transport_send_read(struct sock *sk);
134int vmci_transport_send_waiting_write(struct sock *sk,
135 struct vmci_transport_waiting_info *wait);
136int vmci_transport_send_waiting_read(struct sock *sk,
137 struct vmci_transport_waiting_info *wait);
138
139#endif
diff --git a/net/vmw_vsock/vmci_transport_notify.c b/net/vmw_vsock/vmci_transport_notify.c
new file mode 100644
index 000000000000..9a730744e7bc
--- /dev/null
+++ b/net/vmw_vsock/vmci_transport_notify.c
@@ -0,0 +1,680 @@
1/*
2 * VMware vSockets Driver
3 *
4 * Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation version 2 and no later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 */
15
16#include <linux/types.h>
17#include <linux/socket.h>
18#include <linux/stddef.h>
19#include <net/sock.h>
20
21#include "vmci_transport_notify.h"
22
23#define PKT_FIELD(vsk, field_name) (vmci_trans(vsk)->notify.pkt.field_name)
24
25static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk)
26{
27#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
28 bool retval;
29 u64 notify_limit;
30
31 if (!PKT_FIELD(vsk, peer_waiting_write))
32 return false;
33
34#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
35 /* When the sender blocks, we take that as a sign that the sender is
36 * faster than the receiver. To reduce the transmit rate of the sender,
37 * we delay the sending of the read notification by decreasing the
38 * write_notify_window. The notification is delayed until the number of
39 * bytes used in the queue drops below the write_notify_window.
40 */
41
42 if (!PKT_FIELD(vsk, peer_waiting_write_detected)) {
43 PKT_FIELD(vsk, peer_waiting_write_detected) = true;
44 if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) {
45 PKT_FIELD(vsk, write_notify_window) =
46 PKT_FIELD(vsk, write_notify_min_window);
47 } else {
48 PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE;
49 if (PKT_FIELD(vsk, write_notify_window) <
50 PKT_FIELD(vsk, write_notify_min_window))
51 PKT_FIELD(vsk, write_notify_window) =
52 PKT_FIELD(vsk, write_notify_min_window);
53
54 }
55 }
56 notify_limit = vmci_trans(vsk)->consume_size -
57 PKT_FIELD(vsk, write_notify_window);
58#else
59 notify_limit = 0;
60#endif
61
62 /* For now we ignore the wait information and just see if the free
63 * space exceeds the notify limit. Note that improving this function
64 * to be more intelligent will not require a protocol change and will
65 * retain compatibility between endpoints with mixed versions of this
66 * function.
67 *
68 * The notify_limit is used to delay notifications in the case where
69 * flow control is enabled. Below the test is expressed in terms of
70 * free space in the queue: if free_space > ConsumeSize -
71	 * write_notify_window then notify. An alternate way of expressing this
72	 * is to rewrite the expression to use the data ready in the receive
73	 * queue: if write_notify_window > bufferReady then notify, as
74 * free_space == ConsumeSize - bufferReady.
75 */
76 retval = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) >
77 notify_limit;
78#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
79 if (retval) {
80 /*
81 * Once we notify the peer, we reset the detected flag so the
82 * next wait will again cause a decrease in the window size.
83 */
84
85 PKT_FIELD(vsk, peer_waiting_write_detected) = false;
86 }
87#endif
88 return retval;
89#else
90 return true;
91#endif
92}
93
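A worked instance of the window arithmetic above, with illustrative numbers: take consume_size = 65536 and write_notify_window = 65536. The first detected peer wait shrinks the window by PAGE_SIZE (4096 here) to 61440, giving notify_limit = 65536 - 61440 = 4096; the READ notification is then withheld until more than 4096 bytes of the queue are free, i.e. until the backlog drops below 61440 bytes:

/* Sketch, not part of the patch: the delayed-notification threshold
 * computed by the function above.
 */
static u64 example_notify_limit(u64 consume_size, u64 write_notify_window)
{
	/* Notify only once free space exceeds this many bytes. */
	return consume_size - write_notify_window;
}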
94static bool vmci_transport_notify_waiting_read(struct vsock_sock *vsk)
95{
96#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
97 if (!PKT_FIELD(vsk, peer_waiting_read))
98 return false;
99
100 /* For now we ignore the wait information and just see if there is any
101 * data for our peer to read. Note that improving this function to be
102 * more intelligent will not require a protocol change and will retain
103 * compatibility between endpoints with mixed versions of this
104 * function.
105 */
106 return vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) > 0;
107#else
108 return true;
109#endif
110}
111
112static void
113vmci_transport_handle_waiting_read(struct sock *sk,
114 struct vmci_transport_packet *pkt,
115 bool bottom_half,
116 struct sockaddr_vm *dst,
117 struct sockaddr_vm *src)
118{
119#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
120 struct vsock_sock *vsk;
121
122 vsk = vsock_sk(sk);
123
124 PKT_FIELD(vsk, peer_waiting_read) = true;
125 memcpy(&PKT_FIELD(vsk, peer_waiting_read_info), &pkt->u.wait,
126 sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));
127
128 if (vmci_transport_notify_waiting_read(vsk)) {
129 bool sent;
130
131 if (bottom_half)
132 sent = vmci_transport_send_wrote_bh(dst, src) > 0;
133 else
134 sent = vmci_transport_send_wrote(sk) > 0;
135
136 if (sent)
137 PKT_FIELD(vsk, peer_waiting_read) = false;
138 }
139#endif
140}
141
142static void
143vmci_transport_handle_waiting_write(struct sock *sk,
144 struct vmci_transport_packet *pkt,
145 bool bottom_half,
146 struct sockaddr_vm *dst,
147 struct sockaddr_vm *src)
148{
149#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
150 struct vsock_sock *vsk;
151
152 vsk = vsock_sk(sk);
153
154 PKT_FIELD(vsk, peer_waiting_write) = true;
155 memcpy(&PKT_FIELD(vsk, peer_waiting_write_info), &pkt->u.wait,
156 sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));
157
158 if (vmci_transport_notify_waiting_write(vsk)) {
159 bool sent;
160
161 if (bottom_half)
162 sent = vmci_transport_send_read_bh(dst, src) > 0;
163 else
164 sent = vmci_transport_send_read(sk) > 0;
165
166 if (sent)
167 PKT_FIELD(vsk, peer_waiting_write) = false;
168 }
169#endif
170}
171
172static void
173vmci_transport_handle_read(struct sock *sk,
174 struct vmci_transport_packet *pkt,
175 bool bottom_half,
176 struct sockaddr_vm *dst, struct sockaddr_vm *src)
177{
178#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
179 struct vsock_sock *vsk;
180
181 vsk = vsock_sk(sk);
182 PKT_FIELD(vsk, sent_waiting_write) = false;
183#endif
184
185 sk->sk_write_space(sk);
186}
187
188static bool send_waiting_read(struct sock *sk, u64 room_needed)
189{
190#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
191 struct vsock_sock *vsk;
192 struct vmci_transport_waiting_info waiting_info;
193 u64 tail;
194 u64 head;
195 u64 room_left;
196 bool ret;
197
198 vsk = vsock_sk(sk);
199
200 if (PKT_FIELD(vsk, sent_waiting_read))
201 return true;
202
203 if (PKT_FIELD(vsk, write_notify_window) <
204 vmci_trans(vsk)->consume_size)
205 PKT_FIELD(vsk, write_notify_window) =
206 min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE,
207 vmci_trans(vsk)->consume_size);
208
209 vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair, &tail, &head);
210 room_left = vmci_trans(vsk)->consume_size - head;
211 if (room_needed >= room_left) {
212 waiting_info.offset = room_needed - room_left;
213 waiting_info.generation =
214 PKT_FIELD(vsk, consume_q_generation) + 1;
215 } else {
216 waiting_info.offset = head + room_needed;
217 waiting_info.generation = PKT_FIELD(vsk, consume_q_generation);
218 }
219
220 ret = vmci_transport_send_waiting_read(sk, &waiting_info) > 0;
221 if (ret)
222 PKT_FIELD(vsk, sent_waiting_read) = true;
223
224 return ret;
225#else
226 return true;
227#endif
228}
229
230static bool send_waiting_write(struct sock *sk, u64 room_needed)
231{
232#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
233 struct vsock_sock *vsk;
234 struct vmci_transport_waiting_info waiting_info;
235 u64 tail;
236 u64 head;
237 u64 room_left;
238 bool ret;
239
240 vsk = vsock_sk(sk);
241
242 if (PKT_FIELD(vsk, sent_waiting_write))
243 return true;
244
245 vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair, &tail, &head);
246 room_left = vmci_trans(vsk)->produce_size - tail;
247 if (room_needed + 1 >= room_left) {
248 /* Wraps around to current generation. */
249 waiting_info.offset = room_needed + 1 - room_left;
250 waiting_info.generation = PKT_FIELD(vsk, produce_q_generation);
251 } else {
252 waiting_info.offset = tail + room_needed + 1;
253 waiting_info.generation =
254 PKT_FIELD(vsk, produce_q_generation) - 1;
255 }
256
257 ret = vmci_transport_send_waiting_write(sk, &waiting_info) > 0;
258 if (ret)
259 PKT_FIELD(vsk, sent_waiting_write) = true;
260
261 return ret;
262#else
263 return true;
264#endif
265}
266
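The offset/generation arithmetic in send_waiting_write() is easiest to follow with numbers, restated here as a self-contained sketch (the values in the comments are illustrative, not from the patch):

/* Sketch: where the producer's wait target lands. With
 * produce_size = 4096 and tail = 4000, room_left = 96. Needing 200
 * bytes wraps: offset = 200 + 1 - 96 = 105 in the current generation,
 * since the producer will have wrapped by the time it writes there.
 * Needing only 50 bytes does not wrap: offset = 4051, expressed
 * against generation - 1.
 */
struct example_wait {
	u64 offset;
	u64 generation;
};

static struct example_wait example_produce_wait(u64 produce_size, u64 tail,
						u64 room_needed, u64 gen)
{
	u64 room_left = produce_size - tail;
	struct example_wait w;

	if (room_needed + 1 >= room_left) {
		w.offset = room_needed + 1 - room_left;
		w.generation = gen;
	} else {
		w.offset = tail + room_needed + 1;
		w.generation = gen - 1;
	}
	return w;
}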
267static int vmci_transport_send_read_notification(struct sock *sk)
268{
269 struct vsock_sock *vsk;
270 bool sent_read;
271 unsigned int retries;
272 int err;
273
274 vsk = vsock_sk(sk);
275 sent_read = false;
276 retries = 0;
277 err = 0;
278
279 if (vmci_transport_notify_waiting_write(vsk)) {
280 /* Notify the peer that we have read, retrying the send on
281 * failure up to our maximum value. XXX For now we just log
282 * the failure, but later we should schedule a work item to
283 * handle the resend until it succeeds. That would require
284 * keeping track of work items in the vsk and cleaning them up
285 * upon socket close.
286 */
287 while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
288 !sent_read &&
289 retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
290 err = vmci_transport_send_read(sk);
291 if (err >= 0)
292 sent_read = true;
293
294 retries++;
295 }
296
297 if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS)
298 pr_err("%p unable to send read notify to peer\n", sk);
299 else
300#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
301 PKT_FIELD(vsk, peer_waiting_write) = false;
302#endif
303
304 }
305 return err;
306}
307
308static void
309vmci_transport_handle_wrote(struct sock *sk,
310 struct vmci_transport_packet *pkt,
311 bool bottom_half,
312 struct sockaddr_vm *dst, struct sockaddr_vm *src)
313{
314#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
315 struct vsock_sock *vsk = vsock_sk(sk);
316 PKT_FIELD(vsk, sent_waiting_read) = false;
317#endif
318 sk->sk_data_ready(sk, 0);
319}
320
321static void vmci_transport_notify_pkt_socket_init(struct sock *sk)
322{
323 struct vsock_sock *vsk = vsock_sk(sk);
324
325 PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
326 PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
327 PKT_FIELD(vsk, peer_waiting_read) = false;
328 PKT_FIELD(vsk, peer_waiting_write) = false;
329 PKT_FIELD(vsk, peer_waiting_write_detected) = false;
330 PKT_FIELD(vsk, sent_waiting_read) = false;
331 PKT_FIELD(vsk, sent_waiting_write) = false;
332 PKT_FIELD(vsk, produce_q_generation) = 0;
333 PKT_FIELD(vsk, consume_q_generation) = 0;
334
335 memset(&PKT_FIELD(vsk, peer_waiting_read_info), 0,
336 sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));
337 memset(&PKT_FIELD(vsk, peer_waiting_write_info), 0,
338 sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));
339}
340
341static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk)
342{
343}
344
345static int
346vmci_transport_notify_pkt_poll_in(struct sock *sk,
347 size_t target, bool *data_ready_now)
348{
349 struct vsock_sock *vsk = vsock_sk(sk);
350
351 if (vsock_stream_has_data(vsk)) {
352 *data_ready_now = true;
353 } else {
354 /* We can't read right now because there is nothing in the
355 * queue. Ask for notifications when there is something to
356 * read.
357 */
358 if (sk->sk_state == SS_CONNECTED) {
359 if (!send_waiting_read(sk, 1))
360 return -1;
361
362 }
363 *data_ready_now = false;
364 }
365
366 return 0;
367}
368
369static int
370vmci_transport_notify_pkt_poll_out(struct sock *sk,
371 size_t target, bool *space_avail_now)
372{
373 s64 produce_q_free_space;
374 struct vsock_sock *vsk = vsock_sk(sk);
375
376 produce_q_free_space = vsock_stream_has_space(vsk);
377 if (produce_q_free_space > 0) {
378 *space_avail_now = true;
379 return 0;
380 } else if (produce_q_free_space == 0) {
381 /* This is a connected socket but we can't currently send data.
382 * Notify the peer that we are waiting if the queue is full. We
383 * only send a waiting write if the queue is full because
384 * otherwise we end up in an infinite WAITING_WRITE, READ,
385 * WAITING_WRITE, READ, etc. loop. Treat failing to send the
386 * notification as a socket error, passing that back through
387 * the mask.
388 */
389 if (!send_waiting_write(sk, 1))
390 return -1;
391
392 *space_avail_now = false;
393 }
394
395 return 0;
396}
397
398static int
399vmci_transport_notify_pkt_recv_init(
400 struct sock *sk,
401 size_t target,
402 struct vmci_transport_recv_notify_data *data)
403{
404 struct vsock_sock *vsk = vsock_sk(sk);
405
406#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
407 data->consume_head = 0;
408 data->produce_tail = 0;
409#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
410 data->notify_on_block = false;
411
412 if (PKT_FIELD(vsk, write_notify_min_window) < target + 1) {
413 PKT_FIELD(vsk, write_notify_min_window) = target + 1;
414 if (PKT_FIELD(vsk, write_notify_window) <
415 PKT_FIELD(vsk, write_notify_min_window)) {
416 /* If the current window is smaller than the new
417 * minimal window size, we need to reevaluate whether
418 * we need to notify the sender. If the number of ready
419 * bytes are smaller than the new window, we need to
420 * send a notification to the sender before we block.
421 */
422
423 PKT_FIELD(vsk, write_notify_window) =
424 PKT_FIELD(vsk, write_notify_min_window);
425 data->notify_on_block = true;
426 }
427 }
428#endif
429#endif
430
431 return 0;
432}
433
434static int
435vmci_transport_notify_pkt_recv_pre_block(
436 struct sock *sk,
437 size_t target,
438 struct vmci_transport_recv_notify_data *data)
439{
440 int err = 0;
441
442 /* Notify our peer that we are waiting for data to read. */
443 if (!send_waiting_read(sk, target)) {
444 err = -EHOSTUNREACH;
445 return err;
446 }
447#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
448 if (data->notify_on_block) {
449 err = vmci_transport_send_read_notification(sk);
450 if (err < 0)
451 return err;
452
453 data->notify_on_block = false;
454 }
455#endif
456
457 return err;
458}
459
460static int
461vmci_transport_notify_pkt_recv_pre_dequeue(
462 struct sock *sk,
463 size_t target,
464 struct vmci_transport_recv_notify_data *data)
465{
466 struct vsock_sock *vsk = vsock_sk(sk);
467
468 /* Now consume up to len bytes from the queue. Note that since we have
469	 * the socket locked, we should copy at least the ready bytes.
470 */
471#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
472 vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair,
473 &data->produce_tail,
474 &data->consume_head);
475#endif
476
477 return 0;
478}
479
480static int
481vmci_transport_notify_pkt_recv_post_dequeue(
482 struct sock *sk,
483 size_t target,
484 ssize_t copied,
485 bool data_read,
486 struct vmci_transport_recv_notify_data *data)
487{
488 struct vsock_sock *vsk;
489 int err;
490
491 vsk = vsock_sk(sk);
492 err = 0;
493
494 if (data_read) {
495#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
496 /* Detect a wrap-around to maintain queue generation. Note
497 * that this is safe since we hold the socket lock across the
498 * two queue pair operations.
499 */
500 if (copied >=
501 vmci_trans(vsk)->consume_size - data->consume_head)
502 PKT_FIELD(vsk, consume_q_generation)++;
503#endif
504
505 err = vmci_transport_send_read_notification(sk);
506 if (err < 0)
507 return err;
508
509 }
510 return err;
511}
512
513static int
514vmci_transport_notify_pkt_send_init(
515 struct sock *sk,
516 struct vmci_transport_send_notify_data *data)
517{
518#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
519 data->consume_head = 0;
520 data->produce_tail = 0;
521#endif
522
523 return 0;
524}
525
526static int
527vmci_transport_notify_pkt_send_pre_block(
528 struct sock *sk,
529 struct vmci_transport_send_notify_data *data)
530{
531 /* Notify our peer that we are waiting for room to write. */
532 if (!send_waiting_write(sk, 1))
533 return -EHOSTUNREACH;
534
535 return 0;
536}
537
538static int
539vmci_transport_notify_pkt_send_pre_enqueue(
540 struct sock *sk,
541 struct vmci_transport_send_notify_data *data)
542{
543 struct vsock_sock *vsk = vsock_sk(sk);
544
545#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
546 vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair,
547 &data->produce_tail,
548 &data->consume_head);
549#endif
550
551 return 0;
552}
553
554static int
555vmci_transport_notify_pkt_send_post_enqueue(
556 struct sock *sk,
557 ssize_t written,
558 struct vmci_transport_send_notify_data *data)
559{
560 int err = 0;
561 struct vsock_sock *vsk;
562 bool sent_wrote = false;
563 int retries = 0;
564
565 vsk = vsock_sk(sk);
566
567#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
568 /* Detect a wrap-around to maintain queue generation. Note that this
569 * is safe since we hold the socket lock across the two queue pair
570 * operations.
571 */
572 if (written >= vmci_trans(vsk)->produce_size - data->produce_tail)
573 PKT_FIELD(vsk, produce_q_generation)++;
574
575#endif
576
577 if (vmci_transport_notify_waiting_read(vsk)) {
578 /* Notify the peer that we have written, retrying the send on
579 * failure up to our maximum value. See the XXX comment for the
580 * corresponding piece of code in StreamRecvmsg() for potential
581 * improvements.
582 */
583 while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
584 !sent_wrote &&
585 retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
586 err = vmci_transport_send_wrote(sk);
587 if (err >= 0)
588 sent_wrote = true;
589
590 retries++;
591 }
592
593 if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
594 pr_err("%p unable to send wrote notify to peer\n", sk);
595 return err;
596 } else {
597#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
598 PKT_FIELD(vsk, peer_waiting_read) = false;
599#endif
600 }
601 }
602 return err;
603}
604
605static void
606vmci_transport_notify_pkt_handle_pkt(
607 struct sock *sk,
608 struct vmci_transport_packet *pkt,
609 bool bottom_half,
610 struct sockaddr_vm *dst,
611 struct sockaddr_vm *src, bool *pkt_processed)
612{
613 bool processed = false;
614
615 switch (pkt->type) {
616 case VMCI_TRANSPORT_PACKET_TYPE_WROTE:
617 vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src);
618 processed = true;
619 break;
620 case VMCI_TRANSPORT_PACKET_TYPE_READ:
621 vmci_transport_handle_read(sk, pkt, bottom_half, dst, src);
622 processed = true;
623 break;
624 case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE:
625 vmci_transport_handle_waiting_write(sk, pkt, bottom_half,
626 dst, src);
627 processed = true;
628 break;
629
630 case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ:
631 vmci_transport_handle_waiting_read(sk, pkt, bottom_half,
632 dst, src);
633 processed = true;
634 break;
635 }
636
637 if (pkt_processed)
638 *pkt_processed = processed;
639}
640
641static void vmci_transport_notify_pkt_process_request(struct sock *sk)
642{
643 struct vsock_sock *vsk = vsock_sk(sk);
644
645 PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
646 if (vmci_trans(vsk)->consume_size <
647 PKT_FIELD(vsk, write_notify_min_window))
648 PKT_FIELD(vsk, write_notify_min_window) =
649 vmci_trans(vsk)->consume_size;
650}
651
652static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
653{
654 struct vsock_sock *vsk = vsock_sk(sk);
655
656 PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
657 if (vmci_trans(vsk)->consume_size <
658 PKT_FIELD(vsk, write_notify_min_window))
659 PKT_FIELD(vsk, write_notify_min_window) =
660 vmci_trans(vsk)->consume_size;
661}
662
663/* Socket control packet based operations. */
664struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = {
665 vmci_transport_notify_pkt_socket_init,
666 vmci_transport_notify_pkt_socket_destruct,
667 vmci_transport_notify_pkt_poll_in,
668 vmci_transport_notify_pkt_poll_out,
669 vmci_transport_notify_pkt_handle_pkt,
670 vmci_transport_notify_pkt_recv_init,
671 vmci_transport_notify_pkt_recv_pre_block,
672 vmci_transport_notify_pkt_recv_pre_dequeue,
673 vmci_transport_notify_pkt_recv_post_dequeue,
674 vmci_transport_notify_pkt_send_init,
675 vmci_transport_notify_pkt_send_pre_block,
676 vmci_transport_notify_pkt_send_pre_enqueue,
677 vmci_transport_notify_pkt_send_post_enqueue,
678 vmci_transport_notify_pkt_process_request,
679 vmci_transport_notify_pkt_process_negotiate,
680};
diff --git a/net/vmw_vsock/vmci_transport_notify.h b/net/vmw_vsock/vmci_transport_notify.h
new file mode 100644
index 000000000000..7df793249b6c
--- /dev/null
+++ b/net/vmw_vsock/vmci_transport_notify.h
@@ -0,0 +1,83 @@
1/*
2 * VMware vSockets Driver
3 *
4 * Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation version 2 and no later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 */
15
16#ifndef __VMCI_TRANSPORT_NOTIFY_H__
17#define __VMCI_TRANSPORT_NOTIFY_H__
18
19#include <linux/types.h>
20#include <linux/vmw_vmci_defs.h>
21#include <linux/vmw_vmci_api.h>
22#include <linux/vm_sockets.h>
23
24#include "vmci_transport.h"
25
26/* Comment this out to compare with old protocol. */
27#define VSOCK_OPTIMIZATION_WAITING_NOTIFY 1
28#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
29/* Comment this out to remove flow control for the "new" protocol. */
30#define VSOCK_OPTIMIZATION_FLOW_CONTROL 1
31#endif
32
33#define VMCI_TRANSPORT_MAX_DGRAM_RESENDS 10
34
35struct vmci_transport_recv_notify_data {
36 u64 consume_head;
37 u64 produce_tail;
38 bool notify_on_block;
39};
40
41struct vmci_transport_send_notify_data {
42 u64 consume_head;
43 u64 produce_tail;
44};
45
46/* Socket notification callbacks. */
47struct vmci_transport_notify_ops {
48 void (*socket_init) (struct sock *sk);
49 void (*socket_destruct) (struct vsock_sock *vsk);
50 int (*poll_in) (struct sock *sk, size_t target,
51 bool *data_ready_now);
52 int (*poll_out) (struct sock *sk, size_t target,
53 bool *space_avail_now);
54 void (*handle_notify_pkt) (struct sock *sk,
55 struct vmci_transport_packet *pkt,
56 bool bottom_half, struct sockaddr_vm *dst,
57 struct sockaddr_vm *src,
58 bool *pkt_processed);
59 int (*recv_init) (struct sock *sk, size_t target,
60 struct vmci_transport_recv_notify_data *data);
61 int (*recv_pre_block) (struct sock *sk, size_t target,
62 struct vmci_transport_recv_notify_data *data);
63 int (*recv_pre_dequeue) (struct sock *sk, size_t target,
64 struct vmci_transport_recv_notify_data *data);
65 int (*recv_post_dequeue) (struct sock *sk, size_t target,
66 ssize_t copied, bool data_read,
67 struct vmci_transport_recv_notify_data *data);
68 int (*send_init) (struct sock *sk,
69 struct vmci_transport_send_notify_data *data);
70 int (*send_pre_block) (struct sock *sk,
71 struct vmci_transport_send_notify_data *data);
72 int (*send_pre_enqueue) (struct sock *sk,
73 struct vmci_transport_send_notify_data *data);
74 int (*send_post_enqueue) (struct sock *sk, ssize_t written,
75 struct vmci_transport_send_notify_data *data);
76 void (*process_request) (struct sock *sk);
77 void (*process_negotiate) (struct sock *sk);
78};
79
80extern struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops;
81extern struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops;
82
83#endif /* __VMCI_TRANSPORT_NOTIFY_H__ */
diff --git a/net/vmw_vsock/vmci_transport_notify_qstate.c b/net/vmw_vsock/vmci_transport_notify_qstate.c
new file mode 100644
index 000000000000..622bd7aa1016
--- /dev/null
+++ b/net/vmw_vsock/vmci_transport_notify_qstate.c
@@ -0,0 +1,438 @@
1/*
2 * VMware vSockets Driver
3 *
4 * Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation version 2 and no later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 */
15
16#include <linux/types.h>
17#include <linux/socket.h>
18#include <linux/stddef.h>
19#include <net/sock.h>
20
21#include "vmci_transport_notify.h"
22
23#define PKT_FIELD(vsk, field_name) \
24 (vmci_trans(vsk)->notify.pkt_q_state.field_name)
25
26static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk)
27{
28 bool retval;
29 u64 notify_limit;
30
31 if (!PKT_FIELD(vsk, peer_waiting_write))
32 return false;
33
34 /* When the sender blocks, we take that as a sign that the sender is
35 * faster than the receiver. To reduce the transmit rate of the sender,
36 * we delay the sending of the read notification by decreasing the
37 * write_notify_window. The notification is delayed until the number of
38 * bytes used in the queue drops below the write_notify_window.
39 */
40
41 if (!PKT_FIELD(vsk, peer_waiting_write_detected)) {
42 PKT_FIELD(vsk, peer_waiting_write_detected) = true;
43 if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) {
44 PKT_FIELD(vsk, write_notify_window) =
45 PKT_FIELD(vsk, write_notify_min_window);
46 } else {
47 PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE;
48 if (PKT_FIELD(vsk, write_notify_window) <
49 PKT_FIELD(vsk, write_notify_min_window))
50 PKT_FIELD(vsk, write_notify_window) =
51 PKT_FIELD(vsk, write_notify_min_window);
52
53 }
54 }
55 notify_limit = vmci_trans(vsk)->consume_size -
56 PKT_FIELD(vsk, write_notify_window);
57
58 /* The notify_limit is used to delay notifications in the case where
59 * flow control is enabled. Below the test is expressed in terms of
60 * free space in the queue: if free_space > ConsumeSize -
61	 * write_notify_window then notify. An alternate way of expressing this
62	 * is to rewrite the expression to use the data ready in the receive
63	 * queue: if write_notify_window > bufferReady then notify, as
64 * free_space == ConsumeSize - bufferReady.
65 */
66
67 retval = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) >
68 notify_limit;
69
70 if (retval) {
71 /* Once we notify the peer, we reset the detected flag so the
72 * next wait will again cause a decrease in the window size.
73 */
74
75 PKT_FIELD(vsk, peer_waiting_write_detected) = false;
76 }
77 return retval;
78}
79
80static void
81vmci_transport_handle_read(struct sock *sk,
82 struct vmci_transport_packet *pkt,
83 bool bottom_half,
84 struct sockaddr_vm *dst, struct sockaddr_vm *src)
85{
86 sk->sk_write_space(sk);
87}
88
89static void
90vmci_transport_handle_wrote(struct sock *sk,
91 struct vmci_transport_packet *pkt,
92 bool bottom_half,
93 struct sockaddr_vm *dst, struct sockaddr_vm *src)
94{
95 sk->sk_data_ready(sk, 0);
96}
97
98static void vsock_block_update_write_window(struct sock *sk)
99{
100 struct vsock_sock *vsk = vsock_sk(sk);
101
102 if (PKT_FIELD(vsk, write_notify_window) < vmci_trans(vsk)->consume_size)
103 PKT_FIELD(vsk, write_notify_window) =
104 min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE,
105 vmci_trans(vsk)->consume_size);
106}
107
108static int vmci_transport_send_read_notification(struct sock *sk)
109{
110 struct vsock_sock *vsk;
111 bool sent_read;
112 unsigned int retries;
113 int err;
114
115 vsk = vsock_sk(sk);
116 sent_read = false;
117 retries = 0;
118 err = 0;
119
120 if (vmci_transport_notify_waiting_write(vsk)) {
121 /* Notify the peer that we have read, retrying the send on
122 * failure up to our maximum value. XXX For now we just log
123 * the failure, but later we should schedule a work item to
124 * handle the resend until it succeeds. That would require
125 * keeping track of work items in the vsk and cleaning them up
126 * upon socket close.
127 */
128 while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
129 !sent_read &&
130 retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
131 err = vmci_transport_send_read(sk);
132 if (err >= 0)
133 sent_read = true;
134
135 retries++;
136 }
137
138 if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS && !sent_read)
139 pr_err("%p unable to send read notification to peer\n",
140 sk);
141 else
142 PKT_FIELD(vsk, peer_waiting_write) = false;
143
144 }
145 return err;
146}
147
148static void vmci_transport_notify_pkt_socket_init(struct sock *sk)
149{
150 struct vsock_sock *vsk = vsock_sk(sk);
151
152 PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
153 PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
154 PKT_FIELD(vsk, peer_waiting_write) = false;
155 PKT_FIELD(vsk, peer_waiting_write_detected) = false;
156}
157
158static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk)
159{
160 PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
161 PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
162 PKT_FIELD(vsk, peer_waiting_write) = false;
163 PKT_FIELD(vsk, peer_waiting_write_detected) = false;
164}
165
166static int
167vmci_transport_notify_pkt_poll_in(struct sock *sk,
168 size_t target, bool *data_ready_now)
169{
170 struct vsock_sock *vsk = vsock_sk(sk);
171
172 if (vsock_stream_has_data(vsk)) {
173 *data_ready_now = true;
174 } else {
175 /* We can't read right now because there is nothing in the
176 * queue. Ask for notifications when there is something to
177 * read.
178 */
179 if (sk->sk_state == SS_CONNECTED)
180 vsock_block_update_write_window(sk);
181 *data_ready_now = false;
182 }
183
184 return 0;
185}
186
187static int
188vmci_transport_notify_pkt_poll_out(struct sock *sk,
189 size_t target, bool *space_avail_now)
190{
191 s64 produce_q_free_space;
192 struct vsock_sock *vsk = vsock_sk(sk);
193
194 produce_q_free_space = vsock_stream_has_space(vsk);
195 if (produce_q_free_space > 0) {
196 *space_avail_now = true;
197 return 0;
198 } else if (produce_q_free_space == 0) {
199 /* This is a connected socket but we can't currently send data.
200 * Nothing else to do.
201 */
202 *space_avail_now = false;
203 }
204
205 return 0;
206}
207
208static int
209vmci_transport_notify_pkt_recv_init(
210 struct sock *sk,
211 size_t target,
212 struct vmci_transport_recv_notify_data *data)
213{
214 struct vsock_sock *vsk = vsock_sk(sk);
215
216 data->consume_head = 0;
217 data->produce_tail = 0;
218 data->notify_on_block = false;
219
220 if (PKT_FIELD(vsk, write_notify_min_window) < target + 1) {
221 PKT_FIELD(vsk, write_notify_min_window) = target + 1;
222 if (PKT_FIELD(vsk, write_notify_window) <
223 PKT_FIELD(vsk, write_notify_min_window)) {
224 /* If the current window is smaller than the new
225 * minimal window size, we need to reevaluate whether
226 * we need to notify the sender. If the number of ready
227	 * bytes is smaller than the new window, we need to
228 * send a notification to the sender before we block.
229 */
230
231 PKT_FIELD(vsk, write_notify_window) =
232 PKT_FIELD(vsk, write_notify_min_window);
233 data->notify_on_block = true;
234 }
235 }
236
237 return 0;
238}
239
240static int
241vmci_transport_notify_pkt_recv_pre_block(
242 struct sock *sk,
243 size_t target,
244 struct vmci_transport_recv_notify_data *data)
245{
246 int err = 0;
247
248 vsock_block_update_write_window(sk);
249
250 if (data->notify_on_block) {
251 err = vmci_transport_send_read_notification(sk);
252 if (err < 0)
253 return err;
254 data->notify_on_block = false;
255 }
256
257 return err;
258}
259
260static int
261vmci_transport_notify_pkt_recv_post_dequeue(
262 struct sock *sk,
263 size_t target,
264 ssize_t copied,
265 bool data_read,
266 struct vmci_transport_recv_notify_data *data)
267{
268 struct vsock_sock *vsk;
269 int err;
270 bool was_full = false;
271 u64 free_space;
272
273 vsk = vsock_sk(sk);
274 err = 0;
275
276 if (data_read) {
277 smp_mb();
278
279 free_space =
280 vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair);
281 was_full = free_space == copied;
282
283 if (was_full)
284 PKT_FIELD(vsk, peer_waiting_write) = true;
285
286 err = vmci_transport_send_read_notification(sk);
287 if (err < 0)
288 return err;
289
290 /* See the comment in
291 * vmci_transport_notify_pkt_send_post_enqueue().
292 */
293 sk->sk_data_ready(sk, 0);
294 }
295
296 return err;
297}
298
299static int
300vmci_transport_notify_pkt_send_init(
301 struct sock *sk,
302 struct vmci_transport_send_notify_data *data)
303{
304 data->consume_head = 0;
305 data->produce_tail = 0;
306
307 return 0;
308}
309
310static int
311vmci_transport_notify_pkt_send_post_enqueue(
312 struct sock *sk,
313 ssize_t written,
314 struct vmci_transport_send_notify_data *data)
315{
316 int err = 0;
317 struct vsock_sock *vsk;
318 bool sent_wrote = false;
319 bool was_empty;
320 int retries = 0;
321
322 vsk = vsock_sk(sk);
323
324 smp_mb();
325
326 was_empty =
327 vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) == written;
328 if (was_empty) {
329 while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
330 !sent_wrote &&
331 retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
332 err = vmci_transport_send_wrote(sk);
333 if (err >= 0)
334 sent_wrote = true;
335
336 retries++;
337 }
338 }
339
340 if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS && !sent_wrote) {
341 pr_err("%p unable to send wrote notification to peer\n",
342 sk);
343 return err;
344 }
345
346 return err;
347}
348
349static void
350vmci_transport_notify_pkt_handle_pkt(
351 struct sock *sk,
352 struct vmci_transport_packet *pkt,
353 bool bottom_half,
354 struct sockaddr_vm *dst,
355 struct sockaddr_vm *src, bool *pkt_processed)
356{
357 bool processed = false;
358
359 switch (pkt->type) {
360 case VMCI_TRANSPORT_PACKET_TYPE_WROTE:
361 vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src);
362 processed = true;
363 break;
364 case VMCI_TRANSPORT_PACKET_TYPE_READ:
365 vmci_transport_handle_read(sk, pkt, bottom_half, dst, src);
366 processed = true;
367 break;
368 }
369
370 if (pkt_processed)
371 *pkt_processed = processed;
372}
373
374static void vmci_transport_notify_pkt_process_request(struct sock *sk)
375{
376 struct vsock_sock *vsk = vsock_sk(sk);
377
378 PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
379 if (vmci_trans(vsk)->consume_size <
380 PKT_FIELD(vsk, write_notify_min_window))
381 PKT_FIELD(vsk, write_notify_min_window) =
382 vmci_trans(vsk)->consume_size;
383}
384
385static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
386{
387 struct vsock_sock *vsk = vsock_sk(sk);
388
389 PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
390 if (vmci_trans(vsk)->consume_size <
391 PKT_FIELD(vsk, write_notify_min_window))
392 PKT_FIELD(vsk, write_notify_min_window) =
393 vmci_trans(vsk)->consume_size;
394}
395
396static int
397vmci_transport_notify_pkt_recv_pre_dequeue(
398 struct sock *sk,
399 size_t target,
400 struct vmci_transport_recv_notify_data *data)
401{
402 return 0; /* NOP for QState. */
403}
404
405static int
406vmci_transport_notify_pkt_send_pre_block(
407 struct sock *sk,
408 struct vmci_transport_send_notify_data *data)
409{
410 return 0; /* NOP for QState. */
411}
412
413static int
414vmci_transport_notify_pkt_send_pre_enqueue(
415 struct sock *sk,
416 struct vmci_transport_send_notify_data *data)
417{
418 return 0; /* NOP for QState. */
419}
420
421/* Socket always on control packet based operations. */
422struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops = {
423 vmci_transport_notify_pkt_socket_init,
424 vmci_transport_notify_pkt_socket_destruct,
425 vmci_transport_notify_pkt_poll_in,
426 vmci_transport_notify_pkt_poll_out,
427 vmci_transport_notify_pkt_handle_pkt,
428 vmci_transport_notify_pkt_recv_init,
429 vmci_transport_notify_pkt_recv_pre_block,
430 vmci_transport_notify_pkt_recv_pre_dequeue,
431 vmci_transport_notify_pkt_recv_post_dequeue,
432 vmci_transport_notify_pkt_send_init,
433 vmci_transport_notify_pkt_send_pre_block,
434 vmci_transport_notify_pkt_send_pre_enqueue,
435 vmci_transport_notify_pkt_send_post_enqueue,
436 vmci_transport_notify_pkt_process_request,
437 vmci_transport_notify_pkt_process_negotiate,
438};
diff --git a/net/vmw_vsock/vsock_addr.c b/net/vmw_vsock/vsock_addr.c
new file mode 100644
index 000000000000..b7df1aea7c59
--- /dev/null
+++ b/net/vmw_vsock/vsock_addr.c
@@ -0,0 +1,86 @@
1/*
2 * VMware vSockets Driver
3 *
4 * Copyright (C) 2007-2012 VMware, Inc. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation version 2 and no later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 */
15
16#include <linux/types.h>
17#include <linux/socket.h>
18#include <linux/stddef.h>
19#include <net/sock.h>
20
21#include "vsock_addr.h"
22
23void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port)
24{
25 memset(addr, 0, sizeof(*addr));
26 addr->svm_family = AF_VSOCK;
27 addr->svm_cid = cid;
28 addr->svm_port = port;
29}
30EXPORT_SYMBOL_GPL(vsock_addr_init);
31
32int vsock_addr_validate(const struct sockaddr_vm *addr)
33{
34 if (!addr)
35 return -EFAULT;
36
37 if (addr->svm_family != AF_VSOCK)
38 return -EAFNOSUPPORT;
39
40 if (addr->svm_zero[0] != 0)
41 return -EINVAL;
42
43 return 0;
44}
45EXPORT_SYMBOL_GPL(vsock_addr_validate);
46
47bool vsock_addr_bound(const struct sockaddr_vm *addr)
48{
49 return addr->svm_port != VMADDR_PORT_ANY;
50}
51EXPORT_SYMBOL_GPL(vsock_addr_bound);
52
53void vsock_addr_unbind(struct sockaddr_vm *addr)
54{
55 vsock_addr_init(addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
56}
57EXPORT_SYMBOL_GPL(vsock_addr_unbind);
58
59bool vsock_addr_equals_addr(const struct sockaddr_vm *addr,
60 const struct sockaddr_vm *other)
61{
62 return addr->svm_cid == other->svm_cid &&
63 addr->svm_port == other->svm_port;
64}
65EXPORT_SYMBOL_GPL(vsock_addr_equals_addr);
66/* Like vsock_addr_equals_addr(), but VMADDR_CID_ANY matches any CID. */
67bool vsock_addr_equals_addr_any(const struct sockaddr_vm *addr,
68 const struct sockaddr_vm *other)
69{
70 return (addr->svm_cid == VMADDR_CID_ANY ||
71 other->svm_cid == VMADDR_CID_ANY ||
72 addr->svm_cid == other->svm_cid) &&
73 addr->svm_port == other->svm_port;
74}
75EXPORT_SYMBOL_GPL(vsock_addr_equals_addr_any);
76/* Validate a generic sockaddr and alias it as a sockaddr_vm on success. */
77int vsock_addr_cast(const struct sockaddr *addr,
78 size_t len, struct sockaddr_vm **out_addr)
79{
80 if (len < sizeof(**out_addr))
81 return -EFAULT;
82
83 *out_addr = (struct sockaddr_vm *)addr;
84 return vsock_addr_validate(*out_addr);
85}
86EXPORT_SYMBOL_GPL(vsock_addr_cast);
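
These helpers validate the same sockaddr_vm layout that applications fill in from userspace. A minimal userspace sketch of a stream connect to the host, using the well-known host CID (VMADDR_CID_HOST) from linux/vm_sockets.h; the port number 1234 is an arbitrary example, and error handling is trimmed to the essentials:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>

int main(void)
{
	struct sockaddr_vm addr;
	int fd = socket(AF_VSOCK, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;

	/* Zero the whole structure so the svm_zero padding passes
	 * vsock_addr_validate() on the kernel side.
	 */
	memset(&addr, 0, sizeof(addr));
	addr.svm_family = AF_VSOCK;
	addr.svm_cid = VMADDR_CID_HOST;	/* well-known host CID */
	addr.svm_port = 1234;		/* example port */

	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("connect");
		close(fd);
		return 1;
	}

	write(fd, "hello", 5);
	close(fd);
	return 0;
}
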
diff --git a/net/vmw_vsock/vsock_addr.h b/net/vmw_vsock/vsock_addr.h
new file mode 100644
index 000000000000..cdfbcefdf843
--- /dev/null
+++ b/net/vmw_vsock/vsock_addr.h
@@ -0,0 +1,32 @@
1/*
2 * VMware vSockets Driver
3 *
4 * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation version 2 and no later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 */
15
16#ifndef _VSOCK_ADDR_H_
17#define _VSOCK_ADDR_H_
18
19#include <linux/vm_sockets.h>
20
21void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port);
22int vsock_addr_validate(const struct sockaddr_vm *addr);
23bool vsock_addr_bound(const struct sockaddr_vm *addr);
24void vsock_addr_unbind(struct sockaddr_vm *addr);
25bool vsock_addr_equals_addr(const struct sockaddr_vm *addr,
26 const struct sockaddr_vm *other);
27bool vsock_addr_equals_addr_any(const struct sockaddr_vm *addr,
28 const struct sockaddr_vm *other);
29int vsock_addr_cast(const struct sockaddr *addr, size_t len,
30 struct sockaddr_vm **out_addr);
31
32#endif
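
In-kernel callers are expected to run any user-supplied address through vsock_addr_cast() before using it. A sketch of the typical bind-path pattern, built only on the declarations above; the function name sketch_vsock_bind is illustrative, not part of the patch:

#include <linux/net.h>
#include "vsock_addr.h"

static int sketch_vsock_bind(struct socket *sock, struct sockaddr *addr,
			     int addr_len)
{
	struct sockaddr_vm *vm_addr;

	/* One call rejects short lengths, wrong address families and
	 * non-zero padding; on success vm_addr aliases the caller's
	 * buffer rather than copying it.
	 */
	if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0)
		return -EINVAL;

	/* A real implementation would now check vsock_addr_bound() and
	 * assign a local port when svm_port is VMADDR_PORT_ANY.
	 */
	return 0;
}
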
diff --git a/net/vmw_vsock/vsock_version.h b/net/vmw_vsock/vsock_version.h
new file mode 100644
index 000000000000..4df7f5e2151c
--- /dev/null
+++ b/net/vmw_vsock/vsock_version.h
@@ -0,0 +1,22 @@
1/*
2 * VMware vSockets Driver
3 *
4 * Copyright (C) 2011-2012 VMware, Inc. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation version 2 and no later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 */
15
16#ifndef _VSOCK_VERSION_H_
17#define _VSOCK_VERSION_H_
18
19#define VSOCK_DRIVER_VERSION_PARTS { 1, 0, 0, 0 }
20#define VSOCK_DRIVER_VERSION_STRING "1.0.0.0-k"
21
22#endif /* _VSOCK_VERSION_H_ */
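
The driver version is kept as four separate parts plus a display string. Where a single scalar is handier (logging, comparisons), the parts can be packed into one integer; the helper below is a hypothetical illustration, and the 8-bits-per-part packing scheme is an assumption, not something this patch defines:

#include <linux/types.h>
#include "vsock_version.h"

/* Hypothetical: pack the four version parts into a u32, 8 bits each,
 * most significant part first, so versions compare numerically.
 */
static inline u32 vsock_version_pack(void)
{
	static const u8 parts[] = VSOCK_DRIVER_VERSION_PARTS;

	return ((u32)parts[0] << 24) | ((u32)parts[1] << 16) |
	       ((u32)parts[2] << 8) | (u32)parts[3];
}
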