aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSudeep Dutt <sudeep.dutt@intel.com>2015-04-29 08:32:28 -0400
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2015-05-24 15:13:36 -0400
commit7df20f2d893db42eaa1ea1e30a2573c971ec9238 (patch)
tree372f796f0c48006754facac07edda9ea390b88c7
parent0d09f1a54d9710548c9af72dc1564c8291a5307c (diff)
misc: mic: SCIF header file and IOCTL interface
This patch introduces the SCIF documentation in the header file and describes the IOCTL interface for user mode. mic_overview.txt is updated with documentation on SCIF and a new document describing SCIF in more details is available in scif_overview.txt. Reviewed-by: Nikhil Rao <nikhil.rao@intel.com> Reviewed-by: Ashutosh Dixit <ashutosh.dixit@intel.com> Signed-off-by: Sudeep Dutt <sudeep.dutt@intel.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r--Documentation/mic/mic_overview.txt28
-rw-r--r--Documentation/mic/scif_overview.txt98
-rw-r--r--include/linux/scif.h993
-rw-r--r--include/uapi/linux/Kbuild1
-rw-r--r--include/uapi/linux/scif_ioctl.h130
5 files changed, 1238 insertions, 12 deletions
diff --git a/Documentation/mic/mic_overview.txt b/Documentation/mic/mic_overview.txt
index 77c541802ad9..1a2f2c8ec59e 100644
--- a/Documentation/mic/mic_overview.txt
+++ b/Documentation/mic/mic_overview.txt
@@ -24,6 +24,10 @@ a virtual bus called mic bus is created and virtual dma devices are
24created on it by the host/card drivers. On host the channels are private 24created on it by the host/card drivers. On host the channels are private
25and used only by the host driver to transfer data for the virtio devices. 25and used only by the host driver to transfer data for the virtio devices.
26 26
27The Symmetric Communication Interface (SCIF (pronounced as skiff)) is a
28low level communications API across PCIe currently implemented for MIC.
29More details are available at scif_overview.txt.
30
27Here is a block diagram of the various components described above. The 31Here is a block diagram of the various components described above. The
28virtio backends are situated on the host rather than the card given better 32virtio backends are situated on the host rather than the card given better
29single threaded performance for the host compared to MIC, the ability of 33single threaded performance for the host compared to MIC, the ability of
@@ -47,18 +51,18 @@ the fact that the virtio block storage backend can only be on the host.
47 | | | Virtio over PCIe IOCTLs | 51 | | | Virtio over PCIe IOCTLs |
48 | | +--------------------------+ 52 | | +--------------------------+
49+-----------+ | | | +-----------+ 53+-----------+ | | | +-----------+
50| MIC DMA | | | | | MIC DMA | 54| MIC DMA | | +----------+ | +-----------+ | | MIC DMA |
51| Driver | | | | | Driver | 55| Driver | | | SCIF | | | SCIF | | | Driver |
52+-----------+ | | | +-----------+ 56+-----------+ | +----------+ | +-----------+ | +-----------+
53 | | | | | 57 | | | | | | |
54+---------------+ | | | +----------------+ 58+---------------+ | +-----+-----+ | +-----+-----+ | +---------------+
55|MIC virtual Bus| | | | |MIC virtual Bus | 59|MIC virtual Bus| | |SCIF HW Bus| | |SCIF HW BUS| | |MIC virtual Bus|
56+---------------+ | | | +----------------+ 60+---------------+ | +-----------+ | +-----+-----+ | +---------------+
57 | | | | | 61 | | | | | | |
58 | +--------------+ | +---------------+ | 62 | +--------------+ | | | +---------------+ |
59 | |Intel MIC | | |Intel MIC | | 63 | |Intel MIC | | | | |Intel MIC | |
60 +---|Card Driver | | |Host Driver | | 64 +---|Card Driver +----+ | | |Host Driver | |
61 +--------------+ | +---------------+-----+ 65 +--------------+ | +----+---------------+-----+
62 | | | 66 | | |
63 +-------------------------------------------------------------+ 67 +-------------------------------------------------------------+
64 | | 68 | |
diff --git a/Documentation/mic/scif_overview.txt b/Documentation/mic/scif_overview.txt
new file mode 100644
index 000000000000..0a280d986731
--- /dev/null
+++ b/Documentation/mic/scif_overview.txt
@@ -0,0 +1,98 @@
1The Symmetric Communication Interface (SCIF (pronounced as skiff)) is a low
2level communications API across PCIe currently implemented for MIC. Currently
3SCIF provides inter-node communication within a single host platform, where a
4node is a MIC Coprocessor or Xeon based host. SCIF abstracts the details of
5communicating over the PCIe bus while providing an API that is symmetric
6across all the nodes in the PCIe network. An important design objective for SCIF
7is to deliver the maximum possible performance given the communication
8abilities of the hardware. SCIF has been used to implement an offload compiler
9runtime and OFED support for MPI implementations for MIC coprocessors.
10
11==== SCIF API Components ====
12The SCIF API has the following parts:
131. Connection establishment using a client server model
142. Byte stream messaging intended for short messages
153. Node enumeration to determine online nodes
164. Poll semantics for detection of incoming connections and messages
175. Memory registration to pin down pages
186. Remote memory mapping for low latency CPU accesses via mmap
197. Remote DMA (RDMA) for high bandwidth DMA transfers
208. Fence APIs for RDMA synchronization
21
22SCIF exposes the notion of a connection which can be used by peer processes on
23nodes in a SCIF PCIe "network" to share memory "windows" and to communicate. A
24process in a SCIF node initiates a SCIF connection to a peer process on a
25different node via a SCIF "endpoint". SCIF endpoints support messaging APIs
26which are similar to connection oriented socket APIs. Connected SCIF endpoints
27can also register local memory which is followed by data transfer using either
28DMA, CPU copies or remote memory mapping via mmap. SCIF supports both user and
29kernel mode clients which are functionally equivalent.
30
31==== SCIF Performance for MIC ====
32DMA bandwidth comparison between the TCP (over ethernet over PCIe) stack versus
33SCIF shows the performance advantages of SCIF for HPC applications and runtimes.
34
35 Comparison of TCP and SCIF based BW
36
37 Throughput (GB/sec)
38 8 + PCIe Bandwidth ******
39 + TCP ######
40 7 + ************************************** SCIF %%%%%%
41 | %%%%%%%%%%%%%%%%%%%
42 6 + %%%%
43 | %%
44 | %%%
45 5 + %%
46 | %%
47 4 + %%
48 | %%
49 3 + %%
50 | %
51 2 + %%
52 | %%
53 | %
54 1 +
55 + ######################################
56 0 +++---+++--+--+-+--+--+-++-+--+-++-+--+-++-+-
57 1 10 100 1000 10000 100000
58 Transfer Size (KBytes)
59
60SCIF allows memory sharing via mmap(..) between processes on different PCIe
61nodes and thus provides bare-metal PCIe latency. The round trip SCIF mmap
62latency from the host to an x100 MIC for an 8 byte message is 0.44 usecs.
63
64SCIF has a user space library which is a thin IOCTL wrapper providing a user
65space API similar to the kernel API in scif.h. The SCIF user space library
66is distributed @ https://software.intel.com/en-us/mic-developer
67
68Here is some pseudo code for an example of how two applications on two PCIe
69nodes would typically use the SCIF API:
70
71Process A (on node A) Process B (on node B)
72
73/* get online node information */
74scif_get_node_ids(..) scif_get_node_ids(..)
75scif_open(..) scif_open(..)
76scif_bind(..) scif_bind(..)
77scif_listen(..)
78scif_accept(..) scif_connect(..)
79/* SCIF connection established */
80
81/* Send and receive short messages */
82scif_send(..)/scif_recv(..) scif_send(..)/scif_recv(..)
83
84/* Register memory */
85scif_register(..) scif_register(..)
86
87/* RDMA */
88scif_readfrom(..)/scif_writeto(..) scif_readfrom(..)/scif_writeto(..)
89
90/* Fence DMAs */
91scif_fence_signal(..) scif_fence_signal(..)
92
93mmap(..) mmap(..)
94
95/* Access remote registered memory */
96
97/* Close the endpoints */
98scif_close(..) scif_close(..)
diff --git a/include/linux/scif.h b/include/linux/scif.h
new file mode 100644
index 000000000000..44f4f3898bbe
--- /dev/null
+++ b/include/linux/scif.h
@@ -0,0 +1,993 @@
1/*
2 * Intel MIC Platform Software Stack (MPSS)
3 *
4 * This file is provided under a dual BSD/GPLv2 license. When using or
5 * redistributing this file, you may do so under either license.
6 *
7 * GPL LICENSE SUMMARY
8 *
9 * Copyright(c) 2014 Intel Corporation.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of version 2 of the GNU General Public License as
13 * published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * BSD LICENSE
21 *
22 * Copyright(c) 2014 Intel Corporation.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 *
28 * * Redistributions of source code must retain the above copyright
29 * notice, this list of conditions and the following disclaimer.
30 * * Redistributions in binary form must reproduce the above copyright
31 * notice, this list of conditions and the following disclaimer in
32 * the documentation and/or other materials provided with the
33 * distribution.
34 * * Neither the name of Intel Corporation nor the names of its
35 * contributors may be used to endorse or promote products derived
36 * from this software without specific prior written permission.
37 *
38 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
39 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
40 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
41 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
42 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
44 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
45 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
46 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
47 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
48 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
49 *
50 * Intel SCIF driver.
51 *
52 */
53#ifndef __SCIF_H__
54#define __SCIF_H__
55
56#include <linux/types.h>
57#include <linux/poll.h>
58#include <linux/scif_ioctl.h>
59
60#define SCIF_ACCEPT_SYNC 1
61#define SCIF_SEND_BLOCK 1
62#define SCIF_RECV_BLOCK 1
63
64enum {
65 SCIF_PROT_READ = (1 << 0),
66 SCIF_PROT_WRITE = (1 << 1)
67};
68
69enum {
70 SCIF_MAP_FIXED = 0x10,
71 SCIF_MAP_KERNEL = 0x20,
72};
73
74enum {
75 SCIF_FENCE_INIT_SELF = (1 << 0),
76 SCIF_FENCE_INIT_PEER = (1 << 1),
77 SCIF_SIGNAL_LOCAL = (1 << 4),
78 SCIF_SIGNAL_REMOTE = (1 << 5)
79};
80
81enum {
82 SCIF_RMA_USECPU = (1 << 0),
83 SCIF_RMA_USECACHE = (1 << 1),
84 SCIF_RMA_SYNC = (1 << 2),
85 SCIF_RMA_ORDERED = (1 << 3)
86};
87
88/* End of SCIF Admin Reserved Ports */
89#define SCIF_ADMIN_PORT_END 1024
90
91/* End of SCIF Reserved Ports */
92#define SCIF_PORT_RSVD 1088
93
94typedef struct scif_endpt *scif_epd_t;
95
96#define SCIF_OPEN_FAILED ((scif_epd_t)-1)
97#define SCIF_REGISTER_FAILED ((off_t)-1)
98#define SCIF_MMAP_FAILED ((void *)-1)
99
100/**
101 * scif_open() - Create an endpoint
102 *
103 * Return:
104 * Upon successful completion, scif_open() returns an endpoint descriptor to
105 * be used in subsequent SCIF function calls to refer to that endpoint;
106 * otherwise in user mode SCIF_OPEN_FAILED (that is ((scif_epd_t)-1)) is
107 * returned and errno is set to indicate the error; in kernel mode a NULL
108 * scif_epd_t is returned.
109 *
110 * Errors:
111 * ENOMEM - Insufficient kernel memory was available
112 */
113scif_epd_t scif_open(void);
114
115/**
116 * scif_bind() - Bind an endpoint to a port
117 * @epd: endpoint descriptor
118 * @pn: port number
119 *
120 * scif_bind() binds endpoint epd to port pn, where pn is a port number on the
121 * local node. If pn is zero, a port number greater than or equal to
122 * SCIF_PORT_RSVD is assigned and returned. Each endpoint may be bound to
123 * exactly one local port. Ports less than 1024 when requested can only be bound
124 * by system (or root) processes or by processes executed by privileged users.
125 *
126 * Return:
127 * Upon successful completion, scif_bind() returns the port number to which epd
128 * is bound; otherwise in user mode -1 is returned and errno is set to
129 * indicate the error; in kernel mode the negative of one of the following
130 * errors is returned.
131 *
132 * Errors:
133 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
134 * EINVAL - the endpoint or the port is already bound
135 * EISCONN - The endpoint is already connected
136 * ENOSPC - No port number available for assignment
137 * EACCES - The port requested is protected and the user is not the superuser
138 */
139int scif_bind(scif_epd_t epd, u16 pn);
140
141/**
142 * scif_listen() - Listen for connections on an endpoint
143 * @epd: endpoint descriptor
144 * @backlog: maximum pending connection requests
145 *
146 * scif_listen() marks the endpoint epd as a listening endpoint - that is, as
147 * an endpoint that will be used to accept incoming connection requests. Once
148 * so marked, the endpoint is said to be in the listening state and may not be
149 * used as the endpoint of a connection.
150 *
151 * The endpoint, epd, must have been bound to a port.
152 *
153 * The backlog argument defines the maximum length to which the queue of
154 * pending connections for epd may grow. If a connection request arrives when
155 * the queue is full, the client may receive an error with an indication that
156 * the connection was refused.
157 *
158 * Return:
159 * Upon successful completion, scif_listen() returns 0; otherwise in user mode
160 * -1 is returned and errno is set to indicate the error; in kernel mode the
161 * negative of one of the following errors is returned.
162 *
163 * Errors:
164 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
165 * EINVAL - the endpoint is not bound to a port
166 * EISCONN - The endpoint is already connected or listening
167 */
168int scif_listen(scif_epd_t epd, int backlog);
169
170/**
171 * scif_connect() - Initiate a connection on a port
172 * @epd: endpoint descriptor
173 * @dst: global id of port to which to connect
174 *
175 * The scif_connect() function requests the connection of endpoint epd to remote
176 * port dst. If the connection is successful, a peer endpoint, bound to dst, is
177 * created on node dst.node. On successful return, the connection is complete.
178 *
179 * If the endpoint epd has not already been bound to a port, scif_connect()
180 * will bind it to an unused local port.
181 *
182 * A connection is terminated when an endpoint of the connection is closed,
183 * either explicitly by scif_close(), or when a process that owns one of the
184 * endpoints of the connection is terminated.
185 *
186 * In user space, scif_connect() supports an asynchronous connection mode
187 * if the application has set the O_NONBLOCK flag on the endpoint via the
188 * fcntl() system call. Setting this flag will result in the calling process
189 * not waiting during scif_connect().
190 *
191 * Return:
192 * Upon successful completion, scif_connect() returns the port ID to which the
193 * endpoint, epd, is bound; otherwise in user mode -1 is returned and errno is
194 * set to indicate the error; in kernel mode the negative of one of the
195 * following errors is returned.
196 *
197 * Errors:
198 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
199 * ECONNREFUSED - The destination was not listening for connections or refused
200 * the connection request
201 * EINVAL - dst.port is not a valid port ID
202 * EISCONN - The endpoint is already connected
203 * ENOMEM - No buffer space is available
204 * ENODEV - The destination node does not exist, or the node is lost, or existed
205 * but is not currently in the network since it may have crashed
206 * ENOSPC - No port number available for assignment
207 * EOPNOTSUPP - The endpoint is listening and cannot be connected
208 */
209int scif_connect(scif_epd_t epd, struct scif_port_id *dst);
210
211/**
212 * scif_accept() - Accept a connection on an endpoint
213 * @epd: endpoint descriptor
214 * @peer: global id of port to which connected
215 * @newepd: new connected endpoint descriptor
216 * @flags: flags
217 *
218 * The scif_accept() call extracts the first connection request from the queue
219 * of pending connections for the port on which epd is listening. scif_accept()
220 * creates a new endpoint, bound to the same port as epd, and allocates a new
221 * SCIF endpoint descriptor, returned in newepd, for the endpoint. The new
222 * endpoint is connected to the endpoint through which the connection was
223 * requested. epd is unaffected by this call, and remains in the listening
224 * state.
225 *
226 * On successful return, peer holds the global port identifier (node id and
227 * local port number) of the port which requested the connection.
228 *
229 * A connection is terminated when an endpoint of the connection is closed,
230 * either explicitly by scif_close(), or when a process that owns one of the
231 * endpoints of the connection is terminated.
232 *
233 * The number of connections that can (subsequently) be accepted on epd is only
234 * limited by system resources (memory).
235 *
236 * The flags argument is formed by OR'ing together zero or more of the
237 * following values.
238 * SCIF_ACCEPT_SYNC - block until a connection request is presented. If
239 * SCIF_ACCEPT_SYNC is not in flags, and no pending
240 * connections are present on the queue, scif_accept()
241 * fails with an EAGAIN error
242 *
243 * In user mode, the select() and poll() functions can be used to determine
244 * when there is a connection request. In kernel mode, the scif_poll()
245 * function may be used for this purpose. A readable event will be delivered
246 * when a connection is requested.
247 *
248 * Return:
249 * Upon successful completion, scif_accept() returns 0; otherwise in user mode
250 * -1 is returned and errno is set to indicate the error; in kernel mode the
251 * negative of one of the following errors is returned.
252 *
253 * Errors:
254 * EAGAIN - SCIF_ACCEPT_SYNC is not set and no connections are present to be
255 * accepted or SCIF_ACCEPT_SYNC is not set and remote node failed to complete
256 * its connection request
257 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
258 * EINTR - Interrupted function
259 * EINVAL - epd is not a listening endpoint, or flags is invalid, or peer is
260 * NULL, or newepd is NULL
261 * ENODEV - The requesting node is lost, or existed but is not currently in the
262 * network since it may have crashed
263 * ENOMEM - Not enough space
264 * ENOENT - Secondary part of epd registration failed
265 */
266int scif_accept(scif_epd_t epd, struct scif_port_id *peer, scif_epd_t
267 *newepd, int flags);
268
269/**
270 * scif_close() - Close an endpoint
271 * @epd: endpoint descriptor
272 *
273 * scif_close() closes an endpoint and performs necessary teardown of
274 * facilities associated with that endpoint.
275 *
276 * If epd is a listening endpoint then it will no longer accept connection
277 * requests on the port to which it is bound. Any pending connection requests
278 * are rejected.
279 *
280 * If epd is a connected endpoint, then its peer endpoint is also closed. RMAs
281 * which are in-process through epd or its peer endpoint will complete before
282 * scif_close() returns. Registered windows of the local and peer endpoints are
283 * released as if scif_unregister() was called against each window.
284 *
285 * Closing a SCIF endpoint does not affect local registered memory mapped by
286 * a SCIF endpoint on a remote node. The local memory remains mapped by the peer
287 * SCIF endpoint until explicitly removed by calling munmap(..) by the peer.
288 *
289 * If the peer endpoint's receive queue is not empty at the time that epd is
290 * closed, then the peer endpoint can be passed as the endpoint parameter to
291 * scif_recv() until the receive queue is empty.
292 *
293 * epd is freed and may no longer be accessed.
294 *
295 * Return:
296 * Upon successful completion, scif_close() returns 0; otherwise in user mode
297 * -1 is returned and errno is set to indicate the error; in kernel mode the
298 * negative of one of the following errors is returned.
299 *
300 * Errors:
301 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
302 */
303int scif_close(scif_epd_t epd);
304
305/**
306 * scif_send() - Send a message
307 * @epd: endpoint descriptor
308 * @msg: message buffer address
309 * @len: message length
310 * @flags: blocking mode flags
311 *
312 * scif_send() sends data to the peer of endpoint epd. Up to len bytes of data
313 * are copied from memory starting at address msg. On successful execution the
314 * return value of scif_send() is the number of bytes that were sent, and is
315 * zero if no bytes were sent because len was zero. scif_send() may be called
316 * only when the endpoint is in a connected state.
317 *
318 * If a scif_send() call is non-blocking, then it sends only those bytes which
319 * can be sent without waiting, up to a maximum of len bytes.
320 *
321 * If a scif_send() call is blocking, then it normally returns after sending
322 * all len bytes. If a blocking call is interrupted or the connection is
323 * reset, the call is considered successful if some bytes were sent or len is
324 * zero, otherwise the call is considered unsuccessful.
325 *
326 * In user mode, the select() and poll() functions can be used to determine
327 * when the send queue is not full. In kernel mode, the scif_poll() function
328 * may be used for this purpose.
329 *
330 * It is recommended that scif_send()/scif_recv() only be used for short
331 * control-type message communication between SCIF endpoints. The SCIF RMA
332 * APIs are expected to provide better performance for transfer sizes of
333 * 1024 bytes or longer for the current MIC hardware and software
334 * implementation.
335 *
336 * scif_send() will block until the entire message is sent if SCIF_SEND_BLOCK
337 * is passed as the flags argument.
338 *
339 * Return:
340 * Upon successful completion, scif_send() returns the number of bytes sent;
341 * otherwise in user mode -1 is returned and errno is set to indicate the
342 * error; in kernel mode the negative of one of the following errors is
343 * returned.
344 *
345 * Errors:
346 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
347 * ECONNRESET - Connection reset by peer
348 * EFAULT - An invalid address was specified for a parameter
349 * EINVAL - flags is invalid, or len is negative
350 * ENODEV - The remote node is lost, or existed but is not currently in the
351 * network since it may have crashed
352 * ENOMEM - Not enough space
353 * ENOTCONN - The endpoint is not connected
354 */
355int scif_send(scif_epd_t epd, void *msg, int len, int flags);
356
357/**
358 * scif_recv() - Receive a message
359 * @epd: endpoint descriptor
360 * @msg: message buffer address
361 * @len: message buffer length
362 * @flags: blocking mode flags
363 *
364 * scif_recv() receives data from the peer of endpoint epd. Up to len bytes of
365 * data are copied to memory starting at address msg. On successful execution
366 * the return value of scif_recv() is the number of bytes that were received,
367 * and is zero if no bytes were received because len was zero. scif_recv() may
368 * be called only when the endpoint is in a connected state.
369 *
370 * If a scif_recv() call is non-blocking, then it receives only those bytes
371 * which can be received without waiting, up to a maximum of len bytes.
372 *
373 * If a scif_recv() call is blocking, then it normally returns after receiving
374 * all len bytes. If the blocking call was interrupted due to a disconnection,
375 * subsequent calls to scif_recv() will copy all bytes received up to the point
376 * of disconnection.
377 *
378 * In user mode, the select() and poll() functions can be used to determine
379 * when data is available to be received. In kernel mode, the scif_poll()
380 * function may be used for this purpose.
381 *
382 * It is recommended that scif_send()/scif_recv() only be used for short
383 * control-type message communication between SCIF endpoints. The SCIF RMA
384 * APIs are expected to provide better performance for transfer sizes of
385 * 1024 bytes or longer for the current MIC hardware and software
386 * implementation.
387 *
388 * scif_recv() will block until the entire message is received if
389 * SCIF_RECV_BLOCK is passed as the flags argument.
390 *
391 * Return:
392 * Upon successful completion, scif_recv() returns the number of bytes
393 * received; otherwise in user mode -1 is returned and errno is set to
394 * indicate the error; in kernel mode the negative of one of the following
395 * errors is returned.
396 *
397 * Errors:
398 * EAGAIN - The destination node is returning from a low power state
399 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
400 * ECONNRESET - Connection reset by peer
401 * EFAULT - An invalid address was specified for a parameter
402 * EINVAL - flags is invalid, or len is negative
403 * ENODEV - The remote node is lost, or existed but is not currently in the
404 * network since it may have crashed
405 * ENOMEM - Not enough space
406 * ENOTCONN - The endpoint is not connected
407 */
408int scif_recv(scif_epd_t epd, void *msg, int len, int flags);
409
410/**
411 * scif_register() - Mark a memory region for remote access.
412 * @epd: endpoint descriptor
413 * @addr: starting virtual address
414 * @len: length of range
415 * @offset: offset of window
416 * @prot_flags: read/write protection flags
417 * @map_flags: mapping flags
418 *
419 * The scif_register() function opens a window, a range of whole pages of the
420 * registered address space of the endpoint epd, starting at offset po and
421 * continuing for len bytes. The value of po, further described below, is a
422 * function of the parameters offset and len, and the value of map_flags. Each
423 * page of the window represents the physical memory page which backs the
424 * corresponding page of the range of virtual address pages starting at addr
425 * and continuing for len bytes. addr and len are constrained to be multiples
426 * of the page size. A successful scif_register() call returns po.
427 *
428 * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset
429 * exactly, and offset is constrained to be a multiple of the page size. The
430 * mapping established by scif_register() will not replace any existing
431 * registration; an error is returned if any page within the range [offset,
432 * offset + len - 1] intersects an existing window.
433 *
434 * When SCIF_MAP_FIXED is not set, the implementation uses offset in an
435 * implementation-defined manner to arrive at po. The po value so chosen will
436 * be an area of the registered address space that the implementation deems
437 * suitable for a mapping of len bytes. An offset value of 0 is interpreted as
438 * granting the implementation complete freedom in selecting po, subject to
439 * constraints described below. A non-zero value of offset is taken to be a
440 * suggestion of an offset near which the mapping should be placed. When the
441 * implementation selects a value for po, it does not replace any extant
442 * window. In all cases, po will be a multiple of the page size.
443 *
444 * The physical pages which are so represented by a window are available for
445 * access in calls to mmap(), scif_readfrom(), scif_writeto(),
446 * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the
447 * physical pages represented by the window will not be reused by the memory
448 * subsystem for any other purpose. Note that the same physical page may be
449 * represented by multiple windows.
450 *
451 * Subsequent operations which change the memory pages to which virtual
452 * addresses are mapped (such as mmap(), munmap()) have no effect on
453 * existing windows.
454 *
455 * If the process will fork(), it is recommended that the registered
456 * virtual address range be marked with MADV_DONTFORK. Doing so will prevent
457 * problems due to copy-on-write semantics.
458 *
459 * The prot_flags argument is formed by OR'ing together one or more of the
460 * following values.
461 * SCIF_PROT_READ - allow read operations from the window
462 * SCIF_PROT_WRITE - allow write operations to the window
463 *
464 * The map_flags argument can be set to SCIF_MAP_FIXED which interprets a
465 * fixed offset.
466 *
467 * Return:
468 * Upon successful completion, scif_register() returns the offset at which the
469 * mapping was placed (po); otherwise in user mode SCIF_REGISTER_FAILED (that
470 * is ((off_t)-1)) is returned and errno is set to indicate the error; in
471 * kernel mode the negative of one of the following errors is returned.
472 *
473 * Errors:
474 * EADDRINUSE - SCIF_MAP_FIXED is set in map_flags, and pages in the range
475 * [offset, offset + len -1] are already registered
476 * EAGAIN - The mapping could not be performed due to lack of resources
477 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
478 * ECONNRESET - Connection reset by peer
479 * EFAULT - Addresses in the range [addr, addr + len - 1] are invalid
480 * EINVAL - map_flags is invalid, or prot_flags is invalid, or SCIF_MAP_FIXED is
481 * set in map_flags and offset is not a multiple of the page size, or addr is
482 * not a multiple of the page size, or len is not a multiple of the page size
483 * or is 0, or offset is negative
484 * ENODEV - The remote node is lost, or existed but is not currently in the
485 * network since it may have crashed
486 * ENOMEM - Not enough space
487 * ENOTCONN - The endpoint is not connected
488 */
489off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
490 int prot_flags, int map_flags);
491
492/**
493 * scif_unregister() - Unregister a memory region marked for remote access.
494 * @epd: endpoint descriptor
495 * @offset: start of range to unregister
496 * @len: length of range to unregister
497 *
498 * The scif_unregister() function closes those previously registered windows
499 * which are entirely within the range [offset, offset + len - 1]. It is an
500 * error to specify a range which intersects only a subrange of a window.
501 *
502 * On a successful return, pages within the window may no longer be specified
503 * in calls to mmap(), scif_readfrom(), scif_writeto(), scif_vreadfrom(),
504 * scif_vwriteto(), scif_get_pages(), and scif_fence_signal(). The window,
505 * however, continues to exist until all previous references against it are
506 * removed. A window is referenced if there is a mapping to it created by
507 * mmap(), or if scif_get_pages() was called against the window
508 * (and the pages have not been returned via scif_put_pages()). A window is
509 * also referenced while an RMA, in which some range of the window is a source
510 * or destination, is in progress. Finally a window is referenced while some
511 * offset in that window was specified to scif_fence_signal(), and the RMAs
512 * marked by that call to scif_fence_signal() have not completed. While a
513 * window is in this state, its registered address space pages are not
514 * available for use in a new registered window.
515 *
516 * When all such references to the window have been removed, its references to
517 * all the physical pages which it represents are removed. Similarly, the
518 * registered address space pages of the window become available for
519 * registration in a new window.
520 *
521 * Return:
522 * Upon successful completion, scif_unregister() returns 0; otherwise in user
523 * mode -1 is returned and errno is set to indicate the error; in kernel mode
524 * the negative of one of the following errors is returned. In the event of an
525 * error, no windows are unregistered.
526 *
527 * Errors:
528 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
529 * ECONNRESET - Connection reset by peer
530 * EINVAL - the range [offset, offset + len - 1] intersects a subrange of a
531 * window, or offset is negative
532 * ENODEV - The remote node is lost or existed, but is not currently in the
533 * network since it may have crashed
534 * ENOTCONN - The endpoint is not connected
535 * ENXIO - Offsets in the range [offset, offset + len - 1] are invalid for the
536 * registered address space of epd
537 */
538int scif_unregister(scif_epd_t epd, off_t offset, size_t len);
539
540/**
541 * scif_readfrom() - Copy from a remote address space
542 * @epd: endpoint descriptor
543 * @loffset: offset in local registered address space to
544 * which to copy
545 * @len: length of range to copy
546 * @roffset: offset in remote registered address space
547 * from which to copy
548 * @rma_flags: transfer mode flags
549 *
550 * scif_readfrom() copies len bytes from the remote registered address space of
551 * the peer of endpoint epd, starting at the offset roffset to the local
552 * registered address space of epd, starting at the offset loffset.
553 *
554 * Each of the specified ranges [loffset, loffset + len - 1] and [roffset,
555 * roffset + len - 1] must be within some registered window or windows of the
556 * local and remote nodes. A range may intersect multiple registered windows,
557 * but only if those windows are contiguous in the registered address space.
558 *
559 * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
560 * programmed read/writes. Otherwise the data is copied using DMA. If
561 * rma_flags includes SCIF_RMA_SYNC, then scif_readfrom() will return after the
562 * transfer is complete. Otherwise, the transfer may be performed asynchron-
563 * ously. The order in which any two asynchronous RMA operations complete
564 * is non-deterministic. The synchronization functions, scif_fence_mark()/
565 * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
566 * the completion of asynchronous RMA operations on the same endpoint.
567 *
568 * The DMA transfer of individual bytes is not guaranteed to complete in
569 * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
570 * cacheline or partial cacheline of the source range will become visible on
571 * the destination node after all other transferred data in the source
572 * range has become visible on the destination node.
573 *
574 * The optimal DMA performance will likely be realized if both
575 * loffset and roffset are cacheline aligned (are a multiple of 64). Lower
576 * performance will likely be realized if loffset and roffset are not
577 * cacheline aligned but are separated by some multiple of 64. The lowest level
578 * of performance is likely if loffset and roffset are not separated by a
579 * multiple of 64.
580 *
581 * The rma_flags argument is formed by ORing together zero or more of the
582 * following values.
583 * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
584 * engine.
585 * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
586 * transfer has completed. Passing this flag results in the
587 * current implementation busy waiting and consuming CPU cycles
588 * while the DMA transfer is in progress for best performance by
589 * avoiding the interrupt latency.
590 * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
591 * the source range becomes visible on the destination node
592 * after all other transferred data in the source range has
593 * become visible on the destination
594 *
595 * Return:
596 * Upon successful completion, scif_readfrom() returns 0; otherwise in user
597 * mode -1 is returned and errno is set to indicate the error; in kernel mode
598 * the negative of one of the following errors is returned.
599 *
600 * Errors:
601 * EACCES - Attempt to write to a read-only range
602 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
603 * ECONNRESET - Connection reset by peer
604 * EINVAL - rma_flags is invalid
605 * ENODEV - The remote node is lost or existed, but is not currently in the
606 * network since it may have crashed
607 * ENOTCONN - The endpoint is not connected
608 * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered
609 * address space of epd, or, The range [roffset, roffset + len - 1] is invalid
610 * for the registered address space of the peer of epd, or loffset or roffset
611 * is negative
612 */
613int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, off_t
614 roffset, int rma_flags);
615
616/**
617 * scif_writeto() - Copy to a remote address space
618 * @epd: endpoint descriptor
619 * @loffset: offset in local registered address space
620 * from which to copy
621 * @len: length of range to copy
622 * @roffset: offset in remote registered address space to
623 * which to copy
624 * @rma_flags: transfer mode flags
625 *
626 * scif_writeto() copies len bytes from the local registered address space of
627 * epd, starting at the offset loffset to the remote registered address space
628 * of the peer of endpoint epd, starting at the offset roffset.
629 *
630 * Each of the specified ranges [loffset, loffset + len - 1] and [roffset,
631 * roffset + len - 1] must be within some registered window or windows of the
632 * local and remote nodes. A range may intersect multiple registered windows,
633 * but only if those windows are contiguous in the registered address space.
634 *
635 * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
636 * programmed read/writes. Otherwise the data is copied using DMA. If
637 * rma_flags includes SCIF_RMA_SYNC, then scif_writeto() will return after the
638 * transfer is complete. Otherwise, the transfer may be performed asynchron-
639 * ously. The order in which any two asynchronous RMA operations complete
640 * is non-deterministic. The synchronization functions, scif_fence_mark()/
641 * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
642 * the completion of asynchronous RMA operations on the same endpoint.
643 *
644 * The DMA transfer of individual bytes is not guaranteed to complete in
645 * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
646 * cacheline or partial cacheline of the source range will become visible on
647 * the destination node after all other transferred data in the source
648 * range has become visible on the destination node.
649 *
650 * The optimal DMA performance will likely be realized if both
651 * loffset and roffset are cacheline aligned (are a multiple of 64). Lower
652 * performance will likely be realized if loffset and roffset are not cacheline
653 * aligned but are separated by some multiple of 64. The lowest level of
654 * performance is likely if loffset and roffset are not separated by a multiple
655 * of 64.
656 *
657 * The rma_flags argument is formed by ORing together zero or more of the
658 * following values.
659 * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
660 * engine.
661 * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
662 * transfer has completed. Passing this flag results in the
663 * current implementation busy waiting and consuming CPU cycles
664 * while the DMA transfer is in progress for best performance by
665 * avoiding the interrupt latency.
666 * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
667 * the source range becomes visible on the destination node
668 * after all other transferred data in the source range has
669 * become visible on the destination
670 *
671 * Return:
672 * Upon successful completion, scif_writeto() returns 0; otherwise in user
673 * mode -1 is returned and errno is set to indicate the error; in kernel mode
674 * the negative of one of the following errors is returned.
675 *
676 * Errors:
677 * EACCES - Attempt to write to a read-only range
678 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
679 * ECONNRESET - Connection reset by peer
680 * EINVAL - rma_flags is invalid
681 * ENODEV - The remote node is lost or existed, but is not currently in the
682 * network since it may have crashed
683 * ENOTCONN - The endpoint is not connected
684 * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered
685 * address space of epd, or, The range [roffset, roffset + len - 1] is invalid
686 * for the registered address space of the peer of epd, or loffset or roffset
687 * is negative
688 */
689int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, off_t
690 roffset, int rma_flags);
691
692/**
693 * scif_vreadfrom() - Copy from a remote address space
694 * @epd: endpoint descriptor
695 * @addr: address to which to copy
696 * @len: length of range to copy
697 * @roffset: offset in remote registered address space
698 * from which to copy
699 * @rma_flags: transfer mode flags
700 *
701 * scif_vreadfrom() copies len bytes from the remote registered address
702 * space of the peer of endpoint epd, starting at the offset roffset, to local
703 * memory, starting at addr.
704 *
705 * The specified range [roffset, roffset + len - 1] must be within some
706 * registered window or windows of the remote nodes. The range may
707 * intersect multiple registered windows, but only if those windows are
708 * contiguous in the registered address space.
709 *
710 * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
711 * programmed read/writes. Otherwise the data is copied using DMA. If
712 * rma_flags includes SCIF_RMA_SYNC, then scif_vreadfrom() will return after the
713 * transfer is complete. Otherwise, the transfer may be performed asynchron-
714 * ously. The order in which any two asynchronous RMA operations complete
715 * is non-deterministic. The synchronization functions, scif_fence_mark()/
716 * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
717 * the completion of asynchronous RMA operations on the same endpoint.
718 *
719 * The DMA transfer of individual bytes is not guaranteed to complete in
720 * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
721 * cacheline or partial cacheline of the source range will become visible on
722 * the destination node after all other transferred data in the source
723 * range has become visible on the destination node.
724 *
725 * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back
726 * the specified local memory range may remain in a pinned state even after
727 * the specified transfer completes. This may reduce overhead if some or all of
728 * the same virtual address range is referenced in a subsequent call of
729 * scif_vreadfrom() or scif_vwriteto().
730 *
731 * The optimal DMA performance will likely be realized if both
732 * addr and roffset are cacheline aligned (are a multiple of 64). Lower
733 * performance will likely be realized if addr and roffset are not
734 * cacheline aligned but are separated by some multiple of 64. The lowest level
735 * of performance is likely if addr and roffset are not separated by a
736 * multiple of 64.
737 *
738 * The rma_flags argument is formed by ORing together zero or more of the
739 * following values.
740 * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
741 * engine.
742 * SCIF_RMA_USECACHE - enable registration caching
743 * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
744 * transfer has completed. Passing this flag results in the
745 * current implementation busy waiting and consuming CPU cycles
746 * while the DMA transfer is in progress for best performance by
747 * avoiding the interrupt latency.
748 * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
749 * the source range becomes visible on the destination node
750 * after all other transferred data in the source range has
751 * become visible on the destination
752 *
753 * Return:
754 * Upon successful completion, scif_vreadfrom() returns 0; otherwise in user
755 * mode -1 is returned and errno is set to indicate the error; in kernel mode
756 * the negative of one of the following errors is returned.
757 *
758 * Errors:
759 * EACCES - Attempt to write to a read-only range
760 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
761 * ECONNRESET - Connection reset by peer
762 * EFAULT - Addresses in the range [addr, addr + len - 1] are invalid
763 * EINVAL - rma_flags is invalid
764 * ENODEV - The remote node is lost or existed, but is not currently in the
765 * network since it may have crashed
766 * ENOTCONN - The endpoint is not connected
767 * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the
768 * registered address space of the peer of epd
769 */
770int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t roffset,
771 int rma_flags);
772
773/**
774 * scif_vwriteto() - Copy to a remote address space
775 * @epd: endpoint descriptor
776 * @addr: address from which to copy
777 * @len: length of range to copy
778 * @roffset: offset in remote registered address space to
779 * which to copy
780 * @rma_flags: transfer mode flags
781 *
782 * scif_vwriteto() copies len bytes from the local memory, starting at addr, to
783 * the remote registered address space of the peer of endpoint epd, starting at
784 * the offset roffset.
785 *
786 * The specified range [roffset, roffset + len - 1] must be within some
787 * registered window or windows of the remote nodes. The range may intersect
788 * multiple registered windows, but only if those windows are contiguous in the
789 * registered address space.
790 *
791 * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
792 * programmed read/writes. Otherwise the data is copied using DMA. If
793 * rma_flags includes SCIF_RMA_SYNC, then scif_vwriteto() will return after the
794 * transfer is complete. Otherwise, the transfer may be performed asynchron-
795 * ously. The order in which any two asynchronous RMA operations complete
796 * is non-deterministic. The synchronization functions, scif_fence_mark()/
797 * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
798 * the completion of asynchronous RMA operations on the same endpoint.
799 *
800 * The DMA transfer of individual bytes is not guaranteed to complete in
801 * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
802 * cacheline or partial cacheline of the source range will become visible on
803 * the destination node after all other transferred data in the source
804 * range has become visible on the destination node.
805 *
806 * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back
807 * the specified local memory range may remain in a pinned state even after
808 * the specified transfer completes. This may reduce overhead if some or all of
809 * the same virtual address range is referenced in a subsequent call of
810 * scif_vreadfrom() or scif_vwriteto().
811 *
812 * The optimal DMA performance will likely be realized if both
813 * addr and roffset are cacheline aligned (are a multiple of 64). Lower
814 * performance will likely be realized if addr and roffset are not cacheline
815 * aligned but are separated by some multiple of 64. The lowest level of
816 * performance is likely if addr and roffset are not separated by a multiple of
817 * 64.
818 *
819 * The rma_flags argument is formed by ORing together zero or more of the
820 * following values.
821 * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
822 * engine.
823 * SCIF_RMA_USECACHE - allow registration caching
824 * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
825 * transfer has completed. Passing this flag results in the
826 * current implementation busy waiting and consuming CPU cycles
827 * while the DMA transfer is in progress for best performance by
828 * avoiding the interrupt latency.
829 * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
830 * the source range becomes visible on the destination node
831 * after all other transferred data in the source range has
832 * become visible on the destination
833 *
834 * Return:
835 * Upon successful completion, scif_vwriteto() returns 0; otherwise in user
836 * mode -1 is returned and errno is set to indicate the error; in kernel mode
837 * the negative of one of the following errors is returned.
838 *
839 * Errors:
840 * EACCES - Attempt to write to a read-only range
841 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
842 * ECONNRESET - Connection reset by peer
843 * EFAULT - Addresses in the range [addr, addr + len - 1] are invalid
844 * EINVAL - rma_flags is invalid
845 * ENODEV - The remote node is lost or existed, but is not currently in the
846 * network since it may have crashed
847 * ENOTCONN - The endpoint is not connected
848 * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the
849 * registered address space of the peer of epd
850 */
851int scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t roffset,
852 int rma_flags);
853
854/**
855 * scif_fence_mark() - Mark previously issued RMAs
856 * @epd: endpoint descriptor
857 * @flags: control flags
858 * @mark: marked value returned as output.
859 *
860 * scif_fence_mark() returns after marking the current set of all uncompleted
861 * RMAs initiated through the endpoint epd or the current set of all
862 * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are
863 * marked with a value returned at mark. The application may subsequently call
864 * scif_fence_wait(), passing the value returned at mark, to await completion
865 * of all RMAs so marked.
866 *
867 * The flags argument has exactly one of the following values.
868 * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint
869 * epd are marked
870 * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer
871 * of endpoint epd are marked
872 *
873 * Return:
874 * Upon successful completion, scif_fence_mark() returns 0; otherwise in user
875 * mode -1 is returned and errno is set to indicate the error; in kernel mode
876 * the negative of one of the following errors is returned.
877 *
878 * Errors:
879 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
880 * ECONNRESET - Connection reset by peer
881 * EINVAL - flags is invalid
882 * ENODEV - The remote node is lost or existed, but is not currently in the
883 * network since it may have crashed
884 * ENOTCONN - The endpoint is not connected
885 * ENOMEM - Insufficient kernel memory was available
886 */
887int scif_fence_mark(scif_epd_t epd, int flags, int *mark);
888
889/**
890 * scif_fence_wait() - Wait for completion of marked RMAs
891 * @epd: endpoint descriptor
892 * @mark: mark request
893 *
894 * scif_fence_wait() returns after all RMAs marked with mark have completed.
895 * The value passed in mark must have been obtained in a previous call to
896 * scif_fence_mark().
897 *
898 * Return:
899 * Upon successful completion, scif_fence_wait() returns 0; otherwise in user
900 * mode -1 is returned and errno is set to indicate the error; in kernel mode
901 * the negative of one of the following errors is returned.
902 *
903 * Errors:
904 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
905 * ECONNRESET - Connection reset by peer
906 * ENODEV - The remote node is lost or existed, but is not currently in the
907 * network since it may have crashed
908 * ENOTCONN - The endpoint is not connected
909 * ENOMEM - Insufficient kernel memory was available
910 */
911int scif_fence_wait(scif_epd_t epd, int mark);
912
913/**
914 * scif_fence_signal() - Request a memory update on completion of RMAs
915 * @epd: endpoint descriptor
916 * @loff: local offset
917 * @lval: local value to write to loff
918 * @roff: remote offset
919 * @rval: remote value to write to roff
920 * @flags: flags
921 *
922 * scif_fence_signal() returns after marking the current set of all uncompleted
923 * RMAs initiated through the endpoint epd or marking the current set of all
924 * uncompleted RMAs initiated through the peer of endpoint epd.
925 *
926 * If flags includes SCIF_SIGNAL_LOCAL, then on completion of the RMAs in the
927 * marked set, lval is written to memory at the address corresponding to offset
928 * loff in the local registered address space of epd. loff must be within a
929 * registered window. If flags includes SCIF_SIGNAL_REMOTE, then on completion
930 * of the RMAs in the marked set, rval is written to memory at the address
931 * corresponding to offset roff in the remote registered address space of epd.
932 * roff must be within a remote registered window of the peer of epd. Note
933 * that any specified offset must be DWORD (4 byte / 32 bit) aligned.
934 *
935 * The flags argument is formed by OR'ing together the following.
936 * Exactly one of the following values.
937 * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint
938 * epd are marked
939 * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer
940 * of endpoint epd are marked
941 * One or more of the following values.
942 * SCIF_SIGNAL_LOCAL - On completion of the marked set of RMAs, write lval to
943 * memory at the address corresponding to offset loff in the local
944 * registered address space of epd.
945 * SCIF_SIGNAL_REMOTE - On completion of the marked set of RMAs, write rval to
946 * memory at the address corresponding to offset roff in the remote
947 * registered address space of epd.
948 *
949 * Return:
950 * Upon successful completion, scif_fence_signal() returns 0; otherwise in
951 * user mode -1 is returned and errno is set to indicate the error; in kernel
952 * mode the negative of one of the following errors is returned.
953 *
954 * Errors:
955 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
956 * ECONNRESET - Connection reset by peer
957 * EINVAL - flags is invalid, or loff or roff are not DWORD aligned
958 * ENODEV - The remote node is lost or existed, but is not currently in the
959 * network since it may have crashed
960 * ENOTCONN - The endpoint is not connected
961 * ENXIO - loff is invalid for the registered address space of epd, or roff is
962 * invalid for the registered address space of the peer of epd
963 */
964int scif_fence_signal(scif_epd_t epd, off_t loff, u64 lval, off_t roff,
965 u64 rval, int flags);
966
967/**
968 * scif_get_node_ids() - Return information about online nodes
969 * @nodes: array in which to return online node IDs
970 * @len: number of entries in the nodes array
971 * @self: address to place the node ID of the local node
972 *
973 * scif_get_node_ids() fills in the nodes array with up to len node IDs of the
974 * nodes in the SCIF network. If there is not enough space in nodes, as
975 * indicated by the len parameter, only len node IDs are returned in nodes. The
976 * return value of scif_get_node_ids() is the total number of nodes currently in
977 * the SCIF network. By checking the return value against the len parameter,
978 * the user may determine if enough space for nodes was allocated.
979 *
980 * The node ID of the local node is returned at self.
981 *
982 * Return:
983 * Upon successful completion, scif_get_node_ids() returns the actual number of
984 * online nodes in the SCIF network including 'self'; otherwise in user mode
985 * -1 is returned and errno is set to indicate the error; in kernel mode no
986 * errors are returned.
987 *
988 * Errors:
989 * EFAULT - Bad address
990 */
991int scif_get_node_ids(u16 *nodes, int len, u16 *self);
992
993#endif /* __SCIF_H__ */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 1a0006a76b00..4ad65eebbff8 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -352,6 +352,7 @@ header-y += rtc.h
352header-y += rtnetlink.h 352header-y += rtnetlink.h
353header-y += scc.h 353header-y += scc.h
354header-y += sched.h 354header-y += sched.h
355header-y += scif_ioctl.h
355header-y += screen_info.h 356header-y += screen_info.h
356header-y += sctp.h 357header-y += sctp.h
357header-y += sdla.h 358header-y += sdla.h
diff --git a/include/uapi/linux/scif_ioctl.h b/include/uapi/linux/scif_ioctl.h
new file mode 100644
index 000000000000..4a94d917cf99
--- /dev/null
+++ b/include/uapi/linux/scif_ioctl.h
@@ -0,0 +1,130 @@
1/*
2 * Intel MIC Platform Software Stack (MPSS)
3 *
4 * This file is provided under a dual BSD/GPLv2 license. When using or
5 * redistributing this file, you may do so under either license.
6 *
7 * GPL LICENSE SUMMARY
8 *
9 * Copyright(c) 2014 Intel Corporation.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of version 2 of the GNU General Public License as
13 * published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * BSD LICENSE
21 *
22 * Copyright(c) 2014 Intel Corporation.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 *
28 * * Redistributions of source code must retain the above copyright
29 * notice, this list of conditions and the following disclaimer.
30 * * Redistributions in binary form must reproduce the above copyright
31 * notice, this list of conditions and the following disclaimer in
32 * the documentation and/or other materials provided with the
33 * distribution.
34 * * Neither the name of Intel Corporation nor the names of its
35 * contributors may be used to endorse or promote products derived
36 * from this software without specific prior written permission.
37 *
38 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
39 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
40 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
41 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
42 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
44 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
45 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
46 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
47 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
48 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
49 *
50 * Intel SCIF driver.
51 *
52 */
53/*
54 * -----------------------------------------
55 * SCIF IOCTL interface information
56 * -----------------------------------------
57 */
58#ifndef SCIF_IOCTL_H
59#define SCIF_IOCTL_H
60
61#include <linux/types.h>
62
63/**
64 * struct scif_port_id - SCIF port information
65 * @node: node on which port resides
66 * @port: local port number
67 */
68struct scif_port_id {
69 __u16 node;
70 __u16 port;
71};
72
73/**
74 * struct scifioctl_connect - used for SCIF_CONNECT IOCTL
75 * @self: used to read back the assigned port_id
76 * @peer: destination node and port to connect to
77 */
78struct scifioctl_connect {
79 struct scif_port_id self;
80 struct scif_port_id peer;
81};
82
83/**
84 * struct scifioctl_accept - used for SCIF_ACCEPTREQ IOCTL
85 * @flags: flags
86 * @peer: global id of peer endpoint
87 * @endpt: new connected endpoint descriptor
88 */
89struct scifioctl_accept {
90 __s32 flags;
91 struct scif_port_id peer;
92 __u64 endpt;
93};
94
95/**
96 * struct scifioctl_msg - used for SCIF_SEND/SCIF_RECV IOCTL
97 * @msg: message buffer address
98 * @len: message length
99 * @flags: flags
100 * @out_len: number of bytes sent/received
101 */
102struct scifioctl_msg {
103 __u64 msg;
104 __s32 len;
105 __s32 flags;
106 __s32 out_len;
107};
108
109/**
110 * struct scifioctl_node_ids - used for SCIF_GET_NODEIDS IOCTL
111 * @nodes: pointer to an array of node_ids
112 * @self: ID of the current node
113 * @len: length of array
114 */
115struct scifioctl_node_ids {
116 __u64 nodes;
117 __u64 self;
118 __s32 len;
119};
120
121#define SCIF_BIND _IOWR('s', 1, __u64)
122#define SCIF_LISTEN _IOW('s', 2, __s32)
123#define SCIF_CONNECT _IOWR('s', 3, struct scifioctl_connect)
124#define SCIF_ACCEPTREQ _IOWR('s', 4, struct scifioctl_accept)
125#define SCIF_ACCEPTREG _IOWR('s', 5, __u64)
126#define SCIF_SEND _IOWR('s', 6, struct scifioctl_msg)
127#define SCIF_RECV _IOWR('s', 7, struct scifioctl_msg)
128#define SCIF_GET_NODEIDS _IOWR('s', 14, struct scifioctl_node_ids)
129
130#endif /* SCIF_IOCTL_H */