Diffstat (limited to 'drivers/misc')
-rw-r--r--  drivers/misc/Kconfig                 |   12
-rw-r--r--  drivers/misc/Makefile                |    1
-rw-r--r--  drivers/misc/sgi-xp/Makefile         |   11
-rw-r--r--  drivers/misc/sgi-xp/xp.h             |  485
-rw-r--r--  drivers/misc/sgi-xp/xp_main.c        |  290
-rw-r--r--  drivers/misc/sgi-xp/xp_nofault.S     |   36
-rw-r--r--  drivers/misc/sgi-xp/xpc.h            | 1267
-rw-r--r--  drivers/misc/sgi-xp/xpc_channel.c    | 2379
-rw-r--r--  drivers/misc/sgi-xp/xpc_main.c       | 1431
-rw-r--r--  drivers/misc/sgi-xp/xpc_partition.c  | 1239
-rw-r--r--  drivers/misc/sgi-xp/xpnet.c          |  718
11 files changed, 7869 insertions(+), 0 deletions(-)
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index bb94ce78a6d0..297a48f85446 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -360,4 +360,16 @@ config ENCLOSURE_SERVICES
360	  driver (SCSI/ATA) which supports enclosures
361	  or a SCSI enclosure device (SES) to use these services.
362
363config SGI_XP
364 tristate "Support communication between SGI SSIs"
365 depends on IA64_GENERIC || IA64_SGI_SN2
366 select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
367 select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
368 ---help---
369 An SGI machine can be divided into multiple Single System
370 Images which act independently of each other and have
371 hardware based memory protection from the others. Enabling
372 this feature will allow for direct communication between SSIs
373 based on a network adapter and DMA messaging.
374
375endif # MISC_DEVICES
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 4581b2533111..5914da434854 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_EEPROM_93CX6) += eeprom_93cx6.o
24obj-$(CONFIG_INTEL_MENLOW)	+= intel_menlow.o
25obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o
26obj-$(CONFIG_KGDB_TESTS)	+= kgdbts.o
27obj-$(CONFIG_SGI_XP) += sgi-xp/
diff --git a/drivers/misc/sgi-xp/Makefile b/drivers/misc/sgi-xp/Makefile
new file mode 100644
index 000000000000..b6e40a7958ce
--- /dev/null
+++ b/drivers/misc/sgi-xp/Makefile
@@ -0,0 +1,11 @@
1#
2# Makefile for SGI's XP devices.
3#
4
5obj-$(CONFIG_SGI_XP) += xp.o
6xp-y := xp_main.o xp_nofault.o
7
8obj-$(CONFIG_SGI_XP) += xpc.o
9xpc-y := xpc_main.o xpc_channel.o xpc_partition.o
10
11obj-$(CONFIG_SGI_XP) += xpnet.o
diff --git a/drivers/misc/sgi-xp/xp.h b/drivers/misc/sgi-xp/xp.h
new file mode 100644
index 000000000000..fb65981754c3
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp.h
@@ -0,0 +1,485 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (C) 2004-2008 Silicon Graphics, Inc. All rights reserved.
7 */
8
9
10/*
11 * External Cross Partition (XP) structures and defines.
12 */
13
14
15#ifndef _DRIVERS_MISC_SGIXP_XP_H
16#define _DRIVERS_MISC_SGIXP_XP_H
17
18
19#include <linux/cache.h>
20#include <linux/hardirq.h>
21#include <linux/mutex.h>
22#include <asm/sn/types.h>
23#include <asm/sn/bte.h>
24
25
26#ifdef USE_DBUG_ON
27#define DBUG_ON(condition) BUG_ON(condition)
28#else
29#define DBUG_ON(condition)
30#endif
31
32
33/*
34 * Define the maximum number of logically defined partitions the system
35 * can support. It is constrained by the maximum number of hardware
36 * partitionable regions. The term 'region' in this context refers to the
37 * minimum number of nodes that can comprise an access protection grouping.
38 * The access protection applies to memory, IPI and IOI.
39 *
40 * The maximum number of hardware partitionable regions is equal to the
41 * maximum number of nodes in the entire system divided by the minimum number
42 * of nodes that comprise an access protection grouping.
43 */
44#define XP_MAX_PARTITIONS 64
45
46
47/*
48 * Define the number of u64s required to represent all the C-brick nasids
49 * as a bitmap. The cross-partition kernel modules deal only with
50 * C-brick nasids, thus the need for bitmaps which don't account for
51 * odd-numbered (non C-brick) nasids.
52 */
53#define XP_MAX_PHYSNODE_ID (MAX_NUMALINK_NODES / 2)
54#define XP_NASID_MASK_BYTES ((XP_MAX_PHYSNODE_ID + 7) / 8)
55#define XP_NASID_MASK_WORDS ((XP_MAX_PHYSNODE_ID + 63) / 64)
56
57
58/*
59 * Wrapper for bte_copy() that, should it return a failure status, retries
60 * the bte_copy() once in the hope that the failure was due to a temporary
61 * aberration (e.g., the link going down temporarily).
62 *
63 * src - physical address of the source of the transfer.
64 * vdst - virtual address of the destination of the transfer.
65 * len - number of bytes to transfer from source to destination.
66 * mode - see bte_copy() for definition.
67 * notification - see bte_copy() for definition.
68 *
69 * Note: xp_bte_copy() should never be called while holding a spinlock.
70 */
71static inline bte_result_t
72xp_bte_copy(u64 src, u64 vdst, u64 len, u64 mode, void *notification)
73{
74 bte_result_t ret;
75 u64 pdst = ia64_tpa(vdst);
76
77
78 /*
79 * Ensure that the physically mapped memory is contiguous.
80 *
81 * We do this by ensuring that the memory is from region 7 only.
82 * If the need should arise to use memory from one of the other
83 * regions, then modify the BUG_ON() statement to ensure that the
84 * memory from that region is always physically contiguous.
85 */
86 BUG_ON(REGION_NUMBER(vdst) != RGN_KERNEL);
87
88 ret = bte_copy(src, pdst, len, mode, notification);
89 if ((ret != BTE_SUCCESS) && BTE_ERROR_RETRY(ret)) {
90 if (!in_interrupt()) {
91 cond_resched();
92 }
93 ret = bte_copy(src, pdst, len, mode, notification);
94 }
95
96 return ret;
97}
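/*
 * A minimal usage sketch (the helper name is hypothetical, not part of the
 * interface): pull one cacheline from a remote partition into a local kernel
 * buffer. remote_pa is assumed to be a cacheline-aligned physical address
 * obtained from the remote partition, and dst must come from the kernel-mapped
 * (region 7) heap, e.g. kmalloc().
 */
static inline bte_result_t
example_pull_cacheline(u64 remote_pa, void *dst)
{
	return xp_bte_copy(remote_pa, (u64)dst, L1_CACHE_BYTES,
			   (BTE_NORMAL | BTE_WACQUIRE), NULL);
}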
98
99
100/*
101 * XPC establishes channel connections between the local partition and any
102 * other partition that is currently up. Over these channels, kernel-level
103 * `users' can communicate with their counterparts on the other partitions.
104 *
105 * The maximum number of channels is limited to eight. For performance reasons,
106 * the internal cross partition structures require sixteen bytes per channel,
107 * and eight allows all of this interface-shared info to fit in one cache line.
108 *
109 * XPC_NCHANNELS reflects the total number of channels currently defined.
110 * If the need for additional channels arises, one can simply increase
111 * XPC_NCHANNELS accordingly. If the day should come where that number
112 * exceeds the MAXIMUM number of channels allowed (eight), then one will need
113 * to make changes to the XPC code to allow for this.
114 */
115#define XPC_MEM_CHANNEL 0 /* memory channel number */
116#define XPC_NET_CHANNEL 1 /* network channel number */
117
118#define XPC_NCHANNELS 2 /* #of defined channels */
119#define XPC_MAX_NCHANNELS 8 /* max #of channels allowed */
120
121#if XPC_NCHANNELS > XPC_MAX_NCHANNELS
122#error XPC_NCHANNELS exceeds MAXIMUM allowed.
123#endif
124
125
126/*
127 * The format of an XPC message is as follows:
128 *
129 * +-------+--------------------------------+
130 * | flags |////////////////////////////////|
131 * +-------+--------------------------------+
132 * | message # |
133 * +----------------------------------------+
134 * | payload (user-defined message) |
135 * | |
136 * :
137 * | |
138 * +----------------------------------------+
139 *
140 * The size of the payload is defined by the user via xpc_connect(). A user-
141 * defined message resides in the payload area.
142 *
143 * The user should have no dealings with the message header, but only the
144 * message's payload. When a message entry is allocated (via xpc_allocate())
145 * a pointer to the payload area is returned and not the actual beginning of
146 * the XPC message. The user then constructs a message in the payload area
147 * and passes that pointer as an argument to xpc_send() or xpc_send_notify().
148 *
149 * The size of a message entry (within a message queue) must be a multiple of
150 * the cacheline size in order to facilitate the BTE transfer of messages from
151 * one message queue to another. A macro, XPC_MSG_SIZE(), is provided for users
152 * who want to fit as many msg entries as possible in a given memory size
153 * (e.g. a memory page).
154 */
155struct xpc_msg {
156 u8 flags; /* FOR XPC INTERNAL USE ONLY */
157 u8 reserved[7]; /* FOR XPC INTERNAL USE ONLY */
158 s64 number; /* FOR XPC INTERNAL USE ONLY */
159
160 u64 payload; /* user defined portion of message */
161};
162
163
164#define XPC_MSG_PAYLOAD_OFFSET (u64) (&((struct xpc_msg *)0)->payload)
165#define XPC_MSG_SIZE(_payload_size) \
166 L1_CACHE_ALIGN(XPC_MSG_PAYLOAD_OFFSET + (_payload_size))
167
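/*
 * Illustrative sketch of the sizing arithmetic described above (the names are
 * hypothetical): given a user-chosen payload size, XPC_MSG_SIZE() yields the
 * cacheline-aligned size of one message entry, from which the number of
 * entries that fit in a page-sized message queue follows directly.
 */
#define EXAMPLE_PAYLOAD_SIZE	100	/* bytes of user-defined payload */
#define EXAMPLE_MSG_SIZE	XPC_MSG_SIZE(EXAMPLE_PAYLOAD_SIZE)
#define EXAMPLE_NENTRIES	(PAGE_SIZE / EXAMPLE_MSG_SIZE)	/* msgs per page */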
168
169/*
170 * Define the return values and values passed to user's callout functions.
171 * (It is important to add new value codes at the end just preceding
172 * xpcUnknownReason, which must have the highest numerical value.)
173 */
174enum xpc_retval {
175 xpcSuccess = 0,
176
177 xpcNotConnected, /* 1: channel is not connected */
178 xpcConnected, /* 2: channel connected (opened) */
179 xpcRETIRED1, /* 3: (formerly xpcDisconnected) */
180
181 xpcMsgReceived, /* 4: message received */
182 xpcMsgDelivered, /* 5: message delivered and acknowledged */
183
184 xpcRETIRED2, /* 6: (formerly xpcTransferFailed) */
185
186 xpcNoWait, /* 7: operation would require wait */
187 xpcRetry, /* 8: retry operation */
188 xpcTimeout, /* 9: timeout in xpc_allocate_msg_wait() */
189 xpcInterrupted, /* 10: interrupted wait */
190
191 xpcUnequalMsgSizes, /* 11: message size disparity between sides */
192 xpcInvalidAddress, /* 12: invalid address */
193
194 xpcNoMemory, /* 13: no memory available for XPC structures */
195 xpcLackOfResources, /* 14: insufficient resources for operation */
196 xpcUnregistered, /* 15: channel is not registered */
197 xpcAlreadyRegistered, /* 16: channel is already registered */
198
199 xpcPartitionDown, /* 17: remote partition is down */
200 xpcNotLoaded, /* 18: XPC module is not loaded */
201 xpcUnloading, /* 19: this side is unloading XPC module */
202
203 xpcBadMagic, /* 20: XPC MAGIC string not found */
204
205 xpcReactivating, /* 21: remote partition was reactivated */
206
207 xpcUnregistering, /* 22: this side is unregistering channel */
208 xpcOtherUnregistering, /* 23: other side is unregistering channel */
209
210 xpcCloneKThread, /* 24: cloning kernel thread */
211 xpcCloneKThreadFailed, /* 25: cloning kernel thread failed */
212
213 xpcNoHeartbeat, /* 26: remote partition has no heartbeat */
214
215 xpcPioReadError, /* 27: PIO read error */
216 xpcPhysAddrRegFailed, /* 28: registration of phys addr range failed */
217
218 xpcBteDirectoryError, /* 29: maps to BTEFAIL_DIR */
219 xpcBtePoisonError, /* 30: maps to BTEFAIL_POISON */
220 xpcBteWriteError, /* 31: maps to BTEFAIL_WERR */
221 xpcBteAccessError, /* 32: maps to BTEFAIL_ACCESS */
222 xpcBtePWriteError, /* 33: maps to BTEFAIL_PWERR */
223 xpcBtePReadError, /* 34: maps to BTEFAIL_PRERR */
224 xpcBteTimeOutError, /* 35: maps to BTEFAIL_TOUT */
225 xpcBteXtalkError, /* 36: maps to BTEFAIL_XTERR */
226 xpcBteNotAvailable, /* 37: maps to BTEFAIL_NOTAVAIL */
227 xpcBteUnmappedError, /* 38: unmapped BTEFAIL_ error */
228
229 xpcBadVersion, /* 39: bad version number */
230 xpcVarsNotSet, /* 40: the XPC variables are not set up */
231 xpcNoRsvdPageAddr, /* 41: unable to get rsvd page's phys addr */
232 xpcInvalidPartid, /* 42: invalid partition ID */
233 xpcLocalPartid, /* 43: local partition ID */
234
235 xpcOtherGoingDown, /* 44: other side going down, reason unknown */
236 xpcSystemGoingDown, /* 45: system is going down, reason unknown */
237 xpcSystemHalt, /* 46: system is being halted */
238 xpcSystemReboot, /* 47: system is being rebooted */
239 xpcSystemPoweroff, /* 48: system is being powered off */
240
241 xpcDisconnecting, /* 49: channel disconnecting (closing) */
242
243 xpcOpenCloseError, /* 50: channel open/close protocol error */
244
245 xpcDisconnected, /* 51: channel disconnected (closed) */
246
247 xpcBteSh2Start, /* 52: BTE CRB timeout */
248
249 /* 53: 0x1 BTE Error Response Short */
250 xpcBteSh2RspShort = xpcBteSh2Start + BTEFAIL_SH2_RESP_SHORT,
251
252 /* 54: 0x2 BTE Error Response Long */
253 xpcBteSh2RspLong = xpcBteSh2Start + BTEFAIL_SH2_RESP_LONG,
254
255 /* 56: 0x4 BTE Error Response DSB */
256 xpcBteSh2RspDSB = xpcBteSh2Start + BTEFAIL_SH2_RESP_DSP,
257
258 /* 60: 0x8 BTE Error Response Access */
259 xpcBteSh2RspAccess = xpcBteSh2Start + BTEFAIL_SH2_RESP_ACCESS,
260
261 /* 68: 0x10 BTE Error CRB timeout */
262 xpcBteSh2CRBTO = xpcBteSh2Start + BTEFAIL_SH2_CRB_TO,
263
264 /* 84: 0x20 BTE Error NACK limit */
265 xpcBteSh2NACKLimit = xpcBteSh2Start + BTEFAIL_SH2_NACK_LIMIT,
266
267 /* 115: BTE end */
268 xpcBteSh2End = xpcBteSh2Start + BTEFAIL_SH2_ALL,
269
270 xpcUnknownReason /* 116: unknown reason -- must be last in list */
271};
272
273
274/*
275 * Define the callout function types used by XPC to update the user on
276 * connection activity and state changes (via the user function registered by
277 * xpc_connect()) and to notify them of messages received and delivered (via
278 * the user function registered by xpc_send_notify()).
279 *
280 * The two function types are xpc_channel_func and xpc_notify_func and
281 * both share the following arguments, with the exception of "data", which
282 * only xpc_channel_func has.
283 *
284 * Arguments:
285 *
286 * reason - reason code. (See following table.)
287 * partid - partition ID associated with condition.
288 * ch_number - channel # associated with condition.
289 * data - pointer to optional data. (See following table.)
290 * key - pointer to optional user-defined value provided as the "key"
291 * argument to xpc_connect() or xpc_send_notify().
292 *
293 * In the following table the "Optional Data" column applies to callouts made
294 * to functions registered by xpc_connect(). A "NA" in that column indicates
295 * that this reason code can be passed to functions registered by
296 * xpc_send_notify() (i.e. they don't have data arguments).
297 *
298 * Also, the first three reason codes in the following table indicate
299 * success, whereas the others indicate failure. When a failure reason code
300 * is received, one can assume that the channel is not connected.
301 *
302 *
303 * Reason Code | Cause | Optional Data
304 * =====================+================================+=====================
305 * xpcConnected | connection has been established| max #of entries
306 * | to the specified partition on | allowed in message
307 * | the specified channel | queue
308 * ---------------------+--------------------------------+---------------------
309 * xpcMsgReceived | an XPC message arrived from | address of payload
310 * | the specified partition on the |
311 * | specified channel | [the user must call
312 * | | xpc_received() when
313 * | | finished with the
314 * | | payload]
315 * ---------------------+--------------------------------+---------------------
316 * xpcMsgDelivered | notification that the message | NA
317 * | was delivered to the intended |
318 * | recipient and that they have |
319 * | acknowledged its receipt by |
320 * | calling xpc_received() |
321 * =====================+================================+=====================
322 * xpcUnequalMsgSizes | can't connect to the specified | NULL
323 * | partition on the specified |
324 * | channel because of mismatched |
325 * | message sizes |
326 * ---------------------+--------------------------------+---------------------
327 * xpcNoMemory | insufficient memory available | NULL
328 * | to allocate message queue |
329 * ---------------------+--------------------------------+---------------------
330 * xpcLackOfResources | lack of resources to create | NULL
331 * | the necessary kthreads to |
332 * | support the channel |
333 * ---------------------+--------------------------------+---------------------
334 * xpcUnregistering | this side's user has | NULL or NA
335 * | unregistered by calling |
336 * | xpc_disconnect() |
337 * ---------------------+--------------------------------+---------------------
338 * xpcOtherUnregistering| the other side's user has | NULL or NA
339 * | unregistered by calling |
340 * | xpc_disconnect() |
341 * ---------------------+--------------------------------+---------------------
342 * xpcNoHeartbeat | the other side's XPC is no | NULL or NA
343 * | longer heartbeating |
344 * | |
345 * ---------------------+--------------------------------+---------------------
346 * xpcUnloading | this side's XPC module is | NULL or NA
347 * | being unloaded |
348 * | |
349 * ---------------------+--------------------------------+---------------------
350 * xpcOtherUnloading | the other side's XPC module is | NULL or NA
351 * | being unloaded |
352 * | |
353 * ---------------------+--------------------------------+---------------------
354 * xpcPioReadError | xp_nofault_PIOR() returned an | NULL or NA
355 * | error while sending an IPI |
356 * | |
357 * ---------------------+--------------------------------+---------------------
358 * xpcInvalidAddress | the address either received or | NULL or NA
359 * | sent by the specified partition|
360 * | is invalid |
361 * ---------------------+--------------------------------+---------------------
362 * xpcBteNotAvailable | attempt to pull data from the | NULL or NA
363 * xpcBtePoisonError | specified partition over the |
364 * xpcBteWriteError | specified channel via a |
365 * xpcBteAccessError | bte_copy() failed |
366 * xpcBteTimeOutError | |
367 * xpcBteXtalkError | |
368 * xpcBteDirectoryError | |
369 * xpcBteGenericError | |
370 * xpcBteUnmappedError | |
371 * ---------------------+--------------------------------+---------------------
372 * xpcUnknownReason | the specified channel to the | NULL or NA
373 * | specified partition was |
374 * | unavailable for unknown reasons|
375 * =====================+================================+=====================
376 */
377
378typedef void (*xpc_channel_func)(enum xpc_retval reason, partid_t partid,
379 int ch_number, void *data, void *key);
380
381typedef void (*xpc_notify_func)(enum xpc_retval reason, partid_t partid,
382 int ch_number, void *key);
383
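/*
 * A hedged sketch of a user callout of type xpc_channel_func (the function
 * name is hypothetical). It only illustrates dispatching on the reason codes
 * tabulated above; a real callout would also record the connection state and
 * must call xpc_received() (declared below) once it is finished with a
 * received payload.
 */
static inline void
example_channel_callout(enum xpc_retval reason, partid_t partid,
			int ch_number, void *data, void *key)
{
	switch (reason) {
	case xpcConnected:
		/* 'data' holds the max #of entries allowed in the msg queue */
		break;
	case xpcMsgReceived:
		/* 'data' points to the message payload */
		break;
	default:
		/* all other reason codes mean the channel is not connected */
		break;
	}
}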
384
385/*
386 * The following is a registration entry. There is a global array of these,
387 * one per channel. It is used to record the connection registration made
388 * by the users of XPC. As long as a registration entry exists, for any
389 * partition that comes up, XPC will attempt to establish a connection on
390 * that channel. Notification that a connection has been made will occur via
391 * the xpc_channel_func function.
392 *
393 * The 'func' field points to the function to call when asynchronous
394 * notification is required for such events as: a connection established/lost,
395 * or an incoming message received, or an error condition encountered. A
396 * non-NULL 'func' field indicates that there is an active registration for
397 * the channel.
398 */
399struct xpc_registration {
400 struct mutex mutex;
401 xpc_channel_func func; /* function to call */
402 void *key; /* pointer to user's key */
403 u16 nentries; /* #of msg entries in local msg queue */
404 u16 msg_size; /* message queue's message size */
405 u32 assigned_limit; /* limit on #of assigned kthreads */
406 u32 idle_limit; /* limit on #of idle kthreads */
407} ____cacheline_aligned;
408
409
410#define XPC_CHANNEL_REGISTERED(_c) (xpc_registrations[_c].func != NULL)
411
412
413/* the following are valid xpc_allocate() flags */
414#define XPC_WAIT 0 /* wait flag */
415#define XPC_NOWAIT 1 /* no wait flag */
416
417
418struct xpc_interface {
419 void (*connect)(int);
420 void (*disconnect)(int);
421 enum xpc_retval (*allocate)(partid_t, int, u32, void **);
422 enum xpc_retval (*send)(partid_t, int, void *);
423 enum xpc_retval (*send_notify)(partid_t, int, void *,
424 xpc_notify_func, void *);
425 void (*received)(partid_t, int, void *);
426 enum xpc_retval (*partid_to_nasids)(partid_t, void *);
427};
428
429
430extern struct xpc_interface xpc_interface;
431
432extern void xpc_set_interface(void (*)(int),
433 void (*)(int),
434 enum xpc_retval (*)(partid_t, int, u32, void **),
435 enum xpc_retval (*)(partid_t, int, void *),
436 enum xpc_retval (*)(partid_t, int, void *, xpc_notify_func,
437 void *),
438 void (*)(partid_t, int, void *),
439 enum xpc_retval (*)(partid_t, void *));
440extern void xpc_clear_interface(void);
441
442
443extern enum xpc_retval xpc_connect(int, xpc_channel_func, void *, u16,
444 u16, u32, u32);
445extern void xpc_disconnect(int);
446
447static inline enum xpc_retval
448xpc_allocate(partid_t partid, int ch_number, u32 flags, void **payload)
449{
450 return xpc_interface.allocate(partid, ch_number, flags, payload);
451}
452
453static inline enum xpc_retval
454xpc_send(partid_t partid, int ch_number, void *payload)
455{
456 return xpc_interface.send(partid, ch_number, payload);
457}
458
459static inline enum xpc_retval
460xpc_send_notify(partid_t partid, int ch_number, void *payload,
461 xpc_notify_func func, void *key)
462{
463 return xpc_interface.send_notify(partid, ch_number, payload, func, key);
464}
465
466static inline void
467xpc_received(partid_t partid, int ch_number, void *payload)
468{
469 return xpc_interface.received(partid, ch_number, payload);
470}
471
472static inline enum xpc_retval
473xpc_partid_to_nasids(partid_t partid, void *nasids)
474{
475 return xpc_interface.partid_to_nasids(partid, nasids);
476}
477
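/*
 * A minimal send-side sketch built on the wrappers above (the channel choice,
 * payload layout and helper name are illustrative only): allocate a message
 * entry, fill in the payload and send it.
 */
static inline enum xpc_retval
example_send_u64(partid_t partid, u64 value)
{
	enum xpc_retval ret;
	u64 *payload;

	ret = xpc_allocate(partid, XPC_NET_CHANNEL, XPC_WAIT,
			   (void **)&payload);
	if (ret != xpcSuccess)
		return ret;

	*payload = value;
	return xpc_send(partid, XPC_NET_CHANNEL, payload);
}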
478
479extern u64 xp_nofault_PIOR_target;
480extern int xp_nofault_PIOR(void *);
481extern int xp_error_PIOR(void);
482
483
484#endif /* _DRIVERS_MISC_SGIXP_XP_H */
485
diff --git a/drivers/misc/sgi-xp/xp_main.c b/drivers/misc/sgi-xp/xp_main.c
new file mode 100644
index 000000000000..5f9f9c2e9298
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp_main.c
@@ -0,0 +1,290 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9
10/*
11 * Cross Partition (XP) base.
12 *
13 * XP provides a base from which its users can interact
14 * with XPC, yet not be dependent on XPC.
15 *
16 */
17
18
19#include <linux/kernel.h>
20#include <linux/interrupt.h>
21#include <linux/module.h>
22#include <linux/mutex.h>
23#include <asm/sn/intr.h>
24#include <asm/sn/sn_sal.h>
25#include "xp.h"
26
27
28/*
29 * Target of nofault PIO read.
30 */
31u64 xp_nofault_PIOR_target;
32
33
34/*
35 * xpc_registrations[] keeps track of the xpc_connect() registrations made by
36 * the kernel-level users of XPC.
37 */
38struct xpc_registration xpc_registrations[XPC_NCHANNELS];
39
40
41/*
42 * Initialize the XPC interface to indicate that XPC isn't loaded.
43 */
44static enum xpc_retval xpc_notloaded(void) { return xpcNotLoaded; }
45
46struct xpc_interface xpc_interface = {
47 (void (*)(int)) xpc_notloaded,
48 (void (*)(int)) xpc_notloaded,
49 (enum xpc_retval (*)(partid_t, int, u32, void **)) xpc_notloaded,
50 (enum xpc_retval (*)(partid_t, int, void *)) xpc_notloaded,
51 (enum xpc_retval (*)(partid_t, int, void *, xpc_notify_func, void *))
52 xpc_notloaded,
53 (void (*)(partid_t, int, void *)) xpc_notloaded,
54 (enum xpc_retval (*)(partid_t, void *)) xpc_notloaded
55};
56
57
58/*
59 * XPC calls this when it (the XPC module) has been loaded.
60 */
61void
62xpc_set_interface(void (*connect)(int),
63 void (*disconnect)(int),
64 enum xpc_retval (*allocate)(partid_t, int, u32, void **),
65 enum xpc_retval (*send)(partid_t, int, void *),
66 enum xpc_retval (*send_notify)(partid_t, int, void *,
67 xpc_notify_func, void *),
68 void (*received)(partid_t, int, void *),
69 enum xpc_retval (*partid_to_nasids)(partid_t, void *))
70{
71 xpc_interface.connect = connect;
72 xpc_interface.disconnect = disconnect;
73 xpc_interface.allocate = allocate;
74 xpc_interface.send = send;
75 xpc_interface.send_notify = send_notify;
76 xpc_interface.received = received;
77 xpc_interface.partid_to_nasids = partid_to_nasids;
78}
79
80
81/*
82 * XPC calls this when it (the XPC module) is being unloaded.
83 */
84void
85xpc_clear_interface(void)
86{
87 xpc_interface.connect = (void (*)(int)) xpc_notloaded;
88 xpc_interface.disconnect = (void (*)(int)) xpc_notloaded;
89 xpc_interface.allocate = (enum xpc_retval (*)(partid_t, int, u32,
90 void **)) xpc_notloaded;
91 xpc_interface.send = (enum xpc_retval (*)(partid_t, int, void *))
92 xpc_notloaded;
93 xpc_interface.send_notify = (enum xpc_retval (*)(partid_t, int, void *,
94 xpc_notify_func, void *)) xpc_notloaded;
95 xpc_interface.received = (void (*)(partid_t, int, void *))
96 xpc_notloaded;
97 xpc_interface.partid_to_nasids = (enum xpc_retval (*)(partid_t, void *))
98 xpc_notloaded;
99}
100
101
102/*
103 * Register for automatic establishment of a channel connection whenever
104 * a partition comes up.
105 *
106 * Arguments:
107 *
108 * ch_number - channel # to register for connection.
109 * func - function to call for asynchronous notification of channel
110 * state changes (i.e., connection, disconnection, error) and
111 * the arrival of incoming messages.
112 * key - pointer to optional user-defined value that gets passed back
113 * to the user on any callouts made to func.
114 * payload_size - size in bytes of the XPC message's payload area which
115 * contains a user-defined message. The user should make
116 * this large enough to hold their largest message.
117 * nentries - max #of XPC message entries a message queue can contain.
118 * The actual number, which is determined when a connection
119 * is established and may be less than requested, will be
120 * passed to the user via the xpcConnected callout.
121 * assigned_limit - max number of kthreads allowed to be processing
122 * messages (per connection) at any given instant.
123 * idle_limit - max number of kthreads allowed to be idle at any given
124 * instant.
125 */
126enum xpc_retval
127xpc_connect(int ch_number, xpc_channel_func func, void *key, u16 payload_size,
128 u16 nentries, u32 assigned_limit, u32 idle_limit)
129{
130 struct xpc_registration *registration;
131
132
133 DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS);
134 DBUG_ON(payload_size == 0 || nentries == 0);
135 DBUG_ON(func == NULL);
136 DBUG_ON(assigned_limit == 0 || idle_limit > assigned_limit);
137
138 registration = &xpc_registrations[ch_number];
139
140 if (mutex_lock_interruptible(&registration->mutex) != 0) {
141 return xpcInterrupted;
142 }
143
144 /* if XPC_CHANNEL_REGISTERED(ch_number) */
145 if (registration->func != NULL) {
146 mutex_unlock(&registration->mutex);
147 return xpcAlreadyRegistered;
148 }
149
150 /* register the channel for connection */
151 registration->msg_size = XPC_MSG_SIZE(payload_size);
152 registration->nentries = nentries;
153 registration->assigned_limit = assigned_limit;
154 registration->idle_limit = idle_limit;
155 registration->key = key;
156 registration->func = func;
157
158 mutex_unlock(&registration->mutex);
159
160 xpc_interface.connect(ch_number);
161
162 return xpcSuccess;
163}
164
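/*
 * A hedged sketch of how a kernel-level user (a driver such as xpnet) might
 * register via xpc_connect(). The payload size, queue depth, kthread limits
 * and the callout/key names here are purely illustrative.
 */
static inline enum xpc_retval
example_register(xpc_channel_func example_callout, void *example_key)
{
	return xpc_connect(XPC_NET_CHANNEL, example_callout, example_key,
			   128,		/* payload_size in bytes */
			   64,		/* nentries requested */
			   8,		/* assigned_limit */
			   2);		/* idle_limit */
}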
165
166/*
167 * Remove the registration for automatic connection of the specified channel
168 * when a partition comes up.
169 *
170 * Before returning, xpc_disconnect() will wait until all connections on the
171 * specified channel have been closed/torn down, so the caller can be assured
172 * that they will not be receiving any more callouts from XPC to their
173 * function registered via xpc_connect().
174 *
175 * Arguments:
176 *
177 * ch_number - channel # to unregister.
178 */
179void
180xpc_disconnect(int ch_number)
181{
182 struct xpc_registration *registration;
183
184
185 DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS);
186
187 registration = &xpc_registrations[ch_number];
188
189 /*
190 * We've decided not to make this a down_interruptible(), since we
191 * figured XPC's users will just turn around and call xpc_disconnect()
193 * again anyway, so we might as well wait, if need be.
193 */
194 mutex_lock(&registration->mutex);
195
196 /* if !XPC_CHANNEL_REGISTERED(ch_number) */
197 if (registration->func == NULL) {
198 mutex_unlock(&registration->mutex);
199 return;
200 }
201
202 /* remove the connection registration for the specified channel */
203 registration->func = NULL;
204 registration->key = NULL;
205 registration->nentries = 0;
206 registration->msg_size = 0;
207 registration->assigned_limit = 0;
208 registration->idle_limit = 0;
209
210 xpc_interface.disconnect(ch_number);
211
212 mutex_unlock(&registration->mutex);
213
214 return;
215}
216
217
218int __init
219xp_init(void)
220{
221 int ret, ch_number;
222 u64 func_addr = *(u64 *) xp_nofault_PIOR;
223 u64 err_func_addr = *(u64 *) xp_error_PIOR;
224
225
226 if (!ia64_platform_is("sn2")) {
227 return -ENODEV;
228 }
229
230 /*
231 * Register a nofault code region which performs a cross-partition
232 * PIO read. If the PIO read times out, the MCA handler will consume
233 * the error and return to a kernel-provided instruction to indicate
234 * an error. This PIO read exists because it is guaranteed to time out
235 * if the destination is down (AMO operations do not time out on at
236 * least some CPUs on Shubs <= v1.2, which unfortunately we have to
237 * work around).
238 */
239 if ((ret = sn_register_nofault_code(func_addr, err_func_addr,
240 err_func_addr, 1, 1)) != 0) {
241 printk(KERN_ERR "XP: can't register nofault code, error=%d\n",
242 ret);
243 }
244 /*
245 * Setup the nofault PIO read target. (There is no special reason why
246 * SH_IPI_ACCESS was selected.)
247 */
248 if (is_shub2()) {
249 xp_nofault_PIOR_target = SH2_IPI_ACCESS0;
250 } else {
251 xp_nofault_PIOR_target = SH1_IPI_ACCESS;
252 }
253
254 /* initialize the connection registration mutex */
255 for (ch_number = 0; ch_number < XPC_NCHANNELS; ch_number++) {
256 mutex_init(&xpc_registrations[ch_number].mutex);
257 }
258
259 return 0;
260}
261module_init(xp_init);
262
263
264void __exit
265xp_exit(void)
266{
267 u64 func_addr = *(u64 *) xp_nofault_PIOR;
268 u64 err_func_addr = *(u64 *) xp_error_PIOR;
269
270
271 /* unregister the PIO read nofault code region */
272 (void) sn_register_nofault_code(func_addr, err_func_addr,
273 err_func_addr, 1, 0);
274}
275module_exit(xp_exit);
276
277
278MODULE_AUTHOR("Silicon Graphics, Inc.");
279MODULE_DESCRIPTION("Cross Partition (XP) base");
280MODULE_LICENSE("GPL");
281
282EXPORT_SYMBOL(xp_nofault_PIOR);
283EXPORT_SYMBOL(xp_nofault_PIOR_target);
284EXPORT_SYMBOL(xpc_registrations);
285EXPORT_SYMBOL(xpc_interface);
286EXPORT_SYMBOL(xpc_clear_interface);
287EXPORT_SYMBOL(xpc_set_interface);
288EXPORT_SYMBOL(xpc_connect);
289EXPORT_SYMBOL(xpc_disconnect);
290
diff --git a/drivers/misc/sgi-xp/xp_nofault.S b/drivers/misc/sgi-xp/xp_nofault.S
new file mode 100644
index 000000000000..c13a709c4db5
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp_nofault.S
@@ -0,0 +1,36 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9
10/*
11 * The xp_nofault_PIOR function takes a pointer to a remote PIO register
12 * and attempts to load and consume a value from it. This function
13 * will be registered as a nofault code block. In the event that the
14 * PIO read fails, the MCA handler will force the error to look
15 * corrected and vector to the xp_error_PIOR which will return an error.
16 *
17 * The definition of "consumption" and the time it takes for an MCA
18 * to surface is processor implementation specific. This code
19 * is sufficient on Itanium through the Montvale processor family.
20 * It may need to be adjusted for future processor implementations.
21 *
22 * extern int xp_nofault_PIOR(void *remote_register);
23 */
24
25 .global xp_nofault_PIOR
26xp_nofault_PIOR:
27 mov r8=r0 // Stage a success return value
28 ld8.acq r9=[r32];; // PIO Read the specified register
29 adds r9=1,r9;; // Add to force consumption
30 srlz.i;; // Allow time for MCA to surface
31 br.ret.sptk.many b0;; // Return success
32
33 .global xp_error_PIOR
34xp_error_PIOR:
35 mov r8=1 // Return value of 1
36 br.ret.sptk.many b0;; // Return failure
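A hedged C-side sketch of a caller (assuming "xp.h" is visible, remote_reg is a mapped address of a remote partition register, and the helper name is hypothetical): a zero return from xp_nofault_PIOR() means the read was consumed successfully, while a non-zero return means the MCA handler vectored to xp_error_PIOR.

static inline enum xpc_retval
example_check_remote(u64 *remote_reg)
{
	/* non-zero means the MCA handler vectored to xp_error_PIOR */
	if (xp_nofault_PIOR((void *)remote_reg) != 0)
		return xpcPioReadError;
	return xpcSuccess;
}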
diff --git a/drivers/misc/sgi-xp/xpc.h b/drivers/misc/sgi-xp/xpc.h
new file mode 100644
index 000000000000..14e70ee53ebe
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc.h
@@ -0,0 +1,1267 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9
10/*
11 * Cross Partition Communication (XPC) structures and macros.
12 */
13
14#ifndef _DRIVERS_MISC_SGIXP_XPC_H
15#define _DRIVERS_MISC_SGIXP_XPC_H
16
17
18#include <linux/interrupt.h>
19#include <linux/sysctl.h>
20#include <linux/device.h>
21#include <linux/mutex.h>
22#include <linux/completion.h>
23#include <asm/pgtable.h>
24#include <asm/processor.h>
25#include <asm/sn/bte.h>
26#include <asm/sn/clksupport.h>
27#include <asm/sn/addrs.h>
28#include <asm/sn/mspec.h>
29#include <asm/sn/shub_mmr.h>
30#include "xp.h"
31
32
33/*
34 * XPC Version numbers consist of a major and minor number. XPC can always
35 * talk to versions with the same major #, and never talk to versions with a
36 * different major #.
37 */
38#define _XPC_VERSION(_maj, _min) (((_maj) << 4) | ((_min) & 0xf))
39#define XPC_VERSION_MAJOR(_v) ((_v) >> 4)
40#define XPC_VERSION_MINOR(_v) ((_v) & 0xf)
41
42
43/*
44 * The next macros define word or bit representations for given
45 * C-brick nasid in either the SAL provided bit array representing
46 * nasids in the partition/machine or the AMO_t array used for
47 * inter-partition initiation communications.
48 *
49 * For SN2 machines, C-Bricks are always even numbered NASIDs. As
50 * such, some space will be saved by insisting that nasid information
51 * passed from SAL always be packed for C-Bricks and the
52 * cross-partition interrupts use the same packing scheme.
53 */
54#define XPC_NASID_W_INDEX(_n) (((_n) / 64) / 2)
55#define XPC_NASID_B_INDEX(_n) (((_n) / 2) & (64 - 1))
56#define XPC_NASID_IN_ARRAY(_n, _p) ((_p)[XPC_NASID_W_INDEX(_n)] & \
57 (1UL << XPC_NASID_B_INDEX(_n)))
58#define XPC_NASID_FROM_W_B(_w, _b) (((_w) * 64 + (_b)) * 2)
59
60#define XPC_HB_DEFAULT_INTERVAL 5 /* incr HB every x secs */
61#define XPC_HB_CHECK_DEFAULT_INTERVAL 20 /* check HB every x secs */
62
63/* define the process name of HB checker and the CPU it is pinned to */
64#define XPC_HB_CHECK_THREAD_NAME "xpc_hb"
65#define XPC_HB_CHECK_CPU 0
66
67/* define the process name of the discovery thread */
68#define XPC_DISCOVERY_THREAD_NAME "xpc_discovery"
69
70
71/*
72 * the reserved page
73 *
74 * SAL reserves one page of memory per partition for XPC. Though a full page
75 * in length (16384 bytes), its starting address is not page aligned, but it
76 * is cacheline aligned. The reserved page consists of the following:
77 *
78 * reserved page header
79 *
80 * The first cacheline of the reserved page contains the header
81 * (struct xpc_rsvd_page). Before SAL initialization has completed,
82 * SAL has set up the following fields of the reserved page header:
83 * SAL_signature, SAL_version, partid, and nasids_size. The other
84 * fields are set up by XPC. (xpc_rsvd_page points to the local
85 * partition's reserved page.)
86 *
87 * part_nasids mask
88 * mach_nasids mask
89 *
90 * SAL also sets up two bitmaps (or masks), one that reflects the actual
91 * nasids in this partition (part_nasids), and the other that reflects
92 * the actual nasids in the entire machine (mach_nasids). We're only
93 * interested in the even numbered nasids (which contain the processors
94 * and/or memory), so we only need half as many bits to represent the
95 * nasids. The part_nasids mask is located starting at the first cacheline
96 * following the reserved page header. The mach_nasids mask follows right
97 * after the part_nasids mask. The size in bytes of each mask is reflected
98 * by the reserved page header field 'nasids_size'. (Local partition's
99 * mask pointers are xpc_part_nasids and xpc_mach_nasids.)
100 *
101 * vars
102 * vars part
103 *
104 * Immediately following the mach_nasids mask are the XPC variables
105 * required by other partitions. First are those that are generic to all
106 * partitions (vars), followed on the next available cacheline by those
107 * which are partition specific (vars part). These are setup by XPC.
108 * (Local partition's vars pointers are xpc_vars and xpc_vars_part.)
109 *
110 * Note: Until vars_pa is set, the partition XPC code has not been initialized.
111 */
112struct xpc_rsvd_page {
113 u64 SAL_signature; /* SAL: unique signature */
114 u64 SAL_version; /* SAL: version */
115 u8 partid; /* SAL: partition ID */
116 u8 version;
117 u8 pad1[6]; /* align to next u64 in cacheline */
118 volatile u64 vars_pa;
119 struct timespec stamp; /* time when reserved page was setup by XPC */
120 u64 pad2[9]; /* align to last u64 in cacheline */
121 u64 nasids_size; /* SAL: size of each nasid mask in bytes */
122};
123
124#define XPC_RP_VERSION _XPC_VERSION(1,1) /* version 1.1 of the reserved page */
125
126#define XPC_SUPPORTS_RP_STAMP(_version) \
127 (_version >= _XPC_VERSION(1,1))
128
129/*
130 * compare stamps - the return value is:
131 *
132 * < 0, if stamp1 < stamp2
133 * = 0, if stamp1 == stamp2
134 * > 0, if stamp1 > stamp2
135 */
136static inline int
137xpc_compare_stamps(struct timespec *stamp1, struct timespec *stamp2)
138{
139 int ret;
140
141
142 if ((ret = stamp1->tv_sec - stamp2->tv_sec) == 0) {
143 ret = stamp1->tv_nsec - stamp2->tv_nsec;
144 }
145 return ret;
146}
147
148
149/*
150 * Define the structures by which XPC variables can be exported to other
151 * partitions. (There are two: struct xpc_vars and struct xpc_vars_part)
152 */
153
154/*
155 * The following structure describes the partition generic variables
156 * needed by other partitions in order to properly initialize.
157 *
158 * struct xpc_vars version number also applies to struct xpc_vars_part.
159 * Changes to either structure and/or related functionality should be
160 * reflected by incrementing either the major or minor version numbers
161 * of struct xpc_vars.
162 */
163struct xpc_vars {
164 u8 version;
165 u64 heartbeat;
166 u64 heartbeating_to_mask;
167 u64 heartbeat_offline; /* if 0, heartbeat should be changing */
168 int act_nasid;
169 int act_phys_cpuid;
170 u64 vars_part_pa;
171 u64 amos_page_pa; /* paddr of page of AMOs from MSPEC driver */
172 AMO_t *amos_page; /* vaddr of page of AMOs from MSPEC driver */
173};
174
175#define XPC_V_VERSION _XPC_VERSION(3,1) /* version 3.1 of the cross vars */
176
177#define XPC_SUPPORTS_DISENGAGE_REQUEST(_version) \
178 (_version >= _XPC_VERSION(3,1))
179
180
181static inline int
182xpc_hb_allowed(partid_t partid, struct xpc_vars *vars)
183{
184 return ((vars->heartbeating_to_mask & (1UL << partid)) != 0);
185}
186
187static inline void
188xpc_allow_hb(partid_t partid, struct xpc_vars *vars)
189{
190 u64 old_mask, new_mask;
191
192 do {
193 old_mask = vars->heartbeating_to_mask;
194 new_mask = (old_mask | (1UL << partid));
195 } while (cmpxchg(&vars->heartbeating_to_mask, old_mask, new_mask) !=
196 old_mask);
197}
198
199static inline void
200xpc_disallow_hb(partid_t partid, struct xpc_vars *vars)
201{
202 u64 old_mask, new_mask;
203
204 do {
205 old_mask = vars->heartbeating_to_mask;
206 new_mask = (old_mask & ~(1UL << partid));
207 } while (cmpxchg(&vars->heartbeating_to_mask, old_mask, new_mask) !=
208 old_mask);
209}
210
211
212/*
213 * The AMOs page consists of a number of AMO variables which are divided into
214 * four groups. The first two groups are used to identify an IRQ's sender.
215 * These two groups consist of 64 and 128 AMO variables respectively. The last
216 * two groups, consisting of just one AMO variable each, are used to identify
217 * the remote partitions that are currently engaged (from the viewpoint of
218 * the XPC running on the remote partition).
219 */
220#define XPC_NOTIFY_IRQ_AMOS 0
221#define XPC_ACTIVATE_IRQ_AMOS (XPC_NOTIFY_IRQ_AMOS + XP_MAX_PARTITIONS)
222#define XPC_ENGAGED_PARTITIONS_AMO (XPC_ACTIVATE_IRQ_AMOS + XP_NASID_MASK_WORDS)
223#define XPC_DISENGAGE_REQUEST_AMO (XPC_ENGAGED_PARTITIONS_AMO + 1)
224
225
226/*
227 * The following structure describes the per partition specific variables.
228 *
229 * An array of these structures, one per partition, will be defined. As a
230 * partition becomes active XPC will copy the array entry corresponding to
231 * itself from that partition. It is desirable that the size of this
232 * structure evenly divide into a cacheline, such that none of the entries
233 * in this array crosses a cacheline boundary. As it is now, each entry
234 * occupies half a cacheline.
235 */
236struct xpc_vars_part {
237 volatile u64 magic;
238
239 u64 openclose_args_pa; /* physical address of open and close args */
240 u64 GPs_pa; /* physical address of Get/Put values */
241
242 u64 IPI_amo_pa; /* physical address of IPI AMO_t structure */
243 int IPI_nasid; /* nasid of where to send IPIs */
244 int IPI_phys_cpuid; /* physical CPU ID of where to send IPIs */
245
246 u8 nchannels; /* #of defined channels supported */
247
248 u8 reserved[23]; /* pad to a full 64 bytes */
249};
250
251/*
252 * The vars_part MAGIC numbers play a part in the first contact protocol.
253 *
254 * MAGIC1 indicates that the per partition specific variables for a remote
255 * partition have been initialized by this partition.
256 *
257 * MAGIC2 indicates that this partition has pulled the remote partition's
258 * per partition variables that pertain to this partition.
259 */
260#define XPC_VP_MAGIC1 0x0053524156435058L /* 'XPCVARS\0'L (little endian) */
261#define XPC_VP_MAGIC2 0x0073726176435058L /* 'XPCvars\0'L (little endian) */
262
263
264/* the reserved page sizes and offsets */
265
266#define XPC_RP_HEADER_SIZE L1_CACHE_ALIGN(sizeof(struct xpc_rsvd_page))
267#define XPC_RP_VARS_SIZE L1_CACHE_ALIGN(sizeof(struct xpc_vars))
268
269#define XPC_RP_PART_NASIDS(_rp) (u64 *) ((u8 *) _rp + XPC_RP_HEADER_SIZE)
270#define XPC_RP_MACH_NASIDS(_rp) (XPC_RP_PART_NASIDS(_rp) + xp_nasid_mask_words)
271#define XPC_RP_VARS(_rp) ((struct xpc_vars *) XPC_RP_MACH_NASIDS(_rp) + xp_nasid_mask_words)
272#define XPC_RP_VARS_PART(_rp) (struct xpc_vars_part *) ((u8 *) XPC_RP_VARS(rp) + XPC_RP_VARS_SIZE)
273
274
275/*
276 * Functions registered by add_timer() or called by kernel_thread() only
277 * allow for a single 64-bit argument. The following macros can be used to
278 * pack and unpack two (32-bit, 16-bit or 8-bit) arguments into or out from
279 * the passed argument.
280 */
281#define XPC_PACK_ARGS(_arg1, _arg2) \
282 ((((u64) _arg1) & 0xffffffff) | \
283 ((((u64) _arg2) & 0xffffffff) << 32))
284
285#define XPC_UNPACK_ARG1(_args) (((u64) _args) & 0xffffffff)
286#define XPC_UNPACK_ARG2(_args) ((((u64) _args) >> 32) & 0xffffffff)
287
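/*
 * Illustrative round trip through the packing macros above (the helper name
 * is hypothetical): XPC itself uses this scheme to pass a partition ID and a
 * channel number to single-argument timer and kthread functions.
 */
static inline u64
example_repack(u64 data)
{
	partid_t partid = (partid_t) XPC_UNPACK_ARG1(data);
	int ch_number = (int) XPC_UNPACK_ARG2(data);

	/* a timer or kthread function would look the channel up here */
	return XPC_PACK_ARGS(partid, ch_number);
}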
288
289
290/*
291 * Define a Get/Put value pair (pointers) used with a message queue.
292 */
293struct xpc_gp {
294 volatile s64 get; /* Get value */
295 volatile s64 put; /* Put value */
296};
297
298#define XPC_GP_SIZE \
299 L1_CACHE_ALIGN(sizeof(struct xpc_gp) * XPC_NCHANNELS)
300
301
302
303/*
304 * Define a structure that contains arguments associated with opening and
305 * closing a channel.
306 */
307struct xpc_openclose_args {
308 u16 reason; /* reason why channel is closing */
309 u16 msg_size; /* sizeof each message entry */
310 u16 remote_nentries; /* #of message entries in remote msg queue */
311 u16 local_nentries; /* #of message entries in local msg queue */
312 u64 local_msgqueue_pa; /* physical address of local message queue */
313};
314
315#define XPC_OPENCLOSE_ARGS_SIZE \
316 L1_CACHE_ALIGN(sizeof(struct xpc_openclose_args) * XPC_NCHANNELS)
317
318
319
320/* struct xpc_msg flags */
321
322#define XPC_M_DONE 0x01 /* msg has been received/consumed */
323#define XPC_M_READY 0x02 /* msg is ready to be sent */
324#define XPC_M_INTERRUPT 0x04 /* send interrupt when msg consumed */
325
326
327#define XPC_MSG_ADDRESS(_payload) \
328 ((struct xpc_msg *)((u8 *)(_payload) - XPC_MSG_PAYLOAD_OFFSET))
329
330
331
332/*
333 * Defines notify entry.
334 *
335 * This is used to notify a message's sender that their message was received
336 * and consumed by the intended recipient.
337 */
338struct xpc_notify {
339 volatile u8 type; /* type of notification */
340
341 /* the following two fields are only used if type == XPC_N_CALL */
342 xpc_notify_func func; /* user's notify function */
343 void *key; /* pointer to user's key */
344};
345
346/* struct xpc_notify type of notification */
347
348#define XPC_N_CALL 0x01 /* notify function provided by user */
349
350
351
352/*
353 * Define the structure that manages all the stuff required by a channel. In
354 * particular, they are used to manage the messages sent across the channel.
355 *
356 * This structure is private to a partition, and is NOT shared across the
357 * partition boundary.
358 *
359 * There is an array of these structures for each remote partition. It is
360 * allocated at the time a partition becomes active. The array contains one
361 * of these structures for each potential channel connection to that partition.
362 *
363 * Each of these structures manages two message queues (circular buffers).
364 * They are allocated at the time a channel connection is made. One of
365 * these message queues (local_msgqueue) holds the locally created messages
366 * that are destined for the remote partition. The other of these message
367 * queues (remote_msgqueue) is a locally cached copy of the remote partition's
368 * own local_msgqueue.
369 *
370 * The following is a description of the Get/Put pointers used to manage these
371 * two message queues. Consider the local_msgqueue to be on one partition
372 * and the remote_msgqueue to be its cached copy on another partition. A
373 * description of what each of the lettered areas contains is included.
374 *
375 *
376 * local_msgqueue remote_msgqueue
377 *
378 * |/////////| |/////////|
379 * w_remote_GP.get --> +---------+ |/////////|
380 * | F | |/////////|
381 * remote_GP.get --> +---------+ +---------+ <-- local_GP->get
382 * | | | |
383 * | | | E |
384 * | | | |
385 * | | +---------+ <-- w_local_GP.get
386 * | B | |/////////|
387 * | | |////D////|
388 * | | |/////////|
389 * | | +---------+ <-- w_remote_GP.put
390 * | | |////C////|
391 * local_GP->put --> +---------+ +---------+ <-- remote_GP.put
392 * | | |/////////|
393 * | A | |/////////|
394 * | | |/////////|
395 * w_local_GP.put --> +---------+ |/////////|
396 * |/////////| |/////////|
397 *
398 *
399 * ( remote_GP.[get|put] are cached copies of the remote
400 * partition's local_GP->[get|put], and thus their values can
401 * lag behind their counterparts on the remote partition. )
402 *
403 *
404 * A - Messages that have been allocated, but have not yet been sent to the
405 * remote partition.
406 *
407 * B - Messages that have been sent, but have not yet been acknowledged by the
408 * remote partition as having been received.
409 *
410 * C - Area that needs to be prepared for the copying of sent messages, by
411 * the clearing of the message flags of any previously received messages.
412 *
413 * D - Area into which sent messages are to be copied from the remote
414 * partition's local_msgqueue and then delivered to their intended
415 * recipients. [ To allow for a multi-message copy, another pointer
416 * (next_msg_to_pull) has been added to keep track of the next message
417 * number needing to be copied (pulled). It chases after w_remote_GP.put.
418 * Any messages lying between w_local_GP.get and next_msg_to_pull have
419 * been copied and are ready to be delivered. ]
420 *
421 * E - Messages that have been copied and delivered, but have not yet been
422 * acknowledged by the recipient as having been received.
423 *
424 * F - Messages that have been acknowledged, but XPC has not yet notified the
425 * sender that the message was received by its intended recipient.
426 * This is also an area that needs to be prepared for the allocating of
427 * new messages, by the clearing of the message flags of the acknowledged
428 * messages.
429 */
430struct xpc_channel {
431 partid_t partid; /* ID of remote partition connected */
432 spinlock_t lock; /* lock for updating this structure */
433 u32 flags; /* general flags */
434
435 enum xpc_retval reason; /* reason why channel is disconnect'g */
436 int reason_line; /* line# disconnect initiated from */
437
438 u16 number; /* channel # */
439
440 u16 msg_size; /* sizeof each msg entry */
441 u16 local_nentries; /* #of msg entries in local msg queue */
442 u16 remote_nentries; /* #of msg entries in remote msg queue*/
443
444 void *local_msgqueue_base; /* base address of kmalloc'd space */
445 struct xpc_msg *local_msgqueue; /* local message queue */
446 void *remote_msgqueue_base; /* base address of kmalloc'd space */
447 struct xpc_msg *remote_msgqueue;/* cached copy of remote partition's */
448 /* local message queue */
449 u64 remote_msgqueue_pa; /* phys addr of remote partition's */
450 /* local message queue */
451
452 atomic_t references; /* #of external references to queues */
453
454 atomic_t n_on_msg_allocate_wq; /* #on msg allocation wait queue */
455 wait_queue_head_t msg_allocate_wq; /* msg allocation wait queue */
456
457 u8 delayed_IPI_flags; /* IPI flags received, but delayed */
458 /* action until channel disconnected */
459
460 /* queue of msg senders who want to be notified when msg received */
461
462 atomic_t n_to_notify; /* #of msg senders to notify */
463 struct xpc_notify *notify_queue;/* notify queue for messages sent */
464
465 xpc_channel_func func; /* user's channel function */
466 void *key; /* pointer to user's key */
467
468 struct mutex msg_to_pull_mutex; /* next msg to pull serialization */
469 struct completion wdisconnect_wait; /* wait for channel disconnect */
470
471 struct xpc_openclose_args *local_openclose_args; /* args passed on */
472 /* opening or closing of channel */
473
474 /* various flavors of local and remote Get/Put values */
475
476 struct xpc_gp *local_GP; /* local Get/Put values */
477 struct xpc_gp remote_GP; /* remote Get/Put values */
478 struct xpc_gp w_local_GP; /* working local Get/Put values */
479 struct xpc_gp w_remote_GP; /* working remote Get/Put values */
480 s64 next_msg_to_pull; /* Put value of next msg to pull */
481
482 /* kthread management related fields */
483
484// >>> rethink having kthreads_assigned_limit and kthreads_idle_limit; perhaps
485// >>> allow the assigned limit be unbounded and let the idle limit be dynamic
486// >>> dependent on activity over the last interval of time
487 atomic_t kthreads_assigned; /* #of kthreads assigned to channel */
488 u32 kthreads_assigned_limit; /* limit on #of kthreads assigned */
489 atomic_t kthreads_idle; /* #of kthreads idle waiting for work */
490 u32 kthreads_idle_limit; /* limit on #of kthreads idle */
491 atomic_t kthreads_active; /* #of kthreads actively working */
492 // >>> following field is temporary
493 u32 kthreads_created; /* total #of kthreads created */
494
495 wait_queue_head_t idle_wq; /* idle kthread wait queue */
496
497} ____cacheline_aligned;
498
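/*
 * A hedged sketch of the Get/Put arithmetic implied by the diagram above (the
 * helper name is hypothetical): the number of messages that have been sent by
 * the remote partition but not yet delivered locally is the distance between
 * the working copy of the remote Put and the working local Get.
 */
static inline s64
example_nmsgs_to_deliver(struct xpc_channel *ch)
{
	return ch->w_remote_GP.put - ch->w_local_GP.get;
}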
499
500/* struct xpc_channel flags */
501
502#define XPC_C_WASCONNECTED 0x00000001 /* channel was connected */
503
504#define XPC_C_ROPENREPLY 0x00000002 /* remote open channel reply */
505#define XPC_C_OPENREPLY 0x00000004 /* local open channel reply */
506#define XPC_C_ROPENREQUEST 0x00000008 /* remote open channel request */
507#define XPC_C_OPENREQUEST 0x00000010 /* local open channel request */
508
509#define XPC_C_SETUP 0x00000020 /* channel's msgqueues are alloc'd */
510#define XPC_C_CONNECTEDCALLOUT 0x00000040 /* connected callout initiated */
511#define XPC_C_CONNECTEDCALLOUT_MADE \
512 0x00000080 /* connected callout completed */
513#define XPC_C_CONNECTED 0x00000100 /* local channel is connected */
514#define XPC_C_CONNECTING 0x00000200 /* channel is being connected */
515
516#define XPC_C_RCLOSEREPLY 0x00000400 /* remote close channel reply */
517#define XPC_C_CLOSEREPLY 0x00000800 /* local close channel reply */
518#define XPC_C_RCLOSEREQUEST 0x00001000 /* remote close channel request */
519#define XPC_C_CLOSEREQUEST 0x00002000 /* local close channel request */
520
521#define XPC_C_DISCONNECTED 0x00004000 /* channel is disconnected */
522#define XPC_C_DISCONNECTING 0x00008000 /* channel is being disconnected */
523#define XPC_C_DISCONNECTINGCALLOUT \
524 0x00010000 /* disconnecting callout initiated */
525#define XPC_C_DISCONNECTINGCALLOUT_MADE \
526 0x00020000 /* disconnecting callout completed */
527#define XPC_C_WDISCONNECT 0x00040000 /* waiting for channel disconnect */
528
529
530
531/*
532 * Manages channels on a partition basis. There is one of these structures
533 * for each partition (a partition will never utilize the structure that
534 * represents itself).
535 */
536struct xpc_partition {
537
538 /* XPC HB infrastructure */
539
540 u8 remote_rp_version; /* version# of partition's rsvd pg */
541 struct timespec remote_rp_stamp;/* time when rsvd pg was initialized */
542 u64 remote_rp_pa; /* phys addr of partition's rsvd pg */
543 u64 remote_vars_pa; /* phys addr of partition's vars */
544 u64 remote_vars_part_pa; /* phys addr of partition's vars part */
545 u64 last_heartbeat; /* HB at last read */
546 u64 remote_amos_page_pa; /* phys addr of partition's amos page */
547 int remote_act_nasid; /* active part's act/deact nasid */
548 int remote_act_phys_cpuid; /* active part's act/deact phys cpuid */
549 u32 act_IRQ_rcvd; /* IRQs since activation */
550 spinlock_t act_lock; /* protect updating of act_state */
551 u8 act_state; /* from XPC HB viewpoint */
552 u8 remote_vars_version; /* version# of partition's vars */
553 enum xpc_retval reason; /* reason partition is deactivating */
554 int reason_line; /* line# deactivation initiated from */
555 int reactivate_nasid; /* nasid in partition to reactivate */
556
557 unsigned long disengage_request_timeout; /* timeout in jiffies */
558 struct timer_list disengage_request_timer;
559
560
561 /* XPC infrastructure referencing and teardown control */
562
563 volatile u8 setup_state; /* infrastructure setup state */
564 wait_queue_head_t teardown_wq; /* kthread waiting to teardown infra */
565 atomic_t references; /* #of references to infrastructure */
566
567
568 /*
569 * NONE OF THE PRECEDING FIELDS OF THIS STRUCTURE WILL BE CLEARED WHEN
570 * XPC SETS UP THE NECESSARY INFRASTRUCTURE TO SUPPORT CROSS PARTITION
571 * COMMUNICATION. ALL OF THE FOLLOWING FIELDS WILL BE CLEARED. (THE
572 * 'nchannels' FIELD MUST BE THE FIRST OF THE FIELDS TO BE CLEARED.)
573 */
574
575
576 u8 nchannels; /* #of defined channels supported */
577 atomic_t nchannels_active; /* #of channels that are not DISCONNECTED */
578 atomic_t nchannels_engaged;/* #of channels engaged with remote part */
579 struct xpc_channel *channels;/* array of channel structures */
580
581 void *local_GPs_base; /* base address of kmalloc'd space */
582 struct xpc_gp *local_GPs; /* local Get/Put values */
583 void *remote_GPs_base; /* base address of kmalloc'd space */
584 struct xpc_gp *remote_GPs;/* copy of remote partition's local Get/Put */
585 /* values */
586 u64 remote_GPs_pa; /* phys address of remote partition's local */
587 /* Get/Put values */
588
589
590 /* fields used to pass args when opening or closing a channel */
591
592 void *local_openclose_args_base; /* base address of kmalloc'd space */
593 struct xpc_openclose_args *local_openclose_args; /* local's args */
594 void *remote_openclose_args_base; /* base address of kmalloc'd space */
595 struct xpc_openclose_args *remote_openclose_args; /* copy of remote's */
596 /* args */
597 u64 remote_openclose_args_pa; /* phys addr of remote's args */
598
599
600 /* IPI sending, receiving and handling related fields */
601
602 int remote_IPI_nasid; /* nasid of where to send IPIs */
603 int remote_IPI_phys_cpuid; /* phys CPU ID of where to send IPIs */
604 AMO_t *remote_IPI_amo_va; /* address of remote IPI AMO_t structure */
605
606 AMO_t *local_IPI_amo_va; /* address of IPI AMO_t structure */
607 u64 local_IPI_amo; /* IPI amo flags yet to be handled */
608 char IPI_owner[8]; /* IPI owner's name */
609 struct timer_list dropped_IPI_timer; /* dropped IPI timer */
610
611 spinlock_t IPI_lock; /* IPI handler lock */
612
613
614 /* channel manager related fields */
615
616 atomic_t channel_mgr_requests; /* #of requests to activate chan mgr */
617 wait_queue_head_t channel_mgr_wq; /* channel mgr's wait queue */
618
619} ____cacheline_aligned;
620
621
622/* struct xpc_partition act_state values (for XPC HB) */
623
624#define XPC_P_INACTIVE 0x00 /* partition is not active */
625#define XPC_P_ACTIVATION_REQ 0x01 /* created thread to activate */
626#define XPC_P_ACTIVATING 0x02 /* activation thread started */
627#define XPC_P_ACTIVE 0x03 /* xpc_partition_up() was called */
628#define XPC_P_DEACTIVATING 0x04 /* partition deactivation initiated */
629
630
631#define XPC_DEACTIVATE_PARTITION(_p, _reason) \
632 xpc_deactivate_partition(__LINE__, (_p), (_reason))
633
634
635/* struct xpc_partition setup_state values */
636
637#define XPC_P_UNSET 0x00 /* infrastructure was never setup */
638#define XPC_P_SETUP 0x01 /* infrastructure is setup */
639#define XPC_P_WTEARDOWN 0x02 /* waiting to teardown infrastructure */
640#define XPC_P_TORNDOWN 0x03 /* infrastructure is torn down */
641
642
643
644/*
645 * Delay (in jiffies) that struct xpc_partition's dropped_IPI_timer waits
646 * before checking for dropped IPIs. These occur whenever an IPI AMO write
647 * doesn't complete until after the IPI was received.
648 */
649#define XPC_P_DROPPED_IPI_WAIT (0.25 * HZ)
650
651
652/* number of seconds to wait for other partitions to disengage */
653#define XPC_DISENGAGE_REQUEST_DEFAULT_TIMELIMIT 90
654
655/* interval in seconds to print 'waiting disengagement' messages */
656#define XPC_DISENGAGE_PRINTMSG_INTERVAL 10
657
658
659#define XPC_PARTID(_p) ((partid_t) ((_p) - &xpc_partitions[0]))
660
661
662
663/* found in xp_main.c */
664extern struct xpc_registration xpc_registrations[];
665
666
667/* found in xpc_main.c */
668extern struct device *xpc_part;
669extern struct device *xpc_chan;
670extern int xpc_disengage_request_timelimit;
671extern int xpc_disengage_request_timedout;
672extern irqreturn_t xpc_notify_IRQ_handler(int, void *);
673extern void xpc_dropped_IPI_check(struct xpc_partition *);
674extern void xpc_activate_partition(struct xpc_partition *);
675extern void xpc_activate_kthreads(struct xpc_channel *, int);
676extern void xpc_create_kthreads(struct xpc_channel *, int, int);
677extern void xpc_disconnect_wait(int);
678
679
680/* found in xpc_partition.c */
681extern int xpc_exiting;
682extern struct xpc_vars *xpc_vars;
683extern struct xpc_rsvd_page *xpc_rsvd_page;
684extern struct xpc_vars_part *xpc_vars_part;
685extern struct xpc_partition xpc_partitions[XP_MAX_PARTITIONS + 1];
686extern char *xpc_remote_copy_buffer;
687extern void *xpc_remote_copy_buffer_base;
688extern void *xpc_kmalloc_cacheline_aligned(size_t, gfp_t, void **);
689extern struct xpc_rsvd_page *xpc_rsvd_page_init(void);
690extern void xpc_allow_IPI_ops(void);
691extern void xpc_restrict_IPI_ops(void);
692extern int xpc_identify_act_IRQ_sender(void);
693extern int xpc_partition_disengaged(struct xpc_partition *);
694extern enum xpc_retval xpc_mark_partition_active(struct xpc_partition *);
695extern void xpc_mark_partition_inactive(struct xpc_partition *);
696extern void xpc_discovery(void);
697extern void xpc_check_remote_hb(void);
698extern void xpc_deactivate_partition(const int, struct xpc_partition *,
699 enum xpc_retval);
700extern enum xpc_retval xpc_initiate_partid_to_nasids(partid_t, void *);
701
702
703/* found in xpc_channel.c */
704extern void xpc_initiate_connect(int);
705extern void xpc_initiate_disconnect(int);
706extern enum xpc_retval xpc_initiate_allocate(partid_t, int, u32, void **);
707extern enum xpc_retval xpc_initiate_send(partid_t, int, void *);
708extern enum xpc_retval xpc_initiate_send_notify(partid_t, int, void *,
709 xpc_notify_func, void *);
710extern void xpc_initiate_received(partid_t, int, void *);
711extern enum xpc_retval xpc_setup_infrastructure(struct xpc_partition *);
712extern enum xpc_retval xpc_pull_remote_vars_part(struct xpc_partition *);
713extern void xpc_process_channel_activity(struct xpc_partition *);
714extern void xpc_connected_callout(struct xpc_channel *);
715extern void xpc_deliver_msg(struct xpc_channel *);
716extern void xpc_disconnect_channel(const int, struct xpc_channel *,
717 enum xpc_retval, unsigned long *);
718extern void xpc_disconnect_callout(struct xpc_channel *, enum xpc_retval);
719extern void xpc_partition_going_down(struct xpc_partition *, enum xpc_retval);
720extern void xpc_teardown_infrastructure(struct xpc_partition *);
721
722
723
724static inline void
725xpc_wakeup_channel_mgr(struct xpc_partition *part)
726{
727 if (atomic_inc_return(&part->channel_mgr_requests) == 1) {
728 wake_up(&part->channel_mgr_wq);
729 }
730}
731
732
733
734/*
735 * These next two inlines are used to keep us from tearing down a channel's
736 * msg queues while a thread may be referencing them.
737 */
738static inline void
739xpc_msgqueue_ref(struct xpc_channel *ch)
740{
741 atomic_inc(&ch->references);
742}
743
744static inline void
745xpc_msgqueue_deref(struct xpc_channel *ch)
746{
747 s32 refs = atomic_dec_return(&ch->references);
748
749 DBUG_ON(refs < 0);
750 if (refs == 0) {
751 xpc_wakeup_channel_mgr(&xpc_partitions[ch->partid]);
752 }
753}
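
/*
 * Typical usage (illustrative sketch): bracket any access to a channel's
 * message queues with a reference so the channel manager won't tear the
 * queues down underneath the caller:
 *
 *	xpc_msgqueue_ref(ch);
 *	...access ch->local_msgqueue / ch->remote_msgqueue...
 *	xpc_msgqueue_deref(ch);
 */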
754
755
756
757#define XPC_DISCONNECT_CHANNEL(_ch, _reason, _irqflgs) \
758 xpc_disconnect_channel(__LINE__, _ch, _reason, _irqflgs)
759
760
761/*
762 * These two inlines are used to keep us from tearing down a partition's
763 * setup infrastructure while a thread may be referencing it.
764 */
765static inline void
766xpc_part_deref(struct xpc_partition *part)
767{
768 s32 refs = atomic_dec_return(&part->references);
769
770
771 DBUG_ON(refs < 0);
772 if (refs == 0 && part->setup_state == XPC_P_WTEARDOWN) {
773 wake_up(&part->teardown_wq);
774 }
775}
776
777static inline int
778xpc_part_ref(struct xpc_partition *part)
779{
780 int setup;
781
782
783 atomic_inc(&part->references);
784 setup = (part->setup_state == XPC_P_SETUP);
785 if (!setup) {
786 xpc_part_deref(part);
787 }
788 return setup;
789}
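
/*
 * Typical usage (illustrative sketch): xpc_part_ref() both takes a reference
 * and reports whether the infrastructure is usable; it drops the reference
 * itself on failure, so only the success path needs a matching deref:
 *
 *	if (xpc_part_ref(part)) {
 *		...use the partition's infrastructure...
 *		xpc_part_deref(part);
 *	}
 */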
790
791
792
793/*
794 * The following macro is to be used for the setting of the reason and
795 * reason_line fields in both the struct xpc_channel and struct xpc_partition
796 * structures.
797 */
798#define XPC_SET_REASON(_p, _reason, _line) \
799 { \
800 (_p)->reason = _reason; \
801 (_p)->reason_line = _line; \
802 }
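
/*
 * For example (illustrative), a caller recording why a channel is being
 * disconnected might do:
 *
 *	XPC_SET_REASON(ch, xpcNoMemory, __LINE__);
 */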
803
804
805
806/*
807 * This next set of inlines are used to keep track of when a partition is
808 * potentially engaged in accessing memory belonging to another partition.
809 */
810
811static inline void
812xpc_mark_partition_engaged(struct xpc_partition *part)
813{
814 unsigned long irq_flags;
815 AMO_t *amo = (AMO_t *) __va(part->remote_amos_page_pa +
816 (XPC_ENGAGED_PARTITIONS_AMO * sizeof(AMO_t)));
817
818
819 local_irq_save(irq_flags);
820
821 /* set bit corresponding to our partid in remote partition's AMO */
822 FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_OR,
823 (1UL << sn_partition_id));
824 /*
825 * We must always use the nofault function regardless of whether we
826 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
827 * didn't, we'd never know that the other partition is down and would
828 * keep sending IPIs and AMOs to it until the heartbeat times out.
829 */
830 (void) xp_nofault_PIOR((u64 *) GLOBAL_MMR_ADDR(NASID_GET(&amo->
831 variable), xp_nofault_PIOR_target));
832
833 local_irq_restore(irq_flags);
834}
835
836static inline void
837xpc_mark_partition_disengaged(struct xpc_partition *part)
838{
839 unsigned long irq_flags;
840 AMO_t *amo = (AMO_t *) __va(part->remote_amos_page_pa +
841 (XPC_ENGAGED_PARTITIONS_AMO * sizeof(AMO_t)));
842
843
844 local_irq_save(irq_flags);
845
846 /* clear bit corresponding to our partid in remote partition's AMO */
847 FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_AND,
848 ~(1UL << sn_partition_id));
849 /*
850 * We must always use the nofault function regardless of whether we
851 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
852 * didn't, we'd never know that the other partition is down and would
853 * keep sending IPIs and AMOs to it until the heartbeat times out.
854 */
855 (void) xp_nofault_PIOR((u64 *) GLOBAL_MMR_ADDR(NASID_GET(&amo->
856 variable), xp_nofault_PIOR_target));
857
858 local_irq_restore(irq_flags);
859}
860
861static inline void
862xpc_request_partition_disengage(struct xpc_partition *part)
863{
864 unsigned long irq_flags;
865 AMO_t *amo = (AMO_t *) __va(part->remote_amos_page_pa +
866 (XPC_DISENGAGE_REQUEST_AMO * sizeof(AMO_t)));
867
868
869 local_irq_save(irq_flags);
870
871 /* set bit corresponding to our partid in remote partition's AMO */
872 FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_OR,
873 (1UL << sn_partition_id));
874 /*
875 * We must always use the nofault function regardless of whether we
876 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
877 * didn't, we'd never know that the other partition is down and would
878 * keep sending IPIs and AMOs to it until the heartbeat times out.
879 */
880 (void) xp_nofault_PIOR((u64 *) GLOBAL_MMR_ADDR(NASID_GET(&amo->
881 variable), xp_nofault_PIOR_target));
882
883 local_irq_restore(irq_flags);
884}
885
886static inline void
887xpc_cancel_partition_disengage_request(struct xpc_partition *part)
888{
889 unsigned long irq_flags;
890 AMO_t *amo = (AMO_t *) __va(part->remote_amos_page_pa +
891 (XPC_DISENGAGE_REQUEST_AMO * sizeof(AMO_t)));
892
893
894 local_irq_save(irq_flags);
895
896 /* clear bit corresponding to our partid in remote partition's AMO */
897 FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_AND,
898 ~(1UL << sn_partition_id));
899 /*
900 * We must always use the nofault function regardless of whether we
901 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
902 * didn't, we'd never know that the other partition is down and would
903 * keep sending IPIs and AMOs to it until the heartbeat times out.
904 */
905 (void) xp_nofault_PIOR((u64 *) GLOBAL_MMR_ADDR(NASID_GET(&amo->
906 variable), xp_nofault_PIOR_target));
907
908 local_irq_restore(irq_flags);
909}
910
911static inline u64
912xpc_partition_engaged(u64 partid_mask)
913{
914 AMO_t *amo = xpc_vars->amos_page + XPC_ENGAGED_PARTITIONS_AMO;
915
916
917 /* return our partition's AMO variable ANDed with partid_mask */
918 return (FETCHOP_LOAD_OP(TO_AMO((u64) &amo->variable), FETCHOP_LOAD) &
919 partid_mask);
920}
921
922static inline u64
923xpc_partition_disengage_requested(u64 partid_mask)
924{
925 AMO_t *amo = xpc_vars->amos_page + XPC_DISENGAGE_REQUEST_AMO;
926
927
928 /* return our partition's AMO variable ANDed with partid_mask */
929 return (FETCHOP_LOAD_OP(TO_AMO((u64) &amo->variable), FETCHOP_LOAD) &
930 partid_mask);
931}
932
933static inline void
934xpc_clear_partition_engaged(u64 partid_mask)
935{
936 AMO_t *amo = xpc_vars->amos_page + XPC_ENGAGED_PARTITIONS_AMO;
937
938
939 /* clear bit(s) based on partid_mask in our partition's AMO */
940 FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_AND,
941 ~partid_mask);
942}
943
944static inline void
945xpc_clear_partition_disengage_request(u64 partid_mask)
946{
947 AMO_t *amo = xpc_vars->amos_page + XPC_DISENGAGE_REQUEST_AMO;
948
949
950 /* clear bit(s) based on partid_mask in our partition's AMO */
951 FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_AND,
952 ~partid_mask);
953}
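
/*
 * Note that partid_mask in the four inlines above is a bitmask of partition
 * IDs. To test or clear a single partition, pass (1UL << partid); see, for
 * example, the xpc_partition_engaged(1UL << ch->partid) check made in
 * xpc_process_disconnect() (xpc_channel.c).
 */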
954
955
956
957/*
958 * The following set of macros and inlines are used for the sending and
959 * receiving of IPIs (also known as IRQs). There are two flavors of IPIs,
960 * one that is associated with partition activity (SGI_XPC_ACTIVATE) and
961 * the other that is associated with channel activity (SGI_XPC_NOTIFY).
962 */
963
964static inline u64
965xpc_IPI_receive(AMO_t *amo)
966{
967 return FETCHOP_LOAD_OP(TO_AMO((u64) &amo->variable), FETCHOP_CLEAR);
968}
969
970
971static inline enum xpc_retval
972xpc_IPI_send(AMO_t *amo, u64 flag, int nasid, int phys_cpuid, int vector)
973{
974 int ret = 0;
975 unsigned long irq_flags;
976
977
978 local_irq_save(irq_flags);
979
980 FETCHOP_STORE_OP(TO_AMO((u64) &amo->variable), FETCHOP_OR, flag);
981 sn_send_IPI_phys(nasid, phys_cpuid, vector, 0);
982
983 /*
984 * We must always use the nofault function regardless of whether we
985 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
986 * didn't, we'd never know that the other partition is down and would
987 * keep sending IPIs and AMOs to it until the heartbeat times out.
988 */
989 ret = xp_nofault_PIOR((u64 *) GLOBAL_MMR_ADDR(NASID_GET(&amo->variable),
990 xp_nofault_PIOR_target));
991
992 local_irq_restore(irq_flags);
993
994 return ((ret == 0) ? xpcSuccess : xpcPioReadError);
995}
996
997
998/*
999 * IPIs associated with SGI_XPC_ACTIVATE IRQ.
1000 */
1001
1002/*
1003 * Flag the appropriate AMO variable and send an IPI to the specified node.
1004 */
1005static inline void
1006xpc_activate_IRQ_send(u64 amos_page_pa, int from_nasid, int to_nasid,
1007 int to_phys_cpuid)
1008{
1009 int w_index = XPC_NASID_W_INDEX(from_nasid);
1010 int b_index = XPC_NASID_B_INDEX(from_nasid);
1011 AMO_t *amos = (AMO_t *) __va(amos_page_pa +
1012 (XPC_ACTIVATE_IRQ_AMOS * sizeof(AMO_t)));
1013
1014
1015 (void) xpc_IPI_send(&amos[w_index], (1UL << b_index), to_nasid,
1016 to_phys_cpuid, SGI_XPC_ACTIVATE);
1017}
1018
1019static inline void
1020xpc_IPI_send_activate(struct xpc_vars *vars)
1021{
1022 xpc_activate_IRQ_send(vars->amos_page_pa, cnodeid_to_nasid(0),
1023 vars->act_nasid, vars->act_phys_cpuid);
1024}
1025
1026static inline void
1027xpc_IPI_send_activated(struct xpc_partition *part)
1028{
1029 xpc_activate_IRQ_send(part->remote_amos_page_pa, cnodeid_to_nasid(0),
1030 part->remote_act_nasid, part->remote_act_phys_cpuid);
1031}
1032
1033static inline void
1034xpc_IPI_send_reactivate(struct xpc_partition *part)
1035{
1036 xpc_activate_IRQ_send(xpc_vars->amos_page_pa, part->reactivate_nasid,
1037 xpc_vars->act_nasid, xpc_vars->act_phys_cpuid);
1038}
1039
1040static inline void
1041xpc_IPI_send_disengage(struct xpc_partition *part)
1042{
1043 xpc_activate_IRQ_send(part->remote_amos_page_pa, cnodeid_to_nasid(0),
1044 part->remote_act_nasid, part->remote_act_phys_cpuid);
1045}
1046
1047
1048/*
1049 * IPIs associated with SGI_XPC_NOTIFY IRQ.
1050 */
1051
1052/*
1053 * Send an IPI to the remote partition that is associated with the
1054 * specified channel.
1055 */
1056#define XPC_NOTIFY_IRQ_SEND(_ch, _ipi_f, _irq_f) \
1057 xpc_notify_IRQ_send(_ch, _ipi_f, #_ipi_f, _irq_f)
1058
1059static inline void
1060xpc_notify_IRQ_send(struct xpc_channel *ch, u8 ipi_flag, char *ipi_flag_string,
1061 unsigned long *irq_flags)
1062{
1063 struct xpc_partition *part = &xpc_partitions[ch->partid];
1064 enum xpc_retval ret;
1065
1066
1067 if (likely(part->act_state != XPC_P_DEACTIVATING)) {
1068 ret = xpc_IPI_send(part->remote_IPI_amo_va,
1069 (u64) ipi_flag << (ch->number * 8),
1070 part->remote_IPI_nasid,
1071 part->remote_IPI_phys_cpuid,
1072 SGI_XPC_NOTIFY);
1073 dev_dbg(xpc_chan, "%s sent to partid=%d, channel=%d, ret=%d\n",
1074 ipi_flag_string, ch->partid, ch->number, ret);
1075 if (unlikely(ret != xpcSuccess)) {
1076 if (irq_flags != NULL) {
1077 spin_unlock_irqrestore(&ch->lock, *irq_flags);
1078 }
1079 XPC_DEACTIVATE_PARTITION(part, ret);
1080 if (irq_flags != NULL) {
1081 spin_lock_irqsave(&ch->lock, *irq_flags);
1082 }
1083 }
1084 }
1085}
1086
1087
1088/*
1089 * Make it look like the remote partition, which is associated with the
1090 * specified channel, sent us an IPI. This faked IPI will be handled
1091 * by xpc_dropped_IPI_check().
1092 */
1093#define XPC_NOTIFY_IRQ_SEND_LOCAL(_ch, _ipi_f) \
1094 xpc_notify_IRQ_send_local(_ch, _ipi_f, #_ipi_f)
1095
1096static inline void
1097xpc_notify_IRQ_send_local(struct xpc_channel *ch, u8 ipi_flag,
1098 char *ipi_flag_string)
1099{
1100 struct xpc_partition *part = &xpc_partitions[ch->partid];
1101
1102
1103 FETCHOP_STORE_OP(TO_AMO((u64) &part->local_IPI_amo_va->variable),
1104 FETCHOP_OR, ((u64) ipi_flag << (ch->number * 8)));
1105 dev_dbg(xpc_chan, "%s sent local from partid=%d, channel=%d\n",
1106 ipi_flag_string, ch->partid, ch->number);
1107}
1108
1109
1110/*
1111 * The sending and receiving of IPIs includes the setting of an AMO variable
1112 * to indicate the reason the IPI was sent. The 64-bit variable is divided
1113 * up into eight bytes, ordered from right to left. Byte zero pertains to
1114 * channel 0, byte one to channel 1, and so on. Each byte is described by
1115 * the following IPI flags.
1116 */
1117
1118#define XPC_IPI_CLOSEREQUEST 0x01
1119#define XPC_IPI_CLOSEREPLY 0x02
1120#define XPC_IPI_OPENREQUEST 0x04
1121#define XPC_IPI_OPENREPLY 0x08
1122#define XPC_IPI_MSGREQUEST 0x10
1123
1124
1125/* given an AMO variable and a channel#, get its associated IPI flags */
1126#define XPC_GET_IPI_FLAGS(_amo, _c) ((u8) (((_amo) >> ((_c) * 8)) & 0xff))
1127#define XPC_SET_IPI_FLAGS(_amo, _c, _f) (_amo) |= ((u64) (_f) << ((_c) * 8))
1128
1129#define XPC_ANY_OPENCLOSE_IPI_FLAGS_SET(_amo) ((_amo) & __IA64_UL_CONST(0x0f0f0f0f0f0f0f0f))
1130#define XPC_ANY_MSG_IPI_FLAGS_SET(_amo) ((_amo) & __IA64_UL_CONST(0x1010101010101010))
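
/*
 * Decoding example (hypothetical AMO value): 0x0410 has byte 0 equal to 0x10
 * and byte 1 equal to 0x04, so
 *
 *	XPC_GET_IPI_FLAGS(0x0410UL, 0) == XPC_IPI_MSGREQUEST
 *	XPC_GET_IPI_FLAGS(0x0410UL, 1) == XPC_IPI_OPENREQUEST
 *
 * i.e. channel 0 has a message waiting and channel 1 has an open request.
 */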
1131
1132
1133static inline void
1134xpc_IPI_send_closerequest(struct xpc_channel *ch, unsigned long *irq_flags)
1135{
1136 struct xpc_openclose_args *args = ch->local_openclose_args;
1137
1138
1139 args->reason = ch->reason;
1140
1141 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_CLOSEREQUEST, irq_flags);
1142}
1143
1144static inline void
1145xpc_IPI_send_closereply(struct xpc_channel *ch, unsigned long *irq_flags)
1146{
1147 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_CLOSEREPLY, irq_flags);
1148}
1149
1150static inline void
1151xpc_IPI_send_openrequest(struct xpc_channel *ch, unsigned long *irq_flags)
1152{
1153 struct xpc_openclose_args *args = ch->local_openclose_args;
1154
1155
1156 args->msg_size = ch->msg_size;
1157 args->local_nentries = ch->local_nentries;
1158
1159 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_OPENREQUEST, irq_flags);
1160}
1161
1162static inline void
1163xpc_IPI_send_openreply(struct xpc_channel *ch, unsigned long *irq_flags)
1164{
1165 struct xpc_openclose_args *args = ch->local_openclose_args;
1166
1167
1168 args->remote_nentries = ch->remote_nentries;
1169 args->local_nentries = ch->local_nentries;
1170 args->local_msgqueue_pa = __pa(ch->local_msgqueue);
1171
1172 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_OPENREPLY, irq_flags);
1173}
1174
1175static inline void
1176xpc_IPI_send_msgrequest(struct xpc_channel *ch)
1177{
1178 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_MSGREQUEST, NULL);
1179}
1180
1181static inline void
1182xpc_IPI_send_local_msgrequest(struct xpc_channel *ch)
1183{
1184 XPC_NOTIFY_IRQ_SEND_LOCAL(ch, XPC_IPI_MSGREQUEST);
1185}
1186
1187
1188/*
1189 * Memory for XPC's AMO variables is allocated by the MSPEC driver. These
1190 * pages are located in the lowest granule. The lowest granule uses 4k pages
1191 * for cached references and an alternate TLB handler so that a cacheable
1192 * mapping is never provided for the entire region. This prevents speculative
1193 * reading of cached copies of our lines from being issued, which would cause
1194 * a PI FSB Protocol error to be generated by the SHUB. For XPC, we need 64
1195 * AMO variables (based on XP_MAX_PARTITIONS) for message notification, an
1196 * additional 128 AMO variables (based on XP_NASID_MASK_WORDS) for partition
1197 * activation, and 2 AMO variables for partition deactivation.
1198 */
1199static inline AMO_t *
1200xpc_IPI_init(int index)
1201{
1202 AMO_t *amo = xpc_vars->amos_page + index;
1203
1204
1205 (void) xpc_IPI_receive(amo); /* clear AMO variable */
1206 return amo;
1207}
1208
1209
1210
1211static inline enum xpc_retval
1212xpc_map_bte_errors(bte_result_t error)
1213{
1214 if (error == BTE_SUCCESS)
1215 return xpcSuccess;
1216
1217 if (is_shub2()) {
1218 if (BTE_VALID_SH2_ERROR(error))
1219 return xpcBteSh2Start + error;
1220 return xpcBteUnmappedError;
1221 }
1222 switch (error) {
1223 case BTE_SUCCESS: return xpcSuccess;
1224 case BTEFAIL_DIR: return xpcBteDirectoryError;
1225 case BTEFAIL_POISON: return xpcBtePoisonError;
1226 case BTEFAIL_WERR: return xpcBteWriteError;
1227 case BTEFAIL_ACCESS: return xpcBteAccessError;
1228 case BTEFAIL_PWERR: return xpcBtePWriteError;
1229 case BTEFAIL_PRERR: return xpcBtePReadError;
1230 case BTEFAIL_TOUT: return xpcBteTimeOutError;
1231 case BTEFAIL_XTERR: return xpcBteXtalkError;
1232 case BTEFAIL_NOTAVAIL: return xpcBteNotAvailable;
1233 default: return xpcBteUnmappedError;
1234 }
1235}
1236
1237
1238
1239/*
1240 * Check to see if there is any channel activity to/from the specified
1241 * partition.
1242 */
1243static inline void
1244xpc_check_for_channel_activity(struct xpc_partition *part)
1245{
1246 u64 IPI_amo;
1247 unsigned long irq_flags;
1248
1249
1250 IPI_amo = xpc_IPI_receive(part->local_IPI_amo_va);
1251 if (IPI_amo == 0) {
1252 return;
1253 }
1254
1255 spin_lock_irqsave(&part->IPI_lock, irq_flags);
1256 part->local_IPI_amo |= IPI_amo;
1257 spin_unlock_irqrestore(&part->IPI_lock, irq_flags);
1258
1259 dev_dbg(xpc_chan, "received IPI from partid=%d, IPI_amo=0x%lx\n",
1260 XPC_PARTID(part), IPI_amo);
1261
1262 xpc_wakeup_channel_mgr(part);
1263}
1264
1265
1266#endif /* _DRIVERS_MISC_SGIXP_XPC_H */
1267
diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c
new file mode 100644
index 000000000000..d7a215eeaaf6
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_channel.c
@@ -0,0 +1,2379 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9
10/*
11 * Cross Partition Communication (XPC) channel support.
12 *
13 * This is the part of XPC that manages the channels and
14 * sends/receives messages across them to/from other partitions.
15 *
16 */
17
18
19#include <linux/kernel.h>
20#include <linux/init.h>
21#include <linux/sched.h>
22#include <linux/cache.h>
23#include <linux/interrupt.h>
24#include <linux/mutex.h>
25#include <linux/completion.h>
26#include <asm/sn/bte.h>
27#include <asm/sn/sn_sal.h>
28#include "xpc.h"
29
30
31/*
32 * Guarantee that the kzalloc'd memory is cacheline aligned.
33 */
34static void *
35xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
36{
37 /* see if kzalloc will give us cacheline aligned memory by default */
38 *base = kzalloc(size, flags);
39 if (*base == NULL) {
40 return NULL;
41 }
42 if ((u64) *base == L1_CACHE_ALIGN((u64) *base)) {
43 return *base;
44 }
45 kfree(*base);
46
47 /* nope, we'll have to do it ourselves */
48 *base = kzalloc(size + L1_CACHE_BYTES, flags);
49 if (*base == NULL) {
50 return NULL;
51 }
52 return (void *) L1_CACHE_ALIGN((u64) *base);
53}
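
/*
 * Note: allocating size + L1_CACHE_BYTES in the fallback path guarantees that
 * rounding *base up to the next cacheline boundary still leaves at least
 * 'size' usable bytes, at the cost of up to one extra cacheline per
 * allocation.
 */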
54
55
56/*
57 * Set up the initial values for the XPartition Communication channels.
58 */
59static void
60xpc_initialize_channels(struct xpc_partition *part, partid_t partid)
61{
62 int ch_number;
63 struct xpc_channel *ch;
64
65
66 for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
67 ch = &part->channels[ch_number];
68
69 ch->partid = partid;
70 ch->number = ch_number;
71 ch->flags = XPC_C_DISCONNECTED;
72
73 ch->local_GP = &part->local_GPs[ch_number];
74 ch->local_openclose_args =
75 &part->local_openclose_args[ch_number];
76
77 atomic_set(&ch->kthreads_assigned, 0);
78 atomic_set(&ch->kthreads_idle, 0);
79 atomic_set(&ch->kthreads_active, 0);
80
81 atomic_set(&ch->references, 0);
82 atomic_set(&ch->n_to_notify, 0);
83
84 spin_lock_init(&ch->lock);
85 mutex_init(&ch->msg_to_pull_mutex);
86 init_completion(&ch->wdisconnect_wait);
87
88 atomic_set(&ch->n_on_msg_allocate_wq, 0);
89 init_waitqueue_head(&ch->msg_allocate_wq);
90 init_waitqueue_head(&ch->idle_wq);
91 }
92}
93
94
95/*
96 * Set up the infrastructure necessary to support XPartition Communication
97 * between the specified remote partition and the local one.
98 */
99enum xpc_retval
100xpc_setup_infrastructure(struct xpc_partition *part)
101{
102 int ret, cpuid;
103 struct timer_list *timer;
104 partid_t partid = XPC_PARTID(part);
105
106
107 /*
108 * Zero out MOST of the entry for this partition. Only the fields
109 * starting with `nchannels' will be zeroed. The preceding fields must
110 * remain `viable' across partition ups and downs, since they may be
111 * referenced during this memset() operation.
112 */
113 memset(&part->nchannels, 0, sizeof(struct xpc_partition) -
114 offsetof(struct xpc_partition, nchannels));
115
116 /*
117 * Allocate all of the channel structures as a contiguous chunk of
118 * memory.
119 */
120 part->channels = kzalloc(sizeof(struct xpc_channel) * XPC_NCHANNELS,
121 GFP_KERNEL);
122 if (part->channels == NULL) {
123 dev_err(xpc_chan, "can't get memory for channels\n");
124 return xpcNoMemory;
125 }
126
127 part->nchannels = XPC_NCHANNELS;
128
129
130 /* allocate all the required GET/PUT values */
131
132 part->local_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE,
133 GFP_KERNEL, &part->local_GPs_base);
134 if (part->local_GPs == NULL) {
135 kfree(part->channels);
136 part->channels = NULL;
137 dev_err(xpc_chan, "can't get memory for local get/put "
138 "values\n");
139 return xpcNoMemory;
140 }
141
142 part->remote_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE,
143 GFP_KERNEL, &part->remote_GPs_base);
144 if (part->remote_GPs == NULL) {
145 dev_err(xpc_chan, "can't get memory for remote get/put "
146 "values\n");
147 kfree(part->local_GPs_base);
148 part->local_GPs = NULL;
149 kfree(part->channels);
150 part->channels = NULL;
151 return xpcNoMemory;
152 }
153
154
155 /* allocate all the required open and close args */
156
157 part->local_openclose_args = xpc_kzalloc_cacheline_aligned(
158 XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL,
159 &part->local_openclose_args_base);
160 if (part->local_openclose_args == NULL) {
161 dev_err(xpc_chan, "can't get memory for local connect args\n");
162 kfree(part->remote_GPs_base);
163 part->remote_GPs = NULL;
164 kfree(part->local_GPs_base);
165 part->local_GPs = NULL;
166 kfree(part->channels);
167 part->channels = NULL;
168 return xpcNoMemory;
169 }
170
171 part->remote_openclose_args = xpc_kzalloc_cacheline_aligned(
172 XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL,
173 &part->remote_openclose_args_base);
174 if (part->remote_openclose_args == NULL) {
175 dev_err(xpc_chan, "can't get memory for remote connect args\n");
176 kfree(part->local_openclose_args_base);
177 part->local_openclose_args = NULL;
178 kfree(part->remote_GPs_base);
179 part->remote_GPs = NULL;
180 kfree(part->local_GPs_base);
181 part->local_GPs = NULL;
182 kfree(part->channels);
183 part->channels = NULL;
184 return xpcNoMemory;
185 }
186
187
188 xpc_initialize_channels(part, partid);
189
190 atomic_set(&part->nchannels_active, 0);
191 atomic_set(&part->nchannels_engaged, 0);
192
193
194 /* local_IPI_amo were set to 0 by an earlier memset() */
195
196 /* Initialize this partition's AMO_t structure */
197 part->local_IPI_amo_va = xpc_IPI_init(partid);
198
199 spin_lock_init(&part->IPI_lock);
200
201 atomic_set(&part->channel_mgr_requests, 1);
202 init_waitqueue_head(&part->channel_mgr_wq);
203
204 sprintf(part->IPI_owner, "xpc%02d", partid);
205 ret = request_irq(SGI_XPC_NOTIFY, xpc_notify_IRQ_handler, IRQF_SHARED,
206 part->IPI_owner, (void *) (u64) partid);
207 if (ret != 0) {
208 dev_err(xpc_chan, "can't register NOTIFY IRQ handler, "
209 "errno=%d\n", -ret);
210 kfree(part->remote_openclose_args_base);
211 part->remote_openclose_args = NULL;
212 kfree(part->local_openclose_args_base);
213 part->local_openclose_args = NULL;
214 kfree(part->remote_GPs_base);
215 part->remote_GPs = NULL;
216 kfree(part->local_GPs_base);
217 part->local_GPs = NULL;
218 kfree(part->channels);
219 part->channels = NULL;
220 return xpcLackOfResources;
221 }
222
223 /* Set up a timer to check for dropped IPIs */
224 timer = &part->dropped_IPI_timer;
225 init_timer(timer);
226 timer->function = (void (*)(unsigned long)) xpc_dropped_IPI_check;
227 timer->data = (unsigned long) part;
228 timer->expires = jiffies + XPC_P_DROPPED_IPI_WAIT;
229 add_timer(timer);
230
231 /*
232 * With the setting of the partition setup_state to XPC_P_SETUP, we're
233 * declaring that this partition is ready to go.
234 */
235 part->setup_state = XPC_P_SETUP;
236
237
238 /*
239 * Set up the per-partition specific variables required by the
240 * remote partition to establish channel connections with us.
241 *
242 * The setting of the magic # indicates that these per partition
243 * specific variables are ready to be used.
244 */
245 xpc_vars_part[partid].GPs_pa = __pa(part->local_GPs);
246 xpc_vars_part[partid].openclose_args_pa =
247 __pa(part->local_openclose_args);
248 xpc_vars_part[partid].IPI_amo_pa = __pa(part->local_IPI_amo_va);
249 cpuid = raw_smp_processor_id(); /* any CPU in this partition will do */
250 xpc_vars_part[partid].IPI_nasid = cpuid_to_nasid(cpuid);
251 xpc_vars_part[partid].IPI_phys_cpuid = cpu_physical_id(cpuid);
252 xpc_vars_part[partid].nchannels = part->nchannels;
253 xpc_vars_part[partid].magic = XPC_VP_MAGIC1;
254
255 return xpcSuccess;
256}
257
258
259/*
260 * Create a wrapper that hides the underlying mechanism for pulling a cacheline
261 * (or multiple cachelines) from a remote partition.
262 *
263 * src must be a cacheline aligned physical address on the remote partition.
264 * dst must be a cacheline aligned virtual address on this partition.
265 * cnt must be a cacheline-sized multiple.
266 */
267static enum xpc_retval
268xpc_pull_remote_cachelines(struct xpc_partition *part, void *dst,
269 const void *src, size_t cnt)
270{
271 bte_result_t bte_ret;
272
273
274 DBUG_ON((u64) src != L1_CACHE_ALIGN((u64) src));
275 DBUG_ON((u64) dst != L1_CACHE_ALIGN((u64) dst));
276 DBUG_ON(cnt != L1_CACHE_ALIGN(cnt));
277
278 if (part->act_state == XPC_P_DEACTIVATING) {
279 return part->reason;
280 }
281
282 bte_ret = xp_bte_copy((u64) src, (u64) dst, (u64) cnt,
283 (BTE_NORMAL | BTE_WACQUIRE), NULL);
284 if (bte_ret == BTE_SUCCESS) {
285 return xpcSuccess;
286 }
287
288 dev_dbg(xpc_chan, "xp_bte_copy() from partition %d failed, ret=%d\n",
289 XPC_PARTID(part), bte_ret);
290
291 return xpc_map_bte_errors(bte_ret);
292}
293
294
295/*
296 * Pull the remote per partition specific variables from the specified
297 * partition.
298 */
299enum xpc_retval
300xpc_pull_remote_vars_part(struct xpc_partition *part)
301{
302 u8 buffer[L1_CACHE_BYTES * 2];
303 struct xpc_vars_part *pulled_entry_cacheline =
304 (struct xpc_vars_part *) L1_CACHE_ALIGN((u64) buffer);
305 struct xpc_vars_part *pulled_entry;
306 u64 remote_entry_cacheline_pa, remote_entry_pa;
307 partid_t partid = XPC_PARTID(part);
308 enum xpc_retval ret;
309
310
311 /* pull the cacheline that contains the variables we're interested in */
312
313 DBUG_ON(part->remote_vars_part_pa !=
314 L1_CACHE_ALIGN(part->remote_vars_part_pa));
315 DBUG_ON(sizeof(struct xpc_vars_part) != L1_CACHE_BYTES / 2);
316
317 remote_entry_pa = part->remote_vars_part_pa +
318 sn_partition_id * sizeof(struct xpc_vars_part);
319
320 remote_entry_cacheline_pa = (remote_entry_pa & ~(L1_CACHE_BYTES - 1));
321
322 pulled_entry = (struct xpc_vars_part *) ((u64) pulled_entry_cacheline +
323 (remote_entry_pa & (L1_CACHE_BYTES - 1)));
324
325 ret = xpc_pull_remote_cachelines(part, pulled_entry_cacheline,
326 (void *) remote_entry_cacheline_pa,
327 L1_CACHE_BYTES);
328 if (ret != xpcSuccess) {
329 dev_dbg(xpc_chan, "failed to pull XPC vars_part from "
330 "partition %d, ret=%d\n", partid, ret);
331 return ret;
332 }
333
334
335 /* see if they've been set up yet */
336
337 if (pulled_entry->magic != XPC_VP_MAGIC1 &&
338 pulled_entry->magic != XPC_VP_MAGIC2) {
339
340 if (pulled_entry->magic != 0) {
341 dev_dbg(xpc_chan, "partition %d's XPC vars_part for "
342 "partition %d has bad magic value (=0x%lx)\n",
343 partid, sn_partition_id, pulled_entry->magic);
344 return xpcBadMagic;
345 }
346
347 /* they've not been initialized yet */
348 return xpcRetry;
349 }
350
351 if (xpc_vars_part[partid].magic == XPC_VP_MAGIC1) {
352
353 /* validate the variables */
354
355 if (pulled_entry->GPs_pa == 0 ||
356 pulled_entry->openclose_args_pa == 0 ||
357 pulled_entry->IPI_amo_pa == 0) {
358
359 dev_err(xpc_chan, "partition %d's XPC vars_part for "
360 "partition %d are not valid\n", partid,
361 sn_partition_id);
362 return xpcInvalidAddress;
363 }
364
365 /* the variables we imported look to be valid */
366
367 part->remote_GPs_pa = pulled_entry->GPs_pa;
368 part->remote_openclose_args_pa =
369 pulled_entry->openclose_args_pa;
370 part->remote_IPI_amo_va =
371 (AMO_t *) __va(pulled_entry->IPI_amo_pa);
372 part->remote_IPI_nasid = pulled_entry->IPI_nasid;
373 part->remote_IPI_phys_cpuid = pulled_entry->IPI_phys_cpuid;
374
375 if (part->nchannels > pulled_entry->nchannels) {
376 part->nchannels = pulled_entry->nchannels;
377 }
378
379 /* let the other side know that we've pulled their variables */
380
381 xpc_vars_part[partid].magic = XPC_VP_MAGIC2;
382 }
383
384 if (pulled_entry->magic == XPC_VP_MAGIC1) {
385 return xpcRetry;
386 }
387
388 return xpcSuccess;
389}
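
/*
 * To summarize the handshake implied above: a partition sets its vars_part
 * magic to XPC_VP_MAGIC1 once its per partition variables are initialized,
 * and advances it to XPC_VP_MAGIC2 once it has also pulled the remote
 * partition's variables. xpc_pull_remote_vars_part() therefore returns
 * xpcRetry until both sides have reached XPC_VP_MAGIC2.
 */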
390
391
392/*
393 * Get the IPI flags and pull the openclose args and/or remote GPs as needed.
394 */
395static u64
396xpc_get_IPI_flags(struct xpc_partition *part)
397{
398 unsigned long irq_flags;
399 u64 IPI_amo;
400 enum xpc_retval ret;
401
402
403 /*
404 * See if there are any IPI flags to be handled.
405 */
406
407 spin_lock_irqsave(&part->IPI_lock, irq_flags);
408 if ((IPI_amo = part->local_IPI_amo) != 0) {
409 part->local_IPI_amo = 0;
410 }
411 spin_unlock_irqrestore(&part->IPI_lock, irq_flags);
412
413
414 if (XPC_ANY_OPENCLOSE_IPI_FLAGS_SET(IPI_amo)) {
415 ret = xpc_pull_remote_cachelines(part,
416 part->remote_openclose_args,
417 (void *) part->remote_openclose_args_pa,
418 XPC_OPENCLOSE_ARGS_SIZE);
419 if (ret != xpcSuccess) {
420 XPC_DEACTIVATE_PARTITION(part, ret);
421
422 dev_dbg(xpc_chan, "failed to pull openclose args from "
423 "partition %d, ret=%d\n", XPC_PARTID(part),
424 ret);
425
426 /* don't bother processing IPIs anymore */
427 IPI_amo = 0;
428 }
429 }
430
431 if (XPC_ANY_MSG_IPI_FLAGS_SET(IPI_amo)) {
432 ret = xpc_pull_remote_cachelines(part, part->remote_GPs,
433 (void *) part->remote_GPs_pa,
434 XPC_GP_SIZE);
435 if (ret != xpcSuccess) {
436 XPC_DEACTIVATE_PARTITION(part, ret);
437
438 dev_dbg(xpc_chan, "failed to pull GPs from partition "
439 "%d, ret=%d\n", XPC_PARTID(part), ret);
440
441 /* don't bother processing IPIs anymore */
442 IPI_amo = 0;
443 }
444 }
445
446 return IPI_amo;
447}
448
449
450/*
451 * Allocate the local message queue and the notify queue.
452 */
453static enum xpc_retval
454xpc_allocate_local_msgqueue(struct xpc_channel *ch)
455{
456 unsigned long irq_flags;
457 int nentries;
458 size_t nbytes;
459
460
461 // >>> may want to check for ch->flags & XPC_C_DISCONNECTING between
462 // >>> iterations of the for-loop, bail if set?
463
464 // >>> should we impose a minimum #of entries? like 4 or 8?
465 for (nentries = ch->local_nentries; nentries > 0; nentries--) {
466
467 nbytes = nentries * ch->msg_size;
468 ch->local_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes,
469 GFP_KERNEL,
470 &ch->local_msgqueue_base);
471 if (ch->local_msgqueue == NULL) {
472 continue;
473 }
474
475 nbytes = nentries * sizeof(struct xpc_notify);
476 ch->notify_queue = kzalloc(nbytes, GFP_KERNEL);
477 if (ch->notify_queue == NULL) {
478 kfree(ch->local_msgqueue_base);
479 ch->local_msgqueue = NULL;
480 continue;
481 }
482
483 spin_lock_irqsave(&ch->lock, irq_flags);
484 if (nentries < ch->local_nentries) {
485 dev_dbg(xpc_chan, "nentries=%d local_nentries=%d, "
486 "partid=%d, channel=%d\n", nentries,
487 ch->local_nentries, ch->partid, ch->number);
488
489 ch->local_nentries = nentries;
490 }
491 spin_unlock_irqrestore(&ch->lock, irq_flags);
492 return xpcSuccess;
493 }
494
495 dev_dbg(xpc_chan, "can't get memory for local message queue and notify "
496 "queue, partid=%d, channel=%d\n", ch->partid, ch->number);
497 return xpcNoMemory;
498}
499
500
501/*
502 * Allocate the cached remote message queue.
503 */
504static enum xpc_retval
505xpc_allocate_remote_msgqueue(struct xpc_channel *ch)
506{
507 unsigned long irq_flags;
508 int nentries;
509 size_t nbytes;
510
511
512 DBUG_ON(ch->remote_nentries <= 0);
513
514 // >>> may want to check for ch->flags & XPC_C_DISCONNECTING between
515 // >>> iterations of the for-loop, bail if set?
516
517 // >>> should we impose a minimum #of entries? like 4 or 8?
518 for (nentries = ch->remote_nentries; nentries > 0; nentries--) {
519
520 nbytes = nentries * ch->msg_size;
521 ch->remote_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes,
522 GFP_KERNEL,
523 &ch->remote_msgqueue_base);
524 if (ch->remote_msgqueue == NULL) {
525 continue;
526 }
527
528 spin_lock_irqsave(&ch->lock, irq_flags);
529 if (nentries < ch->remote_nentries) {
530 dev_dbg(xpc_chan, "nentries=%d remote_nentries=%d, "
531 "partid=%d, channel=%d\n", nentries,
532 ch->remote_nentries, ch->partid, ch->number);
533
534 ch->remote_nentries = nentries;
535 }
536 spin_unlock_irqrestore(&ch->lock, irq_flags);
537 return xpcSuccess;
538 }
539
540 dev_dbg(xpc_chan, "can't get memory for cached remote message queue, "
541 "partid=%d, channel=%d\n", ch->partid, ch->number);
542 return xpcNoMemory;
543}
544
545
546/*
547 * Allocate message queues and other stuff associated with a channel.
548 *
549 * Note: Assumes all of the channel sizes are filled in.
550 */
551static enum xpc_retval
552xpc_allocate_msgqueues(struct xpc_channel *ch)
553{
554 unsigned long irq_flags;
555 enum xpc_retval ret;
556
557
558 DBUG_ON(ch->flags & XPC_C_SETUP);
559
560 if ((ret = xpc_allocate_local_msgqueue(ch)) != xpcSuccess) {
561 return ret;
562 }
563
564 if ((ret = xpc_allocate_remote_msgqueue(ch)) != xpcSuccess) {
565 kfree(ch->local_msgqueue_base);
566 ch->local_msgqueue = NULL;
567 kfree(ch->notify_queue);
568 ch->notify_queue = NULL;
569 return ret;
570 }
571
572 spin_lock_irqsave(&ch->lock, irq_flags);
573 ch->flags |= XPC_C_SETUP;
574 spin_unlock_irqrestore(&ch->lock, irq_flags);
575
576 return xpcSuccess;
577}
578
579
580/*
581 * Process a connect message from a remote partition.
582 *
583 * Note: xpc_process_connect() expects to be called with the channel's lock
584 * held (acquired via spin_lock_irqsave()) and will leave it locked upon return.
585 */
586static void
587xpc_process_connect(struct xpc_channel *ch, unsigned long *irq_flags)
588{
589 enum xpc_retval ret;
590
591
592 DBUG_ON(!spin_is_locked(&ch->lock));
593
594 if (!(ch->flags & XPC_C_OPENREQUEST) ||
595 !(ch->flags & XPC_C_ROPENREQUEST)) {
596 /* nothing more to do for now */
597 return;
598 }
599 DBUG_ON(!(ch->flags & XPC_C_CONNECTING));
600
601 if (!(ch->flags & XPC_C_SETUP)) {
602 spin_unlock_irqrestore(&ch->lock, *irq_flags);
603 ret = xpc_allocate_msgqueues(ch);
604 spin_lock_irqsave(&ch->lock, *irq_flags);
605
606 if (ret != xpcSuccess) {
607 XPC_DISCONNECT_CHANNEL(ch, ret, irq_flags);
608 }
609 if (ch->flags & (XPC_C_CONNECTED | XPC_C_DISCONNECTING)) {
610 return;
611 }
612
613 DBUG_ON(!(ch->flags & XPC_C_SETUP));
614 DBUG_ON(ch->local_msgqueue == NULL);
615 DBUG_ON(ch->remote_msgqueue == NULL);
616 }
617
618 if (!(ch->flags & XPC_C_OPENREPLY)) {
619 ch->flags |= XPC_C_OPENREPLY;
620 xpc_IPI_send_openreply(ch, irq_flags);
621 }
622
623 if (!(ch->flags & XPC_C_ROPENREPLY)) {
624 return;
625 }
626
627 DBUG_ON(ch->remote_msgqueue_pa == 0);
628
629 ch->flags = (XPC_C_CONNECTED | XPC_C_SETUP); /* clear all else */
630
631 dev_info(xpc_chan, "channel %d to partition %d connected\n",
632 ch->number, ch->partid);
633
634 spin_unlock_irqrestore(&ch->lock, *irq_flags);
635 xpc_create_kthreads(ch, 1, 0);
636 spin_lock_irqsave(&ch->lock, *irq_flags);
637}
638
639
640/*
641 * Notify those who wanted to be notified upon delivery of their message.
642 */
643static void
644xpc_notify_senders(struct xpc_channel *ch, enum xpc_retval reason, s64 put)
645{
646 struct xpc_notify *notify;
647 u8 notify_type;
648 s64 get = ch->w_remote_GP.get - 1;
649
650
651 while (++get < put && atomic_read(&ch->n_to_notify) > 0) {
652
653 notify = &ch->notify_queue[get % ch->local_nentries];
654
655 /*
656 * See if the notify entry indicates it was associated with
657 * a message whose sender wants to be notified. It is possible
658 * that it is, but someone else is doing or has done the
659 * notification.
660 */
661 notify_type = notify->type;
662 if (notify_type == 0 ||
663 cmpxchg(&notify->type, notify_type, 0) !=
664 notify_type) {
665 continue;
666 }
667
668 DBUG_ON(notify_type != XPC_N_CALL);
669
670 atomic_dec(&ch->n_to_notify);
671
672 if (notify->func != NULL) {
673 dev_dbg(xpc_chan, "notify->func() called, notify=0x%p, "
674 "msg_number=%ld, partid=%d, channel=%d\n",
675 (void *) notify, get, ch->partid, ch->number);
676
677 notify->func(reason, ch->partid, ch->number,
678 notify->key);
679
680 dev_dbg(xpc_chan, "notify->func() returned, "
681 "notify=0x%p, msg_number=%ld, partid=%d, "
682 "channel=%d\n", (void *) notify, get,
683 ch->partid, ch->number);
684 }
685 }
686}
687
688
689/*
690 * Free up message queues and other stuff that were allocated for the specified
691 * channel.
692 *
693 * Note: ch->reason and ch->reason_line are left set for debugging purposes;
694 * they're cleared when XPC_C_DISCONNECTED is cleared.
695 */
696static void
697xpc_free_msgqueues(struct xpc_channel *ch)
698{
699 DBUG_ON(!spin_is_locked(&ch->lock));
700 DBUG_ON(atomic_read(&ch->n_to_notify) != 0);
701
702 ch->remote_msgqueue_pa = 0;
703 ch->func = NULL;
704 ch->key = NULL;
705 ch->msg_size = 0;
706 ch->local_nentries = 0;
707 ch->remote_nentries = 0;
708 ch->kthreads_assigned_limit = 0;
709 ch->kthreads_idle_limit = 0;
710
711 ch->local_GP->get = 0;
712 ch->local_GP->put = 0;
713 ch->remote_GP.get = 0;
714 ch->remote_GP.put = 0;
715 ch->w_local_GP.get = 0;
716 ch->w_local_GP.put = 0;
717 ch->w_remote_GP.get = 0;
718 ch->w_remote_GP.put = 0;
719 ch->next_msg_to_pull = 0;
720
721 if (ch->flags & XPC_C_SETUP) {
722 ch->flags &= ~XPC_C_SETUP;
723
724 dev_dbg(xpc_chan, "ch->flags=0x%x, partid=%d, channel=%d\n",
725 ch->flags, ch->partid, ch->number);
726
727 kfree(ch->local_msgqueue_base);
728 ch->local_msgqueue = NULL;
729 kfree(ch->remote_msgqueue_base);
730 ch->remote_msgqueue = NULL;
731 kfree(ch->notify_queue);
732 ch->notify_queue = NULL;
733 }
734}
735
736
737/*
738 * The channel's lock (acquired via spin_lock_irqsave()) is expected to be held on entry.
739 */
740static void
741xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags)
742{
743 struct xpc_partition *part = &xpc_partitions[ch->partid];
744 u32 channel_was_connected = (ch->flags & XPC_C_WASCONNECTED);
745
746
747 DBUG_ON(!spin_is_locked(&ch->lock));
748
749 if (!(ch->flags & XPC_C_DISCONNECTING)) {
750 return;
751 }
752
753 DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
754
755 /* make sure all activity has settled down first */
756
757 if (atomic_read(&ch->kthreads_assigned) > 0 ||
758 atomic_read(&ch->references) > 0) {
759 return;
760 }
761 DBUG_ON((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
762 !(ch->flags & XPC_C_DISCONNECTINGCALLOUT_MADE));
763
764 if (part->act_state == XPC_P_DEACTIVATING) {
765 /* can't proceed until the other side disengages from us */
766 if (xpc_partition_engaged(1UL << ch->partid)) {
767 return;
768 }
769
770 } else {
771
772 /* as long as the other side is up do the full protocol */
773
774 if (!(ch->flags & XPC_C_RCLOSEREQUEST)) {
775 return;
776 }
777
778 if (!(ch->flags & XPC_C_CLOSEREPLY)) {
779 ch->flags |= XPC_C_CLOSEREPLY;
780 xpc_IPI_send_closereply(ch, irq_flags);
781 }
782
783 if (!(ch->flags & XPC_C_RCLOSEREPLY)) {
784 return;
785 }
786 }
787
788 /* wake those waiting for notify completion */
789 if (atomic_read(&ch->n_to_notify) > 0) {
790 /* >>> we do callout while holding ch->lock */
791 xpc_notify_senders(ch, ch->reason, ch->w_local_GP.put);
792 }
793
794 /* both sides are disconnected now */
795
796 if (ch->flags & XPC_C_DISCONNECTINGCALLOUT_MADE) {
797 spin_unlock_irqrestore(&ch->lock, *irq_flags);
798 xpc_disconnect_callout(ch, xpcDisconnected);
799 spin_lock_irqsave(&ch->lock, *irq_flags);
800 }
801
802 /* it's now safe to free the channel's message queues */
803 xpc_free_msgqueues(ch);
804
805 /* mark disconnected, clear all other flags except XPC_C_WDISCONNECT */
806 ch->flags = (XPC_C_DISCONNECTED | (ch->flags & XPC_C_WDISCONNECT));
807
808 atomic_dec(&part->nchannels_active);
809
810 if (channel_was_connected) {
811 dev_info(xpc_chan, "channel %d to partition %d disconnected, "
812 "reason=%d\n", ch->number, ch->partid, ch->reason);
813 }
814
815 if (ch->flags & XPC_C_WDISCONNECT) {
816 /* we won't lose the CPU since we're holding ch->lock */
817 complete(&ch->wdisconnect_wait);
818 } else if (ch->delayed_IPI_flags) {
819 if (part->act_state != XPC_P_DEACTIVATING) {
820 /* time to take action on any delayed IPI flags */
821 spin_lock(&part->IPI_lock);
822 XPC_SET_IPI_FLAGS(part->local_IPI_amo, ch->number,
823 ch->delayed_IPI_flags);
824 spin_unlock(&part->IPI_lock);
825 }
826 ch->delayed_IPI_flags = 0;
827 }
828}
829
830
831/*
832 * Process a change in the channel's remote connection state.
833 */
834static void
835xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number,
836 u8 IPI_flags)
837{
838 unsigned long irq_flags;
839 struct xpc_openclose_args *args =
840 &part->remote_openclose_args[ch_number];
841 struct xpc_channel *ch = &part->channels[ch_number];
842 enum xpc_retval reason;
843
844
845
846 spin_lock_irqsave(&ch->lock, irq_flags);
847
848again:
849
850 if ((ch->flags & XPC_C_DISCONNECTED) &&
851 (ch->flags & XPC_C_WDISCONNECT)) {
852 /*
853 * Delay processing IPI flags until thread waiting disconnect
854 * has had a chance to see that the channel is disconnected.
855 */
856 ch->delayed_IPI_flags |= IPI_flags;
857 spin_unlock_irqrestore(&ch->lock, irq_flags);
858 return;
859 }
860
861
862 if (IPI_flags & XPC_IPI_CLOSEREQUEST) {
863
864 dev_dbg(xpc_chan, "XPC_IPI_CLOSEREQUEST (reason=%d) received "
865 "from partid=%d, channel=%d\n", args->reason,
866 ch->partid, ch->number);
867
868 /*
869 * If RCLOSEREQUEST is set, we're probably waiting for
870 * RCLOSEREPLY. We should find it and a ROPENREQUEST packed
871 * with this RCLOSEREQUEST in the IPI_flags.
872 */
873
874 if (ch->flags & XPC_C_RCLOSEREQUEST) {
875 DBUG_ON(!(ch->flags & XPC_C_DISCONNECTING));
876 DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
877 DBUG_ON(!(ch->flags & XPC_C_CLOSEREPLY));
878 DBUG_ON(ch->flags & XPC_C_RCLOSEREPLY);
879
880 DBUG_ON(!(IPI_flags & XPC_IPI_CLOSEREPLY));
881 IPI_flags &= ~XPC_IPI_CLOSEREPLY;
882 ch->flags |= XPC_C_RCLOSEREPLY;
883
884 /* both sides have finished disconnecting */
885 xpc_process_disconnect(ch, &irq_flags);
886 DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
887 goto again;
888 }
889
890 if (ch->flags & XPC_C_DISCONNECTED) {
891 if (!(IPI_flags & XPC_IPI_OPENREQUEST)) {
892 if ((XPC_GET_IPI_FLAGS(part->local_IPI_amo,
893 ch_number) & XPC_IPI_OPENREQUEST)) {
894
895 DBUG_ON(ch->delayed_IPI_flags != 0);
896 spin_lock(&part->IPI_lock);
897 XPC_SET_IPI_FLAGS(part->local_IPI_amo,
898 ch_number,
899 XPC_IPI_CLOSEREQUEST);
900 spin_unlock(&part->IPI_lock);
901 }
902 spin_unlock_irqrestore(&ch->lock, irq_flags);
903 return;
904 }
905
906 XPC_SET_REASON(ch, 0, 0);
907 ch->flags &= ~XPC_C_DISCONNECTED;
908
909 atomic_inc(&part->nchannels_active);
910 ch->flags |= (XPC_C_CONNECTING | XPC_C_ROPENREQUEST);
911 }
912
913 IPI_flags &= ~(XPC_IPI_OPENREQUEST | XPC_IPI_OPENREPLY);
914
915 /*
916 * The meaningful CLOSEREQUEST connection state fields are:
917 * reason = reason connection is to be closed
918 */
919
920 ch->flags |= XPC_C_RCLOSEREQUEST;
921
922 if (!(ch->flags & XPC_C_DISCONNECTING)) {
923 reason = args->reason;
924 if (reason <= xpcSuccess || reason > xpcUnknownReason) {
925 reason = xpcUnknownReason;
926 } else if (reason == xpcUnregistering) {
927 reason = xpcOtherUnregistering;
928 }
929
930 XPC_DISCONNECT_CHANNEL(ch, reason, &irq_flags);
931
932 DBUG_ON(IPI_flags & XPC_IPI_CLOSEREPLY);
933 spin_unlock_irqrestore(&ch->lock, irq_flags);
934 return;
935 }
936
937 xpc_process_disconnect(ch, &irq_flags);
938 }
939
940
941 if (IPI_flags & XPC_IPI_CLOSEREPLY) {
942
943 dev_dbg(xpc_chan, "XPC_IPI_CLOSEREPLY received from partid=%d,"
944 " channel=%d\n", ch->partid, ch->number);
945
946 if (ch->flags & XPC_C_DISCONNECTED) {
947 DBUG_ON(part->act_state != XPC_P_DEACTIVATING);
948 spin_unlock_irqrestore(&ch->lock, irq_flags);
949 return;
950 }
951
952 DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
953
954 if (!(ch->flags & XPC_C_RCLOSEREQUEST)) {
955 if ((XPC_GET_IPI_FLAGS(part->local_IPI_amo, ch_number)
956 & XPC_IPI_CLOSEREQUEST)) {
957
958 DBUG_ON(ch->delayed_IPI_flags != 0);
959 spin_lock(&part->IPI_lock);
960 XPC_SET_IPI_FLAGS(part->local_IPI_amo,
961 ch_number, XPC_IPI_CLOSEREPLY);
962 spin_unlock(&part->IPI_lock);
963 }
964 spin_unlock_irqrestore(&ch->lock, irq_flags);
965 return;
966 }
967
968 ch->flags |= XPC_C_RCLOSEREPLY;
969
970 if (ch->flags & XPC_C_CLOSEREPLY) {
971 /* both sides have finished disconnecting */
972 xpc_process_disconnect(ch, &irq_flags);
973 }
974 }
975
976
977 if (IPI_flags & XPC_IPI_OPENREQUEST) {
978
979 dev_dbg(xpc_chan, "XPC_IPI_OPENREQUEST (msg_size=%d, "
980 "local_nentries=%d) received from partid=%d, "
981 "channel=%d\n", args->msg_size, args->local_nentries,
982 ch->partid, ch->number);
983
984 if (part->act_state == XPC_P_DEACTIVATING ||
985 (ch->flags & XPC_C_ROPENREQUEST)) {
986 spin_unlock_irqrestore(&ch->lock, irq_flags);
987 return;
988 }
989
990 if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_WDISCONNECT)) {
991 ch->delayed_IPI_flags |= XPC_IPI_OPENREQUEST;
992 spin_unlock_irqrestore(&ch->lock, irq_flags);
993 return;
994 }
995 DBUG_ON(!(ch->flags & (XPC_C_DISCONNECTED |
996 XPC_C_OPENREQUEST)));
997 DBUG_ON(ch->flags & (XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY |
998 XPC_C_OPENREPLY | XPC_C_CONNECTED));
999
1000 /*
1001 * The meaningful OPENREQUEST connection state fields are:
1002 * msg_size = size of channel's messages in bytes
1003 * local_nentries = remote partition's local_nentries
1004 */
1005 if (args->msg_size == 0 || args->local_nentries == 0) {
1006 /* assume OPENREQUEST was delayed by mistake */
1007 spin_unlock_irqrestore(&ch->lock, irq_flags);
1008 return;
1009 }
1010
1011 ch->flags |= (XPC_C_ROPENREQUEST | XPC_C_CONNECTING);
1012 ch->remote_nentries = args->local_nentries;
1013
1014
1015 if (ch->flags & XPC_C_OPENREQUEST) {
1016 if (args->msg_size != ch->msg_size) {
1017 XPC_DISCONNECT_CHANNEL(ch, xpcUnequalMsgSizes,
1018 &irq_flags);
1019 spin_unlock_irqrestore(&ch->lock, irq_flags);
1020 return;
1021 }
1022 } else {
1023 ch->msg_size = args->msg_size;
1024
1025 XPC_SET_REASON(ch, 0, 0);
1026 ch->flags &= ~XPC_C_DISCONNECTED;
1027
1028 atomic_inc(&part->nchannels_active);
1029 }
1030
1031 xpc_process_connect(ch, &irq_flags);
1032 }
1033
1034
1035 if (IPI_flags & XPC_IPI_OPENREPLY) {
1036
1037 dev_dbg(xpc_chan, "XPC_IPI_OPENREPLY (local_msgqueue_pa=0x%lx, "
1038 "local_nentries=%d, remote_nentries=%d) received from "
1039 "partid=%d, channel=%d\n", args->local_msgqueue_pa,
1040 args->local_nentries, args->remote_nentries,
1041 ch->partid, ch->number);
1042
1043 if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED)) {
1044 spin_unlock_irqrestore(&ch->lock, irq_flags);
1045 return;
1046 }
1047 if (!(ch->flags & XPC_C_OPENREQUEST)) {
1048 XPC_DISCONNECT_CHANNEL(ch, xpcOpenCloseError,
1049 &irq_flags);
1050 spin_unlock_irqrestore(&ch->lock, irq_flags);
1051 return;
1052 }
1053
1054 DBUG_ON(!(ch->flags & XPC_C_ROPENREQUEST));
1055 DBUG_ON(ch->flags & XPC_C_CONNECTED);
1056
1057 /*
1058 * The meaningful OPENREPLY connection state fields are:
1059 * local_msgqueue_pa = physical address of remote
1060 * partition's local_msgqueue
1061 * local_nentries = remote partition's local_nentries
1062 * remote_nentries = remote partition's remote_nentries
1063 */
1064 DBUG_ON(args->local_msgqueue_pa == 0);
1065 DBUG_ON(args->local_nentries == 0);
1066 DBUG_ON(args->remote_nentries == 0);
1067
1068 ch->flags |= XPC_C_ROPENREPLY;
1069 ch->remote_msgqueue_pa = args->local_msgqueue_pa;
1070
1071 if (args->local_nentries < ch->remote_nentries) {
1072 dev_dbg(xpc_chan, "XPC_IPI_OPENREPLY: new "
1073 "remote_nentries=%d, old remote_nentries=%d, "
1074 "partid=%d, channel=%d\n",
1075 args->local_nentries, ch->remote_nentries,
1076 ch->partid, ch->number);
1077
1078 ch->remote_nentries = args->local_nentries;
1079 }
1080 if (args->remote_nentries < ch->local_nentries) {
1081 dev_dbg(xpc_chan, "XPC_IPI_OPENREPLY: new "
1082 "local_nentries=%d, old local_nentries=%d, "
1083 "partid=%d, channel=%d\n",
1084 args->remote_nentries, ch->local_nentries,
1085 ch->partid, ch->number);
1086
1087 ch->local_nentries = args->remote_nentries;
1088 }
1089
1090 xpc_process_connect(ch, &irq_flags);
1091 }
1092
1093 spin_unlock_irqrestore(&ch->lock, irq_flags);
1094}
1095
1096
1097/*
1098 * Attempt to establish a channel connection to a remote partition.
1099 */
1100static enum xpc_retval
1101xpc_connect_channel(struct xpc_channel *ch)
1102{
1103 unsigned long irq_flags;
1104 struct xpc_registration *registration = &xpc_registrations[ch->number];
1105
1106
1107 if (mutex_trylock(&registration->mutex) == 0) {
1108 return xpcRetry;
1109 }
1110
1111 if (!XPC_CHANNEL_REGISTERED(ch->number)) {
1112 mutex_unlock(&registration->mutex);
1113 return xpcUnregistered;
1114 }
1115
1116 spin_lock_irqsave(&ch->lock, irq_flags);
1117
1118 DBUG_ON(ch->flags & XPC_C_CONNECTED);
1119 DBUG_ON(ch->flags & XPC_C_OPENREQUEST);
1120
1121 if (ch->flags & XPC_C_DISCONNECTING) {
1122 spin_unlock_irqrestore(&ch->lock, irq_flags);
1123 mutex_unlock(&registration->mutex);
1124 return ch->reason;
1125 }
1126
1127
1128 /* add info from the channel connect registration to the channel */
1129
1130 ch->kthreads_assigned_limit = registration->assigned_limit;
1131 ch->kthreads_idle_limit = registration->idle_limit;
1132 DBUG_ON(atomic_read(&ch->kthreads_assigned) != 0);
1133 DBUG_ON(atomic_read(&ch->kthreads_idle) != 0);
1134 DBUG_ON(atomic_read(&ch->kthreads_active) != 0);
1135
1136 ch->func = registration->func;
1137 DBUG_ON(registration->func == NULL);
1138 ch->key = registration->key;
1139
1140 ch->local_nentries = registration->nentries;
1141
1142 if (ch->flags & XPC_C_ROPENREQUEST) {
1143 if (registration->msg_size != ch->msg_size) {
1144 /* the local and remote sides aren't the same */
1145
1146 /*
1147 * Because XPC_DISCONNECT_CHANNEL() can block we're
1148 * forced to up the registration sema before we unlock
1149 * the channel lock. But that's okay here because we're
1150 * done with the part that required the registration
1151 * sema. XPC_DISCONNECT_CHANNEL() requires that the
1152 * channel lock be locked and will unlock and relock
1153 * the channel lock as needed.
1154 */
1155 mutex_unlock(&registration->mutex);
1156 XPC_DISCONNECT_CHANNEL(ch, xpcUnequalMsgSizes,
1157 &irq_flags);
1158 spin_unlock_irqrestore(&ch->lock, irq_flags);
1159 return xpcUnequalMsgSizes;
1160 }
1161 } else {
1162 ch->msg_size = registration->msg_size;
1163
1164 XPC_SET_REASON(ch, 0, 0);
1165 ch->flags &= ~XPC_C_DISCONNECTED;
1166
1167 atomic_inc(&xpc_partitions[ch->partid].nchannels_active);
1168 }
1169
1170 mutex_unlock(&registration->mutex);
1171
1172
1173 /* initiate the connection */
1174
1175 ch->flags |= (XPC_C_OPENREQUEST | XPC_C_CONNECTING);
1176 xpc_IPI_send_openrequest(ch, &irq_flags);
1177
1178 xpc_process_connect(ch, &irq_flags);
1179
1180 spin_unlock_irqrestore(&ch->lock, irq_flags);
1181
1182 return xpcSuccess;
1183}
1184
1185
1186/*
1187 * Clear some of the msg flags in the local message queue.
1188 */
1189static inline void
1190xpc_clear_local_msgqueue_flags(struct xpc_channel *ch)
1191{
1192 struct xpc_msg *msg;
1193 s64 get;
1194
1195
1196 get = ch->w_remote_GP.get;
1197 do {
1198 msg = (struct xpc_msg *) ((u64) ch->local_msgqueue +
1199 (get % ch->local_nentries) * ch->msg_size);
1200 msg->flags = 0;
1201 } while (++get < (volatile s64) ch->remote_GP.get);
1202}
1203
1204
1205/*
1206 * Clear some of the msg flags in the remote message queue.
1207 */
1208static inline void
1209xpc_clear_remote_msgqueue_flags(struct xpc_channel *ch)
1210{
1211 struct xpc_msg *msg;
1212 s64 put;
1213
1214
1215 put = ch->w_remote_GP.put;
1216 do {
1217 msg = (struct xpc_msg *) ((u64) ch->remote_msgqueue +
1218 (put % ch->remote_nentries) * ch->msg_size);
1219 msg->flags = 0;
1220 } while (++put < (volatile s64) ch->remote_GP.put);
1221}
1222
1223
1224static void
1225xpc_process_msg_IPI(struct xpc_partition *part, int ch_number)
1226{
1227 struct xpc_channel *ch = &part->channels[ch_number];
1228 int nmsgs_sent;
1229
1230
1231 ch->remote_GP = part->remote_GPs[ch_number];
1232
1233
1234 /* See what, if anything, has changed for each connected channel */
1235
1236 xpc_msgqueue_ref(ch);
1237
1238 if (ch->w_remote_GP.get == ch->remote_GP.get &&
1239 ch->w_remote_GP.put == ch->remote_GP.put) {
1240 /* nothing changed since GPs were last pulled */
1241 xpc_msgqueue_deref(ch);
1242 return;
1243 }
1244
1245 if (!(ch->flags & XPC_C_CONNECTED)){
1246 xpc_msgqueue_deref(ch);
1247 return;
1248 }
1249
1250
1251 /*
1252 * First check to see if messages recently sent by us have been
1253 * received by the other side. (The remote GET value will have
1254 * changed since we last looked at it.)
1255 */
1256
1257 if (ch->w_remote_GP.get != ch->remote_GP.get) {
1258
1259 /*
1260 * We need to notify any senders that want to be notified
1261 * that their sent messages have been received by their
1262 * intended recipients. We need to do this before updating
1263 * w_remote_GP.get so that we don't allocate the same message
1264 * queue entries prematurely (see xpc_allocate_msg()).
1265 */
1266 if (atomic_read(&ch->n_to_notify) > 0) {
1267 /*
1268 * Notify senders that messages sent have been
1269 * received and delivered by the other side.
1270 */
1271 xpc_notify_senders(ch, xpcMsgDelivered,
1272 ch->remote_GP.get);
1273 }
1274
1275 /*
1276 * Clear msg->flags in previously sent messages, so that
1277 * they're ready for xpc_allocate_msg().
1278 */
1279 xpc_clear_local_msgqueue_flags(ch);
1280
1281 ch->w_remote_GP.get = ch->remote_GP.get;
1282
1283 dev_dbg(xpc_chan, "w_remote_GP.get changed to %ld, partid=%d, "
1284 "channel=%d\n", ch->w_remote_GP.get, ch->partid,
1285 ch->number);
1286
1287 /*
1288 * If anyone was waiting for message queue entries to become
1289 * available, wake them up.
1290 */
1291 if (atomic_read(&ch->n_on_msg_allocate_wq) > 0) {
1292 wake_up(&ch->msg_allocate_wq);
1293 }
1294 }
1295
1296
1297 /*
1298 * Now check for newly sent messages by the other side. (The remote
1299 * PUT value will have changed since we last looked at it.)
1300 */
1301
1302 if (ch->w_remote_GP.put != ch->remote_GP.put) {
1303 /*
1304 * Clear msg->flags in previously received messages, so that
1305 * they're ready for xpc_get_deliverable_msg().
1306 */
1307 xpc_clear_remote_msgqueue_flags(ch);
1308
1309 ch->w_remote_GP.put = ch->remote_GP.put;
1310
1311 dev_dbg(xpc_chan, "w_remote_GP.put changed to %ld, partid=%d, "
1312 "channel=%d\n", ch->w_remote_GP.put, ch->partid,
1313 ch->number);
1314
1315 nmsgs_sent = ch->w_remote_GP.put - ch->w_local_GP.get;
1316 if (nmsgs_sent > 0) {
1317 dev_dbg(xpc_chan, "msgs waiting to be copied and "
1318 "delivered=%d, partid=%d, channel=%d\n",
1319 nmsgs_sent, ch->partid, ch->number);
1320
1321 if (ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) {
1322 xpc_activate_kthreads(ch, nmsgs_sent);
1323 }
1324 }
1325 }
1326
1327 xpc_msgqueue_deref(ch);
1328}
1329
1330
1331void
1332xpc_process_channel_activity(struct xpc_partition *part)
1333{
1334 unsigned long irq_flags;
1335 u64 IPI_amo, IPI_flags;
1336 struct xpc_channel *ch;
1337 int ch_number;
1338 u32 ch_flags;
1339
1340
1341 IPI_amo = xpc_get_IPI_flags(part);
1342
1343 /*
1344 * Initiate channel connections for registered channels.
1345 *
1346 * For each connected channel that has pending messages activate idle
1347 * kthreads and/or create new kthreads as needed.
1348 */
1349
1350 for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
1351 ch = &part->channels[ch_number];
1352
1353
1354 /*
1355 * Process any open or close related IPI flags, and then deal
1356 * with connecting or disconnecting the channel as required.
1357 */
1358
1359 IPI_flags = XPC_GET_IPI_FLAGS(IPI_amo, ch_number);
1360
1361 if (XPC_ANY_OPENCLOSE_IPI_FLAGS_SET(IPI_flags)) {
1362 xpc_process_openclose_IPI(part, ch_number, IPI_flags);
1363 }
1364
1365 ch_flags = ch->flags; /* need an atomic snapshot of flags */
1366
1367 if (ch_flags & XPC_C_DISCONNECTING) {
1368 spin_lock_irqsave(&ch->lock, irq_flags);
1369 xpc_process_disconnect(ch, &irq_flags);
1370 spin_unlock_irqrestore(&ch->lock, irq_flags);
1371 continue;
1372 }
1373
1374 if (part->act_state == XPC_P_DEACTIVATING) {
1375 continue;
1376 }
1377
1378 if (!(ch_flags & XPC_C_CONNECTED)) {
1379 if (!(ch_flags & XPC_C_OPENREQUEST)) {
1380 DBUG_ON(ch_flags & XPC_C_SETUP);
1381 (void) xpc_connect_channel(ch);
1382 } else {
1383 spin_lock_irqsave(&ch->lock, irq_flags);
1384 xpc_process_connect(ch, &irq_flags);
1385 spin_unlock_irqrestore(&ch->lock, irq_flags);
1386 }
1387 continue;
1388 }
1389
1390
1391 /*
1392 * Process any message related IPI flags, this may involve the
1393 * activation of kthreads to deliver any pending messages sent
1394 * from the other partition.
1395 */
1396
1397 if (XPC_ANY_MSG_IPI_FLAGS_SET(IPI_flags)) {
1398 xpc_process_msg_IPI(part, ch_number);
1399 }
1400 }
1401}
1402
1403
1404/*
1405 * XPC's heartbeat code calls this function to inform XPC that a partition is
1406 * going down. XPC responds by tearing down the XPartition Communication
1407 * infrastructure used for the just downed partition.
1408 *
1409 * XPC's heartbeat code will never call this function and xpc_partition_up()
1410 * at the same time. Nor will it ever make multiple calls to either function
1411 * at the same time.
1412 */
1413void
1414xpc_partition_going_down(struct xpc_partition *part, enum xpc_retval reason)
1415{
1416 unsigned long irq_flags;
1417 int ch_number;
1418 struct xpc_channel *ch;
1419
1420
1421 dev_dbg(xpc_chan, "deactivating partition %d, reason=%d\n",
1422 XPC_PARTID(part), reason);
1423
1424 if (!xpc_part_ref(part)) {
1425 /* infrastructure for this partition isn't currently set up */
1426 return;
1427 }
1428
1429
1430 /* disconnect channels associated with the partition going down */
1431
1432 for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
1433 ch = &part->channels[ch_number];
1434
1435 xpc_msgqueue_ref(ch);
1436 spin_lock_irqsave(&ch->lock, irq_flags);
1437
1438 XPC_DISCONNECT_CHANNEL(ch, reason, &irq_flags);
1439
1440 spin_unlock_irqrestore(&ch->lock, irq_flags);
1441 xpc_msgqueue_deref(ch);
1442 }
1443
1444 xpc_wakeup_channel_mgr(part);
1445
1446 xpc_part_deref(part);
1447}
1448
1449
1450/*
1451 * Tear down the infrastructure necessary to support XPartition Communication
1452 * between the specified remote partition and the local one.
1453 */
1454void
1455xpc_teardown_infrastructure(struct xpc_partition *part)
1456{
1457 partid_t partid = XPC_PARTID(part);
1458
1459
1460 /*
1461 * We start off by making this partition inaccessible to local
1462 * processes by marking it as no longer setup. Then we make it
1463	 * inaccessible to remote processes by clearing the XPC per-partition
1464	 * variables' magic # (which indicates that these variables
1465 * are no longer valid) and by ignoring all XPC notify IPIs sent to
1466 * this partition.
1467 */
1468
1469 DBUG_ON(atomic_read(&part->nchannels_engaged) != 0);
1470 DBUG_ON(atomic_read(&part->nchannels_active) != 0);
1471 DBUG_ON(part->setup_state != XPC_P_SETUP);
1472 part->setup_state = XPC_P_WTEARDOWN;
1473
1474 xpc_vars_part[partid].magic = 0;
1475
1476
1477 free_irq(SGI_XPC_NOTIFY, (void *) (u64) partid);
1478
1479
1480 /*
1481 * Before proceeding with the teardown we have to wait until all
1482 * existing references cease.
1483 */
1484 wait_event(part->teardown_wq, (atomic_read(&part->references) == 0));
1485
1486
1487 /* now we can begin tearing down the infrastructure */
1488
1489 part->setup_state = XPC_P_TORNDOWN;
1490
1491 /* in case we've still got outstanding timers registered... */
1492 del_timer_sync(&part->dropped_IPI_timer);
1493
1494 kfree(part->remote_openclose_args_base);
1495 part->remote_openclose_args = NULL;
1496 kfree(part->local_openclose_args_base);
1497 part->local_openclose_args = NULL;
1498 kfree(part->remote_GPs_base);
1499 part->remote_GPs = NULL;
1500 kfree(part->local_GPs_base);
1501 part->local_GPs = NULL;
1502 kfree(part->channels);
1503 part->channels = NULL;
1504 part->local_IPI_amo_va = NULL;
1505}
1506
1507
1508/*
1509 * Called by XP at the time of channel connection registration to cause
1510 * XPC to establish connections to all currently active partitions.
1511 */
1512void
1513xpc_initiate_connect(int ch_number)
1514{
1515 partid_t partid;
1516 struct xpc_partition *part;
1517 struct xpc_channel *ch;
1518
1519
1520 DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS);
1521
1522 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
1523 part = &xpc_partitions[partid];
1524
1525 if (xpc_part_ref(part)) {
1526 ch = &part->channels[ch_number];
1527
1528 /*
1529 * Initiate the establishment of a connection on the
1530 * newly registered channel to the remote partition.
1531 */
1532 xpc_wakeup_channel_mgr(part);
1533 xpc_part_deref(part);
1534 }
1535 }
1536}
1537
1538
1539void
1540xpc_connected_callout(struct xpc_channel *ch)
1541{
1542 /* let the registerer know that a connection has been established */
1543
1544 if (ch->func != NULL) {
1545 dev_dbg(xpc_chan, "ch->func() called, reason=xpcConnected, "
1546 "partid=%d, channel=%d\n", ch->partid, ch->number);
1547
1548 ch->func(xpcConnected, ch->partid, ch->number,
1549 (void *) (u64) ch->local_nentries, ch->key);
1550
1551 dev_dbg(xpc_chan, "ch->func() returned, reason=xpcConnected, "
1552 "partid=%d, channel=%d\n", ch->partid, ch->number);
1553 }
1554}
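For context, the ch->func invoked above is whatever function the user registered via xpc_connect(). The following is only an illustrative sketch of what such a function might look like, assuming the callout signature used throughout this file (reason, partid, channel number, data, key); the handler body and all names are hypothetical and not part of this patch.

	static void my_channel_func(enum xpc_retval reason, partid_t partid,
				    int ch_number, void *data, void *key)
	{
		switch (reason) {
		case xpcConnected:
			/* data carries the #of local msg entries, cast to void * */
			printk(KERN_INFO "channel %d to partition %d connected, "
			       "%lu entries\n", ch_number, partid,
			       (unsigned long) data);
			break;
		case xpcMsgReceived:
			/* data points at the delivered payload (see xpc_deliver_msg()) */
			break;
		default:
			/* any other reason means the channel is disconnecting */
			break;
		}
	}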
1555
1556
1557/*
1558 * Called by XP at the time of channel connection unregistration to cause
1559 * XPC to teardown all current connections for the specified channel.
1560 *
1561 * Before returning xpc_initiate_disconnect() will wait until all connections
1562 * on the specified channel have been closed/torn down. So the caller can be
1563 * assured that they will not be receiving any more callouts from XPC to the
1564 * function they registered via xpc_connect().
1565 *
1566 * Arguments:
1567 *
1568 * ch_number - channel # to unregister.
1569 */
1570void
1571xpc_initiate_disconnect(int ch_number)
1572{
1573 unsigned long irq_flags;
1574 partid_t partid;
1575 struct xpc_partition *part;
1576 struct xpc_channel *ch;
1577
1578
1579 DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS);
1580
1581 /* initiate the channel disconnect for every active partition */
1582 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
1583 part = &xpc_partitions[partid];
1584
1585 if (xpc_part_ref(part)) {
1586 ch = &part->channels[ch_number];
1587 xpc_msgqueue_ref(ch);
1588
1589 spin_lock_irqsave(&ch->lock, irq_flags);
1590
1591 if (!(ch->flags & XPC_C_DISCONNECTED)) {
1592 ch->flags |= XPC_C_WDISCONNECT;
1593
1594 XPC_DISCONNECT_CHANNEL(ch, xpcUnregistering,
1595 &irq_flags);
1596 }
1597
1598 spin_unlock_irqrestore(&ch->lock, irq_flags);
1599
1600 xpc_msgqueue_deref(ch);
1601 xpc_part_deref(part);
1602 }
1603 }
1604
1605 xpc_disconnect_wait(ch_number);
1606}
1607
1608
1609/*
1610 * Disconnect a channel and reflect the disconnect back to all who may be waiting.
1611 *
1612 * An OPEN is not allowed until XPC_C_DISCONNECTING is cleared by
1613 * xpc_process_disconnect(), and if set, XPC_C_WDISCONNECT is cleared by
1614 * xpc_disconnect_wait().
1615 *
1616 * THE CHANNEL IS TO BE LOCKED BY THE CALLER AND WILL REMAIN LOCKED UPON RETURN.
1617 */
1618void
1619xpc_disconnect_channel(const int line, struct xpc_channel *ch,
1620 enum xpc_retval reason, unsigned long *irq_flags)
1621{
1622 u32 channel_was_connected = (ch->flags & XPC_C_CONNECTED);
1623
1624
1625 DBUG_ON(!spin_is_locked(&ch->lock));
1626
1627 if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED)) {
1628 return;
1629 }
1630 DBUG_ON(!(ch->flags & (XPC_C_CONNECTING | XPC_C_CONNECTED)));
1631
1632 dev_dbg(xpc_chan, "reason=%d, line=%d, partid=%d, channel=%d\n",
1633 reason, line, ch->partid, ch->number);
1634
1635 XPC_SET_REASON(ch, reason, line);
1636
1637 ch->flags |= (XPC_C_CLOSEREQUEST | XPC_C_DISCONNECTING);
1638 /* some of these may not have been set */
1639 ch->flags &= ~(XPC_C_OPENREQUEST | XPC_C_OPENREPLY |
1640 XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY |
1641 XPC_C_CONNECTING | XPC_C_CONNECTED);
1642
1643 xpc_IPI_send_closerequest(ch, irq_flags);
1644
1645 if (channel_was_connected) {
1646 ch->flags |= XPC_C_WASCONNECTED;
1647 }
1648
1649 spin_unlock_irqrestore(&ch->lock, *irq_flags);
1650
1651 /* wake all idle kthreads so they can exit */
1652 if (atomic_read(&ch->kthreads_idle) > 0) {
1653 wake_up_all(&ch->idle_wq);
1654
1655 } else if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
1656 !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
1657 /* start a kthread that will do the xpcDisconnecting callout */
1658 xpc_create_kthreads(ch, 1, 1);
1659 }
1660
1661 /* wake those waiting to allocate an entry from the local msg queue */
1662 if (atomic_read(&ch->n_on_msg_allocate_wq) > 0) {
1663 wake_up(&ch->msg_allocate_wq);
1664 }
1665
1666 spin_lock_irqsave(&ch->lock, *irq_flags);
1667}
1668
1669
1670void
1671xpc_disconnect_callout(struct xpc_channel *ch, enum xpc_retval reason)
1672{
1673 /*
1674 * Let the channel's registerer know that the channel is being
1675 * disconnected. We don't want to do this if the registerer was never
1676 * informed of a connection being made.
1677 */
1678
1679 if (ch->func != NULL) {
1680 dev_dbg(xpc_chan, "ch->func() called, reason=%d, partid=%d, "
1681 "channel=%d\n", reason, ch->partid, ch->number);
1682
1683 ch->func(reason, ch->partid, ch->number, NULL, ch->key);
1684
1685 dev_dbg(xpc_chan, "ch->func() returned, reason=%d, partid=%d, "
1686 "channel=%d\n", reason, ch->partid, ch->number);
1687 }
1688}
1689
1690
1691/*
1692 * Wait for a message entry to become available for the specified channel,
1693 * but don't wait any longer than 1 jiffy.
1694 */
1695static enum xpc_retval
1696xpc_allocate_msg_wait(struct xpc_channel *ch)
1697{
1698 enum xpc_retval ret;
1699
1700
1701 if (ch->flags & XPC_C_DISCONNECTING) {
1702 DBUG_ON(ch->reason == xpcInterrupted); // >>> Is this true?
1703 return ch->reason;
1704 }
1705
1706 atomic_inc(&ch->n_on_msg_allocate_wq);
1707 ret = interruptible_sleep_on_timeout(&ch->msg_allocate_wq, 1);
1708 atomic_dec(&ch->n_on_msg_allocate_wq);
1709
1710 if (ch->flags & XPC_C_DISCONNECTING) {
1711 ret = ch->reason;
1712 DBUG_ON(ch->reason == xpcInterrupted); // >>> Is this true?
1713 } else if (ret == 0) {
1714 ret = xpcTimeout;
1715 } else {
1716 ret = xpcInterrupted;
1717 }
1718
1719 return ret;
1720}
1721
1722
1723/*
1724 * Allocate an entry for a message from the message queue associated with the
1725 * specified channel.
1726 */
1727static enum xpc_retval
1728xpc_allocate_msg(struct xpc_channel *ch, u32 flags,
1729 struct xpc_msg **address_of_msg)
1730{
1731 struct xpc_msg *msg;
1732 enum xpc_retval ret;
1733 s64 put;
1734
1735
1736 /* this reference will be dropped in xpc_send_msg() */
1737 xpc_msgqueue_ref(ch);
1738
1739 if (ch->flags & XPC_C_DISCONNECTING) {
1740 xpc_msgqueue_deref(ch);
1741 return ch->reason;
1742 }
1743 if (!(ch->flags & XPC_C_CONNECTED)) {
1744 xpc_msgqueue_deref(ch);
1745 return xpcNotConnected;
1746 }
1747
1748
1749 /*
1750 * Get the next available message entry from the local message queue.
1751 * If none are available, we'll make sure that we grab the latest
1752 * GP values.
1753 */
1754 ret = xpcTimeout;
1755
1756 while (1) {
1757
1758 put = (volatile s64) ch->w_local_GP.put;
1759 if (put - (volatile s64) ch->w_remote_GP.get <
1760 ch->local_nentries) {
1761
1762 /* There are available message entries. We need to try
1763 * to secure one for ourselves. We'll do this by trying
1764 * to increment w_local_GP.put as long as someone else
1765 * doesn't beat us to it. If they do, we'll have to
1766 * try again.
1767 */
1768 if (cmpxchg(&ch->w_local_GP.put, put, put + 1) ==
1769 put) {
1770 /* we got the entry referenced by put */
1771 break;
1772 }
1773 continue; /* try again */
1774 }
1775
1776
1777 /*
1778 * There aren't any available msg entries at this time.
1779 *
1780		 * While waiting for a message entry to become available,
1781 * we set a timeout in case the other side is not
1782 * sending completion IPIs. This lets us fake an IPI
1783 * that will cause the IPI handler to fetch the latest
1784 * GP values as if an IPI was sent by the other side.
1785 */
1786 if (ret == xpcTimeout) {
1787 xpc_IPI_send_local_msgrequest(ch);
1788 }
1789
1790 if (flags & XPC_NOWAIT) {
1791 xpc_msgqueue_deref(ch);
1792 return xpcNoWait;
1793 }
1794
1795 ret = xpc_allocate_msg_wait(ch);
1796 if (ret != xpcInterrupted && ret != xpcTimeout) {
1797 xpc_msgqueue_deref(ch);
1798 return ret;
1799 }
1800 }
1801
1802
1803 /* get the message's address and initialize it */
1804 msg = (struct xpc_msg *) ((u64) ch->local_msgqueue +
1805 (put % ch->local_nentries) * ch->msg_size);
1806
1807
1808 DBUG_ON(msg->flags != 0);
1809 msg->number = put;
1810
1811 dev_dbg(xpc_chan, "w_local_GP.put changed to %ld; msg=0x%p, "
1812 "msg_number=%ld, partid=%d, channel=%d\n", put + 1,
1813 (void *) msg, msg->number, ch->partid, ch->number);
1814
1815 *address_of_msg = msg;
1816
1817 return xpcSuccess;
1818}
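The loop above reserves a message slot without taking a lock: it snapshots w_local_GP.put, checks it against the cached remote Get value, and claims the slot with cmpxchg(), retrying if another CPU wins the race. Distilled to its essentials (names invented for illustration; this helper does not exist in the driver):

	/* Sketch of the lock-free slot reservation done by xpc_allocate_msg(). */
	static inline s64 claim_ring_slot(s64 *put, s64 cached_get, int nentries)
	{
		s64 p;

		for (;;) {
			p = *put;
			if (p - cached_get >= nentries)
				return -1;		/* ring is currently full */
			if (cmpxchg(put, p, p + 1) == p)
				return p;		/* we own slot (p % nentries) */
			/* another CPU advanced put first; retry */
		}
	}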
1819
1820
1821/*
1822 * Allocate an entry for a message from the message queue associated with the
1823 * specified channel. NOTE that this routine can sleep waiting for a message
1824 * entry to become available. To not sleep, pass in the XPC_NOWAIT flag.
1825 *
1826 * Arguments:
1827 *
1828 * partid - ID of partition to which the channel is connected.
1829 * ch_number - channel #.
1830 * flags - see xpc.h for valid flags.
1831 * payload - address of the allocated payload area pointer (filled in on
1832 * return) in which the user-defined message is constructed.
1833 */
1834enum xpc_retval
1835xpc_initiate_allocate(partid_t partid, int ch_number, u32 flags, void **payload)
1836{
1837 struct xpc_partition *part = &xpc_partitions[partid];
1838 enum xpc_retval ret = xpcUnknownReason;
1839 struct xpc_msg *msg = NULL;
1840
1841
1842 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
1843 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
1844
1845 *payload = NULL;
1846
1847 if (xpc_part_ref(part)) {
1848 ret = xpc_allocate_msg(&part->channels[ch_number], flags, &msg);
1849 xpc_part_deref(part);
1850
1851 if (msg != NULL) {
1852 *payload = &msg->payload;
1853 }
1854 }
1855
1856 return ret;
1857}
1858
1859
1860/*
1861 * Now we actually send the messages that are ready to be sent by advancing
1862 * the local message queue's Put value and then sending an IPI to the recipient
1863 * partition.
1864 */
1865static void
1866xpc_send_msgs(struct xpc_channel *ch, s64 initial_put)
1867{
1868 struct xpc_msg *msg;
1869 s64 put = initial_put + 1;
1870 int send_IPI = 0;
1871
1872
1873 while (1) {
1874
1875 while (1) {
1876 if (put == (volatile s64) ch->w_local_GP.put) {
1877 break;
1878 }
1879
1880 msg = (struct xpc_msg *) ((u64) ch->local_msgqueue +
1881 (put % ch->local_nentries) * ch->msg_size);
1882
1883 if (!(msg->flags & XPC_M_READY)) {
1884 break;
1885 }
1886
1887 put++;
1888 }
1889
1890 if (put == initial_put) {
1891 /* nothing's changed */
1892 break;
1893 }
1894
1895 if (cmpxchg_rel(&ch->local_GP->put, initial_put, put) !=
1896 initial_put) {
1897 /* someone else beat us to it */
1898 DBUG_ON((volatile s64) ch->local_GP->put < initial_put);
1899 break;
1900 }
1901
1902 /* we just set the new value of local_GP->put */
1903
1904 dev_dbg(xpc_chan, "local_GP->put changed to %ld, partid=%d, "
1905 "channel=%d\n", put, ch->partid, ch->number);
1906
1907 send_IPI = 1;
1908
1909 /*
1910 * We need to ensure that the message referenced by
1911 * local_GP->put is not XPC_M_READY or that local_GP->put
1912 * equals w_local_GP.put, so we'll go have a look.
1913 */
1914 initial_put = put;
1915 }
1916
1917 if (send_IPI) {
1918 xpc_IPI_send_msgrequest(ch);
1919 }
1920}
1921
1922
1923/*
1924 * Common code that does the actual sending of the message by advancing the
1925 * local message queue's Put value and sending an IPI to the partition the
1926 * message is being sent to.
1927 */
1928static enum xpc_retval
1929xpc_send_msg(struct xpc_channel *ch, struct xpc_msg *msg, u8 notify_type,
1930 xpc_notify_func func, void *key)
1931{
1932 enum xpc_retval ret = xpcSuccess;
1933 struct xpc_notify *notify = notify;
1934 s64 put, msg_number = msg->number;
1935
1936
1937 DBUG_ON(notify_type == XPC_N_CALL && func == NULL);
1938 DBUG_ON((((u64) msg - (u64) ch->local_msgqueue) / ch->msg_size) !=
1939 msg_number % ch->local_nentries);
1940 DBUG_ON(msg->flags & XPC_M_READY);
1941
1942 if (ch->flags & XPC_C_DISCONNECTING) {
1943 /* drop the reference grabbed in xpc_allocate_msg() */
1944 xpc_msgqueue_deref(ch);
1945 return ch->reason;
1946 }
1947
1948 if (notify_type != 0) {
1949 /*
1950 * Tell the remote side to send an ACK interrupt when the
1951 * message has been delivered.
1952 */
1953 msg->flags |= XPC_M_INTERRUPT;
1954
1955 atomic_inc(&ch->n_to_notify);
1956
1957 notify = &ch->notify_queue[msg_number % ch->local_nentries];
1958 notify->func = func;
1959 notify->key = key;
1960 notify->type = notify_type;
1961
1962 // >>> is a mb() needed here?
1963
1964 if (ch->flags & XPC_C_DISCONNECTING) {
1965 /*
1966 * An error occurred between our last error check and
1967 * this one. We will try to clear the type field from
1968 * the notify entry. If we succeed then
1969 * xpc_disconnect_channel() didn't already process
1970 * the notify entry.
1971 */
1972 if (cmpxchg(&notify->type, notify_type, 0) ==
1973 notify_type) {
1974 atomic_dec(&ch->n_to_notify);
1975 ret = ch->reason;
1976 }
1977
1978 /* drop the reference grabbed in xpc_allocate_msg() */
1979 xpc_msgqueue_deref(ch);
1980 return ret;
1981 }
1982 }
1983
1984 msg->flags |= XPC_M_READY;
1985
1986 /*
1987 * The preceding store of msg->flags must occur before the following
1988 * load of ch->local_GP->put.
1989 */
1990 mb();
1991
1992 /* see if the message is next in line to be sent, if so send it */
1993
1994 put = ch->local_GP->put;
1995 if (put == msg_number) {
1996 xpc_send_msgs(ch, put);
1997 }
1998
1999 /* drop the reference grabbed in xpc_allocate_msg() */
2000 xpc_msgqueue_deref(ch);
2001 return ret;
2002}
2003
2004
2005/*
2006 * Send a message previously allocated using xpc_initiate_allocate() on the
2007 * specified channel connected to the specified partition.
2008 *
2009 * This routine will not wait for the message to be received, nor will
2010 * notification be given when it does happen. Once this routine has returned,
2011 * the message entry allocated via xpc_initiate_allocate() is no longer
2012 * accessible to the caller.
2013 *
2014 * This routine, although called by users, does not call xpc_part_ref() to
2015 * ensure that the partition infrastructure is in place. It relies on the
2016 * fact that we called xpc_msgqueue_ref() in xpc_allocate_msg().
2017 *
2018 * Arguments:
2019 *
2020 * partid - ID of partition to which the channel is connected.
2021 * ch_number - channel # to send message on.
2022 * payload - pointer to the payload area allocated via
2023 * xpc_initiate_allocate().
2024 */
2025enum xpc_retval
2026xpc_initiate_send(partid_t partid, int ch_number, void *payload)
2027{
2028 struct xpc_partition *part = &xpc_partitions[partid];
2029 struct xpc_msg *msg = XPC_MSG_ADDRESS(payload);
2030 enum xpc_retval ret;
2031
2032
2033 dev_dbg(xpc_chan, "msg=0x%p, partid=%d, channel=%d\n", (void *) msg,
2034 partid, ch_number);
2035
2036 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
2037 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
2038 DBUG_ON(msg == NULL);
2039
2040 ret = xpc_send_msg(&part->channels[ch_number], msg, 0, NULL, NULL);
2041
2042 return ret;
2043}
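Putting xpc_initiate_allocate() and xpc_initiate_send() together, a sender might look roughly like the sketch below. The payload layout, partition id, and channel number are hypothetical, the registered message size is assumed to be large enough for the payload, and in practice callers reach these entry points through the wrappers exported by the xp layer.

	/* Hypothetical sender; partid and ch_number come from the caller. */
	static enum xpc_retval my_send_hello(partid_t partid, int ch_number)
	{
		struct my_payload { u64 seq; char text[32]; } *p;
		void *payload;
		enum xpc_retval ret;

		ret = xpc_initiate_allocate(partid, ch_number, XPC_NOWAIT, &payload);
		if (ret != xpcSuccess)
			return ret;	/* e.g. xpcNoWait if no entry was free */

		p = payload;
		p->seq = 1;
		strlcpy(p->text, "hello", sizeof(p->text));

		/* fire and forget; no notification when the message is delivered */
		return xpc_initiate_send(partid, ch_number, payload);
	}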
2044
2045
2046/*
2047 * Send a message previously allocated using xpc_initiate_allocate() on the
2048 * specified channel connected to the specified partition.
2049 *
2050 * This routine will not wait for the message to be sent. Once this routine
2051 * has returned, the message entry allocated via xpc_initiate_allocate() is no
2052 * longer accessible to the caller.
2053 *
2054 * Once the remote end of the channel has received the message, the function
2055 * passed as an argument to xpc_initiate_send_notify() will be called. This
2056 * allows the sender to free up or re-use any buffers referenced by the
2057 * message, but does NOT mean the message has been processed at the remote
2058 * end by a receiver.
2059 *
2060 * If this routine returns an error, the caller's function will NOT be called.
2061 *
2062 * This routine, although called by users, does not call xpc_part_ref() to
2063 * ensure that the partition infrastructure is in place. It relies on the
2064 * fact that we called xpc_msgqueue_ref() in xpc_allocate_msg().
2065 *
2066 * Arguments:
2067 *
2068 * partid - ID of partition to which the channel is connected.
2069 * ch_number - channel # to send message on.
2070 * payload - pointer to the payload area allocated via
2071 * xpc_initiate_allocate().
2072 * func - function to call with asynchronous notification of message
2073 * receipt. THIS FUNCTION MUST BE NON-BLOCKING.
2074 * key - user-defined key to be passed to the function when it's called.
2075 */
2076enum xpc_retval
2077xpc_initiate_send_notify(partid_t partid, int ch_number, void *payload,
2078 xpc_notify_func func, void *key)
2079{
2080 struct xpc_partition *part = &xpc_partitions[partid];
2081 struct xpc_msg *msg = XPC_MSG_ADDRESS(payload);
2082 enum xpc_retval ret;
2083
2084
2085 dev_dbg(xpc_chan, "msg=0x%p, partid=%d, channel=%d\n", (void *) msg,
2086 partid, ch_number);
2087
2088 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
2089 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
2090 DBUG_ON(msg == NULL);
2091 DBUG_ON(func == NULL);
2092
2093 ret = xpc_send_msg(&part->channels[ch_number], msg, XPC_N_CALL,
2094 func, key);
2095 return ret;
2096}
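When delivery notification is wanted, the only difference is the extra completion function, which, as noted above, must not block. A hypothetical completion callback follows, assuming the notify callout takes (reason, partid, channel number, key); the names are invented for illustration.

	/* Runs in a context that may not block; keep it short. */
	static void my_send_done(enum xpc_retval reason, partid_t partid,
				 int ch_number, void *key)
	{
		/* reason is xpcMsgDelivered on success, else an error/disconnect code */
		complete((struct completion *) key);
	}

	/* ...payload filled in as in the earlier sender sketch...
	 * ret = xpc_initiate_send_notify(partid, ch_number, payload,
	 *				  my_send_done, &my_completion);
	 */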
2097
2098
2099static struct xpc_msg *
2100xpc_pull_remote_msg(struct xpc_channel *ch, s64 get)
2101{
2102 struct xpc_partition *part = &xpc_partitions[ch->partid];
2103 struct xpc_msg *remote_msg, *msg;
2104 u32 msg_index, nmsgs;
2105 u64 msg_offset;
2106 enum xpc_retval ret;
2107
2108
2109 if (mutex_lock_interruptible(&ch->msg_to_pull_mutex) != 0) {
2110 /* we were interrupted by a signal */
2111 return NULL;
2112 }
2113
2114 while (get >= ch->next_msg_to_pull) {
2115
2116 /* pull as many messages as are ready and able to be pulled */
2117
2118 msg_index = ch->next_msg_to_pull % ch->remote_nentries;
2119
2120 DBUG_ON(ch->next_msg_to_pull >=
2121 (volatile s64) ch->w_remote_GP.put);
2122 nmsgs = (volatile s64) ch->w_remote_GP.put -
2123 ch->next_msg_to_pull;
2124 if (msg_index + nmsgs > ch->remote_nentries) {
2125 /* ignore the ones that wrap the msg queue for now */
2126 nmsgs = ch->remote_nentries - msg_index;
2127 }
2128
2129 msg_offset = msg_index * ch->msg_size;
2130 msg = (struct xpc_msg *) ((u64) ch->remote_msgqueue +
2131 msg_offset);
2132 remote_msg = (struct xpc_msg *) (ch->remote_msgqueue_pa +
2133 msg_offset);
2134
2135 if ((ret = xpc_pull_remote_cachelines(part, msg, remote_msg,
2136 nmsgs * ch->msg_size)) != xpcSuccess) {
2137
2138 dev_dbg(xpc_chan, "failed to pull %d msgs starting with"
2139 " msg %ld from partition %d, channel=%d, "
2140 "ret=%d\n", nmsgs, ch->next_msg_to_pull,
2141 ch->partid, ch->number, ret);
2142
2143 XPC_DEACTIVATE_PARTITION(part, ret);
2144
2145 mutex_unlock(&ch->msg_to_pull_mutex);
2146 return NULL;
2147 }
2148
2149 mb(); /* >>> this may not be needed, we're not sure */
2150
2151 ch->next_msg_to_pull += nmsgs;
2152 }
2153
2154 mutex_unlock(&ch->msg_to_pull_mutex);
2155
2156 /* return the message we were looking for */
2157 msg_offset = (get % ch->remote_nentries) * ch->msg_size;
2158 msg = (struct xpc_msg *) ((u64) ch->remote_msgqueue + msg_offset);
2159
2160 return msg;
2161}
2162
2163
2164/*
2165 * Get a message to be delivered.
2166 */
2167static struct xpc_msg *
2168xpc_get_deliverable_msg(struct xpc_channel *ch)
2169{
2170 struct xpc_msg *msg = NULL;
2171 s64 get;
2172
2173
2174 do {
2175 if ((volatile u32) ch->flags & XPC_C_DISCONNECTING) {
2176 break;
2177 }
2178
2179 get = (volatile s64) ch->w_local_GP.get;
2180 if (get == (volatile s64) ch->w_remote_GP.put) {
2181 break;
2182 }
2183
2184 /* There are messages waiting to be pulled and delivered.
2185 * We need to try to secure one for ourselves. We'll do this
2186 * by trying to increment w_local_GP.get and hope that no one
2187		 * else beats us to it. If they do, we'll simply have
2188 * to try again for the next one.
2189 */
2190
2191 if (cmpxchg(&ch->w_local_GP.get, get, get + 1) == get) {
2192 /* we got the entry referenced by get */
2193
2194 dev_dbg(xpc_chan, "w_local_GP.get changed to %ld, "
2195 "partid=%d, channel=%d\n", get + 1,
2196 ch->partid, ch->number);
2197
2198 /* pull the message from the remote partition */
2199
2200 msg = xpc_pull_remote_msg(ch, get);
2201
2202 DBUG_ON(msg != NULL && msg->number != get);
2203 DBUG_ON(msg != NULL && (msg->flags & XPC_M_DONE));
2204 DBUG_ON(msg != NULL && !(msg->flags & XPC_M_READY));
2205
2206 break;
2207 }
2208
2209 } while (1);
2210
2211 return msg;
2212}
2213
2214
2215/*
2216 * Deliver a message to its intended recipient.
2217 */
2218void
2219xpc_deliver_msg(struct xpc_channel *ch)
2220{
2221 struct xpc_msg *msg;
2222
2223
2224 if ((msg = xpc_get_deliverable_msg(ch)) != NULL) {
2225
2226 /*
2227 * This ref is taken to protect the payload itself from being
2228 * freed before the user is finished with it, which the user
2229 * indicates by calling xpc_initiate_received().
2230 */
2231 xpc_msgqueue_ref(ch);
2232
2233 atomic_inc(&ch->kthreads_active);
2234
2235 if (ch->func != NULL) {
2236 dev_dbg(xpc_chan, "ch->func() called, msg=0x%p, "
2237 "msg_number=%ld, partid=%d, channel=%d\n",
2238 (void *) msg, msg->number, ch->partid,
2239 ch->number);
2240
2241 /* deliver the message to its intended recipient */
2242 ch->func(xpcMsgReceived, ch->partid, ch->number,
2243 &msg->payload, ch->key);
2244
2245 dev_dbg(xpc_chan, "ch->func() returned, msg=0x%p, "
2246 "msg_number=%ld, partid=%d, channel=%d\n",
2247 (void *) msg, msg->number, ch->partid,
2248 ch->number);
2249 }
2250
2251 atomic_dec(&ch->kthreads_active);
2252 }
2253}
2254
2255
2256/*
2257 * Now we actually acknowledge the messages that have been delivered and ack'd
2258 * by advancing the cached remote message queue's Get value and, if requested,
2259 * sending an IPI to the message sender's partition.
2260 */
2261static void
2262xpc_acknowledge_msgs(struct xpc_channel *ch, s64 initial_get, u8 msg_flags)
2263{
2264 struct xpc_msg *msg;
2265 s64 get = initial_get + 1;
2266 int send_IPI = 0;
2267
2268
2269 while (1) {
2270
2271 while (1) {
2272 if (get == (volatile s64) ch->w_local_GP.get) {
2273 break;
2274 }
2275
2276 msg = (struct xpc_msg *) ((u64) ch->remote_msgqueue +
2277 (get % ch->remote_nentries) * ch->msg_size);
2278
2279 if (!(msg->flags & XPC_M_DONE)) {
2280 break;
2281 }
2282
2283 msg_flags |= msg->flags;
2284 get++;
2285 }
2286
2287 if (get == initial_get) {
2288 /* nothing's changed */
2289 break;
2290 }
2291
2292 if (cmpxchg_rel(&ch->local_GP->get, initial_get, get) !=
2293 initial_get) {
2294 /* someone else beat us to it */
2295 DBUG_ON((volatile s64) ch->local_GP->get <=
2296 initial_get);
2297 break;
2298 }
2299
2300 /* we just set the new value of local_GP->get */
2301
2302 dev_dbg(xpc_chan, "local_GP->get changed to %ld, partid=%d, "
2303 "channel=%d\n", get, ch->partid, ch->number);
2304
2305 send_IPI = (msg_flags & XPC_M_INTERRUPT);
2306
2307 /*
2308 * We need to ensure that the message referenced by
2309 * local_GP->get is not XPC_M_DONE or that local_GP->get
2310 * equals w_local_GP.get, so we'll go have a look.
2311 */
2312 initial_get = get;
2313 }
2314
2315 if (send_IPI) {
2316 xpc_IPI_send_msgrequest(ch);
2317 }
2318}
2319
2320
2321/*
2322 * Acknowledge receipt of a delivered message.
2323 *
2324 * If a message has XPC_M_INTERRUPT set, send an interrupt to the partition
2325 * that sent the message.
2326 *
2327 * This function, although called by users, does not call xpc_part_ref() to
2328 * ensure that the partition infrastructure is in place. It relies on the
2329 * fact that we called xpc_msgqueue_ref() in xpc_deliver_msg().
2330 *
2331 * Arguments:
2332 *
2333 * partid - ID of partition to which the channel is connected.
2334 * ch_number - channel # message received on.
2335 * payload - pointer to the payload area allocated via
2336 * xpc_initiate_allocate().
2337 */
2338void
2339xpc_initiate_received(partid_t partid, int ch_number, void *payload)
2340{
2341 struct xpc_partition *part = &xpc_partitions[partid];
2342 struct xpc_channel *ch;
2343 struct xpc_msg *msg = XPC_MSG_ADDRESS(payload);
2344 s64 get, msg_number = msg->number;
2345
2346
2347 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
2348 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
2349
2350 ch = &part->channels[ch_number];
2351
2352 dev_dbg(xpc_chan, "msg=0x%p, msg_number=%ld, partid=%d, channel=%d\n",
2353 (void *) msg, msg_number, ch->partid, ch->number);
2354
2355 DBUG_ON((((u64) msg - (u64) ch->remote_msgqueue) / ch->msg_size) !=
2356 msg_number % ch->remote_nentries);
2357 DBUG_ON(msg->flags & XPC_M_DONE);
2358
2359 msg->flags |= XPC_M_DONE;
2360
2361 /*
2362 * The preceding store of msg->flags must occur before the following
2363 * load of ch->local_GP->get.
2364 */
2365 mb();
2366
2367 /*
2368 * See if this message is next in line to be acknowledged as having
2369 * been delivered.
2370 */
2371 get = ch->local_GP->get;
2372 if (get == msg_number) {
2373 xpc_acknowledge_msgs(ch, get, msg->flags);
2374 }
2375
2376 /* the call to xpc_msgqueue_ref() was done by xpc_deliver_msg() */
2377 xpc_msgqueue_deref(ch);
2378}
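On the receive side, the payload pointer handed to the channel function with reason xpcMsgReceived must eventually be passed back through this routine so the message entry can be reused. A hypothetical handler, reusing the struct my_payload layout from the earlier sender sketch (again, names are illustrative only):

	struct my_payload { u64 seq; char text[32]; };	/* as in the sender sketch */

	/* Called from the channel function's xpcMsgReceived case (hypothetical). */
	static void my_handle_msg(partid_t partid, int ch_number, void *data)
	{
		struct my_payload *p = data;	/* data points into the XPC message */

		printk(KERN_INFO "channel %d, partition %d: seq=%llu \"%s\"\n",
		       ch_number, partid, (unsigned long long) p->seq, p->text);

		/* payload consumed; hand the message entry back to XPC */
		xpc_initiate_received(partid, ch_number, data);
	}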
2379
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c
new file mode 100644
index 000000000000..bdb2cf1fcbcc
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_main.c
@@ -0,0 +1,1431 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9
10/*
11 * Cross Partition Communication (XPC) support - standard version.
12 *
13 * XPC provides a message passing capability that crosses partition
14 * boundaries. This module is made up of two parts:
15 *
16 * partition This part detects the presence/absence of other
17 * partitions. It provides a heartbeat and monitors
18 * the heartbeats of other partitions.
19 *
20 * channel This part manages the channels and sends/receives
21 * messages across them to/from other partitions.
22 *
23 * There are a couple of additional functions residing in XP, which
24 * provide an interface to XPC for its users.
25 *
26 *
27 * Caveats:
28 *
29 * . We currently have no way to determine which nasid an IPI came
30 * from. Thus, xpc_IPI_send() does a remote AMO write followed by
31 * an IPI. The AMO indicates where data is to be pulled from, so
32 * after the IPI arrives, the remote partition checks the AMO word.
33 * The IPI can actually arrive before the AMO however, so other code
34 * must periodically check for this case. Also, remote AMO operations
35 * do not reliably time out. Thus we do a remote PIO read solely to
36 * know whether the remote partition is down and whether we should
37 * stop sending IPIs to it. This remote PIO read operation is set up
38 * in a special nofault region so SAL knows to ignore (and cleanup)
39 * any errors due to the remote AMO write, PIO read, and/or PIO
40 * write operations.
41 *
42 * If/when new hardware solves this IPI problem, we should abandon
43 * the current approach.
44 *
45 */
46
47
48#include <linux/kernel.h>
49#include <linux/module.h>
50#include <linux/init.h>
51#include <linux/sched.h>
52#include <linux/syscalls.h>
53#include <linux/cache.h>
54#include <linux/interrupt.h>
55#include <linux/delay.h>
56#include <linux/reboot.h>
57#include <linux/completion.h>
58#include <linux/kdebug.h>
59#include <asm/sn/intr.h>
60#include <asm/sn/sn_sal.h>
61#include <asm/uaccess.h>
62#include "xpc.h"
63
64
65/* define two XPC debug device structures to be used with dev_dbg() et al */
66
67struct device_driver xpc_dbg_name = {
68 .name = "xpc"
69};
70
71struct device xpc_part_dbg_subname = {
72 .bus_id = {0}, /* set to "part" at xpc_init() time */
73 .driver = &xpc_dbg_name
74};
75
76struct device xpc_chan_dbg_subname = {
77 .bus_id = {0}, /* set to "chan" at xpc_init() time */
78 .driver = &xpc_dbg_name
79};
80
81struct device *xpc_part = &xpc_part_dbg_subname;
82struct device *xpc_chan = &xpc_chan_dbg_subname;
83
84
85static int xpc_kdebug_ignore;
86
87
88/* systune related variables for /proc/sys directories */
89
90static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL;
91static int xpc_hb_min_interval = 1;
92static int xpc_hb_max_interval = 10;
93
94static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL;
95static int xpc_hb_check_min_interval = 10;
96static int xpc_hb_check_max_interval = 120;
97
98int xpc_disengage_request_timelimit = XPC_DISENGAGE_REQUEST_DEFAULT_TIMELIMIT;
99static int xpc_disengage_request_min_timelimit = 0;
100static int xpc_disengage_request_max_timelimit = 120;
101
102static ctl_table xpc_sys_xpc_hb_dir[] = {
103 {
104 .ctl_name = CTL_UNNUMBERED,
105 .procname = "hb_interval",
106 .data = &xpc_hb_interval,
107 .maxlen = sizeof(int),
108 .mode = 0644,
109 .proc_handler = &proc_dointvec_minmax,
110 .strategy = &sysctl_intvec,
111 .extra1 = &xpc_hb_min_interval,
112 .extra2 = &xpc_hb_max_interval
113 },
114 {
115 .ctl_name = CTL_UNNUMBERED,
116 .procname = "hb_check_interval",
117 .data = &xpc_hb_check_interval,
118 .maxlen = sizeof(int),
119 .mode = 0644,
120 .proc_handler = &proc_dointvec_minmax,
121 .strategy = &sysctl_intvec,
122 .extra1 = &xpc_hb_check_min_interval,
123 .extra2 = &xpc_hb_check_max_interval
124 },
125 {}
126};
127static ctl_table xpc_sys_xpc_dir[] = {
128 {
129 .ctl_name = CTL_UNNUMBERED,
130 .procname = "hb",
131 .mode = 0555,
132 .child = xpc_sys_xpc_hb_dir
133 },
134 {
135 .ctl_name = CTL_UNNUMBERED,
136 .procname = "disengage_request_timelimit",
137 .data = &xpc_disengage_request_timelimit,
138 .maxlen = sizeof(int),
139 .mode = 0644,
140 .proc_handler = &proc_dointvec_minmax,
141 .strategy = &sysctl_intvec,
142 .extra1 = &xpc_disengage_request_min_timelimit,
143 .extra2 = &xpc_disengage_request_max_timelimit
144 },
145 {}
146};
147static ctl_table xpc_sys_dir[] = {
148 {
149 .ctl_name = CTL_UNNUMBERED,
150 .procname = "xpc",
151 .mode = 0555,
152 .child = xpc_sys_xpc_dir
153 },
154 {}
155};
156static struct ctl_table_header *xpc_sysctl;
157
158/* non-zero if any remote partition disengage request was timed out */
159int xpc_disengage_request_timedout;
160
161/* #of IRQs received */
162static atomic_t xpc_act_IRQ_rcvd;
163
164/* IRQ handler notifies this wait queue on receipt of an IRQ */
165static DECLARE_WAIT_QUEUE_HEAD(xpc_act_IRQ_wq);
166
167static unsigned long xpc_hb_check_timeout;
168
169/* notification that the xpc_hb_checker thread has exited */
170static DECLARE_COMPLETION(xpc_hb_checker_exited);
171
172/* notification that the xpc_discovery thread has exited */
173static DECLARE_COMPLETION(xpc_discovery_exited);
174
175
176static struct timer_list xpc_hb_timer;
177
178
179static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);
180
181
182static int xpc_system_reboot(struct notifier_block *, unsigned long, void *);
183static struct notifier_block xpc_reboot_notifier = {
184 .notifier_call = xpc_system_reboot,
185};
186
187static int xpc_system_die(struct notifier_block *, unsigned long, void *);
188static struct notifier_block xpc_die_notifier = {
189 .notifier_call = xpc_system_die,
190};
191
192
193/*
194 * Timer function to enforce the timelimit on the partition disengage request.
195 */
196static void
197xpc_timeout_partition_disengage_request(unsigned long data)
198{
199 struct xpc_partition *part = (struct xpc_partition *) data;
200
201
202 DBUG_ON(time_before(jiffies, part->disengage_request_timeout));
203
204 (void) xpc_partition_disengaged(part);
205
206 DBUG_ON(part->disengage_request_timeout != 0);
207 DBUG_ON(xpc_partition_engaged(1UL << XPC_PARTID(part)) != 0);
208}
209
210
211/*
212 * Notify the heartbeat check thread that an IRQ has been received.
213 */
214static irqreturn_t
215xpc_act_IRQ_handler(int irq, void *dev_id)
216{
217 atomic_inc(&xpc_act_IRQ_rcvd);
218 wake_up_interruptible(&xpc_act_IRQ_wq);
219 return IRQ_HANDLED;
220}
221
222
223/*
224 * Timer used to produce the heartbeat. The timer structure's function
225 * pointer is already set when this is initially called. A tunable is used to
226 * specify when the next timeout should occur.
227 */
228static void
229xpc_hb_beater(unsigned long dummy)
230{
231 xpc_vars->heartbeat++;
232
233 if (time_after_eq(jiffies, xpc_hb_check_timeout)) {
234 wake_up_interruptible(&xpc_act_IRQ_wq);
235 }
236
237 xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
238 add_timer(&xpc_hb_timer);
239}
240
241
242/*
243 * This thread is responsible for nearly all of the partition
244 * activation/deactivation.
245 */
246static int
247xpc_hb_checker(void *ignore)
248{
249 int last_IRQ_count = 0;
250 int new_IRQ_count;
251 int force_IRQ=0;
252
253
254 /* this thread was marked active by xpc_hb_init() */
255
256 daemonize(XPC_HB_CHECK_THREAD_NAME);
257
258 set_cpus_allowed(current, cpumask_of_cpu(XPC_HB_CHECK_CPU));
259
260 /* set our heartbeating to other partitions into motion */
261 xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
262 xpc_hb_beater(0);
263
264 while (!(volatile int) xpc_exiting) {
265
266 dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
267 "been received\n",
268 (int) (xpc_hb_check_timeout - jiffies),
269 atomic_read(&xpc_act_IRQ_rcvd) - last_IRQ_count);
270
271
272 /* checking of remote heartbeats is skewed by IRQ handling */
273 if (time_after_eq(jiffies, xpc_hb_check_timeout)) {
274 dev_dbg(xpc_part, "checking remote heartbeats\n");
275 xpc_check_remote_hb();
276
277 /*
278 * We need to periodically recheck to ensure no
279 * IPI/AMO pairs have been missed. That check
280 * must always reset xpc_hb_check_timeout.
281 */
282 force_IRQ = 1;
283 }
284
285
286 /* check for outstanding IRQs */
287 new_IRQ_count = atomic_read(&xpc_act_IRQ_rcvd);
288 if (last_IRQ_count < new_IRQ_count || force_IRQ != 0) {
289 force_IRQ = 0;
290
291 dev_dbg(xpc_part, "found an IRQ to process; will be "
292 "resetting xpc_hb_check_timeout\n");
293
294 last_IRQ_count += xpc_identify_act_IRQ_sender();
295 if (last_IRQ_count < new_IRQ_count) {
296 /* retry once to help avoid missing AMO */
297 (void) xpc_identify_act_IRQ_sender();
298 }
299 last_IRQ_count = new_IRQ_count;
300
301 xpc_hb_check_timeout = jiffies +
302 (xpc_hb_check_interval * HZ);
303 }
304
305 /* wait for IRQ or timeout */
306 (void) wait_event_interruptible(xpc_act_IRQ_wq,
307 (last_IRQ_count < atomic_read(&xpc_act_IRQ_rcvd) ||
308 time_after_eq(jiffies, xpc_hb_check_timeout) ||
309 (volatile int) xpc_exiting));
310 }
311
312 dev_dbg(xpc_part, "heartbeat checker is exiting\n");
313
314
315 /* mark this thread as having exited */
316 complete(&xpc_hb_checker_exited);
317 return 0;
318}
319
320
321/*
322 * This thread will attempt to discover other partitions to activate
323 * based on info provided by SAL. This new thread is short lived and
324 * will exit once discovery is complete.
325 */
326static int
327xpc_initiate_discovery(void *ignore)
328{
329 daemonize(XPC_DISCOVERY_THREAD_NAME);
330
331 xpc_discovery();
332
333 dev_dbg(xpc_part, "discovery thread is exiting\n");
334
335 /* mark this thread as having exited */
336 complete(&xpc_discovery_exited);
337 return 0;
338}
339
340
341/*
342 * Establish first contact with the remote partition. This involves pulling
343 * the XPC per partition variables from the remote partition and waiting for
344 * the remote partition to pull ours.
345 */
346static enum xpc_retval
347xpc_make_first_contact(struct xpc_partition *part)
348{
349 enum xpc_retval ret;
350
351
352 while ((ret = xpc_pull_remote_vars_part(part)) != xpcSuccess) {
353 if (ret != xpcRetry) {
354 XPC_DEACTIVATE_PARTITION(part, ret);
355 return ret;
356 }
357
358 dev_dbg(xpc_chan, "waiting to make first contact with "
359 "partition %d\n", XPC_PARTID(part));
360
361 /* wait a 1/4 of a second or so */
362 (void) msleep_interruptible(250);
363
364 if (part->act_state == XPC_P_DEACTIVATING) {
365 return part->reason;
366 }
367 }
368
369 return xpc_mark_partition_active(part);
370}
371
372
373/*
374 * The first kthread assigned to a newly activated partition is the one
375 * created by XPC HB with which it calls xpc_partition_up(). XPC hangs on to
376 * that kthread until the partition is brought down, at which time that kthread
377 * returns to XPC HB. (The return of that kthread will signify to XPC HB
378 * that XPC has dismantled all communication infrastructure for the associated
379 * partition.) This kthread becomes the channel manager for that partition.
380 *
381 * Each active partition has a channel manager, who, besides connecting and
382 * disconnecting channels, will ensure that each of the partition's connected
383 * channels has the required number of assigned kthreads to get the work done.
384 */
385static void
386xpc_channel_mgr(struct xpc_partition *part)
387{
388 while (part->act_state != XPC_P_DEACTIVATING ||
389 atomic_read(&part->nchannels_active) > 0 ||
390 !xpc_partition_disengaged(part)) {
391
392 xpc_process_channel_activity(part);
393
394
395 /*
396 * Wait until we've been requested to activate kthreads or
397 * all of the channel's message queues have been torn down or
398 * a signal is pending.
399 *
400		 * channel_mgr_requests is set to 1 after being awakened.
401		 * This is done to prevent the channel mgr from making one pass
402		 * through the loop for each request, since it will
403		 * be servicing all the requests in one pass. The reason it's
404		 * set to 1 instead of 0 is so that other kthreads will know
405		 * that the channel mgr is running and won't bother trying to
406		 * wake it up.
407 */
408 atomic_dec(&part->channel_mgr_requests);
409 (void) wait_event_interruptible(part->channel_mgr_wq,
410 (atomic_read(&part->channel_mgr_requests) > 0 ||
411 (volatile u64) part->local_IPI_amo != 0 ||
412 ((volatile u8) part->act_state ==
413 XPC_P_DEACTIVATING &&
414 atomic_read(&part->nchannels_active) == 0 &&
415 xpc_partition_disengaged(part))));
416 atomic_set(&part->channel_mgr_requests, 1);
417
418 // >>> Does it need to wakeup periodically as well? In case we
419 // >>> miscalculated the #of kthreads to wakeup or create?
420 }
421}
422
423
424/*
425 * When XPC HB determines that a partition has come up, it will create a new
426 * kthread and that kthread will call this function to attempt to set up the
427 * basic infrastructure used for Cross Partition Communication with the newly
428 * upped partition.
429 *
430 * The kthread that was created by XPC HB and which setup the XPC
431 * infrastructure will remain assigned to the partition until the partition
432 * goes down. At which time the kthread will teardown the XPC infrastructure
433 * and then exit.
434 *
435 * XPC HB will put the remote partition's XPC per partition specific variables
436 * physical address into xpc_partitions[partid].remote_vars_part_pa prior to
437 * calling xpc_partition_up().
438 */
439static void
440xpc_partition_up(struct xpc_partition *part)
441{
442 DBUG_ON(part->channels != NULL);
443
444 dev_dbg(xpc_chan, "activating partition %d\n", XPC_PARTID(part));
445
446 if (xpc_setup_infrastructure(part) != xpcSuccess) {
447 return;
448 }
449
450 /*
451 * The kthread that XPC HB called us with will become the
452 * channel manager for this partition. It will not return
453	 * to XPC HB until the partition's XPC infrastructure
454 * has been dismantled.
455 */
456
457 (void) xpc_part_ref(part); /* this will always succeed */
458
459 if (xpc_make_first_contact(part) == xpcSuccess) {
460 xpc_channel_mgr(part);
461 }
462
463 xpc_part_deref(part);
464
465 xpc_teardown_infrastructure(part);
466}
467
468
469static int
470xpc_activating(void *__partid)
471{
472 partid_t partid = (u64) __partid;
473 struct xpc_partition *part = &xpc_partitions[partid];
474 unsigned long irq_flags;
475 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
476 int ret;
477
478
479 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
480
481 spin_lock_irqsave(&part->act_lock, irq_flags);
482
483 if (part->act_state == XPC_P_DEACTIVATING) {
484 part->act_state = XPC_P_INACTIVE;
485 spin_unlock_irqrestore(&part->act_lock, irq_flags);
486 part->remote_rp_pa = 0;
487 return 0;
488 }
489
490 /* indicate the thread is activating */
491 DBUG_ON(part->act_state != XPC_P_ACTIVATION_REQ);
492 part->act_state = XPC_P_ACTIVATING;
493
494 XPC_SET_REASON(part, 0, 0);
495 spin_unlock_irqrestore(&part->act_lock, irq_flags);
496
497 dev_dbg(xpc_part, "bringing partition %d up\n", partid);
498
499 daemonize("xpc%02d", partid);
500
501 /*
502 * This thread needs to run at a realtime priority to prevent a
503 * significant performance degradation.
504 */
505 ret = sched_setscheduler(current, SCHED_FIFO, &param);
506 if (ret != 0) {
507 dev_warn(xpc_part, "unable to set pid %d to a realtime "
508 "priority, ret=%d\n", current->pid, ret);
509 }
510
511 /* allow this thread and its children to run on any CPU */
512 set_cpus_allowed(current, CPU_MASK_ALL);
513
514 /*
515 * Register the remote partition's AMOs with SAL so it can handle
516 * and cleanup errors within that address range should the remote
517 * partition go down. We don't unregister this range because it is
518 * difficult to tell when outstanding writes to the remote partition
519 * are finished and thus when it is safe to unregister. This should
520 * not result in wasted space in the SAL xp_addr_region table because
521 * we should get the same page for remote_amos_page_pa after module
522 * reloads and system reboots.
523 */
524 if (sn_register_xp_addr_region(part->remote_amos_page_pa,
525 PAGE_SIZE, 1) < 0) {
526 dev_warn(xpc_part, "xpc_partition_up(%d) failed to register "
527 "xp_addr region\n", partid);
528
529 spin_lock_irqsave(&part->act_lock, irq_flags);
530 part->act_state = XPC_P_INACTIVE;
531 XPC_SET_REASON(part, xpcPhysAddrRegFailed, __LINE__);
532 spin_unlock_irqrestore(&part->act_lock, irq_flags);
533 part->remote_rp_pa = 0;
534 return 0;
535 }
536
537 xpc_allow_hb(partid, xpc_vars);
538 xpc_IPI_send_activated(part);
539
540
541 /*
542 * xpc_partition_up() holds this thread and marks this partition as
543 * XPC_P_ACTIVE by calling xpc_hb_mark_active().
544 */
545 (void) xpc_partition_up(part);
546
547 xpc_disallow_hb(partid, xpc_vars);
548 xpc_mark_partition_inactive(part);
549
550 if (part->reason == xpcReactivating) {
551 /* interrupting ourselves results in activating partition */
552 xpc_IPI_send_reactivate(part);
553 }
554
555 return 0;
556}
557
558
559void
560xpc_activate_partition(struct xpc_partition *part)
561{
562 partid_t partid = XPC_PARTID(part);
563 unsigned long irq_flags;
564 pid_t pid;
565
566
567 spin_lock_irqsave(&part->act_lock, irq_flags);
568
569 DBUG_ON(part->act_state != XPC_P_INACTIVE);
570
571 part->act_state = XPC_P_ACTIVATION_REQ;
572 XPC_SET_REASON(part, xpcCloneKThread, __LINE__);
573
574 spin_unlock_irqrestore(&part->act_lock, irq_flags);
575
576 pid = kernel_thread(xpc_activating, (void *) ((u64) partid), 0);
577
578 if (unlikely(pid <= 0)) {
579 spin_lock_irqsave(&part->act_lock, irq_flags);
580 part->act_state = XPC_P_INACTIVE;
581 XPC_SET_REASON(part, xpcCloneKThreadFailed, __LINE__);
582 spin_unlock_irqrestore(&part->act_lock, irq_flags);
583 }
584}
585
586
587/*
588 * Handle the receipt of a SGI_XPC_NOTIFY IRQ by seeing whether the specified
589 * partition actually sent it. Since SGI_XPC_NOTIFY IRQs may be shared by more
590 * than one partition, we use an AMO_t structure per partition to indicate
591 * whether a partition has sent an IPI or not. >>> If it has, then wake up the
592 * associated kthread to handle it.
593 *
594 * All SGI_XPC_NOTIFY IRQs received by XPC are the result of IPIs sent by XPC
595 * running on other partitions.
596 *
597 * Noteworthy Arguments:
598 *
599 * irq - Interrupt ReQuest number. NOT USED.
600 *
601 * dev_id - partid of IPI's potential sender.
602 */
603irqreturn_t
604xpc_notify_IRQ_handler(int irq, void *dev_id)
605{
606 partid_t partid = (partid_t) (u64) dev_id;
607 struct xpc_partition *part = &xpc_partitions[partid];
608
609
610 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
611
612 if (xpc_part_ref(part)) {
613 xpc_check_for_channel_activity(part);
614
615 xpc_part_deref(part);
616 }
617 return IRQ_HANDLED;
618}
619
620
621/*
622 * Check to see if xpc_notify_IRQ_handler() dropped any IPIs on the floor
623 * because the write to their associated IPI amo completed after the IRQ/IPI
624 * was received.
625 */
626void
627xpc_dropped_IPI_check(struct xpc_partition *part)
628{
629 if (xpc_part_ref(part)) {
630 xpc_check_for_channel_activity(part);
631
632 part->dropped_IPI_timer.expires = jiffies +
633 XPC_P_DROPPED_IPI_WAIT;
634 add_timer(&part->dropped_IPI_timer);
635 xpc_part_deref(part);
636 }
637}
638
639
640void
641xpc_activate_kthreads(struct xpc_channel *ch, int needed)
642{
643 int idle = atomic_read(&ch->kthreads_idle);
644 int assigned = atomic_read(&ch->kthreads_assigned);
645 int wakeup;
646
647
648 DBUG_ON(needed <= 0);
649
650 if (idle > 0) {
651 wakeup = (needed > idle) ? idle : needed;
652 needed -= wakeup;
653
654 dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, "
655 "channel=%d\n", wakeup, ch->partid, ch->number);
656
657 /* only wakeup the requested number of kthreads */
658 wake_up_nr(&ch->idle_wq, wakeup);
659 }
660
661 if (needed <= 0) {
662 return;
663 }
664
665 if (needed + assigned > ch->kthreads_assigned_limit) {
666 needed = ch->kthreads_assigned_limit - assigned;
667 // >>>should never be less than 0
668 if (needed <= 0) {
669 return;
670 }
671 }
672
673 dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n",
674 needed, ch->partid, ch->number);
675
676 xpc_create_kthreads(ch, needed, 0);
677}
678
679
680/*
681 * This function is where XPC's kthreads wait for messages to deliver.
682 */
683static void
684xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
685{
686 do {
687 /* deliver messages to their intended recipients */
688
689 while ((volatile s64) ch->w_local_GP.get <
690 (volatile s64) ch->w_remote_GP.put &&
691 !((volatile u32) ch->flags &
692 XPC_C_DISCONNECTING)) {
693 xpc_deliver_msg(ch);
694 }
695
696 if (atomic_inc_return(&ch->kthreads_idle) >
697 ch->kthreads_idle_limit) {
698 /* too many idle kthreads on this channel */
699 atomic_dec(&ch->kthreads_idle);
700 break;
701 }
702
703 dev_dbg(xpc_chan, "idle kthread calling "
704 "wait_event_interruptible_exclusive()\n");
705
706 (void) wait_event_interruptible_exclusive(ch->idle_wq,
707 ((volatile s64) ch->w_local_GP.get <
708 (volatile s64) ch->w_remote_GP.put ||
709 ((volatile u32) ch->flags &
710 XPC_C_DISCONNECTING)));
711
712 atomic_dec(&ch->kthreads_idle);
713
714 } while (!((volatile u32) ch->flags & XPC_C_DISCONNECTING));
715}
716
717
718static int
719xpc_daemonize_kthread(void *args)
720{
721 partid_t partid = XPC_UNPACK_ARG1(args);
722 u16 ch_number = XPC_UNPACK_ARG2(args);
723 struct xpc_partition *part = &xpc_partitions[partid];
724 struct xpc_channel *ch;
725 int n_needed;
726 unsigned long irq_flags;
727
728
729 daemonize("xpc%02dc%d", partid, ch_number);
730
731 dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n",
732 partid, ch_number);
733
734 ch = &part->channels[ch_number];
735
736 if (!(ch->flags & XPC_C_DISCONNECTING)) {
737
738 /* let registerer know that connection has been established */
739
740 spin_lock_irqsave(&ch->lock, irq_flags);
741 if (!(ch->flags & XPC_C_CONNECTEDCALLOUT)) {
742 ch->flags |= XPC_C_CONNECTEDCALLOUT;
743 spin_unlock_irqrestore(&ch->lock, irq_flags);
744
745 xpc_connected_callout(ch);
746
747 spin_lock_irqsave(&ch->lock, irq_flags);
748 ch->flags |= XPC_C_CONNECTEDCALLOUT_MADE;
749 spin_unlock_irqrestore(&ch->lock, irq_flags);
750
751			 * It is possible that, while the callout was being
752			 * made, the remote partition sent some messages.
753 * made that the remote partition sent some messages.
754 * If that is the case, we may need to activate
755 * additional kthreads to help deliver them. We only
756 * need one less than total #of messages to deliver.
757 */
758 n_needed = ch->w_remote_GP.put - ch->w_local_GP.get - 1;
759 if (n_needed > 0 &&
760 !(ch->flags & XPC_C_DISCONNECTING)) {
761 xpc_activate_kthreads(ch, n_needed);
762 }
763 } else {
764 spin_unlock_irqrestore(&ch->lock, irq_flags);
765 }
766
767 xpc_kthread_waitmsgs(part, ch);
768 }
769
770 /* let registerer know that connection is disconnecting */
771
772 spin_lock_irqsave(&ch->lock, irq_flags);
773 if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
774 !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
775 ch->flags |= XPC_C_DISCONNECTINGCALLOUT;
776 spin_unlock_irqrestore(&ch->lock, irq_flags);
777
778 xpc_disconnect_callout(ch, xpcDisconnecting);
779
780 spin_lock_irqsave(&ch->lock, irq_flags);
781 ch->flags |= XPC_C_DISCONNECTINGCALLOUT_MADE;
782 }
783 spin_unlock_irqrestore(&ch->lock, irq_flags);
784
785 if (atomic_dec_return(&ch->kthreads_assigned) == 0) {
786 if (atomic_dec_return(&part->nchannels_engaged) == 0) {
787 xpc_mark_partition_disengaged(part);
788 xpc_IPI_send_disengage(part);
789 }
790 }
791
792 xpc_msgqueue_deref(ch);
793
794 dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n",
795 partid, ch_number);
796
797 xpc_part_deref(part);
798 return 0;
799}
800
801
802/*
803 * For each partition that XPC has established communications with, there is
804 * a minimum of one kernel thread assigned to perform any operation that
805 * may potentially sleep or block (basically the callouts to the asynchronous
806 * functions registered via xpc_connect()).
807 *
808 * Additional kthreads are created and destroyed by XPC as the workload
809 * demands.
810 *
811 * A kthread is assigned to one of the active channels that exists for a given
812 * partition.
813 */
814void
815xpc_create_kthreads(struct xpc_channel *ch, int needed,
816 int ignore_disconnecting)
817{
818 unsigned long irq_flags;
819 pid_t pid;
820 u64 args = XPC_PACK_ARGS(ch->partid, ch->number);
821 struct xpc_partition *part = &xpc_partitions[ch->partid];
822
823
824 while (needed-- > 0) {
825
826 /*
827 * The following is done on behalf of the newly created
828 * kthread. That kthread is responsible for doing the
829 * counterpart to the following before it exits.
830 */
831 if (ignore_disconnecting) {
832 if (!atomic_inc_not_zero(&ch->kthreads_assigned)) {
833 /* kthreads assigned had gone to zero */
834 BUG_ON(!(ch->flags &
835 XPC_C_DISCONNECTINGCALLOUT_MADE));
836 break;
837 }
838
839 } else if (ch->flags & XPC_C_DISCONNECTING) {
840 break;
841
842 } else if (atomic_inc_return(&ch->kthreads_assigned) == 1) {
843 if (atomic_inc_return(&part->nchannels_engaged) == 1)
844 xpc_mark_partition_engaged(part);
845 }
846 (void) xpc_part_ref(part);
847 xpc_msgqueue_ref(ch);
848
849 pid = kernel_thread(xpc_daemonize_kthread, (void *) args, 0);
850 if (pid < 0) {
851 /* the fork failed */
852
853 /*
854 * NOTE: if (ignore_disconnecting &&
855 * !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) is true,
856 * then we'll deadlock if all other kthreads assigned
857 * to this channel are blocked in the channel's
858 * registerer, because the only thing that will unblock
859 * them is the xpcDisconnecting callout that this
860 * failed kernel_thread would have made.
861 */
862
863 if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
864 atomic_dec_return(&part->nchannels_engaged) == 0) {
865 xpc_mark_partition_disengaged(part);
866 xpc_IPI_send_disengage(part);
867 }
868 xpc_msgqueue_deref(ch);
869 xpc_part_deref(part);
870
871 if (atomic_read(&ch->kthreads_assigned) <
872 ch->kthreads_idle_limit) {
873 /*
874 * Flag this as an error only if we have an
875 * insufficient #of kthreads for the channel
876 * to function.
877 */
878 spin_lock_irqsave(&ch->lock, irq_flags);
879 XPC_DISCONNECT_CHANNEL(ch, xpcLackOfResources,
880 &irq_flags);
881 spin_unlock_irqrestore(&ch->lock, irq_flags);
882 }
883 break;
884 }
885
886 ch->kthreads_created++; // >>> temporary debug only!!!
887 }
888}
889
890
891void
892xpc_disconnect_wait(int ch_number)
893{
894 unsigned long irq_flags;
895 partid_t partid;
896 struct xpc_partition *part;
897 struct xpc_channel *ch;
898 int wakeup_channel_mgr;
899
900
901 /* now wait for all callouts to the caller's function to cease */
902 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
903 part = &xpc_partitions[partid];
904
905 if (!xpc_part_ref(part)) {
906 continue;
907 }
908
909 ch = &part->channels[ch_number];
910
911 if (!(ch->flags & XPC_C_WDISCONNECT)) {
912 xpc_part_deref(part);
913 continue;
914 }
915
916 wait_for_completion(&ch->wdisconnect_wait);
917
918 spin_lock_irqsave(&ch->lock, irq_flags);
919 DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
920 wakeup_channel_mgr = 0;
921
922 if (ch->delayed_IPI_flags) {
923 if (part->act_state != XPC_P_DEACTIVATING) {
924 spin_lock(&part->IPI_lock);
925 XPC_SET_IPI_FLAGS(part->local_IPI_amo,
926 ch->number, ch->delayed_IPI_flags);
927 spin_unlock(&part->IPI_lock);
928 wakeup_channel_mgr = 1;
929 }
930 ch->delayed_IPI_flags = 0;
931 }
932
933 ch->flags &= ~XPC_C_WDISCONNECT;
934 spin_unlock_irqrestore(&ch->lock, irq_flags);
935
936 if (wakeup_channel_mgr) {
937 xpc_wakeup_channel_mgr(part);
938 }
939
940 xpc_part_deref(part);
941 }
942}
943
944
945static void
946xpc_do_exit(enum xpc_retval reason)
947{
948 partid_t partid;
949 int active_part_count, printed_waiting_msg = 0;
950 struct xpc_partition *part;
951 unsigned long printmsg_time, disengage_request_timeout = 0;
952
953
954 /* a 'rmmod XPC' and a 'reboot' cannot both end up here together */
955 DBUG_ON(xpc_exiting == 1);
956
957 /*
958 * Let the heartbeat checker thread and the discovery thread
959 * (if one is running) know that they should exit. Also wake up
960 * the heartbeat checker thread in case it's sleeping.
961 */
962 xpc_exiting = 1;
963 wake_up_interruptible(&xpc_act_IRQ_wq);
964
965 /* ignore all incoming interrupts */
966 free_irq(SGI_XPC_ACTIVATE, NULL);
967
968 /* wait for the discovery thread to exit */
969 wait_for_completion(&xpc_discovery_exited);
970
971 /* wait for the heartbeat checker thread to exit */
972 wait_for_completion(&xpc_hb_checker_exited);
973
974
975	/* sleep for a third of a second or so */
976 (void) msleep_interruptible(300);
977
978
979 /* wait for all partitions to become inactive */
980
981 printmsg_time = jiffies + (XPC_DISENGAGE_PRINTMSG_INTERVAL * HZ);
982 xpc_disengage_request_timedout = 0;
983
984 do {
985 active_part_count = 0;
986
987 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
988 part = &xpc_partitions[partid];
989
990 if (xpc_partition_disengaged(part) &&
991 part->act_state == XPC_P_INACTIVE) {
992 continue;
993 }
994
995 active_part_count++;
996
997 XPC_DEACTIVATE_PARTITION(part, reason);
998
999 if (part->disengage_request_timeout >
1000 disengage_request_timeout) {
1001 disengage_request_timeout =
1002 part->disengage_request_timeout;
1003 }
1004 }
1005
1006 if (xpc_partition_engaged(-1UL)) {
1007 if (time_after(jiffies, printmsg_time)) {
1008 dev_info(xpc_part, "waiting for remote "
1009 "partitions to disengage, timeout in "
1010 "%ld seconds\n",
1011 (disengage_request_timeout - jiffies)
1012 / HZ);
1013 printmsg_time = jiffies +
1014 (XPC_DISENGAGE_PRINTMSG_INTERVAL * HZ);
1015 printed_waiting_msg = 1;
1016 }
1017
1018 } else if (active_part_count > 0) {
1019 if (printed_waiting_msg) {
1020 dev_info(xpc_part, "waiting for local partition"
1021 " to disengage\n");
1022 printed_waiting_msg = 0;
1023 }
1024
1025 } else {
1026 if (!xpc_disengage_request_timedout) {
1027 dev_info(xpc_part, "all partitions have "
1028 "disengaged\n");
1029 }
1030 break;
1031 }
1032
1033		/* sleep for a third of a second or so */
1034 (void) msleep_interruptible(300);
1035
1036 } while (1);
1037
1038 DBUG_ON(xpc_partition_engaged(-1UL));
1039
1040
1041 /* indicate to others that our reserved page is uninitialized */
1042 xpc_rsvd_page->vars_pa = 0;
1043
1044 /* now it's time to eliminate our heartbeat */
1045 del_timer_sync(&xpc_hb_timer);
1046 DBUG_ON(xpc_vars->heartbeating_to_mask != 0);
1047
1048 if (reason == xpcUnloading) {
1049 /* take ourselves off of the reboot_notifier_list */
1050 (void) unregister_reboot_notifier(&xpc_reboot_notifier);
1051
1052 /* take ourselves off of the die_notifier list */
1053 (void) unregister_die_notifier(&xpc_die_notifier);
1054 }
1055
1056 /* close down protections for IPI operations */
1057 xpc_restrict_IPI_ops();
1058
1059
1060 /* clear the interface to XPC's functions */
1061 xpc_clear_interface();
1062
1063 if (xpc_sysctl) {
1064 unregister_sysctl_table(xpc_sysctl);
1065 }
1066
1067 kfree(xpc_remote_copy_buffer_base);
1068}
1069
1070
1071/*
1072 * This function is called when the system is being rebooted.
1073 */
1074static int
1075xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
1076{
1077 enum xpc_retval reason;
1078
1079
1080 switch (event) {
1081 case SYS_RESTART:
1082 reason = xpcSystemReboot;
1083 break;
1084 case SYS_HALT:
1085 reason = xpcSystemHalt;
1086 break;
1087 case SYS_POWER_OFF:
1088 reason = xpcSystemPoweroff;
1089 break;
1090 default:
1091 reason = xpcSystemGoingDown;
1092 }
1093
1094 xpc_do_exit(reason);
1095 return NOTIFY_DONE;
1096}
1097
1098
1099/*
1100 * Notify other partitions to disengage from all references to our memory.
1101 */
1102static void
1103xpc_die_disengage(void)
1104{
1105 struct xpc_partition *part;
1106 partid_t partid;
1107 unsigned long engaged;
1108 long time, printmsg_time, disengage_request_timeout;
1109
1110
1111 /* keep xpc_hb_checker thread from doing anything (just in case) */
1112 xpc_exiting = 1;
1113
1114 xpc_vars->heartbeating_to_mask = 0; /* indicate we're deactivated */
1115
1116 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
1117 part = &xpc_partitions[partid];
1118
1119 if (!XPC_SUPPORTS_DISENGAGE_REQUEST(part->
1120 remote_vars_version)) {
1121
1122 /* just in case it was left set by an earlier XPC */
1123 xpc_clear_partition_engaged(1UL << partid);
1124 continue;
1125 }
1126
1127 if (xpc_partition_engaged(1UL << partid) ||
1128 part->act_state != XPC_P_INACTIVE) {
1129 xpc_request_partition_disengage(part);
1130 xpc_mark_partition_disengaged(part);
1131 xpc_IPI_send_disengage(part);
1132 }
1133 }
1134
1135 time = rtc_time();
1136 printmsg_time = time +
1137 (XPC_DISENGAGE_PRINTMSG_INTERVAL * sn_rtc_cycles_per_second);
1138 disengage_request_timeout = time +
1139 (xpc_disengage_request_timelimit * sn_rtc_cycles_per_second);
1140
1141 /* wait for all other partitions to disengage from us */
1142
1143 while (1) {
1144 engaged = xpc_partition_engaged(-1UL);
1145 if (!engaged) {
1146 dev_info(xpc_part, "all partitions have disengaged\n");
1147 break;
1148 }
1149
1150 time = rtc_time();
1151 if (time >= disengage_request_timeout) {
1152 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
1153 if (engaged & (1UL << partid)) {
1154 dev_info(xpc_part, "disengage from "
1155 "remote partition %d timed "
1156 "out\n", partid);
1157 }
1158 }
1159 break;
1160 }
1161
1162 if (time >= printmsg_time) {
1163 dev_info(xpc_part, "waiting for remote partitions to "
1164 "disengage, timeout in %ld seconds\n",
1165 (disengage_request_timeout - time) /
1166 sn_rtc_cycles_per_second);
1167 printmsg_time = time +
1168 (XPC_DISENGAGE_PRINTMSG_INTERVAL *
1169 sn_rtc_cycles_per_second);
1170 }
1171 }
1172}
1173
1174
1175/*
1176 * This function is called when the system is being restarted or halted due
1177 * to some sort of system failure. If this is the case we need to notify the
1178 * other partitions to disengage from all references to our memory.
1179 * This function can also be called when our heartbeater could be offlined
1180 * for a time. In this case we need to notify other partitions to not worry
1181 * about the lack of a heartbeat.
1182 */
1183static int
1184xpc_system_die(struct notifier_block *nb, unsigned long event, void *unused)
1185{
1186 switch (event) {
1187 case DIE_MACHINE_RESTART:
1188 case DIE_MACHINE_HALT:
1189 xpc_die_disengage();
1190 break;
1191
1192 case DIE_KDEBUG_ENTER:
1193 /* Should lack of heartbeat be ignored by other partitions? */
1194 if (!xpc_kdebug_ignore) {
1195 break;
1196 }
1197 /* fall through */
1198 case DIE_MCA_MONARCH_ENTER:
1199 case DIE_INIT_MONARCH_ENTER:
1200 xpc_vars->heartbeat++;
1201 xpc_vars->heartbeat_offline = 1;
1202 break;
1203
1204 case DIE_KDEBUG_LEAVE:
1205 /* Is lack of heartbeat being ignored by other partitions? */
1206 if (!xpc_kdebug_ignore) {
1207 break;
1208 }
1209 /* fall through */
1210 case DIE_MCA_MONARCH_LEAVE:
1211 case DIE_INIT_MONARCH_LEAVE:
1212 xpc_vars->heartbeat++;
1213 xpc_vars->heartbeat_offline = 0;
1214 break;
1215 }
1216
1217 return NOTIFY_DONE;
1218}
1219
1220
1221int __init
1222xpc_init(void)
1223{
1224 int ret;
1225 partid_t partid;
1226 struct xpc_partition *part;
1227 pid_t pid;
1228 size_t buf_size;
1229
1230
1231 if (!ia64_platform_is("sn2")) {
1232 return -ENODEV;
1233 }
1234
1235
1236 buf_size = max(XPC_RP_VARS_SIZE,
1237 XPC_RP_HEADER_SIZE + XP_NASID_MASK_BYTES);
1238 xpc_remote_copy_buffer = xpc_kmalloc_cacheline_aligned(buf_size,
1239 GFP_KERNEL, &xpc_remote_copy_buffer_base);
1240 if (xpc_remote_copy_buffer == NULL)
1241 return -ENOMEM;
1242
1243 snprintf(xpc_part->bus_id, BUS_ID_SIZE, "part");
1244 snprintf(xpc_chan->bus_id, BUS_ID_SIZE, "chan");
1245
1246 xpc_sysctl = register_sysctl_table(xpc_sys_dir);
1247
1248 /*
1249 * The first few fields of each entry of xpc_partitions[] need to
1250 * be initialized now so that calls to xpc_connect() and
1251 * xpc_disconnect() can be made prior to the activation of any remote
1252 * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
1253 * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
1254 * PARTITION HAS BEEN ACTIVATED.
1255 */
1256 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
1257 part = &xpc_partitions[partid];
1258
1259 DBUG_ON((u64) part != L1_CACHE_ALIGN((u64) part));
1260
1261 part->act_IRQ_rcvd = 0;
1262 spin_lock_init(&part->act_lock);
1263 part->act_state = XPC_P_INACTIVE;
1264 XPC_SET_REASON(part, 0, 0);
1265
1266 init_timer(&part->disengage_request_timer);
1267 part->disengage_request_timer.function =
1268 xpc_timeout_partition_disengage_request;
1269 part->disengage_request_timer.data = (unsigned long) part;
1270
1271 part->setup_state = XPC_P_UNSET;
1272 init_waitqueue_head(&part->teardown_wq);
1273 atomic_set(&part->references, 0);
1274 }
1275
1276 /*
1277 * Open up protections for IPI operations (and AMO operations on
1278 * Shub 1.1 systems).
1279 */
1280 xpc_allow_IPI_ops();
1281
1282 /*
1283 * Interrupts being processed will increment this atomic variable and
1284 * awaken the heartbeat thread which will process the interrupts.
1285 */
1286 atomic_set(&xpc_act_IRQ_rcvd, 0);
1287
1288 /*
1289 * This is safe to do before the xpc_hb_checker thread has started
1290 * because the handler releases a wait queue. If an interrupt is
1291 * received before the thread is waiting, it will not go to sleep,
1292 * but rather immediately process the interrupt.
1293 */
1294 ret = request_irq(SGI_XPC_ACTIVATE, xpc_act_IRQ_handler, 0,
1295 "xpc hb", NULL);
1296 if (ret != 0) {
1297 dev_err(xpc_part, "can't register ACTIVATE IRQ handler, "
1298 "errno=%d\n", -ret);
1299
1300 xpc_restrict_IPI_ops();
1301
1302 if (xpc_sysctl) {
1303 unregister_sysctl_table(xpc_sysctl);
1304 }
1305
1306 kfree(xpc_remote_copy_buffer_base);
1307 return -EBUSY;
1308 }
1309
1310 /*
1311 * Fill the partition reserved page with the information needed by
1312 * other partitions to discover we are alive and establish initial
1313 * communications.
1314 */
1315 xpc_rsvd_page = xpc_rsvd_page_init();
1316 if (xpc_rsvd_page == NULL) {
1317 dev_err(xpc_part, "could not setup our reserved page\n");
1318
1319 free_irq(SGI_XPC_ACTIVATE, NULL);
1320 xpc_restrict_IPI_ops();
1321
1322 if (xpc_sysctl) {
1323 unregister_sysctl_table(xpc_sysctl);
1324 }
1325
1326 kfree(xpc_remote_copy_buffer_base);
1327 return -EBUSY;
1328 }
1329
1330
1331 /* add ourselves to the reboot_notifier_list */
1332 ret = register_reboot_notifier(&xpc_reboot_notifier);
1333 if (ret != 0) {
1334 dev_warn(xpc_part, "can't register reboot notifier\n");
1335 }
1336
1337 /* add ourselves to the die_notifier list */
1338 ret = register_die_notifier(&xpc_die_notifier);
1339 if (ret != 0) {
1340 dev_warn(xpc_part, "can't register die notifier\n");
1341 }
1342
1343 init_timer(&xpc_hb_timer);
1344 xpc_hb_timer.function = xpc_hb_beater;
1345
1346 /*
1347 * The real work-horse behind xpc. This processes incoming
1348 * interrupts and monitors remote heartbeats.
1349 */
1350 pid = kernel_thread(xpc_hb_checker, NULL, 0);
1351 if (pid < 0) {
1352 dev_err(xpc_part, "failed while forking hb check thread\n");
1353
1354 /* indicate to others that our reserved page is uninitialized */
1355 xpc_rsvd_page->vars_pa = 0;
1356
1357 /* take ourselves off of the reboot_notifier_list */
1358 (void) unregister_reboot_notifier(&xpc_reboot_notifier);
1359
1360 /* take ourselves off of the die_notifier list */
1361 (void) unregister_die_notifier(&xpc_die_notifier);
1362
1363 del_timer_sync(&xpc_hb_timer);
1364 free_irq(SGI_XPC_ACTIVATE, NULL);
1365 xpc_restrict_IPI_ops();
1366
1367 if (xpc_sysctl) {
1368 unregister_sysctl_table(xpc_sysctl);
1369 }
1370
1371 kfree(xpc_remote_copy_buffer_base);
1372 return -EBUSY;
1373 }
1374
1375
1376 /*
1377 * Startup a thread that will attempt to discover other partitions to
1378 * activate based on info provided by SAL. This new thread is short
1379 * lived and will exit once discovery is complete.
1380 */
1381 pid = kernel_thread(xpc_initiate_discovery, NULL, 0);
1382 if (pid < 0) {
1383 dev_err(xpc_part, "failed while forking discovery thread\n");
1384
1385 /* mark this new thread as a non-starter */
1386 complete(&xpc_discovery_exited);
1387
1388 xpc_do_exit(xpcUnloading);
1389 return -EBUSY;
1390 }
1391
1392
1393 /* set the interface to point at XPC's functions */
1394 xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
1395 xpc_initiate_allocate, xpc_initiate_send,
1396 xpc_initiate_send_notify, xpc_initiate_received,
1397 xpc_initiate_partid_to_nasids);
1398
1399 return 0;
1400}
1401module_init(xpc_init);
1402
1403
1404void __exit
1405xpc_exit(void)
1406{
1407 xpc_do_exit(xpcUnloading);
1408}
1409module_exit(xpc_exit);
1410
1411
1412MODULE_AUTHOR("Silicon Graphics, Inc.");
1413MODULE_DESCRIPTION("Cross Partition Communication (XPC) support");
1414MODULE_LICENSE("GPL");
1415
1416module_param(xpc_hb_interval, int, 0);
1417MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between "
1418 "heartbeat increments.");
1419
1420module_param(xpc_hb_check_interval, int, 0);
1421MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
1422 "heartbeat checks.");
1423
1424module_param(xpc_disengage_request_timelimit, int, 0);
1425MODULE_PARM_DESC(xpc_disengage_request_timelimit, "Number of seconds to wait "
1426 "for disengage request to complete.");
1427
1428module_param(xpc_kdebug_ignore, int, 0);
1429MODULE_PARM_DESC(xpc_kdebug_ignore, "Should lack of heartbeat be ignored by "
1430 "other partitions when dropping into kdebug.");
1431
diff --git a/drivers/misc/sgi-xp/xpc_partition.c b/drivers/misc/sgi-xp/xpc_partition.c
new file mode 100644
index 000000000000..7412dc7351cd
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_partition.c
@@ -0,0 +1,1239 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9
10/*
11 * Cross Partition Communication (XPC) partition support.
12 *
13 * This is the part of XPC that detects the presence/absence of
14 * other partitions. It provides a heartbeat and monitors the
15 * heartbeats of other partitions.
16 *
17 */
18
19
20#include <linux/kernel.h>
21#include <linux/sysctl.h>
22#include <linux/cache.h>
23#include <linux/mmzone.h>
24#include <linux/nodemask.h>
25#include <asm/uncached.h>
26#include <asm/sn/bte.h>
27#include <asm/sn/intr.h>
28#include <asm/sn/sn_sal.h>
29#include <asm/sn/nodepda.h>
30#include <asm/sn/addrs.h>
31#include "xpc.h"
32
33
34/* XPC is exiting flag */
35int xpc_exiting;
36
37
38/* SH_IPI_ACCESS shub register value on startup */
39static u64 xpc_sh1_IPI_access;
40static u64 xpc_sh2_IPI_access0;
41static u64 xpc_sh2_IPI_access1;
42static u64 xpc_sh2_IPI_access2;
43static u64 xpc_sh2_IPI_access3;
44
45
46/* original protection values for each node */
47u64 xpc_prot_vec[MAX_NUMNODES];
48
49
50/* this partition's reserved page pointers */
51struct xpc_rsvd_page *xpc_rsvd_page;
52static u64 *xpc_part_nasids;
53static u64 *xpc_mach_nasids;
54struct xpc_vars *xpc_vars;
55struct xpc_vars_part *xpc_vars_part;
56
57static int xp_nasid_mask_bytes; /* actual size in bytes of nasid mask */
58static int xp_nasid_mask_words; /* actual size in words of nasid mask */
59
60
61/*
62 * For performance reasons, each entry of xpc_partitions[] is cacheline
63 * aligned. And xpc_partitions[] is padded with an additional entry at the
64 * end so that the last legitimate entry doesn't share its cacheline with
65 * another variable.
66 */
67struct xpc_partition xpc_partitions[XP_MAX_PARTITIONS + 1];
68
69
70/*
71 * Generic buffer used to store a local copy of portions of a remote
72 * partition's reserved page (either its header and part_nasids mask,
73 * or its vars).
74 */
75char *xpc_remote_copy_buffer;
76void *xpc_remote_copy_buffer_base;
77
78
79/*
80 * Guarantee that the kmalloc'd memory is cacheline aligned.
81 */
82void *
83xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
84{
85	/* see if kmalloc will give us cacheline aligned memory by default */
86 *base = kmalloc(size, flags);
87 if (*base == NULL) {
88 return NULL;
89 }
90 if ((u64) *base == L1_CACHE_ALIGN((u64) *base)) {
91 return *base;
92 }
93 kfree(*base);
94
95 /* nope, we'll have to do it ourselves */
96 *base = kmalloc(size + L1_CACHE_BYTES, flags);
97 if (*base == NULL) {
98 return NULL;
99 }
100 return (void *) L1_CACHE_ALIGN((u64) *base);
101}
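/*
 * A minimal usage sketch (names here are only illustrative; the real callers
 * appear later in this file): the cacheline aligned pointer returned above is
 * what gets used, but it is the '*base' cookie that must later be passed to
 * kfree(), since the aligned pointer may point into the middle of the
 * over-sized allocation.
 *
 *	void *buf_base;
 *	char *buf;
 *
 *	buf = xpc_kmalloc_cacheline_aligned(XPC_RP_VARS_SIZE, GFP_KERNEL,
 *					    &buf_base);
 *	if (buf != NULL) {
 *		... use the cacheline aligned 'buf' ...
 *		kfree(buf_base);
 *	}
 */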
102
103
104/*
105 * Given a nasid, get the physical address of the partition's reserved page
106 * for that nasid. This function returns 0 on any error.
107 */
108static u64
109xpc_get_rsvd_page_pa(int nasid)
110{
111 bte_result_t bte_res;
112 s64 status;
113 u64 cookie = 0;
114 u64 rp_pa = nasid; /* seed with nasid */
115 u64 len = 0;
116 u64 buf = buf;
117 u64 buf_len = 0;
118 void *buf_base = NULL;
119
120
121 while (1) {
122
123 status = sn_partition_reserved_page_pa(buf, &cookie, &rp_pa,
124 &len);
125
126 dev_dbg(xpc_part, "SAL returned with status=%li, cookie="
127 "0x%016lx, address=0x%016lx, len=0x%016lx\n",
128 status, cookie, rp_pa, len);
129
130 if (status != SALRET_MORE_PASSES) {
131 break;
132 }
133
134 if (L1_CACHE_ALIGN(len) > buf_len) {
135 kfree(buf_base);
136 buf_len = L1_CACHE_ALIGN(len);
137 buf = (u64) xpc_kmalloc_cacheline_aligned(buf_len,
138 GFP_KERNEL, &buf_base);
139 if (buf_base == NULL) {
140 dev_err(xpc_part, "unable to kmalloc "
141 "len=0x%016lx\n", buf_len);
142 status = SALRET_ERROR;
143 break;
144 }
145 }
146
147 bte_res = xp_bte_copy(rp_pa, buf, buf_len,
148 (BTE_NOTIFY | BTE_WACQUIRE), NULL);
149 if (bte_res != BTE_SUCCESS) {
150 dev_dbg(xpc_part, "xp_bte_copy failed %i\n", bte_res);
151 status = SALRET_ERROR;
152 break;
153 }
154 }
155
156 kfree(buf_base);
157
158 if (status != SALRET_OK) {
159 rp_pa = 0;
160 }
161 dev_dbg(xpc_part, "reserved page at phys address 0x%016lx\n", rp_pa);
162 return rp_pa;
163}
164
165
166/*
167 * Fill the partition reserved page with the information needed by
168 * other partitions to discover we are alive and establish initial
169 * communications.
170 */
171struct xpc_rsvd_page *
172xpc_rsvd_page_init(void)
173{
174 struct xpc_rsvd_page *rp;
175 AMO_t *amos_page;
176 u64 rp_pa, nasid_array = 0;
177 int i, ret;
178
179
180 /* get the local reserved page's address */
181
182 preempt_disable();
183 rp_pa = xpc_get_rsvd_page_pa(cpuid_to_nasid(smp_processor_id()));
184 preempt_enable();
185 if (rp_pa == 0) {
186 dev_err(xpc_part, "SAL failed to locate the reserved page\n");
187 return NULL;
188 }
189 rp = (struct xpc_rsvd_page *) __va(rp_pa);
190
191 if (rp->partid != sn_partition_id) {
192 dev_err(xpc_part, "the reserved page's partid of %d should be "
193 "%d\n", rp->partid, sn_partition_id);
194 return NULL;
195 }
196
197 rp->version = XPC_RP_VERSION;
198
199 /* establish the actual sizes of the nasid masks */
200 if (rp->SAL_version == 1) {
201 /* SAL_version 1 didn't set the nasids_size field */
202 rp->nasids_size = 128;
203 }
204 xp_nasid_mask_bytes = rp->nasids_size;
205 xp_nasid_mask_words = xp_nasid_mask_bytes / 8;
206
207 /* setup the pointers to the various items in the reserved page */
208 xpc_part_nasids = XPC_RP_PART_NASIDS(rp);
209 xpc_mach_nasids = XPC_RP_MACH_NASIDS(rp);
210 xpc_vars = XPC_RP_VARS(rp);
211 xpc_vars_part = XPC_RP_VARS_PART(rp);
212
213 /*
214 * Before clearing xpc_vars, see if a page of AMOs had been previously
215 * allocated. If not we'll need to allocate one and set permissions
216 * so that cross-partition AMOs are allowed.
217 *
218 * The allocated AMO page needs MCA reporting to remain disabled after
219 * XPC has unloaded. To make this work, we keep a copy of the pointer
220 * to this page (i.e., amos_page) in the struct xpc_vars structure,
221 * which is pointed to by the reserved page, and re-use that saved copy
222 * on subsequent loads of XPC. This AMO page is never freed, and its
223 * memory protections are never restricted.
224 */
225 if ((amos_page = xpc_vars->amos_page) == NULL) {
226 amos_page = (AMO_t *) TO_AMO(uncached_alloc_page(0));
227 if (amos_page == NULL) {
228 dev_err(xpc_part, "can't allocate page of AMOs\n");
229 return NULL;
230 }
231
232 /*
233 * Open up AMO-R/W to cpu. This is done for Shub 1.1 systems
234 * when xpc_allow_IPI_ops() is called via xpc_hb_init().
235 */
236 if (!enable_shub_wars_1_1()) {
237 ret = sn_change_memprotect(ia64_tpa((u64) amos_page),
238 PAGE_SIZE, SN_MEMPROT_ACCESS_CLASS_1,
239 &nasid_array);
240 if (ret != 0) {
241 dev_err(xpc_part, "can't change memory "
242 "protections\n");
243 uncached_free_page(__IA64_UNCACHED_OFFSET |
244 TO_PHYS((u64) amos_page));
245 return NULL;
246 }
247 }
248 } else if (!IS_AMO_ADDRESS((u64) amos_page)) {
249 /*
250 * EFI's XPBOOT can also set amos_page in the reserved page,
251 * but it happens to leave it as an uncached physical address
252 * and we need it to be an uncached virtual, so we'll have to
253 * convert it.
254 */
255 if (!IS_AMO_PHYS_ADDRESS((u64) amos_page)) {
256 dev_err(xpc_part, "previously used amos_page address "
257 "is bad = 0x%p\n", (void *) amos_page);
258 return NULL;
259 }
260 amos_page = (AMO_t *) TO_AMO((u64) amos_page);
261 }
262
263 /* clear xpc_vars */
264 memset(xpc_vars, 0, sizeof(struct xpc_vars));
265
266 xpc_vars->version = XPC_V_VERSION;
267 xpc_vars->act_nasid = cpuid_to_nasid(0);
268 xpc_vars->act_phys_cpuid = cpu_physical_id(0);
269 xpc_vars->vars_part_pa = __pa(xpc_vars_part);
270 xpc_vars->amos_page_pa = ia64_tpa((u64) amos_page);
271 xpc_vars->amos_page = amos_page; /* save for next load of XPC */
272
273
274 /* clear xpc_vars_part */
275 memset((u64 *) xpc_vars_part, 0, sizeof(struct xpc_vars_part) *
276 XP_MAX_PARTITIONS);
277
278 /* initialize the activate IRQ related AMO variables */
279 for (i = 0; i < xp_nasid_mask_words; i++) {
280 (void) xpc_IPI_init(XPC_ACTIVATE_IRQ_AMOS + i);
281 }
282
283 /* initialize the engaged remote partitions related AMO variables */
284 (void) xpc_IPI_init(XPC_ENGAGED_PARTITIONS_AMO);
285 (void) xpc_IPI_init(XPC_DISENGAGE_REQUEST_AMO);
286
287 /* timestamp of when reserved page was setup by XPC */
288 rp->stamp = CURRENT_TIME;
289
290 /*
291 * This signifies to the remote partition that our reserved
292 * page is initialized.
293 */
294 rp->vars_pa = __pa(xpc_vars);
295
296 return rp;
297}
298
299
300/*
301 * Change protections to allow IPI operations (and AMO operations on
302 * Shub 1.1 systems).
303 */
304void
305xpc_allow_IPI_ops(void)
306{
307 int node;
308 int nasid;
309
310
311 // >>> Change SH_IPI_ACCESS code to use SAL call once it is available.
312
313 if (is_shub2()) {
314 xpc_sh2_IPI_access0 =
315 (u64) HUB_L((u64 *) LOCAL_MMR_ADDR(SH2_IPI_ACCESS0));
316 xpc_sh2_IPI_access1 =
317 (u64) HUB_L((u64 *) LOCAL_MMR_ADDR(SH2_IPI_ACCESS1));
318 xpc_sh2_IPI_access2 =
319 (u64) HUB_L((u64 *) LOCAL_MMR_ADDR(SH2_IPI_ACCESS2));
320 xpc_sh2_IPI_access3 =
321 (u64) HUB_L((u64 *) LOCAL_MMR_ADDR(SH2_IPI_ACCESS3));
322
323 for_each_online_node(node) {
324 nasid = cnodeid_to_nasid(node);
325 HUB_S((u64 *) GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS0),
326 -1UL);
327 HUB_S((u64 *) GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS1),
328 -1UL);
329 HUB_S((u64 *) GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS2),
330 -1UL);
331 HUB_S((u64 *) GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS3),
332 -1UL);
333 }
334
335 } else {
336 xpc_sh1_IPI_access =
337 (u64) HUB_L((u64 *) LOCAL_MMR_ADDR(SH1_IPI_ACCESS));
338
339 for_each_online_node(node) {
340 nasid = cnodeid_to_nasid(node);
341 HUB_S((u64 *) GLOBAL_MMR_ADDR(nasid, SH1_IPI_ACCESS),
342 -1UL);
343
344 /*
345 * Since the BIST collides with memory operations on
346 * SHUB 1.1 sn_change_memprotect() cannot be used.
347 */
348 if (enable_shub_wars_1_1()) {
349 /* open up everything */
350 xpc_prot_vec[node] = (u64) HUB_L((u64 *)
351 GLOBAL_MMR_ADDR(nasid,
352 SH1_MD_DQLP_MMR_DIR_PRIVEC0));
353 HUB_S((u64 *) GLOBAL_MMR_ADDR(nasid,
354 SH1_MD_DQLP_MMR_DIR_PRIVEC0),
355 -1UL);
356 HUB_S((u64 *) GLOBAL_MMR_ADDR(nasid,
357 SH1_MD_DQRP_MMR_DIR_PRIVEC0),
358 -1UL);
359 }
360 }
361 }
362}
363
364
365/*
366 * Restrict protections to disallow IPI operations (and AMO operations on
367 * Shub 1.1 systems).
368 */
369void
370xpc_restrict_IPI_ops(void)
371{
372 int node;
373 int nasid;
374
375
376 // >>> Change SH_IPI_ACCESS code to use SAL call once it is available.
377
378 if (is_shub2()) {
379
380 for_each_online_node(node) {
381 nasid = cnodeid_to_nasid(node);
382 HUB_S((u64 *) GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS0),
383 xpc_sh2_IPI_access0);
384 HUB_S((u64 *) GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS1),
385 xpc_sh2_IPI_access1);
386 HUB_S((u64 *) GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS2),
387 xpc_sh2_IPI_access2);
388 HUB_S((u64 *) GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS3),
389 xpc_sh2_IPI_access3);
390 }
391
392 } else {
393
394 for_each_online_node(node) {
395 nasid = cnodeid_to_nasid(node);
396 HUB_S((u64 *) GLOBAL_MMR_ADDR(nasid, SH1_IPI_ACCESS),
397 xpc_sh1_IPI_access);
398
399 if (enable_shub_wars_1_1()) {
400 HUB_S((u64 *) GLOBAL_MMR_ADDR(nasid,
401 SH1_MD_DQLP_MMR_DIR_PRIVEC0),
402 xpc_prot_vec[node]);
403 HUB_S((u64 *) GLOBAL_MMR_ADDR(nasid,
404 SH1_MD_DQRP_MMR_DIR_PRIVEC0),
405 xpc_prot_vec[node]);
406 }
407 }
408 }
409}
410
411
412/*
413 * At periodic intervals, scan through all active partitions and ensure
414 * their heartbeat is still active. If not, the partition is deactivated.
415 */
416void
417xpc_check_remote_hb(void)
418{
419 struct xpc_vars *remote_vars;
420 struct xpc_partition *part;
421 partid_t partid;
422 bte_result_t bres;
423
424
425 remote_vars = (struct xpc_vars *) xpc_remote_copy_buffer;
426
427 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
428
429 if (xpc_exiting) {
430 break;
431 }
432
433 if (partid == sn_partition_id) {
434 continue;
435 }
436
437 part = &xpc_partitions[partid];
438
439 if (part->act_state == XPC_P_INACTIVE ||
440 part->act_state == XPC_P_DEACTIVATING) {
441 continue;
442 }
443
444 /* pull the remote_hb cache line */
445 bres = xp_bte_copy(part->remote_vars_pa,
446 (u64) remote_vars,
447 XPC_RP_VARS_SIZE,
448 (BTE_NOTIFY | BTE_WACQUIRE), NULL);
449 if (bres != BTE_SUCCESS) {
450 XPC_DEACTIVATE_PARTITION(part,
451 xpc_map_bte_errors(bres));
452 continue;
453 }
454
455 dev_dbg(xpc_part, "partid = %d, heartbeat = %ld, last_heartbeat"
456 " = %ld, heartbeat_offline = %ld, HB_mask = 0x%lx\n",
457 partid, remote_vars->heartbeat, part->last_heartbeat,
458 remote_vars->heartbeat_offline,
459 remote_vars->heartbeating_to_mask);
460
461 if (((remote_vars->heartbeat == part->last_heartbeat) &&
462 (remote_vars->heartbeat_offline == 0)) ||
463 !xpc_hb_allowed(sn_partition_id, remote_vars)) {
464
465 XPC_DEACTIVATE_PARTITION(part, xpcNoHeartbeat);
466 continue;
467 }
468
469 part->last_heartbeat = remote_vars->heartbeat;
470 }
471}
472
473
474/*
475 * Get a copy of a portion of the remote partition's rsvd page.
476 *
477 * remote_rp points to a buffer that is cacheline aligned for BTE copies and
478 * is large enough to contain a copy of their reserved page header and
479 * part_nasids mask.
480 */
481static enum xpc_retval
482xpc_get_remote_rp(int nasid, u64 *discovered_nasids,
483 struct xpc_rsvd_page *remote_rp, u64 *remote_rp_pa)
484{
485 int bres, i;
486
487
488 /* get the reserved page's physical address */
489
490 *remote_rp_pa = xpc_get_rsvd_page_pa(nasid);
491 if (*remote_rp_pa == 0) {
492 return xpcNoRsvdPageAddr;
493 }
494
495
496 /* pull over the reserved page header and part_nasids mask */
497 bres = xp_bte_copy(*remote_rp_pa, (u64) remote_rp,
498 XPC_RP_HEADER_SIZE + xp_nasid_mask_bytes,
499 (BTE_NOTIFY | BTE_WACQUIRE), NULL);
500 if (bres != BTE_SUCCESS) {
501 return xpc_map_bte_errors(bres);
502 }
503
504
505 if (discovered_nasids != NULL) {
506 u64 *remote_part_nasids = XPC_RP_PART_NASIDS(remote_rp);
507
508
509 for (i = 0; i < xp_nasid_mask_words; i++) {
510 discovered_nasids[i] |= remote_part_nasids[i];
511 }
512 }
513
514
515 /* check that the partid is for another partition */
516
517 if (remote_rp->partid < 1 ||
518 remote_rp->partid > (XP_MAX_PARTITIONS - 1)) {
519 return xpcInvalidPartid;
520 }
521
522 if (remote_rp->partid == sn_partition_id) {
523 return xpcLocalPartid;
524 }
525
526
527 if (XPC_VERSION_MAJOR(remote_rp->version) !=
528 XPC_VERSION_MAJOR(XPC_RP_VERSION)) {
529 return xpcBadVersion;
530 }
531
532 return xpcSuccess;
533}
534
535
536/*
537 * Get a copy of the remote partition's XPC variables from the reserved page.
538 *
539 * remote_vars points to a buffer that is cacheline aligned for BTE copies and
540 * assumed to be of size XPC_RP_VARS_SIZE.
541 */
542static enum xpc_retval
543xpc_get_remote_vars(u64 remote_vars_pa, struct xpc_vars *remote_vars)
544{
545 int bres;
546
547
548 if (remote_vars_pa == 0) {
549 return xpcVarsNotSet;
550 }
551
552 /* pull over the cross partition variables */
553 bres = xp_bte_copy(remote_vars_pa, (u64) remote_vars, XPC_RP_VARS_SIZE,
554 (BTE_NOTIFY | BTE_WACQUIRE), NULL);
555 if (bres != BTE_SUCCESS) {
556 return xpc_map_bte_errors(bres);
557 }
558
559 if (XPC_VERSION_MAJOR(remote_vars->version) !=
560 XPC_VERSION_MAJOR(XPC_V_VERSION)) {
561 return xpcBadVersion;
562 }
563
564 return xpcSuccess;
565}
566
567
568/*
569 * Update the remote partition's info.
570 */
571static void
572xpc_update_partition_info(struct xpc_partition *part, u8 remote_rp_version,
573 struct timespec *remote_rp_stamp, u64 remote_rp_pa,
574 u64 remote_vars_pa, struct xpc_vars *remote_vars)
575{
576 part->remote_rp_version = remote_rp_version;
577 dev_dbg(xpc_part, " remote_rp_version = 0x%016x\n",
578 part->remote_rp_version);
579
580 part->remote_rp_stamp = *remote_rp_stamp;
581 dev_dbg(xpc_part, " remote_rp_stamp (tv_sec = 0x%lx tv_nsec = 0x%lx\n",
582 part->remote_rp_stamp.tv_sec, part->remote_rp_stamp.tv_nsec);
583
584 part->remote_rp_pa = remote_rp_pa;
585 dev_dbg(xpc_part, " remote_rp_pa = 0x%016lx\n", part->remote_rp_pa);
586
587 part->remote_vars_pa = remote_vars_pa;
588 dev_dbg(xpc_part, " remote_vars_pa = 0x%016lx\n",
589 part->remote_vars_pa);
590
591 part->last_heartbeat = remote_vars->heartbeat;
592 dev_dbg(xpc_part, " last_heartbeat = 0x%016lx\n",
593 part->last_heartbeat);
594
595 part->remote_vars_part_pa = remote_vars->vars_part_pa;
596 dev_dbg(xpc_part, " remote_vars_part_pa = 0x%016lx\n",
597 part->remote_vars_part_pa);
598
599 part->remote_act_nasid = remote_vars->act_nasid;
600 dev_dbg(xpc_part, " remote_act_nasid = 0x%x\n",
601 part->remote_act_nasid);
602
603 part->remote_act_phys_cpuid = remote_vars->act_phys_cpuid;
604 dev_dbg(xpc_part, " remote_act_phys_cpuid = 0x%x\n",
605 part->remote_act_phys_cpuid);
606
607 part->remote_amos_page_pa = remote_vars->amos_page_pa;
608 dev_dbg(xpc_part, " remote_amos_page_pa = 0x%lx\n",
609 part->remote_amos_page_pa);
610
611 part->remote_vars_version = remote_vars->version;
612 dev_dbg(xpc_part, " remote_vars_version = 0x%x\n",
613 part->remote_vars_version);
614}
615
616
617/*
618 * Prior code has determined the nasid which generated an IPI. Inspect
619 * that nasid to determine if its partition needs to be activated or
620 * deactivated.
621 *
622 * A partition is considered "awaiting activation" if our partition
623 * flags indicate it is not active and it has a heartbeat. A
624 * partition is considered "awaiting deactivation" if our partition
625 * flags indicate it is active but it has no heartbeat or it is not
626 * sending its heartbeat to us.
627 *
628 * To determine the heartbeat, the remote nasid must have a properly
629 * initialized reserved page.
630 */
631static void
632xpc_identify_act_IRQ_req(int nasid)
633{
634 struct xpc_rsvd_page *remote_rp;
635 struct xpc_vars *remote_vars;
636 u64 remote_rp_pa;
637 u64 remote_vars_pa;
638 int remote_rp_version;
639 int reactivate = 0;
640 int stamp_diff;
641 struct timespec remote_rp_stamp = { 0, 0 };
642 partid_t partid;
643 struct xpc_partition *part;
644 enum xpc_retval ret;
645
646
647 /* pull over the reserved page structure */
648
649 remote_rp = (struct xpc_rsvd_page *) xpc_remote_copy_buffer;
650
651 ret = xpc_get_remote_rp(nasid, NULL, remote_rp, &remote_rp_pa);
652 if (ret != xpcSuccess) {
653 dev_warn(xpc_part, "unable to get reserved page from nasid %d, "
654 "which sent interrupt, reason=%d\n", nasid, ret);
655 return;
656 }
657
658 remote_vars_pa = remote_rp->vars_pa;
659 remote_rp_version = remote_rp->version;
660 if (XPC_SUPPORTS_RP_STAMP(remote_rp_version)) {
661 remote_rp_stamp = remote_rp->stamp;
662 }
663 partid = remote_rp->partid;
664 part = &xpc_partitions[partid];
665
666
667 /* pull over the cross partition variables */
668
669 remote_vars = (struct xpc_vars *) xpc_remote_copy_buffer;
670
671 ret = xpc_get_remote_vars(remote_vars_pa, remote_vars);
672 if (ret != xpcSuccess) {
673
674 dev_warn(xpc_part, "unable to get XPC variables from nasid %d, "
675 "which sent interrupt, reason=%d\n", nasid, ret);
676
677 XPC_DEACTIVATE_PARTITION(part, ret);
678 return;
679 }
680
681
682 part->act_IRQ_rcvd++;
683
684 dev_dbg(xpc_part, "partid for nasid %d is %d; IRQs = %d; HB = "
685 "%ld:0x%lx\n", (int) nasid, (int) partid, part->act_IRQ_rcvd,
686 remote_vars->heartbeat, remote_vars->heartbeating_to_mask);
687
688 if (xpc_partition_disengaged(part) &&
689 part->act_state == XPC_P_INACTIVE) {
690
691 xpc_update_partition_info(part, remote_rp_version,
692 &remote_rp_stamp, remote_rp_pa,
693 remote_vars_pa, remote_vars);
694
695 if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) {
696 if (xpc_partition_disengage_requested(1UL << partid)) {
697 /*
698 * Other side is waiting on us to disengage,
699 * even though we already have.
700 */
701 return;
702 }
703 } else {
704 /* other side doesn't support disengage requests */
705 xpc_clear_partition_disengage_request(1UL << partid);
706 }
707
708 xpc_activate_partition(part);
709 return;
710 }
711
712 DBUG_ON(part->remote_rp_version == 0);
713 DBUG_ON(part->remote_vars_version == 0);
714
715 if (!XPC_SUPPORTS_RP_STAMP(part->remote_rp_version)) {
716 DBUG_ON(XPC_SUPPORTS_DISENGAGE_REQUEST(part->
717 remote_vars_version));
718
719 if (!XPC_SUPPORTS_RP_STAMP(remote_rp_version)) {
720 DBUG_ON(XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->
721 version));
722 /* see if the other side rebooted */
723 if (part->remote_amos_page_pa ==
724 remote_vars->amos_page_pa &&
725 xpc_hb_allowed(sn_partition_id,
726 remote_vars)) {
727 /* doesn't look that way, so ignore the IPI */
728 return;
729 }
730 }
731
732 /*
733 * Other side rebooted and previous XPC didn't support the
734 * disengage request, so we don't need to do anything special.
735 */
736
737 xpc_update_partition_info(part, remote_rp_version,
738 &remote_rp_stamp, remote_rp_pa,
739 remote_vars_pa, remote_vars);
740 part->reactivate_nasid = nasid;
741 XPC_DEACTIVATE_PARTITION(part, xpcReactivating);
742 return;
743 }
744
745 DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version));
746
747 if (!XPC_SUPPORTS_RP_STAMP(remote_rp_version)) {
748 DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->version));
749
750 /*
751 * Other side rebooted and previous XPC did support the
752 * disengage request, but the new one doesn't.
753 */
754
755 xpc_clear_partition_engaged(1UL << partid);
756 xpc_clear_partition_disengage_request(1UL << partid);
757
758 xpc_update_partition_info(part, remote_rp_version,
759 &remote_rp_stamp, remote_rp_pa,
760 remote_vars_pa, remote_vars);
761 reactivate = 1;
762
763 } else {
764 DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->version));
765
766 stamp_diff = xpc_compare_stamps(&part->remote_rp_stamp,
767 &remote_rp_stamp);
768 if (stamp_diff != 0) {
769 DBUG_ON(stamp_diff >= 0);
770
771 /*
772 * Other side rebooted and the previous XPC did support
773 * the disengage request, as does the new one.
774 */
775
776 DBUG_ON(xpc_partition_engaged(1UL << partid));
777 DBUG_ON(xpc_partition_disengage_requested(1UL <<
778 partid));
779
780 xpc_update_partition_info(part, remote_rp_version,
781 &remote_rp_stamp, remote_rp_pa,
782 remote_vars_pa, remote_vars);
783 reactivate = 1;
784 }
785 }
786
787 if (part->disengage_request_timeout > 0 &&
788 !xpc_partition_disengaged(part)) {
789 /* still waiting on other side to disengage from us */
790 return;
791 }
792
793 if (reactivate) {
794 part->reactivate_nasid = nasid;
795 XPC_DEACTIVATE_PARTITION(part, xpcReactivating);
796
797 } else if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version) &&
798 xpc_partition_disengage_requested(1UL << partid)) {
799 XPC_DEACTIVATE_PARTITION(part, xpcOtherGoingDown);
800 }
801}
802
803
804/*
805 * Loop through the activation AMO variables and process any bits
806 * which are set. Each bit indicates a nasid sending a partition
807 * activation or deactivation request.
808 *
809 * Return #of IRQs detected.
810 */
811int
812xpc_identify_act_IRQ_sender(void)
813{
814 int word, bit;
815 u64 nasid_mask;
816 u64 nasid; /* remote nasid */
817 int n_IRQs_detected = 0;
818 AMO_t *act_amos;
819
820
821 act_amos = xpc_vars->amos_page + XPC_ACTIVATE_IRQ_AMOS;
822
823
824 /* scan through act AMO variable looking for non-zero entries */
825 for (word = 0; word < xp_nasid_mask_words; word++) {
826
827 if (xpc_exiting) {
828 break;
829 }
830
831 nasid_mask = xpc_IPI_receive(&act_amos[word]);
832 if (nasid_mask == 0) {
833 /* no IRQs from nasids in this variable */
834 continue;
835 }
836
837 dev_dbg(xpc_part, "AMO[%d] gave back 0x%lx\n", word,
838 nasid_mask);
839
840
841 /*
842 * If this nasid has been added to the machine since
843 * our partition was reset, this will retain the
844		 * remote nasid in our reserved page's machine mask.
845 * This is used in the event of module reload.
846 */
847 xpc_mach_nasids[word] |= nasid_mask;
848
849
850 /* locate the nasid(s) which sent interrupts */
851
852 for (bit = 0; bit < (8 * sizeof(u64)); bit++) {
853 if (nasid_mask & (1UL << bit)) {
854 n_IRQs_detected++;
855 nasid = XPC_NASID_FROM_W_B(word, bit);
856 dev_dbg(xpc_part, "interrupt from nasid %ld\n",
857 nasid);
858 xpc_identify_act_IRQ_req(nasid);
859 }
860 }
861 }
862 return n_IRQs_detected;
863}
864
865
866/*
867 * See if the other side has responded to a partition disengage request
868 * from us.
869 */
870int
871xpc_partition_disengaged(struct xpc_partition *part)
872{
873 partid_t partid = XPC_PARTID(part);
874 int disengaged;
875
876
877 disengaged = (xpc_partition_engaged(1UL << partid) == 0);
878 if (part->disengage_request_timeout) {
879 if (!disengaged) {
880 if (time_before(jiffies, part->disengage_request_timeout)) {
881 /* timelimit hasn't been reached yet */
882 return 0;
883 }
884
885 /*
886 * Other side hasn't responded to our disengage
887 * request in a timely fashion, so assume it's dead.
888 */
889
890 dev_info(xpc_part, "disengage from remote partition %d "
891 "timed out\n", partid);
892 xpc_disengage_request_timedout = 1;
893 xpc_clear_partition_engaged(1UL << partid);
894 disengaged = 1;
895 }
896 part->disengage_request_timeout = 0;
897
898 /* cancel the timer function, provided it's not us */
899 if (!in_interrupt()) {
900 del_singleshot_timer_sync(&part->
901 disengage_request_timer);
902 }
903
904 DBUG_ON(part->act_state != XPC_P_DEACTIVATING &&
905 part->act_state != XPC_P_INACTIVE);
906 if (part->act_state != XPC_P_INACTIVE) {
907 xpc_wakeup_channel_mgr(part);
908 }
909
910 if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) {
911 xpc_cancel_partition_disengage_request(part);
912 }
913 }
914 return disengaged;
915}
916
917
918/*
919 * Mark specified partition as active.
920 */
921enum xpc_retval
922xpc_mark_partition_active(struct xpc_partition *part)
923{
924 unsigned long irq_flags;
925 enum xpc_retval ret;
926
927
928 dev_dbg(xpc_part, "setting partition %d to ACTIVE\n", XPC_PARTID(part));
929
930 spin_lock_irqsave(&part->act_lock, irq_flags);
931 if (part->act_state == XPC_P_ACTIVATING) {
932 part->act_state = XPC_P_ACTIVE;
933 ret = xpcSuccess;
934 } else {
935 DBUG_ON(part->reason == xpcSuccess);
936 ret = part->reason;
937 }
938 spin_unlock_irqrestore(&part->act_lock, irq_flags);
939
940 return ret;
941}
942
943
944/*
945 * Notify XPC that the partition is down.
946 */
947void
948xpc_deactivate_partition(const int line, struct xpc_partition *part,
949 enum xpc_retval reason)
950{
951 unsigned long irq_flags;
952
953
954 spin_lock_irqsave(&part->act_lock, irq_flags);
955
956 if (part->act_state == XPC_P_INACTIVE) {
957 XPC_SET_REASON(part, reason, line);
958 spin_unlock_irqrestore(&part->act_lock, irq_flags);
959 if (reason == xpcReactivating) {
960 /* we interrupt ourselves to reactivate partition */
961 xpc_IPI_send_reactivate(part);
962 }
963 return;
964 }
965 if (part->act_state == XPC_P_DEACTIVATING) {
966 if ((part->reason == xpcUnloading && reason != xpcUnloading) ||
967 reason == xpcReactivating) {
968 XPC_SET_REASON(part, reason, line);
969 }
970 spin_unlock_irqrestore(&part->act_lock, irq_flags);
971 return;
972 }
973
974 part->act_state = XPC_P_DEACTIVATING;
975 XPC_SET_REASON(part, reason, line);
976
977 spin_unlock_irqrestore(&part->act_lock, irq_flags);
978
979 if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) {
980 xpc_request_partition_disengage(part);
981 xpc_IPI_send_disengage(part);
982
983 /* set a timelimit on the disengage request */
984 part->disengage_request_timeout = jiffies +
985 (xpc_disengage_request_timelimit * HZ);
986 part->disengage_request_timer.expires =
987 part->disengage_request_timeout;
988 add_timer(&part->disengage_request_timer);
989 }
990
991 dev_dbg(xpc_part, "bringing partition %d down, reason = %d\n",
992 XPC_PARTID(part), reason);
993
994 xpc_partition_going_down(part, reason);
995}
996
997
998/*
999 * Mark specified partition as inactive.
1000 */
1001void
1002xpc_mark_partition_inactive(struct xpc_partition *part)
1003{
1004 unsigned long irq_flags;
1005
1006
1007 dev_dbg(xpc_part, "setting partition %d to INACTIVE\n",
1008 XPC_PARTID(part));
1009
1010 spin_lock_irqsave(&part->act_lock, irq_flags);
1011 part->act_state = XPC_P_INACTIVE;
1012 spin_unlock_irqrestore(&part->act_lock, irq_flags);
1013 part->remote_rp_pa = 0;
1014}
1015
1016
1017/*
1018 * SAL has provided a partition and machine mask. The partition mask
1019 * contains a bit for each even nasid in our partition. The machine
1020 * mask contains a bit for each even nasid in the entire machine.
1021 *
1022 * Using those two bit arrays, we can determine which nasids are
1023 * known in the machine. Each should also have a reserved page
1024 * initialized if they are available for partitioning.
1025 */
1026void
1027xpc_discovery(void)
1028{
1029 void *remote_rp_base;
1030 struct xpc_rsvd_page *remote_rp;
1031 struct xpc_vars *remote_vars;
1032 u64 remote_rp_pa;
1033 u64 remote_vars_pa;
1034 int region;
1035 int region_size;
1036 int max_regions;
1037 int nasid;
1038 struct xpc_rsvd_page *rp;
1039 partid_t partid;
1040 struct xpc_partition *part;
1041 u64 *discovered_nasids;
1042 enum xpc_retval ret;
1043
1044
1045 remote_rp = xpc_kmalloc_cacheline_aligned(XPC_RP_HEADER_SIZE +
1046 xp_nasid_mask_bytes,
1047 GFP_KERNEL, &remote_rp_base);
1048 if (remote_rp == NULL) {
1049 return;
1050 }
1051 remote_vars = (struct xpc_vars *) remote_rp;
1052
1053
1054 discovered_nasids = kzalloc(sizeof(u64) * xp_nasid_mask_words,
1055 GFP_KERNEL);
1056 if (discovered_nasids == NULL) {
1057 kfree(remote_rp_base);
1058 return;
1059 }
1060
1061 rp = (struct xpc_rsvd_page *) xpc_rsvd_page;
1062
1063 /*
1064 * The term 'region' in this context refers to the minimum number of
1065 * nodes that can comprise an access protection grouping. The access
1066 * protection is in regards to memory, IOI and IPI.
1067 */
1068 max_regions = 64;
1069 region_size = sn_region_size;
1070
1071 switch (region_size) {
1072 case 128:
1073 max_regions *= 2;
1074 case 64:
1075 max_regions *= 2;
1076 case 32:
1077 max_regions *= 2;
1078 region_size = 16;
1079 DBUG_ON(!is_shub2());
1080 }
1081
1082 for (region = 0; region < max_regions; region++) {
1083
1084 if ((volatile int) xpc_exiting) {
1085 break;
1086 }
1087
1088 dev_dbg(xpc_part, "searching region %d\n", region);
1089
1090 for (nasid = (region * region_size * 2);
1091 nasid < ((region + 1) * region_size * 2);
1092 nasid += 2) {
1093
1094 if ((volatile int) xpc_exiting) {
1095 break;
1096 }
1097
1098 dev_dbg(xpc_part, "checking nasid %d\n", nasid);
1099
1100
1101 if (XPC_NASID_IN_ARRAY(nasid, xpc_part_nasids)) {
1102 dev_dbg(xpc_part, "PROM indicates Nasid %d is "
1103 "part of the local partition; skipping "
1104 "region\n", nasid);
1105 break;
1106 }
1107
1108 if (!(XPC_NASID_IN_ARRAY(nasid, xpc_mach_nasids))) {
1109 dev_dbg(xpc_part, "PROM indicates Nasid %d was "
1110 "not on Numa-Link network at reset\n",
1111 nasid);
1112 continue;
1113 }
1114
1115 if (XPC_NASID_IN_ARRAY(nasid, discovered_nasids)) {
1116 dev_dbg(xpc_part, "Nasid %d is part of a "
1117 "partition which was previously "
1118 "discovered\n", nasid);
1119 continue;
1120 }
1121
1122
1123 /* pull over the reserved page structure */
1124
1125 ret = xpc_get_remote_rp(nasid, discovered_nasids,
1126 remote_rp, &remote_rp_pa);
1127 if (ret != xpcSuccess) {
1128 dev_dbg(xpc_part, "unable to get reserved page "
1129 "from nasid %d, reason=%d\n", nasid,
1130 ret);
1131
1132 if (ret == xpcLocalPartid) {
1133 break;
1134 }
1135 continue;
1136 }
1137
1138 remote_vars_pa = remote_rp->vars_pa;
1139
1140 partid = remote_rp->partid;
1141 part = &xpc_partitions[partid];
1142
1143
1144 /* pull over the cross partition variables */
1145
1146 ret = xpc_get_remote_vars(remote_vars_pa, remote_vars);
1147 if (ret != xpcSuccess) {
1148 dev_dbg(xpc_part, "unable to get XPC variables "
1149 "from nasid %d, reason=%d\n", nasid,
1150 ret);
1151
1152 XPC_DEACTIVATE_PARTITION(part, ret);
1153 continue;
1154 }
1155
1156 if (part->act_state != XPC_P_INACTIVE) {
1157 dev_dbg(xpc_part, "partition %d on nasid %d is "
1158 "already activating\n", partid, nasid);
1159 break;
1160 }
1161
1162 /*
1163 * Register the remote partition's AMOs with SAL so it
1164 * can handle and cleanup errors within that address
1165 * range should the remote partition go down. We don't
1166 * unregister this range because it is difficult to
1167 * tell when outstanding writes to the remote partition
1168			 * are finished and thus when it is safe to
1169 * unregister. This should not result in wasted space
1170 * in the SAL xp_addr_region table because we should
1171 * get the same page for remote_act_amos_pa after
1172 * module reloads and system reboots.
1173 */
1174 if (sn_register_xp_addr_region(
1175 remote_vars->amos_page_pa,
1176 PAGE_SIZE, 1) < 0) {
1177 dev_dbg(xpc_part, "partition %d failed to "
1178 "register xp_addr region 0x%016lx\n",
1179 partid, remote_vars->amos_page_pa);
1180
1181 XPC_SET_REASON(part, xpcPhysAddrRegFailed,
1182 __LINE__);
1183 break;
1184 }
1185
1186 /*
1187 * The remote nasid is valid and available.
1188 * Send an interrupt to that nasid to notify
1189 * it that we are ready to begin activation.
1190 */
1191 dev_dbg(xpc_part, "sending an interrupt to AMO 0x%lx, "
1192 "nasid %d, phys_cpuid 0x%x\n",
1193 remote_vars->amos_page_pa,
1194 remote_vars->act_nasid,
1195 remote_vars->act_phys_cpuid);
1196
1197 if (XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->
1198 version)) {
1199 part->remote_amos_page_pa =
1200 remote_vars->amos_page_pa;
1201 xpc_mark_partition_disengaged(part);
1202 xpc_cancel_partition_disengage_request(part);
1203 }
1204 xpc_IPI_send_activate(remote_vars);
1205 }
1206 }
1207
1208 kfree(discovered_nasids);
1209 kfree(remote_rp_base);
1210}
1211
1212
1213/*
1214 * Given a partid, get the nasids owned by that partition from the
1215 * remote partition's reserved page.
1216 */
1217enum xpc_retval
1218xpc_initiate_partid_to_nasids(partid_t partid, void *nasid_mask)
1219{
1220 struct xpc_partition *part;
1221 u64 part_nasid_pa;
1222 int bte_res;
1223
1224
1225 part = &xpc_partitions[partid];
1226 if (part->remote_rp_pa == 0) {
1227 return xpcPartitionDown;
1228 }
1229
1230 memset(nasid_mask, 0, XP_NASID_MASK_BYTES);
1231
1232 part_nasid_pa = (u64) XPC_RP_PART_NASIDS(part->remote_rp_pa);
1233
1234 bte_res = xp_bte_copy(part_nasid_pa, (u64) nasid_mask,
1235 xp_nasid_mask_bytes, (BTE_NOTIFY | BTE_WACQUIRE), NULL);
1236
1237 return xpc_map_bte_errors(bte_res);
1238}
1239
diff --git a/drivers/misc/sgi-xp/xpnet.c b/drivers/misc/sgi-xp/xpnet.c
new file mode 100644
index 000000000000..38552f37e53d
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpnet.c
@@ -0,0 +1,718 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (C) 1999-2008 Silicon Graphics, Inc. All rights reserved.
7 */
8
9
10/*
11 * Cross Partition Network Interface (XPNET) support
12 *
13 * XPNET provides a virtual network layered on top of the Cross
14 * Partition communication layer.
15 *
16 * XPNET provides direct point-to-point and broadcast-like support
17 * for an ethernet-like device. The ethernet broadcast medium is
18 * replaced with a point-to-point message structure which passes
19 * pointers to a DMA-capable block that a remote partition should
20 * retrieve and pass to the upper level networking layer.
21 *
22 */
23
24
25#include <linux/module.h>
26#include <linux/kernel.h>
27#include <linux/init.h>
28#include <linux/ioport.h>
29#include <linux/netdevice.h>
30#include <linux/etherdevice.h>
31#include <linux/delay.h>
32#include <linux/ethtool.h>
33#include <linux/mii.h>
34#include <linux/smp.h>
35#include <linux/string.h>
36#include <asm/sn/bte.h>
37#include <asm/sn/io.h>
38#include <asm/sn/sn_sal.h>
39#include <asm/types.h>
40#include <asm/atomic.h>
41#include "xp.h"
42
43
44/*
45 * The message payload transferred by XPC.
46 *
47 * buf_pa is the physical address where the DMA should pull from.
48 *
49 * NOTE: for performance reasons, buf_pa should _ALWAYS_ begin on a
50 * cacheline boundary. To accomplish this, we record the number of
51 * bytes from the beginning of the first cacheline to the first useful
52 * byte of the skb (leadin_ignore) and the number of bytes from the
53 * last useful byte of the skb to the end of the last cacheline
54 * (tailout_ignore).
55 *
56 * size is the number of bytes to transfer which includes the skb->len
57 * (useful bytes of the sender's skb) plus the leadin and tailout
58 */
59struct xpnet_message {
60 u16 version; /* Version for this message */
61 u16 embedded_bytes; /* #of bytes embedded in XPC message */
62 u32 magic; /* Special number indicating this is xpnet */
63 u64 buf_pa; /* phys address of buffer to retrieve */
64 u32 size; /* #of bytes in buffer */
65 u8 leadin_ignore; /* #of bytes to ignore at the beginning */
66 u8 tailout_ignore; /* #of bytes to ignore at the end */
67 unsigned char data; /* body of small packets */
68};
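/*
 * Worked example of the leadin/tailout bookkeeping above (illustrative only;
 * it assumes L1_CACHE_BYTES == 128): if the useful skb data is 1000 bytes
 * long and starts 40 bytes into a cacheline, then
 *
 *	leadin_ignore  = 40
 *	size           = L1_CACHE_ALIGN(40 + 1000) = 1152
 *	tailout_ignore = 1152 - 1000 - 40 = 112
 *
 * and buf_pa points at the start of that first cacheline, so the BTE
 * transfer stays cacheline aligned on both ends.
 */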
69
70/*
71 * Determine the size of our message, the cacheline aligned size,
72 * and then the number of message will request from XPC.
73 *
74 * XPC expects each message to exist in an individual cacheline.
75 */
76#define XPNET_MSG_SIZE (L1_CACHE_BYTES - XPC_MSG_PAYLOAD_OFFSET)
77#define XPNET_MSG_DATA_MAX \
78 (XPNET_MSG_SIZE - (u64)(&((struct xpnet_message *)0)->data))
79#define XPNET_MSG_ALIGNED_SIZE (L1_CACHE_ALIGN(XPNET_MSG_SIZE))
80#define XPNET_MSG_NENTRIES (PAGE_SIZE / XPNET_MSG_ALIGNED_SIZE)
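/*
 * Illustrative arithmetic for the sizing macros above (the concrete numbers
 * depend on the platform and on the XPC message header size, which are
 * assumptions here): with L1_CACHE_BYTES == 128, PAGE_SIZE == 16384 and a
 * 16-byte XPC message header (XPC_MSG_PAYLOAD_OFFSET == 16):
 *
 *	XPNET_MSG_SIZE         = 128 - 16    = 112 bytes of payload
 *	XPNET_MSG_ALIGNED_SIZE = 128 bytes   (one full cacheline)
 *	XPNET_MSG_NENTRIES     = 16384 / 128 = 128 message slots
 */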
81
82
83#define XPNET_MAX_KTHREADS (XPNET_MSG_NENTRIES + 1)
84#define XPNET_MAX_IDLE_KTHREADS (XPNET_MSG_NENTRIES + 1)
85
86/*
87 * Version number of XPNET implementation. XPNET can always talk to versions
88 * with the same major #, and never talk to versions with a different major #.
89 */
90#define _XPNET_VERSION(_major, _minor) (((_major) << 4) | (_minor))
91#define XPNET_VERSION_MAJOR(_v) ((_v) >> 4)
92#define XPNET_VERSION_MINOR(_v) ((_v) & 0xf)
93
94#define XPNET_VERSION _XPNET_VERSION(1,0) /* version 1.0 */
95#define XPNET_VERSION_EMBED _XPNET_VERSION(1,1) /* version 1.1 */
96#define XPNET_MAGIC 0x88786984 /* "XNET" */
97
98#define XPNET_VALID_MSG(_m) \
99 ((XPNET_VERSION_MAJOR(_m->version) == XPNET_VERSION_MAJOR(XPNET_VERSION)) \
100 && (msg->magic == XPNET_MAGIC))
101
102#define XPNET_DEVICE_NAME "xp0"
103
104
105/*
106 * When messages are queued with xpc_send_notify, a kmalloc'd buffer
107 * of the following type is passed as a notification cookie. When the
108 * notification function is called, we use the cookie to decide
109 * whether all outstanding message sends have completed. The skb can
110 * then be released.
111 */
112struct xpnet_pending_msg {
113 struct list_head free_list;
114 struct sk_buff *skb;
115 atomic_t use_count;
116};
117
118/* driver specific structure pointed to by the device structure */
119struct xpnet_dev_private {
120 struct net_device_stats stats;
121};
122
123struct net_device *xpnet_device;
124
125/*
126 * When we are notified of other partitions activating, we add them to
127 * our bitmask of partitions to which we broadcast.
128 */
129static u64 xpnet_broadcast_partitions;
130/* protects xpnet_broadcast_partitions */
131static DEFINE_SPINLOCK(xpnet_broadcast_lock);
132
133/*
134 * Since the Block Transfer Engine (BTE) is being used for the transfer
135 * and it relies upon cache-line size transfers, we need to reserve at
136 * least one cache-line for head and tail alignment. The BTE is
137 * limited to 8MB transfers.
138 *
139 * Testing has shown that changing MTU to greater than 64KB has no effect
140 * on TCP as the two sides negotiate a Max Segment Size that is limited
141 * to 64K.  Other protocols may use packets greater than this, but for
142 * now the default is set to 32KB (see XPNET_DEF_MTU below).
143 */
144#define XPNET_MAX_MTU (0x800000UL - L1_CACHE_BYTES)
145/* 32KB has been determined to be the ideal default MTU */
146#define XPNET_DEF_MTU (0x8000UL)
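
/*
 * Example (sketch, assuming 128-byte cachelines): XPNET_MAX_MTU works out
 * to 0x800000 - 0x80 == 0x7fff80 bytes, i.e. one cacheline short of the
 * BTE's 8MB transfer limit.
 */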
147
148
149/*
150 * The partition id is encapsulated in the MAC address. The following
151 * define locates the octet the partid is in.
152 */
153#define XPNET_PARTID_OCTET 1
154#define XPNET_LICENSE_OCTET 2
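
/*
 * Example (sketch): xpnet_init() below builds the address as
 * fe:<partid>:<license serial, most significant byte first>.  For
 * partition 3 with serial number 0x12345678 the MAC would read
 * fe:03:12:34:56:78.
 */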
155
156
157/*
158 * Define the XPNET debug device structure that is to be used with dev_dbg(),
159 * dev_err(), dev_warn(), and dev_info().
160 */
161struct device_driver xpnet_dbg_name = {
162 .name = "xpnet"
163};
164
165struct device xpnet_dbg_subname = {
166 .bus_id = {0}, /* set to "" */
167 .driver = &xpnet_dbg_name
168};
169
170struct device *xpnet = &xpnet_dbg_subname;
171
172/*
173 * Packet was received by XPC and forwarded to us.
174 */
175static void
176xpnet_receive(partid_t partid, int channel, struct xpnet_message *msg)
177{
178 struct sk_buff *skb;
179 bte_result_t bret;
180 struct xpnet_dev_private *priv =
181 (struct xpnet_dev_private *) xpnet_device->priv;
182
183
184 if (!XPNET_VALID_MSG(msg)) {
185 /*
186 * Packet with a different XPC version. Ignore.
187 */
188 xpc_received(partid, channel, (void *) msg);
189
190 priv->stats.rx_errors++;
191
192 return;
193 }
194 dev_dbg(xpnet, "received 0x%lx, %d, %d, %d\n", msg->buf_pa, msg->size,
195 msg->leadin_ignore, msg->tailout_ignore);
196
197
198 /* reserve an extra cache line */
199 skb = dev_alloc_skb(msg->size + L1_CACHE_BYTES);
200 if (!skb) {
201 dev_err(xpnet, "failed on dev_alloc_skb(%d)\n",
202 msg->size + L1_CACHE_BYTES);
203
204 xpc_received(partid, channel, (void *) msg);
205
206 priv->stats.rx_errors++;
207
208 return;
209 }
210
211 /*
212 * The allocated skb has some reserved space.
213 * In order to use bte_copy, we need to get the
214 * skb->data pointer moved forward.
215 */
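	/*
	 * Worked example (illustrative, 128-byte cachelines): if skb->data
	 * sits 72 bytes into a cacheline and leadin_ignore is 12, we reserve
	 * (128 - 72) + 12 == 68 bytes, so the BTE can deposit whole
	 * cachelines at the rounded-down address while skb->data ends up on
	 * the first useful byte of the payload.
	 */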
216 skb_reserve(skb, (L1_CACHE_BYTES - ((u64)skb->data &
217 (L1_CACHE_BYTES - 1)) +
218 msg->leadin_ignore));
219
220 /*
221 * Update the tail pointer to indicate data actually
222 * transferred.
223 */
224 skb_put(skb, (msg->size - msg->leadin_ignore - msg->tailout_ignore));
225
226 /*
227 * Move the data over from the other side.
228 */
229 if ((XPNET_VERSION_MINOR(msg->version) == 1) &&
230 (msg->embedded_bytes != 0)) {
231 dev_dbg(xpnet, "copying embedded message. memcpy(0x%p, 0x%p, "
232 "%lu)\n", skb->data, &msg->data,
233 (size_t) msg->embedded_bytes);
234
235 skb_copy_to_linear_data(skb, &msg->data, (size_t)msg->embedded_bytes);
236 } else {
237 dev_dbg(xpnet, "transferring buffer to the skb->data area;\n\t"
238 "bte_copy(0x%p, 0x%p, %hu)\n", (void *)msg->buf_pa,
239 (void *)__pa((u64)skb->data & ~(L1_CACHE_BYTES - 1)),
240 msg->size);
241
242 bret = bte_copy(msg->buf_pa,
243 __pa((u64)skb->data & ~(L1_CACHE_BYTES - 1)),
244 msg->size, (BTE_NOTIFY | BTE_WACQUIRE), NULL);
245
246 if (bret != BTE_SUCCESS) {
247			/* >>> Need better way of cleaning skb.  Currently skb
248			 * >>> appears in_use and we can't just call
249			 * >>> dev_kfree_skb. */
250 dev_err(xpnet, "bte_copy(0x%p, 0x%p, 0x%hx) returned "
251 "error=0x%x\n", (void *)msg->buf_pa,
252 (void *)__pa((u64)skb->data &
253 ~(L1_CACHE_BYTES - 1)),
254 msg->size, bret);
255
256 xpc_received(partid, channel, (void *) msg);
257
258 priv->stats.rx_errors++;
259
260 return;
261 }
262 }
263
264 dev_dbg(xpnet, "<skb->head=0x%p skb->data=0x%p skb->tail=0x%p "
265 "skb->end=0x%p skb->len=%d\n", (void *) skb->head,
266 (void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb),
267 skb->len);
268
269 skb->protocol = eth_type_trans(skb, xpnet_device);
270 skb->ip_summed = CHECKSUM_UNNECESSARY;
271
272 dev_dbg(xpnet, "passing skb to network layer\n"
273 KERN_DEBUG "\tskb->head=0x%p skb->data=0x%p skb->tail=0x%p "
274 "skb->end=0x%p skb->len=%d\n",
275 (void *)skb->head, (void *)skb->data, skb_tail_pointer(skb),
276 skb_end_pointer(skb), skb->len);
277
278
279 xpnet_device->last_rx = jiffies;
280 priv->stats.rx_packets++;
281 priv->stats.rx_bytes += skb->len + ETH_HLEN;
282
283 netif_rx_ni(skb);
284 xpc_received(partid, channel, (void *) msg);
285}
286
287
288/*
289 * This is the handler which XPC calls during any sort of change in
290 * state or message reception on a connection.
291 */
292static void
293xpnet_connection_activity(enum xpc_retval reason, partid_t partid, int channel,
294 void *data, void *key)
295{
296 long bp;
297
298
299 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
300 DBUG_ON(channel != XPC_NET_CHANNEL);
301
302	switch (reason) {
303 case xpcMsgReceived: /* message received */
304 DBUG_ON(data == NULL);
305
306 xpnet_receive(partid, channel, (struct xpnet_message *) data);
307 break;
308
309 case xpcConnected: /* connection completed to a partition */
310 spin_lock_bh(&xpnet_broadcast_lock);
311		xpnet_broadcast_partitions |= 1UL << (partid - 1);
312 bp = xpnet_broadcast_partitions;
313 spin_unlock_bh(&xpnet_broadcast_lock);
314
315 netif_carrier_on(xpnet_device);
316
317 dev_dbg(xpnet, "%s connection created to partition %d; "
318 "xpnet_broadcast_partitions=0x%lx\n",
319 xpnet_device->name, partid, bp);
320 break;
321
322 default:
323 spin_lock_bh(&xpnet_broadcast_lock);
324		xpnet_broadcast_partitions &= ~(1UL << (partid - 1));
325 bp = xpnet_broadcast_partitions;
326 spin_unlock_bh(&xpnet_broadcast_lock);
327
328 if (bp == 0) {
329 netif_carrier_off(xpnet_device);
330 }
331
332 dev_dbg(xpnet, "%s disconnected from partition %d; "
333 "xpnet_broadcast_partitions=0x%lx\n",
334 xpnet_device->name, partid, bp);
335 break;
336
337 }
338}
339
340
341static int
342xpnet_dev_open(struct net_device *dev)
343{
344 enum xpc_retval ret;
345
346
347 dev_dbg(xpnet, "calling xpc_connect(%d, 0x%p, NULL, %ld, %ld, %ld, "
348 "%ld)\n", XPC_NET_CHANNEL, xpnet_connection_activity,
349 XPNET_MSG_SIZE, XPNET_MSG_NENTRIES, XPNET_MAX_KTHREADS,
350 XPNET_MAX_IDLE_KTHREADS);
351
352 ret = xpc_connect(XPC_NET_CHANNEL, xpnet_connection_activity, NULL,
353 XPNET_MSG_SIZE, XPNET_MSG_NENTRIES,
354 XPNET_MAX_KTHREADS, XPNET_MAX_IDLE_KTHREADS);
355 if (ret != xpcSuccess) {
356 dev_err(xpnet, "ifconfig up of %s failed on XPC connect, "
357 "ret=%d\n", dev->name, ret);
358
359 return -ENOMEM;
360 }
361
362 dev_dbg(xpnet, "ifconfig up of %s; XPC connected\n", dev->name);
363
364 return 0;
365}
366
367
368static int
369xpnet_dev_stop(struct net_device *dev)
370{
371 xpc_disconnect(XPC_NET_CHANNEL);
372
373 dev_dbg(xpnet, "ifconfig down of %s; XPC disconnected\n", dev->name);
374
375 return 0;
376}
377
378
379static int
380xpnet_dev_change_mtu(struct net_device *dev, int new_mtu)
381{
382 /* 68 comes from min TCP+IP+MAC header */
383 if ((new_mtu < 68) || (new_mtu > XPNET_MAX_MTU)) {
384 dev_err(xpnet, "ifconfig %s mtu %d failed; value must be "
385 "between 68 and %ld\n", dev->name, new_mtu,
386 XPNET_MAX_MTU);
387 return -EINVAL;
388 }
389
390 dev->mtu = new_mtu;
391 dev_dbg(xpnet, "ifconfig %s mtu set to %d\n", dev->name, new_mtu);
392 return 0;
393}
394
395
396/*
397 * Required for the net_device structure.
398 */
399static int
400xpnet_dev_set_config(struct net_device *dev, struct ifmap *new_map)
401{
402 return 0;
403}
404
405
406/*
407 * Return statistics to the caller.
408 */
409static struct net_device_stats *
410xpnet_dev_get_stats(struct net_device *dev)
411{
412 struct xpnet_dev_private *priv;
413
414
415 priv = (struct xpnet_dev_private *) dev->priv;
416
417 return &priv->stats;
418}
419
420
421/*
422 * Notification that the other end has received the message and
423 * DMA'd the skb information. At this point, they are done with
424 * our side. When all recipients are done processing, we
425 * release the skb and then release our pending message structure.
426 */
427static void
428xpnet_send_completed(enum xpc_retval reason, partid_t partid, int channel,
429 void *__qm)
430{
431 struct xpnet_pending_msg *queued_msg =
432 (struct xpnet_pending_msg *) __qm;
433
434
435 DBUG_ON(queued_msg == NULL);
436
437 dev_dbg(xpnet, "message to %d notified with reason %d\n",
438 partid, reason);
439
440 if (atomic_dec_return(&queued_msg->use_count) == 0) {
441 dev_dbg(xpnet, "all acks for skb->head=-x%p\n",
442 (void *) queued_msg->skb->head);
443
444 dev_kfree_skb_any(queued_msg->skb);
445 kfree(queued_msg);
446 }
447}
448
449
450/*
451 * Network layer has formatted a packet (skb) and is ready to place it
452 * "on the wire". Prepare and send an xpnet_message to all partitions
453 * which have connected with us and are targets of this packet.
454 *
455 * MAC-NOTE: For the XPNET driver, the MAC address contains the
456 * destination partition_id.  If the destination partition id octet
457 * is 0xff, this packet is broadcast to all connected partitions.
458 */
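/*
 * Example (sketch): with partitions 2 and 5 connected,
 * xpnet_broadcast_partitions == 0x12; a packet whose destination MAC has
 * 0xff in XPNET_PARTID_OCTET therefore gets dp == 0x12 and is sent to
 * partitions 2 and 5 in turn.
 */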
459static int
460xpnet_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
461{
462 struct xpnet_pending_msg *queued_msg;
463 enum xpc_retval ret;
464 struct xpnet_message *msg;
465 u64 start_addr, end_addr;
466 long dp;
467 u8 second_mac_octet;
468 partid_t dest_partid;
469 struct xpnet_dev_private *priv;
470 u16 embedded_bytes;
471
472
473 priv = (struct xpnet_dev_private *) dev->priv;
474
475
476 dev_dbg(xpnet, ">skb->head=0x%p skb->data=0x%p skb->tail=0x%p "
477 "skb->end=0x%p skb->len=%d\n", (void *) skb->head,
478 (void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb),
479 skb->len);
480
481
482 /*
483 * The xpnet_pending_msg tracks how many outstanding
484 * xpc_send_notifies are relying on this skb. When none
485 * remain, release the skb.
486 */
487 queued_msg = kmalloc(sizeof(struct xpnet_pending_msg), GFP_ATOMIC);
488 if (queued_msg == NULL) {
489 dev_warn(xpnet, "failed to kmalloc %ld bytes; dropping "
490 "packet\n", sizeof(struct xpnet_pending_msg));
491
492 priv->stats.tx_errors++;
493
494 return -ENOMEM;
495 }
496
497
498 /* get the beginning of the first cacheline and end of last */
499 start_addr = ((u64) skb->data & ~(L1_CACHE_BYTES - 1));
500 end_addr = L1_CACHE_ALIGN((u64)skb_tail_pointer(skb));
501
502 /* calculate how many bytes to embed in the XPC message */
503 embedded_bytes = 0;
504 if (unlikely(skb->len <= XPNET_MSG_DATA_MAX)) {
505 /* skb->data does fit so embed */
506 embedded_bytes = skb->len;
507 }
508
509
510 /*
511 * Since the send occurs asynchronously, we set the count to one
512 * and begin sending. Any sends that happen to complete before
513	 * we are done sending will not free the skb; the final decrement at
514	 * the end of this routine does.  This also handles the case of
515 * a packet destined for a partition which is no longer up.
516 */
517 atomic_set(&queued_msg->use_count, 1);
518 queued_msg->skb = skb;
519
520
521 second_mac_octet = skb->data[XPNET_PARTID_OCTET];
522 if (second_mac_octet == 0xff) {
523 /* we are being asked to broadcast to all partitions */
524 dp = xpnet_broadcast_partitions;
525 } else if (second_mac_octet != 0) {
526 dp = xpnet_broadcast_partitions &
527 (1UL << (second_mac_octet - 1));
528 } else {
529 /* 0 is an invalid partid. Ignore */
530 dp = 0;
531 }
532 dev_dbg(xpnet, "destination Partitions mask (dp) = 0x%lx\n", dp);
533
534 /*
535 * If we wanted to allow promiscuous mode to work like an
536 * unswitched network, this would be a good point to OR in a
537 * mask of partitions which should be receiving all packets.
538 */
539
540 /*
541 * Main send loop.
542 */
543 for (dest_partid = 1; dp && dest_partid < XP_MAX_PARTITIONS;
544 dest_partid++) {
545
546
547 if (!(dp & (1UL << (dest_partid - 1)))) {
548 /* not destined for this partition */
549 continue;
550 }
551
552 /* remove this partition from the destinations mask */
553 dp &= ~(1UL << (dest_partid - 1));
554
555
556 /* found a partition to send to */
557
558 ret = xpc_allocate(dest_partid, XPC_NET_CHANNEL,
559 XPC_NOWAIT, (void **)&msg);
560 if (unlikely(ret != xpcSuccess)) {
561 continue;
562 }
563
564 msg->embedded_bytes = embedded_bytes;
565 if (unlikely(embedded_bytes != 0)) {
566 msg->version = XPNET_VERSION_EMBED;
567 dev_dbg(xpnet, "calling memcpy(0x%p, 0x%p, 0x%lx)\n",
568 &msg->data, skb->data, (size_t) embedded_bytes);
569 skb_copy_from_linear_data(skb, &msg->data,
570 (size_t)embedded_bytes);
571 } else {
572 msg->version = XPNET_VERSION;
573 }
574 msg->magic = XPNET_MAGIC;
575 msg->size = end_addr - start_addr;
576 msg->leadin_ignore = (u64) skb->data - start_addr;
577 msg->tailout_ignore = end_addr - (u64)skb_tail_pointer(skb);
578 msg->buf_pa = __pa(start_addr);
579
580 dev_dbg(xpnet, "sending XPC message to %d:%d\n"
581 KERN_DEBUG "msg->buf_pa=0x%lx, msg->size=%u, "
582 "msg->leadin_ignore=%u, msg->tailout_ignore=%u\n",
583 dest_partid, XPC_NET_CHANNEL, msg->buf_pa, msg->size,
584 msg->leadin_ignore, msg->tailout_ignore);
585
586
587 atomic_inc(&queued_msg->use_count);
588
589 ret = xpc_send_notify(dest_partid, XPC_NET_CHANNEL, msg,
590 xpnet_send_completed, queued_msg);
591 if (unlikely(ret != xpcSuccess)) {
592 atomic_dec(&queued_msg->use_count);
593 continue;
594 }
595
596 }
597
598	/* update the stats before the skb can be released below */
599	priv->stats.tx_packets++;
600	priv->stats.tx_bytes += skb->len;
601
602	if (atomic_dec_return(&queued_msg->use_count) == 0) {
603		dev_dbg(xpnet, "no partitions to receive packet destined for "
604			"%d\n", dest_partid);
605
606		dev_kfree_skb(skb);
607		kfree(queued_msg);
608	}
609
610 return 0;
611}
612
613
614/*
615 * Deal with transmit timeouts coming from the network layer.
616 */
617static void
618xpnet_dev_tx_timeout(struct net_device *dev)
619{
620 struct xpnet_dev_private *priv;
621
622
623 priv = (struct xpnet_dev_private *) dev->priv;
624
625 priv->stats.tx_errors++;
626 return;
627}
628
629
630static int __init
631xpnet_init(void)
632{
633 int i;
634 u32 license_num;
635 int result = -ENOMEM;
636
637
638 if (!ia64_platform_is("sn2")) {
639 return -ENODEV;
640 }
641
642 dev_info(xpnet, "registering network device %s\n", XPNET_DEVICE_NAME);
643
644 /*
645 * use ether_setup() to init the majority of our device
646 * structure and then override the necessary pieces.
647 */
648 xpnet_device = alloc_netdev(sizeof(struct xpnet_dev_private),
649 XPNET_DEVICE_NAME, ether_setup);
650 if (xpnet_device == NULL) {
651 return -ENOMEM;
652 }
653
654 netif_carrier_off(xpnet_device);
655
656 xpnet_device->mtu = XPNET_DEF_MTU;
657 xpnet_device->change_mtu = xpnet_dev_change_mtu;
658 xpnet_device->open = xpnet_dev_open;
659 xpnet_device->get_stats = xpnet_dev_get_stats;
660 xpnet_device->stop = xpnet_dev_stop;
661 xpnet_device->hard_start_xmit = xpnet_dev_hard_start_xmit;
662 xpnet_device->tx_timeout = xpnet_dev_tx_timeout;
663 xpnet_device->set_config = xpnet_dev_set_config;
664
665 /*
666	 * Multicast MAC addresses have the LSB of the first octet set.  We
667	 * use 0xfe (a locally administered, unicast first octet) so the
668	 * address is unlikely to collide with any vendor's officially issued MAC.
669 */
670 xpnet_device->dev_addr[0] = 0xfe;
671 xpnet_device->dev_addr[XPNET_PARTID_OCTET] = sn_partition_id;
672 license_num = sn_partition_serial_number_val();
673 for (i = 3; i >= 0; i--) {
674 xpnet_device->dev_addr[XPNET_LICENSE_OCTET + i] =
675 license_num & 0xff;
676 license_num = license_num >> 8;
677 }
678
679 /*
680 * ether_setup() sets this to a multicast device. We are
681 * really not supporting multicast at this time.
682 */
683 xpnet_device->flags &= ~IFF_MULTICAST;
684
685 /*
686 * No need to checksum as it is a DMA transfer. The BTE will
687 * report an error if the data is not retrievable and the
688 * packet will be dropped.
689 */
690 xpnet_device->features = NETIF_F_NO_CSUM;
691
692 result = register_netdev(xpnet_device);
693 if (result != 0) {
694 free_netdev(xpnet_device);
695 }
696
697 return result;
698}
699module_init(xpnet_init);
700
701
702static void __exit
703xpnet_exit(void)
704{
705 dev_info(xpnet, "unregistering network device %s\n",
706		 xpnet_device->name);
707
708 unregister_netdev(xpnet_device);
709
710 free_netdev(xpnet_device);
711}
712module_exit(xpnet_exit);
713
714
715MODULE_AUTHOR("Silicon Graphics, Inc.");
716MODULE_DESCRIPTION("Cross Partition Network adapter (XPNET)");
717MODULE_LICENSE("GPL");
718