author    Linus Torvalds <torvalds@linux-foundation.org>  2008-04-22 19:47:54 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2008-04-22 19:47:54 -0400
commit    aca239b793a4006db0d92ad0e43846ab6b54d816 (patch)
tree      48cc2e4ce1e954b8f3d3ee8fb8a53bb46f5c79aa /drivers
parent    16abef0e9e79643827fd5a2a14a07bced851ae72 (diff)
parent    2c2b94f93f4732c3b9703ce62627e6187e7d6128 (diff)
Merge branch 'release' of git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux-2.6
* 'release' of git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux-2.6:
  [IA64] run drivers/misc/sgi-xp through scripts/checkpatch.pl
  [IA64] run rest drivers/misc/sgi-xp through scripts/Lindent
  [IA64] run some drivers/misc/sgi-xp through scripts/Lindent
  [IA64] move XP and XPC to drivers/misc/sgi-xp
  [IA64] minor irq handler cleanups
  [IA64] simplify notify hooks in mca.c
  [IA64] do notify DIE_MCA_MONARCH_PROCESS for each monarchs
  [IA64] disable interrupts on exit of ia64_trace_syscall
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/misc/Kconfig                |   12
-rw-r--r--  drivers/misc/Makefile               |    1
-rw-r--r--  drivers/misc/sgi-xp/Makefile        |   11
-rw-r--r--  drivers/misc/sgi-xp/xp.h            |  463
-rw-r--r--  drivers/misc/sgi-xp/xp_main.c       |  279
-rw-r--r--  drivers/misc/sgi-xp/xp_nofault.S    |   35
-rw-r--r--  drivers/misc/sgi-xp/xpc.h           | 1187
-rw-r--r--  drivers/misc/sgi-xp/xpc_channel.c   | 2243
-rw-r--r--  drivers/misc/sgi-xp/xpc_main.c      | 1323
-rw-r--r--  drivers/misc/sgi-xp/xpc_partition.c | 1174
-rw-r--r--  drivers/misc/sgi-xp/xpnet.c         |  677
11 files changed, 7405 insertions(+), 0 deletions(-)
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index bb94ce78a6d0..297a48f85446 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -360,4 +360,16 @@ config ENCLOSURE_SERVICES
 	  driver (SCSI/ATA) which supports enclosures
 	  or a SCSI enclosure device (SES) to use these services.
 
+config SGI_XP
+	tristate "Support communication between SGI SSIs"
+	depends on IA64_GENERIC || IA64_SGI_SN2
+	select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
+	select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
+	---help---
+	  An SGI machine can be divided into multiple Single System
+	  Images which act independently of each other and have
+	  hardware based memory protection from the others. Enabling
+	  this feature will allow for direct communication between SSIs
+	  based on a network adapter and DMA messaging.
+
 endif # MISC_DEVICES
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 4581b2533111..5914da434854 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_EEPROM_93CX6) += eeprom_93cx6.o
 obj-$(CONFIG_INTEL_MENLOW)       += intel_menlow.o
 obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o
 obj-$(CONFIG_KGDB_TESTS)         += kgdbts.o
+obj-$(CONFIG_SGI_XP)             += sgi-xp/
diff --git a/drivers/misc/sgi-xp/Makefile b/drivers/misc/sgi-xp/Makefile
new file mode 100644
index 000000000000..b6e40a7958ce
--- /dev/null
+++ b/drivers/misc/sgi-xp/Makefile
@@ -0,0 +1,11 @@
1#
2# Makefile for SGI's XP devices.
3#
4
5obj-$(CONFIG_SGI_XP) += xp.o
6xp-y := xp_main.o xp_nofault.o
7
8obj-$(CONFIG_SGI_XP) += xpc.o
9xpc-y := xpc_main.o xpc_channel.o xpc_partition.o
10
11obj-$(CONFIG_SGI_XP) += xpnet.o
diff --git a/drivers/misc/sgi-xp/xp.h b/drivers/misc/sgi-xp/xp.h
new file mode 100644
index 000000000000..5515234be86a
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp.h
@@ -0,0 +1,463 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (C) 2004-2008 Silicon Graphics, Inc. All rights reserved.
7 */
8
9/*
10 * External Cross Partition (XP) structures and defines.
11 */
12
13#ifndef _DRIVERS_MISC_SGIXP_XP_H
14#define _DRIVERS_MISC_SGIXP_XP_H
15
16#include <linux/cache.h>
17#include <linux/hardirq.h>
18#include <linux/mutex.h>
19#include <asm/sn/types.h>
20#include <asm/sn/bte.h>
21
22#ifdef USE_DBUG_ON
23#define DBUG_ON(condition) BUG_ON(condition)
24#else
25#define DBUG_ON(condition)
26#endif
27
28/*
29 * Define the maximum number of logically defined partitions the system
30 * can support. It is constrained by the maximum number of hardware
31 * partitionable regions. The term 'region' in this context refers to the
32 * minimum number of nodes that can comprise an access protection grouping.
33 * The access protection is in regards to memory, IPI and IOI.
34 *
35 * The maximum number of hardware partitionable regions is equal to the
36 * maximum number of nodes in the entire system divided by the minimum number
37 * of nodes that comprise an access protection grouping.
38 */
39#define XP_MAX_PARTITIONS 64
40
41/*
42 * Define the number of u64s required to represent all the C-brick nasids
43 * as a bitmap. The cross-partition kernel modules deal only with
44 * C-brick nasids, thus the need for bitmaps which don't account for
45 * odd-numbered (non C-brick) nasids.
46 */
47#define XP_MAX_PHYSNODE_ID (MAX_NUMALINK_NODES / 2)
48#define XP_NASID_MASK_BYTES ((XP_MAX_PHYSNODE_ID + 7) / 8)
49#define XP_NASID_MASK_WORDS ((XP_MAX_PHYSNODE_ID + 63) / 64)
50
51/*
 52 * Wrapper for bte_copy() that, should it return a failure status, will retry
53 * the bte_copy() once in the hope that the failure was due to a temporary
54 * aberration (i.e., the link going down temporarily).
55 *
56 * src - physical address of the source of the transfer.
57 * vdst - virtual address of the destination of the transfer.
58 * len - number of bytes to transfer from source to destination.
59 * mode - see bte_copy() for definition.
60 * notification - see bte_copy() for definition.
61 *
62 * Note: xp_bte_copy() should never be called while holding a spinlock.
63 */
64static inline bte_result_t
65xp_bte_copy(u64 src, u64 vdst, u64 len, u64 mode, void *notification)
66{
67 bte_result_t ret;
68 u64 pdst = ia64_tpa(vdst);
69
70 /*
71 * Ensure that the physically mapped memory is contiguous.
72 *
73 * We do this by ensuring that the memory is from region 7 only.
74 * If the need should arise to use memory from one of the other
75 * regions, then modify the BUG_ON() statement to ensure that the
76 * memory from that region is always physically contiguous.
77 */
78 BUG_ON(REGION_NUMBER(vdst) != RGN_KERNEL);
79
80 ret = bte_copy(src, pdst, len, mode, notification);
81 if ((ret != BTE_SUCCESS) && BTE_ERROR_RETRY(ret)) {
82 if (!in_interrupt())
83 cond_resched();
84
85 ret = bte_copy(src, pdst, len, mode, notification);
86 }
87
88 return ret;
89}
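
A minimal usage sketch for the wrapper above, assuming a kmalloc'd (region 7) destination buffer and the BTE_NOTIFY/BTE_WACQUIRE mode flags from <asm/sn/bte.h>; this helper is illustrative only and is not part of the patch:

static inline bte_result_t
example_pull_remote_data(u64 remote_src_pa, void *buf, u64 len)
{
	/* addresses and length are assumed to be cacheline aligned, and
	 * this must not be called while holding a spinlock (see above) */
	return xp_bte_copy(remote_src_pa, (u64)buf, len,
			   (BTE_NOTIFY | BTE_WACQUIRE), NULL);
}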
90
91/*
92 * XPC establishes channel connections between the local partition and any
93 * other partition that is currently up. Over these channels, kernel-level
94 * `users' can communicate with their counterparts on the other partitions.
95 *
 96 * The maximum number of channels is limited to eight. For performance reasons,
97 * the internal cross partition structures require sixteen bytes per channel,
98 * and eight allows all of this interface-shared info to fit in one cache line.
99 *
100 * XPC_NCHANNELS reflects the total number of channels currently defined.
101 * If the need for additional channels arises, one can simply increase
102 * XPC_NCHANNELS accordingly. If the day should come where that number
103 * exceeds the MAXIMUM number of channels allowed (eight), then one will need
104 * to make changes to the XPC code to allow for this.
105 */
106#define XPC_MEM_CHANNEL 0 /* memory channel number */
107#define XPC_NET_CHANNEL 1 /* network channel number */
108
109#define XPC_NCHANNELS 2 /* #of defined channels */
110#define XPC_MAX_NCHANNELS 8 /* max #of channels allowed */
111
112#if XPC_NCHANNELS > XPC_MAX_NCHANNELS
113#error XPC_NCHANNELS exceeds MAXIMUM allowed.
114#endif
115
116/*
117 * The format of an XPC message is as follows:
118 *
119 * +-------+--------------------------------+
120 * | flags |////////////////////////////////|
121 * +-------+--------------------------------+
122 * | message # |
123 * +----------------------------------------+
124 * | payload (user-defined message) |
125 * | |
126 * :
127 * | |
128 * +----------------------------------------+
129 *
130 * The size of the payload is defined by the user via xpc_connect(). A user-
131 * defined message resides in the payload area.
132 *
133 * The user should have no dealings with the message header, but only the
134 * message's payload. When a message entry is allocated (via xpc_allocate())
135 * a pointer to the payload area is returned and not the actual beginning of
136 * the XPC message. The user then constructs a message in the payload area
137 * and passes that pointer as an argument on xpc_send() or xpc_send_notify().
138 *
139 * The size of a message entry (within a message queue) must be a cacheline
140 * sized multiple in order to facilitate the BTE transfer of messages from one
141 * message queue to another. A macro, XPC_MSG_SIZE(), is provided for the user
142 * that wants to fit as many msg entries as possible in a given memory size
143 * (e.g. a memory page).
144 */
145struct xpc_msg {
146 u8 flags; /* FOR XPC INTERNAL USE ONLY */
147 u8 reserved[7]; /* FOR XPC INTERNAL USE ONLY */
148 s64 number; /* FOR XPC INTERNAL USE ONLY */
149
150 u64 payload; /* user defined portion of message */
151};
152
153#define XPC_MSG_PAYLOAD_OFFSET (u64) (&((struct xpc_msg *)0)->payload)
154#define XPC_MSG_SIZE(_payload_size) \
155 L1_CACHE_ALIGN(XPC_MSG_PAYLOAD_OFFSET + (_payload_size))
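
As a rough illustration of XPC_MSG_SIZE(), the following hypothetical sizing computes how many cacheline-multiple message entries fit in one page (PAGE_SIZE from <asm/page.h> assumed) for an example 128-byte payload:

/* illustrative only; 128 is an arbitrary example payload size */
#define EXAMPLE_PAYLOAD_SIZE	128
#define EXAMPLE_NENTRIES	(PAGE_SIZE / XPC_MSG_SIZE(EXAMPLE_PAYLOAD_SIZE))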
156
157/*
158 * Define the return values and values passed to user's callout functions.
159 * (It is important to add new value codes at the end just preceding
160 * xpcUnknownReason, which must have the highest numerical value.)
161 */
162enum xpc_retval {
163 xpcSuccess = 0,
164
165 xpcNotConnected, /* 1: channel is not connected */
166 xpcConnected, /* 2: channel connected (opened) */
167 xpcRETIRED1, /* 3: (formerly xpcDisconnected) */
168
169 xpcMsgReceived, /* 4: message received */
170 xpcMsgDelivered, /* 5: message delivered and acknowledged */
171
172 xpcRETIRED2, /* 6: (formerly xpcTransferFailed) */
173
174 xpcNoWait, /* 7: operation would require wait */
175 xpcRetry, /* 8: retry operation */
176 xpcTimeout, /* 9: timeout in xpc_allocate_msg_wait() */
177 xpcInterrupted, /* 10: interrupted wait */
178
179 xpcUnequalMsgSizes, /* 11: message size disparity between sides */
180 xpcInvalidAddress, /* 12: invalid address */
181
182 xpcNoMemory, /* 13: no memory available for XPC structures */
183 xpcLackOfResources, /* 14: insufficient resources for operation */
184 xpcUnregistered, /* 15: channel is not registered */
185 xpcAlreadyRegistered, /* 16: channel is already registered */
186
187 xpcPartitionDown, /* 17: remote partition is down */
188 xpcNotLoaded, /* 18: XPC module is not loaded */
189 xpcUnloading, /* 19: this side is unloading XPC module */
190
191 xpcBadMagic, /* 20: XPC MAGIC string not found */
192
193 xpcReactivating, /* 21: remote partition was reactivated */
194
195 xpcUnregistering, /* 22: this side is unregistering channel */
196 xpcOtherUnregistering, /* 23: other side is unregistering channel */
197
198 xpcCloneKThread, /* 24: cloning kernel thread */
199 xpcCloneKThreadFailed, /* 25: cloning kernel thread failed */
200
201 xpcNoHeartbeat, /* 26: remote partition has no heartbeat */
202
203 xpcPioReadError, /* 27: PIO read error */
204 xpcPhysAddrRegFailed, /* 28: registration of phys addr range failed */
205
206 xpcBteDirectoryError, /* 29: maps to BTEFAIL_DIR */
207 xpcBtePoisonError, /* 30: maps to BTEFAIL_POISON */
208 xpcBteWriteError, /* 31: maps to BTEFAIL_WERR */
209 xpcBteAccessError, /* 32: maps to BTEFAIL_ACCESS */
210 xpcBtePWriteError, /* 33: maps to BTEFAIL_PWERR */
211 xpcBtePReadError, /* 34: maps to BTEFAIL_PRERR */
212 xpcBteTimeOutError, /* 35: maps to BTEFAIL_TOUT */
213 xpcBteXtalkError, /* 36: maps to BTEFAIL_XTERR */
214 xpcBteNotAvailable, /* 37: maps to BTEFAIL_NOTAVAIL */
215 xpcBteUnmappedError, /* 38: unmapped BTEFAIL_ error */
216
217 xpcBadVersion, /* 39: bad version number */
218 xpcVarsNotSet, /* 40: the XPC variables are not set up */
219 xpcNoRsvdPageAddr, /* 41: unable to get rsvd page's phys addr */
220 xpcInvalidPartid, /* 42: invalid partition ID */
221 xpcLocalPartid, /* 43: local partition ID */
222
223 xpcOtherGoingDown, /* 44: other side going down, reason unknown */
224 xpcSystemGoingDown, /* 45: system is going down, reason unknown */
225 xpcSystemHalt, /* 46: system is being halted */
226 xpcSystemReboot, /* 47: system is being rebooted */
227 xpcSystemPoweroff, /* 48: system is being powered off */
228
229 xpcDisconnecting, /* 49: channel disconnecting (closing) */
230
231 xpcOpenCloseError, /* 50: channel open/close protocol error */
232
233 xpcDisconnected, /* 51: channel disconnected (closed) */
234
235 xpcBteSh2Start, /* 52: BTE CRB timeout */
236
237 /* 53: 0x1 BTE Error Response Short */
238 xpcBteSh2RspShort = xpcBteSh2Start + BTEFAIL_SH2_RESP_SHORT,
239
240 /* 54: 0x2 BTE Error Response Long */
241 xpcBteSh2RspLong = xpcBteSh2Start + BTEFAIL_SH2_RESP_LONG,
242
243 /* 56: 0x4 BTE Error Response DSB */
244 xpcBteSh2RspDSB = xpcBteSh2Start + BTEFAIL_SH2_RESP_DSP,
245
246 /* 60: 0x8 BTE Error Response Access */
247 xpcBteSh2RspAccess = xpcBteSh2Start + BTEFAIL_SH2_RESP_ACCESS,
248
249 /* 68: 0x10 BTE Error CRB timeout */
250 xpcBteSh2CRBTO = xpcBteSh2Start + BTEFAIL_SH2_CRB_TO,
251
252 /* 84: 0x20 BTE Error NACK limit */
253 xpcBteSh2NACKLimit = xpcBteSh2Start + BTEFAIL_SH2_NACK_LIMIT,
254
255 /* 115: BTE end */
256 xpcBteSh2End = xpcBteSh2Start + BTEFAIL_SH2_ALL,
257
258 xpcUnknownReason /* 116: unknown reason - must be last in enum */
259};
260
261/*
262 * Define the callout function types used by XPC to update the user on
263 * connection activity and state changes (via the user function registered by
264 * xpc_connect()) and to notify them of messages received and delivered (via
265 * the user function registered by xpc_send_notify()).
266 *
267 * The two function types are xpc_channel_func and xpc_notify_func and
268 * both share the following arguments, with the exception of "data", which
269 * only xpc_channel_func has.
270 *
271 * Arguments:
272 *
273 * reason - reason code. (See following table.)
274 * partid - partition ID associated with condition.
275 * ch_number - channel # associated with condition.
276 * data - pointer to optional data. (See following table.)
277 * key - pointer to optional user-defined value provided as the "key"
278 * argument to xpc_connect() or xpc_send_notify().
279 *
280 * In the following table the "Optional Data" column applies to callouts made
281 * to functions registered by xpc_connect(). A "NA" in that column indicates
282 * that this reason code can be passed to functions registered by
283 * xpc_send_notify() (i.e. they don't have data arguments).
284 *
285 * Also, the first three reason codes in the following table indicate
286 * success, whereas the others indicate failure. When a failure reason code
287 * is received, one can assume that the channel is not connected.
288 *
289 *
290 * Reason Code | Cause | Optional Data
291 * =====================+================================+=====================
292 * xpcConnected | connection has been established| max #of entries
293 * | to the specified partition on | allowed in message
294 * | the specified channel | queue
295 * ---------------------+--------------------------------+---------------------
296 * xpcMsgReceived | an XPC message arrived from | address of payload
297 * | the specified partition on the |
298 * | specified channel | [the user must call
299 * | | xpc_received() when
300 * | | finished with the
301 * | | payload]
302 * ---------------------+--------------------------------+---------------------
303 * xpcMsgDelivered | notification that the message | NA
304 * | was delivered to the intended |
305 * | recipient and that they have |
306 * | acknowledged its receipt by |
307 * | calling xpc_received() |
308 * =====================+================================+=====================
309 * xpcUnequalMsgSizes | can't connect to the specified | NULL
310 * | partition on the specified |
311 * | channel because of mismatched |
312 * | message sizes |
313 * ---------------------+--------------------------------+---------------------
314 * xpcNoMemory | insufficient memory available | NULL
315 * | to allocate message queue |
316 * ---------------------+--------------------------------+---------------------
317 * xpcLackOfResources | lack of resources to create | NULL
318 * | the necessary kthreads to |
319 * | support the channel |
320 * ---------------------+--------------------------------+---------------------
321 * xpcUnregistering | this side's user has | NULL or NA
322 * | unregistered by calling |
323 * | xpc_disconnect() |
324 * ---------------------+--------------------------------+---------------------
325 * xpcOtherUnregistering| the other side's user has | NULL or NA
326 * | unregistered by calling |
327 * | xpc_disconnect() |
328 * ---------------------+--------------------------------+---------------------
329 * xpcNoHeartbeat | the other side's XPC is no | NULL or NA
330 * | longer heartbeating |
331 * | |
332 * ---------------------+--------------------------------+---------------------
333 * xpcUnloading | this side's XPC module is | NULL or NA
334 * | being unloaded |
335 * | |
336 * ---------------------+--------------------------------+---------------------
337 * xpcOtherUnloading | the other side's XPC module is | NULL or NA
338 * | being unloaded |
339 * | |
340 * ---------------------+--------------------------------+---------------------
341 * xpcPioReadError | xp_nofault_PIOR() returned an | NULL or NA
342 * | error while sending an IPI |
343 * | |
344 * ---------------------+--------------------------------+---------------------
345 * xpcInvalidAddress | the address either received or | NULL or NA
346 * | sent by the specified partition|
347 * | is invalid |
348 * ---------------------+--------------------------------+---------------------
349 * xpcBteNotAvailable | attempt to pull data from the | NULL or NA
350 * xpcBtePoisonError | specified partition over the |
351 * xpcBteWriteError | specified channel via a |
352 * xpcBteAccessError | bte_copy() failed |
353 * xpcBteTimeOutError | |
354 * xpcBteXtalkError | |
355 * xpcBteDirectoryError | |
356 * xpcBteGenericError | |
357 * xpcBteUnmappedError | |
358 * ---------------------+--------------------------------+---------------------
359 * xpcUnknownReason | the specified channel to the | NULL or NA
360 * | specified partition was |
361 * | unavailable for unknown reasons|
362 * =====================+================================+=====================
363 */
364
365typedef void (*xpc_channel_func) (enum xpc_retval reason, partid_t partid,
366 int ch_number, void *data, void *key);
367
368typedef void (*xpc_notify_func) (enum xpc_retval reason, partid_t partid,
369 int ch_number, void *key);
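
A minimal sketch of a channel callout of type xpc_channel_func, keyed to the reason codes in the table above (the function itself is hypothetical and not part of the patch):

static void
example_channel_func(enum xpc_retval reason, partid_t partid, int ch_number,
		     void *data, void *key)
{
	switch (reason) {
	case xpcConnected:
		/* data is the max #of entries allowed in the message queue */
		break;
	case xpcMsgReceived:
		/* data points at the payload; when the consumer is finished
		 * with it, xpc_received(partid, ch_number, data) must be
		 * called to release the message entry back to XPC */
		break;
	case xpcMsgDelivered:
		/* the recipient acknowledged receipt of a sent message */
		break;
	default:
		/* any other reason code means the channel is not connected */
		break;
	}
}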
370
371/*
372 * The following is a registration entry. There is a global array of these,
373 * one per channel. It is used to record the connection registration made
374 * by the users of XPC. As long as a registration entry exists, for any
375 * partition that comes up, XPC will attempt to establish a connection on
376 * that channel. Notification that a connection has been made will occur via
377 * the xpc_channel_func function.
378 *
379 * The 'func' field points to the function to call when asynchronous
380 * notification is required for such events as: a connection established/lost,
381 * or an incoming message received, or an error condition encountered. A
382 * non-NULL 'func' field indicates that there is an active registration for
383 * the channel.
384 */
385struct xpc_registration {
386 struct mutex mutex;
387 xpc_channel_func func; /* function to call */
388 void *key; /* pointer to user's key */
389 u16 nentries; /* #of msg entries in local msg queue */
390 u16 msg_size; /* message queue's message size */
391 u32 assigned_limit; /* limit on #of assigned kthreads */
392 u32 idle_limit; /* limit on #of idle kthreads */
393} ____cacheline_aligned;
394
395#define XPC_CHANNEL_REGISTERED(_c) (xpc_registrations[_c].func != NULL)
396
397/* the following are valid xpc_allocate() flags */
398#define XPC_WAIT 0 /* wait flag */
399#define XPC_NOWAIT 1 /* no wait flag */
400
401struct xpc_interface {
402 void (*connect) (int);
403 void (*disconnect) (int);
404 enum xpc_retval (*allocate) (partid_t, int, u32, void **);
405 enum xpc_retval (*send) (partid_t, int, void *);
406 enum xpc_retval (*send_notify) (partid_t, int, void *,
407 xpc_notify_func, void *);
408 void (*received) (partid_t, int, void *);
409 enum xpc_retval (*partid_to_nasids) (partid_t, void *);
410};
411
412extern struct xpc_interface xpc_interface;
413
414extern void xpc_set_interface(void (*)(int),
415 void (*)(int),
416 enum xpc_retval (*)(partid_t, int, u32, void **),
417 enum xpc_retval (*)(partid_t, int, void *),
418 enum xpc_retval (*)(partid_t, int, void *,
419 xpc_notify_func, void *),
420 void (*)(partid_t, int, void *),
421 enum xpc_retval (*)(partid_t, void *));
422extern void xpc_clear_interface(void);
423
424extern enum xpc_retval xpc_connect(int, xpc_channel_func, void *, u16,
425 u16, u32, u32);
426extern void xpc_disconnect(int);
427
428static inline enum xpc_retval
429xpc_allocate(partid_t partid, int ch_number, u32 flags, void **payload)
430{
431 return xpc_interface.allocate(partid, ch_number, flags, payload);
432}
433
434static inline enum xpc_retval
435xpc_send(partid_t partid, int ch_number, void *payload)
436{
437 return xpc_interface.send(partid, ch_number, payload);
438}
439
440static inline enum xpc_retval
441xpc_send_notify(partid_t partid, int ch_number, void *payload,
442 xpc_notify_func func, void *key)
443{
444 return xpc_interface.send_notify(partid, ch_number, payload, func, key);
445}
446
447static inline void
448xpc_received(partid_t partid, int ch_number, void *payload)
449{
450 return xpc_interface.received(partid, ch_number, payload);
451}
452
453static inline enum xpc_retval
454xpc_partid_to_nasids(partid_t partid, void *nasids)
455{
456 return xpc_interface.partid_to_nasids(partid, nasids);
457}
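
A minimal sketch of the send path built from the inline wrappers above; the payload layout and the choice of XPC_MEM_CHANNEL are illustrative assumptions, not something this patch defines:

struct example_payload {
	u64 cookie;		/* hypothetical user-defined contents */
};

static enum xpc_retval
example_send(partid_t partid)
{
	struct example_payload *payload;
	enum xpc_retval ret;

	/* get a message entry on the channel, waiting for one if necessary */
	ret = xpc_allocate(partid, XPC_MEM_CHANNEL, XPC_WAIT,
			   (void **)&payload);
	if (ret != xpcSuccess)
		return ret;

	payload->cookie = 0x5a5a5a5a5a5a5a5aUL;

	/* hand the filled-in payload back to XPC for transfer */
	return xpc_send(partid, XPC_MEM_CHANNEL, payload);
}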
458
459extern u64 xp_nofault_PIOR_target;
460extern int xp_nofault_PIOR(void *);
461extern int xp_error_PIOR(void);
462
463#endif /* _DRIVERS_MISC_SGIXP_XP_H */
diff --git a/drivers/misc/sgi-xp/xp_main.c b/drivers/misc/sgi-xp/xp_main.c
new file mode 100644
index 000000000000..1fbf99bae963
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp_main.c
@@ -0,0 +1,279 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9/*
10 * Cross Partition (XP) base.
11 *
12 * XP provides a base from which its users can interact
13 * with XPC, yet not be dependent on XPC.
14 *
15 */
16
17#include <linux/kernel.h>
18#include <linux/interrupt.h>
19#include <linux/module.h>
20#include <linux/mutex.h>
21#include <asm/sn/intr.h>
22#include <asm/sn/sn_sal.h>
23#include "xp.h"
24
25/*
26 * The export of xp_nofault_PIOR needs to happen here since it is defined
27 * in drivers/misc/sgi-xp/xp_nofault.S. The target of the nofault read is
28 * defined here.
29 */
30EXPORT_SYMBOL_GPL(xp_nofault_PIOR);
31
32u64 xp_nofault_PIOR_target;
33EXPORT_SYMBOL_GPL(xp_nofault_PIOR_target);
34
35/*
36 * xpc_registrations[] keeps track of xpc_connect()'s done by the kernel-level
37 * users of XPC.
38 */
39struct xpc_registration xpc_registrations[XPC_NCHANNELS];
40EXPORT_SYMBOL_GPL(xpc_registrations);
41
42/*
43 * Initialize the XPC interface to indicate that XPC isn't loaded.
44 */
45static enum xpc_retval
46xpc_notloaded(void)
47{
48 return xpcNotLoaded;
49}
50
51struct xpc_interface xpc_interface = {
52 (void (*)(int))xpc_notloaded,
53 (void (*)(int))xpc_notloaded,
54 (enum xpc_retval(*)(partid_t, int, u32, void **))xpc_notloaded,
55 (enum xpc_retval(*)(partid_t, int, void *))xpc_notloaded,
56 (enum xpc_retval(*)(partid_t, int, void *, xpc_notify_func, void *))
57 xpc_notloaded,
58 (void (*)(partid_t, int, void *))xpc_notloaded,
59 (enum xpc_retval(*)(partid_t, void *))xpc_notloaded
60};
61EXPORT_SYMBOL_GPL(xpc_interface);
62
63/*
64 * XPC calls this when it (the XPC module) has been loaded.
65 */
66void
67xpc_set_interface(void (*connect) (int),
68 void (*disconnect) (int),
69 enum xpc_retval (*allocate) (partid_t, int, u32, void **),
70 enum xpc_retval (*send) (partid_t, int, void *),
71 enum xpc_retval (*send_notify) (partid_t, int, void *,
72 xpc_notify_func, void *),
73 void (*received) (partid_t, int, void *),
74 enum xpc_retval (*partid_to_nasids) (partid_t, void *))
75{
76 xpc_interface.connect = connect;
77 xpc_interface.disconnect = disconnect;
78 xpc_interface.allocate = allocate;
79 xpc_interface.send = send;
80 xpc_interface.send_notify = send_notify;
81 xpc_interface.received = received;
82 xpc_interface.partid_to_nasids = partid_to_nasids;
83}
84EXPORT_SYMBOL_GPL(xpc_set_interface);
85
86/*
87 * XPC calls this when it (the XPC module) is being unloaded.
88 */
89void
90xpc_clear_interface(void)
91{
92 xpc_interface.connect = (void (*)(int))xpc_notloaded;
93 xpc_interface.disconnect = (void (*)(int))xpc_notloaded;
94 xpc_interface.allocate = (enum xpc_retval(*)(partid_t, int, u32,
95 void **))xpc_notloaded;
96 xpc_interface.send = (enum xpc_retval(*)(partid_t, int, void *))
97 xpc_notloaded;
98 xpc_interface.send_notify = (enum xpc_retval(*)(partid_t, int, void *,
99 xpc_notify_func,
100 void *))xpc_notloaded;
101 xpc_interface.received = (void (*)(partid_t, int, void *))
102 xpc_notloaded;
103 xpc_interface.partid_to_nasids = (enum xpc_retval(*)(partid_t, void *))
104 xpc_notloaded;
105}
106EXPORT_SYMBOL_GPL(xpc_clear_interface);
107
108/*
109 * Register for automatic establishment of a channel connection whenever
110 * a partition comes up.
111 *
112 * Arguments:
113 *
114 * ch_number - channel # to register for connection.
115 * func - function to call for asynchronous notification of channel
116 * state changes (i.e., connection, disconnection, error) and
117 * the arrival of incoming messages.
118 * key - pointer to optional user-defined value that gets passed back
119 * to the user on any callouts made to func.
120 * payload_size - size in bytes of the XPC message's payload area which
121 * contains a user-defined message. The user should make
122 * this large enough to hold their largest message.
123 * nentries - max #of XPC message entries a message queue can contain.
124 * The actual number, which is determined when a connection
125 * is established and may be less than requested, will be
126 * passed to the user via the xpcConnected callout.
127 * assigned_limit - max number of kthreads allowed to be processing
128 * messages (per connection) at any given instant.
129 * idle_limit - max number of kthreads allowed to be idle at any given
130 * instant.
131 */
132enum xpc_retval
133xpc_connect(int ch_number, xpc_channel_func func, void *key, u16 payload_size,
134 u16 nentries, u32 assigned_limit, u32 idle_limit)
135{
136 struct xpc_registration *registration;
137
138 DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS);
139 DBUG_ON(payload_size == 0 || nentries == 0);
140 DBUG_ON(func == NULL);
141 DBUG_ON(assigned_limit == 0 || idle_limit > assigned_limit);
142
143 registration = &xpc_registrations[ch_number];
144
145 if (mutex_lock_interruptible(&registration->mutex) != 0)
146 return xpcInterrupted;
147
148 /* if XPC_CHANNEL_REGISTERED(ch_number) */
149 if (registration->func != NULL) {
150 mutex_unlock(&registration->mutex);
151 return xpcAlreadyRegistered;
152 }
153
154 /* register the channel for connection */
155 registration->msg_size = XPC_MSG_SIZE(payload_size);
156 registration->nentries = nentries;
157 registration->assigned_limit = assigned_limit;
158 registration->idle_limit = idle_limit;
159 registration->key = key;
160 registration->func = func;
161
162 mutex_unlock(&registration->mutex);
163
164 xpc_interface.connect(ch_number);
165
166 return xpcSuccess;
167}
168EXPORT_SYMBOL_GPL(xpc_connect);
169
170/*
171 * Remove the registration for automatic connection of the specified channel
172 * when a partition comes up.
173 *
174 * Before returning, xpc_disconnect() will wait for all connections on the
175 * specified channel to have been closed/torn down. So the caller can be assured
176 * that they will not be receiving any more callouts from XPC to their
177 * function registered via xpc_connect().
178 *
179 * Arguments:
180 *
181 * ch_number - channel # to unregister.
182 */
183void
184xpc_disconnect(int ch_number)
185{
186 struct xpc_registration *registration;
187
188 DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS);
189
190 registration = &xpc_registrations[ch_number];
191
192 /*
193 * We've decided not to make this a down_interruptible(), since we
194 * figured XPC's users will just turn around and call xpc_disconnect()
195 * again anyways, so we might as well wait, if need be.
196 */
197 mutex_lock(&registration->mutex);
198
199 /* if !XPC_CHANNEL_REGISTERED(ch_number) */
200 if (registration->func == NULL) {
201 mutex_unlock(&registration->mutex);
202 return;
203 }
204
205 /* remove the connection registration for the specified channel */
206 registration->func = NULL;
207 registration->key = NULL;
208 registration->nentries = 0;
209 registration->msg_size = 0;
210 registration->assigned_limit = 0;
211 registration->idle_limit = 0;
212
213 xpc_interface.disconnect(ch_number);
214
215 mutex_unlock(&registration->mutex);
216
217 return;
218}
219EXPORT_SYMBOL_GPL(xpc_disconnect);
220
221int __init
222xp_init(void)
223{
224 int ret, ch_number;
225 u64 func_addr = *(u64 *)xp_nofault_PIOR;
226 u64 err_func_addr = *(u64 *)xp_error_PIOR;
227
228 if (!ia64_platform_is("sn2"))
229 return -ENODEV;
230
231 /*
232 * Register a nofault code region which performs a cross-partition
233 * PIO read. If the PIO read times out, the MCA handler will consume
234 * the error and return to a kernel-provided instruction to indicate
235 * an error. This PIO read exists because it is guaranteed to timeout
236 * if the destination is down (AMO operations do not timeout on at
237 * least some CPUs on Shubs <= v1.2, which unfortunately we have to
238 * work around).
239 */
240 ret = sn_register_nofault_code(func_addr, err_func_addr, err_func_addr,
241 1, 1);
242 if (ret != 0) {
243 printk(KERN_ERR "XP: can't register nofault code, error=%d\n",
244 ret);
245 }
246 /*
247 * Setup the nofault PIO read target. (There is no special reason why
248 * SH_IPI_ACCESS was selected.)
249 */
250 if (is_shub2())
251 xp_nofault_PIOR_target = SH2_IPI_ACCESS0;
252 else
253 xp_nofault_PIOR_target = SH1_IPI_ACCESS;
254
255 /* initialize the connection registration mutex */
256 for (ch_number = 0; ch_number < XPC_NCHANNELS; ch_number++)
257 mutex_init(&xpc_registrations[ch_number].mutex);
258
259 return 0;
260}
261
262module_init(xp_init);
263
264void __exit
265xp_exit(void)
266{
267 u64 func_addr = *(u64 *)xp_nofault_PIOR;
268 u64 err_func_addr = *(u64 *)xp_error_PIOR;
269
270 /* unregister the PIO read nofault code region */
271 (void)sn_register_nofault_code(func_addr, err_func_addr,
272 err_func_addr, 1, 0);
273}
274
275module_exit(xp_exit);
276
277MODULE_AUTHOR("Silicon Graphics, Inc.");
278MODULE_DESCRIPTION("Cross Partition (XP) base");
279MODULE_LICENSE("GPL");
diff --git a/drivers/misc/sgi-xp/xp_nofault.S b/drivers/misc/sgi-xp/xp_nofault.S
new file mode 100644
index 000000000000..e38d43319429
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp_nofault.S
@@ -0,0 +1,35 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9/*
10 * The xp_nofault_PIOR function takes a pointer to a remote PIO register
11 * and attempts to load and consume a value from it. This function
12 * will be registered as a nofault code block. In the event that the
13 * PIO read fails, the MCA handler will force the error to look
14 * corrected and vector to the xp_error_PIOR which will return an error.
15 *
16 * The definition of "consumption" and the time it takes for an MCA
17 * to surface is processor implementation specific. This code
18 * is sufficient on Itanium through the Montvale processor family.
19 * It may need to be adjusted for future processor implementations.
20 *
21 * extern int xp_nofault_PIOR(void *remote_register);
22 */
23
24 .global xp_nofault_PIOR
25xp_nofault_PIOR:
26 mov r8=r0 // Stage a success return value
27 ld8.acq r9=[r32];; // PIO Read the specified register
28 adds r9=1,r9;; // Add to force consumption
29 srlz.i;; // Allow time for MCA to surface
30 br.ret.sptk.many b0;; // Return success
31
32 .global xp_error_PIOR
33xp_error_PIOR:
34 mov r8=1 // Return value of 1
35 br.ret.sptk.many b0;; // Return failure
diff --git a/drivers/misc/sgi-xp/xpc.h b/drivers/misc/sgi-xp/xpc.h
new file mode 100644
index 000000000000..9eb6d4a3269c
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc.h
@@ -0,0 +1,1187 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9/*
10 * Cross Partition Communication (XPC) structures and macros.
11 */
12
13#ifndef _DRIVERS_MISC_SGIXP_XPC_H
14#define _DRIVERS_MISC_SGIXP_XPC_H
15
16#include <linux/interrupt.h>
17#include <linux/sysctl.h>
18#include <linux/device.h>
19#include <linux/mutex.h>
20#include <linux/completion.h>
21#include <asm/pgtable.h>
22#include <asm/processor.h>
23#include <asm/sn/bte.h>
24#include <asm/sn/clksupport.h>
25#include <asm/sn/addrs.h>
26#include <asm/sn/mspec.h>
27#include <asm/sn/shub_mmr.h>
28#include "xp.h"
29
30/*
31 * XPC Version numbers consist of a major and minor number. XPC can always
32 * talk to versions with same major #, and never talk to versions with a
33 * different major #.
34 */
35#define _XPC_VERSION(_maj, _min) (((_maj) << 4) | ((_min) & 0xf))
36#define XPC_VERSION_MAJOR(_v) ((_v) >> 4)
37#define XPC_VERSION_MINOR(_v) ((_v) & 0xf)
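
As a short illustration of the rule stated above (same major number can talk, different major number cannot), a hypothetical compatibility check might look like:

/* illustrative only; not part of the patch */
static inline int
example_versions_compatible(u8 local_version, u8 remote_version)
{
	return XPC_VERSION_MAJOR(local_version) ==
	       XPC_VERSION_MAJOR(remote_version);
}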
38
39/*
40 * The next macros define word or bit representations for given
41 * C-brick nasid in either the SAL provided bit array representing
42 * nasids in the partition/machine or the AMO_t array used for
43 * inter-partition initiation communications.
44 *
 45 * For SN2 machines, C-Bricks are always even numbered NASIDs. As
46 * such, some space will be saved by insisting that nasid information
47 * passed from SAL always be packed for C-Bricks and the
48 * cross-partition interrupts use the same packing scheme.
49 */
50#define XPC_NASID_W_INDEX(_n) (((_n) / 64) / 2)
51#define XPC_NASID_B_INDEX(_n) (((_n) / 2) & (64 - 1))
52#define XPC_NASID_IN_ARRAY(_n, _p) ((_p)[XPC_NASID_W_INDEX(_n)] & \
53 (1UL << XPC_NASID_B_INDEX(_n)))
54#define XPC_NASID_FROM_W_B(_w, _b) (((_w) * 64 + (_b)) * 2)
55
56#define XPC_HB_DEFAULT_INTERVAL 5 /* incr HB every x secs */
57#define XPC_HB_CHECK_DEFAULT_INTERVAL 20 /* check HB every x secs */
58
59/* define the process name of HB checker and the CPU it is pinned to */
60#define XPC_HB_CHECK_THREAD_NAME "xpc_hb"
61#define XPC_HB_CHECK_CPU 0
62
63/* define the process name of the discovery thread */
64#define XPC_DISCOVERY_THREAD_NAME "xpc_discovery"
65
66/*
67 * the reserved page
68 *
69 * SAL reserves one page of memory per partition for XPC. Though a full page
70 * in length (16384 bytes), its starting address is not page aligned, but it
71 * is cacheline aligned. The reserved page consists of the following:
72 *
73 * reserved page header
74 *
75 * The first cacheline of the reserved page contains the header
76 * (struct xpc_rsvd_page). Before SAL initialization has completed,
77 * SAL has set up the following fields of the reserved page header:
78 * SAL_signature, SAL_version, partid, and nasids_size. The other
79 * fields are set up by XPC. (xpc_rsvd_page points to the local
80 * partition's reserved page.)
81 *
82 * part_nasids mask
83 * mach_nasids mask
84 *
85 * SAL also sets up two bitmaps (or masks), one that reflects the actual
86 * nasids in this partition (part_nasids), and the other that reflects
87 * the actual nasids in the entire machine (mach_nasids). We're only
88 * interested in the even numbered nasids (which contain the processors
89 * and/or memory), so we only need half as many bits to represent the
90 * nasids. The part_nasids mask is located starting at the first cacheline
91 * following the reserved page header. The mach_nasids mask follows right
92 * after the part_nasids mask. The size in bytes of each mask is reflected
93 * by the reserved page header field 'nasids_size'. (Local partition's
94 * mask pointers are xpc_part_nasids and xpc_mach_nasids.)
95 *
96 * vars
97 * vars part
98 *
99 * Immediately following the mach_nasids mask are the XPC variables
100 * required by other partitions. First are those that are generic to all
101 * partitions (vars), followed on the next available cacheline by those
102 * which are partition specific (vars part). These are setup by XPC.
103 * (Local partition's vars pointers are xpc_vars and xpc_vars_part.)
104 *
105 * Note: Until vars_pa is set, the partition XPC code has not been initialized.
106 */
107struct xpc_rsvd_page {
108 u64 SAL_signature; /* SAL: unique signature */
109 u64 SAL_version; /* SAL: version */
110 u8 partid; /* SAL: partition ID */
111 u8 version;
112 u8 pad1[6]; /* align to next u64 in cacheline */
113 u64 vars_pa; /* physical address of struct xpc_vars */
114 struct timespec stamp; /* time when reserved page was setup by XPC */
115 u64 pad2[9]; /* align to last u64 in cacheline */
116 u64 nasids_size; /* SAL: size of each nasid mask in bytes */
117};
118
119#define XPC_RP_VERSION _XPC_VERSION(1, 1) /* version 1.1 of the reserved page */
120
121#define XPC_SUPPORTS_RP_STAMP(_version) \
122 (_version >= _XPC_VERSION(1, 1))
123
124/*
125 * compare stamps - the return value is:
126 *
127 * < 0, if stamp1 < stamp2
128 * = 0, if stamp1 == stamp2
129 * > 0, if stamp1 > stamp2
130 */
131static inline int
132xpc_compare_stamps(struct timespec *stamp1, struct timespec *stamp2)
133{
134 int ret;
135
136 ret = stamp1->tv_sec - stamp2->tv_sec;
137 if (ret == 0)
138 ret = stamp1->tv_nsec - stamp2->tv_nsec;
139
140 return ret;
141}
142
143/*
144 * Define the structures by which XPC variables can be exported to other
145 * partitions. (There are two: struct xpc_vars and struct xpc_vars_part)
146 */
147
148/*
149 * The following structure describes the partition generic variables
150 * needed by other partitions in order to properly initialize.
151 *
152 * struct xpc_vars version number also applies to struct xpc_vars_part.
153 * Changes to either structure and/or related functionality should be
154 * reflected by incrementing either the major or minor version numbers
155 * of struct xpc_vars.
156 */
157struct xpc_vars {
158 u8 version;
159 u64 heartbeat;
160 u64 heartbeating_to_mask;
161 u64 heartbeat_offline; /* if 0, heartbeat should be changing */
162 int act_nasid;
163 int act_phys_cpuid;
164 u64 vars_part_pa;
165 u64 amos_page_pa; /* paddr of page of AMOs from MSPEC driver */
166 AMO_t *amos_page; /* vaddr of page of AMOs from MSPEC driver */
167};
168
169#define XPC_V_VERSION _XPC_VERSION(3, 1) /* version 3.1 of the cross vars */
170
171#define XPC_SUPPORTS_DISENGAGE_REQUEST(_version) \
172 (_version >= _XPC_VERSION(3, 1))
173
174static inline int
175xpc_hb_allowed(partid_t partid, struct xpc_vars *vars)
176{
177 return ((vars->heartbeating_to_mask & (1UL << partid)) != 0);
178}
179
180static inline void
181xpc_allow_hb(partid_t partid, struct xpc_vars *vars)
182{
183 u64 old_mask, new_mask;
184
185 do {
186 old_mask = vars->heartbeating_to_mask;
187 new_mask = (old_mask | (1UL << partid));
188 } while (cmpxchg(&vars->heartbeating_to_mask, old_mask, new_mask) !=
189 old_mask);
190}
191
192static inline void
193xpc_disallow_hb(partid_t partid, struct xpc_vars *vars)
194{
195 u64 old_mask, new_mask;
196
197 do {
198 old_mask = vars->heartbeating_to_mask;
199 new_mask = (old_mask & ~(1UL << partid));
200 } while (cmpxchg(&vars->heartbeating_to_mask, old_mask, new_mask) !=
201 old_mask);
202}
203
204/*
205 * The AMOs page consists of a number of AMO variables which are divided into
206 * four groups. The first two groups are used to identify an IRQ's sender.
207 * These two groups consist of 64 and 128 AMO variables respectively. The last
208 * two groups, consisting of just one AMO variable each, are used to identify
209 * the remote partitions that are currently engaged (from the viewpoint of
210 * the XPC running on the remote partition).
211 */
212#define XPC_NOTIFY_IRQ_AMOS 0
213#define XPC_ACTIVATE_IRQ_AMOS (XPC_NOTIFY_IRQ_AMOS + XP_MAX_PARTITIONS)
214#define XPC_ENGAGED_PARTITIONS_AMO (XPC_ACTIVATE_IRQ_AMOS + XP_NASID_MASK_WORDS)
215#define XPC_DISENGAGE_REQUEST_AMO (XPC_ENGAGED_PARTITIONS_AMO + 1)
216
217/*
218 * The following structure describes the per partition specific variables.
219 *
220 * An array of these structures, one per partition, will be defined. As a
221 * partition becomes active XPC will copy the array entry corresponding to
222 * itself from that partition. It is desirable that the size of this
223 * structure evenly divide into a cacheline, such that none of the entries
224 * in this array crosses a cacheline boundary. As it is now, each entry
225 * occupies half a cacheline.
226 */
227struct xpc_vars_part {
228 u64 magic;
229
230 u64 openclose_args_pa; /* physical address of open and close args */
231 u64 GPs_pa; /* physical address of Get/Put values */
232
233 u64 IPI_amo_pa; /* physical address of IPI AMO_t structure */
234 int IPI_nasid; /* nasid of where to send IPIs */
235 int IPI_phys_cpuid; /* physical CPU ID of where to send IPIs */
236
237 u8 nchannels; /* #of defined channels supported */
238
239 u8 reserved[23]; /* pad to a full 64 bytes */
240};
241
242/*
243 * The vars_part MAGIC numbers play a part in the first contact protocol.
244 *
245 * MAGIC1 indicates that the per partition specific variables for a remote
246 * partition have been initialized by this partition.
247 *
248 * MAGIC2 indicates that this partition has pulled the remote partition's
249 * per partition variables that pertain to this partition.
250 */
251#define XPC_VP_MAGIC1 0x0053524156435058L /* 'XPCVARS\0'L (little endian) */
252#define XPC_VP_MAGIC2 0x0073726176435058L /* 'XPCvars\0'L (little endian) */
253
254/* the reserved page sizes and offsets */
255
256#define XPC_RP_HEADER_SIZE L1_CACHE_ALIGN(sizeof(struct xpc_rsvd_page))
257#define XPC_RP_VARS_SIZE L1_CACHE_ALIGN(sizeof(struct xpc_vars))
258
259#define XPC_RP_PART_NASIDS(_rp) ((u64 *)((u8 *)(_rp) + XPC_RP_HEADER_SIZE))
260#define XPC_RP_MACH_NASIDS(_rp) (XPC_RP_PART_NASIDS(_rp) + xp_nasid_mask_words)
261#define XPC_RP_VARS(_rp) ((struct xpc_vars *)(XPC_RP_MACH_NASIDS(_rp) + \
262 xp_nasid_mask_words))
263#define XPC_RP_VARS_PART(_rp) ((struct xpc_vars_part *) \
264 ((u8 *)XPC_RP_VARS(_rp) + XPC_RP_VARS_SIZE))
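
A minimal sketch of walking a reserved page with the offset macros above; the helper is hypothetical and simply names each region in layout order:

static inline struct xpc_vars_part *
example_rp_layout(struct xpc_rsvd_page *rp)
{
	u64 *part_nasids = XPC_RP_PART_NASIDS(rp);	/* this partition's nasids */
	u64 *mach_nasids = XPC_RP_MACH_NASIDS(rp);	/* whole machine's nasids */
	struct xpc_vars *vars = XPC_RP_VARS(rp);	/* generic vars */

	(void)part_nasids;
	(void)mach_nasids;
	(void)vars;

	/* the partition-specific vars array follows the generic vars */
	return XPC_RP_VARS_PART(rp);
}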
265
266/*
267 * Functions registered by add_timer() or called by kernel_thread() only
268 * allow for a single 64-bit argument. The following macros can be used to
269 * pack and unpack two (32-bit, 16-bit or 8-bit) arguments into or out from
270 * the passed argument.
271 */
272#define XPC_PACK_ARGS(_arg1, _arg2) \
273 ((((u64) _arg1) & 0xffffffff) | \
274 ((((u64) _arg2) & 0xffffffff) << 32))
275
276#define XPC_UNPACK_ARG1(_args) (((u64) _args) & 0xffffffff)
277#define XPC_UNPACK_ARG2(_args) ((((u64) _args) >> 32) & 0xffffffff)
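
For example, a partid and channel# can ride through a single timer argument; the timer function below is hypothetical and only shows the pack/unpack pairing:

/* armed with:  timer->data = XPC_PACK_ARGS(partid, ch_number); */
static void
example_timer_func(unsigned long data)
{
	partid_t partid = (partid_t)XPC_UNPACK_ARG1(data);
	int ch_number = (int)XPC_UNPACK_ARG2(data);

	/* ... act on (partid, ch_number) ... */
	(void)partid;
	(void)ch_number;
}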
278
279/*
280 * Define a Get/Put value pair (pointers) used with a message queue.
281 */
282struct xpc_gp {
283 s64 get; /* Get value */
284 s64 put; /* Put value */
285};
286
287#define XPC_GP_SIZE \
288 L1_CACHE_ALIGN(sizeof(struct xpc_gp) * XPC_NCHANNELS)
289
290/*
291 * Define a structure that contains arguments associated with opening and
292 * closing a channel.
293 */
294struct xpc_openclose_args {
295 u16 reason; /* reason why channel is closing */
296 u16 msg_size; /* sizeof each message entry */
297 u16 remote_nentries; /* #of message entries in remote msg queue */
298 u16 local_nentries; /* #of message entries in local msg queue */
299 u64 local_msgqueue_pa; /* physical address of local message queue */
300};
301
302#define XPC_OPENCLOSE_ARGS_SIZE \
303 L1_CACHE_ALIGN(sizeof(struct xpc_openclose_args) * XPC_NCHANNELS)
304
305/* struct xpc_msg flags */
306
307#define XPC_M_DONE 0x01 /* msg has been received/consumed */
308#define XPC_M_READY 0x02 /* msg is ready to be sent */
309#define XPC_M_INTERRUPT 0x04 /* send interrupt when msg consumed */
310
311#define XPC_MSG_ADDRESS(_payload) \
312 ((struct xpc_msg *)((u8 *)(_payload) - XPC_MSG_PAYLOAD_OFFSET))
313
314/*
315 * Defines notify entry.
316 *
317 * This is used to notify a message's sender that their message was received
318 * and consumed by the intended recipient.
319 */
320struct xpc_notify {
321 u8 type; /* type of notification */
322
323 /* the following two fields are only used if type == XPC_N_CALL */
324 xpc_notify_func func; /* user's notify function */
325 void *key; /* pointer to user's key */
326};
327
328/* struct xpc_notify type of notification */
329
330#define XPC_N_CALL 0x01 /* notify function provided by user */
331
332/*
333 * Define the structure that manages all the stuff required by a channel. In
334 * particular, they are used to manage the messages sent across the channel.
335 *
336 * This structure is private to a partition, and is NOT shared across the
337 * partition boundary.
338 *
339 * There is an array of these structures for each remote partition. It is
340 * allocated at the time a partition becomes active. The array contains one
341 * of these structures for each potential channel connection to that partition.
342 *
343 * Each of these structures manages two message queues (circular buffers).
344 * They are allocated at the time a channel connection is made. One of
345 * these message queues (local_msgqueue) holds the locally created messages
346 * that are destined for the remote partition. The other of these message
347 * queues (remote_msgqueue) is a locally cached copy of the remote partition's
348 * own local_msgqueue.
349 *
350 * The following is a description of the Get/Put pointers used to manage these
351 * two message queues. Consider the local_msgqueue to be on one partition
352 * and the remote_msgqueue to be its cached copy on another partition. A
353 * description of what each of the lettered areas contains is included.
354 *
355 *
356 * local_msgqueue remote_msgqueue
357 *
358 * |/////////| |/////////|
359 * w_remote_GP.get --> +---------+ |/////////|
360 * | F | |/////////|
361 * remote_GP.get --> +---------+ +---------+ <-- local_GP->get
362 * | | | |
363 * | | | E |
364 * | | | |
365 * | | +---------+ <-- w_local_GP.get
366 * | B | |/////////|
367 * | | |////D////|
368 * | | |/////////|
369 * | | +---------+ <-- w_remote_GP.put
370 * | | |////C////|
371 * local_GP->put --> +---------+ +---------+ <-- remote_GP.put
372 * | | |/////////|
373 * | A | |/////////|
374 * | | |/////////|
375 * w_local_GP.put --> +---------+ |/////////|
376 * |/////////| |/////////|
377 *
378 *
379 * ( remote_GP.[get|put] are cached copies of the remote
380 * partition's local_GP->[get|put], and thus their values can
381 * lag behind their counterparts on the remote partition. )
382 *
383 *
384 * A - Messages that have been allocated, but have not yet been sent to the
385 * remote partition.
386 *
387 * B - Messages that have been sent, but have not yet been acknowledged by the
388 * remote partition as having been received.
389 *
390 * C - Area that needs to be prepared for the copying of sent messages, by
391 * the clearing of the message flags of any previously received messages.
392 *
393 * D - Area into which sent messages are to be copied from the remote
394 * partition's local_msgqueue and then delivered to their intended
395 * recipients. [ To allow for a multi-message copy, another pointer
396 * (next_msg_to_pull) has been added to keep track of the next message
397 * number needing to be copied (pulled). It chases after w_remote_GP.put.
398 * Any messages lying between w_local_GP.get and next_msg_to_pull have
399 * been copied and are ready to be delivered. ]
400 *
401 * E - Messages that have been copied and delivered, but have not yet been
402 * acknowledged by the recipient as having been received.
403 *
404 * F - Messages that have been acknowledged, but XPC has not yet notified the
405 * sender that the message was received by its intended recipient.
406 * This is also an area that needs to be prepared for the allocating of
407 * new messages, by the clearing of the message flags of the acknowledged
408 * messages.
409 */
410struct xpc_channel {
411 partid_t partid; /* ID of remote partition connected */
412 spinlock_t lock; /* lock for updating this structure */
413 u32 flags; /* general flags */
414
415 enum xpc_retval reason; /* reason why channel is disconnect'g */
416 int reason_line; /* line# disconnect initiated from */
417
418 u16 number; /* channel # */
419
420 u16 msg_size; /* sizeof each msg entry */
421 u16 local_nentries; /* #of msg entries in local msg queue */
422 u16 remote_nentries; /* #of msg entries in remote msg queue */
423
424 void *local_msgqueue_base; /* base address of kmalloc'd space */
425 struct xpc_msg *local_msgqueue; /* local message queue */
426 void *remote_msgqueue_base; /* base address of kmalloc'd space */
427 struct xpc_msg *remote_msgqueue; /* cached copy of remote partition's */
428 /* local message queue */
429 u64 remote_msgqueue_pa; /* phys addr of remote partition's */
430 /* local message queue */
431
432 atomic_t references; /* #of external references to queues */
433
434 atomic_t n_on_msg_allocate_wq; /* #on msg allocation wait queue */
435 wait_queue_head_t msg_allocate_wq; /* msg allocation wait queue */
436
437 u8 delayed_IPI_flags; /* IPI flags received, but delayed */
438 /* action until channel disconnected */
439
440 /* queue of msg senders who want to be notified when msg received */
441
442 atomic_t n_to_notify; /* #of msg senders to notify */
443 struct xpc_notify *notify_queue; /* notify queue for messages sent */
444
445 xpc_channel_func func; /* user's channel function */
446 void *key; /* pointer to user's key */
447
448 struct mutex msg_to_pull_mutex; /* next msg to pull serialization */
449 struct completion wdisconnect_wait; /* wait for channel disconnect */
450
451 struct xpc_openclose_args *local_openclose_args; /* args passed on */
452 /* opening or closing of channel */
453
454 /* various flavors of local and remote Get/Put values */
455
456 struct xpc_gp *local_GP; /* local Get/Put values */
457 struct xpc_gp remote_GP; /* remote Get/Put values */
458 struct xpc_gp w_local_GP; /* working local Get/Put values */
459 struct xpc_gp w_remote_GP; /* working remote Get/Put values */
460 s64 next_msg_to_pull; /* Put value of next msg to pull */
461
462 /* kthread management related fields */
463
464 atomic_t kthreads_assigned; /* #of kthreads assigned to channel */
465 u32 kthreads_assigned_limit; /* limit on #of kthreads assigned */
466 atomic_t kthreads_idle; /* #of kthreads idle waiting for work */
467 u32 kthreads_idle_limit; /* limit on #of kthreads idle */
468 atomic_t kthreads_active; /* #of kthreads actively working */
469
470 wait_queue_head_t idle_wq; /* idle kthread wait queue */
471
472} ____cacheline_aligned;
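
To make the Get/Put diagram above concrete, here is a rough sketch (an assumption about how the queues are indexed, not code taken from this patch) of how a monotonically increasing Get/Put message number maps to a slot in the local circular message queue:

/* illustrative: message numbers grow without wrapping; the queue slot is
 * the number modulo the #of entries, each entry ch->msg_size bytes long */
static inline struct xpc_msg *
example_local_msg_slot(struct xpc_channel *ch, s64 msg_number)
{
	return (struct xpc_msg *)((u8 *)ch->local_msgqueue +
			(msg_number % ch->local_nentries) * ch->msg_size);
}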
473
474/* struct xpc_channel flags */
475
476#define XPC_C_WASCONNECTED 0x00000001 /* channel was connected */
477
478#define XPC_C_ROPENREPLY 0x00000002 /* remote open channel reply */
479#define XPC_C_OPENREPLY 0x00000004 /* local open channel reply */
480#define XPC_C_ROPENREQUEST 0x00000008 /* remote open channel request */
481#define XPC_C_OPENREQUEST 0x00000010 /* local open channel request */
482
483#define XPC_C_SETUP 0x00000020 /* channel's msgqueues are alloc'd */
484#define XPC_C_CONNECTEDCALLOUT 0x00000040 /* connected callout initiated */
485#define XPC_C_CONNECTEDCALLOUT_MADE \
486 0x00000080 /* connected callout completed */
487#define XPC_C_CONNECTED 0x00000100 /* local channel is connected */
488#define XPC_C_CONNECTING 0x00000200 /* channel is being connected */
489
490#define XPC_C_RCLOSEREPLY 0x00000400 /* remote close channel reply */
491#define XPC_C_CLOSEREPLY 0x00000800 /* local close channel reply */
492#define XPC_C_RCLOSEREQUEST 0x00001000 /* remote close channel request */
493#define XPC_C_CLOSEREQUEST 0x00002000 /* local close channel request */
494
495#define XPC_C_DISCONNECTED 0x00004000 /* channel is disconnected */
496#define XPC_C_DISCONNECTING 0x00008000 /* channel is being disconnected */
497#define XPC_C_DISCONNECTINGCALLOUT \
498 0x00010000 /* disconnecting callout initiated */
499#define XPC_C_DISCONNECTINGCALLOUT_MADE \
500 0x00020000 /* disconnecting callout completed */
501#define XPC_C_WDISCONNECT 0x00040000 /* waiting for channel disconnect */
502
503/*
504 * Manages channels on a partition basis. There is one of these structures
505 * for each partition (a partition will never utilize the structure that
506 * represents itself).
507 */
508struct xpc_partition {
509
510 /* XPC HB infrastructure */
511
512 u8 remote_rp_version; /* version# of partition's rsvd pg */
513 struct timespec remote_rp_stamp; /* time when rsvd pg was initialized */
514 u64 remote_rp_pa; /* phys addr of partition's rsvd pg */
515 u64 remote_vars_pa; /* phys addr of partition's vars */
516 u64 remote_vars_part_pa; /* phys addr of partition's vars part */
517 u64 last_heartbeat; /* HB at last read */
518 u64 remote_amos_page_pa; /* phys addr of partition's amos page */
519 int remote_act_nasid; /* active part's act/deact nasid */
520 int remote_act_phys_cpuid; /* active part's act/deact phys cpuid */
521 u32 act_IRQ_rcvd; /* IRQs since activation */
522 spinlock_t act_lock; /* protect updating of act_state */
523 u8 act_state; /* from XPC HB viewpoint */
524 u8 remote_vars_version; /* version# of partition's vars */
525 enum xpc_retval reason; /* reason partition is deactivating */
526 int reason_line; /* line# deactivation initiated from */
527 int reactivate_nasid; /* nasid in partition to reactivate */
528
529 unsigned long disengage_request_timeout; /* timeout in jiffies */
530 struct timer_list disengage_request_timer;
531
532 /* XPC infrastructure referencing and teardown control */
533
534 u8 setup_state; /* infrastructure setup state */
535 wait_queue_head_t teardown_wq; /* kthread waiting to teardown infra */
536 atomic_t references; /* #of references to infrastructure */
537
538 /*
539 * NONE OF THE PRECEDING FIELDS OF THIS STRUCTURE WILL BE CLEARED WHEN
540 * XPC SETS UP THE NECESSARY INFRASTRUCTURE TO SUPPORT CROSS PARTITION
541 * COMMUNICATION. ALL OF THE FOLLOWING FIELDS WILL BE CLEARED. (THE
542 * 'nchannels' FIELD MUST BE THE FIRST OF THE FIELDS TO BE CLEARED.)
543 */
544
545 u8 nchannels; /* #of defined channels supported */
546 atomic_t nchannels_active; /* #of channels that are not DISCONNECTED */
547 atomic_t nchannels_engaged; /* #of channels engaged with remote part */
548 struct xpc_channel *channels; /* array of channel structures */
549
550 void *local_GPs_base; /* base address of kmalloc'd space */
551 struct xpc_gp *local_GPs; /* local Get/Put values */
552 void *remote_GPs_base; /* base address of kmalloc'd space */
553 struct xpc_gp *remote_GPs; /* copy of remote partition's local */
554 /* Get/Put values */
555 u64 remote_GPs_pa; /* phys address of remote partition's local */
556 /* Get/Put values */
557
558 /* fields used to pass args when opening or closing a channel */
559
560 void *local_openclose_args_base; /* base address of kmalloc'd space */
561 struct xpc_openclose_args *local_openclose_args; /* local's args */
562 void *remote_openclose_args_base; /* base address of kmalloc'd space */
563 struct xpc_openclose_args *remote_openclose_args; /* copy of remote's */
564 /* args */
565 u64 remote_openclose_args_pa; /* phys addr of remote's args */
566
567 /* IPI sending, receiving and handling related fields */
568
569 int remote_IPI_nasid; /* nasid of where to send IPIs */
570 int remote_IPI_phys_cpuid; /* phys CPU ID of where to send IPIs */
571 AMO_t *remote_IPI_amo_va; /* address of remote IPI AMO_t structure */
572
573 AMO_t *local_IPI_amo_va; /* address of IPI AMO_t structure */
574 u64 local_IPI_amo; /* IPI amo flags yet to be handled */
575 char IPI_owner[8]; /* IPI owner's name */
576 struct timer_list dropped_IPI_timer; /* dropped IPI timer */
577
578 spinlock_t IPI_lock; /* IPI handler lock */
579
580 /* channel manager related fields */
581
582 atomic_t channel_mgr_requests; /* #of requests to activate chan mgr */
583 wait_queue_head_t channel_mgr_wq; /* channel mgr's wait queue */
584
585} ____cacheline_aligned;
586
587/* struct xpc_partition act_state values (for XPC HB) */
588
589#define XPC_P_INACTIVE 0x00 /* partition is not active */
590#define XPC_P_ACTIVATION_REQ 0x01 /* created thread to activate */
591#define XPC_P_ACTIVATING 0x02 /* activation thread started */
592#define XPC_P_ACTIVE 0x03 /* xpc_partition_up() was called */
593#define XPC_P_DEACTIVATING 0x04 /* partition deactivation initiated */
594
595#define XPC_DEACTIVATE_PARTITION(_p, _reason) \
596 xpc_deactivate_partition(__LINE__, (_p), (_reason))
597
598/* struct xpc_partition setup_state values */
599
600#define XPC_P_UNSET 0x00 /* infrastructure was never setup */
601#define XPC_P_SETUP 0x01 /* infrastructure is setup */
602#define XPC_P_WTEARDOWN 0x02 /* waiting to teardown infrastructure */
603#define XPC_P_TORNDOWN 0x03 /* infrastructure is torn down */
604
605/*
606 * Interval (in jiffies, i.e. 0.25 seconds) that struct xpc_partition's
607 * IPI_timer waits before checking for dropped IPIs. These occur whenever an
608 * IPI AMO write doesn't complete until after the IPI was received.
609 */
610#define XPC_P_DROPPED_IPI_WAIT (0.25 * HZ)
611
612/* number of seconds to wait for other partitions to disengage */
613#define XPC_DISENGAGE_REQUEST_DEFAULT_TIMELIMIT 90
614
615/* interval in seconds to print 'waiting disengagement' messages */
616#define XPC_DISENGAGE_PRINTMSG_INTERVAL 10
617
618#define XPC_PARTID(_p) ((partid_t) ((_p) - &xpc_partitions[0]))
619
620/* found in xp_main.c */
621extern struct xpc_registration xpc_registrations[];
622
623/* found in xpc_main.c */
624extern struct device *xpc_part;
625extern struct device *xpc_chan;
626extern int xpc_disengage_request_timelimit;
627extern int xpc_disengage_request_timedout;
628extern irqreturn_t xpc_notify_IRQ_handler(int, void *);
629extern void xpc_dropped_IPI_check(struct xpc_partition *);
630extern void xpc_activate_partition(struct xpc_partition *);
631extern void xpc_activate_kthreads(struct xpc_channel *, int);
632extern void xpc_create_kthreads(struct xpc_channel *, int, int);
633extern void xpc_disconnect_wait(int);
634
635/* found in xpc_partition.c */
636extern int xpc_exiting;
637extern struct xpc_vars *xpc_vars;
638extern struct xpc_rsvd_page *xpc_rsvd_page;
639extern struct xpc_vars_part *xpc_vars_part;
640extern struct xpc_partition xpc_partitions[XP_MAX_PARTITIONS + 1];
641extern char *xpc_remote_copy_buffer;
642extern void *xpc_remote_copy_buffer_base;
643extern void *xpc_kmalloc_cacheline_aligned(size_t, gfp_t, void **);
644extern struct xpc_rsvd_page *xpc_rsvd_page_init(void);
645extern void xpc_allow_IPI_ops(void);
646extern void xpc_restrict_IPI_ops(void);
647extern int xpc_identify_act_IRQ_sender(void);
648extern int xpc_partition_disengaged(struct xpc_partition *);
649extern enum xpc_retval xpc_mark_partition_active(struct xpc_partition *);
650extern void xpc_mark_partition_inactive(struct xpc_partition *);
651extern void xpc_discovery(void);
652extern void xpc_check_remote_hb(void);
653extern void xpc_deactivate_partition(const int, struct xpc_partition *,
654 enum xpc_retval);
655extern enum xpc_retval xpc_initiate_partid_to_nasids(partid_t, void *);
656
657/* found in xpc_channel.c */
658extern void xpc_initiate_connect(int);
659extern void xpc_initiate_disconnect(int);
660extern enum xpc_retval xpc_initiate_allocate(partid_t, int, u32, void **);
661extern enum xpc_retval xpc_initiate_send(partid_t, int, void *);
662extern enum xpc_retval xpc_initiate_send_notify(partid_t, int, void *,
663 xpc_notify_func, void *);
664extern void xpc_initiate_received(partid_t, int, void *);
665extern enum xpc_retval xpc_setup_infrastructure(struct xpc_partition *);
666extern enum xpc_retval xpc_pull_remote_vars_part(struct xpc_partition *);
667extern void xpc_process_channel_activity(struct xpc_partition *);
668extern void xpc_connected_callout(struct xpc_channel *);
669extern void xpc_deliver_msg(struct xpc_channel *);
670extern void xpc_disconnect_channel(const int, struct xpc_channel *,
671 enum xpc_retval, unsigned long *);
672extern void xpc_disconnect_callout(struct xpc_channel *, enum xpc_retval);
673extern void xpc_partition_going_down(struct xpc_partition *, enum xpc_retval);
674extern void xpc_teardown_infrastructure(struct xpc_partition *);
675
676static inline void
677xpc_wakeup_channel_mgr(struct xpc_partition *part)
678{
679 if (atomic_inc_return(&part->channel_mgr_requests) == 1)
680 wake_up(&part->channel_mgr_wq);
681}
682
683/*
684 * These next two inlines are used to keep us from tearing down a channel's
685 * msg queues while a thread may be referencing them.
686 */
687static inline void
688xpc_msgqueue_ref(struct xpc_channel *ch)
689{
690 atomic_inc(&ch->references);
691}
692
693static inline void
694xpc_msgqueue_deref(struct xpc_channel *ch)
695{
696 s32 refs = atomic_dec_return(&ch->references);
697
698 DBUG_ON(refs < 0);
699 if (refs == 0)
700 xpc_wakeup_channel_mgr(&xpc_partitions[ch->partid]);
701}
702
703#define XPC_DISCONNECT_CHANNEL(_ch, _reason, _irqflgs) \
704 xpc_disconnect_channel(__LINE__, _ch, _reason, _irqflgs)
705
706/*
707 * These two inlines are used to keep us from tearing down a partition's
708 * setup infrastructure while a thread may be referencing it.
709 */
710static inline void
711xpc_part_deref(struct xpc_partition *part)
712{
713 s32 refs = atomic_dec_return(&part->references);
714
715 DBUG_ON(refs < 0);
716 if (refs == 0 && part->setup_state == XPC_P_WTEARDOWN)
717 wake_up(&part->teardown_wq);
718}
719
720static inline int
721xpc_part_ref(struct xpc_partition *part)
722{
723 int setup;
724
725 atomic_inc(&part->references);
726 setup = (part->setup_state == XPC_P_SETUP);
727 if (!setup)
728 xpc_part_deref(part);
729
730 return setup;
731}
732
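
/*
 * Illustrative sketch, not part of this patch: the reference/teardown
 * protocol a caller is expected to follow before touching a partition's
 * setup infrastructure. xpc_example_use_partition() is a hypothetical
 * caller added only to show the pairing of the two inlines above.
 */
static inline enum xpc_retval
xpc_example_use_partition(struct xpc_partition *part)
{
	enum xpc_retval ret;

	if (!xpc_part_ref(part))	/* fails once teardown has begun */
		return xpcRetry;

	/* safe to reference part->channels, part->local_GPs, etc. here */
	ret = xpcSuccess;

	xpc_part_deref(part);		/* may wake a waiting teardown thread */
	return ret;
}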
733/*
734 * The following macro is to be used for the setting of the reason and
735 * reason_line fields in both the struct xpc_channel and struct xpc_partition
736 * structures.
737 */
738#define XPC_SET_REASON(_p, _reason, _line) \
739 { \
740 (_p)->reason = _reason; \
741 (_p)->reason_line = _line; \
742 }
743
744/*
745 * This next set of inlines are used to keep track of when a partition is
746 * potentially engaged in accessing memory belonging to another partition.
747 */
748
749static inline void
750xpc_mark_partition_engaged(struct xpc_partition *part)
751{
752 unsigned long irq_flags;
753 AMO_t *amo = (AMO_t *)__va(part->remote_amos_page_pa +
754 (XPC_ENGAGED_PARTITIONS_AMO *
755 sizeof(AMO_t)));
756
757 local_irq_save(irq_flags);
758
759 /* set bit corresponding to our partid in remote partition's AMO */
760 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR,
761 (1UL << sn_partition_id));
762 /*
763 * We must always use the nofault function regardless of whether we
764 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
765 * didn't, we'd never know that the other partition is down and would
766 * keep sending IPIs and AMOs to it until the heartbeat times out.
767 */
768 (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
769 variable),
770 xp_nofault_PIOR_target));
771
772 local_irq_restore(irq_flags);
773}
774
775static inline void
776xpc_mark_partition_disengaged(struct xpc_partition *part)
777{
778 unsigned long irq_flags;
779 AMO_t *amo = (AMO_t *)__va(part->remote_amos_page_pa +
780 (XPC_ENGAGED_PARTITIONS_AMO *
781 sizeof(AMO_t)));
782
783 local_irq_save(irq_flags);
784
785 /* clear bit corresponding to our partid in remote partition's AMO */
786 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
787 ~(1UL << sn_partition_id));
788 /*
789 * We must always use the nofault function regardless of whether we
790 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
791 * didn't, we'd never know that the other partition is down and would
792 * keep sending IPIs and AMOs to it until the heartbeat times out.
793 */
794 (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
795 variable),
796 xp_nofault_PIOR_target));
797
798 local_irq_restore(irq_flags);
799}
800
801static inline void
802xpc_request_partition_disengage(struct xpc_partition *part)
803{
804 unsigned long irq_flags;
805 AMO_t *amo = (AMO_t *)__va(part->remote_amos_page_pa +
806 (XPC_DISENGAGE_REQUEST_AMO * sizeof(AMO_t)));
807
808 local_irq_save(irq_flags);
809
810 /* set bit corresponding to our partid in remote partition's AMO */
811 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR,
812 (1UL << sn_partition_id));
813 /*
814 * We must always use the nofault function regardless of whether we
815 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
816 * didn't, we'd never know that the other partition is down and would
817 * keep sending IPIs and AMOs to it until the heartbeat times out.
818 */
819 (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
820 variable),
821 xp_nofault_PIOR_target));
822
823 local_irq_restore(irq_flags);
824}
825
826static inline void
827xpc_cancel_partition_disengage_request(struct xpc_partition *part)
828{
829 unsigned long irq_flags;
830 AMO_t *amo = (AMO_t *)__va(part->remote_amos_page_pa +
831 (XPC_DISENGAGE_REQUEST_AMO * sizeof(AMO_t)));
832
833 local_irq_save(irq_flags);
834
835 /* clear bit corresponding to our partid in remote partition's AMO */
836 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
837 ~(1UL << sn_partition_id));
838 /*
839 * We must always use the nofault function regardless of whether we
840 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
841 * didn't, we'd never know that the other partition is down and would
842 * keep sending IPIs and AMOs to it until the heartbeat times out.
843 */
844 (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
845 variable),
846 xp_nofault_PIOR_target));
847
848 local_irq_restore(irq_flags);
849}
850
851static inline u64
852xpc_partition_engaged(u64 partid_mask)
853{
854 AMO_t *amo = xpc_vars->amos_page + XPC_ENGAGED_PARTITIONS_AMO;
855
856 /* return our partition's AMO variable ANDed with partid_mask */
857 return (FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_LOAD) &
858 partid_mask);
859}
860
861static inline u64
862xpc_partition_disengage_requested(u64 partid_mask)
863{
864 AMO_t *amo = xpc_vars->amos_page + XPC_DISENGAGE_REQUEST_AMO;
865
866 /* return our partition's AMO variable ANDed with partid_mask */
867 return (FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_LOAD) &
868 partid_mask);
869}
870
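
/*
 * Illustrative sketch, not part of this patch: the engaged and
 * disengage-request AMOs are queried with a mask of partid bits, so testing
 * a single partition looks like this (compare the
 * xpc_partition_engaged(1UL << ch->partid) check in xpc_process_disconnect()).
 */
static inline int
xpc_example_partition_still_engaged(partid_t partid)
{
	/* bit 'partid' set => that partition may still access our memory */
	return xpc_partition_engaged(1UL << partid) != 0;
}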
871static inline void
872xpc_clear_partition_engaged(u64 partid_mask)
873{
874 AMO_t *amo = xpc_vars->amos_page + XPC_ENGAGED_PARTITIONS_AMO;
875
876 /* clear bit(s) based on partid_mask in our partition's AMO */
877 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
878 ~partid_mask);
879}
880
881static inline void
882xpc_clear_partition_disengage_request(u64 partid_mask)
883{
884 AMO_t *amo = xpc_vars->amos_page + XPC_DISENGAGE_REQUEST_AMO;
885
886 /* clear bit(s) based on partid_mask in our partition's AMO */
887 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
888 ~partid_mask);
889}
890
891/*
892 * The following set of macros and inlines are used for the sending and
893 * receiving of IPIs (also known as IRQs). There are two flavors of IPIs,
894 * one that is associated with partition activity (SGI_XPC_ACTIVATE) and
895 * the other that is associated with channel activity (SGI_XPC_NOTIFY).
896 */
897
898static inline u64
899xpc_IPI_receive(AMO_t *amo)
900{
901 return FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_CLEAR);
902}
903
904static inline enum xpc_retval
905xpc_IPI_send(AMO_t *amo, u64 flag, int nasid, int phys_cpuid, int vector)
906{
907 int ret = 0;
908 unsigned long irq_flags;
909
910 local_irq_save(irq_flags);
911
912 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR, flag);
913 sn_send_IPI_phys(nasid, phys_cpuid, vector, 0);
914
915 /*
916 * We must always use the nofault function regardless of whether we
917 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
918 * didn't, we'd never know that the other partition is down and would
919 * keep sending IPIs and AMOs to it until the heartbeat times out.
920 */
921 ret = xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->variable),
922 xp_nofault_PIOR_target));
923
924 local_irq_restore(irq_flags);
925
926 return ((ret == 0) ? xpcSuccess : xpcPioReadError);
927}
928
929/*
930 * IPIs associated with SGI_XPC_ACTIVATE IRQ.
931 */
932
933/*
934 * Flag the appropriate AMO variable and send an IPI to the specified node.
935 */
936static inline void
937xpc_activate_IRQ_send(u64 amos_page_pa, int from_nasid, int to_nasid,
938 int to_phys_cpuid)
939{
940 int w_index = XPC_NASID_W_INDEX(from_nasid);
941 int b_index = XPC_NASID_B_INDEX(from_nasid);
942 AMO_t *amos = (AMO_t *)__va(amos_page_pa +
943 (XPC_ACTIVATE_IRQ_AMOS * sizeof(AMO_t)));
944
945 (void)xpc_IPI_send(&amos[w_index], (1UL << b_index), to_nasid,
946 to_phys_cpuid, SGI_XPC_ACTIVATE);
947}
948
949static inline void
950xpc_IPI_send_activate(struct xpc_vars *vars)
951{
952 xpc_activate_IRQ_send(vars->amos_page_pa, cnodeid_to_nasid(0),
953 vars->act_nasid, vars->act_phys_cpuid);
954}
955
956static inline void
957xpc_IPI_send_activated(struct xpc_partition *part)
958{
959 xpc_activate_IRQ_send(part->remote_amos_page_pa, cnodeid_to_nasid(0),
960 part->remote_act_nasid,
961 part->remote_act_phys_cpuid);
962}
963
964static inline void
965xpc_IPI_send_reactivate(struct xpc_partition *part)
966{
967 xpc_activate_IRQ_send(xpc_vars->amos_page_pa, part->reactivate_nasid,
968 xpc_vars->act_nasid, xpc_vars->act_phys_cpuid);
969}
970
971static inline void
972xpc_IPI_send_disengage(struct xpc_partition *part)
973{
974 xpc_activate_IRQ_send(part->remote_amos_page_pa, cnodeid_to_nasid(0),
975 part->remote_act_nasid,
976 part->remote_act_phys_cpuid);
977}
978
979/*
980 * IPIs associated with SGI_XPC_NOTIFY IRQ.
981 */
982
983/*
984 * Send an IPI to the remote partition that is associated with the
985 * specified channel.
986 */
987#define XPC_NOTIFY_IRQ_SEND(_ch, _ipi_f, _irq_f) \
988 xpc_notify_IRQ_send(_ch, _ipi_f, #_ipi_f, _irq_f)
989
990static inline void
991xpc_notify_IRQ_send(struct xpc_channel *ch, u8 ipi_flag, char *ipi_flag_string,
992 unsigned long *irq_flags)
993{
994 struct xpc_partition *part = &xpc_partitions[ch->partid];
995 enum xpc_retval ret;
996
997 if (likely(part->act_state != XPC_P_DEACTIVATING)) {
998 ret = xpc_IPI_send(part->remote_IPI_amo_va,
999 (u64)ipi_flag << (ch->number * 8),
1000 part->remote_IPI_nasid,
1001 part->remote_IPI_phys_cpuid, SGI_XPC_NOTIFY);
1002 dev_dbg(xpc_chan, "%s sent to partid=%d, channel=%d, ret=%d\n",
1003 ipi_flag_string, ch->partid, ch->number, ret);
1004 if (unlikely(ret != xpcSuccess)) {
1005 if (irq_flags != NULL)
1006 spin_unlock_irqrestore(&ch->lock, *irq_flags);
1007 XPC_DEACTIVATE_PARTITION(part, ret);
1008 if (irq_flags != NULL)
1009 spin_lock_irqsave(&ch->lock, *irq_flags);
1010 }
1011 }
1012}
1013
1014/*
1015 * Make it look like the remote partition, which is associated with the
1016 * specified channel, sent us an IPI. This faked IPI will be handled
1017 * by xpc_dropped_IPI_check().
1018 */
1019#define XPC_NOTIFY_IRQ_SEND_LOCAL(_ch, _ipi_f) \
1020 xpc_notify_IRQ_send_local(_ch, _ipi_f, #_ipi_f)
1021
1022static inline void
1023xpc_notify_IRQ_send_local(struct xpc_channel *ch, u8 ipi_flag,
1024 char *ipi_flag_string)
1025{
1026 struct xpc_partition *part = &xpc_partitions[ch->partid];
1027
1028 FETCHOP_STORE_OP(TO_AMO((u64)&part->local_IPI_amo_va->variable),
1029 FETCHOP_OR, ((u64)ipi_flag << (ch->number * 8)));
1030 dev_dbg(xpc_chan, "%s sent local from partid=%d, channel=%d\n",
1031 ipi_flag_string, ch->partid, ch->number);
1032}
1033
1034/*
1035 * The sending and receiving of IPIs includes the setting of an AMO variable
1036 * to indicate the reason the IPI was sent. The 64-bit variable is divided
1037 * up into eight bytes, ordered from right to left. Byte zero pertains to
1038 * channel 0, byte one to channel 1, and so on. Each byte is described by
1039 * the following IPI flags.
1040 */
1041
1042#define XPC_IPI_CLOSEREQUEST 0x01
1043#define XPC_IPI_CLOSEREPLY 0x02
1044#define XPC_IPI_OPENREQUEST 0x04
1045#define XPC_IPI_OPENREPLY 0x08
1046#define XPC_IPI_MSGREQUEST 0x10
1047
1048/* given an AMO variable and a channel#, get its associated IPI flags */
1049#define XPC_GET_IPI_FLAGS(_amo, _c) ((u8) (((_amo) >> ((_c) * 8)) & 0xff))
1050#define XPC_SET_IPI_FLAGS(_amo, _c, _f) (_amo) |= ((u64) (_f) << ((_c) * 8))
1051
1052#define XPC_ANY_OPENCLOSE_IPI_FLAGS_SET(_amo) ((_amo) & 0x0f0f0f0f0f0f0f0fUL)
1053#define XPC_ANY_MSG_IPI_FLAGS_SET(_amo) ((_amo) & 0x1010101010101010UL)
1054
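/*
 * Worked example (added for illustration) of the byte-per-channel encoding
 * described above. Suppose channel 2 has raised both an OPENREQUEST and an
 * OPENREPLY:
 *
 *	u64 amo = 0;
 *
 *	XPC_SET_IPI_FLAGS(amo, 2, XPC_IPI_OPENREQUEST | XPC_IPI_OPENREPLY);
 *		amo is now 0x00000000000c0000 (0x0c in byte #2)
 *	XPC_GET_IPI_FLAGS(amo, 2) yields 0x0c
 *	XPC_ANY_OPENCLOSE_IPI_FLAGS_SET(amo) is non-zero
 *	XPC_ANY_MSG_IPI_FLAGS_SET(amo) is zero (no MSGREQUEST bit is set)
 */
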
1055static inline void
1056xpc_IPI_send_closerequest(struct xpc_channel *ch, unsigned long *irq_flags)
1057{
1058 struct xpc_openclose_args *args = ch->local_openclose_args;
1059
1060 args->reason = ch->reason;
1061
1062 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_CLOSEREQUEST, irq_flags);
1063}
1064
1065static inline void
1066xpc_IPI_send_closereply(struct xpc_channel *ch, unsigned long *irq_flags)
1067{
1068 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_CLOSEREPLY, irq_flags);
1069}
1070
1071static inline void
1072xpc_IPI_send_openrequest(struct xpc_channel *ch, unsigned long *irq_flags)
1073{
1074 struct xpc_openclose_args *args = ch->local_openclose_args;
1075
1076 args->msg_size = ch->msg_size;
1077 args->local_nentries = ch->local_nentries;
1078
1079 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_OPENREQUEST, irq_flags);
1080}
1081
1082static inline void
1083xpc_IPI_send_openreply(struct xpc_channel *ch, unsigned long *irq_flags)
1084{
1085 struct xpc_openclose_args *args = ch->local_openclose_args;
1086
1087 args->remote_nentries = ch->remote_nentries;
1088 args->local_nentries = ch->local_nentries;
1089 args->local_msgqueue_pa = __pa(ch->local_msgqueue);
1090
1091 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_OPENREPLY, irq_flags);
1092}
1093
1094static inline void
1095xpc_IPI_send_msgrequest(struct xpc_channel *ch)
1096{
1097 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_MSGREQUEST, NULL);
1098}
1099
1100static inline void
1101xpc_IPI_send_local_msgrequest(struct xpc_channel *ch)
1102{
1103 XPC_NOTIFY_IRQ_SEND_LOCAL(ch, XPC_IPI_MSGREQUEST);
1104}
1105
1106/*
1107 * Memory for XPC's AMO variables is allocated by the MSPEC driver. These
1108 * pages are located in the lowest granule. The lowest granule uses 4k pages
1109 * for cached references and an alternate TLB handler to never provide a
1110 * cacheable mapping for the entire region. This prevents speculative
1111 * reads of cached copies of our lines from being issued, which would cause
1112 * the SHUB to generate a PI FSB Protocol error. For XPC, we need 64
1113 * AMO variables (based on XP_MAX_PARTITIONS) for message notification and an
1114 * additional 128 AMO variables (based on XP_NASID_MASK_WORDS) for partition
1115 * activation and 2 AMO variables for partition deactivation.
1116 */
1117static inline AMO_t *
1118xpc_IPI_init(int index)
1119{
1120 AMO_t *amo = xpc_vars->amos_page + index;
1121
1122 (void)xpc_IPI_receive(amo); /* clear AMO variable */
1123 return amo;
1124}
1125
1126static inline enum xpc_retval
1127xpc_map_bte_errors(bte_result_t error)
1128{
1129 if (error == BTE_SUCCESS)
1130 return xpcSuccess;
1131
1132 if (is_shub2()) {
1133 if (BTE_VALID_SH2_ERROR(error))
1134 return xpcBteSh2Start + error;
1135 return xpcBteUnmappedError;
1136 }
1137 switch (error) {
1138 case BTE_SUCCESS:
1139 return xpcSuccess;
1140 case BTEFAIL_DIR:
1141 return xpcBteDirectoryError;
1142 case BTEFAIL_POISON:
1143 return xpcBtePoisonError;
1144 case BTEFAIL_WERR:
1145 return xpcBteWriteError;
1146 case BTEFAIL_ACCESS:
1147 return xpcBteAccessError;
1148 case BTEFAIL_PWERR:
1149 return xpcBtePWriteError;
1150 case BTEFAIL_PRERR:
1151 return xpcBtePReadError;
1152 case BTEFAIL_TOUT:
1153 return xpcBteTimeOutError;
1154 case BTEFAIL_XTERR:
1155 return xpcBteXtalkError;
1156 case BTEFAIL_NOTAVAIL:
1157 return xpcBteNotAvailable;
1158 default:
1159 return xpcBteUnmappedError;
1160 }
1161}
1162
1163/*
1164 * Check to see if there is any channel activity to/from the specified
1165 * partition.
1166 */
1167static inline void
1168xpc_check_for_channel_activity(struct xpc_partition *part)
1169{
1170 u64 IPI_amo;
1171 unsigned long irq_flags;
1172
1173 IPI_amo = xpc_IPI_receive(part->local_IPI_amo_va);
1174 if (IPI_amo == 0)
1175 return;
1176
1177 spin_lock_irqsave(&part->IPI_lock, irq_flags);
1178 part->local_IPI_amo |= IPI_amo;
1179 spin_unlock_irqrestore(&part->IPI_lock, irq_flags);
1180
1181 dev_dbg(xpc_chan, "received IPI from partid=%d, IPI_amo=0x%lx\n",
1182 XPC_PARTID(part), IPI_amo);
1183
1184 xpc_wakeup_channel_mgr(part);
1185}
1186
1187#endif /* _DRIVERS_MISC_SGIXP_XPC_H */
diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c
new file mode 100644
index 000000000000..bfcb9ea968e9
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_channel.c
@@ -0,0 +1,2243 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9/*
10 * Cross Partition Communication (XPC) channel support.
11 *
12 * This is the part of XPC that manages the channels and
13 * sends/receives messages across them to/from other partitions.
14 *
15 */
16
17#include <linux/kernel.h>
18#include <linux/init.h>
19#include <linux/sched.h>
20#include <linux/cache.h>
21#include <linux/interrupt.h>
22#include <linux/mutex.h>
23#include <linux/completion.h>
24#include <asm/sn/bte.h>
25#include <asm/sn/sn_sal.h>
26#include "xpc.h"
27
28/*
29 * Guarantee that the kzalloc'd memory is cacheline aligned.
30 */
31static void *
32xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
33{
34 /* see if kzalloc will give us cacheline aligned memory by default */
35 *base = kzalloc(size, flags);
36 if (*base == NULL)
37 return NULL;
38
39 if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
40 return *base;
41
42 kfree(*base);
43
44 /* nope, we'll have to do it ourselves */
45 *base = kzalloc(size + L1_CACHE_BYTES, flags);
46 if (*base == NULL)
47 return NULL;
48
49 return (void *)L1_CACHE_ALIGN((u64)*base);
50}
51
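/*
 * Illustrative sketch, not part of this patch: callers of the helper above
 * keep both pointers it produces. The aligned pointer is the one used, but
 * it is the '*base' pointer that must eventually be passed to kfree(), as
 * xpc_setup_infrastructure() below does with part->local_GPs_base:
 *
 *	void *base;
 *	struct xpc_gp *gps;
 *
 *	gps = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE, GFP_KERNEL, &base);
 *	if (gps == NULL)
 *		return xpcNoMemory;
 *	...
 *	kfree(base);	(never kfree(gps); it may point into the middle)
 */
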
52/*
53 * Set up the initial values for the XPartition Communication channels.
54 */
55static void
56xpc_initialize_channels(struct xpc_partition *part, partid_t partid)
57{
58 int ch_number;
59 struct xpc_channel *ch;
60
61 for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
62 ch = &part->channels[ch_number];
63
64 ch->partid = partid;
65 ch->number = ch_number;
66 ch->flags = XPC_C_DISCONNECTED;
67
68 ch->local_GP = &part->local_GPs[ch_number];
69 ch->local_openclose_args =
70 &part->local_openclose_args[ch_number];
71
72 atomic_set(&ch->kthreads_assigned, 0);
73 atomic_set(&ch->kthreads_idle, 0);
74 atomic_set(&ch->kthreads_active, 0);
75
76 atomic_set(&ch->references, 0);
77 atomic_set(&ch->n_to_notify, 0);
78
79 spin_lock_init(&ch->lock);
80 mutex_init(&ch->msg_to_pull_mutex);
81 init_completion(&ch->wdisconnect_wait);
82
83 atomic_set(&ch->n_on_msg_allocate_wq, 0);
84 init_waitqueue_head(&ch->msg_allocate_wq);
85 init_waitqueue_head(&ch->idle_wq);
86 }
87}
88
89/*
90 * Set up the infrastructure necessary to support XPartition Communication
91 * between the specified remote partition and the local one.
92 */
93enum xpc_retval
94xpc_setup_infrastructure(struct xpc_partition *part)
95{
96 int ret, cpuid;
97 struct timer_list *timer;
98 partid_t partid = XPC_PARTID(part);
99
100 /*
101 * Zero out MOST of the entry for this partition. Only the fields
102 * starting with `nchannels' will be zeroed. The preceding fields must
103 * remain `viable' across partition ups and downs, since they may be
104 * referenced during this memset() operation.
105 */
106 memset(&part->nchannels, 0, sizeof(struct xpc_partition) -
107 offsetof(struct xpc_partition, nchannels));
108
109 /*
110 * Allocate all of the channel structures as a contiguous chunk of
111 * memory.
112 */
113 part->channels = kzalloc(sizeof(struct xpc_channel) * XPC_NCHANNELS,
114 GFP_KERNEL);
115 if (part->channels == NULL) {
116 dev_err(xpc_chan, "can't get memory for channels\n");
117 return xpcNoMemory;
118 }
119
120 part->nchannels = XPC_NCHANNELS;
121
122 /* allocate all the required GET/PUT values */
123
124 part->local_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE,
125 GFP_KERNEL,
126 &part->local_GPs_base);
127 if (part->local_GPs == NULL) {
128 kfree(part->channels);
129 part->channels = NULL;
130 dev_err(xpc_chan, "can't get memory for local get/put "
131 "values\n");
132 return xpcNoMemory;
133 }
134
135 part->remote_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE,
136 GFP_KERNEL,
137 &part->
138 remote_GPs_base);
139 if (part->remote_GPs == NULL) {
140 dev_err(xpc_chan, "can't get memory for remote get/put "
141 "values\n");
142 kfree(part->local_GPs_base);
143 part->local_GPs = NULL;
144 kfree(part->channels);
145 part->channels = NULL;
146 return xpcNoMemory;
147 }
148
149 /* allocate all the required open and close args */
150
151 part->local_openclose_args =
152 xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL,
153 &part->local_openclose_args_base);
154 if (part->local_openclose_args == NULL) {
155 dev_err(xpc_chan, "can't get memory for local connect args\n");
156 kfree(part->remote_GPs_base);
157 part->remote_GPs = NULL;
158 kfree(part->local_GPs_base);
159 part->local_GPs = NULL;
160 kfree(part->channels);
161 part->channels = NULL;
162 return xpcNoMemory;
163 }
164
165 part->remote_openclose_args =
166 xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL,
167 &part->remote_openclose_args_base);
168 if (part->remote_openclose_args == NULL) {
169 dev_err(xpc_chan, "can't get memory for remote connect args\n");
170 kfree(part->local_openclose_args_base);
171 part->local_openclose_args = NULL;
172 kfree(part->remote_GPs_base);
173 part->remote_GPs = NULL;
174 kfree(part->local_GPs_base);
175 part->local_GPs = NULL;
176 kfree(part->channels);
177 part->channels = NULL;
178 return xpcNoMemory;
179 }
180
181 xpc_initialize_channels(part, partid);
182
183 atomic_set(&part->nchannels_active, 0);
184 atomic_set(&part->nchannels_engaged, 0);
185
186 /* local_IPI_amo were set to 0 by an earlier memset() */
187
188 /* Initialize this partition's AMO_t structure */
189 part->local_IPI_amo_va = xpc_IPI_init(partid);
190
191 spin_lock_init(&part->IPI_lock);
192
193 atomic_set(&part->channel_mgr_requests, 1);
194 init_waitqueue_head(&part->channel_mgr_wq);
195
196 sprintf(part->IPI_owner, "xpc%02d", partid);
197 ret = request_irq(SGI_XPC_NOTIFY, xpc_notify_IRQ_handler, IRQF_SHARED,
198 part->IPI_owner, (void *)(u64)partid);
199 if (ret != 0) {
200 dev_err(xpc_chan, "can't register NOTIFY IRQ handler, "
201 "errno=%d\n", -ret);
202 kfree(part->remote_openclose_args_base);
203 part->remote_openclose_args = NULL;
204 kfree(part->local_openclose_args_base);
205 part->local_openclose_args = NULL;
206 kfree(part->remote_GPs_base);
207 part->remote_GPs = NULL;
208 kfree(part->local_GPs_base);
209 part->local_GPs = NULL;
210 kfree(part->channels);
211 part->channels = NULL;
212 return xpcLackOfResources;
213 }
214
215 /* Set up a timer to check for dropped IPIs */
216 timer = &part->dropped_IPI_timer;
217 init_timer(timer);
218 timer->function = (void (*)(unsigned long))xpc_dropped_IPI_check;
219 timer->data = (unsigned long)part;
220 timer->expires = jiffies + XPC_P_DROPPED_IPI_WAIT;
221 add_timer(timer);
222
223 /*
224 * With the setting of the partition setup_state to XPC_P_SETUP, we're
225 * declaring that this partition is ready to go.
226 */
227 part->setup_state = XPC_P_SETUP;
228
229 /*
230 * Set up the per-partition specific variables required by the
231 * remote partition to establish channel connections with us.
232 *
233 * The setting of the magic # indicates that these per partition
234 * specific variables are ready to be used.
235 */
236 xpc_vars_part[partid].GPs_pa = __pa(part->local_GPs);
237 xpc_vars_part[partid].openclose_args_pa =
238 __pa(part->local_openclose_args);
239 xpc_vars_part[partid].IPI_amo_pa = __pa(part->local_IPI_amo_va);
240 cpuid = raw_smp_processor_id(); /* any CPU in this partition will do */
241 xpc_vars_part[partid].IPI_nasid = cpuid_to_nasid(cpuid);
242 xpc_vars_part[partid].IPI_phys_cpuid = cpu_physical_id(cpuid);
243 xpc_vars_part[partid].nchannels = part->nchannels;
244 xpc_vars_part[partid].magic = XPC_VP_MAGIC1;
245
246 return xpcSuccess;
247}
248
249/*
250 * Create a wrapper that hides the underlying mechanism for pulling a cacheline
251 * (or multiple cachelines) from a remote partition.
252 *
253 * src must be a cacheline aligned physical address on the remote partition.
254 * dst must be a cacheline aligned virtual address on this partition.
256 * cnt must be a cacheline-sized multiple (i.e. a multiple of L1_CACHE_BYTES)
256 */
257static enum xpc_retval
258xpc_pull_remote_cachelines(struct xpc_partition *part, void *dst,
259 const void *src, size_t cnt)
260{
261 bte_result_t bte_ret;
262
263 DBUG_ON((u64)src != L1_CACHE_ALIGN((u64)src));
264 DBUG_ON((u64)dst != L1_CACHE_ALIGN((u64)dst));
265 DBUG_ON(cnt != L1_CACHE_ALIGN(cnt));
266
267 if (part->act_state == XPC_P_DEACTIVATING)
268 return part->reason;
269
270 bte_ret = xp_bte_copy((u64)src, (u64)dst, (u64)cnt,
271 (BTE_NORMAL | BTE_WACQUIRE), NULL);
272 if (bte_ret == BTE_SUCCESS)
273 return xpcSuccess;
274
275 dev_dbg(xpc_chan, "xp_bte_copy() from partition %d failed, ret=%d\n",
276 XPC_PARTID(part), bte_ret);
277
278 return xpc_map_bte_errors(bte_ret);
279}
280
281/*
282 * Pull the remote per partition specific variables from the specified
283 * partition.
284 */
285enum xpc_retval
286xpc_pull_remote_vars_part(struct xpc_partition *part)
287{
288 u8 buffer[L1_CACHE_BYTES * 2];
289 struct xpc_vars_part *pulled_entry_cacheline =
290 (struct xpc_vars_part *)L1_CACHE_ALIGN((u64)buffer);
291 struct xpc_vars_part *pulled_entry;
292 u64 remote_entry_cacheline_pa, remote_entry_pa;
293 partid_t partid = XPC_PARTID(part);
294 enum xpc_retval ret;
295
296 /* pull the cacheline that contains the variables we're interested in */
297
298 DBUG_ON(part->remote_vars_part_pa !=
299 L1_CACHE_ALIGN(part->remote_vars_part_pa));
300 DBUG_ON(sizeof(struct xpc_vars_part) != L1_CACHE_BYTES / 2);
301
302 remote_entry_pa = part->remote_vars_part_pa +
303 sn_partition_id * sizeof(struct xpc_vars_part);
304
305 remote_entry_cacheline_pa = (remote_entry_pa & ~(L1_CACHE_BYTES - 1));
306
307 pulled_entry = (struct xpc_vars_part *)((u64)pulled_entry_cacheline +
308 (remote_entry_pa &
309 (L1_CACHE_BYTES - 1)));
310
311 ret = xpc_pull_remote_cachelines(part, pulled_entry_cacheline,
312 (void *)remote_entry_cacheline_pa,
313 L1_CACHE_BYTES);
314 if (ret != xpcSuccess) {
315 dev_dbg(xpc_chan, "failed to pull XPC vars_part from "
316 "partition %d, ret=%d\n", partid, ret);
317 return ret;
318 }
319
320 /* see if they've been set up yet */
321
322 if (pulled_entry->magic != XPC_VP_MAGIC1 &&
323 pulled_entry->magic != XPC_VP_MAGIC2) {
324
325 if (pulled_entry->magic != 0) {
326 dev_dbg(xpc_chan, "partition %d's XPC vars_part for "
327 "partition %d has bad magic value (=0x%lx)\n",
328 partid, sn_partition_id, pulled_entry->magic);
329 return xpcBadMagic;
330 }
331
332 /* they've not been initialized yet */
333 return xpcRetry;
334 }
335
336 if (xpc_vars_part[partid].magic == XPC_VP_MAGIC1) {
337
338 /* validate the variables */
339
340 if (pulled_entry->GPs_pa == 0 ||
341 pulled_entry->openclose_args_pa == 0 ||
342 pulled_entry->IPI_amo_pa == 0) {
343
344 dev_err(xpc_chan, "partition %d's XPC vars_part for "
345 "partition %d are not valid\n", partid,
346 sn_partition_id);
347 return xpcInvalidAddress;
348 }
349
350 /* the variables we imported look to be valid */
351
352 part->remote_GPs_pa = pulled_entry->GPs_pa;
353 part->remote_openclose_args_pa =
354 pulled_entry->openclose_args_pa;
355 part->remote_IPI_amo_va =
356 (AMO_t *)__va(pulled_entry->IPI_amo_pa);
357 part->remote_IPI_nasid = pulled_entry->IPI_nasid;
358 part->remote_IPI_phys_cpuid = pulled_entry->IPI_phys_cpuid;
359
360 if (part->nchannels > pulled_entry->nchannels)
361 part->nchannels = pulled_entry->nchannels;
362
363 /* let the other side know that we've pulled their variables */
364
365 xpc_vars_part[partid].magic = XPC_VP_MAGIC2;
366 }
367
368 if (pulled_entry->magic == XPC_VP_MAGIC1)
369 return xpcRetry;
370
371 return xpcSuccess;
372}
373
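/*
 * Summary of the magic-number handshake implemented by
 * xpc_setup_infrastructure() and xpc_pull_remote_vars_part() (comment added
 * for clarity): each side publishes its per-partition variables and sets its
 * magic to XPC_VP_MAGIC1. Once a side has successfully pulled and validated
 * the other's variables it advances its own magic to XPC_VP_MAGIC2. A pull
 * that still finds MAGIC1 on the remote side therefore returns xpcRetry;
 * only when both sides have pulled does it return xpcSuccess.
 */
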
374/*
375 * Get the IPI flags and pull the openclose args and/or remote GPs as needed.
376 */
377static u64
378xpc_get_IPI_flags(struct xpc_partition *part)
379{
380 unsigned long irq_flags;
381 u64 IPI_amo;
382 enum xpc_retval ret;
383
384 /*
385 * See if there are any IPI flags to be handled.
386 */
387
388 spin_lock_irqsave(&part->IPI_lock, irq_flags);
389 IPI_amo = part->local_IPI_amo;
390 if (IPI_amo != 0)
391 part->local_IPI_amo = 0;
392
393 spin_unlock_irqrestore(&part->IPI_lock, irq_flags);
394
395 if (XPC_ANY_OPENCLOSE_IPI_FLAGS_SET(IPI_amo)) {
396 ret = xpc_pull_remote_cachelines(part,
397 part->remote_openclose_args,
398 (void *)part->
399 remote_openclose_args_pa,
400 XPC_OPENCLOSE_ARGS_SIZE);
401 if (ret != xpcSuccess) {
402 XPC_DEACTIVATE_PARTITION(part, ret);
403
404 dev_dbg(xpc_chan, "failed to pull openclose args from "
405 "partition %d, ret=%d\n", XPC_PARTID(part),
406 ret);
407
408 /* don't bother processing IPIs anymore */
409 IPI_amo = 0;
410 }
411 }
412
413 if (XPC_ANY_MSG_IPI_FLAGS_SET(IPI_amo)) {
414 ret = xpc_pull_remote_cachelines(part, part->remote_GPs,
415 (void *)part->remote_GPs_pa,
416 XPC_GP_SIZE);
417 if (ret != xpcSuccess) {
418 XPC_DEACTIVATE_PARTITION(part, ret);
419
420 dev_dbg(xpc_chan, "failed to pull GPs from partition "
421 "%d, ret=%d\n", XPC_PARTID(part), ret);
422
423 /* don't bother processing IPIs anymore */
424 IPI_amo = 0;
425 }
426 }
427
428 return IPI_amo;
429}
430
431/*
432 * Allocate the local message queue and the notify queue.
433 */
434static enum xpc_retval
435xpc_allocate_local_msgqueue(struct xpc_channel *ch)
436{
437 unsigned long irq_flags;
438 int nentries;
439 size_t nbytes;
440
441 for (nentries = ch->local_nentries; nentries > 0; nentries--) {
442
443 nbytes = nentries * ch->msg_size;
444 ch->local_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes,
445 GFP_KERNEL,
446 &ch->local_msgqueue_base);
447 if (ch->local_msgqueue == NULL)
448 continue;
449
450 nbytes = nentries * sizeof(struct xpc_notify);
451 ch->notify_queue = kzalloc(nbytes, GFP_KERNEL);
452 if (ch->notify_queue == NULL) {
453 kfree(ch->local_msgqueue_base);
454 ch->local_msgqueue = NULL;
455 continue;
456 }
457
458 spin_lock_irqsave(&ch->lock, irq_flags);
459 if (nentries < ch->local_nentries) {
460 dev_dbg(xpc_chan, "nentries=%d local_nentries=%d, "
461 "partid=%d, channel=%d\n", nentries,
462 ch->local_nentries, ch->partid, ch->number);
463
464 ch->local_nentries = nentries;
465 }
466 spin_unlock_irqrestore(&ch->lock, irq_flags);
467 return xpcSuccess;
468 }
469
470 dev_dbg(xpc_chan, "can't get memory for local message queue and notify "
471 "queue, partid=%d, channel=%d\n", ch->partid, ch->number);
472 return xpcNoMemory;
473}
474
475/*
476 * Allocate the cached remote message queue.
477 */
478static enum xpc_retval
479xpc_allocate_remote_msgqueue(struct xpc_channel *ch)
480{
481 unsigned long irq_flags;
482 int nentries;
483 size_t nbytes;
484
485 DBUG_ON(ch->remote_nentries <= 0);
486
487 for (nentries = ch->remote_nentries; nentries > 0; nentries--) {
488
489 nbytes = nentries * ch->msg_size;
490 ch->remote_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes,
491 GFP_KERNEL,
492 &ch->remote_msgqueue_base);
493 if (ch->remote_msgqueue == NULL)
494 continue;
495
496 spin_lock_irqsave(&ch->lock, irq_flags);
497 if (nentries < ch->remote_nentries) {
498 dev_dbg(xpc_chan, "nentries=%d remote_nentries=%d, "
499 "partid=%d, channel=%d\n", nentries,
500 ch->remote_nentries, ch->partid, ch->number);
501
502 ch->remote_nentries = nentries;
503 }
504 spin_unlock_irqrestore(&ch->lock, irq_flags);
505 return xpcSuccess;
506 }
507
508 dev_dbg(xpc_chan, "can't get memory for cached remote message queue, "
509 "partid=%d, channel=%d\n", ch->partid, ch->number);
510 return xpcNoMemory;
511}
512
513/*
514 * Allocate message queues and other stuff associated with a channel.
515 *
516 * Note: Assumes all of the channel sizes are filled in.
517 */
518static enum xpc_retval
519xpc_allocate_msgqueues(struct xpc_channel *ch)
520{
521 unsigned long irq_flags;
522 enum xpc_retval ret;
523
524 DBUG_ON(ch->flags & XPC_C_SETUP);
525
526 ret = xpc_allocate_local_msgqueue(ch);
527 if (ret != xpcSuccess)
528 return ret;
529
530 ret = xpc_allocate_remote_msgqueue(ch);
531 if (ret != xpcSuccess) {
532 kfree(ch->local_msgqueue_base);
533 ch->local_msgqueue = NULL;
534 kfree(ch->notify_queue);
535 ch->notify_queue = NULL;
536 return ret;
537 }
538
539 spin_lock_irqsave(&ch->lock, irq_flags);
540 ch->flags |= XPC_C_SETUP;
541 spin_unlock_irqrestore(&ch->lock, irq_flags);
542
543 return xpcSuccess;
544}
545
546/*
547 * Process a connect message from a remote partition.
548 *
549 * Note: xpc_process_connect() expects to be called with ch->lock held
550 * (acquired via spin_lock_irqsave()) and will leave it locked upon return.
551 */
552static void
553xpc_process_connect(struct xpc_channel *ch, unsigned long *irq_flags)
554{
555 enum xpc_retval ret;
556
557 DBUG_ON(!spin_is_locked(&ch->lock));
558
559 if (!(ch->flags & XPC_C_OPENREQUEST) ||
560 !(ch->flags & XPC_C_ROPENREQUEST)) {
561 /* nothing more to do for now */
562 return;
563 }
564 DBUG_ON(!(ch->flags & XPC_C_CONNECTING));
565
566 if (!(ch->flags & XPC_C_SETUP)) {
567 spin_unlock_irqrestore(&ch->lock, *irq_flags);
568 ret = xpc_allocate_msgqueues(ch);
569 spin_lock_irqsave(&ch->lock, *irq_flags);
570
571 if (ret != xpcSuccess)
572 XPC_DISCONNECT_CHANNEL(ch, ret, irq_flags);
573
574 if (ch->flags & (XPC_C_CONNECTED | XPC_C_DISCONNECTING))
575 return;
576
577 DBUG_ON(!(ch->flags & XPC_C_SETUP));
578 DBUG_ON(ch->local_msgqueue == NULL);
579 DBUG_ON(ch->remote_msgqueue == NULL);
580 }
581
582 if (!(ch->flags & XPC_C_OPENREPLY)) {
583 ch->flags |= XPC_C_OPENREPLY;
584 xpc_IPI_send_openreply(ch, irq_flags);
585 }
586
587 if (!(ch->flags & XPC_C_ROPENREPLY))
588 return;
589
590 DBUG_ON(ch->remote_msgqueue_pa == 0);
591
592 ch->flags = (XPC_C_CONNECTED | XPC_C_SETUP); /* clear all else */
593
594 dev_info(xpc_chan, "channel %d to partition %d connected\n",
595 ch->number, ch->partid);
596
597 spin_unlock_irqrestore(&ch->lock, *irq_flags);
598 xpc_create_kthreads(ch, 1, 0);
599 spin_lock_irqsave(&ch->lock, *irq_flags);
600}
601
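/*
 * Connect-handshake summary (comment added for illustration): a channel only
 * reaches XPC_C_CONNECTED after four flags have accumulated locally --
 * XPC_C_OPENREQUEST (we asked), XPC_C_ROPENREQUEST (the remote side asked),
 * XPC_C_OPENREPLY (we answered, once our message queues were allocated and
 * XPC_C_SETUP was set) and XPC_C_ROPENREPLY (the remote side answered).
 * xpc_process_connect() is simply re-entered as each IPI arrives and returns
 * early until all four are present.
 */
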
602/*
603 * Notify those who wanted to be notified upon delivery of their message.
604 */
605static void
606xpc_notify_senders(struct xpc_channel *ch, enum xpc_retval reason, s64 put)
607{
608 struct xpc_notify *notify;
609 u8 notify_type;
610 s64 get = ch->w_remote_GP.get - 1;
611
612 while (++get < put && atomic_read(&ch->n_to_notify) > 0) {
613
614 notify = &ch->notify_queue[get % ch->local_nentries];
615
616 /*
617 * See if the notify entry indicates it was associated with
618 * a message whose sender wants to be notified. It is possible
619 * that it is, but someone else is doing or has done the
620 * notification.
621 */
622 notify_type = notify->type;
623 if (notify_type == 0 ||
624 cmpxchg(&notify->type, notify_type, 0) != notify_type) {
625 continue;
626 }
627
628 DBUG_ON(notify_type != XPC_N_CALL);
629
630 atomic_dec(&ch->n_to_notify);
631
632 if (notify->func != NULL) {
633 dev_dbg(xpc_chan, "notify->func() called, notify=0x%p, "
634 "msg_number=%ld, partid=%d, channel=%d\n",
635 (void *)notify, get, ch->partid, ch->number);
636
637 notify->func(reason, ch->partid, ch->number,
638 notify->key);
639
640 dev_dbg(xpc_chan, "notify->func() returned, "
641 "notify=0x%p, msg_number=%ld, partid=%d, "
642 "channel=%d\n", (void *)notify, get,
643 ch->partid, ch->number);
644 }
645 }
646}
647
648/*
649 * Free up message queues and other stuff that were allocated for the specified
650 * channel.
651 *
652 * Note: ch->reason and ch->reason_line are left set for debugging purposes,
653 * they're cleared when XPC_C_DISCONNECTED is cleared.
654 */
655static void
656xpc_free_msgqueues(struct xpc_channel *ch)
657{
658 DBUG_ON(!spin_is_locked(&ch->lock));
659 DBUG_ON(atomic_read(&ch->n_to_notify) != 0);
660
661 ch->remote_msgqueue_pa = 0;
662 ch->func = NULL;
663 ch->key = NULL;
664 ch->msg_size = 0;
665 ch->local_nentries = 0;
666 ch->remote_nentries = 0;
667 ch->kthreads_assigned_limit = 0;
668 ch->kthreads_idle_limit = 0;
669
670 ch->local_GP->get = 0;
671 ch->local_GP->put = 0;
672 ch->remote_GP.get = 0;
673 ch->remote_GP.put = 0;
674 ch->w_local_GP.get = 0;
675 ch->w_local_GP.put = 0;
676 ch->w_remote_GP.get = 0;
677 ch->w_remote_GP.put = 0;
678 ch->next_msg_to_pull = 0;
679
680 if (ch->flags & XPC_C_SETUP) {
681 ch->flags &= ~XPC_C_SETUP;
682
683 dev_dbg(xpc_chan, "ch->flags=0x%x, partid=%d, channel=%d\n",
684 ch->flags, ch->partid, ch->number);
685
686 kfree(ch->local_msgqueue_base);
687 ch->local_msgqueue = NULL;
688 kfree(ch->remote_msgqueue_base);
689 ch->remote_msgqueue = NULL;
690 kfree(ch->notify_queue);
691 ch->notify_queue = NULL;
692 }
693}
694
695/*
696 * ch->lock is expected to be held (via spin_lock_irqsave()) on entry.
697 */
698static void
699xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags)
700{
701 struct xpc_partition *part = &xpc_partitions[ch->partid];
702 u32 channel_was_connected = (ch->flags & XPC_C_WASCONNECTED);
703
704 DBUG_ON(!spin_is_locked(&ch->lock));
705
706 if (!(ch->flags & XPC_C_DISCONNECTING))
707 return;
708
709 DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
710
711 /* make sure all activity has settled down first */
712
713 if (atomic_read(&ch->kthreads_assigned) > 0 ||
714 atomic_read(&ch->references) > 0) {
715 return;
716 }
717 DBUG_ON((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
718 !(ch->flags & XPC_C_DISCONNECTINGCALLOUT_MADE));
719
720 if (part->act_state == XPC_P_DEACTIVATING) {
721 /* can't proceed until the other side disengages from us */
722 if (xpc_partition_engaged(1UL << ch->partid))
723 return;
724
725 } else {
726
727 /* as long as the other side is up do the full protocol */
728
729 if (!(ch->flags & XPC_C_RCLOSEREQUEST))
730 return;
731
732 if (!(ch->flags & XPC_C_CLOSEREPLY)) {
733 ch->flags |= XPC_C_CLOSEREPLY;
734 xpc_IPI_send_closereply(ch, irq_flags);
735 }
736
737 if (!(ch->flags & XPC_C_RCLOSEREPLY))
738 return;
739 }
740
741 /* wake those waiting for notify completion */
742 if (atomic_read(&ch->n_to_notify) > 0) {
743 /* >>> we do callout while holding ch->lock */
744 xpc_notify_senders(ch, ch->reason, ch->w_local_GP.put);
745 }
746
747 /* both sides are disconnected now */
748
749 if (ch->flags & XPC_C_DISCONNECTINGCALLOUT_MADE) {
750 spin_unlock_irqrestore(&ch->lock, *irq_flags);
751 xpc_disconnect_callout(ch, xpcDisconnected);
752 spin_lock_irqsave(&ch->lock, *irq_flags);
753 }
754
755 /* it's now safe to free the channel's message queues */
756 xpc_free_msgqueues(ch);
757
758 /* mark disconnected, clear all other flags except XPC_C_WDISCONNECT */
759 ch->flags = (XPC_C_DISCONNECTED | (ch->flags & XPC_C_WDISCONNECT));
760
761 atomic_dec(&part->nchannels_active);
762
763 if (channel_was_connected) {
764 dev_info(xpc_chan, "channel %d to partition %d disconnected, "
765 "reason=%d\n", ch->number, ch->partid, ch->reason);
766 }
767
768 if (ch->flags & XPC_C_WDISCONNECT) {
769 /* we won't lose the CPU since we're holding ch->lock */
770 complete(&ch->wdisconnect_wait);
771 } else if (ch->delayed_IPI_flags) {
772 if (part->act_state != XPC_P_DEACTIVATING) {
773 /* time to take action on any delayed IPI flags */
774 spin_lock(&part->IPI_lock);
775 XPC_SET_IPI_FLAGS(part->local_IPI_amo, ch->number,
776 ch->delayed_IPI_flags);
777 spin_unlock(&part->IPI_lock);
778 }
779 ch->delayed_IPI_flags = 0;
780 }
781}
782
783/*
784 * Process a change in the channel's remote connection state.
785 */
786static void
787xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number,
788 u8 IPI_flags)
789{
790 unsigned long irq_flags;
791 struct xpc_openclose_args *args =
792 &part->remote_openclose_args[ch_number];
793 struct xpc_channel *ch = &part->channels[ch_number];
794 enum xpc_retval reason;
795
796 spin_lock_irqsave(&ch->lock, irq_flags);
797
798again:
799
800 if ((ch->flags & XPC_C_DISCONNECTED) &&
801 (ch->flags & XPC_C_WDISCONNECT)) {
802 /*
803 * Delay processing IPI flags until thread waiting disconnect
804 * has had a chance to see that the channel is disconnected.
805 */
806 ch->delayed_IPI_flags |= IPI_flags;
807 spin_unlock_irqrestore(&ch->lock, irq_flags);
808 return;
809 }
810
811 if (IPI_flags & XPC_IPI_CLOSEREQUEST) {
812
813 dev_dbg(xpc_chan, "XPC_IPI_CLOSEREQUEST (reason=%d) received "
814 "from partid=%d, channel=%d\n", args->reason,
815 ch->partid, ch->number);
816
817 /*
818 * If RCLOSEREQUEST is set, we're probably waiting for
819 * RCLOSEREPLY. We should find it and a ROPENREQUEST packed
820 * with this RCLOSEREQUEST in the IPI_flags.
821 */
822
823 if (ch->flags & XPC_C_RCLOSEREQUEST) {
824 DBUG_ON(!(ch->flags & XPC_C_DISCONNECTING));
825 DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
826 DBUG_ON(!(ch->flags & XPC_C_CLOSEREPLY));
827 DBUG_ON(ch->flags & XPC_C_RCLOSEREPLY);
828
829 DBUG_ON(!(IPI_flags & XPC_IPI_CLOSEREPLY));
830 IPI_flags &= ~XPC_IPI_CLOSEREPLY;
831 ch->flags |= XPC_C_RCLOSEREPLY;
832
833 /* both sides have finished disconnecting */
834 xpc_process_disconnect(ch, &irq_flags);
835 DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
836 goto again;
837 }
838
839 if (ch->flags & XPC_C_DISCONNECTED) {
840 if (!(IPI_flags & XPC_IPI_OPENREQUEST)) {
841 if ((XPC_GET_IPI_FLAGS(part->local_IPI_amo,
842 ch_number) &
843 XPC_IPI_OPENREQUEST)) {
844
845 DBUG_ON(ch->delayed_IPI_flags != 0);
846 spin_lock(&part->IPI_lock);
847 XPC_SET_IPI_FLAGS(part->local_IPI_amo,
848 ch_number,
849 XPC_IPI_CLOSEREQUEST);
850 spin_unlock(&part->IPI_lock);
851 }
852 spin_unlock_irqrestore(&ch->lock, irq_flags);
853 return;
854 }
855
856 XPC_SET_REASON(ch, 0, 0);
857 ch->flags &= ~XPC_C_DISCONNECTED;
858
859 atomic_inc(&part->nchannels_active);
860 ch->flags |= (XPC_C_CONNECTING | XPC_C_ROPENREQUEST);
861 }
862
863 IPI_flags &= ~(XPC_IPI_OPENREQUEST | XPC_IPI_OPENREPLY);
864
865 /*
866 * The meaningful CLOSEREQUEST connection state fields are:
867 * reason = reason connection is to be closed
868 */
869
870 ch->flags |= XPC_C_RCLOSEREQUEST;
871
872 if (!(ch->flags & XPC_C_DISCONNECTING)) {
873 reason = args->reason;
874 if (reason <= xpcSuccess || reason > xpcUnknownReason)
875 reason = xpcUnknownReason;
876 else if (reason == xpcUnregistering)
877 reason = xpcOtherUnregistering;
878
879 XPC_DISCONNECT_CHANNEL(ch, reason, &irq_flags);
880
881 DBUG_ON(IPI_flags & XPC_IPI_CLOSEREPLY);
882 spin_unlock_irqrestore(&ch->lock, irq_flags);
883 return;
884 }
885
886 xpc_process_disconnect(ch, &irq_flags);
887 }
888
889 if (IPI_flags & XPC_IPI_CLOSEREPLY) {
890
891 dev_dbg(xpc_chan, "XPC_IPI_CLOSEREPLY received from partid=%d,"
892 " channel=%d\n", ch->partid, ch->number);
893
894 if (ch->flags & XPC_C_DISCONNECTED) {
895 DBUG_ON(part->act_state != XPC_P_DEACTIVATING);
896 spin_unlock_irqrestore(&ch->lock, irq_flags);
897 return;
898 }
899
900 DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
901
902 if (!(ch->flags & XPC_C_RCLOSEREQUEST)) {
903 if ((XPC_GET_IPI_FLAGS(part->local_IPI_amo, ch_number)
904 & XPC_IPI_CLOSEREQUEST)) {
905
906 DBUG_ON(ch->delayed_IPI_flags != 0);
907 spin_lock(&part->IPI_lock);
908 XPC_SET_IPI_FLAGS(part->local_IPI_amo,
909 ch_number,
910 XPC_IPI_CLOSEREPLY);
911 spin_unlock(&part->IPI_lock);
912 }
913 spin_unlock_irqrestore(&ch->lock, irq_flags);
914 return;
915 }
916
917 ch->flags |= XPC_C_RCLOSEREPLY;
918
919 if (ch->flags & XPC_C_CLOSEREPLY) {
920 /* both sides have finished disconnecting */
921 xpc_process_disconnect(ch, &irq_flags);
922 }
923 }
924
925 if (IPI_flags & XPC_IPI_OPENREQUEST) {
926
927 dev_dbg(xpc_chan, "XPC_IPI_OPENREQUEST (msg_size=%d, "
928 "local_nentries=%d) received from partid=%d, "
929 "channel=%d\n", args->msg_size, args->local_nentries,
930 ch->partid, ch->number);
931
932 if (part->act_state == XPC_P_DEACTIVATING ||
933 (ch->flags & XPC_C_ROPENREQUEST)) {
934 spin_unlock_irqrestore(&ch->lock, irq_flags);
935 return;
936 }
937
938 if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_WDISCONNECT)) {
939 ch->delayed_IPI_flags |= XPC_IPI_OPENREQUEST;
940 spin_unlock_irqrestore(&ch->lock, irq_flags);
941 return;
942 }
943 DBUG_ON(!(ch->flags & (XPC_C_DISCONNECTED |
944 XPC_C_OPENREQUEST)));
945 DBUG_ON(ch->flags & (XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY |
946 XPC_C_OPENREPLY | XPC_C_CONNECTED));
947
948 /*
949 * The meaningful OPENREQUEST connection state fields are:
950 * msg_size = size of channel's messages in bytes
951 * local_nentries = remote partition's local_nentries
952 */
953 if (args->msg_size == 0 || args->local_nentries == 0) {
954 /* assume OPENREQUEST was delayed by mistake */
955 spin_unlock_irqrestore(&ch->lock, irq_flags);
956 return;
957 }
958
959 ch->flags |= (XPC_C_ROPENREQUEST | XPC_C_CONNECTING);
960 ch->remote_nentries = args->local_nentries;
961
962 if (ch->flags & XPC_C_OPENREQUEST) {
963 if (args->msg_size != ch->msg_size) {
964 XPC_DISCONNECT_CHANNEL(ch, xpcUnequalMsgSizes,
965 &irq_flags);
966 spin_unlock_irqrestore(&ch->lock, irq_flags);
967 return;
968 }
969 } else {
970 ch->msg_size = args->msg_size;
971
972 XPC_SET_REASON(ch, 0, 0);
973 ch->flags &= ~XPC_C_DISCONNECTED;
974
975 atomic_inc(&part->nchannels_active);
976 }
977
978 xpc_process_connect(ch, &irq_flags);
979 }
980
981 if (IPI_flags & XPC_IPI_OPENREPLY) {
982
983 dev_dbg(xpc_chan, "XPC_IPI_OPENREPLY (local_msgqueue_pa=0x%lx, "
984 "local_nentries=%d, remote_nentries=%d) received from "
985 "partid=%d, channel=%d\n", args->local_msgqueue_pa,
986 args->local_nentries, args->remote_nentries,
987 ch->partid, ch->number);
988
989 if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED)) {
990 spin_unlock_irqrestore(&ch->lock, irq_flags);
991 return;
992 }
993 if (!(ch->flags & XPC_C_OPENREQUEST)) {
994 XPC_DISCONNECT_CHANNEL(ch, xpcOpenCloseError,
995 &irq_flags);
996 spin_unlock_irqrestore(&ch->lock, irq_flags);
997 return;
998 }
999
1000 DBUG_ON(!(ch->flags & XPC_C_ROPENREQUEST));
1001 DBUG_ON(ch->flags & XPC_C_CONNECTED);
1002
1003 /*
1004 * The meaningful OPENREPLY connection state fields are:
1005 * local_msgqueue_pa = physical address of remote
1006 * partition's local_msgqueue
1007 * local_nentries = remote partition's local_nentries
1008 * remote_nentries = remote partition's remote_nentries
1009 */
1010 DBUG_ON(args->local_msgqueue_pa == 0);
1011 DBUG_ON(args->local_nentries == 0);
1012 DBUG_ON(args->remote_nentries == 0);
1013
1014 ch->flags |= XPC_C_ROPENREPLY;
1015 ch->remote_msgqueue_pa = args->local_msgqueue_pa;
1016
1017 if (args->local_nentries < ch->remote_nentries) {
1018 dev_dbg(xpc_chan, "XPC_IPI_OPENREPLY: new "
1019 "remote_nentries=%d, old remote_nentries=%d, "
1020 "partid=%d, channel=%d\n",
1021 args->local_nentries, ch->remote_nentries,
1022 ch->partid, ch->number);
1023
1024 ch->remote_nentries = args->local_nentries;
1025 }
1026 if (args->remote_nentries < ch->local_nentries) {
1027 dev_dbg(xpc_chan, "XPC_IPI_OPENREPLY: new "
1028 "local_nentries=%d, old local_nentries=%d, "
1029 "partid=%d, channel=%d\n",
1030 args->remote_nentries, ch->local_nentries,
1031 ch->partid, ch->number);
1032
1033 ch->local_nentries = args->remote_nentries;
1034 }
1035
1036 xpc_process_connect(ch, &irq_flags);
1037 }
1038
1039 spin_unlock_irqrestore(&ch->lock, irq_flags);
1040}
1041
1042/*
1043 * Attempt to establish a channel connection to a remote partition.
1044 */
1045static enum xpc_retval
1046xpc_connect_channel(struct xpc_channel *ch)
1047{
1048 unsigned long irq_flags;
1049 struct xpc_registration *registration = &xpc_registrations[ch->number];
1050
1051 if (mutex_trylock(&registration->mutex) == 0)
1052 return xpcRetry;
1053
1054 if (!XPC_CHANNEL_REGISTERED(ch->number)) {
1055 mutex_unlock(&registration->mutex);
1056 return xpcUnregistered;
1057 }
1058
1059 spin_lock_irqsave(&ch->lock, irq_flags);
1060
1061 DBUG_ON(ch->flags & XPC_C_CONNECTED);
1062 DBUG_ON(ch->flags & XPC_C_OPENREQUEST);
1063
1064 if (ch->flags & XPC_C_DISCONNECTING) {
1065 spin_unlock_irqrestore(&ch->lock, irq_flags);
1066 mutex_unlock(&registration->mutex);
1067 return ch->reason;
1068 }
1069
1070 /* add info from the channel connect registration to the channel */
1071
1072 ch->kthreads_assigned_limit = registration->assigned_limit;
1073 ch->kthreads_idle_limit = registration->idle_limit;
1074 DBUG_ON(atomic_read(&ch->kthreads_assigned) != 0);
1075 DBUG_ON(atomic_read(&ch->kthreads_idle) != 0);
1076 DBUG_ON(atomic_read(&ch->kthreads_active) != 0);
1077
1078 ch->func = registration->func;
1079 DBUG_ON(registration->func == NULL);
1080 ch->key = registration->key;
1081
1082 ch->local_nentries = registration->nentries;
1083
1084 if (ch->flags & XPC_C_ROPENREQUEST) {
1085 if (registration->msg_size != ch->msg_size) {
1086 /* the local and remote sides aren't the same */
1087
1088 /*
1089 * Because XPC_DISCONNECT_CHANNEL() can block we're
1090 * forced to release the registration mutex before we unlock
1091 * the channel lock. But that's okay here because we're
1092 * done with the part that required the registration
1093 * mutex. XPC_DISCONNECT_CHANNEL() requires that the
1094 * channel lock be locked and will unlock and relock
1095 * the channel lock as needed.
1096 */
1097 mutex_unlock(&registration->mutex);
1098 XPC_DISCONNECT_CHANNEL(ch, xpcUnequalMsgSizes,
1099 &irq_flags);
1100 spin_unlock_irqrestore(&ch->lock, irq_flags);
1101 return xpcUnequalMsgSizes;
1102 }
1103 } else {
1104 ch->msg_size = registration->msg_size;
1105
1106 XPC_SET_REASON(ch, 0, 0);
1107 ch->flags &= ~XPC_C_DISCONNECTED;
1108
1109 atomic_inc(&xpc_partitions[ch->partid].nchannels_active);
1110 }
1111
1112 mutex_unlock(&registration->mutex);
1113
1114 /* initiate the connection */
1115
1116 ch->flags |= (XPC_C_OPENREQUEST | XPC_C_CONNECTING);
1117 xpc_IPI_send_openrequest(ch, &irq_flags);
1118
1119 xpc_process_connect(ch, &irq_flags);
1120
1121 spin_unlock_irqrestore(&ch->lock, irq_flags);
1122
1123 return xpcSuccess;
1124}
1125
1126/*
1127 * Clear some of the msg flags in the local message queue.
1128 */
1129static inline void
1130xpc_clear_local_msgqueue_flags(struct xpc_channel *ch)
1131{
1132 struct xpc_msg *msg;
1133 s64 get;
1134
1135 get = ch->w_remote_GP.get;
1136 do {
1137 msg = (struct xpc_msg *)((u64)ch->local_msgqueue +
1138 (get % ch->local_nentries) *
1139 ch->msg_size);
1140 msg->flags = 0;
1141 } while (++get < ch->remote_GP.get);
1142}
1143
1144/*
1145 * Clear some of the msg flags in the remote message queue.
1146 */
1147static inline void
1148xpc_clear_remote_msgqueue_flags(struct xpc_channel *ch)
1149{
1150 struct xpc_msg *msg;
1151 s64 put;
1152
1153 put = ch->w_remote_GP.put;
1154 do {
1155 msg = (struct xpc_msg *)((u64)ch->remote_msgqueue +
1156 (put % ch->remote_nentries) *
1157 ch->msg_size);
1158 msg->flags = 0;
1159 } while (++put < ch->remote_GP.put);
1160}
1161
1162static void
1163xpc_process_msg_IPI(struct xpc_partition *part, int ch_number)
1164{
1165 struct xpc_channel *ch = &part->channels[ch_number];
1166 int nmsgs_sent;
1167
1168 ch->remote_GP = part->remote_GPs[ch_number];
1169
1170 /* See what, if anything, has changed for each connected channel */
1171
1172 xpc_msgqueue_ref(ch);
1173
1174 if (ch->w_remote_GP.get == ch->remote_GP.get &&
1175 ch->w_remote_GP.put == ch->remote_GP.put) {
1176 /* nothing changed since GPs were last pulled */
1177 xpc_msgqueue_deref(ch);
1178 return;
1179 }
1180
1181 if (!(ch->flags & XPC_C_CONNECTED)) {
1182 xpc_msgqueue_deref(ch);
1183 return;
1184 }
1185
1186 /*
1187 * First check to see if messages recently sent by us have been
1188 * received by the other side. (The remote GET value will have
1189 * changed since we last looked at it.)
1190 */
1191
1192 if (ch->w_remote_GP.get != ch->remote_GP.get) {
1193
1194 /*
1195 * We need to notify any senders that want to be notified
1196 * that their sent messages have been received by their
1197 * intended recipients. We need to do this before updating
1198 * w_remote_GP.get so that we don't allocate the same message
1199 * queue entries prematurely (see xpc_allocate_msg()).
1200 */
1201 if (atomic_read(&ch->n_to_notify) > 0) {
1202 /*
1203 * Notify senders that messages sent have been
1204 * received and delivered by the other side.
1205 */
1206 xpc_notify_senders(ch, xpcMsgDelivered,
1207 ch->remote_GP.get);
1208 }
1209
1210 /*
1211 * Clear msg->flags in previously sent messages, so that
1212 * they're ready for xpc_allocate_msg().
1213 */
1214 xpc_clear_local_msgqueue_flags(ch);
1215
1216 ch->w_remote_GP.get = ch->remote_GP.get;
1217
1218 dev_dbg(xpc_chan, "w_remote_GP.get changed to %ld, partid=%d, "
1219 "channel=%d\n", ch->w_remote_GP.get, ch->partid,
1220 ch->number);
1221
1222 /*
1223 * If anyone was waiting for message queue entries to become
1224 * available, wake them up.
1225 */
1226 if (atomic_read(&ch->n_on_msg_allocate_wq) > 0)
1227 wake_up(&ch->msg_allocate_wq);
1228 }
1229
1230 /*
1231 * Now check for newly sent messages by the other side. (The remote
1232 * PUT value will have changed since we last looked at it.)
1233 */
1234
1235 if (ch->w_remote_GP.put != ch->remote_GP.put) {
1236 /*
1237 * Clear msg->flags in previously received messages, so that
1238 * they're ready for xpc_get_deliverable_msg().
1239 */
1240 xpc_clear_remote_msgqueue_flags(ch);
1241
1242 ch->w_remote_GP.put = ch->remote_GP.put;
1243
1244 dev_dbg(xpc_chan, "w_remote_GP.put changed to %ld, partid=%d, "
1245 "channel=%d\n", ch->w_remote_GP.put, ch->partid,
1246 ch->number);
1247
1248 nmsgs_sent = ch->w_remote_GP.put - ch->w_local_GP.get;
1249 if (nmsgs_sent > 0) {
1250 dev_dbg(xpc_chan, "msgs waiting to be copied and "
1251 "delivered=%d, partid=%d, channel=%d\n",
1252 nmsgs_sent, ch->partid, ch->number);
1253
1254 if (ch->flags & XPC_C_CONNECTEDCALLOUT_MADE)
1255 xpc_activate_kthreads(ch, nmsgs_sent);
1256 }
1257 }
1258
1259 xpc_msgqueue_deref(ch);
1260}
1261
1262void
1263xpc_process_channel_activity(struct xpc_partition *part)
1264{
1265 unsigned long irq_flags;
1266 u64 IPI_amo, IPI_flags;
1267 struct xpc_channel *ch;
1268 int ch_number;
1269 u32 ch_flags;
1270
1271 IPI_amo = xpc_get_IPI_flags(part);
1272
1273 /*
1274 * Initiate channel connections for registered channels.
1275 *
1276 * For each connected channel that has pending messages activate idle
1277 * kthreads and/or create new kthreads as needed.
1278 */
1279
1280 for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
1281 ch = &part->channels[ch_number];
1282
1283 /*
1284 * Process any open or close related IPI flags, and then deal
1285 * with connecting or disconnecting the channel as required.
1286 */
1287
1288 IPI_flags = XPC_GET_IPI_FLAGS(IPI_amo, ch_number);
1289
1290 if (XPC_ANY_OPENCLOSE_IPI_FLAGS_SET(IPI_flags))
1291 xpc_process_openclose_IPI(part, ch_number, IPI_flags);
1292
1293 ch_flags = ch->flags; /* need an atomic snapshot of flags */
1294
1295 if (ch_flags & XPC_C_DISCONNECTING) {
1296 spin_lock_irqsave(&ch->lock, irq_flags);
1297 xpc_process_disconnect(ch, &irq_flags);
1298 spin_unlock_irqrestore(&ch->lock, irq_flags);
1299 continue;
1300 }
1301
1302 if (part->act_state == XPC_P_DEACTIVATING)
1303 continue;
1304
1305 if (!(ch_flags & XPC_C_CONNECTED)) {
1306 if (!(ch_flags & XPC_C_OPENREQUEST)) {
1307 DBUG_ON(ch_flags & XPC_C_SETUP);
1308 (void)xpc_connect_channel(ch);
1309 } else {
1310 spin_lock_irqsave(&ch->lock, irq_flags);
1311 xpc_process_connect(ch, &irq_flags);
1312 spin_unlock_irqrestore(&ch->lock, irq_flags);
1313 }
1314 continue;
1315 }
1316
1317 /*
1318 * Process any message related IPI flags, this may involve the
1319 * activation of kthreads to deliver any pending messages sent
1320 * from the other partition.
1321 */
1322
1323 if (XPC_ANY_MSG_IPI_FLAGS_SET(IPI_flags))
1324 xpc_process_msg_IPI(part, ch_number);
1325 }
1326}
1327
1328/*
1329 * XPC's heartbeat code calls this function to inform XPC that a partition is
1330 * going down. XPC responds by tearing down the XPartition Communication
1331 * infrastructure used for the just downed partition.
1332 *
1333 * XPC's heartbeat code will never call this function and xpc_partition_up()
1334 * at the same time. Nor will it ever make multiple calls to either function
1335 * at the same time.
1336 */
1337void
1338xpc_partition_going_down(struct xpc_partition *part, enum xpc_retval reason)
1339{
1340 unsigned long irq_flags;
1341 int ch_number;
1342 struct xpc_channel *ch;
1343
1344 dev_dbg(xpc_chan, "deactivating partition %d, reason=%d\n",
1345 XPC_PARTID(part), reason);
1346
1347 if (!xpc_part_ref(part)) {
1348 /* infrastructure for this partition isn't currently set up */
1349 return;
1350 }
1351
1352 /* disconnect channels associated with the partition going down */
1353
1354 for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
1355 ch = &part->channels[ch_number];
1356
1357 xpc_msgqueue_ref(ch);
1358 spin_lock_irqsave(&ch->lock, irq_flags);
1359
1360 XPC_DISCONNECT_CHANNEL(ch, reason, &irq_flags);
1361
1362 spin_unlock_irqrestore(&ch->lock, irq_flags);
1363 xpc_msgqueue_deref(ch);
1364 }
1365
1366 xpc_wakeup_channel_mgr(part);
1367
1368 xpc_part_deref(part);
1369}
1370
1371/*
1372 * Tear down the infrastructure necessary to support XPartition Communication
1373 * between the specified remote partition and the local one.
1374 */
1375void
1376xpc_teardown_infrastructure(struct xpc_partition *part)
1377{
1378 partid_t partid = XPC_PARTID(part);
1379
1380 /*
1381 * We start off by making this partition inaccessible to local
1382	 * processes by marking it as no longer set up. Then we make it
1383	 * inaccessible to remote processes by clearing the XPC per-partition
1384	 * variables' magic # (which indicates that these variables
1385 * are no longer valid) and by ignoring all XPC notify IPIs sent to
1386 * this partition.
1387 */
1388
1389 DBUG_ON(atomic_read(&part->nchannels_engaged) != 0);
1390 DBUG_ON(atomic_read(&part->nchannels_active) != 0);
1391 DBUG_ON(part->setup_state != XPC_P_SETUP);
1392 part->setup_state = XPC_P_WTEARDOWN;
1393
1394 xpc_vars_part[partid].magic = 0;
1395
1396 free_irq(SGI_XPC_NOTIFY, (void *)(u64)partid);
1397
1398 /*
1399 * Before proceeding with the teardown we have to wait until all
1400 * existing references cease.
1401 */
1402 wait_event(part->teardown_wq, (atomic_read(&part->references) == 0));
1403
1404 /* now we can begin tearing down the infrastructure */
1405
1406 part->setup_state = XPC_P_TORNDOWN;
1407
1408 /* in case we've still got outstanding timers registered... */
1409 del_timer_sync(&part->dropped_IPI_timer);
1410
1411 kfree(part->remote_openclose_args_base);
1412 part->remote_openclose_args = NULL;
1413 kfree(part->local_openclose_args_base);
1414 part->local_openclose_args = NULL;
1415 kfree(part->remote_GPs_base);
1416 part->remote_GPs = NULL;
1417 kfree(part->local_GPs_base);
1418 part->local_GPs = NULL;
1419 kfree(part->channels);
1420 part->channels = NULL;
1421 part->local_IPI_amo_va = NULL;
1422}
1423
1424/*
1425 * Called by XP at the time of channel connection registration to cause
1426 * XPC to establish connections to all currently active partitions.
1427 */
1428void
1429xpc_initiate_connect(int ch_number)
1430{
1431 partid_t partid;
1432 struct xpc_partition *part;
1433 struct xpc_channel *ch;
1434
1435 DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS);
1436
1437 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
1438 part = &xpc_partitions[partid];
1439
1440 if (xpc_part_ref(part)) {
1441 ch = &part->channels[ch_number];
1442
1443 /*
1444 * Initiate the establishment of a connection on the
1445 * newly registered channel to the remote partition.
1446 */
1447 xpc_wakeup_channel_mgr(part);
1448 xpc_part_deref(part);
1449 }
1450 }
1451}
1452
1453void
1454xpc_connected_callout(struct xpc_channel *ch)
1455{
1456 /* let the registerer know that a connection has been established */
1457
1458 if (ch->func != NULL) {
1459 dev_dbg(xpc_chan, "ch->func() called, reason=xpcConnected, "
1460 "partid=%d, channel=%d\n", ch->partid, ch->number);
1461
1462 ch->func(xpcConnected, ch->partid, ch->number,
1463 (void *)(u64)ch->local_nentries, ch->key);
1464
1465 dev_dbg(xpc_chan, "ch->func() returned, reason=xpcConnected, "
1466 "partid=%d, channel=%d\n", ch->partid, ch->number);
1467 }
1468}
1469
1470/*
1471 * Called by XP at the time of channel connection unregistration to cause
1472 * XPC to teardown all current connections for the specified channel.
1473 *
1474 * Before returning xpc_initiate_disconnect() will wait until all connections
1475 * on the specified channel have been closed/torn down. So the caller can be
1476 * assured that they will not be receiving any more callouts from XPC to the
1477 * function they registered via xpc_connect().
1478 *
1479 * Arguments:
1480 *
1481 * ch_number - channel # to unregister.
1482 */
1483void
1484xpc_initiate_disconnect(int ch_number)
1485{
1486 unsigned long irq_flags;
1487 partid_t partid;
1488 struct xpc_partition *part;
1489 struct xpc_channel *ch;
1490
1491 DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS);
1492
1493 /* initiate the channel disconnect for every active partition */
1494 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
1495 part = &xpc_partitions[partid];
1496
1497 if (xpc_part_ref(part)) {
1498 ch = &part->channels[ch_number];
1499 xpc_msgqueue_ref(ch);
1500
1501 spin_lock_irqsave(&ch->lock, irq_flags);
1502
1503 if (!(ch->flags & XPC_C_DISCONNECTED)) {
1504 ch->flags |= XPC_C_WDISCONNECT;
1505
1506 XPC_DISCONNECT_CHANNEL(ch, xpcUnregistering,
1507 &irq_flags);
1508 }
1509
1510 spin_unlock_irqrestore(&ch->lock, irq_flags);
1511
1512 xpc_msgqueue_deref(ch);
1513 xpc_part_deref(part);
1514 }
1515 }
1516
1517 xpc_disconnect_wait(ch_number);
1518}
1519
1520/*
1521 * Disconnect a channel and reflect the disconnect back to all who may be waiting.
1522 *
1523 * An OPEN is not allowed until XPC_C_DISCONNECTING is cleared by
1524 * xpc_process_disconnect(), and if set, XPC_C_WDISCONNECT is cleared by
1525 * xpc_disconnect_wait().
1526 *
1527 * THE CHANNEL IS TO BE LOCKED BY THE CALLER AND WILL REMAIN LOCKED UPON RETURN.
1528 */
1529void
1530xpc_disconnect_channel(const int line, struct xpc_channel *ch,
1531 enum xpc_retval reason, unsigned long *irq_flags)
1532{
1533 u32 channel_was_connected = (ch->flags & XPC_C_CONNECTED);
1534
1535 DBUG_ON(!spin_is_locked(&ch->lock));
1536
1537 if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED))
1538 return;
1539
1540 DBUG_ON(!(ch->flags & (XPC_C_CONNECTING | XPC_C_CONNECTED)));
1541
1542 dev_dbg(xpc_chan, "reason=%d, line=%d, partid=%d, channel=%d\n",
1543 reason, line, ch->partid, ch->number);
1544
1545 XPC_SET_REASON(ch, reason, line);
1546
1547 ch->flags |= (XPC_C_CLOSEREQUEST | XPC_C_DISCONNECTING);
1548 /* some of these may not have been set */
1549 ch->flags &= ~(XPC_C_OPENREQUEST | XPC_C_OPENREPLY |
1550 XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY |
1551 XPC_C_CONNECTING | XPC_C_CONNECTED);
1552
1553 xpc_IPI_send_closerequest(ch, irq_flags);
1554
1555 if (channel_was_connected)
1556 ch->flags |= XPC_C_WASCONNECTED;
1557
1558 spin_unlock_irqrestore(&ch->lock, *irq_flags);
1559
1560 /* wake all idle kthreads so they can exit */
1561 if (atomic_read(&ch->kthreads_idle) > 0) {
1562 wake_up_all(&ch->idle_wq);
1563
1564 } else if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
1565 !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
1566 /* start a kthread that will do the xpcDisconnecting callout */
1567 xpc_create_kthreads(ch, 1, 1);
1568 }
1569
1570 /* wake those waiting to allocate an entry from the local msg queue */
1571 if (atomic_read(&ch->n_on_msg_allocate_wq) > 0)
1572 wake_up(&ch->msg_allocate_wq);
1573
1574 spin_lock_irqsave(&ch->lock, *irq_flags);
1575}
1576
1577void
1578xpc_disconnect_callout(struct xpc_channel *ch, enum xpc_retval reason)
1579{
1580 /*
1581 * Let the channel's registerer know that the channel is being
1582 * disconnected. We don't want to do this if the registerer was never
1583 * informed of a connection being made.
1584 */
1585
1586 if (ch->func != NULL) {
1587 dev_dbg(xpc_chan, "ch->func() called, reason=%d, partid=%d, "
1588 "channel=%d\n", reason, ch->partid, ch->number);
1589
1590 ch->func(reason, ch->partid, ch->number, NULL, ch->key);
1591
1592 dev_dbg(xpc_chan, "ch->func() returned, reason=%d, partid=%d, "
1593 "channel=%d\n", reason, ch->partid, ch->number);
1594 }
1595}
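
The connect and disconnect callouts above show the shape of the function a channel registerer supplies: it receives a reason code, the partition and channel numbers, a reason-specific data pointer, and the registerer's key. A minimal sketch of such a callback follows; only the argument pattern is taken from the callouts in this file, and the function name, messages, and dispatch are hypothetical.

	/*
	 * Hypothetical registerer callback (assumes the types and reason codes
	 * from xp.h).  Only the (reason, partid, ch_number, data, key) argument
	 * pattern is taken from the callouts above.
	 */
	static void
	example_channel_func(enum xpc_retval reason, partid_t partid, int ch_number,
			     void *data, void *key)
	{
		switch (reason) {
		case xpcConnected:
			/* data carries the local #of entries cast to a pointer */
			printk(KERN_INFO "channel %d to partition %d connected\n",
			       ch_number, partid);
			break;

		case xpcMsgReceived:
			/* data points at a delivered payload (see xpc_deliver_msg());
			 * process it, then hand it back via xpc_initiate_received() */
			break;

		default:
			/* any other reason is a disconnect notification; data is NULL */
			printk(KERN_INFO "channel %d to partition %d disconnected, "
			       "reason=%d\n", ch_number, partid, reason);
			break;
		}
	}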
1596
1597/*
1598 * Wait for a message entry to become available for the specified channel,
1599 * but don't wait any longer than 1 jiffy.
1600 */
1601static enum xpc_retval
1602xpc_allocate_msg_wait(struct xpc_channel *ch)
1603{
1604 enum xpc_retval ret;
1605
1606 if (ch->flags & XPC_C_DISCONNECTING) {
1607 DBUG_ON(ch->reason == xpcInterrupted);
1608 return ch->reason;
1609 }
1610
1611 atomic_inc(&ch->n_on_msg_allocate_wq);
1612 ret = interruptible_sleep_on_timeout(&ch->msg_allocate_wq, 1);
1613 atomic_dec(&ch->n_on_msg_allocate_wq);
1614
1615 if (ch->flags & XPC_C_DISCONNECTING) {
1616 ret = ch->reason;
1617 DBUG_ON(ch->reason == xpcInterrupted);
1618 } else if (ret == 0) {
1619 ret = xpcTimeout;
1620 } else {
1621 ret = xpcInterrupted;
1622 }
1623
1624 return ret;
1625}
1626
1627/*
1628 * Allocate an entry for a message from the message queue associated with the
1629 * specified channel.
1630 */
1631static enum xpc_retval
1632xpc_allocate_msg(struct xpc_channel *ch, u32 flags,
1633 struct xpc_msg **address_of_msg)
1634{
1635 struct xpc_msg *msg;
1636 enum xpc_retval ret;
1637 s64 put;
1638
1639 /* this reference will be dropped in xpc_send_msg() */
1640 xpc_msgqueue_ref(ch);
1641
1642 if (ch->flags & XPC_C_DISCONNECTING) {
1643 xpc_msgqueue_deref(ch);
1644 return ch->reason;
1645 }
1646 if (!(ch->flags & XPC_C_CONNECTED)) {
1647 xpc_msgqueue_deref(ch);
1648 return xpcNotConnected;
1649 }
1650
1651 /*
1652 * Get the next available message entry from the local message queue.
1653 * If none are available, we'll make sure that we grab the latest
1654 * GP values.
1655 */
1656 ret = xpcTimeout;
1657
1658 while (1) {
1659
1660 put = ch->w_local_GP.put;
1661 rmb(); /* guarantee that .put loads before .get */
1662 if (put - ch->w_remote_GP.get < ch->local_nentries) {
1663
1664 /* There are available message entries. We need to try
1665 * to secure one for ourselves. We'll do this by trying
1666 * to increment w_local_GP.put as long as someone else
1667 * doesn't beat us to it. If they do, we'll have to
1668 * try again.
1669 */
1670 if (cmpxchg(&ch->w_local_GP.put, put, put + 1) == put) {
1671 /* we got the entry referenced by put */
1672 break;
1673 }
1674 continue; /* try again */
1675 }
1676
1677 /*
1678 * There aren't any available msg entries at this time.
1679 *
1680 * In waiting for a message entry to become available,
1681 * we set a timeout in case the other side is not
1682 * sending completion IPIs. This lets us fake an IPI
1683 * that will cause the IPI handler to fetch the latest
1684 * GP values as if an IPI was sent by the other side.
1685 */
1686 if (ret == xpcTimeout)
1687 xpc_IPI_send_local_msgrequest(ch);
1688
1689 if (flags & XPC_NOWAIT) {
1690 xpc_msgqueue_deref(ch);
1691 return xpcNoWait;
1692 }
1693
1694 ret = xpc_allocate_msg_wait(ch);
1695 if (ret != xpcInterrupted && ret != xpcTimeout) {
1696 xpc_msgqueue_deref(ch);
1697 return ret;
1698 }
1699 }
1700
1701 /* get the message's address and initialize it */
1702 msg = (struct xpc_msg *)((u64)ch->local_msgqueue +
1703 (put % ch->local_nentries) * ch->msg_size);
1704
1705 DBUG_ON(msg->flags != 0);
1706 msg->number = put;
1707
1708 dev_dbg(xpc_chan, "w_local_GP.put changed to %ld; msg=0x%p, "
1709 "msg_number=%ld, partid=%d, channel=%d\n", put + 1,
1710 (void *)msg, msg->number, ch->partid, ch->number);
1711
1712 *address_of_msg = msg;
1713
1714 return xpcSuccess;
1715}
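
The reservation loop above is a lock-free pattern: read the cached Put, check it against the consumer's Get, then try to advance Put with cmpxchg() and retry if another sender won the race. A reduced, self-contained sketch of the same pattern is shown below using C11 atomics in place of the kernel's cmpxchg(); the queue depth is an assumed value.

	/* Reduced sketch of the slot-reservation loop, using C11 atomics. */
	#include <stdatomic.h>
	#include <stdint.h>

	#define NENTRIES 64			/* assumed local queue depth */

	static _Atomic int64_t queue_put;	/* producer cursor (w_local_GP.put)  */
	static _Atomic int64_t queue_get;	/* consumer cursor (w_remote_GP.get) */

	/* Try to reserve one entry; returns its message number, or -1 if the
	 * queue is currently full (the real code then waits or fakes an IPI). */
	static int64_t reserve_slot(void)
	{
		int64_t put;

		for (;;) {
			put = atomic_load(&queue_put);
			if (put - atomic_load(&queue_get) >= NENTRIES)
				return -1;	/* no free entries right now */

			/* claim the entry unless another sender beat us to it */
			if (atomic_compare_exchange_weak(&queue_put, &put, put + 1))
				return put;	/* slot index is put % NENTRIES */
		}
	}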
1716
1717/*
1718 * Allocate an entry for a message from the message queue associated with the
1719 * specified channel. NOTE that this routine can sleep waiting for a message
1720 * entry to become available. To not sleep, pass in the XPC_NOWAIT flag.
1721 *
1722 * Arguments:
1723 *
1724 * partid - ID of partition to which the channel is connected.
1725 * ch_number - channel #.
1726 * flags - see xpc.h for valid flags.
1727 * payload - address of the allocated payload area pointer (filled in on
1728 * return) in which the user-defined message is constructed.
1729 */
1730enum xpc_retval
1731xpc_initiate_allocate(partid_t partid, int ch_number, u32 flags, void **payload)
1732{
1733 struct xpc_partition *part = &xpc_partitions[partid];
1734 enum xpc_retval ret = xpcUnknownReason;
1735 struct xpc_msg *msg = NULL;
1736
1737 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
1738 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
1739
1740 *payload = NULL;
1741
1742 if (xpc_part_ref(part)) {
1743 ret = xpc_allocate_msg(&part->channels[ch_number], flags, &msg);
1744 xpc_part_deref(part);
1745
1746 if (msg != NULL)
1747 *payload = &msg->payload;
1748 }
1749
1750 return ret;
1751}
1752
1753/*
1754 * Now we actually send the messages that are ready to be sent by advancing
1755 * the local message queue's Put value and then sending an IPI to the recipient
1756 * partition.
1757 */
1758static void
1759xpc_send_msgs(struct xpc_channel *ch, s64 initial_put)
1760{
1761 struct xpc_msg *msg;
1762 s64 put = initial_put + 1;
1763 int send_IPI = 0;
1764
1765 while (1) {
1766
1767 while (1) {
1768 if (put == ch->w_local_GP.put)
1769 break;
1770
1771 msg = (struct xpc_msg *)((u64)ch->local_msgqueue +
1772 (put % ch->local_nentries) *
1773 ch->msg_size);
1774
1775 if (!(msg->flags & XPC_M_READY))
1776 break;
1777
1778 put++;
1779 }
1780
1781 if (put == initial_put) {
1782 /* nothing's changed */
1783 break;
1784 }
1785
1786 if (cmpxchg_rel(&ch->local_GP->put, initial_put, put) !=
1787 initial_put) {
1788 /* someone else beat us to it */
1789 DBUG_ON(ch->local_GP->put < initial_put);
1790 break;
1791 }
1792
1793 /* we just set the new value of local_GP->put */
1794
1795 dev_dbg(xpc_chan, "local_GP->put changed to %ld, partid=%d, "
1796 "channel=%d\n", put, ch->partid, ch->number);
1797
1798 send_IPI = 1;
1799
1800 /*
1801 * We need to ensure that the message referenced by
1802 * local_GP->put is not XPC_M_READY or that local_GP->put
1803 * equals w_local_GP.put, so we'll go have a look.
1804 */
1805 initial_put = put;
1806 }
1807
1808 if (send_IPI)
1809 xpc_IPI_send_msgrequest(ch);
1810}
1811
1812/*
1813 * Common code that does the actual sending of the message by advancing the
1814 * local message queue's Put value and sends an IPI to the partition the
1815 * message is being sent to.
1816 */
1817static enum xpc_retval
1818xpc_send_msg(struct xpc_channel *ch, struct xpc_msg *msg, u8 notify_type,
1819 xpc_notify_func func, void *key)
1820{
1821 enum xpc_retval ret = xpcSuccess;
1822	struct xpc_notify *notify = notify;	/* silence 'may be used uninitialized' warning */
1823 s64 put, msg_number = msg->number;
1824
1825 DBUG_ON(notify_type == XPC_N_CALL && func == NULL);
1826 DBUG_ON((((u64)msg - (u64)ch->local_msgqueue) / ch->msg_size) !=
1827 msg_number % ch->local_nentries);
1828 DBUG_ON(msg->flags & XPC_M_READY);
1829
1830 if (ch->flags & XPC_C_DISCONNECTING) {
1831 /* drop the reference grabbed in xpc_allocate_msg() */
1832 xpc_msgqueue_deref(ch);
1833 return ch->reason;
1834 }
1835
1836 if (notify_type != 0) {
1837 /*
1838 * Tell the remote side to send an ACK interrupt when the
1839 * message has been delivered.
1840 */
1841 msg->flags |= XPC_M_INTERRUPT;
1842
1843 atomic_inc(&ch->n_to_notify);
1844
1845 notify = &ch->notify_queue[msg_number % ch->local_nentries];
1846 notify->func = func;
1847 notify->key = key;
1848 notify->type = notify_type;
1849
1850 /* >>> is a mb() needed here? */
1851
1852 if (ch->flags & XPC_C_DISCONNECTING) {
1853 /*
1854 * An error occurred between our last error check and
1855 * this one. We will try to clear the type field from
1856 * the notify entry. If we succeed then
1857 * xpc_disconnect_channel() didn't already process
1858 * the notify entry.
1859 */
1860 if (cmpxchg(&notify->type, notify_type, 0) ==
1861 notify_type) {
1862 atomic_dec(&ch->n_to_notify);
1863 ret = ch->reason;
1864 }
1865
1866 /* drop the reference grabbed in xpc_allocate_msg() */
1867 xpc_msgqueue_deref(ch);
1868 return ret;
1869 }
1870 }
1871
1872 msg->flags |= XPC_M_READY;
1873
1874 /*
1875 * The preceding store of msg->flags must occur before the following
1876 * load of ch->local_GP->put.
1877 */
1878 mb();
1879
1880 /* see if the message is next in line to be sent, if so send it */
1881
1882 put = ch->local_GP->put;
1883 if (put == msg_number)
1884 xpc_send_msgs(ch, put);
1885
1886 /* drop the reference grabbed in xpc_allocate_msg() */
1887 xpc_msgqueue_deref(ch);
1888 return ret;
1889}
1890
1891/*
1892 * Send a message previously allocated using xpc_initiate_allocate() on the
1893 * specified channel connected to the specified partition.
1894 *
1895 * This routine will not wait for the message to be received, nor will
1896 * notification be given when it does happen. Once this routine has returned,
1897 * the message entry allocated via xpc_initiate_allocate() is no longer
1898 * accessible to the caller.
1899 *
1900 * This routine, although called by users, does not call xpc_part_ref() to
1901 * ensure that the partition infrastructure is in place. It relies on the
1902 * fact that we called xpc_msgqueue_ref() in xpc_allocate_msg().
1903 *
1904 * Arguments:
1905 *
1906 * partid - ID of partition to which the channel is connected.
1907 * ch_number - channel # to send message on.
1908 * payload - pointer to the payload area allocated via
1909 * xpc_initiate_allocate().
1910 */
1911enum xpc_retval
1912xpc_initiate_send(partid_t partid, int ch_number, void *payload)
1913{
1914 struct xpc_partition *part = &xpc_partitions[partid];
1915 struct xpc_msg *msg = XPC_MSG_ADDRESS(payload);
1916 enum xpc_retval ret;
1917
1918 dev_dbg(xpc_chan, "msg=0x%p, partid=%d, channel=%d\n", (void *)msg,
1919 partid, ch_number);
1920
1921 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
1922 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
1923 DBUG_ON(msg == NULL);
1924
1925 ret = xpc_send_msg(&part->channels[ch_number], msg, 0, NULL, NULL);
1926
1927 return ret;
1928}
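
Taken together with xpc_initiate_allocate() above, this is the whole send path from a caller's point of view: allocate a payload area, fill it in, and send it, after which the payload must not be touched again. A hypothetical sender is sketched below; the payload layout and the use of flags == 0 (block until an entry is free, as opposed to XPC_NOWAIT) are assumptions made only for the sketch.

	/* Hypothetical sender using the allocate-then-send flow documented above. */
	struct example_payload {
		u64 seq;
		char text[32];
	};

	static enum xpc_retval
	example_send(partid_t partid, int ch_number, u64 seq)
	{
		struct example_payload *p;
		enum xpc_retval ret;

		ret = xpc_initiate_allocate(partid, ch_number, 0, (void **)&p);
		if (ret != xpcSuccess)
			return ret;		/* e.g. xpcNotConnected */

		p->seq = seq;
		snprintf(p->text, sizeof(p->text), "hello %lu", (unsigned long)seq);

		/* once sent, the payload area is no longer ours to touch */
		return xpc_initiate_send(partid, ch_number, p);
	}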
1929
1930/*
1931 * Send a message previously allocated using xpc_initiate_allocate() on the
1932 * specified channel connected to the specified partition.
1933 *
1934 * This routine will not wait for the message to be sent. Once this routine
1935 * has returned, the message entry allocated via xpc_initiate_allocate() is no
1936 * longer accessible to the caller.
1937 *
1938 * Once the remote end of the channel has received the message, the function
1939 * passed as an argument to xpc_initiate_send_notify() will be called. This
1940 * allows the sender to free up or re-use any buffers referenced by the
1941 * message, but does NOT mean the message has been processed at the remote
1942 * end by a receiver.
1943 *
1944 * If this routine returns an error, the caller's function will NOT be called.
1945 *
1946 * This routine, although called by users, does not call xpc_part_ref() to
1947 * ensure that the partition infrastructure is in place. It relies on the
1948 * fact that we called xpc_msgqueue_ref() in xpc_allocate_msg().
1949 *
1950 * Arguments:
1951 *
1952 * partid - ID of partition to which the channel is connected.
1953 * ch_number - channel # to send message on.
1954 * payload - pointer to the payload area allocated via
1955 * xpc_initiate_allocate().
1956 * func - function to call with asynchronous notification of message
1957 * receipt. THIS FUNCTION MUST BE NON-BLOCKING.
1958 * key - user-defined key to be passed to the function when it's called.
1959 */
1960enum xpc_retval
1961xpc_initiate_send_notify(partid_t partid, int ch_number, void *payload,
1962 xpc_notify_func func, void *key)
1963{
1964 struct xpc_partition *part = &xpc_partitions[partid];
1965 struct xpc_msg *msg = XPC_MSG_ADDRESS(payload);
1966 enum xpc_retval ret;
1967
1968 dev_dbg(xpc_chan, "msg=0x%p, partid=%d, channel=%d\n", (void *)msg,
1969 partid, ch_number);
1970
1971 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
1972 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
1973 DBUG_ON(msg == NULL);
1974 DBUG_ON(func == NULL);
1975
1976 ret = xpc_send_msg(&part->channels[ch_number], msg, XPC_N_CALL,
1977 func, key);
1978 return ret;
1979}
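
When delivery confirmation is needed, the same flow ends with xpc_initiate_send_notify() and a non-blocking callback. The sketch below assumes the xpc_notify_func prototype from xp.h takes (reason, partid, ch_number, key); that prototype, the request structure, and the function names are assumptions made for illustration.

	/*
	 * Hypothetical notified send.  The callback must not block, so it only
	 * records the outcome; xpcMsgDelivered means the remote side has
	 * received the message (see the xpc_notify_senders() usage above).
	 */
	struct example_req {
		atomic_t acked;
	};

	static void
	example_notify_func(enum xpc_retval reason, partid_t partid, int ch_number,
			    void *key)
	{
		struct example_req *req = key;

		if (reason == xpcMsgDelivered)
			atomic_set(&req->acked, 1);
	}

	static enum xpc_retval
	example_send_notify(partid_t partid, int ch_number, void *payload,
			    struct example_req *req)
	{
		atomic_set(&req->acked, 0);
		return xpc_initiate_send_notify(partid, ch_number, payload,
						example_notify_func, req);
	}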
1980
1981static struct xpc_msg *
1982xpc_pull_remote_msg(struct xpc_channel *ch, s64 get)
1983{
1984 struct xpc_partition *part = &xpc_partitions[ch->partid];
1985 struct xpc_msg *remote_msg, *msg;
1986 u32 msg_index, nmsgs;
1987 u64 msg_offset;
1988 enum xpc_retval ret;
1989
1990 if (mutex_lock_interruptible(&ch->msg_to_pull_mutex) != 0) {
1991 /* we were interrupted by a signal */
1992 return NULL;
1993 }
1994
1995 while (get >= ch->next_msg_to_pull) {
1996
1997 /* pull as many messages as are ready and able to be pulled */
1998
1999 msg_index = ch->next_msg_to_pull % ch->remote_nentries;
2000
2001 DBUG_ON(ch->next_msg_to_pull >= ch->w_remote_GP.put);
2002 nmsgs = ch->w_remote_GP.put - ch->next_msg_to_pull;
2003 if (msg_index + nmsgs > ch->remote_nentries) {
2004 /* ignore the ones that wrap the msg queue for now */
2005 nmsgs = ch->remote_nentries - msg_index;
2006 }
2007
2008 msg_offset = msg_index * ch->msg_size;
2009 msg = (struct xpc_msg *)((u64)ch->remote_msgqueue + msg_offset);
2010 remote_msg = (struct xpc_msg *)(ch->remote_msgqueue_pa +
2011 msg_offset);
2012
2013 ret = xpc_pull_remote_cachelines(part, msg, remote_msg,
2014 nmsgs * ch->msg_size);
2015 if (ret != xpcSuccess) {
2016
2017 dev_dbg(xpc_chan, "failed to pull %d msgs starting with"
2018 " msg %ld from partition %d, channel=%d, "
2019 "ret=%d\n", nmsgs, ch->next_msg_to_pull,
2020 ch->partid, ch->number, ret);
2021
2022 XPC_DEACTIVATE_PARTITION(part, ret);
2023
2024 mutex_unlock(&ch->msg_to_pull_mutex);
2025 return NULL;
2026 }
2027
2028 ch->next_msg_to_pull += nmsgs;
2029 }
2030
2031 mutex_unlock(&ch->msg_to_pull_mutex);
2032
2033 /* return the message we were looking for */
2034 msg_offset = (get % ch->remote_nentries) * ch->msg_size;
2035 msg = (struct xpc_msg *)((u64)ch->remote_msgqueue + msg_offset);
2036
2037 return msg;
2038}
2039
2040/*
2041 * Get a message to be delivered.
2042 */
2043static struct xpc_msg *
2044xpc_get_deliverable_msg(struct xpc_channel *ch)
2045{
2046 struct xpc_msg *msg = NULL;
2047 s64 get;
2048
2049 do {
2050 if (ch->flags & XPC_C_DISCONNECTING)
2051 break;
2052
2053 get = ch->w_local_GP.get;
2054 rmb(); /* guarantee that .get loads before .put */
2055 if (get == ch->w_remote_GP.put)
2056 break;
2057
2058 /* There are messages waiting to be pulled and delivered.
2059 * We need to try to secure one for ourselves. We'll do this
2060 * by trying to increment w_local_GP.get and hope that no one
2061	 * else beats us to it. If they do, we'll simply have
2062 * to try again for the next one.
2063 */
2064
2065 if (cmpxchg(&ch->w_local_GP.get, get, get + 1) == get) {
2066 /* we got the entry referenced by get */
2067
2068 dev_dbg(xpc_chan, "w_local_GP.get changed to %ld, "
2069 "partid=%d, channel=%d\n", get + 1,
2070 ch->partid, ch->number);
2071
2072 /* pull the message from the remote partition */
2073
2074 msg = xpc_pull_remote_msg(ch, get);
2075
2076 DBUG_ON(msg != NULL && msg->number != get);
2077 DBUG_ON(msg != NULL && (msg->flags & XPC_M_DONE));
2078 DBUG_ON(msg != NULL && !(msg->flags & XPC_M_READY));
2079
2080 break;
2081 }
2082
2083 } while (1);
2084
2085 return msg;
2086}
2087
2088/*
2089 * Deliver a message to its intended recipient.
2090 */
2091void
2092xpc_deliver_msg(struct xpc_channel *ch)
2093{
2094 struct xpc_msg *msg;
2095
2096 msg = xpc_get_deliverable_msg(ch);
2097 if (msg != NULL) {
2098
2099 /*
2100 * This ref is taken to protect the payload itself from being
2101 * freed before the user is finished with it, which the user
2102 * indicates by calling xpc_initiate_received().
2103 */
2104 xpc_msgqueue_ref(ch);
2105
2106 atomic_inc(&ch->kthreads_active);
2107
2108 if (ch->func != NULL) {
2109 dev_dbg(xpc_chan, "ch->func() called, msg=0x%p, "
2110 "msg_number=%ld, partid=%d, channel=%d\n",
2111 (void *)msg, msg->number, ch->partid,
2112 ch->number);
2113
2114 /* deliver the message to its intended recipient */
2115 ch->func(xpcMsgReceived, ch->partid, ch->number,
2116 &msg->payload, ch->key);
2117
2118 dev_dbg(xpc_chan, "ch->func() returned, msg=0x%p, "
2119 "msg_number=%ld, partid=%d, channel=%d\n",
2120 (void *)msg, msg->number, ch->partid,
2121 ch->number);
2122 }
2123
2124 atomic_dec(&ch->kthreads_active);
2125 }
2126}
2127
2128/*
2129 * Now we actually acknowledge the messages that have been delivered and ack'd
2130 * by advancing the cached remote message queue's Get value and, if requested,
2131 * sending an IPI to the message sender's partition.
2132 */
2133static void
2134xpc_acknowledge_msgs(struct xpc_channel *ch, s64 initial_get, u8 msg_flags)
2135{
2136 struct xpc_msg *msg;
2137 s64 get = initial_get + 1;
2138 int send_IPI = 0;
2139
2140 while (1) {
2141
2142 while (1) {
2143 if (get == ch->w_local_GP.get)
2144 break;
2145
2146 msg = (struct xpc_msg *)((u64)ch->remote_msgqueue +
2147 (get % ch->remote_nentries) *
2148 ch->msg_size);
2149
2150 if (!(msg->flags & XPC_M_DONE))
2151 break;
2152
2153 msg_flags |= msg->flags;
2154 get++;
2155 }
2156
2157 if (get == initial_get) {
2158 /* nothing's changed */
2159 break;
2160 }
2161
2162 if (cmpxchg_rel(&ch->local_GP->get, initial_get, get) !=
2163 initial_get) {
2164 /* someone else beat us to it */
2165 DBUG_ON(ch->local_GP->get <= initial_get);
2166 break;
2167 }
2168
2169 /* we just set the new value of local_GP->get */
2170
2171 dev_dbg(xpc_chan, "local_GP->get changed to %ld, partid=%d, "
2172 "channel=%d\n", get, ch->partid, ch->number);
2173
2174 send_IPI = (msg_flags & XPC_M_INTERRUPT);
2175
2176 /*
2177 * We need to ensure that the message referenced by
2178 * local_GP->get is not XPC_M_DONE or that local_GP->get
2179 * equals w_local_GP.get, so we'll go have a look.
2180 */
2181 initial_get = get;
2182 }
2183
2184 if (send_IPI)
2185 xpc_IPI_send_msgrequest(ch);
2186}
2187
2188/*
2189 * Acknowledge receipt of a delivered message.
2190 *
2191 * If a message has XPC_M_INTERRUPT set, send an interrupt to the partition
2192 * that sent the message.
2193 *
2194 * This function, although called by users, does not call xpc_part_ref() to
2195 * ensure that the partition infrastructure is in place. It relies on the
2196 * fact that we called xpc_msgqueue_ref() in xpc_deliver_msg().
2197 *
2198 * Arguments:
2199 *
2200 * partid - ID of partition to which the channel is connected.
2201 * ch_number - channel # message received on.
2202 * payload - pointer to the payload area allocated via
2203 * xpc_initiate_allocate().
2204 */
2205void
2206xpc_initiate_received(partid_t partid, int ch_number, void *payload)
2207{
2208 struct xpc_partition *part = &xpc_partitions[partid];
2209 struct xpc_channel *ch;
2210 struct xpc_msg *msg = XPC_MSG_ADDRESS(payload);
2211 s64 get, msg_number = msg->number;
2212
2213 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
2214 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
2215
2216 ch = &part->channels[ch_number];
2217
2218 dev_dbg(xpc_chan, "msg=0x%p, msg_number=%ld, partid=%d, channel=%d\n",
2219 (void *)msg, msg_number, ch->partid, ch->number);
2220
2221 DBUG_ON((((u64)msg - (u64)ch->remote_msgqueue) / ch->msg_size) !=
2222 msg_number % ch->remote_nentries);
2223 DBUG_ON(msg->flags & XPC_M_DONE);
2224
2225 msg->flags |= XPC_M_DONE;
2226
2227 /*
2228 * The preceding store of msg->flags must occur before the following
2229 * load of ch->local_GP->get.
2230 */
2231 mb();
2232
2233 /*
2234 * See if this message is next in line to be acknowledged as having
2235 * been delivered.
2236 */
2237 get = ch->local_GP->get;
2238 if (get == msg_number)
2239 xpc_acknowledge_msgs(ch, get, msg->flags);
2240
2241 /* the call to xpc_msgqueue_ref() was done by xpc_deliver_msg() */
2242 xpc_msgqueue_deref(ch);
2243}
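
On the receiving side the registerer's channel function is handed the payload pointer with reason xpcMsgReceived (see xpc_deliver_msg() above) and, once finished with it, returns it through xpc_initiate_received() so the entry can be acknowledged and eventually reused. A hypothetical receive handler, with an assumed payload layout matching the sender sketch earlier:

	/* Hypothetical receive handler; payload layout is assumed. */
	struct example_payload {
		u64 seq;
		char text[32];
	};

	static void
	example_recv_func(enum xpc_retval reason, partid_t partid, int ch_number,
			  void *data, void *key)
	{
		if (reason == xpcMsgReceived) {
			struct example_payload *p = data;

			printk(KERN_DEBUG "got msg %lu: %.32s\n",
			       (unsigned long)p->seq, p->text);

			/* done with the payload; let XPC ack the message entry */
			xpc_initiate_received(partid, ch_number, data);
		}
	}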
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c
new file mode 100644
index 000000000000..f673ba90eb0e
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_main.c
@@ -0,0 +1,1323 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9/*
10 * Cross Partition Communication (XPC) support - standard version.
11 *
12 * XPC provides a message passing capability that crosses partition
13 * boundaries. This module is made up of two parts:
14 *
15 * partition This part detects the presence/absence of other
16 * partitions. It provides a heartbeat and monitors
17 * the heartbeats of other partitions.
18 *
19 * channel This part manages the channels and sends/receives
20 * messages across them to/from other partitions.
21 *
22 * There are a couple of additional functions residing in XP, which
23 * provide an interface to XPC for its users.
24 *
25 *
26 * Caveats:
27 *
28 * . We currently have no way to determine which nasid an IPI came
29 * from. Thus, xpc_IPI_send() does a remote AMO write followed by
30 * an IPI. The AMO indicates where data is to be pulled from, so
31 * after the IPI arrives, the remote partition checks the AMO word.
32 * The IPI can actually arrive before the AMO however, so other code
33 * must periodically check for this case. Also, remote AMO operations
34 * do not reliably time out. Thus we do a remote PIO read solely to
35 * know whether the remote partition is down and whether we should
36 * stop sending IPIs to it. This remote PIO read operation is set up
37 * in a special nofault region so SAL knows to ignore (and cleanup)
38 * any errors due to the remote AMO write, PIO read, and/or PIO
39 * write operations.
40 *
41 * If/when new hardware solves this IPI problem, we should abandon
42 * the current approach.
43 *
44 */
45
46#include <linux/kernel.h>
47#include <linux/module.h>
48#include <linux/init.h>
49#include <linux/cache.h>
50#include <linux/interrupt.h>
51#include <linux/delay.h>
52#include <linux/reboot.h>
53#include <linux/completion.h>
54#include <linux/kdebug.h>
55#include <linux/kthread.h>
56#include <linux/uaccess.h>
57#include <asm/sn/intr.h>
58#include <asm/sn/sn_sal.h>
59#include "xpc.h"
60
61/* define two XPC debug device structures to be used with dev_dbg() et al */
62
63struct device_driver xpc_dbg_name = {
64 .name = "xpc"
65};
66
67struct device xpc_part_dbg_subname = {
68 .bus_id = {0}, /* set to "part" at xpc_init() time */
69 .driver = &xpc_dbg_name
70};
71
72struct device xpc_chan_dbg_subname = {
73 .bus_id = {0}, /* set to "chan" at xpc_init() time */
74 .driver = &xpc_dbg_name
75};
76
77struct device *xpc_part = &xpc_part_dbg_subname;
78struct device *xpc_chan = &xpc_chan_dbg_subname;
79
80static int xpc_kdebug_ignore;
81
82/* systune related variables for /proc/sys directories */
83
84static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL;
85static int xpc_hb_min_interval = 1;
86static int xpc_hb_max_interval = 10;
87
88static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL;
89static int xpc_hb_check_min_interval = 10;
90static int xpc_hb_check_max_interval = 120;
91
92int xpc_disengage_request_timelimit = XPC_DISENGAGE_REQUEST_DEFAULT_TIMELIMIT;
93static int xpc_disengage_request_min_timelimit; /* = 0 */
94static int xpc_disengage_request_max_timelimit = 120;
95
96static ctl_table xpc_sys_xpc_hb_dir[] = {
97 {
98 .ctl_name = CTL_UNNUMBERED,
99 .procname = "hb_interval",
100 .data = &xpc_hb_interval,
101 .maxlen = sizeof(int),
102 .mode = 0644,
103 .proc_handler = &proc_dointvec_minmax,
104 .strategy = &sysctl_intvec,
105 .extra1 = &xpc_hb_min_interval,
106 .extra2 = &xpc_hb_max_interval},
107 {
108 .ctl_name = CTL_UNNUMBERED,
109 .procname = "hb_check_interval",
110 .data = &xpc_hb_check_interval,
111 .maxlen = sizeof(int),
112 .mode = 0644,
113 .proc_handler = &proc_dointvec_minmax,
114 .strategy = &sysctl_intvec,
115 .extra1 = &xpc_hb_check_min_interval,
116 .extra2 = &xpc_hb_check_max_interval},
117 {}
118};
119static ctl_table xpc_sys_xpc_dir[] = {
120 {
121 .ctl_name = CTL_UNNUMBERED,
122 .procname = "hb",
123 .mode = 0555,
124 .child = xpc_sys_xpc_hb_dir},
125 {
126 .ctl_name = CTL_UNNUMBERED,
127 .procname = "disengage_request_timelimit",
128 .data = &xpc_disengage_request_timelimit,
129 .maxlen = sizeof(int),
130 .mode = 0644,
131 .proc_handler = &proc_dointvec_minmax,
132 .strategy = &sysctl_intvec,
133 .extra1 = &xpc_disengage_request_min_timelimit,
134 .extra2 = &xpc_disengage_request_max_timelimit},
135 {}
136};
137static ctl_table xpc_sys_dir[] = {
138 {
139 .ctl_name = CTL_UNNUMBERED,
140 .procname = "xpc",
141 .mode = 0555,
142 .child = xpc_sys_xpc_dir},
143 {}
144};
145static struct ctl_table_header *xpc_sysctl;
146
147/* non-zero if any remote partition disengage request was timed out */
148int xpc_disengage_request_timedout;
149
150/* #of IRQs received */
151static atomic_t xpc_act_IRQ_rcvd;
152
153/* IRQ handler notifies this wait queue on receipt of an IRQ */
154static DECLARE_WAIT_QUEUE_HEAD(xpc_act_IRQ_wq);
155
156static unsigned long xpc_hb_check_timeout;
157
158/* notification that the xpc_hb_checker thread has exited */
159static DECLARE_COMPLETION(xpc_hb_checker_exited);
160
161/* notification that the xpc_discovery thread has exited */
162static DECLARE_COMPLETION(xpc_discovery_exited);
163
164static struct timer_list xpc_hb_timer;
165
166static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);
167
168static int xpc_system_reboot(struct notifier_block *, unsigned long, void *);
169static struct notifier_block xpc_reboot_notifier = {
170 .notifier_call = xpc_system_reboot,
171};
172
173static int xpc_system_die(struct notifier_block *, unsigned long, void *);
174static struct notifier_block xpc_die_notifier = {
175 .notifier_call = xpc_system_die,
176};
177
178/*
179 * Timer function to enforce the timelimit on the partition disengage request.
180 */
181static void
182xpc_timeout_partition_disengage_request(unsigned long data)
183{
184 struct xpc_partition *part = (struct xpc_partition *)data;
185
186 DBUG_ON(time_before(jiffies, part->disengage_request_timeout));
187
188 (void)xpc_partition_disengaged(part);
189
190 DBUG_ON(part->disengage_request_timeout != 0);
191 DBUG_ON(xpc_partition_engaged(1UL << XPC_PARTID(part)) != 0);
192}
193
194/*
195 * Notify the heartbeat check thread that an IRQ has been received.
196 */
197static irqreturn_t
198xpc_act_IRQ_handler(int irq, void *dev_id)
199{
200 atomic_inc(&xpc_act_IRQ_rcvd);
201 wake_up_interruptible(&xpc_act_IRQ_wq);
202 return IRQ_HANDLED;
203}
204
205/*
206 * Timer to produce the heartbeat. The timer structure's function is
207 * already set when this is initially called. A tunable is used to
208 * specify when the next timeout should occur.
209 */
210static void
211xpc_hb_beater(unsigned long dummy)
212{
213 xpc_vars->heartbeat++;
214
215 if (time_after_eq(jiffies, xpc_hb_check_timeout))
216 wake_up_interruptible(&xpc_act_IRQ_wq);
217
218 xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
219 add_timer(&xpc_hb_timer);
220}
221
222/*
223 * This thread is responsible for nearly all of the partition
224 * activation/deactivation.
225 */
226static int
227xpc_hb_checker(void *ignore)
228{
229 int last_IRQ_count = 0;
230 int new_IRQ_count;
231 int force_IRQ = 0;
232
233 /* this thread was marked active by xpc_hb_init() */
234
235 set_cpus_allowed(current, cpumask_of_cpu(XPC_HB_CHECK_CPU));
236
237 /* set our heartbeating to other partitions into motion */
238 xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
239 xpc_hb_beater(0);
240
241 while (!xpc_exiting) {
242
243 dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
244 "been received\n",
245 (int)(xpc_hb_check_timeout - jiffies),
246 atomic_read(&xpc_act_IRQ_rcvd) - last_IRQ_count);
247
248 /* checking of remote heartbeats is skewed by IRQ handling */
249 if (time_after_eq(jiffies, xpc_hb_check_timeout)) {
250 dev_dbg(xpc_part, "checking remote heartbeats\n");
251 xpc_check_remote_hb();
252
253 /*
254 * We need to periodically recheck to ensure no
255 * IPI/AMO pairs have been missed. That check
256 * must always reset xpc_hb_check_timeout.
257 */
258 force_IRQ = 1;
259 }
260
261 /* check for outstanding IRQs */
262 new_IRQ_count = atomic_read(&xpc_act_IRQ_rcvd);
263 if (last_IRQ_count < new_IRQ_count || force_IRQ != 0) {
264 force_IRQ = 0;
265
266 dev_dbg(xpc_part, "found an IRQ to process; will be "
267 "resetting xpc_hb_check_timeout\n");
268
269 last_IRQ_count += xpc_identify_act_IRQ_sender();
270 if (last_IRQ_count < new_IRQ_count) {
271 /* retry once to help avoid missing AMO */
272 (void)xpc_identify_act_IRQ_sender();
273 }
274 last_IRQ_count = new_IRQ_count;
275
276 xpc_hb_check_timeout = jiffies +
277 (xpc_hb_check_interval * HZ);
278 }
279
280 /* wait for IRQ or timeout */
281 (void)wait_event_interruptible(xpc_act_IRQ_wq,
282 (last_IRQ_count <
283 atomic_read(&xpc_act_IRQ_rcvd)
284 || time_after_eq(jiffies,
285 xpc_hb_check_timeout) ||
286 xpc_exiting));
287 }
288
289 dev_dbg(xpc_part, "heartbeat checker is exiting\n");
290
291 /* mark this thread as having exited */
292 complete(&xpc_hb_checker_exited);
293 return 0;
294}
295
296/*
297 * This thread will attempt to discover other partitions to activate
298 * based on info provided by SAL. This new thread is short lived and
299 * will exit once discovery is complete.
300 */
301static int
302xpc_initiate_discovery(void *ignore)
303{
304 xpc_discovery();
305
306 dev_dbg(xpc_part, "discovery thread is exiting\n");
307
308 /* mark this thread as having exited */
309 complete(&xpc_discovery_exited);
310 return 0;
311}
312
313/*
314 * Establish first contact with the remote partition. This involves pulling
315 * the XPC per partition variables from the remote partition and waiting for
316 * the remote partition to pull ours.
317 */
318static enum xpc_retval
319xpc_make_first_contact(struct xpc_partition *part)
320{
321 enum xpc_retval ret;
322
323 while ((ret = xpc_pull_remote_vars_part(part)) != xpcSuccess) {
324 if (ret != xpcRetry) {
325 XPC_DEACTIVATE_PARTITION(part, ret);
326 return ret;
327 }
328
329 dev_dbg(xpc_chan, "waiting to make first contact with "
330 "partition %d\n", XPC_PARTID(part));
331
332 /* wait a 1/4 of a second or so */
333 (void)msleep_interruptible(250);
334
335 if (part->act_state == XPC_P_DEACTIVATING)
336 return part->reason;
337 }
338
339 return xpc_mark_partition_active(part);
340}
341
342/*
343 * The first kthread assigned to a newly activated partition is the one
344 * created by XPC HB with which it calls xpc_partition_up(). XPC hangs on to
345 * that kthread until the partition is brought down, at which time that kthread
346 * returns to XPC HB. (The return of that kthread will signify to XPC HB
347 * that XPC has dismantled all communication infrastructure for the associated
348 * partition.) This kthread becomes the channel manager for that partition.
349 *
350 * Each active partition has a channel manager, who, besides connecting and
351 * disconnecting channels, will ensure that each of the partition's connected
352 * channels has the required number of assigned kthreads to get the work done.
353 */
354static void
355xpc_channel_mgr(struct xpc_partition *part)
356{
357 while (part->act_state != XPC_P_DEACTIVATING ||
358 atomic_read(&part->nchannels_active) > 0 ||
359 !xpc_partition_disengaged(part)) {
360
361 xpc_process_channel_activity(part);
362
363 /*
364 * Wait until we've been requested to activate kthreads or
365 * all of the channel's message queues have been torn down or
366 * a signal is pending.
367 *
368		 * channel_mgr_requests is set to 1 after the channel mgr is
369		 * awakened. This is done to prevent the channel mgr from making
370		 * one pass through the loop for each request, since it will
371		 * be servicing all the requests in one pass. The reason it's
372		 * set to 1 instead of 0 is so that other kthreads will know
373		 * that the channel mgr is running and won't bother trying to
374		 * wake it up.
375 */
376 atomic_dec(&part->channel_mgr_requests);
377 (void)wait_event_interruptible(part->channel_mgr_wq,
378 (atomic_read(&part->channel_mgr_requests) > 0 ||
379 part->local_IPI_amo != 0 ||
380 (part->act_state == XPC_P_DEACTIVATING &&
381 atomic_read(&part->nchannels_active) == 0 &&
382 xpc_partition_disengaged(part))));
383 atomic_set(&part->channel_mgr_requests, 1);
384 }
385}
386
387/*
388 * When XPC HB determines that a partition has come up, it will create a new
389 * kthread and that kthread will call this function to attempt to set up the
390 * basic infrastructure used for Cross Partition Communication with the newly
391 * upped partition.
392 *
393 * The kthread that was created by XPC HB and which set up the XPC
394 * infrastructure will remain assigned to the partition until the partition
395 * goes down, at which time the kthread will tear down the XPC infrastructure
396 * and then exit.
397 *
398 * XPC HB will put the remote partition's XPC per partition specific variables
399 * physical address into xpc_partitions[partid].remote_vars_part_pa prior to
400 * calling xpc_partition_up().
401 */
402static void
403xpc_partition_up(struct xpc_partition *part)
404{
405 DBUG_ON(part->channels != NULL);
406
407 dev_dbg(xpc_chan, "activating partition %d\n", XPC_PARTID(part));
408
409 if (xpc_setup_infrastructure(part) != xpcSuccess)
410 return;
411
412 /*
413 * The kthread that XPC HB called us with will become the
414 * channel manager for this partition. It will not return
415 * back to XPC HB until the partition's XPC infrastructure
416 * has been dismantled.
417 */
418
419 (void)xpc_part_ref(part); /* this will always succeed */
420
421 if (xpc_make_first_contact(part) == xpcSuccess)
422 xpc_channel_mgr(part);
423
424 xpc_part_deref(part);
425
426 xpc_teardown_infrastructure(part);
427}
428
429static int
430xpc_activating(void *__partid)
431{
432 partid_t partid = (u64)__partid;
433 struct xpc_partition *part = &xpc_partitions[partid];
434 unsigned long irq_flags;
435
436 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
437
438 spin_lock_irqsave(&part->act_lock, irq_flags);
439
440 if (part->act_state == XPC_P_DEACTIVATING) {
441 part->act_state = XPC_P_INACTIVE;
442 spin_unlock_irqrestore(&part->act_lock, irq_flags);
443 part->remote_rp_pa = 0;
444 return 0;
445 }
446
447 /* indicate the thread is activating */
448 DBUG_ON(part->act_state != XPC_P_ACTIVATION_REQ);
449 part->act_state = XPC_P_ACTIVATING;
450
451 XPC_SET_REASON(part, 0, 0);
452 spin_unlock_irqrestore(&part->act_lock, irq_flags);
453
454 dev_dbg(xpc_part, "bringing partition %d up\n", partid);
455
456 /*
457 * Register the remote partition's AMOs with SAL so it can handle
458 * and cleanup errors within that address range should the remote
459 * partition go down. We don't unregister this range because it is
460 * difficult to tell when outstanding writes to the remote partition
461 * are finished and thus when it is safe to unregister. This should
462 * not result in wasted space in the SAL xp_addr_region table because
463 * we should get the same page for remote_amos_page_pa after module
464 * reloads and system reboots.
465 */
466 if (sn_register_xp_addr_region(part->remote_amos_page_pa,
467 PAGE_SIZE, 1) < 0) {
468 dev_warn(xpc_part, "xpc_partition_up(%d) failed to register "
469 "xp_addr region\n", partid);
470
471 spin_lock_irqsave(&part->act_lock, irq_flags);
472 part->act_state = XPC_P_INACTIVE;
473 XPC_SET_REASON(part, xpcPhysAddrRegFailed, __LINE__);
474 spin_unlock_irqrestore(&part->act_lock, irq_flags);
475 part->remote_rp_pa = 0;
476 return 0;
477 }
478
479 xpc_allow_hb(partid, xpc_vars);
480 xpc_IPI_send_activated(part);
481
482 /*
483 * xpc_partition_up() holds this thread and marks this partition as
484 * XPC_P_ACTIVE by calling xpc_hb_mark_active().
485 */
486 (void)xpc_partition_up(part);
487
488 xpc_disallow_hb(partid, xpc_vars);
489 xpc_mark_partition_inactive(part);
490
491 if (part->reason == xpcReactivating) {
492 /* interrupting ourselves results in activating partition */
493 xpc_IPI_send_reactivate(part);
494 }
495
496 return 0;
497}
498
499void
500xpc_activate_partition(struct xpc_partition *part)
501{
502 partid_t partid = XPC_PARTID(part);
503 unsigned long irq_flags;
504 struct task_struct *kthread;
505
506 spin_lock_irqsave(&part->act_lock, irq_flags);
507
508 DBUG_ON(part->act_state != XPC_P_INACTIVE);
509
510 part->act_state = XPC_P_ACTIVATION_REQ;
511 XPC_SET_REASON(part, xpcCloneKThread, __LINE__);
512
513 spin_unlock_irqrestore(&part->act_lock, irq_flags);
514
515 kthread = kthread_run(xpc_activating, (void *)((u64)partid), "xpc%02d",
516 partid);
517 if (IS_ERR(kthread)) {
518 spin_lock_irqsave(&part->act_lock, irq_flags);
519 part->act_state = XPC_P_INACTIVE;
520 XPC_SET_REASON(part, xpcCloneKThreadFailed, __LINE__);
521 spin_unlock_irqrestore(&part->act_lock, irq_flags);
522 }
523}
524
525/*
526 * Handle the receipt of a SGI_XPC_NOTIFY IRQ by seeing whether the specified
527 * partition actually sent it. Since SGI_XPC_NOTIFY IRQs may be shared by more
528 * than one partition, we use an AMO_t structure per partition to indicate
529 * whether a partition has sent an IPI or not. If it has, then wake up the
530 * associated kthread to handle it.
531 *
532 * All SGI_XPC_NOTIFY IRQs received by XPC are the result of IPIs sent by XPC
533 * running on other partitions.
534 *
535 * Noteworthy Arguments:
536 *
537 * irq - Interrupt ReQuest number. NOT USED.
538 *
539 * dev_id - partid of IPI's potential sender.
540 */
541irqreturn_t
542xpc_notify_IRQ_handler(int irq, void *dev_id)
543{
544 partid_t partid = (partid_t) (u64)dev_id;
545 struct xpc_partition *part = &xpc_partitions[partid];
546
547 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
548
549 if (xpc_part_ref(part)) {
550 xpc_check_for_channel_activity(part);
551
552 xpc_part_deref(part);
553 }
554 return IRQ_HANDLED;
555}
556
557/*
558 * Check to see if xpc_notify_IRQ_handler() dropped any IPIs on the floor
559 * because the write to their associated IPI amo completed after the IRQ/IPI
560 * was received.
561 */
562void
563xpc_dropped_IPI_check(struct xpc_partition *part)
564{
565 if (xpc_part_ref(part)) {
566 xpc_check_for_channel_activity(part);
567
568 part->dropped_IPI_timer.expires = jiffies +
569 XPC_P_DROPPED_IPI_WAIT;
570 add_timer(&part->dropped_IPI_timer);
571 xpc_part_deref(part);
572 }
573}
574
575void
576xpc_activate_kthreads(struct xpc_channel *ch, int needed)
577{
578 int idle = atomic_read(&ch->kthreads_idle);
579 int assigned = atomic_read(&ch->kthreads_assigned);
580 int wakeup;
581
582 DBUG_ON(needed <= 0);
583
584 if (idle > 0) {
585 wakeup = (needed > idle) ? idle : needed;
586 needed -= wakeup;
587
588 dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, "
589 "channel=%d\n", wakeup, ch->partid, ch->number);
590
591 /* only wakeup the requested number of kthreads */
592 wake_up_nr(&ch->idle_wq, wakeup);
593 }
594
595 if (needed <= 0)
596 return;
597
598 if (needed + assigned > ch->kthreads_assigned_limit) {
599 needed = ch->kthreads_assigned_limit - assigned;
600 if (needed <= 0)
601 return;
602 }
603
604 dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n",
605 needed, ch->partid, ch->number);
606
607 xpc_create_kthreads(ch, needed, 0);
608}
609
610/*
611 * This function is where XPC's kthreads wait for messages to deliver.
612 */
613static void
614xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
615{
616 do {
617 /* deliver messages to their intended recipients */
618
619 while (ch->w_local_GP.get < ch->w_remote_GP.put &&
620 !(ch->flags & XPC_C_DISCONNECTING)) {
621 xpc_deliver_msg(ch);
622 }
623
624 if (atomic_inc_return(&ch->kthreads_idle) >
625 ch->kthreads_idle_limit) {
626 /* too many idle kthreads on this channel */
627 atomic_dec(&ch->kthreads_idle);
628 break;
629 }
630
631 dev_dbg(xpc_chan, "idle kthread calling "
632 "wait_event_interruptible_exclusive()\n");
633
634 (void)wait_event_interruptible_exclusive(ch->idle_wq,
635 (ch->w_local_GP.get < ch->w_remote_GP.put ||
636 (ch->flags & XPC_C_DISCONNECTING)));
637
638 atomic_dec(&ch->kthreads_idle);
639
640 } while (!(ch->flags & XPC_C_DISCONNECTING));
641}
642
643static int
644xpc_kthread_start(void *args)
645{
646 partid_t partid = XPC_UNPACK_ARG1(args);
647 u16 ch_number = XPC_UNPACK_ARG2(args);
648 struct xpc_partition *part = &xpc_partitions[partid];
649 struct xpc_channel *ch;
650 int n_needed;
651 unsigned long irq_flags;
652
653 dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n",
654 partid, ch_number);
655
656 ch = &part->channels[ch_number];
657
658 if (!(ch->flags & XPC_C_DISCONNECTING)) {
659
660 /* let registerer know that connection has been established */
661
662 spin_lock_irqsave(&ch->lock, irq_flags);
663 if (!(ch->flags & XPC_C_CONNECTEDCALLOUT)) {
664 ch->flags |= XPC_C_CONNECTEDCALLOUT;
665 spin_unlock_irqrestore(&ch->lock, irq_flags);
666
667 xpc_connected_callout(ch);
668
669 spin_lock_irqsave(&ch->lock, irq_flags);
670 ch->flags |= XPC_C_CONNECTEDCALLOUT_MADE;
671 spin_unlock_irqrestore(&ch->lock, irq_flags);
672
673 /*
674 * It is possible that while the callout was being
675 * made that the remote partition sent some messages.
676 * If that is the case, we may need to activate
677 * additional kthreads to help deliver them. We only
678 * need one less than total #of messages to deliver.
679 */
680 n_needed = ch->w_remote_GP.put - ch->w_local_GP.get - 1;
681 if (n_needed > 0 && !(ch->flags & XPC_C_DISCONNECTING))
682 xpc_activate_kthreads(ch, n_needed);
683
684 } else {
685 spin_unlock_irqrestore(&ch->lock, irq_flags);
686 }
687
688 xpc_kthread_waitmsgs(part, ch);
689 }
690
691 /* let registerer know that connection is disconnecting */
692
693 spin_lock_irqsave(&ch->lock, irq_flags);
694 if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
695 !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
696 ch->flags |= XPC_C_DISCONNECTINGCALLOUT;
697 spin_unlock_irqrestore(&ch->lock, irq_flags);
698
699 xpc_disconnect_callout(ch, xpcDisconnecting);
700
701 spin_lock_irqsave(&ch->lock, irq_flags);
702 ch->flags |= XPC_C_DISCONNECTINGCALLOUT_MADE;
703 }
704 spin_unlock_irqrestore(&ch->lock, irq_flags);
705
706 if (atomic_dec_return(&ch->kthreads_assigned) == 0) {
707 if (atomic_dec_return(&part->nchannels_engaged) == 0) {
708 xpc_mark_partition_disengaged(part);
709 xpc_IPI_send_disengage(part);
710 }
711 }
712
713 xpc_msgqueue_deref(ch);
714
715 dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n",
716 partid, ch_number);
717
718 xpc_part_deref(part);
719 return 0;
720}
721
722/*
723 * For each partition that XPC has established communications with, there is
724 * a minimum of one kernel thread assigned to perform any operation that
725 * may potentially sleep or block (basically the callouts to the asynchronous
726 * functions registered via xpc_connect()).
727 *
728 * Additional kthreads are created and destroyed by XPC as the workload
729 * demands.
730 *
731 * A kthread is assigned to one of the active channels that exists for a given
732 * partition.
733 */
734void
735xpc_create_kthreads(struct xpc_channel *ch, int needed,
736 int ignore_disconnecting)
737{
738 unsigned long irq_flags;
739 u64 args = XPC_PACK_ARGS(ch->partid, ch->number);
740 struct xpc_partition *part = &xpc_partitions[ch->partid];
741 struct task_struct *kthread;
742
743 while (needed-- > 0) {
744
745 /*
746 * The following is done on behalf of the newly created
747 * kthread. That kthread is responsible for doing the
748 * counterpart to the following before it exits.
749 */
750 if (ignore_disconnecting) {
751 if (!atomic_inc_not_zero(&ch->kthreads_assigned)) {
752 /* kthreads assigned had gone to zero */
753 BUG_ON(!(ch->flags &
754 XPC_C_DISCONNECTINGCALLOUT_MADE));
755 break;
756 }
757
758 } else if (ch->flags & XPC_C_DISCONNECTING) {
759 break;
760
761 } else if (atomic_inc_return(&ch->kthreads_assigned) == 1) {
762 if (atomic_inc_return(&part->nchannels_engaged) == 1)
763 xpc_mark_partition_engaged(part);
764 }
765 (void)xpc_part_ref(part);
766 xpc_msgqueue_ref(ch);
767
768 kthread = kthread_run(xpc_kthread_start, (void *)args,
769 "xpc%02dc%d", ch->partid, ch->number);
770 if (IS_ERR(kthread)) {
771 /* the fork failed */
772
773 /*
774 * NOTE: if (ignore_disconnecting &&
775 * !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) is true,
776 * then we'll deadlock if all other kthreads assigned
777 * to this channel are blocked in the channel's
778 * registerer, because the only thing that will unblock
779 * them is the xpcDisconnecting callout that this
780 * failed kthread_run() would have made.
781 */
782
783 if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
784 atomic_dec_return(&part->nchannels_engaged) == 0) {
785 xpc_mark_partition_disengaged(part);
786 xpc_IPI_send_disengage(part);
787 }
788 xpc_msgqueue_deref(ch);
789 xpc_part_deref(part);
790
791 if (atomic_read(&ch->kthreads_assigned) <
792 ch->kthreads_idle_limit) {
793 /*
794 * Flag this as an error only if we have an
795 * insufficient #of kthreads for the channel
796 * to function.
797 */
798 spin_lock_irqsave(&ch->lock, irq_flags);
799 XPC_DISCONNECT_CHANNEL(ch, xpcLackOfResources,
800 &irq_flags);
801 spin_unlock_irqrestore(&ch->lock, irq_flags);
802 }
803 break;
804 }
805 }
806}
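
The partid and channel number reach xpc_kthread_start() packed into the single pointer-sized argument that kthread_run() passes along (the XPC_PACK_ARGS() call above). The packing macros are defined in xpc.h and are not part of this hunk; the following is only a sketch of the idea, assuming both values fit in 32 bits each, and may differ in detail from the real definitions.

/*
 * Sketch (assumption): squeeze two 32-bit values into one u64 so they can
 * travel through kthread_run()'s single void * argument, then unpack them
 * again inside the new kthread.
 */
#define XPC_PACK_ARGS(_arg1, _arg2) \
		((((u64)(_arg1)) & 0xffffffff) | \
		 ((((u64)(_arg2)) & 0xffffffff) << 32))

#define XPC_UNPACK_ARG1(_args)	(((u64)(_args)) & 0xffffffff)
#define XPC_UNPACK_ARG2(_args)	((((u64)(_args)) >> 32) & 0xffffffff)
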
807
808void
809xpc_disconnect_wait(int ch_number)
810{
811 unsigned long irq_flags;
812 partid_t partid;
813 struct xpc_partition *part;
814 struct xpc_channel *ch;
815 int wakeup_channel_mgr;
816
817 /* now wait for all callouts to the caller's function to cease */
818 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
819 part = &xpc_partitions[partid];
820
821 if (!xpc_part_ref(part))
822 continue;
823
824 ch = &part->channels[ch_number];
825
826 if (!(ch->flags & XPC_C_WDISCONNECT)) {
827 xpc_part_deref(part);
828 continue;
829 }
830
831 wait_for_completion(&ch->wdisconnect_wait);
832
833 spin_lock_irqsave(&ch->lock, irq_flags);
834 DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
835 wakeup_channel_mgr = 0;
836
837 if (ch->delayed_IPI_flags) {
838 if (part->act_state != XPC_P_DEACTIVATING) {
839 spin_lock(&part->IPI_lock);
840 XPC_SET_IPI_FLAGS(part->local_IPI_amo,
841 ch->number,
842 ch->delayed_IPI_flags);
843 spin_unlock(&part->IPI_lock);
844 wakeup_channel_mgr = 1;
845 }
846 ch->delayed_IPI_flags = 0;
847 }
848
849 ch->flags &= ~XPC_C_WDISCONNECT;
850 spin_unlock_irqrestore(&ch->lock, irq_flags);
851
852 if (wakeup_channel_mgr)
853 xpc_wakeup_channel_mgr(part);
854
855 xpc_part_deref(part);
856 }
857}
858
859static void
860xpc_do_exit(enum xpc_retval reason)
861{
862 partid_t partid;
863 int active_part_count, printed_waiting_msg = 0;
864 struct xpc_partition *part;
865 unsigned long printmsg_time, disengage_request_timeout = 0;
866
867 /* a 'rmmod XPC' and a 'reboot' cannot both end up here together */
868 DBUG_ON(xpc_exiting == 1);
869
870 /*
871 * Let the heartbeat checker thread and the discovery thread
872 * (if one is running) know that they should exit. Also wake up
873 * the heartbeat checker thread in case it's sleeping.
874 */
875 xpc_exiting = 1;
876 wake_up_interruptible(&xpc_act_IRQ_wq);
877
878 /* ignore all incoming interrupts */
879 free_irq(SGI_XPC_ACTIVATE, NULL);
880
881 /* wait for the discovery thread to exit */
882 wait_for_completion(&xpc_discovery_exited);
883
884 /* wait for the heartbeat checker thread to exit */
885 wait_for_completion(&xpc_hb_checker_exited);
886
887 /* sleep for 1/3 of a second or so */
888 (void)msleep_interruptible(300);
889
890 /* wait for all partitions to become inactive */
891
892 printmsg_time = jiffies + (XPC_DISENGAGE_PRINTMSG_INTERVAL * HZ);
893 xpc_disengage_request_timedout = 0;
894
895 do {
896 active_part_count = 0;
897
898 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
899 part = &xpc_partitions[partid];
900
901 if (xpc_partition_disengaged(part) &&
902 part->act_state == XPC_P_INACTIVE) {
903 continue;
904 }
905
906 active_part_count++;
907
908 XPC_DEACTIVATE_PARTITION(part, reason);
909
910 if (part->disengage_request_timeout >
911 disengage_request_timeout) {
912 disengage_request_timeout =
913 part->disengage_request_timeout;
914 }
915 }
916
917 if (xpc_partition_engaged(-1UL)) {
918 if (time_after(jiffies, printmsg_time)) {
919 dev_info(xpc_part, "waiting for remote "
920 "partitions to disengage, timeout in "
921 "%ld seconds\n",
922 (disengage_request_timeout - jiffies)
923 / HZ);
924 printmsg_time = jiffies +
925 (XPC_DISENGAGE_PRINTMSG_INTERVAL * HZ);
926 printed_waiting_msg = 1;
927 }
928
929 } else if (active_part_count > 0) {
930 if (printed_waiting_msg) {
931 dev_info(xpc_part, "waiting for local partition"
932 " to disengage\n");
933 printed_waiting_msg = 0;
934 }
935
936 } else {
937 if (!xpc_disengage_request_timedout) {
938 dev_info(xpc_part, "all partitions have "
939 "disengaged\n");
940 }
941 break;
942 }
943
944 /* sleep for 1/3 of a second or so */
945 (void)msleep_interruptible(300);
946
947 } while (1);
948
949 DBUG_ON(xpc_partition_engaged(-1UL));
950
951 /* indicate to others that our reserved page is uninitialized */
952 xpc_rsvd_page->vars_pa = 0;
953
954 /* now it's time to eliminate our heartbeat */
955 del_timer_sync(&xpc_hb_timer);
956 DBUG_ON(xpc_vars->heartbeating_to_mask != 0);
957
958 if (reason == xpcUnloading) {
959 /* take ourselves off of the reboot_notifier_list */
960 (void)unregister_reboot_notifier(&xpc_reboot_notifier);
961
962 /* take ourselves off of the die_notifier list */
963 (void)unregister_die_notifier(&xpc_die_notifier);
964 }
965
966 /* close down protections for IPI operations */
967 xpc_restrict_IPI_ops();
968
969 /* clear the interface to XPC's functions */
970 xpc_clear_interface();
971
972 if (xpc_sysctl)
973 unregister_sysctl_table(xpc_sysctl);
974
975 kfree(xpc_remote_copy_buffer_base);
976}
977
978/*
979 * This function is called when the system is being rebooted.
980 */
981static int
982xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
983{
984 enum xpc_retval reason;
985
986 switch (event) {
987 case SYS_RESTART:
988 reason = xpcSystemReboot;
989 break;
990 case SYS_HALT:
991 reason = xpcSystemHalt;
992 break;
993 case SYS_POWER_OFF:
994 reason = xpcSystemPoweroff;
995 break;
996 default:
997 reason = xpcSystemGoingDown;
998 }
999
1000 xpc_do_exit(reason);
1001 return NOTIFY_DONE;
1002}
1003
1004/*
1005 * Notify other partitions to disengage from all references to our memory.
1006 */
1007static void
1008xpc_die_disengage(void)
1009{
1010 struct xpc_partition *part;
1011 partid_t partid;
1012 unsigned long engaged;
1013 long time, printmsg_time, disengage_request_timeout;
1014
1015 /* keep xpc_hb_checker thread from doing anything (just in case) */
1016 xpc_exiting = 1;
1017
1018 xpc_vars->heartbeating_to_mask = 0; /* indicate we're deactivated */
1019
1020 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
1021 part = &xpc_partitions[partid];
1022
1023 if (!XPC_SUPPORTS_DISENGAGE_REQUEST(part->
1024 remote_vars_version)) {
1025
1026 /* just in case it was left set by an earlier XPC */
1027 xpc_clear_partition_engaged(1UL << partid);
1028 continue;
1029 }
1030
1031 if (xpc_partition_engaged(1UL << partid) ||
1032 part->act_state != XPC_P_INACTIVE) {
1033 xpc_request_partition_disengage(part);
1034 xpc_mark_partition_disengaged(part);
1035 xpc_IPI_send_disengage(part);
1036 }
1037 }
1038
1039 time = rtc_time();
1040 printmsg_time = time +
1041 (XPC_DISENGAGE_PRINTMSG_INTERVAL * sn_rtc_cycles_per_second);
1042 disengage_request_timeout = time +
1043 (xpc_disengage_request_timelimit * sn_rtc_cycles_per_second);
1044
1045 /* wait for all other partitions to disengage from us */
1046
1047 while (1) {
1048 engaged = xpc_partition_engaged(-1UL);
1049 if (!engaged) {
1050 dev_info(xpc_part, "all partitions have disengaged\n");
1051 break;
1052 }
1053
1054 time = rtc_time();
1055 if (time >= disengage_request_timeout) {
1056 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
1057 if (engaged & (1UL << partid)) {
1058 dev_info(xpc_part, "disengage from "
1059 "remote partition %d timed "
1060 "out\n", partid);
1061 }
1062 }
1063 break;
1064 }
1065
1066 if (time >= printmsg_time) {
1067 dev_info(xpc_part, "waiting for remote partitions to "
1068 "disengage, timeout in %ld seconds\n",
1069 (disengage_request_timeout - time) /
1070 sn_rtc_cycles_per_second);
1071 printmsg_time = time +
1072 (XPC_DISENGAGE_PRINTMSG_INTERVAL *
1073 sn_rtc_cycles_per_second);
1074 }
1075 }
1076}
1077
1078/*
1079 * This function is called when the system is being restarted or halted due
1080 * to some sort of system failure. If this is the case we need to notify the
1081 * other partitions to disengage from all references to our memory.
1082 * This function can also be called when our heartbeater could be offlined
1083 * for a time. In this case we need to notify other partitions to not worry
1084 * about the lack of a heartbeat.
1085 */
1086static int
1087xpc_system_die(struct notifier_block *nb, unsigned long event, void *unused)
1088{
1089 switch (event) {
1090 case DIE_MACHINE_RESTART:
1091 case DIE_MACHINE_HALT:
1092 xpc_die_disengage();
1093 break;
1094
1095 case DIE_KDEBUG_ENTER:
1096 /* Should lack of heartbeat be ignored by other partitions? */
1097 if (!xpc_kdebug_ignore)
1098 break;
1099
1100 /* fall through */
1101 case DIE_MCA_MONARCH_ENTER:
1102 case DIE_INIT_MONARCH_ENTER:
1103 xpc_vars->heartbeat++;
1104 xpc_vars->heartbeat_offline = 1;
1105 break;
1106
1107 case DIE_KDEBUG_LEAVE:
1108 /* Is lack of heartbeat being ignored by other partitions? */
1109 if (!xpc_kdebug_ignore)
1110 break;
1111
1112 /* fall through */
1113 case DIE_MCA_MONARCH_LEAVE:
1114 case DIE_INIT_MONARCH_LEAVE:
1115 xpc_vars->heartbeat++;
1116 xpc_vars->heartbeat_offline = 0;
1117 break;
1118 }
1119
1120 return NOTIFY_DONE;
1121}
1122
1123int __init
1124xpc_init(void)
1125{
1126 int ret;
1127 partid_t partid;
1128 struct xpc_partition *part;
1129 struct task_struct *kthread;
1130 size_t buf_size;
1131
1132 if (!ia64_platform_is("sn2"))
1133 return -ENODEV;
1134
1135 buf_size = max(XPC_RP_VARS_SIZE,
1136 XPC_RP_HEADER_SIZE + XP_NASID_MASK_BYTES);
1137 xpc_remote_copy_buffer = xpc_kmalloc_cacheline_aligned(buf_size,
1138 GFP_KERNEL,
1139 &xpc_remote_copy_buffer_base);
1140 if (xpc_remote_copy_buffer == NULL)
1141 return -ENOMEM;
1142
1143 snprintf(xpc_part->bus_id, BUS_ID_SIZE, "part");
1144 snprintf(xpc_chan->bus_id, BUS_ID_SIZE, "chan");
1145
1146 xpc_sysctl = register_sysctl_table(xpc_sys_dir);
1147
1148 /*
1149 * The first few fields of each entry of xpc_partitions[] need to
1150 * be initialized now so that calls to xpc_connect() and
1151 * xpc_disconnect() can be made prior to the activation of any remote
1152 * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
1153 * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
1154 * PARTITION HAS BEEN ACTIVATED.
1155 */
1156 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
1157 part = &xpc_partitions[partid];
1158
1159 DBUG_ON((u64)part != L1_CACHE_ALIGN((u64)part));
1160
1161 part->act_IRQ_rcvd = 0;
1162 spin_lock_init(&part->act_lock);
1163 part->act_state = XPC_P_INACTIVE;
1164 XPC_SET_REASON(part, 0, 0);
1165
1166 init_timer(&part->disengage_request_timer);
1167 part->disengage_request_timer.function =
1168 xpc_timeout_partition_disengage_request;
1169 part->disengage_request_timer.data = (unsigned long)part;
1170
1171 part->setup_state = XPC_P_UNSET;
1172 init_waitqueue_head(&part->teardown_wq);
1173 atomic_set(&part->references, 0);
1174 }
1175
1176 /*
1177 * Open up protections for IPI operations (and AMO operations on
1178 * Shub 1.1 systems).
1179 */
1180 xpc_allow_IPI_ops();
1181
1182 /*
1183 * Interrupts being processed will increment this atomic variable and
1184 * awaken the heartbeat thread which will process the interrupts.
1185 */
1186 atomic_set(&xpc_act_IRQ_rcvd, 0);
1187
1188 /*
1189 * This is safe to do before the xpc_hb_checker thread has started
1190 * because the handler releases a wait queue. If an interrupt is
1191 * received before the thread is waiting, it will not go to sleep,
1192 * but rather immediately process the interrupt.
1193 */
1194 ret = request_irq(SGI_XPC_ACTIVATE, xpc_act_IRQ_handler, 0,
1195 "xpc hb", NULL);
1196 if (ret != 0) {
1197 dev_err(xpc_part, "can't register ACTIVATE IRQ handler, "
1198 "errno=%d\n", -ret);
1199
1200 xpc_restrict_IPI_ops();
1201
1202 if (xpc_sysctl)
1203 unregister_sysctl_table(xpc_sysctl);
1204
1205 kfree(xpc_remote_copy_buffer_base);
1206 return -EBUSY;
1207 }
1208
1209 /*
1210 * Fill the partition reserved page with the information needed by
1211 * other partitions to discover we are alive and establish initial
1212 * communications.
1213 */
1214 xpc_rsvd_page = xpc_rsvd_page_init();
1215 if (xpc_rsvd_page == NULL) {
1216 dev_err(xpc_part, "could not setup our reserved page\n");
1217
1218 free_irq(SGI_XPC_ACTIVATE, NULL);
1219 xpc_restrict_IPI_ops();
1220
1221 if (xpc_sysctl)
1222 unregister_sysctl_table(xpc_sysctl);
1223
1224 kfree(xpc_remote_copy_buffer_base);
1225 return -EBUSY;
1226 }
1227
1228 /* add ourselves to the reboot_notifier_list */
1229 ret = register_reboot_notifier(&xpc_reboot_notifier);
1230 if (ret != 0)
1231 dev_warn(xpc_part, "can't register reboot notifier\n");
1232
1233 /* add ourselves to the die_notifier list */
1234 ret = register_die_notifier(&xpc_die_notifier);
1235 if (ret != 0)
1236 dev_warn(xpc_part, "can't register die notifier\n");
1237
1238 init_timer(&xpc_hb_timer);
1239 xpc_hb_timer.function = xpc_hb_beater;
1240
1241 /*
1242 * The real work-horse behind xpc. This processes incoming
1243 * interrupts and monitors remote heartbeats.
1244 */
1245 kthread = kthread_run(xpc_hb_checker, NULL, XPC_HB_CHECK_THREAD_NAME);
1246 if (IS_ERR(kthread)) {
1247 dev_err(xpc_part, "failed while forking hb check thread\n");
1248
1249 /* indicate to others that our reserved page is uninitialized */
1250 xpc_rsvd_page->vars_pa = 0;
1251
1252 /* take ourselves off of the reboot_notifier_list */
1253 (void)unregister_reboot_notifier(&xpc_reboot_notifier);
1254
1255 /* take ourselves off of the die_notifier list */
1256 (void)unregister_die_notifier(&xpc_die_notifier);
1257
1258 del_timer_sync(&xpc_hb_timer);
1259 free_irq(SGI_XPC_ACTIVATE, NULL);
1260 xpc_restrict_IPI_ops();
1261
1262 if (xpc_sysctl)
1263 unregister_sysctl_table(xpc_sysctl);
1264
1265 kfree(xpc_remote_copy_buffer_base);
1266 return -EBUSY;
1267 }
1268
1269 /*
1270 * Start up a thread that will attempt to discover other partitions to
1271 * activate based on info provided by SAL. This new thread is short-
1272 * lived and will exit once discovery is complete.
1273 */
1274 kthread = kthread_run(xpc_initiate_discovery, NULL,
1275 XPC_DISCOVERY_THREAD_NAME);
1276 if (IS_ERR(kthread)) {
1277 dev_err(xpc_part, "failed while forking discovery thread\n");
1278
1279 /* mark this new thread as a non-starter */
1280 complete(&xpc_discovery_exited);
1281
1282 xpc_do_exit(xpcUnloading);
1283 return -EBUSY;
1284 }
1285
1286 /* set the interface to point at XPC's functions */
1287 xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
1288 xpc_initiate_allocate, xpc_initiate_send,
1289 xpc_initiate_send_notify, xpc_initiate_received,
1290 xpc_initiate_partid_to_nasids);
1291
1292 return 0;
1293}
1294
1295module_init(xpc_init);
1296
1297void __exit
1298xpc_exit(void)
1299{
1300 xpc_do_exit(xpcUnloading);
1301}
1302
1303module_exit(xpc_exit);
1304
1305MODULE_AUTHOR("Silicon Graphics, Inc.");
1306MODULE_DESCRIPTION("Cross Partition Communication (XPC) support");
1307MODULE_LICENSE("GPL");
1308
1309module_param(xpc_hb_interval, int, 0);
1310MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between "
1311 "heartbeat increments.");
1312
1313module_param(xpc_hb_check_interval, int, 0);
1314MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
1315 "heartbeat checks.");
1316
1317module_param(xpc_disengage_request_timelimit, int, 0);
1318MODULE_PARM_DESC(xpc_disengage_request_timelimit, "Number of seconds to wait "
1319 "for disengage request to complete.");
1320
1321module_param(xpc_kdebug_ignore, int, 0);
1322MODULE_PARM_DESC(xpc_kdebug_ignore, "Should lack of heartbeat be ignored by "
1323 "other partitions when dropping into kdebug.");
diff --git a/drivers/misc/sgi-xp/xpc_partition.c b/drivers/misc/sgi-xp/xpc_partition.c
new file mode 100644
index 000000000000..27e200ec5826
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_partition.c
@@ -0,0 +1,1174 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9/*
10 * Cross Partition Communication (XPC) partition support.
11 *
12 * This is the part of XPC that detects the presence/absence of
13 * other partitions. It provides a heartbeat and monitors the
14 * heartbeats of other partitions.
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/sysctl.h>
20#include <linux/cache.h>
21#include <linux/mmzone.h>
22#include <linux/nodemask.h>
23#include <asm/uncached.h>
24#include <asm/sn/bte.h>
25#include <asm/sn/intr.h>
26#include <asm/sn/sn_sal.h>
27#include <asm/sn/nodepda.h>
28#include <asm/sn/addrs.h>
29#include "xpc.h"
30
31/* XPC is exiting flag */
32int xpc_exiting;
33
34/* SH_IPI_ACCESS shub register value on startup */
35static u64 xpc_sh1_IPI_access;
36static u64 xpc_sh2_IPI_access0;
37static u64 xpc_sh2_IPI_access1;
38static u64 xpc_sh2_IPI_access2;
39static u64 xpc_sh2_IPI_access3;
40
41/* original protection values for each node */
42u64 xpc_prot_vec[MAX_NUMNODES];
43
44/* this partition's reserved page pointers */
45struct xpc_rsvd_page *xpc_rsvd_page;
46static u64 *xpc_part_nasids;
47static u64 *xpc_mach_nasids;
48struct xpc_vars *xpc_vars;
49struct xpc_vars_part *xpc_vars_part;
50
51static int xp_nasid_mask_bytes; /* actual size in bytes of nasid mask */
52static int xp_nasid_mask_words; /* actual size in words of nasid mask */
53
54/*
55 * For performance reasons, each entry of xpc_partitions[] is cacheline
56 * aligned. And xpc_partitions[] is padded with an additional entry at the
57 * end so that the last legitimate entry doesn't share its cacheline with
58 * another variable.
59 */
60struct xpc_partition xpc_partitions[XP_MAX_PARTITIONS + 1];
61
62/*
63 * Generic buffer used to store a local copy of portions of a remote
64 * partition's reserved page (either its header and part_nasids mask,
65 * or its vars).
66 */
67char *xpc_remote_copy_buffer;
68void *xpc_remote_copy_buffer_base;
69
70/*
71 * Guarantee that the kmalloc'd memory is cacheline aligned.
72 */
73void *
74xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
75{
76 /* see if kmalloc will give us cacheline aligned memory by default */
77 *base = kmalloc(size, flags);
78 if (*base == NULL)
79 return NULL;
80
81 if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
82 return *base;
83
84 kfree(*base);
85
86 /* nope, we'll have to do it ourselves */
87 *base = kmalloc(size + L1_CACHE_BYTES, flags);
88 if (*base == NULL)
89 return NULL;
90
91 return (void *)L1_CACHE_ALIGN((u64)*base);
92}
93
94/*
95 * Given a nasid, get the physical address of the partition's reserved page
96 * for that nasid. This function returns 0 on any error.
97 */
98static u64
99xpc_get_rsvd_page_pa(int nasid)
100{
101 bte_result_t bte_res;
102 s64 status;
103 u64 cookie = 0;
104 u64 rp_pa = nasid; /* seed with nasid */
105 u64 len = 0;
106 u64 buf = buf; /* self-assignment quiets a 'may be used uninitialized' warning */
107 u64 buf_len = 0;
108 void *buf_base = NULL;
109
110 while (1) {
111
112 status = sn_partition_reserved_page_pa(buf, &cookie, &rp_pa,
113 &len);
114
115 dev_dbg(xpc_part, "SAL returned with status=%li, cookie="
116 "0x%016lx, address=0x%016lx, len=0x%016lx\n",
117 status, cookie, rp_pa, len);
118
119 if (status != SALRET_MORE_PASSES)
120 break;
121
122 if (L1_CACHE_ALIGN(len) > buf_len) {
123 kfree(buf_base);
124 buf_len = L1_CACHE_ALIGN(len);
125 buf = (u64)xpc_kmalloc_cacheline_aligned(buf_len,
126 GFP_KERNEL,
127 &buf_base);
128 if (buf_base == NULL) {
129 dev_err(xpc_part, "unable to kmalloc "
130 "len=0x%016lx\n", buf_len);
131 status = SALRET_ERROR;
132 break;
133 }
134 }
135
136 bte_res = xp_bte_copy(rp_pa, buf, buf_len,
137 (BTE_NOTIFY | BTE_WACQUIRE), NULL);
138 if (bte_res != BTE_SUCCESS) {
139 dev_dbg(xpc_part, "xp_bte_copy failed %i\n", bte_res);
140 status = SALRET_ERROR;
141 break;
142 }
143 }
144
145 kfree(buf_base);
146
147 if (status != SALRET_OK)
148 rp_pa = 0;
149
150 dev_dbg(xpc_part, "reserved page at phys address 0x%016lx\n", rp_pa);
151 return rp_pa;
152}
153
154/*
155 * Fill the partition reserved page with the information needed by
156 * other partitions to discover we are alive and establish initial
157 * communications.
158 */
159struct xpc_rsvd_page *
160xpc_rsvd_page_init(void)
161{
162 struct xpc_rsvd_page *rp;
163 AMO_t *amos_page;
164 u64 rp_pa, nasid_array = 0;
165 int i, ret;
166
167 /* get the local reserved page's address */
168
169 preempt_disable();
170 rp_pa = xpc_get_rsvd_page_pa(cpuid_to_nasid(smp_processor_id()));
171 preempt_enable();
172 if (rp_pa == 0) {
173 dev_err(xpc_part, "SAL failed to locate the reserved page\n");
174 return NULL;
175 }
176 rp = (struct xpc_rsvd_page *)__va(rp_pa);
177
178 if (rp->partid != sn_partition_id) {
179 dev_err(xpc_part, "the reserved page's partid of %d should be "
180 "%d\n", rp->partid, sn_partition_id);
181 return NULL;
182 }
183
184 rp->version = XPC_RP_VERSION;
185
186 /* establish the actual sizes of the nasid masks */
187 if (rp->SAL_version == 1) {
188 /* SAL_version 1 didn't set the nasids_size field */
189 rp->nasids_size = 128;
190 }
191 xp_nasid_mask_bytes = rp->nasids_size;
192 xp_nasid_mask_words = xp_nasid_mask_bytes / 8;
193
194 /* setup the pointers to the various items in the reserved page */
195 xpc_part_nasids = XPC_RP_PART_NASIDS(rp);
196 xpc_mach_nasids = XPC_RP_MACH_NASIDS(rp);
197 xpc_vars = XPC_RP_VARS(rp);
198 xpc_vars_part = XPC_RP_VARS_PART(rp);
199
200 /*
201 * Before clearing xpc_vars, see if a page of AMOs had been previously
202 * allocated. If not we'll need to allocate one and set permissions
203 * so that cross-partition AMOs are allowed.
204 *
205 * The allocated AMO page needs MCA reporting to remain disabled after
206 * XPC has unloaded. To make this work, we keep a copy of the pointer
207 * to this page (i.e., amos_page) in the struct xpc_vars structure,
208 * which is pointed to by the reserved page, and re-use that saved copy
209 * on subsequent loads of XPC. This AMO page is never freed, and its
210 * memory protections are never restricted.
211 */
212 amos_page = xpc_vars->amos_page;
213 if (amos_page == NULL) {
214 amos_page = (AMO_t *)TO_AMO(uncached_alloc_page(0));
215 if (amos_page == NULL) {
216 dev_err(xpc_part, "can't allocate page of AMOs\n");
217 return NULL;
218 }
219
220 /*
221 * Open up AMO-R/W to cpu. This is done for Shub 1.1 systems
222 * when xpc_allow_IPI_ops() is called via xpc_hb_init().
223 */
224 if (!enable_shub_wars_1_1()) {
225 ret = sn_change_memprotect(ia64_tpa((u64)amos_page),
226 PAGE_SIZE,
227 SN_MEMPROT_ACCESS_CLASS_1,
228 &nasid_array);
229 if (ret != 0) {
230 dev_err(xpc_part, "can't change memory "
231 "protections\n");
232 uncached_free_page(__IA64_UNCACHED_OFFSET |
233 TO_PHYS((u64)amos_page));
234 return NULL;
235 }
236 }
237 } else if (!IS_AMO_ADDRESS((u64)amos_page)) {
238 /*
239 * EFI's XPBOOT can also set amos_page in the reserved page,
240 * but it happens to leave it as an uncached physical address
241 * and we need it to be an uncached virtual address, so we'll have to
242 * convert it.
243 */
244 if (!IS_AMO_PHYS_ADDRESS((u64)amos_page)) {
245 dev_err(xpc_part, "previously used amos_page address "
246 "is bad = 0x%p\n", (void *)amos_page);
247 return NULL;
248 }
249 amos_page = (AMO_t *)TO_AMO((u64)amos_page);
250 }
251
252 /* clear xpc_vars */
253 memset(xpc_vars, 0, sizeof(struct xpc_vars));
254
255 xpc_vars->version = XPC_V_VERSION;
256 xpc_vars->act_nasid = cpuid_to_nasid(0);
257 xpc_vars->act_phys_cpuid = cpu_physical_id(0);
258 xpc_vars->vars_part_pa = __pa(xpc_vars_part);
259 xpc_vars->amos_page_pa = ia64_tpa((u64)amos_page);
260 xpc_vars->amos_page = amos_page; /* save for next load of XPC */
261
262 /* clear xpc_vars_part */
263 memset((u64 *)xpc_vars_part, 0, sizeof(struct xpc_vars_part) *
264 XP_MAX_PARTITIONS);
265
266 /* initialize the activate IRQ related AMO variables */
267 for (i = 0; i < xp_nasid_mask_words; i++)
268 (void)xpc_IPI_init(XPC_ACTIVATE_IRQ_AMOS + i);
269
270 /* initialize the engaged remote partitions related AMO variables */
271 (void)xpc_IPI_init(XPC_ENGAGED_PARTITIONS_AMO);
272 (void)xpc_IPI_init(XPC_DISENGAGE_REQUEST_AMO);
273
274 /* timestamp of when reserved page was setup by XPC */
275 rp->stamp = CURRENT_TIME;
276
277 /*
278 * This signifies to the remote partition that our reserved
279 * page is initialized.
280 */
281 rp->vars_pa = __pa(xpc_vars);
282
283 return rp;
284}
285
286/*
287 * Change protections to allow IPI operations (and AMO operations on
288 * Shub 1.1 systems).
289 */
290void
291xpc_allow_IPI_ops(void)
292{
293 int node;
294 int nasid;
295
296 /* >>> Change SH_IPI_ACCESS code to use SAL call once it is available */
297
298 if (is_shub2()) {
299 xpc_sh2_IPI_access0 =
300 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS0));
301 xpc_sh2_IPI_access1 =
302 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS1));
303 xpc_sh2_IPI_access2 =
304 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS2));
305 xpc_sh2_IPI_access3 =
306 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS3));
307
308 for_each_online_node(node) {
309 nasid = cnodeid_to_nasid(node);
310 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS0),
311 -1UL);
312 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS1),
313 -1UL);
314 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS2),
315 -1UL);
316 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS3),
317 -1UL);
318 }
319
320 } else {
321 xpc_sh1_IPI_access =
322 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH1_IPI_ACCESS));
323
324 for_each_online_node(node) {
325 nasid = cnodeid_to_nasid(node);
326 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH1_IPI_ACCESS),
327 -1UL);
328
329 /*
330 * Since the BIST collides with memory operations on
331 * SHUB 1.1, sn_change_memprotect() cannot be used.
332 */
333 if (enable_shub_wars_1_1()) {
334 /* open up everything */
335 xpc_prot_vec[node] = (u64)HUB_L((u64 *)
336 GLOBAL_MMR_ADDR
337 (nasid,
338 SH1_MD_DQLP_MMR_DIR_PRIVEC0));
339 HUB_S((u64 *)
340 GLOBAL_MMR_ADDR(nasid,
341 SH1_MD_DQLP_MMR_DIR_PRIVEC0),
342 -1UL);
343 HUB_S((u64 *)
344 GLOBAL_MMR_ADDR(nasid,
345 SH1_MD_DQRP_MMR_DIR_PRIVEC0),
346 -1UL);
347 }
348 }
349 }
350}
351
352/*
353 * Restrict protections to disallow IPI operations (and AMO operations on
354 * Shub 1.1 systems).
355 */
356void
357xpc_restrict_IPI_ops(void)
358{
359 int node;
360 int nasid;
361
362 /* >>> Change SH_IPI_ACCESS code to use SAL call once it is available */
363
364 if (is_shub2()) {
365
366 for_each_online_node(node) {
367 nasid = cnodeid_to_nasid(node);
368 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS0),
369 xpc_sh2_IPI_access0);
370 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS1),
371 xpc_sh2_IPI_access1);
372 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS2),
373 xpc_sh2_IPI_access2);
374 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS3),
375 xpc_sh2_IPI_access3);
376 }
377
378 } else {
379
380 for_each_online_node(node) {
381 nasid = cnodeid_to_nasid(node);
382 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH1_IPI_ACCESS),
383 xpc_sh1_IPI_access);
384
385 if (enable_shub_wars_1_1()) {
386 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid,
387 SH1_MD_DQLP_MMR_DIR_PRIVEC0),
388 xpc_prot_vec[node]);
389 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid,
390 SH1_MD_DQRP_MMR_DIR_PRIVEC0),
391 xpc_prot_vec[node]);
392 }
393 }
394 }
395}
396
397/*
398 * At periodic intervals, scan through all active partitions and ensure
399 * their heartbeat is still active. If not, the partition is deactivated.
400 */
401void
402xpc_check_remote_hb(void)
403{
404 struct xpc_vars *remote_vars;
405 struct xpc_partition *part;
406 partid_t partid;
407 bte_result_t bres;
408
409 remote_vars = (struct xpc_vars *)xpc_remote_copy_buffer;
410
411 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
412
413 if (xpc_exiting)
414 break;
415
416 if (partid == sn_partition_id)
417 continue;
418
419 part = &xpc_partitions[partid];
420
421 if (part->act_state == XPC_P_INACTIVE ||
422 part->act_state == XPC_P_DEACTIVATING) {
423 continue;
424 }
425
426 /* pull the remote_hb cache line */
427 bres = xp_bte_copy(part->remote_vars_pa,
428 (u64)remote_vars,
429 XPC_RP_VARS_SIZE,
430 (BTE_NOTIFY | BTE_WACQUIRE), NULL);
431 if (bres != BTE_SUCCESS) {
432 XPC_DEACTIVATE_PARTITION(part,
433 xpc_map_bte_errors(bres));
434 continue;
435 }
436
437 dev_dbg(xpc_part, "partid = %d, heartbeat = %ld, last_heartbeat"
438 " = %ld, heartbeat_offline = %ld, HB_mask = 0x%lx\n",
439 partid, remote_vars->heartbeat, part->last_heartbeat,
440 remote_vars->heartbeat_offline,
441 remote_vars->heartbeating_to_mask);
442
443 if (((remote_vars->heartbeat == part->last_heartbeat) &&
444 (remote_vars->heartbeat_offline == 0)) ||
445 !xpc_hb_allowed(sn_partition_id, remote_vars)) {
446
447 XPC_DEACTIVATE_PARTITION(part, xpcNoHeartbeat);
448 continue;
449 }
450
451 part->last_heartbeat = remote_vars->heartbeat;
452 }
453}
454
455/*
456 * Get a copy of a portion of the remote partition's rsvd page.
457 *
458 * remote_rp points to a buffer that is cacheline aligned for BTE copies and
459 * is large enough to contain a copy of their reserved page header and
460 * part_nasids mask.
461 */
462static enum xpc_retval
463xpc_get_remote_rp(int nasid, u64 *discovered_nasids,
464 struct xpc_rsvd_page *remote_rp, u64 *remote_rp_pa)
465{
466 int bres, i;
467
468 /* get the reserved page's physical address */
469
470 *remote_rp_pa = xpc_get_rsvd_page_pa(nasid);
471 if (*remote_rp_pa == 0)
472 return xpcNoRsvdPageAddr;
473
474 /* pull over the reserved page header and part_nasids mask */
475 bres = xp_bte_copy(*remote_rp_pa, (u64)remote_rp,
476 XPC_RP_HEADER_SIZE + xp_nasid_mask_bytes,
477 (BTE_NOTIFY | BTE_WACQUIRE), NULL);
478 if (bres != BTE_SUCCESS)
479 return xpc_map_bte_errors(bres);
480
481 if (discovered_nasids != NULL) {
482 u64 *remote_part_nasids = XPC_RP_PART_NASIDS(remote_rp);
483
484 for (i = 0; i < xp_nasid_mask_words; i++)
485 discovered_nasids[i] |= remote_part_nasids[i];
486 }
487
488 /* check that the partid is for another partition */
489
490 if (remote_rp->partid < 1 ||
491 remote_rp->partid > (XP_MAX_PARTITIONS - 1)) {
492 return xpcInvalidPartid;
493 }
494
495 if (remote_rp->partid == sn_partition_id)
496 return xpcLocalPartid;
497
498 if (XPC_VERSION_MAJOR(remote_rp->version) !=
499 XPC_VERSION_MAJOR(XPC_RP_VERSION)) {
500 return xpcBadVersion;
501 }
502
503 return xpcSuccess;
504}
505
506/*
507 * Get a copy of the remote partition's XPC variables from the reserved page.
508 *
509 * remote_vars points to a buffer that is cacheline aligned for BTE copies and
510 * is assumed to be of size XPC_RP_VARS_SIZE.
511 */
512static enum xpc_retval
513xpc_get_remote_vars(u64 remote_vars_pa, struct xpc_vars *remote_vars)
514{
515 int bres;
516
517 if (remote_vars_pa == 0)
518 return xpcVarsNotSet;
519
520 /* pull over the cross partition variables */
521 bres = xp_bte_copy(remote_vars_pa, (u64)remote_vars, XPC_RP_VARS_SIZE,
522 (BTE_NOTIFY | BTE_WACQUIRE), NULL);
523 if (bres != BTE_SUCCESS)
524 return xpc_map_bte_errors(bres);
525
526 if (XPC_VERSION_MAJOR(remote_vars->version) !=
527 XPC_VERSION_MAJOR(XPC_V_VERSION)) {
528 return xpcBadVersion;
529 }
530
531 return xpcSuccess;
532}
533
534/*
535 * Update the remote partition's info.
536 */
537static void
538xpc_update_partition_info(struct xpc_partition *part, u8 remote_rp_version,
539 struct timespec *remote_rp_stamp, u64 remote_rp_pa,
540 u64 remote_vars_pa, struct xpc_vars *remote_vars)
541{
542 part->remote_rp_version = remote_rp_version;
543 dev_dbg(xpc_part, " remote_rp_version = 0x%016x\n",
544 part->remote_rp_version);
545
546 part->remote_rp_stamp = *remote_rp_stamp;
547 dev_dbg(xpc_part, " remote_rp_stamp (tv_sec = 0x%lx tv_nsec = 0x%lx\n",
548 part->remote_rp_stamp.tv_sec, part->remote_rp_stamp.tv_nsec);
549
550 part->remote_rp_pa = remote_rp_pa;
551 dev_dbg(xpc_part, " remote_rp_pa = 0x%016lx\n", part->remote_rp_pa);
552
553 part->remote_vars_pa = remote_vars_pa;
554 dev_dbg(xpc_part, " remote_vars_pa = 0x%016lx\n",
555 part->remote_vars_pa);
556
557 part->last_heartbeat = remote_vars->heartbeat;
558 dev_dbg(xpc_part, " last_heartbeat = 0x%016lx\n",
559 part->last_heartbeat);
560
561 part->remote_vars_part_pa = remote_vars->vars_part_pa;
562 dev_dbg(xpc_part, " remote_vars_part_pa = 0x%016lx\n",
563 part->remote_vars_part_pa);
564
565 part->remote_act_nasid = remote_vars->act_nasid;
566 dev_dbg(xpc_part, " remote_act_nasid = 0x%x\n",
567 part->remote_act_nasid);
568
569 part->remote_act_phys_cpuid = remote_vars->act_phys_cpuid;
570 dev_dbg(xpc_part, " remote_act_phys_cpuid = 0x%x\n",
571 part->remote_act_phys_cpuid);
572
573 part->remote_amos_page_pa = remote_vars->amos_page_pa;
574 dev_dbg(xpc_part, " remote_amos_page_pa = 0x%lx\n",
575 part->remote_amos_page_pa);
576
577 part->remote_vars_version = remote_vars->version;
578 dev_dbg(xpc_part, " remote_vars_version = 0x%x\n",
579 part->remote_vars_version);
580}
581
582/*
583 * Prior code has determined the nasid which generated an IPI. Inspect
584 * that nasid to determine if its partition needs to be activated or
585 * deactivated.
586 *
587 * A partition is considered "awaiting activation" if our partition
588 * flags indicate it is not active and it has a heartbeat. A
589 * partition is considered "awaiting deactivation" if our partition
590 * flags indicate it is active but it has no heartbeat or it is not
591 * sending its heartbeat to us.
592 *
593 * To determine the heartbeat, the remote nasid must have a properly
594 * initialized reserved page.
595 */
596static void
597xpc_identify_act_IRQ_req(int nasid)
598{
599 struct xpc_rsvd_page *remote_rp;
600 struct xpc_vars *remote_vars;
601 u64 remote_rp_pa;
602 u64 remote_vars_pa;
603 int remote_rp_version;
604 int reactivate = 0;
605 int stamp_diff;
606 struct timespec remote_rp_stamp = { 0, 0 };
607 partid_t partid;
608 struct xpc_partition *part;
609 enum xpc_retval ret;
610
611 /* pull over the reserved page structure */
612
613 remote_rp = (struct xpc_rsvd_page *)xpc_remote_copy_buffer;
614
615 ret = xpc_get_remote_rp(nasid, NULL, remote_rp, &remote_rp_pa);
616 if (ret != xpcSuccess) {
617 dev_warn(xpc_part, "unable to get reserved page from nasid %d, "
618 "which sent interrupt, reason=%d\n", nasid, ret);
619 return;
620 }
621
622 remote_vars_pa = remote_rp->vars_pa;
623 remote_rp_version = remote_rp->version;
624 if (XPC_SUPPORTS_RP_STAMP(remote_rp_version))
625 remote_rp_stamp = remote_rp->stamp;
626
627 partid = remote_rp->partid;
628 part = &xpc_partitions[partid];
629
630 /* pull over the cross partition variables */
631
632 remote_vars = (struct xpc_vars *)xpc_remote_copy_buffer;
633
634 ret = xpc_get_remote_vars(remote_vars_pa, remote_vars);
635 if (ret != xpcSuccess) {
636
637 dev_warn(xpc_part, "unable to get XPC variables from nasid %d, "
638 "which sent interrupt, reason=%d\n", nasid, ret);
639
640 XPC_DEACTIVATE_PARTITION(part, ret);
641 return;
642 }
643
644 part->act_IRQ_rcvd++;
645
646 dev_dbg(xpc_part, "partid for nasid %d is %d; IRQs = %d; HB = "
647 "%ld:0x%lx\n", (int)nasid, (int)partid, part->act_IRQ_rcvd,
648 remote_vars->heartbeat, remote_vars->heartbeating_to_mask);
649
650 if (xpc_partition_disengaged(part) &&
651 part->act_state == XPC_P_INACTIVE) {
652
653 xpc_update_partition_info(part, remote_rp_version,
654 &remote_rp_stamp, remote_rp_pa,
655 remote_vars_pa, remote_vars);
656
657 if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) {
658 if (xpc_partition_disengage_requested(1UL << partid)) {
659 /*
660 * Other side is waiting on us to disengage,
661 * even though we already have.
662 */
663 return;
664 }
665 } else {
666 /* other side doesn't support disengage requests */
667 xpc_clear_partition_disengage_request(1UL << partid);
668 }
669
670 xpc_activate_partition(part);
671 return;
672 }
673
674 DBUG_ON(part->remote_rp_version == 0);
675 DBUG_ON(part->remote_vars_version == 0);
676
677 if (!XPC_SUPPORTS_RP_STAMP(part->remote_rp_version)) {
678 DBUG_ON(XPC_SUPPORTS_DISENGAGE_REQUEST(part->
679 remote_vars_version));
680
681 if (!XPC_SUPPORTS_RP_STAMP(remote_rp_version)) {
682 DBUG_ON(XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->
683 version));
684 /* see if the other side rebooted */
685 if (part->remote_amos_page_pa ==
686 remote_vars->amos_page_pa &&
687 xpc_hb_allowed(sn_partition_id, remote_vars)) {
688 /* doesn't look that way, so ignore the IPI */
689 return;
690 }
691 }
692
693 /*
694 * Other side rebooted and previous XPC didn't support the
695 * disengage request, so we don't need to do anything special.
696 */
697
698 xpc_update_partition_info(part, remote_rp_version,
699 &remote_rp_stamp, remote_rp_pa,
700 remote_vars_pa, remote_vars);
701 part->reactivate_nasid = nasid;
702 XPC_DEACTIVATE_PARTITION(part, xpcReactivating);
703 return;
704 }
705
706 DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version));
707
708 if (!XPC_SUPPORTS_RP_STAMP(remote_rp_version)) {
709 DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->version));
710
711 /*
712 * Other side rebooted and previous XPC did support the
713 * disengage request, but the new one doesn't.
714 */
715
716 xpc_clear_partition_engaged(1UL << partid);
717 xpc_clear_partition_disengage_request(1UL << partid);
718
719 xpc_update_partition_info(part, remote_rp_version,
720 &remote_rp_stamp, remote_rp_pa,
721 remote_vars_pa, remote_vars);
722 reactivate = 1;
723
724 } else {
725 DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->version));
726
727 stamp_diff = xpc_compare_stamps(&part->remote_rp_stamp,
728 &remote_rp_stamp);
729 if (stamp_diff != 0) {
730 DBUG_ON(stamp_diff >= 0);
731
732 /*
733 * Other side rebooted and the previous XPC did support
734 * the disengage request, as does the new one.
735 */
736
737 DBUG_ON(xpc_partition_engaged(1UL << partid));
738 DBUG_ON(xpc_partition_disengage_requested(1UL <<
739 partid));
740
741 xpc_update_partition_info(part, remote_rp_version,
742 &remote_rp_stamp,
743 remote_rp_pa, remote_vars_pa,
744 remote_vars);
745 reactivate = 1;
746 }
747 }
748
749 if (part->disengage_request_timeout > 0 &&
750 !xpc_partition_disengaged(part)) {
751 /* still waiting on other side to disengage from us */
752 return;
753 }
754
755 if (reactivate) {
756 part->reactivate_nasid = nasid;
757 XPC_DEACTIVATE_PARTITION(part, xpcReactivating);
758
759 } else if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version) &&
760 xpc_partition_disengage_requested(1UL << partid)) {
761 XPC_DEACTIVATE_PARTITION(part, xpcOtherGoingDown);
762 }
763}
764
765/*
766 * Loop through the activation AMO variables and process any bits
767 * which are set. Each bit indicates a nasid sending a partition
768 * activation or deactivation request.
769 *
770 * Return #of IRQs detected.
771 */
772int
773xpc_identify_act_IRQ_sender(void)
774{
775 int word, bit;
776 u64 nasid_mask;
777 u64 nasid; /* remote nasid */
778 int n_IRQs_detected = 0;
779 AMO_t *act_amos;
780
781 act_amos = xpc_vars->amos_page + XPC_ACTIVATE_IRQ_AMOS;
782
783 /* scan through act AMO variable looking for non-zero entries */
784 for (word = 0; word < xp_nasid_mask_words; word++) {
785
786 if (xpc_exiting)
787 break;
788
789 nasid_mask = xpc_IPI_receive(&act_amos[word]);
790 if (nasid_mask == 0) {
791 /* no IRQs from nasids in this variable */
792 continue;
793 }
794
795 dev_dbg(xpc_part, "AMO[%d] gave back 0x%lx\n", word,
796 nasid_mask);
797
798 /*
799 * If this nasid has been added to the machine since
800 * our partition was reset, this will retain the
801 * remote nasid in our reserved page's machine mask.
802 * This is used in the event of module reload.
803 */
804 xpc_mach_nasids[word] |= nasid_mask;
805
806 /* locate the nasid(s) which sent interrupts */
807
808 for (bit = 0; bit < (8 * sizeof(u64)); bit++) {
809 if (nasid_mask & (1UL << bit)) {
810 n_IRQs_detected++;
811 nasid = XPC_NASID_FROM_W_B(word, bit);
812 dev_dbg(xpc_part, "interrupt from nasid %ld\n",
813 nasid);
814 xpc_identify_act_IRQ_req(nasid);
815 }
816 }
817 }
818 return n_IRQs_detected;
819}
820
821/*
822 * See if the other side has responded to a partition disengage request
823 * from us.
824 */
825int
826xpc_partition_disengaged(struct xpc_partition *part)
827{
828 partid_t partid = XPC_PARTID(part);
829 int disengaged;
830
831 disengaged = (xpc_partition_engaged(1UL << partid) == 0);
832 if (part->disengage_request_timeout) {
833 if (!disengaged) {
834 if (time_before(jiffies,
835 part->disengage_request_timeout)) {
836 /* timelimit hasn't been reached yet */
837 return 0;
838 }
839
840 /*
841 * Other side hasn't responded to our disengage
842 * request in a timely fashion, so assume it's dead.
843 */
844
845 dev_info(xpc_part, "disengage from remote partition %d "
846 "timed out\n", partid);
847 xpc_disengage_request_timedout = 1;
848 xpc_clear_partition_engaged(1UL << partid);
849 disengaged = 1;
850 }
851 part->disengage_request_timeout = 0;
852
853 /* cancel the timer function, provided it's not us */
854 if (!in_interrupt()) {
855 del_singleshot_timer_sync(&part->
856 disengage_request_timer);
857 }
858
859 DBUG_ON(part->act_state != XPC_P_DEACTIVATING &&
860 part->act_state != XPC_P_INACTIVE);
861 if (part->act_state != XPC_P_INACTIVE)
862 xpc_wakeup_channel_mgr(part);
863
864 if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version))
865 xpc_cancel_partition_disengage_request(part);
866 }
867 return disengaged;
868}
869
870/*
871 * Mark specified partition as active.
872 */
873enum xpc_retval
874xpc_mark_partition_active(struct xpc_partition *part)
875{
876 unsigned long irq_flags;
877 enum xpc_retval ret;
878
879 dev_dbg(xpc_part, "setting partition %d to ACTIVE\n", XPC_PARTID(part));
880
881 spin_lock_irqsave(&part->act_lock, irq_flags);
882 if (part->act_state == XPC_P_ACTIVATING) {
883 part->act_state = XPC_P_ACTIVE;
884 ret = xpcSuccess;
885 } else {
886 DBUG_ON(part->reason == xpcSuccess);
887 ret = part->reason;
888 }
889 spin_unlock_irqrestore(&part->act_lock, irq_flags);
890
891 return ret;
892}
893
894/*
895 * Notify XPC that the partition is down.
896 */
897void
898xpc_deactivate_partition(const int line, struct xpc_partition *part,
899 enum xpc_retval reason)
900{
901 unsigned long irq_flags;
902
903 spin_lock_irqsave(&part->act_lock, irq_flags);
904
905 if (part->act_state == XPC_P_INACTIVE) {
906 XPC_SET_REASON(part, reason, line);
907 spin_unlock_irqrestore(&part->act_lock, irq_flags);
908 if (reason == xpcReactivating) {
909 /* we interrupt ourselves to reactivate partition */
910 xpc_IPI_send_reactivate(part);
911 }
912 return;
913 }
914 if (part->act_state == XPC_P_DEACTIVATING) {
915 if ((part->reason == xpcUnloading && reason != xpcUnloading) ||
916 reason == xpcReactivating) {
917 XPC_SET_REASON(part, reason, line);
918 }
919 spin_unlock_irqrestore(&part->act_lock, irq_flags);
920 return;
921 }
922
923 part->act_state = XPC_P_DEACTIVATING;
924 XPC_SET_REASON(part, reason, line);
925
926 spin_unlock_irqrestore(&part->act_lock, irq_flags);
927
928 if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) {
929 xpc_request_partition_disengage(part);
930 xpc_IPI_send_disengage(part);
931
932 /* set a timelimit on the disengage request */
933 part->disengage_request_timeout = jiffies +
934 (xpc_disengage_request_timelimit * HZ);
935 part->disengage_request_timer.expires =
936 part->disengage_request_timeout;
937 add_timer(&part->disengage_request_timer);
938 }
939
940 dev_dbg(xpc_part, "bringing partition %d down, reason = %d\n",
941 XPC_PARTID(part), reason);
942
943 xpc_partition_going_down(part, reason);
944}
945
946/*
947 * Mark specified partition as inactive.
948 */
949void
950xpc_mark_partition_inactive(struct xpc_partition *part)
951{
952 unsigned long irq_flags;
953
954 dev_dbg(xpc_part, "setting partition %d to INACTIVE\n",
955 XPC_PARTID(part));
956
957 spin_lock_irqsave(&part->act_lock, irq_flags);
958 part->act_state = XPC_P_INACTIVE;
959 spin_unlock_irqrestore(&part->act_lock, irq_flags);
960 part->remote_rp_pa = 0;
961}
962
963/*
964 * SAL has provided a partition and machine mask. The partition mask
965 * contains a bit for each even nasid in our partition. The machine
966 * mask contains a bit for each even nasid in the entire machine.
967 *
968 * Using those two bit arrays, we can determine which nasids are
969 * known in the machine. Each should also have a reserved page
970 * initialized if it is available for partitioning.
971 */
972void
973xpc_discovery(void)
974{
975 void *remote_rp_base;
976 struct xpc_rsvd_page *remote_rp;
977 struct xpc_vars *remote_vars;
978 u64 remote_rp_pa;
979 u64 remote_vars_pa;
980 int region;
981 int region_size;
982 int max_regions;
983 int nasid;
984 struct xpc_rsvd_page *rp;
985 partid_t partid;
986 struct xpc_partition *part;
987 u64 *discovered_nasids;
988 enum xpc_retval ret;
989
990 remote_rp = xpc_kmalloc_cacheline_aligned(XPC_RP_HEADER_SIZE +
991 xp_nasid_mask_bytes,
992 GFP_KERNEL, &remote_rp_base);
993 if (remote_rp == NULL)
994 return;
995
996 remote_vars = (struct xpc_vars *)remote_rp;
997
998 discovered_nasids = kzalloc(sizeof(u64) * xp_nasid_mask_words,
999 GFP_KERNEL);
1000 if (discovered_nasids == NULL) {
1001 kfree(remote_rp_base);
1002 return;
1003 }
1004
1005 rp = (struct xpc_rsvd_page *)xpc_rsvd_page;
1006
1007 /*
1008 * The term 'region' in this context refers to the minimum number of
1009 * nodes that can comprise an access protection grouping. The access
1010 * protection is with regard to memory, IOI, and IPI.
1011 */
1012 max_regions = 64;
1013 region_size = sn_region_size;
1014
1015 switch (region_size) {
1016 case 128:
1017 max_regions *= 2;
1018 case 64:
1019 max_regions *= 2;
1020 case 32:
1021 max_regions *= 2;
1022 region_size = 16;
1023 DBUG_ON(!is_shub2());
1024 }
1025
1026 for (region = 0; region < max_regions; region++) {
1027
1028 if (xpc_exiting)
1029 break;
1030
1031 dev_dbg(xpc_part, "searching region %d\n", region);
1032
1033 for (nasid = (region * region_size * 2);
1034 nasid < ((region + 1) * region_size * 2); nasid += 2) {
1035
1036 if (xpc_exiting)
1037 break;
1038
1039 dev_dbg(xpc_part, "checking nasid %d\n", nasid);
1040
1041 if (XPC_NASID_IN_ARRAY(nasid, xpc_part_nasids)) {
1042 dev_dbg(xpc_part, "PROM indicates Nasid %d is "
1043 "part of the local partition; skipping "
1044 "region\n", nasid);
1045 break;
1046 }
1047
1048 if (!(XPC_NASID_IN_ARRAY(nasid, xpc_mach_nasids))) {
1049 dev_dbg(xpc_part, "PROM indicates Nasid %d was "
1050 "not on Numa-Link network at reset\n",
1051 nasid);
1052 continue;
1053 }
1054
1055 if (XPC_NASID_IN_ARRAY(nasid, discovered_nasids)) {
1056 dev_dbg(xpc_part, "Nasid %d is part of a "
1057 "partition which was previously "
1058 "discovered\n", nasid);
1059 continue;
1060 }
1061
1062 /* pull over the reserved page structure */
1063
1064 ret = xpc_get_remote_rp(nasid, discovered_nasids,
1065 remote_rp, &remote_rp_pa);
1066 if (ret != xpcSuccess) {
1067 dev_dbg(xpc_part, "unable to get reserved page "
1068 "from nasid %d, reason=%d\n", nasid,
1069 ret);
1070
1071 if (ret == xpcLocalPartid)
1072 break;
1073
1074 continue;
1075 }
1076
1077 remote_vars_pa = remote_rp->vars_pa;
1078
1079 partid = remote_rp->partid;
1080 part = &xpc_partitions[partid];
1081
1082 /* pull over the cross partition variables */
1083
1084 ret = xpc_get_remote_vars(remote_vars_pa, remote_vars);
1085 if (ret != xpcSuccess) {
1086 dev_dbg(xpc_part, "unable to get XPC variables "
1087 "from nasid %d, reason=%d\n", nasid,
1088 ret);
1089
1090 XPC_DEACTIVATE_PARTITION(part, ret);
1091 continue;
1092 }
1093
1094 if (part->act_state != XPC_P_INACTIVE) {
1095 dev_dbg(xpc_part, "partition %d on nasid %d is "
1096 "already activating\n", partid, nasid);
1097 break;
1098 }
1099
1100 /*
1101 * Register the remote partition's AMOs with SAL so it
1102 * can handle and cleanup errors within that address
1103 * range should the remote partition go down. We don't
1104 * unregister this range because it is difficult to
1105 * tell when outstanding writes to the remote partition
1106 * are finished and thus when it is safe to
1107 * unregister. This should not result in wasted space
1108 * in the SAL xp_addr_region table because we should
1109 * get the same page for remote_act_amos_pa after
1110 * module reloads and system reboots.
1111 */
1112 if (sn_register_xp_addr_region
1113 (remote_vars->amos_page_pa, PAGE_SIZE, 1) < 0) {
1114 dev_dbg(xpc_part,
1115 "partition %d failed to "
1116 "register xp_addr region 0x%016lx\n",
1117 partid, remote_vars->amos_page_pa);
1118
1119 XPC_SET_REASON(part, xpcPhysAddrRegFailed,
1120 __LINE__);
1121 break;
1122 }
1123
1124 /*
1125 * The remote nasid is valid and available.
1126 * Send an interrupt to that nasid to notify
1127 * it that we are ready to begin activation.
1128 */
1129 dev_dbg(xpc_part, "sending an interrupt to AMO 0x%lx, "
1130 "nasid %d, phys_cpuid 0x%x\n",
1131 remote_vars->amos_page_pa,
1132 remote_vars->act_nasid,
1133 remote_vars->act_phys_cpuid);
1134
1135 if (XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->
1136 version)) {
1137 part->remote_amos_page_pa =
1138 remote_vars->amos_page_pa;
1139 xpc_mark_partition_disengaged(part);
1140 xpc_cancel_partition_disengage_request(part);
1141 }
1142 xpc_IPI_send_activate(remote_vars);
1143 }
1144 }
1145
1146 kfree(discovered_nasids);
1147 kfree(remote_rp_base);
1148}
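
The discovery loops above screen each candidate nasid against xpc_part_nasids, xpc_mach_nasids, and discovered_nasids via XPC_NASID_IN_ARRAY(). That macro is defined elsewhere in this patch (xpc.h) and is not shown in this hunk; what follows is only a minimal sketch of the bit layout it relies on, under the assumption of one mask bit per even nasid as described in the comment before xpc_discovery(). The helper name is illustrative.

/*
 * Sketch (assumption): with one mask bit per even nasid, nasid n maps to
 * bit (n / 2) of the u64 mask array.  XPC_NASID_IN_ARRAY() in xpc.h is
 * the authoritative test.
 */
static inline int xpc_nasid_in_mask_sketch(int nasid, u64 *mask)
{
	int bit = nasid / 2;

	return (mask[bit / 64] >> (bit % 64)) & 1;
}
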
1149
1150/*
1151 * Given a partid, get the nasids owned by that partition from the
1152 * remote partition's reserved page.
1153 */
1154enum xpc_retval
1155xpc_initiate_partid_to_nasids(partid_t partid, void *nasid_mask)
1156{
1157 struct xpc_partition *part;
1158 u64 part_nasid_pa;
1159 int bte_res;
1160
1161 part = &xpc_partitions[partid];
1162 if (part->remote_rp_pa == 0)
1163 return xpcPartitionDown;
1164
1165 memset(nasid_mask, 0, XP_NASID_MASK_BYTES);
1166
1167 part_nasid_pa = (u64)XPC_RP_PART_NASIDS(part->remote_rp_pa);
1168
1169 bte_res = xp_bte_copy(part_nasid_pa, (u64)nasid_mask,
1170 xp_nasid_mask_bytes, (BTE_NOTIFY | BTE_WACQUIRE),
1171 NULL);
1172
1173 return xpc_map_bte_errors(bte_res);
1174}
diff --git a/drivers/misc/sgi-xp/xpnet.c b/drivers/misc/sgi-xp/xpnet.c
new file mode 100644
index 000000000000..a9543c65814d
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpnet.c
@@ -0,0 +1,677 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (C) 1999-2008 Silicon Graphics, Inc. All rights reserved.
7 */
8
9/*
10 * Cross Partition Network Interface (XPNET) support
11 *
12 * XPNET provides a virtual network layered on top of the Cross
13 * Partition communication layer.
14 *
15 * XPNET provides direct point-to-point and broadcast-like support
16 * for an ethernet-like device. The ethernet broadcast medium is
17 * replaced with a point-to-point message structure which passes
18 * pointers to a DMA-capable block that a remote partition should
19 * retrieve and pass to the upper level networking layer.
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/types.h>
25#include <linux/kernel.h>
26#include <linux/init.h>
27#include <linux/ioport.h>
28#include <linux/netdevice.h>
29#include <linux/etherdevice.h>
30#include <linux/delay.h>
31#include <linux/ethtool.h>
32#include <linux/mii.h>
33#include <linux/smp.h>
34#include <linux/string.h>
35#include <asm/sn/bte.h>
36#include <asm/sn/io.h>
37#include <asm/sn/sn_sal.h>
38#include <asm/atomic.h>
39#include "xp.h"
40
41/*
42 * The message payload transferred by XPC.
43 *
44 * buf_pa is the physical address where the DMA should pull from.
45 *
46 * NOTE: for performance reasons, buf_pa should _ALWAYS_ begin on a
47 * cacheline boundary. To accomplish this, we record the number of
48 * bytes from the beginning of the first cacheline to the first useful
49 * byte of the skb (leadin_ignore) and the number of bytes from the
50 * last useful byte of the skb to the end of the last cacheline
51 * (tailout_ignore).
52 *
53 * size is the number of bytes to transfer which includes the skb->len
54 * (useful bytes of the sender's skb) plus the leadin and tailout ignore bytes.
55 */
56struct xpnet_message {
57 u16 version; /* Version for this message */
58 u16 embedded_bytes; /* #of bytes embedded in XPC message */
59 u32 magic; /* Special number indicating this is xpnet */
60 u64 buf_pa; /* phys address of buffer to retrieve */
61 u32 size; /* #of bytes in buffer */
62 u8 leadin_ignore; /* #of bytes to ignore at the beginning */
63 u8 tailout_ignore; /* #of bytes to ignore at the end */
64 unsigned char data; /* body of small packets */
65};
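
The leadin_ignore/tailout_ignore bookkeeping described above is filled in on the transmit side; the transmit routine later in this file is the authoritative version. The following is only a rough, hypothetical sketch of the arithmetic (the helper name is illustrative and not part of the driver).

/*
 * Illustration only: round the useful bytes of an skb out to whole
 * cachelines for the BTE and record how much of the padding the
 * receiver should discard.
 */
static void xpnet_fill_transfer_fields(struct xpnet_message *msg,
				       struct sk_buff *skb)
{
	u64 start_addr = (u64)skb->data & ~(L1_CACHE_BYTES - 1);
	u64 end_addr = L1_CACHE_ALIGN((u64)skb_tail_pointer(skb));

	msg->leadin_ignore = (u64)skb->data - start_addr;
	msg->tailout_ignore = end_addr - (u64)skb_tail_pointer(skb);
	msg->size = end_addr - start_addr;
	msg->buf_pa = __pa(start_addr);
}
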
66
67/*
68 * Determine the size of our message, the cacheline aligned size,
69 * and then the number of messages we will request from XPC.
70 *
71 * XPC expects each message to exist in an individual cacheline.
72 */
73#define XPNET_MSG_SIZE (L1_CACHE_BYTES - XPC_MSG_PAYLOAD_OFFSET)
74#define XPNET_MSG_DATA_MAX \
75 (XPNET_MSG_SIZE - (u64)(&((struct xpnet_message *)0)->data))
76#define XPNET_MSG_ALIGNED_SIZE (L1_CACHE_ALIGN(XPNET_MSG_SIZE))
77#define XPNET_MSG_NENTRIES (PAGE_SIZE / XPNET_MSG_ALIGNED_SIZE)
78
79#define XPNET_MAX_KTHREADS (XPNET_MSG_NENTRIES + 1)
80#define XPNET_MAX_IDLE_KTHREADS (XPNET_MSG_NENTRIES + 1)
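
For a feel of the numbers behind these macros, assume a 128-byte L1 cacheline, 16 KB pages, and an XPC_MSG_PAYLOAD_OFFSET well under one cacheline (typical sn2 values, but these are configuration-dependent assumptions, not constants of the driver):

/*
 * Worked example under the assumed configuration above:
 *
 *   XPNET_MSG_SIZE         < 128, so
 *   XPNET_MSG_ALIGNED_SIZE = L1_CACHE_ALIGN(XPNET_MSG_SIZE) = 128
 *   XPNET_MSG_NENTRIES     = 16384 / 128                    = 128
 *   XPNET_MAX_KTHREADS     = XPNET_MSG_NENTRIES + 1         = 129
 */
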
81
82/*
83 * Version number of XPNET implementation. XPNET can always talk to versions
84 * with the same major #, and never talk to versions with a different major #.
85 */
86#define _XPNET_VERSION(_major, _minor) (((_major) << 4) | (_minor))
87#define XPNET_VERSION_MAJOR(_v) ((_v) >> 4)
88#define XPNET_VERSION_MINOR(_v) ((_v) & 0xf)
89
90#define XPNET_VERSION _XPNET_VERSION(1, 0) /* version 1.0 */
91#define XPNET_VERSION_EMBED _XPNET_VERSION(1, 1) /* version 1.1 */
92#define XPNET_MAGIC 0x88786984 /* "XNET" */
93
94#define XPNET_VALID_MSG(_m) \
95 ((XPNET_VERSION_MAJOR(_m->version) == XPNET_VERSION_MAJOR(XPNET_VERSION)) \
96    && (_m->magic == XPNET_MAGIC))
97
98#define XPNET_DEVICE_NAME "xp0"
99
100/*
101 * When messages are queued with xpc_send_notify, a kmalloc'd buffer
102 * of the following type is passed as a notification cookie. When the
103 * notification function is called, we use the cookie to decide
104 * whether all outstanding message sends have completed. The skb can
105 * then be released.
106 */
107struct xpnet_pending_msg {
108 struct list_head free_list;
109 struct sk_buff *skb;
110 atomic_t use_count;
111};
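
The cookie's use_count lets one skb back several in-flight XPC messages (one per destination partition on a broadcast). Below is a hypothetical sketch of the notify callback that consumes the cookie; the function name is illustrative, and the real completion handler later in this file is authoritative.

/*
 * Sketch (assumption): called by XPC once per queued message; when the
 * last outstanding send for this skb has been acknowledged, release the
 * skb and the cookie.
 */
static void
xpnet_send_completed_sketch(enum xpc_retval reason, partid_t partid,
			    int channel, void *__qm)
{
	struct xpnet_pending_msg *queued_msg = __qm;

	if (atomic_dec_return(&queued_msg->use_count) == 0) {
		dev_kfree_skb_any(queued_msg->skb);
		kfree(queued_msg);
	}
}
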
112
113/* driver specific structure pointed to by the device structure */
114struct xpnet_dev_private {
115 struct net_device_stats stats;
116};
117
118struct net_device *xpnet_device;
119
120/*
121 * When we are notified of other partitions activating, we add them to
122 * our bitmask of partitions to which we broadcast.
123 */
124static u64 xpnet_broadcast_partitions;
125/* protect above */
126static DEFINE_SPINLOCK(xpnet_broadcast_lock);
127
128/*
129 * Since the Block Transfer Engine (BTE) is being used for the transfer
130 * and it relies upon cache-line size transfers, we need to reserve at
131 * least one cache-line for head and tail alignment. The BTE is
132 * limited to 8MB transfers.
133 *
134 * Testing has shown that changing MTU to greater than 64KB has no effect
135 * on TCP as the two sides negotiate a Max Segment Size that is limited
136 * to 64K. Other protocols may use packets greater than this, but for
137 * now, the default is 64KB.
138 */
139#define XPNET_MAX_MTU (0x800000UL - L1_CACHE_BYTES)
140/* 32KB has been determined to be the ideal */
141#define XPNET_DEF_MTU (0x8000UL)
142
143/*
144 * The partition id is encapsulated in the MAC address. The following
145 * define locates the octet the partid is in.
146 */
147#define XPNET_PARTID_OCTET 1
148#define XPNET_LICENSE_OCTET 2
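
On transmit, the destination partition is recovered from the Ethernet header the stack has already built: octet XPNET_PARTID_OCTET of the destination MAC carries the partid, and the broadcast MAC (first octet 0xff) fans the frame out to every partition set in xpnet_broadcast_partitions. A minimal, hypothetical sketch of that extraction follows; the helper name is illustrative, and the real transmit path in this file is authoritative.

/*
 * Sketch (assumption): read the destination partid back out of the
 * destination MAC address of an outgoing frame.
 */
static partid_t xpnet_dest_partid_sketch(struct sk_buff *skb)
{
	struct ethhdr *eh = (struct ethhdr *)skb->data;

	return (partid_t)eh->h_dest[XPNET_PARTID_OCTET];
}
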
149
150/*
151 * Define the XPNET debug device structure that is to be used with dev_dbg(),
152 * dev_err(), dev_warn(), and dev_info().
153 */
154struct device_driver xpnet_dbg_name = {
155 .name = "xpnet"
156};
157
158struct device xpnet_dbg_subname = {
159 .bus_id = {0}, /* set to "" */
160 .driver = &xpnet_dbg_name
161};
162
163struct device *xpnet = &xpnet_dbg_subname;
164
165/*
166 * Packet was received by XPC and forwarded to us.
167 */
168static void
169xpnet_receive(partid_t partid, int channel, struct xpnet_message *msg)
170{
171 struct sk_buff *skb;
172 bte_result_t bret;
173 struct xpnet_dev_private *priv =
174 (struct xpnet_dev_private *)xpnet_device->priv;
175
176 if (!XPNET_VALID_MSG(msg)) {
177 /*
178 * Packet with a different XPC version. Ignore.
179 */
180 xpc_received(partid, channel, (void *)msg);
181
182 priv->stats.rx_errors++;
183
184 return;
185 }
186 dev_dbg(xpnet, "received 0x%lx, %d, %d, %d\n", msg->buf_pa, msg->size,
187 msg->leadin_ignore, msg->tailout_ignore);
188
189 /* reserve an extra cache line */
190 skb = dev_alloc_skb(msg->size + L1_CACHE_BYTES);
191 if (!skb) {
192 dev_err(xpnet, "failed on dev_alloc_skb(%d)\n",
193 msg->size + L1_CACHE_BYTES);
194
195 xpc_received(partid, channel, (void *)msg);
196
197 priv->stats.rx_errors++;
198
199 return;
200 }
201
202 /*
203 * The allocated skb has some reserved space.
204 * In order to use bte_copy, we need to get the
205 * skb->data pointer moved forward.
206 */
207 skb_reserve(skb, (L1_CACHE_BYTES - ((u64)skb->data &
208 (L1_CACHE_BYTES - 1)) +
209 msg->leadin_ignore));
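	/*
	 * Worked example (editorial, assuming L1_CACHE_BYTES == 128): if
	 * dev_alloc_skb() returned skb->data == ...1044 and leadin_ignore
	 * is 20, the reservation is (128 - 0x44) + 20 = 80 bytes.  The BTE
	 * destination (skb->data rounded down to a cache line) is then
	 * ...1080 and skb->data itself points 20 bytes past it, exactly
	 * where the payload begins.
	 */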
210
211 /*
212 * Update the tail pointer to indicate data actually
213 * transferred.
214 */
215 skb_put(skb, (msg->size - msg->leadin_ignore - msg->tailout_ignore));
216
217 /*
218 * Move the data over from the other side.
219 */
220 if ((XPNET_VERSION_MINOR(msg->version) == 1) &&
221 (msg->embedded_bytes != 0)) {
222 dev_dbg(xpnet, "copying embedded message. memcpy(0x%p, 0x%p, "
223 "%lu)\n", skb->data, &msg->data,
224 (size_t)msg->embedded_bytes);
225
226 skb_copy_to_linear_data(skb, &msg->data,
227 (size_t)msg->embedded_bytes);
228 } else {
229 dev_dbg(xpnet, "transferring buffer to the skb->data area;\n\t"
230 "bte_copy(0x%p, 0x%p, %hu)\n", (void *)msg->buf_pa,
231 (void *)__pa((u64)skb->data & ~(L1_CACHE_BYTES - 1)),
232 msg->size);
233
234 bret = bte_copy(msg->buf_pa,
235 __pa((u64)skb->data & ~(L1_CACHE_BYTES - 1)),
236 msg->size, (BTE_NOTIFY | BTE_WACQUIRE), NULL);
237
238 if (bret != BTE_SUCCESS) {
239 /*
240 * >>> Need better way of cleaning skb. Currently skb
241 * >>> appears in_use and we can't just call
242 * >>> dev_kfree_skb.
243 */
244 dev_err(xpnet, "bte_copy(0x%p, 0x%p, 0x%hx) returned "
245 "error=0x%x\n", (void *)msg->buf_pa,
246 (void *)__pa((u64)skb->data &
247 ~(L1_CACHE_BYTES - 1)),
248 msg->size, bret);
249
250 xpc_received(partid, channel, (void *)msg);
251
252 priv->stats.rx_errors++;
253
254 return;
255 }
256 }
257
258 dev_dbg(xpnet, "<skb->head=0x%p skb->data=0x%p skb->tail=0x%p "
259 "skb->end=0x%p skb->len=%d\n", (void *)skb->head,
260 (void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb),
261 skb->len);
262
263 skb->protocol = eth_type_trans(skb, xpnet_device);
264 skb->ip_summed = CHECKSUM_UNNECESSARY;
265
266 dev_dbg(xpnet, "passing skb to network layer\n"
267 KERN_DEBUG "\tskb->head=0x%p skb->data=0x%p skb->tail=0x%p "
268 "skb->end=0x%p skb->len=%d\n",
269 (void *)skb->head, (void *)skb->data, skb_tail_pointer(skb),
270 skb_end_pointer(skb), skb->len);
271
272 xpnet_device->last_rx = jiffies;
273 priv->stats.rx_packets++;
274 priv->stats.rx_bytes += skb->len + ETH_HLEN;
275
276 netif_rx_ni(skb);
277 xpc_received(partid, channel, (void *)msg);
278}
279
280/*
281 * This is the handler which XPC calls during any sort of change in
282 * state or message reception on a connection.
283 */
284static void
285xpnet_connection_activity(enum xpc_retval reason, partid_t partid, int channel,
286 void *data, void *key)
287{
288 long bp;
289
290 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
291 DBUG_ON(channel != XPC_NET_CHANNEL);
292
293 switch (reason) {
294 case xpcMsgReceived: /* message received */
295 DBUG_ON(data == NULL);
296
297 xpnet_receive(partid, channel, (struct xpnet_message *)data);
298 break;
299
300 case xpcConnected: /* connection completed to a partition */
301 spin_lock_bh(&xpnet_broadcast_lock);
302 xpnet_broadcast_partitions |= 1UL << (partid - 1);
303 bp = xpnet_broadcast_partitions;
304 spin_unlock_bh(&xpnet_broadcast_lock);
305
306 netif_carrier_on(xpnet_device);
307
308 dev_dbg(xpnet, "%s connection created to partition %d; "
309 "xpnet_broadcast_partitions=0x%lx\n",
310 xpnet_device->name, partid, bp);
311 break;
312
313 default:
314 spin_lock_bh(&xpnet_broadcast_lock);
315 xpnet_broadcast_partitions &= ~(1UL << (partid - 1));
316 bp = xpnet_broadcast_partitions;
317 spin_unlock_bh(&xpnet_broadcast_lock);
318
319 if (bp == 0)
320 netif_carrier_off(xpnet_device);
321
322 dev_dbg(xpnet, "%s disconnected from partition %d; "
323 "xpnet_broadcast_partitions=0x%lx\n",
324 xpnet_device->name, partid, bp);
325 break;
326
327 }
328}
329
330static int
331xpnet_dev_open(struct net_device *dev)
332{
333 enum xpc_retval ret;
334
335 dev_dbg(xpnet, "calling xpc_connect(%d, 0x%p, NULL, %ld, %ld, %ld, "
336 "%ld)\n", XPC_NET_CHANNEL, xpnet_connection_activity,
337 XPNET_MSG_SIZE, XPNET_MSG_NENTRIES, XPNET_MAX_KTHREADS,
338 XPNET_MAX_IDLE_KTHREADS);
339
340 ret = xpc_connect(XPC_NET_CHANNEL, xpnet_connection_activity, NULL,
341 XPNET_MSG_SIZE, XPNET_MSG_NENTRIES,
342 XPNET_MAX_KTHREADS, XPNET_MAX_IDLE_KTHREADS);
343 if (ret != xpcSuccess) {
344 dev_err(xpnet, "ifconfig up of %s failed on XPC connect, "
345 "ret=%d\n", dev->name, ret);
346
347 return -ENOMEM;
348 }
349
350 dev_dbg(xpnet, "ifconfig up of %s; XPC connected\n", dev->name);
351
352 return 0;
353}
354
355static int
356xpnet_dev_stop(struct net_device *dev)
357{
358 xpc_disconnect(XPC_NET_CHANNEL);
359
360 dev_dbg(xpnet, "ifconfig down of %s; XPC disconnected\n", dev->name);
361
362 return 0;
363}
364
365static int
366xpnet_dev_change_mtu(struct net_device *dev, int new_mtu)
367{
368 /* 68 comes from min TCP+IP+MAC header */
369 if ((new_mtu < 68) || (new_mtu > XPNET_MAX_MTU)) {
370 dev_err(xpnet, "ifconfig %s mtu %d failed; value must be "
371 "between 68 and %ld\n", dev->name, new_mtu,
372 XPNET_MAX_MTU);
373 return -EINVAL;
374 }
375
376 dev->mtu = new_mtu;
377 dev_dbg(xpnet, "ifconfig %s mtu set to %d\n", dev->name, new_mtu);
378 return 0;
379}
380
381/*
382 * Required for the net_device structure.
383 */
384static int
385xpnet_dev_set_config(struct net_device *dev, struct ifmap *new_map)
386{
387 return 0;
388}
389
390/*
391 * Return statistics to the caller.
392 */
393static struct net_device_stats *
394xpnet_dev_get_stats(struct net_device *dev)
395{
396 struct xpnet_dev_private *priv;
397
398 priv = (struct xpnet_dev_private *)dev->priv;
399
400 return &priv->stats;
401}
402
403/*
404 * Notification that the other end has received the message and
405 * DMA'd the skb information. At this point, they are done with
406 * our side. When all recipients are done processing, we
407 * release the skb and then release our pending message structure.
408 */
409static void
410xpnet_send_completed(enum xpc_retval reason, partid_t partid, int channel,
411 void *__qm)
412{
413 struct xpnet_pending_msg *queued_msg = (struct xpnet_pending_msg *)__qm;
414
415 DBUG_ON(queued_msg == NULL);
416
417 dev_dbg(xpnet, "message to %d notified with reason %d\n",
418 partid, reason);
419
420 if (atomic_dec_return(&queued_msg->use_count) == 0) {
 421 dev_dbg(xpnet, "all acks for skb->head=0x%p\n",
422 (void *)queued_msg->skb->head);
423
424 dev_kfree_skb_any(queued_msg->skb);
425 kfree(queued_msg);
426 }
427}
428
429/*
430 * Network layer has formatted a packet (skb) and is ready to place it
431 * "on the wire". Prepare and send an xpnet_message to all partitions
432 * which have connected with us and are targets of this packet.
433 *
434 * MAC-NOTE: For the XPNET driver, the MAC address contains the
435 * destination partition_id. If the destination partition id word
 436 * is 0xff, this packet is to be broadcast to all partitions.
437 */
438static int
439xpnet_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
440{
441 struct xpnet_pending_msg *queued_msg;
442 enum xpc_retval ret;
443 struct xpnet_message *msg;
444 u64 start_addr, end_addr;
445 long dp;
446 u8 second_mac_octet;
447 partid_t dest_partid;
448 struct xpnet_dev_private *priv;
449 u16 embedded_bytes;
450
451 priv = (struct xpnet_dev_private *)dev->priv;
452
453 dev_dbg(xpnet, ">skb->head=0x%p skb->data=0x%p skb->tail=0x%p "
454 "skb->end=0x%p skb->len=%d\n", (void *)skb->head,
455 (void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb),
456 skb->len);
457
458 /*
459 * The xpnet_pending_msg tracks how many outstanding
460 * xpc_send_notifies are relying on this skb. When none
461 * remain, release the skb.
462 */
463 queued_msg = kmalloc(sizeof(struct xpnet_pending_msg), GFP_ATOMIC);
464 if (queued_msg == NULL) {
465 dev_warn(xpnet, "failed to kmalloc %ld bytes; dropping "
466 "packet\n", sizeof(struct xpnet_pending_msg));
467
468 priv->stats.tx_errors++;
469
470 return -ENOMEM;
471 }
472
473 /* get the beginning of the first cacheline and end of last */
474 start_addr = ((u64)skb->data & ~(L1_CACHE_BYTES - 1));
475 end_addr = L1_CACHE_ALIGN((u64)skb_tail_pointer(skb));
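	/*
	 * Worked example (editorial, L1_CACHE_BYTES == 128): for
	 * skb->data == ...2044 and skb_tail_pointer() == ...20f0,
	 *	start_addr = ...2000	end_addr = ...2100
	 * so the message built below carries size = 0x100 (two whole cache
	 * lines), leadin_ignore = 0x44 and tailout_ignore = 0x10, and the
	 * receiver trims the padding from both ends.
	 */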
476
477 /* calculate how many bytes to embed in the XPC message */
478 embedded_bytes = 0;
479 if (unlikely(skb->len <= XPNET_MSG_DATA_MAX)) {
480 /* skb->data does fit so embed */
481 embedded_bytes = skb->len;
482 }
483
484 /*
 485 * Since the sends occur asynchronously, we set the count to one
 486 * and begin sending.  Any send that completes before we are done
 487 * queueing therefore cannot free the skb; the final decrement at
 488 * the end of this routine releases it instead.  This also handles
 489 * a packet destined for a partition which is no longer up.
490 */
491 atomic_set(&queued_msg->use_count, 1);
492 queued_msg->skb = skb;
493
494 second_mac_octet = skb->data[XPNET_PARTID_OCTET];
495 if (second_mac_octet == 0xff) {
496 /* we are being asked to broadcast to all partitions */
497 dp = xpnet_broadcast_partitions;
498 } else if (second_mac_octet != 0) {
499 dp = xpnet_broadcast_partitions &
500 (1UL << (second_mac_octet - 1));
501 } else {
502 /* 0 is an invalid partid. Ignore */
503 dp = 0;
504 }
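	/*
	 * Editorial example: with xpnet_broadcast_partitions == 0x12
	 * (partitions 2 and 5 up), an octet of 0xff gives dp == 0x12
	 * (broadcast), an octet of 5 gives dp == 0x10 (unicast to
	 * partition 5), and an octet of 7 gives dp == 0, so the packet
	 * goes nowhere.
	 */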
 505 dev_dbg(xpnet, "destination partitions mask (dp) = 0x%lx\n", dp);
506
507 /*
508 * If we wanted to allow promiscuous mode to work like an
509 * unswitched network, this would be a good point to OR in a
510 * mask of partitions which should be receiving all packets.
511 */
512
513 /*
514 * Main send loop.
515 */
516 for (dest_partid = 1; dp && dest_partid < XP_MAX_PARTITIONS;
517 dest_partid++) {
518
519 if (!(dp & (1UL << (dest_partid - 1)))) {
520 /* not destined for this partition */
521 continue;
522 }
523
524 /* remove this partition from the destinations mask */
525 dp &= ~(1UL << (dest_partid - 1));
526
527 /* found a partition to send to */
528
529 ret = xpc_allocate(dest_partid, XPC_NET_CHANNEL,
530 XPC_NOWAIT, (void **)&msg);
531 if (unlikely(ret != xpcSuccess))
532 continue;
533
534 msg->embedded_bytes = embedded_bytes;
535 if (unlikely(embedded_bytes != 0)) {
536 msg->version = XPNET_VERSION_EMBED;
537 dev_dbg(xpnet, "calling memcpy(0x%p, 0x%p, 0x%lx)\n",
538 &msg->data, skb->data, (size_t)embedded_bytes);
539 skb_copy_from_linear_data(skb, &msg->data,
540 (size_t)embedded_bytes);
541 } else {
542 msg->version = XPNET_VERSION;
543 }
544 msg->magic = XPNET_MAGIC;
545 msg->size = end_addr - start_addr;
546 msg->leadin_ignore = (u64)skb->data - start_addr;
547 msg->tailout_ignore = end_addr - (u64)skb_tail_pointer(skb);
548 msg->buf_pa = __pa(start_addr);
549
550 dev_dbg(xpnet, "sending XPC message to %d:%d\n"
551 KERN_DEBUG "msg->buf_pa=0x%lx, msg->size=%u, "
552 "msg->leadin_ignore=%u, msg->tailout_ignore=%u\n",
553 dest_partid, XPC_NET_CHANNEL, msg->buf_pa, msg->size,
554 msg->leadin_ignore, msg->tailout_ignore);
555
556 atomic_inc(&queued_msg->use_count);
557
558 ret = xpc_send_notify(dest_partid, XPC_NET_CHANNEL, msg,
559 xpnet_send_completed, queued_msg);
560 if (unlikely(ret != xpcSuccess)) {
561 atomic_dec(&queued_msg->use_count);
562 continue;
563 }
564 }
565
 566	priv->stats.tx_packets++;
 567	priv->stats.tx_bytes += skb->len;
 568
 569	if (atomic_dec_return(&queued_msg->use_count) == 0) {
 570		dev_dbg(xpnet, "no partitions to receive packet destined for "
 571			"%d\n", dest_partid);
 572
 573		dev_kfree_skb(skb);
 574		kfree(queued_msg);
 575	}
576
577 return 0;
578}
579
580/*
581 * Deal with transmit timeouts coming from the network layer.
582 */
583static void
584xpnet_dev_tx_timeout(struct net_device *dev)
585{
586 struct xpnet_dev_private *priv;
587
588 priv = (struct xpnet_dev_private *)dev->priv;
589
590 priv->stats.tx_errors++;
591 return;
592}
593
594static int __init
595xpnet_init(void)
596{
597 int i;
598 u32 license_num;
599 int result = -ENOMEM;
600
601 if (!ia64_platform_is("sn2"))
602 return -ENODEV;
603
604 dev_info(xpnet, "registering network device %s\n", XPNET_DEVICE_NAME);
605
606 /*
607 * use ether_setup() to init the majority of our device
608 * structure and then override the necessary pieces.
609 */
610 xpnet_device = alloc_netdev(sizeof(struct xpnet_dev_private),
611 XPNET_DEVICE_NAME, ether_setup);
612 if (xpnet_device == NULL)
613 return -ENOMEM;
614
615 netif_carrier_off(xpnet_device);
616
617 xpnet_device->mtu = XPNET_DEF_MTU;
618 xpnet_device->change_mtu = xpnet_dev_change_mtu;
619 xpnet_device->open = xpnet_dev_open;
620 xpnet_device->get_stats = xpnet_dev_get_stats;
621 xpnet_device->stop = xpnet_dev_stop;
622 xpnet_device->hard_start_xmit = xpnet_dev_hard_start_xmit;
623 xpnet_device->tx_timeout = xpnet_dev_tx_timeout;
624 xpnet_device->set_config = xpnet_dev_set_config;
625
626 /*
 627 * A MAC address is treated as multicast when the LSB of its first
 628 * octet is set.  We chose a first octet that leaves that bit clear
 629 * and is unlikely to collide with any vendor's officially issued MAC.
630 */
631 xpnet_device->dev_addr[0] = 0xfe;
632 xpnet_device->dev_addr[XPNET_PARTID_OCTET] = sn_partition_id;
633 license_num = sn_partition_serial_number_val();
634 for (i = 3; i >= 0; i--) {
635 xpnet_device->dev_addr[XPNET_LICENSE_OCTET + i] =
636 license_num & 0xff;
637 license_num = license_num >> 8;
638 }
639
640 /*
641 * ether_setup() sets this to a multicast device. We are
642 * really not supporting multicast at this time.
643 */
644 xpnet_device->flags &= ~IFF_MULTICAST;
645
646 /*
647 * No need to checksum as it is a DMA transfer. The BTE will
648 * report an error if the data is not retrievable and the
649 * packet will be dropped.
650 */
651 xpnet_device->features = NETIF_F_NO_CSUM;
652
653 result = register_netdev(xpnet_device);
654 if (result != 0)
655 free_netdev(xpnet_device);
656
657 return result;
658}
659
660module_init(xpnet_init);
661
662static void __exit
663xpnet_exit(void)
664{
665 dev_info(xpnet, "unregistering network device %s\n",
 666 xpnet_device->name);
667
668 unregister_netdev(xpnet_device);
669
670 free_netdev(xpnet_device);
671}
672
673module_exit(xpnet_exit);
674
675MODULE_AUTHOR("Silicon Graphics, Inc.");
676MODULE_DESCRIPTION("Cross Partition Network adapter (XPNET)");
677MODULE_LICENSE("GPL");