aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/misc/sgi-xp
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/misc/sgi-xp')
-rw-r--r--drivers/misc/sgi-xp/Makefile11
-rw-r--r--drivers/misc/sgi-xp/xp.h463
-rw-r--r--drivers/misc/sgi-xp/xp_main.c279
-rw-r--r--drivers/misc/sgi-xp/xp_nofault.S35
-rw-r--r--drivers/misc/sgi-xp/xpc.h1187
-rw-r--r--drivers/misc/sgi-xp/xpc_channel.c2243
-rw-r--r--drivers/misc/sgi-xp/xpc_main.c1323
-rw-r--r--drivers/misc/sgi-xp/xpc_partition.c1174
-rw-r--r--drivers/misc/sgi-xp/xpnet.c677
9 files changed, 7392 insertions, 0 deletions
diff --git a/drivers/misc/sgi-xp/Makefile b/drivers/misc/sgi-xp/Makefile
new file mode 100644
index 000000000000..b6e40a7958ce
--- /dev/null
+++ b/drivers/misc/sgi-xp/Makefile
@@ -0,0 +1,11 @@
1#
2# Makefile for SGI's XP devices.
3#
4
5obj-$(CONFIG_SGI_XP) += xp.o
6xp-y := xp_main.o xp_nofault.o
7
8obj-$(CONFIG_SGI_XP) += xpc.o
9xpc-y := xpc_main.o xpc_channel.o xpc_partition.o
10
11obj-$(CONFIG_SGI_XP) += xpnet.o
diff --git a/drivers/misc/sgi-xp/xp.h b/drivers/misc/sgi-xp/xp.h
new file mode 100644
index 000000000000..5515234be86a
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp.h
@@ -0,0 +1,463 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (C) 2004-2008 Silicon Graphics, Inc. All rights reserved.
7 */
8
9/*
10 * External Cross Partition (XP) structures and defines.
11 */
12
13#ifndef _DRIVERS_MISC_SGIXP_XP_H
14#define _DRIVERS_MISC_SGIXP_XP_H
15
#include <linux/cache.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
#include <linux/stddef.h>
#include <asm/sn/types.h>
#include <asm/sn/bte.h>
21
22#ifdef USE_DBUG_ON
23#define DBUG_ON(condition) BUG_ON(condition)
24#else
25#define DBUG_ON(condition)
26#endif
27
28/*
29 * Define the maximum number of logically defined partitions the system
30 * can support. It is constrained by the maximum number of hardware
31 * partitionable regions. The term 'region' in this context refers to the
32 * minimum number of nodes that can comprise an access protection grouping.
33 * The access protection is in regards to memory, IPI and IOI.
34 *
35 * The maximum number of hardware partitionable regions is equal to the
36 * maximum number of nodes in the entire system divided by the minimum number
37 * of nodes that comprise an access protection grouping.
38 */
39#define XP_MAX_PARTITIONS 64
40
41/*
42 * Define the number of u64s required to represent all the C-brick nasids
43 * as a bitmap. The cross-partition kernel modules deal only with
44 * C-brick nasids, thus the need for bitmaps which don't account for
45 * odd-numbered (non C-brick) nasids.
46 */
47#define XP_MAX_PHYSNODE_ID (MAX_NUMALINK_NODES / 2)
48#define XP_NASID_MASK_BYTES ((XP_MAX_PHYSNODE_ID + 7) / 8)
49#define XP_NASID_MASK_WORDS ((XP_MAX_PHYSNODE_ID + 63) / 64)
50
51/*
52 * Wrapper for bte_copy() that should it return a failure status will retry
53 * the bte_copy() once in the hope that the failure was due to a temporary
54 * aberration (i.e., the link going down temporarily).
55 *
56 * src - physical address of the source of the transfer.
57 * vdst - virtual address of the destination of the transfer.
58 * len - number of bytes to transfer from source to destination.
59 * mode - see bte_copy() for definition.
60 * notification - see bte_copy() for definition.
61 *
62 * Note: xp_bte_copy() should never be called while holding a spinlock.
63 */
64static inline bte_result_t
65xp_bte_copy(u64 src, u64 vdst, u64 len, u64 mode, void *notification)
66{
67 bte_result_t ret;
68 u64 pdst = ia64_tpa(vdst);
69
70 /*
71 * Ensure that the physically mapped memory is contiguous.
72 *
73 * We do this by ensuring that the memory is from region 7 only.
74 * If the need should arise to use memory from one of the other
75 * regions, then modify the BUG_ON() statement to ensure that the
76 * memory from that region is always physically contiguous.
77 */
78 BUG_ON(REGION_NUMBER(vdst) != RGN_KERNEL);
79
80 ret = bte_copy(src, pdst, len, mode, notification);
81 if ((ret != BTE_SUCCESS) && BTE_ERROR_RETRY(ret)) {
82 if (!in_interrupt())
83 cond_resched();
84
85 ret = bte_copy(src, pdst, len, mode, notification);
86 }
87
88 return ret;
89}
90
91/*
92 * XPC establishes channel connections between the local partition and any
93 * other partition that is currently up. Over these channels, kernel-level
94 * `users' can communicate with their counterparts on the other partitions.
95 *
96 * The maximum number of channels is limited to eight. For performance reasons,
97 * the internal cross partition structures require sixteen bytes per channel,
98 * and eight allows all of this interface-shared info to fit in one cache line.
99 *
100 * XPC_NCHANNELS reflects the total number of channels currently defined.
101 * If the need for additional channels arises, one can simply increase
102 * XPC_NCHANNELS accordingly. If the day should come where that number
103 * exceeds the MAXIMUM number of channels allowed (eight), then one will need
104 * to make changes to the XPC code to allow for this.
105 */
106#define XPC_MEM_CHANNEL 0 /* memory channel number */
107#define XPC_NET_CHANNEL 1 /* network channel number */
108
109#define XPC_NCHANNELS 2 /* #of defined channels */
110#define XPC_MAX_NCHANNELS 8 /* max #of channels allowed */
111
112#if XPC_NCHANNELS > XPC_MAX_NCHANNELS
113#error XPC_NCHANNELS exceeds MAXIMUM allowed.
114#endif
115
116/*
117 * The format of an XPC message is as follows:
118 *
119 * +-------+--------------------------------+
120 * | flags |////////////////////////////////|
121 * +-------+--------------------------------+
122 * | message # |
123 * +----------------------------------------+
124 * | payload (user-defined message) |
125 * | |
126 * :
127 * | |
128 * +----------------------------------------+
129 *
130 * The size of the payload is defined by the user via xpc_connect(). A user-
131 * defined message resides in the payload area.
132 *
133 * The user should have no dealings with the message header, but only the
134 * message's payload. When a message entry is allocated (via xpc_allocate())
135 * a pointer to the payload area is returned and not the actual beginning of
136 * the XPC message. The user then constructs a message in the payload area
137 * and passes that pointer as an argument on xpc_send() or xpc_send_notify().
138 *
139 * The size of a message entry (within a message queue) must be a cacheline
140 * sized multiple in order to facilitate the BTE transfer of messages from one
141 * message queue to another. A macro, XPC_MSG_SIZE(), is provided for the user
142 * that wants to fit as many msg entries as possible in a given memory size
143 * (e.g. a memory page).
144 */
145struct xpc_msg {
146 u8 flags; /* FOR XPC INTERNAL USE ONLY */
147 u8 reserved[7]; /* FOR XPC INTERNAL USE ONLY */
148 s64 number; /* FOR XPC INTERNAL USE ONLY */
149
150 u64 payload; /* user defined portion of message */
151};
152
/*
 * Byte offset of the user's payload within struct xpc_msg, i.e., the size
 * of the XPC-internal header (flags, reserved, number) that precedes it.
 */
#define XPC_MSG_PAYLOAD_OFFSET	((u64)offsetof(struct xpc_msg, payload))

/*
 * Size in bytes of a message queue entry whose payload area is
 * _payload_size bytes: the header plus the payload, passed through
 * L1_CACHE_ALIGN() so that each entry is a cacheline sized multiple.
 */
#define XPC_MSG_SIZE(_payload_size) \
	L1_CACHE_ALIGN(XPC_MSG_PAYLOAD_OFFSET + (_payload_size))
156
157/*
158 * Define the return values and values passed to user's callout functions.
159 * (It is important to add new value codes at the end just preceding
160 * xpcUnknownReason, which must have the highest numerical value.)
161 */
162enum xpc_retval {
163 xpcSuccess = 0,
164
165 xpcNotConnected, /* 1: channel is not connected */
166 xpcConnected, /* 2: channel connected (opened) */
167 xpcRETIRED1, /* 3: (formerly xpcDisconnected) */
168
169 xpcMsgReceived, /* 4: message received */
170 xpcMsgDelivered, /* 5: message delivered and acknowledged */
171
172 xpcRETIRED2, /* 6: (formerly xpcTransferFailed) */
173
174 xpcNoWait, /* 7: operation would require wait */
175 xpcRetry, /* 8: retry operation */
176 xpcTimeout, /* 9: timeout in xpc_allocate_msg_wait() */
177 xpcInterrupted, /* 10: interrupted wait */
178
179 xpcUnequalMsgSizes, /* 11: message size disparity between sides */
180 xpcInvalidAddress, /* 12: invalid address */
181
182 xpcNoMemory, /* 13: no memory available for XPC structures */
183 xpcLackOfResources, /* 14: insufficient resources for operation */
184 xpcUnregistered, /* 15: channel is not registered */
185 xpcAlreadyRegistered, /* 16: channel is already registered */
186
187 xpcPartitionDown, /* 17: remote partition is down */
188 xpcNotLoaded, /* 18: XPC module is not loaded */
189 xpcUnloading, /* 19: this side is unloading XPC module */
190
191 xpcBadMagic, /* 20: XPC MAGIC string not found */
192
193 xpcReactivating, /* 21: remote partition was reactivated */
194
195 xpcUnregistering, /* 22: this side is unregistering channel */
196 xpcOtherUnregistering, /* 23: other side is unregistering channel */
197
198 xpcCloneKThread, /* 24: cloning kernel thread */
199 xpcCloneKThreadFailed, /* 25: cloning kernel thread failed */
200
201 xpcNoHeartbeat, /* 26: remote partition has no heartbeat */
202
203 xpcPioReadError, /* 27: PIO read error */
204 xpcPhysAddrRegFailed, /* 28: registration of phys addr range failed */
205
206 xpcBteDirectoryError, /* 29: maps to BTEFAIL_DIR */
207 xpcBtePoisonError, /* 30: maps to BTEFAIL_POISON */
208 xpcBteWriteError, /* 31: maps to BTEFAIL_WERR */
209 xpcBteAccessError, /* 32: maps to BTEFAIL_ACCESS */
210 xpcBtePWriteError, /* 33: maps to BTEFAIL_PWERR */
211 xpcBtePReadError, /* 34: maps to BTEFAIL_PRERR */
212 xpcBteTimeOutError, /* 35: maps to BTEFAIL_TOUT */
213 xpcBteXtalkError, /* 36: maps to BTEFAIL_XTERR */
214 xpcBteNotAvailable, /* 37: maps to BTEFAIL_NOTAVAIL */
215 xpcBteUnmappedError, /* 38: unmapped BTEFAIL_ error */
216
217 xpcBadVersion, /* 39: bad version number */
218 xpcVarsNotSet, /* 40: the XPC variables are not set up */
219 xpcNoRsvdPageAddr, /* 41: unable to get rsvd page's phys addr */
220 xpcInvalidPartid, /* 42: invalid partition ID */
221 xpcLocalPartid, /* 43: local partition ID */
222
223 xpcOtherGoingDown, /* 44: other side going down, reason unknown */
224 xpcSystemGoingDown, /* 45: system is going down, reason unknown */
225 xpcSystemHalt, /* 46: system is being halted */
226 xpcSystemReboot, /* 47: system is being rebooted */
227 xpcSystemPoweroff, /* 48: system is being powered off */
228
229 xpcDisconnecting, /* 49: channel disconnecting (closing) */
230
231 xpcOpenCloseError, /* 50: channel open/close protocol error */
232
233 xpcDisconnected, /* 51: channel disconnected (closed) */
234
235 xpcBteSh2Start, /* 52: BTE CRB timeout */
236
237 /* 53: 0x1 BTE Error Response Short */
238 xpcBteSh2RspShort = xpcBteSh2Start + BTEFAIL_SH2_RESP_SHORT,
239
240 /* 54: 0x2 BTE Error Response Long */
241 xpcBteSh2RspLong = xpcBteSh2Start + BTEFAIL_SH2_RESP_LONG,
242
243 /* 56: 0x4 BTE Error Response DSB */
244 xpcBteSh2RspDSB = xpcBteSh2Start + BTEFAIL_SH2_RESP_DSP,
245
246 /* 60: 0x8 BTE Error Response Access */
247 xpcBteSh2RspAccess = xpcBteSh2Start + BTEFAIL_SH2_RESP_ACCESS,
248
249 /* 68: 0x10 BTE Error CRB timeout */
250 xpcBteSh2CRBTO = xpcBteSh2Start + BTEFAIL_SH2_CRB_TO,
251
252 /* 84: 0x20 BTE Error NACK limit */
253 xpcBteSh2NACKLimit = xpcBteSh2Start + BTEFAIL_SH2_NACK_LIMIT,
254
255 /* 115: BTE end */
256 xpcBteSh2End = xpcBteSh2Start + BTEFAIL_SH2_ALL,
257
258 xpcUnknownReason /* 116: unknown reason - must be last in enum */
259};
260
261/*
262 * Define the callout function types used by XPC to update the user on
263 * connection activity and state changes (via the user function registered by
264 * xpc_connect()) and to notify them of messages received and delivered (via
265 * the user function registered by xpc_send_notify()).
266 *
267 * The two function types are xpc_channel_func and xpc_notify_func and
268 * both share the following arguments, with the exception of "data", which
269 * only xpc_channel_func has.
270 *
271 * Arguments:
272 *
273 * reason - reason code. (See following table.)
274 * partid - partition ID associated with condition.
275 * ch_number - channel # associated with condition.
276 * data - pointer to optional data. (See following table.)
277 * key - pointer to optional user-defined value provided as the "key"
278 * argument to xpc_connect() or xpc_send_notify().
279 *
280 * In the following table the "Optional Data" column applies to callouts made
281 * to functions registered by xpc_connect(). A "NA" in that column indicates
282 * that this reason code can be passed to functions registered by
283 * xpc_send_notify() (i.e. they don't have data arguments).
284 *
285 * Also, the first three reason codes in the following table indicate
286 * success, whereas the others indicate failure. When a failure reason code
287 * is received, one can assume that the channel is not connected.
288 *
289 *
290 * Reason Code | Cause | Optional Data
291 * =====================+================================+=====================
292 * xpcConnected | connection has been established| max #of entries
293 * | to the specified partition on | allowed in message
294 * | the specified channel | queue
295 * ---------------------+--------------------------------+---------------------
296 * xpcMsgReceived | an XPC message arrived from | address of payload
297 * | the specified partition on the |
298 * | specified channel | [the user must call
299 * | | xpc_received() when
300 * | | finished with the
301 * | | payload]
302 * ---------------------+--------------------------------+---------------------
303 * xpcMsgDelivered | notification that the message | NA
304 * | was delivered to the intended |
305 * | recipient and that they have |
306 * | acknowledged its receipt by |
307 * | calling xpc_received() |
308 * =====================+================================+=====================
309 * xpcUnequalMsgSizes | can't connect to the specified | NULL
310 * | partition on the specified |
311 * | channel because of mismatched |
312 * | message sizes |
313 * ---------------------+--------------------------------+---------------------
314 * xpcNoMemory | insufficient memory available | NULL
315 * | to allocate message queue |
316 * ---------------------+--------------------------------+---------------------
317 * xpcLackOfResources | lack of resources to create | NULL
318 * | the necessary kthreads to |
319 * | support the channel |
320 * ---------------------+--------------------------------+---------------------
321 * xpcUnregistering | this side's user has | NULL or NA
322 * | unregistered by calling |
323 * | xpc_disconnect() |
324 * ---------------------+--------------------------------+---------------------
325 * xpcOtherUnregistering| the other side's user has | NULL or NA
326 * | unregistered by calling |
327 * | xpc_disconnect() |
328 * ---------------------+--------------------------------+---------------------
329 * xpcNoHeartbeat | the other side's XPC is no | NULL or NA
330 * | longer heartbeating |
331 * | |
332 * ---------------------+--------------------------------+---------------------
333 * xpcUnloading | this side's XPC module is | NULL or NA
334 * | being unloaded |
335 * | |
336 * ---------------------+--------------------------------+---------------------
337 * xpcOtherUnloading | the other side's XPC module is | NULL or NA
338 * | being unloaded |
339 * | |
340 * ---------------------+--------------------------------+---------------------
341 * xpcPioReadError | xp_nofault_PIOR() returned an | NULL or NA
342 * | error while sending an IPI |
343 * | |
344 * ---------------------+--------------------------------+---------------------
345 * xpcInvalidAddress | the address either received or | NULL or NA
346 * | sent by the specified partition|
347 * | is invalid |
348 * ---------------------+--------------------------------+---------------------
349 * xpcBteNotAvailable | attempt to pull data from the | NULL or NA
350 * xpcBtePoisonError | specified partition over the |
351 * xpcBteWriteError | specified channel via a |
352 * xpcBteAccessError | bte_copy() failed |
353 * xpcBteTimeOutError | |
354 * xpcBteXtalkError | |
355 * xpcBteDirectoryError | |
356 * xpcBteGenericError | |
357 * xpcBteUnmappedError | |
358 * ---------------------+--------------------------------+---------------------
359 * xpcUnknownReason | the specified channel to the | NULL or NA
360 * | specified partition was |
361 * | unavailable for unknown reasons|
362 * =====================+================================+=====================
363 */
364
365typedef void (*xpc_channel_func) (enum xpc_retval reason, partid_t partid,
366 int ch_number, void *data, void *key);
367
368typedef void (*xpc_notify_func) (enum xpc_retval reason, partid_t partid,
369 int ch_number, void *key);
370
371/*
372 * The following is a registration entry. There is a global array of these,
373 * one per channel. It is used to record the connection registration made
374 * by the users of XPC. As long as a registration entry exists, for any
375 * partition that comes up, XPC will attempt to establish a connection on
376 * that channel. Notification that a connection has been made will occur via
377 * the xpc_channel_func function.
378 *
379 * The 'func' field points to the function to call when asynchronous
380 * notification is required for such events as: a connection established/lost,
381 * or an incoming message received, or an error condition encountered. A
382 * non-NULL 'func' field indicates that there is an active registration for
383 * the channel.
384 */
385struct xpc_registration {
386 struct mutex mutex;
387 xpc_channel_func func; /* function to call */
388 void *key; /* pointer to user's key */
389 u16 nentries; /* #of msg entries in local msg queue */
390 u16 msg_size; /* message queue's message size */
391 u32 assigned_limit; /* limit on #of assigned kthreads */
392 u32 idle_limit; /* limit on #of idle kthreads */
393} ____cacheline_aligned;
394
395#define XPC_CHANNEL_REGISTERED(_c) (xpc_registrations[_c].func != NULL)
396
397/* the following are valid xpc_allocate() flags */
398#define XPC_WAIT 0 /* wait flag */
399#define XPC_NOWAIT 1 /* no wait flag */
400
401struct xpc_interface {
402 void (*connect) (int);
403 void (*disconnect) (int);
404 enum xpc_retval (*allocate) (partid_t, int, u32, void **);
405 enum xpc_retval (*send) (partid_t, int, void *);
406 enum xpc_retval (*send_notify) (partid_t, int, void *,
407 xpc_notify_func, void *);
408 void (*received) (partid_t, int, void *);
409 enum xpc_retval (*partid_to_nasids) (partid_t, void *);
410};
411
412extern struct xpc_interface xpc_interface;
413
414extern void xpc_set_interface(void (*)(int),
415 void (*)(int),
416 enum xpc_retval (*)(partid_t, int, u32, void **),
417 enum xpc_retval (*)(partid_t, int, void *),
418 enum xpc_retval (*)(partid_t, int, void *,
419 xpc_notify_func, void *),
420 void (*)(partid_t, int, void *),
421 enum xpc_retval (*)(partid_t, void *));
422extern void xpc_clear_interface(void);
423
424extern enum xpc_retval xpc_connect(int, xpc_channel_func, void *, u16,
425 u16, u32, u32);
426extern void xpc_disconnect(int);
427
428static inline enum xpc_retval
429xpc_allocate(partid_t partid, int ch_number, u32 flags, void **payload)
430{
431 return xpc_interface.allocate(partid, ch_number, flags, payload);
432}
433
434static inline enum xpc_retval
435xpc_send(partid_t partid, int ch_number, void *payload)
436{
437 return xpc_interface.send(partid, ch_number, payload);
438}
439
440static inline enum xpc_retval
441xpc_send_notify(partid_t partid, int ch_number, void *payload,
442 xpc_notify_func func, void *key)
443{
444 return xpc_interface.send_notify(partid, ch_number, payload, func, key);
445}
446
447static inline void
448xpc_received(partid_t partid, int ch_number, void *payload)
449{
450 return xpc_interface.received(partid, ch_number, payload);
451}
452
453static inline enum xpc_retval
454xpc_partid_to_nasids(partid_t partid, void *nasids)
455{
456 return xpc_interface.partid_to_nasids(partid, nasids);
457}
458
459extern u64 xp_nofault_PIOR_target;
460extern int xp_nofault_PIOR(void *);
461extern int xp_error_PIOR(void);
462
463#endif /* _DRIVERS_MISC_SGIXP_XP_H */
diff --git a/drivers/misc/sgi-xp/xp_main.c b/drivers/misc/sgi-xp/xp_main.c
new file mode 100644
index 000000000000..1fbf99bae963
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp_main.c
@@ -0,0 +1,279 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9/*
10 * Cross Partition (XP) base.
11 *
12 * XP provides a base from which its users can interact
13 * with XPC, yet not be dependent on XPC.
14 *
15 */
16
17#include <linux/kernel.h>
18#include <linux/interrupt.h>
19#include <linux/module.h>
20#include <linux/mutex.h>
21#include <asm/sn/intr.h>
22#include <asm/sn/sn_sal.h>
23#include "xp.h"
24
25/*
26 * The export of xp_nofault_PIOR needs to happen here since it is defined
27 * in drivers/misc/sgi-xp/xp_nofault.S. The target of the nofault read is
28 * defined here.
29 */
30EXPORT_SYMBOL_GPL(xp_nofault_PIOR);
31
32u64 xp_nofault_PIOR_target;
33EXPORT_SYMBOL_GPL(xp_nofault_PIOR_target);
34
35/*
36 * xpc_registrations[] keeps track of xpc_connect()'s done by the kernel-level
37 * users of XPC.
38 */
39struct xpc_registration xpc_registrations[XPC_NCHANNELS];
40EXPORT_SYMBOL_GPL(xpc_registrations);
41
42/*
43 * Initialize the XPC interface to indicate that XPC isn't loaded.
44 */
45static enum xpc_retval
46xpc_notloaded(void)
47{
48 return xpcNotLoaded;
49}
50
51struct xpc_interface xpc_interface = {
52 (void (*)(int))xpc_notloaded,
53 (void (*)(int))xpc_notloaded,
54 (enum xpc_retval(*)(partid_t, int, u32, void **))xpc_notloaded,
55 (enum xpc_retval(*)(partid_t, int, void *))xpc_notloaded,
56 (enum xpc_retval(*)(partid_t, int, void *, xpc_notify_func, void *))
57 xpc_notloaded,
58 (void (*)(partid_t, int, void *))xpc_notloaded,
59 (enum xpc_retval(*)(partid_t, void *))xpc_notloaded
60};
61EXPORT_SYMBOL_GPL(xpc_interface);
62
63/*
64 * XPC calls this when it (the XPC module) has been loaded.
65 */
66void
67xpc_set_interface(void (*connect) (int),
68 void (*disconnect) (int),
69 enum xpc_retval (*allocate) (partid_t, int, u32, void **),
70 enum xpc_retval (*send) (partid_t, int, void *),
71 enum xpc_retval (*send_notify) (partid_t, int, void *,
72 xpc_notify_func, void *),
73 void (*received) (partid_t, int, void *),
74 enum xpc_retval (*partid_to_nasids) (partid_t, void *))
75{
76 xpc_interface.connect = connect;
77 xpc_interface.disconnect = disconnect;
78 xpc_interface.allocate = allocate;
79 xpc_interface.send = send;
80 xpc_interface.send_notify = send_notify;
81 xpc_interface.received = received;
82 xpc_interface.partid_to_nasids = partid_to_nasids;
83}
84EXPORT_SYMBOL_GPL(xpc_set_interface);
85
86/*
87 * XPC calls this when it (the XPC module) is being unloaded.
88 */
89void
90xpc_clear_interface(void)
91{
92 xpc_interface.connect = (void (*)(int))xpc_notloaded;
93 xpc_interface.disconnect = (void (*)(int))xpc_notloaded;
94 xpc_interface.allocate = (enum xpc_retval(*)(partid_t, int, u32,
95 void **))xpc_notloaded;
96 xpc_interface.send = (enum xpc_retval(*)(partid_t, int, void *))
97 xpc_notloaded;
98 xpc_interface.send_notify = (enum xpc_retval(*)(partid_t, int, void *,
99 xpc_notify_func,
100 void *))xpc_notloaded;
101 xpc_interface.received = (void (*)(partid_t, int, void *))
102 xpc_notloaded;
103 xpc_interface.partid_to_nasids = (enum xpc_retval(*)(partid_t, void *))
104 xpc_notloaded;
105}
106EXPORT_SYMBOL_GPL(xpc_clear_interface);
107
108/*
109 * Register for automatic establishment of a channel connection whenever
110 * a partition comes up.
111 *
112 * Arguments:
113 *
114 * ch_number - channel # to register for connection.
115 * func - function to call for asynchronous notification of channel
116 * state changes (i.e., connection, disconnection, error) and
117 * the arrival of incoming messages.
118 * key - pointer to optional user-defined value that gets passed back
119 * to the user on any callouts made to func.
120 * payload_size - size in bytes of the XPC message's payload area which
121 * contains a user-defined message. The user should make
122 * this large enough to hold their largest message.
123 * nentries - max #of XPC message entries a message queue can contain.
124 * The actual number, which is determined when a connection
125 * is established and may be less then requested, will be
126 * passed to the user via the xpcConnected callout.
127 * assigned_limit - max number of kthreads allowed to be processing
128 * messages (per connection) at any given instant.
129 * idle_limit - max number of kthreads allowed to be idle at any given
130 * instant.
131 */
132enum xpc_retval
133xpc_connect(int ch_number, xpc_channel_func func, void *key, u16 payload_size,
134 u16 nentries, u32 assigned_limit, u32 idle_limit)
135{
136 struct xpc_registration *registration;
137
138 DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS);
139 DBUG_ON(payload_size == 0 || nentries == 0);
140 DBUG_ON(func == NULL);
141 DBUG_ON(assigned_limit == 0 || idle_limit > assigned_limit);
142
143 registration = &xpc_registrations[ch_number];
144
145 if (mutex_lock_interruptible(&registration->mutex) != 0)
146 return xpcInterrupted;
147
148 /* if XPC_CHANNEL_REGISTERED(ch_number) */
149 if (registration->func != NULL) {
150 mutex_unlock(&registration->mutex);
151 return xpcAlreadyRegistered;
152 }
153
154 /* register the channel for connection */
155 registration->msg_size = XPC_MSG_SIZE(payload_size);
156 registration->nentries = nentries;
157 registration->assigned_limit = assigned_limit;
158 registration->idle_limit = idle_limit;
159 registration->key = key;
160 registration->func = func;
161
162 mutex_unlock(&registration->mutex);
163
164 xpc_interface.connect(ch_number);
165
166 return xpcSuccess;
167}
168EXPORT_SYMBOL_GPL(xpc_connect);
169
170/*
171 * Remove the registration for automatic connection of the specified channel
172 * when a partition comes up.
173 *
174 * Before returning this xpc_disconnect() will wait for all connections on the
175 * specified channel have been closed/torndown. So the caller can be assured
176 * that they will not be receiving any more callouts from XPC to their
177 * function registered via xpc_connect().
178 *
179 * Arguments:
180 *
181 * ch_number - channel # to unregister.
182 */
183void
184xpc_disconnect(int ch_number)
185{
186 struct xpc_registration *registration;
187
188 DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS);
189
190 registration = &xpc_registrations[ch_number];
191
192 /*
193 * We've decided not to make this a down_interruptible(), since we
194 * figured XPC's users will just turn around and call xpc_disconnect()
195 * again anyways, so we might as well wait, if need be.
196 */
197 mutex_lock(&registration->mutex);
198
199 /* if !XPC_CHANNEL_REGISTERED(ch_number) */
200 if (registration->func == NULL) {
201 mutex_unlock(&registration->mutex);
202 return;
203 }
204
205 /* remove the connection registration for the specified channel */
206 registration->func = NULL;
207 registration->key = NULL;
208 registration->nentries = 0;
209 registration->msg_size = 0;
210 registration->assigned_limit = 0;
211 registration->idle_limit = 0;
212
213 xpc_interface.disconnect(ch_number);
214
215 mutex_unlock(&registration->mutex);
216
217 return;
218}
219EXPORT_SYMBOL_GPL(xpc_disconnect);
220
221int __init
222xp_init(void)
223{
224 int ret, ch_number;
225 u64 func_addr = *(u64 *)xp_nofault_PIOR;
226 u64 err_func_addr = *(u64 *)xp_error_PIOR;
227
228 if (!ia64_platform_is("sn2"))
229 return -ENODEV;
230
231 /*
232 * Register a nofault code region which performs a cross-partition
233 * PIO read. If the PIO read times out, the MCA handler will consume
234 * the error and return to a kernel-provided instruction to indicate
235 * an error. This PIO read exists because it is guaranteed to timeout
236 * if the destination is down (AMO operations do not timeout on at
237 * least some CPUs on Shubs <= v1.2, which unfortunately we have to
238 * work around).
239 */
240 ret = sn_register_nofault_code(func_addr, err_func_addr, err_func_addr,
241 1, 1);
242 if (ret != 0) {
243 printk(KERN_ERR "XP: can't register nofault code, error=%d\n",
244 ret);
245 }
246 /*
247 * Setup the nofault PIO read target. (There is no special reason why
248 * SH_IPI_ACCESS was selected.)
249 */
250 if (is_shub2())
251 xp_nofault_PIOR_target = SH2_IPI_ACCESS0;
252 else
253 xp_nofault_PIOR_target = SH1_IPI_ACCESS;
254
255 /* initialize the connection registration mutex */
256 for (ch_number = 0; ch_number < XPC_NCHANNELS; ch_number++)
257 mutex_init(&xpc_registrations[ch_number].mutex);
258
259 return 0;
260}
261
262module_init(xp_init);
263
264void __exit
265xp_exit(void)
266{
267 u64 func_addr = *(u64 *)xp_nofault_PIOR;
268 u64 err_func_addr = *(u64 *)xp_error_PIOR;
269
270 /* unregister the PIO read nofault code region */
271 (void)sn_register_nofault_code(func_addr, err_func_addr,
272 err_func_addr, 1, 0);
273}
274
275module_exit(xp_exit);
276
277MODULE_AUTHOR("Silicon Graphics, Inc.");
278MODULE_DESCRIPTION("Cross Partition (XP) base");
279MODULE_LICENSE("GPL");
diff --git a/drivers/misc/sgi-xp/xp_nofault.S b/drivers/misc/sgi-xp/xp_nofault.S
new file mode 100644
index 000000000000..e38d43319429
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp_nofault.S
@@ -0,0 +1,35 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9/*
10 * The xp_nofault_PIOR function takes a pointer to a remote PIO register
11 * and attempts to load and consume a value from it. This function
12 * will be registered as a nofault code block. In the event that the
13 * PIO read fails, the MCA handler will force the error to look
14 * corrected and vector to the xp_error_PIOR which will return an error.
15 *
16 * The definition of "consumption" and the time it takes for an MCA
17 * to surface is processor implementation specific. This code
18 * is sufficient on Itanium through the Montvale processor family.
19 * It may need to be adjusted for future processor implementations.
20 *
21 * extern int xp_nofault_PIOR(void *remote_register);
22 */
23
24 .global xp_nofault_PIOR
25xp_nofault_PIOR:
26 mov r8=r0 // Stage a success return value
27 ld8.acq r9=[r32];; // PIO Read the specified register
28 adds r9=1,r9;; // Add to force consumption
29 srlz.i;; // Allow time for MCA to surface
30 br.ret.sptk.many b0;; // Return success
31
32 .global xp_error_PIOR
33xp_error_PIOR:
34 mov r8=1 // Return value of 1
35 br.ret.sptk.many b0;; // Return failure
diff --git a/drivers/misc/sgi-xp/xpc.h b/drivers/misc/sgi-xp/xpc.h
new file mode 100644
index 000000000000..9eb6d4a3269c
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc.h
@@ -0,0 +1,1187 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9/*
10 * Cross Partition Communication (XPC) structures and macros.
11 */
12
13#ifndef _DRIVERS_MISC_SGIXP_XPC_H
14#define _DRIVERS_MISC_SGIXP_XPC_H
15
16#include <linux/interrupt.h>
17#include <linux/sysctl.h>
18#include <linux/device.h>
19#include <linux/mutex.h>
20#include <linux/completion.h>
21#include <asm/pgtable.h>
22#include <asm/processor.h>
23#include <asm/sn/bte.h>
24#include <asm/sn/clksupport.h>
25#include <asm/sn/addrs.h>
26#include <asm/sn/mspec.h>
27#include <asm/sn/shub_mmr.h>
28#include "xp.h"
29
30/*
31 * XPC Version numbers consist of a major and minor number. XPC can always
32 * talk to versions with same major #, and never talk to versions with a
33 * different major #.
34 */
35#define _XPC_VERSION(_maj, _min) (((_maj) << 4) | ((_min) & 0xf))
36#define XPC_VERSION_MAJOR(_v) ((_v) >> 4)
37#define XPC_VERSION_MINOR(_v) ((_v) & 0xf)
38
39/*
40 * The next macros define word or bit representations for given
41 * C-brick nasid in either the SAL provided bit array representing
42 * nasids in the partition/machine or the AMO_t array used for
43 * inter-partition initiation communications.
44 *
45 * For SN2 machines, C-Bricks are always even numbered NASIDs. As
46 * such, some space will be saved by insisting that nasid information
47 * passed from SAL always be packed for C-Bricks and the
48 * cross-partition interrupts use the same packing scheme.
49 */
50#define XPC_NASID_W_INDEX(_n) (((_n) / 64) / 2)
51#define XPC_NASID_B_INDEX(_n) (((_n) / 2) & (64 - 1))
52#define XPC_NASID_IN_ARRAY(_n, _p) ((_p)[XPC_NASID_W_INDEX(_n)] & \
53 (1UL << XPC_NASID_B_INDEX(_n)))
54#define XPC_NASID_FROM_W_B(_w, _b) (((_w) * 64 + (_b)) * 2)
55
56#define XPC_HB_DEFAULT_INTERVAL 5 /* incr HB every x secs */
57#define XPC_HB_CHECK_DEFAULT_INTERVAL 20 /* check HB every x secs */
58
59/* define the process name of HB checker and the CPU it is pinned to */
60#define XPC_HB_CHECK_THREAD_NAME "xpc_hb"
61#define XPC_HB_CHECK_CPU 0
62
63/* define the process name of the discovery thread */
64#define XPC_DISCOVERY_THREAD_NAME "xpc_discovery"
65
66/*
67 * the reserved page
68 *
69 * SAL reserves one page of memory per partition for XPC. Though a full page
70 * in length (16384 bytes), its starting address is not page aligned, but it
71 * is cacheline aligned. The reserved page consists of the following:
72 *
73 * reserved page header
74 *
75 * The first cacheline of the reserved page contains the header
76 * (struct xpc_rsvd_page). Before SAL initialization has completed,
77 * SAL has set up the following fields of the reserved page header:
78 * SAL_signature, SAL_version, partid, and nasids_size. The other
79 * fields are set up by XPC. (xpc_rsvd_page points to the local
80 * partition's reserved page.)
81 *
82 * part_nasids mask
83 * mach_nasids mask
84 *
85 * SAL also sets up two bitmaps (or masks), one that reflects the actual
86 * nasids in this partition (part_nasids), and the other that reflects
87 * the actual nasids in the entire machine (mach_nasids). We're only
88 * interested in the even numbered nasids (which contain the processors
89 * and/or memory), so we only need half as many bits to represent the
90 * nasids. The part_nasids mask is located starting at the first cacheline
91 * following the reserved page header. The mach_nasids mask follows right
92 * after the part_nasids mask. The size in bytes of each mask is reflected
93 * by the reserved page header field 'nasids_size'. (Local partition's
94 * mask pointers are xpc_part_nasids and xpc_mach_nasids.)
95 *
96 * vars
97 * vars part
98 *
99 * Immediately following the mach_nasids mask are the XPC variables
100 * required by other partitions. First are those that are generic to all
101 * partitions (vars), followed on the next available cacheline by those
102 * which are partition specific (vars part). These are setup by XPC.
103 * (Local partition's vars pointers are xpc_vars and xpc_vars_part.)
104 *
105 * Note: Until vars_pa is set, the partition XPC code has not been initialized.
106 */
107struct xpc_rsvd_page {
108 u64 SAL_signature; /* SAL: unique signature */
109 u64 SAL_version; /* SAL: version */
110 u8 partid; /* SAL: partition ID */
111 u8 version; /* XPC: reserved page version, see XPC_RP_VERSION */
112 u8 pad1[6]; /* align to next u64 in cacheline */
113 u64 vars_pa; /* physical address of struct xpc_vars */
114 struct timespec stamp; /* time when reserved page was setup by XPC */
115 u64 pad2[9]; /* align to last u64 in cacheline */
116 u64 nasids_size; /* SAL: size of each nasid mask in bytes */
117};
118
119#define XPC_RP_VERSION _XPC_VERSION(1, 1) /* version 1.1 of the reserved page */
120
/*
 * True for reserved page version 1.1 or later. The argument is parenthesized
 * so that any expression (e.g. 'v & 0xff' or a ?: expression) may be passed
 * without being re-associated with the '>=' comparison.
 */
#define XPC_SUPPORTS_RP_STAMP(_version) \
	((_version) >= _XPC_VERSION(1, 1))
123
/*
 * compare stamps - the return value is:
 *
 *	< 0, if stamp1 < stamp2
 *	= 0, if stamp1 == stamp2
 *	> 0, if stamp1 > stamp2
 *
 * The fields are compared directly instead of being subtracted. tv_sec and
 * tv_nsec can be wider than int, so storing their difference in an int could
 * truncate it and yield the wrong sign (or a bogus "equal" result).
 */
static inline int
xpc_compare_stamps(struct timespec *stamp1, struct timespec *stamp2)
{
	if (stamp1->tv_sec != stamp2->tv_sec)
		return (stamp1->tv_sec < stamp2->tv_sec) ? -1 : 1;

	if (stamp1->tv_nsec != stamp2->tv_nsec)
		return (stamp1->tv_nsec < stamp2->tv_nsec) ? -1 : 1;

	return 0;
}
142
143/*
144 * Define the structures by which XPC variables can be exported to other
145 * partitions. (There are two: struct xpc_vars and struct xpc_vars_part)
146 */
147
148/*
149 * The following structure describes the partition generic variables
150 * needed by other partitions in order to properly initialize.
151 *
152 * struct xpc_vars version number also applies to struct xpc_vars_part.
153 * Changes to either structure and/or related functionality should be
154 * reflected by incrementing either the major or minor version numbers
155 * of struct xpc_vars.
156 */
157struct xpc_vars {
158 u8 version; /* version of these vars, see XPC_V_VERSION */
159 u64 heartbeat; /* heartbeat value; expected to keep changing while heartbeat_offline is 0 */
160 u64 heartbeating_to_mask; /* bit (1UL << partid) set by xpc_allow_hb(), cleared by xpc_disallow_hb() */
161 u64 heartbeat_offline; /* if 0, heartbeat should be changing */
162 int act_nasid; /* NOTE(review): presumably the act/deact nasid, cf. remote_act_nasid -- confirm */
163 int act_phys_cpuid; /* NOTE(review): presumably the act/deact phys cpuid, cf. remote_act_phys_cpuid -- confirm */
164 u64 vars_part_pa; /* NOTE(review): presumably paddr of the struct xpc_vars_part array -- confirm */
165 u64 amos_page_pa; /* paddr of page of AMOs from MSPEC driver */
166 AMO_t *amos_page; /* vaddr of page of AMOs from MSPEC driver */
167};
168
169#define XPC_V_VERSION _XPC_VERSION(3, 1) /* version 3.1 of the cross vars */
170
/*
 * True for cross vars version 3.1 or later. The argument is parenthesized
 * so that any expression may be passed without being re-associated with the
 * '>=' comparison.
 */
#define XPC_SUPPORTS_DISENGAGE_REQUEST(_version) \
	((_version) >= _XPC_VERSION(3, 1))
173
174static inline int
175xpc_hb_allowed(partid_t partid, struct xpc_vars *vars)
176{
177 return ((vars->heartbeating_to_mask & (1UL << partid)) != 0);
178}
179
180static inline void
181xpc_allow_hb(partid_t partid, struct xpc_vars *vars)
182{
183 u64 old_mask, new_mask;
184
185 do {
186 old_mask = vars->heartbeating_to_mask;
187 new_mask = (old_mask | (1UL << partid));
188 } while (cmpxchg(&vars->heartbeating_to_mask, old_mask, new_mask) !=
189 old_mask);
190}
191
192static inline void
193xpc_disallow_hb(partid_t partid, struct xpc_vars *vars)
194{
195 u64 old_mask, new_mask;
196
197 do {
198 old_mask = vars->heartbeating_to_mask;
199 new_mask = (old_mask & ~(1UL << partid));
200 } while (cmpxchg(&vars->heartbeating_to_mask, old_mask, new_mask) !=
201 old_mask);
202}
203
204/*
205 * The AMOs page consists of a number of AMO variables which are divided into
206 * four groups. The first two groups are used to identify an IRQ's sender.
207 * These two groups consist of 64 and 128 AMO variables respectively. The last
208 * two groups, consisting of just one AMO variable each, are used to identify
209 * the remote partitions that are currently engaged (from the viewpoint of
210 * the XPC running on the remote partition).
211 */
212#define XPC_NOTIFY_IRQ_AMOS 0
213#define XPC_ACTIVATE_IRQ_AMOS (XPC_NOTIFY_IRQ_AMOS + XP_MAX_PARTITIONS)
214#define XPC_ENGAGED_PARTITIONS_AMO (XPC_ACTIVATE_IRQ_AMOS + XP_NASID_MASK_WORDS)
215#define XPC_DISENGAGE_REQUEST_AMO (XPC_ENGAGED_PARTITIONS_AMO + 1)
216
217/*
218 * The following structure describes the per partition specific variables.
219 *
220 * An array of these structures, one per partition, will be defined. As a
221 * partition becomes active XPC will copy the array entry corresponding to
222 * itself from that partition. It is desirable that the size of this
223 * structure evenly divide into a cacheline, such that none of the entries
224 * in this array crosses a cacheline boundary. As it is now, each entry
225 * occupies half a cacheline.
226 */
227struct xpc_vars_part {
228 u64 magic; /* presumably XPC_VP_MAGIC1 or XPC_VP_MAGIC2 (first contact protocol) -- confirm */
229
230 u64 openclose_args_pa; /* physical address of open and close args */
231 u64 GPs_pa; /* physical address of Get/Put values */
232
233 u64 IPI_amo_pa; /* physical address of IPI AMO_t structure */
234 int IPI_nasid; /* nasid of where to send IPIs */
235 int IPI_phys_cpuid; /* physical CPU ID of where to send IPIs */
236
237 u8 nchannels; /* #of defined channels supported */
238
239 u8 reserved[23]; /* pad to a full 64 bytes */
240};
241
242/*
243 * The vars_part MAGIC numbers play a part in the first contact protocol.
244 *
245 * MAGIC1 indicates that the per partition specific variables for a remote
246 * partition have been initialized by this partition.
247 *
248 * MAGIC2 indicates that this partition has pulled the remote partition's
249 * per partition variables that pertain to this partition.
250 */
251#define XPC_VP_MAGIC1 0x0053524156435058L /* 'XPCVARS\0'L (little endian) */
252#define XPC_VP_MAGIC2 0x0073726176435058L /* 'XPCvars\0'L (little endian) */
253
254/* the reserved page sizes and offsets */
255
256#define XPC_RP_HEADER_SIZE L1_CACHE_ALIGN(sizeof(struct xpc_rsvd_page))
257#define XPC_RP_VARS_SIZE L1_CACHE_ALIGN(sizeof(struct xpc_vars))
258
259#define XPC_RP_PART_NASIDS(_rp) ((u64 *)((u8 *)(_rp) + XPC_RP_HEADER_SIZE))
260#define XPC_RP_MACH_NASIDS(_rp) (XPC_RP_PART_NASIDS(_rp) + xp_nasid_mask_words)
261#define XPC_RP_VARS(_rp) ((struct xpc_vars *)(XPC_RP_MACH_NASIDS(_rp) + \
262 xp_nasid_mask_words))
263#define XPC_RP_VARS_PART(_rp) ((struct xpc_vars_part *) \
264 ((u8 *)XPC_RP_VARS(_rp) + XPC_RP_VARS_SIZE))
265
/*
 * Functions registered by add_timer() or called by kernel_thread() only
 * allow for a single 64-bit argument. The following macros can be used to
 * pack and unpack two (32-bit, 16-bit or 8-bit) arguments into or out from
 * the passed argument.
 *
 * _arg1 is stored in bits 0-31 and _arg2 in bits 32-63 of the packed value.
 * Each macro argument is parenthesized before it is cast to u64, so that the
 * cast applies to the whole argument expression and not just its first
 * operand.
 */
#define XPC_PACK_ARGS(_arg1, _arg2) \
	((((u64)(_arg1)) & 0xffffffff) | \
	((((u64)(_arg2)) & 0xffffffff) << 32))

#define XPC_UNPACK_ARG1(_args)	(((u64)(_args)) & 0xffffffff)
#define XPC_UNPACK_ARG2(_args)	((((u64)(_args)) >> 32) & 0xffffffff)
278
279/*
280 * Define a Get/Put value pair (pointers) used with a message queue.
281 */
282struct xpc_gp {
283 s64 get; /* Get value */
284 s64 put; /* Put value */
285};
286
287#define XPC_GP_SIZE \
288 L1_CACHE_ALIGN(sizeof(struct xpc_gp) * XPC_NCHANNELS)
289
290/*
291 * Define a structure that contains arguments associated with opening and
292 * closing a channel.
293 */
294struct xpc_openclose_args {
295 u16 reason; /* reason why channel is closing */
296 u16 msg_size; /* sizeof each message entry */
297 u16 remote_nentries; /* #of message entries in remote msg queue */
298 u16 local_nentries; /* #of message entries in local msg queue */
299 u64 local_msgqueue_pa; /* physical address of local message queue */
300};
301
302#define XPC_OPENCLOSE_ARGS_SIZE \
303 L1_CACHE_ALIGN(sizeof(struct xpc_openclose_args) * XPC_NCHANNELS)
304
305/* struct xpc_msg flags */
306
307#define XPC_M_DONE 0x01 /* msg has been received/consumed */
308#define XPC_M_READY 0x02 /* msg is ready to be sent */
309#define XPC_M_INTERRUPT 0x04 /* send interrupt when msg consumed */
310
311#define XPC_MSG_ADDRESS(_payload) \
312 ((struct xpc_msg *)((u8 *)(_payload) - XPC_MSG_PAYLOAD_OFFSET))
313
314/*
315 * Defines notify entry.
316 *
317 * This is used to notify a message's sender that their message was received
318 * and consumed by the intended recipient.
319 */
320struct xpc_notify {
321 u8 type; /* type of notification */
322
323 /* the following two fields are only used if type == XPC_N_CALL */
324 xpc_notify_func func; /* user's notify function */
325 void *key; /* pointer to user's key */
326};
327
328/* struct xpc_notify type of notification */
329
330#define XPC_N_CALL 0x01 /* notify function provided by user */
331
332/*
333 * Define the structure that manages all the stuff required by a channel. In
334 * particular, they are used to manage the messages sent across the channel.
335 *
336 * This structure is private to a partition, and is NOT shared across the
337 * partition boundary.
338 *
339 * There is an array of these structures for each remote partition. It is
340 * allocated at the time a partition becomes active. The array contains one
341 * of these structures for each potential channel connection to that partition.
342 *
343 * Each of these structures manages two message queues (circular buffers).
344 * They are allocated at the time a channel connection is made. One of
345 * these message queues (local_msgqueue) holds the locally created messages
346 * that are destined for the remote partition. The other of these message
347 * queues (remote_msgqueue) is a locally cached copy of the remote partition's
348 * own local_msgqueue.
349 *
350 * The following is a description of the Get/Put pointers used to manage these
351 * two message queues. Consider the local_msgqueue to be on one partition
352 * and the remote_msgqueue to be its cached copy on another partition. A
353 * description of what each of the lettered areas contains is included.
354 *
355 *
356 * local_msgqueue remote_msgqueue
357 *
358 * |/////////| |/////////|
359 * w_remote_GP.get --> +---------+ |/////////|
360 * | F | |/////////|
361 * remote_GP.get --> +---------+ +---------+ <-- local_GP->get
362 * | | | |
363 * | | | E |
364 * | | | |
365 * | | +---------+ <-- w_local_GP.get
366 * | B | |/////////|
367 * | | |////D////|
368 * | | |/////////|
369 * | | +---------+ <-- w_remote_GP.put
370 * | | |////C////|
371 * local_GP->put --> +---------+ +---------+ <-- remote_GP.put
372 * | | |/////////|
373 * | A | |/////////|
374 * | | |/////////|
375 * w_local_GP.put --> +---------+ |/////////|
376 * |/////////| |/////////|
377 *
378 *
379 * ( remote_GP.[get|put] are cached copies of the remote
380 * partition's local_GP->[get|put], and thus their values can
381 * lag behind their counterparts on the remote partition. )
382 *
383 *
384 * A - Messages that have been allocated, but have not yet been sent to the
385 * remote partition.
386 *
387 * B - Messages that have been sent, but have not yet been acknowledged by the
388 * remote partition as having been received.
389 *
390 * C - Area that needs to be prepared for the copying of sent messages, by
391 * the clearing of the message flags of any previously received messages.
392 *
393 * D - Area into which sent messages are to be copied from the remote
394 * partition's local_msgqueue and then delivered to their intended
395 * recipients. [ To allow for a multi-message copy, another pointer
396 * (next_msg_to_pull) has been added to keep track of the next message
397 * number needing to be copied (pulled). It chases after w_remote_GP.put.
398 * Any messages lying between w_local_GP.get and next_msg_to_pull have
399 * been copied and are ready to be delivered. ]
400 *
401 * E - Messages that have been copied and delivered, but have not yet been
402 * acknowledged by the recipient as having been received.
403 *
404 * F - Messages that have been acknowledged, but XPC has not yet notified the
405 * sender that the message was received by its intended recipient.
406 * This is also an area that needs to be prepared for the allocating of
407 * new messages, by the clearing of the message flags of the acknowledged
408 * messages.
409 */
410struct xpc_channel {
411 partid_t partid; /* ID of remote partition connected */
412 spinlock_t lock; /* lock for updating this structure */
413 u32 flags; /* general flags */
414
415 enum xpc_retval reason; /* reason why channel is disconnect'g */
416 int reason_line; /* line# disconnect initiated from */
417
418 u16 number; /* channel # */
419
420 u16 msg_size; /* sizeof each msg entry */
421 u16 local_nentries; /* #of msg entries in local msg queue */
422 u16 remote_nentries; /* #of msg entries in remote msg queue */
423
424 void *local_msgqueue_base; /* base address of kmalloc'd space */
425 struct xpc_msg *local_msgqueue; /* local message queue */
426 void *remote_msgqueue_base; /* base address of kmalloc'd space */
427 struct xpc_msg *remote_msgqueue; /* cached copy of remote partition's */
428 /* local message queue */
429 u64 remote_msgqueue_pa; /* phys addr of remote partition's */
430 /* local message queue */
431
432 atomic_t references; /* #of external references to queues */
433
434 atomic_t n_on_msg_allocate_wq; /* #on msg allocation wait queue */
435 wait_queue_head_t msg_allocate_wq; /* msg allocation wait queue */
436
437 u8 delayed_IPI_flags; /* IPI flags received, but delayed */
438 /* action until channel disconnected */
439
440 /* queue of msg senders who want to be notified when msg received */
441
442 atomic_t n_to_notify; /* #of msg senders to notify */
443 struct xpc_notify *notify_queue; /* notify queue for messages sent */
444
445 xpc_channel_func func; /* user's channel function */
446 void *key; /* pointer to user's key */
447
448 struct mutex msg_to_pull_mutex; /* next msg to pull serialization */
449 struct completion wdisconnect_wait; /* wait for channel disconnect */
450
451 struct xpc_openclose_args *local_openclose_args; /* args passed on */
452 /* opening or closing of channel */
453
454 /* various flavors of local and remote Get/Put values */
455
456 struct xpc_gp *local_GP; /* local Get/Put values */
457 struct xpc_gp remote_GP; /* remote Get/Put values */
458 struct xpc_gp w_local_GP; /* working local Get/Put values */
459 struct xpc_gp w_remote_GP; /* working remote Get/Put values */
460 s64 next_msg_to_pull; /* Put value of next msg to pull */
461
462 /* kthread management related fields */
463
464 atomic_t kthreads_assigned; /* #of kthreads assigned to channel */
465 u32 kthreads_assigned_limit; /* limit on #of kthreads assigned */
466 atomic_t kthreads_idle; /* #of kthreads idle waiting for work */
467 u32 kthreads_idle_limit; /* limit on #of kthreads idle */
468 atomic_t kthreads_active; /* #of kthreads actively working */
469
470 wait_queue_head_t idle_wq; /* idle kthread wait queue */
471
472} ____cacheline_aligned;
473
474/* struct xpc_channel flags */
475
476#define XPC_C_WASCONNECTED 0x00000001 /* channel was connected */
477
478#define XPC_C_ROPENREPLY 0x00000002 /* remote open channel reply */
479#define XPC_C_OPENREPLY 0x00000004 /* local open channel reply */
480#define XPC_C_ROPENREQUEST 0x00000008 /* remote open channel request */
481#define XPC_C_OPENREQUEST 0x00000010 /* local open channel request */
482
483#define XPC_C_SETUP 0x00000020 /* channel's msgqueues are alloc'd */
484#define XPC_C_CONNECTEDCALLOUT 0x00000040 /* connected callout initiated */
485#define XPC_C_CONNECTEDCALLOUT_MADE \
486 0x00000080 /* connected callout completed */
487#define XPC_C_CONNECTED 0x00000100 /* local channel is connected */
488#define XPC_C_CONNECTING 0x00000200 /* channel is being connected */
489
490#define XPC_C_RCLOSEREPLY 0x00000400 /* remote close channel reply */
491#define XPC_C_CLOSEREPLY 0x00000800 /* local close channel reply */
492#define XPC_C_RCLOSEREQUEST 0x00001000 /* remote close channel request */
493#define XPC_C_CLOSEREQUEST 0x00002000 /* local close channel request */
494
495#define XPC_C_DISCONNECTED 0x00004000 /* channel is disconnected */
496#define XPC_C_DISCONNECTING 0x00008000 /* channel is being disconnected */
497#define XPC_C_DISCONNECTINGCALLOUT \
498 0x00010000 /* disconnecting callout initiated */
499#define XPC_C_DISCONNECTINGCALLOUT_MADE \
500 0x00020000 /* disconnecting callout completed */
501#define XPC_C_WDISCONNECT 0x00040000 /* waiting for channel disconnect */
502
503/*
504 * Manages channels on a partition basis. There is one of these structures
505 * for each partition (a partition will never utilize the structure that
506 * represents itself).
507 */
508struct xpc_partition {
509
510 /* XPC HB infrastructure */
511
512 u8 remote_rp_version; /* version# of partition's rsvd pg */
513 struct timespec remote_rp_stamp; /* time when rsvd pg was initialized */
514 u64 remote_rp_pa; /* phys addr of partition's rsvd pg */
515 u64 remote_vars_pa; /* phys addr of partition's vars */
516 u64 remote_vars_part_pa; /* phys addr of partition's vars part */
517 u64 last_heartbeat; /* HB at last read */
518 u64 remote_amos_page_pa; /* phys addr of partition's amos page */
519 int remote_act_nasid; /* active part's act/deact nasid */
520 int remote_act_phys_cpuid; /* active part's act/deact phys cpuid */
521 u32 act_IRQ_rcvd; /* IRQs since activation */
522 spinlock_t act_lock; /* protect updating of act_state */
523 u8 act_state; /* from XPC HB viewpoint */
524 u8 remote_vars_version; /* version# of partition's vars */
525 enum xpc_retval reason; /* reason partition is deactivating */
526 int reason_line; /* line# deactivation initiated from */
527 int reactivate_nasid; /* nasid in partition to reactivate */
528
529 unsigned long disengage_request_timeout; /* timeout in jiffies */
530 struct timer_list disengage_request_timer;
531
532 /* XPC infrastructure referencing and teardown control */
533
534 u8 setup_state; /* infrastructure setup state */
535 wait_queue_head_t teardown_wq; /* kthread waiting to teardown infra */
536 atomic_t references; /* #of references to infrastructure */
537
538 /*
539 * NONE OF THE PRECEDING FIELDS OF THIS STRUCTURE WILL BE CLEARED WHEN
540 * XPC SETS UP THE NECESSARY INFRASTRUCTURE TO SUPPORT CROSS PARTITION
541 * COMMUNICATION. ALL OF THE FOLLOWING FIELDS WILL BE CLEARED. (THE
542 * 'nchannels' FIELD MUST BE THE FIRST OF THE FIELDS TO BE CLEARED.)
543 */
544
545 u8 nchannels; /* #of defined channels supported */
546 atomic_t nchannels_active; /* #of channels that are not DISCONNECTED */
547 atomic_t nchannels_engaged; /* #of channels engaged with remote part */
548 struct xpc_channel *channels; /* array of channel structures */
549
550 void *local_GPs_base; /* base address of kmalloc'd space */
551 struct xpc_gp *local_GPs; /* local Get/Put values */
552 void *remote_GPs_base; /* base address of kmalloc'd space */
553 struct xpc_gp *remote_GPs; /* copy of remote partition's local */
554 /* Get/Put values */
555 u64 remote_GPs_pa; /* phys address of remote partition's local */
556 /* Get/Put values */
557
558 /* fields used to pass args when opening or closing a channel */
559
560 void *local_openclose_args_base; /* base address of kmalloc'd space */
561 struct xpc_openclose_args *local_openclose_args; /* local's args */
562 void *remote_openclose_args_base; /* base address of kmalloc'd space */
563 struct xpc_openclose_args *remote_openclose_args; /* copy of remote's */
564 /* args */
565 u64 remote_openclose_args_pa; /* phys addr of remote's args */
566
567 /* IPI sending, receiving and handling related fields */
568
569 int remote_IPI_nasid; /* nasid of where to send IPIs */
570 int remote_IPI_phys_cpuid; /* phys CPU ID of where to send IPIs */
571 AMO_t *remote_IPI_amo_va; /* address of remote IPI AMO_t structure */
572
573 AMO_t *local_IPI_amo_va; /* address of IPI AMO_t structure */
574 u64 local_IPI_amo; /* IPI amo flags yet to be handled */
575 char IPI_owner[8]; /* IPI owner's name */
576 struct timer_list dropped_IPI_timer; /* dropped IPI timer */
577
578 spinlock_t IPI_lock; /* IPI handler lock */
579
580 /* channel manager related fields */
581
582 atomic_t channel_mgr_requests; /* #of requests to activate chan mgr */
583 wait_queue_head_t channel_mgr_wq; /* channel mgr's wait queue */
584
585} ____cacheline_aligned;
586
587/* struct xpc_partition act_state values (for XPC HB) */
588
589#define XPC_P_INACTIVE 0x00 /* partition is not active */
590#define XPC_P_ACTIVATION_REQ 0x01 /* created thread to activate */
591#define XPC_P_ACTIVATING 0x02 /* activation thread started */
592#define XPC_P_ACTIVE 0x03 /* xpc_partition_up() was called */
593#define XPC_P_DEACTIVATING 0x04 /* partition deactivation initiated */
594
595#define XPC_DEACTIVATE_PARTITION(_p, _reason) \
596 xpc_deactivate_partition(__LINE__, (_p), (_reason))
597
598/* struct xpc_partition setup_state values */
599
600#define XPC_P_UNSET 0x00 /* infrastructure was never setup */
601#define XPC_P_SETUP 0x01 /* infrastructure is setup */
602#define XPC_P_WTEARDOWN 0x02 /* waiting to teardown infrastructure */
603#define XPC_P_TORNDOWN 0x03 /* infrastructure is torndown */
604
/*
 * struct xpc_partition IPI_timer interval, in jiffies (one quarter of a
 * second), to wait before checking for dropped IPIs. These occur whenever an
 * IPI amo write doesn't complete until after the IPI was received.
 *
 * Expressed with integer arithmetic so that expressions such as
 * 'jiffies + XPC_P_DROPPED_IPI_WAIT' never get promoted to floating point.
 */
#define XPC_P_DROPPED_IPI_WAIT	(HZ / 4)
611
612/* number of seconds to wait for other partitions to disengage */
613#define XPC_DISENGAGE_REQUEST_DEFAULT_TIMELIMIT 90
614
615/* interval in seconds to print 'waiting disengagement' messages */
616#define XPC_DISENGAGE_PRINTMSG_INTERVAL 10
617
618#define XPC_PARTID(_p) ((partid_t) ((_p) - &xpc_partitions[0]))
619
620/* found in xp_main.c */
621extern struct xpc_registration xpc_registrations[];
622
623/* found in xpc_main.c */
624extern struct device *xpc_part;
625extern struct device *xpc_chan;
626extern int xpc_disengage_request_timelimit;
627extern int xpc_disengage_request_timedout;
628extern irqreturn_t xpc_notify_IRQ_handler(int, void *);
629extern void xpc_dropped_IPI_check(struct xpc_partition *);
630extern void xpc_activate_partition(struct xpc_partition *);
631extern void xpc_activate_kthreads(struct xpc_channel *, int);
632extern void xpc_create_kthreads(struct xpc_channel *, int, int);
633extern void xpc_disconnect_wait(int);
634
635/* found in xpc_partition.c */
636extern int xpc_exiting;
637extern struct xpc_vars *xpc_vars;
638extern struct xpc_rsvd_page *xpc_rsvd_page;
639extern struct xpc_vars_part *xpc_vars_part;
640extern struct xpc_partition xpc_partitions[XP_MAX_PARTITIONS + 1];
641extern char *xpc_remote_copy_buffer;
642extern void *xpc_remote_copy_buffer_base;
643extern void *xpc_kmalloc_cacheline_aligned(size_t, gfp_t, void **);
644extern struct xpc_rsvd_page *xpc_rsvd_page_init(void);
645extern void xpc_allow_IPI_ops(void);
646extern void xpc_restrict_IPI_ops(void);
647extern int xpc_identify_act_IRQ_sender(void);
648extern int xpc_partition_disengaged(struct xpc_partition *);
649extern enum xpc_retval xpc_mark_partition_active(struct xpc_partition *);
650extern void xpc_mark_partition_inactive(struct xpc_partition *);
651extern void xpc_discovery(void);
652extern void xpc_check_remote_hb(void);
653extern void xpc_deactivate_partition(const int, struct xpc_partition *,
654 enum xpc_retval);
655extern enum xpc_retval xpc_initiate_partid_to_nasids(partid_t, void *);
656
657/* found in xpc_channel.c */
658extern void xpc_initiate_connect(int);
659extern void xpc_initiate_disconnect(int);
660extern enum xpc_retval xpc_initiate_allocate(partid_t, int, u32, void **);
661extern enum xpc_retval xpc_initiate_send(partid_t, int, void *);
662extern enum xpc_retval xpc_initiate_send_notify(partid_t, int, void *,
663 xpc_notify_func, void *);
664extern void xpc_initiate_received(partid_t, int, void *);
665extern enum xpc_retval xpc_setup_infrastructure(struct xpc_partition *);
666extern enum xpc_retval xpc_pull_remote_vars_part(struct xpc_partition *);
667extern void xpc_process_channel_activity(struct xpc_partition *);
668extern void xpc_connected_callout(struct xpc_channel *);
669extern void xpc_deliver_msg(struct xpc_channel *);
670extern void xpc_disconnect_channel(const int, struct xpc_channel *,
671 enum xpc_retval, unsigned long *);
672extern void xpc_disconnect_callout(struct xpc_channel *, enum xpc_retval);
673extern void xpc_partition_going_down(struct xpc_partition *, enum xpc_retval);
674extern void xpc_teardown_infrastructure(struct xpc_partition *);
675
676static inline void
677xpc_wakeup_channel_mgr(struct xpc_partition *part)
678{
679 if (atomic_inc_return(&part->channel_mgr_requests) == 1)
680 wake_up(&part->channel_mgr_wq);
681}
682
683/*
684 * These next two inlines are used to keep us from tearing down a channel's
685 * msg queues while a thread may be referencing them.
686 */
687static inline void
688xpc_msgqueue_ref(struct xpc_channel *ch)
689{
690 atomic_inc(&ch->references);
691}
692
693static inline void
694xpc_msgqueue_deref(struct xpc_channel *ch)
695{
696 s32 refs = atomic_dec_return(&ch->references);
697
698 DBUG_ON(refs < 0);
699 if (refs == 0)
700 xpc_wakeup_channel_mgr(&xpc_partitions[ch->partid]);
701}
702
703#define XPC_DISCONNECT_CHANNEL(_ch, _reason, _irqflgs) \
704 xpc_disconnect_channel(__LINE__, _ch, _reason, _irqflgs)
705
706/*
707 * These two inlines are used to keep us from tearing down a partition's
708 * setup infrastructure while a thread may be referencing it.
709 */
710static inline void
711xpc_part_deref(struct xpc_partition *part)
712{
713 s32 refs = atomic_dec_return(&part->references);
714
715 DBUG_ON(refs < 0);
716 if (refs == 0 && part->setup_state == XPC_P_WTEARDOWN)
717 wake_up(&part->teardown_wq);
718}
719
720static inline int
721xpc_part_ref(struct xpc_partition *part)
722{
723 int setup;
724
725 atomic_inc(&part->references);
726 setup = (part->setup_state == XPC_P_SETUP);
727 if (!setup)
728 xpc_part_deref(part);
729
730 return setup;
731}
732
/*
 * The following macro is to be used for the setting of the reason and
 * reason_line fields in both the struct xpc_channel and struct xpc_partition
 * structures.
 *
 * It is wrapped in do { } while (0) so that it expands to a single statement
 * and can be used as the body of an unbraced if/else; callers terminate it
 * with a semicolon like any other statement.
 */
#define XPC_SET_REASON(_p, _reason, _line) \
	do { \
		(_p)->reason = (_reason); \
		(_p)->reason_line = (_line); \
	} while (0)
743
744/*
745 * This next set of inlines are used to keep track of when a partition is
746 * potentially engaged in accessing memory belonging to another partition.
747 */
748
749static inline void
750xpc_mark_partition_engaged(struct xpc_partition *part)
751{
752 unsigned long irq_flags;
753 AMO_t *amo = (AMO_t *)__va(part->remote_amos_page_pa +
754 (XPC_ENGAGED_PARTITIONS_AMO *
755 sizeof(AMO_t)));
756
757 local_irq_save(irq_flags);
758
759 /* set bit corresponding to our partid in remote partition's AMO */
760 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR,
761 (1UL << sn_partition_id));
762 /*
763 * We must always use the nofault function regardless of whether we
764 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
765 * didn't, we'd never know that the other partition is down and would
766 * keep sending IPIs and AMOs to it until the heartbeat times out.
767 */
768 (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
769 variable),
770 xp_nofault_PIOR_target));
771
772 local_irq_restore(irq_flags);
773}
774
775static inline void
776xpc_mark_partition_disengaged(struct xpc_partition *part)
777{
778 unsigned long irq_flags;
779 AMO_t *amo = (AMO_t *)__va(part->remote_amos_page_pa +
780 (XPC_ENGAGED_PARTITIONS_AMO *
781 sizeof(AMO_t)));
782
783 local_irq_save(irq_flags);
784
785 /* clear bit corresponding to our partid in remote partition's AMO */
786 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
787 ~(1UL << sn_partition_id));
788 /*
789 * We must always use the nofault function regardless of whether we
790 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
791 * didn't, we'd never know that the other partition is down and would
792 * keep sending IPIs and AMOs to it until the heartbeat times out.
793 */
794 (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
795 variable),
796 xp_nofault_PIOR_target));
797
798 local_irq_restore(irq_flags);
799}
800
801static inline void
802xpc_request_partition_disengage(struct xpc_partition *part)
803{
804 unsigned long irq_flags;
805 AMO_t *amo = (AMO_t *)__va(part->remote_amos_page_pa +
806 (XPC_DISENGAGE_REQUEST_AMO * sizeof(AMO_t)));
807
808 local_irq_save(irq_flags);
809
810 /* set bit corresponding to our partid in remote partition's AMO */
811 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR,
812 (1UL << sn_partition_id));
813 /*
814 * We must always use the nofault function regardless of whether we
815 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
816 * didn't, we'd never know that the other partition is down and would
817 * keep sending IPIs and AMOs to it until the heartbeat times out.
818 */
819 (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
820 variable),
821 xp_nofault_PIOR_target));
822
823 local_irq_restore(irq_flags);
824}
825
826static inline void
827xpc_cancel_partition_disengage_request(struct xpc_partition *part)
828{
829 unsigned long irq_flags;
830 AMO_t *amo = (AMO_t *)__va(part->remote_amos_page_pa +
831 (XPC_DISENGAGE_REQUEST_AMO * sizeof(AMO_t)));
832
833 local_irq_save(irq_flags);
834
835 /* clear bit corresponding to our partid in remote partition's AMO */
836 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
837 ~(1UL << sn_partition_id));
838 /*
839 * We must always use the nofault function regardless of whether we
840 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
841 * didn't, we'd never know that the other partition is down and would
842 * keep sending IPIs and AMOs to it until the heartbeat times out.
843 */
844 (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
845 variable),
846 xp_nofault_PIOR_target));
847
848 local_irq_restore(irq_flags);
849}
850
851static inline u64
852xpc_partition_engaged(u64 partid_mask)
853{
854 AMO_t *amo = xpc_vars->amos_page + XPC_ENGAGED_PARTITIONS_AMO;
855
856 /* return our partition's AMO variable ANDed with partid_mask */
857 return (FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_LOAD) &
858 partid_mask);
859}
860
861static inline u64
862xpc_partition_disengage_requested(u64 partid_mask)
863{
864 AMO_t *amo = xpc_vars->amos_page + XPC_DISENGAGE_REQUEST_AMO;
865
866 /* return our partition's AMO variable ANDed with partid_mask */
867 return (FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_LOAD) &
868 partid_mask);
869}
870
871static inline void
872xpc_clear_partition_engaged(u64 partid_mask)
873{
874 AMO_t *amo = xpc_vars->amos_page + XPC_ENGAGED_PARTITIONS_AMO;
875
876 /* clear bit(s) based on partid_mask in our partition's AMO */
877 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
878 ~partid_mask);
879}
880
881static inline void
882xpc_clear_partition_disengage_request(u64 partid_mask)
883{
884 AMO_t *amo = xpc_vars->amos_page + XPC_DISENGAGE_REQUEST_AMO;
885
886 /* clear bit(s) based on partid_mask in our partition's AMO */
887 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
888 ~partid_mask);
889}
890
891/*
892 * The following set of macros and inlines is used for the sending and
893 * receiving of IPIs (also known as IRQs). There are two flavors of IPIs,
894 * one that is associated with partition activity (SGI_XPC_ACTIVATE) and
895 * the other that is associated with channel activity (SGI_XPC_NOTIFY).
896 */
897
898static inline u64
899xpc_IPI_receive(AMO_t *amo)
900{
901 return FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_CLEAR);
902}
903
904static inline enum xpc_retval
905xpc_IPI_send(AMO_t *amo, u64 flag, int nasid, int phys_cpuid, int vector)
906{
907 int ret = 0;
908 unsigned long irq_flags;
909
910 local_irq_save(irq_flags);
911
912 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR, flag);
913 sn_send_IPI_phys(nasid, phys_cpuid, vector, 0);
914
915 /*
916 * We must always use the nofault function regardless of whether we
917 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
918 * didn't, we'd never know that the other partition is down and would
919 * keep sending IPIs and AMOs to it until the heartbeat times out.
920 */
921 ret = xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->variable),
922 xp_nofault_PIOR_target));
923
924 local_irq_restore(irq_flags);
925
926 return ((ret == 0) ? xpcSuccess : xpcPioReadError);
927}
928
929/*
930 * IPIs associated with SGI_XPC_ACTIVATE IRQ.
931 */
932
933/*
934 * Flag the appropriate AMO variable and send an IPI to the specified node.
935 */
936static inline void
937xpc_activate_IRQ_send(u64 amos_page_pa, int from_nasid, int to_nasid,
938 int to_phys_cpuid)
939{
940 int w_index = XPC_NASID_W_INDEX(from_nasid);
941 int b_index = XPC_NASID_B_INDEX(from_nasid);
942 AMO_t *amos = (AMO_t *)__va(amos_page_pa +
943 (XPC_ACTIVATE_IRQ_AMOS * sizeof(AMO_t)));
944
945 (void)xpc_IPI_send(&amos[w_index], (1UL << b_index), to_nasid,
946 to_phys_cpuid, SGI_XPC_ACTIVATE);
947}
948
949static inline void
950xpc_IPI_send_activate(struct xpc_vars *vars)
951{
952 xpc_activate_IRQ_send(vars->amos_page_pa, cnodeid_to_nasid(0),
953 vars->act_nasid, vars->act_phys_cpuid);
954}
955
956static inline void
957xpc_IPI_send_activated(struct xpc_partition *part)
958{
959 xpc_activate_IRQ_send(part->remote_amos_page_pa, cnodeid_to_nasid(0),
960 part->remote_act_nasid,
961 part->remote_act_phys_cpuid);
962}
963
964static inline void
965xpc_IPI_send_reactivate(struct xpc_partition *part)
966{
967 xpc_activate_IRQ_send(xpc_vars->amos_page_pa, part->reactivate_nasid,
968 xpc_vars->act_nasid, xpc_vars->act_phys_cpuid);
969}
970
971static inline void
972xpc_IPI_send_disengage(struct xpc_partition *part)
973{
974 xpc_activate_IRQ_send(part->remote_amos_page_pa, cnodeid_to_nasid(0),
975 part->remote_act_nasid,
976 part->remote_act_phys_cpuid);
977}
978
979/*
980 * IPIs associated with SGI_XPC_NOTIFY IRQ.
981 */
982
983/*
984 * Send an IPI to the remote partition that is associated with the
985 * specified channel.
986 */
987#define XPC_NOTIFY_IRQ_SEND(_ch, _ipi_f, _irq_f) \
988 xpc_notify_IRQ_send(_ch, _ipi_f, #_ipi_f, _irq_f)
989
990static inline void
991xpc_notify_IRQ_send(struct xpc_channel *ch, u8 ipi_flag, char *ipi_flag_string,
992 unsigned long *irq_flags)
993{
994 struct xpc_partition *part = &xpc_partitions[ch->partid];
995 enum xpc_retval ret;
996
997 if (likely(part->act_state != XPC_P_DEACTIVATING)) {
998 ret = xpc_IPI_send(part->remote_IPI_amo_va,
999 (u64)ipi_flag << (ch->number * 8),
1000 part->remote_IPI_nasid,
1001 part->remote_IPI_phys_cpuid, SGI_XPC_NOTIFY);
1002 dev_dbg(xpc_chan, "%s sent to partid=%d, channel=%d, ret=%d\n",
1003 ipi_flag_string, ch->partid, ch->number, ret);
1004 if (unlikely(ret != xpcSuccess)) {
1005 if (irq_flags != NULL)
1006 spin_unlock_irqrestore(&ch->lock, *irq_flags);
1007 XPC_DEACTIVATE_PARTITION(part, ret);
1008 if (irq_flags != NULL)
1009 spin_lock_irqsave(&ch->lock, *irq_flags);
1010 }
1011 }
1012}
1013
1014/*
1015 * Make it look like the remote partition, which is associated with the
1016 * specified channel, sent us an IPI. This faked IPI will be handled
1017 * by xpc_dropped_IPI_check().
1018 */
1019#define XPC_NOTIFY_IRQ_SEND_LOCAL(_ch, _ipi_f) \
1020 xpc_notify_IRQ_send_local(_ch, _ipi_f, #_ipi_f)
1021
1022static inline void
1023xpc_notify_IRQ_send_local(struct xpc_channel *ch, u8 ipi_flag,
1024 char *ipi_flag_string)
1025{
1026 struct xpc_partition *part = &xpc_partitions[ch->partid];
1027
1028 FETCHOP_STORE_OP(TO_AMO((u64)&part->local_IPI_amo_va->variable),
1029 FETCHOP_OR, ((u64)ipi_flag << (ch->number * 8)));
1030 dev_dbg(xpc_chan, "%s sent local from partid=%d, channel=%d\n",
1031 ipi_flag_string, ch->partid, ch->number);
1032}
1033
1034/*
1035 * The sending and receiving of IPIs includes the setting of an AMO variable
1036 * to indicate the reason the IPI was sent. The 64-bit variable is divided
1037 * up into eight bytes, ordered from right to left. Byte zero pertains to
1038 * channel 0, byte one to channel 1, and so on. Each byte is described by
1039 * the following IPI flags.
1040 */
1041
/* per-channel IPI flags; each channel owns one byte of the AMO variable */
#define	XPC_IPI_CLOSEREQUEST	0x01
#define	XPC_IPI_CLOSEREPLY	0x02
#define	XPC_IPI_OPENREQUEST	0x04
#define	XPC_IPI_OPENREPLY	0x08
#define	XPC_IPI_MSGREQUEST	0x10

/* given an AMO variable and a channel#, get its associated IPI flags */
#define XPC_GET_IPI_FLAGS(_amo, _c)    ((u8) (((_amo) >> ((_c) * 8)) & 0xff))

/*
 * OR the IPI flags _f into channel _c's byte of the AMO variable _amo.
 * The whole expansion is parenthesized so that the assignment cannot be
 * split apart by operators surrounding the macro invocation.
 */
#define XPC_SET_IPI_FLAGS(_amo, _c, _f) \
		((_amo) |= ((u64) (_f) << ((_c) * 8)))

/* non-zero if any channel has an open/close flag (low nibble) set */
#define	XPC_ANY_OPENCLOSE_IPI_FLAGS_SET(_amo) ((_amo) & 0x0f0f0f0f0f0f0f0fUL)
/* non-zero if any channel has XPC_IPI_MSGREQUEST set */
#define XPC_ANY_MSG_IPI_FLAGS_SET(_amo)       ((_amo) & 0x1010101010101010UL)
1054
1055static inline void
1056xpc_IPI_send_closerequest(struct xpc_channel *ch, unsigned long *irq_flags)
1057{
1058 struct xpc_openclose_args *args = ch->local_openclose_args;
1059
1060 args->reason = ch->reason;
1061
1062 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_CLOSEREQUEST, irq_flags);
1063}
1064
1065static inline void
1066xpc_IPI_send_closereply(struct xpc_channel *ch, unsigned long *irq_flags)
1067{
1068 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_CLOSEREPLY, irq_flags);
1069}
1070
1071static inline void
1072xpc_IPI_send_openrequest(struct xpc_channel *ch, unsigned long *irq_flags)
1073{
1074 struct xpc_openclose_args *args = ch->local_openclose_args;
1075
1076 args->msg_size = ch->msg_size;
1077 args->local_nentries = ch->local_nentries;
1078
1079 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_OPENREQUEST, irq_flags);
1080}
1081
1082static inline void
1083xpc_IPI_send_openreply(struct xpc_channel *ch, unsigned long *irq_flags)
1084{
1085 struct xpc_openclose_args *args = ch->local_openclose_args;
1086
1087 args->remote_nentries = ch->remote_nentries;
1088 args->local_nentries = ch->local_nentries;
1089 args->local_msgqueue_pa = __pa(ch->local_msgqueue);
1090
1091 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_OPENREPLY, irq_flags);
1092}
1093
1094static inline void
1095xpc_IPI_send_msgrequest(struct xpc_channel *ch)
1096{
1097 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_MSGREQUEST, NULL);
1098}
1099
1100static inline void
1101xpc_IPI_send_local_msgrequest(struct xpc_channel *ch)
1102{
1103 XPC_NOTIFY_IRQ_SEND_LOCAL(ch, XPC_IPI_MSGREQUEST);
1104}
1105
1106/*
1107 * Memory for XPC's AMO variables is allocated by the MSPEC driver. These
1108 * pages are located in the lowest granule. The lowest granule uses 4k pages
1109 * for cached references and an alternate TLB handler to never provide a
1110 * cacheable mapping for the entire region. This will prevent speculative
1111 * reading of cached copies of our lines from being issued which will cause
1112 * a PI FSB Protocol error to be generated by the SHUB. For XPC, we need 64
1113 * AMO variables (based on XP_MAX_PARTITIONS) for message notification and an
1114 * additional 128 AMO variables (based on XP_NASID_MASK_WORDS) for partition
1115 * activation and 2 AMO variables for partition deactivation.
1116 */
1117static inline AMO_t *
1118xpc_IPI_init(int index)
1119{
1120 AMO_t *amo = xpc_vars->amos_page + index;
1121
1122 (void)xpc_IPI_receive(amo); /* clear AMO variable */
1123 return amo;
1124}
1125
1126static inline enum xpc_retval
1127xpc_map_bte_errors(bte_result_t error)
1128{
1129 if (error == BTE_SUCCESS)
1130 return xpcSuccess;
1131
1132 if (is_shub2()) {
1133 if (BTE_VALID_SH2_ERROR(error))
1134 return xpcBteSh2Start + error;
1135 return xpcBteUnmappedError;
1136 }
1137 switch (error) {
1138 case BTE_SUCCESS:
1139 return xpcSuccess;
1140 case BTEFAIL_DIR:
1141 return xpcBteDirectoryError;
1142 case BTEFAIL_POISON:
1143 return xpcBtePoisonError;
1144 case BTEFAIL_WERR:
1145 return xpcBteWriteError;
1146 case BTEFAIL_ACCESS:
1147 return xpcBteAccessError;
1148 case BTEFAIL_PWERR:
1149 return xpcBtePWriteError;
1150 case BTEFAIL_PRERR:
1151 return xpcBtePReadError;
1152 case BTEFAIL_TOUT:
1153 return xpcBteTimeOutError;
1154 case BTEFAIL_XTERR:
1155 return xpcBteXtalkError;
1156 case BTEFAIL_NOTAVAIL:
1157 return xpcBteNotAvailable;
1158 default:
1159 return xpcBteUnmappedError;
1160 }
1161}
1162
1163/*
1164 * Check to see if there is any channel activity to/from the specified
1165 * partition.
1166 */
1167static inline void
1168xpc_check_for_channel_activity(struct xpc_partition *part)
1169{
1170 u64 IPI_amo;
1171 unsigned long irq_flags;
1172
1173 IPI_amo = xpc_IPI_receive(part->local_IPI_amo_va);
1174 if (IPI_amo == 0)
1175 return;
1176
1177 spin_lock_irqsave(&part->IPI_lock, irq_flags);
1178 part->local_IPI_amo |= IPI_amo;
1179 spin_unlock_irqrestore(&part->IPI_lock, irq_flags);
1180
1181 dev_dbg(xpc_chan, "received IPI from partid=%d, IPI_amo=0x%lx\n",
1182 XPC_PARTID(part), IPI_amo);
1183
1184 xpc_wakeup_channel_mgr(part);
1185}
1186
1187#endif /* _DRIVERS_MISC_SGIXP_XPC_H */
diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c
new file mode 100644
index 000000000000..bfcb9ea968e9
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_channel.c
@@ -0,0 +1,2243 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9/*
10 * Cross Partition Communication (XPC) channel support.
11 *
12 * This is the part of XPC that manages the channels and
13 * sends/receives messages across them to/from other partitions.
14 *
15 */
16
17#include <linux/kernel.h>
18#include <linux/init.h>
19#include <linux/sched.h>
20#include <linux/cache.h>
21#include <linux/interrupt.h>
22#include <linux/mutex.h>
23#include <linux/completion.h>
24#include <asm/sn/bte.h>
25#include <asm/sn/sn_sal.h>
26#include "xpc.h"
27
28/*
29 * Guarantee that the kzalloc'd memory is cacheline aligned.
30 */
31static void *
32xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
33{
34 /* see if kzalloc will give us cachline aligned memory by default */
35 *base = kzalloc(size, flags);
36 if (*base == NULL)
37 return NULL;
38
39 if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
40 return *base;
41
42 kfree(*base);
43
44 /* nope, we'll have to do it ourselves */
45 *base = kzalloc(size + L1_CACHE_BYTES, flags);
46 if (*base == NULL)
47 return NULL;
48
49 return (void *)L1_CACHE_ALIGN((u64)*base);
50}
51
52/*
53 * Set up the initial values for the XPartition Communication channels.
54 */
55static void
56xpc_initialize_channels(struct xpc_partition *part, partid_t partid)
57{
58 int ch_number;
59 struct xpc_channel *ch;
60
61 for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
62 ch = &part->channels[ch_number];
63
64 ch->partid = partid;
65 ch->number = ch_number;
66 ch->flags = XPC_C_DISCONNECTED;
67
68 ch->local_GP = &part->local_GPs[ch_number];
69 ch->local_openclose_args =
70 &part->local_openclose_args[ch_number];
71
72 atomic_set(&ch->kthreads_assigned, 0);
73 atomic_set(&ch->kthreads_idle, 0);
74 atomic_set(&ch->kthreads_active, 0);
75
76 atomic_set(&ch->references, 0);
77 atomic_set(&ch->n_to_notify, 0);
78
79 spin_lock_init(&ch->lock);
80 mutex_init(&ch->msg_to_pull_mutex);
81 init_completion(&ch->wdisconnect_wait);
82
83 atomic_set(&ch->n_on_msg_allocate_wq, 0);
84 init_waitqueue_head(&ch->msg_allocate_wq);
85 init_waitqueue_head(&ch->idle_wq);
86 }
87}
88
89/*
90 * Setup the infrastructure necessary to support XPartition Communication
91 * between the specified remote partition and the local one.
92 */
93enum xpc_retval
94xpc_setup_infrastructure(struct xpc_partition *part)
95{
96 int ret, cpuid;
97 struct timer_list *timer;
98 partid_t partid = XPC_PARTID(part);
99
100 /*
101 * Zero out MOST of the entry for this partition. Only the fields
102 * starting with `nchannels' will be zeroed. The preceding fields must
103 * remain `viable' across partition ups and downs, since they may be
104 * referenced during this memset() operation.
105 */
106 memset(&part->nchannels, 0, sizeof(struct xpc_partition) -
107 offsetof(struct xpc_partition, nchannels));
108
109 /*
110 * Allocate all of the channel structures as a contiguous chunk of
111 * memory.
112 */
113 part->channels = kzalloc(sizeof(struct xpc_channel) * XPC_NCHANNELS,
114 GFP_KERNEL);
115 if (part->channels == NULL) {
116 dev_err(xpc_chan, "can't get memory for channels\n");
117 return xpcNoMemory;
118 }
119
120 part->nchannels = XPC_NCHANNELS;
121
122 /* allocate all the required GET/PUT values */
123
124 part->local_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE,
125 GFP_KERNEL,
126 &part->local_GPs_base);
127 if (part->local_GPs == NULL) {
128 kfree(part->channels);
129 part->channels = NULL;
130 dev_err(xpc_chan, "can't get memory for local get/put "
131 "values\n");
132 return xpcNoMemory;
133 }
134
135 part->remote_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE,
136 GFP_KERNEL,
137 &part->
138 remote_GPs_base);
139 if (part->remote_GPs == NULL) {
140 dev_err(xpc_chan, "can't get memory for remote get/put "
141 "values\n");
142 kfree(part->local_GPs_base);
143 part->local_GPs = NULL;
144 kfree(part->channels);
145 part->channels = NULL;
146 return xpcNoMemory;
147 }
148
149 /* allocate all the required open and close args */
150
151 part->local_openclose_args =
152 xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL,
153 &part->local_openclose_args_base);
154 if (part->local_openclose_args == NULL) {
155 dev_err(xpc_chan, "can't get memory for local connect args\n");
156 kfree(part->remote_GPs_base);
157 part->remote_GPs = NULL;
158 kfree(part->local_GPs_base);
159 part->local_GPs = NULL;
160 kfree(part->channels);
161 part->channels = NULL;
162 return xpcNoMemory;
163 }
164
165 part->remote_openclose_args =
166 xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL,
167 &part->remote_openclose_args_base);
168 if (part->remote_openclose_args == NULL) {
169 dev_err(xpc_chan, "can't get memory for remote connect args\n");
170 kfree(part->local_openclose_args_base);
171 part->local_openclose_args = NULL;
172 kfree(part->remote_GPs_base);
173 part->remote_GPs = NULL;
174 kfree(part->local_GPs_base);
175 part->local_GPs = NULL;
176 kfree(part->channels);
177 part->channels = NULL;
178 return xpcNoMemory;
179 }
180
181 xpc_initialize_channels(part, partid);
182
183 atomic_set(&part->nchannels_active, 0);
184 atomic_set(&part->nchannels_engaged, 0);
185
186 /* local_IPI_amo were set to 0 by an earlier memset() */
187
188 /* Initialize this partitions AMO_t structure */
189 part->local_IPI_amo_va = xpc_IPI_init(partid);
190
191 spin_lock_init(&part->IPI_lock);
192
193 atomic_set(&part->channel_mgr_requests, 1);
194 init_waitqueue_head(&part->channel_mgr_wq);
195
196 sprintf(part->IPI_owner, "xpc%02d", partid);
197 ret = request_irq(SGI_XPC_NOTIFY, xpc_notify_IRQ_handler, IRQF_SHARED,
198 part->IPI_owner, (void *)(u64)partid);
199 if (ret != 0) {
200 dev_err(xpc_chan, "can't register NOTIFY IRQ handler, "
201 "errno=%d\n", -ret);
202 kfree(part->remote_openclose_args_base);
203 part->remote_openclose_args = NULL;
204 kfree(part->local_openclose_args_base);
205 part->local_openclose_args = NULL;
206 kfree(part->remote_GPs_base);
207 part->remote_GPs = NULL;
208 kfree(part->local_GPs_base);
209 part->local_GPs = NULL;
210 kfree(part->channels);
211 part->channels = NULL;
212 return xpcLackOfResources;
213 }
214
215 /* Setup a timer to check for dropped IPIs */
216 timer = &part->dropped_IPI_timer;
217 init_timer(timer);
218 timer->function = (void (*)(unsigned long))xpc_dropped_IPI_check;
219 timer->data = (unsigned long)part;
220 timer->expires = jiffies + XPC_P_DROPPED_IPI_WAIT;
221 add_timer(timer);
222
223 /*
224 * With the setting of the partition setup_state to XPC_P_SETUP, we're
225 * declaring that this partition is ready to go.
226 */
227 part->setup_state = XPC_P_SETUP;
228
229 /*
230 * Setup the per partition specific variables required by the
231 * remote partition to establish channel connections with us.
232 *
233 * The setting of the magic # indicates that these per partition
234 * specific variables are ready to be used.
235 */
236 xpc_vars_part[partid].GPs_pa = __pa(part->local_GPs);
237 xpc_vars_part[partid].openclose_args_pa =
238 __pa(part->local_openclose_args);
239 xpc_vars_part[partid].IPI_amo_pa = __pa(part->local_IPI_amo_va);
240 cpuid = raw_smp_processor_id(); /* any CPU in this partition will do */
241 xpc_vars_part[partid].IPI_nasid = cpuid_to_nasid(cpuid);
242 xpc_vars_part[partid].IPI_phys_cpuid = cpu_physical_id(cpuid);
243 xpc_vars_part[partid].nchannels = part->nchannels;
244 xpc_vars_part[partid].magic = XPC_VP_MAGIC1;
245
246 return xpcSuccess;
247}
248
249/*
250 * Create a wrapper that hides the underlying mechanism for pulling a cacheline
251 * (or multiple cachelines) from a remote partition.
252 *
253 * src must be a cacheline aligned physical address on the remote partition.
254 * dst must be a cacheline aligned virtual address on this partition.
255 * cnt must be an cacheline sized
256 */
257static enum xpc_retval
258xpc_pull_remote_cachelines(struct xpc_partition *part, void *dst,
259 const void *src, size_t cnt)
260{
261 bte_result_t bte_ret;
262
263 DBUG_ON((u64)src != L1_CACHE_ALIGN((u64)src));
264 DBUG_ON((u64)dst != L1_CACHE_ALIGN((u64)dst));
265 DBUG_ON(cnt != L1_CACHE_ALIGN(cnt));
266
267 if (part->act_state == XPC_P_DEACTIVATING)
268 return part->reason;
269
270 bte_ret = xp_bte_copy((u64)src, (u64)dst, (u64)cnt,
271 (BTE_NORMAL | BTE_WACQUIRE), NULL);
272 if (bte_ret == BTE_SUCCESS)
273 return xpcSuccess;
274
275 dev_dbg(xpc_chan, "xp_bte_copy() from partition %d failed, ret=%d\n",
276 XPC_PARTID(part), bte_ret);
277
278 return xpc_map_bte_errors(bte_ret);
279}
280
281/*
282 * Pull the remote per partition specific variables from the specified
283 * partition.
284 */
285enum xpc_retval
286xpc_pull_remote_vars_part(struct xpc_partition *part)
287{
288 u8 buffer[L1_CACHE_BYTES * 2];
289 struct xpc_vars_part *pulled_entry_cacheline =
290 (struct xpc_vars_part *)L1_CACHE_ALIGN((u64)buffer);
291 struct xpc_vars_part *pulled_entry;
292 u64 remote_entry_cacheline_pa, remote_entry_pa;
293 partid_t partid = XPC_PARTID(part);
294 enum xpc_retval ret;
295
296 /* pull the cacheline that contains the variables we're interested in */
297
298 DBUG_ON(part->remote_vars_part_pa !=
299 L1_CACHE_ALIGN(part->remote_vars_part_pa));
300 DBUG_ON(sizeof(struct xpc_vars_part) != L1_CACHE_BYTES / 2);
301
302 remote_entry_pa = part->remote_vars_part_pa +
303 sn_partition_id * sizeof(struct xpc_vars_part);
304
305 remote_entry_cacheline_pa = (remote_entry_pa & ~(L1_CACHE_BYTES - 1));
306
307 pulled_entry = (struct xpc_vars_part *)((u64)pulled_entry_cacheline +
308 (remote_entry_pa &
309 (L1_CACHE_BYTES - 1)));
310
311 ret = xpc_pull_remote_cachelines(part, pulled_entry_cacheline,
312 (void *)remote_entry_cacheline_pa,
313 L1_CACHE_BYTES);
314 if (ret != xpcSuccess) {
315 dev_dbg(xpc_chan, "failed to pull XPC vars_part from "
316 "partition %d, ret=%d\n", partid, ret);
317 return ret;
318 }
319
320 /* see if they've been set up yet */
321
322 if (pulled_entry->magic != XPC_VP_MAGIC1 &&
323 pulled_entry->magic != XPC_VP_MAGIC2) {
324
325 if (pulled_entry->magic != 0) {
326 dev_dbg(xpc_chan, "partition %d's XPC vars_part for "
327 "partition %d has bad magic value (=0x%lx)\n",
328 partid, sn_partition_id, pulled_entry->magic);
329 return xpcBadMagic;
330 }
331
332 /* they've not been initialized yet */
333 return xpcRetry;
334 }
335
336 if (xpc_vars_part[partid].magic == XPC_VP_MAGIC1) {
337
338 /* validate the variables */
339
340 if (pulled_entry->GPs_pa == 0 ||
341 pulled_entry->openclose_args_pa == 0 ||
342 pulled_entry->IPI_amo_pa == 0) {
343
344 dev_err(xpc_chan, "partition %d's XPC vars_part for "
345 "partition %d are not valid\n", partid,
346 sn_partition_id);
347 return xpcInvalidAddress;
348 }
349
350 /* the variables we imported look to be valid */
351
352 part->remote_GPs_pa = pulled_entry->GPs_pa;
353 part->remote_openclose_args_pa =
354 pulled_entry->openclose_args_pa;
355 part->remote_IPI_amo_va =
356 (AMO_t *)__va(pulled_entry->IPI_amo_pa);
357 part->remote_IPI_nasid = pulled_entry->IPI_nasid;
358 part->remote_IPI_phys_cpuid = pulled_entry->IPI_phys_cpuid;
359
360 if (part->nchannels > pulled_entry->nchannels)
361 part->nchannels = pulled_entry->nchannels;
362
363 /* let the other side know that we've pulled their variables */
364
365 xpc_vars_part[partid].magic = XPC_VP_MAGIC2;
366 }
367
368 if (pulled_entry->magic == XPC_VP_MAGIC1)
369 return xpcRetry;
370
371 return xpcSuccess;
372}
373
374/*
375 * Get the IPI flags and pull the openclose args and/or remote GPs as needed.
376 */
377static u64
378xpc_get_IPI_flags(struct xpc_partition *part)
379{
380 unsigned long irq_flags;
381 u64 IPI_amo;
382 enum xpc_retval ret;
383
384 /*
385 * See if there are any IPI flags to be handled.
386 */
387
388 spin_lock_irqsave(&part->IPI_lock, irq_flags);
389 IPI_amo = part->local_IPI_amo;
390 if (IPI_amo != 0)
391 part->local_IPI_amo = 0;
392
393 spin_unlock_irqrestore(&part->IPI_lock, irq_flags);
394
395 if (XPC_ANY_OPENCLOSE_IPI_FLAGS_SET(IPI_amo)) {
396 ret = xpc_pull_remote_cachelines(part,
397 part->remote_openclose_args,
398 (void *)part->
399 remote_openclose_args_pa,
400 XPC_OPENCLOSE_ARGS_SIZE);
401 if (ret != xpcSuccess) {
402 XPC_DEACTIVATE_PARTITION(part, ret);
403
404 dev_dbg(xpc_chan, "failed to pull openclose args from "
405 "partition %d, ret=%d\n", XPC_PARTID(part),
406 ret);
407
408 /* don't bother processing IPIs anymore */
409 IPI_amo = 0;
410 }
411 }
412
413 if (XPC_ANY_MSG_IPI_FLAGS_SET(IPI_amo)) {
414 ret = xpc_pull_remote_cachelines(part, part->remote_GPs,
415 (void *)part->remote_GPs_pa,
416 XPC_GP_SIZE);
417 if (ret != xpcSuccess) {
418 XPC_DEACTIVATE_PARTITION(part, ret);
419
420 dev_dbg(xpc_chan, "failed to pull GPs from partition "
421 "%d, ret=%d\n", XPC_PARTID(part), ret);
422
423 /* don't bother processing IPIs anymore */
424 IPI_amo = 0;
425 }
426 }
427
428 return IPI_amo;
429}
430
431/*
432 * Allocate the local message queue and the notify queue.
433 */
434static enum xpc_retval
435xpc_allocate_local_msgqueue(struct xpc_channel *ch)
436{
437 unsigned long irq_flags;
438 int nentries;
439 size_t nbytes;
440
441 for (nentries = ch->local_nentries; nentries > 0; nentries--) {
442
443 nbytes = nentries * ch->msg_size;
444 ch->local_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes,
445 GFP_KERNEL,
446 &ch->local_msgqueue_base);
447 if (ch->local_msgqueue == NULL)
448 continue;
449
450 nbytes = nentries * sizeof(struct xpc_notify);
451 ch->notify_queue = kzalloc(nbytes, GFP_KERNEL);
452 if (ch->notify_queue == NULL) {
453 kfree(ch->local_msgqueue_base);
454 ch->local_msgqueue = NULL;
455 continue;
456 }
457
458 spin_lock_irqsave(&ch->lock, irq_flags);
459 if (nentries < ch->local_nentries) {
460 dev_dbg(xpc_chan, "nentries=%d local_nentries=%d, "
461 "partid=%d, channel=%d\n", nentries,
462 ch->local_nentries, ch->partid, ch->number);
463
464 ch->local_nentries = nentries;
465 }
466 spin_unlock_irqrestore(&ch->lock, irq_flags);
467 return xpcSuccess;
468 }
469
470 dev_dbg(xpc_chan, "can't get memory for local message queue and notify "
471 "queue, partid=%d, channel=%d\n", ch->partid, ch->number);
472 return xpcNoMemory;
473}
474
475/*
476 * Allocate the cached remote message queue.
477 */
478static enum xpc_retval
479xpc_allocate_remote_msgqueue(struct xpc_channel *ch)
480{
481 unsigned long irq_flags;
482 int nentries;
483 size_t nbytes;
484
485 DBUG_ON(ch->remote_nentries <= 0);
486
487 for (nentries = ch->remote_nentries; nentries > 0; nentries--) {
488
489 nbytes = nentries * ch->msg_size;
490 ch->remote_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes,
491 GFP_KERNEL,
492 &ch->remote_msgqueue_base);
493 if (ch->remote_msgqueue == NULL)
494 continue;
495
496 spin_lock_irqsave(&ch->lock, irq_flags);
497 if (nentries < ch->remote_nentries) {
498 dev_dbg(xpc_chan, "nentries=%d remote_nentries=%d, "
499 "partid=%d, channel=%d\n", nentries,
500 ch->remote_nentries, ch->partid, ch->number);
501
502 ch->remote_nentries = nentries;
503 }
504 spin_unlock_irqrestore(&ch->lock, irq_flags);
505 return xpcSuccess;
506 }
507
508 dev_dbg(xpc_chan, "can't get memory for cached remote message queue, "
509 "partid=%d, channel=%d\n", ch->partid, ch->number);
510 return xpcNoMemory;
511}
512
513/*
514 * Allocate message queues and other stuff associated with a channel.
515 *
516 * Note: Assumes all of the channel sizes are filled in.
517 */
518static enum xpc_retval
519xpc_allocate_msgqueues(struct xpc_channel *ch)
520{
521 unsigned long irq_flags;
522 enum xpc_retval ret;
523
524 DBUG_ON(ch->flags & XPC_C_SETUP);
525
526 ret = xpc_allocate_local_msgqueue(ch);
527 if (ret != xpcSuccess)
528 return ret;
529
530 ret = xpc_allocate_remote_msgqueue(ch);
531 if (ret != xpcSuccess) {
532 kfree(ch->local_msgqueue_base);
533 ch->local_msgqueue = NULL;
534 kfree(ch->notify_queue);
535 ch->notify_queue = NULL;
536 return ret;
537 }
538
539 spin_lock_irqsave(&ch->lock, irq_flags);
540 ch->flags |= XPC_C_SETUP;
541 spin_unlock_irqrestore(&ch->lock, irq_flags);
542
543 return xpcSuccess;
544}
545
546/*
547 * Process a connect message from a remote partition.
548 *
549 * Note: xpc_process_connect() is expecting to be called with the
550 * spin_lock_irqsave held and will leave it locked upon return.
551 */
552static void
553xpc_process_connect(struct xpc_channel *ch, unsigned long *irq_flags)
554{
555 enum xpc_retval ret;
556
557 DBUG_ON(!spin_is_locked(&ch->lock));
558
559 if (!(ch->flags & XPC_C_OPENREQUEST) ||
560 !(ch->flags & XPC_C_ROPENREQUEST)) {
561 /* nothing more to do for now */
562 return;
563 }
564 DBUG_ON(!(ch->flags & XPC_C_CONNECTING));
565
566 if (!(ch->flags & XPC_C_SETUP)) {
567 spin_unlock_irqrestore(&ch->lock, *irq_flags);
568 ret = xpc_allocate_msgqueues(ch);
569 spin_lock_irqsave(&ch->lock, *irq_flags);
570
571 if (ret != xpcSuccess)
572 XPC_DISCONNECT_CHANNEL(ch, ret, irq_flags);
573
574 if (ch->flags & (XPC_C_CONNECTED | XPC_C_DISCONNECTING))
575 return;
576
577 DBUG_ON(!(ch->flags & XPC_C_SETUP));
578 DBUG_ON(ch->local_msgqueue == NULL);
579 DBUG_ON(ch->remote_msgqueue == NULL);
580 }
581
582 if (!(ch->flags & XPC_C_OPENREPLY)) {
583 ch->flags |= XPC_C_OPENREPLY;
584 xpc_IPI_send_openreply(ch, irq_flags);
585 }
586
587 if (!(ch->flags & XPC_C_ROPENREPLY))
588 return;
589
590 DBUG_ON(ch->remote_msgqueue_pa == 0);
591
592 ch->flags = (XPC_C_CONNECTED | XPC_C_SETUP); /* clear all else */
593
594 dev_info(xpc_chan, "channel %d to partition %d connected\n",
595 ch->number, ch->partid);
596
597 spin_unlock_irqrestore(&ch->lock, *irq_flags);
598 xpc_create_kthreads(ch, 1, 0);
599 spin_lock_irqsave(&ch->lock, *irq_flags);
600}
601
602/*
603 * Notify those who wanted to be notified upon delivery of their message.
604 */
605static void
606xpc_notify_senders(struct xpc_channel *ch, enum xpc_retval reason, s64 put)
607{
608 struct xpc_notify *notify;
609 u8 notify_type;
610 s64 get = ch->w_remote_GP.get - 1;
611
612 while (++get < put && atomic_read(&ch->n_to_notify) > 0) {
613
614 notify = &ch->notify_queue[get % ch->local_nentries];
615
616 /*
617 * See if the notify entry indicates it was associated with
618 * a message who's sender wants to be notified. It is possible
619 * that it is, but someone else is doing or has done the
620 * notification.
621 */
622 notify_type = notify->type;
623 if (notify_type == 0 ||
624 cmpxchg(&notify->type, notify_type, 0) != notify_type) {
625 continue;
626 }
627
628 DBUG_ON(notify_type != XPC_N_CALL);
629
630 atomic_dec(&ch->n_to_notify);
631
632 if (notify->func != NULL) {
633 dev_dbg(xpc_chan, "notify->func() called, notify=0x%p, "
634 "msg_number=%ld, partid=%d, channel=%d\n",
635 (void *)notify, get, ch->partid, ch->number);
636
637 notify->func(reason, ch->partid, ch->number,
638 notify->key);
639
640 dev_dbg(xpc_chan, "notify->func() returned, "
641 "notify=0x%p, msg_number=%ld, partid=%d, "
642 "channel=%d\n", (void *)notify, get,
643 ch->partid, ch->number);
644 }
645 }
646}
647
648/*
649 * Free up message queues and other stuff that were allocated for the specified
650 * channel.
651 *
652 * Note: ch->reason and ch->reason_line are left set for debugging purposes,
653 * they're cleared when XPC_C_DISCONNECTED is cleared.
654 */
655static void
656xpc_free_msgqueues(struct xpc_channel *ch)
657{
658 DBUG_ON(!spin_is_locked(&ch->lock));
659 DBUG_ON(atomic_read(&ch->n_to_notify) != 0);
660
661 ch->remote_msgqueue_pa = 0;
662 ch->func = NULL;
663 ch->key = NULL;
664 ch->msg_size = 0;
665 ch->local_nentries = 0;
666 ch->remote_nentries = 0;
667 ch->kthreads_assigned_limit = 0;
668 ch->kthreads_idle_limit = 0;
669
670 ch->local_GP->get = 0;
671 ch->local_GP->put = 0;
672 ch->remote_GP.get = 0;
673 ch->remote_GP.put = 0;
674 ch->w_local_GP.get = 0;
675 ch->w_local_GP.put = 0;
676 ch->w_remote_GP.get = 0;
677 ch->w_remote_GP.put = 0;
678 ch->next_msg_to_pull = 0;
679
680 if (ch->flags & XPC_C_SETUP) {
681 ch->flags &= ~XPC_C_SETUP;
682
683 dev_dbg(xpc_chan, "ch->flags=0x%x, partid=%d, channel=%d\n",
684 ch->flags, ch->partid, ch->number);
685
686 kfree(ch->local_msgqueue_base);
687 ch->local_msgqueue = NULL;
688 kfree(ch->remote_msgqueue_base);
689 ch->remote_msgqueue = NULL;
690 kfree(ch->notify_queue);
691 ch->notify_queue = NULL;
692 }
693}
694
695/*
696 * spin_lock_irqsave() is expected to be held on entry.
697 */
698static void
699xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags)
700{
701 struct xpc_partition *part = &xpc_partitions[ch->partid];
702 u32 channel_was_connected = (ch->flags & XPC_C_WASCONNECTED);
703
704 DBUG_ON(!spin_is_locked(&ch->lock));
705
706 if (!(ch->flags & XPC_C_DISCONNECTING))
707 return;
708
709 DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
710
711 /* make sure all activity has settled down first */
712
713 if (atomic_read(&ch->kthreads_assigned) > 0 ||
714 atomic_read(&ch->references) > 0) {
715 return;
716 }
717 DBUG_ON((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
718 !(ch->flags & XPC_C_DISCONNECTINGCALLOUT_MADE));
719
720 if (part->act_state == XPC_P_DEACTIVATING) {
721 /* can't proceed until the other side disengages from us */
722 if (xpc_partition_engaged(1UL << ch->partid))
723 return;
724
725 } else {
726
727 /* as long as the other side is up do the full protocol */
728
729 if (!(ch->flags & XPC_C_RCLOSEREQUEST))
730 return;
731
732 if (!(ch->flags & XPC_C_CLOSEREPLY)) {
733 ch->flags |= XPC_C_CLOSEREPLY;
734 xpc_IPI_send_closereply(ch, irq_flags);
735 }
736
737 if (!(ch->flags & XPC_C_RCLOSEREPLY))
738 return;
739 }
740
741 /* wake those waiting for notify completion */
742 if (atomic_read(&ch->n_to_notify) > 0) {
743 /* >>> we do callout while holding ch->lock */
744 xpc_notify_senders(ch, ch->reason, ch->w_local_GP.put);
745 }
746
747 /* both sides are disconnected now */
748
749 if (ch->flags & XPC_C_DISCONNECTINGCALLOUT_MADE) {
750 spin_unlock_irqrestore(&ch->lock, *irq_flags);
751 xpc_disconnect_callout(ch, xpcDisconnected);
752 spin_lock_irqsave(&ch->lock, *irq_flags);
753 }
754
755 /* it's now safe to free the channel's message queues */
756 xpc_free_msgqueues(ch);
757
758 /* mark disconnected, clear all other flags except XPC_C_WDISCONNECT */
759 ch->flags = (XPC_C_DISCONNECTED | (ch->flags & XPC_C_WDISCONNECT));
760
761 atomic_dec(&part->nchannels_active);
762
763 if (channel_was_connected) {
764 dev_info(xpc_chan, "channel %d to partition %d disconnected, "
765 "reason=%d\n", ch->number, ch->partid, ch->reason);
766 }
767
768 if (ch->flags & XPC_C_WDISCONNECT) {
769 /* we won't lose the CPU since we're holding ch->lock */
770 complete(&ch->wdisconnect_wait);
771 } else if (ch->delayed_IPI_flags) {
772 if (part->act_state != XPC_P_DEACTIVATING) {
773 /* time to take action on any delayed IPI flags */
774 spin_lock(&part->IPI_lock);
775 XPC_SET_IPI_FLAGS(part->local_IPI_amo, ch->number,
776 ch->delayed_IPI_flags);
777 spin_unlock(&part->IPI_lock);
778 }
779 ch->delayed_IPI_flags = 0;
780 }
781}
782
783/*
784 * Process a change in the channel's remote connection state.
785 */
786static void
787xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number,
788 u8 IPI_flags)
789{
790 unsigned long irq_flags;
791 struct xpc_openclose_args *args =
792 &part->remote_openclose_args[ch_number];
793 struct xpc_channel *ch = &part->channels[ch_number];
794 enum xpc_retval reason;
795
796 spin_lock_irqsave(&ch->lock, irq_flags);
797
798again:
799
800 if ((ch->flags & XPC_C_DISCONNECTED) &&
801 (ch->flags & XPC_C_WDISCONNECT)) {
802 /*
803 * Delay processing IPI flags until thread waiting disconnect
804 * has had a chance to see that the channel is disconnected.
805 */
806 ch->delayed_IPI_flags |= IPI_flags;
807 spin_unlock_irqrestore(&ch->lock, irq_flags);
808 return;
809 }
810
811 if (IPI_flags & XPC_IPI_CLOSEREQUEST) {
812
813 dev_dbg(xpc_chan, "XPC_IPI_CLOSEREQUEST (reason=%d) received "
814 "from partid=%d, channel=%d\n", args->reason,
815 ch->partid, ch->number);
816
817 /*
818 * If RCLOSEREQUEST is set, we're probably waiting for
819 * RCLOSEREPLY. We should find it and a ROPENREQUEST packed
820 * with this RCLOSEREQUEST in the IPI_flags.
821 */
822
823 if (ch->flags & XPC_C_RCLOSEREQUEST) {
824 DBUG_ON(!(ch->flags & XPC_C_DISCONNECTING));
825 DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
826 DBUG_ON(!(ch->flags & XPC_C_CLOSEREPLY));
827 DBUG_ON(ch->flags & XPC_C_RCLOSEREPLY);
828
829 DBUG_ON(!(IPI_flags & XPC_IPI_CLOSEREPLY));
830 IPI_flags &= ~XPC_IPI_CLOSEREPLY;
831 ch->flags |= XPC_C_RCLOSEREPLY;
832
833 /* both sides have finished disconnecting */
834 xpc_process_disconnect(ch, &irq_flags);
835 DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
836 goto again;
837 }
838
839 if (ch->flags & XPC_C_DISCONNECTED) {
840 if (!(IPI_flags & XPC_IPI_OPENREQUEST)) {
841 if ((XPC_GET_IPI_FLAGS(part->local_IPI_amo,
842 ch_number) &
843 XPC_IPI_OPENREQUEST)) {
844
845 DBUG_ON(ch->delayed_IPI_flags != 0);
846 spin_lock(&part->IPI_lock);
847 XPC_SET_IPI_FLAGS(part->local_IPI_amo,
848 ch_number,
849 XPC_IPI_CLOSEREQUEST);
850 spin_unlock(&part->IPI_lock);
851 }
852 spin_unlock_irqrestore(&ch->lock, irq_flags);
853 return;
854 }
855
856 XPC_SET_REASON(ch, 0, 0);
857 ch->flags &= ~XPC_C_DISCONNECTED;
858
859 atomic_inc(&part->nchannels_active);
860 ch->flags |= (XPC_C_CONNECTING | XPC_C_ROPENREQUEST);
861 }
862
863 IPI_flags &= ~(XPC_IPI_OPENREQUEST | XPC_IPI_OPENREPLY);
864
865 /*
866 * The meaningful CLOSEREQUEST connection state fields are:
867 * reason = reason connection is to be closed
868 */
869
870 ch->flags |= XPC_C_RCLOSEREQUEST;
871
872 if (!(ch->flags & XPC_C_DISCONNECTING)) {
873 reason = args->reason;
874 if (reason <= xpcSuccess || reason > xpcUnknownReason)
875 reason = xpcUnknownReason;
876 else if (reason == xpcUnregistering)
877 reason = xpcOtherUnregistering;
878
879 XPC_DISCONNECT_CHANNEL(ch, reason, &irq_flags);
880
881 DBUG_ON(IPI_flags & XPC_IPI_CLOSEREPLY);
882 spin_unlock_irqrestore(&ch->lock, irq_flags);
883 return;
884 }
885
886 xpc_process_disconnect(ch, &irq_flags);
887 }
888
889 if (IPI_flags & XPC_IPI_CLOSEREPLY) {
890
891 dev_dbg(xpc_chan, "XPC_IPI_CLOSEREPLY received from partid=%d,"
892 " channel=%d\n", ch->partid, ch->number);
893
894 if (ch->flags & XPC_C_DISCONNECTED) {
895 DBUG_ON(part->act_state != XPC_P_DEACTIVATING);
896 spin_unlock_irqrestore(&ch->lock, irq_flags);
897 return;
898 }
899
900 DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
901
902 if (!(ch->flags & XPC_C_RCLOSEREQUEST)) {
903 if ((XPC_GET_IPI_FLAGS(part->local_IPI_amo, ch_number)
904 & XPC_IPI_CLOSEREQUEST)) {
905
906 DBUG_ON(ch->delayed_IPI_flags != 0);
907 spin_lock(&part->IPI_lock);
908 XPC_SET_IPI_FLAGS(part->local_IPI_amo,
909 ch_number,
910 XPC_IPI_CLOSEREPLY);
911 spin_unlock(&part->IPI_lock);
912 }
913 spin_unlock_irqrestore(&ch->lock, irq_flags);
914 return;
915 }
916
917 ch->flags |= XPC_C_RCLOSEREPLY;
918
919 if (ch->flags & XPC_C_CLOSEREPLY) {
920 /* both sides have finished disconnecting */
921 xpc_process_disconnect(ch, &irq_flags);
922 }
923 }
924
925 if (IPI_flags & XPC_IPI_OPENREQUEST) {
926
927 dev_dbg(xpc_chan, "XPC_IPI_OPENREQUEST (msg_size=%d, "
928 "local_nentries=%d) received from partid=%d, "
929 "channel=%d\n", args->msg_size, args->local_nentries,
930 ch->partid, ch->number);
931
932 if (part->act_state == XPC_P_DEACTIVATING ||
933 (ch->flags & XPC_C_ROPENREQUEST)) {
934 spin_unlock_irqrestore(&ch->lock, irq_flags);
935 return;
936 }
937
938 if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_WDISCONNECT)) {
939 ch->delayed_IPI_flags |= XPC_IPI_OPENREQUEST;
940 spin_unlock_irqrestore(&ch->lock, irq_flags);
941 return;
942 }
943 DBUG_ON(!(ch->flags & (XPC_C_DISCONNECTED |
944 XPC_C_OPENREQUEST)));
945 DBUG_ON(ch->flags & (XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY |
946 XPC_C_OPENREPLY | XPC_C_CONNECTED));
947
948 /*
949 * The meaningful OPENREQUEST connection state fields are:
950 * msg_size = size of channel's messages in bytes
951 * local_nentries = remote partition's local_nentries
952 */
953 if (args->msg_size == 0 || args->local_nentries == 0) {
954 /* assume OPENREQUEST was delayed by mistake */
955 spin_unlock_irqrestore(&ch->lock, irq_flags);
956 return;
957 }
958
959 ch->flags |= (XPC_C_ROPENREQUEST | XPC_C_CONNECTING);
960 ch->remote_nentries = args->local_nentries;
961
962 if (ch->flags & XPC_C_OPENREQUEST) {
963 if (args->msg_size != ch->msg_size) {
964 XPC_DISCONNECT_CHANNEL(ch, xpcUnequalMsgSizes,
965 &irq_flags);
966 spin_unlock_irqrestore(&ch->lock, irq_flags);
967 return;
968 }
969 } else {
970 ch->msg_size = args->msg_size;
971
972 XPC_SET_REASON(ch, 0, 0);
973 ch->flags &= ~XPC_C_DISCONNECTED;
974
975 atomic_inc(&part->nchannels_active);
976 }
977
978 xpc_process_connect(ch, &irq_flags);
979 }
980
981 if (IPI_flags & XPC_IPI_OPENREPLY) {
982
983 dev_dbg(xpc_chan, "XPC_IPI_OPENREPLY (local_msgqueue_pa=0x%lx, "
984 "local_nentries=%d, remote_nentries=%d) received from "
985 "partid=%d, channel=%d\n", args->local_msgqueue_pa,
986 args->local_nentries, args->remote_nentries,
987 ch->partid, ch->number);
988
989 if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED)) {
990 spin_unlock_irqrestore(&ch->lock, irq_flags);
991 return;
992 }
993 if (!(ch->flags & XPC_C_OPENREQUEST)) {
994 XPC_DISCONNECT_CHANNEL(ch, xpcOpenCloseError,
995 &irq_flags);
996 spin_unlock_irqrestore(&ch->lock, irq_flags);
997 return;
998 }
999
1000 DBUG_ON(!(ch->flags & XPC_C_ROPENREQUEST));
1001 DBUG_ON(ch->flags & XPC_C_CONNECTED);
1002
1003 /*
1004 * The meaningful OPENREPLY connection state fields are:
1005 * local_msgqueue_pa = physical address of remote
1006 * partition's local_msgqueue
1007 * local_nentries = remote partition's local_nentries
1008 * remote_nentries = remote partition's remote_nentries
1009 */
1010 DBUG_ON(args->local_msgqueue_pa == 0);
1011 DBUG_ON(args->local_nentries == 0);
1012 DBUG_ON(args->remote_nentries == 0);
1013
1014 ch->flags |= XPC_C_ROPENREPLY;
1015 ch->remote_msgqueue_pa = args->local_msgqueue_pa;
1016
1017 if (args->local_nentries < ch->remote_nentries) {
1018 dev_dbg(xpc_chan, "XPC_IPI_OPENREPLY: new "
1019 "remote_nentries=%d, old remote_nentries=%d, "
1020 "partid=%d, channel=%d\n",
1021 args->local_nentries, ch->remote_nentries,
1022 ch->partid, ch->number);
1023
1024 ch->remote_nentries = args->local_nentries;
1025 }
1026 if (args->remote_nentries < ch->local_nentries) {
1027 dev_dbg(xpc_chan, "XPC_IPI_OPENREPLY: new "
1028 "local_nentries=%d, old local_nentries=%d, "
1029 "partid=%d, channel=%d\n",
1030 args->remote_nentries, ch->local_nentries,
1031 ch->partid, ch->number);
1032
1033 ch->local_nentries = args->remote_nentries;
1034 }
1035
1036 xpc_process_connect(ch, &irq_flags);
1037 }
1038
1039 spin_unlock_irqrestore(&ch->lock, irq_flags);
1040}
1041
1042/*
1043 * Attempt to establish a channel connection to a remote partition.
1044 */
1045static enum xpc_retval
1046xpc_connect_channel(struct xpc_channel *ch)
1047{
1048 unsigned long irq_flags;
1049 struct xpc_registration *registration = &xpc_registrations[ch->number];
1050
1051 if (mutex_trylock(&registration->mutex) == 0)
1052 return xpcRetry;
1053
1054 if (!XPC_CHANNEL_REGISTERED(ch->number)) {
1055 mutex_unlock(&registration->mutex);
1056 return xpcUnregistered;
1057 }
1058
1059 spin_lock_irqsave(&ch->lock, irq_flags);
1060
1061 DBUG_ON(ch->flags & XPC_C_CONNECTED);
1062 DBUG_ON(ch->flags & XPC_C_OPENREQUEST);
1063
1064 if (ch->flags & XPC_C_DISCONNECTING) {
1065 spin_unlock_irqrestore(&ch->lock, irq_flags);
1066 mutex_unlock(&registration->mutex);
1067 return ch->reason;
1068 }
1069
1070 /* add info from the channel connect registration to the channel */
1071
1072 ch->kthreads_assigned_limit = registration->assigned_limit;
1073 ch->kthreads_idle_limit = registration->idle_limit;
1074 DBUG_ON(atomic_read(&ch->kthreads_assigned) != 0);
1075 DBUG_ON(atomic_read(&ch->kthreads_idle) != 0);
1076 DBUG_ON(atomic_read(&ch->kthreads_active) != 0);
1077
1078 ch->func = registration->func;
1079 DBUG_ON(registration->func == NULL);
1080 ch->key = registration->key;
1081
1082 ch->local_nentries = registration->nentries;
1083
1084 if (ch->flags & XPC_C_ROPENREQUEST) {
1085 if (registration->msg_size != ch->msg_size) {
1086 /* the local and remote sides aren't the same */
1087
1088 /*
1089 * Because XPC_DISCONNECT_CHANNEL() can block we're
1090 * forced to up the registration sema before we unlock
1091 * the channel lock. But that's okay here because we're
1092 * done with the part that required the registration
1093 * sema. XPC_DISCONNECT_CHANNEL() requires that the
1094 * channel lock be locked and will unlock and relock
1095 * the channel lock as needed.
1096 */
1097 mutex_unlock(&registration->mutex);
1098 XPC_DISCONNECT_CHANNEL(ch, xpcUnequalMsgSizes,
1099 &irq_flags);
1100 spin_unlock_irqrestore(&ch->lock, irq_flags);
1101 return xpcUnequalMsgSizes;
1102 }
1103 } else {
1104 ch->msg_size = registration->msg_size;
1105
1106 XPC_SET_REASON(ch, 0, 0);
1107 ch->flags &= ~XPC_C_DISCONNECTED;
1108
1109 atomic_inc(&xpc_partitions[ch->partid].nchannels_active);
1110 }
1111
1112 mutex_unlock(&registration->mutex);
1113
1114 /* initiate the connection */
1115
1116 ch->flags |= (XPC_C_OPENREQUEST | XPC_C_CONNECTING);
1117 xpc_IPI_send_openrequest(ch, &irq_flags);
1118
1119 xpc_process_connect(ch, &irq_flags);
1120
1121 spin_unlock_irqrestore(&ch->lock, irq_flags);
1122
1123 return xpcSuccess;
1124}
1125
1126/*
1127 * Clear some of the msg flags in the local message queue.
1128 */
1129static inline void
1130xpc_clear_local_msgqueue_flags(struct xpc_channel *ch)
1131{
1132 struct xpc_msg *msg;
1133 s64 get;
1134
1135 get = ch->w_remote_GP.get;
1136 do {
1137 msg = (struct xpc_msg *)((u64)ch->local_msgqueue +
1138 (get % ch->local_nentries) *
1139 ch->msg_size);
1140 msg->flags = 0;
1141 } while (++get < ch->remote_GP.get);
1142}
1143
1144/*
1145 * Clear some of the msg flags in the remote message queue.
1146 */
1147static inline void
1148xpc_clear_remote_msgqueue_flags(struct xpc_channel *ch)
1149{
1150 struct xpc_msg *msg;
1151 s64 put;
1152
1153 put = ch->w_remote_GP.put;
1154 do {
1155 msg = (struct xpc_msg *)((u64)ch->remote_msgqueue +
1156 (put % ch->remote_nentries) *
1157 ch->msg_size);
1158 msg->flags = 0;
1159 } while (++put < ch->remote_GP.put);
1160}
1161
1162static void
1163xpc_process_msg_IPI(struct xpc_partition *part, int ch_number)
1164{
1165 struct xpc_channel *ch = &part->channels[ch_number];
1166 int nmsgs_sent;
1167
1168 ch->remote_GP = part->remote_GPs[ch_number];
1169
1170 /* See what, if anything, has changed for each connected channel */
1171
1172 xpc_msgqueue_ref(ch);
1173
1174 if (ch->w_remote_GP.get == ch->remote_GP.get &&
1175 ch->w_remote_GP.put == ch->remote_GP.put) {
1176 /* nothing changed since GPs were last pulled */
1177 xpc_msgqueue_deref(ch);
1178 return;
1179 }
1180
1181 if (!(ch->flags & XPC_C_CONNECTED)) {
1182 xpc_msgqueue_deref(ch);
1183 return;
1184 }
1185
1186 /*
1187 * First check to see if messages recently sent by us have been
1188 * received by the other side. (The remote GET value will have
1189 * changed since we last looked at it.)
1190 */
1191
1192 if (ch->w_remote_GP.get != ch->remote_GP.get) {
1193
1194 /*
1195 * We need to notify any senders that want to be notified
1196 * that their sent messages have been received by their
1197 * intended recipients. We need to do this before updating
1198 * w_remote_GP.get so that we don't allocate the same message
1199 * queue entries prematurely (see xpc_allocate_msg()).
1200 */
1201 if (atomic_read(&ch->n_to_notify) > 0) {
1202 /*
1203 * Notify senders that messages sent have been
1204 * received and delivered by the other side.
1205 */
1206 xpc_notify_senders(ch, xpcMsgDelivered,
1207 ch->remote_GP.get);
1208 }
1209
1210 /*
1211 * Clear msg->flags in previously sent messages, so that
1212 * they're ready for xpc_allocate_msg().
1213 */
1214 xpc_clear_local_msgqueue_flags(ch);
1215
1216 ch->w_remote_GP.get = ch->remote_GP.get;
1217
1218 dev_dbg(xpc_chan, "w_remote_GP.get changed to %ld, partid=%d, "
1219 "channel=%d\n", ch->w_remote_GP.get, ch->partid,
1220 ch->number);
1221
1222 /*
1223 * If anyone was waiting for message queue entries to become
1224 * available, wake them up.
1225 */
1226 if (atomic_read(&ch->n_on_msg_allocate_wq) > 0)
1227 wake_up(&ch->msg_allocate_wq);
1228 }
1229
1230 /*
1231 * Now check for newly sent messages by the other side. (The remote
1232 * PUT value will have changed since we last looked at it.)
1233 */
1234
1235 if (ch->w_remote_GP.put != ch->remote_GP.put) {
1236 /*
1237 * Clear msg->flags in previously received messages, so that
1238 * they're ready for xpc_get_deliverable_msg().
1239 */
1240 xpc_clear_remote_msgqueue_flags(ch);
1241
1242 ch->w_remote_GP.put = ch->remote_GP.put;
1243
1244 dev_dbg(xpc_chan, "w_remote_GP.put changed to %ld, partid=%d, "
1245 "channel=%d\n", ch->w_remote_GP.put, ch->partid,
1246 ch->number);
1247
1248 nmsgs_sent = ch->w_remote_GP.put - ch->w_local_GP.get;
1249 if (nmsgs_sent > 0) {
1250 dev_dbg(xpc_chan, "msgs waiting to be copied and "
1251 "delivered=%d, partid=%d, channel=%d\n",
1252 nmsgs_sent, ch->partid, ch->number);
1253
1254 if (ch->flags & XPC_C_CONNECTEDCALLOUT_MADE)
1255 xpc_activate_kthreads(ch, nmsgs_sent);
1256 }
1257 }
1258
1259 xpc_msgqueue_deref(ch);
1260}
1261
1262void
1263xpc_process_channel_activity(struct xpc_partition *part)
1264{
1265 unsigned long irq_flags;
1266 u64 IPI_amo, IPI_flags;
1267 struct xpc_channel *ch;
1268 int ch_number;
1269 u32 ch_flags;
1270
1271 IPI_amo = xpc_get_IPI_flags(part);
1272
1273 /*
1274 * Initiate channel connections for registered channels.
1275 *
1276 * For each connected channel that has pending messages activate idle
1277 * kthreads and/or create new kthreads as needed.
1278 */
1279
1280 for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
1281 ch = &part->channels[ch_number];
1282
1283 /*
1284 * Process any open or close related IPI flags, and then deal
1285 * with connecting or disconnecting the channel as required.
1286 */
1287
1288 IPI_flags = XPC_GET_IPI_FLAGS(IPI_amo, ch_number);
1289
1290 if (XPC_ANY_OPENCLOSE_IPI_FLAGS_SET(IPI_flags))
1291 xpc_process_openclose_IPI(part, ch_number, IPI_flags);
1292
1293 ch_flags = ch->flags; /* need an atomic snapshot of flags */
1294
1295 if (ch_flags & XPC_C_DISCONNECTING) {
1296 spin_lock_irqsave(&ch->lock, irq_flags);
1297 xpc_process_disconnect(ch, &irq_flags);
1298 spin_unlock_irqrestore(&ch->lock, irq_flags);
1299 continue;
1300 }
1301
1302 if (part->act_state == XPC_P_DEACTIVATING)
1303 continue;
1304
1305 if (!(ch_flags & XPC_C_CONNECTED)) {
1306 if (!(ch_flags & XPC_C_OPENREQUEST)) {
1307 DBUG_ON(ch_flags & XPC_C_SETUP);
1308 (void)xpc_connect_channel(ch);
1309 } else {
1310 spin_lock_irqsave(&ch->lock, irq_flags);
1311 xpc_process_connect(ch, &irq_flags);
1312 spin_unlock_irqrestore(&ch->lock, irq_flags);
1313 }
1314 continue;
1315 }
1316
1317 /*
1318 * Process any message related IPI flags, this may involve the
1319 * activation of kthreads to deliver any pending messages sent
1320 * from the other partition.
1321 */
1322
1323 if (XPC_ANY_MSG_IPI_FLAGS_SET(IPI_flags))
1324 xpc_process_msg_IPI(part, ch_number);
1325 }
1326}
1327
1328/*
1329 * XPC's heartbeat code calls this function to inform XPC that a partition is
1330 * going down. XPC responds by tearing down the XPartition Communication
1331 * infrastructure used for the just downed partition.
1332 *
1333 * XPC's heartbeat code will never call this function and xpc_partition_up()
1334 * at the same time. Nor will it ever make multiple calls to either function
1335 * at the same time.
1336 */
1337void
1338xpc_partition_going_down(struct xpc_partition *part, enum xpc_retval reason)
1339{
1340 unsigned long irq_flags;
1341 int ch_number;
1342 struct xpc_channel *ch;
1343
1344 dev_dbg(xpc_chan, "deactivating partition %d, reason=%d\n",
1345 XPC_PARTID(part), reason);
1346
1347 if (!xpc_part_ref(part)) {
1348 /* infrastructure for this partition isn't currently set up */
1349 return;
1350 }
1351
1352 /* disconnect channels associated with the partition going down */
1353
1354 for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
1355 ch = &part->channels[ch_number];
1356
1357 xpc_msgqueue_ref(ch);
1358 spin_lock_irqsave(&ch->lock, irq_flags);
1359
1360 XPC_DISCONNECT_CHANNEL(ch, reason, &irq_flags);
1361
1362 spin_unlock_irqrestore(&ch->lock, irq_flags);
1363 xpc_msgqueue_deref(ch);
1364 }
1365
1366 xpc_wakeup_channel_mgr(part);
1367
1368 xpc_part_deref(part);
1369}
1370
1371/*
1372 * Teardown the infrastructure necessary to support XPartition Communication
1373 * between the specified remote partition and the local one.
1374 */
1375void
1376xpc_teardown_infrastructure(struct xpc_partition *part)
1377{
1378 partid_t partid = XPC_PARTID(part);
1379
1380 /*
1381 * We start off by making this partition inaccessible to local
1382 * processes by marking it as no longer setup. Then we make it
1383 * inaccessible to remote processes by clearing the XPC per partition
1384 * specific variable's magic # (which indicates that these variables
1385 * are no longer valid) and by ignoring all XPC notify IPIs sent to
1386 * this partition.
1387 */
1388
1389 DBUG_ON(atomic_read(&part->nchannels_engaged) != 0);
1390 DBUG_ON(atomic_read(&part->nchannels_active) != 0);
1391 DBUG_ON(part->setup_state != XPC_P_SETUP);
1392 part->setup_state = XPC_P_WTEARDOWN;
1393
1394 xpc_vars_part[partid].magic = 0;
1395
1396 free_irq(SGI_XPC_NOTIFY, (void *)(u64)partid);
1397
1398 /*
1399 * Before proceeding with the teardown we have to wait until all
1400 * existing references cease.
1401 */
1402 wait_event(part->teardown_wq, (atomic_read(&part->references) == 0));
1403
1404 /* now we can begin tearing down the infrastructure */
1405
1406 part->setup_state = XPC_P_TORNDOWN;
1407
1408 /* in case we've still got outstanding timers registered... */
1409 del_timer_sync(&part->dropped_IPI_timer);
1410
1411 kfree(part->remote_openclose_args_base);
1412 part->remote_openclose_args = NULL;
1413 kfree(part->local_openclose_args_base);
1414 part->local_openclose_args = NULL;
1415 kfree(part->remote_GPs_base);
1416 part->remote_GPs = NULL;
1417 kfree(part->local_GPs_base);
1418 part->local_GPs = NULL;
1419 kfree(part->channels);
1420 part->channels = NULL;
1421 part->local_IPI_amo_va = NULL;
1422}
1423
1424/*
1425 * Called by XP at the time of channel connection registration to cause
1426 * XPC to establish connections to all currently active partitions.
1427 */
1428void
1429xpc_initiate_connect(int ch_number)
1430{
1431 partid_t partid;
1432 struct xpc_partition *part;
1433 struct xpc_channel *ch;
1434
1435 DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS);
1436
1437 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
1438 part = &xpc_partitions[partid];
1439
1440 if (xpc_part_ref(part)) {
1441 ch = &part->channels[ch_number];
1442
1443 /*
1444 * Initiate the establishment of a connection on the
1445 * newly registered channel to the remote partition.
1446 */
1447 xpc_wakeup_channel_mgr(part);
1448 xpc_part_deref(part);
1449 }
1450 }
1451}
1452
1453void
1454xpc_connected_callout(struct xpc_channel *ch)
1455{
1456 /* let the registerer know that a connection has been established */
1457
1458 if (ch->func != NULL) {
1459 dev_dbg(xpc_chan, "ch->func() called, reason=xpcConnected, "
1460 "partid=%d, channel=%d\n", ch->partid, ch->number);
1461
1462 ch->func(xpcConnected, ch->partid, ch->number,
1463 (void *)(u64)ch->local_nentries, ch->key);
1464
1465 dev_dbg(xpc_chan, "ch->func() returned, reason=xpcConnected, "
1466 "partid=%d, channel=%d\n", ch->partid, ch->number);
1467 }
1468}
1469
1470/*
1471 * Called by XP at the time of channel connection unregistration to cause
1472 * XPC to teardown all current connections for the specified channel.
1473 *
1474 * Before returning xpc_initiate_disconnect() will wait until all connections
1475 * on the specified channel have been closed/torndown. So the caller can be
1476 * assured that they will not be receiving any more callouts from XPC to the
1477 * function they registered via xpc_connect().
1478 *
1479 * Arguments:
1480 *
1481 * ch_number - channel # to unregister.
1482 */
1483void
1484xpc_initiate_disconnect(int ch_number)
1485{
1486 unsigned long irq_flags;
1487 partid_t partid;
1488 struct xpc_partition *part;
1489 struct xpc_channel *ch;
1490
1491 DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS);
1492
1493 /* initiate the channel disconnect for every active partition */
1494 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
1495 part = &xpc_partitions[partid];
1496
1497 if (xpc_part_ref(part)) {
1498 ch = &part->channels[ch_number];
1499 xpc_msgqueue_ref(ch);
1500
1501 spin_lock_irqsave(&ch->lock, irq_flags);
1502
1503 if (!(ch->flags & XPC_C_DISCONNECTED)) {
1504 ch->flags |= XPC_C_WDISCONNECT;
1505
1506 XPC_DISCONNECT_CHANNEL(ch, xpcUnregistering,
1507 &irq_flags);
1508 }
1509
1510 spin_unlock_irqrestore(&ch->lock, irq_flags);
1511
1512 xpc_msgqueue_deref(ch);
1513 xpc_part_deref(part);
1514 }
1515 }
1516
1517 xpc_disconnect_wait(ch_number);
1518}
1519
1520/*
1521 * To disconnect a channel, and reflect it back to all who may be waiting.
1522 *
1523 * An OPEN is not allowed until XPC_C_DISCONNECTING is cleared by
1524 * xpc_process_disconnect(), and if set, XPC_C_WDISCONNECT is cleared by
1525 * xpc_disconnect_wait().
1526 *
1527 * THE CHANNEL IS TO BE LOCKED BY THE CALLER AND WILL REMAIN LOCKED UPON RETURN.
1528 */
1529void
1530xpc_disconnect_channel(const int line, struct xpc_channel *ch,
1531 enum xpc_retval reason, unsigned long *irq_flags)
1532{
1533 u32 channel_was_connected = (ch->flags & XPC_C_CONNECTED);
1534
1535 DBUG_ON(!spin_is_locked(&ch->lock));
1536
1537 if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED))
1538 return;
1539
1540 DBUG_ON(!(ch->flags & (XPC_C_CONNECTING | XPC_C_CONNECTED)));
1541
1542 dev_dbg(xpc_chan, "reason=%d, line=%d, partid=%d, channel=%d\n",
1543 reason, line, ch->partid, ch->number);
1544
1545 XPC_SET_REASON(ch, reason, line);
1546
1547 ch->flags |= (XPC_C_CLOSEREQUEST | XPC_C_DISCONNECTING);
1548 /* some of these may not have been set */
1549 ch->flags &= ~(XPC_C_OPENREQUEST | XPC_C_OPENREPLY |
1550 XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY |
1551 XPC_C_CONNECTING | XPC_C_CONNECTED);
1552
1553 xpc_IPI_send_closerequest(ch, irq_flags);
1554
1555 if (channel_was_connected)
1556 ch->flags |= XPC_C_WASCONNECTED;
1557
1558 spin_unlock_irqrestore(&ch->lock, *irq_flags);
1559
1560 /* wake all idle kthreads so they can exit */
1561 if (atomic_read(&ch->kthreads_idle) > 0) {
1562 wake_up_all(&ch->idle_wq);
1563
1564 } else if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
1565 !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
1566 /* start a kthread that will do the xpcDisconnecting callout */
1567 xpc_create_kthreads(ch, 1, 1);
1568 }
1569
1570 /* wake those waiting to allocate an entry from the local msg queue */
1571 if (atomic_read(&ch->n_on_msg_allocate_wq) > 0)
1572 wake_up(&ch->msg_allocate_wq);
1573
1574 spin_lock_irqsave(&ch->lock, *irq_flags);
1575}
1576
1577void
1578xpc_disconnect_callout(struct xpc_channel *ch, enum xpc_retval reason)
1579{
1580 /*
1581 * Let the channel's registerer know that the channel is being
1582 * disconnected. We don't want to do this if the registerer was never
1583 * informed of a connection being made.
1584 */
1585
1586 if (ch->func != NULL) {
1587 dev_dbg(xpc_chan, "ch->func() called, reason=%d, partid=%d, "
1588 "channel=%d\n", reason, ch->partid, ch->number);
1589
1590 ch->func(reason, ch->partid, ch->number, NULL, ch->key);
1591
1592 dev_dbg(xpc_chan, "ch->func() returned, reason=%d, partid=%d, "
1593 "channel=%d\n", reason, ch->partid, ch->number);
1594 }
1595}
1596
1597/*
1598 * Wait for a message entry to become available for the specified channel,
1599 * but don't wait any longer than 1 jiffy.
1600 */
1601static enum xpc_retval
1602xpc_allocate_msg_wait(struct xpc_channel *ch)
1603{
1604 enum xpc_retval ret;
1605
1606 if (ch->flags & XPC_C_DISCONNECTING) {
1607 DBUG_ON(ch->reason == xpcInterrupted);
1608 return ch->reason;
1609 }
1610
1611 atomic_inc(&ch->n_on_msg_allocate_wq);
1612 ret = interruptible_sleep_on_timeout(&ch->msg_allocate_wq, 1);
1613 atomic_dec(&ch->n_on_msg_allocate_wq);
1614
1615 if (ch->flags & XPC_C_DISCONNECTING) {
1616 ret = ch->reason;
1617 DBUG_ON(ch->reason == xpcInterrupted);
1618 } else if (ret == 0) {
1619 ret = xpcTimeout;
1620 } else {
1621 ret = xpcInterrupted;
1622 }
1623
1624 return ret;
1625}
1626
1627/*
1628 * Allocate an entry for a message from the message queue associated with the
1629 * specified channel.
1630 */
1631static enum xpc_retval
1632xpc_allocate_msg(struct xpc_channel *ch, u32 flags,
1633 struct xpc_msg **address_of_msg)
1634{
1635 struct xpc_msg *msg;
1636 enum xpc_retval ret;
1637 s64 put;
1638
1639 /* this reference will be dropped in xpc_send_msg() */
1640 xpc_msgqueue_ref(ch);
1641
1642 if (ch->flags & XPC_C_DISCONNECTING) {
1643 xpc_msgqueue_deref(ch);
1644 return ch->reason;
1645 }
1646 if (!(ch->flags & XPC_C_CONNECTED)) {
1647 xpc_msgqueue_deref(ch);
1648 return xpcNotConnected;
1649 }
1650
1651 /*
1652 * Get the next available message entry from the local message queue.
1653 * If none are available, we'll make sure that we grab the latest
1654 * GP values.
1655 */
1656 ret = xpcTimeout;
1657
1658 while (1) {
1659
1660 put = ch->w_local_GP.put;
1661 rmb(); /* guarantee that .put loads before .get */
1662 if (put - ch->w_remote_GP.get < ch->local_nentries) {
1663
1664 /* There are available message entries. We need to try
1665 * to secure one for ourselves. We'll do this by trying
1666 * to increment w_local_GP.put as long as someone else
1667 * doesn't beat us to it. If they do, we'll have to
1668 * try again.
1669 */
1670 if (cmpxchg(&ch->w_local_GP.put, put, put + 1) == put) {
1671 /* we got the entry referenced by put */
1672 break;
1673 }
1674 continue; /* try again */
1675 }
1676
1677 /*
1678 * There aren't any available msg entries at this time.
1679 *
1680 * In waiting for a message entry to become available,
1681 * we set a timeout in case the other side is not
1682 * sending completion IPIs. This lets us fake an IPI
1683 * that will cause the IPI handler to fetch the latest
1684 * GP values as if an IPI was sent by the other side.
1685 */
1686 if (ret == xpcTimeout)
1687 xpc_IPI_send_local_msgrequest(ch);
1688
1689 if (flags & XPC_NOWAIT) {
1690 xpc_msgqueue_deref(ch);
1691 return xpcNoWait;
1692 }
1693
1694 ret = xpc_allocate_msg_wait(ch);
1695 if (ret != xpcInterrupted && ret != xpcTimeout) {
1696 xpc_msgqueue_deref(ch);
1697 return ret;
1698 }
1699 }
1700
1701 /* get the message's address and initialize it */
1702 msg = (struct xpc_msg *)((u64)ch->local_msgqueue +
1703 (put % ch->local_nentries) * ch->msg_size);
1704
1705 DBUG_ON(msg->flags != 0);
1706 msg->number = put;
1707
1708 dev_dbg(xpc_chan, "w_local_GP.put changed to %ld; msg=0x%p, "
1709 "msg_number=%ld, partid=%d, channel=%d\n", put + 1,
1710 (void *)msg, msg->number, ch->partid, ch->number);
1711
1712 *address_of_msg = msg;
1713
1714 return xpcSuccess;
1715}
1716
1717/*
1718 * Allocate an entry for a message from the message queue associated with the
1719 * specified channel. NOTE that this routine can sleep waiting for a message
1720 * entry to become available. To not sleep, pass in the XPC_NOWAIT flag.
1721 *
1722 * Arguments:
1723 *
1724 * partid - ID of partition to which the channel is connected.
1725 * ch_number - channel #.
1726 * flags - see xpc.h for valid flags.
1727 * payload - address of the allocated payload area pointer (filled in on
1728 * return) in which the user-defined message is constructed.
1729 */
1730enum xpc_retval
1731xpc_initiate_allocate(partid_t partid, int ch_number, u32 flags, void **payload)
1732{
1733 struct xpc_partition *part = &xpc_partitions[partid];
1734 enum xpc_retval ret = xpcUnknownReason;
1735 struct xpc_msg *msg = NULL;
1736
1737 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
1738 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
1739
1740 *payload = NULL;
1741
1742 if (xpc_part_ref(part)) {
1743 ret = xpc_allocate_msg(&part->channels[ch_number], flags, &msg);
1744 xpc_part_deref(part);
1745
1746 if (msg != NULL)
1747 *payload = &msg->payload;
1748 }
1749
1750 return ret;
1751}
1752
1753/*
1754 * Now we actually send the messages that are ready to be sent by advancing
1755 * the local message queue's Put value and then send an IPI to the recipient
1756 * partition.
1757 */
1758static void
1759xpc_send_msgs(struct xpc_channel *ch, s64 initial_put)
1760{
1761 struct xpc_msg *msg;
1762 s64 put = initial_put + 1;
1763 int send_IPI = 0;
1764
1765 while (1) {
1766
1767 while (1) {
1768 if (put == ch->w_local_GP.put)
1769 break;
1770
1771 msg = (struct xpc_msg *)((u64)ch->local_msgqueue +
1772 (put % ch->local_nentries) *
1773 ch->msg_size);
1774
1775 if (!(msg->flags & XPC_M_READY))
1776 break;
1777
1778 put++;
1779 }
1780
1781 if (put == initial_put) {
1782 /* nothing's changed */
1783 break;
1784 }
1785
1786 if (cmpxchg_rel(&ch->local_GP->put, initial_put, put) !=
1787 initial_put) {
1788 /* someone else beat us to it */
1789 DBUG_ON(ch->local_GP->put < initial_put);
1790 break;
1791 }
1792
1793 /* we just set the new value of local_GP->put */
1794
1795 dev_dbg(xpc_chan, "local_GP->put changed to %ld, partid=%d, "
1796 "channel=%d\n", put, ch->partid, ch->number);
1797
1798 send_IPI = 1;
1799
1800 /*
1801 * We need to ensure that the message referenced by
1802 * local_GP->put is not XPC_M_READY or that local_GP->put
1803 * equals w_local_GP.put, so we'll go have a look.
1804 */
1805 initial_put = put;
1806 }
1807
1808 if (send_IPI)
1809 xpc_IPI_send_msgrequest(ch);
1810}
1811
1812/*
1813 * Common code that does the actual sending of the message by advancing the
1814 * local message queue's Put value and sends an IPI to the partition the
1815 * message is being sent to.
1816 */
1817static enum xpc_retval
1818xpc_send_msg(struct xpc_channel *ch, struct xpc_msg *msg, u8 notify_type,
1819 xpc_notify_func func, void *key)
1820{
1821 enum xpc_retval ret = xpcSuccess;
1822 struct xpc_notify *notify = notify;
1823 s64 put, msg_number = msg->number;
1824
1825 DBUG_ON(notify_type == XPC_N_CALL && func == NULL);
1826 DBUG_ON((((u64)msg - (u64)ch->local_msgqueue) / ch->msg_size) !=
1827 msg_number % ch->local_nentries);
1828 DBUG_ON(msg->flags & XPC_M_READY);
1829
1830 if (ch->flags & XPC_C_DISCONNECTING) {
1831 /* drop the reference grabbed in xpc_allocate_msg() */
1832 xpc_msgqueue_deref(ch);
1833 return ch->reason;
1834 }
1835
1836 if (notify_type != 0) {
1837 /*
1838 * Tell the remote side to send an ACK interrupt when the
1839 * message has been delivered.
1840 */
1841 msg->flags |= XPC_M_INTERRUPT;
1842
1843 atomic_inc(&ch->n_to_notify);
1844
1845 notify = &ch->notify_queue[msg_number % ch->local_nentries];
1846 notify->func = func;
1847 notify->key = key;
1848 notify->type = notify_type;
1849
1850 /* >>> is a mb() needed here? */
1851
1852 if (ch->flags & XPC_C_DISCONNECTING) {
1853 /*
1854 * An error occurred between our last error check and
1855 * this one. We will try to clear the type field from
1856 * the notify entry. If we succeed then
1857 * xpc_disconnect_channel() didn't already process
1858 * the notify entry.
1859 */
1860 if (cmpxchg(&notify->type, notify_type, 0) ==
1861 notify_type) {
1862 atomic_dec(&ch->n_to_notify);
1863 ret = ch->reason;
1864 }
1865
1866 /* drop the reference grabbed in xpc_allocate_msg() */
1867 xpc_msgqueue_deref(ch);
1868 return ret;
1869 }
1870 }
1871
1872 msg->flags |= XPC_M_READY;
1873
1874 /*
1875 * The preceding store of msg->flags must occur before the following
1876 * load of ch->local_GP->put.
1877 */
1878 mb();
1879
1880 /* see if the message is next in line to be sent, if so send it */
1881
1882 put = ch->local_GP->put;
1883 if (put == msg_number)
1884 xpc_send_msgs(ch, put);
1885
1886 /* drop the reference grabbed in xpc_allocate_msg() */
1887 xpc_msgqueue_deref(ch);
1888 return ret;
1889}
1890
1891/*
1892 * Send a message previously allocated using xpc_initiate_allocate() on the
1893 * specified channel connected to the specified partition.
1894 *
1895 * This routine will not wait for the message to be received, nor will
1896 * notification be given when it does happen. Once this routine has returned
1897 * the message entry allocated via xpc_initiate_allocate() is no longer
1898 * accessable to the caller.
1899 *
1900 * This routine, although called by users, does not call xpc_part_ref() to
1901 * ensure that the partition infrastructure is in place. It relies on the
1902 * fact that we called xpc_msgqueue_ref() in xpc_allocate_msg().
1903 *
1904 * Arguments:
1905 *
1906 * partid - ID of partition to which the channel is connected.
1907 * ch_number - channel # to send message on.
1908 * payload - pointer to the payload area allocated via
1909 * xpc_initiate_allocate().
1910 */
1911enum xpc_retval
1912xpc_initiate_send(partid_t partid, int ch_number, void *payload)
1913{
1914 struct xpc_partition *part = &xpc_partitions[partid];
1915 struct xpc_msg *msg = XPC_MSG_ADDRESS(payload);
1916 enum xpc_retval ret;
1917
1918 dev_dbg(xpc_chan, "msg=0x%p, partid=%d, channel=%d\n", (void *)msg,
1919 partid, ch_number);
1920
1921 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
1922 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
1923 DBUG_ON(msg == NULL);
1924
1925 ret = xpc_send_msg(&part->channels[ch_number], msg, 0, NULL, NULL);
1926
1927 return ret;
1928}
1929
1930/*
1931 * Send a message previously allocated using xpc_initiate_allocate on the
1932 * specified channel connected to the specified partition.
1933 *
1934 * This routine will not wait for the message to be sent. Once this routine
1935 * has returned the message entry allocated via xpc_initiate_allocate() is no
1936 * longer accessable to the caller.
1937 *
1938 * Once the remote end of the channel has received the message, the function
1939 * passed as an argument to xpc_initiate_send_notify() will be called. This
1940 * allows the sender to free up or re-use any buffers referenced by the
1941 * message, but does NOT mean the message has been processed at the remote
1942 * end by a receiver.
1943 *
1944 * If this routine returns an error, the caller's function will NOT be called.
1945 *
1946 * This routine, although called by users, does not call xpc_part_ref() to
1947 * ensure that the partition infrastructure is in place. It relies on the
1948 * fact that we called xpc_msgqueue_ref() in xpc_allocate_msg().
1949 *
1950 * Arguments:
1951 *
1952 * partid - ID of partition to which the channel is connected.
1953 * ch_number - channel # to send message on.
1954 * payload - pointer to the payload area allocated via
1955 * xpc_initiate_allocate().
1956 * func - function to call with asynchronous notification of message
1957 * receipt. THIS FUNCTION MUST BE NON-BLOCKING.
1958 * key - user-defined key to be passed to the function when it's called.
1959 */
1960enum xpc_retval
1961xpc_initiate_send_notify(partid_t partid, int ch_number, void *payload,
1962 xpc_notify_func func, void *key)
1963{
1964 struct xpc_partition *part = &xpc_partitions[partid];
1965 struct xpc_msg *msg = XPC_MSG_ADDRESS(payload);
1966 enum xpc_retval ret;
1967
1968 dev_dbg(xpc_chan, "msg=0x%p, partid=%d, channel=%d\n", (void *)msg,
1969 partid, ch_number);
1970
1971 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
1972 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
1973 DBUG_ON(msg == NULL);
1974 DBUG_ON(func == NULL);
1975
1976 ret = xpc_send_msg(&part->channels[ch_number], msg, XPC_N_CALL,
1977 func, key);
1978 return ret;
1979}
1980
1981static struct xpc_msg *
1982xpc_pull_remote_msg(struct xpc_channel *ch, s64 get)
1983{
1984 struct xpc_partition *part = &xpc_partitions[ch->partid];
1985 struct xpc_msg *remote_msg, *msg;
1986 u32 msg_index, nmsgs;
1987 u64 msg_offset;
1988 enum xpc_retval ret;
1989
1990 if (mutex_lock_interruptible(&ch->msg_to_pull_mutex) != 0) {
1991 /* we were interrupted by a signal */
1992 return NULL;
1993 }
1994
1995 while (get >= ch->next_msg_to_pull) {
1996
1997 /* pull as many messages as are ready and able to be pulled */
1998
1999 msg_index = ch->next_msg_to_pull % ch->remote_nentries;
2000
2001 DBUG_ON(ch->next_msg_to_pull >= ch->w_remote_GP.put);
2002 nmsgs = ch->w_remote_GP.put - ch->next_msg_to_pull;
2003 if (msg_index + nmsgs > ch->remote_nentries) {
2004 /* ignore the ones that wrap the msg queue for now */
2005 nmsgs = ch->remote_nentries - msg_index;
2006 }
2007
2008 msg_offset = msg_index * ch->msg_size;
2009 msg = (struct xpc_msg *)((u64)ch->remote_msgqueue + msg_offset);
2010 remote_msg = (struct xpc_msg *)(ch->remote_msgqueue_pa +
2011 msg_offset);
2012
2013 ret = xpc_pull_remote_cachelines(part, msg, remote_msg,
2014 nmsgs * ch->msg_size);
2015 if (ret != xpcSuccess) {
2016
2017 dev_dbg(xpc_chan, "failed to pull %d msgs starting with"
2018 " msg %ld from partition %d, channel=%d, "
2019 "ret=%d\n", nmsgs, ch->next_msg_to_pull,
2020 ch->partid, ch->number, ret);
2021
2022 XPC_DEACTIVATE_PARTITION(part, ret);
2023
2024 mutex_unlock(&ch->msg_to_pull_mutex);
2025 return NULL;
2026 }
2027
2028 ch->next_msg_to_pull += nmsgs;
2029 }
2030
2031 mutex_unlock(&ch->msg_to_pull_mutex);
2032
2033 /* return the message we were looking for */
2034 msg_offset = (get % ch->remote_nentries) * ch->msg_size;
2035 msg = (struct xpc_msg *)((u64)ch->remote_msgqueue + msg_offset);
2036
2037 return msg;
2038}
2039
2040/*
2041 * Get a message to be delivered.
2042 */
2043static struct xpc_msg *
2044xpc_get_deliverable_msg(struct xpc_channel *ch)
2045{
2046 struct xpc_msg *msg = NULL;
2047 s64 get;
2048
2049 do {
2050 if (ch->flags & XPC_C_DISCONNECTING)
2051 break;
2052
2053 get = ch->w_local_GP.get;
2054 rmb(); /* guarantee that .get loads before .put */
2055 if (get == ch->w_remote_GP.put)
2056 break;
2057
2058 /* There are messages waiting to be pulled and delivered.
2059 * We need to try to secure one for ourselves. We'll do this
2060 * by trying to increment w_local_GP.get and hope that no one
2061 * else beats us to it. If they do, we'll we'll simply have
2062 * to try again for the next one.
2063 */
2064
2065 if (cmpxchg(&ch->w_local_GP.get, get, get + 1) == get) {
2066 /* we got the entry referenced by get */
2067
2068 dev_dbg(xpc_chan, "w_local_GP.get changed to %ld, "
2069 "partid=%d, channel=%d\n", get + 1,
2070 ch->partid, ch->number);
2071
2072 /* pull the message from the remote partition */
2073
2074 msg = xpc_pull_remote_msg(ch, get);
2075
2076 DBUG_ON(msg != NULL && msg->number != get);
2077 DBUG_ON(msg != NULL && (msg->flags & XPC_M_DONE));
2078 DBUG_ON(msg != NULL && !(msg->flags & XPC_M_READY));
2079
2080 break;
2081 }
2082
2083 } while (1);
2084
2085 return msg;
2086}
2087
2088/*
2089 * Deliver a message to its intended recipient.
2090 */
2091void
2092xpc_deliver_msg(struct xpc_channel *ch)
2093{
2094 struct xpc_msg *msg;
2095
2096 msg = xpc_get_deliverable_msg(ch);
2097 if (msg != NULL) {
2098
2099 /*
2100 * This ref is taken to protect the payload itself from being
2101 * freed before the user is finished with it, which the user
2102 * indicates by calling xpc_initiate_received().
2103 */
2104 xpc_msgqueue_ref(ch);
2105
2106 atomic_inc(&ch->kthreads_active);
2107
2108 if (ch->func != NULL) {
2109 dev_dbg(xpc_chan, "ch->func() called, msg=0x%p, "
2110 "msg_number=%ld, partid=%d, channel=%d\n",
2111 (void *)msg, msg->number, ch->partid,
2112 ch->number);
2113
2114 /* deliver the message to its intended recipient */
2115 ch->func(xpcMsgReceived, ch->partid, ch->number,
2116 &msg->payload, ch->key);
2117
2118 dev_dbg(xpc_chan, "ch->func() returned, msg=0x%p, "
2119 "msg_number=%ld, partid=%d, channel=%d\n",
2120 (void *)msg, msg->number, ch->partid,
2121 ch->number);
2122 }
2123
2124 atomic_dec(&ch->kthreads_active);
2125 }
2126}
2127
2128/*
2129 * Now we actually acknowledge the messages that have been delivered and ack'd
2130 * by advancing the cached remote message queue's Get value and if requested
2131 * send an IPI to the message sender's partition.
2132 */
2133static void
2134xpc_acknowledge_msgs(struct xpc_channel *ch, s64 initial_get, u8 msg_flags)
2135{
2136 struct xpc_msg *msg;
2137 s64 get = initial_get + 1;
2138 int send_IPI = 0;
2139
2140 while (1) {
2141
2142 while (1) {
2143 if (get == ch->w_local_GP.get)
2144 break;
2145
2146 msg = (struct xpc_msg *)((u64)ch->remote_msgqueue +
2147 (get % ch->remote_nentries) *
2148 ch->msg_size);
2149
2150 if (!(msg->flags & XPC_M_DONE))
2151 break;
2152
2153 msg_flags |= msg->flags;
2154 get++;
2155 }
2156
2157 if (get == initial_get) {
2158 /* nothing's changed */
2159 break;
2160 }
2161
2162 if (cmpxchg_rel(&ch->local_GP->get, initial_get, get) !=
2163 initial_get) {
2164 /* someone else beat us to it */
2165 DBUG_ON(ch->local_GP->get <= initial_get);
2166 break;
2167 }
2168
2169 /* we just set the new value of local_GP->get */
2170
2171 dev_dbg(xpc_chan, "local_GP->get changed to %ld, partid=%d, "
2172 "channel=%d\n", get, ch->partid, ch->number);
2173
2174 send_IPI = (msg_flags & XPC_M_INTERRUPT);
2175
2176 /*
2177 * We need to ensure that the message referenced by
2178 * local_GP->get is not XPC_M_DONE or that local_GP->get
2179 * equals w_local_GP.get, so we'll go have a look.
2180 */
2181 initial_get = get;
2182 }
2183
2184 if (send_IPI)
2185 xpc_IPI_send_msgrequest(ch);
2186}
2187
2188/*
2189 * Acknowledge receipt of a delivered message.
2190 *
2191 * If a message has XPC_M_INTERRUPT set, send an interrupt to the partition
2192 * that sent the message.
2193 *
2194 * This function, although called by users, does not call xpc_part_ref() to
2195 * ensure that the partition infrastructure is in place. It relies on the
2196 * fact that we called xpc_msgqueue_ref() in xpc_deliver_msg().
2197 *
2198 * Arguments:
2199 *
2200 * partid - ID of partition to which the channel is connected.
2201 * ch_number - channel # message received on.
2202 * payload - pointer to the payload area allocated via
2203 * xpc_initiate_allocate().
2204 */
2205void
2206xpc_initiate_received(partid_t partid, int ch_number, void *payload)
2207{
2208 struct xpc_partition *part = &xpc_partitions[partid];
2209 struct xpc_channel *ch;
2210 struct xpc_msg *msg = XPC_MSG_ADDRESS(payload);
2211 s64 get, msg_number = msg->number;
2212
2213 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
2214 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
2215
2216 ch = &part->channels[ch_number];
2217
2218 dev_dbg(xpc_chan, "msg=0x%p, msg_number=%ld, partid=%d, channel=%d\n",
2219 (void *)msg, msg_number, ch->partid, ch->number);
2220
2221 DBUG_ON((((u64)msg - (u64)ch->remote_msgqueue) / ch->msg_size) !=
2222 msg_number % ch->remote_nentries);
2223 DBUG_ON(msg->flags & XPC_M_DONE);
2224
2225 msg->flags |= XPC_M_DONE;
2226
2227 /*
2228 * The preceding store of msg->flags must occur before the following
2229 * load of ch->local_GP->get.
2230 */
2231 mb();
2232
2233 /*
2234 * See if this message is next in line to be acknowledged as having
2235 * been delivered.
2236 */
2237 get = ch->local_GP->get;
2238 if (get == msg_number)
2239 xpc_acknowledge_msgs(ch, get, msg->flags);
2240
2241 /* the call to xpc_msgqueue_ref() was done by xpc_deliver_msg() */
2242 xpc_msgqueue_deref(ch);
2243}
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c
new file mode 100644
index 000000000000..f673ba90eb0e
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_main.c
@@ -0,0 +1,1323 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9/*
10 * Cross Partition Communication (XPC) support - standard version.
11 *
12 * XPC provides a message passing capability that crosses partition
13 * boundaries. This module is made up of two parts:
14 *
15 * partition This part detects the presence/absence of other
16 * partitions. It provides a heartbeat and monitors
17 * the heartbeats of other partitions.
18 *
19 * channel This part manages the channels and sends/receives
20 * messages across them to/from other partitions.
21 *
22 * There are a couple of additional functions residing in XP, which
23 * provide an interface to XPC for its users.
24 *
25 *
26 * Caveats:
27 *
28 * . We currently have no way to determine which nasid an IPI came
29 * from. Thus, xpc_IPI_send() does a remote AMO write followed by
30 * an IPI. The AMO indicates where data is to be pulled from, so
31 * after the IPI arrives, the remote partition checks the AMO word.
32 * The IPI can actually arrive before the AMO however, so other code
33 * must periodically check for this case. Also, remote AMO operations
34 * do not reliably time out. Thus we do a remote PIO read solely to
35 * know whether the remote partition is down and whether we should
36 * stop sending IPIs to it. This remote PIO read operation is set up
37 * in a special nofault region so SAL knows to ignore (and cleanup)
38 * any errors due to the remote AMO write, PIO read, and/or PIO
39 * write operations.
40 *
41 * If/when new hardware solves this IPI problem, we should abandon
42 * the current approach.
43 *
44 */
45
46#include <linux/kernel.h>
47#include <linux/module.h>
48#include <linux/init.h>
49#include <linux/cache.h>
50#include <linux/interrupt.h>
51#include <linux/delay.h>
52#include <linux/reboot.h>
53#include <linux/completion.h>
54#include <linux/kdebug.h>
55#include <linux/kthread.h>
56#include <linux/uaccess.h>
57#include <asm/sn/intr.h>
58#include <asm/sn/sn_sal.h>
59#include "xpc.h"
60
61/* define two XPC debug device structures to be used with dev_dbg() et al */
62
63struct device_driver xpc_dbg_name = {
64 .name = "xpc"
65};
66
67struct device xpc_part_dbg_subname = {
68 .bus_id = {0}, /* set to "part" at xpc_init() time */
69 .driver = &xpc_dbg_name
70};
71
72struct device xpc_chan_dbg_subname = {
73 .bus_id = {0}, /* set to "chan" at xpc_init() time */
74 .driver = &xpc_dbg_name
75};
76
77struct device *xpc_part = &xpc_part_dbg_subname;
78struct device *xpc_chan = &xpc_chan_dbg_subname;
79
80static int xpc_kdebug_ignore;
81
82/* systune related variables for /proc/sys directories */
83
84static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL;
85static int xpc_hb_min_interval = 1;
86static int xpc_hb_max_interval = 10;
87
88static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL;
89static int xpc_hb_check_min_interval = 10;
90static int xpc_hb_check_max_interval = 120;
91
92int xpc_disengage_request_timelimit = XPC_DISENGAGE_REQUEST_DEFAULT_TIMELIMIT;
93static int xpc_disengage_request_min_timelimit; /* = 0 */
94static int xpc_disengage_request_max_timelimit = 120;
95
96static ctl_table xpc_sys_xpc_hb_dir[] = {
97 {
98 .ctl_name = CTL_UNNUMBERED,
99 .procname = "hb_interval",
100 .data = &xpc_hb_interval,
101 .maxlen = sizeof(int),
102 .mode = 0644,
103 .proc_handler = &proc_dointvec_minmax,
104 .strategy = &sysctl_intvec,
105 .extra1 = &xpc_hb_min_interval,
106 .extra2 = &xpc_hb_max_interval},
107 {
108 .ctl_name = CTL_UNNUMBERED,
109 .procname = "hb_check_interval",
110 .data = &xpc_hb_check_interval,
111 .maxlen = sizeof(int),
112 .mode = 0644,
113 .proc_handler = &proc_dointvec_minmax,
114 .strategy = &sysctl_intvec,
115 .extra1 = &xpc_hb_check_min_interval,
116 .extra2 = &xpc_hb_check_max_interval},
117 {}
118};
119static ctl_table xpc_sys_xpc_dir[] = {
120 {
121 .ctl_name = CTL_UNNUMBERED,
122 .procname = "hb",
123 .mode = 0555,
124 .child = xpc_sys_xpc_hb_dir},
125 {
126 .ctl_name = CTL_UNNUMBERED,
127 .procname = "disengage_request_timelimit",
128 .data = &xpc_disengage_request_timelimit,
129 .maxlen = sizeof(int),
130 .mode = 0644,
131 .proc_handler = &proc_dointvec_minmax,
132 .strategy = &sysctl_intvec,
133 .extra1 = &xpc_disengage_request_min_timelimit,
134 .extra2 = &xpc_disengage_request_max_timelimit},
135 {}
136};
137static ctl_table xpc_sys_dir[] = {
138 {
139 .ctl_name = CTL_UNNUMBERED,
140 .procname = "xpc",
141 .mode = 0555,
142 .child = xpc_sys_xpc_dir},
143 {}
144};
145static struct ctl_table_header *xpc_sysctl;
146
147/* non-zero if any remote partition disengage request was timed out */
148int xpc_disengage_request_timedout;
149
150/* #of IRQs received */
151static atomic_t xpc_act_IRQ_rcvd;
152
153/* IRQ handler notifies this wait queue on receipt of an IRQ */
154static DECLARE_WAIT_QUEUE_HEAD(xpc_act_IRQ_wq);
155
156static unsigned long xpc_hb_check_timeout;
157
158/* notification that the xpc_hb_checker thread has exited */
159static DECLARE_COMPLETION(xpc_hb_checker_exited);
160
161/* notification that the xpc_discovery thread has exited */
162static DECLARE_COMPLETION(xpc_discovery_exited);
163
164static struct timer_list xpc_hb_timer;
165
166static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);
167
168static int xpc_system_reboot(struct notifier_block *, unsigned long, void *);
169static struct notifier_block xpc_reboot_notifier = {
170 .notifier_call = xpc_system_reboot,
171};
172
173static int xpc_system_die(struct notifier_block *, unsigned long, void *);
174static struct notifier_block xpc_die_notifier = {
175 .notifier_call = xpc_system_die,
176};
177
178/*
179 * Timer function to enforce the timelimit on the partition disengage request.
180 */
181static void
182xpc_timeout_partition_disengage_request(unsigned long data)
183{
184 struct xpc_partition *part = (struct xpc_partition *)data;
185
186 DBUG_ON(time_before(jiffies, part->disengage_request_timeout));
187
188 (void)xpc_partition_disengaged(part);
189
190 DBUG_ON(part->disengage_request_timeout != 0);
191 DBUG_ON(xpc_partition_engaged(1UL << XPC_PARTID(part)) != 0);
192}
193
194/*
195 * Notify the heartbeat check thread that an IRQ has been received.
196 */
197static irqreturn_t
198xpc_act_IRQ_handler(int irq, void *dev_id)
199{
200 atomic_inc(&xpc_act_IRQ_rcvd);
201 wake_up_interruptible(&xpc_act_IRQ_wq);
202 return IRQ_HANDLED;
203}
204
205/*
206 * Timer to produce the heartbeat. The timer structures function is
207 * already set when this is initially called. A tunable is used to
208 * specify when the next timeout should occur.
209 */
210static void
211xpc_hb_beater(unsigned long dummy)
212{
213 xpc_vars->heartbeat++;
214
215 if (time_after_eq(jiffies, xpc_hb_check_timeout))
216 wake_up_interruptible(&xpc_act_IRQ_wq);
217
218 xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
219 add_timer(&xpc_hb_timer);
220}
221
222/*
223 * This thread is responsible for nearly all of the partition
224 * activation/deactivation.
225 */
226static int
227xpc_hb_checker(void *ignore)
228{
229 int last_IRQ_count = 0;
230 int new_IRQ_count;
231 int force_IRQ = 0;
232
233 /* this thread was marked active by xpc_hb_init() */
234
235 set_cpus_allowed(current, cpumask_of_cpu(XPC_HB_CHECK_CPU));
236
237 /* set our heartbeating to other partitions into motion */
238 xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
239 xpc_hb_beater(0);
240
241 while (!xpc_exiting) {
242
243 dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
244 "been received\n",
245 (int)(xpc_hb_check_timeout - jiffies),
246 atomic_read(&xpc_act_IRQ_rcvd) - last_IRQ_count);
247
248 /* checking of remote heartbeats is skewed by IRQ handling */
249 if (time_after_eq(jiffies, xpc_hb_check_timeout)) {
250 dev_dbg(xpc_part, "checking remote heartbeats\n");
251 xpc_check_remote_hb();
252
253 /*
254 * We need to periodically recheck to ensure no
255 * IPI/AMO pairs have been missed. That check
256 * must always reset xpc_hb_check_timeout.
257 */
258 force_IRQ = 1;
259 }
260
261 /* check for outstanding IRQs */
262 new_IRQ_count = atomic_read(&xpc_act_IRQ_rcvd);
263 if (last_IRQ_count < new_IRQ_count || force_IRQ != 0) {
264 force_IRQ = 0;
265
266 dev_dbg(xpc_part, "found an IRQ to process; will be "
267 "resetting xpc_hb_check_timeout\n");
268
269 last_IRQ_count += xpc_identify_act_IRQ_sender();
270 if (last_IRQ_count < new_IRQ_count) {
271 /* retry once to help avoid missing AMO */
272 (void)xpc_identify_act_IRQ_sender();
273 }
274 last_IRQ_count = new_IRQ_count;
275
276 xpc_hb_check_timeout = jiffies +
277 (xpc_hb_check_interval * HZ);
278 }
279
280 /* wait for IRQ or timeout */
281 (void)wait_event_interruptible(xpc_act_IRQ_wq,
282 (last_IRQ_count <
283 atomic_read(&xpc_act_IRQ_rcvd)
284 || time_after_eq(jiffies,
285 xpc_hb_check_timeout) ||
286 xpc_exiting));
287 }
288
289 dev_dbg(xpc_part, "heartbeat checker is exiting\n");
290
291 /* mark this thread as having exited */
292 complete(&xpc_hb_checker_exited);
293 return 0;
294}
295
296/*
297 * This thread will attempt to discover other partitions to activate
298 * based on info provided by SAL. This new thread is short lived and
299 * will exit once discovery is complete.
300 */
301static int
302xpc_initiate_discovery(void *ignore)
303{
304 xpc_discovery();
305
306 dev_dbg(xpc_part, "discovery thread is exiting\n");
307
308 /* mark this thread as having exited */
309 complete(&xpc_discovery_exited);
310 return 0;
311}
312
313/*
314 * Establish first contact with the remote partititon. This involves pulling
315 * the XPC per partition variables from the remote partition and waiting for
316 * the remote partition to pull ours.
317 */
318static enum xpc_retval
319xpc_make_first_contact(struct xpc_partition *part)
320{
321 enum xpc_retval ret;
322
323 while ((ret = xpc_pull_remote_vars_part(part)) != xpcSuccess) {
324 if (ret != xpcRetry) {
325 XPC_DEACTIVATE_PARTITION(part, ret);
326 return ret;
327 }
328
329 dev_dbg(xpc_chan, "waiting to make first contact with "
330 "partition %d\n", XPC_PARTID(part));
331
332 /* wait a 1/4 of a second or so */
333 (void)msleep_interruptible(250);
334
335 if (part->act_state == XPC_P_DEACTIVATING)
336 return part->reason;
337 }
338
339 return xpc_mark_partition_active(part);
340}
341
342/*
343 * The first kthread assigned to a newly activated partition is the one
344 * created by XPC HB with which it calls xpc_partition_up(). XPC hangs on to
345 * that kthread until the partition is brought down, at which time that kthread
346 * returns back to XPC HB. (The return of that kthread will signify to XPC HB
347 * that XPC has dismantled all communication infrastructure for the associated
348 * partition.) This kthread becomes the channel manager for that partition.
349 *
350 * Each active partition has a channel manager, who, besides connecting and
351 * disconnecting channels, will ensure that each of the partition's connected
352 * channels has the required number of assigned kthreads to get the work done.
353 */
354static void
355xpc_channel_mgr(struct xpc_partition *part)
356{
357 while (part->act_state != XPC_P_DEACTIVATING ||
358 atomic_read(&part->nchannels_active) > 0 ||
359 !xpc_partition_disengaged(part)) {
360
361 xpc_process_channel_activity(part);
362
363 /*
364 * Wait until we've been requested to activate kthreads or
365 * all of the channel's message queues have been torn down or
366 * a signal is pending.
367 *
368 * The channel_mgr_requests is set to 1 after being awakened,
369 * This is done to prevent the channel mgr from making one pass
370 * through the loop for each request, since he will
371 * be servicing all the requests in one pass. The reason it's
372 * set to 1 instead of 0 is so that other kthreads will know
373 * that the channel mgr is running and won't bother trying to
374 * wake him up.
375 */
376 atomic_dec(&part->channel_mgr_requests);
377 (void)wait_event_interruptible(part->channel_mgr_wq,
378 (atomic_read(&part->channel_mgr_requests) > 0 ||
379 part->local_IPI_amo != 0 ||
380 (part->act_state == XPC_P_DEACTIVATING &&
381 atomic_read(&part->nchannels_active) == 0 &&
382 xpc_partition_disengaged(part))));
383 atomic_set(&part->channel_mgr_requests, 1);
384 }
385}
386
387/*
388 * When XPC HB determines that a partition has come up, it will create a new
389 * kthread and that kthread will call this function to attempt to set up the
390 * basic infrastructure used for Cross Partition Communication with the newly
391 * upped partition.
392 *
393 * The kthread that was created by XPC HB and which setup the XPC
394 * infrastructure will remain assigned to the partition until the partition
395 * goes down. At which time the kthread will teardown the XPC infrastructure
396 * and then exit.
397 *
398 * XPC HB will put the remote partition's XPC per partition specific variables
399 * physical address into xpc_partitions[partid].remote_vars_part_pa prior to
400 * calling xpc_partition_up().
401 */
402static void
403xpc_partition_up(struct xpc_partition *part)
404{
405 DBUG_ON(part->channels != NULL);
406
407 dev_dbg(xpc_chan, "activating partition %d\n", XPC_PARTID(part));
408
409 if (xpc_setup_infrastructure(part) != xpcSuccess)
410 return;
411
412 /*
413 * The kthread that XPC HB called us with will become the
414 * channel manager for this partition. It will not return
415 * back to XPC HB until the partition's XPC infrastructure
416 * has been dismantled.
417 */
418
419 (void)xpc_part_ref(part); /* this will always succeed */
420
421 if (xpc_make_first_contact(part) == xpcSuccess)
422 xpc_channel_mgr(part);
423
424 xpc_part_deref(part);
425
426 xpc_teardown_infrastructure(part);
427}
428
429static int
430xpc_activating(void *__partid)
431{
432 partid_t partid = (u64)__partid;
433 struct xpc_partition *part = &xpc_partitions[partid];
434 unsigned long irq_flags;
435
436 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
437
438 spin_lock_irqsave(&part->act_lock, irq_flags);
439
440 if (part->act_state == XPC_P_DEACTIVATING) {
441 part->act_state = XPC_P_INACTIVE;
442 spin_unlock_irqrestore(&part->act_lock, irq_flags);
443 part->remote_rp_pa = 0;
444 return 0;
445 }
446
447 /* indicate the thread is activating */
448 DBUG_ON(part->act_state != XPC_P_ACTIVATION_REQ);
449 part->act_state = XPC_P_ACTIVATING;
450
451 XPC_SET_REASON(part, 0, 0);
452 spin_unlock_irqrestore(&part->act_lock, irq_flags);
453
454 dev_dbg(xpc_part, "bringing partition %d up\n", partid);
455
456 /*
457 * Register the remote partition's AMOs with SAL so it can handle
458 * and cleanup errors within that address range should the remote
459 * partition go down. We don't unregister this range because it is
460 * difficult to tell when outstanding writes to the remote partition
461 * are finished and thus when it is safe to unregister. This should
462 * not result in wasted space in the SAL xp_addr_region table because
463 * we should get the same page for remote_amos_page_pa after module
464 * reloads and system reboots.
465 */
466 if (sn_register_xp_addr_region(part->remote_amos_page_pa,
467 PAGE_SIZE, 1) < 0) {
468 dev_warn(xpc_part, "xpc_partition_up(%d) failed to register "
469 "xp_addr region\n", partid);
470
471 spin_lock_irqsave(&part->act_lock, irq_flags);
472 part->act_state = XPC_P_INACTIVE;
473 XPC_SET_REASON(part, xpcPhysAddrRegFailed, __LINE__);
474 spin_unlock_irqrestore(&part->act_lock, irq_flags);
475 part->remote_rp_pa = 0;
476 return 0;
477 }
478
479 xpc_allow_hb(partid, xpc_vars);
480 xpc_IPI_send_activated(part);
481
482 /*
483 * xpc_partition_up() holds this thread and marks this partition as
484 * XPC_P_ACTIVE by calling xpc_hb_mark_active().
485 */
486 (void)xpc_partition_up(part);
487
488 xpc_disallow_hb(partid, xpc_vars);
489 xpc_mark_partition_inactive(part);
490
491 if (part->reason == xpcReactivating) {
492 /* interrupting ourselves results in activating partition */
493 xpc_IPI_send_reactivate(part);
494 }
495
496 return 0;
497}
498
499void
500xpc_activate_partition(struct xpc_partition *part)
501{
502 partid_t partid = XPC_PARTID(part);
503 unsigned long irq_flags;
504 struct task_struct *kthread;
505
506 spin_lock_irqsave(&part->act_lock, irq_flags);
507
508 DBUG_ON(part->act_state != XPC_P_INACTIVE);
509
510 part->act_state = XPC_P_ACTIVATION_REQ;
511 XPC_SET_REASON(part, xpcCloneKThread, __LINE__);
512
513 spin_unlock_irqrestore(&part->act_lock, irq_flags);
514
515 kthread = kthread_run(xpc_activating, (void *)((u64)partid), "xpc%02d",
516 partid);
517 if (IS_ERR(kthread)) {
518 spin_lock_irqsave(&part->act_lock, irq_flags);
519 part->act_state = XPC_P_INACTIVE;
520 XPC_SET_REASON(part, xpcCloneKThreadFailed, __LINE__);
521 spin_unlock_irqrestore(&part->act_lock, irq_flags);
522 }
523}
524
525/*
526 * Handle the receipt of a SGI_XPC_NOTIFY IRQ by seeing whether the specified
527 * partition actually sent it. Since SGI_XPC_NOTIFY IRQs may be shared by more
528 * than one partition, we use an AMO_t structure per partition to indicate
529 * whether a partition has sent an IPI or not. If it has, then wake up the
530 * associated kthread to handle it.
531 *
532 * All SGI_XPC_NOTIFY IRQs received by XPC are the result of IPIs sent by XPC
533 * running on other partitions.
534 *
535 * Noteworthy Arguments:
536 *
537 * irq - Interrupt ReQuest number. NOT USED.
538 *
539 * dev_id - partid of IPI's potential sender.
540 */
541irqreturn_t
542xpc_notify_IRQ_handler(int irq, void *dev_id)
543{
544 partid_t partid = (partid_t) (u64)dev_id;
545 struct xpc_partition *part = &xpc_partitions[partid];
546
547 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
548
549 if (xpc_part_ref(part)) {
550 xpc_check_for_channel_activity(part);
551
552 xpc_part_deref(part);
553 }
554 return IRQ_HANDLED;
555}
556
557/*
558 * Check to see if xpc_notify_IRQ_handler() dropped any IPIs on the floor
559 * because the write to their associated IPI amo completed after the IRQ/IPI
560 * was received.
561 */
562void
563xpc_dropped_IPI_check(struct xpc_partition *part)
564{
565 if (xpc_part_ref(part)) {
566 xpc_check_for_channel_activity(part);
567
568 part->dropped_IPI_timer.expires = jiffies +
569 XPC_P_DROPPED_IPI_WAIT;
570 add_timer(&part->dropped_IPI_timer);
571 xpc_part_deref(part);
572 }
573}
574
575void
576xpc_activate_kthreads(struct xpc_channel *ch, int needed)
577{
578 int idle = atomic_read(&ch->kthreads_idle);
579 int assigned = atomic_read(&ch->kthreads_assigned);
580 int wakeup;
581
582 DBUG_ON(needed <= 0);
583
584 if (idle > 0) {
585 wakeup = (needed > idle) ? idle : needed;
586 needed -= wakeup;
587
588 dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, "
589 "channel=%d\n", wakeup, ch->partid, ch->number);
590
591 /* only wakeup the requested number of kthreads */
592 wake_up_nr(&ch->idle_wq, wakeup);
593 }
594
595 if (needed <= 0)
596 return;
597
598 if (needed + assigned > ch->kthreads_assigned_limit) {
599 needed = ch->kthreads_assigned_limit - assigned;
600 if (needed <= 0)
601 return;
602 }
603
604 dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n",
605 needed, ch->partid, ch->number);
606
607 xpc_create_kthreads(ch, needed, 0);
608}
609
610/*
611 * This function is where XPC's kthreads wait for messages to deliver.
612 */
613static void
614xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
615{
616 do {
617 /* deliver messages to their intended recipients */
618
619 while (ch->w_local_GP.get < ch->w_remote_GP.put &&
620 !(ch->flags & XPC_C_DISCONNECTING)) {
621 xpc_deliver_msg(ch);
622 }
623
624 if (atomic_inc_return(&ch->kthreads_idle) >
625 ch->kthreads_idle_limit) {
626 /* too many idle kthreads on this channel */
627 atomic_dec(&ch->kthreads_idle);
628 break;
629 }
630
631 dev_dbg(xpc_chan, "idle kthread calling "
632 "wait_event_interruptible_exclusive()\n");
633
634 (void)wait_event_interruptible_exclusive(ch->idle_wq,
635 (ch->w_local_GP.get < ch->w_remote_GP.put ||
636 (ch->flags & XPC_C_DISCONNECTING)));
637
638 atomic_dec(&ch->kthreads_idle);
639
640 } while (!(ch->flags & XPC_C_DISCONNECTING));
641}
642
643static int
644xpc_kthread_start(void *args)
645{
646 partid_t partid = XPC_UNPACK_ARG1(args);
647 u16 ch_number = XPC_UNPACK_ARG2(args);
648 struct xpc_partition *part = &xpc_partitions[partid];
649 struct xpc_channel *ch;
650 int n_needed;
651 unsigned long irq_flags;
652
653 dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n",
654 partid, ch_number);
655
656 ch = &part->channels[ch_number];
657
658 if (!(ch->flags & XPC_C_DISCONNECTING)) {
659
660 /* let registerer know that connection has been established */
661
662 spin_lock_irqsave(&ch->lock, irq_flags);
663 if (!(ch->flags & XPC_C_CONNECTEDCALLOUT)) {
664 ch->flags |= XPC_C_CONNECTEDCALLOUT;
665 spin_unlock_irqrestore(&ch->lock, irq_flags);
666
667 xpc_connected_callout(ch);
668
669 spin_lock_irqsave(&ch->lock, irq_flags);
670 ch->flags |= XPC_C_CONNECTEDCALLOUT_MADE;
671 spin_unlock_irqrestore(&ch->lock, irq_flags);
672
673 /*
674 * It is possible that while the callout was being
675 * made that the remote partition sent some messages.
676 * If that is the case, we may need to activate
677 * additional kthreads to help deliver them. We only
678 * need one less than total #of messages to deliver.
679 */
680 n_needed = ch->w_remote_GP.put - ch->w_local_GP.get - 1;
681 if (n_needed > 0 && !(ch->flags & XPC_C_DISCONNECTING))
682 xpc_activate_kthreads(ch, n_needed);
683
684 } else {
685 spin_unlock_irqrestore(&ch->lock, irq_flags);
686 }
687
688 xpc_kthread_waitmsgs(part, ch);
689 }
690
691 /* let registerer know that connection is disconnecting */
692
693 spin_lock_irqsave(&ch->lock, irq_flags);
694 if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
695 !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
696 ch->flags |= XPC_C_DISCONNECTINGCALLOUT;
697 spin_unlock_irqrestore(&ch->lock, irq_flags);
698
699 xpc_disconnect_callout(ch, xpcDisconnecting);
700
701 spin_lock_irqsave(&ch->lock, irq_flags);
702 ch->flags |= XPC_C_DISCONNECTINGCALLOUT_MADE;
703 }
704 spin_unlock_irqrestore(&ch->lock, irq_flags);
705
706 if (atomic_dec_return(&ch->kthreads_assigned) == 0) {
707 if (atomic_dec_return(&part->nchannels_engaged) == 0) {
708 xpc_mark_partition_disengaged(part);
709 xpc_IPI_send_disengage(part);
710 }
711 }
712
713 xpc_msgqueue_deref(ch);
714
715 dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n",
716 partid, ch_number);
717
718 xpc_part_deref(part);
719 return 0;
720}
721
722/*
723 * For each partition that XPC has established communications with, there is
724 * a minimum of one kernel thread assigned to perform any operation that
725 * may potentially sleep or block (basically the callouts to the asynchronous
726 * functions registered via xpc_connect()).
727 *
728 * Additional kthreads are created and destroyed by XPC as the workload
729 * demands.
730 *
731 * A kthread is assigned to one of the active channels that exists for a given
732 * partition.
733 */
734void
735xpc_create_kthreads(struct xpc_channel *ch, int needed,
736 int ignore_disconnecting)
737{
738 unsigned long irq_flags;
739 u64 args = XPC_PACK_ARGS(ch->partid, ch->number);
740 struct xpc_partition *part = &xpc_partitions[ch->partid];
741 struct task_struct *kthread;
742
743 while (needed-- > 0) {
744
745 /*
746 * The following is done on behalf of the newly created
747 * kthread. That kthread is responsible for doing the
748 * counterpart to the following before it exits.
749 */
750 if (ignore_disconnecting) {
751 if (!atomic_inc_not_zero(&ch->kthreads_assigned)) {
752 /* kthreads assigned had gone to zero */
753 BUG_ON(!(ch->flags &
754 XPC_C_DISCONNECTINGCALLOUT_MADE));
755 break;
756 }
757
758 } else if (ch->flags & XPC_C_DISCONNECTING) {
759 break;
760
761 } else if (atomic_inc_return(&ch->kthreads_assigned) == 1) {
762 if (atomic_inc_return(&part->nchannels_engaged) == 1)
763 xpc_mark_partition_engaged(part);
764 }
765 (void)xpc_part_ref(part);
766 xpc_msgqueue_ref(ch);
767
768 kthread = kthread_run(xpc_kthread_start, (void *)args,
769 "xpc%02dc%d", ch->partid, ch->number);
770 if (IS_ERR(kthread)) {
771 /* the fork failed */
772
773 /*
774 * NOTE: if (ignore_disconnecting &&
775 * !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) is true,
776 * then we'll deadlock if all other kthreads assigned
777 * to this channel are blocked in the channel's
778 * registerer, because the only thing that will unblock
779 * them is the xpcDisconnecting callout that this
780 * failed kthread_run() would have made.
781 */
782
783 if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
784 atomic_dec_return(&part->nchannels_engaged) == 0) {
785 xpc_mark_partition_disengaged(part);
786 xpc_IPI_send_disengage(part);
787 }
788 xpc_msgqueue_deref(ch);
789 xpc_part_deref(part);
790
791 if (atomic_read(&ch->kthreads_assigned) <
792 ch->kthreads_idle_limit) {
793 /*
794 * Flag this as an error only if we have an
795 * insufficient #of kthreads for the channel
796 * to function.
797 */
798 spin_lock_irqsave(&ch->lock, irq_flags);
799 XPC_DISCONNECT_CHANNEL(ch, xpcLackOfResources,
800 &irq_flags);
801 spin_unlock_irqrestore(&ch->lock, irq_flags);
802 }
803 break;
804 }
805 }
806}
807
808void
809xpc_disconnect_wait(int ch_number)
810{
811 unsigned long irq_flags;
812 partid_t partid;
813 struct xpc_partition *part;
814 struct xpc_channel *ch;
815 int wakeup_channel_mgr;
816
817 /* now wait for all callouts to the caller's function to cease */
818 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
819 part = &xpc_partitions[partid];
820
821 if (!xpc_part_ref(part))
822 continue;
823
824 ch = &part->channels[ch_number];
825
826 if (!(ch->flags & XPC_C_WDISCONNECT)) {
827 xpc_part_deref(part);
828 continue;
829 }
830
831 wait_for_completion(&ch->wdisconnect_wait);
832
833 spin_lock_irqsave(&ch->lock, irq_flags);
834 DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
835 wakeup_channel_mgr = 0;
836
837 if (ch->delayed_IPI_flags) {
838 if (part->act_state != XPC_P_DEACTIVATING) {
839 spin_lock(&part->IPI_lock);
840 XPC_SET_IPI_FLAGS(part->local_IPI_amo,
841 ch->number,
842 ch->delayed_IPI_flags);
843 spin_unlock(&part->IPI_lock);
844 wakeup_channel_mgr = 1;
845 }
846 ch->delayed_IPI_flags = 0;
847 }
848
849 ch->flags &= ~XPC_C_WDISCONNECT;
850 spin_unlock_irqrestore(&ch->lock, irq_flags);
851
852 if (wakeup_channel_mgr)
853 xpc_wakeup_channel_mgr(part);
854
855 xpc_part_deref(part);
856 }
857}
858
859static void
860xpc_do_exit(enum xpc_retval reason)
861{
862 partid_t partid;
863 int active_part_count, printed_waiting_msg = 0;
864 struct xpc_partition *part;
865 unsigned long printmsg_time, disengage_request_timeout = 0;
866
867 /* a 'rmmod XPC' and a 'reboot' cannot both end up here together */
868 DBUG_ON(xpc_exiting == 1);
869
870 /*
871 * Let the heartbeat checker thread and the discovery thread
872 * (if one is running) know that they should exit. Also wake up
873 * the heartbeat checker thread in case it's sleeping.
874 */
875 xpc_exiting = 1;
876 wake_up_interruptible(&xpc_act_IRQ_wq);
877
878 /* ignore all incoming interrupts */
879 free_irq(SGI_XPC_ACTIVATE, NULL);
880
881 /* wait for the discovery thread to exit */
882 wait_for_completion(&xpc_discovery_exited);
883
884 /* wait for the heartbeat checker thread to exit */
885 wait_for_completion(&xpc_hb_checker_exited);
886
887 /* sleep for a 1/3 of a second or so */
888 (void)msleep_interruptible(300);
889
890 /* wait for all partitions to become inactive */
891
892 printmsg_time = jiffies + (XPC_DISENGAGE_PRINTMSG_INTERVAL * HZ);
893 xpc_disengage_request_timedout = 0;
894
895 do {
896 active_part_count = 0;
897
898 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
899 part = &xpc_partitions[partid];
900
901 if (xpc_partition_disengaged(part) &&
902 part->act_state == XPC_P_INACTIVE) {
903 continue;
904 }
905
906 active_part_count++;
907
908 XPC_DEACTIVATE_PARTITION(part, reason);
909
910 if (part->disengage_request_timeout >
911 disengage_request_timeout) {
912 disengage_request_timeout =
913 part->disengage_request_timeout;
914 }
915 }
916
917 if (xpc_partition_engaged(-1UL)) {
918 if (time_after(jiffies, printmsg_time)) {
919 dev_info(xpc_part, "waiting for remote "
920 "partitions to disengage, timeout in "
921 "%ld seconds\n",
922 (disengage_request_timeout - jiffies)
923 / HZ);
924 printmsg_time = jiffies +
925 (XPC_DISENGAGE_PRINTMSG_INTERVAL * HZ);
926 printed_waiting_msg = 1;
927 }
928
929 } else if (active_part_count > 0) {
930 if (printed_waiting_msg) {
931 dev_info(xpc_part, "waiting for local partition"
932 " to disengage\n");
933 printed_waiting_msg = 0;
934 }
935
936 } else {
937 if (!xpc_disengage_request_timedout) {
938 dev_info(xpc_part, "all partitions have "
939 "disengaged\n");
940 }
941 break;
942 }
943
944 /* sleep for a 1/3 of a second or so */
945 (void)msleep_interruptible(300);
946
947 } while (1);
948
949 DBUG_ON(xpc_partition_engaged(-1UL));
950
951 /* indicate to others that our reserved page is uninitialized */
952 xpc_rsvd_page->vars_pa = 0;
953
954 /* now it's time to eliminate our heartbeat */
955 del_timer_sync(&xpc_hb_timer);
956 DBUG_ON(xpc_vars->heartbeating_to_mask != 0);
957
958 if (reason == xpcUnloading) {
959 /* take ourselves off of the reboot_notifier_list */
960 (void)unregister_reboot_notifier(&xpc_reboot_notifier);
961
962 /* take ourselves off of the die_notifier list */
963 (void)unregister_die_notifier(&xpc_die_notifier);
964 }
965
966 /* close down protections for IPI operations */
967 xpc_restrict_IPI_ops();
968
969 /* clear the interface to XPC's functions */
970 xpc_clear_interface();
971
972 if (xpc_sysctl)
973 unregister_sysctl_table(xpc_sysctl);
974
975 kfree(xpc_remote_copy_buffer_base);
976}
977
978/*
979 * This function is called when the system is being rebooted.
980 */
981static int
982xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
983{
984 enum xpc_retval reason;
985
986 switch (event) {
987 case SYS_RESTART:
988 reason = xpcSystemReboot;
989 break;
990 case SYS_HALT:
991 reason = xpcSystemHalt;
992 break;
993 case SYS_POWER_OFF:
994 reason = xpcSystemPoweroff;
995 break;
996 default:
997 reason = xpcSystemGoingDown;
998 }
999
1000 xpc_do_exit(reason);
1001 return NOTIFY_DONE;
1002}
1003
1004/*
1005 * Notify other partitions to disengage from all references to our memory.
1006 */
1007static void
1008xpc_die_disengage(void)
1009{
1010 struct xpc_partition *part;
1011 partid_t partid;
1012 unsigned long engaged;
1013 long time, printmsg_time, disengage_request_timeout;
1014
1015 /* keep xpc_hb_checker thread from doing anything (just in case) */
1016 xpc_exiting = 1;
1017
1018 xpc_vars->heartbeating_to_mask = 0; /* indicate we're deactivated */
1019
1020 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
1021 part = &xpc_partitions[partid];
1022
1023 if (!XPC_SUPPORTS_DISENGAGE_REQUEST(part->
1024 remote_vars_version)) {
1025
1026 /* just in case it was left set by an earlier XPC */
1027 xpc_clear_partition_engaged(1UL << partid);
1028 continue;
1029 }
1030
1031 if (xpc_partition_engaged(1UL << partid) ||
1032 part->act_state != XPC_P_INACTIVE) {
1033 xpc_request_partition_disengage(part);
1034 xpc_mark_partition_disengaged(part);
1035 xpc_IPI_send_disengage(part);
1036 }
1037 }
1038
1039 time = rtc_time();
1040 printmsg_time = time +
1041 (XPC_DISENGAGE_PRINTMSG_INTERVAL * sn_rtc_cycles_per_second);
1042 disengage_request_timeout = time +
1043 (xpc_disengage_request_timelimit * sn_rtc_cycles_per_second);
1044
1045 /* wait for all other partitions to disengage from us */
1046
1047 while (1) {
1048 engaged = xpc_partition_engaged(-1UL);
1049 if (!engaged) {
1050 dev_info(xpc_part, "all partitions have disengaged\n");
1051 break;
1052 }
1053
1054 time = rtc_time();
1055 if (time >= disengage_request_timeout) {
1056 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
1057 if (engaged & (1UL << partid)) {
1058 dev_info(xpc_part, "disengage from "
1059 "remote partition %d timed "
1060 "out\n", partid);
1061 }
1062 }
1063 break;
1064 }
1065
1066 if (time >= printmsg_time) {
1067 dev_info(xpc_part, "waiting for remote partitions to "
1068 "disengage, timeout in %ld seconds\n",
1069 (disengage_request_timeout - time) /
1070 sn_rtc_cycles_per_second);
1071 printmsg_time = time +
1072 (XPC_DISENGAGE_PRINTMSG_INTERVAL *
1073 sn_rtc_cycles_per_second);
1074 }
1075 }
1076}
1077
1078/*
1079 * This function is called when the system is being restarted or halted due
1080 * to some sort of system failure. If this is the case we need to notify the
1081 * other partitions to disengage from all references to our memory.
1082 * This function can also be called when our heartbeater could be offlined
1083 * for a time. In this case we need to notify other partitions to not worry
1084 * about the lack of a heartbeat.
1085 */
1086static int
1087xpc_system_die(struct notifier_block *nb, unsigned long event, void *unused)
1088{
1089 switch (event) {
1090 case DIE_MACHINE_RESTART:
1091 case DIE_MACHINE_HALT:
1092 xpc_die_disengage();
1093 break;
1094
1095 case DIE_KDEBUG_ENTER:
1096 /* Should lack of heartbeat be ignored by other partitions? */
1097 if (!xpc_kdebug_ignore)
1098 break;
1099
1100 /* fall through */
1101 case DIE_MCA_MONARCH_ENTER:
1102 case DIE_INIT_MONARCH_ENTER:
1103 xpc_vars->heartbeat++;
1104 xpc_vars->heartbeat_offline = 1;
1105 break;
1106
1107 case DIE_KDEBUG_LEAVE:
1108 /* Is lack of heartbeat being ignored by other partitions? */
1109 if (!xpc_kdebug_ignore)
1110 break;
1111
1112 /* fall through */
1113 case DIE_MCA_MONARCH_LEAVE:
1114 case DIE_INIT_MONARCH_LEAVE:
1115 xpc_vars->heartbeat++;
1116 xpc_vars->heartbeat_offline = 0;
1117 break;
1118 }
1119
1120 return NOTIFY_DONE;
1121}
1122
1123int __init
1124xpc_init(void)
1125{
1126 int ret;
1127 partid_t partid;
1128 struct xpc_partition *part;
1129 struct task_struct *kthread;
1130 size_t buf_size;
1131
1132 if (!ia64_platform_is("sn2"))
1133 return -ENODEV;
1134
1135 buf_size = max(XPC_RP_VARS_SIZE,
1136 XPC_RP_HEADER_SIZE + XP_NASID_MASK_BYTES);
1137 xpc_remote_copy_buffer = xpc_kmalloc_cacheline_aligned(buf_size,
1138 GFP_KERNEL,
1139 &xpc_remote_copy_buffer_base);
1140 if (xpc_remote_copy_buffer == NULL)
1141 return -ENOMEM;
1142
1143 snprintf(xpc_part->bus_id, BUS_ID_SIZE, "part");
1144 snprintf(xpc_chan->bus_id, BUS_ID_SIZE, "chan");
1145
1146 xpc_sysctl = register_sysctl_table(xpc_sys_dir);
1147
1148 /*
1149 * The first few fields of each entry of xpc_partitions[] need to
1150 * be initialized now so that calls to xpc_connect() and
1151 * xpc_disconnect() can be made prior to the activation of any remote
1152 * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
1153 * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
1154 * PARTITION HAS BEEN ACTIVATED.
1155 */
1156 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
1157 part = &xpc_partitions[partid];
1158
1159 DBUG_ON((u64)part != L1_CACHE_ALIGN((u64)part));
1160
1161 part->act_IRQ_rcvd = 0;
1162 spin_lock_init(&part->act_lock);
1163 part->act_state = XPC_P_INACTIVE;
1164 XPC_SET_REASON(part, 0, 0);
1165
1166 init_timer(&part->disengage_request_timer);
1167 part->disengage_request_timer.function =
1168 xpc_timeout_partition_disengage_request;
1169 part->disengage_request_timer.data = (unsigned long)part;
1170
1171 part->setup_state = XPC_P_UNSET;
1172 init_waitqueue_head(&part->teardown_wq);
1173 atomic_set(&part->references, 0);
1174 }
1175
1176 /*
1177 * Open up protections for IPI operations (and AMO operations on
1178 * Shub 1.1 systems).
1179 */
1180 xpc_allow_IPI_ops();
1181
1182 /*
1183 * Interrupts being processed will increment this atomic variable and
1184 * awaken the heartbeat thread which will process the interrupts.
1185 */
1186 atomic_set(&xpc_act_IRQ_rcvd, 0);
1187
1188 /*
1189 * This is safe to do before the xpc_hb_checker thread has started
1190 * because the handler releases a wait queue. If an interrupt is
1191 * received before the thread is waiting, it will not go to sleep,
1192 * but rather immediately process the interrupt.
1193 */
1194 ret = request_irq(SGI_XPC_ACTIVATE, xpc_act_IRQ_handler, 0,
1195 "xpc hb", NULL);
1196 if (ret != 0) {
1197 dev_err(xpc_part, "can't register ACTIVATE IRQ handler, "
1198 "errno=%d\n", -ret);
1199
1200 xpc_restrict_IPI_ops();
1201
1202 if (xpc_sysctl)
1203 unregister_sysctl_table(xpc_sysctl);
1204
1205 kfree(xpc_remote_copy_buffer_base);
1206 return -EBUSY;
1207 }
1208
1209 /*
1210 * Fill the partition reserved page with the information needed by
1211 * other partitions to discover we are alive and establish initial
1212 * communications.
1213 */
1214 xpc_rsvd_page = xpc_rsvd_page_init();
1215 if (xpc_rsvd_page == NULL) {
1216 dev_err(xpc_part, "could not setup our reserved page\n");
1217
1218 free_irq(SGI_XPC_ACTIVATE, NULL);
1219 xpc_restrict_IPI_ops();
1220
1221 if (xpc_sysctl)
1222 unregister_sysctl_table(xpc_sysctl);
1223
1224 kfree(xpc_remote_copy_buffer_base);
1225 return -EBUSY;
1226 }
1227
1228 /* add ourselves to the reboot_notifier_list */
1229 ret = register_reboot_notifier(&xpc_reboot_notifier);
1230 if (ret != 0)
1231 dev_warn(xpc_part, "can't register reboot notifier\n");
1232
1233 /* add ourselves to the die_notifier list */
1234 ret = register_die_notifier(&xpc_die_notifier);
1235 if (ret != 0)
1236 dev_warn(xpc_part, "can't register die notifier\n");
1237
1238 init_timer(&xpc_hb_timer);
1239 xpc_hb_timer.function = xpc_hb_beater;
1240
1241 /*
1242 * The real work-horse behind xpc. This processes incoming
1243 * interrupts and monitors remote heartbeats.
1244 */
1245 kthread = kthread_run(xpc_hb_checker, NULL, XPC_HB_CHECK_THREAD_NAME);
1246 if (IS_ERR(kthread)) {
1247 dev_err(xpc_part, "failed while forking hb check thread\n");
1248
1249 /* indicate to others that our reserved page is uninitialized */
1250 xpc_rsvd_page->vars_pa = 0;
1251
1252 /* take ourselves off of the reboot_notifier_list */
1253 (void)unregister_reboot_notifier(&xpc_reboot_notifier);
1254
1255 /* take ourselves off of the die_notifier list */
1256 (void)unregister_die_notifier(&xpc_die_notifier);
1257
1258 del_timer_sync(&xpc_hb_timer);
1259 free_irq(SGI_XPC_ACTIVATE, NULL);
1260 xpc_restrict_IPI_ops();
1261
1262 if (xpc_sysctl)
1263 unregister_sysctl_table(xpc_sysctl);
1264
1265 kfree(xpc_remote_copy_buffer_base);
1266 return -EBUSY;
1267 }
1268
1269 /*
1270 * Startup a thread that will attempt to discover other partitions to
1271 * activate based on info provided by SAL. This new thread is short
1272 * lived and will exit once discovery is complete.
1273 */
1274 kthread = kthread_run(xpc_initiate_discovery, NULL,
1275 XPC_DISCOVERY_THREAD_NAME);
1276 if (IS_ERR(kthread)) {
1277 dev_err(xpc_part, "failed while forking discovery thread\n");
1278
1279 /* mark this new thread as a non-starter */
1280 complete(&xpc_discovery_exited);
1281
1282 xpc_do_exit(xpcUnloading);
1283 return -EBUSY;
1284 }
1285
1286 /* set the interface to point at XPC's functions */
1287 xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
1288 xpc_initiate_allocate, xpc_initiate_send,
1289 xpc_initiate_send_notify, xpc_initiate_received,
1290 xpc_initiate_partid_to_nasids);
1291
1292 return 0;
1293}
1294
1295module_init(xpc_init);
1296
1297void __exit
1298xpc_exit(void)
1299{
1300 xpc_do_exit(xpcUnloading);
1301}
1302
1303module_exit(xpc_exit);
1304
1305MODULE_AUTHOR("Silicon Graphics, Inc.");
1306MODULE_DESCRIPTION("Cross Partition Communication (XPC) support");
1307MODULE_LICENSE("GPL");
1308
1309module_param(xpc_hb_interval, int, 0);
1310MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between "
1311 "heartbeat increments.");
1312
1313module_param(xpc_hb_check_interval, int, 0);
1314MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
1315 "heartbeat checks.");
1316
1317module_param(xpc_disengage_request_timelimit, int, 0);
1318MODULE_PARM_DESC(xpc_disengage_request_timelimit, "Number of seconds to wait "
1319 "for disengage request to complete.");
1320
1321module_param(xpc_kdebug_ignore, int, 0);
1322MODULE_PARM_DESC(xpc_kdebug_ignore, "Should lack of heartbeat be ignored by "
1323 "other partitions when dropping into kdebug.");
diff --git a/drivers/misc/sgi-xp/xpc_partition.c b/drivers/misc/sgi-xp/xpc_partition.c
new file mode 100644
index 000000000000..27e200ec5826
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_partition.c
@@ -0,0 +1,1174 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9/*
10 * Cross Partition Communication (XPC) partition support.
11 *
12 * This is the part of XPC that detects the presence/absence of
13 * other partitions. It provides a heartbeat and monitors the
14 * heartbeats of other partitions.
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/sysctl.h>
20#include <linux/cache.h>
21#include <linux/mmzone.h>
22#include <linux/nodemask.h>
23#include <asm/uncached.h>
24#include <asm/sn/bte.h>
25#include <asm/sn/intr.h>
26#include <asm/sn/sn_sal.h>
27#include <asm/sn/nodepda.h>
28#include <asm/sn/addrs.h>
29#include "xpc.h"
30
31/* XPC is exiting flag */
32int xpc_exiting;
33
34/* SH_IPI_ACCESS shub register value on startup */
35static u64 xpc_sh1_IPI_access;
36static u64 xpc_sh2_IPI_access0;
37static u64 xpc_sh2_IPI_access1;
38static u64 xpc_sh2_IPI_access2;
39static u64 xpc_sh2_IPI_access3;
40
41/* original protection values for each node */
42u64 xpc_prot_vec[MAX_NUMNODES];
43
44/* this partition's reserved page pointers */
45struct xpc_rsvd_page *xpc_rsvd_page;
46static u64 *xpc_part_nasids;
47static u64 *xpc_mach_nasids;
48struct xpc_vars *xpc_vars;
49struct xpc_vars_part *xpc_vars_part;
50
51static int xp_nasid_mask_bytes; /* actual size in bytes of nasid mask */
52static int xp_nasid_mask_words; /* actual size in words of nasid mask */
53
54/*
55 * For performance reasons, each entry of xpc_partitions[] is cacheline
56 * aligned. And xpc_partitions[] is padded with an additional entry at the
57 * end so that the last legitimate entry doesn't share its cacheline with
58 * another variable.
59 */
60struct xpc_partition xpc_partitions[XP_MAX_PARTITIONS + 1];
61
62/*
63 * Generic buffer used to store a local copy of portions of a remote
64 * partition's reserved page (either its header and part_nasids mask,
65 * or its vars).
66 */
67char *xpc_remote_copy_buffer;
68void *xpc_remote_copy_buffer_base;
69
70/*
71 * Guarantee that the kmalloc'd memory is cacheline aligned.
72 */
73void *
74xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
75{
76 /* see if kmalloc will give us cachline aligned memory by default */
77 *base = kmalloc(size, flags);
78 if (*base == NULL)
79 return NULL;
80
81 if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
82 return *base;
83
84 kfree(*base);
85
86 /* nope, we'll have to do it ourselves */
87 *base = kmalloc(size + L1_CACHE_BYTES, flags);
88 if (*base == NULL)
89 return NULL;
90
91 return (void *)L1_CACHE_ALIGN((u64)*base);
92}
93
94/*
95 * Given a nasid, get the physical address of the partition's reserved page
96 * for that nasid. This function returns 0 on any error.
97 */
98static u64
99xpc_get_rsvd_page_pa(int nasid)
100{
101 bte_result_t bte_res;
102 s64 status;
103 u64 cookie = 0;
104 u64 rp_pa = nasid; /* seed with nasid */
105 u64 len = 0;
106 u64 buf = buf;
107 u64 buf_len = 0;
108 void *buf_base = NULL;
109
110 while (1) {
111
112 status = sn_partition_reserved_page_pa(buf, &cookie, &rp_pa,
113 &len);
114
115 dev_dbg(xpc_part, "SAL returned with status=%li, cookie="
116 "0x%016lx, address=0x%016lx, len=0x%016lx\n",
117 status, cookie, rp_pa, len);
118
119 if (status != SALRET_MORE_PASSES)
120 break;
121
122 if (L1_CACHE_ALIGN(len) > buf_len) {
123 kfree(buf_base);
124 buf_len = L1_CACHE_ALIGN(len);
125 buf = (u64)xpc_kmalloc_cacheline_aligned(buf_len,
126 GFP_KERNEL,
127 &buf_base);
128 if (buf_base == NULL) {
129 dev_err(xpc_part, "unable to kmalloc "
130 "len=0x%016lx\n", buf_len);
131 status = SALRET_ERROR;
132 break;
133 }
134 }
135
136 bte_res = xp_bte_copy(rp_pa, buf, buf_len,
137 (BTE_NOTIFY | BTE_WACQUIRE), NULL);
138 if (bte_res != BTE_SUCCESS) {
139 dev_dbg(xpc_part, "xp_bte_copy failed %i\n", bte_res);
140 status = SALRET_ERROR;
141 break;
142 }
143 }
144
145 kfree(buf_base);
146
147 if (status != SALRET_OK)
148 rp_pa = 0;
149
150 dev_dbg(xpc_part, "reserved page at phys address 0x%016lx\n", rp_pa);
151 return rp_pa;
152}
153
154/*
155 * Fill the partition reserved page with the information needed by
156 * other partitions to discover we are alive and establish initial
157 * communications.
158 */
159struct xpc_rsvd_page *
160xpc_rsvd_page_init(void)
161{
162 struct xpc_rsvd_page *rp;
163 AMO_t *amos_page;
164 u64 rp_pa, nasid_array = 0;
165 int i, ret;
166
167 /* get the local reserved page's address */
168
169 preempt_disable();
170 rp_pa = xpc_get_rsvd_page_pa(cpuid_to_nasid(smp_processor_id()));
171 preempt_enable();
172 if (rp_pa == 0) {
173 dev_err(xpc_part, "SAL failed to locate the reserved page\n");
174 return NULL;
175 }
176 rp = (struct xpc_rsvd_page *)__va(rp_pa);
177
178 if (rp->partid != sn_partition_id) {
179 dev_err(xpc_part, "the reserved page's partid of %d should be "
180 "%d\n", rp->partid, sn_partition_id);
181 return NULL;
182 }
183
184 rp->version = XPC_RP_VERSION;
185
186 /* establish the actual sizes of the nasid masks */
187 if (rp->SAL_version == 1) {
188 /* SAL_version 1 didn't set the nasids_size field */
189 rp->nasids_size = 128;
190 }
191 xp_nasid_mask_bytes = rp->nasids_size;
192 xp_nasid_mask_words = xp_nasid_mask_bytes / 8;
193
194 /* setup the pointers to the various items in the reserved page */
195 xpc_part_nasids = XPC_RP_PART_NASIDS(rp);
196 xpc_mach_nasids = XPC_RP_MACH_NASIDS(rp);
197 xpc_vars = XPC_RP_VARS(rp);
198 xpc_vars_part = XPC_RP_VARS_PART(rp);
199
200 /*
201 * Before clearing xpc_vars, see if a page of AMOs had been previously
202 * allocated. If not we'll need to allocate one and set permissions
203 * so that cross-partition AMOs are allowed.
204 *
205 * The allocated AMO page needs MCA reporting to remain disabled after
206 * XPC has unloaded. To make this work, we keep a copy of the pointer
207 * to this page (i.e., amos_page) in the struct xpc_vars structure,
208 * which is pointed to by the reserved page, and re-use that saved copy
209 * on subsequent loads of XPC. This AMO page is never freed, and its
210 * memory protections are never restricted.
211 */
212 amos_page = xpc_vars->amos_page;
213 if (amos_page == NULL) {
214 amos_page = (AMO_t *)TO_AMO(uncached_alloc_page(0));
215 if (amos_page == NULL) {
216 dev_err(xpc_part, "can't allocate page of AMOs\n");
217 return NULL;
218 }
219
220 /*
221 * Open up AMO-R/W to cpu. This is done for Shub 1.1 systems
222 * when xpc_allow_IPI_ops() is called via xpc_hb_init().
223 */
224 if (!enable_shub_wars_1_1()) {
225 ret = sn_change_memprotect(ia64_tpa((u64)amos_page),
226 PAGE_SIZE,
227 SN_MEMPROT_ACCESS_CLASS_1,
228 &nasid_array);
229 if (ret != 0) {
230 dev_err(xpc_part, "can't change memory "
231 "protections\n");
232 uncached_free_page(__IA64_UNCACHED_OFFSET |
233 TO_PHYS((u64)amos_page));
234 return NULL;
235 }
236 }
237 } else if (!IS_AMO_ADDRESS((u64)amos_page)) {
238 /*
239 * EFI's XPBOOT can also set amos_page in the reserved page,
240 * but it happens to leave it as an uncached physical address
241 * and we need it to be an uncached virtual, so we'll have to
242 * convert it.
243 */
244 if (!IS_AMO_PHYS_ADDRESS((u64)amos_page)) {
245 dev_err(xpc_part, "previously used amos_page address "
246 "is bad = 0x%p\n", (void *)amos_page);
247 return NULL;
248 }
249 amos_page = (AMO_t *)TO_AMO((u64)amos_page);
250 }
251
252 /* clear xpc_vars */
253 memset(xpc_vars, 0, sizeof(struct xpc_vars));
254
255 xpc_vars->version = XPC_V_VERSION;
256 xpc_vars->act_nasid = cpuid_to_nasid(0);
257 xpc_vars->act_phys_cpuid = cpu_physical_id(0);
258 xpc_vars->vars_part_pa = __pa(xpc_vars_part);
259 xpc_vars->amos_page_pa = ia64_tpa((u64)amos_page);
260 xpc_vars->amos_page = amos_page; /* save for next load of XPC */
261
262 /* clear xpc_vars_part */
263 memset((u64 *)xpc_vars_part, 0, sizeof(struct xpc_vars_part) *
264 XP_MAX_PARTITIONS);
265
266 /* initialize the activate IRQ related AMO variables */
267 for (i = 0; i < xp_nasid_mask_words; i++)
268 (void)xpc_IPI_init(XPC_ACTIVATE_IRQ_AMOS + i);
269
270 /* initialize the engaged remote partitions related AMO variables */
271 (void)xpc_IPI_init(XPC_ENGAGED_PARTITIONS_AMO);
272 (void)xpc_IPI_init(XPC_DISENGAGE_REQUEST_AMO);
273
274 /* timestamp of when reserved page was setup by XPC */
275 rp->stamp = CURRENT_TIME;
276
277 /*
278 * This signifies to the remote partition that our reserved
279 * page is initialized.
280 */
281 rp->vars_pa = __pa(xpc_vars);
282
283 return rp;
284}
285
286/*
287 * Change protections to allow IPI operations (and AMO operations on
288 * Shub 1.1 systems).
289 */
290void
291xpc_allow_IPI_ops(void)
292{
293 int node;
294 int nasid;
295
296 /* >>> Change SH_IPI_ACCESS code to use SAL call once it is available */
297
298 if (is_shub2()) {
299 xpc_sh2_IPI_access0 =
300 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS0));
301 xpc_sh2_IPI_access1 =
302 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS1));
303 xpc_sh2_IPI_access2 =
304 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS2));
305 xpc_sh2_IPI_access3 =
306 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS3));
307
308 for_each_online_node(node) {
309 nasid = cnodeid_to_nasid(node);
310 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS0),
311 -1UL);
312 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS1),
313 -1UL);
314 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS2),
315 -1UL);
316 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS3),
317 -1UL);
318 }
319
320 } else {
321 xpc_sh1_IPI_access =
322 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH1_IPI_ACCESS));
323
324 for_each_online_node(node) {
325 nasid = cnodeid_to_nasid(node);
326 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH1_IPI_ACCESS),
327 -1UL);
328
329 /*
330 * Since the BIST collides with memory operations on
331 * SHUB 1.1 sn_change_memprotect() cannot be used.
332 */
333 if (enable_shub_wars_1_1()) {
334 /* open up everything */
335 xpc_prot_vec[node] = (u64)HUB_L((u64 *)
336 GLOBAL_MMR_ADDR
337 (nasid,
338 SH1_MD_DQLP_MMR_DIR_PRIVEC0));
339 HUB_S((u64 *)
340 GLOBAL_MMR_ADDR(nasid,
341 SH1_MD_DQLP_MMR_DIR_PRIVEC0),
342 -1UL);
343 HUB_S((u64 *)
344 GLOBAL_MMR_ADDR(nasid,
345 SH1_MD_DQRP_MMR_DIR_PRIVEC0),
346 -1UL);
347 }
348 }
349 }
350}
351
352/*
353 * Restrict protections to disallow IPI operations (and AMO operations on
354 * Shub 1.1 systems).
355 */
356void
357xpc_restrict_IPI_ops(void)
358{
359 int node;
360 int nasid;
361
362 /* >>> Change SH_IPI_ACCESS code to use SAL call once it is available */
363
364 if (is_shub2()) {
365
366 for_each_online_node(node) {
367 nasid = cnodeid_to_nasid(node);
368 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS0),
369 xpc_sh2_IPI_access0);
370 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS1),
371 xpc_sh2_IPI_access1);
372 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS2),
373 xpc_sh2_IPI_access2);
374 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS3),
375 xpc_sh2_IPI_access3);
376 }
377
378 } else {
379
380 for_each_online_node(node) {
381 nasid = cnodeid_to_nasid(node);
382 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH1_IPI_ACCESS),
383 xpc_sh1_IPI_access);
384
385 if (enable_shub_wars_1_1()) {
386 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid,
387 SH1_MD_DQLP_MMR_DIR_PRIVEC0),
388 xpc_prot_vec[node]);
389 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid,
390 SH1_MD_DQRP_MMR_DIR_PRIVEC0),
391 xpc_prot_vec[node]);
392 }
393 }
394 }
395}
396
397/*
398 * At periodic intervals, scan through all active partitions and ensure
399 * their heartbeat is still active. If not, the partition is deactivated.
400 */
401void
402xpc_check_remote_hb(void)
403{
404 struct xpc_vars *remote_vars;
405 struct xpc_partition *part;
406 partid_t partid;
407 bte_result_t bres;
408
409 remote_vars = (struct xpc_vars *)xpc_remote_copy_buffer;
410
411 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
412
413 if (xpc_exiting)
414 break;
415
416 if (partid == sn_partition_id)
417 continue;
418
419 part = &xpc_partitions[partid];
420
421 if (part->act_state == XPC_P_INACTIVE ||
422 part->act_state == XPC_P_DEACTIVATING) {
423 continue;
424 }
425
426 /* pull the remote_hb cache line */
427 bres = xp_bte_copy(part->remote_vars_pa,
428 (u64)remote_vars,
429 XPC_RP_VARS_SIZE,
430 (BTE_NOTIFY | BTE_WACQUIRE), NULL);
431 if (bres != BTE_SUCCESS) {
432 XPC_DEACTIVATE_PARTITION(part,
433 xpc_map_bte_errors(bres));
434 continue;
435 }
436
437 dev_dbg(xpc_part, "partid = %d, heartbeat = %ld, last_heartbeat"
438 " = %ld, heartbeat_offline = %ld, HB_mask = 0x%lx\n",
439 partid, remote_vars->heartbeat, part->last_heartbeat,
440 remote_vars->heartbeat_offline,
441 remote_vars->heartbeating_to_mask);
442
443 if (((remote_vars->heartbeat == part->last_heartbeat) &&
444 (remote_vars->heartbeat_offline == 0)) ||
445 !xpc_hb_allowed(sn_partition_id, remote_vars)) {
446
447 XPC_DEACTIVATE_PARTITION(part, xpcNoHeartbeat);
448 continue;
449 }
450
451 part->last_heartbeat = remote_vars->heartbeat;
452 }
453}
454
455/*
456 * Get a copy of a portion of the remote partition's rsvd page.
457 *
458 * remote_rp points to a buffer that is cacheline aligned for BTE copies and
459 * is large enough to contain a copy of their reserved page header and
460 * part_nasids mask.
461 */
462static enum xpc_retval
463xpc_get_remote_rp(int nasid, u64 *discovered_nasids,
464 struct xpc_rsvd_page *remote_rp, u64 *remote_rp_pa)
465{
466 int bres, i;
467
468 /* get the reserved page's physical address */
469
470 *remote_rp_pa = xpc_get_rsvd_page_pa(nasid);
471 if (*remote_rp_pa == 0)
472 return xpcNoRsvdPageAddr;
473
474 /* pull over the reserved page header and part_nasids mask */
475 bres = xp_bte_copy(*remote_rp_pa, (u64)remote_rp,
476 XPC_RP_HEADER_SIZE + xp_nasid_mask_bytes,
477 (BTE_NOTIFY | BTE_WACQUIRE), NULL);
478 if (bres != BTE_SUCCESS)
479 return xpc_map_bte_errors(bres);
480
481 if (discovered_nasids != NULL) {
482 u64 *remote_part_nasids = XPC_RP_PART_NASIDS(remote_rp);
483
484 for (i = 0; i < xp_nasid_mask_words; i++)
485 discovered_nasids[i] |= remote_part_nasids[i];
486 }
487
488 /* check that the partid is for another partition */
489
490 if (remote_rp->partid < 1 ||
491 remote_rp->partid > (XP_MAX_PARTITIONS - 1)) {
492 return xpcInvalidPartid;
493 }
494
495 if (remote_rp->partid == sn_partition_id)
496 return xpcLocalPartid;
497
498 if (XPC_VERSION_MAJOR(remote_rp->version) !=
499 XPC_VERSION_MAJOR(XPC_RP_VERSION)) {
500 return xpcBadVersion;
501 }
502
503 return xpcSuccess;
504}
505
506/*
507 * Get a copy of the remote partition's XPC variables from the reserved page.
508 *
509 * remote_vars points to a buffer that is cacheline aligned for BTE copies and
510 * assumed to be of size XPC_RP_VARS_SIZE.
511 */
512static enum xpc_retval
513xpc_get_remote_vars(u64 remote_vars_pa, struct xpc_vars *remote_vars)
514{
515 int bres;
516
517 if (remote_vars_pa == 0)
518 return xpcVarsNotSet;
519
520 /* pull over the cross partition variables */
521 bres = xp_bte_copy(remote_vars_pa, (u64)remote_vars, XPC_RP_VARS_SIZE,
522 (BTE_NOTIFY | BTE_WACQUIRE), NULL);
523 if (bres != BTE_SUCCESS)
524 return xpc_map_bte_errors(bres);
525
526 if (XPC_VERSION_MAJOR(remote_vars->version) !=
527 XPC_VERSION_MAJOR(XPC_V_VERSION)) {
528 return xpcBadVersion;
529 }
530
531 return xpcSuccess;
532}
533
534/*
535 * Update the remote partition's info.
536 */
537static void
538xpc_update_partition_info(struct xpc_partition *part, u8 remote_rp_version,
539 struct timespec *remote_rp_stamp, u64 remote_rp_pa,
540 u64 remote_vars_pa, struct xpc_vars *remote_vars)
541{
542 part->remote_rp_version = remote_rp_version;
543 dev_dbg(xpc_part, " remote_rp_version = 0x%016x\n",
544 part->remote_rp_version);
545
546 part->remote_rp_stamp = *remote_rp_stamp;
547 dev_dbg(xpc_part, " remote_rp_stamp (tv_sec = 0x%lx tv_nsec = 0x%lx\n",
548 part->remote_rp_stamp.tv_sec, part->remote_rp_stamp.tv_nsec);
549
550 part->remote_rp_pa = remote_rp_pa;
551 dev_dbg(xpc_part, " remote_rp_pa = 0x%016lx\n", part->remote_rp_pa);
552
553 part->remote_vars_pa = remote_vars_pa;
554 dev_dbg(xpc_part, " remote_vars_pa = 0x%016lx\n",
555 part->remote_vars_pa);
556
557 part->last_heartbeat = remote_vars->heartbeat;
558 dev_dbg(xpc_part, " last_heartbeat = 0x%016lx\n",
559 part->last_heartbeat);
560
561 part->remote_vars_part_pa = remote_vars->vars_part_pa;
562 dev_dbg(xpc_part, " remote_vars_part_pa = 0x%016lx\n",
563 part->remote_vars_part_pa);
564
565 part->remote_act_nasid = remote_vars->act_nasid;
566 dev_dbg(xpc_part, " remote_act_nasid = 0x%x\n",
567 part->remote_act_nasid);
568
569 part->remote_act_phys_cpuid = remote_vars->act_phys_cpuid;
570 dev_dbg(xpc_part, " remote_act_phys_cpuid = 0x%x\n",
571 part->remote_act_phys_cpuid);
572
573 part->remote_amos_page_pa = remote_vars->amos_page_pa;
574 dev_dbg(xpc_part, " remote_amos_page_pa = 0x%lx\n",
575 part->remote_amos_page_pa);
576
577 part->remote_vars_version = remote_vars->version;
578 dev_dbg(xpc_part, " remote_vars_version = 0x%x\n",
579 part->remote_vars_version);
580}
581
582/*
583 * Prior code has determined the nasid which generated an IPI. Inspect
584 * that nasid to determine if its partition needs to be activated or
585 * deactivated.
586 *
587 * A partition is consider "awaiting activation" if our partition
588 * flags indicate it is not active and it has a heartbeat. A
589 * partition is considered "awaiting deactivation" if our partition
590 * flags indicate it is active but it has no heartbeat or it is not
591 * sending its heartbeat to us.
592 *
593 * To determine the heartbeat, the remote nasid must have a properly
594 * initialized reserved page.
595 */
596static void
597xpc_identify_act_IRQ_req(int nasid)
598{
599 struct xpc_rsvd_page *remote_rp;
600 struct xpc_vars *remote_vars;
601 u64 remote_rp_pa;
602 u64 remote_vars_pa;
603 int remote_rp_version;
604 int reactivate = 0;
605 int stamp_diff;
606 struct timespec remote_rp_stamp = { 0, 0 };
607 partid_t partid;
608 struct xpc_partition *part;
609 enum xpc_retval ret;
610
611 /* pull over the reserved page structure */
612
613 remote_rp = (struct xpc_rsvd_page *)xpc_remote_copy_buffer;
614
615 ret = xpc_get_remote_rp(nasid, NULL, remote_rp, &remote_rp_pa);
616 if (ret != xpcSuccess) {
617 dev_warn(xpc_part, "unable to get reserved page from nasid %d, "
618 "which sent interrupt, reason=%d\n", nasid, ret);
619 return;
620 }
621
622 remote_vars_pa = remote_rp->vars_pa;
623 remote_rp_version = remote_rp->version;
624 if (XPC_SUPPORTS_RP_STAMP(remote_rp_version))
625 remote_rp_stamp = remote_rp->stamp;
626
627 partid = remote_rp->partid;
628 part = &xpc_partitions[partid];
629
630 /* pull over the cross partition variables */
631
632 remote_vars = (struct xpc_vars *)xpc_remote_copy_buffer;
633
634 ret = xpc_get_remote_vars(remote_vars_pa, remote_vars);
635 if (ret != xpcSuccess) {
636
637 dev_warn(xpc_part, "unable to get XPC variables from nasid %d, "
638 "which sent interrupt, reason=%d\n", nasid, ret);
639
640 XPC_DEACTIVATE_PARTITION(part, ret);
641 return;
642 }
643
644 part->act_IRQ_rcvd++;
645
646 dev_dbg(xpc_part, "partid for nasid %d is %d; IRQs = %d; HB = "
647 "%ld:0x%lx\n", (int)nasid, (int)partid, part->act_IRQ_rcvd,
648 remote_vars->heartbeat, remote_vars->heartbeating_to_mask);
649
650 if (xpc_partition_disengaged(part) &&
651 part->act_state == XPC_P_INACTIVE) {
652
653 xpc_update_partition_info(part, remote_rp_version,
654 &remote_rp_stamp, remote_rp_pa,
655 remote_vars_pa, remote_vars);
656
657 if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) {
658 if (xpc_partition_disengage_requested(1UL << partid)) {
659 /*
660 * Other side is waiting on us to disengage,
661 * even though we already have.
662 */
663 return;
664 }
665 } else {
666 /* other side doesn't support disengage requests */
667 xpc_clear_partition_disengage_request(1UL << partid);
668 }
669
670 xpc_activate_partition(part);
671 return;
672 }
673
674 DBUG_ON(part->remote_rp_version == 0);
675 DBUG_ON(part->remote_vars_version == 0);
676
677 if (!XPC_SUPPORTS_RP_STAMP(part->remote_rp_version)) {
678 DBUG_ON(XPC_SUPPORTS_DISENGAGE_REQUEST(part->
679 remote_vars_version));
680
681 if (!XPC_SUPPORTS_RP_STAMP(remote_rp_version)) {
682 DBUG_ON(XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->
683 version));
684 /* see if the other side rebooted */
685 if (part->remote_amos_page_pa ==
686 remote_vars->amos_page_pa &&
687 xpc_hb_allowed(sn_partition_id, remote_vars)) {
688 /* doesn't look that way, so ignore the IPI */
689 return;
690 }
691 }
692
693 /*
694 * Other side rebooted and previous XPC didn't support the
695 * disengage request, so we don't need to do anything special.
696 */
697
698 xpc_update_partition_info(part, remote_rp_version,
699 &remote_rp_stamp, remote_rp_pa,
700 remote_vars_pa, remote_vars);
701 part->reactivate_nasid = nasid;
702 XPC_DEACTIVATE_PARTITION(part, xpcReactivating);
703 return;
704 }
705
706 DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version));
707
708 if (!XPC_SUPPORTS_RP_STAMP(remote_rp_version)) {
709 DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->version));
710
711 /*
712 * Other side rebooted and previous XPC did support the
713 * disengage request, but the new one doesn't.
714 */
715
716 xpc_clear_partition_engaged(1UL << partid);
717 xpc_clear_partition_disengage_request(1UL << partid);
718
719 xpc_update_partition_info(part, remote_rp_version,
720 &remote_rp_stamp, remote_rp_pa,
721 remote_vars_pa, remote_vars);
722 reactivate = 1;
723
724 } else {
725 DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->version));
726
727 stamp_diff = xpc_compare_stamps(&part->remote_rp_stamp,
728 &remote_rp_stamp);
729 if (stamp_diff != 0) {
730 DBUG_ON(stamp_diff >= 0);
731
732 /*
733 * Other side rebooted and the previous XPC did support
734 * the disengage request, as does the new one.
735 */
736
737 DBUG_ON(xpc_partition_engaged(1UL << partid));
738 DBUG_ON(xpc_partition_disengage_requested(1UL <<
739 partid));
740
741 xpc_update_partition_info(part, remote_rp_version,
742 &remote_rp_stamp,
743 remote_rp_pa, remote_vars_pa,
744 remote_vars);
745 reactivate = 1;
746 }
747 }
748
749 if (part->disengage_request_timeout > 0 &&
750 !xpc_partition_disengaged(part)) {
751 /* still waiting on other side to disengage from us */
752 return;
753 }
754
755 if (reactivate) {
756 part->reactivate_nasid = nasid;
757 XPC_DEACTIVATE_PARTITION(part, xpcReactivating);
758
759 } else if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version) &&
760 xpc_partition_disengage_requested(1UL << partid)) {
761 XPC_DEACTIVATE_PARTITION(part, xpcOtherGoingDown);
762 }
763}
764
765/*
766 * Loop through the activation AMO variables and process any bits
767 * which are set. Each bit indicates a nasid sending a partition
768 * activation or deactivation request.
769 *
770 * Return #of IRQs detected.
771 */
772int
773xpc_identify_act_IRQ_sender(void)
774{
775 int word, bit;
776 u64 nasid_mask;
777 u64 nasid; /* remote nasid */
778 int n_IRQs_detected = 0;
779 AMO_t *act_amos;
780
781 act_amos = xpc_vars->amos_page + XPC_ACTIVATE_IRQ_AMOS;
782
783 /* scan through act AMO variable looking for non-zero entries */
784 for (word = 0; word < xp_nasid_mask_words; word++) {
785
786 if (xpc_exiting)
787 break;
788
789 nasid_mask = xpc_IPI_receive(&act_amos[word]);
790 if (nasid_mask == 0) {
791 /* no IRQs from nasids in this variable */
792 continue;
793 }
794
795 dev_dbg(xpc_part, "AMO[%d] gave back 0x%lx\n", word,
796 nasid_mask);
797
798 /*
799 * If this nasid has been added to the machine since
800 * our partition was reset, this will retain the
801 * remote nasid in our reserved pages machine mask.
802 * This is used in the event of module reload.
803 */
804 xpc_mach_nasids[word] |= nasid_mask;
805
806 /* locate the nasid(s) which sent interrupts */
807
808 for (bit = 0; bit < (8 * sizeof(u64)); bit++) {
809 if (nasid_mask & (1UL << bit)) {
810 n_IRQs_detected++;
811 nasid = XPC_NASID_FROM_W_B(word, bit);
812 dev_dbg(xpc_part, "interrupt from nasid %ld\n",
813 nasid);
814 xpc_identify_act_IRQ_req(nasid);
815 }
816 }
817 }
818 return n_IRQs_detected;
819}
820
821/*
822 * See if the other side has responded to a partition disengage request
823 * from us.
824 */
825int
826xpc_partition_disengaged(struct xpc_partition *part)
827{
828 partid_t partid = XPC_PARTID(part);
829 int disengaged;
830
831 disengaged = (xpc_partition_engaged(1UL << partid) == 0);
832 if (part->disengage_request_timeout) {
833 if (!disengaged) {
834 if (time_before(jiffies,
835 part->disengage_request_timeout)) {
836 /* timelimit hasn't been reached yet */
837 return 0;
838 }
839
840 /*
841 * Other side hasn't responded to our disengage
842 * request in a timely fashion, so assume it's dead.
843 */
844
845 dev_info(xpc_part, "disengage from remote partition %d "
846 "timed out\n", partid);
847 xpc_disengage_request_timedout = 1;
848 xpc_clear_partition_engaged(1UL << partid);
849 disengaged = 1;
850 }
851 part->disengage_request_timeout = 0;
852
853 /* cancel the timer function, provided it's not us */
854 if (!in_interrupt()) {
855 del_singleshot_timer_sync(&part->
856 disengage_request_timer);
857 }
858
859 DBUG_ON(part->act_state != XPC_P_DEACTIVATING &&
860 part->act_state != XPC_P_INACTIVE);
861 if (part->act_state != XPC_P_INACTIVE)
862 xpc_wakeup_channel_mgr(part);
863
864 if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version))
865 xpc_cancel_partition_disengage_request(part);
866 }
867 return disengaged;
868}
869
870/*
871 * Mark specified partition as active.
872 */
873enum xpc_retval
874xpc_mark_partition_active(struct xpc_partition *part)
875{
876 unsigned long irq_flags;
877 enum xpc_retval ret;
878
879 dev_dbg(xpc_part, "setting partition %d to ACTIVE\n", XPC_PARTID(part));
880
881 spin_lock_irqsave(&part->act_lock, irq_flags);
882 if (part->act_state == XPC_P_ACTIVATING) {
883 part->act_state = XPC_P_ACTIVE;
884 ret = xpcSuccess;
885 } else {
886 DBUG_ON(part->reason == xpcSuccess);
887 ret = part->reason;
888 }
889 spin_unlock_irqrestore(&part->act_lock, irq_flags);
890
891 return ret;
892}
893
894/*
895 * Notify XPC that the partition is down.
896 */
897void
898xpc_deactivate_partition(const int line, struct xpc_partition *part,
899 enum xpc_retval reason)
900{
901 unsigned long irq_flags;
902
903 spin_lock_irqsave(&part->act_lock, irq_flags);
904
905 if (part->act_state == XPC_P_INACTIVE) {
906 XPC_SET_REASON(part, reason, line);
907 spin_unlock_irqrestore(&part->act_lock, irq_flags);
908 if (reason == xpcReactivating) {
909 /* we interrupt ourselves to reactivate partition */
910 xpc_IPI_send_reactivate(part);
911 }
912 return;
913 }
914 if (part->act_state == XPC_P_DEACTIVATING) {
915 if ((part->reason == xpcUnloading && reason != xpcUnloading) ||
916 reason == xpcReactivating) {
917 XPC_SET_REASON(part, reason, line);
918 }
919 spin_unlock_irqrestore(&part->act_lock, irq_flags);
920 return;
921 }
922
923 part->act_state = XPC_P_DEACTIVATING;
924 XPC_SET_REASON(part, reason, line);
925
926 spin_unlock_irqrestore(&part->act_lock, irq_flags);
927
928 if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) {
929 xpc_request_partition_disengage(part);
930 xpc_IPI_send_disengage(part);
931
932 /* set a timelimit on the disengage request */
933 part->disengage_request_timeout = jiffies +
934 (xpc_disengage_request_timelimit * HZ);
935 part->disengage_request_timer.expires =
936 part->disengage_request_timeout;
937 add_timer(&part->disengage_request_timer);
938 }
939
940 dev_dbg(xpc_part, "bringing partition %d down, reason = %d\n",
941 XPC_PARTID(part), reason);
942
943 xpc_partition_going_down(part, reason);
944}
945
946/*
947 * Mark specified partition as inactive.
948 */
949void
950xpc_mark_partition_inactive(struct xpc_partition *part)
951{
952 unsigned long irq_flags;
953
954 dev_dbg(xpc_part, "setting partition %d to INACTIVE\n",
955 XPC_PARTID(part));
956
957 spin_lock_irqsave(&part->act_lock, irq_flags);
958 part->act_state = XPC_P_INACTIVE;
959 spin_unlock_irqrestore(&part->act_lock, irq_flags);
960 part->remote_rp_pa = 0;
961}
962
963/*
964 * SAL has provided a partition and machine mask. The partition mask
965 * contains a bit for each even nasid in our partition. The machine
966 * mask contains a bit for each even nasid in the entire machine.
967 *
968 * Using those two bit arrays, we can determine which nasids are
969 * known in the machine. Each should also have a reserved page
970 * initialized if they are available for partitioning.
971 */
972void
973xpc_discovery(void)
974{
975 void *remote_rp_base;
976 struct xpc_rsvd_page *remote_rp;
977 struct xpc_vars *remote_vars;
978 u64 remote_rp_pa;
979 u64 remote_vars_pa;
980 int region;
981 int region_size;
982 int max_regions;
983 int nasid;
984 struct xpc_rsvd_page *rp;
985 partid_t partid;
986 struct xpc_partition *part;
987 u64 *discovered_nasids;
988 enum xpc_retval ret;
989
990 remote_rp = xpc_kmalloc_cacheline_aligned(XPC_RP_HEADER_SIZE +
991 xp_nasid_mask_bytes,
992 GFP_KERNEL, &remote_rp_base);
993 if (remote_rp == NULL)
994 return;
995
996 remote_vars = (struct xpc_vars *)remote_rp;
997
998 discovered_nasids = kzalloc(sizeof(u64) * xp_nasid_mask_words,
999 GFP_KERNEL);
1000 if (discovered_nasids == NULL) {
1001 kfree(remote_rp_base);
1002 return;
1003 }
1004
1005 rp = (struct xpc_rsvd_page *)xpc_rsvd_page;
1006
1007 /*
1008 * The term 'region' in this context refers to the minimum number of
1009 * nodes that can comprise an access protection grouping. The access
1010 * protection is in regards to memory, IOI and IPI.
1011 */
1012 max_regions = 64;
1013 region_size = sn_region_size;
1014
1015 switch (region_size) {
1016 case 128:
1017 max_regions *= 2;
1018 case 64:
1019 max_regions *= 2;
1020 case 32:
1021 max_regions *= 2;
1022 region_size = 16;
1023 DBUG_ON(!is_shub2());
1024 }
1025
1026 for (region = 0; region < max_regions; region++) {
1027
1028 if (xpc_exiting)
1029 break;
1030
1031 dev_dbg(xpc_part, "searching region %d\n", region);
1032
1033 for (nasid = (region * region_size * 2);
1034 nasid < ((region + 1) * region_size * 2); nasid += 2) {
1035
1036 if (xpc_exiting)
1037 break;
1038
1039 dev_dbg(xpc_part, "checking nasid %d\n", nasid);
1040
1041 if (XPC_NASID_IN_ARRAY(nasid, xpc_part_nasids)) {
1042 dev_dbg(xpc_part, "PROM indicates Nasid %d is "
1043 "part of the local partition; skipping "
1044 "region\n", nasid);
1045 break;
1046 }
1047
1048 if (!(XPC_NASID_IN_ARRAY(nasid, xpc_mach_nasids))) {
1049 dev_dbg(xpc_part, "PROM indicates Nasid %d was "
1050 "not on Numa-Link network at reset\n",
1051 nasid);
1052 continue;
1053 }
1054
1055 if (XPC_NASID_IN_ARRAY(nasid, discovered_nasids)) {
1056 dev_dbg(xpc_part, "Nasid %d is part of a "
1057 "partition which was previously "
1058 "discovered\n", nasid);
1059 continue;
1060 }
1061
1062 /* pull over the reserved page structure */
1063
1064 ret = xpc_get_remote_rp(nasid, discovered_nasids,
1065 remote_rp, &remote_rp_pa);
1066 if (ret != xpcSuccess) {
1067 dev_dbg(xpc_part, "unable to get reserved page "
1068 "from nasid %d, reason=%d\n", nasid,
1069 ret);
1070
1071 if (ret == xpcLocalPartid)
1072 break;
1073
1074 continue;
1075 }
1076
1077 remote_vars_pa = remote_rp->vars_pa;
1078
1079 partid = remote_rp->partid;
1080 part = &xpc_partitions[partid];
1081
1082 /* pull over the cross partition variables */
1083
1084 ret = xpc_get_remote_vars(remote_vars_pa, remote_vars);
1085 if (ret != xpcSuccess) {
1086 dev_dbg(xpc_part, "unable to get XPC variables "
1087 "from nasid %d, reason=%d\n", nasid,
1088 ret);
1089
1090 XPC_DEACTIVATE_PARTITION(part, ret);
1091 continue;
1092 }
1093
1094 if (part->act_state != XPC_P_INACTIVE) {
1095 dev_dbg(xpc_part, "partition %d on nasid %d is "
1096 "already activating\n", partid, nasid);
1097 break;
1098 }
1099
1100 /*
1101 * Register the remote partition's AMOs with SAL so it
1102 * can handle and cleanup errors within that address
1103 * range should the remote partition go down. We don't
1104 * unregister this range because it is difficult to
1105 * tell when outstanding writes to the remote partition
1106 * are finished and thus when it is thus safe to
1107 * unregister. This should not result in wasted space
1108 * in the SAL xp_addr_region table because we should
1109 * get the same page for remote_act_amos_pa after
1110 * module reloads and system reboots.
1111 */
1112 if (sn_register_xp_addr_region
1113 (remote_vars->amos_page_pa, PAGE_SIZE, 1) < 0) {
1114 dev_dbg(xpc_part,
1115 "partition %d failed to "
1116 "register xp_addr region 0x%016lx\n",
1117 partid, remote_vars->amos_page_pa);
1118
1119 XPC_SET_REASON(part, xpcPhysAddrRegFailed,
1120 __LINE__);
1121 break;
1122 }
1123
1124 /*
1125 * The remote nasid is valid and available.
1126 * Send an interrupt to that nasid to notify
1127 * it that we are ready to begin activation.
1128 */
1129 dev_dbg(xpc_part, "sending an interrupt to AMO 0x%lx, "
1130 "nasid %d, phys_cpuid 0x%x\n",
1131 remote_vars->amos_page_pa,
1132 remote_vars->act_nasid,
1133 remote_vars->act_phys_cpuid);
1134
1135 if (XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->
1136 version)) {
1137 part->remote_amos_page_pa =
1138 remote_vars->amos_page_pa;
1139 xpc_mark_partition_disengaged(part);
1140 xpc_cancel_partition_disengage_request(part);
1141 }
1142 xpc_IPI_send_activate(remote_vars);
1143 }
1144 }
1145
1146 kfree(discovered_nasids);
1147 kfree(remote_rp_base);
1148}
1149
1150/*
1151 * Given a partid, get the nasids owned by that partition from the
1152 * remote partition's reserved page.
1153 */
1154enum xpc_retval
1155xpc_initiate_partid_to_nasids(partid_t partid, void *nasid_mask)
1156{
1157 struct xpc_partition *part;
1158 u64 part_nasid_pa;
1159 int bte_res;
1160
1161 part = &xpc_partitions[partid];
1162 if (part->remote_rp_pa == 0)
1163 return xpcPartitionDown;
1164
1165 memset(nasid_mask, 0, XP_NASID_MASK_BYTES);
1166
1167 part_nasid_pa = (u64)XPC_RP_PART_NASIDS(part->remote_rp_pa);
1168
1169 bte_res = xp_bte_copy(part_nasid_pa, (u64)nasid_mask,
1170 xp_nasid_mask_bytes, (BTE_NOTIFY | BTE_WACQUIRE),
1171 NULL);
1172
1173 return xpc_map_bte_errors(bte_res);
1174}
diff --git a/drivers/misc/sgi-xp/xpnet.c b/drivers/misc/sgi-xp/xpnet.c
new file mode 100644
index 000000000000..a9543c65814d
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpnet.c
@@ -0,0 +1,677 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (C) 1999-2008 Silicon Graphics, Inc. All rights reserved.
7 */
8
9/*
10 * Cross Partition Network Interface (XPNET) support
11 *
12 * XPNET provides a virtual network layered on top of the Cross
13 * Partition communication layer.
14 *
15 * XPNET provides direct point-to-point and broadcast-like support
16 * for an ethernet-like device. The ethernet broadcast medium is
17 * replaced with a point-to-point message structure which passes
18 * pointers to a DMA-capable block that a remote partition should
19 * retrieve and pass to the upper level networking layer.
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/types.h>
25#include <linux/kernel.h>
26#include <linux/init.h>
27#include <linux/ioport.h>
28#include <linux/netdevice.h>
29#include <linux/etherdevice.h>
30#include <linux/delay.h>
31#include <linux/ethtool.h>
32#include <linux/mii.h>
33#include <linux/smp.h>
34#include <linux/string.h>
35#include <asm/sn/bte.h>
36#include <asm/sn/io.h>
37#include <asm/sn/sn_sal.h>
38#include <asm/atomic.h>
39#include "xp.h"
40
41/*
42 * The message payload transferred by XPC.
43 *
44 * buf_pa is the physical address where the DMA should pull from.
45 *
46 * NOTE: for performance reasons, buf_pa should _ALWAYS_ begin on a
47 * cacheline boundary. To accomplish this, we record the number of
48 * bytes from the beginning of the first cacheline to the first useful
49 * byte of the skb (leadin_ignore) and the number of bytes from the
50 * last useful byte of the skb to the end of the last cacheline
51 * (tailout_ignore).
52 *
53 * size is the number of bytes to transfer which includes the skb->len
54 * (useful bytes of the senders skb) plus the leadin and tailout
55 */
56struct xpnet_message {
57 u16 version; /* Version for this message */
58 u16 embedded_bytes; /* #of bytes embedded in XPC message */
59 u32 magic; /* Special number indicating this is xpnet */
60 u64 buf_pa; /* phys address of buffer to retrieve */
61 u32 size; /* #of bytes in buffer */
62 u8 leadin_ignore; /* #of bytes to ignore at the beginning */
63 u8 tailout_ignore; /* #of bytes to ignore at the end */
64 unsigned char data; /* body of small packets */
65};
66
/*
 * Determine the size of our message, its cacheline aligned size, and
 * from that the number of messages we will request from XPC.
 *
 * XPC expects each message to exist in an individual cacheline.
 *
 * XPNET_MSG_DATA_MAX is the room left in one message for an embedded
 * packet, i.e. everything from xpnet_message.data to the end of the message.
 */
#define XPNET_MSG_SIZE		(L1_CACHE_BYTES - XPC_MSG_PAYLOAD_OFFSET)
#define XPNET_MSG_DATA_MAX	\
	(XPNET_MSG_SIZE - (u64)(&((struct xpnet_message *)0)->data))
#define XPNET_MSG_ALIGNED_SIZE	(L1_CACHE_ALIGN(XPNET_MSG_SIZE))
#define XPNET_MSG_NENTRIES	(PAGE_SIZE / XPNET_MSG_ALIGNED_SIZE)

/* kthread limits handed to xpc_connect() */
#define XPNET_MAX_KTHREADS	(XPNET_MSG_NENTRIES + 1)
#define XPNET_MAX_IDLE_KTHREADS	(XPNET_MSG_NENTRIES + 1)
81
/*
 * Version number of XPNET implementation. XPNET can always talk to versions
 * with same major #, and never talk to versions with a different major #.
 *
 * A version is packed as (major << 4) | minor.
 */
#define _XPNET_VERSION(_major, _minor)	(((_major) << 4) | (_minor))
#define XPNET_VERSION_MAJOR(_v)		((_v) >> 4)
#define XPNET_VERSION_MINOR(_v)		((_v) & 0xf)

#define	XPNET_VERSION _XPNET_VERSION(1, 0)	/* version 1.0 */
#define	XPNET_VERSION_EMBED _XPNET_VERSION(1, 1)	/* version 1.1 */

/* the decimal ASCII codes of "XNET" (88 78 69 84) written as hex digits */
#define XPNET_MAGIC	0x88786984

/*
 * True when the message pointed to by _m carries our major version and the
 * xpnet magic number.  Only the macro argument is referenced (and it is
 * parenthesized), so any pointer expression may be passed, not just a
 * variable that happens to be named 'msg'.
 */
#define XPNET_VALID_MSG(_m)						     \
   ((XPNET_VERSION_MAJOR((_m)->version) == XPNET_VERSION_MAJOR(XPNET_VERSION)) \
    && ((_m)->magic == XPNET_MAGIC))

#define XPNET_DEVICE_NAME		"xp0"
99
100/*
101 * When messages are queued with xpc_send_notify, a kmalloc'd buffer
102 * of the following type is passed as a notification cookie. When the
103 * notification function is called, we use the cookie to decide
104 * whether all outstanding message sends have completed. The skb can
105 * then be released.
106 */
107struct xpnet_pending_msg {
108 struct list_head free_list;
109 struct sk_buff *skb;
110 atomic_t use_count;
111};
112
113/* driver specific structure pointed to by the device structure */
114struct xpnet_dev_private {
115 struct net_device_stats stats;
116};
117
118struct net_device *xpnet_device;
119
120/*
121 * When we are notified of other partitions activating, we add them to
122 * our bitmask of partitions to which we broadcast.
123 */
124static u64 xpnet_broadcast_partitions;
125/* protect above */
126static DEFINE_SPINLOCK(xpnet_broadcast_lock);
127
/*
 * Since the Block Transfer Engine (BTE) is being used for the transfer
 * and it relies upon cache-line size transfers, we need to reserve at
 * least one cache-line for head and tail alignment. The BTE is
 * limited to 8MB transfers.
 *
 * Testing has shown that changing MTU to greater than 64KB has no effect
 * on TCP as the two sides negotiate a Max Segment Size that is limited
 * to 64K. Other protocols may use packets greater than this, so larger
 * values are still accepted, up to XPNET_MAX_MTU.
 */
#define XPNET_MAX_MTU (0x800000UL - L1_CACHE_BYTES)
/* the default MTU: 32KB has been determined to be the ideal */
#define XPNET_DEF_MTU (0x8000UL)
142
/*
 * The partition id is encapsulated in the MAC address.
 * XPNET_PARTID_OCTET is the index of the octet holding the partid;
 * XPNET_LICENSE_OCTET is the index of the first of the octets filled in
 * from the partition serial number.
 */
#define XPNET_PARTID_OCTET	1
#define XPNET_LICENSE_OCTET	2
149
150/*
151 * Define the XPNET debug device structure that is to be used with dev_dbg(),
152 * dev_err(), dev_warn(), and dev_info().
153 */
154struct device_driver xpnet_dbg_name = {
155 .name = "xpnet"
156};
157
158struct device xpnet_dbg_subname = {
159 .bus_id = {0}, /* set to "" */
160 .driver = &xpnet_dbg_name
161};
162
163struct device *xpnet = &xpnet_dbg_subname;
164
165/*
166 * Packet was recevied by XPC and forwarded to us.
167 */
168static void
169xpnet_receive(partid_t partid, int channel, struct xpnet_message *msg)
170{
171 struct sk_buff *skb;
172 bte_result_t bret;
173 struct xpnet_dev_private *priv =
174 (struct xpnet_dev_private *)xpnet_device->priv;
175
176 if (!XPNET_VALID_MSG(msg)) {
177 /*
178 * Packet with a different XPC version. Ignore.
179 */
180 xpc_received(partid, channel, (void *)msg);
181
182 priv->stats.rx_errors++;
183
184 return;
185 }
186 dev_dbg(xpnet, "received 0x%lx, %d, %d, %d\n", msg->buf_pa, msg->size,
187 msg->leadin_ignore, msg->tailout_ignore);
188
189 /* reserve an extra cache line */
190 skb = dev_alloc_skb(msg->size + L1_CACHE_BYTES);
191 if (!skb) {
192 dev_err(xpnet, "failed on dev_alloc_skb(%d)\n",
193 msg->size + L1_CACHE_BYTES);
194
195 xpc_received(partid, channel, (void *)msg);
196
197 priv->stats.rx_errors++;
198
199 return;
200 }
201
202 /*
203 * The allocated skb has some reserved space.
204 * In order to use bte_copy, we need to get the
205 * skb->data pointer moved forward.
206 */
207 skb_reserve(skb, (L1_CACHE_BYTES - ((u64)skb->data &
208 (L1_CACHE_BYTES - 1)) +
209 msg->leadin_ignore));
210
211 /*
212 * Update the tail pointer to indicate data actually
213 * transferred.
214 */
215 skb_put(skb, (msg->size - msg->leadin_ignore - msg->tailout_ignore));
216
217 /*
218 * Move the data over from the other side.
219 */
220 if ((XPNET_VERSION_MINOR(msg->version) == 1) &&
221 (msg->embedded_bytes != 0)) {
222 dev_dbg(xpnet, "copying embedded message. memcpy(0x%p, 0x%p, "
223 "%lu)\n", skb->data, &msg->data,
224 (size_t)msg->embedded_bytes);
225
226 skb_copy_to_linear_data(skb, &msg->data,
227 (size_t)msg->embedded_bytes);
228 } else {
229 dev_dbg(xpnet, "transferring buffer to the skb->data area;\n\t"
230 "bte_copy(0x%p, 0x%p, %hu)\n", (void *)msg->buf_pa,
231 (void *)__pa((u64)skb->data & ~(L1_CACHE_BYTES - 1)),
232 msg->size);
233
234 bret = bte_copy(msg->buf_pa,
235 __pa((u64)skb->data & ~(L1_CACHE_BYTES - 1)),
236 msg->size, (BTE_NOTIFY | BTE_WACQUIRE), NULL);
237
238 if (bret != BTE_SUCCESS) {
239 /*
240 * >>> Need better way of cleaning skb. Currently skb
241 * >>> appears in_use and we can't just call
242 * >>> dev_kfree_skb.
243 */
244 dev_err(xpnet, "bte_copy(0x%p, 0x%p, 0x%hx) returned "
245 "error=0x%x\n", (void *)msg->buf_pa,
246 (void *)__pa((u64)skb->data &
247 ~(L1_CACHE_BYTES - 1)),
248 msg->size, bret);
249
250 xpc_received(partid, channel, (void *)msg);
251
252 priv->stats.rx_errors++;
253
254 return;
255 }
256 }
257
258 dev_dbg(xpnet, "<skb->head=0x%p skb->data=0x%p skb->tail=0x%p "
259 "skb->end=0x%p skb->len=%d\n", (void *)skb->head,
260 (void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb),
261 skb->len);
262
263 skb->protocol = eth_type_trans(skb, xpnet_device);
264 skb->ip_summed = CHECKSUM_UNNECESSARY;
265
266 dev_dbg(xpnet, "passing skb to network layer\n"
267 KERN_DEBUG "\tskb->head=0x%p skb->data=0x%p skb->tail=0x%p "
268 "skb->end=0x%p skb->len=%d\n",
269 (void *)skb->head, (void *)skb->data, skb_tail_pointer(skb),
270 skb_end_pointer(skb), skb->len);
271
272 xpnet_device->last_rx = jiffies;
273 priv->stats.rx_packets++;
274 priv->stats.rx_bytes += skb->len + ETH_HLEN;
275
276 netif_rx_ni(skb);
277 xpc_received(partid, channel, (void *)msg);
278}
279
280/*
281 * This is the handler which XPC calls during any sort of change in
282 * state or message reception on a connection.
283 */
284static void
285xpnet_connection_activity(enum xpc_retval reason, partid_t partid, int channel,
286 void *data, void *key)
287{
288 long bp;
289
290 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
291 DBUG_ON(channel != XPC_NET_CHANNEL);
292
293 switch (reason) {
294 case xpcMsgReceived: /* message received */
295 DBUG_ON(data == NULL);
296
297 xpnet_receive(partid, channel, (struct xpnet_message *)data);
298 break;
299
300 case xpcConnected: /* connection completed to a partition */
301 spin_lock_bh(&xpnet_broadcast_lock);
302 xpnet_broadcast_partitions |= 1UL << (partid - 1);
303 bp = xpnet_broadcast_partitions;
304 spin_unlock_bh(&xpnet_broadcast_lock);
305
306 netif_carrier_on(xpnet_device);
307
308 dev_dbg(xpnet, "%s connection created to partition %d; "
309 "xpnet_broadcast_partitions=0x%lx\n",
310 xpnet_device->name, partid, bp);
311 break;
312
313 default:
314 spin_lock_bh(&xpnet_broadcast_lock);
315 xpnet_broadcast_partitions &= ~(1UL << (partid - 1));
316 bp = xpnet_broadcast_partitions;
317 spin_unlock_bh(&xpnet_broadcast_lock);
318
319 if (bp == 0)
320 netif_carrier_off(xpnet_device);
321
322 dev_dbg(xpnet, "%s disconnected from partition %d; "
323 "xpnet_broadcast_partitions=0x%lx\n",
324 xpnet_device->name, partid, bp);
325 break;
326
327 }
328}
329
330static int
331xpnet_dev_open(struct net_device *dev)
332{
333 enum xpc_retval ret;
334
335 dev_dbg(xpnet, "calling xpc_connect(%d, 0x%p, NULL, %ld, %ld, %ld, "
336 "%ld)\n", XPC_NET_CHANNEL, xpnet_connection_activity,
337 XPNET_MSG_SIZE, XPNET_MSG_NENTRIES, XPNET_MAX_KTHREADS,
338 XPNET_MAX_IDLE_KTHREADS);
339
340 ret = xpc_connect(XPC_NET_CHANNEL, xpnet_connection_activity, NULL,
341 XPNET_MSG_SIZE, XPNET_MSG_NENTRIES,
342 XPNET_MAX_KTHREADS, XPNET_MAX_IDLE_KTHREADS);
343 if (ret != xpcSuccess) {
344 dev_err(xpnet, "ifconfig up of %s failed on XPC connect, "
345 "ret=%d\n", dev->name, ret);
346
347 return -ENOMEM;
348 }
349
350 dev_dbg(xpnet, "ifconfig up of %s; XPC connected\n", dev->name);
351
352 return 0;
353}
354
355static int
356xpnet_dev_stop(struct net_device *dev)
357{
358 xpc_disconnect(XPC_NET_CHANNEL);
359
360 dev_dbg(xpnet, "ifconfig down of %s; XPC disconnected\n", dev->name);
361
362 return 0;
363}
364
365static int
366xpnet_dev_change_mtu(struct net_device *dev, int new_mtu)
367{
368 /* 68 comes from min TCP+IP+MAC header */
369 if ((new_mtu < 68) || (new_mtu > XPNET_MAX_MTU)) {
370 dev_err(xpnet, "ifconfig %s mtu %d failed; value must be "
371 "between 68 and %ld\n", dev->name, new_mtu,
372 XPNET_MAX_MTU);
373 return -EINVAL;
374 }
375
376 dev->mtu = new_mtu;
377 dev_dbg(xpnet, "ifconfig %s mtu set to %d\n", dev->name, new_mtu);
378 return 0;
379}
380
/*
 * set_config hook required for the net_device structure.  There is
 * nothing to configure, so the requested map is ignored and success (0)
 * is always reported.
 */
static int
xpnet_dev_set_config(struct net_device *dev, struct ifmap *new_map)
{
	return 0;
}
389
390/*
391 * Return statistics to the caller.
392 */
393static struct net_device_stats *
394xpnet_dev_get_stats(struct net_device *dev)
395{
396 struct xpnet_dev_private *priv;
397
398 priv = (struct xpnet_dev_private *)dev->priv;
399
400 return &priv->stats;
401}
402
403/*
404 * Notification that the other end has received the message and
405 * DMA'd the skb information. At this point, they are done with
406 * our side. When all recipients are done processing, we
407 * release the skb and then release our pending message structure.
408 */
409static void
410xpnet_send_completed(enum xpc_retval reason, partid_t partid, int channel,
411 void *__qm)
412{
413 struct xpnet_pending_msg *queued_msg = (struct xpnet_pending_msg *)__qm;
414
415 DBUG_ON(queued_msg == NULL);
416
417 dev_dbg(xpnet, "message to %d notified with reason %d\n",
418 partid, reason);
419
420 if (atomic_dec_return(&queued_msg->use_count) == 0) {
421 dev_dbg(xpnet, "all acks for skb->head=-x%p\n",
422 (void *)queued_msg->skb->head);
423
424 dev_kfree_skb_any(queued_msg->skb);
425 kfree(queued_msg);
426 }
427}
428
429/*
430 * Network layer has formatted a packet (skb) and is ready to place it
431 * "on the wire". Prepare and send an xpnet_message to all partitions
432 * which have connected with us and are targets of this packet.
433 *
434 * MAC-NOTE: For the XPNET driver, the MAC address contains the
435 * destination partition_id. If the destination partition id word
436 * is 0xff, this packet is to broadcast to all partitions.
437 */
438static int
439xpnet_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
440{
441 struct xpnet_pending_msg *queued_msg;
442 enum xpc_retval ret;
443 struct xpnet_message *msg;
444 u64 start_addr, end_addr;
445 long dp;
446 u8 second_mac_octet;
447 partid_t dest_partid;
448 struct xpnet_dev_private *priv;
449 u16 embedded_bytes;
450
451 priv = (struct xpnet_dev_private *)dev->priv;
452
453 dev_dbg(xpnet, ">skb->head=0x%p skb->data=0x%p skb->tail=0x%p "
454 "skb->end=0x%p skb->len=%d\n", (void *)skb->head,
455 (void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb),
456 skb->len);
457
458 /*
459 * The xpnet_pending_msg tracks how many outstanding
460 * xpc_send_notifies are relying on this skb. When none
461 * remain, release the skb.
462 */
463 queued_msg = kmalloc(sizeof(struct xpnet_pending_msg), GFP_ATOMIC);
464 if (queued_msg == NULL) {
465 dev_warn(xpnet, "failed to kmalloc %ld bytes; dropping "
466 "packet\n", sizeof(struct xpnet_pending_msg));
467
468 priv->stats.tx_errors++;
469
470 return -ENOMEM;
471 }
472
473 /* get the beginning of the first cacheline and end of last */
474 start_addr = ((u64)skb->data & ~(L1_CACHE_BYTES - 1));
475 end_addr = L1_CACHE_ALIGN((u64)skb_tail_pointer(skb));
476
477 /* calculate how many bytes to embed in the XPC message */
478 embedded_bytes = 0;
479 if (unlikely(skb->len <= XPNET_MSG_DATA_MAX)) {
480 /* skb->data does fit so embed */
481 embedded_bytes = skb->len;
482 }
483
484 /*
485 * Since the send occurs asynchronously, we set the count to one
486 * and begin sending. Any sends that happen to complete before
487 * we are done sending will not free the skb. We will be left
488 * with that task during exit. This also handles the case of
489 * a packet destined for a partition which is no longer up.
490 */
491 atomic_set(&queued_msg->use_count, 1);
492 queued_msg->skb = skb;
493
494 second_mac_octet = skb->data[XPNET_PARTID_OCTET];
495 if (second_mac_octet == 0xff) {
496 /* we are being asked to broadcast to all partitions */
497 dp = xpnet_broadcast_partitions;
498 } else if (second_mac_octet != 0) {
499 dp = xpnet_broadcast_partitions &
500 (1UL << (second_mac_octet - 1));
501 } else {
502 /* 0 is an invalid partid. Ignore */
503 dp = 0;
504 }
505 dev_dbg(xpnet, "destination Partitions mask (dp) = 0x%lx\n", dp);
506
507 /*
508 * If we wanted to allow promiscuous mode to work like an
509 * unswitched network, this would be a good point to OR in a
510 * mask of partitions which should be receiving all packets.
511 */
512
513 /*
514 * Main send loop.
515 */
516 for (dest_partid = 1; dp && dest_partid < XP_MAX_PARTITIONS;
517 dest_partid++) {
518
519 if (!(dp & (1UL << (dest_partid - 1)))) {
520 /* not destined for this partition */
521 continue;
522 }
523
524 /* remove this partition from the destinations mask */
525 dp &= ~(1UL << (dest_partid - 1));
526
527 /* found a partition to send to */
528
529 ret = xpc_allocate(dest_partid, XPC_NET_CHANNEL,
530 XPC_NOWAIT, (void **)&msg);
531 if (unlikely(ret != xpcSuccess))
532 continue;
533
534 msg->embedded_bytes = embedded_bytes;
535 if (unlikely(embedded_bytes != 0)) {
536 msg->version = XPNET_VERSION_EMBED;
537 dev_dbg(xpnet, "calling memcpy(0x%p, 0x%p, 0x%lx)\n",
538 &msg->data, skb->data, (size_t)embedded_bytes);
539 skb_copy_from_linear_data(skb, &msg->data,
540 (size_t)embedded_bytes);
541 } else {
542 msg->version = XPNET_VERSION;
543 }
544 msg->magic = XPNET_MAGIC;
545 msg->size = end_addr - start_addr;
546 msg->leadin_ignore = (u64)skb->data - start_addr;
547 msg->tailout_ignore = end_addr - (u64)skb_tail_pointer(skb);
548 msg->buf_pa = __pa(start_addr);
549
550 dev_dbg(xpnet, "sending XPC message to %d:%d\n"
551 KERN_DEBUG "msg->buf_pa=0x%lx, msg->size=%u, "
552 "msg->leadin_ignore=%u, msg->tailout_ignore=%u\n",
553 dest_partid, XPC_NET_CHANNEL, msg->buf_pa, msg->size,
554 msg->leadin_ignore, msg->tailout_ignore);
555
556 atomic_inc(&queued_msg->use_count);
557
558 ret = xpc_send_notify(dest_partid, XPC_NET_CHANNEL, msg,
559 xpnet_send_completed, queued_msg);
560 if (unlikely(ret != xpcSuccess)) {
561 atomic_dec(&queued_msg->use_count);
562 continue;
563 }
564 }
565
566 if (atomic_dec_return(&queued_msg->use_count) == 0) {
567 dev_dbg(xpnet, "no partitions to receive packet destined for "
568 "%d\n", dest_partid);
569
570 dev_kfree_skb(skb);
571 kfree(queued_msg);
572 }
573
574 priv->stats.tx_packets++;
575 priv->stats.tx_bytes += skb->len;
576
577 return 0;
578}
579
580/*
581 * Deal with transmit timeouts coming from the network layer.
582 */
583static void
584xpnet_dev_tx_timeout(struct net_device *dev)
585{
586 struct xpnet_dev_private *priv;
587
588 priv = (struct xpnet_dev_private *)dev->priv;
589
590 priv->stats.tx_errors++;
591 return;
592}
593
594static int __init
595xpnet_init(void)
596{
597 int i;
598 u32 license_num;
599 int result = -ENOMEM;
600
601 if (!ia64_platform_is("sn2"))
602 return -ENODEV;
603
604 dev_info(xpnet, "registering network device %s\n", XPNET_DEVICE_NAME);
605
606 /*
607 * use ether_setup() to init the majority of our device
608 * structure and then override the necessary pieces.
609 */
610 xpnet_device = alloc_netdev(sizeof(struct xpnet_dev_private),
611 XPNET_DEVICE_NAME, ether_setup);
612 if (xpnet_device == NULL)
613 return -ENOMEM;
614
615 netif_carrier_off(xpnet_device);
616
617 xpnet_device->mtu = XPNET_DEF_MTU;
618 xpnet_device->change_mtu = xpnet_dev_change_mtu;
619 xpnet_device->open = xpnet_dev_open;
620 xpnet_device->get_stats = xpnet_dev_get_stats;
621 xpnet_device->stop = xpnet_dev_stop;
622 xpnet_device->hard_start_xmit = xpnet_dev_hard_start_xmit;
623 xpnet_device->tx_timeout = xpnet_dev_tx_timeout;
624 xpnet_device->set_config = xpnet_dev_set_config;
625
626 /*
627 * Multicast assumes the LSB of the first octet is set for multicast
628 * MAC addresses. We chose the first octet of the MAC to be unlikely
629 * to collide with any vendor's officially issued MAC.
630 */
631 xpnet_device->dev_addr[0] = 0xfe;
632 xpnet_device->dev_addr[XPNET_PARTID_OCTET] = sn_partition_id;
633 license_num = sn_partition_serial_number_val();
634 for (i = 3; i >= 0; i--) {
635 xpnet_device->dev_addr[XPNET_LICENSE_OCTET + i] =
636 license_num & 0xff;
637 license_num = license_num >> 8;
638 }
639
640 /*
641 * ether_setup() sets this to a multicast device. We are
642 * really not supporting multicast at this time.
643 */
644 xpnet_device->flags &= ~IFF_MULTICAST;
645
646 /*
647 * No need to checksum as it is a DMA transfer. The BTE will
648 * report an error if the data is not retrievable and the
649 * packet will be dropped.
650 */
651 xpnet_device->features = NETIF_F_NO_CSUM;
652
653 result = register_netdev(xpnet_device);
654 if (result != 0)
655 free_netdev(xpnet_device);
656
657 return result;
658}
659
660module_init(xpnet_init);
661
662static void __exit
663xpnet_exit(void)
664{
665 dev_info(xpnet, "unregistering network device %s\n",
666 xpnet_device[0].name);
667
668 unregister_netdev(xpnet_device);
669
670 free_netdev(xpnet_device);
671}
672
673module_exit(xpnet_exit);
674
675MODULE_AUTHOR("Silicon Graphics, Inc.");
676MODULE_DESCRIPTION("Cross Partition Network adapter (XPNET)");
677MODULE_LICENSE("GPL");