Diffstat (limited to 'drivers/misc')
-rw-r--r--  drivers/misc/Kconfig                      |   27
-rw-r--r--  drivers/misc/Makefile                     |    1
-rw-r--r--  drivers/misc/sgi-gru/Makefile             |    3
-rw-r--r--  drivers/misc/sgi-gru/gru.h                |   67
-rw-r--r--  drivers/misc/sgi-gru/gru_instructions.h   |  669
-rw-r--r--  drivers/misc/sgi-gru/grufault.c           |  633
-rw-r--r--  drivers/misc/sgi-gru/grufile.c            |  485
-rw-r--r--  drivers/misc/sgi-gru/gruhandles.h         |  663
-rw-r--r--  drivers/misc/sgi-gru/grukservices.c       |  679
-rw-r--r--  drivers/misc/sgi-gru/grukservices.h       |  134
-rw-r--r--  drivers/misc/sgi-gru/grulib.h             |   97
-rw-r--r--  drivers/misc/sgi-gru/grumain.c            |  802
-rw-r--r--  drivers/misc/sgi-gru/gruprocfs.c          |  336
-rw-r--r--  drivers/misc/sgi-gru/grutables.h          |  609
-rw-r--r--  drivers/misc/sgi-gru/grutlbpurge.c        |  371
-rw-r--r--  drivers/misc/sgi-xp/Makefile              |   10
-rw-r--r--  drivers/misc/sgi-xp/xp.h                  |  225
-rw-r--r--  drivers/misc/sgi-xp/xp_main.c             |  131
-rw-r--r--  drivers/misc/sgi-xp/xp_sn2.c              |  146
-rw-r--r--  drivers/misc/sgi-xp/xp_uv.c               |   72
-rw-r--r--  drivers/misc/sgi-xp/xpc.h                 | 1200
-rw-r--r--  drivers/misc/sgi-xp/xpc_channel.c         | 1585
-rw-r--r--  drivers/misc/sgi-xp/xpc_main.c            |  977
-rw-r--r--  drivers/misc/sgi-xp/xpc_partition.c       |  928
-rw-r--r--  drivers/misc/sgi-xp/xpc_sn2.c             | 2404
-rw-r--r--  drivers/misc/sgi-xp/xpc_uv.c              | 1443
-rw-r--r--  drivers/misc/sgi-xp/xpnet.c               |  277
27 files changed, 11279 insertions, 3695 deletions
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index f5ade1904aad..82af385460e4 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -426,9 +426,11 @@ config ENCLOSURE_SERVICES
 
 config SGI_XP
 	tristate "Support communication between SGI SSIs"
-	depends on IA64_GENERIC || IA64_SGI_SN2
+	depends on NET
+	depends on IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || (X86_64 && SMP)
 	select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
 	select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
+	select SGI_GRU if IA64_GENERIC || IA64_SGI_UV || (X86_64 && SMP)
 	---help---
 	  An SGI machine can be divided into multiple Single System
 	  Images which act independently of each other and have
@@ -450,4 +452,27 @@ config HP_ILO
 	  To compile this driver as a module, choose M here: the
 	  module will be called hpilo.
 
+config SGI_GRU
+	tristate "SGI GRU driver"
+	depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP
+	default n
+	select MMU_NOTIFIER
+	---help---
+	  The GRU is a hardware resource located in the system chipset. The
+	  GRU contains memory that can be mmapped into the user address space.
+	  This memory is used to communicate with the GRU to perform functions
+	  such as load/store, scatter/gather, bcopy, AMOs, etc. The GRU is
+	  directly accessed by user instructions using user virtual addresses.
+	  GRU instructions (e.g., bcopy) use user virtual addresses for operands.
+
+	  If you are not running on an SGI UV system, say N.
+
+config SGI_GRU_DEBUG
+	bool "SGI GRU driver debug"
+	depends on SGI_GRU
+	default n
+	---help---
+	  This option enables additional debugging code for the SGI GRU driver.
+	  If you are unsure, say N.
+
 endif # MISC_DEVICES
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index f5e273420c09..c6c13f60b452 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -28,4 +28,5 @@ obj-$(CONFIG_INTEL_MENLOW) += intel_menlow.o
 obj-$(CONFIG_ENCLOSURE_SERVICES)	+= enclosure.o
 obj-$(CONFIG_KGDB_TESTS)		+= kgdbts.o
 obj-$(CONFIG_SGI_XP)			+= sgi-xp/
+obj-$(CONFIG_SGI_GRU)			+= sgi-gru/
 obj-$(CONFIG_HP_ILO)			+= hpilo.o
diff --git a/drivers/misc/sgi-gru/Makefile b/drivers/misc/sgi-gru/Makefile
new file mode 100644
index 000000000000..d03597a521b0
--- /dev/null
+++ b/drivers/misc/sgi-gru/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_SGI_GRU) := gru.o
2gru-y := grufile.o grumain.o grufault.o grutlbpurge.o gruprocfs.o grukservices.o
3
diff --git a/drivers/misc/sgi-gru/gru.h b/drivers/misc/sgi-gru/gru.h
new file mode 100644
index 000000000000..40df7cb3f0a5
--- /dev/null
+++ b/drivers/misc/sgi-gru/gru.h
@@ -0,0 +1,67 @@
1/*
2 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU Lesser General Public License as published by
6 * the Free Software Foundation; either version 2.1 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#ifndef __GRU_H__
20#define __GRU_H__
21
22/*
23 * GRU architectural definitions
24 */
25#define GRU_CACHE_LINE_BYTES 64
26#define GRU_HANDLE_STRIDE 256
27#define GRU_CB_BASE 0
28#define GRU_DS_BASE 0x20000
29
30/*
31 * Size used to map GRU GSeg
32 */
33#if defined CONFIG_IA64
34#define GRU_GSEG_PAGESIZE (256 * 1024UL)
35#elif defined CONFIG_X86_64
36#define GRU_GSEG_PAGESIZE (256 * 1024UL) /* ZZZ 2MB ??? */
37#else
38#error "Unsupported architecture"
39#endif
40
41/*
42 * Structure for obtaining GRU resource information
43 */
44struct gru_chiplet_info {
45 int node;
46 int chiplet;
47 int blade;
48 int total_dsr_bytes;
49 int total_cbr;
50 int total_user_dsr_bytes;
51 int total_user_cbr;
52 int free_user_dsr_bytes;
53 int free_user_cbr;
54};
55
56/* Flags for GRU options on the gru_create_context() call */
57/* Select one of the following 4 options to specify how TLB misses are handled */
58#define GRU_OPT_MISS_DEFAULT 0x0000 /* Use default mode */
59#define GRU_OPT_MISS_USER_POLL 0x0001 /* User will poll CB for faults */
60#define GRU_OPT_MISS_FMM_INTR 0x0002 /* Send interrupt to cpu to
61 handle fault */
62#define GRU_OPT_MISS_FMM_POLL 0x0003 /* Use system polling thread */
63#define GRU_OPT_MISS_MASK 0x0003 /* Mask for TLB MISS option */
64
65
66
67#endif /* __GRU_H__ */
diff --git a/drivers/misc/sgi-gru/gru_instructions.h b/drivers/misc/sgi-gru/gru_instructions.h
new file mode 100644
index 000000000000..0dc36225c7c6
--- /dev/null
+++ b/drivers/misc/sgi-gru/gru_instructions.h
@@ -0,0 +1,669 @@
1/*
2 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU Lesser General Public License as published by
6 * the Free Software Foundation; either version 2.1 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#ifndef __GRU_INSTRUCTIONS_H__
20#define __GRU_INSTRUCTIONS_H__
21
22#define gru_flush_cache_hook(p)
23#define gru_emulator_wait_hook(p, w)
24
25/*
26 * Architecture dependent functions
27 */
28
29#if defined CONFIG_IA64
30#include <linux/compiler.h>
31#include <asm/intrinsics.h>
32#define __flush_cache(p) ia64_fc(p)
33/* Use volatile on IA64 to ensure ordering via st4.rel */
34#define gru_ordered_store_int(p,v) \
35 do { \
36 barrier(); \
37 *((volatile int *)(p)) = v; /* force st.rel */ \
38 } while (0)
39#elif defined CONFIG_X86_64
40#define __flush_cache(p) clflush(p)
41#define gru_ordered_store_int(p,v) \
42 do { \
43 barrier(); \
44 *(int *)p = v; \
45 } while (0)
46#else
47#error "Unsupported architecture"
48#endif
49
50/*
51 * Control block status and exception codes
52 */
53#define CBS_IDLE 0
54#define CBS_EXCEPTION 1
55#define CBS_ACTIVE 2
56#define CBS_CALL_OS 3
57
58/* CB substatus bitmasks */
59#define CBSS_MSG_QUEUE_MASK 7
60#define CBSS_IMPLICIT_ABORT_ACTIVE_MASK 8
61
62/* CB substatus message queue values (low 3 bits of substatus) */
63#define CBSS_NO_ERROR 0
64#define CBSS_LB_OVERFLOWED 1
65#define CBSS_QLIMIT_REACHED 2
66#define CBSS_PAGE_OVERFLOW 3
67#define CBSS_AMO_NACKED 4
68#define CBSS_PUT_NACKED 5
69
70/*
71 * Structure used to fetch exception detail for CBs that terminate with
72 * CBS_EXCEPTION
73 */
74struct control_block_extended_exc_detail {
75 unsigned long cb;
76 int opc;
77 int ecause;
78 int exopc;
79 long exceptdet0;
80 int exceptdet1;
81};
82
83/*
84 * Instruction formats
85 */
86
87/*
88 * Generic instruction format.
89 * This definition has precise bit field definitions.
90 */
91struct gru_instruction_bits {
92 /* DW 0 - low */
93 unsigned int icmd: 1;
94 unsigned char ima: 3; /* CB_DelRep, unmapped mode */
95 unsigned char reserved0: 4;
96 unsigned int xtype: 3;
97 unsigned int iaa0: 2;
98 unsigned int iaa1: 2;
99 unsigned char reserved1: 1;
100 unsigned char opc: 8; /* opcode */
101 unsigned char exopc: 8; /* extended opcode */
102 /* DW 0 - high */
103 unsigned int idef2: 22; /* TRi0 */
104 unsigned char reserved2: 2;
105 unsigned char istatus: 2;
106 unsigned char isubstatus:4;
107 unsigned char reserved3: 2;
108 /* DW 1 */
109 unsigned long idef4; /* 42 bits: TRi1, BufSize */
110 /* DW 2-6 */
111 unsigned long idef1; /* BAddr0 */
112 unsigned long idef5; /* Nelem */
113 unsigned long idef6; /* Stride, Operand1 */
114 unsigned long idef3; /* BAddr1, Value, Operand2 */
115 unsigned long reserved4;
116 /* DW 7 */
117 unsigned long avalue; /* AValue */
118};
119
120/*
121 * Generic instruction with friendlier names. This format is used
122 * for inline instructions.
123 */
124struct gru_instruction {
125 /* DW 0 */
126 unsigned int op32; /* icmd,xtype,iaa0,ima,opc */
127 unsigned int tri0;
128 unsigned long tri1_bufsize; /* DW 1 */
129 unsigned long baddr0; /* DW 2 */
130 unsigned long nelem; /* DW 3 */
131 unsigned long op1_stride; /* DW 4 */
132 unsigned long op2_value_baddr1; /* DW 5 */
133 unsigned long reserved0; /* DW 6 */
134 unsigned long avalue; /* DW 7 */
135};
136
137/* Some shifts and masks for the low 32 bits of a GRU command */
138#define GRU_CB_ICMD_SHFT 0
139#define GRU_CB_ICMD_MASK 0x1
140#define GRU_CB_XTYPE_SHFT 8
141#define GRU_CB_XTYPE_MASK 0x7
142#define GRU_CB_IAA0_SHFT 11
143#define GRU_CB_IAA0_MASK 0x3
144#define GRU_CB_IAA1_SHFT 13
145#define GRU_CB_IAA1_MASK 0x3
146#define GRU_CB_IMA_SHFT 1
147#define GRU_CB_IMA_MASK 0x3
148#define GRU_CB_OPC_SHFT 16
149#define GRU_CB_OPC_MASK 0xff
150#define GRU_CB_EXOPC_SHFT 24
151#define GRU_CB_EXOPC_MASK 0xff
152
153/* GRU instruction opcodes (opc field) */
154#define OP_NOP 0x00
155#define OP_BCOPY 0x01
156#define OP_VLOAD 0x02
157#define OP_IVLOAD 0x03
158#define OP_VSTORE 0x04
159#define OP_IVSTORE 0x05
160#define OP_VSET 0x06
161#define OP_IVSET 0x07
162#define OP_MESQ 0x08
163#define OP_GAMXR 0x09
164#define OP_GAMIR 0x0a
165#define OP_GAMIRR 0x0b
166#define OP_GAMER 0x0c
167#define OP_GAMERR 0x0d
168#define OP_BSTORE 0x0e
169#define OP_VFLUSH 0x0f
170
171
172/* Extended opcodes values (exopc field) */
173
174/* GAMIR - AMOs with implicit operands */
175#define EOP_IR_FETCH 0x01 /* Plain fetch of memory */
176#define EOP_IR_CLR 0x02 /* Fetch and clear */
177#define EOP_IR_INC 0x05 /* Fetch and increment */
178#define EOP_IR_DEC 0x07 /* Fetch and decrement */
179#define EOP_IR_QCHK1 0x0d /* Queue check, 64 byte msg */
180#define EOP_IR_QCHK2 0x0e /* Queue check, 128 byte msg */
181
182/* GAMIRR - Registered AMOs with implicit operands */
183#define EOP_IRR_FETCH 0x01 /* Registered fetch of memory */
184#define EOP_IRR_CLR 0x02 /* Registered fetch and clear */
185#define EOP_IRR_INC 0x05 /* Registered fetch and increment */
186#define EOP_IRR_DEC 0x07 /* Registered fetch and decrement */
187#define EOP_IRR_DECZ 0x0f /* Registered fetch and decrement, update on zero*/
188
189/* GAMER - AMOs with explicit operands */
190#define EOP_ER_SWAP 0x00 /* Exchange argument and memory */
191#define EOP_ER_OR 0x01 /* Logical OR with memory */
192#define EOP_ER_AND 0x02 /* Logical AND with memory */
193#define EOP_ER_XOR 0x03 /* Logical XOR with memory */
194#define EOP_ER_ADD 0x04 /* Add value to memory */
195#define EOP_ER_CSWAP 0x08 /* Compare with operand2, write operand1 if match*/
196#define EOP_ER_CADD 0x0c /* Queue check, operand1*64 byte msg */
197
198/* GAMERR - Registered AMOs with explicit operands */
199#define EOP_ERR_SWAP 0x00 /* Exchange argument and memory */
200#define EOP_ERR_OR 0x01 /* Logical OR with memory */
201#define EOP_ERR_AND 0x02 /* Logical AND with memory */
202#define EOP_ERR_XOR 0x03 /* Logical XOR with memory */
203#define EOP_ERR_ADD 0x04 /* Add value to memory */
204#define EOP_ERR_CSWAP 0x08 /* Compare with operand2, write operand1 if match*/
205#define EOP_ERR_EPOLL 0x09 /* Poll for equality */
206#define EOP_ERR_NPOLL 0x0a /* Poll for inequality */
207
208/* GAMXR - SGI Arithmetic unit */
209#define EOP_XR_CSWAP 0x0b /* Masked compare exchange */
210
211
212/* Transfer types (xtype field) */
213#define XTYPE_B 0x0 /* byte */
214#define XTYPE_S 0x1 /* short (2-byte) */
215#define XTYPE_W 0x2 /* word (4-byte) */
216#define XTYPE_DW 0x3 /* doubleword (8-byte) */
217#define XTYPE_CL 0x6 /* cacheline (64-byte) */
218
219
220/* Instruction access attributes (iaa0, iaa1 fields) */
221#define IAA_RAM 0x0 /* normal cached RAM access */
222#define IAA_NCRAM 0x2 /* noncoherent RAM access */
223#define IAA_MMIO 0x1 /* noncoherent memory-mapped I/O space */
224#define IAA_REGISTER 0x3 /* memory-mapped registers, etc. */
225
226
227/* Instruction mode attributes (ima field) */
228#define IMA_MAPPED 0x0 /* Virtual mode */
229#define IMA_CB_DELAY 0x1 /* hold read responses until status changes */
230#define IMA_UNMAPPED 0x2 /* bypass the TLBs (OS only) */
231#define IMA_INTERRUPT 0x4 /* Interrupt when instruction completes */
232
233/* CBE ecause bits */
234#define CBE_CAUSE_RI (1 << 0)
235#define CBE_CAUSE_INVALID_INSTRUCTION (1 << 1)
236#define CBE_CAUSE_UNMAPPED_MODE_FORBIDDEN (1 << 2)
237#define CBE_CAUSE_PE_CHECK_DATA_ERROR (1 << 3)
238#define CBE_CAUSE_IAA_GAA_MISMATCH (1 << 4)
239#define CBE_CAUSE_DATA_SEGMENT_LIMIT_EXCEPTION (1 << 5)
240#define CBE_CAUSE_OS_FATAL_TLB_FAULT (1 << 6)
241#define CBE_CAUSE_EXECUTION_HW_ERROR (1 << 7)
242#define CBE_CAUSE_TLBHW_ERROR (1 << 8)
243#define CBE_CAUSE_RA_REQUEST_TIMEOUT (1 << 9)
244#define CBE_CAUSE_HA_REQUEST_TIMEOUT (1 << 10)
245#define CBE_CAUSE_RA_RESPONSE_FATAL (1 << 11)
246#define CBE_CAUSE_RA_RESPONSE_NON_FATAL (1 << 12)
247#define CBE_CAUSE_HA_RESPONSE_FATAL (1 << 13)
248#define CBE_CAUSE_HA_RESPONSE_NON_FATAL (1 << 14)
249#define CBE_CAUSE_ADDRESS_SPACE_DECODE_ERROR (1 << 15)
250#define CBE_CAUSE_RESPONSE_DATA_ERROR (1 << 16)
251#define CBE_CAUSE_PROTOCOL_STATE_DATA_ERROR (1 << 17)
252
253/*
254 * Exceptions are retried for the following cases. If any OTHER bits are set
255 * in ecause, the exception is not retryable.
256 */
257#define EXCEPTION_RETRY_BITS (CBE_CAUSE_RESPONSE_DATA_ERROR | \
258 CBE_CAUSE_RA_REQUEST_TIMEOUT | \
259 CBE_CAUSE_TLBHW_ERROR | \
260 CBE_CAUSE_HA_REQUEST_TIMEOUT)
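
/*
 * Editor's sketch, not part of the patch: following the note above, an
 * exception is a retry candidate only when every set ecause bit is one of
 * the EXCEPTION_RETRY_BITS.
 */
static inline int gru_ecause_is_retryable(int ecause)
{
	return ecause != 0 && (ecause & ~EXCEPTION_RETRY_BITS) == 0;
}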
261
262/* Message queue head structure */
263union gru_mesqhead {
264 unsigned long val;
265 struct {
266 unsigned int head;
267 unsigned int limit;
268 };
269};
270
271
272/* Generate the low word of a GRU instruction */
273static inline unsigned int
274__opword(unsigned char opcode, unsigned char exopc, unsigned char xtype,
275 unsigned char iaa0, unsigned char iaa1,
276 unsigned char ima)
277{
278 return (1 << GRU_CB_ICMD_SHFT) |
279 (iaa0 << GRU_CB_IAA0_SHFT) |
280 (iaa1 << GRU_CB_IAA1_SHFT) |
281 (ima << GRU_CB_IMA_SHFT) |
282 (xtype << GRU_CB_XTYPE_SHFT) |
283 (opcode << GRU_CB_OPC_SHFT) |
284 (exopc << GRU_CB_EXOPC_SHFT);
285}
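
/*
 * Editor's example, not part of the patch: __opword(OP_VLOAD, 0, XTYPE_DW,
 * IAA_RAM, 0, CB_IMA(0)) packs icmd=1, ima=IMA_UNMAPPED, xtype=XTYPE_DW and
 * opc=OP_VLOAD into 0x00020305, the word later handed to
 * gru_start_instruction() by gru_vload().
 */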
286
287/*
288 * Architecture specific intrinsics
289 */
290static inline void gru_flush_cache(void *p)
291{
292 __flush_cache(p);
293}
294
295/*
296 * Store the lower 32 bits of the command including the "start" bit. Then
297 * start the instruction executing.
298 */
299static inline void gru_start_instruction(struct gru_instruction *ins, int op32)
300{
301 gru_ordered_store_int(ins, op32);
302}
303
304
305/* Convert "hints" to IMA */
306#define CB_IMA(h) ((h) | IMA_UNMAPPED)
307
308/* Convert data segment cache line index into TRI0 / TRI1 value */
309#define GRU_DINDEX(i) ((i) * GRU_CACHE_LINE_BYTES)
310
311/* Inline functions for GRU instructions.
312 * Note:
313 * - nelem and stride are in elements
314 * - tri0/tri1 is the byte offset from the beginning of the data segment.
315 */
316static inline void gru_vload(void *cb, unsigned long mem_addr,
317 unsigned int tri0, unsigned char xtype, unsigned long nelem,
318 unsigned long stride, unsigned long hints)
319{
320 struct gru_instruction *ins = (struct gru_instruction *)cb;
321
322 ins->baddr0 = (long)mem_addr;
323 ins->nelem = nelem;
324 ins->tri0 = tri0;
325 ins->op1_stride = stride;
326 gru_start_instruction(ins, __opword(OP_VLOAD, 0, xtype, IAA_RAM, 0,
327 CB_IMA(hints)));
328}
329
330static inline void gru_vstore(void *cb, unsigned long mem_addr,
331 unsigned int tri0, unsigned char xtype, unsigned long nelem,
332 unsigned long stride, unsigned long hints)
333{
334 struct gru_instruction *ins = (void *)cb;
335
336 ins->baddr0 = (long)mem_addr;
337 ins->nelem = nelem;
338 ins->tri0 = tri0;
339 ins->op1_stride = stride;
340 gru_start_instruction(ins, __opword(OP_VSTORE, 0, xtype, IAA_RAM, 0,
341 CB_IMA(hints)));
342}
343
344static inline void gru_ivload(void *cb, unsigned long mem_addr,
345 unsigned int tri0, unsigned int tri1, unsigned char xtype,
346 unsigned long nelem, unsigned long hints)
347{
348 struct gru_instruction *ins = (void *)cb;
349
350 ins->baddr0 = (long)mem_addr;
351 ins->nelem = nelem;
352 ins->tri0 = tri0;
353 ins->tri1_bufsize = tri1;
354 gru_start_instruction(ins, __opword(OP_IVLOAD, 0, xtype, IAA_RAM, 0,
355 CB_IMA(hints)));
356}
357
358static inline void gru_ivstore(void *cb, unsigned long mem_addr,
359 unsigned int tri0, unsigned int tri1,
360 unsigned char xtype, unsigned long nelem, unsigned long hints)
361{
362 struct gru_instruction *ins = (void *)cb;
363
364 ins->baddr0 = (long)mem_addr;
365 ins->nelem = nelem;
366 ins->tri0 = tri0;
367 ins->tri1_bufsize = tri1;
368 gru_start_instruction(ins, __opword(OP_IVSTORE, 0, xtype, IAA_RAM, 0,
369 CB_IMA(hints)));
370}
371
372static inline void gru_vset(void *cb, unsigned long mem_addr,
373 unsigned long value, unsigned char xtype, unsigned long nelem,
374 unsigned long stride, unsigned long hints)
375{
376 struct gru_instruction *ins = (void *)cb;
377
378 ins->baddr0 = (long)mem_addr;
379 ins->op2_value_baddr1 = value;
380 ins->nelem = nelem;
381 ins->op1_stride = stride;
382 gru_start_instruction(ins, __opword(OP_VSET, 0, xtype, IAA_RAM, 0,
383 CB_IMA(hints)));
384}
385
386static inline void gru_ivset(void *cb, unsigned long mem_addr,
387 unsigned int tri1, unsigned long value, unsigned char xtype,
388 unsigned long nelem, unsigned long hints)
389{
390 struct gru_instruction *ins = (void *)cb;
391
392 ins->baddr0 = (long)mem_addr;
393 ins->op2_value_baddr1 = value;
394 ins->nelem = nelem;
395 ins->tri1_bufsize = tri1;
396 gru_start_instruction(ins, __opword(OP_IVSET, 0, xtype, IAA_RAM, 0,
397 CB_IMA(hints)));
398}
399
400static inline void gru_vflush(void *cb, unsigned long mem_addr,
401 unsigned long nelem, unsigned char xtype, unsigned long stride,
402 unsigned long hints)
403{
404 struct gru_instruction *ins = (void *)cb;
405
406 ins->baddr0 = (long)mem_addr;
407 ins->op1_stride = stride;
408 ins->nelem = nelem;
409 gru_start_instruction(ins, __opword(OP_VFLUSH, 0, xtype, IAA_RAM, 0,
410 CB_IMA(hints)));
411}
412
413static inline void gru_nop(void *cb, int hints)
414{
415 struct gru_instruction *ins = (void *)cb;
416
417 gru_start_instruction(ins, __opword(OP_NOP, 0, 0, 0, 0, CB_IMA(hints)));
418}
419
420
421static inline void gru_bcopy(void *cb, const unsigned long src,
422 unsigned long dest,
423 unsigned int tri0, unsigned int xtype, unsigned long nelem,
424 unsigned int bufsize, unsigned long hints)
425{
426 struct gru_instruction *ins = (void *)cb;
427
428 ins->baddr0 = (long)src;
429 ins->op2_value_baddr1 = (long)dest;
430 ins->nelem = nelem;
431 ins->tri0 = tri0;
432 ins->tri1_bufsize = bufsize;
433 gru_start_instruction(ins, __opword(OP_BCOPY, 0, xtype, IAA_RAM,
434 IAA_RAM, CB_IMA(hints)));
435}
436
437static inline void gru_bstore(void *cb, const unsigned long src,
438 unsigned long dest, unsigned int tri0, unsigned int xtype,
439 unsigned long nelem, unsigned long hints)
440{
441 struct gru_instruction *ins = (void *)cb;
442
443 ins->baddr0 = (long)src;
444 ins->op2_value_baddr1 = (long)dest;
445 ins->nelem = nelem;
446 ins->tri0 = tri0;
447 gru_start_instruction(ins, __opword(OP_BSTORE, 0, xtype, 0, IAA_RAM,
448 CB_IMA(hints)));
449}
450
451static inline void gru_gamir(void *cb, int exopc, unsigned long src,
452 unsigned int xtype, unsigned long hints)
453{
454 struct gru_instruction *ins = (void *)cb;
455
456 ins->baddr0 = (long)src;
457 gru_start_instruction(ins, __opword(OP_GAMIR, exopc, xtype, IAA_RAM, 0,
458 CB_IMA(hints)));
459}
460
461static inline void gru_gamirr(void *cb, int exopc, unsigned long src,
462 unsigned int xtype, unsigned long hints)
463{
464 struct gru_instruction *ins = (void *)cb;
465
466 ins->baddr0 = (long)src;
467 gru_start_instruction(ins, __opword(OP_GAMIRR, exopc, xtype, IAA_RAM, 0,
468 CB_IMA(hints)));
469}
470
471static inline void gru_gamer(void *cb, int exopc, unsigned long src,
472 unsigned int xtype,
473 unsigned long operand1, unsigned long operand2,
474 unsigned long hints)
475{
476 struct gru_instruction *ins = (void *)cb;
477
478 ins->baddr0 = (long)src;
479 ins->op1_stride = operand1;
480 ins->op2_value_baddr1 = operand2;
481 gru_start_instruction(ins, __opword(OP_GAMER, exopc, xtype, IAA_RAM, 0,
482 CB_IMA(hints)));
483}
484
485static inline void gru_gamerr(void *cb, int exopc, unsigned long src,
486 unsigned int xtype, unsigned long operand1,
487 unsigned long operand2, unsigned long hints)
488{
489 struct gru_instruction *ins = (void *)cb;
490
491 ins->baddr0 = (long)src;
492 ins->op1_stride = operand1;
493 ins->op2_value_baddr1 = operand2;
494 gru_start_instruction(ins, __opword(OP_GAMERR, exopc, xtype, IAA_RAM, 0,
495 CB_IMA(hints)));
496}
497
498static inline void gru_gamxr(void *cb, unsigned long src,
499 unsigned int tri0, unsigned long hints)
500{
501 struct gru_instruction *ins = (void *)cb;
502
503 ins->baddr0 = (long)src;
504 ins->nelem = 4;
505 gru_start_instruction(ins, __opword(OP_GAMXR, EOP_XR_CSWAP, XTYPE_DW,
506 IAA_RAM, 0, CB_IMA(hints)));
507}
508
509static inline void gru_mesq(void *cb, unsigned long queue,
510 unsigned long tri0, unsigned long nelem,
511 unsigned long hints)
512{
513 struct gru_instruction *ins = (void *)cb;
514
515 ins->baddr0 = (long)queue;
516 ins->nelem = nelem;
517 ins->tri0 = tri0;
518 gru_start_instruction(ins, __opword(OP_MESQ, 0, XTYPE_CL, IAA_RAM, 0,
519 CB_IMA(hints)));
520}
521
522static inline unsigned long gru_get_amo_value(void *cb)
523{
524 struct gru_instruction *ins = (void *)cb;
525
526 return ins->avalue;
527}
528
529static inline int gru_get_amo_value_head(void *cb)
530{
531 struct gru_instruction *ins = (void *)cb;
532
533 return ins->avalue & 0xffffffff;
534}
535
536static inline int gru_get_amo_value_limit(void *cb)
537{
538 struct gru_instruction *ins = (void *)cb;
539
540 return ins->avalue >> 32;
541}
542
543static inline union gru_mesqhead gru_mesq_head(int head, int limit)
544{
545 union gru_mesqhead mqh;
546
547 mqh.head = head;
548 mqh.limit = limit;
549 return mqh;
550}
551
552/*
553 * Get struct control_block_extended_exc_detail for CB.
554 */
555extern int gru_get_cb_exception_detail(void *cb,
556 struct control_block_extended_exc_detail *excdet);
557
558#define GRU_EXC_STR_SIZE 256
559
560extern int gru_check_status_proc(void *cb);
561extern int gru_wait_proc(void *cb);
562extern void gru_wait_abort_proc(void *cb);
563
564/*
565 * Control block definition for checking status
566 */
567struct gru_control_block_status {
568 unsigned int icmd :1;
569 unsigned int unused1 :31;
570 unsigned int unused2 :24;
571 unsigned int istatus :2;
572 unsigned int isubstatus :4;
 573 unsigned int unused3 :2;
574};
575
576/* Get CB status */
577static inline int gru_get_cb_status(void *cb)
578{
579 struct gru_control_block_status *cbs = (void *)cb;
580
581 return cbs->istatus;
582}
583
584/* Get CB message queue substatus */
585static inline int gru_get_cb_message_queue_substatus(void *cb)
586{
587 struct gru_control_block_status *cbs = (void *)cb;
588
589 return cbs->isubstatus & CBSS_MSG_QUEUE_MASK;
590}
591
592/* Get CB substatus */
593static inline int gru_get_cb_substatus(void *cb)
594{
595 struct gru_control_block_status *cbs = (void *)cb;
596
597 return cbs->isubstatus;
598}
599
600/* Check the status of a CB. If the CB is in UPM mode, call the
601 * OS to handle the UPM status.
602 * Returns the CB status field value (0 for normal completion)
603 */
604static inline int gru_check_status(void *cb)
605{
606 struct gru_control_block_status *cbs = (void *)cb;
607 int ret = cbs->istatus;
608
609 if (ret == CBS_CALL_OS)
610 ret = gru_check_status_proc(cb);
611 return ret;
612}
613
614/* Wait for CB to complete.
615 * Returns the CB status field value (0 for normal completion)
616 */
617static inline int gru_wait(void *cb)
618{
619 struct gru_control_block_status *cbs = (void *)cb;
 620 int ret = cbs->istatus;
621
622 if (ret != CBS_IDLE)
623 ret = gru_wait_proc(cb);
624 return ret;
625}
626
627/* Wait for CB to complete. Aborts program if error. (Note: error does NOT
 628 * mean TLB miss - only fatal errors such as memory parity error or user
 629 * bugs will cause termination.)
630 */
631static inline void gru_wait_abort(void *cb)
632{
633 struct gru_control_block_status *cbs = (void *)cb;
634
635 if (cbs->istatus != CBS_IDLE)
636 gru_wait_abort_proc(cb);
637}
638
639
640/*
641 * Get a pointer to a control block
642 * gseg - GSeg address returned from gru_get_thread_gru_segment()
643 * index - index of desired CB
644 */
645static inline void *gru_get_cb_pointer(void *gseg,
646 int index)
647{
648 return gseg + GRU_CB_BASE + index * GRU_HANDLE_STRIDE;
649}
650
651/*
652 * Get a pointer to a cacheline in the data segment portion of a GSeg
653 * gseg - GSeg address returned from gru_get_thread_gru_segment()
654 * index - index of desired cache line
655 */
656static inline void *gru_get_data_pointer(void *gseg, int index)
657{
658 return gseg + GRU_DS_BASE + index * GRU_CACHE_LINE_BYTES;
659}
660
661/*
662 * Convert a vaddr into the tri index within the GSEG
663 * vaddr - virtual address of within gseg
664 */
665static inline int gru_get_tri(void *vaddr)
666{
667 return ((unsigned long)vaddr & (GRU_GSEG_PAGESIZE - 1)) - GRU_DS_BASE;
668}
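
/*
 * Editor's usage sketch, not part of the patch.  It only combines helpers
 * defined above in this header; "gseg" is assumed to be a GRU segment
 * already available to the caller.
 */
static inline unsigned long gru_read_dw_example(void *gseg, void *src)
{
	void *cb = gru_get_cb_pointer(gseg, 0);
	void *dsr = gru_get_data_pointer(gseg, 0);

	/* Load one doubleword from src into cache line 0 of the data segment */
	gru_vload(cb, (unsigned long)src, gru_get_tri(dsr), XTYPE_DW, 1, 1, 0);

	/* 0 (CBS_IDLE) means normal completion; anything else is an error */
	if (gru_wait(cb) != CBS_IDLE)
		return 0;
	return *(unsigned long *)dsr;
}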
669#endif /* __GRU_INSTRUCTIONS_H__ */
diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c
new file mode 100644
index 000000000000..3d33015bbf31
--- /dev/null
+++ b/drivers/misc/sgi-gru/grufault.c
@@ -0,0 +1,633 @@
1/*
2 * SN Platform GRU Driver
3 *
4 * FAULT HANDLER FOR GRU DETECTED TLB MISSES
5 *
6 * This file contains code that handles TLB misses within the GRU.
7 * These misses are reported either via interrupts or user polling of
8 * the user CB.
9 *
10 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 */
26
27#include <linux/kernel.h>
28#include <linux/errno.h>
29#include <linux/spinlock.h>
30#include <linux/mm.h>
31#include <linux/hugetlb.h>
32#include <linux/device.h>
33#include <linux/io.h>
34#include <linux/uaccess.h>
35#include <asm/pgtable.h>
36#include "gru.h"
37#include "grutables.h"
38#include "grulib.h"
39#include "gru_instructions.h"
40#include <asm/uv/uv_hub.h>
41
42/*
43 * Test if a physical address is a valid GRU GSEG address
44 */
45static inline int is_gru_paddr(unsigned long paddr)
46{
47 return paddr >= gru_start_paddr && paddr < gru_end_paddr;
48}
49
50/*
51 * Find the vma of a GRU segment. Caller must hold mmap_sem.
52 */
53struct vm_area_struct *gru_find_vma(unsigned long vaddr)
54{
55 struct vm_area_struct *vma;
56
57 vma = find_vma(current->mm, vaddr);
58 if (vma && vma->vm_start <= vaddr && vma->vm_ops == &gru_vm_ops)
59 return vma;
60 return NULL;
61}
62
63/*
64 * Find and lock the gts that contains the specified user vaddr.
65 *
66 * Returns:
67 * - *gts with the mmap_sem locked for read and the GTS locked.
68 * - NULL if vaddr invalid OR is not a valid GSEG vaddr.
69 */
70
71static struct gru_thread_state *gru_find_lock_gts(unsigned long vaddr)
72{
73 struct mm_struct *mm = current->mm;
74 struct vm_area_struct *vma;
75 struct gru_thread_state *gts = NULL;
76
77 down_read(&mm->mmap_sem);
78 vma = gru_find_vma(vaddr);
79 if (vma)
80 gts = gru_find_thread_state(vma, TSID(vaddr, vma));
81 if (gts)
82 mutex_lock(&gts->ts_ctxlock);
83 else
84 up_read(&mm->mmap_sem);
85 return gts;
86}
87
88static struct gru_thread_state *gru_alloc_locked_gts(unsigned long vaddr)
89{
90 struct mm_struct *mm = current->mm;
91 struct vm_area_struct *vma;
92 struct gru_thread_state *gts = NULL;
93
94 down_write(&mm->mmap_sem);
95 vma = gru_find_vma(vaddr);
96 if (vma)
97 gts = gru_alloc_thread_state(vma, TSID(vaddr, vma));
98 if (gts) {
99 mutex_lock(&gts->ts_ctxlock);
100 downgrade_write(&mm->mmap_sem);
101 } else {
102 up_write(&mm->mmap_sem);
103 }
104
105 return gts;
106}
107
108/*
109 * Unlock a GTS that was previously locked with gru_find_lock_gts().
110 */
111static void gru_unlock_gts(struct gru_thread_state *gts)
112{
113 mutex_unlock(&gts->ts_ctxlock);
114 up_read(&current->mm->mmap_sem);
115}
116
117/*
118 * Set a CB.istatus to active using a user virtual address. This must be done
119 * just prior to a TFH RESTART. The new cb.istatus is an in-cache status ONLY.
120 * If the line is evicted, the status may be lost. The in-cache update
121 * is necessary to prevent the user from seeing a stale cb.istatus that will
122 * change as soon as the TFH restart is complete. Races may cause an
123 * occasional failure to clear the cb.istatus, but that is ok.
124 *
125 * If the cb address is not valid (should not happen, but...), nothing
 126 * bad will happen. The get_user()/put_user() will fail but there
127 * are no bad side-effects.
128 */
129static void gru_cb_set_istatus_active(unsigned long __user *cb)
130{
131 union {
132 struct gru_instruction_bits bits;
133 unsigned long dw;
134 } u;
135
136 if (cb) {
137 get_user(u.dw, cb);
138 u.bits.istatus = CBS_ACTIVE;
139 put_user(u.dw, cb);
140 }
141}
142
143/*
 144 * Convert an interrupt IRQ to a pointer to the GRU GTS that caused the
145 * interrupt. Interrupts are always sent to a cpu on the blade that contains the
146 * GRU (except for headless blades which are not currently supported). A blade
147 * has N grus; a block of N consecutive IRQs is assigned to the GRUs. The IRQ
148 * number uniquely identifies the GRU chiplet on the local blade that caused the
149 * interrupt. Always called in interrupt context.
150 */
151static inline struct gru_state *irq_to_gru(int irq)
152{
153 return &gru_base[uv_numa_blade_id()]->bs_grus[irq - IRQ_GRU];
154}
155
156/*
157 * Read & clear a TFM
158 *
 159 * The GRU has an array of fault maps. A map is private to a cpu.
160 * Only one cpu will be accessing a cpu's fault map.
161 *
162 * This function scans the cpu-private fault map & clears all bits that
163 * are set. The function returns a bitmap that indicates the bits that
 164 * were cleared. Note that since the maps may be updated asynchronously by
165 * the GRU, atomic operations must be used to clear bits.
166 */
167static void get_clear_fault_map(struct gru_state *gru,
168 struct gru_tlb_fault_map *map)
169{
170 unsigned long i, k;
171 struct gru_tlb_fault_map *tfm;
172
173 tfm = get_tfm_for_cpu(gru, gru_cpu_fault_map_id());
174 prefetchw(tfm); /* Helps on hardware, required for emulator */
175 for (i = 0; i < BITS_TO_LONGS(GRU_NUM_CBE); i++) {
176 k = tfm->fault_bits[i];
177 if (k)
178 k = xchg(&tfm->fault_bits[i], 0UL);
179 map->fault_bits[i] = k;
180 }
181
182 /*
183 * Not functionally required but helps performance. (Required
184 * on emulator)
185 */
186 gru_flush_cache(tfm);
187}
188
189/*
190 * Atomic (interrupt context) & non-atomic (user context) functions to
191 * convert a vaddr into a physical address. The size of the page
192 * is returned in pageshift.
193 * returns:
194 * 0 - successful
195 * < 0 - error code
196 * 1 - (atomic only) try again in non-atomic context
197 */
198static int non_atomic_pte_lookup(struct vm_area_struct *vma,
199 unsigned long vaddr, int write,
200 unsigned long *paddr, int *pageshift)
201{
202 struct page *page;
203
204 /* ZZZ Need to handle HUGE pages */
205 if (is_vm_hugetlb_page(vma))
206 return -EFAULT;
207 *pageshift = PAGE_SHIFT;
208 if (get_user_pages
209 (current, current->mm, vaddr, 1, write, 0, &page, NULL) <= 0)
210 return -EFAULT;
211 *paddr = page_to_phys(page);
212 put_page(page);
213 return 0;
214}
215
216/*
217 *
218 * atomic_pte_lookup
219 *
220 * Convert a user virtual address to a physical address
221 * Only supports Intel large pages (2MB only) on x86_64.
222 * ZZZ - hugepage support is incomplete
223 */
224static int atomic_pte_lookup(struct vm_area_struct *vma, unsigned long vaddr,
225 int write, unsigned long *paddr, int *pageshift)
226{
227 pgd_t *pgdp;
228 pmd_t *pmdp;
229 pud_t *pudp;
230 pte_t pte;
231
232 WARN_ON(irqs_disabled()); /* ZZZ debug */
233
234 local_irq_disable();
235 pgdp = pgd_offset(vma->vm_mm, vaddr);
236 if (unlikely(pgd_none(*pgdp)))
237 goto err;
238
239 pudp = pud_offset(pgdp, vaddr);
240 if (unlikely(pud_none(*pudp)))
241 goto err;
242
243 pmdp = pmd_offset(pudp, vaddr);
244 if (unlikely(pmd_none(*pmdp)))
245 goto err;
246#ifdef CONFIG_X86_64
247 if (unlikely(pmd_large(*pmdp)))
248 pte = *(pte_t *) pmdp;
249 else
250#endif
251 pte = *pte_offset_kernel(pmdp, vaddr);
252
253 local_irq_enable();
254
255 if (unlikely(!pte_present(pte) ||
256 (write && (!pte_write(pte) || !pte_dirty(pte)))))
257 return 1;
258
259 *paddr = pte_pfn(pte) << PAGE_SHIFT;
260 *pageshift = is_vm_hugetlb_page(vma) ? HPAGE_SHIFT : PAGE_SHIFT;
261 return 0;
262
263err:
264 local_irq_enable();
265 return 1;
266}
267
268/*
 269 * Drop a TLB entry into the GRU. The fault is described by info in a TFH.
270 * Input:
271 * cb Address of user CBR. Null if not running in user context
272 * Return:
273 * 0 = dropin, exception, or switch to UPM successful
274 * 1 = range invalidate active
275 * < 0 = error code
276 *
277 */
278static int gru_try_dropin(struct gru_thread_state *gts,
279 struct gru_tlb_fault_handle *tfh,
280 unsigned long __user *cb)
281{
282 struct mm_struct *mm = gts->ts_mm;
283 struct vm_area_struct *vma;
284 int pageshift, asid, write, ret;
285 unsigned long paddr, gpa, vaddr;
286
287 /*
288 * NOTE: The GRU contains magic hardware that eliminates races between
289 * TLB invalidates and TLB dropins. If an invalidate occurs
290 * in the window between reading the TFH and the subsequent TLB dropin,
291 * the dropin is ignored. This eliminates the need for additional locks.
292 */
293
294 /*
 295 * Error if TFH state is IDLE or FMM mode & the user is issuing a UPM call.
296 * Might be a hardware race OR a stupid user. Ignore FMM because FMM
297 * is a transient state.
298 */
299 if (tfh->state == TFHSTATE_IDLE)
300 goto failidle;
301 if (tfh->state == TFHSTATE_MISS_FMM && cb)
302 goto failfmm;
303
304 write = (tfh->cause & TFHCAUSE_TLB_MOD) != 0;
305 vaddr = tfh->missvaddr;
306 asid = tfh->missasid;
307 if (asid == 0)
308 goto failnoasid;
309
310 rmb(); /* TFH must be cache resident before reading ms_range_active */
311
312 /*
313 * TFH is cache resident - at least briefly. Fail the dropin
314 * if a range invalidate is active.
315 */
316 if (atomic_read(&gts->ts_gms->ms_range_active))
317 goto failactive;
318
319 vma = find_vma(mm, vaddr);
320 if (!vma)
321 goto failinval;
322
323 /*
324 * Atomic lookup is faster & usually works even if called in non-atomic
325 * context.
326 */
327 ret = atomic_pte_lookup(vma, vaddr, write, &paddr, &pageshift);
328 if (ret) {
329 if (!cb)
330 goto failupm;
331 if (non_atomic_pte_lookup(vma, vaddr, write, &paddr,
332 &pageshift))
333 goto failinval;
334 }
335 if (is_gru_paddr(paddr))
336 goto failinval;
337
338 paddr = paddr & ~((1UL << pageshift) - 1);
339 gpa = uv_soc_phys_ram_to_gpa(paddr);
340 gru_cb_set_istatus_active(cb);
341 tfh_write_restart(tfh, gpa, GAA_RAM, vaddr, asid, write,
342 GRU_PAGESIZE(pageshift));
343 STAT(tlb_dropin);
344 gru_dbg(grudev,
345 "%s: tfh 0x%p, vaddr 0x%lx, asid 0x%x, ps %d, gpa 0x%lx\n",
346 ret ? "non-atomic" : "atomic", tfh, vaddr, asid,
347 pageshift, gpa);
348 return 0;
349
350failnoasid:
351 /* No asid (delayed unload). */
352 STAT(tlb_dropin_fail_no_asid);
353 gru_dbg(grudev, "FAILED no_asid tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
354 if (!cb)
355 tfh_user_polling_mode(tfh);
356 else
357 gru_flush_cache(tfh);
358 return -EAGAIN;
359
360failupm:
361 /* Atomic failure switch CBR to UPM */
362 tfh_user_polling_mode(tfh);
363 STAT(tlb_dropin_fail_upm);
364 gru_dbg(grudev, "FAILED upm tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
365 return 1;
366
367failfmm:
368 /* FMM state on UPM call */
369 STAT(tlb_dropin_fail_fmm);
370 gru_dbg(grudev, "FAILED fmm tfh: 0x%p, state %d\n", tfh, tfh->state);
371 return 0;
372
373failidle:
374 /* TFH was idle - no miss pending */
375 gru_flush_cache(tfh);
376 if (cb)
377 gru_flush_cache(cb);
378 STAT(tlb_dropin_fail_idle);
379 gru_dbg(grudev, "FAILED idle tfh: 0x%p, state %d\n", tfh, tfh->state);
380 return 0;
381
382failinval:
383 /* All errors (atomic & non-atomic) switch CBR to EXCEPTION state */
384 tfh_exception(tfh);
385 STAT(tlb_dropin_fail_invalid);
386 gru_dbg(grudev, "FAILED inval tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
387 return -EFAULT;
388
389failactive:
390 /* Range invalidate active. Switch to UPM iff atomic */
391 if (!cb)
392 tfh_user_polling_mode(tfh);
393 else
394 gru_flush_cache(tfh);
395 STAT(tlb_dropin_fail_range_active);
396 gru_dbg(grudev, "FAILED range active: tfh 0x%p, vaddr 0x%lx\n",
397 tfh, vaddr);
398 return 1;
399}
400
401/*
402 * Process an external interrupt from the GRU. This interrupt is
403 * caused by a TLB miss.
404 * Note that this is the interrupt handler that is registered with linux
405 * interrupt handlers.
406 */
407irqreturn_t gru_intr(int irq, void *dev_id)
408{
409 struct gru_state *gru;
410 struct gru_tlb_fault_map map;
411 struct gru_thread_state *gts;
412 struct gru_tlb_fault_handle *tfh = NULL;
413 int cbrnum, ctxnum;
414
415 STAT(intr);
416
417 gru = irq_to_gru(irq);
418 if (!gru) {
419 dev_err(grudev, "GRU: invalid interrupt: cpu %d, irq %d\n",
420 raw_smp_processor_id(), irq);
421 return IRQ_NONE;
422 }
423 get_clear_fault_map(gru, &map);
424 gru_dbg(grudev, "irq %d, gru %x, map 0x%lx\n", irq, gru->gs_gid,
425 map.fault_bits[0]);
426
427 for_each_cbr_in_tfm(cbrnum, map.fault_bits) {
428 tfh = get_tfh_by_index(gru, cbrnum);
429 prefetchw(tfh); /* Helps on hdw, required for emulator */
430
431 /*
432 * When hardware sets a bit in the faultmap, it implicitly
433 * locks the GRU context so that it cannot be unloaded.
434 * The gts cannot change until a TFH start/writestart command
435 * is issued.
436 */
437 ctxnum = tfh->ctxnum;
438 gts = gru->gs_gts[ctxnum];
439
440 /*
441 * This is running in interrupt context. Trylock the mmap_sem.
442 * If it fails, retry the fault in user context.
443 */
444 if (down_read_trylock(&gts->ts_mm->mmap_sem)) {
445 gru_try_dropin(gts, tfh, NULL);
446 up_read(&gts->ts_mm->mmap_sem);
447 } else {
448 tfh_user_polling_mode(tfh);
449 }
450 }
451 return IRQ_HANDLED;
452}
453
454
455static int gru_user_dropin(struct gru_thread_state *gts,
456 struct gru_tlb_fault_handle *tfh,
457 unsigned long __user *cb)
458{
459 struct gru_mm_struct *gms = gts->ts_gms;
460 int ret;
461
462 while (1) {
463 wait_event(gms->ms_wait_queue,
464 atomic_read(&gms->ms_range_active) == 0);
465 prefetchw(tfh); /* Helps on hdw, required for emulator */
466 ret = gru_try_dropin(gts, tfh, cb);
467 if (ret <= 0)
468 return ret;
469 STAT(call_os_wait_queue);
470 }
471}
472
473/*
474 * This interface is called as a result of a user detecting a "call OS" bit
 475 * in a user CB. This normally means that a TLB fault has occurred.
476 * cb - user virtual address of the CB
477 */
478int gru_handle_user_call_os(unsigned long cb)
479{
480 struct gru_tlb_fault_handle *tfh;
481 struct gru_thread_state *gts;
482 unsigned long __user *cbp;
483 int ucbnum, cbrnum, ret = -EINVAL;
484
485 STAT(call_os);
486 gru_dbg(grudev, "address 0x%lx\n", cb);
487
488 /* sanity check the cb pointer */
489 ucbnum = get_cb_number((void *)cb);
490 if ((cb & (GRU_HANDLE_STRIDE - 1)) || ucbnum >= GRU_NUM_CB)
491 return -EINVAL;
492 cbp = (unsigned long *)cb;
493
494 gts = gru_find_lock_gts(cb);
495 if (!gts)
496 return -EINVAL;
497
498 if (ucbnum >= gts->ts_cbr_au_count * GRU_CBR_AU_SIZE) {
499 ret = -EINVAL;
500 goto exit;
501 }
502
503 /*
504 * If force_unload is set, the UPM TLB fault is phony. The task
505 * has migrated to another node and the GSEG must be moved. Just
506 * unload the context. The task will page fault and assign a new
507 * context.
508 */
509 ret = -EAGAIN;
510 cbrnum = thread_cbr_number(gts, ucbnum);
511 if (gts->ts_force_unload) {
512 gru_unload_context(gts, 1);
513 } else if (gts->ts_gru) {
514 tfh = get_tfh_by_index(gts->ts_gru, cbrnum);
515 ret = gru_user_dropin(gts, tfh, cbp);
516 }
517exit:
518 gru_unlock_gts(gts);
519 return ret;
520}
521
522/*
523 * Fetch the exception detail information for a CB that terminated with
524 * an exception.
525 */
526int gru_get_exception_detail(unsigned long arg)
527{
528 struct control_block_extended_exc_detail excdet;
529 struct gru_control_block_extended *cbe;
530 struct gru_thread_state *gts;
531 int ucbnum, cbrnum, ret;
532
533 STAT(user_exception);
534 if (copy_from_user(&excdet, (void __user *)arg, sizeof(excdet)))
535 return -EFAULT;
536
537 gru_dbg(grudev, "address 0x%lx\n", excdet.cb);
538 gts = gru_find_lock_gts(excdet.cb);
539 if (!gts)
540 return -EINVAL;
541
542 if (gts->ts_gru) {
543 ucbnum = get_cb_number((void *)excdet.cb);
544 cbrnum = thread_cbr_number(gts, ucbnum);
545 cbe = get_cbe_by_index(gts->ts_gru, cbrnum);
546 excdet.opc = cbe->opccpy;
547 excdet.exopc = cbe->exopccpy;
548 excdet.ecause = cbe->ecause;
549 excdet.exceptdet0 = cbe->idef1upd;
550 excdet.exceptdet1 = cbe->idef3upd;
551 ret = 0;
552 } else {
553 ret = -EAGAIN;
554 }
555 gru_unlock_gts(gts);
556
557 gru_dbg(grudev, "address 0x%lx, ecause 0x%x\n", excdet.cb,
558 excdet.ecause);
559 if (!ret && copy_to_user((void __user *)arg, &excdet, sizeof(excdet)))
560 ret = -EFAULT;
561 return ret;
562}
563
564/*
565 * User request to unload a context. Content is saved for possible reload.
566 */
567int gru_user_unload_context(unsigned long arg)
568{
569 struct gru_thread_state *gts;
570 struct gru_unload_context_req req;
571
572 STAT(user_unload_context);
573 if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
574 return -EFAULT;
575
576 gru_dbg(grudev, "gseg 0x%lx\n", req.gseg);
577
578 gts = gru_find_lock_gts(req.gseg);
579 if (!gts)
580 return -EINVAL;
581
582 if (gts->ts_gru)
583 gru_unload_context(gts, 1);
584 gru_unlock_gts(gts);
585
586 return 0;
587}
588
589/*
590 * User request to flush a range of virtual addresses from the GRU TLB
591 * (Mainly for testing).
592 */
593int gru_user_flush_tlb(unsigned long arg)
594{
595 struct gru_thread_state *gts;
596 struct gru_flush_tlb_req req;
597
598 STAT(user_flush_tlb);
599 if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
600 return -EFAULT;
601
602 gru_dbg(grudev, "gseg 0x%lx, vaddr 0x%lx, len 0x%lx\n", req.gseg,
603 req.vaddr, req.len);
604
605 gts = gru_find_lock_gts(req.gseg);
606 if (!gts)
607 return -EINVAL;
608
609 gru_flush_tlb_range(gts->ts_gms, req.vaddr, req.vaddr + req.len);
610 gru_unlock_gts(gts);
611
612 return 0;
613}
614
615/*
616 * Register the current task as the user of the GSEG slice.
617 * Needed for TLB fault interrupt targeting.
618 */
619int gru_set_task_slice(long address)
620{
621 struct gru_thread_state *gts;
622
623 STAT(set_task_slice);
624 gru_dbg(grudev, "address 0x%lx\n", address);
625 gts = gru_alloc_locked_gts(address);
626 if (!gts)
627 return -EINVAL;
628
629 gts->ts_tgid_owner = current->tgid;
630 gru_unlock_gts(gts);
631
632 return 0;
633}
diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c
new file mode 100644
index 000000000000..23c91f5f6b61
--- /dev/null
+++ b/drivers/misc/sgi-gru/grufile.c
@@ -0,0 +1,485 @@
1/*
2 * SN Platform GRU Driver
3 *
4 * FILE OPERATIONS & DRIVER INITIALIZATION
5 *
6 * This file supports the user system call for file open, close, mmap, etc.
 7 * This also includes the driver initialization code.
8 *
9 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28#include <linux/errno.h>
29#include <linux/slab.h>
30#include <linux/mm.h>
31#include <linux/io.h>
32#include <linux/smp_lock.h>
33#include <linux/spinlock.h>
34#include <linux/device.h>
35#include <linux/miscdevice.h>
36#include <linux/interrupt.h>
37#include <linux/proc_fs.h>
38#include <linux/uaccess.h>
39#include "gru.h"
40#include "grulib.h"
41#include "grutables.h"
42
43#if defined CONFIG_X86_64
44#include <asm/genapic.h>
45#include <asm/irq.h>
46#define IS_UV() is_uv_system()
47#elif defined CONFIG_IA64
48#include <asm/system.h>
49#include <asm/sn/simulator.h>
50/* temp support for running on hardware simulator */
51#define IS_UV() IS_MEDUSA() || ia64_platform_is("uv")
52#else
53#define IS_UV() 0
54#endif
55
56#include <asm/uv/uv_hub.h>
57#include <asm/uv/uv_mmrs.h>
58
59struct gru_blade_state *gru_base[GRU_MAX_BLADES] __read_mostly;
60unsigned long gru_start_paddr, gru_end_paddr __read_mostly;
61struct gru_stats_s gru_stats;
62
63/* Guaranteed user available resources on each node */
64static int max_user_cbrs, max_user_dsr_bytes;
65
66static struct file_operations gru_fops;
67static struct miscdevice gru_miscdev;
68
69
70/*
71 * gru_vma_close
72 *
73 * Called when unmapping a device mapping. Frees all gru resources
74 * and tables belonging to the vma.
75 */
76static void gru_vma_close(struct vm_area_struct *vma)
77{
78 struct gru_vma_data *vdata;
79 struct gru_thread_state *gts;
80 struct list_head *entry, *next;
81
82 if (!vma->vm_private_data)
83 return;
84
85 vdata = vma->vm_private_data;
86 vma->vm_private_data = NULL;
87 gru_dbg(grudev, "vma %p, file %p, vdata %p\n", vma, vma->vm_file,
88 vdata);
89 list_for_each_safe(entry, next, &vdata->vd_head) {
90 gts =
91 list_entry(entry, struct gru_thread_state, ts_next);
92 list_del(&gts->ts_next);
93 mutex_lock(&gts->ts_ctxlock);
94 if (gts->ts_gru)
95 gru_unload_context(gts, 0);
96 mutex_unlock(&gts->ts_ctxlock);
97 gts_drop(gts);
98 }
99 kfree(vdata);
100 STAT(vdata_free);
101}
102
103/*
104 * gru_file_mmap
105 *
 106 * Called when mmapping the device. Initializes the vma with a fault handler
107 * and private data structure necessary to allocate, track, and free the
108 * underlying pages.
109 */
110static int gru_file_mmap(struct file *file, struct vm_area_struct *vma)
111{
112 if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) != (VM_SHARED | VM_WRITE))
113 return -EPERM;
114
115 if (vma->vm_start & (GRU_GSEG_PAGESIZE - 1) ||
116 vma->vm_end & (GRU_GSEG_PAGESIZE - 1))
117 return -EINVAL;
118
119 vma->vm_flags |=
120 (VM_IO | VM_DONTCOPY | VM_LOCKED | VM_DONTEXPAND | VM_PFNMAP |
121 VM_RESERVED);
122 vma->vm_page_prot = PAGE_SHARED;
123 vma->vm_ops = &gru_vm_ops;
124
125 vma->vm_private_data = gru_alloc_vma_data(vma, 0);
126 if (!vma->vm_private_data)
127 return -ENOMEM;
128
129 gru_dbg(grudev, "file %p, vaddr 0x%lx, vma %p, vdata %p\n",
130 file, vma->vm_start, vma, vma->vm_private_data);
131 return 0;
132}
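
/*
 * Editor's sketch, not part of the patch: a minimal user-space mapping that
 * satisfies the checks above, assuming the misc device registered at the
 * bottom of this file shows up as /dev/gru.  The mapping must be MAP_SHARED
 * and writable, and aligned/sized in multiples of GRU_GSEG_PAGESIZE:
 *
 *	int fd = open("/dev/gru", O_RDWR);
 *	void *gseg = mmap(NULL, GRU_GSEG_PAGESIZE, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *	// a GRU_CREATE_CONTEXT ioctl (see below) then sizes the CBRs/DSR
 */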
133
134/*
135 * Create a new GRU context
136 */
137static int gru_create_new_context(unsigned long arg)
138{
139 struct gru_create_context_req req;
140 struct vm_area_struct *vma;
141 struct gru_vma_data *vdata;
142 int ret = -EINVAL;
143
144
145 if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
146 return -EFAULT;
147
148 if (req.data_segment_bytes == 0 ||
149 req.data_segment_bytes > max_user_dsr_bytes)
150 return -EINVAL;
151 if (!req.control_blocks || !req.maximum_thread_count ||
152 req.control_blocks > max_user_cbrs)
153 return -EINVAL;
154
155 if (!(req.options & GRU_OPT_MISS_MASK))
156 req.options |= GRU_OPT_MISS_FMM_INTR;
157
158 down_write(&current->mm->mmap_sem);
159 vma = gru_find_vma(req.gseg);
160 if (vma) {
161 vdata = vma->vm_private_data;
162 vdata->vd_user_options = req.options;
163 vdata->vd_dsr_au_count =
164 GRU_DS_BYTES_TO_AU(req.data_segment_bytes);
165 vdata->vd_cbr_au_count = GRU_CB_COUNT_TO_AU(req.control_blocks);
166 ret = 0;
167 }
168 up_write(&current->mm->mmap_sem);
169
170 return ret;
171}
172
173/*
174 * Get GRU configuration info (temp - for emulator testing)
175 */
176static long gru_get_config_info(unsigned long arg)
177{
178 struct gru_config_info info;
179 int nodesperblade;
180
181 if (num_online_nodes() > 1 &&
182 (uv_node_to_blade_id(1) == uv_node_to_blade_id(0)))
183 nodesperblade = 2;
184 else
185 nodesperblade = 1;
186 info.cpus = num_online_cpus();
187 info.nodes = num_online_nodes();
188 info.blades = info.nodes / nodesperblade;
189 info.chiplets = GRU_CHIPLETS_PER_BLADE * info.blades;
190
191 if (copy_to_user((void __user *)arg, &info, sizeof(info)))
192 return -EFAULT;
193 return 0;
194}
195
196/*
197 * Get GRU chiplet status
198 */
199static long gru_get_chiplet_status(unsigned long arg)
200{
201 struct gru_state *gru;
202 struct gru_chiplet_info info;
203
204 if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
205 return -EFAULT;
206
207 if (info.node == -1)
208 info.node = numa_node_id();
209 if (info.node >= num_possible_nodes() ||
210 info.chiplet >= GRU_CHIPLETS_PER_HUB ||
211 info.node < 0 || info.chiplet < 0)
212 return -EINVAL;
213
214 info.blade = uv_node_to_blade_id(info.node);
215 gru = get_gru(info.blade, info.chiplet);
216
217 info.total_dsr_bytes = GRU_NUM_DSR_BYTES;
218 info.total_cbr = GRU_NUM_CB;
219 info.total_user_dsr_bytes = GRU_NUM_DSR_BYTES -
220 gru->gs_reserved_dsr_bytes;
221 info.total_user_cbr = GRU_NUM_CB - gru->gs_reserved_cbrs;
222 info.free_user_dsr_bytes = hweight64(gru->gs_dsr_map) *
223 GRU_DSR_AU_BYTES;
224 info.free_user_cbr = hweight64(gru->gs_cbr_map) * GRU_CBR_AU_SIZE;
225
226 if (copy_to_user((void __user *)arg, &info, sizeof(info)))
227 return -EFAULT;
228 return 0;
229}
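
/*
 * Editor's sketch, not part of the patch: querying the chiplet resources
 * reported above from user space.  GRU_GET_CHIPLET_STATUS comes from
 * grulib.h and struct gru_chiplet_info from gru.h; node == -1 selects the
 * calling cpu's node, as handled above:
 *
 *	struct gru_chiplet_info info = { .node = -1, .chiplet = 0 };
 *
 *	if (ioctl(fd, GRU_GET_CHIPLET_STATUS, &info) == 0)
 *		printf("free CBRs %d, free DSR bytes %d\n",
 *		       info.free_user_cbr, info.free_user_dsr_bytes);
 */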
230
231/*
232 * gru_file_unlocked_ioctl
233 *
234 * Called to update file attributes via IOCTL calls.
235 */
236static long gru_file_unlocked_ioctl(struct file *file, unsigned int req,
237 unsigned long arg)
238{
239 int err = -EBADRQC;
240
241 gru_dbg(grudev, "file %p\n", file);
242
243 switch (req) {
244 case GRU_CREATE_CONTEXT:
245 err = gru_create_new_context(arg);
246 break;
247 case GRU_SET_TASK_SLICE:
248 err = gru_set_task_slice(arg);
249 break;
250 case GRU_USER_GET_EXCEPTION_DETAIL:
251 err = gru_get_exception_detail(arg);
252 break;
253 case GRU_USER_UNLOAD_CONTEXT:
254 err = gru_user_unload_context(arg);
255 break;
256 case GRU_GET_CHIPLET_STATUS:
257 err = gru_get_chiplet_status(arg);
258 break;
259 case GRU_USER_FLUSH_TLB:
260 err = gru_user_flush_tlb(arg);
261 break;
262 case GRU_USER_CALL_OS:
263 err = gru_handle_user_call_os(arg);
264 break;
265 case GRU_GET_CONFIG_INFO:
266 err = gru_get_config_info(arg);
267 break;
268 }
269 return err;
270}
271
272/*
273 * Called at init time to build tables for all GRUs that are present in the
274 * system.
275 */
276static void gru_init_chiplet(struct gru_state *gru, unsigned long paddr,
277 void *vaddr, int nid, int bid, int grunum)
278{
279 spin_lock_init(&gru->gs_lock);
280 spin_lock_init(&gru->gs_asid_lock);
281 gru->gs_gru_base_paddr = paddr;
282 gru->gs_gru_base_vaddr = vaddr;
283 gru->gs_gid = bid * GRU_CHIPLETS_PER_BLADE + grunum;
284 gru->gs_blade = gru_base[bid];
285 gru->gs_blade_id = bid;
286 gru->gs_cbr_map = (GRU_CBR_AU == 64) ? ~0 : (1UL << GRU_CBR_AU) - 1;
287 gru->gs_dsr_map = (1UL << GRU_DSR_AU) - 1;
288 gru_tgh_flush_init(gru);
289 gru_dbg(grudev, "bid %d, nid %d, gru %x, vaddr %p (0x%lx)\n",
290 bid, nid, gru->gs_gid, gru->gs_gru_base_vaddr,
291 gru->gs_gru_base_paddr);
292 gru_kservices_init(gru);
293}
294
295static int gru_init_tables(unsigned long gru_base_paddr, void *gru_base_vaddr)
296{
297 int pnode, nid, bid, chip;
298 int cbrs, dsrbytes, n;
299 int order = get_order(sizeof(struct gru_blade_state));
300 struct page *page;
301 struct gru_state *gru;
302 unsigned long paddr;
303 void *vaddr;
304
305 max_user_cbrs = GRU_NUM_CB;
306 max_user_dsr_bytes = GRU_NUM_DSR_BYTES;
307 for_each_online_node(nid) {
308 bid = uv_node_to_blade_id(nid);
309 pnode = uv_node_to_pnode(nid);
310 if (gru_base[bid])
311 continue;
312 page = alloc_pages_node(nid, GFP_KERNEL, order);
313 if (!page)
314 goto fail;
315 gru_base[bid] = page_address(page);
316 memset(gru_base[bid], 0, sizeof(struct gru_blade_state));
317 gru_base[bid]->bs_lru_gru = &gru_base[bid]->bs_grus[0];
318 spin_lock_init(&gru_base[bid]->bs_lock);
319
320 dsrbytes = 0;
321 cbrs = 0;
322 for (gru = gru_base[bid]->bs_grus, chip = 0;
323 chip < GRU_CHIPLETS_PER_BLADE;
324 chip++, gru++) {
325 paddr = gru_chiplet_paddr(gru_base_paddr, pnode, chip);
326 vaddr = gru_chiplet_vaddr(gru_base_vaddr, pnode, chip);
327 gru_init_chiplet(gru, paddr, vaddr, bid, nid, chip);
328 n = hweight64(gru->gs_cbr_map) * GRU_CBR_AU_SIZE;
329 cbrs = max(cbrs, n);
330 n = hweight64(gru->gs_dsr_map) * GRU_DSR_AU_BYTES;
331 dsrbytes = max(dsrbytes, n);
332 }
333 max_user_cbrs = min(max_user_cbrs, cbrs);
334 max_user_dsr_bytes = min(max_user_dsr_bytes, dsrbytes);
335 }
336
337 return 0;
338
339fail:
340 for (nid--; nid >= 0; nid--)
341 free_pages((unsigned long)gru_base[nid], order);
342 return -ENOMEM;
343}
344
345#ifdef CONFIG_IA64
346
347static int get_base_irq(void)
348{
349 return IRQ_GRU;
350}
351
352#elif defined CONFIG_X86_64
353
354static void noop(unsigned int irq)
355{
356}
357
358static struct irq_chip gru_chip = {
359 .name = "gru",
360 .mask = noop,
361 .unmask = noop,
362 .ack = noop,
363};
364
365static int get_base_irq(void)
366{
367 set_irq_chip(IRQ_GRU, &gru_chip);
368 set_irq_chip(IRQ_GRU + 1, &gru_chip);
369 return IRQ_GRU;
370}
371#endif
372
373/*
374 * gru_init
375 *
376 * Called at boot or module load time to initialize the GRUs.
377 */
378static int __init gru_init(void)
379{
380 int ret, irq, chip;
381 char id[10];
382 void *gru_start_vaddr;
383
384 if (!IS_UV())
385 return 0;
386
387#if defined CONFIG_IA64
388 gru_start_paddr = 0xd000000000UL; /* ZZZZZZZZZZZZZZZZZZZ fixme */
389#else
390 gru_start_paddr = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR) &
391 0x7fffffffffffUL;
392
393#endif
394 gru_start_vaddr = __va(gru_start_paddr);
395 gru_end_paddr = gru_start_paddr + MAX_NUMNODES * GRU_SIZE;
396 printk(KERN_INFO "GRU space: 0x%lx - 0x%lx\n",
397 gru_start_paddr, gru_end_paddr);
398 irq = get_base_irq();
399 for (chip = 0; chip < GRU_CHIPLETS_PER_BLADE; chip++) {
400 ret = request_irq(irq + chip, gru_intr, 0, id, NULL);
401 if (ret) {
402 printk(KERN_ERR "%s: request_irq failed\n",
403 GRU_DRIVER_ID_STR);
404 goto exit1;
405 }
406 }
407
408 ret = misc_register(&gru_miscdev);
409 if (ret) {
410 printk(KERN_ERR "%s: misc_register failed\n",
411 GRU_DRIVER_ID_STR);
412 goto exit1;
413 }
414
415 ret = gru_proc_init();
416 if (ret) {
417 printk(KERN_ERR "%s: proc init failed\n", GRU_DRIVER_ID_STR);
418 goto exit2;
419 }
420
421 ret = gru_init_tables(gru_start_paddr, gru_start_vaddr);
422 if (ret) {
423 printk(KERN_ERR "%s: init tables failed\n", GRU_DRIVER_ID_STR);
424 goto exit3;
425 }
426
427 printk(KERN_INFO "%s: v%s\n", GRU_DRIVER_ID_STR,
428 GRU_DRIVER_VERSION_STR);
429 return 0;
430
431exit3:
432 gru_proc_exit();
433exit2:
434 misc_deregister(&gru_miscdev);
435exit1:
436 for (--chip; chip >= 0; chip--)
437 free_irq(irq + chip, NULL);
438 return ret;
439
440}
441
442static void __exit gru_exit(void)
443{
444 int i, bid;
445 int order = get_order(sizeof(struct gru_state) *
446 GRU_CHIPLETS_PER_BLADE);
447
448 for (i = 0; i < GRU_CHIPLETS_PER_BLADE; i++)
449 free_irq(IRQ_GRU + i, NULL);
450
451 for (bid = 0; bid < GRU_MAX_BLADES; bid++)
452 free_pages((unsigned long)gru_base[bid], order);
453
454 misc_deregister(&gru_miscdev);
455 gru_proc_exit();
456}
457
458static struct file_operations gru_fops = {
459 .owner = THIS_MODULE,
460 .unlocked_ioctl = gru_file_unlocked_ioctl,
461 .mmap = gru_file_mmap,
462};
463
464static struct miscdevice gru_miscdev = {
465 .minor = MISC_DYNAMIC_MINOR,
466 .name = "gru",
467 .fops = &gru_fops,
468};
469
470struct vm_operations_struct gru_vm_ops = {
471 .close = gru_vma_close,
472 .fault = gru_fault,
473};
474
475module_init(gru_init);
476module_exit(gru_exit);
477
478module_param(gru_options, ulong, 0644);
479MODULE_PARM_DESC(gru_options, "Various debug options");
480
481MODULE_AUTHOR("Silicon Graphics, Inc.");
482MODULE_LICENSE("GPL");
483MODULE_DESCRIPTION(GRU_DRIVER_ID_STR GRU_DRIVER_VERSION_STR);
484MODULE_VERSION(GRU_DRIVER_VERSION_STR);
485
diff --git a/drivers/misc/sgi-gru/gruhandles.h b/drivers/misc/sgi-gru/gruhandles.h
new file mode 100644
index 000000000000..d16031d62673
--- /dev/null
+++ b/drivers/misc/sgi-gru/gruhandles.h
@@ -0,0 +1,663 @@
1/*
2 * SN Platform GRU Driver
3 *
4 * GRU HANDLE DEFINITION
5 *
6 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#ifndef __GRUHANDLES_H__
24#define __GRUHANDLES_H__
25#include "gru_instructions.h"
26
27/*
28 * Manifest constants for GRU Memory Map
29 */
30#define GRU_GSEG0_BASE 0
31#define GRU_MCS_BASE (64 * 1024 * 1024)
32#define GRU_SIZE (128UL * 1024 * 1024)
33
34/* Handle & resource counts */
35#define GRU_NUM_CB 128
36#define GRU_NUM_DSR_BYTES (32 * 1024)
37#define GRU_NUM_TFM 16
38#define GRU_NUM_TGH 24
39#define GRU_NUM_CBE 128
40#define GRU_NUM_TFH 128
41#define GRU_NUM_CCH 16
42#define GRU_NUM_GSH 1
43
44/* Maximum resource counts that can be reserved by user programs */
45#define GRU_NUM_USER_CBR GRU_NUM_CBE
46#define GRU_NUM_USER_DSR_BYTES GRU_NUM_DSR_BYTES
47
48/* Bytes per handle & handle stride. Code assumes all cb, tfh, cbe handles
49 * are the same */
50#define GRU_HANDLE_BYTES 64
51#define GRU_HANDLE_STRIDE 256
52
53/* Base addresses of handles */
54#define GRU_TFM_BASE (GRU_MCS_BASE + 0x00000)
55#define GRU_TGH_BASE (GRU_MCS_BASE + 0x08000)
56#define GRU_CBE_BASE (GRU_MCS_BASE + 0x10000)
57#define GRU_TFH_BASE (GRU_MCS_BASE + 0x18000)
58#define GRU_CCH_BASE (GRU_MCS_BASE + 0x20000)
59#define GRU_GSH_BASE (GRU_MCS_BASE + 0x30000)
60
61/* User gseg constants */
62#define GRU_GSEG_STRIDE (4 * 1024 * 1024)
63#define GSEG_BASE(a) ((a) & ~(GRU_GSEG_PAGESIZE - 1))
64
65/* Data segment constants */
66#define GRU_DSR_AU_BYTES 1024
67#define GRU_DSR_CL (GRU_NUM_DSR_BYTES / GRU_CACHE_LINE_BYTES)
68#define GRU_DSR_AU_CL (GRU_DSR_AU_BYTES / GRU_CACHE_LINE_BYTES)
69#define GRU_DSR_AU (GRU_NUM_DSR_BYTES / GRU_DSR_AU_BYTES)
70
71/* Control block constants */
72#define GRU_CBR_AU_SIZE 2
73#define GRU_CBR_AU (GRU_NUM_CBE / GRU_CBR_AU_SIZE)
74
75/* Convert resource counts to the number of AU */
76#define GRU_DS_BYTES_TO_AU(n) DIV_ROUND_UP(n, GRU_DSR_AU_BYTES)
77#define GRU_CB_COUNT_TO_AU(n) DIV_ROUND_UP(n, GRU_CBR_AU_SIZE)
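/*
 * Worked example (illustrative, not from the driver): reserving 3 CBRs
 * requires GRU_CB_COUNT_TO_AU(3) = DIV_ROUND_UP(3, 2) = 2 allocation units,
 * and reserving 1500 DSR bytes requires GRU_DS_BYTES_TO_AU(1500) =
 * DIV_ROUND_UP(1500, 1024) = 2 allocation units.
 */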
78
79/* UV limits */
80#define GRU_CHIPLETS_PER_HUB 2
81#define GRU_HUBS_PER_BLADE 1
82#define GRU_CHIPLETS_PER_BLADE (GRU_HUBS_PER_BLADE * GRU_CHIPLETS_PER_HUB)
83
84/* User GRU Gseg offsets */
85#define GRU_CB_BASE 0
86#define GRU_CB_LIMIT (GRU_CB_BASE + GRU_HANDLE_STRIDE * GRU_NUM_CBE)
87#define GRU_DS_BASE 0x20000
88#define GRU_DS_LIMIT (GRU_DS_BASE + GRU_NUM_DSR_BYTES)
89
90/* Convert a GRU physical address to the chiplet offset */
91#define GSEGPOFF(h) ((h) & (GRU_SIZE - 1))
92
93/* Convert an arbitrary handle address to the beginning of the GRU segment */
94#ifndef __PLUGIN__
95#define GRUBASE(h) ((void *)((unsigned long)(h) & ~(GRU_SIZE - 1)))
96#else
97extern void *gmu_grubase(void *h);
98#define GRUBASE(h) gmu_grubase(h)
99#endif
100
101/* General addressing macros. */
102static inline void *get_gseg_base_address(void *base, int ctxnum)
103{
104 return (void *)(base + GRU_GSEG0_BASE + GRU_GSEG_STRIDE * ctxnum);
105}
106
107static inline void *get_gseg_base_address_cb(void *base, int ctxnum, int line)
108{
109 return (void *)(get_gseg_base_address(base, ctxnum) +
110 GRU_CB_BASE + GRU_HANDLE_STRIDE * line);
111}
112
113static inline void *get_gseg_base_address_ds(void *base, int ctxnum, int line)
114{
115 return (void *)(get_gseg_base_address(base, ctxnum) + GRU_DS_BASE +
116 GRU_CACHE_LINE_BYTES * line);
117}
118
119static inline struct gru_tlb_fault_map *get_tfm(void *base, int ctxnum)
120{
121 return (struct gru_tlb_fault_map *)(base + GRU_TFM_BASE +
122 ctxnum * GRU_HANDLE_STRIDE);
123}
124
125static inline struct gru_tlb_global_handle *get_tgh(void *base, int ctxnum)
126{
127 return (struct gru_tlb_global_handle *)(base + GRU_TGH_BASE +
128 ctxnum * GRU_HANDLE_STRIDE);
129}
130
131static inline struct gru_control_block_extended *get_cbe(void *base, int ctxnum)
132{
133 return (struct gru_control_block_extended *)(base + GRU_CBE_BASE +
134 ctxnum * GRU_HANDLE_STRIDE);
135}
136
137static inline struct gru_tlb_fault_handle *get_tfh(void *base, int ctxnum)
138{
139 return (struct gru_tlb_fault_handle *)(base + GRU_TFH_BASE +
140 ctxnum * GRU_HANDLE_STRIDE);
141}
142
143static inline struct gru_context_configuration_handle *get_cch(void *base,
144 int ctxnum)
145{
146 return (struct gru_context_configuration_handle *)(base +
147 GRU_CCH_BASE + ctxnum * GRU_HANDLE_STRIDE);
148}
149
150static inline unsigned long get_cb_number(void *cb)
151{
152 return (((unsigned long)cb - GRU_CB_BASE) % GRU_GSEG_PAGESIZE) /
153 GRU_HANDLE_STRIDE;
154}
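/*
 * Illustrative use of the accessors above (grukservices.c contains the real
 * call site): given a control block address "cb" inside a GRU segment, its
 * extended handle is get_cbe(GRUBASE(cb), get_cb_number(cb)).
 */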
155
156/* byte offset to a specific GRU chiplet. (p=pnode, c=chiplet (0 or 1)) */
157static inline unsigned long gru_chiplet_paddr(unsigned long paddr, int pnode,
158 int chiplet)
159{
160 return paddr + GRU_SIZE * (2 * pnode + chiplet);
161}
162
163static inline void *gru_chiplet_vaddr(void *vaddr, int pnode, int chiplet)
164{
165 return vaddr + GRU_SIZE * (2 * pnode + chiplet);
166}
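/*
 * Worked example (illustrative): each pnode hosts two chiplets, so chiplet 1
 * of pnode 1 sits at an offset of GRU_SIZE * (2 * 1 + 1), i.e. 3 * 128MB,
 * from the GRU base address.
 */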
167
168
169
170/*
171 * Global TLB Fault Map
172 * Bitmap of outstanding TLB misses needing interrupt/polling service.
173 *
174 */
175struct gru_tlb_fault_map {
176 unsigned long fault_bits[BITS_TO_LONGS(GRU_NUM_CBE)];
177 unsigned long fill0[2];
178 unsigned long done_bits[BITS_TO_LONGS(GRU_NUM_CBE)];
179 unsigned long fill1[2];
180};
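/*
 * Minimal sketch (not part of this header) of how a fault map might be
 * scanned for a pending CB TLB miss; the real interrupt path lives in
 * grufault.c. Assumes <linux/bitops.h> is available for find_first_bit().
 */
static inline int tfm_first_pending_cb(struct gru_tlb_fault_map *tfm)
{
	/* returns GRU_NUM_CBE when no fault bit is set */
	return find_first_bit(tfm->fault_bits, GRU_NUM_CBE);
}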
181
182/*
183 * TGH - TLB Global Handle
184 * Used for TLB flushing.
185 *
186 */
187struct gru_tlb_global_handle {
188 unsigned int cmd:1; /* DW 0 */
189 unsigned int delresp:1;
190 unsigned int opc:1;
191 unsigned int fill1:5;
192
193 unsigned int fill2:8;
194
195 unsigned int status:2;
196 unsigned long fill3:2;
197 unsigned int state:3;
198 unsigned long fill4:1;
199
200 unsigned int cause:3;
201 unsigned long fill5:37;
202
203 unsigned long vaddr:64; /* DW 1 */
204
205 unsigned int asid:24; /* DW 2 */
206 unsigned int fill6:8;
207
208 unsigned int pagesize:5;
209 unsigned int fill7:11;
210
211 unsigned int global:1;
212 unsigned int fill8:15;
213
214 unsigned long vaddrmask:39; /* DW 3 */
215 unsigned int fill9:9;
216 unsigned int n:10;
217 unsigned int fill10:6;
218
219 unsigned int ctxbitmap:16; /* DW4 */
220 unsigned long fill11[3];
221};
222
223enum gru_tgh_cmd {
224 TGHCMD_START
225};
226
227enum gru_tgh_opc {
228 TGHOP_TLBNOP,
229 TGHOP_TLBINV
230};
231
232enum gru_tgh_status {
233 TGHSTATUS_IDLE,
234 TGHSTATUS_EXCEPTION,
235 TGHSTATUS_ACTIVE
236};
237
238enum gru_tgh_state {
239 TGHSTATE_IDLE,
240 TGHSTATE_PE_INVAL,
241 TGHSTATE_INTERRUPT_INVAL,
242 TGHSTATE_WAITDONE,
243 TGHSTATE_RESTART_CTX,
244};
245
246/*
247 * TFH - TLB Fault Handle
248 * Used for TLB dropins into the GRU TLB.
249 *
250 */
251struct gru_tlb_fault_handle {
252 unsigned int cmd:1; /* DW 0 - low 32*/
253 unsigned int delresp:1;
254 unsigned int fill0:2;
255 unsigned int opc:3;
256 unsigned int fill1:9;
257
258 unsigned int status:2;
259 unsigned int fill2:1;
260 unsigned int color:1;
261 unsigned int state:3;
262 unsigned int fill3:1;
263
264 unsigned int cause:7; /* DW 0 - high 32 */
265 unsigned int fill4:1;
266
267 unsigned int indexway:12;
268 unsigned int fill5:4;
269
270 unsigned int ctxnum:4;
271 unsigned int fill6:12;
272
273 unsigned long missvaddr:64; /* DW 1 */
274
275 unsigned int missasid:24; /* DW 2 */
276 unsigned int fill7:8;
277 unsigned int fillasid:24;
278 unsigned int dirty:1;
279 unsigned int gaa:2;
280 unsigned long fill8:5;
281
282 unsigned long pfn:41; /* DW 3 */
283 unsigned int fill9:7;
284 unsigned int pagesize:5;
285 unsigned int fill10:11;
286
287 unsigned long fillvaddr:64; /* DW 4 */
288
289 unsigned long fill11[3];
290};
291
292enum gru_tfh_opc {
293 TFHOP_NOOP,
294 TFHOP_RESTART,
295 TFHOP_WRITE_ONLY,
296 TFHOP_WRITE_RESTART,
297 TFHOP_EXCEPTION,
298 TFHOP_USER_POLLING_MODE = 7,
299};
300
301enum tfh_status {
302 TFHSTATUS_IDLE,
303 TFHSTATUS_EXCEPTION,
304 TFHSTATUS_ACTIVE,
305};
306
307enum tfh_state {
308 TFHSTATE_INACTIVE,
309 TFHSTATE_IDLE,
310 TFHSTATE_MISS_UPM,
311 TFHSTATE_MISS_FMM,
312 TFHSTATE_HW_ERR,
313 TFHSTATE_WRITE_TLB,
314 TFHSTATE_RESTART_CBR,
315};
316
317/* TFH cause bits */
318enum tfh_cause {
319 TFHCAUSE_NONE,
320 TFHCAUSE_TLB_MISS,
321 TFHCAUSE_TLB_MOD,
322 TFHCAUSE_HW_ERROR_RR,
323 TFHCAUSE_HW_ERROR_MAIN_ARRAY,
324 TFHCAUSE_HW_ERROR_VALID,
325 TFHCAUSE_HW_ERROR_PAGESIZE,
326 TFHCAUSE_INSTRUCTION_EXCEPTION,
327 TFHCAUSE_UNCORRECTIBLE_ERROR,
328};
329
330/* GAA values */
331#define GAA_RAM 0x0
332#define GAA_NCRAM 0x2
333#define GAA_MMIO 0x1
334#define GAA_REGISTER 0x3
335
336/* GRU paddr shift for pfn. (NOTE: shift is NOT by actual pagesize) */
337#define GRU_PADDR_SHIFT 12
338
339/*
340 * Context Configuration handle
341 * Used to allocate resources to a GSEG context.
342 *
343 */
344struct gru_context_configuration_handle {
345 unsigned int cmd:1; /* DW0 */
346 unsigned int delresp:1;
347 unsigned int opc:3;
348 unsigned int unmap_enable:1;
349 unsigned int req_slice_set_enable:1;
350 unsigned int req_slice:2;
351 unsigned int cb_int_enable:1;
352 unsigned int tlb_int_enable:1;
353 unsigned int tfm_fault_bit_enable:1;
354 unsigned int tlb_int_select:4;
355
356 unsigned int status:2;
357 unsigned int state:2;
358 unsigned int reserved2:4;
359
360 unsigned int cause:4;
361 unsigned int tfm_done_bit_enable:1;
362 unsigned int unused:3;
363
364 unsigned int dsr_allocation_map;
365
366 unsigned long cbr_allocation_map; /* DW1 */
367
368 unsigned int asid[8]; /* DW 2 - 5 */
369 unsigned short sizeavail[8]; /* DW 6 - 7 */
370} __attribute__ ((packed));
371
372enum gru_cch_opc {
373 CCHOP_START = 1,
374 CCHOP_ALLOCATE,
375 CCHOP_INTERRUPT,
376 CCHOP_DEALLOCATE,
377 CCHOP_INTERRUPT_SYNC,
378};
379
380enum gru_cch_status {
381 CCHSTATUS_IDLE,
382 CCHSTATUS_EXCEPTION,
383 CCHSTATUS_ACTIVE,
384};
385
386enum gru_cch_state {
387 CCHSTATE_INACTIVE,
388 CCHSTATE_MAPPED,
389 CCHSTATE_ACTIVE,
390 CCHSTATE_INTERRUPTED,
391};
392
393/* CCH Exception cause */
394enum gru_cch_cause {
395 CCHCAUSE_REGION_REGISTER_WRITE_ERROR = 1,
396 CCHCAUSE_ILLEGAL_OPCODE = 2,
397 CCHCAUSE_INVALID_START_REQUEST = 3,
398 CCHCAUSE_INVALID_ALLOCATION_REQUEST = 4,
399 CCHCAUSE_INVALID_DEALLOCATION_REQUEST = 5,
400 CCHCAUSE_INVALID_INTERRUPT_REQUEST = 6,
401 CCHCAUSE_CCH_BUSY = 7,
402 CCHCAUSE_NO_CBRS_TO_ALLOCATE = 8,
403 CCHCAUSE_BAD_TFM_CONFIG = 9,
404 CCHCAUSE_CBR_RESOURCES_OVERSUBSCRIPED = 10,
405 CCHCAUSE_DSR_RESOURCES_OVERSUBSCRIPED = 11,
406 CCHCAUSE_CBR_DEALLOCATION_ERROR = 12,
407};
408/*
409 * CBE - Control Block Extended
410 * Maintains internal GRU state for active CBs.
411 *
412 */
413struct gru_control_block_extended {
414 unsigned int reserved0:1; /* DW 0 - low */
415 unsigned int imacpy:3;
416 unsigned int reserved1:4;
417 unsigned int xtypecpy:3;
418 unsigned int iaa0cpy:2;
419 unsigned int iaa1cpy:2;
420 unsigned int reserved2:1;
421 unsigned int opccpy:8;
422 unsigned int exopccpy:8;
423
424 unsigned int idef2cpy:22; /* DW 0 - high */
425 unsigned int reserved3:10;
426
427 unsigned int idef4cpy:22; /* DW 1 */
428 unsigned int reserved4:10;
429 unsigned int idef4upd:22;
430 unsigned int reserved5:10;
431
432 unsigned long idef1upd:64; /* DW 2 */
433
434 unsigned long idef5cpy:64; /* DW 3 */
435
436 unsigned long idef6cpy:64; /* DW 4 */
437
438 unsigned long idef3upd:64; /* DW 5 */
439
440 unsigned long idef5upd:64; /* DW 6 */
441
442 unsigned int idef2upd:22; /* DW 7 */
443 unsigned int reserved6:10;
444
445 unsigned int ecause:20;
446 unsigned int cbrstate:4;
447 unsigned int cbrexecstatus:8;
448};
449
450enum gru_cbr_state {
451 CBRSTATE_INACTIVE,
452 CBRSTATE_IDLE,
453 CBRSTATE_PE_CHECK,
454 CBRSTATE_QUEUED,
455 CBRSTATE_WAIT_RESPONSE,
456 CBRSTATE_INTERRUPTED,
457 CBRSTATE_INTERRUPTED_MISS_FMM,
458 CBRSTATE_BUSY_INTERRUPT_MISS_FMM,
459 CBRSTATE_INTERRUPTED_MISS_UPM,
460 CBRSTATE_BUSY_INTERRUPTED_MISS_UPM,
461 CBRSTATE_REQUEST_ISSUE,
462 CBRSTATE_BUSY_INTERRUPT,
463};
464
465/* CBE cbrexecstatus bits */
466#define CBR_EXS_ABORT_OCC_BIT 0
467#define CBR_EXS_INT_OCC_BIT 1
468#define CBR_EXS_PENDING_BIT 2
469#define CBR_EXS_QUEUED_BIT 3
470#define CBR_EXS_TLBHW_BIT 4
471#define CBR_EXS_EXCEPTION_BIT 5
472
473#define CBR_EXS_ABORT_OCC (1 << CBR_EXS_ABORT_OCC_BIT)
474#define CBR_EXS_INT_OCC (1 << CBR_EXS_INT_OCC_BIT)
475#define CBR_EXS_PENDING (1 << CBR_EXS_PENDING_BIT)
476#define CBR_EXS_QUEUED (1 << CBR_EXS_QUEUED_BIT)
477#define CBR_EXS_TLBHW (1 << CBR_EXS_TLBHW_BIT)
478#define CBR_EXS_EXCEPTION (1 << CBR_EXS_EXCEPTION_BIT)
479
480/* CBE ecause bits - defined in gru_instructions.h */
481
482/*
483 * Convert a processor pagesize into the strange encoded pagesize used by the
484 * GRU. Processor pagesize is encoded as log of bytes per page. (or PAGE_SHIFT)
485 * pagesize log pagesize grupagesize
486 * 4k 12 0
487 * 16k 14 1
488 * 64k 16 2
489 * 256k 18 3
490 * 1m 20 4
491 * 2m 21 5
492 * 4m 22 6
493 * 16m 24 7
494 * 64m 26 8
495 * ...
496 */
497#define GRU_PAGESIZE(sh) ((((sh) > 20 ? (sh) + 2: (sh)) >> 1) - 6)
498#define GRU_SIZEAVAIL(sh) (1UL << GRU_PAGESIZE(sh))
499
500/* minimum TLB purge count to ensure a full purge */
501#define GRUMAXINVAL 1024UL
502
503
504/* Extract the status field from a kernel handle */
505#define GET_MSEG_HANDLE_STATUS(h) (((*(unsigned long *)(h)) >> 16) & 3)
506
507static inline void start_instruction(void *h)
508{
509 unsigned long *w0 = h;
510
511 wmb(); /* setting CMD bit must be last */
512 *w0 = *w0 | 1;
513 gru_flush_cache(h);
514}
515
516static inline int wait_instruction_complete(void *h)
517{
518 int status;
519
520 do {
521 cpu_relax();
522 barrier();
523 status = GET_MSEG_HANDLE_STATUS(h);
524 } while (status == CCHSTATUS_ACTIVE);
525 return status;
526}
527
528#if defined CONFIG_IA64
529static inline void cch_allocate_set_asids(
530 struct gru_context_configuration_handle *cch, int asidval)
531{
532 int i;
533
534 for (i = 0; i <= RGN_HPAGE; i++) { /* assume HPAGE is last region */
535 cch->asid[i] = (asidval++);
536#if 0
537 /* ZZZ hugepages not supported yet */
538 if (i == RGN_HPAGE)
539 cch->sizeavail[i] = GRU_SIZEAVAIL(hpage_shift);
540 else
541#endif
542 cch->sizeavail[i] = GRU_SIZEAVAIL(PAGE_SHIFT);
543 }
544}
545#elif defined CONFIG_X86_64
546static inline void cch_allocate_set_asids(
547 struct gru_context_configuration_handle *cch, int asidval)
548{
549 int i;
550
551 for (i = 0; i < 8; i++) {
552 cch->asid[i] = asidval++;
553 cch->sizeavail[i] = GRU_SIZEAVAIL(PAGE_SHIFT) |
554 GRU_SIZEAVAIL(21);
555 }
556}
557#endif
558
559static inline int cch_allocate(struct gru_context_configuration_handle *cch,
560 int asidval, unsigned long cbrmap,
561 unsigned long dsrmap)
562{
563 cch_allocate_set_asids(cch, asidval);
564 cch->dsr_allocation_map = dsrmap;
565 cch->cbr_allocation_map = cbrmap;
566 cch->opc = CCHOP_ALLOCATE;
567 start_instruction(cch);
568 return wait_instruction_complete(cch);
569}
570
571static inline int cch_start(struct gru_context_configuration_handle *cch)
572{
573 cch->opc = CCHOP_START;
574 start_instruction(cch);
575 return wait_instruction_complete(cch);
576}
577
578static inline int cch_interrupt(struct gru_context_configuration_handle *cch)
579{
580 cch->opc = CCHOP_INTERRUPT;
581 start_instruction(cch);
582 return wait_instruction_complete(cch);
583}
584
585static inline int cch_deallocate(struct gru_context_configuration_handle *cch)
586{
587 cch->opc = CCHOP_DEALLOCATE;
588 start_instruction(cch);
589 return wait_instruction_complete(cch);
590}
591
592static inline int cch_interrupt_sync(struct gru_context_configuration_handle
593 *cch)
594{
595 cch->opc = CCHOP_INTERRUPT_SYNC;
596 start_instruction(cch);
597 return wait_instruction_complete(cch);
598}
599
600static inline int tgh_invalidate(struct gru_tlb_global_handle *tgh,
601 unsigned long vaddr, unsigned long vaddrmask,
602 int asid, int pagesize, int global, int n,
603 unsigned short ctxbitmap)
604{
605 tgh->vaddr = vaddr;
606 tgh->asid = asid;
607 tgh->pagesize = pagesize;
608 tgh->n = n;
609 tgh->global = global;
610 tgh->vaddrmask = vaddrmask;
611 tgh->ctxbitmap = ctxbitmap;
612 tgh->opc = TGHOP_TLBINV;
613 start_instruction(tgh);
614 return wait_instruction_complete(tgh);
615}
616
617static inline void tfh_write_only(struct gru_tlb_fault_handle *tfh,
618 unsigned long pfn, unsigned long vaddr,
619 int asid, int dirty, int pagesize)
620{
621 tfh->fillasid = asid;
622 tfh->fillvaddr = vaddr;
623 tfh->pfn = pfn;
624 tfh->dirty = dirty;
625 tfh->pagesize = pagesize;
626 tfh->opc = TFHOP_WRITE_ONLY;
627 start_instruction(tfh);
628}
629
630static inline void tfh_write_restart(struct gru_tlb_fault_handle *tfh,
631 unsigned long paddr, int gaa,
632 unsigned long vaddr, int asid, int dirty,
633 int pagesize)
634{
635 tfh->fillasid = asid;
636 tfh->fillvaddr = vaddr;
637 tfh->pfn = paddr >> GRU_PADDR_SHIFT;
638 tfh->gaa = gaa;
639 tfh->dirty = dirty;
640 tfh->pagesize = pagesize;
641 tfh->opc = TFHOP_WRITE_RESTART;
642 start_instruction(tfh);
643}
644
645static inline void tfh_restart(struct gru_tlb_fault_handle *tfh)
646{
647 tfh->opc = TFHOP_RESTART;
648 start_instruction(tfh);
649}
650
651static inline void tfh_user_polling_mode(struct gru_tlb_fault_handle *tfh)
652{
653 tfh->opc = TFHOP_USER_POLLING_MODE;
654 start_instruction(tfh);
655}
656
657static inline void tfh_exception(struct gru_tlb_fault_handle *tfh)
658{
659 tfh->opc = TFHOP_EXCEPTION;
660 start_instruction(tfh);
661}
662
663#endif /* __GRUHANDLES_H__ */
diff --git a/drivers/misc/sgi-gru/grukservices.c b/drivers/misc/sgi-gru/grukservices.c
new file mode 100644
index 000000000000..dfd49af0fe18
--- /dev/null
+++ b/drivers/misc/sgi-gru/grukservices.c
@@ -0,0 +1,679 @@
1/*
2 * SN Platform GRU Driver
3 *
4 * KERNEL SERVICES THAT USE THE GRU
5 *
6 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#include <linux/kernel.h>
24#include <linux/errno.h>
25#include <linux/slab.h>
26#include <linux/mm.h>
27#include <linux/smp_lock.h>
28#include <linux/spinlock.h>
29#include <linux/device.h>
30#include <linux/miscdevice.h>
31#include <linux/proc_fs.h>
32#include <linux/interrupt.h>
33#include <linux/uaccess.h>
34#include "gru.h"
35#include "grulib.h"
36#include "grutables.h"
37#include "grukservices.h"
38#include "gru_instructions.h"
39#include <asm/uv/uv_hub.h>
40
41/*
42 * Kernel GRU Usage
43 *
44 * The following is an interim algorithm for management of kernel GRU
45 * resources. This will likely be replaced when we better understand the
46 * kernel/user requirements.
47 *
48 * At boot time, the kernel permanently reserves a fixed number of
49 * CBRs/DSRs for each cpu to use. The resources are all taken from
50 * the GRU chiplet 1 on the blade. This leaves the full set of resources
51 * of chiplet 0 available to be allocated to a single user.
52 */
53
54/* Blade percpu resources PERMANENTLY reserved for kernel use */
55#define GRU_NUM_KERNEL_CBR 1
56#define GRU_NUM_KERNEL_DSR_BYTES 256
57#define KERNEL_CTXNUM 15
58
59/* GRU instruction attributes for all instructions */
60#define IMA IMA_CB_DELAY
61
62/* GRU cacheline size is always 64 bytes - even on arches with 128 byte lines */
63#define __gru_cacheline_aligned__ \
64 __attribute__((__aligned__(GRU_CACHE_LINE_BYTES)))
65
66#define MAGIC 0x1234567887654321UL
67
68/* Default retry count for GRU errors on kernel instructions */
69#define EXCEPTION_RETRY_LIMIT 3
70
71/* Status of message queue sections */
72#define MQS_EMPTY 0
73#define MQS_FULL 1
74#define MQS_NOOP 2
75
76/*----------------- RESOURCE MANAGEMENT -------------------------------------*/
77/* optimized for x86_64 */
78struct message_queue {
79 union gru_mesqhead head __gru_cacheline_aligned__; /* CL 0 */
80 int qlines; /* DW 1 */
81 long hstatus[2];
82 void *next __gru_cacheline_aligned__;/* CL 1 */
83 void *limit;
84 void *start;
85 void *start2;
86 char data ____cacheline_aligned; /* CL 2 */
87};
88
89/* First word in every message - used by mesq interface */
90struct message_header {
91 char present;
92 char present2;
93 char lines;
94 char fill;
95};
96
97#define QLINES(mq) ((mq) + offsetof(struct message_queue, qlines))
98#define HSTATUS(mq, h) ((mq) + offsetof(struct message_queue, hstatus[h]))
99
100static int gru_get_cpu_resources(int dsr_bytes, void **cb, void **dsr)
101{
102 struct gru_blade_state *bs;
103 int lcpu;
104
105 BUG_ON(dsr_bytes > GRU_NUM_KERNEL_DSR_BYTES);
106 preempt_disable();
107 bs = gru_base[uv_numa_blade_id()];
108 lcpu = uv_blade_processor_id();
109 *cb = bs->kernel_cb + lcpu * GRU_HANDLE_STRIDE;
110 *dsr = bs->kernel_dsr + lcpu * GRU_NUM_KERNEL_DSR_BYTES;
111 return 0;
112}
113
114static void gru_free_cpu_resources(void *cb, void *dsr)
115{
116 preempt_enable();
117}
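/*
 * Minimal sketch (not driver code) of the reserve / issue / wait / release
 * pattern that the kernel services below follow. The 64-byte DSR request
 * size and the function name are assumptions used only for illustration.
 */
static int __maybe_unused gru_kservices_pattern_example(void)
{
	void *cb, *dsr;
	int ret;

	if (gru_get_cpu_resources(64, &cb, &dsr))
		return MQE_BUG_NO_RESOURCES;
	/* ... issue a GRU instruction on cb, using dsr as scratch space ... */
	ret = gru_wait(cb);
	gru_free_cpu_resources(cb, dsr);
	return ret;
}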
118
119int gru_get_cb_exception_detail(void *cb,
120 struct control_block_extended_exc_detail *excdet)
121{
122 struct gru_control_block_extended *cbe;
123
124 cbe = get_cbe(GRUBASE(cb), get_cb_number(cb));
125 excdet->opc = cbe->opccpy;
126 excdet->exopc = cbe->exopccpy;
127 excdet->ecause = cbe->ecause;
128 excdet->exceptdet0 = cbe->idef1upd;
129 excdet->exceptdet1 = cbe->idef3upd;
130 return 0;
131}
132
133char *gru_get_cb_exception_detail_str(int ret, void *cb,
134 char *buf, int size)
135{
136 struct gru_control_block_status *gen = (void *)cb;
137 struct control_block_extended_exc_detail excdet;
138
139 if (ret > 0 && gen->istatus == CBS_EXCEPTION) {
140 gru_get_cb_exception_detail(cb, &excdet);
141 snprintf(buf, size,
142 "GRU exception: cb %p, opc %d, exopc %d, ecause 0x%x,"
143 "excdet0 0x%lx, excdet1 0x%x",
144 gen, excdet.opc, excdet.exopc, excdet.ecause,
145 excdet.exceptdet0, excdet.exceptdet1);
146 } else {
147 snprintf(buf, size, "No exception");
148 }
149 return buf;
150}
151
152static int gru_wait_idle_or_exception(struct gru_control_block_status *gen)
153{
154 while (gen->istatus >= CBS_ACTIVE) {
155 cpu_relax();
156 barrier();
157 }
158 return gen->istatus;
159}
160
161static int gru_retry_exception(void *cb)
162{
163 struct gru_control_block_status *gen = (void *)cb;
164 struct control_block_extended_exc_detail excdet;
165 int retry = EXCEPTION_RETRY_LIMIT;
166
167 while (1) {
168 if (gru_get_cb_message_queue_substatus(cb))
169 break;
170 if (gru_wait_idle_or_exception(gen) == CBS_IDLE)
171 return CBS_IDLE;
172
173 gru_get_cb_exception_detail(cb, &excdet);
174 if (excdet.ecause & ~EXCEPTION_RETRY_BITS)
175 break;
176 if (retry-- == 0)
177 break;
178 gen->icmd = 1;
179 gru_flush_cache(gen);
180 }
181 return CBS_EXCEPTION;
182}
183
184int gru_check_status_proc(void *cb)
185{
186 struct gru_control_block_status *gen = (void *)cb;
187 int ret;
188
189 ret = gen->istatus;
190 if (ret != CBS_EXCEPTION)
191 return ret;
192 return gru_retry_exception(cb);
193
194}
195
196int gru_wait_proc(void *cb)
197{
198 struct gru_control_block_status *gen = (void *)cb;
199 int ret;
200
201 ret = gru_wait_idle_or_exception(gen);
202 if (ret == CBS_EXCEPTION)
203 ret = gru_retry_exception(cb);
204
205 return ret;
206}
207
208void gru_abort(int ret, void *cb, char *str)
209{
210 char buf[GRU_EXC_STR_SIZE];
211
212 panic("GRU FATAL ERROR: %s - %s\n", str,
213 gru_get_cb_exception_detail_str(ret, cb, buf, sizeof(buf)));
214}
215
216void gru_wait_abort_proc(void *cb)
217{
218 int ret;
219
220 ret = gru_wait_proc(cb);
221 if (ret)
222 gru_abort(ret, cb, "gru_wait_abort");
223}
224
225
226/*------------------------------ MESSAGE QUEUES -----------------------------*/
227
228/* Internal status. These are NOT returned to the user. */
229#define MQIE_AGAIN -1 /* try again */
230
231
232/*
233 * Save/restore the "present" flag that is in the second line of 2-line
234 * messages
235 */
236static inline int get_present2(void *p)
237{
238 struct message_header *mhdr = p + GRU_CACHE_LINE_BYTES;
239 return mhdr->present;
240}
241
242static inline void restore_present2(void *p, int val)
243{
244 struct message_header *mhdr = p + GRU_CACHE_LINE_BYTES;
245 mhdr->present = val;
246}
247
248/*
249 * Create a message queue.
250 * bytes - message queue size in bytes. Includes the 2-line header.
251 */
252int gru_create_message_queue(void *p, unsigned int bytes)
253{
254 struct message_queue *mq = p;
255 unsigned int qlines;
256
257 qlines = bytes / GRU_CACHE_LINE_BYTES - 2;
258 memset(mq, 0, bytes);
259 mq->start = &mq->data;
260 mq->start2 = &mq->data + (qlines / 2 - 1) * GRU_CACHE_LINE_BYTES;
261 mq->next = &mq->data;
262 mq->limit = &mq->data + (qlines - 2) * GRU_CACHE_LINE_BYTES;
263 mq->qlines = qlines;
264 mq->hstatus[0] = 0;
265 mq->hstatus[1] = 1;
266 mq->head = gru_mesq_head(2, qlines / 2 + 1);
267 return 0;
268}
269EXPORT_SYMBOL_GPL(gru_create_message_queue);
270
271/*
272 * Send a NOOP message to a message queue
273 * Returns:
274 * 0 - if queue is full after the send. This is the normal case
275 * but various races can change this.
276 * -1 - if mesq sent successfully but queue not full
277 * >0 - unexpected error. MQE_xxx returned
278 */
279static int send_noop_message(void *cb,
280 unsigned long mq, void *mesg)
281{
282 const struct message_header noop_header = {
283 .present = MQS_NOOP, .lines = 1};
284 unsigned long m;
285 int substatus, ret;
286 struct message_header save_mhdr, *mhdr = mesg;
287
288 STAT(mesq_noop);
289 save_mhdr = *mhdr;
290 *mhdr = noop_header;
291 gru_mesq(cb, mq, gru_get_tri(mhdr), 1, IMA);
292 ret = gru_wait(cb);
293
294 if (ret) {
295 substatus = gru_get_cb_message_queue_substatus(cb);
296 switch (substatus) {
297 case CBSS_NO_ERROR:
298 STAT(mesq_noop_unexpected_error);
299 ret = MQE_UNEXPECTED_CB_ERR;
300 break;
301 case CBSS_LB_OVERFLOWED:
302 STAT(mesq_noop_lb_overflow);
303 ret = MQE_CONGESTION;
304 break;
305 case CBSS_QLIMIT_REACHED:
306 STAT(mesq_noop_qlimit_reached);
307 ret = 0;
308 break;
309 case CBSS_AMO_NACKED:
310 STAT(mesq_noop_amo_nacked);
311 ret = MQE_CONGESTION;
312 break;
313 case CBSS_PUT_NACKED:
314 STAT(mesq_noop_put_nacked);
315 m = mq + (gru_get_amo_value_head(cb) << 6);
316 gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, 1, 1,
317 IMA);
318 if (gru_wait(cb) == CBS_IDLE)
319 ret = MQIE_AGAIN;
320 else
321 ret = MQE_UNEXPECTED_CB_ERR;
322 break;
323 case CBSS_PAGE_OVERFLOW:
324 default:
325 BUG();
326 }
327 }
328 *mhdr = save_mhdr;
329 return ret;
330}
331
332/*
333 * Handle a gru_mesq full.
334 */
335static int send_message_queue_full(void *cb,
336 unsigned long mq, void *mesg, int lines)
337{
338 union gru_mesqhead mqh;
339 unsigned int limit, head;
340 unsigned long avalue;
341 int half, qlines, save;
342
343 /* Determine if switching to first/second half of q */
344 avalue = gru_get_amo_value(cb);
345 head = gru_get_amo_value_head(cb);
346 limit = gru_get_amo_value_limit(cb);
347
348 /*
349 * Fetch "qlines" from the queue header. Since the queue may be
350 * in memory that can't be accessed using socket addresses, use
351 * the GRU to access the data. Use DSR space from the message.
352 */
353 save = *(int *)mesg;
354 gru_vload(cb, QLINES(mq), gru_get_tri(mesg), XTYPE_W, 1, 1, IMA);
355 if (gru_wait(cb) != CBS_IDLE)
356 goto cberr;
357 qlines = *(int *)mesg;
358 *(int *)mesg = save;
359 half = (limit != qlines);
360
361 if (half)
362 mqh = gru_mesq_head(qlines / 2 + 1, qlines);
363 else
364 mqh = gru_mesq_head(2, qlines / 2 + 1);
365
366 /* Try to get lock for switching head pointer */
367 gru_gamir(cb, EOP_IR_CLR, HSTATUS(mq, half), XTYPE_DW, IMA);
368 if (gru_wait(cb) != CBS_IDLE)
369 goto cberr;
370 if (!gru_get_amo_value(cb)) {
371 STAT(mesq_qf_locked);
372 return MQE_QUEUE_FULL;
373 }
374
375 /* Got the lock. Send optional NOP if queue not full. */
376 if (head != limit) {
377 if (send_noop_message(cb, mq, mesg)) {
378 gru_gamir(cb, EOP_IR_INC, HSTATUS(mq, half),
379 XTYPE_DW, IMA);
380 if (gru_wait(cb) != CBS_IDLE)
381 goto cberr;
382 STAT(mesq_qf_noop_not_full);
383 return MQIE_AGAIN;
384 }
385 avalue++;
386 }
387
388 /* Then flip queuehead to other half of queue. */
389 gru_gamer(cb, EOP_ERR_CSWAP, mq, XTYPE_DW, mqh.val, avalue, IMA);
390 if (gru_wait(cb) != CBS_IDLE)
391 goto cberr;
392
393 /* If not successful in swapping the queue head, clear the hstatus lock */
394 if (gru_get_amo_value(cb) != avalue) {
395 STAT(mesq_qf_switch_head_failed);
396 gru_gamir(cb, EOP_IR_INC, HSTATUS(mq, half), XTYPE_DW, IMA);
397 if (gru_wait(cb) != CBS_IDLE)
398 goto cberr;
399 }
400 return MQIE_AGAIN;
401cberr:
402 STAT(mesq_qf_unexpected_error);
403 return MQE_UNEXPECTED_CB_ERR;
404}
405
406
407/*
408 * Handle a gru_mesq failure. Some of these failures are software recoverable
409 * or retryable.
410 */
411static int send_message_failure(void *cb,
412 unsigned long mq,
413 void *mesg,
414 int lines)
415{
416 int substatus, ret = 0;
417 unsigned long m;
418
419 substatus = gru_get_cb_message_queue_substatus(cb);
420 switch (substatus) {
421 case CBSS_NO_ERROR:
422 STAT(mesq_send_unexpected_error);
423 ret = MQE_UNEXPECTED_CB_ERR;
424 break;
425 case CBSS_LB_OVERFLOWED:
426 STAT(mesq_send_lb_overflow);
427 ret = MQE_CONGESTION;
428 break;
429 case CBSS_QLIMIT_REACHED:
430 STAT(mesq_send_qlimit_reached);
431 ret = send_message_queue_full(cb, mq, mesg, lines);
432 break;
433 case CBSS_AMO_NACKED:
434 STAT(mesq_send_amo_nacked);
435 ret = MQE_CONGESTION;
436 break;
437 case CBSS_PUT_NACKED:
438 STAT(mesq_send_put_nacked);
439 m = mq + (gru_get_amo_value_head(cb) << 6);
440 gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, lines, 1, IMA);
441 if (gru_wait(cb) == CBS_IDLE)
442 ret = MQE_OK;
443 else
444 ret = MQE_UNEXPECTED_CB_ERR;
445 break;
446 default:
447 BUG();
448 }
449 return ret;
450}
451
452/*
453 * Send a message to a message queue
454 * cb GRU control block to use to send message
455 * mq message queue
456 * mesg message. Must be 64-bit aligned
457 * bytes message size (<= 2 CL)
458 */
459int gru_send_message_gpa(unsigned long mq, void *mesg, unsigned int bytes)
460{
461 struct message_header *mhdr;
462 void *cb;
463 void *dsr;
464 int istatus, clines, ret;
465
466 STAT(mesq_send);
467 BUG_ON(bytes < sizeof(int) || bytes > 2 * GRU_CACHE_LINE_BYTES);
468
469 clines = (bytes + GRU_CACHE_LINE_BYTES - 1) / GRU_CACHE_LINE_BYTES;
470 if (gru_get_cpu_resources(bytes, &cb, &dsr))
471 return MQE_BUG_NO_RESOURCES;
472 memcpy(dsr, mesg, bytes);
473 mhdr = dsr;
474 mhdr->present = MQS_FULL;
475 mhdr->lines = clines;
476 if (clines == 2) {
477 mhdr->present2 = get_present2(mhdr);
478 restore_present2(mhdr, MQS_FULL);
479 }
480
481 do {
482 ret = MQE_OK;
483 gru_mesq(cb, mq, gru_get_tri(mhdr), clines, IMA);
484 istatus = gru_wait(cb);
485 if (istatus != CBS_IDLE)
486 ret = send_message_failure(cb, mq, dsr, clines);
487 } while (ret == MQIE_AGAIN);
488 gru_free_cpu_resources(cb, dsr);
489
490 if (ret)
491 STAT(mesq_send_failed);
492 return ret;
493}
494EXPORT_SYMBOL_GPL(gru_send_message_gpa);
495
496/*
497 * Advance the receive pointer for the queue to the next message.
498 */
499void gru_free_message(void *rmq, void *mesg)
500{
501 struct message_queue *mq = rmq;
502 struct message_header *mhdr = mq->next;
503 void *next, *pnext;
504 int half = -1;
505 int lines = mhdr->lines;
506
507 if (lines == 2)
508 restore_present2(mhdr, MQS_EMPTY);
509 mhdr->present = MQS_EMPTY;
510
511 pnext = mq->next;
512 next = pnext + GRU_CACHE_LINE_BYTES * lines;
513 if (next == mq->limit) {
514 next = mq->start;
515 half = 1;
516 } else if (pnext < mq->start2 && next >= mq->start2) {
517 half = 0;
518 }
519
520 if (half >= 0)
521 mq->hstatus[half] = 1;
522 mq->next = next;
523}
524EXPORT_SYMBOL_GPL(gru_free_message);
525
526/*
527 * Get next message from message queue. Return NULL if no message
528 * present. User must call gru_free_message() to move to next message.
529 * rmq message queue
530 */
531void *gru_get_next_message(void *rmq)
532{
533 struct message_queue *mq = rmq;
534 struct message_header *mhdr = mq->next;
535 int present = mhdr->present;
536
537 /* skip NOOP messages */
538 STAT(mesq_receive);
539 while (present == MQS_NOOP) {
540 gru_free_message(rmq, mhdr);
541 mhdr = mq->next;
542 present = mhdr->present;
543 }
544
545 /* Wait for both halves of 2 line messages */
546 if (present == MQS_FULL && mhdr->lines == 2 &&
547 get_present2(mhdr) == MQS_EMPTY)
548 present = MQS_EMPTY;
549
550 if (!present) {
551 STAT(mesq_receive_none);
552 return NULL;
553 }
554
555 if (mhdr->lines == 2)
556 restore_present2(mhdr, mhdr->present2);
557
558 return mhdr;
559}
560EXPORT_SYMBOL_GPL(gru_get_next_message);
561
562/* ---------------------- GRU DATA COPY FUNCTIONS ---------------------------*/
563
564/*
565 * Copy a block of data using the GRU resources
566 */
567int gru_copy_gpa(unsigned long dest_gpa, unsigned long src_gpa,
568 unsigned int bytes)
569{
570 void *cb;
571 void *dsr;
572 int ret;
573
574 STAT(copy_gpa);
575 if (gru_get_cpu_resources(GRU_NUM_KERNEL_DSR_BYTES, &cb, &dsr))
576 return MQE_BUG_NO_RESOURCES;
577 gru_bcopy(cb, src_gpa, dest_gpa, gru_get_tri(dsr),
578 XTYPE_B, bytes, GRU_NUM_KERNEL_DSR_BYTES, IMA);
579 ret = gru_wait(cb);
580 gru_free_cpu_resources(cb, dsr);
581 return ret;
582}
583EXPORT_SYMBOL_GPL(gru_copy_gpa);
584
585/* ------------------- KERNEL QUICKTESTS RUN AT STARTUP ----------------*/
586/* Temp - will delete after we gain confidence in the GRU */
587static __cacheline_aligned unsigned long word0;
588static __cacheline_aligned unsigned long word1;
589
590static int quicktest(struct gru_state *gru)
591{
592 void *cb;
593 void *ds;
594 unsigned long *p;
595
596 cb = get_gseg_base_address_cb(gru->gs_gru_base_vaddr, KERNEL_CTXNUM, 0);
597 ds = get_gseg_base_address_ds(gru->gs_gru_base_vaddr, KERNEL_CTXNUM, 0);
598 p = ds;
599 word0 = MAGIC;
600
601 gru_vload(cb, uv_gpa(&word0), 0, XTYPE_DW, 1, 1, IMA);
602 if (gru_wait(cb) != CBS_IDLE)
603 BUG();
604
605 if (*(unsigned long *)ds != MAGIC)
606 BUG();
607 gru_vstore(cb, uv_gpa(&word1), 0, XTYPE_DW, 1, 1, IMA);
608 if (gru_wait(cb) != CBS_IDLE)
609 BUG();
610
611 if (word0 != word1 || word0 != MAGIC) {
612 printk
613 ("GRU quicktest err: gru %d, found 0x%lx, expected 0x%lx\n",
614 gru->gs_gid, word1, MAGIC);
615 BUG(); /* ZZZ should not be fatal */
616 }
617
618 return 0;
619}
620
621
622int gru_kservices_init(struct gru_state *gru)
623{
624 struct gru_blade_state *bs;
625 struct gru_context_configuration_handle *cch;
626 unsigned long cbr_map, dsr_map;
627 int err, num, cpus_possible;
628
629 /*
630 * Currently, resources are reserved ONLY on the second chiplet
631 * on each blade. This leaves ALL resources on chiplet 0 available
632 * for user code.
633 */
634 bs = gru->gs_blade;
635 if (gru != &bs->bs_grus[1])
636 return 0;
637
638 cpus_possible = uv_blade_nr_possible_cpus(gru->gs_blade_id);
639
640 num = GRU_NUM_KERNEL_CBR * cpus_possible;
641 cbr_map = gru_reserve_cb_resources(gru, GRU_CB_COUNT_TO_AU(num), NULL);
642 gru->gs_reserved_cbrs += num;
643
644 num = GRU_NUM_KERNEL_DSR_BYTES * cpus_possible;
645 dsr_map = gru_reserve_ds_resources(gru, GRU_DS_BYTES_TO_AU(num), NULL);
646 gru->gs_reserved_dsr_bytes += num;
647
648 gru->gs_active_contexts++;
649 __set_bit(KERNEL_CTXNUM, &gru->gs_context_map);
650 cch = get_cch(gru->gs_gru_base_vaddr, KERNEL_CTXNUM);
651
652 bs->kernel_cb = get_gseg_base_address_cb(gru->gs_gru_base_vaddr,
653 KERNEL_CTXNUM, 0);
654 bs->kernel_dsr = get_gseg_base_address_ds(gru->gs_gru_base_vaddr,
655 KERNEL_CTXNUM, 0);
656
657 lock_cch_handle(cch);
658 cch->tfm_fault_bit_enable = 0;
659 cch->tlb_int_enable = 0;
660 cch->tfm_done_bit_enable = 0;
661 cch->unmap_enable = 1;
662 err = cch_allocate(cch, 0, cbr_map, dsr_map);
663 if (err) {
664 gru_dbg(grudev,
665 "Unable to allocate kernel CCH: gru %d, err %d\n",
666 gru->gs_gid, err);
667 BUG();
668 }
669 if (cch_start(cch)) {
670 gru_dbg(grudev, "Unable to start kernel CCH: gru %d, err %d\n",
671 gru->gs_gid, err);
672 BUG();
673 }
674 unlock_cch_handle(cch);
675
676 if (gru_options & GRU_QUICKLOOK)
677 quicktest(gru);
678 return 0;
679}
diff --git a/drivers/misc/sgi-gru/grukservices.h b/drivers/misc/sgi-gru/grukservices.h
new file mode 100644
index 000000000000..eb17e0a3ac61
--- /dev/null
+++ b/drivers/misc/sgi-gru/grukservices.h
@@ -0,0 +1,134 @@
1
2/*
3 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19#ifndef __GRU_KSERVICES_H_
20#define __GRU_KSERVICES_H_
21
22
23/*
24 * Message queues using the GRU to send/receive messages.
25 *
26 * These functions allow the user to create a message queue for
27 * sending/receiving 1 or 2 cacheline messages using the GRU.
28 *
29 * Processes SENDING messages will use a kernel CBR/DSR to send
30 * the message. This is transparent to the caller.
31 *
32 * The receiver does not use any GRU resources.
33 *
34 * The functions support:
35 * - single receiver
36 * - multiple senders
37 * - cross partition message
38 *
39 * Missing features ZZZ:
40 * - user options for dealing with timeouts, queue full, etc.
41 * - gru_create_message_queue() needs interrupt vector info
42 */
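/*
 * Minimal usage sketch (illustrative only, not part of this interface).
 * The buffer name, its size (8 cache lines) and the lack of error handling
 * are assumptions for the example; on UV, uv_gpa() converts a kernel
 * virtual address into a global physical address.
 *
 *	static char mqbuf[512] __cacheline_aligned;
 *	int msg[16] = { 0, 42 };	(first word left for the transport)
 *	void *next;
 *
 *	gru_create_message_queue(mqbuf, sizeof(mqbuf));
 *	if (gru_send_message_gpa(uv_gpa(mqbuf), msg, sizeof(msg)) == MQE_OK) {
 *		next = gru_get_next_message(mqbuf);
 *		if (next)
 *			gru_free_message(mqbuf, next);
 *	}
 */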
43
44/*
45 * Initialize a user allocated chunk of memory to be used as
46 * a message queue. The caller must ensure that the queue is
47 * in contiguous physical memory and is cacheline aligned.
48 *
49 * Message queue size is the total number of bytes allocated
50 * to the queue including a 2 cacheline header that is used
51 * to manage the queue.
52 *
53 * Input:
54 * p pointer to user allocated memory.
55 * bytes size of message queue in bytes
56 *
57 * Errors:
58 * 0 OK
59 * >0 error
60 */
61extern int gru_create_message_queue(void *p, unsigned int bytes);
62
63/*
64 * Send a message to a message queue.
65 *
66 * Note: The message queue transport mechanism uses the first 32
67 * bits of the message. Users should avoid using these bits.
68 *
69 *
70 * Input:
71 * xmq message queue - must be a UV global physical address
72 * mesg pointer to message. Must be 64-bit aligned
73 * bytes size of message in bytes
74 *
75 * Output:
76 * 0 message sent
77 * >0 Send failure - see error codes below
78 *
79 */
80extern int gru_send_message_gpa(unsigned long mq_gpa, void *mesg,
81 unsigned int bytes);
82
83/* Status values for gru_send_message() */
84#define MQE_OK 0 /* message sent successfully */
85#define MQE_CONGESTION 1 /* temporary congestion, try again */
86#define MQE_QUEUE_FULL 2 /* queue is full */
87#define MQE_UNEXPECTED_CB_ERR 3 /* unexpected CB error */
88#define MQE_PAGE_OVERFLOW 10 /* BUG - queue overflowed a page */
89#define MQE_BUG_NO_RESOURCES 11 /* BUG - could not alloc GRU cb/dsr */
90
91/*
92 * Advance the receive pointer for the message queue to the next message.
93 * Note: current API requires messages to be gotten & freed in order. Future
94 * API extensions may allow for out-of-order freeing.
95 *
96 * Input
97 * mq message queue
98 * mesq message being freed
99 */
100extern void gru_free_message(void *mq, void *mesq);
101
102/*
103 * Get next message from message queue. Returns pointer to
104 * message OR NULL if no message present.
105 * User must call gru_free_message() after message is processed
106 * in order to move the queue pointers to next message.
107 *
108 * Input
109 * mq message queue
110 *
111 * Output:
112 * p pointer to message
113 * NULL no message available
114 */
115extern void *gru_get_next_message(void *mq);
116
117
118/*
119 * Copy data using the GRU. Source or destination can be located in a remote
120 * partition.
121 *
122 * Input:
123 * dest_gpa destination global physical address
124 * src_gpa source global physical address
125 * bytes number of bytes to copy
126 *
127 * Output:
128 * 0 OK
129 * >0 error
130 */
131extern int gru_copy_gpa(unsigned long dest_gpa, unsigned long src_gpa,
132 unsigned int bytes);
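/*
 * Illustrative call (not part of this interface): copy one page between two
 * kernel buffers whose addresses have been converted with uv_gpa(); "src"
 * and "dst" are placeholder names for the example.
 *
 *	ret = gru_copy_gpa(uv_gpa(dst), uv_gpa(src), PAGE_SIZE);
 */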
133
134#endif /* __GRU_KSERVICES_H_ */
diff --git a/drivers/misc/sgi-gru/grulib.h b/drivers/misc/sgi-gru/grulib.h
new file mode 100644
index 000000000000..e56e196a6998
--- /dev/null
+++ b/drivers/misc/sgi-gru/grulib.h
@@ -0,0 +1,97 @@
1/*
2 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU Lesser General Public License as published by
6 * the Free Software Foundation; either version 2.1 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#ifndef __GRULIB_H__
20#define __GRULIB_H__
21
22#define GRU_BASENAME "gru"
23#define GRU_FULLNAME "/dev/gru"
24#define GRU_IOCTL_NUM 'G'
25
26/*
27 * Maximum number of GRU segments that a user can have open
28 * ZZZ temp - set high for testing. Revisit.
29 */
30#define GRU_MAX_OPEN_CONTEXTS 32
31
32/* Set Number of Request Blocks */
33#define GRU_CREATE_CONTEXT _IOWR(GRU_IOCTL_NUM, 1, void *)
34
35/* Register task as using the slice */
36#define GRU_SET_TASK_SLICE _IOWR(GRU_IOCTL_NUM, 5, void *)
37
38/* Fetch exception detail */
39#define GRU_USER_GET_EXCEPTION_DETAIL _IOWR(GRU_IOCTL_NUM, 6, void *)
40
41/* For user call_os handling - normally a TLB fault */
42#define GRU_USER_CALL_OS _IOWR(GRU_IOCTL_NUM, 8, void *)
43
44/* For user unload context */
45#define GRU_USER_UNLOAD_CONTEXT _IOWR(GRU_IOCTL_NUM, 9, void *)
46
47/* For fetching GRU chiplet status */
48#define GRU_GET_CHIPLET_STATUS _IOWR(GRU_IOCTL_NUM, 10, void *)
49
50/* For user TLB flushing (primarily for tests) */
51#define GRU_USER_FLUSH_TLB _IOWR(GRU_IOCTL_NUM, 50, void *)
52
53/* Get some config options (primarily for tests & emulator) */
54#define GRU_GET_CONFIG_INFO _IOWR(GRU_IOCTL_NUM, 51, void *)
55
56#define CONTEXT_WINDOW_BYTES(th) (GRU_GSEG_PAGESIZE * (th))
57#define THREAD_POINTER(p, th) (p + GRU_GSEG_PAGESIZE * (th))
58
59/*
60 * Structure used to pass TLB flush parameters to the driver
61 */
62struct gru_create_context_req {
63 unsigned long gseg;
64 unsigned int data_segment_bytes;
65 unsigned int control_blocks;
66 unsigned int maximum_thread_count;
67 unsigned int options;
68};
69
70/*
71 * Structure used to pass unload context parameters to the driver
72 */
73struct gru_unload_context_req {
74 unsigned long gseg;
75};
76
77/*
78 * Structure used to pass TLB flush parameters to the driver
79 */
80struct gru_flush_tlb_req {
81 unsigned long gseg;
82 unsigned long vaddr;
83 size_t len;
84};
85
86/*
87 * GRU configuration info (temp - for testing)
88 */
89struct gru_config_info {
90 int cpus;
91 int blades;
92 int nodes;
93 int chiplets;
94 int fill[16];
95};
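/*
 * Illustrative user-space use of the ioctls above (not part of this header);
 * error handling is omitted and the variable names are placeholders.
 *
 *	struct gru_config_info info;
 *	int fd = open(GRU_FULLNAME, O_RDWR);
 *
 *	if (fd >= 0 && ioctl(fd, GRU_GET_CONFIG_INFO, &info) == 0)
 *		printf("%d blades, %d chiplets\n", info.blades, info.chiplets);
 */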
96
97#endif /* __GRULIB_H__ */
diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c
new file mode 100644
index 000000000000..0eeb8dddd2f5
--- /dev/null
+++ b/drivers/misc/sgi-gru/grumain.c
@@ -0,0 +1,802 @@
1/*
2 * SN Platform GRU Driver
3 *
4 * DRIVER TABLE MANAGER + GRU CONTEXT LOAD/UNLOAD
5 *
6 * This file is subject to the terms and conditions of the GNU General Public
7 * License. See the file "COPYING" in the main directory of this archive
8 * for more details.
9 *
10 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
11 */
12
13#include <linux/kernel.h>
14#include <linux/slab.h>
15#include <linux/mm.h>
16#include <linux/spinlock.h>
17#include <linux/sched.h>
18#include <linux/device.h>
19#include <linux/list.h>
20#include <asm/uv/uv_hub.h>
21#include "gru.h"
22#include "grutables.h"
23#include "gruhandles.h"
24
25unsigned long gru_options __read_mostly;
26
27static struct device_driver gru_driver = {
28 .name = "gru"
29};
30
31static struct device gru_device = {
32 .bus_id = {0},
33 .driver = &gru_driver,
34};
35
36struct device *grudev = &gru_device;
37
38/*
39 * Select a gru fault map to be used by the current cpu. Note that
40 * multiple cpus may be using the same map.
41 * ZZZ should "shift" be used?? Depends on HT cpu numbering
42 * ZZZ should be inline but did not work on emulator
43 */
44int gru_cpu_fault_map_id(void)
45{
46 return uv_blade_processor_id() % GRU_NUM_TFM;
47}
48
49/*--------- ASID Management -------------------------------------------
50 *
51 * Initially, assign asids sequentially from MIN_ASID .. MAX_ASID.
52 * Once MAX is reached, flush the TLB & start over. However,
53 * some asids may still be in use. There won't be many (percentage wise) still
54 * in use. Search active contexts & determine the value of the first
55 * asid in use ("x"s below). Set "limit" to this value.
56 * This defines a block of assignable asids.
57 *
58 * When "limit" is reached, search forward from limit+1 and determine the
59 * next block of assignable asids.
60 *
61 * Repeat until MAX_ASID is reached, then start over again.
62 *
63 * Each time MAX_ASID is reached, increment the asid generation. Since
64 * the search for in-use asids only checks contexts with GRUs currently
65 * assigned, asids in some contexts will be missed. Prior to loading
66 * a context, the asid generation of the GTS asid is rechecked. If it
67 * doesn't match the current generation, a new asid will be assigned.
68 *
69 * 0---------------x------------x---------------------x----|
70 * ^-next ^-limit ^-MAX_ASID
71 *
72 * All asid manipulation & context loading/unloading is protected by the
73 * gs_lock.
74 */
75
76/* Hit the asid limit. Start over */
77static int gru_wrap_asid(struct gru_state *gru)
78{
79 gru_dbg(grudev, "gru %p\n", gru);
80 STAT(asid_wrap);
81 gru->gs_asid_gen++;
82 gru_flush_all_tlb(gru);
83 return MIN_ASID;
84}
85
86/* Find the next chunk of unused asids */
87static int gru_reset_asid_limit(struct gru_state *gru, int asid)
88{
89 int i, gid, inuse_asid, limit;
90
91 gru_dbg(grudev, "gru %p, asid 0x%x\n", gru, asid);
92 STAT(asid_next);
93 limit = MAX_ASID;
94 if (asid >= limit)
95 asid = gru_wrap_asid(gru);
96 gid = gru->gs_gid;
97again:
98 for (i = 0; i < GRU_NUM_CCH; i++) {
99 if (!gru->gs_gts[i])
100 continue;
101 inuse_asid = gru->gs_gts[i]->ts_gms->ms_asids[gid].mt_asid;
102 gru_dbg(grudev, "gru %p, inuse_asid 0x%x, cxtnum %d, gts %p\n",
103 gru, inuse_asid, i, gru->gs_gts[i]);
104 if (inuse_asid == asid) {
105 asid += ASID_INC;
106 if (asid >= limit) {
107 /*
108 * empty range: reset the range limit and
109 * start over
110 */
111 limit = MAX_ASID;
112 if (asid >= MAX_ASID)
113 asid = gru_wrap_asid(gru);
114 goto again;
115 }
116 }
117
118 if ((inuse_asid > asid) && (inuse_asid < limit))
119 limit = inuse_asid;
120 }
121 gru->gs_asid_limit = limit;
122 gru->gs_asid = asid;
123 gru_dbg(grudev, "gru %p, new asid 0x%x, new_limit 0x%x\n", gru, asid,
124 limit);
125 return asid;
126}
127
128/* Assign a new ASID to a thread context. */
129static int gru_assign_asid(struct gru_state *gru)
130{
131 int asid;
132
133 spin_lock(&gru->gs_asid_lock);
134 gru->gs_asid += ASID_INC;
135 asid = gru->gs_asid;
136 if (asid >= gru->gs_asid_limit)
137 asid = gru_reset_asid_limit(gru, asid);
138 spin_unlock(&gru->gs_asid_lock);
139
140 gru_dbg(grudev, "gru %p, asid 0x%x\n", gru, asid);
141 return asid;
142}
143
144/*
145 * Clear n bits in a word. Return a word indicating the bits that were cleared.
146 * Optionally, build an array of chars that contain the bit numbers allocated.
147 */
148static unsigned long reserve_resources(unsigned long *p, int n, int mmax,
149 char *idx)
150{
151 unsigned long bits = 0;
152 int i;
153
154 do {
155 i = find_first_bit(p, mmax);
156 if (i == mmax)
157 BUG();
158 __clear_bit(i, p);
159 __set_bit(i, &bits);
160 if (idx)
161 *idx++ = i;
162 } while (--n);
163 return bits;
164}
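/*
 * Worked example (illustrative): with *p == 0x3f, n == 2 and a non-NULL idx,
 * reserve_resources() returns 0x3, leaves *p == 0x3c and stores bit numbers
 * {0, 1} in idx[].
 */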
165
166unsigned long gru_reserve_cb_resources(struct gru_state *gru, int cbr_au_count,
167 char *cbmap)
168{
169 return reserve_resources(&gru->gs_cbr_map, cbr_au_count, GRU_CBR_AU,
170 cbmap);
171}
172
173unsigned long gru_reserve_ds_resources(struct gru_state *gru, int dsr_au_count,
174 char *dsmap)
175{
176 return reserve_resources(&gru->gs_dsr_map, dsr_au_count, GRU_DSR_AU,
177 dsmap);
178}
179
180static void reserve_gru_resources(struct gru_state *gru,
181 struct gru_thread_state *gts)
182{
183 gru->gs_active_contexts++;
184 gts->ts_cbr_map =
185 gru_reserve_cb_resources(gru, gts->ts_cbr_au_count,
186 gts->ts_cbr_idx);
187 gts->ts_dsr_map =
188 gru_reserve_ds_resources(gru, gts->ts_dsr_au_count, NULL);
189}
190
191static void free_gru_resources(struct gru_state *gru,
192 struct gru_thread_state *gts)
193{
194 gru->gs_active_contexts--;
195 gru->gs_cbr_map |= gts->ts_cbr_map;
196 gru->gs_dsr_map |= gts->ts_dsr_map;
197}
198
199/*
200 * Check if a GRU has sufficient free resources to satisfy an allocation
201 * request. Note: GRU locks may or may not be held when this is called. If
202 * not held, recheck after acquiring the appropriate locks.
203 *
204 * Returns 1 if sufficient resources, 0 if not
205 */
206static int check_gru_resources(struct gru_state *gru, int cbr_au_count,
207 int dsr_au_count, int max_active_contexts)
208{
209 return hweight64(gru->gs_cbr_map) >= cbr_au_count
210 && hweight64(gru->gs_dsr_map) >= dsr_au_count
211 && gru->gs_active_contexts < max_active_contexts;
212}
213
214/*
215 * TLB management requires tracking all GRU chiplets that have loaded a GSEG
216 * context.
217 */
218static int gru_load_mm_tracker(struct gru_state *gru, struct gru_mm_struct *gms,
219 int ctxnum)
220{
221 struct gru_mm_tracker *asids = &gms->ms_asids[gru->gs_gid];
222 unsigned short ctxbitmap = (1 << ctxnum);
223 int asid;
224
225 spin_lock(&gms->ms_asid_lock);
226 asid = asids->mt_asid;
227
228 if (asid == 0 || asids->mt_asid_gen != gru->gs_asid_gen) {
229 asid = gru_assign_asid(gru);
230 asids->mt_asid = asid;
231 asids->mt_asid_gen = gru->gs_asid_gen;
232 STAT(asid_new);
233 } else {
234 STAT(asid_reuse);
235 }
236
237 BUG_ON(asids->mt_ctxbitmap & ctxbitmap);
238 asids->mt_ctxbitmap |= ctxbitmap;
239 if (!test_bit(gru->gs_gid, gms->ms_asidmap))
240 __set_bit(gru->gs_gid, gms->ms_asidmap);
241 spin_unlock(&gms->ms_asid_lock);
242
243 gru_dbg(grudev,
244 "gru %x, gms %p, ctxnum %d, asid 0x%x, asidmap 0x%lx\n",
245 gru->gs_gid, gms, ctxnum, asid, gms->ms_asidmap[0]);
246 return asid;
247}
248
249static void gru_unload_mm_tracker(struct gru_state *gru,
250 struct gru_mm_struct *gms, int ctxnum)
251{
252 struct gru_mm_tracker *asids;
253 unsigned short ctxbitmap;
254
255 asids = &gms->ms_asids[gru->gs_gid];
256 ctxbitmap = (1 << ctxnum);
257 spin_lock(&gms->ms_asid_lock);
258 BUG_ON((asids->mt_ctxbitmap & ctxbitmap) != ctxbitmap);
259 asids->mt_ctxbitmap ^= ctxbitmap;
260 gru_dbg(grudev, "gru %x, gms %p, ctxnum %d, asidmap 0x%lx\n",
261 gru->gs_gid, gms, ctxnum, gms->ms_asidmap[0]);
262 spin_unlock(&gms->ms_asid_lock);
263}
264
265/*
266 * Decrement the reference count on a GTS structure. Free the structure
267 * if the reference count goes to zero.
268 */
269void gts_drop(struct gru_thread_state *gts)
270{
271 if (gts && atomic_dec_return(&gts->ts_refcnt) == 0) {
272 gru_drop_mmu_notifier(gts->ts_gms);
273 kfree(gts);
274 STAT(gts_free);
275 }
276}
277
278/*
279 * Locate the GTS structure for the current thread.
280 */
281static struct gru_thread_state *gru_find_current_gts_nolock(struct gru_vma_data
282 *vdata, int tsid)
283{
284 struct gru_thread_state *gts;
285
286 list_for_each_entry(gts, &vdata->vd_head, ts_next)
287 if (gts->ts_tsid == tsid)
288 return gts;
289 return NULL;
290}
291
292/*
293 * Allocate a thread state structure.
294 */
295static struct gru_thread_state *gru_alloc_gts(struct vm_area_struct *vma,
296 struct gru_vma_data *vdata,
297 int tsid)
298{
299 struct gru_thread_state *gts;
300 int bytes;
301
302 bytes = DSR_BYTES(vdata->vd_dsr_au_count) +
303 CBR_BYTES(vdata->vd_cbr_au_count);
304 bytes += sizeof(struct gru_thread_state);
305 gts = kzalloc(bytes, GFP_KERNEL);
306 if (!gts)
307 return NULL;
308
309 STAT(gts_alloc);
310 atomic_set(&gts->ts_refcnt, 1);
311 mutex_init(&gts->ts_ctxlock);
312 gts->ts_cbr_au_count = vdata->vd_cbr_au_count;
313 gts->ts_dsr_au_count = vdata->vd_dsr_au_count;
314 gts->ts_user_options = vdata->vd_user_options;
315 gts->ts_tsid = tsid;
316 gts->ts_user_options = vdata->vd_user_options;
317 gts->ts_ctxnum = NULLCTX;
318 gts->ts_mm = current->mm;
319 gts->ts_vma = vma;
320 gts->ts_tlb_int_select = -1;
321 gts->ts_gms = gru_register_mmu_notifier();
322 if (!gts->ts_gms)
323 goto err;
324
325 gru_dbg(grudev, "alloc vdata %p, new gts %p\n", vdata, gts);
326 return gts;
327
328err:
329 gts_drop(gts);
330 return NULL;
331}
332
333/*
334 * Allocate a vma private data structure.
335 */
336struct gru_vma_data *gru_alloc_vma_data(struct vm_area_struct *vma, int tsid)
337{
338 struct gru_vma_data *vdata = NULL;
339
340 vdata = kmalloc(sizeof(*vdata), GFP_KERNEL);
341 if (!vdata)
342 return NULL;
343
344 INIT_LIST_HEAD(&vdata->vd_head);
345 spin_lock_init(&vdata->vd_lock);
346 gru_dbg(grudev, "alloc vdata %p\n", vdata);
347 return vdata;
348}
349
350/*
351 * Find the thread state structure for the current thread.
352 */
353struct gru_thread_state *gru_find_thread_state(struct vm_area_struct *vma,
354 int tsid)
355{
356 struct gru_vma_data *vdata = vma->vm_private_data;
357 struct gru_thread_state *gts;
358
359 spin_lock(&vdata->vd_lock);
360 gts = gru_find_current_gts_nolock(vdata, tsid);
361 spin_unlock(&vdata->vd_lock);
362 gru_dbg(grudev, "vma %p, gts %p\n", vma, gts);
363 return gts;
364}
365
366/*
367 * Allocate a new thread state for a GSEG. Note that a race may allow
368 * another thread to create a gts first; if so, the existing gts is used.
369 */
370struct gru_thread_state *gru_alloc_thread_state(struct vm_area_struct *vma,
371 int tsid)
372{
373 struct gru_vma_data *vdata = vma->vm_private_data;
374 struct gru_thread_state *gts, *ngts;
375
376 gts = gru_alloc_gts(vma, vdata, tsid);
377 if (!gts)
378 return NULL;
379
380 spin_lock(&vdata->vd_lock);
381 ngts = gru_find_current_gts_nolock(vdata, tsid);
382 if (ngts) {
383 gts_drop(gts);
384 gts = ngts;
385 STAT(gts_double_allocate);
386 } else {
387 list_add(&gts->ts_next, &vdata->vd_head);
388 }
389 spin_unlock(&vdata->vd_lock);
390 gru_dbg(grudev, "vma %p, gts %p\n", vma, gts);
391 return gts;
392}
393
394/*
395 * Free the GRU context assigned to the thread state.
396 */
397static void gru_free_gru_context(struct gru_thread_state *gts)
398{
399 struct gru_state *gru;
400
401 gru = gts->ts_gru;
402 gru_dbg(grudev, "gts %p, gru %p\n", gts, gru);
403
404 spin_lock(&gru->gs_lock);
405 gru->gs_gts[gts->ts_ctxnum] = NULL;
406 free_gru_resources(gru, gts);
407 BUG_ON(test_bit(gts->ts_ctxnum, &gru->gs_context_map) == 0);
408 __clear_bit(gts->ts_ctxnum, &gru->gs_context_map);
409 gts->ts_ctxnum = NULLCTX;
410 gts->ts_gru = NULL;
411 spin_unlock(&gru->gs_lock);
412
413 gts_drop(gts);
414 STAT(free_context);
415}
416
417/*
418 * Prefetching cachelines helps hardware performance.
419 * (Strictly a performance enhancement. Not functionally required).
420 */
421static void prefetch_data(void *p, int num, int stride)
422{
423 while (num-- > 0) {
424 prefetchw(p);
425 p += stride;
426 }
427}
428
429static inline long gru_copy_handle(void *d, void *s)
430{
431 memcpy(d, s, GRU_HANDLE_BYTES);
432 return GRU_HANDLE_BYTES;
433}
434
435/* rewrite in assembly & use lots of prefetch */
436static void gru_load_context_data(void *save, void *grubase, int ctxnum,
437 unsigned long cbrmap, unsigned long dsrmap)
438{
439 void *gseg, *cb, *cbe;
440 unsigned long length;
441 int i, scr;
442
443 gseg = grubase + ctxnum * GRU_GSEG_STRIDE;
444 length = hweight64(dsrmap) * GRU_DSR_AU_BYTES;
445 prefetch_data(gseg + GRU_DS_BASE, length / GRU_CACHE_LINE_BYTES,
446 GRU_CACHE_LINE_BYTES);
447
448 cb = gseg + GRU_CB_BASE;
449 cbe = grubase + GRU_CBE_BASE;
450 for_each_cbr_in_allocation_map(i, &cbrmap, scr) {
451 prefetch_data(cb, 1, GRU_CACHE_LINE_BYTES);
452 prefetch_data(cbe + i * GRU_HANDLE_STRIDE, 1,
453 GRU_CACHE_LINE_BYTES);
454 cb += GRU_HANDLE_STRIDE;
455 }
456
457 cb = gseg + GRU_CB_BASE;
458 for_each_cbr_in_allocation_map(i, &cbrmap, scr) {
459 save += gru_copy_handle(cb, save);
460 save += gru_copy_handle(cbe + i * GRU_HANDLE_STRIDE, save);
461 cb += GRU_HANDLE_STRIDE;
462 }
463
464 memcpy(gseg + GRU_DS_BASE, save, length);
465}
466
467static void gru_unload_context_data(void *save, void *grubase, int ctxnum,
468 unsigned long cbrmap, unsigned long dsrmap)
469{
470 void *gseg, *cb, *cbe;
471 unsigned long length;
472 int i, scr;
473
474 gseg = grubase + ctxnum * GRU_GSEG_STRIDE;
475
476 cb = gseg + GRU_CB_BASE;
477 cbe = grubase + GRU_CBE_BASE;
478 for_each_cbr_in_allocation_map(i, &cbrmap, scr) {
479 save += gru_copy_handle(save, cb);
480 save += gru_copy_handle(save, cbe + i * GRU_HANDLE_STRIDE);
481 cb += GRU_HANDLE_STRIDE;
482 }
483 length = hweight64(dsrmap) * GRU_DSR_AU_BYTES;
484 memcpy(save, gseg + GRU_DS_BASE, length);
485}
486
487void gru_unload_context(struct gru_thread_state *gts, int savestate)
488{
489 struct gru_state *gru = gts->ts_gru;
490 struct gru_context_configuration_handle *cch;
491 int ctxnum = gts->ts_ctxnum;
492
493 zap_vma_ptes(gts->ts_vma, UGRUADDR(gts), GRU_GSEG_PAGESIZE);
494 cch = get_cch(gru->gs_gru_base_vaddr, ctxnum);
495
496 lock_cch_handle(cch);
497 if (cch_interrupt_sync(cch))
498 BUG();
499 gru_dbg(grudev, "gts %p\n", gts);
500
501 gru_unload_mm_tracker(gru, gts->ts_gms, gts->ts_ctxnum);
502 if (savestate)
503 gru_unload_context_data(gts->ts_gdata, gru->gs_gru_base_vaddr,
504 ctxnum, gts->ts_cbr_map,
505 gts->ts_dsr_map);
506
507 if (cch_deallocate(cch))
508 BUG();
509 gts->ts_force_unload = 0; /* ts_force_unload locked by CCH lock */
510 unlock_cch_handle(cch);
511
512 gru_free_gru_context(gts);
513 STAT(unload_context);
514}
515
516/*
517 * Load a GRU context by copying it from the thread data structure in memory
518 * to the GRU.
519 */
520static void gru_load_context(struct gru_thread_state *gts)
521{
522 struct gru_state *gru = gts->ts_gru;
523 struct gru_context_configuration_handle *cch;
524 int err, asid, ctxnum = gts->ts_ctxnum;
525
526 gru_dbg(grudev, "gts %p\n", gts);
527 cch = get_cch(gru->gs_gru_base_vaddr, ctxnum);
528
529 lock_cch_handle(cch);
530 asid = gru_load_mm_tracker(gru, gts->ts_gms, gts->ts_ctxnum);
531 cch->tfm_fault_bit_enable =
532 (gts->ts_user_options == GRU_OPT_MISS_FMM_POLL
533 || gts->ts_user_options == GRU_OPT_MISS_FMM_INTR);
534 cch->tlb_int_enable = (gts->ts_user_options == GRU_OPT_MISS_FMM_INTR);
535 if (cch->tlb_int_enable) {
536 gts->ts_tlb_int_select = gru_cpu_fault_map_id();
537 cch->tlb_int_select = gts->ts_tlb_int_select;
538 }
539 cch->tfm_done_bit_enable = 0;
540 err = cch_allocate(cch, asid, gts->ts_cbr_map, gts->ts_dsr_map);
541 if (err) {
542 gru_dbg(grudev,
543 "err %d: cch %p, gts %p, cbr 0x%lx, dsr 0x%lx\n",
544 err, cch, gts, gts->ts_cbr_map, gts->ts_dsr_map);
545 BUG();
546 }
547
548 gru_load_context_data(gts->ts_gdata, gru->gs_gru_base_vaddr, ctxnum,
549 gts->ts_cbr_map, gts->ts_dsr_map);
550
551 if (cch_start(cch))
552 BUG();
553 unlock_cch_handle(cch);
554
555 STAT(load_context);
556}
557
558/*
559 * Update fields in an active CCH:
560 * - retarget interrupts on local blade
561 * - force a delayed context unload by clearing the CCH asids. This
562 * forces TLB misses for new GRU instructions. The context is unloaded
563 * when the next TLB miss occurs.
564 */
565static int gru_update_cch(struct gru_thread_state *gts, int int_select)
566{
567 struct gru_context_configuration_handle *cch;
568 struct gru_state *gru = gts->ts_gru;
569 int i, ctxnum = gts->ts_ctxnum, ret = 0;
570
571 cch = get_cch(gru->gs_gru_base_vaddr, ctxnum);
572
573 lock_cch_handle(cch);
574 if (cch->state == CCHSTATE_ACTIVE) {
575 if (gru->gs_gts[gts->ts_ctxnum] != gts)
576 goto exit;
577 if (cch_interrupt(cch))
578 BUG();
579 if (int_select >= 0) {
580 gts->ts_tlb_int_select = int_select;
581 cch->tlb_int_select = int_select;
582 } else {
583 for (i = 0; i < 8; i++)
584 cch->asid[i] = 0;
585 cch->tfm_fault_bit_enable = 0;
586 cch->tlb_int_enable = 0;
587 gts->ts_force_unload = 1;
588 }
589 if (cch_start(cch))
590 BUG();
591 ret = 1;
592 }
593exit:
594 unlock_cch_handle(cch);
595 return ret;
596}
597
598/*
599 * Update CCH tlb interrupt select. Required when all of the following are true:
600 * - task's GRU context is loaded into a GRU
601 * - task is using interrupt notification for TLB faults
602 * - task has migrated to a different cpu on the same blade where
603 * it was previously running.
604 */
605static int gru_retarget_intr(struct gru_thread_state *gts)
606{
607 if (gts->ts_tlb_int_select < 0
608 || gts->ts_tlb_int_select == gru_cpu_fault_map_id())
609 return 0;
610
611 gru_dbg(grudev, "retarget from %d to %d\n", gts->ts_tlb_int_select,
612 gru_cpu_fault_map_id());
613 return gru_update_cch(gts, gru_cpu_fault_map_id());
614}
615
616
617/*
618 * Insufficient GRU resources available on the local blade. Steal a context from
619 * a process. This is a hack until a _real_ resource scheduler is written....
620 */
621#define next_ctxnum(n) ((n) < GRU_NUM_CCH - 2 ? (n) + 1 : 0)
622#define next_gru(b, g) (((g) < &(b)->bs_grus[GRU_CHIPLETS_PER_BLADE - 1]) ? \
623 ((g)+1) : &(b)->bs_grus[0])
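/*
 * Note added for clarity: next_ctxnum()/next_gru() implement a round-robin
 * cursor over the (gru, ctxnum) pairs on the blade. The last position is
 * remembered in bs_lru_gru/bs_lru_ctxnum (see gru_steal_context() below),
 * so repeated steal attempts are spread across all chiplets and contexts
 * instead of always victimizing context 0 of the first GRU.
 */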
624
625static void gru_steal_context(struct gru_thread_state *gts)
626{
627 struct gru_blade_state *blade;
628 struct gru_state *gru, *gru0;
629 struct gru_thread_state *ngts = NULL;
630 int ctxnum, ctxnum0, flag = 0, cbr, dsr;
631
632 cbr = gts->ts_cbr_au_count;
633 dsr = gts->ts_dsr_au_count;
634
635 preempt_disable();
636 blade = gru_base[uv_numa_blade_id()];
637 spin_lock(&blade->bs_lock);
638
639 ctxnum = next_ctxnum(blade->bs_lru_ctxnum);
640 gru = blade->bs_lru_gru;
641 if (ctxnum == 0)
642 gru = next_gru(blade, gru);
643 ctxnum0 = ctxnum;
644 gru0 = gru;
645 while (1) {
646 if (check_gru_resources(gru, cbr, dsr, GRU_NUM_CCH))
647 break;
648 spin_lock(&gru->gs_lock);
649 for (; ctxnum < GRU_NUM_CCH; ctxnum++) {
650 if (flag && gru == gru0 && ctxnum == ctxnum0)
651 break;
652 ngts = gru->gs_gts[ctxnum];
653 /*
654 * We are grabbing locks out of order, so trylock is
655 * needed. GTSs are usually not locked, so the odds of
656 * success are high. If trylock fails, try to steal a
657 * different GSEG.
658 */
659 if (ngts && mutex_trylock(&ngts->ts_ctxlock))
660 break;
661 ngts = NULL;
662 flag = 1;
663 }
664 spin_unlock(&gru->gs_lock);
665 if (ngts || (flag && gru == gru0 && ctxnum == ctxnum0))
666 break;
667 ctxnum = 0;
668 gru = next_gru(blade, gru);
669 }
670 blade->bs_lru_gru = gru;
671 blade->bs_lru_ctxnum = ctxnum;
672 spin_unlock(&blade->bs_lock);
673 preempt_enable();
674
675 if (ngts) {
676 STAT(steal_context);
677 ngts->ts_steal_jiffies = jiffies;
678 gru_unload_context(ngts, 1);
679 mutex_unlock(&ngts->ts_ctxlock);
680 } else {
681 STAT(steal_context_failed);
682 }
683 gru_dbg(grudev,
684 "stole gru %x, ctxnum %d from gts %p. Need cb %d, ds %d;"
685 " avail cb %ld, ds %ld\n",
686 gru->gs_gid, ctxnum, ngts, cbr, dsr, hweight64(gru->gs_cbr_map),
687 hweight64(gru->gs_dsr_map));
688}
689
690/*
691 * Scan the GRUs on the local blade & assign a GRU context.
692 */
693static struct gru_state *gru_assign_gru_context(struct gru_thread_state *gts)
694{
695 struct gru_state *gru, *grux;
696 int i, max_active_contexts;
697
698 preempt_disable();
699
700again:
701 gru = NULL;
702 max_active_contexts = GRU_NUM_CCH;
703 for_each_gru_on_blade(grux, uv_numa_blade_id(), i) {
704 if (check_gru_resources(grux, gts->ts_cbr_au_count,
705 gts->ts_dsr_au_count,
706 max_active_contexts)) {
707 gru = grux;
708 max_active_contexts = grux->gs_active_contexts;
709 if (max_active_contexts == 0)
710 break;
711 }
712 }
713
714 if (gru) {
715 spin_lock(&gru->gs_lock);
716 if (!check_gru_resources(gru, gts->ts_cbr_au_count,
717 gts->ts_dsr_au_count, GRU_NUM_CCH)) {
718 spin_unlock(&gru->gs_lock);
719 goto again;
720 }
721 reserve_gru_resources(gru, gts);
722 gts->ts_gru = gru;
723 gts->ts_ctxnum =
724 find_first_zero_bit(&gru->gs_context_map, GRU_NUM_CCH);
725 BUG_ON(gts->ts_ctxnum == GRU_NUM_CCH);
726 atomic_inc(&gts->ts_refcnt);
727 gru->gs_gts[gts->ts_ctxnum] = gts;
728 __set_bit(gts->ts_ctxnum, &gru->gs_context_map);
729 spin_unlock(&gru->gs_lock);
730
731 STAT(assign_context);
732 gru_dbg(grudev,
733 "gseg %p, gts %p, gru %x, ctx %d, cbr %d, dsr %d\n",
734 gseg_virtual_address(gts->ts_gru, gts->ts_ctxnum), gts,
735 gts->ts_gru->gs_gid, gts->ts_ctxnum,
736 gts->ts_cbr_au_count, gts->ts_dsr_au_count);
737 } else {
738 gru_dbg(grudev, "failed to allocate a GTS %s\n", "");
739 STAT(assign_context_failed);
740 }
741
742 preempt_enable();
743 return gru;
744}
745
746/*
747 * gru_fault
748 *
749 * Map the user's GRU segment
750 *
751 * Note: GRU segments are always mmapped on GRU_GSEG_PAGESIZE boundaries.
752 */
753int gru_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
754{
755 struct gru_thread_state *gts;
756 unsigned long paddr, vaddr;
757
758 vaddr = (unsigned long)vmf->virtual_address;
759 gru_dbg(grudev, "vma %p, vaddr 0x%lx (0x%lx)\n",
760 vma, vaddr, GSEG_BASE(vaddr));
761 STAT(nopfn);
762
763 /* The following check ensures vaddr is a valid address in the VMA */
764 gts = gru_find_thread_state(vma, TSID(vaddr, vma));
765 if (!gts)
766 return VM_FAULT_SIGBUS;
767
768again:
769 preempt_disable();
770 mutex_lock(&gts->ts_ctxlock);
771 if (gts->ts_gru) {
772 if (gts->ts_gru->gs_blade_id != uv_numa_blade_id()) {
773 STAT(migrated_nopfn_unload);
774 gru_unload_context(gts, 1);
775 } else {
776 if (gru_retarget_intr(gts))
777 STAT(migrated_nopfn_retarget);
778 }
779 }
780
781 if (!gts->ts_gru) {
782 if (!gru_assign_gru_context(gts)) {
783 mutex_unlock(&gts->ts_ctxlock);
784 preempt_enable();
785 schedule_timeout(GRU_ASSIGN_DELAY); /* true hack ZZZ */
786 if (gts->ts_steal_jiffies + GRU_STEAL_DELAY < jiffies)
787 gru_steal_context(gts);
788 goto again;
789 }
790 gru_load_context(gts);
791 paddr = gseg_physical_address(gts->ts_gru, gts->ts_ctxnum);
792 remap_pfn_range(vma, vaddr & ~(GRU_GSEG_PAGESIZE - 1),
793 paddr >> PAGE_SHIFT, GRU_GSEG_PAGESIZE,
794 vma->vm_page_prot);
795 }
796
797 mutex_unlock(&gts->ts_ctxlock);
798 preempt_enable();
799
800 return VM_FAULT_NOPAGE;
801}
802
diff --git a/drivers/misc/sgi-gru/gruprocfs.c b/drivers/misc/sgi-gru/gruprocfs.c
new file mode 100644
index 000000000000..533923f83f1a
--- /dev/null
+++ b/drivers/misc/sgi-gru/gruprocfs.c
@@ -0,0 +1,336 @@
1/*
2 * SN Platform GRU Driver
3 *
4 * PROC INTERFACES
5 *
6 * This file supports the /proc interfaces for the GRU driver
7 *
8 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25#include <linux/proc_fs.h>
26#include <linux/device.h>
27#include <linux/seq_file.h>
28#include <linux/uaccess.h>
29#include "gru.h"
30#include "grulib.h"
31#include "grutables.h"
32
33#define printstat(s, f) printstat_val(s, &gru_stats.f, #f)
34
35static void printstat_val(struct seq_file *s, atomic_long_t *v, char *id)
36{
37 unsigned long val = atomic_long_read(v);
38
39 if (val)
40 seq_printf(s, "%16lu %s\n", val, id);
41}
42
43static int statistics_show(struct seq_file *s, void *p)
44{
45 printstat(s, vdata_alloc);
46 printstat(s, vdata_free);
47 printstat(s, gts_alloc);
48 printstat(s, gts_free);
49 printstat(s, vdata_double_alloc);
50 printstat(s, gts_double_allocate);
51 printstat(s, assign_context);
52 printstat(s, assign_context_failed);
53 printstat(s, free_context);
54 printstat(s, load_context);
55 printstat(s, unload_context);
56 printstat(s, steal_context);
57 printstat(s, steal_context_failed);
58 printstat(s, nopfn);
59 printstat(s, break_cow);
60 printstat(s, asid_new);
61 printstat(s, asid_next);
62 printstat(s, asid_wrap);
63 printstat(s, asid_reuse);
64 printstat(s, intr);
65 printstat(s, call_os);
66 printstat(s, call_os_check_for_bug);
67 printstat(s, call_os_wait_queue);
68 printstat(s, user_flush_tlb);
69 printstat(s, user_unload_context);
70 printstat(s, user_exception);
71 printstat(s, set_task_slice);
72 printstat(s, migrate_check);
73 printstat(s, migrated_retarget);
74 printstat(s, migrated_unload);
75 printstat(s, migrated_unload_delay);
76 printstat(s, migrated_nopfn_retarget);
77 printstat(s, migrated_nopfn_unload);
78 printstat(s, tlb_dropin);
79 printstat(s, tlb_dropin_fail_no_asid);
80 printstat(s, tlb_dropin_fail_upm);
81 printstat(s, tlb_dropin_fail_invalid);
82 printstat(s, tlb_dropin_fail_range_active);
83 printstat(s, tlb_dropin_fail_idle);
84 printstat(s, tlb_dropin_fail_fmm);
85 printstat(s, mmu_invalidate_range);
86 printstat(s, mmu_invalidate_page);
87 printstat(s, mmu_clear_flush_young);
88 printstat(s, flush_tlb);
89 printstat(s, flush_tlb_gru);
90 printstat(s, flush_tlb_gru_tgh);
91 printstat(s, flush_tlb_gru_zero_asid);
92 printstat(s, copy_gpa);
93 printstat(s, mesq_receive);
94 printstat(s, mesq_receive_none);
95 printstat(s, mesq_send);
96 printstat(s, mesq_send_failed);
97 printstat(s, mesq_noop);
98 printstat(s, mesq_send_unexpected_error);
99 printstat(s, mesq_send_lb_overflow);
100 printstat(s, mesq_send_qlimit_reached);
101 printstat(s, mesq_send_amo_nacked);
102 printstat(s, mesq_send_put_nacked);
103 printstat(s, mesq_qf_not_full);
104 printstat(s, mesq_qf_locked);
105 printstat(s, mesq_qf_noop_not_full);
106 printstat(s, mesq_qf_switch_head_failed);
107 printstat(s, mesq_qf_unexpected_error);
108 printstat(s, mesq_noop_unexpected_error);
109 printstat(s, mesq_noop_lb_overflow);
110 printstat(s, mesq_noop_qlimit_reached);
111 printstat(s, mesq_noop_amo_nacked);
112 printstat(s, mesq_noop_put_nacked);
113 return 0;
114}
115
116static ssize_t statistics_write(struct file *file, const char __user *userbuf,
117 size_t count, loff_t *data)
118{
119 memset(&gru_stats, 0, sizeof(gru_stats));
120 return count;
121}
122
123static int options_show(struct seq_file *s, void *p)
124{
125 seq_printf(s, "0x%lx\n", gru_options);
126 return 0;
127}
128
129static ssize_t options_write(struct file *file, const char __user *userbuf,
130 size_t count, loff_t *data)
131{
132 unsigned long val;
133 char buf[80];
134
135 if (copy_from_user
136 (buf, userbuf, count < sizeof(buf) ? count : sizeof(buf)))
137 return -EFAULT;
138 if (!strict_strtoul(buf, 10, &val))
139 gru_options = val;
140
141 return count;
142}
143
144static int cch_seq_show(struct seq_file *file, void *data)
145{
146 long gid = *(long *)data;
147 int i;
148 struct gru_state *gru = GID_TO_GRU(gid);
149 struct gru_thread_state *ts;
150 const char *mode[] = { "??", "UPM", "INTR", "OS_POLL" };
151
152 if (gid == 0)
153 seq_printf(file, "#%5s%5s%6s%9s%6s%8s%8s\n", "gid", "bid",
154 "ctx#", "pid", "cbrs", "dsbytes", "mode");
155 if (gru)
156 for (i = 0; i < GRU_NUM_CCH; i++) {
157 ts = gru->gs_gts[i];
158 if (!ts)
159 continue;
160 seq_printf(file, " %5d%5d%6d%9d%6d%8d%8s\n",
161 gru->gs_gid, gru->gs_blade_id, i,
162 ts->ts_tgid_owner,
163 ts->ts_cbr_au_count * GRU_CBR_AU_SIZE,
164 ts->ts_dsr_au_count * GRU_DSR_AU_BYTES,
165 mode[ts->ts_user_options &
166 GRU_OPT_MISS_MASK]);
167 }
168
169 return 0;
170}
171
172static int gru_seq_show(struct seq_file *file, void *data)
173{
174 long gid = *(long *)data, ctxfree, cbrfree, dsrfree;
175 struct gru_state *gru = GID_TO_GRU(gid);
176
177 if (gid == 0) {
178 seq_printf(file, "#%5s%5s%7s%6s%6s%8s%6s%6s\n", "gid", "nid",
179 "ctx", "cbr", "dsr", "ctx", "cbr", "dsr");
180 seq_printf(file, "#%5s%5s%7s%6s%6s%8s%6s%6s\n", "", "", "busy",
181 "busy", "busy", "free", "free", "free");
182 }
183 if (gru) {
184 ctxfree = GRU_NUM_CCH - gru->gs_active_contexts;
185 cbrfree = hweight64(gru->gs_cbr_map) * GRU_CBR_AU_SIZE;
186 dsrfree = hweight64(gru->gs_dsr_map) * GRU_DSR_AU_BYTES;
187 seq_printf(file, " %5d%5d%7ld%6ld%6ld%8ld%6ld%6ld\n",
188 gru->gs_gid, gru->gs_blade_id, GRU_NUM_CCH - ctxfree,
189 GRU_NUM_CBE - cbrfree, GRU_NUM_DSR_BYTES - dsrfree,
190 ctxfree, cbrfree, dsrfree);
191 }
192
193 return 0;
194}
195
196static void seq_stop(struct seq_file *file, void *data)
197{
198}
199
200static void *seq_start(struct seq_file *file, loff_t *gid)
201{
202 if (*gid < GRU_MAX_GRUS)
203 return gid;
204 return NULL;
205}
206
207static void *seq_next(struct seq_file *file, void *data, loff_t *gid)
208{
209 (*gid)++;
210 if (*gid < GRU_MAX_GRUS)
211 return gid;
212 return NULL;
213}
214
215static const struct seq_operations cch_seq_ops = {
216 .start = seq_start,
217 .next = seq_next,
218 .stop = seq_stop,
219 .show = cch_seq_show
220};
221
222static const struct seq_operations gru_seq_ops = {
223 .start = seq_start,
224 .next = seq_next,
225 .stop = seq_stop,
226 .show = gru_seq_show
227};
228
229static int statistics_open(struct inode *inode, struct file *file)
230{
231 return single_open(file, statistics_show, NULL);
232}
233
234static int options_open(struct inode *inode, struct file *file)
235{
236 return single_open(file, options_show, NULL);
237}
238
239static int cch_open(struct inode *inode, struct file *file)
240{
241 return seq_open(file, &cch_seq_ops);
242}
243
244static int gru_open(struct inode *inode, struct file *file)
245{
246 return seq_open(file, &gru_seq_ops);
247}
248
249/* *INDENT-OFF* */
250static const struct file_operations statistics_fops = {
251 .open = statistics_open,
252 .read = seq_read,
253 .write = statistics_write,
254 .llseek = seq_lseek,
255 .release = single_release,
256};
257
258static const struct file_operations options_fops = {
259 .open = options_open,
260 .read = seq_read,
261 .write = options_write,
262 .llseek = seq_lseek,
263 .release = single_release,
264};
265
266static const struct file_operations cch_fops = {
267 .open = cch_open,
268 .read = seq_read,
269 .llseek = seq_lseek,
270 .release = seq_release,
271};
272static const struct file_operations gru_fops = {
273 .open = gru_open,
274 .read = seq_read,
275 .llseek = seq_lseek,
276 .release = seq_release,
277};
278
279static struct proc_entry {
280 char *name;
281 int mode;
282 const struct file_operations *fops;
283 struct proc_dir_entry *entry;
284} proc_files[] = {
285 {"statistics", 0644, &statistics_fops},
286 {"debug_options", 0644, &options_fops},
287 {"cch_status", 0444, &cch_fops},
288 {"gru_status", 0444, &gru_fops},
289 {NULL}
290};
291/* *INDENT-ON* */
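/*
 * Added note: given gru_proc_init() below, these entries appear as
 * /proc/sgi_uv/gru/statistics, debug_options, cch_status and gru_status.
 * Reading "statistics" dumps the non-zero counters in gru_stats and writing
 * anything to it clears them; "debug_options" shows gru_options and accepts
 * a decimal value to replace it.
 */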
292
293static struct proc_dir_entry *proc_gru __read_mostly;
294
295static int create_proc_file(struct proc_entry *p)
296{
297 p->entry = create_proc_entry(p->name, p->mode, proc_gru);
298 if (!p->entry)
299 return -1;
300 p->entry->proc_fops = p->fops;
301 return 0;
302}
303
304static void delete_proc_files(void)
305{
306 struct proc_entry *p;
307
308 if (proc_gru) {
309 for (p = proc_files; p->name; p++)
310 if (p->entry)
311 remove_proc_entry(p->name, proc_gru);
312 remove_proc_entry("gru", NULL);
313 }
314}
315
316int gru_proc_init(void)
317{
318 struct proc_entry *p;
319
320 proc_mkdir("sgi_uv", NULL);
321 proc_gru = proc_mkdir("sgi_uv/gru", NULL);
322
323 for (p = proc_files; p->name; p++)
324 if (create_proc_file(p))
325 goto err;
326 return 0;
327
328err:
329 delete_proc_files();
330 return -1;
331}
332
333void gru_proc_exit(void)
334{
335 delete_proc_files();
336}
diff --git a/drivers/misc/sgi-gru/grutables.h b/drivers/misc/sgi-gru/grutables.h
new file mode 100644
index 000000000000..4251018f70ff
--- /dev/null
+++ b/drivers/misc/sgi-gru/grutables.h
@@ -0,0 +1,609 @@
1/*
2 * SN Platform GRU Driver
3 *
4 * GRU DRIVER TABLES, MACROS, externs, etc
5 *
6 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#ifndef __GRUTABLES_H__
24#define __GRUTABLES_H__
25
26/*
27 * GRU Chiplet:
28 * The GRU is a user addressable memory accelerator. It provides
29 * several forms of load, store, memset, bcopy instructions. In addition, it
30 * contains special instructions for AMOs, sending messages to message
31 * queues, etc.
32 *
33 * The GRU is an integral part of the node controller. It connects
34 * directly to the cpu socket. In its current implementation, there are 2
35 * GRU chiplets in the node controller on each blade (~node).
36 *
37 * The entire GRU memory space is fully coherent and cacheable by the cpus.
38 *
39 * Each GRU chiplet has a physical memory map that looks like the following:
40 *
41 * +-----------------+
42 * |/////////////////|
43 * |/////////////////|
44 * |/////////////////|
45 * |/////////////////|
46 * |/////////////////|
47 * |/////////////////|
48 * |/////////////////|
49 * |/////////////////|
50 * +-----------------+
51 * | system control |
52 * +-----------------+ _______ +-------------+
53 * |/////////////////| / | |
54 * |/////////////////| / | |
55 * |/////////////////| / | instructions|
56 * |/////////////////| / | |
57 * |/////////////////| / | |
58 * |/////////////////| / |-------------|
59 * |/////////////////| / | |
60 * +-----------------+ | |
61 * | context 15 | | data |
62 * +-----------------+ | |
63 * | ...... | \ | |
64 * +-----------------+ \____________ +-------------+
65 * | context 1 |
66 * +-----------------+
67 * | context 0 |
68 * +-----------------+
69 *
70 * Each of the "contexts" is a chunk of memory that can be mmaped into user
71 * space. The context consists of 2 parts:
72 *
73 * - an instruction space that can be directly accessed by the user
74 * to issue GRU instructions and to check instruction status.
75 *
76 * - a data area that acts as normal RAM.
77 *
78 * User instructions contain virtual addresses of data to be accessed by the
79 * GRU. The GRU contains a TLB that is used to convert these user virtual
80 * addresses to physical addresses.
81 *
82 * The "system control" area of the GRU chiplet is used by the kernel driver
83 * to manage user contexts and to perform functions such as TLB dropin and
84 * purging.
85 *
86 * One context may be reserved for the kernel and used for cross-partition
87 * communication. The GRU will also be used to asynchronously zero out
88 * large blocks of memory (not currently implemented).
89 *
90 *
91 * Tables:
92 *
93 * VDATA-VMA Data - Holds a few parameters. Head of linked list of
94 * GTS tables for threads using the GSEG
95 * GTS - Gru Thread State - contains info for managing a GSEG context. A
96 * GTS is allocated for each thread accessing a
97 * GSEG.
98 * GTD - GRU Thread Data - contains shadow copy of GRU data when GSEG is
99 * not loaded into a GRU
100 * GMS - GRU Memory Struct - Used to manage TLB shootdowns. Tracks GRUs
101 * where a GSEG has been loaded. Similar to
102 * an mm_struct but for GRU.
103 *
104 * GS - GRU State - Used to manage the state of a GRU chiplet
105 * BS - Blade State - Used to manage state of all GRU chiplets
106 * on a blade
107 *
108 *
109 * Normal task tables for a task using the GRU.
110 * - 2 threads in process
111 * - 2 GSEGs open in process
112 * - GSEG1 is being used by both threads
113 * - GSEG2 is used only by thread 2
114 *
115 * task -->|
116 * task ---+---> mm ->------ (notifier) -------+-> gms
117 * | |
118 * |--> vma -> vdata ---> gts--->| GSEG1 (thread1)
119 * | | |
120 * | +-> gts--->| GSEG1 (thread2)
121 * | |
122 * |--> vma -> vdata ---> gts--->| GSEG2 (thread2)
123 * .
124 * .
125 *
126 * GSEGs are marked DONTCOPY on fork
127 *
128 * At open
129 * file.private_data -> NULL
130 *
131 * At mmap,
132 * vma -> vdata
133 *
134 * After gseg reference
135 * vma -> vdata ->gts
136 *
137 * After fork
138 * parent
139 * vma -> vdata -> gts
140 * child
141 * (vma is not copied)
142 *
143 */
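/*
 * Added summary of the lifecycle as implemented in grumain.c of this patch
 * (the open/mmap entry points live in grufile.c and grufault.c, not shown):
 *   mmap            -> a vdata is attached to the vma (gru_alloc_vma_data())
 *   first GSEG ref  -> a gts is created for the thread (gru_alloc_thread_state())
 *   page fault      -> gru_fault() picks a chiplet (gru_assign_gru_context()),
 *                      restores saved state (gru_load_context()) and maps the
 *                      GSEG with remap_pfn_range()
 *   steal / unload  -> gru_unload_context() saves state back into ts_gdata[]
 */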
144
145#include <linux/rmap.h>
146#include <linux/interrupt.h>
147#include <linux/mutex.h>
148#include <linux/wait.h>
149#include <linux/mmu_notifier.h>
150#include "gru.h"
151#include "gruhandles.h"
152
153extern struct gru_stats_s gru_stats;
154extern struct gru_blade_state *gru_base[];
155extern unsigned long gru_start_paddr, gru_end_paddr;
156
157#define GRU_MAX_BLADES MAX_NUMNODES
158#define GRU_MAX_GRUS (GRU_MAX_BLADES * GRU_CHIPLETS_PER_BLADE)
159
160#define GRU_DRIVER_ID_STR "SGI GRU Device Driver"
161#define GRU_DRIVER_VERSION_STR "0.80"
162
163/*
164 * GRU statistics.
165 */
166struct gru_stats_s {
167 atomic_long_t vdata_alloc;
168 atomic_long_t vdata_free;
169 atomic_long_t gts_alloc;
170 atomic_long_t gts_free;
171 atomic_long_t vdata_double_alloc;
172 atomic_long_t gts_double_allocate;
173 atomic_long_t assign_context;
174 atomic_long_t assign_context_failed;
175 atomic_long_t free_context;
176 atomic_long_t load_context;
177 atomic_long_t unload_context;
178 atomic_long_t steal_context;
179 atomic_long_t steal_context_failed;
180 atomic_long_t nopfn;
181 atomic_long_t break_cow;
182 atomic_long_t asid_new;
183 atomic_long_t asid_next;
184 atomic_long_t asid_wrap;
185 atomic_long_t asid_reuse;
186 atomic_long_t intr;
187 atomic_long_t call_os;
188 atomic_long_t call_os_check_for_bug;
189 atomic_long_t call_os_wait_queue;
190 atomic_long_t user_flush_tlb;
191 atomic_long_t user_unload_context;
192 atomic_long_t user_exception;
193 atomic_long_t set_task_slice;
194 atomic_long_t migrate_check;
195 atomic_long_t migrated_retarget;
196 atomic_long_t migrated_unload;
197 atomic_long_t migrated_unload_delay;
198 atomic_long_t migrated_nopfn_retarget;
199 atomic_long_t migrated_nopfn_unload;
200 atomic_long_t tlb_dropin;
201 atomic_long_t tlb_dropin_fail_no_asid;
202 atomic_long_t tlb_dropin_fail_upm;
203 atomic_long_t tlb_dropin_fail_invalid;
204 atomic_long_t tlb_dropin_fail_range_active;
205 atomic_long_t tlb_dropin_fail_idle;
206 atomic_long_t tlb_dropin_fail_fmm;
207 atomic_long_t mmu_invalidate_range;
208 atomic_long_t mmu_invalidate_page;
209 atomic_long_t mmu_clear_flush_young;
210 atomic_long_t flush_tlb;
211 atomic_long_t flush_tlb_gru;
212 atomic_long_t flush_tlb_gru_tgh;
213 atomic_long_t flush_tlb_gru_zero_asid;
214
215 atomic_long_t copy_gpa;
216
217 atomic_long_t mesq_receive;
218 atomic_long_t mesq_receive_none;
219 atomic_long_t mesq_send;
220 atomic_long_t mesq_send_failed;
221 atomic_long_t mesq_noop;
222 atomic_long_t mesq_send_unexpected_error;
223 atomic_long_t mesq_send_lb_overflow;
224 atomic_long_t mesq_send_qlimit_reached;
225 atomic_long_t mesq_send_amo_nacked;
226 atomic_long_t mesq_send_put_nacked;
227 atomic_long_t mesq_qf_not_full;
228 atomic_long_t mesq_qf_locked;
229 atomic_long_t mesq_qf_noop_not_full;
230 atomic_long_t mesq_qf_switch_head_failed;
231 atomic_long_t mesq_qf_unexpected_error;
232 atomic_long_t mesq_noop_unexpected_error;
233 atomic_long_t mesq_noop_lb_overflow;
234 atomic_long_t mesq_noop_qlimit_reached;
235 atomic_long_t mesq_noop_amo_nacked;
236 atomic_long_t mesq_noop_put_nacked;
237
238};
239
240#define OPT_DPRINT 1
241#define OPT_STATS 2
242#define GRU_QUICKLOOK 4
243
244
245#define IRQ_GRU 110 /* Starting IRQ number for interrupts */
246
247/* Delay in jiffies between attempts to assign a GRU context */
248#define GRU_ASSIGN_DELAY ((HZ * 20) / 1000)
249
250/*
251 * If a process has its context stolen, min delay in jiffies before trying to
252 * steal a context from another process.
253 */
254#define GRU_STEAL_DELAY ((HZ * 200) / 1000)
255
256#define STAT(id) do { \
257 if (gru_options & OPT_STATS) \
258 atomic_long_inc(&gru_stats.id); \
259 } while (0)
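/*
 * Usage note (added): STAT() increments nothing unless OPT_STATS is set in
 * gru_options, which can be changed at run time via
 * /proc/sgi_uv/gru/debug_options. The counters accumulate in gru_stats and
 * are reported through /proc/sgi_uv/gru/statistics (see gruprocfs.c).
 */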
260
261#ifdef CONFIG_SGI_GRU_DEBUG
262#define gru_dbg(dev, fmt, x...) \
263 do { \
264 if (gru_options & OPT_DPRINT) \
265 dev_dbg(dev, "%s: " fmt, __func__, x); \
266 } while (0)
267#else
268#define gru_dbg(x...)
269#endif
270
271/*-----------------------------------------------------------------------------
272 * ASID management
273 */
274#define MAX_ASID 0xfffff0
275#define MIN_ASID 8
276#define ASID_INC 8 /* number of regions */
277
278/* Generate a GRU asid value from a GRU base asid & a virtual address. */
279#if defined CONFIG_IA64
280#define VADDR_HI_BIT 64
281#define GRUREGION(addr) ((addr) >> (VADDR_HI_BIT - 3) & 3)
282#elif defined __x86_64
283#define VADDR_HI_BIT 48
284#define GRUREGION(addr) (0) /* ZZZ could do better */
285#else
286#error "Unsupported architecture"
287#endif
288#define GRUASID(asid, addr) ((asid) + GRUREGION(addr))
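/*
 * Worked example (illustrative): on IA64, GRUREGION() reduces the 3-bit
 * region number (top bits of the address) to two bits, so an address with
 * (addr >> 61) == 2 gives GRUREGION(addr) == 2 and a base asid of 0x100
 * becomes GRUASID(0x100, addr) == 0x102. Since ASID_INC is 8, consecutive
 * base asids are spaced far enough apart that the region offsets (0..3)
 * cannot collide. On x86_64 GRUREGION() is currently 0, so GRUASID() is
 * simply the base asid.
 */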
289
290/*------------------------------------------------------------------------------
291 * File & VMS Tables
292 */
293
294struct gru_state;
295
296/*
297 * The following structures are reached from the mm_struct via the mmu
298 * notifier pointer. There is one gru_mm_struct (GMS) per address space.
299 */
300struct gru_mm_tracker {
301 unsigned int mt_asid_gen; /* ASID wrap count */
302 int mt_asid; /* current base ASID for gru */
303 unsigned short mt_ctxbitmap; /* bitmap of contexts using
304 asid */
305};
306
307struct gru_mm_struct {
308 struct mmu_notifier ms_notifier;
309 atomic_t ms_refcnt;
310 spinlock_t ms_asid_lock; /* protects ASID assignment */
311 atomic_t ms_range_active;/* num range_invals active */
312 char ms_released;
313 wait_queue_head_t ms_wait_queue;
314 DECLARE_BITMAP(ms_asidmap, GRU_MAX_GRUS);
315 struct gru_mm_tracker ms_asids[GRU_MAX_GRUS];
316};
317
318/*
319 * One of these structures is allocated when a GSEG is mmaped. The
320 * structure is pointed to by the vma->vm_private_data field in the vma struct.
321 */
322struct gru_vma_data {
323 spinlock_t vd_lock; /* Serialize access to vma */
324 struct list_head vd_head; /* head of linked list of gts */
325 long vd_user_options;/* misc user option flags */
326 int vd_cbr_au_count;
327 int vd_dsr_au_count;
328};
329
330/*
331 * One of these is allocated for each thread accessing an mmapped GRU. A linked
332 * list of these structures is hung off the struct gru_vma_data in the vma.
333 */
334struct gru_thread_state {
335 struct list_head ts_next; /* list - head at vma-private */
336 struct mutex ts_ctxlock; /* load/unload CTX lock */
337 struct mm_struct *ts_mm; /* mm currently mapped to
338 context */
339 struct vm_area_struct *ts_vma; /* vma of GRU context */
340 struct gru_state *ts_gru; /* GRU where the context is
341 loaded */
342 struct gru_mm_struct *ts_gms; /* asid & ioproc struct */
343 unsigned long ts_cbr_map; /* map of allocated CBRs */
344 unsigned long ts_dsr_map; /* map of allocated DATA
345 resources */
346 unsigned long ts_steal_jiffies;/* jiffies when context last
347 stolen */
348 long ts_user_options;/* misc user option flags */
349 pid_t ts_tgid_owner; /* task that is using the
350 context - for migration */
351 int ts_tsid; /* thread that owns the
352 structure */
353 int ts_tlb_int_select;/* target cpu if interrupts
354 enabled */
355 int ts_ctxnum; /* context number where the
356 context is loaded */
357 atomic_t ts_refcnt; /* reference count GTS */
358 unsigned char ts_dsr_au_count;/* Number of DSR resources
359 required for context */
360 unsigned char ts_cbr_au_count;/* Number of CBR resources
361 required for context */
362 char ts_force_unload;/* force context to be unloaded
363 after migration */
364 char ts_cbr_idx[GRU_CBR_AU];/* CBR numbers of each
365 allocated CB */
366 unsigned long ts_gdata[0]; /* save area for GRU data (CB,
367 DS, CBE) */
368};
369
370/*
371 * Threaded programs actually allocate an array of GSEGs when a context is
372 * created. Each thread uses a separate GSEG. TSID is the index into the GSEG
373 * array.
374 */
375#define TSID(a, v) (((a) - (v)->vm_start) / GRU_GSEG_PAGESIZE)
376#define UGRUADDR(gts) ((gts)->ts_vma->vm_start + \
377 (gts)->ts_tsid * GRU_GSEG_PAGESIZE)
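/*
 * Example (added): a reference at vm_start + 2 * GRU_GSEG_PAGESIZE yields
 * TSID() == 2, selecting the third GSEG in the array; UGRUADDR() inverts
 * this, mapping that gts back to the user address of its GSEG slot.
 */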
378
379#define NULLCTX (-1) /* if context not loaded into GRU */
380
381/*-----------------------------------------------------------------------------
382 * GRU State Tables
383 */
384
385/*
386 * One of these exists for each GRU chiplet.
387 */
388struct gru_state {
389 struct gru_blade_state *gs_blade; /* GRU state for entire
390 blade */
391 unsigned long gs_gru_base_paddr; /* Physical address of
392 gru segments (64) */
393 void *gs_gru_base_vaddr; /* Virtual address of
394 gru segments (64) */
395 unsigned char gs_gid; /* unique GRU number */
396 unsigned char gs_tgh_local_shift; /* used to pick TGH for
397 local flush */
398 unsigned char gs_tgh_first_remote; /* starting TGH# for
399 remote flush */
400 unsigned short gs_blade_id; /* blade of GRU */
401 spinlock_t gs_asid_lock; /* lock used for
402 assigning asids */
403 spinlock_t gs_lock; /* lock used for
404 assigning contexts */
405
406 /* -- the following are protected by the gs_asid_lock spinlock ---- */
407 unsigned int gs_asid; /* Next available ASID */
408 unsigned int gs_asid_limit; /* Limit of available
409 ASIDs */
410 unsigned int gs_asid_gen; /* asid generation.
411 Inc on wrap */
412
413 /* --- the following fields are protected by the gs_lock spinlock --- */
414 unsigned long gs_context_map; /* bitmap to manage
415 contexts in use */
416 unsigned long gs_cbr_map; /* bitmap to manage CB
417 resources */
418 unsigned long gs_dsr_map; /* bitmap used to manage
419 DATA resources */
420 unsigned int gs_reserved_cbrs; /* Number of kernel-
421 reserved cbrs */
422 unsigned int gs_reserved_dsr_bytes; /* Bytes of kernel-
423 reserved dsrs */
424 unsigned short gs_active_contexts; /* number of contexts
425 in use */
426 struct gru_thread_state *gs_gts[GRU_NUM_CCH]; /* GTS currently using
427 the context */
428};
429
430/*
431 * This structure contains the GRU state for all the GRUs on a blade.
432 */
433struct gru_blade_state {
434 void *kernel_cb; /* First kernel
435 reserved cb */
436 void *kernel_dsr; /* First kernel
437 reserved DSR */
438 /* ---- the following are protected by the bs_lock spinlock ---- */
439 spinlock_t bs_lock; /* lock used for
440 stealing contexts */
441 int bs_lru_ctxnum; /* STEAL - last context
442 stolen */
443 struct gru_state *bs_lru_gru; /* STEAL - last gru
444 stolen */
445
446 struct gru_state bs_grus[GRU_CHIPLETS_PER_BLADE];
447};
448
449/*-----------------------------------------------------------------------------
450 * Address Primitives
451 */
452#define get_tfm_for_cpu(g, c) \
453 ((struct gru_tlb_fault_map *)get_tfm((g)->gs_gru_base_vaddr, (c)))
454#define get_tfh_by_index(g, i) \
455 ((struct gru_tlb_fault_handle *)get_tfh((g)->gs_gru_base_vaddr, (i)))
456#define get_tgh_by_index(g, i) \
457 ((struct gru_tlb_global_handle *)get_tgh((g)->gs_gru_base_vaddr, (i)))
458#define get_cbe_by_index(g, i) \
459 ((struct gru_control_block_extended *)get_cbe((g)->gs_gru_base_vaddr,\
460 (i)))
461
462/*-----------------------------------------------------------------------------
463 * Useful Macros
464 */
465
466/* Given a blade# & chiplet#, get a pointer to the GRU */
467#define get_gru(b, c) (&gru_base[b]->bs_grus[c])
468
469/* Number of bytes to save/restore when unloading/loading GRU contexts */
470#define DSR_BYTES(dsr) ((dsr) * GRU_DSR_AU_BYTES)
471#define CBR_BYTES(cbr) ((cbr) * GRU_HANDLE_BYTES * GRU_CBR_AU_SIZE * 2)
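/*
 * Note (added): the "* 2" in CBR_BYTES() is because every CBR has both its
 * control block and its CBE copied to the save area -- see the paired
 * gru_copy_handle() calls per CBR in gru_load/unload_context_data()
 * (grumain.c).
 */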
472
473/* Convert a user CB number to the actual CBRNUM */
474#define thread_cbr_number(gts, n) ((gts)->ts_cbr_idx[(n) / GRU_CBR_AU_SIZE] \
475 * GRU_CBR_AU_SIZE + (n) % GRU_CBR_AU_SIZE)
476
477/* Convert a gid to a pointer to the GRU */
478#define GID_TO_GRU(gid) \
479 (gru_base[(gid) / GRU_CHIPLETS_PER_BLADE] ? \
480 (&gru_base[(gid) / GRU_CHIPLETS_PER_BLADE]-> \
481 bs_grus[(gid) % GRU_CHIPLETS_PER_BLADE]) : \
482 NULL)
483
484/* Scan all active GRUs in a GRU bitmap */
485#define for_each_gru_in_bitmap(gid, map) \
486 for ((gid) = find_first_bit((map), GRU_MAX_GRUS); (gid) < GRU_MAX_GRUS;\
487 (gid)++, (gid) = find_next_bit((map), GRU_MAX_GRUS, (gid)))
488
489/* Scan all active GRUs on a specific blade */
490#define for_each_gru_on_blade(gru, nid, i) \
491 for ((gru) = gru_base[nid]->bs_grus, (i) = 0; \
492 (i) < GRU_CHIPLETS_PER_BLADE; \
493 (i)++, (gru)++)
494
495/* Scan all active GTSs on a gru. Note: must hold gs_lock to use this macro. */
496#define for_each_gts_on_gru(gts, gru, ctxnum) \
497 for ((ctxnum) = 0; (ctxnum) < GRU_NUM_CCH; (ctxnum)++) \
498 if (((gts) = (gru)->gs_gts[ctxnum]))
499
500/* Scan each CBR whose bit is set in a TFM (or copy of) */
501#define for_each_cbr_in_tfm(i, map) \
502 for ((i) = find_first_bit(map, GRU_NUM_CBE); \
503 (i) < GRU_NUM_CBE; \
504 (i)++, (i) = find_next_bit(map, GRU_NUM_CBE, i))
505
506/* Scan each CBR in a CBR bitmap. Note: multiple CBRs in an allocation unit */
507#define for_each_cbr_in_allocation_map(i, map, k) \
508 for ((k) = find_first_bit(map, GRU_CBR_AU); (k) < GRU_CBR_AU; \
509 (k) = find_next_bit(map, GRU_CBR_AU, (k) + 1)) \
510 for ((i) = (k)*GRU_CBR_AU_SIZE; \
511 (i) < ((k) + 1) * GRU_CBR_AU_SIZE; (i)++)
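/*
 * Example (illustrative, assuming GRU_CBR_AU_SIZE == 2): with bits 0 and 3
 * set in the allocation map, the nested loops visit CBR numbers 0, 1, 6
 * and 7, i.e. every CBR inside each allocated AU.
 */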
512
513/* Scan each DSR in a DSR bitmap. Note: multiple DSRs in an allocation unit */
514#define for_each_dsr_in_allocation_map(i, map, k) \
515 for ((k) = find_first_bit((const unsigned long *)map, GRU_DSR_AU);\
516 (k) < GRU_DSR_AU; \
517 (k) = find_next_bit((const unsigned long *)map, \
518 GRU_DSR_AU, (k) + 1)) \
519 for ((i) = (k) * GRU_DSR_AU_CL; \
520 (i) < ((k) + 1) * GRU_DSR_AU_CL; (i)++)
521
522#define gseg_physical_address(gru, ctxnum) \
523 ((gru)->gs_gru_base_paddr + ctxnum * GRU_GSEG_STRIDE)
524#define gseg_virtual_address(gru, ctxnum) \
525 ((gru)->gs_gru_base_vaddr + ctxnum * GRU_GSEG_STRIDE)
526
527/*-----------------------------------------------------------------------------
528 * Lock / Unlock GRU handles
529 * Use the "delresp" bit in the handle as a "lock" bit.
530 */
531
532/* Lock hierarchy checking enabled only in emulator */
533
534static inline void __lock_handle(void *h)
535{
536 while (test_and_set_bit(1, h))
537 cpu_relax();
538}
539
540static inline void __unlock_handle(void *h)
541{
542 clear_bit(1, h);
543}
544
545static inline void lock_cch_handle(struct gru_context_configuration_handle *cch)
546{
547 __lock_handle(cch);
548}
549
550static inline void unlock_cch_handle(struct gru_context_configuration_handle
551 *cch)
552{
553 __unlock_handle(cch);
554}
555
556static inline void lock_tgh_handle(struct gru_tlb_global_handle *tgh)
557{
558 __lock_handle(tgh);
559}
560
561static inline void unlock_tgh_handle(struct gru_tlb_global_handle *tgh)
562{
563 __unlock_handle(tgh);
564}
565
566/*-----------------------------------------------------------------------------
567 * Function prototypes & externs
568 */
569struct gru_unload_context_req;
570
571extern struct vm_operations_struct gru_vm_ops;
572extern struct device *grudev;
573
574extern struct gru_vma_data *gru_alloc_vma_data(struct vm_area_struct *vma,
575 int tsid);
576extern struct gru_thread_state *gru_find_thread_state(struct vm_area_struct
577 *vma, int tsid);
578extern struct gru_thread_state *gru_alloc_thread_state(struct vm_area_struct
579 *vma, int tsid);
580extern void gru_unload_context(struct gru_thread_state *gts, int savestate);
581extern void gts_drop(struct gru_thread_state *gts);
582extern void gru_tgh_flush_init(struct gru_state *gru);
583extern int gru_kservices_init(struct gru_state *gru);
584extern irqreturn_t gru_intr(int irq, void *dev_id);
585extern int gru_handle_user_call_os(unsigned long address);
586extern int gru_user_flush_tlb(unsigned long arg);
587extern int gru_user_unload_context(unsigned long arg);
588extern int gru_get_exception_detail(unsigned long arg);
589extern int gru_set_task_slice(long address);
590extern int gru_cpu_fault_map_id(void);
591extern struct vm_area_struct *gru_find_vma(unsigned long vaddr);
592extern void gru_flush_all_tlb(struct gru_state *gru);
593extern int gru_proc_init(void);
594extern void gru_proc_exit(void);
595
596extern unsigned long gru_reserve_cb_resources(struct gru_state *gru,
597 int cbr_au_count, char *cbmap);
598extern unsigned long gru_reserve_ds_resources(struct gru_state *gru,
599 int dsr_au_count, char *dsmap);
600extern int gru_fault(struct vm_area_struct *, struct vm_fault *vmf);
601extern struct gru_mm_struct *gru_register_mmu_notifier(void);
602extern void gru_drop_mmu_notifier(struct gru_mm_struct *gms);
603
604extern void gru_flush_tlb_range(struct gru_mm_struct *gms, unsigned long start,
605 unsigned long len);
606
607extern unsigned long gru_options;
608
609#endif /* __GRUTABLES_H__ */
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
new file mode 100644
index 000000000000..c84496a77691
--- /dev/null
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -0,0 +1,371 @@
1/*
2 * SN Platform GRU Driver
3 *
4 * MMUOPS callbacks + TLB flushing
5 *
6 * This file handles mmu notifier callbacks from the core kernel. The callbacks
7 * are used to update the TLB in the GRU as a result of changes in the
8 * state of a process address space. This file also handles TLB invalidates
9 * from the GRU driver.
10 *
11 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26 */
27
28#include <linux/kernel.h>
29#include <linux/list.h>
30#include <linux/spinlock.h>
31#include <linux/mm.h>
32#include <linux/slab.h>
33#include <linux/device.h>
34#include <linux/hugetlb.h>
35#include <linux/delay.h>
36#include <linux/timex.h>
37#include <linux/srcu.h>
38#include <asm/processor.h>
39#include "gru.h"
40#include "grutables.h"
41#include <asm/uv/uv_hub.h>
42
43#define gru_random() get_cycles()
44
45/* ---------------------------------- TLB Invalidation functions --------
46 * get_tgh_handle
47 *
48 * Find a TGH to use for issuing a TLB invalidate. For GRUs that are on the
49 * local blade, use a fixed TGH that is a function of the blade-local cpu
50 * number. Normally, this TGH is private to the cpu & no contention occurs for
51 * the TGH. For offblade GRUs, select a random TGH in the range above the
52 * private TGHs. A spinlock is required to access this TGH & the lock must be
53 * released when the invalidate completes. This sucks, but it is the best we
54 * can do.
55 *
56 * Note that the spinlock is IN the TGH handle so locking does not involve
57 * additional cache lines.
58 *
59 */
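/*
 * Concrete example (added): with MAX_LOCAL_TGH == 16 and 24 TGHs per
 * chiplet (see gru_tgh_flush_init() at the end of this file), a blade with
 * at most 16 cpus gives each cpu a private TGH in 0..15, while TGHs 16..23
 * are shared by off-blade flushers and chosen at random.
 */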
60static inline int get_off_blade_tgh(struct gru_state *gru)
61{
62 int n;
63
64 n = GRU_NUM_TGH - gru->gs_tgh_first_remote;
65 n = gru_random() % n;
66 n += gru->gs_tgh_first_remote;
67 return n;
68}
69
70static inline int get_on_blade_tgh(struct gru_state *gru)
71{
72 return uv_blade_processor_id() >> gru->gs_tgh_local_shift;
73}
74
75static struct gru_tlb_global_handle *get_lock_tgh_handle(struct gru_state
76 *gru)
77{
78 struct gru_tlb_global_handle *tgh;
79 int n;
80
81 preempt_disable();
82 if (uv_numa_blade_id() == gru->gs_blade_id)
83 n = get_on_blade_tgh(gru);
84 else
85 n = get_off_blade_tgh(gru);
86 tgh = get_tgh_by_index(gru, n);
87 lock_tgh_handle(tgh);
88
89 return tgh;
90}
91
92static void get_unlock_tgh_handle(struct gru_tlb_global_handle *tgh)
93{
94 unlock_tgh_handle(tgh);
95 preempt_enable();
96}
97
98/*
99 * gru_flush_tlb_range
100 *
101 * General purpose TLB invalidation function. This function scans every GRU in
102 * the ENTIRE system (partition) looking for GRUs where the specified MM has
103 * been accessed by the GRU. For each GRU found, the TLB must be invalidated OR
104 * the ASID invalidated. Invalidating an ASID causes a new ASID to be assigned
105 * on the next fault. This effectively flushes the ENTIRE TLB for the MM at the
106 * cost of (possibly) a large number of future TLBmisses.
107 *
108 * The current algorithm is optimized based on the following (somewhat true)
109 * assumptions:
110 * - GRU contexts are not loaded into a GRU unless a reference is made to
111 * the data segment or control block (this is true, not an assumption).
112 * If a DS/CB is referenced, the user will also issue instructions that
113 * cause TLBmisses. It is not necessary to optimize for the case where
114 * contexts are loaded but no instructions cause TLB misses. (I know
115 * this will happen but I'm not optimizing for it).
116 * - GRU instructions to invalidate TLB entries are SLOOOOWWW - normally
117 * a few usec but in unusual cases, it could be longer. Avoid if
118 * possible.
119 * - intrablade process migration between cpus is not frequent but is
120 * common.
121 * - a GRU context is not typically migrated to a different GRU on the
122 * blade because of intrablade migration
123 * - interblade migration is rare. Processes migrate their GRU context to
124 * the new blade.
125 * - if interblade migration occurs, migration back to the original blade
126 * is very very rare (ie., no optimization for this case)
127 * - most GRU instructions operate on a subset of the user REGIONS. Code
128 * & shared library regions are not likely targets of GRU instructions.
129 *
130 * To help improve the efficiency of TLB invalidation, the GMS data
131 * structure is maintained for EACH address space (MM struct). The GMS is
132 * also the structure that contains the pointer to the mmu callout
133 * functions. This structure is linked to the mm_struct for the address space
134 * using the mmu "register" function. The mmu interfaces are used to
135 * provide the callbacks for TLB invalidation. The GMS contains:
136 *
137 * - asid[maxgrus] array. ASIDs are assigned to a GRU when a context is
138 * loaded into the GRU.
139 * - asidmap[maxgrus]. bitmap to make it easier to find non-zero asids in
140 * the above array
141 * - ctxbitmap[maxgrus]. Indicates the contexts that are currently active
142 * in the GRU for the address space. This bitmap must be passed to the
143 * GRU to do an invalidate.
144 *
145 * The current algorithm for invalidating TLBs is:
146 * - scan the asidmap for GRUs where the context has been loaded, ie,
147 * asid is non-zero.
148 * - for each gru found:
149 * - if the ctxtmap is non-zero, there are active contexts in the
150 * GRU. TLB invalidate instructions must be issued to the GRU.
151 * - if the ctxtmap is zero, no context is active. Set the ASID to
152 * zero to force a full TLB invalidation. This is fast but will
153 * cause a lot of TLB misses if the context is reloaded onto the
154 * GRU
155 *
156 */
157
158void gru_flush_tlb_range(struct gru_mm_struct *gms, unsigned long start,
159 unsigned long len)
160{
161 struct gru_state *gru;
162 struct gru_mm_tracker *asids;
163 struct gru_tlb_global_handle *tgh;
164 unsigned long num;
165 int grupagesize, pagesize, pageshift, gid, asid;
166
167 /* ZZZ TODO - handle huge pages */
168 pageshift = PAGE_SHIFT;
169 pagesize = (1UL << pageshift);
170 grupagesize = GRU_PAGESIZE(pageshift);
171 num = min(((len + pagesize - 1) >> pageshift), GRUMAXINVAL);
172
173 STAT(flush_tlb);
174 gru_dbg(grudev, "gms %p, start 0x%lx, len 0x%lx, asidmap 0x%lx\n", gms,
175 start, len, gms->ms_asidmap[0]);
176
177 spin_lock(&gms->ms_asid_lock);
178 for_each_gru_in_bitmap(gid, gms->ms_asidmap) {
179 STAT(flush_tlb_gru);
180 gru = GID_TO_GRU(gid);
181 asids = gms->ms_asids + gid;
182 asid = asids->mt_asid;
183 if (asids->mt_ctxbitmap && asid) {
184 STAT(flush_tlb_gru_tgh);
185 asid = GRUASID(asid, start);
186 gru_dbg(grudev,
187 " FLUSH gruid %d, asid 0x%x, num %ld, cbmap 0x%x\n",
188 gid, asid, num, asids->mt_ctxbitmap);
189 tgh = get_lock_tgh_handle(gru);
190 tgh_invalidate(tgh, start, 0, asid, grupagesize, 0,
191 num - 1, asids->mt_ctxbitmap);
192 get_unlock_tgh_handle(tgh);
193 } else {
194 STAT(flush_tlb_gru_zero_asid);
195 asids->mt_asid = 0;
196 __clear_bit(gru->gs_gid, gms->ms_asidmap);
197 gru_dbg(grudev,
198 " CLEARASID gruid %d, asid 0x%x, cbtmap 0x%x, asidmap 0x%lx\n",
199 gid, asid, asids->mt_ctxbitmap,
200 gms->ms_asidmap[0]);
201 }
202 }
203 spin_unlock(&gms->ms_asid_lock);
204}
205
206/*
207 * Flush the entire TLB on a chiplet.
208 */
209void gru_flush_all_tlb(struct gru_state *gru)
210{
211 struct gru_tlb_global_handle *tgh;
212
213 gru_dbg(grudev, "gru %p, gid %d\n", gru, gru->gs_gid);
214 tgh = get_lock_tgh_handle(gru);
215 tgh_invalidate(tgh, 0, ~0, 0, 1, 1, GRUMAXINVAL - 1, 0);
216 get_unlock_tgh_handle(tgh);
217 preempt_enable();
218}
219
220/*
221 * MMUOPS notifier callout functions
222 */
223static void gru_invalidate_range_start(struct mmu_notifier *mn,
224 struct mm_struct *mm,
225 unsigned long start, unsigned long end)
226{
227 struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
228 ms_notifier);
229
230 STAT(mmu_invalidate_range);
231 atomic_inc(&gms->ms_range_active);
232 gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, act %d\n", gms,
233 start, end, atomic_read(&gms->ms_range_active));
234 gru_flush_tlb_range(gms, start, end - start);
235}
236
237static void gru_invalidate_range_end(struct mmu_notifier *mn,
238 struct mm_struct *mm, unsigned long start,
239 unsigned long end)
240{
241 struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
242 ms_notifier);
243
244 /* ..._and_test() provides needed barrier */
245 (void)atomic_dec_and_test(&gms->ms_range_active);
246
247 wake_up_all(&gms->ms_wait_queue);
248 gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx\n", gms, start, end);
249}
250
251static void gru_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm,
252 unsigned long address)
253{
254 struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
255 ms_notifier);
256
257 STAT(mmu_invalidate_page);
258 gru_flush_tlb_range(gms, address, PAGE_SIZE);
259 gru_dbg(grudev, "gms %p, address 0x%lx\n", gms, address);
260}
261
262static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
263{
264 struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
265 ms_notifier);
266
267 gms->ms_released = 1;
268 gru_dbg(grudev, "gms %p\n", gms);
269}
270
271
272static const struct mmu_notifier_ops gru_mmuops = {
273 .invalidate_page = gru_invalidate_page,
274 .invalidate_range_start = gru_invalidate_range_start,
275 .invalidate_range_end = gru_invalidate_range_end,
276 .release = gru_release,
277};
278
279/* Move this to the basic mmu_notifier file. But for now... */
280static struct mmu_notifier *mmu_find_ops(struct mm_struct *mm,
281 const struct mmu_notifier_ops *ops)
282{
283 struct mmu_notifier *mn, *gru_mn = NULL;
284 struct hlist_node *n;
285
286 if (mm->mmu_notifier_mm) {
287 rcu_read_lock();
288 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list,
289 hlist)
290 if (mn->ops == ops) {
291 gru_mn = mn;
292 break;
293 }
294 rcu_read_unlock();
295 }
296 return gru_mn;
297}
298
299struct gru_mm_struct *gru_register_mmu_notifier(void)
300{
301 struct gru_mm_struct *gms;
302 struct mmu_notifier *mn;
303
304 mn = mmu_find_ops(current->mm, &gru_mmuops);
305 if (mn) {
306 gms = container_of(mn, struct gru_mm_struct, ms_notifier);
307 atomic_inc(&gms->ms_refcnt);
308 } else {
309 gms = kzalloc(sizeof(*gms), GFP_KERNEL);
310 if (gms) {
311 spin_lock_init(&gms->ms_asid_lock);
312 gms->ms_notifier.ops = &gru_mmuops;
313 atomic_set(&gms->ms_refcnt, 1);
314 init_waitqueue_head(&gms->ms_wait_queue);
315 __mmu_notifier_register(&gms->ms_notifier, current->mm);
316 }
317 }
318 gru_dbg(grudev, "gms %p, refcnt %d\n", gms,
319 atomic_read(&gms->ms_refcnt));
320 return gms;
321}
322
323void gru_drop_mmu_notifier(struct gru_mm_struct *gms)
324{
325 gru_dbg(grudev, "gms %p, refcnt %d, released %d\n", gms,
326 atomic_read(&gms->ms_refcnt), gms->ms_released);
327 if (atomic_dec_return(&gms->ms_refcnt) == 0) {
328 if (!gms->ms_released)
329 mmu_notifier_unregister(&gms->ms_notifier, current->mm);
330 kfree(gms);
331 }
332}
333
334/*
335 * Setup TGH parameters. There are:
336 * - 24 TGH handles per GRU chiplet
337 * - a portion (MAX_LOCAL_TGH) of the handles are reserved for
338 * use by blade-local cpus
339 * - the rest are used by off-blade cpus. This usage is
340 * less frequent than blade-local usage.
341 *
342 * For now, use 16 handles for local flushes, 8 for remote flushes. If the blade
343 * has less than or equal to 16 cpus, each cpu has a unique handle that it can
344 * use.
345 */
346#define MAX_LOCAL_TGH 16
347
348void gru_tgh_flush_init(struct gru_state *gru)
349{
350 int cpus, shift = 0, n;
351
352 cpus = uv_blade_nr_possible_cpus(gru->gs_blade_id);
353
354 /* n = cpus rounded up to next power of 2 */
355 if (cpus) {
356 n = 1 << fls(cpus - 1);
357
358 /*
359 * shift count for converting local cpu# to TGH index
360 * 0 if cpus <= MAX_LOCAL_TGH,
361 * 1 if cpus <= 2*MAX_LOCAL_TGH,
362 * etc
363 */
364 shift = max(0, fls(n - 1) - fls(MAX_LOCAL_TGH - 1));
365 }
366 gru->gs_tgh_local_shift = shift;
367
368 /* first starting TGH index to use for remote purges */
369 gru->gs_tgh_first_remote = (cpus + (1 << shift) - 1) >> shift;
370
371}
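A minimal userspace sketch of the index math above, assuming a hypothetical blade with 24 possible cpus; uv_blade_nr_possible_cpus() is replaced by a constant and the kernel's fls() by an open-coded helper, so this is illustrative only:

#include <stdio.h>

#define MAX_LOCAL_TGH   16

/* stand-in for the kernel's fls(): 1-based index of the most significant set bit */
static int fls_bit(unsigned int x)
{
        int r = 0;

        while (x) {
                r++;
                x >>= 1;
        }
        return r;
}

int main(void)
{
        int cpus = 24;                  /* hypothetical uv_blade_nr_possible_cpus() */
        int n = 1 << fls_bit(cpus - 1); /* cpus rounded up to a power of 2 -> 32 */
        int shift = fls_bit(n - 1) - fls_bit(MAX_LOCAL_TGH - 1);
        int first_remote;

        if (shift < 0)
                shift = 0;      /* blades with <= 16 cpus keep a 1:1 cpu->TGH mapping */
        first_remote = (cpus + (1 << shift) - 1) >> shift;

        /* prints "local_shift=1 first_remote=12" */
        printf("local_shift=%d first_remote=%d\n", shift, first_remote);
        return 0;
}

With 24 cpus rounded up to 32, two cpus share each of the 16 local handles, and remote purges start at TGH index 12.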
diff --git a/drivers/misc/sgi-xp/Makefile b/drivers/misc/sgi-xp/Makefile
index b6e40a7958ce..35ce28578075 100644
--- a/drivers/misc/sgi-xp/Makefile
+++ b/drivers/misc/sgi-xp/Makefile
@@ -3,9 +3,17 @@
3# 3#
4 4
5obj-$(CONFIG_SGI_XP) += xp.o 5obj-$(CONFIG_SGI_XP) += xp.o
6xp-y := xp_main.o xp_nofault.o 6xp-y := xp_main.o
7xp-$(CONFIG_IA64_SGI_SN2) += xp_sn2.o xp_nofault.o
8xp-$(CONFIG_IA64_GENERIC) += xp_sn2.o xp_nofault.o xp_uv.o
9xp-$(CONFIG_IA64_SGI_UV) += xp_uv.o
10xp-$(CONFIG_X86_64) += xp_uv.o
7 11
8obj-$(CONFIG_SGI_XP) += xpc.o 12obj-$(CONFIG_SGI_XP) += xpc.o
9xpc-y := xpc_main.o xpc_channel.o xpc_partition.o 13xpc-y := xpc_main.o xpc_channel.o xpc_partition.o
14xpc-$(CONFIG_IA64_SGI_SN2) += xpc_sn2.o
15xpc-$(CONFIG_IA64_GENERIC) += xpc_sn2.o xpc_uv.o
16xpc-$(CONFIG_IA64_SGI_UV) += xpc_uv.o
17xpc-$(CONFIG_X86_64) += xpc_uv.o
10 18
11obj-$(CONFIG_SGI_XP) += xpnet.o 19obj-$(CONFIG_SGI_XP) += xpnet.o
diff --git a/drivers/misc/sgi-xp/xp.h b/drivers/misc/sgi-xp/xp.h
index 03a87a307e32..859a5281c61b 100644
--- a/drivers/misc/sgi-xp/xp.h
+++ b/drivers/misc/sgi-xp/xp.h
@@ -13,11 +13,34 @@
13#ifndef _DRIVERS_MISC_SGIXP_XP_H 13#ifndef _DRIVERS_MISC_SGIXP_XP_H
14#define _DRIVERS_MISC_SGIXP_XP_H 14#define _DRIVERS_MISC_SGIXP_XP_H
15 15
16#include <linux/cache.h>
17#include <linux/hardirq.h>
18#include <linux/mutex.h> 16#include <linux/mutex.h>
19#include <asm/sn/types.h> 17
20#include <asm/sn/bte.h> 18#ifdef CONFIG_IA64
19#include <asm/system.h>
20#include <asm/sn/arch.h> /* defines is_shub1() and is_shub2() */
21#define is_shub() ia64_platform_is("sn2")
22#define is_uv() ia64_platform_is("uv")
23#endif
24#ifdef CONFIG_X86_64
25#include <asm/genapic.h>
26#define is_uv() is_uv_system()
27#endif
28
29#ifndef is_shub1
30#define is_shub1() 0
31#endif
32
33#ifndef is_shub2
34#define is_shub2() 0
35#endif
36
37#ifndef is_shub
38#define is_shub() 0
39#endif
40
41#ifndef is_uv
42#define is_uv() 0
43#endif
21 44
22#ifdef USE_DBUG_ON 45#ifdef USE_DBUG_ON
23#define DBUG_ON(condition) BUG_ON(condition) 46#define DBUG_ON(condition) BUG_ON(condition)
@@ -26,133 +49,56 @@
26#endif 49#endif
27 50
28/* 51/*
29 * Define the maximum number of logically defined partitions the system 52 * Define the maximum number of partitions the system can possibly support.
30 * can support. It is constrained by the maximum number of hardware 53 * It is based on the maximum number of hardware partitionable regions. The
31 * partitionable regions. The term 'region' in this context refers to the 54 * term 'region' in this context refers to the minimum number of nodes that
32 * minimum number of nodes that can comprise an access protection grouping. 55 * can comprise an access protection grouping. The access protection is in
33 * The access protection is in regards to memory, IPI and IOI. 56 * regards to memory, IPI and IOI.
34 * 57 *
35 * The maximum number of hardware partitionable regions is equal to the 58 * The maximum number of hardware partitionable regions is equal to the
36 * maximum number of nodes in the entire system divided by the minimum number 59 * maximum number of nodes in the entire system divided by the minimum number
37 * of nodes that comprise an access protection grouping. 60 * of nodes that comprise an access protection grouping.
38 */ 61 */
39#define XP_MAX_PARTITIONS 64 62#define XP_MAX_NPARTITIONS_SN2 64
40 63#define XP_MAX_NPARTITIONS_UV 256
41/*
42 * Define the number of u64s required to represent all the C-brick nasids
43 * as a bitmap. The cross-partition kernel modules deal only with
44 * C-brick nasids, thus the need for bitmaps which don't account for
45 * odd-numbered (non C-brick) nasids.
46 */
47#define XP_MAX_PHYSNODE_ID (MAX_NUMALINK_NODES / 2)
48#define XP_NASID_MASK_BYTES ((XP_MAX_PHYSNODE_ID + 7) / 8)
49#define XP_NASID_MASK_WORDS ((XP_MAX_PHYSNODE_ID + 63) / 64)
50
51/*
52 * Wrapper for bte_copy() that should it return a failure status will retry
53 * the bte_copy() once in the hope that the failure was due to a temporary
54 * aberration (i.e., the link going down temporarily).
55 *
56 * src - physical address of the source of the transfer.
57 * vdst - virtual address of the destination of the transfer.
58 * len - number of bytes to transfer from source to destination.
59 * mode - see bte_copy() for definition.
60 * notification - see bte_copy() for definition.
61 *
62 * Note: xp_bte_copy() should never be called while holding a spinlock.
63 */
64static inline bte_result_t
65xp_bte_copy(u64 src, u64 vdst, u64 len, u64 mode, void *notification)
66{
67 bte_result_t ret;
68 u64 pdst = ia64_tpa(vdst);
69
70 /*
71 * Ensure that the physically mapped memory is contiguous.
72 *
73 * We do this by ensuring that the memory is from region 7 only.
74 * If the need should arise to use memory from one of the other
75 * regions, then modify the BUG_ON() statement to ensure that the
76 * memory from that region is always physically contiguous.
77 */
78 BUG_ON(REGION_NUMBER(vdst) != RGN_KERNEL);
79
80 ret = bte_copy(src, pdst, len, mode, notification);
81 if ((ret != BTE_SUCCESS) && BTE_ERROR_RETRY(ret)) {
82 if (!in_interrupt())
83 cond_resched();
84
85 ret = bte_copy(src, pdst, len, mode, notification);
86 }
87
88 return ret;
89}
90 64
91/* 65/*
92 * XPC establishes channel connections between the local partition and any 66 * XPC establishes channel connections between the local partition and any
93 * other partition that is currently up. Over these channels, kernel-level 67 * other partition that is currently up. Over these channels, kernel-level
94 * `users' can communicate with their counterparts on the other partitions. 68 * `users' can communicate with their counterparts on the other partitions.
95 * 69 *
96 * The maxinum number of channels is limited to eight. For performance reasons,
97 * the internal cross partition structures require sixteen bytes per channel,
98 * and eight allows all of this interface-shared info to fit in one cache line.
99 *
100 * XPC_NCHANNELS reflects the total number of channels currently defined.
101 * If the need for additional channels arises, one can simply increase 70 * If the need for additional channels arises, one can simply increase
102 * XPC_NCHANNELS accordingly. If the day should come where that number 71 * XPC_MAX_NCHANNELS accordingly. If the day should come where that number
103 * exceeds the MAXIMUM number of channels allowed (eight), then one will need 72 * exceeds the absolute MAXIMUM number of channels possible (eight), then one
104 * to make changes to the XPC code to allow for this. 73 * will need to make changes to the XPC code to accommodate this.
74 *
75 * The absolute maximum number of channels possible is limited to eight for
76 * performance reasons on sn2 hardware. The internal cross partition structures
77 * require sixteen bytes per channel, and eight allows all of this
78 * interface-shared info to fit in one 128-byte cacheline.
105 */ 79 */
106#define XPC_MEM_CHANNEL 0 /* memory channel number */ 80#define XPC_MEM_CHANNEL 0 /* memory channel number */
107#define XPC_NET_CHANNEL 1 /* network channel number */ 81#define XPC_NET_CHANNEL 1 /* network channel number */
108 82
109#define XPC_NCHANNELS 2 /* #of defined channels */ 83#define XPC_MAX_NCHANNELS 2 /* max #of channels allowed */
110#define XPC_MAX_NCHANNELS 8 /* max #of channels allowed */
111 84
112#if XPC_NCHANNELS > XPC_MAX_NCHANNELS 85#if XPC_MAX_NCHANNELS > 8
113#error XPC_NCHANNELS exceeds MAXIMUM allowed. 86#error XPC_MAX_NCHANNELS exceeds absolute MAXIMUM possible.
114#endif 87#endif
115 88
116/* 89/*
117 * The format of an XPC message is as follows: 90 * A macro, XPC_MSG_SIZE(), is provided for the user
118 *
119 * +-------+--------------------------------+
120 * | flags |////////////////////////////////|
121 * +-------+--------------------------------+
122 * | message # |
123 * +----------------------------------------+
124 * | payload (user-defined message) |
125 * | |
126 * :
127 * | |
128 * +----------------------------------------+
129 *
130 * The size of the payload is defined by the user via xpc_connect(). A user-
131 * defined message resides in the payload area.
132 *
133 * The user should have no dealings with the message header, but only the
134 * message's payload. When a message entry is allocated (via xpc_allocate())
135 * a pointer to the payload area is returned and not the actual beginning of
136 * the XPC message. The user then constructs a message in the payload area
137 * and passes that pointer as an argument on xpc_send() or xpc_send_notify().
138 *
139 * The size of a message entry (within a message queue) must be a cacheline
140 * sized multiple in order to facilitate the BTE transfer of messages from one
141 * message queue to another. A macro, XPC_MSG_SIZE(), is provided for the user
142 * that wants to fit as many msg entries as possible in a given memory size 91 * that wants to fit as many msg entries as possible in a given memory size
143 * (e.g. a memory page). 92 * (e.g. a memory page).
144 */ 93 */
145struct xpc_msg { 94#define XPC_MSG_MAX_SIZE 128
146 u8 flags; /* FOR XPC INTERNAL USE ONLY */ 95#define XPC_MSG_HDR_MAX_SIZE 16
147 u8 reserved[7]; /* FOR XPC INTERNAL USE ONLY */ 96#define XPC_MSG_PAYLOAD_MAX_SIZE (XPC_MSG_MAX_SIZE - XPC_MSG_HDR_MAX_SIZE)
148 s64 number; /* FOR XPC INTERNAL USE ONLY */
149
150 u64 payload; /* user defined portion of message */
151};
152 97
153#define XPC_MSG_PAYLOAD_OFFSET (u64) (&((struct xpc_msg *)0)->payload)
154#define XPC_MSG_SIZE(_payload_size) \ 98#define XPC_MSG_SIZE(_payload_size) \
155 L1_CACHE_ALIGN(XPC_MSG_PAYLOAD_OFFSET + (_payload_size)) 99 ALIGN(XPC_MSG_HDR_MAX_SIZE + (_payload_size), \
100 is_uv() ? 64 : 128)
101
156 102
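A small sketch of what the XPC_MSG_SIZE() rounding above means in practice, assuming a hypothetical 40-byte payload; is_uv() is modelled here as a plain macro parameter so both cases can be evaluated in one program:

#include <stdio.h>

#define ALIGN(x, a)             (((x) + (a) - 1) & ~((unsigned long)(a) - 1))
#define XPC_MSG_HDR_MAX_SIZE    16
#define XPC_MSG_MAX_SIZE        128

/* is_uv is an assumption standing in for the kernel's is_uv() check */
#define MSG_SIZE(_payload_size, is_uv) \
        ALIGN(XPC_MSG_HDR_MAX_SIZE + (_payload_size), (is_uv) ? 64UL : 128UL)

int main(void)
{
        unsigned long payload = 40;     /* hypothetical payload size given to xpc_connect() */
        unsigned long uv_slot = MSG_SIZE(payload, 1);   /* 16 + 40 rounds up to 64  */
        unsigned long sn2_slot = MSG_SIZE(payload, 0);  /* 16 + 40 rounds up to 128 */

        /* both stay within XPC_MSG_MAX_SIZE, so xpc_connect() would accept this payload */
        printf("uv: %lu bytes/slot (%lu per 4K page)\n", uv_slot, 4096 / uv_slot);
        printf("sn2: %lu bytes/slot (%lu per 4K page)\n", sn2_slot, 4096 / sn2_slot);
        return 0;
}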
157/* 103/*
158 * Define the return values and values passed to user's callout functions. 104 * Define the return values and values passed to user's callout functions.
@@ -233,8 +179,20 @@ enum xp_retval {
233 xpDisconnected, /* 51: channel disconnected (closed) */ 179 xpDisconnected, /* 51: channel disconnected (closed) */
234 180
235 xpBteCopyError, /* 52: bte_copy() returned error */ 181 xpBteCopyError, /* 52: bte_copy() returned error */
182 xpSalError, /* 53: sn SAL error */
183 xpRsvdPageNotSet, /* 54: the reserved page is not set up */
184 xpPayloadTooBig, /* 55: payload too large for message slot */
185
186 xpUnsupported, /* 56: unsupported functionality or resource */
187 xpNeedMoreInfo, /* 57: more info is needed by SAL */
236 188
237 xpUnknownReason /* 53: unknown reason - must be last in enum */ 189 xpGruCopyError, /* 58: gru_copy_gpa() returned error */
190 xpGruSendMqError, /* 59: gru send message queue related error */
191
192 xpBadChannelNumber, /* 60: invalid channel number */
193 xpBadMsgType, /* 61: invalid message type */
194
195 xpUnknownReason /* 62: unknown reason - must be last in enum */
238}; 196};
239 197
240/* 198/*
@@ -285,6 +243,9 @@ typedef void (*xpc_channel_func) (enum xp_retval reason, short partid,
285 * calling xpc_received(). 243 * calling xpc_received().
286 * 244 *
287 * All other reason codes indicate failure. 245 * All other reason codes indicate failure.
246 *
247 * NOTE: The user defined function must be callable by an interrupt handler
248 * and thus cannot block.
288 */ 249 */
289typedef void (*xpc_notify_func) (enum xp_retval reason, short partid, 250typedef void (*xpc_notify_func) (enum xp_retval reason, short partid,
290 int ch_number, void *key); 251 int ch_number, void *key);
@@ -308,23 +269,22 @@ struct xpc_registration {
308 xpc_channel_func func; /* function to call */ 269 xpc_channel_func func; /* function to call */
309 void *key; /* pointer to user's key */ 270 void *key; /* pointer to user's key */
310 u16 nentries; /* #of msg entries in local msg queue */ 271 u16 nentries; /* #of msg entries in local msg queue */
311 u16 msg_size; /* message queue's message size */ 272 u16 entry_size; /* message queue's message entry size */
312 u32 assigned_limit; /* limit on #of assigned kthreads */ 273 u32 assigned_limit; /* limit on #of assigned kthreads */
313 u32 idle_limit; /* limit on #of idle kthreads */ 274 u32 idle_limit; /* limit on #of idle kthreads */
314} ____cacheline_aligned; 275} ____cacheline_aligned;
315 276
316#define XPC_CHANNEL_REGISTERED(_c) (xpc_registrations[_c].func != NULL) 277#define XPC_CHANNEL_REGISTERED(_c) (xpc_registrations[_c].func != NULL)
317 278
318/* the following are valid xpc_allocate() flags */ 279/* the following are valid xpc_send() or xpc_send_notify() flags */
319#define XPC_WAIT 0 /* wait flag */ 280#define XPC_WAIT 0 /* wait flag */
320#define XPC_NOWAIT 1 /* no wait flag */ 281#define XPC_NOWAIT 1 /* no wait flag */
321 282
322struct xpc_interface { 283struct xpc_interface {
323 void (*connect) (int); 284 void (*connect) (int);
324 void (*disconnect) (int); 285 void (*disconnect) (int);
325 enum xp_retval (*allocate) (short, int, u32, void **); 286 enum xp_retval (*send) (short, int, u32, void *, u16);
326 enum xp_retval (*send) (short, int, void *); 287 enum xp_retval (*send_notify) (short, int, u32, void *, u16,
327 enum xp_retval (*send_notify) (short, int, void *,
328 xpc_notify_func, void *); 288 xpc_notify_func, void *);
329 void (*received) (short, int, void *); 289 void (*received) (short, int, void *);
330 enum xp_retval (*partid_to_nasids) (short, void *); 290 enum xp_retval (*partid_to_nasids) (short, void *);
@@ -334,10 +294,9 @@ extern struct xpc_interface xpc_interface;
334 294
335extern void xpc_set_interface(void (*)(int), 295extern void xpc_set_interface(void (*)(int),
336 void (*)(int), 296 void (*)(int),
337 enum xp_retval (*)(short, int, u32, void **), 297 enum xp_retval (*)(short, int, u32, void *, u16),
338 enum xp_retval (*)(short, int, void *), 298 enum xp_retval (*)(short, int, u32, void *, u16,
339 enum xp_retval (*)(short, int, void *, 299 xpc_notify_func, void *),
340 xpc_notify_func, void *),
341 void (*)(short, int, void *), 300 void (*)(short, int, void *),
342 enum xp_retval (*)(short, void *)); 301 enum xp_retval (*)(short, void *));
343extern void xpc_clear_interface(void); 302extern void xpc_clear_interface(void);
@@ -347,22 +306,19 @@ extern enum xp_retval xpc_connect(int, xpc_channel_func, void *, u16,
347extern void xpc_disconnect(int); 306extern void xpc_disconnect(int);
348 307
349static inline enum xp_retval 308static inline enum xp_retval
350xpc_allocate(short partid, int ch_number, u32 flags, void **payload) 309xpc_send(short partid, int ch_number, u32 flags, void *payload,
351{ 310 u16 payload_size)
352 return xpc_interface.allocate(partid, ch_number, flags, payload);
353}
354
355static inline enum xp_retval
356xpc_send(short partid, int ch_number, void *payload)
357{ 311{
358 return xpc_interface.send(partid, ch_number, payload); 312 return xpc_interface.send(partid, ch_number, flags, payload,
313 payload_size);
359} 314}
360 315
361static inline enum xp_retval 316static inline enum xp_retval
362xpc_send_notify(short partid, int ch_number, void *payload, 317xpc_send_notify(short partid, int ch_number, u32 flags, void *payload,
363 xpc_notify_func func, void *key) 318 u16 payload_size, xpc_notify_func func, void *key)
364{ 319{
365 return xpc_interface.send_notify(partid, ch_number, payload, func, key); 320 return xpc_interface.send_notify(partid, ch_number, flags, payload,
321 payload_size, func, key);
366} 322}
367 323
368static inline void 324static inline void
@@ -377,8 +333,23 @@ xpc_partid_to_nasids(short partid, void *nasids)
377 return xpc_interface.partid_to_nasids(partid, nasids); 333 return xpc_interface.partid_to_nasids(partid, nasids);
378} 334}
379 335
336extern short xp_max_npartitions;
337extern short xp_partition_id;
338extern u8 xp_region_size;
339
340extern unsigned long (*xp_pa) (void *);
341extern enum xp_retval (*xp_remote_memcpy) (unsigned long, const unsigned long,
342 size_t);
343extern int (*xp_cpu_to_nasid) (int);
344
380extern u64 xp_nofault_PIOR_target; 345extern u64 xp_nofault_PIOR_target;
381extern int xp_nofault_PIOR(void *); 346extern int xp_nofault_PIOR(void *);
382extern int xp_error_PIOR(void); 347extern int xp_error_PIOR(void);
383 348
349extern struct device *xp;
350extern enum xp_retval xp_init_sn2(void);
351extern enum xp_retval xp_init_uv(void);
352extern void xp_exit_sn2(void);
353extern void xp_exit_uv(void);
354
384#endif /* _DRIVERS_MISC_SGIXP_XP_H */ 355#endif /* _DRIVERS_MISC_SGIXP_XP_H */
diff --git a/drivers/misc/sgi-xp/xp_main.c b/drivers/misc/sgi-xp/xp_main.c
index 196480b691a1..66a1d19e08ad 100644
--- a/drivers/misc/sgi-xp/xp_main.c
+++ b/drivers/misc/sgi-xp/xp_main.c
@@ -14,29 +14,48 @@
14 * 14 *
15 */ 15 */
16 16
17#include <linux/kernel.h>
18#include <linux/interrupt.h>
19#include <linux/module.h> 17#include <linux/module.h>
20#include <linux/mutex.h> 18#include <linux/device.h>
21#include <asm/sn/intr.h>
22#include <asm/sn/sn_sal.h>
23#include "xp.h" 19#include "xp.h"
24 20
25/* 21/* define the XP debug device structures to be used with dev_dbg() et al */
26 * The export of xp_nofault_PIOR needs to happen here since it is defined 22
27 * in drivers/misc/sgi-xp/xp_nofault.S. The target of the nofault read is 23struct device_driver xp_dbg_name = {
28 * defined here. 24 .name = "xp"
29 */ 25};
30EXPORT_SYMBOL_GPL(xp_nofault_PIOR); 26
27struct device xp_dbg_subname = {
28 .bus_id = {0}, /* set to "" */
29 .driver = &xp_dbg_name
30};
31
32struct device *xp = &xp_dbg_subname;
33
34/* max #of partitions possible */
35short xp_max_npartitions;
36EXPORT_SYMBOL_GPL(xp_max_npartitions);
37
38short xp_partition_id;
39EXPORT_SYMBOL_GPL(xp_partition_id);
40
41u8 xp_region_size;
42EXPORT_SYMBOL_GPL(xp_region_size);
43
44unsigned long (*xp_pa) (void *addr);
45EXPORT_SYMBOL_GPL(xp_pa);
46
47enum xp_retval (*xp_remote_memcpy) (unsigned long dst_gpa,
48 const unsigned long src_gpa, size_t len);
49EXPORT_SYMBOL_GPL(xp_remote_memcpy);
31 50
32u64 xp_nofault_PIOR_target; 51int (*xp_cpu_to_nasid) (int cpuid);
33EXPORT_SYMBOL_GPL(xp_nofault_PIOR_target); 52EXPORT_SYMBOL_GPL(xp_cpu_to_nasid);
34 53
35/* 54/*
36 * xpc_registrations[] keeps track of xpc_connect()'s done by the kernel-level 55 * xpc_registrations[] keeps track of xpc_connect()'s done by the kernel-level
37 * users of XPC. 56 * users of XPC.
38 */ 57 */
39struct xpc_registration xpc_registrations[XPC_NCHANNELS]; 58struct xpc_registration xpc_registrations[XPC_MAX_NCHANNELS];
40EXPORT_SYMBOL_GPL(xpc_registrations); 59EXPORT_SYMBOL_GPL(xpc_registrations);
41 60
42/* 61/*
@@ -51,10 +70,9 @@ xpc_notloaded(void)
51struct xpc_interface xpc_interface = { 70struct xpc_interface xpc_interface = {
52 (void (*)(int))xpc_notloaded, 71 (void (*)(int))xpc_notloaded,
53 (void (*)(int))xpc_notloaded, 72 (void (*)(int))xpc_notloaded,
54 (enum xp_retval(*)(short, int, u32, void **))xpc_notloaded, 73 (enum xp_retval(*)(short, int, u32, void *, u16))xpc_notloaded,
55 (enum xp_retval(*)(short, int, void *))xpc_notloaded, 74 (enum xp_retval(*)(short, int, u32, void *, u16, xpc_notify_func,
56 (enum xp_retval(*)(short, int, void *, xpc_notify_func, void *)) 75 void *))xpc_notloaded,
57 xpc_notloaded,
58 (void (*)(short, int, void *))xpc_notloaded, 76 (void (*)(short, int, void *))xpc_notloaded,
59 (enum xp_retval(*)(short, void *))xpc_notloaded 77 (enum xp_retval(*)(short, void *))xpc_notloaded
60}; 78};
@@ -66,16 +84,14 @@ EXPORT_SYMBOL_GPL(xpc_interface);
66void 84void
67xpc_set_interface(void (*connect) (int), 85xpc_set_interface(void (*connect) (int),
68 void (*disconnect) (int), 86 void (*disconnect) (int),
69 enum xp_retval (*allocate) (short, int, u32, void **), 87 enum xp_retval (*send) (short, int, u32, void *, u16),
70 enum xp_retval (*send) (short, int, void *), 88 enum xp_retval (*send_notify) (short, int, u32, void *, u16,
71 enum xp_retval (*send_notify) (short, int, void *,
72 xpc_notify_func, void *), 89 xpc_notify_func, void *),
73 void (*received) (short, int, void *), 90 void (*received) (short, int, void *),
74 enum xp_retval (*partid_to_nasids) (short, void *)) 91 enum xp_retval (*partid_to_nasids) (short, void *))
75{ 92{
76 xpc_interface.connect = connect; 93 xpc_interface.connect = connect;
77 xpc_interface.disconnect = disconnect; 94 xpc_interface.disconnect = disconnect;
78 xpc_interface.allocate = allocate;
79 xpc_interface.send = send; 95 xpc_interface.send = send;
80 xpc_interface.send_notify = send_notify; 96 xpc_interface.send_notify = send_notify;
81 xpc_interface.received = received; 97 xpc_interface.received = received;
@@ -91,13 +107,11 @@ xpc_clear_interface(void)
91{ 107{
92 xpc_interface.connect = (void (*)(int))xpc_notloaded; 108 xpc_interface.connect = (void (*)(int))xpc_notloaded;
93 xpc_interface.disconnect = (void (*)(int))xpc_notloaded; 109 xpc_interface.disconnect = (void (*)(int))xpc_notloaded;
94 xpc_interface.allocate = (enum xp_retval(*)(short, int, u32, 110 xpc_interface.send = (enum xp_retval(*)(short, int, u32, void *, u16))
95 void **))xpc_notloaded;
96 xpc_interface.send = (enum xp_retval(*)(short, int, void *))
97 xpc_notloaded; 111 xpc_notloaded;
98 xpc_interface.send_notify = (enum xp_retval(*)(short, int, void *, 112 xpc_interface.send_notify = (enum xp_retval(*)(short, int, u32, void *,
99 xpc_notify_func, 113 u16, xpc_notify_func,
100 void *))xpc_notloaded; 114 void *))xpc_notloaded;
101 xpc_interface.received = (void (*)(short, int, void *)) 115 xpc_interface.received = (void (*)(short, int, void *))
102 xpc_notloaded; 116 xpc_notloaded;
103 xpc_interface.partid_to_nasids = (enum xp_retval(*)(short, void *)) 117 xpc_interface.partid_to_nasids = (enum xp_retval(*)(short, void *))
@@ -135,11 +149,14 @@ xpc_connect(int ch_number, xpc_channel_func func, void *key, u16 payload_size,
135{ 149{
136 struct xpc_registration *registration; 150 struct xpc_registration *registration;
137 151
138 DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS); 152 DBUG_ON(ch_number < 0 || ch_number >= XPC_MAX_NCHANNELS);
139 DBUG_ON(payload_size == 0 || nentries == 0); 153 DBUG_ON(payload_size == 0 || nentries == 0);
140 DBUG_ON(func == NULL); 154 DBUG_ON(func == NULL);
141 DBUG_ON(assigned_limit == 0 || idle_limit > assigned_limit); 155 DBUG_ON(assigned_limit == 0 || idle_limit > assigned_limit);
142 156
157 if (XPC_MSG_SIZE(payload_size) > XPC_MSG_MAX_SIZE)
158 return xpPayloadTooBig;
159
143 registration = &xpc_registrations[ch_number]; 160 registration = &xpc_registrations[ch_number];
144 161
145 if (mutex_lock_interruptible(&registration->mutex) != 0) 162 if (mutex_lock_interruptible(&registration->mutex) != 0)
@@ -152,7 +169,7 @@ xpc_connect(int ch_number, xpc_channel_func func, void *key, u16 payload_size,
152 } 169 }
153 170
154 /* register the channel for connection */ 171 /* register the channel for connection */
155 registration->msg_size = XPC_MSG_SIZE(payload_size); 172 registration->entry_size = XPC_MSG_SIZE(payload_size);
156 registration->nentries = nentries; 173 registration->nentries = nentries;
157 registration->assigned_limit = assigned_limit; 174 registration->assigned_limit = assigned_limit;
158 registration->idle_limit = idle_limit; 175 registration->idle_limit = idle_limit;
@@ -185,7 +202,7 @@ xpc_disconnect(int ch_number)
185{ 202{
186 struct xpc_registration *registration; 203 struct xpc_registration *registration;
187 204
188 DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS); 205 DBUG_ON(ch_number < 0 || ch_number >= XPC_MAX_NCHANNELS);
189 206
190 registration = &xpc_registrations[ch_number]; 207 registration = &xpc_registrations[ch_number];
191 208
@@ -206,7 +223,7 @@ xpc_disconnect(int ch_number)
206 registration->func = NULL; 223 registration->func = NULL;
207 registration->key = NULL; 224 registration->key = NULL;
208 registration->nentries = 0; 225 registration->nentries = 0;
209 registration->msg_size = 0; 226 registration->entry_size = 0;
210 registration->assigned_limit = 0; 227 registration->assigned_limit = 0;
211 registration->idle_limit = 0; 228 registration->idle_limit = 0;
212 229
@@ -221,39 +238,21 @@ EXPORT_SYMBOL_GPL(xpc_disconnect);
221int __init 238int __init
222xp_init(void) 239xp_init(void)
223{ 240{
224 int ret, ch_number; 241 enum xp_retval ret;
225 u64 func_addr = *(u64 *)xp_nofault_PIOR; 242 int ch_number;
226 u64 err_func_addr = *(u64 *)xp_error_PIOR;
227
228 if (!ia64_platform_is("sn2"))
229 return -ENODEV;
230 243
231 /* 244 if (is_shub())
232 * Register a nofault code region which performs a cross-partition 245 ret = xp_init_sn2();
233 * PIO read. If the PIO read times out, the MCA handler will consume 246 else if (is_uv())
234 * the error and return to a kernel-provided instruction to indicate 247 ret = xp_init_uv();
235 * an error. This PIO read exists because it is guaranteed to timeout
236 * if the destination is down (AMO operations do not timeout on at
237 * least some CPUs on Shubs <= v1.2, which unfortunately we have to
238 * work around).
239 */
240 ret = sn_register_nofault_code(func_addr, err_func_addr, err_func_addr,
241 1, 1);
242 if (ret != 0) {
243 printk(KERN_ERR "XP: can't register nofault code, error=%d\n",
244 ret);
245 }
246 /*
247 * Setup the nofault PIO read target. (There is no special reason why
248 * SH_IPI_ACCESS was selected.)
249 */
250 if (is_shub2())
251 xp_nofault_PIOR_target = SH2_IPI_ACCESS0;
252 else 248 else
253 xp_nofault_PIOR_target = SH1_IPI_ACCESS; 249 ret = xpUnsupported;
250
251 if (ret != xpSuccess)
252 return -ENODEV;
254 253
255 /* initialize the connection registration mutex */ 254 /* initialize the connection registration mutex */
256 for (ch_number = 0; ch_number < XPC_NCHANNELS; ch_number++) 255 for (ch_number = 0; ch_number < XPC_MAX_NCHANNELS; ch_number++)
257 mutex_init(&xpc_registrations[ch_number].mutex); 256 mutex_init(&xpc_registrations[ch_number].mutex);
258 257
259 return 0; 258 return 0;
@@ -264,12 +263,10 @@ module_init(xp_init);
264void __exit 263void __exit
265xp_exit(void) 264xp_exit(void)
266{ 265{
267 u64 func_addr = *(u64 *)xp_nofault_PIOR; 266 if (is_shub())
268 u64 err_func_addr = *(u64 *)xp_error_PIOR; 267 xp_exit_sn2();
269 268 else if (is_uv())
270 /* unregister the PIO read nofault code region */ 269 xp_exit_uv();
271 (void)sn_register_nofault_code(func_addr, err_func_addr,
272 err_func_addr, 1, 0);
273} 270}
274 271
275module_exit(xp_exit); 272module_exit(xp_exit);
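The reworked xp_init()/xp_exit() above simply pick a backend and let xp_sn2.c or xp_uv.c fill in the xp_pa/xp_remote_memcpy/xp_cpu_to_nasid hooks. A compact userspace model of that pattern, with invented stand-in backends purely for illustration:

#include <stdio.h>
#include <string.h>

/* function-pointer hooks the common code calls, as in xp.h */
static unsigned long (*xp_pa)(void *addr);
static int (*xp_cpu_to_nasid)(int cpuid);

/* hypothetical "sn2" backend */
static unsigned long sn2_pa(void *addr) { return (unsigned long)addr; }
static int sn2_cpu_to_nasid(int cpuid) { return cpuid * 2; }

/* hypothetical "uv" backend */
static unsigned long uv_pa(void *addr) { return (unsigned long)addr ^ 0x1000; }
static int uv_cpu_to_nasid(int cpuid) { return cpuid; }

static int xp_init(const char *platform)
{
        if (strcmp(platform, "sn2") == 0) {
                xp_pa = sn2_pa;
                xp_cpu_to_nasid = sn2_cpu_to_nasid;
        } else if (strcmp(platform, "uv") == 0) {
                xp_pa = uv_pa;
                xp_cpu_to_nasid = uv_cpu_to_nasid;
        } else {
                return -1;      /* the driver returns xpUnsupported here */
        }
        return 0;
}

int main(void)
{
        int dummy;

        if (xp_init("uv") != 0)
                return 1;
        /* callers are unchanged no matter which backend filled the hooks */
        printf("pa=0x%lx nasid=%d\n", xp_pa(&dummy), xp_cpu_to_nasid(3));
        return 0;
}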
diff --git a/drivers/misc/sgi-xp/xp_sn2.c b/drivers/misc/sgi-xp/xp_sn2.c
new file mode 100644
index 000000000000..1440134caf31
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp_sn2.c
@@ -0,0 +1,146 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9/*
10 * Cross Partition (XP) sn2-based functions.
11 *
12 * Architecture specific implementation of common functions.
13 */
14
15#include <linux/module.h>
16#include <linux/device.h>
17#include <asm/sn/bte.h>
18#include <asm/sn/sn_sal.h>
19#include "xp.h"
20
21/*
22 * The export of xp_nofault_PIOR needs to happen here since it is defined
23 * in drivers/misc/sgi-xp/xp_nofault.S. The target of the nofault read is
24 * defined here.
25 */
26EXPORT_SYMBOL_GPL(xp_nofault_PIOR);
27
28u64 xp_nofault_PIOR_target;
29EXPORT_SYMBOL_GPL(xp_nofault_PIOR_target);
30
31/*
32 * Register a nofault code region which performs a cross-partition PIO read.
33 * If the PIO read times out, the MCA handler will consume the error and
34 * return to a kernel-provided instruction to indicate an error. This PIO read
35 * exists because it is guaranteed to timeout if the destination is down
36 * (amo operations do not timeout on at least some CPUs on Shubs <= v1.2,
37 * which unfortunately we have to work around).
38 */
39static enum xp_retval
40xp_register_nofault_code_sn2(void)
41{
42 int ret;
43 u64 func_addr;
44 u64 err_func_addr;
45
46 func_addr = *(u64 *)xp_nofault_PIOR;
47 err_func_addr = *(u64 *)xp_error_PIOR;
48 ret = sn_register_nofault_code(func_addr, err_func_addr, err_func_addr,
49 1, 1);
50 if (ret != 0) {
51 dev_err(xp, "can't register nofault code, error=%d\n", ret);
52 return xpSalError;
53 }
54 /*
55 * Setup the nofault PIO read target. (There is no special reason why
56 * SH_IPI_ACCESS was selected.)
57 */
58 if (is_shub1())
59 xp_nofault_PIOR_target = SH1_IPI_ACCESS;
60 else if (is_shub2())
61 xp_nofault_PIOR_target = SH2_IPI_ACCESS0;
62
63 return xpSuccess;
64}
65
66static void
67xp_unregister_nofault_code_sn2(void)
68{
69 u64 func_addr = *(u64 *)xp_nofault_PIOR;
70 u64 err_func_addr = *(u64 *)xp_error_PIOR;
71
72 /* unregister the PIO read nofault code region */
73 (void)sn_register_nofault_code(func_addr, err_func_addr,
74 err_func_addr, 1, 0);
75}
76
77/*
78 * Convert a virtual memory address to a physical memory address.
79 */
80static unsigned long
81xp_pa_sn2(void *addr)
82{
83 return __pa(addr);
84}
85
86/*
87 * Wrapper for bte_copy().
88 *
89 * dst_pa - physical address of the destination of the transfer.
90 * src_pa - physical address of the source of the transfer.
91 * len - number of bytes to transfer from source to destination.
92 *
93 * Note: xp_remote_memcpy_sn2() should never be called while holding a spinlock.
94 */
95static enum xp_retval
96xp_remote_memcpy_sn2(unsigned long dst_pa, const unsigned long src_pa,
97 size_t len)
98{
99 bte_result_t ret;
100
101 ret = bte_copy(src_pa, dst_pa, len, (BTE_NOTIFY | BTE_WACQUIRE), NULL);
102 if (ret == BTE_SUCCESS)
103 return xpSuccess;
104
105 if (is_shub2()) {
106 dev_err(xp, "bte_copy() on shub2 failed, error=0x%x dst_pa="
107 "0x%016lx src_pa=0x%016lx len=%ld\\n", ret, dst_pa,
108 src_pa, len);
109 } else {
110 dev_err(xp, "bte_copy() failed, error=%d dst_pa=0x%016lx "
111 "src_pa=0x%016lx len=%ld\\n", ret, dst_pa, src_pa, len);
112 }
113
114 return xpBteCopyError;
115}
116
117static int
118xp_cpu_to_nasid_sn2(int cpuid)
119{
120 return cpuid_to_nasid(cpuid);
121}
122
123enum xp_retval
124xp_init_sn2(void)
125{
126 BUG_ON(!is_shub());
127
128 xp_max_npartitions = XP_MAX_NPARTITIONS_SN2;
129 xp_partition_id = sn_partition_id;
130 xp_region_size = sn_region_size;
131
132 xp_pa = xp_pa_sn2;
133 xp_remote_memcpy = xp_remote_memcpy_sn2;
134 xp_cpu_to_nasid = xp_cpu_to_nasid_sn2;
135
136 return xp_register_nofault_code_sn2();
137}
138
139void
140xp_exit_sn2(void)
141{
142 BUG_ON(!is_shub());
143
144 xp_unregister_nofault_code_sn2();
145}
146
diff --git a/drivers/misc/sgi-xp/xp_uv.c b/drivers/misc/sgi-xp/xp_uv.c
new file mode 100644
index 000000000000..d9f7ce2510bc
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp_uv.c
@@ -0,0 +1,72 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9/*
10 * Cross Partition (XP) uv-based functions.
11 *
12 * Architecture specific implementation of common functions.
13 *
14 */
15
16#include <linux/device.h>
17#include <asm/uv/uv_hub.h>
18#include "../sgi-gru/grukservices.h"
19#include "xp.h"
20
21/*
22 * Convert a virtual memory address to a physical memory address.
23 */
24static unsigned long
25xp_pa_uv(void *addr)
26{
27 return uv_gpa(addr);
28}
29
30static enum xp_retval
31xp_remote_memcpy_uv(unsigned long dst_gpa, const unsigned long src_gpa,
32 size_t len)
33{
34 int ret;
35
36 ret = gru_copy_gpa(dst_gpa, src_gpa, len);
37 if (ret == 0)
38 return xpSuccess;
39
40 dev_err(xp, "gru_copy_gpa() failed, dst_gpa=0x%016lx src_gpa=0x%016lx "
41 "len=%ld\n", dst_gpa, src_gpa, len);
42 return xpGruCopyError;
43}
44
45static int
46xp_cpu_to_nasid_uv(int cpuid)
47{
48 /* ??? Is this the same as sn2 nasid in mach/part bitmaps set up by SAL? */
49 return UV_PNODE_TO_NASID(uv_cpu_to_pnode(cpuid));
50}
51
52enum xp_retval
53xp_init_uv(void)
54{
55 BUG_ON(!is_uv());
56
57 xp_max_npartitions = XP_MAX_NPARTITIONS_UV;
58 xp_partition_id = 0; /* !!! not correct value */
59 xp_region_size = 0; /* !!! not correct value */
60
61 xp_pa = xp_pa_uv;
62 xp_remote_memcpy = xp_remote_memcpy_uv;
63 xp_cpu_to_nasid = xp_cpu_to_nasid_uv;
64
65 return xpSuccess;
66}
67
68void
69xp_exit_uv(void)
70{
71 BUG_ON(!is_uv());
72}
diff --git a/drivers/misc/sgi-xp/xpc.h b/drivers/misc/sgi-xp/xpc.h
index 11ac267ed68f..619208d61862 100644
--- a/drivers/misc/sgi-xp/xpc.h
+++ b/drivers/misc/sgi-xp/xpc.h
@@ -13,18 +13,10 @@
13#ifndef _DRIVERS_MISC_SGIXP_XPC_H 13#ifndef _DRIVERS_MISC_SGIXP_XPC_H
14#define _DRIVERS_MISC_SGIXP_XPC_H 14#define _DRIVERS_MISC_SGIXP_XPC_H
15 15
16#include <linux/interrupt.h> 16#include <linux/wait.h>
17#include <linux/sysctl.h>
18#include <linux/device.h>
19#include <linux/mutex.h>
20#include <linux/completion.h> 17#include <linux/completion.h>
21#include <asm/pgtable.h> 18#include <linux/timer.h>
22#include <asm/processor.h> 19#include <linux/sched.h>
23#include <asm/sn/bte.h>
24#include <asm/sn/clksupport.h>
25#include <asm/sn/addrs.h>
26#include <asm/sn/mspec.h>
27#include <asm/sn/shub_mmr.h>
28#include "xp.h" 20#include "xp.h"
29 21
30/* 22/*
@@ -36,23 +28,7 @@
36#define XPC_VERSION_MAJOR(_v) ((_v) >> 4) 28#define XPC_VERSION_MAJOR(_v) ((_v) >> 4)
37#define XPC_VERSION_MINOR(_v) ((_v) & 0xf) 29#define XPC_VERSION_MINOR(_v) ((_v) & 0xf)
38 30
39/* 31/* define the frequency of the heartbeat and how often it is checked */
40 * The next macros define word or bit representations for given
41 * C-brick nasid in either the SAL provided bit array representing
42 * nasids in the partition/machine or the AMO_t array used for
43 * inter-partition initiation communications.
44 *
45 * For SN2 machines, C-Bricks are alway even numbered NASIDs. As
46 * such, some space will be saved by insisting that nasid information
47 * passed from SAL always be packed for C-Bricks and the
48 * cross-partition interrupts use the same packing scheme.
49 */
50#define XPC_NASID_W_INDEX(_n) (((_n) / 64) / 2)
51#define XPC_NASID_B_INDEX(_n) (((_n) / 2) & (64 - 1))
52#define XPC_NASID_IN_ARRAY(_n, _p) ((_p)[XPC_NASID_W_INDEX(_n)] & \
53 (1UL << XPC_NASID_B_INDEX(_n)))
54#define XPC_NASID_FROM_W_B(_w, _b) (((_w) * 64 + (_b)) * 2)
55
56#define XPC_HB_DEFAULT_INTERVAL 5 /* incr HB every x secs */ 32#define XPC_HB_DEFAULT_INTERVAL 5 /* incr HB every x secs */
57#define XPC_HB_CHECK_DEFAULT_INTERVAL 20 /* check HB every x secs */ 33#define XPC_HB_CHECK_DEFAULT_INTERVAL 20 /* check HB every x secs */
58 34
@@ -72,11 +48,11 @@
72 * 48 *
73 * reserved page header 49 * reserved page header
74 * 50 *
75 * The first cacheline of the reserved page contains the header 51 * The first two 64-byte cachelines of the reserved page contain the
76 * (struct xpc_rsvd_page). Before SAL initialization has completed, 52 * header (struct xpc_rsvd_page). Before SAL initialization has completed,
77 * SAL has set up the following fields of the reserved page header: 53 * SAL has set up the following fields of the reserved page header:
78 * SAL_signature, SAL_version, partid, and nasids_size. The other 54 * SAL_signature, SAL_version, SAL_partid, and SAL_nasids_size. The
79 * fields are set up by XPC. (xpc_rsvd_page points to the local 55 * other fields are set up by XPC. (xpc_rsvd_page points to the local
80 * partition's reserved page.) 56 * partition's reserved page.)
81 * 57 *
82 * part_nasids mask 58 * part_nasids mask
@@ -87,14 +63,16 @@
87 * the actual nasids in the entire machine (mach_nasids). We're only 63 * the actual nasids in the entire machine (mach_nasids). We're only
88 * interested in the even numbered nasids (which contain the processors 64 * interested in the even numbered nasids (which contain the processors
89 * and/or memory), so we only need half as many bits to represent the 65 * and/or memory), so we only need half as many bits to represent the
90 * nasids. The part_nasids mask is located starting at the first cacheline 66 * nasids. When mapping nasid to bit in a mask (or bit to nasid) be sure
91 * following the reserved page header. The mach_nasids mask follows right 67 * to either divide or multiply by 2. The part_nasids mask is located
92 * after the part_nasids mask. The size in bytes of each mask is reflected 68 * starting at the first cacheline following the reserved page header. The
93 * by the reserved page header field 'nasids_size'. (Local partition's 69 * mach_nasids mask follows right after the part_nasids mask. The size in
94 * mask pointers are xpc_part_nasids and xpc_mach_nasids.) 70 * bytes of each mask is reflected by the reserved page header field
71 * 'SAL_nasids_size'. (Local partition's mask pointers are xpc_part_nasids
72 * and xpc_mach_nasids.)
95 * 73 *
96 * vars 74 * vars (ia64-sn2 only)
97 * vars part 75 * vars part (ia64-sn2 only)
98 * 76 *
99 * Immediately following the mach_nasids mask are the XPC variables 77 * Immediately following the mach_nasids mask are the XPC variables
100 * required by other partitions. First are those that are generic to all 78 * required by other partitions. First are those that are generic to all
@@ -102,43 +80,26 @@
102 * which are partition specific (vars part). These are setup by XPC. 80 * which are partition specific (vars part). These are setup by XPC.
103 * (Local partition's vars pointers are xpc_vars and xpc_vars_part.) 81 * (Local partition's vars pointers are xpc_vars and xpc_vars_part.)
104 * 82 *
105 * Note: Until vars_pa is set, the partition XPC code has not been initialized. 83 * Note: Until 'ts_jiffies' is set non-zero, the partition XPC code has not been
84 * initialized.
106 */ 85 */
107struct xpc_rsvd_page { 86struct xpc_rsvd_page {
108 u64 SAL_signature; /* SAL: unique signature */ 87 u64 SAL_signature; /* SAL: unique signature */
109 u64 SAL_version; /* SAL: version */ 88 u64 SAL_version; /* SAL: version */
110 u8 partid; /* SAL: partition ID */ 89 short SAL_partid; /* SAL: partition ID */
90 short max_npartitions; /* value of XPC_MAX_PARTITIONS */
111 u8 version; 91 u8 version;
112 u8 pad1[6]; /* align to next u64 in cacheline */ 92 u8 pad1[3]; /* align to next u64 in 1st 64-byte cacheline */
113 u64 vars_pa; /* physical address of struct xpc_vars */ 93 union {
114 struct timespec stamp; /* time when reserved page was setup by XPC */ 94 unsigned long vars_pa; /* phys address of struct xpc_vars */
115 u64 pad2[9]; /* align to last u64 in cacheline */ 95 unsigned long activate_mq_gpa; /* gru phy addr of activate_mq */
116 u64 nasids_size; /* SAL: size of each nasid mask in bytes */ 96 } sn;
97 unsigned long ts_jiffies; /* timestamp when rsvd pg was setup by XPC */
98 u64 pad2[10]; /* align to last u64 in 2nd 64-byte cacheline */
99 u64 SAL_nasids_size; /* SAL: size of each nasid mask in bytes */
117}; 100};
118 101
119#define XPC_RP_VERSION _XPC_VERSION(1, 1) /* version 1.1 of the reserved page */ 102#define XPC_RP_VERSION _XPC_VERSION(2, 0) /* version 2.0 of the reserved page */
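A quick userspace layout check, assuming an LP64 build as on ia64/x86_64, that the reworked reserved page header really does fill exactly two 64-byte cachelines, with SAL_nasids_size landing in the last u64 of the second line:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

typedef uint64_t u64;
typedef uint8_t u8;

/* userspace copy of the new struct xpc_rsvd_page, for layout inspection only */
struct xpc_rsvd_page {
        u64 SAL_signature;
        u64 SAL_version;
        short SAL_partid;
        short max_npartitions;
        u8 version;
        u8 pad1[3];
        union {
                unsigned long vars_pa;
                unsigned long activate_mq_gpa;
        } sn;
        unsigned long ts_jiffies;
        u64 pad2[10];
        u64 SAL_nasids_size;
};

int main(void)
{
        printf("sizeof = %zu (2 x 64-byte cachelines)\n",
               sizeof(struct xpc_rsvd_page));                   /* 128 */
        printf("SAL_nasids_size at offset %zu\n",
               offsetof(struct xpc_rsvd_page, SAL_nasids_size)); /* 120 */
        return 0;
}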
120
121#define XPC_SUPPORTS_RP_STAMP(_version) \
122 (_version >= _XPC_VERSION(1, 1))
123
124/*
125 * compare stamps - the return value is:
126 *
127 * < 0, if stamp1 < stamp2
128 * = 0, if stamp1 == stamp2
129 * > 0, if stamp1 > stamp2
130 */
131static inline int
132xpc_compare_stamps(struct timespec *stamp1, struct timespec *stamp2)
133{
134 int ret;
135
136 ret = stamp1->tv_sec - stamp2->tv_sec;
137 if (ret == 0)
138 ret = stamp1->tv_nsec - stamp2->tv_nsec;
139
140 return ret;
141}
142 103
143/* 104/*
144 * Define the structures by which XPC variables can be exported to other 105 * Define the structures by which XPC variables can be exported to other
@@ -154,85 +115,40 @@ xpc_compare_stamps(struct timespec *stamp1, struct timespec *stamp2)
154 * reflected by incrementing either the major or minor version numbers 115 * reflected by incrementing either the major or minor version numbers
155 * of struct xpc_vars. 116 * of struct xpc_vars.
156 */ 117 */
157struct xpc_vars { 118struct xpc_vars_sn2 {
158 u8 version; 119 u8 version;
159 u64 heartbeat; 120 u64 heartbeat;
160 u64 heartbeating_to_mask; 121 DECLARE_BITMAP(heartbeating_to_mask, XP_MAX_NPARTITIONS_SN2);
161 u64 heartbeat_offline; /* if 0, heartbeat should be changing */ 122 u64 heartbeat_offline; /* if 0, heartbeat should be changing */
162 int act_nasid; 123 int activate_IRQ_nasid;
163 int act_phys_cpuid; 124 int activate_IRQ_phys_cpuid;
164 u64 vars_part_pa; 125 unsigned long vars_part_pa;
165 u64 amos_page_pa; /* paddr of page of AMOs from MSPEC driver */ 126 unsigned long amos_page_pa;/* paddr of page of amos from MSPEC driver */
166 AMO_t *amos_page; /* vaddr of page of AMOs from MSPEC driver */ 127 struct amo *amos_page; /* vaddr of page of amos from MSPEC driver */
167}; 128};
168 129
169#define XPC_V_VERSION _XPC_VERSION(3, 1) /* version 3.1 of the cross vars */ 130#define XPC_V_VERSION _XPC_VERSION(3, 1) /* version 3.1 of the cross vars */
170 131
171#define XPC_SUPPORTS_DISENGAGE_REQUEST(_version) \
172 (_version >= _XPC_VERSION(3, 1))
173
174static inline int
175xpc_hb_allowed(short partid, struct xpc_vars *vars)
176{
177 return ((vars->heartbeating_to_mask & (1UL << partid)) != 0);
178}
179
180static inline void
181xpc_allow_hb(short partid, struct xpc_vars *vars)
182{
183 u64 old_mask, new_mask;
184
185 do {
186 old_mask = vars->heartbeating_to_mask;
187 new_mask = (old_mask | (1UL << partid));
188 } while (cmpxchg(&vars->heartbeating_to_mask, old_mask, new_mask) !=
189 old_mask);
190}
191
192static inline void
193xpc_disallow_hb(short partid, struct xpc_vars *vars)
194{
195 u64 old_mask, new_mask;
196
197 do {
198 old_mask = vars->heartbeating_to_mask;
199 new_mask = (old_mask & ~(1UL << partid));
200 } while (cmpxchg(&vars->heartbeating_to_mask, old_mask, new_mask) !=
201 old_mask);
202}
203
204/*
205 * The AMOs page consists of a number of AMO variables which are divided into
206 * four groups, The first two groups are used to identify an IRQ's sender.
207 * These two groups consist of 64 and 128 AMO variables respectively. The last
208 * two groups, consisting of just one AMO variable each, are used to identify
209 * the remote partitions that are currently engaged (from the viewpoint of
210 * the XPC running on the remote partition).
211 */
212#define XPC_NOTIFY_IRQ_AMOS 0
213#define XPC_ACTIVATE_IRQ_AMOS (XPC_NOTIFY_IRQ_AMOS + XP_MAX_PARTITIONS)
214#define XPC_ENGAGED_PARTITIONS_AMO (XPC_ACTIVATE_IRQ_AMOS + XP_NASID_MASK_WORDS)
215#define XPC_DISENGAGE_REQUEST_AMO (XPC_ENGAGED_PARTITIONS_AMO + 1)
216
217/* 132/*
218 * The following structure describes the per partition specific variables. 133 * The following structure describes the per partition specific variables.
219 * 134 *
220 * An array of these structures, one per partition, will be defined. As a 135 * An array of these structures, one per partition, will be defined. As a
221 * partition becomes active XPC will copy the array entry corresponding to 136 * partition becomes active XPC will copy the array entry corresponding to
222 * itself from that partition. It is desirable that the size of this 137 * itself from that partition. It is desirable that the size of this structure
223 * structure evenly divide into a cacheline, such that none of the entries 138 * evenly divides into a 128-byte cacheline, such that none of the entries in
224 * in this array crosses a cacheline boundary. As it is now, each entry 139 * this array crosses a 128-byte cacheline boundary. As it is now, each entry
225 * occupies half a cacheline. 140 * occupies 64-bytes.
226 */ 141 */
227struct xpc_vars_part { 142struct xpc_vars_part_sn2 {
228 u64 magic; 143 u64 magic;
229 144
230 u64 openclose_args_pa; /* physical address of open and close args */ 145 unsigned long openclose_args_pa; /* phys addr of open and close args */
231 u64 GPs_pa; /* physical address of Get/Put values */ 146 unsigned long GPs_pa; /* physical address of Get/Put values */
147
148 unsigned long chctl_amo_pa; /* physical address of chctl flags' amo */
232 149
233 u64 IPI_amo_pa; /* physical address of IPI AMO_t structure */ 150 int notify_IRQ_nasid; /* nasid of where to send notify IRQs */
234 int IPI_nasid; /* nasid of where to send IPIs */ 151 int notify_IRQ_phys_cpuid; /* CPUID of where to send notify IRQs */
235 int IPI_phys_cpuid; /* physical CPU ID of where to send IPIs */
236 152
237 u8 nchannels; /* #of defined channels supported */ 153 u8 nchannels; /* #of defined channels supported */
238 154
@@ -248,20 +164,95 @@ struct xpc_vars_part {
248 * MAGIC2 indicates that this partition has pulled the remote partition's 164 * MAGIC2 indicates that this partition has pulled the remote partition's
249 * per partition variables that pertain to this partition. 165 * per partition variables that pertain to this partition.
250 */ 166 */
251#define XPC_VP_MAGIC1 0x0053524156435058L /* 'XPCVARS\0'L (little endian) */ 167#define XPC_VP_MAGIC1_SN2 0x0053524156435058L /* 'XPCVARS\0'L (little endian) */
252#define XPC_VP_MAGIC2 0x0073726176435058L /* 'XPCvars\0'L (little endian) */ 168#define XPC_VP_MAGIC2_SN2 0x0073726176435058L /* 'XPCvars\0'L (little endian) */
253 169
254/* the reserved page sizes and offsets */ 170/* the reserved page sizes and offsets */
255 171
256#define XPC_RP_HEADER_SIZE L1_CACHE_ALIGN(sizeof(struct xpc_rsvd_page)) 172#define XPC_RP_HEADER_SIZE L1_CACHE_ALIGN(sizeof(struct xpc_rsvd_page))
257#define XPC_RP_VARS_SIZE L1_CACHE_ALIGN(sizeof(struct xpc_vars)) 173#define XPC_RP_VARS_SIZE L1_CACHE_ALIGN(sizeof(struct xpc_vars_sn2))
258 174
259#define XPC_RP_PART_NASIDS(_rp) ((u64 *)((u8 *)(_rp) + XPC_RP_HEADER_SIZE)) 175#define XPC_RP_PART_NASIDS(_rp) ((unsigned long *)((u8 *)(_rp) + \
260#define XPC_RP_MACH_NASIDS(_rp) (XPC_RP_PART_NASIDS(_rp) + xp_nasid_mask_words) 176 XPC_RP_HEADER_SIZE))
261#define XPC_RP_VARS(_rp) ((struct xpc_vars *)(XPC_RP_MACH_NASIDS(_rp) + \ 177#define XPC_RP_MACH_NASIDS(_rp) (XPC_RP_PART_NASIDS(_rp) + \
262 xp_nasid_mask_words)) 178 xpc_nasid_mask_nlongs)
263#define XPC_RP_VARS_PART(_rp) ((struct xpc_vars_part *) \ 179#define XPC_RP_VARS(_rp) ((struct xpc_vars_sn2 *) \
264 ((u8 *)XPC_RP_VARS(_rp) + XPC_RP_VARS_SIZE)) 180 (XPC_RP_MACH_NASIDS(_rp) + \
181 xpc_nasid_mask_nlongs))
182
183/*
184 * The activate_mq is used to send/receive GRU messages that affect XPC's
185 * heartbeat, partition active state, and channel state. This is UV only.
186 */
187struct xpc_activate_mq_msghdr_uv {
188 short partid; /* sender's partid */
189 u8 act_state; /* sender's act_state at time msg sent */
190 u8 type; /* message's type */
191 unsigned long rp_ts_jiffies; /* timestamp of sender's rp setup by XPC */
192};
193
194/* activate_mq defined message types */
195#define XPC_ACTIVATE_MQ_MSG_SYNC_ACT_STATE_UV 0
196#define XPC_ACTIVATE_MQ_MSG_INC_HEARTBEAT_UV 1
197#define XPC_ACTIVATE_MQ_MSG_OFFLINE_HEARTBEAT_UV 2
198#define XPC_ACTIVATE_MQ_MSG_ONLINE_HEARTBEAT_UV 3
199
200#define XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV 4
201#define XPC_ACTIVATE_MQ_MSG_DEACTIVATE_REQ_UV 5
202
203#define XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREQUEST_UV 6
204#define XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREPLY_UV 7
205#define XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREQUEST_UV 8
206#define XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREPLY_UV 9
207
208#define XPC_ACTIVATE_MQ_MSG_MARK_ENGAGED_UV 10
209#define XPC_ACTIVATE_MQ_MSG_MARK_DISENGAGED_UV 11
210
211struct xpc_activate_mq_msg_uv {
212 struct xpc_activate_mq_msghdr_uv hdr;
213};
214
215struct xpc_activate_mq_msg_heartbeat_req_uv {
216 struct xpc_activate_mq_msghdr_uv hdr;
217 u64 heartbeat;
218};
219
220struct xpc_activate_mq_msg_activate_req_uv {
221 struct xpc_activate_mq_msghdr_uv hdr;
222 unsigned long rp_gpa;
223 unsigned long activate_mq_gpa;
224};
225
226struct xpc_activate_mq_msg_deactivate_req_uv {
227 struct xpc_activate_mq_msghdr_uv hdr;
228 enum xp_retval reason;
229};
230
231struct xpc_activate_mq_msg_chctl_closerequest_uv {
232 struct xpc_activate_mq_msghdr_uv hdr;
233 short ch_number;
234 enum xp_retval reason;
235};
236
237struct xpc_activate_mq_msg_chctl_closereply_uv {
238 struct xpc_activate_mq_msghdr_uv hdr;
239 short ch_number;
240};
241
242struct xpc_activate_mq_msg_chctl_openrequest_uv {
243 struct xpc_activate_mq_msghdr_uv hdr;
244 short ch_number;
245 short entry_size; /* size of notify_mq's GRU messages */
246 short local_nentries; /* ??? Is this needed? What is? */
247};
248
249struct xpc_activate_mq_msg_chctl_openreply_uv {
250 struct xpc_activate_mq_msghdr_uv hdr;
251 short ch_number;
252 short remote_nentries; /* ??? Is this needed? What is? */
253 short local_nentries; /* ??? Is this needed? What is? */
254 unsigned long local_notify_mq_gpa;
255};
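All of the activate_mq messages above begin with the same xpc_activate_mq_msghdr_uv, so a receiver can switch on hdr.type and cast to the matching body. A compact userspace model of that dispatch (the struct and handler names here are invented for illustration):

#include <stdio.h>

/* modelled after xpc_activate_mq_msghdr_uv: every message starts with this */
struct msghdr_model {
        short partid;
        unsigned char act_state;
        unsigned char type;
        unsigned long rp_ts_jiffies;
};

#define MSG_INC_HEARTBEAT       1
#define MSG_ACTIVATE_REQ        4

struct msg_heartbeat_req {
        struct msghdr_model hdr;
        unsigned long long heartbeat;
};

struct msg_activate_req {
        struct msghdr_model hdr;
        unsigned long rp_gpa;
        unsigned long activate_mq_gpa;
};

static void handle_msg(struct msghdr_model *hdr)
{
        switch (hdr->type) {
        case MSG_INC_HEARTBEAT: {
                struct msg_heartbeat_req *msg = (struct msg_heartbeat_req *)hdr;

                printf("partid %d heartbeat %llu\n", hdr->partid, msg->heartbeat);
                break;
        }
        case MSG_ACTIVATE_REQ: {
                struct msg_activate_req *msg = (struct msg_activate_req *)hdr;

                printf("partid %d rp_gpa 0x%lx\n", hdr->partid, msg->rp_gpa);
                break;
        }
        default:
                printf("unknown message type %d\n", hdr->type);
        }
}

int main(void)
{
        struct msg_heartbeat_req hb = { { 3, 0, MSG_INC_HEARTBEAT, 0 }, 7 };

        handle_msg(&hb.hdr);    /* prints "partid 3 heartbeat 7" */
        return 0;
}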
265 256
266/* 257/*
267 * Functions registered by add_timer() or called by kernel_thread() only 258 * Functions registered by add_timer() or called by kernel_thread() only
@@ -270,22 +261,22 @@ struct xpc_vars_part {
270 * the passed argument. 261 * the passed argument.
271 */ 262 */
272#define XPC_PACK_ARGS(_arg1, _arg2) \ 263#define XPC_PACK_ARGS(_arg1, _arg2) \
273 ((((u64) _arg1) & 0xffffffff) | \ 264 ((((u64)_arg1) & 0xffffffff) | \
274 ((((u64) _arg2) & 0xffffffff) << 32)) 265 ((((u64)_arg2) & 0xffffffff) << 32))
275 266
276#define XPC_UNPACK_ARG1(_args) (((u64) _args) & 0xffffffff) 267#define XPC_UNPACK_ARG1(_args) (((u64)_args) & 0xffffffff)
277#define XPC_UNPACK_ARG2(_args) ((((u64) _args) >> 32) & 0xffffffff) 268#define XPC_UNPACK_ARG2(_args) ((((u64)_args) >> 32) & 0xffffffff)
278 269
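A tiny round-trip check of the packing macros above, using a hypothetical partid/channel pair as the two 32-bit halves:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

#define XPC_PACK_ARGS(_arg1, _arg2) \
        ((((u64)_arg1) & 0xffffffff) | \
         ((((u64)_arg2) & 0xffffffff) << 32))

#define XPC_UNPACK_ARG1(_args)  (((u64)_args) & 0xffffffff)
#define XPC_UNPACK_ARG2(_args)  ((((u64)_args) >> 32) & 0xffffffff)

int main(void)
{
        u64 args = XPC_PACK_ARGS(42, 1);        /* e.g. partid 42, channel 1 */

        /* the single u64 is what gets handed to add_timer()/kernel_thread() */
        printf("partid=%llu ch_number=%llu\n",
               (unsigned long long)XPC_UNPACK_ARG1(args),
               (unsigned long long)XPC_UNPACK_ARG2(args));      /* 42 and 1 */
        return 0;
}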
279/* 270/*
280 * Define a Get/Put value pair (pointers) used with a message queue. 271 * Define a Get/Put value pair (pointers) used with a message queue.
281 */ 272 */
282struct xpc_gp { 273struct xpc_gp_sn2 {
283 s64 get; /* Get value */ 274 s64 get; /* Get value */
284 s64 put; /* Put value */ 275 s64 put; /* Put value */
285}; 276};
286 277
287#define XPC_GP_SIZE \ 278#define XPC_GP_SIZE \
288 L1_CACHE_ALIGN(sizeof(struct xpc_gp) * XPC_NCHANNELS) 279 L1_CACHE_ALIGN(sizeof(struct xpc_gp_sn2) * XPC_MAX_NCHANNELS)
289 280
290/* 281/*
291 * Define a structure that contains arguments associated with opening and 282 * Define a structure that contains arguments associated with opening and
@@ -293,31 +284,89 @@ struct xpc_gp {
293 */ 284 */
294struct xpc_openclose_args { 285struct xpc_openclose_args {
295 u16 reason; /* reason why channel is closing */ 286 u16 reason; /* reason why channel is closing */
296 u16 msg_size; /* sizeof each message entry */ 287 u16 entry_size; /* sizeof each message entry */
297 u16 remote_nentries; /* #of message entries in remote msg queue */ 288 u16 remote_nentries; /* #of message entries in remote msg queue */
298 u16 local_nentries; /* #of message entries in local msg queue */ 289 u16 local_nentries; /* #of message entries in local msg queue */
299 u64 local_msgqueue_pa; /* physical address of local message queue */ 290 unsigned long local_msgqueue_pa; /* phys addr of local message queue */
300}; 291};
301 292
302#define XPC_OPENCLOSE_ARGS_SIZE \ 293#define XPC_OPENCLOSE_ARGS_SIZE \
303 L1_CACHE_ALIGN(sizeof(struct xpc_openclose_args) * XPC_NCHANNELS) 294 L1_CACHE_ALIGN(sizeof(struct xpc_openclose_args) * \
295 XPC_MAX_NCHANNELS)
304 296
305/* struct xpc_msg flags */
306 297
307#define XPC_M_DONE 0x01 /* msg has been received/consumed */ 298/*
308#define XPC_M_READY 0x02 /* msg is ready to be sent */ 299 * Structures to define a fifo singly-linked list.
309#define XPC_M_INTERRUPT 0x04 /* send interrupt when msg consumed */ 300 */
310 301
311#define XPC_MSG_ADDRESS(_payload) \ 302struct xpc_fifo_entry_uv {
312 ((struct xpc_msg *)((u8 *)(_payload) - XPC_MSG_PAYLOAD_OFFSET)) 303 struct xpc_fifo_entry_uv *next;
304};
305
306struct xpc_fifo_head_uv {
307 struct xpc_fifo_entry_uv *first;
308 struct xpc_fifo_entry_uv *last;
309 spinlock_t lock;
310 int n_entries;
311};
313 312
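A minimal single-threaded model of the fifo defined above; the driver additionally holds 'lock' around these updates, but the first/last/n_entries bookkeeping that msg_slot_free_list and recv_msg_list rely on is the same:

#include <stdio.h>
#include <stddef.h>

struct fifo_entry {
        struct fifo_entry *next;
};

struct fifo_head {
        struct fifo_entry *first;
        struct fifo_entry *last;
        int n_entries;          /* the driver also keeps a spinlock here */
};

static void fifo_put(struct fifo_head *head, struct fifo_entry *e)
{
        e->next = NULL;
        if (head->last != NULL)
                head->last->next = e;
        else
                head->first = e;        /* list was empty */
        head->last = e;
        head->n_entries++;
}

static struct fifo_entry *fifo_get(struct fifo_head *head)
{
        struct fifo_entry *e = head->first;

        if (e != NULL) {
                head->first = e->next;
                if (head->first == NULL)
                        head->last = NULL;      /* list is now empty */
                head->n_entries--;
        }
        return e;
}

int main(void)
{
        struct fifo_head head = { NULL, NULL, 0 };
        struct fifo_entry a, b;

        fifo_put(&head, &a);
        fifo_put(&head, &b);
        printf("%s\n", fifo_get(&head) == &a ? "FIFO order" : "bug");
        return 0;
}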
314/* 313/*
315 * Defines notify entry. 314 * Define a sn2 styled message.
315 *
316 * A user-defined message resides in the payload area. The max size of the
317 * payload is defined by the user via xpc_connect().
318 *
319 * The size of a message entry (within a message queue) must be a 128-byte
320 * cacheline sized multiple in order to facilitate the BTE transfer of messages
321 * from one message queue to another.
322 */
323struct xpc_msg_sn2 {
324 u8 flags; /* FOR XPC INTERNAL USE ONLY */
325 u8 reserved[7]; /* FOR XPC INTERNAL USE ONLY */
326 s64 number; /* FOR XPC INTERNAL USE ONLY */
327
328 u64 payload; /* user defined portion of message */
329};
330
331/* struct xpc_msg_sn2 flags */
332
333#define XPC_M_SN2_DONE 0x01 /* msg has been received/consumed */
334#define XPC_M_SN2_READY 0x02 /* msg is ready to be sent */
335#define XPC_M_SN2_INTERRUPT 0x04 /* send interrupt when msg consumed */
336
337/*
338 * The format of a uv XPC notify_mq GRU message is as follows:
339 *
340 * A user-defined message resides in the payload area. The max size of the
341 * payload is defined by the user via xpc_connect().
342 *
343 * The size of a message (payload and header) sent via the GRU must be either 1
344 * or 2 GRU_CACHE_LINE_BYTES in length.
345 */
346
347struct xpc_notify_mq_msghdr_uv {
348 union {
349 unsigned int gru_msg_hdr; /* FOR GRU INTERNAL USE ONLY */
350 struct xpc_fifo_entry_uv next; /* FOR XPC INTERNAL USE ONLY */
351 } u;
352 short partid; /* FOR XPC INTERNAL USE ONLY */
353 u8 ch_number; /* FOR XPC INTERNAL USE ONLY */
354 u8 size; /* FOR XPC INTERNAL USE ONLY */
355 unsigned int msg_slot_number; /* FOR XPC INTERNAL USE ONLY */
356};
357
358struct xpc_notify_mq_msg_uv {
359 struct xpc_notify_mq_msghdr_uv hdr;
360 unsigned long payload;
361};
362
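A layout check of the uv notify_mq header above, again assuming LP64: the header comes out at exactly 16 bytes (matching XPC_MSG_HDR_MAX_SIZE), leaving 48 or 112 bytes of payload in a one- or two-cacheline GRU message:

#include <stdio.h>
#include <stdint.h>

struct fifo_entry { struct fifo_entry *next; };

/* userspace copy of struct xpc_notify_mq_msghdr_uv, for sizing only */
struct notify_mq_msghdr {
        union {
                unsigned int gru_msg_hdr;
                struct fifo_entry next;
        } u;                            /* 8 bytes (pointer-sized) */
        short partid;                   /* 2 */
        uint8_t ch_number;              /* 1 */
        uint8_t size;                   /* 1 */
        unsigned int msg_slot_number;   /* 4 */
};

int main(void)
{
        size_t hdr = sizeof(struct notify_mq_msghdr);   /* 16 */

        printf("hdr=%zu, payload max: %zu (1 line) or %zu (2 lines)\n",
               hdr, 64 - hdr, 128 - hdr);
        return 0;
}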
363/*
364 * Define sn2's notify entry.
316 * 365 *
317 * This is used to notify a message's sender that their message was received 366 * This is used to notify a message's sender that their message was received
318 * and consumed by the intended recipient. 367 * and consumed by the intended recipient.
319 */ 368 */
320struct xpc_notify { 369struct xpc_notify_sn2 {
321 u8 type; /* type of notification */ 370 u8 type; /* type of notification */
322 371
323 /* the following two fields are only used if type == XPC_N_CALL */ 372 /* the following two fields are only used if type == XPC_N_CALL */
@@ -325,9 +374,20 @@ struct xpc_notify {
325 void *key; /* pointer to user's key */ 374 void *key; /* pointer to user's key */
326}; 375};
327 376
328/* struct xpc_notify type of notification */ 377/* struct xpc_notify_sn2 type of notification */
378
379#define XPC_N_CALL 0x01 /* notify function provided by user */
329 380
330#define XPC_N_CALL 0x01 /* notify function provided by user */ 381/*
382 * Define uv's version of the notify entry. It additionally is used to allocate
383 * a msg slot on the remote partition into which is copied a sent message.
384 */
385struct xpc_send_msg_slot_uv {
386 struct xpc_fifo_entry_uv next;
387 unsigned int msg_slot_number;
388 xpc_notify_func func; /* user's notify function */
389 void *key; /* pointer to user's key */
390};
331 391
332/* 392/*
333 * Define the structure that manages all the stuff required by a channel. In 393 * Define the structure that manages all the stuff required by a channel. In
@@ -339,8 +399,12 @@ struct xpc_notify {
339 * There is an array of these structures for each remote partition. It is 399 * There is an array of these structures for each remote partition. It is
340 * allocated at the time a partition becomes active. The array contains one 400 * allocated at the time a partition becomes active. The array contains one
341 * of these structures for each potential channel connection to that partition. 401 * of these structures for each potential channel connection to that partition.
402 */
403
404/*
405 * The following is sn2 only.
342 * 406 *
343 * Each of these structures manages two message queues (circular buffers). 407 * Each channel structure manages two message queues (circular buffers).
344 * They are allocated at the time a channel connection is made. One of 408 * They are allocated at the time a channel connection is made. One of
345 * these message queues (local_msgqueue) holds the locally created messages 409 * these message queues (local_msgqueue) holds the locally created messages
346 * that are destined for the remote partition. The other of these message 410 * that are destined for the remote partition. The other of these message
@@ -407,58 +471,72 @@ struct xpc_notify {
407 * new messages, by the clearing of the message flags of the acknowledged 471 * new messages, by the clearing of the message flags of the acknowledged
408 * messages. 472 * messages.
409 */ 473 */
474
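As a rough, single-partition sketch of the Get/Put bookkeeping this comment describes (not the driver's cross-partition version): Put counts messages produced, Get counts messages consumed, and a message's slot is its counter value modulo the queue depth.

#include <stdio.h>

#define NENTRIES 8	/* illustrative queue depth */

/* the 64-bit counters never wrap logically even though the buffer is circular */
struct gp {
	long long get;	/* next slot the consumer will read */
	long long put;	/* next slot the producer will fill */
};

static int queue_full(struct gp *gp)  { return gp->put - gp->get == NENTRIES; }
static int queue_empty(struct gp *gp) { return gp->put == gp->get; }

int main(void)
{
	struct gp gp = { 0, 0 };
	int msgs[NENTRIES];

	for (int i = 0; i < 10; i++) {
		if (!queue_full(&gp))
			msgs[gp.put++ % NENTRIES] = i;	/* produce */
		if (i % 3 == 2 && !queue_empty(&gp))
			printf("consumed %d\n", msgs[gp.get++ % NENTRIES]);
	}
	printf("pending = %lld\n", gp.put - gp.get);
	return 0;
}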
475struct xpc_channel_sn2 {
476 struct xpc_openclose_args *local_openclose_args; /* args passed on */
477 /* opening or closing of channel */
478
479 void *local_msgqueue_base; /* base address of kmalloc'd space */
480 struct xpc_msg_sn2 *local_msgqueue; /* local message queue */
481 void *remote_msgqueue_base; /* base address of kmalloc'd space */
482 struct xpc_msg_sn2 *remote_msgqueue; /* cached copy of remote */
483 /* partition's local message queue */
484 unsigned long remote_msgqueue_pa; /* phys addr of remote partition's */
485 /* local message queue */
486
487 struct xpc_notify_sn2 *notify_queue;/* notify queue for messages sent */
488
489 /* various flavors of local and remote Get/Put values */
490
491 struct xpc_gp_sn2 *local_GP; /* local Get/Put values */
492 struct xpc_gp_sn2 remote_GP; /* remote Get/Put values */
493 struct xpc_gp_sn2 w_local_GP; /* working local Get/Put values */
494 struct xpc_gp_sn2 w_remote_GP; /* working remote Get/Put values */
495 s64 next_msg_to_pull; /* Put value of next msg to pull */
496
497 struct mutex msg_to_pull_mutex; /* next msg to pull serialization */
498};
499
500struct xpc_channel_uv {
501 unsigned long remote_notify_mq_gpa; /* gru phys address of remote */
502 /* partition's notify mq */
503
504 struct xpc_send_msg_slot_uv *send_msg_slots;
505 struct xpc_notify_mq_msg_uv *recv_msg_slots;
506
507 struct xpc_fifo_head_uv msg_slot_free_list;
508 struct xpc_fifo_head_uv recv_msg_list; /* deliverable payloads */
509};
510
410struct xpc_channel { 511struct xpc_channel {
411 short partid; /* ID of remote partition connected */ 512 short partid; /* ID of remote partition connected */
412 spinlock_t lock; /* lock for updating this structure */ 513 spinlock_t lock; /* lock for updating this structure */
413 u32 flags; /* general flags */ 514 unsigned int flags; /* general flags */
414 515
415 enum xp_retval reason; /* reason why channel is disconnect'g */ 516 enum xp_retval reason; /* reason why channel is disconnect'g */
416 int reason_line; /* line# disconnect initiated from */ 517 int reason_line; /* line# disconnect initiated from */
417 518
418 u16 number; /* channel # */ 519 u16 number; /* channel # */
419 520
420 u16 msg_size; /* sizeof each msg entry */ 521 u16 entry_size; /* sizeof each msg entry */
421 u16 local_nentries; /* #of msg entries in local msg queue */ 522 u16 local_nentries; /* #of msg entries in local msg queue */
422 u16 remote_nentries; /* #of msg entries in remote msg queue */ 523 u16 remote_nentries; /* #of msg entries in remote msg queue */
423 524
424 void *local_msgqueue_base; /* base address of kmalloc'd space */
425 struct xpc_msg *local_msgqueue; /* local message queue */
426 void *remote_msgqueue_base; /* base address of kmalloc'd space */
427 struct xpc_msg *remote_msgqueue; /* cached copy of remote partition's */
428 /* local message queue */
429 u64 remote_msgqueue_pa; /* phys addr of remote partition's */
430 /* local message queue */
431
432 atomic_t references; /* #of external references to queues */ 525 atomic_t references; /* #of external references to queues */
433 526
434 atomic_t n_on_msg_allocate_wq; /* #on msg allocation wait queue */ 527 atomic_t n_on_msg_allocate_wq; /* #on msg allocation wait queue */
435 wait_queue_head_t msg_allocate_wq; /* msg allocation wait queue */ 528 wait_queue_head_t msg_allocate_wq; /* msg allocation wait queue */
436 529
437 u8 delayed_IPI_flags; /* IPI flags received, but delayed */ 530 u8 delayed_chctl_flags; /* chctl flags received, but delayed */
438 /* action until channel disconnected */ 531 /* action until channel disconnected */
439 532
440 /* queue of msg senders who want to be notified when msg received */
441
442 atomic_t n_to_notify; /* #of msg senders to notify */ 533 atomic_t n_to_notify; /* #of msg senders to notify */
443 struct xpc_notify *notify_queue; /* notify queue for messages sent */
444 534
445 xpc_channel_func func; /* user's channel function */ 535 xpc_channel_func func; /* user's channel function */
446 void *key; /* pointer to user's key */ 536 void *key; /* pointer to user's key */
447 537
448 struct mutex msg_to_pull_mutex; /* next msg to pull serialization */
449 struct completion wdisconnect_wait; /* wait for channel disconnect */ 538 struct completion wdisconnect_wait; /* wait for channel disconnect */
450 539
451 struct xpc_openclose_args *local_openclose_args; /* args passed on */
452 /* opening or closing of channel */
453
454 /* various flavors of local and remote Get/Put values */
455
456 struct xpc_gp *local_GP; /* local Get/Put values */
457 struct xpc_gp remote_GP; /* remote Get/Put values */
458 struct xpc_gp w_local_GP; /* working local Get/Put values */
459 struct xpc_gp w_remote_GP; /* working remote Get/Put values */
460 s64 next_msg_to_pull; /* Put value of next msg to pull */
461
462 /* kthread management related fields */ 540 /* kthread management related fields */
463 541
464 atomic_t kthreads_assigned; /* #of kthreads assigned to channel */ 542 atomic_t kthreads_assigned; /* #of kthreads assigned to channel */
@@ -469,6 +547,11 @@ struct xpc_channel {
469 547
470 wait_queue_head_t idle_wq; /* idle kthread wait queue */ 548 wait_queue_head_t idle_wq; /* idle kthread wait queue */
471 549
550 union {
551 struct xpc_channel_sn2 sn2;
552 struct xpc_channel_uv uv;
553 } sn;
554
472} ____cacheline_aligned; 555} ____cacheline_aligned;
473 556
474/* struct xpc_channel flags */ 557/* struct xpc_channel flags */
@@ -501,33 +584,128 @@ struct xpc_channel {
501#define XPC_C_WDISCONNECT 0x00040000 /* waiting for channel disconnect */ 584#define XPC_C_WDISCONNECT 0x00040000 /* waiting for channel disconnect */
502 585
503/* 586/*
504 * Manages channels on a partition basis. There is one of these structures 587 * The channel control flags (chctl) union consists of a 64-bit variable which
588 * is divided up into eight bytes, ordered from right to left. Byte zero
589 * pertains to channel 0, byte one to channel 1, and so on. Each channel's byte
590 * can have one or more of the chctl flags set in it.
591 */
592
593union xpc_channel_ctl_flags {
594 u64 all_flags;
595 u8 flags[XPC_MAX_NCHANNELS];
596};
597
598/* chctl flags */
599#define XPC_CHCTL_CLOSEREQUEST 0x01
600#define XPC_CHCTL_CLOSEREPLY 0x02
601#define XPC_CHCTL_OPENREQUEST 0x04
602#define XPC_CHCTL_OPENREPLY 0x08
603#define XPC_CHCTL_MSGREQUEST 0x10
604
605#define XPC_OPENCLOSE_CHCTL_FLAGS \
606 (XPC_CHCTL_CLOSEREQUEST | XPC_CHCTL_CLOSEREPLY | \
607 XPC_CHCTL_OPENREQUEST | XPC_CHCTL_OPENREPLY)
608#define XPC_MSG_CHCTL_FLAGS XPC_CHCTL_MSGREQUEST
609
610static inline int
611xpc_any_openclose_chctl_flags_set(union xpc_channel_ctl_flags *chctl)
612{
613 int ch_number;
614
615 for (ch_number = 0; ch_number < XPC_MAX_NCHANNELS; ch_number++) {
616 if (chctl->flags[ch_number] & XPC_OPENCLOSE_CHCTL_FLAGS)
617 return 1;
618 }
619 return 0;
620}
621
622static inline int
623xpc_any_msg_chctl_flags_set(union xpc_channel_ctl_flags *chctl)
624{
625 int ch_number;
626
627 for (ch_number = 0; ch_number < XPC_MAX_NCHANNELS; ch_number++) {
628 if (chctl->flags[ch_number] & XPC_MSG_CHCTL_FLAGS)
629 return 1;
630 }
631 return 0;
632}
633
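A standalone sketch of the byte-per-channel layout just described, assuming eight channels and a little-endian CPU (so flags[0] is the low-order byte of all_flags); all names here are local to the sketch.

#include <stdint.h>
#include <stdio.h>

#define NCHANNELS 8	/* one byte per channel in a 64-bit word */

#define CHCTL_OPENREQUEST 0x04
#define CHCTL_MSGREQUEST  0x10

union channel_ctl_flags {
	uint64_t all_flags;
	uint8_t flags[NCHANNELS];
};

int main(void)
{
	union channel_ctl_flags chctl = { .all_flags = 0 };

	/* flag an open request on channel 2 and a msg request on channel 5 */
	chctl.flags[2] |= CHCTL_OPENREQUEST;
	chctl.flags[5] |= CHCTL_MSGREQUEST;

	/* byte n of the 64-bit word now carries channel n's flags */
	printf("all_flags = 0x%016llx\n", (unsigned long long)chctl.all_flags);

	for (int ch = 0; ch < NCHANNELS; ch++) {
		if (chctl.flags[ch])
			printf("channel %d: 0x%02x\n", ch, chctl.flags[ch]);
	}
	return 0;
}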
634/*
635 * Manage channels on a partition basis. There is one of these structures
505 * for each partition (a partition will never utilize the structure that 636 * for each partition (a partition will never utilize the structure that
506 * represents itself). 637 * represents itself).
507 */ 638 */
639
640struct xpc_partition_sn2 {
641 unsigned long remote_amos_page_pa; /* paddr of partition's amos page */
642 int activate_IRQ_nasid; /* active partition's act/deact nasid */
643 int activate_IRQ_phys_cpuid; /* active part's act/deact phys cpuid */
644
645 unsigned long remote_vars_pa; /* phys addr of partition's vars */
646 unsigned long remote_vars_part_pa; /* paddr of partition's vars part */
647 u8 remote_vars_version; /* version# of partition's vars */
648
649 void *local_GPs_base; /* base address of kmalloc'd space */
650 struct xpc_gp_sn2 *local_GPs; /* local Get/Put values */
651 void *remote_GPs_base; /* base address of kmalloc'd space */
652 struct xpc_gp_sn2 *remote_GPs; /* copy of remote partition's local */
653 /* Get/Put values */
654 unsigned long remote_GPs_pa; /* phys addr of remote partition's local */
655 /* Get/Put values */
656
657 void *local_openclose_args_base; /* base address of kmalloc'd space */
658 struct xpc_openclose_args *local_openclose_args; /* local's args */
659 unsigned long remote_openclose_args_pa; /* phys addr of remote's args */
660
661 int notify_IRQ_nasid; /* nasid of where to send notify IRQs */
662 int notify_IRQ_phys_cpuid; /* CPUID of where to send notify IRQs */
663 char notify_IRQ_owner[8]; /* notify IRQ's owner's name */
664
665 struct amo *remote_chctl_amo_va; /* addr of remote chctl flags' amo */
666 struct amo *local_chctl_amo_va; /* address of chctl flags' amo */
667
668 struct timer_list dropped_notify_IRQ_timer; /* dropped IRQ timer */
669};
670
671struct xpc_partition_uv {
672 unsigned long remote_activate_mq_gpa; /* gru phys address of remote */
673 /* partition's activate mq */
674 spinlock_t flags_lock; /* protect updating of flags */
675 unsigned int flags; /* general flags */
676 u8 remote_act_state; /* remote partition's act_state */
677 u8 act_state_req; /* act_state request from remote partition */
678 enum xp_retval reason; /* reason for deactivate act_state request */
679 u64 heartbeat; /* incremented by remote partition */
680};
681
682/* struct xpc_partition_uv flags */
683
684#define XPC_P_HEARTBEAT_OFFLINE_UV 0x00000001
685#define XPC_P_ENGAGED_UV 0x00000002
686
687/* struct xpc_partition_uv act_state change requests */
688
689#define XPC_P_ASR_ACTIVATE_UV 0x01
690#define XPC_P_ASR_REACTIVATE_UV 0x02
691#define XPC_P_ASR_DEACTIVATE_UV 0x03
692
508struct xpc_partition { 693struct xpc_partition {
509 694
510 /* XPC HB infrastructure */ 695 /* XPC HB infrastructure */
511 696
512 u8 remote_rp_version; /* version# of partition's rsvd pg */ 697 u8 remote_rp_version; /* version# of partition's rsvd pg */
513 struct timespec remote_rp_stamp; /* time when rsvd pg was initialized */ 698 unsigned long remote_rp_ts_jiffies; /* timestamp when rsvd pg setup */
514 u64 remote_rp_pa; /* phys addr of partition's rsvd pg */ 699 unsigned long remote_rp_pa; /* phys addr of partition's rsvd pg */
515 u64 remote_vars_pa; /* phys addr of partition's vars */
516 u64 remote_vars_part_pa; /* phys addr of partition's vars part */
517 u64 last_heartbeat; /* HB at last read */ 700 u64 last_heartbeat; /* HB at last read */
518 u64 remote_amos_page_pa; /* phys addr of partition's amos page */ 701 u32 activate_IRQ_rcvd; /* IRQs since activation */
519 int remote_act_nasid; /* active part's act/deact nasid */
520 int remote_act_phys_cpuid; /* active part's act/deact phys cpuid */
521 u32 act_IRQ_rcvd; /* IRQs since activation */
522 spinlock_t act_lock; /* protect updating of act_state */ 702 spinlock_t act_lock; /* protect updating of act_state */
523 u8 act_state; /* from XPC HB viewpoint */ 703 u8 act_state; /* from XPC HB viewpoint */
524 u8 remote_vars_version; /* version# of partition's vars */
525 enum xp_retval reason; /* reason partition is deactivating */ 704 enum xp_retval reason; /* reason partition is deactivating */
526 int reason_line; /* line# deactivation initiated from */ 705 int reason_line; /* line# deactivation initiated from */
527 int reactivate_nasid; /* nasid in partition to reactivate */
528 706
529 unsigned long disengage_request_timeout; /* timeout in jiffies */ 707 unsigned long disengage_timeout; /* timeout in jiffies */
530 struct timer_list disengage_request_timer; 708 struct timer_list disengage_timer;
531 709
532 /* XPC infrastructure referencing and teardown control */ 710 /* XPC infrastructure referencing and teardown control */
533 711
@@ -535,85 +713,63 @@ struct xpc_partition {
535 wait_queue_head_t teardown_wq; /* kthread waiting to teardown infra */ 713 wait_queue_head_t teardown_wq; /* kthread waiting to teardown infra */
536 atomic_t references; /* #of references to infrastructure */ 714 atomic_t references; /* #of references to infrastructure */
537 715
538 /*
539 * NONE OF THE PRECEDING FIELDS OF THIS STRUCTURE WILL BE CLEARED WHEN
540 * XPC SETS UP THE NECESSARY INFRASTRUCTURE TO SUPPORT CROSS PARTITION
541 * COMMUNICATION. ALL OF THE FOLLOWING FIELDS WILL BE CLEARED. (THE
542 * 'nchannels' FIELD MUST BE THE FIRST OF THE FIELDS TO BE CLEARED.)
543 */
544
545 u8 nchannels; /* #of defined channels supported */ 716 u8 nchannels; /* #of defined channels supported */
546 atomic_t nchannels_active; /* #of channels that are not DISCONNECTED */ 717 atomic_t nchannels_active; /* #of channels that are not DISCONNECTED */
547 atomic_t nchannels_engaged; /* #of channels engaged with remote part */ 718 atomic_t nchannels_engaged; /* #of channels engaged with remote part */
548 struct xpc_channel *channels; /* array of channel structures */ 719 struct xpc_channel *channels; /* array of channel structures */
549 720
550 void *local_GPs_base; /* base address of kmalloc'd space */ 721 /* fields used for managing channel availability and activity */
551 struct xpc_gp *local_GPs; /* local Get/Put values */
552 void *remote_GPs_base; /* base address of kmalloc'd space */
553 struct xpc_gp *remote_GPs; /* copy of remote partition's local */
554 /* Get/Put values */
555 u64 remote_GPs_pa; /* phys address of remote partition's local */
556 /* Get/Put values */
557 722
558 /* fields used to pass args when opening or closing a channel */ 723 union xpc_channel_ctl_flags chctl; /* chctl flags yet to be processed */
724 spinlock_t chctl_lock; /* chctl flags lock */
559 725
560 void *local_openclose_args_base; /* base address of kmalloc'd space */
561 struct xpc_openclose_args *local_openclose_args; /* local's args */
562 void *remote_openclose_args_base; /* base address of kmalloc'd space */ 726 void *remote_openclose_args_base; /* base address of kmalloc'd space */
563 struct xpc_openclose_args *remote_openclose_args; /* copy of remote's */ 727 struct xpc_openclose_args *remote_openclose_args; /* copy of remote's */
564 /* args */ 728 /* args */
565 u64 remote_openclose_args_pa; /* phys addr of remote's args */
566
567 /* IPI sending, receiving and handling related fields */
568
569 int remote_IPI_nasid; /* nasid of where to send IPIs */
570 int remote_IPI_phys_cpuid; /* phys CPU ID of where to send IPIs */
571 AMO_t *remote_IPI_amo_va; /* address of remote IPI AMO_t structure */
572
573 AMO_t *local_IPI_amo_va; /* address of IPI AMO_t structure */
574 u64 local_IPI_amo; /* IPI amo flags yet to be handled */
575 char IPI_owner[8]; /* IPI owner's name */
576 struct timer_list dropped_IPI_timer; /* dropped IPI timer */
577
578 spinlock_t IPI_lock; /* IPI handler lock */
579 729
580 /* channel manager related fields */ 730 /* channel manager related fields */
581 731
582 atomic_t channel_mgr_requests; /* #of requests to activate chan mgr */ 732 atomic_t channel_mgr_requests; /* #of requests to activate chan mgr */
583 wait_queue_head_t channel_mgr_wq; /* channel mgr's wait queue */ 733 wait_queue_head_t channel_mgr_wq; /* channel mgr's wait queue */
584 734
735 union {
736 struct xpc_partition_sn2 sn2;
737 struct xpc_partition_uv uv;
738 } sn;
739
585} ____cacheline_aligned; 740} ____cacheline_aligned;
586 741
587/* struct xpc_partition act_state values (for XPC HB) */ 742/* struct xpc_partition act_state values (for XPC HB) */
588 743
589#define XPC_P_INACTIVE 0x00 /* partition is not active */ 744#define XPC_P_AS_INACTIVE 0x00 /* partition is not active */
590#define XPC_P_ACTIVATION_REQ 0x01 /* created thread to activate */ 745#define XPC_P_AS_ACTIVATION_REQ 0x01 /* created thread to activate */
591#define XPC_P_ACTIVATING 0x02 /* activation thread started */ 746#define XPC_P_AS_ACTIVATING 0x02 /* activation thread started */
592#define XPC_P_ACTIVE 0x03 /* xpc_partition_up() was called */ 747#define XPC_P_AS_ACTIVE 0x03 /* xpc_partition_up() was called */
593#define XPC_P_DEACTIVATING 0x04 /* partition deactivation initiated */ 748#define XPC_P_AS_DEACTIVATING 0x04 /* partition deactivation initiated */
594 749
595#define XPC_DEACTIVATE_PARTITION(_p, _reason) \ 750#define XPC_DEACTIVATE_PARTITION(_p, _reason) \
596 xpc_deactivate_partition(__LINE__, (_p), (_reason)) 751 xpc_deactivate_partition(__LINE__, (_p), (_reason))
597 752
598/* struct xpc_partition setup_state values */ 753/* struct xpc_partition setup_state values */
599 754
600#define XPC_P_UNSET 0x00 /* infrastructure was never setup */ 755#define XPC_P_SS_UNSET 0x00 /* infrastructure was never setup */
601#define XPC_P_SETUP 0x01 /* infrastructure is setup */ 756#define XPC_P_SS_SETUP 0x01 /* infrastructure is setup */
602#define XPC_P_WTEARDOWN 0x02 /* waiting to teardown infrastructure */ 757#define XPC_P_SS_WTEARDOWN 0x02 /* waiting to teardown infrastructure */
603#define XPC_P_TORNDOWN 0x03 /* infrastructure is torndown */ 758#define XPC_P_SS_TORNDOWN 0x03 /* infrastructure is torndown */
604 759
605/* 760/*
606 * struct xpc_partition IPI_timer #of seconds to wait before checking for 761 * struct xpc_partition_sn2's dropped notify IRQ timer is set to wait the
607 * dropped IPIs. These occur whenever an IPI amo write doesn't complete until 762 * following interval #of seconds before checking for dropped notify IRQs.
608 * after the IPI was received. 763 * These can occur whenever an IRQ's associated amo write doesn't complete
764 * until after the IRQ was received.
609 */ 765 */
610#define XPC_P_DROPPED_IPI_WAIT (0.25 * HZ) 766#define XPC_DROPPED_NOTIFY_IRQ_WAIT_INTERVAL (0.25 * HZ)
611 767
612/* number of seconds to wait for other partitions to disengage */ 768/* number of seconds to wait for other partitions to disengage */
613#define XPC_DISENGAGE_REQUEST_DEFAULT_TIMELIMIT 90 769#define XPC_DISENGAGE_DEFAULT_TIMELIMIT 90
614 770
615/* interval in seconds to print 'waiting disengagement' messages */ 771/* interval in seconds to print 'waiting deactivation' messages */
616#define XPC_DISENGAGE_PRINTMSG_INTERVAL 10 772#define XPC_DEACTIVATE_PRINTMSG_INTERVAL 10
617 773
618#define XPC_PARTID(_p) ((short)((_p) - &xpc_partitions[0])) 774#define XPC_PARTID(_p) ((short)((_p) - &xpc_partitions[0]))
619 775
@@ -623,33 +779,92 @@ extern struct xpc_registration xpc_registrations[];
623/* found in xpc_main.c */ 779/* found in xpc_main.c */
624extern struct device *xpc_part; 780extern struct device *xpc_part;
625extern struct device *xpc_chan; 781extern struct device *xpc_chan;
626extern int xpc_disengage_request_timelimit; 782extern int xpc_disengage_timelimit;
627extern int xpc_disengage_request_timedout; 783extern int xpc_disengage_timedout;
628extern irqreturn_t xpc_notify_IRQ_handler(int, void *); 784extern int xpc_activate_IRQ_rcvd;
629extern void xpc_dropped_IPI_check(struct xpc_partition *); 785extern spinlock_t xpc_activate_IRQ_rcvd_lock;
786extern wait_queue_head_t xpc_activate_IRQ_wq;
787extern void *xpc_heartbeating_to_mask;
788extern void *xpc_kzalloc_cacheline_aligned(size_t, gfp_t, void **);
630extern void xpc_activate_partition(struct xpc_partition *); 789extern void xpc_activate_partition(struct xpc_partition *);
631extern void xpc_activate_kthreads(struct xpc_channel *, int); 790extern void xpc_activate_kthreads(struct xpc_channel *, int);
632extern void xpc_create_kthreads(struct xpc_channel *, int, int); 791extern void xpc_create_kthreads(struct xpc_channel *, int, int);
633extern void xpc_disconnect_wait(int); 792extern void xpc_disconnect_wait(int);
793extern int (*xpc_setup_partitions_sn) (void);
794extern enum xp_retval (*xpc_get_partition_rsvd_page_pa) (void *, u64 *,
795 unsigned long *,
796 size_t *);
797extern int (*xpc_setup_rsvd_page_sn) (struct xpc_rsvd_page *);
798extern void (*xpc_heartbeat_init) (void);
799extern void (*xpc_heartbeat_exit) (void);
800extern void (*xpc_increment_heartbeat) (void);
801extern void (*xpc_offline_heartbeat) (void);
802extern void (*xpc_online_heartbeat) (void);
803extern enum xp_retval (*xpc_get_remote_heartbeat) (struct xpc_partition *);
804extern enum xp_retval (*xpc_make_first_contact) (struct xpc_partition *);
805extern u64 (*xpc_get_chctl_all_flags) (struct xpc_partition *);
806extern enum xp_retval (*xpc_setup_msg_structures) (struct xpc_channel *);
807extern void (*xpc_teardown_msg_structures) (struct xpc_channel *);
808extern void (*xpc_notify_senders_of_disconnect) (struct xpc_channel *);
809extern void (*xpc_process_msg_chctl_flags) (struct xpc_partition *, int);
810extern int (*xpc_n_of_deliverable_payloads) (struct xpc_channel *);
811extern void *(*xpc_get_deliverable_payload) (struct xpc_channel *);
812extern void (*xpc_request_partition_activation) (struct xpc_rsvd_page *,
813 unsigned long, int);
814extern void (*xpc_request_partition_reactivation) (struct xpc_partition *);
815extern void (*xpc_request_partition_deactivation) (struct xpc_partition *);
816extern void (*xpc_cancel_partition_deactivation_request) (
817 struct xpc_partition *);
818extern void (*xpc_process_activate_IRQ_rcvd) (void);
819extern enum xp_retval (*xpc_setup_ch_structures_sn) (struct xpc_partition *);
820extern void (*xpc_teardown_ch_structures_sn) (struct xpc_partition *);
821
822extern void (*xpc_indicate_partition_engaged) (struct xpc_partition *);
823extern int (*xpc_partition_engaged) (short);
824extern int (*xpc_any_partition_engaged) (void);
825extern void (*xpc_indicate_partition_disengaged) (struct xpc_partition *);
826extern void (*xpc_assume_partition_disengaged) (short);
827
828extern void (*xpc_send_chctl_closerequest) (struct xpc_channel *,
829 unsigned long *);
830extern void (*xpc_send_chctl_closereply) (struct xpc_channel *,
831 unsigned long *);
832extern void (*xpc_send_chctl_openrequest) (struct xpc_channel *,
833 unsigned long *);
834extern void (*xpc_send_chctl_openreply) (struct xpc_channel *, unsigned long *);
835
836extern void (*xpc_save_remote_msgqueue_pa) (struct xpc_channel *,
837 unsigned long);
838
839extern enum xp_retval (*xpc_send_payload) (struct xpc_channel *, u32, void *,
840 u16, u8, xpc_notify_func, void *);
841extern void (*xpc_received_payload) (struct xpc_channel *, void *);
842
843/* found in xpc_sn2.c */
844extern int xpc_init_sn2(void);
845extern void xpc_exit_sn2(void);
846
847/* found in xpc_uv.c */
848extern int xpc_init_uv(void);
849extern void xpc_exit_uv(void);
634 850
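The long run of function-pointer externs above is how this patch splits common XPC code from the sn2 and uv back ends: xpc_init_sn2()/xpc_init_uv() fill the pointers in and everything else calls through them. A minimal sketch of that pattern, with hypothetical names:

#include <stdio.h>

/* hypothetical per-arch hook, mirroring the function-pointer style above */
static void (*increment_heartbeat)(void);

static void increment_heartbeat_sn2(void) { puts("sn2: bump amo heartbeat"); }
static void increment_heartbeat_uv(void)  { puts("uv: bump memory heartbeat"); }

static int init_sn2(void) { increment_heartbeat = increment_heartbeat_sn2; return 0; }
static int init_uv(void)  { increment_heartbeat = increment_heartbeat_uv;  return 0; }

int main(void)
{
	/* at module init, exactly one arch back end registers its hooks */
	int is_uv = 1;	/* pretend we detected a UV system */

	if (is_uv)
		init_uv();
	else
		init_sn2();

	increment_heartbeat();	/* common code never cares which back end ran */
	return 0;
}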
635/* found in xpc_partition.c */ 851/* found in xpc_partition.c */
636extern int xpc_exiting; 852extern int xpc_exiting;
637extern struct xpc_vars *xpc_vars; 853extern int xpc_nasid_mask_nlongs;
638extern struct xpc_rsvd_page *xpc_rsvd_page; 854extern struct xpc_rsvd_page *xpc_rsvd_page;
639extern struct xpc_vars_part *xpc_vars_part; 855extern unsigned long *xpc_mach_nasids;
640extern struct xpc_partition xpc_partitions[XP_MAX_PARTITIONS + 1]; 856extern struct xpc_partition *xpc_partitions;
641extern char *xpc_remote_copy_buffer;
642extern void *xpc_remote_copy_buffer_base;
643extern void *xpc_kmalloc_cacheline_aligned(size_t, gfp_t, void **); 857extern void *xpc_kmalloc_cacheline_aligned(size_t, gfp_t, void **);
644extern struct xpc_rsvd_page *xpc_rsvd_page_init(void); 858extern int xpc_setup_rsvd_page(void);
645extern void xpc_allow_IPI_ops(void); 859extern void xpc_teardown_rsvd_page(void);
646extern void xpc_restrict_IPI_ops(void); 860extern int xpc_identify_activate_IRQ_sender(void);
647extern int xpc_identify_act_IRQ_sender(void);
648extern int xpc_partition_disengaged(struct xpc_partition *); 861extern int xpc_partition_disengaged(struct xpc_partition *);
649extern enum xp_retval xpc_mark_partition_active(struct xpc_partition *); 862extern enum xp_retval xpc_mark_partition_active(struct xpc_partition *);
650extern void xpc_mark_partition_inactive(struct xpc_partition *); 863extern void xpc_mark_partition_inactive(struct xpc_partition *);
651extern void xpc_discovery(void); 864extern void xpc_discovery(void);
652extern void xpc_check_remote_hb(void); 865extern enum xp_retval xpc_get_remote_rp(int, unsigned long *,
866 struct xpc_rsvd_page *,
867 unsigned long *);
653extern void xpc_deactivate_partition(const int, struct xpc_partition *, 868extern void xpc_deactivate_partition(const int, struct xpc_partition *,
654 enum xp_retval); 869 enum xp_retval);
655extern enum xp_retval xpc_initiate_partid_to_nasids(short, void *); 870extern enum xp_retval xpc_initiate_partid_to_nasids(short, void *);
@@ -657,21 +872,52 @@ extern enum xp_retval xpc_initiate_partid_to_nasids(short, void *);
657/* found in xpc_channel.c */ 872/* found in xpc_channel.c */
658extern void xpc_initiate_connect(int); 873extern void xpc_initiate_connect(int);
659extern void xpc_initiate_disconnect(int); 874extern void xpc_initiate_disconnect(int);
660extern enum xp_retval xpc_initiate_allocate(short, int, u32, void **); 875extern enum xp_retval xpc_allocate_msg_wait(struct xpc_channel *);
661extern enum xp_retval xpc_initiate_send(short, int, void *); 876extern enum xp_retval xpc_initiate_send(short, int, u32, void *, u16);
662extern enum xp_retval xpc_initiate_send_notify(short, int, void *, 877extern enum xp_retval xpc_initiate_send_notify(short, int, u32, void *, u16,
663 xpc_notify_func, void *); 878 xpc_notify_func, void *);
664extern void xpc_initiate_received(short, int, void *); 879extern void xpc_initiate_received(short, int, void *);
665extern enum xp_retval xpc_setup_infrastructure(struct xpc_partition *); 880extern void xpc_process_sent_chctl_flags(struct xpc_partition *);
666extern enum xp_retval xpc_pull_remote_vars_part(struct xpc_partition *);
667extern void xpc_process_channel_activity(struct xpc_partition *);
668extern void xpc_connected_callout(struct xpc_channel *); 881extern void xpc_connected_callout(struct xpc_channel *);
669extern void xpc_deliver_msg(struct xpc_channel *); 882extern void xpc_deliver_payload(struct xpc_channel *);
670extern void xpc_disconnect_channel(const int, struct xpc_channel *, 883extern void xpc_disconnect_channel(const int, struct xpc_channel *,
671 enum xp_retval, unsigned long *); 884 enum xp_retval, unsigned long *);
672extern void xpc_disconnect_callout(struct xpc_channel *, enum xp_retval); 885extern void xpc_disconnect_callout(struct xpc_channel *, enum xp_retval);
673extern void xpc_partition_going_down(struct xpc_partition *, enum xp_retval); 886extern void xpc_partition_going_down(struct xpc_partition *, enum xp_retval);
674extern void xpc_teardown_infrastructure(struct xpc_partition *); 887
888static inline int
889xpc_hb_allowed(short partid, void *heartbeating_to_mask)
890{
891 return test_bit(partid, heartbeating_to_mask);
892}
893
894static inline int
895xpc_any_hbs_allowed(void)
896{
897 DBUG_ON(xpc_heartbeating_to_mask == NULL);
898 return !bitmap_empty(xpc_heartbeating_to_mask, xp_max_npartitions);
899}
900
901static inline void
902xpc_allow_hb(short partid)
903{
904 DBUG_ON(xpc_heartbeating_to_mask == NULL);
905 set_bit(partid, xpc_heartbeating_to_mask);
906}
907
908static inline void
909xpc_disallow_hb(short partid)
910{
911 DBUG_ON(xpc_heartbeating_to_mask == NULL);
912 clear_bit(partid, xpc_heartbeating_to_mask);
913}
914
915static inline void
916xpc_disallow_all_hbs(void)
917{
918 DBUG_ON(xpc_heartbeating_to_mask == NULL);
919 bitmap_zero(xpc_heartbeating_to_mask, xp_max_npartitions);
920}
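The heartbeat helpers above are thin wrappers around a partition-id bitmap. A userspace sketch of the same idea (MAX_NPARTITIONS here is illustrative, not a value from the patch):

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_NPARTITIONS 256	/* stand-in for xp_max_npartitions */
#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)
#define MASK_NLONGS ((MAX_NPARTITIONS + BITS_PER_LONG - 1) / BITS_PER_LONG)

static unsigned long hb_mask[MASK_NLONGS];

static void allow_hb(int partid)
{
	hb_mask[partid / BITS_PER_LONG] |= 1UL << (partid % BITS_PER_LONG);
}

static void disallow_hb(int partid)
{
	hb_mask[partid / BITS_PER_LONG] &= ~(1UL << (partid % BITS_PER_LONG));
}

static bool hb_allowed(int partid)
{
	return (hb_mask[partid / BITS_PER_LONG] >> (partid % BITS_PER_LONG)) & 1;
}

int main(void)
{
	allow_hb(3);
	allow_hb(70);
	disallow_hb(3);
	printf("partid 3: %d, partid 70: %d\n", hb_allowed(3), hb_allowed(70));
	return 0;
}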
675 921
676static inline void 922static inline void
677xpc_wakeup_channel_mgr(struct xpc_partition *part) 923xpc_wakeup_channel_mgr(struct xpc_partition *part)
@@ -713,7 +959,7 @@ xpc_part_deref(struct xpc_partition *part)
713 s32 refs = atomic_dec_return(&part->references); 959 s32 refs = atomic_dec_return(&part->references);
714 960
715 DBUG_ON(refs < 0); 961 DBUG_ON(refs < 0);
716 if (refs == 0 && part->setup_state == XPC_P_WTEARDOWN) 962 if (refs == 0 && part->setup_state == XPC_P_SS_WTEARDOWN)
717 wake_up(&part->teardown_wq); 963 wake_up(&part->teardown_wq);
718} 964}
719 965
@@ -723,7 +969,7 @@ xpc_part_ref(struct xpc_partition *part)
723 int setup; 969 int setup;
724 970
725 atomic_inc(&part->references); 971 atomic_inc(&part->references);
726 setup = (part->setup_state == XPC_P_SETUP); 972 setup = (part->setup_state == XPC_P_SS_SETUP);
727 if (!setup) 973 if (!setup)
728 xpc_part_deref(part); 974 xpc_part_deref(part);
729 975
@@ -741,416 +987,4 @@ xpc_part_ref(struct xpc_partition *part)
741 (_p)->reason_line = _line; \ 987 (_p)->reason_line = _line; \
742 } 988 }
743 989
744/*
745 * This next set of inlines are used to keep track of when a partition is
746 * potentially engaged in accessing memory belonging to another partition.
747 */
748
749static inline void
750xpc_mark_partition_engaged(struct xpc_partition *part)
751{
752 unsigned long irq_flags;
753 AMO_t *amo = (AMO_t *)__va(part->remote_amos_page_pa +
754 (XPC_ENGAGED_PARTITIONS_AMO *
755 sizeof(AMO_t)));
756
757 local_irq_save(irq_flags);
758
759 /* set bit corresponding to our partid in remote partition's AMO */
760 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR,
761 (1UL << sn_partition_id));
762 /*
763 * We must always use the nofault function regardless of whether we
764 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
765 * didn't, we'd never know that the other partition is down and would
766 * keep sending IPIs and AMOs to it until the heartbeat times out.
767 */
768 (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
769 variable),
770 xp_nofault_PIOR_target));
771
772 local_irq_restore(irq_flags);
773}
774
775static inline void
776xpc_mark_partition_disengaged(struct xpc_partition *part)
777{
778 unsigned long irq_flags;
779 AMO_t *amo = (AMO_t *)__va(part->remote_amos_page_pa +
780 (XPC_ENGAGED_PARTITIONS_AMO *
781 sizeof(AMO_t)));
782
783 local_irq_save(irq_flags);
784
785 /* clear bit corresponding to our partid in remote partition's AMO */
786 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
787 ~(1UL << sn_partition_id));
788 /*
789 * We must always use the nofault function regardless of whether we
790 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
791 * didn't, we'd never know that the other partition is down and would
792 * keep sending IPIs and AMOs to it until the heartbeat times out.
793 */
794 (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
795 variable),
796 xp_nofault_PIOR_target));
797
798 local_irq_restore(irq_flags);
799}
800
801static inline void
802xpc_request_partition_disengage(struct xpc_partition *part)
803{
804 unsigned long irq_flags;
805 AMO_t *amo = (AMO_t *)__va(part->remote_amos_page_pa +
806 (XPC_DISENGAGE_REQUEST_AMO * sizeof(AMO_t)));
807
808 local_irq_save(irq_flags);
809
810 /* set bit corresponding to our partid in remote partition's AMO */
811 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR,
812 (1UL << sn_partition_id));
813 /*
814 * We must always use the nofault function regardless of whether we
815 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
816 * didn't, we'd never know that the other partition is down and would
817 * keep sending IPIs and AMOs to it until the heartbeat times out.
818 */
819 (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
820 variable),
821 xp_nofault_PIOR_target));
822
823 local_irq_restore(irq_flags);
824}
825
826static inline void
827xpc_cancel_partition_disengage_request(struct xpc_partition *part)
828{
829 unsigned long irq_flags;
830 AMO_t *amo = (AMO_t *)__va(part->remote_amos_page_pa +
831 (XPC_DISENGAGE_REQUEST_AMO * sizeof(AMO_t)));
832
833 local_irq_save(irq_flags);
834
835 /* clear bit corresponding to our partid in remote partition's AMO */
836 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
837 ~(1UL << sn_partition_id));
838 /*
839 * We must always use the nofault function regardless of whether we
840 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
841 * didn't, we'd never know that the other partition is down and would
842 * keep sending IPIs and AMOs to it until the heartbeat times out.
843 */
844 (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
845 variable),
846 xp_nofault_PIOR_target));
847
848 local_irq_restore(irq_flags);
849}
850
851static inline u64
852xpc_partition_engaged(u64 partid_mask)
853{
854 AMO_t *amo = xpc_vars->amos_page + XPC_ENGAGED_PARTITIONS_AMO;
855
856 /* return our partition's AMO variable ANDed with partid_mask */
857 return (FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_LOAD) &
858 partid_mask);
859}
860
861static inline u64
862xpc_partition_disengage_requested(u64 partid_mask)
863{
864 AMO_t *amo = xpc_vars->amos_page + XPC_DISENGAGE_REQUEST_AMO;
865
866 /* return our partition's AMO variable ANDed with partid_mask */
867 return (FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_LOAD) &
868 partid_mask);
869}
870
871static inline void
872xpc_clear_partition_engaged(u64 partid_mask)
873{
874 AMO_t *amo = xpc_vars->amos_page + XPC_ENGAGED_PARTITIONS_AMO;
875
876 /* clear bit(s) based on partid_mask in our partition's AMO */
877 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
878 ~partid_mask);
879}
880
881static inline void
882xpc_clear_partition_disengage_request(u64 partid_mask)
883{
884 AMO_t *amo = xpc_vars->amos_page + XPC_DISENGAGE_REQUEST_AMO;
885
886 /* clear bit(s) based on partid_mask in our partition's AMO */
887 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
888 ~partid_mask);
889}
890
891/*
892 * The following set of macros and inlines are used for the sending and
893 * receiving of IPIs (also known as IRQs). There are two flavors of IPIs,
894 * one that is associated with partition activity (SGI_XPC_ACTIVATE) and
895 * the other that is associated with channel activity (SGI_XPC_NOTIFY).
896 */
897
898static inline u64
899xpc_IPI_receive(AMO_t *amo)
900{
901 return FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_CLEAR);
902}
903
904static inline enum xp_retval
905xpc_IPI_send(AMO_t *amo, u64 flag, int nasid, int phys_cpuid, int vector)
906{
907 int ret = 0;
908 unsigned long irq_flags;
909
910 local_irq_save(irq_flags);
911
912 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR, flag);
913 sn_send_IPI_phys(nasid, phys_cpuid, vector, 0);
914
915 /*
916 * We must always use the nofault function regardless of whether we
917 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
918 * didn't, we'd never know that the other partition is down and would
919 * keep sending IPIs and AMOs to it until the heartbeat times out.
920 */
921 ret = xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->variable),
922 xp_nofault_PIOR_target));
923
924 local_irq_restore(irq_flags);
925
926 return ((ret == 0) ? xpSuccess : xpPioReadError);
927}
928
929/*
930 * IPIs associated with SGI_XPC_ACTIVATE IRQ.
931 */
932
933/*
934 * Flag the appropriate AMO variable and send an IPI to the specified node.
935 */
936static inline void
937xpc_activate_IRQ_send(u64 amos_page_pa, int from_nasid, int to_nasid,
938 int to_phys_cpuid)
939{
940 int w_index = XPC_NASID_W_INDEX(from_nasid);
941 int b_index = XPC_NASID_B_INDEX(from_nasid);
942 AMO_t *amos = (AMO_t *)__va(amos_page_pa +
943 (XPC_ACTIVATE_IRQ_AMOS * sizeof(AMO_t)));
944
945 (void)xpc_IPI_send(&amos[w_index], (1UL << b_index), to_nasid,
946 to_phys_cpuid, SGI_XPC_ACTIVATE);
947}
948
949static inline void
950xpc_IPI_send_activate(struct xpc_vars *vars)
951{
952 xpc_activate_IRQ_send(vars->amos_page_pa, cnodeid_to_nasid(0),
953 vars->act_nasid, vars->act_phys_cpuid);
954}
955
956static inline void
957xpc_IPI_send_activated(struct xpc_partition *part)
958{
959 xpc_activate_IRQ_send(part->remote_amos_page_pa, cnodeid_to_nasid(0),
960 part->remote_act_nasid,
961 part->remote_act_phys_cpuid);
962}
963
964static inline void
965xpc_IPI_send_reactivate(struct xpc_partition *part)
966{
967 xpc_activate_IRQ_send(xpc_vars->amos_page_pa, part->reactivate_nasid,
968 xpc_vars->act_nasid, xpc_vars->act_phys_cpuid);
969}
970
971static inline void
972xpc_IPI_send_disengage(struct xpc_partition *part)
973{
974 xpc_activate_IRQ_send(part->remote_amos_page_pa, cnodeid_to_nasid(0),
975 part->remote_act_nasid,
976 part->remote_act_phys_cpuid);
977}
978
979/*
980 * IPIs associated with SGI_XPC_NOTIFY IRQ.
981 */
982
983/*
984 * Send an IPI to the remote partition that is associated with the
985 * specified channel.
986 */
987#define XPC_NOTIFY_IRQ_SEND(_ch, _ipi_f, _irq_f) \
988 xpc_notify_IRQ_send(_ch, _ipi_f, #_ipi_f, _irq_f)
989
990static inline void
991xpc_notify_IRQ_send(struct xpc_channel *ch, u8 ipi_flag, char *ipi_flag_string,
992 unsigned long *irq_flags)
993{
994 struct xpc_partition *part = &xpc_partitions[ch->partid];
995 enum xp_retval ret;
996
997 if (likely(part->act_state != XPC_P_DEACTIVATING)) {
998 ret = xpc_IPI_send(part->remote_IPI_amo_va,
999 (u64)ipi_flag << (ch->number * 8),
1000 part->remote_IPI_nasid,
1001 part->remote_IPI_phys_cpuid, SGI_XPC_NOTIFY);
1002 dev_dbg(xpc_chan, "%s sent to partid=%d, channel=%d, ret=%d\n",
1003 ipi_flag_string, ch->partid, ch->number, ret);
1004 if (unlikely(ret != xpSuccess)) {
1005 if (irq_flags != NULL)
1006 spin_unlock_irqrestore(&ch->lock, *irq_flags);
1007 XPC_DEACTIVATE_PARTITION(part, ret);
1008 if (irq_flags != NULL)
1009 spin_lock_irqsave(&ch->lock, *irq_flags);
1010 }
1011 }
1012}
1013
1014/*
1015 * Make it look like the remote partition, which is associated with the
1016 * specified channel, sent us an IPI. This faked IPI will be handled
1017 * by xpc_dropped_IPI_check().
1018 */
1019#define XPC_NOTIFY_IRQ_SEND_LOCAL(_ch, _ipi_f) \
1020 xpc_notify_IRQ_send_local(_ch, _ipi_f, #_ipi_f)
1021
1022static inline void
1023xpc_notify_IRQ_send_local(struct xpc_channel *ch, u8 ipi_flag,
1024 char *ipi_flag_string)
1025{
1026 struct xpc_partition *part = &xpc_partitions[ch->partid];
1027
1028 FETCHOP_STORE_OP(TO_AMO((u64)&part->local_IPI_amo_va->variable),
1029 FETCHOP_OR, ((u64)ipi_flag << (ch->number * 8)));
1030 dev_dbg(xpc_chan, "%s sent local from partid=%d, channel=%d\n",
1031 ipi_flag_string, ch->partid, ch->number);
1032}
1033
1034/*
1035 * The sending and receiving of IPIs includes the setting of an AMO variable
1036 * to indicate the reason the IPI was sent. The 64-bit variable is divided
1037 * up into eight bytes, ordered from right to left. Byte zero pertains to
1038 * channel 0, byte one to channel 1, and so on. Each byte is described by
1039 * the following IPI flags.
1040 */
1041
1042#define XPC_IPI_CLOSEREQUEST 0x01
1043#define XPC_IPI_CLOSEREPLY 0x02
1044#define XPC_IPI_OPENREQUEST 0x04
1045#define XPC_IPI_OPENREPLY 0x08
1046#define XPC_IPI_MSGREQUEST 0x10
1047
1048/* given an AMO variable and a channel#, get its associated IPI flags */
1049#define XPC_GET_IPI_FLAGS(_amo, _c) ((u8) (((_amo) >> ((_c) * 8)) & 0xff))
1050#define XPC_SET_IPI_FLAGS(_amo, _c, _f) (_amo) |= ((u64) (_f) << ((_c) * 8))
1051
1052#define XPC_ANY_OPENCLOSE_IPI_FLAGS_SET(_amo) ((_amo) & 0x0f0f0f0f0f0f0f0fUL)
1053#define XPC_ANY_MSG_IPI_FLAGS_SET(_amo) ((_amo) & 0x1010101010101010UL)
1054
1055static inline void
1056xpc_IPI_send_closerequest(struct xpc_channel *ch, unsigned long *irq_flags)
1057{
1058 struct xpc_openclose_args *args = ch->local_openclose_args;
1059
1060 args->reason = ch->reason;
1061
1062 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_CLOSEREQUEST, irq_flags);
1063}
1064
1065static inline void
1066xpc_IPI_send_closereply(struct xpc_channel *ch, unsigned long *irq_flags)
1067{
1068 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_CLOSEREPLY, irq_flags);
1069}
1070
1071static inline void
1072xpc_IPI_send_openrequest(struct xpc_channel *ch, unsigned long *irq_flags)
1073{
1074 struct xpc_openclose_args *args = ch->local_openclose_args;
1075
1076 args->msg_size = ch->msg_size;
1077 args->local_nentries = ch->local_nentries;
1078
1079 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_OPENREQUEST, irq_flags);
1080}
1081
1082static inline void
1083xpc_IPI_send_openreply(struct xpc_channel *ch, unsigned long *irq_flags)
1084{
1085 struct xpc_openclose_args *args = ch->local_openclose_args;
1086
1087 args->remote_nentries = ch->remote_nentries;
1088 args->local_nentries = ch->local_nentries;
1089 args->local_msgqueue_pa = __pa(ch->local_msgqueue);
1090
1091 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_OPENREPLY, irq_flags);
1092}
1093
1094static inline void
1095xpc_IPI_send_msgrequest(struct xpc_channel *ch)
1096{
1097 XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_MSGREQUEST, NULL);
1098}
1099
1100static inline void
1101xpc_IPI_send_local_msgrequest(struct xpc_channel *ch)
1102{
1103 XPC_NOTIFY_IRQ_SEND_LOCAL(ch, XPC_IPI_MSGREQUEST);
1104}
1105
1106/*
1107 * Memory for XPC's AMO variables is allocated by the MSPEC driver. These
1108 * pages are located in the lowest granule. The lowest granule uses 4k pages
1109 * for cached references and an alternate TLB handler so that a cacheable
1110 * mapping is never provided for the entire region. This prevents speculative
1111 * reads of cached copies of our lines from being issued, which would cause
1112 * the SHUB to generate a PI FSB Protocol error. For XPC, we need 64
1113 * AMO variables (based on XP_MAX_PARTITIONS) for message notification and an
1114 * additional 128 AMO variables (based on XP_NASID_MASK_WORDS) for partition
1115 * activation and 2 AMO variables for partition deactivation.
1116 */
1117static inline AMO_t *
1118xpc_IPI_init(int index)
1119{
1120 AMO_t *amo = xpc_vars->amos_page + index;
1121
1122 (void)xpc_IPI_receive(amo); /* clear AMO variable */
1123 return amo;
1124}
1125
1126static inline enum xp_retval
1127xpc_map_bte_errors(bte_result_t error)
1128{
1129 return ((error == BTE_SUCCESS) ? xpSuccess : xpBteCopyError);
1130}
1131
1132/*
1133 * Check to see if there is any channel activity to/from the specified
1134 * partition.
1135 */
1136static inline void
1137xpc_check_for_channel_activity(struct xpc_partition *part)
1138{
1139 u64 IPI_amo;
1140 unsigned long irq_flags;
1141
1142 IPI_amo = xpc_IPI_receive(part->local_IPI_amo_va);
1143 if (IPI_amo == 0)
1144 return;
1145
1146 spin_lock_irqsave(&part->IPI_lock, irq_flags);
1147 part->local_IPI_amo |= IPI_amo;
1148 spin_unlock_irqrestore(&part->IPI_lock, irq_flags);
1149
1150 dev_dbg(xpc_chan, "received IPI from partid=%d, IPI_amo=0x%lx\n",
1151 XPC_PARTID(part), IPI_amo);
1152
1153 xpc_wakeup_channel_mgr(part);
1154}
1155
1156#endif /* _DRIVERS_MISC_SGIXP_XPC_H */ 990#endif /* _DRIVERS_MISC_SGIXP_XPC_H */
diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c
index 9c90c2d55c08..9cd2ebe2a3b6 100644
--- a/drivers/misc/sgi-xp/xpc_channel.c
+++ b/drivers/misc/sgi-xp/xpc_channel.c
@@ -14,536 +14,10 @@
14 * 14 *
15 */ 15 */
16 16
17#include <linux/kernel.h> 17#include <linux/device.h>
18#include <linux/init.h>
19#include <linux/sched.h>
20#include <linux/cache.h>
21#include <linux/interrupt.h>
22#include <linux/mutex.h>
23#include <linux/completion.h>
24#include <asm/sn/bte.h>
25#include <asm/sn/sn_sal.h>
26#include "xpc.h" 18#include "xpc.h"
27 19
28/* 20/*
29 * Guarantee that the kzalloc'd memory is cacheline aligned.
30 */
31static void *
32xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
33{
 34 /* see if kzalloc will give us cacheline aligned memory by default */
35 *base = kzalloc(size, flags);
36 if (*base == NULL)
37 return NULL;
38
39 if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
40 return *base;
41
42 kfree(*base);
43
44 /* nope, we'll have to do it ourselves */
45 *base = kzalloc(size + L1_CACHE_BYTES, flags);
46 if (*base == NULL)
47 return NULL;
48
49 return (void *)L1_CACHE_ALIGN((u64)*base);
50}
51
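The helper being removed above (it survives in common code; xpc_kzalloc_cacheline_aligned is declared as an extern earlier in this patch) uses a simple try-then-over-allocate trick. A userspace sketch, with CACHE_BYTES assumed for illustration:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define CACHE_BYTES 128	/* assumed line size for the sketch */
#define CACHE_ALIGN(p) (((uintptr_t)(p) + CACHE_BYTES - 1) & ~(uintptr_t)(CACHE_BYTES - 1))

/* same idea as the helper above: take the cheap path if the allocator already
 * returned aligned memory, otherwise over-allocate and round up */
static void *zalloc_cacheline_aligned(size_t size, void **base)
{
	*base = calloc(1, size);
	if (*base == NULL)
		return NULL;
	if ((uintptr_t)*base == CACHE_ALIGN(*base))
		return *base;

	free(*base);
	*base = calloc(1, size + CACHE_BYTES);
	if (*base == NULL)
		return NULL;
	return (void *)CACHE_ALIGN(*base);
}

int main(void)
{
	void *base;
	void *p = zalloc_cacheline_aligned(1000, &base);

	printf("aligned=%p base=%p\n", p, base);
	free(base);	/* always free the base pointer, not the aligned one */
	return 0;
}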
52/*
53 * Set up the initial values for the XPartition Communication channels.
54 */
55static void
56xpc_initialize_channels(struct xpc_partition *part, short partid)
57{
58 int ch_number;
59 struct xpc_channel *ch;
60
61 for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
62 ch = &part->channels[ch_number];
63
64 ch->partid = partid;
65 ch->number = ch_number;
66 ch->flags = XPC_C_DISCONNECTED;
67
68 ch->local_GP = &part->local_GPs[ch_number];
69 ch->local_openclose_args =
70 &part->local_openclose_args[ch_number];
71
72 atomic_set(&ch->kthreads_assigned, 0);
73 atomic_set(&ch->kthreads_idle, 0);
74 atomic_set(&ch->kthreads_active, 0);
75
76 atomic_set(&ch->references, 0);
77 atomic_set(&ch->n_to_notify, 0);
78
79 spin_lock_init(&ch->lock);
80 mutex_init(&ch->msg_to_pull_mutex);
81 init_completion(&ch->wdisconnect_wait);
82
83 atomic_set(&ch->n_on_msg_allocate_wq, 0);
84 init_waitqueue_head(&ch->msg_allocate_wq);
85 init_waitqueue_head(&ch->idle_wq);
86 }
87}
88
89/*
90 * Setup the infrastructure necessary to support XPartition Communication
91 * between the specified remote partition and the local one.
92 */
93enum xp_retval
94xpc_setup_infrastructure(struct xpc_partition *part)
95{
96 int ret, cpuid;
97 struct timer_list *timer;
98 short partid = XPC_PARTID(part);
99
100 /*
101 * Zero out MOST of the entry for this partition. Only the fields
102 * starting with `nchannels' will be zeroed. The preceding fields must
103 * remain `viable' across partition ups and downs, since they may be
104 * referenced during this memset() operation.
105 */
106 memset(&part->nchannels, 0, sizeof(struct xpc_partition) -
107 offsetof(struct xpc_partition, nchannels));
108
109 /*
110 * Allocate all of the channel structures as a contiguous chunk of
111 * memory.
112 */
113 part->channels = kzalloc(sizeof(struct xpc_channel) * XPC_NCHANNELS,
114 GFP_KERNEL);
115 if (part->channels == NULL) {
116 dev_err(xpc_chan, "can't get memory for channels\n");
117 return xpNoMemory;
118 }
119
120 part->nchannels = XPC_NCHANNELS;
121
122 /* allocate all the required GET/PUT values */
123
124 part->local_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE,
125 GFP_KERNEL,
126 &part->local_GPs_base);
127 if (part->local_GPs == NULL) {
128 kfree(part->channels);
129 part->channels = NULL;
130 dev_err(xpc_chan, "can't get memory for local get/put "
131 "values\n");
132 return xpNoMemory;
133 }
134
135 part->remote_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE,
136 GFP_KERNEL,
137 &part->
138 remote_GPs_base);
139 if (part->remote_GPs == NULL) {
140 dev_err(xpc_chan, "can't get memory for remote get/put "
141 "values\n");
142 kfree(part->local_GPs_base);
143 part->local_GPs = NULL;
144 kfree(part->channels);
145 part->channels = NULL;
146 return xpNoMemory;
147 }
148
149 /* allocate all the required open and close args */
150
151 part->local_openclose_args =
152 xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL,
153 &part->local_openclose_args_base);
154 if (part->local_openclose_args == NULL) {
155 dev_err(xpc_chan, "can't get memory for local connect args\n");
156 kfree(part->remote_GPs_base);
157 part->remote_GPs = NULL;
158 kfree(part->local_GPs_base);
159 part->local_GPs = NULL;
160 kfree(part->channels);
161 part->channels = NULL;
162 return xpNoMemory;
163 }
164
165 part->remote_openclose_args =
166 xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL,
167 &part->remote_openclose_args_base);
168 if (part->remote_openclose_args == NULL) {
169 dev_err(xpc_chan, "can't get memory for remote connect args\n");
170 kfree(part->local_openclose_args_base);
171 part->local_openclose_args = NULL;
172 kfree(part->remote_GPs_base);
173 part->remote_GPs = NULL;
174 kfree(part->local_GPs_base);
175 part->local_GPs = NULL;
176 kfree(part->channels);
177 part->channels = NULL;
178 return xpNoMemory;
179 }
180
181 xpc_initialize_channels(part, partid);
182
183 atomic_set(&part->nchannels_active, 0);
184 atomic_set(&part->nchannels_engaged, 0);
185
 186 /* local_IPI_amo was set to 0 by an earlier memset() */
187
 188 /* Initialize this partition's AMO_t structure */
189 part->local_IPI_amo_va = xpc_IPI_init(partid);
190
191 spin_lock_init(&part->IPI_lock);
192
193 atomic_set(&part->channel_mgr_requests, 1);
194 init_waitqueue_head(&part->channel_mgr_wq);
195
196 sprintf(part->IPI_owner, "xpc%02d", partid);
197 ret = request_irq(SGI_XPC_NOTIFY, xpc_notify_IRQ_handler, IRQF_SHARED,
198 part->IPI_owner, (void *)(u64)partid);
199 if (ret != 0) {
200 dev_err(xpc_chan, "can't register NOTIFY IRQ handler, "
201 "errno=%d\n", -ret);
202 kfree(part->remote_openclose_args_base);
203 part->remote_openclose_args = NULL;
204 kfree(part->local_openclose_args_base);
205 part->local_openclose_args = NULL;
206 kfree(part->remote_GPs_base);
207 part->remote_GPs = NULL;
208 kfree(part->local_GPs_base);
209 part->local_GPs = NULL;
210 kfree(part->channels);
211 part->channels = NULL;
212 return xpLackOfResources;
213 }
214
215 /* Setup a timer to check for dropped IPIs */
216 timer = &part->dropped_IPI_timer;
217 init_timer(timer);
218 timer->function = (void (*)(unsigned long))xpc_dropped_IPI_check;
219 timer->data = (unsigned long)part;
220 timer->expires = jiffies + XPC_P_DROPPED_IPI_WAIT;
221 add_timer(timer);
222
223 /*
224 * With the setting of the partition setup_state to XPC_P_SETUP, we're
225 * declaring that this partition is ready to go.
226 */
227 part->setup_state = XPC_P_SETUP;
228
229 /*
230 * Setup the per partition specific variables required by the
231 * remote partition to establish channel connections with us.
232 *
233 * The setting of the magic # indicates that these per partition
234 * specific variables are ready to be used.
235 */
236 xpc_vars_part[partid].GPs_pa = __pa(part->local_GPs);
237 xpc_vars_part[partid].openclose_args_pa =
238 __pa(part->local_openclose_args);
239 xpc_vars_part[partid].IPI_amo_pa = __pa(part->local_IPI_amo_va);
240 cpuid = raw_smp_processor_id(); /* any CPU in this partition will do */
241 xpc_vars_part[partid].IPI_nasid = cpuid_to_nasid(cpuid);
242 xpc_vars_part[partid].IPI_phys_cpuid = cpu_physical_id(cpuid);
243 xpc_vars_part[partid].nchannels = part->nchannels;
244 xpc_vars_part[partid].magic = XPC_VP_MAGIC1;
245
246 return xpSuccess;
247}
248
249/*
250 * Create a wrapper that hides the underlying mechanism for pulling a cacheline
251 * (or multiple cachelines) from a remote partition.
252 *
253 * src must be a cacheline aligned physical address on the remote partition.
254 * dst must be a cacheline aligned virtual address on this partition.
 255 * cnt must be a multiple of the cacheline size
256 */
257static enum xp_retval
258xpc_pull_remote_cachelines(struct xpc_partition *part, void *dst,
259 const void *src, size_t cnt)
260{
261 bte_result_t bte_ret;
262
263 DBUG_ON((u64)src != L1_CACHE_ALIGN((u64)src));
264 DBUG_ON((u64)dst != L1_CACHE_ALIGN((u64)dst));
265 DBUG_ON(cnt != L1_CACHE_ALIGN(cnt));
266
267 if (part->act_state == XPC_P_DEACTIVATING)
268 return part->reason;
269
270 bte_ret = xp_bte_copy((u64)src, (u64)dst, (u64)cnt,
271 (BTE_NORMAL | BTE_WACQUIRE), NULL);
272 if (bte_ret == BTE_SUCCESS)
273 return xpSuccess;
274
275 dev_dbg(xpc_chan, "xp_bte_copy() from partition %d failed, ret=%d\n",
276 XPC_PARTID(part), bte_ret);
277
278 return xpc_map_bte_errors(bte_ret);
279}
280
281/*
282 * Pull the remote per partition specific variables from the specified
283 * partition.
284 */
285enum xp_retval
286xpc_pull_remote_vars_part(struct xpc_partition *part)
287{
288 u8 buffer[L1_CACHE_BYTES * 2];
289 struct xpc_vars_part *pulled_entry_cacheline =
290 (struct xpc_vars_part *)L1_CACHE_ALIGN((u64)buffer);
291 struct xpc_vars_part *pulled_entry;
292 u64 remote_entry_cacheline_pa, remote_entry_pa;
293 short partid = XPC_PARTID(part);
294 enum xp_retval ret;
295
296 /* pull the cacheline that contains the variables we're interested in */
297
298 DBUG_ON(part->remote_vars_part_pa !=
299 L1_CACHE_ALIGN(part->remote_vars_part_pa));
300 DBUG_ON(sizeof(struct xpc_vars_part) != L1_CACHE_BYTES / 2);
301
302 remote_entry_pa = part->remote_vars_part_pa +
303 sn_partition_id * sizeof(struct xpc_vars_part);
304
305 remote_entry_cacheline_pa = (remote_entry_pa & ~(L1_CACHE_BYTES - 1));
306
307 pulled_entry = (struct xpc_vars_part *)((u64)pulled_entry_cacheline +
308 (remote_entry_pa &
309 (L1_CACHE_BYTES - 1)));
310
311 ret = xpc_pull_remote_cachelines(part, pulled_entry_cacheline,
312 (void *)remote_entry_cacheline_pa,
313 L1_CACHE_BYTES);
314 if (ret != xpSuccess) {
315 dev_dbg(xpc_chan, "failed to pull XPC vars_part from "
316 "partition %d, ret=%d\n", partid, ret);
317 return ret;
318 }
319
320 /* see if they've been set up yet */
321
322 if (pulled_entry->magic != XPC_VP_MAGIC1 &&
323 pulled_entry->magic != XPC_VP_MAGIC2) {
324
325 if (pulled_entry->magic != 0) {
326 dev_dbg(xpc_chan, "partition %d's XPC vars_part for "
327 "partition %d has bad magic value (=0x%lx)\n",
328 partid, sn_partition_id, pulled_entry->magic);
329 return xpBadMagic;
330 }
331
332 /* they've not been initialized yet */
333 return xpRetry;
334 }
335
336 if (xpc_vars_part[partid].magic == XPC_VP_MAGIC1) {
337
338 /* validate the variables */
339
340 if (pulled_entry->GPs_pa == 0 ||
341 pulled_entry->openclose_args_pa == 0 ||
342 pulled_entry->IPI_amo_pa == 0) {
343
344 dev_err(xpc_chan, "partition %d's XPC vars_part for "
345 "partition %d are not valid\n", partid,
346 sn_partition_id);
347 return xpInvalidAddress;
348 }
349
350 /* the variables we imported look to be valid */
351
352 part->remote_GPs_pa = pulled_entry->GPs_pa;
353 part->remote_openclose_args_pa =
354 pulled_entry->openclose_args_pa;
355 part->remote_IPI_amo_va =
356 (AMO_t *)__va(pulled_entry->IPI_amo_pa);
357 part->remote_IPI_nasid = pulled_entry->IPI_nasid;
358 part->remote_IPI_phys_cpuid = pulled_entry->IPI_phys_cpuid;
359
360 if (part->nchannels > pulled_entry->nchannels)
361 part->nchannels = pulled_entry->nchannels;
362
363 /* let the other side know that we've pulled their variables */
364
365 xpc_vars_part[partid].magic = XPC_VP_MAGIC2;
366 }
367
368 if (pulled_entry->magic == XPC_VP_MAGIC1)
369 return xpRetry;
370
371 return xpSuccess;
372}
373
374/*
375 * Get the IPI flags and pull the openclose args and/or remote GPs as needed.
376 */
377static u64
378xpc_get_IPI_flags(struct xpc_partition *part)
379{
380 unsigned long irq_flags;
381 u64 IPI_amo;
382 enum xp_retval ret;
383
384 /*
385 * See if there are any IPI flags to be handled.
386 */
387
388 spin_lock_irqsave(&part->IPI_lock, irq_flags);
389 IPI_amo = part->local_IPI_amo;
390 if (IPI_amo != 0)
391 part->local_IPI_amo = 0;
392
393 spin_unlock_irqrestore(&part->IPI_lock, irq_flags);
394
395 if (XPC_ANY_OPENCLOSE_IPI_FLAGS_SET(IPI_amo)) {
396 ret = xpc_pull_remote_cachelines(part,
397 part->remote_openclose_args,
398 (void *)part->
399 remote_openclose_args_pa,
400 XPC_OPENCLOSE_ARGS_SIZE);
401 if (ret != xpSuccess) {
402 XPC_DEACTIVATE_PARTITION(part, ret);
403
404 dev_dbg(xpc_chan, "failed to pull openclose args from "
405 "partition %d, ret=%d\n", XPC_PARTID(part),
406 ret);
407
408 /* don't bother processing IPIs anymore */
409 IPI_amo = 0;
410 }
411 }
412
413 if (XPC_ANY_MSG_IPI_FLAGS_SET(IPI_amo)) {
414 ret = xpc_pull_remote_cachelines(part, part->remote_GPs,
415 (void *)part->remote_GPs_pa,
416 XPC_GP_SIZE);
417 if (ret != xpSuccess) {
418 XPC_DEACTIVATE_PARTITION(part, ret);
419
420 dev_dbg(xpc_chan, "failed to pull GPs from partition "
421 "%d, ret=%d\n", XPC_PARTID(part), ret);
422
423 /* don't bother processing IPIs anymore */
424 IPI_amo = 0;
425 }
426 }
427
428 return IPI_amo;
429}
430
431/*
432 * Allocate the local message queue and the notify queue.
433 */
434static enum xp_retval
435xpc_allocate_local_msgqueue(struct xpc_channel *ch)
436{
437 unsigned long irq_flags;
438 int nentries;
439 size_t nbytes;
440
441 for (nentries = ch->local_nentries; nentries > 0; nentries--) {
442
443 nbytes = nentries * ch->msg_size;
444 ch->local_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes,
445 GFP_KERNEL,
446 &ch->local_msgqueue_base);
447 if (ch->local_msgqueue == NULL)
448 continue;
449
450 nbytes = nentries * sizeof(struct xpc_notify);
451 ch->notify_queue = kzalloc(nbytes, GFP_KERNEL);
452 if (ch->notify_queue == NULL) {
453 kfree(ch->local_msgqueue_base);
454 ch->local_msgqueue = NULL;
455 continue;
456 }
457
458 spin_lock_irqsave(&ch->lock, irq_flags);
459 if (nentries < ch->local_nentries) {
460 dev_dbg(xpc_chan, "nentries=%d local_nentries=%d, "
461 "partid=%d, channel=%d\n", nentries,
462 ch->local_nentries, ch->partid, ch->number);
463
464 ch->local_nentries = nentries;
465 }
466 spin_unlock_irqrestore(&ch->lock, irq_flags);
467 return xpSuccess;
468 }
469
470 dev_dbg(xpc_chan, "can't get memory for local message queue and notify "
471 "queue, partid=%d, channel=%d\n", ch->partid, ch->number);
472 return xpNoMemory;
473}
474
475/*
476 * Allocate the cached remote message queue.
477 */
478static enum xp_retval
479xpc_allocate_remote_msgqueue(struct xpc_channel *ch)
480{
481 unsigned long irq_flags;
482 int nentries;
483 size_t nbytes;
484
485 DBUG_ON(ch->remote_nentries <= 0);
486
487 for (nentries = ch->remote_nentries; nentries > 0; nentries--) {
488
489 nbytes = nentries * ch->msg_size;
490 ch->remote_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes,
491 GFP_KERNEL,
492 &ch->remote_msgqueue_base);
493 if (ch->remote_msgqueue == NULL)
494 continue;
495
496 spin_lock_irqsave(&ch->lock, irq_flags);
497 if (nentries < ch->remote_nentries) {
498 dev_dbg(xpc_chan, "nentries=%d remote_nentries=%d, "
499 "partid=%d, channel=%d\n", nentries,
500 ch->remote_nentries, ch->partid, ch->number);
501
502 ch->remote_nentries = nentries;
503 }
504 spin_unlock_irqrestore(&ch->lock, irq_flags);
505 return xpSuccess;
506 }
507
508 dev_dbg(xpc_chan, "can't get memory for cached remote message queue, "
509 "partid=%d, channel=%d\n", ch->partid, ch->number);
510 return xpNoMemory;
511}
512
513/*
514 * Allocate message queues and other stuff associated with a channel.
515 *
516 * Note: Assumes all of the channel sizes are filled in.
517 */
518static enum xp_retval
519xpc_allocate_msgqueues(struct xpc_channel *ch)
520{
521 unsigned long irq_flags;
522 enum xp_retval ret;
523
524 DBUG_ON(ch->flags & XPC_C_SETUP);
525
526 ret = xpc_allocate_local_msgqueue(ch);
527 if (ret != xpSuccess)
528 return ret;
529
530 ret = xpc_allocate_remote_msgqueue(ch);
531 if (ret != xpSuccess) {
532 kfree(ch->local_msgqueue_base);
533 ch->local_msgqueue = NULL;
534 kfree(ch->notify_queue);
535 ch->notify_queue = NULL;
536 return ret;
537 }
538
539 spin_lock_irqsave(&ch->lock, irq_flags);
540 ch->flags |= XPC_C_SETUP;
541 spin_unlock_irqrestore(&ch->lock, irq_flags);
542
543 return xpSuccess;
544}
545
546/*
547 * Process a connect message from a remote partition. 21 * Process a connect message from a remote partition.
548 * 22 *
549 * Note: xpc_process_connect() is expecting to be called with the 23 * Note: xpc_process_connect() is expecting to be called with the
@@ -565,30 +39,29 @@ xpc_process_connect(struct xpc_channel *ch, unsigned long *irq_flags)
565 39
566 if (!(ch->flags & XPC_C_SETUP)) { 40 if (!(ch->flags & XPC_C_SETUP)) {
567 spin_unlock_irqrestore(&ch->lock, *irq_flags); 41 spin_unlock_irqrestore(&ch->lock, *irq_flags);
568 ret = xpc_allocate_msgqueues(ch); 42 ret = xpc_setup_msg_structures(ch);
569 spin_lock_irqsave(&ch->lock, *irq_flags); 43 spin_lock_irqsave(&ch->lock, *irq_flags);
570 44
571 if (ret != xpSuccess) 45 if (ret != xpSuccess)
572 XPC_DISCONNECT_CHANNEL(ch, ret, irq_flags); 46 XPC_DISCONNECT_CHANNEL(ch, ret, irq_flags);
573 47
48 ch->flags |= XPC_C_SETUP;
49
574 if (ch->flags & (XPC_C_CONNECTED | XPC_C_DISCONNECTING)) 50 if (ch->flags & (XPC_C_CONNECTED | XPC_C_DISCONNECTING))
575 return; 51 return;
576 52
577 DBUG_ON(!(ch->flags & XPC_C_SETUP));
578 DBUG_ON(ch->local_msgqueue == NULL); 53 DBUG_ON(ch->local_msgqueue == NULL);
579 DBUG_ON(ch->remote_msgqueue == NULL); 54 DBUG_ON(ch->remote_msgqueue == NULL);
580 } 55 }
581 56
582 if (!(ch->flags & XPC_C_OPENREPLY)) { 57 if (!(ch->flags & XPC_C_OPENREPLY)) {
583 ch->flags |= XPC_C_OPENREPLY; 58 ch->flags |= XPC_C_OPENREPLY;
584 xpc_IPI_send_openreply(ch, irq_flags); 59 xpc_send_chctl_openreply(ch, irq_flags);
585 } 60 }
586 61
587 if (!(ch->flags & XPC_C_ROPENREPLY)) 62 if (!(ch->flags & XPC_C_ROPENREPLY))
588 return; 63 return;
589 64
590 DBUG_ON(ch->remote_msgqueue_pa == 0);
591
592 ch->flags = (XPC_C_CONNECTED | XPC_C_SETUP); /* clear all else */ 65 ch->flags = (XPC_C_CONNECTED | XPC_C_SETUP); /* clear all else */
593 66
594 dev_info(xpc_chan, "channel %d to partition %d connected\n", 67 dev_info(xpc_chan, "channel %d to partition %d connected\n",
@@ -600,99 +73,6 @@ xpc_process_connect(struct xpc_channel *ch, unsigned long *irq_flags)
600} 73}
601 74
602/* 75/*
603 * Notify those who wanted to be notified upon delivery of their message.
604 */
605static void
606xpc_notify_senders(struct xpc_channel *ch, enum xp_retval reason, s64 put)
607{
608 struct xpc_notify *notify;
609 u8 notify_type;
610 s64 get = ch->w_remote_GP.get - 1;
611
612 while (++get < put && atomic_read(&ch->n_to_notify) > 0) {
613
614 notify = &ch->notify_queue[get % ch->local_nentries];
615
616 /*
617 * See if the notify entry indicates it was associated with
618 		 * a message whose sender wants to be notified. It is possible

619 * that it is, but someone else is doing or has done the
620 * notification.
621 */
622 notify_type = notify->type;
623 if (notify_type == 0 ||
624 cmpxchg(&notify->type, notify_type, 0) != notify_type) {
625 continue;
626 }
627
628 DBUG_ON(notify_type != XPC_N_CALL);
629
630 atomic_dec(&ch->n_to_notify);
631
632 if (notify->func != NULL) {
633 dev_dbg(xpc_chan, "notify->func() called, notify=0x%p, "
634 "msg_number=%ld, partid=%d, channel=%d\n",
635 (void *)notify, get, ch->partid, ch->number);
636
637 notify->func(reason, ch->partid, ch->number,
638 notify->key);
639
640 dev_dbg(xpc_chan, "notify->func() returned, "
641 "notify=0x%p, msg_number=%ld, partid=%d, "
642 "channel=%d\n", (void *)notify, get,
643 ch->partid, ch->number);
644 }
645 }
646}
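
xpc_notify_senders() above claims each notify entry with cmpxchg() so that only one path, this one or a racing disconnect, ever invokes a given sender's callback. A hedged sketch of that claim step; the function name and includes are assumptions.

#include <linux/types.h>
#include <linux/atomic.h>

static bool example_claim_notify(u8 *type_field, u8 expected)
{
	/* only the winner of this swap runs the sender's callback */
	return cmpxchg(type_field, expected, 0) == expected;
}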
647
648/*
649 * Free up message queues and other stuff that were allocated for the specified
650 * channel.
651 *
652 * Note: ch->reason and ch->reason_line are left set for debugging purposes,
653 * they're cleared when XPC_C_DISCONNECTED is cleared.
654 */
655static void
656xpc_free_msgqueues(struct xpc_channel *ch)
657{
658 DBUG_ON(!spin_is_locked(&ch->lock));
659 DBUG_ON(atomic_read(&ch->n_to_notify) != 0);
660
661 ch->remote_msgqueue_pa = 0;
662 ch->func = NULL;
663 ch->key = NULL;
664 ch->msg_size = 0;
665 ch->local_nentries = 0;
666 ch->remote_nentries = 0;
667 ch->kthreads_assigned_limit = 0;
668 ch->kthreads_idle_limit = 0;
669
670 ch->local_GP->get = 0;
671 ch->local_GP->put = 0;
672 ch->remote_GP.get = 0;
673 ch->remote_GP.put = 0;
674 ch->w_local_GP.get = 0;
675 ch->w_local_GP.put = 0;
676 ch->w_remote_GP.get = 0;
677 ch->w_remote_GP.put = 0;
678 ch->next_msg_to_pull = 0;
679
680 if (ch->flags & XPC_C_SETUP) {
681 ch->flags &= ~XPC_C_SETUP;
682
683 dev_dbg(xpc_chan, "ch->flags=0x%x, partid=%d, channel=%d\n",
684 ch->flags, ch->partid, ch->number);
685
686 kfree(ch->local_msgqueue_base);
687 ch->local_msgqueue = NULL;
688 kfree(ch->remote_msgqueue_base);
689 ch->remote_msgqueue = NULL;
690 kfree(ch->notify_queue);
691 ch->notify_queue = NULL;
692 }
693}
694
695/*
696 * spin_lock_irqsave() is expected to be held on entry. 76 * spin_lock_irqsave() is expected to be held on entry.
697 */ 77 */
698static void 78static void
@@ -717,9 +97,9 @@ xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags)
717 DBUG_ON((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) && 97 DBUG_ON((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
718 !(ch->flags & XPC_C_DISCONNECTINGCALLOUT_MADE)); 98 !(ch->flags & XPC_C_DISCONNECTINGCALLOUT_MADE));
719 99
720 if (part->act_state == XPC_P_DEACTIVATING) { 100 if (part->act_state == XPC_P_AS_DEACTIVATING) {
721 /* can't proceed until the other side disengages from us */ 101 /* can't proceed until the other side disengages from us */
722 if (xpc_partition_engaged(1UL << ch->partid)) 102 if (xpc_partition_engaged(ch->partid))
723 return; 103 return;
724 104
725 } else { 105 } else {
@@ -731,7 +111,7 @@ xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags)
731 111
732 if (!(ch->flags & XPC_C_CLOSEREPLY)) { 112 if (!(ch->flags & XPC_C_CLOSEREPLY)) {
733 ch->flags |= XPC_C_CLOSEREPLY; 113 ch->flags |= XPC_C_CLOSEREPLY;
734 xpc_IPI_send_closereply(ch, irq_flags); 114 xpc_send_chctl_closereply(ch, irq_flags);
735 } 115 }
736 116
737 if (!(ch->flags & XPC_C_RCLOSEREPLY)) 117 if (!(ch->flags & XPC_C_RCLOSEREPLY))
@@ -740,8 +120,8 @@ xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags)
740 120
741 /* wake those waiting for notify completion */ 121 /* wake those waiting for notify completion */
742 if (atomic_read(&ch->n_to_notify) > 0) { 122 if (atomic_read(&ch->n_to_notify) > 0) {
743 /* >>> we do callout while holding ch->lock */ 123 /* we do callout while holding ch->lock, callout can't block */
744 xpc_notify_senders(ch, ch->reason, ch->w_local_GP.put); 124 xpc_notify_senders_of_disconnect(ch);
745 } 125 }
746 126
747 /* both sides are disconnected now */ 127 /* both sides are disconnected now */
@@ -752,10 +132,24 @@ xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags)
752 spin_lock_irqsave(&ch->lock, *irq_flags); 132 spin_lock_irqsave(&ch->lock, *irq_flags);
753 } 133 }
754 134
135 DBUG_ON(atomic_read(&ch->n_to_notify) != 0);
136
755 /* it's now safe to free the channel's message queues */ 137 /* it's now safe to free the channel's message queues */
756 xpc_free_msgqueues(ch); 138 xpc_teardown_msg_structures(ch);
757 139
758 /* mark disconnected, clear all other flags except XPC_C_WDISCONNECT */ 140 ch->func = NULL;
141 ch->key = NULL;
142 ch->entry_size = 0;
143 ch->local_nentries = 0;
144 ch->remote_nentries = 0;
145 ch->kthreads_assigned_limit = 0;
146 ch->kthreads_idle_limit = 0;
147
148 /*
149 * Mark the channel disconnected and clear all other flags, including
150 * XPC_C_SETUP (because of call to xpc_teardown_msg_structures()) but
151 * not including XPC_C_WDISCONNECT (if it was set).
152 */
759 ch->flags = (XPC_C_DISCONNECTED | (ch->flags & XPC_C_WDISCONNECT)); 153 ch->flags = (XPC_C_DISCONNECTED | (ch->flags & XPC_C_WDISCONNECT));
760 154
761 atomic_dec(&part->nchannels_active); 155 atomic_dec(&part->nchannels_active);
@@ -768,15 +162,15 @@ xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags)
768 if (ch->flags & XPC_C_WDISCONNECT) { 162 if (ch->flags & XPC_C_WDISCONNECT) {
769 /* we won't lose the CPU since we're holding ch->lock */ 163 /* we won't lose the CPU since we're holding ch->lock */
770 complete(&ch->wdisconnect_wait); 164 complete(&ch->wdisconnect_wait);
771 } else if (ch->delayed_IPI_flags) { 165 } else if (ch->delayed_chctl_flags) {
772 if (part->act_state != XPC_P_DEACTIVATING) { 166 if (part->act_state != XPC_P_AS_DEACTIVATING) {
773 /* time to take action on any delayed IPI flags */ 167 /* time to take action on any delayed chctl flags */
774 spin_lock(&part->IPI_lock); 168 spin_lock(&part->chctl_lock);
775 XPC_SET_IPI_FLAGS(part->local_IPI_amo, ch->number, 169 part->chctl.flags[ch->number] |=
776 ch->delayed_IPI_flags); 170 ch->delayed_chctl_flags;
777 spin_unlock(&part->IPI_lock); 171 spin_unlock(&part->chctl_lock);
778 } 172 }
779 ch->delayed_IPI_flags = 0; 173 ch->delayed_chctl_flags = 0;
780 } 174 }
781} 175}
782 176
@@ -784,8 +178,8 @@ xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags)
784 * Process a change in the channel's remote connection state. 178 * Process a change in the channel's remote connection state.
785 */ 179 */
786static void 180static void
787xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number, 181xpc_process_openclose_chctl_flags(struct xpc_partition *part, int ch_number,
788 u8 IPI_flags) 182 u8 chctl_flags)
789{ 183{
790 unsigned long irq_flags; 184 unsigned long irq_flags;
791 struct xpc_openclose_args *args = 185 struct xpc_openclose_args *args =
@@ -800,24 +194,24 @@ again:
800 if ((ch->flags & XPC_C_DISCONNECTED) && 194 if ((ch->flags & XPC_C_DISCONNECTED) &&
801 (ch->flags & XPC_C_WDISCONNECT)) { 195 (ch->flags & XPC_C_WDISCONNECT)) {
802 /* 196 /*
803 * Delay processing IPI flags until thread waiting disconnect 197 * Delay processing chctl flags until thread waiting disconnect
804 * has had a chance to see that the channel is disconnected. 198 * has had a chance to see that the channel is disconnected.
805 */ 199 */
806 ch->delayed_IPI_flags |= IPI_flags; 200 ch->delayed_chctl_flags |= chctl_flags;
807 spin_unlock_irqrestore(&ch->lock, irq_flags); 201 spin_unlock_irqrestore(&ch->lock, irq_flags);
808 return; 202 return;
809 } 203 }
810 204
811 if (IPI_flags & XPC_IPI_CLOSEREQUEST) { 205 if (chctl_flags & XPC_CHCTL_CLOSEREQUEST) {
812 206
813 dev_dbg(xpc_chan, "XPC_IPI_CLOSEREQUEST (reason=%d) received " 207 dev_dbg(xpc_chan, "XPC_CHCTL_CLOSEREQUEST (reason=%d) received "
814 "from partid=%d, channel=%d\n", args->reason, 208 "from partid=%d, channel=%d\n", args->reason,
815 ch->partid, ch->number); 209 ch->partid, ch->number);
816 210
817 /* 211 /*
818 * If RCLOSEREQUEST is set, we're probably waiting for 212 * If RCLOSEREQUEST is set, we're probably waiting for
819 * RCLOSEREPLY. We should find it and a ROPENREQUEST packed 213 * RCLOSEREPLY. We should find it and a ROPENREQUEST packed
820 * with this RCLOSEREQUEST in the IPI_flags. 214 * with this RCLOSEREQUEST in the chctl_flags.
821 */ 215 */
822 216
823 if (ch->flags & XPC_C_RCLOSEREQUEST) { 217 if (ch->flags & XPC_C_RCLOSEREQUEST) {
@@ -826,8 +220,8 @@ again:
826 DBUG_ON(!(ch->flags & XPC_C_CLOSEREPLY)); 220 DBUG_ON(!(ch->flags & XPC_C_CLOSEREPLY));
827 DBUG_ON(ch->flags & XPC_C_RCLOSEREPLY); 221 DBUG_ON(ch->flags & XPC_C_RCLOSEREPLY);
828 222
829 DBUG_ON(!(IPI_flags & XPC_IPI_CLOSEREPLY)); 223 DBUG_ON(!(chctl_flags & XPC_CHCTL_CLOSEREPLY));
830 IPI_flags &= ~XPC_IPI_CLOSEREPLY; 224 chctl_flags &= ~XPC_CHCTL_CLOSEREPLY;
831 ch->flags |= XPC_C_RCLOSEREPLY; 225 ch->flags |= XPC_C_RCLOSEREPLY;
832 226
833 /* both sides have finished disconnecting */ 227 /* both sides have finished disconnecting */
@@ -837,17 +231,15 @@ again:
837 } 231 }
838 232
839 if (ch->flags & XPC_C_DISCONNECTED) { 233 if (ch->flags & XPC_C_DISCONNECTED) {
840 if (!(IPI_flags & XPC_IPI_OPENREQUEST)) { 234 if (!(chctl_flags & XPC_CHCTL_OPENREQUEST)) {
841 if ((XPC_GET_IPI_FLAGS(part->local_IPI_amo, 235 if (part->chctl.flags[ch_number] &
842 ch_number) & 236 XPC_CHCTL_OPENREQUEST) {
843 XPC_IPI_OPENREQUEST)) { 237
844 238 DBUG_ON(ch->delayed_chctl_flags != 0);
845 DBUG_ON(ch->delayed_IPI_flags != 0); 239 spin_lock(&part->chctl_lock);
846 spin_lock(&part->IPI_lock); 240 part->chctl.flags[ch_number] |=
847 XPC_SET_IPI_FLAGS(part->local_IPI_amo, 241 XPC_CHCTL_CLOSEREQUEST;
848 ch_number, 242 spin_unlock(&part->chctl_lock);
849 XPC_IPI_CLOSEREQUEST);
850 spin_unlock(&part->IPI_lock);
851 } 243 }
852 spin_unlock_irqrestore(&ch->lock, irq_flags); 244 spin_unlock_irqrestore(&ch->lock, irq_flags);
853 return; 245 return;
@@ -860,7 +252,7 @@ again:
860 ch->flags |= (XPC_C_CONNECTING | XPC_C_ROPENREQUEST); 252 ch->flags |= (XPC_C_CONNECTING | XPC_C_ROPENREQUEST);
861 } 253 }
862 254
863 IPI_flags &= ~(XPC_IPI_OPENREQUEST | XPC_IPI_OPENREPLY); 255 chctl_flags &= ~(XPC_CHCTL_OPENREQUEST | XPC_CHCTL_OPENREPLY);
864 256
865 /* 257 /*
866 * The meaningful CLOSEREQUEST connection state fields are: 258 * The meaningful CLOSEREQUEST connection state fields are:
@@ -878,7 +270,7 @@ again:
878 270
879 XPC_DISCONNECT_CHANNEL(ch, reason, &irq_flags); 271 XPC_DISCONNECT_CHANNEL(ch, reason, &irq_flags);
880 272
881 DBUG_ON(IPI_flags & XPC_IPI_CLOSEREPLY); 273 DBUG_ON(chctl_flags & XPC_CHCTL_CLOSEREPLY);
882 spin_unlock_irqrestore(&ch->lock, irq_flags); 274 spin_unlock_irqrestore(&ch->lock, irq_flags);
883 return; 275 return;
884 } 276 }
@@ -886,13 +278,13 @@ again:
886 xpc_process_disconnect(ch, &irq_flags); 278 xpc_process_disconnect(ch, &irq_flags);
887 } 279 }
888 280
889 if (IPI_flags & XPC_IPI_CLOSEREPLY) { 281 if (chctl_flags & XPC_CHCTL_CLOSEREPLY) {
890 282
891 dev_dbg(xpc_chan, "XPC_IPI_CLOSEREPLY received from partid=%d," 283 dev_dbg(xpc_chan, "XPC_CHCTL_CLOSEREPLY received from partid="
892 " channel=%d\n", ch->partid, ch->number); 284 "%d, channel=%d\n", ch->partid, ch->number);
893 285
894 if (ch->flags & XPC_C_DISCONNECTED) { 286 if (ch->flags & XPC_C_DISCONNECTED) {
895 DBUG_ON(part->act_state != XPC_P_DEACTIVATING); 287 DBUG_ON(part->act_state != XPC_P_AS_DEACTIVATING);
896 spin_unlock_irqrestore(&ch->lock, irq_flags); 288 spin_unlock_irqrestore(&ch->lock, irq_flags);
897 return; 289 return;
898 } 290 }
@@ -900,15 +292,14 @@ again:
900 DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST)); 292 DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
901 293
902 if (!(ch->flags & XPC_C_RCLOSEREQUEST)) { 294 if (!(ch->flags & XPC_C_RCLOSEREQUEST)) {
903 if ((XPC_GET_IPI_FLAGS(part->local_IPI_amo, ch_number) 295 if (part->chctl.flags[ch_number] &
904 & XPC_IPI_CLOSEREQUEST)) { 296 XPC_CHCTL_CLOSEREQUEST) {
905 297
906 DBUG_ON(ch->delayed_IPI_flags != 0); 298 DBUG_ON(ch->delayed_chctl_flags != 0);
907 spin_lock(&part->IPI_lock); 299 spin_lock(&part->chctl_lock);
908 XPC_SET_IPI_FLAGS(part->local_IPI_amo, 300 part->chctl.flags[ch_number] |=
909 ch_number, 301 XPC_CHCTL_CLOSEREPLY;
910 XPC_IPI_CLOSEREPLY); 302 spin_unlock(&part->chctl_lock);
911 spin_unlock(&part->IPI_lock);
912 } 303 }
913 spin_unlock_irqrestore(&ch->lock, irq_flags); 304 spin_unlock_irqrestore(&ch->lock, irq_flags);
914 return; 305 return;
@@ -922,21 +313,21 @@ again:
922 } 313 }
923 } 314 }
924 315
925 if (IPI_flags & XPC_IPI_OPENREQUEST) { 316 if (chctl_flags & XPC_CHCTL_OPENREQUEST) {
926 317
927 dev_dbg(xpc_chan, "XPC_IPI_OPENREQUEST (msg_size=%d, " 318 dev_dbg(xpc_chan, "XPC_CHCTL_OPENREQUEST (entry_size=%d, "
928 "local_nentries=%d) received from partid=%d, " 319 "local_nentries=%d) received from partid=%d, "
929 "channel=%d\n", args->msg_size, args->local_nentries, 320 "channel=%d\n", args->entry_size, args->local_nentries,
930 ch->partid, ch->number); 321 ch->partid, ch->number);
931 322
932 if (part->act_state == XPC_P_DEACTIVATING || 323 if (part->act_state == XPC_P_AS_DEACTIVATING ||
933 (ch->flags & XPC_C_ROPENREQUEST)) { 324 (ch->flags & XPC_C_ROPENREQUEST)) {
934 spin_unlock_irqrestore(&ch->lock, irq_flags); 325 spin_unlock_irqrestore(&ch->lock, irq_flags);
935 return; 326 return;
936 } 327 }
937 328
938 if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_WDISCONNECT)) { 329 if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_WDISCONNECT)) {
939 ch->delayed_IPI_flags |= XPC_IPI_OPENREQUEST; 330 ch->delayed_chctl_flags |= XPC_CHCTL_OPENREQUEST;
940 spin_unlock_irqrestore(&ch->lock, irq_flags); 331 spin_unlock_irqrestore(&ch->lock, irq_flags);
941 return; 332 return;
942 } 333 }
@@ -947,10 +338,10 @@ again:
947 338
948 /* 339 /*
949 * The meaningful OPENREQUEST connection state fields are: 340 * The meaningful OPENREQUEST connection state fields are:
950 * msg_size = size of channel's messages in bytes 341 * entry_size = size of channel's messages in bytes
951 * local_nentries = remote partition's local_nentries 342 * local_nentries = remote partition's local_nentries
952 */ 343 */
953 if (args->msg_size == 0 || args->local_nentries == 0) { 344 if (args->entry_size == 0 || args->local_nentries == 0) {
954 /* assume OPENREQUEST was delayed by mistake */ 345 /* assume OPENREQUEST was delayed by mistake */
955 spin_unlock_irqrestore(&ch->lock, irq_flags); 346 spin_unlock_irqrestore(&ch->lock, irq_flags);
956 return; 347 return;
@@ -960,14 +351,14 @@ again:
960 ch->remote_nentries = args->local_nentries; 351 ch->remote_nentries = args->local_nentries;
961 352
962 if (ch->flags & XPC_C_OPENREQUEST) { 353 if (ch->flags & XPC_C_OPENREQUEST) {
963 if (args->msg_size != ch->msg_size) { 354 if (args->entry_size != ch->entry_size) {
964 XPC_DISCONNECT_CHANNEL(ch, xpUnequalMsgSizes, 355 XPC_DISCONNECT_CHANNEL(ch, xpUnequalMsgSizes,
965 &irq_flags); 356 &irq_flags);
966 spin_unlock_irqrestore(&ch->lock, irq_flags); 357 spin_unlock_irqrestore(&ch->lock, irq_flags);
967 return; 358 return;
968 } 359 }
969 } else { 360 } else {
970 ch->msg_size = args->msg_size; 361 ch->entry_size = args->entry_size;
971 362
972 XPC_SET_REASON(ch, 0, 0); 363 XPC_SET_REASON(ch, 0, 0);
973 ch->flags &= ~XPC_C_DISCONNECTED; 364 ch->flags &= ~XPC_C_DISCONNECTED;
@@ -978,13 +369,13 @@ again:
978 xpc_process_connect(ch, &irq_flags); 369 xpc_process_connect(ch, &irq_flags);
979 } 370 }
980 371
981 if (IPI_flags & XPC_IPI_OPENREPLY) { 372 if (chctl_flags & XPC_CHCTL_OPENREPLY) {
982 373
983 dev_dbg(xpc_chan, "XPC_IPI_OPENREPLY (local_msgqueue_pa=0x%lx, " 374 dev_dbg(xpc_chan, "XPC_CHCTL_OPENREPLY (local_msgqueue_pa="
984 "local_nentries=%d, remote_nentries=%d) received from " 375 "0x%lx, local_nentries=%d, remote_nentries=%d) "
985 "partid=%d, channel=%d\n", args->local_msgqueue_pa, 376 "received from partid=%d, channel=%d\n",
986 args->local_nentries, args->remote_nentries, 377 args->local_msgqueue_pa, args->local_nentries,
987 ch->partid, ch->number); 378 args->remote_nentries, ch->partid, ch->number);
988 379
989 if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED)) { 380 if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED)) {
990 spin_unlock_irqrestore(&ch->lock, irq_flags); 381 spin_unlock_irqrestore(&ch->lock, irq_flags);
@@ -1012,10 +403,10 @@ again:
1012 DBUG_ON(args->remote_nentries == 0); 403 DBUG_ON(args->remote_nentries == 0);
1013 404
1014 ch->flags |= XPC_C_ROPENREPLY; 405 ch->flags |= XPC_C_ROPENREPLY;
1015 ch->remote_msgqueue_pa = args->local_msgqueue_pa; 406 xpc_save_remote_msgqueue_pa(ch, args->local_msgqueue_pa);
1016 407
1017 if (args->local_nentries < ch->remote_nentries) { 408 if (args->local_nentries < ch->remote_nentries) {
1018 dev_dbg(xpc_chan, "XPC_IPI_OPENREPLY: new " 409 dev_dbg(xpc_chan, "XPC_CHCTL_OPENREPLY: new "
1019 "remote_nentries=%d, old remote_nentries=%d, " 410 "remote_nentries=%d, old remote_nentries=%d, "
1020 "partid=%d, channel=%d\n", 411 "partid=%d, channel=%d\n",
1021 args->local_nentries, ch->remote_nentries, 412 args->local_nentries, ch->remote_nentries,
@@ -1024,7 +415,7 @@ again:
1024 ch->remote_nentries = args->local_nentries; 415 ch->remote_nentries = args->local_nentries;
1025 } 416 }
1026 if (args->remote_nentries < ch->local_nentries) { 417 if (args->remote_nentries < ch->local_nentries) {
1027 dev_dbg(xpc_chan, "XPC_IPI_OPENREPLY: new " 418 dev_dbg(xpc_chan, "XPC_CHCTL_OPENREPLY: new "
1028 "local_nentries=%d, old local_nentries=%d, " 419 "local_nentries=%d, old local_nentries=%d, "
1029 "partid=%d, channel=%d\n", 420 "partid=%d, channel=%d\n",
1030 args->remote_nentries, ch->local_nentries, 421 args->remote_nentries, ch->local_nentries,
@@ -1082,7 +473,7 @@ xpc_connect_channel(struct xpc_channel *ch)
1082 ch->local_nentries = registration->nentries; 473 ch->local_nentries = registration->nentries;
1083 474
1084 if (ch->flags & XPC_C_ROPENREQUEST) { 475 if (ch->flags & XPC_C_ROPENREQUEST) {
1085 if (registration->msg_size != ch->msg_size) { 476 if (registration->entry_size != ch->entry_size) {
1086 /* the local and remote sides aren't the same */ 477 /* the local and remote sides aren't the same */
1087 478
1088 /* 479 /*
@@ -1101,7 +492,7 @@ xpc_connect_channel(struct xpc_channel *ch)
1101 return xpUnequalMsgSizes; 492 return xpUnequalMsgSizes;
1102 } 493 }
1103 } else { 494 } else {
1104 ch->msg_size = registration->msg_size; 495 ch->entry_size = registration->entry_size;
1105 496
1106 XPC_SET_REASON(ch, 0, 0); 497 XPC_SET_REASON(ch, 0, 0);
1107 ch->flags &= ~XPC_C_DISCONNECTED; 498 ch->flags &= ~XPC_C_DISCONNECTED;
@@ -1114,7 +505,7 @@ xpc_connect_channel(struct xpc_channel *ch)
1114 /* initiate the connection */ 505 /* initiate the connection */
1115 506
1116 ch->flags |= (XPC_C_OPENREQUEST | XPC_C_CONNECTING); 507 ch->flags |= (XPC_C_OPENREQUEST | XPC_C_CONNECTING);
1117 xpc_IPI_send_openrequest(ch, &irq_flags); 508 xpc_send_chctl_openrequest(ch, &irq_flags);
1118 509
1119 xpc_process_connect(ch, &irq_flags); 510 xpc_process_connect(ch, &irq_flags);
1120 511
@@ -1123,152 +514,16 @@ xpc_connect_channel(struct xpc_channel *ch)
1123 return xpSuccess; 514 return xpSuccess;
1124} 515}
1125 516
1126/*
1127 * Clear some of the msg flags in the local message queue.
1128 */
1129static inline void
1130xpc_clear_local_msgqueue_flags(struct xpc_channel *ch)
1131{
1132 struct xpc_msg *msg;
1133 s64 get;
1134
1135 get = ch->w_remote_GP.get;
1136 do {
1137 msg = (struct xpc_msg *)((u64)ch->local_msgqueue +
1138 (get % ch->local_nentries) *
1139 ch->msg_size);
1140 msg->flags = 0;
1141 } while (++get < ch->remote_GP.get);
1142}
1143
1144/*
1145 * Clear some of the msg flags in the remote message queue.
1146 */
1147static inline void
1148xpc_clear_remote_msgqueue_flags(struct xpc_channel *ch)
1149{
1150 struct xpc_msg *msg;
1151 s64 put;
1152
1153 put = ch->w_remote_GP.put;
1154 do {
1155 msg = (struct xpc_msg *)((u64)ch->remote_msgqueue +
1156 (put % ch->remote_nentries) *
1157 ch->msg_size);
1158 msg->flags = 0;
1159 } while (++put < ch->remote_GP.put);
1160}
1161
1162static void
1163xpc_process_msg_IPI(struct xpc_partition *part, int ch_number)
1164{
1165 struct xpc_channel *ch = &part->channels[ch_number];
1166 int nmsgs_sent;
1167
1168 ch->remote_GP = part->remote_GPs[ch_number];
1169
1170 /* See what, if anything, has changed for each connected channel */
1171
1172 xpc_msgqueue_ref(ch);
1173
1174 if (ch->w_remote_GP.get == ch->remote_GP.get &&
1175 ch->w_remote_GP.put == ch->remote_GP.put) {
1176 /* nothing changed since GPs were last pulled */
1177 xpc_msgqueue_deref(ch);
1178 return;
1179 }
1180
1181 if (!(ch->flags & XPC_C_CONNECTED)) {
1182 xpc_msgqueue_deref(ch);
1183 return;
1184 }
1185
1186 /*
1187 * First check to see if messages recently sent by us have been
1188 * received by the other side. (The remote GET value will have
1189 * changed since we last looked at it.)
1190 */
1191
1192 if (ch->w_remote_GP.get != ch->remote_GP.get) {
1193
1194 /*
1195 * We need to notify any senders that want to be notified
1196 * that their sent messages have been received by their
1197 * intended recipients. We need to do this before updating
1198 * w_remote_GP.get so that we don't allocate the same message
1199 * queue entries prematurely (see xpc_allocate_msg()).
1200 */
1201 if (atomic_read(&ch->n_to_notify) > 0) {
1202 /*
1203 * Notify senders that messages sent have been
1204 * received and delivered by the other side.
1205 */
1206 xpc_notify_senders(ch, xpMsgDelivered,
1207 ch->remote_GP.get);
1208 }
1209
1210 /*
1211 * Clear msg->flags in previously sent messages, so that
1212 * they're ready for xpc_allocate_msg().
1213 */
1214 xpc_clear_local_msgqueue_flags(ch);
1215
1216 ch->w_remote_GP.get = ch->remote_GP.get;
1217
1218 dev_dbg(xpc_chan, "w_remote_GP.get changed to %ld, partid=%d, "
1219 "channel=%d\n", ch->w_remote_GP.get, ch->partid,
1220 ch->number);
1221
1222 /*
1223 * If anyone was waiting for message queue entries to become
1224 * available, wake them up.
1225 */
1226 if (atomic_read(&ch->n_on_msg_allocate_wq) > 0)
1227 wake_up(&ch->msg_allocate_wq);
1228 }
1229
1230 /*
1231 * Now check for newly sent messages by the other side. (The remote
1232 * PUT value will have changed since we last looked at it.)
1233 */
1234
1235 if (ch->w_remote_GP.put != ch->remote_GP.put) {
1236 /*
1237 * Clear msg->flags in previously received messages, so that
1238 * they're ready for xpc_get_deliverable_msg().
1239 */
1240 xpc_clear_remote_msgqueue_flags(ch);
1241
1242 ch->w_remote_GP.put = ch->remote_GP.put;
1243
1244 dev_dbg(xpc_chan, "w_remote_GP.put changed to %ld, partid=%d, "
1245 "channel=%d\n", ch->w_remote_GP.put, ch->partid,
1246 ch->number);
1247
1248 nmsgs_sent = ch->w_remote_GP.put - ch->w_local_GP.get;
1249 if (nmsgs_sent > 0) {
1250 dev_dbg(xpc_chan, "msgs waiting to be copied and "
1251 "delivered=%d, partid=%d, channel=%d\n",
1252 nmsgs_sent, ch->partid, ch->number);
1253
1254 if (ch->flags & XPC_C_CONNECTEDCALLOUT_MADE)
1255 xpc_activate_kthreads(ch, nmsgs_sent);
1256 }
1257 }
1258
1259 xpc_msgqueue_deref(ch);
1260}
1261
1262void 517void
1263xpc_process_channel_activity(struct xpc_partition *part) 518xpc_process_sent_chctl_flags(struct xpc_partition *part)
1264{ 519{
1265 unsigned long irq_flags; 520 unsigned long irq_flags;
1266 u64 IPI_amo, IPI_flags; 521 union xpc_channel_ctl_flags chctl;
1267 struct xpc_channel *ch; 522 struct xpc_channel *ch;
1268 int ch_number; 523 int ch_number;
1269 u32 ch_flags; 524 u32 ch_flags;
1270 525
1271 IPI_amo = xpc_get_IPI_flags(part); 526 chctl.all_flags = xpc_get_chctl_all_flags(part);
1272 527
1273 /* 528 /*
1274 * Initiate channel connections for registered channels. 529 * Initiate channel connections for registered channels.
@@ -1281,14 +536,14 @@ xpc_process_channel_activity(struct xpc_partition *part)
1281 ch = &part->channels[ch_number]; 536 ch = &part->channels[ch_number];
1282 537
1283 /* 538 /*
1284 * Process any open or close related IPI flags, and then deal 539 * Process any open or close related chctl flags, and then deal
1285 * with connecting or disconnecting the channel as required. 540 * with connecting or disconnecting the channel as required.
1286 */ 541 */
1287 542
1288 IPI_flags = XPC_GET_IPI_FLAGS(IPI_amo, ch_number); 543 if (chctl.flags[ch_number] & XPC_OPENCLOSE_CHCTL_FLAGS) {
1289 544 xpc_process_openclose_chctl_flags(part, ch_number,
1290 if (XPC_ANY_OPENCLOSE_IPI_FLAGS_SET(IPI_flags)) 545 chctl.flags[ch_number]);
1291 xpc_process_openclose_IPI(part, ch_number, IPI_flags); 546 }
1292 547
1293 ch_flags = ch->flags; /* need an atomic snapshot of flags */ 548 ch_flags = ch->flags; /* need an atomic snapshot of flags */
1294 549
@@ -1299,7 +554,7 @@ xpc_process_channel_activity(struct xpc_partition *part)
1299 continue; 554 continue;
1300 } 555 }
1301 556
1302 if (part->act_state == XPC_P_DEACTIVATING) 557 if (part->act_state == XPC_P_AS_DEACTIVATING)
1303 continue; 558 continue;
1304 559
1305 if (!(ch_flags & XPC_C_CONNECTED)) { 560 if (!(ch_flags & XPC_C_CONNECTED)) {
@@ -1315,13 +570,13 @@ xpc_process_channel_activity(struct xpc_partition *part)
1315 } 570 }
1316 571
1317 /* 572 /*
1318 * Process any message related IPI flags, this may involve the 573 * Process any message related chctl flags, this may involve
1319 * activation of kthreads to deliver any pending messages sent 574 * the activation of kthreads to deliver any pending messages
1320 * from the other partition. 575 * sent from the other partition.
1321 */ 576 */
1322 577
1323 if (XPC_ANY_MSG_IPI_FLAGS_SET(IPI_flags)) 578 if (chctl.flags[ch_number] & XPC_MSG_CHCTL_FLAGS)
1324 xpc_process_msg_IPI(part, ch_number); 579 xpc_process_msg_chctl_flags(part, ch_number);
1325 } 580 }
1326} 581}
1327 582
@@ -1369,59 +624,6 @@ xpc_partition_going_down(struct xpc_partition *part, enum xp_retval reason)
1369} 624}
1370 625
1371/* 626/*
1372 * Teardown the infrastructure necessary to support XPartition Communication
1373 * between the specified remote partition and the local one.
1374 */
1375void
1376xpc_teardown_infrastructure(struct xpc_partition *part)
1377{
1378 short partid = XPC_PARTID(part);
1379
1380 /*
1381 * We start off by making this partition inaccessible to local
1382 * processes by marking it as no longer setup. Then we make it
1383 * inaccessible to remote processes by clearing the XPC per partition
1384 * specific variable's magic # (which indicates that these variables
1385 * are no longer valid) and by ignoring all XPC notify IPIs sent to
1386 * this partition.
1387 */
1388
1389 DBUG_ON(atomic_read(&part->nchannels_engaged) != 0);
1390 DBUG_ON(atomic_read(&part->nchannels_active) != 0);
1391 DBUG_ON(part->setup_state != XPC_P_SETUP);
1392 part->setup_state = XPC_P_WTEARDOWN;
1393
1394 xpc_vars_part[partid].magic = 0;
1395
1396 free_irq(SGI_XPC_NOTIFY, (void *)(u64)partid);
1397
1398 /*
1399 * Before proceeding with the teardown we have to wait until all
1400 * existing references cease.
1401 */
1402 wait_event(part->teardown_wq, (atomic_read(&part->references) == 0));
1403
1404 /* now we can begin tearing down the infrastructure */
1405
1406 part->setup_state = XPC_P_TORNDOWN;
1407
1408 /* in case we've still got outstanding timers registered... */
1409 del_timer_sync(&part->dropped_IPI_timer);
1410
1411 kfree(part->remote_openclose_args_base);
1412 part->remote_openclose_args = NULL;
1413 kfree(part->local_openclose_args_base);
1414 part->local_openclose_args = NULL;
1415 kfree(part->remote_GPs_base);
1416 part->remote_GPs = NULL;
1417 kfree(part->local_GPs_base);
1418 part->local_GPs = NULL;
1419 kfree(part->channels);
1420 part->channels = NULL;
1421 part->local_IPI_amo_va = NULL;
1422}
1423
1424/*
1425 * Called by XP at the time of channel connection registration to cause 627 * Called by XP at the time of channel connection registration to cause
1426 * XPC to establish connections to all currently active partitions. 628 * XPC to establish connections to all currently active partitions.
1427 */ 629 */
@@ -1432,9 +634,9 @@ xpc_initiate_connect(int ch_number)
1432 struct xpc_partition *part; 634 struct xpc_partition *part;
1433 struct xpc_channel *ch; 635 struct xpc_channel *ch;
1434 636
1435 DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS); 637 DBUG_ON(ch_number < 0 || ch_number >= XPC_MAX_NCHANNELS);
1436 638
1437 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) { 639 for (partid = 0; partid < xp_max_npartitions; partid++) {
1438 part = &xpc_partitions[partid]; 640 part = &xpc_partitions[partid];
1439 641
1440 if (xpc_part_ref(part)) { 642 if (xpc_part_ref(part)) {
@@ -1488,10 +690,10 @@ xpc_initiate_disconnect(int ch_number)
1488 struct xpc_partition *part; 690 struct xpc_partition *part;
1489 struct xpc_channel *ch; 691 struct xpc_channel *ch;
1490 692
1491 DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS); 693 DBUG_ON(ch_number < 0 || ch_number >= XPC_MAX_NCHANNELS);
1492 694
1493 /* initiate the channel disconnect for every active partition */ 695 /* initiate the channel disconnect for every active partition */
1494 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) { 696 for (partid = 0; partid < xp_max_npartitions; partid++) {
1495 part = &xpc_partitions[partid]; 697 part = &xpc_partitions[partid];
1496 698
1497 if (xpc_part_ref(part)) { 699 if (xpc_part_ref(part)) {
@@ -1550,7 +752,7 @@ xpc_disconnect_channel(const int line, struct xpc_channel *ch,
1550 XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY | 752 XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY |
1551 XPC_C_CONNECTING | XPC_C_CONNECTED); 753 XPC_C_CONNECTING | XPC_C_CONNECTED);
1552 754
1553 xpc_IPI_send_closerequest(ch, irq_flags); 755 xpc_send_chctl_closerequest(ch, irq_flags);
1554 756
1555 if (channel_was_connected) 757 if (channel_was_connected)
1556 ch->flags |= XPC_C_WASCONNECTED; 758 ch->flags |= XPC_C_WASCONNECTED;
@@ -1598,7 +800,7 @@ xpc_disconnect_callout(struct xpc_channel *ch, enum xp_retval reason)
1598 * Wait for a message entry to become available for the specified channel, 800 * Wait for a message entry to become available for the specified channel,
1599 * but don't wait any longer than 1 jiffy. 801 * but don't wait any longer than 1 jiffy.
1600 */ 802 */
1601static enum xp_retval 803enum xp_retval
1602xpc_allocate_msg_wait(struct xpc_channel *ch) 804xpc_allocate_msg_wait(struct xpc_channel *ch)
1603{ 805{
1604 enum xp_retval ret; 806 enum xp_retval ret;
@@ -1625,315 +827,54 @@ xpc_allocate_msg_wait(struct xpc_channel *ch)
1625} 827}
1626 828
1627/* 829/*
1628 * Allocate an entry for a message from the message queue associated with the 830 * Send a message that contains the user's payload on the specified channel
1629 * specified channel. 831 * connected to the specified partition.
1630 */
1631static enum xp_retval
1632xpc_allocate_msg(struct xpc_channel *ch, u32 flags,
1633 struct xpc_msg **address_of_msg)
1634{
1635 struct xpc_msg *msg;
1636 enum xp_retval ret;
1637 s64 put;
1638
1639 /* this reference will be dropped in xpc_send_msg() */
1640 xpc_msgqueue_ref(ch);
1641
1642 if (ch->flags & XPC_C_DISCONNECTING) {
1643 xpc_msgqueue_deref(ch);
1644 return ch->reason;
1645 }
1646 if (!(ch->flags & XPC_C_CONNECTED)) {
1647 xpc_msgqueue_deref(ch);
1648 return xpNotConnected;
1649 }
1650
1651 /*
1652 * Get the next available message entry from the local message queue.
1653 * If none are available, we'll make sure that we grab the latest
1654 * GP values.
1655 */
1656 ret = xpTimeout;
1657
1658 while (1) {
1659
1660 put = ch->w_local_GP.put;
1661 rmb(); /* guarantee that .put loads before .get */
1662 if (put - ch->w_remote_GP.get < ch->local_nentries) {
1663
1664 /* There are available message entries. We need to try
1665 * to secure one for ourselves. We'll do this by trying
1666 * to increment w_local_GP.put as long as someone else
1667 * doesn't beat us to it. If they do, we'll have to
1668 * try again.
1669 */
1670 if (cmpxchg(&ch->w_local_GP.put, put, put + 1) == put) {
1671 /* we got the entry referenced by put */
1672 break;
1673 }
1674 continue; /* try again */
1675 }
1676
1677 /*
1678 * There aren't any available msg entries at this time.
1679 *
1680 * In waiting for a message entry to become available,
1681 * we set a timeout in case the other side is not
1682 * sending completion IPIs. This lets us fake an IPI
1683 * that will cause the IPI handler to fetch the latest
1684 * GP values as if an IPI was sent by the other side.
1685 */
1686 if (ret == xpTimeout)
1687 xpc_IPI_send_local_msgrequest(ch);
1688
1689 if (flags & XPC_NOWAIT) {
1690 xpc_msgqueue_deref(ch);
1691 return xpNoWait;
1692 }
1693
1694 ret = xpc_allocate_msg_wait(ch);
1695 if (ret != xpInterrupted && ret != xpTimeout) {
1696 xpc_msgqueue_deref(ch);
1697 return ret;
1698 }
1699 }
1700
1701 /* get the message's address and initialize it */
1702 msg = (struct xpc_msg *)((u64)ch->local_msgqueue +
1703 (put % ch->local_nentries) * ch->msg_size);
1704
1705 DBUG_ON(msg->flags != 0);
1706 msg->number = put;
1707
1708 dev_dbg(xpc_chan, "w_local_GP.put changed to %ld; msg=0x%p, "
1709 "msg_number=%ld, partid=%d, channel=%d\n", put + 1,
1710 (void *)msg, msg->number, ch->partid, ch->number);
1711
1712 *address_of_msg = msg;
1713
1714 return xpSuccess;
1715}
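
xpc_allocate_msg() above reserves a message entry by advancing w_local_GP.put with cmpxchg(); whoever wins the swap owns the slot named by the old counter value. A hedged, stand-alone sketch of that reservation loop with invented names.

#include <linux/errno.h>
#include <linux/types.h>

static int example_reserve_entry(s64 *put_counter, s64 get, int nentries,
				 s64 *entry)
{
	s64 put;

	for (;;) {
		put = *put_counter;
		if (put - get >= nentries)
			return -EAGAIN;		/* queue currently full */
		if (cmpxchg(put_counter, put, put + 1) == put) {
			*entry = put;		/* we own this slot */
			return 0;
		}
		/* another sender won the race; reload and retry */
	}
}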
1716
1717/*
1718 * Allocate an entry for a message from the message queue associated with the
1719 * specified channel. NOTE that this routine can sleep waiting for a message
1720 * entry to become available. To not sleep, pass in the XPC_NOWAIT flag.
1721 * 832 *
1722 * Arguments: 833 * NOTE that this routine can sleep waiting for a message entry to become
834 * available. To not sleep, pass in the XPC_NOWAIT flag.
1723 * 835 *
1724 * partid - ID of partition to which the channel is connected. 836 * Once sent, this routine will not wait for the message to be received, nor
1725 * ch_number - channel #. 837 * will notification be given when it does happen.
1726 * flags - see xpc.h for valid flags.
1727 * payload - address of the allocated payload area pointer (filled in on
1728 * return) in which the user-defined message is constructed.
1729 */
1730enum xp_retval
1731xpc_initiate_allocate(short partid, int ch_number, u32 flags, void **payload)
1732{
1733 struct xpc_partition *part = &xpc_partitions[partid];
1734 enum xp_retval ret = xpUnknownReason;
1735 struct xpc_msg *msg = NULL;
1736
1737 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
1738 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
1739
1740 *payload = NULL;
1741
1742 if (xpc_part_ref(part)) {
1743 ret = xpc_allocate_msg(&part->channels[ch_number], flags, &msg);
1744 xpc_part_deref(part);
1745
1746 if (msg != NULL)
1747 *payload = &msg->payload;
1748 }
1749
1750 return ret;
1751}
1752
1753/*
1754 * Now we actually send the messages that are ready to be sent by advancing
1755 * the local message queue's Put value and then send an IPI to the recipient
1756 * partition.
1757 */
1758static void
1759xpc_send_msgs(struct xpc_channel *ch, s64 initial_put)
1760{
1761 struct xpc_msg *msg;
1762 s64 put = initial_put + 1;
1763 int send_IPI = 0;
1764
1765 while (1) {
1766
1767 while (1) {
1768 if (put == ch->w_local_GP.put)
1769 break;
1770
1771 msg = (struct xpc_msg *)((u64)ch->local_msgqueue +
1772 (put % ch->local_nentries) *
1773 ch->msg_size);
1774
1775 if (!(msg->flags & XPC_M_READY))
1776 break;
1777
1778 put++;
1779 }
1780
1781 if (put == initial_put) {
1782 /* nothing's changed */
1783 break;
1784 }
1785
1786 if (cmpxchg_rel(&ch->local_GP->put, initial_put, put) !=
1787 initial_put) {
1788 /* someone else beat us to it */
1789 DBUG_ON(ch->local_GP->put < initial_put);
1790 break;
1791 }
1792
1793 /* we just set the new value of local_GP->put */
1794
1795 dev_dbg(xpc_chan, "local_GP->put changed to %ld, partid=%d, "
1796 "channel=%d\n", put, ch->partid, ch->number);
1797
1798 send_IPI = 1;
1799
1800 /*
1801 * We need to ensure that the message referenced by
1802 * local_GP->put is not XPC_M_READY or that local_GP->put
1803 * equals w_local_GP.put, so we'll go have a look.
1804 */
1805 initial_put = put;
1806 }
1807
1808 if (send_IPI)
1809 xpc_IPI_send_msgrequest(ch);
1810}
1811
1812/*
1813 * Common code that does the actual sending of the message by advancing the
1814 * local message queue's Put value and sends an IPI to the partition the
1815 * message is being sent to.
1816 */
1817static enum xp_retval
1818xpc_send_msg(struct xpc_channel *ch, struct xpc_msg *msg, u8 notify_type,
1819 xpc_notify_func func, void *key)
1820{
1821 enum xp_retval ret = xpSuccess;
1822 struct xpc_notify *notify = notify;
1823 s64 put, msg_number = msg->number;
1824
1825 DBUG_ON(notify_type == XPC_N_CALL && func == NULL);
1826 DBUG_ON((((u64)msg - (u64)ch->local_msgqueue) / ch->msg_size) !=
1827 msg_number % ch->local_nentries);
1828 DBUG_ON(msg->flags & XPC_M_READY);
1829
1830 if (ch->flags & XPC_C_DISCONNECTING) {
1831 /* drop the reference grabbed in xpc_allocate_msg() */
1832 xpc_msgqueue_deref(ch);
1833 return ch->reason;
1834 }
1835
1836 if (notify_type != 0) {
1837 /*
1838 * Tell the remote side to send an ACK interrupt when the
1839 * message has been delivered.
1840 */
1841 msg->flags |= XPC_M_INTERRUPT;
1842
1843 atomic_inc(&ch->n_to_notify);
1844
1845 notify = &ch->notify_queue[msg_number % ch->local_nentries];
1846 notify->func = func;
1847 notify->key = key;
1848 notify->type = notify_type;
1849
1850 /* >>> is a mb() needed here? */
1851
1852 if (ch->flags & XPC_C_DISCONNECTING) {
1853 /*
1854 * An error occurred between our last error check and
1855 * this one. We will try to clear the type field from
1856 * the notify entry. If we succeed then
1857 * xpc_disconnect_channel() didn't already process
1858 * the notify entry.
1859 */
1860 if (cmpxchg(&notify->type, notify_type, 0) ==
1861 notify_type) {
1862 atomic_dec(&ch->n_to_notify);
1863 ret = ch->reason;
1864 }
1865
1866 /* drop the reference grabbed in xpc_allocate_msg() */
1867 xpc_msgqueue_deref(ch);
1868 return ret;
1869 }
1870 }
1871
1872 msg->flags |= XPC_M_READY;
1873
1874 /*
1875 * The preceding store of msg->flags must occur before the following
1876 * load of ch->local_GP->put.
1877 */
1878 mb();
1879
1880 /* see if the message is next in line to be sent, if so send it */
1881
1882 put = ch->local_GP->put;
1883 if (put == msg_number)
1884 xpc_send_msgs(ch, put);
1885
1886 /* drop the reference grabbed in xpc_allocate_msg() */
1887 xpc_msgqueue_deref(ch);
1888 return ret;
1889}
1890
1891/*
1892 * Send a message previously allocated using xpc_initiate_allocate() on the
1893 * specified channel connected to the specified partition.
1894 *
1895 * This routine will not wait for the message to be received, nor will
1896 * notification be given when it does happen. Once this routine has returned
1897 * the message entry allocated via xpc_initiate_allocate() is no longer
1898 * accessible to the caller.
1899 *
1900 * This routine, although called by users, does not call xpc_part_ref() to
1901 * ensure that the partition infrastructure is in place. It relies on the
1902 * fact that we called xpc_msgqueue_ref() in xpc_allocate_msg().
1903 * 838 *
1904 * Arguments: 839 * Arguments:
1905 * 840 *
1906 * partid - ID of partition to which the channel is connected. 841 * partid - ID of partition to which the channel is connected.
1907 * ch_number - channel # to send message on. 842 * ch_number - channel # to send message on.
1908 * payload - pointer to the payload area allocated via 843 * flags - see xp.h for valid flags.
1909 * xpc_initiate_allocate(). 844 * payload - pointer to the payload which is to be sent.
845 * payload_size - size of the payload in bytes.
1910 */ 846 */
1911enum xp_retval 847enum xp_retval
1912xpc_initiate_send(short partid, int ch_number, void *payload) 848xpc_initiate_send(short partid, int ch_number, u32 flags, void *payload,
849 u16 payload_size)
1913{ 850{
1914 struct xpc_partition *part = &xpc_partitions[partid]; 851 struct xpc_partition *part = &xpc_partitions[partid];
1915 struct xpc_msg *msg = XPC_MSG_ADDRESS(payload); 852 enum xp_retval ret = xpUnknownReason;
1916 enum xp_retval ret;
1917 853
1918 dev_dbg(xpc_chan, "msg=0x%p, partid=%d, channel=%d\n", (void *)msg, 854 dev_dbg(xpc_chan, "payload=0x%p, partid=%d, channel=%d\n", payload,
1919 partid, ch_number); 855 partid, ch_number);
1920 856
1921 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS); 857 DBUG_ON(partid < 0 || partid >= xp_max_npartitions);
1922 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels); 858 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
1923 DBUG_ON(msg == NULL); 859 DBUG_ON(payload == NULL);
1924 860
1925 ret = xpc_send_msg(&part->channels[ch_number], msg, 0, NULL, NULL); 861 if (xpc_part_ref(part)) {
862 ret = xpc_send_payload(&part->channels[ch_number], flags,
863 payload, payload_size, 0, NULL, NULL);
864 xpc_part_deref(part);
865 }
1926 866
1927 return ret; 867 return ret;
1928} 868}
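
A hypothetical caller of the new xpc_initiate_send() shown above. The payload layout, header include and parameter values are invented; XPC_NOWAIT is the flag the comments above document for callers that must not sleep.

#include <linux/types.h>
#include "xpc.h"		/* assumed location of the prototype */

struct example_msg {		/* illustrative payload layout */
	u64 opcode;
	u64 data;
};

static enum xp_retval example_send(short partid, int ch_number)
{
	struct example_msg msg = { .opcode = 1, .data = 42 };

	/* XPC_NOWAIT: return an error instead of sleeping for a free entry */
	return xpc_initiate_send(partid, ch_number, XPC_NOWAIT, &msg,
				 sizeof(msg));
}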
1929 869
1930/* 870/*
1931 * Send a message previously allocated using xpc_initiate_allocate on the 871 * Send a message that contains the user's payload on the specified channel
1932 * specified channel connected to the specified partition. 872 * connected to the specified partition.
1933 * 873 *
1934 * This routine will not wait for the message to be sent. Once this routine 874 * NOTE that this routine can sleep waiting for a message entry to become
1935 * has returned the message entry allocated via xpc_initiate_allocate() is no 875 * available. To not sleep, pass in the XPC_NOWAIT flag.
1936 * longer accessible to the caller. 876 *
877 * This routine will not wait for the message to be sent or received.
1937 * 878 *
1938 * Once the remote end of the channel has received the message, the function 879 * Once the remote end of the channel has received the message, the function
1939 * passed as an argument to xpc_initiate_send_notify() will be called. This 880 * passed as an argument to xpc_initiate_send_notify() will be called. This
@@ -1943,158 +884,51 @@ xpc_initiate_send(short partid, int ch_number, void *payload)
1943 * 884 *
1944 * If this routine returns an error, the caller's function will NOT be called. 885 * If this routine returns an error, the caller's function will NOT be called.
1945 * 886 *
1946 * This routine, although called by users, does not call xpc_part_ref() to
1947 * ensure that the partition infrastructure is in place. It relies on the
1948 * fact that we called xpc_msgqueue_ref() in xpc_allocate_msg().
1949 *
1950 * Arguments: 887 * Arguments:
1951 * 888 *
1952 * partid - ID of partition to which the channel is connected. 889 * partid - ID of partition to which the channel is connected.
1953 * ch_number - channel # to send message on. 890 * ch_number - channel # to send message on.
1954 * payload - pointer to the payload area allocated via 891 * flags - see xp.h for valid flags.
1955 * xpc_initiate_allocate(). 892 * payload - pointer to the payload which is to be sent.
893 * payload_size - size of the payload in bytes.
1956 * func - function to call with asynchronous notification of message 894 * func - function to call with asynchronous notification of message
1957 * receipt. THIS FUNCTION MUST BE NON-BLOCKING. 895 * receipt. THIS FUNCTION MUST BE NON-BLOCKING.
1958 * key - user-defined key to be passed to the function when it's called. 896 * key - user-defined key to be passed to the function when it's called.
1959 */ 897 */
1960enum xp_retval 898enum xp_retval
1961xpc_initiate_send_notify(short partid, int ch_number, void *payload, 899xpc_initiate_send_notify(short partid, int ch_number, u32 flags, void *payload,
1962 xpc_notify_func func, void *key) 900 u16 payload_size, xpc_notify_func func, void *key)
1963{ 901{
1964 struct xpc_partition *part = &xpc_partitions[partid]; 902 struct xpc_partition *part = &xpc_partitions[partid];
1965 struct xpc_msg *msg = XPC_MSG_ADDRESS(payload); 903 enum xp_retval ret = xpUnknownReason;
1966 enum xp_retval ret;
1967 904
1968 dev_dbg(xpc_chan, "msg=0x%p, partid=%d, channel=%d\n", (void *)msg, 905 dev_dbg(xpc_chan, "payload=0x%p, partid=%d, channel=%d\n", payload,
1969 partid, ch_number); 906 partid, ch_number);
1970 907
1971 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS); 908 DBUG_ON(partid < 0 || partid >= xp_max_npartitions);
1972 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels); 909 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
1973 DBUG_ON(msg == NULL); 910 DBUG_ON(payload == NULL);
1974 DBUG_ON(func == NULL); 911 DBUG_ON(func == NULL);
1975 912
1976 ret = xpc_send_msg(&part->channels[ch_number], msg, XPC_N_CALL, 913 if (xpc_part_ref(part)) {
1977 func, key); 914 ret = xpc_send_payload(&part->channels[ch_number], flags,
1978 return ret; 915 payload, payload_size, XPC_N_CALL, func,
1979} 916 key);
1980 917 xpc_part_deref(part);
1981static struct xpc_msg *
1982xpc_pull_remote_msg(struct xpc_channel *ch, s64 get)
1983{
1984 struct xpc_partition *part = &xpc_partitions[ch->partid];
1985 struct xpc_msg *remote_msg, *msg;
1986 u32 msg_index, nmsgs;
1987 u64 msg_offset;
1988 enum xp_retval ret;
1989
1990 if (mutex_lock_interruptible(&ch->msg_to_pull_mutex) != 0) {
1991 /* we were interrupted by a signal */
1992 return NULL;
1993 }
1994
1995 while (get >= ch->next_msg_to_pull) {
1996
1997 /* pull as many messages as are ready and able to be pulled */
1998
1999 msg_index = ch->next_msg_to_pull % ch->remote_nentries;
2000
2001 DBUG_ON(ch->next_msg_to_pull >= ch->w_remote_GP.put);
2002 nmsgs = ch->w_remote_GP.put - ch->next_msg_to_pull;
2003 if (msg_index + nmsgs > ch->remote_nentries) {
2004 /* ignore the ones that wrap the msg queue for now */
2005 nmsgs = ch->remote_nentries - msg_index;
2006 }
2007
2008 msg_offset = msg_index * ch->msg_size;
2009 msg = (struct xpc_msg *)((u64)ch->remote_msgqueue + msg_offset);
2010 remote_msg = (struct xpc_msg *)(ch->remote_msgqueue_pa +
2011 msg_offset);
2012
2013 ret = xpc_pull_remote_cachelines(part, msg, remote_msg,
2014 nmsgs * ch->msg_size);
2015 if (ret != xpSuccess) {
2016
2017 dev_dbg(xpc_chan, "failed to pull %d msgs starting with"
2018 " msg %ld from partition %d, channel=%d, "
2019 "ret=%d\n", nmsgs, ch->next_msg_to_pull,
2020 ch->partid, ch->number, ret);
2021
2022 XPC_DEACTIVATE_PARTITION(part, ret);
2023
2024 mutex_unlock(&ch->msg_to_pull_mutex);
2025 return NULL;
2026 }
2027
2028 ch->next_msg_to_pull += nmsgs;
2029 } 918 }
2030 919 return ret;
2031 mutex_unlock(&ch->msg_to_pull_mutex);
2032
2033 /* return the message we were looking for */
2034 msg_offset = (get % ch->remote_nentries) * ch->msg_size;
2035 msg = (struct xpc_msg *)((u64)ch->remote_msgqueue + msg_offset);
2036
2037 return msg;
2038}
2039
2040/*
2041 * Get a message to be delivered.
2042 */
2043static struct xpc_msg *
2044xpc_get_deliverable_msg(struct xpc_channel *ch)
2045{
2046 struct xpc_msg *msg = NULL;
2047 s64 get;
2048
2049 do {
2050 if (ch->flags & XPC_C_DISCONNECTING)
2051 break;
2052
2053 get = ch->w_local_GP.get;
2054 rmb(); /* guarantee that .get loads before .put */
2055 if (get == ch->w_remote_GP.put)
2056 break;
2057
2058 /* There are messages waiting to be pulled and delivered.
2059 * We need to try to secure one for ourselves. We'll do this
2060 * by trying to increment w_local_GP.get and hope that no one
2061 		 * else beats us to it. If they do, we'll simply have
2062 * to try again for the next one.
2063 */
2064
2065 if (cmpxchg(&ch->w_local_GP.get, get, get + 1) == get) {
2066 /* we got the entry referenced by get */
2067
2068 dev_dbg(xpc_chan, "w_local_GP.get changed to %ld, "
2069 "partid=%d, channel=%d\n", get + 1,
2070 ch->partid, ch->number);
2071
2072 /* pull the message from the remote partition */
2073
2074 msg = xpc_pull_remote_msg(ch, get);
2075
2076 DBUG_ON(msg != NULL && msg->number != get);
2077 DBUG_ON(msg != NULL && (msg->flags & XPC_M_DONE));
2078 DBUG_ON(msg != NULL && !(msg->flags & XPC_M_READY));
2079
2080 break;
2081 }
2082
2083 } while (1);
2084
2085 return msg;
2086} 920}
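
A hypothetical use of xpc_initiate_send_notify() as redefined above. The notification callback's arguments follow the xpc_notify_func usage visible in this file (reason, partid, ch_number, key); the helper names and header include are assumptions.

#include <linux/types.h>
#include "xpc.h"		/* assumed location of the prototypes */

/* runs asynchronously once the remote side has received the message;
 * it must not block */
static void example_sent_notify(enum xp_retval reason, short partid,
				int ch_number, void *key)
{
	/* non-blocking bookkeeping only, e.g. complete() or atomic_dec() */
}

static enum xp_retval example_send_notify(short partid, int ch_number,
					  void *payload, u16 payload_size)
{
	return xpc_initiate_send_notify(partid, ch_number, XPC_NOWAIT,
					payload, payload_size,
					example_sent_notify, NULL);
}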
2087 921
2088/* 922/*
2089 * Deliver a message to its intended recipient. 923 * Deliver a message's payload to its intended recipient.
2090 */ 924 */
2091void 925void
2092xpc_deliver_msg(struct xpc_channel *ch) 926xpc_deliver_payload(struct xpc_channel *ch)
2093{ 927{
2094 struct xpc_msg *msg; 928 void *payload;
2095 929
2096 msg = xpc_get_deliverable_msg(ch); 930 payload = xpc_get_deliverable_payload(ch);
2097 if (msg != NULL) { 931 if (payload != NULL) {
2098 932
2099 /* 933 /*
2100 * This ref is taken to protect the payload itself from being 934 * This ref is taken to protect the payload itself from being
@@ -2106,18 +940,16 @@ xpc_deliver_msg(struct xpc_channel *ch)
2106 atomic_inc(&ch->kthreads_active); 940 atomic_inc(&ch->kthreads_active);
2107 941
2108 if (ch->func != NULL) { 942 if (ch->func != NULL) {
2109 dev_dbg(xpc_chan, "ch->func() called, msg=0x%p, " 943 dev_dbg(xpc_chan, "ch->func() called, payload=0x%p "
2110 "msg_number=%ld, partid=%d, channel=%d\n", 944 "partid=%d channel=%d\n", payload, ch->partid,
2111 (void *)msg, msg->number, ch->partid,
2112 ch->number); 945 ch->number);
2113 946
2114 /* deliver the message to its intended recipient */ 947 /* deliver the message to its intended recipient */
2115 ch->func(xpMsgReceived, ch->partid, ch->number, 948 ch->func(xpMsgReceived, ch->partid, ch->number, payload,
2116 &msg->payload, ch->key); 949 ch->key);
2117 950
2118 dev_dbg(xpc_chan, "ch->func() returned, msg=0x%p, " 951 dev_dbg(xpc_chan, "ch->func() returned, payload=0x%p "
2119 "msg_number=%ld, partid=%d, channel=%d\n", 952 "partid=%d channel=%d\n", payload, ch->partid,
2120 (void *)msg, msg->number, ch->partid,
2121 ch->number); 953 ch->number);
2122 } 954 }
2123 955
@@ -2126,118 +958,31 @@ xpc_deliver_msg(struct xpc_channel *ch)
2126} 958}
2127 959
2128/* 960/*
2129 * Now we actually acknowledge the messages that have been delivered and ack'd 961 * Acknowledge receipt of a delivered message's payload.
2130 * by advancing the cached remote message queue's Get value and if requested
2131 * send an IPI to the message sender's partition.
2132 */
2133static void
2134xpc_acknowledge_msgs(struct xpc_channel *ch, s64 initial_get, u8 msg_flags)
2135{
2136 struct xpc_msg *msg;
2137 s64 get = initial_get + 1;
2138 int send_IPI = 0;
2139
2140 while (1) {
2141
2142 while (1) {
2143 if (get == ch->w_local_GP.get)
2144 break;
2145
2146 msg = (struct xpc_msg *)((u64)ch->remote_msgqueue +
2147 (get % ch->remote_nentries) *
2148 ch->msg_size);
2149
2150 if (!(msg->flags & XPC_M_DONE))
2151 break;
2152
2153 msg_flags |= msg->flags;
2154 get++;
2155 }
2156
2157 if (get == initial_get) {
2158 /* nothing's changed */
2159 break;
2160 }
2161
2162 if (cmpxchg_rel(&ch->local_GP->get, initial_get, get) !=
2163 initial_get) {
2164 /* someone else beat us to it */
2165 DBUG_ON(ch->local_GP->get <= initial_get);
2166 break;
2167 }
2168
2169 /* we just set the new value of local_GP->get */
2170
2171 dev_dbg(xpc_chan, "local_GP->get changed to %ld, partid=%d, "
2172 "channel=%d\n", get, ch->partid, ch->number);
2173
2174 send_IPI = (msg_flags & XPC_M_INTERRUPT);
2175
2176 /*
2177 * We need to ensure that the message referenced by
2178 * local_GP->get is not XPC_M_DONE or that local_GP->get
2179 * equals w_local_GP.get, so we'll go have a look.
2180 */
2181 initial_get = get;
2182 }
2183
2184 if (send_IPI)
2185 xpc_IPI_send_msgrequest(ch);
2186}
2187
2188/*
2189 * Acknowledge receipt of a delivered message.
2190 *
2191 * If a message has XPC_M_INTERRUPT set, send an interrupt to the partition
2192 * that sent the message.
2193 * 962 *
2194 * This function, although called by users, does not call xpc_part_ref() to 963 * This function, although called by users, does not call xpc_part_ref() to
2195 * ensure that the partition infrastructure is in place. It relies on the 964 * ensure that the partition infrastructure is in place. It relies on the
2196 * fact that we called xpc_msgqueue_ref() in xpc_deliver_msg(). 965 * fact that we called xpc_msgqueue_ref() in xpc_deliver_payload().
2197 * 966 *
2198 * Arguments: 967 * Arguments:
2199 * 968 *
2200 * partid - ID of partition to which the channel is connected. 969 * partid - ID of partition to which the channel is connected.
2201 * ch_number - channel # message received on. 970 * ch_number - channel # message received on.
2202 * payload - pointer to the payload area allocated via 971 * payload - pointer to the payload area allocated via
2203 * xpc_initiate_allocate(). 972 * xpc_initiate_send() or xpc_initiate_send_notify().
2204 */ 973 */
2205void 974void
2206xpc_initiate_received(short partid, int ch_number, void *payload) 975xpc_initiate_received(short partid, int ch_number, void *payload)
2207{ 976{
2208 struct xpc_partition *part = &xpc_partitions[partid]; 977 struct xpc_partition *part = &xpc_partitions[partid];
2209 struct xpc_channel *ch; 978 struct xpc_channel *ch;
2210 struct xpc_msg *msg = XPC_MSG_ADDRESS(payload);
2211 s64 get, msg_number = msg->number;
2212 979
2213 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS); 980 DBUG_ON(partid < 0 || partid >= xp_max_npartitions);
2214 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels); 981 DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
2215 982
2216 ch = &part->channels[ch_number]; 983 ch = &part->channels[ch_number];
984 xpc_received_payload(ch, payload);
2217 985
2218 dev_dbg(xpc_chan, "msg=0x%p, msg_number=%ld, partid=%d, channel=%d\n", 986 /* the call to xpc_msgqueue_ref() was done by xpc_deliver_payload() */
2219 (void *)msg, msg_number, ch->partid, ch->number);
2220
2221 DBUG_ON((((u64)msg - (u64)ch->remote_msgqueue) / ch->msg_size) !=
2222 msg_number % ch->remote_nentries);
2223 DBUG_ON(msg->flags & XPC_M_DONE);
2224
2225 msg->flags |= XPC_M_DONE;
2226
2227 /*
2228 * The preceding store of msg->flags must occur before the following
2229 * load of ch->local_GP->get.
2230 */
2231 mb();
2232
2233 /*
2234 * See if this message is next in line to be acknowledged as having
2235 * been delivered.
2236 */
2237 get = ch->local_GP->get;
2238 if (get == msg_number)
2239 xpc_acknowledge_msgs(ch, get, msg->flags);
2240
2241 /* the call to xpc_msgqueue_ref() was done by xpc_deliver_msg() */
2242 xpc_msgqueue_deref(ch); 987 xpc_msgqueue_deref(ch);
2243} 988}
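/*
 * Hedged sketch, not part of this commit: a consumer-side channel function
 * of the shape implied by the ch->func(xpMsgReceived, ...) callout above.
 * my_channel_func() and my_handle_payload() are illustrative names; the
 * firm points are the argument order shown in that callout and the final
 * call to xpc_initiate_received() once the payload has been consumed.
 */
static void
my_handle_payload(void *payload)
{
	/* consumer-specific processing of the delivered payload */
}

static void
my_channel_func(enum xp_retval reason, short partid, int ch_number,
		void *payload, void *key)
{
	if (reason != xpMsgReceived)
		return;		/* a connection event, not a delivered message */

	my_handle_payload(payload);

	/* hand the payload slot back to XPC (see xpc_initiate_received() above) */
	xpc_initiate_received(partid, ch_number, payload);
}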
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c
index 579b01ff82d4..46325fc84811 100644
--- a/drivers/misc/sgi-xp/xpc_main.c
+++ b/drivers/misc/sgi-xp/xpc_main.c
@@ -25,37 +25,31 @@
25 * 25 *
26 * Caveats: 26 * Caveats:
27 * 27 *
28 * . We currently have no way to determine which nasid an IPI came 28 * . Currently on sn2, we have no way to determine which nasid an IRQ
29 * from. Thus, xpc_IPI_send() does a remote AMO write followed by 29 * came from. Thus, xpc_send_IRQ_sn2() does a remote amo write
30 * an IPI. The AMO indicates where data is to be pulled from, so 30 * followed by an IPI. The amo indicates where data is to be pulled
31 * after the IPI arrives, the remote partition checks the AMO word. 31 * from, so after the IPI arrives, the remote partition checks the amo
32 * The IPI can actually arrive before the AMO however, so other code 32 * word. The IPI can actually arrive before the amo however, so other
33 * must periodically check for this case. Also, remote AMO operations 33 * code must periodically check for this case. Also, remote amo
34 * do not reliably time out. Thus we do a remote PIO read solely to 34 * operations do not reliably time out. Thus we do a remote PIO read
35 * know whether the remote partition is down and whether we should 35 * solely to know whether the remote partition is down and whether we
36 * stop sending IPIs to it. This remote PIO read operation is set up 36 * should stop sending IPIs to it. This remote PIO read operation is
37 * in a special nofault region so SAL knows to ignore (and cleanup) 37 * set up in a special nofault region so SAL knows to ignore (and
38 * any errors due to the remote AMO write, PIO read, and/or PIO 38 * cleanup) any errors due to the remote amo write, PIO read, and/or
39 * write operations. 39 * PIO write operations.
40 * 40 *
41 * If/when new hardware solves this IPI problem, we should abandon 41 * If/when new hardware solves this IPI problem, we should abandon
42 * the current approach. 42 * the current approach.
43 * 43 *
44 */ 44 */
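/*
 * Hedged sketch of the send-side ordering described in the caveat above;
 * this is not code from the commit.  write_remote_amo(), send_remote_ipi()
 * and remote_pio_read_nofault() are hypothetical stand-ins for the sn2
 * primitives -- only the calling order is the point being illustrated.
 */
static void write_remote_amo(short partid, u64 flag) { /* hypothetical */ }
static void send_remote_ipi(short partid) { /* hypothetical */ }
static void remote_pio_read_nofault(short partid) { /* hypothetical */ }

static void
example_send_IRQ_sn2(short partid, u64 amo_flag)
{
	/* 1. post the reason for the interrupt in the remote amo word */
	write_remote_amo(partid, amo_flag);

	/* 2. raise the IPI; it can overtake the amo write, so the receiver
	 *    also polls its amo words periodically */
	send_remote_ipi(partid);

	/* 3. the nofault PIO read is what reveals a dead remote partition,
	 *    telling us to stop sending it IPIs */
	remote_pio_read_nofault(partid);
}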
45 45
46#include <linux/kernel.h>
47#include <linux/module.h> 46#include <linux/module.h>
48#include <linux/init.h> 47#include <linux/sysctl.h>
49#include <linux/cache.h> 48#include <linux/device.h>
50#include <linux/interrupt.h>
51#include <linux/delay.h> 49#include <linux/delay.h>
52#include <linux/reboot.h> 50#include <linux/reboot.h>
53#include <linux/completion.h>
54#include <linux/kdebug.h> 51#include <linux/kdebug.h>
55#include <linux/kthread.h> 52#include <linux/kthread.h>
56#include <linux/uaccess.h>
57#include <asm/sn/intr.h>
58#include <asm/sn/sn_sal.h>
59#include "xpc.h" 53#include "xpc.h"
60 54
61/* define two XPC debug device structures to be used with dev_dbg() et al */ 55/* define two XPC debug device structures to be used with dev_dbg() et al */
@@ -89,9 +83,9 @@ static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL;
89static int xpc_hb_check_min_interval = 10; 83static int xpc_hb_check_min_interval = 10;
90static int xpc_hb_check_max_interval = 120; 84static int xpc_hb_check_max_interval = 120;
91 85
92int xpc_disengage_request_timelimit = XPC_DISENGAGE_REQUEST_DEFAULT_TIMELIMIT; 86int xpc_disengage_timelimit = XPC_DISENGAGE_DEFAULT_TIMELIMIT;
93static int xpc_disengage_request_min_timelimit; /* = 0 */ 87static int xpc_disengage_min_timelimit; /* = 0 */
94static int xpc_disengage_request_max_timelimit = 120; 88static int xpc_disengage_max_timelimit = 120;
95 89
96static ctl_table xpc_sys_xpc_hb_dir[] = { 90static ctl_table xpc_sys_xpc_hb_dir[] = {
97 { 91 {
@@ -124,14 +118,14 @@ static ctl_table xpc_sys_xpc_dir[] = {
124 .child = xpc_sys_xpc_hb_dir}, 118 .child = xpc_sys_xpc_hb_dir},
125 { 119 {
126 .ctl_name = CTL_UNNUMBERED, 120 .ctl_name = CTL_UNNUMBERED,
127 .procname = "disengage_request_timelimit", 121 .procname = "disengage_timelimit",
128 .data = &xpc_disengage_request_timelimit, 122 .data = &xpc_disengage_timelimit,
129 .maxlen = sizeof(int), 123 .maxlen = sizeof(int),
130 .mode = 0644, 124 .mode = 0644,
131 .proc_handler = &proc_dointvec_minmax, 125 .proc_handler = &proc_dointvec_minmax,
132 .strategy = &sysctl_intvec, 126 .strategy = &sysctl_intvec,
133 .extra1 = &xpc_disengage_request_min_timelimit, 127 .extra1 = &xpc_disengage_min_timelimit,
134 .extra2 = &xpc_disengage_request_max_timelimit}, 128 .extra2 = &xpc_disengage_max_timelimit},
135 {} 129 {}
136}; 130};
137static ctl_table xpc_sys_dir[] = { 131static ctl_table xpc_sys_dir[] = {
@@ -144,16 +138,19 @@ static ctl_table xpc_sys_dir[] = {
144}; 138};
145static struct ctl_table_header *xpc_sysctl; 139static struct ctl_table_header *xpc_sysctl;
146 140
147/* non-zero if any remote partition disengage request was timed out */ 141/* non-zero if any remote partition disengage was timed out */
148int xpc_disengage_request_timedout; 142int xpc_disengage_timedout;
149 143
150/* #of IRQs received */ 144/* #of activate IRQs received and not yet processed */
151static atomic_t xpc_act_IRQ_rcvd; 145int xpc_activate_IRQ_rcvd;
146DEFINE_SPINLOCK(xpc_activate_IRQ_rcvd_lock);
152 147
153/* IRQ handler notifies this wait queue on receipt of an IRQ */ 148/* IRQ handler notifies this wait queue on receipt of an IRQ */
154static DECLARE_WAIT_QUEUE_HEAD(xpc_act_IRQ_wq); 149DECLARE_WAIT_QUEUE_HEAD(xpc_activate_IRQ_wq);
155 150
156static unsigned long xpc_hb_check_timeout; 151static unsigned long xpc_hb_check_timeout;
152static struct timer_list xpc_hb_timer;
153void *xpc_heartbeating_to_mask;
157 154
158/* notification that the xpc_hb_checker thread has exited */ 155/* notification that the xpc_hb_checker thread has exited */
159static DECLARE_COMPLETION(xpc_hb_checker_exited); 156static DECLARE_COMPLETION(xpc_hb_checker_exited);
@@ -161,8 +158,6 @@ static DECLARE_COMPLETION(xpc_hb_checker_exited);
161/* notification that the xpc_discovery thread has exited */ 158/* notification that the xpc_discovery thread has exited */
162static DECLARE_COMPLETION(xpc_discovery_exited); 159static DECLARE_COMPLETION(xpc_discovery_exited);
163 160
164static struct timer_list xpc_hb_timer;
165
166static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *); 161static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);
167 162
168static int xpc_system_reboot(struct notifier_block *, unsigned long, void *); 163static int xpc_system_reboot(struct notifier_block *, unsigned long, void *);
@@ -175,31 +170,76 @@ static struct notifier_block xpc_die_notifier = {
175 .notifier_call = xpc_system_die, 170 .notifier_call = xpc_system_die,
176}; 171};
177 172
173int (*xpc_setup_partitions_sn) (void);
174enum xp_retval (*xpc_get_partition_rsvd_page_pa) (void *buf, u64 *cookie,
175 unsigned long *rp_pa,
176 size_t *len);
177int (*xpc_setup_rsvd_page_sn) (struct xpc_rsvd_page *rp);
178void (*xpc_heartbeat_init) (void);
179void (*xpc_heartbeat_exit) (void);
180void (*xpc_increment_heartbeat) (void);
181void (*xpc_offline_heartbeat) (void);
182void (*xpc_online_heartbeat) (void);
183enum xp_retval (*xpc_get_remote_heartbeat) (struct xpc_partition *part);
184
185enum xp_retval (*xpc_make_first_contact) (struct xpc_partition *part);
186void (*xpc_notify_senders_of_disconnect) (struct xpc_channel *ch);
187u64 (*xpc_get_chctl_all_flags) (struct xpc_partition *part);
188enum xp_retval (*xpc_setup_msg_structures) (struct xpc_channel *ch);
189void (*xpc_teardown_msg_structures) (struct xpc_channel *ch);
190void (*xpc_process_msg_chctl_flags) (struct xpc_partition *part, int ch_number);
191int (*xpc_n_of_deliverable_payloads) (struct xpc_channel *ch);
192void *(*xpc_get_deliverable_payload) (struct xpc_channel *ch);
193
194void (*xpc_request_partition_activation) (struct xpc_rsvd_page *remote_rp,
195 unsigned long remote_rp_pa,
196 int nasid);
197void (*xpc_request_partition_reactivation) (struct xpc_partition *part);
198void (*xpc_request_partition_deactivation) (struct xpc_partition *part);
199void (*xpc_cancel_partition_deactivation_request) (struct xpc_partition *part);
200
201void (*xpc_process_activate_IRQ_rcvd) (void);
202enum xp_retval (*xpc_setup_ch_structures_sn) (struct xpc_partition *part);
203void (*xpc_teardown_ch_structures_sn) (struct xpc_partition *part);
204
205void (*xpc_indicate_partition_engaged) (struct xpc_partition *part);
206int (*xpc_partition_engaged) (short partid);
207int (*xpc_any_partition_engaged) (void);
208void (*xpc_indicate_partition_disengaged) (struct xpc_partition *part);
209void (*xpc_assume_partition_disengaged) (short partid);
210
211void (*xpc_send_chctl_closerequest) (struct xpc_channel *ch,
212 unsigned long *irq_flags);
213void (*xpc_send_chctl_closereply) (struct xpc_channel *ch,
214 unsigned long *irq_flags);
215void (*xpc_send_chctl_openrequest) (struct xpc_channel *ch,
216 unsigned long *irq_flags);
217void (*xpc_send_chctl_openreply) (struct xpc_channel *ch,
218 unsigned long *irq_flags);
219
220void (*xpc_save_remote_msgqueue_pa) (struct xpc_channel *ch,
221 unsigned long msgqueue_pa);
222
223enum xp_retval (*xpc_send_payload) (struct xpc_channel *ch, u32 flags,
224 void *payload, u16 payload_size,
225 u8 notify_type, xpc_notify_func func,
226 void *key);
227void (*xpc_received_payload) (struct xpc_channel *ch, void *payload);
228
178/* 229/*
179 * Timer function to enforce the timelimit on the partition disengage request. 230 * Timer function to enforce the timelimit on the partition disengage.
180 */ 231 */
181static void 232static void
182xpc_timeout_partition_disengage_request(unsigned long data) 233xpc_timeout_partition_disengage(unsigned long data)
183{ 234{
184 struct xpc_partition *part = (struct xpc_partition *)data; 235 struct xpc_partition *part = (struct xpc_partition *)data;
185 236
186 DBUG_ON(time_before(jiffies, part->disengage_request_timeout)); 237 DBUG_ON(time_is_after_jiffies(part->disengage_timeout));
187 238
188 (void)xpc_partition_disengaged(part); 239 (void)xpc_partition_disengaged(part);
189 240
190 DBUG_ON(part->disengage_request_timeout != 0); 241 DBUG_ON(part->disengage_timeout != 0);
191 DBUG_ON(xpc_partition_engaged(1UL << XPC_PARTID(part)) != 0); 242 DBUG_ON(xpc_partition_engaged(XPC_PARTID(part)));
192}
193
194/*
195 * Notify the heartbeat check thread that an IRQ has been received.
196 */
197static irqreturn_t
198xpc_act_IRQ_handler(int irq, void *dev_id)
199{
200 atomic_inc(&xpc_act_IRQ_rcvd);
201 wake_up_interruptible(&xpc_act_IRQ_wq);
202 return IRQ_HANDLED;
203} 243}
204 244
205/* 245/*
@@ -210,15 +250,63 @@ xpc_act_IRQ_handler(int irq, void *dev_id)
210static void 250static void
211xpc_hb_beater(unsigned long dummy) 251xpc_hb_beater(unsigned long dummy)
212{ 252{
213 xpc_vars->heartbeat++; 253 xpc_increment_heartbeat();
214 254
215 if (time_after_eq(jiffies, xpc_hb_check_timeout)) 255 if (time_is_before_eq_jiffies(xpc_hb_check_timeout))
216 wake_up_interruptible(&xpc_act_IRQ_wq); 256 wake_up_interruptible(&xpc_activate_IRQ_wq);
217 257
218 xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ); 258 xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
219 add_timer(&xpc_hb_timer); 259 add_timer(&xpc_hb_timer);
220} 260}
221 261
262static void
263xpc_start_hb_beater(void)
264{
265 xpc_heartbeat_init();
266 init_timer(&xpc_hb_timer);
267 xpc_hb_timer.function = xpc_hb_beater;
268 xpc_hb_beater(0);
269}
270
271static void
272xpc_stop_hb_beater(void)
273{
274 del_timer_sync(&xpc_hb_timer);
275 xpc_heartbeat_exit();
276}
277
278/*
279 * At periodic intervals, scan through all active partitions and ensure
280 * their heartbeat is still active. If not, the partition is deactivated.
281 */
282static void
283xpc_check_remote_hb(void)
284{
285 struct xpc_partition *part;
286 short partid;
287 enum xp_retval ret;
288
289 for (partid = 0; partid < xp_max_npartitions; partid++) {
290
291 if (xpc_exiting)
292 break;
293
294 if (partid == xp_partition_id)
295 continue;
296
297 part = &xpc_partitions[partid];
298
299 if (part->act_state == XPC_P_AS_INACTIVE ||
300 part->act_state == XPC_P_AS_DEACTIVATING) {
301 continue;
302 }
303
304 ret = xpc_get_remote_heartbeat(part);
305 if (ret != xpSuccess)
306 XPC_DEACTIVATE_PARTITION(part, ret);
307 }
308}
309
222/* 310/*
223 * This thread is responsible for nearly all of the partition 311 * This thread is responsible for nearly all of the partition
224 * activation/deactivation. 312 * activation/deactivation.
@@ -226,67 +314,57 @@ xpc_hb_beater(unsigned long dummy)
226static int 314static int
227xpc_hb_checker(void *ignore) 315xpc_hb_checker(void *ignore)
228{ 316{
229 int last_IRQ_count = 0;
230 int new_IRQ_count;
231 int force_IRQ = 0; 317 int force_IRQ = 0;
232 cpumask_of_cpu_ptr(cpumask, XPC_HB_CHECK_CPU);
233 318
234 /* this thread was marked active by xpc_hb_init() */ 319 /* this thread was marked active by xpc_hb_init() */
235 320
236 set_cpus_allowed_ptr(current, cpumask); 321 set_cpus_allowed_ptr(current, &cpumask_of_cpu(XPC_HB_CHECK_CPU));
237 322
238 /* set our heartbeating to other partitions into motion */ 323 /* set our heartbeating to other partitions into motion */
239 xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ); 324 xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
240 xpc_hb_beater(0); 325 xpc_start_hb_beater();
241 326
242 while (!xpc_exiting) { 327 while (!xpc_exiting) {
243 328
244 dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have " 329 dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
245 "been received\n", 330 "been received\n",
246 (int)(xpc_hb_check_timeout - jiffies), 331 (int)(xpc_hb_check_timeout - jiffies),
247 atomic_read(&xpc_act_IRQ_rcvd) - last_IRQ_count); 332 xpc_activate_IRQ_rcvd);
248 333
249 /* checking of remote heartbeats is skewed by IRQ handling */ 334 /* checking of remote heartbeats is skewed by IRQ handling */
250 if (time_after_eq(jiffies, xpc_hb_check_timeout)) { 335 if (time_is_before_eq_jiffies(xpc_hb_check_timeout)) {
336 xpc_hb_check_timeout = jiffies +
337 (xpc_hb_check_interval * HZ);
338
251 dev_dbg(xpc_part, "checking remote heartbeats\n"); 339 dev_dbg(xpc_part, "checking remote heartbeats\n");
252 xpc_check_remote_hb(); 340 xpc_check_remote_hb();
253 341
254 /* 342 /*
255 * We need to periodically recheck to ensure no 343 * On sn2 we need to periodically recheck to ensure no
256 * IPI/AMO pairs have been missed. That check 344 * IRQ/amo pairs have been missed.
257 * must always reset xpc_hb_check_timeout.
258 */ 345 */
259 force_IRQ = 1; 346 if (is_shub())
347 force_IRQ = 1;
260 } 348 }
261 349
262 /* check for outstanding IRQs */ 350 /* check for outstanding IRQs */
263 new_IRQ_count = atomic_read(&xpc_act_IRQ_rcvd); 351 if (xpc_activate_IRQ_rcvd > 0 || force_IRQ != 0) {
264 if (last_IRQ_count < new_IRQ_count || force_IRQ != 0) {
265 force_IRQ = 0; 352 force_IRQ = 0;
266 353 dev_dbg(xpc_part, "processing activate IRQs "
267 dev_dbg(xpc_part, "found an IRQ to process; will be " 354 "received\n");
268 "resetting xpc_hb_check_timeout\n"); 355 xpc_process_activate_IRQ_rcvd();
269
270 last_IRQ_count += xpc_identify_act_IRQ_sender();
271 if (last_IRQ_count < new_IRQ_count) {
272 /* retry once to help avoid missing AMO */
273 (void)xpc_identify_act_IRQ_sender();
274 }
275 last_IRQ_count = new_IRQ_count;
276
277 xpc_hb_check_timeout = jiffies +
278 (xpc_hb_check_interval * HZ);
279 } 356 }
280 357
281 /* wait for IRQ or timeout */ 358 /* wait for IRQ or timeout */
282 (void)wait_event_interruptible(xpc_act_IRQ_wq, 359 (void)wait_event_interruptible(xpc_activate_IRQ_wq,
283 (last_IRQ_count < 360 (time_is_before_eq_jiffies(
284 atomic_read(&xpc_act_IRQ_rcvd) 361 xpc_hb_check_timeout) ||
285 || time_after_eq(jiffies, 362 xpc_activate_IRQ_rcvd > 0 ||
286 xpc_hb_check_timeout) ||
287 xpc_exiting)); 363 xpc_exiting));
288 } 364 }
289 365
366 xpc_stop_hb_beater();
367
290 dev_dbg(xpc_part, "heartbeat checker is exiting\n"); 368 dev_dbg(xpc_part, "heartbeat checker is exiting\n");
291 369
292 /* mark this thread as having exited */ 370 /* mark this thread as having exited */
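/*
 * Hedged sketch, not part of this hunk: what an architecture-specific
 * activate-IRQ handler is expected to do so that xpc_hb_checker() above
 * wakes up -- bump xpc_activate_IRQ_rcvd under its lock and poke the wait
 * queue.  example_activate_IRQ_handler() is an illustrative name; the real
 * handlers live in the sn2/uv specific code.
 */
static irqreturn_t
example_activate_IRQ_handler(int irq, void *dev_id)
{
	unsigned long irq_flags;

	spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
	xpc_activate_IRQ_rcvd++;
	spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);

	wake_up_interruptible(&xpc_activate_IRQ_wq);
	return IRQ_HANDLED;
}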
@@ -312,37 +390,8 @@ xpc_initiate_discovery(void *ignore)
312} 390}
313 391
314/* 392/*
315 * Establish first contact with the remote partition. This involves pulling
316 * the XPC per partition variables from the remote partition and waiting for
317 * the remote partition to pull ours.
318 */
319static enum xp_retval
320xpc_make_first_contact(struct xpc_partition *part)
321{
322 enum xp_retval ret;
323
324 while ((ret = xpc_pull_remote_vars_part(part)) != xpSuccess) {
325 if (ret != xpRetry) {
326 XPC_DEACTIVATE_PARTITION(part, ret);
327 return ret;
328 }
329
330 dev_dbg(xpc_chan, "waiting to make first contact with "
331 "partition %d\n", XPC_PARTID(part));
332
333 /* wait a 1/4 of a second or so */
334 (void)msleep_interruptible(250);
335
336 if (part->act_state == XPC_P_DEACTIVATING)
337 return part->reason;
338 }
339
340 return xpc_mark_partition_active(part);
341}
342
343/*
344 * The first kthread assigned to a newly activated partition is the one 393 * The first kthread assigned to a newly activated partition is the one
345 * created by XPC HB with which it calls xpc_partition_up(). XPC hangs on to 394 * created by XPC HB with which it calls xpc_activating(). XPC hangs on to
346 * that kthread until the partition is brought down, at which time that kthread 395 * that kthread until the partition is brought down, at which time that kthread
347 * returns back to XPC HB. (The return of that kthread will signify to XPC HB 396 * returns back to XPC HB. (The return of that kthread will signify to XPC HB
348 * that XPC has dismantled all communication infrastructure for the associated 397 * that XPC has dismantled all communication infrastructure for the associated
@@ -355,11 +404,11 @@ xpc_make_first_contact(struct xpc_partition *part)
355static void 404static void
356xpc_channel_mgr(struct xpc_partition *part) 405xpc_channel_mgr(struct xpc_partition *part)
357{ 406{
358 while (part->act_state != XPC_P_DEACTIVATING || 407 while (part->act_state != XPC_P_AS_DEACTIVATING ||
359 atomic_read(&part->nchannels_active) > 0 || 408 atomic_read(&part->nchannels_active) > 0 ||
360 !xpc_partition_disengaged(part)) { 409 !xpc_partition_disengaged(part)) {
361 410
362 xpc_process_channel_activity(part); 411 xpc_process_sent_chctl_flags(part);
363 412
364 /* 413 /*
365 * Wait until we've been requested to activate kthreads or 414 * Wait until we've been requested to activate kthreads or
@@ -377,8 +426,8 @@ xpc_channel_mgr(struct xpc_partition *part)
377 atomic_dec(&part->channel_mgr_requests); 426 atomic_dec(&part->channel_mgr_requests);
378 (void)wait_event_interruptible(part->channel_mgr_wq, 427 (void)wait_event_interruptible(part->channel_mgr_wq,
379 (atomic_read(&part->channel_mgr_requests) > 0 || 428 (atomic_read(&part->channel_mgr_requests) > 0 ||
380 part->local_IPI_amo != 0 || 429 part->chctl.all_flags != 0 ||
381 (part->act_state == XPC_P_DEACTIVATING && 430 (part->act_state == XPC_P_AS_DEACTIVATING &&
382 atomic_read(&part->nchannels_active) == 0 && 431 atomic_read(&part->nchannels_active) == 0 &&
383 xpc_partition_disengaged(part)))); 432 xpc_partition_disengaged(part))));
384 atomic_set(&part->channel_mgr_requests, 1); 433 atomic_set(&part->channel_mgr_requests, 1);
@@ -386,47 +435,163 @@ xpc_channel_mgr(struct xpc_partition *part)
386} 435}
387 436
388/* 437/*
389 * When XPC HB determines that a partition has come up, it will create a new 438 * Guarantee that the kzalloc'd memory is cacheline aligned.
390 * kthread and that kthread will call this function to attempt to set up the
391 * basic infrastructure used for Cross Partition Communication with the newly
392 * upped partition.
393 *
394 * The kthread that was created by XPC HB and which setup the XPC
395 * infrastructure will remain assigned to the partition until the partition
396 * goes down. At which time the kthread will teardown the XPC infrastructure
397 * and then exit.
398 *
399 * XPC HB will put the remote partition's XPC per partition specific variables
400 * physical address into xpc_partitions[partid].remote_vars_part_pa prior to
401 * calling xpc_partition_up().
402 */ 439 */
403static void 440void *
404xpc_partition_up(struct xpc_partition *part) 441xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
442{
443 /* see if kzalloc will give us cacheline aligned memory by default */
444 *base = kzalloc(size, flags);
445 if (*base == NULL)
446 return NULL;
447
448 if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
449 return *base;
450
451 kfree(*base);
452
453 /* nope, we'll have to do it ourselves */
454 *base = kzalloc(size + L1_CACHE_BYTES, flags);
455 if (*base == NULL)
456 return NULL;
457
458 return (void *)L1_CACHE_ALIGN((u64)*base);
459}
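/*
 * Hedged usage sketch, not part of the commit: callers keep both the
 * aligned pointer (which they use) and the base pointer (the only thing
 * that may be kfree()'d), exactly the pattern used below for
 * part->remote_openclose_args.  example_use_aligned_buffer() is an
 * illustrative name.
 */
static int
example_use_aligned_buffer(void)
{
	void *buf_base;
	void *buf;

	buf = xpc_kzalloc_cacheline_aligned(1024, GFP_KERNEL, &buf_base);
	if (buf == NULL)
		return -ENOMEM;

	/* ... use the L1-cacheline-aligned buf ... */

	kfree(buf_base);	/* free the base, never the aligned pointer */
	return 0;
}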
460
461/*
462 * Setup the channel structures necessary to support XPartition Communication
463 * between the specified remote partition and the local one.
464 */
465static enum xp_retval
466xpc_setup_ch_structures(struct xpc_partition *part)
405{ 467{
468 enum xp_retval ret;
469 int ch_number;
470 struct xpc_channel *ch;
471 short partid = XPC_PARTID(part);
472
473 /*
474 * Allocate all of the channel structures as a contiguous chunk of
475 * memory.
476 */
406 DBUG_ON(part->channels != NULL); 477 DBUG_ON(part->channels != NULL);
478 part->channels = kzalloc(sizeof(struct xpc_channel) * XPC_MAX_NCHANNELS,
479 GFP_KERNEL);
480 if (part->channels == NULL) {
481 dev_err(xpc_chan, "can't get memory for channels\n");
482 return xpNoMemory;
483 }
407 484
408 dev_dbg(xpc_chan, "activating partition %d\n", XPC_PARTID(part)); 485 /* allocate the remote open and close args */
409 486
410 if (xpc_setup_infrastructure(part) != xpSuccess) 487 part->remote_openclose_args =
411 return; 488 xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE,
489 GFP_KERNEL, &part->
490 remote_openclose_args_base);
491 if (part->remote_openclose_args == NULL) {
492 dev_err(xpc_chan, "can't get memory for remote connect args\n");
493 ret = xpNoMemory;
494 goto out_1;
495 }
496
497 part->chctl.all_flags = 0;
498 spin_lock_init(&part->chctl_lock);
499
500 atomic_set(&part->channel_mgr_requests, 1);
501 init_waitqueue_head(&part->channel_mgr_wq);
502
503 part->nchannels = XPC_MAX_NCHANNELS;
504
505 atomic_set(&part->nchannels_active, 0);
506 atomic_set(&part->nchannels_engaged, 0);
507
508 for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
509 ch = &part->channels[ch_number];
510
511 ch->partid = partid;
512 ch->number = ch_number;
513 ch->flags = XPC_C_DISCONNECTED;
514
515 atomic_set(&ch->kthreads_assigned, 0);
516 atomic_set(&ch->kthreads_idle, 0);
517 atomic_set(&ch->kthreads_active, 0);
518
519 atomic_set(&ch->references, 0);
520 atomic_set(&ch->n_to_notify, 0);
521
522 spin_lock_init(&ch->lock);
523 init_completion(&ch->wdisconnect_wait);
524
525 atomic_set(&ch->n_on_msg_allocate_wq, 0);
526 init_waitqueue_head(&ch->msg_allocate_wq);
527 init_waitqueue_head(&ch->idle_wq);
528 }
529
530 ret = xpc_setup_ch_structures_sn(part);
531 if (ret != xpSuccess)
532 goto out_2;
533
534 /*
535 * With the setting of the partition setup_state to XPC_P_SS_SETUP,
536 * we're declaring that this partition is ready to go.
537 */
538 part->setup_state = XPC_P_SS_SETUP;
539
540 return xpSuccess;
541
542 /* setup of ch structures failed */
543out_2:
544 kfree(part->remote_openclose_args_base);
545 part->remote_openclose_args = NULL;
546out_1:
547 kfree(part->channels);
548 part->channels = NULL;
549 return ret;
550}
551
552/*
553 * Teardown the channel structures necessary to support XPartition Communication
554 * between the specified remote partition and the local one.
555 */
556static void
557xpc_teardown_ch_structures(struct xpc_partition *part)
558{
559 DBUG_ON(atomic_read(&part->nchannels_engaged) != 0);
560 DBUG_ON(atomic_read(&part->nchannels_active) != 0);
412 561
413 /* 562 /*
414 * The kthread that XPC HB called us with will become the 563 * Make this partition inaccessible to local processes by marking it
415 * channel manager for this partition. It will not return 564 * as no longer setup. Then wait before proceeding with the teardown
416 * back to XPC HB until the partition's XPC infrastructure 565 * until all existing references cease.
417 * has been dismantled.
418 */ 566 */
567 DBUG_ON(part->setup_state != XPC_P_SS_SETUP);
568 part->setup_state = XPC_P_SS_WTEARDOWN;
419 569
420 (void)xpc_part_ref(part); /* this will always succeed */ 570 wait_event(part->teardown_wq, (atomic_read(&part->references) == 0));
421 571
422 if (xpc_make_first_contact(part) == xpSuccess) 572 /* now we can begin tearing down the infrastructure */
423 xpc_channel_mgr(part);
424 573
425 xpc_part_deref(part); 574 xpc_teardown_ch_structures_sn(part);
426 575
427 xpc_teardown_infrastructure(part); 576 kfree(part->remote_openclose_args_base);
577 part->remote_openclose_args = NULL;
578 kfree(part->channels);
579 part->channels = NULL;
580
581 part->setup_state = XPC_P_SS_TORNDOWN;
428} 582}
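/*
 * Hedged sketch, not from this commit, of the reference barrier the
 * teardown above waits on.  The bodies are inferred from how
 * xpc_part_ref()/xpc_part_deref() are used in this file and in
 * xpc_initiate_received(); the example_* names are illustrative and the
 * real definitions live elsewhere in the driver.
 */
static void
example_part_deref(struct xpc_partition *part)
{
	/* last reference gone: lets xpc_teardown_ch_structures() proceed */
	if (atomic_dec_return(&part->references) == 0)
		wake_up(&part->teardown_wq);
}

static int
example_part_ref(struct xpc_partition *part)
{
	atomic_inc(&part->references);

	if (part->setup_state == XPC_P_SS_SETUP)
		return 1;	/* safe to use this partition's structures */

	example_part_deref(part);	/* being torn down; caller must back off */
	return 0;
}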
429 583
584/*
585 * When XPC HB determines that a partition has come up, it will create a new
586 * kthread and that kthread will call this function to attempt to set up the
587 * basic infrastructure used for Cross Partition Communication with the newly
588 * upped partition.
589 *
590 * The kthread that was created by XPC HB and which setup the XPC
591 * infrastructure will remain assigned to the partition becoming the channel
592 * manager for that partition until the partition is deactivating, at which
593 * time the kthread will teardown the XPC infrastructure and then exit.
594 */
430static int 595static int
431xpc_activating(void *__partid) 596xpc_activating(void *__partid)
432{ 597{
@@ -434,64 +599,47 @@ xpc_activating(void *__partid)
434 struct xpc_partition *part = &xpc_partitions[partid]; 599 struct xpc_partition *part = &xpc_partitions[partid];
435 unsigned long irq_flags; 600 unsigned long irq_flags;
436 601
437 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS); 602 DBUG_ON(partid < 0 || partid >= xp_max_npartitions);
438 603
439 spin_lock_irqsave(&part->act_lock, irq_flags); 604 spin_lock_irqsave(&part->act_lock, irq_flags);
440 605
441 if (part->act_state == XPC_P_DEACTIVATING) { 606 if (part->act_state == XPC_P_AS_DEACTIVATING) {
442 part->act_state = XPC_P_INACTIVE; 607 part->act_state = XPC_P_AS_INACTIVE;
443 spin_unlock_irqrestore(&part->act_lock, irq_flags); 608 spin_unlock_irqrestore(&part->act_lock, irq_flags);
444 part->remote_rp_pa = 0; 609 part->remote_rp_pa = 0;
445 return 0; 610 return 0;
446 } 611 }
447 612
448 /* indicate the thread is activating */ 613 /* indicate the thread is activating */
449 DBUG_ON(part->act_state != XPC_P_ACTIVATION_REQ); 614 DBUG_ON(part->act_state != XPC_P_AS_ACTIVATION_REQ);
450 part->act_state = XPC_P_ACTIVATING; 615 part->act_state = XPC_P_AS_ACTIVATING;
451 616
452 XPC_SET_REASON(part, 0, 0); 617 XPC_SET_REASON(part, 0, 0);
453 spin_unlock_irqrestore(&part->act_lock, irq_flags); 618 spin_unlock_irqrestore(&part->act_lock, irq_flags);
454 619
455 dev_dbg(xpc_part, "bringing partition %d up\n", partid); 620 dev_dbg(xpc_part, "activating partition %d\n", partid);
456 621
457 /* 622 xpc_allow_hb(partid);
458 * Register the remote partition's AMOs with SAL so it can handle
459 * and cleanup errors within that address range should the remote
460 * partition go down. We don't unregister this range because it is
461 * difficult to tell when outstanding writes to the remote partition
462 * are finished and thus when it is safe to unregister. This should
463 * not result in wasted space in the SAL xp_addr_region table because
464 * we should get the same page for remote_amos_page_pa after module
465 * reloads and system reboots.
466 */
467 if (sn_register_xp_addr_region(part->remote_amos_page_pa,
468 PAGE_SIZE, 1) < 0) {
469 dev_warn(xpc_part, "xpc_partition_up(%d) failed to register "
470 "xp_addr region\n", partid);
471 623
472 spin_lock_irqsave(&part->act_lock, irq_flags); 624 if (xpc_setup_ch_structures(part) == xpSuccess) {
473 part->act_state = XPC_P_INACTIVE; 625 (void)xpc_part_ref(part); /* this will always succeed */
474 XPC_SET_REASON(part, xpPhysAddrRegFailed, __LINE__);
475 spin_unlock_irqrestore(&part->act_lock, irq_flags);
476 part->remote_rp_pa = 0;
477 return 0;
478 }
479 626
480 xpc_allow_hb(partid, xpc_vars); 627 if (xpc_make_first_contact(part) == xpSuccess) {
481 xpc_IPI_send_activated(part); 628 xpc_mark_partition_active(part);
629 xpc_channel_mgr(part);
630 /* won't return until partition is deactivating */
631 }
482 632
483 /* 633 xpc_part_deref(part);
484 * xpc_partition_up() holds this thread and marks this partition as 634 xpc_teardown_ch_structures(part);
485 * XPC_P_ACTIVE by calling xpc_hb_mark_active(). 635 }
486 */
487 (void)xpc_partition_up(part);
488 636
489 xpc_disallow_hb(partid, xpc_vars); 637 xpc_disallow_hb(partid);
490 xpc_mark_partition_inactive(part); 638 xpc_mark_partition_inactive(part);
491 639
492 if (part->reason == xpReactivating) { 640 if (part->reason == xpReactivating) {
493 /* interrupting ourselves results in activating partition */ 641 /* interrupting ourselves results in activating partition */
494 xpc_IPI_send_reactivate(part); 642 xpc_request_partition_reactivation(part);
495 } 643 }
496 644
497 return 0; 645 return 0;
@@ -506,9 +654,9 @@ xpc_activate_partition(struct xpc_partition *part)
506 654
507 spin_lock_irqsave(&part->act_lock, irq_flags); 655 spin_lock_irqsave(&part->act_lock, irq_flags);
508 656
509 DBUG_ON(part->act_state != XPC_P_INACTIVE); 657 DBUG_ON(part->act_state != XPC_P_AS_INACTIVE);
510 658
511 part->act_state = XPC_P_ACTIVATION_REQ; 659 part->act_state = XPC_P_AS_ACTIVATION_REQ;
512 XPC_SET_REASON(part, xpCloneKThread, __LINE__); 660 XPC_SET_REASON(part, xpCloneKThread, __LINE__);
513 661
514 spin_unlock_irqrestore(&part->act_lock, irq_flags); 662 spin_unlock_irqrestore(&part->act_lock, irq_flags);
@@ -517,62 +665,12 @@ xpc_activate_partition(struct xpc_partition *part)
517 partid); 665 partid);
518 if (IS_ERR(kthread)) { 666 if (IS_ERR(kthread)) {
519 spin_lock_irqsave(&part->act_lock, irq_flags); 667 spin_lock_irqsave(&part->act_lock, irq_flags);
520 part->act_state = XPC_P_INACTIVE; 668 part->act_state = XPC_P_AS_INACTIVE;
521 XPC_SET_REASON(part, xpCloneKThreadFailed, __LINE__); 669 XPC_SET_REASON(part, xpCloneKThreadFailed, __LINE__);
522 spin_unlock_irqrestore(&part->act_lock, irq_flags); 670 spin_unlock_irqrestore(&part->act_lock, irq_flags);
523 } 671 }
524} 672}
525 673
526/*
527 * Handle the receipt of a SGI_XPC_NOTIFY IRQ by seeing whether the specified
528 * partition actually sent it. Since SGI_XPC_NOTIFY IRQs may be shared by more
529 * than one partition, we use an AMO_t structure per partition to indicate
530 * whether a partition has sent an IPI or not. If it has, then wake up the
531 * associated kthread to handle it.
532 *
533 * All SGI_XPC_NOTIFY IRQs received by XPC are the result of IPIs sent by XPC
534 * running on other partitions.
535 *
536 * Noteworthy Arguments:
537 *
538 * irq - Interrupt ReQuest number. NOT USED.
539 *
540 * dev_id - partid of IPI's potential sender.
541 */
542irqreturn_t
543xpc_notify_IRQ_handler(int irq, void *dev_id)
544{
545 short partid = (short)(u64)dev_id;
546 struct xpc_partition *part = &xpc_partitions[partid];
547
548 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
549
550 if (xpc_part_ref(part)) {
551 xpc_check_for_channel_activity(part);
552
553 xpc_part_deref(part);
554 }
555 return IRQ_HANDLED;
556}
557
558/*
559 * Check to see if xpc_notify_IRQ_handler() dropped any IPIs on the floor
560 * because the write to their associated IPI amo completed after the IRQ/IPI
561 * was received.
562 */
563void
564xpc_dropped_IPI_check(struct xpc_partition *part)
565{
566 if (xpc_part_ref(part)) {
567 xpc_check_for_channel_activity(part);
568
569 part->dropped_IPI_timer.expires = jiffies +
570 XPC_P_DROPPED_IPI_WAIT;
571 add_timer(&part->dropped_IPI_timer);
572 xpc_part_deref(part);
573 }
574}
575
576void 674void
577xpc_activate_kthreads(struct xpc_channel *ch, int needed) 675xpc_activate_kthreads(struct xpc_channel *ch, int needed)
578{ 676{
@@ -617,9 +715,9 @@ xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
617 do { 715 do {
618 /* deliver messages to their intended recipients */ 716 /* deliver messages to their intended recipients */
619 717
620 while (ch->w_local_GP.get < ch->w_remote_GP.put && 718 while (xpc_n_of_deliverable_payloads(ch) > 0 &&
621 !(ch->flags & XPC_C_DISCONNECTING)) { 719 !(ch->flags & XPC_C_DISCONNECTING)) {
622 xpc_deliver_msg(ch); 720 xpc_deliver_payload(ch);
623 } 721 }
624 722
625 if (atomic_inc_return(&ch->kthreads_idle) > 723 if (atomic_inc_return(&ch->kthreads_idle) >
@@ -633,7 +731,7 @@ xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
633 "wait_event_interruptible_exclusive()\n"); 731 "wait_event_interruptible_exclusive()\n");
634 732
635 (void)wait_event_interruptible_exclusive(ch->idle_wq, 733 (void)wait_event_interruptible_exclusive(ch->idle_wq,
636 (ch->w_local_GP.get < ch->w_remote_GP.put || 734 (xpc_n_of_deliverable_payloads(ch) > 0 ||
637 (ch->flags & XPC_C_DISCONNECTING))); 735 (ch->flags & XPC_C_DISCONNECTING)));
638 736
639 atomic_dec(&ch->kthreads_idle); 737 atomic_dec(&ch->kthreads_idle);
@@ -678,7 +776,7 @@ xpc_kthread_start(void *args)
678 * additional kthreads to help deliver them. We only 776 * additional kthreads to help deliver them. We only
679 * need one less than total #of messages to deliver. 777 * need one less than total #of messages to deliver.
680 */ 778 */
681 n_needed = ch->w_remote_GP.put - ch->w_local_GP.get - 1; 779 n_needed = xpc_n_of_deliverable_payloads(ch) - 1;
682 if (n_needed > 0 && !(ch->flags & XPC_C_DISCONNECTING)) 780 if (n_needed > 0 && !(ch->flags & XPC_C_DISCONNECTING))
683 xpc_activate_kthreads(ch, n_needed); 781 xpc_activate_kthreads(ch, n_needed);
684 782
@@ -704,11 +802,9 @@ xpc_kthread_start(void *args)
704 } 802 }
705 spin_unlock_irqrestore(&ch->lock, irq_flags); 803 spin_unlock_irqrestore(&ch->lock, irq_flags);
706 804
707 if (atomic_dec_return(&ch->kthreads_assigned) == 0) { 805 if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
708 if (atomic_dec_return(&part->nchannels_engaged) == 0) { 806 atomic_dec_return(&part->nchannels_engaged) == 0) {
709 xpc_mark_partition_disengaged(part); 807 xpc_indicate_partition_disengaged(part);
710 xpc_IPI_send_disengage(part);
711 }
712 } 808 }
713 809
714 xpc_msgqueue_deref(ch); 810 xpc_msgqueue_deref(ch);
@@ -759,9 +855,9 @@ xpc_create_kthreads(struct xpc_channel *ch, int needed,
759 } else if (ch->flags & XPC_C_DISCONNECTING) { 855 } else if (ch->flags & XPC_C_DISCONNECTING) {
760 break; 856 break;
761 857
762 } else if (atomic_inc_return(&ch->kthreads_assigned) == 1) { 858 } else if (atomic_inc_return(&ch->kthreads_assigned) == 1 &&
763 if (atomic_inc_return(&part->nchannels_engaged) == 1) 859 atomic_inc_return(&part->nchannels_engaged) == 1) {
764 xpc_mark_partition_engaged(part); 860 xpc_indicate_partition_engaged(part);
765 } 861 }
766 (void)xpc_part_ref(part); 862 (void)xpc_part_ref(part);
767 xpc_msgqueue_ref(ch); 863 xpc_msgqueue_ref(ch);
@@ -783,8 +879,7 @@ xpc_create_kthreads(struct xpc_channel *ch, int needed,
783 879
784 if (atomic_dec_return(&ch->kthreads_assigned) == 0 && 880 if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
785 atomic_dec_return(&part->nchannels_engaged) == 0) { 881 atomic_dec_return(&part->nchannels_engaged) == 0) {
786 xpc_mark_partition_disengaged(part); 882 xpc_indicate_partition_disengaged(part);
787 xpc_IPI_send_disengage(part);
788 } 883 }
789 xpc_msgqueue_deref(ch); 884 xpc_msgqueue_deref(ch);
790 xpc_part_deref(part); 885 xpc_part_deref(part);
@@ -816,7 +911,7 @@ xpc_disconnect_wait(int ch_number)
816 int wakeup_channel_mgr; 911 int wakeup_channel_mgr;
817 912
818 /* now wait for all callouts to the caller's function to cease */ 913 /* now wait for all callouts to the caller's function to cease */
819 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) { 914 for (partid = 0; partid < xp_max_npartitions; partid++) {
820 part = &xpc_partitions[partid]; 915 part = &xpc_partitions[partid];
821 916
822 if (!xpc_part_ref(part)) 917 if (!xpc_part_ref(part))
@@ -835,16 +930,15 @@ xpc_disconnect_wait(int ch_number)
835 DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED)); 930 DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
836 wakeup_channel_mgr = 0; 931 wakeup_channel_mgr = 0;
837 932
838 if (ch->delayed_IPI_flags) { 933 if (ch->delayed_chctl_flags) {
839 if (part->act_state != XPC_P_DEACTIVATING) { 934 if (part->act_state != XPC_P_AS_DEACTIVATING) {
840 spin_lock(&part->IPI_lock); 935 spin_lock(&part->chctl_lock);
841 XPC_SET_IPI_FLAGS(part->local_IPI_amo, 936 part->chctl.flags[ch->number] |=
842 ch->number, 937 ch->delayed_chctl_flags;
843 ch->delayed_IPI_flags); 938 spin_unlock(&part->chctl_lock);
844 spin_unlock(&part->IPI_lock);
845 wakeup_channel_mgr = 1; 939 wakeup_channel_mgr = 1;
846 } 940 }
847 ch->delayed_IPI_flags = 0; 941 ch->delayed_chctl_flags = 0;
848 } 942 }
849 943
850 ch->flags &= ~XPC_C_WDISCONNECT; 944 ch->flags &= ~XPC_C_WDISCONNECT;
@@ -857,13 +951,63 @@ xpc_disconnect_wait(int ch_number)
857 } 951 }
858} 952}
859 953
954static int
955xpc_setup_partitions(void)
956{
957 short partid;
958 struct xpc_partition *part;
959
960 xpc_partitions = kzalloc(sizeof(struct xpc_partition) *
961 xp_max_npartitions, GFP_KERNEL);
962 if (xpc_partitions == NULL) {
963 dev_err(xpc_part, "can't get memory for partition structure\n");
964 return -ENOMEM;
965 }
966
967 /*
968 * The first few fields of each entry of xpc_partitions[] need to
969 * be initialized now so that calls to xpc_connect() and
970 * xpc_disconnect() can be made prior to the activation of any remote
971 * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
972 * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
973 * PARTITION HAS BEEN ACTIVATED.
974 */
975 for (partid = 0; partid < xp_max_npartitions; partid++) {
976 part = &xpc_partitions[partid];
977
978 DBUG_ON((u64)part != L1_CACHE_ALIGN((u64)part));
979
980 part->activate_IRQ_rcvd = 0;
981 spin_lock_init(&part->act_lock);
982 part->act_state = XPC_P_AS_INACTIVE;
983 XPC_SET_REASON(part, 0, 0);
984
985 init_timer(&part->disengage_timer);
986 part->disengage_timer.function =
987 xpc_timeout_partition_disengage;
988 part->disengage_timer.data = (unsigned long)part;
989
990 part->setup_state = XPC_P_SS_UNSET;
991 init_waitqueue_head(&part->teardown_wq);
992 atomic_set(&part->references, 0);
993 }
994
995 return xpc_setup_partitions_sn();
996}
997
998static void
999xpc_teardown_partitions(void)
1000{
1001 kfree(xpc_partitions);
1002}
1003
860static void 1004static void
861xpc_do_exit(enum xp_retval reason) 1005xpc_do_exit(enum xp_retval reason)
862{ 1006{
863 short partid; 1007 short partid;
864 int active_part_count, printed_waiting_msg = 0; 1008 int active_part_count, printed_waiting_msg = 0;
865 struct xpc_partition *part; 1009 struct xpc_partition *part;
866 unsigned long printmsg_time, disengage_request_timeout = 0; 1010 unsigned long printmsg_time, disengage_timeout = 0;
867 1011
868 /* a 'rmmod XPC' and a 'reboot' cannot both end up here together */ 1012 /* a 'rmmod XPC' and a 'reboot' cannot both end up here together */
869 DBUG_ON(xpc_exiting == 1); 1013 DBUG_ON(xpc_exiting == 1);
@@ -874,10 +1018,7 @@ xpc_do_exit(enum xp_retval reason)
874 * the heartbeat checker thread in case it's sleeping. 1018 * the heartbeat checker thread in case it's sleeping.
875 */ 1019 */
876 xpc_exiting = 1; 1020 xpc_exiting = 1;
877 wake_up_interruptible(&xpc_act_IRQ_wq); 1021 wake_up_interruptible(&xpc_activate_IRQ_wq);
878
879 /* ignore all incoming interrupts */
880 free_irq(SGI_XPC_ACTIVATE, NULL);
881 1022
882 /* wait for the discovery thread to exit */ 1023 /* wait for the discovery thread to exit */
883 wait_for_completion(&xpc_discovery_exited); 1024 wait_for_completion(&xpc_discovery_exited);
@@ -890,17 +1031,17 @@ xpc_do_exit(enum xp_retval reason)
890 1031
891 /* wait for all partitions to become inactive */ 1032 /* wait for all partitions to become inactive */
892 1033
893 printmsg_time = jiffies + (XPC_DISENGAGE_PRINTMSG_INTERVAL * HZ); 1034 printmsg_time = jiffies + (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
894 xpc_disengage_request_timedout = 0; 1035 xpc_disengage_timedout = 0;
895 1036
896 do { 1037 do {
897 active_part_count = 0; 1038 active_part_count = 0;
898 1039
899 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) { 1040 for (partid = 0; partid < xp_max_npartitions; partid++) {
900 part = &xpc_partitions[partid]; 1041 part = &xpc_partitions[partid];
901 1042
902 if (xpc_partition_disengaged(part) && 1043 if (xpc_partition_disengaged(part) &&
903 part->act_state == XPC_P_INACTIVE) { 1044 part->act_state == XPC_P_AS_INACTIVE) {
904 continue; 1045 continue;
905 } 1046 }
906 1047
@@ -908,36 +1049,32 @@ xpc_do_exit(enum xp_retval reason)
908 1049
909 XPC_DEACTIVATE_PARTITION(part, reason); 1050 XPC_DEACTIVATE_PARTITION(part, reason);
910 1051
911 if (part->disengage_request_timeout > 1052 if (part->disengage_timeout > disengage_timeout)
912 disengage_request_timeout) { 1053 disengage_timeout = part->disengage_timeout;
913 disengage_request_timeout =
914 part->disengage_request_timeout;
915 }
916 } 1054 }
917 1055
918 if (xpc_partition_engaged(-1UL)) { 1056 if (xpc_any_partition_engaged()) {
919 if (time_after(jiffies, printmsg_time)) { 1057 if (time_is_before_jiffies(printmsg_time)) {
920 dev_info(xpc_part, "waiting for remote " 1058 dev_info(xpc_part, "waiting for remote "
921 "partitions to disengage, timeout in " 1059 "partitions to deactivate, timeout in "
922 "%ld seconds\n", 1060 "%ld seconds\n", (disengage_timeout -
923 (disengage_request_timeout - jiffies) 1061 jiffies) / HZ);
924 / HZ);
925 printmsg_time = jiffies + 1062 printmsg_time = jiffies +
926 (XPC_DISENGAGE_PRINTMSG_INTERVAL * HZ); 1063 (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
927 printed_waiting_msg = 1; 1064 printed_waiting_msg = 1;
928 } 1065 }
929 1066
930 } else if (active_part_count > 0) { 1067 } else if (active_part_count > 0) {
931 if (printed_waiting_msg) { 1068 if (printed_waiting_msg) {
932 dev_info(xpc_part, "waiting for local partition" 1069 dev_info(xpc_part, "waiting for local partition"
933 " to disengage\n"); 1070 " to deactivate\n");
934 printed_waiting_msg = 0; 1071 printed_waiting_msg = 0;
935 } 1072 }
936 1073
937 } else { 1074 } else {
938 if (!xpc_disengage_request_timedout) { 1075 if (!xpc_disengage_timedout) {
939 dev_info(xpc_part, "all partitions have " 1076 dev_info(xpc_part, "all partitions have "
940 "disengaged\n"); 1077 "deactivated\n");
941 } 1078 }
942 break; 1079 break;
943 } 1080 }
@@ -947,33 +1084,28 @@ xpc_do_exit(enum xp_retval reason)
947 1084
948 } while (1); 1085 } while (1);
949 1086
950 DBUG_ON(xpc_partition_engaged(-1UL)); 1087 DBUG_ON(xpc_any_partition_engaged());
1088 DBUG_ON(xpc_any_hbs_allowed() != 0);
951 1089
952 /* indicate to others that our reserved page is uninitialized */ 1090 xpc_teardown_rsvd_page();
953 xpc_rsvd_page->vars_pa = 0;
954
955 /* now it's time to eliminate our heartbeat */
956 del_timer_sync(&xpc_hb_timer);
957 DBUG_ON(xpc_vars->heartbeating_to_mask != 0);
958 1091
959 if (reason == xpUnloading) { 1092 if (reason == xpUnloading) {
960 /* take ourselves off of the reboot_notifier_list */
961 (void)unregister_reboot_notifier(&xpc_reboot_notifier);
962
963 /* take ourselves off of the die_notifier list */
964 (void)unregister_die_notifier(&xpc_die_notifier); 1093 (void)unregister_die_notifier(&xpc_die_notifier);
1094 (void)unregister_reboot_notifier(&xpc_reboot_notifier);
965 } 1095 }
966 1096
967 /* close down protections for IPI operations */
968 xpc_restrict_IPI_ops();
969
970 /* clear the interface to XPC's functions */ 1097 /* clear the interface to XPC's functions */
971 xpc_clear_interface(); 1098 xpc_clear_interface();
972 1099
973 if (xpc_sysctl) 1100 if (xpc_sysctl)
974 unregister_sysctl_table(xpc_sysctl); 1101 unregister_sysctl_table(xpc_sysctl);
975 1102
976 kfree(xpc_remote_copy_buffer_base); 1103 xpc_teardown_partitions();
1104
1105 if (is_shub())
1106 xpc_exit_sn2();
1107 else
1108 xpc_exit_uv();
977} 1109}
978 1110
979/* 1111/*
@@ -1003,60 +1135,57 @@ xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
1003} 1135}
1004 1136
1005/* 1137/*
1006 * Notify other partitions to disengage from all references to our memory. 1138 * Notify other partitions to deactivate from us by first disengaging from all
1139 * references to our memory.
1007 */ 1140 */
1008static void 1141static void
1009xpc_die_disengage(void) 1142xpc_die_deactivate(void)
1010{ 1143{
1011 struct xpc_partition *part; 1144 struct xpc_partition *part;
1012 short partid; 1145 short partid;
1013 unsigned long engaged; 1146 int any_engaged;
1014 long time, printmsg_time, disengage_request_timeout; 1147 long keep_waiting;
1148 long wait_to_print;
1015 1149
1016 /* keep xpc_hb_checker thread from doing anything (just in case) */ 1150 /* keep xpc_hb_checker thread from doing anything (just in case) */
1017 xpc_exiting = 1; 1151 xpc_exiting = 1;
1018 1152
1019 xpc_vars->heartbeating_to_mask = 0; /* indicate we're deactivated */ 1153 xpc_disallow_all_hbs(); /*indicate we're deactivated */
1020 1154
1021 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) { 1155 for (partid = 0; partid < xp_max_npartitions; partid++) {
1022 part = &xpc_partitions[partid]; 1156 part = &xpc_partitions[partid];
1023 1157
1024 if (!XPC_SUPPORTS_DISENGAGE_REQUEST(part-> 1158 if (xpc_partition_engaged(partid) ||
1025 remote_vars_version)) { 1159 part->act_state != XPC_P_AS_INACTIVE) {
1026 1160 xpc_request_partition_deactivation(part);
1027 /* just in case it was left set by an earlier XPC */ 1161 xpc_indicate_partition_disengaged(part);
1028 xpc_clear_partition_engaged(1UL << partid);
1029 continue;
1030 }
1031
1032 if (xpc_partition_engaged(1UL << partid) ||
1033 part->act_state != XPC_P_INACTIVE) {
1034 xpc_request_partition_disengage(part);
1035 xpc_mark_partition_disengaged(part);
1036 xpc_IPI_send_disengage(part);
1037 } 1162 }
1038 } 1163 }
1039 1164
1040 time = rtc_time(); 1165 /*
1041 printmsg_time = time + 1166 * Though we requested that all other partitions deactivate from us,
1042 (XPC_DISENGAGE_PRINTMSG_INTERVAL * sn_rtc_cycles_per_second); 1167 * we only wait until they've all disengaged or we've reached the
1043 disengage_request_timeout = time + 1168 * defined timelimit.
1044 (xpc_disengage_request_timelimit * sn_rtc_cycles_per_second); 1169 *
1045 1170 * Given that one iteration through the following while-loop takes
1046 /* wait for all other partitions to disengage from us */ 1171 * approximately 200 microseconds, calculate the #of loops to take
1172 * before bailing and the #of loops before printing a waiting message.
1173 */
1174 keep_waiting = xpc_disengage_timelimit * 1000 * 5;
1175 wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL * 1000 * 5;
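	/*
	 * Worked out (an inference, not text from the commit): at ~200 us per
	 * pass the loop below makes about 1 s / 200 us = 5000 passes per
	 * second, which is where the "* 1000 * 5" scaling of both limits
	 * above comes from.
	 */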
1047 1176
1048 while (1) { 1177 while (1) {
1049 engaged = xpc_partition_engaged(-1UL); 1178 any_engaged = xpc_any_partition_engaged();
1050 if (!engaged) { 1179 if (!any_engaged) {
1051 dev_info(xpc_part, "all partitions have disengaged\n"); 1180 dev_info(xpc_part, "all partitions have deactivated\n");
1052 break; 1181 break;
1053 } 1182 }
1054 1183
1055 time = rtc_time(); 1184 if (!keep_waiting--) {
1056 if (time >= disengage_request_timeout) { 1185 for (partid = 0; partid < xp_max_npartitions;
1057 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) { 1186 partid++) {
1058 if (engaged & (1UL << partid)) { 1187 if (xpc_partition_engaged(partid)) {
1059 dev_info(xpc_part, "disengage from " 1188 dev_info(xpc_part, "deactivate from "
1060 "remote partition %d timed " 1189 "remote partition %d timed "
1061 "out\n", partid); 1190 "out\n", partid);
1062 } 1191 }
@@ -1064,15 +1193,15 @@ xpc_die_disengage(void)
1064 break; 1193 break;
1065 } 1194 }
1066 1195
1067 if (time >= printmsg_time) { 1196 if (!wait_to_print--) {
1068 dev_info(xpc_part, "waiting for remote partitions to " 1197 dev_info(xpc_part, "waiting for remote partitions to "
1069 "disengage, timeout in %ld seconds\n", 1198 "deactivate, timeout in %ld seconds\n",
1070 (disengage_request_timeout - time) / 1199 keep_waiting / (1000 * 5));
1071 sn_rtc_cycles_per_second); 1200 wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL *
1072 printmsg_time = time + 1201 1000 * 5;
1073 (XPC_DISENGAGE_PRINTMSG_INTERVAL *
1074 sn_rtc_cycles_per_second);
1075 } 1202 }
1203
1204 udelay(200);
1076 } 1205 }
1077} 1206}
1078 1207
@@ -1087,10 +1216,11 @@ xpc_die_disengage(void)
1087static int 1216static int
1088xpc_system_die(struct notifier_block *nb, unsigned long event, void *unused) 1217xpc_system_die(struct notifier_block *nb, unsigned long event, void *unused)
1089{ 1218{
1219#ifdef CONFIG_IA64 /* !!! temporary kludge */
1090 switch (event) { 1220 switch (event) {
1091 case DIE_MACHINE_RESTART: 1221 case DIE_MACHINE_RESTART:
1092 case DIE_MACHINE_HALT: 1222 case DIE_MACHINE_HALT:
1093 xpc_die_disengage(); 1223 xpc_die_deactivate();
1094 break; 1224 break;
1095 1225
1096 case DIE_KDEBUG_ENTER: 1226 case DIE_KDEBUG_ENTER:
@@ -1101,8 +1231,7 @@ xpc_system_die(struct notifier_block *nb, unsigned long event, void *unused)
1101 /* fall through */ 1231 /* fall through */
1102 case DIE_MCA_MONARCH_ENTER: 1232 case DIE_MCA_MONARCH_ENTER:
1103 case DIE_INIT_MONARCH_ENTER: 1233 case DIE_INIT_MONARCH_ENTER:
1104 xpc_vars->heartbeat++; 1234 xpc_offline_heartbeat();
1105 xpc_vars->heartbeat_offline = 1;
1106 break; 1235 break;
1107 1236
1108 case DIE_KDEBUG_LEAVE: 1237 case DIE_KDEBUG_LEAVE:
@@ -1113,10 +1242,12 @@ xpc_system_die(struct notifier_block *nb, unsigned long event, void *unused)
1113 /* fall through */ 1242 /* fall through */
1114 case DIE_MCA_MONARCH_LEAVE: 1243 case DIE_MCA_MONARCH_LEAVE:
1115 case DIE_INIT_MONARCH_LEAVE: 1244 case DIE_INIT_MONARCH_LEAVE:
1116 xpc_vars->heartbeat++; 1245 xpc_online_heartbeat();
1117 xpc_vars->heartbeat_offline = 0;
1118 break; 1246 break;
1119 } 1247 }
1248#else
1249 xpc_die_deactivate();
1250#endif
1120 1251
1121 return NOTIFY_DONE; 1252 return NOTIFY_DONE;
1122} 1253}
@@ -1125,105 +1256,52 @@ int __init
1125xpc_init(void) 1256xpc_init(void)
1126{ 1257{
1127 int ret; 1258 int ret;
1128 short partid;
1129 struct xpc_partition *part;
1130 struct task_struct *kthread; 1259 struct task_struct *kthread;
1131 size_t buf_size;
1132
1133 if (!ia64_platform_is("sn2"))
1134 return -ENODEV;
1135
1136 buf_size = max(XPC_RP_VARS_SIZE,
1137 XPC_RP_HEADER_SIZE + XP_NASID_MASK_BYTES);
1138 xpc_remote_copy_buffer = xpc_kmalloc_cacheline_aligned(buf_size,
1139 GFP_KERNEL,
1140 &xpc_remote_copy_buffer_base);
1141 if (xpc_remote_copy_buffer == NULL)
1142 return -ENOMEM;
1143 1260
1144 snprintf(xpc_part->bus_id, BUS_ID_SIZE, "part"); 1261 snprintf(xpc_part->bus_id, BUS_ID_SIZE, "part");
1145 snprintf(xpc_chan->bus_id, BUS_ID_SIZE, "chan"); 1262 snprintf(xpc_chan->bus_id, BUS_ID_SIZE, "chan");
1146 1263
1147 xpc_sysctl = register_sysctl_table(xpc_sys_dir); 1264 if (is_shub()) {
1148 1265 /*
1149 /* 1266 * The ia64-sn2 architecture supports at most 64 partitions.
1150 * The first few fields of each entry of xpc_partitions[] need to 1267 * And the inability to unregister remote amos restricts us
1151 * be initialized now so that calls to xpc_connect() and 1268 * further to only support exactly 64 partitions on this
1152 * xpc_disconnect() can be made prior to the activation of any remote 1269 * architecture, no less.
1153 * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE 1270 */
1154 * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING 1271 if (xp_max_npartitions != 64) {
1155 * PARTITION HAS BEEN ACTIVATED. 1272 dev_err(xpc_part, "max #of partitions not set to 64\n");
1156 */ 1273 ret = -EINVAL;
1157 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) { 1274 } else {
1158 part = &xpc_partitions[partid]; 1275 ret = xpc_init_sn2();
1159 1276 }
1160 DBUG_ON((u64)part != L1_CACHE_ALIGN((u64)part));
1161
1162 part->act_IRQ_rcvd = 0;
1163 spin_lock_init(&part->act_lock);
1164 part->act_state = XPC_P_INACTIVE;
1165 XPC_SET_REASON(part, 0, 0);
1166 1277
1167 init_timer(&part->disengage_request_timer); 1278 } else if (is_uv()) {
1168 part->disengage_request_timer.function = 1279 ret = xpc_init_uv();
1169 xpc_timeout_partition_disengage_request;
1170 part->disengage_request_timer.data = (unsigned long)part;
1171 1280
1172 part->setup_state = XPC_P_UNSET; 1281 } else {
1173 init_waitqueue_head(&part->teardown_wq); 1282 ret = -ENODEV;
1174 atomic_set(&part->references, 0);
1175 } 1283 }
1176 1284
1177 /* 1285 if (ret != 0)
1178 * Open up protections for IPI operations (and AMO operations on 1286 return ret;
1179 * Shub 1.1 systems).
1180 */
1181 xpc_allow_IPI_ops();
1182
1183 /*
1184 * Interrupts being processed will increment this atomic variable and
1185 * awaken the heartbeat thread which will process the interrupts.
1186 */
1187 atomic_set(&xpc_act_IRQ_rcvd, 0);
1188 1287
1189 /* 1288 ret = xpc_setup_partitions();
1190 * This is safe to do before the xpc_hb_checker thread has started
1191 * because the handler releases a wait queue. If an interrupt is
1192 * received before the thread is waiting, it will not go to sleep,
1193 * but rather immediately process the interrupt.
1194 */
1195 ret = request_irq(SGI_XPC_ACTIVATE, xpc_act_IRQ_handler, 0,
1196 "xpc hb", NULL);
1197 if (ret != 0) { 1289 if (ret != 0) {
1198 dev_err(xpc_part, "can't register ACTIVATE IRQ handler, " 1290 dev_err(xpc_part, "can't get memory for partition structure\n");
1199 "errno=%d\n", -ret); 1291 goto out_1;
1200
1201 xpc_restrict_IPI_ops();
1202
1203 if (xpc_sysctl)
1204 unregister_sysctl_table(xpc_sysctl);
1205
1206 kfree(xpc_remote_copy_buffer_base);
1207 return -EBUSY;
1208 } 1292 }
1209 1293
1294 xpc_sysctl = register_sysctl_table(xpc_sys_dir);
1295
1210 /* 1296 /*
1211 * Fill the partition reserved page with the information needed by 1297 * Fill the partition reserved page with the information needed by
1212 * other partitions to discover we are alive and establish initial 1298 * other partitions to discover we are alive and establish initial
1213 * communications. 1299 * communications.
1214 */ 1300 */
1215 xpc_rsvd_page = xpc_rsvd_page_init(); 1301 ret = xpc_setup_rsvd_page();
1216 if (xpc_rsvd_page == NULL) { 1302 if (ret != 0) {
1217 dev_err(xpc_part, "could not setup our reserved page\n"); 1303 dev_err(xpc_part, "can't setup our reserved page\n");
1218 1304 goto out_2;
1219 free_irq(SGI_XPC_ACTIVATE, NULL);
1220 xpc_restrict_IPI_ops();
1221
1222 if (xpc_sysctl)
1223 unregister_sysctl_table(xpc_sysctl);
1224
1225 kfree(xpc_remote_copy_buffer_base);
1226 return -EBUSY;
1227 } 1305 }
1228 1306
1229 /* add ourselves to the reboot_notifier_list */ 1307 /* add ourselves to the reboot_notifier_list */
@@ -1236,9 +1314,6 @@ xpc_init(void)
1236 if (ret != 0) 1314 if (ret != 0)
1237 dev_warn(xpc_part, "can't register die notifier\n"); 1315 dev_warn(xpc_part, "can't register die notifier\n");
1238 1316
1239 init_timer(&xpc_hb_timer);
1240 xpc_hb_timer.function = xpc_hb_beater;
1241
1242 /* 1317 /*
1243 * The real work-horse behind xpc. This processes incoming 1318 * The real work-horse behind xpc. This processes incoming
1244 * interrupts and monitors remote heartbeats. 1319 * interrupts and monitors remote heartbeats.
@@ -1246,25 +1321,8 @@ xpc_init(void)
1246 kthread = kthread_run(xpc_hb_checker, NULL, XPC_HB_CHECK_THREAD_NAME); 1321 kthread = kthread_run(xpc_hb_checker, NULL, XPC_HB_CHECK_THREAD_NAME);
1247 if (IS_ERR(kthread)) { 1322 if (IS_ERR(kthread)) {
1248 dev_err(xpc_part, "failed while forking hb check thread\n"); 1323 dev_err(xpc_part, "failed while forking hb check thread\n");
1249 1324 ret = -EBUSY;
1250 /* indicate to others that our reserved page is uninitialized */ 1325 goto out_3;
1251 xpc_rsvd_page->vars_pa = 0;
1252
1253 /* take ourselves off of the reboot_notifier_list */
1254 (void)unregister_reboot_notifier(&xpc_reboot_notifier);
1255
1256 /* take ourselves off of the die_notifier list */
1257 (void)unregister_die_notifier(&xpc_die_notifier);
1258
1259 del_timer_sync(&xpc_hb_timer);
1260 free_irq(SGI_XPC_ACTIVATE, NULL);
1261 xpc_restrict_IPI_ops();
1262
1263 if (xpc_sysctl)
1264 unregister_sysctl_table(xpc_sysctl);
1265
1266 kfree(xpc_remote_copy_buffer_base);
1267 return -EBUSY;
1268 } 1326 }
1269 1327
1270 /* 1328 /*
@@ -1286,11 +1344,28 @@ xpc_init(void)
1286 1344
1287 /* set the interface to point at XPC's functions */ 1345 /* set the interface to point at XPC's functions */
1288 xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect, 1346 xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
1289 xpc_initiate_allocate, xpc_initiate_send, 1347 xpc_initiate_send, xpc_initiate_send_notify,
1290 xpc_initiate_send_notify, xpc_initiate_received, 1348 xpc_initiate_received, xpc_initiate_partid_to_nasids);
1291 xpc_initiate_partid_to_nasids);
1292 1349
1293 return 0; 1350 return 0;
1351
1352 /* initialization was not successful */
1353out_3:
1354 xpc_teardown_rsvd_page();
1355
1356 (void)unregister_die_notifier(&xpc_die_notifier);
1357 (void)unregister_reboot_notifier(&xpc_reboot_notifier);
1358out_2:
1359 if (xpc_sysctl)
1360 unregister_sysctl_table(xpc_sysctl);
1361
1362 xpc_teardown_partitions();
1363out_1:
1364 if (is_shub())
1365 xpc_exit_sn2();
1366 else
1367 xpc_exit_uv();
1368 return ret;
1294} 1369}
1295 1370
1296module_init(xpc_init); 1371module_init(xpc_init);
@@ -1315,9 +1390,9 @@ module_param(xpc_hb_check_interval, int, 0);
1315MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between " 1390MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
1316 "heartbeat checks."); 1391 "heartbeat checks.");
1317 1392
1318module_param(xpc_disengage_request_timelimit, int, 0); 1393module_param(xpc_disengage_timelimit, int, 0);
1319MODULE_PARM_DESC(xpc_disengage_request_timelimit, "Number of seconds to wait " 1394MODULE_PARM_DESC(xpc_disengage_timelimit, "Number of seconds to wait "
1320 "for disengage request to complete."); 1395 "for disengage to complete.");
1321 1396
1322module_param(xpc_kdebug_ignore, int, 0); 1397module_param(xpc_kdebug_ignore, int, 0);
1323MODULE_PARM_DESC(xpc_kdebug_ignore, "Should lack of heartbeat be ignored by " 1398MODULE_PARM_DESC(xpc_kdebug_ignore, "Should lack of heartbeat be ignored by "
diff --git a/drivers/misc/sgi-xp/xpc_partition.c b/drivers/misc/sgi-xp/xpc_partition.c
index 7dd4b5812c42..6722f6fe4dc7 100644
--- a/drivers/misc/sgi-xp/xpc_partition.c
+++ b/drivers/misc/sgi-xp/xpc_partition.c
@@ -15,57 +15,22 @@
15 * 15 *
16 */ 16 */
17 17
18#include <linux/kernel.h> 18#include <linux/device.h>
19#include <linux/sysctl.h> 19#include <linux/hardirq.h>
20#include <linux/cache.h>
21#include <linux/mmzone.h>
22#include <linux/nodemask.h>
23#include <asm/uncached.h>
24#include <asm/sn/bte.h>
25#include <asm/sn/intr.h>
26#include <asm/sn/sn_sal.h>
27#include <asm/sn/nodepda.h>
28#include <asm/sn/addrs.h>
29#include "xpc.h" 20#include "xpc.h"
30 21
31/* XPC is exiting flag */ 22/* XPC is exiting flag */
32int xpc_exiting; 23int xpc_exiting;
33 24
34/* SH_IPI_ACCESS shub register value on startup */
35static u64 xpc_sh1_IPI_access;
36static u64 xpc_sh2_IPI_access0;
37static u64 xpc_sh2_IPI_access1;
38static u64 xpc_sh2_IPI_access2;
39static u64 xpc_sh2_IPI_access3;
40
41/* original protection values for each node */
42u64 xpc_prot_vec[MAX_NUMNODES];
43
44/* this partition's reserved page pointers */ 25/* this partition's reserved page pointers */
45struct xpc_rsvd_page *xpc_rsvd_page; 26struct xpc_rsvd_page *xpc_rsvd_page;
46static u64 *xpc_part_nasids; 27static unsigned long *xpc_part_nasids;
47static u64 *xpc_mach_nasids; 28unsigned long *xpc_mach_nasids;
48struct xpc_vars *xpc_vars;
49struct xpc_vars_part *xpc_vars_part;
50 29
51static int xp_nasid_mask_bytes; /* actual size in bytes of nasid mask */ 30static int xpc_nasid_mask_nbytes; /* #of bytes in nasid mask */
52static int xp_nasid_mask_words; /* actual size in words of nasid mask */ 31int xpc_nasid_mask_nlongs; /* #of longs in nasid mask */
53
54/*
55 * For performance reasons, each entry of xpc_partitions[] is cacheline
56 * aligned. And xpc_partitions[] is padded with an additional entry at the
57 * end so that the last legitimate entry doesn't share its cacheline with
58 * another variable.
59 */
60struct xpc_partition xpc_partitions[XP_MAX_PARTITIONS + 1];
61 32
62/* 33struct xpc_partition *xpc_partitions;
63 * Generic buffer used to store a local copy of portions of a remote
64 * partition's reserved page (either its header and part_nasids mask,
65 * or its vars).
66 */
67char *xpc_remote_copy_buffer;
68void *xpc_remote_copy_buffer_base;
69 34
70/* 35/*
71 * Guarantee that the kmalloc'd memory is cacheline aligned. 36 * Guarantee that the kmalloc'd memory is cacheline aligned.
@@ -95,56 +60,59 @@ xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
95 * Given a nasid, get the physical address of the partition's reserved page 60 * Given a nasid, get the physical address of the partition's reserved page
96 * for that nasid. This function returns 0 on any error. 61 * for that nasid. This function returns 0 on any error.
97 */ 62 */
98static u64 63static unsigned long
99xpc_get_rsvd_page_pa(int nasid) 64xpc_get_rsvd_page_pa(int nasid)
100{ 65{
101 bte_result_t bte_res; 66 enum xp_retval ret;
102 s64 status;
103 u64 cookie = 0; 67 u64 cookie = 0;
104 u64 rp_pa = nasid; /* seed with nasid */ 68 unsigned long rp_pa = nasid; /* seed with nasid */
105 u64 len = 0; 69 size_t len = 0;
106 u64 buf = buf; 70 size_t buf_len = 0;
107 u64 buf_len = 0; 71 void *buf = buf;
108 void *buf_base = NULL; 72 void *buf_base = NULL;
109 73
110 while (1) { 74 while (1) {
111 75
112 status = sn_partition_reserved_page_pa(buf, &cookie, &rp_pa, 76 /* !!! rp_pa will need to be _gpa on UV.
113 &len); 77 * ??? So do we save it into the architecture specific parts
78 * ??? of the xpc_partition structure? Do we rename this
79 * ??? function or have two versions? Rename rp_pa for UV to
80 * ??? rp_gpa?
81 */
82 ret = xpc_get_partition_rsvd_page_pa(buf, &cookie, &rp_pa,
83 &len);
114 84
115 dev_dbg(xpc_part, "SAL returned with status=%li, cookie=" 85 dev_dbg(xpc_part, "SAL returned with ret=%d, cookie=0x%016lx, "
116 "0x%016lx, address=0x%016lx, len=0x%016lx\n", 86 "address=0x%016lx, len=0x%016lx\n", ret,
117 status, cookie, rp_pa, len); 87 (unsigned long)cookie, rp_pa, len);
118 88
119 if (status != SALRET_MORE_PASSES) 89 if (ret != xpNeedMoreInfo)
120 break; 90 break;
121 91
92 /* !!! L1_CACHE_ALIGN() is only a sn2-bte_copy requirement */
122 if (L1_CACHE_ALIGN(len) > buf_len) { 93 if (L1_CACHE_ALIGN(len) > buf_len) {
123 kfree(buf_base); 94 kfree(buf_base);
124 buf_len = L1_CACHE_ALIGN(len); 95 buf_len = L1_CACHE_ALIGN(len);
125 buf = (u64)xpc_kmalloc_cacheline_aligned(buf_len, 96 buf = xpc_kmalloc_cacheline_aligned(buf_len, GFP_KERNEL,
126 GFP_KERNEL, 97 &buf_base);
127 &buf_base);
128 if (buf_base == NULL) { 98 if (buf_base == NULL) {
129 dev_err(xpc_part, "unable to kmalloc " 99 dev_err(xpc_part, "unable to kmalloc "
130 "len=0x%016lx\n", buf_len); 100 "len=0x%016lx\n", buf_len);
131 status = SALRET_ERROR; 101 ret = xpNoMemory;
132 break; 102 break;
133 } 103 }
134 } 104 }
135 105
136 bte_res = xp_bte_copy(rp_pa, buf, buf_len, 106 ret = xp_remote_memcpy(xp_pa(buf), rp_pa, buf_len);
137 (BTE_NOTIFY | BTE_WACQUIRE), NULL); 107 if (ret != xpSuccess) {
138 if (bte_res != BTE_SUCCESS) { 108 dev_dbg(xpc_part, "xp_remote_memcpy failed %d\n", ret);
139 dev_dbg(xpc_part, "xp_bte_copy failed %i\n", bte_res);
140 status = SALRET_ERROR;
141 break; 109 break;
142 } 110 }
143 } 111 }
144 112
145 kfree(buf_base); 113 kfree(buf_base);
146 114
147 if (status != SALRET_OK) 115 if (ret != xpSuccess)
148 rp_pa = 0; 116 rp_pa = 0;
149 117
150 dev_dbg(xpc_part, "reserved page at phys address 0x%016lx\n", rp_pa); 118 dev_dbg(xpc_part, "reserved page at phys address 0x%016lx\n", rp_pa);
@@ -156,300 +124,77 @@ xpc_get_rsvd_page_pa(int nasid)
156 * other partitions to discover we are alive and establish initial 124 * other partitions to discover we are alive and establish initial
157 * communications. 125 * communications.
158 */ 126 */
159struct xpc_rsvd_page * 127int
160xpc_rsvd_page_init(void) 128xpc_setup_rsvd_page(void)
161{ 129{
130 int ret;
162 struct xpc_rsvd_page *rp; 131 struct xpc_rsvd_page *rp;
163 AMO_t *amos_page; 132 unsigned long rp_pa;
164 u64 rp_pa, nasid_array = 0; 133 unsigned long new_ts_jiffies;
165 int i, ret;
166 134
167 /* get the local reserved page's address */ 135 /* get the local reserved page's address */
168 136
169 preempt_disable(); 137 preempt_disable();
170 rp_pa = xpc_get_rsvd_page_pa(cpuid_to_nasid(smp_processor_id())); 138 rp_pa = xpc_get_rsvd_page_pa(xp_cpu_to_nasid(smp_processor_id()));
171 preempt_enable(); 139 preempt_enable();
172 if (rp_pa == 0) { 140 if (rp_pa == 0) {
173 dev_err(xpc_part, "SAL failed to locate the reserved page\n"); 141 dev_err(xpc_part, "SAL failed to locate the reserved page\n");
174 return NULL; 142 return -ESRCH;
175 } 143 }
176 rp = (struct xpc_rsvd_page *)__va(rp_pa); 144 rp = (struct xpc_rsvd_page *)__va(rp_pa);
177 145
178 if (rp->partid != sn_partition_id) { 146 if (rp->SAL_version < 3) {
179 dev_err(xpc_part, "the reserved page's partid of %d should be " 147 /* SAL_versions < 3 had a SAL_partid defined as a u8 */
180 "%d\n", rp->partid, sn_partition_id); 148 rp->SAL_partid &= 0xff;
181 return NULL; 149 }
150 BUG_ON(rp->SAL_partid != xp_partition_id);
151
152 if (rp->SAL_partid < 0 || rp->SAL_partid >= xp_max_npartitions) {
153 dev_err(xpc_part, "the reserved page's partid of %d is outside "
154 "supported range (< 0 || >= %d)\n", rp->SAL_partid,
155 xp_max_npartitions);
156 return -EINVAL;
182 } 157 }
183 158
184 rp->version = XPC_RP_VERSION; 159 rp->version = XPC_RP_VERSION;
160 rp->max_npartitions = xp_max_npartitions;
185 161
186 /* establish the actual sizes of the nasid masks */ 162 /* establish the actual sizes of the nasid masks */
187 if (rp->SAL_version == 1) { 163 if (rp->SAL_version == 1) {
188 /* SAL_version 1 didn't set the nasids_size field */ 164 /* SAL_version 1 didn't set the nasids_size field */
189 rp->nasids_size = 128; 165 rp->SAL_nasids_size = 128;
190 } 166 }
191 xp_nasid_mask_bytes = rp->nasids_size; 167 xpc_nasid_mask_nbytes = rp->SAL_nasids_size;
192 xp_nasid_mask_words = xp_nasid_mask_bytes / 8; 168 xpc_nasid_mask_nlongs = BITS_TO_LONGS(rp->SAL_nasids_size *
169 BITS_PER_BYTE);
193 170
194 /* setup the pointers to the various items in the reserved page */ 171 /* setup the pointers to the various items in the reserved page */
195 xpc_part_nasids = XPC_RP_PART_NASIDS(rp); 172 xpc_part_nasids = XPC_RP_PART_NASIDS(rp);
196 xpc_mach_nasids = XPC_RP_MACH_NASIDS(rp); 173 xpc_mach_nasids = XPC_RP_MACH_NASIDS(rp);
197 xpc_vars = XPC_RP_VARS(rp);
198 xpc_vars_part = XPC_RP_VARS_PART(rp);
199
200 /*
201 * Before clearing xpc_vars, see if a page of AMOs had been previously
202 * allocated. If not we'll need to allocate one and set permissions
203 * so that cross-partition AMOs are allowed.
204 *
205 * The allocated AMO page needs MCA reporting to remain disabled after
206 * XPC has unloaded. To make this work, we keep a copy of the pointer
207 * to this page (i.e., amos_page) in the struct xpc_vars structure,
208 * which is pointed to by the reserved page, and re-use that saved copy
209 * on subsequent loads of XPC. This AMO page is never freed, and its
210 * memory protections are never restricted.
211 */
212 amos_page = xpc_vars->amos_page;
213 if (amos_page == NULL) {
214 amos_page = (AMO_t *)TO_AMO(uncached_alloc_page(0, 1));
215 if (amos_page == NULL) {
216 dev_err(xpc_part, "can't allocate page of AMOs\n");
217 return NULL;
218 }
219
220 /*
221 * Open up AMO-R/W to cpu. This is done for Shub 1.1 systems
222 * when xpc_allow_IPI_ops() is called via xpc_hb_init().
223 */
224 if (!enable_shub_wars_1_1()) {
225 ret = sn_change_memprotect(ia64_tpa((u64)amos_page),
226 PAGE_SIZE,
227 SN_MEMPROT_ACCESS_CLASS_1,
228 &nasid_array);
229 if (ret != 0) {
230 dev_err(xpc_part, "can't change memory "
231 "protections\n");
232 uncached_free_page(__IA64_UNCACHED_OFFSET |
233 TO_PHYS((u64)amos_page), 1);
234 return NULL;
235 }
236 }
237 } else if (!IS_AMO_ADDRESS((u64)amos_page)) {
238 /*
239 * EFI's XPBOOT can also set amos_page in the reserved page,
240 * but it happens to leave it as an uncached physical address
241 * and we need it to be an uncached virtual, so we'll have to
242 * convert it.
243 */
244 if (!IS_AMO_PHYS_ADDRESS((u64)amos_page)) {
245 dev_err(xpc_part, "previously used amos_page address "
246 "is bad = 0x%p\n", (void *)amos_page);
247 return NULL;
248 }
249 amos_page = (AMO_t *)TO_AMO((u64)amos_page);
250 }
251
252 /* clear xpc_vars */
253 memset(xpc_vars, 0, sizeof(struct xpc_vars));
254
255 xpc_vars->version = XPC_V_VERSION;
256 xpc_vars->act_nasid = cpuid_to_nasid(0);
257 xpc_vars->act_phys_cpuid = cpu_physical_id(0);
258 xpc_vars->vars_part_pa = __pa(xpc_vars_part);
259 xpc_vars->amos_page_pa = ia64_tpa((u64)amos_page);
260 xpc_vars->amos_page = amos_page; /* save for next load of XPC */
261
262 /* clear xpc_vars_part */
263 memset((u64 *)xpc_vars_part, 0, sizeof(struct xpc_vars_part) *
264 XP_MAX_PARTITIONS);
265
266 /* initialize the activate IRQ related AMO variables */
267 for (i = 0; i < xp_nasid_mask_words; i++)
268 (void)xpc_IPI_init(XPC_ACTIVATE_IRQ_AMOS + i);
269
270 /* initialize the engaged remote partitions related AMO variables */
271 (void)xpc_IPI_init(XPC_ENGAGED_PARTITIONS_AMO);
272 (void)xpc_IPI_init(XPC_DISENGAGE_REQUEST_AMO);
273 174
274 /* timestamp of when reserved page was setup by XPC */ 175 ret = xpc_setup_rsvd_page_sn(rp);
275 rp->stamp = CURRENT_TIME; 176 if (ret != 0)
177 return ret;
276 178
277 /* 179 /*
180 * Set timestamp of when reserved page was setup by XPC.
278 * This signifies to the remote partition that our reserved 181 * This signifies to the remote partition that our reserved
279 * page is initialized. 182 * page is initialized.
280 */ 183 */
281 rp->vars_pa = __pa(xpc_vars); 184 new_ts_jiffies = jiffies;
185 if (new_ts_jiffies == 0 || new_ts_jiffies == rp->ts_jiffies)
186 new_ts_jiffies++;
187 rp->ts_jiffies = new_ts_jiffies;
282 188
283 return rp; 189 xpc_rsvd_page = rp;
190 return 0;
284} 191}
285 192
286/*
287 * Change protections to allow IPI operations (and AMO operations on
288 * Shub 1.1 systems).
289 */
290void 193void
291xpc_allow_IPI_ops(void) 194xpc_teardown_rsvd_page(void)
292{ 195{
293 int node; 196 /* a zero timestamp indicates our rsvd page is not initialized */
294 int nasid; 197 xpc_rsvd_page->ts_jiffies = 0;
295
296 /* >>> Change SH_IPI_ACCESS code to use SAL call once it is available */
297
298 if (is_shub2()) {
299 xpc_sh2_IPI_access0 =
300 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS0));
301 xpc_sh2_IPI_access1 =
302 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS1));
303 xpc_sh2_IPI_access2 =
304 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS2));
305 xpc_sh2_IPI_access3 =
306 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS3));
307
308 for_each_online_node(node) {
309 nasid = cnodeid_to_nasid(node);
310 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS0),
311 -1UL);
312 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS1),
313 -1UL);
314 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS2),
315 -1UL);
316 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS3),
317 -1UL);
318 }
319
320 } else {
321 xpc_sh1_IPI_access =
322 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH1_IPI_ACCESS));
323
324 for_each_online_node(node) {
325 nasid = cnodeid_to_nasid(node);
326 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH1_IPI_ACCESS),
327 -1UL);
328
329 /*
330 * Since the BIST collides with memory operations on
331 * SHUB 1.1 sn_change_memprotect() cannot be used.
332 */
333 if (enable_shub_wars_1_1()) {
334 /* open up everything */
335 xpc_prot_vec[node] = (u64)HUB_L((u64 *)
336 GLOBAL_MMR_ADDR
337 (nasid,
338 SH1_MD_DQLP_MMR_DIR_PRIVEC0));
339 HUB_S((u64 *)
340 GLOBAL_MMR_ADDR(nasid,
341 SH1_MD_DQLP_MMR_DIR_PRIVEC0),
342 -1UL);
343 HUB_S((u64 *)
344 GLOBAL_MMR_ADDR(nasid,
345 SH1_MD_DQRP_MMR_DIR_PRIVEC0),
346 -1UL);
347 }
348 }
349 }
350}
351
352/*
353 * Restrict protections to disallow IPI operations (and AMO operations on
354 * Shub 1.1 systems).
355 */
356void
357xpc_restrict_IPI_ops(void)
358{
359 int node;
360 int nasid;
361
362 /* >>> Change SH_IPI_ACCESS code to use SAL call once it is available */
363
364 if (is_shub2()) {
365
366 for_each_online_node(node) {
367 nasid = cnodeid_to_nasid(node);
368 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS0),
369 xpc_sh2_IPI_access0);
370 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS1),
371 xpc_sh2_IPI_access1);
372 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS2),
373 xpc_sh2_IPI_access2);
374 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS3),
375 xpc_sh2_IPI_access3);
376 }
377
378 } else {
379
380 for_each_online_node(node) {
381 nasid = cnodeid_to_nasid(node);
382 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH1_IPI_ACCESS),
383 xpc_sh1_IPI_access);
384
385 if (enable_shub_wars_1_1()) {
386 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid,
387 SH1_MD_DQLP_MMR_DIR_PRIVEC0),
388 xpc_prot_vec[node]);
389 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid,
390 SH1_MD_DQRP_MMR_DIR_PRIVEC0),
391 xpc_prot_vec[node]);
392 }
393 }
394 }
395}
396
397/*
398 * At periodic intervals, scan through all active partitions and ensure
399 * their heartbeat is still active. If not, the partition is deactivated.
400 */
401void
402xpc_check_remote_hb(void)
403{
404 struct xpc_vars *remote_vars;
405 struct xpc_partition *part;
406 short partid;
407 bte_result_t bres;
408
409 remote_vars = (struct xpc_vars *)xpc_remote_copy_buffer;
410
411 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
412
413 if (xpc_exiting)
414 break;
415
416 if (partid == sn_partition_id)
417 continue;
418
419 part = &xpc_partitions[partid];
420
421 if (part->act_state == XPC_P_INACTIVE ||
422 part->act_state == XPC_P_DEACTIVATING) {
423 continue;
424 }
425
426 /* pull the remote_hb cache line */
427 bres = xp_bte_copy(part->remote_vars_pa,
428 (u64)remote_vars,
429 XPC_RP_VARS_SIZE,
430 (BTE_NOTIFY | BTE_WACQUIRE), NULL);
431 if (bres != BTE_SUCCESS) {
432 XPC_DEACTIVATE_PARTITION(part,
433 xpc_map_bte_errors(bres));
434 continue;
435 }
436
437 dev_dbg(xpc_part, "partid = %d, heartbeat = %ld, last_heartbeat"
438 " = %ld, heartbeat_offline = %ld, HB_mask = 0x%lx\n",
439 partid, remote_vars->heartbeat, part->last_heartbeat,
440 remote_vars->heartbeat_offline,
441 remote_vars->heartbeating_to_mask);
442
443 if (((remote_vars->heartbeat == part->last_heartbeat) &&
444 (remote_vars->heartbeat_offline == 0)) ||
445 !xpc_hb_allowed(sn_partition_id, remote_vars)) {
446
447 XPC_DEACTIVATE_PARTITION(part, xpNoHeartbeat);
448 continue;
449 }
450
451 part->last_heartbeat = remote_vars->heartbeat;
452 }
453} 198}
454 199
455/* 200/*
@@ -459,11 +204,12 @@ xpc_check_remote_hb(void)
459 * is large enough to contain a copy of their reserved page header and 204 * is large enough to contain a copy of their reserved page header and
460 * part_nasids mask. 205 * part_nasids mask.
461 */ 206 */
462static enum xp_retval 207enum xp_retval
463xpc_get_remote_rp(int nasid, u64 *discovered_nasids, 208xpc_get_remote_rp(int nasid, unsigned long *discovered_nasids,
464 struct xpc_rsvd_page *remote_rp, u64 *remote_rp_pa) 209 struct xpc_rsvd_page *remote_rp, unsigned long *remote_rp_pa)
465{ 210{
466 int bres, i; 211 int l;
212 enum xp_retval ret;
467 213
468 /* get the reserved page's physical address */ 214 /* get the reserved page's physical address */
469 215
@@ -472,355 +218,45 @@ xpc_get_remote_rp(int nasid, u64 *discovered_nasids,
472 return xpNoRsvdPageAddr; 218 return xpNoRsvdPageAddr;
473 219
474 /* pull over the reserved page header and part_nasids mask */ 220 /* pull over the reserved page header and part_nasids mask */
475 bres = xp_bte_copy(*remote_rp_pa, (u64)remote_rp, 221 ret = xp_remote_memcpy(xp_pa(remote_rp), *remote_rp_pa,
476 XPC_RP_HEADER_SIZE + xp_nasid_mask_bytes, 222 XPC_RP_HEADER_SIZE + xpc_nasid_mask_nbytes);
477 (BTE_NOTIFY | BTE_WACQUIRE), NULL); 223 if (ret != xpSuccess)
478 if (bres != BTE_SUCCESS) 224 return ret;
479 return xpc_map_bte_errors(bres);
480 225
481 if (discovered_nasids != NULL) { 226 if (discovered_nasids != NULL) {
482 u64 *remote_part_nasids = XPC_RP_PART_NASIDS(remote_rp); 227 unsigned long *remote_part_nasids =
483 228 XPC_RP_PART_NASIDS(remote_rp);
484 for (i = 0; i < xp_nasid_mask_words; i++)
485 discovered_nasids[i] |= remote_part_nasids[i];
486 }
487
488 /* check that the partid is for another partition */
489 229
490 if (remote_rp->partid < 1 || 230 for (l = 0; l < xpc_nasid_mask_nlongs; l++)
491 remote_rp->partid > (XP_MAX_PARTITIONS - 1)) { 231 discovered_nasids[l] |= remote_part_nasids[l];
492 return xpInvalidPartid;
493 } 232 }
494 233
495 if (remote_rp->partid == sn_partition_id) 234 /* zero timestamp indicates the reserved page has not been setup */
496 return xpLocalPartid; 235 if (remote_rp->ts_jiffies == 0)
236 return xpRsvdPageNotSet;
497 237
498 if (XPC_VERSION_MAJOR(remote_rp->version) != 238 if (XPC_VERSION_MAJOR(remote_rp->version) !=
499 XPC_VERSION_MAJOR(XPC_RP_VERSION)) { 239 XPC_VERSION_MAJOR(XPC_RP_VERSION)) {
500 return xpBadVersion; 240 return xpBadVersion;
501 } 241 }
502 242
503 return xpSuccess; 243 /* check that both remote and local partids are valid for each side */
504} 244 if (remote_rp->SAL_partid < 0 ||
505 245 remote_rp->SAL_partid >= xp_max_npartitions ||
506/* 246 remote_rp->max_npartitions <= xp_partition_id) {
507 * Get a copy of the remote partition's XPC variables from the reserved page. 247 return xpInvalidPartid;
508 *
509 * remote_vars points to a buffer that is cacheline aligned for BTE copies and
510 * assumed to be of size XPC_RP_VARS_SIZE.
511 */
512static enum xp_retval
513xpc_get_remote_vars(u64 remote_vars_pa, struct xpc_vars *remote_vars)
514{
515 int bres;
516
517 if (remote_vars_pa == 0)
518 return xpVarsNotSet;
519
520 /* pull over the cross partition variables */
521 bres = xp_bte_copy(remote_vars_pa, (u64)remote_vars, XPC_RP_VARS_SIZE,
522 (BTE_NOTIFY | BTE_WACQUIRE), NULL);
523 if (bres != BTE_SUCCESS)
524 return xpc_map_bte_errors(bres);
525
526 if (XPC_VERSION_MAJOR(remote_vars->version) !=
527 XPC_VERSION_MAJOR(XPC_V_VERSION)) {
528 return xpBadVersion;
529 }
530
531 return xpSuccess;
532}
533
534/*
535 * Update the remote partition's info.
536 */
537static void
538xpc_update_partition_info(struct xpc_partition *part, u8 remote_rp_version,
539 struct timespec *remote_rp_stamp, u64 remote_rp_pa,
540 u64 remote_vars_pa, struct xpc_vars *remote_vars)
541{
542 part->remote_rp_version = remote_rp_version;
543 dev_dbg(xpc_part, " remote_rp_version = 0x%016x\n",
544 part->remote_rp_version);
545
546 part->remote_rp_stamp = *remote_rp_stamp;
547 dev_dbg(xpc_part, " remote_rp_stamp (tv_sec = 0x%lx tv_nsec = 0x%lx\n",
548 part->remote_rp_stamp.tv_sec, part->remote_rp_stamp.tv_nsec);
549
550 part->remote_rp_pa = remote_rp_pa;
551 dev_dbg(xpc_part, " remote_rp_pa = 0x%016lx\n", part->remote_rp_pa);
552
553 part->remote_vars_pa = remote_vars_pa;
554 dev_dbg(xpc_part, " remote_vars_pa = 0x%016lx\n",
555 part->remote_vars_pa);
556
557 part->last_heartbeat = remote_vars->heartbeat;
558 dev_dbg(xpc_part, " last_heartbeat = 0x%016lx\n",
559 part->last_heartbeat);
560
561 part->remote_vars_part_pa = remote_vars->vars_part_pa;
562 dev_dbg(xpc_part, " remote_vars_part_pa = 0x%016lx\n",
563 part->remote_vars_part_pa);
564
565 part->remote_act_nasid = remote_vars->act_nasid;
566 dev_dbg(xpc_part, " remote_act_nasid = 0x%x\n",
567 part->remote_act_nasid);
568
569 part->remote_act_phys_cpuid = remote_vars->act_phys_cpuid;
570 dev_dbg(xpc_part, " remote_act_phys_cpuid = 0x%x\n",
571 part->remote_act_phys_cpuid);
572
573 part->remote_amos_page_pa = remote_vars->amos_page_pa;
574 dev_dbg(xpc_part, " remote_amos_page_pa = 0x%lx\n",
575 part->remote_amos_page_pa);
576
577 part->remote_vars_version = remote_vars->version;
578 dev_dbg(xpc_part, " remote_vars_version = 0x%x\n",
579 part->remote_vars_version);
580}
581
582/*
583 * Prior code has determined the nasid which generated an IPI. Inspect
584 * that nasid to determine if its partition needs to be activated or
585 * deactivated.
586 *
587 * A partition is considered "awaiting activation" if our partition
588 * flags indicate it is not active and it has a heartbeat. A
589 * partition is considered "awaiting deactivation" if our partition
590 * flags indicate it is active but it has no heartbeat or it is not
591 * sending its heartbeat to us.
592 *
593 * To determine the heartbeat, the remote nasid must have a properly
594 * initialized reserved page.
595 */
596static void
597xpc_identify_act_IRQ_req(int nasid)
598{
599 struct xpc_rsvd_page *remote_rp;
600 struct xpc_vars *remote_vars;
601 u64 remote_rp_pa;
602 u64 remote_vars_pa;
603 int remote_rp_version;
604 int reactivate = 0;
605 int stamp_diff;
606 struct timespec remote_rp_stamp = { 0, 0 };
607 short partid;
608 struct xpc_partition *part;
609 enum xp_retval ret;
610
611 /* pull over the reserved page structure */
612
613 remote_rp = (struct xpc_rsvd_page *)xpc_remote_copy_buffer;
614
615 ret = xpc_get_remote_rp(nasid, NULL, remote_rp, &remote_rp_pa);
616 if (ret != xpSuccess) {
617 dev_warn(xpc_part, "unable to get reserved page from nasid %d, "
618 "which sent interrupt, reason=%d\n", nasid, ret);
619 return;
620 }
621
622 remote_vars_pa = remote_rp->vars_pa;
623 remote_rp_version = remote_rp->version;
624 if (XPC_SUPPORTS_RP_STAMP(remote_rp_version))
625 remote_rp_stamp = remote_rp->stamp;
626
627 partid = remote_rp->partid;
628 part = &xpc_partitions[partid];
629
630 /* pull over the cross partition variables */
631
632 remote_vars = (struct xpc_vars *)xpc_remote_copy_buffer;
633
634 ret = xpc_get_remote_vars(remote_vars_pa, remote_vars);
635 if (ret != xpSuccess) {
636
637 dev_warn(xpc_part, "unable to get XPC variables from nasid %d, "
638 "which sent interrupt, reason=%d\n", nasid, ret);
639
640 XPC_DEACTIVATE_PARTITION(part, ret);
641 return;
642 }
643
644 part->act_IRQ_rcvd++;
645
646 dev_dbg(xpc_part, "partid for nasid %d is %d; IRQs = %d; HB = "
647 "%ld:0x%lx\n", (int)nasid, (int)partid, part->act_IRQ_rcvd,
648 remote_vars->heartbeat, remote_vars->heartbeating_to_mask);
649
650 if (xpc_partition_disengaged(part) &&
651 part->act_state == XPC_P_INACTIVE) {
652
653 xpc_update_partition_info(part, remote_rp_version,
654 &remote_rp_stamp, remote_rp_pa,
655 remote_vars_pa, remote_vars);
656
657 if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) {
658 if (xpc_partition_disengage_requested(1UL << partid)) {
659 /*
660 * Other side is waiting on us to disengage,
661 * even though we already have.
662 */
663 return;
664 }
665 } else {
666 /* other side doesn't support disengage requests */
667 xpc_clear_partition_disengage_request(1UL << partid);
668 }
669
670 xpc_activate_partition(part);
671 return;
672 }
673
674 DBUG_ON(part->remote_rp_version == 0);
675 DBUG_ON(part->remote_vars_version == 0);
676
677 if (!XPC_SUPPORTS_RP_STAMP(part->remote_rp_version)) {
678 DBUG_ON(XPC_SUPPORTS_DISENGAGE_REQUEST(part->
679 remote_vars_version));
680
681 if (!XPC_SUPPORTS_RP_STAMP(remote_rp_version)) {
682 DBUG_ON(XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->
683 version));
684 /* see if the other side rebooted */
685 if (part->remote_amos_page_pa ==
686 remote_vars->amos_page_pa &&
687 xpc_hb_allowed(sn_partition_id, remote_vars)) {
688 /* doesn't look that way, so ignore the IPI */
689 return;
690 }
691 }
692
693 /*
694 * Other side rebooted and previous XPC didn't support the
695 * disengage request, so we don't need to do anything special.
696 */
697
698 xpc_update_partition_info(part, remote_rp_version,
699 &remote_rp_stamp, remote_rp_pa,
700 remote_vars_pa, remote_vars);
701 part->reactivate_nasid = nasid;
702 XPC_DEACTIVATE_PARTITION(part, xpReactivating);
703 return;
704 }
705
706 DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version));
707
708 if (!XPC_SUPPORTS_RP_STAMP(remote_rp_version)) {
709 DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->version));
710
711 /*
712 * Other side rebooted and previous XPC did support the
713 * disengage request, but the new one doesn't.
714 */
715
716 xpc_clear_partition_engaged(1UL << partid);
717 xpc_clear_partition_disengage_request(1UL << partid);
718
719 xpc_update_partition_info(part, remote_rp_version,
720 &remote_rp_stamp, remote_rp_pa,
721 remote_vars_pa, remote_vars);
722 reactivate = 1;
723
724 } else {
725 DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->version));
726
727 stamp_diff = xpc_compare_stamps(&part->remote_rp_stamp,
728 &remote_rp_stamp);
729 if (stamp_diff != 0) {
730 DBUG_ON(stamp_diff >= 0);
731
732 /*
733 * Other side rebooted and the previous XPC did support
734 * the disengage request, as does the new one.
735 */
736
737 DBUG_ON(xpc_partition_engaged(1UL << partid));
738 DBUG_ON(xpc_partition_disengage_requested(1UL <<
739 partid));
740
741 xpc_update_partition_info(part, remote_rp_version,
742 &remote_rp_stamp,
743 remote_rp_pa, remote_vars_pa,
744 remote_vars);
745 reactivate = 1;
746 }
747 }
748
749 if (part->disengage_request_timeout > 0 &&
750 !xpc_partition_disengaged(part)) {
751 /* still waiting on other side to disengage from us */
752 return;
753 }
754
755 if (reactivate) {
756 part->reactivate_nasid = nasid;
757 XPC_DEACTIVATE_PARTITION(part, xpReactivating);
758
759 } else if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version) &&
760 xpc_partition_disengage_requested(1UL << partid)) {
761 XPC_DEACTIVATE_PARTITION(part, xpOtherGoingDown);
762 } 248 }
763}
764 249
765/* 250 if (remote_rp->SAL_partid == xp_partition_id)
766 * Loop through the activation AMO variables and process any bits 251 return xpLocalPartid;
767 * which are set. Each bit indicates a nasid sending a partition
768 * activation or deactivation request.
769 *
770 * Return #of IRQs detected.
771 */
772int
773xpc_identify_act_IRQ_sender(void)
774{
775 int word, bit;
776 u64 nasid_mask;
777 u64 nasid; /* remote nasid */
778 int n_IRQs_detected = 0;
779 AMO_t *act_amos;
780
781 act_amos = xpc_vars->amos_page + XPC_ACTIVATE_IRQ_AMOS;
782
783 /* scan through act AMO variable looking for non-zero entries */
784 for (word = 0; word < xp_nasid_mask_words; word++) {
785
786 if (xpc_exiting)
787 break;
788
789 nasid_mask = xpc_IPI_receive(&act_amos[word]);
790 if (nasid_mask == 0) {
791 /* no IRQs from nasids in this variable */
792 continue;
793 }
794
795 dev_dbg(xpc_part, "AMO[%d] gave back 0x%lx\n", word,
796 nasid_mask);
797
798 /*
799 * If this nasid has been added to the machine since
800 * our partition was reset, this will retain the
801 * remote nasid in our reserved pages machine mask.
802 * This is used in the event of module reload.
803 */
804 xpc_mach_nasids[word] |= nasid_mask;
805
806 /* locate the nasid(s) which sent interrupts */
807 252
808 for (bit = 0; bit < (8 * sizeof(u64)); bit++) { 253 return xpSuccess;
809 if (nasid_mask & (1UL << bit)) {
810 n_IRQs_detected++;
811 nasid = XPC_NASID_FROM_W_B(word, bit);
812 dev_dbg(xpc_part, "interrupt from nasid %ld\n",
813 nasid);
814 xpc_identify_act_IRQ_req(nasid);
815 }
816 }
817 }
818 return n_IRQs_detected;
819} 254}
820 255
821/* 256/*
822 * See if the other side has responded to a partition disengage request 257 * See if the other side has responded to a partition deactivate request
823 * from us. 258 * from us. Though we requested the remote partition to deactivate with regard
259 * to us, we really only need to wait for the other side to disengage from us.
824 */ 260 */
825int 261int
826xpc_partition_disengaged(struct xpc_partition *part) 262xpc_partition_disengaged(struct xpc_partition *part)
@@ -828,41 +264,37 @@ xpc_partition_disengaged(struct xpc_partition *part)
828 short partid = XPC_PARTID(part); 264 short partid = XPC_PARTID(part);
829 int disengaged; 265 int disengaged;
830 266
831 disengaged = (xpc_partition_engaged(1UL << partid) == 0); 267 disengaged = !xpc_partition_engaged(partid);
832 if (part->disengage_request_timeout) { 268 if (part->disengage_timeout) {
833 if (!disengaged) { 269 if (!disengaged) {
834 if (time_before(jiffies, 270 if (time_is_after_jiffies(part->disengage_timeout)) {
835 part->disengage_request_timeout)) {
836 /* timelimit hasn't been reached yet */ 271 /* timelimit hasn't been reached yet */
837 return 0; 272 return 0;
838 } 273 }
839 274
840 /* 275 /*
841 * Other side hasn't responded to our disengage 276 * Other side hasn't responded to our deactivate
842 * request in a timely fashion, so assume it's dead. 277 * request in a timely fashion, so assume it's dead.
843 */ 278 */
844 279
845 dev_info(xpc_part, "disengage from remote partition %d " 280 dev_info(xpc_part, "deactivate request to remote "
846 "timed out\n", partid); 281 "partition %d timed out\n", partid);
847 xpc_disengage_request_timedout = 1; 282 xpc_disengage_timedout = 1;
848 xpc_clear_partition_engaged(1UL << partid); 283 xpc_assume_partition_disengaged(partid);
849 disengaged = 1; 284 disengaged = 1;
850 } 285 }
851 part->disengage_request_timeout = 0; 286 part->disengage_timeout = 0;
852 287
853 /* cancel the timer function, provided it's not us */ 288 /* cancel the timer function, provided it's not us */
854 if (!in_interrupt()) { 289 if (!in_interrupt())
855 del_singleshot_timer_sync(&part-> 290 del_singleshot_timer_sync(&part->disengage_timer);
856 disengage_request_timer);
857 }
858 291
859 DBUG_ON(part->act_state != XPC_P_DEACTIVATING && 292 DBUG_ON(part->act_state != XPC_P_AS_DEACTIVATING &&
860 part->act_state != XPC_P_INACTIVE); 293 part->act_state != XPC_P_AS_INACTIVE);
861 if (part->act_state != XPC_P_INACTIVE) 294 if (part->act_state != XPC_P_AS_INACTIVE)
862 xpc_wakeup_channel_mgr(part); 295 xpc_wakeup_channel_mgr(part);
863 296
864 if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) 297 xpc_cancel_partition_deactivation_request(part);
865 xpc_cancel_partition_disengage_request(part);
866 } 298 }
867 return disengaged; 299 return disengaged;
868} 300}
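
Editor's note: the disengage timeout above is checked with time_is_after_jiffies() rather than a plain comparison of the raw counter values, so the test stays correct if jiffies wraps. The sketch below is not part of the patch; it is a minimal userspace illustration of the same signed-difference idiom, with a hypothetical 32-bit tick counter standing in for jiffies.

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Illustrative stand-in for the kernel's time_after()-style macros:
     * comparing two tick values via a signed difference stays correct across
     * counter wraparound, which a direct "a < b" comparison would not.
     */
    static int tick_after(uint32_t a, uint32_t b)
    {
            return (int32_t)(b - a) < 0;    /* true if tick a is later than tick b */
    }

    int main(void)
    {
            uint32_t now = 0xfffffff0u;     /* hypothetical current tick, close to wrapping */
            uint32_t deadline = now + 100;  /* disengage timelimit; wraps past zero */

            /* deadline not reached yet: keep waiting for the peer to disengage */
            printf("deadline passed? %d\n", tick_after(now, deadline));

            now += 200;                     /* time moves on, past the deadline */
            printf("deadline passed? %d\n", tick_after(now, deadline));
            return 0;
    }

Once the deadline does pass, the driver code above stops waiting and assumes the peer is dead via xpc_assume_partition_disengaged().
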
@@ -879,8 +311,8 @@ xpc_mark_partition_active(struct xpc_partition *part)
879 dev_dbg(xpc_part, "setting partition %d to ACTIVE\n", XPC_PARTID(part)); 311 dev_dbg(xpc_part, "setting partition %d to ACTIVE\n", XPC_PARTID(part));
880 312
881 spin_lock_irqsave(&part->act_lock, irq_flags); 313 spin_lock_irqsave(&part->act_lock, irq_flags);
882 if (part->act_state == XPC_P_ACTIVATING) { 314 if (part->act_state == XPC_P_AS_ACTIVATING) {
883 part->act_state = XPC_P_ACTIVE; 315 part->act_state = XPC_P_AS_ACTIVE;
884 ret = xpSuccess; 316 ret = xpSuccess;
885 } else { 317 } else {
886 DBUG_ON(part->reason == xpSuccess); 318 DBUG_ON(part->reason == xpSuccess);
@@ -892,7 +324,7 @@ xpc_mark_partition_active(struct xpc_partition *part)
892} 324}
893 325
894/* 326/*
895 * Notify XPC that the partition is down. 327 * Start the process of deactivating the specified partition.
896 */ 328 */
897void 329void
898xpc_deactivate_partition(const int line, struct xpc_partition *part, 330xpc_deactivate_partition(const int line, struct xpc_partition *part,
@@ -902,16 +334,16 @@ xpc_deactivate_partition(const int line, struct xpc_partition *part,
902 334
903 spin_lock_irqsave(&part->act_lock, irq_flags); 335 spin_lock_irqsave(&part->act_lock, irq_flags);
904 336
905 if (part->act_state == XPC_P_INACTIVE) { 337 if (part->act_state == XPC_P_AS_INACTIVE) {
906 XPC_SET_REASON(part, reason, line); 338 XPC_SET_REASON(part, reason, line);
907 spin_unlock_irqrestore(&part->act_lock, irq_flags); 339 spin_unlock_irqrestore(&part->act_lock, irq_flags);
908 if (reason == xpReactivating) { 340 if (reason == xpReactivating) {
909 /* we interrupt ourselves to reactivate partition */ 341 /* we interrupt ourselves to reactivate partition */
910 xpc_IPI_send_reactivate(part); 342 xpc_request_partition_reactivation(part);
911 } 343 }
912 return; 344 return;
913 } 345 }
914 if (part->act_state == XPC_P_DEACTIVATING) { 346 if (part->act_state == XPC_P_AS_DEACTIVATING) {
915 if ((part->reason == xpUnloading && reason != xpUnloading) || 347 if ((part->reason == xpUnloading && reason != xpUnloading) ||
916 reason == xpReactivating) { 348 reason == xpReactivating) {
917 XPC_SET_REASON(part, reason, line); 349 XPC_SET_REASON(part, reason, line);
@@ -920,22 +352,18 @@ xpc_deactivate_partition(const int line, struct xpc_partition *part,
920 return; 352 return;
921 } 353 }
922 354
923 part->act_state = XPC_P_DEACTIVATING; 355 part->act_state = XPC_P_AS_DEACTIVATING;
924 XPC_SET_REASON(part, reason, line); 356 XPC_SET_REASON(part, reason, line);
925 357
926 spin_unlock_irqrestore(&part->act_lock, irq_flags); 358 spin_unlock_irqrestore(&part->act_lock, irq_flags);
927 359
928 if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) { 360 /* ask remote partition to deactivate with regard to us */
929 xpc_request_partition_disengage(part); 361 xpc_request_partition_deactivation(part);
930 xpc_IPI_send_disengage(part);
931 362
932 /* set a timelimit on the disengage request */ 363 /* set a timelimit on the disengage phase of the deactivation request */
933 part->disengage_request_timeout = jiffies + 364 part->disengage_timeout = jiffies + (xpc_disengage_timelimit * HZ);
934 (xpc_disengage_request_timelimit * HZ); 365 part->disengage_timer.expires = part->disengage_timeout;
935 part->disengage_request_timer.expires = 366 add_timer(&part->disengage_timer);
936 part->disengage_request_timeout;
937 add_timer(&part->disengage_request_timer);
938 }
939 367
940 dev_dbg(xpc_part, "bringing partition %d down, reason = %d\n", 368 dev_dbg(xpc_part, "bringing partition %d down, reason = %d\n",
941 XPC_PARTID(part), reason); 369 XPC_PARTID(part), reason);
@@ -955,7 +383,7 @@ xpc_mark_partition_inactive(struct xpc_partition *part)
955 XPC_PARTID(part)); 383 XPC_PARTID(part));
956 384
957 spin_lock_irqsave(&part->act_lock, irq_flags); 385 spin_lock_irqsave(&part->act_lock, irq_flags);
958 part->act_state = XPC_P_INACTIVE; 386 part->act_state = XPC_P_AS_INACTIVE;
959 spin_unlock_irqrestore(&part->act_lock, irq_flags); 387 spin_unlock_irqrestore(&part->act_lock, irq_flags);
960 part->remote_rp_pa = 0; 388 part->remote_rp_pa = 0;
961} 389}
@@ -974,28 +402,22 @@ xpc_discovery(void)
974{ 402{
975 void *remote_rp_base; 403 void *remote_rp_base;
976 struct xpc_rsvd_page *remote_rp; 404 struct xpc_rsvd_page *remote_rp;
977 struct xpc_vars *remote_vars; 405 unsigned long remote_rp_pa;
978 u64 remote_rp_pa;
979 u64 remote_vars_pa;
980 int region; 406 int region;
981 int region_size; 407 int region_size;
982 int max_regions; 408 int max_regions;
983 int nasid; 409 int nasid;
984 struct xpc_rsvd_page *rp; 410 struct xpc_rsvd_page *rp;
985 short partid; 411 unsigned long *discovered_nasids;
986 struct xpc_partition *part;
987 u64 *discovered_nasids;
988 enum xp_retval ret; 412 enum xp_retval ret;
989 413
990 remote_rp = xpc_kmalloc_cacheline_aligned(XPC_RP_HEADER_SIZE + 414 remote_rp = xpc_kmalloc_cacheline_aligned(XPC_RP_HEADER_SIZE +
991 xp_nasid_mask_bytes, 415 xpc_nasid_mask_nbytes,
992 GFP_KERNEL, &remote_rp_base); 416 GFP_KERNEL, &remote_rp_base);
993 if (remote_rp == NULL) 417 if (remote_rp == NULL)
994 return; 418 return;
995 419
996 remote_vars = (struct xpc_vars *)remote_rp; 420 discovered_nasids = kzalloc(sizeof(long) * xpc_nasid_mask_nlongs,
997
998 discovered_nasids = kzalloc(sizeof(u64) * xp_nasid_mask_words,
999 GFP_KERNEL); 421 GFP_KERNEL);
1000 if (discovered_nasids == NULL) { 422 if (discovered_nasids == NULL) {
1001 kfree(remote_rp_base); 423 kfree(remote_rp_base);
@@ -1010,7 +432,7 @@ xpc_discovery(void)
1010 * protection is in regards to memory, IOI and IPI. 432 * protection is in regards to memory, IOI and IPI.
1011 */ 433 */
1012 max_regions = 64; 434 max_regions = 64;
1013 region_size = sn_region_size; 435 region_size = xp_region_size;
1014 436
1015 switch (region_size) { 437 switch (region_size) {
1016 case 128: 438 case 128:
@@ -1038,28 +460,28 @@ xpc_discovery(void)
1038 460
1039 dev_dbg(xpc_part, "checking nasid %d\n", nasid); 461 dev_dbg(xpc_part, "checking nasid %d\n", nasid);
1040 462
1041 if (XPC_NASID_IN_ARRAY(nasid, xpc_part_nasids)) { 463 if (test_bit(nasid / 2, xpc_part_nasids)) {
1042 dev_dbg(xpc_part, "PROM indicates Nasid %d is " 464 dev_dbg(xpc_part, "PROM indicates Nasid %d is "
1043 "part of the local partition; skipping " 465 "part of the local partition; skipping "
1044 "region\n", nasid); 466 "region\n", nasid);
1045 break; 467 break;
1046 } 468 }
1047 469
1048 if (!(XPC_NASID_IN_ARRAY(nasid, xpc_mach_nasids))) { 470 if (!(test_bit(nasid / 2, xpc_mach_nasids))) {
1049 dev_dbg(xpc_part, "PROM indicates Nasid %d was " 471 dev_dbg(xpc_part, "PROM indicates Nasid %d was "
1050 "not on Numa-Link network at reset\n", 472 "not on Numa-Link network at reset\n",
1051 nasid); 473 nasid);
1052 continue; 474 continue;
1053 } 475 }
1054 476
1055 if (XPC_NASID_IN_ARRAY(nasid, discovered_nasids)) { 477 if (test_bit(nasid / 2, discovered_nasids)) {
1056 dev_dbg(xpc_part, "Nasid %d is part of a " 478 dev_dbg(xpc_part, "Nasid %d is part of a "
1057 "partition which was previously " 479 "partition which was previously "
1058 "discovered\n", nasid); 480 "discovered\n", nasid);
1059 continue; 481 continue;
1060 } 482 }
1061 483
1062 /* pull over the reserved page structure */ 484 /* pull over the rsvd page header & part_nasids mask */
1063 485
1064 ret = xpc_get_remote_rp(nasid, discovered_nasids, 486 ret = xpc_get_remote_rp(nasid, discovered_nasids,
1065 remote_rp, &remote_rp_pa); 487 remote_rp, &remote_rp_pa);
@@ -1074,72 +496,8 @@ xpc_discovery(void)
1074 continue; 496 continue;
1075 } 497 }
1076 498
1077 remote_vars_pa = remote_rp->vars_pa; 499 xpc_request_partition_activation(remote_rp,
1078 500 remote_rp_pa, nasid);
1079 partid = remote_rp->partid;
1080 part = &xpc_partitions[partid];
1081
1082 /* pull over the cross partition variables */
1083
1084 ret = xpc_get_remote_vars(remote_vars_pa, remote_vars);
1085 if (ret != xpSuccess) {
1086 dev_dbg(xpc_part, "unable to get XPC variables "
1087 "from nasid %d, reason=%d\n", nasid,
1088 ret);
1089
1090 XPC_DEACTIVATE_PARTITION(part, ret);
1091 continue;
1092 }
1093
1094 if (part->act_state != XPC_P_INACTIVE) {
1095 dev_dbg(xpc_part, "partition %d on nasid %d is "
1096 "already activating\n", partid, nasid);
1097 break;
1098 }
1099
1100 /*
1101 * Register the remote partition's AMOs with SAL so it
1102 * can handle and cleanup errors within that address
1103 * range should the remote partition go down. We don't
1104 * unregister this range because it is difficult to
1105 * tell when outstanding writes to the remote partition
1106 * are finished and thus when it is safe to
1107 * unregister. This should not result in wasted space
1108 * in the SAL xp_addr_region table because we should
1109 * get the same page for remote_act_amos_pa after
1110 * module reloads and system reboots.
1111 */
1112 if (sn_register_xp_addr_region
1113 (remote_vars->amos_page_pa, PAGE_SIZE, 1) < 0) {
1114 dev_dbg(xpc_part,
1115 "partition %d failed to "
1116 "register xp_addr region 0x%016lx\n",
1117 partid, remote_vars->amos_page_pa);
1118
1119 XPC_SET_REASON(part, xpPhysAddrRegFailed,
1120 __LINE__);
1121 break;
1122 }
1123
1124 /*
1125 * The remote nasid is valid and available.
1126 * Send an interrupt to that nasid to notify
1127 * it that we are ready to begin activation.
1128 */
1129 dev_dbg(xpc_part, "sending an interrupt to AMO 0x%lx, "
1130 "nasid %d, phys_cpuid 0x%x\n",
1131 remote_vars->amos_page_pa,
1132 remote_vars->act_nasid,
1133 remote_vars->act_phys_cpuid);
1134
1135 if (XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->
1136 version)) {
1137 part->remote_amos_page_pa =
1138 remote_vars->amos_page_pa;
1139 xpc_mark_partition_disengaged(part);
1140 xpc_cancel_partition_disengage_request(part);
1141 }
1142 xpc_IPI_send_activate(remote_vars);
1143 } 501 }
1144 } 502 }
1145 503
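
Editor's note: the discovery loop above now tests nasids against plain bitmaps with test_bit(nasid / 2, ...); the halving reflects that only even-numbered (C-brick) nasids are tracked, as the comments in xpc_sn2.c below explain. The following is a rough userspace sketch of that bitmap handling only, not driver code; the mask size and nasid values are made up.

    #include <limits.h>
    #include <stdio.h>

    #define MAX_PHYSNODES   256     /* hypothetical; plays the role of MAX_NUMALINK_NODES / 2 */
    #define LONG_BITS       (CHAR_BIT * sizeof(unsigned long))
    #define MASK_NLONGS     ((MAX_PHYSNODES + LONG_BITS - 1) / LONG_BITS)

    /* Minimal set_bit()/test_bit() equivalents over an array of unsigned longs. */
    static void mask_set(unsigned long *mask, unsigned int bit)
    {
            mask[bit / LONG_BITS] |= 1UL << (bit % LONG_BITS);
    }

    static int mask_test(const unsigned long *mask, unsigned int bit)
    {
            return (mask[bit / LONG_BITS] >> (bit % LONG_BITS)) & 1;
    }

    int main(void)
    {
            unsigned long part_nasids[MASK_NLONGS] = { 0 };

            /* C-brick nasids are even, so nasid 6 occupies bit 6 / 2 = 3 */
            mask_set(part_nasids, 6 / 2);

            printf("nasid 6 in local partition? %d\n", mask_test(part_nasids, 6 / 2));
            printf("nasid 8 in local partition? %d\n", mask_test(part_nasids, 8 / 2));
            return 0;
    }
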
@@ -1155,20 +513,16 @@ enum xp_retval
1155xpc_initiate_partid_to_nasids(short partid, void *nasid_mask) 513xpc_initiate_partid_to_nasids(short partid, void *nasid_mask)
1156{ 514{
1157 struct xpc_partition *part; 515 struct xpc_partition *part;
1158 u64 part_nasid_pa; 516 unsigned long part_nasid_pa;
1159 int bte_res;
1160 517
1161 part = &xpc_partitions[partid]; 518 part = &xpc_partitions[partid];
1162 if (part->remote_rp_pa == 0) 519 if (part->remote_rp_pa == 0)
1163 return xpPartitionDown; 520 return xpPartitionDown;
1164 521
1165 memset(nasid_mask, 0, XP_NASID_MASK_BYTES); 522 memset(nasid_mask, 0, xpc_nasid_mask_nbytes);
1166
1167 part_nasid_pa = (u64)XPC_RP_PART_NASIDS(part->remote_rp_pa);
1168 523
1169 bte_res = xp_bte_copy(part_nasid_pa, (u64)nasid_mask, 524 part_nasid_pa = (unsigned long)XPC_RP_PART_NASIDS(part->remote_rp_pa);
1170 xp_nasid_mask_bytes, (BTE_NOTIFY | BTE_WACQUIRE),
1171 NULL);
1172 525
1173 return xpc_map_bte_errors(bte_res); 526 return xp_remote_memcpy(xp_pa(nasid_mask), part_nasid_pa,
527 xpc_nasid_mask_nbytes);
1174} 528}
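
Editor's note: xpc_get_rsvd_page_pa() above follows a common firmware-query pattern: ask, be told a larger buffer is required, reallocate, and retry. Below is a minimal userspace sketch of that loop only, assuming a hypothetical query_blob() in place of xpc_get_partition_rsvd_page_pa() and ordinary malloc() in place of the cacheline-aligned kmalloc helper; it is not the driver code.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Stand-ins for xpSuccess, xpNeedMoreInfo and xpNoMemory. */
    enum ret { OK, NEED_MORE, NO_MEMORY };

    /* Hypothetical query: reports how many bytes it needs, fills buf once it is big enough. */
    static enum ret query_blob(void *buf, size_t buf_len, size_t *len)
    {
            static const char blob[] = "reserved-page contents";

            *len = sizeof(blob);
            if (buf == NULL || buf_len < *len)
                    return NEED_MORE;
            memcpy(buf, blob, *len);
            return OK;
    }

    int main(void)
    {
            void *buf = NULL;
            size_t buf_len = 0;
            size_t len = 0;
            enum ret r;

            while (1) {
                    r = query_blob(buf, buf_len, &len);
                    if (r != NEED_MORE)
                            break;

                    if (len > buf_len) {
                            /* grow the scratch buffer and try the query again */
                            free(buf);
                            buf_len = len;
                            buf = malloc(buf_len);
                            if (buf == NULL) {
                                    r = NO_MEMORY;
                                    break;
                            }
                    }
            }

            if (r == OK)
                    printf("query returned %zu bytes: %s\n", len, (char *)buf);
            free(buf);
            return r == OK ? 0 : 1;
    }

The real loop additionally rounds the buffer up to a cache line (an sn2 bte_copy requirement, per the comment) and pulls the data across partitions with xp_remote_memcpy().
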
diff --git a/drivers/misc/sgi-xp/xpc_sn2.c b/drivers/misc/sgi-xp/xpc_sn2.c
new file mode 100644
index 000000000000..b4882ccf6344
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_sn2.c
@@ -0,0 +1,2404 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9/*
10 * Cross Partition Communication (XPC) sn2-based functions.
11 *
12 * Architecture specific implementation of common functions.
13 *
14 */
15
16#include <linux/delay.h>
17#include <asm/uncached.h>
18#include <asm/sn/mspec.h>
19#include <asm/sn/sn_sal.h>
20#include "xpc.h"
21
22/*
23 * Define the number of u64s required to represent all the C-brick nasids
24 * as a bitmap. The cross-partition kernel modules deal only with
25 * C-brick nasids, thus the need for bitmaps which don't account for
26 * odd-numbered (non C-brick) nasids.
27 */
28#define XPC_MAX_PHYSNODES_SN2 (MAX_NUMALINK_NODES / 2)
29#define XP_NASID_MASK_BYTES_SN2 ((XPC_MAX_PHYSNODES_SN2 + 7) / 8)
30#define XP_NASID_MASK_WORDS_SN2 ((XPC_MAX_PHYSNODES_SN2 + 63) / 64)
31
32/*
33 * Memory for XPC's amo variables is allocated by the MSPEC driver. These
34 * pages are located in the lowest granule. The lowest granule uses 4k pages
35 * for cached references and an alternate TLB handler to never provide a
36 * cacheable mapping for the entire region. This will prevent speculative
37 * reading of cached copies of our lines from being issued which will cause
38 * a PI FSB Protocol error to be generated by the SHUB. For XPC, we need 64
39 * amo variables (based on XP_MAX_NPARTITIONS_SN2) to identify the senders of
40 * NOTIFY IRQs, 128 amo variables (based on XP_NASID_MASK_WORDS_SN2) to identify
41 * the senders of ACTIVATE IRQs, 1 amo variable to identify which remote
42 * partitions (i.e., XPCs) consider themselves currently engaged with the
43 * local XPC and 1 amo variable to request partition deactivation.
44 */
45#define XPC_NOTIFY_IRQ_AMOS_SN2 0
46#define XPC_ACTIVATE_IRQ_AMOS_SN2 (XPC_NOTIFY_IRQ_AMOS_SN2 + \
47 XP_MAX_NPARTITIONS_SN2)
48#define XPC_ENGAGED_PARTITIONS_AMO_SN2 (XPC_ACTIVATE_IRQ_AMOS_SN2 + \
49 XP_NASID_MASK_WORDS_SN2)
50#define XPC_DEACTIVATE_REQUEST_AMO_SN2 (XPC_ENGAGED_PARTITIONS_AMO_SN2 + 1)
51
52/*
53 * Buffer used to store a local copy of portions of a remote partition's
54 * reserved page (either its header and part_nasids mask, or its vars).
55 */
56static void *xpc_remote_copy_buffer_base_sn2;
57static char *xpc_remote_copy_buffer_sn2;
58
59static struct xpc_vars_sn2 *xpc_vars_sn2;
60static struct xpc_vars_part_sn2 *xpc_vars_part_sn2;
61
62static int
63xpc_setup_partitions_sn_sn2(void)
64{
65 /* nothing needs to be done */
66 return 0;
67}
68
69/* SH_IPI_ACCESS shub register value on startup */
70static u64 xpc_sh1_IPI_access_sn2;
71static u64 xpc_sh2_IPI_access0_sn2;
72static u64 xpc_sh2_IPI_access1_sn2;
73static u64 xpc_sh2_IPI_access2_sn2;
74static u64 xpc_sh2_IPI_access3_sn2;
75
76/*
77 * Change protections to allow IPI operations.
78 */
79static void
80xpc_allow_IPI_ops_sn2(void)
81{
82 int node;
83 int nasid;
84
85 /* !!! The following should get moved into SAL. */
86 if (is_shub2()) {
87 xpc_sh2_IPI_access0_sn2 =
88 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS0));
89 xpc_sh2_IPI_access1_sn2 =
90 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS1));
91 xpc_sh2_IPI_access2_sn2 =
92 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS2));
93 xpc_sh2_IPI_access3_sn2 =
94 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS3));
95
96 for_each_online_node(node) {
97 nasid = cnodeid_to_nasid(node);
98 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS0),
99 -1UL);
100 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS1),
101 -1UL);
102 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS2),
103 -1UL);
104 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS3),
105 -1UL);
106 }
107 } else {
108 xpc_sh1_IPI_access_sn2 =
109 (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH1_IPI_ACCESS));
110
111 for_each_online_node(node) {
112 nasid = cnodeid_to_nasid(node);
113 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH1_IPI_ACCESS),
114 -1UL);
115 }
116 }
117}
118
119/*
120 * Restrict protections to disallow IPI operations.
121 */
122static void
123xpc_disallow_IPI_ops_sn2(void)
124{
125 int node;
126 int nasid;
127
128 /* !!! The following should get moved into SAL. */
129 if (is_shub2()) {
130 for_each_online_node(node) {
131 nasid = cnodeid_to_nasid(node);
132 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS0),
133 xpc_sh2_IPI_access0_sn2);
134 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS1),
135 xpc_sh2_IPI_access1_sn2);
136 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS2),
137 xpc_sh2_IPI_access2_sn2);
138 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS3),
139 xpc_sh2_IPI_access3_sn2);
140 }
141 } else {
142 for_each_online_node(node) {
143 nasid = cnodeid_to_nasid(node);
144 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH1_IPI_ACCESS),
145 xpc_sh1_IPI_access_sn2);
146 }
147 }
148}
149
150/*
151 * The following set of functions are used for the sending and receiving of
152 * IRQs (also known as IPIs). There are two flavors of IRQs, one that is
153 * associated with partition activity (SGI_XPC_ACTIVATE) and the other that
154 * is associated with channel activity (SGI_XPC_NOTIFY).
155 */
156
157static u64
158xpc_receive_IRQ_amo_sn2(struct amo *amo)
159{
160 return FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_CLEAR);
161}
162
163static enum xp_retval
164xpc_send_IRQ_sn2(struct amo *amo, u64 flag, int nasid, int phys_cpuid,
165 int vector)
166{
167 int ret = 0;
168 unsigned long irq_flags;
169
170 local_irq_save(irq_flags);
171
172 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR, flag);
173 sn_send_IPI_phys(nasid, phys_cpuid, vector, 0);
174
175 /*
176 * We must always use the nofault function regardless of whether we
177 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
178 * didn't, we'd never know that the other partition is down and would
179 * keep sending IRQs and amos to it until the heartbeat times out.
180 */
181 ret = xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->variable),
182 xp_nofault_PIOR_target));
183
184 local_irq_restore(irq_flags);
185
186 return (ret == 0) ? xpSuccess : xpPioReadError;
187}
188
189static struct amo *
190xpc_init_IRQ_amo_sn2(int index)
191{
192 struct amo *amo = xpc_vars_sn2->amos_page + index;
193
194 (void)xpc_receive_IRQ_amo_sn2(amo); /* clear amo variable */
195 return amo;
196}
197
198/*
199 * Functions associated with SGI_XPC_ACTIVATE IRQ.
200 */
201
202/*
203 * Notify the heartbeat check thread that an activate IRQ has been received.
204 */
205static irqreturn_t
206xpc_handle_activate_IRQ_sn2(int irq, void *dev_id)
207{
208 unsigned long irq_flags;
209
210 spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
211 xpc_activate_IRQ_rcvd++;
212 spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
213
214 wake_up_interruptible(&xpc_activate_IRQ_wq);
215 return IRQ_HANDLED;
216}
217
218/*
219 * Flag the appropriate amo variable and send an IRQ to the specified node.
220 */
221static void
222xpc_send_activate_IRQ_sn2(unsigned long amos_page_pa, int from_nasid,
223 int to_nasid, int to_phys_cpuid)
224{
225 struct amo *amos = (struct amo *)__va(amos_page_pa +
226 (XPC_ACTIVATE_IRQ_AMOS_SN2 *
227 sizeof(struct amo)));
228
229 (void)xpc_send_IRQ_sn2(&amos[BIT_WORD(from_nasid / 2)],
230 BIT_MASK(from_nasid / 2), to_nasid,
231 to_phys_cpuid, SGI_XPC_ACTIVATE);
232}
233
234static void
235xpc_send_local_activate_IRQ_sn2(int from_nasid)
236{
237 unsigned long irq_flags;
238 struct amo *amos = (struct amo *)__va(xpc_vars_sn2->amos_page_pa +
239 (XPC_ACTIVATE_IRQ_AMOS_SN2 *
240 sizeof(struct amo)));
241
242 /* fake the sending and receipt of an activate IRQ from remote nasid */
243 FETCHOP_STORE_OP(TO_AMO((u64)&amos[BIT_WORD(from_nasid / 2)].variable),
244 FETCHOP_OR, BIT_MASK(from_nasid / 2));
245
246 spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
247 xpc_activate_IRQ_rcvd++;
248 spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
249
250 wake_up_interruptible(&xpc_activate_IRQ_wq);
251}
252
253/*
254 * Functions associated with SGI_XPC_NOTIFY IRQ.
255 */
256
257/*
258 * Check to see if any chctl flags were sent from the specified partition.
259 */
260static void
261xpc_check_for_sent_chctl_flags_sn2(struct xpc_partition *part)
262{
263 union xpc_channel_ctl_flags chctl;
264 unsigned long irq_flags;
265
266 chctl.all_flags = xpc_receive_IRQ_amo_sn2(part->sn.sn2.
267 local_chctl_amo_va);
268 if (chctl.all_flags == 0)
269 return;
270
271 spin_lock_irqsave(&part->chctl_lock, irq_flags);
272 part->chctl.all_flags |= chctl.all_flags;
273 spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
274
275 dev_dbg(xpc_chan, "received notify IRQ from partid=%d, chctl.all_flags="
276 "0x%lx\n", XPC_PARTID(part), chctl.all_flags);
277
278 xpc_wakeup_channel_mgr(part);
279}
280
281/*
282 * Handle the receipt of a SGI_XPC_NOTIFY IRQ by seeing whether the specified
283 * partition actually sent it. Since SGI_XPC_NOTIFY IRQs may be shared by more
284 * than one partition, we use an amo structure per partition to indicate
285 * whether a partition has sent an IRQ or not. If it has, then wake up the
286 * associated kthread to handle it.
287 *
288 * All SGI_XPC_NOTIFY IRQs received by XPC are the result of IRQs sent by XPC
289 * running on other partitions.
290 *
291 * Noteworthy Arguments:
292 *
293 * irq - Interrupt ReQuest number. NOT USED.
294 *
295 * dev_id - partid of IRQ's potential sender.
296 */
297static irqreturn_t
298xpc_handle_notify_IRQ_sn2(int irq, void *dev_id)
299{
300 short partid = (short)(u64)dev_id;
301 struct xpc_partition *part = &xpc_partitions[partid];
302
303 DBUG_ON(partid < 0 || partid >= XP_MAX_NPARTITIONS_SN2);
304
305 if (xpc_part_ref(part)) {
306 xpc_check_for_sent_chctl_flags_sn2(part);
307
308 xpc_part_deref(part);
309 }
310 return IRQ_HANDLED;
311}
312
313/*
314 * Check to see if xpc_handle_notify_IRQ_sn2() dropped any IRQs on the floor
315 * because the write to their associated amo variable completed after the IRQ
316 * was received.
317 */
318static void
319xpc_check_for_dropped_notify_IRQ_sn2(struct xpc_partition *part)
320{
321 struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
322
323 if (xpc_part_ref(part)) {
324 xpc_check_for_sent_chctl_flags_sn2(part);
325
326 part_sn2->dropped_notify_IRQ_timer.expires = jiffies +
327 XPC_DROPPED_NOTIFY_IRQ_WAIT_INTERVAL;
328 add_timer(&part_sn2->dropped_notify_IRQ_timer);
329 xpc_part_deref(part);
330 }
331}
332
333/*
334 * Send a notify IRQ to the remote partition that is associated with the
335 * specified channel.
336 */
337static void
338xpc_send_notify_IRQ_sn2(struct xpc_channel *ch, u8 chctl_flag,
339 char *chctl_flag_string, unsigned long *irq_flags)
340{
341 struct xpc_partition *part = &xpc_partitions[ch->partid];
342 struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
343 union xpc_channel_ctl_flags chctl = { 0 };
344 enum xp_retval ret;
345
346 if (likely(part->act_state != XPC_P_AS_DEACTIVATING)) {
347 chctl.flags[ch->number] = chctl_flag;
348 ret = xpc_send_IRQ_sn2(part_sn2->remote_chctl_amo_va,
349 chctl.all_flags,
350 part_sn2->notify_IRQ_nasid,
351 part_sn2->notify_IRQ_phys_cpuid,
352 SGI_XPC_NOTIFY);
353 dev_dbg(xpc_chan, "%s sent to partid=%d, channel=%d, ret=%d\n",
354 chctl_flag_string, ch->partid, ch->number, ret);
355 if (unlikely(ret != xpSuccess)) {
356 if (irq_flags != NULL)
357 spin_unlock_irqrestore(&ch->lock, *irq_flags);
358 XPC_DEACTIVATE_PARTITION(part, ret);
359 if (irq_flags != NULL)
360 spin_lock_irqsave(&ch->lock, *irq_flags);
361 }
362 }
363}
364
365#define XPC_SEND_NOTIFY_IRQ_SN2(_ch, _ipi_f, _irq_f) \
366 xpc_send_notify_IRQ_sn2(_ch, _ipi_f, #_ipi_f, _irq_f)
367
368/*
369 * Make it look like the remote partition, which is associated with the
370 * specified channel, sent us a notify IRQ. This faked IRQ will be handled
371 * by xpc_check_for_dropped_notify_IRQ_sn2().
372 */
373static void
374xpc_send_local_notify_IRQ_sn2(struct xpc_channel *ch, u8 chctl_flag,
375 char *chctl_flag_string)
376{
377 struct xpc_partition *part = &xpc_partitions[ch->partid];
378 union xpc_channel_ctl_flags chctl = { 0 };
379
380 chctl.flags[ch->number] = chctl_flag;
381 FETCHOP_STORE_OP(TO_AMO((u64)&part->sn.sn2.local_chctl_amo_va->
382 variable), FETCHOP_OR, chctl.all_flags);
383 dev_dbg(xpc_chan, "%s sent local from partid=%d, channel=%d\n",
384 chctl_flag_string, ch->partid, ch->number);
385}
386
387#define XPC_SEND_LOCAL_NOTIFY_IRQ_SN2(_ch, _ipi_f) \
388 xpc_send_local_notify_IRQ_sn2(_ch, _ipi_f, #_ipi_f)
389
390static void
391xpc_send_chctl_closerequest_sn2(struct xpc_channel *ch,
392 unsigned long *irq_flags)
393{
394 struct xpc_openclose_args *args = ch->sn.sn2.local_openclose_args;
395
396 args->reason = ch->reason;
397 XPC_SEND_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_CLOSEREQUEST, irq_flags);
398}
399
400static void
401xpc_send_chctl_closereply_sn2(struct xpc_channel *ch, unsigned long *irq_flags)
402{
403 XPC_SEND_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_CLOSEREPLY, irq_flags);
404}
405
406static void
407xpc_send_chctl_openrequest_sn2(struct xpc_channel *ch, unsigned long *irq_flags)
408{
409 struct xpc_openclose_args *args = ch->sn.sn2.local_openclose_args;
410
411 args->entry_size = ch->entry_size;
412 args->local_nentries = ch->local_nentries;
413 XPC_SEND_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_OPENREQUEST, irq_flags);
414}
415
416static void
417xpc_send_chctl_openreply_sn2(struct xpc_channel *ch, unsigned long *irq_flags)
418{
419 struct xpc_openclose_args *args = ch->sn.sn2.local_openclose_args;
420
421 args->remote_nentries = ch->remote_nentries;
422 args->local_nentries = ch->local_nentries;
423 args->local_msgqueue_pa = xp_pa(ch->sn.sn2.local_msgqueue);
424 XPC_SEND_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_OPENREPLY, irq_flags);
425}
426
427static void
428xpc_send_chctl_msgrequest_sn2(struct xpc_channel *ch)
429{
430 XPC_SEND_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_MSGREQUEST, NULL);
431}
432
433static void
434xpc_send_chctl_local_msgrequest_sn2(struct xpc_channel *ch)
435{
436 XPC_SEND_LOCAL_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_MSGREQUEST);
437}
438
439static void
440xpc_save_remote_msgqueue_pa_sn2(struct xpc_channel *ch,
441 unsigned long msgqueue_pa)
442{
443 ch->sn.sn2.remote_msgqueue_pa = msgqueue_pa;
444}
445
446/*
447 * This next set of functions is used to keep track of when a partition is
448 * potentially engaged in accessing memory belonging to another partition.
449 */
450
451static void
452xpc_indicate_partition_engaged_sn2(struct xpc_partition *part)
453{
454 unsigned long irq_flags;
455 struct amo *amo = (struct amo *)__va(part->sn.sn2.remote_amos_page_pa +
456 (XPC_ENGAGED_PARTITIONS_AMO_SN2 *
457 sizeof(struct amo)));
458
459 local_irq_save(irq_flags);
460
461 /* set bit corresponding to our partid in remote partition's amo */
462 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR,
463 BIT(sn_partition_id));
464
465 /*
466 * We must always use the nofault function regardless of whether we
467 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
468 * didn't, we'd never know that the other partition is down and would
469 * keep sending IRQs and amos to it until the heartbeat times out.
470 */
471 (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
472 variable),
473 xp_nofault_PIOR_target));
474
475 local_irq_restore(irq_flags);
476}
477
478static void
479xpc_indicate_partition_disengaged_sn2(struct xpc_partition *part)
480{
481 struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
482 unsigned long irq_flags;
483 struct amo *amo = (struct amo *)__va(part_sn2->remote_amos_page_pa +
484 (XPC_ENGAGED_PARTITIONS_AMO_SN2 *
485 sizeof(struct amo)));
486
487 local_irq_save(irq_flags);
488
489 /* clear bit corresponding to our partid in remote partition's amo */
490 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
491 ~BIT(sn_partition_id));
492
493 /*
494 * We must always use the nofault function regardless of whether we
495 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
496 * didn't, we'd never know that the other partition is down and would
497 * keep sending IRQs and amos to it until the heartbeat times out.
498 */
499 (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
500 variable),
501 xp_nofault_PIOR_target));
502
503 local_irq_restore(irq_flags);
504
505 /*
506 * Send activate IRQ to get other side to see that we've cleared our
507 * bit in their engaged partitions amo.
508 */
509 xpc_send_activate_IRQ_sn2(part_sn2->remote_amos_page_pa,
510 cnodeid_to_nasid(0),
511 part_sn2->activate_IRQ_nasid,
512 part_sn2->activate_IRQ_phys_cpuid);
513}
514
515static void
516xpc_assume_partition_disengaged_sn2(short partid)
517{
518 struct amo *amo = xpc_vars_sn2->amos_page +
519 XPC_ENGAGED_PARTITIONS_AMO_SN2;
520
521 /* clear bit(s) based on partid mask in our partition's amo */
522 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
523 ~BIT(partid));
524}
525
526static int
527xpc_partition_engaged_sn2(short partid)
528{
529 struct amo *amo = xpc_vars_sn2->amos_page +
530 XPC_ENGAGED_PARTITIONS_AMO_SN2;
531
532 /* our partition's amo variable ANDed with partid mask */
533 return (FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_LOAD) &
534 BIT(partid)) != 0;
535}
536
537static int
538xpc_any_partition_engaged_sn2(void)
539{
540 struct amo *amo = xpc_vars_sn2->amos_page +
541 XPC_ENGAGED_PARTITIONS_AMO_SN2;
542
543 /* our partition's amo variable */
544 return FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_LOAD) != 0;
545}
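/*
 * Sketch of the engaged-partitions bookkeeping above, with C11 atomics
 * assumed in place of the fetchop amo: each remote partition sets or clears
 * the bit for its own partid in a shared word, and the local side can test
 * a single bit or the whole word.  The toy_* names are illustrative only.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static _Atomic uint64_t toy_engaged;	/* one bit per remote partid */

static void toy_indicate_engaged(int partid)
{
	atomic_fetch_or(&toy_engaged, 1ULL << partid);
}

static void toy_indicate_disengaged(int partid)
{
	atomic_fetch_and(&toy_engaged, ~(1ULL << partid));
}

static bool toy_partition_engaged(int partid)
{
	return (atomic_load(&toy_engaged) & (1ULL << partid)) != 0;
}

static bool toy_any_partition_engaged(void)
{
	return atomic_load(&toy_engaged) != 0;
}
/*
 * The same word answers both "is partition N still using my memory?" and
 * "is anyone still using my memory?", which is why teardown can simply poll
 * toy_any_partition_engaged() before releasing shared structures.
 */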
546
547/* original protection values for each node */
548static u64 xpc_prot_vec_sn2[MAX_NUMNODES];
549
550/*
551 * Change protections to allow amo operations on non-Shub 1.1 systems.
552 */
553static enum xp_retval
554xpc_allow_amo_ops_sn2(struct amo *amos_page)
555{
556 u64 nasid_array = 0;
557 int ret;
558
559 /*
560 * On SHUB 1.1, we cannot call sn_change_memprotect() since the BIST
561 * collides with memory operations. On those systems we call
562 * xpc_allow_amo_ops_shub_wars_1_1_sn2() instead.
563 */
564 if (!enable_shub_wars_1_1()) {
565 ret = sn_change_memprotect(ia64_tpa((u64)amos_page), PAGE_SIZE,
566 SN_MEMPROT_ACCESS_CLASS_1,
567 &nasid_array);
568 if (ret != 0)
569 return xpSalError;
570 }
571 return xpSuccess;
572}
573
574/*
575 * Change protections to allow amo operations on Shub 1.1 systems.
576 */
577static void
578xpc_allow_amo_ops_shub_wars_1_1_sn2(void)
579{
580 int node;
581 int nasid;
582
583 if (!enable_shub_wars_1_1())
584 return;
585
586 for_each_online_node(node) {
587 nasid = cnodeid_to_nasid(node);
588 /* save current protection values */
589 xpc_prot_vec_sn2[node] =
590 (u64)HUB_L((u64 *)GLOBAL_MMR_ADDR(nasid,
591 SH1_MD_DQLP_MMR_DIR_PRIVEC0));
592 /* open up everything */
593 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid,
594 SH1_MD_DQLP_MMR_DIR_PRIVEC0),
595 -1UL);
596 HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid,
597 SH1_MD_DQRP_MMR_DIR_PRIVEC0),
598 -1UL);
599 }
600}
601
602static enum xp_retval
603xpc_get_partition_rsvd_page_pa_sn2(void *buf, u64 *cookie, unsigned long *rp_pa,
604 size_t *len)
605{
606 s64 status;
607 enum xp_retval ret;
608
609 status = sn_partition_reserved_page_pa((u64)buf, cookie, rp_pa, len);
610 if (status == SALRET_OK)
611 ret = xpSuccess;
612 else if (status == SALRET_MORE_PASSES)
613 ret = xpNeedMoreInfo;
614 else
615 ret = xpSalError;
616
617 return ret;
618}
619
620
621static int
622xpc_setup_rsvd_page_sn_sn2(struct xpc_rsvd_page *rp)
623{
624 struct amo *amos_page;
625 int i;
626 int ret;
627
628 xpc_vars_sn2 = XPC_RP_VARS(rp);
629
630 rp->sn.vars_pa = xp_pa(xpc_vars_sn2);
631
632 /* vars_part array follows immediately after vars */
633 xpc_vars_part_sn2 = (struct xpc_vars_part_sn2 *)((u8 *)XPC_RP_VARS(rp) +
634 XPC_RP_VARS_SIZE);
635
636 /*
637 * Before clearing xpc_vars_sn2, see if a page of amos had been
638	 * previously allocated. If not, we'll need to allocate one and set
639 * permissions so that cross-partition amos are allowed.
640 *
641 * The allocated amo page needs MCA reporting to remain disabled after
642 * XPC has unloaded. To make this work, we keep a copy of the pointer
643 * to this page (i.e., amos_page) in the struct xpc_vars_sn2 structure,
644 * which is pointed to by the reserved page, and re-use that saved copy
645 * on subsequent loads of XPC. This amo page is never freed, and its
646 * memory protections are never restricted.
647 */
648 amos_page = xpc_vars_sn2->amos_page;
649 if (amos_page == NULL) {
650 amos_page = (struct amo *)TO_AMO(uncached_alloc_page(0, 1));
651 if (amos_page == NULL) {
652 dev_err(xpc_part, "can't allocate page of amos\n");
653 return -ENOMEM;
654 }
655
656 /*
657 * Open up amo-R/W to cpu. This is done on Shub 1.1 systems
658 * when xpc_allow_amo_ops_shub_wars_1_1_sn2() is called.
659 */
660 ret = xpc_allow_amo_ops_sn2(amos_page);
661 if (ret != xpSuccess) {
662 dev_err(xpc_part, "can't allow amo operations\n");
663 uncached_free_page(__IA64_UNCACHED_OFFSET |
664 TO_PHYS((u64)amos_page), 1);
665 return -EPERM;
666 }
667 }
668
669 /* clear xpc_vars_sn2 */
670 memset(xpc_vars_sn2, 0, sizeof(struct xpc_vars_sn2));
671
672 xpc_vars_sn2->version = XPC_V_VERSION;
673 xpc_vars_sn2->activate_IRQ_nasid = cpuid_to_nasid(0);
674 xpc_vars_sn2->activate_IRQ_phys_cpuid = cpu_physical_id(0);
675 xpc_vars_sn2->vars_part_pa = xp_pa(xpc_vars_part_sn2);
676 xpc_vars_sn2->amos_page_pa = ia64_tpa((u64)amos_page);
677 xpc_vars_sn2->amos_page = amos_page; /* save for next load of XPC */
678
679 /* clear xpc_vars_part_sn2 */
680 memset((u64 *)xpc_vars_part_sn2, 0, sizeof(struct xpc_vars_part_sn2) *
681 XP_MAX_NPARTITIONS_SN2);
682
683 /* initialize the activate IRQ related amo variables */
684 for (i = 0; i < xpc_nasid_mask_nlongs; i++)
685 (void)xpc_init_IRQ_amo_sn2(XPC_ACTIVATE_IRQ_AMOS_SN2 + i);
686
687 /* initialize the engaged remote partitions related amo variables */
688 (void)xpc_init_IRQ_amo_sn2(XPC_ENGAGED_PARTITIONS_AMO_SN2);
689 (void)xpc_init_IRQ_amo_sn2(XPC_DEACTIVATE_REQUEST_AMO_SN2);
690
691 return 0;
692}
693
694static void
695xpc_increment_heartbeat_sn2(void)
696{
697 xpc_vars_sn2->heartbeat++;
698}
699
700static void
701xpc_offline_heartbeat_sn2(void)
702{
703 xpc_increment_heartbeat_sn2();
704 xpc_vars_sn2->heartbeat_offline = 1;
705}
706
707static void
708xpc_online_heartbeat_sn2(void)
709{
710 xpc_increment_heartbeat_sn2();
711 xpc_vars_sn2->heartbeat_offline = 0;
712}
713
714static void
715xpc_heartbeat_init_sn2(void)
716{
717 DBUG_ON(xpc_vars_sn2 == NULL);
718
719 bitmap_zero(xpc_vars_sn2->heartbeating_to_mask, XP_MAX_NPARTITIONS_SN2);
720 xpc_heartbeating_to_mask = &xpc_vars_sn2->heartbeating_to_mask[0];
721 xpc_online_heartbeat_sn2();
722}
723
724static void
725xpc_heartbeat_exit_sn2(void)
726{
727 xpc_offline_heartbeat_sn2();
728}
729
730static enum xp_retval
731xpc_get_remote_heartbeat_sn2(struct xpc_partition *part)
732{
733 struct xpc_vars_sn2 *remote_vars;
734 enum xp_retval ret;
735
736 remote_vars = (struct xpc_vars_sn2 *)xpc_remote_copy_buffer_sn2;
737
738 /* pull the remote vars structure that contains the heartbeat */
739 ret = xp_remote_memcpy(xp_pa(remote_vars),
740 part->sn.sn2.remote_vars_pa,
741 XPC_RP_VARS_SIZE);
742 if (ret != xpSuccess)
743 return ret;
744
745 dev_dbg(xpc_part, "partid=%d, heartbeat=%ld, last_heartbeat=%ld, "
746 "heartbeat_offline=%ld, HB_mask[0]=0x%lx\n", XPC_PARTID(part),
747 remote_vars->heartbeat, part->last_heartbeat,
748 remote_vars->heartbeat_offline,
749 remote_vars->heartbeating_to_mask[0]);
750
751 if ((remote_vars->heartbeat == part->last_heartbeat &&
752 remote_vars->heartbeat_offline == 0) ||
753 !xpc_hb_allowed(sn_partition_id,
754 &remote_vars->heartbeating_to_mask)) {
755 ret = xpNoHeartbeat;
756 } else {
757 part->last_heartbeat = remote_vars->heartbeat;
758 }
759
760 return ret;
761}
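/*
 * Sketch of the liveness test above, assuming the remote side publishes a
 * monotonically increasing heartbeat counter plus an "offline" flag.  The
 * peer is treated as dead when the counter has not advanced since the last
 * poll and it has not marked itself deliberately offline (the driver also
 * requires that the peer still lists us in its heartbeating_to_mask).  The
 * toy_* names are illustrative only, not part of XPC.
 */
#include <stdbool.h>
#include <stdint.h>

struct toy_remote_vars {
	uint64_t heartbeat;		/* incremented by the remote side */
	uint64_t heartbeat_offline;	/* nonzero while deliberately offline */
};

/* returns true if the remote partition still appears to be alive */
static bool toy_heartbeat_alive(const struct toy_remote_vars *rv,
				uint64_t *last_heartbeat)
{
	if (rv->heartbeat == *last_heartbeat && rv->heartbeat_offline == 0)
		return false;	/* stale counter and not offline */

	*last_heartbeat = rv->heartbeat;
	return true;
}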
762
763/*
764 * Get a copy of the remote partition's XPC variables from the reserved page.
765 *
766 * remote_vars points to a buffer that is cacheline aligned for BTE copies and
767 * assumed to be of size XPC_RP_VARS_SIZE.
768 */
769static enum xp_retval
770xpc_get_remote_vars_sn2(unsigned long remote_vars_pa,
771 struct xpc_vars_sn2 *remote_vars)
772{
773 enum xp_retval ret;
774
775 if (remote_vars_pa == 0)
776 return xpVarsNotSet;
777
778 /* pull over the cross partition variables */
779 ret = xp_remote_memcpy(xp_pa(remote_vars), remote_vars_pa,
780 XPC_RP_VARS_SIZE);
781 if (ret != xpSuccess)
782 return ret;
783
784 if (XPC_VERSION_MAJOR(remote_vars->version) !=
785 XPC_VERSION_MAJOR(XPC_V_VERSION)) {
786 return xpBadVersion;
787 }
788
789 return xpSuccess;
790}
791
792static void
793xpc_request_partition_activation_sn2(struct xpc_rsvd_page *remote_rp,
794 unsigned long remote_rp_pa, int nasid)
795{
796 xpc_send_local_activate_IRQ_sn2(nasid);
797}
798
799static void
800xpc_request_partition_reactivation_sn2(struct xpc_partition *part)
801{
802 xpc_send_local_activate_IRQ_sn2(part->sn.sn2.activate_IRQ_nasid);
803}
804
805static void
806xpc_request_partition_deactivation_sn2(struct xpc_partition *part)
807{
808 struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
809 unsigned long irq_flags;
810 struct amo *amo = (struct amo *)__va(part_sn2->remote_amos_page_pa +
811 (XPC_DEACTIVATE_REQUEST_AMO_SN2 *
812 sizeof(struct amo)));
813
814 local_irq_save(irq_flags);
815
816 /* set bit corresponding to our partid in remote partition's amo */
817 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR,
818 BIT(sn_partition_id));
819
820 /*
821 * We must always use the nofault function regardless of whether we
822 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
823 * didn't, we'd never know that the other partition is down and would
824 * keep sending IRQs and amos to it until the heartbeat times out.
825 */
826 (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
827 variable),
828 xp_nofault_PIOR_target));
829
830 local_irq_restore(irq_flags);
831
832 /*
833 * Send activate IRQ to get other side to see that we've set our
834 * bit in their deactivate request amo.
835 */
836 xpc_send_activate_IRQ_sn2(part_sn2->remote_amos_page_pa,
837 cnodeid_to_nasid(0),
838 part_sn2->activate_IRQ_nasid,
839 part_sn2->activate_IRQ_phys_cpuid);
840}
841
842static void
843xpc_cancel_partition_deactivation_request_sn2(struct xpc_partition *part)
844{
845 unsigned long irq_flags;
846 struct amo *amo = (struct amo *)__va(part->sn.sn2.remote_amos_page_pa +
847 (XPC_DEACTIVATE_REQUEST_AMO_SN2 *
848 sizeof(struct amo)));
849
850 local_irq_save(irq_flags);
851
852 /* clear bit corresponding to our partid in remote partition's amo */
853 FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
854 ~BIT(sn_partition_id));
855
856 /*
857 * We must always use the nofault function regardless of whether we
858 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
859 * didn't, we'd never know that the other partition is down and would
860 * keep sending IRQs and amos to it until the heartbeat times out.
861 */
862 (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
863 variable),
864 xp_nofault_PIOR_target));
865
866 local_irq_restore(irq_flags);
867}
868
869static int
870xpc_partition_deactivation_requested_sn2(short partid)
871{
872 struct amo *amo = xpc_vars_sn2->amos_page +
873 XPC_DEACTIVATE_REQUEST_AMO_SN2;
874
875 /* our partition's amo variable ANDed with partid mask */
876 return (FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_LOAD) &
877 BIT(partid)) != 0;
878}
879
880/*
881 * Update the remote partition's info.
882 */
883static void
884xpc_update_partition_info_sn2(struct xpc_partition *part, u8 remote_rp_version,
885 unsigned long *remote_rp_ts_jiffies,
886 unsigned long remote_rp_pa,
887 unsigned long remote_vars_pa,
888 struct xpc_vars_sn2 *remote_vars)
889{
890 struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
891
892 part->remote_rp_version = remote_rp_version;
893 dev_dbg(xpc_part, " remote_rp_version = 0x%016x\n",
894 part->remote_rp_version);
895
896 part->remote_rp_ts_jiffies = *remote_rp_ts_jiffies;
897 dev_dbg(xpc_part, " remote_rp_ts_jiffies = 0x%016lx\n",
898 part->remote_rp_ts_jiffies);
899
900 part->remote_rp_pa = remote_rp_pa;
901 dev_dbg(xpc_part, " remote_rp_pa = 0x%016lx\n", part->remote_rp_pa);
902
903 part_sn2->remote_vars_pa = remote_vars_pa;
904 dev_dbg(xpc_part, " remote_vars_pa = 0x%016lx\n",
905 part_sn2->remote_vars_pa);
906
907 part->last_heartbeat = remote_vars->heartbeat;
908 dev_dbg(xpc_part, " last_heartbeat = 0x%016lx\n",
909 part->last_heartbeat);
910
911 part_sn2->remote_vars_part_pa = remote_vars->vars_part_pa;
912 dev_dbg(xpc_part, " remote_vars_part_pa = 0x%016lx\n",
913 part_sn2->remote_vars_part_pa);
914
915 part_sn2->activate_IRQ_nasid = remote_vars->activate_IRQ_nasid;
916 dev_dbg(xpc_part, " activate_IRQ_nasid = 0x%x\n",
917 part_sn2->activate_IRQ_nasid);
918
919 part_sn2->activate_IRQ_phys_cpuid =
920 remote_vars->activate_IRQ_phys_cpuid;
921 dev_dbg(xpc_part, " activate_IRQ_phys_cpuid = 0x%x\n",
922 part_sn2->activate_IRQ_phys_cpuid);
923
924 part_sn2->remote_amos_page_pa = remote_vars->amos_page_pa;
925 dev_dbg(xpc_part, " remote_amos_page_pa = 0x%lx\n",
926 part_sn2->remote_amos_page_pa);
927
928 part_sn2->remote_vars_version = remote_vars->version;
929 dev_dbg(xpc_part, " remote_vars_version = 0x%x\n",
930 part_sn2->remote_vars_version);
931}
932
933/*
934 * Prior code has determined the nasid which generated an activate IRQ.
935 * Inspect that nasid to determine if its partition needs to be activated
936 * or deactivated.
937 *
938 * A partition is considered "awaiting activation" if our partition
939 * flags indicate it is not active and it has a heartbeat. A
940 * partition is considered "awaiting deactivation" if our partition
941 * flags indicate it is active but it has no heartbeat or it is not
942 * sending its heartbeat to us.
943 *
944 * To determine the heartbeat, the remote nasid must have a properly
945 * initialized reserved page.
946 */
947static void
948xpc_identify_activate_IRQ_req_sn2(int nasid)
949{
950 struct xpc_rsvd_page *remote_rp;
951 struct xpc_vars_sn2 *remote_vars;
952 unsigned long remote_rp_pa;
953 unsigned long remote_vars_pa;
954 int remote_rp_version;
955 int reactivate = 0;
956 unsigned long remote_rp_ts_jiffies = 0;
957 short partid;
958 struct xpc_partition *part;
959 struct xpc_partition_sn2 *part_sn2;
960 enum xp_retval ret;
961
962 /* pull over the reserved page structure */
963
964 remote_rp = (struct xpc_rsvd_page *)xpc_remote_copy_buffer_sn2;
965
966 ret = xpc_get_remote_rp(nasid, NULL, remote_rp, &remote_rp_pa);
967 if (ret != xpSuccess) {
968 dev_warn(xpc_part, "unable to get reserved page from nasid %d, "
969 "which sent interrupt, reason=%d\n", nasid, ret);
970 return;
971 }
972
973 remote_vars_pa = remote_rp->sn.vars_pa;
974 remote_rp_version = remote_rp->version;
975 remote_rp_ts_jiffies = remote_rp->ts_jiffies;
976
977 partid = remote_rp->SAL_partid;
978 part = &xpc_partitions[partid];
979 part_sn2 = &part->sn.sn2;
980
981 /* pull over the cross partition variables */
982
983 remote_vars = (struct xpc_vars_sn2 *)xpc_remote_copy_buffer_sn2;
984
985 ret = xpc_get_remote_vars_sn2(remote_vars_pa, remote_vars);
986 if (ret != xpSuccess) {
987 dev_warn(xpc_part, "unable to get XPC variables from nasid %d, "
988 "which sent interrupt, reason=%d\n", nasid, ret);
989
990 XPC_DEACTIVATE_PARTITION(part, ret);
991 return;
992 }
993
994 part->activate_IRQ_rcvd++;
995
996 dev_dbg(xpc_part, "partid for nasid %d is %d; IRQs = %d; HB = "
997 "%ld:0x%lx\n", (int)nasid, (int)partid, part->activate_IRQ_rcvd,
998 remote_vars->heartbeat, remote_vars->heartbeating_to_mask[0]);
999
1000 if (xpc_partition_disengaged(part) &&
1001 part->act_state == XPC_P_AS_INACTIVE) {
1002
1003 xpc_update_partition_info_sn2(part, remote_rp_version,
1004 &remote_rp_ts_jiffies,
1005 remote_rp_pa, remote_vars_pa,
1006 remote_vars);
1007
1008 if (xpc_partition_deactivation_requested_sn2(partid)) {
1009 /*
1010 * Other side is waiting on us to deactivate even though
1011 * we already have.
1012 */
1013 return;
1014 }
1015
1016 xpc_activate_partition(part);
1017 return;
1018 }
1019
1020 DBUG_ON(part->remote_rp_version == 0);
1021 DBUG_ON(part_sn2->remote_vars_version == 0);
1022
1023 if (remote_rp_ts_jiffies != part->remote_rp_ts_jiffies) {
1024
1025 /* the other side rebooted */
1026
1027 DBUG_ON(xpc_partition_engaged_sn2(partid));
1028 DBUG_ON(xpc_partition_deactivation_requested_sn2(partid));
1029
1030 xpc_update_partition_info_sn2(part, remote_rp_version,
1031 &remote_rp_ts_jiffies,
1032 remote_rp_pa, remote_vars_pa,
1033 remote_vars);
1034 reactivate = 1;
1035 }
1036
1037 if (part->disengage_timeout > 0 && !xpc_partition_disengaged(part)) {
1038 /* still waiting on other side to disengage from us */
1039 return;
1040 }
1041
1042 if (reactivate)
1043 XPC_DEACTIVATE_PARTITION(part, xpReactivating);
1044 else if (xpc_partition_deactivation_requested_sn2(partid))
1045 XPC_DEACTIVATE_PARTITION(part, xpOtherGoingDown);
1046}
1047
1048/*
1049 * Loop through the activation amo variables and process any bits
1050 * which are set. Each bit indicates a nasid sending a partition
1051 * activation or deactivation request.
1052 *
1053 * Return #of IRQs detected.
1054 */
1055int
1056xpc_identify_activate_IRQ_sender_sn2(void)
1057{
1058 int l;
1059 int b;
1060 unsigned long nasid_mask_long;
1061 u64 nasid; /* remote nasid */
1062 int n_IRQs_detected = 0;
1063 struct amo *act_amos;
1064
1065 act_amos = xpc_vars_sn2->amos_page + XPC_ACTIVATE_IRQ_AMOS_SN2;
1066
1067 /* scan through activate amo variables looking for non-zero entries */
1068 for (l = 0; l < xpc_nasid_mask_nlongs; l++) {
1069
1070 if (xpc_exiting)
1071 break;
1072
1073 nasid_mask_long = xpc_receive_IRQ_amo_sn2(&act_amos[l]);
1074
1075 b = find_first_bit(&nasid_mask_long, BITS_PER_LONG);
1076 if (b >= BITS_PER_LONG) {
1077 /* no IRQs from nasids in this amo variable */
1078 continue;
1079 }
1080
1081 dev_dbg(xpc_part, "amo[%d] gave back 0x%lx\n", l,
1082 nasid_mask_long);
1083
1084 /*
1085 * If this nasid has been added to the machine since
1086 * our partition was reset, this will retain the
1087		 * remote nasid in our reserved page's machine mask.
1088 * This is used in the event of module reload.
1089 */
1090 xpc_mach_nasids[l] |= nasid_mask_long;
1091
1092 /* locate the nasid(s) which sent interrupts */
1093
1094 do {
1095 n_IRQs_detected++;
1096 nasid = (l * BITS_PER_LONG + b) * 2;
1097 dev_dbg(xpc_part, "interrupt from nasid %ld\n", nasid);
1098 xpc_identify_activate_IRQ_req_sn2(nasid);
1099
1100 b = find_next_bit(&nasid_mask_long, BITS_PER_LONG,
1101 b + 1);
1102 } while (b < BITS_PER_LONG);
1103 }
1104 return n_IRQs_detected;
1105}
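/*
 * Sketch of the bit-to-nasid mapping used in the scan above: each activate
 * amo word carries one bit per even-numbered nasid, so bit b of word l maps
 * to nasid (l * BITS_PER_LONG + b) * 2.  The GCC builtin __builtin_ctzll is
 * assumed here as a userspace stand-in for find_first_bit()/find_next_bit(),
 * and 64-bit words are assumed for BITS_PER_LONG.
 */
#include <stdint.h>
#include <stdio.h>

static void toy_scan_activate_word(int l, uint64_t mask)
{
	while (mask != 0) {
		int b = __builtin_ctzll(mask);		/* lowest set bit */
		uint64_t nasid = (uint64_t)(l * 64 + b) * 2;

		printf("activate IRQ from nasid %llu\n",
		       (unsigned long long)nasid);
		mask &= mask - 1;			/* clear that bit */
	}
}

int main(void)
{
	toy_scan_activate_word(1, 0x5);	/* bits 0 and 2 -> nasids 128 and 132 */
	return 0;
}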
1106
1107static void
1108xpc_process_activate_IRQ_rcvd_sn2(void)
1109{
1110 unsigned long irq_flags;
1111 int n_IRQs_expected;
1112 int n_IRQs_detected;
1113
1114 DBUG_ON(xpc_activate_IRQ_rcvd == 0);
1115
1116 spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
1117 n_IRQs_expected = xpc_activate_IRQ_rcvd;
1118 xpc_activate_IRQ_rcvd = 0;
1119 spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
1120
1121 n_IRQs_detected = xpc_identify_activate_IRQ_sender_sn2();
1122 if (n_IRQs_detected < n_IRQs_expected) {
1123 /* retry once to help avoid missing amo */
1124 (void)xpc_identify_activate_IRQ_sender_sn2();
1125 }
1126}
1127
1128/*
1129 * Set up the channel structures that are sn2 specific.
1130 */
1131static enum xp_retval
1132xpc_setup_ch_structures_sn_sn2(struct xpc_partition *part)
1133{
1134 struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
1135 struct xpc_channel_sn2 *ch_sn2;
1136 enum xp_retval retval;
1137 int ret;
1138 int cpuid;
1139 int ch_number;
1140 struct timer_list *timer;
1141 short partid = XPC_PARTID(part);
1142
1143 /* allocate all the required GET/PUT values */
1144
1145 part_sn2->local_GPs =
1146 xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE, GFP_KERNEL,
1147 &part_sn2->local_GPs_base);
1148 if (part_sn2->local_GPs == NULL) {
1149 dev_err(xpc_chan, "can't get memory for local get/put "
1150 "values\n");
1151 return xpNoMemory;
1152 }
1153
1154 part_sn2->remote_GPs =
1155 xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE, GFP_KERNEL,
1156 &part_sn2->remote_GPs_base);
1157 if (part_sn2->remote_GPs == NULL) {
1158 dev_err(xpc_chan, "can't get memory for remote get/put "
1159 "values\n");
1160 retval = xpNoMemory;
1161 goto out_1;
1162 }
1163
1164 part_sn2->remote_GPs_pa = 0;
1165
1166 /* allocate all the required open and close args */
1167
1168 part_sn2->local_openclose_args =
1169 xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE,
1170 GFP_KERNEL, &part_sn2->
1171 local_openclose_args_base);
1172 if (part_sn2->local_openclose_args == NULL) {
1173 dev_err(xpc_chan, "can't get memory for local connect args\n");
1174 retval = xpNoMemory;
1175 goto out_2;
1176 }
1177
1178 part_sn2->remote_openclose_args_pa = 0;
1179
1180 part_sn2->local_chctl_amo_va = xpc_init_IRQ_amo_sn2(partid);
1181
1182 part_sn2->notify_IRQ_nasid = 0;
1183 part_sn2->notify_IRQ_phys_cpuid = 0;
1184 part_sn2->remote_chctl_amo_va = NULL;
1185
1186 sprintf(part_sn2->notify_IRQ_owner, "xpc%02d", partid);
1187 ret = request_irq(SGI_XPC_NOTIFY, xpc_handle_notify_IRQ_sn2,
1188 IRQF_SHARED, part_sn2->notify_IRQ_owner,
1189 (void *)(u64)partid);
1190 if (ret != 0) {
1191 dev_err(xpc_chan, "can't register NOTIFY IRQ handler, "
1192 "errno=%d\n", -ret);
1193 retval = xpLackOfResources;
1194 goto out_3;
1195 }
1196
1197 /* Setup a timer to check for dropped notify IRQs */
1198 timer = &part_sn2->dropped_notify_IRQ_timer;
1199 init_timer(timer);
1200 timer->function =
1201 (void (*)(unsigned long))xpc_check_for_dropped_notify_IRQ_sn2;
1202 timer->data = (unsigned long)part;
1203 timer->expires = jiffies + XPC_DROPPED_NOTIFY_IRQ_WAIT_INTERVAL;
1204 add_timer(timer);
1205
1206 for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
1207 ch_sn2 = &part->channels[ch_number].sn.sn2;
1208
1209 ch_sn2->local_GP = &part_sn2->local_GPs[ch_number];
1210 ch_sn2->local_openclose_args =
1211 &part_sn2->local_openclose_args[ch_number];
1212
1213 mutex_init(&ch_sn2->msg_to_pull_mutex);
1214 }
1215
1216 /*
1217 * Setup the per partition specific variables required by the
1218 * remote partition to establish channel connections with us.
1219 *
1220 * The setting of the magic # indicates that these per partition
1221 * specific variables are ready to be used.
1222 */
1223 xpc_vars_part_sn2[partid].GPs_pa = xp_pa(part_sn2->local_GPs);
1224 xpc_vars_part_sn2[partid].openclose_args_pa =
1225 xp_pa(part_sn2->local_openclose_args);
1226 xpc_vars_part_sn2[partid].chctl_amo_pa =
1227 xp_pa(part_sn2->local_chctl_amo_va);
1228 cpuid = raw_smp_processor_id(); /* any CPU in this partition will do */
1229 xpc_vars_part_sn2[partid].notify_IRQ_nasid = cpuid_to_nasid(cpuid);
1230 xpc_vars_part_sn2[partid].notify_IRQ_phys_cpuid =
1231 cpu_physical_id(cpuid);
1232 xpc_vars_part_sn2[partid].nchannels = part->nchannels;
1233 xpc_vars_part_sn2[partid].magic = XPC_VP_MAGIC1_SN2;
1234
1235 return xpSuccess;
1236
1237 /* setup of ch structures failed */
1238out_3:
1239 kfree(part_sn2->local_openclose_args_base);
1240 part_sn2->local_openclose_args = NULL;
1241out_2:
1242 kfree(part_sn2->remote_GPs_base);
1243 part_sn2->remote_GPs = NULL;
1244out_1:
1245 kfree(part_sn2->local_GPs_base);
1246 part_sn2->local_GPs = NULL;
1247 return retval;
1248}
1249
1250/*
1251 * Tear down the channel structures that are sn2 specific.
1252 */
1253static void
1254xpc_teardown_ch_structures_sn_sn2(struct xpc_partition *part)
1255{
1256 struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
1257 short partid = XPC_PARTID(part);
1258
1259 /*
1260 * Indicate that the variables specific to the remote partition are no
1261 * longer available for its use.
1262 */
1263 xpc_vars_part_sn2[partid].magic = 0;
1264
1265 /* in case we've still got outstanding timers registered... */
1266 del_timer_sync(&part_sn2->dropped_notify_IRQ_timer);
1267 free_irq(SGI_XPC_NOTIFY, (void *)(u64)partid);
1268
1269 kfree(part_sn2->local_openclose_args_base);
1270 part_sn2->local_openclose_args = NULL;
1271 kfree(part_sn2->remote_GPs_base);
1272 part_sn2->remote_GPs = NULL;
1273 kfree(part_sn2->local_GPs_base);
1274 part_sn2->local_GPs = NULL;
1275 part_sn2->local_chctl_amo_va = NULL;
1276}
1277
1278/*
1279 * Create a wrapper that hides the underlying mechanism for pulling a cacheline
1280 * (or multiple cachelines) from a remote partition.
1281 *
1282 * src_pa must be a cacheline aligned physical address on the remote partition.
1283 * dst must be a cacheline aligned virtual address on this partition.
1284 * cnt must be cacheline sized
1285 */
1286/* ??? Replace this function by call to xp_remote_memcpy() or bte_copy()? */
1287static enum xp_retval
1288xpc_pull_remote_cachelines_sn2(struct xpc_partition *part, void *dst,
1289 const unsigned long src_pa, size_t cnt)
1290{
1291 enum xp_retval ret;
1292
1293 DBUG_ON(src_pa != L1_CACHE_ALIGN(src_pa));
1294 DBUG_ON((unsigned long)dst != L1_CACHE_ALIGN((unsigned long)dst));
1295 DBUG_ON(cnt != L1_CACHE_ALIGN(cnt));
1296
1297 if (part->act_state == XPC_P_AS_DEACTIVATING)
1298 return part->reason;
1299
1300 ret = xp_remote_memcpy(xp_pa(dst), src_pa, cnt);
1301 if (ret != xpSuccess) {
1302 dev_dbg(xpc_chan, "xp_remote_memcpy() from partition %d failed,"
1303 " ret=%d\n", XPC_PARTID(part), ret);
1304 }
1305 return ret;
1306}
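/*
 * Sketch of the alignment bookkeeping its callers perform: an arbitrary
 * remote physical address is rounded down to the containing cacheline for
 * the copy, and the offset within that cacheline is kept so the caller can
 * locate its entry inside the pulled buffer.  A 128-byte cacheline is
 * assumed here purely for illustration; the driver uses L1_CACHE_BYTES.
 */
#include <stdint.h>

#define TOY_CACHE_BYTES 128UL

struct toy_pull_addr {
	uint64_t cacheline_pa;	/* aligned address that is actually copied */
	uint64_t offset;	/* where the wanted entry sits in the buffer */
};

static struct toy_pull_addr toy_split_pa(uint64_t entry_pa)
{
	struct toy_pull_addr p = {
		.cacheline_pa = entry_pa & ~(TOY_CACHE_BYTES - 1),
		.offset = entry_pa & (TOY_CACHE_BYTES - 1),
	};
	return p;
}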
1307
1308/*
1309 * Pull the remote per partition specific variables from the specified
1310 * partition.
1311 */
1312static enum xp_retval
1313xpc_pull_remote_vars_part_sn2(struct xpc_partition *part)
1314{
1315 struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
1316 u8 buffer[L1_CACHE_BYTES * 2];
1317 struct xpc_vars_part_sn2 *pulled_entry_cacheline =
1318 (struct xpc_vars_part_sn2 *)L1_CACHE_ALIGN((u64)buffer);
1319 struct xpc_vars_part_sn2 *pulled_entry;
1320 unsigned long remote_entry_cacheline_pa;
1321 unsigned long remote_entry_pa;
1322 short partid = XPC_PARTID(part);
1323 enum xp_retval ret;
1324
1325 /* pull the cacheline that contains the variables we're interested in */
1326
1327 DBUG_ON(part_sn2->remote_vars_part_pa !=
1328 L1_CACHE_ALIGN(part_sn2->remote_vars_part_pa));
1329 DBUG_ON(sizeof(struct xpc_vars_part_sn2) != L1_CACHE_BYTES / 2);
1330
1331 remote_entry_pa = part_sn2->remote_vars_part_pa +
1332 sn_partition_id * sizeof(struct xpc_vars_part_sn2);
1333
1334 remote_entry_cacheline_pa = (remote_entry_pa & ~(L1_CACHE_BYTES - 1));
1335
1336 pulled_entry = (struct xpc_vars_part_sn2 *)((u64)pulled_entry_cacheline
1337 + (remote_entry_pa &
1338 (L1_CACHE_BYTES - 1)));
1339
1340 ret = xpc_pull_remote_cachelines_sn2(part, pulled_entry_cacheline,
1341 remote_entry_cacheline_pa,
1342 L1_CACHE_BYTES);
1343 if (ret != xpSuccess) {
1344 dev_dbg(xpc_chan, "failed to pull XPC vars_part from "
1345 "partition %d, ret=%d\n", partid, ret);
1346 return ret;
1347 }
1348
1349 /* see if they've been set up yet */
1350
1351 if (pulled_entry->magic != XPC_VP_MAGIC1_SN2 &&
1352 pulled_entry->magic != XPC_VP_MAGIC2_SN2) {
1353
1354 if (pulled_entry->magic != 0) {
1355 dev_dbg(xpc_chan, "partition %d's XPC vars_part for "
1356 "partition %d has bad magic value (=0x%lx)\n",
1357 partid, sn_partition_id, pulled_entry->magic);
1358 return xpBadMagic;
1359 }
1360
1361 /* they've not been initialized yet */
1362 return xpRetry;
1363 }
1364
1365 if (xpc_vars_part_sn2[partid].magic == XPC_VP_MAGIC1_SN2) {
1366
1367 /* validate the variables */
1368
1369 if (pulled_entry->GPs_pa == 0 ||
1370 pulled_entry->openclose_args_pa == 0 ||
1371 pulled_entry->chctl_amo_pa == 0) {
1372
1373 dev_err(xpc_chan, "partition %d's XPC vars_part for "
1374 "partition %d are not valid\n", partid,
1375 sn_partition_id);
1376 return xpInvalidAddress;
1377 }
1378
1379 /* the variables we imported look to be valid */
1380
1381 part_sn2->remote_GPs_pa = pulled_entry->GPs_pa;
1382 part_sn2->remote_openclose_args_pa =
1383 pulled_entry->openclose_args_pa;
1384 part_sn2->remote_chctl_amo_va =
1385 (struct amo *)__va(pulled_entry->chctl_amo_pa);
1386 part_sn2->notify_IRQ_nasid = pulled_entry->notify_IRQ_nasid;
1387 part_sn2->notify_IRQ_phys_cpuid =
1388 pulled_entry->notify_IRQ_phys_cpuid;
1389
1390 if (part->nchannels > pulled_entry->nchannels)
1391 part->nchannels = pulled_entry->nchannels;
1392
1393 /* let the other side know that we've pulled their variables */
1394
1395 xpc_vars_part_sn2[partid].magic = XPC_VP_MAGIC2_SN2;
1396 }
1397
1398 if (pulled_entry->magic == XPC_VP_MAGIC1_SN2)
1399 return xpRetry;
1400
1401 return xpSuccess;
1402}
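/*
 * Sketch of the two-step first-contact handshake implied above.  MAGIC1
 * means "my per-partition variables are published"; MAGIC2 means "and I
 * have pulled yours".  Channels may connect only once both sides have
 * reached MAGIC2.  The enum values below are illustrative; the driver uses
 * 64-bit magic constants.
 */
enum toy_vp_magic { TOY_MAGIC_NONE = 0, TOY_MAGIC1, TOY_MAGIC2 };

/* returns 1 when contact is complete, 0 to retry later, -1 on a bad magic */
static int toy_first_contact_step(enum toy_vp_magic remote,
				  enum toy_vp_magic *local)
{
	if (remote == TOY_MAGIC_NONE)
		return 0;			/* peer not initialized yet */
	if (remote != TOY_MAGIC1 && remote != TOY_MAGIC2)
		return -1;			/* corrupt or stale entry */

	if (*local == TOY_MAGIC1)
		*local = TOY_MAGIC2;		/* record that we pulled theirs */

	return remote == TOY_MAGIC2 ? 1 : 0;	/* else wait for peer to pull ours */
}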
1403
1404/*
1405 * Establish first contact with the remote partition. This involves pulling
1406 * the XPC per partition variables from the remote partition and waiting for
1407 * the remote partition to pull ours.
1408 */
1409static enum xp_retval
1410xpc_make_first_contact_sn2(struct xpc_partition *part)
1411{
1412 struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
1413 enum xp_retval ret;
1414
1415 /*
1416 * Register the remote partition's amos with SAL so it can handle
1417 * and cleanup errors within that address range should the remote
1418 * partition go down. We don't unregister this range because it is
1419 * difficult to tell when outstanding writes to the remote partition
1420 * are finished and thus when it is safe to unregister. This should
1421 * not result in wasted space in the SAL xp_addr_region table because
1422 * we should get the same page for remote_amos_page_pa after module
1423 * reloads and system reboots.
1424 */
1425 if (sn_register_xp_addr_region(part_sn2->remote_amos_page_pa,
1426 PAGE_SIZE, 1) < 0) {
1427 dev_warn(xpc_part, "xpc_activating(%d) failed to register "
1428 "xp_addr region\n", XPC_PARTID(part));
1429
1430 ret = xpPhysAddrRegFailed;
1431 XPC_DEACTIVATE_PARTITION(part, ret);
1432 return ret;
1433 }
1434
1435 /*
1436 * Send activate IRQ to get other side to activate if they've not
1437 * already begun to do so.
1438 */
1439 xpc_send_activate_IRQ_sn2(part_sn2->remote_amos_page_pa,
1440 cnodeid_to_nasid(0),
1441 part_sn2->activate_IRQ_nasid,
1442 part_sn2->activate_IRQ_phys_cpuid);
1443
1444 while ((ret = xpc_pull_remote_vars_part_sn2(part)) != xpSuccess) {
1445 if (ret != xpRetry) {
1446 XPC_DEACTIVATE_PARTITION(part, ret);
1447 return ret;
1448 }
1449
1450 dev_dbg(xpc_part, "waiting to make first contact with "
1451 "partition %d\n", XPC_PARTID(part));
1452
1453 /* wait a 1/4 of a second or so */
1454 (void)msleep_interruptible(250);
1455
1456 if (part->act_state == XPC_P_AS_DEACTIVATING)
1457 return part->reason;
1458 }
1459
1460 return xpSuccess;
1461}
1462
1463/*
1464 * Get the chctl flags and pull the openclose args and/or remote GPs as needed.
1465 */
1466static u64
1467xpc_get_chctl_all_flags_sn2(struct xpc_partition *part)
1468{
1469 struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
1470 unsigned long irq_flags;
1471 union xpc_channel_ctl_flags chctl;
1472 enum xp_retval ret;
1473
1474 /*
1475 * See if there are any chctl flags to be handled.
1476 */
1477
1478 spin_lock_irqsave(&part->chctl_lock, irq_flags);
1479 chctl = part->chctl;
1480 if (chctl.all_flags != 0)
1481 part->chctl.all_flags = 0;
1482
1483 spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
1484
1485 if (xpc_any_openclose_chctl_flags_set(&chctl)) {
1486 ret = xpc_pull_remote_cachelines_sn2(part, part->
1487 remote_openclose_args,
1488 part_sn2->
1489 remote_openclose_args_pa,
1490 XPC_OPENCLOSE_ARGS_SIZE);
1491 if (ret != xpSuccess) {
1492 XPC_DEACTIVATE_PARTITION(part, ret);
1493
1494 dev_dbg(xpc_chan, "failed to pull openclose args from "
1495 "partition %d, ret=%d\n", XPC_PARTID(part),
1496 ret);
1497
1498 /* don't bother processing chctl flags anymore */
1499 chctl.all_flags = 0;
1500 }
1501 }
1502
1503 if (xpc_any_msg_chctl_flags_set(&chctl)) {
1504 ret = xpc_pull_remote_cachelines_sn2(part, part_sn2->remote_GPs,
1505 part_sn2->remote_GPs_pa,
1506 XPC_GP_SIZE);
1507 if (ret != xpSuccess) {
1508 XPC_DEACTIVATE_PARTITION(part, ret);
1509
1510 dev_dbg(xpc_chan, "failed to pull GPs from partition "
1511 "%d, ret=%d\n", XPC_PARTID(part), ret);
1512
1513 /* don't bother processing chctl flags anymore */
1514 chctl.all_flags = 0;
1515 }
1516 }
1517
1518 return chctl.all_flags;
1519}
1520
1521/*
1522 * Allocate the local message queue and the notify queue.
1523 */
1524static enum xp_retval
1525xpc_allocate_local_msgqueue_sn2(struct xpc_channel *ch)
1526{
1527 struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
1528 unsigned long irq_flags;
1529 int nentries;
1530 size_t nbytes;
1531
1532 for (nentries = ch->local_nentries; nentries > 0; nentries--) {
1533
1534 nbytes = nentries * ch->entry_size;
1535 ch_sn2->local_msgqueue =
1536 xpc_kzalloc_cacheline_aligned(nbytes, GFP_KERNEL,
1537 &ch_sn2->local_msgqueue_base);
1538 if (ch_sn2->local_msgqueue == NULL)
1539 continue;
1540
1541 nbytes = nentries * sizeof(struct xpc_notify_sn2);
1542 ch_sn2->notify_queue = kzalloc(nbytes, GFP_KERNEL);
1543 if (ch_sn2->notify_queue == NULL) {
1544 kfree(ch_sn2->local_msgqueue_base);
1545 ch_sn2->local_msgqueue = NULL;
1546 continue;
1547 }
1548
1549 spin_lock_irqsave(&ch->lock, irq_flags);
1550 if (nentries < ch->local_nentries) {
1551 dev_dbg(xpc_chan, "nentries=%d local_nentries=%d, "
1552 "partid=%d, channel=%d\n", nentries,
1553 ch->local_nentries, ch->partid, ch->number);
1554
1555 ch->local_nentries = nentries;
1556 }
1557 spin_unlock_irqrestore(&ch->lock, irq_flags);
1558 return xpSuccess;
1559 }
1560
1561 dev_dbg(xpc_chan, "can't get memory for local message queue and notify "
1562 "queue, partid=%d, channel=%d\n", ch->partid, ch->number);
1563 return xpNoMemory;
1564}
1565
1566/*
1567 * Allocate the cached remote message queue.
1568 */
1569static enum xp_retval
1570xpc_allocate_remote_msgqueue_sn2(struct xpc_channel *ch)
1571{
1572 struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
1573 unsigned long irq_flags;
1574 int nentries;
1575 size_t nbytes;
1576
1577 DBUG_ON(ch->remote_nentries <= 0);
1578
1579 for (nentries = ch->remote_nentries; nentries > 0; nentries--) {
1580
1581 nbytes = nentries * ch->entry_size;
1582 ch_sn2->remote_msgqueue =
1583 xpc_kzalloc_cacheline_aligned(nbytes, GFP_KERNEL, &ch_sn2->
1584 remote_msgqueue_base);
1585 if (ch_sn2->remote_msgqueue == NULL)
1586 continue;
1587
1588 spin_lock_irqsave(&ch->lock, irq_flags);
1589 if (nentries < ch->remote_nentries) {
1590 dev_dbg(xpc_chan, "nentries=%d remote_nentries=%d, "
1591 "partid=%d, channel=%d\n", nentries,
1592 ch->remote_nentries, ch->partid, ch->number);
1593
1594 ch->remote_nentries = nentries;
1595 }
1596 spin_unlock_irqrestore(&ch->lock, irq_flags);
1597 return xpSuccess;
1598 }
1599
1600 dev_dbg(xpc_chan, "can't get memory for cached remote message queue, "
1601 "partid=%d, channel=%d\n", ch->partid, ch->number);
1602 return xpNoMemory;
1603}
1604
1605/*
1606 * Allocate message queues and other stuff associated with a channel.
1607 *
1608 * Note: Assumes all of the channel sizes are filled in.
1609 */
1610static enum xp_retval
1611xpc_setup_msg_structures_sn2(struct xpc_channel *ch)
1612{
1613 struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
1614 enum xp_retval ret;
1615
1616 DBUG_ON(ch->flags & XPC_C_SETUP);
1617
1618 ret = xpc_allocate_local_msgqueue_sn2(ch);
1619 if (ret == xpSuccess) {
1620
1621 ret = xpc_allocate_remote_msgqueue_sn2(ch);
1622 if (ret != xpSuccess) {
1623 kfree(ch_sn2->local_msgqueue_base);
1624 ch_sn2->local_msgqueue = NULL;
1625 kfree(ch_sn2->notify_queue);
1626 ch_sn2->notify_queue = NULL;
1627 }
1628 }
1629 return ret;
1630}
1631
1632/*
1633 * Free up message queues and other stuff that were allocated for the specified
1634 * channel.
1635 */
1636static void
1637xpc_teardown_msg_structures_sn2(struct xpc_channel *ch)
1638{
1639 struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
1640
1641 DBUG_ON(!spin_is_locked(&ch->lock));
1642
1643 ch_sn2->remote_msgqueue_pa = 0;
1644
1645 ch_sn2->local_GP->get = 0;
1646 ch_sn2->local_GP->put = 0;
1647 ch_sn2->remote_GP.get = 0;
1648 ch_sn2->remote_GP.put = 0;
1649 ch_sn2->w_local_GP.get = 0;
1650 ch_sn2->w_local_GP.put = 0;
1651 ch_sn2->w_remote_GP.get = 0;
1652 ch_sn2->w_remote_GP.put = 0;
1653 ch_sn2->next_msg_to_pull = 0;
1654
1655 if (ch->flags & XPC_C_SETUP) {
1656 dev_dbg(xpc_chan, "ch->flags=0x%x, partid=%d, channel=%d\n",
1657 ch->flags, ch->partid, ch->number);
1658
1659 kfree(ch_sn2->local_msgqueue_base);
1660 ch_sn2->local_msgqueue = NULL;
1661 kfree(ch_sn2->remote_msgqueue_base);
1662 ch_sn2->remote_msgqueue = NULL;
1663 kfree(ch_sn2->notify_queue);
1664 ch_sn2->notify_queue = NULL;
1665 }
1666}
1667
1668/*
1669 * Notify those who wanted to be notified upon delivery of their message.
1670 */
1671static void
1672xpc_notify_senders_sn2(struct xpc_channel *ch, enum xp_retval reason, s64 put)
1673{
1674 struct xpc_notify_sn2 *notify;
1675 u8 notify_type;
1676 s64 get = ch->sn.sn2.w_remote_GP.get - 1;
1677
1678 while (++get < put && atomic_read(&ch->n_to_notify) > 0) {
1679
1680 notify = &ch->sn.sn2.notify_queue[get % ch->local_nentries];
1681
1682 /*
1683 * See if the notify entry indicates it was associated with
1684		 * a message whose sender wants to be notified. It is possible
1685 * that it is, but someone else is doing or has done the
1686 * notification.
1687 */
1688 notify_type = notify->type;
1689 if (notify_type == 0 ||
1690 cmpxchg(&notify->type, notify_type, 0) != notify_type) {
1691 continue;
1692 }
1693
1694 DBUG_ON(notify_type != XPC_N_CALL);
1695
1696 atomic_dec(&ch->n_to_notify);
1697
1698 if (notify->func != NULL) {
1699 dev_dbg(xpc_chan, "notify->func() called, notify=0x%p "
1700 "msg_number=%ld partid=%d channel=%d\n",
1701 (void *)notify, get, ch->partid, ch->number);
1702
1703 notify->func(reason, ch->partid, ch->number,
1704 notify->key);
1705
1706 dev_dbg(xpc_chan, "notify->func() returned, notify=0x%p"
1707 " msg_number=%ld partid=%d channel=%d\n",
1708 (void *)notify, get, ch->partid, ch->number);
1709 }
1710 }
1711}
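/*
 * Sketch of the claim-by-cmpxchg step above: whichever path successfully
 * swaps the notify type from its current nonzero value to zero owns the
 * callback, so it runs exactly once even if disconnect races with normal
 * delivery.  C11 atomics are assumed here in place of the kernel cmpxchg();
 * the toy_* names are illustrative only.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct toy_notify {
	_Atomic uint8_t type;		/* 0 once handled or never armed */
	void (*func)(void *key);
	void *key;
};

static bool toy_claim_and_notify(struct toy_notify *n)
{
	uint8_t expected = atomic_load(&n->type);

	if (expected == 0 ||
	    !atomic_compare_exchange_strong(&n->type, &expected, 0))
		return false;		/* another path already claimed it */

	if (n->func != NULL)
		n->func(n->key);
	return true;
}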
1712
1713static void
1714xpc_notify_senders_of_disconnect_sn2(struct xpc_channel *ch)
1715{
1716 xpc_notify_senders_sn2(ch, ch->reason, ch->sn.sn2.w_local_GP.put);
1717}
1718
1719/*
1720 * Clear some of the msg flags in the local message queue.
1721 */
1722static inline void
1723xpc_clear_local_msgqueue_flags_sn2(struct xpc_channel *ch)
1724{
1725 struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
1726 struct xpc_msg_sn2 *msg;
1727 s64 get;
1728
1729 get = ch_sn2->w_remote_GP.get;
1730 do {
1731 msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->local_msgqueue +
1732 (get % ch->local_nentries) *
1733 ch->entry_size);
1734 msg->flags = 0;
1735 } while (++get < ch_sn2->remote_GP.get);
1736}
1737
1738/*
1739 * Clear some of the msg flags in the remote message queue.
1740 */
1741static inline void
1742xpc_clear_remote_msgqueue_flags_sn2(struct xpc_channel *ch)
1743{
1744 struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
1745 struct xpc_msg_sn2 *msg;
1746 s64 put;
1747
1748 put = ch_sn2->w_remote_GP.put;
1749 do {
1750 msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->remote_msgqueue +
1751 (put % ch->remote_nentries) *
1752 ch->entry_size);
1753 msg->flags = 0;
1754 } while (++put < ch_sn2->remote_GP.put);
1755}
1756
1757static int
1758xpc_n_of_deliverable_payloads_sn2(struct xpc_channel *ch)
1759{
1760 return ch->sn.sn2.w_remote_GP.put - ch->sn.sn2.w_local_GP.get;
1761}
1762
1763static void
1764xpc_process_msg_chctl_flags_sn2(struct xpc_partition *part, int ch_number)
1765{
1766 struct xpc_channel *ch = &part->channels[ch_number];
1767 struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
1768 int npayloads_sent;
1769
1770 ch_sn2->remote_GP = part->sn.sn2.remote_GPs[ch_number];
1771
1772 /* See what, if anything, has changed for each connected channel */
1773
1774 xpc_msgqueue_ref(ch);
1775
1776 if (ch_sn2->w_remote_GP.get == ch_sn2->remote_GP.get &&
1777 ch_sn2->w_remote_GP.put == ch_sn2->remote_GP.put) {
1778 /* nothing changed since GPs were last pulled */
1779 xpc_msgqueue_deref(ch);
1780 return;
1781 }
1782
1783 if (!(ch->flags & XPC_C_CONNECTED)) {
1784 xpc_msgqueue_deref(ch);
1785 return;
1786 }
1787
1788 /*
1789 * First check to see if messages recently sent by us have been
1790 * received by the other side. (The remote GET value will have
1791 * changed since we last looked at it.)
1792 */
1793
1794 if (ch_sn2->w_remote_GP.get != ch_sn2->remote_GP.get) {
1795
1796 /*
1797 * We need to notify any senders that want to be notified
1798 * that their sent messages have been received by their
1799 * intended recipients. We need to do this before updating
1800 * w_remote_GP.get so that we don't allocate the same message
1801 * queue entries prematurely (see xpc_allocate_msg()).
1802 */
1803 if (atomic_read(&ch->n_to_notify) > 0) {
1804 /*
1805 * Notify senders that messages sent have been
1806 * received and delivered by the other side.
1807 */
1808 xpc_notify_senders_sn2(ch, xpMsgDelivered,
1809 ch_sn2->remote_GP.get);
1810 }
1811
1812 /*
1813 * Clear msg->flags in previously sent messages, so that
1814 * they're ready for xpc_allocate_msg().
1815 */
1816 xpc_clear_local_msgqueue_flags_sn2(ch);
1817
1818 ch_sn2->w_remote_GP.get = ch_sn2->remote_GP.get;
1819
1820 dev_dbg(xpc_chan, "w_remote_GP.get changed to %ld, partid=%d, "
1821 "channel=%d\n", ch_sn2->w_remote_GP.get, ch->partid,
1822 ch->number);
1823
1824 /*
1825 * If anyone was waiting for message queue entries to become
1826 * available, wake them up.
1827 */
1828 if (atomic_read(&ch->n_on_msg_allocate_wq) > 0)
1829 wake_up(&ch->msg_allocate_wq);
1830 }
1831
1832 /*
1833 * Now check for newly sent messages by the other side. (The remote
1834 * PUT value will have changed since we last looked at it.)
1835 */
1836
1837 if (ch_sn2->w_remote_GP.put != ch_sn2->remote_GP.put) {
1838 /*
1839 * Clear msg->flags in previously received messages, so that
1840 * they're ready for xpc_get_deliverable_payload_sn2().
1841 */
1842 xpc_clear_remote_msgqueue_flags_sn2(ch);
1843
1844 ch_sn2->w_remote_GP.put = ch_sn2->remote_GP.put;
1845
1846 dev_dbg(xpc_chan, "w_remote_GP.put changed to %ld, partid=%d, "
1847 "channel=%d\n", ch_sn2->w_remote_GP.put, ch->partid,
1848 ch->number);
1849
1850 npayloads_sent = xpc_n_of_deliverable_payloads_sn2(ch);
1851 if (npayloads_sent > 0) {
1852 dev_dbg(xpc_chan, "msgs waiting to be copied and "
1853 "delivered=%d, partid=%d, channel=%d\n",
1854 npayloads_sent, ch->partid, ch->number);
1855
1856 if (ch->flags & XPC_C_CONNECTEDCALLOUT_MADE)
1857 xpc_activate_kthreads(ch, npayloads_sent);
1858 }
1859 }
1860
1861 xpc_msgqueue_deref(ch);
1862}
1863
1864static struct xpc_msg_sn2 *
1865xpc_pull_remote_msg_sn2(struct xpc_channel *ch, s64 get)
1866{
1867 struct xpc_partition *part = &xpc_partitions[ch->partid];
1868 struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
1869 unsigned long remote_msg_pa;
1870 struct xpc_msg_sn2 *msg;
1871 u32 msg_index;
1872 u32 nmsgs;
1873 u64 msg_offset;
1874 enum xp_retval ret;
1875
1876 if (mutex_lock_interruptible(&ch_sn2->msg_to_pull_mutex) != 0) {
1877 /* we were interrupted by a signal */
1878 return NULL;
1879 }
1880
1881 while (get >= ch_sn2->next_msg_to_pull) {
1882
1883 /* pull as many messages as are ready and able to be pulled */
1884
1885 msg_index = ch_sn2->next_msg_to_pull % ch->remote_nentries;
1886
1887 DBUG_ON(ch_sn2->next_msg_to_pull >= ch_sn2->w_remote_GP.put);
1888 nmsgs = ch_sn2->w_remote_GP.put - ch_sn2->next_msg_to_pull;
1889 if (msg_index + nmsgs > ch->remote_nentries) {
1890 /* ignore the ones that wrap the msg queue for now */
1891 nmsgs = ch->remote_nentries - msg_index;
1892 }
1893
1894 msg_offset = msg_index * ch->entry_size;
1895 msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->remote_msgqueue +
1896 msg_offset);
1897 remote_msg_pa = ch_sn2->remote_msgqueue_pa + msg_offset;
1898
1899 ret = xpc_pull_remote_cachelines_sn2(part, msg, remote_msg_pa,
1900 nmsgs * ch->entry_size);
1901 if (ret != xpSuccess) {
1902
1903 dev_dbg(xpc_chan, "failed to pull %d msgs starting with"
1904 " msg %ld from partition %d, channel=%d, "
1905 "ret=%d\n", nmsgs, ch_sn2->next_msg_to_pull,
1906 ch->partid, ch->number, ret);
1907
1908 XPC_DEACTIVATE_PARTITION(part, ret);
1909
1910 mutex_unlock(&ch_sn2->msg_to_pull_mutex);
1911 return NULL;
1912 }
1913
1914 ch_sn2->next_msg_to_pull += nmsgs;
1915 }
1916
1917 mutex_unlock(&ch_sn2->msg_to_pull_mutex);
1918
1919 /* return the message we were looking for */
1920 msg_offset = (get % ch->remote_nentries) * ch->entry_size;
1921 msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->remote_msgqueue + msg_offset);
1922
1923 return msg;
1924}
1925
1926/*
1927 * Get the next deliverable message's payload.
1928 */
1929static void *
1930xpc_get_deliverable_payload_sn2(struct xpc_channel *ch)
1931{
1932 struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
1933 struct xpc_msg_sn2 *msg;
1934 void *payload = NULL;
1935 s64 get;
1936
1937 do {
1938 if (ch->flags & XPC_C_DISCONNECTING)
1939 break;
1940
1941 get = ch_sn2->w_local_GP.get;
1942 rmb(); /* guarantee that .get loads before .put */
1943 if (get == ch_sn2->w_remote_GP.put)
1944 break;
1945
1946 /* There are messages waiting to be pulled and delivered.
1947 * We need to try to secure one for ourselves. We'll do this
1948 * by trying to increment w_local_GP.get and hope that no one
1949		 * else beats us to it. If they do, we'll simply have
1950 * to try again for the next one.
1951 */
1952
1953 if (cmpxchg(&ch_sn2->w_local_GP.get, get, get + 1) == get) {
1954 /* we got the entry referenced by get */
1955
1956 dev_dbg(xpc_chan, "w_local_GP.get changed to %ld, "
1957 "partid=%d, channel=%d\n", get + 1,
1958 ch->partid, ch->number);
1959
1960 /* pull the message from the remote partition */
1961
1962 msg = xpc_pull_remote_msg_sn2(ch, get);
1963
1964 DBUG_ON(msg != NULL && msg->number != get);
1965 DBUG_ON(msg != NULL && (msg->flags & XPC_M_SN2_DONE));
1966 DBUG_ON(msg != NULL && !(msg->flags & XPC_M_SN2_READY));
1967
1968 payload = &msg->payload;
1969 break;
1970 }
1971
1972 } while (1);
1973
1974 return payload;
1975}
1976
1977/*
1978 * Now we actually send the messages that are ready to be sent by advancing
1979 * the local message queue's Put value and then send a chctl msgrequest to the
1980 * recipient partition.
1981 */
1982static void
1983xpc_send_msgs_sn2(struct xpc_channel *ch, s64 initial_put)
1984{
1985 struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
1986 struct xpc_msg_sn2 *msg;
1987 s64 put = initial_put + 1;
1988 int send_msgrequest = 0;
1989
1990 while (1) {
1991
1992 while (1) {
1993 if (put == ch_sn2->w_local_GP.put)
1994 break;
1995
1996 msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->
1997 local_msgqueue + (put %
1998 ch->local_nentries) *
1999 ch->entry_size);
2000
2001 if (!(msg->flags & XPC_M_SN2_READY))
2002 break;
2003
2004 put++;
2005 }
2006
2007 if (put == initial_put) {
2008 /* nothing's changed */
2009 break;
2010 }
2011
2012 if (cmpxchg_rel(&ch_sn2->local_GP->put, initial_put, put) !=
2013 initial_put) {
2014 /* someone else beat us to it */
2015 DBUG_ON(ch_sn2->local_GP->put < initial_put);
2016 break;
2017 }
2018
2019 /* we just set the new value of local_GP->put */
2020
2021 dev_dbg(xpc_chan, "local_GP->put changed to %ld, partid=%d, "
2022 "channel=%d\n", put, ch->partid, ch->number);
2023
2024 send_msgrequest = 1;
2025
2026 /*
2027 * We need to ensure that the message referenced by
2028 * local_GP->put is not XPC_M_SN2_READY or that local_GP->put
2029 * equals w_local_GP.put, so we'll go have a look.
2030 */
2031 initial_put = put;
2032 }
2033
2034 if (send_msgrequest)
2035 xpc_send_chctl_msgrequest_sn2(ch);
2036}
2037
2038/*
2039 * Allocate an entry for a message from the message queue associated with the
2040 * specified channel.
2041 */
2042static enum xp_retval
2043xpc_allocate_msg_sn2(struct xpc_channel *ch, u32 flags,
2044 struct xpc_msg_sn2 **address_of_msg)
2045{
2046 struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
2047 struct xpc_msg_sn2 *msg;
2048 enum xp_retval ret;
2049 s64 put;
2050
2051 /*
2052 * Get the next available message entry from the local message queue.
2053 * If none are available, we'll make sure that we grab the latest
2054 * GP values.
2055 */
2056 ret = xpTimeout;
2057
2058 while (1) {
2059
2060 put = ch_sn2->w_local_GP.put;
2061 rmb(); /* guarantee that .put loads before .get */
2062 if (put - ch_sn2->w_remote_GP.get < ch->local_nentries) {
2063
2064 /* There are available message entries. We need to try
2065 * to secure one for ourselves. We'll do this by trying
2066 * to increment w_local_GP.put as long as someone else
2067 * doesn't beat us to it. If they do, we'll have to
2068 * try again.
2069 */
2070 if (cmpxchg(&ch_sn2->w_local_GP.put, put, put + 1) ==
2071 put) {
2072 /* we got the entry referenced by put */
2073 break;
2074 }
2075 continue; /* try again */
2076 }
2077
2078 /*
2079 * There aren't any available msg entries at this time.
2080 *
2081 * In waiting for a message entry to become available,
2082 * we set a timeout in case the other side is not sending
2083 * completion interrupts. This lets us fake a notify IRQ
2084 * that will cause the notify IRQ handler to fetch the latest
2085 * GP values as if an interrupt was sent by the other side.
2086 */
2087 if (ret == xpTimeout)
2088 xpc_send_chctl_local_msgrequest_sn2(ch);
2089
2090 if (flags & XPC_NOWAIT)
2091 return xpNoWait;
2092
2093 ret = xpc_allocate_msg_wait(ch);
2094 if (ret != xpInterrupted && ret != xpTimeout)
2095 return ret;
2096 }
2097
2098 /* get the message's address and initialize it */
2099 msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->local_msgqueue +
2100 (put % ch->local_nentries) *
2101 ch->entry_size);
2102
2103 DBUG_ON(msg->flags != 0);
2104 msg->number = put;
2105
2106 dev_dbg(xpc_chan, "w_local_GP.put changed to %ld; msg=0x%p, "
2107 "msg_number=%ld, partid=%d, channel=%d\n", put + 1,
2108 (void *)msg, msg->number, ch->partid, ch->number);
2109
2110 *address_of_msg = msg;
2111 return xpSuccess;
2112}
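/*
 * Userspace sketch of the lock-free slot reservation above: a sender may
 * claim slot `put` only while put - get < nentries, and claims it with a
 * compare-and-swap on the cached put value so that concurrent senders never
 * receive the same slot.  C11 atomics stand in for the kernel cmpxchg();
 * the toy_* names are illustrative only, not part of XPC.
 */
#include <stdatomic.h>
#include <stdint.h>

struct toy_msgqueue {
	_Atomic int64_t w_put;	/* next slot a sender may claim */
	_Atomic int64_t w_get;	/* last slot known to be consumed remotely */
	int nentries;		/* ring size */
};

/* returns the reserved slot number, or -1 if the ring is currently full */
static int64_t toy_reserve_slot(struct toy_msgqueue *q)
{
	for (;;) {
		int64_t put = atomic_load(&q->w_put);

		if (put - atomic_load(&q->w_get) >= q->nentries)
			return -1;	/* full: caller waits or gives up */

		if (atomic_compare_exchange_weak(&q->w_put, &put, put + 1))
			return put;	/* we own entry put % nentries */
	}
}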
2113
2114/*
2115 * Common code that does the actual sending of the message by advancing the
2116 * local message queue's Put value and sends a chctl msgrequest to the
2117 * partition the message is being sent to.
2118 */
2119static enum xp_retval
2120xpc_send_payload_sn2(struct xpc_channel *ch, u32 flags, void *payload,
2121 u16 payload_size, u8 notify_type, xpc_notify_func func,
2122 void *key)
2123{
2124 enum xp_retval ret = xpSuccess;
2125 struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
2126	struct xpc_msg_sn2 *msg = msg;	/* self-init quiets 'uninitialized' warning */
2127	struct xpc_notify_sn2 *notify = notify;	/* likewise */
2128 s64 msg_number;
2129 s64 put;
2130
2131 DBUG_ON(notify_type == XPC_N_CALL && func == NULL);
2132
2133 if (XPC_MSG_SIZE(payload_size) > ch->entry_size)
2134 return xpPayloadTooBig;
2135
2136 xpc_msgqueue_ref(ch);
2137
2138 if (ch->flags & XPC_C_DISCONNECTING) {
2139 ret = ch->reason;
2140 goto out_1;
2141 }
2142 if (!(ch->flags & XPC_C_CONNECTED)) {
2143 ret = xpNotConnected;
2144 goto out_1;
2145 }
2146
2147 ret = xpc_allocate_msg_sn2(ch, flags, &msg);
2148 if (ret != xpSuccess)
2149 goto out_1;
2150
2151 msg_number = msg->number;
2152
2153 if (notify_type != 0) {
2154 /*
2155 * Tell the remote side to send an ACK interrupt when the
2156 * message has been delivered.
2157 */
2158 msg->flags |= XPC_M_SN2_INTERRUPT;
2159
2160 atomic_inc(&ch->n_to_notify);
2161
2162 notify = &ch_sn2->notify_queue[msg_number % ch->local_nentries];
2163 notify->func = func;
2164 notify->key = key;
2165 notify->type = notify_type;
2166
2167 /* ??? Is a mb() needed here? */
2168
2169 if (ch->flags & XPC_C_DISCONNECTING) {
2170 /*
2171 * An error occurred between our last error check and
2172 * this one. We will try to clear the type field from
2173 * the notify entry. If we succeed then
2174 * xpc_disconnect_channel() didn't already process
2175 * the notify entry.
2176 */
2177 if (cmpxchg(&notify->type, notify_type, 0) ==
2178 notify_type) {
2179 atomic_dec(&ch->n_to_notify);
2180 ret = ch->reason;
2181 }
2182 goto out_1;
2183 }
2184 }
2185
2186 memcpy(&msg->payload, payload, payload_size);
2187
2188 msg->flags |= XPC_M_SN2_READY;
2189
2190 /*
2191 * The preceding store of msg->flags must occur before the following
2192 * load of local_GP->put.
2193 */
2194 mb();
2195
2196 /* see if the message is next in line to be sent, if so send it */
2197
2198 put = ch_sn2->local_GP->put;
2199 if (put == msg_number)
2200 xpc_send_msgs_sn2(ch, put);
2201
2202out_1:
2203 xpc_msgqueue_deref(ch);
2204 return ret;
2205}
2206
2207/*
2208 * Now we actually acknowledge the messages that have been delivered and ack'd
2209 * by advancing the cached remote message queue's Get value and, if requested,
2210 * sending a chctl msgrequest to the message sender's partition.
2211 *
2212 * If a message has XPC_M_SN2_INTERRUPT set, send an interrupt to the partition
2213 * that sent the message.
2214 */
2215static void
2216xpc_acknowledge_msgs_sn2(struct xpc_channel *ch, s64 initial_get, u8 msg_flags)
2217{
2218 struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
2219 struct xpc_msg_sn2 *msg;
2220 s64 get = initial_get + 1;
2221 int send_msgrequest = 0;
2222
2223 while (1) {
2224
2225 while (1) {
2226 if (get == ch_sn2->w_local_GP.get)
2227 break;
2228
2229 msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->
2230 remote_msgqueue + (get %
2231 ch->remote_nentries) *
2232 ch->entry_size);
2233
2234 if (!(msg->flags & XPC_M_SN2_DONE))
2235 break;
2236
2237 msg_flags |= msg->flags;
2238 get++;
2239 }
2240
2241 if (get == initial_get) {
2242 /* nothing's changed */
2243 break;
2244 }
2245
2246 if (cmpxchg_rel(&ch_sn2->local_GP->get, initial_get, get) !=
2247 initial_get) {
2248 /* someone else beat us to it */
2249 DBUG_ON(ch_sn2->local_GP->get <= initial_get);
2250 break;
2251 }
2252
2253 /* we just set the new value of local_GP->get */
2254
2255 dev_dbg(xpc_chan, "local_GP->get changed to %ld, partid=%d, "
2256 "channel=%d\n", get, ch->partid, ch->number);
2257
2258 send_msgrequest = (msg_flags & XPC_M_SN2_INTERRUPT);
2259
2260 /*
2261 * We need to ensure that the message referenced by
2262 * local_GP->get is not XPC_M_SN2_DONE or that local_GP->get
2263 * equals w_local_GP.get, so we'll go have a look.
2264 */
2265 initial_get = get;
2266 }
2267
2268 if (send_msgrequest)
2269 xpc_send_chctl_msgrequest_sn2(ch);
2270}
2271
2272static void
2273xpc_received_payload_sn2(struct xpc_channel *ch, void *payload)
2274{
2275 struct xpc_msg_sn2 *msg;
2276 s64 msg_number;
2277 s64 get;
2278
2279 msg = container_of(payload, struct xpc_msg_sn2, payload);
2280 msg_number = msg->number;
2281
2282 dev_dbg(xpc_chan, "msg=0x%p, msg_number=%ld, partid=%d, channel=%d\n",
2283 (void *)msg, msg_number, ch->partid, ch->number);
2284
2285 DBUG_ON((((u64)msg - (u64)ch->remote_msgqueue) / ch->entry_size) !=
2286 msg_number % ch->remote_nentries);
2287 DBUG_ON(msg->flags & XPC_M_SN2_DONE);
2288
2289 msg->flags |= XPC_M_SN2_DONE;
2290
2291 /*
2292 * The preceding store of msg->flags must occur before the following
2293 * load of local_GP->get.
2294 */
2295 mb();
2296
2297 /*
2298 * See if this message is next in line to be acknowledged as having
2299 * been delivered.
2300 */
2301 get = ch->sn.sn2.local_GP->get;
2302 if (get == msg_number)
2303 xpc_acknowledge_msgs_sn2(ch, get, msg->flags);
2304}
2305
2306int
2307xpc_init_sn2(void)
2308{
2309 int ret;
2310 size_t buf_size;
2311
2312 xpc_setup_partitions_sn = xpc_setup_partitions_sn_sn2;
2313 xpc_get_partition_rsvd_page_pa = xpc_get_partition_rsvd_page_pa_sn2;
2314 xpc_setup_rsvd_page_sn = xpc_setup_rsvd_page_sn_sn2;
2315 xpc_increment_heartbeat = xpc_increment_heartbeat_sn2;
2316 xpc_offline_heartbeat = xpc_offline_heartbeat_sn2;
2317 xpc_online_heartbeat = xpc_online_heartbeat_sn2;
2318 xpc_heartbeat_init = xpc_heartbeat_init_sn2;
2319 xpc_heartbeat_exit = xpc_heartbeat_exit_sn2;
2320 xpc_get_remote_heartbeat = xpc_get_remote_heartbeat_sn2;
2321
2322 xpc_request_partition_activation = xpc_request_partition_activation_sn2;
2323 xpc_request_partition_reactivation =
2324 xpc_request_partition_reactivation_sn2;
2325 xpc_request_partition_deactivation =
2326 xpc_request_partition_deactivation_sn2;
2327 xpc_cancel_partition_deactivation_request =
2328 xpc_cancel_partition_deactivation_request_sn2;
2329
2330 xpc_process_activate_IRQ_rcvd = xpc_process_activate_IRQ_rcvd_sn2;
2331 xpc_setup_ch_structures_sn = xpc_setup_ch_structures_sn_sn2;
2332 xpc_teardown_ch_structures_sn = xpc_teardown_ch_structures_sn_sn2;
2333 xpc_make_first_contact = xpc_make_first_contact_sn2;
2334
2335 xpc_get_chctl_all_flags = xpc_get_chctl_all_flags_sn2;
2336 xpc_send_chctl_closerequest = xpc_send_chctl_closerequest_sn2;
2337 xpc_send_chctl_closereply = xpc_send_chctl_closereply_sn2;
2338 xpc_send_chctl_openrequest = xpc_send_chctl_openrequest_sn2;
2339 xpc_send_chctl_openreply = xpc_send_chctl_openreply_sn2;
2340
2341 xpc_save_remote_msgqueue_pa = xpc_save_remote_msgqueue_pa_sn2;
2342
2343 xpc_setup_msg_structures = xpc_setup_msg_structures_sn2;
2344 xpc_teardown_msg_structures = xpc_teardown_msg_structures_sn2;
2345
2346 xpc_notify_senders_of_disconnect = xpc_notify_senders_of_disconnect_sn2;
2347 xpc_process_msg_chctl_flags = xpc_process_msg_chctl_flags_sn2;
2348 xpc_n_of_deliverable_payloads = xpc_n_of_deliverable_payloads_sn2;
2349 xpc_get_deliverable_payload = xpc_get_deliverable_payload_sn2;
2350
2351 xpc_indicate_partition_engaged = xpc_indicate_partition_engaged_sn2;
2352 xpc_indicate_partition_disengaged =
2353 xpc_indicate_partition_disengaged_sn2;
2354 xpc_partition_engaged = xpc_partition_engaged_sn2;
2355 xpc_any_partition_engaged = xpc_any_partition_engaged_sn2;
2356 xpc_assume_partition_disengaged = xpc_assume_partition_disengaged_sn2;
2357
2358 xpc_send_payload = xpc_send_payload_sn2;
2359 xpc_received_payload = xpc_received_payload_sn2;
2360
2361 if (offsetof(struct xpc_msg_sn2, payload) > XPC_MSG_HDR_MAX_SIZE) {
2362 dev_err(xpc_part, "header portion of struct xpc_msg_sn2 is "
2363 "larger than %d\n", XPC_MSG_HDR_MAX_SIZE);
2364 return -E2BIG;
2365 }
2366
2367 buf_size = max(XPC_RP_VARS_SIZE,
2368 XPC_RP_HEADER_SIZE + XP_NASID_MASK_BYTES_SN2);
2369 xpc_remote_copy_buffer_sn2 = xpc_kmalloc_cacheline_aligned(buf_size,
2370 GFP_KERNEL,
2371 &xpc_remote_copy_buffer_base_sn2);
2372 if (xpc_remote_copy_buffer_sn2 == NULL) {
2373 dev_err(xpc_part, "can't get memory for remote copy buffer\n");
2374 return -ENOMEM;
2375 }
2376
2377 /* open up protections for IPI and [potentially] amo operations */
2378 xpc_allow_IPI_ops_sn2();
2379 xpc_allow_amo_ops_shub_wars_1_1_sn2();
2380
2381 /*
2382 * This is safe to do before the xpc_hb_checker thread has started
2383 * because the handler releases a wait queue. If an interrupt is
2384 * received before the thread is waiting, it will not go to sleep,
2385 * but rather immediately process the interrupt.
2386 */
2387 ret = request_irq(SGI_XPC_ACTIVATE, xpc_handle_activate_IRQ_sn2, 0,
2388 "xpc hb", NULL);
2389 if (ret != 0) {
2390 dev_err(xpc_part, "can't register ACTIVATE IRQ handler, "
2391 "errno=%d\n", -ret);
2392 xpc_disallow_IPI_ops_sn2();
2393 kfree(xpc_remote_copy_buffer_base_sn2);
2394 }
2395 return ret;
2396}
2397
2398void
2399xpc_exit_sn2(void)
2400{
2401 free_irq(SGI_XPC_ACTIVATE, NULL);
2402 xpc_disallow_IPI_ops_sn2();
2403 kfree(xpc_remote_copy_buffer_base_sn2);
2404}
diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c
new file mode 100644
index 000000000000..1ac694c01623
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_uv.c
@@ -0,0 +1,1443 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
7 */
8
9/*
10 * Cross Partition Communication (XPC) uv-based functions.
11 *
12 * Architecture specific implementation of common functions.
13 *
14 */
15
16#include <linux/kernel.h>
17#include <linux/mm.h>
18#include <linux/interrupt.h>
19#include <linux/delay.h>
20#include <linux/device.h>
21#include <asm/uv/uv_hub.h>
22#include "../sgi-gru/gru.h"
23#include "../sgi-gru/grukservices.h"
24#include "xpc.h"
25
26static atomic64_t xpc_heartbeat_uv;
27static DECLARE_BITMAP(xpc_heartbeating_to_mask_uv, XP_MAX_NPARTITIONS_UV);
28
29#define XPC_ACTIVATE_MSG_SIZE_UV (1 * GRU_CACHE_LINE_BYTES)
30#define XPC_NOTIFY_MSG_SIZE_UV (2 * GRU_CACHE_LINE_BYTES)
31
32#define XPC_ACTIVATE_MQ_SIZE_UV (4 * XP_MAX_NPARTITIONS_UV * \
33 XPC_ACTIVATE_MSG_SIZE_UV)
34#define XPC_NOTIFY_MQ_SIZE_UV (4 * XP_MAX_NPARTITIONS_UV * \
35 XPC_NOTIFY_MSG_SIZE_UV)
36
37static void *xpc_activate_mq_uv;
38static void *xpc_notify_mq_uv;
39
40static int
41xpc_setup_partitions_sn_uv(void)
42{
43 short partid;
44 struct xpc_partition_uv *part_uv;
45
46 for (partid = 0; partid < XP_MAX_NPARTITIONS_UV; partid++) {
47 part_uv = &xpc_partitions[partid].sn.uv;
48
49 spin_lock_init(&part_uv->flags_lock);
50 part_uv->remote_act_state = XPC_P_AS_INACTIVE;
51 }
52 return 0;
53}
54
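/*
 * Create a GRU message queue on the node of the specified cpu and register
 * irq_handler for its interrupt. Returns the queue's address or NULL on
 * failure.
 */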
55static void *
56xpc_create_gru_mq_uv(unsigned int mq_size, int cpuid, unsigned int irq,
57 irq_handler_t irq_handler)
58{
59 int ret;
60 int nid;
61 int mq_order;
62 struct page *page;
63 void *mq;
64
65 nid = cpu_to_node(cpuid);
66 mq_order = get_order(mq_size);
67 page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
68 mq_order);
69 if (page == NULL) {
70 dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to alloc %d "
71 "bytes of memory on nid=%d for GRU mq\n", mq_size, nid);
72 return NULL;
73 }
74
75 mq = page_address(page);
76 ret = gru_create_message_queue(mq, mq_size);
77 if (ret != 0) {
78 dev_err(xpc_part, "gru_create_message_queue() returned "
79 "error=%d\n", ret);
80 free_pages((unsigned long)mq, mq_order);
81 return NULL;
82 }
83
84 /* !!! Need to do some other things to set up IRQ */
85
86 ret = request_irq(irq, irq_handler, 0, "xpc", NULL);
87 if (ret != 0) {
88 dev_err(xpc_part, "request_irq(irq=%d) returned error=%d\n",
89 irq, ret);
90 free_pages((unsigned long)mq, mq_order);
91 return NULL;
92 }
93
94 /* !!! enable generation of irq when GRU mq op occurs to this mq */
95
96 /* ??? allow other partitions to access GRU mq? */
97
98 return mq;
99}
100
101static void
102xpc_destroy_gru_mq_uv(void *mq, unsigned int mq_size, unsigned int irq)
103{
104 /* ??? disallow other partitions to access GRU mq? */
105
106 /* !!! disable generation of irq when GRU mq op occurs to this mq */
107
108 free_irq(irq, NULL);
109
110 free_pages((unsigned long)mq, get_order(mq_size));
111}
112
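/*
 * Send a message to the GRU message queue whose global physical address is
 * mq_gpa, retrying on transient queue-full and congestion errors.
 */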
113static enum xp_retval
114xpc_send_gru_msg(unsigned long mq_gpa, void *msg, size_t msg_size)
115{
116 enum xp_retval xp_ret;
117 int ret;
118
119 while (1) {
120 ret = gru_send_message_gpa(mq_gpa, msg, msg_size);
121 if (ret == MQE_OK) {
122 xp_ret = xpSuccess;
123 break;
124 }
125
126 if (ret == MQE_QUEUE_FULL) {
127 dev_dbg(xpc_chan, "gru_send_message_gpa() returned "
128 "error=MQE_QUEUE_FULL\n");
129 /* !!! handle QLimit reached; delay & try again */
130 /* ??? Do we add a limit to the number of retries? */
131 (void)msleep_interruptible(10);
132 } else if (ret == MQE_CONGESTION) {
133 dev_dbg(xpc_chan, "gru_send_message_gpa() returned "
134 "error=MQE_CONGESTION\n");
135 /* !!! handle LB Overflow; simply try again */
136 /* ??? Do we add a limit to the number of retries? */
137 } else {
138 /* !!! Currently this is MQE_UNEXPECTED_CB_ERR */
139 dev_err(xpc_chan, "gru_send_message_gpa() returned "
140 "error=%d\n", ret);
141 xp_ret = xpGruSendMqError;
142 break;
143 }
144 }
145 return xp_ret;
146}
147
148static void
149xpc_process_activate_IRQ_rcvd_uv(void)
150{
151 unsigned long irq_flags;
152 short partid;
153 struct xpc_partition *part;
154 u8 act_state_req;
155
156 DBUG_ON(xpc_activate_IRQ_rcvd == 0);
157
158 spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
159 for (partid = 0; partid < XP_MAX_NPARTITIONS_UV; partid++) {
160 part = &xpc_partitions[partid];
161
162 if (part->sn.uv.act_state_req == 0)
163 continue;
164
165 xpc_activate_IRQ_rcvd--;
166 BUG_ON(xpc_activate_IRQ_rcvd < 0);
167
168 act_state_req = part->sn.uv.act_state_req;
169 part->sn.uv.act_state_req = 0;
170 spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
171
172 if (act_state_req == XPC_P_ASR_ACTIVATE_UV) {
173 if (part->act_state == XPC_P_AS_INACTIVE)
174 xpc_activate_partition(part);
175 else if (part->act_state == XPC_P_AS_DEACTIVATING)
176 XPC_DEACTIVATE_PARTITION(part, xpReactivating);
177
178 } else if (act_state_req == XPC_P_ASR_REACTIVATE_UV) {
179 if (part->act_state == XPC_P_AS_INACTIVE)
180 xpc_activate_partition(part);
181 else
182 XPC_DEACTIVATE_PARTITION(part, xpReactivating);
183
184 } else if (act_state_req == XPC_P_ASR_DEACTIVATE_UV) {
185 XPC_DEACTIVATE_PARTITION(part, part->sn.uv.reason);
186
187 } else {
188 BUG();
189 }
190
191 spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
192 if (xpc_activate_IRQ_rcvd == 0)
193 break;
194 }
195 spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
196
197}
198
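/*
 * Handle a single message pulled off the activate message queue, updating
 * this partition's view of the sender's state and setting the appropriate
 * channel control flags.
 */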
199static void
200xpc_handle_activate_mq_msg_uv(struct xpc_partition *part,
201 struct xpc_activate_mq_msghdr_uv *msg_hdr,
202 int *wakeup_hb_checker)
203{
204 unsigned long irq_flags;
205 struct xpc_partition_uv *part_uv = &part->sn.uv;
206 struct xpc_openclose_args *args;
207
208 part_uv->remote_act_state = msg_hdr->act_state;
209
210 switch (msg_hdr->type) {
211 case XPC_ACTIVATE_MQ_MSG_SYNC_ACT_STATE_UV:
212 /* syncing of remote_act_state was just done above */
213 break;
214
215 case XPC_ACTIVATE_MQ_MSG_INC_HEARTBEAT_UV: {
216 struct xpc_activate_mq_msg_heartbeat_req_uv *msg;
217
218 msg = container_of(msg_hdr,
219 struct xpc_activate_mq_msg_heartbeat_req_uv,
220 hdr);
221 part_uv->heartbeat = msg->heartbeat;
222 break;
223 }
224 case XPC_ACTIVATE_MQ_MSG_OFFLINE_HEARTBEAT_UV: {
225 struct xpc_activate_mq_msg_heartbeat_req_uv *msg;
226
227 msg = container_of(msg_hdr,
228 struct xpc_activate_mq_msg_heartbeat_req_uv,
229 hdr);
230 part_uv->heartbeat = msg->heartbeat;
231
232 spin_lock_irqsave(&part_uv->flags_lock, irq_flags);
233 part_uv->flags |= XPC_P_HEARTBEAT_OFFLINE_UV;
234 spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags);
235 break;
236 }
237 case XPC_ACTIVATE_MQ_MSG_ONLINE_HEARTBEAT_UV: {
238 struct xpc_activate_mq_msg_heartbeat_req_uv *msg;
239
240 msg = container_of(msg_hdr,
241 struct xpc_activate_mq_msg_heartbeat_req_uv,
242 hdr);
243 part_uv->heartbeat = msg->heartbeat;
244
245 spin_lock_irqsave(&part_uv->flags_lock, irq_flags);
246 part_uv->flags &= ~XPC_P_HEARTBEAT_OFFLINE_UV;
247 spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags);
248 break;
249 }
250 case XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV: {
251 struct xpc_activate_mq_msg_activate_req_uv *msg;
252
253 /*
254 * ??? Do we deal here with ts_jiffies being different
255 * ??? if act_state != XPC_P_AS_INACTIVE instead of
256 * ??? below?
257 */
258 msg = container_of(msg_hdr, struct
259 xpc_activate_mq_msg_activate_req_uv, hdr);
260
261 spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
262 if (part_uv->act_state_req == 0)
263 xpc_activate_IRQ_rcvd++;
264 part_uv->act_state_req = XPC_P_ASR_ACTIVATE_UV;
265 part->remote_rp_pa = msg->rp_gpa; /* !!! _pa is _gpa */
266 part->remote_rp_ts_jiffies = msg_hdr->rp_ts_jiffies;
267 part_uv->remote_activate_mq_gpa = msg->activate_mq_gpa;
268 spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
269
270 (*wakeup_hb_checker)++;
271 break;
272 }
273 case XPC_ACTIVATE_MQ_MSG_DEACTIVATE_REQ_UV: {
274 struct xpc_activate_mq_msg_deactivate_req_uv *msg;
275
276 msg = container_of(msg_hdr, struct
277 xpc_activate_mq_msg_deactivate_req_uv, hdr);
278
279 spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
280 if (part_uv->act_state_req == 0)
281 xpc_activate_IRQ_rcvd++;
282 part_uv->act_state_req = XPC_P_ASR_DEACTIVATE_UV;
283 part_uv->reason = msg->reason;
284 spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
285
286 (*wakeup_hb_checker)++;
287 return;
288 }
289 case XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREQUEST_UV: {
290 struct xpc_activate_mq_msg_chctl_closerequest_uv *msg;
291
292 msg = container_of(msg_hdr, struct
293 xpc_activate_mq_msg_chctl_closerequest_uv,
294 hdr);
295 args = &part->remote_openclose_args[msg->ch_number];
296 args->reason = msg->reason;
297
298 spin_lock_irqsave(&part->chctl_lock, irq_flags);
299 part->chctl.flags[msg->ch_number] |= XPC_CHCTL_CLOSEREQUEST;
300 spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
301
302 xpc_wakeup_channel_mgr(part);
303 break;
304 }
305 case XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREPLY_UV: {
306 struct xpc_activate_mq_msg_chctl_closereply_uv *msg;
307
308 msg = container_of(msg_hdr, struct
309 xpc_activate_mq_msg_chctl_closereply_uv,
310 hdr);
311
312 spin_lock_irqsave(&part->chctl_lock, irq_flags);
313 part->chctl.flags[msg->ch_number] |= XPC_CHCTL_CLOSEREPLY;
314 spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
315
316 xpc_wakeup_channel_mgr(part);
317 break;
318 }
319 case XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREQUEST_UV: {
320 struct xpc_activate_mq_msg_chctl_openrequest_uv *msg;
321
322 msg = container_of(msg_hdr, struct
323 xpc_activate_mq_msg_chctl_openrequest_uv,
324 hdr);
325 args = &part->remote_openclose_args[msg->ch_number];
326 args->entry_size = msg->entry_size;
327 args->local_nentries = msg->local_nentries;
328
329 spin_lock_irqsave(&part->chctl_lock, irq_flags);
330 part->chctl.flags[msg->ch_number] |= XPC_CHCTL_OPENREQUEST;
331 spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
332
333 xpc_wakeup_channel_mgr(part);
334 break;
335 }
336 case XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREPLY_UV: {
337 struct xpc_activate_mq_msg_chctl_openreply_uv *msg;
338
339 msg = container_of(msg_hdr, struct
340 xpc_activate_mq_msg_chctl_openreply_uv, hdr);
341 args = &part->remote_openclose_args[msg->ch_number];
342 args->remote_nentries = msg->remote_nentries;
343 args->local_nentries = msg->local_nentries;
344 args->local_msgqueue_pa = msg->local_notify_mq_gpa;
345
346 spin_lock_irqsave(&part->chctl_lock, irq_flags);
347 part->chctl.flags[msg->ch_number] |= XPC_CHCTL_OPENREPLY;
348 spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
349
350 xpc_wakeup_channel_mgr(part);
351 break;
352 }
353 case XPC_ACTIVATE_MQ_MSG_MARK_ENGAGED_UV:
354 spin_lock_irqsave(&part_uv->flags_lock, irq_flags);
355 part_uv->flags |= XPC_P_ENGAGED_UV;
356 spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags);
357 break;
358
359 case XPC_ACTIVATE_MQ_MSG_MARK_DISENGAGED_UV:
360 spin_lock_irqsave(&part_uv->flags_lock, irq_flags);
361 part_uv->flags &= ~XPC_P_ENGAGED_UV;
362 spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags);
363 break;
364
365 default:
366 dev_err(xpc_part, "received unknown activate_mq msg type=%d "
367 "from partition=%d\n", msg_hdr->type, XPC_PARTID(part));
368
369 /* get hb checker to deactivate from the remote partition */
370 spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
371 if (part_uv->act_state_req == 0)
372 xpc_activate_IRQ_rcvd++;
373 part_uv->act_state_req = XPC_P_ASR_DEACTIVATE_UV;
374 part_uv->reason = xpBadMsgType;
375 spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
376
377 (*wakeup_hb_checker)++;
378 return;
379 }
380
381 if (msg_hdr->rp_ts_jiffies != part->remote_rp_ts_jiffies &&
382 part->remote_rp_ts_jiffies != 0) {
383 /*
384 * ??? Does what we do here need to be sensitive to
385 * ??? act_state or remote_act_state?
386 */
387 spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
388 if (part_uv->act_state_req == 0)
389 xpc_activate_IRQ_rcvd++;
390 part_uv->act_state_req = XPC_P_ASR_REACTIVATE_UV;
391 spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
392
393 (*wakeup_hb_checker)++;
394 }
395}
396
397static irqreturn_t
398xpc_handle_activate_IRQ_uv(int irq, void *dev_id)
399{
400 struct xpc_activate_mq_msghdr_uv *msg_hdr;
401 short partid;
402 struct xpc_partition *part;
403 int wakeup_hb_checker = 0;
404
405 while ((msg_hdr = gru_get_next_message(xpc_activate_mq_uv)) != NULL) {
406
407 partid = msg_hdr->partid;
408 if (partid < 0 || partid >= XP_MAX_NPARTITIONS_UV) {
409 dev_err(xpc_part, "xpc_handle_activate_IRQ_uv() "
410 "received invalid partid=0x%x in message\n",
411 partid);
412 } else {
413 part = &xpc_partitions[partid];
414 if (xpc_part_ref(part)) {
415 xpc_handle_activate_mq_msg_uv(part, msg_hdr,
416 &wakeup_hb_checker);
417 xpc_part_deref(part);
418 }
419 }
420
421 gru_free_message(xpc_activate_mq_uv, msg_hdr);
422 }
423
424 if (wakeup_hb_checker)
425 wake_up_interruptible(&xpc_activate_IRQ_wq);
426
427 return IRQ_HANDLED;
428}
429
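/*
 * Fill in the common activate message header and send the message to the
 * remote partition's activate message queue.
 */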
430static enum xp_retval
431xpc_send_activate_IRQ_uv(struct xpc_partition *part, void *msg, size_t msg_size,
432 int msg_type)
433{
434 struct xpc_activate_mq_msghdr_uv *msg_hdr = msg;
435
436 DBUG_ON(msg_size > XPC_ACTIVATE_MSG_SIZE_UV);
437
438 msg_hdr->type = msg_type;
439 msg_hdr->partid = XPC_PARTID(part);
440 msg_hdr->act_state = part->act_state;
441 msg_hdr->rp_ts_jiffies = xpc_rsvd_page->ts_jiffies;
442
443 /* ??? Is holding a spin_lock (ch->lock) during this call a bad idea? */
444 return xpc_send_gru_msg(part->sn.uv.remote_activate_mq_gpa, msg,
445 msg_size);
446}
447
448static void
449xpc_send_activate_IRQ_part_uv(struct xpc_partition *part, void *msg,
450 size_t msg_size, int msg_type)
451{
452 enum xp_retval ret;
453
454 ret = xpc_send_activate_IRQ_uv(part, msg, msg_size, msg_type);
455 if (unlikely(ret != xpSuccess))
456 XPC_DEACTIVATE_PARTITION(part, ret);
457}
458
459static void
460xpc_send_activate_IRQ_ch_uv(struct xpc_channel *ch, unsigned long *irq_flags,
461 void *msg, size_t msg_size, int msg_type)
462{
463	struct xpc_partition *part = &xpc_partitions[ch->partid];
464 enum xp_retval ret;
465
466 ret = xpc_send_activate_IRQ_uv(part, msg, msg_size, msg_type);
467 if (unlikely(ret != xpSuccess)) {
468 if (irq_flags != NULL)
469 spin_unlock_irqrestore(&ch->lock, *irq_flags);
470
471 XPC_DEACTIVATE_PARTITION(part, ret);
472
473 if (irq_flags != NULL)
474 spin_lock_irqsave(&ch->lock, *irq_flags);
475 }
476}
477
478static void
479xpc_send_local_activate_IRQ_uv(struct xpc_partition *part, int act_state_req)
480{
481 unsigned long irq_flags;
482 struct xpc_partition_uv *part_uv = &part->sn.uv;
483
484 /*
485	 * !!! Make our side think that the remote partition sent an activate
486 * !!! message our way by doing what the activate IRQ handler would
487 * !!! do had one really been sent.
488 */
489
490 spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
491 if (part_uv->act_state_req == 0)
492 xpc_activate_IRQ_rcvd++;
493 part_uv->act_state_req = act_state_req;
494 spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
495
496 wake_up_interruptible(&xpc_activate_IRQ_wq);
497}
498
499static enum xp_retval
500xpc_get_partition_rsvd_page_pa_uv(void *buf, u64 *cookie, unsigned long *rp_pa,
501 size_t *len)
502{
503 /* !!! call the UV version of sn_partition_reserved_page_pa() */
504 return xpUnsupported;
505}
506
507static int
508xpc_setup_rsvd_page_sn_uv(struct xpc_rsvd_page *rp)
509{
510 rp->sn.activate_mq_gpa = uv_gpa(xpc_activate_mq_uv);
511 return 0;
512}
513
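/*
 * Send a heartbeat message of the specified type to every partition that we
 * are currently heartbeating to.
 */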
514static void
515xpc_send_heartbeat_uv(int msg_type)
516{
517 short partid;
518 struct xpc_partition *part;
519 struct xpc_activate_mq_msg_heartbeat_req_uv msg;
520
521 /*
522	 * !!! On uv we broadcast a heartbeat message every 5 seconds, whereas
523	 * !!! on sn2 we bte_copy the heartbeat info every 20 seconds. This is
524	 * !!! an increase in numalink traffic.
525 * ??? Is this good?
526 */
527
528 msg.heartbeat = atomic64_inc_return(&xpc_heartbeat_uv);
529
530 partid = find_first_bit(xpc_heartbeating_to_mask_uv,
531 XP_MAX_NPARTITIONS_UV);
532
533 while (partid < XP_MAX_NPARTITIONS_UV) {
534 part = &xpc_partitions[partid];
535
536 xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg),
537 msg_type);
538
539 partid = find_next_bit(xpc_heartbeating_to_mask_uv,
540 XP_MAX_NPARTITIONS_UV, partid + 1);
541 }
542}
543
544static void
545xpc_increment_heartbeat_uv(void)
546{
547 xpc_send_heartbeat_uv(XPC_ACTIVATE_MQ_MSG_INC_HEARTBEAT_UV);
548}
549
550static void
551xpc_offline_heartbeat_uv(void)
552{
553 xpc_send_heartbeat_uv(XPC_ACTIVATE_MQ_MSG_OFFLINE_HEARTBEAT_UV);
554}
555
556static void
557xpc_online_heartbeat_uv(void)
558{
559 xpc_send_heartbeat_uv(XPC_ACTIVATE_MQ_MSG_ONLINE_HEARTBEAT_UV);
560}
561
562static void
563xpc_heartbeat_init_uv(void)
564{
565 atomic64_set(&xpc_heartbeat_uv, 0);
566 bitmap_zero(xpc_heartbeating_to_mask_uv, XP_MAX_NPARTITIONS_UV);
567 xpc_heartbeating_to_mask = &xpc_heartbeating_to_mask_uv[0];
568}
569
570static void
571xpc_heartbeat_exit_uv(void)
572{
573 xpc_send_heartbeat_uv(XPC_ACTIVATE_MQ_MSG_OFFLINE_HEARTBEAT_UV);
574}
575
576static enum xp_retval
577xpc_get_remote_heartbeat_uv(struct xpc_partition *part)
578{
579 struct xpc_partition_uv *part_uv = &part->sn.uv;
580 enum xp_retval ret = xpNoHeartbeat;
581
582 if (part_uv->remote_act_state != XPC_P_AS_INACTIVE &&
583 part_uv->remote_act_state != XPC_P_AS_DEACTIVATING) {
584
585 if (part_uv->heartbeat != part->last_heartbeat ||
586 (part_uv->flags & XPC_P_HEARTBEAT_OFFLINE_UV)) {
587
588 part->last_heartbeat = part_uv->heartbeat;
589 ret = xpSuccess;
590 }
591 }
592 return ret;
593}
594
595static void
596xpc_request_partition_activation_uv(struct xpc_rsvd_page *remote_rp,
597 unsigned long remote_rp_gpa, int nasid)
598{
599 short partid = remote_rp->SAL_partid;
600 struct xpc_partition *part = &xpc_partitions[partid];
601 struct xpc_activate_mq_msg_activate_req_uv msg;
602
603 part->remote_rp_pa = remote_rp_gpa; /* !!! _pa here is really _gpa */
604 part->remote_rp_ts_jiffies = remote_rp->ts_jiffies;
605 part->sn.uv.remote_activate_mq_gpa = remote_rp->sn.activate_mq_gpa;
606
607 /*
608 * ??? Is it a good idea to make this conditional on what is
609 * ??? potentially stale state information?
610 */
611 if (part->sn.uv.remote_act_state == XPC_P_AS_INACTIVE) {
612 msg.rp_gpa = uv_gpa(xpc_rsvd_page);
613 msg.activate_mq_gpa = xpc_rsvd_page->sn.activate_mq_gpa;
614 xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg),
615 XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV);
616 }
617
618 if (part->act_state == XPC_P_AS_INACTIVE)
619 xpc_send_local_activate_IRQ_uv(part, XPC_P_ASR_ACTIVATE_UV);
620}
621
622static void
623xpc_request_partition_reactivation_uv(struct xpc_partition *part)
624{
625 xpc_send_local_activate_IRQ_uv(part, XPC_P_ASR_ACTIVATE_UV);
626}
627
628static void
629xpc_request_partition_deactivation_uv(struct xpc_partition *part)
630{
631 struct xpc_activate_mq_msg_deactivate_req_uv msg;
632
633 /*
634 * ??? Is it a good idea to make this conditional on what is
635 * ??? potentially stale state information?
636 */
637 if (part->sn.uv.remote_act_state != XPC_P_AS_DEACTIVATING &&
638 part->sn.uv.remote_act_state != XPC_P_AS_INACTIVE) {
639
640 msg.reason = part->reason;
641 xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg),
642 XPC_ACTIVATE_MQ_MSG_DEACTIVATE_REQ_UV);
643 }
644}
645
646static void
647xpc_cancel_partition_deactivation_request_uv(struct xpc_partition *part)
648{
649 /* nothing needs to be done */
650 return;
651}
652
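/*
 * The following are simple spinlock-protected FIFOs used for the per-channel
 * free message slot list and received message list.
 */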
653static void
654xpc_init_fifo_uv(struct xpc_fifo_head_uv *head)
655{
656 head->first = NULL;
657 head->last = NULL;
658 spin_lock_init(&head->lock);
659 head->n_entries = 0;
660}
661
662static void *
663xpc_get_fifo_entry_uv(struct xpc_fifo_head_uv *head)
664{
665 unsigned long irq_flags;
666 struct xpc_fifo_entry_uv *first;
667
668 spin_lock_irqsave(&head->lock, irq_flags);
669 first = head->first;
670	if (head->first != NULL) {
671		head->first = first->next;
672		if (head->first == NULL)
673			head->last = NULL;
674		head->n_entries--;
675		first->next = NULL;
676	}
677	spin_unlock_irqrestore(&head->lock, irq_flags);
678	return first;
679}
680
681static void
682xpc_put_fifo_entry_uv(struct xpc_fifo_head_uv *head,
683 struct xpc_fifo_entry_uv *last)
684{
685 unsigned long irq_flags;
686
687 last->next = NULL;
688 spin_lock_irqsave(&head->lock, irq_flags);
689 if (head->last != NULL)
690 head->last->next = last;
691 else
692 head->first = last;
693 head->last = last;
694	head->n_entries++;
695	BUG_ON(head->n_entries <= 0);
696 spin_unlock_irqrestore(&head->lock, irq_flags);
697}
698
699static int
700xpc_n_of_fifo_entries_uv(struct xpc_fifo_head_uv *head)
701{
702 return head->n_entries;
703}
704
705/*
706 * Setup the channel structures that are uv specific.
707 */
708static enum xp_retval
709xpc_setup_ch_structures_sn_uv(struct xpc_partition *part)
710{
711 struct xpc_channel_uv *ch_uv;
712 int ch_number;
713
714 for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
715 ch_uv = &part->channels[ch_number].sn.uv;
716
717 xpc_init_fifo_uv(&ch_uv->msg_slot_free_list);
718 xpc_init_fifo_uv(&ch_uv->recv_msg_list);
719 }
720
721 return xpSuccess;
722}
723
724/*
725 * Teardown the channel structures that are uv specific.
726 */
727static void
728xpc_teardown_ch_structures_sn_uv(struct xpc_partition *part)
729{
730 /* nothing needs to be done */
731 return;
732}
733
734static enum xp_retval
735xpc_make_first_contact_uv(struct xpc_partition *part)
736{
737 struct xpc_activate_mq_msg_uv msg;
738
739 /*
740 * We send a sync msg to get the remote partition's remote_act_state
741 * updated to our current act_state which at this point should
742 * be XPC_P_AS_ACTIVATING.
743 */
744 xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg),
745 XPC_ACTIVATE_MQ_MSG_SYNC_ACT_STATE_UV);
746
747 while (part->sn.uv.remote_act_state != XPC_P_AS_ACTIVATING) {
748
749 dev_dbg(xpc_part, "waiting to make first contact with "
750 "partition %d\n", XPC_PARTID(part));
751
752 /* wait a 1/4 of a second or so */
753 (void)msleep_interruptible(250);
754
755 if (part->act_state == XPC_P_AS_DEACTIVATING)
756 return part->reason;
757 }
758
759 return xpSuccess;
760}
761
762static u64
763xpc_get_chctl_all_flags_uv(struct xpc_partition *part)
764{
765 unsigned long irq_flags;
766 union xpc_channel_ctl_flags chctl;
767
768 spin_lock_irqsave(&part->chctl_lock, irq_flags);
769 chctl = part->chctl;
770 if (chctl.all_flags != 0)
771 part->chctl.all_flags = 0;
772
773 spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
774 return chctl.all_flags;
775}
776
777static enum xp_retval
778xpc_allocate_send_msg_slot_uv(struct xpc_channel *ch)
779{
780 struct xpc_channel_uv *ch_uv = &ch->sn.uv;
781 struct xpc_send_msg_slot_uv *msg_slot;
782 unsigned long irq_flags;
783 int nentries;
784 int entry;
785 size_t nbytes;
786
787 for (nentries = ch->local_nentries; nentries > 0; nentries--) {
788 nbytes = nentries * sizeof(struct xpc_send_msg_slot_uv);
789 ch_uv->send_msg_slots = kzalloc(nbytes, GFP_KERNEL);
790 if (ch_uv->send_msg_slots == NULL)
791 continue;
792
793 for (entry = 0; entry < nentries; entry++) {
794 msg_slot = &ch_uv->send_msg_slots[entry];
795
796 msg_slot->msg_slot_number = entry;
797 xpc_put_fifo_entry_uv(&ch_uv->msg_slot_free_list,
798 &msg_slot->next);
799 }
800
801 spin_lock_irqsave(&ch->lock, irq_flags);
802 if (nentries < ch->local_nentries)
803 ch->local_nentries = nentries;
804 spin_unlock_irqrestore(&ch->lock, irq_flags);
805 return xpSuccess;
806 }
807
808 return xpNoMemory;
809}
810
811static enum xp_retval
812xpc_allocate_recv_msg_slot_uv(struct xpc_channel *ch)
813{
814 struct xpc_channel_uv *ch_uv = &ch->sn.uv;
815 struct xpc_notify_mq_msg_uv *msg_slot;
816 unsigned long irq_flags;
817 int nentries;
818 int entry;
819 size_t nbytes;
820
821 for (nentries = ch->remote_nentries; nentries > 0; nentries--) {
822 nbytes = nentries * ch->entry_size;
823 ch_uv->recv_msg_slots = kzalloc(nbytes, GFP_KERNEL);
824 if (ch_uv->recv_msg_slots == NULL)
825 continue;
826
827 for (entry = 0; entry < nentries; entry++) {
828 msg_slot = ch_uv->recv_msg_slots + entry *
829 ch->entry_size;
830
831 msg_slot->hdr.msg_slot_number = entry;
832 }
833
834 spin_lock_irqsave(&ch->lock, irq_flags);
835 if (nentries < ch->remote_nentries)
836 ch->remote_nentries = nentries;
837 spin_unlock_irqrestore(&ch->lock, irq_flags);
838 return xpSuccess;
839 }
840
841 return xpNoMemory;
842}
843
844/*
845 * Allocate msg_slots associated with the channel.
846 */
847static enum xp_retval
848xpc_setup_msg_structures_uv(struct xpc_channel *ch)
849{
850	enum xp_retval ret;
851 struct xpc_channel_uv *ch_uv = &ch->sn.uv;
852
853 DBUG_ON(ch->flags & XPC_C_SETUP);
854
855 ret = xpc_allocate_send_msg_slot_uv(ch);
856 if (ret == xpSuccess) {
857
858 ret = xpc_allocate_recv_msg_slot_uv(ch);
859 if (ret != xpSuccess) {
860 kfree(ch_uv->send_msg_slots);
861 xpc_init_fifo_uv(&ch_uv->msg_slot_free_list);
862 }
863 }
864 return ret;
865}
866
867/*
868 * Free up msg_slots and clear other state that was set up for the specified
869 * channel.
870 */
871static void
872xpc_teardown_msg_structures_uv(struct xpc_channel *ch)
873{
874 struct xpc_channel_uv *ch_uv = &ch->sn.uv;
875
876 DBUG_ON(!spin_is_locked(&ch->lock));
877
878 ch_uv->remote_notify_mq_gpa = 0;
879
880 if (ch->flags & XPC_C_SETUP) {
881 xpc_init_fifo_uv(&ch_uv->msg_slot_free_list);
882 kfree(ch_uv->send_msg_slots);
883 xpc_init_fifo_uv(&ch_uv->recv_msg_list);
884 kfree(ch_uv->recv_msg_slots);
885 }
886}
887
888static void
889xpc_send_chctl_closerequest_uv(struct xpc_channel *ch, unsigned long *irq_flags)
890{
891 struct xpc_activate_mq_msg_chctl_closerequest_uv msg;
892
893 msg.ch_number = ch->number;
894 msg.reason = ch->reason;
895 xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg),
896 XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREQUEST_UV);
897}
898
899static void
900xpc_send_chctl_closereply_uv(struct xpc_channel *ch, unsigned long *irq_flags)
901{
902 struct xpc_activate_mq_msg_chctl_closereply_uv msg;
903
904 msg.ch_number = ch->number;
905 xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg),
906 XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREPLY_UV);
907}
908
909static void
910xpc_send_chctl_openrequest_uv(struct xpc_channel *ch, unsigned long *irq_flags)
911{
912 struct xpc_activate_mq_msg_chctl_openrequest_uv msg;
913
914 msg.ch_number = ch->number;
915 msg.entry_size = ch->entry_size;
916 msg.local_nentries = ch->local_nentries;
917 xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg),
918 XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREQUEST_UV);
919}
920
921static void
922xpc_send_chctl_openreply_uv(struct xpc_channel *ch, unsigned long *irq_flags)
923{
924 struct xpc_activate_mq_msg_chctl_openreply_uv msg;
925
926 msg.ch_number = ch->number;
927 msg.local_nentries = ch->local_nentries;
928 msg.remote_nentries = ch->remote_nentries;
929 msg.local_notify_mq_gpa = uv_gpa(xpc_notify_mq_uv);
930 xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg),
931 XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREPLY_UV);
932}
933
934static void
935xpc_send_chctl_local_msgrequest_uv(struct xpc_partition *part, int ch_number)
936{
937 unsigned long irq_flags;
938
939 spin_lock_irqsave(&part->chctl_lock, irq_flags);
940 part->chctl.flags[ch_number] |= XPC_CHCTL_MSGREQUEST;
941 spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
942
943 xpc_wakeup_channel_mgr(part);
944}
945
946static void
947xpc_save_remote_msgqueue_pa_uv(struct xpc_channel *ch,
948 unsigned long msgqueue_pa)
949{
950 ch->sn.uv.remote_notify_mq_gpa = msgqueue_pa;
951}
952
953static void
954xpc_indicate_partition_engaged_uv(struct xpc_partition *part)
955{
956 struct xpc_activate_mq_msg_uv msg;
957
958 xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg),
959 XPC_ACTIVATE_MQ_MSG_MARK_ENGAGED_UV);
960}
961
962static void
963xpc_indicate_partition_disengaged_uv(struct xpc_partition *part)
964{
965 struct xpc_activate_mq_msg_uv msg;
966
967 xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg),
968 XPC_ACTIVATE_MQ_MSG_MARK_DISENGAGED_UV);
969}
970
971static void
972xpc_assume_partition_disengaged_uv(short partid)
973{
974 struct xpc_partition_uv *part_uv = &xpc_partitions[partid].sn.uv;
975 unsigned long irq_flags;
976
977 spin_lock_irqsave(&part_uv->flags_lock, irq_flags);
978 part_uv->flags &= ~XPC_P_ENGAGED_UV;
979 spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags);
980}
981
982static int
983xpc_partition_engaged_uv(short partid)
984{
985 return (xpc_partitions[partid].sn.uv.flags & XPC_P_ENGAGED_UV) != 0;
986}
987
988static int
989xpc_any_partition_engaged_uv(void)
990{
991 struct xpc_partition_uv *part_uv;
992 short partid;
993
994 for (partid = 0; partid < XP_MAX_NPARTITIONS_UV; partid++) {
995 part_uv = &xpc_partitions[partid].sn.uv;
996 if ((part_uv->flags & XPC_P_ENGAGED_UV) != 0)
997 return 1;
998 }
999 return 0;
1000}
1001
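/*
 * Allocate a send-side message slot for the channel, waiting for one to be
 * freed unless XPC_NOWAIT was specified.
 */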
1002static enum xp_retval
1003xpc_allocate_msg_slot_uv(struct xpc_channel *ch, u32 flags,
1004 struct xpc_send_msg_slot_uv **address_of_msg_slot)
1005{
1006 enum xp_retval ret;
1007 struct xpc_send_msg_slot_uv *msg_slot;
1008 struct xpc_fifo_entry_uv *entry;
1009
1010 while (1) {
1011 entry = xpc_get_fifo_entry_uv(&ch->sn.uv.msg_slot_free_list);
1012 if (entry != NULL)
1013 break;
1014
1015 if (flags & XPC_NOWAIT)
1016 return xpNoWait;
1017
1018 ret = xpc_allocate_msg_wait(ch);
1019 if (ret != xpInterrupted && ret != xpTimeout)
1020 return ret;
1021 }
1022
1023 msg_slot = container_of(entry, struct xpc_send_msg_slot_uv, next);
1024 *address_of_msg_slot = msg_slot;
1025 return xpSuccess;
1026}
1027
1028static void
1029xpc_free_msg_slot_uv(struct xpc_channel *ch,
1030 struct xpc_send_msg_slot_uv *msg_slot)
1031{
1032 xpc_put_fifo_entry_uv(&ch->sn.uv.msg_slot_free_list, &msg_slot->next);
1033
1034 /* wakeup anyone waiting for a free msg slot */
1035 if (atomic_read(&ch->n_on_msg_allocate_wq) > 0)
1036 wake_up(&ch->msg_allocate_wq);
1037}
1038
1039static void
1040xpc_notify_sender_uv(struct xpc_channel *ch,
1041 struct xpc_send_msg_slot_uv *msg_slot,
1042 enum xp_retval reason)
1043{
1044 xpc_notify_func func = msg_slot->func;
1045
1046 if (func != NULL && cmpxchg(&msg_slot->func, func, NULL) == func) {
1047
1048 atomic_dec(&ch->n_to_notify);
1049
1050 dev_dbg(xpc_chan, "msg_slot->func() called, msg_slot=0x%p "
1051 "msg_slot_number=%d partid=%d channel=%d\n", msg_slot,
1052 msg_slot->msg_slot_number, ch->partid, ch->number);
1053
1054 func(reason, ch->partid, ch->number, msg_slot->key);
1055
1056 dev_dbg(xpc_chan, "msg_slot->func() returned, msg_slot=0x%p "
1057 "msg_slot_number=%d partid=%d channel=%d\n", msg_slot,
1058 msg_slot->msg_slot_number, ch->partid, ch->number);
1059 }
1060}
1061
1062static void
1063xpc_handle_notify_mq_ack_uv(struct xpc_channel *ch,
1064 struct xpc_notify_mq_msg_uv *msg)
1065{
1066 struct xpc_send_msg_slot_uv *msg_slot;
1067 int entry = msg->hdr.msg_slot_number % ch->local_nentries;
1068
1069 msg_slot = &ch->sn.uv.send_msg_slots[entry];
1070
1071 BUG_ON(msg_slot->msg_slot_number != msg->hdr.msg_slot_number);
1072 msg_slot->msg_slot_number += ch->local_nentries;
1073
1074 if (msg_slot->func != NULL)
1075 xpc_notify_sender_uv(ch, msg_slot, xpMsgDelivered);
1076
1077 xpc_free_msg_slot_uv(ch, msg_slot);
1078}
1079
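/*
 * Handle a message arriving on the notify message queue: either an ACK for a
 * message we previously sent or a new payload to be delivered on one of our
 * channels.
 */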
1080static void
1081xpc_handle_notify_mq_msg_uv(struct xpc_partition *part,
1082 struct xpc_notify_mq_msg_uv *msg)
1083{
1084 struct xpc_partition_uv *part_uv = &part->sn.uv;
1085 struct xpc_channel *ch;
1086 struct xpc_channel_uv *ch_uv;
1087 struct xpc_notify_mq_msg_uv *msg_slot;
1088 unsigned long irq_flags;
1089 int ch_number = msg->hdr.ch_number;
1090
1091 if (unlikely(ch_number >= part->nchannels)) {
1092 dev_err(xpc_part, "xpc_handle_notify_IRQ_uv() received invalid "
1093 "channel number=0x%x in message from partid=%d\n",
1094 ch_number, XPC_PARTID(part));
1095
1096 /* get hb checker to deactivate from the remote partition */
1097 spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
1098 if (part_uv->act_state_req == 0)
1099 xpc_activate_IRQ_rcvd++;
1100 part_uv->act_state_req = XPC_P_ASR_DEACTIVATE_UV;
1101 part_uv->reason = xpBadChannelNumber;
1102 spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
1103
1104 wake_up_interruptible(&xpc_activate_IRQ_wq);
1105 return;
1106 }
1107
1108 ch = &part->channels[ch_number];
1109 xpc_msgqueue_ref(ch);
1110
1111 if (!(ch->flags & XPC_C_CONNECTED)) {
1112 xpc_msgqueue_deref(ch);
1113 return;
1114 }
1115
1116 /* see if we're really dealing with an ACK for a previously sent msg */
1117 if (msg->hdr.size == 0) {
1118 xpc_handle_notify_mq_ack_uv(ch, msg);
1119 xpc_msgqueue_deref(ch);
1120 return;
1121 }
1122
1123 /* we're dealing with a normal message sent via the notify_mq */
1124 ch_uv = &ch->sn.uv;
1125
1126 msg_slot = (struct xpc_notify_mq_msg_uv *)((u64)ch_uv->recv_msg_slots +
1127 (msg->hdr.msg_slot_number % ch->remote_nentries) *
1128 ch->entry_size);
1129
1130 BUG_ON(msg->hdr.msg_slot_number != msg_slot->hdr.msg_slot_number);
1131 BUG_ON(msg_slot->hdr.size != 0);
1132
1133 memcpy(msg_slot, msg, msg->hdr.size);
1134
1135 xpc_put_fifo_entry_uv(&ch_uv->recv_msg_list, &msg_slot->hdr.u.next);
1136
1137 if (ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) {
1138 /*
1139 * If there is an existing idle kthread get it to deliver
1140 * the payload, otherwise we'll have to get the channel mgr
1141 * for this partition to create a kthread to do the delivery.
1142 */
1143 if (atomic_read(&ch->kthreads_idle) > 0)
1144 wake_up_nr(&ch->idle_wq, 1);
1145 else
1146 xpc_send_chctl_local_msgrequest_uv(part, ch->number);
1147 }
1148 xpc_msgqueue_deref(ch);
1149}
1150
1151static irqreturn_t
1152xpc_handle_notify_IRQ_uv(int irq, void *dev_id)
1153{
1154 struct xpc_notify_mq_msg_uv *msg;
1155 short partid;
1156 struct xpc_partition *part;
1157
1158 while ((msg = gru_get_next_message(xpc_notify_mq_uv)) != NULL) {
1159
1160 partid = msg->hdr.partid;
1161 if (partid < 0 || partid >= XP_MAX_NPARTITIONS_UV) {
1162 dev_err(xpc_part, "xpc_handle_notify_IRQ_uv() received "
1163 "invalid partid=0x%x in message\n", partid);
1164 } else {
1165 part = &xpc_partitions[partid];
1166
1167 if (xpc_part_ref(part)) {
1168 xpc_handle_notify_mq_msg_uv(part, msg);
1169 xpc_part_deref(part);
1170 }
1171 }
1172
1173 gru_free_message(xpc_notify_mq_uv, msg);
1174 }
1175
1176 return IRQ_HANDLED;
1177}
1178
1179static int
1180xpc_n_of_deliverable_payloads_uv(struct xpc_channel *ch)
1181{
1182 return xpc_n_of_fifo_entries_uv(&ch->sn.uv.recv_msg_list);
1183}
1184
1185static void
1186xpc_process_msg_chctl_flags_uv(struct xpc_partition *part, int ch_number)
1187{
1188 struct xpc_channel *ch = &part->channels[ch_number];
1189 int ndeliverable_payloads;
1190
1191 xpc_msgqueue_ref(ch);
1192
1193 ndeliverable_payloads = xpc_n_of_deliverable_payloads_uv(ch);
1194
1195 if (ndeliverable_payloads > 0 &&
1196 (ch->flags & XPC_C_CONNECTED) &&
1197 (ch->flags & XPC_C_CONNECTEDCALLOUT_MADE)) {
1198
1199 xpc_activate_kthreads(ch, ndeliverable_payloads);
1200 }
1201
1202 xpc_msgqueue_deref(ch);
1203}
1204
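/*
 * Copy the payload into a notify_mq message and send it to the remote
 * partition's notify message queue, optionally arranging for a notification
 * callout once the message has been delivered.
 */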
1205static enum xp_retval
1206xpc_send_payload_uv(struct xpc_channel *ch, u32 flags, void *payload,
1207 u16 payload_size, u8 notify_type, xpc_notify_func func,
1208 void *key)
1209{
1210 enum xp_retval ret = xpSuccess;
1211 struct xpc_send_msg_slot_uv *msg_slot = NULL;
1212 struct xpc_notify_mq_msg_uv *msg;
1213 u8 msg_buffer[XPC_NOTIFY_MSG_SIZE_UV];
1214 size_t msg_size;
1215
1216 DBUG_ON(notify_type != XPC_N_CALL);
1217
1218 msg_size = sizeof(struct xpc_notify_mq_msghdr_uv) + payload_size;
1219 if (msg_size > ch->entry_size)
1220 return xpPayloadTooBig;
1221
1222 xpc_msgqueue_ref(ch);
1223
1224 if (ch->flags & XPC_C_DISCONNECTING) {
1225 ret = ch->reason;
1226 goto out_1;
1227 }
1228 if (!(ch->flags & XPC_C_CONNECTED)) {
1229 ret = xpNotConnected;
1230 goto out_1;
1231 }
1232
1233 ret = xpc_allocate_msg_slot_uv(ch, flags, &msg_slot);
1234 if (ret != xpSuccess)
1235 goto out_1;
1236
1237 if (func != NULL) {
1238 atomic_inc(&ch->n_to_notify);
1239
1240 msg_slot->key = key;
1241 wmb(); /* a non-NULL func must hit memory after the key */
1242 msg_slot->func = func;
1243
1244 if (ch->flags & XPC_C_DISCONNECTING) {
1245 ret = ch->reason;
1246 goto out_2;
1247 }
1248 }
1249
1250 msg = (struct xpc_notify_mq_msg_uv *)&msg_buffer;
1251 msg->hdr.partid = xp_partition_id;
1252 msg->hdr.ch_number = ch->number;
1253 msg->hdr.size = msg_size;
1254 msg->hdr.msg_slot_number = msg_slot->msg_slot_number;
1255 memcpy(&msg->payload, payload, payload_size);
1256
1257 ret = xpc_send_gru_msg(ch->sn.uv.remote_notify_mq_gpa, msg, msg_size);
1258 if (ret == xpSuccess)
1259 goto out_1;
1260
1261 XPC_DEACTIVATE_PARTITION(&xpc_partitions[ch->partid], ret);
1262out_2:
1263 if (func != NULL) {
1264 /*
1265 * Try to NULL the msg_slot's func field. If we fail, then
1266 * xpc_notify_senders_of_disconnect_uv() beat us to it, in which
1267 		 * case we need to pretend we succeeded in sending the message,
1268 		 * since the user will get a callout for the disconnect error
1269 		 * from xpc_notify_senders_of_disconnect_uv() and returning an
1270 		 * error here as well would only confuse them. Additionally,
1271 		 * since the channel is being disconnected we don't need to put
1272 		 * the msg_slot back on the free list.
1273 */
1274 if (cmpxchg(&msg_slot->func, func, NULL) != func) {
1275 ret = xpSuccess;
1276 goto out_1;
1277 }
1278
1279 msg_slot->key = NULL;
1280 atomic_dec(&ch->n_to_notify);
1281 }
1282 xpc_free_msg_slot_uv(ch, msg_slot);
1283out_1:
1284 xpc_msgqueue_deref(ch);
1285 return ret;
1286}
1287
1288/*
1289 * Tell the callers of xpc_send_notify() that the status of their payloads
1290 * is unknown because the channel is now disconnecting.
1291 *
1292 * We don't worry about putting these msg_slots on the free list since the
1293 * msg_slots themselves are about to be kfree'd.
1294 */
1295static void
1296xpc_notify_senders_of_disconnect_uv(struct xpc_channel *ch)
1297{
1298 struct xpc_send_msg_slot_uv *msg_slot;
1299 int entry;
1300
1301 DBUG_ON(!(ch->flags & XPC_C_DISCONNECTING));
1302
1303 for (entry = 0; entry < ch->local_nentries; entry++) {
1304
1305 if (atomic_read(&ch->n_to_notify) == 0)
1306 break;
1307
1308 msg_slot = &ch->sn.uv.send_msg_slots[entry];
1309 if (msg_slot->func != NULL)
1310 xpc_notify_sender_uv(ch, msg_slot, ch->reason);
1311 }
1312}
1313
1314/*
1315 * Get the next deliverable message's payload.
1316 */
1317static void *
1318xpc_get_deliverable_payload_uv(struct xpc_channel *ch)
1319{
1320 struct xpc_fifo_entry_uv *entry;
1321 struct xpc_notify_mq_msg_uv *msg;
1322 void *payload = NULL;
1323
1324 if (!(ch->flags & XPC_C_DISCONNECTING)) {
1325 entry = xpc_get_fifo_entry_uv(&ch->sn.uv.recv_msg_list);
1326 if (entry != NULL) {
1327 msg = container_of(entry, struct xpc_notify_mq_msg_uv,
1328 hdr.u.next);
1329 payload = &msg->payload;
1330 }
1331 }
1332 return payload;
1333}
1334
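/*
 * Acknowledge receipt of the specified payload by returning a zero-sized
 * message (an ACK) to the partition that sent it.
 */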
1335static void
1336xpc_received_payload_uv(struct xpc_channel *ch, void *payload)
1337{
1338 struct xpc_notify_mq_msg_uv *msg;
1339 enum xp_retval ret;
1340
1341 msg = container_of(payload, struct xpc_notify_mq_msg_uv, payload);
1342
1343 /* return an ACK to the sender of this message */
1344
1345 msg->hdr.partid = xp_partition_id;
1346 msg->hdr.size = 0; /* size of zero indicates this is an ACK */
1347
1348 ret = xpc_send_gru_msg(ch->sn.uv.remote_notify_mq_gpa, msg,
1349 sizeof(struct xpc_notify_mq_msghdr_uv));
1350 if (ret != xpSuccess)
1351 XPC_DEACTIVATE_PARTITION(&xpc_partitions[ch->partid], ret);
1352
1353 msg->hdr.msg_slot_number += ch->remote_nentries;
1354}
1355
1356int
1357xpc_init_uv(void)
1358{
1359 xpc_setup_partitions_sn = xpc_setup_partitions_sn_uv;
1360 xpc_process_activate_IRQ_rcvd = xpc_process_activate_IRQ_rcvd_uv;
1361 xpc_get_partition_rsvd_page_pa = xpc_get_partition_rsvd_page_pa_uv;
1362 xpc_setup_rsvd_page_sn = xpc_setup_rsvd_page_sn_uv;
1363 xpc_increment_heartbeat = xpc_increment_heartbeat_uv;
1364 xpc_offline_heartbeat = xpc_offline_heartbeat_uv;
1365 xpc_online_heartbeat = xpc_online_heartbeat_uv;
1366 xpc_heartbeat_init = xpc_heartbeat_init_uv;
1367 xpc_heartbeat_exit = xpc_heartbeat_exit_uv;
1368 xpc_get_remote_heartbeat = xpc_get_remote_heartbeat_uv;
1369
1370 xpc_request_partition_activation = xpc_request_partition_activation_uv;
1371 xpc_request_partition_reactivation =
1372 xpc_request_partition_reactivation_uv;
1373 xpc_request_partition_deactivation =
1374 xpc_request_partition_deactivation_uv;
1375 xpc_cancel_partition_deactivation_request =
1376 xpc_cancel_partition_deactivation_request_uv;
1377
1378 xpc_setup_ch_structures_sn = xpc_setup_ch_structures_sn_uv;
1379 xpc_teardown_ch_structures_sn = xpc_teardown_ch_structures_sn_uv;
1380
1381 xpc_make_first_contact = xpc_make_first_contact_uv;
1382
1383 xpc_get_chctl_all_flags = xpc_get_chctl_all_flags_uv;
1384 xpc_send_chctl_closerequest = xpc_send_chctl_closerequest_uv;
1385 xpc_send_chctl_closereply = xpc_send_chctl_closereply_uv;
1386 xpc_send_chctl_openrequest = xpc_send_chctl_openrequest_uv;
1387 xpc_send_chctl_openreply = xpc_send_chctl_openreply_uv;
1388
1389 xpc_save_remote_msgqueue_pa = xpc_save_remote_msgqueue_pa_uv;
1390
1391 xpc_setup_msg_structures = xpc_setup_msg_structures_uv;
1392 xpc_teardown_msg_structures = xpc_teardown_msg_structures_uv;
1393
1394 xpc_indicate_partition_engaged = xpc_indicate_partition_engaged_uv;
1395 xpc_indicate_partition_disengaged =
1396 xpc_indicate_partition_disengaged_uv;
1397 xpc_assume_partition_disengaged = xpc_assume_partition_disengaged_uv;
1398 xpc_partition_engaged = xpc_partition_engaged_uv;
1399 xpc_any_partition_engaged = xpc_any_partition_engaged_uv;
1400
1401 xpc_n_of_deliverable_payloads = xpc_n_of_deliverable_payloads_uv;
1402 xpc_process_msg_chctl_flags = xpc_process_msg_chctl_flags_uv;
1403 xpc_send_payload = xpc_send_payload_uv;
1404 xpc_notify_senders_of_disconnect = xpc_notify_senders_of_disconnect_uv;
1405 xpc_get_deliverable_payload = xpc_get_deliverable_payload_uv;
1406 xpc_received_payload = xpc_received_payload_uv;
1407
1408 if (sizeof(struct xpc_notify_mq_msghdr_uv) > XPC_MSG_HDR_MAX_SIZE) {
1409 dev_err(xpc_part, "xpc_notify_mq_msghdr_uv is larger than %d\n",
1410 XPC_MSG_HDR_MAX_SIZE);
1411 return -E2BIG;
1412 }
1413
1414 /* ??? The cpuid argument's value is 0, is that what we want? */
1415 /* !!! The irq argument's value isn't correct. */
1416 xpc_activate_mq_uv = xpc_create_gru_mq_uv(XPC_ACTIVATE_MQ_SIZE_UV, 0, 0,
1417 xpc_handle_activate_IRQ_uv);
1418 if (xpc_activate_mq_uv == NULL)
1419 return -ENOMEM;
1420
1421 /* ??? The cpuid argument's value is 0, is that what we want? */
1422 /* !!! The irq argument's value isn't correct. */
1423 xpc_notify_mq_uv = xpc_create_gru_mq_uv(XPC_NOTIFY_MQ_SIZE_UV, 0, 0,
1424 xpc_handle_notify_IRQ_uv);
1425 if (xpc_notify_mq_uv == NULL) {
1426 /* !!! The irq argument's value isn't correct. */
1427 xpc_destroy_gru_mq_uv(xpc_activate_mq_uv,
1428 XPC_ACTIVATE_MQ_SIZE_UV, 0);
1429 return -ENOMEM;
1430 }
1431
1432 return 0;
1433}
1434
1435void
1436xpc_exit_uv(void)
1437{
1438 /* !!! The irq argument's value isn't correct. */
1439 xpc_destroy_gru_mq_uv(xpc_notify_mq_uv, XPC_NOTIFY_MQ_SIZE_UV, 0);
1440
1441 /* !!! The irq argument's value isn't correct. */
1442 xpc_destroy_gru_mq_uv(xpc_activate_mq_uv, XPC_ACTIVATE_MQ_SIZE_UV, 0);
1443}
diff --git a/drivers/misc/sgi-xp/xpnet.c b/drivers/misc/sgi-xp/xpnet.c
index 822dc8e8d7f0..71513b3af708 100644
--- a/drivers/misc/sgi-xp/xpnet.c
+++ b/drivers/misc/sgi-xp/xpnet.c
@@ -21,21 +21,8 @@
21 */ 21 */
22 22
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/types.h>
25#include <linux/kernel.h>
26#include <linux/init.h>
27#include <linux/ioport.h>
28#include <linux/netdevice.h> 24#include <linux/netdevice.h>
29#include <linux/etherdevice.h> 25#include <linux/etherdevice.h>
30#include <linux/delay.h>
31#include <linux/ethtool.h>
32#include <linux/mii.h>
33#include <linux/smp.h>
34#include <linux/string.h>
35#include <asm/sn/bte.h>
36#include <asm/sn/io.h>
37#include <asm/sn/sn_sal.h>
38#include <asm/atomic.h>
39#include "xp.h" 26#include "xp.h"
40 27
41/* 28/*
@@ -57,7 +44,7 @@ struct xpnet_message {
57 u16 version; /* Version for this message */ 44 u16 version; /* Version for this message */
58 u16 embedded_bytes; /* #of bytes embedded in XPC message */ 45 u16 embedded_bytes; /* #of bytes embedded in XPC message */
59 u32 magic; /* Special number indicating this is xpnet */ 46 u32 magic; /* Special number indicating this is xpnet */
60 u64 buf_pa; /* phys address of buffer to retrieve */ 47 unsigned long buf_pa; /* phys address of buffer to retrieve */
61 u32 size; /* #of bytes in buffer */ 48 u32 size; /* #of bytes in buffer */
62 u8 leadin_ignore; /* #of bytes to ignore at the beginning */ 49 u8 leadin_ignore; /* #of bytes to ignore at the beginning */
63 u8 tailout_ignore; /* #of bytes to ignore at the end */ 50 u8 tailout_ignore; /* #of bytes to ignore at the end */
@@ -70,11 +57,10 @@ struct xpnet_message {
70 * 57 *
71 * XPC expects each message to exist in an individual cacheline. 58 * XPC expects each message to exist in an individual cacheline.
72 */ 59 */
73#define XPNET_MSG_SIZE (L1_CACHE_BYTES - XPC_MSG_PAYLOAD_OFFSET) 60#define XPNET_MSG_SIZE XPC_MSG_PAYLOAD_MAX_SIZE
74#define XPNET_MSG_DATA_MAX \ 61#define XPNET_MSG_DATA_MAX \
75 (XPNET_MSG_SIZE - (u64)(&((struct xpnet_message *)0)->data)) 62 (XPNET_MSG_SIZE - offsetof(struct xpnet_message, data))
76#define XPNET_MSG_ALIGNED_SIZE (L1_CACHE_ALIGN(XPNET_MSG_SIZE)) 63#define XPNET_MSG_NENTRIES (PAGE_SIZE / XPC_MSG_MAX_SIZE)
77#define XPNET_MSG_NENTRIES (PAGE_SIZE / XPNET_MSG_ALIGNED_SIZE)
78 64
79#define XPNET_MAX_KTHREADS (XPNET_MSG_NENTRIES + 1) 65#define XPNET_MAX_KTHREADS (XPNET_MSG_NENTRIES + 1)
80#define XPNET_MAX_IDLE_KTHREADS (XPNET_MSG_NENTRIES + 1) 66#define XPNET_MAX_IDLE_KTHREADS (XPNET_MSG_NENTRIES + 1)
@@ -105,7 +91,6 @@ struct xpnet_message {
105 * then be released. 91 * then be released.
106 */ 92 */
107struct xpnet_pending_msg { 93struct xpnet_pending_msg {
108 struct list_head free_list;
109 struct sk_buff *skb; 94 struct sk_buff *skb;
110 atomic_t use_count; 95 atomic_t use_count;
111}; 96};
@@ -121,7 +106,7 @@ struct net_device *xpnet_device;
121 * When we are notified of other partitions activating, we add them to 106 * When we are notified of other partitions activating, we add them to
122 * our bitmask of partitions to which we broadcast. 107 * our bitmask of partitions to which we broadcast.
123 */ 108 */
124static u64 xpnet_broadcast_partitions; 109static unsigned long *xpnet_broadcast_partitions;
125/* protect above */ 110/* protect above */
126static DEFINE_SPINLOCK(xpnet_broadcast_lock); 111static DEFINE_SPINLOCK(xpnet_broadcast_lock);
127 112
@@ -141,16 +126,13 @@ static DEFINE_SPINLOCK(xpnet_broadcast_lock);
141#define XPNET_DEF_MTU (0x8000UL) 126#define XPNET_DEF_MTU (0x8000UL)
142 127
143/* 128/*
144 * The partition id is encapsulated in the MAC address. The following 129 * The partid is encapsulated in the MAC address beginning in the following
145 * define locates the octet the partid is in. 130 * octet and it consists of two octets.
146 */ 131 */
147#define XPNET_PARTID_OCTET 1 132#define XPNET_PARTID_OCTET 2
148#define XPNET_LICENSE_OCTET 2 133
134/* Define the XPNET debug device structures to be used with dev_dbg() et al */
149 135
150/*
151 * Define the XPNET debug device structure that is to be used with dev_dbg(),
152 * dev_err(), dev_warn(), and dev_info().
153 */
154struct device_driver xpnet_dbg_name = { 136struct device_driver xpnet_dbg_name = {
155 .name = "xpnet" 137 .name = "xpnet"
156}; 138};
@@ -169,7 +151,8 @@ static void
169xpnet_receive(short partid, int channel, struct xpnet_message *msg) 151xpnet_receive(short partid, int channel, struct xpnet_message *msg)
170{ 152{
171 struct sk_buff *skb; 153 struct sk_buff *skb;
172 bte_result_t bret; 154 void *dst;
155 enum xp_retval ret;
173 struct xpnet_dev_private *priv = 156 struct xpnet_dev_private *priv =
174 (struct xpnet_dev_private *)xpnet_device->priv; 157 (struct xpnet_dev_private *)xpnet_device->priv;
175 158
@@ -201,7 +184,7 @@ xpnet_receive(short partid, int channel, struct xpnet_message *msg)
201 184
202 /* 185 /*
203 * The allocated skb has some reserved space. 186 * The allocated skb has some reserved space.
204 * In order to use bte_copy, we need to get the 187 * In order to use xp_remote_memcpy(), we need to get the
205 * skb->data pointer moved forward. 188 * skb->data pointer moved forward.
206 */ 189 */
207 skb_reserve(skb, (L1_CACHE_BYTES - ((u64)skb->data & 190 skb_reserve(skb, (L1_CACHE_BYTES - ((u64)skb->data &
@@ -226,26 +209,21 @@ xpnet_receive(short partid, int channel, struct xpnet_message *msg)
226 skb_copy_to_linear_data(skb, &msg->data, 209 skb_copy_to_linear_data(skb, &msg->data,
227 (size_t)msg->embedded_bytes); 210 (size_t)msg->embedded_bytes);
228 } else { 211 } else {
212 dst = (void *)((u64)skb->data & ~(L1_CACHE_BYTES - 1));
229 dev_dbg(xpnet, "transferring buffer to the skb->data area;\n\t" 213 dev_dbg(xpnet, "transferring buffer to the skb->data area;\n\t"
230 "bte_copy(0x%p, 0x%p, %hu)\n", (void *)msg->buf_pa, 214 "xp_remote_memcpy(0x%p, 0x%p, %hu)\n", dst,
231 (void *)__pa((u64)skb->data & ~(L1_CACHE_BYTES - 1)), 215 (void *)msg->buf_pa, msg->size);
232 msg->size);
233
234 bret = bte_copy(msg->buf_pa,
235 __pa((u64)skb->data & ~(L1_CACHE_BYTES - 1)),
236 msg->size, (BTE_NOTIFY | BTE_WACQUIRE), NULL);
237 216
238 if (bret != BTE_SUCCESS) { 217 ret = xp_remote_memcpy(xp_pa(dst), msg->buf_pa, msg->size);
218 if (ret != xpSuccess) {
239 /* 219 /*
240 * >>> Need better way of cleaning skb. Currently skb 220 * !!! Need better way of cleaning skb. Currently skb
241 * >>> appears in_use and we can't just call 221 * !!! appears in_use and we can't just call
242 * >>> dev_kfree_skb. 222 * !!! dev_kfree_skb.
243 */ 223 */
244 dev_err(xpnet, "bte_copy(0x%p, 0x%p, 0x%hx) returned " 224 dev_err(xpnet, "xp_remote_memcpy(0x%p, 0x%p, 0x%hx) "
245 "error=0x%x\n", (void *)msg->buf_pa, 225 "returned error=0x%x\n", dst,
246 (void *)__pa((u64)skb->data & 226 (void *)msg->buf_pa, msg->size, ret);
247 ~(L1_CACHE_BYTES - 1)),
248 msg->size, bret);
249 227
250 xpc_received(partid, channel, (void *)msg); 228 xpc_received(partid, channel, (void *)msg);
251 229
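
The receive path above rounds skb->data down to a cache-line boundary and pulls the sender's whole aligned region in a single xp_remote_memcpy(). A minimal userspace sketch of that rounding, assuming a 128-byte cache line and using a plain memcpy() in place of the remote copy:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define L1_CACHE_BYTES 128UL	/* assumed cache-line size */

int main(void)
{
	/* stand-ins for the sender's buffer and the receiver's skb data area */
	uint8_t *src = aligned_alloc(L1_CACHE_BYTES, 4 * L1_CACHE_BYTES);
	uint8_t *skb_buf = aligned_alloc(L1_CACHE_BYTES, 4 * L1_CACHE_BYTES);

	if (src == NULL || skb_buf == NULL)
		return 1;

	uint8_t *data = skb_buf + 50;		/* an unaligned skb->data */
	size_t size = 2 * L1_CACHE_BYTES;	/* msg->size: whole aligned span */

	/* round the destination down, exactly as xpnet_receive() now does */
	uint8_t *dst = (uint8_t *)((uintptr_t)data & ~(L1_CACHE_BYTES - 1));

	memset(src, 0xab, 4 * L1_CACHE_BYTES);
	memcpy(dst, src, size);			/* stand-in for xp_remote_memcpy() */

	printf("data=%p dst=%p leadin slack=%zu bytes\n",
	       (void *)data, (void *)dst, (size_t)(data - dst));
	free(src);
	free(skb_buf);
	return 0;
}

In the driver, the earlier skb_reserve() has already positioned skb->data inside that aligned span, so the leadin/tailout slack bytes are copied but never handed to the network stack.
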
@@ -285,9 +263,7 @@ static void
285xpnet_connection_activity(enum xp_retval reason, short partid, int channel, 263xpnet_connection_activity(enum xp_retval reason, short partid, int channel,
286 void *data, void *key) 264 void *data, void *key)
287{ 265{
288 long bp; 266 DBUG_ON(partid < 0 || partid >= xp_max_npartitions);
289
290 DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
291 DBUG_ON(channel != XPC_NET_CHANNEL); 267 DBUG_ON(channel != XPC_NET_CHANNEL);
292 268
293 switch (reason) { 269 switch (reason) {
@@ -299,31 +275,28 @@ xpnet_connection_activity(enum xp_retval reason, short partid, int channel,
299 275
300 case xpConnected: /* connection completed to a partition */ 276 case xpConnected: /* connection completed to a partition */
301 spin_lock_bh(&xpnet_broadcast_lock); 277 spin_lock_bh(&xpnet_broadcast_lock);
302 xpnet_broadcast_partitions |= 1UL << (partid - 1); 278 __set_bit(partid, xpnet_broadcast_partitions);
303 bp = xpnet_broadcast_partitions;
304 spin_unlock_bh(&xpnet_broadcast_lock); 279 spin_unlock_bh(&xpnet_broadcast_lock);
305 280
306 netif_carrier_on(xpnet_device); 281 netif_carrier_on(xpnet_device);
307 282
308 dev_dbg(xpnet, "%s connection created to partition %d; " 283 dev_dbg(xpnet, "%s connected to partition %d\n",
309 "xpnet_broadcast_partitions=0x%lx\n", 284 xpnet_device->name, partid);
310 xpnet_device->name, partid, bp);
311 break; 285 break;
312 286
313 default: 287 default:
314 spin_lock_bh(&xpnet_broadcast_lock); 288 spin_lock_bh(&xpnet_broadcast_lock);
315 xpnet_broadcast_partitions &= ~(1UL << (partid - 1)); 289 __clear_bit(partid, xpnet_broadcast_partitions);
316 bp = xpnet_broadcast_partitions;
317 spin_unlock_bh(&xpnet_broadcast_lock); 290 spin_unlock_bh(&xpnet_broadcast_lock);
318 291
319 if (bp == 0) 292 if (bitmap_empty((unsigned long *)xpnet_broadcast_partitions,
293 xp_max_npartitions)) {
320 netif_carrier_off(xpnet_device); 294 netif_carrier_off(xpnet_device);
295 }
321 296
322 dev_dbg(xpnet, "%s disconnected from partition %d; " 297 dev_dbg(xpnet, "%s disconnected from partition %d\n",
323 "xpnet_broadcast_partitions=0x%lx\n", 298 xpnet_device->name, partid);
324 xpnet_device->name, partid, bp);
325 break; 299 break;
326
327 } 300 }
328} 301}
329 302
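
The connect/disconnect handling above now keeps the broadcast set in a dynamically sized bitmap instead of a fixed u64 mask. A userspace approximation of the __set_bit()/__clear_bit()/bitmap_empty() pattern, with a hypothetical max_partitions standing in for xp_max_npartitions and the spinlock omitted, might look like:

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

static void set_partition_bit(unsigned long *map, int partid)
{
	map[partid / BITS_PER_LONG] |= 1UL << (partid % BITS_PER_LONG);
}

static void clear_partition_bit(unsigned long *map, int partid)
{
	map[partid / BITS_PER_LONG] &= ~(1UL << (partid % BITS_PER_LONG));
}

static bool partition_map_empty(const unsigned long *map, int nbits)
{
	for (size_t i = 0; i < BITS_TO_LONGS(nbits); i++)
		if (map[i] != 0)
			return false;
	return true;
}

int main(void)
{
	int max_partitions = 256;	/* hypothetical xp_max_npartitions */
	unsigned long *map = calloc(BITS_TO_LONGS(max_partitions),
				    sizeof(*map));

	if (map == NULL)
		return 1;
	set_partition_bit(map, 3);	/* xpConnected for partition 3 */
	clear_partition_bit(map, 3);	/* any other reason: disconnect */
	printf("carrier %s\n",
	       partition_map_empty(map, max_partitions) ? "off" : "on");
	free(map);
	return 0;
}
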
@@ -334,8 +307,10 @@ xpnet_dev_open(struct net_device *dev)
334 307
335 dev_dbg(xpnet, "calling xpc_connect(%d, 0x%p, NULL, %ld, %ld, %ld, " 308 dev_dbg(xpnet, "calling xpc_connect(%d, 0x%p, NULL, %ld, %ld, %ld, "
336 "%ld)\n", XPC_NET_CHANNEL, xpnet_connection_activity, 309 "%ld)\n", XPC_NET_CHANNEL, xpnet_connection_activity,
337 XPNET_MSG_SIZE, XPNET_MSG_NENTRIES, XPNET_MAX_KTHREADS, 310 (unsigned long)XPNET_MSG_SIZE,
338 XPNET_MAX_IDLE_KTHREADS); 311 (unsigned long)XPNET_MSG_NENTRIES,
312 (unsigned long)XPNET_MAX_KTHREADS,
313 (unsigned long)XPNET_MAX_IDLE_KTHREADS);
339 314
340 ret = xpc_connect(XPC_NET_CHANNEL, xpnet_connection_activity, NULL, 315 ret = xpc_connect(XPC_NET_CHANNEL, xpnet_connection_activity, NULL,
341 XPNET_MSG_SIZE, XPNET_MSG_NENTRIES, 316 XPNET_MSG_SIZE, XPNET_MSG_NENTRIES,
@@ -426,35 +401,74 @@ xpnet_send_completed(enum xp_retval reason, short partid, int channel,
426 } 401 }
427} 402}
428 403
404static void
405xpnet_send(struct sk_buff *skb, struct xpnet_pending_msg *queued_msg,
406 u64 start_addr, u64 end_addr, u16 embedded_bytes, int dest_partid)
407{
408 u8 msg_buffer[XPNET_MSG_SIZE];
409 struct xpnet_message *msg = (struct xpnet_message *)&msg_buffer;
410 u16 msg_size = sizeof(struct xpnet_message);
411 enum xp_retval ret;
412
413 msg->embedded_bytes = embedded_bytes;
414 if (unlikely(embedded_bytes != 0)) {
415 msg->version = XPNET_VERSION_EMBED;
416 dev_dbg(xpnet, "calling memcpy(0x%p, 0x%p, 0x%lx)\n",
417 &msg->data, skb->data, (size_t)embedded_bytes);
418 skb_copy_from_linear_data(skb, &msg->data,
419 (size_t)embedded_bytes);
420 msg_size += embedded_bytes - 1;
421 } else {
422 msg->version = XPNET_VERSION;
423 }
424 msg->magic = XPNET_MAGIC;
425 msg->size = end_addr - start_addr;
426 msg->leadin_ignore = (u64)skb->data - start_addr;
427 msg->tailout_ignore = end_addr - (u64)skb_tail_pointer(skb);
428 msg->buf_pa = xp_pa((void *)start_addr);
429
430 dev_dbg(xpnet, "sending XPC message to %d:%d\n"
431 KERN_DEBUG "msg->buf_pa=0x%lx, msg->size=%u, "
432 "msg->leadin_ignore=%u, msg->tailout_ignore=%u\n",
433 dest_partid, XPC_NET_CHANNEL, msg->buf_pa, msg->size,
434 msg->leadin_ignore, msg->tailout_ignore);
435
436 atomic_inc(&queued_msg->use_count);
437
438 ret = xpc_send_notify(dest_partid, XPC_NET_CHANNEL, XPC_NOWAIT, msg,
439 msg_size, xpnet_send_completed, queued_msg);
440 if (unlikely(ret != xpSuccess))
441 atomic_dec(&queued_msg->use_count);
442}
443
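
One subtlety in xpnet_send() above is the msg_size += embedded_bytes - 1 adjustment, presumably because sizeof(struct xpnet_message) already covers the first byte of the embedded payload (the struct itself is not shown in this hunk). A toy structure, not the real xpnet_message layout, makes the arithmetic visible:

#include <stddef.h>
#include <stdio.h>

/* simplified stand-in for struct xpnet_message: a header plus one data byte */
struct toy_message {
	unsigned short size;
	unsigned short embedded_bytes;
	unsigned char data;	/* first byte of any embedded payload */
};

int main(void)
{
	unsigned short embedded_bytes = 100;

	/*
	 * sizeof() already accounts for one payload byte, so only
	 * embedded_bytes - 1 additional bytes are needed in the message.
	 */
	size_t msg_size = sizeof(struct toy_message) + embedded_bytes - 1;

	printf("header=%zu bytes, total=%zu bytes for %hu embedded bytes\n",
	       sizeof(struct toy_message), msg_size, embedded_bytes);
	return 0;
}
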
429/* 444/*
430 * Network layer has formatted a packet (skb) and is ready to place it 445 * Network layer has formatted a packet (skb) and is ready to place it
431 * "on the wire". Prepare and send an xpnet_message to all partitions 446 * "on the wire". Prepare and send an xpnet_message to all partitions
432 * which have connected with us and are targets of this packet. 447 * which have connected with us and are targets of this packet.
433 * 448 *
434 * MAC-NOTE: For the XPNET driver, the MAC address contains the 449 * MAC-NOTE: For the XPNET driver, the MAC address contains the
435 * destination partition_id. If the destination partition id word 450 * destination partid. If the destination partid octets are 0xffff,
436 * is 0xff, this packet is to broadcast to all partitions. 451 * this packet is to be broadcast to all connected partitions.
437 */ 452 */
438static int 453static int
439xpnet_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) 454xpnet_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
440{ 455{
441 struct xpnet_pending_msg *queued_msg; 456 struct xpnet_pending_msg *queued_msg;
442 enum xp_retval ret;
443 struct xpnet_message *msg;
444 u64 start_addr, end_addr; 457 u64 start_addr, end_addr;
445 long dp;
446 u8 second_mac_octet;
447 short dest_partid; 458 short dest_partid;
448 struct xpnet_dev_private *priv; 459 struct xpnet_dev_private *priv = (struct xpnet_dev_private *)dev->priv;
449 u16 embedded_bytes; 460 u16 embedded_bytes = 0;
450
451 priv = (struct xpnet_dev_private *)dev->priv;
452 461
453 dev_dbg(xpnet, ">skb->head=0x%p skb->data=0x%p skb->tail=0x%p " 462 dev_dbg(xpnet, ">skb->head=0x%p skb->data=0x%p skb->tail=0x%p "
454 "skb->end=0x%p skb->len=%d\n", (void *)skb->head, 463 "skb->end=0x%p skb->len=%d\n", (void *)skb->head,
455 (void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb), 464 (void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb),
456 skb->len); 465 skb->len);
457 466
467 if (skb->data[0] == 0x33) {
468 dev_kfree_skb(skb);
469 return 0; /* nothing needed to be done */
470 }
471
458 /* 472 /*
459 * The xpnet_pending_msg tracks how many outstanding 473 * The xpnet_pending_msg tracks how many outstanding
460 * xpc_send_notifies are relying on this skb. When none 474 * xpc_send_notifies are relying on this skb. When none
@@ -466,7 +480,6 @@ xpnet_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
466 "packet\n", sizeof(struct xpnet_pending_msg)); 480 "packet\n", sizeof(struct xpnet_pending_msg));
467 481
468 priv->stats.tx_errors++; 482 priv->stats.tx_errors++;
469
470 return -ENOMEM; 483 return -ENOMEM;
471 } 484 }
472 485
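
The use_count described in the comment above follows a common pattern: it starts at one for the sender itself, gains one reference per xpc_send_notify(), and the skb is freed only when the last reference is dropped. A standalone C11 sketch of that pattern, in which pending_msg, send_completed() and the loop counts are purely illustrative:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* toy stand-in for struct xpnet_pending_msg: just the reference count */
struct pending_msg {
	atomic_int use_count;
};

/* called once per send completion, like xpnet_send_completed() */
static void send_completed(struct pending_msg *msg)
{
	if (atomic_fetch_sub(&msg->use_count, 1) == 1) {
		printf("last reference dropped: free skb and msg\n");
		free(msg);
	}
}

int main(void)
{
	struct pending_msg *msg = malloc(sizeof(*msg));
	int sends = 3;		/* pretend three partitions were targeted */

	if (msg == NULL)
		return 1;
	/* start at one so the sender holds a reference while queueing */
	atomic_init(&msg->use_count, 1);

	for (int i = 0; i < sends; i++)
		atomic_fetch_add(&msg->use_count, 1);	/* one per xpc_send_notify() */

	for (int i = 0; i < sends; i++)
		send_completed(msg);			/* completions arrive later */

	/* the sender drops its own reference at the end of hard_start_xmit() */
	send_completed(msg);
	return 0;
}
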
@@ -475,7 +488,6 @@ xpnet_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
475 end_addr = L1_CACHE_ALIGN((u64)skb_tail_pointer(skb)); 488 end_addr = L1_CACHE_ALIGN((u64)skb_tail_pointer(skb));
476 489
477 /* calculate how many bytes to embed in the XPC message */ 490 /* calculate how many bytes to embed in the XPC message */
478 embedded_bytes = 0;
479 if (unlikely(skb->len <= XPNET_MSG_DATA_MAX)) { 491 if (unlikely(skb->len <= XPNET_MSG_DATA_MAX)) {
480 /* skb->data does fit so embed */ 492 /* skb->data does fit so embed */
481 embedded_bytes = skb->len; 493 embedded_bytes = skb->len;
@@ -491,82 +503,28 @@ xpnet_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
491 atomic_set(&queued_msg->use_count, 1); 503 atomic_set(&queued_msg->use_count, 1);
492 queued_msg->skb = skb; 504 queued_msg->skb = skb;
493 505
494 second_mac_octet = skb->data[XPNET_PARTID_OCTET]; 506 if (skb->data[0] == 0xff) {
495 if (second_mac_octet == 0xff) {
496 /* we are being asked to broadcast to all partitions */ 507 /* we are being asked to broadcast to all partitions */
497 dp = xpnet_broadcast_partitions; 508 for_each_bit(dest_partid, xpnet_broadcast_partitions,
498 } else if (second_mac_octet != 0) { 509 xp_max_npartitions) {
499 dp = xpnet_broadcast_partitions &
500 (1UL << (second_mac_octet - 1));
501 } else {
502 /* 0 is an invalid partid. Ignore */
503 dp = 0;
504 }
505 dev_dbg(xpnet, "destination Partitions mask (dp) = 0x%lx\n", dp);
506
507 /*
508 * If we wanted to allow promiscuous mode to work like an
509 * unswitched network, this would be a good point to OR in a
510 * mask of partitions which should be receiving all packets.
511 */
512
513 /*
514 * Main send loop.
515 */
516 for (dest_partid = 1; dp && dest_partid < XP_MAX_PARTITIONS;
517 dest_partid++) {
518 510
519 if (!(dp & (1UL << (dest_partid - 1)))) { 511 xpnet_send(skb, queued_msg, start_addr, end_addr,
520 /* not destined for this partition */ 512 embedded_bytes, dest_partid);
521 continue;
522 } 513 }
514 } else {
515 dest_partid = (short)skb->data[XPNET_PARTID_OCTET + 1];
516 dest_partid |= (short)skb->data[XPNET_PARTID_OCTET + 0] << 8;
523 517
524 /* remove this partition from the destinations mask */ 518 if (dest_partid >= 0 &&
525 dp &= ~(1UL << (dest_partid - 1)); 519 dest_partid < xp_max_npartitions &&
526 520 test_bit(dest_partid, xpnet_broadcast_partitions) != 0) {
527 /* found a partition to send to */ 521
528 522 xpnet_send(skb, queued_msg, start_addr, end_addr,
529 ret = xpc_allocate(dest_partid, XPC_NET_CHANNEL, 523 embedded_bytes, dest_partid);
530 XPC_NOWAIT, (void **)&msg);
531 if (unlikely(ret != xpSuccess))
532 continue;
533
534 msg->embedded_bytes = embedded_bytes;
535 if (unlikely(embedded_bytes != 0)) {
536 msg->version = XPNET_VERSION_EMBED;
537 dev_dbg(xpnet, "calling memcpy(0x%p, 0x%p, 0x%lx)\n",
538 &msg->data, skb->data, (size_t)embedded_bytes);
539 skb_copy_from_linear_data(skb, &msg->data,
540 (size_t)embedded_bytes);
541 } else {
542 msg->version = XPNET_VERSION;
543 }
544 msg->magic = XPNET_MAGIC;
545 msg->size = end_addr - start_addr;
546 msg->leadin_ignore = (u64)skb->data - start_addr;
547 msg->tailout_ignore = end_addr - (u64)skb_tail_pointer(skb);
548 msg->buf_pa = __pa(start_addr);
549
550 dev_dbg(xpnet, "sending XPC message to %d:%d\n"
551 KERN_DEBUG "msg->buf_pa=0x%lx, msg->size=%u, "
552 "msg->leadin_ignore=%u, msg->tailout_ignore=%u\n",
553 dest_partid, XPC_NET_CHANNEL, msg->buf_pa, msg->size,
554 msg->leadin_ignore, msg->tailout_ignore);
555
556 atomic_inc(&queued_msg->use_count);
557
558 ret = xpc_send_notify(dest_partid, XPC_NET_CHANNEL, msg,
559 xpnet_send_completed, queued_msg);
560 if (unlikely(ret != xpSuccess)) {
561 atomic_dec(&queued_msg->use_count);
562 continue;
563 } 524 }
564 } 525 }
565 526
566 if (atomic_dec_return(&queued_msg->use_count) == 0) { 527 if (atomic_dec_return(&queued_msg->use_count) == 0) {
567 dev_dbg(xpnet, "no partitions to receive packet destined for "
568 "%d\n", dest_partid);
569
570 dev_kfree_skb(skb); 528 dev_kfree_skb(skb);
571 kfree(queued_msg); 529 kfree(queued_msg);
572 } 530 }
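
The transmit path above now decides between broadcast and unicast purely from the destination MAC: 0xff in octet 0 means walk every bit in the broadcast bitmap, otherwise the partid is rebuilt from octets 2 and 3 and checked against that bitmap. A compact userspace sketch, where MAX_PARTITIONS, test_partition_bit(), send_to() and dispatch() are all hypothetical stand-ins:

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)
#define MAX_PARTITIONS 32	/* hypothetical stand-in for xp_max_npartitions */
#define PARTID_OCTET 2		/* mirrors XPNET_PARTID_OCTET */

static bool test_partition_bit(const unsigned long *map, int partid)
{
	return (map[partid / BITS_PER_LONG] >> (partid % BITS_PER_LONG)) & 1UL;
}

/* send_to() stands in for building and sending one xpnet_message */
static void send_to(int partid)
{
	printf("sending to partition %d\n", partid);
}

static void dispatch(const unsigned char *dest_mac, const unsigned long *map)
{
	if (dest_mac[0] == 0xff) {
		/* broadcast MAC: walk every connected partition's bit */
		for (int partid = 0; partid < MAX_PARTITIONS; partid++)
			if (test_partition_bit(map, partid))
				send_to(partid);
	} else {
		/* unicast: partid is in MAC octets 2 and 3, high byte first */
		int partid = (dest_mac[PARTID_OCTET + 0] << 8) |
			     dest_mac[PARTID_OCTET + 1];

		if (partid < MAX_PARTITIONS && test_partition_bit(map, partid))
			send_to(partid);
	}
}

int main(void)
{
	unsigned long map[1] = { (1UL << 3) | (1UL << 7) };	/* partitions 3 and 7 */
	unsigned char bcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
	unsigned char ucast[6] = { 0x02, 0x00, 0x00, 0x07, 0x00, 0x00 };

	dispatch(bcast, map);	/* reaches partitions 3 and 7 */
	dispatch(ucast, map);	/* reaches partition 7 only */
	return 0;
}
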
@@ -594,23 +552,28 @@ xpnet_dev_tx_timeout(struct net_device *dev)
594static int __init 552static int __init
595xpnet_init(void) 553xpnet_init(void)
596{ 554{
597 int i; 555 int result;
598 u32 license_num;
599 int result = -ENOMEM;
600 556
601 if (!ia64_platform_is("sn2")) 557 if (!is_shub() && !is_uv())
602 return -ENODEV; 558 return -ENODEV;
603 559
604 dev_info(xpnet, "registering network device %s\n", XPNET_DEVICE_NAME); 560 dev_info(xpnet, "registering network device %s\n", XPNET_DEVICE_NAME);
605 561
562 xpnet_broadcast_partitions = kzalloc(BITS_TO_LONGS(xp_max_npartitions) *
563 sizeof(long), GFP_KERNEL);
564 if (xpnet_broadcast_partitions == NULL)
565 return -ENOMEM;
566
606 /* 567 /*
607 * use ether_setup() to init the majority of our device 568 * use ether_setup() to init the majority of our device
608 * structure and then override the necessary pieces. 569 * structure and then override the necessary pieces.
609 */ 570 */
610 xpnet_device = alloc_netdev(sizeof(struct xpnet_dev_private), 571 xpnet_device = alloc_netdev(sizeof(struct xpnet_dev_private),
611 XPNET_DEVICE_NAME, ether_setup); 572 XPNET_DEVICE_NAME, ether_setup);
612 if (xpnet_device == NULL) 573 if (xpnet_device == NULL) {
574 kfree(xpnet_broadcast_partitions);
613 return -ENOMEM; 575 return -ENOMEM;
576 }
614 577
615 netif_carrier_off(xpnet_device); 578 netif_carrier_off(xpnet_device);
616 579
@@ -628,14 +591,10 @@ xpnet_init(void)
628 * MAC addresses. We chose the first octet of the MAC to be unlikely 591 * MAC addresses. We chose the first octet of the MAC to be unlikely
629 * to collide with any vendor's officially issued MAC. 592 * to collide with any vendor's officially issued MAC.
630 */ 593 */
631 xpnet_device->dev_addr[0] = 0xfe; 594 xpnet_device->dev_addr[0] = 0x02; /* locally administered, no OUI */
632 xpnet_device->dev_addr[XPNET_PARTID_OCTET] = sn_partition_id; 595
633 license_num = sn_partition_serial_number_val(); 596 xpnet_device->dev_addr[XPNET_PARTID_OCTET + 1] = xp_partition_id;
634 for (i = 3; i >= 0; i--) { 597 xpnet_device->dev_addr[XPNET_PARTID_OCTET + 0] = (xp_partition_id >> 8);
635 xpnet_device->dev_addr[XPNET_LICENSE_OCTET + i] =
636 license_num & 0xff;
637 license_num = license_num >> 8;
638 }
639 598
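
The 0x02 written into dev_addr[0] above sets the IEEE 802 locally-administered bit while leaving the multicast (group) bit clear, so the address carries no vendor OUI; a tiny check of those two low-order bits of octet 0 (standard MAC semantics, not driver code):

#include <stdbool.h>
#include <stdio.h>

/* low two bits of the first MAC octet, per IEEE 802 */
#define MAC_MULTICAST_BIT	0x01	/* group (multicast) address */
#define MAC_LOCAL_ADMIN_BIT	0x02	/* locally administered, no OUI */

int main(void)
{
	unsigned char octet0 = 0x02;	/* as set in xpnet_init() */

	bool multicast = octet0 & MAC_MULTICAST_BIT;
	bool local = octet0 & MAC_LOCAL_ADMIN_BIT;

	printf("octet0=0x%02x multicast=%d locally-administered=%d\n",
	       octet0, multicast, local);
	return 0;
}
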
640 /* 599 /*
641 * ether_setup() sets this to a multicast device. We are 600 * ether_setup() sets this to a multicast device. We are
@@ -651,8 +610,10 @@ xpnet_init(void)
651 xpnet_device->features = NETIF_F_NO_CSUM; 610 xpnet_device->features = NETIF_F_NO_CSUM;
652 611
653 result = register_netdev(xpnet_device); 612 result = register_netdev(xpnet_device);
654 if (result != 0) 613 if (result != 0) {
655 free_netdev(xpnet_device); 614 free_netdev(xpnet_device);
615 kfree(xpnet_broadcast_partitions);
616 }
656 617
657 return result; 618 return result;
658} 619}
@@ -666,8 +627,8 @@ xpnet_exit(void)
666 xpnet_device[0].name); 627 xpnet_device[0].name);
667 628
668 unregister_netdev(xpnet_device); 629 unregister_netdev(xpnet_device);
669
670 free_netdev(xpnet_device); 630 free_netdev(xpnet_device);
631 kfree(xpnet_broadcast_partitions);
671} 632}
672 633
673module_exit(xpnet_exit); 634module_exit(xpnet_exit);